diff options
Diffstat (limited to '')
134 files changed, 42722 insertions, 0 deletions
diff --git a/src/include/CMakeLists.txt b/src/include/CMakeLists.txt new file mode 100644 index 000000000..cb9c2fea8 --- /dev/null +++ b/src/include/CMakeLists.txt @@ -0,0 +1,46 @@ +install(FILES + libcephsqlite.h + DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}) + +install(FILES + rados/librados.h + rados/rados_types.h + rados/rados_types.hpp + rados/librados_fwd.hpp + rados/librados.hpp + buffer.h + buffer_fwd.h + inline_memory.h + page.h + crc32c.h + rados/objclass.h + DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}/rados) +if(WITH_LIBRADOSSTRIPER) + install(FILES + radosstriper/libradosstriper.h + radosstriper/libradosstriper.hpp + DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}/radosstriper) +endif() + +if(WITH_RBD) + install(FILES + rbd/features.h + rbd/librbd.h + rbd/librbd.hpp + DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}/rbd) +endif() + +if(WITH_RADOSGW) + install(FILES + rados/librgw.h + rados/rgw_file.h + DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}/rados) +endif() + +if(WITH_LIBCEPHFS) + install(FILES + cephfs/libcephfs.h + cephfs/ceph_ll_client.h + cephfs/types.h + DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}/cephfs) +endif() diff --git a/src/include/CompatSet.h b/src/include/CompatSet.h new file mode 100644 index 000000000..35c7a7738 --- /dev/null +++ b/src/include/CompatSet.h @@ -0,0 +1,285 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2009 Sage Weil <sage@newdream.net> + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. 
+ * + */ + +#ifndef CEPH_COMPATSET_H +#define CEPH_COMPATSET_H + +#include <iostream> +#include <map> +#include <string> + +#include "include/buffer.h" +#include "include/encoding.h" +#include "include/types.h" +#include "common/Formatter.h" + +struct CompatSet { + + struct Feature { + uint64_t id; + std::string name; + + Feature(uint64_t _id, const std::string& _name) : id(_id), name(_name) {} + }; + + class FeatureSet { + uint64_t mask; + std::map<uint64_t, std::string> names; + + public: + friend struct CompatSet; + friend class CephCompatSet_AllSet_Test; + friend class CephCompatSet_other_Test; + friend class CephCompatSet_merge_Test; + friend std::ostream& operator<<(std::ostream& out, const CompatSet::FeatureSet& fs); + friend std::ostream& operator<<(std::ostream& out, const CompatSet& compat); + FeatureSet() : mask(1), names() {} + void insert(const Feature& f) { + ceph_assert(f.id > 0); + ceph_assert(f.id < 64); + mask |= ((uint64_t)1<<f.id); + names[f.id] = f.name; + } + + bool contains(const Feature& f) const { + return names.count(f.id); + } + bool contains(uint64_t f) const { + return names.count(f); + } + /** + * Getter instead of using name[] to be const safe + */ + std::string get_name(uint64_t const f) const { + std::map<uint64_t, std::string>::const_iterator i = names.find(f); + ceph_assert(i != names.end()); + return i->second; + } + + void remove(uint64_t f) { + if (names.count(f)) { + names.erase(f); + mask &= ~((uint64_t)1<<f); + } + } + void remove(const Feature& f) { + remove(f.id); + } + + void encode(ceph::buffer::list& bl) const { + using ceph::encode; + /* See below, mask always has the lowest bit set in memory, but + * unset in the encoding */ + encode(mask & (~(uint64_t)1), bl); + encode(names, bl); + } + + void decode(ceph::buffer::list::const_iterator& bl) { + using ceph::decode; + decode(mask, bl); + decode(names, bl); + /** + * Previously, there was a bug where insert did + * mask |= f.id rather than mask |= (1 << f.id). 
+ * In FeatureSets from those version, mask always + * has the lowest bit set. Since then, masks always + * have the lowest bit unset. + * + * When we encounter such a FeatureSet, we have to + * reconstruct the mask from the names map. + */ + if (mask & 1) { + mask = 1; + std::map<uint64_t, std::string> temp_names; + temp_names.swap(names); + for (auto i = temp_names.begin(); i != temp_names.end(); ++i) { + insert(Feature(i->first, i->second)); + } + } else { + mask |= 1; + } + } + + void dump(ceph::Formatter *f) const { + for (auto p = names.cbegin(); p != names.cend(); ++p) { + char s[18]; + snprintf(s, sizeof(s), "feature_%llu", (unsigned long long)p->first); + f->dump_string(s, p->second); + } + } + }; + + // These features have no impact on the read / write status + FeatureSet compat; + // If any of these features are missing, read is possible ( as long + // as no incompat feature is missing ) but it is not possible to write + FeatureSet ro_compat; + // If any of these features are missing, read or write is not possible + FeatureSet incompat; + + CompatSet(FeatureSet& _compat, FeatureSet& _ro_compat, FeatureSet& _incompat) : + compat(_compat), ro_compat(_ro_compat), incompat(_incompat) {} + + CompatSet() : compat(), ro_compat(), incompat() { } + + + /* does this filesystem implementation have the + features required to read the other? */ + bool readable(CompatSet const& other) const { + return !((other.incompat.mask ^ incompat.mask) & other.incompat.mask); + } + + /* does this filesystem implementation have the + features required to write the other? */ + bool writeable(CompatSet const& other) const { + return readable(other) && + !((other.ro_compat.mask ^ ro_compat.mask) & other.ro_compat.mask); + } + + /* Compare this CompatSet to another. + * CAREFULLY NOTE: This operation is NOT commutative. + * a > b DOES NOT imply that b < a. + * If returns: + * 0: The CompatSets have the same feature set. 
+ * 1: This CompatSet's features are a strict superset of the other's. + * -1: This CompatSet is missing at least one feature + * described in the other. It may still have more features, though. + */ + int compare(const CompatSet& other) const { + if ((other.compat.mask == compat.mask) && + (other.ro_compat.mask == ro_compat.mask) && + (other.incompat.mask == incompat.mask)) return 0; + //okay, they're not the same + + //if we're writeable we have a superset of theirs on incompat and ro_compat + if (writeable(other) && !((other.compat.mask ^ compat.mask) + & other.compat.mask)) return 1; + //if we make it here, we weren't writeable or had a difference compat set + return -1; + } + + /* Get the features supported by other CompatSet but not this one, + * as a CompatSet. + */ + CompatSet unsupported(const CompatSet& other) const { + CompatSet diff; + uint64_t other_compat = + ((other.compat.mask ^ compat.mask) & other.compat.mask); + uint64_t other_ro_compat = + ((other.ro_compat.mask ^ ro_compat.mask) & other.ro_compat.mask); + uint64_t other_incompat = + ((other.incompat.mask ^ incompat.mask) & other.incompat.mask); + for (int id = 1; id < 64; ++id) { + uint64_t mask = (uint64_t)1 << id; + if (mask & other_compat) { + diff.compat.insert( Feature(id, other.compat.names.at(id))); + } + if (mask & other_ro_compat) { + diff.ro_compat.insert(Feature(id, other.ro_compat.names.at(id))); + } + if (mask & other_incompat) { + diff.incompat.insert( Feature(id, other.incompat.names.at(id))); + } + } + return diff; + } + + /* Merge features supported by other CompatSet into this one. 
+ * Return: true if some features were merged + */ + bool merge(CompatSet const & other) { + uint64_t other_compat = + ((other.compat.mask ^ compat.mask) & other.compat.mask); + uint64_t other_ro_compat = + ((other.ro_compat.mask ^ ro_compat.mask) & other.ro_compat.mask); + uint64_t other_incompat = + ((other.incompat.mask ^ incompat.mask) & other.incompat.mask); + if (!other_compat && !other_ro_compat && !other_incompat) + return false; + for (int id = 1; id < 64; ++id) { + uint64_t mask = (uint64_t)1 << id; + if (mask & other_compat) { + compat.insert( Feature(id, other.compat.get_name(id))); + } + if (mask & other_ro_compat) { + ro_compat.insert(Feature(id, other.ro_compat.get_name(id))); + } + if (mask & other_incompat) { + incompat.insert( Feature(id, other.incompat.get_name(id))); + } + } + return true; + } + + std::ostream& printlite(std::ostream& o) const { + o << "{c=[" << std::hex << compat.mask << "]"; + o << ",r=[" << std::hex << ro_compat.mask << "]"; + o << ",i=[" << std::hex << incompat.mask << "]}"; + o << std::dec; + return o; + } + + void encode(ceph::buffer::list& bl) const { + compat.encode(bl); + ro_compat.encode(bl); + incompat.encode(bl); + } + + void decode(ceph::buffer::list::const_iterator& bl) { + compat.decode(bl); + ro_compat.decode(bl); + incompat.decode(bl); + } + + void dump(ceph::Formatter *f) const { + f->open_object_section("compat"); + compat.dump(f); + f->close_section(); + f->open_object_section("ro_compat"); + ro_compat.dump(f); + f->close_section(); + f->open_object_section("incompat"); + incompat.dump(f); + f->close_section(); + } + + static void generate_test_instances(std::list<CompatSet*>& o) { + o.push_back(new CompatSet); + o.push_back(new CompatSet); + o.back()->compat.insert(Feature(1, "one")); + o.back()->compat.insert(Feature(2, "two")); + o.back()->ro_compat.insert(Feature(4, "four")); + o.back()->incompat.insert(Feature(3, "three")); + } +}; +WRITE_CLASS_ENCODER(CompatSet) + +inline std::ostream& 
operator<<(std::ostream& out, const CompatSet::Feature& f) +{ + return out << "F(" << f.id << ", \"" << f.name << "\")"; +} + +inline std::ostream& operator<<(std::ostream& out, const CompatSet::FeatureSet& fs) +{ + return out << fs.names; +} + +inline std::ostream& operator<<(std::ostream& out, const CompatSet& compat) +{ + return out << "compat=" << compat.compat + << ",rocompat=" << compat.ro_compat + << ",incompat=" << compat.incompat; +} + +#endif diff --git a/src/include/Context.h b/src/include/Context.h new file mode 100644 index 000000000..bef85ca5b --- /dev/null +++ b/src/include/Context.h @@ -0,0 +1,535 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net> + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. + * + */ + + +#ifndef CEPH_CONTEXT_H +#define CEPH_CONTEXT_H + +#include "common/dout.h" + +#include <functional> +#include <list> +#include <memory> +#include <set> + +#include <boost/function.hpp> +#include <boost/system/error_code.hpp> + +#include "common/error_code.h" + +#include "include/ceph_assert.h" +#include "common/ceph_mutex.h" + +#define mydout(cct, v) lgeneric_subdout(cct, context, v) + +/* + * GenContext - abstract callback class + */ +template <typename T> +class GenContext { + GenContext(const GenContext& other); + const GenContext& operator=(const GenContext& other); + + protected: + virtual void finish(T t) = 0; + + public: + GenContext() {} + virtual ~GenContext() {} // we want a virtual destructor!!! 
+ + template <typename C> + void complete(C &&t) { + finish(std::forward<C>(t)); + delete this; + } + + template <typename C> + void operator()(C &&t) noexcept { + complete(std::forward<C>(t)); + } + + template<typename U = T> + auto operator()() noexcept + -> typename std::enable_if<std::is_default_constructible<U>::value, + void>::type { + complete(T{}); + } + + + std::reference_wrapper<GenContext> func() { + return std::ref(*this); + } +}; + +template <typename T> +using GenContextURef = std::unique_ptr<GenContext<T> >; + +/* + * Context - abstract callback class + */ +class Finisher; +class Context { + Context(const Context& other); + const Context& operator=(const Context& other); + + protected: + virtual void finish(int r) = 0; + + // variant of finish that is safe to call "synchronously." override should + // return true. + virtual bool sync_finish(int r) { + return false; + } + + public: + Context() {} + virtual ~Context() {} // we want a virtual destructor!!! + virtual void complete(int r) { + finish(r); + delete this; + } + virtual bool sync_complete(int r) { + if (sync_finish(r)) { + delete this; + return true; + } + return false; + } + void complete(boost::system::error_code ec) { + complete(ceph::from_error_code(ec)); + } + void operator()(boost::system::error_code ec) noexcept { + complete(ec); + } + + void operator()() noexcept { + complete({}); + } + + std::reference_wrapper<Context> func() { + return std::ref(*this); + } +}; + +/** + * Simple context holding a single object + */ +template<class T> +class ContainerContext : public Context { + T obj; +public: + ContainerContext(T &obj) : obj(obj) {} + void finish(int r) override {} +}; +template <typename T> +ContainerContext<T> *make_container_context(T &&t) { + return new ContainerContext<T>(std::forward<T>(t)); +} + +template <class T> +struct Wrapper : public Context { + Context *to_run; + T val; + Wrapper(Context *to_run, T val) : to_run(to_run), val(val) {} + void finish(int r) override { + if 
(to_run) + to_run->complete(r); + } +}; +struct RunOnDelete { + Context *to_run; + RunOnDelete(Context *to_run) : to_run(to_run) {} + ~RunOnDelete() { + if (to_run) + to_run->complete(0); + } +}; +typedef std::shared_ptr<RunOnDelete> RunOnDeleteRef; + +template <typename T> +class LambdaContext : public Context { +public: + LambdaContext(T &&t) : t(std::forward<T>(t)) {} + void finish(int r) override { + if constexpr (std::is_invocable_v<T, int>) + t(r); + else + t(); + } +private: + T t; +}; + +template <typename T> +LambdaContext<T> *make_lambda_context(T &&t) { + return new LambdaContext<T>(std::move(t)); +} + +template <typename F, typename T> +struct LambdaGenContext : GenContext<T> { + F f; + LambdaGenContext(F &&f) : f(std::forward<F>(f)) {} + void finish(T t) override { + f(std::forward<T>(t)); + } +}; +template <typename T, typename F> +GenContextURef<T> make_gen_lambda_context(F &&f) { + return GenContextURef<T>(new LambdaGenContext<F, T>(std::move(f))); +} + +/* + * finish and destroy a list of Contexts + */ +template<class C> +inline void finish_contexts(CephContext *cct, C& finished, int result = 0) +{ + if (finished.empty()) + return; + + C ls; + ls.swap(finished); // swap out of place to avoid weird loops + + if (cct) + mydout(cct,10) << ls.size() << " contexts to finish with " << result << dendl; + for (Context* c : ls) { + if (cct) + mydout(cct,10) << "---- " << c << dendl; + c->complete(result); + } +} + +class C_NoopContext : public Context { +public: + void finish(int r) override { } +}; + + +struct C_Lock : public Context { + ceph::mutex *lock; + Context *fin; + C_Lock(ceph::mutex *l, Context *c) : lock(l), fin(c) {} + ~C_Lock() override { + delete fin; + } + void finish(int r) override { + if (fin) { + std::lock_guard l{*lock}; + fin->complete(r); + fin = NULL; + } + } +}; + +/* + * C_Contexts - set of Contexts + * + * ContextType must be an ancestor class of ContextInstanceType, or the same class. 
+ * ContextInstanceType must be default-constructable. + */ +template <class ContextType, class ContextInstanceType, class Container = std::list<ContextType *>> +class C_ContextsBase : public ContextInstanceType { +public: + CephContext *cct; + Container contexts; + + C_ContextsBase(CephContext *cct_) + : cct(cct_) + { + } + ~C_ContextsBase() override { + for (auto c : contexts) { + delete c; + } + } + void add(ContextType* c) { + contexts.push_back(c); + } + void take(Container& ls) { + Container c; + c.swap(ls); + if constexpr (std::is_same_v<Container, std::list<ContextType *>>) { + contexts.splice(contexts.end(), c); + } else { + contexts.insert(contexts.end(), c.begin(), c.end()); + } + } + void complete(int r) override { + // Neuter any ContextInstanceType custom complete(), because although + // I want to look like it, I don't actually want to run its code. + Context::complete(r); + } + void finish(int r) override { + finish_contexts(cct, contexts, r); + } + bool empty() { return contexts.empty(); } + + template<class C> + static ContextType *list_to_context(C& cs) { + if (cs.size() == 0) { + return 0; + } else if (cs.size() == 1) { + ContextType *c = cs.front(); + cs.clear(); + return c; + } else { + C_ContextsBase<ContextType, ContextInstanceType> *c(new C_ContextsBase<ContextType, ContextInstanceType>(0)); + c->take(cs); + return c; + } + } +}; + +typedef C_ContextsBase<Context, Context> C_Contexts; + +/* + * C_Gather + * + * ContextType must be an ancestor class of ContextInstanceType, or the same class. + * ContextInstanceType must be default-constructable. + * + * BUG:? 
only reports error from last sub to have an error return + */ +template <class ContextType, class ContextInstanceType> +class C_GatherBase { +private: + CephContext *cct; + int result = 0; + ContextType *onfinish; +#ifdef DEBUG_GATHER + std::set<ContextType*> waitfor; +#endif + int sub_created_count = 0; + int sub_existing_count = 0; + mutable ceph::recursive_mutex lock = + ceph::make_recursive_mutex("C_GatherBase::lock"); // disable lockdep + bool activated = false; + + void sub_finish(ContextType* sub, int r) { + lock.lock(); +#ifdef DEBUG_GATHER + ceph_assert(waitfor.count(sub)); + waitfor.erase(sub); +#endif + --sub_existing_count; + mydout(cct,10) << "C_GatherBase " << this << ".sub_finish(r=" << r << ") " << sub +#ifdef DEBUG_GATHER + << " (remaining " << waitfor << ")" +#endif + << dendl; + if (r < 0 && result == 0) + result = r; + if ((activated == false) || (sub_existing_count != 0)) { + lock.unlock(); + return; + } + lock.unlock(); + delete_me(); + } + + void delete_me() { + if (onfinish) { + onfinish->complete(result); + onfinish = 0; + } + delete this; + } + + class C_GatherSub : public ContextInstanceType { + C_GatherBase *gather; + public: + C_GatherSub(C_GatherBase *g) : gather(g) {} + void complete(int r) override { + // Cancel any customized complete() functionality + // from the Context subclass we're templated for, + // we only want to hit that in onfinish, not at each + // sub finish. e.g. MDSInternalContext. 
+ Context::complete(r); + } + void finish(int r) override { + gather->sub_finish(this, r); + gather = 0; + } + ~C_GatherSub() override { + if (gather) + gather->sub_finish(this, 0); + } + }; + +public: + C_GatherBase(CephContext *cct_, ContextType *onfinish_) + : cct(cct_), onfinish(onfinish_) + { + mydout(cct,10) << "C_GatherBase " << this << ".new" << dendl; + } + ~C_GatherBase() { + mydout(cct,10) << "C_GatherBase " << this << ".delete" << dendl; + } + void set_finisher(ContextType *onfinish_) { + std::lock_guard l{lock}; + ceph_assert(!onfinish); + onfinish = onfinish_; + } + void activate() { + lock.lock(); + ceph_assert(activated == false); + activated = true; + if (sub_existing_count != 0) { + lock.unlock(); + return; + } + lock.unlock(); + delete_me(); + } + ContextType *new_sub() { + std::lock_guard l{lock}; + ceph_assert(activated == false); + sub_created_count++; + sub_existing_count++; + ContextType *s = new C_GatherSub(this); +#ifdef DEBUG_GATHER + waitfor.insert(s); +#endif + mydout(cct,10) << "C_GatherBase " << this << ".new_sub is " << sub_created_count << " " << s << dendl; + return s; + } + + inline int get_sub_existing_count() const { + std::lock_guard l{lock}; + return sub_existing_count; + } + + inline int get_sub_created_count() const { + std::lock_guard l{lock}; + return sub_created_count; + } +}; + +/* + * The C_GatherBuilder remembers each C_Context created by + * C_GatherBuilder.new_sub() in a C_Gather. When a C_Context created + * by new_sub() is complete(), C_Gather forgets about it. When + * C_GatherBuilder notices that there are no C_Context left in + * C_Gather, it calls complete() on the C_Context provided as the + * second argument of the constructor (finisher). + * + * How to use C_GatherBuilder: + * + * 1. Create a C_GatherBuilder on the stack + * 2. Call gather_bld.new_sub() as many times as you want to create new subs + * It is safe to call this 0 times, or 100, or anything in between. + * 3. 
If you didn't supply a finisher in the C_GatherBuilder constructor, + * set one with gather_bld.set_finisher(my_finisher) + * 4. Call gather_bld.activate() + * + * Example: + * + * C_SaferCond all_done; + * C_GatherBuilder gb(g_ceph_context, all_done); + * j.submit_entry(1, first, 0, gb.new_sub()); // add a C_Context to C_Gather + * j.submit_entry(2, first, 0, gb.new_sub()); // add a C_Context to C_Gather + * gb.activate(); // consume C_Context as soon as they complete() + * all_done.wait(); // all_done is complete() after all new_sub() are complete() + * + * The finisher may be called at any point after step 4, including immediately + * from the activate() function. + * The finisher will never be called before activate(). + * + * Note: Currently, subs must be manually freed by the caller (for some reason.) + */ +template <class ContextType, class GatherType> +class C_GatherBuilderBase +{ +public: + C_GatherBuilderBase(CephContext *cct_) + : cct(cct_), c_gather(NULL), finisher(NULL), activated(false) + { + } + C_GatherBuilderBase(CephContext *cct_, ContextType *finisher_) + : cct(cct_), c_gather(NULL), finisher(finisher_), activated(false) + { + } + ~C_GatherBuilderBase() { + if (c_gather) { + ceph_assert(activated); // Don't forget to activate your C_Gather! 
+ } + else { + delete finisher; + } + } + ContextType *new_sub() { + if (!c_gather) { + c_gather = new GatherType(cct, finisher); + } + return c_gather->new_sub(); + } + void activate() { + if (!c_gather) + return; + ceph_assert(finisher != NULL); + activated = true; + c_gather->activate(); + } + void set_finisher(ContextType *finisher_) { + finisher = finisher_; + if (c_gather) + c_gather->set_finisher(finisher); + } + GatherType *get() const { + return c_gather; + } + bool has_subs() const { + return (c_gather != NULL); + } + int num_subs_created() { + ceph_assert(!activated); + if (c_gather == NULL) + return 0; + return c_gather->get_sub_created_count(); + } + int num_subs_remaining() { + ceph_assert(!activated); + if (c_gather == NULL) + return 0; + return c_gather->get_sub_existing_count(); + } + +private: + CephContext *cct; + GatherType *c_gather; + ContextType *finisher; + bool activated; +}; + +typedef C_GatherBase<Context, Context> C_Gather; +typedef C_GatherBuilderBase<Context, C_Gather > C_GatherBuilder; + +template <class ContextType> +class ContextFactory { +public: + virtual ~ContextFactory() {} + virtual ContextType *build() = 0; +}; + +inline auto lambdafy(Context *c) { + return [fin = std::unique_ptr<Context>(c)] + (boost::system::error_code ec) mutable { + fin.release()->complete(ceph::from_error_code(ec)); + }; +} + + +#undef mydout + +#endif diff --git a/src/include/Distribution.h b/src/include/Distribution.h new file mode 100644 index 000000000..56e998757 --- /dev/null +++ b/src/include/Distribution.h @@ -0,0 +1,73 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net> + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. 
+ * + */ + + +#ifndef CEPH_DISTRIBUTION_H +#define CEPH_DISTRIBUTION_H + +#include <vector> + +class Distribution { + std::vector<float> p; + std::vector<int> v; + + public: + //Distribution() { + //} + + unsigned get_width() { + return p.size(); + } + + void clear() { + p.clear(); + v.clear(); + } + void add(int val, float pr) { + p.push_back(pr); + v.push_back(val); + } + + void random() { + float sum = 0.0; + for (unsigned i=0; i<p.size(); i++) { + p[i] = (float)(rand() % 10000); + sum += p[i]; + } + for (unsigned i=0; i<p.size(); i++) + p[i] /= sum; + } + + int sample() { + float s = (float)(rand() % 10000) / 10000.0; + for (unsigned i=0; i<p.size(); i++) { + if (s < p[i]) return v[i]; + s -= p[i]; + } + ceph_abort(); + return v[p.size() - 1]; // hmm. :/ + } + + float normalize() { + float s = 0.0; + for (unsigned i=0; i<p.size(); i++) + s += p[i]; + for (unsigned i=0; i<p.size(); i++) + p[i] /= s; + return s; + } + +}; + +#endif diff --git a/src/include/addr_parsing.h b/src/include/addr_parsing.h new file mode 100644 index 000000000..c205ac75f --- /dev/null +++ b/src/include/addr_parsing.h @@ -0,0 +1,28 @@ +/* + * addr_parsing.h + * + * Created on: Sep 14, 2010 + * Author: gregf + * contains functions used by Ceph to convert named addresses + * (eg ceph.com) into IP addresses (ie 127.0.0.1). 
+ */ + +#ifndef ADDR_PARSING_H_ +#define ADDR_PARSING_H_ + +#ifdef __cplusplus +extern "C" { +#endif + +int safe_cat(char **pstr, int *plen, int pos, const char *str2); + +/* + * returns a string allocated by malloc; caller must free + */ +char *resolve_addrs(const char *orig_str); + +#ifdef __cplusplus +} +#endif + +#endif /* ADDR_PARSING_H_ */ diff --git a/src/include/alloc_ptr.h b/src/include/alloc_ptr.h new file mode 100644 index 000000000..258c58338 --- /dev/null +++ b/src/include/alloc_ptr.h @@ -0,0 +1,91 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2017 Red Hat, Inc. + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. + * + */ + +#ifndef CEPH_ALLOC_PTR_H +#define CEPH_ALLOC_PTR_H + +#include <memory> + +template <class T> +class alloc_ptr +{ +public: + typedef typename std::pointer_traits< std::unique_ptr<T> >::pointer pointer; + typedef typename std::pointer_traits< std::unique_ptr<T> >::element_type element_type; + + alloc_ptr() : ptr() {} + + template<class U> + alloc_ptr(U&& u) : ptr(std::forward<U>(u)) {} + + alloc_ptr(alloc_ptr<pointer>&& rhs) : ptr(std::move(rhs.ptr)) {} + alloc_ptr(const alloc_ptr<pointer>& rhs) = delete; + alloc_ptr& operator=(const alloc_ptr<pointer>&& rhs) { + ptr = rhs.ptr; + } + alloc_ptr& operator=(const alloc_ptr<pointer>& rhs) { + ptr = rhs.ptr; + } + + void swap (alloc_ptr<pointer>& rhs) { + ptr.swap(rhs.ptr); + } + element_type* release() { + return ptr.release(); + } + void reset(element_type *p = nullptr) { + ptr.reset(p); + } + element_type* get() const { + if (!ptr) + ptr.reset(new element_type); + return ptr.get(); + } + element_type& operator*() const { + if (!ptr) + ptr.reset(new element_type); + return *ptr; + } + element_type* 
operator->() const { + if (!ptr) + ptr.reset(new element_type); + return ptr.get(); + } + operator bool() const { + return !!ptr; + } + + friend bool operator< (const alloc_ptr& lhs, const alloc_ptr& rhs) { + return std::less<element_type>(*lhs, *rhs); + } + friend bool operator<=(const alloc_ptr& lhs, const alloc_ptr& rhs) { + return std::less_equal<element_type>(*lhs, *rhs); + } + friend bool operator> (const alloc_ptr& lhs, const alloc_ptr& rhs) { + return std::greater<element_type>(*lhs, *rhs); + } + friend bool operator>=(const alloc_ptr& lhs, const alloc_ptr& rhs) { + return std::greater_equal<element_type>(*lhs, *rhs); + } + friend bool operator==(const alloc_ptr& lhs, const alloc_ptr& rhs) { + return *lhs == *rhs; + } + friend bool operator!=(const alloc_ptr& lhs, const alloc_ptr& rhs) { + return *lhs != *rhs; + } +private: + mutable std::unique_ptr<element_type> ptr; +}; + +#endif diff --git a/src/include/any.h b/src/include/any.h new file mode 100644 index 000000000..da59c88f4 --- /dev/null +++ b/src/include/any.h @@ -0,0 +1,704 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2018 Adam C. Emerson <aemerson@redhat.com> + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. + * + */ + +#ifndef INCLUDE_STATIC_ANY +#define INCLUDE_STATIC_ANY + +#include <any> +#include <cstddef> +#include <initializer_list> +#include <memory> +#include <typeinfo> +#include <type_traits> + +#include <boost/smart_ptr/shared_ptr.hpp> +#include <boost/smart_ptr/make_shared.hpp> + +namespace ceph { + +namespace _any { + +// Shared Functionality +// -------------------- +// +// Common implementation details. Most functionality is here. We +// assume that destructors do not throw. 
Some of them might and +// they'll invoke terminate and that's fine. +// +// We are using the Curiously Recurring Template Pattern! We require +// that all classes inheriting from us provide: +// +// - `static constexpr size_t capacity`: Maximum capacity. No object +// larger than this may be +// stored. `dynamic` for dynamic. +// - `void* ptr() const noexcept`: returns a pointer to storage. +// (`alloc_storage` must have been called. +// `free_storage` must not have been called +// since.) +// - `void* alloc_storage(const std::size_t)`: allocate storage +// - `void free_storage() noexcept`: free storage. Must be idempotent. +// +// We provide most of the public interface, as well as the operator function, +// cast_helper, and the type() call. + +// Set `capacity` to this value to indicate that there is no fixed +// capacity. +// +inline constexpr std::size_t dynamic = ~0; + +// Driver Function +// --------------- +// +// The usual type-erasure control function trick. This one is simpler +// than usual since we punt on moving and copying. We could dispense +// with this and just store a deleter and a pointer to a typeinfo, but +// that would be twice the space. +// +// Moved out here so the type of `func_t` isn't dependent on the +// enclosing class. +// +enum class op { type, destroy }; +template<typename T> +inline void op_func(const op o, void* p) noexcept { + static const std::type_info& type = typeid(T); + switch (o) { + case op::type: + *(reinterpret_cast<const std::type_info**>(p)) = &type; + break; + case op::destroy: + reinterpret_cast<T*>(p)->~T(); + break; + } +} +using func_t = void (*)(const op, void* p) noexcept; + +// The base class +// -------------- +// +// The `storage_t` parameter gives the type of the value that manages +// storage and allocation. We use it to create a protected data member +// (named `storage`). 
This allows us to sidestep the problem in +// initialization order where, where exposed constructors were using +// trying to allocate or free storage *before* the data members of the +// derived class were initialized. +// +// Making storage_t a member type of the derived class won't work, due +// to C++'s rules for nested types being *horrible*. Just downright +// *horrible*. +// +template<typename D, typename storage_t> +class base { + // Make definitions from our superclass visible + // -------------------------------------------- + // + // And check that they fit the requirements. At least those that are + // statically checkable. + // + static constexpr std::size_t capacity = D::capacity; + + void* ptr() const noexcept { + static_assert( + noexcept(static_cast<const D*>(this)->ptr()) && + std::is_same_v<decltype(static_cast<const D*>(this)->ptr()), void*>, + "‘void* ptr() const noexcept’ missing from superclass"); + return static_cast<const D*>(this)->ptr(); + } + + void* alloc_storage(const std::size_t z) { + static_assert( + std::is_same_v<decltype(static_cast<D*>(this)->alloc_storage(z)), void*>, + "‘void* alloc_storage(const size_t)’ missing from superclass."); + return static_cast<D*>(this)->alloc_storage(z); + } + + void free_storage() noexcept { + static_assert( + noexcept(static_cast<D*>(this)->free_storage()) && + std::is_void_v<decltype(static_cast<D*>(this)->free_storage())>, + "‘void free_storage() noexcept’ missing from superclass."); + static_cast<D*>(this)->free_storage(); + } + + + // Pile O' Templates + // ----------------- + // + // These are just verbose and better typed once than twice. They're + // used for SFINAE and declaring noexcept. 
+ // + template<class T> + struct is_in_place_type_helper : std::false_type {}; + template<class T> + struct is_in_place_type_helper<std::in_place_type_t<T>> : std::true_type {}; + + template<class T> + static constexpr bool is_in_place_type_v = + is_in_place_type_helper<std::decay_t<T>>::value; + + // SFINAE condition for value initialized + // constructors/assigners. This is analogous to the standard's + // requirement that this overload only participate in overload + // resolution if std::decay_t<T> is not the same type as the + // any-type, nor a specialization of std::in_place_type_t + // + template<typename T> + using value_condition_t = std::enable_if_t< + !std::is_same_v<std::decay_t<T>, D> && + !is_in_place_type_v<std::decay_t<T>>>; + + // This `noexcept` condition for value construction lets + // `immobile_any`'s value constructor/assigner be noexcept, so long + // as the type's copy or move constructor cooperates. + // + template<typename T> + static constexpr bool value_noexcept_v = + std::is_nothrow_constructible_v<std::decay_t<T>, T> && capacity != dynamic; + + // SFINAE condition for in-place constructors/assigners + // + template<typename T, typename... Args> + using in_place_condition_t = std::enable_if_t<std::is_constructible_v< + std::decay_t<T>, Args...>>; + + // Analogous to the above. Give noexcept to immobile_any::emplace + // when possible. + // + template<typename T, typename... Args> + static constexpr bool in_place_noexcept_v = + std::is_nothrow_constructible_v<std::decay_t<T>, Args...> && + capacity != dynamic; + +private: + + // Functionality! + // -------------- + + // The driver function for the currently stored object. Whether this + // is null is the canonical way to know whether an instance has a + // value. + // + func_t func = nullptr; + + // Construct an object within ourselves. As you can see we give the + // weak exception safety guarantee. 
+ // + template<typename T, typename ...Args> + std::decay_t<T>& construct(Args&& ...args) { + using Td = std::decay_t<T>; + static_assert(capacity == dynamic || sizeof(Td) <= capacity, + "Supplied type is too large for this specialization."); + try { + func = &op_func<Td>; + return *new (reinterpret_cast<Td*>(alloc_storage(sizeof(Td)))) + Td(std::forward<Args>(args)...); + } catch (...) { + reset(); + throw; + } + } + +protected: + + // We hold the storage, even if the superclass class manipulates it, + // so that its default initialization comes soon enough for us to + // use it in our constructors. + // + storage_t storage; + +public: + + base() noexcept = default; + ~base() noexcept { + reset(); + } + +protected: + // Since some of our derived classes /can/ be copied or moved. + // + base(const base& rhs) noexcept : func(rhs.func) { + if constexpr (std::is_copy_assignable_v<storage_t>) { + storage = rhs.storage; + } + } + base& operator =(const base& rhs) noexcept { + reset(); + func = rhs.func; + if constexpr (std::is_copy_assignable_v<storage_t>) { + storage = rhs.storage; + } + return *this; + } + + base(base&& rhs) noexcept : func(std::move(rhs.func)) { + if constexpr (std::is_move_assignable_v<storage_t>) { + storage = std::move(rhs.storage); + } + rhs.func = nullptr; + } + base& operator =(base&& rhs) noexcept { + reset(); + func = rhs.func; + if constexpr (std::is_move_assignable_v<storage_t>) { + storage = std::move(rhs.storage); + } + rhs.func = nullptr; + return *this; + } + +public: + + // Value construct/assign + // ---------------------- + // + template<typename T, + typename = value_condition_t<T>> + base(T&& t) noexcept(value_noexcept_v<T>) { + construct<T>(std::forward<T>(t)); + } + + // On exception, *this is set to empty. 
+ // + template<typename T, + typename = value_condition_t<T>> + base& operator =(T&& t) noexcept(value_noexcept_v<T>) { + reset(); + construct<T>(std::forward<T>(t)); + return *this; + } + + // In-place construct/assign + // ------------------------- + // + // I really hate the way the C++ standard library treats references + // as if they were stepchildren in a Charles Dickens novel. I am + // quite upset that std::optional lacks a specialization for + // references. There's no legitimate reason for it. The whole + // 're-seat or refuse' debate is simply a canard. The optional is + // effectively a container, so of course it can be emptied or + // reassigned. No, pointers are not an acceptable substitute. A + // pointer gives an address in memory which may be null and which + // may represent an object or may a location in which an object is + // to be created. An optional reference, on the other hand, is a + // reference to an initialized, live object or /empty/. This is an + // obvious difference that should be communicable to any programmer + // reading the code through the type system. + // + // `std::any`, even in the case of in-place construction, + // only stores the decayed type. I suspect this was to get around + // the question of whether, for a std::any holding a T&, + // std::any_cast<T> should return a copy or throw + // std::bad_any_cast. + // + // I think the appropriate response in that case would be to make a + // copy if the type supports it and fail otherwise. Once a concrete + // type is known the problem solves itself. + // + // If one were inclined, one could easily load the driver function + // with a heavy subset of the type traits (those that depend only on + // the type in question) and simply /ask/ whether it's a reference. + // + // At the moment, I'm maintaining compatibility with the standard + // library except for copy/move semantics. + // + template<typename T, + typename... 
Args,
+	   typename = in_place_condition_t<T, Args...>>
+  base(std::in_place_type_t<T>,
+       Args&& ...args) noexcept(in_place_noexcept_v<T, Args...>) {
+    construct<T>(std::forward<Args>(args)...);
+  }
+
+  // On exception, *this is set to empty.
+  //
+  // NOTE: the SFINAE default argument here previously was
+  // `in_place_condition_t<T>`, which only verified that T is
+  // *default*-constructible; it now matches the in-place constructor
+  // above and checks constructibility from `Args...`.
+  //
+  template<typename T,
+	   typename... Args,
+	   typename = in_place_condition_t<T, Args...>>
+  std::decay_t<T>& emplace(Args&& ...args) noexcept(in_place_noexcept_v<
+						    T, Args...>) {
+    reset();
+    return construct<T>(std::forward<Args>(args)...);
+  }
+
+  // In-place construction from an initializer_list plus trailing
+  // arguments, mirroring std::any's interface.
+  //
+  template<typename T,
+	   typename U,
+	   typename... Args,
+	   typename = in_place_condition_t<T, std::initializer_list<U>,
+					   Args...>>
+  base(std::in_place_type_t<T>,
+       std::initializer_list<U> i,
+       Args&& ...args) noexcept(in_place_noexcept_v<T, std::initializer_list<U>,
+						    Args...>) {
+    construct<T>(i, std::forward<Args>(args)...);
+  }
+
+  // On exception, *this is set to empty.
+  //
+  template<typename T,
+	   typename U,
+	   typename... Args,
+	   typename = in_place_condition_t<T, std::initializer_list<U>,
+					   Args...>>
+  std::decay_t<T>& emplace(std::initializer_list<U> i,
+			   Args&& ...args) noexcept(in_place_noexcept_v<T,
+								       std::initializer_list<U>,
+								       Args...>) {
+    reset();
+    return construct<T>(i, std::forward<Args>(args)...);
+  }
+
+  // Empty ourselves, using the subclass to free any storage.
+  //
+  void reset() noexcept {
+    if (has_value()) {
+      func(op::destroy, ptr());
+      func = nullptr;
+    }
+    // free_storage() is required to be idempotent, so calling it on
+    // an already-empty object is safe.
+    free_storage();
+  }
+
+  // Swap the driver function and the storage with `rhs`. Only
+  // available when the storage type is itself swappable.
+  //
+  // NOTE: this previously used `std::enable_if<...>` (without
+  // `::type`/`_t`), which names a valid type regardless of the
+  // condition and therefore never constrained anything;
+  // `std::enable_if_t` on the deduced default `U` makes the
+  // constraint effective and SFINAE-friendly.
+  //
+  template<typename U = storage_t,
+	   typename = std::enable_if_t<std::is_swappable_v<U>>>
+  void swap(base& rhs) {
+    using std::swap;
+    swap(func, rhs.func);
+    swap(storage, rhs.storage);
+  }
+
+  // All other functions should use this function to test emptiness
+  // rather than examining `func` directly.
+  //
+  bool has_value() const noexcept {
+    return !!func;
+  }
+
+  // Returns the type of the value stored, if any. 
+ // + const std::type_info& type() const noexcept { + if (has_value()) { + const std::type_info* t; + func(op::type, reinterpret_cast<void*>(&t)); + return *t; + } else { + return typeid(void); + } + } + + template<typename T, typename U, typename V> + friend inline void* cast_helper(const base<U, V>& b) noexcept; +}; + +// Function used by all `any_cast` functions +// +// Returns a void* to the contents if they exist and match the +// requested type, otherwise `nullptr`. +// +template<typename T, typename U, typename V> +inline void* cast_helper(const base<U, V>& b) noexcept { + if (b.func && ((&op_func<T> == b.func) || + (b.type() == typeid(T)))) { + return b.ptr(); + } else { + return nullptr; + } +} +} + +// `any_cast` +// ========== +// +// Just the usual gamut of `any_cast` overloads. These get a bit +// repetitive and it would be nice to think of a way to collapse them +// down a bit. +// + +// The pointer pair! +// +template<typename T, typename U, typename V> +inline T* any_cast(_any::base<U, V>* a) noexcept { + if (a) { + return static_cast<T*>(_any::cast_helper<std::decay_t<T>>(*a)); + } + return nullptr; +} + +template<typename T, typename U, typename V> +inline const T* any_cast(const _any::base<U, V>* a) noexcept { + if (a) { + return static_cast<T*>(_any::cast_helper<std::decay_t<T>>(*a)); + } + return nullptr; +} + +// While we disallow copying the immobile any itself, we can allow +// anything with an extracted value that the type supports. 
+// +template<typename T, typename U, typename V> +inline T any_cast(_any::base<U, V>& a) { + static_assert(std::is_reference_v<T> || + std::is_copy_constructible_v<T>, + "The supplied type must be either a reference or " + "copy constructible."); + auto p = any_cast<std::decay_t<T>>(&a); + if (p) { + return static_cast<T>(*p); + } + throw std::bad_any_cast(); +} + +template<typename T, typename U, typename V> +inline T any_cast(const _any::base<U, V>& a) { + static_assert(std::is_reference_v<T> || + std::is_copy_constructible_v<T>, + "The supplied type must be either a reference or " + "copy constructible."); + auto p = any_cast<std::decay_t<T>>(&a); + if (p) { + return static_cast<T>(*p); + } + throw std::bad_any_cast(); +} + +template<typename T, typename U, typename V> +inline std::enable_if_t<(std::is_move_constructible_v<T> || + std::is_copy_constructible_v<T>) && + !std::is_rvalue_reference_v<T>, T> +any_cast(_any::base<U, V>&& a) { + auto p = any_cast<std::decay_t<T>>(&a); + if (p) { + return std::move((*p)); + } + throw std::bad_any_cast(); +} + +template<typename T, typename U, typename V> +inline std::enable_if_t<std::is_rvalue_reference_v<T>, T> +any_cast(_any::base<U, V>&& a) { + auto p = any_cast<std::decay_t<T>>(&a); + if (p) { + return static_cast<T>(*p); + } + throw std::bad_any_cast(); +} + +// `immobile_any` +// ============== +// +// Sometimes, uncopyable objects exist and I want to do things with +// them. The C++ standard library is really quite keen on insisting +// things be copyable before it deigns to work. I find this annoying. +// +// Also, the allocator, while useful, is really not considerate of +// other people's time. Every time we go to visit it, it takes us +// quite an awfully long time to get away again. As such, I've been +// trying to avoid its company whenever it is convenient and seemly. +// +// We accept any type that will fit in the declared capacity. 
You may
+// store types with throwing destructors, but terminate will be
+// invoked when they throw.
+//
+template<std::size_t S>
+class immobile_any : public _any::base<immobile_any<S>,
+                                       std::aligned_storage_t<S>> {
+  using base = _any::base<immobile_any<S>, std::aligned_storage_t<S>>;
+  friend base;
+
+  using _any::base<immobile_any<S>, std::aligned_storage_t<S>>::storage;
+
+  // Superclass requirements!
+  // ------------------------
+  //
+  // Simple as anything. We have a buffer of fixed size and return the
+  // pointer to it when asked.
+  //
+  static constexpr std::size_t capacity = S;
+  // The base class requires `void* ptr() const noexcept`, so we must
+  // cast away constness of the in-object buffer here.
+  void* ptr() const noexcept {
+    return const_cast<void*>(static_cast<const void*>(&storage));
+  }
+  // Storage is the fixed in-class buffer; nothing to allocate or free.
+  void* alloc_storage(std::size_t) noexcept {
+    return ptr();
+  }
+  void free_storage() noexcept {}
+
+  static_assert(capacity != _any::dynamic,
+                "That is not a valid size for an immobile_any.");
+
+public:
+
+  immobile_any() noexcept = default;
+
+  // Neither copyable nor movable: the stored object lives in-place
+  // and we never learned how to copy/move it (see op_func above).
+  immobile_any(const immobile_any&) = delete;
+  immobile_any& operator =(const immobile_any&) = delete;
+  immobile_any(immobile_any&&) = delete;
+  immobile_any& operator =(immobile_any&&) = delete;
+
+  using base::base;
+  using base::operator =;
+
+  void swap(immobile_any&) = delete;
+};
+
+// Convenience factory: construct a T in place, analogous to
+// std::make_any but with an explicit capacity parameter.
+template<typename T, std::size_t S, typename... Args>
+inline immobile_any<S> make_immobile_any(Args&& ...args) {
+  return immobile_any<S>(std::in_place_type<T>, std::forward<Args>(args)...);
+}
+
+// Same, with a leading initializer_list.
+template<typename T, std::size_t S, typename U, typename... Args>
+inline immobile_any<S> make_immobile_any(std::initializer_list<U> i, Args&& ...args) {
+  return immobile_any<S>(std::in_place_type<T>, i, std::forward<Args>(args)...);
+}
+
+// `unique_any`
+// ============
+//
+// Oh dear. Now we're getting back into allocation. You don't think
+// the allocator noticed all those mean things we said about it, do
+// you?
+//
+// Well. Okay, allocator. 
Sometimes when it's the middle of the night +// and you're writing template code you say things you don't exactly +// mean. If it weren't for you, we wouldn't have any memory to run all +// our programs in at all. Really, I'm just being considerate of +// *your* needs, trying to avoid having to run to you every time we +// instantiate a type, making a few that can be self-sufficient…uh… +// +// **Anyway**, this is movable but not copyable, as you should expect +// from anything with ‘unique’ in the name. +// +class unique_any : public _any::base<unique_any, std::unique_ptr<std::byte[]>> { + using base = _any::base<unique_any, std::unique_ptr<std::byte[]>>; + friend base; + + using base::storage; + + // Superclass requirements + // ----------------------- + // + // Our storage is a single chunk of RAM owned by a + // `std::unique_ptr`. + // + static constexpr std::size_t capacity = _any::dynamic; + void* ptr() const noexcept { + return static_cast<void*>(storage.get()); + return nullptr; + } + + void* alloc_storage(const std::size_t z) { + storage.reset(new std::byte[z]); + return ptr(); + } + + void free_storage() noexcept { + storage.reset(); + } + +public: + + unique_any() noexcept = default; + ~unique_any() noexcept = default; + + unique_any(const unique_any&) = delete; + unique_any& operator =(const unique_any&) = delete; + + // We can rely on the behavior of `unique_ptr` and the base class to + // give us a default move constructor that does the right thing. + // + unique_any(unique_any&& rhs) noexcept = default; + unique_any& operator =(unique_any&& rhs) = default; + + using base::base; + using base::operator =; +}; + +inline void swap(unique_any& lhs, unique_any& rhs) noexcept { + lhs.swap(rhs); +} + +template<typename T, typename... Args> +inline unique_any make_unique_any(Args&& ...args) { + return unique_any(std::in_place_type<T>, std::forward<Args>(args)...); +} + +template<typename T, typename U, typename... 
Args> +inline unique_any make_unique_any(std::initializer_list<U> i, Args&& ...args) { + return unique_any(std::in_place_type<T>, i, std::forward<Args>(args)...); +} + +// `shared_any` +// ============ +// +// Once more with feeling! +// +// This is both copyable *and* movable. In case you need that sort of +// thing. It seemed a reasonable completion. +// +class shared_any : public _any::base<shared_any, boost::shared_ptr<std::byte[]>> { + using base = _any::base<shared_any, boost::shared_ptr<std::byte[]>>; + friend base; + + using base::storage; + + // Superclass requirements + // ----------------------- + // + // Our storage is a single chunk of RAM allocated from the + // heap. This time it's owned by a `boost::shared_ptr` so we can use + // `boost::make_shared_noinit`. (This lets us get the optimization + // that allocates array and control block in one without wasting + // time on `memset`.) + // + static constexpr std::size_t capacity = _any::dynamic; + void* ptr() const noexcept { + return static_cast<void*>(storage.get()); + } + + void* alloc_storage(std::size_t n) { + storage = boost::make_shared_noinit<std::byte[]>(n); + return ptr(); + } + + void free_storage() noexcept { + storage.reset(); + } + +public: + + shared_any() noexcept = default; + ~shared_any() noexcept = default; + + shared_any(const shared_any& rhs) noexcept = default; + shared_any& operator =(const shared_any&) noexcept = default; + + shared_any(shared_any&& rhs) noexcept = default; + shared_any& operator =(shared_any&& rhs) noexcept = default; + + using base::base; + using base::operator =; +}; + +inline void swap(shared_any& lhs, shared_any& rhs) noexcept { + lhs.swap(rhs); +} + +template<typename T, typename... Args> +inline shared_any make_shared_any(Args&& ...args) { + return shared_any(std::in_place_type<T>, std::forward<Args>(args)...); +} + +template<typename T, typename U, typename... 
Args>
+inline shared_any make_shared_any(std::initializer_list<U> i, Args&& ...args) {
+  return shared_any(std::in_place_type<T>, i, std::forward<Args>(args)...);
+}
+}
+
+#endif // INCLUDE_STATIC_ANY
diff --git a/src/include/bitmapper.h b/src/include/bitmapper.h
new file mode 100644
index 000000000..5a65cc20f
--- /dev/null
+++ b/src/include/bitmapper.h
@@ -0,0 +1,48 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#ifndef CEPH_BITMAPPER_H
+#define CEPH_BITMAPPER_H
+
+// bitmapper - a thin, non-owning, bit-addressed view over a byte
+// buffer supplied by the caller. The buffer is never allocated or
+// freed here; the caller owns it and must keep it alive while the
+// bitmapper is in use.
+//
+// NOTE(review): no bounds checking is performed - indexing at or past
+// bits() reads/writes outside the buffer.
+class bitmapper {
+  char *_data;  // caller-owned backing buffer
+  int _len;     // buffer length in bytes
+
+ public:
+  bitmapper() : _data(0), _len(0) { }
+  bitmapper(char *data, int len) : _data(data), _len(len) { }
+
+  // Re-point this view at a different buffer.
+  void set_data(char *data, int len) { _data = data; _len = len; }
+
+  int bytes() const { return _len; }
+  int bits() const { return _len * 8; }
+
+  // Read bit b (bit 0 is the least-significant bit of byte 0).
+  bool operator[](int b) const {
+    return get(b);
+  }
+  bool get(int b) const {
+    return _data[b >> 3] & (1 << (b&7));
+  }
+  // Set bit b to 1.
+  void set(int b) {
+    _data[b >> 3] |= 1 << (b&7);
+  }
+  // Clear bit b to 0.
+  void clear(int b) {
+    _data[b >> 3] &= ~(1 << (b&7));
+  }
+  // Flip bit b.
+  void toggle(int b) {
+    _data[b >> 3] ^= 1 << (b&7);
+  }
+};
+
+#endif
diff --git a/src/include/blobhash.h b/src/include/blobhash.h
new file mode 100644
index 000000000..303892b13
--- /dev/null
+++ b/src/include/blobhash.h
@@ -0,0 +1,53 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published 
by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#ifndef CEPH_BLOBHASH_H
+#define CEPH_BLOBHASH_H
+
+#include <cstdint>
+#include "hash.h"
+
+// Functor hashing an arbitrary byte blob to a 32-bit value: the input
+// is XOR-folded into a 32-bit accumulator one word at a time (plus a
+// tail of up to three bytes), then mixed through rjhash.
+class blobhash {
+public:
+  uint32_t operator()(const void* p, size_t len) {
+    static rjhash<std::uint32_t> H;
+    std::uint32_t acc = 0;
+    auto buf = static_cast<const unsigned char*>(p);
+    // Fold whole 32-bit words.
+    while (len >= sizeof(acc)) {
+      acc ^= unaligned_load(buf);
+      buf += sizeof(std::uint32_t);
+      len -= sizeof(std::uint32_t);
+    }
+    // handle the last few bytes of p[-(len % 4):]
+    switch (len) {
+    case 3:
+      acc ^= buf[2] << 16;
+      [[fallthrough]];
+    case 2:
+      acc ^= buf[1] << 8;
+      [[fallthrough]];
+    case 1:
+      acc ^= buf[0];
+    }
+    return H(acc);
+  }
+private:
+  // memcpy-based load: safe for buffers with no alignment guarantee
+  // (a direct uint32_t* dereference would be UB on unaligned input).
+  static inline std::uint32_t unaligned_load(const unsigned char* p) {
+    std::uint32_t result;
+    __builtin_memcpy(&result, p, sizeof(result));
+    return result;
+  }
+};
+
+
+#endif
diff --git a/src/include/btree_map.h b/src/include/btree_map.h
new file mode 100644
index 000000000..218835a0f
--- /dev/null
+++ b/src/include/btree_map.h
@@ -0,0 +1,68 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_INCLUDE_BTREE_MAP_H
+#define CEPH_INCLUDE_BTREE_MAP_H
+
+#include "include/cpp-btree/btree.h"
+#include "include/cpp-btree/btree_map.h"
+#include "include/ceph_assert.h" // cpp-btree uses system assert, blech
+#include "include/encoding.h"
+
+// Encode a btree_map as a __u32 element count followed by each
+// key/value pair, matching the on-wire format used for std::map.
+template<class T, class U>
+inline void encode(const btree::btree_map<T,U>& m, ceph::buffer::list& bl)
+{
+  using ceph::encode;
+  __u32 n = (__u32)(m.size());
+  encode(n, bl);
+  for (typename btree::btree_map<T,U>::const_iterator p = m.begin(); p != m.end(); ++p) {
+    encode(p->first, bl);
+    encode(p->second, bl);
+  }
+}
+// Feature-aware variant: forwards `features` to each element's encoder.
+template<class T, class U>
+inline void encode(const btree::btree_map<T,U>& m, ceph::buffer::list& bl, uint64_t features)
+{
+  using ceph::encode;
+  __u32 n = (__u32)(m.size());
+  encode(n, bl);
+  for (typename btree::btree_map<T,U>::const_iterator p = 
m.begin(); p != m.end(); ++p) {
+    encode(p->first, bl, features);
+    encode(p->second, bl, features);
+  }
+}
+// Decode counterpart of encode(): reads a __u32 count, then that many
+// key/value pairs, replacing m's previous contents.
+template<class T, class U>
+inline void decode(btree::btree_map<T,U>& m, ceph::buffer::list::const_iterator& p)
+{
+  using ceph::decode;
+  __u32 n;
+  decode(n, p);
+  m.clear();
+  while (n--) {
+    T k;
+    decode(k, p);
+    decode(m[k], p);
+  }
+}
+// Like encode() but without the leading element count; the caller is
+// responsible for recording how many elements were written.
+template<class T, class U>
+inline void encode_nohead(const btree::btree_map<T,U>& m, ceph::buffer::list& bl)
+{
+  using ceph::encode;
+  for (typename btree::btree_map<T,U>::const_iterator p = m.begin(); p != m.end(); ++p) {
+    encode(p->first, bl);
+    encode(p->second, bl);
+  }
+}
+// Decode exactly n key/value pairs written by encode_nohead(),
+// replacing m's previous contents.
+template<class T, class U>
+inline void decode_nohead(int n, btree::btree_map<T,U>& m, ceph::buffer::list::const_iterator& p)
+{
+  using ceph::decode;
+  m.clear();
+  while (n--) {
+    T k;
+    decode(k, p);
+    decode(m[k], p);
+  }
+}
+
+#endif
diff --git a/src/include/buffer.h b/src/include/buffer.h
new file mode 100644
index 000000000..10dceaec2
--- /dev/null
+++ b/src/include/buffer.h
@@ -0,0 +1,1294 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+#ifndef CEPH_BUFFER_H
+#define CEPH_BUFFER_H
+
+#if defined(__linux__) || defined(__FreeBSD__)
+#include <stdlib.h>
+#endif
+#include <limits.h>
+
+#ifndef _XOPEN_SOURCE
+# define _XOPEN_SOURCE 600
+#endif
+
+#include <stdio.h>
+#include <sys/uio.h>
+
+#if defined(__linux__)	// For malloc(2). 
+#include <malloc.h> +#endif + +#include <inttypes.h> +#include <stdint.h> +#include <string.h> + +#if !defined(__CYGWIN__) && !defined(_WIN32) +# include <sys/mman.h> +#endif + +#include <iosfwd> +#include <iomanip> +#include <list> +#include <memory> +#include <vector> +#include <string> +#if __cplusplus >= 201703L +#include <string_view> +#endif // __cplusplus >= 201703L + +#include <exception> +#include <type_traits> + +#include "page.h" +#include "crc32c.h" +#include "buffer_fwd.h" + + +#ifdef __CEPH__ +# include "include/ceph_assert.h" +#else +# include <assert.h> +#endif + +#include "inline_memory.h" + +#define CEPH_BUFFER_API + +#ifdef HAVE_SEASTAR +namespace seastar { +template <typename T> class temporary_buffer; +namespace net { +class packet; +} +} +#endif // HAVE_SEASTAR +class deleter; + +template<typename T> class DencDumper; + +namespace ceph { + +template <class T> +struct nop_delete { + void operator()(T*) {} +}; + +// This is not unique_ptr-like smart pointer! It just signalizes ownership +// but DOES NOT manage the resource. It WILL LEAK if not manually deleted. +// It's rather a replacement for raw pointer than any other smart one. +// +// Considered options: +// * unique_ptr with custom deleter implemented in .cc (would provide +// the non-zero-cost resource management), +// * GSL's owner<T*> (pretty neat but would impose an extra depedency), +// * unique_ptr with nop deleter, +// * raw pointer (doesn't embed ownership enforcement - std::move). 
+template <class T> +struct unique_leakable_ptr : public std::unique_ptr<T, ceph::nop_delete<T>> { + using std::unique_ptr<T, ceph::nop_delete<T>>::unique_ptr; +}; + +namespace buffer CEPH_BUFFER_API { +inline namespace v15_2_0 { + +/// Actual definitions in common/error_code.h +struct error; +struct bad_alloc; +struct end_of_buffer; +struct malformed_input; +struct error_code; + + /// count of cached crc hits (matching input) + int get_cached_crc(); + /// count of cached crc hits (mismatching input, required adjustment) + int get_cached_crc_adjusted(); + /// count of crc cache misses + int get_missed_crc(); + /// enable/disable tracking of cached crcs + void track_cached_crc(bool b); + + /* + * an abstract raw buffer. with a reference count. + */ + class raw; + class raw_malloc; + class raw_static; + class raw_posix_aligned; + class raw_hack_aligned; + class raw_claimed_char; + class raw_unshareable; // diagnostic, unshareable char buffer + class raw_combined; + class raw_claim_buffer; + + + /* + * named constructors + */ + ceph::unique_leakable_ptr<raw> copy(const char *c, unsigned len); + ceph::unique_leakable_ptr<raw> create(unsigned len); + ceph::unique_leakable_ptr<raw> create(unsigned len, char c); + ceph::unique_leakable_ptr<raw> create_in_mempool(unsigned len, int mempool); + ceph::unique_leakable_ptr<raw> claim_char(unsigned len, char *buf); + ceph::unique_leakable_ptr<raw> create_malloc(unsigned len); + ceph::unique_leakable_ptr<raw> claim_malloc(unsigned len, char *buf); + ceph::unique_leakable_ptr<raw> create_static(unsigned len, char *buf); + ceph::unique_leakable_ptr<raw> create_aligned(unsigned len, unsigned align); + ceph::unique_leakable_ptr<raw> create_aligned_in_mempool(unsigned len, unsigned align, int mempool); + ceph::unique_leakable_ptr<raw> create_page_aligned(unsigned len); + ceph::unique_leakable_ptr<raw> create_small_page_aligned(unsigned len); + ceph::unique_leakable_ptr<raw> claim_buffer(unsigned len, char *buf, deleter del); + +#ifdef 
HAVE_SEASTAR + /// create a raw buffer to wrap seastar cpu-local memory, using foreign_ptr to + /// make it safe to share between cpus + ceph::unique_leakable_ptr<buffer::raw> create(seastar::temporary_buffer<char>&& buf); + /// create a raw buffer to wrap seastar cpu-local memory, without the safety + /// of foreign_ptr. the caller must otherwise guarantee that the buffer ptr is + /// destructed on this cpu + ceph::unique_leakable_ptr<buffer::raw> create_local(seastar::temporary_buffer<char>&& buf); +#endif + + /* + * a buffer pointer. references (a subsequence of) a raw buffer. + */ + class CEPH_BUFFER_API ptr { + friend class list; + protected: + raw *_raw; + unsigned _off, _len; + private: + + void release(); + + template<bool is_const> + class iterator_impl { + const ptr *bp; ///< parent ptr + const char *start; ///< starting pointer into bp->c_str() + const char *pos; ///< pointer into bp->c_str() + const char *end_ptr; ///< pointer to bp->end_c_str() + const bool deep; ///< if true, do not allow shallow ptr copies + + iterator_impl(typename std::conditional<is_const, const ptr*, ptr*>::type p, + size_t offset, bool d) + : bp(p), + start(p->c_str() + offset), + pos(start), + end_ptr(p->end_c_str()), + deep(d) + {} + + friend class ptr; + + public: + using pointer = typename std::conditional<is_const, const char*, char *>::type; + pointer get_pos_add(size_t n) { + auto r = pos; + *this += n; + return r; + } + ptr get_ptr(size_t len) { + if (deep) { + return buffer::copy(get_pos_add(len), len); + } else { + size_t off = pos - bp->c_str(); + *this += len; + return ptr(*bp, off, len); + } + } + + iterator_impl& operator+=(size_t len); + + const char *get_pos() { + return pos; + } + const char *get_end() { + return end_ptr; + } + + size_t get_offset() { + return pos - start; + } + + bool end() const { + return pos == end_ptr; + } + }; + + public: + using const_iterator = iterator_impl<true>; + using iterator = iterator_impl<false>; + + ptr() : _raw(nullptr), 
_off(0), _len(0) {} + ptr(ceph::unique_leakable_ptr<raw> r); + // cppcheck-suppress noExplicitConstructor + ptr(unsigned l); + ptr(const char *d, unsigned l); + ptr(const ptr& p); + ptr(ptr&& p) noexcept; + ptr(const ptr& p, unsigned o, unsigned l); + ptr(const ptr& p, ceph::unique_leakable_ptr<raw> r); + ptr& operator= (const ptr& p); + ptr& operator= (ptr&& p) noexcept; + ~ptr() { + // BE CAREFUL: this destructor is called also for hypercombined ptr_node. + // After freeing underlying raw, `*this` can become inaccessible as well! + release(); + } + + bool have_raw() const { return _raw ? true:false; } + + void swap(ptr& other) noexcept; + + iterator begin(size_t offset=0) { + return iterator(this, offset, false); + } + const_iterator begin(size_t offset=0) const { + return const_iterator(this, offset, false); + } + const_iterator cbegin() const { + return begin(); + } + const_iterator begin_deep(size_t offset=0) const { + return const_iterator(this, offset, true); + } + + // misc + bool is_aligned(unsigned align) const { + return ((uintptr_t)c_str() & (align-1)) == 0; + } + bool is_page_aligned() const { return is_aligned(CEPH_PAGE_SIZE); } + bool is_n_align_sized(unsigned align) const + { + return (length() % align) == 0; + } + bool is_n_page_sized() const { return is_n_align_sized(CEPH_PAGE_SIZE); } + bool is_partial() const { + return have_raw() && (start() > 0 || end() < raw_length()); + } + + int get_mempool() const; + void reassign_to_mempool(int pool); + void try_assign_to_mempool(int pool); + + // accessors + const char *c_str() const; + char *c_str(); + const char *end_c_str() const; + char *end_c_str(); + unsigned length() const { return _len; } + unsigned offset() const { return _off; } + unsigned start() const { return _off; } + unsigned end() const { return _off + _len; } + unsigned unused_tail_length() const; + const char& operator[](unsigned n) const; + char& operator[](unsigned n); + + const char *raw_c_str() const; + unsigned raw_length() const; 
+ int raw_nref() const; + + void copy_out(unsigned o, unsigned l, char *dest) const; + + unsigned wasted() const; + + int cmp(const ptr& o) const; + bool is_zero() const; + + // modifiers + void set_offset(unsigned o) { +#ifdef __CEPH__ + ceph_assert(raw_length() >= o); +#else + assert(raw_length() >= o); +#endif + _off = o; + } + void set_length(unsigned l) { +#ifdef __CEPH__ + ceph_assert(raw_length() >= l); +#else + assert(raw_length() >= l); +#endif + _len = l; + } + + unsigned append(char c); + unsigned append(const char *p, unsigned l); +#if __cplusplus >= 201703L + inline unsigned append(std::string_view s) { + return append(s.data(), s.length()); + } +#endif // __cplusplus >= 201703L + void copy_in(unsigned o, unsigned l, const char *src, bool crc_reset = true); + void zero(bool crc_reset = true); + void zero(unsigned o, unsigned l, bool crc_reset = true); + unsigned append_zeros(unsigned l); + +#ifdef HAVE_SEASTAR + /// create a temporary_buffer, copying the ptr as its deleter + operator seastar::temporary_buffer<char>() &; + /// convert to temporary_buffer, stealing the ptr as its deleter + operator seastar::temporary_buffer<char>() &&; +#endif // HAVE_SEASTAR + + }; + + + struct ptr_hook { + mutable ptr_hook* next; + + ptr_hook() = default; + ptr_hook(ptr_hook* const next) + : next(next) { + } + }; + + class ptr_node : public ptr_hook, public ptr { + public: + struct cloner { + ptr_node* operator()(const ptr_node& clone_this); + }; + struct disposer { + void operator()(ptr_node* const delete_this) { + if (!__builtin_expect(dispose_if_hypercombined(delete_this), 0)) { + delete delete_this; + } + } + }; + + ~ptr_node() = default; + + static std::unique_ptr<ptr_node, disposer> + create(ceph::unique_leakable_ptr<raw> r) { + return create_hypercombined(std::move(r)); + } + static std::unique_ptr<ptr_node, disposer> + create(const unsigned l) { + return create_hypercombined(buffer::create(l)); + } + template <class... 
Args> + static std::unique_ptr<ptr_node, disposer> + create(Args&&... args) { + return std::unique_ptr<ptr_node, disposer>( + new ptr_node(std::forward<Args>(args)...)); + } + + static ptr_node* copy_hypercombined(const ptr_node& copy_this); + + private: + friend list; + + template <class... Args> + ptr_node(Args&&... args) : ptr(std::forward<Args>(args)...) { + } + ptr_node(const ptr_node&) = default; + + ptr& operator= (const ptr& p) = delete; + ptr& operator= (ptr&& p) noexcept = delete; + ptr_node& operator= (const ptr_node& p) = delete; + ptr_node& operator= (ptr_node&& p) noexcept = delete; + void swap(ptr& other) noexcept = delete; + void swap(ptr_node& other) noexcept = delete; + + static bool dispose_if_hypercombined(ptr_node* delete_this); + static std::unique_ptr<ptr_node, disposer> create_hypercombined( + ceph::unique_leakable_ptr<raw> r); + }; + /* + * list - the useful bit! + */ + + class CEPH_BUFFER_API list { + public: + // this the very low-level implementation of singly linked list + // ceph::buffer::list is built on. We don't use intrusive slist + // of Boost (or any other 3rd party) to save extra dependencies + // in our public headers. 
+ class buffers_t { + // _root.next can be thought as _head + ptr_hook _root; + ptr_hook* _tail; + + public: + template <class T> + class buffers_iterator { + typename std::conditional< + std::is_const<T>::value, const ptr_hook*, ptr_hook*>::type cur; + template <class U> friend class buffers_iterator; + public: + using value_type = T; + using reference = typename std::add_lvalue_reference<T>::type; + using pointer = typename std::add_pointer<T>::type; + using difference_type = std::ptrdiff_t; + using iterator_category = std::forward_iterator_tag; + + template <class U> + buffers_iterator(U* const p) + : cur(p) { + } + // copy constructor + buffers_iterator(const buffers_iterator<T>& other) + : cur(other.cur) { + } + // converting constructor, from iterator -> const_iterator only + template <class U, typename std::enable_if< + std::is_const<T>::value && !std::is_const<U>::value, int>::type = 0> + buffers_iterator(const buffers_iterator<U>& other) + : cur(other.cur) { + } + buffers_iterator() = default; + + T& operator*() const { + return *reinterpret_cast<T*>(cur); + } + T* operator->() const { + return reinterpret_cast<T*>(cur); + } + + buffers_iterator& operator++() { + cur = cur->next; + return *this; + } + buffers_iterator operator++(int) { + const auto temp(*this); + ++*this; + return temp; + } + + template <class U> + buffers_iterator& operator=(buffers_iterator<U>& other) { + cur = other.cur; + return *this; + } + + bool operator==(const buffers_iterator& rhs) const { + return cur == rhs.cur; + } + bool operator!=(const buffers_iterator& rhs) const { + return !(*this==rhs); + } + }; + + typedef buffers_iterator<const ptr_node> const_iterator; + typedef buffers_iterator<ptr_node> iterator; + + typedef const ptr_node& const_reference; + typedef ptr_node& reference; + + buffers_t() + : _root(&_root), + _tail(&_root) { + } + buffers_t(const buffers_t&) = delete; + buffers_t(buffers_t&& other) + : _root(other._root.next == &other._root ? 
&_root : other._root.next), + _tail(other._tail == &other._root ? &_root : other._tail) { + other._root.next = &other._root; + other._tail = &other._root; + + _tail->next = &_root; + } + buffers_t& operator=(buffers_t&& other) { + if (&other != this) { + clear_and_dispose(); + swap(other); + } + return *this; + } + + void push_back(reference item) { + item.next = &_root; + // this updates _root.next when called on empty + _tail->next = &item; + _tail = &item; + } + + void push_front(reference item) { + item.next = _root.next; + _root.next = &item; + _tail = _tail == &_root ? &item : _tail; + } + + // *_after + iterator erase_after(const_iterator it) { + const auto* to_erase = it->next; + + it->next = to_erase->next; + _root.next = _root.next == to_erase ? to_erase->next : _root.next; + _tail = _tail == to_erase ? (ptr_hook*)&*it : _tail; + return it->next; + } + + void insert_after(const_iterator it, reference item) { + item.next = it->next; + it->next = &item; + _root.next = it == end() ? &item : _root.next; + _tail = const_iterator(_tail) == it ? 
&item : _tail; + } + + void splice_back(buffers_t& other) { + if (other.empty()) { + return; + } + + other._tail->next = &_root; + // will update root.next if empty() == true + _tail->next = other._root.next; + _tail = other._tail; + + other._root.next = &other._root; + other._tail = &other._root; + } + + bool empty() const { return _tail == &_root; } + + const_iterator begin() const { + return _root.next; + } + const_iterator before_begin() const { + return &_root; + } + const_iterator end() const { + return &_root; + } + iterator begin() { + return _root.next; + } + iterator before_begin() { + return &_root; + } + iterator end() { + return &_root; + } + + reference front() { + return reinterpret_cast<reference>(*_root.next); + } + reference back() { + return reinterpret_cast<reference>(*_tail); + } + const_reference front() const { + return reinterpret_cast<const_reference>(*_root.next); + } + const_reference back() const { + return reinterpret_cast<const_reference>(*_tail); + } + + void clone_from(const buffers_t& other) { + clear_and_dispose(); + for (auto& node : other) { + ptr_node* clone = ptr_node::cloner()(node); + push_back(*clone); + } + } + void clear_and_dispose() { + ptr_node::disposer dispose; + for (auto it = begin(), e = end(); it != e; /* nop */) { + auto& node = *it++; + dispose(&node); + } + _tail = &_root; + _root.next = _tail; + } + iterator erase_after_and_dispose(iterator it) { + auto* to_dispose = &*std::next(it); + auto ret = erase_after(it); + ptr_node::disposer()(to_dispose); + return ret; + } + + void swap(buffers_t& other) { + const auto copy_root = _root; + _root.next = \ + other._root.next == &other._root ? &this->_root : other._root.next; + other._root.next = \ + copy_root.next == &_root ? &other._root : copy_root.next; + + const auto copy_tail = _tail; + _tail = other._tail == &other._root ? &this->_root : other._tail; + other._tail = copy_tail == &_root ? 
&other._root : copy_tail; + + _tail->next = &_root; + other._tail->next = &other._root; + } + }; + + class iterator; + + private: + // my private bits + buffers_t _buffers; + + // track bufferptr we can modify (especially ::append() to). Not all bptrs + // bufferlist holds have this trait -- if somebody ::push_back(const ptr&), + // he expects it won't change. + ptr_node* _carriage; + unsigned _len, _num; + + template <bool is_const> + class CEPH_BUFFER_API iterator_impl { + protected: + typedef typename std::conditional<is_const, + const list, + list>::type bl_t; + typedef typename std::conditional<is_const, + const buffers_t, + buffers_t >::type list_t; + typedef typename std::conditional<is_const, + typename buffers_t::const_iterator, + typename buffers_t::iterator>::type list_iter_t; + bl_t* bl; + list_t* ls; // meh.. just here to avoid an extra pointer dereference.. + list_iter_t p; + unsigned off; // in bl + unsigned p_off; // in *p + friend class iterator_impl<true>; + + public: + using iterator_category = std::forward_iterator_tag; + using value_type = typename std::conditional<is_const, const char, char>::type; + using difference_type = std::ptrdiff_t; + using pointer = typename std::add_pointer<value_type>::type; + using reference = typename std::add_lvalue_reference<value_type>::type; + + // constructor. position. 
+ iterator_impl()
+ : bl(0), ls(0), off(0), p_off(0) {}
+ iterator_impl(bl_t *l, unsigned o=0);
+ iterator_impl(bl_t *l, unsigned o, list_iter_t ip, unsigned po)
+ : bl(l), ls(&bl->_buffers), p(ip), off(o), p_off(po) {}
+ iterator_impl(const list::iterator& i);
+
+ /// get current iterator offset in buffer::list
+ unsigned get_off() const { return off; }
+
+ /// get number of bytes remaining from iterator position to the end of the buffer::list
+ unsigned get_remaining() const { return bl->length() - off; }
+
+ /// true if iterator is at the end of the buffer::list
+ bool end() const {
+ return p == ls->end();
+ //return off == bl->length();
+ }
+ void seek(unsigned o);
+ char operator*() const;
+ iterator_impl& operator+=(unsigned o);
+ iterator_impl& operator++();
+ ptr get_current_ptr() const;
+ bool is_pointing_same_raw(const ptr& other) const;
+
+ bl_t& get_bl() const { return *bl; }
+
+ // copy data out.
+ // note that these all _append_ to dest!
+ void copy(unsigned len, char *dest);
+ // deprecated, use copy_deep()
+ void copy(unsigned len, ptr &dest) __attribute__((deprecated));
+ void copy_deep(unsigned len, ptr &dest);
+ void copy_shallow(unsigned len, ptr &dest);
+ void copy(unsigned len, list &dest);
+ void copy(unsigned len, std::string &dest);
+ void copy_all(list &dest);
+
+ // get a pointer to the current iterator position, return the
+ // number of bytes we can read from that position (up to want),
+ // and advance the iterator by that amount. 
+ size_t get_ptr_and_advance(size_t want, const char **p); + + /// calculate crc from iterator position + uint32_t crc32c(size_t length, uint32_t crc); + + friend bool operator==(const iterator_impl& lhs, + const iterator_impl& rhs) { + return &lhs.get_bl() == &rhs.get_bl() && lhs.get_off() == rhs.get_off(); + } + friend bool operator!=(const iterator_impl& lhs, + const iterator_impl& rhs) { + return &lhs.get_bl() != &rhs.get_bl() || lhs.get_off() != rhs.get_off(); + } + }; + + public: + typedef iterator_impl<true> const_iterator; + + class CEPH_BUFFER_API iterator : public iterator_impl<false> { + public: + iterator() = default; + iterator(bl_t *l, unsigned o=0); + iterator(bl_t *l, unsigned o, list_iter_t ip, unsigned po); + // copy data in + void copy_in(unsigned len, const char *src, bool crc_reset = true); + void copy_in(unsigned len, const list& otherl); + }; + + struct reserve_t { + char* bp_data; + unsigned* bp_len; + unsigned* bl_len; + }; + + class contiguous_appender { + ceph::bufferlist& bl; + ceph::bufferlist::reserve_t space; + char* pos; + bool deep; + + /// running count of bytes appended that are not reflected by @pos + size_t out_of_band_offset = 0; + + contiguous_appender(bufferlist& bl, size_t len, bool d) + : bl(bl), + space(bl.obtain_contiguous_space(len)), + pos(space.bp_data), + deep(d) { + } + + void flush_and_continue() { + const size_t l = pos - space.bp_data; + *space.bp_len += l; + *space.bl_len += l; + space.bp_data = pos; + } + + friend class list; + template<typename Type> friend class ::DencDumper; + + public: + ~contiguous_appender() { + flush_and_continue(); + } + + size_t get_out_of_band_offset() const { + return out_of_band_offset; + } + void append(const char* __restrict__ p, size_t l) { + maybe_inline_memcpy(pos, p, l, 16); + pos += l; + } + char *get_pos_add(size_t len) { + char *r = pos; + pos += len; + return r; + } + char *get_pos() const { + return pos; + } + + void append(const bufferptr& p) { + const auto plen = 
p.length(); + if (!plen) { + return; + } + if (deep) { + append(p.c_str(), plen); + } else { + flush_and_continue(); + bl.append(p); + space = bl.obtain_contiguous_space(0); + out_of_band_offset += plen; + } + } + void append(const bufferlist& l) { + if (deep) { + for (const auto &p : l._buffers) { + append(p.c_str(), p.length()); + } + } else { + flush_and_continue(); + bl.append(l); + space = bl.obtain_contiguous_space(0); + out_of_band_offset += l.length(); + } + } + + size_t get_logical_offset() const { + return out_of_band_offset + (pos - space.bp_data); + } + }; + + contiguous_appender get_contiguous_appender(size_t len, bool deep=false) { + return contiguous_appender(*this, len, deep); + } + + class contiguous_filler { + friend buffer::list; + char* pos; + + contiguous_filler(char* const pos) : pos(pos) {} + + public: + void advance(const unsigned len) { + pos += len; + } + void copy_in(const unsigned len, const char* const src) { + memcpy(pos, src, len); + advance(len); + } + char* c_str() { + return pos; + } + }; + // The contiguous_filler is supposed to be not costlier than a single + // pointer. Keep it dumb, please. + static_assert(sizeof(contiguous_filler) == sizeof(char*), + "contiguous_filler should be no costlier than pointer"); + + class page_aligned_appender { + bufferlist& bl; + unsigned min_alloc; + + page_aligned_appender(list *l, unsigned min_pages) + : bl(*l), + min_alloc(min_pages * CEPH_PAGE_SIZE) { + } + + void _refill(size_t len); + + template <class Func> + void _append_common(size_t len, Func&& impl_f) { + const auto free_in_last = bl.get_append_buffer_unused_tail_length(); + const auto first_round = std::min(len, free_in_last); + if (first_round) { + impl_f(first_round); + } + // no C++17 for the sake of the C++11 guarantees of librados, sorry. 
+ const auto second_round = len - first_round; + if (second_round) { + _refill(second_round); + impl_f(second_round); + } + } + + friend class list; + + public: + void append(const bufferlist& l) { + bl.append(l); + bl.obtain_contiguous_space(0); + } + + void append(const char* buf, size_t entire_len) { + _append_common(entire_len, + [buf, this] (const size_t chunk_len) mutable { + bl.append(buf, chunk_len); + buf += chunk_len; + }); + } + + void append_zero(size_t entire_len) { + _append_common(entire_len, [this] (const size_t chunk_len) { + bl.append_zero(chunk_len); + }); + } + + void substr_of(const list& bl, unsigned off, unsigned len) { + for (const auto& bptr : bl.buffers()) { + if (off >= bptr.length()) { + off -= bptr.length(); + continue; + } + const auto round_size = std::min(bptr.length() - off, len); + append(bptr.c_str() + off, round_size); + len -= round_size; + off = 0; + } + } + }; + + page_aligned_appender get_page_aligned_appender(unsigned min_pages=1) { + return page_aligned_appender(this, min_pages); + } + + private: + // always_empty_bptr has no underlying raw but its _len is always 0. + // This is useful for e.g. get_append_buffer_unused_tail_length() as + // it allows to avoid conditionals on hot paths. + static ptr_node always_empty_bptr; + ptr_node& refill_append_space(const unsigned len); + + // for page_aligned_appender; never ever expose this publicly! + // carriage / append_buffer is just an implementation's detail. 
+ ptr& get_append_buffer() { + return *_carriage; + } + + public: + // cons/des + list() + : _carriage(&always_empty_bptr), + _len(0), + _num(0) { + } + // cppcheck-suppress noExplicitConstructor + // cppcheck-suppress noExplicitConstructor + list(unsigned prealloc) + : _carriage(&always_empty_bptr), + _len(0), + _num(0) { + reserve(prealloc); + } + + list(const list& other) + : _carriage(&always_empty_bptr), + _len(other._len), + _num(other._num) { + _buffers.clone_from(other._buffers); + } + + list(list&& other) noexcept + : _buffers(std::move(other._buffers)), + _carriage(other._carriage), + _len(other._len), + _num(other._num) { + other.clear(); + } + + ~list() { + _buffers.clear_and_dispose(); + } + + list& operator= (const list& other) { + if (this != &other) { + _carriage = &always_empty_bptr; + _buffers.clone_from(other._buffers); + _len = other._len; + _num = other._num; + } + return *this; + } + list& operator= (list&& other) noexcept { + _buffers = std::move(other._buffers); + _carriage = other._carriage; + _len = other._len; + _num = other._num; + other.clear(); + return *this; + } + + uint64_t get_wasted_space() const; + unsigned get_num_buffers() const { return _num; } + const ptr_node& front() const { return _buffers.front(); } + const ptr_node& back() const { return _buffers.back(); } + + int get_mempool() const; + void reassign_to_mempool(int pool); + void try_assign_to_mempool(int pool); + + size_t get_append_buffer_unused_tail_length() const { + return _carriage->unused_tail_length(); + } + + const buffers_t& buffers() const { return _buffers; } + buffers_t& mut_buffers() { return _buffers; } + void swap(list& other) noexcept; + unsigned length() const { +#if 0 + // DEBUG: verify _len + unsigned len = 0; + for (std::list<ptr>::const_iterator it = _buffers.begin(); + it != _buffers.end(); + it++) { + len += (*it).length(); + } +#ifdef __CEPH__ + ceph_assert(len == _len); +#else + assert(len == _len); +#endif // __CEPH__ +#endif + return _len; + } 
+ + bool contents_equal(const buffer::list& other) const; + bool contents_equal(const void* other, size_t length) const; + + bool is_provided_buffer(const char *dst) const; + bool is_aligned(unsigned align) const; + bool is_page_aligned() const; + bool is_n_align_sized(unsigned align) const; + bool is_n_page_sized() const; + bool is_aligned_size_and_memory(unsigned align_size, + unsigned align_memory) const; + + bool is_zero() const; + + // modifiers + void clear() noexcept { + _carriage = &always_empty_bptr; + _buffers.clear_and_dispose(); + _len = 0; + _num = 0; + } + void push_back(const ptr& bp) { + if (bp.length() == 0) + return; + _buffers.push_back(*ptr_node::create(bp).release()); + _len += bp.length(); + _num += 1; + } + void push_back(ptr&& bp) { + if (bp.length() == 0) + return; + _len += bp.length(); + _num += 1; + _buffers.push_back(*ptr_node::create(std::move(bp)).release()); + _carriage = &always_empty_bptr; + } + void push_back(const ptr_node&) = delete; + void push_back(ptr_node&) = delete; + void push_back(ptr_node&&) = delete; + void push_back(std::unique_ptr<ptr_node, ptr_node::disposer> bp) { + _carriage = bp.get(); + _len += bp->length(); + _num += 1; + _buffers.push_back(*bp.release()); + } + void push_back(raw* const r) = delete; + void push_back(ceph::unique_leakable_ptr<raw> r) { + _buffers.push_back(*ptr_node::create(std::move(r)).release()); + _carriage = &_buffers.back(); + _len += _buffers.back().length(); + _num += 1; + } + + void zero(); + void zero(unsigned o, unsigned l); + + bool is_contiguous() const; + void rebuild(); + void rebuild(std::unique_ptr<ptr_node, ptr_node::disposer> nb); + bool rebuild_aligned(unsigned align); + // max_buffers = 0 mean don't care _buffers.size(), other + // must make _buffers.size() <= max_buffers after rebuilding. 
+ bool rebuild_aligned_size_and_memory(unsigned align_size, + unsigned align_memory, + unsigned max_buffers = 0); + bool rebuild_page_aligned(); + + void reserve(size_t prealloc); + + [[deprecated("in favor of operator=(list&&)")]] void claim(list& bl) { + *this = std::move(bl); + } + void claim_append(list& bl); + void claim_append(list&& bl) { + claim_append(bl); + } + + // copy with explicit volatile-sharing semantics + void share(const list& bl) + { + if (this != &bl) { + clear(); + for (const auto& bp : bl._buffers) { + _buffers.push_back(*ptr_node::create(bp).release()); + } + _len = bl._len; + _num = bl._num; + } + } + +#ifdef HAVE_SEASTAR + /// convert the bufferlist into a network packet + operator seastar::net::packet() &&; +#endif + + iterator begin(size_t offset=0) { + return iterator(this, offset); + } + iterator end() { + return iterator(this, _len, _buffers.end(), 0); + } + + const_iterator begin(size_t offset=0) const { + return const_iterator(this, offset); + } + const_iterator cbegin(size_t offset=0) const { + return begin(offset); + } + const_iterator end() const { + return const_iterator(this, _len, _buffers.end(), 0); + } + + void append(char c); + void append(const char *data, unsigned len); + void append(std::string s) { + append(s.data(), s.length()); + } +#if __cplusplus >= 201703L + // To forcibly disambiguate between string and string_view in the + // case of arrays + template<std::size_t N> + void append(const char (&s)[N]) { + append(s, N); + } + void append(const char* s) { + append(s, strlen(s)); + } + void append(std::string_view s) { + append(s.data(), s.length()); + } +#endif // __cplusplus >= 201703L + void append(const ptr& bp); + void append(ptr&& bp); + void append(const ptr& bp, unsigned off, unsigned len); + void append(const list& bl); + /// append each non-empty line from the stream and add '\n', + /// so a '\n' will be added even the stream does not end with EOL. 
+ ///
+ /// For example, if the stream contains "ABC\n\nDEF", "ABC\nDEF\n" is
+ /// actually appended.
+ void append(std::istream& in);
+ contiguous_filler append_hole(unsigned len);
+ void append_zero(unsigned len);
+ void prepend_zero(unsigned len);
+
+ reserve_t obtain_contiguous_space(const unsigned len);
+
+ /*
+ * get a char
+ */
+ const char& operator[](unsigned n) const;
+ char *c_str();
+ std::string to_str() const;
+
+ void substr_of(const list& other, unsigned off, unsigned len);
+
+ // funky modifier
+ void splice(unsigned off, unsigned len, list *claim_by=0 /*, bufferlist& replace_with */);
+ void write(int off, int len, std::ostream& out) const;
+
+ void encode_base64(list& o);
+ void decode_base64(list& o);
+
+ void write_stream(std::ostream &out) const;
+ void hexdump(std::ostream &out, bool trailing_newline = true) const;
+ ssize_t pread_file(const char *fn, uint64_t off, uint64_t len, std::string *error);
+ int read_file(const char *fn, std::string *error);
+ ssize_t read_fd(int fd, size_t len);
+ ssize_t recv_fd(int fd, size_t len);
+ int write_file(const char *fn, int mode=0644);
+ int write_fd(int fd) const;
+ int write_fd(int fd, uint64_t offset) const;
+ int send_fd(int fd) const;
+ template<typename VectorT>
+ void prepare_iov(VectorT *piov) const {
+#ifdef __CEPH__
+ ceph_assert(_num <= IOV_MAX);
+#else
+ assert(_num <= IOV_MAX);
+#endif
+ piov->resize(_num);
+ unsigned n = 0;
+ for (auto& p : _buffers) {
+ (*piov)[n].iov_base = (void *)p.c_str();
+ (*piov)[n].iov_len = p.length();
+ ++n;
+ }
+ }
+
+ struct iovec_t {
+ uint64_t offset;
+ uint64_t length;
+ std::vector<iovec> iov;
+ };
+ using iov_vec_t = std::vector<iovec_t>;
+ iov_vec_t prepare_iovs() const;
+
+ uint32_t crc32c(uint32_t crc) const;
+ void invalidate_crc();
+
+ // These functions return a bufferlist with a pointer to a single
+ // static buffer. They /must/ not outlive the memory they
+ // reference. 
+ static list static_from_mem(char* c, size_t l); + static list static_from_cstring(char* c); + static list static_from_string(std::string& s); + }; + +} // inline namespace v15_2_0 + + /* + * efficient hash of one or more bufferlists + */ + + class hash { + uint32_t crc; + + public: + hash() : crc(0) { } + // cppcheck-suppress noExplicitConstructor + hash(uint32_t init) : crc(init) { } + + void update(const buffer::list& bl) { + crc = bl.crc32c(crc); + } + + uint32_t digest() { + return crc; + } + }; + +inline bool operator==(const bufferlist &lhs, const bufferlist &rhs) { + if (lhs.length() != rhs.length()) + return false; + return std::equal(lhs.begin(), lhs.end(), rhs.begin()); +} + +inline bool operator<(const bufferlist& lhs, const bufferlist& rhs) { + auto l = lhs.begin(), r = rhs.begin(); + for (; l != lhs.end() && r != rhs.end(); ++l, ++r) { + if (*l < *r) return true; + if (*l > *r) return false; + } + return (l == lhs.end()) && (r != rhs.end()); // lhs.length() < rhs.length() +} + +inline bool operator<=(const bufferlist& lhs, const bufferlist& rhs) { + auto l = lhs.begin(), r = rhs.begin(); + for (; l != lhs.end() && r != rhs.end(); ++l, ++r) { + if (*l < *r) return true; + if (*l > *r) return false; + } + return l == lhs.end(); // lhs.length() <= rhs.length() +} + +inline bool operator!=(const bufferlist &l, const bufferlist &r) { + return !(l == r); +} +inline bool operator>(const bufferlist& lhs, const bufferlist& rhs) { + return rhs < lhs; +} +inline bool operator>=(const bufferlist& lhs, const bufferlist& rhs) { + return rhs <= lhs; +} + +std::ostream& operator<<(std::ostream& out, const buffer::ptr& bp); + +std::ostream& operator<<(std::ostream& out, const buffer::raw &r); + +std::ostream& operator<<(std::ostream& out, const buffer::list& bl); + +inline bufferhash& operator<<(bufferhash& l, const bufferlist &r) { + l.update(r); + return l; +} + +} // namespace buffer + +} // namespace ceph + + +#endif diff --git a/src/include/buffer_fwd.h 
b/src/include/buffer_fwd.h new file mode 100644 index 000000000..6de7b1a1f --- /dev/null +++ b/src/include/buffer_fwd.h @@ -0,0 +1,19 @@ +#ifndef BUFFER_FWD_H +#define BUFFER_FWD_H + +namespace ceph { + namespace buffer { + inline namespace v15_2_0 { + class ptr; + class list; + } + class hash; + } + + using bufferptr = buffer::ptr; + using bufferlist = buffer::list; + using bufferhash = buffer::hash; +} + +#endif + diff --git a/src/include/buffer_raw.h b/src/include/buffer_raw.h new file mode 100644 index 000000000..2298525c9 --- /dev/null +++ b/src/include/buffer_raw.h @@ -0,0 +1,120 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2017 Red Hat, Inc. + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. + * + */ + +#ifndef CEPH_BUFFER_RAW_H +#define CEPH_BUFFER_RAW_H + +#include <map> +#include <utility> +#include <type_traits> +#include "common/ceph_atomic.h" +#include "include/buffer.h" +#include "include/mempool.h" +#include "include/spinlock.h" + +namespace ceph::buffer { +inline namespace v15_2_0 { + + class raw { + public: + // In the future we might want to have a slab allocator here with few + // embedded slots. This would allow to avoid the "if" in dtor of ptr_node. 
+ std::aligned_storage<sizeof(ptr_node), + alignof(ptr_node)>::type bptr_storage; + protected: + char *data; + unsigned len; + public: + ceph::atomic<unsigned> nref { 0 }; + int mempool; + + std::pair<size_t, size_t> last_crc_offset {std::numeric_limits<size_t>::max(), std::numeric_limits<size_t>::max()}; + std::pair<uint32_t, uint32_t> last_crc_val; + + mutable ceph::spinlock crc_spinlock; + + explicit raw(unsigned l, int mempool=mempool::mempool_buffer_anon) + : data(nullptr), len(l), nref(0), mempool(mempool) { + mempool::get_pool(mempool::pool_index_t(mempool)).adjust_count(1, len); + } + raw(char *c, unsigned l, int mempool=mempool::mempool_buffer_anon) + : data(c), len(l), nref(0), mempool(mempool) { + mempool::get_pool(mempool::pool_index_t(mempool)).adjust_count(1, len); + } + virtual ~raw() { + mempool::get_pool(mempool::pool_index_t(mempool)).adjust_count( + -1, -(int)len); + } + + void _set_len(unsigned l) { + mempool::get_pool(mempool::pool_index_t(mempool)).adjust_count( + -1, -(int)len); + len = l; + mempool::get_pool(mempool::pool_index_t(mempool)).adjust_count(1, len); + } + + void reassign_to_mempool(int pool) { + if (pool == mempool) { + return; + } + mempool::get_pool(mempool::pool_index_t(mempool)).adjust_count( + -1, -(int)len); + mempool = pool; + mempool::get_pool(mempool::pool_index_t(pool)).adjust_count(1, len); + } + + void try_assign_to_mempool(int pool) { + if (mempool == mempool::mempool_buffer_anon) { + reassign_to_mempool(pool); + } + } + +private: + // no copying. 
+ // cppcheck-suppress noExplicitConstructor + raw(const raw &other) = delete; + const raw& operator=(const raw &other) = delete; +public: + char *get_data() const { + return data; + } + unsigned get_len() const { + return len; + } + bool get_crc(const std::pair<size_t, size_t> &fromto, + std::pair<uint32_t, uint32_t> *crc) const { + std::lock_guard lg(crc_spinlock); + if (last_crc_offset == fromto) { + *crc = last_crc_val; + return true; + } + return false; + } + void set_crc(const std::pair<size_t, size_t> &fromto, + const std::pair<uint32_t, uint32_t> &crc) { + std::lock_guard lg(crc_spinlock); + last_crc_offset = fromto; + last_crc_val = crc; + } + void invalidate_crc() { + std::lock_guard lg(crc_spinlock); + last_crc_offset.first = std::numeric_limits<size_t>::max(); + last_crc_offset.second = std::numeric_limits<size_t>::max(); + } + }; + +} // inline namespace v15_2_0 +} // namespace ceph::buffer + +#endif // CEPH_BUFFER_RAW_H diff --git a/src/include/byteorder.h b/src/include/byteorder.h new file mode 100644 index 000000000..eb6d5e102 --- /dev/null +++ b/src/include/byteorder.h @@ -0,0 +1,55 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- + +#pragma once + +#include <boost/endian/conversion.hpp> + +#include "int_types.h" + +template<typename T> +inline T swab(T val) { + return boost::endian::endian_reverse(val); +} + +template<typename T> +struct ceph_le { +private: + T v; +public: + ceph_le() = default; + explicit ceph_le(T nv) + : v{boost::endian::native_to_little(nv)} + {} + ceph_le<T>& operator=(T nv) { + v = boost::endian::native_to_little(nv); + return *this; + } + operator T() const { return boost::endian::little_to_native(v); } + friend inline bool operator==(ceph_le a, ceph_le b) { + return a.v == b.v; + } +} __attribute__ ((packed)); + +using ceph_le64 = ceph_le<__u64>; +using ceph_le32 = ceph_le<__u32>; +using ceph_le16 = ceph_le<__u16>; + +using ceph_les64 = ceph_le<__s64>; +using ceph_les32 = ceph_le<__s32>; +using 
ceph_les16 = ceph_le<__s16>; + +inline ceph_les64 init_les64(__s64 x) { + ceph_les64 v; + v = x; + return v; +} +inline ceph_les32 init_les32(__s32 x) { + ceph_les32 v; + v = x; + return v; +} +inline ceph_les16 init_les16(__s16 x) { + ceph_les16 v; + v = x; + return v; +} diff --git a/src/include/ceph_assert.h b/src/include/ceph_assert.h new file mode 100644 index 000000000..0627894ea --- /dev/null +++ b/src/include/ceph_assert.h @@ -0,0 +1,147 @@ +#ifndef CEPH_ASSERT_H +#define CEPH_ASSERT_H + +#include <cstdlib> +#include <string> + +#ifndef __STRING +# define __STRING(x) #x +#endif + +#if defined(__linux__) +#include <features.h> + +#elif defined(__FreeBSD__) +#include <sys/cdefs.h> +#define __GNUC_PREREQ(minor, major) __GNUC_PREREQ__(minor, major) +#elif defined(__sun) || defined(_AIX) +#include "include/compat.h" +#include <assert.h> +#endif + +#ifdef __CEPH__ +# include "acconfig.h" +#endif + +#include "include/common_fwd.h" + +namespace ceph { + +struct BackTrace; + +/* + * Select a function-name variable based on compiler tests, and any compiler + * specific overrides. + */ +#if defined(HAVE_PRETTY_FUNC) +# define __CEPH_ASSERT_FUNCTION __PRETTY_FUNCTION__ +#elif defined(HAVE_FUNC) +# define __CEPH_ASSERT_FUNCTION __func__ +#else +# define __CEPH_ASSERT_FUNCTION ((__const char *) 0) +#endif + +extern void register_assert_context(CephContext *cct); + +struct assert_data { + const char *assertion; + const char *file; + const int line; + const char *function; +}; + +extern void __ceph_assert_fail(const char *assertion, const char *file, int line, const char *function) + __attribute__ ((__noreturn__)); +extern void __ceph_assert_fail(const assert_data &ctx) + __attribute__ ((__noreturn__)); + +extern void __ceph_assertf_fail(const char *assertion, const char *file, int line, const char *function, const char* msg, ...) 
+ __attribute__ ((__noreturn__)); +extern void __ceph_assert_warn(const char *assertion, const char *file, int line, const char *function); + +[[noreturn]] void __ceph_abort(const char *file, int line, const char *func, + const std::string& msg); + +[[noreturn]] void __ceph_abortf(const char *file, int line, const char *func, + const char* msg, ...); + +#define _CEPH_ASSERT_VOID_CAST static_cast<void> + +#define assert_warn(expr) \ + ((expr) \ + ? _CEPH_ASSERT_VOID_CAST (0) \ + : ::ceph::__ceph_assert_warn (__STRING(expr), __FILE__, __LINE__, __CEPH_ASSERT_FUNCTION)) + +} + +using namespace ceph; + + +/* + * ceph_abort aborts the program with a nice backtrace. + * + * Currently, it's the same as assert(0), but we may one day make assert a + * debug-only thing, like it is in many projects. + */ +#define ceph_abort(msg, ...) \ + ::ceph::__ceph_abort( __FILE__, __LINE__, __CEPH_ASSERT_FUNCTION, "abort() called") + +#define ceph_abort_msg(msg) \ + ::ceph::__ceph_abort( __FILE__, __LINE__, __CEPH_ASSERT_FUNCTION, msg) + +#define ceph_abort_msgf(...) \ + ::ceph::__ceph_abortf( __FILE__, __LINE__, __CEPH_ASSERT_FUNCTION, __VA_ARGS__) + +#ifdef __SANITIZE_ADDRESS__ +#define ceph_assert(expr) \ + do { \ + ((expr)) \ + ? _CEPH_ASSERT_VOID_CAST (0) \ + : ::ceph::__ceph_assert_fail(__STRING(expr), __FILE__, __LINE__, __CEPH_ASSERT_FUNCTION); \ + } while (false) +#else +#define ceph_assert(expr) \ + do { static const ceph::assert_data assert_data_ctx = \ + {__STRING(expr), __FILE__, __LINE__, __CEPH_ASSERT_FUNCTION}; \ + ((expr) \ + ? _CEPH_ASSERT_VOID_CAST (0) \ + : ::ceph::__ceph_assert_fail(assert_data_ctx)); } while(false) +#endif + +// this variant will *never* get compiled out to NDEBUG in the future. +// (ceph_assert currently doesn't either, but in the future it might.) +#ifdef __SANITIZE_ADDRESS__ +#define ceph_assert_always(expr) \ + do { \ + ((expr)) \ + ? 
_CEPH_ASSERT_VOID_CAST (0) \ + : ::ceph::__ceph_assert_fail(__STRING(expr), __FILE__, __LINE__, __CEPH_ASSERT_FUNCTION); \ + } while(false) +#else +#define ceph_assert_always(expr) \ + do { static const ceph::assert_data assert_data_ctx = \ + {__STRING(expr), __FILE__, __LINE__, __CEPH_ASSERT_FUNCTION}; \ + ((expr) \ + ? _CEPH_ASSERT_VOID_CAST (0) \ + : ::ceph::__ceph_assert_fail(assert_data_ctx)); } while(false) +#endif + +// Named by analogy with printf. Along with an expression, takes a format +// string and parameters which are printed if the assertion fails. +#define assertf(expr, ...) \ + ((expr) \ + ? _CEPH_ASSERT_VOID_CAST (0) \ + : ::ceph::__ceph_assertf_fail (__STRING(expr), __FILE__, __LINE__, __CEPH_ASSERT_FUNCTION, __VA_ARGS__)) +#define ceph_assertf(expr, ...) \ + ((expr) \ + ? _CEPH_ASSERT_VOID_CAST (0) \ + : ::ceph::__ceph_assertf_fail (__STRING(expr), __FILE__, __LINE__, __CEPH_ASSERT_FUNCTION, __VA_ARGS__)) + +// this variant will *never* get compiled out to NDEBUG in the future. +// (ceph_assertf currently doesn't either, but in the future it might.) +#define ceph_assertf_always(expr, ...) \ + ((expr) \ + ? _CEPH_ASSERT_VOID_CAST (0) \ + : ::ceph::__ceph_assertf_fail (__STRING(expr), __FILE__, __LINE__, __CEPH_ASSERT_FUNCTION, __VA_ARGS__)) + +#endif diff --git a/src/include/ceph_features.h b/src/include/ceph_features.h new file mode 100644 index 000000000..794e10efd --- /dev/null +++ b/src/include/ceph_features.h @@ -0,0 +1,280 @@ +#ifndef __CEPH_FEATURES +#define __CEPH_FEATURES + +#include "sys/types.h" + +/* + * Each time we reclaim bits for reuse we need to specify another + * bitmask that, if all bits are set, indicates we have the new + * incarnation of that feature. 
Base case is 1 (first use) + */ +#define CEPH_FEATURE_INCARNATION_1 (0ull) +#define CEPH_FEATURE_INCARNATION_2 (1ull<<57) // SERVER_JEWEL +#define CEPH_FEATURE_INCARNATION_3 ((1ull<<57)|(1ull<<28)) // SERVER_MIMIC + +#define DEFINE_CEPH_FEATURE(bit, incarnation, name) \ + const static uint64_t CEPH_FEATURE_##name = (1ULL<<bit); \ + const static uint64_t CEPH_FEATUREMASK_##name = \ + (1ULL<<bit | CEPH_FEATURE_INCARNATION_##incarnation); + +// this bit is ignored but still advertised by release *when* +#define DEFINE_CEPH_FEATURE_DEPRECATED(bit, incarnation, name, when) \ + const static uint64_t DEPRECATED_CEPH_FEATURE_##name = (1ULL<<bit); \ + const static uint64_t DEPRECATED_CEPH_FEATUREMASK_##name = \ + (1ULL<<bit | CEPH_FEATURE_INCARNATION_##incarnation); + +// this bit is ignored by release *unused* and not advertised by +// release *unadvertised* +#define DEFINE_CEPH_FEATURE_RETIRED(bit, inc, name, unused, unadvertised) + + +// test for a feature. this test is safer than a typical mask against +// the bit because it ensures that we have the bit AND the marker for the +// bit's incarnation. this must be used in any case where the features +// bits may include an old meaning of the bit. +#define HAVE_FEATURE(x, name) \ + (((x) & (CEPH_FEATUREMASK_##name)) == (CEPH_FEATUREMASK_##name)) + + +/* + * Notes on deprecation: + * + * For feature bits used *only* on the server-side: + * + * - In the first phase we indicate that a feature is DEPRECATED as of + * a particular release. This is the first major release X (say, + * mimic) that does not depend on its peers advertising the feature. + * That is, it safely assumes its peers all have the feature. We + * indicate this with the DEPRECATED macro. For example, + * + * DEFINE_CEPH_FEATURE_DEPRECATED( 2, 1, MON_METADATA, MIMIC) + * + * because 13.2.z (mimic) did not care if its peers advertised this + * feature bit. + * + * - In the second phase we stop advertising the the bit and call it + * RETIRED. 
This can normally be done 2 major releases + * following the one in which we marked the feature DEPRECATED. In + * the above example, for 15.0.z (octopus) we can say: + * + * DEFINE_CEPH_FEATURE_RETIRED( 2, 1, MON_METADATA, MIMIC, OCTOPUS) + * + * - The bit can be reused in the next release that will never talk to + * a pre-octopus daemon (13 mimic or 14 nautlius) that advertises the + * bit: in this case, the 16.y.z (P-release). + * + * This ensures that no two versions who have different meanings for + * the bit ever speak to each other. + */ + +/* + * Notes on the kernel client: + * + * - "X" means that the feature bit has been advertised and supported + * since kernel X + * + * - "X req" means that the feature bit has been advertised and required + * since kernel X + * + * The remaining feature bits are not and have never been used by the + * kernel client. + */ + +DEFINE_CEPH_FEATURE( 0, 1, UID) +DEFINE_CEPH_FEATURE( 1, 1, NOSRCADDR) // 2.6.35 req +DEFINE_CEPH_FEATURE_RETIRED( 2, 1, MONCLOCKCHECK, JEWEL, LUMINOUS) +DEFINE_CEPH_FEATURE( 2, 3, SERVER_NAUTILUS) +DEFINE_CEPH_FEATURE( 3, 1, FLOCK) // 2.6.36 +DEFINE_CEPH_FEATURE( 4, 1, SUBSCRIBE2) // 4.6 req +DEFINE_CEPH_FEATURE( 5, 1, MONNAMES) +DEFINE_CEPH_FEATURE( 6, 1, RECONNECT_SEQ) // 3.10 req +DEFINE_CEPH_FEATURE( 7, 1, DIRLAYOUTHASH) // 2.6.38 +DEFINE_CEPH_FEATURE( 8, 1, OBJECTLOCATOR) +DEFINE_CEPH_FEATURE( 9, 1, PGID64) // 3.9 req +DEFINE_CEPH_FEATURE(10, 1, INCSUBOSDMAP) +DEFINE_CEPH_FEATURE(11, 1, PGPOOL3) // 3.9 req +DEFINE_CEPH_FEATURE(12, 1, OSDREPLYMUX) +DEFINE_CEPH_FEATURE(13, 1, OSDENC) // 3.9 req +DEFINE_CEPH_FEATURE_RETIRED(14, 1, OMAP, HAMMER, JEWEL) +DEFINE_CEPH_FEATURE(14, 2, SERVER_KRAKEN) +DEFINE_CEPH_FEATURE(15, 1, MONENC) +DEFINE_CEPH_FEATURE_RETIRED(16, 1, QUERY_T, JEWEL, LUMINOUS) +DEFINE_CEPH_FEATURE(16, 3, SERVER_OCTOPUS) +DEFINE_CEPH_FEATURE(16, 3, OSD_REPOP_MLCOD) +DEFINE_CEPH_FEATURE_RETIRED(17, 1, INDEP_PG_MAP, JEWEL, LUMINOUS) +DEFINE_CEPH_FEATURE(17, 3, OS_PERF_STAT_NS) 
+DEFINE_CEPH_FEATURE(18, 1, CRUSH_TUNABLES) // 3.6 +DEFINE_CEPH_FEATURE_RETIRED(19, 1, CHUNKY_SCRUB, JEWEL, LUMINOUS) +DEFINE_CEPH_FEATURE(19, 2, OSD_PGLOG_HARDLIMIT) +DEFINE_CEPH_FEATURE_RETIRED(20, 1, MON_NULLROUTE, JEWEL, LUMINOUS) +DEFINE_CEPH_FEATURE(20, 3, SERVER_PACIFIC) +DEFINE_CEPH_FEATURE_RETIRED(21, 1, MON_GV, HAMMER, JEWEL) +DEFINE_CEPH_FEATURE(21, 2, SERVER_LUMINOUS) // 4.13 +DEFINE_CEPH_FEATURE(21, 2, RESEND_ON_SPLIT) // overlap +DEFINE_CEPH_FEATURE(21, 2, RADOS_BACKOFF) // overlap +DEFINE_CEPH_FEATURE(21, 2, OSDMAP_PG_UPMAP) // overlap +DEFINE_CEPH_FEATURE(21, 2, CRUSH_CHOOSE_ARGS) // overlap +DEFINE_CEPH_FEATURE_RETIRED(22, 1, BACKFILL_RESERVATION, JEWEL, LUMINOUS) +DEFINE_CEPH_FEATURE(22, 2, OSD_FIXED_COLLECTION_LIST) +DEFINE_CEPH_FEATURE(23, 1, MSG_AUTH) // 3.19 req (unless nocephx_require_signatures) +DEFINE_CEPH_FEATURE_RETIRED(24, 1, RECOVERY_RESERVATION, JEWEL, LUMINOUS) +DEFINE_CEPH_FEATURE(24, 2, RECOVERY_RESERVATION_2) +DEFINE_CEPH_FEATURE(25, 1, CRUSH_TUNABLES2) // 3.9 +DEFINE_CEPH_FEATURE(26, 1, CREATEPOOLID) +DEFINE_CEPH_FEATURE(27, 1, REPLY_CREATE_INODE) // 3.9 +DEFINE_CEPH_FEATURE_RETIRED(28, 1, OSD_HBMSGS, HAMMER, JEWEL) +DEFINE_CEPH_FEATURE(28, 2, SERVER_MIMIC) +DEFINE_CEPH_FEATURE(29, 1, MDSENC) // 4.7 +DEFINE_CEPH_FEATURE(30, 1, OSDHASHPSPOOL) // 3.9 +DEFINE_CEPH_FEATURE_RETIRED(31, 1, MON_SINGLE_PAXOS, NAUTILUS, PACIFIC) +DEFINE_CEPH_FEATURE(31, 3, SERVER_REEF) +DEFINE_CEPH_FEATURE_RETIRED(32, 1, OSD_SNAPMAPPER, JEWEL, LUMINOUS) +DEFINE_CEPH_FEATURE(32, 3, STRETCH_MODE) +DEFINE_CEPH_FEATURE_RETIRED(33, 1, MON_SCRUB, JEWEL, LUMINOUS) +DEFINE_CEPH_FEATURE(33, 3, SERVER_QUINCY) +DEFINE_CEPH_FEATURE_RETIRED(34, 1, OSD_PACKED_RECOVERY, JEWEL, LUMINOUS) +DEFINE_CEPH_FEATURE(34, 3, RANGE_BLOCKLIST) +DEFINE_CEPH_FEATURE(35, 1, OSD_CACHEPOOL) // 3.14 +DEFINE_CEPH_FEATURE(36, 1, CRUSH_V2) // 3.14 +DEFINE_CEPH_FEATURE(37, 1, EXPORT_PEER) // 3.14 +DEFINE_CEPH_FEATURE_RETIRED(38, 1, OSD_ERASURE_CODES, MIMIC, OCTOPUS) +// available 
+DEFINE_CEPH_FEATURE(39, 1, OSDMAP_ENC) // 3.15 +DEFINE_CEPH_FEATURE(40, 1, MDS_INLINE_DATA) // 3.19 +DEFINE_CEPH_FEATURE(41, 1, CRUSH_TUNABLES3) // 3.15 +DEFINE_CEPH_FEATURE(41, 1, OSD_PRIMARY_AFFINITY) // overlap +DEFINE_CEPH_FEATURE(42, 1, MSGR_KEEPALIVE2) // 4.3 (for consistency) +DEFINE_CEPH_FEATURE(43, 1, OSD_POOLRESEND) // 4.13 +DEFINE_CEPH_FEATURE_RETIRED(44, 1, ERASURE_CODE_PLUGINS_V2, MIMIC, OCTOPUS) +// available +DEFINE_CEPH_FEATURE_RETIRED(45, 1, OSD_SET_ALLOC_HINT, JEWEL, LUMINOUS) +// available +DEFINE_CEPH_FEATURE(46, 1, OSD_FADVISE_FLAGS) +DEFINE_CEPH_FEATURE_RETIRED(46, 1, OSD_REPOP, JEWEL, LUMINOUS) // overlap +DEFINE_CEPH_FEATURE_RETIRED(46, 1, OSD_OBJECT_DIGEST, JEWEL, LUMINOUS) // overlap +DEFINE_CEPH_FEATURE_RETIRED(46, 1, OSD_TRANSACTION_MAY_LAYOUT, JEWEL, LUMINOUS) // overlap +DEFINE_CEPH_FEATURE(47, 1, MDS_QUOTA) // 4.17 +DEFINE_CEPH_FEATURE(48, 1, CRUSH_V4) // 4.1 +DEFINE_CEPH_FEATURE_RETIRED(49, 1, OSD_MIN_SIZE_RECOVERY, JEWEL, LUMINOUS) +DEFINE_CEPH_FEATURE_RETIRED(49, 1, OSD_PROXY_FEATURES, JEWEL, LUMINOUS) // overlap +// available +DEFINE_CEPH_FEATURE_RETIRED(50, 1, MON_METADATA, MIMIC, OCTOPUS) +// available +DEFINE_CEPH_FEATURE_RETIRED(51, 1, OSD_BITWISE_HOBJ_SORT, MIMIC, OCTOPUS) +// available +DEFINE_CEPH_FEATURE_RETIRED(52, 1, OSD_PROXY_WRITE_FEATURES, MIMIC, OCTOPUS) +// available +DEFINE_CEPH_FEATURE_RETIRED(53, 1, ERASURE_CODE_PLUGINS_V3, MIMIC, OCTOPUS) +// available +DEFINE_CEPH_FEATURE_RETIRED(54, 1, OSD_HITSET_GMT, MIMIC, OCTOPUS) +// available +DEFINE_CEPH_FEATURE_RETIRED(55, 1, HAMMER_0_94_4, MIMIC, OCTOPUS) +// available +DEFINE_CEPH_FEATURE(56, 1, NEW_OSDOP_ENCODING) // 4.13 (for pg_pool_t >= v25) +DEFINE_CEPH_FEATURE(57, 1, MON_STATEFUL_SUB) // 4.13 +DEFINE_CEPH_FEATURE_RETIRED(57, 1, MON_ROUTE_OSDMAP, MIMIC, OCTOPUS) // overlap +DEFINE_CEPH_FEATURE(57, 1, SERVER_JEWEL) // overlap +DEFINE_CEPH_FEATURE(58, 1, CRUSH_TUNABLES5) // 4.5 +DEFINE_CEPH_FEATURE(58, 1, NEW_OSDOPREPLY_ENCODING) // overlap 
+DEFINE_CEPH_FEATURE(58, 1, FS_FILE_LAYOUT_V2) // overlap +DEFINE_CEPH_FEATURE(59, 1, FS_BTIME) +DEFINE_CEPH_FEATURE(59, 1, FS_CHANGE_ATTR) // overlap +DEFINE_CEPH_FEATURE(59, 1, MSG_ADDR2) // overlap +DEFINE_CEPH_FEATURE(60, 1, OSD_RECOVERY_DELETES) // *do not share this bit* +DEFINE_CEPH_FEATURE(61, 1, CEPHX_V2) // 4.19, *do not share this bit* + +DEFINE_CEPH_FEATURE(62, 1, RESERVED) // do not use; used as a sentinel +DEFINE_CEPH_FEATURE_RETIRED(63, 1, RESERVED_BROKEN, LUMINOUS, QUINCY) // client-facing +// available + + +/* + * Features supported. Should be everything above. + */ +#define CEPH_FEATURES_ALL \ + (CEPH_FEATURE_UID | \ + CEPH_FEATURE_NOSRCADDR | \ + CEPH_FEATURE_FLOCK | \ + CEPH_FEATURE_SUBSCRIBE2 | \ + CEPH_FEATURE_MONNAMES | \ + CEPH_FEATURE_RECONNECT_SEQ | \ + CEPH_FEATURE_DIRLAYOUTHASH | \ + CEPH_FEATURE_OBJECTLOCATOR | \ + CEPH_FEATURE_PGID64 | \ + CEPH_FEATURE_INCSUBOSDMAP | \ + CEPH_FEATURE_PGPOOL3 | \ + CEPH_FEATURE_OSDREPLYMUX | \ + CEPH_FEATURE_OSDENC | \ + CEPH_FEATURE_MONENC | \ + CEPH_FEATURE_CRUSH_TUNABLES | \ + CEPH_FEATURE_MSG_AUTH | \ + CEPH_FEATURE_CRUSH_TUNABLES2 | \ + CEPH_FEATURE_CREATEPOOLID | \ + CEPH_FEATURE_REPLY_CREATE_INODE | \ + CEPH_FEATURE_MDSENC | \ + CEPH_FEATURE_OSDHASHPSPOOL | \ + CEPH_FEATURE_NEW_OSDOP_ENCODING | \ + CEPH_FEATURE_NEW_OSDOPREPLY_ENCODING | \ + CEPH_FEATURE_OSD_CACHEPOOL | \ + CEPH_FEATURE_CRUSH_V2 | \ + CEPH_FEATURE_EXPORT_PEER | \ + CEPH_FEATURE_OSDMAP_ENC | \ + CEPH_FEATURE_MDS_INLINE_DATA | \ + CEPH_FEATURE_CRUSH_TUNABLES3 | \ + CEPH_FEATURE_OSD_PRIMARY_AFFINITY | \ + CEPH_FEATURE_MSGR_KEEPALIVE2 | \ + CEPH_FEATURE_OSD_POOLRESEND | \ + CEPH_FEATURE_OSD_FADVISE_FLAGS | \ + CEPH_FEATURE_MDS_QUOTA | \ + CEPH_FEATURE_CRUSH_V4 | \ + CEPH_FEATURE_MON_STATEFUL_SUB | \ + CEPH_FEATURE_CRUSH_TUNABLES5 | \ + CEPH_FEATURE_SERVER_JEWEL | \ + CEPH_FEATURE_FS_FILE_LAYOUT_V2 | \ + CEPH_FEATURE_SERVER_KRAKEN | \ + CEPH_FEATURE_FS_BTIME | \ + CEPH_FEATURE_FS_CHANGE_ATTR | \ + CEPH_FEATURE_MSG_ADDR2 | \ + 
CEPH_FEATURE_SERVER_LUMINOUS | \ + CEPH_FEATURE_RESEND_ON_SPLIT | \ + CEPH_FEATURE_RADOS_BACKOFF | \ + CEPH_FEATURE_OSD_RECOVERY_DELETES | \ + CEPH_FEATURE_SERVER_MIMIC | \ + CEPH_FEATURE_RECOVERY_RESERVATION_2 | \ + CEPH_FEATURE_SERVER_NAUTILUS | \ + CEPH_FEATURE_CEPHX_V2 | \ + CEPH_FEATURE_OSD_PGLOG_HARDLIMIT | \ + CEPH_FEATUREMASK_SERVER_OCTOPUS | \ + CEPH_FEATUREMASK_STRETCH_MODE | \ + CEPH_FEATUREMASK_OSD_REPOP_MLCOD | \ + CEPH_FEATUREMASK_SERVER_PACIFIC | \ + CEPH_FEATURE_OSD_FIXED_COLLECTION_LIST | \ + CEPH_FEATUREMASK_SERVER_QUINCY | \ + CEPH_FEATURE_RANGE_BLOCKLIST | \ + CEPH_FEATUREMASK_SERVER_REEF | \ + 0ULL) + +#define CEPH_FEATURES_SUPPORTED_DEFAULT CEPH_FEATURES_ALL + +/* + * crush related features + */ +#define CEPH_FEATURES_CRUSH \ + (CEPH_FEATURE_CRUSH_TUNABLES | \ + CEPH_FEATURE_CRUSH_TUNABLES2 | \ + CEPH_FEATURE_CRUSH_TUNABLES3 | \ + CEPH_FEATURE_CRUSH_TUNABLES5 | \ + CEPH_FEATURE_CRUSH_V2 | \ + CEPH_FEATURE_CRUSH_V4 | \ + CEPH_FEATUREMASK_CRUSH_CHOOSE_ARGS) + +/* + * make sure we don't try to use the reserved features + */ +#define CEPH_STATIC_ASSERT(x) (void)(sizeof(int[((x)==0) ? -1 : 0])) + +static inline void ____build_time_check_for_reserved_bits(void) { + CEPH_STATIC_ASSERT((CEPH_FEATURES_ALL & CEPH_FEATURE_RESERVED) == 0); +} + +#endif diff --git a/src/include/ceph_frag.h b/src/include/ceph_frag.h new file mode 100644 index 000000000..5babb8e95 --- /dev/null +++ b/src/include/ceph_frag.h @@ -0,0 +1,109 @@ +#ifndef FS_CEPH_FRAG_H +#define FS_CEPH_FRAG_H + +/* + * "Frags" are a way to describe a subset of a 32-bit number space, + * using a mask and a value to match against that mask. Any given frag + * (subset of the number space) can be partitioned into 2^n sub-frags. + * + * Frags are encoded into a 32-bit word: + * 8 upper bits = "bits" + * 24 lower bits = "value" + * (We could go to 5+27 bits, but who cares.) + * + * We use the _most_ significant bits of the 24 bit value. This makes + * values logically sort. 
+ * + * Unfortunately, because the "bits" field is still in the high bits, we + * can't sort encoded frags numerically. However, it does allow you + * to feed encoded frags as values into frag_contains_value. + */ +static inline __u32 ceph_frag_make(__u32 b, __u32 v) +{ + return (b << 24) | + (v & (0xffffffu << (24-b)) & 0xffffffu); +} +static inline __u32 ceph_frag_bits(__u32 f) +{ + return f >> 24; +} +static inline __u32 ceph_frag_value(__u32 f) +{ + return f & 0xffffffu; +} +static inline __u32 ceph_frag_mask(__u32 f) +{ + return (0xffffffu << (24-ceph_frag_bits(f))) & 0xffffffu; +} +static inline __u32 ceph_frag_mask_shift(__u32 f) +{ + return 24 - ceph_frag_bits(f); +} + +static inline int ceph_frag_contains_value(__u32 f, __u32 v) +{ + return (v & ceph_frag_mask(f)) == ceph_frag_value(f); +} +static inline int ceph_frag_contains_frag(__u32 f, __u32 sub) +{ + /* is sub as specific as us, and contained by us? */ + return ceph_frag_bits(sub) >= ceph_frag_bits(f) && + (ceph_frag_value(sub) & ceph_frag_mask(f)) == ceph_frag_value(f); +} + +static inline __u32 ceph_frag_parent(__u32 f) +{ + return ceph_frag_make(ceph_frag_bits(f) - 1, + ceph_frag_value(f) & (ceph_frag_mask(f) << 1)); +} +static inline int ceph_frag_is_left_child(__u32 f) +{ + return ceph_frag_bits(f) > 0 && + (ceph_frag_value(f) & (0x1000000 >> ceph_frag_bits(f))) == 0; +} +static inline int ceph_frag_is_right_child(__u32 f) +{ + return ceph_frag_bits(f) > 0 && + (ceph_frag_value(f) & (0x1000000 >> ceph_frag_bits(f))) == 1; +} +static inline __u32 ceph_frag_sibling(__u32 f) +{ + return ceph_frag_make(ceph_frag_bits(f), + ceph_frag_value(f) ^ (0x1000000 >> ceph_frag_bits(f))); +} +static inline __u32 ceph_frag_left_child(__u32 f) +{ + return ceph_frag_make(ceph_frag_bits(f)+1, ceph_frag_value(f)); +} +static inline __u32 ceph_frag_right_child(__u32 f) +{ + return ceph_frag_make(ceph_frag_bits(f)+1, + ceph_frag_value(f) | (0x1000000 >> (1+ceph_frag_bits(f)))); +} +static inline __u32 
ceph_frag_make_child(__u32 f, int by, int i) +{ + int newbits = ceph_frag_bits(f) + by; + return ceph_frag_make(newbits, + ceph_frag_value(f) | (i << (24 - newbits))); +} +static inline int ceph_frag_is_leftmost(__u32 f) +{ + return ceph_frag_value(f) == 0; +} +static inline int ceph_frag_is_rightmost(__u32 f) +{ + return ceph_frag_value(f) == ceph_frag_mask(f); +} +static inline __u32 ceph_frag_next(__u32 f) +{ + return ceph_frag_make(ceph_frag_bits(f), + ceph_frag_value(f) + (0x1000000 >> ceph_frag_bits(f))); +} + +/* + * comparator to sort frags logically, as when traversing the + * number space in ascending order... + */ +int ceph_frag_compare(__u32 a, __u32 b); + +#endif diff --git a/src/include/ceph_fs.h b/src/include/ceph_fs.h new file mode 100644 index 000000000..28440c820 --- /dev/null +++ b/src/include/ceph_fs.h @@ -0,0 +1,1137 @@ +/* + * ceph_fs.h - Ceph constants and data types to share between kernel and + * user space. + * + * Most types in this file are defined as little-endian, and are + * primarily intended to describe data structures that pass over the + * wire or that are stored on disk. + * + * LGPL-2.1 or LGPL-3.0 + */ + +#ifndef CEPH_FS_H +#define CEPH_FS_H + +#include "msgr.h" +#include "rados.h" +#include "include/encoding.h" +#include "include/denc.h" + +/* + * The data structures defined here are shared between Linux kernel and + * user space. Also, those data structures are maintained always in + * little-endian byte order, even on big-endian systems. This is handled + * differently in kernel vs. user space. For use as kernel headers, the + * little-endian fields need to use the __le16/__le32/__le64 types. These + * are markers that indicate endian conversion routines must be used + * whenever such fields are accessed, which can be verified by checker + * tools like "sparse". 
For use as user-space headers, the little-endian + * fields instead use types ceph_le16/ceph_le32/ceph_le64, which are C++ + * classes that implement automatic endian conversion on every access. + * To still allow for header sharing, this file uses the __le types, but + * redefines those to the ceph_ types when compiled in user space. + */ +#ifndef __KERNEL__ +#include "byteorder.h" +#define __le16 ceph_le16 +#define __le32 ceph_le32 +#define __le64 ceph_le64 +#endif + +/* + * subprotocol versions. when specific messages types or high-level + * protocols change, bump the affected components. we keep rev + * internal cluster protocols separately from the public, + * client-facing protocol. + */ +#define CEPH_OSDC_PROTOCOL 24 /* server/client */ +#define CEPH_MDSC_PROTOCOL 32 /* server/client */ +#define CEPH_MONC_PROTOCOL 15 /* server/client */ + + +#define CEPH_INO_ROOT 1 +/* + * hidden .ceph dir, which is no longer created but + * recognised in existing filesystems so that we + * don't try to fragment it. + */ +#define CEPH_INO_CEPH 2 +#define CEPH_INO_GLOBAL_SNAPREALM 3 +#define CEPH_INO_LOST_AND_FOUND 4 /* reserved ino for use in recovery */ + +/* arbitrary limit on max # of monitors (cluster of 3 is typical) */ +#define CEPH_MAX_MON 31 + +/* + * ceph_file_layout - describe data layout for a file/inode + */ +struct ceph_file_layout { + /* file -> object mapping */ + __le32 fl_stripe_unit; /* stripe unit, in bytes. must be multiple + of page size. */ + __le32 fl_stripe_count; /* over this many objects */ + __le32 fl_object_size; /* until objects are this big, then move to + new objects */ + __le32 fl_cas_hash; /* UNUSED. 0 = none; 1 = sha256 */ + + /* pg -> disk layout */ + __le32 fl_object_stripe_unit; /* UNUSED. 
for per-object parity, if any */ + + /* object -> pg layout */ + __le32 fl_unused; /* unused; used to be preferred primary for pg (-1 for none) */ + __le32 fl_pg_pool; /* namespace, crush rule, rep level */ +} __attribute__ ((packed)); + +#define CEPH_MIN_STRIPE_UNIT 65536 + +struct ceph_dir_layout { + __u8 dl_dir_hash; /* see ceph_hash.h for ids */ + __u8 dl_unused1; + __u16 dl_unused2; + __u32 dl_unused3; +} __attribute__ ((packed)); + +/* crypto algorithms */ +#define CEPH_CRYPTO_NONE 0x0 +#define CEPH_CRYPTO_AES 0x1 + +#define CEPH_AES_IV "cephsageyudagreg" + +/* security/authentication protocols */ +#define CEPH_AUTH_UNKNOWN 0x0 +#define CEPH_AUTH_NONE 0x1 +#define CEPH_AUTH_CEPHX 0x2 + +/* msgr2 protocol modes */ +#define CEPH_CON_MODE_UNKNOWN 0x0 +#define CEPH_CON_MODE_CRC 0x1 +#define CEPH_CON_MODE_SECURE 0x2 + +extern const char *ceph_con_mode_name(int con_mode); + +/* For options with "_", like: GSS_GSS + which means: Mode/Protocol to validate "authentication_authorization", + where: + - Authentication: Verifying the identity of an entity. + - Authorization: Verifying that an authenticated entity has + the right to access a particular resource. 
+*/ +#define CEPH_AUTH_GSS 0x4 +#define CEPH_AUTH_GSS_GSS CEPH_AUTH_GSS + +#define CEPH_AUTH_UID_DEFAULT ((__u64) -1) + + +/********************************************* + * message layer + */ + +/* + * message types + */ + +/* misc */ +#define CEPH_MSG_SHUTDOWN 1 +#define CEPH_MSG_PING 2 + +/* client <-> monitor */ +#define CEPH_MSG_MON_MAP 4 +#define CEPH_MSG_MON_GET_MAP 5 +#define CEPH_MSG_MON_GET_OSDMAP 6 +#define CEPH_MSG_MON_METADATA 7 +#define CEPH_MSG_STATFS 13 +#define CEPH_MSG_STATFS_REPLY 14 +#define CEPH_MSG_MON_SUBSCRIBE 15 +#define CEPH_MSG_MON_SUBSCRIBE_ACK 16 +#define CEPH_MSG_AUTH 17 +#define CEPH_MSG_AUTH_REPLY 18 +#define CEPH_MSG_MON_GET_VERSION 19 +#define CEPH_MSG_MON_GET_VERSION_REPLY 20 + +/* client <-> mds */ +#define CEPH_MSG_MDS_MAP 21 + +#define CEPH_MSG_CLIENT_SESSION 22 +#define CEPH_MSG_CLIENT_RECONNECT 23 + +#define CEPH_MSG_CLIENT_REQUEST 24 +#define CEPH_MSG_CLIENT_REQUEST_FORWARD 25 +#define CEPH_MSG_CLIENT_REPLY 26 +#define CEPH_MSG_CLIENT_RECLAIM 27 +#define CEPH_MSG_CLIENT_RECLAIM_REPLY 28 +#define CEPH_MSG_CLIENT_METRICS 29 +#define CEPH_MSG_CLIENT_CAPS 0x310 +#define CEPH_MSG_CLIENT_LEASE 0x311 +#define CEPH_MSG_CLIENT_SNAP 0x312 +#define CEPH_MSG_CLIENT_CAPRELEASE 0x313 +#define CEPH_MSG_CLIENT_QUOTA 0x314 + +/* pool ops */ +#define CEPH_MSG_POOLOP_REPLY 48 +#define CEPH_MSG_POOLOP 49 + + +/* osd */ +#define CEPH_MSG_OSD_MAP 41 +#define CEPH_MSG_OSD_OP 42 +#define CEPH_MSG_OSD_OPREPLY 43 +#define CEPH_MSG_WATCH_NOTIFY 44 +#define CEPH_MSG_OSD_BACKOFF 61 + +/* FSMap subscribers (see all MDS clusters at once) */ +#define CEPH_MSG_FS_MAP 45 +/* FSMapUser subscribers (get MDS clusters name->ID mapping) */ +#define CEPH_MSG_FS_MAP_USER 103 + +/* watch-notify operations */ +enum { + CEPH_WATCH_EVENT_NOTIFY = 1, /* notifying watcher */ + CEPH_WATCH_EVENT_NOTIFY_COMPLETE = 2, /* notifier notified when done */ + CEPH_WATCH_EVENT_DISCONNECT = 3, /* we were disconnected */ +}; + +const char *ceph_watch_event_name(int o); + +/* pool 
operations */ +enum { + POOL_OP_CREATE = 0x01, + POOL_OP_DELETE = 0x02, + POOL_OP_AUID_CHANGE = 0x03, + POOL_OP_CREATE_SNAP = 0x11, + POOL_OP_DELETE_SNAP = 0x12, + POOL_OP_CREATE_UNMANAGED_SNAP = 0x21, + POOL_OP_DELETE_UNMANAGED_SNAP = 0x22, +}; + +struct ceph_mon_request_header { + __le64 have_version; + __le16 session_mon; + __le64 session_mon_tid; +} __attribute__ ((packed)); + +struct ceph_mon_statfs { + struct ceph_mon_request_header monhdr; + struct ceph_fsid fsid; +} __attribute__ ((packed)); + +struct ceph_statfs { + __le64 kb, kb_used, kb_avail; + __le64 num_objects; +} __attribute__ ((packed)); + +struct ceph_mon_statfs_reply { + struct ceph_fsid fsid; + __le64 version; + struct ceph_statfs st; +} __attribute__ ((packed)); + +const char *ceph_pool_op_name(int op); + +struct ceph_mon_poolop { + struct ceph_mon_request_header monhdr; + struct ceph_fsid fsid; + __le32 pool; + __le32 op; + __le64 __old_auid; // obsolete + __le64 snapid; + __le32 name_len; +} __attribute__ ((packed)); + +struct ceph_mon_poolop_reply { + struct ceph_mon_request_header monhdr; + struct ceph_fsid fsid; + __le32 reply_code; + __le32 epoch; + char has_data; + char data[0]; +} __attribute__ ((packed)); + +struct ceph_mon_unmanaged_snap { + __le64 snapid; +} __attribute__ ((packed)); + +struct ceph_osd_getmap { + struct ceph_mon_request_header monhdr; + struct ceph_fsid fsid; + __le32 start; +} __attribute__ ((packed)); + +struct ceph_mds_getmap { + struct ceph_mon_request_header monhdr; + struct ceph_fsid fsid; +} __attribute__ ((packed)); + +struct ceph_client_mount { + struct ceph_mon_request_header monhdr; +} __attribute__ ((packed)); + +#define CEPH_SUBSCRIBE_ONETIME 1 /* i want only 1 update after have */ + +struct ceph_mon_subscribe_item { + __le64 start; + __u8 flags; +} __attribute__ ((packed)); + +struct ceph_mon_subscribe_ack { + __le32 duration; /* seconds */ + struct ceph_fsid fsid; +} __attribute__ ((packed)); + +/* + * mdsmap flags + */ +#define 
CEPH_MDSMAP_NOT_JOINABLE (1<<0) /* standbys cannot join */ +#define CEPH_MDSMAP_DOWN (CEPH_MDSMAP_NOT_JOINABLE) /* backwards compat */ +#define CEPH_MDSMAP_ALLOW_SNAPS (1<<1) /* cluster allowed to create snapshots */ +/* deprecated #define CEPH_MDSMAP_ALLOW_MULTIMDS (1<<2) cluster allowed to have >1 active MDS */ +/* deprecated #define CEPH_MDSMAP_ALLOW_DIRFRAGS (1<<3) cluster allowed to fragment directories */ +#define CEPH_MDSMAP_ALLOW_MULTIMDS_SNAPS (1<<4) /* cluster alllowed to enable MULTIMDS + and SNAPS at the same time */ +#define CEPH_MDSMAP_ALLOW_STANDBY_REPLAY (1<<5) /* cluster alllowed to enable MULTIMDS */ +#define CEPH_MDSMAP_REFUSE_CLIENT_SESSION (1<<6) /* cluster allowed to refuse client session + request */ +#define CEPH_MDSMAP_DEFAULTS (CEPH_MDSMAP_ALLOW_SNAPS | \ + CEPH_MDSMAP_ALLOW_MULTIMDS_SNAPS) + +/* + * mds states + * > 0 -> in + * <= 0 -> out + */ +#define CEPH_MDS_STATE_DNE 0 /* down, does not exist. */ +#define CEPH_MDS_STATE_STOPPED -1 /* down, once existed, but no subtrees. + empty log. */ +#define CEPH_MDS_STATE_BOOT -4 /* up, boot announcement. */ +#define CEPH_MDS_STATE_STANDBY -5 /* up, idle. waiting for assignment. */ +#define CEPH_MDS_STATE_CREATING -6 /* up, creating MDS instance. */ +#define CEPH_MDS_STATE_STARTING -7 /* up, starting previously stopped mds */ +#define CEPH_MDS_STATE_STANDBY_REPLAY -8 /* up, tailing active node's journal */ +#define CEPH_MDS_STATE_REPLAYONCE -9 /* Legacy, unused */ +#define CEPH_MDS_STATE_NULL -10 + +#define CEPH_MDS_STATE_REPLAY 8 /* up, replaying journal. */ +#define CEPH_MDS_STATE_RESOLVE 9 /* up, disambiguating distributed + operations (import, rename, etc.) 
*/ +#define CEPH_MDS_STATE_RECONNECT 10 /* up, reconnect to clients */ +#define CEPH_MDS_STATE_REJOIN 11 /* up, rejoining distributed cache */ +#define CEPH_MDS_STATE_CLIENTREPLAY 12 /* up, replaying client operations */ +#define CEPH_MDS_STATE_ACTIVE 13 /* up, active */ +#define CEPH_MDS_STATE_STOPPING 14 /* up, but exporting metadata */ +#define CEPH_MDS_STATE_DAMAGED 15 /* rank not replayable, need repair */ + +extern const char *ceph_mds_state_name(int s); + + +/* + * metadata lock types. + * - these are bitmasks.. we can compose them + * - they also define the lock ordering by the MDS + * - a few of these are internal to the mds + */ +#define CEPH_LOCK_DN (1 << 0) +#define CEPH_LOCK_DVERSION (1 << 1) +#define CEPH_LOCK_ISNAP (1 << 4) /* snapshot lock. MDS internal */ +#define CEPH_LOCK_IPOLICY (1 << 5) /* policy lock on dirs. MDS internal */ +#define CEPH_LOCK_IFILE (1 << 6) +#define CEPH_LOCK_INEST (1 << 7) /* mds internal */ +#define CEPH_LOCK_IDFT (1 << 8) /* dir frag tree */ +#define CEPH_LOCK_IAUTH (1 << 9) +#define CEPH_LOCK_ILINK (1 << 10) +#define CEPH_LOCK_IXATTR (1 << 11) +#define CEPH_LOCK_IFLOCK (1 << 12) /* advisory file locks */ +#define CEPH_LOCK_IVERSION (1 << 13) /* mds internal */ + +#define CEPH_LOCK_IFIRST CEPH_LOCK_ISNAP + + +/* client_session ops */ +enum { + CEPH_SESSION_REQUEST_OPEN, + CEPH_SESSION_OPEN, + CEPH_SESSION_REQUEST_CLOSE, + CEPH_SESSION_CLOSE, + CEPH_SESSION_REQUEST_RENEWCAPS, + CEPH_SESSION_RENEWCAPS, + CEPH_SESSION_STALE, + CEPH_SESSION_RECALL_STATE, + CEPH_SESSION_FLUSHMSG, + CEPH_SESSION_FLUSHMSG_ACK, + CEPH_SESSION_FORCE_RO, + // A response to REQUEST_OPEN indicating that the client should + // permanently desist from contacting the MDS + CEPH_SESSION_REJECT, + CEPH_SESSION_REQUEST_FLUSH_MDLOG +}; + +// flags for state reclaim +#define CEPH_RECLAIM_RESET 1 + +extern const char *ceph_session_op_name(int op); + +struct ceph_mds_session_head { + __le32 op; + __le64 seq; + struct ceph_timespec stamp; + __le32 max_caps, 
max_leases; +} __attribute__ ((packed)); + +/* client_request */ +/* + * metadata ops. + * & 0x001000 -> write op + * & 0x010000 -> follow symlink (e.g. stat(), not lstat()). + & & 0x100000 -> use weird ino/path trace + */ +#define CEPH_MDS_OP_WRITE 0x001000 +enum { + CEPH_MDS_OP_LOOKUP = 0x00100, + CEPH_MDS_OP_GETATTR = 0x00101, + CEPH_MDS_OP_LOOKUPHASH = 0x00102, + CEPH_MDS_OP_LOOKUPPARENT = 0x00103, + CEPH_MDS_OP_LOOKUPINO = 0x00104, + CEPH_MDS_OP_LOOKUPNAME = 0x00105, + CEPH_MDS_OP_GETVXATTR = 0x00106, + CEPH_MDS_OP_DUMMY = 0x00107, + + CEPH_MDS_OP_SETXATTR = 0x01105, + CEPH_MDS_OP_RMXATTR = 0x01106, + CEPH_MDS_OP_SETLAYOUT = 0x01107, + CEPH_MDS_OP_SETATTR = 0x01108, + CEPH_MDS_OP_SETFILELOCK= 0x01109, + CEPH_MDS_OP_GETFILELOCK= 0x00110, + CEPH_MDS_OP_SETDIRLAYOUT=0x0110a, + + CEPH_MDS_OP_MKNOD = 0x01201, + CEPH_MDS_OP_LINK = 0x01202, + CEPH_MDS_OP_UNLINK = 0x01203, + CEPH_MDS_OP_RENAME = 0x01204, + CEPH_MDS_OP_MKDIR = 0x01220, + CEPH_MDS_OP_RMDIR = 0x01221, + CEPH_MDS_OP_SYMLINK = 0x01222, + + CEPH_MDS_OP_CREATE = 0x01301, + CEPH_MDS_OP_OPEN = 0x00302, + CEPH_MDS_OP_READDIR = 0x00305, + + CEPH_MDS_OP_LOOKUPSNAP = 0x00400, + CEPH_MDS_OP_MKSNAP = 0x01400, + CEPH_MDS_OP_RMSNAP = 0x01401, + CEPH_MDS_OP_LSSNAP = 0x00402, + CEPH_MDS_OP_RENAMESNAP = 0x01403, + CEPH_MDS_OP_READDIR_SNAPDIFF = 0x01404, + + // internal op + CEPH_MDS_OP_FRAGMENTDIR= 0x01500, + CEPH_MDS_OP_EXPORTDIR = 0x01501, + CEPH_MDS_OP_FLUSH = 0x01502, + CEPH_MDS_OP_ENQUEUE_SCRUB = 0x01503, + CEPH_MDS_OP_REPAIR_FRAGSTATS = 0x01504, + CEPH_MDS_OP_REPAIR_INODESTATS = 0x01505, + CEPH_MDS_OP_RDLOCK_FRAGSSTATS = 0x01507 +}; + +#define IS_CEPH_MDS_OP_NEWINODE(op) (op == CEPH_MDS_OP_CREATE || \ + op == CEPH_MDS_OP_MKNOD || \ + op == CEPH_MDS_OP_MKDIR || \ + op == CEPH_MDS_OP_SYMLINK) + +extern const char *ceph_mds_op_name(int op); + +// setattr mask is an int +#ifndef CEPH_SETATTR_MODE +#define CEPH_SETATTR_MODE (1 << 0) +#define CEPH_SETATTR_UID (1 << 1) +#define CEPH_SETATTR_GID (1 << 2) +#define 
CEPH_SETATTR_MTIME (1 << 3) +#define CEPH_SETATTR_ATIME (1 << 4) +#define CEPH_SETATTR_SIZE (1 << 5) +#define CEPH_SETATTR_CTIME (1 << 6) +#define CEPH_SETATTR_MTIME_NOW (1 << 7) +#define CEPH_SETATTR_ATIME_NOW (1 << 8) +#define CEPH_SETATTR_BTIME (1 << 9) +#define CEPH_SETATTR_KILL_SGUID (1 << 10) +#define CEPH_SETATTR_FSCRYPT_AUTH (1 << 11) +#define CEPH_SETATTR_FSCRYPT_FILE (1 << 12) +#define CEPH_SETATTR_KILL_SUID (1 << 13) +#define CEPH_SETATTR_KILL_SGID (1 << 14) +#endif + +/* + * open request flags + */ +#define CEPH_O_RDONLY 00000000 +#define CEPH_O_WRONLY 00000001 +#define CEPH_O_RDWR 00000002 +#define CEPH_O_CREAT 00000100 +#define CEPH_O_EXCL 00000200 +#define CEPH_O_TRUNC 00001000 +#define CEPH_O_LAZY 00020000 +#define CEPH_O_DIRECTORY 00200000 +#define CEPH_O_NOFOLLOW 00400000 + +int ceph_flags_sys2wire(int flags); + +/* + * Ceph setxattr request flags. + */ +#define CEPH_XATTR_CREATE (1 << 0) +#define CEPH_XATTR_REPLACE (1 << 1) +#define CEPH_XATTR_REMOVE (1 << 31) + +/* + * readdir/readdir_snapdiff request flags; + */ +#define CEPH_READDIR_REPLY_BITFLAGS (1<<0) + +/* + * readdir/readdir_snapdiff reply flags. + */ +#define CEPH_READDIR_FRAG_END (1<<0) +#define CEPH_READDIR_FRAG_COMPLETE (1<<8) +#define CEPH_READDIR_HASH_ORDER (1<<9) +#define CEPH_READDIR_OFFSET_HASH (1<<10) + +/* Note that this is embedded wthin ceph_mds_request_head_legacy. 
*/ +union ceph_mds_request_args_legacy { + struct { + __le32 mask; /* CEPH_CAP_* */ + } __attribute__ ((packed)) getattr; + struct { + __le32 mode; + __le32 uid; + __le32 gid; + struct ceph_timespec mtime; + struct ceph_timespec atime; + __le64 size, old_size; /* old_size needed by truncate */ + __le32 mask; /* CEPH_SETATTR_* */ + } __attribute__ ((packed)) setattr; + struct { + __le32 frag; /* which dir fragment */ + __le32 max_entries; /* how many dentries to grab */ + __le32 max_bytes; + __le16 flags; + __le32 offset_hash; + } __attribute__ ((packed)) readdir; + struct { + __le32 mode; + __le32 rdev; + } __attribute__ ((packed)) mknod; + struct { + __le32 mode; + } __attribute__ ((packed)) mkdir; + struct { + __le32 flags; + __le32 mode; + __le32 stripe_unit; /* layout for newly created file */ + __le32 stripe_count; /* ... */ + __le32 object_size; + __le32 pool; /* if >= 0 and CREATEPOOLID feature */ + __le32 mask; /* CEPH_CAP_* */ + __le64 old_size; /* if O_TRUNC */ + } __attribute__ ((packed)) open; + struct { + __le32 flags; + __le32 osdmap_epoch; /* use for set file/dir layout */ + } __attribute__ ((packed)) setxattr; + struct { + struct ceph_file_layout layout; + } __attribute__ ((packed)) setlayout; + struct { + __u8 rule; /* currently fcntl or flock */ + __u8 type; /* shared, exclusive, remove*/ + __le64 owner; /* who requests/holds the lock */ + __le64 pid; /* process id requesting the lock */ + __le64 start; /* initial location to lock */ + __le64 length; /* num bytes to lock from start */ + __u8 wait; /* will caller wait for lock to become available? 
*/ + } __attribute__ ((packed)) filelock_change; +} __attribute__ ((packed)); + +#define CEPH_MDS_FLAG_REPLAY 1 /* this is a replayed op */ +#define CEPH_MDS_FLAG_WANT_DENTRY 2 /* want dentry in reply */ +#define CEPH_MDS_FLAG_ASYNC 4 /* request is async */ + +struct ceph_mds_request_head_legacy { + __le64 oldest_client_tid; + __le32 mdsmap_epoch; /* on client */ + __le32 flags; /* CEPH_MDS_FLAG_* */ + __u8 num_retry, num_fwd; /* count retry, fwd attempts */ + __le16 num_releases; /* # include cap/lease release records */ + __le32 op; /* mds op code */ + __le32 caller_uid, caller_gid; + __le64 ino; /* use this ino for openc, mkdir, mknod, + etc. (if replaying) */ + union ceph_mds_request_args_legacy args; +} __attribute__ ((packed)); + +/* + * Note that this is embedded wthin ceph_mds_request_head. Also, compatibility + * with the ceph_mds_request_args_legacy must be maintained! + */ +union ceph_mds_request_args { + struct { + __le32 mask; /* CEPH_CAP_* */ + } __attribute__ ((packed)) getattr; + struct { + __le32 mode; + __le32 uid; + __le32 gid; + struct ceph_timespec mtime; + struct ceph_timespec atime; + __le64 size, old_size; /* old_size needed by truncate */ + __le32 mask; /* CEPH_SETATTR_* */ + struct ceph_timespec btime; + } __attribute__ ((packed)) setattr; + struct { + __le32 frag; /* which dir fragment */ + __le32 max_entries; /* how many dentries to grab */ + __le32 max_bytes; + __le16 flags; + __le32 offset_hash; + } __attribute__ ((packed)) readdir; + struct { + __le32 mode; + __le32 rdev; + } __attribute__ ((packed)) mknod; + struct { + __le32 mode; + } __attribute__ ((packed)) mkdir; + struct { + __le32 flags; + __le32 mode; + __le32 stripe_unit; /* layout for newly created file */ + __le32 stripe_count; /* ... 
*/ + __le32 object_size; + __le32 pool; /* if >= 0 and CREATEPOOLID feature */ + __le32 mask; /* CEPH_CAP_* */ + __le64 old_size; /* if O_TRUNC */ + } __attribute__ ((packed)) open; + struct { + __le32 flags; + __le32 osdmap_epoch; /* use for set file/dir layout */ + } __attribute__ ((packed)) setxattr; + struct { + struct ceph_file_layout layout; + } __attribute__ ((packed)) setlayout; + struct { + __u8 rule; /* currently fcntl or flock */ + __u8 type; /* shared, exclusive, remove*/ + __le64 owner; /* who requests/holds the lock */ + __le64 pid; /* process id requesting the lock */ + __le64 start; /* initial location to lock */ + __le64 length; /* num bytes to lock from start */ + __u8 wait; /* will caller wait for lock to become available? */ + } __attribute__ ((packed)) filelock_change; + struct { + __le32 mask; /* CEPH_CAP_* */ + __le64 snapid; + __le64 parent; + __le32 hash; + } __attribute__ ((packed)) lookupino; + struct { + __le32 frag; /* which dir fragment */ + __le32 max_entries; /* how many dentries to grab */ + __le32 max_bytes; + __le16 flags; + __le32 offset_hash; + __le64 snap_other; + } __attribute__ ((packed)) snapdiff; +} __attribute__ ((packed)); + +#define CEPH_MDS_REQUEST_HEAD_VERSION 3 + +/* + * Note that any change to this structure must ensure that it is compatible + * with ceph_mds_request_head_legacy. + */ +struct ceph_mds_request_head { + __le16 version; + __le64 oldest_client_tid; + __le32 mdsmap_epoch; /* on client */ + __le32 flags; /* CEPH_MDS_FLAG_* */ + __u8 num_retry, num_fwd; /* legacy count retry and fwd attempts */ + __le16 num_releases; /* # include cap/lease release records */ + __le32 op; /* mds op code */ + __le32 caller_uid, caller_gid; + __le64 ino; /* use this ino for openc, mkdir, mknod, + etc. 
(if replaying) */ + union ceph_mds_request_args args; + + __le32 ext_num_retry; /* new count retry attempts */ + __le32 ext_num_fwd; /* new count fwd attempts */ + + __le32 struct_len; /* to store size of struct ceph_mds_request_head */ + __le32 owner_uid, owner_gid; /* used for OPs which create inodes */ +} __attribute__ ((packed)); + +void inline encode(const struct ceph_mds_request_head& h, ceph::buffer::list& bl) { + using ceph::encode; + encode(h.version, bl); + encode(h.oldest_client_tid, bl); + encode(h.mdsmap_epoch, bl); + encode(h.flags, bl); + + // For old MDS daemons + __u8 num_retry = __u32(h.ext_num_retry); + __u8 num_fwd = __u32(h.ext_num_fwd); + encode(num_retry, bl); + encode(num_fwd, bl); + + encode(h.num_releases, bl); + encode(h.op, bl); + encode(h.caller_uid, bl); + encode(h.caller_gid, bl); + encode(h.ino, bl); + bl.append((char*)&h.args, sizeof(h.args)); + + if (h.version >= 2) { + encode(h.ext_num_retry, bl); + encode(h.ext_num_fwd, bl); + } + + if (h.version >= 3) { + __u32 struct_len = sizeof(struct ceph_mds_request_head); + encode(struct_len, bl); + encode(h.owner_uid, bl); + encode(h.owner_gid, bl); + + /* + * Please, add new fields handling here. + * You don't need to check h.version as we do it + * in decode(), because decode can properly skip + * all unsupported fields if h.version >= 3. 
+ */ + } +} + +void inline decode(struct ceph_mds_request_head& h, ceph::buffer::list::const_iterator& bl) { + using ceph::decode; + unsigned struct_end = bl.get_off(); + + decode(h.version, bl); + decode(h.oldest_client_tid, bl); + decode(h.mdsmap_epoch, bl); + decode(h.flags, bl); + decode(h.num_retry, bl); + decode(h.num_fwd, bl); + decode(h.num_releases, bl); + decode(h.op, bl); + decode(h.caller_uid, bl); + decode(h.caller_gid, bl); + decode(h.ino, bl); + bl.copy(sizeof(h.args), (char*)&(h.args)); + + if (h.version >= 2) { + decode(h.ext_num_retry, bl); + decode(h.ext_num_fwd, bl); + } else { + h.ext_num_retry = h.num_retry; + h.ext_num_fwd = h.num_fwd; + } + + if (h.version >= 3) { + decode(h.struct_len, bl); + struct_end += h.struct_len; + + decode(h.owner_uid, bl); + decode(h.owner_gid, bl); + } else { + /* + * client is old: let's take caller_{u,g}id as owner_{u,g}id + * this is how it worked before adding of owner_{u,g}id fields. + */ + h.owner_uid = h.caller_uid; + h.owner_gid = h.caller_gid; + } + + /* add new fields handling here */ + + /* + * From version 3 we have struct_len field. + * It allows us to properly handle a case + * when client send struct ceph_mds_request_head + * bigger in size than MDS supports. In this + * case we just want to skip all remaining bytes + * at the end. + * + * See also DECODE_FINISH macro. Unfortunately, + * we can't start using it right now as it will be + * an incompatible protocol change. + */ + if (h.version >= 3) { + if (bl.get_off() > struct_end) + throw ::ceph::buffer::malformed_input(DECODE_ERR_PAST(__PRETTY_FUNCTION__)); + if (bl.get_off() < struct_end) + bl += struct_end - bl.get_off(); + } +} + +/* cap/lease release record */ +struct ceph_mds_request_release { + __le64 ino, cap_id; /* ino and unique cap id */ + __le32 caps, wanted; /* new issued, wanted */ + __le32 seq, issue_seq, mseq; + __le32 dname_seq; /* if releasing a dentry lease, a */ + __le32 dname_len; /* string follows. 
*/ +} __attribute__ ((packed)); + +static inline void +copy_from_legacy_head(struct ceph_mds_request_head *head, + struct ceph_mds_request_head_legacy *legacy) +{ + struct ceph_mds_request_head_legacy *embedded_legacy = + (struct ceph_mds_request_head_legacy *)&head->oldest_client_tid; + *embedded_legacy = *legacy; +} + +static inline void +copy_to_legacy_head(struct ceph_mds_request_head_legacy *legacy, + struct ceph_mds_request_head *head) +{ + struct ceph_mds_request_head_legacy *embedded_legacy = + (struct ceph_mds_request_head_legacy *)&head->oldest_client_tid; + *legacy = *embedded_legacy; +} + +/* client reply */ +struct ceph_mds_reply_head { + __le32 op; + __le32 result; + __le32 mdsmap_epoch; + __u8 safe; /* true if committed to disk */ + __u8 is_dentry, is_target; /* true if dentry, target inode records + are included with reply */ +} __attribute__ ((packed)); + +/* one for each node split */ +struct ceph_frag_tree_split { + __le32 frag; /* this frag splits... */ + __le32 by; /* ...by this many bits */ +} __attribute__ ((packed)); + +struct ceph_frag_tree_head { + __le32 nsplits; /* num ceph_frag_tree_split records */ + struct ceph_frag_tree_split splits[]; +} __attribute__ ((packed)); + +/* capability issue, for bundling with mds reply */ +struct ceph_mds_reply_cap { + __le32 caps, wanted; /* caps issued, wanted */ + __le64 cap_id; + __le32 seq, mseq; + __le64 realm; /* snap realm */ + __u8 flags; /* CEPH_CAP_FLAG_* */ +} __attribute__ ((packed)); + +#define CEPH_CAP_FLAG_AUTH (1 << 0) /* cap is issued by auth mds */ +#define CEPH_CAP_FLAG_RELEASE (1 << 1) /* ask client to release the cap */ + +/* reply_lease follows dname, and reply_inode */ +struct ceph_mds_reply_lease { + __le16 mask; /* lease type(s) */ + __le32 duration_ms; /* lease duration */ + __le32 seq; +} __attribute__ ((packed)); + +#define CEPH_LEASE_VALID (1 | 2) /* old and new bit values */ +#define CEPH_LEASE_PRIMARY_LINK 4 /* primary linkage */ + +struct ceph_mds_reply_dirfrag { + __le32 
frag; /* fragment */ + __le32 auth; /* auth mds, if this is a delegation point */ + __le32 ndist; /* number of mds' this is replicated on */ + __le32 dist[]; +} __attribute__ ((packed)); + +#define CEPH_LOCK_FCNTL 1 +#define CEPH_LOCK_FLOCK 2 +#define CEPH_LOCK_FCNTL_INTR 3 +#define CEPH_LOCK_FLOCK_INTR 4 + +#define CEPH_LOCK_SHARED 1 +#define CEPH_LOCK_EXCL 2 +#define CEPH_LOCK_UNLOCK 4 + +struct ceph_filelock { + __le64 start;/* file offset to start lock at */ + __le64 length; /* num bytes to lock; 0 for all following start */ + __le64 client; /* which client holds the lock */ + __le64 owner; /* who requests/holds the lock */ + __le64 pid; /* process id holding the lock on the client */ + __u8 type; /* shared lock, exclusive lock, or unlock */ +} __attribute__ ((packed)); + + +/* file access modes */ +#define CEPH_FILE_MODE_PIN 0 +#define CEPH_FILE_MODE_RD 1 +#define CEPH_FILE_MODE_WR 2 +#define CEPH_FILE_MODE_RDWR 3 /* RD | WR */ +#define CEPH_FILE_MODE_LAZY 4 /* lazy io */ +#define CEPH_FILE_MODE_NUM 8 /* bc these are bit fields.. 
mostly */ + +int ceph_flags_to_mode(int flags); + +/* inline data state */ +#define CEPH_INLINE_NONE ((__u64)-1) +#define CEPH_INLINE_MAX_SIZE CEPH_MIN_STRIPE_UNIT + +/* capability bits */ +#define CEPH_CAP_PIN 1 /* no specific capabilities beyond the pin */ + +/* generic cap bits */ +/* note: these definitions are duplicated in mds/locks.c */ +#define CEPH_CAP_GSHARED 1 /* client can reads */ +#define CEPH_CAP_GEXCL 2 /* client can read and update */ +#define CEPH_CAP_GCACHE 4 /* (file) client can cache reads */ +#define CEPH_CAP_GRD 8 /* (file) client can read */ +#define CEPH_CAP_GWR 16 /* (file) client can write */ +#define CEPH_CAP_GBUFFER 32 /* (file) client can buffer writes */ +#define CEPH_CAP_GWREXTEND 64 /* (file) client can extend EOF */ +#define CEPH_CAP_GLAZYIO 128 /* (file) client can perform lazy io */ + +#define CEPH_CAP_SIMPLE_BITS 2 +#define CEPH_CAP_FILE_BITS 8 + +/* per-lock shift */ +#define CEPH_CAP_SAUTH 2 +#define CEPH_CAP_SLINK 4 +#define CEPH_CAP_SXATTR 6 +#define CEPH_CAP_SFILE 8 + +/* composed values */ +#define CEPH_CAP_AUTH_SHARED (CEPH_CAP_GSHARED << CEPH_CAP_SAUTH) +#define CEPH_CAP_AUTH_EXCL (CEPH_CAP_GEXCL << CEPH_CAP_SAUTH) +#define CEPH_CAP_LINK_SHARED (CEPH_CAP_GSHARED << CEPH_CAP_SLINK) +#define CEPH_CAP_LINK_EXCL (CEPH_CAP_GEXCL << CEPH_CAP_SLINK) +#define CEPH_CAP_XATTR_SHARED (CEPH_CAP_GSHARED << CEPH_CAP_SXATTR) +#define CEPH_CAP_XATTR_EXCL (CEPH_CAP_GEXCL << CEPH_CAP_SXATTR) +#define CEPH_CAP_FILE(x) ((x) << CEPH_CAP_SFILE) +#define CEPH_CAP_FILE_SHARED (CEPH_CAP_GSHARED << CEPH_CAP_SFILE) +#define CEPH_CAP_FILE_EXCL (CEPH_CAP_GEXCL << CEPH_CAP_SFILE) +#define CEPH_CAP_FILE_CACHE (CEPH_CAP_GCACHE << CEPH_CAP_SFILE) +#define CEPH_CAP_FILE_RD (CEPH_CAP_GRD << CEPH_CAP_SFILE) +#define CEPH_CAP_FILE_WR (CEPH_CAP_GWR << CEPH_CAP_SFILE) +#define CEPH_CAP_FILE_BUFFER (CEPH_CAP_GBUFFER << CEPH_CAP_SFILE) +#define CEPH_CAP_FILE_WREXTEND (CEPH_CAP_GWREXTEND << CEPH_CAP_SFILE) +#define CEPH_CAP_FILE_LAZYIO (CEPH_CAP_GLAZYIO << 
CEPH_CAP_SFILE) + +/* cap masks (for getattr) */ +#define CEPH_STAT_CAP_INODE CEPH_CAP_PIN +#define CEPH_STAT_CAP_TYPE CEPH_CAP_PIN /* mode >> 12 */ +#define CEPH_STAT_CAP_SYMLINK CEPH_CAP_PIN +#define CEPH_STAT_CAP_UID CEPH_CAP_AUTH_SHARED +#define CEPH_STAT_CAP_GID CEPH_CAP_AUTH_SHARED +#define CEPH_STAT_CAP_MODE CEPH_CAP_AUTH_SHARED +#define CEPH_STAT_CAP_NLINK CEPH_CAP_LINK_SHARED +#define CEPH_STAT_CAP_LAYOUT CEPH_CAP_FILE_SHARED +#define CEPH_STAT_CAP_MTIME CEPH_CAP_FILE_SHARED +#define CEPH_STAT_CAP_SIZE CEPH_CAP_FILE_SHARED +#define CEPH_STAT_CAP_ATIME CEPH_CAP_FILE_SHARED /* fixme */ +#define CEPH_STAT_CAP_XATTR CEPH_CAP_XATTR_SHARED +#define CEPH_STAT_CAP_INODE_ALL (CEPH_CAP_PIN | \ + CEPH_CAP_AUTH_SHARED | \ + CEPH_CAP_LINK_SHARED | \ + CEPH_CAP_FILE_SHARED | \ + CEPH_CAP_XATTR_SHARED) +#define CEPH_STAT_CAP_INLINE_DATA (CEPH_CAP_FILE_SHARED | \ + CEPH_CAP_FILE_RD) +#define CEPH_STAT_RSTAT CEPH_CAP_FILE_WREXTEND + +#define CEPH_CAP_ANY_SHARED (CEPH_CAP_AUTH_SHARED | \ + CEPH_CAP_LINK_SHARED | \ + CEPH_CAP_XATTR_SHARED | \ + CEPH_CAP_FILE_SHARED) +#define CEPH_CAP_ANY_RD (CEPH_CAP_ANY_SHARED | CEPH_CAP_FILE_RD | \ + CEPH_CAP_FILE_CACHE) + +#define CEPH_CAP_ANY_EXCL (CEPH_CAP_AUTH_EXCL | \ + CEPH_CAP_LINK_EXCL | \ + CEPH_CAP_XATTR_EXCL | \ + CEPH_CAP_FILE_EXCL) +#define CEPH_CAP_ANY_FILE_RD (CEPH_CAP_FILE_RD | CEPH_CAP_FILE_CACHE | \ + CEPH_CAP_FILE_SHARED) +#define CEPH_CAP_ANY_FILE_WR (CEPH_CAP_FILE_WR | CEPH_CAP_FILE_BUFFER | \ + CEPH_CAP_FILE_EXCL) +#define CEPH_CAP_ANY_WR (CEPH_CAP_ANY_EXCL | CEPH_CAP_ANY_FILE_WR) +#define CEPH_CAP_ANY (CEPH_CAP_ANY_RD | CEPH_CAP_ANY_EXCL | \ + CEPH_CAP_ANY_FILE_WR | CEPH_CAP_FILE_LAZYIO | \ + CEPH_CAP_PIN) + +#define CEPH_CAP_LOCKS (CEPH_LOCK_IFILE | CEPH_LOCK_IAUTH | CEPH_LOCK_ILINK | \ + CEPH_LOCK_IXATTR) + +/* cap masks async dir operations */ +#define CEPH_CAP_DIR_CREATE CEPH_CAP_FILE_CACHE +#define CEPH_CAP_DIR_UNLINK CEPH_CAP_FILE_RD +#define CEPH_CAP_ANY_DIR_OPS (CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_RD | \ + 
CEPH_CAP_FILE_WREXTEND | CEPH_CAP_FILE_LAZYIO) + + +int ceph_caps_for_mode(int mode); + +enum { + CEPH_CAP_OP_GRANT, /* mds->client grant */ + CEPH_CAP_OP_REVOKE, /* mds->client revoke */ + CEPH_CAP_OP_TRUNC, /* mds->client trunc notify */ + CEPH_CAP_OP_EXPORT, /* mds has exported the cap */ + CEPH_CAP_OP_IMPORT, /* mds has imported the cap */ + CEPH_CAP_OP_UPDATE, /* client->mds update */ + CEPH_CAP_OP_DROP, /* client->mds drop cap bits */ + CEPH_CAP_OP_FLUSH, /* client->mds cap writeback */ + CEPH_CAP_OP_FLUSH_ACK, /* mds->client flushed */ + CEPH_CAP_OP_FLUSHSNAP, /* client->mds flush snapped metadata */ + CEPH_CAP_OP_FLUSHSNAP_ACK, /* mds->client flushed snapped metadata */ + CEPH_CAP_OP_RELEASE, /* client->mds release (clean) cap */ + CEPH_CAP_OP_RENEW, /* client->mds renewal request */ +}; + +extern const char *ceph_cap_op_name(int op); + +/* extra info for cap import/export */ +struct ceph_mds_cap_peer { + __le64 cap_id; + __le32 seq; + __le32 mseq; + __le32 mds; + __u8 flags; +} __attribute__ ((packed)); + +/* + * caps message, used for capability callbacks, acks, requests, etc. 
+ */ +struct ceph_mds_caps_head { + __le32 op; /* CEPH_CAP_OP_* */ + __le64 ino, realm; + __le64 cap_id; + __le32 seq, issue_seq; + __le32 caps, wanted, dirty; /* latest issued/wanted/dirty */ + __le32 migrate_seq; + __le64 snap_follows; + __le32 snap_trace_len; + + /* authlock */ + __le32 uid, gid, mode; + + /* linklock */ + __le32 nlink; + + /* xattrlock */ + __le32 xattr_len; + __le64 xattr_version; +} __attribute__ ((packed)); + +struct ceph_mds_caps_non_export_body { + /* all except export */ + /* filelock */ + __le64 size, max_size, truncate_size; + __le32 truncate_seq; + struct ceph_timespec mtime, atime, ctime; + struct ceph_file_layout layout; + __le32 time_warp_seq; +} __attribute__ ((packed)); + +struct ceph_mds_caps_export_body { + /* export message */ + struct ceph_mds_cap_peer peer; +} __attribute__ ((packed)); + +/* cap release msg head */ +struct ceph_mds_cap_release { + __le32 num; /* number of cap_items that follow */ +} __attribute__ ((packed)); + +struct ceph_mds_cap_item { + __le64 ino; + __le64 cap_id; + __le32 migrate_seq, seq; +} __attribute__ ((packed)); + +#define CEPH_MDS_LEASE_REVOKE 1 /* mds -> client */ +#define CEPH_MDS_LEASE_RELEASE 2 /* client -> mds */ +#define CEPH_MDS_LEASE_RENEW 3 /* client <-> mds */ +#define CEPH_MDS_LEASE_REVOKE_ACK 4 /* client -> mds */ + +extern const char *ceph_lease_op_name(int o); + +/* lease msg header */ +struct ceph_mds_lease { + __u8 action; /* CEPH_MDS_LEASE_* */ + __le16 mask; /* which lease */ + __le64 ino; + __le64 first, last; /* snap range */ + __le32 seq; + __le32 duration_ms; /* duration of renewal */ +} __attribute__ ((packed)); +/* followed by a __le32+string for dname */ + +/* client reconnect */ +struct ceph_mds_cap_reconnect { + __le64 cap_id; + __le32 wanted; + __le32 issued; + __le64 snaprealm; + __le64 pathbase; /* base ino for our path to this ino */ + __le32 flock_len; /* size of flock state blob, if any */ +} __attribute__ ((packed)); +/* followed by flock blob */ + +struct 
ceph_mds_cap_reconnect_v1 { + __le64 cap_id; + __le32 wanted; + __le32 issued; + __le64 size; + struct ceph_timespec mtime, atime; + __le64 snaprealm; + __le64 pathbase; /* base ino for our path to this ino */ +} __attribute__ ((packed)); + +struct ceph_mds_snaprealm_reconnect { + __le64 ino; /* snap realm base */ + __le64 seq; /* snap seq for this snap realm */ + __le64 parent; /* parent realm */ +} __attribute__ ((packed)); + +/* + * snaps + */ +enum { + CEPH_SNAP_OP_UPDATE, /* CREATE or DESTROY */ + CEPH_SNAP_OP_CREATE, + CEPH_SNAP_OP_DESTROY, + CEPH_SNAP_OP_SPLIT, +}; + +extern const char *ceph_snap_op_name(int o); + +/* snap msg header */ +struct ceph_mds_snap_head { + __le32 op; /* CEPH_SNAP_OP_* */ + __le64 split; /* ino to split off, if any */ + __le32 num_split_inos; /* # inos belonging to new child realm */ + __le32 num_split_realms; /* # child realms udner new child realm */ + __le32 trace_len; /* size of snap trace blob */ +} __attribute__ ((packed)); +/* followed by split ino list, then split realms, then the trace blob */ + +/* + * encode info about a snaprealm, as viewed by a client + */ +struct ceph_mds_snap_realm { + __le64 ino; /* ino */ + __le64 created; /* snap: when created */ + __le64 parent; /* ino: parent realm */ + __le64 parent_since; /* snap: same parent since */ + __le64 seq; /* snap: version */ + __le32 num_snaps; + __le32 num_prior_parent_snaps; +} __attribute__ ((packed)); +/* followed by my snap list, then prior parent snap list */ + +#ifndef __KERNEL__ +#undef __le16 +#undef __le32 +#undef __le64 +#endif + +#endif diff --git a/src/include/ceph_fuse.h b/src/include/ceph_fuse.h new file mode 100644 index 000000000..cfa8097bb --- /dev/null +++ b/src/include/ceph_fuse.h @@ -0,0 +1,51 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2012 Inktank Storage, Inc. 
+ * Copyright (C) 2014 Red Hat <contact@redhat.com> + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. + */ +#ifndef CEPH_FUSE_H +#define CEPH_FUSE_H + +/* + * The API version that we want to use, regardless of what the + * library version is. Note that this must be defined before + * fuse.h is included. + */ +#ifndef FUSE_USE_VERSION +#define FUSE_USE_VERSION 312 +#endif + +#include <fuse.h> +#include "acconfig.h" + +/* + * Redefine the FUSE_VERSION macro defined in "fuse_common.h" + * header file, because the MINOR numner has been forgotten to + * update since libfuse 3.2 to 3.8. We need to fetch the MINOR + * number from pkgconfig file. + */ +#ifdef FUSE_VERSION +#undef FUSE_VERSION +#define FUSE_VERSION FUSE_MAKE_VERSION(CEPH_FUSE_MAJOR_VERSION, CEPH_FUSE_MINOR_VERSION) +#endif + +static inline int filler_compat(fuse_fill_dir_t filler, + void *buf, const char *name, + const struct stat *stbuf, + off_t off) +{ + return filler(buf, name, stbuf, off +#if FUSE_VERSION >= FUSE_MAKE_VERSION(3, 0) + , static_cast<enum fuse_fill_dir_flags>(0) +#endif + ); +} +#endif /* CEPH_FUSE_H */ diff --git a/src/include/ceph_hash.h b/src/include/ceph_hash.h new file mode 100644 index 000000000..f9d80ac36 --- /dev/null +++ b/src/include/ceph_hash.h @@ -0,0 +1,14 @@ +#ifndef FS_CEPH_HASH_H +#define FS_CEPH_HASH_H + +#define CEPH_STR_HASH_LINUX 0x1 /* linux dcache hash */ +#define CEPH_STR_HASH_RJENKINS 0x2 /* robert jenkins' */ + +extern unsigned ceph_str_hash_linux(const char *s, unsigned len); +extern unsigned ceph_str_hash_rjenkins(const char *s, unsigned len); + +extern unsigned ceph_str_hash(int type, const char *s, unsigned len); +extern const char *ceph_str_hash_name(int type); +extern bool ceph_str_hash_valid(int type); + +#endif diff --git a/src/include/cephfs/ceph_ll_client.h 
b/src/include/cephfs/ceph_ll_client.h new file mode 100644 index 000000000..ac5b7c224 --- /dev/null +++ b/src/include/cephfs/ceph_ll_client.h @@ -0,0 +1,215 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * scalable distributed file system + * + * Copyright (C) Jeff Layton <jlayton@redhat.com> + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. + */ + +#ifndef CEPH_CEPH_LL_CLIENT_H +#define CEPH_CEPH_LL_CLIENT_H +#include <stdint.h> + +#ifdef _WIN32 +#include "include/win32/fs_compat.h" +#endif + +#ifdef __cplusplus +extern "C" { + +class Fh; + +struct inodeno_t; +struct vinodeno_t; +typedef struct vinodeno_t vinodeno; + +#else /* __cplusplus */ + +typedef struct Fh Fh; + +typedef struct inodeno_t { + uint64_t val; +} inodeno_t; + +typedef struct _snapid_t { + uint64_t val; +} snapid_t; + +typedef struct vinodeno_t { + inodeno_t ino; + snapid_t snapid; +} vinodeno_t; + +#endif /* __cplusplus */ + +/* + * Heavily borrowed from David Howells' draft statx patchset. + * + * Since the xstat patches are still a work in progress, we borrow its data + * structures and #defines to implement ceph_getattrx. Once the xstat stuff + * has been merged we should drop this and switch over to using that instead. 
+ */ +struct ceph_statx { + uint32_t stx_mask; + uint32_t stx_blksize; + uint32_t stx_nlink; + uint32_t stx_uid; + uint32_t stx_gid; + uint16_t stx_mode; + uint64_t stx_ino; + uint64_t stx_size; + uint64_t stx_blocks; + dev_t stx_dev; + dev_t stx_rdev; + struct timespec stx_atime; + struct timespec stx_ctime; + struct timespec stx_mtime; + struct timespec stx_btime; + uint64_t stx_version; +}; + +#define CEPH_STATX_MODE 0x00000001U /* Want/got stx_mode */ +#define CEPH_STATX_NLINK 0x00000002U /* Want/got stx_nlink */ +#define CEPH_STATX_UID 0x00000004U /* Want/got stx_uid */ +#define CEPH_STATX_GID 0x00000008U /* Want/got stx_gid */ +#define CEPH_STATX_RDEV 0x00000010U /* Want/got stx_rdev */ +#define CEPH_STATX_ATIME 0x00000020U /* Want/got stx_atime */ +#define CEPH_STATX_MTIME 0x00000040U /* Want/got stx_mtime */ +#define CEPH_STATX_CTIME 0x00000080U /* Want/got stx_ctime */ +#define CEPH_STATX_INO 0x00000100U /* Want/got stx_ino */ +#define CEPH_STATX_SIZE 0x00000200U /* Want/got stx_size */ +#define CEPH_STATX_BLOCKS 0x00000400U /* Want/got stx_blocks */ +#define CEPH_STATX_BASIC_STATS 0x000007ffU /* The stuff in the normal stat struct */ +#define CEPH_STATX_BTIME 0x00000800U /* Want/got stx_btime */ +#define CEPH_STATX_VERSION 0x00001000U /* Want/got stx_version */ +#define CEPH_STATX_ALL_STATS 0x00001fffU /* All supported stats */ + +/* + * Compatibility macros until these defines make their way into glibc + */ +#ifndef AT_STATX_DONT_SYNC +#define AT_STATX_SYNC_TYPE 0x6000 +#define AT_STATX_SYNC_AS_STAT 0x0000 +#define AT_STATX_FORCE_SYNC 0x2000 +#define AT_STATX_DONT_SYNC 0x4000 /* Don't sync attributes with the server */ +#endif + +/* + * This is deprecated and just for backwards compatibility. + * Please use AT_STATX_DONT_SYNC instead. + */ +#define AT_NO_ATTR_SYNC AT_STATX_DONT_SYNC /* Deprecated */ + +/* + * The statx interfaces only allow these flags. In order to allow us to add + * others in the future, we disallow setting any that aren't recognized. 
+ */ +#define CEPH_REQ_FLAG_MASK (AT_SYMLINK_NOFOLLOW|AT_STATX_DONT_SYNC) + +/* fallocate mode flags */ +#ifndef FALLOC_FL_KEEP_SIZE +#define FALLOC_FL_KEEP_SIZE 0x01 +#endif +#ifndef FALLOC_FL_PUNCH_HOLE +#define FALLOC_FL_PUNCH_HOLE 0x02 +#endif + +/** ceph_deleg_cb_t: Delegation recalls + * + * Called when there is an outstanding Delegation and there is conflicting + * access, either locally or via cap activity. + * @fh: open filehandle + * @priv: private info registered when delegation was acquired + */ +typedef void (*ceph_deleg_cb_t)(Fh *fh, void *priv); + +/** + * client_ino_callback_t: Inode data/metadata invalidation + * + * Called when the client wants to invalidate the cached data for a range + * in the file. + * @handle: client callback handle + * @ino: vino of inode to be invalidated + * @off: starting offset of content to be invalidated + * @len: length of region to invalidate + */ +typedef void (*client_ino_callback_t)(void *handle, vinodeno_t ino, + int64_t off, int64_t len); + +/** + * client_dentry_callback_t: Dentry invalidation + * + * Called when the client wants to purge a dentry from its cache. + * @handle: client callback handle + * @dirino: vino of directory that contains dentry to be invalidate + * @ino: vino of inode attached to dentry to be invalidated + * @name: name of dentry to be invalidated + * @len: length of @name + */ +typedef void (*client_dentry_callback_t)(void *handle, vinodeno_t dirino, + vinodeno_t ino, const char *name, + size_t len); + +/** + * client_remount_callback_t: Remount entire fs + * + * Called when the client needs to purge the dentry cache and the application + * doesn't have a way to purge an individual dentry. Mostly used for ceph-fuse + * on older kernels. 
+ * @handle: client callback handle + */ + +typedef int (*client_remount_callback_t)(void *handle); + +/** + * client_switch_interrupt_callback_t: Lock request interrupted + * + * Called before file lock request to set the interrupt handler while waiting + * After the wait, called with "data" set to NULL pointer. + * @handle: client callback handle + * @data: opaque data passed to interrupt before call, NULL pointer after. + */ +typedef void (*client_switch_interrupt_callback_t)(void *handle, void *data); + +/** + * client_umask_callback_t: Fetch umask of actor + * + * Called when the client needs the umask of the requestor. + * @handle: client callback handle + */ +typedef mode_t (*client_umask_callback_t)(void *handle); + +/** + * client_ino_release_t: Request that application release Inode references + * + * Called when the MDS wants to trim caps and Inode records. + * @handle: client callback handle + * @ino: vino of Inode being released + */ +typedef void (*client_ino_release_t)(void *handle, vinodeno_t ino); + +/* + * The handle is an opaque value that gets passed to some callbacks. Any fields + * set to NULL will be left alone. There is no way to unregister callbacks. 
+ */ +struct ceph_client_callback_args { + void *handle; + client_ino_callback_t ino_cb; + client_dentry_callback_t dentry_cb; + client_switch_interrupt_callback_t switch_intr_cb; + client_remount_callback_t remount_cb; + client_umask_callback_t umask_cb; + client_ino_release_t ino_release_cb; +}; + +#ifdef __cplusplus +} +#endif + +#endif /* CEPH_STATX_H */ + diff --git a/src/include/cephfs/libcephfs.h b/src/include/cephfs/libcephfs.h new file mode 100644 index 000000000..dc62698fa --- /dev/null +++ b/src/include/cephfs/libcephfs.h @@ -0,0 +1,2201 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2009-2011 New Dream Network + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. + * + */ + +#ifndef CEPH_LIB_H +#define CEPH_LIB_H + +#if defined(__linux__) +#include <features.h> +#endif +#include <utime.h> +#include <sys/stat.h> +#include <sys/time.h> +#include <sys/types.h> +#include <sys/statvfs.h> +#include <sys/socket.h> +#include <stdint.h> +#include <stdbool.h> +#include <fcntl.h> +#include <dirent.h> + +#include "ceph_ll_client.h" + +#ifdef __cplusplus +namespace ceph::common { + class CephContext; +} +using CephContext = ceph::common::CephContext; +extern "C" { +#endif + +#define LIBCEPHFS_VER_MAJOR 10 +#define LIBCEPHFS_VER_MINOR 0 +#define LIBCEPHFS_VER_EXTRA 3 + +#define LIBCEPHFS_VERSION(maj, min, extra) ((maj << 16) + (min << 8) + extra) +#define LIBCEPHFS_VERSION_CODE LIBCEPHFS_VERSION(LIBCEPHFS_VER_MAJOR, LIBCEPHFS_VER_MINOR, LIBCEPHFS_VER_EXTRA) + +#if __GNUC__ >= 4 + #define LIBCEPHFS_DEPRECATED __attribute__((deprecated)) + #pragma GCC diagnostic push + #pragma GCC diagnostic ignored "-Wdeprecated-declarations" +#else + #define LIBCEPHFS_DEPRECATED +#endif + +/* + * If 
using glibc check that file offset is 64-bit. + */ +#if defined(__GLIBC__) && !defined(__USE_FILE_OFFSET64) +# error libceph: glibc must define __USE_FILE_OFFSET64 or readdir results will be corrupted +#endif + +/* + * XXXX redeclarations from ceph_fs.h, rados.h, etc. We need more of this + * in the interface, but shouldn't be re-typing it (and using different + * C data types). + */ +#ifndef __cplusplus + +#define CEPH_INO_ROOT 1 +#define CEPH_NOSNAP ((uint64_t)(-2)) + +struct ceph_file_layout { + /* file -> object mapping */ + uint32_t fl_stripe_unit; /* stripe unit, in bytes. must be multiple + of page size. */ + uint32_t fl_stripe_count; /* over this many objects */ + uint32_t fl_object_size; /* until objects are this big, then move to + new objects */ + uint32_t fl_cas_hash; /* 0 = none; 1 = sha256 */ + + /* pg -> disk layout */ + uint32_t fl_object_stripe_unit; /* for per-object parity, if any */ + + /* object -> pg layout */ + uint32_t fl_pg_preferred; /* preferred primary for pg (-1 for none) */ + uint32_t fl_pg_pool; /* namespace, crush rule, rep level */ +} __attribute__ ((packed)); + +struct CephContext; +#endif /* ! __cplusplus */ + +struct UserPerm; +typedef struct UserPerm UserPerm; + +struct Inode; +typedef struct Inode Inode; + +struct ceph_mount_info; +struct ceph_dir_result; + +// user supplied key,value pair to be associated with a snapshot. +// callers can supply an array of this struct via ceph_mksnap(). 
+struct snap_metadata { + const char *key; + const char *value; +}; + +struct snap_info { + uint64_t id; + size_t nr_snap_metadata; + struct snap_metadata *snap_metadata; +}; + +struct ceph_snapdiff_entry_t { + struct dirent dir_entry; + uint64_t snapid; //should be snapid_t but prefer not to exposure it +}; + +/* setattr mask bits (up to an int in size) */ +#ifndef CEPH_SETATTR_MODE +#define CEPH_SETATTR_MODE (1 << 0) +#define CEPH_SETATTR_UID (1 << 1) +#define CEPH_SETATTR_GID (1 << 2) +#define CEPH_SETATTR_MTIME (1 << 3) +#define CEPH_SETATTR_ATIME (1 << 4) +#define CEPH_SETATTR_SIZE (1 << 5) +#define CEPH_SETATTR_CTIME (1 << 6) +#define CEPH_SETATTR_MTIME_NOW (1 << 7) +#define CEPH_SETATTR_ATIME_NOW (1 << 8) +#define CEPH_SETATTR_BTIME (1 << 9) +#define CEPH_SETATTR_KILL_SGUID (1 << 10) +#define CEPH_SETATTR_FSCRYPT_AUTH (1 << 11) +#define CEPH_SETATTR_FSCRYPT_FILE (1 << 12) +#define CEPH_SETATTR_KILL_SUID (1 << 13) +#define CEPH_SETATTR_KILL_SGID (1 << 14) +#endif + +/* define error codes for the mount function*/ +# define CEPHFS_ERROR_MON_MAP_BUILD 1000 +# define CEPHFS_ERROR_NEW_CLIENT 1002 +# define CEPHFS_ERROR_MESSENGER_START 1003 + +/** + * Create a UserPerm credential object. + * + * Some calls (most notably, the ceph_ll_* ones), take a credential object + * that represents the credentials that the calling program is using. This + * function creates a new credential object for this purpose. Returns a + * pointer to the object, or NULL if it can't be allocated. + * + * Note that the gidlist array is used directly and is not copied. It must + * remain valid over the lifetime of the created UserPerm object. + * + * @param uid uid to be used + * @param gid gid to be used + * @param ngids number of gids in supplemental grouplist + * @param gidlist array of gid_t's in the list of groups + */ +UserPerm *ceph_userperm_new(uid_t uid, gid_t gid, int ngids, gid_t *gidlist); + +/** + * Destroy a UserPerm credential object. 
+ * + * @param perm pointer to object to be destroyed + * + * Currently this just frees the object. Note that the gidlist array is not + * freed. The caller must do so if it's necessary. + */ +void ceph_userperm_destroy(UserPerm *perm); + +/** + * Get a pointer to the default UserPerm object for the mount. + * + * @param cmount the mount info handle + * + * Every cmount has a default set of credentials. This returns a pointer to + * that object. + * + * Unlike with ceph_userperm_new, this object should not be freed. + */ +struct UserPerm *ceph_mount_perms(struct ceph_mount_info *cmount); + +/** + * Set cmount's default permissions + * + * @param cmount the mount info handle + * @param perm permissions to set to default for mount + * + * Every cmount has a default set of credentials. This does a deep copy of + * the given permissions to the ones in the cmount. Must be done after + * ceph_init but before ceph_mount. + * + * Returns 0 on success, and -EISCONN if the cmount is already mounted. + */ +int ceph_mount_perms_set(struct ceph_mount_info *cmount, UserPerm *perm); + +/** + * @defgroup libcephfs_h_init Setup and Teardown + * These are the first and last functions that should be called + * when using libcephfs. + * + * @{ + */ + +/** + * Get the version of libcephfs. + * + * The version number is major.minor.patch. + * + * @param major where to store the major version number + * @param minor where to store the minor version number + * @param patch where to store the extra version number + */ +const char *ceph_version(int *major, int *minor, int *patch); + +/** + * Create a mount handle for interacting with Ceph. All libcephfs + * functions operate on a mount info handle. + * + * @param cmount the mount info handle to initialize + * @param id the id of the client. This can be a unique id that identifies + * this client, and will get appended onto "client.". Callers can + * pass in NULL, and the id will be the process id of the client. 
+ * @returns 0 on success, negative error code on failure + */ +int ceph_create(struct ceph_mount_info **cmount, const char * const id); + +/** + * Create a mount handle from a CephContext, which holds the configuration + * for the ceph cluster. A CephContext can be acquired from an existing ceph_mount_info + * handle, using the @ref ceph_get_mount_context call. Note that using the same CephContext + * for two different mount handles results in the same client entity id being used. + * + * @param cmount the mount info handle to initialize + * @param conf reuse this pre-existing CephContext config + * @returns 0 on success, negative error code on failure + */ +#ifdef __cplusplus +int ceph_create_with_context(struct ceph_mount_info **cmount, CephContext *conf); +#else +int ceph_create_with_context(struct ceph_mount_info **cmount, struct CephContext *conf); +#endif + +#ifndef VOIDPTR_RADOS_T +#define VOIDPTR_RADOS_T +typedef void *rados_t; +#endif // VOIDPTR_RADOS_T + +/** + * Create a mount handle from a rados_t, for using libcephfs in the + * same process as librados. + * + * @param cmount the mount info handle to initialize + * @param cluster reference to already-initialized librados handle + * @returns 0 on success, negative error code on failure + */ +int ceph_create_from_rados(struct ceph_mount_info **cmount, rados_t cluster); + +/** + * Initialize the filesystem client (but do not mount the filesystem yet) + * + * @returns 0 on success, negative error code on failure + */ +int ceph_init(struct ceph_mount_info *cmount); + +/** + * Optionally set which filesystem to mount, before calling mount. + * + * An error will be returned if this libcephfs instance is already + * mounted. This function is an alternative to setting the global + * client_fs setting. Using this function enables multiple libcephfs + * instances in the same process to mount different filesystems. + * + * The filesystem name is *not* validated in this function. 
That happens + * during mount(), where an ENOENT error will result if a non-existent + * filesystem was specified here. + * + * @param cmount the mount info handle + * @param fs_name name of the filesystem to select + * @returns 0 on success, negative error code on failure + */ +int ceph_select_filesystem(struct ceph_mount_info *cmount, const char *fs_name); + + +/** + * Perform a mount using the path for the root of the mount. + * + * It is optional to call ceph_init before this. If ceph_init has + * not already been called, it will be called in the course of this operation. + * + * @param cmount the mount info handle + * @param root the path for the root of the mount. This can be an existing + * directory within the ceph cluster, but most likely it will + * be "/". Passing in NULL is equivalent to "/". + * @returns 0 on success, negative error code on failure + */ +int ceph_mount(struct ceph_mount_info *cmount, const char *root); + +/** + * Return cluster ID for a mounted ceph filesystem + * + * Every ceph filesystem has a filesystem ID associated with it. This + * function returns that value. If the ceph_mount_info does not refer to a + * mounted filesystem, this returns a negative error code. + */ +int64_t ceph_get_fs_cid(struct ceph_mount_info *cmount); + +/** + * Execute a management command remotely on an MDS. + * + * Must have called ceph_init or ceph_mount before calling this. 
+ * + * @param mds_spec string representing rank, MDS name, GID or '*' + * @param cmd array of null-terminated strings + * @param cmdlen length of cmd array + * @param inbuf non-null-terminated input data to command + * @param inbuflen length in octets of inbuf + * @param outbuf populated with pointer to buffer (command output data) + * @param outbuflen length of allocated outbuf + * @param outs populated with pointer to buffer (command error strings) + * @param outslen length of allocated outs + * + * @return 0 on success, negative error code on failure + * + */ +int ceph_mds_command(struct ceph_mount_info *cmount, + const char *mds_spec, + const char **cmd, + size_t cmdlen, + const char *inbuf, size_t inbuflen, + char **outbuf, size_t *outbuflen, + char **outs, size_t *outslen); + +/** + * Free a buffer, such as those used for output arrays from ceph_mds_command + */ +void ceph_buffer_free(char *buf); + +/** + * Unmount a mount handle. + * + * @param cmount the mount handle + * @return 0 on success, negative error code on failure + */ +int ceph_unmount(struct ceph_mount_info *cmount); + +/** + * Abort mds connections + * + * @param cmount the mount handle + * @return 0 on success, negative error code on failure + */ +int ceph_abort_conn(struct ceph_mount_info *cmount); + +/** + * Destroy the mount handle. + * + * The handle should not be mounted. This should be called on completion of + * all libcephfs functions. + * + * @param cmount the mount handle + * @return 0 on success, negative error code on failure. + */ +int ceph_release(struct ceph_mount_info *cmount); + +/** + * Deprecated. Unmount and destroy the ceph mount handle. This should be + * called on completion of all libcephfs functions. + * + * Equivalent to ceph_unmount() + ceph_release() without error handling. 
+ * + * @param cmount the mount handle to shutdown + */ +void ceph_shutdown(struct ceph_mount_info *cmount); + +/** + * Return associated client addresses + * + * @param cmount the mount handle + * @param addrs the output addresses + * @returns 0 on success, a negative error code on failure + * @note the returned addrs should be freed by the caller + */ +int ceph_getaddrs(struct ceph_mount_info *cmount, char** addrs); + +/** + * Get a global id for current instance + * + * Returns the global id identifying this client instance. + * + * @param cmount the mount handle + * @returns instance global id + */ +uint64_t ceph_get_instance_id(struct ceph_mount_info *cmount); + +/** + * Extract the CephContext from the mount point handle. + * + * @param cmount the ceph mount handle to get the context from. + * @returns the CephContext associated with the mount handle. + */ +#ifdef __cplusplus +CephContext *ceph_get_mount_context(struct ceph_mount_info *cmount); +#else +struct CephContext *ceph_get_mount_context(struct ceph_mount_info *cmount); +#endif +/* + * Check mount status. + * + * Return non-zero value if mounted. Otherwise, zero. + */ +int ceph_is_mounted(struct ceph_mount_info *cmount); + +/** @} init */ + +/** + * @defgroup libcephfs_h_config Config + * Functions for manipulating the Ceph configuration at runtime. + * + * @{ + */ + +/** + * Load the ceph configuration from the specified config file. + * + * @param cmount the mount handle to load the configuration into. + * @param path_list the configuration file path + * @returns 0 on success, negative error code on failure + */ +int ceph_conf_read_file(struct ceph_mount_info *cmount, const char *path_list); + +/** + * Parse the command line arguments and load the configuration parameters. + * + * @param cmount the mount handle to load the configuration parameters into. 
+ * @param argc count of the arguments in argv + * @param argv the argument list + * @returns 0 on success, negative error code on failure + */ +int ceph_conf_parse_argv(struct ceph_mount_info *cmount, int argc, const char **argv); + +/** + * Configure the cluster handle based on an environment variable + * + * The contents of the environment variable are parsed as if they were + * Ceph command line options. If var is NULL, the CEPH_ARGS + * environment variable is used. + * + * @pre ceph_mount() has not been called on the handle + * + * @note BUG: this is not threadsafe - it uses a static buffer + * + * @param cmount handle to configure + * @param var name of the environment variable to read + * @returns 0 on success, negative error code on failure + */ +int ceph_conf_parse_env(struct ceph_mount_info *cmount, const char *var); + +/** Sets a configuration value from a string. + * + * @param cmount the mount handle to set the configuration value on + * @param option the configuration option to set + * @param value the value of the configuration option to set + * + * @returns 0 on success, negative error code otherwise. + */ +int ceph_conf_set(struct ceph_mount_info *cmount, const char *option, const char *value); + +/** Set mount timeout. + * + * @param cmount mount handle to set the configuration value on + * @param timeout mount timeout interval + * + * @returns 0 on success, negative error code otherwise. + */ +int ceph_set_mount_timeout(struct ceph_mount_info *cmount, uint32_t timeout); + +/** + * Gets the configuration value as a string. + * + * @param cmount the mount handle to set the configuration value on + * @param option the config option to get + * @param buf the buffer to fill with the value + * @param len the length of the buffer. 
+ * @returns the size of the buffer filled in with the value, or negative error code on failure + */ +int ceph_conf_get(struct ceph_mount_info *cmount, const char *option, char *buf, size_t len); + +/** @} config */ + +/** + * @defgroup libcephfs_h_fsops File System Operations. + * Functions for getting/setting file system wide information specific to a particular + * mount handle. + * + * @{ + */ + +/** + * Perform a statfs on the ceph file system. This call fills in file system wide statistics + * into the passed in buffer. + * + * @param cmount the ceph mount handle to use for performing the statfs. + * @param path can be any path within the mounted filesystem + * @param stbuf the file system statistics filled in by this function. + * @return 0 on success, negative error code otherwise. + */ +int ceph_statfs(struct ceph_mount_info *cmount, const char *path, struct statvfs *stbuf); + +/** + * Synchronize all filesystem data to persistent media. + * + * @param cmount the ceph mount handle to use for performing the sync_fs. + * @returns 0 on success or negative error code on failure. + */ +int ceph_sync_fs(struct ceph_mount_info *cmount); + +/** + * Get the current working directory. + * + * @param cmount the ceph mount to get the current working directory for. + * @returns the path to the current working directory + */ +const char* ceph_getcwd(struct ceph_mount_info *cmount); + +/** + * Change the current working directory. + * + * @param cmount the ceph mount to change the current working directory for. + * @param path the path to the working directory to change into. + * @returns 0 on success, negative error code otherwise. + */ +int ceph_chdir(struct ceph_mount_info *cmount, const char *path); + +/** @} fsops */ + +/** + * @defgroup libcephfs_h_dir Directory Operations. + * Functions for manipulating and listing directories. + * + * @{ + */ + +/** + * Open the given directory. 
+ * + * @param cmount the ceph mount handle to use to open the directory + * @param name the path name of the directory to open. Must be either an absolute path + * or a path relative to the current working directory. + * @param dirpp the directory result pointer structure to fill in. + * @returns 0 on success or negative error code otherwise. + */ +int ceph_opendir(struct ceph_mount_info *cmount, const char *name, struct ceph_dir_result **dirpp); + +/** + * Open a directory referred to by a file descriptor + * + * @param cmount the ceph mount handle to use to open the directory + * @param dirfd open file descriptor for the directory + * @param dirpp the directory result pointer structure to fill in + * @returns 0 on success or negative error code otherwise + */ +int ceph_fdopendir(struct ceph_mount_info *cmount, int dirfd, struct ceph_dir_result **dirpp); + +/** + * Close the open directory. + * + * @param cmount the ceph mount handle to use for closing the directory + * @param dirp the directory result pointer (set by ceph_opendir) to close + * @returns 0 on success or negative error code on failure. + */ +int ceph_closedir(struct ceph_mount_info *cmount, struct ceph_dir_result *dirp); + +/** + * Get the next entry in an open directory. + * + * @param cmount the ceph mount handle to use for performing the readdir. + * @param dirp the directory stream pointer from an opendir holding the state of the + * next entry to return. + * @returns the next directory entry or NULL if at the end of the directory (or the directory + * is empty. This pointer should not be freed by the caller, and is only safe to + * access between return and the next call to ceph_readdir or ceph_closedir. + */ +struct dirent * ceph_readdir(struct ceph_mount_info *cmount, struct ceph_dir_result *dirp); + +/** + * A safe version of ceph_readdir, where the directory entry struct is allocated by the caller. + * + * @param cmount the ceph mount handle to use for performing the readdir. 
+ * @param dirp the directory stream pointer from an opendir holding the state of the + * next entry to return. + * @param de the directory entry pointer filled in with the next directory entry of the dirp state. + * @returns 1 if the next entry was filled in, 0 if the end of the directory stream was reached, + * and a negative error code on failure. + */ +int ceph_readdir_r(struct ceph_mount_info *cmount, struct ceph_dir_result *dirp, struct dirent *de); + +/** + * A safe version of ceph_readdir that also returns the file statistics (readdir+stat). + * + * @param cmount the ceph mount handle to use for performing the readdir_plus_r. + * @param dirp the directory stream pointer from an opendir holding the state of the + * next entry to return. + * @param de the directory entry pointer filled in with the next directory entry of the dirp state. + * @param stx the stats of the file/directory of the entry returned + * @param want mask showing desired inode attrs for returned entry + * @param flags bitmask of flags to use when filling out attributes + * @param out optional returned Inode argument. If non-NULL, then a reference will be taken on + * the inode and the pointer set on success. + * @returns 1 if the next entry was filled in, 0 if the end of the directory stream was reached, + * and a negative error code on failure. + */ +int ceph_readdirplus_r(struct ceph_mount_info *cmount, struct ceph_dir_result *dirp, struct dirent *de, + struct ceph_statx *stx, unsigned want, unsigned flags, struct Inode **out); + +struct ceph_snapdiff_info +{ + struct ceph_mount_info* cmount; + struct ceph_dir_result* dir1; // primary dir entry to build snapdiff for. + struct ceph_dir_result* dir_aux; // aux dir entry to identify the second snapshot. + // Can point to the parent dir entry if entry-in-question + // doesn't exist in the second snapshot +}; + +/** + * Opens snapdiff stream to get snapshots delta (aka snapdiff). 
+ * + * @param cmount the ceph mount handle to use for snapdiff retrieval. + * @param root_path root path for snapshots-in-question + * @param rel_path subpath under the root to build delta for + * @param snap1 the first snapshot name + * @param snap2 the second snapshot name + * @param out resulting snapdiff stream handle to be used for snapdiff results + retrieval via ceph_readdir_snapdiff + * @returns 0 on success and negative error code otherwise + */ +int ceph_open_snapdiff(struct ceph_mount_info* cmount, + const char* root_path, + const char* rel_path, + const char* snap1, + const char* snap2, + struct ceph_snapdiff_info* out); +/** + * Get the next snapshot delta entry. + * + * @param snapdiff snapdiff stream handle opened via ceph_open_snapdiff() + * @param out the next snapdiff entry which includes directory entry and the + * entry's snapshot id - later one for emerged/existing entry or + * former snapshot id for the removed entry. + * @returns >0 on success, 0 if no more entries in the stream and negative + * error code otherwise + */ +int ceph_readdir_snapdiff(struct ceph_snapdiff_info* snapdiff, + struct ceph_snapdiff_entry_t* out); +/** + * Close snapdiff stream. + * + * @param snapdiff snapdiff stream handle opened via ceph_open_snapdiff() + * @returns 0 on success and negative error code otherwise + */ +int ceph_close_snapdiff(struct ceph_snapdiff_info* snapdiff); + +/** + * Gets multiple directory entries. + * + * @param cmount the ceph mount handle to use for performing the getdents. + * @param dirp the directory stream pointer from an opendir holding the state of the + * next entry/entries to return. + * @param name an array of struct dirent to fill in with the returned directory entries. + * @param buflen the length of the buffer, which should be the number of dirent structs * sizeof(struct dirent). 
+ * @returns the length of the buffer that was filled in, will always be multiples of sizeof(struct dirent), or a + * negative error code. If the buffer is not large enough for a single entry, -ERANGE is returned. + */ +int ceph_getdents(struct ceph_mount_info *cmount, struct ceph_dir_result *dirp, char *name, int buflen); + +/** + * Gets multiple directory names. + * + * @param cmount the ceph mount handle to use for performing the getdents. + * @param dirp the directory stream pointer from an opendir holding the state of the + * next entry/entries to return. + * @param name a buffer to fill in with directory entry names. + * @param buflen the length of the buffer that can be filled in. + * @returns the length of the buffer filled in with entry names, or a negative error code on failure. + * If the buffer isn't large enough for a single entry, -ERANGE is returned. + */ +int ceph_getdnames(struct ceph_mount_info *cmount, struct ceph_dir_result *dirp, char *name, int buflen); + +/** + * Rewind the directory stream to the beginning of the directory. + * + * @param cmount the ceph mount handle to use for performing the rewinddir. + * @param dirp the directory stream pointer to rewind. + */ +void ceph_rewinddir(struct ceph_mount_info *cmount, struct ceph_dir_result *dirp); + +/** + * Get the current position of a directory stream. + * + * @param cmount the ceph mount handle to use for performing the telldir. + * @param dirp the directory stream pointer to get the current position of. + * @returns the position of the directory stream. Note that the offsets returned + * by ceph_telldir do not have a particular order (cannot be compared with + * inequality). + */ +int64_t ceph_telldir(struct ceph_mount_info *cmount, struct ceph_dir_result *dirp); + +/** + * Move the directory stream to a position specified by the given offset. + * + * @param cmount the ceph mount handle to use for performing the seekdir. + * @param dirp the directory stream pointer to move. 
+ * @param offset the position to move the directory stream to. This offset should be + * a value returned by telldir. Note that this value does not refer to the nth + * entry in a directory, and can not be manipulated with plus or minus. + */ +void ceph_seekdir(struct ceph_mount_info *cmount, struct ceph_dir_result *dirp, int64_t offset); + +/** + * Create a directory. + * + * @param cmount the ceph mount handle to use for making the directory. + * @param path the path of the directory to create. This must be either an + * absolute path or a relative path off of the current working directory. + * @param mode the permissions the directory should have once created. + * @returns 0 on success or a negative return code on error. + */ +int ceph_mkdir(struct ceph_mount_info *cmount, const char *path, mode_t mode); + +/** + * Create a directory relative to a file descriptor + * + * @param cmount the ceph mount handle to use for making the directory. + * @param dirfd open file descriptor for a directory (or CEPHFS_AT_FDCWD) + * @param relpath the path of the directory to create. + * @param mode the permissions the directory should have once created. + * @returns 0 on success or a negative return code on error. + */ +int ceph_mkdirat(struct ceph_mount_info *cmount, int dirfd, const char *relpath, mode_t mode); + +/** + * Create a snapshot + * + * @param cmount the ceph mount handle to use for making the directory. + * @param path the path of the directory to create snapshot. This must be either an + * absolute path or a relative path off of the current working directory. + * @param name snapshot name + * @param mode the permissions the directory should have once created. + * @param snap_metadata array of snap metadata structs + * @param nr_snap_metadata number of snap metadata struct entries + * @returns 0 on success or a negative return code on error. 
+ */ +int ceph_mksnap(struct ceph_mount_info *cmount, const char *path, const char *name, + mode_t mode, struct snap_metadata *snap_metadata, size_t nr_snap_metadata); + +/** + * Remove a snapshot + * + * @param cmount the ceph mount handle to use for removing the snapshot. + * @param path the path of the directory the snapshot belongs to. This must be either an + * absolute path or a relative path off of the current working directory. + * @param name snapshot name + * @returns 0 on success or a negative return code on error. + */ +int ceph_rmsnap(struct ceph_mount_info *cmount, const char *path, const char *name); + +/** + * Create multiple directories at once. + * + * @param cmount the ceph mount handle to use for making the directories. + * @param path the full path of directories and sub-directories that should + * be created. + * @param mode the permissions the directory should have once created. + * @returns 0 on success or a negative return code on error. + */ +int ceph_mkdirs(struct ceph_mount_info *cmount, const char *path, mode_t mode); + +/** + * Remove a directory. + * + * @param cmount the ceph mount handle to use for removing directories. + * @param path the path of the directory to remove. + * @returns 0 on success or a negative return code on error. + */ +int ceph_rmdir(struct ceph_mount_info *cmount, const char *path); + +/** @} dir */ + +/** + * @defgroup libcephfs_h_links Links and Link Handling. + * Functions for creating and manipulating hard links and symbolic links. + * + * @{ + */ + +/** + * Create a link. + * + * @param cmount the ceph mount handle to use for creating the link. + * @param existing the path to the existing file/directory to link to. + * @param newname the path to the new file/directory to link from. + * @returns 0 on success or a negative return code on error. + */ +int ceph_link(struct ceph_mount_info *cmount, const char *existing, const char *newname); + +/** + * Read a symbolic link. 
+ * + * @param cmount the ceph mount handle to use for creating the link. + * @param path the path to the symlink to read + * @param buf the buffer to hold the path of the file that the symlink points to. + * @param size the length of the buffer + * @returns number of bytes copied on success or negative error code on failure + */ +int ceph_readlink(struct ceph_mount_info *cmount, const char *path, char *buf, int64_t size); + +/** + * Read a symbolic link relative to a file descriptor + * + * @param cmount the ceph mount handle to use for creating the link. + * @param dirfd open file descriptor (or CEPHFS_AT_FDCWD) + * @param relpath the path to the symlink to read + * @param buf the buffer to hold the path of the file that the symlink points to. + * @param size the length of the buffer + * @returns number of bytes copied on success or negative error code on failure + */ +int ceph_readlinkat(struct ceph_mount_info *cmount, int dirfd, const char *relpath, char *buf, + int64_t size); + +/** + * Creates a symbolic link. + * + * @param cmount the ceph mount handle to use for creating the symbolic link. + * @param existing the path to the existing file/directory to link to. + * @param newname the path to the new file/directory to link from. + * @returns 0 on success or a negative return code on failure. + */ +int ceph_symlink(struct ceph_mount_info *cmount, const char *existing, const char *newname); + +/** + * Creates a symbolic link relative to a file descriptor + * + * @param cmount the ceph mount handle to use for creating the symbolic link. + * @param dirfd open file descriptor (or CEPHFS_AT_FDCWD) + * @param existing the path to the existing file/directory to link to. + * @param newname the path to the new file/directory to link from. + * @returns 0 on success or a negative return code on failure. 
+ */ +int ceph_symlinkat(struct ceph_mount_info *cmount, const char *existing, int dirfd, + const char *newname); + +/** @} links */ + +/** + * @defgroup libcephfs_h_files File manipulation and handling. + * Functions for creating and manipulating files. + * + * @{ + */ + + +/** + * Checks if deleting a file, link or directory is allowed. + * + * @param cmount the ceph mount handle to use. + * @param path the path of the file, link or directory. + * @returns 0 on success or negative error code on failure. + */ +int ceph_may_delete(struct ceph_mount_info *cmount, const char *path); + +/** + * Removes a file, link, or symbolic link. If the file/link has multiple links to it, the + * file will not disappear from the namespace until all references to it are removed. + * + * @param cmount the ceph mount handle to use for performing the unlink. + * @param path the path of the file or link to unlink. + * @returns 0 on success or negative error code on failure. + */ +int ceph_unlink(struct ceph_mount_info *cmount, const char *path); + +/** + * Removes a file, link, or symbolic link relative to a file descriptor. + * If the file/link has multiple links to it, the file will not + * disappear from the namespace until all references to it are removed. + * + * @param cmount the ceph mount handle to use for performing the unlink. + * @param dirfd open file descriptor (or CEPHFS_AT_FDCWD) + * @param relpath the path of the file or link to unlink. + * @param flags bitfield that can be used to set AT_* modifier flags (only AT_REMOVEDIR) + * @returns 0 on success or negative error code on failure. + */ +int ceph_unlinkat(struct ceph_mount_info *cmount, int dirfd, const char *relpath, int flags); + +/** + * Rename a file or directory. + * + * @param cmount the ceph mount handle to use for performing the rename. + * @param from the path to the existing file or directory. + * @param to the new name of the file or directory + * @returns 0 on success or negative error code on failure. 
+ */
+int ceph_rename(struct ceph_mount_info *cmount, const char *from, const char *to);
+
+/**
+ * Get an open file's extended statistics and attributes.
+ *
+ * @param cmount the ceph mount handle to use for performing the stat.
+ * @param fd the file descriptor of the file to get statistics of.
+ * @param stx the ceph_statx struct that will be filled in with the file's statistics.
+ * @param want bitfield of CEPH_STATX_* flags showing desired attributes
+ * @param flags bitfield that can be used to set AT_* modifier flags (AT_STATX_SYNC_AS_STAT, AT_STATX_FORCE_SYNC, AT_STATX_DONT_SYNC and AT_SYMLINK_NOFOLLOW)
+ * @returns 0 on success or negative error code on failure.
+ */
+int ceph_fstatx(struct ceph_mount_info *cmount, int fd, struct ceph_statx *stx,
+		unsigned int want, unsigned int flags);
+
+/**
+ * Get attributes of a file relative to a file descriptor
+ *
+ * @param cmount the ceph mount handle to use for performing the stat.
+ * @param dirfd open file descriptor (or CEPHFS_AT_FDCWD)
+ * @param relpath the relative path to the file/directory to get statistics of
+ * @param stx the ceph_statx struct that will be filled in with the file's statistics.
+ * @param want bitfield of CEPH_STATX_* flags showing desired attributes
+ * @param flags bitfield that can be used to set AT_* modifier flags (AT_STATX_SYNC_AS_STAT, AT_STATX_FORCE_SYNC, AT_STATX_DONT_SYNC and AT_SYMLINK_NOFOLLOW)
+ * @returns 0 on success or negative error code on failure.
+ */
+int ceph_statxat(struct ceph_mount_info *cmount, int dirfd, const char *relpath,
+                 struct ceph_statx *stx, unsigned int want, unsigned int flags);
+
+/**
+ * Get a file's extended statistics and attributes.
+ *
+ * @param cmount the ceph mount handle to use for performing the stat.
+ * @param path the file or directory to get the statistics of.
+ * @param stx the ceph_statx struct that will be filled in with the file's statistics.
+ * @param want bitfield of CEPH_STATX_* flags showing desired attributes
+ * @param flags bitfield that can be used to set AT_* modifier flags (AT_STATX_SYNC_AS_STAT, AT_STATX_FORCE_SYNC, AT_STATX_DONT_SYNC and AT_SYMLINK_NOFOLLOW)
+ * @returns 0 on success or negative error code on failure.
+ */
+int ceph_statx(struct ceph_mount_info *cmount, const char *path, struct ceph_statx *stx,
+	       unsigned int want, unsigned int flags);
+
+/**
+ * Get a file's statistics and attributes.
+ *
+ * ceph_stat() is deprecated, use ceph_statx() instead.
+ *
+ * @param cmount the ceph mount handle to use for performing the stat.
+ * @param path the file or directory to get the statistics of.
+ * @param stbuf the stat struct that will be filled in with the file's statistics.
+ * @returns 0 on success or negative error code on failure.
+ */
+int ceph_stat(struct ceph_mount_info *cmount, const char *path, struct stat *stbuf)
+  LIBCEPHFS_DEPRECATED;
+
+/**
+ * Get a file's statistics and attributes, without following symlinks.
+ *
+ * ceph_lstat() is deprecated, use ceph_statx(.., AT_SYMLINK_NOFOLLOW) instead.
+ *
+ * @param cmount the ceph mount handle to use for performing the stat.
+ * @param path the file or directory to get the statistics of.
+ * @param stbuf the stat struct that will be filled in with the file's statistics.
+ * @returns 0 on success or negative error code on failure.
+ */
+int ceph_lstat(struct ceph_mount_info *cmount, const char *path, struct stat *stbuf)
+  LIBCEPHFS_DEPRECATED;
+
+/**
+ * Get the open file's statistics.
+ *
+ * ceph_fstat() is deprecated, use ceph_fstatx() instead.
+ *
+ * @param cmount the ceph mount handle to use for performing the fstat.
+ * @param fd the file descriptor of the file to get statistics of.
+ * @param stbuf the stat struct of the file's statistics, filled in by the
+ *    function.
+ * @returns 0 on success or a negative error code on failure + */ +int ceph_fstat(struct ceph_mount_info *cmount, int fd, struct stat *stbuf) + LIBCEPHFS_DEPRECATED; + +/** + * Set a file's attributes. + * + * @param cmount the ceph mount handle to use for performing the setattr. + * @param relpath the path to the file/directory to set the attributes of. + * @param stx the statx struct that must include attribute values to set on the file. + * @param mask a mask of all the CEPH_SETATTR_* values that have been set in the statx struct. + * @param flags mask of AT_* flags (only AT_ATTR_NOFOLLOW is respected for now) + * @returns 0 on success or negative error code on failure. + */ +int ceph_setattrx(struct ceph_mount_info *cmount, const char *relpath, struct ceph_statx *stx, int mask, int flags); + +/** + * Set a file's attributes (extended version). + * + * @param cmount the ceph mount handle to use for performing the setattr. + * @param fd the fd of the open file/directory to set the attributes of. + * @param stx the statx struct that must include attribute values to set on the file. + * @param mask a mask of all the stat values that have been set on the stat struct. + * @returns 0 on success or negative error code on failure. + */ +int ceph_fsetattrx(struct ceph_mount_info *cmount, int fd, struct ceph_statx *stx, int mask); + +/** + * Change the mode bits (permissions) of a file/directory. + * + * @param cmount the ceph mount handle to use for performing the chmod. + * @param path the path to the file/directory to change the mode bits on. + * @param mode the new permissions to set. + * @returns 0 on success or a negative error code on failure. + */ +int ceph_chmod(struct ceph_mount_info *cmount, const char *path, mode_t mode); + +/** + * Change the mode bits (permissions) of a file/directory. If the path is a + * symbolic link, it's not de-referenced. + * + * @param cmount the ceph mount handle to use for performing the chmod. 
+ * @param path the path of file/directory to change the mode bits on.
+ * @param mode the new permissions to set.
+ * @returns 0 on success or a negative error code on failure.
+ */
+int ceph_lchmod(struct ceph_mount_info *cmount, const char *path, mode_t mode);
+
+/**
+ * Change the mode bits (permissions) of an open file.
+ *
+ * @param cmount the ceph mount handle to use for performing the chmod.
+ * @param fd the open file descriptor to change the mode bits on.
+ * @param mode the new permissions to set.
+ * @returns 0 on success or a negative error code on failure.
+ */
+int ceph_fchmod(struct ceph_mount_info *cmount, int fd, mode_t mode);
+
+/**
+ * Change the mode bits (permissions) of a file relative to a file descriptor.
+ *
+ * @param cmount the ceph mount handle to use for performing the chmod.
+ * @param dirfd open file descriptor (or CEPHFS_AT_FDCWD)
+ * @param relpath the relpath of the file/directory to change the mode bits of.
+ * @param mode the new permissions to set.
+ * @param flags bitfield that can be used to set AT_* modifier flags (AT_SYMLINK_NOFOLLOW)
+ * @returns 0 on success or negative error code on failure.
+ */
+int ceph_chmodat(struct ceph_mount_info *cmount, int dirfd, const char *relpath,
+                 mode_t mode, int flags);
+
+/**
+ * Change the ownership of a file/directory.
+ *
+ * @param cmount the ceph mount handle to use for performing the chown.
+ * @param path the path of the file/directory to change the ownership of.
+ * @param uid the user id to set on the file/directory.
+ * @param gid the group id to set on the file/directory.
+ * @returns 0 on success or negative error code on failure.
+ */
+int ceph_chown(struct ceph_mount_info *cmount, const char *path, int uid, int gid);
+
+/**
+ * Change the ownership of a file from an open file descriptor.
+ *
+ * @param cmount the ceph mount handle to use for performing the chown.
+ * @param fd the fd of the open file/directory to change the ownership of.
+ * @param uid the user id to set on the file/directory.
+ * @param gid the group id to set on the file/directory.
+ * @returns 0 on success or negative error code on failure.
+ */
+int ceph_fchown(struct ceph_mount_info *cmount, int fd, int uid, int gid);
+
+/**
+ * Change the ownership of a file/directory, don't follow symlinks.
+ *
+ * @param cmount the ceph mount handle to use for performing the chown.
+ * @param path the path of the file/directory to change the ownership of.
+ * @param uid the user id to set on the file/directory.
+ * @param gid the group id to set on the file/directory.
+ * @returns 0 on success or negative error code on failure.
+ */
+int ceph_lchown(struct ceph_mount_info *cmount, const char *path, int uid, int gid);
+
+/**
+ * Change the ownership of a file/directory relative to a file descriptor.
+ *
+ * @param cmount the ceph mount handle to use for performing the chown.
+ * @param dirfd open file descriptor (or CEPHFS_AT_FDCWD)
+ * @param relpath the relpath of the file/directory to change the ownership of.
+ * @param uid the user id to set on the file/directory.
+ * @param gid the group id to set on the file/directory.
+ * @param flags bitfield that can be used to set AT_* modifier flags (AT_SYMLINK_NOFOLLOW)
+ * @returns 0 on success or negative error code on failure.
+ */
+int ceph_chownat(struct ceph_mount_info *cmount, int dirfd, const char *relpath,
+                 uid_t uid, gid_t gid, int flags);
+
+/**
+ * Change file/directory last access and modification times.
+ *
+ * @param cmount the ceph mount handle to use for performing the utime.
+ * @param path the path to the file/directory to set the time values of.
+ * @param buf holding the access and modification times to set on the file.
+ * @returns 0 on success or negative error code on failure.
+ */
+int ceph_utime(struct ceph_mount_info *cmount, const char *path, struct utimbuf *buf);
+
+/**
+ * Change file/directory last access and modification times.
+ * + * @param cmount the ceph mount handle to use for performing the utime. + * @param fd the fd of the open file/directory to set the time values of. + * @param buf holding the access and modification times to set on the file. + * @returns 0 on success or negative error code on failure. + */ +int ceph_futime(struct ceph_mount_info *cmount, int fd, struct utimbuf *buf); + +/** + * Change file/directory last access and modification times. + * + * @param cmount the ceph mount handle to use for performing the utime. + * @param path the path to the file/directory to set the time values of. + * @param times holding the access and modification times to set on the file. + * @returns 0 on success or negative error code on failure. + */ +int ceph_utimes(struct ceph_mount_info *cmount, const char *path, struct timeval times[2]); + +/** + * Change file/directory last access and modification times, don't follow symlinks. + * + * @param cmount the ceph mount handle to use for performing the utime. + * @param path the path to the file/directory to set the time values of. + * @param times holding the access and modification times to set on the file. + * @returns 0 on success or negative error code on failure. + */ +int ceph_lutimes(struct ceph_mount_info *cmount, const char *path, struct timeval times[2]); + +/** + * Change file/directory last access and modification times. + * + * @param cmount the ceph mount handle to use for performing the utime. + * @param fd the fd of the open file/directory to set the time values of. + * @param times holding the access and modification times to set on the file. + * @returns 0 on success or negative error code on failure. + */ +int ceph_futimes(struct ceph_mount_info *cmount, int fd, struct timeval times[2]); + +/** + * Change file/directory last access and modification times. + * + * @param cmount the ceph mount handle to use for performing the utime. + * @param fd the fd of the open file/directory to set the time values of. 
+ * @param times holding the access and modification times to set on the file.
+ * @returns 0 on success or negative error code on failure.
+ */
+int ceph_futimens(struct ceph_mount_info *cmount, int fd, struct timespec times[2]);
+
+/**
+ * Change file/directory last access and modification times relative
+ * to a file descriptor.
+ *
+ * @param cmount the ceph mount handle to use for performing the utime.
+ * @param dirfd open file descriptor (or CEPHFS_AT_FDCWD)
+ * @param relpath the relpath of the file/directory to set the time values of.
+ * @param times holding the access and modification times to set on the file.
+ * @param flags bitfield that can be used to set AT_* modifier flags (AT_SYMLINK_NOFOLLOW)
+ * @returns 0 on success or negative error code on failure.
+ */
+int ceph_utimensat(struct ceph_mount_info *cmount, int dirfd, const char *relpath,
+                   struct timespec times[2], int flags);
+
+/**
+ * Apply or remove an advisory lock.
+ *
+ * @param cmount the ceph mount handle to use for performing the lock.
+ * @param fd the open file descriptor to change advisory lock.
+ * @param operation the advisory lock operation to be performed on the file
+ * descriptor among LOCK_SH (shared lock), LOCK_EX (exclusive lock),
+ * or LOCK_UN (remove lock). The LOCK_NB value can be ORed to perform a
+ * non-blocking operation.
+ * @param owner the user-supplied owner identifier (an arbitrary integer)
+ * @returns 0 on success or negative error code on failure.
+ */
+int ceph_flock(struct ceph_mount_info *cmount, int fd, int operation,
+	       uint64_t owner);
+
+/**
+ * Truncate the file to the given size. If this operation causes the
+ * file to expand, the empty bytes will be filled in with zeros.
+ *
+ * @param cmount the ceph mount handle to use for performing the truncate.
+ * @param path the path to the file to truncate.
+ * @param size the new size of the file.
+ * @returns 0 on success or a negative error code on failure.
+ */
+int ceph_truncate(struct ceph_mount_info *cmount, const char *path, int64_t size);
+
+/**
+ * Make a regular file or a block, character or fifo special file.
+ *
+ * @param cmount the ceph mount handle to use for performing the mknod.
+ * @param path the path to the special file.
+ * @param mode the permissions to use and the type of special file. The type can be
+ *        one of S_IFREG, S_IFCHR, S_IFBLK, S_IFIFO.
+ * @param rdev If the file type is S_IFCHR or S_IFBLK then this parameter specifies the
+ *        major and minor numbers of the newly created device special file. Otherwise,
+ *        it is ignored.
+ * @returns 0 on success or negative error code on failure.
+ */
+int ceph_mknod(struct ceph_mount_info *cmount, const char *path, mode_t mode, dev_t rdev);
+/**
+ * Create and/or open a file.
+ *
+ * @param cmount the ceph mount handle to use for performing the open.
+ * @param path the path of the file to open. If the flags parameter includes O_CREAT,
+ *        the file will first be created before opening.
+ * @param flags a set of option masks that control how the file is created/opened.
+ * @param mode the permissions to place on the file if the file does not exist and O_CREAT
+ *        is specified in the flags.
+ * @returns a non-negative file descriptor number on success or a negative error code on failure.
+ */
+int ceph_open(struct ceph_mount_info *cmount, const char *path, int flags, mode_t mode);
+
+/**
+ * Create and/or open a file relative to a directory
+ *
+ * @param cmount the ceph mount handle to use for performing the open.
+ * @param dirfd open file descriptor (or CEPHFS_AT_FDCWD)
+ * @param relpath the path of the file to open. If the flags parameter includes O_CREAT,
+ *        the file will first be created before opening.
+ * @param flags a set of option masks that control how the file is created/opened.
+ * @param mode the permissions to place on the file if the file does not exist and O_CREAT
+ *        is specified in the flags.
+ * @returns a non-negative file descriptor number on success or a negative error code on failure.
+ */
+int ceph_openat(struct ceph_mount_info *cmount, int dirfd, const char *relpath, int flags, mode_t mode);
+
+/**
+ * Create and/or open a file with a specific file layout.
+ *
+ * @param cmount the ceph mount handle to use for performing the open.
+ * @param path the path of the file to open. If the flags parameter includes O_CREAT,
+ *        the file will first be created before opening.
+ * @param flags a set of option masks that control how the file is created/opened.
+ * @param mode the permissions to place on the file if the file does not exist and O_CREAT
+ *        is specified in the flags.
+ * @param stripe_unit the stripe unit size (optional, 0 for default)
+ * @param stripe_count the stripe count (optional, 0 for default)
+ * @param object_size the object size (optional, 0 for default)
+ * @param data_pool name of target data pool name (optional, NULL or empty string for default)
+ * @returns a non-negative file descriptor number on success or a negative error code on failure.
+ */
+int ceph_open_layout(struct ceph_mount_info *cmount, const char *path, int flags,
+		     mode_t mode, int stripe_unit, int stripe_count, int object_size,
+		     const char *data_pool);
+
+/**
+ * Close the open file.
+ *
+ * @param cmount the ceph mount handle to use for performing the close.
+ * @param fd the file descriptor referring to the open file.
+ * @returns 0 on success or a negative error code on failure.
+ */
+int ceph_close(struct ceph_mount_info *cmount, int fd);
+
+/**
+ * Reposition the open file stream based on the given offset.
+ *
+ * @param cmount the ceph mount handle to use for performing the lseek.
+ * @param fd the open file descriptor referring to the open file and holding the
+ *        current position of the stream.
+ * @param offset the offset to set the stream to
+ * @param whence the flag to indicate what type of seeking to perform:
+ *	SEEK_SET: the offset is set to the given offset in the file.
+ *      SEEK_CUR: the offset is set to the current location plus @e offset bytes.
+ *      SEEK_END: the offset is set to the end of the file plus @e offset bytes.
+ * @returns the new offset position of the stream on success or a negative error code on failure.
+ */
+int64_t ceph_lseek(struct ceph_mount_info *cmount, int fd, int64_t offset, int whence);
+/**
+ * Read data from the file.
+ *
+ * @param cmount the ceph mount handle to use for performing the read.
+ * @param fd the file descriptor of the open file to read from.
+ * @param buf the buffer to read data into
+ * @param size the initial size of the buffer
+ * @param offset the offset in the file to read from. If this value is negative, the
+ *        function reads from the current offset of the file descriptor.
+ * @returns the number of bytes read into buf, or a negative error code on failure.
+ */
+int ceph_read(struct ceph_mount_info *cmount, int fd, char *buf, int64_t size, int64_t offset);
+
+/**
+ * Read data from the file.
+ * @param cmount the ceph mount handle to use for performing the read.
+ * @param fd the file descriptor of the open file to read from.
+ * @param iov the iov structure to read data into
+ * @param iovcnt the number of items that iov includes
+ * @param offset the offset in the file to read from. If this value is negative, the
+ *        function reads from the current offset of the file descriptor.
+ * @returns the number of bytes read into the iov buffers, or a negative error code on failure.
+ */
+int ceph_preadv(struct ceph_mount_info *cmount, int fd, const struct iovec *iov, int iovcnt,
+                int64_t offset);
+
+/**
+ * Write data to a file.
+ *
+ * @param cmount the ceph mount handle to use for performing the write.
+ * @param fd the file descriptor of the open file to write to
+ * @param buf the bytes to write to the file
+ * @param size the size of the buf array
+ * @param offset the offset of the file write into. If this value is negative, the
+ *        function writes to the current offset of the file descriptor.
+ * @returns the number of bytes written, or a negative error code
+ */
+int ceph_write(struct ceph_mount_info *cmount, int fd, const char *buf, int64_t size,
+	       int64_t offset);
+
+/**
+ * Write data to a file.
+ *
+ * @param cmount the ceph mount handle to use for performing the write.
+ * @param fd the file descriptor of the open file to write to
+ * @param iov the iov structure containing the data to write
+ * @param iovcnt the number of items that iov includes
+ * @param offset the offset of the file write into. If this value is negative, the
+ *        function writes to the current offset of the file descriptor.
+ * @returns the number of bytes written, or a negative error code
+ */
+int ceph_pwritev(struct ceph_mount_info *cmount, int fd, const struct iovec *iov, int iovcnt,
+                 int64_t offset);
+
+/**
+ * Truncate a file to the given size.
+ *
+ * @param cmount the ceph mount handle to use for performing the ftruncate.
+ * @param fd the file descriptor of the file to truncate
+ * @param size the new size of the file
+ * @returns 0 on success or a negative error code on failure.
+ */
+int ceph_ftruncate(struct ceph_mount_info *cmount, int fd, int64_t size);
+
+/**
+ * Synchronize an open file to persistent media.
+ *
+ * @param cmount the ceph mount handle to use for performing the fsync.
+ * @param fd the file descriptor of the file to sync.
+ * @param syncdataonly a boolean whether to synchronize metadata and data (0)
+ * or just data (1).
+ * @return 0 on success or a negative error code on failure.
+ */
+int ceph_fsync(struct ceph_mount_info *cmount, int fd, int syncdataonly);
+
+/**
+ * Preallocate or release disk space for the file for the byte range.
+ *
+ * @param cmount the ceph mount handle to use for performing the fallocate.
+ * @param fd the file descriptor of the file to fallocate.
+ * @param mode the flags determines the operation to be performed on the given range.
+ *        default operation (0) allocate and initialize to zero the file in the byte range,
+ *        and the file size will be changed if offset + length is greater than
+ *        the file size. if the FALLOC_FL_KEEP_SIZE flag is specified in the mode,
+ *        the file size will not be changed. if the FALLOC_FL_PUNCH_HOLE flag is
+ *        specified in the mode, the operation is deallocate space and zero the byte range.
+ * @param offset the byte range starting.
+ * @param length the length of the range.
+ * @return 0 on success or a negative error code on failure.
+ */
+int ceph_fallocate(struct ceph_mount_info *cmount, int fd, int mode,
+	           int64_t offset, int64_t length);
+
+/**
+ * Enable/disable lazyio for the file.
+ *
+ * @param cmount the ceph mount handle to use for performing the fsync.
+ * @param fd the file descriptor of the file to sync.
+ * @param enable a boolean to enable lazyio or disable lazyio.
+ * @returns 0 on success or a negative error code on failure.
+ */
+int ceph_lazyio(struct ceph_mount_info *cmount, int fd, int enable);
+
+
+/**
+ * Flushes the write buffer for the file thereby propagating the buffered write to the file.
+ *
+ * @param cmount the ceph mount handle to use for performing the flush.
+ * @param fd the file descriptor of the file to flush.
+ * @param offset the byte offset at which the range to be propagated starts.
+ * @param count the number of bytes in the range to be propagated.
+ * @returns 0 on success or a negative error code on failure.
+ */
+int ceph_lazyio_propagate(struct ceph_mount_info *cmount, int fd, int64_t offset, size_t count);
+
+
+/**
+ * Flushes the write buffer for the file and invalidate the read cache. This allows a subsequent read operation to read and cache data directly from the file and hence everyone's propagated writes would be visible.
+ *
+ * @param cmount the ceph mount handle to use for performing the synchronize.
+ * @param fd the file descriptor of the file to synchronize.
+ * @param offset the byte offset at which the range to be synchronized starts.
+ * @param count the number of bytes in the range to be synchronized.
+ * @returns 0 on success or a negative error code on failure.
+ */
+int ceph_lazyio_synchronize(struct ceph_mount_info *cmount, int fd, int64_t offset, size_t count);
+
+/** @} file */
+
+/**
+ * @defgroup libcephfs_h_xattr Extended Attribute manipulation and handling.
+ * Functions for creating and manipulating extended attributes on files.
+ *
+ * @{
+ */
+
+/**
+ * Get an extended attribute.
+ *
+ * @param cmount the ceph mount handle to use for performing the getxattr.
+ * @param path the path to the file
+ * @param name the name of the extended attribute to get
+ * @param value a pre-allocated buffer to hold the xattr's value
+ * @param size the size of the pre-allocated buffer
+ * @returns the size of the value or a negative error code on failure.
+ */
+int ceph_getxattr(struct ceph_mount_info *cmount, const char *path, const char *name,
+	void *value, size_t size);
+
+/**
+ * Get an extended attribute.
+ *
+ * @param cmount the ceph mount handle to use for performing the getxattr.
+ * @param fd the open file descriptor referring to the file to get extended attribute from.
+ * @param name the name of the extended attribute to get
+ * @param value a pre-allocated buffer to hold the xattr's value
+ * @param size the size of the pre-allocated buffer
+ * @returns the size of the value or a negative error code on failure.
+ */
+int ceph_fgetxattr(struct ceph_mount_info *cmount, int fd, const char *name,
+	void *value, size_t size);
+
+/**
+ * Get an extended attribute without following symbolic links. This function is
+ * identical to ceph_getxattr, but if the path refers to a symbolic link,
+ * we get the extended attributes of the symlink itself rather than the
+ * attributes of the file it refers to.
+ *
+ * @param cmount the ceph mount handle to use for performing the lgetxattr.
+ * @param path the path to the file + * @param name the name of the extended attribute to get + * @param value a pre-allocated buffer to hold the xattr's value + * @param size the size of the pre-allocated buffer + * @returns the size of the value or a negative error code on failure. + */ +int ceph_lgetxattr(struct ceph_mount_info *cmount, const char *path, const char *name, + void *value, size_t size); + +/** + * List the extended attribute keys on a file. + * + * @param cmount the ceph mount handle to use for performing the listxattr. + * @param path the path to the file. + * @param list a buffer to be filled in with the list of extended attributes keys. + * @param size the size of the list buffer. + * @returns the size of the resulting list filled in. + */ +int ceph_listxattr(struct ceph_mount_info *cmount, const char *path, char *list, size_t size); + +/** + * List the extended attribute keys on a file. + * + * @param cmount the ceph mount handle to use for performing the listxattr. + * @param fd the open file descriptor referring to the file to list extended attributes on. + * @param list a buffer to be filled in with the list of extended attributes keys. + * @param size the size of the list buffer. + * @returns the size of the resulting list filled in. + */ +int ceph_flistxattr(struct ceph_mount_info *cmount, int fd, char *list, size_t size); + +/** + * Get the list of extended attribute keys on a file, but do not follow symbolic links. + * + * @param cmount the ceph mount handle to use for performing the llistxattr. + * @param path the path to the file. + * @param list a buffer to be filled in with the list of extended attributes keys. + * @param size the size of the list buffer. + * @returns the size of the resulting list filled in. + */ +int ceph_llistxattr(struct ceph_mount_info *cmount, const char *path, char *list, size_t size); + +/** + * Remove an extended attribute from a file. 
+ * + * @param cmount the ceph mount handle to use for performing the removexattr. + * @param path the path to the file. + * @param name the name of the extended attribute to remove. + * @returns 0 on success or a negative error code on failure. + */ +int ceph_removexattr(struct ceph_mount_info *cmount, const char *path, const char *name); + +/** + * Remove an extended attribute from a file. + * + * @param cmount the ceph mount handle to use for performing the removexattr. + * @param fd the open file descriptor referring to the file to remove extended attribute from. + * @param name the name of the extended attribute to remove. + * @returns 0 on success or a negative error code on failure. + */ +int ceph_fremovexattr(struct ceph_mount_info *cmount, int fd, const char *name); + +/** + * Remove the extended attribute from a file, do not follow symbolic links. + * + * @param cmount the ceph mount handle to use for performing the lremovexattr. + * @param path the path to the file. + * @param name the name of the extended attribute to remove. + * @returns 0 on success or a negative error code on failure. + */ +int ceph_lremovexattr(struct ceph_mount_info *cmount, const char *path, const char *name); + +/** + * Set an extended attribute on a file. + * + * @param cmount the ceph mount handle to use for performing the setxattr. + * @param path the path to the file. + * @param name the name of the extended attribute to set. + * @param value the bytes of the extended attribute value + * @param size the size of the extended attribute value + * @param flags the flags can be: + * CEPH_XATTR_CREATE: create the extended attribute. Must not exist. + * CEPH_XATTR_REPLACE: replace the extended attribute, Must already exist. + * @returns 0 on success or a negative error code on failure. + */ +int ceph_setxattr(struct ceph_mount_info *cmount, const char *path, const char *name, + const void *value, size_t size, int flags); + +/** + * Set an extended attribute on a file. 
+ * + * @param cmount the ceph mount handle to use for performing the setxattr. + * @param fd the open file descriptor referring to the file to set extended attribute on. + * @param name the name of the extended attribute to set. + * @param value the bytes of the extended attribute value + * @param size the size of the extended attribute value + * @param flags the flags can be: + * CEPH_XATTR_CREATE: create the extended attribute. Must not exist. + * CEPH_XATTR_REPLACE: replace the extended attribute, Must already exist. + * @returns 0 on success or a negative error code on failure. + */ +int ceph_fsetxattr(struct ceph_mount_info *cmount, int fd, const char *name, + const void *value, size_t size, int flags); + +/** + * Set an extended attribute on a file, do not follow symbolic links. + * + * @param cmount the ceph mount handle to use for performing the lsetxattr. + * @param path the path to the file. + * @param name the name of the extended attribute to set. + * @param value the bytes of the extended attribute value + * @param size the size of the extended attribute value + * @param flags the flags can be: + * CEPH_XATTR_CREATE: create the extended attribute. Must not exist. + * CEPH_XATTR_REPLACE: replace the extended attribute, Must already exist. + * @returns 0 on success or a negative error code on failure. + */ +int ceph_lsetxattr(struct ceph_mount_info *cmount, const char *path, const char *name, + const void *value, size_t size, int flags); + +/** @} xattr */ + +/** + * @defgroup libcephfs_h_filelayout Control File Layout. + * Functions for setting and getting the file layout of existing files. + * + * @{ + */ + +/** + * Get the file striping unit from an open file descriptor. + * + * @param cmount the ceph mount handle to use. + * @param fh the open file descriptor referring to the file to get the striping unit of. + * @returns the striping unit of the file or a negative error code on failure. 
+ */ +int ceph_get_file_stripe_unit(struct ceph_mount_info *cmount, int fh); + +/** + * Get the file striping unit. + * + * @param cmount the ceph mount handle to use. + * @param path the path of the file/directory get the striping unit of. + * @returns the striping unit of the file or a negative error code on failure. + */ +int ceph_get_path_stripe_unit(struct ceph_mount_info *cmount, const char *path); + +/** + * Get the file striping count from an open file descriptor. + * + * @param cmount the ceph mount handle to use. + * @param fh the open file descriptor referring to the file to get the striping count of. + * @returns the striping count of the file or a negative error code on failure. + */ +int ceph_get_file_stripe_count(struct ceph_mount_info *cmount, int fh); + +/** + * Get the file striping count. + * + * @param cmount the ceph mount handle to use. + * @param path the path of the file/directory get the striping count of. + * @returns the striping count of the file or a negative error code on failure. + */ +int ceph_get_path_stripe_count(struct ceph_mount_info *cmount, const char *path); + +/** + * Get the file object size from an open file descriptor. + * + * @param cmount the ceph mount handle to use. + * @param fh the open file descriptor referring to the file to get the object size of. + * @returns the object size of the file or a negative error code on failure. + */ +int ceph_get_file_object_size(struct ceph_mount_info *cmount, int fh); + +/** + * Get the file object size. + * + * @param cmount the ceph mount handle to use. + * @param path the path of the file/directory get the object size of. + * @returns the object size of the file or a negative error code on failure. + */ +int ceph_get_path_object_size(struct ceph_mount_info *cmount, const char *path); + +/** + * Get the file pool information from an open file descriptor. + * + * @param cmount the ceph mount handle to use. 
+ * @param fh the open file descriptor referring to the file to get the pool information of.
+ * @returns the ceph pool id that the file is in
+ */
+int ceph_get_file_pool(struct ceph_mount_info *cmount, int fh);
+
+/**
+ * Get the file pool information.
+ *
+ * @param cmount the ceph mount handle to use.
+ * @param path the path of the file/directory to get the pool information of.
+ * @returns the ceph pool id that the file is in
+ */
+int ceph_get_path_pool(struct ceph_mount_info *cmount, const char *path);
+
+/**
+ * Get the name of the pool an open file is stored in.
+ *
+ * Write the name of the file's pool to the buffer. If buflen is 0, return
+ * a suggested length for the buffer.
+ *
+ * @param cmount the ceph mount handle to use.
+ * @param fh the open file descriptor referring to the file
+ * @param buf buffer to store the name in
+ * @param buflen size of the buffer
+ * @returns length in bytes of the pool name, or -ERANGE if the buffer is not large enough.
+ */
+int ceph_get_file_pool_name(struct ceph_mount_info *cmount, int fh, char *buf, size_t buflen);
+
+/**
+ * Get the name of a pool by id.
+ *
+ * Given a pool's numeric identifier, get the pool's alphanumeric name.
+ *
+ * @param cmount the ceph mount handle to use
+ * @param pool the numeric pool id
+ * @param buf buffer to store the name in
+ * @param buflen size of the buffer
+ * @returns length in bytes of the pool name, or -ERANGE if the buffer is not large enough
+ */
+int ceph_get_pool_name(struct ceph_mount_info *cmount, int pool, char *buf, size_t buflen);
+
+/**
+ * Get the name of the pool a file is stored in
+ *
+ * Write the name of the file's pool to the buffer. If buflen is 0, return
+ * a suggested length for the buffer.
+ *
+ * @param cmount the ceph mount handle to use.
+ * @param path the path of the file/directory + * @param buf buffer to store the name in + * @param buflen size of the buffer + * @returns length in bytes of the pool name, or -ERANGE if the buffer is not large enough. + */ +int ceph_get_path_pool_name(struct ceph_mount_info *cmount, const char *path, char *buf, size_t buflen); + +/** + * Get the default pool name of cephfs + * Write the name of the default pool to the buffer. If buflen is 0, return + * a suggested length for the buffer. + * @param cmount the ceph mount handle to use. + * @param buf buffer to store the name in + * @param buflen size of the buffer + * @returns length in bytes of the pool name, or -ERANGE if the buffer is not large enough. + */ +int ceph_get_default_data_pool_name(struct ceph_mount_info *cmount, char *buf, size_t buflen); + +/** + * Get the file layout from an open file descriptor. + * + * @param cmount the ceph mount handle to use. + * @param fh the open file descriptor referring to the file to get the layout of. + * @param stripe_unit where to store the striping unit of the file + * @param stripe_count where to store the striping count of the file + * @param object_size where to store the object size of the file + * @param pg_pool where to store the ceph pool id that the file is in + * @returns 0 on success or a negative error code on failure. + */ +int ceph_get_file_layout(struct ceph_mount_info *cmount, int fh, int *stripe_unit, int *stripe_count, int *object_size, int *pg_pool); + +/** + * Get the file layout. + * + * @param cmount the ceph mount handle to use. + * @param path the path of the file/directory get the layout of. + * @param stripe_unit where to store the striping unit of the file + * @param stripe_count where to store the striping count of the file + * @param object_size where to store the object size of the file + * @param pg_pool where to store the ceph pool id that the file is in + * @returns 0 on success or a negative error code on failure. 
+ */ +int ceph_get_path_layout(struct ceph_mount_info *cmount, const char *path, int *stripe_unit, int *stripe_count, int *object_size, int *pg_pool); + +/** + * Get the file replication information from an open file descriptor. + * + * @param cmount the ceph mount handle to use. + * @param fh the open file descriptor referring to the file to get the replication information of. + * @returns the replication factor of the file. + */ +int ceph_get_file_replication(struct ceph_mount_info *cmount, int fh); + +/** + * Get the file replication information. + * + * @param cmount the ceph mount handle to use. + * @param path the path of the file/directory get the replication information of. + * @returns the replication factor of the file. + */ +int ceph_get_path_replication(struct ceph_mount_info *cmount, const char *path); + +/** + * Get the id of the named pool. + * + * @param cmount the ceph mount handle to use. + * @param pool_name the name of the pool. + * @returns the pool id, or a negative error code on failure. + */ +int ceph_get_pool_id(struct ceph_mount_info *cmount, const char *pool_name); + +/** + * Get the pool replication factor. + * + * @param cmount the ceph mount handle to use. + * @param pool_id the pool id to look up + * @returns the replication factor, or a negative error code on failure. + */ +int ceph_get_pool_replication(struct ceph_mount_info *cmount, int pool_id); + +/** + * Get the OSD address where the primary copy of a file stripe is located. + * + * @param cmount the ceph mount handle to use. + * @param fd the open file descriptor referring to the file to get the striping unit of. + * @param offset the offset into the file to specify the stripe. The offset can be + * anywhere within the stripe unit. + * @param addr the address of the OSD holding that stripe + * @param naddr the capacity of the address passed in. + * @returns the size of the addressed filled into the @e addr parameter, or a negative + * error code on failure. 
+ */
+int ceph_get_file_stripe_address(struct ceph_mount_info *cmount, int fd, int64_t offset,
+				 struct sockaddr_storage *addr, int naddr);
+
+/**
+ * Get the list of OSDs where the objects containing a file offset are located.
+ *
+ * @param cmount the ceph mount handle to use.
+ * @param fd the open file descriptor referring to the file.
+ * @param offset the offset within the file.
+ * @param length return the number of bytes between the offset and the end of
+ * the stripe unit (optional).
+ * @param osds an integer array to hold the OSD ids.
+ * @param nosds the size of the integer array.
+ * @returns the number of items stored in the output array, or -ERANGE if the
+ * array is not large enough.
+ */
+int ceph_get_file_extent_osds(struct ceph_mount_info *cmount, int fd,
+                              int64_t offset, int64_t *length, int *osds, int nosds);
+
+/**
+ * Get the fully qualified CRUSH location of an OSD.
+ *
+ * Returns (type, name) string pairs for each device in the CRUSH bucket
+ * hierarchy starting from the given osd to the root. Each pair element is
+ * separated by a NULL character.
+ *
+ * @param cmount the ceph mount handle to use.
+ * @param osd the OSD id.
+ * @param path buffer to store location.
+ * @param len size of buffer.
+ * @returns the number of bytes written into the buffer, or -ERANGE if the
+ * buffer is not large enough.
+ */
+int ceph_get_osd_crush_location(struct ceph_mount_info *cmount,
+                                int osd, char *path, size_t len);
+
+/**
+ * Get the network address of an OSD.
+ *
+ * @param cmount the ceph mount handle.
+ * @param osd the OSD id.
+ * @param addr the OSD network address.
+ * @returns zero on success, otherwise returns a negative error code.
+ */
+int ceph_get_osd_addr(struct ceph_mount_info *cmount, int osd,
+                      struct sockaddr_storage *addr);
+
+/**
+ * Get the file layout stripe unit granularity.
+ * @param cmount the ceph mount handle.
+ * @returns the stripe unit granularity or a negative error code on failure.
+ */ +int ceph_get_stripe_unit_granularity(struct ceph_mount_info *cmount); + +/** @} filelayout */ + +/** + * No longer available. Do not use. + * These functions will return -EOPNOTSUPP. + */ +int ceph_set_default_file_stripe_unit(struct ceph_mount_info *cmount, int stripe); +int ceph_set_default_file_stripe_count(struct ceph_mount_info *cmount, int count); +int ceph_set_default_object_size(struct ceph_mount_info *cmount, int size); +int ceph_set_default_preferred_pg(struct ceph_mount_info *cmount, int osd); +int ceph_set_default_file_replication(struct ceph_mount_info *cmount, int replication); + +/** + * Read from local replicas when possible. + * + * @param cmount the ceph mount handle to use. + * @param val a boolean to set (1) or clear (0) the option to favor local objects + * for reads. + * @returns 0 + */ +int ceph_localize_reads(struct ceph_mount_info *cmount, int val); + +/** + * Get the osd id of the local osd (if any) + * + * @param cmount the ceph mount handle to use. + * @returns the osd (if any) local to the node where this call is made, otherwise + * -1 is returned. + */ +int ceph_get_local_osd(struct ceph_mount_info *cmount); + +/** @} default_filelayout */ + +/** + * Get the capabilities currently issued to the client. + * + * @param cmount the ceph mount handle to use. + * @param fd the file descriptor to get issued + * @returns the current capabilities issued to this client + * for the open file + */ +int ceph_debug_get_fd_caps(struct ceph_mount_info *cmount, int fd); + +/** + * Get the capabilities currently issued to the client. + * + * @param cmount the ceph mount handle to use. 
+ * @param path the path to the file + * @returns the current capabilities issued to this client + * for the file + */ +int ceph_debug_get_file_caps(struct ceph_mount_info *cmount, const char *path); + +/* Low Level */ +struct Inode *ceph_ll_get_inode(struct ceph_mount_info *cmount, + vinodeno_t vino); + +int ceph_ll_lookup_vino(struct ceph_mount_info *cmount, vinodeno_t vino, + Inode **inode); + +int ceph_ll_lookup_inode( + struct ceph_mount_info *cmount, + struct inodeno_t ino, + Inode **inode); + +/** + * Get the root inode of FS. Increase counter of references for root Inode. You must call ceph_ll_forget for it! + * + * @param cmount the ceph mount handle to use. + * @param parent pointer to pointer to Inode struct. Pointer to root inode will be returned + * @returns 0 if all good + */ +int ceph_ll_lookup_root(struct ceph_mount_info *cmount, + Inode **parent); +int ceph_ll_lookup(struct ceph_mount_info *cmount, Inode *parent, + const char *name, Inode **out, struct ceph_statx *stx, + unsigned want, unsigned flags, const UserPerm *perms); +int ceph_ll_put(struct ceph_mount_info *cmount, struct Inode *in); +int ceph_ll_forget(struct ceph_mount_info *cmount, struct Inode *in, + int count); +int ceph_ll_walk(struct ceph_mount_info *cmount, const char* name, Inode **i, + struct ceph_statx *stx, unsigned int want, unsigned int flags, + const UserPerm *perms); +int ceph_ll_getattr(struct ceph_mount_info *cmount, struct Inode *in, + struct ceph_statx *stx, unsigned int want, unsigned int flags, + const UserPerm *perms); +int ceph_ll_setattr(struct ceph_mount_info *cmount, struct Inode *in, + struct ceph_statx *stx, int mask, const UserPerm *perms); +int ceph_ll_open(struct ceph_mount_info *cmount, struct Inode *in, int flags, + struct Fh **fh, const UserPerm *perms); +off_t ceph_ll_lseek(struct ceph_mount_info *cmount, struct Fh* filehandle, + off_t offset, int whence); +int ceph_ll_read(struct ceph_mount_info *cmount, struct Fh* filehandle, + int64_t off, uint64_t 
len, char* buf); +int ceph_ll_fsync(struct ceph_mount_info *cmount, struct Fh *fh, + int syncdataonly); +int ceph_ll_sync_inode(struct ceph_mount_info *cmount, struct Inode *in, + int syncdataonly); +int ceph_ll_fallocate(struct ceph_mount_info *cmount, struct Fh *fh, + int mode, int64_t offset, int64_t length); +int ceph_ll_write(struct ceph_mount_info *cmount, struct Fh* filehandle, + int64_t off, uint64_t len, const char *data); +int64_t ceph_ll_readv(struct ceph_mount_info *cmount, struct Fh *fh, + const struct iovec *iov, int iovcnt, int64_t off); +int64_t ceph_ll_writev(struct ceph_mount_info *cmount, struct Fh *fh, + const struct iovec *iov, int iovcnt, int64_t off); +int ceph_ll_close(struct ceph_mount_info *cmount, struct Fh* filehandle); +int ceph_ll_iclose(struct ceph_mount_info *cmount, struct Inode *in, int mode); +/** + * Get xattr value by xattr name. + * + * @param cmount the ceph mount handle to use. + * @param in file handle + * @param name name of attribute + * @param value pointer to begin buffer + * @param size buffer size + * @param perms pointer to UserPerms object + * @returns size of returned buffer. 
Negative number in error case + */ +int ceph_ll_getxattr(struct ceph_mount_info *cmount, struct Inode *in, + const char *name, void *value, size_t size, + const UserPerm *perms); +int ceph_ll_setxattr(struct ceph_mount_info *cmount, struct Inode *in, + const char *name, const void *value, size_t size, + int flags, const UserPerm *perms); +int ceph_ll_listxattr(struct ceph_mount_info *cmount, struct Inode *in, + char *list, size_t buf_size, size_t *list_size, + const UserPerm *perms); +int ceph_ll_removexattr(struct ceph_mount_info *cmount, struct Inode *in, + const char *name, const UserPerm *perms); +int ceph_ll_create(struct ceph_mount_info *cmount, Inode *parent, + const char *name, mode_t mode, int oflags, Inode **outp, + Fh **fhp, struct ceph_statx *stx, unsigned want, + unsigned lflags, const UserPerm *perms); +int ceph_ll_mknod(struct ceph_mount_info *cmount, Inode *parent, + const char *name, mode_t mode, dev_t rdev, Inode **out, + struct ceph_statx *stx, unsigned want, unsigned flags, + const UserPerm *perms); +int ceph_ll_mkdir(struct ceph_mount_info *cmount, Inode *parent, + const char *name, mode_t mode, Inode **out, + struct ceph_statx *stx, unsigned want, + unsigned flags, const UserPerm *perms); +int ceph_ll_link(struct ceph_mount_info *cmount, struct Inode *in, + struct Inode *newparent, const char *name, + const UserPerm *perms); +int ceph_ll_opendir(struct ceph_mount_info *cmount, struct Inode *in, + struct ceph_dir_result **dirpp, const UserPerm *perms); +int ceph_ll_releasedir(struct ceph_mount_info *cmount, + struct ceph_dir_result* dir); +int ceph_ll_rename(struct ceph_mount_info *cmount, struct Inode *parent, + const char *name, struct Inode *newparent, + const char *newname, const UserPerm *perms); +int ceph_ll_unlink(struct ceph_mount_info *cmount, struct Inode *in, + const char *name, const UserPerm *perms); +int ceph_ll_statfs(struct ceph_mount_info *cmount, struct Inode *in, + struct statvfs *stbuf); +int ceph_ll_readlink(struct 
ceph_mount_info *cmount, struct Inode *in, + char *buf, size_t bufsize, const UserPerm *perms); +int ceph_ll_symlink(struct ceph_mount_info *cmount, + Inode *in, const char *name, const char *value, + Inode **out, struct ceph_statx *stx, + unsigned want, unsigned flags, + const UserPerm *perms); +int ceph_ll_rmdir(struct ceph_mount_info *cmount, struct Inode *in, + const char *name, const UserPerm *perms); +uint32_t ceph_ll_stripe_unit(struct ceph_mount_info *cmount, + struct Inode *in); +uint32_t ceph_ll_file_layout(struct ceph_mount_info *cmount, + struct Inode *in, + struct ceph_file_layout *layout); +uint64_t ceph_ll_snap_seq(struct ceph_mount_info *cmount, + struct Inode *in); +int ceph_ll_get_stripe_osd(struct ceph_mount_info *cmount, + struct Inode *in, + uint64_t blockno, + struct ceph_file_layout* layout); +int ceph_ll_num_osds(struct ceph_mount_info *cmount); +int ceph_ll_osdaddr(struct ceph_mount_info *cmount, + int osd, uint32_t *addr); +uint64_t ceph_ll_get_internal_offset(struct ceph_mount_info *cmount, + struct Inode *in, uint64_t blockno); +int ceph_ll_read_block(struct ceph_mount_info *cmount, + struct Inode *in, uint64_t blockid, + char* bl, uint64_t offset, uint64_t length, + struct ceph_file_layout* layout); +int ceph_ll_write_block(struct ceph_mount_info *cmount, + struct Inode *in, uint64_t blockid, + char* buf, uint64_t offset, + uint64_t length, struct ceph_file_layout* layout, + uint64_t snapseq, uint32_t sync); +int ceph_ll_commit_blocks(struct ceph_mount_info *cmount, + struct Inode *in, uint64_t offset, uint64_t range); + + +int ceph_ll_getlk(struct ceph_mount_info *cmount, + Fh *fh, struct flock *fl, uint64_t owner); +int ceph_ll_setlk(struct ceph_mount_info *cmount, + Fh *fh, struct flock *fl, uint64_t owner, int sleep); + +int ceph_ll_lazyio(struct ceph_mount_info *cmount, Fh *fh, int enable); + +/* + * Delegation support + * + * Delegations are way for an application to request exclusive or + * semi-exclusive access to an Inode. 
The client requests the delegation and + * if it's successful it can reliably cache file data and metadata until the + * delegation is recalled. + * + * Recalls are issued via a callback function, provided by the application. + * Callback functions should act something like signal handlers. You want to + * do as little as possible in the callback. Any major work should be deferred + * in some fashion as it's difficult to predict the context in which this + * function will be called. + * + * Once the delegation has been recalled, the application should return it as + * soon as possible. The application has client_deleg_timeout seconds to + * return it, after which the cmount structure is forcibly unmounted and + * further calls into it fail. + * + * The application can set the client_deleg_timeout config option to suit its + * needs, but it should take care to choose a value that allows it to avoid + * forcible eviction from the cluster in the event of an application bug. + */ + +/* Commands for manipulating delegation state */ +#ifndef CEPH_DELEGATION_NONE +# define CEPH_DELEGATION_NONE 0 +# define CEPH_DELEGATION_RD 1 +# define CEPH_DELEGATION_WR 2 +#endif + +/** + * Get the amount of time that the client has to return caps + * @param cmount the ceph mount handle to use. + * + * In the event that a client does not return its caps, the MDS may blocklist + * it after this timeout. Applications should check this value and ensure + * that they set the delegation timeout to a value lower than this. + * + * This call returns the cap return timeout (in seconds) for this cmount, or + * zero if it's not mounted. + */ +uint32_t ceph_get_cap_return_timeout(struct ceph_mount_info *cmount); + +/** + * Set the delegation timeout for the mount (thereby enabling delegations) + * @param cmount the ceph mount handle to use. 
+ * @param timeout the delegation timeout (in seconds) + * + * Since the client could end up blocklisted if it doesn't return delegations + * in time, we mandate that any application wanting to use delegations + * explicitly set the timeout beforehand. Until this call is done on the + * mount, attempts to set a delegation will return -ETIME. + * + * Once a delegation is recalled, if it is not returned in this amount of + * time, the cmount will be forcibly unmounted and further access attempts + * will fail (usually with -ENOTCONN errors). + * + * This value is further vetted against the cap return timeout, and this call + * can fail with -EINVAL if the timeout value is too long. Delegations can be + * disabled again by setting the timeout to 0. + */ +int ceph_set_deleg_timeout(struct ceph_mount_info *cmount, uint32_t timeout); + +/** + * Request a delegation on an open Fh + * @param cmount the ceph mount handle to use. + * @param fh file handle + * @param cmd CEPH_DELEGATION_* command + * @param cb callback function for recalling delegation + * @param priv opaque token passed back during recalls + * + * Returns 0 if the delegation was granted, -EAGAIN if there was a conflict + * and other error codes if there is a fatal error of some sort (e.g. -ENOMEM, + * -ETIME) + */ +int ceph_ll_delegation(struct ceph_mount_info *cmount, Fh *fh, + unsigned int cmd, ceph_deleg_cb_t cb, void *priv); + +mode_t ceph_umask(struct ceph_mount_info *cmount, mode_t mode); + +/* state reclaim */ +#define CEPH_RECLAIM_RESET 1 + +/** + * Set ceph client uuid + * @param cmount the ceph mount handle to use. + * @param uuid the uuid to set + * + * Must be called before mount. + */ +void ceph_set_uuid(struct ceph_mount_info *cmount, const char *uuid); + +/** + * Set ceph client session timeout + * @param cmount the ceph mount handle to use. + * @param timeout the timeout to set + * + * Must be called before mount. 
+ */
+void ceph_set_session_timeout(struct ceph_mount_info *cmount, unsigned timeout);
+
+/**
+ * Start to reclaim states of other client
+ * @param cmount the ceph mount handle to use.
+ * @param uuid uuid of client whose states need to be reclaimed
+ * @param flags flags that control how states get reclaimed
+ *
+ * Returns 0 on success, -EOPNOTSUPP if mds does not support the operation,
+ * -ENOENT if CEPH_RECLAIM_RESET is specified and there is no client
+ * with the given uuid, -ENOTRECOVERABLE in all other error cases.
+ */
+int ceph_start_reclaim(struct ceph_mount_info *cmount,
+		       const char *uuid, unsigned flags);
+
+/**
+ * finish reclaiming states of other client
+ * @param cmount the ceph mount handle to use.
+ */
+void ceph_finish_reclaim(struct ceph_mount_info *cmount);
+
+/**
+ * Register a set of callbacks to be used with this cmount
+ *
+ * This is deprecated, use ceph_ll_register_callbacks2() instead.
+ *
+ * @param cmount the ceph mount handle on which the cb's should be registered
+ * @param args callback arguments to register with the cmount
+ *
+ * Any fields set to NULL will be ignored. There currently is no way to
+ * unregister these callbacks, so this is a one-way change.
+ */
+void ceph_ll_register_callbacks(struct ceph_mount_info *cmount,
+				struct ceph_client_callback_args *args);
+
+/**
+ * Register a set of callbacks to be used with this cmount
+ * @param cmount the ceph mount handle on which the cb's should be registered
+ * @param args callback arguments to register with the cmount
+ *
+ * Any fields set to NULL will be ignored. There currently is no way to
+ * unregister these callbacks, so this is a one-way change.
+ *
+ * Returns 0 on success or -EBUSY if the cmount is mounting or already mounted.
+ */
+int ceph_ll_register_callbacks2(struct ceph_mount_info *cmount,
+				struct ceph_client_callback_args *args);
+
+/**
+ * Get snapshot info
+ *
+ * @param cmount the ceph mount handle to use.
+ * @param path the path of the snapshot. This must be either an + * absolute path or a relative path off of the current working directory. + * @returns 0 on success or a negative return code on error. + */ +int ceph_get_snap_info(struct ceph_mount_info *cmount, + const char *path, struct snap_info *snap_info); + +/** + * Free snapshot info buffers + * + * @param snap_info snapshot info struct (fetched via call to ceph_get_snap_info()). + */ +void ceph_free_snap_info_buffer(struct snap_info *snap_info); +#ifdef __cplusplus +} +#endif + +#endif diff --git a/src/include/cephfs/metrics/Types.h b/src/include/cephfs/metrics/Types.h new file mode 100644 index 000000000..d7cf56138 --- /dev/null +++ b/src/include/cephfs/metrics/Types.h @@ -0,0 +1,699 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#ifndef CEPH_INCLUDE_CEPHFS_METRICS_TYPES_H +#define CEPH_INCLUDE_CEPHFS_METRICS_TYPES_H + +#include <string> +#include <boost/variant.hpp> + +#include "common/Formatter.h" +#include "include/buffer_fwd.h" +#include "include/encoding.h" +#include "include/int_types.h" +#include "include/stringify.h" +#include "include/utime.h" + +namespace ceph { class Formatter; } + +enum ClientMetricType { + CLIENT_METRIC_TYPE_CAP_INFO, + CLIENT_METRIC_TYPE_READ_LATENCY, + CLIENT_METRIC_TYPE_WRITE_LATENCY, + CLIENT_METRIC_TYPE_METADATA_LATENCY, + CLIENT_METRIC_TYPE_DENTRY_LEASE, + CLIENT_METRIC_TYPE_OPENED_FILES, + CLIENT_METRIC_TYPE_PINNED_ICAPS, + CLIENT_METRIC_TYPE_OPENED_INODES, + CLIENT_METRIC_TYPE_READ_IO_SIZES, + CLIENT_METRIC_TYPE_WRITE_IO_SIZES, + CLIENT_METRIC_TYPE_AVG_READ_LATENCY, + CLIENT_METRIC_TYPE_STDEV_READ_LATENCY, + CLIENT_METRIC_TYPE_AVG_WRITE_LATENCY, + CLIENT_METRIC_TYPE_STDEV_WRITE_LATENCY, + CLIENT_METRIC_TYPE_AVG_METADATA_LATENCY, + CLIENT_METRIC_TYPE_STDEV_METADATA_LATENCY, +}; +inline std::ostream &operator<<(std::ostream &os, const ClientMetricType &type) { + switch(type) { + case 
ClientMetricType::CLIENT_METRIC_TYPE_CAP_INFO: + os << "CAP_INFO"; + break; + case ClientMetricType::CLIENT_METRIC_TYPE_READ_LATENCY: + os << "READ_LATENCY"; + break; + case ClientMetricType::CLIENT_METRIC_TYPE_WRITE_LATENCY: + os << "WRITE_LATENCY"; + break; + case ClientMetricType::CLIENT_METRIC_TYPE_METADATA_LATENCY: + os << "METADATA_LATENCY"; + break; + case ClientMetricType::CLIENT_METRIC_TYPE_DENTRY_LEASE: + os << "DENTRY_LEASE"; + break; + case ClientMetricType::CLIENT_METRIC_TYPE_OPENED_FILES: + os << "OPENED_FILES"; + break; + case ClientMetricType::CLIENT_METRIC_TYPE_PINNED_ICAPS: + os << "PINNED_ICAPS"; + break; + case ClientMetricType::CLIENT_METRIC_TYPE_OPENED_INODES: + os << "OPENED_INODES"; + break; + case ClientMetricType::CLIENT_METRIC_TYPE_READ_IO_SIZES: + os << "READ_IO_SIZES"; + break; + case ClientMetricType::CLIENT_METRIC_TYPE_WRITE_IO_SIZES: + os << "WRITE_IO_SIZES"; + break; + case ClientMetricType::CLIENT_METRIC_TYPE_AVG_READ_LATENCY: + os << "AVG_READ_LATENCY"; + break; + case ClientMetricType::CLIENT_METRIC_TYPE_STDEV_READ_LATENCY: + os << "STDEV_READ_LATENCY"; + break; + case ClientMetricType::CLIENT_METRIC_TYPE_AVG_WRITE_LATENCY: + os << "AVG_WRITE_LATENCY"; + break; + case ClientMetricType::CLIENT_METRIC_TYPE_STDEV_WRITE_LATENCY: + os << "STDEV_WRITE_LATENCY"; + break; + case ClientMetricType::CLIENT_METRIC_TYPE_AVG_METADATA_LATENCY: + os << "AVG_METADATA_LATENCY"; + break; + case ClientMetricType::CLIENT_METRIC_TYPE_STDEV_METADATA_LATENCY: + os << "STDEV_METADATA_LATENCY"; + break; + default: + os << "(UNKNOWN:" << static_cast<std::underlying_type<ClientMetricType>::type>(type) << ")"; + break; + } + + return os; +} + +struct ClientMetricPayloadBase { + ClientMetricPayloadBase(ClientMetricType type) : metric_type(type) {} + + ClientMetricType get_type() const { + return metric_type; + } + + void print_type(std::ostream *out) const { + *out << metric_type; + } + + private: + ClientMetricType metric_type; +}; + +struct CapInfoPayload : 
public ClientMetricPayloadBase { + uint64_t cap_hits = 0; + uint64_t cap_misses = 0; + uint64_t nr_caps = 0; + + CapInfoPayload() + : ClientMetricPayloadBase(ClientMetricType::CLIENT_METRIC_TYPE_CAP_INFO) { } + CapInfoPayload(uint64_t cap_hits, uint64_t cap_misses, uint64_t nr_caps) + : ClientMetricPayloadBase(ClientMetricType::CLIENT_METRIC_TYPE_CAP_INFO), + cap_hits(cap_hits), cap_misses(cap_misses), nr_caps(nr_caps) { + } + + void encode(bufferlist &bl) const { + using ceph::encode; + ENCODE_START(1, 1, bl); + encode(cap_hits, bl); + encode(cap_misses, bl); + encode(nr_caps, bl); + ENCODE_FINISH(bl); + } + + void decode(bufferlist::const_iterator &iter) { + using ceph::decode; + DECODE_START(1, iter); + decode(cap_hits, iter); + decode(cap_misses, iter); + decode(nr_caps, iter); + DECODE_FINISH(iter); + } + + void dump(Formatter *f) const { + f->dump_int("cap_hits", cap_hits); + f->dump_int("cap_misses", cap_misses); + f->dump_int("num_caps", nr_caps); + } + + void print(std::ostream *out) const { + *out << "cap_hits: " << cap_hits << " " + << "cap_misses: " << cap_misses << " " + << "num_caps: " << nr_caps; + } +}; + +struct ReadLatencyPayload : public ClientMetricPayloadBase { + utime_t lat; + utime_t mean; + uint64_t sq_sum; // sum of squares + uint64_t count; // IO count + + ReadLatencyPayload() + : ClientMetricPayloadBase(ClientMetricType::CLIENT_METRIC_TYPE_READ_LATENCY) { } + ReadLatencyPayload(utime_t lat, utime_t mean, uint64_t sq_sum, uint64_t count) + : ClientMetricPayloadBase(ClientMetricType::CLIENT_METRIC_TYPE_READ_LATENCY), + lat(lat), + mean(mean), + sq_sum(sq_sum), + count(count) { + } + + void encode(bufferlist &bl) const { + using ceph::encode; + ENCODE_START(2, 1, bl); + encode(lat, bl); + encode(mean, bl); + encode(sq_sum, bl); + encode(count, bl); + ENCODE_FINISH(bl); + } + + void decode(bufferlist::const_iterator &iter) { + using ceph::decode; + DECODE_START(2, iter); + decode(lat, iter); + if (struct_v >= 2) { + decode(mean, iter); + 
decode(sq_sum, iter); + decode(count, iter); + } + DECODE_FINISH(iter); + } + + void dump(Formatter *f) const { + f->dump_int("latency", lat); + f->dump_int("avg_latency", mean); + f->dump_unsigned("sq_sum", sq_sum); + f->dump_unsigned("count", count); + } + + void print(std::ostream *out) const { + *out << "latency: " << lat << ", avg_latency: " << mean + << ", sq_sum: " << sq_sum << ", count=" << count; + } +}; + +struct WriteLatencyPayload : public ClientMetricPayloadBase { + utime_t lat; + utime_t mean; + uint64_t sq_sum; // sum of squares + uint64_t count; // IO count + + WriteLatencyPayload() + : ClientMetricPayloadBase(ClientMetricType::CLIENT_METRIC_TYPE_WRITE_LATENCY) { } + WriteLatencyPayload(utime_t lat, utime_t mean, uint64_t sq_sum, uint64_t count) + : ClientMetricPayloadBase(ClientMetricType::CLIENT_METRIC_TYPE_WRITE_LATENCY), + lat(lat), + mean(mean), + sq_sum(sq_sum), + count(count){ + } + + void encode(bufferlist &bl) const { + using ceph::encode; + ENCODE_START(2, 1, bl); + encode(lat, bl); + encode(mean, bl); + encode(sq_sum, bl); + encode(count, bl); + ENCODE_FINISH(bl); + } + + void decode(bufferlist::const_iterator &iter) { + using ceph::decode; + DECODE_START(2, iter); + decode(lat, iter); + if (struct_v >= 2) { + decode(mean, iter); + decode(sq_sum, iter); + decode(count, iter); + } + DECODE_FINISH(iter); + } + + void dump(Formatter *f) const { + f->dump_int("latency", lat); + f->dump_int("avg_latency", mean); + f->dump_unsigned("sq_sum", sq_sum); + f->dump_unsigned("count", count); + } + + void print(std::ostream *out) const { + *out << "latency: " << lat << ", avg_latency: " << mean + << ", sq_sum: " << sq_sum << ", count=" << count; + } +}; + +struct MetadataLatencyPayload : public ClientMetricPayloadBase { + utime_t lat; + utime_t mean; + uint64_t sq_sum; // sum of squares + uint64_t count; // IO count + + MetadataLatencyPayload() + : ClientMetricPayloadBase(ClientMetricType::CLIENT_METRIC_TYPE_METADATA_LATENCY) { } + 
MetadataLatencyPayload(utime_t lat, utime_t mean, uint64_t sq_sum, uint64_t count) + : ClientMetricPayloadBase(ClientMetricType::CLIENT_METRIC_TYPE_METADATA_LATENCY), + lat(lat), + mean(mean), + sq_sum(sq_sum), + count(count) { + } + + void encode(bufferlist &bl) const { + using ceph::encode; + ENCODE_START(2, 1, bl); + encode(lat, bl); + encode(mean, bl); + encode(sq_sum, bl); + encode(count, bl); + ENCODE_FINISH(bl); + } + + void decode(bufferlist::const_iterator &iter) { + using ceph::decode; + DECODE_START(2, iter); + decode(lat, iter); + if (struct_v >= 2) { + decode(mean, iter); + decode(sq_sum, iter); + decode(count, iter); + } + DECODE_FINISH(iter); + } + + void dump(Formatter *f) const { + f->dump_int("latency", lat); + f->dump_int("avg_latency", mean); + f->dump_unsigned("sq_sum", sq_sum); + f->dump_unsigned("count", count); + } + + void print(std::ostream *out) const { + *out << "latency: " << lat << ", avg_latency: " << mean + << ", sq_sum: " << sq_sum << ", count=" << count; + } +}; + +struct DentryLeasePayload : public ClientMetricPayloadBase { + uint64_t dlease_hits = 0; + uint64_t dlease_misses = 0; + uint64_t nr_dentries = 0; + + DentryLeasePayload() + : ClientMetricPayloadBase(ClientMetricType::CLIENT_METRIC_TYPE_DENTRY_LEASE) { } + DentryLeasePayload(uint64_t dlease_hits, uint64_t dlease_misses, uint64_t nr_dentries) + : ClientMetricPayloadBase(ClientMetricType::CLIENT_METRIC_TYPE_DENTRY_LEASE), + dlease_hits(dlease_hits), dlease_misses(dlease_misses), nr_dentries(nr_dentries) { } + + void encode(bufferlist &bl) const { + using ceph::encode; + ENCODE_START(1, 1, bl); + encode(dlease_hits, bl); + encode(dlease_misses, bl); + encode(nr_dentries, bl); + ENCODE_FINISH(bl); + } + + void decode(bufferlist::const_iterator &iter) { + using ceph::decode; + DECODE_START(1, iter); + decode(dlease_hits, iter); + decode(dlease_misses, iter); + decode(nr_dentries, iter); + DECODE_FINISH(iter); + } + + void dump(Formatter *f) const { + 
f->dump_int("dlease_hits", dlease_hits); + f->dump_int("dlease_misses", dlease_misses); + f->dump_int("num_dentries", nr_dentries); + } + + void print(std::ostream *out) const { + *out << "dlease_hits: " << dlease_hits << " " + << "dlease_misses: " << dlease_misses << " " + << "num_dentries: " << nr_dentries; + } +}; + +struct OpenedFilesPayload : public ClientMetricPayloadBase { + uint64_t opened_files = 0; + uint64_t total_inodes = 0; + + OpenedFilesPayload() + : ClientMetricPayloadBase(ClientMetricType::CLIENT_METRIC_TYPE_OPENED_FILES) { } + OpenedFilesPayload(uint64_t opened_files, uint64_t total_inodes) + : ClientMetricPayloadBase(ClientMetricType::CLIENT_METRIC_TYPE_OPENED_FILES), + opened_files(opened_files), total_inodes(total_inodes) { } + + void encode(bufferlist &bl) const { + using ceph::encode; + ENCODE_START(1, 1, bl); + encode(opened_files, bl); + encode(total_inodes, bl); + ENCODE_FINISH(bl); + } + + void decode(bufferlist::const_iterator &iter) { + using ceph::decode; + DECODE_START(1, iter); + decode(opened_files, iter); + decode(total_inodes, iter); + DECODE_FINISH(iter); + } + + void dump(Formatter *f) const { + f->dump_int("opened_files", opened_files); + f->dump_int("total_inodes", total_inodes); + } + + void print(std::ostream *out) const { + *out << "opened_files: " << opened_files << " " + << "total_inodes: " << total_inodes; + } +}; + +struct PinnedIcapsPayload : public ClientMetricPayloadBase { + uint64_t pinned_icaps = 0; + uint64_t total_inodes = 0; + + PinnedIcapsPayload() + : ClientMetricPayloadBase(ClientMetricType::CLIENT_METRIC_TYPE_PINNED_ICAPS) { } + PinnedIcapsPayload(uint64_t pinned_icaps, uint64_t total_inodes) + : ClientMetricPayloadBase(ClientMetricType::CLIENT_METRIC_TYPE_PINNED_ICAPS), + pinned_icaps(pinned_icaps), total_inodes(total_inodes) { } + + void encode(bufferlist &bl) const { + using ceph::encode; + ENCODE_START(1, 1, bl); + encode(pinned_icaps, bl); + encode(total_inodes, bl); + ENCODE_FINISH(bl); + } + + void 
decode(bufferlist::const_iterator &iter) { + using ceph::decode; + DECODE_START(1, iter); + decode(pinned_icaps, iter); + decode(total_inodes, iter); + DECODE_FINISH(iter); + } + + void dump(Formatter *f) const { + f->dump_int("pinned_icaps", pinned_icaps); + f->dump_int("total_inodes", total_inodes); + } + + void print(std::ostream *out) const { + *out << "pinned_icaps: " << pinned_icaps << " " + << "total_inodes: " << total_inodes; + } +}; + +struct OpenedInodesPayload : public ClientMetricPayloadBase { + uint64_t opened_inodes = 0; + uint64_t total_inodes = 0; + + OpenedInodesPayload() + : ClientMetricPayloadBase(ClientMetricType::CLIENT_METRIC_TYPE_OPENED_INODES) { } + OpenedInodesPayload(uint64_t opened_inodes, uint64_t total_inodes) + : ClientMetricPayloadBase(ClientMetricType::CLIENT_METRIC_TYPE_OPENED_INODES), + opened_inodes(opened_inodes), total_inodes(total_inodes) { } + + void encode(bufferlist &bl) const { + using ceph::encode; + ENCODE_START(1, 1, bl); + encode(opened_inodes, bl); + encode(total_inodes, bl); + ENCODE_FINISH(bl); + } + + void decode(bufferlist::const_iterator &iter) { + using ceph::decode; + DECODE_START(1, iter); + decode(opened_inodes, iter); + decode(total_inodes, iter); + DECODE_FINISH(iter); + } + + void dump(Formatter *f) const { + f->dump_int("opened_inodes", opened_inodes); + f->dump_int("total_inodes", total_inodes); + } + + void print(std::ostream *out) const { + *out << "opened_inodes: " << opened_inodes << " " + << "total_inodes: " << total_inodes; + } +}; + +struct ReadIoSizesPayload : public ClientMetricPayloadBase { + uint64_t total_ops = 0; + uint64_t total_size = 0; + + ReadIoSizesPayload() + : ClientMetricPayloadBase(ClientMetricType::CLIENT_METRIC_TYPE_READ_IO_SIZES) { } + ReadIoSizesPayload(uint64_t total_ops, uint64_t total_size) + : ClientMetricPayloadBase(ClientMetricType::CLIENT_METRIC_TYPE_READ_IO_SIZES), + total_ops(total_ops), total_size(total_size) { } + + void encode(bufferlist &bl) const { + using 
ceph::encode; + ENCODE_START(1, 1, bl); + encode(total_ops, bl); + encode(total_size, bl); + ENCODE_FINISH(bl); + } + + void decode(bufferlist::const_iterator &iter) { + using ceph::decode; + DECODE_START(1, iter); + decode(total_ops, iter); + decode(total_size, iter); + DECODE_FINISH(iter); + } + + void dump(Formatter *f) const { + f->dump_int("total_ops", total_ops); + f->dump_int("total_size", total_size); + } + + void print(std::ostream *out) const { + *out << "total_ops: " << total_ops << " total_size: " << total_size; + } +}; + +struct WriteIoSizesPayload : public ClientMetricPayloadBase { + uint64_t total_ops = 0; + uint64_t total_size = 0; + + WriteIoSizesPayload() + : ClientMetricPayloadBase(ClientMetricType::CLIENT_METRIC_TYPE_WRITE_IO_SIZES) { } + WriteIoSizesPayload(uint64_t total_ops, uint64_t total_size) + : ClientMetricPayloadBase(ClientMetricType::CLIENT_METRIC_TYPE_WRITE_IO_SIZES), + total_ops(total_ops), total_size(total_size) { + } + + void encode(bufferlist &bl) const { + using ceph::encode; + ENCODE_START(1, 1, bl); + encode(total_ops, bl); + encode(total_size, bl); + ENCODE_FINISH(bl); + } + + void decode(bufferlist::const_iterator &iter) { + using ceph::decode; + DECODE_START(1, iter); + decode(total_ops, iter); + decode(total_size, iter); + DECODE_FINISH(iter); + } + + void dump(Formatter *f) const { + f->dump_int("total_ops", total_ops); + f->dump_int("total_size", total_size); + } + + void print(std::ostream *out) const { + *out << "total_ops: " << total_ops << " total_size: " << total_size; + } +}; + +struct UnknownPayload : public ClientMetricPayloadBase { + UnknownPayload() + : ClientMetricPayloadBase(static_cast<ClientMetricType>(-1)) { } + UnknownPayload(ClientMetricType metric_type) + : ClientMetricPayloadBase(metric_type) { } + + void encode(bufferlist &bl) const { + } + + void decode(bufferlist::const_iterator &iter) { + using ceph::decode; + DECODE_START(254, iter); + iter.seek(struct_len); + DECODE_FINISH(iter); + } + + void 
dump(Formatter *f) const { + } + + void print(std::ostream *out) const { + } +}; + +typedef boost::variant<CapInfoPayload, + ReadLatencyPayload, + WriteLatencyPayload, + MetadataLatencyPayload, + DentryLeasePayload, + OpenedFilesPayload, + PinnedIcapsPayload, + OpenedInodesPayload, + ReadIoSizesPayload, + WriteIoSizesPayload, + UnknownPayload> ClientMetricPayload; + +// metric update message sent by clients +struct ClientMetricMessage { +public: + ClientMetricMessage(const ClientMetricPayload &payload = UnknownPayload()) + : payload(payload) { + } + + class EncodePayloadVisitor : public boost::static_visitor<void> { + public: + explicit EncodePayloadVisitor(bufferlist &bl) : m_bl(bl) { + } + + template <typename ClientMetricPayload> + inline void operator()(const ClientMetricPayload &payload) const { + using ceph::encode; + encode(static_cast<uint32_t>(payload.get_type()), m_bl); + payload.encode(m_bl); + } + + private: + bufferlist &m_bl; + }; + + class DecodePayloadVisitor : public boost::static_visitor<void> { + public: + DecodePayloadVisitor(bufferlist::const_iterator &iter) : m_iter(iter) { + } + + template <typename ClientMetricPayload> + inline void operator()(ClientMetricPayload &payload) const { + using ceph::decode; + payload.decode(m_iter); + } + + private: + bufferlist::const_iterator &m_iter; + }; + + class DumpPayloadVisitor : public boost::static_visitor<void> { + public: + explicit DumpPayloadVisitor(Formatter *formatter) : m_formatter(formatter) { + } + + template <typename ClientMetricPayload> + inline void operator()(const ClientMetricPayload &payload) const { + m_formatter->dump_string("client_metric_type", stringify(payload.get_type())); + payload.dump(m_formatter); + } + + private: + Formatter *m_formatter; + }; + + class PrintPayloadVisitor : public boost::static_visitor<void> { + public: + explicit PrintPayloadVisitor(std::ostream *out) : _out(out) { + } + + template <typename ClientMetricPayload> + inline void operator()(const 
ClientMetricPayload &payload) const { + *_out << "[client_metric_type: "; + payload.print_type(_out); + *_out << " "; + payload.print(_out); + *_out << "]"; + } + + private: + std::ostream *_out; + }; + + void encode(bufferlist &bl) const { + boost::apply_visitor(EncodePayloadVisitor(bl), payload); + } + + void decode(bufferlist::const_iterator &iter) { + using ceph::decode; + + uint32_t metric_type; + decode(metric_type, iter); + + switch (metric_type) { + case ClientMetricType::CLIENT_METRIC_TYPE_CAP_INFO: + payload = CapInfoPayload(); + break; + case ClientMetricType::CLIENT_METRIC_TYPE_READ_LATENCY: + payload = ReadLatencyPayload(); + break; + case ClientMetricType::CLIENT_METRIC_TYPE_WRITE_LATENCY: + payload = WriteLatencyPayload(); + break; + case ClientMetricType::CLIENT_METRIC_TYPE_METADATA_LATENCY: + payload = MetadataLatencyPayload(); + break; + case ClientMetricType::CLIENT_METRIC_TYPE_DENTRY_LEASE: + payload = DentryLeasePayload(); + break; + case ClientMetricType::CLIENT_METRIC_TYPE_OPENED_FILES: + payload = OpenedFilesPayload(); + break; + case ClientMetricType::CLIENT_METRIC_TYPE_PINNED_ICAPS: + payload = PinnedIcapsPayload(); + break; + case ClientMetricType::CLIENT_METRIC_TYPE_OPENED_INODES: + payload = OpenedInodesPayload(); + break; + case ClientMetricType::CLIENT_METRIC_TYPE_READ_IO_SIZES: + payload = ReadIoSizesPayload(); + break; + case ClientMetricType::CLIENT_METRIC_TYPE_WRITE_IO_SIZES: + payload = WriteIoSizesPayload(); + break; + default: + payload = UnknownPayload(static_cast<ClientMetricType>(metric_type)); + break; + } + + boost::apply_visitor(DecodePayloadVisitor(iter), payload); + } + + void dump(Formatter *f) const { + apply_visitor(DumpPayloadVisitor(f), payload); + } + + void print(std::ostream *out) const { + apply_visitor(PrintPayloadVisitor(out), payload); + } + + ClientMetricPayload payload; +}; +WRITE_CLASS_ENCODER(ClientMetricMessage); + +#endif // CEPH_INCLUDE_CEPHFS_METRICS_TYPES_H diff --git a/src/include/cephfs/types.h 
b/src/include/cephfs/types.h new file mode 100644 index 000000000..cca0a6193 --- /dev/null +++ b/src/include/cephfs/types.h @@ -0,0 +1,970 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2020 Red Hat, Inc. + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. + */ +#ifndef CEPH_CEPHFS_TYPES_H +#define CEPH_CEPHFS_TYPES_H +#include "include/int_types.h" + +#include <ostream> +#include <set> +#include <map> +#include <string_view> + +#include "common/config.h" +#include "common/Clock.h" +#include "common/DecayCounter.h" +#include "common/StackStringStream.h" +#include "common/entity_name.h" + +#include "include/compat.h" +#include "include/Context.h" +#include "include/frag.h" +#include "include/xlist.h" +#include "include/interval_set.h" +#include "include/compact_set.h" +#include "include/fs_types.h" +#include "include/ceph_fs.h" + +#include "mds/inode_backtrace.h" + +#include <boost/spirit/include/qi.hpp> +#include <boost/pool/pool.hpp> +#include "include/ceph_assert.h" +#include <boost/serialization/strong_typedef.hpp> +#include "common/ceph_json.h" + +#define CEPH_FS_ONDISK_MAGIC "ceph fs volume v011" +#define MAX_MDS 0x100 + +BOOST_STRONG_TYPEDEF(uint64_t, mds_gid_t) +extern const mds_gid_t MDS_GID_NONE; + +typedef int32_t fs_cluster_id_t; +constexpr fs_cluster_id_t FS_CLUSTER_ID_NONE = -1; + +// The namespace ID of the anonymous default filesystem from legacy systems +constexpr fs_cluster_id_t FS_CLUSTER_ID_ANONYMOUS = 0; + +typedef int32_t mds_rank_t; +constexpr mds_rank_t MDS_RANK_NONE = -1; +constexpr mds_rank_t MDS_RANK_EPHEMERAL_DIST = -2; +constexpr mds_rank_t MDS_RANK_EPHEMERAL_RAND = -3; + +struct scatter_info_t { + version_t version = 0; +}; + +struct frag_info_t : 
public scatter_info_t { + int64_t size() const { return nfiles + nsubdirs; } + + void zero() { + *this = frag_info_t(); + } + + // *this += cur - acc; + void add_delta(const frag_info_t &cur, const frag_info_t &acc, bool *touched_mtime=0, bool *touched_chattr=0) { + if (cur.mtime > mtime) { + mtime = cur.mtime; + if (touched_mtime) + *touched_mtime = true; + } + if (cur.change_attr > change_attr) { + change_attr = cur.change_attr; + if (touched_chattr) + *touched_chattr = true; + } + nfiles += cur.nfiles - acc.nfiles; + nsubdirs += cur.nsubdirs - acc.nsubdirs; + } + + void add(const frag_info_t& other) { + if (other.mtime > mtime) + mtime = other.mtime; + if (other.change_attr > change_attr) + change_attr = other.change_attr; + nfiles += other.nfiles; + nsubdirs += other.nsubdirs; + } + + bool same_sums(const frag_info_t &o) const { + return mtime <= o.mtime && + nfiles == o.nfiles && + nsubdirs == o.nsubdirs; + } + + void encode(ceph::buffer::list &bl) const; + void decode(ceph::buffer::list::const_iterator& bl); + void dump(ceph::Formatter *f) const; + void decode_json(JSONObj *obj); + static void generate_test_instances(std::list<frag_info_t*>& ls); + + // this frag + utime_t mtime; + uint64_t change_attr = 0; + int64_t nfiles = 0; // files + int64_t nsubdirs = 0; // subdirs +}; +WRITE_CLASS_ENCODER(frag_info_t) + +inline bool operator==(const frag_info_t &l, const frag_info_t &r) { + return memcmp(&l, &r, sizeof(l)) == 0; +} +inline bool operator!=(const frag_info_t &l, const frag_info_t &r) { + return !(l == r); +} + +std::ostream& operator<<(std::ostream &out, const frag_info_t &f); + +struct nest_info_t : public scatter_info_t { + int64_t rsize() const { return rfiles + rsubdirs; } + + void zero() { + *this = nest_info_t(); + } + + void sub(const nest_info_t &other) { + add(other, -1); + } + void add(const nest_info_t &other, int fac=1) { + if (other.rctime > rctime) + rctime = other.rctime; + rbytes += fac*other.rbytes; + rfiles += fac*other.rfiles; + 
rsubdirs += fac*other.rsubdirs; + rsnaps += fac*other.rsnaps; + } + + // *this += cur - acc; + void add_delta(const nest_info_t &cur, const nest_info_t &acc) { + if (cur.rctime > rctime) + rctime = cur.rctime; + rbytes += cur.rbytes - acc.rbytes; + rfiles += cur.rfiles - acc.rfiles; + rsubdirs += cur.rsubdirs - acc.rsubdirs; + rsnaps += cur.rsnaps - acc.rsnaps; + } + + bool same_sums(const nest_info_t &o) const { + return rctime <= o.rctime && + rbytes == o.rbytes && + rfiles == o.rfiles && + rsubdirs == o.rsubdirs && + rsnaps == o.rsnaps; + } + + void encode(ceph::buffer::list &bl) const; + void decode(ceph::buffer::list::const_iterator& bl); + void dump(ceph::Formatter *f) const; + void decode_json(JSONObj *obj); + static void generate_test_instances(std::list<nest_info_t*>& ls); + + // this frag + children + utime_t rctime; + int64_t rbytes = 0; + int64_t rfiles = 0; + int64_t rsubdirs = 0; + int64_t rsnaps = 0; +}; +WRITE_CLASS_ENCODER(nest_info_t) + +inline bool operator==(const nest_info_t &l, const nest_info_t &r) { + return memcmp(&l, &r, sizeof(l)) == 0; +} +inline bool operator!=(const nest_info_t &l, const nest_info_t &r) { + return !(l == r); +} + +std::ostream& operator<<(std::ostream &out, const nest_info_t &n); + +struct vinodeno_t { + vinodeno_t() {} + vinodeno_t(inodeno_t i, snapid_t s) : ino(i), snapid(s) {} + + void encode(ceph::buffer::list& bl) const { + using ceph::encode; + encode(ino, bl); + encode(snapid, bl); + } + void decode(ceph::buffer::list::const_iterator& p) { + using ceph::decode; + decode(ino, p); + decode(snapid, p); + } + + inodeno_t ino; + snapid_t snapid; +}; +WRITE_CLASS_ENCODER(vinodeno_t) + +inline bool operator==(const vinodeno_t &l, const vinodeno_t &r) { + return l.ino == r.ino && l.snapid == r.snapid; +} +inline bool operator!=(const vinodeno_t &l, const vinodeno_t &r) { + return !(l == r); +} +inline bool operator<(const vinodeno_t &l, const vinodeno_t &r) { + return + l.ino < r.ino || + (l.ino == r.ino && l.snapid < 
r.snapid); +} + +typedef enum { + QUOTA_MAX_FILES, + QUOTA_MAX_BYTES, + QUOTA_ANY +} quota_max_t; + +struct quota_info_t +{ + void encode(ceph::buffer::list& bl) const { + ENCODE_START(1, 1, bl); + encode(max_bytes, bl); + encode(max_files, bl); + ENCODE_FINISH(bl); + } + void decode(ceph::buffer::list::const_iterator& p) { + DECODE_START_LEGACY_COMPAT_LEN(1, 1, 1, p); + decode(max_bytes, p); + decode(max_files, p); + DECODE_FINISH(p); + } + + void dump(ceph::Formatter *f) const; + static void generate_test_instances(std::list<quota_info_t *>& ls); + + bool is_valid() const { + return max_bytes >=0 && max_files >=0; + } + bool is_enabled(quota_max_t type=QUOTA_ANY) const { + switch (type) { + case QUOTA_MAX_FILES: + return !!max_files; + case QUOTA_MAX_BYTES: + return !!max_bytes; + case QUOTA_ANY: + default: + return !!max_bytes || !!max_files; + } + } + void decode_json(JSONObj *obj); + + int64_t max_bytes = 0; + int64_t max_files = 0; +}; +WRITE_CLASS_ENCODER(quota_info_t) + +inline bool operator==(const quota_info_t &l, const quota_info_t &r) { + return memcmp(&l, &r, sizeof(l)) == 0; +} + +std::ostream& operator<<(std::ostream &out, const quota_info_t &n); + +struct client_writeable_range_t { + struct byte_range_t { + uint64_t first = 0, last = 0; // interval client can write to + byte_range_t() {} + void decode_json(JSONObj *obj); + }; + + void encode(ceph::buffer::list &bl) const; + void decode(ceph::buffer::list::const_iterator& bl); + void dump(ceph::Formatter *f) const; + static void generate_test_instances(std::list<client_writeable_range_t*>& ls); + + byte_range_t range; + snapid_t follows = 0; // aka "data+metadata flushed thru" +}; + +inline void decode(client_writeable_range_t::byte_range_t& range, ceph::buffer::list::const_iterator& bl) { + using ceph::decode; + decode(range.first, bl); + decode(range.last, bl); +} + +WRITE_CLASS_ENCODER(client_writeable_range_t) + +std::ostream& operator<<(std::ostream& out, const client_writeable_range_t& r); + 
+inline bool operator==(const client_writeable_range_t& l, + const client_writeable_range_t& r) { + return l.range.first == r.range.first && l.range.last == r.range.last && + l.follows == r.follows; +} + +struct inline_data_t { +public: + inline_data_t() {} + inline_data_t(const inline_data_t& o) : version(o.version) { + if (o.blp) + set_data(*o.blp); + } + inline_data_t& operator=(const inline_data_t& o) { + version = o.version; + if (o.blp) + set_data(*o.blp); + else + free_data(); + return *this; + } + + void free_data() { + blp.reset(); + } + void get_data(ceph::buffer::list& ret) const { + if (blp) + ret = *blp; + else + ret.clear(); + } + void set_data(const ceph::buffer::list& bl) { + if (!blp) + blp.reset(new ceph::buffer::list); + *blp = bl; + } + size_t length() const { return blp ? blp->length() : 0; } + + bool operator==(const inline_data_t& o) const { + return length() == o.length() && + (length() == 0 || + (*const_cast<ceph::buffer::list*>(blp.get()) == *const_cast<ceph::buffer::list*>(o.blp.get()))); + } + bool operator!=(const inline_data_t& o) const { + return !(*this == o); + } + void encode(ceph::buffer::list &bl) const; + void decode(ceph::buffer::list::const_iterator& bl); + + version_t version = 1; + +private: + std::unique_ptr<ceph::buffer::list> blp; +}; +WRITE_CLASS_ENCODER(inline_data_t) + +enum { + DAMAGE_STATS, // statistics (dirstat, size, etc) + DAMAGE_RSTATS, // recursive statistics (rstat, accounted_rstat) + DAMAGE_FRAGTREE // fragtree -- repair by searching +}; + +template<template<typename> class Allocator = std::allocator> +struct inode_t { + /** + * *************** + * Do not forget to add any new fields to the compare() function. 
+ * *************** + */ + using client_range_map = std::map<client_t,client_writeable_range_t,std::less<client_t>,Allocator<std::pair<const client_t,client_writeable_range_t>>>; + + inode_t() + { + clear_layout(); + } + + // file type + bool is_symlink() const { return (mode & S_IFMT) == S_IFLNK; } + bool is_dir() const { return (mode & S_IFMT) == S_IFDIR; } + bool is_file() const { return (mode & S_IFMT) == S_IFREG; } + + bool is_truncating() const { return (truncate_pending > 0); } + void truncate(uint64_t old_size, uint64_t new_size, const bufferlist &fbl) { + truncate(old_size, new_size); + fscrypt_last_block = fbl; + } + void truncate(uint64_t old_size, uint64_t new_size) { + ceph_assert(new_size <= old_size); + if (old_size > max_size_ever) + max_size_ever = old_size; + truncate_from = old_size; + size = new_size; + rstat.rbytes = new_size; + truncate_size = size; + truncate_seq++; + truncate_pending++; + } + + bool has_layout() const { + return layout != file_layout_t(); + } + + void clear_layout() { + layout = file_layout_t(); + } + + uint64_t get_layout_size_increment() const { + return layout.get_period(); + } + + bool is_dirty_rstat() const { return !(rstat == accounted_rstat); } + + uint64_t get_client_range(client_t client) const { + auto it = client_ranges.find(client); + return it != client_ranges.end() ? 
it->second.range.last : 0; + } + + uint64_t get_max_size() const { + uint64_t max = 0; + for (std::map<client_t,client_writeable_range_t>::const_iterator p = client_ranges.begin(); + p != client_ranges.end(); + ++p) + if (p->second.range.last > max) + max = p->second.range.last; + return max; + } + void set_max_size(uint64_t new_max) { + if (new_max == 0) { + client_ranges.clear(); + } else { + for (std::map<client_t,client_writeable_range_t>::iterator p = client_ranges.begin(); + p != client_ranges.end(); + ++p) + p->second.range.last = new_max; + } + } + + void trim_client_ranges(snapid_t last) { + std::map<client_t, client_writeable_range_t>::iterator p = client_ranges.begin(); + while (p != client_ranges.end()) { + if (p->second.follows >= last) + client_ranges.erase(p++); + else + ++p; + } + } + + bool is_backtrace_updated() const { + return backtrace_version == version; + } + void update_backtrace(version_t pv=0) { + backtrace_version = pv ? pv : version; + } + + void add_old_pool(int64_t l) { + backtrace_version = version; + old_pools.insert(l); + } + + void encode(ceph::buffer::list &bl, uint64_t features) const; + void decode(ceph::buffer::list::const_iterator& bl); + void dump(ceph::Formatter *f) const; + static void client_ranges_cb(client_range_map& c, JSONObj *obj); + static void old_pools_cb(compact_set<int64_t, std::less<int64_t>, Allocator<int64_t> >& c, JSONObj *obj); + void decode_json(JSONObj *obj); + static void generate_test_instances(std::list<inode_t*>& ls); + /** + * Compare this inode_t with another that represent *the same inode* + * at different points in time. + * @pre The inodes are the same ino + * + * @param other The inode_t to compare ourselves with + * @param divergent A bool pointer which will be set to true + * if the values are different in a way that can't be explained + * by one being a newer version than the other. + * + * @returns 1 if we are newer than the other, 0 if equal, -1 if older. 
+ */ + int compare(const inode_t &other, bool *divergent) const; + + // base (immutable) + inodeno_t ino = 0; + uint32_t rdev = 0; // if special file + + // affected by any inode change... + utime_t ctime; // inode change time + utime_t btime; // birth time + + // perm (namespace permissions) + uint32_t mode = 0; + uid_t uid = 0; + gid_t gid = 0; + + // nlink + int32_t nlink = 0; + + // file (data access) + ceph_dir_layout dir_layout = {}; // [dir only] + file_layout_t layout; + compact_set<int64_t, std::less<int64_t>, Allocator<int64_t>> old_pools; + uint64_t size = 0; // on directory, # dentries + uint64_t max_size_ever = 0; // max size the file has ever been + uint32_t truncate_seq = 0; + uint64_t truncate_size = 0, truncate_from = 0; + uint32_t truncate_pending = 0; + utime_t mtime; // file data modify time. + utime_t atime; // file data access time. + uint32_t time_warp_seq = 0; // count of (potential) mtime/atime timewarps (i.e., utimes()) + inline_data_t inline_data; // FIXME check + + // change attribute + uint64_t change_attr = 0; + + client_range_map client_ranges; // client(s) can write to these ranges + + // dirfrag, recursive accountin + frag_info_t dirstat; // protected by my filelock + nest_info_t rstat; // protected by my nestlock + nest_info_t accounted_rstat; // protected by parent's nestlock + + quota_info_t quota; + + mds_rank_t export_pin = MDS_RANK_NONE; + + double export_ephemeral_random_pin = 0; + bool export_ephemeral_distributed_pin = false; + + // special stuff + version_t version = 0; // auth only + version_t file_data_version = 0; // auth only + version_t xattr_version = 0; + + utime_t last_scrub_stamp; // start time of last complete scrub + version_t last_scrub_version = 0;// (parent) start version of last complete scrub + + version_t backtrace_version = 0; + + snapid_t oldest_snap; + + std::basic_string<char,std::char_traits<char>,Allocator<char>> stray_prior_path; //stores path before unlink + + std::vector<uint8_t> fscrypt_auth; + 
std::vector<uint8_t> fscrypt_file; + + bufferlist fscrypt_last_block; + +private: + bool older_is_consistent(const inode_t &other) const; +}; + +// These methods may be moved back to mdstypes.cc when we have pmr +template<template<typename> class Allocator> +void inode_t<Allocator>::encode(ceph::buffer::list &bl, uint64_t features) const +{ + ENCODE_START(19, 6, bl); + + encode(ino, bl); + encode(rdev, bl); + encode(ctime, bl); + + encode(mode, bl); + encode(uid, bl); + encode(gid, bl); + + encode(nlink, bl); + { + // removed field + bool anchored = 0; + encode(anchored, bl); + } + + encode(dir_layout, bl); + encode(layout, bl, features); + encode(size, bl); + encode(truncate_seq, bl); + encode(truncate_size, bl); + encode(truncate_from, bl); + encode(truncate_pending, bl); + encode(mtime, bl); + encode(atime, bl); + encode(time_warp_seq, bl); + encode(client_ranges, bl); + + encode(dirstat, bl); + encode(rstat, bl); + encode(accounted_rstat, bl); + + encode(version, bl); + encode(file_data_version, bl); + encode(xattr_version, bl); + encode(backtrace_version, bl); + encode(old_pools, bl); + encode(max_size_ever, bl); + encode(inline_data, bl); + encode(quota, bl); + + encode(stray_prior_path, bl); + + encode(last_scrub_version, bl); + encode(last_scrub_stamp, bl); + + encode(btime, bl); + encode(change_attr, bl); + + encode(export_pin, bl); + + encode(export_ephemeral_random_pin, bl); + encode(export_ephemeral_distributed_pin, bl); + + encode(!fscrypt_auth.empty(), bl); + encode(fscrypt_auth, bl); + encode(fscrypt_file, bl); + encode(fscrypt_last_block, bl); + ENCODE_FINISH(bl); +} + +template<template<typename> class Allocator> +void inode_t<Allocator>::decode(ceph::buffer::list::const_iterator &p) +{ + DECODE_START_LEGACY_COMPAT_LEN(19, 6, 6, p); + + decode(ino, p); + decode(rdev, p); + decode(ctime, p); + + decode(mode, p); + decode(uid, p); + decode(gid, p); + + decode(nlink, p); + { + bool anchored; + decode(anchored, p); + } + + if (struct_v >= 4) + 
decode(dir_layout, p); + else { + // FIPS zeroization audit 20191117: this memset is not security related. + memset(&dir_layout, 0, sizeof(dir_layout)); + } + decode(layout, p); + decode(size, p); + decode(truncate_seq, p); + decode(truncate_size, p); + decode(truncate_from, p); + if (struct_v >= 5) + decode(truncate_pending, p); + else + truncate_pending = 0; + decode(mtime, p); + decode(atime, p); + decode(time_warp_seq, p); + if (struct_v >= 3) { + decode(client_ranges, p); + } else { + std::map<client_t, client_writeable_range_t::byte_range_t> m; + decode(m, p); + for (auto q = m.begin(); q != m.end(); ++q) + client_ranges[q->first].range = q->second; + } + + decode(dirstat, p); + decode(rstat, p); + decode(accounted_rstat, p); + + decode(version, p); + decode(file_data_version, p); + decode(xattr_version, p); + if (struct_v >= 2) + decode(backtrace_version, p); + if (struct_v >= 7) + decode(old_pools, p); + if (struct_v >= 8) + decode(max_size_ever, p); + if (struct_v >= 9) { + decode(inline_data, p); + } else { + inline_data.version = CEPH_INLINE_NONE; + } + if (struct_v < 10) + backtrace_version = 0; // force update backtrace + if (struct_v >= 11) + decode(quota, p); + + if (struct_v >= 12) { + std::string tmp; + decode(tmp, p); + stray_prior_path = std::string_view(tmp); + } + + if (struct_v >= 13) { + decode(last_scrub_version, p); + decode(last_scrub_stamp, p); + } + if (struct_v >= 14) { + decode(btime, p); + decode(change_attr, p); + } else { + btime = utime_t(); + change_attr = 0; + } + + if (struct_v >= 15) { + decode(export_pin, p); + } else { + export_pin = MDS_RANK_NONE; + } + + if (struct_v >= 16) { + decode(export_ephemeral_random_pin, p); + decode(export_ephemeral_distributed_pin, p); + } else { + export_ephemeral_random_pin = 0; + export_ephemeral_distributed_pin = false; + } + + if (struct_v >= 17) { + bool fscrypt_flag; + decode(fscrypt_flag, p); // ignored + } + + if (struct_v >= 18) { + decode(fscrypt_auth, p); + decode(fscrypt_file, p); + 
} + + if (struct_v >= 19) { + decode(fscrypt_last_block, p); + } + DECODE_FINISH(p); +} + +template<template<typename> class Allocator> +void inode_t<Allocator>::dump(ceph::Formatter *f) const +{ + f->dump_unsigned("ino", ino); + f->dump_unsigned("rdev", rdev); + f->dump_stream("ctime") << ctime; + f->dump_stream("btime") << btime; + f->dump_unsigned("mode", mode); + f->dump_unsigned("uid", uid); + f->dump_unsigned("gid", gid); + f->dump_unsigned("nlink", nlink); + + f->open_object_section("dir_layout"); + ::dump(dir_layout, f); + f->close_section(); + + f->dump_object("layout", layout); + + f->open_array_section("old_pools"); + for (const auto &p : old_pools) { + f->dump_int("pool", p); + } + f->close_section(); + + f->dump_unsigned("size", size); + f->dump_unsigned("truncate_seq", truncate_seq); + f->dump_unsigned("truncate_size", truncate_size); + f->dump_unsigned("truncate_from", truncate_from); + f->dump_unsigned("truncate_pending", truncate_pending); + f->dump_stream("mtime") << mtime; + f->dump_stream("atime") << atime; + f->dump_unsigned("time_warp_seq", time_warp_seq); + f->dump_unsigned("change_attr", change_attr); + f->dump_int("export_pin", export_pin); + f->dump_int("export_ephemeral_random_pin", export_ephemeral_random_pin); + f->dump_bool("export_ephemeral_distributed_pin", export_ephemeral_distributed_pin); + + f->open_array_section("client_ranges"); + for (const auto &p : client_ranges) { + f->open_object_section("client"); + f->dump_unsigned("client", p.first.v); + p.second.dump(f); + f->close_section(); + } + f->close_section(); + + f->open_object_section("dirstat"); + dirstat.dump(f); + f->close_section(); + + f->open_object_section("rstat"); + rstat.dump(f); + f->close_section(); + + f->open_object_section("accounted_rstat"); + accounted_rstat.dump(f); + f->close_section(); + + f->dump_unsigned("version", version); + f->dump_unsigned("file_data_version", file_data_version); + f->dump_unsigned("xattr_version", xattr_version); + 
f->dump_unsigned("backtrace_version", backtrace_version); + + f->dump_string("stray_prior_path", stray_prior_path); + f->dump_unsigned("max_size_ever", max_size_ever); + + f->open_object_section("quota"); + quota.dump(f); + f->close_section(); + + f->dump_stream("last_scrub_stamp") << last_scrub_stamp; + f->dump_unsigned("last_scrub_version", last_scrub_version); +} + +template<template<typename> class Allocator> +void inode_t<Allocator>::client_ranges_cb(typename inode_t<Allocator>::client_range_map& c, JSONObj *obj){ + + int64_t client; + JSONDecoder::decode_json("client", client, obj, true); + client_writeable_range_t client_range_tmp; + JSONDecoder::decode_json("byte range", client_range_tmp.range, obj, true); + JSONDecoder::decode_json("follows", client_range_tmp.follows.val, obj, true); + c[client] = client_range_tmp; +} + +template<template<typename> class Allocator> +void inode_t<Allocator>::old_pools_cb(compact_set<int64_t, std::less<int64_t>, Allocator<int64_t> >& c, JSONObj *obj){ + + int64_t tmp; + decode_json_obj(tmp, obj); + c.insert(tmp); +} + +template<template<typename> class Allocator> +void inode_t<Allocator>::decode_json(JSONObj *obj) +{ + + JSONDecoder::decode_json("ino", ino.val, obj, true); + JSONDecoder::decode_json("rdev", rdev, obj, true); + //JSONDecoder::decode_json("ctime", ctime, obj, true); + //JSONDecoder::decode_json("btime", btime, obj, true); + JSONDecoder::decode_json("mode", mode, obj, true); + JSONDecoder::decode_json("uid", uid, obj, true); + JSONDecoder::decode_json("gid", gid, obj, true); + JSONDecoder::decode_json("nlink", nlink, obj, true); + JSONDecoder::decode_json("dir_layout", dir_layout, obj, true); + JSONDecoder::decode_json("layout", layout, obj, true); + JSONDecoder::decode_json("old_pools", old_pools, inode_t<Allocator>::old_pools_cb, obj, true); + JSONDecoder::decode_json("size", size, obj, true); + JSONDecoder::decode_json("truncate_seq", truncate_seq, obj, true); + JSONDecoder::decode_json("truncate_size", 
truncate_size, obj, true); + JSONDecoder::decode_json("truncate_from", truncate_from, obj, true); + JSONDecoder::decode_json("truncate_pending", truncate_pending, obj, true); + //JSONDecoder::decode_json("mtime", mtime, obj, true); + //JSONDecoder::decode_json("atime", atime, obj, true); + JSONDecoder::decode_json("time_warp_seq", time_warp_seq, obj, true); + JSONDecoder::decode_json("change_attr", change_attr, obj, true); + JSONDecoder::decode_json("export_pin", export_pin, obj, true); + JSONDecoder::decode_json("client_ranges", client_ranges, inode_t<Allocator>::client_ranges_cb, obj, true); + JSONDecoder::decode_json("dirstat", dirstat, obj, true); + JSONDecoder::decode_json("rstat", rstat, obj, true); + JSONDecoder::decode_json("accounted_rstat", accounted_rstat, obj, true); + JSONDecoder::decode_json("version", version, obj, true); + JSONDecoder::decode_json("file_data_version", file_data_version, obj, true); + JSONDecoder::decode_json("xattr_version", xattr_version, obj, true); + JSONDecoder::decode_json("backtrace_version", backtrace_version, obj, true); + JSONDecoder::decode_json("stray_prior_path", stray_prior_path, obj, true); + JSONDecoder::decode_json("max_size_ever", max_size_ever, obj, true); + JSONDecoder::decode_json("quota", quota, obj, true); + JSONDecoder::decode_json("last_scrub_stamp", last_scrub_stamp, obj, true); + JSONDecoder::decode_json("last_scrub_version", last_scrub_version, obj, true); +} + +template<template<typename> class Allocator> +void inode_t<Allocator>::generate_test_instances(std::list<inode_t*>& ls) +{ + ls.push_back(new inode_t<Allocator>); + ls.push_back(new inode_t<Allocator>); + ls.back()->ino = 1; + // i am lazy. 
+} + +template<template<typename> class Allocator> +int inode_t<Allocator>::compare(const inode_t<Allocator> &other, bool *divergent) const +{ + ceph_assert(ino == other.ino); + *divergent = false; + if (version == other.version) { + if (rdev != other.rdev || + ctime != other.ctime || + btime != other.btime || + mode != other.mode || + uid != other.uid || + gid != other.gid || + nlink != other.nlink || + memcmp(&dir_layout, &other.dir_layout, sizeof(dir_layout)) || + layout != other.layout || + old_pools != other.old_pools || + size != other.size || + max_size_ever != other.max_size_ever || + truncate_seq != other.truncate_seq || + truncate_size != other.truncate_size || + truncate_from != other.truncate_from || + truncate_pending != other.truncate_pending || + change_attr != other.change_attr || + mtime != other.mtime || + atime != other.atime || + time_warp_seq != other.time_warp_seq || + inline_data != other.inline_data || + client_ranges != other.client_ranges || + !(dirstat == other.dirstat) || + !(rstat == other.rstat) || + !(accounted_rstat == other.accounted_rstat) || + file_data_version != other.file_data_version || + xattr_version != other.xattr_version || + backtrace_version != other.backtrace_version) { + *divergent = true; + } + return 0; + } else if (version > other.version) { + *divergent = !older_is_consistent(other); + return 1; + } else { + ceph_assert(version < other.version); + *divergent = !other.older_is_consistent(*this); + return -1; + } +} + +template<template<typename> class Allocator> +bool inode_t<Allocator>::older_is_consistent(const inode_t<Allocator> &other) const +{ + if (max_size_ever < other.max_size_ever || + truncate_seq < other.truncate_seq || + time_warp_seq < other.time_warp_seq || + inline_data.version < other.inline_data.version || + dirstat.version < other.dirstat.version || + rstat.version < other.rstat.version || + accounted_rstat.version < other.accounted_rstat.version || + file_data_version < other.file_data_version || 
+ xattr_version < other.xattr_version || + backtrace_version < other.backtrace_version) { + return false; + } + return true; +} + +template<template<typename> class Allocator> +inline void encode(const inode_t<Allocator> &c, ::ceph::buffer::list &bl, uint64_t features) +{ + ENCODE_DUMP_PRE(); + c.encode(bl, features); + ENCODE_DUMP_POST(cl); +} +template<template<typename> class Allocator> +inline void decode(inode_t<Allocator> &c, ::ceph::buffer::list::const_iterator &p) +{ + c.decode(p); +} + +// parse a map of keys/values. +namespace qi = boost::spirit::qi; + +template <typename Iterator> +struct keys_and_values + : qi::grammar<Iterator, std::map<std::string, std::string>()> +{ + keys_and_values() + : keys_and_values::base_type(query) + { + query = pair >> *(qi::lit(' ') >> pair); + pair = key >> '=' >> value; + key = qi::char_("a-zA-Z_") >> *qi::char_("a-zA-Z_0-9"); + value = +qi::char_("a-zA-Z0-9-_."); + } + qi::rule<Iterator, std::map<std::string, std::string>()> query; + qi::rule<Iterator, std::pair<std::string, std::string>()> pair; + qi::rule<Iterator, std::string()> key, value; +}; + +#endif diff --git a/src/include/color.h b/src/include/color.h new file mode 100644 index 000000000..6c8df40e0 --- /dev/null +++ b/src/include/color.h @@ -0,0 +1,13 @@ +#ifndef CEPH_COLOR_H +#define CEPH_COLOR_H + +#define TEXT_NORMAL "\033[0m" +/*#define TEXT_HAZARD "\033[5;31m"*/ +#define TEXT_RED "\033[0;31m" +#define TEXT_GREEN "\033[0;32m" +#define TEXT_YELLOW "\033[0;33m" +#define TEXT_BLUE "\033[0;34m" +#define TEXT_MAGENTA "\033[0;35m" +#define TEXT_CYAN "\033[0;36m" + +#endif diff --git a/src/include/common_fwd.h b/src/include/common_fwd.h new file mode 100644 index 000000000..d906aadfa --- /dev/null +++ b/src/include/common_fwd.h @@ -0,0 +1,32 @@ +#pragma once + +#if defined(WITH_SEASTAR) && !defined(WITH_ALIEN) +#define TOPNSPC crimson +#else +#define TOPNSPC ceph +#endif + +namespace TOPNSPC::common { + class CephContext; + class PerfCounters; + class 
PerfCountersBuilder; + class PerfCountersCollection; + class PerfCountersCollectionImpl; + class PerfGuard; + class RefCountedObject; + class RefCountedObjectSafe; + class RefCountedCond; + class RefCountedWaitObject; + class ConfigProxy; +} +using TOPNSPC::common::CephContext; +using TOPNSPC::common::PerfCounters; +using TOPNSPC::common::PerfCountersBuilder; +using TOPNSPC::common::PerfCountersCollection; +using TOPNSPC::common::PerfCountersCollectionImpl; +using TOPNSPC::common::PerfGuard; +using TOPNSPC::common::RefCountedObject; +using TOPNSPC::common::RefCountedObjectSafe; +using TOPNSPC::common::RefCountedCond; +using TOPNSPC::common::RefCountedWaitObject; +using TOPNSPC::common::ConfigProxy; diff --git a/src/include/compact_map.h b/src/include/compact_map.h new file mode 100644 index 000000000..21645e3d1 --- /dev/null +++ b/src/include/compact_map.h @@ -0,0 +1,383 @@ +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2015 Red Hat, Inc + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. 
+ * + */ +#ifndef CEPH_COMPACT_MAP_H +#define CEPH_COMPACT_MAP_H + +#include "buffer.h" +#include "encoding.h" + +#include <map> +#include <memory> + +#include "include/encoding.h" + +template <class Key, class T, class Map> +class compact_map_base { +protected: + std::unique_ptr<Map> map; + void alloc_internal() { + if (!map) + map.reset(new Map); + } + void free_internal() { + map.reset(); + } + template <class It> + class const_iterator_base { + const compact_map_base *map; + It it; + const_iterator_base() : map(0) { } + const_iterator_base(const compact_map_base* m) : map(m) { } + const_iterator_base(const compact_map_base *m, const It& i) : map(m), it(i) { } + friend class compact_map_base; + friend class iterator_base; + public: + const_iterator_base(const const_iterator_base& o) { + map = o.map; + it = o.it; + } + bool operator==(const const_iterator_base& o) const { + return (map == o.map) && (!map->map || it == o.it); + } + bool operator!=(const const_iterator_base& o) const { + return !(*this == o);; + } + const_iterator_base& operator=(const const_iterator_base& o) { + map = o.map; + it = o.it; + return *this; + } + const_iterator_base& operator++() { + ++it; + return *this; + } + const_iterator_base& operator--() { + --it; + return *this; + } + const std::pair<const Key,T>& operator*() { + return *it; + } + const std::pair<const Key,T>* operator->() { + return it.operator->(); + } + }; + template <class It> + class iterator_base { + private: + const compact_map_base* map; + It it; + iterator_base() : map(0) { } + iterator_base(compact_map_base* m) : map(m) { } + iterator_base(compact_map_base* m, const It& i) : map(m), it(i) { } + friend class compact_map_base; + public: + iterator_base(const iterator_base& o) { + map = o.map; + it = o.it; + } + bool operator==(const iterator_base& o) const { + return (map == o.map) && (!map->map || it == o.it); + } + bool operator!=(const iterator_base& o) const { + return !(*this == o);; + } + iterator_base& 
operator=(const iterator_base& o) { + map = o.map; + it = o.it; + return *this; + } + iterator_base& operator++() { + ++it; + return *this; + } + iterator_base operator++(int) { + iterator_base tmp = *this; + ++it; + return tmp; + } + iterator_base& operator--() { + --it; + return *this; + } + std::pair<const Key,T>& operator*() { + return *it; + } + std::pair<const Key,T>* operator->() { + return it.operator->(); + } + operator const_iterator_base<It>() const { + return const_iterator_base<It>(map, it); + } + }; + +public: + class iterator : public iterator_base<typename Map::iterator> { + public: + iterator() { } + iterator(const iterator_base<typename Map::iterator>& o) + : iterator_base<typename Map::iterator>(o) { } + iterator(compact_map_base* m) : iterator_base<typename Map::iterator>(m) { } + iterator(compact_map_base* m, const typename Map::iterator& i) + : iterator_base<typename Map::iterator>(m, i) { } + }; + class const_iterator : public const_iterator_base<typename Map::const_iterator> { + public: + const_iterator() { } + const_iterator(const iterator_base<typename Map::const_iterator>& o) + : const_iterator_base<typename Map::const_iterator>(o) { } + const_iterator(const compact_map_base* m) : const_iterator_base<typename Map::const_iterator>(m) { } + const_iterator(const compact_map_base* m, const typename Map::const_iterator& i) + : const_iterator_base<typename Map::const_iterator>(m, i) { } + }; + class reverse_iterator : public iterator_base<typename Map::reverse_iterator> { + public: + reverse_iterator() { } + reverse_iterator(const iterator_base<typename Map::reverse_iterator>& o) + : iterator_base<typename Map::reverse_iterator>(o) { } + reverse_iterator(compact_map_base* m) : iterator_base<typename Map::reverse_iterator>(m) { } + reverse_iterator(compact_map_base* m, const typename Map::reverse_iterator& i) + : iterator_base<typename Map::reverse_iterator>(m, i) { } + }; + class const_reverse_iterator : public const_iterator_base<typename 
Map::const_reverse_iterator> { + public: + const_reverse_iterator() { } + const_reverse_iterator(const iterator_base<typename Map::const_reverse_iterator>& o) + : iterator_base<typename Map::const_reverse_iterator>(o) { } + const_reverse_iterator(const compact_map_base* m) : const_iterator_base<typename Map::const_reverse_iterator>(m) { } + const_reverse_iterator(const compact_map_base* m, const typename Map::const_reverse_iterator& i) + : const_iterator_base<typename Map::const_reverse_iterator>(m, i) { } + }; + compact_map_base(const compact_map_base& o) { + if (o.map) { + alloc_internal(); + *map = *o.map; + } + } + compact_map_base() {} + ~compact_map_base() {} + + bool empty() const { + return !map || map->empty(); + } + size_t size() const { + return map ? map->size() : 0; + } + bool operator==(const compact_map_base& o) const { + return (empty() && o.empty()) || (map && o.map && *map == *o.map); + } + bool operator!=(const compact_map_base& o) const { + return !(*this == o); + } + size_t count (const Key& k) const { + return map ? map->count(k) : 0; + } + iterator erase (iterator p) { + if (map) { + ceph_assert(this == p.map); + auto it = map->erase(p.it); + if (map->empty()) { + free_internal(); + return iterator(this); + } else { + return iterator(this, it); + } + } else { + return iterator(this); + } + } + size_t erase (const Key& k) { + if (!map) + return 0; + size_t r = map->erase(k); + if (map->empty()) + free_internal(); + return r; + } + void clear() { + free_internal(); + } + void swap(compact_map_base& o) { + map.swap(o.map); + } + compact_map_base& operator=(const compact_map_base& o) { + if (o.map) { + alloc_internal(); + *map = *o.map; + } else + free_internal(); + return *this; + } + iterator insert(const std::pair<const Key, T>& val) { + alloc_internal(); + return iterator(this, map->insert(val)); + } + template <class... Args> + std::pair<iterator,bool> emplace ( Args&&... 
args ) { + alloc_internal(); + auto em = map->emplace(std::forward<Args>(args)...); + return std::pair<iterator,bool>(iterator(this, em.first), em.second); + } + iterator begin() { + if (!map) + return iterator(this); + return iterator(this, map->begin()); + } + iterator end() { + if (!map) + return iterator(this); + return iterator(this, map->end()); + } + reverse_iterator rbegin() { + if (!map) + return reverse_iterator(this); + return reverse_iterator(this, map->rbegin()); + } + reverse_iterator rend() { + if (!map) + return reverse_iterator(this); + return reverse_iterator(this, map->rend()); + } + iterator find(const Key& k) { + if (!map) + return iterator(this); + return iterator(this, map->find(k)); + } + iterator lower_bound(const Key& k) { + if (!map) + return iterator(this); + return iterator(this, map->lower_bound(k)); + } + iterator upper_bound(const Key& k) { + if (!map) + return iterator(this); + return iterator(this, map->upper_bound(k)); + } + const_iterator begin() const { + if (!map) + return const_iterator(this); + return const_iterator(this, map->begin()); + } + const_iterator end() const { + if (!map) + return const_iterator(this); + return const_iterator(this, map->end()); + } + const_reverse_iterator rbegin() const { + if (!map) + return const_reverse_iterator(this); + return const_reverse_iterator(this, map->rbegin()); + } + const_reverse_iterator rend() const { + if (!map) + return const_reverse_iterator(this); + return const_reverse_iterator(this, map->rend()); + } + const_iterator find(const Key& k) const { + if (!map) + return const_iterator(this); + return const_iterator(this, map->find(k)); + } + const_iterator lower_bound(const Key& k) const { + if (!map) + return const_iterator(this); + return const_iterator(this, map->lower_bound(k)); + } + const_iterator upper_bound(const Key& k) const { + if (!map) + return const_iterator(this); + return const_iterator(this, map->upper_bound(k)); + } + void encode(ceph::buffer::list &bl) const { + 
using ceph::encode; + if (map) + encode(*map, bl); + else + encode((uint32_t)0, bl); + } + void encode(ceph::buffer::list &bl, uint64_t features) const { + using ceph::encode; + if (map) + encode(*map, bl, features); + else + encode((uint32_t)0, bl); + } + void decode(ceph::buffer::list::const_iterator& p) { + using ceph::decode; + using ceph::decode_nohead; + uint32_t n; + decode(n, p); + if (n > 0) { + alloc_internal(); + decode_nohead(n, *map, p); + } else + free_internal(); + } +}; + +template<class Key, class T, class Map> +inline void encode(const compact_map_base<Key, T, Map>& m, ceph::buffer::list& bl) { + m.encode(bl); +} +template<class Key, class T, class Map> +inline void encode(const compact_map_base<Key, T, Map>& m, ceph::buffer::list& bl, + uint64_t features) { + m.encode(bl, features); +} +template<class Key, class T, class Map> +inline void decode(compact_map_base<Key, T, Map>& m, ceph::buffer::list::const_iterator& p) { + m.decode(p); +} + +template <class Key, class T, class Compare = std::less<Key>, class Alloc = std::allocator< std::pair<const Key, T> > > +class compact_map : public compact_map_base<Key, T, std::map<Key,T,Compare,Alloc> > { +public: + T& operator[](const Key& k) { + this->alloc_internal(); + return (*(this->map))[k]; + } +}; + +template <class Key, class T, class Compare = std::less<Key>, class Alloc = std::allocator< std::pair<const Key, T> > > +inline std::ostream& operator<<(std::ostream& out, const compact_map<Key, T, Compare, Alloc>& m) +{ + out << "{"; + bool first = true; + for (const auto &p : m) { + if (!first) + out << ","; + out << p.first << "=" << p.second; + first = false; + } + out << "}"; + return out; +} + +template <class Key, class T, class Compare = std::less<Key>, class Alloc = std::allocator< std::pair<const Key, T> > > +class compact_multimap : public compact_map_base<Key, T, std::multimap<Key,T,Compare,Alloc> > { +}; + +template <class Key, class T, class Compare = std::less<Key>, class Alloc = 
std::allocator< std::pair<const Key, T> > > +inline std::ostream& operator<<(std::ostream& out, const compact_multimap<Key, T, Compare, Alloc>& m) +{ + out << "{{"; + bool first = true; + for (const auto &p : m) { + if (!first) + out << ","; + out << p.first << "=" << p.second; + first = false; + } + out << "}}"; + return out; +} +#endif diff --git a/src/include/compact_set.h b/src/include/compact_set.h new file mode 100644 index 000000000..a364fd8c4 --- /dev/null +++ b/src/include/compact_set.h @@ -0,0 +1,305 @@ +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2015 Red Hat, Inc + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. + * + */ +#ifndef CEPH_COMPACT_SET_H +#define CEPH_COMPACT_SET_H + +#include "buffer.h" +#include "encoding.h" + +#include <memory> +#include <set> + +template <class T, class Set> +class compact_set_base { +protected: + std::unique_ptr<Set> set; + void alloc_internal() { + if (!set) + set.reset(new Set); + } + void free_internal() { + set.reset(); + } + template <class It> + class iterator_base { + private: + const compact_set_base* set; + It it; + iterator_base() : set(0) { } + iterator_base(const compact_set_base* s) : set(s) { } + iterator_base(const compact_set_base* s, const It& i) : set(s), it(i) { } + friend class compact_set_base; + public: + iterator_base(const iterator_base& o) { + set = o.set; + it = o.it; + } + bool operator==(const iterator_base& o) const { + return (set == o.set) && (!set->set || it == o.it); + } + bool operator!=(const iterator_base& o) const { + return !(*this == o);; + } + iterator_base& operator=(const iterator_base& o) { + set->set = o.set; + it = o.it; + return *this; + } + iterator_base& operator++() { + ++it; + return *this; + } + iterator_base operator++(int) { + iterator_base tmp = *this; + ++it; + return tmp; + } 
+ iterator_base& operator--() { + --it; + return *this; + } + const T& operator*() { + return *it; + } + }; +public: + class const_iterator : public iterator_base<typename Set::const_iterator> { + public: + const_iterator() { } + const_iterator(const iterator_base<typename Set::const_iterator>& o) + : iterator_base<typename Set::const_iterator>(o) { } + const_iterator(const compact_set_base* s) : iterator_base<typename Set::const_iterator>(s) { } + const_iterator(const compact_set_base* s, const typename Set::const_iterator& i) + : iterator_base<typename Set::const_iterator>(s, i) { } + }; + class iterator : public iterator_base<typename Set::iterator> { + public: + iterator() { } + iterator(const iterator_base<typename Set::iterator>& o) + : iterator_base<typename Set::iterator>(o) { } + iterator(compact_set_base* s) : iterator_base<typename Set::iterator>(s) { } + iterator(compact_set_base* s, const typename Set::iterator& i) + : iterator_base<typename Set::iterator>(s, i) { } + operator const_iterator() const { + return const_iterator(this->set, this->it); + } + }; + class const_reverse_iterator : public iterator_base<typename Set::const_reverse_iterator> { + public: + const_reverse_iterator() { } + const_reverse_iterator(const iterator_base<typename Set::const_reverse_iterator>& o) + : iterator_base<typename Set::const_reverse_iterator>(o) { } + const_reverse_iterator(const compact_set_base* s) : iterator_base<typename Set::const_reverse_iterator>(s) { } + const_reverse_iterator(const compact_set_base* s, const typename Set::const_reverse_iterator& i) + : iterator_base<typename Set::const_reverse_iterator>(s, i) { } + }; + class reverse_iterator : public iterator_base<typename Set::reverse_iterator> { + public: + reverse_iterator() { } + reverse_iterator(const iterator_base<typename Set::reverse_iterator>& o) + : iterator_base<typename Set::reverse_iterator>(o) { } + reverse_iterator(compact_set_base* s) : iterator_base<typename Set::reverse_iterator>(s) { } + 
reverse_iterator(compact_set_base* s, const typename Set::reverse_iterator& i) + : iterator_base<typename Set::reverse_iterator>(s, i) { } + operator const_iterator() const { + return const_iterator(this->set, this->it); + } + }; + + compact_set_base() {} + compact_set_base(const compact_set_base& o) { + if (o.set) { + alloc_internal(); + *set = *o.set; + } + } + ~compact_set_base() {} + + + bool empty() const { + return !set || set->empty(); + } + size_t size() const { + return set ? set->size() : 0; + } + bool operator==(const compact_set_base& o) const { + return (empty() && o.empty()) || (set && o.set && *set == *o.set); + } + bool operator!=(const compact_set_base& o) const { + return !(*this == o); + } + size_t count(const T& t) const { + return set ? set->count(t) : 0; + } + iterator erase (iterator p) { + if (set) { + ceph_assert(this == p.set); + auto it = set->erase(p.it); + if (set->empty()) { + free_internal(); + return iterator(this); + } else { + return iterator(this, it); + } + } else { + return iterator(this); + } + } + size_t erase (const T& t) { + if (!set) + return 0; + size_t r = set->erase(t); + if (set->empty()) + free_internal(); + return r; + } + void clear() { + free_internal(); + } + void swap(compact_set_base& o) { + set.swap(o.set); + } + compact_set_base& operator=(const compact_set_base& o) { + if (o.set) { + alloc_internal(); + *set = *o.set; + } else + free_internal(); + return *this; + } + std::pair<iterator,bool> insert(const T& t) { + alloc_internal(); + std::pair<typename Set::iterator,bool> r = set->insert(t); + return std::make_pair(iterator(this, r.first), r.second); + } + template <class... Args> + std::pair<iterator,bool> emplace ( Args&&... 
args ) { + alloc_internal(); + auto em = set->emplace(std::forward<Args>(args)...); + return std::pair<iterator,bool>(iterator(this, em.first), em.second); + } + + iterator begin() { + if (!set) + return iterator(this); + return iterator(this, set->begin()); + } + iterator end() { + if (!set) + return iterator(this); + return iterator(this, set->end()); + } + reverse_iterator rbegin() { + if (!set) + return reverse_iterator(this); + return reverse_iterator(this, set->rbegin()); + } + reverse_iterator rend() { + if (!set) + return reverse_iterator(this); + return reverse_iterator(this, set->rend()); + } + iterator find(const T& t) { + if (!set) + return iterator(this); + return iterator(this, set->find(t)); + } + iterator lower_bound(const T& t) { + if (!set) + return iterator(this); + return iterator(this, set->lower_bound(t)); + } + iterator upper_bound(const T& t) { + if (!set) + return iterator(this); + return iterator(this, set->upper_bound(t)); + } + const_iterator begin() const { + if (!set) + return const_iterator(this); + return const_iterator(this, set->begin()); + } + const_iterator end() const { + if (!set) + return const_iterator(this); + return const_iterator(this, set->end()); + } + const_reverse_iterator rbegin() const { + if (!set) + return const_reverse_iterator(this); + return const_reverse_iterator(this, set->rbegin()); + } + const_reverse_iterator rend() const { + if (!set) + return const_reverse_iterator(this); + return const_reverse_iterator(this, set->rend()); + } + const_iterator find(const T& t) const { + if (!set) + return const_iterator(this); + return const_iterator(this, set->find(t)); + } + const_iterator lower_bound(const T& t) const { + if (!set) + return const_iterator(this); + return const_iterator(this, set->lower_bound(t)); + } + const_iterator upper_bound(const T& t) const { + if (!set) + return const_iterator(this); + return const_iterator(this, set->upper_bound(t)); + } + void encode(ceph::buffer::list &bl) const { + using 
ceph::encode; + if (set) + encode(*set, bl); + else + encode((uint32_t)0, bl); + } + void decode(ceph::buffer::list::const_iterator& p) { + using ceph::decode; + uint32_t n; + decode(n, p); + if (n > 0) { + alloc_internal(); + ceph::decode_nohead(n, *set, p); + } else + free_internal(); + } +}; + +template<class T, class Set> +inline void encode(const compact_set_base<T, Set>& m, ceph::buffer::list& bl) { + m.encode(bl); +} +template<class T, class Set> +inline void decode(compact_set_base<T, Set>& m, ceph::buffer::list::const_iterator& p) { + m.decode(p); +} + +template <class T, class Compare = std::less<T>, class Alloc = std::allocator<T> > +class compact_set : public compact_set_base<T, std::set<T, Compare, Alloc> > { +}; + +template <class T, class Compare = std::less<T>, class Alloc = std::allocator<T> > +inline std::ostream& operator<<(std::ostream& out, const compact_set<T,Compare,Alloc>& s) +{ + bool first = true; + for (auto &v : s) { + if (!first) + out << ","; + out << v; + first = false; + } + return out; +} +#endif diff --git a/src/include/compat.h b/src/include/compat.h new file mode 100644 index 000000000..1100d69eb --- /dev/null +++ b/src/include/compat.h @@ -0,0 +1,420 @@ +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2011 Stanislav Sedov <stas@FreeBSD.org> + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. 
+ */ + +#ifndef CEPH_COMPAT_H +#define CEPH_COMPAT_H + +#include "acconfig.h" +#include <sys/types.h> +#include <errno.h> +#include <stdlib.h> +#include <unistd.h> + +#if defined(__linux__) +#define PROCPREFIX +#endif + +#include <fcntl.h> +#ifndef F_OFD_SETLK +#define F_OFD_SETLK F_SETLK +#endif + +#include <sys/stat.h> + +#ifdef _WIN32 +#include "include/win32/fs_compat.h" +#endif + +#ifndef ACCESSPERMS +#define ACCESSPERMS (S_IRWXU|S_IRWXG|S_IRWXO) +#endif + +#ifndef ALLPERMS +#define ALLPERMS (S_ISUID|S_ISGID|S_ISVTX|S_IRWXU|S_IRWXG|S_IRWXO) +#endif + +#if defined(__FreeBSD__) + +// FreeBSD supports Linux procfs with its compatibility module +// And all compatibility stuff is standard mounted on this +#define PROCPREFIX "/compat/linux" + +#ifndef MSG_MORE +#define MSG_MORE 0 +#endif + +#ifndef O_DSYNC +#define O_DSYNC O_SYNC +#endif + +/* And include the extra required include file */ +#include <pthread_np.h> + +#include <sys/param.h> +#include <sys/cpuset.h> +#define cpu_set_t cpuset_t +int sched_setaffinity(pid_t pid, size_t cpusetsize, + cpu_set_t *mask); + +#endif /* __FreeBSD__ */ + +#if defined(__APPLE__) +struct cpu_set_t; +#endif + +#if defined(__APPLE__) || defined(__FreeBSD__) +/* Make sure that ENODATA is defined in the correct way */ +#ifdef ENODATA +#if (ENODATA == 9919) +// #warning ENODATA already defined to be 9919, redefining to fix +// Silencing this warning because it fires at all files where compat.h +// is included after boost files. +// +// This value stems from the definition in the boost library +// And when this case occurs it is due to the fact that boost files +// are included before this file. Redefinition might not help in this +// case since already parsed code has evaluated to the wrong value. +// This would warrrant for d definition that would actually be evaluated +// at the location of usage and report a possible conflict. 
+// This is left up to a future improvement +#elif (ENODATA != 87) +// #warning ENODATA already defined to a value different from 87 (ENOATRR), refining to fix +#endif +#undef ENODATA +#endif +#define ENODATA ENOATTR + +// Fix clock accuracy +#if !defined(CLOCK_MONOTONIC_COARSE) +#if defined(CLOCK_MONOTONIC_FAST) +#define CLOCK_MONOTONIC_COARSE CLOCK_MONOTONIC_FAST +#else +#define CLOCK_MONOTONIC_COARSE CLOCK_MONOTONIC +#endif +#endif +#if !defined(CLOCK_REALTIME_COARSE) +#if defined(CLOCK_REALTIME_FAST) +#define CLOCK_REALTIME_COARSE CLOCK_REALTIME_FAST +#else +#define CLOCK_REALTIME_COARSE CLOCK_REALTIME +#endif +#endif + +/* get PATH_MAX */ +#include <limits.h> + +#ifndef EUCLEAN +#define EUCLEAN 117 +#endif +#ifndef EREMOTEIO +#define EREMOTEIO 121 +#endif +#ifndef EKEYREJECTED +#define EKEYREJECTED 129 +#endif +#ifndef XATTR_CREATE +#define XATTR_CREATE 1 +#endif + +#endif /* __APPLE__ */ + +#ifndef HOST_NAME_MAX +#ifdef MAXHOSTNAMELEN +#define HOST_NAME_MAX MAXHOSTNAMELEN +#else +#define HOST_NAME_MAX 255 +#endif +#endif /* HOST_NAME_MAX */ + +/* O_LARGEFILE is not defined/required on OSX/FreeBSD */ +#ifndef O_LARGEFILE +#define O_LARGEFILE 0 +#endif + +/* Could be relevant for other platforms */ +#ifndef ERESTART +#define ERESTART EINTR +#endif + +#ifndef TEMP_FAILURE_RETRY +#define TEMP_FAILURE_RETRY(expression) ({ \ + __typeof(expression) __result; \ + do { \ + __result = (expression); \ + } while (__result == -1 && errno == EINTR); \ + __result; }) +#endif + +#ifdef __cplusplus +# define VOID_TEMP_FAILURE_RETRY(expression) \ + static_cast<void>(TEMP_FAILURE_RETRY(expression)) +#else +# define VOID_TEMP_FAILURE_RETRY(expression) \ + do { (void)TEMP_FAILURE_RETRY(expression); } while (0) +#endif + +#if defined(__FreeBSD__) || defined(__APPLE__) +#define lseek64(fd, offset, whence) lseek(fd, offset, whence) +#endif + +#if defined(__sun) || defined(_AIX) +#define LOG_AUTHPRIV (10<<3) +#define LOG_FTP (11<<3) +#define __STRING(x) "x" +#endif + +#if 
defined(__sun) || defined(_AIX) || defined(_WIN32) +#define IFTODT(mode) (((mode) & 0170000) >> 12) +#endif + +#if defined(_AIX) +#define MSG_DONTWAIT MSG_NONBLOCK +#endif + +#if defined(HAVE_PTHREAD_SETNAME_NP) + #if defined(__APPLE__) + #define ceph_pthread_setname(thread, name) ({ \ + int __result = 0; \ + if (thread == pthread_self()) \ + __result = pthread_setname_np(name); \ + __result; }) + #else + #define ceph_pthread_setname pthread_setname_np + #endif +#elif defined(HAVE_PTHREAD_SET_NAME_NP) + /* Fix a small name diff and return 0 */ + #define ceph_pthread_setname(thread, name) ({ \ + pthread_set_name_np(thread, name); \ + 0; }) +#else + /* compiler warning free success noop */ + #define ceph_pthread_setname(thread, name) ({ \ + int __i = 0; \ + __i; }) +#endif + +#if defined(HAVE_PTHREAD_GETNAME_NP) + #define ceph_pthread_getname pthread_getname_np +#elif defined(HAVE_PTHREAD_GET_NAME_NP) + #define ceph_pthread_getname(thread, name, len) ({ \ + pthread_get_name_np(thread, name, len); \ + 0; }) +#else + /* compiler warning free success noop */ + #define ceph_pthread_getname(thread, name, len) ({ \ + if (name != NULL) \ + *name = '\0'; \ + 0; }) +#endif + +int ceph_posix_fallocate(int fd, off_t offset, off_t len); + +#ifdef __cplusplus +extern "C" { +#endif + +int pipe_cloexec(int pipefd[2], int flags); +char *ceph_strerror_r(int errnum, char *buf, size_t buflen); +unsigned get_page_size(); +// On success, returns the number of bytes written to the buffer. On +// failure, returns -1. +ssize_t get_self_exe_path(char* path, int buff_length); + +int ceph_memzero_s(void *dest, size_t destsz, size_t count); + +#ifdef __cplusplus +} +#endif + +#if defined(_WIN32) + +#include "include/win32/winsock_compat.h" + +#include <windows.h> +#include <time.h> + +#include "include/win32/win32_errno.h" + +// There are a few name collisions between Windows headers and Ceph. 
+// Updating Ceph definitions would be the prefferable fix in order to avoid +// confussion, unless it requires too many changes, in which case we're going +// to redefine Windows values by adding the "WIN32_" prefix. +#define WIN32_DELETE 0x00010000L +#undef DELETE + +#define WIN32_ERROR 0 +#undef ERROR + +#ifndef uint +typedef unsigned int uint; +#endif + +typedef _sigset_t sigset_t; + +typedef unsigned int blksize_t; +typedef unsigned __int64 blkcnt_t; +typedef unsigned short nlink_t; + +typedef long long loff_t; + +#define CPU_SETSIZE (sizeof(size_t)*8) + +typedef union +{ + char cpuset[CPU_SETSIZE/8]; + size_t _align; +} cpu_set_t; + +struct iovec { + void *iov_base; + size_t iov_len; +}; + +#define SHUT_RD SD_RECEIVE +#define SHUT_WR SD_SEND +#define SHUT_RDWR SD_BOTH + +#ifndef SIGINT +#define SIGINT 2 +#endif + +#ifndef SIGKILL +#define SIGKILL 9 +#endif + +#define IOV_MAX 1024 + +#ifdef __cplusplus +extern "C" { +#endif + +ssize_t readv(int fd, const struct iovec *iov, int iov_cnt); +ssize_t writev(int fd, const struct iovec *iov, int iov_cnt); + +int fsync(int fd); +ssize_t pread(int fd, void *buf, size_t count, off_t offset); +ssize_t pwrite(int fd, const void *buf, size_t count, off_t offset); + +long int lrand48(void); +int random(); + +int pipe(int pipefd[2]); + +int posix_memalign(void **memptr, size_t alignment, size_t size); + +char *strptime(const char *s, const char *format, struct tm *tm); + +int chown(const char *path, uid_t owner, gid_t group); +int fchown(int fd, uid_t owner, gid_t group); +int lchown(const char *path, uid_t owner, gid_t group); +int setenv(const char *name, const char *value, int overwrite); + +int geteuid(); +int getegid(); +int getuid(); +int getgid(); + +#define unsetenv(name) _putenv_s(name, "") + +int win_socketpair(int socks[2]); + +#ifdef __MINGW32__ +extern _CRTIMP errno_t __cdecl _putenv_s(const char *_Name,const char *_Value); + +#if defined(__BYTE_ORDER__) && (__BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__) +#define 
htobe16(x) __builtin_bswap16(x) +#define htole16(x) (x) +#define be16toh(x) __builtin_bswap16(x) +#define le16toh(x) (x) + +#define htobe32(x) __builtin_bswap32(x) +#define htole32(x) (x) +#define be32toh(x) __builtin_bswap32(x) +#define le32toh(x) (x) + +#define htobe64(x) __builtin_bswap64(x) +#define htole64(x) (x) +#define be64toh(x) __builtin_bswap64(x) +#define le64toh(x) (x) +#endif // defined(__BYTE_ORDER__) && (__BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__) + +#endif // __MINGW32__ + +#ifdef __cplusplus +} +#endif + +#define compat_closesocket closesocket +// Use "aligned_free" when freeing memory allocated using posix_memalign or +// _aligned_malloc. Using "free" will crash. +static inline void aligned_free(void* ptr) { + _aligned_free(ptr); +} + +// O_CLOEXEC is not defined on Windows. Since handles aren't inherited +// with subprocesses unless explicitly requested, we'll define this +// flag as a no-op. +#define O_CLOEXEC 0 +#define SOCKOPT_VAL_TYPE char* + +#define DEV_NULL "nul" + +#else /* WIN32 */ + +#define SOCKOPT_VAL_TYPE void* + +static inline void aligned_free(void* ptr) { + free(ptr); +} +static inline int compat_closesocket(int fildes) { + return close(fildes); +} + +#define DEV_NULL "/dev/null" + +#endif /* WIN32 */ + +/* Supplies code to be run at startup time before invoking main(). + * Use as: + * + * CEPH_CONSTRUCTOR(my_constructor) { + * ...some code... + * } + */ +#ifdef _MSC_VER +#pragma section(".CRT$XCU",read) +#define CEPH_CONSTRUCTOR(f) \ + static void __cdecl f(void); \ + __declspec(allocate(".CRT$XCU")) static void (__cdecl*f##_)(void) = f; \ + static void __cdecl f(void) +#else +#define CEPH_CONSTRUCTOR(f) \ + static void f(void) __attribute__((constructor)); \ + static void f(void) +#endif + +/* This should only be used with the socket API. */ +static inline int ceph_sock_errno() { +#ifdef _WIN32 + return wsae_to_errno(WSAGetLastError()); +#else + return errno; +#endif +} + +// Needed on Windows when handling binary files. 
Without it, line +// endings will be replaced and certain characters can be treated as +// EOF. +#ifndef O_BINARY +#define O_BINARY 0 +#endif + +#endif /* !CEPH_COMPAT_H */ diff --git a/src/include/config-h.in.cmake b/src/include/config-h.in.cmake new file mode 100644 index 000000000..cc9ad0ec7 --- /dev/null +++ b/src/include/config-h.in.cmake @@ -0,0 +1,393 @@ +/* config.h file expanded by Cmake for build */ + +#ifndef CONFIG_H +#define CONFIG_H + +/* Define to 1 if you have the `memset_s()` function. */ +#cmakedefine HAVE_MEMSET_S + +/* fallocate(2) is supported */ +#cmakedefine CEPH_HAVE_FALLOCATE + +/* Define to 1 if you have the `posix_fadvise' function. */ +#cmakedefine HAVE_POSIX_FADVISE 1 + +/* Define to 1 if you have the `posix_fallocate' function. */ +#cmakedefine HAVE_POSIX_FALLOCATE 1 + +/* Define to 1 if you have the `syncfs' function. */ +#cmakedefine HAVE_SYS_SYNCFS 1 + +/* sync_file_range(2) is supported */ +#cmakedefine HAVE_SYNC_FILE_RANGE + +/* Define if you have mallinfo */ +#cmakedefine HAVE_MALLINFO + +/* Define to 1 if you have the `pwritev' function. */ +#cmakedefine HAVE_PWRITEV 1 + +/* Define to 1 if you have the <sys/mount.h> header file. */ +#cmakedefine HAVE_SYS_MOUNT_H 1 + +/* Define to 1 if you have the <sys/param.h> header file. */ +#cmakedefine HAVE_SYS_PARAM_H 1 + +/* Define to 1 if you have the <sys/types.h> header file. */ +#cmakedefine HAVE_SYS_TYPES_H 1 + +/* Define to 1 if you have the <sys/vfs.h> header file. */ +#cmakedefine HAVE_SYS_VFS_H 1 + +/* Define to 1 if you have the <execinfo.h> header file. */ +#cmakedefine HAVE_EXECINFO_H 1 + +/* Define to 1 if the system has the type `__s16'. */ +#cmakedefine HAVE___S16 1 + +/* Define to 1 if the system has the type `__s32'. */ +#cmakedefine HAVE___S32 1 + +/* Define to 1 if the system has the type `__s64'. */ +#cmakedefine HAVE___S64 1 + +/* Define to 1 if the system has the type `__s8'. */ +#cmakedefine HAVE___S8 1 + +/* Define to 1 if the system has the type `__u16'. 
*/
+#cmakedefine HAVE___U16 1
+
+/* Define to 1 if the system has the type `__u32'. */
+#cmakedefine HAVE___U32 1
+
+/* Define to 1 if the system has the type `__u64'. */
+#cmakedefine HAVE___U64 1
+
+/* Define to 1 if the system has the type `__u8'. */
+#cmakedefine HAVE___U8 1
+
+/* Define if the system has the type `in_addr_t' */
+#cmakedefine HAVE_IN_ADDR_T
+
+/* Define if you have suseconds_t */
+#cmakedefine HAVE_SUSECONDS_T
+
+/* Define if you have res_nquery */
+#cmakedefine HAVE_RES_NQUERY
+
+/* Defined if you have LZ4 */
+#cmakedefine HAVE_LZ4
+
+/* Defined if you have BROTLI */
+#cmakedefine HAVE_BROTLI
+
+/* Defined if you have libaio */
+#cmakedefine HAVE_LIBAIO
+
+/* Defined if you have libdml */
+#cmakedefine HAVE_LIBDML
+
+/* Defined if you have libzbd */
+#cmakedefine HAVE_LIBZBD
+
+/* Defined if you have liburing */
+#cmakedefine HAVE_LIBURING
+
+/* Defined if you have POSIX AIO */
+#cmakedefine HAVE_POSIXAIO
+
+/* Defined if OpenLDAP enabled */
+#cmakedefine HAVE_OPENLDAP
+
+/* Define if you have fuse */
+#cmakedefine HAVE_LIBFUSE
+
+/* Define version major */
+#define CEPH_FUSE_MAJOR_VERSION @FUSE_MAJOR_VERSION@
+
+/* Define version minor */
+#define CEPH_FUSE_MINOR_VERSION @FUSE_MINOR_VERSION@
+
+/* Define to 1 if you have libxfs */
+#cmakedefine HAVE_LIBXFS 1
+
+/* SPDK conditional compilation */
+#cmakedefine HAVE_SPDK
+
+/* DPDK conditional compilation */
+#cmakedefine HAVE_DPDK
+
+/* PMEM_DEVICE (OSD) conditional compilation */
+#cmakedefine HAVE_BLUESTORE_PMEM
+
+/* Define if you have tcmalloc */
+#cmakedefine HAVE_LIBTCMALLOC
+#cmakedefine LIBTCMALLOC_MISSING_ALIGNED_ALLOC
+
+/* AsyncMessenger RDMA conditional compilation */
+#cmakedefine HAVE_RDMA
+
+/* ibverbs experimental conditional compilation */
+#cmakedefine HAVE_IBV_EXP
+
+/* define if bluestore enabled */
+#cmakedefine WITH_BLUESTORE
+
+/* define if cephfs enabled */
+#cmakedefine WITH_CEPHFS
+
+/* define if systemd is enabled */
+#cmakedefine WITH_SYSTEMD
+
+/* define if
GSSAPI/KRB5 enabled */ +#cmakedefine HAVE_GSSAPI + +/* define if rbd enabled */ +#cmakedefine WITH_RBD + +/* define if kernel rbd enabled */ +#cmakedefine WITH_KRBD + +/* define if key-value-store is enabled */ +#cmakedefine WITH_KVS + +/* define if radosgw enabled */ +#cmakedefine WITH_RADOSGW + +/* define if radosgw has openssl support */ +#cmakedefine WITH_CURL_OPENSSL + +/* define if HAVE_THREAD_SAFE_RES_QUERY */ +#cmakedefine HAVE_THREAD_SAFE_RES_QUERY + +/* define if HAVE_REENTRANT_STRSIGNAL */ +#cmakedefine HAVE_REENTRANT_STRSIGNAL + +/* Define if you want to use LTTng */ +#cmakedefine WITH_LTTNG + +/* Define if you want to use Jaeger */ +#cmakedefine HAVE_JAEGER + +/* Define if you want to use EVENTTRACE */ +#cmakedefine WITH_EVENTTRACE + +/* Define if you want to OSD function instrumentation */ +#cmakedefine WITH_OSD_INSTRUMENT_FUNCTIONS + +/* Define if you want to use Babeltrace */ +#cmakedefine WITH_BABELTRACE + +/* Define to 1 if you have the <babeltrace/babeltrace.h> header file. */ +#cmakedefine HAVE_BABELTRACE_BABELTRACE_H 1 + +/* Define to 1 if you have the <babeltrace/ctf/events.h> header file. */ +#cmakedefine HAVE_BABELTRACE_CTF_EVENTS_H 1 + +/* Define to 1 if you have the <babeltrace/ctf/iterator.h> header file. */ +#cmakedefine HAVE_BABELTRACE_CTF_ITERATOR_H 1 + +/* Define to 1 if you have the <arpa/nameser_compat.h> header file. */ +#cmakedefine HAVE_ARPA_NAMESER_COMPAT_H 1 + +/* FastCGI headers are in /usr/include/fastcgi */ +#cmakedefine FASTCGI_INCLUDE_DIR + +/* splice(2) is supported */ +#cmakedefine CEPH_HAVE_SPLICE + +/* Define if you want C_Gather debugging */ +#cmakedefine DEBUG_GATHER + +/* Define to 1 if you have the `getgrouplist' function. */ +#cmakedefine HAVE_GETGROUPLIST 1 + +/* LTTng is disabled, so define this macro to be nothing. */ +#cmakedefine tracepoint + +/* Define to 1 if you have fdatasync. */ +#cmakedefine HAVE_FDATASYNC 1 + +/* Define to 1 if you have the <valgrind/helgrind.h> header file. 
*/ +#cmakedefine HAVE_VALGRIND_HELGRIND_H 1 + +/* Define to 1 if you have the <sys/prctl.h> header file. */ +#cmakedefine HAVE_SYS_PRCTL_H 1 + +/* Define to 1 if you have the <linux/types.h> header file. */ +#cmakedefine HAVE_LINUX_TYPES_H 1 + +/* Define to 1 if you have the <linux/version.h> header file. */ +#cmakedefine HAVE_LINUX_VERSION_H 1 + +/* Define to 1 if you have sched.h. */ +#cmakedefine HAVE_SCHED 1 + +/* Define to 1 if you have sigdescr_np. */ +#cmakedefine HAVE_SIGDESCR_NP 1 + +/* Support SSE (Streaming SIMD Extensions) instructions */ +#cmakedefine HAVE_SSE + +/* Support SSE2 (Streaming SIMD Extensions 2) instructions */ +#cmakedefine HAVE_SSE2 + +/* Define to 1 if you have the `pipe2' function. */ +#cmakedefine HAVE_PIPE2 1 + +/* Support NEON instructions */ +#cmakedefine HAVE_NEON + +/* Define if you have pthread_spin_init */ +#cmakedefine HAVE_PTHREAD_SPINLOCK + +/* name_to_handle_at exists */ +#cmakedefine HAVE_NAME_TO_HANDLE_AT + +/* we have a recent nasm and are x86_64 */ +#cmakedefine HAVE_NASM_X64 + +/* nasm can also build the isa-l:avx512 */ +#cmakedefine HAVE_NASM_X64_AVX512 + +/* Define if the erasure code isa-l plugin is compiled */ +#cmakedefine WITH_EC_ISA_PLUGIN + +/* Define to 1 if strerror_r returns char *. */ +#cmakedefine STRERROR_R_CHAR_P 1 + +/* Defined if you have libzfs enabled */ +#cmakedefine HAVE_LIBZFS + +/* Define if the C compiler supports __func__ */ +#cmakedefine HAVE_FUNC + +/* Define if the C compiler supports __PRETTY_FUNCTION__ */ +#cmakedefine HAVE_PRETTY_FUNC + +/* Define if the C compiler supports __attribute__((__symver__ (".."))) */ +#cmakedefine HAVE_ATTR_SYMVER + +/* Define if the C compiler supports __asm__(".symver ..") */ +#cmakedefine HAVE_ASM_SYMVER + +/* Have eventfd extension. */ +#cmakedefine HAVE_EVENTFD + +/* Define if enabling coverage. 
*/ +#cmakedefine ENABLE_COVERAGE + +/* Defined if you want pg ref debugging */ +#cmakedefine PG_DEBUG_REFS + +/* Support ARMv8 CRC instructions */ +#cmakedefine HAVE_ARMV8_CRC + +/* Support ARMv8 CRYPTO instructions */ +#cmakedefine HAVE_ARMV8_CRYPTO + +/* Support ARMv8 CRC and CRYPTO intrinsics */ +#cmakedefine HAVE_ARMV8_CRC_CRYPTO_INTRINSICS + +/* Define if you have struct stat.st_mtimespec.tv_nsec */ +#cmakedefine HAVE_STAT_ST_MTIMESPEC_TV_NSEC + +/* Define if you have struct stat.st_mtim.tv_nsec */ +#cmakedefine HAVE_STAT_ST_MTIM_TV_NSEC + +/* Define if compiler supports static_cast<> */ +#cmakedefine HAVE_STATIC_CAST + +/* Version number of package */ +#cmakedefine PROJECT_VERSION "@PROJECT_VERSION@" + +/* Defined if pthread_setname_np() is available */ +#cmakedefine HAVE_PTHREAD_SETNAME_NP 1 + +/* Defined if pthread_rwlockattr_setkind_np() is available */ +#cmakedefine HAVE_PTHREAD_RWLOCKATTR_SETKIND_NP + +/* Defined if blkin enabled */ +#cmakedefine WITH_BLKIN + +/* Defined if pthread_set_name_np() is available */ +#cmakedefine HAVE_PTHREAD_SET_NAME_NP + +/* Defined if pthread_getname_np() is available */ +#cmakedefine HAVE_PTHREAD_GETNAME_NP 1 + +/* Support POWER8 instructions */ +#cmakedefine HAVE_POWER8 + +/* Define if endian type is big endian */ +#cmakedefine CEPH_BIG_ENDIAN + +/* Define if endian type is little endian */ +#cmakedefine CEPH_LITTLE_ENDIAN + +#cmakedefine MGR_PYTHON_EXECUTABLE "@MGR_PYTHON_EXECUTABLE@" + +/* Define to 1 if you have the `getprogname' function. 
*/
+#cmakedefine HAVE_GETPROGNAME 1
+
+/* Defined if getentropy() is available */
+#cmakedefine HAVE_GETENTROPY
+
+/* Defined if libradosstriper is enabled: */
+#cmakedefine WITH_LIBRADOSSTRIPER
+
+/* Defined if OpenSSL is available for the rgw beast frontend */
+#cmakedefine WITH_RADOSGW_BEAST_OPENSSL
+
+/* Defined if rabbitmq-c is available for rgw amqp push endpoint */
+#cmakedefine WITH_RADOSGW_AMQP_ENDPOINT
+
+/* Defined if librdkafka is available for rgw kafka push endpoint */
+#cmakedefine WITH_RADOSGW_KAFKA_ENDPOINT
+
+/* Defined if lua packages can be installed by radosgw */
+#cmakedefine WITH_RADOSGW_LUA_PACKAGES
+
+/* Backend dbstore for Rados Gateway */
+#cmakedefine WITH_RADOSGW_DBSTORE
+
+/* Backend CORTX-Motr for Rados Gateway */
+#cmakedefine WITH_RADOSGW_MOTR
+
+/* Backend CORTX-DAOS for Rados Gateway */
+#cmakedefine WITH_RADOSGW_DAOS
+
+/* Defined if std::map::merge() is supported */
+#cmakedefine HAVE_STDLIB_MAP_SPLICING
+
+/* Defined if Intel QAT compress/decompress is supported */
+#cmakedefine HAVE_QATZIP
+
+/* Define if seastar is available. */
+#cmakedefine HAVE_SEASTAR
+
+/* Define if unit tests are built.
*/ +#cmakedefine UNIT_TESTS_BUILT + +/* Define if RBD QCOW migration format is enabled */ +#cmakedefine WITH_RBD_MIGRATION_FORMAT_QCOW_V1 + +/* Define if libcephsqlite is enabled */ +#cmakedefine WITH_LIBCEPHSQLITE + +/* Define if RWL is enabled */ +#cmakedefine WITH_RBD_RWL + +/* Define if PWL-SSD is enabled */ +#cmakedefine WITH_RBD_SSD_CACHE + +/* Define if libcryptsetup can be used (linux only) */ +#cmakedefine HAVE_LIBCRYPTSETUP + +/* Shared library extension, such as .so, .dll or .dylib */ +#cmakedefine CMAKE_SHARED_LIBRARY_SUFFIX "@CMAKE_SHARED_LIBRARY_SUFFIX@" + +/* libexec directory path */ +#cmakedefine CMAKE_INSTALL_LIBEXECDIR "@CMAKE_INSTALL_LIBEXECDIR@" + +#endif /* CONFIG_H */ diff --git a/src/include/coredumpctl.h b/src/include/coredumpctl.h new file mode 100644 index 000000000..60b91e999 --- /dev/null +++ b/src/include/coredumpctl.h @@ -0,0 +1,105 @@ +#pragma once + +#include "acconfig.h" + +#ifdef HAVE_SYS_PRCTL_H +#include <iostream> +#include <sys/prctl.h> +#include "common/errno.h" + +class PrCtl { + int saved_state = -1; + static int get_dumpable() { + int r = prctl(PR_GET_DUMPABLE); + if (r == -1) { + r = errno; + std::cerr << "warning: unable to get dumpable flag: " << cpp_strerror(r) + << std::endl; + } + return r; + } + static int set_dumpable(bool new_state) { + int r = prctl(PR_SET_DUMPABLE, new_state); + if (r) { + r = -errno; + std::cerr << "warning: unable to " << (new_state ? 
"set" : "unset") + << " dumpable flag: " << cpp_strerror(r) + << std::endl; + } + return r; + } +public: + PrCtl(int new_state = 0) { + int r = get_dumpable(); + if (r == -1) { + return; + } + if (r != new_state) { + if (!set_dumpable(new_state)) { + saved_state = r; + } + } + } + ~PrCtl() { + if (saved_state < 0) { + return; + } + set_dumpable(saved_state); + } +}; + +#else +#ifdef RLIMIT_CORE +#include <sys/resource.h> +#include <iostream> +#include <sys/resource.h> +#include "common/errno.h" + +class PrCtl { + rlimit saved_lim; + static int get_dumpable(rlimit* saved) { + int r = getrlimit(RLIMIT_CORE, saved); + if (r) { + r = errno; + std::cerr << "warning: unable to getrlimit(): " << cpp_strerror(r) + << std::endl; + } + return r; + } + static void set_dumpable(const rlimit& rlim) { + int r = setrlimit(RLIMIT_CORE, &rlim); + if (r) { + r = -errno; + std::cerr << "warning: unable to setrlimit(): " << cpp_strerror(r) + << std::endl; + } + } +public: + PrCtl(int new_state = 0) { + int r = get_dumpable(&saved_lim); + if (r == -1) { + return; + } + rlimit new_lim; + if (new_state) { + new_lim.rlim_cur = saved_lim.rlim_max; + } else { + new_lim.rlim_cur = new_lim.rlim_max = 0; + } + if (new_lim.rlim_cur == saved_lim.rlim_cur) { + return; + } + set_dumpable(new_lim); + } + ~PrCtl() { + set_dumpable(saved_lim); + } +}; +#else +struct PrCtl { + // to silence the Wunused-variable warning + PrCtl() {} +}; + +#endif // RLIMIT_CORE +#endif diff --git a/src/include/counter.h b/src/include/counter.h new file mode 100644 index 000000000..61ed7409c --- /dev/null +++ b/src/include/counter.h @@ -0,0 +1,56 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2017 Red Hat, Inc. 
// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
// vim: ts=8 sw=2 smarttab
/*
 * Ceph - scalable distributed file system
 *
 * Copyright (C) 2017 Red Hat, Inc.
 *
 * This is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License version 2.1, as published by the Free Software
 * Foundation.  See file COPYING.
 *
 */

#ifndef CEPH_COUNTER_H
#define CEPH_COUNTER_H

#include <atomic>
#include <cstdint>

// Counter<T>: per-type instance counter (typically used CRTP-style, i.e.
// `class Foo : public Counter<Foo>`).  For each distinct T it tracks how
// many objects are currently alive (count()) and how many were ever
// constructed (increments()); decrements() is derived from the two.
template <typename T>
class Counter {
public:
  Counter() {
    _count()++;
    _increments()++;
  }
  Counter(const Counter &) {
    _count()++;
    _increments()++;
  }
  // Bug fix: a move constructor still creates a brand-new object whose
  // destructor will decrement _count, so it must increment the counters
  // exactly like the other constructors.  The previous empty body made
  // count() underflow (wrap around, since it is unsigned) once a moved-to
  // object and its moved-from source were both destroyed.
  Counter(Counter &&) {
    _count()++;
    _increments()++;
  }
  // Copy/move assignment (implicitly defaulted) correctly leave the
  // counters untouched: no object is created or destroyed.
  ~Counter() {
    _count()--;
  }
  // Number of currently live objects of type T.
  static uint64_t count() {
    return _count();
  }
  // Total number of objects of type T ever constructed.
  static uint64_t increments() {
    return _increments();
  }
  // Total number of objects of type T already destroyed.
  static uint64_t decrements() {
    return increments()-count();
  }

private:
  // Function-local statics: one counter pair per instantiated T,
  // zero-initialized, thread-safe initialization.
  static std::atomic<uint64_t> &_count() {
    static std::atomic<uint64_t> c;
    return c;
  }
  static std::atomic<uint64_t> &_increments() {
    static std::atomic<uint64_t> i;
    return i;
  }
};

#endif
+// The red-black tree implementation of STL set/map has an overhead of 3 +// pointers (left, right and parent) plus the node color information for each +// stored value. So a set<int32_t> consumes 40 bytes for each value stored in +// 64-bit mode. This btree implementation stores multiple values on fixed +// size nodes (usually 256 bytes) and doesn't store child pointers for leaf +// nodes. The result is that a btree_set<int32_t> may use much less memory per +// stored value. For the random insertion benchmark in btree_bench.cc, a +// btree_set<int32_t> with node-size of 256 uses 5.1 bytes per stored value. +// +// The packing of multiple values on to each node of a btree has another effect +// besides better space utilization: better cache locality due to fewer cache +// lines being accessed. Better cache locality translates into faster +// operations. +// +// CAVEATS +// +// Insertions and deletions on a btree can cause splitting, merging or +// rebalancing of btree nodes. And even without these operations, insertions +// and deletions on a btree will move values around within a node. In both +// cases, the result is that insertions and deletions can invalidate iterators +// pointing to values other than the one being inserted/deleted. Therefore, this +// container does not provide pointer stability. This is notably different from +// STL set/map which takes care to not invalidate iterators on insert/erase +// except, of course, for iterators pointing to the value being erased. A +// partial workaround when erasing is available: erase() returns an iterator +// pointing to the item just after the one that was erased (or end() if none +// exists). 
+ +#pragma once + +#include <algorithm> +#include <cassert> +#include <cstddef> +#include <cstdint> +#include <cstring> +#include <experimental/type_traits> +#include <functional> +#include <iterator> +#include <limits> +#include <new> +#include <type_traits> +#include <utility> + +namespace btree::internal { + +template <typename Compare, typename T> +using btree_is_key_compare_to = + std::is_signed<std::invoke_result_t<Compare, T, T>>; + +template<typename T> +using compare_to_t = decltype(std::declval<T&>().compare(std::declval<const T&>())); +template<typename T> +inline constexpr bool has_compare_to = std::experimental::is_detected_v<compare_to_t, T>; +// A helper class to convert a boolean comparison into a three-way "compare-to" +// comparison that returns a negative value to indicate less-than, zero to +// indicate equality and a positive value to indicate greater-than. This helper +// class is specialized for less<std::string>, greater<std::string>, +// less<string_view>, and greater<string_view>. +// +// key_compare_to_adapter is provided so that btree users +// automatically get the more efficient compare-to code when using common +// google string types with common comparison functors. +// These string-like specializations also turn on heterogeneous lookup by +// default. 
+template <typename Compare, typename=void> +struct key_compare_to_adapter { + using type = Compare; +}; + +template <typename K> +struct key_compare_to_adapter<std::less<K>, std::enable_if_t<has_compare_to<K>>> +{ + struct type { + inline int operator()(const K& lhs, const K& rhs) const noexcept { + return lhs.compare(rhs); + } + }; +}; + +template <typename K> +struct key_compare_to_adapter<std::less<K>, std::enable_if_t<std::is_signed_v<K>>> +{ + struct type { + inline K operator()(const K& lhs, const K& rhs) const noexcept { + return lhs - rhs; + } + }; +}; + +template <typename K> +struct key_compare_to_adapter<std::less<K>, std::enable_if_t<std::is_unsigned_v<K>>> +{ + struct type { + inline int operator()(const K& lhs, const K& rhs) const noexcept { + if (lhs < rhs) { + return -1; + } else if (lhs > rhs) { + return 1; + } else { + return 0; + } + } + }; +}; + +template <typename Key, typename Compare, typename Alloc, + int TargetNodeSize, int ValueSize, + bool Multi> +struct common_params { + // If Compare is a common comparator for a std::string-like type, then we adapt it + // to use heterogeneous lookup and to be a key-compare-to comparator. + using key_compare = typename key_compare_to_adapter<Compare>::type; + // A type which indicates if we have a key-compare-to functor or a plain old + // key-compare functor. + using is_key_compare_to = btree_is_key_compare_to<key_compare, Key>; + + using allocator_type = Alloc; + using key_type = Key; + using size_type = std::make_signed<size_t>::type; + using difference_type = ptrdiff_t; + + // True if this is a multiset or multimap. + using is_multi_container = std::integral_constant<bool, Multi>; + + constexpr static int kTargetNodeSize = TargetNodeSize; + constexpr static int kValueSize = ValueSize; + // Upper bound for the available space for values. This is largest for leaf + // nodes, which have overhead of at least a pointer + 3 bytes (for storing + // 3 field_types) + paddings. 
if alignof(key_type) is 1, the size of padding + // would be 0. + constexpr static int kNodeValueSpace = + TargetNodeSize - /*minimum overhead=*/(sizeof(void *) + 4); + + // This is an integral type large enough to hold as many + // ValueSize-values as will fit a node of TargetNodeSize bytes. + using node_count_type = + std::conditional_t<(kNodeValueSpace / ValueSize > + (std::numeric_limits<uint8_t>::max)()), + uint16_t, + uint8_t>; +}; + +// The internal storage type +// +// It is convenient for the value_type of a btree_map<K, V> to be +// pair<const K, V>; the "const K" prevents accidental modification of the key +// when dealing with the reference returned from find() and similar methods. +// However, this creates other problems; we want to be able to emplace(K, V) +// efficiently with move operations, and similarly be able to move a +// pair<K, V> in insert(). +// +// The solution is this union, which aliases the const and non-const versions +// of the pair. This also allows flat_hash_map<const K, V> to work, even though +// that has the same efficiency issues with move in emplace() and insert() - +// but people do it anyway. +template <class K, class V> +union map_slot_type { + map_slot_type() {} + ~map_slot_type() = delete; + map_slot_type& operator=(const map_slot_type& slot) { + mutable_value = slot.mutable_value; + return *this; + } + map_slot_type& operator=(map_slot_type&& slot) { + mutable_value = std::move(slot.mutable_value); + return *this; + } + using value_type = std::pair<const K, V>; + using mutable_value_type = std::pair<K, V>; + + value_type value; + mutable_value_type mutable_value; + K key; +}; + +template <class K, class V> +void swap(map_slot_type<K, V>& lhs, map_slot_type<K, V>& rhs) { + std::swap(lhs.mutable_value, rhs.mutable_value); +} + +// A parameters structure for holding the type parameters for a btree_map. +// Compare and Alloc should be nothrow copy-constructible. 
+template <typename Key, typename Data, typename Compare, typename Alloc, + int TargetNodeSize, bool Multi> +struct map_params : common_params<Key, Compare, Alloc, TargetNodeSize, + sizeof(Key) + sizeof(Data), Multi> { + using super_type = typename map_params::common_params; + using mapped_type = Data; + using value_type = std::pair<const Key, mapped_type>; + using mutable_value_type = std::pair<Key, mapped_type>; + using slot_type = map_slot_type<Key, mapped_type>; + using pointer = value_type*; + using const_pointer = const value_type *; + using reference = value_type &; + using const_reference = const value_type &; + using key_compare = typename super_type::key_compare; + using init_type = mutable_value_type; + + static constexpr size_t kValueSize = sizeof(Key) + sizeof(mapped_type); + + // Inherit from key_compare for empty base class optimization. + struct value_compare : private key_compare { + value_compare() = default; + explicit value_compare(const key_compare &cmp) : key_compare(cmp) {} + + template <typename T, typename U> + auto operator()(const T &left, const U &right) const + -> decltype(std::declval<key_compare>()(left.first, right.first)) { + return key_compare::operator()(left.first, right.first); + } + }; + using is_map_container = std::true_type; + + static const Key &key(const value_type &value) { return value.first; } + static mapped_type &value(value_type *value) { return value->second; } + static const Key &key(const slot_type *slot) { return slot->key; } + static value_type& element(slot_type* slot) { return slot->value; } + static const value_type& element(const slot_type* slot) { return slot->value; } + template <class... Args> + static void construct(Alloc *alloc, slot_type *slot, Args &&... args) { + std::allocator_traits<Alloc>::construct(*alloc, + &slot->mutable_value, + std::forward<Args>(args)...); + } + // Construct this slot by moving from another slot. 
+ static void construct(Alloc* alloc, slot_type* slot, slot_type* other) { + emplace(slot); + std::allocator_traits<Alloc>::construct(*alloc, &slot->value, + std::move(other->value)); + } + static void move(Alloc *alloc, slot_type *src, slot_type *dest) { + dest->mutable_value = std::move(src->mutable_value); + } + static void destroy(Alloc *alloc, slot_type *slot) { + std::allocator_traits<Alloc>::destroy(*alloc, &slot->mutable_value); + } + +private: + static void emplace(slot_type* slot) { + // The construction of union doesn't do anything at runtime but it allows us + // to access its members without violating aliasing rules. + new (slot) slot_type; + } +}; + +// A parameters structure for holding the type parameters for a btree_set. +template <typename Key, typename Compare, typename Alloc, int TargetNodeSize, bool Multi> +struct set_params + : public common_params<Key, Compare, Alloc, TargetNodeSize, + sizeof(Key), Multi> { + using value_type = Key; + using mutable_value_type = value_type; + using slot_type = Key; + using pointer = value_type *; + using const_pointer = const value_type *; + using value_compare = typename set_params::common_params::key_compare; + using reference = value_type &; + using const_reference = const value_type &; + using is_map_container = std::false_type; + using init_type = mutable_value_type; + + template <class... Args> + static void construct(Alloc *alloc, slot_type *slot, Args &&... 
args) { + std::allocator_traits<Alloc>::construct(*alloc, + slot, + std::forward<Args>(args)...); + } + static void construct(Alloc *alloc, slot_type *slot, slot_type *other) { + std::allocator_traits<Alloc>::construct(*alloc, slot, std::move(*other)); + } + static void move(Alloc *alloc, slot_type *src, slot_type *dest) { + *dest = std::move(*src); + } + static void destroy(Alloc *alloc, slot_type *slot) { + std::allocator_traits<Alloc>::destroy(*alloc, slot); + } + static const Key &key(const value_type &x) { return x; } + static const Key &key(const slot_type *slot) { return *slot; } + static value_type &element(slot_type *slot) { return *slot; } + static const value_type &element(const slot_type *slot) { return *slot; } +}; + +// Helper functions to do a boolean comparison of two keys given a boolean +// or three-way comparator. +// SFINAE prevents implicit conversions to bool (such as from int). +template <typename Result> +constexpr bool compare_result_as_less_than(const Result r) { + if constexpr (std::is_signed_v<Result>) { + return r < 0; + } else { + return r; + } +} +// An adapter class that converts a lower-bound compare into an upper-bound +// compare. Note: there is no need to make a version of this adapter specialized +// for key-compare-to functors because the upper-bound (the first value greater +// than the input) is never an exact match. +template <typename Compare> +struct upper_bound_adapter { + explicit upper_bound_adapter(const Compare &c) : comp(c) {} + template <typename K, typename LK> + bool operator()(const K &a, const LK &b) const { + // Returns true when a is not greater than b. 
+ return !compare_result_as_less_than(comp(b, a)); + } +private: + const Compare& comp; +}; + +enum class MatchKind : uint8_t { kEq, kNe }; + +template <typename V, bool IsCompareTo> +struct SearchResult { + V value; + MatchKind match; + + static constexpr bool has_match = true; + bool IsEq() const { return match == MatchKind::kEq; } +}; + +// When we don't use CompareTo, `match` is not present. +// This ensures that callers can't use it accidentally when it provides no +// useful information. +template <typename V> +struct SearchResult<V, false> { + V value; + + static constexpr bool has_match = false; + static constexpr bool IsEq() { return false; } +}; + +// A node in the btree holding. The same node type is used for both internal +// and leaf nodes in the btree, though the nodes are allocated in such a way +// that the children array is only valid in internal nodes. +template <typename Params> +class btree_node { + using is_key_compare_to = typename Params::is_key_compare_to; + using is_multi_container = typename Params::is_multi_container; + using field_type = typename Params::node_count_type; + using allocator_type = typename Params::allocator_type; + using slot_type = typename Params::slot_type; + + public: + using params_type = Params; + using key_type = typename Params::key_type; + using value_type = typename Params::value_type; + using mutable_value_type = typename Params::mutable_value_type; + using pointer = typename Params::pointer; + using const_pointer = typename Params::const_pointer; + using reference = typename Params::reference; + using const_reference = typename Params::const_reference; + using key_compare = typename Params::key_compare; + using size_type = typename Params::size_type; + using difference_type = typename Params::difference_type; + + // Btree decides whether to use linear node search as follows: + // - If the key is arithmetic and the comparator is std::less or + // std::greater, choose linear. + // - Otherwise, choose binary. 
+ // TODO(ezb): Might make sense to add condition(s) based on node-size. + using use_linear_search = std::integral_constant< + bool, + std::is_arithmetic_v<key_type> && + (std::is_same_v<std::less<key_type>, key_compare> || + std::is_same_v<std::greater<key_type>, key_compare>)>; + + ~btree_node() = default; + btree_node(const btree_node&) = delete; + btree_node& operator=(const btree_node&) = delete; + + protected: + btree_node() = default; + + private: + constexpr static size_type SizeWithNValues(size_type n) { + return sizeof(base_fields) + n * sizeof(value_type);; + } + // A lower bound for the overhead of fields other than values in a leaf node. + constexpr static size_type MinimumOverhead() { + return SizeWithNValues(1) - sizeof(value_type); + } + + // Compute how many values we can fit onto a leaf node taking into account + // padding. + constexpr static size_type NodeTargetValues(const int begin, const int end) { + return begin == end ? begin + : SizeWithNValues((begin + end) / 2 + 1) > + params_type::kTargetNodeSize + ? NodeTargetValues(begin, (begin + end) / 2) + : NodeTargetValues((begin + end) / 2 + 1, end); + } + + constexpr static int kValueSize = params_type::kValueSize; + constexpr static int kTargetNodeSize = params_type::kTargetNodeSize; + constexpr static int kNodeTargetValues = NodeTargetValues(0, kTargetNodeSize); + + // We need a minimum of 3 values per internal node in order to perform + // splitting (1 value for the two nodes involved in the split and 1 value + // propagated to the parent as the delimiter for the split). + constexpr static size_type kNodeValues = std::max(kNodeTargetValues, 3); + + // The node is internal (i.e. is not a leaf node) if and only if `max_count` + // has this value. + constexpr static size_type kInternalNodeMaxCount = 0; + + struct base_fields { + // A pointer to the node's parent. + btree_node *parent; + // The position of the node in the node's parent. 
    field_type position;
    // The count of the number of values in the node.
    field_type count;
    // The maximum number of values the node can hold.
    field_type max_count;
  };

  struct leaf_fields : public base_fields {
    // The array of values. Only the first count of these values have been
    // constructed and are valid.
    slot_type values[kNodeValues];
  };

  struct internal_fields : public leaf_fields {
    // The array of child pointers. The keys in children_[i] are all less than
    // key(i). The keys in children_[i + 1] are all greater than key(i). There
    // are always count + 1 children.
    btree_node *children[kNodeValues + 1];
  };

  // Byte sizes used when allocating nodes of each flavor.
  constexpr static size_type LeafSize(const int max_values = kNodeValues) {
    return SizeWithNValues(max_values);
  }
  constexpr static size_type InternalSize() {
    return sizeof(internal_fields);
  }

  // All field access goes through a cast to internal_fields.
  // NOTE(review): this presumes leaf_fields is a layout prefix of
  // internal_fields, so leaf nodes only ever touch members that exist in
  // their (smaller) allocation — confirm against the allocation sites.
  template<auto MemPtr>
  auto& GetField() {
    return reinterpret_cast<internal_fields*>(this)->*MemPtr;
  }

  template<auto MemPtr>
  auto& GetField() const {
    return reinterpret_cast<const internal_fields*>(this)->*MemPtr;
  }

  void set_parent(btree_node *p) { GetField<&base_fields::parent>() = p; }
  field_type &mutable_count() { return GetField<&base_fields::count>(); }
  slot_type *slot(int i) { return &GetField<&leaf_fields::values>()[i]; }
  const slot_type *slot(int i) const { return &GetField<&leaf_fields::values>()[i]; }
  void set_position(field_type v) { GetField<&base_fields::position>() = v; }
  void set_count(field_type v) { GetField<&base_fields::count>() = v; }
  // This method is only called by the node init methods.
  void set_max_count(field_type v) { GetField<&base_fields::max_count>() = v; }

 public:
  constexpr static size_type Alignment() {
    static_assert(alignof(leaf_fields) == alignof(internal_fields),
                  "Alignment of all nodes must be equal.");
    return alignof(internal_fields);
  }

  // Getter/setter for whether this is a leaf node or not.
  // This value doesn't change after the node is created.
  bool leaf() const { return GetField<&base_fields::max_count>() != kInternalNodeMaxCount; }

  // Getter for the position of this node in its parent.
  field_type position() const { return GetField<&base_fields::position>(); }

  // Getter for the number of values stored in this node.
  field_type count() const { return GetField<&base_fields::count>(); }
  field_type max_count() const {
    // Internal nodes have max_count==kInternalNodeMaxCount.
    // Leaf nodes have max_count in [1, kNodeValues].
    const field_type max_count = GetField<&base_fields::max_count>();
    return max_count == field_type{kInternalNodeMaxCount}
               ? field_type{kNodeValues}
               : max_count;
  }

  // Getter for the parent of this node.
  btree_node* parent() const { return GetField<&base_fields::parent>(); }
  // Getter for whether the node is the root of the tree. The parent of the
  // root of the tree is the leftmost node in the tree which is guaranteed to
  // be a leaf.
  bool is_root() const { return parent()->leaf(); }
  void make_root() {
    assert(parent()->is_root());
    set_parent(parent()->parent());
  }

  // Getters for the key/value at position i in the node.
  const key_type& key(int i) const { return params_type::key(slot(i)); }
  reference value(int i) { return params_type::element(slot(i)); }
  const_reference value(int i) const { return params_type::element(slot(i)); }

  // Getters/setter for the child at position i in the node.
  btree_node* child(int i) const { return GetField<&internal_fields::children>()[i]; }
  btree_node*& mutable_child(int i) { return GetField<&internal_fields::children>()[i]; }
  // Debug-only: zero a child slot so stale pointers are caught early.
  void clear_child(int i) {
#ifndef NDEBUG
    memset(&mutable_child(i), 0, sizeof(btree_node*));
#endif
  }
  void set_child(int i, btree_node *c) {
    mutable_child(i) = c;
    c->set_position(i);
  }
  void init_child(int i, btree_node *c) {
    set_child(i, c);
    c->set_parent(this);
  }
  // Returns the position of the first value whose key is not less than k.
  template <typename K>
  SearchResult<int, is_key_compare_to::value> lower_bound(
      const K &k, const key_compare &comp) const {
    return use_linear_search::value ? linear_search(k, comp)
                                    : binary_search(k, comp);
  }
  // Returns the position of the first value whose key is greater than k.
  template <typename K>
  int upper_bound(const K &k, const key_compare &comp) const {
    auto upper_compare = upper_bound_adapter<key_compare>(comp);
    return use_linear_search::value ? linear_search(k, upper_compare).value
                                    : binary_search(k, upper_compare).value;
  }

  // Dispatch to the _impl overload matching the comparator kind
  // (plain less-than vs. three-way compare-to).
  template <typename K, typename Compare>
  SearchResult<int, btree_is_key_compare_to<Compare, key_type>::value>
  linear_search(const K &k, const Compare &comp) const {
    return linear_search_impl(k, 0, count(), comp,
                              btree_is_key_compare_to<Compare, key_type>());
  }

  template <typename K, typename Compare>
  SearchResult<int, btree_is_key_compare_to<Compare, key_type>::value>
  binary_search(const K &k, const Compare &comp) const {
    return binary_search_impl(k, 0, count(), comp,
                              btree_is_key_compare_to<Compare, key_type>());
  }
  // Returns the position of the first value whose key is not less than k using
  // linear search performed using plain compare.
  template <typename K, typename Compare>
  SearchResult<int, false> linear_search_impl(
      const K &k, int s, const int e, const Compare &comp,
      std::false_type /* IsCompareTo */) const {
    // Scan forward until key(s) >= k (i.e. comp(key(s), k) is false).
    while (s < e) {
      if (!comp(key(s), k)) {
        break;
      }
      ++s;
    }
    return {s};
  }

  // Returns the position of the first value whose key is not less than k using
  // linear search performed using compare-to.
  template <typename K, typename Compare>
  SearchResult<int, true> linear_search_impl(
      const K &k, int s, const int e, const Compare &comp,
      std::true_type /* IsCompareTo */) const {
    while (s < e) {
      const auto c = comp(key(s), k);
      if (c == 0) {
        return {s, MatchKind::kEq};
      } else if (c > 0) {
        break;
      }
      ++s;
    }
    return {s, MatchKind::kNe};
  }

  // Returns the position of the first value whose key is not less than k using
  // binary search performed using plain compare.
  template <typename K, typename Compare>
  SearchResult<int, false> binary_search_impl(
      const K &k, int s, int e, const Compare &comp,
      std::false_type /* IsCompareTo */) const {
    while (s != e) {
      const int mid = (s + e) >> 1;
      if (comp(key(mid), k)) {
        s = mid + 1;
      } else {
        e = mid;
      }
    }
    return {s};
  }

  // Returns the position of the first value whose key is not less than k using
  // binary search performed using compare-to.
  template <typename K, typename CompareTo>
  SearchResult<int, true> binary_search_impl(
      const K &k, int s, int e, const CompareTo &comp,
      std::true_type /* IsCompareTo */) const {
    if constexpr (is_multi_container::value) {
      MatchKind exact_match = MatchKind::kNe;
      while (s != e) {
        const int mid = (s + e) >> 1;
        const auto c = comp(key(mid), k);
        if (c < 0) {
          s = mid + 1;
        } else {
          e = mid;
          if (c == 0) {
            // Need to return the first value whose key is not less than k,
            // which requires continuing the binary search if this is a
            // multi-container.
            exact_match = MatchKind::kEq;
          }
        }
      }
      return {s, exact_match};
    } else {  // Not a multi-container.
      // Unique containers can stop at the first exact match.
      while (s != e) {
        const int mid = (s + e) >> 1;
        const auto c = comp(key(mid), k);
        if (c < 0) {
          s = mid + 1;
        } else if (c > 0) {
          e = mid;
        } else {
          return {mid, MatchKind::kEq};
        }
      }
      return {s, MatchKind::kNe};
    }
  }

  // Emplaces a value at position i, shifting all existing values and
  // children at positions >= i to the right by 1.
  template <typename... Args>
  void emplace_value(size_type i, allocator_type *alloc, Args &&... args);

  // Removes the value at position i, shifting all existing values and children
  // at positions > i to the left by 1.
  void remove_value(const int i, allocator_type *alloc);

  // Removes the values at positions [i, i + to_erase), shifting all values
  // after that range to the left by to_erase. Does not change children at all.
  void remove_values_ignore_children(int i, int to_erase,
                                     allocator_type *alloc);

  // Rebalances a node with its right sibling.
  void rebalance_right_to_left(const int to_move, btree_node *right,
                               allocator_type *alloc);
  void rebalance_left_to_right(const int to_move, btree_node *right,
                               allocator_type *alloc);

  // Splits a node, moving a portion of the node's values to its right sibling.
  void split(const int insert_position, btree_node *dest, allocator_type *alloc);

  // Merges a node with its right sibling, moving all of the values and the
  // delimiting key in the parent node onto itself.
  void merge(btree_node *sibling, allocator_type *alloc);

  // Swap the contents of "this" and "src".
  void swap(btree_node *src, allocator_type *alloc);

  // Node allocation/deletion routines.
  // Initializes raw storage n as an empty leaf with the given parent and
  // capacity. Used for both leaf nodes and (via init_internal) internal ones.
  static btree_node *init_leaf(btree_node *n, btree_node *parent,
                               int max_count) {
    n->set_parent(parent);
    n->set_position(0);
    n->set_count(0);
    n->set_max_count(max_count);
    return n;
  }
  static btree_node *init_internal(btree_node *n, btree_node *parent) {
    init_leaf(n, parent, kNodeValues);
    // Set `max_count` to a sentinel value to indicate that this node is
    // internal.
    n->set_max_count(kInternalNodeMaxCount);
    return n;
  }
  // Destroys the constructed values in this node (storage is freed by the
  // owning btree, not here).
  void destroy(allocator_type *alloc) {
    for (int i = 0; i < count(); ++i) {
      value_destroy(i, alloc);
    }
  }

 private:
  // Construct/destroy the value in slot i via the params-provided policy.
  template <typename... Args>
  void value_init(const size_type i, allocator_type *alloc, Args &&... args) {
    params_type::construct(alloc, slot(i), std::forward<Args>(args)...);
  }
  void value_destroy(const size_type i, allocator_type *alloc) {
    params_type::destroy(alloc, slot(i));
  }

  // Move n values starting at value i in this node into the values starting at
  // value j in node x.
  void uninitialized_move_n(const size_type n, const size_type i,
                            const size_type j, btree_node *x,
                            allocator_type *alloc) {
    for (slot_type *src = slot(i), *end = src + n, *dest = x->slot(j);
         src != end; ++src, ++dest) {
      params_type::construct(alloc, dest, src);
    }
  }

  // Destroys a range of n values, starting at index i.
  void value_destroy_n(const size_type i, const size_type n,
                       allocator_type *alloc) {
    for (int j = 0; j < n; ++j) {
      value_destroy(i + j, alloc);
    }
  }

 private:
  template <typename P>
  friend class btree;
  template <typename N, typename R, typename P>
  friend struct btree_iterator;
};

// Bidirectional iterator over a btree. Instantiated twice per tree: once as
// the mutable iterator and once (with const Node/Reference/Pointer) as the
// const_iterator.
template <typename Node, typename Reference, typename Pointer>
struct btree_iterator {
 private:
  using key_type = typename Node::key_type;
  using size_type = typename Node::size_type;
  using params_type = typename Node::params_type;

  using node_type = Node;
  using normal_node = typename std::remove_const<Node>::type;
  using const_node = const Node;
  using normal_pointer = typename params_type::pointer;
  using normal_reference = typename params_type::reference;
  using const_pointer = typename params_type::const_pointer;
  using const_reference = typename params_type::const_reference;
  using slot_type = typename params_type::slot_type;

  using iterator =
      btree_iterator<normal_node, normal_reference, normal_pointer>;
  using const_iterator =
      btree_iterator<const_node, const_reference, const_pointer>;

 public:
  // These aliases are public for std::iterator_traits.
  using difference_type = typename Node::difference_type;
  using value_type = typename params_type::value_type;
  using pointer = Pointer;
  using reference = Reference;
  using iterator_category = std::bidirectional_iterator_tag;

  btree_iterator() = default;
  btree_iterator(Node *n, int p) : node(n), position(p) {}

  // NOTE: this SFINAE allows for implicit conversions from iterator to
  // const_iterator, but it specifically avoids defining copy constructors so
  // that btree_iterator can be trivially copyable. This is for performance and
  // binary size reasons.
  template<typename N, typename R, typename P,
           std::enable_if_t<
               std::is_same_v<btree_iterator<N, R, P>, iterator> &&
                   std::is_same_v<btree_iterator, const_iterator>,
               int> = 0>
  btree_iterator(const btree_iterator<N, R, P> &x)
      : node(x.node), position(x.position) {}

 private:
  // This SFINAE allows explicit conversions from const_iterator to
  // iterator, but also avoids defining a copy constructor.
  // NOTE: the const_cast is safe because this constructor is only called by
  // non-const methods and the container owns the nodes.
  template <typename N, typename R, typename P,
            std::enable_if_t<
                std::is_same_v<btree_iterator<N, R, P>, const_iterator> &&
                    std::is_same_v<btree_iterator, iterator>,
                int> = 0>
  explicit btree_iterator(const btree_iterator<N, R, P> &x)
      : node(const_cast<node_type *>(x.node)), position(x.position) {}

  // Increment/decrement the iterator. The fast path stays within the current
  // leaf; crossing a node boundary falls back to the out-of-line *_slow().
  void increment() {
    if (node->leaf() && ++position < node->count()) {
      return;
    }
    increment_slow();
  }
  void increment_slow();

  void decrement() {
    if (node->leaf() && --position >= 0) {
      return;
    }
    decrement_slow();
  }
  void decrement_slow();

 public:
  // Comparisons against both iterator flavors; equality is node identity
  // plus position.
  bool operator==(const const_iterator &x) const {
    return node == x.node && position == x.position;
  }
  bool operator!=(const const_iterator &x) const {
    return node != x.node || position != x.position;
  }
  bool operator==(const iterator& x) const {
    return node == x.node && position == x.position;
  }
  bool operator!=(const iterator& x) const {
    return node != x.node || position != x.position;
  }

  // Accessors for the key/value the iterator is pointing at.
  reference operator*() const {
    return node->value(position);
  }
  pointer operator->() const {
    return &node->value(position);
  }

  btree_iterator& operator++() {
    increment();
    return *this;
  }
  btree_iterator& operator--() {
    decrement();
    return *this;
  }
  btree_iterator operator++(int) {
    btree_iterator tmp = *this;
    ++*this;
    return tmp;
  }
  btree_iterator operator--(int) {
    btree_iterator tmp = *this;
    --*this;
    return tmp;
  }

 private:
  template <typename Params>
  friend class btree;
  template <typename Tree>
  friend class btree_container;
  template <typename Tree>
  friend class btree_set_container;
  template <typename Tree>
  friend class btree_map_container;
  template <typename Tree>
  friend class btree_multiset_container;
  template <typename N, typename R, typename P>
  friend struct btree_iterator;

  // Internal helpers used by btree and the container adapters.
  const key_type &key() const { return node->key(position); }
  slot_type *slot() { return node->slot(position); }

  // The node in the tree the iterator is pointing at.
  Node *node = nullptr;
  // The position within the node of the tree the iterator is pointing at.
  int position = -1;
};

// Adapts an arbitrary allocator to hand out raw storage with (at least) the
// given alignment, by rebinding to an over-aligned dummy type M and rounding
// byte sizes up to a whole number of M objects.
template <size_t Alignment, class Alloc>
class AlignedAlloc {
  struct alignas(Alignment) M {};
  using alloc_t =
      typename std::allocator_traits<Alloc>::template rebind_alloc<M>;
  using traits_t =
      typename std::allocator_traits<Alloc>::template rebind_traits<M>;
  // Number of M objects needed to cover `size` bytes (round up).
  static constexpr size_t num_aligned_objects(size_t size) {
    return (size + sizeof(M) - 1) / sizeof(M);
  }
public:
  static void* allocate(Alloc* alloc, size_t size) {
    alloc_t aligned_alloc(*alloc);
    void* p = traits_t::allocate(aligned_alloc,
                                 num_aligned_objects(size));
    assert(reinterpret_cast<uintptr_t>(p) % Alignment == 0 &&
           "allocator does not respect alignment");
    return p;
  }
  // Must be passed the same `size` that was passed to allocate().
  static void deallocate(Alloc* alloc, void* p, size_t size) {
    alloc_t aligned_alloc(*alloc);
    traits_t::deallocate(aligned_alloc, static_cast<M*>(p),
                         num_aligned_objects(size));
  }
};

template <typename Params>
class btree {
  using node_type = btree_node<Params>;
  using is_key_compare_to = typename Params::is_key_compare_to;

  // We use a static empty node for the root/leftmost/rightmost of empty btrees
  // in order to avoid branching in begin()/end().
  struct alignas(node_type::Alignment()) EmptyNodeType : node_type {
    using field_type = typename node_type::field_type;
    node_type *parent;
    field_type position = 0;
    field_type count = 0;
    // max_count must be != kInternalNodeMaxCount (so that this node is regarded
    // as a leaf node). max_count() is never called when the tree is empty.
    field_type max_count = node_type::kInternalNodeMaxCount + 1;

    constexpr EmptyNodeType(node_type *p) : parent(p) {}
  };

  static node_type *EmptyNode() {
    // A single shared, immutable empty node whose parent points at itself.
    static constexpr EmptyNodeType empty_node(
        const_cast<EmptyNodeType *>(&empty_node));
    return const_cast<EmptyNodeType *>(&empty_node);
  }

  constexpr static int kNodeValues = node_type::kNodeValues;
  constexpr static int kMinNodeValues = kNodeValues / 2;
  constexpr static int kValueSize = node_type::kValueSize;

  // A helper class to get the empty base class optimization for 0-size
  // allocators. Base is allocator_type.
  // (e.g. empty_base_handle<key_compare, allocator_type, node_type*>). If Base is
  // 0-size, the compiler doesn't have to reserve any space for it and
  // sizeof(empty_base_handle) will simply be sizeof(Data). Google [empty base
  // class optimization] for more details.
  template <typename Base1, typename Base2, typename Data>
  struct empty_base_handle : public Base1, Base2 {
    empty_base_handle(const Base1 &b1, const Base2 &b2, const Data &d)
        : Base1(b1),
          Base2(b2),
          data(d) {}
    Data data;
  };

  // Leaf/internal node counts, aggregated over a subtree.
  struct node_stats {
    using size_type = typename Params::size_type;

    node_stats(size_type l, size_type i)
        : leaf_nodes(l),
          internal_nodes(i) {
    }

    node_stats& operator+=(const node_stats &x) {
      leaf_nodes += x.leaf_nodes;
      internal_nodes += x.internal_nodes;
      return *this;
    }

    size_type leaf_nodes;
    size_type internal_nodes;
  };

 public:
  using key_type = typename Params::key_type;
  using value_type = typename Params::value_type;
  using size_type = typename Params::size_type;
  using difference_type = typename Params::difference_type;
  using key_compare = typename Params::key_compare;
  using value_compare = typename Params::value_compare;
  using allocator_type = typename Params::allocator_type;
  using reference = typename Params::reference;
  using const_reference = typename Params::const_reference;
  using pointer = typename
      Params::pointer;
  using const_pointer = typename Params::const_pointer;
  using iterator = btree_iterator<node_type, reference, pointer>;
  using const_iterator = typename iterator::const_iterator;
  using reverse_iterator = std::reverse_iterator<iterator>;
  using const_reverse_iterator = std::reverse_iterator<const_iterator>;

  // Internal types made public for use by btree_container types.
  using params_type = Params;

 private:
  // For use in copy_or_move_values_in_order.
  const value_type &maybe_move_from_iterator(const_iterator x) { return *x; }
  value_type &&maybe_move_from_iterator(iterator x) { return std::move(*x); }

  // Copies or moves (depending on the template parameter) the values in
  // x into this btree in their order in x. This btree must be empty before this
  // method is called. This method is used in copy construction, copy
  // assignment, and move assignment.
  template <typename Btree>
  void copy_or_move_values_in_order(Btree *x);

  // Validates that various assumptions/requirements are true at compile time.
  constexpr static bool static_assert_validation();

 public:
  btree(const key_compare &comp, const allocator_type &alloc);

  btree(const btree &x);
  // Move construction steals x's nodes and leaves x empty (pointing at the
  // shared static EmptyNode).
  btree(btree &&x) noexcept
      : root_(std::move(x.root_)),
        rightmost_(std::exchange(x.rightmost_, EmptyNode())),
        size_(std::exchange(x.size_, 0)) {
    x.mutable_root() = EmptyNode();
  }

  ~btree() {
    // Put static_asserts in destructor to avoid triggering them before the type
    // is complete.
    static_assert(static_assert_validation(), "This call must be elided.");
    clear();
  }

  // Assign the contents of x to *this.
  btree &operator=(const btree &x);
  btree &operator=(btree &&x) noexcept;

  // Iteration. leftmost()/rightmost_ are valid even for an empty tree (they
  // point at the static empty node), so these never branch.
  iterator begin() {
    return iterator(leftmost(), 0);
  }
  const_iterator begin() const {
    return const_iterator(leftmost(), 0);
  }
  iterator end() {
    return iterator(rightmost_, rightmost_->count());
  }
  const_iterator end() const {
    return const_iterator(rightmost_, rightmost_->count());
  }
  reverse_iterator rbegin() {
    return reverse_iterator(end());
  }
  const_reverse_iterator rbegin() const {
    return const_reverse_iterator(end());
  }
  reverse_iterator rend() {
    return reverse_iterator(begin());
  }
  const_reverse_iterator rend() const {
    return const_reverse_iterator(begin());
  }

  // Finds the first element whose key is not less than key.
  template <typename K>
  iterator lower_bound(const K &key) {
    return internal_end(internal_lower_bound(key));
  }
  template <typename K>
  const_iterator lower_bound(const K &key) const {
    return internal_end(internal_lower_bound(key));
  }

  // Finds the first element whose key is greater than key.
  template <typename K>
  iterator upper_bound(const K &key) {
    return internal_end(internal_upper_bound(key));
  }
  template <typename K>
  const_iterator upper_bound(const K &key) const {
    return internal_end(internal_upper_bound(key));
  }

  // Finds the range of values which compare equal to key. The first member of
  // the returned pair is equal to lower_bound(key). The second member pair of
  // the pair is equal to upper_bound(key).
  template <typename K>
  std::pair<iterator, iterator> equal_range(const K &key) {
    return {lower_bound(key), upper_bound(key)};
  }
  template <typename K>
  std::pair<const_iterator, const_iterator> equal_range(const K &key) const {
    return {lower_bound(key), upper_bound(key)};
  }

  // Inserts a value into the btree only if it does not already exist. The
  // boolean return value indicates whether insertion succeeded or failed.
  // Requirement: if `key` already exists in the btree, does not consume `args`.
  // Requirement: `key` is never referenced after consuming `args`.
  template <typename... Args>
  std::pair<iterator, bool> insert_unique(const key_type &key, Args &&... args);

  // Inserts with hint. Checks to see if the value should be placed immediately
  // before `position` in the tree. If so, then the insertion will take
  // amortized constant time. If not, the insertion will take amortized
  // logarithmic time as if a call to insert_unique() were made.
  // Requirement: if `key` already exists in the btree, does not consume `args`.
  // Requirement: `key` is never referenced after consuming `args`.
  template <typename... Args>
  std::pair<iterator, bool> insert_hint_unique(iterator position,
                                               const key_type &key,
                                               Args &&... args);

  // Insert a range of values into the btree.
  template <typename InputIterator>
  void insert_iterator_unique(InputIterator b, InputIterator e);

  // Inserts a value into the btree (duplicate keys allowed).
  template <typename ValueType>
  iterator insert_multi(const key_type &key, ValueType &&v);

  // Inserts a value into the btree, extracting the key from the value.
  template <typename ValueType>
  iterator insert_multi(ValueType &&v) {
    return insert_multi(params_type::key(v), std::forward<ValueType>(v));
  }

  // Insert with hint. Check to see if the value should be placed immediately
  // before position in the tree. If it does, then the insertion will take
  // amortized constant time. If not, the insertion will take amortized
  // logarithmic time as if a call to insert_multi(v) were made.
  template <typename ValueType>
  iterator insert_hint_multi(iterator position, ValueType &&v);

  // Insert a range of values into the btree.
  template <typename InputIterator>
  void insert_iterator_multi(InputIterator b, InputIterator e);

  // Erase the specified iterator from the btree. The iterator must be valid
  // (i.e. not equal to end()).
  // Return an iterator pointing to the node after
  // the one that was erased (or end() if none exists).
  // Requirement: does not read the value at `*iter`.
  iterator erase(iterator iter);

  // Erases range. Returns the number of keys erased and an iterator pointing
  // to the element after the last erased element.
  std::pair<size_type, iterator> erase(iterator begin, iterator end);

  // Erases the specified key from the btree. Returns 1 if an element was
  // erased and 0 otherwise.
  template <typename K>
  size_type erase_unique(const K &key);

  // Erases all of the entries matching the specified key from the
  // btree. Returns the number of elements erased.
  template <typename K>
  size_type erase_multi(const K &key);

  // Finds the iterator corresponding to a key or returns end() if the key is
  // not present.
  template <typename K>
  iterator find(const K &key) {
    return internal_end(internal_find(key));
  }
  template <typename K>
  const_iterator find(const K &key) const {
    return internal_end(internal_find(key));
  }

  // Returns a count of the number of times the key appears in the btree.
  // For unique containers the answer is 0 or 1.
  template <typename K>
  size_type count_unique(const K &key) const {
    const iterator begin = internal_find(key);
    if (begin.node == nullptr) {
      // The key doesn't exist in the tree.
      return 0;
    }
    return 1;
  }
  // Returns a count of the number of times the key appears in the btree.
  template <typename K>
  size_type count_multi(const K &key) const {
    const auto range = equal_range(key);
    return std::distance(range.first, range.second);
  }

  // Clear the btree, deleting all of the values it contains.
  void clear();

  // Swap the contents of *this and x.
  void swap(btree &x);

  // The comparator and allocator are stored as (possibly empty) bases of
  // root_; key_comp() just upcasts.
  const key_compare &key_comp() const noexcept {
    return *static_cast<const key_compare*>(&root_);
  }
  template <typename K, typename LK>
  bool compare_keys(const K &x, const LK &y) const {
    return compare_result_as_less_than(key_comp()(x, y));
  }

  // Verifies the structure of the btree.
  void verify() const;

  // Size routines.
  size_type size() const { return size_; }
  size_type max_size() const { return std::numeric_limits<size_type>::max(); }
  bool empty() const { return size_ == 0; }

  // The height of the btree. An empty tree will have height 0.
  size_type height() const {
    size_type h = 0;
    if (!empty()) {
      // Count the length of the chain from the leftmost node up to the
      // root. We actually count from the root back around to the level below
      // the root, but the calculation is the same because of the circularity
      // of that traversal.
      const node_type *n = root();
      do {
        ++h;
        n = n->parent();
      } while (n != root());
    }
    return h;
  }

  // The number of internal, leaf and total nodes used by the btree.
  size_type leaf_nodes() const {
    return internal_stats(root()).leaf_nodes;
  }
  size_type internal_nodes() const {
    return internal_stats(root()).internal_nodes;
  }
  size_type nodes() const {
    node_stats stats = internal_stats(root());
    return stats.leaf_nodes + stats.internal_nodes;
  }

  // The total number of bytes used by the btree.
  size_type bytes_used() const {
    node_stats stats = internal_stats(root());
    if (stats.leaf_nodes == 1 && stats.internal_nodes == 0) {
      // Single (possibly small) leaf root: use its actual capacity.
      return sizeof(*this) +
             node_type::LeafSize(root()->max_count());
    } else {
      return sizeof(*this) +
             stats.leaf_nodes * node_type::LeafSize() +
             stats.internal_nodes * node_type::InternalSize();
    }
  }

  // The average number of bytes used per value stored in the btree.
  static double average_bytes_per_value() {
    // Returns the number of bytes per value on a leaf node that is 75%
    // full.
Experimentally, this matches up nicely with the computed number of + // bytes per value in trees that had their values inserted in random order. + return node_type::LeafSize() / (kNodeValues * 0.75); + } + + // The fullness of the btree. Computed as the number of elements in the btree + // divided by the maximum number of elements a tree with the current number + // of nodes could hold. A value of 1 indicates perfect space + // utilization. Smaller values indicate space wastage. + // Returns 0 for empty trees. + double fullness() const { + if (empty()) return 0.0; + return static_cast<double>(size()) / (nodes() * kNodeValues); + } + // The overhead of the btree structure in bytes per node. Computed as the + // total number of bytes used by the btree minus the number of bytes used for + // storing elements divided by the number of elements. + // Returns 0 for empty trees. + double overhead() const { + if (empty()) return 0.0; + return (bytes_used() - size() * sizeof(value_type)) / + static_cast<double>(size()); + } + + // The allocator used by the btree. + allocator_type get_allocator() const { + return allocator(); + } + + private: + // Internal accessor routines. + node_type *root() { return root_.data; } + const node_type *root() const { return root_.data; } + node_type *&mutable_root() { return root_.data; } + key_compare *mutable_key_comp() noexcept { + return static_cast<key_compare*>(&root_); + } + + node_type* rightmost() { + return rightmost_; + } + const node_type* rightmost() const { + return rightmost_; + } + // The leftmost node is stored as the parent of the root node. + node_type* leftmost() { return root() ? root()->parent() : NULL; } + const node_type* leftmost() const { return root() ? root()->parent() : NULL; } + + // The size of the tree is stored in the root node. + size_type* mutable_size() { return root()->mutable_size(); } + + // Allocator routines. 
  allocator_type* mutable_allocator() noexcept {
    return static_cast<allocator_type*>(&root_);
  }
  const allocator_type& allocator() const noexcept {
    return *static_cast<const allocator_type*>(&root_);
  }

  // Allocates `size` bytes of node-aligned storage from the tree's allocator.
  node_type *allocate(const size_type size) {
    using aligned_alloc_t =
        AlignedAlloc<node_type::Alignment(), allocator_type>;
    return static_cast<node_type*>(
        aligned_alloc_t::allocate(mutable_allocator(), size));
  }

  // Node creation/deletion routines.
  node_type* new_internal_node(node_type *parent) {
    node_type *p = allocate(node_type::InternalSize());
    return node_type::init_internal(p, parent);
  }
  node_type* new_leaf_node(node_type *parent) {
    node_type *p = allocate(node_type::LeafSize());
    return node_type::init_leaf(p, parent, kNodeValues);
  }
  // A leaf root may be allocated smaller than a full leaf (max_count values).
  node_type *new_leaf_root_node(const int max_count) {
    node_type *p = allocate(node_type::LeafSize(max_count));
    return node_type::init_leaf(p, p, max_count);
  }

  // Deletion helper routines.
  void erase_same_node(iterator begin, iterator end);
  iterator erase_from_leaf_node(iterator begin, size_type to_erase);
  iterator rebalance_after_delete(iterator iter);

  // Deallocates a node of a certain size in bytes using the allocator.
  void deallocate(const size_type size, node_type *node) {
    using aligned_alloc_t =
        AlignedAlloc<node_type::Alignment(), allocator_type>;
    aligned_alloc_t::deallocate(mutable_allocator(), node, size);
  }

  void delete_internal_node(node_type *node) {
    node->destroy(mutable_allocator());
    deallocate(node_type::InternalSize(), node);
  }
  void delete_leaf_node(node_type *node) {
    node->destroy(mutable_allocator());
    deallocate(node_type::LeafSize(node->max_count()), node);
  }

  // Rebalances or splits the node iter points to.
  void rebalance_or_split(iterator *iter);

  // Merges the values of left, right and the delimiting key on their parent
  // onto left, removing the delimiting key and deleting right.
  void merge_nodes(node_type *left, node_type *right);

  // Tries to merge node with its left or right sibling, and failing that,
  // rebalance with its left or right sibling. Returns true if a merge
  // occurred, at which point it is no longer valid to access node. Returns
  // false if no merging took place.
  bool try_merge_or_rebalance(iterator *iter);

  // Tries to shrink the height of the tree by 1.
  void try_shrink();

  // Maps the internal "not found" sentinel (null node) to end().
  iterator internal_end(iterator iter) {
    return iter.node != nullptr ? iter : end();
  }
  const_iterator internal_end(const_iterator iter) const {
    return iter.node != nullptr ? iter : end();
  }

  // Emplaces a value into the btree immediately before iter. Requires that
  // key(v) <= iter.key() and (--iter).key() <= key(v).
  template <typename... Args>
  iterator internal_emplace(iterator iter, Args &&... args);

  // Returns an iterator pointing to the first value >= the value "iter" is
  // pointing at. Note that "iter" might be pointing to an invalid location as
  // iter.position == iter.node->count(). This routine simply moves iter up in
  // the tree to a valid location.
  // Requires: iter.node is non-null.
  template <typename IterType>
  static IterType internal_last(IterType iter);

  // Returns an iterator pointing to the leaf position at which key would
  // reside in the tree. We provide 2 versions of internal_locate. The first
  // version uses a less-than comparator and is incapable of distinguishing when
  // there is an exact match. The second version is for the key-compare-to
  // specialization and distinguishes exact matches. The key-compare-to
  // specialization allows the caller to avoid a subsequent comparison to
  // determine if an exact match was made, which is important for keys with
  // expensive comparison, such as strings.
  template <typename K>
  SearchResult<iterator, is_key_compare_to::value> internal_locate(
      const K &key) const;

  template <typename K>
  SearchResult<iterator, false> internal_locate_impl(
      const K &key, std::false_type /* IsCompareTo */) const;

  template <typename K>
  SearchResult<iterator, true> internal_locate_impl(
      const K &key, std::true_type /* IsCompareTo */) const;

  // Internal routine which implements lower_bound().
  template <typename K>
  iterator internal_lower_bound(const K &key) const;

  // Internal routine which implements upper_bound().
  template <typename K>
  iterator internal_upper_bound(const K &key) const;

  // Internal routine which implements find().
  template <typename K>
  iterator internal_find(const K &key) const;

  // Deletes a node and all of its children.
  void internal_clear(node_type *node);

  // Verifies the tree structure of node.
  int internal_verify(const node_type *node,
                      const key_type *lo, const key_type *hi) const;

  // Recursively counts (leaf, internal) nodes under `node` for debug stats.
  node_stats internal_stats(const node_type *node) const {
    // The root can be a static empty node.
    if (node == nullptr || (node == root() && empty())) {
      return node_stats(0, 0);
    }
    if (node->leaf()) {
      return node_stats(1, 0);
    }
    node_stats res(0, 1);
    for (int i = 0; i <= node->count(); ++i) {
      res += internal_stats(node->child(i));
    }
    return res;
  }

 private:
  // Compressed storage: the root pointer plus (via empty-base optimization)
  // the key comparator and the allocator.
  empty_base_handle<key_compare, allocator_type, node_type*> root_;

  // A pointer to the rightmost node. Note that the leftmost node is stored as
  // the root's parent.
  node_type *rightmost_;

  // Number of values.
  size_type size_;
};

////
// btree_node methods

// Inserts a value constructed from `args` at slot `i`, shifting values at
// [i, count()) one slot to the right. For internal nodes the child pointers
// after i+1 are shifted right as well, leaving child(i+1) cleared for the
// caller to fill in.
template <typename P>
template <typename... Args>
inline void btree_node<P>::emplace_value(const size_type i,
                                         allocator_type *alloc,
                                         Args &&... args) {
  assert(i <= count());
  // Shift old values to create space for new value and then construct it in
  // place.
  if (i < count()) {
    // Move-construct the last value into the first uninitialized slot, then
    // shift the remaining tail right and destroy the now-moved-from slot i.
    value_init(count(), alloc, slot(count() - 1));
    std::copy_backward(std::make_move_iterator(slot(i)),
                       std::make_move_iterator(slot(count() - 1)),
                       slot(count()));
    value_destroy(i, alloc);
  }
  value_init(i, alloc, std::forward<Args>(args)...);
  set_count(count() + 1);

  if (!leaf() && count() > i + 1) {
    for (int j = count(); j > i + 1; --j) {
      set_child(j, child(j - 1));
    }
    clear_child(i + 1);
  }
}

// Removes the value at position i, shifting later values left. For internal
// nodes, the child at i+1 must already be empty; it is unlinked before the
// value shift.
template <typename P>
inline void btree_node<P>::remove_value(const int i, allocator_type *alloc) {
  if (!leaf() && count() > i + 1) {
    assert(child(i + 1)->count() == 0);
    for (size_type j = i + 1; j < count(); ++j) {
      set_child(j, child(j + 1));
    }
    clear_child(count());
  }

  remove_values_ignore_children(i, /*to_erase=*/1, alloc);
}

// Removes `to_erase` values starting at slot i, without touching child
// pointers: shifts the tail left over the erased range, then destroys the
// trailing moved-from slots.
template <typename P>
inline void btree_node<P>::remove_values_ignore_children(
    const int i, const int to_erase, allocator_type *alloc) {
  assert(to_erase >= 0);
  std::copy(std::make_move_iterator(slot(i + to_erase)),
            std::make_move_iterator(slot(count())),
            slot(i));
  value_destroy_n(count() - to_erase, to_erase, alloc);
  set_count(count() - to_erase);
}

// Moves `to_move` values from `right` (our immediate right sibling) into
// this node, rotating through the delimiting value stored in the parent.
template <typename P>
void btree_node<P>::rebalance_right_to_left(const int to_move,
                                            btree_node *right,
                                            allocator_type *alloc) {
  assert(parent() == right->parent());
  assert(position() + 1 == right->position());
  assert(right->count() >= count());
  assert(to_move >= 1);
  assert(to_move <= right->count());

  // 1) Move the delimiting value in the parent to the left node.
  value_init(count(), alloc, parent()->slot(position()));

  // 2) Move the (to_move - 1) values from the right node to the left node.
  right->uninitialized_move_n(to_move - 1, 0, count() + 1, this, alloc);

  // 3) Move the new delimiting value to the parent from the right node.
  params_type::move(alloc, right->slot(to_move - 1),
                    parent()->slot(position()));

  // 4) Shift the values in the right node to their correct position.
  std::copy(std::make_move_iterator(right->slot(to_move)),
            std::make_move_iterator(right->slot(right->count())),
            right->slot(0));

  // 5) Destroy the now-empty to_move entries in the right node.
  right->value_destroy_n(right->count() - to_move, to_move, alloc);

  if (!leaf()) {
    // Move the child pointers from the right to the left node.
    for (int i = 0; i < to_move; ++i) {
      init_child(count() + i + 1, right->child(i));
    }
    for (int i = 0; i <= right->count() - to_move; ++i) {
      assert(i + to_move <= right->max_count());
      right->init_child(i, right->child(i + to_move));
      right->clear_child(i + to_move);
    }
  }

  // Fixup the counts on the left and right nodes.
  set_count(count() + to_move);
  right->set_count(right->count() - to_move);
}

// Mirror of rebalance_right_to_left: moves `to_move` values from this node
// into `right`, rotating through the parent's delimiting value. Two cases
// below depending on whether `right` has enough already-initialized slots.
template <typename P>
void btree_node<P>::rebalance_left_to_right(const int to_move,
                                            btree_node *right,
                                            allocator_type *alloc) {
  assert(parent() == right->parent());
  assert(position() + 1 == right->position());
  assert(count() >= right->count());
  assert(to_move >= 1);
  assert(to_move <= count());

  // Values in the right node are shifted to the right to make room for the
  // new to_move values. Then, the delimiting value in the parent and the
  // other (to_move - 1) values in the left node are moved into the right node.
  // Lastly, a new delimiting value is moved from the left node into the
  // parent, and the remaining empty left node entries are destroyed.

  if (right->count() >= to_move) {
    // The original location of the right->count() values are sufficient to hold
    // the new to_move entries from the parent and left node.

    // 1) Shift existing values in the right node to their correct positions.
    right->uninitialized_move_n(to_move, right->count() - to_move,
                                right->count(), right, alloc);
    std::copy_backward(std::make_move_iterator(right->slot(0)),
                       std::make_move_iterator(right->slot(right->count() - to_move)),
                       right->slot(right->count()));

    // 2) Move the delimiting value in the parent to the right node.
    params_type::move(alloc, parent()->slot(position()),
                      right->slot(to_move - 1));

    // 3) Move the (to_move - 1) values from the left node to the right node.
    std::copy(std::make_move_iterator(slot(count() - (to_move - 1))),
              std::make_move_iterator(slot(count())),
              right->slot(0));
  } else {
    // The right node does not have enough initialized space to hold the new
    // to_move entries, so part of them will move to uninitialized space.

    // 1) Shift existing values in the right node to their correct positions.
    right->uninitialized_move_n(right->count(), 0, to_move, right, alloc);

    // 2) Move the delimiting value in the parent to the right node.
    right->value_init(to_move - 1, alloc, parent()->slot(position()));

    // 3) Move the (to_move - 1) values from the left node to the right node.
    const size_type uninitialized_remaining = to_move - right->count() - 1;
    uninitialized_move_n(uninitialized_remaining,
                         count() - uninitialized_remaining, right->count(),
                         right, alloc);
    std::copy(std::make_move_iterator(slot(count() - (to_move - 1))),
              std::make_move_iterator(slot(count() - uninitialized_remaining)),
              right->slot(0));
  }

  // 4) Move the new delimiting value to the parent from the left node.
  params_type::move(alloc, slot(count() - to_move), parent()->slot(position()));

  // 5) Destroy the now-empty to_move entries in the left node.
  value_destroy_n(count() - to_move, to_move, alloc);

  if (!leaf()) {
    // Move the child pointers from the left to the right node.
    for (int i = right->count(); i >= 0; --i) {
      right->init_child(i + to_move, right->child(i));
      right->clear_child(i);
    }
    for (int i = 1; i <= to_move; ++i) {
      right->init_child(i - 1, child(count() - to_move + i));
      clear_child(count() - to_move + i);
    }
  }

  // Fixup the counts on the left and right nodes.
  set_count(count() - to_move);
  right->set_count(right->count() + to_move);
}

// Splits this (full) node, moving a suffix of its values into `dest` and
// pushing the new delimiting value up into the parent. The split point is
// biased by `insert_position` so the upcoming insert lands in a node with
// free space.
template <typename P>
void btree_node<P>::split(const int insert_position, btree_node *dest,
                          allocator_type *alloc) {
  assert(dest->count() == 0);
  assert(max_count() == kNodeValues);

  // We bias the split based on the position being inserted. If we're
  // inserting at the beginning of the left node then bias the split to put
  // more values on the right node. If we're inserting at the end of the
  // right node then bias the split to put more values on the left node.
  if (insert_position == 0) {
    dest->set_count(count() - 1);
  } else if (insert_position == kNodeValues) {
    dest->set_count(0);
  } else {
    dest->set_count(count() / 2);
  }
  set_count(count() - dest->count());
  assert(count() >= 1);

  // Move values from the left sibling to the right sibling.
  uninitialized_move_n(dest->count(), count(), 0, dest, alloc);

  // Destroy the now-empty entries in the left node.
  value_destroy_n(count(), dest->count(), alloc);

  // The split key is the largest value in the left sibling.
  set_count(count() - 1);
  parent()->emplace_value(position(), alloc, slot(count()));
  value_destroy(count(), alloc);
  parent()->init_child(position() + 1, dest);

  if (!leaf()) {
    for (int i = 0; i <= dest->count(); ++i) {
      assert(child(count() + i + 1) != nullptr);
      dest->init_child(i, child(count() + i + 1));
      clear_child(count() + i + 1);
    }
  }
}

// Merges `src` (our immediate right sibling) into this node, pulling down
// the delimiting value from the parent, then removes that parent value.
// `src` is left empty; the caller is responsible for deleting it.
template <typename P>
void btree_node<P>::merge(btree_node *src, allocator_type *alloc) {
  assert(parent() == src->parent());
  assert(position() + 1 == src->position());

  // Move the delimiting value to the left node.
  value_init(count(), alloc, parent()->slot(position()));

  // Move the values from the right to the left node.
  src->uninitialized_move_n(src->count(), 0, count() + 1, this, alloc);

  // Destroy the now-empty entries in the right node.
  src->value_destroy_n(0, src->count(), alloc);

  if (!leaf()) {
    // Move the child pointers from the right to the left node.
    for (int i = 0; i <= src->count(); ++i) {
      init_child(count() + i + 1, src->child(i));
      src->clear_child(i);
    }
  }

  // Fixup the counts on the src and dest nodes.
  set_count(1 + count() + src->count());
  src->set_count(0);

  // Remove the value on the parent node.
  parent()->remove_value(position(), alloc);
}

// Exchanges the full contents (values, children, counts) of this node and
// *x. Handles differing counts by swapping the common prefix and
// move-constructing the surplus into the smaller node.
template <typename P>
void btree_node<P>::swap(btree_node *x, allocator_type *alloc) {
  using std::swap;
  assert(leaf() == x->leaf());

  // Determine which is the smaller/larger node.
  btree_node *smaller = this, *larger = x;
  if (smaller->count() > larger->count()) {
    swap(smaller, larger);
  }

  // Swap the values.
  std::swap_ranges(smaller->slot(0), smaller->slot(smaller->count()),
                   larger->slot(0));

  // Move values that can't be swapped.
  const size_type to_move = larger->count() - smaller->count();
  larger->uninitialized_move_n(to_move, smaller->count(), smaller->count(),
                               smaller, alloc);
  larger->value_destroy_n(smaller->count(), to_move, alloc);

  if (!leaf()) {
    // Swap the child pointers.
    std::swap_ranges(&smaller->mutable_child(0),
                     &smaller->mutable_child(smaller->count() + 1),
                     &larger->mutable_child(0));
    // Update swapped children's parent pointers.
    int i = 0;
    for (; i <= smaller->count(); ++i) {
      smaller->child(i)->set_parent(smaller);
      larger->child(i)->set_parent(larger);
    }
    // Move the child pointers that couldn't be swapped.
    for (; i <= larger->count(); ++i) {
      smaller->init_child(i, larger->child(i));
      larger->clear_child(i);
    }
  }

  // Swap the counts.
  swap(mutable_count(), x->mutable_count());
}

////
// btree_iterator methods

// Slow path of operator++: from the end of a leaf, climb to the first
// ancestor with more values; if the climb runs off the root, restore the
// saved iterator (leaving *this unchanged). From an internal node, descend
// the leftmost spine of the subtree to the right of `position`.
template <typename N, typename R, typename P>
void btree_iterator<N, R, P>::increment_slow() {
  if (node->leaf()) {
    assert(position >= node->count());
    btree_iterator save(*this);
    while (position == node->count() && !node->is_root()) {
      assert(node->parent()->child(node->position()) == node);
      position = node->position();
      node = node->parent();
    }
    if (position == node->count()) {
      *this = save;
    }
  } else {
    assert(position < node->count());
    node = node->child(position + 1);
    while (!node->leaf()) {
      node = node->child(0);
    }
    position = 0;
  }
}

// Slow path of operator--; exact mirror of increment_slow(): climb while the
// position underflows a leaf, otherwise descend the rightmost spine of the
// subtree at `position`.
template <typename N, typename R, typename P>
void btree_iterator<N, R, P>::decrement_slow() {
  if (node->leaf()) {
    assert(position <= -1);
    btree_iterator save(*this);
    while (position < 0 && !node->is_root()) {
      assert(node->parent()->child(node->position()) == node);
      position = node->position() - 1;
      node = node->parent();
    }
    if (position < 0) {
      *this = save;
    }
  } else {
    assert(position >= 0);
    node = node->child(position);
    while (!node->leaf()) {
      node = node->child(node->count());
    }
    position = node->count() - 1;
  }
}
+ +//// +// btree methods +template <typename P> +template <typename Btree> +void btree<P>::copy_or_move_values_in_order(Btree *x) { + static_assert(std::is_same_v<btree, Btree>|| + std::is_same_v<const btree, Btree>, + "Btree type must be same or const."); + assert(empty()); + + // We can avoid key comparisons because we know the order of the + // values is the same order we'll store them in. + auto iter = x->begin(); + if (iter == x->end()) return; + insert_multi(maybe_move_from_iterator(iter)); + ++iter; + for (; iter != x->end(); ++iter) { + // If the btree is not empty, we can just insert the new value at the end + // of the tree. + internal_emplace(end(), maybe_move_from_iterator(iter)); + } +} + +template <typename P> +constexpr bool btree<P>::static_assert_validation() { + static_assert(std::is_nothrow_copy_constructible_v<key_compare>, + "Key comparison must be nothrow copy constructible"); + static_assert(std::is_nothrow_copy_constructible_v<allocator_type>, + "Allocator must be nothrow copy constructible"); + static_assert(std::is_trivially_copyable_v<iterator>, + "iterator not trivially copyable."); + + // Note: We assert that kTargetValues, which is computed from + // Params::kTargetNodeSize, must fit the base_fields::field_type. + static_assert( + kNodeValues < (1 << (8 * sizeof(typename node_type::field_type))), + "target node size too large"); + + // Verify that key_compare returns an absl::{weak,strong}_ordering or bool. + using compare_result_type = + std::invoke_result_t<key_compare, key_type, key_type>; + static_assert( + std::is_same_v<compare_result_type, bool> || + std::is_signed_v<compare_result_type>, + "key comparison function must return a signed value or " + "bool."); + + // Test the assumption made in setting kNodeValueSpace. 
+ static_assert(node_type::MinimumOverhead() >= sizeof(void *) + 4, + "node space assumption incorrect"); + + return true; +} + +template <typename P> +btree<P>::btree(const key_compare &comp, const allocator_type &alloc) + : root_(comp, alloc, EmptyNode()), rightmost_(EmptyNode()), size_(0) {} + +template <typename P> +btree<P>::btree(const btree &x) : btree(x.key_comp(), x.allocator()) { + copy_or_move_values_in_order(&x); +} + +template <typename P> +template <typename... Args> +auto btree<P>::insert_unique(const key_type &key, Args &&... args) + -> std::pair<iterator, bool> { + if (empty()) { + mutable_root() = rightmost_ = new_leaf_root_node(1); + } + + auto res = internal_locate(key); + iterator &iter = res.value; + + if constexpr (res.has_match) { + if (res.IsEq()) { + // The key already exists in the tree, do nothing. + return {iter, false}; + } + } else { + iterator last = internal_last(iter); + if (last.node && !compare_keys(key, last.key())) { + // The key already exists in the tree, do nothing. + return {last, false}; + } + } + return {internal_emplace(iter, std::forward<Args>(args)...), true}; +} + +template <typename P> +template <typename... Args> +inline auto btree<P>::insert_hint_unique(iterator position, const key_type &key, + Args &&... 
args) + -> std::pair<iterator, bool> { + if (!empty()) { + if (position == end() || compare_keys(key, position.key())) { + iterator prev = position; + if (position == begin() || compare_keys((--prev).key(), key)) { + // prev.key() < key < position.key() + return {internal_emplace(position, std::forward<Args>(args)...), true}; + } + } else if (compare_keys(position.key(), key)) { + ++position; + if (position == end() || compare_keys(key, position.key())) { + // {original `position`}.key() < key < {current `position`}.key() + return {internal_emplace(position, std::forward<Args>(args)...), true}; + } + } else { + // position.key() == key + return {position, false}; + } + } + return insert_unique(key, std::forward<Args>(args)...); +} + +template <typename P> +template <typename InputIterator> +void btree<P>::insert_iterator_unique(InputIterator b, InputIterator e) { + for (; b != e; ++b) { + insert_hint_unique(end(), params_type::key(*b), *b); + } +} + +template <typename P> +template <typename ValueType> +auto btree<P>::insert_multi(const key_type &key, ValueType&& v) -> iterator { + if (empty()) { + mutable_root() = rightmost_ = new_leaf_root_node(1); + } + + iterator iter = internal_upper_bound(key); + if (iter.node == nullptr) { + iter = end(); + } + return internal_emplace(iter, std::forward<ValueType>(v)); +} + +template <typename P> +template <typename ValueType> +auto btree<P>::insert_hint_multi(iterator position, ValueType &&v) -> iterator { + if (!empty()) { + const key_type &key = params_type::key(v); + if (position == end() || !compare_keys(position.key(), key)) { + iterator prev = position; + if (position == begin() || !compare_keys(key, (--prev).key())) { + // prev.key() <= key <= position.key() + return internal_emplace(position, std::forward<ValueType>(v)); + } + } else { + iterator next = position; + ++next; + if (next == end() || !compare_keys(next.key(), key)) { + // position.key() < key <= next.key() + return internal_emplace(next, 
std::forward<ValueType>(v)); + } + } + } + return insert_multi(std::forward<ValueType>(v)); +} + +template <typename P> +template <typename InputIterator> +void btree<P>::insert_iterator_multi(InputIterator b, InputIterator e) { + for (; b != e; ++b) { + insert_hint_multi(end(), *b); + } +} + +template <typename P> +auto btree<P>::operator=(const btree &x) -> btree & { + if (this != &x) { + clear(); + + *mutable_key_comp() = x.key_comp(); + if constexpr (std::allocator_traits< + allocator_type>::propagate_on_container_copy_assignment::value) { + *mutable_allocator() = x.allocator(); + } + + copy_or_move_values_in_order(&x); + } + return *this; +} + +template <typename P> +auto btree<P>::operator=(btree &&x) noexcept -> btree & { + if (this != &x) { + clear(); + + using std::swap; + if constexpr (std::allocator_traits< + allocator_type>::propagate_on_container_copy_assignment::value) { + // Note: `root_` also contains the allocator and the key comparator. + swap(root_, x.root_); + swap(rightmost_, x.rightmost_); + swap(size_, x.size_); + } else { + if (allocator() == x.allocator()) { + swap(mutable_root(), x.mutable_root()); + swap(*mutable_key_comp(), *x.mutable_key_comp()); + swap(rightmost_, x.rightmost_); + swap(size_, x.size_); + } else { + // We aren't allowed to propagate the allocator and the allocator is + // different so we can't take over its memory. We must move each element + // individually. We need both `x` and `this` to have `x`s key comparator + // while moving the values so we can't swap the key comparators. + *mutable_key_comp() = x.key_comp(); + copy_or_move_values_in_order(&x); + } + } + } + return *this; +} + +template <typename P> +auto btree<P>::erase(iterator iter) -> iterator { + bool internal_delete = false; + if (!iter.node->leaf()) { + // Deletion of a value on an internal node. First, move the largest value + // from our left child here, then delete that position (in remove_value() + // below). 
We can get to the largest value from our left child by + // decrementing iter. + iterator internal_iter(iter); + --iter; + assert(iter.node->leaf()); + params_type::move(mutable_allocator(), iter.node->slot(iter.position), + internal_iter.node->slot(internal_iter.position)); + internal_delete = true; + } + + // Delete the key from the leaf. + iter.node->remove_value(iter.position, mutable_allocator()); + --size_; + + // We want to return the next value after the one we just erased. If we + // erased from an internal node (internal_delete == true), then the next + // value is ++(++iter). If we erased from a leaf node (internal_delete == + // false) then the next value is ++iter. Note that ++iter may point to an + // internal node and the value in the internal node may move to a leaf node + // (iter.node) when rebalancing is performed at the leaf level. + + iterator res = rebalance_after_delete(iter); + + // If we erased from an internal node, advance the iterator. + if (internal_delete) { + ++res; + } + return res; +} + +template <typename P> +auto btree<P>::rebalance_after_delete(iterator iter) -> iterator { + // Merge/rebalance as we walk back up the tree. + iterator res(iter); + bool first_iteration = true; + for (;;) { + if (iter.node == root()) { + try_shrink(); + if (empty()) { + return end(); + } + break; + } + if (iter.node->count() >= kMinNodeValues) { + break; + } + bool merged = try_merge_or_rebalance(&iter); + // On the first iteration, we should update `res` with `iter` because `res` + // may have been invalidated. + if (first_iteration) { + res = iter; + first_iteration = false; + } + if (!merged) { + break; + } + iter.position = iter.node->position(); + iter.node = iter.node->parent(); + } + + // Adjust our return value. If we're pointing at the end of a node, advance + // the iterator. 
+ if (res.position == res.node->count()) { + res.position = res.node->count() - 1; + ++res; + } + + return res; +} + +template <typename P> +auto btree<P>::erase(iterator begin, iterator end) + -> std::pair<size_type, iterator> { + difference_type count = std::distance(begin, end); + assert(count >= 0); + + if (count == 0) { + return {0, begin}; + } + + if (count == size_) { + clear(); + return {count, this->end()}; + } + + if (begin.node == end.node) { + erase_same_node(begin, end); + size_ -= count; + return {count, rebalance_after_delete(begin)}; + } + + const size_type target_size = size_ - count; + while (size_ > target_size) { + if (begin.node->leaf()) { + const size_type remaining_to_erase = size_ - target_size; + const size_type remaining_in_node = begin.node->count() - begin.position; + begin = erase_from_leaf_node( + begin, std::min(remaining_to_erase, remaining_in_node)); + } else { + begin = erase(begin); + } + } + return {count, begin}; +} + +template <typename P> +void btree<P>::erase_same_node(iterator begin, iterator end) { + assert(begin.node == end.node); + assert(end.position > begin.position); + + node_type *node = begin.node; + size_type to_erase = end.position - begin.position; + if (!node->leaf()) { + // Delete all children between begin and end. + for (size_type i = 0; i < to_erase; ++i) { + internal_clear(node->child(begin.position + i + 1)); + } + // Rotate children after end into new positions. 
+ for (size_type i = begin.position + to_erase + 1; i <= node->count(); ++i) { + node->set_child(i - to_erase, node->child(i)); + node->clear_child(i); + } + } + node->remove_values_ignore_children(begin.position, to_erase, + mutable_allocator()); + + // Do not need to update rightmost_, because + // * either end == this->end(), and therefore node == rightmost_, and still + // exists + // * or end != this->end(), and therefore rightmost_ hasn't been erased, since + // it wasn't covered in [begin, end) +} + +template <typename P> +auto btree<P>::erase_from_leaf_node(iterator begin, size_type to_erase) + -> iterator { + node_type *node = begin.node; + assert(node->leaf()); + assert(node->count() > begin.position); + assert(begin.position + to_erase <= node->count()); + + node->remove_values_ignore_children(begin.position, to_erase, + mutable_allocator()); + + size_ -= to_erase; + + return rebalance_after_delete(begin); +} + +template <typename P> +template <typename K> +auto btree<P>::erase_unique(const K &key) -> size_type { + const iterator iter = internal_find(key); + if (iter.node == nullptr) { + // The key doesn't exist in the tree, return nothing done. + return 0; + } + erase(iter); + return 1; +} + +template <typename P> +template <typename K> +auto btree<P>::erase_multi(const K &key) -> size_type { + const iterator begin = internal_lower_bound(key); + if (begin.node == nullptr) { + // The key doesn't exist in the tree, return nothing done. + return 0; + } + // Delete all of the keys between begin and upper_bound(key). 
+ const iterator end = internal_end(internal_upper_bound(key)); + return erase(begin, end).first; +} + +template <typename P> +void btree<P>::clear() { + if (!empty()) { + internal_clear(root()); + } + mutable_root() = EmptyNode(); + rightmost_ = EmptyNode(); + size_ = 0; +} + +template <typename P> +void btree<P>::swap(btree &x) { + using std::swap; + if (std::allocator_traits< + allocator_type>::propagate_on_container_swap::value) { + // Note: `root_` also contains the allocator and the key comparator. + swap(root_, x.root_); + } else { + // It's undefined behavior if the allocators are unequal here. + assert(allocator() == x.allocator()); + swap(mutable_root(), x.mutable_root()); + swap(*mutable_key_comp(), *x.mutable_key_comp()); + } + swap(rightmost_, x.rightmost_); + swap(size_, x.size_); +} + +template <typename P> +void btree<P>::verify() const { + assert(root() != nullptr); + assert(leftmost() != nullptr); + assert(rightmost_ != nullptr); + assert(empty() || size() == internal_verify(root(), nullptr, nullptr)); + assert(leftmost() == (++const_iterator(root(), -1)).node); + assert(rightmost_ == (--const_iterator(root(), root()->count())).node); + assert(leftmost()->leaf()); + assert(rightmost_->leaf()); +} + +template <typename P> +void btree<P>::rebalance_or_split(iterator *iter) { + node_type *&node = iter->node; + int &insert_position = iter->position; + assert(node->count() == node->max_count()); + assert(kNodeValues == node->max_count()); + + // First try to make room on the node by rebalancing. + node_type *parent = node->parent(); + if (node != root()) { + if (node->position() > 0) { + // Try rebalancing with our left sibling. + node_type *left = parent->child(node->position() - 1); + assert(left->max_count() == kNodeValues); + if (left->count() < kNodeValues) { + // We bias rebalancing based on the position being inserted. If we're + // inserting at the end of the right node then we bias rebalancing to + // fill up the left node. 
+ int to_move = (kNodeValues - left->count()) / + (1 + (insert_position < kNodeValues)); + to_move = std::max(1, to_move); + + if (((insert_position - to_move) >= 0) || + ((left->count() + to_move) < kNodeValues)) { + left->rebalance_right_to_left(to_move, node, mutable_allocator()); + + assert(node->max_count() - node->count() == to_move); + insert_position = insert_position - to_move; + if (insert_position < 0) { + insert_position = insert_position + left->count() + 1; + node = left; + } + + assert(node->count() < node->max_count()); + return; + } + } + } + + if (node->position() < parent->count()) { + // Try rebalancing with our right sibling. + node_type *right = parent->child(node->position() + 1); + assert(right->max_count() == kNodeValues); + if (right->count() < kNodeValues) { + // We bias rebalancing based on the position being inserted. If we're + // inserting at the beginning of the left node then we bias rebalancing + // to fill up the right node. + int to_move = + (kNodeValues - right->count()) / (1 + (insert_position > 0)); + to_move = (std::max)(1, to_move); + + if ((insert_position <= (node->count() - to_move)) || + ((right->count() + to_move) < kNodeValues)) { + node->rebalance_left_to_right(to_move, right, mutable_allocator()); + + if (insert_position > node->count()) { + insert_position = insert_position - node->count() - 1; + node = right; + } + + assert(node->count() < node->max_count()); + return; + } + } + } + + // Rebalancing failed, make sure there is room on the parent node for a new + // value. + assert(parent->max_count() == kNodeValues); + if (parent->count() == kNodeValues) { + iterator parent_iter(node->parent(), node->position()); + rebalance_or_split(&parent_iter); + } + } else { + // Rebalancing not possible because this is the root node. + // Create a new root node and set the current root node as the child of the + // new root. 
+ parent = new_internal_node(parent); + parent->init_child(0, root()); + mutable_root() = parent; + // If the former root was a leaf node, then it's now the rightmost node. + assert(!parent->child(0)->leaf() || parent->child(0) == rightmost_); + } + + // Split the node. + node_type *split_node; + if (node->leaf()) { + split_node = new_leaf_node(parent); + node->split(insert_position, split_node, mutable_allocator()); + if (rightmost_ == node) rightmost_ = split_node; + } else { + split_node = new_internal_node(parent); + node->split(insert_position, split_node, mutable_allocator()); + } + + if (insert_position > node->count()) { + insert_position = insert_position - node->count() - 1; + node = split_node; + } +} + +template <typename P> +void btree<P>::merge_nodes(node_type *left, node_type *right) { + left->merge(right, mutable_allocator()); + if (right->leaf()) { + if (rightmost_ == right) rightmost_ = left; + delete_leaf_node(right); + } else { + delete_internal_node(right); + } +} + +template <typename P> +bool btree<P>::try_merge_or_rebalance(iterator *iter) { + node_type *parent = iter->node->parent(); + if (iter->node->position() > 0) { + // Try merging with our left sibling. + node_type *left = parent->child(iter->node->position() - 1); + assert(left->max_count() == kNodeValues); + if ((1 + left->count() + iter->node->count()) <= kNodeValues) { + iter->position += 1 + left->count(); + merge_nodes(left, iter->node); + iter->node = left; + return true; + } + } + if (iter->node->position() < parent->count()) { + // Try merging with our right sibling. + node_type *right = parent->child(iter->node->position() + 1); + assert(right->max_count() == kNodeValues); + if ((1 + iter->node->count() + right->count()) <= kNodeValues) { + merge_nodes(iter->node, right); + return true; + } + // Try rebalancing with our right sibling. We don't perform rebalancing if + // we deleted the first element from iter->node and the node is not + // empty. 
This is a small optimization for the common pattern of deleting + // from the front of the tree. + if ((right->count() > kMinNodeValues) && + ((iter->node->count() == 0) || + (iter->position > 0))) { + int to_move = (right->count() - iter->node->count()) / 2; + to_move = std::min(to_move, right->count() - 1); + iter->node->rebalance_right_to_left(to_move, right, mutable_allocator()); + return false; + } + } + if (iter->node->position() > 0) { + // Try rebalancing with our left sibling. We don't perform rebalancing if + // we deleted the last element from iter->node and the node is not + // empty. This is a small optimization for the common pattern of deleting + // from the back of the tree. + node_type *left = parent->child(iter->node->position() - 1); + if ((left->count() > kMinNodeValues) && + ((iter->node->count() == 0) || + (iter->position < iter->node->count()))) { + int to_move = (left->count() - iter->node->count()) / 2; + to_move = std::min(to_move, left->count() - 1); + left->rebalance_left_to_right(to_move, iter->node, mutable_allocator()); + iter->position += to_move; + return false; + } + } + return false; +} + +template <typename P> +void btree<P>::try_shrink() { + if (root()->count() > 0) { + return; + } + // Deleted the last item on the root node, shrink the height of the tree. + if (root()->leaf()) { + assert(size() == 0); + delete_leaf_node(root()); + mutable_root() = EmptyNode(); + rightmost_ = EmptyNode(); + } else { + node_type *child = root()->child(0); + child->make_root(); + delete_internal_node(root()); + mutable_root() = child; + } +} + +template <typename P> +template <typename IterType> +inline IterType btree<P>::internal_last(IterType iter) { + assert(iter.node != nullptr); + while (iter.position == iter.node->count()) { + iter.position = iter.node->position(); + iter.node = iter.node->parent(); + if (iter.node->leaf()) { + iter.node = nullptr; + break; + } + } + return iter; +} + +template <typename P> +template <typename... 
Args>
// Constructs a value in place from args immediately before iter, growing or
// splitting leaves as needed. Returns an iterator to the inserted value.
inline auto btree<P>::internal_emplace(iterator iter, Args &&... args)
    -> iterator {
  if (!iter.node->leaf()) {
    // We can't insert on an internal node. Instead, we'll insert after the
    // previous value which is guaranteed to be on a leaf node.
    --iter;
    ++iter.position;
  }
  const int max_count = iter.node->max_count();
  if (iter.node->count() == max_count) {
    // Make room in the leaf for the new item.
    if (max_count < kNodeValues) {
      // Insertion into the root where the root is smaller than the full node
      // size. Simply grow the size of the root node.
      assert(iter.node == root());
      iter.node =
          new_leaf_root_node(std::min(kNodeValues, 2 * max_count));
      iter.node->swap(root(), mutable_allocator());
      delete_leaf_node(root());
      mutable_root() = iter.node;
      rightmost_ = iter.node;
    } else {
      rebalance_or_split(&iter);
    }
  }
  iter.node->emplace_value(iter.position, mutable_allocator(),
                           std::forward<Args>(args)...);
  ++size_;
  return iter;
}

// Dispatches to the boolean-comparator or three-way-comparator descent below,
// depending on whether the key comparison yields an ordering.
template <typename P>
template <typename K>
inline auto btree<P>::internal_locate(const K &key) const
    -> SearchResult<iterator, is_key_compare_to::value> {
  return internal_locate_impl(key, is_key_compare_to());
}

// Descent with a plain bool comparator: equality cannot be detected on the
// way down, so the result carries no match information.
template <typename P>
template <typename K>
inline auto btree<P>::internal_locate_impl(
    const K &key, std::false_type /* IsCompareTo */) const
    -> SearchResult<iterator, false> {
  iterator iter(const_cast<node_type *>(root()), 0);
  for (;;) {
    iter.position = iter.node->lower_bound(key, key_comp()).value;
    // NOTE: we don't need to walk all the way down the tree if the keys are
    // equal, but determining equality would require doing an extra comparison
    // on each node on the way down, and we will need to go all the way to the
    // leaf node in the expected case.
    if (iter.node->leaf()) {
      break;
    }
    iter.node = iter.node->child(iter.position);
  }
  return {iter};
}

// Descent with a three-way comparator: an exact match is known during the
// search, so we can stop early and report kEq/kNe.
template <typename P>
template <typename K>
inline auto btree<P>::internal_locate_impl(
    const K &key, std::true_type /* IsCompareTo */) const
    -> SearchResult<iterator, true> {
  iterator iter(const_cast<node_type *>(root()), 0);
  for (;;) {
    SearchResult<int, true> res = iter.node->lower_bound(key, key_comp());
    iter.position = res.value;
    if (res.match == MatchKind::kEq) {
      return {iter, MatchKind::kEq};
    }
    if (iter.node->leaf()) {
      break;
    }
    iter.node = iter.node->child(iter.position);
  }
  return {iter, MatchKind::kNe};
}

// First position whose key is not less than `key` (normalized via
// internal_last so end-of-node positions become end() or a parent slot).
template <typename P>
template <typename K>
auto btree<P>::internal_lower_bound(const K &key) const -> iterator {
  iterator iter(const_cast<node_type *>(root()), 0);
  for (;;) {
    iter.position = iter.node->lower_bound(key, key_comp()).value;
    if (iter.node->leaf()) {
      break;
    }
    iter.node = iter.node->child(iter.position);
  }
  return internal_last(iter);
}

// First position whose key is greater than `key`.
template <typename P>
template <typename K>
auto btree<P>::internal_upper_bound(const K &key) const -> iterator {
  iterator iter(const_cast<node_type *>(root()), 0);
  for (;;) {
    iter.position = iter.node->upper_bound(key, key_comp());
    if (iter.node->leaf()) {
      break;
    }
    iter.node = iter.node->child(iter.position);
  }
  return internal_last(iter);
}

// Returns an iterator to the element equal to `key`, or {nullptr, 0} if no
// such element exists.
// NOTE(review): `if constexpr (res.has_match)` assumes SearchResult exposes
// has_match as a static constexpr member usable in a constant expression —
// confirm against the SearchResult definition in btree.h.
template <typename P>
template <typename K>
auto btree<P>::internal_find(const K &key) const -> iterator {
  auto res = internal_locate(key);
  if constexpr (res.has_match) {
    if (res.IsEq()) {
      return res.value;
    }
  } else {
    // Bool comparator: the descent could not detect equality, so check the
    // landing position explicitly (equal iff neither key compares less).
    const iterator iter = internal_last(res.value);
    if (iter.node != nullptr && !compare_keys(key, iter.key())) {
      return iter;
    }
  }
  return {nullptr, 0};
}

// Recursively frees every node in the subtree rooted at `node`.
template <typename P>
void btree<P>::internal_clear(node_type *node) {
  if (!node->leaf()) {
    for (int i = 0; i <= node->count(); ++i) {
      internal_clear(node->child(i));
    }
    delete_internal_node(node);
  } else {
    delete_leaf_node(node);
  }
}

// Debug consistency check for the subtree rooted at `node`: asserts that the
// node is non-empty and within capacity, that its keys are sorted and lie
// within the (lo, hi) bounds inherited from the parent, and that every
// child's parent/position back-links are correct. Returns the total number
// of values in the subtree so the caller can compare against size().
template <typename P>
int btree<P>::internal_verify(
    const node_type *node, const key_type *lo, const key_type *hi) const {
  assert(node->count() > 0);
  assert(node->count() <= node->max_count());
  if (lo) {
    assert(!compare_keys(node->key(0), *lo));
  }
  if (hi) {
    assert(!compare_keys(*hi, node->key(node->count() - 1)));
  }
  for (int i = 1; i < node->count(); ++i) {
    assert(!compare_keys(node->key(i), node->key(i - 1)));
  }
  int count = node->count();
  if (!node->leaf()) {
    for (int i = 0; i <= node->count(); ++i) {
      assert(node->child(i) != nullptr);
      assert(node->child(i)->parent() == node);
      assert(node->child(i)->position() == i);
      count += internal_verify(
          node->child(i),
          (i == 0) ? lo : &node->key(i - 1),
          (i == node->count()) ? hi : &node->key(i));
    }
  }
  return count;
}

} // namespace btree::internal
diff --git a/src/include/cpp-btree/btree_container.h b/src/include/cpp-btree/btree_container.h
new file mode 100644
index 000000000..e8d9efd38
--- /dev/null
+++ b/src/include/cpp-btree/btree_container.h
@@ -0,0 +1,526 @@
// Copyright 2018 The Abseil Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//      https://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
+ +#pragma once + +#include <algorithm> +#include <initializer_list> +#include <iterator> +#include <type_traits> +#include <utility> + +#include "btree.h" + +namespace btree::internal { + +// A common base class for btree_set, btree_map, btree_multiset, and +// btree_multimap. +template <typename Tree> +class btree_container { + using params_type = typename Tree::params_type; + + protected: + // Alias used for heterogeneous lookup functions. + // `key_arg<K>` evaluates to `K` when the functors are transparent and to + // `key_type` otherwise. It permits template argument deduction on `K` for the + // transparent case. + template <class Compare> + using is_transparent_t = typename Compare::is_transparent; + template <class K> + using key_arg = + std::conditional_t< + std::experimental::is_detected_v<is_transparent_t, typename Tree::key_compare>, + K, + typename Tree::key_type>; + + public: + using key_type = typename Tree::key_type; + using value_type = typename Tree::value_type; + using size_type = typename Tree::size_type; + using difference_type = typename Tree::difference_type; + using key_compare = typename Tree::key_compare; + using value_compare = typename Tree::value_compare; + using allocator_type = typename Tree::allocator_type; + using reference = typename Tree::reference; + using const_reference = typename Tree::const_reference; + using pointer = typename Tree::pointer; + using const_pointer = typename Tree::const_pointer; + using iterator = typename Tree::iterator; + using const_iterator = typename Tree::const_iterator; + using reverse_iterator = typename Tree::reverse_iterator; + using const_reverse_iterator = typename Tree::const_reverse_iterator; + + // Constructors/assignments. 
+ btree_container() : tree_(key_compare(), allocator_type()) {} + explicit btree_container(const key_compare &comp, + const allocator_type &alloc = allocator_type()) + : tree_(comp, alloc) {} + btree_container(const btree_container &x) = default; + btree_container(btree_container &&x) noexcept = default; + btree_container &operator=(const btree_container &x) = default; + btree_container &operator=(btree_container &&x) noexcept( + std::is_nothrow_move_assignable<Tree>::value) = default; + + // Iterator routines. + iterator begin() { return tree_.begin(); } + const_iterator begin() const { return tree_.begin(); } + const_iterator cbegin() const { return tree_.begin(); } + iterator end() { return tree_.end(); } + const_iterator end() const { return tree_.end(); } + const_iterator cend() const { return tree_.end(); } + reverse_iterator rbegin() { return tree_.rbegin(); } + const_reverse_iterator rbegin() const { return tree_.rbegin(); } + const_reverse_iterator crbegin() const { return tree_.rbegin(); } + reverse_iterator rend() { return tree_.rend(); } + const_reverse_iterator rend() const { return tree_.rend(); } + const_reverse_iterator crend() const { return tree_.rend(); } + + // Lookup routines. 
+ template <typename K = key_type> + iterator find(const key_arg<K> &key) { + return tree_.find(key); + } + template <typename K = key_type> + const_iterator find(const key_arg<K> &key) const { + return tree_.find(key); + } + template <typename K = key_type> + bool contains(const key_arg<K> &key) const { + return find(key) != end(); + } + template <typename K = key_type> + iterator lower_bound(const key_arg<K> &key) { + return tree_.lower_bound(key); + } + template <typename K = key_type> + const_iterator lower_bound(const key_arg<K> &key) const { + return tree_.lower_bound(key); + } + template <typename K = key_type> + iterator upper_bound(const key_arg<K> &key) { + return tree_.upper_bound(key); + } + template <typename K = key_type> + const_iterator upper_bound(const key_arg<K> &key) const { + return tree_.upper_bound(key); + } + template <typename K = key_type> + std::pair<iterator, iterator> equal_range(const key_arg<K> &key) { + return tree_.equal_range(key); + } + template <typename K = key_type> + std::pair<const_iterator, const_iterator> equal_range( + const key_arg<K> &key) const { + return tree_.equal_range(key); + } + + // Deletion routines. Note that there is also a deletion routine that is + // specific to btree_set_container/btree_multiset_container. + + // Erase the specified iterator from the btree. The iterator must be valid + // (i.e. not equal to end()). Return an iterator pointing to the node after + // the one that was erased (or end() if none exists). + iterator erase(const_iterator iter) { return tree_.erase(iterator(iter)); } + iterator erase(iterator iter) { return tree_.erase(iter); } + iterator erase(const_iterator first, const_iterator last) { + return tree_.erase(iterator(first), iterator(last)).second; + } + + public: + // Utility routines. + void clear() { tree_.clear(); } + void swap(btree_container &x) { tree_.swap(x.tree_); } + void verify() const { tree_.verify(); } + + // Size routines. 
+ size_type size() const { return tree_.size(); } + size_type max_size() const { return tree_.max_size(); } + bool empty() const { return tree_.empty(); } + + friend bool operator==(const btree_container &x, const btree_container &y) { + if (x.size() != y.size()) return false; + return std::equal(x.begin(), x.end(), y.begin()); + } + + friend bool operator!=(const btree_container &x, const btree_container &y) { + return !(x == y); + } + + friend bool operator<(const btree_container &x, const btree_container &y) { + return std::lexicographical_compare(x.begin(), x.end(), y.begin(), y.end()); + } + + friend bool operator>(const btree_container &x, const btree_container &y) { + return y < x; + } + + friend bool operator<=(const btree_container &x, const btree_container &y) { + return !(y < x); + } + + friend bool operator>=(const btree_container &x, const btree_container &y) { + return !(x < y); + } + + // The allocator used by the btree. + allocator_type get_allocator() const { return tree_.get_allocator(); } + + // The key comparator used by the btree. + key_compare key_comp() const { return tree_.key_comp(); } + value_compare value_comp() const { return tree_.value_comp(); } + + protected: + Tree tree_; +}; + +// A common base class for btree_set and btree_map. 
// Adds unique-key semantics on top of btree_container: count/erase by key,
// insert/emplace that reject duplicates, and element-stealing merge.
template <typename Tree>
class btree_set_container : public btree_container<Tree> {
  using super_type = btree_container<Tree>;
  using params_type = typename Tree::params_type;
  using init_type = typename params_type::init_type;
  using is_key_compare_to = typename params_type::is_key_compare_to;
  friend class BtreeNodePeer;

 protected:
  template <class K>
  using key_arg = typename super_type::template key_arg<K>;

 public:
  using key_type = typename Tree::key_type;
  using value_type = typename Tree::value_type;
  using size_type = typename Tree::size_type;
  using key_compare = typename Tree::key_compare;
  using allocator_type = typename Tree::allocator_type;
  using iterator = typename Tree::iterator;
  using const_iterator = typename Tree::const_iterator;

  // Inherit constructors.
  using super_type::super_type;
  btree_set_container() {}

  // Range constructor.
  template <class InputIterator>
  btree_set_container(InputIterator b, InputIterator e,
                      const key_compare &comp = key_compare(),
                      const allocator_type &alloc = allocator_type())
      : super_type(comp, alloc) {
    insert(b, e);
  }

  // Initializer list constructor.
  btree_set_container(std::initializer_list<init_type> init,
                      const key_compare &comp = key_compare(),
                      const allocator_type &alloc = allocator_type())
      : btree_set_container(init.begin(), init.end(), comp, alloc) {}

  // Lookup routines. For a unique container the count is 0 or 1.
  template <typename K = key_type>
  size_type count(const key_arg<K> &key) const {
    return this->tree_.count_unique(key);
  }

  // Insertion routines.
  std::pair<iterator, bool> insert(const value_type &x) {
    return this->tree_.insert_unique(params_type::key(x), x);
  }
  std::pair<iterator, bool> insert(value_type &&x) {
    return this->tree_.insert_unique(params_type::key(x), std::move(x));
  }
  template <typename... Args>
  std::pair<iterator, bool> emplace(Args &&... args) {
    // Materialize the value first: the key must be extractable before the
    // tree can decide whether the insert is a duplicate.
    init_type v(std::forward<Args>(args)...);
    return this->tree_.insert_unique(params_type::key(v), std::move(v));
  }
  iterator insert(const_iterator position, const value_type &x) {
    return this->tree_
        .insert_hint_unique(iterator(position), params_type::key(x), x)
        .first;
  }
  iterator insert(const_iterator position, value_type &&x) {
    return this->tree_
        .insert_hint_unique(iterator(position), params_type::key(x),
                            std::move(x))
        .first;
  }
  template <typename... Args>
  iterator emplace_hint(const_iterator position, Args &&... args) {
    init_type v(std::forward<Args>(args)...);
    return this->tree_
        .insert_hint_unique(iterator(position), params_type::key(v),
                            std::move(v))
        .first;
  }
  template <typename InputIterator>
  void insert(InputIterator b, InputIterator e) {
    this->tree_.insert_iterator_unique(b, e);
  }
  void insert(std::initializer_list<init_type> init) {
    this->tree_.insert_iterator_unique(init.begin(), init.end());
  }
  // Deletion routines.
  template <typename K = key_type>
  size_type erase(const key_arg<K> &key) {
    return this->tree_.erase_unique(key);
  }
  using super_type::erase;

  // Merge routines.
  // Moves elements from `src` into `this`. If the element already exists in
  // `this`, it is left unmodified in `src`.
  // Enabled only for containers with matching value/allocator types and the
  // same map-vs-set flavor.
  template <
      typename T,
      typename std::enable_if_t<
          std::conjunction_v<
              std::is_same<value_type, typename T::value_type>,
              std::is_same<allocator_type, typename T::allocator_type>,
              std::is_same<typename params_type::is_map_container,
                           typename T::params_type::is_map_container>>,
          int> = 0>
  void merge(btree_container<T> &src) {  // NOLINT
    for (auto src_it = src.begin(); src_it != src.end();) {
      if (insert(std::move(*src_it)).second) {
        src_it = src.erase(src_it);
      } else {
        ++src_it;
      }
    }
  }

  template <
      typename T,
      typename std::enable_if_t<
          std::conjunction_v<
              std::is_same<value_type, typename T::value_type>,
              std::is_same<allocator_type, typename T::allocator_type>,
              std::is_same<typename params_type::is_map_container,
                           typename T::params_type::is_map_container>>,
          int> = 0>
  void merge(btree_container<T> &&src) {
    merge(src);
  }
};

// A common base class for btree_map and safe_btree_map.
// Base class for btree_map.
template <typename Tree>
class btree_map_container : public btree_set_container<Tree> {
  using super_type = btree_set_container<Tree>;
  using params_type = typename Tree::params_type;

 protected:
  template <class K>
  using key_arg = typename super_type::template key_arg<K>;

 public:
  using key_type = typename Tree::key_type;
  using mapped_type = typename params_type::mapped_type;
  using value_type = typename Tree::value_type;
  using key_compare = typename Tree::key_compare;
  using allocator_type = typename Tree::allocator_type;
  using iterator = typename Tree::iterator;
  using const_iterator = typename Tree::const_iterator;

  // Inherit constructors.
  using super_type::super_type;
  btree_map_container() {}

  // Insertion routines. try_emplace constructs the mapped value only when the
  // key is not already present.
  template <typename... Args>
  std::pair<iterator, bool> try_emplace(const key_type &k, Args &&... args) {
    return this->tree_.insert_unique(
        k, std::piecewise_construct, std::forward_as_tuple(k),
        std::forward_as_tuple(std::forward<Args>(args)...));
  }
  template <typename... Args>
  std::pair<iterator, bool> try_emplace(key_type &&k, Args &&... args) {
    // Note: `key_ref` exists to avoid a ClangTidy warning about moving from `k`
    // and then using `k` unsequenced. This is safe because the move is into a
    // forwarding reference and insert_unique guarantees that `key` is never
    // referenced after consuming `args`.
    const key_type& key_ref = k;
    return this->tree_.insert_unique(
        key_ref, std::piecewise_construct, std::forward_as_tuple(std::move(k)),
        std::forward_as_tuple(std::forward<Args>(args)...));
  }
  template <typename... Args>
  iterator try_emplace(const_iterator hint, const key_type &k,
                       Args &&... args) {
    return this->tree_
        .insert_hint_unique(iterator(hint), k, std::piecewise_construct,
                            std::forward_as_tuple(k),
                            std::forward_as_tuple(std::forward<Args>(args)...))
        .first;
  }
  template <typename... Args>
  iterator try_emplace(const_iterator hint, key_type &&k, Args &&... args) {
    // Note: `key_ref` exists to avoid a ClangTidy warning about moving from `k`
    // and then using `k` unsequenced. This is safe because the move is into a
    // forwarding reference and insert_hint_unique guarantees that `key` is
    // never referenced after consuming `args`.
    const key_type& key_ref = k;
    return this->tree_
        .insert_hint_unique(iterator(hint), key_ref, std::piecewise_construct,
                            std::forward_as_tuple(std::move(k)),
                            std::forward_as_tuple(std::forward<Args>(args)...))
        .first;
  }
  // operator[] default-constructs the mapped value for absent keys, like
  // std::map.
  mapped_type &operator[](const key_type &k) {
    return try_emplace(k).first->second;
  }
  mapped_type &operator[](key_type &&k) {
    return try_emplace(std::move(k)).first->second;
  }

  // at() throws std::out_of_range for absent keys, like std::map::at.
  template <typename K = key_type>
  mapped_type &at(const key_arg<K> &key) {
    auto it = this->find(key);
    if (it == this->end())
      throw std::out_of_range("btree_map::at");
    return it->second;
  }
  template <typename K = key_type>
  const mapped_type &at(const key_arg<K> &key) const {
    auto it = this->find(key);
    if (it == this->end())
      throw std::out_of_range("btree_map::at");
    return it->second;
  }
};

// A common base class for btree_multiset and btree_multimap.
template <typename Tree>
class btree_multiset_container : public btree_container<Tree> {
  using super_type = btree_container<Tree>;
  using params_type = typename Tree::params_type;
  using init_type = typename params_type::init_type;
  using is_key_compare_to = typename params_type::is_key_compare_to;

  template <class K>
  using key_arg = typename super_type::template key_arg<K>;

 public:
  using key_type = typename Tree::key_type;
  using value_type = typename Tree::value_type;
  using size_type = typename Tree::size_type;
  using key_compare = typename Tree::key_compare;
  using allocator_type = typename Tree::allocator_type;
  using iterator = typename Tree::iterator;
  using const_iterator = typename Tree::const_iterator;

  // Inherit constructors.
  using super_type::super_type;
  btree_multiset_container() {}

  // Range constructor.
  template <class InputIterator>
  btree_multiset_container(InputIterator b, InputIterator e,
                           const key_compare &comp = key_compare(),
                           const allocator_type &alloc = allocator_type())
      : super_type(comp, alloc) {
    insert(b, e);
  }

  // Initializer list constructor.
  btree_multiset_container(std::initializer_list<init_type> init,
                           const key_compare &comp = key_compare(),
                           const allocator_type &alloc = allocator_type())
      : btree_multiset_container(init.begin(), init.end(), comp, alloc) {}

  // Lookup routines.
  template <typename K = key_type>
  size_type count(const key_arg<K> &key) const {
    return this->tree_.count_multi(key);
  }

  // Insertion routines. Duplicates are always accepted.
  iterator insert(const value_type &x) { return this->tree_.insert_multi(x); }
  iterator insert(value_type &&x) {
    return this->tree_.insert_multi(std::move(x));
  }
  iterator insert(const_iterator position, const value_type &x) {
    return this->tree_.insert_hint_multi(iterator(position), x);
  }
  iterator insert(const_iterator position, value_type &&x) {
    return this->tree_.insert_hint_multi(iterator(position), std::move(x));
  }
  template <typename InputIterator>
  void insert(InputIterator b, InputIterator e) {
    this->tree_.insert_iterator_multi(b, e);
  }
  void insert(std::initializer_list<init_type> init) {
    this->tree_.insert_iterator_multi(init.begin(), init.end());
  }
  template <typename... Args>
  iterator emplace(Args &&... args) {
    return this->tree_.insert_multi(init_type(std::forward<Args>(args)...));
  }
  template <typename... Args>
  iterator emplace_hint(const_iterator position, Args &&... args) {
    return this->tree_.insert_hint_multi(
        iterator(position), init_type(std::forward<Args>(args)...));
  }

  // Deletion routines. Returns the number of elements erased.
  template <typename K = key_type>
  size_type erase(const key_arg<K> &key) {
    return this->tree_.erase_multi(key);
  }
  using super_type::erase;

  // Merge routines.
  // Moves all elements from `src` into `this` (multi-containers never reject
  // an element, so `src` is always drained).
  template <
      typename T,
      typename std::enable_if_t<
          std::conjunction_v<
              std::is_same<value_type, typename T::value_type>,
              std::is_same<allocator_type, typename T::allocator_type>,
              std::is_same<typename params_type::is_map_container,
                           typename T::params_type::is_map_container>>,
          int> = 0>
  void merge(btree_container<T> &src) {  // NOLINT
    insert(std::make_move_iterator(src.begin()),
           std::make_move_iterator(src.end()));
    src.clear();
  }

  template <
      typename T,
      typename std::enable_if_t<
          std::conjunction_v<
              std::is_same<value_type, typename T::value_type>,
              std::is_same<allocator_type, typename T::allocator_type>,
              std::is_same<typename params_type::is_map_container,
                           typename T::params_type::is_map_container>>,
          int> = 0>
  void merge(btree_container<T> &&src) {
    merge(src);
  }
};

// A base class for btree_multimap.
template <typename Tree>
class btree_multimap_container : public btree_multiset_container<Tree> {
  using super_type = btree_multiset_container<Tree>;
  using params_type = typename Tree::params_type;

 public:
  using mapped_type = typename params_type::mapped_type;

  // Inherit constructors.
  using super_type::super_type;
  btree_multimap_container() {}
};
} // namespace btree::internal
diff --git a/src/include/cpp-btree/btree_map.h b/src/include/cpp-btree/btree_map.h
new file mode 100644
index 000000000..749c2bbcd
--- /dev/null
+++ b/src/include/cpp-btree/btree_map.h
@@ -0,0 +1,159 @@
// Copyright 2018 The Abseil Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//      https://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and +// limitations under the License. +// +// ----------------------------------------------------------------------------- +// File: btree_map.h +// ----------------------------------------------------------------------------- +// +// This header file defines B-tree maps: sorted associative containers mapping +// keys to values. +// +// * `btree::btree_map<>` +// * `btree::btree_multimap<>` +// +// These B-tree types are similar to the corresponding types in the STL +// (`std::map` and `std::multimap`) and generally conform to the STL interfaces +// of those types. However, because they are implemented using B-trees, they +// are more efficient in most situations. +// +// Unlike `std::map` and `std::multimap`, which are commonly implemented using +// red-black tree nodes, B-tree maps use more generic B-tree nodes able to hold +// multiple values per node. Holding multiple values per node often makes +// B-tree maps perform better than their `std::map` counterparts, because +// multiple entries can be checked within the same cache hit. +// +// However, these types should not be considered drop-in replacements for +// `std::map` and `std::multimap` as there are some API differences, which are +// noted in this header file. +// +// Importantly, insertions and deletions may invalidate outstanding iterators, +// pointers, and references to elements. Such invalidations are typically only +// an issue if insertion and deletion operations are interleaved with the use of +// more than one iterator, pointer, or reference simultaneously. For this +// reason, `insert()` and `erase()` return a valid iterator at the current +// position. 
+ +#pragma once + +#include "btree.h" +#include "btree_container.h" + +namespace btree { + +// btree::btree_map<> +// +// A `btree::btree_map<K, V>` is an ordered associative container of +// unique keys and associated values designed to be a more efficient replacement +// for `std::map` (in most cases). +// +// Keys are sorted using an (optional) comparison function, which defaults to +// `std::less<K>`. +// +// A `btree::btree_map<K, V>` uses a default allocator of +// `std::allocator<std::pair<const K, V>>` to allocate (and deallocate) +// nodes, and construct and destruct values within those nodes. You may +// instead specify a custom allocator `A` (which in turn requires specifying a +// custom comparator `C`) as in `btree::btree_map<K, V, C, A>`. +// +template <typename Key, typename Value, typename Compare = std::less<Key>, + typename Alloc = std::allocator<std::pair<const Key, Value>>> +class btree_map + : public internal::btree_map_container< + internal::btree<internal::map_params< + Key, Value, Compare, Alloc, /*TargetNodeSize=*/256, + /*Multi=*/false>>> { + + using Base = typename btree_map::btree_map_container; + + public: + // Default constructor. + btree_map() = default; + using Base::Base; +}; + +// btree::swap(btree::btree_map<>, btree::btree_map<>) +// +// Swaps the contents of two `btree::btree_map` containers. +template <typename K, typename V, typename C, typename A> +void swap(btree_map<K, V, C, A> &x, btree_map<K, V, C, A> &y) { + return x.swap(y); +} + +// btree::erase_if(btree::btree_map<>, Pred) +// +// Erases all elements that satisfy the predicate pred from the container. 
+template <typename K, typename V, typename C, typename A, typename Pred> +void erase_if(btree_map<K, V, C, A> &map, Pred pred) { + for (auto it = map.begin(); it != map.end();) { + if (pred(*it)) { + it = map.erase(it); + } else { + ++it; + } + } +} + +// btree::btree_multimap +// +// A `btree::btree_multimap<K, V>` is an ordered associative container of +// keys and associated values designed to be a more efficient replacement for +// `std::multimap` (in most cases). Unlike `btree::btree_map`, a B-tree multimap +// allows multiple elements with equivalent keys. +// +// Keys are sorted using an (optional) comparison function, which defaults to +// `std::less<K>`. +// +// A `btree::btree_multimap<K, V>` uses a default allocator of +// `std::allocator<std::pair<const K, V>>` to allocate (and deallocate) +// nodes, and construct and destruct values within those nodes. You may +// instead specify a custom allocator `A` (which in turn requires specifying a +// custom comparator `C`) as in `btree::btree_multimap<K, V, C, A>`. +// +template <typename Key, typename Value, typename Compare = std::less<Key>, + typename Alloc = std::allocator<std::pair<const Key, Value>>> +class btree_multimap + : public internal::btree_multimap_container< + internal::btree<internal::map_params< + Key, Value, Compare, Alloc, /*TargetNodeSize=*/256, + /*Multi=*/true>>> { + using Base = typename btree_multimap::btree_multimap_container; + + public: + btree_multimap() = default; + using Base::Base; +}; + +// btree::swap(btree::btree_multimap<>, btree::btree_multimap<>) +// +// Swaps the contents of two `btree::btree_multimap` containers. +template <typename K, typename V, typename C, typename A> +void swap(btree_multimap<K, V, C, A> &x, btree_multimap<K, V, C, A> &y) { + return x.swap(y); +} + +// btree::erase_if(btree::btree_multimap<>, Pred) +// +// Erases all elements that satisfy the predicate pred from the container. 
// Removes every element of `map` for which `pred` returns true; `pred` is
// evaluated once per element, in iteration order.
template <typename K, typename V, typename C, typename A, typename Pred>
void erase_if(btree_multimap<K, V, C, A> &map, Pred pred) {
  for (auto it = map.begin(); it != map.end();) {
    if (pred(*it)) {
      it = map.erase(it);
    } else {
      ++it;
    }
  }
}

} // namespace btree
diff --git a/src/include/cpp-btree/btree_set.h b/src/include/cpp-btree/btree_set.h
new file mode 100644
index 000000000..57536ce2f
--- /dev/null
+++ b/src/include/cpp-btree/btree_set.h
@@ -0,0 +1,632 @@
// Copyright 2018 The Abseil Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//      https://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
// -----------------------------------------------------------------------------
// File: btree_set.h
// -----------------------------------------------------------------------------
//
// This header file defines B-tree sets: sorted associative containers of
// values.
//
// * `btree::btree_set<>`
// * `btree::btree_multiset<>`
//
// These B-tree types are similar to the corresponding types in the STL
// (`std::set` and `std::multiset`) and generally conform to the STL interfaces
// of those types. However, because they are implemented using B-trees, they
// are more efficient in most situations.
//
// Unlike `std::set` and `std::multiset`, which are commonly implemented using
// red-black tree nodes, B-tree sets use more generic B-tree nodes able to hold
// multiple values per node.
Holding multiple values per node often makes +// B-tree sets perform better than their `std::set` counterparts, because +// multiple entries can be checked within the same cache hit. +// +// However, these types should not be considered drop-in replacements for +// `std::set` and `std::multiset` as there are some API differences, which are +// noted in this header file. +// +// Importantly, insertions and deletions may invalidate outstanding iterators, +// pointers, and references to elements. Such invalidations are typically only +// an issue if insertion and deletion operations are interleaved with the use of +// more than one iterator, pointer, or reference simultaneously. For this +// reason, `insert()` and `erase()` return a valid iterator at the current +// position. + +#pragma once + +#include "btree.h" +#include "btree_container.h" + +namespace btree { + +// btree::btree_set<> +// +// An `btree::btree_set<K>` is an ordered associative container of unique key +// values designed to be a more efficient replacement for `std::set` (in most +// cases). +// +// Keys are sorted using an (optional) comparison function, which defaults to +// `std::less<K>`. +// +// An `btree::btree_set<K>` uses a default allocator of `std::allocator<K>` to +// allocate (and deallocate) nodes, and construct and destruct values within +// those nodes. You may instead specify a custom allocator `A` (which in turn +// requires specifying a custom comparator `C`) as in +// `btree::btree_set<K, C, A>`. 
+// +template <typename Key, typename Compare = std::less<Key>, + typename Alloc = std::allocator<Key>> +class btree_set + : public internal::btree_set_container< + internal::btree<internal::set_params< + Key, Compare, Alloc, /*TargetNodeSize=*/256, + /*Multi=*/false>>> { + using Base = typename btree_set::btree_set_container; + + public: + // Constructors and Assignment Operators + // + // A `btree_set` supports the same overload set as `std::set` + // for construction and assignment: + // + // * Default constructor + // + // btree::btree_set<std::string> set1; + // + // * Initializer List constructor + // + // btree::btree_set<std::string> set2 = + // {{"huey"}, {"dewey"}, {"louie"},}; + // + // * Copy constructor + // + // btree::btree_set<std::string> set3(set2); + // + // * Copy assignment operator + // + // btree::btree_set<std::string> set4; + // set4 = set3; + // + // * Move constructor + // + // // Move is guaranteed efficient + // btree::btree_set<std::string> set5(std::move(set4)); + // + // * Move assignment operator + // + // // May be efficient if allocators are compatible + // btree::btree_set<std::string> set6; + // set6 = std::move(set5); + // + // * Range constructor + // + // std::vector<std::string> v = {"a", "b"}; + // btree::btree_set<std::string> set7(v.begin(), v.end()); + btree_set() {} + using Base::Base; + + // btree_set::begin() + // + // Returns an iterator to the beginning of the `btree_set`. + using Base::begin; + + // btree_set::cbegin() + // + // Returns a const iterator to the beginning of the `btree_set`. + using Base::cbegin; + + // btree_set::end() + // + // Returns an iterator to the end of the `btree_set`. + using Base::end; + + // btree_set::cend() + // + // Returns a const iterator to the end of the `btree_set`. + using Base::cend; + + // btree_set::empty() + // + // Returns whether or not the `btree_set` is empty. 
+ using Base::empty; + + // btree_set::max_size() + // + // Returns the largest theoretical possible number of elements within a + // `btree_set` under current memory constraints. This value can be thought + // of as the largest value of `std::distance(begin(), end())` for a + // `btree_set<Key>`. + using Base::max_size; + + // btree_set::size() + // + // Returns the number of elements currently within the `btree_set`. + using Base::size; + + // btree_set::clear() + // + // Removes all elements from the `btree_set`. Invalidates any references, + // pointers, or iterators referring to contained elements. + using Base::clear; + + // btree_set::erase() + // + // Erases elements within the `btree_set`. Overloads are listed below. + // + // iterator erase(iterator position): + // iterator erase(const_iterator position): + // + // Erases the element at `position` of the `btree_set`, returning + // the iterator pointing to the element after the one that was erased + // (or end() if none exists). + // + // iterator erase(const_iterator first, const_iterator last): + // + // Erases the elements in the open interval [`first`, `last`), returning + // the iterator pointing to the element after the interval that was erased + // (or end() if none exists). + // + // template <typename K> size_type erase(const K& key): + // + // Erases the element with the matching key, if it exists, returning the + // number of elements erased. + using Base::erase; + + // btree_set::insert() + // + // Inserts an element of the specified value into the `btree_set`, + // returning an iterator pointing to the newly inserted element, provided that + // an element with the given key does not already exist. If an insertion + // occurs, any references, pointers, or iterators are invalidated. + // Overloads are listed below. + // + // std::pair<iterator,bool> insert(const value_type& value): + // + // Inserts a value into the `btree_set`. 
Returns a pair consisting of an + // iterator to the inserted element (or to the element that prevented the + // insertion) and a bool denoting whether the insertion took place. + // + // std::pair<iterator,bool> insert(value_type&& value): + // + // Inserts a moveable value into the `btree_set`. Returns a pair + // consisting of an iterator to the inserted element (or to the element that + // prevented the insertion) and a bool denoting whether the insertion took + // place. + // + // iterator insert(const_iterator hint, const value_type& value): + // iterator insert(const_iterator hint, value_type&& value): + // + // Inserts a value, using the position of `hint` as a non-binding suggestion + // for where to begin the insertion search. Returns an iterator to the + // inserted element, or to the existing element that prevented the + // insertion. + // + // void insert(InputIterator first, InputIterator last): + // + // Inserts a range of values [`first`, `last`). + // + // void insert(std::initializer_list<init_type> ilist): + // + // Inserts the elements within the initializer list `ilist`. + using Base::insert; + + // btree_set::emplace() + // + // Inserts an element of the specified value by constructing it in-place + // within the `btree_set`, provided that no element with the given key + // already exists. + // + // The element may be constructed even if there already is an element with the + // key in the container, in which case the newly constructed element will be + // destroyed immediately. + // + // If an insertion occurs, any references, pointers, or iterators are + // invalidated. + using Base::emplace; + + // btree_set::emplace_hint() + // + // Inserts an element of the specified value by constructing it in-place + // within the `btree_set`, using the position of `hint` as a non-binding + // suggestion for where to begin the insertion search, and only inserts + // provided that no element with the given key already exists. 
+ // + // The element may be constructed even if there already is an element with the + // key in the container, in which case the newly constructed element will be + // destroyed immediately. + // + // If an insertion occurs, any references, pointers, or iterators are + // invalidated. + using Base::emplace_hint; + + // btree_set::merge() + // + // Extracts elements from a given `source` btree_set into this + // `btree_set`. If the destination `btree_set` already contains an + // element with an equivalent key, that element is not extracted. + using Base::merge; + + // btree_set::swap(btree_set& other) + // + // Exchanges the contents of this `btree_set` with those of the `other` + // btree_set, avoiding invocation of any move, copy, or swap operations on + // individual elements. + // + // All iterators and references on the `btree_set` remain valid, excepting + // for the past-the-end iterator, which is invalidated. + using Base::swap; + + // btree_set::contains() + // + // template <typename K> bool contains(const K& key) const: + // + // Determines whether an element comparing equal to the given `key` exists + // within the `btree_set`, returning `true` if so or `false` otherwise. + // + // Supports heterogeneous lookup, provided that the set is provided a + // compatible heterogeneous comparator. + using Base::contains; + + // btree_set::count() + // + // template <typename K> size_type count(const K& key) const: + // + // Returns the number of elements comparing equal to the given `key` within + // the `btree_set`. Note that this function will return either `1` or `0` + // since duplicate elements are not allowed within a `btree_set`. + // + // Supports heterogeneous lookup, provided that the set is provided a + // compatible heterogeneous comparator. 
+ using Base::count; + + // btree_set::equal_range() + // + // Returns a closed range [first, last], defined by a `std::pair` of two + // iterators, containing all elements with the passed key in the + // `btree_set`. + using Base::equal_range; + + // btree_set::find() + // + // template <typename K> iterator find(const K& key): + // template <typename K> const_iterator find(const K& key) const: + // + // Finds an element with the passed `key` within the `btree_set`. + // + // Supports heterogeneous lookup, provided that the set is provided a + // compatible heterogeneous comparator. + using Base::find; + + // btree_set::get_allocator() + // + // Returns the allocator function associated with this `btree_set`. + using Base::get_allocator; + + // btree_set::key_comp(); + // + // Returns the key comparator associated with this `btree_set`. + using Base::key_comp; + + // btree_set::value_comp(); + // + // Returns the value comparator associated with this `btree_set`. The keys to + // sort the elements are the values themselves, therefore `value_comp` and its + // sibling member function `key_comp` are equivalent. + using Base::value_comp; +}; + +// btree::swap(btree::btree_set<>, btree::btree_set<>) +// +// Swaps the contents of two `btree::btree_set` containers. +template <typename K, typename C, typename A> +void swap(btree_set<K, C, A> &x, btree_set<K, C, A> &y) { + return x.swap(y); +} + +// btree::erase_if(btree::btree_set<>, Pred) +// +// Erases all elements that satisfy the predicate pred from the container. +template <typename K, typename C, typename A, typename Pred> +void erase_if(btree_set<K, C, A> &set, Pred pred) { + for (auto it = set.begin(); it != set.end();) { + if (pred(*it)) { + it = set.erase(it); + } else { + ++it; + } + } +} + +// btree::btree_multiset<> +// +// An `btree::btree_multiset<K>` is an ordered associative container of +// keys and associated values designed to be a more efficient replacement +// for `std::multiset` (in most cases). 
Unlike `btree::btree_set`, a B-tree +// multiset allows equivalent elements. +// +// Keys are sorted using an (optional) comparison function, which defaults to +// `std::less<K>`. +// +// An `btree::btree_multiset<K>` uses a default allocator of `std::allocator<K>` +// to allocate (and deallocate) nodes, and construct and destruct values within +// those nodes. You may instead specify a custom allocator `A` (which in turn +// requires specifying a custom comparator `C`) as in +// `btree::btree_multiset<K, C, A>`. +// +template <typename Key, typename Compare = std::less<Key>, + typename Alloc = std::allocator<Key>> +class btree_multiset + : public internal::btree_multiset_container< + internal::btree<internal::set_params< + Key, Compare, Alloc, /*TargetNodeSize=*/256, + /*Multi=*/true>>> { + using Base = typename btree_multiset::btree_multiset_container; + + public: + // Constructors and Assignment Operators + // + // A `btree_multiset` supports the same overload set as `std::set` + // for construction and assignment: + // + // * Default constructor + // + // btree::btree_multiset<std::string> set1; + // + // * Initializer List constructor + // + // btree::btree_multiset<std::string> set2 = + // {{"huey"}, {"dewey"}, {"louie"},}; + // + // * Copy constructor + // + // btree::btree_multiset<std::string> set3(set2); + // + // * Copy assignment operator + // + // btree::btree_multiset<std::string> set4; + // set4 = set3; + // + // * Move constructor + // + // // Move is guaranteed efficient + // btree::btree_multiset<std::string> set5(std::move(set4)); + // + // * Move assignment operator + // + // // May be efficient if allocators are compatible + // btree::btree_multiset<std::string> set6; + // set6 = std::move(set5); + // + // * Range constructor + // + // std::vector<std::string> v = {"a", "b"}; + // btree::btree_multiset<std::string> set7(v.begin(), v.end()); + btree_multiset() {} + using Base::Base; + + // btree_multiset::begin() + // + // Returns an iterator to 
the beginning of the `btree_multiset`. + using Base::begin; + + // btree_multiset::cbegin() + // + // Returns a const iterator to the beginning of the `btree_multiset`. + using Base::cbegin; + + // btree_multiset::end() + // + // Returns an iterator to the end of the `btree_multiset`. + using Base::end; + + // btree_multiset::cend() + // + // Returns a const iterator to the end of the `btree_multiset`. + using Base::cend; + + // btree_multiset::empty() + // + // Returns whether or not the `btree_multiset` is empty. + using Base::empty; + + // btree_multiset::max_size() + // + // Returns the largest theoretical possible number of elements within a + // `btree_multiset` under current memory constraints. This value can be + // thought of as the largest value of `std::distance(begin(), end())` for a + // `btree_multiset<Key>`. + using Base::max_size; + + // btree_multiset::size() + // + // Returns the number of elements currently within the `btree_multiset`. + using Base::size; + + // btree_multiset::clear() + // + // Removes all elements from the `btree_multiset`. Invalidates any references, + // pointers, or iterators referring to contained elements. + using Base::clear; + + // btree_multiset::erase() + // + // Erases elements within the `btree_multiset`. Overloads are listed below. + // + // iterator erase(iterator position): + // iterator erase(const_iterator position): + // + // Erases the element at `position` of the `btree_multiset`, returning + // the iterator pointing to the element after the one that was erased + // (or end() if none exists). + // + // iterator erase(const_iterator first, const_iterator last): + // + // Erases the elements in the open interval [`first`, `last`), returning + // the iterator pointing to the element after the interval that was erased + // (or end() if none exists). 
+ // + // template <typename K> size_type erase(const K& key): + // + // Erases the elements matching the key, if any exist, returning the + // number of elements erased. + using Base::erase; + + // btree_multiset::insert() + // + // Inserts an element of the specified value into the `btree_multiset`, + // returning an iterator pointing to the newly inserted element. + // Any references, pointers, or iterators are invalidated. Overloads are + // listed below. + // + // iterator insert(const value_type& value): + // + // Inserts a value into the `btree_multiset`, returning an iterator to the + // inserted element. + // + // iterator insert(value_type&& value): + // + // Inserts a moveable value into the `btree_multiset`, returning an iterator + // to the inserted element. + // + // iterator insert(const_iterator hint, const value_type& value): + // iterator insert(const_iterator hint, value_type&& value): + // + // Inserts a value, using the position of `hint` as a non-binding suggestion + // for where to begin the insertion search. Returns an iterator to the + // inserted element. + // + // void insert(InputIterator first, InputIterator last): + // + // Inserts a range of values [`first`, `last`). + // + // void insert(std::initializer_list<init_type> ilist): + // + // Inserts the elements within the initializer list `ilist`. + using Base::insert; + + // btree_multiset::emplace() + // + // Inserts an element of the specified value by constructing it in-place + // within the `btree_multiset`. Any references, pointers, or iterators are + // invalidated. + using Base::emplace; + + // btree_multiset::emplace_hint() + // + // Inserts an element of the specified value by constructing it in-place + // within the `btree_multiset`, using the position of `hint` as a non-binding + // suggestion for where to begin the insertion search. + // + // Any references, pointers, or iterators are invalidated. 
+ using Base::emplace_hint; + + // btree_multiset::merge() + // + // Extracts elements from a given `source` btree_multiset into this + // `btree_multiset`. If the destination `btree_multiset` already contains an + // element with an equivalent key, that element is not extracted. + using Base::merge; + + // btree_multiset::swap(btree_multiset& other) + // + // Exchanges the contents of this `btree_multiset` with those of the `other` + // btree_multiset, avoiding invocation of any move, copy, or swap operations + // on individual elements. + // + // All iterators and references on the `btree_multiset` remain valid, + // excepting for the past-the-end iterator, which is invalidated. + using Base::swap; + + // btree_multiset::contains() + // + // template <typename K> bool contains(const K& key) const: + // + // Determines whether an element comparing equal to the given `key` exists + // within the `btree_multiset`, returning `true` if so or `false` otherwise. + // + // Supports heterogeneous lookup, provided that the set is provided a + // compatible heterogeneous comparator. + using Base::contains; + + // btree_multiset::count() + // + // template <typename K> size_type count(const K& key) const: + // + // Returns the number of elements comparing equal to the given `key` within + // the `btree_multiset`. + // + // Supports heterogeneous lookup, provided that the set is provided a + // compatible heterogeneous comparator. + using Base::count; + + // btree_multiset::equal_range() + // + // Returns a closed range [first, last], defined by a `std::pair` of two + // iterators, containing all elements with the passed key in the + // `btree_multiset`. + using Base::equal_range; + + // btree_multiset::find() + // + // template <typename K> iterator find(const K& key): + // template <typename K> const_iterator find(const K& key) const: + // + // Finds an element with the passed `key` within the `btree_multiset`. 
+ // + // Supports heterogeneous lookup, provided that the set is provided a + // compatible heterogeneous comparator. + using Base::find; + + // btree_multiset::get_allocator() + // + // Returns the allocator function associated with this `btree_multiset`. + using Base::get_allocator; + + // btree_multiset::key_comp(); + // + // Returns the key comparator associated with this `btree_multiset`. + using Base::key_comp; + + // btree_multiset::value_comp(); + // + // Returns the value comparator associated with this `btree_multiset`. The + // keys to sort the elements are the values themselves, therefore `value_comp` + // and its sibling member function `key_comp` are equivalent. + using Base::value_comp; +}; + +// btree::swap(btree::btree_multiset<>, btree::btree_multiset<>) +// +// Swaps the contents of two `btree::btree_multiset` containers. +template <typename K, typename C, typename A> +void swap(btree_multiset<K, C, A> &x, btree_multiset<K, C, A> &y) { + return x.swap(y); +} + +// btree::erase_if(btree::btree_multiset<>, Pred) +// +// Erases all elements that satisfy the predicate pred from the container. +template <typename K, typename C, typename A, typename Pred> +void erase_if(btree_multiset<K, C, A> &set, Pred pred) { + for (auto it = set.begin(); it != set.end();) { + if (pred(*it)) { + it = set.erase(it); + } else { + ++it; + } + } +} + +} // namespace btree diff --git a/src/include/cpp_lib_backport.h b/src/include/cpp_lib_backport.h new file mode 100644 index 000000000..ea956c446 --- /dev/null +++ b/src/include/cpp_lib_backport.h @@ -0,0 +1,30 @@ +#pragma once + +#include <cstring> +#include <type_traits> + +namespace std { + +#ifndef __cpp_lib_bit_cast +#define __cpp_lib_bit_cast 201806L + +/// Create a value of type `To` from the bits of `from`. 
+template<typename To, typename From> +requires (sizeof(To) == sizeof(From)) && + std::is_trivially_copyable_v<From> && + std::is_trivially_copyable_v<To> +[[nodiscard]] constexpr To +bit_cast(const From& from) noexcept { +#if __has_builtin(__builtin_bit_cast) + return __builtin_bit_cast(To, from); +#else + static_assert(std::is_trivially_constructible_v<To>); + To to; + std::memcpy(&to, &from, sizeof(To)); + return to; +#endif +} + +#endif // __cpp_lib_bit_cast + +} // namespace std diff --git a/src/include/crc32c.h b/src/include/crc32c.h new file mode 100644 index 000000000..dd4ede666 --- /dev/null +++ b/src/include/crc32c.h @@ -0,0 +1,57 @@ +#ifndef CEPH_CRC32C_H +#define CEPH_CRC32C_H + +#include <stdint.h> + +#ifdef __cplusplus +extern "C" { +#endif + +typedef uint32_t (*ceph_crc32c_func_t)(uint32_t crc, unsigned char const *data, unsigned length); + +/* + * this is a static global with the chosen crc32c implementation for + * the given architecture. + */ +extern ceph_crc32c_func_t ceph_crc32c_func; + +extern ceph_crc32c_func_t ceph_choose_crc32(void); + +/** + * calculate crc32c for data that is entirely 0 (ZERO) + * + * Note: works the same as ceph_crc32c_func for data == nullptr, + * but faster than the optimized assembly on certain architectures. + * This is faster than intel optimized assembly, but not as fast as + * ppc64le optimized assembly. + * + * @param crc initial value + * @param length length of buffer + */ +uint32_t ceph_crc32c_zeros(uint32_t crc, unsigned length); + +/** + * calculate crc32c + * + * Note: if the data pointer is NULL, we calculate a crc value as if + * it were zero-filled. 
+ * + * @param crc initial value + * @param data pointer to data buffer + * @param length length of buffer + */ +static inline uint32_t ceph_crc32c(uint32_t crc, unsigned char const *data, unsigned length) +{ +#ifndef HAVE_POWER8 + if (!data && length > 16) + return ceph_crc32c_zeros(crc, length); +#endif /* HAVE_POWER8 */ + + return ceph_crc32c_func(crc, data, length); +} + +#ifdef __cplusplus +} +#endif + +#endif diff --git a/src/include/demangle.h b/src/include/demangle.h new file mode 100644 index 000000000..9e46d952f --- /dev/null +++ b/src/include/demangle.h @@ -0,0 +1,48 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2016 Allen Samuels <allen.samuels@sandisk.com> + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. + * + */ + +#ifndef CEPH_INCLUDE_DEMANGLE +#define CEPH_INCLUDE_DEMANGLE + +//// Stole this code from http://stackoverflow.com/questions/281818/unmangling-the-result-of-stdtype-infoname +#ifdef __GNUG__ +#include <cstdlib> +#include <memory> +#include <cxxabi.h> + +static std::string ceph_demangle(const char* name) +{ + int status = -4; // some arbitrary value to eliminate the compiler warning + + // enable c++11 by passing the flag -std=c++11 to g++ + std::unique_ptr<char, void(*)(void*)> res { + abi::__cxa_demangle(name, NULL, NULL, &status), + std::free + }; + + return (status == 0) ? 
res.get() : name ; +} + +#else + +// does nothing if not g++ +static std::string demangle(const char* name) +{ + return name; +} + +#endif + + +#endif diff --git a/src/include/denc.h b/src/include/denc.h new file mode 100644 index 000000000..d075dd518 --- /dev/null +++ b/src/include/denc.h @@ -0,0 +1,1895 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2016 Allen Samuels <allen.samuels@sandisk.com> + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. + * + */ + +// If you #include "include/encoding.h" you get the old-style *and* +// the new-style definitions. (The old-style needs denc_traits<> in +// order to disable the container helpers when new-style traits are +// present.) + +// You can also just #include "include/denc.h" and get only the +// new-style helpers. The eventual goal is to drop the legacy +// definitions. 
#ifndef _ENC_DEC_H
#define _ENC_DEC_H

#include <array>
#include <bit>
#include <cstring>
#include <concepts>
#include <map>
#include <optional>
#include <set>
#include <string>
#include <type_traits>
#include <vector>

#include <boost/container/flat_map.hpp>
#include <boost/container/flat_set.hpp>
#include <boost/container/small_vector.hpp>
#include <boost/intrusive/set.hpp>
#include <boost/optional.hpp>

#include "include/cpp_lib_backport.h"
#include "include/compat.h"
#include "include/int_types.h"
#include "include/scope_guard.h"

#include "buffer.h"
#include "byteorder.h"

#include "common/convenience.h"
#include "common/error_code.h"

// Primary template: a type with no specialization is not denc-encodable.
// Specializations flip `supported` and provide bound_encode/encode/decode;
// `featured` means the codec takes a feature bitmask, `bounded` means
// bound_encode gives an exact upper bound, and `need_contiguous` means
// decode requires a contiguous buffer.
template<typename T, typename=void>
struct denc_traits {
  static constexpr bool supported = false;
  static constexpr bool featured = false;
  static constexpr bool bounded = false;
  static constexpr bool need_contiguous = true;
};

// Convenience variable template for denc_traits<T>::supported.
template<typename T>
inline constexpr bool denc_supported = denc_traits<T>::supported;


// hack for debug only; FIXME
//#include <iostream>
//using std::cout;

// Define this to compile in a dump of all encoded objects to disk to
// populate ceph-object-corpus. Note that there is an almost
// identical implementation in encoding.h, but you only need to define
// ENCODE_DUMP_PATH here.
//
// See src/test/encoding/generate-corpus-objects.sh.
//
//#define ENCODE_DUMP_PATH /tmp/something

#ifdef ENCODE_DUMP_PATH
# include <cstdio>
# include <sys/types.h>
# include <sys/stat.h>
# include <fcntl.h>

# define ENCODE_STR(x) #x
# define ENCODE_STRINGIFY(x) ENCODE_STR(x)

// RAII helper instantiated by DENC_DUMP_PRE: snapshots the appender state on
// construction and, on destruction, writes the bytes encoded in between to a
// file under ENCODE_DUMP_PATH (sampled, not every encode).
template<typename T>
class DencDumper {
public:
  DencDumper(const char* name,
	     const ceph::bufferlist::contiguous_appender& appender)
    : name{name},
      appender{appender},
      bl_offset{appender.bl.length()},
      space_offset{space_size()},
      start{appender.get_pos()}
  {}
  ~DencDumper() {
    if (do_sample()) {
      dump();
    }
  }
private:
  // Decide whether this encode should be dumped; samples a decreasing
  // fraction of calls as the counter grows (popcount-based heuristic).
  static bool do_sample() {
    // this hackery with bits below is just to get a semi-reasonable
    // distribution across time. it is somewhat exponential but not
    // quite.
    i++;
    int bits = 0;
    for (unsigned t = i; t; bits++)
      t &= t - 1;
    return bits <= 2;
  }
  // Bytes reserved in the appender but not yet accounted in the bufferlist.
  size_t space_size() const {
    return appender.get_logical_offset() - appender.get_out_of_band_offset();
  }
  // Write the newly-encoded bytes to ENCODE_DUMP_PATH/<name>__<pid>.<n>.
  // Failures are silently ignored; this is debug-only tooling.
  void dump() const {
    char fn[PATH_MAX];
    ::snprintf(fn, sizeof(fn),
	       ENCODE_STRINGIFY(ENCODE_DUMP_PATH) "/%s__%d.%x", name,
	       getpid(), i++);
    int fd = ::open(fn, O_WRONLY|O_TRUNC|O_CREAT|O_CLOEXEC|O_BINARY, 0644);
    if (fd < 0) {
      return;
    }
    auto close_fd = make_scope_guard([fd] { ::close(fd); });
    if (auto bl_delta = appender.bl.length() - bl_offset; bl_delta > 0) {
      // The bufferlist grew since construction: stitch together the flushed
      // portion and the bytes still sitting in the appender's buffer.
      ceph::bufferlist dump_bl;
      appender.bl.begin(bl_offset + space_offset).copy(bl_delta - space_offset, dump_bl);
      const size_t space_len = space_size();
      dump_bl.append(appender.get_pos() - space_len, space_len);
      dump_bl.write_fd(fd);
    } else {
      // Everything is still in the appender's contiguous buffer.
      size_t len = appender.get_pos() - start;
      [[maybe_unused]] int r = ::write(fd, start, len);
    }
  }
  const char* name;
  const ceph::bufferlist::contiguous_appender& appender;
  const size_t bl_offset;
  const size_t space_offset;
  const char* start;
  static int i;  // shared sample counter across all instances of this T
};

template<typename T> int DencDumper<T>::i = 0;

# define DENC_DUMP_PRE(Type) \
  DencDumper<Type> _denc_dumper{#Type, p};
+#else +# define DENC_DUMP_PRE(Type) +#endif + + +/* + + top level level functions look like so + ====================================== + + inline void denc(const T& o, size_t& p, uint64_t features=0); + inline void denc(const T& o, ceph::buffer::list::contiguous_appender& p, + uint64_t features=0); + inline void denc(T& o, ceph::buffer::ptr::const_iterator& p, uint64_t features=0); + + or (for featured objects) + + inline void denc(const T& o, size_t& p, uint64_t features); + inline void denc(const T& o, ceph::buffer::list::contiguous_appender& p, + uint64_t features); + inline void denc(T& o, ceph::buffer::ptr::const_iterator& p, uint64_t features); + + - These are symmetrical, so that they can be used from the magic DENC + method of writing the bound_encode/encode/decode methods all in one go; + they differ only in the type of p. + + - These are automatically fabricated via a template that calls into + the denc_traits<> methods (see below), provided denc_traits<T>::supported + is defined and true. They never need to be written explicitly. 
+ + + static denc_traits<> definitions look like so + ============================================= + + template<> + struct denc_traits<T> { + static constexpr bool supported = true; + static constexpr bool bounded = false; + static constexpr bool featured = false; + static constexpr bool need_contiguous = true; + static void bound_encode(const T &o, size_t& p, uint64_t f=0); + static void encode(const T &o, ceph::buffer::list::contiguous_appender& p, + uint64_t f=0); + static void decode(T& o, ceph::buffer::ptr::const_iterator &p, uint64_t f=0); + }; + + or (for featured objects) + + template<> + struct denc_traits<T> { + static constexpr bool supported = true; + static constexpr bool bounded = false; + static constexpr bool featured = true; + static constexpr bool need_contiguous = true; + static void bound_encode(const T &o, size_t& p, uint64_t f); + static void encode(const T &o, ceph::buffer::list::contiguous_appender& p, + uint64_t f); + static void decode(T& o, ceph::buffer::ptr::const_iterator &p, uint64_t f=0); + }; + + - denc_traits<T> is normally declared via the WRITE_CLASS_DENC(type) macro, + which is used in place of the old-style WRITE_CLASS_ENCODER(type) macro. + There are _FEATURED and _BOUNDED variants. The class traits simply call + into class methods of the same name (see below). + + - denc_traits<T> can also be written explicitly for some type to indicate + how it should be encoded. This is the "source of truth" for how a type + is encoded. + + - denc_traits<T> are declared for the base integer types, string, ceph::buffer::ptr, + and ceph::buffer::list base types. + + - denc_traits<std::foo<T>>-like traits are declared for standard container + types. 
+ + + class methods look like so + ========================== + + void bound_encode(size_t& p) const; + void encode(ceph::buffer::list::contiguous_appender& p) const; + void decode(ceph::buffer::ptr::const_iterator &p); + + or (for featured objects) + + void bound_encode(size_t& p, uint64_t f) const; + void encode(ceph::buffer::list::contiguous_appender& p, uint64_t f) const; + void decode(ceph::buffer::ptr::const_iterator &p); + + - These are normally invoked by the denc_traits<> methods that are + declared via WRITE_CLASS_DENC, although you can also invoke them explicitly + in your code. + + - These methods are optimised for contiguous buffer, but denc() will try + rebuild a contigous one if the decoded ceph::buffer::list is segmented. If you are + concerned about the cost, you might want to define yet another method: + + void decode(ceph::buffer::list::iterator &p); + + - These can be defined either explicitly (as above), or can be "magically" + defined all in one go using the DENC macro and DENC_{START,FINISH} helpers + (which work like the legacy {ENCODE,DECODE}_{START,FINISH} macros): + + class foo_t { + ... + DENC(foo_t, v, p) { + DENC_START(1, 1, p); + denc(v.foo, p); + denc(v.bar, p); + denc(v.baz, p); + DENC_FINISH(p); + } + ... + }; + WRITE_CLASS_DENC(foo_t) + + */ + +// --------------------------------------------------------------------- +// raw types +namespace _denc { +template<typename T, typename... 
U>
concept is_any_of = (std::same_as<T, U> || ...);

// Map an enum to its underlying integral type; identity for non-enums.
template<typename T, typename=void> struct underlying_type {
  using type = T;
};
template<typename T>
struct underlying_type<T, std::enable_if_t<std::is_enum_v<T>>> {
  using type = std::underlying_type_t<T>;
};
template<typename T>
using underlying_type_t = typename underlying_type<T>::type;
}

// An iterator over const bytes: get_pos_add() hands back a const char*.
template<class It>
concept is_const_iterator = requires(It& it, size_t n) {
  { it.get_pos_add(n) } -> std::same_as<const char*>;
};

// Reinterpret the next sizeof(T) bytes of the iterator as a T and advance.
// Const overload (reading) ...
template<typename T, is_const_iterator It>
const T& get_pos_add(It& i) {
  return *reinterpret_cast<const T*>(i.get_pos_add(sizeof(T)));
}

// ... and mutable overload (writing into an appender).
template<typename T, class It>
requires (!is_const_iterator<It>)
T& get_pos_add(It& i) {
  return *reinterpret_cast<T*>(i.get_pos_add(sizeof(T)));
}

// Traits for types whose in-memory representation is already the wire
// representation (little-endian wrappers and single bytes): encode/decode is
// a fixed-size byte copy.
template<typename T>
requires _denc::is_any_of<_denc::underlying_type_t<T>,
			  ceph_le64, ceph_le32, ceph_le16, uint8_t
#ifndef _CHAR_IS_SIGNED
			  , int8_t
#endif
			  >
struct denc_traits<T> {
  static constexpr bool supported = true;
  static constexpr bool featured = false;
  static constexpr bool bounded = true;
  static constexpr bool need_contiguous = false;
  static void bound_encode(const T &o, size_t& p, uint64_t f=0) {
    p += sizeof(T);
  }
  template<class It>
  requires (!is_const_iterator<It>)
  static void encode(const T &o, It& p, uint64_t f=0) {
    get_pos_add<T>(p) = o;
  }
  template<is_const_iterator It>
  static void decode(T& o, It& p, uint64_t f=0) {
    o = get_pos_add<T>(p);
  }
  static void decode(T& o, ceph::buffer::list::const_iterator &p) {
    p.copy(sizeof(T), reinterpret_cast<char*>(&o));
  }
};


// -----------------------------------------------------------------------
// integer types

// itype == internal type
// otype == external type, i.e., the type on the wire

// NOTE: the overload resolution ensures that the legacy encode/decode methods
// defined for int types are preferred to the ones defined using the specialized
// template, and hence get selected. This machinery prevents these from
// getting glued into the legacy encode/decode methods; the overhead of setting
// up a contiguous_appender etc is likely to be slower.
namespace _denc {

// Map a native integer type to its little-endian wire type; `void` means
// "no mapping" and disables the denc_traits specialization below.
template<typename T> struct ExtType {
  using type = void;
};

template<typename T>
requires _denc::is_any_of<T,
			  int16_t, uint16_t>
struct ExtType<T> {
  using type = ceph_le16;
};

template<typename T>
requires _denc::is_any_of<T,
			  int32_t, uint32_t>
struct ExtType<T> {
  using type = ceph_le32;
};

template<typename T>
requires _denc::is_any_of<T,
			  int64_t, uint64_t>
struct ExtType<T> {
  using type = ceph_le64;
};

template<>
struct ExtType<bool> {
  using type = uint8_t;
};
template<typename T>
using ExtType_t = typename ExtType<T>::type;
} // namespace _denc

// Traits for native integers (and bool): convert through the little-endian
// external type on encode/decode so the wire format is endian-stable.
template<typename T>
requires (!std::is_void_v<_denc::ExtType_t<T>>)
struct denc_traits<T>
{
  static constexpr bool supported = true;
  static constexpr bool featured = false;
  static constexpr bool bounded = true;
  static constexpr bool need_contiguous = false;
  using etype = _denc::ExtType_t<T>;
  static void bound_encode(const T &o, size_t& p, uint64_t f=0) {
    p += sizeof(etype);
  }
  template<class It>
  requires (!is_const_iterator<It>)
  static void encode(const T &o, It& p, uint64_t f=0) {
    get_pos_add<etype>(p) = o;
  }
  template<is_const_iterator It>
  static void decode(T& o, It &p, uint64_t f=0) {
    o = get_pos_add<etype>(p);
  }
  static void decode(T& o, ceph::buffer::list::const_iterator &p) {
    etype e;
    p.copy(sizeof(etype), reinterpret_cast<char*>(&e));
    o = e;
  }
};

// varint
//
// high bit of each byte indicates another byte follows.
+template<typename T> +inline void denc_varint(T v, size_t& p) { + p += sizeof(T) + 1; +} + +template<typename T> +inline void denc_varint(T v, ceph::buffer::list::contiguous_appender& p) { + uint8_t byte = v & 0x7f; + v >>= 7; + while (v) { + byte |= 0x80; + get_pos_add<__u8>(p) = byte; + byte = (v & 0x7f); + v >>= 7; + } + get_pos_add<__u8>(p) = byte; +} + +template<typename T> +inline void denc_varint(T& v, ceph::buffer::ptr::const_iterator& p) { + uint8_t byte = *(__u8*)p.get_pos_add(1); + v = byte & 0x7f; + int shift = 7; + while (byte & 0x80) { + byte = get_pos_add<__u8>(p); + v |= (T)(byte & 0x7f) << shift; + shift += 7; + } +} + + +// signed varint encoding +// +// low bit = 1 = negative, 0 = positive +// high bit of every byte indicates whether another byte follows. +inline void denc_signed_varint(int64_t v, size_t& p) { + p += sizeof(v) + 2; +} +template<class It> +requires (!is_const_iterator<It>) +void denc_signed_varint(int64_t v, It& p) { + if (v < 0) { + v = (-v << 1) | 1; + } else { + v <<= 1; + } + denc_varint(v, p); +} + +template<typename T, is_const_iterator It> +inline void denc_signed_varint(T& v, It& p) +{ + int64_t i = 0; + denc_varint(i, p); + if (i & 1) { + v = -(i >> 1); + } else { + v = i >> 1; + } +} + +// varint + lowz encoding +// +// first(low) 2 bits = how many low zero bits (nibbles) +// high bit of each byte = another byte follows +// (so, 5 bits data in first byte, 7 bits data thereafter) +inline void denc_varint_lowz(uint64_t v, size_t& p) { + p += sizeof(v) + 2; +} +inline void denc_varint_lowz(uint64_t v, + ceph::buffer::list::contiguous_appender& p) { + int lowznib = v ? 
(std::countr_zero(v) / 4) : 0; + if (lowznib > 3) + lowznib = 3; + v >>= lowznib * 4; + v <<= 2; + v |= lowznib; + denc_varint(v, p); +} + +template<typename T> +inline void denc_varint_lowz(T& v, ceph::buffer::ptr::const_iterator& p) +{ + uint64_t i = 0; + denc_varint(i, p); + int lowznib = (i & 3); + i >>= 2; + i <<= lowznib * 4; + v = i; +} + +// signed varint + lowz encoding +// +// first low bit = 1 for negative, 0 for positive +// next 2 bits = how many low zero bits (nibbles) +// high bit of each byte = another byte follows +// (so, 4 bits data in first byte, 7 bits data thereafter) +inline void denc_signed_varint_lowz(int64_t v, size_t& p) { + p += sizeof(v) + 2; +} +template<class It> +requires (!is_const_iterator<It>) +inline void denc_signed_varint_lowz(int64_t v, It& p) { + bool negative = false; + if (v < 0) { + v = -v; + negative = true; + } + unsigned lowznib = v ? (std::countr_zero(std::bit_cast<uint64_t>(v)) / 4) : 0u; + if (lowznib > 3) + lowznib = 3; + v >>= lowznib * 4; + v <<= 3; + v |= lowznib << 1; + v |= (int)negative; + denc_varint(v, p); +} + +template<typename T, is_const_iterator It> +inline void denc_signed_varint_lowz(T& v, It& p) +{ + int64_t i = 0; + denc_varint(i, p); + int lowznib = (i & 6) >> 1; + if (i & 1) { + i >>= 3; + i <<= lowznib * 4; + v = -i; + } else { + i >>= 3; + i <<= lowznib * 4; + v = i; + } +} + + +// LBA +// +// first 1-3 bits = how many low zero bits +// *0 = 12 (common 4 K alignment case) +// *01 = 16 +// *011 = 20 +// *111 = byte +// then 28-30 bits of data +// then last bit = another byte follows +// high bit of each subsequent byte = another byte follows +inline void denc_lba(uint64_t v, size_t& p) { + p += sizeof(v) + 2; +} + +template<class It> +requires (!is_const_iterator<It>) +inline void denc_lba(uint64_t v, It& p) { + int low_zero_nibbles = v ? 
std::countr_zero(v) / 4 : 0; + int pos; + uint32_t word; + int t = low_zero_nibbles - 3; + if (t < 0) { + pos = 3; + word = 0x7; + } else if (t < 3) { + v >>= (low_zero_nibbles * 4); + pos = t + 1; + word = (1 << t) - 1; + } else { + v >>= 20; + pos = 3; + word = 0x3; + } + word |= (v << pos) & 0x7fffffff; + v >>= 31 - pos; + if (!v) { + *(ceph_le32*)p.get_pos_add(sizeof(uint32_t)) = word; + return; + } + word |= 0x80000000; + *(ceph_le32*)p.get_pos_add(sizeof(uint32_t)) = word; + uint8_t byte = v & 0x7f; + v >>= 7; + while (v) { + byte |= 0x80; + *(__u8*)p.get_pos_add(1) = byte; + byte = (v & 0x7f); + v >>= 7; + } + *(__u8*)p.get_pos_add(1) = byte; +} + +template<is_const_iterator It> +inline void denc_lba(uint64_t& v, It& p) { + uint32_t word = *(ceph_le32*)p.get_pos_add(sizeof(uint32_t)); + int shift = 0; + switch (word & 7) { + case 0: + case 2: + case 4: + case 6: + v = (uint64_t)(word & 0x7ffffffe) << (12 - 1); + shift = 12 + 30; + break; + case 1: + case 5: + v = (uint64_t)(word & 0x7ffffffc) << (16 - 2); + shift = 16 + 29; + break; + case 3: + v = (uint64_t)(word & 0x7ffffff8) << (20 - 3); + shift = 20 + 28; + break; + case 7: + v = (uint64_t)(word & 0x7ffffff8) >> 3; + shift = 28; + } + uint8_t byte = word >> 24; + while (byte & 0x80) { + byte = *(__u8*)p.get_pos_add(1); + v |= (uint64_t)(byte & 0x7f) << shift; + shift += 7; + } +} + + +// --------------------------------------------------------------------- +// denc top-level methods that call into denc_traits<T> methods + +template<typename T, typename traits=denc_traits<T>> +inline std::enable_if_t<traits::supported> denc( + const T& o, + size_t& p, + uint64_t f=0) +{ + if constexpr (traits::featured) { + traits::bound_encode(o, p, f); + } else { + traits::bound_encode(o, p); + } +} + +template<typename T, class It, typename traits=denc_traits<T>> +requires traits::supported && (!is_const_iterator<It>) +inline void +denc(const T& o, + It& p, + uint64_t features=0) +{ + if constexpr (traits::featured) { 
+ traits::encode(o, p, features); + } else { + traits::encode(o, p); + } +} + +template<typename T, is_const_iterator It, typename traits=denc_traits<T>> +requires traits::supported +inline void +denc(T& o, + It& p, + uint64_t features=0) +{ + if constexpr (traits::featured) { + traits::decode(o, p, features); + } else { + traits::decode(o, p); + } +} + +namespace _denc { +template<typename T, typename = void> +struct has_legacy_denc : std::false_type {}; +template<typename T> +struct has_legacy_denc<T, decltype(std::declval<T&>() + .decode(std::declval< + ceph::buffer::list::const_iterator&>()))> + : std::true_type { + static void decode(T& v, ceph::buffer::list::const_iterator& p) { + v.decode(p); + } +}; +template<typename T> +struct has_legacy_denc<T, + std::enable_if_t< + !denc_traits<T>::need_contiguous>> : std::true_type { + static void decode(T& v, ceph::buffer::list::const_iterator& p) { + denc_traits<T>::decode(v, p); + } +}; +} + +template<typename T, + typename traits=denc_traits<T>, + typename has_legacy_denc=_denc::has_legacy_denc<T>> +inline std::enable_if_t<traits::supported && + has_legacy_denc::value> denc( + T& o, + ceph::buffer::list::const_iterator& p) +{ + has_legacy_denc::decode(o, p); +} + +// --------------------------------------------------------------------- +// base types and containers + +// +// std::string +// +template<typename A> +struct denc_traits<std::basic_string<char,std::char_traits<char>,A>> { +private: + using value_type = std::basic_string<char,std::char_traits<char>,A>; + +public: + static constexpr bool supported = true; + static constexpr bool featured = false; + static constexpr bool bounded = false; + static constexpr bool need_contiguous = false; + + static void bound_encode(const value_type& s, size_t& p, uint64_t f=0) { + p += sizeof(uint32_t) + s.size(); + } + template<class It> + static void encode(const value_type& s, + It& p, + uint64_t f=0) { + denc((uint32_t)s.size(), p); + memcpy(p.get_pos_add(s.size()), 
s.data(), s.size()); + } + template<class It> + static void decode(value_type& s, + It& p, + uint64_t f=0) { + uint32_t len; + denc(len, p); + decode_nohead(len, s, p); + } + static void decode(value_type& s, ceph::buffer::list::const_iterator& p) + { + uint32_t len; + denc(len, p); + decode_nohead(len, s, p); + } + template<class It> + static void decode_nohead(size_t len, value_type& s, It& p) { + s.clear(); + if (len) { + s.append(p.get_pos_add(len), len); + } + } + static void decode_nohead(size_t len, value_type& s, + ceph::buffer::list::const_iterator& p) { + if (len) { + if constexpr (std::is_same_v<value_type, std::string>) { + s.clear(); + p.copy(len, s); + } else { + s.resize(len); + p.copy(len, s.data()); + } + } else { + s.clear(); + } + } + template<class It> + requires (!is_const_iterator<It>) + static void + encode_nohead(const value_type& s, It& p) { + auto len = s.length(); + maybe_inline_memcpy(p.get_pos_add(len), s.data(), len, 16); + } +}; + +// +// ceph::buffer::ptr +// +template<> +struct denc_traits<ceph::buffer::ptr> { + static constexpr bool supported = true; + static constexpr bool featured = false; + static constexpr bool bounded = false; + static constexpr bool need_contiguous = false; + static void bound_encode(const ceph::buffer::ptr& v, size_t& p, uint64_t f=0) { + p += sizeof(uint32_t) + v.length(); + } + template <class It> + requires (!is_const_iterator<It>) + static void + encode(const ceph::buffer::ptr& v, It& p, uint64_t f=0) { + denc((uint32_t)v.length(), p); + p.append(v); + } + template <is_const_iterator It> + static void + decode(ceph::buffer::ptr& v, It& p, uint64_t f=0) { + uint32_t len; + denc(len, p); + v = p.get_ptr(len); + } + static void decode(ceph::buffer::ptr& v, ceph::buffer::list::const_iterator& p) { + uint32_t len; + denc(len, p); + ceph::buffer::list s; + p.copy(len, s); + if (len) { + if (s.get_num_buffers() == 1) + v = s.front(); + else + v = ceph::buffer::copy(s.c_str(), s.length()); + } + } +}; + +// +// 
ceph::buffer::list +// +template<> +struct denc_traits<ceph::buffer::list> { + static constexpr bool supported = true; + static constexpr bool featured = false; + static constexpr bool bounded = false; + static constexpr bool need_contiguous = false; + static void bound_encode(const ceph::buffer::list& v, size_t& p, uint64_t f=0) { + p += sizeof(uint32_t) + v.length(); + } + static void encode(const ceph::buffer::list& v, ceph::buffer::list::contiguous_appender& p, + uint64_t f=0) { + denc((uint32_t)v.length(), p); + p.append(v); + } + static void decode(ceph::buffer::list& v, ceph::buffer::ptr::const_iterator& p, uint64_t f=0) { + uint32_t len = 0; + denc(len, p); + v.clear(); + v.push_back(p.get_ptr(len)); + } + static void decode(ceph::buffer::list& v, ceph::buffer::list::const_iterator& p) { + uint32_t len; + denc(len, p); + v.clear(); + p.copy(len, v); + } + static void encode_nohead(const ceph::buffer::list& v, + ceph::buffer::list::contiguous_appender& p) { + p.append(v); + } + static void decode_nohead(size_t len, ceph::buffer::list& v, + ceph::buffer::ptr::const_iterator& p) { + v.clear(); + if (len) { + v.append(p.get_ptr(len)); + } + } + static void decode_nohead(size_t len, ceph::buffer::list& v, + ceph::buffer::list::const_iterator& p) { + v.clear(); + p.copy(len, v); + } +}; + +// +// std::pair<A, B> +// +template<typename A, typename B> +struct denc_traits< + std::pair<A, B>, + std::enable_if_t<denc_supported<std::remove_const_t<A>> && denc_supported<B>>> { + typedef denc_traits<A> a_traits; + typedef denc_traits<B> b_traits; + + static constexpr bool supported = true; + static constexpr bool featured = a_traits::featured || b_traits::featured ; + static constexpr bool bounded = a_traits::bounded && b_traits::bounded; + static constexpr bool need_contiguous = (a_traits::need_contiguous || + b_traits::need_contiguous); + + static void bound_encode(const std::pair<A,B>& v, size_t& p, uint64_t f = 0) { + if constexpr (featured) { + denc(v.first, p, f); 
+ denc(v.second, p, f); + } else { + denc(v.first, p); + denc(v.second, p); + } + } + + static void encode(const std::pair<A,B>& v, ceph::buffer::list::contiguous_appender& p, + uint64_t f = 0) { + if constexpr (featured) { + denc(v.first, p, f); + denc(v.second, p, f); + } else { + denc(v.first, p); + denc(v.second, p); + } + } + + static void decode(std::pair<A,B>& v, ceph::buffer::ptr::const_iterator& p, uint64_t f=0) { + denc(const_cast<std::remove_const_t<A>&>(v.first), p, f); + denc(v.second, p, f); + } + template<typename AA=A> + static std::enable_if_t<!!sizeof(AA) && !need_contiguous> + decode(std::pair<A,B>& v, ceph::buffer::list::const_iterator& p, + uint64_t f = 0) { + denc(const_cast<std::remove_const_t<AA>&>(v.first), p); + denc(v.second, p); + } +}; + +namespace _denc { + template<template<class...> class C, typename Details, typename ...Ts> + struct container_base { + private: + using container = C<Ts...>; + using T = typename Details::T; + + public: + using traits = denc_traits<T>; + + static constexpr bool supported = true; + static constexpr bool featured = traits::featured; + static constexpr bool bounded = false; + static constexpr bool need_contiguous = traits::need_contiguous; + + template<typename U=T> + static void bound_encode(const container& s, size_t& p, uint64_t f = 0) { + p += sizeof(uint32_t); + if constexpr (traits::bounded) { +#if _GLIBCXX_USE_CXX11_ABI + // intensionally not calling container's empty() method to not prohibit + // compiler from optimizing the check if it and the ::size() operate on + // different memory (observed when std::list::empty() works on pointers, + // not the size field). + if (const auto elem_num = s.size(); elem_num > 0) { +#else + if (!s.empty()) { + const auto elem_num = s.size(); +#endif + // STL containers use weird element types like std::pair<const K, V>; + // cast to something we have denc_traits for. 
+ size_t elem_size = 0; + if constexpr (traits::featured) { + denc(static_cast<const T&>(*s.begin()), elem_size, f); + } else { + denc(static_cast<const T&>(*s.begin()), elem_size); + } + p += elem_size * elem_num; + } + } else { + for (const T& e : s) { + if constexpr (traits::featured) { + denc(e, p, f); + } else { + denc(e, p); + } + } + } + } + + template<typename U=T> + static void encode(const container& s, + ceph::buffer::list::contiguous_appender& p, + uint64_t f = 0) { + denc((uint32_t)s.size(), p); + if constexpr (traits::featured) { + encode_nohead(s, p, f); + } else { + encode_nohead(s, p); + } + } + static void decode(container& s, ceph::buffer::ptr::const_iterator& p, + uint64_t f = 0) { + uint32_t num; + denc(num, p); + decode_nohead(num, s, p, f); + } + template<typename U=T> + static std::enable_if_t<!!sizeof(U) && !need_contiguous> + decode(container& s, ceph::buffer::list::const_iterator& p) { + uint32_t num; + denc(num, p); + decode_nohead(num, s, p); + } + + // nohead + static void encode_nohead(const container& s, ceph::buffer::list::contiguous_appender& p, + uint64_t f = 0) { + for (const T& e : s) { + if constexpr (traits::featured) { + denc(e, p, f); + } else { + denc(e, p); + } + } + } + static void decode_nohead(size_t num, container& s, + ceph::buffer::ptr::const_iterator& p, + uint64_t f=0) { + s.clear(); + Details::reserve(s, num); + while (num--) { + T t; + denc(t, p, f); + Details::insert(s, std::move(t)); + } + } + template<typename U=T> + static std::enable_if_t<!!sizeof(U) && !need_contiguous> + decode_nohead(size_t num, container& s, + ceph::buffer::list::const_iterator& p) { + s.clear(); + Details::reserve(s, num); + while (num--) { + T t; + denc(t, p); + Details::insert(s, std::move(t)); + } + } + }; + + template<typename T> + class container_has_reserve { + template<typename U, U> struct SFINAE_match; + template<typename U> + static std::true_type test(SFINAE_match<T(*)(typename T::size_type), + &U::reserve>*); + + 
template<typename U> + static std::false_type test(...); + + public: + static constexpr bool value = decltype( + test<denc_traits<T>>(0))::value; + }; + template<typename T> + inline constexpr bool container_has_reserve_v = + container_has_reserve<T>::value; + + + template<typename Container> + struct container_details_base { + using T = typename Container::value_type; + static void reserve(Container& c, size_t s) { + if constexpr (container_has_reserve_v<Container>) { + c.reserve(s); + } + } + }; + + template<typename Container> + struct pushback_details : public container_details_base<Container> { + template<typename ...Args> + static void insert(Container& c, Args&& ...args) { + c.emplace_back(std::forward<Args>(args)...); + } + }; +} + +template<typename T, typename ...Ts> +struct denc_traits< + std::list<T, Ts...>, + typename std::enable_if_t<denc_traits<T>::supported>> + : public _denc::container_base<std::list, + _denc::pushback_details<std::list<T, Ts...>>, + T, Ts...> {}; + +template<typename T, typename ...Ts> +struct denc_traits< + std::vector<T, Ts...>, + typename std::enable_if_t<denc_traits<T>::supported>> + : public _denc::container_base<std::vector, + _denc::pushback_details<std::vector<T, Ts...>>, + T, Ts...> {}; + +template<typename T, std::size_t N, typename ...Ts> +struct denc_traits< + boost::container::small_vector<T, N, Ts...>, + typename std::enable_if_t<denc_traits<T>::supported>> { +private: + using container = boost::container::small_vector<T, N, Ts...>; +public: + using traits = denc_traits<T>; + + static constexpr bool supported = true; + static constexpr bool featured = traits::featured; + static constexpr bool bounded = false; + static constexpr bool need_contiguous = traits::need_contiguous; + + template<typename U=T> + static void bound_encode(const container& s, size_t& p, uint64_t f = 0) { + p += sizeof(uint32_t); + if constexpr (traits::bounded) { + if (!s.empty()) { + const auto elem_num = s.size(); + size_t elem_size = 0; + if 
constexpr (traits::featured) { + denc(*s.begin(), elem_size, f); + } else { + denc(*s.begin(), elem_size); + } + p += elem_size * elem_num; + } + } else { + for (const T& e : s) { + if constexpr (traits::featured) { + denc(e, p, f); + } else { + denc(e, p); + } + } + } + } + + template<typename U=T> + static void encode(const container& s, + ceph::buffer::list::contiguous_appender& p, + uint64_t f = 0) { + denc((uint32_t)s.size(), p); + if constexpr (traits::featured) { + encode_nohead(s, p, f); + } else { + encode_nohead(s, p); + } + } + static void decode(container& s, ceph::buffer::ptr::const_iterator& p, + uint64_t f = 0) { + uint32_t num; + denc(num, p); + decode_nohead(num, s, p, f); + } + template<typename U=T> + static std::enable_if_t<!!sizeof(U) && !need_contiguous> + decode(container& s, ceph::buffer::list::const_iterator& p) { + uint32_t num; + denc(num, p); + decode_nohead(num, s, p); + } + + // nohead + static void encode_nohead(const container& s, ceph::buffer::list::contiguous_appender& p, + uint64_t f = 0) { + for (const T& e : s) { + if constexpr (traits::featured) { + denc(e, p, f); + } else { + denc(e, p); + } + } + } + static void decode_nohead(size_t num, container& s, + ceph::buffer::ptr::const_iterator& p, + uint64_t f=0) { + s.clear(); + s.reserve(num); + while (num--) { + T t; + denc(t, p, f); + s.push_back(std::move(t)); + } + } + template<typename U=T> + static std::enable_if_t<!!sizeof(U) && !need_contiguous> + decode_nohead(size_t num, container& s, + ceph::buffer::list::const_iterator& p) { + s.clear(); + s.reserve(num); + while (num--) { + T t; + denc(t, p); + s.push_back(std::move(t)); + } + } +}; + +namespace _denc { + template<typename Container> + struct setlike_details : public container_details_base<Container> { + using T = typename Container::value_type; + template<typename ...Args> + static void insert(Container& c, Args&& ...args) { + c.emplace_hint(c.cend(), std::forward<Args>(args)...); + } + }; +} + +template<typename T, 
typename ...Ts> +struct denc_traits< + std::set<T, Ts...>, + std::enable_if_t<denc_traits<T>::supported>> + : public _denc::container_base<std::set, + _denc::setlike_details<std::set<T, Ts...>>, + T, Ts...> {}; + +template<typename T, typename ...Ts> +struct denc_traits< + boost::container::flat_set<T, Ts...>, + std::enable_if_t<denc_traits<T>::supported>> + : public _denc::container_base< + boost::container::flat_set, + _denc::setlike_details<boost::container::flat_set<T, Ts...>>, + T, Ts...> {}; + +namespace _denc { + template<typename Container> + struct maplike_details : public container_details_base<Container> { + using T = typename Container::value_type; + template<typename ...Args> + static void insert(Container& c, Args&& ...args) { + c.emplace_hint(c.cend(), std::forward<Args>(args)...); + } + }; +} + +template<typename A, typename B, typename ...Ts> +struct denc_traits< + std::map<A, B, Ts...>, + std::enable_if_t<denc_traits<A>::supported && + denc_traits<B>::supported>> + : public _denc::container_base<std::map, + _denc::maplike_details<std::map<A, B, Ts...>>, + A, B, Ts...> {}; + +template<typename A, typename B, typename ...Ts> +struct denc_traits< + boost::container::flat_map<A, B, Ts...>, + std::enable_if_t<denc_traits<A>::supported && + denc_traits<B>::supported>> + : public _denc::container_base< + boost::container::flat_map, + _denc::maplike_details<boost::container::flat_map< + A, B, Ts...>>, + A, B, Ts...> {}; + +template<typename T, size_t N> +struct denc_traits< + std::array<T, N>, + std::enable_if_t<denc_traits<T>::supported>> { +private: + using container = std::array<T, N>; +public: + using traits = denc_traits<T>; + + static constexpr bool supported = true; + static constexpr bool featured = traits::featured; + static constexpr bool bounded = traits::bounded; + static constexpr bool need_contiguous = traits::need_contiguous; + + static void bound_encode(const container& s, size_t& p, uint64_t f = 0) { + if constexpr (traits::bounded) { + 
if constexpr (traits::featured) { + if (!s.empty()) { + size_t elem_size = 0; + denc(*s.begin(), elem_size, f); + p += elem_size * s.size(); + } + } else { + size_t elem_size = 0; + denc(*s.begin(), elem_size); + p += elem_size * N; + } + } else { + for (const auto& e : s) { + if constexpr (traits::featured) { + denc(e, p, f); + } else { + denc(e, p); + } + } + } + } + + static void encode(const container& s, ceph::buffer::list::contiguous_appender& p, + uint64_t f = 0) { + for (const auto& e : s) { + if constexpr (traits::featured) { + denc(e, p, f); + } else { + denc(e, p); + } + } + } + static void decode(container& s, ceph::buffer::ptr::const_iterator& p, + uint64_t f = 0) { + for (auto& e : s) + denc(e, p, f); + } + template<typename U=T> + static std::enable_if_t<!!sizeof(U) && + !need_contiguous> + decode(container& s, ceph::buffer::list::const_iterator& p) { + for (auto& e : s) { + denc(e, p); + } + } +}; + +template<typename... Ts> +struct denc_traits< + std::tuple<Ts...>, + std::enable_if_t<(denc_traits<Ts>::supported && ...)>> { + +private: + static_assert(sizeof...(Ts) > 0, + "Zero-length tuples are not supported."); + using container = std::tuple<Ts...>; + +public: + + static constexpr bool supported = true; + static constexpr bool featured = (denc_traits<Ts>::featured || ...); + static constexpr bool bounded = (denc_traits<Ts>::bounded && ...); + static constexpr bool need_contiguous = + (denc_traits<Ts>::need_contiguous || ...); + + template<typename U = container> + static std::enable_if_t<denc_traits<U>::featured> + bound_encode(const container& s, size_t& p, uint64_t f) { + ceph::for_each(s, [&p, f] (const auto& e) { + if constexpr (denc_traits<std::decay_t<decltype(e)>>::featured) { + denc(e, p, f); + } else { + denc(e, p); + } + }); + } + template<typename U = container> + static std::enable_if_t<!denc_traits<U>::featured> + bound_encode(const container& s, size_t& p) { + ceph::for_each(s, [&p] (const auto& e) { + denc(e, p); + }); + } + + 
template<typename U = container> + static std::enable_if_t<denc_traits<U>::featured> + encode(const container& s, ceph::buffer::list::contiguous_appender& p, + uint64_t f) { + ceph::for_each(s, [&p, f] (const auto& e) { + if constexpr (denc_traits<std::decay_t<decltype(e)>>::featured) { + denc(e, p, f); + } else { + denc(e, p); + } + }); + } + template<typename U = container> + static std::enable_if_t<!denc_traits<U>::featured> + encode(const container& s, ceph::buffer::list::contiguous_appender& p) { + ceph::for_each(s, [&p] (const auto& e) { + denc(e, p); + }); + } + + static void decode(container& s, ceph::buffer::ptr::const_iterator& p, + uint64_t f = 0) { + ceph::for_each(s, [&p] (auto& e) { + denc(e, p); + }); + } + + template<typename U = container> + static std::enable_if_t<!denc_traits<U>::need_contiguous> + decode(container& s, ceph::buffer::list::const_iterator& p, uint64_t f = 0) { + ceph::for_each(s, [&p] (auto& e) { + denc(e, p); + }); + } +}; + +// +// boost::optional<T> +// +template<typename T> +struct denc_traits< + boost::optional<T>, + std::enable_if_t<denc_traits<T>::supported>> { + using traits = denc_traits<T>; + + static constexpr bool supported = true; + static constexpr bool featured = traits::featured; + static constexpr bool bounded = false; + static constexpr bool need_contiguous = traits::need_contiguous; + + static void bound_encode(const boost::optional<T>& v, size_t& p, + uint64_t f = 0) { + p += sizeof(bool); + if (v) { + if constexpr (featured) { + denc(*v, p, f); + } else { + denc(*v, p); + } + } + } + + static void encode(const boost::optional<T>& v, + ceph::buffer::list::contiguous_appender& p, + uint64_t f = 0) { + denc((bool)v, p); + if (v) { + if constexpr (featured) { + denc(*v, p, f); + } else { + denc(*v, p); + } + } + } + + static void decode(boost::optional<T>& v, ceph::buffer::ptr::const_iterator& p, + uint64_t f = 0) { + bool x; + denc(x, p, f); + if (x) { + v = T{}; + denc(*v, p, f); + } else { + v = boost::none; + } 
+ } + + template<typename U = T> + static std::enable_if_t<!!sizeof(U) && !need_contiguous> + decode(boost::optional<T>& v, ceph::buffer::list::const_iterator& p) { + bool x; + denc(x, p); + if (x) { + v = T{}; + denc(*v, p); + } else { + v = boost::none; + } + } + + template<typename U = T> + static void encode_nohead(const boost::optional<T>& v, + ceph::buffer::list::contiguous_appender& p, + uint64_t f = 0) { + if (v) { + if constexpr (featured) { + denc(*v, p, f); + } else { + denc(*v, p); + } + } + } + + static void decode_nohead(bool num, boost::optional<T>& v, + ceph::buffer::ptr::const_iterator& p, uint64_t f = 0) { + if (num) { + v = T(); + denc(*v, p, f); + } else { + v = boost::none; + } + } +}; + +template<> +struct denc_traits<boost::none_t> { + static constexpr bool supported = true; + static constexpr bool featured = false; + static constexpr bool bounded = true; + static constexpr bool need_contiguous = false; + + static void bound_encode(const boost::none_t& v, size_t& p) { + p += sizeof(bool); + } + + static void encode(const boost::none_t& v, + ceph::buffer::list::contiguous_appender& p) { + denc(false, p); + } +}; + +// +// std::optional<T> +// +template<typename T> +struct denc_traits< + std::optional<T>, + std::enable_if_t<denc_traits<T>::supported>> { + using traits = denc_traits<T>; + + static constexpr bool supported = true; + static constexpr bool featured = traits::featured; + static constexpr bool bounded = false; + static constexpr bool need_contiguous = traits::need_contiguous; + + static void bound_encode(const std::optional<T>& v, size_t& p, + uint64_t f = 0) { + p += sizeof(bool); + if (v) { + if constexpr (featured) { + denc(*v, p, f); + } else { + denc(*v, p); + } + } + } + + static void encode(const std::optional<T>& v, + ceph::buffer::list::contiguous_appender& p, + uint64_t f = 0) { + denc((bool)v, p); + if (v) { + if constexpr (featured) { + denc(*v, p, f); + } else { + denc(*v, p); + } + } + } + + static void 
decode(std::optional<T>& v, ceph::buffer::ptr::const_iterator& p, + uint64_t f = 0) { + bool x; + denc(x, p, f); + if (x) { + v = T{}; + denc(*v, p, f); + } else { + v = std::nullopt; + } + } + + template<typename U = T> + static std::enable_if_t<!!sizeof(U) && !need_contiguous> + decode(std::optional<T>& v, ceph::buffer::list::const_iterator& p) { + bool x; + denc(x, p); + if (x) { + v = T{}; + denc(*v, p); + } else { + v = std::nullopt; + } + } + + static void encode_nohead(const std::optional<T>& v, + ceph::buffer::list::contiguous_appender& p, + uint64_t f = 0) { + if (v) { + if constexpr (featured) { + denc(*v, p, f); + } else { + denc(*v, p); + } + } + } + + static void decode_nohead(bool num, std::optional<T>& v, + ceph::buffer::ptr::const_iterator& p, uint64_t f = 0) { + if (num) { + v = T(); + denc(*v, p, f); + } else { + v = std::nullopt; + } + } +}; + +template<> +struct denc_traits<std::nullopt_t> { + static constexpr bool supported = true; + static constexpr bool featured = false; + static constexpr bool bounded = true; + static constexpr bool need_contiguous = false; + + static void bound_encode(const std::nullopt_t& v, size_t& p) { + p += sizeof(bool); + } + + static void encode(const std::nullopt_t& v, + ceph::buffer::list::contiguous_appender& p) { + denc(false, p); + } +}; + +// ---------------------------------------------------------------------- +// class helpers + +// Write denc_traits<> for a class that defines bound_encode/encode/decode +// methods. 
+ +#define WRITE_CLASS_DENC(T) _DECLARE_CLASS_DENC(T, false) +#define WRITE_CLASS_DENC_BOUNDED(T) _DECLARE_CLASS_DENC(T, true) +#define _DECLARE_CLASS_DENC(T, b) \ + template<> struct denc_traits<T> { \ + static constexpr bool supported = true; \ + static constexpr bool featured = false; \ + static constexpr bool bounded = b; \ + static constexpr bool need_contiguous = !_denc::has_legacy_denc<T>::value;\ + static void bound_encode(const T& v, size_t& p, uint64_t f=0) { \ + v.bound_encode(p); \ + } \ + static void encode(const T& v, ::ceph::buffer::list::contiguous_appender& p, \ + uint64_t f=0) { \ + v.encode(p); \ + } \ + static void decode(T& v, ::ceph::buffer::ptr::const_iterator& p, uint64_t f=0) { \ + v.decode(p); \ + } \ + }; + +#define WRITE_CLASS_DENC_FEATURED(T) _DECLARE_CLASS_DENC_FEATURED(T, false) +#define WRITE_CLASS_DENC_FEATURED_BOUNDED(T) _DECLARE_CLASS_DENC_FEATURED(T, true) +#define _DECLARE_CLASS_DENC_FEATURED(T, b) \ + template<> struct denc_traits<T> { \ + static constexpr bool supported = true; \ + static constexpr bool featured = true; \ + static constexpr bool bounded = b; \ + static constexpr bool need_contiguous = !_denc::has_legacy_denc<T>::value;\ + static void bound_encode(const T& v, size_t& p, uint64_t f) { \ + v.bound_encode(p, f); \ + } \ + static void encode(const T& v, ::ceph::buffer::list::contiguous_appender& p, \ + uint64_t f) { \ + v.encode(p, f); \ + } \ + static void decode(T& v, ::ceph::buffer::ptr::const_iterator& p, uint64_t f=0) { \ + v.decode(p, f); \ + } \ + }; + +// ---------------------------------------------------------------------- +// encoded_sizeof_wrapper + +namespace ceph { + +template <typename T, typename traits=denc_traits<T>> +constexpr std::enable_if_t<traits::supported && traits::bounded, size_t> +encoded_sizeof_bounded() { + size_t p = 0; + traits::bound_encode(T(), p); + return p; +} + +template <typename T, typename traits=denc_traits<T>> +std::enable_if_t<traits::supported, size_t> 
+encoded_sizeof(const T &t) { + size_t p = 0; + traits::bound_encode(t, p); + return p; +} + +} // namespace ceph + + +// ---------------------------------------------------------------------- +// encode/decode wrappers + +// These glue the new-style denc world into old-style calls to encode +// and decode by calling into denc_traits<> methods (when present). + +namespace ceph { +template<typename T, typename traits=denc_traits<T>> +inline std::enable_if_t<traits::supported && !traits::featured> encode( + const T& o, + ceph::buffer::list& bl, + uint64_t features_unused=0) +{ + size_t len = 0; + traits::bound_encode(o, len); + auto a = bl.get_contiguous_appender(len); + traits::encode(o, a); +} + +template<typename T, typename traits=denc_traits<T>> +inline std::enable_if_t<traits::supported && traits::featured> encode( + const T& o, ::ceph::buffer::list& bl, + uint64_t features) +{ + size_t len = 0; + traits::bound_encode(o, len, features); + auto a = bl.get_contiguous_appender(len); + traits::encode(o, a, features); +} + +template<typename T, + typename traits=denc_traits<T>> +inline std::enable_if_t<traits::supported && !traits::need_contiguous> decode( + T& o, + ::ceph::buffer::list::const_iterator& p) +{ + if (p.end()) + throw ::ceph::buffer::end_of_buffer(); + const auto& bl = p.get_bl(); + const auto remaining = bl.length() - p.get_off(); + // it is expensive to rebuild a contigous buffer and drop it, so avoid this. + if (!p.is_pointing_same_raw(bl.back()) && remaining > CEPH_PAGE_SIZE) { + traits::decode(o, p); + } else { + // ensure we get a contigous buffer... until the end of the + // ceph::buffer::list. we don't really know how much we'll need here, + // unfortunately. hopefully it is already contiguous and we're just + // bumping the raw ref and initializing the ptr tmp fields. 
+ ceph::buffer::ptr tmp; + auto t = p; + t.copy_shallow(remaining, tmp); + auto cp = std::cbegin(tmp); + traits::decode(o, cp); + p += cp.get_offset(); + } +} + +template<typename T, + typename traits=denc_traits<T>> +inline std::enable_if_t<traits::supported && traits::need_contiguous> decode( + T& o, + ceph::buffer::list::const_iterator& p) +{ + if (p.end()) + throw ceph::buffer::end_of_buffer(); + // ensure we get a contigous buffer... until the end of the + // ceph::buffer::list. we don't really know how much we'll need here, + // unfortunately. hopefully it is already contiguous and we're just + // bumping the raw ref and initializing the ptr tmp fields. + ceph::buffer::ptr tmp; + auto t = p; + t.copy_shallow(p.get_bl().length() - p.get_off(), tmp); + auto cp = std::cbegin(tmp); + traits::decode(o, cp); + p += cp.get_offset(); +} + +// nohead variants +template<typename T, typename traits=denc_traits<T>> +inline std::enable_if_t<traits::supported && + !traits::featured> encode_nohead( + const T& o, + ceph::buffer::list& bl) +{ + size_t len = 0; + traits::bound_encode(o, len); + auto a = bl.get_contiguous_appender(len); + traits::encode_nohead(o, a); +} + +template<typename T, typename traits=denc_traits<T>> +inline std::enable_if_t<traits::supported && !traits::featured> decode_nohead( + size_t num, + T& o, + ceph::buffer::list::const_iterator& p) +{ + if (!num) + return; + if (p.end()) + throw ceph::buffer::end_of_buffer(); + if constexpr (traits::need_contiguous) { + ceph::buffer::ptr tmp; + auto t = p; + if constexpr (denc_traits<typename T::value_type>::bounded) { + size_t element_size = 0; + typename T::value_type v; + denc_traits<typename T::value_type>::bound_encode(v, element_size); + t.copy_shallow(num * element_size, tmp); + } else { + t.copy_shallow(p.get_bl().length() - p.get_off(), tmp); + } + auto cp = std::cbegin(tmp); + traits::decode_nohead(num, o, cp); + p += cp.get_offset(); + } else { + traits::decode_nohead(num, o, p); + } +} +} + + +// 
---------------------------------------------------------------- +// DENC + +// These are some class methods we need to do the version and length +// wrappers for DENC_{START,FINISH} for inter-version +// interoperability. + +#define DENC_HELPERS \ + /* bound_encode */ \ + static void _denc_start(size_t& p, \ + __u8 *struct_v, \ + __u8 *struct_compat, \ + char **, uint32_t *) { \ + p += 2 + 4; \ + } \ + static void _denc_finish(size_t& p, \ + __u8 *struct_v, \ + __u8 *struct_compat, \ + char **, uint32_t *) { } \ + /* encode */ \ + static void _denc_start(::ceph::buffer::list::contiguous_appender& p, \ + __u8 *struct_v, \ + __u8 *struct_compat, \ + char **len_pos, \ + uint32_t *start_oob_off) { \ + denc(*struct_v, p); \ + denc(*struct_compat, p); \ + *len_pos = p.get_pos_add(4); \ + *start_oob_off = p.get_out_of_band_offset(); \ + } \ + static void _denc_finish(::ceph::buffer::list::contiguous_appender& p, \ + __u8 *struct_v, \ + __u8 *struct_compat, \ + char **len_pos, \ + uint32_t *start_oob_off) { \ + *(ceph_le32*)*len_pos = p.get_pos() - *len_pos - sizeof(uint32_t) + \ + p.get_out_of_band_offset() - *start_oob_off; \ + } \ + /* decode */ \ + static void _denc_start(::ceph::buffer::ptr::const_iterator& p, \ + __u8 *struct_v, \ + __u8 *struct_compat, \ + char **start_pos, \ + uint32_t *struct_len) { \ + denc(*struct_v, p); \ + denc(*struct_compat, p); \ + denc(*struct_len, p); \ + *start_pos = const_cast<char*>(p.get_pos()); \ + } \ + static void _denc_finish(::ceph::buffer::ptr::const_iterator& p, \ + __u8 *struct_v, __u8 *struct_compat, \ + char **start_pos, \ + uint32_t *struct_len) { \ + const char *pos = p.get_pos(); \ + char *end = *start_pos + *struct_len; \ + if (pos > end) { \ + throw ::ceph::buffer::malformed_input(__PRETTY_FUNCTION__); \ + } \ + if (pos < end) { \ + p += end - pos; \ + } \ + } + +// Helpers for versioning the encoding. These correspond to the +// {ENCODE,DECODE}_{START,FINISH} macros. 
+ +#define DENC_START(v, compat, p) \ + __u8 struct_v = v; \ + __u8 struct_compat = compat; \ + char *_denc_pchar; \ + uint32_t _denc_u32; \ + _denc_start(p, &struct_v, &struct_compat, &_denc_pchar, &_denc_u32); \ + do { + +#define DENC_FINISH(p) \ + } while (false); \ + _denc_finish(p, &struct_v, &struct_compat, &_denc_pchar, &_denc_u32); + + +// ---------------------------------------------------------------------- + +// Helpers for writing a unified bound_encode/encode/decode +// implementation that won't screw up buffer size estimations. + +#define DENC(Type, v, p) \ + DENC_HELPERS \ + void bound_encode(size_t& p) const { \ + _denc_friend(*this, p); \ + } \ + void encode(::ceph::buffer::list::contiguous_appender& p) const { \ + DENC_DUMP_PRE(Type); \ + _denc_friend(*this, p); \ + } \ + void decode(::ceph::buffer::ptr::const_iterator& p) { \ + _denc_friend(*this, p); \ + } \ + template<typename T, typename P> \ + friend std::enable_if_t<std::is_same_v<T, Type> || \ + std::is_same_v<T, const Type>> \ + _denc_friend(T& v, P& p) + +#define DENC_FEATURED(Type, v, p, f) \ + DENC_HELPERS \ + void bound_encode(size_t& p, uint64_t f) const { \ + _denc_friend(*this, p, f); \ + } \ + void encode(::ceph::buffer::list::contiguous_appender& p, uint64_t f) const { \ + DENC_DUMP_PRE(Type); \ + _denc_friend(*this, p, f); \ + } \ + void decode(::ceph::buffer::ptr::const_iterator& p, uint64_t f=0) { \ + _denc_friend(*this, p, f); \ + } \ + template<typename T, typename P> \ + friend std::enable_if_t<std::is_same_v<T, Type> || \ + std::is_same_v<T, const Type>> \ + _denc_friend(T& v, P& p, uint64_t f) + +#endif diff --git a/src/include/dlfcn_compat.h b/src/include/dlfcn_compat.h new file mode 100644 index 000000000..95fd64e51 --- /dev/null +++ b/src/include/dlfcn_compat.h @@ -0,0 +1,48 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2020 SUSE LINUX GmbH + * 
+ * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. + * + */ + +#ifndef DLFCN_COMPAT_H +#define DLFCN_COMPAT_H + +#include "acconfig.h" + +#define SHARED_LIB_SUFFIX CMAKE_SHARED_LIBRARY_SUFFIX + +#ifdef _WIN32 + #include <string> + + using dl_errmsg_t = std::string; + + // The load mode flags will be ignored on Windows. We keep the same + // values for debugging purposes though. + #define RTLD_LAZY 0x00001 + #define RTLD_NOW 0x00002 + #define RTLD_BINDING_MASK 0x3 + #define RTLD_NOLOAD 0x00004 + #define RTLD_DEEPBIND 0x00008 + #define RTLD_GLOBAL 0x00100 + #define RTLD_LOCAL 0 + #define RTLD_NODELETE 0x01000 + + void* dlopen(const char *filename, int flags); + int dlclose(void* handle); + dl_errmsg_t dlerror(); + void* dlsym(void* handle, const char* symbol); +#else + #include <dlfcn.h> + + using dl_errmsg_t = char*; +#endif /* _WIN32 */ + +#endif /* DLFCN_H */ diff --git a/src/include/elist.h b/src/include/elist.h new file mode 100644 index 000000000..38be35dbf --- /dev/null +++ b/src/include/elist.h @@ -0,0 +1,193 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net> + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. + * + */ + +#ifndef CEPH_ELIST_H +#define CEPH_ELIST_H + +/* + * elist: embedded list. + * + * requirements: + * - elist<T>::item be embedded in the parent class + * - items are _always_ added to the list via the same elist<T>::item at the same + * fixed offset in the class. + * - begin(), front(), back() methods take the member offset as an argument for traversal. 
+ * + */ + +#define member_offset(cls, member) ((size_t)(&((cls*)1)->member) - 1) + +template<typename T> +class elist { +public: + struct item { + item *_prev, *_next; + + item(T i=0) : _prev(this), _next(this) {} + ~item() { + ceph_assert(!is_on_list()); + } + + item(const item& other) = delete; + const item& operator= (const item& right) = delete; + + + bool empty() const { return _prev == this; } + bool is_on_list() const { return !empty(); } + + bool remove_myself() { + if (_next == this) { + ceph_assert(_prev == this); + return false; + } + _next->_prev = _prev; + _prev->_next = _next; + _prev = _next = this; + return true; + } + + void insert_after(item *other) { + ceph_assert(other->empty()); + other->_prev = this; + other->_next = _next; + _next->_prev = other; + _next = other; + } + void insert_before(item *other) { + ceph_assert(other->empty()); + other->_next = this; + other->_prev = _prev; + _prev->_next = other; + _prev = other; + } + + T get_item(size_t offset) { + ceph_assert(offset); + return (T)(((char *)this) - offset); + } + }; + +private: + item _head; + size_t item_offset; + +public: + elist(const elist& other); + const elist& operator=(const elist& other); + + elist(size_t o) : _head(NULL), item_offset(o) {} + ~elist() { + ceph_assert(_head.empty()); + } + + bool empty() const { + return _head.empty(); + } + + void clear() { + while (!_head.empty()) + pop_front(); + } + + void push_front(item *i) { + if (!i->empty()) + i->remove_myself(); + _head.insert_after(i); + } + void push_back(item *i) { + if (!i->empty()) + i->remove_myself(); + _head.insert_before(i); + } + + T front(size_t o=0) { + ceph_assert(!_head.empty()); + return _head._next->get_item(o ? o : item_offset); + } + T back(size_t o=0) { + ceph_assert(!_head.empty()); + return _head._prev->get_item(o ? 
o : item_offset); + } + + void pop_front() { + ceph_assert(!empty()); + _head._next->remove_myself(); + } + void pop_back() { + ceph_assert(!empty()); + _head._prev->remove_myself(); + } + + void clear_list() { + while (!empty()) + pop_front(); + } + + enum mode_t { + MAGIC, CURRENT, CACHE_NEXT + }; + + class iterator { + private: + item *head; + item *cur, *next; + size_t item_offset; + mode_t mode; + public: + iterator(item *h, size_t o, mode_t m) : + head(h), cur(h->_next), next(cur->_next), item_offset(o), + mode(m) { + ceph_assert(item_offset > 0); + } + T operator*() { + return cur->get_item(item_offset); + } + iterator& operator++() { + ceph_assert(cur); + ceph_assert(cur != head); + if (mode == MAGIC) { + // if 'cur' appears to be valid, use that. otherwise, + // use cached 'next'. + // this is a bit magic, and probably a bad idea... :/ + if (cur->empty()) + cur = next; + else + cur = cur->_next; + } else if (mode == CURRENT) + cur = cur->_next; + else if (mode == CACHE_NEXT) + cur = next; + else + ceph_abort(); + next = cur->_next; + return *this; + } + bool end() const { + return cur == head; + } + }; + + iterator begin(size_t o=0) { + return iterator(&_head, o ? o : item_offset, MAGIC); + } + iterator begin_use_current(size_t o=0) { + return iterator(&_head, o ? o : item_offset, CURRENT); + } + iterator begin_cache_next(size_t o=0) { + return iterator(&_head, o ? 
o : item_offset, CACHE_NEXT); + } +}; + + +#endif diff --git a/src/include/encoding.h b/src/include/encoding.h new file mode 100644 index 000000000..40ba9d39c --- /dev/null +++ b/src/include/encoding.h @@ -0,0 +1,1548 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net> + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. + * + */ +#ifndef CEPH_ENCODING_H +#define CEPH_ENCODING_H + +#include <set> +#include <map> +#include <deque> +#include <vector> +#include <string> +#include <string_view> +#include <tuple> +#include <optional> +#include <boost/container/small_vector.hpp> +#include <boost/optional/optional_io.hpp> +#include <boost/tuple/tuple.hpp> + +#include "include/unordered_map.h" +#include "include/unordered_set.h" +#include "common/ceph_time.h" + +#include "include/int_types.h" + +#include "common/convenience.h" + +#include "byteorder.h" +#include "buffer.h" + +// pull in the new-style encoding so that we get the denc_traits<> definition. +#include "denc.h" + +#include "assert.h" + +using namespace ceph; + +namespace ceph { + +/* + * Notes on feature encoding: + * + * - The default encode() methods have a features argument with a default parameter + * (which goes to zero). + * - Normal classes will use WRITE_CLASS_ENCODER, with that features=0 default. + * - Classes that _require_ features will use WRITE_CLASS_ENCODER_FEATURES, which + * does not define the default. Any caller must explicitly pass it in. + * - STL container macros have two encode variants: one with a features arg, and one + * without. + * + * The result: + * - A feature encode() method will fail to compile if a value is not + * passed in. 
+ * - The feature varianet of the STL templates will be used when the feature arg is + * provided. It will be passed through to any template arg types, but it will be + * ignored when not needed. + */ + +// -------------------------------------- +// base types + +template<class T> +inline void encode_raw(const T& t, bufferlist& bl) +{ + bl.append((char*)&t, sizeof(t)); +} +template<class T> +inline void decode_raw(T& t, bufferlist::const_iterator &p) +{ + p.copy(sizeof(t), (char*)&t); +} + +#define WRITE_RAW_ENCODER(type) \ + inline void encode(const type &v, ::ceph::bufferlist& bl, uint64_t features=0) { ::ceph::encode_raw(v, bl); } \ + inline void decode(type &v, ::ceph::bufferlist::const_iterator& p) { ::ceph::decode_raw(v, p); } + +WRITE_RAW_ENCODER(__u8) +#ifndef _CHAR_IS_SIGNED +WRITE_RAW_ENCODER(__s8) +#endif +WRITE_RAW_ENCODER(char) +WRITE_RAW_ENCODER(ceph_le64) +WRITE_RAW_ENCODER(ceph_le32) +WRITE_RAW_ENCODER(ceph_le16) + +inline void encode(const bool &v, bufferlist& bl) { + __u8 vv = v; + encode_raw(vv, bl); +} +inline void decode(bool &v, bufferlist::const_iterator& p) { + __u8 vv; + decode_raw(vv, p); + v = vv; +} + + +// ----------------------------------- +// int types + +#define WRITE_INTTYPE_ENCODER(type, etype) \ + inline void encode(type v, ::ceph::bufferlist& bl, uint64_t features=0) { \ + ceph_##etype e; \ + e = v; \ + ::ceph::encode_raw(e, bl); \ + } \ + inline void decode(type &v, ::ceph::bufferlist::const_iterator& p) { \ + ceph_##etype e; \ + ::ceph::decode_raw(e, p); \ + v = e; \ + } + +WRITE_INTTYPE_ENCODER(uint64_t, le64) +WRITE_INTTYPE_ENCODER(int64_t, le64) +WRITE_INTTYPE_ENCODER(uint32_t, le32) +WRITE_INTTYPE_ENCODER(int32_t, le32) +WRITE_INTTYPE_ENCODER(uint16_t, le16) +WRITE_INTTYPE_ENCODER(int16_t, le16) + +// ----------------------------------- +// float types +// +// NOTE: The following code assumes all supported platforms use IEEE binary32 +// as float and IEEE binary64 as double floating-point format. 
The assumption +// is verified by the assertions below. +// +// Under this assumption, we can use raw encoding of floating-point types +// on little-endian machines, but we still need to perform a byte swap +// on big-endian machines to ensure cross-architecture compatibility. +// To achive that, we reinterpret the values as integers first, which are +// byte-swapped via the ceph_le types as above. The extra conversions +// are optimized away on little-endian machines by the compiler. +#define WRITE_FLTTYPE_ENCODER(type, itype, etype) \ + static_assert(sizeof(type) == sizeof(itype)); \ + static_assert(std::numeric_limits<type>::is_iec559, \ + "floating-point type not using IEEE754 format"); \ + inline void encode(type v, ::ceph::bufferlist& bl, uint64_t features=0) { \ + ceph_##etype e; \ + e = *reinterpret_cast<itype *>(&v); \ + ::ceph::encode_raw(e, bl); \ + } \ + inline void decode(type &v, ::ceph::bufferlist::const_iterator& p) { \ + ceph_##etype e; \ + ::ceph::decode_raw(e, p); \ + *reinterpret_cast<itype *>(&v) = e; \ + } + +WRITE_FLTTYPE_ENCODER(float, uint32_t, le32) +WRITE_FLTTYPE_ENCODER(double, uint64_t, le64) + +// see denc.h for ENCODE_DUMP_PATH discussion and definition. 
+#ifdef ENCODE_DUMP_PATH +# define ENCODE_DUMP_PRE() \ + unsigned pre_off = bl.length() +# define ENCODE_DUMP_POST(cl) \ + do { \ + static int i = 0; \ + i++; \ + int bits = 0; \ + for (unsigned t = i; t; bits++) \ + t &= t - 1; \ + if (bits > 2) \ + break; \ + char fn[PATH_MAX]; \ + snprintf(fn, sizeof(fn), ENCODE_STRINGIFY(ENCODE_DUMP_PATH) "/%s__%d.%x", #cl, getpid(), i++); \ + int fd = ::open(fn, O_WRONLY|O_TRUNC|O_CREAT|O_CLOEXEC|O_BINARY, 0644); \ + if (fd >= 0) { \ + ::ceph::bufferlist sub; \ + sub.substr_of(bl, pre_off, bl.length() - pre_off); \ + sub.write_fd(fd); \ + ::close(fd); \ + } \ + } while (0) +#else +# define ENCODE_DUMP_PRE() +# define ENCODE_DUMP_POST(cl) +#endif + + +#define WRITE_CLASS_ENCODER(cl) \ + inline void encode(const cl& c, ::ceph::buffer::list &bl, uint64_t features=0) { \ + ENCODE_DUMP_PRE(); c.encode(bl); ENCODE_DUMP_POST(cl); } \ + inline void decode(cl &c, ::ceph::bufferlist::const_iterator &p) { c.decode(p); } + +#define WRITE_CLASS_MEMBER_ENCODER(cl) \ + inline void encode(const cl &c, ::ceph::bufferlist &bl) const { \ + ENCODE_DUMP_PRE(); c.encode(bl); ENCODE_DUMP_POST(cl); } \ + inline void decode(cl &c, ::ceph::bufferlist::const_iterator &p) { c.decode(p); } + +#define WRITE_CLASS_ENCODER_FEATURES(cl) \ + inline void encode(const cl &c, ::ceph::bufferlist &bl, uint64_t features) { \ + ENCODE_DUMP_PRE(); c.encode(bl, features); ENCODE_DUMP_POST(cl); } \ + inline void decode(cl &c, ::ceph::bufferlist::const_iterator &p) { c.decode(p); } + +#define WRITE_CLASS_ENCODER_OPTIONAL_FEATURES(cl) \ + inline void encode(const cl &c, ::ceph::bufferlist &bl, uint64_t features = 0) { \ + ENCODE_DUMP_PRE(); c.encode(bl, features); ENCODE_DUMP_POST(cl); } \ + inline void decode(cl &c, ::ceph::bufferlist::const_iterator &p) { c.decode(p); } + + +// string +inline void encode(std::string_view s, bufferlist& bl, uint64_t features=0) +{ + __u32 len = s.length(); + encode(len, bl); + if (len) + bl.append(s.data(), len); +} +inline void 
encode(const std::string& s, bufferlist& bl, uint64_t features=0) +{ + return encode(std::string_view(s), bl, features); +} +inline void decode(std::string& s, bufferlist::const_iterator& p) +{ + __u32 len; + decode(len, p); + s.clear(); + p.copy(len, s); +} + +inline void encode_nohead(std::string_view s, bufferlist& bl) +{ + bl.append(s.data(), s.length()); +} +inline void encode_nohead(const std::string& s, bufferlist& bl) +{ + encode_nohead(std::string_view(s), bl); +} +inline void decode_nohead(int len, std::string& s, bufferlist::const_iterator& p) +{ + s.clear(); + p.copy(len, s); +} + +// const char* (encode only, string compatible) +inline void encode(const char *s, bufferlist& bl) +{ + encode(std::string_view(s, strlen(s)), bl); +} + +// opaque byte vectors +inline void encode(std::vector<uint8_t>& v, bufferlist& bl) +{ + uint32_t len = v.size(); + encode(len, bl); + if (len) + bl.append((char *)v.data(), len); +} + +inline void decode(std::vector<uint8_t>& v, bufferlist::const_iterator& p) +{ + uint32_t len; + + decode(len, p); + v.resize(len); + p.copy(len, (char *)v.data()); +} + +// ----------------------------- +// buffers + +// bufferptr (encapsulated) +inline void encode(const buffer::ptr& bp, bufferlist& bl) +{ + __u32 len = bp.length(); + encode(len, bl); + if (len) + bl.append(bp); +} +inline void decode(buffer::ptr& bp, bufferlist::const_iterator& p) +{ + __u32 len; + decode(len, p); + + bufferlist s; + p.copy(len, s); + + if (len) { + if (s.get_num_buffers() == 1) + bp = s.front(); + else + bp = buffer::copy(s.c_str(), s.length()); + } +} + +// bufferlist (encapsulated) +inline void encode(const bufferlist& s, bufferlist& bl) +{ + __u32 len = s.length(); + encode(len, bl); + bl.append(s); +} +inline void encode_destructively(bufferlist& s, bufferlist& bl) +{ + __u32 len = s.length(); + encode(len, bl); + bl.claim_append(s); +} +inline void decode(bufferlist& s, bufferlist::const_iterator& p) +{ + __u32 len; + decode(len, p); + s.clear(); + 
p.copy(len, s); +} + +inline void encode_nohead(const bufferlist& s, bufferlist& bl) +{ + bl.append(s); +} +inline void decode_nohead(int len, bufferlist& s, bufferlist::const_iterator& p) +{ + s.clear(); + p.copy(len, s); +} + +// Time, since the templates are defined in std::chrono + +template<typename Clock, typename Duration, + typename std::enable_if_t<converts_to_timespec_v<Clock>>* = nullptr> +void encode(const std::chrono::time_point<Clock, Duration>& t, + ceph::bufferlist &bl) { + auto ts = Clock::to_timespec(t); + // A 32 bit count of seconds causes me vast unhappiness. + uint32_t s = ts.tv_sec; + uint32_t ns = ts.tv_nsec; + encode(s, bl); + encode(ns, bl); +} + +template<typename Clock, typename Duration, + typename std::enable_if_t<converts_to_timespec_v<Clock>>* = nullptr> +void decode(std::chrono::time_point<Clock, Duration>& t, + bufferlist::const_iterator& p) { + uint32_t s; + uint32_t ns; + decode(s, p); + decode(ns, p); + struct timespec ts = { + static_cast<time_t>(s), + static_cast<long int>(ns)}; + + t = Clock::from_timespec(ts); +} + +template<typename Rep, typename Period, + typename std::enable_if_t<std::is_integral_v<Rep>>* = nullptr> +void encode(const std::chrono::duration<Rep, Period>& d, + ceph::bufferlist &bl) { + using namespace std::chrono; + int32_t s = duration_cast<seconds>(d).count(); + int32_t ns = (duration_cast<nanoseconds>(d) % seconds(1)).count(); + encode(s, bl); + encode(ns, bl); +} + +template<typename Rep, typename Period, + typename std::enable_if_t<std::is_integral_v<Rep>>* = nullptr> +void decode(std::chrono::duration<Rep, Period>& d, + bufferlist::const_iterator& p) { + int32_t s; + int32_t ns; + decode(s, p); + decode(ns, p); + d = std::chrono::seconds(s) + std::chrono::nanoseconds(ns); +} + +// ----------------------------- +// STL container types + +template<typename T> +inline void encode(const boost::optional<T> &p, bufferlist &bl); +template<typename T> +inline void decode(boost::optional<T> &p, 
bufferlist::const_iterator &bp); +template<typename T> +inline void encode(const std::optional<T> &p, bufferlist &bl); +template<typename T> +inline void decode(std::optional<T> &p, bufferlist::const_iterator &bp); +template<class A, class B, class C> +inline void encode(const boost::tuple<A, B, C> &t, bufferlist& bl); +template<class A, class B, class C> +inline void decode(boost::tuple<A, B, C> &t, bufferlist::const_iterator &bp); +template<class A, class B, + typename a_traits=denc_traits<A>, typename b_traits=denc_traits<B>> +inline std::enable_if_t<!a_traits::supported || !b_traits::supported> +encode(const std::pair<A,B> &p, bufferlist &bl, uint64_t features); +template<class A, class B, + typename a_traits=denc_traits<A>, typename b_traits=denc_traits<B>> +inline std::enable_if_t<!a_traits::supported || + !b_traits::supported> +encode(const std::pair<A,B> &p, bufferlist &bl); +template<class A, class B, + typename a_traits=denc_traits<A>, typename b_traits=denc_traits<B>> +inline std::enable_if_t<!a_traits::supported || + !b_traits::supported> +decode(std::pair<A,B> &pa, bufferlist::const_iterator &p); +template<class T, class Alloc, typename traits=denc_traits<T>> +inline std::enable_if_t<!traits::supported> +encode(const std::list<T, Alloc>& ls, bufferlist& bl); +template<class T, class Alloc, typename traits=denc_traits<T>> +inline std::enable_if_t<!traits::supported> +encode(const std::list<T,Alloc>& ls, bufferlist& bl, uint64_t features); +template<class T, class Alloc, typename traits=denc_traits<T>> +inline std::enable_if_t<!traits::supported> +decode(std::list<T,Alloc>& ls, bufferlist::const_iterator& p); +template<class T, class Alloc> +inline void encode(const std::list<std::shared_ptr<T>, Alloc>& ls, + bufferlist& bl); +template<class T, class Alloc> +inline void encode(const std::list<std::shared_ptr<T>, Alloc>& ls, + bufferlist& bl, uint64_t features); +template<class T, class Alloc> +inline void decode(std::list<std::shared_ptr<T>, Alloc>& ls, 
+ bufferlist::const_iterator& p); +template<class T, class Comp, class Alloc, typename traits=denc_traits<T>> +inline std::enable_if_t<!traits::supported> +encode(const std::set<T,Comp,Alloc>& s, bufferlist& bl); +template<class T, class Comp, class Alloc, typename traits=denc_traits<T>> +inline std::enable_if_t<!traits::supported> +decode(std::set<T,Comp,Alloc>& s, bufferlist::const_iterator& p); +template<class T, class Comp, class Alloc, typename traits=denc_traits<T>> +inline std::enable_if_t<!traits::supported> +encode_nohead(const std::set<T,Comp,Alloc>& s, bufferlist& bl); +template<class T, class Comp, class Alloc, typename traits=denc_traits<T>> +inline std::enable_if_t<!traits::supported> +decode_nohead(int len, std::set<T,Comp,Alloc>& s, bufferlist::iterator& p); +template<class T, class Comp, class Alloc, typename traits=denc_traits<T>> +inline std::enable_if_t<!traits::supported> +encode(const boost::container::flat_set<T, Comp, Alloc>& s, bufferlist& bl); +template<class T, class Comp, class Alloc, typename traits=denc_traits<T>> +inline std::enable_if_t<!traits::supported> +decode(boost::container::flat_set<T, Comp, Alloc>& s, bufferlist::const_iterator& p); +template<class T, class Comp, class Alloc, typename traits=denc_traits<T>> +inline std::enable_if_t<!traits::supported> +encode_nohead(const boost::container::flat_set<T, Comp, Alloc>& s, + bufferlist& bl); +template<class T, class Comp, class Alloc, typename traits=denc_traits<T>> +inline std::enable_if_t<!traits::supported> +decode_nohead(int len, boost::container::flat_set<T, Comp, Alloc>& s, + bufferlist::iterator& p); +template<class T, class Comp, class Alloc> +inline void encode(const std::multiset<T,Comp,Alloc>& s, bufferlist& bl); +template<class T, class Comp, class Alloc> +inline void decode(std::multiset<T,Comp,Alloc>& s, bufferlist::const_iterator& p); +template<class T, class Alloc, typename traits=denc_traits<T>> +inline std::enable_if_t<!traits::supported> +encode(const 
std::vector<T,Alloc>& v, bufferlist& bl, uint64_t features); +template<class T, class Alloc, typename traits=denc_traits<T>> +inline std::enable_if_t<!traits::supported> +encode(const std::vector<T,Alloc>& v, bufferlist& bl); +template<class T, class Alloc, typename traits=denc_traits<T>> +inline std::enable_if_t<!traits::supported> +decode(std::vector<T,Alloc>& v, bufferlist::const_iterator& p); +template<class T, class Alloc, typename traits=denc_traits<T>> +inline std::enable_if_t<!traits::supported> +encode_nohead(const std::vector<T,Alloc>& v, bufferlist& bl); +template<class T, class Alloc, typename traits=denc_traits<T>> +inline std::enable_if_t<!traits::supported> +decode_nohead(int len, std::vector<T,Alloc>& v, bufferlist::const_iterator& p); +template<class T,class Alloc> +inline void encode(const std::vector<std::shared_ptr<T>,Alloc>& v, + bufferlist& bl, + uint64_t features); +template<class T, class Alloc> +inline void encode(const std::vector<std::shared_ptr<T>,Alloc>& v, + bufferlist& bl); +template<class T, class Alloc> +inline void decode(std::vector<std::shared_ptr<T>,Alloc>& v, + bufferlist::const_iterator& p); +// small_vector +template<class T, std::size_t N, class Alloc, typename traits=denc_traits<T>> +inline std::enable_if_t<!traits::supported> +encode(const boost::container::small_vector<T,N,Alloc>& v, bufferlist& bl, uint64_t features); +template<class T, std::size_t N, class Alloc, typename traits=denc_traits<T>> +inline std::enable_if_t<!traits::supported> +encode(const boost::container::small_vector<T,N,Alloc>& v, bufferlist& bl); +template<class T, std::size_t N, class Alloc, typename traits=denc_traits<T>> +inline std::enable_if_t<!traits::supported> +decode(boost::container::small_vector<T,N,Alloc>& v, bufferlist::const_iterator& p); +template<class T, std::size_t N, class Alloc, typename traits=denc_traits<T>> +inline std::enable_if_t<!traits::supported> +encode_nohead(const boost::container::small_vector<T,N,Alloc>& v, bufferlist& 
bl); +template<class T, std::size_t N, class Alloc, typename traits=denc_traits<T>> +inline std::enable_if_t<!traits::supported> +decode_nohead(int len, boost::container::small_vector<T,N,Alloc>& v, bufferlist::const_iterator& p); +// std::map +template<class T, class U, class Comp, class Alloc, + typename t_traits=denc_traits<T>, typename u_traits=denc_traits<U>> +inline std::enable_if_t<!t_traits::supported || + !u_traits::supported> +encode(const std::map<T,U,Comp,Alloc>& m, bufferlist& bl); +template<class T, class U, class Comp, class Alloc, + typename t_traits=denc_traits<T>, typename u_traits=denc_traits<U>> +inline std::enable_if_t<!t_traits::supported || !u_traits::supported> +encode(const std::map<T,U,Comp,Alloc>& m, bufferlist& bl, uint64_t features); +template<class T, class U, class Comp, class Alloc, + typename t_traits=denc_traits<T>, typename u_traits=denc_traits<U>> +inline std::enable_if_t<!t_traits::supported || !u_traits::supported> +decode(std::map<T,U,Comp,Alloc>& m, bufferlist::const_iterator& p); +template<class T, class U, class Comp, class Alloc> +inline void decode_noclear(std::map<T,U,Comp,Alloc>& m, bufferlist::const_iterator& p); +template<class T, class U, class Comp, class Alloc, + typename t_traits=denc_traits<T>, typename u_traits=denc_traits<U>> +inline std::enable_if_t<!t_traits::supported || !u_traits::supported> +encode_nohead(const std::map<T,U,Comp,Alloc>& m, bufferlist& bl); +template<class T, class U, class Comp, class Alloc, + typename t_traits=denc_traits<T>, typename u_traits=denc_traits<U>> +inline std::enable_if_t<!t_traits::supported || !u_traits::supported> +encode_nohead(const std::map<T,U,Comp,Alloc>& m, bufferlist& bl, uint64_t features); +template<class T, class U, class Comp, class Alloc, + typename t_traits=denc_traits<T>, typename u_traits=denc_traits<U>> +inline std::enable_if_t<!t_traits::supported || !u_traits::supported> +decode_nohead(int n, std::map<T,U,Comp,Alloc>& m, bufferlist::const_iterator& p); 
+template<class T, class U, class Comp, class Alloc, + typename t_traits=denc_traits<T>, typename u_traits=denc_traits<U>> + inline std::enable_if_t<!t_traits::supported || !u_traits::supported> +encode(const boost::container::flat_map<T,U,Comp,Alloc>& m, bufferlist& bl); +template<class T, class U, class Comp, class Alloc, + typename t_traits=denc_traits<T>, typename u_traits=denc_traits<U>> +inline std::enable_if_t<!t_traits::supported || !u_traits::supported> +encode(const boost::container::flat_map<T,U,Comp,Alloc>& m, bufferlist& bl, + uint64_t features); +template<class T, class U, class Comp, class Alloc, + typename t_traits=denc_traits<T>, typename u_traits=denc_traits<U>> +inline std::enable_if_t<!t_traits::supported || !u_traits::supported> +decode(boost::container::flat_map<T,U,Comp,Alloc>& m, bufferlist::const_iterator& p); +template<class T, class U, class Comp, class Alloc> +inline void decode_noclear(boost::container::flat_map<T,U,Comp,Alloc>& m, + bufferlist::const_iterator& p); +template<class T, class U, class Comp, class Alloc, + typename t_traits=denc_traits<T>, typename u_traits=denc_traits<U>> +inline std::enable_if_t<!t_traits::supported || !u_traits::supported> +encode_nohead(const boost::container::flat_map<T,U,Comp,Alloc>& m, + bufferlist& bl); +template<class T, class U, class Comp, class Alloc, + typename t_traits=denc_traits<T>, typename u_traits=denc_traits<U>> +inline std::enable_if_t<!t_traits::supported || !u_traits::supported> +encode_nohead(const boost::container::flat_map<T,U,Comp,Alloc>& m, + bufferlist& bl, uint64_t features); +template<class T, class U, class Comp, class Alloc, + typename t_traits=denc_traits<T>, typename u_traits=denc_traits<U>> +inline std::enable_if_t<!t_traits::supported || !u_traits::supported> +decode_nohead(int n, boost::container::flat_map<T,U,Comp,Alloc>& m, + bufferlist::const_iterator& p); +template<class T, class U, class Comp, class Alloc> +inline void encode(const std::multimap<T,U,Comp,Alloc>& m, 
bufferlist& bl); +template<class T, class U, class Comp, class Alloc> +inline void decode(std::multimap<T,U,Comp,Alloc>& m, bufferlist::const_iterator& p); +template<class T, class U, class Hash, class Pred, class Alloc> +inline void encode(const unordered_map<T,U,Hash,Pred,Alloc>& m, bufferlist& bl, + uint64_t features); +template<class T, class U, class Hash, class Pred, class Alloc> +inline void encode(const unordered_map<T,U,Hash,Pred,Alloc>& m, bufferlist& bl); +template<class T, class U, class Hash, class Pred, class Alloc> +inline void decode(unordered_map<T,U,Hash,Pred,Alloc>& m, bufferlist::const_iterator& p); +template<class T, class Hash, class Pred, class Alloc> +inline void encode(const ceph::unordered_set<T,Hash,Pred,Alloc>& m, bufferlist& bl); +template<class T, class Hash, class Pred, class Alloc> +inline void decode(ceph::unordered_set<T,Hash,Pred,Alloc>& m, bufferlist::const_iterator& p); +template<class T, class Alloc> +inline void encode(const std::deque<T,Alloc>& ls, bufferlist& bl, uint64_t features); +template<class T, class Alloc> +inline void encode(const std::deque<T,Alloc>& ls, bufferlist& bl); +template<class T, class Alloc> +inline void decode(std::deque<T,Alloc>& ls, bufferlist::const_iterator& p); +template<class T, size_t N, typename traits = denc_traits<T>> +inline std::enable_if_t<!traits::supported> +encode(const std::array<T, N>& v, bufferlist& bl, uint64_t features); +template<class T, size_t N, typename traits = denc_traits<T>> +inline std::enable_if_t<!traits::supported> +encode(const std::array<T, N>& v, bufferlist& bl); +template<class T, size_t N, typename traits = denc_traits<T>> +inline std::enable_if_t<!traits::supported> +decode(std::array<T, N>& v, bufferlist::const_iterator& p); + +// full bl decoder +template<class T> +inline void decode(T &o, const bufferlist& bl) +{ + auto p = bl.begin(); + decode(o, p); + ceph_assert(p.end()); +} + +// boost optional +template<typename T> +inline void encode(const 
boost::optional<T> &p, bufferlist &bl) +{ + __u8 present = static_cast<bool>(p); + encode(present, bl); + if (p) + encode(p.get(), bl); +} + +#pragma GCC diagnostic ignored "-Wpragmas" +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wuninitialized" +template<typename T> +inline void decode(boost::optional<T> &p, bufferlist::const_iterator &bp) +{ + __u8 present; + decode(present, bp); + if (present) { + p = T{}; + decode(p.get(), bp); + } else { + p = boost::none; + } +} +#pragma GCC diagnostic pop +#pragma GCC diagnostic warning "-Wpragmas" + +// std optional +template<typename T> +inline void encode(const std::optional<T> &p, bufferlist &bl) +{ + __u8 present = static_cast<bool>(p); + encode(present, bl); + if (p) + encode(*p, bl); +} + +#pragma GCC diagnostic ignored "-Wpragmas" +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wuninitialized" +template<typename T> +inline void decode(std::optional<T> &p, bufferlist::const_iterator &bp) +{ + __u8 present; + decode(present, bp); + if (present) { + p = T{}; + decode(*p, bp); + } else { + p = std::nullopt; + } +} + +// std::tuple +template<typename... Ts> +inline void encode(const std::tuple<Ts...> &t, bufferlist& bl) +{ + ceph::for_each(t, [&bl](const auto& e) { + encode(e, bl); + }); +} +template<typename... 
Ts> +inline void decode(std::tuple<Ts...> &t, bufferlist::const_iterator &bp) +{ + ceph::for_each(t, [&bp](auto& e) { + decode(e, bp); + }); +} + +//triple boost::tuple +template<class A, class B, class C> +inline void encode(const boost::tuple<A, B, C> &t, bufferlist& bl) +{ + encode(boost::get<0>(t), bl); + encode(boost::get<1>(t), bl); + encode(boost::get<2>(t), bl); +} +template<class A, class B, class C> +inline void decode(boost::tuple<A, B, C> &t, bufferlist::const_iterator &bp) +{ + decode(boost::get<0>(t), bp); + decode(boost::get<1>(t), bp); + decode(boost::get<2>(t), bp); +} + +// std::pair<A,B> +template<class A, class B, + typename a_traits, typename b_traits> +inline std::enable_if_t<!a_traits::supported || !b_traits::supported> + encode(const std::pair<A,B> &p, bufferlist &bl, uint64_t features) +{ + encode(p.first, bl, features); + encode(p.second, bl, features); +} +template<class A, class B, + typename a_traits, typename b_traits> +inline std::enable_if_t<!a_traits::supported || + !b_traits::supported> + encode(const std::pair<A,B> &p, bufferlist &bl) +{ + encode(p.first, bl); + encode(p.second, bl); +} +template<class A, class B, typename a_traits, typename b_traits> +inline std::enable_if_t<!a_traits::supported || + !b_traits::supported> + decode(std::pair<A,B> &pa, bufferlist::const_iterator &p) +{ + decode(pa.first, p); + decode(pa.second, p); +} + +// std::list<T> +template<class T, class Alloc, typename traits> +inline std::enable_if_t<!traits::supported> + encode(const std::list<T, Alloc>& ls, bufferlist& bl) +{ + __u32 n = (__u32)(ls.size()); // c++11 std::list::size() is O(1) + encode(n, bl); + for (auto p = ls.begin(); p != ls.end(); ++p) + encode(*p, bl); +} +template<class T, class Alloc, typename traits> +inline std::enable_if_t<!traits::supported> + encode(const std::list<T,Alloc>& ls, bufferlist& bl, uint64_t features) +{ + using counter_encode_t = ceph_le32; + unsigned n = 0; + auto filler = 
bl.append_hole(sizeof(counter_encode_t));
  for (const auto& item : ls) {
    // we count on our own because of buggy std::list::size() implementation
    // which doesn't follow the O(1) complexity constraint C++11 has brought.
    ++n;
    encode(item, bl, features);
  }
  counter_encode_t en;
  en = n;
  // back-fill the element count into the hole reserved before the loop
  filler.copy_in(sizeof(en), reinterpret_cast<char*>(&en));
}

template<class T, class Alloc, typename traits>
inline std::enable_if_t<!traits::supported>
  decode(std::list<T,Alloc>& ls, bufferlist::const_iterator& p)
{
  __u32 n;
  decode(n, p);
  ls.clear();
  while (n--) {
    // default-construct in place, then decode directly into the new element
    ls.emplace_back();
    decode(ls.back(), p);
  }
}

// std::list<std::shared_ptr<T>> -- encoded as a list of the pointees;
// the shared_ptr wrapper itself is not part of the wire format.
template<class T, class Alloc>
inline void encode(const std::list<std::shared_ptr<T>, Alloc>& ls,
		   bufferlist& bl)
{
  __u32 n = (__u32)(ls.size());  // c++11 std::list::size() is O(1)
  encode(n, bl);
  for (const auto& ref : ls) {
    encode(*ref, bl);
  }
}
template<class T, class Alloc>
inline void encode(const std::list<std::shared_ptr<T>, Alloc>& ls,
		   bufferlist& bl, uint64_t features)
{
  __u32 n = (__u32)(ls.size());  // c++11 std::list::size() is O(1)
  encode(n, bl);
  for (const auto& ref : ls) {
    encode(*ref, bl, features);
  }
}
template<class T, class Alloc>
inline void decode(std::list<std::shared_ptr<T>, Alloc>& ls,
		   bufferlist::const_iterator& p)
{
  __u32 n;
  decode(n, p);
  ls.clear();
  while (n--) {
    auto ref = std::make_shared<T>();
    decode(*ref, p);
    ls.emplace_back(std::move(ref));
  }
}

// std::set<T> -- __u32 element count, then the elements in order.
template<class T, class Comp, class Alloc, typename traits>
inline std::enable_if_t<!traits::supported>
  encode(const std::set<T,Comp,Alloc>& s, bufferlist& bl)
{
  __u32 n = (__u32)(s.size());
  encode(n, bl);
  for (auto p = s.begin(); p != s.end(); ++p)
    encode(*p, bl);
}
template<class T, class Comp, class Alloc, typename traits>
inline std::enable_if_t<!traits::supported>
  decode(std::set<T,Comp,Alloc>& s, bufferlist::const_iterator& p)
{
  __u32 n;
  decode(n, p);
  s.clear();
  while (n--) {
    T v;
    decode(v, p);
    s.insert(v);
  }
}

// "nohead" variants: the element count is encoded/decoded by the caller.
template<class T, class Comp, class Alloc, typename traits>
inline typename std::enable_if<!traits::supported>::type
  encode_nohead(const std::set<T,Comp,Alloc>& s, bufferlist& bl)
{
  for (auto p = s.begin(); p != s.end(); ++p)
    encode(*p, bl);
}
template<class T, class Comp, class Alloc, typename traits>
inline std::enable_if_t<!traits::supported>
  decode_nohead(int len, std::set<T,Comp,Alloc>& s, bufferlist::const_iterator& p)
{
  for (int i=0; i<len; i++) {
    T v;
    decode(v, p);
    s.insert(v);
  }
}

// boost::container::flat_set<T> -- same wire format as std::set, but the
// decoders reserve() the flat storage to avoid repeated reallocation.
template<class T, class Comp, class Alloc, typename traits>
inline std::enable_if_t<!traits::supported>
encode(const boost::container::flat_set<T, Comp, Alloc>& s, bufferlist& bl)
{
  __u32 n = (__u32)(s.size());
  encode(n, bl);
  for (const auto& e : s)
    encode(e, bl);
}
template<class T, class Comp, class Alloc, typename traits>
inline std::enable_if_t<!traits::supported>
decode(boost::container::flat_set<T, Comp, Alloc>& s, bufferlist::const_iterator& p)
{
  __u32 n;
  decode(n, p);
  s.clear();
  s.reserve(n);
  while (n--) {
    T v;
    decode(v, p);
    s.insert(v);
  }
}

template<class T, class Comp, class Alloc, typename traits>
inline std::enable_if_t<!traits::supported>
encode_nohead(const boost::container::flat_set<T, Comp, Alloc>& s,
	      bufferlist& bl)
{
  for (const auto& e : s)
    encode(e, bl);
}
// NOTE(review): unlike every other decode_nohead overload in this header,
// this one takes bufferlist::iterator rather than bufferlist::const_iterator.
// Looks like an oversight -- confirm against callers before changing, since
// altering the parameter type changes overload resolution.
template<class T, class Comp, class Alloc, typename traits>
inline std::enable_if_t<!traits::supported>
decode_nohead(int len, boost::container::flat_set<T, Comp, Alloc>& s,
	      bufferlist::iterator& p)
{
  s.reserve(len);
  for (int i=0; i<len; i++) {
    T v;
    decode(v, p);
    s.insert(v);
  }
}

// multiset -- __u32 element count, then the elements in order.
template<class T, class Comp, class Alloc>
inline void encode(const std::multiset<T,Comp,Alloc>& s, bufferlist& bl)
{
  __u32 n = (__u32)(s.size());
  encode(n, bl);
  for (auto p = s.begin(); p !=
s.end(); ++p)
    encode(*p, bl);
}
template<class T, class Comp, class Alloc>
inline void decode(std::multiset<T,Comp,Alloc>& s, bufferlist::const_iterator& p)
{
  __u32 n;
  decode(n, p);
  s.clear();
  while (n--) {
    T v;
    decode(v, p);
    s.insert(v);
  }
}

// std::vector<T> -- __u32 element count, then the elements in order.
template<class T, class Alloc, typename traits>
inline std::enable_if_t<!traits::supported>
  encode(const std::vector<T,Alloc>& v, bufferlist& bl, uint64_t features)
{
  __u32 n = (__u32)(v.size());
  encode(n, bl);
  for (auto p = v.begin(); p != v.end(); ++p)
    encode(*p, bl, features);
}
template<class T, class Alloc, typename traits>
inline std::enable_if_t<!traits::supported>
  encode(const std::vector<T,Alloc>& v, bufferlist& bl)
{
  __u32 n = (__u32)(v.size());
  encode(n, bl);
  for (auto p = v.begin(); p != v.end(); ++p)
    encode(*p, bl);
}
template<class T, class Alloc, typename traits>
inline std::enable_if_t<!traits::supported>
  decode(std::vector<T,Alloc>& v, bufferlist::const_iterator& p)
{
  __u32 n;
  decode(n, p);
  // resize() first so each element is decoded in place
  v.resize(n);
  for (__u32 i=0; i<n; i++)
    decode(v[i], p);
}

// "nohead" variants: the element count is handled by the caller.
template<class T, class Alloc, typename traits>
inline std::enable_if_t<!traits::supported>
  encode_nohead(const std::vector<T,Alloc>& v, bufferlist& bl)
{
  for (auto p = v.begin(); p != v.end(); ++p)
    encode(*p, bl);
}
// NOTE(review): len is an int; a negative value would resize() to a huge
// size_t. Presumably callers only pass counts they just decoded -- confirm.
template<class T, class Alloc, typename traits>
inline std::enable_if_t<!traits::supported>
  decode_nohead(int len, std::vector<T,Alloc>& v, bufferlist::const_iterator& p)
{
  v.resize(len);
  for (__u32 i=0; i<v.size(); i++)
    decode(v[i], p);
}

// small vector -- same wire format as std::vector.
template<class T, std::size_t N, class Alloc, typename traits>
inline std::enable_if_t<!traits::supported>
  encode(const boost::container::small_vector<T,N,Alloc>& v, bufferlist& bl, uint64_t features)
{
  __u32 n = (__u32)(v.size());
  encode(n, bl);
  for (const auto& i : v)
    encode(i, bl, features);
}
template<class T, std::size_t N, class Alloc, typename traits>
inline std::enable_if_t<!traits::supported>
  encode(const boost::container::small_vector<T,N,Alloc>& v, bufferlist& bl)
{
  __u32 n = (__u32)(v.size());
  encode(n, bl);
  for (const auto& i : v)
    encode(i, bl);
}
template<class T, std::size_t N, class Alloc, typename traits>
inline std::enable_if_t<!traits::supported>
  decode(boost::container::small_vector<T,N,Alloc>& v, bufferlist::const_iterator& p)
{
  __u32 n;
  decode(n, p);
  v.resize(n);
  for (auto& i : v)
    decode(i, p);
}

template<class T, std::size_t N, class Alloc, typename traits>
inline std::enable_if_t<!traits::supported>
  encode_nohead(const boost::container::small_vector<T,N,Alloc>& v, bufferlist& bl)
{
  for (const auto& i : v)
    encode(i, bl);
}
template<class T, std::size_t N, class Alloc, typename traits>
inline std::enable_if_t<!traits::supported>
  decode_nohead(int len, boost::container::small_vector<T,N,Alloc>& v, bufferlist::const_iterator& p)
{
  v.resize(len);
  for (auto& i : v)
    decode(i, p);
}


// vector (shared_ptr) -- encoded as the pointees; a null element is
// encoded as a default-constructed T(), so nullness does not round-trip.
template<class T,class Alloc>
inline void encode(const std::vector<std::shared_ptr<T>,Alloc>& v,
		   bufferlist& bl,
		   uint64_t features)
{
  __u32 n = (__u32)(v.size());
  encode(n, bl);
  for (const auto& ref : v) {
    if (ref)
      encode(*ref, bl, features);
    else
      encode(T(), bl, features);
  }
}
template<class T, class Alloc>
inline void encode(const std::vector<std::shared_ptr<T>,Alloc>& v,
		   bufferlist& bl)
{
  __u32 n = (__u32)(v.size());
  encode(n, bl);
  for (const auto& ref : v) {
    if (ref)
      encode(*ref, bl);
    else
      encode(T(), bl);
  }
}
template<class T, class Alloc>
inline void decode(std::vector<std::shared_ptr<T>,Alloc>& v,
		   bufferlist::const_iterator& p)
{
  __u32 n;
  decode(n, p);
  v.clear();
  v.reserve(n);
  while (n--) {
    auto ref = std::make_shared<T>();
    decode(*ref, p);
    v.emplace_back(std::move(ref));
  }
}

// map -- __u32 element count, then alternating key/value pairs.
template<class T, class U, class Comp, class Alloc,
	 typename t_traits, typename u_traits>
inline std::enable_if_t<!t_traits::supported ||
			!u_traits::supported>
  encode(const std::map<T,U,Comp,Alloc>& m, bufferlist& bl)
{
  __u32 n = (__u32)(m.size());
  encode(n, bl);
  for (auto p = m.begin(); p != m.end(); ++p) {
    encode(p->first, bl);
    encode(p->second, bl);
  }
}
template<class T, class U, class Comp, class Alloc,
	 typename t_traits, typename u_traits>
inline std::enable_if_t<!t_traits::supported || !u_traits::supported>
  encode(const std::map<T,U,Comp,Alloc>& m, bufferlist& bl, uint64_t features)
{
  __u32 n = (__u32)(m.size());
  encode(n, bl);
  for (auto p = m.begin(); p != m.end(); ++p) {
    encode(p->first, bl, features);
    encode(p->second, bl, features);
  }
}
template<class T, class U, class Comp, class Alloc,
	 typename t_traits, typename u_traits>
inline std::enable_if_t<!t_traits::supported || !u_traits::supported>
  decode(std::map<T,U,Comp,Alloc>& m, bufferlist::const_iterator& p)
{
  __u32 n;
  decode(n, p);
  m.clear();
  while (n--) {
    T k;
    decode(k, p);
    // operator[] inserts the slot, then the value is decoded in place
    decode(m[k], p);
  }
}
// like decode(), but merges into the map's existing contents
template<class T, class U, class Comp, class Alloc>
inline void decode_noclear(std::map<T,U,Comp,Alloc>& m, bufferlist::const_iterator& p)
{
  __u32 n;
  decode(n, p);
  while (n--) {
    T k;
    decode(k, p);
    decode(m[k], p);
  }
}
// "nohead" variants: the element count is handled by the caller.
template<class T, class U, class Comp, class Alloc,
	 typename t_traits, typename u_traits>
inline std::enable_if_t<!t_traits::supported || !u_traits::supported>
  encode_nohead(const std::map<T,U,Comp,Alloc>& m, bufferlist& bl)
{
  for (auto p = m.begin(); p != m.end(); ++p) {
    encode(p->first, bl);
    encode(p->second, bl);
  }
}
template<class T, class U, class Comp, class Alloc,
	 typename t_traits, typename u_traits>
inline std::enable_if_t<!t_traits::supported || !u_traits::supported>
  encode_nohead(const std::map<T,U,Comp,Alloc>& m, bufferlist& bl, uint64_t features)
{
  for (auto p = m.begin(); p != m.end(); ++p) {
    encode(p->first, bl, features);
    encode(p->second, bl, features);
  }
}
+template<class T, class U, class Comp, class Alloc, + typename t_traits, typename u_traits> +inline std::enable_if_t<!t_traits::supported || !u_traits::supported> + decode_nohead(int n, std::map<T,U,Comp,Alloc>& m, bufferlist::const_iterator& p) +{ + m.clear(); + while (n--) { + T k; + decode(k, p); + decode(m[k], p); + } +} + +// boost::container::flat-map +template<class T, class U, class Comp, class Alloc, + typename t_traits, typename u_traits> + inline std::enable_if_t<!t_traits::supported || !u_traits::supported> + encode(const boost::container::flat_map<T,U,Comp,Alloc>& m, bufferlist& bl) +{ + __u32 n = (__u32)(m.size()); + encode(n, bl); + for (typename boost::container::flat_map<T,U,Comp>::const_iterator p + = m.begin(); p != m.end(); ++p) { + encode(p->first, bl); + encode(p->second, bl); + } +} +template<class T, class U, class Comp, class Alloc, + typename t_traits, typename u_traits> + inline std::enable_if_t<!t_traits::supported || !u_traits::supported> + encode(const boost::container::flat_map<T,U,Comp,Alloc>& m, bufferlist& bl, + uint64_t features) +{ + __u32 n = (__u32)(m.size()); + encode(n, bl); + for (auto p = m.begin(); p != m.end(); ++p) { + encode(p->first, bl, features); + encode(p->second, bl, features); + } +} +template<class T, class U, class Comp, class Alloc, + typename t_traits, typename u_traits> + inline std::enable_if_t<!t_traits::supported || !u_traits::supported> + decode(boost::container::flat_map<T,U,Comp,Alloc>& m, bufferlist::const_iterator& p) +{ + __u32 n; + decode(n, p); + m.clear(); + m.reserve(n); + while (n--) { + T k; + decode(k, p); + decode(m[k], p); + } +} +template<class T, class U, class Comp, class Alloc> +inline void decode_noclear(boost::container::flat_map<T,U,Comp,Alloc>& m, + bufferlist::const_iterator& p) +{ + __u32 n; + decode(n, p); + m.reserve(m.size() + n); + while (n--) { + T k; + decode(k, p); + decode(m[k], p); + } +} +template<class T, class U, class Comp, class Alloc, + typename t_traits, typename 
u_traits> + inline std::enable_if_t<!t_traits::supported || !u_traits::supported> + encode_nohead(const boost::container::flat_map<T,U,Comp,Alloc>& m, + bufferlist& bl) +{ + for (auto p = m.begin(); p != m.end(); ++p) { + encode(p->first, bl); + encode(p->second, bl); + } +} +template<class T, class U, class Comp, class Alloc, + typename t_traits, typename u_traits> + inline std::enable_if_t<!t_traits::supported || !u_traits::supported> + encode_nohead(const boost::container::flat_map<T,U,Comp,Alloc>& m, + bufferlist& bl, uint64_t features) +{ + for (auto p = m.begin(); p != m.end(); ++p) { + encode(p->first, bl, features); + encode(p->second, bl, features); + } +} +template<class T, class U, class Comp, class Alloc, + typename t_traits, typename u_traits> +inline std::enable_if_t<!t_traits::supported || !u_traits::supported> + decode_nohead(int n, boost::container::flat_map<T,U,Comp,Alloc>& m, + bufferlist::const_iterator& p) +{ + m.clear(); + while (n--) { + T k; + decode(k, p); + decode(m[k], p); + } +} + +// multimap +template<class T, class U, class Comp, class Alloc> +inline void encode(const std::multimap<T,U,Comp,Alloc>& m, bufferlist& bl) +{ + __u32 n = (__u32)(m.size()); + encode(n, bl); + for (auto p = m.begin(); p != m.end(); ++p) { + encode(p->first, bl); + encode(p->second, bl); + } +} +template<class T, class U, class Comp, class Alloc> +inline void decode(std::multimap<T,U,Comp,Alloc>& m, bufferlist::const_iterator& p) +{ + __u32 n; + decode(n, p); + m.clear(); + while (n--) { + typename std::pair<T,U> tu = std::pair<T,U>(); + decode(tu.first, p); + typename std::multimap<T,U,Comp,Alloc>::iterator it = m.insert(tu); + decode(it->second, p); + } +} + +// ceph::unordered_map +template<class T, class U, class Hash, class Pred, class Alloc> +inline void encode(const unordered_map<T,U,Hash,Pred,Alloc>& m, bufferlist& bl, + uint64_t features) +{ + __u32 n = (__u32)(m.size()); + encode(n, bl); + for (auto p = m.begin(); p != m.end(); ++p) { + 
encode(p->first, bl, features); + encode(p->second, bl, features); + } +} +template<class T, class U, class Hash, class Pred, class Alloc> +inline void encode(const unordered_map<T,U,Hash,Pred,Alloc>& m, bufferlist& bl) +{ + __u32 n = (__u32)(m.size()); + encode(n, bl); + for (auto p = m.begin(); p != m.end(); ++p) { + encode(p->first, bl); + encode(p->second, bl); + } +} +template<class T, class U, class Hash, class Pred, class Alloc> +inline void decode(unordered_map<T,U,Hash,Pred,Alloc>& m, bufferlist::const_iterator& p) +{ + __u32 n; + decode(n, p); + m.clear(); + while (n--) { + T k; + decode(k, p); + decode(m[k], p); + } +} + +// ceph::unordered_set +template<class T, class Hash, class Pred, class Alloc> +inline void encode(const ceph::unordered_set<T,Hash,Pred,Alloc>& m, bufferlist& bl) +{ + __u32 n = (__u32)(m.size()); + encode(n, bl); + for (auto p = m.begin(); p != m.end(); ++p) + encode(*p, bl); +} +template<class T, class Hash, class Pred, class Alloc> +inline void decode(ceph::unordered_set<T,Hash,Pred,Alloc>& m, bufferlist::const_iterator& p) +{ + __u32 n; + decode(n, p); + m.clear(); + while (n--) { + T k; + decode(k, p); + m.insert(k); + } +} + +// deque +template<class T, class Alloc> +inline void encode(const std::deque<T,Alloc>& ls, bufferlist& bl, uint64_t features) +{ + __u32 n = ls.size(); + encode(n, bl); + for (auto p = ls.begin(); p != ls.end(); ++p) + encode(*p, bl, features); +} +template<class T, class Alloc> +inline void encode(const std::deque<T,Alloc>& ls, bufferlist& bl) +{ + __u32 n = ls.size(); + encode(n, bl); + for (auto p = ls.begin(); p != ls.end(); ++p) + encode(*p, bl); +} +template<class T, class Alloc> +inline void decode(std::deque<T,Alloc>& ls, bufferlist::const_iterator& p) +{ + __u32 n; + decode(n, p); + ls.clear(); + while (n--) { + ls.emplace_back(); + decode(ls.back(), p); + } +} + +// std::array<T, N> +template<class T, size_t N, typename traits> +inline std::enable_if_t<!traits::supported> +encode(const 
std::array<T, N>& v, bufferlist& bl, uint64_t features)
{
  for (const auto& e : v)
    encode(e, bl, features);
}
template<class T, size_t N, typename traits>
inline std::enable_if_t<!traits::supported>
encode(const std::array<T, N>& v, bufferlist& bl)
{
  for (const auto& e : v)
    encode(e, bl);
}
template<class T, size_t N, typename traits>
inline std::enable_if_t<!traits::supported>
decode(std::array<T, N>& v, bufferlist::const_iterator& p)
{
  for (auto& e : v)
    decode(e, p);
}
}

/*
 * guards
 *
 * NOTE: these macros intentionally introduce struct_v, struct_compat,
 * struct_len, struct_end and filler into the caller's scope; the
 * matching *_FINISH macro consumes them.
 */

/**
 * start encoding block
 *
 * @param v current (code) version of the encoding
 * @param compat oldest code version that can decode it
 * @param bl bufferlist to encode to
 *
 */
#define ENCODE_START(v, compat, bl) \
  __u8 struct_v = v; \
  __u8 struct_compat = compat; \
  ceph_le32 struct_len; \
  auto filler = (bl).append_hole(sizeof(struct_v) + \
    sizeof(struct_compat) + sizeof(struct_len)); \
  const auto starting_bl_len = (bl).length(); \
  using ::ceph::encode; \
  do {

/**
 * finish encoding block
 *
 * Back-fills the version/compat/length header into the hole reserved by
 * ENCODE_START once the payload length is known.
 *
 * @param bl bufferlist we were encoding to
 * @param new_struct_compat struct-compat value to use
 */
#define ENCODE_FINISH_NEW_COMPAT(bl, new_struct_compat) \
  } while (false); \
  if (new_struct_compat) { \
    struct_compat = new_struct_compat; \
  } \
  struct_len = (bl).length() - starting_bl_len; \
  filler.copy_in(sizeof(struct_v), (char *)&struct_v); \
  filler.copy_in(sizeof(struct_compat), \
    (char *)&struct_compat); \
  filler.copy_in(sizeof(struct_len), (char *)&struct_len);

#define ENCODE_FINISH(bl) ENCODE_FINISH_NEW_COMPAT(bl, 0)

/* error-message builders; note that the second argument of
 * DECODE_ERR_OLDVERSION is stringized (#v), not evaluated */
#define DECODE_ERR_OLDVERSION(func, v, compatv) \
  (std::string(func) + " no longer understand old encoding version " #v " < " + std::to_string(compatv))

#define DECODE_ERR_PAST(func) \
  (std::string(func) + " decode past end of struct encoding")

/**
 * check for very old encoding
 *
 * If the encoded data is older than oldestv, raise an exception.
 *
 * @param oldestv oldest version of the code we can successfully decode.
 */
/* NOTE(review): the bare token `v` below is stringized by
 * DECODE_ERR_OLDVERSION, so the message literally contains "v" rather
 * than the decoded version number -- presumably intentional shorthand;
 * confirm before changing. */
#define DECODE_OLDEST(oldestv) \
  if (struct_v < oldestv) \
    throw ::ceph::buffer::malformed_input(DECODE_ERR_OLDVERSION(__PRETTY_FUNCTION__, v, oldestv));

/**
 * start a decoding block
 *
 * @param v current version of the encoding that the code supports/encodes
 * @param bl bufferlist::iterator for the encoded data
 */
#define DECODE_START(v, bl) \
  __u8 struct_v, struct_compat; \
  using ::ceph::decode; \
  decode(struct_v, bl); \
  decode(struct_compat, bl); \
  if (v < struct_compat) \
    throw ::ceph::buffer::malformed_input(DECODE_ERR_OLDVERSION(__PRETTY_FUNCTION__, v, struct_compat)); \
  __u32 struct_len; \
  decode(struct_len, bl); \
  if (struct_len > bl.get_remaining()) \
    throw ::ceph::buffer::malformed_input(DECODE_ERR_PAST(__PRETTY_FUNCTION__)); \
  unsigned struct_end = bl.get_off() + struct_len; \
  do {

/* BEWARE: any change to this macro MUST be also reflected in the duplicative
 * DECODE_START_LEGACY_COMPAT_LEN!
*/ +#define __DECODE_START_LEGACY_COMPAT_LEN(v, compatv, lenv, skip_v, bl) \ + using ::ceph::decode; \ + __u8 struct_v; \ + decode(struct_v, bl); \ + if (struct_v >= compatv) { \ + __u8 struct_compat; \ + decode(struct_compat, bl); \ + if (v < struct_compat) \ + throw ::ceph::buffer::malformed_input(DECODE_ERR_OLDVERSION(__PRETTY_FUNCTION__, v, struct_compat)); \ + } else if (skip_v) { \ + if (bl.get_remaining() < skip_v) \ + throw ::ceph::buffer::malformed_input(DECODE_ERR_PAST(__PRETTY_FUNCTION__)); \ + bl += skip_v; \ + } \ + unsigned struct_end = 0; \ + if (struct_v >= lenv) { \ + __u32 struct_len; \ + decode(struct_len, bl); \ + if (struct_len > bl.get_remaining()) \ + throw ::ceph::buffer::malformed_input(DECODE_ERR_PAST(__PRETTY_FUNCTION__)); \ + struct_end = bl.get_off() + struct_len; \ + } \ + do { + +/** + * start a decoding block with legacy support for older encoding schemes + * + * The old encoding schemes has a __u8 struct_v only, or lacked either + * the compat version or length. Skip those fields conditionally. + * + * Most of the time, v, compatv, and lenv will all match the version + * where the structure was switched over to the new macros. + * + * @param v current version of the encoding that the code supports/encodes + * @param compatv oldest version that includes a __u8 compat version field + * @param lenv oldest version that includes a __u32 length wrapper + * @param bl bufferlist::iterator containing the encoded data + */ + +/* BEWARE: this is duplication of __DECODE_START_LEGACY_COMPAT_LEN which + * MUST be changed altogether. For the rationale behind code duplication, + * please `git blame` and refer to the commit message. 
*/ +#define DECODE_START_LEGACY_COMPAT_LEN(v, compatv, lenv, bl) \ + using ::ceph::decode; \ + __u8 struct_v; \ + decode(struct_v, bl); \ + if (struct_v >= compatv) { \ + __u8 struct_compat; \ + decode(struct_compat, bl); \ + if (v < struct_compat) \ + throw ::ceph::buffer::malformed_input(DECODE_ERR_OLDVERSION( \ + __PRETTY_FUNCTION__, v, struct_compat)); \ + } \ + unsigned struct_end = 0; \ + if (struct_v >= lenv) { \ + __u32 struct_len; \ + decode(struct_len, bl); \ + if (struct_len > bl.get_remaining()) \ + throw ::ceph::buffer::malformed_input(DECODE_ERR_PAST(__PRETTY_FUNCTION__)); \ + struct_end = bl.get_off() + struct_len; \ + } \ + do { + +/** + * start a decoding block with legacy support for older encoding schemes + * + * This version of the macro assumes the legacy encoding had a 32 bit + * version + * + * The old encoding schemes has a __u8 struct_v only, or lacked either + * the compat version or length. Skip those fields conditionally. + * + * Most of the time, v, compatv, and lenv will all match the version + * where the structure was switched over to the new macros. 
 *
 * @param v current version of the encoding that the code supports/encodes
 * @param compatv oldest version that includes a __u8 compat version field
 * @param lenv oldest version that includes a __u32 length wrapper
 * @param bl bufferlist::iterator containing the encoded data
 */
#define DECODE_START_LEGACY_COMPAT_LEN_32(v, compatv, lenv, bl) \
  __DECODE_START_LEGACY_COMPAT_LEN(v, compatv, lenv, 3u, bl)

#define DECODE_START_LEGACY_COMPAT_LEN_16(v, compatv, lenv, bl) \
  __DECODE_START_LEGACY_COMPAT_LEN(v, compatv, lenv, 1u, bl)

/**
 * finish decode block
 *
 * Verifies the iterator did not run past the length recorded by the
 * matching DECODE_START, and skips any trailing bytes added by a newer
 * encoder.
 *
 * @param bl bufferlist::iterator we were decoding from
 */
#define DECODE_FINISH(bl) \
  } while (false); \
  if (struct_end) { \
    if (bl.get_off() > struct_end) \
      throw ::ceph::buffer::malformed_input(DECODE_ERR_PAST(__PRETTY_FUNCTION__)); \
    if (bl.get_off() < struct_end) \
      bl += struct_end - bl.get_off(); \
  }

namespace ceph {

/*
 * Encoders/decoders to read from current offset in a file handle and
 * encode/decode the data according to argument types.
 */
// Reads a __u32 length prefix then that many payload bytes from fd,
// decoding the payload into str. Returns the total bytes buffered.
// NOTE(review): the read_fd() return values are not checked here, so a
// short read surfaces only as a decode failure -- confirm intent.
inline ssize_t decode_file(int fd, std::string &str)
{
  bufferlist bl;
  __u32 len = 0;
  bl.read_fd(fd, sizeof(len));
  decode(len, bl);
  bl.read_fd(fd, len);
  decode(str, bl);
  return bl.length();
}

// Same as above, but decodes the payload into a bufferptr.
inline ssize_t decode_file(int fd, bufferptr &bp)
{
  bufferlist bl;
  __u32 len = 0;
  bl.read_fd(fd, sizeof(len));
  decode(len, bl);
  bl.read_fd(fd, len);
  auto bli = std::cbegin(bl);

  decode(bp, bli);
  return bl.length();
}
}

#endif
diff --git a/src/include/err.h b/src/include/err.h
new file mode 100644
index 000000000..c188e9753
--- /dev/null
+++ b/src/include/err.h
@@ -0,0 +1,31 @@
#ifndef CEPH_ERR_H
#define CEPH_ERR_H

/*
 * adapted from linux 2.6.24 include/linux/err.h
 */
#define MAX_ERRNO 4095
/* true when x falls in the top MAX_ERRNO values of the address space,
 * i.e. it is a negative errno smuggled through a pointer */
#define IS_ERR_VALUE(x) ((x) >= (uintptr_t)-MAX_ERRNO)

#include <errno.h>
#include <stdint.h>
#include <stdbool.h>

/* this generates a warning in c++; caller can do the cast manually
static inline void *ERR_PTR(long error)
{
	return (void *) error;
}
*/

/* recover the errno value that was encoded into a pointer */
static inline intptr_t PTR_ERR(const void *ptr)
{
	return (intptr_t) ptr;
}

/* test whether a pointer actually carries an encoded errno */
static inline bool IS_ERR(const void *ptr)
{
	return IS_ERR_VALUE((uintptr_t)ptr);
}

#endif
diff --git a/src/include/error.h b/src/include/error.h
new file mode 100644
index 000000000..a548d9756
--- /dev/null
+++ b/src/include/error.h
@@ -0,0 +1,41 @@
// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
// vim: ts=8 sw=2 smarttab
/*
 * Ceph - scalable distributed file system
 *
 * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
 *
 * This is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License version 2.1, as published by the Free Software
 * Foundation. See file COPYING.
+ * + */ + + +#include <stdarg.h> + +#ifdef __cplusplus +extern "C" { +#endif + +#define SYSERROR() syserror("At %s:%d", __FILE__, __LINE__) + +#define ASSERT(c) \ + ((c) || (exiterror("Assertion failed at %s:%d", __FILE__, __LINE__), 1)) + +/* print usage error message and exit */ +extern void userror(const char *use, const char *fmt, ...); + +/* print system error message and exit */ +extern void syserror(const char *fmt, ...); + +/* print error message and exit */ +extern void exiterror(const char *fmt, ...); + +/* print error message */ +extern void error(const char *fmt, ...); + +#ifdef __cplusplus +} // extern "C" +#endif diff --git a/src/include/event_type.h b/src/include/event_type.h new file mode 100644 index 000000000..aa6ddedb4 --- /dev/null +++ b/src/include/event_type.h @@ -0,0 +1,24 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2015 XSky <haomai@xsky.com> + * + * Author: Haomai Wang <haomaiwang@gmail.com> + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. + * + */ + +#ifndef CEPH_COMMON_EVENT_TYPE_H +#define CEPH_COMMON_EVENT_TYPE_H + +#define EVENT_SOCKET_TYPE_NONE 0 +#define EVENT_SOCKET_TYPE_PIPE 1 +#define EVENT_SOCKET_TYPE_EVENTFD 2 + +#endif diff --git a/src/include/expected.hpp b/src/include/expected.hpp new file mode 100644 index 000000000..740c6ad24 --- /dev/null +++ b/src/include/expected.hpp @@ -0,0 +1,2282 @@ +/// +// expected - An implementation of std::expected with extensions +// Written in 2017 by Simon Brand (@TartanLlama) +// +// To the extent possible under law, the author(s) have dedicated all +// copyright and related and neighboring rights to this software to the +// public domain worldwide. This software is distributed without any warranty. 
+// +// You should have received a copy of the CC0 Public Domain Dedication +// along with this software. If not, see +// <http://creativecommons.org/publicdomain/zero/1.0/>. +/// + +#ifndef TL_EXPECTED_HPP +#define TL_EXPECTED_HPP + +#define TL_EXPECTED_VERSION_MAJOR 0 +#define TL_EXPECTED_VERSION_MINOR 2 + +#include <exception> +#include <functional> +#include <type_traits> +#include <utility> + +#if defined(__EXCEPTIONS) || defined(_CPPUNWIND) +#define TL_EXPECTED_EXCEPTIONS_ENABLED +#endif + +#if (defined(_MSC_VER) && _MSC_VER == 1900) +/// \exclude +#define TL_EXPECTED_MSVC2015 +#define TL_EXPECTED_MSVC2015_CONSTEXPR +#else +#define TL_EXPECTED_MSVC2015_CONSTEXPR constexpr +#endif + +#if (defined(__GNUC__) && __GNUC__ == 4 && __GNUC_MINOR__ <= 9 && \ + !defined(__clang__)) +/// \exclude +#define TL_EXPECTED_GCC49 +#endif + +#if (defined(__GNUC__) && __GNUC__ == 5 && __GNUC_MINOR__ <= 4 && \ + !defined(__clang__)) +/// \exclude +#define TL_EXPECTED_GCC54 +#endif + +#if (defined(__GNUC__) && __GNUC__ == 5 && __GNUC_MINOR__ <= 5 && \ + !defined(__clang__)) +/// \exclude +#define TL_EXPECTED_GCC55 +#endif + +#if (defined(__GNUC__) && __GNUC__ == 4 && __GNUC_MINOR__ <= 9 && \ + !defined(__clang__)) +// GCC < 5 doesn't support overloading on const&& for member functions +/// \exclude +#define TL_EXPECTED_NO_CONSTRR + +// GCC < 5 doesn't support some standard C++11 type traits +/// \exclude +#define TL_EXPECTED_IS_TRIVIALLY_COPY_CONSTRUCTIBLE(T) \ + std::has_trivial_copy_constructor<T> +/// \exclude +#define TL_EXPECTED_IS_TRIVIALLY_COPY_ASSIGNABLE(T) \ + std::has_trivial_copy_assign<T> + +// This one will be different for GCC 5.7 if it's ever supported +/// \exclude +#define TL_EXPECTED_IS_TRIVIALLY_DESTRUCTIBLE(T) \ + std::is_trivially_destructible<T> + +// GCC 5 < v < 8 has a bug in is_trivially_copy_constructible which breaks std::vector +// for non-copyable types +#elif (defined(__GNUC__) && __GNUC__ < 8 && \ + !defined(__clang__)) +#ifndef 
TL_GCC_LESS_8_TRIVIALLY_COPY_CONSTRUCTIBLE_MUTEX +#define TL_GCC_LESS_8_TRIVIALLY_COPY_CONSTRUCTIBLE_MUTEX +namespace tl { + namespace detail { + template<class T> + struct is_trivially_copy_constructible : std::is_trivially_copy_constructible<T>{}; +#ifdef _GLIBCXX_VECTOR + template<class T, class A> + struct is_trivially_copy_constructible<std::vector<T,A>> + : std::is_trivially_copy_constructible<T>{}; +#endif + } +} +#endif + +#define TL_EXPECTED_IS_TRIVIALLY_COPY_CONSTRUCTIBLE(T) \ + tl::detail::is_trivially_copy_constructible<T> +#define TL_EXPECTED_IS_TRIVIALLY_COPY_ASSIGNABLE(T) \ + std::is_trivially_copy_assignable<T> +#define TL_EXPECTED_IS_TRIVIALLY_DESTRUCTIBLE(T) std::is_trivially_destructible<T> +#else +/// \exclude +#define TL_EXPECTED_IS_TRIVIALLY_COPY_CONSTRUCTIBLE(T) \ + std::is_trivially_copy_constructible<T> +/// \exclude +#define TL_EXPECTED_IS_TRIVIALLY_COPY_ASSIGNABLE(T) \ + std::is_trivially_copy_assignable<T> +/// \exclude +#define TL_EXPECTED_IS_TRIVIALLY_DESTRUCTIBLE(T) \ + std::is_trivially_destructible<T> +#endif + +#if __cplusplus > 201103L +/// \exclude +#define TL_EXPECTED_CXX14 +#endif + +#ifdef TL_EXPECTED_GCC49 +#define TL_EXPECTED_GCC49_CONSTEXPR +#else +#define TL_EXPECTED_GCC49_CONSTEXPR constexpr +#endif + +#if (__cplusplus == 201103L || defined(TL_EXPECTED_MSVC2015) || \ + defined(TL_EXPECTED_GCC49)) +/// \exclude +#define TL_EXPECTED_11_CONSTEXPR +#else +/// \exclude +#define TL_EXPECTED_11_CONSTEXPR constexpr +#endif + +namespace tl { +template <class T, class E> class expected; + +#ifndef TL_MONOSTATE_INPLACE_MUTEX +#define TL_MONOSTATE_INPLACE_MUTEX +/// \brief Used to represent an expected with no data +class monostate {}; + +/// \brief A tag type to tell expected to construct its value in-place +struct in_place_t { + explicit in_place_t() = default; +}; +/// \brief A tag to tell expected to construct its value in-place +static constexpr in_place_t in_place{}; +#endif + +/// Used as a wrapper to store the unexpected 
value
+template <class E> class unexpected {
+public:
+  static_assert(!std::is_same<E, void>::value, "E must not be void");
+
+  // An unexpected must always carry an error, so there is no default
+  // construction; it is built from (a copy or move of) an E.
+  unexpected() = delete;
+  constexpr explicit unexpected(const E &e) : m_val(e) {}
+
+  constexpr explicit unexpected(E &&e) : m_val(std::move(e)) {}
+
+  /// \returns the contained value
+  /// \group unexpected_value
+  constexpr const E &value() const & { return m_val; }
+  /// \group unexpected_value
+  TL_EXPECTED_11_CONSTEXPR E &value() & { return m_val; }
+  // rvalue accessors move the error out of a temporary unexpected
+  /// \group unexpected_value
+  TL_EXPECTED_11_CONSTEXPR E &&value() && { return std::move(m_val); }
+  /// \exclude
+  constexpr const E &&value() const && { return std::move(m_val); }
+
+private:
+  E m_val;
+};
+
+/// \brief Compares two unexpected objects
+/// \details Simply compares lhs.value() to rhs.value()
+/// \group unexpected_relop
+template <class E>
+constexpr bool operator==(const unexpected<E> &lhs, const unexpected<E> &rhs) {
+  return lhs.value() == rhs.value();
+}
+/// \group unexpected_relop
+template <class E>
+constexpr bool operator!=(const unexpected<E> &lhs, const unexpected<E> &rhs) {
+  return lhs.value() != rhs.value();
+}
+/// \group unexpected_relop
+template <class E>
+constexpr bool operator<(const unexpected<E> &lhs, const unexpected<E> &rhs) {
+  return lhs.value() < rhs.value();
+}
+/// \group unexpected_relop
+template <class E>
+constexpr bool operator<=(const unexpected<E> &lhs, const unexpected<E> &rhs) {
+  return lhs.value() <= rhs.value();
+}
+/// \group unexpected_relop
+template <class E>
+constexpr bool operator>(const unexpected<E> &lhs, const unexpected<E> &rhs) {
+  return lhs.value() > rhs.value();
+}
+/// \group unexpected_relop
+template <class E>
+constexpr bool operator>=(const unexpected<E> &lhs, const unexpected<E> &rhs) {
+  return lhs.value() >= rhs.value();
+}
+
+/// Create an `unexpected` from `e`, deducing the return type
+///
+/// *Example:*
+/// auto e1 = tl::make_unexpected(42);
+/// unexpected<int> e2 (42); //same semantics
+template
<class E> +unexpected<typename std::decay<E>::type> make_unexpected(E &&e) { + return unexpected<typename std::decay<E>::type>(std::forward<E>(e)); +} + +/// \brief A tag type to tell expected to construct the unexpected value +struct unexpect_t { + unexpect_t() = default; +}; +/// \brief A tag to tell expected to construct the unexpected value +static constexpr unexpect_t unexpect{}; + +/// \exclude +namespace detail { +template<typename E> +[[noreturn]] TL_EXPECTED_11_CONSTEXPR void throw_exception(E &&e) { +#ifdef TL_EXPECTED_EXCEPTIONS_ENABLED + throw std::forward<E>(e); +#else + #ifdef _MSC_VER + __assume(0); + #else + __builtin_unreachable(); + #endif +#endif +} + +#ifndef TL_TRAITS_MUTEX +#define TL_TRAITS_MUTEX +// C++14-style aliases for brevity +template <class T> using remove_const_t = typename std::remove_const<T>::type; +template <class T> +using remove_reference_t = typename std::remove_reference<T>::type; +template <class T> using decay_t = typename std::decay<T>::type; +template <bool E, class T = void> +using enable_if_t = typename std::enable_if<E, T>::type; +template <bool B, class T, class F> +using conditional_t = typename std::conditional<B, T, F>::type; + +// std::conjunction from C++17 +template <class...> struct conjunction : std::true_type {}; +template <class B> struct conjunction<B> : B {}; +template <class B, class... Bs> +struct conjunction<B, Bs...> + : std::conditional<bool(B::value), conjunction<Bs...>, B>::type {}; + +// std::invoke from C++17 +// https://stackoverflow.com/questions/38288042/c11-14-invoke-workaround +template <typename Fn, typename... Args, + typename = enable_if_t<std::is_member_pointer<decay_t<Fn>>{}>, + int = 0> +constexpr auto invoke(Fn &&f, Args &&... args) noexcept( + noexcept(std::mem_fn(f)(std::forward<Args>(args)...))) + -> decltype(std::mem_fn(f)(std::forward<Args>(args)...)) { + return std::mem_fn(f)(std::forward<Args>(args)...); +} + +template <typename Fn, typename... 
Args, + typename = enable_if_t<!std::is_member_pointer<decay_t<Fn>>{}>> +constexpr auto invoke(Fn &&f, Args &&... args) noexcept( + noexcept(std::forward<Fn>(f)(std::forward<Args>(args)...))) + -> decltype(std::forward<Fn>(f)(std::forward<Args>(args)...)) { + return std::forward<Fn>(f)(std::forward<Args>(args)...); +} + +// std::invoke_result from C++17 +template <class F, class, class... Us> struct invoke_result_impl; + +template <class F, class... Us> +struct invoke_result_impl< + F, decltype(detail::invoke(std::declval<F>(), std::declval<Us>()...), void()), + Us...> { + using type = decltype(detail::invoke(std::declval<F>(), std::declval<Us>()...)); +}; + +template <class F, class... Us> +using invoke_result = invoke_result_impl<F, void, Us...>; + +template <class F, class... Us> +using invoke_result_t = typename invoke_result<F, Us...>::type; +#endif + +// Trait for checking if a type is a tl::expected +template <class T> struct is_expected_impl : std::false_type {}; +template <class T, class E> +struct is_expected_impl<expected<T, E>> : std::true_type {}; +template <class T> using is_expected = is_expected_impl<decay_t<T>>; + +template <class T, class E, class U> +using expected_enable_forward_value = detail::enable_if_t< + std::is_constructible<T, U &&>::value && + !std::is_same<detail::decay_t<U>, in_place_t>::value && + !std::is_same<expected<T, E>, detail::decay_t<U>>::value && + !std::is_same<unexpected<E>, detail::decay_t<U>>::value>; + +template <class T, class E, class U, class G, class UR, class GR> +using expected_enable_from_other = detail::enable_if_t< + std::is_constructible<T, UR>::value && + std::is_constructible<E, GR>::value && + !std::is_constructible<T, expected<U, G> &>::value && + !std::is_constructible<T, expected<U, G> &&>::value && + !std::is_constructible<T, const expected<U, G> &>::value && + !std::is_constructible<T, const expected<U, G> &&>::value && + !std::is_convertible<expected<U, G> &, T>::value && + 
!std::is_convertible<expected<U, G> &&, T>::value && + !std::is_convertible<const expected<U, G> &, T>::value && + !std::is_convertible<const expected<U, G> &&, T>::value>; + +template <class T, class U> +using is_void_or = conditional_t<std::is_void<T>::value, std::true_type, U>; + +template <class T> +using is_copy_constructible_or_void = + is_void_or<T, std::is_copy_constructible<T>>; + +template <class T> +using is_move_constructible_or_void = + is_void_or<T, std::is_move_constructible<T>>; + +template <class T> +using is_copy_assignable_or_void = + is_void_or<T, std::is_copy_assignable<T>>; + + +template <class T> +using is_move_assignable_or_void = + is_void_or<T, std::is_move_assignable<T>>; + + +} // namespace detail + +/// \exclude +namespace detail { +struct no_init_t {}; +static constexpr no_init_t no_init{}; + +// Implements the storage of the values, and ensures that the destructor is +// trivial if it can be. +// +// This specialization is for where neither `T` or `E` is trivially +// destructible, so the destructors must be called on destruction of the +// `expected` +template <class T, class E, bool = std::is_trivially_destructible<T>::value, + bool = std::is_trivially_destructible<E>::value> +struct expected_storage_base { + constexpr expected_storage_base() : m_val(T{}), m_has_val(true) {} + constexpr expected_storage_base(no_init_t) : m_no_init(), m_has_val(false) {} + + template <class... Args, + detail::enable_if_t<std::is_constructible<T, Args &&...>::value> * = + nullptr> + constexpr expected_storage_base(in_place_t, Args &&... args) + : m_val(std::forward<Args>(args)...), m_has_val(true) {} + + template <class U, class... Args, + detail::enable_if_t<std::is_constructible< + T, std::initializer_list<U> &, Args &&...>::value> * = nullptr> + constexpr expected_storage_base(in_place_t, std::initializer_list<U> il, + Args &&... args) + : m_val(il, std::forward<Args>(args)...), m_has_val(true) {} + template <class... 
Args, + detail::enable_if_t<std::is_constructible<E, Args &&...>::value> * = + nullptr> + constexpr explicit expected_storage_base(unexpect_t, Args &&... args) + : m_unexpect(std::forward<Args>(args)...), m_has_val(false) {} + + template <class U, class... Args, + detail::enable_if_t<std::is_constructible< + E, std::initializer_list<U> &, Args &&...>::value> * = nullptr> + constexpr explicit expected_storage_base(unexpect_t, + std::initializer_list<U> il, + Args &&... args) + : m_unexpect(il, std::forward<Args>(args)...), m_has_val(false) {} + + ~expected_storage_base() { + if (m_has_val) { + m_val.~T(); + } else { + m_unexpect.~unexpected<E>(); + } + } + union { + char m_no_init; + T m_val; + unexpected<E> m_unexpect; + }; + bool m_has_val; +}; + +// This specialization is for when both `T` and `E` are trivially-destructible, +// so the destructor of the `expected` can be trivial. +template <class T, class E> struct expected_storage_base<T, E, true, true> { + constexpr expected_storage_base() : m_val(T{}), m_has_val(true) {} + constexpr expected_storage_base(no_init_t) : m_no_init(), m_has_val(false) {} + + template <class... Args, + detail::enable_if_t<std::is_constructible<T, Args &&...>::value> * = + nullptr> + constexpr expected_storage_base(in_place_t, Args &&... args) + : m_val(std::forward<Args>(args)...), m_has_val(true) {} + + template <class U, class... Args, + detail::enable_if_t<std::is_constructible< + T, std::initializer_list<U> &, Args &&...>::value> * = nullptr> + constexpr expected_storage_base(in_place_t, std::initializer_list<U> il, + Args &&... args) + : m_val(il, std::forward<Args>(args)...), m_has_val(true) {} + template <class... Args, + detail::enable_if_t<std::is_constructible<E, Args &&...>::value> * = + nullptr> + constexpr explicit expected_storage_base(unexpect_t, Args &&... args) + : m_unexpect(std::forward<Args>(args)...), m_has_val(false) {} + + template <class U, class... 
Args, + detail::enable_if_t<std::is_constructible< + E, std::initializer_list<U> &, Args &&...>::value> * = nullptr> + constexpr explicit expected_storage_base(unexpect_t, + std::initializer_list<U> il, + Args &&... args) + : m_unexpect(il, std::forward<Args>(args)...), m_has_val(false) {} + + ~expected_storage_base() = default; + union { + char m_no_init; + T m_val; + unexpected<E> m_unexpect; + }; + bool m_has_val; +}; + +// T is trivial, E is not. +template <class T, class E> struct expected_storage_base<T, E, true, false> { + constexpr expected_storage_base() : m_val(T{}), m_has_val(true) {} + TL_EXPECTED_MSVC2015_CONSTEXPR expected_storage_base(no_init_t) + : m_no_init(), m_has_val(false) {} + + template <class... Args, + detail::enable_if_t<std::is_constructible<T, Args &&...>::value> * = + nullptr> + constexpr expected_storage_base(in_place_t, Args &&... args) + : m_val(std::forward<Args>(args)...), m_has_val(true) {} + + template <class U, class... Args, + detail::enable_if_t<std::is_constructible< + T, std::initializer_list<U> &, Args &&...>::value> * = nullptr> + constexpr expected_storage_base(in_place_t, std::initializer_list<U> il, + Args &&... args) + : m_val(il, std::forward<Args>(args)...), m_has_val(true) {} + template <class... Args, + detail::enable_if_t<std::is_constructible<E, Args &&...>::value> * = + nullptr> + constexpr explicit expected_storage_base(unexpect_t, Args &&... args) + : m_unexpect(std::forward<Args>(args)...), m_has_val(false) {} + + template <class U, class... Args, + detail::enable_if_t<std::is_constructible< + E, std::initializer_list<U> &, Args &&...>::value> * = nullptr> + constexpr explicit expected_storage_base(unexpect_t, + std::initializer_list<U> il, + Args &&... 
args) + : m_unexpect(il, std::forward<Args>(args)...), m_has_val(false) {} + + ~expected_storage_base() { + if (!m_has_val) { + m_unexpect.~unexpected<E>(); + } + } + + union { + char m_no_init; + T m_val; + unexpected<E> m_unexpect; + }; + bool m_has_val; +}; + +// E is trivial, T is not. +template <class T, class E> struct expected_storage_base<T, E, false, true> { + constexpr expected_storage_base() : m_val(T{}), m_has_val(true) {} + constexpr expected_storage_base(no_init_t) : m_no_init(), m_has_val(false) {} + + template <class... Args, + detail::enable_if_t<std::is_constructible<T, Args &&...>::value> * = + nullptr> + constexpr expected_storage_base(in_place_t, Args &&... args) + : m_val(std::forward<Args>(args)...), m_has_val(true) {} + + template <class U, class... Args, + detail::enable_if_t<std::is_constructible< + T, std::initializer_list<U> &, Args &&...>::value> * = nullptr> + constexpr expected_storage_base(in_place_t, std::initializer_list<U> il, + Args &&... args) + : m_val(il, std::forward<Args>(args)...), m_has_val(true) {} + template <class... Args, + detail::enable_if_t<std::is_constructible<E, Args &&...>::value> * = + nullptr> + constexpr explicit expected_storage_base(unexpect_t, Args &&... args) + : m_unexpect(std::forward<Args>(args)...), m_has_val(false) {} + + template <class U, class... Args, + detail::enable_if_t<std::is_constructible< + E, std::initializer_list<U> &, Args &&...>::value> * = nullptr> + constexpr explicit expected_storage_base(unexpect_t, + std::initializer_list<U> il, + Args &&... 
args) + : m_unexpect(il, std::forward<Args>(args)...), m_has_val(false) {} + + ~expected_storage_base() { + if (m_has_val) { + m_val.~T(); + } + } + union { + char m_no_init; + T m_val; + unexpected<E> m_unexpect; + }; + bool m_has_val; +}; + +// `T` is `void`, `E` is trivially-destructible +template <class E> struct expected_storage_base<void, E, false, true> { + TL_EXPECTED_MSVC2015_CONSTEXPR expected_storage_base() : m_has_val(true) {} + constexpr expected_storage_base(no_init_t) : m_val(), m_has_val(false) {} + + constexpr expected_storage_base(in_place_t) : m_has_val(true) {} + + template <class... Args, + detail::enable_if_t<std::is_constructible<E, Args &&...>::value> * = + nullptr> + constexpr explicit expected_storage_base(unexpect_t, Args &&... args) + : m_unexpect(std::forward<Args>(args)...), m_has_val(false) {} + + template <class U, class... Args, + detail::enable_if_t<std::is_constructible< + E, std::initializer_list<U> &, Args &&...>::value> * = nullptr> + constexpr explicit expected_storage_base(unexpect_t, + std::initializer_list<U> il, + Args &&... args) + : m_unexpect(il, std::forward<Args>(args)...), m_has_val(false) {} + + ~expected_storage_base() = default; + struct dummy {}; + union { + dummy m_val; + unexpected<E> m_unexpect; + }; + bool m_has_val; +}; + +// `T` is `void`, `E` is not trivially-destructible +template <class E> struct expected_storage_base<void, E, false, false> { + constexpr expected_storage_base() : m_dummy(), m_has_val(true) {} + constexpr expected_storage_base(no_init_t) : m_dummy(), m_has_val(false) {} + + constexpr expected_storage_base(in_place_t) : m_dummy(), m_has_val(true) {} + + template <class... Args, + detail::enable_if_t<std::is_constructible<E, Args &&...>::value> * = + nullptr> + constexpr explicit expected_storage_base(unexpect_t, Args &&... args) + : m_unexpect(std::forward<Args>(args)...), m_has_val(false) {} + + template <class U, class... 
Args, + detail::enable_if_t<std::is_constructible< + E, std::initializer_list<U> &, Args &&...>::value> * = nullptr> + constexpr explicit expected_storage_base(unexpect_t, + std::initializer_list<U> il, + Args &&... args) + : m_unexpect(il, std::forward<Args>(args)...), m_has_val(false) {} + + ~expected_storage_base() { + if (!m_has_val) { + m_unexpect.~unexpected<E>(); + } + } + + union { + char m_dummy; + unexpected<E> m_unexpect; + }; + bool m_has_val; +}; + +// This base class provides some handy member functions which can be used in +// further derived classes +template <class T, class E> +struct expected_operations_base : expected_storage_base<T, E> { + using expected_storage_base<T, E>::expected_storage_base; + + template <class... Args> void construct(Args &&... args) noexcept { + new (std::addressof(this->m_val)) T(std::forward<Args>(args)...); + this->m_has_val = true; + } + + template <class Rhs> void construct_with(Rhs &&rhs) noexcept { + new (std::addressof(this->m_val)) T(std::forward<Rhs>(rhs).get()); + this->m_has_val = true; + } + + template <class... Args> void construct_error(Args &&... args) noexcept { + new (std::addressof(this->m_unexpect)) + unexpected<E>(std::forward<Args>(args)...); + this->m_has_val = false; + } + + #ifdef TL_EXPECTED_EXCEPTIONS_ENABLED + + // These assign overloads ensure that the most efficient assignment + // implementation is used while maintaining the strong exception guarantee. + // The problematic case is where rhs has a value, but *this does not. + // + // This overload handles the case where we can just copy-construct `T` + // directly into place without throwing. 
+ template <class U = T, + detail::enable_if_t<std::is_nothrow_copy_constructible<U>::value> + * = nullptr> + void assign(const expected_operations_base &rhs) noexcept { + if (!this->m_has_val && rhs.m_has_val) { + geterr().~unexpected<E>(); + construct(rhs.get()); + } else { + assign_common(rhs); + } + } + + // This overload handles the case where we can attempt to create a copy of + // `T`, then no-throw move it into place if the copy was successful. + template <class U = T, + detail::enable_if_t<!std::is_nothrow_copy_constructible<U>::value && + std::is_nothrow_move_constructible<U>::value> + * = nullptr> + void assign(const expected_operations_base &rhs) noexcept { + if (!this->m_has_val && rhs.m_has_val) { + T tmp = rhs.get(); + geterr().~unexpected<E>(); + construct(std::move(tmp)); + } else { + assign_common(rhs); + } + } + + // This overload is the worst-case, where we have to move-construct the + // unexpected value into temporary storage, then try to copy the T into place. + // If the construction succeeds, then everything is fine, but if it throws, + // then we move the old unexpected value back into place before rethrowing the + // exception. + template <class U = T, + detail::enable_if_t<!std::is_nothrow_copy_constructible<U>::value && + !std::is_nothrow_move_constructible<U>::value> + * = nullptr> + void assign(const expected_operations_base &rhs) { + if (!this->m_has_val && rhs.m_has_val) { + auto tmp = std::move(geterr()); + geterr().~unexpected<E>(); + + try { + construct(rhs.get()); + } catch (...) 
{ + geterr() = std::move(tmp); + throw; + } + } else { + assign_common(rhs); + } + } + + // These overloads do the same as above, but for rvalues + template <class U = T, + detail::enable_if_t<std::is_nothrow_move_constructible<U>::value> + * = nullptr> + void assign(expected_operations_base &&rhs) noexcept { + if (!this->m_has_val && rhs.m_has_val) { + geterr().~unexpected<E>(); + construct(std::move(rhs).get()); + } else { + assign_common(std::move(rhs)); + } + } + + template <class U = T, + detail::enable_if_t<!std::is_nothrow_move_constructible<U>::value> + * = nullptr> + void assign(expected_operations_base &&rhs) { + if (!this->m_has_val && rhs.m_has_val) { + auto tmp = std::move(geterr()); + geterr().~unexpected<E>(); + try { + construct(std::move(rhs).get()); + } catch (...) { + geterr() = std::move(tmp); + throw; + } + } else { + assign_common(std::move(rhs)); + } + } + + #else + + // If exceptions are disabled then we can just copy-construct + void assign(const expected_operations_base &rhs) noexcept { + if (!this->m_has_val && rhs.m_has_val) { + geterr().~unexpected<E>(); + construct(rhs.get()); + } else { + assign_common(rhs); + } + } + + void assign(expected_operations_base &&rhs) noexcept { + if (!this->m_has_val && rhs.m_has_val) { + geterr().~unexpected<E>(); + construct(std::move(rhs).get()); + } else { + assign_common(rhs); + } + } + + #endif + + // The common part of move/copy assigning + template <class Rhs> void assign_common(Rhs &&rhs) { + if (this->m_has_val) { + if (rhs.m_has_val) { + get() = std::forward<Rhs>(rhs).get(); + } else { + destroy_val(); + construct_error(std::forward<Rhs>(rhs).geterr()); + } + } else { + if (!rhs.m_has_val) { + geterr() = std::forward<Rhs>(rhs).geterr(); + } + } + } + + bool has_value() const { return this->m_has_val; } + + TL_EXPECTED_11_CONSTEXPR T &get() & { return this->m_val; } + constexpr const T &get() const & { return this->m_val; } + TL_EXPECTED_11_CONSTEXPR T &&get() && { return std::move(this->m_val); 
} +#ifndef TL_EXPECTED_NO_CONSTRR + constexpr const T &&get() const && { return std::move(this->m_val); } +#endif + + TL_EXPECTED_11_CONSTEXPR unexpected<E> &geterr() & { + return this->m_unexpect; + } + constexpr const unexpected<E> &geterr() const & { return this->m_unexpect; } + TL_EXPECTED_11_CONSTEXPR unexpected<E> &&geterr() && { + return std::move(this->m_unexpect); + } +#ifndef TL_EXPECTED_NO_CONSTRR + constexpr const unexpected<E> &&geterr() const && { + return std::move(this->m_unexpect); + } +#endif + + constexpr void destroy_val() { + get().~T(); + } +}; + +// This base class provides some handy member functions which can be used in +// further derived classes +template <class E> +struct expected_operations_base<void, E> : expected_storage_base<void, E> { + using expected_storage_base<void, E>::expected_storage_base; + + template <class... Args> void construct() noexcept { this->m_has_val = true; } + + // This function doesn't use its argument, but needs it so that code in + // levels above this can work independently of whether T is void + template <class Rhs> void construct_with(Rhs &&) noexcept { + this->m_has_val = true; + } + + template <class... Args> void construct_error(Args &&... 
args) noexcept {
+    new (std::addressof(this->m_unexpect))
+        unexpected<E>(std::forward<Args>(args)...);
+    this->m_has_val = false;
+  }
+
+  // void specialization of assign: only the error member and the flag need
+  // managing, so no strong-exception-guarantee gymnastics are required here.
+  template <class Rhs> void assign(Rhs &&rhs) noexcept {
+    if (!this->m_has_val) {
+      if (rhs.m_has_val) {
+        // error -> value: destroy our error, flip to the (empty) value state
+        geterr().~unexpected<E>();
+        construct();
+      } else {
+        // error -> error: plain assignment of the stored unexpected
+        geterr() = std::forward<Rhs>(rhs).geterr();
+      }
+    } else {
+      if (!rhs.m_has_val) {
+        // value -> error: placement-construct the error in the union
+        construct_error(std::forward<Rhs>(rhs).geterr());
+      }
+    }
+  }
+
+  bool has_value() const { return this->m_has_val; }
+
+  TL_EXPECTED_11_CONSTEXPR unexpected<E> &geterr() & {
+    return this->m_unexpect;
+  }
+  constexpr const unexpected<E> &geterr() const & { return this->m_unexpect; }
+  TL_EXPECTED_11_CONSTEXPR unexpected<E> &&geterr() && {
+    return std::move(this->m_unexpect);
+  }
+#ifndef TL_EXPECTED_NO_CONSTRR
+  constexpr const unexpected<E> &&geterr() const && {
+    return std::move(this->m_unexpect);
+  }
+#endif
+
+  constexpr void destroy_val() {
+    // no-op: T is void, so there is no contained value to destroy
+  }
+};
+
+// This class manages conditionally having a trivial copy constructor
+// This specialization is for when T and E are trivially copy constructible
+template <class T, class E,
+          bool = is_void_or<T, TL_EXPECTED_IS_TRIVIALLY_COPY_CONSTRUCTIBLE(T)>::
+              value &&TL_EXPECTED_IS_TRIVIALLY_COPY_CONSTRUCTIBLE(E)::value>
+struct expected_copy_base : expected_operations_base<T, E> {
+  using expected_operations_base<T, E>::expected_operations_base;
+};
+
+// This specialization is for when T or E are not trivially copy constructible
+template <class T, class E>
+struct expected_copy_base<T, E, false> : expected_operations_base<T, E> {
+  using expected_operations_base<T, E>::expected_operations_base;
+
+  expected_copy_base() = default;
+  // Copy by constructing whichever member (value or error) rhs holds into
+  // uninitialized storage (no_init defers union member construction).
+  expected_copy_base(const expected_copy_base &rhs)
+      : expected_operations_base<T, E>(no_init) {
+    if (rhs.has_value()) {
+      this->construct_with(rhs);
+    } else {
+      this->construct_error(rhs.geterr());
+    }
+  }
+
+  expected_copy_base(expected_copy_base &&rhs) = default;
+  expected_copy_base
&operator=(const expected_copy_base &rhs) = default; + expected_copy_base &operator=(expected_copy_base &&rhs) = default; +}; + +// This class manages conditionally having a trivial move constructor +// Unfortunately there's no way to achieve this in GCC < 5 AFAIK, since it +// doesn't implement an analogue to std::is_trivially_move_constructible. We +// have to make do with a non-trivial move constructor even if T is trivially +// move constructible +#ifndef TL_EXPECTED_GCC49 +template <class T, class E, + bool = is_void_or<T, std::is_trivially_move_constructible<T>>::value + &&std::is_trivially_move_constructible<E>::value> +struct expected_move_base : expected_copy_base<T, E> { + using expected_copy_base<T, E>::expected_copy_base; +}; +#else +template <class T, class E, bool = false> struct expected_move_base; +#endif +template <class T, class E> +struct expected_move_base<T, E, false> : expected_copy_base<T, E> { + using expected_copy_base<T, E>::expected_copy_base; + + expected_move_base() = default; + expected_move_base(const expected_move_base &rhs) = default; + + expected_move_base(expected_move_base &&rhs) noexcept( + std::is_nothrow_move_constructible<T>::value) + : expected_copy_base<T, E>(no_init) { + if (rhs.has_value()) { + this->construct_with(std::move(rhs)); + } else { + this->construct_error(std::move(rhs.geterr())); + } + } + expected_move_base &operator=(const expected_move_base &rhs) = default; + expected_move_base &operator=(expected_move_base &&rhs) = default; +}; + +// This class manages conditionally having a trivial copy assignment operator +template <class T, class E, + bool = is_void_or< + T, conjunction<TL_EXPECTED_IS_TRIVIALLY_COPY_ASSIGNABLE(T), + TL_EXPECTED_IS_TRIVIALLY_COPY_CONSTRUCTIBLE(T), + TL_EXPECTED_IS_TRIVIALLY_DESTRUCTIBLE(T)>>::value + &&TL_EXPECTED_IS_TRIVIALLY_COPY_ASSIGNABLE(E)::value + &&TL_EXPECTED_IS_TRIVIALLY_COPY_CONSTRUCTIBLE(E)::value + &&TL_EXPECTED_IS_TRIVIALLY_DESTRUCTIBLE(E)::value> +struct 
expected_copy_assign_base : expected_move_base<T, E> { + using expected_move_base<T, E>::expected_move_base; +}; + +template <class T, class E> +struct expected_copy_assign_base<T, E, false> : expected_move_base<T, E> { + using expected_move_base<T, E>::expected_move_base; + + expected_copy_assign_base() = default; + expected_copy_assign_base(const expected_copy_assign_base &rhs) = default; + + expected_copy_assign_base(expected_copy_assign_base &&rhs) = default; + expected_copy_assign_base &operator=(const expected_copy_assign_base &rhs) { + this->assign(rhs); + return *this; + } + expected_copy_assign_base & + operator=(expected_copy_assign_base &&rhs) = default; +}; + +// This class manages conditionally having a trivial move assignment operator +// Unfortunately there's no way to achieve this in GCC < 5 AFAIK, since it +// doesn't implement an analogue to std::is_trivially_move_assignable. We have +// to make do with a non-trivial move assignment operator even if T is trivially +// move assignable +#ifndef TL_EXPECTED_GCC49 +template <class T, class E, + bool = + is_void_or<T, conjunction<std::is_trivially_destructible<T>, + std::is_trivially_move_constructible<T>, + std::is_trivially_move_assignable<T>>>:: + value &&std::is_trivially_destructible<E>::value + &&std::is_trivially_move_constructible<E>::value + &&std::is_trivially_move_assignable<E>::value> +struct expected_move_assign_base : expected_copy_assign_base<T, E> { + using expected_copy_assign_base<T, E>::expected_copy_assign_base; +}; +#else +template <class T, class E, bool = false> struct expected_move_assign_base; +#endif + +template <class T, class E> +struct expected_move_assign_base<T, E, false> + : expected_copy_assign_base<T, E> { + using expected_copy_assign_base<T, E>::expected_copy_assign_base; + + expected_move_assign_base() = default; + expected_move_assign_base(const expected_move_assign_base &rhs) = default; + + expected_move_assign_base(expected_move_assign_base &&rhs) = default; + + 
expected_move_assign_base & + operator=(const expected_move_assign_base &rhs) = default; + + expected_move_assign_base & + operator=(expected_move_assign_base &&rhs) noexcept( + std::is_nothrow_move_constructible<T>::value + &&std::is_nothrow_move_assignable<T>::value) { + this->assign(std::move(rhs)); + return *this; + } +}; + +// expected_delete_ctor_base will conditionally delete copy and move +// constructors depending on whether T is copy/move constructible +template <class T, class E, + bool EnableCopy = (is_copy_constructible_or_void<T>::value && + std::is_copy_constructible<E>::value), + bool EnableMove = (is_move_constructible_or_void<T>::value && + std::is_move_constructible<E>::value)> +struct expected_delete_ctor_base { + expected_delete_ctor_base() = default; + expected_delete_ctor_base(const expected_delete_ctor_base &) = default; + expected_delete_ctor_base(expected_delete_ctor_base &&) noexcept = default; + expected_delete_ctor_base & + operator=(const expected_delete_ctor_base &) = default; + expected_delete_ctor_base & + operator=(expected_delete_ctor_base &&) noexcept = default; +}; + +template <class T, class E> +struct expected_delete_ctor_base<T, E, true, false> { + expected_delete_ctor_base() = default; + expected_delete_ctor_base(const expected_delete_ctor_base &) = default; + expected_delete_ctor_base(expected_delete_ctor_base &&) noexcept = delete; + expected_delete_ctor_base & + operator=(const expected_delete_ctor_base &) = default; + expected_delete_ctor_base & + operator=(expected_delete_ctor_base &&) noexcept = default; +}; + +template <class T, class E> +struct expected_delete_ctor_base<T, E, false, true> { + expected_delete_ctor_base() = default; + expected_delete_ctor_base(const expected_delete_ctor_base &) = delete; + expected_delete_ctor_base(expected_delete_ctor_base &&) noexcept = default; + expected_delete_ctor_base & + operator=(const expected_delete_ctor_base &) = default; + expected_delete_ctor_base & + 
operator=(expected_delete_ctor_base &&) noexcept = default; +}; + +template <class T, class E> +struct expected_delete_ctor_base<T, E, false, false> { + expected_delete_ctor_base() = default; + expected_delete_ctor_base(const expected_delete_ctor_base &) = delete; + expected_delete_ctor_base(expected_delete_ctor_base &&) noexcept = delete; + expected_delete_ctor_base & + operator=(const expected_delete_ctor_base &) = default; + expected_delete_ctor_base & + operator=(expected_delete_ctor_base &&) noexcept = default; +}; + +// expected_delete_assign_base will conditionally delete copy and move +// constructors depending on whether T and E are copy/move constructible + +// assignable +template <class T, class E, + bool EnableCopy = (is_copy_constructible_or_void<T>::value && + std::is_copy_constructible<E>::value && + is_copy_assignable_or_void<T>::value && + std::is_copy_assignable<E>::value), + bool EnableMove = (is_move_constructible_or_void<T>::value && + std::is_move_constructible<E>::value && + is_move_assignable_or_void<T>::value && + std::is_move_assignable<E>::value)> +struct expected_delete_assign_base { + expected_delete_assign_base() = default; + expected_delete_assign_base(const expected_delete_assign_base &) = default; + expected_delete_assign_base(expected_delete_assign_base &&) noexcept = + default; + expected_delete_assign_base & + operator=(const expected_delete_assign_base &) = default; + expected_delete_assign_base & + operator=(expected_delete_assign_base &&) noexcept = default; +}; + +template <class T, class E> +struct expected_delete_assign_base<T, E, true, false> { + expected_delete_assign_base() = default; + expected_delete_assign_base(const expected_delete_assign_base &) = default; + expected_delete_assign_base(expected_delete_assign_base &&) noexcept = + default; + expected_delete_assign_base & + operator=(const expected_delete_assign_base &) = default; + expected_delete_assign_base & + operator=(expected_delete_assign_base &&) noexcept = 
delete; +}; + +template <class T, class E> +struct expected_delete_assign_base<T, E, false, true> { + expected_delete_assign_base() = default; + expected_delete_assign_base(const expected_delete_assign_base &) = default; + expected_delete_assign_base(expected_delete_assign_base &&) noexcept = + default; + expected_delete_assign_base & + operator=(const expected_delete_assign_base &) = delete; + expected_delete_assign_base & + operator=(expected_delete_assign_base &&) noexcept = default; +}; + +template <class T, class E> +struct expected_delete_assign_base<T, E, false, false> { + expected_delete_assign_base() = default; + expected_delete_assign_base(const expected_delete_assign_base &) = default; + expected_delete_assign_base(expected_delete_assign_base &&) noexcept = + default; + expected_delete_assign_base & + operator=(const expected_delete_assign_base &) = delete; + expected_delete_assign_base & + operator=(expected_delete_assign_base &&) noexcept = delete; +}; + +// This is needed to be able to construct the expected_default_ctor_base which +// follows, while still conditionally deleting the default constructor. +struct default_constructor_tag { + explicit constexpr default_constructor_tag() = default; +}; + +// expected_default_ctor_base will ensure that expected has a deleted default +// consturctor if T is not default constructible. 
+// This specialization is for when T is default constructible +template <class T, class E, + bool Enable = + std::is_default_constructible<T>::value || std::is_void<T>::value> +struct expected_default_ctor_base { + constexpr expected_default_ctor_base() noexcept = default; + constexpr expected_default_ctor_base( + expected_default_ctor_base const &) noexcept = default; + constexpr expected_default_ctor_base(expected_default_ctor_base &&) noexcept = + default; + expected_default_ctor_base & + operator=(expected_default_ctor_base const &) noexcept = default; + expected_default_ctor_base & + operator=(expected_default_ctor_base &&) noexcept = default; + + constexpr explicit expected_default_ctor_base(default_constructor_tag) {} +}; + +// This specialization is for when T is not default constructible +template <class T, class E> struct expected_default_ctor_base<T, E, false> { + constexpr expected_default_ctor_base() noexcept = delete; + constexpr expected_default_ctor_base( + expected_default_ctor_base const &) noexcept = default; + constexpr expected_default_ctor_base(expected_default_ctor_base &&) noexcept = + default; + expected_default_ctor_base & + operator=(expected_default_ctor_base const &) noexcept = default; + expected_default_ctor_base & + operator=(expected_default_ctor_base &&) noexcept = default; + + constexpr explicit expected_default_ctor_base(default_constructor_tag) {} +}; +} // namespace detail + +template <class E> class bad_expected_access : public std::exception { +public: + explicit bad_expected_access(E e) : m_val(std::move(e)) {} + + virtual const char *what() const noexcept override { + return "Bad expected access"; + } + + const E &error() const & { return m_val; } + E &error() & { return m_val; } + const E &&error() const && { return std::move(m_val); } + E &&error() && { return std::move(m_val); } + +private: + E m_val; +}; + +/// An `expected<T, E>` object is an object that contains the storage for +/// another object and manages the 
lifetime of this contained object `T`. +/// Alternatively it could contain the storage for another unexpected object +/// `E`. The contained object may not be initialized after the expected object +/// has been initialized, and may not be destroyed before the expected object +/// has been destroyed. The initialization state of the contained object is +/// tracked by the expected object. +template <class T, class E> +class expected : private detail::expected_move_assign_base<T, E>, + private detail::expected_delete_ctor_base<T, E>, + private detail::expected_delete_assign_base<T, E>, + private detail::expected_default_ctor_base<T, E> { + static_assert(!std::is_reference<T>::value, "T must not be a reference"); + static_assert(!std::is_same<T, std::remove_cv<in_place_t>>::value, + "T must not be in_place_t"); + static_assert(!std::is_same<T, std::remove_cv<unexpect_t>>::value, + "T must not be unexpect_t"); + static_assert(!std::is_same<T, std::remove_cv<unexpected<E>>>::value, + "T must not be unexpected<E>"); + static_assert(!std::is_reference<E>::value, "E must not be a reference"); + + T *valptr() { return std::addressof(this->m_val); } + const T *valptr() const { return std::addressof(this->m_val); } + unexpected<E> *errptr() { return std::addressof(this->m_unexpect); } + const unexpected<E> *errptr() const { return std::addressof(this->m_unexpect); } + + template <class U = T, + detail::enable_if_t<!std::is_void<U>::value> * = nullptr> + U &val() { + return this->m_val; + } + unexpected<E> &err() { return this->m_unexpect; } + + template <class U = T, + detail::enable_if_t<!std::is_void<U>::value> * = nullptr> + const U &val() const { + return this->m_val; + } + const unexpected<E> &err() const { return this->m_unexpect; } + + using impl_base = detail::expected_move_assign_base<T, E>; + using ctor_base = detail::expected_default_ctor_base<T, E>; + +public: + typedef T value_type; + typedef E error_type; + typedef unexpected<E> unexpected_type; + +#if 
defined(TL_EXPECTED_CXX14) && !defined(TL_EXPECTED_GCC49) && \ + !defined(TL_EXPECTED_GCC54) && !defined(TL_EXPECTED_GCC55) + /// \group and_then + /// Carries out some operation which returns an expected on the stored object + /// if there is one. \requires `std::invoke(std::forward<F>(f), value())` + /// returns an `expected<U>` for some `U`. \returns Let `U` be the result + /// of `std::invoke(std::forward<F>(f), value())`. Returns an + /// `expected<U>`. The return value is empty if `*this` is empty, + /// otherwise the return value of `std::invoke(std::forward<F>(f), value())` + /// is returned. + /// \synopsis template <class F>\nconstexpr auto and_then(F &&f) &; + template <class F> TL_EXPECTED_11_CONSTEXPR auto and_then(F &&f) & { + return and_then_impl(*this, std::forward<F>(f)); + } + + /// \group and_then + /// \synopsis template <class F>\nconstexpr auto and_then(F &&f) &&; + template <class F> TL_EXPECTED_11_CONSTEXPR auto and_then(F &&f) && { + return and_then_impl(std::move(*this), std::forward<F>(f)); + } + + /// \group and_then + /// \synopsis template <class F>\nconstexpr auto and_then(F &&f) const &; + template <class F> constexpr auto and_then(F &&f) const & { + return and_then_impl(*this, std::forward<F>(f)); + } + +#ifndef TL_EXPECTED_NO_CONSTRR + /// \group and_then + /// \synopsis template <class F>\nconstexpr auto and_then(F &&f) const &&; + template <class F> constexpr auto and_then(F &&f) const && { + return and_then_impl(std::move(*this), std::forward<F>(f)); + } +#endif + +#else + /// \group and_then + /// Carries out some operation which returns an expected on the stored object + /// if there is one. \requires `std::invoke(std::forward<F>(f), value())` + /// returns an `expected<U>` for some `U`. \returns Let `U` be the result + /// of `std::invoke(std::forward<F>(f), value())`. Returns an + /// `expected<U>`. 
The return value is empty if `*this` is empty, + /// otherwise the return value of `std::invoke(std::forward<F>(f), value())` + /// is returned. + /// \synopsis template <class F>\nconstexpr auto and_then(F &&f) &; + template <class F> + TL_EXPECTED_11_CONSTEXPR auto + and_then(F &&f) & -> decltype(and_then_impl(*this, std::forward<F>(f))) { + return and_then_impl(*this, std::forward<F>(f)); + } + + /// \group and_then + /// \synopsis template <class F>\nconstexpr auto and_then(F &&f) &&; + template <class F> + TL_EXPECTED_11_CONSTEXPR auto and_then(F &&f) && -> decltype( + and_then_impl(std::move(*this), std::forward<F>(f))) { + return and_then_impl(std::move(*this), std::forward<F>(f)); + } + + /// \group and_then + /// \synopsis template <class F>\nconstexpr auto and_then(F &&f) const &; + template <class F> + constexpr auto and_then(F &&f) const & -> decltype( + and_then_impl(*this, std::forward<F>(f))) { + return and_then_impl(*this, std::forward<F>(f)); + } + +#ifndef TL_EXPECTED_NO_CONSTRR + /// \group and_then + /// \synopsis template <class F>\nconstexpr auto and_then(F &&f) const &&; + template <class F> + constexpr auto and_then(F &&f) const && -> decltype( + and_then_impl(std::move(*this), std::forward<F>(f))) { + return and_then_impl(std::move(*this), std::forward<F>(f)); + } +#endif +#endif + +#if defined(TL_EXPECTED_CXX14) && !defined(TL_EXPECTED_GCC49) && \ + !defined(TL_EXPECTED_GCC54) && !defined(TL_EXPECTED_GCC55) + /// \brief Carries out some operation on the stored object if there is one. + /// \returns Let `U` be the result of `std::invoke(std::forward<F>(f), + /// value())`. If `U` is `void`, returns an `expected<monostate,E>, otherwise + // returns an `expected<U,E>`. If `*this` is unexpected, the + /// result is `*this`, otherwise an `expected<U,E>` is constructed from the + /// return value of `std::invoke(std::forward<F>(f), value())` and is + /// returned. 
  ///
  /// \group map
  /// \synopsis template <class F> constexpr auto map(F &&f) &;
  template <class F> TL_EXPECTED_11_CONSTEXPR auto map(F &&f) & {
    return expected_map_impl(*this, std::forward<F>(f));
  }

  /// \group map
  /// \synopsis template <class F> constexpr auto map(F &&f) &&;
  template <class F> TL_EXPECTED_11_CONSTEXPR auto map(F &&f) && {
    return expected_map_impl(std::move(*this), std::forward<F>(f));
  }

  /// \group map
  /// \synopsis template <class F> constexpr auto map(F &&f) const &;
  template <class F> constexpr auto map(F &&f) const & {
    return expected_map_impl(*this, std::forward<F>(f));
  }

  /// \group map
  /// \synopsis template <class F> constexpr auto map(F &&f) const &&;
  template <class F> constexpr auto map(F &&f) const && {
    return expected_map_impl(std::move(*this), std::forward<F>(f));
  }
#else
  /// \brief Carries out some operation on the stored object if there is one.
  /// \returns Let `U` be the result of `std::invoke(std::forward<F>(f),
  /// value())`. If `U` is `void`, returns an `expected<monostate,E>`, otherwise
  /// returns an `expected<U,E>`. If `*this` is unexpected, the
  /// result is `*this`, otherwise an `expected<U,E>` is constructed from the
  /// return value of `std::invoke(std::forward<F>(f), value())` and is
  /// returned.
  ///
  /// \group map
  /// \synopsis template <class F> constexpr auto map(F &&f) &;
  template <class F>
  TL_EXPECTED_11_CONSTEXPR decltype(
      expected_map_impl(std::declval<expected &>(), std::declval<F &&>()))
  map(F &&f) & {
    return expected_map_impl(*this, std::forward<F>(f));
  }

  /// \group map
  /// \synopsis template <class F> constexpr auto map(F &&f) &&;
  template <class F>
  TL_EXPECTED_11_CONSTEXPR decltype(
      expected_map_impl(std::declval<expected>(), std::declval<F &&>()))
  map(F &&f) && {
    return expected_map_impl(std::move(*this), std::forward<F>(f));
  }

  /// \group map
  /// \synopsis template <class F> constexpr auto map(F &&f) const &;
  template <class F>
  constexpr decltype(expected_map_impl(std::declval<const expected &>(),
                                       std::declval<F &&>()))
  map(F &&f) const & {
    return expected_map_impl(*this, std::forward<F>(f));
  }

#ifndef TL_EXPECTED_NO_CONSTRR
  /// \group map
  /// \synopsis template <class F> constexpr auto map(F &&f) const &&;
  template <class F>
  constexpr decltype(expected_map_impl(std::declval<const expected &&>(),
                                       std::declval<F &&>()))
  map(F &&f) const && {
    return expected_map_impl(std::move(*this), std::forward<F>(f));
  }
#endif
#endif

#if defined(TL_EXPECTED_CXX14) && !defined(TL_EXPECTED_GCC49) && \
    !defined(TL_EXPECTED_GCC54) && !defined(TL_EXPECTED_GCC55)
  /// \brief Carries out some operation on the stored unexpected object if there
  /// is one.
  /// \returns Let `U` be the result of `std::invoke(std::forward<F>(f),
  /// error())`. If `U` is `void`, returns an `expected<T,monostate>`, otherwise
  /// returns an `expected<T,U>`. If `*this` has an expected
  /// value, the result is `*this`, otherwise an `expected<T,U>` is constructed
  /// from `make_unexpected(std::invoke(std::forward<F>(f), error()))` and is
  /// returned.
  /// (The original comment said `value()`, contradicting its own \brief —
  /// map_error operates on the stored error.)
  ///
  /// \group map_error
  /// \synopsis template <class F> constexpr auto map_error(F &&f) &;
  template <class F> TL_EXPECTED_11_CONSTEXPR auto map_error(F &&f) & {
    return map_error_impl(*this, std::forward<F>(f));
  }

  /// \group map_error
  /// \synopsis template <class F> constexpr auto map_error(F &&f) &&;
  template <class F> TL_EXPECTED_11_CONSTEXPR auto map_error(F &&f) && {
    return map_error_impl(std::move(*this), std::forward<F>(f));
  }

  /// \group map_error
  /// \synopsis template <class F> constexpr auto map_error(F &&f) const &;
  template <class F> constexpr auto map_error(F &&f) const & {
    return map_error_impl(*this, std::forward<F>(f));
  }

  /// \group map_error
  /// \synopsis template <class F> constexpr auto map_error(F &&f) const &&;
  template <class F> constexpr auto map_error(F &&f) const && {
    return map_error_impl(std::move(*this), std::forward<F>(f));
  }
#else
  /// \brief Carries out some operation on the stored unexpected object if there
  /// is one.
  /// \returns Let `U` be the result of `std::invoke(std::forward<F>(f),
  /// error())`. Returns an `expected<T,U>`. If `*this` has an expected
  /// value, the result is `*this`, otherwise an `expected<T,U>` is constructed
  /// from `make_unexpected(std::invoke(std::forward<F>(f), error()))` and is
  /// returned.
  /// (The original comment said `value()`, contradicting its own \brief —
  /// map_error operates on the stored error.)
  ///
  /// \group map_error
  /// \synopsis template <class F> constexpr auto map_error(F &&f) &;
  template <class F>
  TL_EXPECTED_11_CONSTEXPR decltype(map_error_impl(std::declval<expected &>(),
                                                   std::declval<F &&>()))
  map_error(F &&f) & {
    return map_error_impl(*this, std::forward<F>(f));
  }

  /// \group map_error
  /// \synopsis template <class F> constexpr auto map_error(F &&f) &&;
  template <class F>
  TL_EXPECTED_11_CONSTEXPR decltype(map_error_impl(std::declval<expected &&>(),
                                                   std::declval<F &&>()))
  map_error(F &&f) && {
    return map_error_impl(std::move(*this), std::forward<F>(f));
  }

  /// \group map_error
  /// \synopsis template <class F> constexpr auto map_error(F &&f) const &;
  template <class F>
  constexpr decltype(map_error_impl(std::declval<const expected &>(),
                                    std::declval<F &&>()))
  map_error(F &&f) const & {
    return map_error_impl(*this, std::forward<F>(f));
  }

#ifndef TL_EXPECTED_NO_CONSTRR
  /// \group map_error
  /// \synopsis template <class F> constexpr auto map_error(F &&f) const &&;
  template <class F>
  constexpr decltype(map_error_impl(std::declval<const expected &&>(),
                                    std::declval<F &&>()))
  map_error(F &&f) const && {
    return map_error_impl(std::move(*this), std::forward<F>(f));
  }
#endif
#endif

  /// \brief Calls `f` if the expected is in the unexpected state
  /// \requires `F` is invocable with `E`, and `std::invoke_result_t<F>`
  /// must be void or convertible to `expected<T,E>`.
  /// \effects If `*this` has a value, returns `*this`.
  /// Otherwise, if `f` returns `void`, calls `std::forward<F>(f)(E)` and
  /// returns the original `expected` (NOTE(review): the original comment said
  /// `std::nullopt`, which looks copied from `optional` — confirm against
  /// `or_else_impl`). Otherwise, returns `std::forward<F>(f)(E)`.
+ /// + /// \group or_else + template <class F> expected TL_EXPECTED_11_CONSTEXPR or_else(F &&f) & { + return or_else_impl(*this, std::forward<F>(f)); + } + + template <class F> expected TL_EXPECTED_11_CONSTEXPR or_else(F &&f) && { + return or_else_impl(std::move(*this), std::forward<F>(f)); + } + + template <class F> expected constexpr or_else(F &&f) const & { + return or_else_impl(*this, std::forward<F>(f)); + } + +#ifndef TL_EXPECTED_NO_CONSTRR + template <class F> expected constexpr or_else(F &&f) const && { + return or_else_impl(std::move(*this), std::forward<F>(f)); + } +#endif + constexpr expected() = default; + constexpr expected(const expected &rhs) = default; + constexpr expected(expected &&rhs) = default; + expected &operator=(const expected &rhs) = default; + expected &operator=(expected &&rhs) = default; + + template <class... Args, + detail::enable_if_t<std::is_constructible<T, Args &&...>::value> * = + nullptr> + constexpr expected(in_place_t, Args &&... args) + : impl_base(in_place, std::forward<Args>(args)...), + ctor_base(detail::default_constructor_tag{}) {} + + template <class U, class... Args, + detail::enable_if_t<std::is_constructible< + T, std::initializer_list<U> &, Args &&...>::value> * = nullptr> + constexpr expected(in_place_t, std::initializer_list<U> il, Args &&... 
args) + : impl_base(in_place, il, std::forward<Args>(args)...), + ctor_base(detail::default_constructor_tag{}) {} + + /// \group unexpected_ctor + /// \synopsis EXPLICIT constexpr expected(const unexpected<G> &e); + template <class G = E, + detail::enable_if_t<std::is_constructible<E, const G &>::value> * = + nullptr, + detail::enable_if_t<!std::is_convertible<const G &, E>::value> * = + nullptr> + explicit constexpr expected(const unexpected<G> &e) + : impl_base(unexpect, e.value()), + ctor_base(detail::default_constructor_tag{}) {} + + /// \exclude + template < + class G = E, + detail::enable_if_t<std::is_constructible<E, const G &>::value> * = + nullptr, + detail::enable_if_t<std::is_convertible<const G &, E>::value> * = nullptr> + constexpr expected(unexpected<G> const &e) + : impl_base(unexpect, e.value()), + ctor_base(detail::default_constructor_tag{}) {} + + /// \group unexpected_ctor + /// \synopsis EXPLICIT constexpr expected(unexpected<G> &&e); + template < + class G = E, + detail::enable_if_t<std::is_constructible<E, G &&>::value> * = nullptr, + detail::enable_if_t<!std::is_convertible<G &&, E>::value> * = nullptr> + explicit constexpr expected(unexpected<G> &&e) noexcept( + std::is_nothrow_constructible<E, G &&>::value) + : impl_base(unexpect, std::move(e.value())), + ctor_base(detail::default_constructor_tag{}) {} + + /// \exclude + template < + class G = E, + detail::enable_if_t<std::is_constructible<E, G &&>::value> * = nullptr, + detail::enable_if_t<std::is_convertible<G &&, E>::value> * = nullptr> + constexpr expected(unexpected<G> &&e) noexcept( + std::is_nothrow_constructible<E, G &&>::value) + : impl_base(unexpect, std::move(e.value())), + ctor_base(detail::default_constructor_tag{}) {} + + template <class... Args, + detail::enable_if_t<std::is_constructible<E, Args &&...>::value> * = + nullptr> + constexpr explicit expected(unexpect_t, Args &&... 
args) + : impl_base(unexpect, std::forward<Args>(args)...), + ctor_base(detail::default_constructor_tag{}) {} + + /// \exclude + template <class U, class... Args, + detail::enable_if_t<std::is_constructible< + E, std::initializer_list<U> &, Args &&...>::value> * = nullptr> + constexpr explicit expected(unexpect_t, std::initializer_list<U> il, + Args &&... args) + : impl_base(unexpect, il, std::forward<Args>(args)...), + ctor_base(detail::default_constructor_tag{}) {} + + template <class U, class G, + detail::enable_if_t<!(std::is_convertible<U const &, T>::value && + std::is_convertible<G const &, E>::value)> * = + nullptr, + detail::expected_enable_from_other<T, E, U, G, const U &, const G &> + * = nullptr> + explicit TL_EXPECTED_11_CONSTEXPR expected(const expected<U, G> &rhs) + : ctor_base(detail::default_constructor_tag{}) { + if (rhs.has_value()) { + this->construct(*rhs); + } else { + this->construct_error(rhs.error()); + } + } + + /// \exclude + template <class U, class G, + detail::enable_if_t<(std::is_convertible<U const &, T>::value && + std::is_convertible<G const &, E>::value)> * = + nullptr, + detail::expected_enable_from_other<T, E, U, G, const U &, const G &> + * = nullptr> + TL_EXPECTED_11_CONSTEXPR expected(const expected<U, G> &rhs) + : ctor_base(detail::default_constructor_tag{}) { + if (rhs.has_value()) { + this->construct(*rhs); + } else { + this->construct_error(rhs.error()); + } + } + + template < + class U, class G, + detail::enable_if_t<!(std::is_convertible<U &&, T>::value && + std::is_convertible<G &&, E>::value)> * = nullptr, + detail::expected_enable_from_other<T, E, U, G, U &&, G &&> * = nullptr> + explicit TL_EXPECTED_11_CONSTEXPR expected(expected<U, G> &&rhs) + : ctor_base(detail::default_constructor_tag{}) { + if (rhs.has_value()) { + this->construct(std::move(*rhs)); + } else { + this->construct_error(std::move(rhs.error())); + } + } + + /// \exclude + template < + class U, class G, + detail::enable_if_t<(std::is_convertible<U &&, 
T>::value && + std::is_convertible<G &&, E>::value)> * = nullptr, + detail::expected_enable_from_other<T, E, U, G, U &&, G &&> * = nullptr> + TL_EXPECTED_11_CONSTEXPR expected(expected<U, G> &&rhs) + : ctor_base(detail::default_constructor_tag{}) { + if (rhs.has_value()) { + this->construct(std::move(*rhs)); + } else { + this->construct_error(std::move(rhs.error())); + } + } + + template < + class U = T, + detail::enable_if_t<!std::is_convertible<U &&, T>::value> * = nullptr, + detail::expected_enable_forward_value<T, E, U> * = nullptr> + explicit TL_EXPECTED_MSVC2015_CONSTEXPR expected(U &&v) + : expected(in_place, std::forward<U>(v)) {} + + /// \exclude + template < + class U = T, + detail::enable_if_t<std::is_convertible<U &&, T>::value> * = nullptr, + detail::expected_enable_forward_value<T, E, U> * = nullptr> + TL_EXPECTED_MSVC2015_CONSTEXPR expected(U &&v) + : expected(in_place, std::forward<U>(v)) {} + + template < + class U = T, class G = T, + detail::enable_if_t<std::is_nothrow_constructible<T, U &&>::value> * = + nullptr, + detail::enable_if_t<!std::is_void<G>::value> * = nullptr, + detail::enable_if_t< + (!std::is_same<expected<T, E>, detail::decay_t<U>>::value && + !detail::conjunction<std::is_scalar<T>, + std::is_same<T, detail::decay_t<U>>>::value && + std::is_constructible<T, U>::value && + std::is_assignable<G &, U>::value && + std::is_nothrow_move_constructible<E>::value)> * = nullptr> + expected &operator=(U &&v) { + if (has_value()) { + val() = std::forward<U>(v); + } else { + err().~unexpected<E>(); + ::new (valptr()) T(std::forward<U>(v)); + this->m_has_val = true; + } + + return *this; + } + + /// \exclude + template < + class U = T, class G = T, + detail::enable_if_t<!std::is_nothrow_constructible<T, U &&>::value> * = + nullptr, + detail::enable_if_t<!std::is_void<U>::value> * = nullptr, + detail::enable_if_t< + (!std::is_same<expected<T, E>, detail::decay_t<U>>::value && + !detail::conjunction<std::is_scalar<T>, + std::is_same<T, 
detail::decay_t<U>>>::value && + std::is_constructible<T, U>::value && + std::is_assignable<G &, U>::value && + std::is_nothrow_move_constructible<E>::value)> * = nullptr> + expected &operator=(U &&v) { + if (has_value()) { + val() = std::forward<U>(v); + } else { + auto tmp = std::move(err()); + err().~unexpected<E>(); + + #ifdef TL_EXPECTED_EXCEPTIONS_ENABLED + try { + ::new (valptr()) T(std::move(v)); + this->m_has_val = true; + } catch (...) { + err() = std::move(tmp); + throw; + } + #else + ::new (valptr()) T(std::move(v)); + this->m_has_val = true; + #endif + } + + return *this; + } + + template <class G = E, + detail::enable_if_t<std::is_nothrow_copy_constructible<G>::value && + std::is_assignable<G &, G>::value> * = nullptr> + expected &operator=(const unexpected<G> &rhs) { + if (!has_value()) { + err() = rhs; + } else { + this->destroy_val(); + ::new (errptr()) unexpected<E>(rhs); + this->m_has_val = false; + } + + return *this; + } + + template <class G = E, + detail::enable_if_t<std::is_nothrow_move_constructible<G>::value && + std::is_move_assignable<G>::value> * = nullptr> + expected &operator=(unexpected<G> &&rhs) noexcept { + if (!has_value()) { + err() = std::move(rhs); + } else { + this->destroy_val(); + ::new (errptr()) unexpected<E>(std::move(rhs)); + this->m_has_val = false; + } + + return *this; + } + + template <class... Args, detail::enable_if_t<std::is_nothrow_constructible< + T, Args &&...>::value> * = nullptr> + void emplace(Args &&... args) { + if (has_value()) { + val() = T(std::forward<Args>(args)...); + } else { + err().~unexpected<E>(); + ::new (valptr()) T(std::forward<Args>(args)...); + this->m_has_val = true; + } + } + + /// \exclude + template <class... Args, detail::enable_if_t<!std::is_nothrow_constructible< + T, Args &&...>::value> * = nullptr> + void emplace(Args &&... 
args) { + if (has_value()) { + val() = T(std::forward<Args>(args)...); + } else { + auto tmp = std::move(err()); + err().~unexpected<E>(); + + #ifdef TL_EXPECTED_EXCEPTIONS_ENABLED + try { + ::new (valptr()) T(std::forward<Args>(args)...); + this->m_has_val = true; + } catch (...) { + err() = std::move(tmp); + throw; + } + #else + ::new (valptr()) T(std::forward<Args>(args)...); + this->m_has_val = true; + #endif + } + } + + template <class U, class... Args, + detail::enable_if_t<std::is_nothrow_constructible< + T, std::initializer_list<U> &, Args &&...>::value> * = nullptr> + void emplace(std::initializer_list<U> il, Args &&... args) { + if (has_value()) { + T t(il, std::forward<Args>(args)...); + val() = std::move(t); + } else { + err().~unexpected<E>(); + ::new (valptr()) T(il, std::forward<Args>(args)...); + this->m_has_val = true; + } + } + + /// \exclude + template <class U, class... Args, + detail::enable_if_t<!std::is_nothrow_constructible< + T, std::initializer_list<U> &, Args &&...>::value> * = nullptr> + void emplace(std::initializer_list<U> il, Args &&... args) { + if (has_value()) { + T t(il, std::forward<Args>(args)...); + val() = std::move(t); + } else { + auto tmp = std::move(err()); + err().~unexpected<E>(); + + #ifdef TL_EXPECTED_EXCEPTIONS_ENABLED + try { + ::new (valptr()) T(il, std::forward<Args>(args)...); + this->m_has_val = true; + } catch (...) 
{ + err() = std::move(tmp); + throw; + } + #else + ::new (valptr()) T(il, std::forward<Args>(args)...); + this->m_has_val = true; + #endif + } + } + + // TODO SFINAE + void swap(expected &rhs) noexcept( + std::is_nothrow_move_constructible<T>::value &&noexcept( + swap(std::declval<T &>(), std::declval<T &>())) && + std::is_nothrow_move_constructible<E>::value && + noexcept(swap(std::declval<E &>(), std::declval<E &>()))) { + if (has_value() && rhs.has_value()) { + using std::swap; + swap(val(), rhs.val()); + } else if (!has_value() && rhs.has_value()) { + using std::swap; + swap(err(), rhs.err()); + } else if (has_value()) { + auto temp = std::move(rhs.err()); + ::new (rhs.valptr()) T(val()); + ::new (errptr()) unexpected_type(std::move(temp)); + std::swap(this->m_has_val, rhs.m_has_val); + } else { + auto temp = std::move(this->err()); + ::new (valptr()) T(rhs.val()); + ::new (errptr()) unexpected_type(std::move(temp)); + std::swap(this->m_has_val, rhs.m_has_val); + } + } + + /// \returns a pointer to the stored value + /// \requires a value is stored + /// \group pointer + constexpr const T *operator->() const { return valptr(); } + /// \group pointer + TL_EXPECTED_11_CONSTEXPR T *operator->() { return valptr(); } + + /// \returns the stored value + /// \requires a value is stored + /// \group deref + template <class U = T, + detail::enable_if_t<!std::is_void<U>::value> * = nullptr> + constexpr const U &operator*() const & { + return val(); + } + /// \group deref + template <class U = T, + detail::enable_if_t<!std::is_void<U>::value> * = nullptr> + TL_EXPECTED_11_CONSTEXPR U &operator*() & { + return val(); + } + /// \group deref + template <class U = T, + detail::enable_if_t<!std::is_void<U>::value> * = nullptr> + constexpr const U &&operator*() const && { + return std::move(val()); + } + /// \group deref + template <class U = T, + detail::enable_if_t<!std::is_void<U>::value> * = nullptr> + TL_EXPECTED_11_CONSTEXPR U &&operator*() && { + return std::move(val()); 
+ } + + /// \returns whether or not the optional has a value + /// \group has_value + constexpr bool has_value() const noexcept { return this->m_has_val; } + /// \group has_value + constexpr explicit operator bool() const noexcept { return this->m_has_val; } + + /// \returns the contained value if there is one, otherwise throws + /// [bad_expected_access] + /// + /// \group value + template <class U = T, + detail::enable_if_t<!std::is_void<U>::value> * = nullptr> + TL_EXPECTED_11_CONSTEXPR const U &value() const & { + if (!has_value()) + detail::throw_exception(bad_expected_access<E>(err().value())); + return val(); + } + /// \group value + template <class U = T, + detail::enable_if_t<!std::is_void<U>::value> * = nullptr> + TL_EXPECTED_11_CONSTEXPR U &value() & { + if (!has_value()) + detail::throw_exception(bad_expected_access<E>(err().value())); + return val(); + } + /// \group value + template <class U = T, + detail::enable_if_t<!std::is_void<U>::value> * = nullptr> + TL_EXPECTED_11_CONSTEXPR const U &&value() const && { + if (!has_value()) + detail::throw_exception(bad_expected_access<E>(err().value())); + return std::move(val()); + } + /// \group value + template <class U = T, + detail::enable_if_t<!std::is_void<U>::value> * = nullptr> + TL_EXPECTED_11_CONSTEXPR U &&value() && { + if (!has_value()) + detail::throw_exception(bad_expected_access<E>(err().value())); + return std::move(val()); + } + + /// \returns the unexpected value + /// \requires there is an unexpected value + /// \group error + constexpr const E &error() const & { return err().value(); } + /// \group error + TL_EXPECTED_11_CONSTEXPR E &error() & { return err().value(); } + /// \group error + constexpr const E &&error() const && { return std::move(err().value()); } + /// \group error + TL_EXPECTED_11_CONSTEXPR E &&error() && { return std::move(err().value()); } + + /// \returns the stored value if there is one, otherwise returns `u` + /// \group value_or + template <class U> constexpr T 
value_or(U &&v) const & { + static_assert(std::is_copy_constructible<T>::value && + std::is_convertible<U &&, T>::value, + "T must be copy-constructible and convertible to from U&&"); + return bool(*this) ? **this : static_cast<T>(std::forward<U>(v)); + } + /// \group value_or + template <class U> TL_EXPECTED_11_CONSTEXPR T value_or(U &&v) && { + static_assert(std::is_move_constructible<T>::value && + std::is_convertible<U &&, T>::value, + "T must be move-constructible and convertible to from U&&"); + return bool(*this) ? std::move(**this) : static_cast<T>(std::forward<U>(v)); + } +}; + +/// \exclude +namespace detail { +template <class Exp> using exp_t = typename detail::decay_t<Exp>::value_type; +template <class Exp> using err_t = typename detail::decay_t<Exp>::error_type; +template <class Exp, class Ret> using ret_t = expected<Ret, err_t<Exp>>; + +#ifdef TL_EXPECTED_CXX14 +template <class Exp, class F, + detail::enable_if_t<!std::is_void<exp_t<Exp>>::value> * = nullptr, + class Ret = decltype(detail::invoke(std::declval<F>(), + *std::declval<Exp>()))> +constexpr auto and_then_impl(Exp &&exp, F &&f) { + static_assert(detail::is_expected<Ret>::value, "F must return an expected"); + + return exp.has_value() + ? detail::invoke(std::forward<F>(f), *std::forward<Exp>(exp)) + : Ret(unexpect, exp.error()); +} + +template <class Exp, class F, + detail::enable_if_t<std::is_void<exp_t<Exp>>::value> * = nullptr, + class Ret = decltype(detail::invoke(std::declval<F>()))> +constexpr auto and_then_impl(Exp &&exp, F &&f) { + static_assert(detail::is_expected<Ret>::value, "F must return an expected"); + + return exp.has_value() ? 
detail::invoke(std::forward<F>(f)) + : Ret(unexpect, exp.error()); +} +#else +template <class> struct TC; +template <class Exp, class F, + class Ret = decltype(detail::invoke(std::declval<F>(), + *std::declval<Exp>())), + detail::enable_if_t<!std::is_void<exp_t<Exp>>::value> * = nullptr> +auto and_then_impl(Exp &&exp, F &&f) -> Ret { + static_assert(detail::is_expected<Ret>::value, "F must return an expected"); + + return exp.has_value() + ? detail::invoke(std::forward<F>(f), *std::forward<Exp>(exp)) + : Ret(unexpect, exp.error()); +} + +template <class Exp, class F, + class Ret = decltype(detail::invoke(std::declval<F>())), + detail::enable_if_t<std::is_void<exp_t<Exp>>::value> * = nullptr> +constexpr auto and_then_impl(Exp &&exp, F &&f) -> Ret { + static_assert(detail::is_expected<Ret>::value, "F must return an expected"); + + return exp.has_value() ? detail::invoke(std::forward<F>(f)) + : Ret(unexpect, exp.error()); +} +#endif + +#ifdef TL_EXPECTED_CXX14 +template <class Exp, class F, + detail::enable_if_t<!std::is_void<exp_t<Exp>>::value> * = nullptr, + class Ret = decltype(detail::invoke(std::declval<F>(), + *std::declval<Exp>())), + detail::enable_if_t<!std::is_void<Ret>::value> * = nullptr> +constexpr auto expected_map_impl(Exp &&exp, F &&f) { + using result = ret_t<Exp, detail::decay_t<Ret>>; + return exp.has_value() ? 
result(detail::invoke(std::forward<F>(f), + *std::forward<Exp>(exp))) + : result(unexpect, std::forward<Exp>(exp).error()); +} + +template <class Exp, class F, + detail::enable_if_t<!std::is_void<exp_t<Exp>>::value> * = nullptr, + class Ret = decltype(detail::invoke(std::declval<F>(), + *std::declval<Exp>())), + detail::enable_if_t<std::is_void<Ret>::value> * = nullptr> +auto expected_map_impl(Exp &&exp, F &&f) { + using result = expected<void, err_t<Exp>>; + if (exp.has_value()) { + detail::invoke(std::forward<F>(f), *std::forward<Exp>(exp)); + return result(); + } + + return result(unexpect, std::forward<Exp>(exp).error()); +} + +template <class Exp, class F, + detail::enable_if_t<std::is_void<exp_t<Exp>>::value> * = nullptr, + class Ret = decltype(detail::invoke(std::declval<F>())), + detail::enable_if_t<!std::is_void<Ret>::value> * = nullptr> +constexpr auto expected_map_impl(Exp &&exp, F &&f) { + using result = ret_t<Exp, detail::decay_t<Ret>>; + return exp.has_value() ? result(detail::invoke(std::forward<F>(f))) + : result(unexpect, std::forward<Exp>(exp).error()); +} + +template <class Exp, class F, + detail::enable_if_t<std::is_void<exp_t<Exp>>::value> * = nullptr, + class Ret = decltype(detail::invoke(std::declval<F>())), + detail::enable_if_t<std::is_void<Ret>::value> * = nullptr> +auto expected_map_impl(Exp &&exp, F &&f) { + using result = expected<void, err_t<Exp>>; + if (exp.has_value()) { + detail::invoke(std::forward<F>(f)); + return result(); + } + + return result(unexpect, std::forward<Exp>(exp).error()); +} +#else +template <class Exp, class F, + detail::enable_if_t<!std::is_void<exp_t<Exp>>::value> * = nullptr, + class Ret = decltype(detail::invoke(std::declval<F>(), + *std::declval<Exp>())), + detail::enable_if_t<!std::is_void<Ret>::value> * = nullptr> + +constexpr auto expected_map_impl(Exp &&exp, F &&f) + -> ret_t<Exp, detail::decay_t<Ret>> { + using result = ret_t<Exp, detail::decay_t<Ret>>; + + return exp.has_value() ? 
result(detail::invoke(std::forward<F>(f), + *std::forward<Exp>(exp))) + : result(unexpect, std::forward<Exp>(exp).error()); +} + +template <class Exp, class F, + detail::enable_if_t<!std::is_void<exp_t<Exp>>::value> * = nullptr, + class Ret = decltype(detail::invoke(std::declval<F>(), + *std::declval<Exp>())), + detail::enable_if_t<std::is_void<Ret>::value> * = nullptr> + +auto expected_map_impl(Exp &&exp, F &&f) -> expected<void, err_t<Exp>> { + if (exp.has_value()) { + detail::invoke(std::forward<F>(f), *std::forward<Exp>(exp)); + return {}; + } + + return unexpected<err_t<Exp>>(std::forward<Exp>(exp).error()); +} + +template <class Exp, class F, + detail::enable_if_t<std::is_void<exp_t<Exp>>::value> * = nullptr, + class Ret = decltype(detail::invoke(std::declval<F>())), + detail::enable_if_t<!std::is_void<Ret>::value> * = nullptr> + +constexpr auto expected_map_impl(Exp &&exp, F &&f) + -> ret_t<Exp, detail::decay_t<Ret>> { + using result = ret_t<Exp, detail::decay_t<Ret>>; + + return exp.has_value() ? 
result(detail::invoke(std::forward<F>(f))) + : result(unexpect, std::forward<Exp>(exp).error()); +} + +template <class Exp, class F, + detail::enable_if_t<std::is_void<exp_t<Exp>>::value> * = nullptr, + class Ret = decltype(detail::invoke(std::declval<F>())), + detail::enable_if_t<std::is_void<Ret>::value> * = nullptr> + +auto expected_map_impl(Exp &&exp, F &&f) -> expected<void, err_t<Exp>> { + if (exp.has_value()) { + detail::invoke(std::forward<F>(f)); + return {}; + } + + return unexpected<err_t<Exp>>(std::forward<Exp>(exp).error()); +} +#endif + +#if defined(TL_EXPECTED_CXX14) && !defined(TL_EXPECTED_GCC49) && \ + !defined(TL_EXPECTED_GCC54) && !defined(TL_EXPECTED_GCC55) +template <class Exp, class F, + detail::enable_if_t<!std::is_void<exp_t<Exp>>::value> * = nullptr, + class Ret = decltype(detail::invoke(std::declval<F>(), + std::declval<Exp>().error())), + detail::enable_if_t<!std::is_void<Ret>::value> * = nullptr> +constexpr auto map_error_impl(Exp &&exp, F &&f) { + using result = expected<exp_t<Exp>, detail::decay_t<Ret>>; + return exp.has_value() + ? 
result(*std::forward<Exp>(exp)) + : result(unexpect, detail::invoke(std::forward<F>(f), + std::forward<Exp>(exp).error())); +} +template <class Exp, class F, + detail::enable_if_t<!std::is_void<exp_t<Exp>>::value> * = nullptr, + class Ret = decltype(detail::invoke(std::declval<F>(), + std::declval<Exp>().error())), + detail::enable_if_t<std::is_void<Ret>::value> * = nullptr> +auto map_error_impl(Exp &&exp, F &&f) { + using result = expected<exp_t<Exp>, monostate>; + if (exp.has_value()) { + return result(*std::forward<Exp>(exp)); + } + + detail::invoke(std::forward<F>(f), std::forward<Exp>(exp).error()); + return result(unexpect, monostate{}); +} +template <class Exp, class F, + detail::enable_if_t<std::is_void<exp_t<Exp>>::value> * = nullptr, + class Ret = decltype(detail::invoke(std::declval<F>(), + std::declval<Exp>().error())), + detail::enable_if_t<!std::is_void<Ret>::value> * = nullptr> +constexpr auto map_error_impl(Exp &&exp, F &&f) { + using result = expected<exp_t<Exp>, detail::decay_t<Ret>>; + return exp.has_value() + ? 
result() + : result(unexpect, detail::invoke(std::forward<F>(f), + std::forward<Exp>(exp).error())); +} +template <class Exp, class F, + detail::enable_if_t<std::is_void<exp_t<Exp>>::value> * = nullptr, + class Ret = decltype(detail::invoke(std::declval<F>(), + std::declval<Exp>().error())), + detail::enable_if_t<std::is_void<Ret>::value> * = nullptr> +auto map_error_impl(Exp &&exp, F &&f) { + using result = expected<exp_t<Exp>, monostate>; + if (exp.has_value()) { + return result(); + } + + detail::invoke(std::forward<F>(f), std::forward<Exp>(exp).error()); + return result(unexpect, monostate{}); +} +#else +template <class Exp, class F, + detail::enable_if_t<!std::is_void<exp_t<Exp>>::value> * = nullptr, + class Ret = decltype(detail::invoke(std::declval<F>(), + std::declval<Exp>().error())), + detail::enable_if_t<!std::is_void<Ret>::value> * = nullptr> +constexpr auto map_error_impl(Exp &&exp, F &&f) + -> expected<exp_t<Exp>, detail::decay_t<Ret>> { + using result = expected<exp_t<Exp>, detail::decay_t<Ret>>; + + return exp.has_value() + ? 
result(*std::forward<Exp>(exp)) + : result(unexpect, detail::invoke(std::forward<F>(f), + std::forward<Exp>(exp).error())); +} + +template <class Exp, class F, + detail::enable_if_t<!std::is_void<exp_t<Exp>>::value> * = nullptr, + class Ret = decltype(detail::invoke(std::declval<F>(), + std::declval<Exp>().error())), + detail::enable_if_t<std::is_void<Ret>::value> * = nullptr> +auto map_error_impl(Exp &&exp, F &&f) -> expected<exp_t<Exp>, monostate> { + using result = expected<exp_t<Exp>, monostate>; + if (exp.has_value()) { + return result(*std::forward<Exp>(exp)); + } + + detail::invoke(std::forward<F>(f), std::forward<Exp>(exp).error()); + return result(unexpect, monostate{}); +} + +template <class Exp, class F, + detail::enable_if_t<std::is_void<exp_t<Exp>>::value> * = nullptr, + class Ret = decltype(detail::invoke(std::declval<F>(), + std::declval<Exp>().error())), + detail::enable_if_t<!std::is_void<Ret>::value> * = nullptr> +constexpr auto map_error_impl(Exp &&exp, F &&f) + -> expected<exp_t<Exp>, detail::decay_t<Ret>> { + using result = expected<exp_t<Exp>, detail::decay_t<Ret>>; + + return exp.has_value() + ? 
result() + : result(unexpect, detail::invoke(std::forward<F>(f), + std::forward<Exp>(exp).error())); +} + +template <class Exp, class F, + detail::enable_if_t<std::is_void<exp_t<Exp>>::value> * = nullptr, + class Ret = decltype(detail::invoke(std::declval<F>(), + std::declval<Exp>().error())), + detail::enable_if_t<std::is_void<Ret>::value> * = nullptr> +auto map_error_impl(Exp &&exp, F &&f) -> expected<exp_t<Exp>, monostate> { + using result = expected<exp_t<Exp>, monostate>; + if (exp.has_value()) { + return result(); + } + + detail::invoke(std::forward<F>(f), std::forward<Exp>(exp).error()); + return result(unexpect, monostate{}); +} +#endif + +#ifdef TL_EXPECTED_CXX14 +template <class Exp, class F, + class Ret = decltype(detail::invoke(std::declval<F>(), + std::declval<Exp>().error())), + detail::enable_if_t<!std::is_void<Ret>::value> * = nullptr> +constexpr auto or_else_impl(Exp &&exp, F &&f) { + static_assert(detail::is_expected<Ret>::value, "F must return an expected"); + return exp.has_value() + ? std::forward<Exp>(exp) + : detail::invoke(std::forward<F>(f), std::forward<Exp>(exp).error()); +} + +template <class Exp, class F, + class Ret = decltype(detail::invoke(std::declval<F>(), + std::declval<Exp>().error())), + detail::enable_if_t<std::is_void<Ret>::value> * = nullptr> +detail::decay_t<Exp> or_else_impl(Exp &&exp, F &&f) { + return exp.has_value() + ? std::forward<Exp>(exp) + : (detail::invoke(std::forward<F>(f), std::forward<Exp>(exp).error()), + std::forward<Exp>(exp)); +} +#else +template <class Exp, class F, + class Ret = decltype(detail::invoke(std::declval<F>(), + std::declval<Exp>().error())), + detail::enable_if_t<!std::is_void<Ret>::value> * = nullptr> +auto or_else_impl(Exp &&exp, F &&f) -> Ret { + static_assert(detail::is_expected<Ret>::value, "F must return an expected"); + return exp.has_value() + ? 
std::forward<Exp>(exp) + : detail::invoke(std::forward<F>(f), std::forward<Exp>(exp).error()); +} + +template <class Exp, class F, + class Ret = decltype(detail::invoke(std::declval<F>(), + std::declval<Exp>().error())), + detail::enable_if_t<std::is_void<Ret>::value> * = nullptr> +detail::decay_t<Exp> or_else_impl(Exp &&exp, F &&f) { + return exp.has_value() + ? std::forward<Exp>(exp) + : (detail::invoke(std::forward<F>(f), std::forward<Exp>(exp).error()), + std::forward<Exp>(exp)); +} +#endif +} // namespace detail + +template <class T, class E, class U, class F> +constexpr bool operator==(const expected<T, E> &lhs, + const expected<U, F> &rhs) { + return (lhs.has_value() != rhs.has_value()) + ? false + : (!lhs.has_value() ? lhs.error() == rhs.error() : *lhs == *rhs); +} +template <class T, class E, class U, class F> +constexpr bool operator!=(const expected<T, E> &lhs, + const expected<U, F> &rhs) { + return (lhs.has_value() != rhs.has_value()) + ? true + : (!lhs.has_value() ? lhs.error() != rhs.error() : *lhs != *rhs); +} + +template <class T, class E, class U> +constexpr bool operator==(const expected<T, E> &x, const U &v) { + return x.has_value() ? *x == v : false; +} +template <class T, class E, class U> +constexpr bool operator==(const U &v, const expected<T, E> &x) { + return x.has_value() ? *x == v : false; +} +template <class T, class E, class U> +constexpr bool operator!=(const expected<T, E> &x, const U &v) { + return x.has_value() ? *x != v : true; +} +template <class T, class E, class U> +constexpr bool operator!=(const U &v, const expected<T, E> &x) { + return x.has_value() ? *x != v : true; +} + +template <class T, class E> +constexpr bool operator==(const expected<T, E> &x, const unexpected<E> &e) { + return x.has_value() ? false : x.error() == e.value(); +} +template <class T, class E> +constexpr bool operator==(const unexpected<E> &e, const expected<T, E> &x) { + return x.has_value() ? 
false : x.error() == e.value(); +} +template <class T, class E> +constexpr bool operator!=(const expected<T, E> &x, const unexpected<E> &e) { + return x.has_value() ? true : x.error() != e.value(); +} +template <class T, class E> +constexpr bool operator!=(const unexpected<E> &e, const expected<T, E> &x) { + return x.has_value() ? true : x.error() != e.value(); +} + +// TODO is_swappable +template <class T, class E, + detail::enable_if_t<std::is_move_constructible<T>::value && + std::is_move_constructible<E>::value> * = nullptr> +void swap(expected<T, E> &lhs, + expected<T, E> &rhs) noexcept(noexcept(lhs.swap(rhs))) { + lhs.swap(rhs); +} +} // namespace tl + +#define TL_OPTIONAL_EXPECTED_MUTEX +#endif diff --git a/src/include/filepath.h b/src/include/filepath.h new file mode 100644 index 000000000..d0965ad0c --- /dev/null +++ b/src/include/filepath.h @@ -0,0 +1,250 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net> + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. + * + */ + + +#ifndef CEPH_FILEPATH_H +#define CEPH_FILEPATH_H + +/* + * BUG: /a/b/c is equivalent to a/b/c in dentry-breakdown, but not string. + * -> should it be different? how? should this[0] be "", with depth 4? + * + */ + + +#include <iosfwd> +#include <string> +#include <string_view> +#include <vector> + +#include "buffer.h" +#include "encoding.h" +#include "include/types.h" +#include "include/fs_types.h" + +#include "common/Formatter.h" + + +class filepath { + inodeno_t ino = 0; // base inode. ino=0 implies pure relative path. + std::string path; // relative path. + + /** bits - path segments + * this is ['a', 'b', 'c'] for both the aboslute and relative case. 
+ *
+ * NOTE: this value is LAZILY maintained... i.e. it's a cache
+ */
+  mutable std::vector<std::string> bits;
+  // set only by decode(); tells parse_bits() to preserve empty path
+  // components rather than dropping them
+  bool encoded = false;
+
+  // rebuild the flat 'path' string from the cached 'bits' components
+  // (inverse of parse_bits())
+  void rebuild_path() {
+    path.clear();
+    for (unsigned i=0; i<bits.size(); i++) {
+      if (i) path += "/";
+      path += bits[i];
+    }
+  }
+  // split 'path' on '/' into the 'bits' cache.  const because 'bits' is a
+  // mutable cache refreshed lazily from const accessors.
+  void parse_bits() const {
+    bits.clear();
+    int off = 0;
+    while (off < (int)path.length()) {
+      // NOTE(review): find() returns std::string::npos (a size_t); this
+      // relies on the implicit conversion to a negative int -- confirm
+      int nextslash = path.find('/', off);
+      if (nextslash < 0)
+        nextslash = path.length(); // no more slashes
+      if (((nextslash - off) > 0) || encoded) {
+        // skip empty components unless they were introduced deliberately
+        // see commit message for more detail
+        bits.push_back( path.substr(off,nextslash-off) );
+      }
+      off = nextslash+1;
+    }
+  }
+
+ public:
+  filepath() = default;
+  filepath(std::string_view p, inodeno_t i) : ino(i), path(p) {}
+  // copy ctor: copies the parsed-bits cache as well as the raw path
+  filepath(const filepath& o) {
+    ino = o.ino;
+    path = o.path;
+    bits = o.bits;
+    encoded = o.encoded;
+  }
+  // path-less filepath rooted at inode i
+  filepath(inodeno_t i) : ino(i) {}
+  filepath& operator=(const char* path) {
+    set_path(path);
+    return *this;
+  }
+
+  /*
+   * if we are fed a relative path as a string, either set ino=0 (strictly
+   * relative) or 1 (absolute). throw out any leading '/'. 
+ */
+  filepath(std::string_view s) { set_path(s); }
+  filepath(const char* s) { set_path(s); }
+
+  // set both the relative path and the base inode explicitly
+  void set_path(std::string_view s, inodeno_t b) {
+    path = s;
+    ino = b;
+  }
+  // set from a string: a leading '/' means absolute (ino=1, slash stripped),
+  // otherwise pure relative (ino=0).  Invalidates the cached 'bits'.
+  void set_path(std::string_view s) {
+    // check for empty first: unlike std::string, string_view::operator[]
+    // at index 0 of an empty view is undefined behavior (no terminating '\0')
+    if (!s.empty() && s[0] == '/') {
+      path = s.substr(1);
+      ino = 1;
+    } else {
+      ino = 0;
+      path = s;
+    }
+    bits.clear();
+  }
+
+
+  // accessors
+  inodeno_t get_ino() const { return ino; }
+  const std::string& get_path() const { return path; }
+  const char *c_str() const { return path.c_str(); }
+
+  int length() const { return path.length(); }
+  // number of path components (parses lazily into 'bits')
+  unsigned depth() const {
+    if (bits.empty() && path.length() > 0) parse_bits();
+    return bits.size();
+  }
+  bool empty() const { return path.length() == 0 && ino == 0; }
+
+  bool absolute() const { return ino == 1; }
+  bool pure_relative() const { return ino == 0; }
+  bool ino_relative() const { return ino > 0; }
+
+  // i-th path component (parses lazily; no extra bounds check)
+  const std::string& operator[](int i) const {
+    if (bits.empty() && path.length() > 0) parse_bits();
+    return bits[i];
+  }
+
+  // final path component; asserts the path is non-empty
+  const std::string& last_dentry() const {
+    if (bits.empty() && path.length() > 0) parse_bits();
+    ceph_assert(!bits.empty());
+    return bits[ bits.size()-1 ];
+  }
+
+  // first s components, keeping the same base ino
+  // NOTE(review): reads 'bits' directly without the lazy parse; assumes a
+  // prior depth()/operator[] call populated the cache -- confirm callers
+  filepath prefixpath(int s) const {
+    filepath t(ino);
+    for (int i=0; i<s; i++)
+      t.push_dentry(bits[i]);
+    return t;
+  }
+  // components from index s onward, as a pure relative path
+  filepath postfixpath(int s) const {
+    filepath t;
+    for (unsigned i=s; i<bits.size(); i++)
+      t.push_dentry(bits[i]);
+    return t;
+  }
+
+
+  // modifiers
+  // string can be relative "a/b/c" (ino=0) or absolute "/a/b/c" (ino=1)
+  void _set_ino(inodeno_t i) { ino = i; }
+  void clear() {
+    ino = 0;
+    path = "";
+    bits.clear();
+  }
+
+  // drop the last component, keeping 'path' and 'bits' in sync
+  void pop_dentry() {
+    if (bits.empty() && path.length() > 0)
+      parse_bits();
+    bits.pop_back();
+    rebuild_path();
+  }
+  // append one component, keeping 'path' and 'bits' in sync
+  void push_dentry(std::string_view s) {
+    if (bits.empty() && path.length() > 0)
+      parse_bits();
+    if (!bits.empty())
+      path += "/";
+    path += s;
+    bits.emplace_back(s);
+  }
+  void push_dentry(const std::string& s) {
+    push_dentry(std::string_view(s));
+  }
+  void 
push_dentry(const char *cs) { + push_dentry(std::string_view(cs, strlen(cs))); + } + void push_front_dentry(const std::string& s) { + bits.insert(bits.begin(), s); + rebuild_path(); + } + void append(const filepath& a) { + ceph_assert(a.pure_relative()); + for (unsigned i=0; i<a.depth(); i++) + push_dentry(a[i]); + } + + // encoding + void encode(ceph::buffer::list& bl) const { + using ceph::encode; + __u8 struct_v = 1; + encode(struct_v, bl); + encode(ino, bl); + encode(path, bl); + } + void decode(ceph::buffer::list::const_iterator& blp) { + using ceph::decode; + bits.clear(); + __u8 struct_v; + decode(struct_v, blp); + decode(ino, blp); + decode(path, blp); + encoded = true; + } + void dump(ceph::Formatter *f) const { + f->dump_unsigned("base_ino", ino); + f->dump_string("relative_path", path); + } + static void generate_test_instances(std::list<filepath*>& o) { + o.push_back(new filepath); + o.push_back(new filepath("/usr/bin", 0)); + o.push_back(new filepath("/usr/sbin", 1)); + o.push_back(new filepath("var/log", 1)); + o.push_back(new filepath("foo/bar", 101)); + } + + bool is_last_dot_or_dotdot() const { + if (depth() > 0) { + std::string dname = last_dentry(); + if (dname == "." || dname == "..") { + return true; + } + } + + return false; + } + + bool is_last_snap() const { + // walk into snapdir? 
+ return depth() > 0 && bits[0].length() == 0; + } +}; + +WRITE_CLASS_ENCODER(filepath) + +inline std::ostream& operator<<(std::ostream& out, const filepath& path) +{ + if (path.get_ino()) { + out << '#' << path.get_ino(); + if (path.length()) + out << '/'; + } + return out << path.get_path(); +} + +#endif diff --git a/src/include/frag.h b/src/include/frag.h new file mode 100644 index 000000000..ec18bddfb --- /dev/null +++ b/src/include/frag.h @@ -0,0 +1,615 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net> + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. + * + */ + +#ifndef CEPH_FRAG_H +#define CEPH_FRAG_H + +#include <boost/container/small_vector.hpp> + +#include <iostream> + +#include <stdint.h> +#include <stdio.h> + +#include "buffer.h" +#include "compact_map.h" + +#include "ceph_frag.h" +#include "include/encoding.h" +#include "include/ceph_assert.h" + +#include "common/dout.h" + +/* + * + * the goal here is to use a binary split strategy to partition a namespace. + * frag_t represents a particular fragment. bits() tells you the size of the + * fragment, and value() it's name. this is roughly analogous to an ip address + * and netmask. + * + * fragtree_t represents an entire namespace and it's partition. it essentially + * tells you where fragments are split into other fragments, and by how much + * (i.e. by how many bits, resulting in a power of 2 number of child fragments). + * + * this vaguely resembles a btree, in that when a fragment becomes large or small + * we can split or merge, except that there is no guarantee of being balanced. + * + * presumably we are partitioning the output of a (perhaps specialized) hash + * function. 
+ */ + +/** + * frag_t + * + * description of an individual fragment. that is, a particular piece + * of the overall namespace. + * + * this is conceptually analogous to an ip address and netmask. + * + * a value v falls "within" fragment f iff (v & f.mask()) == f.value(). + * + * we write it as v/b, where v is a value and b is the number of bits. + * 0/0 (bits==0) corresponds to the entire namespace. if we bisect that, + * we get 0/1 and 1/1. quartering gives us 0/2, 1/2, 2/2, 3/2. and so on. + * + * this makes the right most bit of v the "most significant", which is the + * opposite of what we usually see. + */ + +/* + * TODO: + * - get_first_child(), next_sibling(int parent_bits) to make (possibly partial) + * iteration efficient (see, e.g., try_assimilate_children() + * - rework frag_t so that we mask the left-most (most significant) bits instead of + * the right-most (least significant) bits. just because it's more intuitive, and + * matches the network/netmask concept. + */ + +class frag_t { + /* + * encoding is dictated by frag_* functions in ceph_fs.h. use those + * helpers _exclusively_. 
+ */
+public:
+  // raw encoded form; layout is owned by the ceph_frag_* helpers above
+  using _frag_t = uint32_t;
+
+  frag_t() = default;
+  // fragment covering value v with b significant bits (written "v/b")
+  frag_t(unsigned v, unsigned b) : _enc(ceph_frag_make(b, v)) { }
+  frag_t(_frag_t e) : _enc(e) { }
+
+  // constructors
+  // adopt an already-encoded raw value
+  void from_unsigned(unsigned e) { _enc = e; }
+
+  // accessors
+  unsigned value() const { return ceph_frag_value(_enc); }
+  unsigned bits() const { return ceph_frag_bits(_enc); }
+  unsigned mask() const { return ceph_frag_mask(_enc); }
+  unsigned mask_shift() const { return ceph_frag_mask_shift(_enc); }
+
+  // implicit conversion to the raw encoding
+  operator _frag_t() const { return _enc; }
+
+  // tests
+  // does value v fall within this fragment?
+  bool contains(unsigned v) const { return ceph_frag_contains_value(_enc, v); }
+  // is fragment 'sub' entirely contained by this fragment?
+  bool contains(frag_t sub) const { return ceph_frag_contains_frag(_enc, sub._enc); }
+  bool is_root() const { return bits() == 0; }
+  // enclosing fragment; asserts this is not the root
+  frag_t parent() const {
+    ceph_assert(bits() > 0);
+    return frag_t(ceph_frag_parent(_enc));
+  }
+
+  // splitting
+  // i-th child when splitting this fragment by nb bits (i in [0, 2^nb))
+  frag_t make_child(int i, int nb) const {
+    ceph_assert(i < (1<<nb));
+    return frag_t(ceph_frag_make_child(_enc, nb, i));
+  }
+  // append all 2^nb children to 'fragments' (any push_back-able container)
+  template<typename T>
+  void split(int nb, T& fragments) const {
+    ceph_assert(nb > 0);
+    unsigned nway = 1 << nb;
+    for (unsigned i=0; i<nway; i++)
+      fragments.push_back(make_child(i, nb));
+  }
+
+  // binary splitting
+  frag_t left_child() const { return frag_t(ceph_frag_left_child(_enc)); }
+  frag_t right_child() const { return frag_t(ceph_frag_right_child(_enc)); }
+
+  bool is_left() const { return ceph_frag_is_left_child(_enc); }
+  bool is_right() const { return ceph_frag_is_right_child(_enc); }
+  // the other child of our parent; asserts this is not the root
+  frag_t get_sibling() const {
+    ceph_assert(!is_root());
+    return frag_t(ceph_frag_sibling(_enc));
+  }
+
+  // sequencing
+  bool is_leftmost() const { return ceph_frag_is_leftmost(_enc); }
+  bool is_rightmost() const { return ceph_frag_is_rightmost(_enc); }
+  // next fragment in sequence; asserts this is not the rightmost one
+  frag_t next() const {
+    ceph_assert(!is_rightmost());
+    return frag_t(ceph_frag_next(_enc));
+  }
+
+  // parse
+  // parse "value/bits" (value in hex, bits in decimal);
+  // returns false and leaves *this untouched on malformed input
+  bool parse(const char *s) {
+    int pvalue, pbits;
+    int r = sscanf(s, "%x/%d", &pvalue, &pbits);
+    if (r == 2) {
+ 
*this = frag_t(pvalue, pbits); + return true; + } + return false; + } + + void encode(ceph::buffer::list& bl) const { + ceph::encode_raw(_enc, bl); + } + void decode(ceph::buffer::list::const_iterator& p) { + __u32 v; + ceph::decode_raw(v, p); + _enc = v; + } + bool operator<(const frag_t& b) const + { + if (value() != b.value()) + return value() < b.value(); + else + return bits() < b.bits(); + } +private: + _frag_t _enc = 0; +}; +WRITE_CLASS_ENCODER(frag_t) + +inline std::ostream& operator<<(std::ostream& out, const frag_t& hb) +{ + //out << std::hex << hb.value() << std::dec << "/" << hb.bits() << '='; + unsigned num = hb.bits(); + if (num) { + unsigned val = hb.value(); + for (unsigned bit = 23; num; num--, bit--) + out << ((val & (1<<bit)) ? '1':'0'); + } + return out << '*'; +} + + +using frag_vec_t = boost::container::small_vector<frag_t, 4>; + +/** + * fragtree_t -- partition an entire namespace into one or more frag_t's. + */ +class fragtree_t { + // pairs <f, b>: + // frag_t f is split by b bits. + // if child frag_t does not appear, it is not split. 
+public:
+  // maps a split fragment f to the number of bits it is split by;
+  // a fragment absent from this map is not split (i.e. it is a leaf)
+  compact_map<frag_t,int32_t> _splits;
+
+public:
+  // -------------
+  // basics
+  void swap(fragtree_t& other) {
+    _splits.swap(other._splits);
+  }
+  void clear() {
+    _splits.clear();
+  }
+
+  // -------------
+  // accessors
+  bool empty() const {
+    return _splits.empty();
+  }
+  // number of bits fragment hb is split by, or 0 if it is not split
+  int get_split(const frag_t hb) const {
+    compact_map<frag_t,int32_t>::const_iterator p = _splits.find(hb);
+    if (p == _splits.end())
+      return 0;
+    else
+      return p->second;
+  }
+
+
+  // true iff x itself is a leaf of the tree (the only leaf under x is x)
+  bool is_leaf(frag_t x) const {
+    frag_vec_t s;
+    get_leaves_under(x, s);
+    //generic_dout(10) << "is_leaf(" << x << ") -> " << ls << dendl;
+    return s.size() == 1 && s.front() == x;
+  }
+
+  /**
+   * get_leaves -- list all leaves
+   */
+  template<typename T>
+  void get_leaves(T& c) const {
+    return get_leaves_under_split(frag_t(), c);
+  }
+
+  /**
+   * get_leaves_under_split -- list all leaves under a known split point (or root)
+   */
+  template<typename T>
+  void get_leaves_under_split(frag_t under, T& c) const {
+    // iterative depth-first walk using an explicit stack
+    frag_vec_t s;
+    s.push_back(under);
+    while (!s.empty()) {
+      frag_t t = s.back();
+      s.pop_back();
+      int nb = get_split(t);
+      if (nb)
+        t.split(nb, s);   // queue up children
+      else
+        c.push_back(t);   // not split, it's a leaf.
+    }
+  }
+
+  /**
+   * get_branch -- get branch point at OR above frag @a x
+   * - may be @a x itself, if @a x is a split
+   * - may be root (frag_t())
+   */
+  frag_t get_branch(frag_t x) const {
+    while (1) {
+      if (x == frag_t()) return x;  // root
+      if (get_split(x)) return x;   // found it!
+      x = x.parent();
+    }
+  }
+
+  /**
+   * get_branch_above -- get a branch point above frag @a x
+   * - may be root (frag_t())
+   * - may NOT be @a x, even if @a x is a split.
+   */
+  frag_t get_branch_above(frag_t x) const {
+    while (1) {
+      if (x == frag_t()) return x;  // root
+      x = x.parent();               // step up before testing: never return x itself
+      if (get_split(x)) return x;   // found it! 
+ } + } + + + /** + * get_branch_or_leaf -- get branch or leaf point parent for frag @a x + * - may be @a x itself, if @a x is a split or leaf + * - may be root (frag_t()) + */ + frag_t get_branch_or_leaf(frag_t x) const { + frag_t branch = get_branch(x); + int nb = get_split(branch); + if (nb > 0 && // if branch is a split, and + branch.bits() + nb <= x.bits()) // one of the children is or contains x + return frag_t(x.value(), branch.bits()+nb); // then return that child (it's a leaf) + else + return branch; + } + + /** + * get_leaves_under(x, ls) -- search for any leaves fully contained by x + */ + template<typename T> + void get_leaves_under(frag_t x, T& c) const { + frag_vec_t s; + s.push_back(get_branch_or_leaf(x)); + while (!s.empty()) { + frag_t t = s.back(); + s.pop_back(); + if (t.bits() >= x.bits() && // if t is more specific than x, and + !x.contains(t)) // x does not contain t, + continue; // then skip + int nb = get_split(t); + if (nb) + t.split(nb, s); // queue up children + else if (x.contains(t)) + c.push_back(t); // not spit, it's a leaf. + } + } + + /** + * contains(fg) -- does fragtree contain the specific frag @a x + */ + bool contains(frag_t x) const { + frag_vec_t s; + s.push_back(get_branch(x)); + while (!s.empty()) { + frag_t t = s.back(); + s.pop_back(); + if (t.bits() >= x.bits() && // if t is more specific than x, and + !x.contains(t)) // x does not contain t, + continue; // then skip + int nb = get_split(t); + if (nb) { + if (t == x) return false; // it's split. + t.split(nb, s); // queue up children + } else { + if (t == x) return true; // it's there. + } + } + return false; + } + + /** + * operator[] -- map a (hash?) value to a frag + */ + frag_t operator[](unsigned v) const { + frag_t t; + while (1) { + ceph_assert(t.contains(v)); + int nb = get_split(t); + + // is this a leaf? + if (nb == 0) return t; // done. + + // pick appropriate child fragment. 
+ unsigned nway = 1 << nb; + unsigned i; + for (i=0; i<nway; i++) { + frag_t n = t.make_child(i, nb); + if (n.contains(v)) { + t = n; + break; + } + } + ceph_assert(i < nway); + } + } + + + // --------------- + // modifiers + void split(frag_t x, int b, bool simplify=true) { + ceph_assert(is_leaf(x)); + _splits[x] = b; + + if (simplify) + try_assimilate_children(get_branch_above(x)); + } + void merge(frag_t x, int b, bool simplify=true) { + ceph_assert(!is_leaf(x)); + ceph_assert(_splits[x] == b); + _splits.erase(x); + + if (simplify) + try_assimilate_children(get_branch_above(x)); + } + + /* + * if all of a given split's children are identically split, + * then the children can be assimilated. + */ + void try_assimilate_children(frag_t x) { + int nb = get_split(x); + if (!nb) return; + frag_vec_t children; + x.split(nb, children); + int childbits = 0; + for (auto& frag : children) { + int cb = get_split(frag); + if (!cb) return; // nope. + if (childbits && cb != childbits) return; // not the same + childbits = cb; + } + // all children are split with childbits! + for (auto& frag : children) + _splits.erase(frag); + _splits[x] += childbits; + } + + bool force_to_leaf(CephContext *cct, frag_t x) { + if (is_leaf(x)) + return false; + + lgeneric_dout(cct, 10) << "force_to_leaf " << x << " on " << _splits << dendl; + + frag_t parent = get_branch_or_leaf(x); + ceph_assert(parent.bits() <= x.bits()); + lgeneric_dout(cct, 10) << "parent is " << parent << dendl; + + // do we need to split from parent to x? 
+ if (parent.bits() < x.bits()) { + int spread = x.bits() - parent.bits(); + int nb = get_split(parent); + lgeneric_dout(cct, 10) << "spread " << spread << ", parent splits by " << nb << dendl; + if (nb == 0) { + // easy: split parent (a leaf) by the difference + lgeneric_dout(cct, 10) << "splitting parent " << parent << " by spread " << spread << dendl; + split(parent, spread); + ceph_assert(is_leaf(x)); + return true; + } + ceph_assert(nb > spread); + + // add an intermediary split + merge(parent, nb, false); + split(parent, spread, false); + + frag_vec_t subs; + parent.split(spread, subs); + for (auto& frag : subs) { + lgeneric_dout(cct, 10) << "splitting intermediate " << frag << " by " << (nb-spread) << dendl; + split(frag, nb - spread, false); + } + } + + // x is now a leaf or split. + // hoover up any children. + frag_vec_t s; + s.push_back(x); + while (!s.empty()) { + frag_t t = s.back(); + s.pop_back(); + int nb = get_split(t); + if (nb) { + lgeneric_dout(cct, 10) << "merging child " << t << " by " << nb << dendl; + merge(t, nb, false); // merge this point, and + t.split(nb, s); // queue up children + } + } + + lgeneric_dout(cct, 10) << "force_to_leaf done" << dendl; + ceph_assert(is_leaf(x)); + return true; + } + + // encoding + void encode(ceph::buffer::list& bl) const { + using ceph::encode; + encode(_splits, bl); + } + void decode(ceph::buffer::list::const_iterator& p) { + using ceph::decode; + decode(_splits, p); + } + void encode_nohead(ceph::buffer::list& bl) const { + using ceph::encode; + for (compact_map<frag_t,int32_t>::const_iterator p = _splits.begin(); + p != _splits.end(); + ++p) { + encode(p->first, bl); + encode(p->second, bl); + } + } + void decode_nohead(int n, ceph::buffer::list::const_iterator& p) { + using ceph::decode; + _splits.clear(); + while (n-- > 0) { + frag_t f; + decode(f, p); + decode(_splits[f], p); + } + } + + void print(std::ostream& out) { + out << "fragtree_t("; + frag_vec_t s; + s.push_back(frag_t()); + while 
(!s.empty()) { + frag_t t = s.back(); + s.pop_back(); + // newline + indent? + if (t.bits()) { + out << std::endl; + for (unsigned i=0; i<t.bits(); i++) out << ' '; + } + int nb = get_split(t); + if (nb) { + out << t << " %" << nb; + t.split(nb, s); // queue up children + } else { + out << t; + } + } + out << ")"; + } + + void dump(ceph::Formatter *f) const { + f->open_array_section("splits"); + for (auto p = _splits.begin(); p != _splits.end(); ++p) { + f->open_object_section("split"); + std::ostringstream frag_str; + frag_str << p->first; + f->dump_string("frag", frag_str.str()); + f->dump_int("children", p->second); + f->close_section(); // split + } + f->close_section(); // splits + } +}; +WRITE_CLASS_ENCODER(fragtree_t) + +inline bool operator==(const fragtree_t& l, const fragtree_t& r) { + return l._splits == r._splits; +} +inline bool operator!=(const fragtree_t& l, const fragtree_t& r) { + return l._splits != r._splits; +} + +inline std::ostream& operator<<(std::ostream& out, const fragtree_t& ft) +{ + out << "fragtree_t("; + + for (compact_map<frag_t,int32_t>::const_iterator p = ft._splits.begin(); + p != ft._splits.end(); + ++p) { + if (p != ft._splits.begin()) + out << " "; + out << p->first << "^" << p->second; + } + return out << ")"; +} + +/** + * fragset_t -- a set of fragments + */ +class fragset_t { + std::set<frag_t> _set; + +public: + const std::set<frag_t> &get() const { return _set; } + std::set<frag_t>::const_iterator begin() const { return _set.begin(); } + std::set<frag_t>::const_iterator end() const { return _set.end(); } + + bool empty() const { return _set.empty(); } + + bool contains(frag_t f) const { + while (1) { + if (_set.count(f)) return true; + if (f.bits() == 0) return false; + f = f.parent(); + } + } + + void clear() { + _set.clear(); + } + + void insert_raw(frag_t f){ + _set.insert(f); + } + void insert(frag_t f) { + _set.insert(f); + simplify(); + } + + void simplify() { + auto it = _set.begin(); + while (it != _set.end()) { + 
if (!it->is_root() && + _set.count(it->get_sibling())) { + _set.erase(it->get_sibling()); + auto ret = _set.insert(it->parent()); + _set.erase(it); + it = ret.first; + } else { + ++it; + } + } + } + + void encode(ceph::buffer::list& bl) const { + ceph::encode(_set, bl); + } + void decode(ceph::buffer::list::const_iterator& p) { + ceph::decode(_set, p); + } +}; +WRITE_CLASS_ENCODER(fragset_t) + + +inline std::ostream& operator<<(std::ostream& out, const fragset_t& fs) +{ + return out << "fragset_t(" << fs.get() << ")"; +} + +#endif diff --git a/src/include/fs_types.h b/src/include/fs_types.h new file mode 100644 index 000000000..c1932bfcc --- /dev/null +++ b/src/include/fs_types.h @@ -0,0 +1,175 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +#ifndef CEPH_INCLUDE_FS_TYPES_H +#define CEPH_INCLUDE_FS_TYPES_H + +#include "types.h" +class JSONObj; + +#define CEPHFS_EBLOCKLISTED 108 +#define CEPHFS_EPERM 1 +#define CEPHFS_ESTALE 116 +#define CEPHFS_ENOSPC 28 +#define CEPHFS_ETIMEDOUT 110 +#define CEPHFS_EIO 5 +#define CEPHFS_ENOTCONN 107 +#define CEPHFS_EEXIST 17 +#define CEPHFS_EINTR 4 +#define CEPHFS_EINVAL 22 +#define CEPHFS_EBADF 9 +#define CEPHFS_EROFS 30 +#define CEPHFS_EAGAIN 11 +#define CEPHFS_EACCES 13 +#define CEPHFS_ELOOP 40 +#define CEPHFS_EISDIR 21 +#define CEPHFS_ENOENT 2 +#define CEPHFS_ENOTDIR 20 +#define CEPHFS_ENAMETOOLONG 36 +#define CEPHFS_EBUSY 16 +#define CEPHFS_EDQUOT 122 +#define CEPHFS_EFBIG 27 +#define CEPHFS_ERANGE 34 +#define CEPHFS_ENXIO 6 +#define CEPHFS_ECANCELED 125 +#define CEPHFS_ENODATA 61 +#define CEPHFS_EOPNOTSUPP 95 +#define CEPHFS_EXDEV 18 +#define CEPHFS_ENOMEM 12 +#define CEPHFS_ENOTRECOVERABLE 131 +#define CEPHFS_ENOSYS 38 +#define CEPHFS_EWOULDBLOCK CEPHFS_EAGAIN +#define CEPHFS_ENOTEMPTY 39 +#define CEPHFS_EDEADLK 35 +#define CEPHFS_EDEADLOCK CEPHFS_EDEADLK +#define CEPHFS_EDOM 33 +#define CEPHFS_EMLINK 31 +#define CEPHFS_ETIME 62 +#define CEPHFS_EOLDSNAPC 85 +#define 
CEPHFS_EFAULT 14 +#define CEPHFS_EISCONN 106 +#define CEPHFS_EMULTIHOP 72 + +// taken from linux kernel: include/uapi/linux/fcntl.h +#define CEPHFS_AT_FDCWD -100 /* Special value used to indicate + openat should use the current + working directory. */ + +// -------------------------------------- +// ino + +typedef uint64_t _inodeno_t; + +struct inodeno_t { + _inodeno_t val; + inodeno_t() : val(0) {} + // cppcheck-suppress noExplicitConstructor + inodeno_t(_inodeno_t v) : val(v) {} + inodeno_t operator+=(inodeno_t o) { val += o.val; return *this; } + operator _inodeno_t() const { return val; } + + void encode(ceph::buffer::list& bl) const { + using ceph::encode; + encode(val, bl); + } + void decode(ceph::buffer::list::const_iterator& p) { + using ceph::decode; + decode(val, p); + } +} __attribute__ ((__may_alias__)); +WRITE_CLASS_ENCODER(inodeno_t) + +template<> +struct denc_traits<inodeno_t> { + static constexpr bool supported = true; + static constexpr bool featured = false; + static constexpr bool bounded = true; + static constexpr bool need_contiguous = true; + static void bound_encode(const inodeno_t &o, size_t& p) { + denc(o.val, p); + } + static void encode(const inodeno_t &o, ceph::buffer::list::contiguous_appender& p) { + denc(o.val, p); + } + static void decode(inodeno_t& o, ceph::buffer::ptr::const_iterator &p) { + denc(o.val, p); + } +}; + +inline std::ostream& operator<<(std::ostream& out, const inodeno_t& ino) { + return out << std::hex << "0x" << ino.val << std::dec; +} + +namespace std { +template<> +struct hash<inodeno_t> { + size_t operator()( const inodeno_t& x ) const { + static rjhash<uint64_t> H; + return H(x.val); + } +}; +} // namespace std + + +// file modes + +inline bool file_mode_is_readonly(int mode) { + return (mode & CEPH_FILE_MODE_WR) == 0; +} + + +// dentries +#define MAX_DENTRY_LEN 255 + +// -- +namespace ceph { + class Formatter; +} +void dump(const ceph_file_layout& l, ceph::Formatter *f); +void dump(const ceph_dir_layout& l, 
ceph::Formatter *f); + + + +// file_layout_t + +struct file_layout_t { + // file -> object mapping + uint32_t stripe_unit; ///< stripe unit, in bytes, + uint32_t stripe_count; ///< over this many objects + uint32_t object_size; ///< until objects are this big + + int64_t pool_id; ///< rados pool id + std::string pool_ns; ///< rados pool namespace + + file_layout_t(uint32_t su=0, uint32_t sc=0, uint32_t os=0) + : stripe_unit(su), + stripe_count(sc), + object_size(os), + pool_id(-1) { + } + + bool operator==(const file_layout_t&) const = default; + + static file_layout_t get_default() { + return file_layout_t(1<<22, 1, 1<<22); + } + + uint64_t get_period() const { + return static_cast<uint64_t>(stripe_count) * object_size; + } + + void from_legacy(const ceph_file_layout& fl); + void to_legacy(ceph_file_layout *fl) const; + + bool is_valid() const; + + void encode(ceph::buffer::list& bl, uint64_t features) const; + void decode(ceph::buffer::list::const_iterator& p); + void dump(ceph::Formatter *f) const; + void decode_json(JSONObj *obj); + static void generate_test_instances(std::list<file_layout_t*>& o); +}; +WRITE_CLASS_ENCODER_FEATURES(file_layout_t) + +std::ostream& operator<<(std::ostream& out, const file_layout_t &layout); + +#endif diff --git a/src/include/function2.hpp b/src/include/function2.hpp new file mode 100644 index 000000000..613e651c7 --- /dev/null +++ b/src/include/function2.hpp @@ -0,0 +1,1581 @@ + +// Copyright 2015-2018 Denis Blank <denis.blank at outlook dot com> +// Distributed under the Boost Software License, Version 1.0 +// (See accompanying file LICENSE_1_0.txt or copy at +// http://www.boost.org/LICENSE_1_0.txt) + +#ifndef FU2_INCLUDED_FUNCTION2_HPP_ +#define FU2_INCLUDED_FUNCTION2_HPP_ + +#include <cassert> +#include <cstdlib> +#include <memory> +#include <tuple> +#include <type_traits> +#include <utility> + +// Defines: +// - FU2_HAS_DISABLED_EXCEPTIONS +#if defined(FU2_WITH_DISABLED_EXCEPTIONS) || \ + 
defined(FU2_MACRO_DISABLE_EXCEPTIONS) +#define FU2_HAS_DISABLED_EXCEPTIONS +#else // FU2_WITH_DISABLED_EXCEPTIONS +#if defined(_MSC_VER) +#if !defined(_HAS_EXCEPTIONS) || (_HAS_EXCEPTIONS == 0) +#define FU2_HAS_DISABLED_EXCEPTIONS +#endif +#elif defined(__clang__) +#if !(__EXCEPTIONS && __has_feature(cxx_exceptions)) +#define FU2_HAS_DISABLED_EXCEPTIONS +#endif +#elif defined(__GNUC__) +#if !__EXCEPTIONS +#define FU2_HAS_DISABLED_EXCEPTIONS +#endif +#endif +#endif // FU2_WITH_DISABLED_EXCEPTIONS +// - FU2_HAS_NO_FUNCTIONAL_HEADER +#if !defined(FU2_WITH_NO_FUNCTIONAL_HEADER) || \ + !defined(FU2_NO_FUNCTIONAL_HEADER) || \ + !defined(FU2_HAS_DISABLED_EXCEPTIONS) +#define FU2_HAS_NO_FUNCTIONAL_HEADER +#include <functional> +#endif +// - FU2_HAS_CXX17_NOEXCEPT_FUNCTION_TYPE +#if defined(FU2_WITH_CXX17_NOEXCEPT_FUNCTION_TYPE) +#define FU2_HAS_CXX17_NOEXCEPT_FUNCTION_TYPE +#else // FU2_WITH_CXX17_NOEXCEPT_FUNCTION_TYPE +#if defined(_MSC_VER) +#if defined(_HAS_CXX17) && _HAS_CXX17 +#define FU2_HAS_CXX17_NOEXCEPT_FUNCTION_TYPE +#endif +#elif defined(__cpp_noexcept_function_type) +#define FU2_HAS_CXX17_NOEXCEPT_FUNCTION_TYPE +#elif defined(__cplusplus) && (__cplusplus >= 201703L) +#define FU2_HAS_CXX17_NOEXCEPT_FUNCTION_TYPE +#endif +#endif // FU2_WITH_CXX17_NOEXCEPT_FUNCTION_TYPE + +#if !defined(FU2_HAS_DISABLED_EXCEPTIONS) +#include <exception> +#endif + +namespace fu2 { +inline namespace abi_310 { +namespace detail { +template <typename Config, typename Property> +class function; + +template <typename...> +struct identity {}; + +// Equivalent to C++17's std::void_t which is targets a bug in GCC, +// that prevents correct SFINAE behavior. +// See http://stackoverflow.com/questions/35753920 for details. +template <typename...> +struct deduce_to_void : std::common_type<void> {}; + +template <typename... 
T> +using void_t = typename deduce_to_void<T...>::type; + +// Copy enabler helper class +template <bool /*Copyable*/> +struct copyable {}; +template <> +struct copyable<false> { + copyable() = default; + ~copyable() = default; + copyable(copyable const&) = delete; + copyable(copyable&&) = default; + copyable& operator=(copyable const&) = delete; + copyable& operator=(copyable&&) = default; +}; + +/// Configuration trait to configure the function_base class. +template <bool Owning, bool Copyable, std::size_t Capacity> +struct config { + // Is true if the function is copyable. + static constexpr auto const is_owning = Owning; + + // Is true if the function is copyable. + static constexpr auto const is_copyable = Copyable; + + // The internal capacity of the function + // used in small functor optimization. + static constexpr auto const capacity = Capacity; +}; + +/// A config which isn't compatible to other configs +template <bool Throws, bool HasStrongExceptGuarantee, typename... Args> +struct property { + // Is true when the function throws an exception on empty invocation. + static constexpr auto const is_throwing = Throws; + + // Is true when the function throws an exception on empty invocation. + static constexpr auto const is_strong_exception_guaranteed = Throws; +}; + +/// Provides utilities for invocing callable objects +namespace invocation { +/// Invokes the given callable object with the given arguments +template <typename Callable, typename... Args> +constexpr auto invoke(Callable&& callable, Args&&... args) noexcept( + noexcept(std::forward<Callable>(callable)(std::forward<Args>(args)...))) + -> decltype(std::forward<Callable>(callable)(std::forward<Args>(args)...)) { + + return std::forward<Callable>(callable)(std::forward<Args>(args)...); +} +/// Invokes the given member function pointer by reference +template <typename T, typename Type, typename Self, typename... Args> +constexpr auto invoke(Type T::*member, Self&& self, Args&&... 
args) noexcept( + noexcept((std::forward<Self>(self).*member)(std::forward<Args>(args)...))) + -> decltype((std::forward<Self>(self).* + member)(std::forward<Args>(args)...)) { + return (std::forward<Self>(self).*member)(std::forward<Args>(args)...); +} +/// Invokes the given member function pointer by pointer +template <typename T, typename Type, typename Self, typename... Args> +constexpr auto invoke(Type T::*member, Self&& self, Args&&... args) noexcept( + noexcept((std::forward<Self>(self)->*member)(std::forward<Args>(args)...))) + -> decltype( + (std::forward<Self>(self)->*member)(std::forward<Args>(args)...)) { + return (std::forward<Self>(self)->*member)(std::forward<Args>(args)...); +} +/// Invokes the given pointer to a scalar member by reference +template <typename T, typename Type, typename Self> +constexpr auto +invoke(Type T::*member, + Self&& self) noexcept(noexcept(std::forward<Self>(self).*member)) + -> decltype(std::forward<Self>(self).*member) { + return (std::forward<Self>(self).*member); +} +/// Invokes the given pointer to a scalar member by pointer +template <typename T, typename Type, typename Self> +constexpr auto +invoke(Type T::*member, + Self&& self) noexcept(noexcept(std::forward<Self>(self)->*member)) + -> decltype(std::forward<Self>(self)->*member) { + return std::forward<Self>(self)->*member; +} + +/// Deduces to a true type if the callable object can be invoked with +/// the given arguments. +/// We don't use invoke here because MSVC can't evaluate the nested expression +/// SFINAE here. +template <typename T, typename Args, typename = void> +struct can_invoke : std::false_type {}; +template <typename T, typename... Args> +struct can_invoke<T, identity<Args...>, + decltype((void)std::declval<T>()(std::declval<Args>()...))> + : std::true_type {}; +template <typename Pointer, typename T, typename... 
Args> +struct can_invoke<Pointer, identity<T&, Args...>, + decltype((void)((std::declval<T&>().*std::declval<Pointer>())( + std::declval<Args>()...)))> : std::true_type {}; +template <typename Pointer, typename T, typename... Args> +struct can_invoke<Pointer, identity<T&&, Args...>, + decltype( + (void)((std::declval<T&&>().*std::declval<Pointer>())( + std::declval<Args>()...)))> : std::true_type {}; +template <typename Pointer, typename T, typename... Args> +struct can_invoke<Pointer, identity<T*, Args...>, + decltype( + (void)((std::declval<T*>()->*std::declval<Pointer>())( + std::declval<Args>()...)))> : std::true_type {}; +template <typename Pointer, typename T> +struct can_invoke<Pointer, identity<T&>, + decltype((void)(std::declval<T&>().*std::declval<Pointer>()))> + : std::true_type {}; +template <typename Pointer, typename T> +struct can_invoke<Pointer, identity<T&&>, + decltype((void)(std::declval<T&&>().* + std::declval<Pointer>()))> : std::true_type { +}; +template <typename Pointer, typename T> +struct can_invoke<Pointer, identity<T*>, + decltype( + (void)(std::declval<T*>()->*std::declval<Pointer>()))> + : std::true_type {}; + +template <bool RequiresNoexcept, typename T, typename Args> +struct is_noexcept_correct : std::true_type {}; +template <typename T, typename... Args> +struct is_noexcept_correct<true, T, identity<Args...>> + : std::integral_constant<bool, noexcept(invoke(std::declval<T>(), + std::declval<Args>()...))> { +}; +} // end namespace invocation + +namespace overloading { +template <typename... Args> +struct overload_impl; +template <typename Current, typename Next, typename... Rest> +struct overload_impl<Current, Next, Rest...> : Current, + overload_impl<Next, Rest...> { + explicit overload_impl(Current current, Next next, Rest... rest) + : Current(std::move(current)), overload_impl<Next, Rest...>( + std::move(next), std::move(rest)...) 
{ + } + + using Current::operator(); + using overload_impl<Next, Rest...>::operator(); +}; +template <typename Current> +struct overload_impl<Current> : Current { + explicit overload_impl(Current current) : Current(std::move(current)) { + } + + using Current::operator(); +}; + +template <typename... T> +constexpr auto overload(T&&... callables) { + return overload_impl<std::decay_t<T>...>{std::forward<T>(callables)...}; +} +} // namespace overloading + +/// Declares the namespace which provides the functionality to work with a +/// type-erased object. +namespace type_erasure { +/// Specialization to work with addresses of callable objects +template <typename T, typename = void> +struct address_taker { + template <typename O> + static void* take(O&& obj) { + return std::addressof(obj); + } + static T& restore(void* ptr) { + return *static_cast<T*>(ptr); + } + static T const& restore(void const* ptr) { + return *static_cast<T const*>(ptr); + } + static T volatile& restore(void volatile* ptr) { + return *static_cast<T volatile*>(ptr); + } + static T const volatile& restore(void const volatile* ptr) { + return *static_cast<T const volatile*>(ptr); + } +}; +/// Specialization to work with addresses of raw function pointers +template <typename T> +struct address_taker<T, std::enable_if_t<std::is_pointer<T>::value>> { + template <typename O> + static void* take(O&& obj) { + return reinterpret_cast<void*>(obj); + } + template <typename O> + static T restore(O ptr) { + return reinterpret_cast<T>(const_cast<void*>(ptr)); + } +}; + +template <typename Box> +struct box_factory; +/// Store the allocator inside the box +template <bool IsCopyable, typename T, typename Allocator> +struct box : private Allocator { + friend box_factory<box>; + + T value_; + + explicit box(T value, Allocator allocator) + : Allocator(std::move(allocator)), value_(std::move(value)) { + } + + box(box&&) = default; + box(box const&) = default; + box& operator=(box&&) = default; + box& operator=(box 
const&) = default; + ~box() = default; +}; +template <typename T, typename Allocator> +struct box<false, T, Allocator> : private Allocator { + friend box_factory<box>; + + T value_; + + explicit box(T value, Allocator allocator) + : Allocator(std::move(allocator)), value_(std::move(value)) { + } + + box(box&&) = default; + box(box const&) = delete; + box& operator=(box&&) = default; + box& operator=(box const&) = delete; + ~box() = default; +}; + +template <bool IsCopyable, typename T, typename Allocator> +struct box_factory<box<IsCopyable, T, Allocator>> { + using real_allocator = + typename std::allocator_traits<std::decay_t<Allocator>>:: + template rebind_alloc<box<IsCopyable, T, Allocator>>; + + /// Allocates space through the boxed allocator + static box<IsCopyable, T, Allocator>* + box_allocate(box<IsCopyable, T, Allocator> const* me) { + real_allocator allocator(*static_cast<Allocator const*>(me)); + + return static_cast<box<IsCopyable, T, Allocator>*>( + std::allocator_traits<real_allocator>::allocate(allocator, 1U)); + } + + /// Destroys the box through the given allocator + static void box_deallocate(box<IsCopyable, T, Allocator>* me) { + real_allocator allocator(*static_cast<Allocator const*>(me)); + + me->~box(); + std::allocator_traits<real_allocator>::deallocate(allocator, me, 1U); + } +}; + +/// Creates a box containing the given value and allocator +template <bool IsCopyable, typename T, + typename Allocator = std::allocator<std::decay_t<T>>> +auto make_box(std::integral_constant<bool, IsCopyable>, T&& value, + Allocator&& allocator = Allocator{}) { + return box<IsCopyable, std::decay_t<T>, std::decay_t<Allocator>>{ + std::forward<T>(value), std::forward<Allocator>(allocator)}; +} + +template <typename T> +struct is_box : std::false_type {}; +template <bool IsCopyable, typename T, typename Allocator> +struct is_box<box<IsCopyable, T, Allocator>> : std::true_type {}; + +/// Provides access to the pointer to a heal allocated erased object +/// as well 
to the inplace storage. +union data_accessor { + data_accessor() = default; + explicit constexpr data_accessor(std::nullptr_t) noexcept : ptr_(nullptr) { + } + explicit constexpr data_accessor(void* ptr) noexcept : ptr_(ptr) { + } + + /// The pointer we use if the object is on the heap + void* ptr_; + /// The first field of the inplace storage + std::size_t inplace_storage_; +}; + +/// See opcode::op_fetch_empty +constexpr void write_empty(data_accessor* accessor, bool empty) noexcept { + accessor->inplace_storage_ = std::size_t(empty); +} + +template <typename From, typename To> +using transfer_const_t = + std::conditional_t<std::is_const<std::remove_pointer_t<From>>::value, + std::add_const_t<To>, To>; +template <typename From, typename To> +using transfer_volatile_t = + std::conditional_t<std::is_volatile<std::remove_pointer_t<From>>::value, + std::add_volatile_t<To>, To>; + +/// The retriever when the object is allocated inplace +template <typename T, typename Accessor> +constexpr auto retrieve(std::true_type /*is_inplace*/, Accessor from, + std::size_t from_capacity) { + using type = transfer_const_t<Accessor, transfer_volatile_t<Accessor, void>>*; + + /// Process the command by using the data inside the internal capacity + auto storage = &(from->inplace_storage_); + auto inplace = const_cast<void*>(static_cast<type>(storage)); + return type(std::align(alignof(T), sizeof(T), inplace, from_capacity)); +} + +/// The retriever which is used when the object is allocated +/// through the allocator +template <typename T, typename Accessor> +constexpr auto retrieve(std::false_type /*is_inplace*/, Accessor from, + std::size_t /*from_capacity*/) { + + return from->ptr_; +} + +namespace invocation_table { +#if !defined(FU2_HAS_DISABLED_EXCEPTIONS) +#if defined(FU2_HAS_NO_FUNCTIONAL_HEADER) +struct bad_function_call : std::exception { + bad_function_call() noexcept { + } + + char const* what() const noexcept override { + return "bad function call"; + } +}; +#elif +using 
std::bad_function_call; +#endif +#endif + +#ifdef FU2_HAS_CXX17_NOEXCEPT_FUNCTION_TYPE +#define FU2_EXPAND_QUALIFIERS_NOEXCEPT(F) \ + F(, , noexcept, , &) \ + F(const, , noexcept, , &) \ + F(, volatile, noexcept, , &) \ + F(const, volatile, noexcept, , &) \ + F(, , noexcept, &, &) \ + F(const, , noexcept, &, &) \ + F(, volatile, noexcept, &, &) \ + F(const, volatile, noexcept, &, &) \ + F(, , noexcept, &&, &&) \ + F(const, , noexcept, &&, &&) \ + F(, volatile, noexcept, &&, &&) \ + F(const, volatile, noexcept, &&, &&) +#else // FU2_HAS_CXX17_NOEXCEPT_FUNCTION_TYPE +#define FU2_EXPAND_QUALIFIERS_NOEXCEPT(F) +#endif // FU2_HAS_CXX17_NOEXCEPT_FUNCTION_TYPE + +#define FU2_EXPAND_QUALIFIERS(F) \ + F(, , , , &) \ + F(const, , , , &) \ + F(, volatile, , , &) \ + F(const, volatile, , , &) \ + F(, , , &, &) \ + F(const, , , &, &) \ + F(, volatile, , &, &) \ + F(const, volatile, , &, &) \ + F(, , , &&, &&) \ + F(const, , , &&, &&) \ + F(, volatile, , &&, &&) \ + F(const, volatile, , &&, &&) \ + FU2_EXPAND_QUALIFIERS_NOEXCEPT(F) + +/// If the function is qualified as noexcept, the call will never throw +template <bool IsNoexcept> +[[noreturn]] void throw_or_abortnoexcept( + std::integral_constant<bool, IsNoexcept> /*is_throwing*/) noexcept { + std::abort(); +} +/// Calls std::abort on empty function calls +[[noreturn]] inline void +throw_or_abort(std::false_type /*is_throwing*/) noexcept { + std::abort(); +} +/// Throws bad_function_call on empty funciton calls +[[noreturn]] inline void throw_or_abort(std::true_type /*is_throwing*/) { +#ifdef FU2_HAS_DISABLED_EXCEPTIONS + throw_or_abort(std::false_type{}); +#else + throw bad_function_call{}; +#endif +} + +template <typename T> +struct function_trait; + +using is_noexcept_ = std::false_type; +using is_noexcept_noexcept = std::true_type; + +#define FU2_DEFINE_FUNCTION_TRAIT(CONST, VOLATILE, NOEXCEPT, OVL_REF, REF) \ + template <typename Ret, typename... Args> \ + struct function_trait<Ret(Args...) 
CONST VOLATILE OVL_REF NOEXCEPT> { \ + using pointer_type = Ret (*)(data_accessor CONST VOLATILE*, \ + std::size_t capacity, Args...); \ + template <typename T, bool IsInplace> \ + struct internal_invoker { \ + static Ret invoke(data_accessor CONST VOLATILE* data, \ + std::size_t capacity, Args... args) NOEXCEPT { \ + auto obj = retrieve<T>(std::integral_constant<bool, IsInplace>{}, \ + data, capacity); \ + auto box = static_cast<T CONST VOLATILE*>(obj); \ + return invocation::invoke( \ + static_cast<std::decay_t<decltype(box->value_)> CONST VOLATILE \ + REF>(box->value_), \ + std::forward<Args>(args)...); \ + } \ + }; \ + \ + template <typename T> \ + struct view_invoker { \ + static Ret invoke(data_accessor CONST VOLATILE* data, std::size_t, \ + Args... args) NOEXCEPT { \ + \ + auto ptr = static_cast<void CONST VOLATILE*>(data->ptr_); \ + return invocation::invoke(address_taker<T>::restore(ptr), \ + std::forward<Args>(args)...); \ + } \ + }; \ + \ + template <typename T> \ + using callable = T CONST VOLATILE REF; \ + \ + using arguments = identity<Args...>; \ + \ + using is_noexcept = is_noexcept_##NOEXCEPT; \ + \ + template <bool Throws> \ + struct empty_invoker { \ + static Ret invoke(data_accessor CONST VOLATILE* /*data*/, \ + std::size_t /*capacity*/, Args... /*args*/) NOEXCEPT { \ + throw_or_abort##NOEXCEPT(std::integral_constant<bool, Throws>{}); \ + } \ + }; \ + }; + +FU2_EXPAND_QUALIFIERS(FU2_DEFINE_FUNCTION_TRAIT) +#undef FU2_DEFINE_FUNCTION_TRAIT + +/// Deduces to the function pointer to the given signature +template <typename Signature> +using function_pointer_of = typename function_trait<Signature>::pointer_type; + +template <typename... 
Args> +struct invoke_table; + +/// We optimize the vtable_t in case there is a single function overload +template <typename First> +struct invoke_table<First> { + using type = function_pointer_of<First>; + + /// Return the function pointer itself + template <std::size_t Index> + static constexpr auto fetch(type pointer) noexcept { + static_assert(Index == 0U, "The index should be 0 here!"); + return pointer; + } + + /// Returns the thunk of an single overloaded callable + template <typename T, bool IsInplace> + static constexpr type get_invocation_table_of() noexcept { + return &function_trait<First>::template internal_invoker<T, + IsInplace>::invoke; + } + /// Returns the thunk of an single overloaded callable + template <typename T> + static constexpr type get_invocation_view_table_of() noexcept { + return &function_trait<First>::template view_invoker<T>::invoke; + } + /// Returns the thunk of an empty single overloaded callable + template <bool IsThrowing> + static constexpr type get_empty_invocation_table() noexcept { + return &function_trait<First>::template empty_invoker<IsThrowing>::invoke; + } +}; +/// We generate a table in case of multiple function overloads +template <typename First, typename Second, typename... 
Args> +struct invoke_table<First, Second, Args...> { + using type = + std::tuple<function_pointer_of<First>, function_pointer_of<Second>, + function_pointer_of<Args>...> const*; + + /// Return the function pointer at the particular index + template <std::size_t Index> + static constexpr auto fetch(type table) noexcept { + return std::get<Index>(*table); + } + + /// The invocation vtable for a present object + template <typename T, bool IsInplace> + struct invocation_vtable : public std::tuple<function_pointer_of<First>, + function_pointer_of<Second>, + function_pointer_of<Args>...> { + constexpr invocation_vtable() noexcept + : std::tuple<function_pointer_of<First>, function_pointer_of<Second>, + function_pointer_of<Args>...>(std::make_tuple( + &function_trait<First>::template internal_invoker< + T, IsInplace>::invoke, + &function_trait<Second>::template internal_invoker< + T, IsInplace>::invoke, + &function_trait<Args>::template internal_invoker< + T, IsInplace>::invoke...)) { + } + }; + + /// Returns the thunk of an multi overloaded callable + template <typename T, bool IsInplace> + static type get_invocation_table_of() noexcept { + static invocation_vtable<T, IsInplace> const table; + return &table; + } + + /// The invocation vtable for a present object + template <typename T> + struct invocation_view_vtable + : public std::tuple<function_pointer_of<First>, + function_pointer_of<Second>, + function_pointer_of<Args>...> { + constexpr invocation_view_vtable() noexcept + : std::tuple<function_pointer_of<First>, function_pointer_of<Second>, + function_pointer_of<Args>...>(std::make_tuple( + &function_trait<First>::template view_invoker<T>::invoke, + &function_trait<Second>::template view_invoker<T>::invoke, + &function_trait<Args>::template view_invoker<T>::invoke...)) { + } + }; + + /// Returns the thunk of an multi overloaded callable + template <typename T> + static type get_invocation_view_table_of() noexcept { + static invocation_view_vtable<T> const table; + 
return &table; + } + + /// The invocation table for an empty wrapper + template <bool IsThrowing> + struct empty_vtable : public std::tuple<function_pointer_of<First>, + function_pointer_of<Second>, + function_pointer_of<Args>...> { + constexpr empty_vtable() noexcept + : std::tuple<function_pointer_of<First>, function_pointer_of<Second>, + function_pointer_of<Args>...>( + std::make_tuple(&function_trait<First>::template empty_invoker< + IsThrowing>::invoke, + &function_trait<Second>::template empty_invoker< + IsThrowing>::invoke, + &function_trait<Args>::template empty_invoker< + IsThrowing>::invoke...)) { + } + }; + + /// Returns the thunk of an multi single overloaded callable + template <bool IsThrowing> + static type get_empty_invocation_table() noexcept { + static empty_vtable<IsThrowing> const table; + return &table; + } +}; + +template <std::size_t Index, typename Function, typename... Signatures> +class operator_impl; + +#define FU2_DEFINE_FUNCTION_TRAIT(CONST, VOLATILE, NOEXCEPT, OVL_REF, REF) \ + template <std::size_t Index, typename Function, typename Ret, \ + typename... Args, typename Next, typename... Signatures> \ + class operator_impl<Index, Function, \ + Ret(Args...) CONST VOLATILE OVL_REF NOEXCEPT, Next, \ + Signatures...> \ + : operator_impl<Index + 1, Function, Next, Signatures...> { \ + \ + template <std::size_t, typename, typename...> \ + friend class operator_impl; \ + \ + protected: \ + operator_impl() = default; \ + ~operator_impl() = default; \ + operator_impl(operator_impl const&) = default; \ + operator_impl(operator_impl&&) = default; \ + operator_impl& operator=(operator_impl const&) = default; \ + operator_impl& operator=(operator_impl&&) = default; \ + \ + using operator_impl<Index + 1, Function, Next, Signatures...>::operator(); \ + \ + Ret operator()(Args... 
args) CONST VOLATILE OVL_REF NOEXCEPT { \ + auto parent = static_cast<Function CONST VOLATILE*>(this); \ + using erasure_t = std::decay_t<decltype(parent->erasure_)>; \ + \ + return erasure_t::template invoke<Index>( \ + static_cast<erasure_t CONST VOLATILE REF>(parent->erasure_), \ + std::forward<Args>(args)...); \ + } \ + }; \ + template <std::size_t Index, typename Config, typename Property, \ + typename Ret, typename... Args> \ + class operator_impl<Index, function<Config, Property>, \ + Ret(Args...) CONST VOLATILE OVL_REF NOEXCEPT> \ + : copyable<Config::is_owning || Config::is_copyable> { \ + \ + template <std::size_t, typename, typename...> \ + friend class operator_impl; \ + \ + protected: \ + operator_impl() = default; \ + ~operator_impl() = default; \ + operator_impl(operator_impl const&) = default; \ + operator_impl(operator_impl&&) = default; \ + operator_impl& operator=(operator_impl const&) = default; \ + operator_impl& operator=(operator_impl&&) = default; \ + \ + Ret operator()(Args... 
args) CONST VOLATILE OVL_REF NOEXCEPT { \ + auto parent = \ + static_cast<function<Config, Property> CONST VOLATILE*>(this); \ + using erasure_t = std::decay_t<decltype(parent->erasure_)>; \ + \ + return erasure_t::template invoke<Index>( \ + static_cast<erasure_t CONST VOLATILE REF>(parent->erasure_), \ + std::forward<Args>(args)...); \ + } \ + }; + +FU2_EXPAND_QUALIFIERS(FU2_DEFINE_FUNCTION_TRAIT) +#undef FU2_DEFINE_FUNCTION_TRAIT +} // namespace invocation_table + +namespace tables { +/// Identifies the action which is dispatched on the erased object +enum class opcode { + op_move, //< Move the object and set the vtable + op_copy, //< Copy the object and set the vtable + op_destroy, //< Destroy the object and reset the vtable + op_weak_destroy, //< Destroy the object without resetting the vtable + op_fetch_empty, //< Stores true or false into the to storage + //< to indicate emptiness +}; + +/// Abstraction for a vtable together with a command table +/// TODO Add optimization for a single formal argument +/// TODO Add optimization to merge both tables if the function is size +/// optimized +template <typename Property> +class vtable; +template <bool IsThrowing, bool HasStrongExceptGuarantee, + typename... 
FormalArgs> +class vtable<property<IsThrowing, HasStrongExceptGuarantee, FormalArgs...>> { + using command_function_t = void (*)(vtable* /*this*/, opcode /*op*/, + data_accessor* /*from*/, + std::size_t /*from_capacity*/, + data_accessor* /*to*/, + std::size_t /*to_capacity*/); + + using invoke_table_t = invocation_table::invoke_table<FormalArgs...>; + + command_function_t cmd_; + typename invoke_table_t::type vtable_; + + template <typename T> + struct trait { + static_assert(is_box<T>::value, + "The trait must be specialized with a box!"); + + /// The command table + template <bool IsInplace> + static void process_cmd(vtable* to_table, opcode op, data_accessor* from, + std::size_t from_capacity, data_accessor* to, + std::size_t to_capacity) { + + switch (op) { + case opcode::op_move: { + /// Retrieve the pointer to the object + auto box = static_cast<T*>(retrieve<T>( + std::integral_constant<bool, IsInplace>{}, from, from_capacity)); + assert(box && "The object must not be over aligned or null!"); + + if (!IsInplace) { + // Just swap both pointers if we allocated on the heap + to->ptr_ = from->ptr_; + +#ifndef _NDEBUG + // We don't need to null the pointer since we know that + // we don't own the data anymore through the vtable + // which is set to empty. 
+ from->ptr_ = nullptr; +#endif + + to_table->template set_allocated<T>(); + + } + // The object is allocated inplace + else { + construct(std::true_type{}, std::move(*box), to_table, to, + to_capacity); + box->~T(); + } + return; + } + case opcode::op_copy: { + auto box = static_cast<T const*>(retrieve<T>( + std::integral_constant<bool, IsInplace>{}, from, from_capacity)); + assert(box && "The object must not be over aligned or null!"); + + assert(std::is_copy_constructible<T>::value && + "The box is required to be copyable here!"); + + // Try to allocate the object inplace + construct(std::is_copy_constructible<T>{}, *box, to_table, to, + to_capacity); + return; + } + case opcode::op_destroy: + case opcode::op_weak_destroy: { + + assert(!to && !to_capacity && "Arg overflow!"); + auto box = static_cast<T*>(retrieve<T>( + std::integral_constant<bool, IsInplace>{}, from, from_capacity)); + + if (IsInplace) { + box->~T(); + } else { + box_factory<T>::box_deallocate(box); + } + + if (op == opcode::op_destroy) { + to_table->set_empty(); + } + return; + } + case opcode::op_fetch_empty: { + write_empty(to, false); + return; + } + } + + // TODO Use an unreachable intrinsic + assert(false && "Unreachable!"); + std::exit(-1); + } + + template <typename Box> + static void + construct(std::true_type /*apply*/, Box&& box, vtable* to_table, + data_accessor* to, + std::size_t to_capacity) noexcept(HasStrongExceptGuarantee) { + // Try to allocate the object inplace + void* storage = retrieve<T>(std::true_type{}, to, to_capacity); + if (storage) { + to_table->template set_inplace<T>(); + } else { + // Allocate the object through the allocator + to->ptr_ = storage = + box_factory<std::decay_t<Box>>::box_allocate(std::addressof(box)); + to_table->template set_allocated<T>(); + } + new (storage) T(std::forward<Box>(box)); + } + + template <typename Box> + static void + construct(std::false_type /*apply*/, Box&& /*box*/, vtable* /*to_table*/, + data_accessor* /*to*/, + std::size_t 
/*to_capacity*/) noexcept(HasStrongExceptGuarantee) { + } + }; + + /// The command table + static void empty_cmd(vtable* to_table, opcode op, data_accessor* /*from*/, + std::size_t /*from_capacity*/, data_accessor* to, + std::size_t /*to_capacity*/) { + + switch (op) { + case opcode::op_move: + case opcode::op_copy: { + to_table->set_empty(); + break; + } + case opcode::op_destroy: + case opcode::op_weak_destroy: { + // Do nothing + break; + } + case opcode::op_fetch_empty: { + write_empty(to, true); + break; + } + } + } + +public: + vtable() noexcept = default; + + /// Initialize an object at the given position + template <typename T> + static void init(vtable& table, T&& object, data_accessor* to, + std::size_t to_capacity) { + + trait<std::decay_t<T>>::construct(std::true_type{}, std::forward<T>(object), + &table, to, to_capacity); + } + + /// Moves the object at the given position + void move(vtable& to_table, data_accessor* from, std::size_t from_capacity, + data_accessor* to, + std::size_t to_capacity) noexcept(HasStrongExceptGuarantee) { + cmd_(&to_table, opcode::op_move, from, from_capacity, to, to_capacity); + set_empty(); + } + + /// Destroys the object at the given position + void copy(vtable& to_table, data_accessor const* from, + std::size_t from_capacity, data_accessor* to, + std::size_t to_capacity) const { + cmd_(&to_table, opcode::op_copy, const_cast<data_accessor*>(from), + from_capacity, to, to_capacity); + } + + /// Destroys the object at the given position + void destroy(data_accessor* from, + std::size_t from_capacity) noexcept(HasStrongExceptGuarantee) { + cmd_(this, opcode::op_destroy, from, from_capacity, nullptr, 0U); + } + + /// Destroys the object at the given position without invalidating the + /// vtable + void + weak_destroy(data_accessor* from, + std::size_t from_capacity) noexcept(HasStrongExceptGuarantee) { + cmd_(this, opcode::op_weak_destroy, from, from_capacity, nullptr, 0U); + } + + /// Returns true when the vtable doesn't hold 
any erased object + bool empty() const noexcept { + data_accessor data; + cmd_(nullptr, opcode::op_fetch_empty, nullptr, 0U, &data, 0U); + return bool(data.inplace_storage_); + } + + /// Invoke the function at the given index + template <std::size_t Index, typename... Args> + constexpr auto invoke(Args&&... args) const { + auto thunk = invoke_table_t::template fetch<Index>(vtable_); + return thunk(std::forward<Args>(args)...); + } + /// Invoke the function at the given index + template <std::size_t Index, typename... Args> + constexpr auto invoke(Args&&... args) const volatile { + auto thunk = invoke_table_t::template fetch<Index>(vtable_); + return thunk(std::forward<Args>(args)...); + } + + template <typename T> + void set_inplace() noexcept { + using type = std::decay_t<T>; + vtable_ = invoke_table_t::template get_invocation_table_of<type, true>(); + cmd_ = &trait<type>::template process_cmd<true>; + } + + template <typename T> + void set_allocated() noexcept { + using type = std::decay_t<T>; + vtable_ = invoke_table_t::template get_invocation_table_of<type, false>(); + cmd_ = &trait<type>::template process_cmd<false>; + } + + void set_empty() noexcept { + vtable_ = invoke_table_t::template get_empty_invocation_table<IsThrowing>(); + cmd_ = &empty_cmd; + } +}; +} // namespace tables + +/// A union which makes the pointer to the heap object share the +/// same space with the internal capacity. +/// The storage type is distinguished by multiple versions of the +/// control and vtable. 
+template <std::size_t Capacity, typename = void> +struct internal_capacity { + /// We extend the union through a technique similar to the tail object hack + typedef union { + /// Tag to access the structure in a type-safe way + data_accessor accessor_; + /// The internal capacity we use to allocate in-place + std::aligned_storage_t<Capacity> capacity_; + } type; +}; +template <std::size_t Capacity> +struct internal_capacity<Capacity, + std::enable_if_t<(Capacity < sizeof(void*))>> { + typedef struct { + /// Tag to access the structure in a type-safe way + data_accessor accessor_; + } type; +}; + +template <std::size_t Capacity> +class internal_capacity_holder { + // Tag to access the structure in a type-safe way + typename internal_capacity<Capacity>::type storage_; + +public: + constexpr internal_capacity_holder() = default; + + constexpr data_accessor* opaque_ptr() noexcept { + return &storage_.accessor_; + } + constexpr data_accessor const* opaque_ptr() const noexcept { + return &storage_.accessor_; + } + constexpr data_accessor volatile* opaque_ptr() volatile noexcept { + return &storage_.accessor_; + } + constexpr data_accessor const volatile* opaque_ptr() const volatile noexcept { + return &storage_.accessor_; + } + + static constexpr std::size_t capacity() noexcept { + return sizeof(storage_); + } +}; + +/// An owning erasure +template <bool IsOwning /* = true*/, typename Config, typename Property> +class erasure : internal_capacity_holder<Config::capacity> { + template <bool, typename, typename> + friend class erasure; + template <std::size_t, typename, typename...> + friend class operator_impl; + + using vtable_t = tables::vtable<Property>; + + vtable_t vtable_; + +public: + /// Returns the capacity of this erasure + static constexpr std::size_t capacity() noexcept { + return internal_capacity_holder<Config::capacity>::capacity(); + } + + constexpr erasure() noexcept { + vtable_.set_empty(); + } + + constexpr erasure(std::nullptr_t) noexcept { + 
vtable_.set_empty(); + } + + constexpr erasure(erasure&& right) noexcept( + Property::is_strong_exception_guaranteed) { + right.vtable_.move(vtable_, right.opaque_ptr(), right.capacity(), + this->opaque_ptr(), capacity()); + } + + constexpr erasure(erasure const& right) { + right.vtable_.copy(vtable_, right.opaque_ptr(), right.capacity(), + this->opaque_ptr(), capacity()); + } + + template <typename OtherConfig> + constexpr erasure(erasure<true, OtherConfig, Property> right) noexcept( + Property::is_strong_exception_guaranteed) { + right.vtable_.move(vtable_, right.opaque_ptr(), right.capacity(), + this->opaque_ptr(), capacity()); + } + + template <typename T, typename Allocator = std::allocator<std::decay_t<T>>> + constexpr erasure(T&& callable, Allocator&& allocator = Allocator{}) { + vtable_t::init(vtable_, + type_erasure::make_box( + std::integral_constant<bool, Config::is_copyable>{}, + std::forward<T>(callable), + std::forward<Allocator>(allocator)), + this->opaque_ptr(), capacity()); + } + + ~erasure() { + vtable_.weak_destroy(this->opaque_ptr(), capacity()); + } + + constexpr erasure& + operator=(std::nullptr_t) noexcept(Property::is_strong_exception_guaranteed) { + vtable_.destroy(this->opaque_ptr(), capacity()); + return *this; + } + + constexpr erasure& operator=(erasure&& right) noexcept( + Property::is_strong_exception_guaranteed) { + vtable_.weak_destroy(this->opaque_ptr(), capacity()); + right.vtable_.move(vtable_, right.opaque_ptr(), right.capacity(), + this->opaque_ptr(), capacity()); + return *this; + } + + constexpr erasure& operator=(erasure const& right) { + vtable_.weak_destroy(this->opaque_ptr(), capacity()); + right.vtable_.copy(vtable_, right.opaque_ptr(), right.capacity(), + this->opaque_ptr(), capacity()); + return *this; + } + + template <typename OtherConfig> + constexpr erasure& + operator=(erasure<true, OtherConfig, Property> right) noexcept( + Property::is_strong_exception_guaranteed) { + vtable_.weak_destroy(this->opaque_ptr(), 
capacity()); + right.vtable_.move(vtable_, right.opaque_ptr(), right.capacity(), + this->opaque_ptr(), capacity()); + return *this; + } + + template <typename T> + constexpr erasure& operator=(T&& callable) { + vtable_.weak_destroy(this->opaque_ptr(), capacity()); + vtable_t::init(vtable_, + type_erasure::make_box( + std::integral_constant<bool, Config::is_copyable>{}, + std::forward<T>(callable)), + this->opaque_ptr(), capacity()); + return *this; + } + + template <typename T, typename Allocator> + void assign(T&& callable, Allocator&& allocator) { + vtable_.weak_destroy(this->opaque_ptr(), capacity()); + vtable_t::init(vtable_, + type_erasure::make_box( + std::integral_constant<bool, Config::is_copyable>{}, + std::forward<T>(callable), + std::forward<Allocator>(allocator)), + this->opaque_ptr(), capacity()); + } + + /// Returns true when the erasure doesn't hold any erased object + constexpr bool empty() const noexcept { + return vtable_.empty(); + } + + /// Invoke the function of the erasure at the given index + /// + /// We define this out of class to be able to forward the qualified + /// erasure correctly. + template <std::size_t Index, typename Erasure, typename... Args> + static constexpr auto invoke(Erasure&& erasure, Args&&... args) { + auto const capacity = erasure.capacity(); + return erasure.vtable_.template invoke<Index>( + std::forward<Erasure>(erasure).opaque_ptr(), capacity, + std::forward<Args>(args)...); + } +}; + +// A non owning erasure +template </*bool IsOwning = false, */ typename Config, bool IsThrowing, + bool HasStrongExceptGuarantee, typename... 
Args> +class erasure<false, Config, + property<IsThrowing, HasStrongExceptGuarantee, Args...>> { + template <bool, typename, typename> + friend class erasure; + template <std::size_t, typename, typename...> + friend class operator_impl; + + using property_t = property<IsThrowing, HasStrongExceptGuarantee, Args...>; + + using invoke_table_t = invocation_table::invoke_table<Args...>; + typename invoke_table_t::type invoke_table_; + + /// The internal pointer to the non owned object + data_accessor view_; + +public: + // NOLINTNEXTLINE(cppcoreguidlines-pro-type-member-init) + constexpr erasure() noexcept + : invoke_table_( + invoke_table_t::template get_empty_invocation_table<IsThrowing>()), + view_(nullptr) { + } + + // NOLINTNEXTLINE(cppcoreguidlines-pro-type-member-init) + constexpr erasure(std::nullptr_t) noexcept + : invoke_table_( + invoke_table_t::template get_empty_invocation_table<IsThrowing>()), + view_(nullptr) { + } + + // NOLINTNEXTLINE(cppcoreguidlines-pro-type-member-init) + constexpr erasure(erasure&& right) noexcept + : invoke_table_(right.invoke_table_), view_(right.view_) { + } + + constexpr erasure(erasure const& /*right*/) = default; + + template <typename OtherConfig> + // NOLINTNEXTLINE(cppcoreguidlines-pro-type-member-init) + constexpr erasure(erasure<false, OtherConfig, property_t> right) noexcept + : invoke_table_(right.invoke_table_), view_(right.view_) { + } + + template <typename T> + // NOLINTNEXTLINE(cppcoreguidlines-pro-type-member-init) + constexpr erasure(T&& object) + : invoke_table_(invoke_table_t::template get_invocation_view_table_of< + std::decay_t<T>>()), + view_(address_taker<std::decay_t<T>>::take(std::forward<T>(object))) { + } + + ~erasure() = default; + + constexpr erasure& + operator=(std::nullptr_t) noexcept(HasStrongExceptGuarantee) { + invoke_table_ = + invoke_table_t::template get_empty_invocation_table<IsThrowing>(); + view_.ptr_ = nullptr; + return *this; + } + + constexpr erasure& operator=(erasure&& right) noexcept 
{ + invoke_table_ = right.invoke_table_; + view_ = right.view_; + right = nullptr; + return *this; + } + + constexpr erasure& operator=(erasure const& /*right*/) = default; + + template <typename OtherConfig> + constexpr erasure& + operator=(erasure<true, OtherConfig, property_t> right) noexcept { + invoke_table_ = right.invoke_table_; + view_ = right.view_; + return *this; + } + + template <typename T> + constexpr erasure& operator=(T&& object) { + invoke_table_ = invoke_table_t::template get_invocation_view_table_of< + std::decay_t<T>>(); + view_.ptr_ = address_taker<std::decay_t<T>>::take(std::forward<T>(object)); + return *this; + } + + /// Returns true when the erasure doesn't hold any erased object + constexpr bool empty() const noexcept { + return view_.ptr_ == nullptr; + } + + template <std::size_t Index, typename Erasure, typename... T> + static constexpr auto invoke(Erasure&& erasure, T&&... args) { + auto thunk = invoke_table_t::template fetch<Index>(erasure.invoke_table_); + return thunk(&(erasure.view_), 0UL, std::forward<T>(args)...); + } +}; +} // namespace type_erasure + +/// Deduces to a true_type if the type T provides the given signature and the +/// signature is noexcept correct callable. +template <typename T, typename Signature, + typename Trait = + type_erasure::invocation_table::function_trait<Signature>> +struct accepts_one + : std::integral_constant< + bool, invocation::can_invoke<typename Trait::template callable<T>, + typename Trait::arguments>::value && + invocation::is_noexcept_correct< + Trait::is_noexcept::value, + typename Trait::template callable<T>, + typename Trait::arguments>::value> {}; + +/// Deduces to a true_type if the type T provides all signatures +template <typename T, typename Signatures, typename = void> +struct accepts_all : std::false_type {}; +template <typename T, typename... 
Signatures> +struct accepts_all< + T, identity<Signatures...>, + void_t<std::enable_if_t<accepts_one<T, Signatures>::value>...>> + : std::true_type {}; + +template <typename Config, typename T> +struct assert_wrong_copy_assign { + static_assert(!Config::is_copyable || + std::is_copy_constructible<std::decay_t<T>>::value, + "Can't wrap a non copyable object into a unique function!"); + + using type = void; +}; + +template <bool IsStrongExceptGuaranteed, typename T> +struct assert_no_strong_except_guarantee { + static_assert( + !IsStrongExceptGuaranteed || + (std::is_nothrow_move_constructible<T>::value && + std::is_nothrow_destructible<T>::value), + "Can't wrap a object an object that has no strong exception guarantees " + "if this is required by the wrapper!"); + + using type = void; +}; + +/// SFINAES out if the given callable is not copyable correct to the left one. +template <typename LeftConfig, typename RightConfig> +using enable_if_copyable_correct_t = + std::enable_if_t<(!LeftConfig::is_copyable || RightConfig::is_copyable)>; + +template <typename LeftConfig, typename RightConfig> +using is_owning_correct = + std::integral_constant<bool, + (LeftConfig::is_owning == RightConfig::is_owning)>; + +/// SFINAES out if the given function2 is not owning correct to this one +template <typename LeftConfig, typename RightConfig> +using enable_if_owning_correct_t = + std::enable_if_t<is_owning_correct<LeftConfig, RightConfig>::value>; + +template <typename Config, bool IsThrowing, bool HasStrongExceptGuarantee, + typename... 
Args> +class function<Config, property<IsThrowing, HasStrongExceptGuarantee, Args...>> + : type_erasure::invocation_table::operator_impl< + 0U, + function<Config, + property<IsThrowing, HasStrongExceptGuarantee, Args...>>, + Args...> { + + template <typename, typename> + friend class function; + + template <std::size_t, typename, typename...> + friend class type_erasure::invocation_table::operator_impl; + + using property_t = property<IsThrowing, HasStrongExceptGuarantee, Args...>; + using erasure_t = + type_erasure::erasure<Config::is_owning, Config, property_t>; + + template <typename T> + using enable_if_can_accept_all_t = + std::enable_if_t<accepts_all<std::decay_t<T>, identity<Args...>>::value>; + + template <typename Function, typename = void> + struct is_convertible_to_this : std::false_type {}; + template <typename RightConfig> + struct is_convertible_to_this< + function<RightConfig, property_t>, + void_t<enable_if_copyable_correct_t<Config, RightConfig>, + enable_if_owning_correct_t<Config, RightConfig>>> + : std::true_type {}; + + template <typename T> + using enable_if_not_convertible_to_this = + std::enable_if_t<!is_convertible_to_this<std::decay_t<T>>::value>; + + template <typename T> + using enable_if_owning_t = + std::enable_if_t<std::is_same<T, T>::value && Config::is_owning>; + + template <typename T> + using assert_wrong_copy_assign_t = + typename assert_wrong_copy_assign<Config, std::decay_t<T>>::type; + + template <typename T> + using assert_no_strong_except_guarantee_t = + typename assert_no_strong_except_guarantee<HasStrongExceptGuarantee, + std::decay_t<T>>::type; + + erasure_t erasure_; + +public: + /// Default constructor which empty constructs the function + function() = default; + ~function() = default; + + explicit constexpr function(function const& /*right*/) = default; + explicit constexpr function(function&& /*right*/) = default; + + /// Copy construction from another copyable function + template <typename RightConfig, + 
std::enable_if_t<RightConfig::is_copyable>* = nullptr, + enable_if_copyable_correct_t<Config, RightConfig>* = nullptr, + enable_if_owning_correct_t<Config, RightConfig>* = nullptr> + constexpr function(function<RightConfig, property_t> const& right) + : erasure_(right.erasure_) { + } + + /// Move construction from another function + template <typename RightConfig, + enable_if_copyable_correct_t<Config, RightConfig>* = nullptr, + enable_if_owning_correct_t<Config, RightConfig>* = nullptr> + constexpr function(function<RightConfig, property_t>&& right) + : erasure_(std::move(right.erasure_)) { + } + + /// Construction from a callable object which overloads the `()` operator + template <typename T, // + enable_if_not_convertible_to_this<T>* = nullptr, + enable_if_can_accept_all_t<T>* = nullptr, + assert_wrong_copy_assign_t<T>* = nullptr, + assert_no_strong_except_guarantee_t<T>* = nullptr> + constexpr function(T&& callable) : erasure_(std::forward<T>(callable)) { + } + template <typename T, typename Allocator, // + enable_if_not_convertible_to_this<T>* = nullptr, + enable_if_can_accept_all_t<T>* = nullptr, + enable_if_owning_t<T>* = nullptr, + assert_wrong_copy_assign_t<T>* = nullptr, + assert_no_strong_except_guarantee_t<T>* = nullptr> + constexpr function(T&& callable, Allocator&& allocator) + : erasure_(std::forward<T>(callable), + std::forward<Allocator>(allocator)) { + } + + /// Empty constructs the function + constexpr function(std::nullptr_t np) : erasure_(np) { + } + + function& operator=(function const& /*right*/) = default; + function& operator=(function&& /*right*/) = default; + + /// Copy assigning from another copyable function + template <typename RightConfig, + std::enable_if_t<RightConfig::is_copyable>* = nullptr, + enable_if_copyable_correct_t<Config, RightConfig>* = nullptr, + enable_if_owning_correct_t<Config, RightConfig>* = nullptr> + function& operator=(function<RightConfig, property_t> const& right) { + erasure_ = right.erasure_; + return *this; 
+ } + + /// Move assigning from another function + template <typename RightConfig, + enable_if_copyable_correct_t<Config, RightConfig>* = nullptr, + enable_if_owning_correct_t<Config, RightConfig>* = nullptr> + function& operator=(function<RightConfig, property_t>&& right) { + erasure_ = std::move(right.erasure_); + return *this; + } + + /// Move assigning from a callable object + template <typename T, // ... + enable_if_not_convertible_to_this<T>* = nullptr, + enable_if_can_accept_all_t<T>* = nullptr, + assert_wrong_copy_assign_t<T>* = nullptr, + assert_no_strong_except_guarantee_t<T>* = nullptr> + function& operator=(T&& callable) { + erasure_ = std::forward<T>(callable); + return *this; + } + + /// Clears the function + function& operator=(std::nullptr_t np) { + erasure_ = np; + return *this; + } + + /// Returns true when the function is empty + bool empty() const noexcept { + return erasure_.empty(); + } + + /// Returns true when the function isn't empty + explicit operator bool() const noexcept { + return !empty(); + } + + /// Assigns a new target with an optional allocator + template <typename T, typename Allocator = std::allocator<std::decay_t<T>>, + enable_if_not_convertible_to_this<T>* = nullptr, + enable_if_can_accept_all_t<T>* = nullptr, + assert_wrong_copy_assign_t<T>* = nullptr, + assert_no_strong_except_guarantee_t<T>* = nullptr> + void assign(T&& callable, Allocator&& allocator = Allocator{}) { + erasure_.assign(std::forward<T>(callable), + std::forward<Allocator>(allocator)); + } + + /// Swaps this function with the given function + void swap(function& other) noexcept(HasStrongExceptGuarantee) { + if (&other == this) { + return; + } + + function cache = std::move(other); + other = std::move(*this); + *this = std::move(cache); + } + + /// Swaps the left function with the right one + friend void swap(function& left, + function& right) noexcept(HasStrongExceptGuarantee) { + left.swap(right); + } + + /// Calls the wrapped callable object + using 
type_erasure::invocation_table::operator_impl< + 0U, function<Config, property_t>, Args...>::operator(); +}; + +template <typename Config, typename Property> +bool operator==(function<Config, Property> const& f, std::nullptr_t) { + return !bool(f); +} + +template <typename Config, typename Property> +bool operator!=(function<Config, Property> const& f, std::nullptr_t) { + return bool(f); +} + +template <typename Config, typename Property> +bool operator==(std::nullptr_t, function<Config, Property> const& f) { + return !bool(f); +} + +template <typename Config, typename Property> +bool operator!=(std::nullptr_t, function<Config, Property> const& f) { + return bool(f); +} + +// Default object size of the function +using object_size = std::integral_constant<std::size_t, 32U>; + +// Default capacity for small functor optimization +using default_capacity = + std::integral_constant<std::size_t, + object_size::value - (2 * sizeof(void*))>; +} // namespace detail +} // namespace abi_310 + +/// Adaptable function wrapper base for arbitrary functional types. +template < + /// This is a placeholder for future non owning support + bool IsOwning, + /// Defines whether the function is copyable or not + bool IsCopyable, + /// Defines the internal capacity of the function + /// for small functor optimization. + /// The size of the whole function object will be the capacity plus + /// the size of two pointers. + /// If the capacity is zero, the size will increase through one additional + /// pointer so the whole object has the size of 3 * sizeof(void*). + std::size_t Capacity, + /// Defines whether the function throws an exception on empty function + /// call, `std::abort` is called otherwise. + bool IsThrowing, + /// Defines whether all objects satisfy the strong exception guarantees, + /// which means the function type will satisfy the strong exception + /// guarantees too. + bool HasStrongExceptGuarantee, + /// Defines the signature of the function wrapper + typename... 
Signatures> +using function_base = detail::function< + detail::config<IsOwning, IsCopyable, Capacity>, + detail::property<IsThrowing, HasStrongExceptGuarantee, Signatures...>>; + +/// An owning copyable function wrapper for arbitrary callable types. +template <typename... Signatures> +using function = function_base<true, true, detail::default_capacity::value, + true, false, Signatures...>; + +/// An owning non copyable function wrapper for arbitrary callable types. +template <typename... Signatures> +using unique_function = + function_base<true, false, detail::default_capacity::value, true, false, + Signatures...>; + +/// A non owning copyable function wrapper for arbitrary callable types. +template <typename... Signatures> +using function_view = + function_base<false, true, detail::default_capacity::value, true, false, + Signatures...>; + +#if !defined(FU2_HAS_DISABLED_EXCEPTIONS) +/// Exception type that is thrown when invoking empty function objects +/// and exception support isn't disabled. +/// +/// Exception suport is enabled if +/// the template parameter 'Throwing' is set to true (default). +/// +/// This type will default to std::bad_function_call if the +/// functional header is used, otherwise the library provides its own type. +/// +/// You may disable the inclusion of the functionl header +/// through defining `FU2_WITH_NO_FUNCTIONAL_HEADER`. +/// +using detail::type_erasure::invocation_table::bad_function_call; +#endif + +/// Returns a callable object, which unifies all callable objects +/// that were passed to this function. +/// +/// ```cpp +/// auto overloaded = fu2::overload([](std::true_type) { return true; }, +/// [](std::false_type) { return false; }); +/// ``` +/// +/// \param callables A pack of callable objects with arbitrary signatures. +/// +/// \returns A callable object which exposes the +/// +template <typename... T> +constexpr auto overload(T&&... 
callables) { + return detail::overloading::overload(std::forward<T>(callables)...); +} +} // namespace fu2 + +#undef FU2_EXPAND_QUALIFIERS +#undef FU2_EXPAND_QUALIFIERS_NOEXCEPT + +#endif // FU2_INCLUDED_FUNCTION2_HPP_ diff --git a/src/include/hash.h b/src/include/hash.h new file mode 100644 index 000000000..2ab95448b --- /dev/null +++ b/src/include/hash.h @@ -0,0 +1,64 @@ +#ifndef CEPH_HASH_H +#define CEPH_HASH_H + +#include "acconfig.h" + +// Robert Jenkins' function for mixing 32-bit values +// http://burtleburtle.net/bob/hash/evahash.html +// a, b = random bits, c = input and output + +#define hashmix(a,b,c) \ + a=a-b; a=a-c; a=a^(c>>13); \ + b=b-c; b=b-a; b=b^(a<<8); \ + c=c-a; c=c-b; c=c^(b>>13); \ + a=a-b; a=a-c; a=a^(c>>12); \ + b=b-c; b=b-a; b=b^(a<<16); \ + c=c-a; c=c-b; c=c^(b>>5); \ + a=a-b; a=a-c; a=a^(c>>3); \ + b=b-c; b=b-a; b=b^(a<<10); \ + c=c-a; c=c-b; c=c^(b>>15); + + +//namespace ceph { + +template <class _Key> struct rjhash { }; + +inline uint64_t rjhash64(uint64_t key) { + key = (~key) + (key << 21); // key = (key << 21) - key - 1; + key = key ^ (key >> 24); + key = (key + (key << 3)) + (key << 8); // key * 265 + key = key ^ (key >> 14); + key = (key + (key << 2)) + (key << 4); // key * 21 + key = key ^ (key >> 28); + key = key + (key << 31); + return key; +} + +inline uint32_t rjhash32(uint32_t a) { + a = (a+0x7ed55d16) + (a<<12); + a = (a^0xc761c23c) ^ (a>>19); + a = (a+0x165667b1) + (a<<5); + a = (a+0xd3a2646c) ^ (a<<9); + a = (a+0xfd7046c5) + (a<<3); + a = (a^0xb55a4f09) ^ (a>>16); + return a; +} + + +template<> struct rjhash<uint32_t> { + inline size_t operator()(const uint32_t x) const { + return rjhash32(x); + } +}; + +template<> struct rjhash<uint64_t> { + inline size_t operator()(const uint64_t x) const { + return rjhash64(x); + } +}; + +//} + + + +#endif diff --git a/src/include/health.h b/src/include/health.h new file mode 100644 index 000000000..03191eff7 --- /dev/null +++ b/src/include/health.h @@ -0,0 +1,83 @@ +// -*- mode:C++; 
tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#pragma once + +#include <ostream> +#include <string> + +#include "include/encoding.h" + +// health_status_t +enum health_status_t { + HEALTH_ERR = 0, + HEALTH_WARN = 1, + HEALTH_OK = 2, +}; + +inline void encode(health_status_t hs, ceph::buffer::list& bl) { + using ceph::encode; + uint8_t v = hs; + encode(v, bl); +} +inline void decode(health_status_t& hs, ceph::buffer::list::const_iterator& p) { + using ceph::decode; + uint8_t v; + decode(v, p); + hs = health_status_t(v); +} +template<> +struct denc_traits<health_status_t> { + static constexpr bool supported = true; + static constexpr bool featured = false; + static constexpr bool bounded = true; + static constexpr bool need_contiguous = false; + static void bound_encode(const ceph::buffer::ptr& v, size_t& p, uint64_t f=0) { + p++; + } + static void encode(const health_status_t& v, + ceph::buffer::list::contiguous_appender& p, + uint64_t f=0) { + ::denc((uint8_t)v, p); + } + static void decode(health_status_t& v, ceph::buffer::ptr::const_iterator& p, + uint64_t f=0) { + uint8_t tmp; + ::denc(tmp, p); + v = health_status_t(tmp); + } + static void decode(health_status_t& v, ceph::buffer::list::const_iterator& p, + uint64_t f=0) { + uint8_t tmp; + ::denc(tmp, p); + v = health_status_t(tmp); + } +}; + +inline std::ostream& operator<<(std::ostream &oss, const health_status_t status) { + switch (status) { + case HEALTH_ERR: + oss << "HEALTH_ERR"; + break; + case HEALTH_WARN: + oss << "HEALTH_WARN"; + break; + case HEALTH_OK: + oss << "HEALTH_OK"; + break; + } + return oss; +} + +inline const char *short_health_string(const health_status_t status) { + switch (status) { + case HEALTH_ERR: + return "ERR"; + case HEALTH_WARN: + return "WRN"; + case HEALTH_OK: + return "OK"; + default: + return "???"; + } +} diff --git a/src/include/inline_memory.h b/src/include/inline_memory.h new file mode 100644 index 000000000..48d889763 --- /dev/null 
+++ b/src/include/inline_memory.h @@ -0,0 +1,150 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net> + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. + * + */ +#ifndef CEPH_INLINE_MEMORY_H +#define CEPH_INLINE_MEMORY_H + +#if defined(__GNUC__) + +// optimize for the common case, which is very small copies +static inline void *maybe_inline_memcpy(void *dest, const void *src, size_t l, + size_t inline_len) + __attribute__((always_inline)); + +void *maybe_inline_memcpy(void *dest, const void *src, size_t l, + size_t inline_len) +{ + if (l > inline_len) { + return memcpy(dest, src, l); + } + switch (l) { + case 8: + return __builtin_memcpy(dest, src, 8); + case 4: + return __builtin_memcpy(dest, src, 4); + case 3: + return __builtin_memcpy(dest, src, 3); + case 2: + return __builtin_memcpy(dest, src, 2); + case 1: + return __builtin_memcpy(dest, src, 1); + default: + int cursor = 0; + while (l >= sizeof(uint64_t)) { + __builtin_memcpy((char*)dest + cursor, (char*)src + cursor, + sizeof(uint64_t)); + cursor += sizeof(uint64_t); + l -= sizeof(uint64_t); + } + while (l >= sizeof(uint32_t)) { + __builtin_memcpy((char*)dest + cursor, (char*)src + cursor, + sizeof(uint32_t)); + cursor += sizeof(uint32_t); + l -= sizeof(uint32_t); + } + while (l > 0) { + *((char*)dest + cursor) = *((char*)src + cursor); + cursor++; + l--; + } + } + return dest; +} + +#else + +#define maybe_inline_memcpy(d, s, l, x) memcpy(d, s, l) + +#endif + + +#if defined(__GNUC__) && defined(__x86_64__) + +namespace ceph { +typedef unsigned uint128_t __attribute__ ((mode (TI))); +} +using ceph::uint128_t; + +static inline bool mem_is_zero(const char *data, size_t len) + 
__attribute__((always_inline)); + +bool mem_is_zero(const char *data, size_t len) +{ + // we do have XMM registers in x86-64, so if we need to check at least + // 16 bytes, make use of them + if (len / sizeof(uint128_t) > 0) { + // align data pointer to 16 bytes, otherwise it'll segfault due to bug + // in (at least some) GCC versions (using MOVAPS instead of MOVUPS). + // check up to 15 first bytes while at it. + while (((unsigned long long)data) & 15) { + if (*(uint8_t*)data != 0) { + return false; + } + data += sizeof(uint8_t); + --len; + } + + const char* data_start = data; + const char* max128 = data + (len / sizeof(uint128_t))*sizeof(uint128_t); + + while (data < max128) { + if (*(uint128_t*)data != 0) { + return false; + } + data += sizeof(uint128_t); + } + len -= (data - data_start); + } + + const char* max = data + len; + const char* max32 = data + (len / sizeof(uint32_t))*sizeof(uint32_t); + while (data < max32) { + if (*(uint32_t*)data != 0) { + return false; + } + data += sizeof(uint32_t); + } + while (data < max) { + if (*(uint8_t*)data != 0) { + return false; + } + data += sizeof(uint8_t); + } + return true; +} + +#else // gcc and x86_64 + +static inline bool mem_is_zero(const char *data, size_t len) { + const char *end = data + len; + const char* end64 = data + (len / sizeof(uint64_t))*sizeof(uint64_t); + + while (data < end64) { + if (*(uint64_t*)data != 0) { + return false; + } + data += sizeof(uint64_t); + } + + while (data < end) { + if (*data != 0) { + return false; + } + ++data; + } + return true; +} + +#endif // !x86_64 + +#endif diff --git a/src/include/int_types.h b/src/include/int_types.h new file mode 100644 index 000000000..a704ba71d --- /dev/null +++ b/src/include/int_types.h @@ -0,0 +1,56 @@ +#ifndef CEPH_INTTYPES_H +#define CEPH_INTTYPES_H + +#include "acconfig.h" + +#include <inttypes.h> + +#ifdef __linux__ +#include <linux/types.h> +#else +#ifndef HAVE___U8 +typedef uint8_t __u8; +#endif + +#ifndef HAVE___S8 +typedef int8_t __s8; 
+#endif + +#ifndef HAVE___U16 +typedef uint16_t __u16; +#endif + +#ifndef HAVE___S16 +typedef int16_t __s16; +#endif + +#ifndef HAVE___U32 +typedef uint32_t __u32; +#endif + +#ifndef HAVE___S32 +typedef int32_t __s32; +#endif + +#ifndef HAVE___U64 +typedef uint64_t __u64; +#endif + +#ifndef HAVE___S64 +typedef int64_t __s64; +#endif +#endif /* LINUX_TYPES_H */ + +#ifndef BOOST_MPL_CFG_NO_PREPROCESSED_HEADERS +#define BOOST_MPL_CFG_NO_PREPROCESSED_HEADERS +#endif + +#ifndef BOOST_MPL_LIMIT_VECTOR_SIZE +#define BOOST_MPL_LIMIT_VECTOR_SIZE 30 // or whatever you need +#endif + +#ifndef BOOST_MPL_LIMIT_MAP_SIZE +#define BOOST_MPL_LIMIT_MAP_SIZE 30 // or whatever you need +#endif + +#endif diff --git a/src/include/intarith.h b/src/include/intarith.h new file mode 100644 index 000000000..68b0345a4 --- /dev/null +++ b/src/include/intarith.h @@ -0,0 +1,93 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net> + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. + * + */ + +#ifndef CEPH_INTARITH_H +#define CEPH_INTARITH_H + +#include <bit> +#include <climits> +#include <concepts> +#include <type_traits> + +template<typename T, typename U> +constexpr inline std::make_unsigned_t<std::common_type_t<T, U>> div_round_up(T n, U d) { + return (n + d - 1) / d; +} + + +template<typename T, typename U> +constexpr inline std::make_unsigned_t<std::common_type_t<T, U>> round_up_to(T n, U d) { + return (n % d ? (n + d - n % d) : n); +} + +template<typename T, typename U> +constexpr inline std::make_unsigned_t<std::common_type_t<T, U>> shift_round_up(T x, U y) { + return (x + (1 << y) - 1) >> y; +} + +/* + * Wrappers for various sorts of alignment and rounding. 
The "align" must + * be a power of 2. Often times it is a block, sector, or page. + */ + +/* + * return x rounded down to an align boundary + * eg, p2align(1200, 1024) == 1024 (1*align) + * eg, p2align(1024, 1024) == 1024 (1*align) + * eg, p2align(0x1234, 0x100) == 0x1200 (0x12*align) + * eg, p2align(0x5600, 0x100) == 0x5600 (0x56*align) + */ +template<typename T> +constexpr inline T p2align(T x, T align) { + return x & -align; +} + +/* + * return x % (mod) align + * eg, p2phase(0x1234, 0x100) == 0x34 (x-0x12*align) + * eg, p2phase(0x5600, 0x100) == 0x00 (x-0x56*align) + */ +template<typename T> +constexpr inline T p2phase(T x, T align) { + return x & (align - 1); +} + +/* + * return how much space is left in this block (but if it's perfectly + * aligned, return 0). + * eg, p2nphase(0x1234, 0x100) == 0xcc (0x13*align-x) + * eg, p2nphase(0x5600, 0x100) == 0x00 (0x56*align-x) + */ +template<typename T> +constexpr inline T p2nphase(T x, T align) { + return -x & (align - 1); +} + +/* + * return x rounded up to an align boundary + * eg, p2roundup(0x1234, 0x100) == 0x1300 (0x13*align) + * eg, p2roundup(0x5600, 0x100) == 0x5600 (0x56*align) + */ +template<typename T> +constexpr inline T p2roundup(T x, T align) { + return -(-x & -align); +} + +// count bits (set + any 0's that follow) +template<std::integral T> +unsigned cbits(T v) { + return (sizeof(v) * CHAR_BIT) - std::countl_zero(std::make_unsigned_t<T>(v)); +} + +#endif diff --git a/src/include/interval_set.h b/src/include/interval_set.h new file mode 100644 index 000000000..dfb2a306c --- /dev/null +++ b/src/include/interval_set.h @@ -0,0 +1,824 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net> + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by 
the Free Software + * Foundation. See file COPYING. + * + */ + + +#ifndef CEPH_INTERVAL_SET_H +#define CEPH_INTERVAL_SET_H + +#include <iterator> +#include <map> +#include <ostream> + +#include "encoding.h" + +/* + * *** NOTE *** + * + * This class is written to work with a variety of map-like containers, + * *include* ones that invalidate iterators when they are modified (e.g., + * flat_map and btree_map). + */ + +template<typename T, template<typename, typename, typename ...> class C = std::map> +class interval_set { + public: + using Map = C<T, T>; + using value_type = typename Map::value_type; + using offset_type = T; + using length_type = T; + using reference = value_type&; + using const_reference = const value_type&; + using size_type = typename Map::size_type; + + class const_iterator; + + class iterator + { + public: + using difference_type = ssize_t; + using value_type = typename Map::value_type; + using pointer = typename Map::value_type*; + using reference = typename Map::value_type&; + using iterator_category = std::forward_iterator_tag; + + explicit iterator(typename Map::iterator iter) + : _iter(iter) + { } + + // For the copy constructor and assignment operator, the compiler-generated functions, which + // perform simple bitwise copying, should be fine. + + bool operator==(const iterator& rhs) const { + return (_iter == rhs._iter); + } + + bool operator!=(const iterator& rhs) const { + return (_iter != rhs._iter); + } + + // Dereference this iterator to get a pair. + reference operator*() const { + return *_iter; + } + + // Return the interval start. + offset_type get_start() const { + return _iter->first; + } + + // Return the interval length. + length_type get_len() const { + return _iter->second; + } + + offset_type get_end() const { + return _iter->first + _iter->second; + } + + // Set the interval length. 
+ void set_len(const length_type& len) { + _iter->second = len; + } + + // Preincrement + iterator& operator++() + { + ++_iter; + return *this; + } + + // Postincrement + iterator operator++(int) + { + iterator prev(_iter); + ++_iter; + return prev; + } + + // Predecrement + iterator& operator--() + { + --_iter; + return *this; + } + + // Postdecrement + iterator operator--(int) + { + iterator prev(_iter); + --_iter; + return prev; + } + + friend class interval_set::const_iterator; + + protected: + typename Map::iterator _iter; + friend class interval_set; + }; + + class const_iterator + { + public: + using difference_type = ssize_t; + using value_type = const typename Map::value_type; + using pointer = const typename Map::value_type*; + using reference = const typename Map::value_type&; + using iterator_category = std::forward_iterator_tag; + + explicit const_iterator(typename Map::const_iterator iter) + : _iter(iter) + { } + + const_iterator(const iterator &i) + : _iter(i._iter) + { } + + // For the copy constructor and assignment operator, the compiler-generated functions, which + // perform simple bitwise copying, should be fine. + + bool operator==(const const_iterator& rhs) const { + return (_iter == rhs._iter); + } + + bool operator!=(const const_iterator& rhs) const { + return (_iter != rhs._iter); + } + + // Dereference this iterator to get a pair. + reference operator*() const { + return *_iter; + } + + // Return the interval start. + offset_type get_start() const { + return _iter->first; + } + offset_type get_end() const { + return _iter->first + _iter->second; + } + + // Return the interval length. 
+ length_type get_len() const { + return _iter->second; + } + + // Preincrement + const_iterator& operator++() + { + ++_iter; + return *this; + } + + // Postincrement + const_iterator operator++(int) + { + const_iterator prev(_iter); + ++_iter; + return prev; + } + + // Predecrement + iterator& operator--() + { + --_iter; + return *this; + } + + // Postdecrement + iterator operator--(int) + { + iterator prev(_iter); + --_iter; + return prev; + } + + protected: + typename Map::const_iterator _iter; + }; + + interval_set() = default; + interval_set(Map&& other) { + m.swap(other); + for (const auto& p : m) { + _size += p.second; + } + } + + size_type num_intervals() const + { + return m.size(); + } + + iterator begin() { + return iterator(m.begin()); + } + + iterator lower_bound(T start) { + return iterator(find_inc_m(start)); + } + + iterator end() { + return iterator(m.end()); + } + + const_iterator begin() const { + return const_iterator(m.begin()); + } + + const_iterator lower_bound(T start) const { + return const_iterator(find_inc(start)); + } + + const_iterator end() const { + return const_iterator(m.end()); + } + + // helpers + private: + auto find_inc(T start) const { + auto p = m.lower_bound(start); // p->first >= start + if (p != m.begin() && + (p == m.end() || p->first > start)) { + --p; // might overlap? + if (p->first + p->second <= start) + ++p; // it doesn't. + } + return p; + } + + auto find_inc_m(T start) { + auto p = m.lower_bound(start); + if (p != m.begin() && + (p == m.end() || p->first > start)) { + --p; // might overlap? + if (p->first + p->second <= start) + ++p; // it doesn't. + } + return p; + } + + auto find_adj(T start) const { + auto p = m.lower_bound(start); + if (p != m.begin() && + (p == m.end() || p->first > start)) { + --p; // might touch? + if (p->first + p->second < start) + ++p; // it doesn't. 
+ } + return p; + } + + auto find_adj_m(T start) { + auto p = m.lower_bound(start); + if (p != m.begin() && + (p == m.end() || p->first > start)) { + --p; // might touch? + if (p->first + p->second < start) + ++p; // it doesn't. + } + return p; + } + + void intersection_size_asym(const interval_set &s, const interval_set &l) { + auto ps = s.m.begin(); + ceph_assert(ps != s.m.end()); + auto offset = ps->first; + bool first = true; + auto mi = m.begin(); + + while (1) { + if (first) + first = false; + auto pl = l.find_inc(offset); + if (pl == l.m.end()) + break; + while (ps != s.m.end() && ps->first + ps->second <= pl->first) + ++ps; + if (ps == s.m.end()) + break; + offset = pl->first + pl->second; + if (offset <= ps->first) { + offset = ps->first; + continue; + } + + if (*ps == *pl) { + do { + mi = m.insert(mi, *ps); + _size += ps->second; + ++ps; + ++pl; + } while (ps != s.m.end() && pl != l.m.end() && *ps == *pl); + if (ps == s.m.end()) + break; + offset = ps->first; + continue; + } + + auto start = std::max<T>(ps->first, pl->first); + auto en = std::min<T>(ps->first + ps->second, offset); + ceph_assert(en > start); + mi = m.emplace_hint(mi, start, en - start); + _size += mi->second; + if (ps->first + ps->second <= offset) { + ++ps; + if (ps == s.m.end()) + break; + offset = ps->first; + } + } + } + + bool subset_size_sym(const interval_set &b) const { + auto pa = m.begin(), pb = b.m.begin(); + const auto a_end = m.end(), b_end = b.m.end(); + + while (pa != a_end && pb != b_end) { + while (pb->first + pb->second <= pa->first) { + ++pb; + if (pb == b_end) + return false; + } + + if (*pa == *pb) { + do { + ++pa; + ++pb; + } while (pa != a_end && pb != b_end && *pa == *pb); + continue; + } + + // interval begins before other + if (pa->first < pb->first) + return false; + // interval is longer than other + if (pa->first + pa->second > pb->first + pb->second) + return false; + + ++pa; + } + + return pa == a_end; + } + + public: + bool operator==(const interval_set& 
other) const { + return _size == other._size && m == other.m; + } + + uint64_t size() const { + return _size; + } + + void bound_encode(size_t& p) const { + denc_traits<Map>::bound_encode(m, p); + } + void encode(ceph::buffer::list::contiguous_appender& p) const { + denc(m, p); + } + void decode(ceph::buffer::ptr::const_iterator& p) { + denc(m, p); + _size = 0; + for (const auto& p : m) { + _size += p.second; + } + } + void decode(ceph::buffer::list::iterator& p) { + denc(m, p); + _size = 0; + for (const auto& p : m) { + _size += p.second; + } + } + + void encode_nohead(ceph::buffer::list::contiguous_appender& p) const { + denc_traits<Map>::encode_nohead(m, p); + } + void decode_nohead(int n, ceph::buffer::ptr::const_iterator& p) { + denc_traits<Map>::decode_nohead(n, m, p); + _size = 0; + for (const auto& p : m) { + _size += p.second; + } + } + + void clear() { + m.clear(); + _size = 0; + } + + bool contains(T i, T *pstart=0, T *plen=0) const { + auto p = find_inc(i); + if (p == m.end()) return false; + if (p->first > i) return false; + if (p->first+p->second <= i) return false; + ceph_assert(p->first <= i && p->first+p->second > i); + if (pstart) + *pstart = p->first; + if (plen) + *plen = p->second; + return true; + } + bool contains(T start, T len) const { + auto p = find_inc(start); + if (p == m.end()) return false; + if (p->first > start) return false; + if (p->first+p->second <= start) return false; + ceph_assert(p->first <= start && p->first+p->second > start); + if (p->first+p->second < start+len) return false; + return true; + } + bool intersects(T start, T len) const { + interval_set a; + a.insert(start, len); + interval_set i; + i.intersection_of( *this, a ); + if (i.empty()) return false; + return true; + } + + // outer range of set + bool empty() const { + return m.empty(); + } + offset_type range_start() const { + ceph_assert(!empty()); + auto p = m.begin(); + return p->first; + } + offset_type range_end() const { + ceph_assert(!empty()); + auto p = 
m.rbegin(); + return p->first + p->second; + } + + // interval start after p (where p not in set) + bool starts_after(T i) const { + ceph_assert(!contains(i)); + auto p = find_inc(i); + if (p == m.end()) return false; + return true; + } + offset_type start_after(T i) const { + ceph_assert(!contains(i)); + auto p = find_inc(i); + return p->first; + } + + // interval end that contains start + offset_type end_after(T start) const { + ceph_assert(contains(start)); + auto p = find_inc(start); + return p->first+p->second; + } + + void insert(T val) { + insert(val, 1); + } + + void insert(T start, T len, T *pstart=0, T *plen=0) { + //cout << "insert " << start << "~" << len << endl; + ceph_assert(len > 0); + _size += len; + auto p = find_adj_m(start); + if (p == m.end()) { + m[start] = len; // new interval + if (pstart) + *pstart = start; + if (plen) + *plen = len; + } else { + if (p->first < start) { + + if (p->first + p->second != start) { + //cout << "p is " << p->first << "~" << p->second << ", start is " << start << ", len is " << len << endl; + ceph_abort(); + } + + p->second += len; // append to end + + auto n = p; + ++n; + if (pstart) + *pstart = p->first; + if (n != m.end() && + start+len == n->first) { // combine with next, too! 
+ p->second += n->second; + if (plen) + *plen = p->second; + m.erase(n); + } else { + if (plen) + *plen = p->second; + } + } else { + if (start+len == p->first) { + if (pstart) + *pstart = start; + if (plen) + *plen = len + p->second; + T psecond = p->second; + m.erase(p); + m[start] = len + psecond; // append to front + } else { + ceph_assert(p->first > start+len); + if (pstart) + *pstart = start; + if (plen) + *plen = len; + m[start] = len; // new interval + } + } + } + } + + void swap(interval_set& other) { + m.swap(other.m); + std::swap(_size, other._size); + } + + void erase(const iterator &i) { + _size -= i.get_len(); + m.erase(i._iter); + } + + void erase(T val) { + erase(val, 1); + } + + void erase(T start, T len, + std::function<bool(T, T)> claim = {}) { + auto p = find_inc_m(start); + + _size -= len; + + ceph_assert(p != m.end()); + ceph_assert(p->first <= start); + + T before = start - p->first; + ceph_assert(p->second >= before+len); + T after = p->second - before - len; + if (before) { + if (claim && claim(p->first, before)) { + _size -= before; + m.erase(p); + } else { + p->second = before; // shorten bit before + } + } else { + m.erase(p); + } + if (after) { + if (claim && claim(start + len, after)) { + _size -= after; + } else { + m[start + len] = after; + } + } + } + + void subtract(const interval_set &a) { + for (const auto& [start, len] : a.m) { + erase(start, len); + } + } + + void insert(const interval_set &a) { + for (const auto& [start, len] : a.m) { + insert(start, len); + } + } + + + void intersection_of(const interval_set &a, const interval_set &b) { + ceph_assert(&a != this); + ceph_assert(&b != this); + clear(); + + const interval_set *s, *l; + + if (a.size() < b.size()) { + s = &a; + l = &b; + } else { + s = &b; + l = &a; + } + + if (!s->size()) + return; + + /* + * Use the lower_bound algorithm for larger size ratios + * where it performs better, but not for smaller size + * ratios where sequential search performs better. 
+ */ + if (l->size() / s->size() >= 10) { + intersection_size_asym(*s, *l); + return; + } + + auto pa = a.m.begin(); + auto pb = b.m.begin(); + auto mi = m.begin(); + + while (pa != a.m.end() && pb != b.m.end()) { + // passing? + if (pa->first + pa->second <= pb->first) + { pa++; continue; } + if (pb->first + pb->second <= pa->first) + { pb++; continue; } + + if (*pa == *pb) { + do { + mi = m.insert(mi, *pa); + _size += pa->second; + ++pa; + ++pb; + } while (pa != a.m.end() && pb != b.m.end() && *pa == *pb); + continue; + } + + T start = std::max(pa->first, pb->first); + T en = std::min(pa->first+pa->second, pb->first+pb->second); + ceph_assert(en > start); + mi = m.emplace_hint(mi, start, en - start); + _size += mi->second; + if (pa->first+pa->second > pb->first+pb->second) + pb++; + else + pa++; + } + } + void intersection_of(const interval_set& b) { + interval_set a; + swap(a); + intersection_of(a, b); + } + + void union_of(const interval_set &a, const interval_set &b) { + ceph_assert(&a != this); + ceph_assert(&b != this); + clear(); + + //cout << "union_of" << endl; + + // a + m = a.m; + _size = a._size; + + // - (a*b) + interval_set ab; + ab.intersection_of(a, b); + subtract(ab); + + // + b + insert(b); + return; + } + void union_of(const interval_set &b) { + interval_set a; + swap(a); + union_of(a, b); + } + void union_insert(T off, T len) { + interval_set a; + a.insert(off, len); + union_of(a); + } + + bool subset_of(const interval_set &big) const { + if (!size()) + return true; + if (size() > big.size()) + return false; + if (range_end() > big.range_end()) + return false; + + /* + * Use the lower_bound algorithm for larger size ratios + * where it performs better, but not for smaller size + * ratios where sequential search performs better. 
+ */ + if (big.size() / size() < 10) + return subset_size_sym(big); + + for (const auto& [start, len] : m) { + if (!big.contains(start, len)) return false; + } + return true; + } + + /* + * build a subset of @other, starting at or after @start, and including + * @len worth of values, skipping holes. e.g., + * span_of([5~10,20~5], 8, 5) -> [8~2,20~3] + */ + void span_of(const interval_set &other, T start, T len) { + clear(); + auto p = other.find_inc(start); + if (p == other.m.end()) + return; + if (p->first < start) { + if (p->first + p->second < start) + return; + if (p->first + p->second < start + len) { + T howmuch = p->second - (start - p->first); + insert(start, howmuch); + len -= howmuch; + p++; + } else { + insert(start, len); + return; + } + } + while (p != other.m.end() && len > 0) { + if (p->second < len) { + insert(p->first, p->second); + len -= p->second; + p++; + } else { + insert(p->first, len); + return; + } + } + } + + /* + * Move contents of m into another Map. Use that instead of + * encoding interval_set into bufferlist then decoding it back into Map. + */ + Map detach() && { + return std::move(m); + } + +private: + // data + uint64_t _size = 0; + Map m; // map start -> len +}; + +// declare traits explicitly because (1) it's templatized, and (2) we +// want to include _nohead variants. 
+template<typename T, template<typename, typename, typename ...> class C> +struct denc_traits<interval_set<T, C>> { +private: + using container_t = interval_set<T, C>; +public: + static constexpr bool supported = true; + static constexpr bool bounded = false; + static constexpr bool featured = false; + static constexpr bool need_contiguous = denc_traits<T, C<T,T>>::need_contiguous; + static void bound_encode(const container_t& v, size_t& p) { + v.bound_encode(p); + } + static void encode(const container_t& v, + ceph::buffer::list::contiguous_appender& p) { + v.encode(p); + } + static void decode(container_t& v, ceph::buffer::ptr::const_iterator& p) { + v.decode(p); + } + template<typename U=T> + static typename std::enable_if<sizeof(U) && !need_contiguous>::type + decode(container_t& v, ceph::buffer::list::iterator& p) { + v.decode(p); + } + static void encode_nohead(const container_t& v, + ceph::buffer::list::contiguous_appender& p) { + v.encode_nohead(p); + } + static void decode_nohead(size_t n, container_t& v, + ceph::buffer::ptr::const_iterator& p) { + v.decode_nohead(n, p); + } +}; + + +template<typename T, template<typename, typename, typename ...> class C> +inline std::ostream& operator<<(std::ostream& out, const interval_set<T,C> &s) { + out << "["; + bool first = true; + for (const auto& [start, len] : s) { + if (!first) out << ","; + out << start << "~" << len; + first = false; + } + out << "]"; + return out; +} + + +#endif diff --git a/src/include/ipaddr.h b/src/include/ipaddr.h new file mode 100644 index 000000000..bf06cfc93 --- /dev/null +++ b/src/include/ipaddr.h @@ -0,0 +1,47 @@ +#ifndef CEPH_IPADDR_H +#define CEPH_IPADDR_H + +class entity_addr_t; + +/* + * Check if an IP address that is in the wanted subnet. 
+ */ +bool matches_ipv4_in_subnet(const struct ifaddrs& addrs, + const struct sockaddr_in* net, + unsigned int prefix_len); +bool matches_ipv6_in_subnet(const struct ifaddrs& addrs, + const struct sockaddr_in6* net, + unsigned int prefix_len); + +/* + * Validate and parse IPv4 or IPv6 network + * + * Given a network (e.g. "192.168.0.0/24") and pointers to a sockaddr_storage + * struct and an unsigned int: + * + * if the network string is valid, return true and populate sockaddr_storage + * and prefix_len; + * + * if the network string is invalid, return false. + */ +bool parse_network(const char *s, + struct sockaddr_storage *network, + unsigned int *prefix_len); +bool parse_network(const char *s, + entity_addr_t *network, + unsigned int *prefix_len); + +void netmask_ipv6(const struct in6_addr *addr, + unsigned int prefix_len, + struct in6_addr *out); + +void netmask_ipv4(const struct in_addr *addr, + unsigned int prefix_len, + struct in_addr *out); + +bool network_contains( + const struct entity_addr_t& network, + unsigned int prefix_len, + const struct entity_addr_t& addr); + +#endif diff --git a/src/include/krbd.h b/src/include/krbd.h new file mode 100644 index 000000000..977d45fe2 --- /dev/null +++ b/src/include/krbd.h @@ -0,0 +1,97 @@ +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2014 Inktank Storage, Inc. + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. + * + */ + +#ifndef CEPH_KRBD_H +#define CEPH_KRBD_H + +#include "rados/librados.h" + +/* + * Don't wait for udev add uevents in krbd_map() and udev remove + * uevents in krbd_unmap*(). Instead, make do with the respective + * kernel uevents and return as soon as they are received. 
+ * + * systemd-udevd sends out udev uevents after it finishes processing + * the respective kernel uevents, which mostly boils down to executing + * all matching udev rules. With this flag set, on return from + * krbd_map() systemd-udevd may still be poking at the device: it + * may still be open with tools such as blkid and various ioctls to + * be run against it, none of the persistent symlinks to the device + * node may be there, etc. udev used to be responsible for creating + * the device node as well, but that has been handled by devtmpfs in + * the kernel for many years now, so the device node (as returned + * through @pdevnode) is guaranteed to be there. + * + * If set, krbd_map() and krbd_unmap*() can be invoked from any + * network namespace that is owned by the initial user namespace + * (which is a formality because things like loading kernel modules + * and creating block devices are not namespaced and require global + * privileges, i.e. capabilities in the initial user namespace). + * Otherwise, krbd_map() and krbd_unmap*() must be invoked from + * the initial network namespace. + * + * If set, krbd_unmap*() doesn't attempt to settle the udev queue + * before retrying unmap for the last time. Some EBUSY errors due + * to systemd-udevd poking at the device at the time krbd_unmap*() + * is invoked that are otherwise covered by the retry logic may be + * returned. 
+ */ +#define KRBD_CTX_F_NOUDEV (1U << 0) + +#ifdef __cplusplus +extern "C" { +#endif + +struct krbd_ctx; + +int krbd_create_from_context(rados_config_t cct, uint32_t flags, + struct krbd_ctx **pctx); +void krbd_destroy(struct krbd_ctx *ctx); + +int krbd_map(struct krbd_ctx *ctx, + const char *pool_name, + const char *nspace_name, + const char *image_name, + const char *snap_name, + const char *options, + char **pdevnode); +int krbd_is_mapped(struct krbd_ctx *ctx, + const char *pool_name, + const char *nspace_name, + const char *image_name, + const char *snap_name, + char **pdevnode); + +int krbd_unmap(struct krbd_ctx *ctx, const char *devnode, + const char *options); +int krbd_unmap_by_spec(struct krbd_ctx *ctx, + const char *pool_name, + const char *nspace_name, + const char *image_name, + const char *snap_name, + const char *options); + +#ifdef __cplusplus +} +#endif + +#ifdef __cplusplus + +namespace ceph { + class Formatter; +} + +int krbd_showmapped(struct krbd_ctx *ctx, ceph::Formatter *f); + +#endif /* __cplusplus */ + +#endif /* CEPH_KRBD_H */ diff --git a/src/include/libcephsqlite.h b/src/include/libcephsqlite.h new file mode 100644 index 000000000..d81cc55e8 --- /dev/null +++ b/src/include/libcephsqlite.h @@ -0,0 +1,73 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2021 Red Hat, Inc. + * + * This is free software; you can redistribute it and/or modify it under the + * terms of the GNU Lesser General Public License version 2.1, as published by + * the Free Software Foundation. See file COPYING. + * + */ + +#ifndef LIBCEPHSQLITE_H +#define LIBCEPHSQLITE_H + +/* This loadable extension does not generally require using this header. It is + * here to allow controlling which version of the library is linked in. See + * also sqlite3_cephsqlite_init below. 
Additionally, you may specify which + * CephContext to use rather than the library instantiating its own and using + * whatever the default credential is. + */ + +#include <sqlite3.h> + +#ifdef _WIN32 +# define LIBCEPHSQLITE_API __declspec(dllexport) +#else +# define LIBCEPHSQLITE_API [[gnu::visibility("default")]] +#endif + +#ifdef __cplusplus +extern "C" { +#endif +/* This is the SQLite entry point when loaded as a dynamic library. You also + * need to ensure SQLite calls this method when using libcephsqlite as a static + * library or a dynamic library linked at compile time. For the latter case, + * you can do this by: + * + * sqlite3_auto_extension((void (*)())sqlite3_cephsqlite_init); + * sqlite3* db = nullptr; + * int rc = sqlite3_open_v2(":memory:", &db, SQLITE_OPEN_READWRITE, nullptr); + * if (rc == SQLITE_DONE) { + * sqlite3_close(db); + * } else { + * // failure + * } + * + * The throwaway database created (name == "") is a memory database opened so + * that SQLite runs the libcephsqlite initialization routine to register the + * VFS. AFter that's done, the VFS is available for a future database open with + * the VFS set to "ceph": + * + * sqlite3_open_v2("foo:bar/baz.db", &db, SQLITE_OPEN_READWRITE, "ceph"); + * + * You MUST do this before calling any other libcephsqlite routine so that + * sqlite3 can pass its API routines to the libcephsqlite extension. + */ + +LIBCEPHSQLITE_API int sqlite3_cephsqlite_init(sqlite3* db, char** err, const sqlite3_api_routines* api); + +/* If you prefer to have libcephsqlite use a CephContext managed by your + * application, use this routine to set that. libcephsqlite can only have one + * context globally. 
+ */ + +LIBCEPHSQLITE_API int cephsqlite_setcct(class CephContext* cct, char** ident); +#ifdef __cplusplus +} +#endif + +#endif diff --git a/src/include/linux_fiemap.h b/src/include/linux_fiemap.h new file mode 100644 index 000000000..36046b5cc --- /dev/null +++ b/src/include/linux_fiemap.h @@ -0,0 +1,73 @@ +/* + * FS_IOC_FIEMAP ioctl infrastructure. + * + * Some portions copyright (C) 2007 Cluster File Systems, Inc + * + * Authors: Mark Fasheh <mfasheh@suse.com> + * Kalpak Shah <kalpak.shah@sun.com> + * Andreas Dilger <adilger@sun.com> + */ +#ifndef _LINUX_FIEMAP_H +#define _LINUX_FIEMAP_H + +#if defined(__linux__) +#include <linux/types.h> +#elif defined(__FreeBSD_) +#include <sys/types.h> +#endif + +#include "include/int_types.h" + +struct fiemap_extent { + __u64 fe_logical; /* logical offset in bytes for the start of + * the extent from the beginning of the file */ + __u64 fe_physical; /* physical offset in bytes for the start + * of the extent from the beginning of the disk */ + __u64 fe_length; /* length in bytes for this extent */ + __u64 fe_reserved64[2]; + __u32 fe_flags; /* FIEMAP_EXTENT_* flags for this extent */ + __u32 fe_reserved[3]; +}; + +struct fiemap { + __u64 fm_start; /* logical offset (inclusive) at + * which to start mapping (in) */ + __u64 fm_length; /* logical length of mapping which + * userspace wants (in) */ + __u32 fm_flags; /* FIEMAP_FLAG_* flags for request (in/out) */ + __u32 fm_mapped_extents;/* number of extents that were mapped (out) */ + __u32 fm_extent_count; /* size of fm_extents array (in) */ + __u32 fm_reserved; + struct fiemap_extent fm_extents[0]; /* array of mapped extents (out) */ +}; + +#define FIEMAP_MAX_OFFSET (~0ULL) + +#define FIEMAP_FLAG_SYNC 0x00000001 /* sync file data before map */ +#define FIEMAP_FLAG_XATTR 0x00000002 /* map extended attribute tree */ + +#define FIEMAP_FLAGS_COMPAT (FIEMAP_FLAG_SYNC | FIEMAP_FLAG_XATTR) + +#define FIEMAP_EXTENT_LAST 0x00000001 /* Last extent in file. 
*/ +#define FIEMAP_EXTENT_UNKNOWN 0x00000002 /* Data location unknown. */ +#define FIEMAP_EXTENT_DELALLOC 0x00000004 /* Location still pending. + * Sets EXTENT_UNKNOWN. */ +#define FIEMAP_EXTENT_ENCODED 0x00000008 /* Data can not be read + * while fs is unmounted */ +#define FIEMAP_EXTENT_DATA_ENCRYPTED 0x00000080 /* Data is encrypted by fs. + * Sets EXTENT_NO_BYPASS. */ +#define FIEMAP_EXTENT_NOT_ALIGNED 0x00000100 /* Extent offsets may not be + * block aligned. */ +#define FIEMAP_EXTENT_DATA_INLINE 0x00000200 /* Data mixed with metadata. + * Sets EXTENT_NOT_ALIGNED.*/ +#define FIEMAP_EXTENT_DATA_TAIL 0x00000400 /* Multiple files in block. + * Sets EXTENT_NOT_ALIGNED.*/ +#define FIEMAP_EXTENT_UNWRITTEN 0x00000800 /* Space allocated, but + * no data (i.e. zero). */ +#define FIEMAP_EXTENT_MERGED 0x00001000 /* File does not natively + * support extents. Result + * merged for efficiency. */ +#define FIEMAP_EXTENT_SHARED 0x00002000 /* Space shared with other + * files. */ + +#endif /* _LINUX_FIEMAP_H */ diff --git a/src/include/lru.h b/src/include/lru.h new file mode 100644 index 000000000..3f5069ee3 --- /dev/null +++ b/src/include/lru.h @@ -0,0 +1,241 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net> + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. 
+ * + */ + + + +#ifndef CEPH_LRU_H +#define CEPH_LRU_H + +#include <math.h> +#include <stdint.h> + +#include "common/config.h" +#include "xlist.h" + +class LRUObject { +public: + LRUObject() : lru_link(this) {} + virtual ~LRUObject(); + + // pin/unpin item in cache + void lru_pin(); + void lru_unpin(); + bool lru_is_expireable() const { return !lru_pinned; } + + friend class LRU; +private: + class LRU *lru{}; + xlist<LRUObject *>::item lru_link; + bool lru_pinned = false; +}; + +class LRU { +public: + uint64_t lru_get_size() const { return lru_get_top()+lru_get_bot()+lru_get_pintail(); } + uint64_t lru_get_top() const { return top.size(); } + uint64_t lru_get_bot() const{ return bottom.size(); } + uint64_t lru_get_pintail() const { return pintail.size(); } + uint64_t lru_get_num_pinned() const { return num_pinned; } + + void lru_set_midpoint(double f) { midpoint = fmin(1.0, fmax(0.0, f)); } + + void lru_clear() { + while (!top.empty()) { + lru_remove(top.front()); + } + while (!bottom.empty()) { + lru_remove(bottom.front()); + } + while (!pintail.empty()) { + lru_remove(pintail.front()); + } + ceph_assert(num_pinned == 0); + } + + // insert at top of lru + void lru_insert_top(LRUObject *o) { + ceph_assert(!o->lru); + o->lru = this; + top.push_front(&o->lru_link); + if (o->lru_pinned) num_pinned++; + adjust(); + } + + // insert at mid point in lru + void lru_insert_mid(LRUObject *o) { + ceph_assert(!o->lru); + o->lru = this; + bottom.push_front(&o->lru_link); + if (o->lru_pinned) num_pinned++; + adjust(); + } + + // insert at bottom of lru + void lru_insert_bot(LRUObject *o) { + ceph_assert(!o->lru); + o->lru = this; + bottom.push_back(&o->lru_link); + if (o->lru_pinned) num_pinned++; + adjust(); + } + + // remove an item + LRUObject *lru_remove(LRUObject *o) { + if (!o->lru) return o; + auto list = o->lru_link.get_list(); + ceph_assert(list == &top || list == &bottom || list == &pintail); + o->lru_link.remove_myself(); + if (o->lru_pinned) num_pinned--; + o->lru = 
nullptr; + adjust(); + return o; + } + + // touch item -- move to head of lru + bool lru_touch(LRUObject *o) { + if (!o->lru) { + lru_insert_top(o); + } else { + ceph_assert(o->lru == this); + auto list = o->lru_link.get_list(); + ceph_assert(list == &top || list == &bottom || list == &pintail); + top.push_front(&o->lru_link); + adjust(); + } + return true; + } + + // touch item -- move to midpoint (unless already higher) + bool lru_midtouch(LRUObject *o) { + if (!o->lru) { + lru_insert_mid(o); + } else { + ceph_assert(o->lru == this); + auto list = o->lru_link.get_list(); + ceph_assert(list == &top || list == &bottom || list == &pintail); + if (list == &top) return false; + bottom.push_front(&o->lru_link); + adjust(); + } + return true; + } + + // touch item -- move to bottom + bool lru_bottouch(LRUObject *o) { + if (!o->lru) { + lru_insert_bot(o); + } else { + ceph_assert(o->lru == this); + auto list = o->lru_link.get_list(); + ceph_assert(list == &top || list == &bottom || list == &pintail); + bottom.push_back(&o->lru_link); + adjust(); + } + return true; + } + + void lru_touch_entire_pintail() { + // promote entire pintail to the top lru + while (pintail.size() > 0) { + top.push_back(&pintail.front()->lru_link); + adjust(); + } + } + + // expire -- expire a single item + LRUObject *lru_get_next_expire() { + adjust(); + // look through tail of bot + while (bottom.size()) { + LRUObject *p = bottom.back(); + if (!p->lru_pinned) return p; + + // move to pintail + pintail.push_front(&p->lru_link); + } + + // ok, try head then + while (top.size()) { + LRUObject *p = top.back(); + if (!p->lru_pinned) return p; + + // move to pintail + pintail.push_front(&p->lru_link); + } + + // no luck! 
+ return NULL; + } + + LRUObject *lru_expire() { + LRUObject *p = lru_get_next_expire(); + if (p) + return lru_remove(p); + return NULL; + } + + void lru_status() { + //generic_dout(10) << "lru: " << lru_get_size() << " items, " << top.size() << " top, " << bottom.size() << " bot, " << pintail.size() << " pintail" << dendl; + } + +protected: + // adjust top/bot balance, as necessary + void adjust() { + uint64_t toplen = top.size(); + uint64_t topwant = (midpoint * (double)(lru_get_size() - num_pinned)); + /* move items from below midpoint (bottom) to top: move midpoint forward */ + for (uint64_t i = toplen; i < topwant; i++) { + top.push_back(&bottom.front()->lru_link); + } + /* or: move items from above midpoint (top) to bottom: move midpoint backwards */ + for (uint64_t i = toplen; i > topwant; i--) { + bottom.push_front(&top.back()->lru_link); + } + } + + uint64_t num_pinned = 0; + double midpoint = 0.6; + + friend class LRUObject; +private: + using LRUList = xlist<LRUObject*>; + LRUList top, bottom, pintail; +}; + +inline LRUObject::~LRUObject() { + if (lru) { + lru->lru_remove(this); + } +} + +inline void LRUObject::lru_pin() { + if (lru && !lru_pinned) { + lru->num_pinned++; + } + lru_pinned = true; +} + +inline void LRUObject::lru_unpin() { + if (lru && lru_pinned) { + lru->num_pinned--; + + // move from pintail -> bot + if (lru_link.get_list() == &lru->pintail) { + lru->lru_bottouch(this); + } + } + lru_pinned = false; +} + +#endif diff --git a/src/include/mempool.h b/src/include/mempool.h new file mode 100644 index 000000000..076c62afe --- /dev/null +++ b/src/include/mempool.h @@ -0,0 +1,557 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2016 Allen Samuels <allen.samuels@sandisk.com> + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, 
as published by the Free Software + * Foundation. See file COPYING. + * + */ + +#ifndef _CEPH_INCLUDE_MEMPOOL_H +#define _CEPH_INCLUDE_MEMPOOL_H + +#include <cstddef> +#include <map> +#include <unordered_map> +#include <set> +#include <vector> +#include <list> +#include <mutex> +#include <typeinfo> +#include <boost/container/flat_set.hpp> +#include <boost/container/flat_map.hpp> + +#include "common/Formatter.h" +#include "common/ceph_atomic.h" +#include "include/ceph_assert.h" +#include "include/compact_map.h" +#include "include/compact_set.h" +#include "include/compat.h" + + +/* + +Memory Pools +============ + +A memory pool is a method for accounting the consumption of memory of +a set of containers. + +Memory pools are statically declared (see pool_index_t). + +Each memory pool tracks the number of bytes and items it contains. + +Allocators can be declared and associated with a type so that they are +tracked independently of the pool total. This additional accounting +is optional and only incurs an overhead if the debugging is enabled at +runtime. This allows developers to see what types are consuming the +pool resources. + + +Declaring +--------- + +Using memory pools is very easy. + +To create a new memory pool, simply add a new name into the list of +memory pools that's defined in "DEFINE_MEMORY_POOLS_HELPER". That's +it. :) + +For each memory pool that's created a C++ namespace is also +automatically created (name is same as in DEFINE_MEMORY_POOLS_HELPER). +That namespace contains a set of common STL containers that are predefined +with the appropriate allocators. + +Thus for mempool "osd" we have automatically available to us: + + mempool::osd::map + mempool::osd::multimap + mempool::osd::set + mempool::osd::multiset + mempool::osd::list + mempool::osd::vector + mempool::osd::unordered_map + + +Putting objects in a mempool +---------------------------- + +In order to use a memory pool with a particular type, a few additional +declarations are needed. 
+ +For a class: + + struct Foo { + MEMPOOL_CLASS_HELPERS(); + ... + }; + +Then, in an appropriate .cc file, + + MEMPOOL_DEFINE_OBJECT_FACTORY(Foo, foo, osd); + +The second argument can generally be identical to the first, except +when the type contains a nested scope. For example, for +BlueStore::Onode, we need to do + + MEMPOOL_DEFINE_OBJECT_FACTORY(BlueStore::Onode, bluestore_onode, + bluestore_meta); + +(This is just because we need to name some static variables and we +can't use :: in a variable name.) + +XXX Note: the new operator hard-codes the allocation size to the size of the +object given in MEMPOOL_DEFINE_OBJECT_FACTORY. For this reason, you cannot +incorporate mempools into a base class without also defining a helper/factory +for the child class as well (as the base class is usually smaller than the +child class). + +In order to use the STL containers, simply use the namespaced variant +of the container type. For example, + + mempool::osd::map<int> myvec; + +Introspection +------------- + +The simplest way to interrogate the process is with + + Formater *f = ... + mempool::dump(f); + +This will dump information about *all* memory pools. When debug mode +is enabled, the runtime complexity of dump is O(num_shards * +num_types). When debug name is disabled it is O(num_shards). + +You can also interrogate a specific pool programmatically with + + size_t bytes = mempool::unittest_2::allocated_bytes(); + size_t items = mempool::unittest_2::allocated_items(); + +The runtime complexity is O(num_shards). + +Note that you cannot easily query per-type, primarily because debug +mode is optional and you should not rely on that information being +available. 
+ +*/ + +namespace mempool { + +// -------------------------------------------------------------- +// define memory pools + +#define DEFINE_MEMORY_POOLS_HELPER(f) \ + f(bloom_filter) \ + f(bluestore_alloc) \ + f(bluestore_cache_data) \ + f(bluestore_cache_onode) \ + f(bluestore_cache_meta) \ + f(bluestore_cache_other) \ + f(bluestore_cache_buffer) \ + f(bluestore_extent) \ + f(bluestore_blob) \ + f(bluestore_shared_blob) \ + f(bluestore_inline_bl) \ + f(bluestore_fsck) \ + f(bluestore_txc) \ + f(bluestore_writing_deferred) \ + f(bluestore_writing) \ + f(bluefs) \ + f(bluefs_file_reader) \ + f(bluefs_file_writer) \ + f(buffer_anon) \ + f(buffer_meta) \ + f(osd) \ + f(osd_mapbl) \ + f(osd_pglog) \ + f(osdmap) \ + f(osdmap_mapping) \ + f(pgmap) \ + f(mds_co) \ + f(unittest_1) \ + f(unittest_2) + + +// give them integer ids +#define P(x) mempool_##x, +enum pool_index_t { + DEFINE_MEMORY_POOLS_HELPER(P) + num_pools // Must be last. +}; +#undef P + +extern bool debug_mode; +extern void set_debug_mode(bool d); + +// -------------------------------------------------------------- +class pool_t; + +// we shard pool stats across many shard_t's to reduce the amount +// of cacheline ping pong. +enum { + num_shard_bits = 5 +}; +enum { + num_shards = 1 << num_shard_bits +}; + +// +// Align shard to a cacheline. +// +// It would be possible to retrieve the value at runtime (for instance +// with getconf LEVEL1_DCACHE_LINESIZE or grep -m1 cache_alignment +// /proc/cpuinfo). It is easier to hard code the largest cache +// linesize for all known processors (128 bytes). If the actual cache +// linesize is smaller on a given processor, it will just waste a few +// bytes. 
+// +struct shard_t { + ceph::atomic<size_t> bytes = {0}; + ceph::atomic<size_t> items = {0}; + char __padding[128 - sizeof(ceph::atomic<size_t>)*2]; +} __attribute__ ((aligned (128))); + +static_assert(sizeof(shard_t) == 128, "shard_t should be cacheline-sized"); + +struct stats_t { + ssize_t items = 0; + ssize_t bytes = 0; + void dump(ceph::Formatter *f) const { + f->dump_int("items", items); + f->dump_int("bytes", bytes); + } + + stats_t& operator+=(const stats_t& o) { + items += o.items; + bytes += o.bytes; + return *this; + } +}; + +pool_t& get_pool(pool_index_t ix); +const char *get_pool_name(pool_index_t ix); + +struct type_t { + const char *type_name; + size_t item_size; + ceph::atomic<ssize_t> items = {0}; // signed +}; + +struct type_info_hash { + std::size_t operator()(const std::type_info& k) const { + return k.hash_code(); + } +}; + +class pool_t { + shard_t shard[num_shards]; + + mutable std::mutex lock; // only used for types list + std::unordered_map<const char *, type_t> type_map; + +public: + // + // How much this pool consumes. O(<num_shards>) + // + size_t allocated_bytes() const; + size_t allocated_items() const; + + void adjust_count(ssize_t items, ssize_t bytes); + + static size_t pick_a_shard_int() { + // Dirt cheap, see: + // https://fossies.org/dox/glibc-2.32/pthread__self_8c_source.html + size_t me = (size_t)pthread_self(); + size_t i = (me >> CEPH_PAGE_SHIFT) & ((1 << num_shard_bits) - 1); + return i; + } + + shard_t* pick_a_shard() { + size_t i = pick_a_shard_int(); + return &shard[i]; + } + + type_t *get_type(const std::type_info& ti, size_t size) { + std::lock_guard<std::mutex> l(lock); + auto p = type_map.find(ti.name()); + if (p != type_map.end()) { + return &p->second; + } + type_t &t = type_map[ti.name()]; + t.type_name = ti.name(); + t.item_size = size; + return &t; + } + + // get pool stats. 
by_type is not populated if !debug + void get_stats(stats_t *total, + std::map<std::string, stats_t> *by_type) const; + + void dump(ceph::Formatter *f, stats_t *ptotal=0) const; +}; + +void dump(ceph::Formatter *f); + + +// STL allocator for use with containers. All actual state +// is stored in the static pool_allocator_base_t, which saves us from +// passing the allocator to container constructors. + +template<pool_index_t pool_ix, typename T> +class pool_allocator { + pool_t *pool; + type_t *type = nullptr; + +public: + typedef pool_allocator<pool_ix, T> allocator_type; + typedef T value_type; + typedef value_type *pointer; + typedef const value_type * const_pointer; + typedef value_type& reference; + typedef const value_type& const_reference; + typedef std::size_t size_type; + typedef std::ptrdiff_t difference_type; + + template<typename U> struct rebind { + typedef pool_allocator<pool_ix,U> other; + }; + + void init(bool force_register) { + pool = &get_pool(pool_ix); + if (debug_mode || force_register) { + type = pool->get_type(typeid(T), sizeof(T)); + } + } + + pool_allocator(bool force_register=false) { + init(force_register); + } + template<typename U> + pool_allocator(const pool_allocator<pool_ix,U>&) { + init(false); + } + + T* allocate(size_t n, void *p = nullptr) { + size_t total = sizeof(T) * n; + shard_t *shard = pool->pick_a_shard(); + shard->bytes += total; + shard->items += n; + if (type) { + type->items += n; + } + T* r = reinterpret_cast<T*>(new char[total]); + return r; + } + + void deallocate(T* p, size_t n) { + size_t total = sizeof(T) * n; + shard_t *shard = pool->pick_a_shard(); + shard->bytes -= total; + shard->items -= n; + if (type) { + type->items -= n; + } + delete[] reinterpret_cast<char*>(p); + } + + T* allocate_aligned(size_t n, size_t align, void *p = nullptr) { + size_t total = sizeof(T) * n; + shard_t *shard = pool->pick_a_shard(); + shard->bytes += total; + shard->items += n; + if (type) { + type->items += n; + } + char *ptr; + 
int rc = ::posix_memalign((void**)(void*)&ptr, align, total); + if (rc) + throw std::bad_alloc(); + T* r = reinterpret_cast<T*>(ptr); + return r; + } + + void deallocate_aligned(T* p, size_t n) { + size_t total = sizeof(T) * n; + shard_t *shard = pool->pick_a_shard(); + shard->bytes -= total; + shard->items -= n; + if (type) { + type->items -= n; + } + aligned_free(p); + } + + void destroy(T* p) { + p->~T(); + } + + template<class U> + void destroy(U *p) { + p->~U(); + } + + void construct(T* p, const T& val) { + ::new ((void *)p) T(val); + } + + template<class U, class... Args> void construct(U* p,Args&&... args) { + ::new((void *)p) U(std::forward<Args>(args)...); + } + + bool operator==(const pool_allocator&) const { return true; } + bool operator!=(const pool_allocator&) const { return false; } +}; + + +// Namespace mempool + +#define P(x) \ + namespace x { \ + static const mempool::pool_index_t id = mempool::mempool_##x; \ + template<typename v> \ + using pool_allocator = mempool::pool_allocator<id,v>; \ + \ + using string = std::basic_string<char,std::char_traits<char>, \ + pool_allocator<char>>; \ + \ + template<typename k,typename v, typename cmp = std::less<k> > \ + using map = std::map<k, v, cmp, \ + pool_allocator<std::pair<const k,v>>>; \ + \ + template<typename k,typename v, typename cmp = std::less<k> > \ + using compact_map = compact_map<k, v, cmp, \ + pool_allocator<std::pair<const k,v>>>; \ + \ + template<typename k,typename v, typename cmp = std::less<k> > \ + using compact_multimap = compact_multimap<k, v, cmp, \ + pool_allocator<std::pair<const k,v>>>; \ + \ + template<typename k, typename cmp = std::less<k> > \ + using compact_set = compact_set<k, cmp, pool_allocator<k>>; \ + \ + template<typename k,typename v, typename cmp = std::less<k> > \ + using multimap = std::multimap<k,v,cmp, \ + pool_allocator<std::pair<const k, \ + v>>>; \ + \ + template<typename k, typename cmp = std::less<k> > \ + using set = std::set<k,cmp,pool_allocator<k>>; \ + \ 
+ template<typename k, typename cmp = std::less<k> > \ + using flat_set = boost::container::flat_set<k,cmp,pool_allocator<k>>; \ + \ + template<typename k, typename v, typename cmp = std::less<k> > \ + using flat_map = boost::container::flat_map<k,v,cmp, \ + pool_allocator<std::pair<k,v>>>; \ + \ + template<typename v> \ + using list = std::list<v,pool_allocator<v>>; \ + \ + template<typename v> \ + using vector = std::vector<v,pool_allocator<v>>; \ + \ + template<typename k, typename v, \ + typename h=std::hash<k>, \ + typename eq = std::equal_to<k>> \ + using unordered_map = \ + std::unordered_map<k,v,h,eq,pool_allocator<std::pair<const k,v>>>;\ + \ + inline size_t allocated_bytes() { \ + return mempool::get_pool(id).allocated_bytes(); \ + } \ + inline size_t allocated_items() { \ + return mempool::get_pool(id).allocated_items(); \ + } \ + }; + +DEFINE_MEMORY_POOLS_HELPER(P) + +#undef P + +}; + +// the elements allocated by mempool is in the same memory space as the ones +// allocated by the default allocator. so compare them in an efficient way: +// libstdc++'s std::equal is specialized to use memcmp if T is integer or +// pointer. this is good enough for our usecase. use +// std::is_trivially_copyable<T> to expand the support to more types if +// nececssary. 
+template<typename T, mempool::pool_index_t pool_index> +bool operator==(const std::vector<T, std::allocator<T>>& lhs, + const std::vector<T, mempool::pool_allocator<pool_index, T>>& rhs) +{ + return (lhs.size() == rhs.size() && + std::equal(lhs.begin(), lhs.end(), rhs.begin())); +} + +template<typename T, mempool::pool_index_t pool_index> +bool operator!=(const std::vector<T, std::allocator<T>>& lhs, + const std::vector<T, mempool::pool_allocator<pool_index, T>>& rhs) +{ + return !(lhs == rhs); +} + +template<typename T, mempool::pool_index_t pool_index> +bool operator==(const std::vector<T, mempool::pool_allocator<pool_index, T>>& lhs, + const std::vector<T, std::allocator<T>>& rhs) +{ + return rhs == lhs; +} + +template<typename T, mempool::pool_index_t pool_index> +bool operator!=(const std::vector<T, mempool::pool_allocator<pool_index, T>>& lhs, + const std::vector<T, std::allocator<T>>& rhs) +{ + return !(lhs == rhs); +} + +// Use this for any type that is contained by a container (unless it +// is a class you defined; see below). +#define MEMPOOL_DECLARE_FACTORY(obj, factoryname, pool) \ + namespace mempool { \ + namespace pool { \ + extern pool_allocator<obj> alloc_##factoryname; \ + } \ + } + +#define MEMPOOL_DEFINE_FACTORY(obj, factoryname, pool) \ + namespace mempool { \ + namespace pool { \ + pool_allocator<obj> alloc_##factoryname = {true}; \ + } \ + } + +// Use this for each class that belongs to a mempool. For example, +// +// class T { +// MEMPOOL_CLASS_HELPERS(); +// ... +// }; +// +#define MEMPOOL_CLASS_HELPERS() \ + void *operator new(size_t size); \ + void *operator new[](size_t size) noexcept { \ + ceph_abort_msg("no array new"); \ + return nullptr; } \ + void operator delete(void *); \ + void operator delete[](void *) { ceph_abort_msg("no array delete"); } + + +// Use this in some particular .cc file to match each class with a +// MEMPOOL_CLASS_HELPERS(). 
+#define MEMPOOL_DEFINE_OBJECT_FACTORY(obj,factoryname,pool) \ + MEMPOOL_DEFINE_FACTORY(obj, factoryname, pool) \ + void *obj::operator new(size_t size) { \ + return mempool::pool::alloc_##factoryname.allocate(1); \ + } \ + void obj::operator delete(void *p) { \ + return mempool::pool::alloc_##factoryname.deallocate((obj*)p, 1); \ + } + +#endif diff --git a/src/include/msgr.h b/src/include/msgr.h new file mode 100644 index 000000000..c8ad48ad1 --- /dev/null +++ b/src/include/msgr.h @@ -0,0 +1,255 @@ +#ifndef CEPH_MSGR_H +#define CEPH_MSGR_H + +#ifndef __KERNEL__ +#include <sys/socket.h> // for struct sockaddr_storage +#endif + +#include "include/int_types.h" + +/* See comment in ceph_fs.h. */ +#ifndef __KERNEL__ +#include "byteorder.h" +#define __le16 ceph_le16 +#define __le32 ceph_le32 +#define __le64 ceph_le64 +#endif + +/* + * Data types for message passing layer used by Ceph. + */ + +#define CEPH_MON_PORT_LEGACY 6789 /* legacy default monitor port */ +#define CEPH_MON_PORT_IANA 3300 /* IANA monitor port */ + +/* + * tcp connection banner. include a protocol version. and adjust + * whenever the wire protocol changes. try to keep this string length + * constant. + */ +#define CEPH_BANNER "ceph v027" + + +/* + * messenger V2 connection banner prefix. + * The full banner string should have the form: "ceph v2\n<le16>" + * the 2 bytes are the length of the remaining banner. 
+ */ +#define CEPH_BANNER_V2_PREFIX "ceph v2\n" + +/* + * messenger V2 features + */ +#define CEPH_MSGR2_INCARNATION_1 (0ull) + +#define DEFINE_MSGR2_FEATURE(bit, incarnation, name) \ + const static uint64_t CEPH_MSGR2_FEATURE_##name = (1ULL << bit); \ + const static uint64_t CEPH_MSGR2_FEATUREMASK_##name = \ + (1ULL << bit | CEPH_MSGR2_INCARNATION_##incarnation); + +#define HAVE_MSGR2_FEATURE(x, name) \ + (((x) & (CEPH_MSGR2_FEATUREMASK_##name)) == (CEPH_MSGR2_FEATUREMASK_##name)) + +DEFINE_MSGR2_FEATURE(0, 1, REVISION_1) // msgr2.1 +DEFINE_MSGR2_FEATURE(1, 1, COMPRESSION) // on-wire compression + +/* + * Features supported. Should be everything above. + */ +#define CEPH_MSGR2_SUPPORTED_FEATURES \ + (CEPH_MSGR2_FEATURE_REVISION_1 | \ + CEPH_MSGR2_FEATURE_COMPRESSION | \ + 0ULL) + +#define CEPH_MSGR2_REQUIRED_FEATURES (0ULL) + + + +/* + * Rollover-safe type and comparator for 32-bit sequence numbers. + * Comparator returns -1, 0, or 1. + */ +typedef __u32 ceph_seq_t; + +static inline __s32 ceph_seq_cmp(__u32 a, __u32 b) +{ + return (__s32)a - (__s32)b; +} + + +/* + * entity_name -- logical name for a process participating in the + * network, e.g. 'mds0' or 'osd3'. + */ +struct ceph_entity_name { + __u8 type; /* CEPH_ENTITY_TYPE_* */ + __le64 num; +} __attribute__ ((packed)); + +#define CEPH_ENTITY_TYPE_MON 0x01 +#define CEPH_ENTITY_TYPE_MDS 0x02 +#define CEPH_ENTITY_TYPE_OSD 0x04 +#define CEPH_ENTITY_TYPE_CLIENT 0x08 +#define CEPH_ENTITY_TYPE_MGR 0x10 +#define CEPH_ENTITY_TYPE_AUTH 0x20 + +#define CEPH_ENTITY_TYPE_ANY 0xFF + +extern const char *ceph_entity_type_name(int type); + +/* + * entity_addr -- network address + */ +struct ceph_entity_addr { + __le32 type; + __le32 nonce; /* unique id for process (e.g. 
pid) */ + struct sockaddr_storage in_addr; +} __attribute__ ((packed)); + +struct ceph_entity_inst { + struct ceph_entity_name name; + struct ceph_entity_addr addr; +} __attribute__ ((packed)); + + +/* used by message exchange protocol */ +#define CEPH_MSGR_TAG_READY 1 /* server->client: ready for messages */ +#define CEPH_MSGR_TAG_RESETSESSION 2 /* server->client: reset, try again */ +#define CEPH_MSGR_TAG_WAIT 3 /* server->client: wait for racing + incoming connection */ +#define CEPH_MSGR_TAG_RETRY_SESSION 4 /* server->client + cseq: try again + with higher cseq */ +#define CEPH_MSGR_TAG_RETRY_GLOBAL 5 /* server->client + gseq: try again + with higher gseq */ +#define CEPH_MSGR_TAG_CLOSE 6 /* closing pipe */ +#define CEPH_MSGR_TAG_MSG 7 /* message */ +#define CEPH_MSGR_TAG_ACK 8 /* message ack */ +#define CEPH_MSGR_TAG_KEEPALIVE 9 /* just a keepalive byte! */ +#define CEPH_MSGR_TAG_BADPROTOVER 10 /* bad protocol version */ +#define CEPH_MSGR_TAG_BADAUTHORIZER 11 /* bad authorizer */ +#define CEPH_MSGR_TAG_FEATURES 12 /* insufficient features */ +#define CEPH_MSGR_TAG_SEQ 13 /* 64-bit int follows with seen seq number */ +#define CEPH_MSGR_TAG_KEEPALIVE2 14 +#define CEPH_MSGR_TAG_KEEPALIVE2_ACK 15 /* keepalive reply */ +#define CEPH_MSGR_TAG_CHALLENGE_AUTHORIZER 16 /* ceph v2 doing server challenge */ + +/* + * connection negotiation + */ +struct ceph_msg_connect { + __le64 features; /* supported feature bits */ + __le32 host_type; /* CEPH_ENTITY_TYPE_* */ + __le32 global_seq; /* count connections initiated by this host */ + __le32 connect_seq; /* count connections initiated in this session */ + __le32 protocol_version; + __le32 authorizer_protocol; + __le32 authorizer_len; + __u8 flags; /* CEPH_MSG_CONNECT_* */ +} __attribute__ ((packed)); + +struct ceph_msg_connect_reply { + __u8 tag; + __le64 features; /* feature bits for this session */ + __le32 global_seq; + __le32 connect_seq; + __le32 protocol_version; + __le32 authorizer_len; + __u8 flags; +} __attribute__ 
((packed)); + +#define CEPH_MSG_CONNECT_LOSSY 1 /* messages i send may be safely dropped */ + + +/* + * message header + */ +struct ceph_msg_header_old { + __le64 seq; /* message seq# for this session */ + __le64 tid; /* transaction id */ + __le16 type; /* message type */ + __le16 priority; /* priority. higher value == higher priority */ + __le16 version; /* version of message encoding */ + + __le32 front_len; /* bytes in main payload */ + __le32 middle_len;/* bytes in middle payload */ + __le32 data_len; /* bytes of data payload */ + __le16 data_off; /* sender: include full offset; + receiver: mask against ~PAGE_MASK */ + + struct ceph_entity_inst src, orig_src; + __le32 reserved; + __le32 crc; /* header crc32c */ +} __attribute__ ((packed)); + +struct ceph_msg_header { + __le64 seq; /* message seq# for this session */ + __le64 tid; /* transaction id */ + __le16 type; /* message type */ + __le16 priority; /* priority. higher value == higher priority */ + __le16 version; /* version of message encoding */ + + __le32 front_len; /* bytes in main payload */ + __le32 middle_len;/* bytes in middle payload */ + __le32 data_len; /* bytes of data payload */ + __le16 data_off; /* sender: include full offset; + receiver: mask against ~PAGE_MASK */ + + struct ceph_entity_name src; + + /* oldest code we think can decode this. unknown if zero. */ + __le16 compat_version; + __le16 reserved; + __le32 crc; /* header crc32c */ +} __attribute__ ((packed)); + +struct ceph_msg_header2 { + __le64 seq; /* message seq# for this session */ + __le64 tid; /* transaction id */ + __le16 type; /* message type */ + __le16 priority; /* priority. higher value == higher priority */ + __le16 version; /* version of message encoding */ + + __le32 data_pre_padding_len; + __le16 data_off; /* sender: include full offset; + receiver: mask against ~PAGE_MASK */ + + __le64 ack_seq; + __u8 flags; + /* oldest code we think can decode this. unknown if zero. 
*/ + __le16 compat_version; + __le16 reserved; +} __attribute__ ((packed)); + +#define CEPH_MSG_PRIO_LOW 64 +#define CEPH_MSG_PRIO_DEFAULT 127 +#define CEPH_MSG_PRIO_HIGH 196 +#define CEPH_MSG_PRIO_HIGHEST 255 + +/* + * follows data payload + * ceph_msg_footer_old does not support digital signatures on messages PLR + */ + +struct ceph_msg_footer_old { + __le32 front_crc, middle_crc, data_crc; + __u8 flags; +} __attribute__ ((packed)); + +struct ceph_msg_footer { + __le32 front_crc, middle_crc, data_crc; + // sig holds the 64 bits of the digital signature for the message PLR + __le64 sig; + __u8 flags; +} __attribute__ ((packed)); + +#define CEPH_MSG_FOOTER_COMPLETE (1<<0) /* msg wasn't aborted */ +#define CEPH_MSG_FOOTER_NOCRC (1<<1) /* no data crc */ +#define CEPH_MSG_FOOTER_SIGNED (1<<2) /* msg was signed */ + +#ifndef __KERNEL__ +#undef __le16 +#undef __le32 +#undef __le64 +#endif + +#endif diff --git a/src/include/neorados/RADOS.hpp b/src/include/neorados/RADOS.hpp new file mode 100644 index 000000000..fa1ac92ae --- /dev/null +++ b/src/include/neorados/RADOS.hpp @@ -0,0 +1,1150 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2018 Red Hat <contact@redhat.com> + * Author: Adam C. Emerson + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. + * + */ + +#ifndef NEORADOS_RADOS_HPP +#define NEORADOS_RADOS_HPP + +#include <cstddef> +#include <memory> +#include <tuple> +#include <string> +#include <string_view> +#include <type_traits> +#include <variant> + +#include <boost/asio.hpp> + +#include <boost/container/flat_map.hpp> +#include <boost/container/flat_set.hpp> +#include <boost/uuid/uuid.hpp> + +#include <boost/system/error_code.hpp> + +// Will be in C++20! 
+ +#include "include/expected.hpp" + +// Had better be in C++20. Why is this not in Boost? + +#include "include/function2.hpp" + +// Things broken out so we can decode them in Objecter. + +#include "include/neorados/RADOS_Decodable.hpp" + +// Needed for type erasure and template support. We can't really avoid +// it. + +#include "common/async/completion.h" + +// These are needed for RGW, but in general as a 'shiny new interface' +// we should try to use forward declarations and provide standard alternatives. + +#include "include/common_fwd.h" + +#include "include/buffer.h" +#include "include/rados/librados_fwd.hpp" + +#include "common/ceph_time.h" + +namespace neorados { +class Object; +class IOContext; +} +namespace std { +template<> +struct hash<neorados::Object>; +template<> +struct hash<neorados::IOContext>; +} + +namespace neorados { +namespace detail { +class Client; +} + +class RADOS; + +// Exists mostly so that repeated operations on the same object don't +// have to pay for the string copy to construct an object_t. 
+ +class Object final { + friend RADOS; + friend std::hash<Object>; + +public: + Object(); + Object(const char* s); + Object(std::string_view s); + Object(std::string&& s); + Object(const std::string& s); + ~Object(); + + Object(const Object& o); + Object& operator =(const Object& o); + + Object(Object&& o); + Object& operator =(Object&& o); + + operator std::string_view() const; + + friend std::ostream& operator <<(std::ostream& m, const Object& o); + friend bool operator <(const Object& lhs, const Object& rhs); + friend bool operator <=(const Object& lhs, const Object& rhs); + friend bool operator >=(const Object& lhs, const Object& rhs); + friend bool operator >(const Object& lhs, const Object& rhs); + + friend bool operator ==(const Object& lhs, const Object& rhs); + friend bool operator !=(const Object& lhs, const Object& rhs); + +private: + + static constexpr std::size_t impl_size = 4 * 8; + std::aligned_storage_t<impl_size> impl; +}; + +// Not the same as the librados::IoCtx, but it does gather together +// some of the same metadata. Since we're likely to do multiple +// operations in the same pool or namespace, it doesn't make sense to +// redo a bunch of lookups and string copies. 

class IOContext final {
  friend RADOS;                // RADOS reads the underlying state
  friend std::hash<IOContext>;

public:

  IOContext();
  explicit IOContext(std::int64_t pool);
  IOContext(std::int64_t _pool, std::string_view _ns);
  IOContext(std::int64_t _pool, std::string&& _ns);
  ~IOContext();

  IOContext(const IOContext& rhs);
  IOContext& operator =(const IOContext& rhs);

  IOContext(IOContext&& rhs);
  IOContext& operator =(IOContext&& rhs);

  // Pool ID addressed by this context.
  std::int64_t pool() const;
  void pool(std::int64_t _pool);

  // RADOS namespace within the pool.
  std::string_view ns() const;
  void ns(std::string_view _ns);
  void ns(std::string&& _ns);

  // Optional object-locator key (presumably overrides placement the
  // way librados locator keys do -- confirm in the implementation).
  std::optional<std::string_view> key() const;
  void key(std::string_view _key);
  void key(std::string&& _key);
  void clear_key();

  // Optional explicit placement hash.
  std::optional<std::int64_t> hash() const;
  void hash(std::int64_t _hash);
  void clear_hash();

  // Snapshot to read from, if reading from a snap rather than head.
  std::optional<std::uint64_t> read_snap() const;
  void read_snap(std::optional<std::uint64_t> _snapid);

  // Snapshot context applied to writes: (seq, snaps).
  // I can't actually move-construct here since snapid_t is its own
  // separate class type, not an alias.
  std::optional<
    std::pair<std::uint64_t,
              std::vector<std::uint64_t>>> write_snap_context() const;
  void write_snap_context(std::optional<
                          std::pair<std::uint64_t,
                                    std::vector<std::uint64_t>>> snapc);

  bool full_try() const;
  void full_try(bool _full_try);

  friend std::ostream& operator <<(std::ostream& m, const IOContext& o);
  friend bool operator <(const IOContext& lhs, const IOContext& rhs);
  friend bool operator <=(const IOContext& lhs, const IOContext& rhs);
  friend bool operator >=(const IOContext& lhs, const IOContext& rhs);
  friend bool operator >(const IOContext& lhs, const IOContext& rhs);

  friend bool operator ==(const IOContext& lhs, const IOContext& rhs);
  friend bool operator !=(const IOContext& lhs, const IOContext& rhs);

private:

  // In-place pimpl; see Object::impl above for the rationale.
  static constexpr std::size_t impl_size = 16 * 8;
  std::aligned_storage_t<impl_size> impl;
};

// Sentinel namespace value selecting "all namespaces" in a pool.
inline constexpr std::string_view all_nspaces("\001");

// Comparison operator for cmpxattr assertions.
enum class cmpxattr_op : std::uint8_t {
  eq = 1,
  ne = 2,
  gt = 3,
  gte = 4,
  lt = 5,
  lte = 6
};

// Allocation/access hints passed through WriteOp::set_alloc_hint().
// Plain (combinable) bit flags, hence not an enum class.
namespace alloc_hint {
enum alloc_hint_t {
  sequential_write = 1,
  random_write = 2,
  sequential_read = 4,
  random_read = 8,
  append_only = 16,
  immutable = 32,
  shortlived = 64,
  longlived = 128,
  compressible = 256,
  incompressible = 512
};
}

// Base of ReadOp/WriteOp: an ordered batch of sub-operations plus
// operation-wide flags. Move-only; consumed by RADOS::execute().
class Op {
  friend RADOS;

public:

  Op(const Op&) = delete;
  Op& operator =(const Op&) = delete;
  Op(Op&&);
  Op& operator =(Op&&);
  ~Op();

  // Per-op flags.
  void set_excl();
  void set_failok();
  void set_fadvise_random();
  void set_fadvise_sequential();
  void set_fadvise_willneed();
  void set_fadvise_dontneed();
  void set_fadvise_nocache();

  // Assertions: fail the whole operation if they do not hold.
  void cmpext(uint64_t off, ceph::buffer::list&& cmp_bl, std::size_t* s);
  void cmpxattr(std::string_view name, cmpxattr_op op,
                const ceph::buffer::list& val);
  void cmpxattr(std::string_view name, cmpxattr_op op, std::uint64_t val);
  void assert_version(uint64_t ver);
  void assert_exists();
  void cmp_omap(const boost::container::flat_map<
                  std::string,
                  std::pair<ceph::buffer::list, int>>& assertions);

  // Object-class method invocation. The overloads differ in how the
  // per-op result is delivered: output buffer + optional error slot,
  // or a callback receiving the error (and optionally the rval).
  void exec(std::string_view cls, std::string_view method,
            const ceph::buffer::list& inbl,
            ceph::buffer::list* out,
            boost::system::error_code* ec = nullptr);
  void exec(std::string_view cls, std::string_view method,
            const ceph::buffer::list& inbl,
            fu2::unique_function<void(boost::system::error_code,
                                      const ceph::buffer::list&) &&> f);
  void exec(std::string_view cls, std::string_view method,
            const ceph::buffer::list& inbl,
            fu2::unique_function<void(boost::system::error_code, int,
                                      const ceph::buffer::list&) &&> f);
  void exec(std::string_view cls, std::string_view method,
            const ceph::buffer::list& inbl,
            boost::system::error_code* ec = nullptr);


  // Flags that apply to all ops in the operation vector
  void balance_reads();
  void localize_reads();
  void order_reads_writes();
  void ignore_cache();
  void skiprwlocks();
  void ignore_overlay();
  void full_try();
  void full_force();
  void ignore_redirect();
  void ordersnap();
  void returnvec();

  // Number of sub-operations queued so far.
  std::size_t size() const;
  using Signature = void(boost::system::error_code);
  using Completion = ceph::async::Completion<Signature>;

  friend std::ostream& operator <<(std::ostream& m, const Op& o);
protected:
  Op();
  // In-place pimpl; see Object::impl above for the rationale.
  static constexpr std::size_t impl_size = 85 * 8;
  std::aligned_storage_t<impl_size> impl;
};

// This class is /not/ thread-safe. If you want you can wrap it in
// something that locks it.

// Batch of read sub-operations. Each call queues one sub-op; per-op
// results land in the supplied out-pointers, with an optional per-op
// error slot (the overall error is delivered by RADOS::execute()).
class ReadOp final : public Op {
  friend RADOS;

public:

  ReadOp() = default;
  ReadOp(const ReadOp&) = delete;
  ReadOp(ReadOp&&) = default;

  ReadOp& operator =(const ReadOp&) = delete;
  ReadOp& operator =(ReadOp&&) = default;

  void read(size_t off, uint64_t len, ceph::buffer::list* out,
            boost::system::error_code* ec = nullptr);
  void get_xattr(std::string_view name, ceph::buffer::list* out,
                 boost::system::error_code* ec = nullptr);
  void get_omap_header(ceph::buffer::list*,
                       boost::system::error_code* ec = nullptr);

  // Like read(), but also reports which extents actually hold data.
  void sparse_read(uint64_t off, uint64_t len,
                   ceph::buffer::list* out,
                   std::vector<std::pair<std::uint64_t, std::uint64_t>>* extents,
                   boost::system::error_code* ec = nullptr);

  void stat(std::uint64_t* size, ceph::real_time* mtime,
            boost::system::error_code* ec = nullptr);

  // Paged omap-key listing; *truncated reports whether more keys remain.
  void get_omap_keys(std::optional<std::string_view> start_after,
                     std::uint64_t max_return,
                     boost::container::flat_set<std::string>* keys,
                     bool* truncated,
                     boost::system::error_code* ec = nullptr);


  void get_xattrs(boost::container::flat_map<std::string,
                                             ceph::buffer::list>* kv,
                  boost::system::error_code* ec = nullptr);

  void get_omap_vals(std::optional<std::string_view> start_after,
                     std::optional<std::string_view> filter_prefix,
                     uint64_t max_return,
                     boost::container::flat_map<std::string,
                                                ceph::buffer::list>* kv,
                     bool* truncated,
                     boost::system::error_code* ec = nullptr);


  void get_omap_vals_by_keys(const boost::container::flat_set<std::string>& keys,
                             boost::container::flat_map<std::string,
                                                        ceph::buffer::list>* kv,
                             boost::system::error_code* ec = nullptr);

  void list_watchers(std::vector<struct ObjWatcher>* watchers,
                     boost::system::error_code* ec = nullptr);

  void list_snaps(struct SnapSet* snaps,
                  boost::system::error_code* ec = nullptr);
};

// Batch of mutating sub-operations; same usage pattern as ReadOp.
class WriteOp final : public Op {
  friend RADOS;
public:

  WriteOp() = default;
  WriteOp(const WriteOp&) = delete;
  WriteOp(WriteOp&&) = default;

  WriteOp& operator =(const WriteOp&) = delete;
  WriteOp& operator =(WriteOp&&) = default;

  void set_mtime(ceph::real_time t);
  void create(bool exclusive);
  void write(uint64_t off, ceph::buffer::list&& bl);
  void write_full(ceph::buffer::list&& bl);
  void writesame(std::uint64_t off, std::uint64_t write_len,
                 ceph::buffer::list&& bl);
  void append(ceph::buffer::list&& bl);
  void remove();
  void truncate(uint64_t off);
  void zero(uint64_t off, uint64_t len);
  void rmxattr(std::string_view name);
  void setxattr(std::string_view name,
                ceph::buffer::list&& bl);
  void rollback(uint64_t snapid);
  void set_omap(const boost::container::flat_map<std::string,
                                                 ceph::buffer::list>& map);
  void set_omap_header(ceph::buffer::list&& bl);
  void clear_omap();
  void rm_omap_keys(const boost::container::flat_set<std::string>& to_rm);
  void set_alloc_hint(uint64_t expected_object_size,
                      uint64_t expected_write_size,
                      alloc_hint::alloc_hint_t flags);
};


// Cluster-wide usage summary returned by RADOS::statfs().
struct FSStats {
  uint64_t kb;
  uint64_t kb_used;
  uint64_t kb_avail;
  uint64_t num_objects;
};

// From librados.h, maybe move into a common file. But I want to see
// if we need/want to amend/add/remove anything first.
+struct PoolStats { + /// space used in bytes + uint64_t num_bytes; + /// space used in KB + uint64_t num_kb; + /// number of objects in the pool + uint64_t num_objects; + /// number of clones of objects + uint64_t num_object_clones; + /// num_objects * num_replicas + uint64_t num_object_copies; + /// number of objects missing on primary + uint64_t num_objects_missing_on_primary; + /// number of objects found on no OSDs + uint64_t num_objects_unfound; + /// number of objects replicated fewer times than they should be + /// (but found on at least one OSD) + uint64_t num_objects_degraded; + /// number of objects read + uint64_t num_rd; + /// objects read in KB + uint64_t num_rd_kb; + /// number of objects written + uint64_t num_wr; + /// objects written in KB + uint64_t num_wr_kb; + /// bytes originally provided by user + uint64_t num_user_bytes; + /// bytes passed compression + uint64_t compressed_bytes_orig; + /// bytes resulted after compression + uint64_t compressed_bytes; + /// bytes allocated at storage + uint64_t compressed_bytes_alloc; +}; + +// Placement group, for PG commands +struct PG { + uint64_t pool; + uint32_t seed; +}; + +class Cursor final { +public: + static Cursor begin(); + static Cursor end(); + + Cursor(); + Cursor(const Cursor&); + Cursor& operator =(const Cursor&); + Cursor(Cursor&&); + Cursor& operator =(Cursor&&); + ~Cursor(); + + friend bool operator ==(const Cursor& lhs, + const Cursor& rhs); + friend bool operator !=(const Cursor& lhs, + const Cursor& rhs); + friend bool operator <(const Cursor& lhs, + const Cursor& rhs); + friend bool operator <=(const Cursor& lhs, + const Cursor& rhs); + friend bool operator >=(const Cursor& lhs, + const Cursor& rhs); + friend bool operator >(const Cursor& lhs, + const Cursor& rhs); + + std::string to_str() const; + static std::optional<Cursor> from_str(const std::string& s); + +private: + struct end_magic_t {}; + Cursor(end_magic_t); + Cursor(void*); + friend RADOS; + static constexpr std::size_t 
impl_size = 16 * 8; + std::aligned_storage_t<impl_size> impl; +}; + +class RADOS final +{ +public: + static constexpr std::tuple<uint32_t, uint32_t, uint32_t> version() { + return {0, 0, 1}; + } + + using BuildSig = void(boost::system::error_code, RADOS); + using BuildComp = ceph::async::Completion<BuildSig>; + class Builder { + std::optional<std::string> conf_files; + std::optional<std::string> cluster; + std::optional<std::string> name; + std::vector<std::pair<std::string, std::string>> configs; + bool no_default_conf = false; + bool no_mon_conf = false; + + public: + Builder() = default; + Builder& add_conf_file(std::string_view v); + Builder& set_cluster(std::string_view c) { + cluster = std::string(c); + return *this; + } + Builder& set_name(std::string_view n) { + name = std::string(n); + return *this; + } + Builder& set_no_default_conf() { + no_default_conf = true; + return *this; + } + Builder& set_no_mon_conf() { + no_mon_conf = true; + return *this; + } + Builder& set_conf_option(std::string_view opt, std::string_view val) { + configs.emplace_back(std::string(opt), std::string(val)); + return *this; + } + + template<typename CompletionToken> + auto build(boost::asio::io_context& ioctx, CompletionToken&& token) { + boost::asio::async_completion<CompletionToken, BuildSig> init(token); + build(ioctx, + BuildComp::create(ioctx.get_executor(), + std::move(init.completion_handler))); + return init.result.get(); + } + + private: + void build(boost::asio::io_context& ioctx, + std::unique_ptr<BuildComp> c); + }; + + + template<typename CompletionToken> + static auto make_with_cct(CephContext* cct, + boost::asio::io_context& ioctx, + CompletionToken&& token) { + boost::asio::async_completion<CompletionToken, BuildSig> init(token); + make_with_cct(cct, ioctx, + BuildComp::create(ioctx.get_executor(), + std::move(init.completion_handler))); + return init.result.get(); + } + + static RADOS make_with_librados(librados::Rados& rados); + + RADOS(const RADOS&) = delete; + 
RADOS& operator =(const RADOS&) = delete; + + RADOS(RADOS&&); + RADOS& operator =(RADOS&&); + + ~RADOS(); + + CephContext* cct(); + + using executor_type = boost::asio::io_context::executor_type; + executor_type get_executor() const; + boost::asio::io_context& get_io_context(); + + template<typename CompletionToken> + auto execute(const Object& o, const IOContext& ioc, ReadOp&& op, + ceph::buffer::list* bl, + CompletionToken&& token, uint64_t* objver = nullptr, + const blkin_trace_info* trace_info = nullptr) { + boost::asio::async_completion<CompletionToken, Op::Signature> init(token); + execute(o, ioc, std::move(op), bl, + ReadOp::Completion::create(get_executor(), + std::move(init.completion_handler)), + objver, trace_info); + return init.result.get(); + } + + template<typename CompletionToken> + auto execute(const Object& o, const IOContext& ioc, WriteOp&& op, + CompletionToken&& token, uint64_t* objver = nullptr, + const blkin_trace_info* trace_info = nullptr) { + boost::asio::async_completion<CompletionToken, Op::Signature> init(token); + execute(o, ioc, std::move(op), + Op::Completion::create(get_executor(), + std::move(init.completion_handler)), + objver, trace_info); + return init.result.get(); + } + + template<typename CompletionToken> + auto execute(const Object& o, std::int64_t pool, + ReadOp&& op, + ceph::buffer::list* bl, + CompletionToken&& token, + std::optional<std::string_view> ns = {}, + std::optional<std::string_view> key = {}, + uint64_t* objver = nullptr) { + boost::asio::async_completion<CompletionToken, Op::Signature> init(token); + execute(o, pool, std::move(op), bl, + ReadOp::Completion::create(get_executor(), + std::move(init.completion_handler)), + ns, key, objver); + return init.result.get(); + } + + template<typename CompletionToken> + auto execute(const Object& o, std::int64_t pool, WriteOp&& op, + CompletionToken&& token, + std::optional<std::string_view> ns = {}, + std::optional<std::string_view> key = {}, + uint64_t* objver = 
nullptr) { + boost::asio::async_completion<CompletionToken, Op::Signature> init(token); + execute(o, pool, std::move(op), + Op::Completion::create(get_executor(), + std::move(init.completion_handler)), + ns, key, objver); + return init.result.get(); + } + + boost::uuids::uuid get_fsid() const noexcept; + + using LookupPoolSig = void(boost::system::error_code, + std::int64_t); + using LookupPoolComp = ceph::async::Completion<LookupPoolSig>; + template<typename CompletionToken> + auto lookup_pool(std::string_view name, + CompletionToken&& token) { + boost::asio::async_completion<CompletionToken, LookupPoolSig> init(token); + lookup_pool(name, + LookupPoolComp::create(get_executor(), + std::move(init.completion_handler))); + return init.result.get(); + } + + std::optional<uint64_t> get_pool_alignment(int64_t pool_id); + + using LSPoolsSig = void(std::vector<std::pair<std::int64_t, std::string>>); + using LSPoolsComp = ceph::async::Completion<LSPoolsSig>; + template<typename CompletionToken> + auto list_pools(CompletionToken&& token) { + boost::asio::async_completion<CompletionToken, LSPoolsSig> init(token); + list_pools(LSPoolsComp::create(get_executor(), + std::move(init.completion_handler))); + return init.result.get(); + } + + + + using SimpleOpSig = void(boost::system::error_code); + using SimpleOpComp = ceph::async::Completion<SimpleOpSig>; + template<typename CompletionToken> + auto create_pool_snap(int64_t pool, std::string_view snapName, + CompletionToken&& token) { + boost::asio::async_completion<CompletionToken, SimpleOpSig> init(token); + create_pool_snap(pool, snapName, + SimpleOpComp::create(get_executor(), + std::move(init.completion_handler))); + return init.result.get(); + } + + using SMSnapSig = void(boost::system::error_code, std::uint64_t); + using SMSnapComp = ceph::async::Completion<SMSnapSig>; + template<typename CompletionToken> + auto allocate_selfmanaged_snap(int64_t pool, + CompletionToken&& token) { + 
boost::asio::async_completion<CompletionToken, SMSnapSig> init(token); + allocate_selfmanaged_snap(pool, + SMSnapComp::create( + get_executor(), + std::move(init.completion_handler))); + return init.result.get(); + } + + template<typename CompletionToken> + auto delete_pool_snap(int64_t pool, std::string_view snapName, + CompletionToken&& token) { + boost::asio::async_completion<CompletionToken, SimpleOpSig> init(token); + delete_pool_snap(pool, snapName, + SimpleOpComp::create(get_executor(), + std::move(init.completion_handler))); + return init.result.get(); + } + + template<typename CompletionToken> + auto delete_selfmanaged_snap(int64_t pool, std::string_view snapName, + CompletionToken&& token) { + boost::asio::async_completion<CompletionToken, SimpleOpSig> init(token); + delete_selfmanaged_snap(pool, snapName, + SimpleOpComp::create( + get_executor(), + std::move(init.completion_handler))); + return init.result.get(); + } + + template<typename CompletionToken> + auto create_pool(std::string_view name, std::optional<int> crush_rule, + CompletionToken&& token) { + boost::asio::async_completion<CompletionToken, SimpleOpSig> init(token); + create_pool(name, crush_rule, + SimpleOpComp::create(get_executor(), + std::move(init.completion_handler))); + return init.result.get(); + } + + template<typename CompletionToken> + auto delete_pool(std::string_view name, + CompletionToken&& token) { + boost::asio::async_completion<CompletionToken, SimpleOpSig> init(token); + delete_pool(name, + SimpleOpComp::create(get_executor(), + std::move(init.completion_handler))); + return init.result.get(); + } + + template<typename CompletionToken> + auto delete_pool(int64_t pool, + CompletionToken&& token) { + boost::asio::async_completion<CompletionToken, SimpleOpSig> init(token); + delete_pool(pool, + SimpleOpComp::create(get_executor(), + std::move(init.completion_handler))); + return init.result.get(); + } + + using PoolStatSig = void(boost::system::error_code, + 
boost::container::flat_map<std::string, + PoolStats>, bool); + using PoolStatComp = ceph::async::Completion<PoolStatSig>; + template<typename CompletionToken> + auto stat_pools(const std::vector<std::string>& pools, + CompletionToken&& token) { + boost::asio::async_completion<CompletionToken, PoolStatSig> init(token); + stat_pools(pools, + PoolStatComp::create(get_executor(), + std::move(init.completion_handler))); + return init.result.get(); + } + + using StatFSSig = void(boost::system::error_code, + FSStats); + using StatFSComp = ceph::async::Completion<StatFSSig>; + template<typename CompletionToken> + auto statfs(std::optional<int64_t> pool, + CompletionToken&& token) { + boost::asio::async_completion<CompletionToken, StatFSSig> init(token); + ceph_statfs(pool, StatFSComp::create(get_executor(), + std::move(init.completion_handler))); + return init.result.get(); + } + + using WatchCB = fu2::unique_function<void(boost::system::error_code, + uint64_t notify_id, + uint64_t cookie, + uint64_t notifier_id, + ceph::buffer::list&& bl)>; + + using WatchSig = void(boost::system::error_code ec, + uint64_t cookie); + using WatchComp = ceph::async::Completion<WatchSig>; + template<typename CompletionToken> + auto watch(const Object& o, const IOContext& ioc, + std::optional<std::chrono::seconds> timeout, + WatchCB&& cb, CompletionToken&& token) { + boost::asio::async_completion<CompletionToken, WatchSig> init(token); + watch(o, ioc, timeout, std::move(cb), + WatchComp::create(get_executor(), + std::move(init.completion_handler))); + return init.result.get(); + } + + template<typename CompletionToken> + auto watch(const Object& o, std::int64_t pool, + std::optional<std::chrono::seconds> timeout, + WatchCB&& cb, CompletionToken&& token, + std::optional<std::string_view> ns = {}, + std::optional<std::string_view> key = {}) { + boost::asio::async_completion<CompletionToken, WatchSig> init(token); + watch(o, pool, timeout, std::move(cb), + WatchComp::create(get_executor(), + 
std::move(init.completion_handler)), + ns, key); + return init.result.get(); + } + + template<typename CompletionToken> + auto notify_ack(const Object& o, + const IOContext& ioc, + uint64_t notify_id, + uint64_t cookie, + ceph::buffer::list&& bl, + CompletionToken&& token) { + boost::asio::async_completion<CompletionToken, SimpleOpSig> init(token); + notify_ack(o, ioc, notify_id, cookie, std::move(bl), + SimpleOpComp::create(get_executor(), + std::move(init.completion_handler))); + return init.result.get(); + } + + template<typename CompletionToken> + auto notify_ack(const Object& o, + std::int64_t pool, + uint64_t notify_id, + uint64_t cookie, + ceph::buffer::list&& bl, + CompletionToken&& token, + std::optional<std::string_view> ns = {}, + std::optional<std::string_view> key = {}) { + boost::asio::async_completion<CompletionToken, WatchSig> init(token); + notify_ack(o, pool, notify_id, cookie, std::move(bl), + SimpleOpComp::create(get_executor(), + std::move(init.completion_handler)), + ns, key); + return init.result.get(); + } + + template<typename CompletionToken> + auto unwatch(uint64_t cookie, const IOContext& ioc, + CompletionToken&& token) { + boost::asio::async_completion<CompletionToken, SimpleOpSig> init(token); + unwatch(cookie, ioc, + SimpleOpComp::create(get_executor(), + std::move(init.completion_handler))); + return init.result.get(); + } + + template<typename CompletionToken> + auto unwatch(uint64_t cookie, std::int64_t pool, + CompletionToken&& token, + std::optional<std::string_view> ns = {}, + std::optional<std::string_view> key = {}) { + boost::asio::async_completion<CompletionToken, SimpleOpSig> init(token); + unwatch(cookie, pool, + SimpleOpComp::create(get_executor(), + std::move(init.completion_handler)), + ns, key); + return init.result.get(); + } + + // This is one of those places where having to force everything into + // a .cc file is really infuriating. 
If we had modules, that would + // let us separate out the implementation details without + // sacrificing all the benefits of templates. + using VoidOpSig = void(); + using VoidOpComp = ceph::async::Completion<VoidOpSig>; + template<typename CompletionToken> + auto flush_watch(CompletionToken&& token) { + boost::asio::async_completion<CompletionToken, VoidOpSig> init(token); + flush_watch(VoidOpComp::create(get_executor(), + std::move(init.completion_handler))); + return init.result.get(); + } + + using NotifySig = void(boost::system::error_code, ceph::buffer::list); + using NotifyComp = ceph::async::Completion<NotifySig>; + template<typename CompletionToken> + auto notify(const Object& oid, const IOContext& ioc, ceph::buffer::list&& bl, + std::optional<std::chrono::milliseconds> timeout, + CompletionToken&& token) { + boost::asio::async_completion<CompletionToken, NotifySig> init(token); + notify(oid, ioc, std::move(bl), timeout, + NotifyComp::create(get_executor(), + std::move(init.completion_handler))); + + return init.result.get(); + } + + template<typename CompletionToken> + auto notify(const Object& oid, std::int64_t pool, ceph::buffer::list&& bl, + std::optional<std::chrono::milliseconds> timeout, + CompletionToken&& token, + std::optional<std::string_view> ns = {}, + std::optional<std::string_view> key = {}) { + boost::asio::async_completion<CompletionToken, NotifySig> init(token); + notify(oid, pool, bl, timeout, + NotifyComp::create(get_executor(), + std::move(init.completion_handler)), + ns, key); + + return init.result.get(); + } + + // The versions with pointers are fine for coroutines, but + // extraordinarily unappealing for callback-oriented programming. 
+ using EnumerateSig = void(boost::system::error_code, + std::vector<Entry>, + Cursor); + using EnumerateComp = ceph::async::Completion<EnumerateSig>; + template<typename CompletionToken> + auto enumerate_objects(const IOContext& ioc, const Cursor& begin, + const Cursor& end, const std::uint32_t max, + const ceph::buffer::list& filter, + CompletionToken&& token) { + boost::asio::async_completion<CompletionToken, EnumerateSig> init(token); + enumerate_objects(ioc, begin, end, max, filter, + EnumerateComp::create(get_executor(), + std::move(init.completion_handler))); + return init.result.get(); + } + + template<typename CompletionToken> + auto enumerate_objects(std::int64_t pool, const Cursor& begin, + const Cursor& end, const std::uint32_t max, + const ceph::buffer::list& filter, + CompletionToken&& token, + std::optional<std::string_view> ns = {}, + std::optional<std::string_view> key = {}) { + boost::asio::async_completion<CompletionToken, EnumerateSig> init(token); + enumerate_objects(pool, begin, end, max, filter, + EnumerateComp::create(get_executor(), + std::move(init.completion_handler)), + ns, key); + return init.result.get(); + } + + using CommandSig = void(boost::system::error_code, + std::string, ceph::buffer::list); + using CommandComp = ceph::async::Completion<CommandSig>; + template<typename CompletionToken> + auto osd_command(int osd, std::vector<std::string>&& cmd, + ceph::buffer::list&& in, CompletionToken&& token) { + boost::asio::async_completion<CompletionToken, CommandSig> init(token); + osd_command(osd, std::move(cmd), std::move(in), + CommandComp::create(get_executor(), + std::move(init.completion_handler))); + return init.result.get(); + } + template<typename CompletionToken> + auto pg_command(PG pg, std::vector<std::string>&& cmd, + ceph::buffer::list&& in, CompletionToken&& token) { + boost::asio::async_completion<CompletionToken, CommandSig> init(token); + pg_command(pg, std::move(cmd), std::move(in), + CommandComp::create(get_executor(), 
+ std::move(init.completion_handler))); + return init.result.get(); + } + + template<typename CompletionToken> + auto mon_command(std::vector<std::string> command, + const ceph::buffer::list& bl, + std::string* outs, ceph::buffer::list* outbl, + CompletionToken&& token) { + boost::asio::async_completion<CompletionToken, SimpleOpSig> init(token); + mon_command(command, bl, outs, outbl, + SimpleOpComp::create(get_executor(), + std::move(init.completion_handler))); + return init.result.get(); + } + + template<typename CompletionToken> + auto enable_application(std::string_view pool, std::string_view app_name, + bool force, CompletionToken&& token) { + boost::asio::async_completion<CompletionToken, SimpleOpSig> init(token); + enable_application(pool, app_name, force, + SimpleOpComp::create(get_executor(), + std::move(init.completion_handler))); + return init.result.get(); + } + + template<typename CompletionToken> + auto blocklist_add(std::string_view client_address, + std::optional<std::chrono::seconds> expire, + CompletionToken&& token) { + boost::asio::async_completion<CompletionToken, SimpleOpSig> init(token); + blocklist_add(client_address, expire, + SimpleOpComp::create(get_executor(), + std::move(init.completion_handler))); + return init.result.get(); + } + + template<typename CompletionToken> + auto wait_for_latest_osd_map(CompletionToken&& token) { + boost::asio::async_completion<CompletionToken, SimpleOpSig> init(token); + wait_for_latest_osd_map( + SimpleOpComp::create(get_executor(), std::move(init.completion_handler))); + return init.result.get(); + } + + uint64_t instance_id() const; + +private: + + RADOS(); + + friend Builder; + + RADOS(std::unique_ptr<detail::Client> impl); + static void make_with_cct(CephContext* cct, + boost::asio::io_context& ioctx, + std::unique_ptr<BuildComp> c); + + void execute(const Object& o, const IOContext& ioc, ReadOp&& op, + ceph::buffer::list* bl, std::unique_ptr<Op::Completion> c, + uint64_t* objver, const 
blkin_trace_info* trace_info); + + void execute(const Object& o, const IOContext& ioc, WriteOp&& op, + std::unique_ptr<Op::Completion> c, uint64_t* objver, + const blkin_trace_info* trace_info); + + void execute(const Object& o, std::int64_t pool, ReadOp&& op, + ceph::buffer::list* bl, std::unique_ptr<Op::Completion> c, + std::optional<std::string_view> ns, + std::optional<std::string_view> key, + uint64_t* objver); + + void execute(const Object& o, std::int64_t pool, WriteOp&& op, + std::unique_ptr<Op::Completion> c, + std::optional<std::string_view> ns, + std::optional<std::string_view> key, + uint64_t* objver); + + void lookup_pool(std::string_view name, std::unique_ptr<LookupPoolComp> c); + void list_pools(std::unique_ptr<LSPoolsComp> c); + void create_pool_snap(int64_t pool, std::string_view snapName, + std::unique_ptr<SimpleOpComp> c); + void allocate_selfmanaged_snap(int64_t pool, std::unique_ptr<SMSnapComp> c); + void delete_pool_snap(int64_t pool, std::string_view snapName, + std::unique_ptr<SimpleOpComp> c); + void delete_selfmanaged_snap(int64_t pool, std::uint64_t snap, + std::unique_ptr<SimpleOpComp> c); + void create_pool(std::string_view name, std::optional<int> crush_rule, + std::unique_ptr<SimpleOpComp> c); + void delete_pool(std::string_view name, + std::unique_ptr<SimpleOpComp> c); + void delete_pool(int64_t pool, + std::unique_ptr<SimpleOpComp> c); + void stat_pools(const std::vector<std::string>& pools, + std::unique_ptr<PoolStatComp> c); + void stat_fs(std::optional<std::int64_t> pool, + std::unique_ptr<StatFSComp> c); + + void watch(const Object& o, const IOContext& ioc, + std::optional<std::chrono::seconds> timeout, + WatchCB&& cb, std::unique_ptr<WatchComp> c); + void watch(const Object& o, std::int64_t pool, + std::optional<std::chrono::seconds> timeout, + WatchCB&& cb, std::unique_ptr<WatchComp> c, + std::optional<std::string_view> ns, + std::optional<std::string_view> key); + tl::expected<ceph::timespan, boost::system::error_code> + 
watch_check(uint64_t cookie); + void notify_ack(const Object& o, + const IOContext& _ioc, + uint64_t notify_id, + uint64_t cookie, + ceph::buffer::list&& bl, + std::unique_ptr<SimpleOpComp>); + void notify_ack(const Object& o, + std::int64_t pool, + uint64_t notify_id, + uint64_t cookie, + ceph::buffer::list&& bl, + std::unique_ptr<SimpleOpComp>, + std::optional<std::string_view> ns, + std::optional<std::string_view> key); + void unwatch(uint64_t cookie, const IOContext& ioc, + std::unique_ptr<SimpleOpComp>); + void unwatch(uint64_t cookie, std::int64_t pool, + std::unique_ptr<SimpleOpComp>, + std::optional<std::string_view> ns, + std::optional<std::string_view> key); + void notify(const Object& oid, const IOContext& ioctx, + ceph::buffer::list&& bl, + std::optional<std::chrono::milliseconds> timeout, + std::unique_ptr<NotifyComp> c); + void notify(const Object& oid, std::int64_t pool, + ceph::buffer::list&& bl, + std::optional<std::chrono::milliseconds> timeout, + std::unique_ptr<NotifyComp> c, + std::optional<std::string_view> ns, + std::optional<std::string_view> key); + void flush_watch(std::unique_ptr<VoidOpComp>); + + void enumerate_objects(const IOContext& ioc, const Cursor& begin, + const Cursor& end, const std::uint32_t max, + const ceph::buffer::list& filter, + std::vector<Entry>* ls, + Cursor* cursor, + std::unique_ptr<SimpleOpComp> c); + void enumerate_objects(std::int64_t pool, const Cursor& begin, + const Cursor& end, const std::uint32_t max, + const ceph::buffer::list& filter, + std::vector<Entry>* ls, + Cursor* cursor, + std::unique_ptr<SimpleOpComp> c, + std::optional<std::string_view> ns, + std::optional<std::string_view> key); + void enumerate_objects(const IOContext& ioc, const Cursor& begin, + const Cursor& end, const std::uint32_t max, + const ceph::buffer::list& filter, + std::unique_ptr<EnumerateComp> c); + void enumerate_objects(std::int64_t pool, const Cursor& begin, + const Cursor& end, const std::uint32_t max, + const ceph::buffer::list& 
filter, + std::unique_ptr<EnumerateComp> c, + std::optional<std::string_view> ns, + std::optional<std::string_view> key); + void osd_command(int osd, std::vector<std::string>&& cmd, + ceph::buffer::list&& in, std::unique_ptr<CommandComp> c); + void pg_command(PG pg, std::vector<std::string>&& cmd, + ceph::buffer::list&& in, std::unique_ptr<CommandComp> c); + + void mon_command(std::vector<std::string> command, + const ceph::buffer::list& bl, + std::string* outs, ceph::buffer::list* outbl, + std::unique_ptr<SimpleOpComp> c); + + void enable_application(std::string_view pool, std::string_view app_name, + bool force, std::unique_ptr<SimpleOpComp> c); + + void blocklist_add(std::string_view client_address, + std::optional<std::chrono::seconds> expire, + std::unique_ptr<SimpleOpComp> c); + + void wait_for_latest_osd_map(std::unique_ptr<SimpleOpComp> c); + + // Proxy object to provide access to low-level RADOS messaging clients + std::unique_ptr<detail::Client> impl; +}; + +enum class errc { + pool_dne = 1, + invalid_snapcontext +}; + +const boost::system::error_category& error_category() noexcept; +} + +namespace boost::system { +template<> +struct is_error_code_enum<::neorados::errc> { + static const bool value = true; +}; + +template<> +struct is_error_condition_enum<::neorados::errc> { + static const bool value = false; +}; +} + +namespace neorados { +// explicit conversion: +inline boost::system::error_code make_error_code(errc e) noexcept { + return { static_cast<int>(e), error_category() }; +} + +// implicit conversion: +inline boost::system::error_condition make_error_condition(errc e) noexcept { + return { static_cast<int>(e), error_category() }; +} +} + +namespace std { +template<> +struct hash<neorados::Object> { + size_t operator ()(const neorados::Object& r) const; +}; +template<> +struct hash<neorados::IOContext> { + size_t operator ()(const neorados::IOContext& r) const; +}; +} // namespace std + +#endif // NEORADOS_RADOS_HPP diff --git 
// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
// vim: ts=8 sw=2 smarttab
/*
 * Ceph - scalable distributed file system
 *
 * Copyright (C) 2020 Red Hat <contact@redhat.com>
 * Author: Adam C. Emerson
 *
 * This is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License version 2.1, as published by the Free Software
 * Foundation. See file COPYING.
 *
 */

#ifndef NEORADOS_RADOS_DECODABLE_HPP
#define NEORADOS_RADOS_DECODABLE_HPP

#include <cstdint>
#include <cstdlib>
#include <string>
#include <iostream>
#include <tuple>
#include <utility>
#include <vector>

// fmt is only needed for the formatter at the bottom; guarding the
// include (C++17 __has_include) keeps this header usable in builds
// that do not have fmt on the include path.
#if __has_include(<fmt/core.h>)
#include <fmt/core.h>
#if FMT_VERSION >= 90000
#include <fmt/ostream.h>
#endif
#endif

namespace neorados {
/// One object as reported by pool enumeration: RADOS namespace,
/// object name, and (possibly empty) locator key.
struct Entry {
  std::string nspace;
  std::string oid;
  std::string locator;

  Entry() {}
  Entry(std::string nspace, std::string oid, std::string locator) :
    nspace(std::move(nspace)), oid(std::move(oid)),
    // Bug fix: locator was taken by value but then *copied* into the
    // member; move it like the other two.
    locator(std::move(locator)) {}
};

// Relational operators compare (nspace, oid, locator) lexicographically.
// Both operands are taken by const reference; the originals took the
// right-hand operand by value, copying three strings per comparison.
inline bool operator ==(const Entry& l, const Entry& r) {
  return std::tie(l.nspace, l.oid, l.locator) ==
    std::tie(r.nspace, r.oid, r.locator);
}
inline bool operator !=(const Entry& l, const Entry& r) {
  return std::tie(l.nspace, l.oid, l.locator) !=
    std::tie(r.nspace, r.oid, r.locator);
}
inline bool operator <(const Entry& l, const Entry& r) {
  return std::tie(l.nspace, l.oid, l.locator) <
    std::tie(r.nspace, r.oid, r.locator);
}
inline bool operator <=(const Entry& l, const Entry& r) {
  return std::tie(l.nspace, l.oid, l.locator) <=
    std::tie(r.nspace, r.oid, r.locator);
}
inline bool operator >=(const Entry& l, const Entry& r) {
  return std::tie(l.nspace, l.oid, l.locator) >=
    std::tie(r.nspace, r.oid, r.locator);
}
inline bool operator >(const Entry& l, const Entry& r) {
  return std::tie(l.nspace, l.oid, l.locator) >
    std::tie(r.nspace, r.oid, r.locator);
}

// Prints "<nspace>/<oid>@<locator>", omitting empty nspace/locator parts.
inline std::ostream& operator <<(std::ostream& out, const Entry& entry) {
  if (!entry.nspace.empty())
    out << entry.nspace << '/';
  out << entry.oid;
  if (!entry.locator.empty())
    out << '@' << entry.locator;
  return out;
}

/// Description of one clone of an object, as returned by list-snaps.
struct CloneInfo {
  uint64_t cloneid = 0;
  std::vector<uint64_t> snaps; // ascending
  std::vector<std::pair<uint64_t, uint64_t>> overlap;// with next newest
  uint64_t size = 0;
  CloneInfo() = default;
};

/// Snapshot set of an object: its clones plus the newest snapid seen.
struct SnapSet {
  std::vector<CloneInfo> clones; // ascending
  std::uint64_t seq = 0; // newest snapid seen by the object
  SnapSet() = default;
};

/// One watcher registered on an object.
struct ObjWatcher {
  /// Address of the Watcher
  std::string addr;
  /// Watcher ID
  std::int64_t watcher_id;
  /// Cookie
  std::uint64_t cookie;
  /// Timeout in Seconds
  std::uint32_t timeout_seconds;
};
}

namespace std {
template<>
struct hash<::neorados::Entry> {
  // Takes the Entry by const reference; the original hashed a by-value
  // copy (three string copies per hash).
  std::size_t operator ()(const ::neorados::Entry& e) const {
    hash<std::string> h;
    return (h(e.nspace) << 2) ^ (h(e.oid) << 1) ^ h(e.locator);
  }
};
}

#if defined(FMT_VERSION) && FMT_VERSION >= 90000
template <> struct fmt::formatter<neorados::Entry> : ostream_formatter {};
#endif

#endif // NEORADOS_RADOS_DECODABLE_HPP (fixed: comment previously named the wrong guard)
\ No newline at end of file diff --git a/src/include/neorados/completion.h b/src/include/neorados/completion.h new file mode 120000 index 000000000..100678fc2 --- /dev/null +++ b/src/include/neorados/completion.h @@ -0,0 +1 @@ +../../common/async/completion.h
\ No newline at end of file diff --git a/src/include/object.h b/src/include/object.h new file mode 100644 index 000000000..4564af86e --- /dev/null +++ b/src/include/object.h @@ -0,0 +1,189 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net> + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. + * + */ + +#ifndef CEPH_OBJECT_H +#define CEPH_OBJECT_H + +#include <cstdint> +#include <cstdio> +#include <iomanip> +#include <iosfwd> +#include <string> +#include <string> +#include <string_view> + +#include "include/rados.h" +#include "include/unordered_map.h" + +#include "hash.h" +#include "encoding.h" +#include "ceph_hash.h" + +struct object_t { + std::string name; + + object_t() {} + // cppcheck-suppress noExplicitConstructor + object_t(const char *s) : name(s) {} + // cppcheck-suppress noExplicitConstructor + object_t(const std::string& s) : name(s) {} + object_t(std::string&& s) : name(std::move(s)) {} + object_t(std::string_view s) : name(s) {} + + auto operator<=>(const object_t&) const noexcept = default; + + void swap(object_t& o) { + name.swap(o.name); + } + void clear() { + name.clear(); + } + + void encode(ceph::buffer::list &bl) const { + using ceph::encode; + encode(name, bl); + } + void decode(ceph::buffer::list::const_iterator &bl) { + using ceph::decode; + decode(name, bl); + } +}; +WRITE_CLASS_ENCODER(object_t) + +inline std::ostream& operator<<(std::ostream& out, const object_t& o) { + return out << o.name; +} + +namespace std { +template<> struct hash<object_t> { + size_t operator()(const object_t& r) const { + //static hash<string> H; + //return H(r.name); + return ceph_str_hash_linux(r.name.c_str(), r.name.length()); + } +}; +} // 
namespace std + + +struct file_object_t { + uint64_t ino, bno; + mutable char buf[34]; + + file_object_t(uint64_t i=0, uint64_t b=0) : ino(i), bno(b) { + buf[0] = 0; + } + + const char *c_str() const { + if (!buf[0]) + snprintf(buf, sizeof(buf), "%llx.%08llx", (long long unsigned)ino, (long long unsigned)bno); + return buf; + } + + operator object_t() { + return object_t(c_str()); + } +}; + + +// --------------------------- +// snaps + +struct snapid_t { + uint64_t val; + // cppcheck-suppress noExplicitConstructor + snapid_t(uint64_t v=0) : val(v) {} + snapid_t operator+=(snapid_t o) { val += o.val; return *this; } + snapid_t operator++() { ++val; return *this; } + operator uint64_t() const { return val; } +}; + +inline void encode(snapid_t i, ceph::buffer::list &bl) { + using ceph::encode; + encode(i.val, bl); +} +inline void decode(snapid_t &i, ceph::buffer::list::const_iterator &p) { + using ceph::decode; + decode(i.val, p); +} + +template<> +struct denc_traits<snapid_t> { + static constexpr bool supported = true; + static constexpr bool featured = false; + static constexpr bool bounded = true; + static constexpr bool need_contiguous = true; + static void bound_encode(const snapid_t& o, size_t& p) { + denc(o.val, p); + } + static void encode(const snapid_t &o, ceph::buffer::list::contiguous_appender& p) { + denc(o.val, p); + } + static void decode(snapid_t& o, ceph::buffer::ptr::const_iterator &p) { + denc(o.val, p); + } +}; + +inline std::ostream& operator<<(std::ostream& out, const snapid_t& s) { + if (s == CEPH_NOSNAP) + return out << "head"; + else if (s == CEPH_SNAPDIR) + return out << "snapdir"; + else + return out << std::hex << s.val << std::dec; +} + + +struct sobject_t { + object_t oid; + snapid_t snap; + + sobject_t() : snap(0) {} + sobject_t(object_t o, snapid_t s) : oid(o), snap(s) {} + + auto operator<=>(const sobject_t&) const noexcept = default; + + void swap(sobject_t& o) { + oid.swap(o.oid); + snapid_t t = snap; + snap = o.snap; + o.snap = t; + 
} + + void encode(ceph::buffer::list& bl) const { + using ceph::encode; + encode(oid, bl); + encode(snap, bl); + } + void decode(ceph::buffer::list::const_iterator& bl) { + using ceph::decode; + decode(oid, bl); + decode(snap, bl); + } +}; +WRITE_CLASS_ENCODER(sobject_t) + +inline std::ostream& operator<<(std::ostream& out, const sobject_t &o) { + return out << o.oid << "/" << o.snap; +} +namespace std { +template<> struct hash<sobject_t> { + size_t operator()(const sobject_t &r) const { + static hash<object_t> H; + static rjhash<uint64_t> I; + return H(r.oid) ^ I(r.snap); + } +}; +} // namespace std + +#endif diff --git a/src/include/object_fmt.h b/src/include/object_fmt.h new file mode 100644 index 000000000..33df5e3fb --- /dev/null +++ b/src/include/object_fmt.h @@ -0,0 +1,29 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +#pragma once + +/** + * \file fmtlib formatters for some object.h structs + */ +#include <fmt/format.h> + +#include "object.h" + + +template <> +struct fmt::formatter<snapid_t> { + + constexpr auto parse(format_parse_context& ctx) { return ctx.begin(); } + + template <typename FormatContext> + auto format(const snapid_t& snp, FormatContext& ctx) const + { + if (snp == CEPH_NOSNAP) { + return fmt::format_to(ctx.out(), "head"); + } + if (snp == CEPH_SNAPDIR) { + return fmt::format_to(ctx.out(), "snapdir"); + } + return fmt::format_to(ctx.out(), "{:x}", snp.val); + } +}; diff --git a/src/include/on_exit.h b/src/include/on_exit.h new file mode 100644 index 000000000..c412ab33e --- /dev/null +++ b/src/include/on_exit.h @@ -0,0 +1,49 @@ +#ifndef CEPH_ON_EXIT_H +#define CEPH_ON_EXIT_H + +#include <pthread.h> +#include <vector> + +#include "include/ceph_assert.h" +/* + * Create a static instance at the file level to get callbacks called when the + * process exits via main() or exit(). 
+ */ + +class OnExitManager { + public: + typedef void (*callback_t)(void *arg); + + OnExitManager() { + int ret = pthread_mutex_init(&lock_, NULL); + ceph_assert(ret == 0); + } + + ~OnExitManager() { + pthread_mutex_lock(&lock_); + std::vector<struct cb>::iterator it; + for (it = funcs_.begin(); it != funcs_.end(); it++) { + it->func(it->arg); + } + funcs_.clear(); + pthread_mutex_unlock(&lock_); + } + + void add_callback(callback_t func, void *arg) { + pthread_mutex_lock(&lock_); + struct cb callback = { func, arg }; + funcs_.push_back(callback); + pthread_mutex_unlock(&lock_); + } + + private: + struct cb { + callback_t func; + void *arg; + }; + + std::vector<struct cb> funcs_; + pthread_mutex_t lock_; +}; + +#endif diff --git a/src/include/page.h b/src/include/page.h new file mode 100644 index 000000000..db6e20585 --- /dev/null +++ b/src/include/page.h @@ -0,0 +1,18 @@ +#ifndef CEPH_PAGE_H +#define CEPH_PAGE_H + +namespace ceph { + // these are in common/page.cc + extern unsigned _page_size; + extern unsigned long _page_mask; + extern unsigned _page_shift; +} + +#endif + + +#define CEPH_PAGE_SIZE ceph::_page_size +#define CEPH_PAGE_MASK ceph::_page_mask +#define CEPH_PAGE_SHIFT ceph::_page_shift + + diff --git a/src/include/rados.h b/src/include/rados.h new file mode 100644 index 000000000..eac3a2159 --- /dev/null +++ b/src/include/rados.h @@ -0,0 +1,700 @@ +#ifndef CEPH_RADOS_H +#define CEPH_RADOS_H + +/* + * Data types for the Ceph distributed object storage layer RADOS + * (Reliable Autonomic Distributed Object Store). + */ + +#include <string.h> +#include <stdbool.h> +#include "msgr.h" + +/* See comment in ceph_fs.h. 
 */
#ifndef __KERNEL__
#include "byteorder.h"
/* Userspace builds alias the kernel's little-endian wire types onto the
 * ceph byteorder helpers; the kernel provides __le16/32/64 itself. */
#define __le16 ceph_le16
#define __le32 ceph_le32
#define __le64 ceph_le64
#endif

/*
 * fs id
 */
struct ceph_fsid {
	unsigned char fsid[16];
};

/* Total-order comparison of two fsids: a raw 16-byte memcmp. */
static inline int ceph_fsid_compare(const struct ceph_fsid *a,
				    const struct ceph_fsid *b)
{
	return memcmp(a, b, sizeof(*a));
}

/*
 * ino, object, etc.
 */
typedef __le64 ceph_snapid_t;
#define CEPH_SNAPDIR ((__u64)(-1))  /* reserved for hidden .snap dir */
#define CEPH_NOSNAP  ((__u64)(-2))  /* "head", "live" revision */
#define CEPH_MAXSNAP ((__u64)(-3))  /* largest valid snapid */

struct ceph_timespec {
	__le32 tv_sec;
	__le32 tv_nsec;
} __attribute__ ((packed));


/*
 * object layout - how objects are mapped into PGs
 */
#define CEPH_OBJECT_LAYOUT_HASH     1
#define CEPH_OBJECT_LAYOUT_LINEAR   2
#define CEPH_OBJECT_LAYOUT_HASHINO  3

/*
 * pg layout -- how PGs are mapped onto (sets of) OSDs
 */
#define CEPH_PG_LAYOUT_CRUSH   0
#define CEPH_PG_LAYOUT_HASH    1
#define CEPH_PG_LAYOUT_LINEAR  2
#define CEPH_PG_LAYOUT_HYBRID  3

#define CEPH_PG_MAX_SIZE       16  /* max # osds in a single pg */

/*
 * placement group.
 * we encode this into one __le64.
 */
struct ceph_pg {
	__le16 preferred; /* preferred primary osd */
	__le16 ps;        /* placement seed */
	__le32 pool;      /* object pool */
} __attribute__ ((packed));

/*
 * pg pool types
 *
 * NOTE: These map 1:1 on to the pg_pool_t::TYPE_* values.  They are
 * duplicated here only for CrushCompiler's benefit.
 */
#define CEPH_PG_TYPE_REPLICATED 1
/* #define CEPH_PG_TYPE_RAID4 2 never implemented */
#define CEPH_PG_TYPE_ERASURE 3

/*
 * stable_mod func is used to control number of placement groups.
 * similar to straight-up modulo, but produces a stable mapping as b
 * increases over time.  b is the number of bins, and bmask is the
 * containing power of 2 minus 1.
 *
 * b <= bmask and bmask=(2**n)-1
 * e.g., b=12 -> bmask=15, b=123 -> bmask=127
 *
 * ** This function is released to the public domain by the author. **
 */
static inline int ceph_stable_mod(int x, int b, int bmask)
{
	if ((x & bmask) < b)
		return x & bmask;
	else
		return x & (bmask >> 1);
}

/*
 * object layout - how a given object should be stored.
 */
struct ceph_object_layout {
	struct ceph_pg ol_pgid;   /* raw pg, with _full_ ps precision. */
	__le32 ol_stripe_unit;    /* for per-object parity, if any */
} __attribute__ ((packed));

/*
 * compound epoch+version, used by storage layer to serialize mutations
 */
struct ceph_eversion {
	__le32 epoch;
	__le64 version;
} __attribute__ ((packed));

/*
 * osd map bits
 */

/* status bits */
#define CEPH_OSD_EXISTS       (1<<0)
#define CEPH_OSD_UP           (1<<1)
#define CEPH_OSD_AUTOOUT      (1<<2)  /* osd was automatically marked out */
#define CEPH_OSD_NEW          (1<<3)  /* osd is new, never marked in */
#define CEPH_OSD_FULL         (1<<4)  /* osd is at or above full threshold */
#define CEPH_OSD_NEARFULL     (1<<5)  /* osd is at or above nearfull threshold */
#define CEPH_OSD_BACKFILLFULL (1<<6)  /* osd is at or above backfillfull threshold */
#define CEPH_OSD_DESTROYED    (1<<7)  /* osd has been destroyed */
#define CEPH_OSD_NOUP         (1<<8)  /* osd can not be marked up */
#define CEPH_OSD_NODOWN       (1<<9)  /* osd can not be marked down */
#define CEPH_OSD_NOIN         (1<<10) /* osd can not be marked in */
#define CEPH_OSD_NOOUT        (1<<11) /* osd can not be marked out */
#define CEPH_OSD_STOP         (1<<12) /* osd has been stopped by admin */

extern const char *ceph_osd_state_name(int s);

/* osd weights.
fixed point value: 0x10000 == 1.0 ("in"), 0 == "out" */ +#define CEPH_OSD_IN 0x10000 +#define CEPH_OSD_OUT 0 + +#define CEPH_OSD_MAX_PRIMARY_AFFINITY 0x10000 +#define CEPH_OSD_DEFAULT_PRIMARY_AFFINITY 0x10000 + + +/* + * osd map flag bits + */ +#define CEPH_OSDMAP_NEARFULL (1<<0) /* sync writes (near ENOSPC), deprecated since mimic*/ +#define CEPH_OSDMAP_FULL (1<<1) /* no data writes (ENOSPC), deprecated since mimic */ +#define CEPH_OSDMAP_PAUSERD (1<<2) /* pause all reads */ +#define CEPH_OSDMAP_PAUSEWR (1<<3) /* pause all writes */ +#define CEPH_OSDMAP_PAUSEREC (1<<4) /* pause recovery */ +#define CEPH_OSDMAP_NOUP (1<<5) /* block osd boot */ +#define CEPH_OSDMAP_NODOWN (1<<6) /* block osd mark-down/failure */ +#define CEPH_OSDMAP_NOOUT (1<<7) /* block osd auto mark-out */ +#define CEPH_OSDMAP_NOIN (1<<8) /* block osd auto mark-in */ +#define CEPH_OSDMAP_NOBACKFILL (1<<9) /* block osd backfill */ +#define CEPH_OSDMAP_NORECOVER (1<<10) /* block osd recovery and backfill */ +#define CEPH_OSDMAP_NOSCRUB (1<<11) /* block periodic scrub */ +#define CEPH_OSDMAP_NODEEP_SCRUB (1<<12) /* block periodic deep-scrub */ +#define CEPH_OSDMAP_NOTIERAGENT (1<<13) /* disable tiering agent */ +#define CEPH_OSDMAP_NOREBALANCE (1<<14) /* block osd backfill unless pg is degraded */ +#define CEPH_OSDMAP_SORTBITWISE (1<<15) /* use bitwise hobject_t sort */ +#define CEPH_OSDMAP_REQUIRE_JEWEL (1<<16) /* require jewel for booting osds */ +#define CEPH_OSDMAP_REQUIRE_KRAKEN (1<<17) /* require kraken for booting osds */ +#define CEPH_OSDMAP_REQUIRE_LUMINOUS (1<<18) /* require l for booting osds */ +#define CEPH_OSDMAP_RECOVERY_DELETES (1<<19) /* deletes performed during recovery instead of peering */ +#define CEPH_OSDMAP_PURGED_SNAPDIRS (1<<20) /* osds have converted snapsets */ +#define CEPH_OSDMAP_NOSNAPTRIM (1<<21) /* disable snap trimming */ +#define CEPH_OSDMAP_PGLOG_HARDLIMIT (1<<22) /* put a hard limit on pg log length */ +#define CEPH_OSDMAP_NOAUTOSCALE (1<<23) /* block pg autoscale 
*/ + +/* these are hidden in 'ceph status' view */ +#define CEPH_OSDMAP_SEMIHIDDEN_FLAGS (CEPH_OSDMAP_REQUIRE_JEWEL| \ + CEPH_OSDMAP_REQUIRE_KRAKEN | \ + CEPH_OSDMAP_REQUIRE_LUMINOUS | \ + CEPH_OSDMAP_RECOVERY_DELETES | \ + CEPH_OSDMAP_SORTBITWISE | \ + CEPH_OSDMAP_PURGED_SNAPDIRS | \ + CEPH_OSDMAP_PGLOG_HARDLIMIT) +#define CEPH_OSDMAP_LEGACY_REQUIRE_FLAGS (CEPH_OSDMAP_REQUIRE_JEWEL | \ + CEPH_OSDMAP_REQUIRE_KRAKEN | \ + CEPH_OSDMAP_REQUIRE_LUMINOUS) + +/* + * major ceph release numbers + */ +#define CEPH_RELEASE_ARGONAUT 1 +#define CEPH_RELEASE_BOBTAIL 2 +#define CEPH_RELEASE_CUTTLEFISH 3 +#define CEPH_RELEASE_DUMPLING 4 +#define CEPH_RELEASE_EMPEROR 5 +#define CEPH_RELEASE_FIREFLY 6 +#define CEPH_RELEASE_GIANT 7 +#define CEPH_RELEASE_HAMMER 8 +#define CEPH_RELEASE_INFERNALIS 9 +#define CEPH_RELEASE_JEWEL 10 +#define CEPH_RELEASE_KRAKEN 11 +#define CEPH_RELEASE_LUMINOUS 12 +#define CEPH_RELEASE_MIMIC 13 +#define CEPH_RELEASE_NAUTILUS 14 +#define CEPH_RELEASE_OCTOPUS 15 +#define CEPH_RELEASE_PACIFIC 16 +#define CEPH_RELEASE_QUINCY 17 +#define CEPH_RELEASE_REEF 18 +#define CEPH_RELEASE_MAX 19 /* highest + 1 */ + +/* + * The error code to return when an OSD can't handle a write + * because it is too large. + */ +#define OSD_WRITETOOBIG EMSGSIZE + +/* + * osd ops + * + * WARNING: do not use these op codes directly. Use the helpers + * defined below instead. In certain cases, op code behavior was + * redefined, resulting in special-cases in the helpers. 
+ */ +#define CEPH_OSD_OP_MODE 0xf000 +#define CEPH_OSD_OP_MODE_RD 0x1000 +#define CEPH_OSD_OP_MODE_WR 0x2000 +#define CEPH_OSD_OP_MODE_RMW 0x3000 +#define CEPH_OSD_OP_MODE_SUB 0x4000 +#define CEPH_OSD_OP_MODE_CACHE 0x8000 + +#define CEPH_OSD_OP_TYPE 0x0f00 +#define CEPH_OSD_OP_TYPE_DATA 0x0200 +#define CEPH_OSD_OP_TYPE_ATTR 0x0300 +#define CEPH_OSD_OP_TYPE_EXEC 0x0400 +#define CEPH_OSD_OP_TYPE_PG 0x0500 +// LEAVE UNUSED 0x0600 used to be multiobject ops + +#define __CEPH_OSD_OP1(mode, nr) \ + (CEPH_OSD_OP_MODE_##mode | (nr)) + +#define __CEPH_OSD_OP(mode, type, nr) \ + (CEPH_OSD_OP_MODE_##mode | CEPH_OSD_OP_TYPE_##type | (nr)) + +#define __CEPH_FORALL_OSD_OPS(f) \ + /** data **/ \ + /* read */ \ + f(READ, __CEPH_OSD_OP(RD, DATA, 1), "read") \ + f(STAT, __CEPH_OSD_OP(RD, DATA, 2), "stat") \ + f(MAPEXT, __CEPH_OSD_OP(RD, DATA, 3), "mapext") \ + f(CHECKSUM, __CEPH_OSD_OP(RD, DATA, 31), "checksum") \ + \ + /* fancy read */ \ + f(MASKTRUNC, __CEPH_OSD_OP(RD, DATA, 4), "masktrunc") \ + f(SPARSE_READ, __CEPH_OSD_OP(RD, DATA, 5), "sparse-read") \ + \ + f(NOTIFY, __CEPH_OSD_OP(RD, DATA, 6), "notify") \ + f(NOTIFY_ACK, __CEPH_OSD_OP(RD, DATA, 7), "notify-ack") \ + \ + /* versioning */ \ + f(ASSERT_VER, __CEPH_OSD_OP(RD, DATA, 8), "assert-version") \ + \ + f(LIST_WATCHERS, __CEPH_OSD_OP(RD, DATA, 9), "list-watchers") \ + \ + f(LIST_SNAPS, __CEPH_OSD_OP(RD, DATA, 10), "list-snaps") \ + \ + /* sync */ \ + f(SYNC_READ, __CEPH_OSD_OP(RD, DATA, 11), "sync_read") \ + \ + /* write */ \ + f(WRITE, __CEPH_OSD_OP(WR, DATA, 1), "write") \ + f(WRITEFULL, __CEPH_OSD_OP(WR, DATA, 2), "writefull") \ + f(TRUNCATE, __CEPH_OSD_OP(WR, DATA, 3), "truncate") \ + f(ZERO, __CEPH_OSD_OP(WR, DATA, 4), "zero") \ + f(DELETE, __CEPH_OSD_OP(WR, DATA, 5), "delete") \ + \ + /* fancy write */ \ + f(APPEND, __CEPH_OSD_OP(WR, DATA, 6), "append") \ + f(STARTSYNC, __CEPH_OSD_OP(WR, DATA, 7), "startsync") \ + f(SETTRUNC, __CEPH_OSD_OP(WR, DATA, 8), "settrunc") \ + f(TRIMTRUNC, __CEPH_OSD_OP(WR, DATA, 9), 
"trimtrunc") \ + \ + f(TMAPUP, __CEPH_OSD_OP(RMW, DATA, 10), "tmapup") \ + f(TMAPPUT, __CEPH_OSD_OP(WR, DATA, 11), "tmapput") \ + f(TMAPGET, __CEPH_OSD_OP(RD, DATA, 12), "tmapget") \ + \ + f(CREATE, __CEPH_OSD_OP(WR, DATA, 13), "create") \ + f(ROLLBACK, __CEPH_OSD_OP(WR, DATA, 14), "rollback") \ + \ + f(WATCH, __CEPH_OSD_OP(WR, DATA, 15), "watch") \ + \ + /* omap */ \ + f(OMAPGETKEYS, __CEPH_OSD_OP(RD, DATA, 17), "omap-get-keys") \ + f(OMAPGETVALS, __CEPH_OSD_OP(RD, DATA, 18), "omap-get-vals") \ + f(OMAPGETHEADER, __CEPH_OSD_OP(RD, DATA, 19), "omap-get-header") \ + f(OMAPGETVALSBYKEYS, __CEPH_OSD_OP(RD, DATA, 20), "omap-get-vals-by-keys") \ + f(OMAPSETVALS, __CEPH_OSD_OP(WR, DATA, 21), "omap-set-vals") \ + f(OMAPSETHEADER, __CEPH_OSD_OP(WR, DATA, 22), "omap-set-header") \ + f(OMAPCLEAR, __CEPH_OSD_OP(WR, DATA, 23), "omap-clear") \ + f(OMAPRMKEYS, __CEPH_OSD_OP(WR, DATA, 24), "omap-rm-keys") \ + f(OMAPRMKEYRANGE, __CEPH_OSD_OP(WR, DATA, 44), "omap-rm-key-range") \ + f(OMAP_CMP, __CEPH_OSD_OP(RD, DATA, 25), "omap-cmp") \ + \ + /* tiering */ \ + f(COPY_FROM, __CEPH_OSD_OP(WR, DATA, 26), "copy-from") \ + f(COPY_FROM2, __CEPH_OSD_OP(WR, DATA, 45), "copy-from2") \ + /* was copy-get-classic */ \ + f(UNDIRTY, __CEPH_OSD_OP(WR, DATA, 28), "undirty") \ + f(ISDIRTY, __CEPH_OSD_OP(RD, DATA, 29), "isdirty") \ + f(COPY_GET, __CEPH_OSD_OP(RD, DATA, 30), "copy-get") \ + f(CACHE_FLUSH, __CEPH_OSD_OP(CACHE, DATA, 31), "cache-flush") \ + f(CACHE_EVICT, __CEPH_OSD_OP(CACHE, DATA, 32), "cache-evict") \ + f(CACHE_TRY_FLUSH, __CEPH_OSD_OP(CACHE, DATA, 33), "cache-try-flush") \ + \ + /* convert tmap to omap */ \ + f(TMAP2OMAP, __CEPH_OSD_OP(RMW, DATA, 34), "tmap2omap") \ + \ + /* hints */ \ + f(SETALLOCHINT, __CEPH_OSD_OP(WR, DATA, 35), "set-alloc-hint") \ + \ + /* cache pin/unpin */ \ + f(CACHE_PIN, __CEPH_OSD_OP(WR, DATA, 36), "cache-pin") \ + f(CACHE_UNPIN, __CEPH_OSD_OP(WR, DATA, 37), "cache-unpin") \ + \ + /* ESX/SCSI */ \ + f(WRITESAME, __CEPH_OSD_OP(WR, DATA, 38), "write-same") \ + 
f(CMPEXT, __CEPH_OSD_OP(RD, DATA, 32), "cmpext") \ + \ + /* Extensible */ \ + f(SET_REDIRECT, __CEPH_OSD_OP(WR, DATA, 39), "set-redirect") \ + f(SET_CHUNK, __CEPH_OSD_OP(CACHE, DATA, 40), "set-chunk") \ + f(TIER_PROMOTE, __CEPH_OSD_OP(WR, DATA, 41), "tier-promote") \ + f(UNSET_MANIFEST, __CEPH_OSD_OP(WR, DATA, 42), "unset-manifest") \ + f(TIER_FLUSH, __CEPH_OSD_OP(CACHE, DATA, 43), "tier-flush") \ + f(TIER_EVICT, __CEPH_OSD_OP(CACHE, DATA, 44), "tier-evict") \ + \ + /** attrs **/ \ + /* read */ \ + f(GETXATTR, __CEPH_OSD_OP(RD, ATTR, 1), "getxattr") \ + f(GETXATTRS, __CEPH_OSD_OP(RD, ATTR, 2), "getxattrs") \ + f(CMPXATTR, __CEPH_OSD_OP(RD, ATTR, 3), "cmpxattr") \ + \ + /* write */ \ + f(SETXATTR, __CEPH_OSD_OP(WR, ATTR, 1), "setxattr") \ + f(SETXATTRS, __CEPH_OSD_OP(WR, ATTR, 2), "setxattrs") \ + f(RESETXATTRS, __CEPH_OSD_OP(WR, ATTR, 3), "resetxattrs") \ + f(RMXATTR, __CEPH_OSD_OP(WR, ATTR, 4), "rmxattr") \ + \ + /** subop **/ \ + f(PULL, __CEPH_OSD_OP1(SUB, 1), "pull") \ + f(PUSH, __CEPH_OSD_OP1(SUB, 2), "push") \ + f(BALANCEREADS, __CEPH_OSD_OP1(SUB, 3), "balance-reads") \ + f(UNBALANCEREADS, __CEPH_OSD_OP1(SUB, 4), "unbalance-reads") \ + f(SCRUB, __CEPH_OSD_OP1(SUB, 5), "scrub") \ + f(SCRUB_RESERVE, __CEPH_OSD_OP1(SUB, 6), "scrub-reserve") \ + f(SCRUB_UNRESERVE, __CEPH_OSD_OP1(SUB, 7), "scrub-unreserve") \ + /* 8 used to be scrub-stop */ \ + f(SCRUB_MAP, __CEPH_OSD_OP1(SUB, 9), "scrub-map") \ + \ + /** exec **/ \ + /* note: the RD bit here is wrong; see special-case below in helper */ \ + f(CALL, __CEPH_OSD_OP(RD, EXEC, 1), "call") \ + \ + /** pg **/ \ + f(PGLS, __CEPH_OSD_OP(RD, PG, 1), "pgls") \ + f(PGLS_FILTER, __CEPH_OSD_OP(RD, PG, 2), "pgls-filter") \ + f(PG_HITSET_LS, __CEPH_OSD_OP(RD, PG, 3), "pg-hitset-ls") \ + f(PG_HITSET_GET, __CEPH_OSD_OP(RD, PG, 4), "pg-hitset-get") \ + f(PGNLS, __CEPH_OSD_OP(RD, PG, 5), "pgnls") \ + f(PGNLS_FILTER, __CEPH_OSD_OP(RD, PG, 6), "pgnls-filter") \ + f(SCRUBLS, __CEPH_OSD_OP(RD, PG, 7), "scrubls") + +enum { +#define 
GENERATE_ENUM_ENTRY(op, opcode, str) CEPH_OSD_OP_##op = (opcode), +__CEPH_FORALL_OSD_OPS(GENERATE_ENUM_ENTRY) +#undef GENERATE_ENUM_ENTRY +}; + +static inline int ceph_osd_op_type_data(int op) +{ + return (op & CEPH_OSD_OP_TYPE) == CEPH_OSD_OP_TYPE_DATA; +} +static inline int ceph_osd_op_type_attr(int op) +{ + return (op & CEPH_OSD_OP_TYPE) == CEPH_OSD_OP_TYPE_ATTR; +} +static inline int ceph_osd_op_type_exec(int op) +{ + return (op & CEPH_OSD_OP_TYPE) == CEPH_OSD_OP_TYPE_EXEC; +} +static inline int ceph_osd_op_type_pg(int op) +{ + return (op & CEPH_OSD_OP_TYPE) == CEPH_OSD_OP_TYPE_PG; +} + +static inline int ceph_osd_op_mode_subop(int op) +{ + return (op & CEPH_OSD_OP_MODE) == CEPH_OSD_OP_MODE_SUB; +} +static inline int ceph_osd_op_mode_read(int op) +{ + return (op & CEPH_OSD_OP_MODE_RD) && + op != CEPH_OSD_OP_CALL; +} +static inline int ceph_osd_op_mode_modify(int op) +{ + return op & CEPH_OSD_OP_MODE_WR; +} +static inline int ceph_osd_op_mode_cache(int op) +{ + return op & CEPH_OSD_OP_MODE_CACHE; +} +static inline bool ceph_osd_op_uses_extent(int op) +{ + switch(op) { + case CEPH_OSD_OP_READ: + case CEPH_OSD_OP_MAPEXT: + case CEPH_OSD_OP_MASKTRUNC: + case CEPH_OSD_OP_SPARSE_READ: + case CEPH_OSD_OP_SYNC_READ: + case CEPH_OSD_OP_WRITE: + case CEPH_OSD_OP_WRITEFULL: + case CEPH_OSD_OP_TRUNCATE: + case CEPH_OSD_OP_ZERO: + case CEPH_OSD_OP_APPEND: + case CEPH_OSD_OP_TRIMTRUNC: + case CEPH_OSD_OP_CMPEXT: + return true; + default: + return false; + } +} + +/* + * note that the following tmap stuff is also defined in the ceph librados.h + * and objclass.h. Any modification here needs to be updated there + */ +#define CEPH_OSD_TMAP_HDR 'h' +#define CEPH_OSD_TMAP_SET 's' +#define CEPH_OSD_TMAP_CREATE 'c' /* create key */ +#define CEPH_OSD_TMAP_RM 'r' +#define CEPH_OSD_TMAP_RMSLOPPY 'R' + +extern const char *ceph_osd_op_name(int op); + +/* + * osd op flags + * + * An op may be READ, WRITE, or READ|WRITE. 
+ */ +enum { + CEPH_OSD_FLAG_ACK = 0x0001, /* want (or is) "ack" ack */ + CEPH_OSD_FLAG_ONNVRAM = 0x0002, /* want (or is) "onnvram" ack */ + CEPH_OSD_FLAG_ONDISK = 0x0004, /* want (or is) "ondisk" ack */ + CEPH_OSD_FLAG_RETRY = 0x0008, /* resend attempt */ + CEPH_OSD_FLAG_READ = 0x0010, /* op may read */ + CEPH_OSD_FLAG_WRITE = 0x0020, /* op may write */ + CEPH_OSD_FLAG_ORDERSNAP = 0x0040, /* EOLDSNAP if snapc is out of order */ + CEPH_OSD_FLAG_PEERSTAT_OLD = 0x0080, /* DEPRECATED msg includes osd_peer_stat */ + CEPH_OSD_FLAG_BALANCE_READS = 0x0100, + CEPH_OSD_FLAG_PARALLELEXEC = 0x0200, /* execute op in parallel */ + CEPH_OSD_FLAG_PGOP = 0x0400, /* pg op, no object */ + CEPH_OSD_FLAG_EXEC = 0x0800, /* op may exec */ + CEPH_OSD_FLAG_EXEC_PUBLIC = 0x1000, /* DEPRECATED op may exec (public) */ + CEPH_OSD_FLAG_LOCALIZE_READS = 0x2000, /* read from nearby replica, if any */ + CEPH_OSD_FLAG_RWORDERED = 0x4000, /* order wrt concurrent reads */ + CEPH_OSD_FLAG_IGNORE_CACHE = 0x8000, /* ignore cache logic */ + CEPH_OSD_FLAG_SKIPRWLOCKS = 0x10000, /* skip rw locks */ + CEPH_OSD_FLAG_IGNORE_OVERLAY =0x20000, /* ignore pool overlay */ + CEPH_OSD_FLAG_FLUSH = 0x40000, /* this is part of flush */ + CEPH_OSD_FLAG_MAP_SNAP_CLONE =0x80000, /* map snap direct to clone id + */ + CEPH_OSD_FLAG_ENFORCE_SNAPC =0x100000, /* use snapc provided even if + pool uses pool snaps */ + CEPH_OSD_FLAG_REDIRECTED = 0x200000, /* op has been redirected */ + CEPH_OSD_FLAG_KNOWN_REDIR = 0x400000, /* redirect bit is authoritative */ + CEPH_OSD_FLAG_FULL_TRY = 0x800000, /* try op despite full flag */ + CEPH_OSD_FLAG_FULL_FORCE = 0x1000000, /* force op despite full flag */ + CEPH_OSD_FLAG_IGNORE_REDIRECT = 0x2000000, /* ignore redirection */ + CEPH_OSD_FLAG_RETURNVEC = 0x4000000, /* allow overall result >= 0, and return >= 0 and buffer for each op in opvec */ + CEPH_OSD_FLAG_SUPPORTSPOOLEIO = 0x8000000, /* client understands pool EIO flag */ +}; + +enum { + CEPH_OSD_OP_FLAG_EXCL = 0x1, /* EXCL object 
create */ + CEPH_OSD_OP_FLAG_FAILOK = 0x2, /* continue despite failure */ + CEPH_OSD_OP_FLAG_FADVISE_RANDOM = 0x4, /* the op is random */ + CEPH_OSD_OP_FLAG_FADVISE_SEQUENTIAL = 0x8, /* the op is sequential */ + CEPH_OSD_OP_FLAG_FADVISE_WILLNEED = 0x10,/* data will be accessed in the near future */ + CEPH_OSD_OP_FLAG_FADVISE_DONTNEED = 0x20,/* data will not be accessed in the near future */ + CEPH_OSD_OP_FLAG_FADVISE_NOCACHE = 0x40, /* data will be accessed only once by this client */ + CEPH_OSD_OP_FLAG_WITH_REFERENCE = 0x80, /* need reference couting */ + CEPH_OSD_OP_FLAG_BYPASS_CLEAN_CACHE = 0x100, /* bypass ObjectStore cache, mainly for deep-scrub */ +}; + +#define EOLDSNAPC 85 /* ORDERSNAP flag set; writer has old snapc*/ +#define EBLOCKLISTED 108 /* blocklisted */ +#define EBLACKLISTED 108 /* deprecated */ + +/* xattr comparison */ +enum { + CEPH_OSD_CMPXATTR_OP_EQ = 1, + CEPH_OSD_CMPXATTR_OP_NE = 2, + CEPH_OSD_CMPXATTR_OP_GT = 3, + CEPH_OSD_CMPXATTR_OP_GTE = 4, + CEPH_OSD_CMPXATTR_OP_LT = 5, + CEPH_OSD_CMPXATTR_OP_LTE = 6 +}; + +enum { + CEPH_OSD_CMPXATTR_MODE_STRING = 1, + CEPH_OSD_CMPXATTR_MODE_U64 = 2 +}; + +enum { + CEPH_OSD_COPY_FROM_FLAG_FLUSH = 1, /* part of a flush operation */ + CEPH_OSD_COPY_FROM_FLAG_IGNORE_OVERLAY = 2, /* ignore pool overlay */ + CEPH_OSD_COPY_FROM_FLAG_IGNORE_CACHE = 4, /* ignore osd cache logic */ + CEPH_OSD_COPY_FROM_FLAG_MAP_SNAP_CLONE = 8, /* map snap direct to + * cloneid */ + CEPH_OSD_COPY_FROM_FLAG_RWORDERED = 16, /* order with write */ + CEPH_OSD_COPY_FROM_FLAG_TRUNCATE_SEQ = 32, /* use provided truncate_{seq,size} (copy-from2 only) */ +}; + +#define CEPH_OSD_COPY_FROM_FLAGS \ + (CEPH_OSD_COPY_FROM_FLAG_FLUSH | \ + CEPH_OSD_COPY_FROM_FLAG_IGNORE_OVERLAY | \ + CEPH_OSD_COPY_FROM_FLAG_IGNORE_CACHE | \ + CEPH_OSD_COPY_FROM_FLAG_MAP_SNAP_CLONE | \ + CEPH_OSD_COPY_FROM_FLAG_RWORDERED | \ + CEPH_OSD_COPY_FROM_FLAG_TRUNCATE_SEQ) + +enum { + CEPH_OSD_TMAP2OMAP_NULLOK = 1, +}; + +enum { + CEPH_OSD_WATCH_OP_UNWATCH = 0, + 
CEPH_OSD_WATCH_OP_LEGACY_WATCH = 1, + /* note: use only ODD ids to prevent pre-giant code from + interpreting the op as UNWATCH */ + CEPH_OSD_WATCH_OP_WATCH = 3, + CEPH_OSD_WATCH_OP_RECONNECT = 5, + CEPH_OSD_WATCH_OP_PING = 7, +}; + +enum { + CEPH_OSD_CHECKSUM_OP_TYPE_XXHASH32 = 0, + CEPH_OSD_CHECKSUM_OP_TYPE_XXHASH64 = 1, + CEPH_OSD_CHECKSUM_OP_TYPE_CRC32C = 2 +}; + +const char *ceph_osd_watch_op_name(int o); + +enum { + CEPH_OSD_ALLOC_HINT_FLAG_SEQUENTIAL_WRITE = 1, + CEPH_OSD_ALLOC_HINT_FLAG_RANDOM_WRITE = 2, + CEPH_OSD_ALLOC_HINT_FLAG_SEQUENTIAL_READ = 4, + CEPH_OSD_ALLOC_HINT_FLAG_RANDOM_READ = 8, + CEPH_OSD_ALLOC_HINT_FLAG_APPEND_ONLY = 16, + CEPH_OSD_ALLOC_HINT_FLAG_IMMUTABLE = 32, + CEPH_OSD_ALLOC_HINT_FLAG_SHORTLIVED = 64, + CEPH_OSD_ALLOC_HINT_FLAG_LONGLIVED = 128, + CEPH_OSD_ALLOC_HINT_FLAG_COMPRESSIBLE = 256, + CEPH_OSD_ALLOC_HINT_FLAG_INCOMPRESSIBLE = 512, +}; + +const char *ceph_osd_alloc_hint_flag_name(int f); + +enum { + CEPH_OSD_BACKOFF_OP_BLOCK = 1, + CEPH_OSD_BACKOFF_OP_ACK_BLOCK = 2, + CEPH_OSD_BACKOFF_OP_UNBLOCK = 3, +}; + +const char *ceph_osd_backoff_op_name(int op); + +/* + * an individual object operation. 
each may be accompanied by some data + * payload + */ +struct ceph_osd_op { + __le16 op; /* CEPH_OSD_OP_* */ + __le32 flags; /* CEPH_OSD_OP_FLAG_* */ + union { + struct { + __le64 offset, length; + __le64 truncate_size; + __le32 truncate_seq; + } __attribute__ ((packed)) extent; + struct { + __le32 name_len; + __le32 value_len; + __u8 cmp_op; /* CEPH_OSD_CMPXATTR_OP_* */ + __u8 cmp_mode; /* CEPH_OSD_CMPXATTR_MODE_* */ + } __attribute__ ((packed)) xattr; + struct { + __u8 class_len; + __u8 method_len; + __u8 argc; + __le32 indata_len; + } __attribute__ ((packed)) cls; + struct { + __le64 count; + __le32 start_epoch; /* for the pgls sequence */ + } __attribute__ ((packed)) pgls; + struct { + __le64 snapid; + } __attribute__ ((packed)) snap; + struct { + __le64 cookie; + __le64 ver; /* no longer used */ + __u8 op; /* CEPH_OSD_WATCH_OP_* */ + __u32 gen; /* registration generation */ + __u32 timeout; /* connection timeout */ + } __attribute__ ((packed)) watch; + struct { + __le64 cookie; + } __attribute__ ((packed)) notify; + struct { + __le64 unused; + __le64 ver; + } __attribute__ ((packed)) assert_ver; + struct { + __le64 offset, length; + __le64 src_offset; + } __attribute__ ((packed)) clonerange; + struct { + __le64 max; /* max data in reply */ + } __attribute__ ((packed)) copy_get; + struct { + __le64 snapid; + __le64 src_version; + __u8 flags; /* CEPH_OSD_COPY_FROM_FLAG_* */ + /* + * CEPH_OSD_OP_FLAG_FADVISE_*: fadvise flags + * for src object, flags for dest object are in + * ceph_osd_op::flags. 
+ */ + __le32 src_fadvise_flags; + } __attribute__ ((packed)) copy_from; + struct { + struct ceph_timespec stamp; + } __attribute__ ((packed)) hit_set_get; + struct { + __u8 flags; + } __attribute__ ((packed)) tmap2omap; + struct { + __le64 expected_object_size; + __le64 expected_write_size; + __le32 flags; /* CEPH_OSD_OP_ALLOC_HINT_FLAG_* */ + } __attribute__ ((packed)) alloc_hint; + struct { + __le64 offset; + __le64 length; + __le64 data_length; + } __attribute__ ((packed)) writesame; + struct { + __le64 offset; + __le64 length; + __le32 chunk_size; + __u8 type; /* CEPH_OSD_CHECKSUM_OP_TYPE_* */ + } __attribute__ ((packed)) checksum; + } __attribute__ ((packed)); + __le32 payload_len; +} __attribute__ ((packed)); + +/* + * Check the compatibility of struct ceph_osd_op + * (2+4+(2*8+8+4)+4) = (sizeof(ceph_osd_op::op) + + * sizeof(ceph_osd_op::flags) + + * sizeof(ceph_osd_op::extent) + + * sizeof(ceph_osd_op::payload_len)) + */ +#ifdef __cplusplus +static_assert(sizeof(ceph_osd_op) == (2+4+(2*8+8+4)+4), + "sizeof(ceph_osd_op) breaks the compatibility"); +#endif + +struct ceph_osd_reply_head { + __le32 client_inc; /* client incarnation */ + __le32 flags; + struct ceph_object_layout layout; + __le32 osdmap_epoch; + struct ceph_eversion reassert_version; /* for replaying uncommitted */ + + __le32 result; /* result code */ + + __le32 object_len; /* length of object name */ + __le32 num_ops; + struct ceph_osd_op ops[0]; /* ops[], object */ +} __attribute__ ((packed)); + +#ifndef __KERNEL__ +#undef __le16 +#undef __le32 +#undef __le64 +#endif + +#endif diff --git a/src/include/rados/buffer.h b/src/include/rados/buffer.h new file mode 120000 index 000000000..51fc03be1 --- /dev/null +++ b/src/include/rados/buffer.h @@ -0,0 +1 @@ +../buffer.h
\ No newline at end of file diff --git a/src/include/rados/buffer_fwd.h b/src/include/rados/buffer_fwd.h new file mode 120000 index 000000000..bd1f6f1b0 --- /dev/null +++ b/src/include/rados/buffer_fwd.h @@ -0,0 +1 @@ +../buffer_fwd.h
\ No newline at end of file diff --git a/src/include/rados/crc32c.h b/src/include/rados/crc32c.h new file mode 120000 index 000000000..19ef4317e --- /dev/null +++ b/src/include/rados/crc32c.h @@ -0,0 +1 @@ +../crc32c.h
\ No newline at end of file diff --git a/src/include/rados/inline_memory.h b/src/include/rados/inline_memory.h new file mode 120000 index 000000000..48f0d4436 --- /dev/null +++ b/src/include/rados/inline_memory.h @@ -0,0 +1 @@ +../inline_memory.h
\ No newline at end of file diff --git a/src/include/rados/librados.h b/src/include/rados/librados.h new file mode 100644 index 000000000..858804c3a --- /dev/null +++ b/src/include/rados/librados.h @@ -0,0 +1,4156 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2004-2012 Sage Weil <sage@newdream.net> + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. + * + */ + +#ifndef CEPH_LIBRADOS_H +#define CEPH_LIBRADOS_H + +#ifdef __cplusplus +extern "C" { +#endif + +#include <netinet/in.h> +#if defined(__linux__) +#include <linux/types.h> +#elif defined(__FreeBSD__) +#include <sys/types.h> +#endif +#include <unistd.h> +#include <string.h> +#include "rados_types.h" + +#include <sys/time.h> + +#ifndef CEPH_OSD_TMAP_SET +/* These are also defined in rados.h and objclass.h. Keep them in sync! */ +#define CEPH_OSD_TMAP_HDR 'h' +#define CEPH_OSD_TMAP_SET 's' +#define CEPH_OSD_TMAP_CREATE 'c' +#define CEPH_OSD_TMAP_RM 'r' +#endif + +#define LIBRADOS_VER_MAJOR 3 +#define LIBRADOS_VER_MINOR 0 +#define LIBRADOS_VER_EXTRA 0 + +#define LIBRADOS_VERSION(maj, min, extra) ((maj << 16) + (min << 8) + extra) + +#define LIBRADOS_VERSION_CODE LIBRADOS_VERSION(LIBRADOS_VER_MAJOR, LIBRADOS_VER_MINOR, LIBRADOS_VER_EXTRA) + +#define LIBRADOS_SUPPORTS_WATCH 1 +#define LIBRADOS_SUPPORTS_SERVICES 1 +#define LIBRADOS_SUPPORTS_GETADDRS 1 +#define LIBRADOS_SUPPORTS_APP_METADATA 1 + +/* RADOS lock flags + * They are also defined in cls_lock_types.h. Keep them in sync! + */ +#define LIBRADOS_LOCK_FLAG_RENEW (1u<<0) +#define LIBRADOS_LOCK_FLAG_MAY_RENEW LIBRADOS_LOCK_FLAG_RENEW +#define LIBRADOS_LOCK_FLAG_MUST_RENEW (1u<<1) + +/* + * Constants for rados_write_op_create(). 
+ */ +#define LIBRADOS_CREATE_EXCLUSIVE 1 +#define LIBRADOS_CREATE_IDEMPOTENT 0 + +/* + * Flags that can be set on a per-op basis via + * rados_read_op_set_flags() and rados_write_op_set_flags(). + */ +enum { + // fail a create operation if the object already exists + LIBRADOS_OP_FLAG_EXCL = 0x1, + // allow the transaction to succeed even if the flagged op fails + LIBRADOS_OP_FLAG_FAILOK = 0x2, + // indicate read/write op random + LIBRADOS_OP_FLAG_FADVISE_RANDOM = 0x4, + // indicate read/write op sequential + LIBRADOS_OP_FLAG_FADVISE_SEQUENTIAL = 0x8, + // indicate read/write data will be accessed in the near future (by someone) + LIBRADOS_OP_FLAG_FADVISE_WILLNEED = 0x10, + // indicate read/write data will not accessed in the near future (by anyone) + LIBRADOS_OP_FLAG_FADVISE_DONTNEED = 0x20, + // indicate read/write data will not accessed again (by *this* client) + LIBRADOS_OP_FLAG_FADVISE_NOCACHE = 0x40, + // optionally support FUA (force unit access) on write requests + LIBRADOS_OP_FLAG_FADVISE_FUA = 0x80, +}; + +#define CEPH_RADOS_API + +/** + * @name xattr comparison operations + * Operators for comparing xattrs on objects, and aborting the + * rados_read_op or rados_write_op transaction if the comparison + * fails. + * + * @{ + */ +enum { + LIBRADOS_CMPXATTR_OP_EQ = 1, + LIBRADOS_CMPXATTR_OP_NE = 2, + LIBRADOS_CMPXATTR_OP_GT = 3, + LIBRADOS_CMPXATTR_OP_GTE = 4, + LIBRADOS_CMPXATTR_OP_LT = 5, + LIBRADOS_CMPXATTR_OP_LTE = 6 +}; +/** @} */ + +/** + * @name Operation Flags + * Flags for rados_read_op_operate(), rados_write_op_operate(), + * rados_aio_read_op_operate(), and rados_aio_write_op_operate(). + * See librados.hpp for details. 
+ * @{ + */ +enum { + LIBRADOS_OPERATION_NOFLAG = 0, + LIBRADOS_OPERATION_BALANCE_READS = 1, + LIBRADOS_OPERATION_LOCALIZE_READS = 2, + LIBRADOS_OPERATION_ORDER_READS_WRITES = 4, + LIBRADOS_OPERATION_IGNORE_CACHE = 8, + LIBRADOS_OPERATION_SKIPRWLOCKS = 16, + LIBRADOS_OPERATION_IGNORE_OVERLAY = 32, + /* send requests to cluster despite the cluster or pool being marked + full; ops will either succeed (e.g., delete) or return EDQUOT or + ENOSPC. */ + LIBRADOS_OPERATION_FULL_TRY = 64, + /* + * Mainly for delete op + */ + LIBRADOS_OPERATION_FULL_FORCE = 128, + LIBRADOS_OPERATION_IGNORE_REDIRECT = 256, + LIBRADOS_OPERATION_ORDERSNAP = 512, + /* enable/allow >0 return values and payloads on write/update */ + LIBRADOS_OPERATION_RETURNVEC = 1024, +}; +/** @} */ + +/** + * @name Alloc hint flags + * Flags for rados_write_op_alloc_hint2() and rados_set_alloc_hint2() + * indicating future IO patterns. + * @{ + */ +enum { + LIBRADOS_ALLOC_HINT_FLAG_SEQUENTIAL_WRITE = 1, + LIBRADOS_ALLOC_HINT_FLAG_RANDOM_WRITE = 2, + LIBRADOS_ALLOC_HINT_FLAG_SEQUENTIAL_READ = 4, + LIBRADOS_ALLOC_HINT_FLAG_RANDOM_READ = 8, + LIBRADOS_ALLOC_HINT_FLAG_APPEND_ONLY = 16, + LIBRADOS_ALLOC_HINT_FLAG_IMMUTABLE = 32, + LIBRADOS_ALLOC_HINT_FLAG_SHORTLIVED = 64, + LIBRADOS_ALLOC_HINT_FLAG_LONGLIVED = 128, + LIBRADOS_ALLOC_HINT_FLAG_COMPRESSIBLE = 256, + LIBRADOS_ALLOC_HINT_FLAG_INCOMPRESSIBLE = 512, +}; +/** @} */ + +typedef enum { + LIBRADOS_CHECKSUM_TYPE_XXHASH32 = 0, + LIBRADOS_CHECKSUM_TYPE_XXHASH64 = 1, + LIBRADOS_CHECKSUM_TYPE_CRC32C = 2 +} rados_checksum_type_t; + +/* + * snap id contants + */ +#define LIBRADOS_SNAP_HEAD UINT64_C(-2) +#define LIBRADOS_SNAP_DIR UINT64_C(-1) + +/** + * @typedef rados_t + * + * A handle for interacting with a RADOS cluster. It encapsulates all + * RADOS client configuration, including username, key for + * authentication, logging, and debugging. Talking to different clusters + * -- or to the same cluster with different users -- requires + * different cluster handles. 
+ */ +#ifndef VOIDPTR_RADOS_T +#define VOIDPTR_RADOS_T +typedef void *rados_t; +#endif //VOIDPTR_RADOS_T + +/** + * @typedef rados_config_t + * + * A handle for the ceph configuration context for the rados_t cluster + * instance. This can be used to share configuration context/state + * (e.g., logging configuration) between librados instance. + * + * @warning The config context does not have independent reference + * counting. As such, a rados_config_t handle retrieved from a given + * rados_t is only valid as long as that rados_t. + */ +typedef void *rados_config_t; + +/** + * @typedef rados_ioctx_t + * + * An io context encapsulates a few settings for all I/O operations + * done on it: + * - pool - set when the io context is created (see rados_ioctx_create()) + * - snapshot context for writes (see + * rados_ioctx_selfmanaged_snap_set_write_ctx()) + * - snapshot id to read from (see rados_ioctx_snap_set_read()) + * - object locator for all single-object operations (see + * rados_ioctx_locator_set_key()) + * - namespace for all single-object operations (see + * rados_ioctx_set_namespace()). Set to LIBRADOS_ALL_NSPACES + * before rados_nobjects_list_open() will list all objects in all + * namespaces. + * + * @warning Changing any of these settings is not thread-safe - + * librados users must synchronize any of these changes on their own, + * or use separate io contexts for each thread + */ +typedef void *rados_ioctx_t; + +/** + * @typedef rados_list_ctx_t + * + * An iterator for listing the objects in a pool. + * Used with rados_nobjects_list_open(), + * rados_nobjects_list_next(), rados_nobjects_list_next2(), and + * rados_nobjects_list_close(). + */ +typedef void *rados_list_ctx_t; + +/** + * @typedef rados_object_list_cursor + * + * The cursor used with rados_enumerate_objects + * and accompanying methods. 
+ */
+typedef void * rados_object_list_cursor;
+
+/**
+ * @struct rados_object_list_item
+ *
+ * The item populated by rados_object_list in
+ * the results array.
+ */
+typedef struct {
+
+ /// oid length
+ size_t oid_length;
+ /// name of the object
+ char *oid;
+ /// namespace length
+ size_t nspace_length;
+ /// the object namespace
+ char *nspace;
+ /// locator length
+ size_t locator_length;
+ /// object locator
+ char *locator;
+} rados_object_list_item;
+
+/**
+ * @typedef rados_snap_t
+ * The id of a snapshot.
+ */
+typedef uint64_t rados_snap_t;
+
+/**
+ * @typedef rados_xattrs_iter_t
+ * An iterator for listing extended attributes on an object.
+ * Used with rados_getxattrs(), rados_getxattrs_next(), and
+ * rados_getxattrs_end().
+ */
+typedef void *rados_xattrs_iter_t;
+
+/**
+ * @typedef rados_omap_iter_t
+ * An iterator for listing omap key/value pairs on an object.
+ * Used with rados_read_op_omap_get_keys(), rados_read_op_omap_get_vals(),
+ * rados_read_op_omap_get_vals_by_keys(), rados_omap_get_next(), and
+ * rados_omap_get_end().
+ */
+typedef void *rados_omap_iter_t;
+
+/**
+ * @struct rados_pool_stat_t
+ * Usage information for a pool. 
+ */ +struct rados_pool_stat_t { + /// space used in bytes + uint64_t num_bytes; + /// space used in KB + uint64_t num_kb; + /// number of objects in the pool + uint64_t num_objects; + /// number of clones of objects + uint64_t num_object_clones; + /// num_objects * num_replicas + uint64_t num_object_copies; + /// number of objects missing on primary + uint64_t num_objects_missing_on_primary; + /// number of objects found on no OSDs + uint64_t num_objects_unfound; + /// number of objects replicated fewer times than they should be + /// (but found on at least one OSD) + uint64_t num_objects_degraded; + /// number of objects read + uint64_t num_rd; + /// objects read in KB + uint64_t num_rd_kb; + /// number of objects written + uint64_t num_wr; + /// objects written in KB + uint64_t num_wr_kb; + /// bytes originally provided by user + uint64_t num_user_bytes; + /// bytes passed compression + uint64_t compressed_bytes_orig; + /// bytes resulted after compression + uint64_t compressed_bytes; + /// bytes allocated at storage + uint64_t compressed_bytes_alloc; +}; + +/** + * @struct rados_cluster_stat_t + * Cluster-wide usage information + */ +struct rados_cluster_stat_t { + /// total device size + uint64_t kb; + /// total used + uint64_t kb_used; + /// total available/free + uint64_t kb_avail; + /// number of objects + uint64_t num_objects; +}; + +/** + * @typedef rados_write_op_t + * + * An object write operation stores a number of operations which can be + * executed atomically. 
For usage, see: + * - Creation and deletion: rados_create_write_op() rados_release_write_op() + * - Extended attribute manipulation: rados_write_op_cmpxattr() + * rados_write_op_cmpxattr(), rados_write_op_setxattr(), + * rados_write_op_rmxattr() + * - Object map key/value pairs: rados_write_op_omap_set(), + * rados_write_op_omap_rm_keys(), rados_write_op_omap_clear(), + * rados_write_op_omap_cmp() + * - Object properties: rados_write_op_assert_exists(), + * rados_write_op_assert_version() + * - Creating objects: rados_write_op_create() + * - IO on objects: rados_write_op_append(), rados_write_op_write(), rados_write_op_zero + * rados_write_op_write_full(), rados_write_op_writesame(), rados_write_op_remove, + * rados_write_op_truncate(), rados_write_op_zero(), rados_write_op_cmpext() + * - Hints: rados_write_op_set_alloc_hint() + * - Performing the operation: rados_write_op_operate(), rados_aio_write_op_operate() + */ +typedef void *rados_write_op_t; + +/** + * @typedef rados_read_op_t + * + * An object read operation stores a number of operations which can be + * executed atomically. 
For usage, see:
+ * - Creation and deletion: rados_create_read_op() rados_release_read_op()
+ * - Extended attribute manipulation: rados_read_op_cmpxattr(),
+ * rados_read_op_getxattr(), rados_read_op_getxattrs()
+ * - Object map key/value pairs: rados_read_op_omap_get_vals(),
+ * rados_read_op_omap_get_keys(), rados_read_op_omap_get_vals_by_keys(),
+ * rados_read_op_omap_cmp()
+ * - Object properties: rados_read_op_stat(), rados_read_op_assert_exists(),
+ * rados_read_op_assert_version()
+ * - IO on objects: rados_read_op_read(), rados_read_op_checksum(),
+ * rados_read_op_cmpext()
+ * - Custom operations: rados_read_op_exec(), rados_read_op_exec_user_buf()
+ * - Request properties: rados_read_op_set_flags()
+ * - Performing the operation: rados_read_op_operate(),
+ * rados_aio_read_op_operate()
+ */
+typedef void *rados_read_op_t;
+
+/**
+ * @typedef rados_completion_t
+ * Represents the state of an asynchronous operation - it contains the
+ * return value once the operation completes, and can be used to block
+ * until the operation is complete or safe.
+ */
+typedef void *rados_completion_t;
+
+/**
+ * @struct blkin_trace_info
+ * blkin trace information for Zipkin tracing
+ */
+struct blkin_trace_info;
+
+/**
+ * Get the version of librados.
+ *
+ * The version number is major.minor.extra. Note that this is
+ * unrelated to the Ceph version number.
+ *
+ * TODO: define version semantics, i.e.:
+ * - incrementing major is for backwards-incompatible changes
+ * - incrementing minor is for backwards-compatible changes
+ * - incrementing extra is for bug fixes
+ *
+ * @param major where to store the major version number
+ * @param minor where to store the minor version number
+ * @param extra where to store the extra version number
+ */
+CEPH_RADOS_API void rados_version(int *major, int *minor, int *extra);
+
+/**
+ * @name Setup and Teardown
+ * These are the first and last functions to be called
+ * when using librados. 
+ * + * @{ + */ + +/** + * Create a handle for communicating with a RADOS cluster. + * + * Ceph environment variables are read when this is called, so if + * $CEPH_ARGS specifies everything you need to connect, no further + * configuration is necessary. + * + * @param cluster where to store the handle + * @param id the user to connect as (i.e. admin, not client.admin) + * @returns 0 on success, negative error code on failure + */ +CEPH_RADOS_API int rados_create(rados_t *cluster, const char * const id); + +/** + * Extended version of rados_create. + * + * Like rados_create, but + * 1) don't assume 'client\.'+id; allow full specification of name + * 2) allow specification of cluster name + * 3) flags for future expansion + */ +CEPH_RADOS_API int rados_create2(rados_t *pcluster, + const char *const clustername, + const char * const name, uint64_t flags); + +/** + * Initialize a cluster handle from an existing configuration. + * + * Share configuration state with another rados_t instance. + * + * @param cluster where to store the handle + * @param cct the existing configuration to use + * @returns 0 on success, negative error code on failure + */ +CEPH_RADOS_API int rados_create_with_context(rados_t *cluster, + rados_config_t cct); + +/** + * Ping the monitor with ID mon_id, storing the resulting reply in + * buf (if specified) with a maximum size of len. + * + * The result buffer is allocated on the heap; the caller is + * expected to release that memory with rados_buffer_free(). The + * buffer and length pointers can be NULL, in which case they are + * not filled in. + * + * @param cluster cluster handle + * @param mon_id [in] ID of the monitor to ping + * @param outstr [out] double pointer with the resulting reply + * @param outstrlen [out] pointer with the size of the reply in outstr + */ +CEPH_RADOS_API int rados_ping_monitor(rados_t cluster, const char *mon_id, + char **outstr, size_t *outstrlen); + +/** + * Connect to the cluster. 
+ * + * @note BUG: Before calling this, calling a function that communicates with the + * cluster will crash. + * + * @pre The cluster handle is configured with at least a monitor + * address. If cephx is enabled, a client name and secret must also be + * set. + * + * @post If this succeeds, any function in librados may be used + * + * @param cluster The cluster to connect to. + * @returns 0 on success, negative error code on failure + */ +CEPH_RADOS_API int rados_connect(rados_t cluster); + +/** + * Disconnects from the cluster. + * + * For clean up, this is only necessary after rados_connect() has + * succeeded. + * + * @warning This does not guarantee any asynchronous writes have + * completed. To do that, you must call rados_aio_flush() on all open + * io contexts. + * + * @warning We implicitly call rados_watch_flush() on shutdown. If + * there are watches being used, this should be done explicitly before + * destroying the relevant IoCtx. We do it here as a safety measure. + * + * @post the cluster handle cannot be used again + * + * @param cluster the cluster to shutdown + */ +CEPH_RADOS_API void rados_shutdown(rados_t cluster); + +/** @} init */ + +/** + * @name Configuration + * These functions read and update Ceph configuration for a cluster + * handle. Any configuration changes must be done before connecting to + * the cluster. + * + * Options that librados users might want to set include: + * - mon_host + * - auth_supported + * - key, keyfile, or keyring when using cephx + * - log_file, log_to_stderr, err_to_stderr, and log_to_syslog + * - debug_rados, debug_objecter, debug_monc, debug_auth, or debug_ms + * + * See docs.ceph.com for information about available configuration options` + * + * @{ + */ + +/** + * Configure the cluster handle using a Ceph config file + * + * If path is NULL, the default locations are searched, and the first + * found is used. 
The locations are: + * - $CEPH_CONF (environment variable) + * - /etc/ceph/ceph.conf + * - ~/.ceph/config + * - ceph.conf (in the current working directory) + * + * @pre rados_connect() has not been called on the cluster handle + * + * @param cluster cluster handle to configure + * @param path path to a Ceph configuration file + * @returns 0 on success, negative error code on failure + */ +CEPH_RADOS_API int rados_conf_read_file(rados_t cluster, const char *path); + +/** + * Configure the cluster handle with command line arguments + * + * argv can contain any common Ceph command line option, including any + * configuration parameter prefixed by '--' and replacing spaces with + * dashes or underscores. For example, the following options are equivalent: + * - --mon-host 10.0.0.1:6789 + * - --mon_host 10.0.0.1:6789 + * - -m 10.0.0.1:6789 + * + * @pre rados_connect() has not been called on the cluster handle + * + * @param cluster cluster handle to configure + * @param argc number of arguments in argv + * @param argv arguments to parse + * @returns 0 on success, negative error code on failure + */ +CEPH_RADOS_API int rados_conf_parse_argv(rados_t cluster, int argc, + const char **argv); + + +/** + * Configure the cluster handle with command line arguments, returning + * any remainders. Same rados_conf_parse_argv, except for extra + * remargv argument to hold returns unrecognized arguments. 
+ * + * @pre rados_connect() has not been called on the cluster handle + * + * @param cluster cluster handle to configure + * @param argc number of arguments in argv + * @param argv arguments to parse + * @param remargv char* array for returned unrecognized arguments + * @returns 0 on success, negative error code on failure + */ +CEPH_RADOS_API int rados_conf_parse_argv_remainder(rados_t cluster, int argc, + const char **argv, + const char **remargv); +/** + * Configure the cluster handle based on an environment variable + * + * The contents of the environment variable are parsed as if they were + * Ceph command line options. If var is NULL, the CEPH_ARGS + * environment variable is used. + * + * @pre rados_connect() has not been called on the cluster handle + * + * @note BUG: this is not threadsafe - it uses a static buffer + * + * @param cluster cluster handle to configure + * @param var name of the environment variable to read + * @returns 0 on success, negative error code on failure + */ +CEPH_RADOS_API int rados_conf_parse_env(rados_t cluster, const char *var); + +/** + * Set a configuration option + * + * @pre rados_connect() has not been called on the cluster handle + * + * @param cluster cluster handle to configure + * @param option option to set + * @param value value of the option + * @returns 0 on success, negative error code on failure + * @returns -ENOENT when the option is not a Ceph configuration option + */ +CEPH_RADOS_API int rados_conf_set(rados_t cluster, const char *option, + const char *value); + +/** + * Get the value of a configuration option + * + * @param cluster configuration to read + * @param option which option to read + * @param buf where to write the configuration value + * @param len the size of buf in bytes + * @returns 0 on success, negative error code on failure + * @returns -ENAMETOOLONG if the buffer is too short to contain the + * requested value + */ +CEPH_RADOS_API int rados_conf_get(rados_t cluster, const char *option, + 
char *buf, size_t len);
+
+/** @} config */
+
+/**
+ * Read usage info about the cluster
+ *
+ * This tells you total space, space used, space available, and number
+ * of objects. These are not updated immediately when data is written,
+ * they are eventually consistent.
+ *
+ * @param cluster cluster to query
+ * @param result where to store the results
+ * @returns 0 on success, negative error code on failure
+ */
+CEPH_RADOS_API int rados_cluster_stat(rados_t cluster,
+ struct rados_cluster_stat_t *result);
+
+/**
+ * Get the fsid of the cluster as a hexadecimal string.
+ *
+ * The fsid is a unique id of an entire Ceph cluster.
+ *
+ * @param cluster where to get the fsid
+ * @param buf where to write the fsid
+ * @param len the size of buf in bytes (should be 37)
+ * @returns 0 on success, negative error code on failure
+ * @returns -ERANGE if the buffer is too short to contain the
+ * fsid
+ */
+CEPH_RADOS_API int rados_cluster_fsid(rados_t cluster, char *buf, size_t len);
+
+/**
+ * Get/wait for the most recent osdmap
+ *
+ * @param cluster the cluster to query
+ * @returns 0 on success, negative error code on failure
+ */
+CEPH_RADOS_API int rados_wait_for_latest_osdmap(rados_t cluster);
+
+/**
+ * @name Pools
+ *
+ * RADOS pools are separate namespaces for objects. Pools may have
+ * different crush rules associated with them, so they could have
+ * differing replication levels or placement strategies. RADOS
+ * permissions are also tied to pools - users can have different read,
+ * write, and execute permissions on a per-pool basis.
+ *
+ * @{
+ */
+
+/**
+ * List pools
+ *
+ * Gets a list of pool names as NULL-terminated strings. The pool
+ * names will be placed in the supplied buffer one after another.
+ * After the last pool name, there will be two 0 bytes in a row.
+ *
+ * If len is too short to fit all the pool name entries we need, we will fill
+ * as much as we can.
+ *
+ * Buf may be null to determine the buffer size needed to list all pools. 
+ * + * @param cluster cluster handle + * @param buf output buffer + * @param len output buffer length + * @returns length of the buffer we would need to list all pools + */ +CEPH_RADOS_API int rados_pool_list(rados_t cluster, char *buf, size_t len); + +/** + * List inconsistent placement groups of the given pool + * + * Gets a list of inconsistent placement groups as NULL-terminated strings. + * The placement group names will be placed in the supplied buffer one after + * another. After the last name, there will be two 0 types in a row. + * + * If len is too short to fit all the placement group entries we need, we will + * fill as much as we can. + * + * @param cluster cluster handle + * @param pool pool ID + * @param buf output buffer + * @param len output buffer length + * @returns length of the buffer we would need to list all pools + */ +CEPH_RADOS_API int rados_inconsistent_pg_list(rados_t cluster, int64_t pool, + char *buf, size_t len); + +/** + * Get a configuration handle for a rados cluster handle + * + * This handle is valid only as long as the cluster handle is valid. 
+ *
+ * @param cluster cluster handle
+ * @returns config handle for this cluster
+ */
+CEPH_RADOS_API rados_config_t rados_cct(rados_t cluster);
+
+/**
+ * Get a global id for current instance
+ *
+ * This id is a unique representation of current connection to the cluster
+ *
+ * @param cluster cluster handle
+ * @returns instance global id
+ */
+CEPH_RADOS_API uint64_t rados_get_instance_id(rados_t cluster);
+
+/**
+ * Gets the minimum compatible OSD version
+ *
+ * @param cluster cluster handle
+ * @param require_osd_release [out] minimum compatible OSD version
+ * based upon the current features
+ * @returns 0 on success, negative error code on failure
+ */
+CEPH_RADOS_API int rados_get_min_compatible_osd(rados_t cluster,
+ int8_t* require_osd_release);
+
+/**
+ * Gets the minimum compatible client version
+ *
+ * @param cluster cluster handle
+ * @param min_compat_client [out] minimum compatible client version
+ * based upon the current features
+ * @param require_min_compat_client [out] required minimum client version
+ * based upon explicit setting
+ * @returns 0 on success, negative error code on failure
+ */
+CEPH_RADOS_API int rados_get_min_compatible_client(rados_t cluster,
+ int8_t* min_compat_client,
+ int8_t* require_min_compat_client);
+
+/**
+ * Create an io context
+ *
+ * The io context allows you to perform operations within a particular
+ * pool. For more details see rados_ioctx_t.
+ *
+ * @param cluster which cluster the pool is in
+ * @param pool_name name of the pool
+ * @param ioctx where to store the io context
+ * @returns 0 on success, negative error code on failure
+ */
+CEPH_RADOS_API int rados_ioctx_create(rados_t cluster, const char *pool_name,
+ rados_ioctx_t *ioctx);
+CEPH_RADOS_API int rados_ioctx_create2(rados_t cluster, int64_t pool_id,
+ rados_ioctx_t *ioctx);
+
+/**
+ * The opposite of rados_ioctx_create
+ *
+ * This just tells librados that you no longer need to use the io context. 
+ * It may not be freed immediately if there are pending asynchronous + * requests on it, but you should not use an io context again after + * calling this function on it. + * + * @warning This does not guarantee any asynchronous + * writes have completed. You must call rados_aio_flush() + * on the io context before destroying it to do that. + * + * @warning If this ioctx is used by rados_watch, the caller needs to + * be sure that all registered watches are disconnected via + * rados_unwatch() and that rados_watch_flush() is called. This + * ensures that a racing watch callback does not make use of a + * destroyed ioctx. + * + * @param io the io context to dispose of + */ +CEPH_RADOS_API void rados_ioctx_destroy(rados_ioctx_t io); + +/** + * Get configuration handle for a pool handle + * + * @param io pool handle + * @returns rados_config_t for this cluster + */ +CEPH_RADOS_API rados_config_t rados_ioctx_cct(rados_ioctx_t io); + +/** + * Get the cluster handle used by this rados_ioctx_t + * Note that this is a weak reference, and should not + * be destroyed via rados_shutdown(). + * + * @param io the io context + * @returns the cluster handle for this io context + */ +CEPH_RADOS_API rados_t rados_ioctx_get_cluster(rados_ioctx_t io); + +/** + * Get pool usage statistics + * + * Fills in a rados_pool_stat_t after querying the cluster. 
+ * + * @param io determines which pool to query + * @param stats where to store the results + * @returns 0 on success, negative error code on failure + */ +CEPH_RADOS_API int rados_ioctx_pool_stat(rados_ioctx_t io, + struct rados_pool_stat_t *stats); + +/** + * Get the id of a pool + * + * @param cluster which cluster the pool is in + * @param pool_name which pool to look up + * @returns id of the pool + * @returns -ENOENT if the pool is not found + */ +CEPH_RADOS_API int64_t rados_pool_lookup(rados_t cluster, + const char *pool_name); + +/** + * Get the name of a pool + * + * @param cluster which cluster the pool is in + * @param id the id of the pool + * @param buf where to store the pool name + * @param maxlen size of buffer where name will be stored + * @returns length of string stored, or -ERANGE if buffer too small + */ +CEPH_RADOS_API int rados_pool_reverse_lookup(rados_t cluster, int64_t id, + char *buf, size_t maxlen); + +/** + * Create a pool with default settings + * + * The default crush rule is rule 0. + * + * @param cluster the cluster in which the pool will be created + * @param pool_name the name of the new pool + * @returns 0 on success, negative error code on failure + */ +CEPH_RADOS_API int rados_pool_create(rados_t cluster, const char *pool_name); + +/** + * Create a pool owned by a specific auid. + * + * DEPRECATED: auid support has been removed, and this call will be removed in a future + * release. 
+ * + * @param cluster the cluster in which the pool will be created + * @param pool_name the name of the new pool + * @param auid the id of the owner of the new pool + * @returns 0 on success, negative error code on failure + */ +CEPH_RADOS_API int rados_pool_create_with_auid(rados_t cluster, + const char *pool_name, + uint64_t auid) + __attribute__((deprecated)); + +/** + * Create a pool with a specific CRUSH rule + * + * @param cluster the cluster in which the pool will be created + * @param pool_name the name of the new pool + * @param crush_rule_num which rule to use for placement in the new pool1 + * @returns 0 on success, negative error code on failure + */ +CEPH_RADOS_API int rados_pool_create_with_crush_rule(rados_t cluster, + const char *pool_name, + uint8_t crush_rule_num); + +/** + * Create a pool with a specific CRUSH rule and auid + * + * DEPRECATED: auid support has been removed and this call will be removed + * in a future release. + * + * This is a combination of rados_pool_create_with_crush_rule() and + * rados_pool_create_with_auid(). + * + * @param cluster the cluster in which the pool will be created + * @param pool_name the name of the new pool + * @param crush_rule_num which rule to use for placement in the new pool2 + * @param auid the id of the owner of the new pool + * @returns 0 on success, negative error code on failure + */ +CEPH_RADOS_API int rados_pool_create_with_all(rados_t cluster, + const char *pool_name, + uint64_t auid, + uint8_t crush_rule_num) + __attribute__((deprecated)); + +/** + * Returns the pool that is the base tier for this pool. + * + * The return value is the ID of the pool that should be used to read from/write to. + * If tiering is not set up for the pool, returns \c pool. 
+ * + * @param cluster the cluster the pool is in + * @param pool ID of the pool to query + * @param base_tier [out] base tier, or \c pool if tiering is not configured + * @returns 0 on success, negative error code on failure + */ +CEPH_RADOS_API int rados_pool_get_base_tier(rados_t cluster, int64_t pool, + int64_t* base_tier); + +/** + * Delete a pool and all data inside it + * + * The pool is removed from the cluster immediately, + * but the actual data is deleted in the background. + * + * @param cluster the cluster the pool is in + * @param pool_name which pool to delete + * @returns 0 on success, negative error code on failure + */ +CEPH_RADOS_API int rados_pool_delete(rados_t cluster, const char *pool_name); + +/** + * Attempt to change an io context's associated auid "owner" + * + * DEPRECATED: auid support has been removed and this call has no effect. + * + * Requires that you have write permission on both the current and new + * auid. + * + * @param io reference to the pool to change. + * @param auid the auid you wish the io to have. + * @returns 0 on success, negative error code on failure + */ +CEPH_RADOS_API int rados_ioctx_pool_set_auid(rados_ioctx_t io, uint64_t auid) + __attribute__((deprecated)); + + +/** + * Get the auid of a pool + * + * DEPRECATED: auid support has been removed and this call always reports + * CEPH_AUTH_UID_DEFAULT (-1). + + * @param io pool to query + * @param auid where to store the auid + * @returns 0 on success, negative error code on failure + */ +CEPH_RADOS_API int rados_ioctx_pool_get_auid(rados_ioctx_t io, uint64_t *auid) + __attribute__((deprecated)); + +/* deprecated, use rados_ioctx_pool_requires_alignment2 instead */ +CEPH_RADOS_API int rados_ioctx_pool_requires_alignment(rados_ioctx_t io) + __attribute__((deprecated)); + +/** + * Test whether the specified pool requires alignment or not. + * + * @param io pool to query + * @param req 1 if alignment is supported, 0 if not. 
+ * @returns 0 on success, negative error code on failure + */ +CEPH_RADOS_API int rados_ioctx_pool_requires_alignment2(rados_ioctx_t io, + int *req); + +/* deprecated, use rados_ioctx_pool_required_alignment2 instead */ +CEPH_RADOS_API uint64_t rados_ioctx_pool_required_alignment(rados_ioctx_t io) + __attribute__((deprecated)); + +/** + * Get the alignment flavor of a pool + * + * @param io pool to query + * @param alignment where to store the alignment flavor + * @returns 0 on success, negative error code on failure + */ +CEPH_RADOS_API int rados_ioctx_pool_required_alignment2(rados_ioctx_t io, + uint64_t *alignment); + +/** + * Get the pool id of the io context + * + * @param io the io context to query + * @returns the id of the pool the io context uses + */ +CEPH_RADOS_API int64_t rados_ioctx_get_id(rados_ioctx_t io); + +/** + * Get the pool name of the io context + * + * @param io the io context to query + * @param buf pointer to buffer where name will be stored + * @param maxlen size of buffer where name will be stored + * @returns length of string stored, or -ERANGE if buffer too small + */ +CEPH_RADOS_API int rados_ioctx_get_pool_name(rados_ioctx_t io, char *buf, + unsigned maxlen); + +/** @} pools */ + +/** + * @name Object Locators + * + * @{ + */ + +/** + * Set the key for mapping objects to pgs within an io context. + * + * The key is used instead of the object name to determine which + * placement groups an object is put in. This affects all subsequent + * operations of the io context - until a different locator key is + * set, all objects in this io context will be placed in the same pg. 
+ * + * @param io the io context to change + * @param key the key to use as the object locator, or NULL to discard + * any previously set key + */ +CEPH_RADOS_API void rados_ioctx_locator_set_key(rados_ioctx_t io, + const char *key); + +/** + * Set the namespace for objects within an io context + * + * The namespace specification further refines a pool into different + * domains. The mapping of objects to pgs is also based on this + * value. + * + * @param io the io context to change + * @param nspace the name to use as the namespace, or NULL use the + * default namespace + */ +CEPH_RADOS_API void rados_ioctx_set_namespace(rados_ioctx_t io, + const char *nspace); + +/** + * Get the namespace for objects within the io context + * + * @param io the io context to query + * @param buf pointer to buffer where name will be stored + * @param maxlen size of buffer where name will be stored + * @returns length of string stored, or -ERANGE if buffer too small + */ +CEPH_RADOS_API int rados_ioctx_get_namespace(rados_ioctx_t io, char *buf, + unsigned maxlen); + +/** @} obj_loc */ + +/** + * @name Listing Objects + * @{ + */ +/** + * Start listing objects in a pool + * + * @param io the pool to list from + * @param ctx the handle to store list context in + * @returns 0 on success, negative error code on failure + */ +CEPH_RADOS_API int rados_nobjects_list_open(rados_ioctx_t io, + rados_list_ctx_t *ctx); + +/** + * Return hash position of iterator, rounded to the current PG + * + * @param ctx iterator marking where you are in the listing + * @returns current hash position, rounded to the current pg + */ +CEPH_RADOS_API uint32_t rados_nobjects_list_get_pg_hash_position(rados_list_ctx_t ctx); + +/** + * Reposition object iterator to a different hash position + * + * @param ctx iterator marking where you are in the listing + * @param pos hash position to move to + * @returns actual (rounded) position we moved to + */ +CEPH_RADOS_API uint32_t 
rados_nobjects_list_seek(rados_list_ctx_t ctx,
+                                          uint32_t pos);
+
+/**
+ * Reposition object iterator to a different position
+ *
+ * @param ctx iterator marking where you are in the listing
+ * @param cursor position to move to
+ * @returns rounded position we moved to
+ */
+CEPH_RADOS_API uint32_t rados_nobjects_list_seek_cursor(rados_list_ctx_t ctx,
+                                                        rados_object_list_cursor cursor);
+
+/**
+ * Get the current position of the object iterator
+ *
+ * The returned handle must be released with rados_object_list_cursor_free().
+ *
+ * @param ctx iterator marking where you are in the listing
+ * @param cursor where to store cursor
+ * @returns 0 on success, negative error code on failure
+ */
+CEPH_RADOS_API int rados_nobjects_list_get_cursor(rados_list_ctx_t ctx,
+                                                  rados_object_list_cursor *cursor);
+
+/**
+ * Get the next object name and locator in the pool
+ *
+ * *entry and *key are valid until next call to rados_nobjects_list_*
+ *
+ * @param ctx iterator marking where you are in the listing
+ * @param entry where to store the name of the entry
+ * @param key where to store the object locator (set to NULL to ignore)
+ * @param nspace where to store the object namespace (set to NULL to ignore)
+ * @returns 0 on success, negative error code on failure
+ * @returns -ENOENT when there are no more objects to list
+ */
+CEPH_RADOS_API int rados_nobjects_list_next(rados_list_ctx_t ctx,
+                                            const char **entry,
+                                            const char **key,
+                                            const char **nspace);
+
+/**
+ * Get the next object name, locator and their sizes in the pool
+ *
+ * The sizes allow to list objects with \0 (the NUL character)
+ * in e.g. *entry. It is unusual to see such object names, but a bug
+ * in a client has raised the need to handle them as well.
+ * *entry and *key are valid until next call to rados_nobjects_list_*
+ *
+ * @param ctx iterator marking where you are in the listing
+ * @param entry where to store the name of the entry
+ * @param key where to store the object locator (set to NULL to ignore)
+ * @param nspace where to store the object namespace (set to NULL to ignore)
+ * @param entry_size where to store the size of name of the entry
+ * @param key_size where to store the size of object locator (set to NULL to ignore)
+ * @param nspace_size where to store the size of object namespace (set to NULL to ignore)
+ * @returns 0 on success, negative error code on failure
+ * @returns -ENOENT when there are no more objects to list
+ */
+CEPH_RADOS_API int rados_nobjects_list_next2(rados_list_ctx_t ctx,
+                                             const char **entry,
+                                             const char **key,
+                                             const char **nspace,
+                                             size_t *entry_size,
+                                             size_t *key_size,
+                                             size_t *nspace_size);
+
+/**
+ * Close the object listing handle.
+ *
+ * This should be called when the handle is no longer needed.
+ * The handle should not be used after it has been closed.
+ *
+ * @param ctx the handle to close
+ */
+CEPH_RADOS_API void rados_nobjects_list_close(rados_list_ctx_t ctx);
+
+/**
+ * Get cursor handle pointing to the *beginning* of a pool.
+ *
+ * This is an opaque handle pointing to the start of a pool. It must
+ * be released with rados_object_list_cursor_free().
+ *
+ * @param io ioctx for the pool
+ * @returns handle for the pool, NULL on error (pool does not exist)
+ */
+CEPH_RADOS_API rados_object_list_cursor rados_object_list_begin(
+  rados_ioctx_t io);
+
+/**
+ * Get cursor handle pointing to the *end* of a pool.
+ *
+ * This is an opaque handle pointing to the end of a pool. It must
+ * be released with rados_object_list_cursor_free().
+ * + * @param io ioctx for the pool + * @returns handle for the pool, NULL on error (pool does not exist) + */ +CEPH_RADOS_API rados_object_list_cursor rados_object_list_end(rados_ioctx_t io); + +/** + * Check if a cursor has reached the end of a pool + * + * @param io ioctx + * @param cur cursor + * @returns 1 if the cursor has reached the end of the pool, 0 otherwise + */ +CEPH_RADOS_API int rados_object_list_is_end(rados_ioctx_t io, + rados_object_list_cursor cur); + +/** + * Release a cursor + * + * Release a cursor. The handle may not be used after this point. + * + * @param io ioctx + * @param cur cursor + */ +CEPH_RADOS_API void rados_object_list_cursor_free(rados_ioctx_t io, + rados_object_list_cursor cur); + +/** + * Compare two cursor positions + * + * Compare two cursors, and indicate whether the first cursor precedes, + * matches, or follows the second. + * + * @param io ioctx + * @param lhs first cursor + * @param rhs second cursor + * @returns -1, 0, or 1 for lhs < rhs, lhs == rhs, or lhs > rhs + */ +CEPH_RADOS_API int rados_object_list_cursor_cmp(rados_ioctx_t io, + rados_object_list_cursor lhs, rados_object_list_cursor rhs); + +/** + * @return the number of items set in the results array + */ +CEPH_RADOS_API int rados_object_list(rados_ioctx_t io, + const rados_object_list_cursor start, + const rados_object_list_cursor finish, + const size_t result_size, + const char *filter_buf, + const size_t filter_buf_len, + rados_object_list_item *results, + rados_object_list_cursor *next); + +CEPH_RADOS_API void rados_object_list_free( + const size_t result_size, + rados_object_list_item *results); + +/** + * Obtain cursors delineating a subset of a range. Use this + * when you want to split up the work of iterating over the + * global namespace. Expected use case is when you are iterating + * in parallel, with `m` workers, and each worker taking an id `n`. 
+ * + * @param io ioctx + * @param start start of the range to be sliced up (inclusive) + * @param finish end of the range to be sliced up (exclusive) + * @param n which of the m chunks you would like to get cursors for + * @param m how many chunks to divide start-finish into + * @param split_start cursor populated with start of the subrange (inclusive) + * @param split_finish cursor populated with end of the subrange (exclusive) + */ +CEPH_RADOS_API void rados_object_list_slice(rados_ioctx_t io, + const rados_object_list_cursor start, + const rados_object_list_cursor finish, + const size_t n, + const size_t m, + rados_object_list_cursor *split_start, + rados_object_list_cursor *split_finish); + + +/** @} Listing Objects */ + +/** + * @name Snapshots + * + * RADOS snapshots are based upon sequence numbers that form a + * snapshot context. They are pool-specific. The snapshot context + * consists of the current snapshot sequence number for a pool, and an + * array of sequence numbers at which snapshots were taken, in + * descending order. Whenever a snapshot is created or deleted, the + * snapshot sequence number for the pool is increased. To add a new + * snapshot, the new snapshot sequence number must be increased and + * added to the snapshot context. + * + * There are two ways to manage these snapshot contexts: + * -# within the RADOS cluster + * These are called pool snapshots, and store the snapshot context + * in the OSDMap. These represent a snapshot of all the objects in + * a pool. + * -# within the RADOS clients + * These are called self-managed snapshots, and push the + * responsibility for keeping track of the snapshot context to the + * clients. For every write, the client must send the snapshot + * context. In librados, this is accomplished with + * rados_selfmanaged_snap_set_write_ctx(). These are more + * difficult to manage, but are restricted to specific objects + * instead of applying to an entire pool. 
+ * + * @{ + */ + +/** + * Create a pool-wide snapshot + * + * @param io the pool to snapshot + * @param snapname the name of the snapshot + * @returns 0 on success, negative error code on failure + */ +CEPH_RADOS_API int rados_ioctx_snap_create(rados_ioctx_t io, + const char *snapname); + +/** + * Delete a pool snapshot + * + * @param io the pool to delete the snapshot from + * @param snapname which snapshot to delete + * @returns 0 on success, negative error code on failure + */ +CEPH_RADOS_API int rados_ioctx_snap_remove(rados_ioctx_t io, + const char *snapname); + +/** + * Rollback an object to a pool snapshot + * + * The contents of the object will be the same as + * when the snapshot was taken. + * + * @param io the pool in which the object is stored + * @param oid the name of the object to rollback + * @param snapname which snapshot to rollback to + * @returns 0 on success, negative error code on failure + */ +CEPH_RADOS_API int rados_ioctx_snap_rollback(rados_ioctx_t io, const char *oid, + const char *snapname); + +/** + * @warning Deprecated: Use rados_ioctx_snap_rollback() instead + */ +CEPH_RADOS_API int rados_rollback(rados_ioctx_t io, const char *oid, + const char *snapname) + __attribute__((deprecated)); + +/** + * Set the snapshot from which reads are performed. + * + * Subsequent reads will return data as it was at the time of that + * snapshot. + * + * @param io the io context to change + * @param snap the id of the snapshot to set, or LIBRADOS_SNAP_HEAD for no + * snapshot (i.e. normal operation) + */ +CEPH_RADOS_API void rados_ioctx_snap_set_read(rados_ioctx_t io, + rados_snap_t snap); + +/** + * Allocate an ID for a self-managed snapshot + * + * Get a unique ID to put in the snaphot context to create a + * snapshot. A clone of an object is not created until a write with + * the new snapshot context is completed. 
+ *
+ * @param io the pool in which the snapshot will exist
+ * @param snapid where to store the newly allocated snapshot ID
+ * @returns 0 on success, negative error code on failure
+ */
+CEPH_RADOS_API int rados_ioctx_selfmanaged_snap_create(rados_ioctx_t io,
+                                                       rados_snap_t *snapid);
+CEPH_RADOS_API void
+rados_aio_ioctx_selfmanaged_snap_create(rados_ioctx_t io,
+                                        rados_snap_t *snapid,
+                                        rados_completion_t completion);
+
+/**
+ * Remove a self-managed snapshot
+ *
+ * This increases the snapshot sequence number, which will cause
+ * snapshots to be removed lazily.
+ *
+ * @param io the pool in which the snapshot exists
+ * @param snapid which self-managed snapshot to remove
+ * @returns 0 on success, negative error code on failure
+ */
+CEPH_RADOS_API int rados_ioctx_selfmanaged_snap_remove(rados_ioctx_t io,
+                                                       rados_snap_t snapid);
+CEPH_RADOS_API void
+rados_aio_ioctx_selfmanaged_snap_remove(rados_ioctx_t io,
+                                        rados_snap_t snapid,
+                                        rados_completion_t completion);
+
+/**
+ * Rollback an object to a self-managed snapshot
+ *
+ * The contents of the object will be the same as
+ * when the snapshot was taken.
+ *
+ * @param io the pool in which the object is stored
+ * @param oid the name of the object to rollback
+ * @param snapid which snapshot to rollback to
+ * @returns 0 on success, negative error code on failure
+ */
+CEPH_RADOS_API int rados_ioctx_selfmanaged_snap_rollback(rados_ioctx_t io,
+                                                         const char *oid,
+                                                         rados_snap_t snapid);
+
+/**
+ * Set the snapshot context for use when writing to objects
+ *
+ * This is stored in the io context, and applies to all future writes.
+ *
+ * @param io the io context to change
+ * @param seq the newest snapshot sequence number for the pool
+ * @param snaps array of snapshots sorted by descending id
+ * @param num_snaps how many snapshots are in the snaps array
+ * @returns 0 on success, negative error code on failure
+ * @returns -EINVAL if snaps are not in descending order
+ */
+CEPH_RADOS_API int rados_ioctx_selfmanaged_snap_set_write_ctx(rados_ioctx_t io,
+                                                              rados_snap_t seq,
+                                                              rados_snap_t *snaps,
+                                                              int num_snaps);
+
+/**
+ * List all the ids of pool snapshots
+ *
+ * If the output array does not have enough space to fit all the
+ * snapshots, -ERANGE is returned and the caller should retry with a
+ * larger array.
+ *
+ * @param io the pool to read from
+ * @param snaps where to store the results
+ * @param maxlen the number of rados_snap_t that fit in the snaps array
+ * @returns number of snapshots on success, negative error code on failure
+ * @returns -ERANGE is returned if the snaps array is too short
+ */
+CEPH_RADOS_API int rados_ioctx_snap_list(rados_ioctx_t io, rados_snap_t *snaps,
+                                         int maxlen);
+
+/**
+ * Get the id of a pool snapshot
+ *
+ * @param io the pool to read from
+ * @param name the snapshot to find
+ * @param id where to store the result
+ * @returns 0 on success, negative error code on failure
+ */
+CEPH_RADOS_API int rados_ioctx_snap_lookup(rados_ioctx_t io, const char *name,
+                                           rados_snap_t *id);
+
+/**
+ * Get the name of a pool snapshot
+ *
+ * @param io the pool to read from
+ * @param id the snapshot to find
+ * @param name where to store the result
+ * @param maxlen the size of the name array
+ * @returns 0 on success, negative error code on failure
+ * @returns -ERANGE if the name array is too small
+ */
+CEPH_RADOS_API int rados_ioctx_snap_get_name(rados_ioctx_t io, rados_snap_t id,
+                                             char *name, int maxlen);
+
+/**
+ * Find when a pool snapshot occurred
+ *
+ * @param io the pool the snapshot was taken in
+ * @param id the snapshot to lookup
+ * @param t
where to store the result + * @returns 0 on success, negative error code on failure + */ +CEPH_RADOS_API int rados_ioctx_snap_get_stamp(rados_ioctx_t io, rados_snap_t id, + time_t *t); + +/** @} Snapshots */ + +/** + * @name Synchronous I/O + * Writes are replicated to a number of OSDs based on the + * configuration of the pool they are in. These write functions block + * until data is in memory on all replicas of the object they're + * writing to - they are equivalent to doing the corresponding + * asynchronous write, and the calling + * rados_ioctx_wait_for_complete(). For greater data safety, use the + * asynchronous functions and rados_aio_wait_for_safe(). + * + * @{ + */ + +/** + * Return the version of the last object read or written to. + * + * This exposes the internal version number of the last object read or + * written via this io context + * + * @param io the io context to check + * @returns last read or written object version + */ +CEPH_RADOS_API uint64_t rados_get_last_version(rados_ioctx_t io); + +/** + * Write *len* bytes from *buf* into the *oid* object, starting at + * offset *off*. The value of *len* must be <= UINT_MAX/2. + * + * @note This will never return a positive value not equal to len. + * @param io the io context in which the write will occur + * @param oid name of the object + * @param buf data to write + * @param len length of the data, in bytes + * @param off byte offset in the object to begin writing at + * @returns 0 on success, negative error code on failure + */ +CEPH_RADOS_API int rados_write(rados_ioctx_t io, const char *oid, + const char *buf, size_t len, uint64_t off); + +/** + * Write *len* bytes from *buf* into the *oid* object. The value of + * *len* must be <= UINT_MAX/2. + * + * The object is filled with the provided data. If the object exists, + * it is atomically truncated and then written. 
+ * + * @param io the io context in which the write will occur + * @param oid name of the object + * @param buf data to write + * @param len length of the data, in bytes + * @returns 0 on success, negative error code on failure + */ +CEPH_RADOS_API int rados_write_full(rados_ioctx_t io, const char *oid, + const char *buf, size_t len); + +/** + * Write the same *data_len* bytes from *buf* multiple times into the + * *oid* object. *write_len* bytes are written in total, which must be + * a multiple of *data_len*. The value of *write_len* and *data_len* + * must be <= UINT_MAX/2. + * + * @param io the io context in which the write will occur + * @param oid name of the object + * @param buf data to write + * @param data_len length of the data, in bytes + * @param write_len the total number of bytes to write + * @param off byte offset in the object to begin writing at + * @returns 0 on success, negative error code on failure + */ +CEPH_RADOS_API int rados_writesame(rados_ioctx_t io, const char *oid, + const char *buf, size_t data_len, + size_t write_len, uint64_t off); + +/** + * Append *len* bytes from *buf* into the *oid* object. The value of + * *len* must be <= UINT_MAX/2. + * + * @param io the context to operate in + * @param oid the name of the object + * @param buf the data to append + * @param len length of buf (in bytes) + * @returns 0 on success, negative error code on failure + */ +CEPH_RADOS_API int rados_append(rados_ioctx_t io, const char *oid, + const char *buf, size_t len); + +/** + * Read data from an object + * + * The io context determines the snapshot to read from, if any was set + * by rados_ioctx_snap_set_read(). 
+ * + * @param io the context in which to perform the read + * @param oid the name of the object to read from + * @param buf where to store the results + * @param len the number of bytes to read + * @param off the offset to start reading from in the object + * @returns number of bytes read on success, negative error code on + * failure + */ +CEPH_RADOS_API int rados_read(rados_ioctx_t io, const char *oid, char *buf, + size_t len, uint64_t off); + +/** + * Compute checksum from object data + * + * The io context determines the snapshot to checksum, if any was set + * by rados_ioctx_snap_set_read(). The length of the init_value and + * resulting checksum are dependent upon the checksum type: + * + * XXHASH64: le64 + * XXHASH32: le32 + * CRC32C: le32 + * + * The checksum result is encoded the following manner: + * + * le32 num_checksum_chunks + * { + * leXX checksum for chunk (where XX = appropriate size for the checksum type) + * } * num_checksum_chunks + * + * @param io the context in which to perform the checksum + * @param oid the name of the object to checksum + * @param type the checksum algorithm to utilize + * @param init_value the init value for the algorithm + * @param init_value_len the length of the init value + * @param len the number of bytes to checksum + * @param off the offset to start checksumming in the object + * @param chunk_size optional length-aligned chunk size for checksums + * @param pchecksum where to store the checksum result + * @param checksum_len the number of bytes available for the result + * @return negative error code on failure + */ +CEPH_RADOS_API int rados_checksum(rados_ioctx_t io, const char *oid, + rados_checksum_type_t type, + const char *init_value, size_t init_value_len, + size_t len, uint64_t off, size_t chunk_size, + char *pchecksum, size_t checksum_len); + +/** + * Delete an object + * + * @note This does not delete any snapshots of the object. 
+ * + * @param io the pool to delete the object from + * @param oid the name of the object to delete + * @returns 0 on success, negative error code on failure + */ +CEPH_RADOS_API int rados_remove(rados_ioctx_t io, const char *oid); + +/** + * Resize an object + * + * If this enlarges the object, the new area is logically filled with + * zeroes. If this shrinks the object, the excess data is removed. + * + * @param io the context in which to truncate + * @param oid the name of the object + * @param size the new size of the object in bytes + * @returns 0 on success, negative error code on failure + */ +CEPH_RADOS_API int rados_trunc(rados_ioctx_t io, const char *oid, + uint64_t size); + +/** + * Compare an on-disk object range with a buffer + * + * @param io the context in which to perform the comparison + * @param o name of the object + * @param cmp_buf buffer containing bytes to be compared with object contents + * @param cmp_len length to compare and size of @c cmp_buf in bytes + * @param off object byte offset at which to start the comparison + * @returns 0 on success, negative error code on failure, + * (-MAX_ERRNO - mismatch_off) on mismatch + */ +CEPH_RADOS_API int rados_cmpext(rados_ioctx_t io, const char *o, + const char *cmp_buf, size_t cmp_len, + uint64_t off); + +/** + * @name Xattrs + * Extended attributes are stored as extended attributes on the files + * representing an object on the OSDs. Thus, they have the same + * limitations as the underlying filesystem. On ext4, this means that + * the total data stored in xattrs cannot exceed 4KB. + * + * @{ + */ + +/** + * Get the value of an extended attribute on an object. 
+ * + * @param io the context in which the attribute is read + * @param o name of the object + * @param name which extended attribute to read + * @param buf where to store the result + * @param len size of buf in bytes + * @returns length of xattr value on success, negative error code on failure + */ +CEPH_RADOS_API int rados_getxattr(rados_ioctx_t io, const char *o, + const char *name, char *buf, size_t len); + +/** + * Set an extended attribute on an object. + * + * @param io the context in which xattr is set + * @param o name of the object + * @param name which extended attribute to set + * @param buf what to store in the xattr + * @param len the number of bytes in buf + * @returns 0 on success, negative error code on failure + */ +CEPH_RADOS_API int rados_setxattr(rados_ioctx_t io, const char *o, + const char *name, const char *buf, + size_t len); + +/** + * Delete an extended attribute from an object. + * + * @param io the context in which to delete the xattr + * @param o the name of the object + * @param name which xattr to delete + * @returns 0 on success, negative error code on failure + */ +CEPH_RADOS_API int rados_rmxattr(rados_ioctx_t io, const char *o, + const char *name); + +/** + * Start iterating over xattrs on an object. + * + * @post iter is a valid iterator + * + * @param io the context in which to list xattrs + * @param oid name of the object + * @param iter where to store the iterator + * @returns 0 on success, negative error code on failure + */ +CEPH_RADOS_API int rados_getxattrs(rados_ioctx_t io, const char *oid, + rados_xattrs_iter_t *iter); + +/** + * Get the next xattr on the object + * + * @pre iter is a valid iterator + * + * @post name is the NULL-terminated name of the next xattr, and val + * contains the value of the xattr, which is of length len. If the end + * of the list has been reached, name and val are NULL, and len is 0. 
+ * + * @param iter iterator to advance + * @param name where to store the name of the next xattr + * @param val where to store the value of the next xattr + * @param len the number of bytes in val + * @returns 0 on success, negative error code on failure + */ +CEPH_RADOS_API int rados_getxattrs_next(rados_xattrs_iter_t iter, + const char **name, const char **val, + size_t *len); + +/** + * Close the xattr iterator. + * + * iter should not be used after this is called. + * + * @param iter the iterator to close + */ +CEPH_RADOS_API void rados_getxattrs_end(rados_xattrs_iter_t iter); + +/** @} Xattrs */ + +/** + * Get the next omap key/value pair on the object + * + * @pre iter is a valid iterator + * + * @post key and val are the next key/value pair. key is + * null-terminated, and val has length len. If the end of the list has + * been reached, key and val are NULL, and len is 0. key and val will + * not be accessible after rados_omap_get_end() is called on iter, so + * if they are needed after that they should be copied. + * + * @param iter iterator to advance + * @param key where to store the key of the next omap entry + * @param val where to store the value of the next omap entry + * @param len where to store the number of bytes in val + * @returns 0 on success, negative error code on failure + */ +CEPH_RADOS_API int rados_omap_get_next(rados_omap_iter_t iter, + char **key, + char **val, + size_t *len); + +/** + * Get the next omap key/value pair on the object. Note that it's + * perfectly safe to mix calls to rados_omap_get_next and + * rados_omap_get_next2. + * + * @pre iter is a valid iterator + * + * @post key and val are the next key/value pair. key has length + * keylen and val has length vallen. If the end of the list has + * been reached, key and val are NULL, and keylen and vallen is 0. + * key and val will not be accessible after rados_omap_get_end() + * is called on iter, so if they are needed after that they + * should be copied. 
+ * + * @param iter iterator to advance + * @param key where to store the key of the next omap entry + * @param val where to store the value of the next omap entry + * @param key_len where to store the number of bytes in key + * @param val_len where to store the number of bytes in val + * @returns 0 on success, negative error code on failure + */ +CEPH_RADOS_API int rados_omap_get_next2(rados_omap_iter_t iter, + char **key, + char **val, + size_t *key_len, + size_t *val_len); + +/** + * Return number of elements in the iterator + * + * @param iter the iterator of which to return the size + */ +CEPH_RADOS_API unsigned int rados_omap_iter_size(rados_omap_iter_t iter); + +/** + * Close the omap iterator. + * + * iter should not be used after this is called. + * + * @param iter the iterator to close + */ +CEPH_RADOS_API void rados_omap_get_end(rados_omap_iter_t iter); + +/** + * Get object size and most recent update time from the OSD. + * + * @param io ioctx + * @param o object name + * @param psize where to store object size + * @param pmtime where to store modification time + * @returns 0 on success, negative error code on failure + */ +CEPH_RADOS_API int rados_stat(rados_ioctx_t io, const char *o, uint64_t *psize, + time_t *pmtime); + +CEPH_RADOS_API int rados_stat2(rados_ioctx_t io, const char *o, uint64_t *psize, + struct timespec *pmtime); + +/** + * Execute an OSD class method on an object + * + * The OSD has a plugin mechanism for performing complicated + * operations on an object atomically. These plugins are called + * classes. This function allows librados users to call the custom + * methods. The input and output formats are defined by the class. 
+ * Classes in ceph.git can be found in src/cls subdirectories
+ *
+ * @param io the context in which to call the method
+ * @param oid the object to call the method on
+ * @param cls the name of the class
+ * @param method the name of the method
+ * @param in_buf where to find input
+ * @param in_len length of in_buf in bytes
+ * @param buf where to store output
+ * @param out_len length of buf in bytes
+ * @returns the length of the output, or
+ * -ERANGE if out_buf does not have enough space to store it (For methods that return data). For
+ * methods that don't return data, the return value is
+ * method-specific.
+ */
+CEPH_RADOS_API int rados_exec(rados_ioctx_t io, const char *oid,
+                              const char *cls, const char *method,
+                              const char *in_buf, size_t in_len, char *buf,
+                              size_t out_len);
+
+
+/** @} Synchronous I/O */
+
+/**
+ * @name Asynchronous I/O
+ * Read and write to objects without blocking.
+ *
+ * @{
+ */
+
+/**
+ * @typedef rados_callback_t
+ * Callbacks for asynchronous operations take two parameters:
+ * - cb the completion that has finished
+ * - arg application defined data made available to the callback function
+ */
+typedef void (*rados_callback_t)(rados_completion_t cb, void *arg);
+
+/**
+ * Constructs a completion to use with asynchronous operations
+ *
+ * The complete and safe callbacks correspond to operations being
+ * acked and committed, respectively. The callbacks are called in
+ * order of receipt, so the safe callback may be triggered before the
+ * complete callback, and vice versa. This is affected by journalling
+ * on the OSDs.
+ *
+ * TODO: more complete documentation of this elsewhere (in the RADOS docs?)
+ *
+ * @note Read operations only get a complete callback.
+ * @note BUG: this should check for ENOMEM instead of throwing an exception + * + * @param cb_arg application-defined data passed to the callback functions + * @param cb_complete the function to be called when the operation is + * in memory on all replicas + * @param cb_safe the function to be called when the operation is on + * stable storage on all replicas + * @param pc where to store the completion + * @returns 0 + */ +CEPH_RADOS_API int rados_aio_create_completion(void *cb_arg, + rados_callback_t cb_complete, + rados_callback_t cb_safe, + rados_completion_t *pc); + +/** + * Constructs a completion to use with asynchronous operations + * + * The complete callback corresponds to operation being acked. + * + * @note BUG: this should check for ENOMEM instead of throwing an exception + * + * @param cb_arg application-defined data passed to the callback functions + * @param cb_complete the function to be called when the operation is committed + * on all replicas + * @param pc where to store the completion + * @returns 0 + */ +CEPH_RADOS_API int rados_aio_create_completion2(void *cb_arg, + rados_callback_t cb_complete, + rados_completion_t *pc); + +/** + * Block until an operation completes + * + * This means it is in memory on all replicas. + * + * @note BUG: this should be void + * + * @param c operation to wait for + * @returns 0 + */ +CEPH_RADOS_API int rados_aio_wait_for_complete(rados_completion_t c); + +/** + * Block until an operation is safe + * + * This means it is on stable storage on all replicas. + * + * @note BUG: this should be void + * + * @param c operation to wait for + * @returns 0 + */ +CEPH_RADOS_API int rados_aio_wait_for_safe(rados_completion_t c) + __attribute__((deprecated)); + +/** + * Has an asynchronous operation completed? 
+ *
+ * @warning This does not imply that the complete callback has
+ * finished
+ *
+ * @param c async operation to inspect
+ * @returns whether c is complete
+ */
+CEPH_RADOS_API int rados_aio_is_complete(rados_completion_t c);
+
+/**
+ * Is an asynchronous operation safe?
+ *
+ * @warning This does not imply that the safe callback has
+ * finished
+ *
+ * @param c async operation to inspect
+ * @returns whether c is safe
+ */
+CEPH_RADOS_API int rados_aio_is_safe(rados_completion_t c);
+
+/**
+ * Block until an operation completes and callback completes
+ *
+ * This means it is in memory on all replicas and can be read.
+ *
+ * @note BUG: this should be void
+ *
+ * @param c operation to wait for
+ * @returns 0
+ */
+CEPH_RADOS_API int rados_aio_wait_for_complete_and_cb(rados_completion_t c);
+
+/**
+ * Block until an operation is safe and callback has completed
+ *
+ * This means it is on stable storage on all replicas.
+ *
+ * @note BUG: this should be void
+ *
+ * @param c operation to wait for
+ * @returns 0
+ */
+CEPH_RADOS_API int rados_aio_wait_for_safe_and_cb(rados_completion_t c)
+ __attribute__((deprecated));
+
+/**
+ * Has an asynchronous operation and callback completed
+ *
+ * @param c async operation to inspect
+ * @returns whether c is complete
+ */
+CEPH_RADOS_API int rados_aio_is_complete_and_cb(rados_completion_t c);
+
+/**
+ * Is an asynchronous operation safe and has the callback completed
+ *
+ * @param c async operation to inspect
+ * @returns whether c is safe
+ */
+CEPH_RADOS_API int rados_aio_is_safe_and_cb(rados_completion_t c);
+
+/**
+ * Get the return value of an asynchronous operation
+ *
+ * The return value is set when the operation is complete or safe,
+ * whichever comes first.
+ *
+ * @pre The operation is safe or complete
+ *
+ * @note BUG: complete callback may never be called when the safe
+ * message is received before the complete message
+ *
+ * @param c async operation to inspect
+ * @returns return value of the operation
+ */
+CEPH_RADOS_API int rados_aio_get_return_value(rados_completion_t c);
+
+/**
+ * Get the internal object version of the target of an asynchronous operation
+ *
+ * The return value is set when the operation is complete or safe,
+ * whichever comes first.
+ *
+ * @pre The operation is safe or complete
+ *
+ * @note BUG: complete callback may never be called when the safe
+ * message is received before the complete message
+ *
+ * @param c async operation to inspect
+ * @returns version number of the asynchronous operation's target
+ */
+CEPH_RADOS_API uint64_t rados_aio_get_version(rados_completion_t c);
+
+/**
+ * Release a completion
+ *
+ * Call this when you no longer need the completion. It may not be
+ * freed immediately if the operation is not acked and committed.
+ *
+ * @param c completion to release
+ */
+CEPH_RADOS_API void rados_aio_release(rados_completion_t c);
+
+/**
+ * Write data to an object asynchronously
+ *
+ * Queues the write and returns. The return value of the completion
+ * will be 0 on success, negative error code on failure.
+ *
+ * @param io the context in which the write will occur
+ * @param oid name of the object
+ * @param completion what to do when the write is safe and complete
+ * @param buf data to write
+ * @param len length of the data, in bytes
+ * @param off byte offset in the object to begin writing at
+ * @returns 0 on success, -EROFS if the io context specifies a snap_seq
+ * other than LIBRADOS_SNAP_HEAD
+ */
+CEPH_RADOS_API int rados_aio_write(rados_ioctx_t io, const char *oid,
+ rados_completion_t completion,
+ const char *buf, size_t len, uint64_t off);
+
+/**
+ * Asynchronously append data to an object
+ *
+ * Queues the append and returns.
+ * + * The return value of the completion will be 0 on success, negative + * error code on failure. + * + * @param io the context to operate in + * @param oid the name of the object + * @param completion what to do when the append is safe and complete + * @param buf the data to append + * @param len length of buf (in bytes) + * @returns 0 on success, -EROFS if the io context specifies a snap_seq + * other than LIBRADOS_SNAP_HEAD + */ +CEPH_RADOS_API int rados_aio_append(rados_ioctx_t io, const char *oid, + rados_completion_t completion, + const char *buf, size_t len); + +/** + * Asynchronously write an entire object + * + * The object is filled with the provided data. If the object exists, + * it is atomically truncated and then written. + * Queues the write_full and returns. + * + * The return value of the completion will be 0 on success, negative + * error code on failure. + * + * @param io the io context in which the write will occur + * @param oid name of the object + * @param completion what to do when the write_full is safe and complete + * @param buf data to write + * @param len length of the data, in bytes + * @returns 0 on success, -EROFS if the io context specifies a snap_seq + * other than LIBRADOS_SNAP_HEAD + */ +CEPH_RADOS_API int rados_aio_write_full(rados_ioctx_t io, const char *oid, + rados_completion_t completion, + const char *buf, size_t len); + +/** + * Asynchronously write the same buffer multiple times + * + * Queues the writesame and returns. + * + * The return value of the completion will be 0 on success, negative + * error code on failure. 
+ * + * @param io the io context in which the write will occur + * @param oid name of the object + * @param completion what to do when the writesame is safe and complete + * @param buf data to write + * @param data_len length of the data, in bytes + * @param write_len the total number of bytes to write + * @param off byte offset in the object to begin writing at + * @returns 0 on success, -EROFS if the io context specifies a snap_seq + * other than LIBRADOS_SNAP_HEAD + */ +CEPH_RADOS_API int rados_aio_writesame(rados_ioctx_t io, const char *oid, + rados_completion_t completion, + const char *buf, size_t data_len, + size_t write_len, uint64_t off); + +/** + * Asynchronously remove an object + * + * Queues the remove and returns. + * + * The return value of the completion will be 0 on success, negative + * error code on failure. + * + * @param io the context to operate in + * @param oid the name of the object + * @param completion what to do when the remove is safe and complete + * @returns 0 on success, -EROFS if the io context specifies a snap_seq + * other than LIBRADOS_SNAP_HEAD + */ +CEPH_RADOS_API int rados_aio_remove(rados_ioctx_t io, const char *oid, + rados_completion_t completion); + +/** + * Asynchronously read data from an object + * + * The io context determines the snapshot to read from, if any was set + * by rados_ioctx_snap_set_read(). + * + * The return value of the completion will be number of bytes read on + * success, negative error code on failure. + * + * @note only the 'complete' callback of the completion will be called. 
+ * + * @param io the context in which to perform the read + * @param oid the name of the object to read from + * @param completion what to do when the read is complete + * @param buf where to store the results + * @param len the number of bytes to read + * @param off the offset to start reading from in the object + * @returns 0 on success, negative error code on failure + */ +CEPH_RADOS_API int rados_aio_read(rados_ioctx_t io, const char *oid, + rados_completion_t completion, + char *buf, size_t len, uint64_t off); + +/** + * Block until all pending writes in an io context are safe + * + * This is not equivalent to calling rados_aio_wait_for_safe() on all + * write completions, since this waits for the associated callbacks to + * complete as well. + * + * @note BUG: always returns 0, should be void or accept a timeout + * + * @param io the context to flush + * @returns 0 on success, negative error code on failure + */ +CEPH_RADOS_API int rados_aio_flush(rados_ioctx_t io); + + +/** + * Schedule a callback for when all currently pending + * aio writes are safe. This is a non-blocking version of + * rados_aio_flush(). 
+ * + * @param io the context to flush + * @param completion what to do when the writes are safe + * @returns 0 on success, negative error code on failure + */ +CEPH_RADOS_API int rados_aio_flush_async(rados_ioctx_t io, + rados_completion_t completion); + + +/** + * Asynchronously get object stats (size/mtime) + * + * @param io ioctx + * @param o object name + * @param completion what to do when the stat is complete + * @param psize where to store object size + * @param pmtime where to store modification time + * @returns 0 on success, negative error code on failure + */ +CEPH_RADOS_API int rados_aio_stat(rados_ioctx_t io, const char *o, + rados_completion_t completion, + uint64_t *psize, time_t *pmtime); + +CEPH_RADOS_API int rados_aio_stat2(rados_ioctx_t io, const char *o, + rados_completion_t completion, + uint64_t *psize, struct timespec *pmtime); + +/** + * Asynchronously compare an on-disk object range with a buffer + * + * @param io the context in which to perform the comparison + * @param o the name of the object to compare with + * @param completion what to do when the comparison is complete + * @param cmp_buf buffer containing bytes to be compared with object contents + * @param cmp_len length to compare and size of @c cmp_buf in bytes + * @param off object byte offset at which to start the comparison + * @returns 0 on success, negative error code on failure, + * (-MAX_ERRNO - mismatch_off) on mismatch + */ +CEPH_RADOS_API int rados_aio_cmpext(rados_ioctx_t io, const char *o, + rados_completion_t completion, + const char *cmp_buf, + size_t cmp_len, + uint64_t off); + +/** + * Cancel async operation + * + * @param io ioctx + * @param completion completion handle + * @returns 0 on success, negative error code on failure + */ +CEPH_RADOS_API int rados_aio_cancel(rados_ioctx_t io, + rados_completion_t completion); + +/** + * Asynchronously execute an OSD class method on an object + * + * The OSD has a plugin mechanism for performing complicated + * operations 
on an object atomically. These plugins are called + * classes. This function allows librados users to call the custom + * methods. The input and output formats are defined by the class. + * Classes in ceph.git can be found in src/cls subdirectories + * + * @param io the context in which to call the method + * @param o name of the object + * @param completion what to do when the exec completes + * @param cls the name of the class + * @param method the name of the method + * @param in_buf where to find input + * @param in_len length of in_buf in bytes + * @param buf where to store output + * @param out_len length of buf in bytes + * @returns 0 on success, negative error code on failure + */ +CEPH_RADOS_API int rados_aio_exec(rados_ioctx_t io, const char *o, + rados_completion_t completion, + const char *cls, const char *method, + const char *in_buf, size_t in_len, + char *buf, size_t out_len); + +/** @} Asynchronous I/O */ + +/** + * @name Asynchronous Xattrs + * Extended attributes are stored as extended attributes on the files + * representing an object on the OSDs. Thus, they have the same + * limitations as the underlying filesystem. On ext4, this means that + * the total data stored in xattrs cannot exceed 4KB. + * + * @{ + */ + +/** + * Asynchronously get the value of an extended attribute on an object. + * + * @param io the context in which the attribute is read + * @param o name of the object + * @param completion what to do when the getxattr completes + * @param name which extended attribute to read + * @param buf where to store the result + * @param len size of buf in bytes + * @returns length of xattr value on success, negative error code on failure + */ +CEPH_RADOS_API int rados_aio_getxattr(rados_ioctx_t io, const char *o, + rados_completion_t completion, + const char *name, char *buf, size_t len); + +/** + * Asynchronously set an extended attribute on an object. 
+ *
+ * @param io the context in which xattr is set
+ * @param o name of the object
+ * @param completion what to do when the setxattr completes
+ * @param name which extended attribute to set
+ * @param buf what to store in the xattr
+ * @param len the number of bytes in buf
+ * @returns 0 on success, negative error code on failure
+ */
+CEPH_RADOS_API int rados_aio_setxattr(rados_ioctx_t io, const char *o,
+ rados_completion_t completion,
+ const char *name, const char *buf,
+ size_t len);
+
+/**
+ * Asynchronously delete an extended attribute from an object.
+ *
+ * @param io the context in which to delete the xattr
+ * @param o the name of the object
+ * @param completion what to do when the rmxattr completes
+ * @param name which xattr to delete
+ * @returns 0 on success, negative error code on failure
+ */
+CEPH_RADOS_API int rados_aio_rmxattr(rados_ioctx_t io, const char *o,
+ rados_completion_t completion,
+ const char *name);
+
+/**
+ * Asynchronously start iterating over xattrs on an object.
+ *
+ * @post iter is a valid iterator
+ *
+ * @param io the context in which to list xattrs
+ * @param oid name of the object
+ * @param completion what to do when the getxattrs completes
+ * @param iter where to store the iterator
+ * @returns 0 on success, negative error code on failure
+ */
+CEPH_RADOS_API int rados_aio_getxattrs(rados_ioctx_t io, const char *oid,
+ rados_completion_t completion,
+ rados_xattrs_iter_t *iter);
+
+/** @} Asynchronous Xattrs */
+
+/**
+ * @name Watch/Notify
+ *
+ * Watch/notify is a protocol to help communicate among clients. It
+ * can be used to synchronize client state. All that's needed is a
+ * well-known object name (for example, rbd uses the header object of
+ * an image).
+ *
+ * Watchers register an interest in an object, and receive all
+ * notifies on that object. A notify attempts to communicate with all
+ * clients watching an object, and blocks on the notifier until each
+ * client responds or a timeout is reached.
+ * + * See rados_watch() and rados_notify() for more details. + * + * @{ + */ + +/** + * @typedef rados_watchcb_t + * + * Callback activated when a notify is received on a watched + * object. + * + * @param opcode undefined + * @param ver version of the watched object + * @param arg application-specific data + * + * @note BUG: opcode is an internal detail that shouldn't be exposed + * @note BUG: ver is unused + */ +typedef void (*rados_watchcb_t)(uint8_t opcode, uint64_t ver, void *arg); + +/** + * @typedef rados_watchcb2_t + * + * Callback activated when a notify is received on a watched + * object. + * + * @param arg opaque user-defined value provided to rados_watch2() + * @param notify_id an id for this notify event + * @param handle the watcher handle we are notifying + * @param notifier_id the unique client id for the notifier + * @param data payload from the notifier + * @param data_len length of payload buffer + */ +typedef void (*rados_watchcb2_t)(void *arg, + uint64_t notify_id, + uint64_t handle, + uint64_t notifier_id, + void *data, + size_t data_len); + +/** + * @typedef rados_watcherrcb_t + * + * Callback activated when we encounter an error with the watch session. + * This can happen when the location of the objects moves within the + * cluster and we fail to register our watch with the new object location, + * or when our connection with the object OSD is otherwise interrupted and + * we may have missed notify events. + * + * @param pre opaque user-defined value provided to rados_watch2() + * @param cookie the internal id assigned to the watch session + * @param err error code + */ + typedef void (*rados_watcherrcb_t)(void *pre, uint64_t cookie, int err); + +/** + * Register an interest in an object + * + * A watch operation registers the client as being interested in + * notifications on an object. OSDs keep track of watches on + * persistent storage, so they are preserved across cluster changes by + * the normal recovery process. 
If the client loses its connection to + * the primary OSD for a watched object, the watch will be removed + * after 30 seconds. Watches are automatically reestablished when a new + * connection is made, or a placement group switches OSDs. + * + * @note BUG: librados should provide a way for watchers to notice connection resets + * @note BUG: the ver parameter does not work, and -ERANGE will never be returned + * (See URL tracker.ceph.com/issues/2592) + * + * @param io the pool the object is in + * @param o the object to watch + * @param ver expected version of the object + * @param cookie where to store the internal id assigned to this watch + * @param watchcb what to do when a notify is received on this object + * @param arg application defined data to pass when watchcb is called + * @returns 0 on success, negative error code on failure + * @returns -ERANGE if the version of the object is greater than ver + */ +CEPH_RADOS_API int rados_watch(rados_ioctx_t io, const char *o, uint64_t ver, + uint64_t *cookie, + rados_watchcb_t watchcb, void *arg) + __attribute__((deprecated)); + + +/** + * Register an interest in an object + * + * A watch operation registers the client as being interested in + * notifications on an object. OSDs keep track of watches on + * persistent storage, so they are preserved across cluster changes by + * the normal recovery process. If the client loses its connection to the + * primary OSD for a watched object, the watch will be removed after + * a timeout configured with osd_client_watch_timeout. + * Watches are automatically reestablished when a new + * connection is made, or a placement group switches OSDs. 
+ * + * @param io the pool the object is in + * @param o the object to watch + * @param cookie where to store the internal id assigned to this watch + * @param watchcb what to do when a notify is received on this object + * @param watcherrcb what to do when the watch session encounters an error + * @param arg opaque value to pass to the callback + * @returns 0 on success, negative error code on failure + */ +CEPH_RADOS_API int rados_watch2(rados_ioctx_t io, const char *o, uint64_t *cookie, + rados_watchcb2_t watchcb, + rados_watcherrcb_t watcherrcb, + void *arg); + +/** + * Register an interest in an object + * + * A watch operation registers the client as being interested in + * notifications on an object. OSDs keep track of watches on + * persistent storage, so they are preserved across cluster changes by + * the normal recovery process. Watches are automatically reestablished when a new + * connection is made, or a placement group switches OSDs. + * + * @param io the pool the object is in + * @param o the object to watch + * @param cookie where to store the internal id assigned to this watch + * @param watchcb what to do when a notify is received on this object + * @param watcherrcb what to do when the watch session encounters an error + * @param timeout how many seconds the connection will keep after disconnection + * @param arg opaque value to pass to the callback + * @returns 0 on success, negative error code on failure + */ +CEPH_RADOS_API int rados_watch3(rados_ioctx_t io, const char *o, uint64_t *cookie, + rados_watchcb2_t watchcb, + rados_watcherrcb_t watcherrcb, + uint32_t timeout, + void *arg); + +/** + * Asynchronous register an interest in an object + * + * A watch operation registers the client as being interested in + * notifications on an object. OSDs keep track of watches on + * persistent storage, so they are preserved across cluster changes by + * the normal recovery process. 
If the client loses its connection to + * the primary OSD for a watched object, the watch will be removed + * after 30 seconds. Watches are automatically reestablished when a new + * connection is made, or a placement group switches OSDs. + * + * @param io the pool the object is in + * @param o the object to watch + * @param completion what to do when operation has been attempted + * @param handle where to store the internal id assigned to this watch + * @param watchcb what to do when a notify is received on this object + * @param watcherrcb what to do when the watch session encounters an error + * @param arg opaque value to pass to the callback + * @returns 0 on success, negative error code on failure + */ +CEPH_RADOS_API int rados_aio_watch(rados_ioctx_t io, const char *o, + rados_completion_t completion, uint64_t *handle, + rados_watchcb2_t watchcb, + rados_watcherrcb_t watcherrcb, + void *arg); + +/** + * Asynchronous register an interest in an object + * + * A watch operation registers the client as being interested in + * notifications on an object. OSDs keep track of watches on + * persistent storage, so they are preserved across cluster changes by + * the normal recovery process. If the client loses its connection to + * the primary OSD for a watched object, the watch will be removed + * after the number of seconds that configured in timeout parameter. + * Watches are automatically reestablished when a new + * connection is made, or a placement group switches OSDs. 
+ *
+ * @param io the pool the object is in
+ * @param o the object to watch
+ * @param completion what to do when operation has been attempted
+ * @param handle where to store the internal id assigned to this watch
+ * @param watchcb what to do when a notify is received on this object
+ * @param watcherrcb what to do when the watch session encounters an error
+ * @param timeout how many seconds the connection will keep after disconnection
+ * @param arg opaque value to pass to the callback
+ * @returns 0 on success, negative error code on failure
+ */
+CEPH_RADOS_API int rados_aio_watch2(rados_ioctx_t io, const char *o,
+ rados_completion_t completion, uint64_t *handle,
+ rados_watchcb2_t watchcb,
+ rados_watcherrcb_t watcherrcb,
+ uint32_t timeout,
+ void *arg);
+
+/**
+ * Check on the status of a watch
+ *
+ * Return the number of milliseconds since the watch was last confirmed.
+ * Or, if there has been an error, return that.
+ *
+ * If there is an error, the watch is no longer valid, and should be
+ * destroyed with rados_unwatch2(). If the user is still interested
+ * in the object, a new watch should be created with rados_watch2().
+ *
+ * @param io the pool the object is in
+ * @param cookie the watch handle
+ * @returns ms since last confirmed on success, negative error code on failure
+ */
+CEPH_RADOS_API int rados_watch_check(rados_ioctx_t io, uint64_t cookie);
+
+/**
+ * Unregister an interest in an object
+ *
+ * Once this completes, no more notifies will be sent to us for this
+ * watch. This should be called to clean up unneeded watchers.
+ *
+ * @param io the pool the object is in
+ * @param o the name of the watched object (ignored)
+ * @param cookie which watch to unregister
+ * @returns 0 on success, negative error code on failure
+ */
+CEPH_RADOS_API int rados_unwatch(rados_ioctx_t io, const char *o, uint64_t cookie)
+ __attribute__((deprecated));
+
+/**
+ * Unregister an interest in an object
+ *
+ * Once this completes, no more notifies will be sent to us for this
+ * watch. This should be called to clean up unneeded watchers.
+ *
+ * @param io the pool the object is in
+ * @param cookie which watch to unregister
+ * @returns 0 on success, negative error code on failure
+ */
+CEPH_RADOS_API int rados_unwatch2(rados_ioctx_t io, uint64_t cookie);
+
+/**
+ * Asynchronous unregister an interest in an object
+ *
+ * Once this completes, no more notifies will be sent to us for this
+ * watch. This should be called to clean up unneeded watchers.
+ *
+ * @param io the pool the object is in
+ * @param completion what to do when operation has been attempted
+ * @param cookie which watch to unregister
+ * @returns 0 on success, negative error code on failure
+ */
+CEPH_RADOS_API int rados_aio_unwatch(rados_ioctx_t io, uint64_t cookie,
+ rados_completion_t completion);
+
+/**
+ * Synchronously notify watchers of an object
+ *
+ * This blocks until all watchers of the object have received and
+ * reacted to the notify, or a timeout is reached.
+ *
+ * @note BUG: the timeout is not changeable via the C API
+ * @note BUG: the bufferlist is inaccessible in a rados_watchcb_t
+ *
+ * @param io the pool the object is in
+ * @param o the name of the object
+ * @param ver obsolete - just pass zero
+ * @param buf data to send to watchers
+ * @param buf_len length of buf in bytes
+ * @returns 0 on success, negative error code on failure
+ */
+CEPH_RADOS_API int rados_notify(rados_ioctx_t io, const char *o, uint64_t ver,
+ const char *buf, int buf_len)
+ __attribute__((deprecated));
+
+/**
+ * Synchronously notify watchers of an object
+ *
+ * This blocks until all watchers of the object have received and
+ * reacted to the notify, or a timeout is reached.
+ *
+ * The reply buffer is optional. If specified, the client will get
+ * back an encoded buffer that includes the ids of the clients that
+ * acknowledged the notify as well as their notify ack payloads (if
+ * any). Clients that timed out are not included. Even clients that
+ * do not include a notify ack payload are included in the list but
+ * have a 0-length payload associated with them. The format:
+ *
+ * le32 num_acks
+ * {
+ * le64 gid global id for the client (for client.1234 that's 1234)
+ * le64 cookie cookie for the client
+ * le32 buflen length of reply message buffer
+ * u8 * buflen payload
+ * } * num_acks
+ * le32 num_timeouts
+ * {
+ * le64 gid global id for the client
+ * le64 cookie cookie for the client
+ * } * num_timeouts
+ *
+ * Note: There may be multiple instances of the same gid if there are
+ * multiple watchers registered via the same client.
+ *
+ * Note: The buffer must be released with rados_buffer_free() when the
+ * user is done with it.
+ *
+ * Note: Since the result buffer includes clients that time out, it
+ * will be set even when rados_notify() returns an error code (like
+ * -ETIMEDOUT).
+ *
+ * @param io the pool the object is in
+ * @param completion what to do when operation has been attempted
+ * @param o the name of the object
+ * @param buf data to send to watchers
+ * @param buf_len length of buf in bytes
+ * @param timeout_ms notify timeout (in ms)
+ * @param reply_buffer pointer to reply buffer pointer (free with rados_buffer_free)
+ * @param reply_buffer_len pointer to size of reply buffer
+ * @returns 0 on success, negative error code on failure
+ */
+CEPH_RADOS_API int rados_aio_notify(rados_ioctx_t io, const char *o,
+ rados_completion_t completion,
+ const char *buf, int buf_len,
+ uint64_t timeout_ms, char **reply_buffer,
+ size_t *reply_buffer_len);
+CEPH_RADOS_API int rados_notify2(rados_ioctx_t io, const char *o,
+ const char *buf, int buf_len,
+ uint64_t timeout_ms,
+ char **reply_buffer, size_t *reply_buffer_len);
+
+/**
+ * Decode a notify response
+ *
+ * Decode a notify response (from rados_aio_notify() call) into acks and
+ * timeout arrays.
+ *
+ * @param reply_buffer buffer from rados_aio_notify() call
+ * @param reply_buffer_len reply_buffer length
+ * @param acks pointer to struct notify_ack_t pointer
+ * @param nr_acks pointer to ack count
+ * @param timeouts pointer to notify_timeout_t pointer
+ * @param nr_timeouts pointer to timeout count
+ * @returns 0 on success
+ */
+CEPH_RADOS_API int rados_decode_notify_response(char *reply_buffer, size_t reply_buffer_len,
+ struct notify_ack_t **acks, size_t *nr_acks,
+ struct notify_timeout_t **timeouts, size_t *nr_timeouts);
+
+/**
+ * Free notify allocated buffer
+ *
+ * Release memory allocated by rados_decode_notify_response() call
+ *
+ * @param acks notify_ack_t struct (from rados_decode_notify_response())
+ * @param nr_acks ack count
+ * @param timeouts notify_timeout_t struct (from rados_decode_notify_response())
+ */
+CEPH_RADOS_API void rados_free_notify_response(struct notify_ack_t *acks, size_t nr_acks,
+ struct notify_timeout_t *timeouts);
+
+/**
+ * Acknowledge
 receipt of a notify
+ *
+ * @param io the pool the object is in
+ * @param o the name of the object
+ * @param notify_id the notify_id we got on the watchcb2_t callback
+ * @param cookie the watcher handle
+ * @param buf payload to return to notifier (optional)
+ * @param buf_len payload length
+ * @returns 0 on success
+ */
+CEPH_RADOS_API int rados_notify_ack(rados_ioctx_t io, const char *o,
+ uint64_t notify_id, uint64_t cookie,
+ const char *buf, int buf_len);
+
+/**
+ * Flush watch/notify callbacks
+ *
+ * This call will block until all pending watch/notify callbacks have
+ * been executed and the queue is empty. It should usually be called
+ * after shutting down any watches before shutting down the ioctx or
+ * librados to ensure that any callbacks do not misuse the ioctx (for
+ * example by calling rados_notify_ack after the ioctx has been
+ * destroyed).
+ *
+ * @param cluster the cluster handle
+ */
+CEPH_RADOS_API int rados_watch_flush(rados_t cluster);
+/**
+ * Flush watch/notify callbacks
+ *
+ * This call is nonblocking, and the completion will be called
+ * once all pending watch/notify callbacks have been executed and
+ * the queue is empty. It should usually be called after shutting
+ * down any watches before shutting down the ioctx or
+ * librados to ensure that any callbacks do not misuse the ioctx (for
+ * example by calling rados_notify_ack after the ioctx has been
+ * destroyed).
+ *
+ * @param cluster the cluster handle
+ * @param completion what to do when operation has been attempted
+ */
+CEPH_RADOS_API int rados_aio_watch_flush(rados_t cluster, rados_completion_t completion);
+
+/** @} Watch/Notify */
+
+/**
+ * Pin an object in the cache tier
+ *
+ * When an object is pinned in the cache tier, it stays in the cache
+ * tier, and won't be flushed out.
+ * + * @param io the pool the object is in + * @param o the object id + * @returns 0 on success, negative error code on failure + */ +CEPH_RADOS_API int rados_cache_pin(rados_ioctx_t io, const char *o); + +/** + * Unpin an object in the cache tier + * + * After an object is unpinned in the cache tier, it can be flushed out + * + * @param io the pool the object is in + * @param o the object id + * @returns 0 on success, negative error code on failure + */ +CEPH_RADOS_API int rados_cache_unpin(rados_ioctx_t io, const char *o); + +/** + * @name Hints + * + * @{ + */ + +/** + * Set allocation hint for an object + * + * This is an advisory operation, it will always succeed (as if it was + * submitted with a LIBRADOS_OP_FLAG_FAILOK flag set) and is not + * guaranteed to do anything on the backend. + * + * @param io the pool the object is in + * @param o the name of the object + * @param expected_object_size expected size of the object, in bytes + * @param expected_write_size expected size of writes to the object, in bytes + * @returns 0 on success, negative error code on failure + */ +CEPH_RADOS_API int rados_set_alloc_hint(rados_ioctx_t io, const char *o, + uint64_t expected_object_size, + uint64_t expected_write_size); + +/** + * Set allocation hint for an object + * + * This is an advisory operation, it will always succeed (as if it was + * submitted with a LIBRADOS_OP_FLAG_FAILOK flag set) and is not + * guaranteed to do anything on the backend. 
+ *
+ * @param io the pool the object is in
+ * @param o the name of the object
+ * @param expected_object_size expected size of the object, in bytes
+ * @param expected_write_size expected size of writes to the object, in bytes
+ * @param flags hints about future IO patterns
+ * @returns 0 on success, negative error code on failure
+ */
+CEPH_RADOS_API int rados_set_alloc_hint2(rados_ioctx_t io, const char *o,
+                                         uint64_t expected_object_size,
+                                         uint64_t expected_write_size,
+                                         uint32_t flags);
+
+/** @} Hints */
+
+/**
+ * @name Object Operations
+ *
+ * A single rados operation can do multiple operations on one object
+ * atomically. The whole operation will succeed or fail, and no partial
+ * results will be visible.
+ *
+ * Operations may be either reads, which can return data, or writes,
+ * which cannot. The effects of writes are applied and visible all at
+ * once, so an operation that sets an xattr and then checks its value
+ * will not see the updated value.
+ *
+ * @{
+ */
+
+/**
+ * Create a new rados_write_op_t write operation. This will store all actions
+ * to be performed atomically. You must call rados_release_write_op when you are
+ * finished with it.
+ *
+ * @note the ownership of a write operation is passed to the function
+ * performing the operation, so the same instance of @c rados_write_op_t
+ * cannot be used again after being performed.
+ *
+ * @returns non-NULL on success, NULL on memory allocation error.
+ */
+CEPH_RADOS_API rados_write_op_t rados_create_write_op(void);
+
+/**
+ * Free a rados_write_op_t, must be called when you're done with it.
+ * @param write_op operation to deallocate, created with rados_create_write_op
+ */
+CEPH_RADOS_API void rados_release_write_op(rados_write_op_t write_op);
+
+/**
+ * Set flags for the last operation added to this write_op.
+ * At least one op must have been added to the write_op. 
+ * @param write_op operation to add this action to + * @param flags see librados.h constants beginning with LIBRADOS_OP_FLAG + */ +CEPH_RADOS_API void rados_write_op_set_flags(rados_write_op_t write_op, + int flags); + +/** + * Ensure that the object exists before writing + * @param write_op operation to add this action to + */ +CEPH_RADOS_API void rados_write_op_assert_exists(rados_write_op_t write_op); + +/** + * Ensure that the object exists and that its internal version + * number is equal to "ver" before writing. "ver" should be a + * version number previously obtained with rados_get_last_version(). + * - If the object's version is greater than the asserted version + * then rados_write_op_operate will return -ERANGE instead of + * executing the op. + * - If the object's version is less than the asserted version + * then rados_write_op_operate will return -EOVERFLOW instead + * of executing the op. + * @param write_op operation to add this action to + * @param ver object version number + */ +CEPH_RADOS_API void rados_write_op_assert_version(rados_write_op_t write_op, uint64_t ver); + +/** + * Ensure that given object range (extent) satisfies comparison. + * + * @param write_op operation to add this action to + * @param cmp_buf buffer containing bytes to be compared with object contents + * @param cmp_len length to compare and size of @c cmp_buf in bytes + * @param off object byte offset at which to start the comparison + * @param prval returned result of comparison, 0 on success, negative error code + * on failure, (-MAX_ERRNO - mismatch_off) on mismatch + */ +CEPH_RADOS_API void rados_write_op_cmpext(rados_write_op_t write_op, + const char *cmp_buf, + size_t cmp_len, + uint64_t off, + int *prval); + +/** + * Ensure that given xattr satisfies comparison. 
+ * If the comparison is not satisfied, the return code of the
+ * operation will be -ECANCELED
+ * @param write_op operation to add this action to
+ * @param name name of the xattr to look up
+ * @param comparison_operator currently undocumented, look for
+ * LIBRADOS_CMPXATTR_OP_EQ in librados.h
+ * @param value buffer to compare actual xattr value to
+ * @param value_len length of buffer to compare actual xattr value to
+ */
+CEPH_RADOS_API void rados_write_op_cmpxattr(rados_write_op_t write_op,
+                                            const char *name,
+                                            uint8_t comparison_operator,
+                                            const char *value,
+                                            size_t value_len);
+
+/**
+ * Ensure that an omap value satisfies a comparison,
+ * with the supplied value on the right hand side (i.e.
+ * for OP_LT, the comparison is actual_value < value).
+ *
+ * @param write_op operation to add this action to
+ * @param key which omap value to compare
+ * @param comparison_operator one of LIBRADOS_CMPXATTR_OP_EQ,
+   LIBRADOS_CMPXATTR_OP_LT, or LIBRADOS_CMPXATTR_OP_GT
+ * @param val value to compare with
+ * @param val_len length of value in bytes
+ * @param prval where to store the return value from this action
+ */
+CEPH_RADOS_API void rados_write_op_omap_cmp(rados_write_op_t write_op,
+                                            const char *key,
+                                            uint8_t comparison_operator,
+                                            const char *val,
+                                            size_t val_len,
+                                            int *prval);
+
+/**
+ * Ensure that an omap value satisfies a comparison,
+ * with the supplied value on the right hand side (i.e.
+ * for OP_LT, the comparison is actual_value < value). 
+ *
+ * @param write_op operation to add this action to
+ * @param key which omap value to compare
+ * @param comparison_operator one of LIBRADOS_CMPXATTR_OP_EQ,
+   LIBRADOS_CMPXATTR_OP_LT, or LIBRADOS_CMPXATTR_OP_GT
+ * @param val value to compare with
+ * @param key_len length of key in bytes
+ * @param val_len length of value in bytes
+ * @param prval where to store the return value from this action
+ */
+CEPH_RADOS_API void rados_write_op_omap_cmp2(rados_write_op_t write_op,
+                                             const char *key,
+                                             uint8_t comparison_operator,
+                                             const char *val,
+                                             size_t key_len,
+                                             size_t val_len,
+                                             int *prval);
+
+/**
+ * Set an xattr
+ * @param write_op operation to add this action to
+ * @param name name of the xattr
+ * @param value buffer to set xattr to
+ * @param value_len length of buffer to set xattr to
+ */
+CEPH_RADOS_API void rados_write_op_setxattr(rados_write_op_t write_op,
+                                            const char *name,
+                                            const char *value,
+                                            size_t value_len);
+
+/**
+ * Remove an xattr
+ * @param write_op operation to add this action to
+ * @param name name of the xattr to remove
+ */
+CEPH_RADOS_API void rados_write_op_rmxattr(rados_write_op_t write_op,
+                                           const char *name);
+
+/**
+ * Create the object
+ * @param write_op operation to add this action to
+ * @param exclusive set to either LIBRADOS_CREATE_EXCLUSIVE
+   (which will error if the object already exists) or
+   LIBRADOS_CREATE_IDEMPOTENT.
+ * @param category category string (DEPRECATED, HAS NO EFFECT)
+ */
+CEPH_RADOS_API void rados_write_op_create(rados_write_op_t write_op,
+                                          int exclusive,
+                                          const char* category);
+
+/**
+ * Write to offset
+ * @param write_op operation to add this action to
+ * @param offset offset to write to
+ * @param buffer bytes to write
+ * @param len length of buffer
+ */
+CEPH_RADOS_API void rados_write_op_write(rados_write_op_t write_op,
+                                         const char *buffer,
+                                         size_t len,
+                                         uint64_t offset);
+
+/**
+ * Write whole object, atomically replacing it. 
+ * @param write_op operation to add this action to + * @param buffer bytes to write + * @param len length of buffer + */ +CEPH_RADOS_API void rados_write_op_write_full(rados_write_op_t write_op, + const char *buffer, + size_t len); + +/** + * Write the same buffer multiple times + * @param write_op operation to add this action to + * @param buffer bytes to write + * @param data_len length of buffer + * @param write_len total number of bytes to write, as a multiple of @c data_len + * @param offset offset to write to + */ +CEPH_RADOS_API void rados_write_op_writesame(rados_write_op_t write_op, + const char *buffer, + size_t data_len, + size_t write_len, + uint64_t offset); + +/** + * Append to end of object. + * @param write_op operation to add this action to + * @param buffer bytes to write + * @param len length of buffer + */ +CEPH_RADOS_API void rados_write_op_append(rados_write_op_t write_op, + const char *buffer, + size_t len); +/** + * Remove object + * @param write_op operation to add this action to + */ +CEPH_RADOS_API void rados_write_op_remove(rados_write_op_t write_op); + +/** + * Truncate an object + * @param write_op operation to add this action to + * @param offset Offset to truncate to + */ +CEPH_RADOS_API void rados_write_op_truncate(rados_write_op_t write_op, + uint64_t offset); + +/** + * Zero part of an object + * @param write_op operation to add this action to + * @param offset Offset to zero + * @param len length to zero + */ +CEPH_RADOS_API void rados_write_op_zero(rados_write_op_t write_op, + uint64_t offset, + uint64_t len); + +/** + * Execute an OSD class method on an object + * See rados_exec() for general description. 
+ * + * @param write_op operation to add this action to + * @param cls the name of the class + * @param method the name of the method + * @param in_buf where to find input + * @param in_len length of in_buf in bytes + * @param prval where to store the return value from the method + */ +CEPH_RADOS_API void rados_write_op_exec(rados_write_op_t write_op, + const char *cls, + const char *method, + const char *in_buf, + size_t in_len, + int *prval); + +/** + * Set key/value pairs on an object + * + * @param write_op operation to add this action to + * @param keys array of null-terminated char arrays representing keys to set + * @param vals array of pointers to values to set + * @param lens array of lengths corresponding to each value + * @param num number of key/value pairs to set + */ +CEPH_RADOS_API void rados_write_op_omap_set(rados_write_op_t write_op, + char const* const* keys, + char const* const* vals, + const size_t *lens, + size_t num); + +/** + * Set key/value pairs on an object + * + * @param write_op operation to add this action to + * @param keys array of null-terminated char arrays representing keys to set + * @param vals array of pointers to values to set + * @param key_lens array of lengths corresponding to each key + * @param val_lens array of lengths corresponding to each value + * @param num number of key/value pairs to set + */ +CEPH_RADOS_API void rados_write_op_omap_set2(rados_write_op_t write_op, + char const* const* keys, + char const* const* vals, + const size_t *key_lens, + const size_t *val_lens, + size_t num); + +/** + * Remove key/value pairs from an object + * + * @param write_op operation to add this action to + * @param keys array of null-terminated char arrays representing keys to remove + * @param keys_len number of key/value pairs to remove + */ +CEPH_RADOS_API void rados_write_op_omap_rm_keys(rados_write_op_t write_op, + char const* const* keys, + size_t keys_len); + +/** + * Remove key/value pairs from an object + * + * @param 
write_op operation to add this action to + * @param keys array of char arrays representing keys to remove + * @param key_lens array of size_t values representing length of each key + * @param keys_len number of key/value pairs to remove + */ +CEPH_RADOS_API void rados_write_op_omap_rm_keys2(rados_write_op_t write_op, + char const* const* keys, + const size_t* key_lens, + size_t keys_len); + + +/** + * Remove key/value pairs from an object whose keys are in the range + * [key_begin, key_end) + * + * @param write_op operation to add this action to + * @param key_begin the lower bound of the key range to remove + * @param key_begin_len length of key_begin + * @param key_end the upper bound of the key range to remove + * @param key_end_len length of key_end + */ +CEPH_RADOS_API void rados_write_op_omap_rm_range2(rados_write_op_t write_op, + const char *key_begin, + size_t key_begin_len, + const char *key_end, + size_t key_end_len); + +/** + * Remove all key/value pairs from an object + * + * @param write_op operation to add this action to + */ +CEPH_RADOS_API void rados_write_op_omap_clear(rados_write_op_t write_op); + +/** + * Set allocation hint for an object + * + * @param write_op operation to add this action to + * @param expected_object_size expected size of the object, in bytes + * @param expected_write_size expected size of writes to the object, in bytes + */ +CEPH_RADOS_API void rados_write_op_set_alloc_hint(rados_write_op_t write_op, + uint64_t expected_object_size, + uint64_t expected_write_size); + +/** + * Set allocation hint for an object + * + * @param write_op operation to add this action to + * @param expected_object_size expected size of the object, in bytes + * @param expected_write_size expected size of writes to the object, in bytes + * @param flags hints about future IO patterns + */ +CEPH_RADOS_API void rados_write_op_set_alloc_hint2(rados_write_op_t write_op, + uint64_t expected_object_size, + uint64_t expected_write_size, + uint32_t flags); + 
+/** + * Perform a write operation synchronously + * @param write_op operation to perform + * @param io the ioctx that the object is in + * @param oid the object id + * @param mtime the time to set the mtime to, NULL for the current time + * @param flags flags to apply to the entire operation (LIBRADOS_OPERATION_*) + */ +CEPH_RADOS_API int rados_write_op_operate(rados_write_op_t write_op, + rados_ioctx_t io, + const char *oid, + time_t *mtime, + int flags); +/** + * Perform a write operation synchronously + * @param write_op operation to perform + * @param io the ioctx that the object is in + * @param oid the object id + * @param mtime the time to set the mtime to, NULL for the current time + * @param flags flags to apply to the entire operation (LIBRADOS_OPERATION_*) + */ + +CEPH_RADOS_API int rados_write_op_operate2(rados_write_op_t write_op, + rados_ioctx_t io, + const char *oid, + struct timespec *mtime, + int flags); + +/** + * Perform a write operation asynchronously + * @param write_op operation to perform + * @param io the ioctx that the object is in + * @param completion what to do when operation has been attempted + * @param oid the object id + * @param mtime the time to set the mtime to, NULL for the current time + * @param flags flags to apply to the entire operation (LIBRADOS_OPERATION_*) + */ +CEPH_RADOS_API int rados_aio_write_op_operate(rados_write_op_t write_op, + rados_ioctx_t io, + rados_completion_t completion, + const char *oid, + time_t *mtime, + int flags); + +/** + * Perform a write operation asynchronously + * @param write_op operation to perform + * @param io the ioctx that the object is in + * @param completion what to do when operation has been attempted + * @param oid the object id + * @param mtime the time to set the mtime to, NULL for the current time + * @param flags flags to apply to the entire operation (LIBRADOS_OPERATION_*) + */ +CEPH_RADOS_API int rados_aio_write_op_operate2(rados_write_op_t write_op, + rados_ioctx_t io, + 
rados_completion_t completion,
+                                               const char *oid,
+                                               struct timespec *mtime,
+                                               int flags);
+
+/**
+ * Create a new rados_read_op_t read operation. This will store all
+ * actions to be performed atomically. You must call
+ * rados_release_read_op when you are finished with it (after it
+ * completes, or you decide not to send it in the first place).
+ *
+ * @note the ownership of a read operation is passed to the function
+ * performing the operation, so the same instance of @c rados_read_op_t
+ * cannot be used again after being performed.
+ *
+ * @returns non-NULL on success, NULL on memory allocation error.
+ */
+CEPH_RADOS_API rados_read_op_t rados_create_read_op(void);
+
+/**
+ * Free a rados_read_op_t, must be called when you're done with it.
+ * @param read_op operation to deallocate, created with rados_create_read_op
+ */
+CEPH_RADOS_API void rados_release_read_op(rados_read_op_t read_op);
+
+/**
+ * Set flags for the last operation added to this read_op.
+ * At least one op must have been added to the read_op.
+ * @param read_op operation to add this action to
+ * @param flags see librados.h constants beginning with LIBRADOS_OP_FLAG
+ */
+CEPH_RADOS_API void rados_read_op_set_flags(rados_read_op_t read_op, int flags);
+
+/**
+ * Ensure that the object exists before reading
+ * @param read_op operation to add this action to
+ */
+CEPH_RADOS_API void rados_read_op_assert_exists(rados_read_op_t read_op);
+
+/**
+ * Ensure that the object exists and that its internal version
+ * number is equal to "ver" before reading. "ver" should be a
+ * version number previously obtained with rados_get_last_version().
+ * - If the object's version is greater than the asserted version
+ * then rados_read_op_operate will return -ERANGE instead of
+ * executing the op.
+ * - If the object's version is less than the asserted version
+ * then rados_read_op_operate will return -EOVERFLOW instead
+ * of executing the op. 
+ *
+ * @param read_op operation to add this action to
+ * @param ver object version number
+ */
+CEPH_RADOS_API void rados_read_op_assert_version(rados_read_op_t read_op, uint64_t ver);
+
+/**
+ * Ensure that given object range (extent) satisfies comparison.
+ *
+ * @param read_op operation to add this action to
+ * @param cmp_buf buffer containing bytes to be compared with object contents
+ * @param cmp_len length to compare and size of @c cmp_buf in bytes
+ * @param off object byte offset at which to start the comparison
+ * @param prval returned result of comparison, 0 on success, negative error code
+ * on failure, (-MAX_ERRNO - mismatch_off) on mismatch
+ */
+CEPH_RADOS_API void rados_read_op_cmpext(rados_read_op_t read_op,
+                                         const char *cmp_buf,
+                                         size_t cmp_len,
+                                         uint64_t off,
+                                         int *prval);
+
+/**
+ * Ensure that an xattr satisfies a comparison.
+ * If the comparison is not satisfied, the return code of the
+ * operation will be -ECANCELED
+ * @param read_op operation to add this action to
+ * @param name name of the xattr to look up
+ * @param comparison_operator currently undocumented, look for
+ * LIBRADOS_CMPXATTR_OP_EQ in librados.h
+ * @param value buffer to compare actual xattr value to
+ * @param value_len length of buffer to compare actual xattr value to
+ */
+CEPH_RADOS_API void rados_read_op_cmpxattr(rados_read_op_t read_op,
+                                           const char *name,
+                                           uint8_t comparison_operator,
+                                           const char *value,
+                                           size_t value_len);
+
+/**
+ * Start iterating over xattrs on an object.
+ *
+ * @param read_op operation to add this action to
+ * @param iter where to store the iterator
+ * @param prval where to store the return value of this action
+ */
+CEPH_RADOS_API void rados_read_op_getxattrs(rados_read_op_t read_op,
+                                            rados_xattrs_iter_t *iter,
+                                            int *prval);
+
+/**
+ * Ensure that an omap value satisfies a comparison,
+ * with the supplied value on the right hand side (i.e.
+ * for OP_LT, the comparison is actual_value < value). 
+ *
+ * @param read_op operation to add this action to
+ * @param key which omap value to compare
+ * @param comparison_operator one of LIBRADOS_CMPXATTR_OP_EQ,
+   LIBRADOS_CMPXATTR_OP_LT, or LIBRADOS_CMPXATTR_OP_GT
+ * @param val value to compare with
+ * @param val_len length of value in bytes
+ * @param prval where to store the return value from this action
+ */
+CEPH_RADOS_API void rados_read_op_omap_cmp(rados_read_op_t read_op,
+                                           const char *key,
+                                           uint8_t comparison_operator,
+                                           const char *val,
+                                           size_t val_len,
+                                           int *prval);
+
+/**
+ * Ensure that an omap value satisfies a comparison,
+ * with the supplied value on the right hand side (i.e.
+ * for OP_LT, the comparison is actual_value < value).
+ *
+ * @param read_op operation to add this action to
+ * @param key which omap value to compare
+ * @param comparison_operator one of LIBRADOS_CMPXATTR_OP_EQ,
+   LIBRADOS_CMPXATTR_OP_LT, or LIBRADOS_CMPXATTR_OP_GT
+ * @param val value to compare with
+ * @param key_len length of key in bytes
+ * @param val_len length of value in bytes
+ * @param prval where to store the return value from this action
+ */
+CEPH_RADOS_API void rados_read_op_omap_cmp2(rados_read_op_t read_op,
+                                            const char *key,
+                                            uint8_t comparison_operator,
+                                            const char *val,
+                                            size_t key_len,
+                                            size_t val_len,
+                                            int *prval);
+
+/**
+ * Get object size and mtime
+ * @param read_op operation to add this action to
+ * @param psize where to store object size
+ * @param pmtime where to store modification time
+ * @param prval where to store the return value of this action
+ */
+CEPH_RADOS_API void rados_read_op_stat(rados_read_op_t read_op,
+                                       uint64_t *psize,
+                                       time_t *pmtime,
+                                       int *prval);
+
+CEPH_RADOS_API void rados_read_op_stat2(rados_read_op_t read_op,
+                                        uint64_t *psize,
+                                        struct timespec *pmtime,
+                                        int *prval);
+/**
+ * Read bytes from offset into buffer.
+ *
+ * bytes_read will be filled with the number of bytes read if successful. 
+ * A short read can only occur if the read reaches the end of the + * object. + * + * @param read_op operation to add this action to + * @param offset offset to read from + * @param len length of buffer + * @param buffer where to put the data + * @param bytes_read where to store the number of bytes read by this action + * @param prval where to store the return value of this action + */ +CEPH_RADOS_API void rados_read_op_read(rados_read_op_t read_op, + uint64_t offset, + size_t len, + char *buffer, + size_t *bytes_read, + int *prval); + +/** + * Compute checksum from object data + * + * @param read_op operation to add this action to + * @param type the checksum algorithm to utilize + * @param init_value the init value for the algorithm + * @param init_value_len the length of the init value + * @param offset the offset to start checksumming in the object + * @param len the number of bytes to checksum + * @param chunk_size optional length-aligned chunk size for checksums + * @param pchecksum where to store the checksum result for this action + * @param checksum_len the number of bytes available for the result + * @param prval where to store the return value for this action + */ +CEPH_RADOS_API void rados_read_op_checksum(rados_read_op_t read_op, + rados_checksum_type_t type, + const char *init_value, + size_t init_value_len, + uint64_t offset, size_t len, + size_t chunk_size, char *pchecksum, + size_t checksum_len, int *prval); + +/** + * Execute an OSD class method on an object + * See rados_exec() for general description. + * + * The output buffer is allocated on the heap; the caller is + * expected to release that memory with rados_buffer_free(). The + * buffer and length pointers can all be NULL, in which case they are + * not filled in. 
+ * + * @param read_op operation to add this action to + * @param cls the name of the class + * @param method the name of the method + * @param in_buf where to find input + * @param in_len length of in_buf in bytes + * @param out_buf where to put librados-allocated output buffer + * @param out_len length of out_buf in bytes + * @param prval where to store the return value from the method + */ +CEPH_RADOS_API void rados_read_op_exec(rados_read_op_t read_op, + const char *cls, + const char *method, + const char *in_buf, + size_t in_len, + char **out_buf, + size_t *out_len, + int *prval); + +/** + * Execute an OSD class method on an object + * See rados_exec() for general description. + * + * If the output buffer is too small, prval will + * be set to -ERANGE and used_len will be 0. + * + * @param read_op operation to add this action to + * @param cls the name of the class + * @param method the name of the method + * @param in_buf where to find input + * @param in_len length of in_buf in bytes + * @param out_buf user-provided buffer to read into + * @param out_len length of out_buf in bytes + * @param used_len where to store the number of bytes read into out_buf + * @param prval where to store the return value from the method + */ +CEPH_RADOS_API void rados_read_op_exec_user_buf(rados_read_op_t read_op, + const char *cls, + const char *method, + const char *in_buf, + size_t in_len, + char *out_buf, + size_t out_len, + size_t *used_len, + int *prval); + +/** + * Start iterating over key/value pairs on an object. + * + * They will be returned sorted by key. 
+ * + * @param read_op operation to add this action to + * @param start_after list keys starting after start_after + * @param filter_prefix list only keys beginning with filter_prefix + * @param max_return list no more than max_return key/value pairs + * @param iter where to store the iterator + * @param prval where to store the return value from this action + */ +CEPH_RADOS_API void rados_read_op_omap_get_vals(rados_read_op_t read_op, + const char *start_after, + const char *filter_prefix, + uint64_t max_return, + rados_omap_iter_t *iter, + int *prval) + __attribute__((deprecated)); /* use v2 below */ + +/** + * Start iterating over key/value pairs on an object. + * + * They will be returned sorted by key. + * + * @param read_op operation to add this action to + * @param start_after list keys starting after start_after + * @param filter_prefix list only keys beginning with filter_prefix + * @param max_return list no more than max_return key/value pairs + * @param iter where to store the iterator + * @param pmore flag indicating whether there are more keys to fetch + * @param prval where to store the return value from this action + */ +CEPH_RADOS_API void rados_read_op_omap_get_vals2(rados_read_op_t read_op, + const char *start_after, + const char *filter_prefix, + uint64_t max_return, + rados_omap_iter_t *iter, + unsigned char *pmore, + int *prval); + +/** + * Start iterating over keys on an object. + * + * They will be returned sorted by key, and the iterator + * will fill in NULL for all values if specified. 
+ * + * @param read_op operation to add this action to + * @param start_after list keys starting after start_after + * @param max_return list no more than max_return keys + * @param iter where to store the iterator + * @param prval where to store the return value from this action + */ +CEPH_RADOS_API void rados_read_op_omap_get_keys(rados_read_op_t read_op, + const char *start_after, + uint64_t max_return, + rados_omap_iter_t *iter, + int *prval) + __attribute__((deprecated)); /* use v2 below */ + +/** + * Start iterating over keys on an object. + * + * They will be returned sorted by key, and the iterator + * will fill in NULL for all values if specified. + * + * @param read_op operation to add this action to + * @param start_after list keys starting after start_after + * @param max_return list no more than max_return keys + * @param iter where to store the iterator + * @param pmore flag indicating whether there are more keys to fetch + * @param prval where to store the return value from this action + */ +CEPH_RADOS_API void rados_read_op_omap_get_keys2(rados_read_op_t read_op, + const char *start_after, + uint64_t max_return, + rados_omap_iter_t *iter, + unsigned char *pmore, + int *prval); + +/** + * Start iterating over specific key/value pairs + * + * They will be returned sorted by key. + * + * @param read_op operation to add this action to + * @param keys array of pointers to null-terminated keys to get + * @param keys_len the number of strings in keys + * @param iter where to store the iterator + * @param prval where to store the return value from this action + */ +CEPH_RADOS_API void rados_read_op_omap_get_vals_by_keys(rados_read_op_t read_op, + char const* const* keys, + size_t keys_len, + rados_omap_iter_t *iter, + int *prval); + +/** + * Start iterating over specific key/value pairs + * + * They will be returned sorted by key. 
+ * + * @param read_op operation to add this action to + * @param keys array of pointers to keys to get + * @param num_keys the number of strings in keys + * @param key_lens array of size_t's describing each key len (in bytes) + * @param iter where to store the iterator + * @param prval where to store the return value from this action + */ +CEPH_RADOS_API void rados_read_op_omap_get_vals_by_keys2(rados_read_op_t read_op, + char const* const* keys, + size_t num_keys, + const size_t* key_lens, + rados_omap_iter_t *iter, + int *prval); + +/** + * Perform a read operation synchronously + * @param read_op operation to perform + * @param io the ioctx that the object is in + * @param oid the object id + * @param flags flags to apply to the entire operation (LIBRADOS_OPERATION_*) + */ +CEPH_RADOS_API int rados_read_op_operate(rados_read_op_t read_op, + rados_ioctx_t io, + const char *oid, + int flags); + +/** + * Perform a read operation asynchronously + * @param read_op operation to perform + * @param io the ioctx that the object is in + * @param completion what to do when operation has been attempted + * @param oid the object id + * @param flags flags to apply to the entire operation (LIBRADOS_OPERATION_*) + */ +CEPH_RADOS_API int rados_aio_read_op_operate(rados_read_op_t read_op, + rados_ioctx_t io, + rados_completion_t completion, + const char *oid, + int flags); + +/** @} Object Operations */ + +/** + * Take an exclusive lock on an object. + * + * @param io the context to operate in + * @param oid the name of the object + * @param name the name of the lock + * @param cookie user-defined identifier for this instance of the lock + * @param desc user-defined lock description + * @param duration the duration of the lock. Set to NULL for infinite duration. 
+ * @param flags lock flags + * @returns 0 on success, negative error code on failure + * @returns -EBUSY if the lock is already held by another (client, cookie) pair + * @returns -EEXIST if the lock is already held by the same (client, cookie) pair + */ +CEPH_RADOS_API int rados_lock_exclusive(rados_ioctx_t io, const char * oid, + const char * name, const char * cookie, + const char * desc, + struct timeval * duration, + uint8_t flags); + +/** + * Take a shared lock on an object. + * + * @param io the context to operate in + * @param o the name of the object + * @param name the name of the lock + * @param cookie user-defined identifier for this instance of the lock + * @param tag The tag of the lock + * @param desc user-defined lock description + * @param duration the duration of the lock. Set to NULL for infinite duration. + * @param flags lock flags + * @returns 0 on success, negative error code on failure + * @returns -EBUSY if the lock is already held by another (client, cookie) pair + * @returns -EEXIST if the lock is already held by the same (client, cookie) pair + */ +CEPH_RADOS_API int rados_lock_shared(rados_ioctx_t io, const char * o, + const char * name, const char * cookie, + const char * tag, const char * desc, + struct timeval * duration, uint8_t flags); + +/** + * Release a shared or exclusive lock on an object. + * + * @param io the context to operate in + * @param o the name of the object + * @param name the name of the lock + * @param cookie user-defined identifier for the instance of the lock + * @returns 0 on success, negative error code on failure + * @returns -ENOENT if the lock is not held by the specified (client, cookie) pair + */ +CEPH_RADOS_API int rados_unlock(rados_ioctx_t io, const char *o, + const char *name, const char *cookie); + +/** + * Asynchronous release a shared or exclusive lock on an object. 
+ *
+ * @param io the context to operate in
+ * @param o the name of the object
+ * @param name the name of the lock
+ * @param cookie user-defined identifier for the instance of the lock
+ * @param completion what to do when operation has been attempted
+ * @returns 0 on success, negative error code on failure
+ */
+CEPH_RADOS_API int rados_aio_unlock(rados_ioctx_t io, const char *o,
+                                    const char *name, const char *cookie,
+                                    rados_completion_t completion);
+
+/**
+ * List clients that have locked the named object lock and information about
+ * the lock.
+ *
+ * The number of bytes required in each buffer is put in the
+ * corresponding size out parameter. If any of the provided buffers
+ * are too short, -ERANGE is returned after these sizes are filled in.
+ *
+ * @param io the context to operate in
+ * @param o the name of the object
+ * @param name the name of the lock
+ * @param exclusive where to store whether the lock is exclusive (1) or shared (0)
+ * @param tag where to store the tag associated with the object lock
+ * @param tag_len number of bytes in tag buffer
+ * @param clients buffer in which locker clients are stored, separated by '\0'
+ * @param clients_len number of bytes in the clients buffer
+ * @param cookies buffer in which locker cookies are stored, separated by '\0'
+ * @param cookies_len number of bytes in the cookies buffer
+ * @param addrs buffer in which locker addresses are stored, separated by '\0'
+ * @param addrs_len number of bytes in the addrs buffer
+ * @returns number of lockers on success, negative error code on failure
+ * @returns -ERANGE if any of the buffers are too short
+ */
+CEPH_RADOS_API ssize_t rados_list_lockers(rados_ioctx_t io, const char *o,
+                                          const char *name, int *exclusive,
+                                          char *tag, size_t *tag_len,
+                                          char *clients, size_t *clients_len,
+                                          char *cookies, size_t *cookies_len,
+                                          char *addrs, size_t *addrs_len);
+
+/**
+ * Releases a shared or exclusive lock on an object, which was taken by the
+ * specified 
client. + * + * @param io the context to operate in + * @param o the name of the object + * @param name the name of the lock + * @param client the client currently holding the lock + * @param cookie user-defined identifier for the instance of the lock + * @returns 0 on success, negative error code on failure + * @returns -ENOENT if the lock is not held by the specified (client, cookie) pair + * @returns -EINVAL if the client cannot be parsed + */ +CEPH_RADOS_API int rados_break_lock(rados_ioctx_t io, const char *o, + const char *name, const char *client, + const char *cookie); + +/** + * Blocklists the specified client from the OSDs + * + * @param cluster cluster handle + * @param client_address client address + * @param expire_seconds number of seconds to blocklist (0 for default) + * @returns 0 on success, negative error code on failure + */ +CEPH_RADOS_API int rados_blocklist_add(rados_t cluster, + char *client_address, + uint32_t expire_seconds); +CEPH_RADOS_API int rados_blacklist_add(rados_t cluster, + char *client_address, + uint32_t expire_seconds) + __attribute__((deprecated)); + +/** + * Gets addresses of the RADOS session, suitable for blocklisting. + * + * @param cluster cluster handle + * @param addrs the output string. 
+ * @returns 0 on success, negative error code on failure + */ +CEPH_RADOS_API int rados_getaddrs(rados_t cluster, char** addrs); + +CEPH_RADOS_API void rados_set_osdmap_full_try(rados_ioctx_t io) + __attribute__((deprecated)); + +CEPH_RADOS_API void rados_unset_osdmap_full_try(rados_ioctx_t io) + __attribute__((deprecated)); + +CEPH_RADOS_API void rados_set_pool_full_try(rados_ioctx_t io); + +CEPH_RADOS_API void rados_unset_pool_full_try(rados_ioctx_t io); + +/** + * Enable an application on a pool + * + * @param io pool ioctx + * @param app_name application name + * @param force 0 if only single application per pool + * @returns 0 on success, negative error code on failure + */ +CEPH_RADOS_API int rados_application_enable(rados_ioctx_t io, + const char *app_name, int force); + +/** + * List all enabled applications + * + * If the provided buffer is too short, the required length is filled in and + * -ERANGE is returned. Otherwise, the buffers are filled with the application + * names, with a '\0' after each. 
+ *
+ * @param io pool ioctx
+ * @param values buffer in which to store application names
+ * @param values_len number of bytes in values buffer
+ * @returns 0 on success, negative error code on failure
+ * @returns -ERANGE if the buffer is too short
+ */
+CEPH_RADOS_API int rados_application_list(rados_ioctx_t io, char *values,
+                                          size_t *values_len);
+
+/**
+ * Get application metadata value from pool
+ *
+ * @param io pool ioctx
+ * @param app_name application name
+ * @param key metadata key
+ * @param value result buffer
+ * @param value_len maximum len of value
+ * @returns 0 on success, negative error code on failure
+ */
+CEPH_RADOS_API int rados_application_metadata_get(rados_ioctx_t io,
+                                                  const char *app_name,
+                                                  const char *key, char *value,
+                                                  size_t *value_len);
+
+/**
+ * Set application metadata on a pool
+ *
+ * @param io pool ioctx
+ * @param app_name application name
+ * @param key metadata key
+ * @param value metadata value
+ * @returns 0 on success, negative error code on failure
+ */
+CEPH_RADOS_API int rados_application_metadata_set(rados_ioctx_t io,
+                                                  const char *app_name,
+                                                  const char *key,
+                                                  const char *value);
+
+/**
+ * Remove application metadata from a pool
+ *
+ * @param io pool ioctx
+ * @param app_name application name
+ * @param key metadata key
+ * @returns 0 on success, negative error code on failure
+ */
+CEPH_RADOS_API int rados_application_metadata_remove(rados_ioctx_t io,
+                                                     const char *app_name,
+                                                     const char *key);
+
+/**
+ * List all metadata key/value pairs associated with an application.
+ *
+ * This iterates over all metadata; key_len and val_len are filled in
+ * with the number of bytes put into the keys and values buffers.
+ *
+ * If the provided buffers are too short, the required lengths are filled
+ * in and -ERANGE is returned. Otherwise, the buffers are filled with
+ * the keys and values of the metadata, with a '\0' after each.
+ * + * @param io pool ioctx + * @param app_name application name + * @param keys buffer in which to store key names + * @param key_len number of bytes in keys buffer + * @param values buffer in which to store values + * @param vals_len number of bytes in values buffer + * @returns 0 on success, negative error code on failure + * @returns -ERANGE if either buffer is too short + */ +CEPH_RADOS_API int rados_application_metadata_list(rados_ioctx_t io, + const char *app_name, + char *keys, size_t *key_len, + char *values, + size_t *vals_len); + +/** + * @name Mon/OSD/PG Commands + * + * These interfaces send commands relating to the monitor, OSD, or PGs. + * + * @{ + */ + +/** + * Send monitor command. + * + * @note Takes command string in carefully-formatted JSON; must match + * defined commands, types, etc. + * + * The result buffers are allocated on the heap; the caller is + * expected to release that memory with rados_buffer_free(). The + * buffer and length pointers can all be NULL, in which case they are + * not filled in. + * + * @param cluster cluster handle + * @param cmd an array of char *'s representing the command + * @param cmdlen count of valid entries in cmd + * @param inbuf any bulk input data (crush map, etc.) + * @param inbuflen input buffer length + * @param outbuf double pointer to output buffer + * @param outbuflen pointer to output buffer length + * @param outs double pointer to status string + * @param outslen pointer to status string length + * @returns 0 on success, negative error code on failure + */ +CEPH_RADOS_API int rados_mon_command(rados_t cluster, const char **cmd, + size_t cmdlen, const char *inbuf, + size_t inbuflen, char **outbuf, + size_t *outbuflen, char **outs, + size_t *outslen); + +/** + * Send ceph-mgr command. + * + * @note Takes command string in carefully-formatted JSON; must match + * defined commands, types, etc. 
+ * + * The result buffers are allocated on the heap; the caller is + * expected to release that memory with rados_buffer_free(). The + * buffer and length pointers can all be NULL, in which case they are + * not filled in. + * + * @param cluster cluster handle + * @param cmd an array of char *'s representing the command + * @param cmdlen count of valid entries in cmd + * @param inbuf any bulk input data (crush map, etc.) + * @param inbuflen input buffer length + * @param outbuf double pointer to output buffer + * @param outbuflen pointer to output buffer length + * @param outs double pointer to status string + * @param outslen pointer to status string length + * @returns 0 on success, negative error code on failure + */ +CEPH_RADOS_API int rados_mgr_command(rados_t cluster, const char **cmd, + size_t cmdlen, const char *inbuf, + size_t inbuflen, char **outbuf, + size_t *outbuflen, char **outs, + size_t *outslen); + +/** + * Send ceph-mgr tell command. + * + * @note Takes command string in carefully-formatted JSON; must match + * defined commands, types, etc. + * + * The result buffers are allocated on the heap; the caller is + * expected to release that memory with rados_buffer_free(). The + * buffer and length pointers can all be NULL, in which case they are + * not filled in. + * + * @param cluster cluster handle + * @param name mgr name to target + * @param cmd an array of char *'s representing the command + * @param cmdlen count of valid entries in cmd + * @param inbuf any bulk input data (crush map, etc.) 
+ * @param inbuflen input buffer length + * @param outbuf double pointer to output buffer + * @param outbuflen pointer to output buffer length + * @param outs double pointer to status string + * @param outslen pointer to status string length + * @returns 0 on success, negative error code on failure + */ +CEPH_RADOS_API int rados_mgr_command_target( + rados_t cluster, + const char *name, + const char **cmd, + size_t cmdlen, const char *inbuf, + size_t inbuflen, char **outbuf, + size_t *outbuflen, char **outs, + size_t *outslen); + +/** + * Send monitor command to a specific monitor. + * + * @note Takes command string in carefully-formatted JSON; must match + * defined commands, types, etc. + * + * The result buffers are allocated on the heap; the caller is + * expected to release that memory with rados_buffer_free(). The + * buffer and length pointers can all be NULL, in which case they are + * not filled in. + * + * @param cluster cluster handle + * @param name target monitor's name + * @param cmd an array of char *'s representing the command + * @param cmdlen count of valid entries in cmd + * @param inbuf any bulk input data (crush map, etc.) + * @param inbuflen input buffer length + * @param outbuf double pointer to output buffer + * @param outbuflen pointer to output buffer length + * @param outs double pointer to status string + * @param outslen pointer to status string length + * @returns 0 on success, negative error code on failure + */ +CEPH_RADOS_API int rados_mon_command_target(rados_t cluster, const char *name, + const char **cmd, size_t cmdlen, + const char *inbuf, size_t inbuflen, + char **outbuf, size_t *outbuflen, + char **outs, size_t *outslen); + +/** + * free a rados-allocated buffer + * + * Release memory allocated by librados calls like rados_mon_command(). 
+ * + * @param buf buffer pointer + */ +CEPH_RADOS_API void rados_buffer_free(char *buf); + +CEPH_RADOS_API int rados_osd_command(rados_t cluster, int osdid, + const char **cmd, size_t cmdlen, + const char *inbuf, size_t inbuflen, + char **outbuf, size_t *outbuflen, + char **outs, size_t *outslen); + +CEPH_RADOS_API int rados_pg_command(rados_t cluster, const char *pgstr, + const char **cmd, size_t cmdlen, + const char *inbuf, size_t inbuflen, + char **outbuf, size_t *outbuflen, + char **outs, size_t *outslen); + +/* + * This is not a doxygen comment leadin, because doxygen breaks on + * a typedef with function params and returns, and I can't figure out + * how to fix it. + * + * Monitor cluster log + * + * Monitor events logged to the cluster log. The callback get each + * log entry both as a single formatted line and with each field in a + * separate arg. + * + * Calling with a cb argument of NULL will deregister any previously + * registered callback. + * + * @param cluster cluster handle + * @param level minimum log level (debug, info, warn|warning, err|error) + * @param cb callback to run for each log message. It MUST NOT block + * nor call back into librados. + * @param arg void argument to pass to cb + * + * @returns 0 on success, negative code on error + */ +typedef void (*rados_log_callback_t)(void *arg, + const char *line, + const char *who, + uint64_t sec, uint64_t nsec, + uint64_t seq, const char *level, + const char *msg); + +/* + * This is not a doxygen comment leadin, because doxygen breaks on + * a typedef with function params and returns, and I can't figure out + * how to fix it. + * + * Monitor cluster log + * + * Monitor events logged to the cluster log. The callback get each + * log entry both as a single formatted line and with each field in a + * separate arg. + * + * Calling with a cb argument of NULL will deregister any previously + * registered callback. 
+ *
+ * @param cluster cluster handle
+ * @param level minimum log level (debug, info, warn|warning, err|error)
+ * @param cb callback to run for each log message. It MUST NOT block
+ *           nor call back into librados.
+ * @param arg void argument to pass to cb
+ *
+ * @returns 0 on success, negative code on error
+ */
+typedef void (*rados_log_callback2_t)(void *arg,
+                                      const char *line,
+                                      const char *channel,
+                                      const char *who,
+                                      const char *name,
+                                      uint64_t sec, uint64_t nsec,
+                                      uint64_t seq, const char *level,
+                                      const char *msg);
+
+CEPH_RADOS_API int rados_monitor_log(rados_t cluster, const char *level,
+                                     rados_log_callback_t cb, void *arg);
+CEPH_RADOS_API int rados_monitor_log2(rados_t cluster, const char *level,
+                                      rados_log_callback2_t cb, void *arg);
+
+
+/**
+ * register daemon instance for a service
+ *
+ * Register us as a daemon providing a particular service. We identify
+ * the service (e.g., 'rgw') and our instance name (e.g., 'rgw.$hostname').
+ * The metadata is a map of keys and values with arbitrary static metadata
+ * for this instance. The encoding is a series of NULL-terminated strings,
+ * alternating key names and values, terminating with an empty key name.
+ * For example, "foo\0bar\0this\0that\0\0" is the dict {foo=bar,this=that}.
+ *
+ * For the lifetime of the librados instance, regular beacons will be sent
+ * to the cluster to maintain our registration in the service map.
+ *
+ * @param cluster cluster handle
+ * @param service service name
+ * @param daemon daemon instance name
+ * @param metadata_dict static daemon metadata dict
+ */
+CEPH_RADOS_API int rados_service_register(
+  rados_t cluster,
+  const char *service,
+  const char *daemon,
+  const char *metadata_dict);
+
+/**
+ * update daemon status
+ *
+ * Update our mutable status information in the service map.
+ *
+ * The status dict is encoded the same way the daemon metadata is encoded
+ * for rados_service_register. For example, "foo\0bar\0this\0that\0\0" is
+ * {foo=bar,this=that}.
+ * + * @param cluster rados cluster handle + * @param status_dict status dict + */ +CEPH_RADOS_API int rados_service_update_status( + rados_t cluster, + const char *status_dict); + +/** @} Mon/OSD/PG commands */ + +/* + * These methods are no longer supported and return -ENOTSUP where possible. + */ +CEPH_RADOS_API int rados_objects_list_open( + rados_ioctx_t io, + rados_list_ctx_t *ctx) __attribute__((deprecated)); +CEPH_RADOS_API uint32_t rados_objects_list_get_pg_hash_position( + rados_list_ctx_t ctx) __attribute__((deprecated)); +CEPH_RADOS_API uint32_t rados_objects_list_seek( + rados_list_ctx_t ctx, + uint32_t pos) __attribute__((deprecated)); +CEPH_RADOS_API int rados_objects_list_next( + rados_list_ctx_t ctx, + const char **entry, + const char **key) __attribute__((deprecated)); +CEPH_RADOS_API void rados_objects_list_close( + rados_list_ctx_t ctx) __attribute__((deprecated)); + + +#ifdef __cplusplus +} +#endif + +#endif diff --git a/src/include/rados/librados.hpp b/src/include/rados/librados.hpp new file mode 100644 index 000000000..cb8261af1 --- /dev/null +++ b/src/include/rados/librados.hpp @@ -0,0 +1,1568 @@ +#ifndef __LIBRADOS_HPP +#define __LIBRADOS_HPP + +#include <string> +#include <list> +#include <map> +#include <memory> +#include <set> +#include <vector> +#include <utility> +#include "buffer.h" + +#include "librados.h" +#include "librados_fwd.hpp" +#include "rados_types.hpp" + +namespace libradosstriper +{ + class RadosStriper; +} + +namespace neorados { class RADOS; } + +namespace librados { + +using ceph::bufferlist; + +struct AioCompletionImpl; +struct IoCtxImpl; +struct ListObjectImpl; +class NObjectIteratorImpl; +struct ObjListCtx; +class ObjectOperationImpl; +struct PlacementGroupImpl; +struct PoolAsyncCompletionImpl; + +typedef struct rados_cluster_stat_t cluster_stat_t; +typedef struct rados_pool_stat_t pool_stat_t; + +typedef void *list_ctx_t; +typedef uint64_t auid_t; +typedef void *config_t; + +typedef struct { + std::string client; + 
std::string cookie; + std::string address; +} locker_t; + +typedef std::map<std::string, pool_stat_t> stats_map; + +typedef void *completion_t; +typedef void (*callback_t)(completion_t cb, void *arg); + +inline namespace v14_2_0 { + + class IoCtx; + class RadosClient; + + class CEPH_RADOS_API ListObject + { + public: + const std::string& get_nspace() const; + const std::string& get_oid() const; + const std::string& get_locator() const; + + ListObject(); + ~ListObject(); + ListObject( const ListObject&); + ListObject& operator=(const ListObject& rhs); + private: + ListObject(ListObjectImpl *impl); + + friend class librados::NObjectIteratorImpl; + friend std::ostream& operator<<(std::ostream& out, const ListObject& lop); + + ListObjectImpl *impl; + }; + CEPH_RADOS_API std::ostream& operator<<(std::ostream& out, const librados::ListObject& lop); + + class CEPH_RADOS_API NObjectIterator; + + class CEPH_RADOS_API ObjectCursor + { + public: + ObjectCursor(); + ObjectCursor(const ObjectCursor &rhs); + explicit ObjectCursor(rados_object_list_cursor c); + ~ObjectCursor(); + ObjectCursor& operator=(const ObjectCursor& rhs); + bool operator<(const ObjectCursor &rhs) const; + bool operator==(const ObjectCursor &rhs) const; + void set(rados_object_list_cursor c); + + friend class IoCtx; + friend class librados::NObjectIteratorImpl; + friend std::ostream& operator<<(std::ostream& os, const librados::ObjectCursor& oc); + + std::string to_str() const; + bool from_str(const std::string& s); + + protected: + rados_object_list_cursor c_cursor; + }; + CEPH_RADOS_API std::ostream& operator<<(std::ostream& os, const librados::ObjectCursor& oc); + + class CEPH_RADOS_API NObjectIterator { + public: + using iterator_category = std::forward_iterator_tag; + using value_type = ListObject; + using difference_type = std::ptrdiff_t; + using pointer = ListObject*; + using reference = ListObject&; + static const NObjectIterator __EndObjectIterator; + NObjectIterator(): impl(NULL) {} + 
~NObjectIterator(); + NObjectIterator(const NObjectIterator &rhs); + NObjectIterator& operator=(const NObjectIterator& rhs); + + bool operator==(const NObjectIterator& rhs) const; + bool operator!=(const NObjectIterator& rhs) const; + const ListObject& operator*() const; + const ListObject* operator->() const; + NObjectIterator &operator++(); //< Preincrement; errors are thrown as exceptions + NObjectIterator operator++(int); //< Postincrement; errors are thrown as exceptions + friend class IoCtx; + friend class librados::NObjectIteratorImpl; + + /// get current hash position of the iterator, rounded to the current pg + uint32_t get_pg_hash_position() const; + + /// move the iterator to a given hash position. this may (will!) be rounded + /// to the nearest pg. errors are thrown as exceptions + uint32_t seek(uint32_t pos); + + /// move the iterator to a given cursor position. errors are thrown as exceptions + uint32_t seek(const ObjectCursor& cursor); + + /// get current cursor position + ObjectCursor get_cursor(); + + /** + * Configure PGLS filter to be applied OSD-side (requires caller + * to know/understand the format expected by the OSD) + */ + void set_filter(const bufferlist &bl); + + private: + NObjectIterator(ObjListCtx *ctx_); + void get_next(); + NObjectIteratorImpl *impl; + }; + + class CEPH_RADOS_API ObjectItem + { + public: + std::string oid; + std::string nspace; + std::string locator; + }; + + /// DEPRECATED; do not use + class CEPH_RADOS_API WatchCtx { + public: + virtual ~WatchCtx(); + virtual void notify(uint8_t opcode, uint64_t ver, bufferlist& bl) = 0; + }; + + class CEPH_RADOS_API WatchCtx2 { + public: + virtual ~WatchCtx2(); + /** + * Callback activated when we receive a notify event. 
+ * + * @param notify_id unique id for this notify event + * @param cookie the watcher we are notifying + * @param notifier_id the unique client id of the notifier + * @param bl opaque notify payload (from the notifier) + */ + virtual void handle_notify(uint64_t notify_id, + uint64_t cookie, + uint64_t notifier_id, + bufferlist& bl) = 0; + + /** + * Callback activated when we encounter an error with the watch. + * + * Errors we may see: + * -ENOTCONN : our watch was disconnected + * -ETIMEDOUT : our watch is still valid, but we may have missed + * a notify event. + * + * @param cookie the watcher with the problem + * @param err error + */ + virtual void handle_error(uint64_t cookie, int err) = 0; + }; + + struct CEPH_RADOS_API AioCompletion { + AioCompletion(AioCompletionImpl *pc_) : pc(pc_) {} + ~AioCompletion(); + int set_complete_callback(void *cb_arg, callback_t cb); + int set_safe_callback(void *cb_arg, callback_t cb) + __attribute__ ((deprecated)); + int wait_for_complete(); + int wait_for_safe() __attribute__ ((deprecated)); + int wait_for_complete_and_cb(); + int wait_for_safe_and_cb() __attribute__ ((deprecated)); + bool is_complete(); + bool is_safe() __attribute__ ((deprecated)); + bool is_complete_and_cb(); + bool is_safe_and_cb() __attribute__ ((deprecated)); + int get_return_value(); + int get_version() __attribute__ ((deprecated)); + uint64_t get_version64(); + void release(); + AioCompletionImpl *pc; + }; + + struct CEPH_RADOS_API PoolAsyncCompletion { + PoolAsyncCompletion(PoolAsyncCompletionImpl *pc_) : pc(pc_) {} + ~PoolAsyncCompletion(); + int set_callback(void *cb_arg, callback_t cb); + int wait(); + bool is_complete(); + int get_return_value(); + void release(); + PoolAsyncCompletionImpl *pc; + }; + + /** + * These are per-op flags which may be different among + * ops added to an ObjectOperation. 
+ */ + enum ObjectOperationFlags { + OP_EXCL = LIBRADOS_OP_FLAG_EXCL, + OP_FAILOK = LIBRADOS_OP_FLAG_FAILOK, + OP_FADVISE_RANDOM = LIBRADOS_OP_FLAG_FADVISE_RANDOM, + OP_FADVISE_SEQUENTIAL = LIBRADOS_OP_FLAG_FADVISE_SEQUENTIAL, + OP_FADVISE_WILLNEED = LIBRADOS_OP_FLAG_FADVISE_WILLNEED, + OP_FADVISE_DONTNEED = LIBRADOS_OP_FLAG_FADVISE_DONTNEED, + OP_FADVISE_NOCACHE = LIBRADOS_OP_FLAG_FADVISE_NOCACHE, + }; + + class CEPH_RADOS_API ObjectOperationCompletion { + public: + virtual ~ObjectOperationCompletion() {} + virtual void handle_completion(int r, bufferlist& outbl) = 0; + }; + + /** + * These flags apply to the ObjectOperation as a whole. + * + * Prior to octopus BALANCE_READS and LOCALIZE_READS should only + * be used when reading from data you're certain won't change, like + * a snapshot, or where eventual consistency is ok. Since octopus + * (get_min_compatible_osd() >= CEPH_RELEASE_OCTOPUS) both are safe + * for general use. + * + * ORDER_READS_WRITES will order reads the same way writes are + * ordered (e.g., waiting for degraded objects). In particular, it + * will make a write followed by a read sequence be preserved. + * + * IGNORE_CACHE will skip the caching logic on the OSD that normally + * handles promotion of objects between tiers. This allows an operation + * to operate (or read) the cached (or uncached) object, even if it is + * not coherent. + * + * IGNORE_OVERLAY will ignore the pool overlay tiering metadata and + * process the op directly on the destination pool. This is useful + * for CACHE_FLUSH and CACHE_EVICT operations. 
+ */ + enum ObjectOperationGlobalFlags { + OPERATION_NOFLAG = LIBRADOS_OPERATION_NOFLAG, + OPERATION_BALANCE_READS = LIBRADOS_OPERATION_BALANCE_READS, + OPERATION_LOCALIZE_READS = LIBRADOS_OPERATION_LOCALIZE_READS, + OPERATION_ORDER_READS_WRITES = LIBRADOS_OPERATION_ORDER_READS_WRITES, + OPERATION_IGNORE_CACHE = LIBRADOS_OPERATION_IGNORE_CACHE, + OPERATION_SKIPRWLOCKS = LIBRADOS_OPERATION_SKIPRWLOCKS, + OPERATION_IGNORE_OVERLAY = LIBRADOS_OPERATION_IGNORE_OVERLAY, + // send requests to cluster despite the cluster or pool being + // marked full; ops will either succeed (e.g., delete) or return + // EDQUOT or ENOSPC + OPERATION_FULL_TRY = LIBRADOS_OPERATION_FULL_TRY, + // mainly for delete + OPERATION_FULL_FORCE = LIBRADOS_OPERATION_FULL_FORCE, + OPERATION_IGNORE_REDIRECT = LIBRADOS_OPERATION_IGNORE_REDIRECT, + OPERATION_ORDERSNAP = LIBRADOS_OPERATION_ORDERSNAP, + // enable/allow return value and per-op return code/buffers + OPERATION_RETURNVEC = LIBRADOS_OPERATION_RETURNVEC, + }; + + /* + * Alloc hint flags for the alloc_hint operation. + */ + enum AllocHintFlags { + ALLOC_HINT_FLAG_SEQUENTIAL_WRITE = 1, + ALLOC_HINT_FLAG_RANDOM_WRITE = 2, + ALLOC_HINT_FLAG_SEQUENTIAL_READ = 4, + ALLOC_HINT_FLAG_RANDOM_READ = 8, + ALLOC_HINT_FLAG_APPEND_ONLY = 16, + ALLOC_HINT_FLAG_IMMUTABLE = 32, + ALLOC_HINT_FLAG_SHORTLIVED = 64, + ALLOC_HINT_FLAG_LONGLIVED = 128, + ALLOC_HINT_FLAG_COMPRESSIBLE = 256, + ALLOC_HINT_FLAG_INCOMPRESSIBLE = 512, + }; + + /* + * ObjectOperation : compound object operation + * Batch multiple object operations into a single request, to be applied + * atomically. + */ + class CEPH_RADOS_API ObjectOperation + { + public: + ObjectOperation(); + virtual ~ObjectOperation(); + + ObjectOperation(const ObjectOperation&) = delete; + ObjectOperation& operator=(const ObjectOperation&) = delete; + + /** + * Move constructor. + * \warning A moved from ObjectOperation is invalid and may not be used for + * any purpose. 
This is a hard contract violation and will + * kill your program. + */ + ObjectOperation(ObjectOperation&&); + ObjectOperation& operator =(ObjectOperation&&); + + size_t size(); + void set_op_flags(ObjectOperationFlags flags) __attribute__((deprecated)); + //flag mean ObjectOperationFlags + void set_op_flags2(int flags); + + void cmpext(uint64_t off, const bufferlist& cmp_bl, int *prval); + void cmpxattr(const char *name, uint8_t op, const bufferlist& val); + void cmpxattr(const char *name, uint8_t op, uint64_t v); + void exec(const char *cls, const char *method, bufferlist& inbl); + void exec(const char *cls, const char *method, bufferlist& inbl, bufferlist *obl, int *prval); + void exec(const char *cls, const char *method, bufferlist& inbl, ObjectOperationCompletion *completion); + /** + * Guard operation with a check that object version == ver + * + * @param ver [in] version to check + */ + void assert_version(uint64_t ver); + + /** + * Guard operation with a check that the object already exists + */ + void assert_exists(); + + /** + * get key/value pairs for specified keys + * + * @param assertions [in] comparison assertions + * @param prval [out] place error code in prval upon completion + * + * assertions has the form of mappings from keys to (comparison rval, assertion) + * The assertion field may be CEPH_OSD_CMPXATTR_OP_[GT|LT|EQ]. 
+     *
+     * That is, to assert that the value at key "foo" is greater than "bar":
+     *
+     * ObjectReadOperation op;
+     * int r;
+     * map<string, pair<bufferlist, int> > assertions;
+     * bufferlist bar(string("bar"));
+     * assertions["foo"] = make_pair(bar, CEPH_OSD_CMPXATTR_OP_GT);
+     * op.omap_cmp(assertions, &r);
+     */
+    void omap_cmp(
+      const std::map<std::string, std::pair<bufferlist, int> > &assertions,
+      int *prval);
+
+  protected:
+    ObjectOperationImpl* impl;
+    friend class IoCtx;
+    friend class Rados;
+  };
+
+  /*
+   * ObjectWriteOperation : compound object write operation
+   * Batch multiple object operations into a single request, to be applied
+   * atomically.
+   */
+  class CEPH_RADOS_API ObjectWriteOperation : public ObjectOperation
+  {
+  protected:
+    time_t *unused;
+  public:
+    ObjectWriteOperation() : unused(NULL) {}
+    ~ObjectWriteOperation() override {}
+
+    ObjectWriteOperation(ObjectWriteOperation&&) = default;
+    ObjectWriteOperation& operator =(ObjectWriteOperation&&) = default;
+
+    void mtime(time_t *pt);
+    void mtime2(struct timespec *pts);
+
+    void create(bool exclusive);
+    void create(bool exclusive,
+                const std::string& category); ///< NOTE: category is unused
+
+    void write(uint64_t off, const bufferlist& bl);
+    void write_full(const bufferlist& bl);
+    void writesame(uint64_t off, uint64_t write_len,
+                   const bufferlist& bl);
+    void append(const bufferlist& bl);
+    void remove();
+    void truncate(uint64_t off);
+    void zero(uint64_t off, uint64_t len);
+    void rmxattr(const char *name);
+    void setxattr(const char *name, const bufferlist& bl);
+    void setxattr(const char *name, const bufferlist&& bl);
+    void tmap_update(const bufferlist& cmdbl);
+    void tmap_put(const bufferlist& bl);
+    void selfmanaged_snap_rollback(uint64_t snapid);
+
+    /**
+     * Rollback an object to the specified snapshot id
+     *
+     * Used with pool snapshots
+     *
+     * @param snapid [in] snapshot id specified
+     */
+    void snap_rollback(uint64_t snapid);
+
+    /**
+     * set keys and values according to map
+ * + * @param map [in] keys and values to set + */ + void omap_set(const std::map<std::string, bufferlist> &map); + + /** + * set header + * + * @param bl [in] header to set + */ + void omap_set_header(const bufferlist &bl); + + /** + * Clears omap contents + */ + void omap_clear(); + + /** + * Clears keys in to_rm + * + * @param to_rm [in] keys to remove + */ + void omap_rm_keys(const std::set<std::string> &to_rm); + + /** + * Copy an object + * + * Copies an object from another location. The operation is atomic in that + * the copy either succeeds in its entirety or fails (e.g., because the + * source object was modified while the copy was in progress). + * + * @param src source object name + * @param src_ioctx ioctx for the source object + * @param src_version current version of the source object + * @param src_fadvise_flags the fadvise flags for source object + */ + void copy_from(const std::string& src, const IoCtx& src_ioctx, + uint64_t src_version, uint32_t src_fadvise_flags); + + /** + * Copy an object + * + * Copies an object from another location. The operation is atomic in that + * the copy either succeeds in its entirety or fails (e.g., because the + * source object was modified while the copy was in progress). Instead of + * copying truncate_seq and truncate_size from the source object it receives + * these values as parameters. 
+ * + * @param src source object name + * @param src_ioctx ioctx for the source object + * @param src_version current version of the source object + * @param truncate_seq truncate sequence for the destination object + * @param truncate_size truncate size for the destination object + * @param src_fadvise_flags the fadvise flags for source object + */ + void copy_from2(const std::string& src, const IoCtx& src_ioctx, + uint64_t src_version, uint32_t truncate_seq, + uint64_t truncate_size, uint32_t src_fadvise_flags); + + /** + * undirty an object + * + * Clear an objects dirty flag + */ + void undirty(); + + /** + * Set allocation hint for an object + * + * @param expected_object_size expected size of the object, in bytes + * @param expected_write_size expected size of writes to the object, in bytes + * @param flags flags () + */ + void set_alloc_hint(uint64_t expected_object_size, + uint64_t expected_write_size); + void set_alloc_hint2(uint64_t expected_object_size, + uint64_t expected_write_size, + uint32_t flags); + + /** + * Pin/unpin an object in cache tier + * + * @returns 0 on success, negative error code on failure + */ + void cache_pin(); + void cache_unpin(); + + /** + * Extensible tier + * + * Set redirect target + */ + void set_redirect(const std::string& tgt_obj, const IoCtx& tgt_ioctx, + uint64_t tgt_version, int flag = 0); + void tier_promote(); + void unset_manifest(); + + friend class IoCtx; + }; + + /* + * ObjectReadOperation : compound object operation that return value + * Batch multiple object operations into a single request, to be applied + * atomically. 
   */
  class CEPH_RADOS_API ObjectReadOperation : public ObjectOperation
  {
  public:
    ObjectReadOperation() {}
    ~ObjectReadOperation() override {}

    // Movable; user-declared move operations implicitly suppress copying.
    ObjectReadOperation(ObjectReadOperation&&) = default;
    ObjectReadOperation& operator =(ObjectReadOperation&&) = default;

    // Basic read ops. Each call queues one op into the batch; results and
    // per-op error codes are written through the out-pointers on completion.
    void stat(uint64_t *psize, time_t *pmtime, int *prval);
    void stat2(uint64_t *psize, struct timespec *pts, int *prval);
    void getxattr(const char *name, bufferlist *pbl, int *prval);
    void getxattrs(std::map<std::string, bufferlist> *pattrs, int *prval);
    void read(size_t off, uint64_t len, bufferlist *pbl, int *prval);
    void checksum(rados_checksum_type_t type, const bufferlist &init_value_bl,
                  uint64_t off, size_t len, size_t chunk_size, bufferlist *pbl,
                  int *prval);

    /**
     * see aio_sparse_read()
     */
    void sparse_read(uint64_t off, uint64_t len, std::map<uint64_t,uint64_t> *m,
                     bufferlist *data_bl, int *prval,
                     uint64_t truncate_size = 0,
                     uint32_t truncate_seq = 0);

    /**
     * omap_get_vals: keys and values from the object omap
     *
     * Get up to max_return keys and values beginning after start_after
     *
     * @param start_after [in] list no keys smaller than start_after
     * @param max_return [in] list no more than max_return key/value pairs
     * @param out_vals [out] place returned values in out_vals on completion
     * @param prval [out] place error code in prval upon completion
     */
    void omap_get_vals(
      const std::string &start_after,
      uint64_t max_return,
      std::map<std::string, bufferlist> *out_vals,
      int *prval) __attribute__ ((deprecated)); // use v2

    /**
     * omap_get_vals2: keys and values from the object omap
     *
     * Get up to max_return keys and values beginning after start_after
     *
     * @param start_after [in] list no keys smaller than start_after
     * @param max_return [in] list no more than max_return key/value pairs
     * @param out_vals [out] place returned values in out_vals on completion
     * @param pmore [out] pointer to bool indicating whether there are more keys
     * @param prval [out] place error code in prval upon completion
     */
    void omap_get_vals2(
      const std::string &start_after,
      uint64_t max_return,
      std::map<std::string, bufferlist> *out_vals,
      bool *pmore,
      int *prval);

    /**
     * omap_get_vals: keys and values from the object omap
     *
     * Get up to max_return keys and values beginning after start_after
     *
     * @param start_after [in] list keys starting after start_after
     * @param filter_prefix [in] list only keys beginning with filter_prefix
     * @param max_return [in] list no more than max_return key/value pairs
     * @param out_vals [out] place returned values in out_vals on completion
     * @param prval [out] place error code in prval upon completion
     */
    void omap_get_vals(
      const std::string &start_after,
      const std::string &filter_prefix,
      uint64_t max_return,
      std::map<std::string, bufferlist> *out_vals,
      int *prval) __attribute__ ((deprecated)); // use v2

    /**
     * omap_get_vals2: keys and values from the object omap
     *
     * Get up to max_return keys and values beginning after start_after
     *
     * @param start_after [in] list keys starting after start_after
     * @param filter_prefix [in] list only keys beginning with filter_prefix
     * @param max_return [in] list no more than max_return key/value pairs
     * @param out_vals [out] place returned values in out_vals on completion
     * @param pmore [out] pointer to bool indicating whether there are more keys
     * @param prval [out] place error code in prval upon completion
     */
    void omap_get_vals2(
      const std::string &start_after,
      const std::string &filter_prefix,
      uint64_t max_return,
      std::map<std::string, bufferlist> *out_vals,
      bool *pmore,
      int *prval);


    /**
     * omap_get_keys: keys from the object omap
     *
     * Get up to max_return keys beginning after start_after
     *
     * @param start_after [in] list keys starting after start_after
     * @param max_return [in] list no more than max_return keys
     * @param out_keys [out] place returned values in out_keys on completion
     * @param prval [out] place error code in prval upon completion
     */
    void omap_get_keys(const std::string &start_after,
                       uint64_t max_return,
                       std::set<std::string> *out_keys,
                       int *prval) __attribute__ ((deprecated)); // use v2

    /**
     * omap_get_keys2: keys from the object omap
     *
     * Get up to max_return keys beginning after start_after
     *
     * @param start_after [in] list keys starting after start_after
     * @param max_return [in] list no more than max_return keys
     * @param out_keys [out] place returned values in out_keys on completion
     * @param pmore [out] pointer to bool indicating whether there are more keys
     * @param prval [out] place error code in prval upon completion
     */
    void omap_get_keys2(const std::string &start_after,
                        uint64_t max_return,
                        std::set<std::string> *out_keys,
                        bool *pmore,
                        int *prval);

    /**
     * omap_get_header: get header from object omap
     *
     * @param header [out] place header here upon completion
     * @param prval [out] place error code in prval upon completion
     */
    void omap_get_header(bufferlist *header, int *prval);

    /**
     * get key/value pairs for specified keys
     *
     * @param keys [in] keys to get
     * @param map [out] place key/value pairs found here on completion
     * @param prval [out] place error code in prval upon completion
     */
    void omap_get_vals_by_keys(const std::set<std::string> &keys,
                               std::map<std::string, bufferlist> *map,
                               int *prval);

    /**
     * list_watchers: Get list watchers of object
     *
     * @param out_watchers [out] place returned values in out_watchers on completion
     * @param prval [out] place error code in prval upon completion
     */
    void list_watchers(std::list<obj_watch_t> *out_watchers, int *prval);

    /**
     * list snapshot clones associated with a logical object
     *
     * This will include a record for each version of the object,
     * include the "HEAD" (which will have a cloneid of SNAP_HEAD).
     * Each clone includes a vector of snap ids for which it is
     * defined to exist.
     *
     * NOTE: this operation must be submitted from an IoCtx with a
     * read snapid of SNAP_DIR for reliable results.
     *
     * @param out_snaps [out] pointer to resulting snap_set_t
     * @param prval [out] place error code in prval upon completion
     */
    void list_snaps(snap_set_t *out_snaps, int *prval);

    /**
     * query dirty state of an object
     *
     * @param isdirty [out] pointer to resulting bool
     * @param prval [out] place error code in prval upon completion
     */
    void is_dirty(bool *isdirty, int *prval);

    /**
     * flush a cache tier object to backing tier; will block racing
     * updates.
     *
     * This should be used in concert with OPERATION_IGNORE_CACHE to avoid
     * triggering a promotion.
     */
    void cache_flush();

    /**
     * Flush a cache tier object to backing tier; will EAGAIN if we race
     * with an update. Must be used with the SKIPRWLOCKS flag.
     *
     * This should be used in concert with OPERATION_IGNORE_CACHE to avoid
     * triggering a promotion.
     */
    void cache_try_flush();

    /**
     * evict a clean cache tier object
     *
     * This should be used in concert with OPERATION_IGNORE_CACHE to avoid
     * triggering a promote on the OSD (that is then evicted).
     */
    void cache_evict();

    /**
     * Extensible tier
     *
     * set_chunk: make a chunk pointing a part of the source object at the target
     * object
     *
     * @param src_offset [in] source offset to indicate the start position of
     * a chunk in the source object
     * @param src_length [in] source length to set the length of the chunk
     * @param tgt_oid [in] target object's id to set a chunk
     * @param tgt_offset [in] the start position of the target object
     * @param flag [in] flag for the source object
     *
     */
    void set_chunk(uint64_t src_offset, uint64_t src_length, const IoCtx& tgt_ioctx,
                   std::string tgt_oid, uint64_t tgt_offset, int flag = 0);
    /**
     * flush a manifest tier object to backing tier, performing deduplication;
     * will block racing updates.
     *
     * Invoking tier_flush() implicitly makes a manifest object even if
     * the target object is not manifest.
     */
    void tier_flush();
    /**
     * evict a manifest tier object to backing tier; will block racing
     * updates.
     */
    void tier_evict();
  };

  /* IoCtx : This is a context in which we can perform I/O.
   * It includes a Pool,
   *
   * Typical use (error checking omitted):
   *
   * IoCtx p;
   * rados.ioctx_create("my_pool", p);
   * p.stat(&stats);
   * ... etc ...
   *
   * NOTE: be sure to call watch_flush() prior to destroying any IoCtx
   * that is used for watch events to ensure that racing callbacks
   * have completed.
   */
  class CEPH_RADOS_API IoCtx
  {
  public:
    IoCtx();
    static void from_rados_ioctx_t(rados_ioctx_t p, IoCtx &pool);
    IoCtx(const IoCtx& rhs);
    IoCtx& operator=(const IoCtx& rhs);
    IoCtx(IoCtx&& rhs) noexcept;
    IoCtx& operator=(IoCtx&& rhs) noexcept;

    ~IoCtx();

    bool is_valid() const;

    // Close our pool handle
    void close();

    // deep copy
    void dup(const IoCtx& rhs);

    // set pool auid
    int set_auid(uint64_t auid_)
      __attribute__ ((deprecated));

    // set pool auid
    int set_auid_async(uint64_t auid_, PoolAsyncCompletion *c)
      __attribute__ ((deprecated));

    // get pool auid
    int get_auid(uint64_t *auid_)
      __attribute__ ((deprecated));

    uint64_t get_instance_id() const;

    std::string get_pool_name();

    // Pool alignment queries; the *2 variants report errors through the
    // out-parameter and return an error code instead of asserting.
    bool pool_requires_alignment();
    int pool_requires_alignment2(bool * req);
    uint64_t pool_required_alignment();
    int pool_required_alignment2(uint64_t * alignment);

    // create an object
    int create(const std::string& oid, bool exclusive);
    int create(const std::string& oid, bool exclusive,
               const std::string& category); ///< category is unused

    /**
     * write bytes to an object at a specified offset
     *
     * NOTE: this call steals the contents of @param bl.
     */
    int write(const std::string& oid, bufferlist& bl, size_t len, uint64_t off);
    /**
     * append bytes to an object
     *
     * NOTE: this call steals the contents of @param bl.
     */
    int append(const std::string& oid, bufferlist& bl, size_t len);
    /**
     * replace object contents with provided data
     *
     * NOTE: this call steals the contents of @param bl.
     */
    int write_full(const std::string& oid, bufferlist& bl);
    int writesame(const std::string& oid, bufferlist& bl,
                  size_t write_len, uint64_t off);
    int read(const std::string& oid, bufferlist& bl, size_t len, uint64_t off);
    int checksum(const std::string& o, rados_checksum_type_t type,
                 const bufferlist &init_value_bl, size_t len, uint64_t off,
                 size_t chunk_size, bufferlist *pbl);
    int remove(const std::string& oid);
    int remove(const std::string& oid, int flags);
    int trunc(const std::string& oid, uint64_t size);
    int mapext(const std::string& o, uint64_t off, size_t len, std::map<uint64_t,uint64_t>& m);
    int cmpext(const std::string& o, uint64_t off, bufferlist& cmp_bl);
    int sparse_read(const std::string& o, std::map<uint64_t,uint64_t>& m, bufferlist& bl, size_t len, uint64_t off);
    int getxattr(const std::string& oid, const char *name, bufferlist& bl);
    int getxattrs(const std::string& oid, std::map<std::string, bufferlist>& attrset);
    int setxattr(const std::string& oid, const char *name, bufferlist& bl);
    int rmxattr(const std::string& oid, const char *name);
    int stat(const std::string& oid, uint64_t *psize, time_t *pmtime);
    int stat2(const std::string& oid, uint64_t *psize, struct timespec *pts);
    int exec(const std::string& oid, const char *cls, const char *method,
             bufferlist& inbl, bufferlist& outbl);
    /**
     * modify object tmap based on encoded update sequence
     *
     * NOTE: this call steals the contents of @param bl
     */
    int tmap_update(const std::string& oid, bufferlist& cmdbl);

    // Synchronous omap accessors; parameter semantics match the
    // ObjectReadOperation omap_* variants documented above.
    int omap_get_vals(const std::string& oid,
                      const std::string& start_after,
                      uint64_t max_return,
                      std::map<std::string, bufferlist> *out_vals);
    int omap_get_vals2(const std::string& oid,
                       const std::string& start_after,
                       uint64_t max_return,
                       std::map<std::string, bufferlist> *out_vals,
                       bool *pmore);
    int omap_get_vals(const std::string& oid,
                      const std::string& start_after,
                      const std::string& filter_prefix,
                      uint64_t max_return,
                      std::map<std::string, bufferlist> *out_vals);
    int omap_get_vals2(const std::string& oid,
                       const std::string& start_after,
                       const std::string& filter_prefix,
                       uint64_t max_return,
                       std::map<std::string, bufferlist> *out_vals,
                       bool *pmore);
    int omap_get_keys(const std::string& oid,
                      const std::string& start_after,
                      uint64_t max_return,
                      std::set<std::string> *out_keys);
    int omap_get_keys2(const std::string& oid,
                       const std::string& start_after,
                       uint64_t max_return,
                       std::set<std::string> *out_keys,
                       bool *pmore);
    int omap_get_header(const std::string& oid,
                        bufferlist *bl);
    int omap_get_vals_by_keys(const std::string& oid,
                              const std::set<std::string>& keys,
                              std::map<std::string, bufferlist> *vals);
    int omap_set(const std::string& oid,
                 const std::map<std::string, bufferlist>& map);
    int omap_set_header(const std::string& oid,
                        const bufferlist& bl);
    int omap_clear(const std::string& oid);
    int omap_rm_keys(const std::string& oid,
                     const std::set<std::string>& keys);

    void snap_set_read(snap_t seq);
    int selfmanaged_snap_set_write_ctx(snap_t seq, std::vector<snap_t>& snaps);

    // Create a snapshot with a given name
    int snap_create(const char *snapname);

    // Look up a snapshot by name.
    // Returns 0 on success; error code otherwise
    int snap_lookup(const char *snapname, snap_t *snap);

    // Gets a timestamp for a snap
    int snap_get_stamp(snap_t snapid, time_t *t);

    // Gets the name of a snap
    int snap_get_name(snap_t snapid, std::string *s);

    // Remove a snapshot from this pool
    int snap_remove(const char *snapname);

    int snap_list(std::vector<snap_t> *snaps);

    int snap_rollback(const std::string& oid, const char *snapname);

    // Deprecated name kept for backward compatibility - same as snap_rollback()
    int rollback(const std::string& oid, const char *snapname)
      __attribute__ ((deprecated));

    int selfmanaged_snap_create(uint64_t *snapid);
    void aio_selfmanaged_snap_create(uint64_t *snapid, AioCompletion *c);

    int selfmanaged_snap_remove(uint64_t snapid);
    void aio_selfmanaged_snap_remove(uint64_t snapid, AioCompletion *c);

    int selfmanaged_snap_rollback(const std::string& oid, uint64_t snapid);

    // Advisory locking on rados objects.
    int lock_exclusive(const std::string &oid, const std::string &name,
                       const std::string &cookie,
                       const std::string &description,
                       struct timeval * duration, uint8_t flags);

    int lock_shared(const std::string &oid, const std::string &name,
                    const std::string &cookie, const std::string &tag,
                    const std::string &description,
                    struct timeval * duration, uint8_t flags);

    int unlock(const std::string &oid, const std::string &name,
               const std::string &cookie);

    int break_lock(const std::string &oid, const std::string &name,
                   const std::string &client, const std::string &cookie);

    int list_lockers(const std::string &oid, const std::string &name,
                     int *exclusive,
                     std::string *tag,
                     std::list<librados::locker_t> *lockers);


    /// Start enumerating objects for a pool. Errors are thrown as exceptions.
    NObjectIterator nobjects_begin(const bufferlist &filter=bufferlist());
    /// Start enumerating objects for a pool starting from a hash position.
    /// Errors are thrown as exceptions.
    NObjectIterator nobjects_begin(uint32_t start_hash_position,
                                   const bufferlist &filter=bufferlist());
    /// Start enumerating objects for a pool starting from cursor. Errors are
    /// thrown as exceptions.
    NObjectIterator nobjects_begin(const librados::ObjectCursor& cursor,
                                   const bufferlist &filter=bufferlist());
    /// Iterator indicating the end of a pool
    const NObjectIterator& nobjects_end() const;

    /// Get cursor for pool beginning
    ObjectCursor object_list_begin();

    /// Get cursor for pool end
    ObjectCursor object_list_end();

    /// Check whether a cursor is at the end of a pool
    bool object_list_is_end(const ObjectCursor &oc);

    /// List some objects between two cursors
    int object_list(const ObjectCursor &start, const ObjectCursor &finish,
                    const size_t result_count,
                    const bufferlist &filter,
                    std::vector<ObjectItem> *result,
                    ObjectCursor *next);

    /// Generate cursors that include the N out of Mth slice of the pool
    void object_list_slice(
      const ObjectCursor start,
      const ObjectCursor finish,
      const size_t n,
      const size_t m,
      ObjectCursor *split_start,
      ObjectCursor *split_finish);

    /**
     * List available hit set objects
     *
     * @param hash [in] hash position to query
     * @param c [in] completion
     * @param pls [out] list of available intervals
     */
    int hit_set_list(uint32_t hash, AioCompletion *c,
                     std::list< std::pair<time_t, time_t> > *pls);

    /**
     * Retrieve hit set for a given hash, and time
     *
     * @param hash [in] hash position
     * @param c [in] completion
     * @param stamp [in] time interval that falls within the hit set's interval
     * @param pbl [out] buffer to store the result in
     */
    int hit_set_get(uint32_t hash, AioCompletion *c, time_t stamp,
                    bufferlist *pbl);

    uint64_t get_last_version();

    int aio_read(const std::string& oid, AioCompletion *c,
                 bufferlist *pbl, size_t len, uint64_t off);
    /**
     * Asynchronously read from an object at a particular snapshot
     *
     * This is the same as normal aio_read, except that it chooses
     * the snapshot to read from from its arguments instead of the
     * internal IoCtx state.
     *
     * The return value of the completion will be number of bytes read on
     * success, negative error code on failure.
     *
     * @param oid the name of the object to read from
     * @param c what to do when the read is complete
     * @param pbl where to store the results
     * @param len the number of bytes to read
     * @param off the offset to start reading from in the object
     * @param snapid the id of the snapshot to read from
     * @returns 0 on success, negative error code on failure
     */
    int aio_read(const std::string& oid, AioCompletion *c,
                 bufferlist *pbl, size_t len, uint64_t off, uint64_t snapid);
    int aio_sparse_read(const std::string& oid, AioCompletion *c,
                        std::map<uint64_t,uint64_t> *m, bufferlist *data_bl,
                        size_t len, uint64_t off);
    /**
     * Asynchronously read existing extents from an object at a
     * particular snapshot
     *
     * This is the same as normal aio_sparse_read, except that it chooses
     * the snapshot to read from from its arguments instead of the
     * internal IoCtx state.
     *
     * m will be filled in with a map of extents in the object,
     * mapping offsets to lengths (in bytes) within the range
     * requested. The data for all of the extents are stored
     * back-to-back in offset order in data_bl.
     *
     * @param oid the name of the object to read from
     * @param c what to do when the read is complete
     * @param m where to store the map of extents
     * @param data_bl where to store the data
     * @param len the number of bytes to read
     * @param off the offset to start reading from in the object
     * @param snapid the id of the snapshot to read from
     * @returns 0 on success, negative error code on failure
     */
    int aio_sparse_read(const std::string& oid, AioCompletion *c,
                        std::map<uint64_t,uint64_t> *m, bufferlist *data_bl,
                        size_t len, uint64_t off, uint64_t snapid);
    /**
     * Asynchronously compare an on-disk object range with a buffer
     *
     * @param oid the name of the object to read from
     * @param c what to do when the read is complete
     * @param off object byte offset at which to start the comparison
     * @param cmp_bl buffer containing bytes to be compared with object contents
     * @returns 0 on success, negative error code on failure,
     *  (-MAX_ERRNO - mismatch_off) on mismatch
     */
    int aio_cmpext(const std::string& oid,
                   librados::AioCompletion *c,
                   uint64_t off,
                   bufferlist& cmp_bl);
    int aio_write(const std::string& oid, AioCompletion *c, const bufferlist& bl,
                  size_t len, uint64_t off);
    int aio_append(const std::string& oid, AioCompletion *c, const bufferlist& bl,
                   size_t len);
    int aio_write_full(const std::string& oid, AioCompletion *c, const bufferlist& bl);
    int aio_writesame(const std::string& oid, AioCompletion *c, const bufferlist& bl,
                      size_t write_len, uint64_t off);

    /**
     * Asynchronously remove an object
     *
     * Queues the remove and returns.
     *
     * The return value of the completion will be 0 on success, negative
     * error code on failure.
     *
     * @param oid the name of the object
     * @param c what to do when the remove is safe and complete
     * @returns 0 on success, -EROFS if the io context specifies a snap_seq
     * other than SNAP_HEAD
     */
    int aio_remove(const std::string& oid, AioCompletion *c);
    int aio_remove(const std::string& oid, AioCompletion *c, int flags);

    /**
     * Wait for all currently pending aio writes to be safe.
     *
     * @returns 0 on success, negative error code on failure
     */
    int aio_flush();

    /**
     * Schedule a callback for when all currently pending
     * aio writes are safe. This is a non-blocking version of
     * aio_flush().
     *
     * @param c what to do when the writes are safe
     * @returns 0 on success, negative error code on failure
     */
    int aio_flush_async(AioCompletion *c);
    int aio_getxattr(const std::string& oid, AioCompletion *c, const char *name, bufferlist& bl);
    int aio_getxattrs(const std::string& oid, AioCompletion *c, std::map<std::string, bufferlist>& attrset);
    int aio_setxattr(const std::string& oid, AioCompletion *c, const char *name, bufferlist& bl);
    int aio_rmxattr(const std::string& oid, AioCompletion *c, const char *name);
    int aio_stat(const std::string& oid, AioCompletion *c, uint64_t *psize, time_t *pmtime);
    int aio_stat2(const std::string& oid, AioCompletion *c, uint64_t *psize, struct timespec *pts);

    /**
     * Cancel aio operation
     *
     * @param c completion handle
     * @returns 0 on success, negative error code on failure
     */
    int aio_cancel(AioCompletion *c);

    int aio_exec(const std::string& oid, AioCompletion *c, const char *cls, const char *method,
                 bufferlist& inbl, bufferlist *outbl);

    /*
     * asynchronous version of unlock
     */
    int aio_unlock(const std::string &oid, const std::string &name,
                   const std::string &cookie, AioCompletion *c);

    // compound object operations
    int operate(const std::string& oid, ObjectWriteOperation *op);
    int operate(const std::string& oid, ObjectWriteOperation *op, int flags);
    int operate(const std::string& oid, ObjectReadOperation *op, bufferlist *pbl);
    int operate(const std::string& oid, ObjectReadOperation *op, bufferlist *pbl, int flags);
    int aio_operate(const std::string& oid, AioCompletion *c, ObjectWriteOperation *op);
    int aio_operate(const std::string& oid, AioCompletion *c, ObjectWriteOperation *op, int flags);
    /**
     * Schedule an async write operation with explicit snapshot parameters
     *
     * This is the same as the first aio_operate(), except that it
     * gets the snapshot context from its arguments instead of the
     * IoCtx internal state.
     *
     * @param oid the object to operate on
     * @param c what to do when the operation is complete and safe
     * @param op which operations to perform
     * @param seq latest selfmanaged snapshot sequence number for this object
     * @param snaps currently existing selfmanaged snapshot ids for this object
     * @returns 0 on success, negative error code on failure
     */
    int aio_operate(const std::string& oid, AioCompletion *c,
                    ObjectWriteOperation *op, snap_t seq,
                    std::vector<snap_t>& snaps);
    int aio_operate(const std::string& oid, AioCompletion *c,
                    ObjectWriteOperation *op, snap_t seq,
                    std::vector<snap_t>& snaps,
                    const blkin_trace_info *trace_info);
    int aio_operate(const std::string& oid, AioCompletion *c,
                    ObjectWriteOperation *op, snap_t seq,
                    std::vector<snap_t>& snaps, int flags,
                    const blkin_trace_info *trace_info);
    int aio_operate(const std::string& oid, AioCompletion *c,
                    ObjectReadOperation *op, bufferlist *pbl);

    int aio_operate(const std::string& oid, AioCompletion *c,
                    ObjectReadOperation *op, snap_t snapid, int flags,
                    bufferlist *pbl)
      __attribute__ ((deprecated));

    int aio_operate(const std::string& oid, AioCompletion *c,
                    ObjectReadOperation *op, int flags,
                    bufferlist *pbl);
    int aio_operate(const std::string& oid, AioCompletion *c,
                    ObjectReadOperation *op, int flags,
                    bufferlist *pbl, const blkin_trace_info *trace_info);

    // watch/notify
    // NOTE(review): watch3/aio_watch2 add a client-specified timeout;
    // units are presumably seconds (matching the C API) — confirm against
    // rados_watch3() before relying on this.
    int watch2(const std::string& o, uint64_t *handle,
               librados::WatchCtx2 *ctx);
    int watch3(const std::string& o, uint64_t *handle,
               librados::WatchCtx2 *ctx, uint32_t timeout);
    int aio_watch(const std::string& o, AioCompletion *c, uint64_t *handle,
                  librados::WatchCtx2 *ctx);
    int aio_watch2(const std::string& o, AioCompletion *c, uint64_t *handle,
                   librados::WatchCtx2 *ctx, uint32_t timeout);
    int unwatch2(uint64_t handle);
    int aio_unwatch(uint64_t handle, AioCompletion *c);
    /**
     * Send a notify event to watchers
     *
     * Upon completion the pbl bufferlist reply payload will be
     * encoded like so:
     *
     * le32 num_acks
     * {
     *   le64 gid     global id for the client (for client.1234 that's 1234)
     *   le64 cookie  cookie for the client
     *   le32 buflen  length of reply message buffer
     *   u8 * buflen  payload
     * } * num_acks
     * le32 num_timeouts
     * {
     *   le64 gid     global id for the client
     *   le64 cookie  cookie for the client
     * } * num_timeouts
     *
     *
     */
    int notify2(const std::string& o,   ///< object
                bufferlist& bl,         ///< optional broadcast payload
                uint64_t timeout_ms,    ///< timeout (in ms)
                bufferlist *pbl);       ///< reply buffer
    int aio_notify(const std::string& o,   ///< object
                   AioCompletion *c,       ///< completion when notify completes
                   bufferlist& bl,         ///< optional broadcast payload
                   uint64_t timeout_ms,    ///< timeout (in ms)
                   bufferlist *pbl);       ///< reply buffer
    /*
     * Decode a notify response into acks and timeout vectors.
     */
    void decode_notify_response(bufferlist &bl,
                                std::vector<librados::notify_ack_t> *acks,
                                std::vector<librados::notify_timeout_t> *timeouts);

    int list_watchers(const std::string& o, std::list<obj_watch_t> *out_watchers);
    int list_snaps(const std::string& o, snap_set_t *out_snaps);
    void set_notify_timeout(uint32_t timeout);

    /// acknowledge a notify we received.
    void notify_ack(const std::string& o, ///< watched object
                    uint64_t notify_id,   ///< notify id
                    uint64_t cookie,      ///< our watch handle
                    bufferlist& bl);      ///< optional reply payload

    /***
     * check on watch validity
     *
     * Check if a watch is valid.  If so, return the number of
     * milliseconds since we last confirmed its liveness.  If there is
     * a known error, return it.
     *
     * If there is an error, the watch is no longer valid, and should
     * be destroyed with unwatch().  If the user is still interested in
     * the object, a new watch should be created with watch().
     *
     * @param cookie watch handle
     * @returns ms since last confirmed valid, or error
     */
    int watch_check(uint64_t cookie);

    // old, deprecated versions
    int watch(const std::string& o, uint64_t ver, uint64_t *cookie,
              librados::WatchCtx *ctx) __attribute__ ((deprecated));
    int notify(const std::string& o, uint64_t ver, bufferlist& bl)
      __attribute__ ((deprecated));
    int unwatch(const std::string& o, uint64_t cookie)
      __attribute__ ((deprecated));

    /**
     * Set allocation hint for an object
     *
     * This is an advisory operation, it will always succeed (as if it
     * was submitted with a OP_FAILOK flag set) and is not guaranteed
     * to do anything on the backend.
     *
     * @param o the name of the object
     * @param expected_object_size expected size of the object, in bytes
     * @param expected_write_size expected size of writes to the object, in bytes
     * @returns 0 on success, negative error code on failure
     */
    int set_alloc_hint(const std::string& o,
                       uint64_t expected_object_size,
                       uint64_t expected_write_size);
    int set_alloc_hint2(const std::string& o,
                        uint64_t expected_object_size,
                        uint64_t expected_write_size,
                        uint32_t flags);

    // assert version for next sync operations
    void set_assert_version(uint64_t ver);

    /**
     * Pin/unpin an object in cache tier
     *
     * @param o the name of the object
     * @returns 0 on success, negative error code on failure
     */
    int cache_pin(const std::string& o);
    int cache_unpin(const std::string& o);

    std::string get_pool_name() const;

    void locator_set_key(const std::string& key);
    void set_namespace(const std::string& nspace);
    std::string get_namespace() const;

    int64_t get_id();

    // deprecated versions
    uint32_t get_object_hash_position(const std::string& oid)
      __attribute__ ((deprecated));
    uint32_t get_object_pg_hash_position(const std::string& oid)
      __attribute__ ((deprecated));

    int get_object_hash_position2(const std::string& oid, uint32_t *hash_position);
    int get_object_pg_hash_position2(const std::string& oid, uint32_t *pg_hash_position);

    config_t cct();

    void set_osdmap_full_try()
      __attribute__ ((deprecated));
    void unset_osdmap_full_try()
      __attribute__ ((deprecated));

    bool get_pool_full_try();
    void set_pool_full_try();
    void unset_pool_full_try();

    int application_enable(const std::string& app_name, bool force);
    int application_enable_async(const std::string& app_name,
                                 bool force, PoolAsyncCompletion *c);
    int application_list(std::set<std::string> *app_names);
    int application_metadata_get(const std::string& app_name,
                                 const std::string &key,
                                 std::string *value);
    int application_metadata_set(const std::string& app_name,
                                 const std::string &key,
                                 const std::string& value);
    int application_metadata_remove(const std::string& app_name,
                                    const std::string &key);
    int application_metadata_list(const std::string& app_name,
                                  std::map<std::string, std::string> *values);

  private:
    /* You can only get IoCtx instances from Rados */
    IoCtx(IoCtxImpl *io_ctx_impl_);

    friend class Rados; // Only Rados can use our private constructor to create IoCtxes.
    friend class libradosstriper::RadosStriper; // Striper needs to see our IoCtxImpl
    friend class ObjectWriteOperation;  // copy_from needs to see our IoCtxImpl
    friend class ObjectReadOperation;   // set_chunk needs to see our IoCtxImpl

    IoCtxImpl *io_ctx_impl;
  };

  // A placement-group identifier.  parse() fills the pg from its textual
  // form and returns false on malformed input (presumably "<pool>.<seed>";
  // confirm against PlacementGroupImpl).
  struct CEPH_RADOS_API PlacementGroup {
    PlacementGroup();
    PlacementGroup(const PlacementGroup&);
    ~PlacementGroup();
    bool parse(const char*);
    std::unique_ptr<PlacementGroupImpl> impl;
  };

  CEPH_RADOS_API std::ostream& operator<<(std::ostream&, const PlacementGroup&);

  class CEPH_RADOS_API Rados
  {
  public:
    static void version(int *major, int *minor, int *extra);

    Rados();
    explicit Rados(IoCtx& ioctx);
    ~Rados();
    static void from_rados_t(rados_t cluster, Rados &rados);

    // Cluster handle lifecycle: init*()/connect() bring the handle up,
    // shutdown() tears it down.
    int init(const char * const id);
    int init2(const char * const name, const char * const clustername,
              uint64_t flags);
    int init_with_context(config_t cct_);
    config_t cct();
    int connect();
    void shutdown();
    int watch_flush();
    int aio_watch_flush(AioCompletion*);
    int conf_read_file(const char * const path) const;
    int conf_parse_argv(int argc, const char ** argv) const;
    int conf_parse_argv_remainder(int argc, const char ** argv,
                                  const char ** remargv) const;
    int conf_parse_env(const char *env) const;
    int conf_set(const char *option, const char *value);
    int conf_get(const char *option, std::string &val);

    int service_daemon_register(
      const std::string& service,  ///< service name (e.g., 'rgw')
      const std::string& name,     ///< daemon name (e.g., 'gwfoo')
      const std::map<std::string,std::string>& metadata); ///< static metadata about daemon
    int service_daemon_update_status(
      std::map<std::string,std::string>&& status);

    int pool_create(const char *name);
    int pool_create(const char *name, uint64_t auid)
      __attribute__ ((deprecated));
    int pool_create(const char *name, uint64_t auid, uint8_t crush_rule)
      __attribute__ ((deprecated));
    int pool_create_with_rule(const char *name, uint8_t crush_rule);
    int pool_create_async(const char *name, PoolAsyncCompletion *c);
    int pool_create_async(const char *name, uint64_t auid, PoolAsyncCompletion *c)
      __attribute__ ((deprecated));
    int pool_create_async(const char *name, uint64_t auid, uint8_t crush_rule, PoolAsyncCompletion *c)
      __attribute__ ((deprecated));
    int pool_create_with_rule_async(const char *name, uint8_t crush_rule, PoolAsyncCompletion *c);
    int pool_get_base_tier(int64_t pool, int64_t* base_tier);
    int pool_delete(const char *name);
    int pool_delete_async(const char *name, PoolAsyncCompletion *c);
    int64_t pool_lookup(const char *name);
    int pool_reverse_lookup(int64_t id, std::string *name);

    uint64_t get_instance_id();

    int get_min_compatible_osd(int8_t* require_osd_release);
    int get_min_compatible_client(int8_t* min_compat_client,
                                  int8_t* require_min_compat_client);

    int mon_command(std::string cmd, const bufferlist& inbl,
                    bufferlist *outbl, std::string *outs);
    int mgr_command(std::string cmd, const bufferlist& inbl,
                    bufferlist *outbl, std::string *outs);
    int osd_command(int osdid, std::string cmd, const bufferlist& inbl,
                    bufferlist *outbl, std::string *outs);
    int pg_command(const char *pgstr, std::string cmd, const bufferlist& inbl,
                   bufferlist *outbl, std::string *outs);

    int ioctx_create(const char *name, IoCtx &pioctx);
    int ioctx_create2(int64_t pool_id, IoCtx &pioctx);

    // Features useful for test cases
    void test_blocklist_self(bool set);

    /* pool info */
    int
pool_list(std::list<std::string>& v); + int pool_list2(std::list<std::pair<int64_t, std::string> >& v); + int get_pool_stats(std::list<std::string>& v, + stats_map& result); + /// deprecated; use simpler form. categories no longer supported. + int get_pool_stats(std::list<std::string>& v, + std::map<std::string, stats_map>& stats); + /// deprecated; categories no longer supported + int get_pool_stats(std::list<std::string>& v, + std::string& category, + std::map<std::string, stats_map>& stats); + /// check if pool has selfmanaged snaps + bool get_pool_is_selfmanaged_snaps_mode(const std::string& poolname); + + int cluster_stat(cluster_stat_t& result); + int cluster_fsid(std::string *fsid); + + /** + * List inconsistent placement groups in the given pool + * + * @param pool_id the pool id + * @param pgs [out] the inconsistent PGs + */ + int get_inconsistent_pgs(int64_t pool_id, + std::vector<PlacementGroup>* pgs); + /** + * List the inconsistent objects found in a given PG by last scrub + * + * @param pg the placement group returned by @c pg_list() + * @param start_after the first returned @c objects + * @param max_return the max number of the returned @c objects + * @param c what to do when the operation is complete and safe + * @param objects [out] the objects where inconsistencies are found + * @param interval [in,out] an epoch indicating current interval + * @returns if a non-zero @c interval is specified, will return -EAGAIN i + * the current interval begin epoch is different. 
+ */ + int get_inconsistent_objects(const PlacementGroup& pg, + const object_id_t &start_after, + unsigned max_return, + AioCompletion *c, + std::vector<inconsistent_obj_t>* objects, + uint32_t* interval); + /** + * List the inconsistent snapsets found in a given PG by last scrub + * + * @param pg the placement group returned by @c pg_list() + * @param start_after the first returned @c objects + * @param max_return the max number of the returned @c objects + * @param c what to do when the operation is complete and safe + * @param snapsets [out] the objects where inconsistencies are found + * @param interval [in,out] an epoch indicating current interval + * @returns if a non-zero @c interval is specified, will return -EAGAIN i + * the current interval begin epoch is different. + */ + int get_inconsistent_snapsets(const PlacementGroup& pg, + const object_id_t &start_after, + unsigned max_return, + AioCompletion *c, + std::vector<inconsistent_snapset_t>* snapset, + uint32_t* interval); + + /// get/wait for the most recent osdmap + int wait_for_latest_osdmap(); + + int blocklist_add(const std::string& client_address, + uint32_t expire_seconds); + + std::string get_addrs() const; + + /* + * pool aio + * + * It is up to the caller to release the completion handler, even if the pool_create_async() + * and/or pool_delete_async() fails and does not send the async request + */ + static PoolAsyncCompletion *pool_async_create_completion(); + + // -- aio -- + static AioCompletion *aio_create_completion(); + static AioCompletion *aio_create_completion(void *cb_arg, callback_t cb_complete, + callback_t cb_safe) + __attribute__ ((deprecated)); + static AioCompletion *aio_create_completion(void *cb_arg, callback_t cb_complete); + + friend std::ostream& operator<<(std::ostream &oss, const Rados& r); + private: + friend class neorados::RADOS; + + // We don't allow assignment or copying + Rados(const Rados& rhs); + const Rados& operator=(const Rados& rhs); + RadosClient *client; + }; 
+ +} // namespace v14_2_0 +} // namespace librados + +#endif + diff --git a/src/include/rados/librados_fwd.hpp b/src/include/rados/librados_fwd.hpp new file mode 100644 index 000000000..396f3a838 --- /dev/null +++ b/src/include/rados/librados_fwd.hpp @@ -0,0 +1,34 @@ +#ifndef __LIBRADOS_FWD_HPP +#define __LIBRADOS_FWD_HPP + +struct blkin_trace_info; + +namespace libradosstriper { + +class RadosStriper; + +} // namespace libradosstriper + +namespace librados { +inline namespace v14_2_0 { + +class AioCompletion; +class IoCtx; +class ListObject; +class NObjectIterator; +class ObjectCursor; +class ObjectItem; +class ObjectOperation; +class ObjectOperationCompletion; +class ObjectReadOperation; +class ObjectWriteOperation; +class PlacementGroup; +class PoolAsyncCompletion; +class Rados; +class WatchCtx; +class WatchCtx2; + +} // inline namespace v14_2_0 +} // namespace librados + +#endif // __LIBRADOS_FWD_HPP diff --git a/src/include/rados/librgw.h b/src/include/rados/librgw.h new file mode 100644 index 000000000..c20e96bed --- /dev/null +++ b/src/include/rados/librgw.h @@ -0,0 +1,36 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2011 New Dream Network + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. 
+ * + */ +#ifndef CEPH_LIBRGW_H +#define CEPH_LIBRGW_H + +#ifdef __cplusplus +extern "C" { +#endif + +#define LIBRGW_VER_MAJOR 1 +#define LIBRGW_VER_MINOR 1 +#define LIBRGW_VER_EXTRA 0 + +#define LIBRGW_VERSION(maj, min, extra) ((maj << 16) + (min << 8) + extra) +#define LIBRGW_VERSION_CODE LIBRGW_VERSION(LIBRGW_VER_MAJOR, LIBRGW_VER_MINOR, LIBRGW_VER_EXTRA) + +typedef void* librgw_t; +int librgw_create(librgw_t *rgw, int argc, char **argv); +void librgw_shutdown(librgw_t rgw); + +#ifdef __cplusplus +} +#endif + +#endif /* CEPH_LIBRGW_H */ diff --git a/src/include/rados/objclass.h b/src/include/rados/objclass.h new file mode 100644 index 000000000..80ae69d25 --- /dev/null +++ b/src/include/rados/objclass.h @@ -0,0 +1,177 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#ifndef CEPH_OBJCLASS_OBJCLASS_PUBLIC_H +#define CEPH_OBJCLASS_OBJCLASS_PUBLIC_H + +#ifdef __cplusplus + +#include "buffer.h" + +extern "C" { +#endif + +#define CEPH_CLS_API [[gnu::visibility("default")]] + +#define CLS_VER(maj,min) \ +int __cls_ver__## maj ## _ ##min = 0; \ +int __cls_ver_maj = maj; \ +int __cls_ver_min = min; + +#define CLS_NAME(name) \ +int __cls_name__## name = 0; \ +const char *__cls_name = #name; + +#define CLS_INIT(name) \ +CEPH_CLS_API void __cls_init() + +#define CLS_METHOD_RD 0x1 /// method executes read operations +#define CLS_METHOD_WR 0x2 /// method executes write operations +#define CLS_METHOD_PROMOTE 0x8 /// method cannot be proxied to base tier + +#define CLS_LOG(level, fmt, ...) \ + cls_log(level, "<cls> %s:%d: " fmt, __FILE__, __LINE__, ##__VA_ARGS__) +#define CLS_ERR(fmt, ...) CLS_LOG(0, fmt, ##__VA_ARGS__) + +/** + * Initialize a class. + */ +void __cls_init(); + +/** + * @typdef cls_handle_t + * + * A handle for interacting with the object class. + */ +typedef void *cls_handle_t; + +/** + * @typedef cls_method_handle_t + * + * A handle for interacting with the method of the object class. 
+ */ +typedef void *cls_method_handle_t; + +/** + * @typedef cls_method_context_t + * + * A context for the method of the object class. + */ +typedef void* cls_method_context_t; + +/*class utils*/ +extern int cls_log(int level, const char *format, ...) + __attribute__((__format__(printf, 2, 3))); + +/* class registration api */ +extern int cls_register(const char *name, cls_handle_t *handle); + +#ifdef __cplusplus +} + +/** + * @typedef cls_method_cxx_call_t + * + */ +typedef int (*cls_method_cxx_call_t)(cls_method_context_t ctx, + class ceph::buffer::list *inbl, class ceph::buffer::list *outbl); + +/** + * Register a method. + * + * @param hclass + * @param method + * @param flags + * @param class_call + * @param handle + */ +extern int cls_register_cxx_method(cls_handle_t hclass, const char *method, int flags, + cls_method_cxx_call_t class_call, cls_method_handle_t *handle); + +/** + * Create an object. + * + * @param hctx + * @param exclusive + */ +extern int cls_cxx_create(cls_method_context_t hctx, bool exclusive); + +/** + * Remove an object. + * + * @param hctx + */ +extern int cls_cxx_remove(cls_method_context_t hctx); + +/** + * Check on the status of an object. + * + * @param hctx + * @param size + * @param mtime + */ +extern int cls_cxx_stat(cls_method_context_t hctx, uint64_t *size, time_t *mtime); + +/** + * Read contents of an object. + * + * @param hctx + * @param ofs + * @param len + * @param bl + */ +extern int cls_cxx_read(cls_method_context_t hctx, int ofs, int len, ceph::bufferlist *bl); + +/** + * Write to the object. + * + * @param hctx + * @param ofs + * @param len + * @param bl + */ +extern int cls_cxx_write(cls_method_context_t hctx, int ofs, int len, ceph::bufferlist *bl); + +/** + * Get xattr of the object. + * + * @param hctx + * @param name + * @param outbl + */ +extern int cls_cxx_getxattr(cls_method_context_t hctx, const char *name, + ceph::bufferlist *outbl); + +/** + * Set xattr of the object. 
+ * + * @param hctx + * @param name + * @param inbl + */ +extern int cls_cxx_setxattr(cls_method_context_t hctx, const char *name, + ceph::bufferlist *inbl); + +/** + * Get value corresponding to a key from the map. + * + * @param hctx + * @param key + * @param outbl + */ +extern int cls_cxx_map_get_val(cls_method_context_t hctx, + const std::string &key, ceph::bufferlist *outbl); + +/** + * Set value corresponding to a key in the map. + * + * @param hctx + * @param key + * @param inbl + */ +extern int cls_cxx_map_set_val(cls_method_context_t hctx, + const std::string &key, ceph::bufferlist *inbl); + +#endif + +#endif diff --git a/src/include/rados/page.h b/src/include/rados/page.h new file mode 120000 index 000000000..cf983e838 --- /dev/null +++ b/src/include/rados/page.h @@ -0,0 +1 @@ +../page.h
\ No newline at end of file diff --git a/src/include/rados/rados_types.h b/src/include/rados/rados_types.h new file mode 100644 index 000000000..d308341ec --- /dev/null +++ b/src/include/rados/rados_types.h @@ -0,0 +1,41 @@ +#ifndef CEPH_RADOS_TYPES_H +#define CEPH_RADOS_TYPES_H + +#include <stdint.h> + +/** + * @struct obj_watch_t + * One item from list_watchers + */ +struct obj_watch_t { + /// Address of the Watcher + char addr[256]; + /// Watcher ID + int64_t watcher_id; + /// Cookie + uint64_t cookie; + /// Timeout in Seconds + uint32_t timeout_seconds; +}; + +struct notify_ack_t { + uint64_t notifier_id; + uint64_t cookie; + char *payload; + uint64_t payload_len; +}; + +struct notify_timeout_t { + uint64_t notifier_id; + uint64_t cookie; +}; + +/** + * + * Pass as nspace argument to rados_ioctx_set_namespace() + * before calling rados_nobjects_list_open() to return + * all objects in all namespaces. + */ +#define LIBRADOS_ALL_NSPACES "\001" + +#endif diff --git a/src/include/rados/rados_types.hpp b/src/include/rados/rados_types.hpp new file mode 100644 index 000000000..84023579b --- /dev/null +++ b/src/include/rados/rados_types.hpp @@ -0,0 +1,341 @@ +#ifndef CEPH_RADOS_TYPES_HPP +#define CEPH_RADOS_TYPES_HPP + +#include <map> +#include <utility> +#include <vector> +#include <stdint.h> +#include <string> + +#include "buffer.h" +#include "rados_types.h" + +namespace librados { + +typedef uint64_t snap_t; + +enum { + SNAP_HEAD = (uint64_t)(-2), + SNAP_DIR = (uint64_t)(-1) +}; + +struct clone_info_t { + snap_t cloneid; + std::vector<snap_t> snaps; // ascending + std::vector< std::pair<uint64_t,uint64_t> > overlap; // with next newest + uint64_t size; + clone_info_t() : cloneid(0), size(0) {} +}; + +struct snap_set_t { + std::vector<clone_info_t> clones; // ascending + snap_t seq; // newest snapid seen by the object + snap_set_t() : seq(0) {} +}; + +struct object_id_t { + std::string name; + std::string nspace; + std::string locator; + snap_t snap = 0; + 
object_id_t() = default; + object_id_t(const std::string& name, + const std::string& nspace, + const std::string& locator, + snap_t snap) + : name(name), + nspace(nspace), + locator(locator), + snap(snap) + {} +}; + +struct err_t { + enum : uint64_t { + SHARD_MISSING = 1 << 1, + SHARD_STAT_ERR = 1 << 2, + SHARD_READ_ERR = 1 << 3, + DATA_DIGEST_MISMATCH_OI = 1 << 9, // Old + DATA_DIGEST_MISMATCH_INFO = 1 << 9, + OMAP_DIGEST_MISMATCH_OI = 1 << 10, // Old + OMAP_DIGEST_MISMATCH_INFO = 1 << 10, + SIZE_MISMATCH_OI = 1 << 11, // Old + SIZE_MISMATCH_INFO = 1 << 11, + SHARD_EC_HASH_MISMATCH = 1 << 12, + SHARD_EC_SIZE_MISMATCH = 1 << 13, + OI_ATTR_MISSING = 1 << 14, // Old + INFO_MISSING = 1 << 14, + OI_ATTR_CORRUPTED = 1 << 15, // Old + INFO_CORRUPTED = 1 << 15, + SS_ATTR_MISSING = 1 << 16, // Old + SNAPSET_MISSING = 1 << 16, + SS_ATTR_CORRUPTED = 1 << 17, // Old + SNAPSET_CORRUPTED = 1 << 17, + OBJ_SIZE_OI_MISMATCH = 1 << 18, // Old + OBJ_SIZE_INFO_MISMATCH = 1 << 18, + HINFO_MISSING = 1 << 19, + HINFO_CORRUPTED = 1 << 20 + // When adding more here add to either SHALLOW_ERRORS or DEEP_ERRORS + }; + uint64_t errors = 0; + static constexpr uint64_t SHALLOW_ERRORS = SHARD_MISSING|SHARD_STAT_ERR|SIZE_MISMATCH_INFO|INFO_MISSING|INFO_CORRUPTED|SNAPSET_MISSING|SNAPSET_CORRUPTED|OBJ_SIZE_INFO_MISMATCH|HINFO_MISSING|HINFO_CORRUPTED; + static constexpr uint64_t DEEP_ERRORS = SHARD_READ_ERR|DATA_DIGEST_MISMATCH_INFO|OMAP_DIGEST_MISMATCH_INFO|SHARD_EC_HASH_MISMATCH|SHARD_EC_SIZE_MISMATCH; + bool has_shard_missing() const { + return errors & SHARD_MISSING; + } + bool has_stat_error() const { + return errors & SHARD_STAT_ERR; + } + bool has_read_error() const { + return errors & SHARD_READ_ERR; + } + bool has_data_digest_mismatch_oi() const { // Compatibility + return errors & DATA_DIGEST_MISMATCH_OI; + } + bool has_data_digest_mismatch_info() const { + return errors & DATA_DIGEST_MISMATCH_INFO; + } + bool has_omap_digest_mismatch_oi() const { // Compatibility + return errors & 
OMAP_DIGEST_MISMATCH_OI; + } + bool has_omap_digest_mismatch_info() const { + return errors & OMAP_DIGEST_MISMATCH_INFO; + } + bool has_size_mismatch_oi() const { // Compatibility + return errors & SIZE_MISMATCH_OI; + } + bool has_size_mismatch_info() const { + return errors & SIZE_MISMATCH_INFO; + } + bool has_ec_hash_error() const { + return errors & SHARD_EC_HASH_MISMATCH; + } + bool has_ec_size_error() const { + return errors & SHARD_EC_SIZE_MISMATCH; + } + bool has_oi_attr_missing() const { // Compatibility + return errors & OI_ATTR_MISSING; + } + bool has_info_missing() const { + return errors & INFO_MISSING; + } + bool has_oi_attr_corrupted() const { // Compatibility + return errors & OI_ATTR_CORRUPTED; + } + bool has_info_corrupted() const { + return errors & INFO_CORRUPTED; + } + bool has_ss_attr_missing() const { // Compatibility + return errors & SS_ATTR_MISSING; + } + bool has_snapset_missing() const { + return errors & SNAPSET_MISSING; + } + bool has_ss_attr_corrupted() const { // Compatibility + return errors & SS_ATTR_CORRUPTED; + } + bool has_snapset_corrupted() const { + return errors & SNAPSET_CORRUPTED; + } + bool has_shallow_errors() const { + return errors & SHALLOW_ERRORS; + } + bool has_deep_errors() const { + return errors & DEEP_ERRORS; + } + bool has_obj_size_oi_mismatch() const { // Compatibility + return errors & OBJ_SIZE_OI_MISMATCH; + } + bool has_obj_size_info_mismatch() const { + return errors & OBJ_SIZE_INFO_MISMATCH; + } + bool has_hinfo_missing() const { + return errors & HINFO_MISSING; + } + bool has_hinfo_corrupted() const { + return errors & HINFO_CORRUPTED; + } +}; + +struct shard_info_t : err_t { + std::map<std::string, ceph::bufferlist> attrs; + uint64_t size = -1; + bool omap_digest_present = false; + uint32_t omap_digest = 0; + bool data_digest_present = false; + uint32_t data_digest = 0; + bool selected_oi = false; + bool primary = false; +}; + +struct osd_shard_t { + int32_t osd; + int8_t shard; +}; + +inline bool 
operator<(const osd_shard_t &lhs, const osd_shard_t &rhs) { + if (lhs.osd < rhs.osd) + return true; + else if (lhs.osd > rhs.osd) + return false; + else + return lhs.shard < rhs.shard; +} + +struct obj_err_t { + enum : uint64_t { + OBJECT_INFO_INCONSISTENCY = 1 << 1, + // XXX: Can an older rados binary work if these bits stay the same? + DATA_DIGEST_MISMATCH = 1 << 4, + OMAP_DIGEST_MISMATCH = 1 << 5, + SIZE_MISMATCH = 1 << 6, + ATTR_VALUE_MISMATCH = 1 << 7, + ATTR_NAME_MISMATCH = 1 << 8, + SNAPSET_INCONSISTENCY = 1 << 9, + HINFO_INCONSISTENCY = 1 << 10, + SIZE_TOO_LARGE = 1 << 11, + // When adding more here add to either SHALLOW_ERRORS or DEEP_ERRORS + }; + uint64_t errors = 0; + static constexpr uint64_t SHALLOW_ERRORS = OBJECT_INFO_INCONSISTENCY|SIZE_MISMATCH|ATTR_VALUE_MISMATCH + |ATTR_NAME_MISMATCH|SNAPSET_INCONSISTENCY|HINFO_INCONSISTENCY|SIZE_TOO_LARGE; + static constexpr uint64_t DEEP_ERRORS = DATA_DIGEST_MISMATCH|OMAP_DIGEST_MISMATCH; + bool has_object_info_inconsistency() const { + return errors & OBJECT_INFO_INCONSISTENCY; + } + bool has_data_digest_mismatch() const { + return errors & DATA_DIGEST_MISMATCH; + } + bool has_omap_digest_mismatch() const { + return errors & OMAP_DIGEST_MISMATCH; + } + bool has_size_mismatch() const { + return errors & SIZE_MISMATCH; + } + bool has_attr_value_mismatch() const { + return errors & ATTR_VALUE_MISMATCH; + } + bool has_attr_name_mismatch() const { + return errors & ATTR_NAME_MISMATCH; + } + bool has_shallow_errors() const { + return errors & SHALLOW_ERRORS; + } + bool has_deep_errors() const { + return errors & DEEP_ERRORS; + } + bool has_snapset_inconsistency() const { + return errors & SNAPSET_INCONSISTENCY; + } + bool has_hinfo_inconsistency() const { + return errors & HINFO_INCONSISTENCY; + } + bool has_size_too_large() const { + return errors & SIZE_TOO_LARGE; + } +}; + +struct inconsistent_obj_t : obj_err_t { + inconsistent_obj_t() = default; + inconsistent_obj_t(const object_id_t& object) + : object{object}, 
version(0) + {} + object_id_t object; + uint64_t version; // XXX: Redundant with object info attr + std::map<osd_shard_t, shard_info_t> shards; + err_t union_shards; +}; + +struct inconsistent_snapset_t { + inconsistent_snapset_t() = default; + inconsistent_snapset_t(const object_id_t& head) + : object{head} + {} + enum { + SNAPSET_MISSING = 1 << 0, + SNAPSET_CORRUPTED = 1 << 1, + CLONE_MISSING = 1 << 2, + SNAP_ERROR = 1 << 3, + HEAD_MISMATCH = 1 << 4, // Unused + HEADLESS_CLONE = 1 << 5, + SIZE_MISMATCH = 1 << 6, + OI_MISSING = 1 << 7, // Old + INFO_MISSING = 1 << 7, + OI_CORRUPTED = 1 << 8, // Old + INFO_CORRUPTED = 1 << 8, + EXTRA_CLONES = 1 << 9, + }; + uint64_t errors = 0; + object_id_t object; + // Extra clones + std::vector<snap_t> clones; + std::vector<snap_t> missing; + ceph::bufferlist ss_bl; + + bool ss_attr_missing() const { // Compatibility + return errors & SNAPSET_MISSING; + } + bool snapset_missing() const { + return errors & SNAPSET_MISSING; + } + bool ss_attr_corrupted() const { // Compatibility + return errors & SNAPSET_CORRUPTED; + } + bool snapset_corrupted() const { + return errors & SNAPSET_CORRUPTED; + } + bool clone_missing() const { + return errors & CLONE_MISSING; + } + bool snapset_mismatch() const { // Compatibility + return errors & SNAP_ERROR; + } + bool snapset_error() const { + return errors & SNAP_ERROR; + } + bool head_mismatch() const { // Compatibility + return false; + } + bool headless() const { + return errors & HEADLESS_CLONE; + } + bool size_mismatch() const { + return errors & SIZE_MISMATCH; + } + bool oi_attr_missing() const { // Compatibility + return errors & OI_MISSING; + } + bool info_missing() const { + return errors & INFO_MISSING; + } + bool oi_attr_corrupted() const { // Compatibility + return errors & OI_CORRUPTED; + } + bool info_corrupted() const { + return errors & INFO_CORRUPTED; + } + bool extra_clones() const { + return errors & EXTRA_CLONES; + } +}; + +/** + * @var all_nspaces + * Pass as nspace argument 
to IoCtx::set_namespace() + * before calling nobjects_begin() to iterate + * through all objects in all namespaces. + */ +const std::string all_nspaces(LIBRADOS_ALL_NSPACES); + +struct notify_ack_t { + uint64_t notifier_id; + uint64_t cookie; + ceph::bufferlist payload_bl; +}; + +struct notify_timeout_t { + uint64_t notifier_id; + uint64_t cookie; +}; +} +#endif diff --git a/src/include/rados/rgw_file.h b/src/include/rados/rgw_file.h new file mode 100644 index 000000000..e1ea45593 --- /dev/null +++ b/src/include/rados/rgw_file.h @@ -0,0 +1,431 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * convert RGW commands to file commands + * + * Copyright (C) 2015 Red Hat, Inc. + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. + * + */ +#ifndef RADOS_RGW_FILE_H +#define RADOS_RGW_FILE_H + +#include <sys/stat.h> +#include <sys/types.h> +#include <stdint.h> +#include <stdbool.h> + +#include "librgw.h" + +#ifdef __cplusplus +extern "C" { +#endif + +#define LIBRGW_FILE_VER_MAJOR 1 +#define LIBRGW_FILE_VER_MINOR 2 +#define LIBRGW_FILE_VER_EXTRA 0 + +#define LIBRGW_FILE_VERSION(maj, min, extra) ((maj << 16) + (min << 8) + extra) +#define LIBRGW_FILE_VERSION_CODE LIBRGW_FILE_VERSION(LIBRGW_FILE_VER_MAJOR, LIBRGW_FILE_VER_MINOR, LIBRGW_FILE_VER_EXTRA) + +/* + * object types + */ +enum rgw_fh_type { + RGW_FS_TYPE_NIL = 0, + RGW_FS_TYPE_FILE, + RGW_FS_TYPE_DIRECTORY, + RGW_FS_TYPE_SYMBOLIC_LINK, +}; + +/* + * dynamic allocated handle to support nfs handle + */ + +/* content-addressable hash */ +struct rgw_fh_hk { + uint64_t bucket; + uint64_t object; +}; + +struct rgw_file_handle +{ + /* content-addressable hash */ + struct rgw_fh_hk fh_hk; + void *fh_private; /* librgw private data */ + /* object type */ + enum rgw_fh_type fh_type; +}; + +struct rgw_fs +{ + 
librgw_t rgw; + void *fs_private; + struct rgw_file_handle* root_fh; +}; + + +/* XXX mount info hypothetical--emulate Unix, support at least + * UUID-length fsid */ +struct rgw_statvfs { + uint64_t f_bsize; /* file system block size */ + uint64_t f_frsize; /* fragment size */ + uint64_t f_blocks; /* size of fs in f_frsize units */ + uint64_t f_bfree; /* # free blocks */ + uint64_t f_bavail; /* # free blocks for unprivileged users */ + uint64_t f_files; /* # inodes */ + uint64_t f_ffree; /* # free inodes */ + uint64_t f_favail; /* # free inodes for unprivileged users */ + uint64_t f_fsid[2]; /* file system ID */ + uint64_t f_flag; /* mount flags */ + uint64_t f_namemax; /* maximum filename length */ +}; + + +void rgwfile_version(int *major, int *minor, int *extra); + +/* + lookup object by name (POSIX style) +*/ +#define RGW_LOOKUP_FLAG_NONE 0x0000 +#define RGW_LOOKUP_FLAG_CREATE 0x0001 +#define RGW_LOOKUP_FLAG_RCB 0x0002 /* readdir callback hint */ +#define RGW_LOOKUP_FLAG_DIR 0x0004 +#define RGW_LOOKUP_FLAG_FILE 0x0008 + +#define RGW_LOOKUP_TYPE_FLAGS \ + (RGW_LOOKUP_FLAG_DIR|RGW_LOOKUP_FLAG_FILE) + +int rgw_lookup(struct rgw_fs *rgw_fs, + struct rgw_file_handle *parent_fh, const char *path, + struct rgw_file_handle **fh, + struct stat *st, uint32_t mask, uint32_t flags); + +/* + lookup object by handle (NFS style) +*/ +int rgw_lookup_handle(struct rgw_fs *rgw_fs, struct rgw_fh_hk *fh_hk, + struct rgw_file_handle **fh, uint32_t flags); + +/* + * release file handle + */ +#define RGW_FH_RELE_FLAG_NONE 0x0000 + +int rgw_fh_rele(struct rgw_fs *rgw_fs, struct rgw_file_handle *fh, + uint32_t flags); + +/* + attach rgw namespace +*/ +#define RGW_MOUNT_FLAG_NONE 0x0000 + +int rgw_mount(librgw_t rgw, const char *uid, const char *key, + const char *secret, struct rgw_fs **rgw_fs, + uint32_t flags); + +int rgw_mount2(librgw_t rgw, const char *uid, const char *key, + const char *secret, const char *root, struct rgw_fs **rgw_fs, + uint32_t flags); + +/* + register invalidate 
callbacks +*/ +#define RGW_REG_INVALIDATE_FLAG_NONE 0x0000 + +typedef void (*rgw_fh_callback_t)(void *handle, struct rgw_fh_hk fh_hk); + +int rgw_register_invalidate(struct rgw_fs *rgw_fs, rgw_fh_callback_t cb, + void *arg, uint32_t flags); + +/* + detach rgw namespace +*/ +#define RGW_UMOUNT_FLAG_NONE 0x0000 + +int rgw_umount(struct rgw_fs *rgw_fs, uint32_t flags); + + +/* + get filesystem attributes +*/ +#define RGW_STATFS_FLAG_NONE 0x0000 + +int rgw_statfs(struct rgw_fs *rgw_fs, + struct rgw_file_handle *parent_fh, + struct rgw_statvfs *vfs_st, + uint32_t flags); + + +/* XXX (get|set)attr mask bits */ +#define RGW_SETATTR_MODE 1 +#define RGW_SETATTR_UID 2 +#define RGW_SETATTR_GID 4 +#define RGW_SETATTR_MTIME 8 +#define RGW_SETATTR_ATIME 16 +#define RGW_SETATTR_SIZE 32 +#define RGW_SETATTR_CTIME 64 + +/* + create file +*/ +#define RGW_CREATE_FLAG_NONE 0x0000 + +int rgw_create(struct rgw_fs *rgw_fs, struct rgw_file_handle *parent_fh, + const char *name, struct stat *st, uint32_t mask, + struct rgw_file_handle **fh, uint32_t posix_flags, + uint32_t flags); + +/* + create a symbolic link + */ +#define RGW_CREATELINK_FLAG_NONE 0x0000 +int rgw_symlink(struct rgw_fs *rgw_fs, struct rgw_file_handle *parent_fh, + const char *name, const char *link_path, struct stat *st, + uint32_t mask, struct rgw_file_handle **fh, uint32_t posix_flags, + uint32_t flags); + +/* + create a new directory +*/ +#define RGW_MKDIR_FLAG_NONE 0x0000 + +int rgw_mkdir(struct rgw_fs *rgw_fs, + struct rgw_file_handle *parent_fh, + const char *name, struct stat *st, uint32_t mask, + struct rgw_file_handle **fh, uint32_t flags); + +/* + rename object +*/ +#define RGW_RENAME_FLAG_NONE 0x0000 + +int rgw_rename(struct rgw_fs *rgw_fs, + struct rgw_file_handle *olddir, const char* old_name, + struct rgw_file_handle *newdir, const char* new_name, + uint32_t flags); + +/* + remove file or directory +*/ +#define RGW_UNLINK_FLAG_NONE 0x0000 + +int rgw_unlink(struct rgw_fs *rgw_fs, + struct rgw_file_handle 
*parent_fh, const char* path, + uint32_t flags); + +/* + read directory content +*/ +typedef int (*rgw_readdir_cb)(const char *name, void *arg, uint64_t offset, + struct stat *st, uint32_t mask, + uint32_t flags); + +#define RGW_READDIR_FLAG_NONE 0x0000 +#define RGW_READDIR_FLAG_DOTDOT 0x0001 /* send dot names */ + +int rgw_readdir(struct rgw_fs *rgw_fs, + struct rgw_file_handle *parent_fh, uint64_t *offset, + rgw_readdir_cb rcb, void *cb_arg, bool *eof, + uint32_t flags); + +/* enumeration continuing from name */ +int rgw_readdir2(struct rgw_fs *rgw_fs, + struct rgw_file_handle *parent_fh, const char *name, + rgw_readdir_cb rcb, void *cb_arg, bool *eof, + uint32_t flags); + +/* project offset of dirent name */ +#define RGW_DIRENT_OFFSET_FLAG_NONE 0x0000 + +int rgw_dirent_offset(struct rgw_fs *rgw_fs, + struct rgw_file_handle *parent_fh, + const char *name, int64_t *offset, + uint32_t flags); + +/* + get unix attributes for object +*/ +#define RGW_GETATTR_FLAG_NONE 0x0000 + +int rgw_getattr(struct rgw_fs *rgw_fs, + struct rgw_file_handle *fh, struct stat *st, + uint32_t flags); + +/* + set unix attributes for object +*/ +#define RGW_SETATTR_FLAG_NONE 0x0000 + +int rgw_setattr(struct rgw_fs *rgw_fs, + struct rgw_file_handle *fh, struct stat *st, + uint32_t mask, uint32_t flags); + +/* + truncate file +*/ +#define RGW_TRUNCATE_FLAG_NONE 0x0000 + +int rgw_truncate(struct rgw_fs *rgw_fs, + struct rgw_file_handle *fh, uint64_t size, + uint32_t flags); + +/* + open file +*/ +#define RGW_OPEN_FLAG_NONE 0x0000 +#define RGW_OPEN_FLAG_CREATE 0x0001 +#define RGW_OPEN_FLAG_V3 0x0002 /* ops have v3 semantics */ +#define RGW_OPEN_FLAG_STATELESS 0x0002 /* alias it */ + +int rgw_open(struct rgw_fs *rgw_fs, struct rgw_file_handle *parent_fh, + uint32_t posix_flags, uint32_t flags); + +/* + close file +*/ + +#define RGW_CLOSE_FLAG_NONE 0x0000 +#define RGW_CLOSE_FLAG_RELE 0x0001 + +int rgw_close(struct rgw_fs *rgw_fs, struct rgw_file_handle *fh, + uint32_t flags); + +/* + read data 
from file +*/ +#define RGW_READ_FLAG_NONE 0x0000 + +int rgw_read(struct rgw_fs *rgw_fs, + struct rgw_file_handle *fh, uint64_t offset, + size_t length, size_t *bytes_read, void *buffer, + uint32_t flags); + +/* + read symbolic link +*/ +#define RGW_READLINK_FLAG_NONE 0x0000 + +int rgw_readlink(struct rgw_fs *rgw_fs, + struct rgw_file_handle *fh, uint64_t offset, + size_t length, size_t *bytes_read, void *buffer, + uint32_t flags); + +/* + write data to file +*/ +#define RGW_WRITE_FLAG_NONE 0x0000 + +int rgw_write(struct rgw_fs *rgw_fs, + struct rgw_file_handle *fh, uint64_t offset, + size_t length, size_t *bytes_written, void *buffer, + uint32_t flags); + +#define RGW_UIO_NONE 0x0000 +#define RGW_UIO_GIFT 0x0001 +#define RGW_UIO_FREE 0x0002 +#define RGW_UIO_BUFQ 0x0004 + +struct rgw_uio; +typedef void (*rgw_uio_release)(struct rgw_uio *, uint32_t); + +/* buffer vector descriptors */ +struct rgw_vio { + void *vio_p1; + void *vio_u1; + void *vio_base; + int32_t vio_len; +}; + +struct rgw_uio { + rgw_uio_release uio_rele; + void *uio_p1; + void *uio_u1; + uint64_t uio_offset; + uint64_t uio_resid; + uint32_t uio_cnt; + uint32_t uio_flags; + struct rgw_vio *uio_vio; /* appended vectors */ +}; + +typedef struct rgw_uio rgw_uio; + +int rgw_readv(struct rgw_fs *rgw_fs, + struct rgw_file_handle *fh, rgw_uio *uio, uint32_t flags); + +int rgw_writev(struct rgw_fs *rgw_fs, + struct rgw_file_handle *fh, rgw_uio *uio, uint32_t flags); + +/* + sync written data +*/ +#define RGW_FSYNC_FLAG_NONE 0x0000 + +int rgw_fsync(struct rgw_fs *rgw_fs, struct rgw_file_handle *fh, + uint32_t flags); + +/* + NFS commit operation +*/ + +#define RGW_COMMIT_FLAG_NONE 0x0000 + +int rgw_commit(struct rgw_fs *rgw_fs, struct rgw_file_handle *fh, + uint64_t offset, uint64_t length, uint32_t flags); + +/* + extended attributes + */ +typedef struct rgw_xattrstr +{ + char *val; + uint32_t len; +} rgw_xattrstr; + +typedef struct rgw_xattr +{ + rgw_xattrstr key; + rgw_xattrstr val; +} rgw_xattr; + +typedef 
struct rgw_xattrlist +{ + rgw_xattr *xattrs; + uint32_t xattr_cnt; +} rgw_xattrlist; + +#define RGW_GETXATTR_FLAG_NONE 0x0000 + +typedef int (*rgw_getxattr_cb)(rgw_xattrlist *attrs, void *arg, + uint32_t flags); + +int rgw_getxattrs(struct rgw_fs *rgw_fs, struct rgw_file_handle *fh, + rgw_xattrlist *attrs, rgw_getxattr_cb cb, void *cb_arg, + uint32_t flags); + +#define RGW_LSXATTR_FLAG_NONE 0x0000 +#define RGW_LSXATTR_FLAG_STOP 0x0001 + +int rgw_lsxattrs(struct rgw_fs *rgw_fs, struct rgw_file_handle *fh, + rgw_xattrstr *filter_prefix /* unimplemented for now */, + rgw_getxattr_cb cb, void *cb_arg, uint32_t flags); + +#define RGW_SETXATTR_FLAG_NONE 0x0000 + +int rgw_setxattrs(struct rgw_fs *rgw_fs, struct rgw_file_handle *fh, + rgw_xattrlist *attrs, uint32_t flags); + +#define RGW_RMXATTR_FLAG_NONE 0x0000 + +int rgw_rmxattrs(struct rgw_fs *rgw_fs, struct rgw_file_handle *fh, + rgw_xattrlist *attrs, uint32_t flags); + +#ifdef __cplusplus +} +#endif + +#endif /* RADOS_RGW_FILE_H */ diff --git a/src/include/radosstriper/libradosstriper.h b/src/include/radosstriper/libradosstriper.h new file mode 100644 index 000000000..a35345f7d --- /dev/null +++ b/src/include/radosstriper/libradosstriper.h @@ -0,0 +1,620 @@ +#ifndef CEPH_LIBRADOSSTRIPER_H +#define CEPH_LIBRADOSSTRIPER_H + +#ifdef __cplusplus +extern "C" { +#endif + +#include <string.h> + +#include "../rados/librados.h" + +#define LIBRADOSSTRIPER_VER_MAJOR 0 +#define LIBRADOSSTRIPER_VER_MINOR 0 +#define LIBRADOSSTRIPER_VER_EXTRA 0 + +#define LIBRADOSSTRIPER_VERSION(maj, min, extra) ((maj << 16) + (min << 8) + extra) + +#define LIBRADOSSTRIPER_VERSION_CODE LIBRADOSSTRIPER_VERSION(LIBRADOSSTRIPER_VER_MAJOR, LIBRADOSSTRIPER_VER_MINOR, LIBRADOSSTRIPER_VER_EXTRA) + +/** + * @typedef rados_striper_t + * + * A handle for interacting with striped objects in a RADOS cluster. 
+ * These are the first and last functions that should be called
+ * when using libradosstriper.
+ * @param off byte offset in the object to begin writing at
+ * @returns 0 on success, negative error code on failure
+ */
+ * @param len length of buf (in bytes)
+ * @returns 0 on success, negative error code on failure
+ */
+ * @param striper the striper in which the truncation will occur
+ * + * @param striper the striper in which the getxattr will occur + * @param oid name of the striped object + * @param name which extended attribute to read + * @param buf where to store the result + * @param len size of buf in bytes + * @returns length of xattr value on success, negative error code on failure + */ +int rados_striper_getxattr(rados_striper_t striper, + const char *oid, + const char *name, + char *buf, + size_t len); + +/** + * Set an extended attribute on a striped object. + * + * @param striper the striper in which the setxattr will occur + * @param oid name of the object + * @param name which extended attribute to set + * @param buf what to store in the xattr + * @param len the number of bytes in buf + * @returns 0 on success, negative error code on failure + */ +int rados_striper_setxattr(rados_striper_t striper, + const char *oid, + const char *name, + const char *buf, + size_t len); + +/** + * Delete an extended attribute from a striped object. + * + * @param striper the striper in which the rmxattr will occur + * @param oid name of the object + * @param name which xattr to delete + * @returns 0 on success, negative error code on failure + */ +int rados_striper_rmxattr(rados_striper_t striper, + const char *oid, + const char *name); + +/** + * Start iterating over xattrs on a striped object. + * + * @post iter is a valid iterator + * + * @param striper the striper in which the getxattrs will occur + * @param oid name of the object + * @param iter where to store the iterator + * @returns 0 on success, negative error code on failure + */ +int rados_striper_getxattrs(rados_striper_t striper, + const char *oid, + rados_xattrs_iter_t *iter); + +/** + * Get the next xattr on the striped object + * + * @pre iter is a valid iterator + * + * @post name is the NULL-terminated name of the next xattr, and val + * contains the value of the xattr, which is of length len. If the end + * of the list has been reached, name and val are NULL, and len is 0. 
+ * + * @param iter iterator to advance + * @param name where to store the name of the next xattr + * @param val where to store the value of the next xattr + * @param len the number of bytes in val + * @returns 0 on success, negative error code on failure + */ +int rados_striper_getxattrs_next(rados_xattrs_iter_t iter, + const char **name, + const char **val, + size_t *len); + +/** + * Close the xattr iterator. + * + * iter should not be used after this is called. + * + * @param iter the iterator to close + */ +void rados_striper_getxattrs_end(rados_xattrs_iter_t iter); + +/** @} Xattrs */ + +/** + * Synchronously get object stats (size/mtime) + * + * @param striper the striper in which the stat will occur + * @param soid the id of the striped object + * @param psize where to store object size + * @param pmtime where to store modification time + * @returns 0 on success, negative error code on failure + */ +int rados_striper_stat(rados_striper_t striper, + const char* soid, + uint64_t *psize, + time_t *pmtime); + +int rados_striper_stat2(rados_striper_t striper, + const char* soid, + uint64_t *psize, + struct timespec *pmtime); + +/** + * @defgroup libradosstriper_h_asynch_io Asynchronous I/O + * Read and write to objects without blocking. + * + * @{ + */ + +/** + * @typedef rados_striper_multi_completion_t + * Represents the state of a set of asynchronous operations + * it contains the aggregated return value once the operations complete + * and can be used to block until all operations are complete and/or safe. + */ +typedef void *rados_striper_multi_completion_t; + +/** + * Constructs a multi completion to use with asynchronous operations + * + * The complete and safe callbacks correspond to operations being + * acked and committed, respectively. The callbacks are called in + * order of receipt, so the safe callback may be triggered before the + * complete callback, and vice versa. This is affected by journalling + * on the OSDs. 
+ * in memory on all replicas
+ * Release a multi asynchronous IO completion
+ * Asynchronously fills an object with the provided data.
+ * However, there is an atomicity of the metadata deletion and
+ * @param striper the striper in which the flush will occur
+ * @note declared void: no status code is returned
+*/
+   * RadosStriper rs;
+   * RadosStriper::striper_create(ioctx, &rs);
+     * builds the C counterpart of a RadosStriper
+     * synchronously write to the striped object at the specified offset.
+     * NOTE: this call steals the contents of @p bl.
+ */ + int aio_read(const std::string& soid, librados::AioCompletion *c, ceph::bufferlist *pbl, size_t len, uint64_t off); + + /** + * synchronously get striped object stats (size/mtime) + */ + int stat(const std::string& soid, uint64_t *psize, time_t *pmtime); + int stat2(const std::string& soid, uint64_t *psize, struct timespec *pts); + + /** + * asynchronously get striped object stats (size/mtime) + */ + int aio_stat(const std::string& soid, librados::AioCompletion *c, + uint64_t *psize, time_t *pmtime); + int aio_stat2(const std::string& soid, librados::AioCompletion *c, + uint64_t *psize, struct timespec *pts); + + /** + * deletes a striped object. + * There is no atomicity of the deletion and the striped + * object may be left incomplete if an error is returned (metadata + * all present, but some stripes missing) + * However, there is a atomicity of the metadata deletion and + * the deletion can not happen if any I/O is ongoing (it + * will return EBUSY). Identically, no I/O will be able to start + * during deletion (same EBUSY return code) + */ + int remove(const std::string& soid); + int remove(const std::string& soid, int flags); + + /** + * asynchronous remove of striped objects + * See synchronous version for comments on (lack of) atomicity + */ + int aio_remove(const std::string& soid, librados::AioCompletion *c); + int aio_remove(const std::string& soid, librados::AioCompletion *c, int flags); + + /** + * Resizes a striped object + * the truncation can not happen if any I/O is ongoing (it + * will return EBUSY). Identically, no I/O will be able to start + * during truncation (same EBUSY return code) + */ + int trunc(const std::string& oid, uint64_t size); + + /** + * Wait for all currently pending aio writes to be safe. 
+ * + * @returns 0 on success, negative error code on failure + */ + int aio_flush(); + + /** + * creation of multi aio completion objects + */ + static MultiAioCompletion *multi_aio_create_completion(); + static MultiAioCompletion *multi_aio_create_completion(void *cb_arg, + librados::callback_t cb_complete, + librados::callback_t cb_safe); + + private: + RadosStriperImpl *rados_striper_impl; + + }; + +} + +#endif diff --git a/src/include/random.h b/src/include/random.h new file mode 100644 index 000000000..f2e3e37bc --- /dev/null +++ b/src/include/random.h @@ -0,0 +1,301 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2017 SUSE LINUX GmbH + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. + * +*/ + +#ifndef CEPH_RANDOM_H +#define CEPH_RANDOM_H 1 + +#include <mutex> +#include <random> +#include <type_traits> +#include <boost/optional.hpp> + +// Workaround for https://gcc.gnu.org/bugzilla/show_bug.cgi?id=85494 +#ifdef __MINGW32__ +#include <boost/random/random_device.hpp> + +using random_device_t = boost::random::random_device; +#else +using random_device_t = std::random_device; +#endif + +// Basic random number facility (see N3551 for inspiration): +namespace ceph::util { + +inline namespace version_1_0_3 { + +namespace detail { + +template <typename T0, typename T1> +using larger_of = typename std::conditional< + sizeof(T0) >= sizeof(T1), + T0, T1> + ::type; + +// avoid mixing floating point and integers: +template <typename NumberT0, typename NumberT1> +using has_compatible_numeric_types = + std::disjunction< + std::conjunction< + std::is_floating_point<NumberT0>, std::is_floating_point<NumberT1> + >, + std::conjunction< + std::is_integral<NumberT0>, 
std::is_integral<NumberT1> + > + >; + + +// Select the larger of type compatible numeric types: +template <typename NumberT0, typename NumberT1> +using select_number_t = std::enable_if_t<detail::has_compatible_numeric_types<NumberT0, NumberT1>::value, + detail::larger_of<NumberT0, NumberT1>>; + +} // namespace detail + +namespace detail { + +// Choose default distribution for appropriate types: +template <typename NumberT, + bool IsIntegral> +struct select_distribution +{ + using type = std::uniform_int_distribution<NumberT>; +}; + +template <typename NumberT> +struct select_distribution<NumberT, false> +{ + using type = std::uniform_real_distribution<NumberT>; +}; + +template <typename NumberT> +using default_distribution = typename + select_distribution<NumberT, std::is_integral<NumberT>::value>::type; + +} // namespace detail + +namespace detail { + +template <typename EngineT> +EngineT& engine(); + +template <typename MutexT, typename EngineT, + typename SeedT = typename EngineT::result_type> +void randomize_rng(const SeedT seed, MutexT& m, EngineT& e) +{ + std::lock_guard<MutexT> lg(m); + e.seed(seed); +} + +template <typename MutexT, typename EngineT> +void randomize_rng(MutexT& m, EngineT& e) +{ + random_device_t rd; + + std::lock_guard<MutexT> lg(m); + e.seed(rd()); +} + +template <typename EngineT = std::default_random_engine, + typename SeedT = typename EngineT::result_type> +void randomize_rng(const SeedT n) +{ + detail::engine<EngineT>().seed(n); +} + +template <typename EngineT = std::default_random_engine> +void randomize_rng() +{ + random_device_t rd; + detail::engine<EngineT>().seed(rd()); +} + +template <typename EngineT> +EngineT& engine() +{ + thread_local boost::optional<EngineT> rng_engine; + + if (!rng_engine) { + rng_engine.emplace(EngineT()); + randomize_rng<EngineT>(); + } + + return *rng_engine; +} + +} // namespace detail + +namespace detail { + +template <typename NumberT, + typename DistributionT = detail::default_distribution<NumberT>, 
+ typename EngineT> +NumberT generate_random_number(const NumberT min, const NumberT max, + EngineT& e) +{ + DistributionT d { min, max }; + + using param_type = typename DistributionT::param_type; + return d(e, param_type { min, max }); +} + +template <typename NumberT, + typename MutexT, + typename DistributionT = detail::default_distribution<NumberT>, + typename EngineT> +NumberT generate_random_number(const NumberT min, const NumberT max, + MutexT& m, EngineT& e) +{ + DistributionT d { min, max }; + + using param_type = typename DistributionT::param_type; + + std::lock_guard<MutexT> lg(m); + return d(e, param_type { min, max }); +} + +template <typename NumberT, + typename DistributionT = detail::default_distribution<NumberT>, + typename EngineT> +NumberT generate_random_number(const NumberT min, const NumberT max) +{ + return detail::generate_random_number<NumberT, DistributionT, EngineT> + (min, max, detail::engine<EngineT>()); +} + +template <typename MutexT, + typename EngineT, + typename NumberT = int, + typename DistributionT = detail::default_distribution<NumberT>> +NumberT generate_random_number(MutexT& m, EngineT& e) +{ + return detail::generate_random_number<NumberT, MutexT, DistributionT, EngineT> + (0, std::numeric_limits<NumberT>::max(), m, e); +} + +template <typename NumberT, typename MutexT, typename EngineT> +NumberT generate_random_number(const NumberT max, MutexT& m, EngineT& e) +{ + return generate_random_number<NumberT>(0, max, m, e); +} + +} // namespace detail + +template <typename EngineT = std::default_random_engine> +void randomize_rng() +{ + detail::randomize_rng<EngineT>(); +} + +template <typename NumberT = int, + typename DistributionT = detail::default_distribution<NumberT>, + typename EngineT = std::default_random_engine> +NumberT generate_random_number() +{ + return detail::generate_random_number<NumberT, DistributionT, EngineT> + (0, std::numeric_limits<NumberT>::max()); +} + +template <typename NumberT0, typename NumberT1, + 
// Function object: a self-contained, lockable random number source.
// Bundles an engine, the mutex that serializes access to it, and the
// random device used for (re)seeding.
template <typename NumberT>
class random_number_generator final
{
  std::mutex l;                  // guards 'e' across concurrent draws
  random_device_t rd;            // entropy source for default seeding
  std::default_random_engine e;  // the underlying engine

  using seed_type = typename decltype(e)::result_type;

 public:
  using number_type = NumberT;
  using random_engine_type = decltype(e);
  using random_device_type = decltype(rd);

 public:
  // Direct access to the device/engine; note that using these
  // bypasses the internal mutex.
  random_device_type& random_device() noexcept { return rd; }
  random_engine_type& random_engine() noexcept { return e; }

 public:
  // Seed nondeterministically from the random device.
  random_number_generator() {
    detail::randomize_rng(l, e);
  }

  // Seed deterministically from 'seed'.
  explicit random_number_generator(const seed_type seed) {
    detail::randomize_rng(seed, l, e);
  }

  // Move transfers only the engine state; the mutex and device of the
  // new object are default-constructed.
  random_number_generator(random_number_generator&& rhs)
   : e(std::move(rhs.e))
  {}

 public:
  // Non-copyable: the mutex (and engine state) must not be duplicated.
  random_number_generator(const random_number_generator&) = delete;
  random_number_generator& operator=(const random_number_generator&) = delete;

 public:
  // Draw a value over the default range.
  // NOTE(review): this forwards to detail::generate_random_number(l, e)
  // without specifying NumberT, so the result type is whatever that
  // overload defaults to -- confirm behavior when NumberT is not int.
  NumberT operator()() {
    return detail::generate_random_number(l, e);
  }

  // Draw uniformly from [0, max].
  NumberT operator()(const NumberT max) {
    return detail::generate_random_number<NumberT>(max, l, e);
  }

  // Draw uniformly from [min, max].
  NumberT operator()(const NumberT min,
                     const NumberT max) {
    return detail::generate_random_number<NumberT>(min, max, l, e);
  }

 public:
  // Re-seed the engine deterministically with 'n'.
  void seed(const seed_type n) {
    detail::randomize_rng(n, l, e);
  }
};

// Deduction guide: random_number_generator{max} deduces NumberT from max.
template <typename NumberT>
random_number_generator(const NumberT max) -> random_number_generator<NumberT>;
// Input iterator that walks every value contained in a set of
// inclusive [first,last] ranges stored in a std::map<T,T>.
//
// Fixes over the previous version:
//  * the iterator now borrows the caller's map instead of copying it;
//    the old copy meant 'it' (an iterator into the caller's map) was
//    compared against the copy's end() -- iterators from two different
//    containers, which is undefined behavior;
//  * 'current' is value-initialized, so two end iterators compare
//    equal instead of comparing indeterminate values;
//  * stepping past the last range resets 'current', so iteration
//    actually reaches the end() sentinel;
//  * post-increment returns the pre-increment value, as its signature
//    promises (callers that ignored the return value are unaffected).
template <class T>
class rangeset_iterator
{
public:
  // std::iterator is deprecated in C++17; declare the iterator trait
  // typedefs it used to provide directly.
  using iterator_category = std::input_iterator_tag;
  using value_type = T;
  using difference_type = std::ptrdiff_t;
  using pointer = T*;
  using reference = T&;

private:
  std::map<T,T> *ranges = nullptr;      // borrowed from the rangeset; never owned
  typename std::map<T,T>::iterator it;  // current range within *ranges
  T current = T();                      // current value within *it

public:
  rangeset_iterator() {}

  // 'it' must be an iterator into 'ranges' (begin(), end(), or any
  // valid position in between).
  rangeset_iterator(typename std::map<T,T>::iterator& it,
                    std::map<T,T>& ranges) {
    this->ranges = &ranges;
    this->it = it;
    if (this->it != ranges.end())
      current = it->first;
  }

  bool operator==(rangeset_iterator<T> rit) {
    return (it == rit.it && rit.current == current);
  }
  bool operator!=(rangeset_iterator<T> rit) {
    return (it != rit.it) || (rit.current != current);
  }

  T& operator*() {
    return current;
  }

  // Advance to the next contained value; a no-op once past the last
  // range. Returns the iterator's previous position.
  rangeset_iterator<T> operator++(int) {
    rangeset_iterator<T> prev = *this;
    if (it != ranges->end()) {
      if (current < it->second) {
        ++current;
      } else {
        ++it;
        current = (it != ranges->end()) ? it->first : T();
      }
    }
    return prev;
  }
};
theset.ranges.insert(pair<T,T>(v1,v2)); + _size += v2 - v1+1; + } + + + // ... + bool contains(T val) { + if (theset.get_range_for(val) == theset.ranges.end()) return false; + ceph_assert(!empty()); + return true; + } + + void insert(T val) { + ceph_assert(!contains(val)); + + map_iterator left = theset.get_range_for(val-1); + map_iterator right = theset.get_range_for(val+1); + + if (left != theset.ranges.end() && + right != theset.ranges.end()) { + // join! + left->second = right->second; + theset.ranges.erase(right); + _size++; + return; + } + + if (left != theset.ranges.end()) { + // add to left range + left->second = val; + _size++; + return; + } + + if (right != theset.ranges.end()) { + // add to right range + theset.ranges.insert(pair<T,T>(val, right->second)); + theset.ranges.erase(val+1); + _size++; + return; + } + + // new range + theset.ranges.insert(pair<T,T>(val,val)); + _size++; + return; + } + + unsigned size() { + return size(); + } + + bool empty() { + if (theset.ranges.empty()) { + ceph_assert(_size == 0); + return true; + } + ceph_assert(_size>0); + return false; + } + + + T first() { + ceph_assert(!empty()); + map_iterator it = theset.ranges.begin(); + return it->first; + } + + void erase(T val) { + ceph_assert(contains(val)); + map_iterator it = theset.get_range_for(val); + ceph_assert(it != theset.ranges.end()); + + // entire range + if (val == it->first && val == it->second) { + theset.ranges.erase(it); + _size--; + return; + } + + // beginning + if (val == it->first) { + theset.ranges.insert(pair<T,T>(val+1, it->second)); + theset.ranges.erase(it); + _size--; + return; + } + + // end + if (val == it->second) { + it->second = val-1; + _size--; + return; + } + + // middle split + theset.ranges.insert(pair<T,T>(it->first, val-1)); + theset.ranges.insert(pair<T,T>(val+1, it->second)); + theset.ranges.erase(it); + _size--; + return; + } + + void dump() { + for (typename map<T,T>::iterator it = theset.ranges.begin(); + it != theset.ranges.end(); + 
it++) { + cout << " " << it->first << "-" << it->second << endl; + } + } + +}; + + +#endif diff --git a/src/include/rbd/features.h b/src/include/rbd/features.h new file mode 100644 index 000000000..31c73b38f --- /dev/null +++ b/src/include/rbd/features.h @@ -0,0 +1,121 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#ifndef CEPH_RBD_FEATURES_H +#define CEPH_RBD_FEATURES_H + +#define RBD_FEATURE_LAYERING (1ULL<<0) +#define RBD_FEATURE_STRIPINGV2 (1ULL<<1) +#define RBD_FEATURE_EXCLUSIVE_LOCK (1ULL<<2) +#define RBD_FEATURE_OBJECT_MAP (1ULL<<3) +#define RBD_FEATURE_FAST_DIFF (1ULL<<4) +#define RBD_FEATURE_DEEP_FLATTEN (1ULL<<5) +#define RBD_FEATURE_JOURNALING (1ULL<<6) +#define RBD_FEATURE_DATA_POOL (1ULL<<7) +#define RBD_FEATURE_OPERATIONS (1ULL<<8) +#define RBD_FEATURE_MIGRATING (1ULL<<9) +#define RBD_FEATURE_NON_PRIMARY (1ULL<<10) +#define RBD_FEATURE_DIRTY_CACHE (1ULL<<11) + +#define RBD_FEATURES_DEFAULT (RBD_FEATURE_LAYERING | \ + RBD_FEATURE_EXCLUSIVE_LOCK | \ + RBD_FEATURE_OBJECT_MAP | \ + RBD_FEATURE_FAST_DIFF | \ + RBD_FEATURE_DEEP_FLATTEN) + +#define RBD_FEATURE_NAME_LAYERING "layering" +#define RBD_FEATURE_NAME_STRIPINGV2 "striping" +#define RBD_FEATURE_NAME_EXCLUSIVE_LOCK "exclusive-lock" +#define RBD_FEATURE_NAME_OBJECT_MAP "object-map" +#define RBD_FEATURE_NAME_FAST_DIFF "fast-diff" +#define RBD_FEATURE_NAME_DEEP_FLATTEN "deep-flatten" +#define RBD_FEATURE_NAME_JOURNALING "journaling" +#define RBD_FEATURE_NAME_DATA_POOL "data-pool" +#define RBD_FEATURE_NAME_OPERATIONS "operations" +#define RBD_FEATURE_NAME_MIGRATING "migrating" +#define RBD_FEATURE_NAME_NON_PRIMARY "non-primary" +#define RBD_FEATURE_NAME_DIRTY_CACHE "dirty-cache" + +/// features that make an image inaccessible for read or write by +/// clients that don't understand them +#define RBD_FEATURES_INCOMPATIBLE (RBD_FEATURE_LAYERING | \ + RBD_FEATURE_STRIPINGV2 | \ + RBD_FEATURE_DATA_POOL | \ + RBD_FEATURE_DIRTY_CACHE) + +/// features 
that make an image unwritable by clients that don't understand them +#define RBD_FEATURES_RW_INCOMPATIBLE (RBD_FEATURES_INCOMPATIBLE | \ + RBD_FEATURE_EXCLUSIVE_LOCK | \ + RBD_FEATURE_OBJECT_MAP | \ + RBD_FEATURE_FAST_DIFF | \ + RBD_FEATURE_DEEP_FLATTEN | \ + RBD_FEATURE_JOURNALING | \ + RBD_FEATURE_OPERATIONS | \ + RBD_FEATURE_MIGRATING | \ + RBD_FEATURE_NON_PRIMARY) + +#define RBD_FEATURES_ALL (RBD_FEATURE_LAYERING | \ + RBD_FEATURE_STRIPINGV2 | \ + RBD_FEATURE_EXCLUSIVE_LOCK | \ + RBD_FEATURE_OBJECT_MAP | \ + RBD_FEATURE_FAST_DIFF | \ + RBD_FEATURE_DEEP_FLATTEN | \ + RBD_FEATURE_JOURNALING | \ + RBD_FEATURE_DATA_POOL | \ + RBD_FEATURE_OPERATIONS | \ + RBD_FEATURE_MIGRATING | \ + RBD_FEATURE_NON_PRIMARY | \ + RBD_FEATURE_DIRTY_CACHE) + +/// features that may be dynamically enabled or disabled +#define RBD_FEATURES_MUTABLE (RBD_FEATURE_EXCLUSIVE_LOCK | \ + RBD_FEATURE_OBJECT_MAP | \ + RBD_FEATURE_FAST_DIFF | \ + RBD_FEATURE_JOURNALING | \ + RBD_FEATURE_NON_PRIMARY | \ + RBD_FEATURE_DIRTY_CACHE) + +#define RBD_FEATURES_MUTABLE_INTERNAL (RBD_FEATURE_NON_PRIMARY | \ + RBD_FEATURE_DIRTY_CACHE) + +/// features that may be dynamically disabled +#define RBD_FEATURES_DISABLE_ONLY (RBD_FEATURE_DEEP_FLATTEN) + +/// features that only work when used with a single client +/// using the image for writes +#define RBD_FEATURES_SINGLE_CLIENT (RBD_FEATURE_EXCLUSIVE_LOCK | \ + RBD_FEATURE_OBJECT_MAP | \ + RBD_FEATURE_FAST_DIFF | \ + RBD_FEATURE_JOURNALING | \ + RBD_FEATURE_DIRTY_CACHE) + +/// features that will be implicitly enabled +#define RBD_FEATURES_IMPLICIT_ENABLE (RBD_FEATURE_STRIPINGV2 | \ + RBD_FEATURE_DATA_POOL | \ + RBD_FEATURE_FAST_DIFF | \ + RBD_FEATURE_OPERATIONS | \ + RBD_FEATURE_MIGRATING | \ + RBD_FEATURE_NON_PRIMARY | \ + RBD_FEATURE_DIRTY_CACHE) + +/// features that cannot be controlled by the user +#define RBD_FEATURES_INTERNAL (RBD_FEATURE_OPERATIONS | \ + RBD_FEATURE_MIGRATING) + +#define RBD_OPERATION_FEATURE_CLONE_PARENT (1ULL<<0) +#define 
RBD_OPERATION_FEATURE_CLONE_CHILD (1ULL<<1) +#define RBD_OPERATION_FEATURE_GROUP (1ULL<<2) +#define RBD_OPERATION_FEATURE_SNAP_TRASH (1ULL<<3) + +#define RBD_OPERATION_FEATURE_NAME_CLONE_PARENT "clone-parent" +#define RBD_OPERATION_FEATURE_NAME_CLONE_CHILD "clone-child" +#define RBD_OPERATION_FEATURE_NAME_GROUP "group" +#define RBD_OPERATION_FEATURE_NAME_SNAP_TRASH "snap-trash" + +/// all valid operation features +#define RBD_OPERATION_FEATURES_ALL (RBD_OPERATION_FEATURE_CLONE_PARENT | \ + RBD_OPERATION_FEATURE_CLONE_CHILD | \ + RBD_OPERATION_FEATURE_GROUP | \ + RBD_OPERATION_FEATURE_SNAP_TRASH) + +#endif diff --git a/src/include/rbd/librbd.h b/src/include/rbd/librbd.h new file mode 100644 index 000000000..7ae20e4dd --- /dev/null +++ b/src/include/rbd/librbd.h @@ -0,0 +1,1549 @@ +// -*- mode:C; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2011 New Dream Network + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. 
#define LIBRBD_VER_MAJOR 1
#define LIBRBD_VER_MINOR 18
#define LIBRBD_VER_EXTRA 0

/*
 * Pack a (major, minor, extra) triple into a single comparable version
 * code. Each argument is parenthesized in the expansion so that
 * expression arguments (e.g. "3 & 1") cannot change the result through
 * operator precedence; for plain integer literals the value is
 * identical to the previous definition.
 */
#define LIBRBD_VERSION(maj, min, extra) \
  ((((maj) << 16)) + (((min) << 8)) + (extra))

#define LIBRBD_VERSION_CODE LIBRBD_VERSION(LIBRBD_VER_MAJOR, LIBRBD_VER_MINOR, LIBRBD_VER_EXTRA)
/**
 * These types are used in set_image_notification to indicate the type of
 * event socket passed in.
 */
state; + char *description; + time_t last_update; + bool up; +} rbd_mirror_image_status_t CEPH_RBD_DEPRECATED; + +typedef struct { + char *mirror_uuid; + rbd_mirror_image_status_state_t state; + char *description; + time_t last_update; + bool up; +} rbd_mirror_image_site_status_t; + +typedef struct { + char *name; + rbd_mirror_image_info_t info; + uint32_t site_statuses_count; + rbd_mirror_image_site_status_t *site_statuses; +} rbd_mirror_image_global_status_t; + +typedef enum { + RBD_GROUP_IMAGE_STATE_ATTACHED, + RBD_GROUP_IMAGE_STATE_INCOMPLETE +} rbd_group_image_state_t; + +typedef struct { + char *name; + int64_t pool; + rbd_group_image_state_t state; +} rbd_group_image_info_t; + +typedef struct { + char *name; + int64_t pool; +} rbd_group_info_t; + +typedef enum { + RBD_GROUP_SNAP_STATE_INCOMPLETE, + RBD_GROUP_SNAP_STATE_COMPLETE +} rbd_group_snap_state_t; + +typedef struct { + char *name; + rbd_group_snap_state_t state; +} rbd_group_snap_info_t; + +typedef struct { + int64_t group_pool; + char *group_name; + char *group_snap_name; +} rbd_snap_group_namespace_t; + +typedef enum { + RBD_SNAP_MIRROR_STATE_PRIMARY, + RBD_SNAP_MIRROR_STATE_PRIMARY_DEMOTED, + RBD_SNAP_MIRROR_STATE_NON_PRIMARY, + RBD_SNAP_MIRROR_STATE_NON_PRIMARY_DEMOTED +} rbd_snap_mirror_state_t; + +typedef struct { + rbd_snap_mirror_state_t state; + size_t mirror_peer_uuids_count; + char *mirror_peer_uuids; + bool complete; + char *primary_mirror_uuid; + uint64_t primary_snap_id; + uint64_t last_copied_object_number; +} rbd_snap_mirror_namespace_t; + +typedef enum { + RBD_LOCK_MODE_EXCLUSIVE = 0, + RBD_LOCK_MODE_SHARED = 1, +} rbd_lock_mode_t; + +CEPH_RBD_API void rbd_version(int *major, int *minor, int *extra); + +/* image options */ +enum { + RBD_IMAGE_OPTION_FORMAT = 0, + RBD_IMAGE_OPTION_FEATURES = 1, + RBD_IMAGE_OPTION_ORDER = 2, + RBD_IMAGE_OPTION_STRIPE_UNIT = 3, + RBD_IMAGE_OPTION_STRIPE_COUNT = 4, + RBD_IMAGE_OPTION_JOURNAL_ORDER = 5, + RBD_IMAGE_OPTION_JOURNAL_SPLAY_WIDTH = 6, + 
RBD_IMAGE_OPTION_JOURNAL_POOL = 7, + RBD_IMAGE_OPTION_FEATURES_SET = 8, + RBD_IMAGE_OPTION_FEATURES_CLEAR = 9, + RBD_IMAGE_OPTION_DATA_POOL = 10, + RBD_IMAGE_OPTION_FLATTEN = 11, + RBD_IMAGE_OPTION_CLONE_FORMAT = 12, + RBD_IMAGE_OPTION_MIRROR_IMAGE_MODE = 13, +}; + +typedef enum { + RBD_TRASH_IMAGE_SOURCE_USER = 0, + RBD_TRASH_IMAGE_SOURCE_MIRRORING = 1, + RBD_TRASH_IMAGE_SOURCE_MIGRATION = 2, + RBD_TRASH_IMAGE_SOURCE_REMOVING = 3, + RBD_TRASH_IMAGE_SOURCE_USER_PARENT = 4, +} rbd_trash_image_source_t; + +typedef struct { + char *id; + char *name; + rbd_trash_image_source_t source; + time_t deletion_time; + time_t deferment_end_time; +} rbd_trash_image_info_t; + +typedef struct { + char *addr; + int64_t id; + uint64_t cookie; +} rbd_image_watcher_t; + +typedef enum { + RBD_IMAGE_MIGRATION_STATE_UNKNOWN = -1, + RBD_IMAGE_MIGRATION_STATE_ERROR = 0, + RBD_IMAGE_MIGRATION_STATE_PREPARING = 1, + RBD_IMAGE_MIGRATION_STATE_PREPARED = 2, + RBD_IMAGE_MIGRATION_STATE_EXECUTING = 3, + RBD_IMAGE_MIGRATION_STATE_EXECUTED = 4, + RBD_IMAGE_MIGRATION_STATE_ABORTING = 5, +} rbd_image_migration_state_t; + +typedef struct { + int64_t source_pool_id; + char *source_pool_namespace; + char *source_image_name; + char *source_image_id; + int64_t dest_pool_id; + char *dest_pool_namespace; + char *dest_image_name; + char *dest_image_id; + rbd_image_migration_state_t state; + char *state_description; +} rbd_image_migration_status_t; + +typedef enum { + RBD_CONFIG_SOURCE_CONFIG = 0, + RBD_CONFIG_SOURCE_POOL = 1, + RBD_CONFIG_SOURCE_IMAGE = 2, +} rbd_config_source_t; + +typedef struct { + char *name; + char *value; + rbd_config_source_t source; +} rbd_config_option_t; + +typedef enum { + RBD_POOL_STAT_OPTION_IMAGES, + RBD_POOL_STAT_OPTION_IMAGE_PROVISIONED_BYTES, + RBD_POOL_STAT_OPTION_IMAGE_MAX_PROVISIONED_BYTES, + RBD_POOL_STAT_OPTION_IMAGE_SNAPSHOTS, + RBD_POOL_STAT_OPTION_TRASH_IMAGES, + RBD_POOL_STAT_OPTION_TRASH_PROVISIONED_BYTES, + RBD_POOL_STAT_OPTION_TRASH_MAX_PROVISIONED_BYTES, + 
RBD_POOL_STAT_OPTION_TRASH_SNAPSHOTS +} rbd_pool_stat_option_t; + +/* rbd_write_zeroes / rbd_aio_write_zeroes flags */ +enum { + RBD_WRITE_ZEROES_FLAG_THICK_PROVISION = (1U<<0), /* fully allocated zeroed extent */ +}; + +typedef enum { + RBD_ENCRYPTION_FORMAT_LUKS1 = 0, + RBD_ENCRYPTION_FORMAT_LUKS2 = 1, + RBD_ENCRYPTION_FORMAT_LUKS = 2 +} rbd_encryption_format_t; + +typedef enum { + RBD_ENCRYPTION_ALGORITHM_AES128 = 0, + RBD_ENCRYPTION_ALGORITHM_AES256 = 1 +} rbd_encryption_algorithm_t; + +typedef void *rbd_encryption_options_t; + +typedef struct { + rbd_encryption_format_t format; + rbd_encryption_options_t opts; + size_t opts_size; +} rbd_encryption_spec_t; + +typedef struct { + rbd_encryption_algorithm_t alg; + const char* passphrase; + size_t passphrase_size; +} rbd_encryption_luks1_format_options_t; + +typedef struct { + rbd_encryption_algorithm_t alg; + const char* passphrase; + size_t passphrase_size; +} rbd_encryption_luks2_format_options_t; + +typedef struct { + const char* passphrase; + size_t passphrase_size; +} rbd_encryption_luks_format_options_t; + +CEPH_RBD_API void rbd_image_options_create(rbd_image_options_t* opts); +CEPH_RBD_API void rbd_image_options_destroy(rbd_image_options_t opts); +CEPH_RBD_API int rbd_image_options_set_string(rbd_image_options_t opts, + int optname, const char* optval); +CEPH_RBD_API int rbd_image_options_set_uint64(rbd_image_options_t opts, + int optname, uint64_t optval); +CEPH_RBD_API int rbd_image_options_get_string(rbd_image_options_t opts, + int optname, char* optval, + size_t maxlen); +CEPH_RBD_API int rbd_image_options_get_uint64(rbd_image_options_t opts, + int optname, uint64_t* optval); +CEPH_RBD_API int rbd_image_options_is_set(rbd_image_options_t opts, + int optname, bool* is_set); +CEPH_RBD_API int rbd_image_options_unset(rbd_image_options_t opts, int optname); +CEPH_RBD_API void rbd_image_options_clear(rbd_image_options_t opts); +CEPH_RBD_API int rbd_image_options_is_empty(rbd_image_options_t opts); + +/* 
helpers */ +CEPH_RBD_API void rbd_image_spec_cleanup(rbd_image_spec_t *image); +CEPH_RBD_API void rbd_image_spec_list_cleanup(rbd_image_spec_t *images, + size_t num_images); +CEPH_RBD_API void rbd_linked_image_spec_cleanup(rbd_linked_image_spec_t *image); +CEPH_RBD_API void rbd_linked_image_spec_list_cleanup( + rbd_linked_image_spec_t *images, size_t num_images); +CEPH_RBD_API void rbd_snap_spec_cleanup(rbd_snap_spec_t *snap); + +/* images */ +CEPH_RBD_API int rbd_list(rados_ioctx_t io, char *names, size_t *size) + CEPH_RBD_DEPRECATED; +CEPH_RBD_API int rbd_list2(rados_ioctx_t io, rbd_image_spec_t* images, + size_t *max_images); + +CEPH_RBD_API int rbd_create(rados_ioctx_t io, const char *name, uint64_t size, + int *order); +CEPH_RBD_API int rbd_create2(rados_ioctx_t io, const char *name, uint64_t size, + uint64_t features, int *order); +/** + * create new rbd image + * + * The stripe_unit must be a factor of the object size (1 << order). + * The stripe_count can be one (no intra-object striping) or greater + * than one. The RBD_FEATURE_STRIPINGV2 must be specified if the + * stripe_unit != the object size and the stripe_count is != 1. + * + * @param io ioctx + * @param name image name + * @param size image size in bytes + * @param features initial feature bits + * @param order object/block size, as a power of two (object size == 1 << order) + * @param stripe_unit stripe unit size, in bytes. 
+ * @param stripe_count number of objects to stripe over before looping + * @return 0 on success, or negative error code + */ +CEPH_RBD_API int rbd_create3(rados_ioctx_t io, const char *name, uint64_t size, + uint64_t features, int *order, + uint64_t stripe_unit, uint64_t stripe_count); +CEPH_RBD_API int rbd_create4(rados_ioctx_t io, const char *name, uint64_t size, + rbd_image_options_t opts); +CEPH_RBD_API int rbd_clone(rados_ioctx_t p_ioctx, const char *p_name, + const char *p_snapname, rados_ioctx_t c_ioctx, + const char *c_name, uint64_t features, int *c_order); +CEPH_RBD_API int rbd_clone2(rados_ioctx_t p_ioctx, const char *p_name, + const char *p_snapname, rados_ioctx_t c_ioctx, + const char *c_name, uint64_t features, int *c_order, + uint64_t stripe_unit, int stripe_count); +CEPH_RBD_API int rbd_clone3(rados_ioctx_t p_ioctx, const char *p_name, + const char *p_snapname, rados_ioctx_t c_ioctx, + const char *c_name, rbd_image_options_t c_opts); +CEPH_RBD_API int rbd_remove(rados_ioctx_t io, const char *name); +CEPH_RBD_API int rbd_remove_with_progress(rados_ioctx_t io, const char *name, + librbd_progress_fn_t cb, + void *cbdata); +CEPH_RBD_API int rbd_rename(rados_ioctx_t src_io_ctx, const char *srcname, + const char *destname); + +CEPH_RBD_API int rbd_trash_move(rados_ioctx_t io, const char *name, + uint64_t delay); +CEPH_RBD_API int rbd_trash_get(rados_ioctx_t io, const char *id, + rbd_trash_image_info_t *info); +CEPH_RBD_API void rbd_trash_get_cleanup(rbd_trash_image_info_t *info); +CEPH_RBD_API int rbd_trash_list(rados_ioctx_t io, + rbd_trash_image_info_t *trash_entries, + size_t *num_entries); +CEPH_RBD_API void rbd_trash_list_cleanup(rbd_trash_image_info_t *trash_entries, + size_t num_entries); +CEPH_RBD_API int rbd_trash_purge(rados_ioctx_t io, time_t expire_ts, float threshold); +CEPH_RBD_API int rbd_trash_purge_with_progress(rados_ioctx_t io, time_t expire_ts, + float threshold, librbd_progress_fn_t cb, + void* cbdata); +CEPH_RBD_API int 
rbd_trash_remove(rados_ioctx_t io, const char *id, bool force); +CEPH_RBD_API int rbd_trash_remove_with_progress(rados_ioctx_t io, + const char *id, + bool force, + librbd_progress_fn_t cb, + void *cbdata); +CEPH_RBD_API int rbd_trash_restore(rados_ioctx_t io, const char *id, + const char *name); + +/* migration */ +CEPH_RBD_API int rbd_migration_prepare(rados_ioctx_t ioctx, + const char *image_name, + rados_ioctx_t dest_ioctx, + const char *dest_image_name, + rbd_image_options_t opts); +CEPH_RBD_API int rbd_migration_prepare_import( + const char *source_spec, rados_ioctx_t dest_ioctx, + const char *dest_image_name, rbd_image_options_t opts); +CEPH_RBD_API int rbd_migration_execute(rados_ioctx_t ioctx, + const char *image_name); +CEPH_RBD_API int rbd_migration_execute_with_progress(rados_ioctx_t ioctx, + const char *image_name, + librbd_progress_fn_t cb, + void *cbdata); +CEPH_RBD_API int rbd_migration_abort(rados_ioctx_t ioctx, + const char *image_name); +CEPH_RBD_API int rbd_migration_abort_with_progress(rados_ioctx_t ioctx, + const char *image_name, + librbd_progress_fn_t cb, + void *cbdata); +CEPH_RBD_API int rbd_migration_commit(rados_ioctx_t ioctx, + const char *image_name); +CEPH_RBD_API int rbd_migration_commit_with_progress(rados_ioctx_t ioctx, + const char *image_name, + librbd_progress_fn_t cb, + void *cbdata); +CEPH_RBD_API int rbd_migration_status(rados_ioctx_t ioctx, + const char *image_name, + rbd_image_migration_status_t *status, + size_t status_size); +CEPH_RBD_API void rbd_migration_status_cleanup( + rbd_image_migration_status_t *status); + +/* pool mirroring */ +CEPH_RBD_API int rbd_mirror_site_name_get(rados_t cluster, + char *name, size_t *max_len); +CEPH_RBD_API int rbd_mirror_site_name_set(rados_t cluster, + const char *name); + +CEPH_RBD_API int rbd_mirror_mode_get(rados_ioctx_t io_ctx, + rbd_mirror_mode_t *mirror_mode); +CEPH_RBD_API int rbd_mirror_mode_set(rados_ioctx_t io_ctx, + rbd_mirror_mode_t mirror_mode); + +CEPH_RBD_API int 
/* rbd_mirror_peer_ commands are deprecated in favor of the
 * rbd_mirror_peer_site_ equivalents */
rbd_mirror_peer_add( + rados_ioctx_t io_ctx, char *uuid, size_t uuid_max_length, + const char *cluster_name, const char *client_name) + CEPH_RBD_DEPRECATED; +CEPH_RBD_API int rbd_mirror_peer_remove( + rados_ioctx_t io_ctx, const char *uuid) + CEPH_RBD_DEPRECATED; +CEPH_RBD_API int rbd_mirror_peer_list( + rados_ioctx_t io_ctx, rbd_mirror_peer_t *peers, int *max_peers) + CEPH_RBD_DEPRECATED; +CEPH_RBD_API void rbd_mirror_peer_list_cleanup( + rbd_mirror_peer_t *peers, int max_peers) + CEPH_RBD_DEPRECATED; +CEPH_RBD_API int rbd_mirror_peer_set_client( + rados_ioctx_t io_ctx, const char *uuid, const char *client_name) + CEPH_RBD_DEPRECATED; +CEPH_RBD_API int rbd_mirror_peer_set_cluster( + rados_ioctx_t io_ctx, const char *uuid, const char *cluster_name) + CEPH_RBD_DEPRECATED; +CEPH_RBD_API int rbd_mirror_peer_get_attributes( + rados_ioctx_t p, const char *uuid, char *keys, size_t *max_key_len, + char *values, size_t *max_value_len, size_t *key_value_count) + CEPH_RBD_DEPRECATED; +CEPH_RBD_API int rbd_mirror_peer_set_attributes( + rados_ioctx_t p, const char *uuid, const char *keys, const char *values, + size_t key_value_count) + CEPH_RBD_DEPRECATED; + +/* rbd_mirror_image_status_list_ commands are deprecard to + * rbd_mirror_image_global_status_list_ commands */ + +CEPH_RBD_API int rbd_mirror_image_status_list( + rados_ioctx_t io_ctx, const char *start_id, size_t max, char **image_ids, + rbd_mirror_image_status_t *images, size_t *len) + CEPH_RBD_DEPRECATED; +CEPH_RBD_API void rbd_mirror_image_status_list_cleanup( + char **image_ids, rbd_mirror_image_status_t *images, size_t len) + CEPH_RBD_DEPRECATED; + +CEPH_RBD_API int rbd_mirror_image_status_summary( + rados_ioctx_t io_ctx, rbd_mirror_image_status_state_t *states, int *counts, + size_t *maxlen); + +CEPH_RBD_API int rbd_mirror_image_instance_id_list(rados_ioctx_t io_ctx, + const char *start_id, + size_t max, char **image_ids, + char **instance_ids, + size_t *len); +CEPH_RBD_API void 
rbd_mirror_image_instance_id_list_cleanup(char **image_ids, + char **instance_ids, + size_t len); +CEPH_RBD_API int rbd_mirror_image_info_list( + rados_ioctx_t io_ctx, rbd_mirror_image_mode_t *mode_filter, + const char *start_id, size_t max, char **image_ids, + rbd_mirror_image_mode_t *mode_entries, + rbd_mirror_image_info_t *info_entries, size_t *num_entries); +CEPH_RBD_API void rbd_mirror_image_info_list_cleanup( + char **image_ids, rbd_mirror_image_info_t *info_entries, + size_t num_entries); + +/* pool metadata */ +CEPH_RBD_API int rbd_pool_metadata_get(rados_ioctx_t io_ctx, const char *key, + char *value, size_t *val_len); +CEPH_RBD_API int rbd_pool_metadata_set(rados_ioctx_t io_ctx, const char *key, + const char *value); +CEPH_RBD_API int rbd_pool_metadata_remove(rados_ioctx_t io_ctx, + const char *key); +CEPH_RBD_API int rbd_pool_metadata_list(rados_ioctx_t io_ctx, const char *start, + uint64_t max, char *keys, + size_t *key_len, char *values, + size_t *vals_len); + +CEPH_RBD_API int rbd_config_pool_list(rados_ioctx_t io_ctx, + rbd_config_option_t *options, + int *max_options); +CEPH_RBD_API void rbd_config_pool_list_cleanup(rbd_config_option_t *options, + int max_options); + +CEPH_RBD_API int rbd_open(rados_ioctx_t io, const char *name, + rbd_image_t *image, const char *snap_name); +CEPH_RBD_API int rbd_open_by_id(rados_ioctx_t io, const char *id, + rbd_image_t *image, const char *snap_name); + +CEPH_RBD_API int rbd_aio_open(rados_ioctx_t io, const char *name, + rbd_image_t *image, const char *snap_name, + rbd_completion_t c); +CEPH_RBD_API int rbd_aio_open_by_id(rados_ioctx_t io, const char *id, + rbd_image_t *image, const char *snap_name, + rbd_completion_t c); + +/** + * Open an image in read-only mode. + * + * This is intended for use by clients that cannot write to a block + * device due to cephx restrictions. There will be no watch + * established on the header object, since a watch is a write. 
This + * means the metadata reported about this image (parents, snapshots, + * size, etc.) may become stale. This should not be used for + * long-running operations, unless you can be sure that one of these + * properties changing is safe. + * + * Attempting to write to a read-only image will return -EROFS. + * + * @param io ioctx to determine the pool the image is in + * @param name image name + * @param image where to store newly opened image handle + * @param snap_name name of snapshot to open at, or NULL for no snapshot + * @returns 0 on success, negative error code on failure + */ +CEPH_RBD_API int rbd_open_read_only(rados_ioctx_t io, const char *name, + rbd_image_t *image, const char *snap_name); +CEPH_RBD_API int rbd_open_by_id_read_only(rados_ioctx_t io, const char *id, + rbd_image_t *image, const char *snap_name); +CEPH_RBD_API int rbd_aio_open_read_only(rados_ioctx_t io, const char *name, + rbd_image_t *image, const char *snap_name, + rbd_completion_t c); +CEPH_RBD_API int rbd_aio_open_by_id_read_only(rados_ioctx_t io, const char *id, + rbd_image_t *image, const char *snap_name, + rbd_completion_t c); +CEPH_RBD_API int rbd_features_to_string(uint64_t features, char *str_features, + size_t *size); +CEPH_RBD_API int rbd_features_from_string(const char *str_features, uint64_t *features); +CEPH_RBD_API int rbd_close(rbd_image_t image); +CEPH_RBD_API int rbd_aio_close(rbd_image_t image, rbd_completion_t c); +CEPH_RBD_API int rbd_resize(rbd_image_t image, uint64_t size); +CEPH_RBD_API int rbd_resize2(rbd_image_t image, uint64_t size, bool allow_shrink, + librbd_progress_fn_t cb, void *cbdata); +CEPH_RBD_API int rbd_resize_with_progress(rbd_image_t image, uint64_t size, + librbd_progress_fn_t cb, void *cbdata); +CEPH_RBD_API int rbd_stat(rbd_image_t image, rbd_image_info_t *info, + size_t infosize); +CEPH_RBD_API int rbd_get_old_format(rbd_image_t image, uint8_t *old); +CEPH_RBD_API int rbd_get_size(rbd_image_t image, uint64_t *size); +CEPH_RBD_API int 
rbd_get_features(rbd_image_t image, uint64_t *features); +CEPH_RBD_API int rbd_update_features(rbd_image_t image, uint64_t features, + uint8_t enabled); +CEPH_RBD_API int rbd_get_op_features(rbd_image_t image, uint64_t *op_features); +CEPH_RBD_API int rbd_get_stripe_unit(rbd_image_t image, uint64_t *stripe_unit); +CEPH_RBD_API int rbd_get_stripe_count(rbd_image_t image, + uint64_t *stripe_count); + +CEPH_RBD_API int rbd_get_create_timestamp(rbd_image_t image, + struct timespec *timestamp); +CEPH_RBD_API int rbd_get_access_timestamp(rbd_image_t image, + struct timespec *timestamp); +CEPH_RBD_API int rbd_get_modify_timestamp(rbd_image_t image, + struct timespec *timestamp); + +CEPH_RBD_API int rbd_get_overlap(rbd_image_t image, uint64_t *overlap); +CEPH_RBD_API int rbd_get_name(rbd_image_t image, char *name, size_t *name_len); +CEPH_RBD_API int rbd_get_id(rbd_image_t image, char *id, size_t id_len); +CEPH_RBD_API int rbd_get_block_name_prefix(rbd_image_t image, + char *prefix, size_t prefix_len); +CEPH_RBD_API int64_t rbd_get_data_pool_id(rbd_image_t image); + +CEPH_RBD_API int rbd_get_parent_info(rbd_image_t image, + char *parent_poolname, size_t ppoolnamelen, + char *parent_name, size_t pnamelen, + char *parent_snapname, + size_t psnapnamelen) + CEPH_RBD_DEPRECATED; +CEPH_RBD_API int rbd_get_parent_info2(rbd_image_t image, + char *parent_poolname, + size_t ppoolnamelen, + char *parent_name, size_t pnamelen, + char *parent_id, size_t pidlen, + char *parent_snapname, + size_t psnapnamelen) + CEPH_RBD_DEPRECATED; +CEPH_RBD_API int rbd_get_parent(rbd_image_t image, + rbd_linked_image_spec_t *parent_image, + rbd_snap_spec_t *parent_snap); + +CEPH_RBD_API int rbd_get_migration_source_spec(rbd_image_t image, + char* source_spec, + size_t* max_len); + +CEPH_RBD_API int rbd_get_flags(rbd_image_t image, uint64_t *flags); +CEPH_RBD_API int rbd_get_group(rbd_image_t image, rbd_group_info_t *group_info, + size_t group_info_size); +CEPH_RBD_API int 
rbd_set_image_notification(rbd_image_t image, int fd, int type); + +/* exclusive lock feature */ +CEPH_RBD_API int rbd_is_exclusive_lock_owner(rbd_image_t image, int *is_owner); +CEPH_RBD_API int rbd_lock_acquire(rbd_image_t image, rbd_lock_mode_t lock_mode); +CEPH_RBD_API int rbd_lock_release(rbd_image_t image); +CEPH_RBD_API int rbd_lock_get_owners(rbd_image_t image, + rbd_lock_mode_t *lock_mode, + char **lock_owners, + size_t *max_lock_owners); +CEPH_RBD_API void rbd_lock_get_owners_cleanup(char **lock_owners, + size_t lock_owner_count); +CEPH_RBD_API int rbd_lock_break(rbd_image_t image, rbd_lock_mode_t lock_mode, + const char *lock_owner); + +/* object map feature */ +CEPH_RBD_API int rbd_rebuild_object_map(rbd_image_t image, + librbd_progress_fn_t cb, void *cbdata); + +CEPH_RBD_API int rbd_copy(rbd_image_t image, rados_ioctx_t dest_io_ctx, + const char *destname); +CEPH_RBD_API int rbd_copy2(rbd_image_t src, rbd_image_t dest); +CEPH_RBD_API int rbd_copy3(rbd_image_t src, rados_ioctx_t dest_io_ctx, + const char *destname, rbd_image_options_t dest_opts); +CEPH_RBD_API int rbd_copy4(rbd_image_t src, rados_ioctx_t dest_io_ctx, + const char *destname, rbd_image_options_t dest_opts, + size_t sparse_size); +CEPH_RBD_API int rbd_copy_with_progress(rbd_image_t image, rados_ioctx_t dest_p, + const char *destname, + librbd_progress_fn_t cb, void *cbdata); +CEPH_RBD_API int rbd_copy_with_progress2(rbd_image_t src, rbd_image_t dest, + librbd_progress_fn_t cb, void *cbdata); +CEPH_RBD_API int rbd_copy_with_progress3(rbd_image_t image, + rados_ioctx_t dest_p, + const char *destname, + rbd_image_options_t dest_opts, + librbd_progress_fn_t cb, void *cbdata); +CEPH_RBD_API int rbd_copy_with_progress4(rbd_image_t image, + rados_ioctx_t dest_p, + const char *destname, + rbd_image_options_t dest_opts, + librbd_progress_fn_t cb, void *cbdata, + size_t sparse_size); + +/* deep copy */ +CEPH_RBD_API int rbd_deep_copy(rbd_image_t src, rados_ioctx_t dest_io_ctx, + const char 
*destname, + rbd_image_options_t dest_opts); +CEPH_RBD_API int rbd_deep_copy_with_progress(rbd_image_t image, + rados_ioctx_t dest_io_ctx, + const char *destname, + rbd_image_options_t dest_opts, + librbd_progress_fn_t cb, + void *cbdata); + +/* encryption */ + +/* + * Format the image using the encryption spec specified by + * (format, opts, opts_size) tuple. + * + * For a flat (i.e. non-cloned) image, the new encryption is loaded + * implicitly, calling rbd_encryption_load() afterwards is not needed. + * If existing encryption is already loaded, it is automatically + * replaced with the new encryption. + * + * For a cloned image, the new encryption must be loaded explicitly. + * Existing encryption (if any) must not be loaded. + */ +CEPH_RBD_API int rbd_encryption_format(rbd_image_t image, + rbd_encryption_format_t format, + rbd_encryption_options_t opts, + size_t opts_size); +/* + * Load the encryption spec specified by (format, opts, opts_size) + * tuple for the image and all ancestor images. If an ancestor image + * which does not match any encryption format known to librbd is + * encountered, it - along with remaining ancestor images - is + * interpreted as plaintext. + */ +CEPH_RBD_API int rbd_encryption_load(rbd_image_t image, + rbd_encryption_format_t format, + rbd_encryption_options_t opts, + size_t opts_size); +/* + * Load encryption specs. The first spec in the passed array is + * applied to the image itself, the second spec is applied to its + * ancestor image, the third spec is applied to the ancestor of + * that ancestor image and so on. + * + * If not enough specs are passed, the last spec is reused exactly as + * in rbd_encryption_load(). If an ancestor image for which the last + * spec is being reused turns out to not match any encryption format + * known to librbd, it - along with remaining ancestor images - is + * interpreted as plaintext. 
+ */ +CEPH_RBD_API int rbd_encryption_load2(rbd_image_t image, + const rbd_encryption_spec_t *specs, + size_t spec_count); + +/* snapshots */ +CEPH_RBD_API int rbd_snap_list(rbd_image_t image, rbd_snap_info_t *snaps, + int *max_snaps); +CEPH_RBD_API void rbd_snap_list_end(rbd_snap_info_t *snaps); +CEPH_RBD_API int rbd_snap_exists(rbd_image_t image, const char *snapname, bool *exists); +CEPH_RBD_API int rbd_snap_create(rbd_image_t image, const char *snapname); +CEPH_RBD_API int rbd_snap_create2(rbd_image_t image, const char *snap_name, + uint32_t flags, librbd_progress_fn_t cb, + void *cbdata); +CEPH_RBD_API int rbd_snap_remove(rbd_image_t image, const char *snapname); +CEPH_RBD_API int rbd_snap_remove2(rbd_image_t image, const char *snap_name, + uint32_t flags, librbd_progress_fn_t cb, + void *cbdata); +CEPH_RBD_API int rbd_snap_remove_by_id(rbd_image_t image, uint64_t snap_id); +CEPH_RBD_API int rbd_snap_rollback(rbd_image_t image, const char *snapname); +CEPH_RBD_API int rbd_snap_rollback_with_progress(rbd_image_t image, + const char *snapname, + librbd_progress_fn_t cb, + void *cbdata); +CEPH_RBD_API int rbd_snap_rename(rbd_image_t image, const char *snapname, + const char* dstsnapsname); +/** + * Prevent a snapshot from being deleted until it is unprotected. + * + * @param snap_name which snapshot to protect + * @returns 0 on success, negative error code on failure + * @returns -EBUSY if snap is already protected + */ +CEPH_RBD_API int rbd_snap_protect(rbd_image_t image, const char *snap_name); +/** + * Allow a snaphshot to be deleted. + * + * @param snap_name which snapshot to unprotect + * @returns 0 on success, negative error code on failure + * @returns -EINVAL if snap is not protected + */ +CEPH_RBD_API int rbd_snap_unprotect(rbd_image_t image, const char *snap_name); +/** + * Determine whether a snapshot is protected. 
+ * + * @param snap_name which snapshot query + * @param is_protected where to store the result (0 or 1) + * @returns 0 on success, negative error code on failure + */ +CEPH_RBD_API int rbd_snap_is_protected(rbd_image_t image, const char *snap_name, + int *is_protected); +/** + * Get the current snapshot limit for an image. If no limit is set, + * UINT64_MAX is returned. + * + * @param limit pointer where the limit will be stored on success + * @returns 0 on success, negative error code on failure + */ +CEPH_RBD_API int rbd_snap_get_limit(rbd_image_t image, uint64_t *limit); + +/** + * Set a limit for the number of snapshots that may be taken of an image. + * + * @param limit the maximum number of snapshots allowed in the future. + * @returns 0 on success, negative error code on failure + */ +CEPH_RBD_API int rbd_snap_set_limit(rbd_image_t image, uint64_t limit); + +/** + * Get the timestamp of a snapshot for an image. + * + * @param snap_id the snap id of a snapshot of input image. + * @param timestamp the timestamp of input snapshot. 
+ * @returns 0 on success, negative error code on failure + */ +CEPH_RBD_API int rbd_snap_get_timestamp(rbd_image_t image, uint64_t snap_id, struct timespec *timestamp); + +CEPH_RBD_API int rbd_snap_set(rbd_image_t image, const char *snapname); +CEPH_RBD_API int rbd_snap_set_by_id(rbd_image_t image, uint64_t snap_id); +CEPH_RBD_API int rbd_snap_get_name(rbd_image_t image, uint64_t snap_id, char *snapname, size_t *name_len); +CEPH_RBD_API int rbd_snap_get_id(rbd_image_t image, const char *snapname, uint64_t *snap_id); + +CEPH_RBD_API int rbd_snap_get_namespace_type(rbd_image_t image, + uint64_t snap_id, + rbd_snap_namespace_type_t *namespace_type); +CEPH_RBD_API int rbd_snap_get_group_namespace(rbd_image_t image, + uint64_t snap_id, + rbd_snap_group_namespace_t *group_snap, + size_t group_snap_size); +CEPH_RBD_API int rbd_snap_group_namespace_cleanup(rbd_snap_group_namespace_t *group_snap, + size_t group_snap_size); +CEPH_RBD_API int rbd_snap_get_trash_namespace(rbd_image_t image, + uint64_t snap_id, + char* original_name, + size_t max_length); +CEPH_RBD_API int rbd_snap_get_mirror_namespace( + rbd_image_t image, uint64_t snap_id, + rbd_snap_mirror_namespace_t *mirror_snap, size_t mirror_snap_size); +CEPH_RBD_API int rbd_snap_mirror_namespace_cleanup( + rbd_snap_mirror_namespace_t *mirror_snap, size_t mirror_snap_size); + +CEPH_RBD_API int rbd_flatten(rbd_image_t image); + +CEPH_RBD_API int rbd_flatten_with_progress(rbd_image_t image, + librbd_progress_fn_t cb, + void *cbdata); + +CEPH_RBD_API int rbd_sparsify(rbd_image_t image, size_t sparse_size); + +CEPH_RBD_API int rbd_sparsify_with_progress(rbd_image_t image, + size_t sparse_size, + librbd_progress_fn_t cb, + void *cbdata); + +/** + * List all images that are cloned from the image at the + * snapshot that is set via rbd_snap_set(). + * + * This iterates over all pools, so it should be run by a user with + * read access to all of them. 
pools_len and images_len are filled in + * with the number of bytes put into the pools and images buffers. + * + * If the provided buffers are too short, the required lengths are + * still filled in, but the data is not and -ERANGE is returned. + * Otherwise, the buffers are filled with the pool and image names + * of the children, with a '\0' after each. + * + * @param image which image (and implicitly snapshot) to list clones of + * @param pools buffer in which to store pool names + * @param pools_len number of bytes in pools buffer + * @param images buffer in which to store image names + * @param images_len number of bytes in images buffer + * @returns number of children on success, negative error code on failure + * @returns -ERANGE if either buffer is too short + */ +CEPH_RBD_API ssize_t rbd_list_children(rbd_image_t image, char *pools, + size_t *pools_len, char *images, + size_t *images_len) + CEPH_RBD_DEPRECATED; +CEPH_RBD_API int rbd_list_children2(rbd_image_t image, + rbd_child_info_t *children, + int *max_children) + CEPH_RBD_DEPRECATED; +CEPH_RBD_API void rbd_list_child_cleanup(rbd_child_info_t *child) + CEPH_RBD_DEPRECATED; +CEPH_RBD_API void rbd_list_children_cleanup(rbd_child_info_t *children, + size_t num_children) + CEPH_RBD_DEPRECATED; + +CEPH_RBD_API int rbd_list_children3(rbd_image_t image, + rbd_linked_image_spec_t *images, + size_t *max_images); + +CEPH_RBD_API int rbd_list_descendants(rbd_image_t image, + rbd_linked_image_spec_t *images, + size_t *max_images); + +/** + * @defgroup librbd_h_locking Advisory Locking + * + * An rbd image may be locking exclusively, or shared, to facilitate + * e.g. live migration where the image may be open in two places at once. + * These locks are intended to guard against more than one client + * writing to an image without coordination. They don't need to + * be used for snapshots, since snapshots are read-only. + * + * Currently locks only guard against locks being acquired. 
+ * They do not prevent anything else. + * + * A locker is identified by the internal rados client id of the + * holder and a user-defined cookie. This (client id, cookie) pair + * must be unique for each locker. + * + * A shared lock also has a user-defined tag associated with it. Each + * additional shared lock must specify the same tag or lock + * acquisition will fail. This can be used by e.g. groups of hosts + * using a clustered filesystem on top of an rbd image to make sure + * they're accessing the correct image. + * + * @{ + */ +/** + * List clients that have locked the image and information about the lock. + * + * The number of bytes required in each buffer is put in the + * corresponding size out parameter. If any of the provided buffers + * are too short, -ERANGE is returned after these sizes are filled in. + * + * @param exclusive where to store whether the lock is exclusive (1) or shared (0) + * @param tag where to store the tag associated with the image + * @param tag_len number of bytes in tag buffer + * @param clients buffer in which locker clients are stored, separated by '\0' + * @param clients_len number of bytes in the clients buffer + * @param cookies buffer in which locker cookies are stored, separated by '\0' + * @param cookies_len number of bytes in the cookies buffer + * @param addrs buffer in which locker addresses are stored, separated by '\0' + * @param addrs_len number of bytes in the clients buffer + * @returns number of lockers on success, negative error code on failure + * @returns -ERANGE if any of the buffers are too short + */ +CEPH_RBD_API ssize_t rbd_list_lockers(rbd_image_t image, int *exclusive, + char *tag, size_t *tag_len, + char *clients, size_t *clients_len, + char *cookies, size_t *cookies_len, + char *addrs, size_t *addrs_len); + +/** + * Take an exclusive lock on the image. 
+ * + * @param image the image to lock + * @param cookie user-defined identifier for this instance of the lock + * @returns 0 on success, negative error code on failure + * @returns -EBUSY if the lock is already held by another (client, cookie) pair + * @returns -EEXIST if the lock is already held by the same (client, cookie) pair + */ +CEPH_RBD_API int rbd_lock_exclusive(rbd_image_t image, const char *cookie); + +/** + * Take a shared lock on the image. + * + * Other clients may also take a shared lock, as lock as they use the + * same tag. + * + * @param image the image to lock + * @param cookie user-defined identifier for this instance of the lock + * @param tag user-defined identifier for this shared use of the lock + * @returns 0 on success, negative error code on failure + * @returns -EBUSY if the lock is already held by another (client, cookie) pair + * @returns -EEXIST if the lock is already held by the same (client, cookie) pair + */ +CEPH_RBD_API int rbd_lock_shared(rbd_image_t image, const char *cookie, + const char *tag); + +/** + * Release a shared or exclusive lock on the image. + * + * @param image the image to unlock + * @param cookie user-defined identifier for the instance of the lock + * @returns 0 on success, negative error code on failure + * @returns -ENOENT if the lock is not held by the specified (client, cookie) pair + */ +CEPH_RBD_API int rbd_unlock(rbd_image_t image, const char *cookie); + +/** + * Release a shared or exclusive lock that was taken by the specified client. 
+ * + * @param image the image to unlock + * @param client the entity holding the lock (as given by rbd_list_lockers()) + * @param cookie user-defined identifier for the instance of the lock to break + * @returns 0 on success, negative error code on failure + * @returns -ENOENT if the lock is not held by the specified (client, cookie) pair + */ +CEPH_RBD_API int rbd_break_lock(rbd_image_t image, const char *client, + const char *cookie); + +/** @} locking */ + +/* I/O */ +CEPH_RBD_API ssize_t rbd_read(rbd_image_t image, uint64_t ofs, size_t len, + char *buf); +/* + * @param op_flags: see librados.h constants beginning with LIBRADOS_OP_FLAG + */ +CEPH_RBD_API ssize_t rbd_read2(rbd_image_t image, uint64_t ofs, size_t len, + char *buf, int op_flags); +/* DEPRECATED; use rbd_read_iterate2 */ +CEPH_RBD_API int64_t rbd_read_iterate(rbd_image_t image, uint64_t ofs, size_t len, + int (*cb)(uint64_t, size_t, const char *, void *), + void *arg); + +/** + * iterate read over an image + * + * Reads each region of the image and calls the callback. If the + * buffer pointer passed to the callback is NULL, the given extent is + * defined to be zeros (a hole). Normally the granularity for the + * callback is the image stripe size. + * + * @param image image to read + * @param ofs offset to start from + * @param len bytes of source image to cover + * @param cb callback for each region + * @returns 0 success, error otherwise + */ +CEPH_RBD_API int rbd_read_iterate2(rbd_image_t image, uint64_t ofs, uint64_t len, + int (*cb)(uint64_t, size_t, const char *, void *), + void *arg); +/** + * get difference between two versions of an image + * + * This will return the differences between two versions of an image + * via a callback, which gets the offset and length and a flag + * indicating whether the extent exists (1), or is known/defined to + * be zeros (a hole, 0). 
If the source snapshot name is NULL, we + * interpret that as the beginning of time and return all allocated + * regions of the image. The end version is whatever is currently + * selected for the image handle (either a snapshot or the writeable + * head). + * + * @param fromsnapname start snapshot name, or NULL + * @param ofs start offset + * @param len len in bytes of region to report on + * @param include_parent 1 if full history diff should include parent + * @param whole_object 1 if diff extents should cover whole object + * @param cb callback to call for each allocated region + * @param arg argument to pass to the callback + * @returns 0 on success, or negative error code on error + */ +CEPH_RBD_API int rbd_diff_iterate(rbd_image_t image, + const char *fromsnapname, + uint64_t ofs, uint64_t len, + int (*cb)(uint64_t, size_t, int, void *), + void *arg); +CEPH_RBD_API int rbd_diff_iterate2(rbd_image_t image, + const char *fromsnapname, + uint64_t ofs, uint64_t len, + uint8_t include_parent, uint8_t whole_object, + int (*cb)(uint64_t, size_t, int, void *), + void *arg); +CEPH_RBD_API ssize_t rbd_write(rbd_image_t image, uint64_t ofs, size_t len, + const char *buf); +/* + * @param op_flags: see librados.h constants beginning with LIBRADOS_OP_FLAG + */ +CEPH_RBD_API ssize_t rbd_write2(rbd_image_t image, uint64_t ofs, size_t len, + const char *buf, int op_flags); +CEPH_RBD_API int rbd_discard(rbd_image_t image, uint64_t ofs, uint64_t len); +CEPH_RBD_API ssize_t rbd_writesame(rbd_image_t image, uint64_t ofs, size_t len, + const char *buf, size_t data_len, + int op_flags); +CEPH_RBD_API ssize_t rbd_write_zeroes(rbd_image_t image, uint64_t ofs, + size_t len, int zero_flags, + int op_flags); +CEPH_RBD_API ssize_t rbd_compare_and_write(rbd_image_t image, uint64_t ofs, + size_t len, const char *cmp_buf, + const char *buf, + uint64_t *mismatch_off, + int op_flags); + +CEPH_RBD_API int rbd_aio_write(rbd_image_t image, uint64_t off, size_t len, + const char *buf, 
rbd_completion_t c); + +/* + * @param op_flags: see librados.h constants beginning with LIBRADOS_OP_FLAG + */ +CEPH_RBD_API int rbd_aio_write2(rbd_image_t image, uint64_t off, size_t len, + const char *buf, rbd_completion_t c, + int op_flags); +CEPH_RBD_API int rbd_aio_writev(rbd_image_t image, const struct iovec *iov, + int iovcnt, uint64_t off, rbd_completion_t c); +CEPH_RBD_API int rbd_aio_read(rbd_image_t image, uint64_t off, size_t len, + char *buf, rbd_completion_t c); +/* + * @param op_flags: see librados.h constants beginning with LIBRADOS_OP_FLAG + */ +CEPH_RBD_API int rbd_aio_read2(rbd_image_t image, uint64_t off, size_t len, + char *buf, rbd_completion_t c, int op_flags); +CEPH_RBD_API int rbd_aio_readv(rbd_image_t image, const struct iovec *iov, + int iovcnt, uint64_t off, rbd_completion_t c); +CEPH_RBD_API int rbd_aio_discard(rbd_image_t image, uint64_t off, uint64_t len, + rbd_completion_t c); +CEPH_RBD_API int rbd_aio_writesame(rbd_image_t image, uint64_t off, size_t len, + const char *buf, size_t data_len, + rbd_completion_t c, int op_flags); +CEPH_RBD_API int rbd_aio_write_zeroes(rbd_image_t image, uint64_t off, + size_t len, rbd_completion_t c, + int zero_flags, int op_flags); +CEPH_RBD_API ssize_t rbd_aio_compare_and_write(rbd_image_t image, + uint64_t off, size_t len, + const char *cmp_buf, + const char *buf, + rbd_completion_t c, + uint64_t *mismatch_off, + int op_flags); +CEPH_RBD_API ssize_t rbd_aio_compare_and_writev(rbd_image_t image, + uint64_t off, + const struct iovec *cmp_iov, + int cmp_iovcnt, + const struct iovec *iov, + int iovcnt, + rbd_completion_t c, + uint64_t *mismatch_off, + int op_flags); + +CEPH_RBD_API int rbd_aio_create_completion(void *cb_arg, + rbd_callback_t complete_cb, + rbd_completion_t *c); +CEPH_RBD_API int rbd_aio_is_complete(rbd_completion_t c); +CEPH_RBD_API int rbd_aio_wait_for_complete(rbd_completion_t c); +CEPH_RBD_API ssize_t rbd_aio_get_return_value(rbd_completion_t c); +CEPH_RBD_API void 
*rbd_aio_get_arg(rbd_completion_t c); +CEPH_RBD_API void rbd_aio_release(rbd_completion_t c); +CEPH_RBD_API int rbd_flush(rbd_image_t image); +/** + * Start a flush if caching is enabled. Get a callback when + * the currently pending writes are on disk. + * + * @param image the image to flush writes to + * @param c what to call when flushing is complete + * @returns 0 on success, negative error code on failure + */ +CEPH_RBD_API int rbd_aio_flush(rbd_image_t image, rbd_completion_t c); + +/** + * Drop any cached data for an image + * + * @param image the image to invalidate cached data for + * @returns 0 on success, negative error code on failure + */ +CEPH_RBD_API int rbd_invalidate_cache(rbd_image_t image); + +CEPH_RBD_API int rbd_poll_io_events(rbd_image_t image, rbd_completion_t *comps, int numcomp); + +CEPH_RBD_API int rbd_metadata_get(rbd_image_t image, const char *key, char *value, size_t *val_len); +CEPH_RBD_API int rbd_metadata_set(rbd_image_t image, const char *key, const char *value); +CEPH_RBD_API int rbd_metadata_remove(rbd_image_t image, const char *key); +/** + * List all metadatas associated with this image. + * + * This iterates over all metadatas, key_len and val_len are filled in + * with the number of bytes put into the keys and values buffers. + * + * If the provided buffers are too short, the required lengths are + * still filled in, but the data is not and -ERANGE is returned. + * Otherwise, the buffers are filled with the keys and values + * of the image, with a '\0' after each. 
+ * + * @param image which image (and implicitly snapshot) to list clones of + * @param start_after which name to begin listing after + * (use the empty string to start at the beginning) + * @param max the maximum number of names to lis(if 0 means no limit) + * @param keys buffer in which to store pool names + * @param keys_len number of bytes in pools buffer + * @param values buffer in which to store image names + * @param vals_len number of bytes in images buffer + * @returns number of children on success, negative error code on failure + * @returns -ERANGE if either buffer is too short + */ +CEPH_RBD_API int rbd_metadata_list(rbd_image_t image, const char *start, uint64_t max, + char *keys, size_t *key_len, char *values, size_t *vals_len); + +// RBD image mirroring support functions +CEPH_RBD_API int rbd_mirror_image_enable(rbd_image_t image) CEPH_RBD_DEPRECATED; +CEPH_RBD_API int rbd_mirror_image_enable2(rbd_image_t image, + rbd_mirror_image_mode_t mode); +CEPH_RBD_API int rbd_mirror_image_disable(rbd_image_t image, bool force); +CEPH_RBD_API int rbd_mirror_image_promote(rbd_image_t image, bool force); +CEPH_RBD_API int rbd_mirror_image_demote(rbd_image_t image); +CEPH_RBD_API int rbd_mirror_image_resync(rbd_image_t image); +CEPH_RBD_API int rbd_mirror_image_create_snapshot(rbd_image_t image, + uint64_t *snap_id); +CEPH_RBD_API int rbd_mirror_image_create_snapshot2(rbd_image_t image, + uint32_t flags, + uint64_t *snap_id); +CEPH_RBD_API int rbd_mirror_image_get_info(rbd_image_t image, + rbd_mirror_image_info_t *mirror_image_info, + size_t info_size); +CEPH_RBD_API void rbd_mirror_image_get_info_cleanup( + rbd_mirror_image_info_t *mirror_image_info); +CEPH_RBD_API int rbd_mirror_image_get_mode(rbd_image_t image, + rbd_mirror_image_mode_t *mode); + +CEPH_RBD_API int rbd_mirror_image_get_global_status( + rbd_image_t image, + rbd_mirror_image_global_status_t *mirror_image_global_status, + size_t status_size); +CEPH_RBD_API void 
rbd_mirror_image_global_status_cleanup( + rbd_mirror_image_global_status_t *mirror_image_global_status); + +CEPH_RBD_API int rbd_mirror_image_get_status( + rbd_image_t image, rbd_mirror_image_status_t *mirror_image_status, + size_t status_size) + CEPH_RBD_DEPRECATED; + +CEPH_RBD_API int rbd_mirror_image_get_instance_id(rbd_image_t image, + char *instance_id, + size_t *id_max_length); +CEPH_RBD_API int rbd_aio_mirror_image_promote(rbd_image_t image, bool force, + rbd_completion_t c); +CEPH_RBD_API int rbd_aio_mirror_image_demote(rbd_image_t image, + rbd_completion_t c); +CEPH_RBD_API int rbd_aio_mirror_image_get_info(rbd_image_t image, + rbd_mirror_image_info_t *mirror_image_info, + size_t info_size, + rbd_completion_t c); +CEPH_RBD_API int rbd_aio_mirror_image_get_mode(rbd_image_t image, + rbd_mirror_image_mode_t *mode, + rbd_completion_t c); + +CEPH_RBD_API int rbd_aio_mirror_image_get_global_status( + rbd_image_t image, + rbd_mirror_image_global_status_t *mirror_global_image_status, + size_t status_size, rbd_completion_t c); +CEPH_RBD_API int rbd_aio_mirror_image_get_status( + rbd_image_t image, rbd_mirror_image_status_t *mirror_image_status, + size_t status_size, rbd_completion_t c) + CEPH_RBD_DEPRECATED; + +CEPH_RBD_API int rbd_aio_mirror_image_create_snapshot(rbd_image_t image, + uint32_t flags, + uint64_t *snap_id, + rbd_completion_t c); + +// RBD groups support functions +CEPH_RBD_API int rbd_group_create(rados_ioctx_t p, const char *name); +CEPH_RBD_API int rbd_group_remove(rados_ioctx_t p, const char *name); +CEPH_RBD_API int rbd_group_list(rados_ioctx_t p, char *names, size_t *size); +CEPH_RBD_API int rbd_group_rename(rados_ioctx_t p, const char *src_name, + const char *dest_name); +CEPH_RBD_API int rbd_group_info_cleanup(rbd_group_info_t *group_info, + size_t group_info_size); + +/** + * Register an image metadata change watcher. 
+ * + * @param image the image to watch + * @param handle where to store the internal id assigned to this watch + * @param watch_cb what to do when a notify is received on this image + * @param arg opaque value to pass to the callback + * @returns 0 on success, negative error code on failure + */ +CEPH_RBD_API int rbd_update_watch(rbd_image_t image, uint64_t *handle, + rbd_update_callback_t watch_cb, void *arg); + +/** + * Unregister an image watcher. + * + * @param image the image to unwatch + * @param handle which watch to unregister + * @returns 0 on success, negative error code on failure + */ +CEPH_RBD_API int rbd_update_unwatch(rbd_image_t image, uint64_t handle); + +/** + * List any watchers of an image. + * + * Watchers will be allocated and stored in the passed watchers array. If there + * are more watchers than max_watchers, -ERANGE will be returned and the number + * of watchers will be stored in max_watchers. + * + * The caller should call rbd_watchers_list_cleanup when finished with the list + * of watchers. + * + * @param image the image to list watchers for. + * @param watchers an array to store watchers in. + * @param max_watchers capacity of the watchers array. + * @returns 0 on success, negative error code on failure. + * @returns -ERANGE if there are too many watchers for the passed array. + * @returns the number of watchers in max_watchers. 
+ */ +CEPH_RBD_API int rbd_watchers_list(rbd_image_t image, + rbd_image_watcher_t *watchers, + size_t *max_watchers); + +CEPH_RBD_API void rbd_watchers_list_cleanup(rbd_image_watcher_t *watchers, + size_t num_watchers); + +CEPH_RBD_API int rbd_config_image_list(rbd_image_t image, + rbd_config_option_t *options, + int *max_options); +CEPH_RBD_API void rbd_config_image_list_cleanup(rbd_config_option_t *options, + int max_options); + +CEPH_RBD_API int rbd_group_image_add(rados_ioctx_t group_p, + const char *group_name, + rados_ioctx_t image_p, + const char *image_name); +CEPH_RBD_API int rbd_group_image_remove(rados_ioctx_t group_p, + const char *group_name, + rados_ioctx_t image_p, + const char *image_name); +CEPH_RBD_API int rbd_group_image_remove_by_id(rados_ioctx_t group_p, + const char *group_name, + rados_ioctx_t image_p, + const char *image_id); +CEPH_RBD_API int rbd_group_image_list(rados_ioctx_t group_p, + const char *group_name, + rbd_group_image_info_t *images, + size_t group_image_info_size, + size_t *num_entries); +CEPH_RBD_API int rbd_group_image_list_cleanup(rbd_group_image_info_t *images, + size_t group_image_info_size, + size_t num_entries); + +CEPH_RBD_API int rbd_group_snap_create(rados_ioctx_t group_p, + const char *group_name, + const char *snap_name); +CEPH_RBD_API int rbd_group_snap_create2(rados_ioctx_t group_p, + const char *group_name, + const char *snap_name, + uint32_t flags); +CEPH_RBD_API int rbd_group_snap_remove(rados_ioctx_t group_p, + const char *group_name, + const char *snap_name); +CEPH_RBD_API int rbd_group_snap_rename(rados_ioctx_t group_p, + const char *group_name, + const char *old_snap_name, + const char *new_snap_name); +CEPH_RBD_API int rbd_group_snap_list(rados_ioctx_t group_p, + const char *group_name, + rbd_group_snap_info_t *snaps, + size_t group_snap_info_size, + size_t *num_entries); +CEPH_RBD_API int rbd_group_snap_list_cleanup(rbd_group_snap_info_t *snaps, + size_t group_snap_info_size, + size_t num_entries); 
+CEPH_RBD_API int rbd_group_snap_rollback(rados_ioctx_t group_p, + const char *group_name, + const char *snap_name); +CEPH_RBD_API int rbd_group_snap_rollback_with_progress(rados_ioctx_t group_p, + const char *group_name, + const char *snap_name, + librbd_progress_fn_t cb, + void *cbdata); + +CEPH_RBD_API int rbd_namespace_create(rados_ioctx_t io, + const char *namespace_name); +CEPH_RBD_API int rbd_namespace_remove(rados_ioctx_t io, + const char *namespace_name); +CEPH_RBD_API int rbd_namespace_list(rados_ioctx_t io, char *namespace_names, + size_t *size); +CEPH_RBD_API int rbd_namespace_exists(rados_ioctx_t io, + const char *namespace_name, + bool *exists); + +CEPH_RBD_API int rbd_pool_init(rados_ioctx_t io, bool force); + +CEPH_RBD_API void rbd_pool_stats_create(rbd_pool_stats_t *stats); +CEPH_RBD_API void rbd_pool_stats_destroy(rbd_pool_stats_t stats); +CEPH_RBD_API int rbd_pool_stats_option_add_uint64(rbd_pool_stats_t stats, + int stat_option, + uint64_t* stat_val); +CEPH_RBD_API int rbd_pool_stats_get(rados_ioctx_t io, rbd_pool_stats_t stats); + +/** + * Register a quiesce/unquiesce watcher. + * + * @param image the image to watch + * @param quiesce_cb what to do when librbd wants to quiesce + * @param unquiesce_cb what to do when librbd wants to unquiesce + * @param arg opaque value to pass to the callbacks + * @param handle where to store the internal id assigned to this watch + * @returns 0 on success, negative error code on failure + */ +CEPH_RBD_API int rbd_quiesce_watch(rbd_image_t image, + rbd_update_callback_t quiesce_cb, + rbd_update_callback_t unquiesce_cb, + void *arg, uint64_t *handle); + +/** + * Notify quiesce is complete + * + * @param image the image to notify + * @param handle which watch is complete + * @param r the return code + */ +CEPH_RBD_API void rbd_quiesce_complete(rbd_image_t image, uint64_t handle, + int r); + +/** + * Unregister a quiesce/unquiesce watcher. 
+ * + * @param image the image to unwatch + * @param handle which watch to unregister + * @returns 0 on success, negative error code on failure + */ +CEPH_RBD_API int rbd_quiesce_unwatch(rbd_image_t image, uint64_t handle); + +#if __GNUC__ >= 4 + #pragma GCC diagnostic pop +#endif + +#ifdef __cplusplus +} +#endif + +#endif /* CEPH_LIBRBD_H */ diff --git a/src/include/rbd/librbd.hpp b/src/include/rbd/librbd.hpp new file mode 100644 index 000000000..5d307cded --- /dev/null +++ b/src/include/rbd/librbd.hpp @@ -0,0 +1,869 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2011 New Dream Network + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. + * + */ + +#ifndef __LIBRBD_HPP +#define __LIBRBD_HPP + +#include <string> +#include <list> +#include <map> +#include <vector> +#include "../rados/buffer.h" +#include "../rados/librados.hpp" +#include "librbd.h" + +#if __GNUC__ >= 4 + #pragma GCC diagnostic push + #pragma GCC diagnostic ignored "-Wdeprecated-declarations" +#endif + +namespace librbd { + + using librados::IoCtx; + + class Image; + class ImageOptions; + class PoolStats; + typedef void *image_ctx_t; + typedef void *completion_t; + typedef void (*callback_t)(completion_t cb, void *arg); + + typedef struct { + std::string id; + std::string name; + } image_spec_t; + + typedef struct { + int64_t pool_id; + std::string pool_name; + std::string pool_namespace; + std::string image_id; + std::string image_name; + bool trash; + } linked_image_spec_t; + + typedef rbd_snap_namespace_type_t snap_namespace_type_t; + + typedef struct { + uint64_t id; + snap_namespace_type_t namespace_type; + std::string name; + } snap_spec_t; + + typedef struct { + uint64_t id; + uint64_t size; + std::string 
name; + } snap_info_t; + + typedef struct { + int64_t group_pool; + std::string group_name; + std::string group_snap_name; + } snap_group_namespace_t; + + typedef rbd_snap_mirror_state_t snap_mirror_state_t; + + typedef struct { + snap_mirror_state_t state; + std::set<std::string> mirror_peer_uuids; + bool complete; + std::string primary_mirror_uuid; + uint64_t primary_snap_id; + uint64_t last_copied_object_number; + } snap_mirror_namespace_t; + + typedef struct { + std::string client; + std::string cookie; + std::string address; + } locker_t; + + typedef rbd_mirror_peer_direction_t mirror_peer_direction_t; + + typedef struct { + std::string uuid; + std::string cluster_name; + std::string client_name; + } mirror_peer_t CEPH_RBD_DEPRECATED; + + typedef struct { + std::string uuid; + mirror_peer_direction_t direction; + std::string site_name; + std::string mirror_uuid; + std::string client_name; + time_t last_seen; + } mirror_peer_site_t; + + typedef rbd_mirror_image_mode_t mirror_image_mode_t; + typedef rbd_mirror_image_state_t mirror_image_state_t; + + typedef struct { + std::string global_id; + mirror_image_state_t state; + bool primary; + } mirror_image_info_t; + + typedef rbd_mirror_image_status_state_t mirror_image_status_state_t; + + typedef struct { + std::string name; + mirror_image_info_t info; + mirror_image_status_state_t state; + std::string description; + time_t last_update; + bool up; + } mirror_image_status_t CEPH_RBD_DEPRECATED; + + typedef struct { + std::string mirror_uuid; + mirror_image_status_state_t state; + std::string description; + time_t last_update; + bool up; + } mirror_image_site_status_t; + + typedef struct { + std::string name; + mirror_image_info_t info; + std::vector<mirror_image_site_status_t> site_statuses; + } mirror_image_global_status_t; + + typedef rbd_group_image_state_t group_image_state_t; + + typedef struct { + std::string name; + int64_t pool; + group_image_state_t state; + } group_image_info_t; + + typedef struct { + 
std::string name; + int64_t pool; + } group_info_t; + + typedef rbd_group_snap_state_t group_snap_state_t; + + typedef struct { + std::string name; + group_snap_state_t state; + } group_snap_info_t; + + typedef rbd_image_info_t image_info_t; + + class CEPH_RBD_API ProgressContext + { + public: + virtual ~ProgressContext(); + virtual int update_progress(uint64_t offset, uint64_t total) = 0; + }; + + typedef struct { + std::string id; + std::string name; + rbd_trash_image_source_t source; + time_t deletion_time; + time_t deferment_end_time; + } trash_image_info_t; + + typedef struct { + std::string pool_name; + std::string image_name; + std::string image_id; + bool trash; + } child_info_t; + + typedef struct { + std::string addr; + int64_t id; + uint64_t cookie; + } image_watcher_t; + + typedef rbd_image_migration_state_t image_migration_state_t; + + typedef struct { + int64_t source_pool_id; + std::string source_pool_namespace; + std::string source_image_name; + std::string source_image_id; + int64_t dest_pool_id; + std::string dest_pool_namespace; + std::string dest_image_name; + std::string dest_image_id; + image_migration_state_t state; + std::string state_description; + } image_migration_status_t; + + typedef rbd_config_source_t config_source_t; + + typedef struct { + std::string name; + std::string value; + config_source_t source; + } config_option_t; + + typedef rbd_encryption_format_t encryption_format_t; + typedef rbd_encryption_algorithm_t encryption_algorithm_t; + typedef rbd_encryption_options_t encryption_options_t; + typedef rbd_encryption_spec_t encryption_spec_t; + + typedef struct { + encryption_algorithm_t alg; + std::string passphrase; + } encryption_luks1_format_options_t; + + typedef struct { + encryption_algorithm_t alg; + std::string passphrase; + } encryption_luks2_format_options_t; + + typedef struct { + std::string passphrase; + } encryption_luks_format_options_t; + +class CEPH_RBD_API RBD +{ +public: + RBD(); + ~RBD(); + + // This must be 
dynamically allocated with new, and + // must be released with release(). + // Do not use delete. + struct AioCompletion { + void *pc; + AioCompletion(void *cb_arg, callback_t complete_cb); + bool is_complete(); + int wait_for_complete(); + ssize_t get_return_value(); + void *get_arg(); + void release(); + }; + + void version(int *major, int *minor, int *extra); + + int open(IoCtx& io_ctx, Image& image, const char *name); + int open(IoCtx& io_ctx, Image& image, const char *name, const char *snapname); + int open_by_id(IoCtx& io_ctx, Image& image, const char *id); + int open_by_id(IoCtx& io_ctx, Image& image, const char *id, const char *snapname); + int aio_open(IoCtx& io_ctx, Image& image, const char *name, + const char *snapname, RBD::AioCompletion *c); + int aio_open_by_id(IoCtx& io_ctx, Image& image, const char *id, + const char *snapname, RBD::AioCompletion *c); + // see librbd.h + int open_read_only(IoCtx& io_ctx, Image& image, const char *name, + const char *snapname); + int open_by_id_read_only(IoCtx& io_ctx, Image& image, const char *id, + const char *snapname); + int aio_open_read_only(IoCtx& io_ctx, Image& image, const char *name, + const char *snapname, RBD::AioCompletion *c); + int aio_open_by_id_read_only(IoCtx& io_ctx, Image& image, const char *id, + const char *snapname, RBD::AioCompletion *c); + int features_to_string(uint64_t features, std::string *str_features); + int features_from_string(const std::string str_features, uint64_t *features); + + int list(IoCtx& io_ctx, std::vector<std::string>& names) + CEPH_RBD_DEPRECATED; + int list2(IoCtx& io_ctx, std::vector<image_spec_t>* images); + + int create(IoCtx& io_ctx, const char *name, uint64_t size, int *order); + int create2(IoCtx& io_ctx, const char *name, uint64_t size, + uint64_t features, int *order); + int create3(IoCtx& io_ctx, const char *name, uint64_t size, + uint64_t features, int *order, + uint64_t stripe_unit, uint64_t stripe_count); + int create4(IoCtx& io_ctx, const char *name, 
uint64_t size, + ImageOptions& opts); + int clone(IoCtx& p_ioctx, const char *p_name, const char *p_snapname, + IoCtx& c_ioctx, const char *c_name, uint64_t features, + int *c_order); + int clone2(IoCtx& p_ioctx, const char *p_name, const char *p_snapname, + IoCtx& c_ioctx, const char *c_name, uint64_t features, + int *c_order, uint64_t stripe_unit, int stripe_count); + int clone3(IoCtx& p_ioctx, const char *p_name, const char *p_snapname, + IoCtx& c_ioctx, const char *c_name, ImageOptions& opts); + int remove(IoCtx& io_ctx, const char *name); + int remove_with_progress(IoCtx& io_ctx, const char *name, ProgressContext& pctx); + int rename(IoCtx& src_io_ctx, const char *srcname, const char *destname); + + int trash_move(IoCtx &io_ctx, const char *name, uint64_t delay); + int trash_get(IoCtx &io_ctx, const char *id, trash_image_info_t *info); + int trash_list(IoCtx &io_ctx, std::vector<trash_image_info_t> &entries); + int trash_purge(IoCtx &io_ctx, time_t expire_ts, float threshold); + int trash_purge_with_progress(IoCtx &io_ctx, time_t expire_ts, float threshold, + ProgressContext &pctx); + int trash_remove(IoCtx &io_ctx, const char *image_id, bool force); + int trash_remove_with_progress(IoCtx &io_ctx, const char *image_id, + bool force, ProgressContext &pctx); + int trash_restore(IoCtx &io_ctx, const char *id, const char *name); + + // Migration + int migration_prepare(IoCtx& io_ctx, const char *image_name, + IoCtx& dest_io_ctx, const char *dest_image_name, + ImageOptions& opts); + int migration_prepare_import(const char *source_spec, IoCtx& dest_io_ctx, + const char *dest_image_name, ImageOptions& opts); + int migration_execute(IoCtx& io_ctx, const char *image_name); + int migration_execute_with_progress(IoCtx& io_ctx, const char *image_name, + ProgressContext &prog_ctx); + int migration_abort(IoCtx& io_ctx, const char *image_name); + int migration_abort_with_progress(IoCtx& io_ctx, const char *image_name, + ProgressContext &prog_ctx); + int 
migration_commit(IoCtx& io_ctx, const char *image_name); + int migration_commit_with_progress(IoCtx& io_ctx, const char *image_name, + ProgressContext &prog_ctx); + int migration_status(IoCtx& io_ctx, const char *image_name, + image_migration_status_t *status, size_t status_size); + + // RBD pool mirroring support functions + int mirror_site_name_get(librados::Rados& rados, std::string* site_name); + int mirror_site_name_set(librados::Rados& rados, + const std::string& site_name); + + int mirror_mode_get(IoCtx& io_ctx, rbd_mirror_mode_t *mirror_mode); + int mirror_mode_set(IoCtx& io_ctx, rbd_mirror_mode_t mirror_mode); + + int mirror_uuid_get(IoCtx& io_ctx, std::string* mirror_uuid); + + int mirror_peer_bootstrap_create(IoCtx& io_ctx, std::string* token); + int mirror_peer_bootstrap_import(IoCtx& io_ctx, + mirror_peer_direction_t direction, + const std::string &token); + + int mirror_peer_site_add(IoCtx& io_ctx, std::string *uuid, + mirror_peer_direction_t direction, + const std::string &site_name, + const std::string &client_name); + int mirror_peer_site_set_name(IoCtx& io_ctx, const std::string& uuid, + const std::string &site_name); + int mirror_peer_site_set_client_name(IoCtx& io_ctx, const std::string& uuid, + const std::string &client_name); + int mirror_peer_site_set_direction(IoCtx& io_ctx, const std::string& uuid, + mirror_peer_direction_t direction); + int mirror_peer_site_remove(IoCtx& io_ctx, const std::string& uuid); + int mirror_peer_site_list(IoCtx& io_ctx, + std::vector<mirror_peer_site_t> *peers); + int mirror_peer_site_get_attributes( + IoCtx& io_ctx, const std::string &uuid, + std::map<std::string, std::string> *key_vals); + int mirror_peer_site_set_attributes( + IoCtx& io_ctx, const std::string &uuid, + const std::map<std::string, std::string>& key_vals); + + int mirror_image_global_status_list( + IoCtx& io_ctx, const std::string &start_id, size_t max, + std::map<std::string, mirror_image_global_status_t> *images); + int 
mirror_image_status_summary(IoCtx& io_ctx, + std::map<mirror_image_status_state_t, int> *states); + int mirror_image_instance_id_list(IoCtx& io_ctx, const std::string &start_id, + size_t max, std::map<std::string, std::string> *sevice_ids); + int mirror_image_info_list(IoCtx& io_ctx, mirror_image_mode_t *mode_filter, + const std::string &start_id, size_t max, + std::map<std::string, std::pair<mirror_image_mode_t, + mirror_image_info_t>> *entries); + + /// mirror_peer_ commands are deprecated to mirror_peer_site_ equivalents + int mirror_peer_add(IoCtx& io_ctx, std::string *uuid, + const std::string &cluster_name, + const std::string &client_name) + CEPH_RBD_DEPRECATED; + int mirror_peer_remove(IoCtx& io_ctx, const std::string &uuid) + CEPH_RBD_DEPRECATED; + int mirror_peer_list(IoCtx& io_ctx, std::vector<mirror_peer_t> *peers) + CEPH_RBD_DEPRECATED; + int mirror_peer_set_client(IoCtx& io_ctx, const std::string &uuid, + const std::string &client_name) + CEPH_RBD_DEPRECATED; + int mirror_peer_set_cluster(IoCtx& io_ctx, const std::string &uuid, + const std::string &cluster_name) + CEPH_RBD_DEPRECATED; + int mirror_peer_get_attributes( + IoCtx& io_ctx, const std::string &uuid, + std::map<std::string, std::string> *key_vals) + CEPH_RBD_DEPRECATED; + int mirror_peer_set_attributes( + IoCtx& io_ctx, const std::string &uuid, + const std::map<std::string, std::string>& key_vals) + CEPH_RBD_DEPRECATED; + + /// mirror_image_status_list command is deprecated to + /// mirror_image_global_status_list + + int mirror_image_status_list( + IoCtx& io_ctx, const std::string &start_id, size_t max, + std::map<std::string, mirror_image_status_t> *images) + CEPH_RBD_DEPRECATED; + + // RBD groups support functions + int group_create(IoCtx& io_ctx, const char *group_name); + int group_remove(IoCtx& io_ctx, const char *group_name); + int group_list(IoCtx& io_ctx, std::vector<std::string> *names); + int group_rename(IoCtx& io_ctx, const char *src_group_name, + const char *dest_group_name); + 
+ int group_image_add(IoCtx& io_ctx, const char *group_name, + IoCtx& image_io_ctx, const char *image_name); + int group_image_remove(IoCtx& io_ctx, const char *group_name, + IoCtx& image_io_ctx, const char *image_name); + int group_image_remove_by_id(IoCtx& io_ctx, const char *group_name, + IoCtx& image_io_ctx, const char *image_id); + int group_image_list(IoCtx& io_ctx, const char *group_name, + std::vector<group_image_info_t> *images, + size_t group_image_info_size); + + int group_snap_create(IoCtx& io_ctx, const char *group_name, + const char *snap_name); + int group_snap_create2(IoCtx& io_ctx, const char *group_name, + const char *snap_name, uint32_t flags); + int group_snap_remove(IoCtx& io_ctx, const char *group_name, + const char *snap_name); + int group_snap_rename(IoCtx& group_ioctx, const char *group_name, + const char *old_snap_name, const char *new_snap_name); + int group_snap_list(IoCtx& group_ioctx, const char *group_name, + std::vector<group_snap_info_t> *snaps, + size_t group_snap_info_size); + int group_snap_rollback(IoCtx& io_ctx, const char *group_name, + const char *snap_name); + int group_snap_rollback_with_progress(IoCtx& io_ctx, const char *group_name, + const char *snap_name, + ProgressContext& pctx); + + int namespace_create(IoCtx& ioctx, const char *namespace_name); + int namespace_remove(IoCtx& ioctx, const char *namespace_name); + int namespace_list(IoCtx& io_ctx, std::vector<std::string>* namespace_names); + int namespace_exists(IoCtx& io_ctx, const char *namespace_name, bool *exists); + + int pool_init(IoCtx& io_ctx, bool force); + int pool_stats_get(IoCtx& io_ctx, PoolStats *pool_stats); + + int pool_metadata_get(IoCtx &io_ctx, const std::string &key, + std::string *value); + int pool_metadata_set(IoCtx &io_ctx, const std::string &key, + const std::string &value); + int pool_metadata_remove(IoCtx &io_ctx, const std::string &key); + int pool_metadata_list(IoCtx &io_ctx, const std::string &start, uint64_t max, + std::map<std::string, 
ceph::bufferlist> *pairs); + + int config_list(IoCtx& io_ctx, std::vector<config_option_t> *options); + +private: + /* We don't allow assignment or copying */ + RBD(const RBD& rhs); + const RBD& operator=(const RBD& rhs); +}; + +class CEPH_RBD_API ImageOptions { +public: + ImageOptions(); + ImageOptions(rbd_image_options_t opts); + ImageOptions(const ImageOptions &imgopts); + ~ImageOptions(); + + int set(int optname, const std::string& optval); + int set(int optname, uint64_t optval); + int get(int optname, std::string* optval) const; + int get(int optname, uint64_t* optval) const; + int is_set(int optname, bool* is_set); + int unset(int optname); + void clear(); + bool empty() const; + +private: + friend class RBD; + friend class Image; + + rbd_image_options_t opts; +}; + +class CEPH_RBD_API PoolStats { +public: + PoolStats(); + ~PoolStats(); + + PoolStats(const PoolStats&) = delete; + PoolStats& operator=(const PoolStats&) = delete; + + int add(rbd_pool_stat_option_t option, uint64_t* opt_val); + +private: + friend class RBD; + + rbd_pool_stats_t pool_stats; +}; + +class CEPH_RBD_API UpdateWatchCtx { +public: + virtual ~UpdateWatchCtx() {} + /** + * Callback activated when we receive a notify event. + */ + virtual void handle_notify() = 0; +}; + +class CEPH_RBD_API QuiesceWatchCtx { +public: + virtual ~QuiesceWatchCtx() {} + /** + * Callback activated when we want to quiesce. + */ + virtual void handle_quiesce() = 0; + + /** + * Callback activated when we want to unquiesce. 
+ */ + virtual void handle_unquiesce() = 0; +}; + +class CEPH_RBD_API Image +{ +public: + Image(); + ~Image(); + + int close(); + int aio_close(RBD::AioCompletion *c); + + int resize(uint64_t size); + int resize2(uint64_t size, bool allow_shrink, ProgressContext& pctx); + int resize_with_progress(uint64_t size, ProgressContext& pctx); + int stat(image_info_t &info, size_t infosize); + int get_name(std::string *name); + int get_id(std::string *id); + std::string get_block_name_prefix(); + int64_t get_data_pool_id(); + int parent_info(std::string *parent_poolname, std::string *parent_name, + std::string *parent_snapname) + CEPH_RBD_DEPRECATED; + int parent_info2(std::string *parent_poolname, std::string *parent_name, + std::string *parent_id, std::string *parent_snapname) + CEPH_RBD_DEPRECATED; + int get_parent(linked_image_spec_t *parent_image, snap_spec_t *parent_snap); + + int get_migration_source_spec(std::string* source_spec); + + int old_format(uint8_t *old); + int size(uint64_t *size); + int get_group(group_info_t *group_info, size_t group_info_size); + int features(uint64_t *features); + int update_features(uint64_t features, bool enabled); + int get_op_features(uint64_t *op_features); + int overlap(uint64_t *overlap); + int get_flags(uint64_t *flags); + int set_image_notification(int fd, int type); + + /* exclusive lock feature */ + int is_exclusive_lock_owner(bool *is_owner); + int lock_acquire(rbd_lock_mode_t lock_mode); + int lock_release(); + int lock_get_owners(rbd_lock_mode_t *lock_mode, + std::list<std::string> *lock_owners); + int lock_break(rbd_lock_mode_t lock_mode, const std::string &lock_owner); + + /* object map feature */ + int rebuild_object_map(ProgressContext &prog_ctx); + + int check_object_map(ProgressContext &prog_ctx); + + int copy(IoCtx& dest_io_ctx, const char *destname); + int copy2(Image& dest); + int copy3(IoCtx& dest_io_ctx, const char *destname, ImageOptions& opts); + int copy4(IoCtx& dest_io_ctx, const char *destname, 
ImageOptions& opts, + size_t sparse_size); + int copy_with_progress(IoCtx& dest_io_ctx, const char *destname, + ProgressContext &prog_ctx); + int copy_with_progress2(Image& dest, ProgressContext &prog_ctx); + int copy_with_progress3(IoCtx& dest_io_ctx, const char *destname, + ImageOptions& opts, ProgressContext &prog_ctx); + int copy_with_progress4(IoCtx& dest_io_ctx, const char *destname, + ImageOptions& opts, ProgressContext &prog_ctx, + size_t sparse_size); + + /* deep copy */ + int deep_copy(IoCtx& dest_io_ctx, const char *destname, ImageOptions& opts); + int deep_copy_with_progress(IoCtx& dest_io_ctx, const char *destname, + ImageOptions& opts, ProgressContext &prog_ctx); + + /* encryption */ + int encryption_format(encryption_format_t format, encryption_options_t opts, + size_t opts_size); + int encryption_load(encryption_format_t format, encryption_options_t opts, + size_t opts_size); + int encryption_load2(const encryption_spec_t *specs, size_t spec_count); + + /* striping */ + uint64_t get_stripe_unit() const; + uint64_t get_stripe_count() const; + + int get_create_timestamp(struct timespec *timestamp); + int get_access_timestamp(struct timespec *timestamp); + int get_modify_timestamp(struct timespec *timestamp); + + int flatten(); + int flatten_with_progress(ProgressContext &prog_ctx); + + int sparsify(size_t sparse_size); + int sparsify_with_progress(size_t sparse_size, ProgressContext &prog_ctx); + /** + * Returns a pair of poolname, imagename for each clone + * of this image at the currently set snapshot. + */ + int list_children(std::set<std::pair<std::string, std::string> > *children) + CEPH_RBD_DEPRECATED; + /** + * Returns a structure of poolname, imagename, imageid and trash flag + * for each clone of this image at the currently set snapshot. 
+ */ + int list_children2(std::vector<librbd::child_info_t> *children) + CEPH_RBD_DEPRECATED; + int list_children3(std::vector<linked_image_spec_t> *images); + int list_descendants(std::vector<linked_image_spec_t> *images); + + /* advisory locking (see librbd.h for details) */ + int list_lockers(std::list<locker_t> *lockers, + bool *exclusive, std::string *tag); + int lock_exclusive(const std::string& cookie); + int lock_shared(const std::string& cookie, const std::string& tag); + int unlock(const std::string& cookie); + int break_lock(const std::string& client, const std::string& cookie); + + /* snapshots */ + int snap_list(std::vector<snap_info_t>& snaps); + /* DEPRECATED; use snap_exists2 */ + bool snap_exists(const char *snapname) CEPH_RBD_DEPRECATED; + int snap_exists2(const char *snapname, bool *exists); + int snap_create(const char *snapname); + int snap_create2(const char *snapname, uint32_t flags, ProgressContext& pctx); + int snap_remove(const char *snapname); + int snap_remove2(const char *snapname, uint32_t flags, ProgressContext& pctx); + int snap_remove_by_id(uint64_t snap_id); + int snap_rollback(const char *snap_name); + int snap_rollback_with_progress(const char *snap_name, ProgressContext& pctx); + int snap_protect(const char *snap_name); + int snap_unprotect(const char *snap_name); + int snap_is_protected(const char *snap_name, bool *is_protected); + int snap_set(const char *snap_name); + int snap_set_by_id(uint64_t snap_id); + int snap_get_name(uint64_t snap_id, std::string *snap_name); + int snap_get_id(const std::string snap_name, uint64_t *snap_id); + int snap_rename(const char *srcname, const char *dstname); + int snap_get_limit(uint64_t *limit); + int snap_set_limit(uint64_t limit); + int snap_get_timestamp(uint64_t snap_id, struct timespec *timestamp); + int snap_get_namespace_type(uint64_t snap_id, + snap_namespace_type_t *namespace_type); + int snap_get_group_namespace(uint64_t snap_id, + snap_group_namespace_t *group_namespace, + size_t 
snap_group_namespace_size); + int snap_get_trash_namespace(uint64_t snap_id, std::string* original_name); + int snap_get_mirror_namespace( + uint64_t snap_id, snap_mirror_namespace_t *mirror_namespace, + size_t snap_mirror_namespace_size); + + /* I/O */ + ssize_t read(uint64_t ofs, size_t len, ceph::bufferlist& bl); + /* @param op_flags see librados.h constants beginning with LIBRADOS_OP_FLAG */ + ssize_t read2(uint64_t ofs, size_t len, ceph::bufferlist& bl, int op_flags); + int64_t read_iterate(uint64_t ofs, size_t len, + int (*cb)(uint64_t, size_t, const char *, void *), void *arg); + int read_iterate2(uint64_t ofs, uint64_t len, + int (*cb)(uint64_t, size_t, const char *, void *), void *arg); + /** + * get difference between two versions of an image + * + * This will return the differences between two versions of an image + * via a callback, which gets the offset and length and a flag + * indicating whether the extent exists (1), or is known/defined to + * be zeros (a hole, 0). If the source snapshot name is NULL, we + * interpret that as the beginning of time and return all allocated + * regions of the image. The end version is whatever is currently + * selected for the image handle (either a snapshot or the writeable + * head). 
+ * + * @param fromsnapname start snapshot name, or NULL + * @param ofs start offset + * @param len len in bytes of region to report on + * @param include_parent true if full history diff should include parent + * @param whole_object 1 if diff extents should cover whole object + * @param cb callback to call for each allocated region + * @param arg argument to pass to the callback + * @returns 0 on success, or negative error code on error + */ + int diff_iterate(const char *fromsnapname, + uint64_t ofs, uint64_t len, + int (*cb)(uint64_t, size_t, int, void *), void *arg); + int diff_iterate2(const char *fromsnapname, + uint64_t ofs, uint64_t len, + bool include_parent, bool whole_object, + int (*cb)(uint64_t, size_t, int, void *), void *arg); + + ssize_t write(uint64_t ofs, size_t len, ceph::bufferlist& bl); + /* @param op_flags see librados.h constants beginning with LIBRADOS_OP_FLAG */ + ssize_t write2(uint64_t ofs, size_t len, ceph::bufferlist& bl, int op_flags); + + int discard(uint64_t ofs, uint64_t len); + ssize_t writesame(uint64_t ofs, size_t len, ceph::bufferlist &bl, int op_flags); + ssize_t write_zeroes(uint64_t ofs, size_t len, int zero_flags, int op_flags); + + /** + * compare and write from/to image + * + * Compare data in compare bufferlist to data at offset in image. + * len bytes of the compare bufferlist are compared, i.e. the compare + * bufferlist has to be at least len bytes long. + * If the compare is successful len bytes from the write bufferlist + * are written to the image, i.e. the write bufferlist also has to be + * at least len bytes long. + * If the compare is unsuccessful no data is written and the + * offset in the bufferlist where the compare first differed + * is returned through mismatch_off. 
+   *
+   * @param ofs offset in image
+   * @param len length of compare, length of write
+   * @param cmp_bl bufferlist to compare from
+   * @param bl bufferlist to write to image if compare succeeds
+   * @param c aio completion to notify when compare and write is complete
+   *        (applies to the aio_compare_and_write variant only; the
+   *        synchronous compare_and_write takes no completion)
+   * @param mismatch_off (out) offset in bufferlist where compare first differed
+   * @param op_flags see librados.h constants beginning with LIBRADOS_OP_FLAG
+   */
+  ssize_t compare_and_write(uint64_t ofs, size_t len, ceph::bufferlist &cmp_bl,
+                            ceph::bufferlist& bl, uint64_t *mismatch_off, int op_flags);
+
+  int aio_write(uint64_t off, size_t len, ceph::bufferlist& bl, RBD::AioCompletion *c);
+  /* @param op_flags see librados.h constants beginning with LIBRADOS_OP_FLAG */
+  int aio_write2(uint64_t off, size_t len, ceph::bufferlist& bl,
+                 RBD::AioCompletion *c, int op_flags);
+
+  int aio_discard(uint64_t off, uint64_t len, RBD::AioCompletion *c);
+  int aio_writesame(uint64_t off, size_t len, ceph::bufferlist& bl,
+                    RBD::AioCompletion *c, int op_flags);
+  int aio_write_zeroes(uint64_t ofs, size_t len, RBD::AioCompletion *c,
+                       int zero_flags, int op_flags);
+
+  int aio_compare_and_write(uint64_t off, size_t len, ceph::bufferlist& cmp_bl,
+                            ceph::bufferlist& bl, RBD::AioCompletion *c,
+                            uint64_t *mismatch_off, int op_flags);
+
+  /**
+   * read async from image
+   *
+   * The target bufferlist is populated with references to buffers
+   * that contain the data for the given extent of the image.
+   *
+   * NOTE: If caching is enabled, the bufferlist will directly
+   * reference buffers in the cache to avoid an unnecessary data copy.
+   * As a result, if the user intends to modify the buffer contents
+   * directly, they should make a copy first (unconditionally, or when
+   * the reference count on the underlying buffer is more than 1). 
+ * + * @param off offset in image + * @param len length of read + * @param bl bufferlist to read into + * @param c aio completion to notify when read is complete + */ + int aio_read(uint64_t off, size_t len, ceph::bufferlist& bl, RBD::AioCompletion *c); + /* @param op_flags see librados.h constants beginning with LIBRADOS_OP_FLAG */ + int aio_read2(uint64_t off, size_t len, ceph::bufferlist& bl, + RBD::AioCompletion *c, int op_flags); + + int flush(); + /** + * Start a flush if caching is enabled. Get a callback when + * the currently pending writes are on disk. + * + * @param image the image to flush writes to + * @param c what to call when flushing is complete + * @returns 0 on success, negative error code on failure + */ + int aio_flush(RBD::AioCompletion *c); + + /** + * Drop any cached data for this image + * + * @returns 0 on success, negative error code on failure + */ + int invalidate_cache(); + + int poll_io_events(RBD::AioCompletion **comps, int numcomp); + + int metadata_get(const std::string &key, std::string *value); + int metadata_set(const std::string &key, const std::string &value); + int metadata_remove(const std::string &key); + /** + * Returns a pair of key/value for this image + */ + int metadata_list(const std::string &start, uint64_t max, std::map<std::string, ceph::bufferlist> *pairs); + + // RBD image mirroring support functions + int mirror_image_enable() CEPH_RBD_DEPRECATED; + int mirror_image_enable2(mirror_image_mode_t mode); + int mirror_image_disable(bool force); + int mirror_image_promote(bool force); + int mirror_image_demote(); + int mirror_image_resync(); + int mirror_image_create_snapshot(uint64_t *snap_id); + int mirror_image_create_snapshot2(uint32_t flags, uint64_t *snap_id); + int mirror_image_get_info(mirror_image_info_t *mirror_image_info, + size_t info_size); + int mirror_image_get_mode(mirror_image_mode_t *mode); + int mirror_image_get_global_status( + mirror_image_global_status_t *mirror_image_global_status, + size_t 
status_size); + int mirror_image_get_status( + mirror_image_status_t *mirror_image_status, size_t status_size) + CEPH_RBD_DEPRECATED; + int mirror_image_get_instance_id(std::string *instance_id); + int aio_mirror_image_promote(bool force, RBD::AioCompletion *c); + int aio_mirror_image_demote(RBD::AioCompletion *c); + int aio_mirror_image_get_info(mirror_image_info_t *mirror_image_info, + size_t info_size, RBD::AioCompletion *c); + int aio_mirror_image_get_mode(mirror_image_mode_t *mode, + RBD::AioCompletion *c); + int aio_mirror_image_get_global_status( + mirror_image_global_status_t *mirror_image_global_status, + size_t status_size, RBD::AioCompletion *c); + int aio_mirror_image_get_status( + mirror_image_status_t *mirror_image_status, size_t status_size, + RBD::AioCompletion *c) + CEPH_RBD_DEPRECATED; + int aio_mirror_image_create_snapshot(uint32_t flags, uint64_t *snap_id, + RBD::AioCompletion *c); + + int update_watch(UpdateWatchCtx *ctx, uint64_t *handle); + int update_unwatch(uint64_t handle); + + int list_watchers(std::list<image_watcher_t> &watchers); + + int config_list(std::vector<config_option_t> *options); + + int quiesce_watch(QuiesceWatchCtx *ctx, uint64_t *handle); + int quiesce_unwatch(uint64_t handle); + void quiesce_complete(uint64_t handle, int r); + +private: + friend class RBD; + + Image(const Image& rhs); + const Image& operator=(const Image& rhs); + + image_ctx_t ctx; +}; + +} // namespace librbd + +#if __GNUC__ >= 4 + #pragma GCC diagnostic pop +#endif + +#endif // __LIBRBD_HPP diff --git a/src/include/rbd/object_map_types.h b/src/include/rbd/object_map_types.h new file mode 100644 index 000000000..54852caa8 --- /dev/null +++ b/src/include/rbd/object_map_types.h @@ -0,0 +1,13 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +#ifndef CEPH_RBD_OBJECT_MAP_TYPES_H +#define CEPH_RBD_OBJECT_MAP_TYPES_H + +#include "include/int_types.h" + +static const uint8_t OBJECT_NONEXISTENT = 0; +static 
const uint8_t OBJECT_EXISTS = 1; +static const uint8_t OBJECT_PENDING = 2; +static const uint8_t OBJECT_EXISTS_CLEAN = 3; + +#endif // CEPH_RBD_OBJECT_MAP_TYPES_H diff --git a/src/include/rbd_types.h b/src/include/rbd_types.h new file mode 100644 index 000000000..35a1a8bc3 --- /dev/null +++ b/src/include/rbd_types.h @@ -0,0 +1,159 @@ +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2004-2010 Sage Weil <sage@newdream.net> + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. + * + */ + +#ifndef CEPH_RBD_TYPES_H +#define CEPH_RBD_TYPES_H + +#include "include/types.h" +#include "rbd/features.h" + +/* New-style rbd image 'foo' consists of objects + * rbd_id.foo - id of image + * rbd_header.<id> - image metadata + * rbd_object_map.<id> - optional image object map + * rbd_data.<id>.00000000 + * rbd_data.<id>.00000001 + * ... - data + */ + +#define RBD_HEADER_PREFIX "rbd_header." +#define RBD_OBJECT_MAP_PREFIX "rbd_object_map." +#define RBD_DATA_PREFIX "rbd_data." +#define RBD_ID_PREFIX "rbd_id." + +/* + * old-style rbd image 'foo' consists of objects + * foo.rbd - image metadata + * rb.<idhi>.<idlo>.00000000 + * rb.<idhi>.<idlo>.00000001 + * ... - data + */ + +#define RBD_SUFFIX ".rbd" +#define RBD_DIRECTORY "rbd_directory" +#define RBD_INFO "rbd_info" +#define RBD_NAMESPACE "rbd_namespace" +#define RBD_TASK "rbd_task" + +/* + * rbd_children object in each pool contains omap entries + * that map parent (poolid, imageid, snapid) to a list of children + * (imageids; snapids aren't required because we get all the snapshot + * info from a read of the child's header object anyway). + * + * The clone operation writes a new item to this child list, and rm or + * flatten removes an item, and may remove the whole entry if no children + * exist after the rm/flatten. 
+ * + * When attempting to remove a parent, all pools are searched for + * rbd_children objects with entries referring to that parent; if any + * exist (and those children exist), the parent removal is prevented. + */ +#define RBD_CHILDREN "rbd_children" +#define RBD_LOCK_NAME "rbd_lock" + +/** + * rbd_mirroring object in each pool contains pool-specific settings + * for configuring mirroring. + */ +#define RBD_MIRRORING "rbd_mirroring" + +/** + * rbd_mirror_leader and rbd_mirror_instance.<instance id> objects are used + * for pool-level coordination between rbd-mirror daemons. + */ +#define RBD_MIRROR_LEADER "rbd_mirror_leader" +#define RBD_MIRROR_INSTANCE_PREFIX "rbd_mirror_instance." + +#define RBD_MAX_OBJ_NAME_SIZE 96 +#define RBD_MAX_BLOCK_NAME_SIZE 24 + +/** + * Maximum string length of the RBD v2 image id (not including + * null termination). This limit was derived from the existing + * RBD_MAX_BLOCK_NAME_SIZE limit which needs to hold the "rbd_data." + * prefix and null termination. + */ +#define RBD_MAX_IMAGE_ID_LENGTH 14 + +/** + * Maximum string length of the RBD block object name prefix (not including + * null termination). + * + * v1 format: rb.<max 8-byte high id>.<max 8-byte low id>.<max 8-byte extra> + * v2 format: rbd_data.[<max 19-byte pool id>.]<max 14-byte image id> + * + * Note: new features might require increasing this maximum prefix length. + */ +#define RBD_MAX_BLOCK_NAME_PREFIX_LENGTH 43 + +#define RBD_COMP_NONE 0 +#define RBD_CRYPT_NONE 0 + +#define RBD_HEADER_TEXT "<<< Rados Block Device Image >>>\n" +#define RBD_MIGRATE_HEADER_TEXT "<<< Migrating RBD Image >>>\n" +#define RBD_HEADER_SIGNATURE "RBD" +#define RBD_HEADER_VERSION "001.005" + +#define RBD_GROUP_INVALID_POOL (-1) + +#define RBD_GROUP_HEADER_PREFIX "rbd_group_header." 
+ +#define RBD_GROUP_DIRECTORY "rbd_group_directory" + +#define RBD_TRASH "rbd_trash" + +/** + * MON config-key prefix for storing optional remote cluster connectivity + * parameters + */ +#define RBD_MIRROR_CONFIG_KEY_PREFIX "rbd/mirror/" +#define RBD_MIRROR_SITE_NAME_CONFIG_KEY RBD_MIRROR_CONFIG_KEY_PREFIX "site_name" +#define RBD_MIRROR_PEER_CLIENT_ID_CONFIG_KEY RBD_MIRROR_CONFIG_KEY_PREFIX "peer_client_id" +#define RBD_MIRROR_PEER_CONFIG_KEY_PREFIX RBD_MIRROR_CONFIG_KEY_PREFIX "peer/" + +struct rbd_info { + ceph_le64 max_id; +} __attribute__ ((packed)); + +struct rbd_obj_snap_ondisk { + ceph_le64 id; + ceph_le64 image_size; +} __attribute__((packed)); + +struct rbd_obj_header_ondisk { + char text[40]; + char block_name[RBD_MAX_BLOCK_NAME_SIZE]; + char signature[4]; + char version[8]; + struct { + __u8 order; + __u8 crypt_type; + __u8 comp_type; + __u8 unused; + } __attribute__((packed)) options; + ceph_le64 image_size; + ceph_le64 snap_seq; + ceph_le32 snap_count; + ceph_le32 reserved; + ceph_le64 snap_names_len; + struct rbd_obj_snap_ondisk snaps[0]; +} __attribute__((packed)); + +enum { + RBD_PROTECTION_STATUS_UNPROTECTED = 0, + RBD_PROTECTION_STATUS_UNPROTECTING = 1, + RBD_PROTECTION_STATUS_PROTECTED = 2, + RBD_PROTECTION_STATUS_LAST = 3 +}; + +#endif diff --git a/src/include/scope_guard.h b/src/include/scope_guard.h new file mode 100644 index 000000000..eacc65e7b --- /dev/null +++ b/src/include/scope_guard.h @@ -0,0 +1,49 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2016 Red Hat + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. 
+ *
+ */
+
+#ifndef SCOPE_GUARD
+#define SCOPE_GUARD
+
+#include <utility>
+
+// Generic RAII helper: stores a callable `f` and invokes it exactly once,
+// when the guard is destroyed at scope exit.  Copying is forbidden; moving
+// is allowed so factories below can return guards by value.
+template <typename F>
+struct scope_guard {
+  F f;
+  scope_guard() = delete;
+  scope_guard(const scope_guard &) = delete;
+  scope_guard(scope_guard &&) = default;
+  scope_guard & operator=(const scope_guard &) = delete;
+  scope_guard & operator=(scope_guard &&) = default;
+  scope_guard(const F& f) : f(f) {}
+  scope_guard(F &&f) : f(std::move(f)) {}
+  // Construct the stored callable in place from its constructor arguments.
+  template<typename... Args>
+  scope_guard(std::in_place_t, Args&& ...args) : f(std::forward<Args>(args)...) {}
+  ~scope_guard() {
+    // Invoke through an rvalue so callables with an &&-qualified
+    // operator() (usable only once) are accepted.
+    std::move(f)(); // Support at-most-once functions
+  }
+};
+
+// Factory deducing F from the argument.  [[nodiscard]]: a discarded
+// temporary guard is destroyed immediately, firing `f` right away.
+template <typename F>
+[[nodiscard("Unassigned scope guards will execute immediately")]]
+scope_guard<F> make_scope_guard(F &&f) {
+  return scope_guard<F>(std::forward<F>(f));
+}
+
+// In-place factory: forwards `args` to F's constructor inside the guard,
+// avoiding a separate construct-then-move of the callable.
+template<typename F, typename... Args>
+[[nodiscard("Unassigned scope guards will execute immediately")]]
+scope_guard<F> make_scope_guard(std::in_place_type_t<F>, Args&& ...args) {
+  return { std::in_place, std::forward<Args>(args)... };
+}
+
+#endif
diff --git a/src/include/sock_compat.h b/src/include/sock_compat.h
new file mode 100644
index 000000000..14b5efa1d
--- /dev/null
+++ b/src/include/sock_compat.h
@@ -0,0 +1,43 @@
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2018 Red Hat, Inc.
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ */
+
+#ifndef CEPH_SOCK_COMPAT_H
+#define CEPH_SOCK_COMPAT_H
+
+#include "include/compat.h"
+#include <sys/socket.h>
+
+/*
+ * This optimization may not be available on all platforms (e.g. OSX).
+ * Apparently a similar approach based on TCP_CORK can be used.
+ */
+#ifndef MSG_MORE
+# define MSG_MORE 0
+#endif
+
+/*
+ * On BSD SO_NOSIGPIPE can be set via setsockopt to block SIGPIPE.
+ */ +#ifndef MSG_NOSIGNAL +# define MSG_NOSIGNAL 0 +# ifdef SO_NOSIGPIPE +# define CEPH_USE_SO_NOSIGPIPE +# else +# define CEPH_USE_SIGPIPE_BLOCKER +# warning "Using SIGPIPE blocking instead of suppression; this is not well-tested upstream!" +# endif +#endif + +int socket_cloexec(int domain, int type, int protocol); +int socketpair_cloexec(int domain, int type, int protocol, int sv[2]); +int accept_cloexec(int sockfd, struct sockaddr* addr, socklen_t* addrlen); + +#endif diff --git a/src/include/spinlock.h b/src/include/spinlock.h new file mode 100644 index 000000000..3f12bdc00 --- /dev/null +++ b/src/include/spinlock.h @@ -0,0 +1,92 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2017 SUSE LINUX GmbH + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. 
+ *
+ * @author Jesse Williamson <jwilliamson@suse.de>
+ *
+*/
+
+#ifndef CEPH_SPINLOCK_HPP
+#define CEPH_SPINLOCK_HPP
+
+#include <atomic>
+
+namespace ceph {
+inline namespace version_1_0 {
+
+class spinlock;
+
+// Forward declarations so the class body can call the free functions below.
+inline void spin_lock(std::atomic_flag& lock);
+inline void spin_unlock(std::atomic_flag& lock);
+inline void spin_lock(ceph::spinlock& lock);
+inline void spin_unlock(ceph::spinlock& lock);
+
+/* A pre-packaged spinlock type modelling BasicLockable: */
+class spinlock final
+{
+  std::atomic_flag af = ATOMIC_FLAG_INIT;
+
+  public:
+  void lock() {
+    ceph::spin_lock(af);
+  }
+
+  void unlock() noexcept {
+    ceph::spin_unlock(af);
+  }
+};
+
+// Free functions:
+// Busy-waits until the flag is acquired; test_and_set() with
+// memory_order_acquire pairs with the release in spin_unlock().
+inline void spin_lock(std::atomic_flag& lock)
+{
+ while(lock.test_and_set(std::memory_order_acquire))
+   ;
+}
+
+inline void spin_unlock(std::atomic_flag& lock)
+{
+ lock.clear(std::memory_order_release);
+}
+
+// Pointer overloads: convenience forwarding to the reference versions.
+inline void spin_lock(std::atomic_flag *lock)
+{
+ spin_lock(*lock);
+}
+
+inline void spin_unlock(std::atomic_flag *lock)
+{
+ spin_unlock(*lock);
+}
+
+inline void spin_lock(ceph::spinlock& lock)
+{
+ lock.lock();
+}
+
+inline void spin_unlock(ceph::spinlock& lock)
+{
+ lock.unlock();
+}
+
+inline void spin_lock(ceph::spinlock *lock)
+{
+ spin_lock(*lock);
+}
+
+inline void spin_unlock(ceph::spinlock *lock)
+{
+ spin_unlock(*lock);
+}
+
+} // inline namespace (version)
+} // namespace ceph
+
+#endif
diff --git a/src/include/stat.h b/src/include/stat.h
new file mode 100644
index 000000000..19398758e
--- /dev/null
+++ b/src/include/stat.h
@@ -0,0 +1,145 @@
+#ifndef CEPH_STAT_H
+#define CEPH_STAT_H
+
+#include <acconfig.h>
+
+#include <sys/stat.h>
+
+/*
+ * Access time-related `struct stat` members.
+ *
+ * Note that for each of the stat member get/set functions below, setting a
+ * high-res value (stat_set_*_nsec) on a platform without high-res support is
+ * a no-op.
+ */
+
+#ifdef HAVE_STAT_ST_MTIM_TV_NSEC
+
+// Linux/glibc style: nanoseconds live in the st_{m,a,c}tim timespec members.
+static inline uint32_t stat_get_mtime_nsec(struct stat *st)
+{
+  return st->st_mtim.tv_nsec;
+}
+
+static inline void stat_set_mtime_nsec(struct stat *st, uint32_t nsec)
+{
+  st->st_mtim.tv_nsec = nsec;
+}
+
+static inline uint32_t stat_get_atime_nsec(struct stat *st)
+{
+  return st->st_atim.tv_nsec;
+}
+
+static inline void stat_set_atime_nsec(struct stat *st, uint32_t nsec)
+{
+  st->st_atim.tv_nsec = nsec;
+}
+
+static inline uint32_t stat_get_ctime_nsec(struct stat *st)
+{
+  return st->st_ctim.tv_nsec;
+}
+
+static inline void stat_set_ctime_nsec(struct stat *st, uint32_t nsec)
+{
+  st->st_ctim.tv_nsec = nsec;
+}
+
+#elif defined(HAVE_STAT_ST_MTIMESPEC_TV_NSEC)
+
+// BSD/macOS style: nanoseconds live in the st_{m,a,c}timespec members.
+static inline uint32_t stat_get_mtime_nsec(struct stat *st)
+{
+  return st->st_mtimespec.tv_nsec;
+}
+
+static inline void stat_set_mtime_nsec(struct stat *st, uint32_t nsec)
+{
+  st->st_mtimespec.tv_nsec = nsec;
+}
+
+static inline uint32_t stat_get_atime_nsec(struct stat *st)
+{
+  return st->st_atimespec.tv_nsec;
+}
+
+static inline void stat_set_atime_nsec(struct stat *st, uint32_t nsec)
+{
+  st->st_atimespec.tv_nsec = nsec;
+}
+
+static inline uint32_t stat_get_ctime_nsec(struct stat *st)
+{
+  return st->st_ctimespec.tv_nsec;
+}
+
+static inline void stat_set_ctime_nsec(struct stat *st, uint32_t nsec)
+{
+  st->st_ctimespec.tv_nsec = nsec;
+}
+
+#else
+
+// No high-resolution timestamp support detected: getters return 0 and
+// setters are deliberate no-ops, per the header comment above.
+static inline uint32_t stat_get_mtime_nsec(struct stat *st)
+{
+  return 0;
+}
+
+static inline void stat_set_mtime_nsec(struct stat *st, uint32_t nsec)
+{
+}
+
+static inline uint32_t stat_get_atime_nsec(struct stat *st)
+{
+  return 0;
+}
+
+static inline void stat_set_atime_nsec(struct stat *st, uint32_t nsec)
+{
+}
+
+static inline uint32_t stat_get_ctime_nsec(struct stat *st)
+{
+  return 0;
+}
+
+static inline void stat_set_ctime_nsec(struct stat *st, uint32_t nsec)
+{
+}
+
+#endif
+
+/*
+ * Access second-resolution `struct stat` members.
+ */ + +static inline uint32_t stat_get_mtime_sec(struct stat *st) +{ + return st->st_mtime; +} + +static inline void stat_set_mtime_sec(struct stat *st, uint32_t sec) +{ + st->st_mtime = sec; +} + +static inline uint32_t stat_get_atime_sec(struct stat *st) +{ + return st->st_atime; +} + +static inline void stat_set_atime_sec(struct stat *st, uint32_t sec) +{ + st->st_atime = sec; +} + +static inline uint32_t stat_get_ctime_sec(struct stat *st) +{ + return st->st_ctime; +} + +static inline void stat_set_ctime_sec(struct stat *st, uint32_t sec) +{ + st->st_ctime = sec; +} + +#endif diff --git a/src/include/statlite.h b/src/include/statlite.h new file mode 100644 index 000000000..0ff4b04e7 --- /dev/null +++ b/src/include/statlite.h @@ -0,0 +1,74 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +#ifndef CEPH_STATLITE_H +#define CEPH_STATLITE_H + +extern "C" { + +#include <time.h> +#include <sys/types.h> +#include <sys/stat.h> +#include <unistd.h> +#include <dirent.h> + +#include "include/compat.h" + +struct statlite { + dev_t st_dev; /* device */ + ino_t st_ino; /* inode */ + mode_t st_mode; /* protection */ + nlink_t st_nlink; /* number of hard links */ + uid_t st_uid; /* user ID of owner */ + gid_t st_gid; /* group ID of owner */ + dev_t st_rdev; /* device type (if inode device)*/ + unsigned long st_litemask; /* bit mask for optional fields */ + /***************************************************************/ + /**** Remaining fields are optional according to st_litemask ***/ + off_t st_size; /* total size, in bytes */ + blksize_t st_blksize; /* blocksize for filesystem I/O */ + blkcnt_t st_blocks; /* number of blocks allocated */ + struct timespec st_atim; /* Time of last access. */ + struct timespec st_mtim; /* Time of last modification. */ + struct timespec st_ctim; /* Time of last status change. 
*/ + //time_t st_atime; /* time of last access */ + //time_t st_mtime; /* time of last modification */ + //time_t st_ctime; /* time of last change */ +}; + +#define S_STATLITE_SIZE 1 +#define S_STATLITE_BLKSIZE 2 +#define S_STATLITE_BLOCKS 4 +#define S_STATLITE_ATIME 8 +#define S_STATLITE_MTIME 16 +#define S_STATLITE_CTIME 32 + +#define S_REQUIRESIZE(m) (m | S_STATLITE_SIZE) +#define S_REQUIREBLKSIZE(m) (m | S_STATLITE_BLKSIZE) +#define S_REQUIREBLOCKS(m) (m | S_STATLITE_BLOCKS) +#define S_REQUIREATIME(m) (m | S_STATLITE_ATIME) +#define S_REQUIREMTIME(m) (m | S_STATLITE_MTIME) +#define S_REQUIRECTIME(m) (m | S_STATLITE_CTIME) + +#define S_ISVALIDSIZE(m) (m & S_STATLITE_SIZE) +#define S_ISVALIDBLKSIZE(m) (m & S_STATLITE_BLKSIZE) +#define S_ISVALIDBLOCKS(m) (m & S_STATLITE_BLOCKS) +#define S_ISVALIDATIME(m) (m & S_STATLITE_ATIME) +#define S_ISVALIDMTIME(m) (m & S_STATLITE_MTIME) +#define S_ISVALIDCTIME(m) (m & S_STATLITE_CTIME) + + +// readdirplus etc. + +struct dirent_plus { + struct dirent d_dirent; /* dirent struct for this entry */ + struct stat d_stat; /* attributes for this entry */ + int d_stat_err;/* errno for d_stat, or 0 */ +}; +struct dirent_lite { + struct dirent d_dirent; /* dirent struct for this entry */ + struct statlite d_stat; /* attributes for this entry */ + int d_stat_err;/* errno for d_stat, or 0 */ +}; + +} +#endif diff --git a/src/include/str_list.h b/src/include/str_list.h new file mode 100644 index 000000000..cad76c1d6 --- /dev/null +++ b/src/include/str_list.h @@ -0,0 +1,97 @@ +#ifndef CEPH_STRLIST_H +#define CEPH_STRLIST_H + +#include <list> +#include <set> +#include <string> +#include <string_view> +#include <vector> + +namespace ceph { + +/// Split a string using the given delimiters, passing each piece as a +/// (non-null-terminated) std::string_view to the callback. 
+template <typename Func> // where Func(std::string_view) is a valid call +void for_each_substr(std::string_view s, const char *delims, Func&& f) +{ + auto pos = s.find_first_not_of(delims); + while (pos != s.npos) { + s.remove_prefix(pos); // trim delims from the front + auto end = s.find_first_of(delims); + f(s.substr(0, end)); + pos = s.find_first_not_of(delims, end); + } +} + +} // namespace ceph + +/** + * Split **str** into a list of strings, using the ";,= \t" delimiters and output the result in **str_list**. + * + * @param [in] str String to split and save as list + * @param [out] str_list List modified containing str after it has been split +**/ +extern void get_str_list(const std::string& str, + std::list<std::string>& str_list); + +/** + * Split **str** into a list of strings, using the **delims** delimiters and output the result in **str_list**. + * + * @param [in] str String to split and save as list + * @param [in] delims characters used to split **str** + * @param [out] str_list List modified containing str after it has been split +**/ +extern void get_str_list(const std::string& str, + const char *delims, + std::list<std::string>& str_list); + +std::list<std::string> get_str_list(const std::string& str, + const char *delims = ";,= \t"); + +/** + * Split **str** into a vector of strings, using the ";,= \t" delimiters and output the result in **str_vec**. + * + * @param [in] str String to split and save as Vector + * @param [out] str_vec Vector modified containing str after it has been split +**/ +void get_str_vec(std::string_view str, std::vector<std::string>& str_vec); + +/** + * Split **str** into a vector of strings, using the **delims** delimiters and output the result in **str_vec**. 
+ * + * @param [in] str String to split and save as Vector + * @param [in] delims characters used to split **str** + * @param [out] str_vec Vector modified containing str after it has been split +**/ +void get_str_vec(std::string_view str, + const char *delims, + std::vector<std::string>& str_vec); + +std::vector<std::string> get_str_vec(std::string_view str, + const char *delims = ";,= \t"); + +/** + * Return a String containing the vector **v** joined with **sep** + * + * If **v** is empty, the function returns an empty string + * For each element in **v**, + * it will concatenate this element and **sep** with result + * + * @param [in] v Vector to join as a String + * @param [in] sep String used to join each element from **v** + * @return empty string if **v** is empty or concatenated string +**/ +inline std::string str_join(const std::vector<std::string>& v, const std::string& sep) +{ + if (v.empty()) + return std::string(); + auto i = v.cbegin(); + std::string r = *i; + for (++i; i != v.cend(); ++i) { + r += sep; + r += *i; + } + return r; +} + +#endif diff --git a/src/include/str_map.h b/src/include/str_map.h new file mode 100644 index 000000000..7f354fd46 --- /dev/null +++ b/src/include/str_map.h @@ -0,0 +1,180 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2013 Cloudwatt <libre.licensing@cloudwatt.com> + * + * Author: Loic Dachary <loic@dachary.org> + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. 
+ * + */ + +#ifndef CEPH_STRMAP_H +#define CEPH_STRMAP_H + +#define CONST_DELIMS ",;\t\n " + +#include <map> +#include <string> +#include <sstream> + +template <typename Func> +void for_each_pair(std::string_view s, const char* delims, Func&& f) +{ + auto pos = s.find_first_not_of(delims); + while (pos != s.npos) { + s.remove_prefix(pos); // trim delims from the front + auto end = s.find_first_of(delims); + auto kv = s.substr(0, end); + if (auto equal = kv.find('='); equal != kv.npos) { + f(kv.substr(0, equal), kv.substr(equal + 1)); + } else { + f(kv.substr(0, equal), std::string_view()); + } + pos = s.find_first_not_of(delims, end); + } +} + +using str_map_t = std::map<std::string,std::string>; + +/** + * Parse **str** and set **str_map** with the key/value pairs read + * from it. The format of **str** is either a well formed JSON object + * or a custom key[=value] plain text format. + * + * JSON is tried first. If successfully parsed into a JSON object, it + * is copied into **str_map** verbatim. If it is not a JSON object ( a + * string, integer etc. ), -EINVAL is returned and **ss** is set to + * a human readable error message. + * + * If **str** is no valid JSON and if **fallback_to_plain** is set to true + * (default: true) it is assumed to be a string containing white space + * separated key=value pairs. A white space is either space, tab or newline. + * Function **get_str_map** will be leveraged to parse the plain-text + * key/value pairs. + * + * @param [in] str JSON or plain text key/value pairs + * @param [out] ss human readable message on error + * @param [out] str_map key/value pairs read from str + * @param [in] fallback_to_plain attempt parsing as plain-text if json fails + * @return **0** on success or a -EINVAL on error. + */ +int get_json_str_map( + const std::string &str, + std::ostream &ss, + str_map_t* str_map, + bool fallback_to_plain = true); + +/** + * Parse **str** and set **str_map** with the key/value pairs read from + * it. 
The format of **str** is a number of custom key[=value] pairs in + * plain text format. + * + * The string will be parsed taking **delims** as field delimiters for + * key/values. The value is optional resulting in an empty string when + * not provided. For example, using white space as delimiters: + * + * insert your own=political/ideological statement=here + * + * will be parsed into: + * + * { "insert": "", + * "your": "", + * "own": "political/ideological", + * "statement": "here" } + * + * Alternative delimiters may be provided. For instance, specifying + * "white space and slash", for the above statement, would be parsed + * into: + * + * { "insert": "", + * "your": "", + * "own": "political", + * "ideological": "", + * "statement": "here" } + * + * See how adding '/' to the delimiters field will spawn a new key without + * a set value. + * + * Always returns 0, as there is no condition for failure. + * + * @param [in] str plain text key/value pairs + * @param [in] delims field delimiters to be used for parsing str + * @param [out] str_map key/value pairs parsed from str + * @return **0** + */ +int get_str_map( + const std::string &str, + str_map_t* str_map, + const char *delims = CONST_DELIMS); + +// an alternate form (as we never fail): +str_map_t get_str_map( + const std::string& str, + const char* delim = CONST_DELIMS); + +/** + * Returns the value of **key** in **str_map** if available. + * + * If **key** is not available in **str_map**, and if **def_val** is + * not-NULL then returns **def_val**. Otherwise checks if the value of + * **key** is an empty string and if so will return **key**. + * If the map contains **key**, the function returns the value of **key**. 
+ * + * @param[in] str_map Map to obtain **key** from + * @param[in] key The key to search for in the map + * @param[in] def_val The value to return in case **key** is not present + */ +std::string get_str_map_value( + const str_map_t& str_map, + const std::string &key, + const std::string *def_val = nullptr); + +/** + * Returns the value of **key** in **str_map** if available. + * + * If **key** is available in **str_map** returns the value of **key**. + * + * If **key** is not available in **str_map**, and if **def_key** + * is not-NULL and available in **str_map**, then returns the value + * of **def_key**. + * + * Otherwise returns an empty string. + * + * @param[in] str_map Map to obtain **key** or **def_key** from + * @param[in] key Key to obtain the value of from **str_map** + * @param[in] def_key Key to fallback to if **key** is not present + * in **str_map** + */ +std::string get_str_map_key( + const str_map_t& str_map, + const std::string &key, + const std::string *fallback_key = nullptr); + +// This function's only purpose is to check whether a given map has only +// ONE key with an empty value (which would mean that 'get_str_map()' read +// a map in the form of 'VALUE', without any KEY/VALUE pairs) and, in such +// event, to assign said 'VALUE' to a given 'def_key', such that we end up +// with a map of the form "m = { 'def_key' : 'VALUE' }" instead of the +// original "m = { 'VALUE' : '' }". 
+int get_conf_str_map_helper( + const std::string &str, + std::ostringstream &oss, + str_map_t* str_map, + const std::string &default_key); + +std::string get_value_via_strmap( + const std::string& conf_string, + std::string_view default_key); + +std::string get_value_via_strmap( + const std::string& conf_string, + const std::string& key, + std::string_view default_key); + +#endif diff --git a/src/include/stringify.h b/src/include/stringify.h new file mode 100644 index 000000000..1b2a130c9 --- /dev/null +++ b/src/include/stringify.h @@ -0,0 +1,33 @@ +#ifndef __CEPH_STRINGIFY_H +#define __CEPH_STRINGIFY_H + +#include <string> +#include <sstream> + +#include "include/types.h" + +template<typename T> +inline std::string stringify(const T& a) { +#if defined(__GNUC__) && !(defined(__clang__) || defined(__INTEL_COMPILER)) + static __thread std::ostringstream ss; + ss.str(""); +#else + std::ostringstream ss; +#endif + ss << a; + return ss.str(); +} + +template <class T, class A> +T joinify(const A &begin, const A &end, const T &t) +{ + T result; + for (A it = begin; it != end; it++) { + if (!result.empty()) + result.append(t); + result.append(*it); + } + return result; +} + +#endif diff --git a/src/include/timegm.h b/src/include/timegm.h new file mode 100644 index 000000000..fb970432d --- /dev/null +++ b/src/include/timegm.h @@ -0,0 +1,79 @@ +// (C) Copyright Howard Hinnant +// (C) Copyright 2010-2011 Vicente J. Botet Escriba +// Use, modification and distribution are subject to the Boost Software License, +// Version 1.0. (See accompanying file LICENSE_1_0.txt or copy at +// http://www.boost.org/LICENSE_1_0.txt). + +//===-------------------------- locale ------------------------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is dual licensed under the MIT and the University of Illinois Open +// Source Licenses. See LICENSE.TXT for details. 
+//
+//===----------------------------------------------------------------------===//
+
+// This code was adapted by Vicente from Howard Hinnant's experimental work
+// on chrono i/o to Boost and some functions from libc++/locale to emulate the missing time_get::get()
+
+#ifndef BOOST_CHRONO_IO_TIME_POINT_IO_H
+#define BOOST_CHRONO_IO_TIME_POINT_IO_H
+
+#include <time.h>
+
+// Gregorian leap-year test: divisible by 400, or divisible by 4 but not 100.
+static int32_t is_leap(int32_t year) {
+  if(year % 400 == 0)
+    return 1;
+  if(year % 100 == 0)
+    return 0;
+  if(year % 4 == 0)
+    return 1;
+  return 0;
+}
+
+// Days from year 0 to 1 Jan of `year` (proleptic Gregorian, counting
+// one leap day per 4 years, minus centuries, plus every 400th year).
+static int32_t days_from_0(int32_t year) {
+  year--;
+  return 365 * year + (year / 400) - (year/100) + (year / 4);
+}
+
+// Days from 1 Jan 1970 (the Unix epoch) to 1 Jan of `year`.
+int32_t static days_from_1970(int32_t year) {
+  static const int days_from_0_to_1970 = days_from_0(1970);
+  return days_from_0(year) - days_from_0_to_1970;
+}
+
+// Zero-based day-of-year for (year, month, day); `month` is 1-based here.
+// Table rows are cumulative month offsets for non-leap/leap years.
+static int32_t days_from_1jan(int32_t year,int32_t month,int32_t day) {
+  static const int32_t days[2][12] =
+  {
+    { 0,31,59,90,120,151,181,212,243,273,304,334},
+    { 0,31,60,91,121,152,182,213,244,274,305,335}
+  };
+
+  return days[is_leap(year)][month-1] + day - 1;
+}
+
+// Locale- and timezone-independent replacement for timegm(3): convert a
+// broken-down UTC time to seconds since the epoch.  Out-of-range tm_mon
+// values are normalized into [0,11] by carrying whole years first.
+static time_t internal_timegm(tm const *t) {
+  int year = t->tm_year + 1900;
+  int month = t->tm_mon;
+  if(month > 11)
+  {
+    year += month/12;
+    month %= 12;
+  }
+  else if(month < 0)
+  {
+    int years_diff = (-month + 11)/12;
+    year -= years_diff;
+    month+=12 * years_diff;
+  }
+  month++;  // days_from_1jan() expects a 1-based month
+  int day = t->tm_mday;
+  int day_of_year = days_from_1jan(year,month,day);
+  int days_since_epoch = days_from_1970(year) + day_of_year ;
+
+  time_t seconds_in_day = 3600 * 24;
+  time_t result = seconds_in_day * days_since_epoch + 3600 * t->tm_hour + 60 * t->tm_min + t->tm_sec;
+
+  return result;
+}
+
+#endif
diff --git a/src/include/types.h b/src/include/types.h
new file mode 100644
index 000000000..a76360db4
--- /dev/null
+++ b/src/include/types.h
@@ -0,0 +1,629 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net> + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. + * + */ +#ifndef CEPH_TYPES_H +#define CEPH_TYPES_H + +// this is needed for ceph_fs to compile in userland +#include "int_types.h" +#include "byteorder.h" + +#include "uuid.h" + +#include <netinet/in.h> +#include <fcntl.h> +#include <string.h> + +#include "ceph_fs.h" +#include "ceph_frag.h" +#include "rbd_types.h" + +#ifdef __cplusplus +#ifndef _BACKWARD_BACKWARD_WARNING_H +#define _BACKWARD_BACKWARD_WARNING_H // make gcc 4.3 shut up about hash_* +#endif +#endif + +extern "C" { +#include <stdint.h> +#include <sys/types.h> +#include <sys/stat.h> +#include "statlite.h" +} + +#include <string> +#include <list> +#include <set> +#include <boost/container/flat_set.hpp> +#include <boost/container/flat_map.hpp> +#include <map> +#include <vector> +#include <optional> +#include <ostream> +#include <iomanip> + + +#include "include/unordered_map.h" + +#include "object.h" +#include "intarith.h" + +#include "acconfig.h" + +#include "assert.h" + +// DARWIN compatibility +#ifdef __APPLE__ +typedef long long loff_t; +typedef long long off64_t; +#define O_DIRECT 00040000 +#endif + +// FreeBSD compatibility +#ifdef __FreeBSD__ +typedef off_t loff_t; +typedef off_t off64_t; +#endif + +#if defined(__sun) || defined(_AIX) +typedef off_t loff_t; +#endif + + +// -- io helpers -- + +// Forward declare all the I/O helpers so strict ADL can find them in +// the case of containers of containers. I'm tempted to abstract this +// stuff using template templates like I did for denc. 
+ +namespace std { +template<class A, class B> +inline std::ostream& operator<<(std::ostream&out, const std::pair<A,B>& v); +template<class A, class Alloc> +inline std::ostream& operator<<(std::ostream& out, const std::vector<A,Alloc>& v); +template<class A, std::size_t N, class Alloc> +inline std::ostream& operator<<(std::ostream& out, const boost::container::small_vector<A,N,Alloc>& v); +template<class A, class Comp, class Alloc> +inline std::ostream& operator<<(std::ostream& out, const std::deque<A,Alloc>& v); +template<typename... Ts> +inline std::ostream& operator<<(std::ostream& out, const std::tuple<Ts...> &t); +template<typename T> +inline std::ostream& operator<<(std::ostream& out, const std::optional<T> &t); +template<class A, class Alloc> +inline std::ostream& operator<<(std::ostream& out, const std::list<A,Alloc>& ilist); +template<class A, class Comp, class Alloc> +inline std::ostream& operator<<(std::ostream& out, const std::set<A, Comp, Alloc>& iset); +template<class A, class Comp, class Alloc> +inline std::ostream& operator<<(std::ostream& out, const std::multiset<A,Comp,Alloc>& iset); +template<class A, class B, class Comp, class Alloc> +inline std::ostream& operator<<(std::ostream& out, const std::map<A,B,Comp,Alloc>& m); +template<class A, class B, class Comp, class Alloc> +inline std::ostream& operator<<(std::ostream& out, const std::multimap<A,B,Comp,Alloc>& m); +} + +namespace boost { +template<typename... 
Ts> +inline std::ostream& operator<<(std::ostream& out, const boost::tuple<Ts...> &t); + +namespace container { +template<class A, class Comp, class Alloc> +inline std::ostream& operator<<(std::ostream& out, const boost::container::flat_set<A, Comp, Alloc>& iset); +template<class A, class B, class Comp, class Alloc> +inline std::ostream& operator<<(std::ostream& out, const boost::container::flat_map<A, B, Comp, Alloc>& iset); +} +} + +namespace std { +template<class A, class B> +inline std::ostream& operator<<(std::ostream& out, const std::pair<A,B>& v) { + return out << v.first << "," << v.second; +} + +template<class A, class Alloc> +inline std::ostream& operator<<(std::ostream& out, const std::vector<A,Alloc>& v) { + bool first = true; + out << "["; + for (const auto& p : v) { + if (!first) out << ","; + out << p; + first = false; + } + out << "]"; + return out; +} + +template<class A, std::size_t N, class Alloc> +inline std::ostream& operator<<(std::ostream& out, const boost::container::small_vector<A,N,Alloc>& v) { + bool first = true; + out << "["; + for (const auto& p : v) { + if (!first) out << ","; + out << p; + first = false; + } + out << "]"; + return out; +} + +template<class A, class Alloc> +inline std::ostream& operator<<(std::ostream& out, const std::deque<A,Alloc>& v) { + out << "<"; + for (auto p = v.begin(); p != v.end(); ++p) { + if (p != v.begin()) out << ","; + out << *p; + } + out << ">"; + return out; +} + +template<typename... 
Ts> +inline std::ostream& operator<<(std::ostream& out, const std::tuple<Ts...> &t) { + auto f = [n = sizeof...(Ts), i = 0U, &out](const auto& e) mutable { + out << e; + if (++i != n) + out << ","; + }; + ceph::for_each(t, f); + return out; +} + +// Mimics boost::optional +template<typename T> +inline std::ostream& operator<<(std::ostream& out, const std::optional<T> &t) { + if (!t) + out << "--" ; + else + out << ' ' << *t ; + return out; +} + +template<class A, class Alloc> +inline std::ostream& operator<<(std::ostream& out, const std::list<A,Alloc>& ilist) { + for (auto it = ilist.begin(); + it != ilist.end(); + ++it) { + if (it != ilist.begin()) out << ","; + out << *it; + } + return out; +} + +template<class A, class Comp, class Alloc> +inline std::ostream& operator<<(std::ostream& out, const std::set<A, Comp, Alloc>& iset) { + for (auto it = iset.begin(); + it != iset.end(); + ++it) { + if (it != iset.begin()) out << ","; + out << *it; + } + return out; +} + +template<class A, class Comp, class Alloc> +inline std::ostream& operator<<(std::ostream& out, const std::multiset<A,Comp,Alloc>& iset) { + for (auto it = iset.begin(); + it != iset.end(); + ++it) { + if (it != iset.begin()) out << ","; + out << *it; + } + return out; +} + +template<class A, class B, class Comp, class Alloc> +inline std::ostream& operator<<(std::ostream& out, const std::map<A,B,Comp,Alloc>& m) +{ + out << "{"; + for (auto it = m.begin(); + it != m.end(); + ++it) { + if (it != m.begin()) out << ","; + out << it->first << "=" << it->second; + } + out << "}"; + return out; +} + +template<class A, class B, class Comp, class Alloc> +inline std::ostream& operator<<(std::ostream& out, const std::multimap<A,B,Comp,Alloc>& m) +{ + out << "{{"; + for (auto it = m.begin(); + it != m.end(); + ++it) { + if (it != m.begin()) out << ","; + out << it->first << "=" << it->second; + } + out << "}}"; + return out; +} + +} // namespace std + +namespace boost { +namespace tuples { +template<typename A, 
typename B, typename C> +inline std::ostream& operator<<(std::ostream& out, const boost::tuples::tuple<A, B, C> &t) { + return out << boost::get<0>(t) << "," + << boost::get<1>(t) << "," + << boost::get<2>(t); +} +} +namespace container { +template<class A, class Comp, class Alloc> +inline std::ostream& operator<<(std::ostream& out, const boost::container::flat_set<A, Comp, Alloc>& iset) { + for (auto it = iset.begin(); + it != iset.end(); + ++it) { + if (it != iset.begin()) out << ","; + out << *it; + } + return out; +} + +template<class A, class B, class Comp, class Alloc> +inline std::ostream& operator<<(std::ostream& out, const boost::container::flat_map<A, B, Comp, Alloc>& m) { + for (auto it = m.begin(); + it != m.end(); + ++it) { + if (it != m.begin()) out << ","; + out << it->first << "=" << it->second; + } + return out; +} +} +} // namespace boost + + + +/* + * comparators for stl containers + */ +// for ceph::unordered_map: +// ceph::unordered_map<const char*, long, hash<const char*>, eqstr> vals; +struct eqstr +{ + bool operator()(const char* s1, const char* s2) const + { + return strcmp(s1, s2) == 0; + } +}; + +// for set, map +struct ltstr +{ + bool operator()(const char* s1, const char* s2) const + { + return strcmp(s1, s2) < 0; + } +}; + + +namespace ceph { + class Formatter; +} + +#include "encoding.h" + +WRITE_RAW_ENCODER(ceph_fsid) +WRITE_RAW_ENCODER(ceph_file_layout) +WRITE_RAW_ENCODER(ceph_dir_layout) +WRITE_RAW_ENCODER(ceph_mds_session_head) +WRITE_RAW_ENCODER(ceph_mds_request_head_legacy) +WRITE_RAW_ENCODER(ceph_mds_request_release) +WRITE_RAW_ENCODER(ceph_filelock) +WRITE_RAW_ENCODER(ceph_mds_caps_head) +WRITE_RAW_ENCODER(ceph_mds_caps_export_body) +WRITE_RAW_ENCODER(ceph_mds_caps_non_export_body) +WRITE_RAW_ENCODER(ceph_mds_cap_peer) +WRITE_RAW_ENCODER(ceph_mds_cap_release) +WRITE_RAW_ENCODER(ceph_mds_cap_item) +WRITE_RAW_ENCODER(ceph_mds_lease) +WRITE_RAW_ENCODER(ceph_mds_snap_head) +WRITE_RAW_ENCODER(ceph_mds_snap_realm) 
+WRITE_RAW_ENCODER(ceph_mds_reply_head) +WRITE_RAW_ENCODER(ceph_mds_reply_cap) +WRITE_RAW_ENCODER(ceph_mds_cap_reconnect) +WRITE_RAW_ENCODER(ceph_mds_snaprealm_reconnect) +WRITE_RAW_ENCODER(ceph_frag_tree_split) +WRITE_RAW_ENCODER(ceph_osd_reply_head) +WRITE_RAW_ENCODER(ceph_osd_op) +WRITE_RAW_ENCODER(ceph_msg_header) +WRITE_RAW_ENCODER(ceph_msg_footer) +WRITE_RAW_ENCODER(ceph_msg_footer_old) +WRITE_RAW_ENCODER(ceph_mon_subscribe_item) + +WRITE_RAW_ENCODER(ceph_mon_statfs) +WRITE_RAW_ENCODER(ceph_mon_statfs_reply) + +// ---------------------- +// some basic types + +// NOTE: these must match ceph_fs.h typedefs +typedef uint64_t ceph_tid_t; // transaction id +typedef uint64_t version_t; +typedef __u32 epoch_t; // map epoch (32bits -> 13 epochs/second for 10 years) + +// -------------------------------------- +// identify individual mount clients by 64bit value + +struct client_t { + int64_t v; + + // cppcheck-suppress noExplicitConstructor + client_t(int64_t _v = -2) : v(_v) {} + + void encode(ceph::buffer::list& bl) const { + using ceph::encode; + encode(v, bl); + } + void decode(ceph::buffer::list::const_iterator& bl) { + using ceph::decode; + decode(v, bl); + } +}; +WRITE_CLASS_ENCODER(client_t) + +static inline bool operator==(const client_t& l, const client_t& r) { return l.v == r.v; } +static inline bool operator!=(const client_t& l, const client_t& r) { return l.v != r.v; } +static inline bool operator<(const client_t& l, const client_t& r) { return l.v < r.v; } +static inline bool operator<=(const client_t& l, const client_t& r) { return l.v <= r.v; } +static inline bool operator>(const client_t& l, const client_t& r) { return l.v > r.v; } +static inline bool operator>=(const client_t& l, const client_t& r) { return l.v >= r.v; } + +static inline bool operator>=(const client_t& l, int64_t o) { return l.v >= o; } +static inline bool operator<(const client_t& l, int64_t o) { return l.v < o; } + +inline std::ostream& operator<<(std::ostream& out, const 
client_t& c) { + return out << c.v; +} + + + +// -- + +namespace { +inline std::ostream& format_u(std::ostream& out, const uint64_t v, const uint64_t n, + const int index, const uint64_t mult, const char* u) + { + char buffer[32]; + + if (index == 0) { + (void) snprintf(buffer, sizeof(buffer), "%" PRId64 "%s", n, u); + } else if ((v % mult) == 0) { + // If this is an even multiple of the base, always display + // without any decimal fraction. + (void) snprintf(buffer, sizeof(buffer), "%" PRId64 "%s", n, u); + } else { + // We want to choose a precision that reflects the best choice + // for fitting in 5 characters. This can get rather tricky when + // we have numbers that are very close to an order of magnitude. + // For example, when displaying 10239 (which is really 9.999K), + // we want only a single place of precision for 10.0K. We could + // develop some complex heuristics for this, but it's much + // easier just to try each combination in turn. + int i; + for (i = 2; i >= 0; i--) { + if (snprintf(buffer, sizeof(buffer), "%.*f%s", i, + static_cast<double>(v) / mult, u) <= 7) + break; + } + } + + return out << buffer; + } +} + +/* + * Use this struct to pretty print values that should be formatted with a + * decimal unit prefix (the classic SI units). No actual unit will be added. + */ +struct si_u_t { + uint64_t v; + explicit si_u_t(uint64_t _v) : v(_v) {}; +}; + +inline std::ostream& operator<<(std::ostream& out, const si_u_t& b) +{ + uint64_t n = b.v; + int index = 0; + uint64_t mult = 1; + const char* u[] = {"", "k", "M", "G", "T", "P", "E"}; + + while (n >= 1000 && index < 7) { + n /= 1000; + index++; + mult *= 1000; + } + + return format_u(out, b.v, n, index, mult, u[index]); +} + +/* + * Use this struct to pretty print values that should be formatted with a + * binary unit prefix (IEC units). 
Since binary unit prefixes are to be used for + * "multiples of units in data processing, data transmission, and digital + * information" (so bits and bytes) and so far bits are not printed, the unit + * "B" for "byte" is added besides the multiplier. + */ +struct byte_u_t { + uint64_t v; + explicit byte_u_t(uint64_t _v) : v(_v) {}; +}; + +inline std::ostream& operator<<(std::ostream& out, const byte_u_t& b) +{ + uint64_t n = b.v; + int index = 0; + const char* u[] = {" B", " KiB", " MiB", " GiB", " TiB", " PiB", " EiB"}; + + while (n >= 1024 && index < 7) { + n /= 1024; + index++; + } + + return format_u(out, b.v, n, index, 1ULL << (10 * index), u[index]); +} + +inline std::ostream& operator<<(std::ostream& out, const ceph_mon_subscribe_item& i) +{ + return out << (long)i.start + << ((i.flags & CEPH_SUBSCRIBE_ONETIME) ? "" : "+"); +} + +struct weightf_t { + float v; + // cppcheck-suppress noExplicitConstructor + weightf_t(float _v) : v(_v) {} +}; + +inline std::ostream& operator<<(std::ostream& out, const weightf_t& w) +{ + if (w.v < -0.01F) { + return out << "-"; + } else if (w.v < 0.000001F) { + return out << "0"; + } else { + std::streamsize p = out.precision(); + return out << std::fixed << std::setprecision(5) << w.v << std::setprecision(p); + } +} + +struct shard_id_t { + int8_t id; + + shard_id_t() : id(0) {} + explicit shard_id_t(int8_t _id) : id(_id) {} + + operator int8_t() const { return id; } + + const static shard_id_t NO_SHARD; + + void encode(ceph::buffer::list &bl) const { + using ceph::encode; + encode(id, bl); + } + void decode(ceph::buffer::list::const_iterator &bl) { + using ceph::decode; + decode(id, bl); + } + + bool operator==(const shard_id_t&) const = default; + auto operator<=>(const shard_id_t&) const = default; +}; +WRITE_CLASS_ENCODER(shard_id_t) +std::ostream &operator<<(std::ostream &lhs, const shard_id_t &rhs); + +#if defined(__sun) || defined(_AIX) || defined(__APPLE__) || \ + defined(__FreeBSD__) || defined(_WIN32) +extern "C" { 
+__s32 ceph_to_hostos_errno(__s32 e); +__s32 hostos_to_ceph_errno(__s32 e); +} +#else +#define ceph_to_hostos_errno(e) (e) +#define hostos_to_ceph_errno(e) (e) +#endif + +struct errorcode32_t { + int32_t code; + + errorcode32_t() : code(0) {} + // cppcheck-suppress noExplicitConstructor + explicit errorcode32_t(int32_t i) : code(i) {} + + operator int() const { return code; } + int* operator&() { return &code; } + errorcode32_t& operator=(int32_t i) { + code = i; + return *this; + } + bool operator==(const errorcode32_t&) const = default; + auto operator<=>(const errorcode32_t&) const = default; + + void encode(ceph::buffer::list &bl) const { + using ceph::encode; + __s32 newcode = hostos_to_ceph_errno(code); + encode(newcode, bl); + } + void decode(ceph::buffer::list::const_iterator &bl) { + using ceph::decode; + decode(code, bl); + code = ceph_to_hostos_errno(code); + } +}; +WRITE_CLASS_ENCODER(errorcode32_t) + +template <uint8_t S> +struct sha_digest_t { + constexpr static uint32_t SIZE = S; + // TODO: we might consider std::array in the future. Avoiding it for now + // as sha_digest_t is a part of our public API. 
+ unsigned char v[S] = {0}; + + std::string to_str() const { + char str[S * 2 + 1] = {0}; + str[0] = '\0'; + for (size_t i = 0; i < S; i++) { + ::sprintf(&str[i * 2], "%02x", static_cast<int>(v[i])); + } + return std::string(str); + } + sha_digest_t(const unsigned char *_v) { memcpy(v, _v, SIZE); }; + sha_digest_t() {} + + bool operator==(const sha_digest_t& r) const { + return ::memcmp(v, r.v, SIZE) == 0; + } + bool operator!=(const sha_digest_t& r) const { + return ::memcmp(v, r.v, SIZE) != 0; + } + + void encode(ceph::buffer::list &bl) const { + // copy to avoid reinterpret_cast, is_pod and other nasty things + using ceph::encode; + std::array<unsigned char, SIZE> tmparr; + memcpy(tmparr.data(), v, SIZE); + encode(tmparr, bl); + } + void decode(ceph::buffer::list::const_iterator &bl) { + using ceph::decode; + std::array<unsigned char, SIZE> tmparr; + decode(tmparr, bl); + memcpy(v, tmparr.data(), SIZE); + } +}; + +template<uint8_t S> +inline std::ostream &operator<<(std::ostream &out, const sha_digest_t<S> &b) { + std::string str = b.to_str(); + return out << str; +} + +#if FMT_VERSION >= 90000 +template <uint8_t S> struct fmt::formatter<sha_digest_t<S>> : fmt::ostream_formatter {}; +#endif + +using sha1_digest_t = sha_digest_t<20>; +WRITE_CLASS_ENCODER(sha1_digest_t) + +using sha256_digest_t = sha_digest_t<32>; +WRITE_CLASS_ENCODER(sha256_digest_t) + +using sha512_digest_t = sha_digest_t<64>; + +using md5_digest_t = sha_digest_t<16>; +WRITE_CLASS_ENCODER(md5_digest_t) + + +#endif diff --git a/src/include/unordered_map.h b/src/include/unordered_map.h new file mode 100644 index 000000000..aee5f5a76 --- /dev/null +++ b/src/include/unordered_map.h @@ -0,0 +1,11 @@ +#ifndef CEPH_UNORDERED_MAP_H +#define CEPH_UNORDERED_MAP_H + +#include <unordered_map> + +namespace ceph { + using std::unordered_map; + using std::unordered_multimap; +} + +#endif diff --git a/src/include/unordered_set.h b/src/include/unordered_set.h new file mode 100644 index 000000000..e30e1799e --- 
/dev/null +++ b/src/include/unordered_set.h @@ -0,0 +1,10 @@ +#ifndef CEPH_UNORDERED_SET_H +#define CEPH_UNORDERED_SET_H + +#include <unordered_set> + +namespace ceph { + using std::unordered_set; +} + +#endif diff --git a/src/include/uses_allocator.h b/src/include/uses_allocator.h new file mode 100644 index 000000000..35cdbd709 --- /dev/null +++ b/src/include/uses_allocator.h @@ -0,0 +1,266 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +// Derived from: +/* uses_allocator.h -*-C++-*- + * + * Copyright (C) 2016 Pablo Halpern <phalpern@halpernwightsoftware.com> + * Distributed under the Boost Software License - Version 1.0 + */ +// Downloaded from https://github.com/phalpern/uses-allocator.git + +#pragma once + +#include <memory> +#include <tuple> +#include <type_traits> +#include <utility> + +namespace ceph { + +namespace internal { +template <class T, class Tuple, std::size_t... Indexes> +T make_from_tuple_imp(Tuple&& t, std::index_sequence<Indexes...>) +{ + return T(std::get<Indexes>(std::forward<Tuple>(t))...); +} +} // namespace internal + +template<class T, class Tuple> +T make_from_tuple(Tuple&& args_tuple) +{ + using namespace internal; + using Indices = std::make_index_sequence<std::tuple_size_v< + std::decay_t<Tuple>>>; + return make_from_tuple_imp<T>(std::forward<Tuple>(args_tuple), Indices{}); +} + +//////////////////////////////////////////////////////////////////////// + +// Forward declaration +template <class T, class Alloc, class... Args> +auto uses_allocator_construction_args(const Alloc& a, Args&&... 
args); + +namespace internal { + +template <class T, class A> +struct has_allocator : std::uses_allocator<T, A> { }; + +// Specialization of `has_allocator` for `std::pair` +template <class T1, class T2, class A> +struct has_allocator<std::pair<T1, T2>, A> + : std::integral_constant<bool, has_allocator<T1, A>::value || + has_allocator<T2, A>::value> +{ +}; + +template <bool V> using boolean_constant = std::integral_constant<bool, V>; + +template <class T> struct is_pair : std::false_type { }; + +template <class T1, class T2> +struct is_pair<std::pair<T1, T2>> : std::true_type { }; + +// Return a tuple of arguments appropriate for uses-allocator construction +// with allocator `Alloc` and ctor arguments `Args`. +// This overload is handles types for which `has_allocator<T, Alloc>` is false. +template <class T, class Unused1, class Unused2, class Alloc, class... Args> +auto uses_allocator_args_imp(Unused1 /* is_pair */, + std::false_type /* has_allocator */, + Unused2 /* uses prefix allocator arg */, + const Alloc& /* ignored */, + Args&&... args) +{ + // Allocator is ignored + return std::forward_as_tuple(std::forward<Args>(args)...); +} + +// Return a tuple of arguments appropriate for uses-allocator construction +// with allocator `Alloc` and ctor arguments `Args`. +// This overload handles non-pair `T` for which `has_allocator<T, Alloc>` is +// true and constructor `T(allocator_arg_t, a, args...)` is valid. +template <class T, class Alloc, class... Args> +auto uses_allocator_args_imp(std::false_type /* is_pair */, + std::true_type /* has_allocator */, + std::true_type /* uses prefix allocator arg */, + const Alloc& a, + Args&&... args) +{ + // Allocator added to front of argument list, after `allocator_arg`. + return std::tuple<std::allocator_arg_t, const Alloc&, + Args&&...>(std::allocator_arg, a, std::forward<Args>(args)...); +} + +// Return a tuple of arguments appropriate for uses-allocator construction +// with allocator `Alloc` and ctor arguments `Args`. 
+// This overload handles non-pair `T` for which `has_allocator<T, Alloc>` is +// true and constructor `T(allocator_arg_t, a, args...)` NOT valid. +// This function will produce invalid results unless `T(args..., a)` is valid. +template <class T1, class Alloc, class... Args> +auto uses_allocator_args_imp(std::false_type /* is_pair */, + std::true_type /* has_allocator */, + std::false_type /* prefix allocator arg */, + const Alloc& a, + Args&&... args) +{ + // Allocator added to end of argument list + return std::forward_as_tuple(std::forward<Args>(args)..., a); +} + +// Return a tuple of arguments appropriate for uses-allocator construction +// with allocator `Alloc` and ctor arguments `Args`. +// This overload handles specializations of `T` = `std::pair` for which +// `has_allocator<T, Alloc>` is true for either or both of the elements and +// piecewise_construct arguments are passed in. +template <class T, class Alloc, class Tuple1, class Tuple2> +auto uses_allocator_args_imp(std::true_type /* is_pair */, + std::true_type /* has_allocator */, + std::false_type /* prefix allocator arg */, + const Alloc& a, + std::piecewise_construct_t, + Tuple1&& x, Tuple2&& y) +{ + using T1 = typename T::first_type; + using T2 = typename T::second_type; + + return std::make_tuple( + std::piecewise_construct, + std::apply([&a](auto&&... args1) -> auto { + return uses_allocator_construction_args<T1>( + a, std::forward<decltype(args1)>(args1)...); + }, std::forward<Tuple1>(x)), + std::apply([&a](auto&&... args2) -> auto { + return uses_allocator_construction_args<T2>( + a, std::forward<decltype(args2)>(args2)...); + }, std::forward<Tuple2>(y)) + ); +} + +// Return a tuple of arguments appropriate for uses-allocator construction +// with allocator `Alloc` and ctor arguments `Args`. +// This overload handles specializations of `T` = `std::pair` for which +// `has_allocator<T, Alloc>` is true for either or both of the elements and +// no other constructor arguments are passed in. 
+template <class T, class Alloc> +auto uses_allocator_args_imp(std::true_type /* is_pair */, + std::true_type /* has_allocator */, + std::false_type /* prefix allocator arg */, + const Alloc& a) +{ + // using T1 = typename T::first_type; + // using T2 = typename T::second_type; + + // return std::make_tuple( + // piecewise_construct, + // uses_allocator_construction_args<T1>(a), + // uses_allocator_construction_args<T2>(a)); + return uses_allocator_construction_args<T>(a, std::piecewise_construct, + std::tuple<>{}, std::tuple<>{}); +} + +// Return a tuple of arguments appropriate for uses-allocator construction +// with allocator `Alloc` and ctor arguments `Args`. +// This overload handles specializations of `T` = `std::pair` for which +// `has_allocator<T, Alloc>` is true for either or both of the elements and +// a single argument of type const-lvalue-of-pair is passed in. +template <class T, class Alloc, class U1, class U2> +auto uses_allocator_args_imp(std::true_type /* is_pair */, + std::true_type /* has_allocator */, + std::false_type /* prefix allocator arg */, + const Alloc& a, + const std::pair<U1, U2>& arg) +{ + // using T1 = typename T::first_type; + // using T2 = typename T::second_type; + + // return std::make_tuple( + // piecewise_construct, + // uses_allocator_construction_args<T1>(a, arg.first), + // uses_allocator_construction_args<T2>(a, arg.second)); + return uses_allocator_construction_args<T>(a, std::piecewise_construct, + std::forward_as_tuple(arg.first), + std::forward_as_tuple(arg.second)); +} + +// Return a tuple of arguments appropriate for uses-allocator construction +// with allocator `Alloc` and ctor arguments `Args`. +// This overload handles specializations of `T` = `std::pair` for which +// `has_allocator<T, Alloc>` is true for either or both of the elements and +// a single argument of type rvalue-of-pair is passed in. 
+template <class T, class Alloc, class U1, class U2> +auto uses_allocator_args_imp(std::true_type /* is_pair */, + std::true_type /* has_allocator */, + std::false_type /* prefix allocator arg */, + const Alloc& a, + std::pair<U1, U2>&& arg) +{ + // using T1 = typename T::first_type; + // using T2 = typename T::second_type; + + // return std::make_tuple( + // piecewise_construct, + // uses_allocator_construction_args<T1>(a, forward<U1>(arg.first)), + // uses_allocator_construction_args<T2>(a, forward<U2>(arg.second))); + return uses_allocator_construction_args<T>(a, std::piecewise_construct, + std::forward_as_tuple(std::forward<U1>(arg.first)), + std::forward_as_tuple(std::forward<U2>(arg.second))); +} + +// Return a tuple of arguments appropriate for uses-allocator construction +// with allocator `Alloc` and ctor arguments `Args`. +// This overload handles specializations of `T` = `std::pair` for which +// `has_allocator<T, Alloc>` is true for either or both of the elements and +// two additional constructor arguments are passed in. +template <class T, class Alloc, class U1, class U2> +auto uses_allocator_args_imp(std::true_type /* is_pair */, + std::true_type /* has_allocator */, + std::false_type /* prefix allocator arg */, + const Alloc& a, + U1&& arg1, U2&& arg2) +{ + // using T1 = typename T::first_type; + // using T2 = typename T::second_type; + + // return std::make_tuple( + // piecewise_construct, + // uses_allocator_construction_args<T1>(a, forward<U1>(arg1)), + // uses_allocator_construction_args<T2>(a, forward<U2>(arg2))); + return uses_allocator_construction_args<T>( + a, std::piecewise_construct, + std::forward_as_tuple(std::forward<U1>(arg1)), + std::forward_as_tuple(std::forward<U2>(arg2))); +} + +} // close namespace internal + +template <class T, class Alloc, class... Args> +auto uses_allocator_construction_args(const Alloc& a, Args&&... 
args) +{ + using namespace internal; + return uses_allocator_args_imp<T>(is_pair<T>(), + has_allocator<T, Alloc>(), + std::is_constructible<T, std::allocator_arg_t, + Alloc, Args...>(), + a, std::forward<Args>(args)...); +} + +template <class T, class Alloc, class... Args> +T make_obj_using_allocator(const Alloc& a, Args&&... args) +{ + return make_from_tuple<T>( + uses_allocator_construction_args<T>(a, std::forward<Args>(args)...)); +} + +template <class T, class Alloc, class... Args> +T* uninitialized_construct_using_allocator(T* p, + const Alloc& a, + Args&&... args) +{ + return std::apply([p](auto&&... args2){ + return ::new(static_cast<void*>(p)) + T(std::forward<decltype(args2)>(args2)...); + }, uses_allocator_construction_args<T>( + a, std::forward<Args>(args)...)); +} + +} // namespace ceph diff --git a/src/include/util.h b/src/include/util.h new file mode 100644 index 000000000..acad4a52c --- /dev/null +++ b/src/include/util.h @@ -0,0 +1,114 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2012 Inktank Storage, Inc. + * Copyright (C) 2014 Red Hat <contact@redhat.com> + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. 
+ */ +#ifndef CEPH_UTIL_H +#define CEPH_UTIL_H + +#include "common/Formatter.h" +#include "include/types.h" + +std::string bytes2str(uint64_t count); + +struct ceph_data_stats +{ + uint64_t byte_total; + uint64_t byte_used; + uint64_t byte_avail; + int avail_percent; + + ceph_data_stats() : + byte_total(0), + byte_used(0), + byte_avail(0), + avail_percent(0) + { } + + void dump(ceph::Formatter *f) const { + ceph_assert(f != NULL); + f->dump_int("total", byte_total); + f->dump_int("used", byte_used); + f->dump_int("avail", byte_avail); + f->dump_int("avail_percent", avail_percent); + } + + void encode(ceph::buffer::list &bl) const { + ENCODE_START(1, 1, bl); + encode(byte_total, bl); + encode(byte_used, bl); + encode(byte_avail, bl); + encode(avail_percent, bl); + ENCODE_FINISH(bl); + } + + void decode(ceph::buffer::list::const_iterator &p) { + DECODE_START(1, p); + decode(byte_total, p); + decode(byte_used, p); + decode(byte_avail, p); + decode(avail_percent, p); + DECODE_FINISH(p); + } + + static void generate_test_instances(std::list<ceph_data_stats*>& ls) { + ls.push_back(new ceph_data_stats); + ls.push_back(new ceph_data_stats); + ls.back()->byte_total = 1024*1024; + ls.back()->byte_used = 512*1024; + ls.back()->byte_avail = 512*1024; + ls.back()->avail_percent = 50; + } +}; +typedef struct ceph_data_stats ceph_data_stats_t; +WRITE_CLASS_ENCODER(ceph_data_stats) + +int get_fs_stats(ceph_data_stats_t &stats, const char *path); + +/// get memory limit for the current cgroup +int get_cgroup_memory_limit(uint64_t *limit); + +/// collect info from @p uname(2), @p /proc/meminfo and @p /proc/cpuinfo +void collect_sys_info(std::map<std::string, std::string> *m, CephContext *cct); + +#ifdef _WIN32 +/// Retrieve the actual Windows version, regardless of the app manifest. 
+int get_windows_version(POSVERSIONINFOEXW ver); +#endif + +/// dump service ids grouped by their host to the specified formatter +/// @param f formatter for the output +/// @param services a map from hostname to a list of service id hosted by this host +/// @param type the service type of given @p services, for example @p osd or @p mon. +void dump_services(ceph::Formatter* f, + const std::map<std::string, std::list<int> >& services, + const char* type); +/// dump service names grouped by their host to the specified formatter +/// @param f formatter for the output +/// @param services a map from hostname to a list of service name hosted by this host +/// @param type the service type of given @p services, for example @p osd or @p mon. +void dump_services(ceph::Formatter* f, const std::map<std::string, + std::list<std::string> >& services, const char* type); + +std::string cleanbin(ceph::buffer::list &bl, bool &b64, bool show = false); +std::string cleanbin(std::string &str); + +namespace ceph::util { + +// Returns true if s matches any parameters: +template <typename ...XS> +bool match_str(const std::string& s, const XS& ...xs) +{ + return ((s == xs) || ...); +} + +} // namespace ceph::util +#endif /* CEPH_UTIL_H */ diff --git a/src/include/utime.cc b/src/include/utime.cc new file mode 100644 index 000000000..2252a1ca4 --- /dev/null +++ b/src/include/utime.cc @@ -0,0 +1,31 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2019 Red Hat + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. 
+ * + */ + +#include "utime.h" +#include "common/Formatter.h" + +void utime_t::dump(ceph::Formatter *f) const +{ + f->dump_int("seconds", tv.tv_sec); + f->dump_int("nanoseconds", tv.tv_nsec); +} + +void utime_t::generate_test_instances(std::list<utime_t*>& o) +{ + o.push_back(new utime_t()); + o.push_back(new utime_t()); + o.back()->tv.tv_sec = static_cast<__u32>((1L << 32) - 1); + o.push_back(new utime_t()); + o.back()->tv.tv_nsec = static_cast<__u32>((1L << 32) - 1); +} diff --git a/src/include/utime.h b/src/include/utime.h new file mode 100644 index 000000000..fad66af79 --- /dev/null +++ b/src/include/utime.h @@ -0,0 +1,602 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net> + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. + * + */ + +#ifndef CEPH_UTIME_H +#define CEPH_UTIME_H + +#include <math.h> +#include <sys/time.h> +#include <time.h> +#include <errno.h> + +#if defined(WITH_SEASTAR) +#include <seastar/core/lowres_clock.hh> +#endif + +#include "include/compat.h" +#include "include/types.h" +#include "include/timegm.h" +#include "common/strtol.h" +#include "common/ceph_time.h" +#include "common/safe_io.h" +#include "common/SubProcess.h" +#include "include/denc.h" + + +// -------- +// utime_t + +inline __u32 cap_to_u32_max(__u64 t) { + return std::min(t, (__u64)std::numeric_limits<uint32_t>::max()); +} +/* WARNING: If add member in utime_t, please make sure the encode/decode function + * work well. For little-endian machine, we should make sure there is no padding + * in 32-bit machine and 64-bit machine. + * You should also modify the padding_check function. 
+ */ +class utime_t { +public: + struct { + __u32 tv_sec, tv_nsec; + } tv; + + public: + bool is_zero() const { + return (tv.tv_sec == 0) && (tv.tv_nsec == 0); + } + + void normalize() { + if (tv.tv_nsec > 1000000000ul) { + tv.tv_sec = cap_to_u32_max(tv.tv_sec + tv.tv_nsec / (1000000000ul)); + tv.tv_nsec %= 1000000000ul; + } + } + + // cons + utime_t() { tv.tv_sec = 0; tv.tv_nsec = 0; } + utime_t(time_t s, int n) { tv.tv_sec = s; tv.tv_nsec = n; normalize(); } + utime_t(const struct ceph_timespec &v) { + decode_timeval(&v); + } + utime_t(const struct timespec v) + { + // NOTE: this is used by ceph_clock_now() so should be kept + // as thin as possible. + tv.tv_sec = v.tv_sec; + tv.tv_nsec = v.tv_nsec; + } + // conversion from ceph::real_time/coarse_real_time + template <typename Clock, typename std::enable_if_t< + ceph::converts_to_timespec_v<Clock>>* = nullptr> + explicit utime_t(const std::chrono::time_point<Clock>& t) + : utime_t(Clock::to_timespec(t)) {} // forward to timespec ctor + + template<class Rep, class Period> + explicit utime_t(const std::chrono::duration<Rep, Period>& dur) { + using common_t = std::common_type_t<Rep, int>; + tv.tv_sec = std::max<common_t>(std::chrono::duration_cast<std::chrono::seconds>(dur).count(), 0); + tv.tv_nsec = std::max<common_t>((std::chrono::duration_cast<std::chrono::nanoseconds>(dur) % + std::chrono::seconds(1)).count(), 0); + } +#if defined(WITH_SEASTAR) + explicit utime_t(const seastar::lowres_system_clock::time_point& t) { + tv.tv_sec = std::chrono::duration_cast<std::chrono::seconds>( + t.time_since_epoch()).count(); + tv.tv_nsec = std::chrono::duration_cast<std::chrono::nanoseconds>( + t.time_since_epoch() % std::chrono::seconds(1)).count(); + } + explicit operator seastar::lowres_system_clock::time_point() const noexcept { + using clock_t = seastar::lowres_system_clock; + return clock_t::time_point{std::chrono::duration_cast<clock_t::duration>( + std::chrono::seconds{tv.tv_sec} + 
std::chrono::nanoseconds{tv.tv_nsec})}; + } +#endif + + utime_t(const struct timeval &v) { + set_from_timeval(&v); + } + utime_t(const struct timeval *v) { + set_from_timeval(v); + } + void to_timespec(struct timespec *ts) const { + ts->tv_sec = tv.tv_sec; + ts->tv_nsec = tv.tv_nsec; + } + void set_from_double(double d) { + tv.tv_sec = (__u32)trunc(d); + tv.tv_nsec = (__u32)((d - (double)tv.tv_sec) * 1000000000.0); + } + + ceph::real_time to_real_time() const { + ceph_timespec ts; + encode_timeval(&ts); + return ceph::real_clock::from_ceph_timespec(ts); + } + + // accessors + time_t sec() const { return tv.tv_sec; } + long usec() const { return tv.tv_nsec/1000; } + int nsec() const { return tv.tv_nsec; } + + // ref accessors/modifiers + __u32& sec_ref() { return tv.tv_sec; } + __u32& nsec_ref() { return tv.tv_nsec; } + + uint64_t to_nsec() const { + return (uint64_t)tv.tv_nsec + (uint64_t)tv.tv_sec * 1000000000ull; + } + uint64_t to_msec() const { + return (uint64_t)tv.tv_nsec / 1000000ull + (uint64_t)tv.tv_sec * 1000ull; + } + + void copy_to_timeval(struct timeval *v) const { + v->tv_sec = tv.tv_sec; + v->tv_usec = tv.tv_nsec/1000; + } + void set_from_timeval(const struct timeval *v) { + tv.tv_sec = v->tv_sec; + tv.tv_nsec = v->tv_usec*1000; + } + void padding_check() { + static_assert( + sizeof(utime_t) == + sizeof(tv.tv_sec) + + sizeof(tv.tv_nsec) + , + "utime_t have padding"); + } + void encode(ceph::buffer::list &bl) const { +#if defined(CEPH_LITTLE_ENDIAN) + bl.append((char *)(this), sizeof(__u32) + sizeof(__u32)); +#else + using ceph::encode; + encode(tv.tv_sec, bl); + encode(tv.tv_nsec, bl); +#endif + } + void decode(ceph::buffer::list::const_iterator &p) { +#if defined(CEPH_LITTLE_ENDIAN) + p.copy(sizeof(__u32) + sizeof(__u32), (char *)(this)); +#else + using ceph::decode; + decode(tv.tv_sec, p); + decode(tv.tv_nsec, p); +#endif + } + + DENC(utime_t, v, p) { + denc(v.tv.tv_sec, p); + denc(v.tv.tv_nsec, p); + } + + void dump(ceph::Formatter *f) const; + 
static void generate_test_instances(std::list<utime_t*>& o); + + void encode_timeval(struct ceph_timespec *t) const { + t->tv_sec = tv.tv_sec; + t->tv_nsec = tv.tv_nsec; + } + void decode_timeval(const struct ceph_timespec *t) { + tv.tv_sec = t->tv_sec; + tv.tv_nsec = t->tv_nsec; + } + + utime_t round_to_minute() { + struct tm bdt; + time_t tt = sec(); + localtime_r(&tt, &bdt); + bdt.tm_sec = 0; + tt = mktime(&bdt); + return utime_t(tt, 0); + } + + utime_t round_to_hour() { + struct tm bdt; + time_t tt = sec(); + localtime_r(&tt, &bdt); + bdt.tm_sec = 0; + bdt.tm_min = 0; + tt = mktime(&bdt); + return utime_t(tt, 0); + } + + utime_t round_to_day() { + struct tm bdt; + time_t tt = sec(); + localtime_r(&tt, &bdt); + bdt.tm_sec = 0; + bdt.tm_min = 0; + bdt.tm_hour = 0; + tt = mktime(&bdt); + return utime_t(tt, 0); + } + + // cast to double + operator double() const { + return (double)sec() + ((double)nsec() / 1000000000.0f); + } + operator ceph_timespec() const { + ceph_timespec ts; + ts.tv_sec = sec(); + ts.tv_nsec = nsec(); + return ts; + } + + void sleep() const { + struct timespec ts; + to_timespec(&ts); + nanosleep(&ts, NULL); + } + + // output + std::ostream& gmtime(std::ostream& out, bool legacy_form=false) const { + out.setf(std::ios::right); + char oldfill = out.fill(); + out.fill('0'); + if (sec() < ((time_t)(60*60*24*365*10))) { + // raw seconds. this looks like a relative time. + out << (long)sec() << "." << std::setw(6) << usec(); + } else { + // this looks like an absolute time. + // conform to http://en.wikipedia.org/wiki/ISO_8601 + struct tm bdt; + time_t tt = sec(); + gmtime_r(&tt, &bdt); + out << std::setw(4) << (bdt.tm_year+1900) // 2007 -> '07' + << '-' << std::setw(2) << (bdt.tm_mon+1) + << '-' << std::setw(2) << bdt.tm_mday; + if (legacy_form) { + out << ' '; + } else { + out << 'T'; + } + out << std::setw(2) << bdt.tm_hour + << ':' << std::setw(2) << bdt.tm_min + << ':' << std::setw(2) << bdt.tm_sec; + out << "." 
<< std::setw(6) << usec(); + out << "Z"; + } + out.fill(oldfill); + out.unsetf(std::ios::right); + return out; + } + + // output + std::ostream& gmtime_nsec(std::ostream& out) const { + out.setf(std::ios::right); + char oldfill = out.fill(); + out.fill('0'); + if (sec() < ((time_t)(60*60*24*365*10))) { + // raw seconds. this looks like a relative time. + out << (long)sec() << "." << std::setw(6) << usec(); + } else { + // this looks like an absolute time. + // conform to http://en.wikipedia.org/wiki/ISO_8601 + struct tm bdt; + time_t tt = sec(); + gmtime_r(&tt, &bdt); + out << std::setw(4) << (bdt.tm_year+1900) // 2007 -> '07' + << '-' << std::setw(2) << (bdt.tm_mon+1) + << '-' << std::setw(2) << bdt.tm_mday + << 'T' + << std::setw(2) << bdt.tm_hour + << ':' << std::setw(2) << bdt.tm_min + << ':' << std::setw(2) << bdt.tm_sec; + out << "." << std::setw(9) << nsec(); + out << "Z"; + } + out.fill(oldfill); + out.unsetf(std::ios::right); + return out; + } + + // output + std::ostream& asctime(std::ostream& out) const { + out.setf(std::ios::right); + char oldfill = out.fill(); + out.fill('0'); + if (sec() < ((time_t)(60*60*24*365*10))) { + // raw seconds. this looks like a relative time. + out << (long)sec() << "." << std::setw(6) << usec(); + } else { + // this looks like an absolute time. + struct tm bdt; + time_t tt = sec(); + gmtime_r(&tt, &bdt); + + char buf[128]; + asctime_r(&bdt, buf); + int len = strlen(buf); + if (buf[len - 1] == '\n') + buf[len - 1] = '\0'; + out << buf; + } + out.fill(oldfill); + out.unsetf(std::ios::right); + return out; + } + + std::ostream& localtime(std::ostream& out, bool legacy_form=false) const { + out.setf(std::ios::right); + char oldfill = out.fill(); + out.fill('0'); + if (sec() < ((time_t)(60*60*24*365*10))) { + // raw seconds. this looks like a relative time. + out << (long)sec() << "." << std::setw(6) << usec(); + } else { + // this looks like an absolute time. 
+ // conform to http://en.wikipedia.org/wiki/ISO_8601 + struct tm bdt; + time_t tt = sec(); + localtime_r(&tt, &bdt); + out << std::setw(4) << (bdt.tm_year+1900) // 2007 -> '07' + << '-' << std::setw(2) << (bdt.tm_mon+1) + << '-' << std::setw(2) << bdt.tm_mday; + if (legacy_form) { + out << ' '; + } else { + out << 'T'; + } + out << std::setw(2) << bdt.tm_hour + << ':' << std::setw(2) << bdt.tm_min + << ':' << std::setw(2) << bdt.tm_sec; + out << "." << std::setw(6) << usec(); + if (!legacy_form) { + char buf[32] = { 0 }; + strftime(buf, sizeof(buf), "%z", &bdt); + out << buf; + } + } + out.fill(oldfill); + out.unsetf(std::ios::right); + return out; + } + + static int invoke_date(const std::string& date_str, utime_t *result) { + char buf[256]; + + SubProcess bin_date("/bin/date", SubProcess::CLOSE, SubProcess::PIPE, + SubProcess::KEEP); + bin_date.add_cmd_args("-d", date_str.c_str(), "+%s %N", NULL); + + int r = bin_date.spawn(); + if (r < 0) return r; + + ssize_t n = safe_read(bin_date.get_stdout(), buf, sizeof(buf)); + + r = bin_date.join(); + if (r || n <= 0) return -EINVAL; + + uint64_t epoch, nsec; + std::istringstream iss(buf); + + iss >> epoch; + iss >> nsec; + + *result = utime_t(epoch, nsec); + + return 0; + } + + + static int parse_date(const std::string& date, uint64_t *epoch, uint64_t *nsec, + std::string *out_date=nullptr, + std::string *out_time=nullptr) { + struct tm tm; + memset(&tm, 0, sizeof(tm)); + + if (nsec) + *nsec = 0; + + const char *p = strptime(date.c_str(), "%Y-%m-%d", &tm); + if (p) { + if (*p == ' ' || *p == 'T') { + p++; + // strptime doesn't understand fractional/decimal seconds, and + // it also only takes format chars or literals, so we have to + // get creative. 
+ char fmt[32] = {0}; + strncpy(fmt, p, sizeof(fmt) - 1); + fmt[0] = '%'; + fmt[1] = 'H'; + fmt[2] = ':'; + fmt[3] = '%'; + fmt[4] = 'M'; + fmt[6] = '%'; + fmt[7] = 'S'; + const char *subsec = 0; + char *q = fmt + 8; + if (*q == '.') { + ++q; + subsec = p + 9; + q = fmt + 9; + while (*q && isdigit(*q)) { + ++q; + } + } + // look for tz... + if (*q == '-' || *q == '+') { + *q = '%'; + *(q+1) = 'z'; + *(q+2) = 0; + } + p = strptime(p, fmt, &tm); + if (!p) { + return -EINVAL; + } + if (nsec && subsec) { + unsigned i; + char buf[10]; /* 9 digit + null termination */ + for (i = 0; (i < sizeof(buf) - 1) && isdigit(*subsec); ++i, ++subsec) { + buf[i] = *subsec; + } + for (; i < sizeof(buf) - 1; ++i) { + buf[i] = '0'; + } + buf[i] = '\0'; + std::string err; + *nsec = (uint64_t)strict_strtol(buf, 10, &err); + if (!err.empty()) { + return -EINVAL; + } + } + } + } else { + int sec, usec; + int r = sscanf(date.c_str(), "%d.%d", &sec, &usec); + if (r != 2) { + return -EINVAL; + } + + time_t tt = sec; + gmtime_r(&tt, &tm); + + if (nsec) { + *nsec = (uint64_t)usec * 1000; + } + } + + #ifndef _WIN32 + // apply the tm_gmtoff manually below, since none of mktime, + // gmtime, and localtime seem to do it. zero it out here just in + // case some other libc *does* apply it. 
:( + auto gmtoff = tm.tm_gmtoff; + tm.tm_gmtoff = 0; + #else + auto gmtoff = _timezone; + #endif /* _WIN32 */ + + time_t t = internal_timegm(&tm); + if (epoch) + *epoch = (uint64_t)t; + + *epoch -= gmtoff; + + if (out_date) { + char buf[32]; + strftime(buf, sizeof(buf), "%Y-%m-%d", &tm); + *out_date = buf; + } + if (out_time) { + char buf[32]; + strftime(buf, sizeof(buf), "%H:%M:%S", &tm); + *out_time = buf; + } + + return 0; + } + + bool parse(const std::string& s) { + uint64_t epoch, nsec; + int r = parse_date(s, &epoch, &nsec); + if (r < 0) { + return false; + } + *this = utime_t(epoch, nsec); + return true; + } +}; +WRITE_CLASS_ENCODER(utime_t) +WRITE_CLASS_DENC(utime_t) + +// arithmetic operators +inline utime_t operator+(const utime_t& l, const utime_t& r) { + __u64 sec = (__u64)l.sec() + r.sec(); + return utime_t(cap_to_u32_max(sec), l.nsec() + r.nsec()); +} +inline utime_t& operator+=(utime_t& l, const utime_t& r) { + l.sec_ref() = cap_to_u32_max((__u64)l.sec() + r.sec()); + l.nsec_ref() += r.nsec(); + l.normalize(); + return l; +} +inline utime_t& operator+=(utime_t& l, double f) { + double fs = trunc(f); + double ns = (f - fs) * 1000000000.0; + l.sec_ref() = cap_to_u32_max(l.sec() + (__u64)fs); + l.nsec_ref() += (long)ns; + l.normalize(); + return l; +} + +inline utime_t operator-(const utime_t& l, const utime_t& r) { + return utime_t( l.sec() - r.sec() - (l.nsec()<r.nsec() ? 1:0), + l.nsec() - r.nsec() + (l.nsec()<r.nsec() ? 
1000000000:0) ); +} +inline utime_t& operator-=(utime_t& l, const utime_t& r) { + l.sec_ref() -= r.sec(); + if (l.nsec() >= r.nsec()) + l.nsec_ref() -= r.nsec(); + else { + l.nsec_ref() += 1000000000L - r.nsec(); + l.sec_ref()--; + } + return l; +} +inline utime_t& operator-=(utime_t& l, double f) { + double fs = trunc(f); + double ns = (f - fs) * 1000000000.0; + l.sec_ref() -= (long)fs; + long nsl = (long)ns; + if (nsl) { + l.sec_ref()--; + l.nsec_ref() = 1000000000L + l.nsec_ref() - nsl; + } + l.normalize(); + return l; +} + + +// comparators +inline bool operator>(const utime_t& a, const utime_t& b) +{ + return (a.sec() > b.sec()) || (a.sec() == b.sec() && a.nsec() > b.nsec()); +} +inline bool operator<=(const utime_t& a, const utime_t& b) +{ + return !(operator>(a, b)); +} +inline bool operator<(const utime_t& a, const utime_t& b) +{ + return (a.sec() < b.sec()) || (a.sec() == b.sec() && a.nsec() < b.nsec()); +} +inline bool operator>=(const utime_t& a, const utime_t& b) +{ + return !(operator<(a, b)); +} + +inline bool operator==(const utime_t& a, const utime_t& b) +{ + return a.sec() == b.sec() && a.nsec() == b.nsec(); +} +inline bool operator!=(const utime_t& a, const utime_t& b) +{ + return a.sec() != b.sec() || a.nsec() != b.nsec(); +} + + +// output + +// ostream +inline std::ostream& operator<<(std::ostream& out, const utime_t& t) +{ + return t.localtime(out); +} + +inline std::string utimespan_str(const utime_t& age) { + auto age_ts = ceph::timespan(age.nsec()) + std::chrono::seconds(age.sec()); + return ceph::timespan_str(age_ts); +} + +#endif diff --git a/src/include/utime_fmt.h b/src/include/utime_fmt.h new file mode 100644 index 000000000..e7a98d209 --- /dev/null +++ b/src/include/utime_fmt.h @@ -0,0 +1,47 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +#pragma once +/** + * \file fmtlib formatter for utime_t + */ +#include <fmt/chrono.h> +#include <fmt/format.h> + +#include "include/utime.h" + 
/// fmtlib formatter for utime_t.
///
/// Format spec: "{}" prints ISO-8601-like local time with microseconds and
/// a numeric UTC offset; "{:s}" selects the short form (milliseconds, no
/// offset).  Values under ten years' worth of seconds are treated as
/// relative times and printed as raw "<sec>.<usec>" — the same threshold
/// utime_t's own stream output uses.
template <>
struct fmt::formatter<utime_t> {
  // consume an optional trailing 's' from the format spec
  template <typename ParseContext>
  constexpr auto parse(ParseContext& ctx)
  {
    auto it = ctx.begin();
    if (it != ctx.end() && *it == 's') {
      short_format = true;
      ++it;
    }
    return it;
  }

  template <typename FormatContext>
  auto format(const utime_t& utime, FormatContext& ctx)
  {
    if (utime.sec() < ((time_t)(60 * 60 * 24 * 365 * 10))) {
      // raw seconds. this looks like a relative time.
      return fmt::format_to(ctx.out(), "{}.{:06}", (long)utime.sec(),
			    utime.usec());
    }

    // this looks like an absolute time.
    //  conform to http://en.wikipedia.org/wiki/ISO_8601
    // (unless short_format is set)
    // NOTE(review): fmt::localtime converts in the local timezone, unlike
    // utime_t::gmtime() — presumably intentional; verify against callers.
    auto aslocal = fmt::localtime(utime.sec());
    if (short_format) {
      return fmt::format_to(ctx.out(), "{:%FT%T}.{:03}", aslocal,
			    utime.usec() / 1000);
    }
    return fmt::format_to(ctx.out(), "{:%FT%T}.{:06}{:%z}", aslocal,
			  utime.usec(), aslocal);
  }

  // set by parse() when the "{:s}" spec is used
  bool short_format{false};
};
#include "uuid.h"
#include "common/Formatter.h"

/// Emit the UUID in its canonical dashed string form under the key "uuid".
void uuid_d::dump(ceph::Formatter *f) const
{
  f->dump_stream("uuid") << to_string();
}

/// Instances for the encode/decode unit tests, exercising the four input
/// styles accepted by boost's string_generator (braced, plain, undashed,
/// and wide-string forms).
void uuid_d::generate_test_instances(std::list<uuid_d*>& o)
{
  // these are sourced from examples at
  // https://www.boost.org/doc/libs/1_62_0/libs/uuid/uuid.html#Synopsis_generators
  boost::uuids::string_generator gen;
  o.push_back(new uuid_d());
  o.back()->uuid = gen("{01234567-89ab-cdef-0123-456789abcdef}");
  o.push_back(new uuid_d());
  o.back()->uuid = gen(L"01234567-89ab-cdef-0123-456789abcdef");
  o.push_back(new uuid_d());
  o.back()->uuid = gen(std::string("0123456789abcdef0123456789abcdef"));
  o.push_back(new uuid_d());
  o.back()->uuid = gen(std::wstring(L"01234567-89ab-cdef-0123-456789abcdef"));
}
/// Thin value wrapper around a boost UUID.  Default-constructs to the nil
/// (all-zero) UUID.  The denc encoding is the raw 16 UUID bytes.
struct uuid_d {
  boost::uuids::uuid uuid;

  uuid_d() {
    boost::uuids::nil_generator gen;
    uuid = gen();
  }

  /// true for the nil (all-zero) UUID
  bool is_zero() const {
    return uuid.is_nil();
  }

  /// overwrite with a freshly generated random UUID
  void generate_random() {
    random_device_t rng;
    boost::uuids::basic_random_generator gen(rng);
    uuid = gen();
  }

  /// Parse any form boost's string_generator accepts (dashed, braced,
  /// undashed hex).  Returns false without modifying state guarantees on
  /// failure — NOTE(review): on a throw mid-parse, `uuid` is untouched
  /// since gen(s) throws before assignment.
  bool parse(const char *s) {
    try {
      boost::uuids::string_generator gen;
      uuid = gen(s);
      return true;
    } catch (std::runtime_error& e) {
      return false;
    }
  }
  /// Write the canonical 36-char dashed form plus NUL into s.
  /// Caller must supply a buffer of at least 37 bytes.
  void print(char *s) const {
    memcpy(s, boost::uuids::to_string(uuid).c_str(), 37);
  }

  std::string to_string() const {
    return boost::uuids::to_string(uuid);
  }

  /// raw 16-byte UUID, not NUL-terminated
  const char *bytes() const {
    return (const char*)uuid.data;
  }

  // denc support: the wire form is the raw 16 bytes of the UUID
  void encode(::ceph::buffer::list::contiguous_appender& p) const {
    p.append(reinterpret_cast<const char *>(&uuid), sizeof(uuid));
  }

  void bound_encode(size_t& p) const {
    p += sizeof(uuid);
  }

  // decodes by memcpy over the whole object; safe because uuid_d's only
  // member is the 16-byte boost uuid
  void decode(::ceph::buffer::ptr::const_iterator& p) {
    assert((p.get_end() - p.get_pos()) >= (int)sizeof(*this));
    memcpy((char *)this, p.get_pos_add(sizeof(*this)), sizeof(*this));
  }

  void dump(ceph::Formatter *f) const;
  static void generate_test_instances(std::list<uuid_d*>& o);
};
WRITE_CLASS_DENC_BOUNDED(uuid_d)

inline std::ostream& operator<<(std::ostream& out, const uuid_d& u) {
  char b[37];
  u.print(b);
  return out << b;
}

inline bool operator==(const uuid_d& l, const uuid_d& r) {
  return l.uuid == r.uuid;
}
inline bool operator!=(const uuid_d& l, const uuid_d& r) {
  return l.uuid != r.uuid;
}
// NOTE(review): ordering compares the string forms, not the raw bytes;
// for lowercase hex output these orders agree, but it allocates per compare.
inline bool operator<(const uuid_d& l, const uuid_d& r) {
  return l.to_string() < r.to_string();
}
uuid_d& r) { + return l.to_string() > r.to_string(); +} + +#if FMT_VERSION >= 90000 +template <> struct fmt::formatter<uuid_d> : fmt::ostream_formatter {}; +#endif + +#endif diff --git a/src/include/win32/arpa/inet.h b/src/include/win32/arpa/inet.h new file mode 100644 index 000000000..44983f03f --- /dev/null +++ b/src/include/win32/arpa/inet.h @@ -0,0 +1 @@ +#include "winsock_compat.h" diff --git a/src/include/win32/dlfcn.h b/src/include/win32/dlfcn.h new file mode 100644 index 000000000..32e51f16f --- /dev/null +++ b/src/include/win32/dlfcn.h @@ -0,0 +1 @@ +#include "../dlfcn_compat.h" diff --git a/src/include/win32/fs_compat.h b/src/include/win32/fs_compat.h new file mode 100644 index 000000000..deeedf071 --- /dev/null +++ b/src/include/win32/fs_compat.h @@ -0,0 +1,47 @@ +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2021 SUSE LINUX GmbH + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. + * + */ + +// Those definitions allow handling information coming from Ceph and should +// not be passed to Windows functions. 
+ +#pragma once + +#define S_IFLNK 0120000 + +#define S_ISTYPE(m, TYPE) ((m & S_IFMT) == TYPE) +#define S_ISLNK(m) S_ISTYPE(m, S_IFLNK) +#define S_ISUID 04000 +#define S_ISGID 02000 +#define S_ISVTX 01000 + +#define LOCK_SH 1 +#define LOCK_EX 2 +#define LOCK_NB 4 +#define LOCK_UN 8 +#define LOCK_MAND 32 +#define LOCK_READ 64 +#define LOCK_WRITE 128 +#define LOCK_RW 192 + +#define AT_SYMLINK_NOFOLLOW 0x100 +#define AT_REMOVEDIR 0x200 + +#define MAXSYMLINKS 65000 + +#define O_DIRECTORY 0200000 +#define O_NOFOLLOW 0400000 + +#define XATTR_CREATE 1 +#define XATTR_REPLACE 2 + +typedef unsigned int uid_t; +typedef unsigned int gid_t; diff --git a/src/include/win32/ifaddrs.h b/src/include/win32/ifaddrs.h new file mode 100644 index 000000000..45e1a362c --- /dev/null +++ b/src/include/win32/ifaddrs.h @@ -0,0 +1,39 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2002-2016 Free Software Foundation, Inc. + * Copyright (C) 2019 SUSE LINUX GmbH + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. 
+ * + */ + +#ifndef IFADDRS_H +#define IFADDRS_H + +#include "winsock_compat.h" +#include <ifdef.h> + +struct ifaddrs { + struct ifaddrs *ifa_next; /* Next item in list */ + char *ifa_name; /* Name of interface */ + unsigned int ifa_flags; /* Flags from SIOCGIFFLAGS */ + struct sockaddr *ifa_addr; /* Address of interface */ + struct sockaddr *ifa_netmask; /* Netmask of interface */ + + struct sockaddr_storage in_addrs; + struct sockaddr_storage in_netmasks; + + char ad_name[IF_MAX_STRING_SIZE]; + size_t speed; +}; + +int getifaddrs(struct ifaddrs **ifap); +void freeifaddrs(struct ifaddrs *ifa); + +#endif diff --git a/src/include/win32/netdb.h b/src/include/win32/netdb.h new file mode 100644 index 000000000..44983f03f --- /dev/null +++ b/src/include/win32/netdb.h @@ -0,0 +1 @@ +#include "winsock_compat.h" diff --git a/src/include/win32/netinet/in.h b/src/include/win32/netinet/in.h new file mode 100644 index 000000000..44983f03f --- /dev/null +++ b/src/include/win32/netinet/in.h @@ -0,0 +1 @@ +#include "winsock_compat.h" diff --git a/src/include/win32/netinet/ip.h b/src/include/win32/netinet/ip.h new file mode 100644 index 000000000..e69de29bb --- /dev/null +++ b/src/include/win32/netinet/ip.h diff --git a/src/include/win32/netinet/tcp.h b/src/include/win32/netinet/tcp.h new file mode 100644 index 000000000..e69de29bb --- /dev/null +++ b/src/include/win32/netinet/tcp.h diff --git a/src/include/win32/poll.h b/src/include/win32/poll.h new file mode 100644 index 000000000..44983f03f --- /dev/null +++ b/src/include/win32/poll.h @@ -0,0 +1 @@ +#include "winsock_compat.h" diff --git a/src/include/win32/sys/errno.h b/src/include/win32/sys/errno.h new file mode 100644 index 000000000..339f4fc10 --- /dev/null +++ b/src/include/win32/sys/errno.h @@ -0,0 +1 @@ +#include <errno.h> diff --git a/src/include/win32/sys/select.h b/src/include/win32/sys/select.h new file mode 100644 index 000000000..e69de29bb --- /dev/null +++ b/src/include/win32/sys/select.h diff --git 
a/src/include/win32/sys/socket.h b/src/include/win32/sys/socket.h new file mode 100644 index 000000000..44983f03f --- /dev/null +++ b/src/include/win32/sys/socket.h @@ -0,0 +1 @@ +#include "winsock_compat.h" diff --git a/src/include/win32/sys/statvfs.h b/src/include/win32/sys/statvfs.h new file mode 100644 index 000000000..73a892b88 --- /dev/null +++ b/src/include/win32/sys/statvfs.h @@ -0,0 +1,36 @@ +#ifndef _SYS_STATVFS_H +#define _SYS_STATVFS_H 1 + +typedef unsigned __int64 fsfilcnt64_t; +typedef unsigned __int64 fsblkcnt64_t; +typedef unsigned __int64 fsblkcnt_t; + +struct statvfs +{ + unsigned long int f_bsize; + unsigned long int f_frsize; + fsblkcnt64_t f_blocks; + fsblkcnt64_t f_bfree; + fsblkcnt64_t f_bavail; + fsfilcnt64_t f_files; + fsfilcnt64_t f_ffree; + fsfilcnt64_t f_favail; + unsigned long int f_fsid; + unsigned long int f_flag; + unsigned long int f_namemax; + int __f_spare[6]; +}; +struct flock { + short l_type; + short l_whence; + off_t l_start; + off_t l_len; + pid_t l_pid; +}; + +#define F_RDLCK 0 +#define F_WRLCK 1 +#define F_UNLCK 2 +#define F_SETLK 6 + +#endif /* _SYS_STATVFS_H */ diff --git a/src/include/win32/sys/uio.h b/src/include/win32/sys/uio.h new file mode 100644 index 000000000..15e95be7f --- /dev/null +++ b/src/include/win32/sys/uio.h @@ -0,0 +1 @@ +#include "include/compat.h" diff --git a/src/include/win32/sys/un.h b/src/include/win32/sys/un.h new file mode 100644 index 000000000..d08940b2c --- /dev/null +++ b/src/include/win32/sys/un.h @@ -0,0 +1 @@ +#include "include/win32/winsock_compat.h" diff --git a/src/include/win32/syslog.h b/src/include/win32/syslog.h new file mode 100644 index 000000000..28389e0b9 --- /dev/null +++ b/src/include/win32/syslog.h @@ -0,0 +1,64 @@ +/* + * Copyright 2013, 2015 Cloudbase Solutions Srl + * + * Licensed under the Apache License, Version 2.0 (the "License"); you may + * not use this file except in compliance with the License.You may obtain + * a copy of the License at + * + * 
http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.See the + * License for the specific language governing permissions and limitations + * under the License. + */ + +#ifndef SYSLOG_H +#define SYSLOG_H 1 + +#define LOG_EMERG 0 /* system is unusable */ +#define LOG_ALERT 1 /* action must be taken immediately */ +#define LOG_CRIT 2 /* critical conditions */ +#define LOG_ERR 3 /* error conditions */ +#define LOG_WARNING 4 /* warning conditions */ +#define LOG_NOTICE 5 /* normal but significant condition */ +#define LOG_INFO 6 /* informational */ +#define LOG_DEBUG 7 /* debug-level messages */ + +#define LOG_KERN (0<<3) /* kernel messages */ +#define LOG_USER (1<<3) /* user-level messages */ +#define LOG_MAIL (2<<3) /* mail system */ +#define LOG_DAEMON (3<<3) /* system daemons */ +#define LOG_AUTH (4<<3) /* security/authorization messages */ +#define LOG_SYSLOG (5<<3) /* messages generated internally by syslogd */ +#define LOG_LPR (6<<3) /* line printer subsystem */ +#define LOG_NEWS (7<<3) /* network news subsystem */ +#define LOG_UUCP (8<<3) /* UUCP subsystem */ +#define LOG_CRON (9<<3) /* clock daemon */ +#define LOG_AUTHPRIV (10<<3) /* security/authorization messages */ +#define LOG_FTP (11<<3) /* FTP daemon */ + +#define LOG_LOCAL0 (16<<3) /* reserved for local use */ +#define LOG_LOCAL1 (17<<3) /* reserved for local use */ +#define LOG_LOCAL2 (18<<3) /* reserved for local use */ +#define LOG_LOCAL3 (19<<3) /* reserved for local use */ +#define LOG_LOCAL4 (20<<3) /* reserved for local use */ +#define LOG_LOCAL5 (21<<3) /* reserved for local use */ +#define LOG_LOCAL6 (22<<3) /* reserved for local use */ +#define LOG_LOCAL7 (23<<3) /* reserved for local use */ + +#define LOG_PRIMASK 0x07 /* mask to extract priority part (internal) */ + /* extract priority 
*/ +#define LOG_PRI(p) ((p) & LOG_PRIMASK) + + +static inline void +openlog(const char *ident, int option, int facility) +{ +} + +void +syslog(int priority, const char *format, ...); + +#endif /* syslog.h */ diff --git a/src/include/win32/win32_errno.h b/src/include/win32/win32_errno.h new file mode 100644 index 000000000..dd8ff8474 --- /dev/null +++ b/src/include/win32/win32_errno.h @@ -0,0 +1,146 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2020 SUSE LINUX GmbH + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. + * + */ + +// We're going to preserve the error numbers defined by the Windows SDK but not +// by Mingw headers. For others, we're going to use numbers greater than 256 to +// avoid unintended overlaps. + +#ifndef WIN32_ERRNO_H +#define WIN32_ERRNO_H 1 + +#include <errno.h> + +#include "include/int_types.h" + +#ifndef EBADMSG +#define EBADMSG 104 +#endif + +#ifndef ENODATA +#define ENODATA 120 +#endif + +#ifndef ENOLINK +#define ENOLINK 121 +#endif + +#ifndef ENOMSG +#define ENOMSG 122 +#endif + +#ifndef ENOTRECOVERABLE +#define ENOTRECOVERABLE 127 +#endif + +#ifndef ETIME +#define ETIME 137 +#endif + +#ifndef ETXTBSY +#define ETXTBSY 139 +#endif + +#ifndef ENODATA +#define ENODATA 120 +#endif + +#define ESTALE 256 +#define EREMOTEIO 257 + +#ifndef EBADE +#define EBADE 258 +#endif + +#define EUCLEAN 259 +#define EREMCHG 260 +#define EKEYREJECTED 261 +#define EREMOTE 262 + +// Not used at moment. Full coverage ensures that remote errors will be +// converted and handled properly. 
+#define EADV 263 +#define EBADFD 264 +#define EBADR 265 +#define EBADRQC 266 +#define EBADSLT 267 +#define EBFONT 268 +#define ECHRNG 269 +#define ECOMM 270 +#define EDOTDOT 271 +#define EHOSTDOWN 272 +#define EHWPOISON 273 +// Defined by Boost. +#ifndef EIDRM +#define EIDRM 274 +#endif +#define EISNAM 275 +#define EKEYEXPIRED 276 +#define EKEYREVOKED 277 +#define EL2HLT 278 +#define EL2NSYNC 279 +#define EL3HLT 280 +#define EL3RST 281 +#define ELIBACC 282 +#define ELIBBAD 283 +#define ELIBEXEC 284 +#define ELIBMAX 285 +#define ELIBSCN 286 +#define ELNRNG 287 +#define EMEDIUMTYPE 288 +#define EMULTIHOP 289 +#define ENAVAIL 290 +#define ENOANO 291 +#define ENOCSI 292 +#define ENOKEY 293 +#define ENOMEDIUM 294 +#define ENONET 295 +#define ENOPKG 296 +#ifndef ENOSR +#define ENOSR 297 +#endif +#ifndef ENOSTR +#define ENOSTR 298 +#endif +#define ENOTNAM 299 +#define ENOTUNIQ 300 +#define EPFNOSUPPORT 301 +#define ERFKILL 302 +#define ESOCKTNOSUPPORT 303 +#define ESRMNT 304 +#define ESTRPIPE 305 +#define ETOOMANYREFS 306 +#define EUNATCH 307 +#define EUSERS 308 +#define EXFULL 309 +#define ENOTBLK 310 + +#ifndef EDQUOT +#define EDQUOT 311 +#endif + +#define ESHUTDOWN 312 + +#ifdef __cplusplus +extern "C" { +#endif + +__s32 wsae_to_errno(__s32 r); +__u32 errno_to_ntstatus(__s32 r); +__u32 cephfs_errno_to_ntstatus_map(int cephfs_errno); + +#ifdef __cplusplus +} +#endif + +#endif // WIN32_ERRNO_H diff --git a/src/include/win32/winsock_compat.h b/src/include/win32/winsock_compat.h new file mode 100644 index 000000000..990cc4823 --- /dev/null +++ b/src/include/win32/winsock_compat.h @@ -0,0 +1,39 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (c) 2019 SUSE LLC + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * 
Foundation. See file COPYING. + * + */ + +#ifndef WINSOCK_COMPAT_H +#define WINSOCK_COMPAT_H 1 + +#include "winsock_wrapper.h" + +#ifndef poll +#define poll WSAPoll +#endif + +// afunix.h is available starting with Windows SDK 17063. Still, it wasn't +// picked up by mingw yet, for which reason we're going to define sockaddr_un +// here. +#ifndef _AFUNIX_ +#define UNIX_PATH_MAX 108 + +typedef struct sockaddr_un +{ + ADDRESS_FAMILY sun_family; /* AF_UNIX */ + char sun_path[UNIX_PATH_MAX]; /* pathname */ +} SOCKADDR_UN, *PSOCKADDR_UN; + +#define SIO_AF_UNIX_GETPEERPID _WSAIOR(IOC_VENDOR, 256) +#endif /* _AFUNIX */ + +#endif /* WINSOCK_COMPAT_H */ diff --git a/src/include/win32/winsock_wrapper.h b/src/include/win32/winsock_wrapper.h new file mode 100644 index 000000000..1bb951a9d --- /dev/null +++ b/src/include/win32/winsock_wrapper.h @@ -0,0 +1,27 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (c) 2020 SUSE LLC + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. + * + */ + +#ifndef WINSOCK_WRAPPER_H +#define WINSOCK_WRAPPER_H 1 + +#ifdef __cplusplus +// Boost complains if winsock2.h (or windows.h) is included before asio.hpp. 
+#include <boost/asio.hpp> +#endif + +#include <winsock2.h> +#include <ws2ipdef.h> +#include <ws2tcpip.h> + +#endif /* WINSOCK_WRAPPER_H */ diff --git a/src/include/xlist.h b/src/include/xlist.h new file mode 100644 index 000000000..76d0ddccd --- /dev/null +++ b/src/include/xlist.h @@ -0,0 +1,237 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net> + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. + * + */ + +#ifndef CEPH_XLIST_H +#define CEPH_XLIST_H + +#include <iterator> +#include <cstdlib> +#include <ostream> + +#include "include/ceph_assert.h" + +template<typename T> +class xlist { +public: + class item { + public: + item(T i) : _item(i) {} + ~item() { + ceph_assert(!is_on_list()); + } + + item(const item& other) = delete; + item(item&& other) = delete; + const item& operator= (const item& right) = delete; + item& operator= (item&& right) = delete; + + xlist* get_list() { return _list; } + bool is_on_list() const { return _list ? 
true:false; } + bool remove_myself() { + if (_list) { + _list->remove(this); + ceph_assert(_list == 0); + return true; + } else + return false; + } + void move_to_front() { + ceph_assert(_list); + _list->push_front(this); + } + void move_to_back() { + ceph_assert(_list); + _list->push_back(this); + } + + private: + friend xlist; + T _item; + item *_prev = nullptr, *_next = nullptr; + xlist *_list = nullptr; + }; + + typedef item* value_type; + typedef item* const_reference; + +private: + item *_front, *_back; + size_t _size; + +public: + xlist(const xlist& other) { + _front = other._front; + _back = other._back; + _size = other._size; + } + + xlist() : _front(0), _back(0), _size(0) {} + ~xlist() { + ceph_assert(_size == 0); + ceph_assert(_front == 0); + ceph_assert(_back == 0); + } + + size_t size() const { + ceph_assert((bool)_front == (bool)_size); + return _size; + } + bool empty() const { + ceph_assert((bool)_front == (bool)_size); + return _front == 0; + } + + void clear() { + while (_front) + remove(_front); + ceph_assert((bool)_front == (bool)_size); + } + + void push_front(item *i) { + if (i->_list) + i->_list->remove(i); + + i->_list = this; + i->_next = _front; + i->_prev = 0; + if (_front) + _front->_prev = i; + else + _back = i; + _front = i; + _size++; + } + void push_back(item *i) { + if (i->_list) + i->_list->remove(i); + + i->_list = this; + i->_next = 0; + i->_prev = _back; + if (_back) + _back->_next = i; + else + _front = i; + _back = i; + _size++; + } + void remove(item *i) { + ceph_assert(i->_list == this); + + if (i->_prev) + i->_prev->_next = i->_next; + else + _front = i->_next; + if (i->_next) + i->_next->_prev = i->_prev; + else + _back = i->_prev; + _size--; + + i->_list = 0; + i->_next = i->_prev = 0; + ceph_assert((bool)_front == (bool)_size); + } + + T front() { return static_cast<T>(_front->_item); } + const T front() const { return static_cast<const T>(_front->_item); } + + T back() { return static_cast<T>(_back->_item); } + const T 
back() const { return static_cast<const T>(_back->_item); } + + void pop_front() { + ceph_assert(!empty()); + remove(_front); + } + void pop_back() { + ceph_assert(!empty()); + remove(_back); + } + + class iterator { + private: + item *cur; + public: + using iterator_category = std::forward_iterator_tag; + using value_type = T; + using difference_type = std::ptrdiff_t; + using pointer = T*; + using reference = T&; + iterator(item *i = 0) : cur(i) {} + T operator*() { return static_cast<T>(cur->_item); } + iterator& operator++() { + ceph_assert(cur); + ceph_assert(cur->_list); + cur = cur->_next; + return *this; + } + bool end() const { return cur == 0; } + friend bool operator==(const iterator& lhs, const iterator& rhs) { + return lhs.cur == rhs.cur; + } + friend bool operator!=(const iterator& lhs, const iterator& rhs) { + return lhs.cur != rhs.cur; + } + }; + + iterator begin() { return iterator(_front); } + iterator end() { return iterator(NULL); } + + class const_iterator { + private: + item *cur; + public: + using iterator_category = std::forward_iterator_tag; + using value_type = T; + using difference_type = std::ptrdiff_t; + using pointer = const T*; + using reference = const T&; + + const_iterator(item *i = 0) : cur(i) {} + const T operator*() { return static_cast<const T>(cur->_item); } + const_iterator& operator++() { + ceph_assert(cur); + ceph_assert(cur->_list); + cur = cur->_next; + return *this; + } + bool end() const { return cur == 0; } + friend bool operator==(const const_iterator& lhs, + const const_iterator& rhs) { + return lhs.cur == rhs.cur; + } + friend bool operator!=(const const_iterator& lhs, + const const_iterator& rhs) { + return lhs.cur != rhs.cur; + } + }; + + const_iterator begin() const { return const_iterator(_front); } + const_iterator end() const { return const_iterator(NULL); } + + friend std::ostream &operator<<(std::ostream &oss, const xlist<T> &list) { + bool first = true; + for (const auto &item : list) { + if (!first) { + oss 
<< ", "; + } + oss << *item; /* item should be a pointer */ + first = false; + } + return oss; + } +}; + + +#endif |