diff options
Diffstat (limited to 'src/include')
100 files changed, 32336 insertions, 0 deletions
# Public headers installed alongside librados. The buffer/crc helpers live in
# this directory but are shipped under <includedir>/rados as well.
set(rados_public_headers
  rados/librados.h
  rados/rados_types.h
  rados/rados_types.hpp
  rados/librados_fwd.hpp
  rados/librados.hpp
  buffer.h
  buffer_fwd.h
  inline_memory.h
  page.h
  crc32c.h
  rados/objclass.h)
install(FILES ${rados_public_headers}
  DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}/rados)

# libradosstriper headers, only when the striper library is built.
if(WITH_LIBRADOSSTRIPER)
  set(radosstriper_public_headers
    radosstriper/libradosstriper.h
    radosstriper/libradosstriper.hpp)
  install(FILES ${radosstriper_public_headers}
    DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}/radosstriper)
endif()

# librbd headers, only when RBD is built.
if(WITH_RBD)
  set(rbd_public_headers
    rbd/features.h
    rbd/librbd.h
    rbd/librbd.hpp)
  install(FILES ${rbd_public_headers}
    DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}/rbd)
endif()

# RGW headers are installed into the rados include directory.
if(WITH_RADOSGW)
  set(rgw_public_headers
    rados/librgw.h
    rados/rgw_file.h
    rgw/librgw_admin_user.h)
  install(FILES ${rgw_public_headers}
    DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}/rados)
endif()
+ * + */ + +#ifndef CEPH_COMPATSET_H +#define CEPH_COMPATSET_H + +#include <iostream> +#include <map> +#include <string> + +#include "include/buffer.h" +#include "include/encoding.h" +#include "include/types.h" +#include "common/Formatter.h" + +struct CompatSet { + + struct Feature { + uint64_t id; + std::string name; + + Feature(uint64_t _id, const std::string& _name) : id(_id), name(_name) {} + }; + + class FeatureSet { + uint64_t mask; + std::map<uint64_t, std::string> names; + + public: + friend struct CompatSet; + friend class CephCompatSet_AllSet_Test; + friend class CephCompatSet_other_Test; + friend class CephCompatSet_merge_Test; + friend std::ostream& operator<<(std::ostream& out, const CompatSet::FeatureSet& fs); + friend std::ostream& operator<<(std::ostream& out, const CompatSet& compat); + FeatureSet() : mask(1), names() {} + void insert(const Feature& f) { + ceph_assert(f.id > 0); + ceph_assert(f.id < 64); + mask |= ((uint64_t)1<<f.id); + names[f.id] = f.name; + } + + bool contains(const Feature& f) const { + return names.count(f.id); + } + bool contains(uint64_t f) const { + return names.count(f); + } + /** + * Getter instead of using name[] to be const safe + */ + std::string get_name(uint64_t const f) const { + std::map<uint64_t, std::string>::const_iterator i = names.find(f); + ceph_assert(i != names.end()); + return i->second; + } + + void remove(uint64_t f) { + if (names.count(f)) { + names.erase(f); + mask &= ~((uint64_t)1<<f); + } + } + void remove(const Feature& f) { + remove(f.id); + } + + void encode(bufferlist& bl) const { + using ceph::encode; + /* See below, mask always has the lowest bit set in memory, but + * unset in the encoding */ + encode(mask & (~(uint64_t)1), bl); + encode(names, bl); + } + + void decode(bufferlist::const_iterator& bl) { + using ceph::decode; + decode(mask, bl); + decode(names, bl); + /** + * Previously, there was a bug where insert did + * mask |= f.id rather than mask |= (1 << f.id). 
+ * In FeatureSets from those version, mask always + * has the lowest bit set. Since then, masks always + * have the lowest bit unset. + * + * When we encounter such a FeatureSet, we have to + * reconstruct the mask from the names map. + */ + if (mask & 1) { + mask = 1; + std::map<uint64_t, std::string> temp_names; + temp_names.swap(names); + for (auto i = temp_names.begin(); i != temp_names.end(); ++i) { + insert(Feature(i->first, i->second)); + } + } else { + mask |= 1; + } + } + + void dump(Formatter *f) const { + for (auto p = names.cbegin(); p != names.cend(); ++p) { + char s[18]; + snprintf(s, sizeof(s), "feature_%llu", (unsigned long long)p->first); + f->dump_string(s, p->second); + } + } + }; + + // These features have no impact on the read / write status + FeatureSet compat; + // If any of these features are missing, read is possible ( as long + // as no incompat feature is missing ) but it is not possible to write + FeatureSet ro_compat; + // If any of these features are missing, read or write is not possible + FeatureSet incompat; + + CompatSet(FeatureSet& _compat, FeatureSet& _ro_compat, FeatureSet& _incompat) : + compat(_compat), ro_compat(_ro_compat), incompat(_incompat) {} + + CompatSet() : compat(), ro_compat(), incompat() { } + + + /* does this filesystem implementation have the + features required to read the other? */ + bool readable(CompatSet const& other) const { + return !((other.incompat.mask ^ incompat.mask) & other.incompat.mask); + } + + /* does this filesystem implementation have the + features required to write the other? */ + bool writeable(CompatSet const& other) const { + return readable(other) && + !((other.ro_compat.mask ^ ro_compat.mask) & other.ro_compat.mask); + } + + /* Compare this CompatSet to another. + * CAREFULLY NOTE: This operation is NOT commutative. + * a > b DOES NOT imply that b < a. + * If returns: + * 0: The CompatSets have the same feature set. + * 1: This CompatSet's features are a strict superset of the other's. 
+ * -1: This CompatSet is missing at least one feature + * described in the other. It may still have more features, though. + */ + int compare(const CompatSet& other) { + if ((other.compat.mask == compat.mask) && + (other.ro_compat.mask == ro_compat.mask) && + (other.incompat.mask == incompat.mask)) return 0; + //okay, they're not the same + + //if we're writeable we have a superset of theirs on incompat and ro_compat + if (writeable(other) && !((other.compat.mask ^ compat.mask) + & other.compat.mask)) return 1; + //if we make it here, we weren't writeable or had a difference compat set + return -1; + } + + /* Get the features supported by other CompatSet but not this one, + * as a CompatSet. + */ + CompatSet unsupported(CompatSet& other) { + CompatSet diff; + uint64_t other_compat = + ((other.compat.mask ^ compat.mask) & other.compat.mask); + uint64_t other_ro_compat = + ((other.ro_compat.mask ^ ro_compat.mask) & other.ro_compat.mask); + uint64_t other_incompat = + ((other.incompat.mask ^ incompat.mask) & other.incompat.mask); + for (int id = 1; id < 64; ++id) { + uint64_t mask = (uint64_t)1 << id; + if (mask & other_compat) { + diff.compat.insert( Feature(id, other.compat.names[id])); + } + if (mask & other_ro_compat) { + diff.ro_compat.insert(Feature(id, other.ro_compat.names[id])); + } + if (mask & other_incompat) { + diff.incompat.insert( Feature(id, other.incompat.names[id])); + } + } + return diff; + } + + /* Merge features supported by other CompatSet into this one. 
+ * Return: true if some features were merged + */ + bool merge(CompatSet const & other) { + uint64_t other_compat = + ((other.compat.mask ^ compat.mask) & other.compat.mask); + uint64_t other_ro_compat = + ((other.ro_compat.mask ^ ro_compat.mask) & other.ro_compat.mask); + uint64_t other_incompat = + ((other.incompat.mask ^ incompat.mask) & other.incompat.mask); + if (!other_compat && !other_ro_compat && !other_incompat) + return false; + for (int id = 1; id < 64; ++id) { + uint64_t mask = (uint64_t)1 << id; + if (mask & other_compat) { + compat.insert( Feature(id, other.compat.get_name(id))); + } + if (mask & other_ro_compat) { + ro_compat.insert(Feature(id, other.ro_compat.get_name(id))); + } + if (mask & other_incompat) { + incompat.insert( Feature(id, other.incompat.get_name(id))); + } + } + return true; + } + + void encode(bufferlist& bl) const { + compat.encode(bl); + ro_compat.encode(bl); + incompat.encode(bl); + } + + void decode(bufferlist::const_iterator& bl) { + compat.decode(bl); + ro_compat.decode(bl); + incompat.decode(bl); + } + + void dump(Formatter *f) const { + f->open_object_section("compat"); + compat.dump(f); + f->close_section(); + f->open_object_section("ro_compat"); + ro_compat.dump(f); + f->close_section(); + f->open_object_section("incompat"); + incompat.dump(f); + f->close_section(); + } + + static void generate_test_instances(std::list<CompatSet*>& o) { + o.push_back(new CompatSet); + o.push_back(new CompatSet); + o.back()->compat.insert(Feature(1, "one")); + o.back()->compat.insert(Feature(2, "two")); + o.back()->ro_compat.insert(Feature(4, "four")); + o.back()->incompat.insert(Feature(3, "three")); + } +}; +WRITE_CLASS_ENCODER(CompatSet) + +using ceph::operator <<; +inline std::ostream& operator<<(std::ostream& out, const CompatSet::FeatureSet& fs) +{ + return out << fs.names; +} + +inline std::ostream& operator<<(std::ostream& out, const CompatSet& compat) +{ + return out << "compat=" << compat.compat + << ",rocompat=" << 
/*
 * GenContext - abstract callback class taking an argument of type T
 *
 * Subclasses implement finish(); callers invoke complete(), which runs
 * finish() and then deletes the object. Not copyable.
 */
template <typename T>
class GenContext {
  GenContext(const GenContext& other) = delete;
  const GenContext& operator=(const GenContext& other) = delete;

 protected:
  virtual void finish(T t) = 0;

 public:
  GenContext() {}
  virtual ~GenContext() {}       // we want a virtual destructor!!!

  template <typename C>
  void complete(C &&t) {
    finish(std::forward<C>(t));
    delete this;
  }
};

template <typename T>
using GenContextURef = std::unique_ptr<GenContext<T> >;

/*
 * Context - abstract callback class
 *
 * complete(r) runs finish(r) and then deletes the object, so a Context
 * must be heap allocated and must not be touched after completion.
 * Not copyable.
 */
class Finisher;
class Context {
  Context(const Context& other) = delete;
  const Context& operator=(const Context& other) = delete;

 protected:
  virtual void finish(int r) = 0;

  // variant of finish that is safe to call "synchronously."  override should
  // return true.
  virtual bool sync_finish(int r) {
    return false;
  }

 public:
  Context() {}
  virtual ~Context() {}       // we want a virtual destructor!!!
  virtual void complete(int r) {
    finish(r);
    delete this;
  }
  // Returns true (and deletes this) only if sync_finish() handled it.
  virtual bool sync_complete(int r) {
    if (sync_finish(r)) {
      delete this;
      return true;
    }
    return false;
  }
};

/**
 * Simple context holding a single object
 *
 * finish() does nothing; the held object is released when the context is
 * deleted by complete().
 */
template<class T>
class ContainerContext : public Context {
  T obj;
public:
  // Accept lvalues and rvalues alike. The original constructor took `T&`,
  // so make_container_context() could not compile when given a temporary.
  template <typename U,
	    typename = std::enable_if_t<std::is_constructible_v<T, U&&>>>
  ContainerContext(U &&o) : obj(std::forward<U>(o)) {}
  void finish(int r) override {}
};
template <typename T>
ContainerContext<T> *make_container_context(T &&t) {
  return new ContainerContext<T>(std::forward<T>(t));
}

/*
 * Wrapper - carries `val` along with another Context and completes that
 * Context (if any) with the same result.
 */
template <class T>
struct Wrapper : public Context {
  Context *to_run;
  T val;
  Wrapper(Context *to_run, T val) : to_run(to_run), val(val) {}
  void finish(int r) override {
    if (to_run)
      to_run->complete(r);
  }
};

/*
 * RunOnDelete - completes `to_run` with 0 when destroyed.
 * Normally held through RunOnDeleteRef; copying one by value would
 * complete the same Context twice.
 */
struct RunOnDelete {
  Context *to_run;
  RunOnDelete(Context *to_run) : to_run(to_run) {}
  ~RunOnDelete() {
    if (to_run)
      to_run->complete(0);
  }
};
typedef std::shared_ptr<RunOnDelete> RunOnDeleteRef;

// Context that invokes a nullary callable; the result code is ignored.
template <typename T>
struct LambdaContext : public Context {
  T t;
  LambdaContext(T &&t) : t(std::forward<T>(t)) {}
  void finish(int) override {
    t();
  }
};
// Takes ownership of the callable; pass it as an rvalue.
template <typename T>
LambdaContext<T> *make_lambda_context(T &&t) {
  return new LambdaContext<T>(std::move(t));
}

// GenContext that forwards its argument to a callable.
template <typename F, typename T>
struct LambdaGenContext : GenContext<T> {
  F f;
  LambdaGenContext(F &&f) : f(std::forward<F>(f)) {}
  void finish(T t) override {
    f(std::forward<T>(t));
  }
};
// Takes ownership of the callable; pass it as an rvalue.
template <typename T, typename F>
GenContextURef<T> make_gen_lambda_context(F &&f) {
  return GenContextURef<T>(new LambdaGenContext<F, T>(std::move(f)));
}
if (cct) + mydout(cct,10) << "---- " << c << dendl; + c->complete(result); + } +} + +class C_NoopContext : public Context { +public: + void finish(int r) override { } +}; + + +struct C_Lock : public Context { + Mutex *lock; + Context *fin; + C_Lock(Mutex *l, Context *c) : lock(l), fin(c) {} + ~C_Lock() override { + delete fin; + } + void finish(int r) override { + if (fin) { + lock->Lock(); + fin->complete(r); + fin = NULL; + lock->Unlock(); + } + } +}; + +/* + * C_Contexts - set of Contexts + * + * ContextType must be an ancestor class of ContextInstanceType, or the same class. + * ContextInstanceType must be default-constructable. + */ +template <class ContextType, class ContextInstanceType, class Container = std::list<ContextType *>> +class C_ContextsBase : public ContextInstanceType { +public: + CephContext *cct; + Container contexts; + + C_ContextsBase(CephContext *cct_) + : cct(cct_) + { + } + ~C_ContextsBase() override { + for (auto c : contexts) { + delete c; + } + } + void add(ContextType* c) { + contexts.push_back(c); + } + void take(Container& ls) { + Container c; + c.swap(ls); + if constexpr (std::is_same_v<Container, std::list<ContextType *>>) { + contexts.splice(contexts.end(), c); + } else { + contexts.insert(contexts.end(), c.begin(), c.end()); + } + } + void complete(int r) override { + // Neuter any ContextInstanceType custom complete(), because although + // I want to look like it, I don't actually want to run its code. 
+ Context::complete(r); + } + void finish(int r) override { + finish_contexts(cct, contexts, r); + } + bool empty() { return contexts.empty(); } + + template<class C> + static ContextType *list_to_context(C& cs) { + if (cs.size() == 0) { + return 0; + } else if (cs.size() == 1) { + ContextType *c = cs.front(); + cs.clear(); + return c; + } else { + C_ContextsBase<ContextType, ContextInstanceType> *c(new C_ContextsBase<ContextType, ContextInstanceType>(0)); + c->take(cs); + return c; + } + } +}; + +typedef C_ContextsBase<Context, Context> C_Contexts; + +/* + * C_Gather + * + * ContextType must be an ancestor class of ContextInstanceType, or the same class. + * ContextInstanceType must be default-constructable. + * + * BUG:? only reports error from last sub to have an error return + */ +template <class ContextType, class ContextInstanceType> +class C_GatherBase { +private: + CephContext *cct; + int result; + ContextType *onfinish; +#ifdef DEBUG_GATHER + std::set<ContextType*> waitfor; +#endif + int sub_created_count; + int sub_existing_count; + mutable Mutex lock; + bool activated; + + void sub_finish(ContextType* sub, int r) { + lock.Lock(); +#ifdef DEBUG_GATHER + ceph_assert(waitfor.count(sub)); + waitfor.erase(sub); +#endif + --sub_existing_count; + mydout(cct,10) << "C_GatherBase " << this << ".sub_finish(r=" << r << ") " << sub +#ifdef DEBUG_GATHER + << " (remaining " << waitfor << ")" +#endif + << dendl; + if (r < 0 && result == 0) + result = r; + if ((activated == false) || (sub_existing_count != 0)) { + lock.Unlock(); + return; + } + lock.Unlock(); + delete_me(); + } + + void delete_me() { + if (onfinish) { + onfinish->complete(result); + onfinish = 0; + } + delete this; + } + + class C_GatherSub : public ContextInstanceType { + C_GatherBase *gather; + public: + C_GatherSub(C_GatherBase *g) : gather(g) {} + void complete(int r) override { + // Cancel any customized complete() functionality + // from the Context subclass we're templated for, + // we only want 
to hit that in onfinish, not at each + // sub finish. e.g. MDSInternalContext. + Context::complete(r); + } + void finish(int r) override { + gather->sub_finish(this, r); + gather = 0; + } + ~C_GatherSub() override { + if (gather) + gather->sub_finish(this, 0); + } + }; + +public: + C_GatherBase(CephContext *cct_, ContextType *onfinish_) + : cct(cct_), result(0), onfinish(onfinish_), + sub_created_count(0), sub_existing_count(0), + lock("C_GatherBase::lock", true, false), //disable lockdep + activated(false) + { + mydout(cct,10) << "C_GatherBase " << this << ".new" << dendl; + } + ~C_GatherBase() { + mydout(cct,10) << "C_GatherBase " << this << ".delete" << dendl; + } + void set_finisher(ContextType *onfinish_) { + Mutex::Locker l(lock); + ceph_assert(!onfinish); + onfinish = onfinish_; + } + void activate() { + lock.Lock(); + ceph_assert(activated == false); + activated = true; + if (sub_existing_count != 0) { + lock.Unlock(); + return; + } + lock.Unlock(); + delete_me(); + } + ContextType *new_sub() { + Mutex::Locker l(lock); + ceph_assert(activated == false); + sub_created_count++; + sub_existing_count++; + ContextType *s = new C_GatherSub(this); +#ifdef DEBUG_GATHER + waitfor.insert(s); +#endif + mydout(cct,10) << "C_GatherBase " << this << ".new_sub is " << sub_created_count << " " << s << dendl; + return s; + } + + inline int get_sub_existing_count() const { + Mutex::Locker l(lock); + return sub_existing_count; + } + + inline int get_sub_created_count() const { + Mutex::Locker l(lock); + return sub_created_count; + } +}; + +/* + * The C_GatherBuilder remembers each C_Context created by + * C_GatherBuilder.new_sub() in a C_Gather. When a C_Context created + * by new_sub() is complete(), C_Gather forgets about it. When + * C_GatherBuilder notices that there are no C_Context left in + * C_Gather, it calls complete() on the C_Context provided as the + * second argument of the constructor (finisher). + * + * How to use C_GatherBuilder: + * + * 1. 
Create a C_GatherBuilder on the stack + * 2. Call gather_bld.new_sub() as many times as you want to create new subs + * It is safe to call this 0 times, or 100, or anything in between. + * 3. If you didn't supply a finisher in the C_GatherBuilder constructor, + * set one with gather_bld.set_finisher(my_finisher) + * 4. Call gather_bld.activate() + * + * Example: + * + * C_SaferCond all_done; + * C_GatherBuilder gb(g_ceph_context, all_done); + * j.submit_entry(1, first, 0, gb.new_sub()); // add a C_Context to C_Gather + * j.submit_entry(2, first, 0, gb.new_sub()); // add a C_Context to C_Gather + * gb.activate(); // consume C_Context as soon as they complete() + * all_done.wait(); // all_done is complete() after all new_sub() are complete() + * + * The finisher may be called at any point after step 4, including immediately + * from the activate() function. + * The finisher will never be called before activate(). + * + * Note: Currently, subs must be manually freed by the caller (for some reason.) + */ +template <class ContextType, class GatherType> +class C_GatherBuilderBase +{ +public: + C_GatherBuilderBase(CephContext *cct_) + : cct(cct_), c_gather(NULL), finisher(NULL), activated(false) + { + } + C_GatherBuilderBase(CephContext *cct_, ContextType *finisher_) + : cct(cct_), c_gather(NULL), finisher(finisher_), activated(false) + { + } + ~C_GatherBuilderBase() { + if (c_gather) { + ceph_assert(activated); // Don't forget to activate your C_Gather! 
+ } + else { + delete finisher; + } + } + ContextType *new_sub() { + if (!c_gather) { + c_gather = new GatherType(cct, finisher); + } + return c_gather->new_sub(); + } + void activate() { + if (!c_gather) + return; + ceph_assert(finisher != NULL); + activated = true; + c_gather->activate(); + } + void set_finisher(ContextType *finisher_) { + finisher = finisher_; + if (c_gather) + c_gather->set_finisher(finisher); + } + GatherType *get() const { + return c_gather; + } + bool has_subs() const { + return (c_gather != NULL); + } + int num_subs_created() { + ceph_assert(!activated); + if (c_gather == NULL) + return 0; + return c_gather->get_sub_created_count(); + } + int num_subs_remaining() { + ceph_assert(!activated); + if (c_gather == NULL) + return 0; + return c_gather->get_sub_existing_count(); + } + +private: + CephContext *cct; + GatherType *c_gather; + ContextType *finisher; + bool activated; +}; + +typedef C_GatherBase<Context, Context> C_Gather; +typedef C_GatherBuilderBase<Context, C_Gather > C_GatherBuilder; + +class FunctionContext : public Context { +public: + FunctionContext(boost::function<void(int)> &&callback) + : m_callback(std::move(callback)) + { + } + + void finish(int r) override { + m_callback(r); + } +private: + boost::function<void(int)> m_callback; +}; + +template <class ContextType> +class ContextFactory { +public: + virtual ~ContextFactory() {} + virtual ContextType *build() = 0; +}; + +#undef mydout + +#endif diff --git a/src/include/Distribution.h b/src/include/Distribution.h new file mode 100644 index 00000000..e4f0b30b --- /dev/null +++ b/src/include/Distribution.h @@ -0,0 +1,73 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net> + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, 
as published by the Free Software + * Foundation. See file COPYING. + * + */ + + +#ifndef CEPH_DISTRIBUTION_H +#define CEPH_DISTRIBUTION_H + +#include <vector> + +class Distribution { + vector<float> p; + vector<int> v; + + public: + //Distribution() { + //} + + unsigned get_width() { + return p.size(); + } + + void clear() { + p.clear(); + v.clear(); + } + void add(int val, float pr) { + p.push_back(pr); + v.push_back(val); + } + + void random() { + float sum = 0.0; + for (unsigned i=0; i<p.size(); i++) { + p[i] = (float)(rand() % 10000); + sum += p[i]; + } + for (unsigned i=0; i<p.size(); i++) + p[i] /= sum; + } + + int sample() { + float s = (float)(rand() % 10000) / 10000.0; + for (unsigned i=0; i<p.size(); i++) { + if (s < p[i]) return v[i]; + s -= p[i]; + } + ceph_abort(); + return v[p.size() - 1]; // hmm. :/ + } + + float normalize() { + float s = 0.0; + for (unsigned i=0; i<p.size(); i++) + s += p[i]; + for (unsigned i=0; i<p.size(); i++) + p[i] /= s; + return s; + } + +}; + +#endif diff --git a/src/include/addr_parsing.h b/src/include/addr_parsing.h new file mode 100644 index 00000000..c205ac75 --- /dev/null +++ b/src/include/addr_parsing.h @@ -0,0 +1,28 @@ +/* + * addr_parsing.h + * + * Created on: Sep 14, 2010 + * Author: gregf + * contains functions used by Ceph to convert named addresses + * (eg ceph.com) into IP addresses (ie 127.0.0.1). 
/*
 * alloc_ptr - an owning pointer that allocates its object on first access.
 *
 * get(), operator* and operator-> create a value-initialized element if
 * none is held yet, even through a const alloc_ptr. operator bool reports
 * whether an object is currently held and never allocates.
 * Movable, not copyable.
 */
template <class T>
class alloc_ptr
{
public:
  typedef typename std::pointer_traits< std::unique_ptr<T> >::pointer pointer;
  typedef typename std::pointer_traits< std::unique_ptr<T> >::element_type element_type;

  alloc_ptr() : ptr() {}

  // Construct from anything a std::unique_ptr<element_type> accepts.
  template<class U>
  alloc_ptr(U&& u) : ptr(std::forward<U>(u)) {}

  // These were declared on alloc_ptr<pointer>, a different specialization,
  // and the assignment operators returned nothing. They now operate on
  // alloc_ptr itself.
  alloc_ptr(alloc_ptr&& rhs) noexcept : ptr(std::move(rhs.ptr)) {}
  alloc_ptr(const alloc_ptr& rhs) = delete;
  alloc_ptr& operator=(alloc_ptr&& rhs) noexcept {
    ptr = std::move(rhs.ptr);
    return *this;
  }
  alloc_ptr& operator=(const alloc_ptr& rhs) = delete;

  void swap (alloc_ptr& rhs) noexcept {
    ptr.swap(rhs.ptr);
  }
  // Give up ownership; returns nullptr if nothing was allocated yet.
  element_type* release() {
    return ptr.release();
  }
  void reset(element_type *p = nullptr) {
    ptr.reset(p);
  }
  element_type* get() const {
    ensure();
    return ptr.get();
  }
  element_type& operator*() const {
    ensure();
    return *ptr;
  }
  element_type* operator->() const {
    ensure();
    return ptr.get();
  }
  operator bool() const {
    return !!ptr;
  }

  // Comparisons compare the pointed-to values (allocating them on demand).
  // The originals constructed std::less<T>(a, b) instead of calling it,
  // which cannot compile.
  friend bool operator< (const alloc_ptr& lhs, const alloc_ptr& rhs) {
    return *lhs < *rhs;
  }
  friend bool operator<=(const alloc_ptr& lhs, const alloc_ptr& rhs) {
    return *lhs <= *rhs;
  }
  friend bool operator> (const alloc_ptr& lhs, const alloc_ptr& rhs) {
    return *lhs > *rhs;
  }
  friend bool operator>=(const alloc_ptr& lhs, const alloc_ptr& rhs) {
    return *lhs >= *rhs;
  }
  friend bool operator==(const alloc_ptr& lhs, const alloc_ptr& rhs) {
    return *lhs == *rhs;
  }
  friend bool operator!=(const alloc_ptr& lhs, const alloc_ptr& rhs) {
    return *lhs != *rhs;
  }
private:
  // Allocate on first use. Value-initialize so that scalar element types
  // start at zero rather than holding an indeterminate value.
  void ensure() const {
    if (!ptr)
      ptr.reset(new element_type());
  }

  mutable std::unique_ptr<element_type> ptr;
};
// Set `capacity` to this value to indicate that there is no fixed
// capacity.
//
inline constexpr std::size_t dynamic = ~std::size_t{0};

// Driver Function
// ---------------
//
// One function per stored type T, reached through a `func_t` pointer:
//
// - `op::type`:    `p` points at a `const std::type_info*`, which is set
//                  to T's type_info.
// - `op::destroy`: `p` points at a live T, whose destructor is run. The
//                  storage itself is not released.
//
// Kept at namespace scope so the type of `func_t` isn't dependent on
// the enclosing class.
//
enum class op { type, destroy };
template<typename T>
inline void op_func(const op o, void* p) noexcept {
  static const std::type_info& type = typeid(T);
  if (o == op::type) {
    auto out = reinterpret_cast<const std::type_info**>(p);
    *out = &type;
  } else if (o == op::destroy) {
    T* victim = reinterpret_cast<T*>(p);
    victim->~T();
  }
}
using func_t = void (*)(const op, void* p) noexcept;
This allows us to sidestep the problem in +// initialization order where, where exposed constructors were using +// trying to allocate or free storage *before* the data members of the +// derived class were initialized. +// +// Making storage_t a member type of the derived class won't work, due +// to C++'s rules for nested types being *horrible*. Just downright +// *horrible*. +// +template<typename D, typename storage_t> +class base { + // Make definitions from our superclass visible + // -------------------------------------------- + // + // And check that they fit the requirements. At least those that are + // statically checkable. + // + static constexpr std::size_t capacity = D::capacity; + + void* ptr() const noexcept { + static_assert( + noexcept(static_cast<const D*>(this)->ptr()) && + std::is_same_v<decltype(static_cast<const D*>(this)->ptr()), void*>, + "‘void* ptr() const noexcept’ missing from superclass"); + return static_cast<const D*>(this)->ptr(); + } + + void* alloc_storage(const std::size_t z) { + static_assert( + std::is_same_v<decltype(static_cast<D*>(this)->alloc_storage(z)), void*>, + "‘void* alloc_storage(const size_t)’ missing from superclass."); + return static_cast<D*>(this)->alloc_storage(z); + } + + void free_storage() noexcept { + static_assert( + noexcept(static_cast<D*>(this)->free_storage()) && + std::is_void_v<decltype(static_cast<D*>(this)->free_storage())>, + "‘void free_storage() noexcept’ missing from superclass."); + static_cast<D*>(this)->free_storage(); + } + + + // Pile O' Templates + // ----------------- + // + // These are just verbose and better typed once than twice. They're + // used for SFINAE and declaring noexcept. 
+ // + template<class T> + struct is_in_place_type_helper : std::false_type {}; + template<class T> + struct is_in_place_type_helper<std::in_place_type_t<T>> : std::true_type {}; + + template<class T> + static constexpr bool is_in_place_type_v = + is_in_place_type_helper<std::decay_t<T>>::value; + + // SFINAE condition for value initialized + // constructors/assigners. This is analogous to the standard's + // requirement that this overload only participate in overload + // resolution if std::decay_t<T> is not the same type as the + // any-type, nor a specialization of std::in_place_type_t + // + template<typename T> + using value_condition_t = std::enable_if_t< + !std::is_same_v<std::decay_t<T>, D> && + !is_in_place_type_v<std::decay_t<T>>>; + + // This `noexcept` condition for value construction lets + // `immobile_any`'s value constructor/assigner be noexcept, so long + // as the type's copy or move constructor cooperates. + // + template<typename T> + static constexpr bool value_noexcept_v = + std::is_nothrow_constructible_v<std::decay_t<T>, T> && capacity != dynamic; + + // SFINAE condition for in-place constructors/assigners + // + template<typename T, typename... Args> + using in_place_condition_t = std::enable_if_t<std::is_constructible_v< + std::decay_t<T>, Args...>>; + + // Analogous to the above. Give noexcept to immobile_any::emplace + // when possible. + // + template<typename T, typename... Args> + static constexpr bool in_place_noexcept_v = + std::is_nothrow_constructible_v<std::decay_t<T>, Args...> && + capacity != dynamic; + +private: + + // Functionality! + // -------------- + + // The driver function for the currently stored object. Whether this + // is null is the canonical way to know whether an instance has a + // value. + // + func_t func = nullptr; + + // Construct an object within ourselves. As you can see we give the + // weak exception safety guarantee. 
+ // + template<typename T, typename ...Args> + std::decay_t<T>& construct(Args&& ...args) { + using Td = std::decay_t<T>; + static_assert(capacity == dynamic || sizeof(Td) <= capacity, + "Supplied type is too large for this specialization."); + try { + func = &op_func<Td>; + return *new (reinterpret_cast<Td*>(alloc_storage(sizeof(Td)))) + Td(std::forward<Args>(args)...); + } catch (...) { + reset(); + throw; + } + } + +protected: + + // We hold the storage, even if the superclass class manipulates it, + // so that its default initialization comes soon enough for us to + // use it in our constructors. + // + storage_t storage; + +public: + + base() noexcept = default; + ~base() noexcept { + reset(); + } + +protected: + // Since some of our derived classes /can/ be copied or moved. + // + base(const base& rhs) noexcept : func(rhs.func) { + if constexpr (std::is_copy_assignable_v<storage_t>) { + storage = rhs.storage; + } + } + base& operator =(const base& rhs) noexcept { + reset(); + func = rhs.func; + if constexpr (std::is_copy_assignable_v<storage_t>) { + storage = rhs.storage; + } + return *this; + } + + base(base&& rhs) noexcept : func(std::move(rhs.func)) { + if constexpr (std::is_move_assignable_v<storage_t>) { + storage = std::move(rhs.storage); + } + rhs.func = nullptr; + } + base& operator =(base&& rhs) noexcept { + reset(); + func = rhs.func; + if constexpr (std::is_move_assignable_v<storage_t>) { + storage = std::move(rhs.storage); + } + rhs.func = nullptr; + return *this; + } + +public: + + // Value construct/assign + // ---------------------- + // + template<typename T, + typename = value_condition_t<T>> + base(T&& t) noexcept(value_noexcept_v<T>) { + construct<T>(std::forward<T>(t)); + } + + // On exception, *this is set to empty. 
+ // + template<typename T, + typename = value_condition_t<T>> + base& operator =(T&& t) noexcept(value_noexcept_v<T>) { + reset(); + construct<T>(std::forward<T>(t)); + return *this; + } + + // In-place construct/assign + // ------------------------- + // + // I really hate the way the C++ standard library treats references + // as if they were stepchildren in a Charles Dickens novel. I am + // quite upset that std::optional lacks a specialization for + // references. There's no legitimate reason for it. The whole + // 're-seat or refuse' debate is simply a canard. The optional is + // effectively a container, so of course it can be emptied or + // reassigned. No, pointers are not an acceptable substitute. A + // pointer gives an address in memory which may be null and which + // may represent an object or may a location in which an object is + // to be created. An optional reference, on the other hand, is a + // reference to an initialized, live object or /empty/. This is an + // obvious difference that should be communicable to any programmer + // reading the code through the type system. + // + // `std::any`, even in the case of in-place construction, + // only stores the decayed type. I suspect this was to get around + // the question of whether, for a std::any holding a T&, + // std::any_cast<T> should return a copy or throw + // std::bad_any_cast. + // + // I think the appropriate response in that case would be to make a + // copy if the type supports it and fail otherwise. Once a concrete + // type is known the problem solves itself. + // + // If one were inclined, one could easily load the driver function + // with a heavy subset of the type traits (those that depend only on + // the type in question) and simply /ask/ whether it's a reference. + // + // At the moment, I'm maintaining compatibility with the standard + // library except for copy/move semantics. + // + template<typename T, + typename... 
Args, + typename = in_place_condition_t<T, Args...>> + base(std::in_place_type_t<T>, + Args&& ...args) noexcept(in_place_noexcept_v<T, Args...>) { + construct<T>(std::forward<Args>(args)...); + } + + // On exception, *this is set to empty. + // + template<typename T, + typename... Args, + typename = in_place_condition_t<T>> + std::decay_t<T>& emplace(Args&& ...args) noexcept(in_place_noexcept_v< + T, Args...>) { + reset(); + return construct<T>(std::forward<Args>(args)...); + } + + template<typename T, + typename U, + typename... Args, + typename = in_place_condition_t<T, std::initializer_list<U>, + Args...>> + base(std::in_place_type_t<T>, + std::initializer_list<U> i, + Args&& ...args) noexcept(in_place_noexcept_v<T, std::initializer_list<U>, + Args...>) { + construct<T>(i, std::forward<Args>(args)...); + } + + // On exception, *this is set to empty. + // + template<typename T, + typename U, + typename... Args, + typename = in_place_condition_t<T, std::initializer_list<U>, + Args...>> + std::decay_t<T>& emplace(std::initializer_list<U> i, + Args&& ...args) noexcept(in_place_noexcept_v<T, + std::initializer_list<U>, + Args...>) { + reset(); + return construct<T>(i,std::forward<Args>(args)...); + } + + // Empty ourselves, using the subclass to free any storage. + // + void reset() noexcept { + if (has_value()) { + func(op::destroy, ptr()); + func = nullptr; + } + free_storage(); + } + + template<typename U = storage_t, + typename = std::enable_if<std::is_swappable_v<storage_t>>> + void swap(base& rhs) { + using std::swap; + swap(func, rhs.func); + swap(storage, rhs.storage); + } + + // All other functions should use this function to test emptiness + // rather than examining `func` directly. + // + bool has_value() const noexcept { + return !!func; + } + + // Returns the type of the value stored, if any. 
+ // + const std::type_info& type() const noexcept { + if (has_value()) { + const std::type_info* t; + func(op::type, reinterpret_cast<void*>(&t)); + return *t; + } else { + return typeid(void); + } + } + + template<typename T, typename U, typename V> + friend inline void* cast_helper(const base<U, V>& b) noexcept; +}; + +// Function used by all `any_cast` functions +// +// Returns a void* to the contents if they exist and match the +// requested type, otherwise `nullptr`. +// +template<typename T, typename U, typename V> +inline void* cast_helper(const base<U, V>& b) noexcept { + if (b.func && ((&op_func<T> == b.func) || + (b.type() == typeid(T)))) { + return b.ptr(); + } else { + return nullptr; + } +} +} + +// `any_cast` +// ========== +// +// Just the usual gamut of `any_cast` overloads. These get a bit +// repetitive and it would be nice to think of a way to collapse them +// down a bit. +// + +// The pointer pair! +// +template<typename T, typename U, typename V> +inline T* any_cast(_any::base<U, V>* a) noexcept { + if (a) { + return static_cast<T*>(_any::cast_helper<std::decay_t<T>>(*a)); + } + return nullptr; +} + +template<typename T, typename U, typename V> +inline const T* any_cast(const _any::base<U, V>* a) noexcept { + if (a) { + return static_cast<T*>(_any::cast_helper<std::decay_t<T>>(*a)); + } + return nullptr; +} + +// While we disallow copying the immobile any itself, we can allow +// anything with an extracted value that the type supports. 
+// +template<typename T, typename U, typename V> +inline T any_cast(_any::base<U, V>& a) { + static_assert(std::is_reference_v<T> || + std::is_copy_constructible_v<T>, + "The supplied type must be either a reference or " + "copy constructible."); + auto p = any_cast<std::decay_t<T>>(&a); + if (p) { + return static_cast<T>(*p); + } + throw std::bad_any_cast(); +} + +template<typename T, typename U, typename V> +inline T any_cast(const _any::base<U, V>& a) { + static_assert(std::is_reference_v<T> || + std::is_copy_constructible_v<T>, + "The supplied type must be either a reference or " + "copy constructible."); + auto p = any_cast<std::decay_t<T>>(&a); + if (p) { + return static_cast<T>(*p); + } + throw std::bad_any_cast(); +} + +template<typename T, typename U, typename V> +inline std::enable_if_t<(std::is_move_constructible_v<T> || + std::is_copy_constructible_v<T>) && + !std::is_rvalue_reference_v<T>, T> +any_cast(_any::base<U, V>&& a) { + auto p = any_cast<std::decay_t<T>>(&a); + if (p) { + return std::move((*p)); + } + throw std::bad_any_cast(); +} + +template<typename T, typename U, typename V> +inline std::enable_if_t<std::is_rvalue_reference_v<T>, T> +any_cast(_any::base<U, V>&& a) { + auto p = any_cast<std::decay_t<T>>(&a); + if (p) { + return static_cast<T>(*p); + } + throw std::bad_any_cast(); +} + +// `immobile_any` +// ============== +// +// Sometimes, uncopyable objects exist and I want to do things with +// them. The C++ standard library is really quite keen on insisting +// things be copyable before it deigns to work. I find this annoying. +// +// Also, the allocator, while useful, is really not considerate of +// other people's time. Every time we go to visit it, it takes us +// quite an awfully long time to get away again. As such, I've been +// trying to avoid its company whenever it is convenient and seemly. +// +// We accept any type that will fit in the declared capacity. 
You may +// store types with throwing destructors, but terminate will be +// invoked when they throw. +// +template<std::size_t S> +class immobile_any : public _any::base<immobile_any<S>, + std::aligned_storage_t<S>> { + using base = _any::base<immobile_any<S>, std::aligned_storage_t<S>>; + friend base; + + using _any::base<immobile_any<S>, std::aligned_storage_t<S>>::storage; + + // Superclass requirements! + // ------------------------ + // + // Simple as anything. We have a buffer of fixed size and return the + // pointer to it when asked. + // + static constexpr std::size_t capacity = S; + void* ptr() const noexcept { + return const_cast<void*>(static_cast<const void*>(&storage)); + } + void* alloc_storage(std::size_t) noexcept { + return ptr(); + } + void free_storage() noexcept {} + + static_assert(capacity != _any::dynamic, + "That is not a valid size for an immobile_any."); + +public: + + immobile_any() noexcept = default; + + immobile_any(const immobile_any&) = delete; + immobile_any& operator =(const immobile_any&) = delete; + immobile_any(immobile_any&&) = delete; + immobile_any& operator =(immobile_any&&) = delete; + + using base::base; + using base::operator =; + + void swap(immobile_any&) = delete; +}; + +template<typename T, std::size_t S, typename... Args> +inline immobile_any<S> make_immobile_any(Args&& ...args) { + return immobile_any<S>(std::in_place_type<T>, std::forward<Args>(args)...); +} + +template<typename T, std::size_t S, typename U, typename... Args> +inline immobile_any<S> make_immobile_any(std::initializer_list<U> i, Args&& ...args) { + return immobile_any<S>(std::in_place_type<T>, i, std::forward<Args>(args)...); +} + +// `unique_any` +// ============ +// +// Oh dear. Now we're getting back into allocation. You don't think +// the allocator noticed all those mean things we said about it, do +// you? +// +// Well. Okay, allocator. 
Sometimes when it's the middle of the night +// and you're writing template code you say things you don't exactly +// mean. If it weren't for you, we wouldn't have any memory to run all +// our programs in at all. Really, I'm just being considerate of +// *your* needs, trying to avoid having to run to you every time we +// instantiate a type, making a few that can be self-sufficient…uh… +// +// **Anyway**, this is movable but not copyable, as you should expect +// from anything with ‘unique’ in the name. +// +class unique_any : public _any::base<unique_any, std::unique_ptr<std::byte[]>> { + using base = _any::base<unique_any, std::unique_ptr<std::byte[]>>; + friend base; + + using base::storage; + + // Superclass requirements + // ----------------------- + // + // Our storage is a single chunk of RAM owned by a + // `std::unique_ptr`. + // + static constexpr std::size_t capacity = _any::dynamic; + void* ptr() const noexcept { + return static_cast<void*>(storage.get()); + return nullptr; + } + + void* alloc_storage(const std::size_t z) { + storage.reset(new std::byte[z]); + return ptr(); + } + + void free_storage() noexcept { + storage.reset(); + } + +public: + + unique_any() noexcept = default; + ~unique_any() noexcept = default; + + unique_any(const unique_any&) = delete; + unique_any& operator =(const unique_any&) = delete; + + // We can rely on the behavior of `unique_ptr` and the base class to + // give us a default move constructor that does the right thing. + // + unique_any(unique_any&& rhs) noexcept = default; + unique_any& operator =(unique_any&& rhs) = default; + + using base::base; + using base::operator =; +}; + +inline void swap(unique_any& lhs, unique_any& rhs) noexcept { + lhs.swap(rhs); +} + +template<typename T, typename... Args> +inline unique_any make_unique_any(Args&& ...args) { + return unique_any(std::in_place_type<T>, std::forward<Args>(args)...); +} + +template<typename T, typename U, typename... 
// `shared_any`
// ============
//
// Once more with feeling!
//
// This is both copyable *and* movable. In case you need that sort of
// thing. It seemed a reasonable completion.
//
// NOTE(review): copies share one buffer through the `shared_ptr`,
// while the base class's `reset()` runs the destroy operation
// whenever `has_value()` is true. It therefore looks as though every
// copy would run the stored object's destructor on the same memory --
// confirm against the base class before relying on copies.
//
class shared_any : public _any::base<shared_any, boost::shared_ptr<std::byte[]>> {
  using base = _any::base<shared_any, boost::shared_ptr<std::byte[]>>;
  friend base;

  using base::storage;

  // Superclass requirements
  // -----------------------
  //
  // Our storage is a single chunk of RAM allocated from the
  // heap. This time it's owned by a `boost::shared_ptr` so we can use
  // `boost::make_shared_noinit`. (This lets us get the optimization
  // that allocates array and control block in one without wasting
  // time on `memset`.)
  //
  static constexpr std::size_t capacity = _any::dynamic;

  // Address of the shared chunk; null when nothing is allocated.
  void* ptr() const noexcept {
    return static_cast<void*>(storage.get());
  }

  // Point `storage` at a fresh, uninitialized chunk of `n` bytes.
  void* alloc_storage(std::size_t n) {
    storage = boost::make_shared_noinit<std::byte[]>(n);
    return ptr();
  }

  // Drop this instance's reference to the chunk.
  void free_storage() noexcept {
    storage.reset();
  }

public:

  shared_any() noexcept = default;
  ~shared_any() noexcept = default;

  shared_any(const shared_any& rhs) noexcept = default;
  shared_any& operator =(const shared_any&) noexcept = default;

  shared_any(shared_any&& rhs) noexcept = default;
  shared_any& operator =(shared_any&& rhs) noexcept = default;

  using base::base;
  using base::operator =;
};

// ADL-visible swap; forwards to the member `swap` inherited from the
// base.
inline void swap(shared_any& lhs, shared_any& rhs) noexcept {
  lhs.swap(rhs);
}

// Build a `shared_any` holding a `T` constructed in place from `args`.
template<typename T, typename... Args>
inline shared_any make_shared_any(Args&& ...args) {
  return shared_any(std::in_place_type<T>, std::forward<Args>(args)...);
}

// As above, with a leading initializer list forwarded to `T`.
template<typename T, typename U, typename... Args>
inline shared_any make_shared_any(std::initializer_list<U> i, Args&& ...args) {
  return shared_any(std::in_place_type<T>, i, std::forward<Args>(args)...);
}
// A non-owning view that addresses a caller-supplied byte buffer as
// an array of bits. Bit `b` lives in byte `b >> 3` at position
// `b & 7` (least-significant bit first). No bounds checking is
// performed against the recorded length.
class bitmapper {
  char *_data;
  int _len;

  // Index of the byte containing bit `b`.
  static int byte_index(int b) { return b >> 3; }
  // Mask selecting bit `b` inside its byte.
  static int bit_mask(int b) { return 1 << (b & 7); }

 public:
  // An empty view: no buffer, zero length.
  bitmapper() : _data(nullptr), _len(0) { }
  // View `len` bytes starting at `data`.
  bitmapper(char *data, int len) : _data(data), _len(len) { }

  // Re-point the view at a different buffer.
  void set_data(char *data, int len) {
    _data = data;
    _len = len;
  }

  // Size of the viewed buffer in bytes / in bits.
  int bytes() const { return _len; }
  int bits() const { return _len * 8; }

  // Read bit `b`.
  bool operator[](int b) const {
    return get(b);
  }
  bool get(int b) const {
    return (_data[byte_index(b)] & bit_mask(b)) != 0;
  }

  // Turn bit `b` on.
  void set(int b) {
    _data[byte_index(b)] |= bit_mask(b);
  }
  // Turn bit `b` off.
  void clear(int b) {
    _data[byte_index(b)] &= ~bit_mask(b);
  }
  // Flip bit `b`.
  void toggle(int b) {
    _data[byte_index(b)] ^= bit_mask(b);
  }
};
the Free Software + * Foundation. See file COPYING. + * + */ + +#ifndef CEPH_BLOBHASH_H +#define CEPH_BLOBHASH_H + +#include "hash.h" + +/* +- this is to make some of the STL types work with 64 bit values, string hash keys, etc. +- added when i was using an old STL.. maybe try taking these out and see if things + compile now? +*/ + +class blobhash { +public: + uint32_t operator()(const char *p, unsigned len) { + static rjhash<uint32_t> H; + uint32_t acc = 0; + while (len >= sizeof(acc)) { + acc ^= *(uint32_t*)p; + p += sizeof(uint32_t); + len -= sizeof(uint32_t); + } + int sh = 0; + while (len) { + acc ^= (uint32_t)*p << sh; + sh += 8; + len--; + p++; + } + return H(acc); + } +}; + + +#endif diff --git a/src/include/btree_map.h b/src/include/btree_map.h new file mode 100644 index 00000000..1f42ea41 --- /dev/null +++ b/src/include/btree_map.h @@ -0,0 +1,63 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#ifndef CEPH_INCLUDE_BTREE_MAP_H +#define CEPH_INCLUDE_BTREE_MAP_H + +#include "include/cpp-btree/btree.h" +#include "include/cpp-btree/btree_map.h" +#include "include/ceph_assert.h" // cpp-btree uses system assert, blech +#include "include/encoding.h" + +template<class T, class U> +inline void encode(const btree::btree_map<T,U>& m, bufferlist& bl) +{ + __u32 n = (__u32)(m.size()); + encode(n, bl); + for (typename btree::btree_map<T,U>::const_iterator p = m.begin(); p != m.end(); ++p) { + encode(p->first, bl); + encode(p->second, bl); + } +} +template<class T, class U> +inline void encode(const btree::btree_map<T,U>& m, bufferlist& bl, uint64_t features) +{ + __u32 n = (__u32)(m.size()); + encode(n, bl); + for (typename btree::btree_map<T,U>::const_iterator p = m.begin(); p != m.end(); ++p) { + encode(p->first, bl, features); + encode(p->second, bl, features); + } +} +template<class T, class U> +inline void decode(btree::btree_map<T,U>& m, bufferlist::const_iterator& p) +{ + __u32 n; + decode(n, p); + 
m.clear(); + while (n--) { + T k; + decode(k, p); + decode(m[k], p); + } +} +template<class T, class U> +inline void encode_nohead(const btree::btree_map<T,U>& m, bufferlist& bl) +{ + for (typename btree::btree_map<T,U>::const_iterator p = m.begin(); p != m.end(); ++p) { + encode(p->first, bl); + encode(p->second, bl); + } +} +template<class T, class U> +inline void decode_nohead(int n, btree::btree_map<T,U>& m, bufferlist::const_iterator& p) +{ + m.clear(); + while (n--) { + T k; + decode(k, p); + decode(m[k], p); + } +} + +#endif diff --git a/src/include/buffer.h b/src/include/buffer.h new file mode 100644 index 00000000..774ca052 --- /dev/null +++ b/src/include/buffer.h @@ -0,0 +1,1331 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net> + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. + * + */ +#ifndef CEPH_BUFFER_H +#define CEPH_BUFFER_H + +#if defined(__linux__) || defined(__FreeBSD__) +#include <stdlib.h> +#endif +#include <limits.h> + +#ifndef _XOPEN_SOURCE +# define _XOPEN_SOURCE 600 +#endif + +#include <stdio.h> +#include <sys/uio.h> + +#if defined(__linux__) // For malloc(2). 
// Deleter that deliberately does nothing: calling it with a pointer
// leaves the pointee untouched. Used as the deleter type of
// `unique_leakable_ptr`.
template <class T>
struct nop_delete {
  void operator()(T*) {}
};
+template <class T> +struct unique_leakable_ptr : public std::unique_ptr<T, ceph::nop_delete<T>> { + using std::unique_ptr<T, ceph::nop_delete<T>>::unique_ptr; +}; + +namespace buffer CEPH_BUFFER_API { +inline namespace v14_2_0 { + + /* + * exceptions + */ + + struct error : public std::exception{ + const char *what() const throw () override; + }; + struct bad_alloc : public error { + const char *what() const throw () override; + }; + struct end_of_buffer : public error { + const char *what() const throw () override; + }; + struct malformed_input : public error { + explicit malformed_input(const std::string& w) { + snprintf(buf, sizeof(buf), "buffer::malformed_input: %s", w.c_str()); + } + const char *what() const throw () override; + private: + char buf[256]; + }; + struct error_code : public malformed_input { + explicit error_code(int error); + int code; + }; + + + /// count of cached crc hits (matching input) + int get_cached_crc(); + /// count of cached crc hits (mismatching input, required adjustment) + int get_cached_crc_adjusted(); + /// count of crc cache misses + int get_missed_crc(); + /// enable/disable tracking of cached crcs + void track_cached_crc(bool b); + + /* + * an abstract raw buffer. with a reference count. 
+ */ + class raw; + class raw_malloc; + class raw_static; + class raw_posix_aligned; + class raw_hack_aligned; + class raw_char; + class raw_claimed_char; + class raw_unshareable; // diagnostic, unshareable char buffer + class raw_combined; + class raw_claim_buffer; + + + class xio_mempool; + class xio_msg_buffer; + + /* + * named constructors + */ + ceph::unique_leakable_ptr<raw> copy(const char *c, unsigned len); + ceph::unique_leakable_ptr<raw> create(unsigned len); + ceph::unique_leakable_ptr<raw> create_in_mempool(unsigned len, int mempool); + raw* claim_char(unsigned len, char *buf); + raw* create_malloc(unsigned len); + raw* claim_malloc(unsigned len, char *buf); + raw* create_static(unsigned len, char *buf); + ceph::unique_leakable_ptr<raw> create_aligned(unsigned len, unsigned align); + ceph::unique_leakable_ptr<raw> create_aligned_in_mempool(unsigned len, unsigned align, int mempool); + ceph::unique_leakable_ptr<raw> create_page_aligned(unsigned len); + ceph::unique_leakable_ptr<raw> create_small_page_aligned(unsigned len); + raw* create_unshareable(unsigned len); + raw* create_static(unsigned len, char *buf); + raw* claim_buffer(unsigned len, char *buf, deleter del); + +#ifdef HAVE_SEASTAR + /// create a raw buffer to wrap seastar cpu-local memory, using foreign_ptr to + /// make it safe to share between cpus + raw* create_foreign(seastar::temporary_buffer<char>&& buf); + /// create a raw buffer to wrap seastar cpu-local memory, without the safety + /// of foreign_ptr. the caller must otherwise guarantee that the buffer ptr is + /// destructed on this cpu + raw* create(seastar::temporary_buffer<char>&& buf); +#endif +#if defined(HAVE_XIO) + raw* create_msg(unsigned len, char *buf, XioDispatchHook *m_hook); +#endif + + /* + * a buffer pointer. references (a subsequence of) a raw buffer. 
+ */ + class CEPH_BUFFER_API ptr { + raw *_raw; + public: // dirty hack for testing; if it works, this will be abstracted + unsigned _off, _len; + private: + + void release(); + + template<bool is_const> + class iterator_impl { + const ptr *bp; ///< parent ptr + const char *start; ///< starting pointer into bp->c_str() + const char *pos; ///< pointer into bp->c_str() + const char *end_ptr; ///< pointer to bp->end_c_str() + const bool deep; ///< if true, do not allow shallow ptr copies + + iterator_impl(typename std::conditional<is_const, const ptr*, ptr*>::type p, + size_t offset, bool d) + : bp(p), + start(p->c_str() + offset), + pos(start), + end_ptr(p->end_c_str()), + deep(d) + {} + + friend class ptr; + + public: + using pointer = typename std::conditional<is_const, const char*, char *>::type; + pointer get_pos_add(size_t n) { + auto r = pos; + advance(n); + return r; + } + ptr get_ptr(size_t len) { + if (deep) { + return buffer::copy(get_pos_add(len), len); + } else { + size_t off = pos - bp->c_str(); + advance(len); + return ptr(*bp, off, len); + } + } + + void advance(size_t len) { + pos += len; + if (pos > end_ptr) + throw end_of_buffer(); + } + + const char *get_pos() { + return pos; + } + const char *get_end() { + return end_ptr; + } + + size_t get_offset() { + return pos - start; + } + + bool end() const { + return pos == end_ptr; + } + }; + + public: + using const_iterator = iterator_impl<true>; + using iterator = iterator_impl<false>; + + ptr() : _raw(nullptr), _off(0), _len(0) {} + // cppcheck-suppress noExplicitConstructor + ptr(raw* r); + ptr(ceph::unique_leakable_ptr<raw> r); + // cppcheck-suppress noExplicitConstructor + ptr(unsigned l); + ptr(const char *d, unsigned l); + ptr(const ptr& p); + ptr(ptr&& p) noexcept; + ptr(const ptr& p, unsigned o, unsigned l); + ptr(const ptr& p, ceph::unique_leakable_ptr<raw> r); + ptr& operator= (const ptr& p); + ptr& operator= (ptr&& p) noexcept; + ~ptr() { + // BE CAREFUL: this destructor is called also for 
hypercombined ptr_node. + // After freeing underlying raw, `*this` can become inaccessible as well! + release(); + } + + bool have_raw() const { return _raw ? true:false; } + + ceph::unique_leakable_ptr<raw> clone(); + void swap(ptr& other) noexcept; + + iterator begin(size_t offset=0) { + return iterator(this, offset, false); + } + const_iterator begin(size_t offset=0) const { + return const_iterator(this, offset, false); + } + const_iterator cbegin() const { + return begin(); + } + const_iterator begin_deep(size_t offset=0) const { + return const_iterator(this, offset, true); + } + + // misc + bool is_aligned(unsigned align) const { + return ((long)c_str() & (align-1)) == 0; + } + bool is_page_aligned() const { return is_aligned(CEPH_PAGE_SIZE); } + bool is_n_align_sized(unsigned align) const + { + return (length() % align) == 0; + } + bool is_n_page_sized() const { return is_n_align_sized(CEPH_PAGE_SIZE); } + bool is_partial() const { + return have_raw() && (start() > 0 || end() < raw_length()); + } + + int get_mempool() const; + void reassign_to_mempool(int pool); + void try_assign_to_mempool(int pool); + + // accessors + raw *get_raw() const { return _raw; } + const char *c_str() const; + char *c_str(); + const char *end_c_str() const; + char *end_c_str(); + unsigned length() const { return _len; } + unsigned offset() const { return _off; } + unsigned start() const { return _off; } + unsigned end() const { return _off + _len; } + unsigned unused_tail_length() const; + const char& operator[](unsigned n) const; + char& operator[](unsigned n); + + const char *raw_c_str() const; + unsigned raw_length() const; + int raw_nref() const; + + void copy_out(unsigned o, unsigned l, char *dest) const; + + unsigned wasted() const; + + int cmp(const ptr& o) const; + bool is_zero() const; + + // modifiers + void set_offset(unsigned o) { +#ifdef __CEPH__ + ceph_assert(raw_length() >= o); +#else + assert(raw_length() >= o); +#endif + _off = o; + } + void set_length(unsigned l) 
{ +#ifdef __CEPH__ + ceph_assert(raw_length() >= l); +#else + assert(raw_length() >= l); +#endif + _len = l; + } + + unsigned append(char c); + unsigned append(const char *p, unsigned l); +#if __cplusplus >= 201703L + inline unsigned append(std::string_view s) { + return append(s.data(), s.length()); + } +#endif // __cplusplus >= 201703L + void copy_in(unsigned o, unsigned l, const char *src, bool crc_reset = true); + void zero(bool crc_reset = true); + void zero(unsigned o, unsigned l, bool crc_reset = true); + unsigned append_zeros(unsigned l); + +#ifdef HAVE_SEASTAR + /// create a temporary_buffer, copying the ptr as its deleter + operator seastar::temporary_buffer<char>() &; + /// convert to temporary_buffer, stealing the ptr as its deleter + operator seastar::temporary_buffer<char>() &&; +#endif // HAVE_SEASTAR + + }; + + + struct ptr_hook { + mutable ptr_hook* next; + + ptr_hook() = default; + ptr_hook(ptr_hook* const next) + : next(next) { + } + }; + + class ptr_node : public ptr_hook, public ptr { + public: + struct cloner { + ptr_node* operator()(const ptr_node& clone_this); + }; + struct disposer { + void operator()(ptr_node* const delete_this) { + if (!dispose_if_hypercombined(delete_this)) { + delete delete_this; + } + } + }; + + ~ptr_node() = default; + + static std::unique_ptr<ptr_node, disposer> + create(ceph::unique_leakable_ptr<raw> r) { + return create_hypercombined(std::move(r)); + } + static std::unique_ptr<ptr_node, disposer> create(raw* const r) { + return create_hypercombined(r); + } + static std::unique_ptr<ptr_node, disposer> create(const unsigned l) { + return create_hypercombined(buffer::create(l)); + } + template <class... Args> + static std::unique_ptr<ptr_node, disposer> create(Args&&... args) { + return std::unique_ptr<ptr_node, disposer>( + new ptr_node(std::forward<Args>(args)...)); + } + + static ptr_node* copy_hypercombined(const ptr_node& copy_this); + + private: + template <class... Args> + ptr_node(Args&&... 
args) : ptr(std::forward<Args>(args)...) { + } + ptr_node(const ptr_node&) = default; + + ptr& operator= (const ptr& p) = delete; + ptr& operator= (ptr&& p) noexcept = delete; + ptr_node& operator= (const ptr_node& p) = delete; + ptr_node& operator= (ptr_node&& p) noexcept = delete; + void swap(ptr& other) noexcept = delete; + void swap(ptr_node& other) noexcept = delete; + + static bool dispose_if_hypercombined(ptr_node* delete_this); + static std::unique_ptr<ptr_node, disposer> create_hypercombined( + buffer::raw* r); + static std::unique_ptr<ptr_node, disposer> create_hypercombined( + ceph::unique_leakable_ptr<raw> r); + }; + /* + * list - the useful bit! + */ + + class CEPH_BUFFER_API list { + public: + // this the very low-level implementation of singly linked list + // ceph::buffer::list is built on. We don't use intrusive slist + // of Boost (or any other 3rd party) to save extra dependencies + // in our public headers. + class buffers_t { + // _root.next can be thought as _head + ptr_hook _root; + ptr_hook* _tail; + std::size_t _size; + + public: + template <class T> + class buffers_iterator { + typename std::conditional< + std::is_const<T>::value, const ptr_hook*, ptr_hook*>::type cur; + template <class U> friend class buffers_iterator; + public: + using value_type = T; + using reference = typename std::add_lvalue_reference<T>::type; + using pointer = typename std::add_pointer<T>::type; + using difference_type = std::ptrdiff_t; + using iterator_category = std::forward_iterator_tag; + + template <class U> + buffers_iterator(U* const p) + : cur(p) { + } + template <class U> + buffers_iterator(const buffers_iterator<U>& other) + : cur(other.cur) { + } + buffers_iterator() = default; + + T& operator*() const { + return *reinterpret_cast<T*>(cur); + } + T* operator->() const { + return reinterpret_cast<T*>(cur); + } + + buffers_iterator& operator++() { + cur = cur->next; + return *this; + } + buffers_iterator operator++(int) { + const auto temp(*this); + 
++*this; + return temp; + } + + template <class U> + buffers_iterator& operator=(buffers_iterator<U>& other) { + cur = other.cur; + return *this; + } + + bool operator==(const buffers_iterator& rhs) const { + return cur == rhs.cur; + } + bool operator!=(const buffers_iterator& rhs) const { + return !(*this==rhs); + } + + using citer_t = buffers_iterator<typename std::add_const<T>::type>; + operator citer_t() const { + return citer_t(cur); + } + }; + + typedef buffers_iterator<const ptr_node> const_iterator; + typedef buffers_iterator<ptr_node> iterator; + + typedef const ptr_node& const_reference; + typedef ptr_node& reference; + + buffers_t() + : _root(&_root), + _tail(&_root), + _size(0) { + } + buffers_t(const buffers_t&) = delete; + buffers_t(buffers_t&& other) + : _root(other._root.next == &other._root ? &_root : other._root.next), + _tail(other._tail == &other._root ? &_root : other._tail), + _size(other._size) { + other._root.next = &other._root; + other._tail = &other._root; + other._size = 0; + + _tail->next = &_root; + } + buffers_t& operator=(buffers_t&& other) { + if (&other != this) { + clear_and_dispose(); + swap(other); + } + return *this; + } + + void push_back(reference item) { + item.next = &_root; + // this updates _root.next when called on empty + _tail->next = &item; + _tail = &item; + _size++; + } + + void push_front(reference item) { + item.next = _root.next; + _root.next = &item; + _tail = _tail == &_root ? &item : _tail; + _size++; + } + + // *_after + iterator erase_after(const_iterator it) { + const auto* to_erase = it->next; + + it->next = to_erase->next; + _root.next = _root.next == to_erase ? to_erase->next : _root.next; + _tail = _tail == to_erase ? (ptr_hook*)&*it : _tail; + _size--; + return it->next; + } + + void insert_after(const_iterator it, reference item) { + item.next = it->next; + it->next = &item; + _root.next = it == end() ? &item : _root.next; + _tail = const_iterator(_tail) == it ? 
&item : _tail; + _size++; + } + + void splice_back(buffers_t& other) { + if (other._size == 0) { + return; + } + + other._tail->next = &_root; + // will update root.next if empty() == true + _tail->next = other._root.next; + _tail = other._tail; + _size += other._size; + + other._root.next = &other._root; + other._tail = &other._root; + other._size = 0; + } + + std::size_t size() const { return _size; } + bool empty() const { return _tail == &_root; } + + const_iterator begin() const { + return _root.next; + } + const_iterator before_begin() const { + return &_root; + } + const_iterator end() const { + return &_root; + } + iterator begin() { + return _root.next; + } + iterator before_begin() { + return &_root; + } + iterator end() { + return &_root; + } + + reference front() { + return reinterpret_cast<reference>(*_root.next); + } + reference back() { + return reinterpret_cast<reference>(*_tail); + } + const_reference front() const { + return reinterpret_cast<const_reference>(*_root.next); + } + const_reference back() const { + return reinterpret_cast<const_reference>(*_tail); + } + + void clone_from(const buffers_t& other) { + clear_and_dispose(); + for (auto& node : other) { + ptr_node* clone = ptr_node::cloner()(node); + push_back(*clone); + } + } + void clear_and_dispose() { + for (auto it = begin(); it != end(); /* nop */) { + auto& node = *it; + it = it->next; + ptr_node::disposer()(&node); + } + _root.next = &_root; + _tail = &_root; + _size = 0; + } + iterator erase_after_and_dispose(iterator it) { + auto* to_dispose = &*std::next(it); + auto ret = erase_after(it); + ptr_node::disposer()(to_dispose); + return ret; + } + + void swap(buffers_t& other) { + const auto copy_root = _root; + _root.next = \ + other._root.next == &other._root ? &this->_root : other._root.next; + other._root.next = \ + copy_root.next == &_root ? &other._root : copy_root.next; + + const auto copy_tail = _tail; + _tail = other._tail == &other._root ? 
&this->_root : other._tail; + other._tail = copy_tail == &_root ? &other._root : copy_tail; + + _tail->next = &_root; + other._tail->next = &other._root; + std::swap(_size, other._size); + } + }; + + class iterator; + + private: + // my private bits + buffers_t _buffers; + + // track bufferptr we can modify (especially ::append() to). Not all bptrs + // bufferlist holds have this trait -- if somebody ::push_back(const ptr&), + // he expects it won't change. + ptr* _carriage; + unsigned _len; + unsigned _memcopy_count; //the total of memcopy using rebuild(). + + template <bool is_const> + class CEPH_BUFFER_API iterator_impl { + protected: + typedef typename std::conditional<is_const, + const list, + list>::type bl_t; + typedef typename std::conditional<is_const, + const buffers_t, + buffers_t >::type list_t; + typedef typename std::conditional<is_const, + typename buffers_t::const_iterator, + typename buffers_t::iterator>::type list_iter_t; + bl_t* bl; + list_t* ls; // meh.. just here to avoid an extra pointer dereference.. + list_iter_t p; + unsigned off; // in bl + unsigned p_off; // in *p + friend class iterator_impl<true>; + + public: + using iterator_category = std::forward_iterator_tag; + using value_type = typename std::conditional<is_const, const char, char>::type; + using difference_type = std::ptrdiff_t; + using pointer = typename std::add_pointer<value_type>::type; + using reference = typename std::add_lvalue_reference<value_type>::type; + + // constructor. position. 
+ iterator_impl() + : bl(0), ls(0), off(0), p_off(0) {} + iterator_impl(bl_t *l, unsigned o=0); + iterator_impl(bl_t *l, unsigned o, list_iter_t ip, unsigned po) + : bl(l), ls(&bl->_buffers), p(ip), off(o), p_off(po) {} + iterator_impl(const list::iterator& i); + + /// get current iterator offset in buffer::list + unsigned get_off() const { return off; } + + /// get number of bytes remaining from iterator position to the end of the buffer::list + unsigned get_remaining() const { return bl->length() - off; } + + /// true if iterator is at the end of the buffer::list + bool end() const { + return p == ls->end(); + //return off == bl->length(); + } + + void advance(int o) = delete; + void advance(unsigned o); + void advance(size_t o) { advance(static_cast<unsigned>(o)); } + void seek(unsigned o); + char operator*() const; + iterator_impl& operator++(); + ptr get_current_ptr() const; + bool is_pointing_same_raw(const ptr& other) const; + + bl_t& get_bl() const { return *bl; } + + // copy data out. + // note that these all _append_ to dest! + void copy(unsigned len, char *dest); + // deprecated, use copy_deep() + void copy(unsigned len, ptr &dest) __attribute__((deprecated)); + void copy_deep(unsigned len, ptr &dest); + void copy_shallow(unsigned len, ptr &dest); + void copy(unsigned len, list &dest); + void copy(unsigned len, std::string &dest); + void copy_all(list &dest); + + // get a pointer to the currenet iterator position, return the + // number of bytes we can read from that position (up to want), + // and advance the iterator by that amount. 
+ size_t get_ptr_and_advance(size_t want, const char **p); + + /// calculate crc from iterator position + uint32_t crc32c(size_t length, uint32_t crc); + + friend bool operator==(const iterator_impl& lhs, + const iterator_impl& rhs) { + return &lhs.get_bl() == &rhs.get_bl() && lhs.get_off() == rhs.get_off(); + } + friend bool operator!=(const iterator_impl& lhs, + const iterator_impl& rhs) { + return &lhs.get_bl() != &rhs.get_bl() || lhs.get_off() != rhs.get_off(); + } + }; + + public: + typedef iterator_impl<true> const_iterator; + + class CEPH_BUFFER_API iterator : public iterator_impl<false> { + public: + iterator() = default; + iterator(bl_t *l, unsigned o=0); + iterator(bl_t *l, unsigned o, list_iter_t ip, unsigned po); + // copy data in + void copy_in(unsigned len, const char *src, bool crc_reset = true); + void copy_in(unsigned len, const list& otherl); + }; + + struct reserve_t { + char* bp_data; + unsigned* bp_len; + unsigned* bl_len; + }; + + class contiguous_appender { + ceph::bufferlist& bl; + ceph::bufferlist::reserve_t space; + char* pos; + bool deep; + + /// running count of bytes appended that are not reflected by @pos + size_t out_of_band_offset = 0; + + contiguous_appender(bufferlist& bl, size_t len, bool d) + : bl(bl), + space(bl.obtain_contiguous_space(len)), + pos(space.bp_data), + deep(d) { + } + + void flush_and_continue() { + const size_t l = pos - space.bp_data; + *space.bp_len += l; + *space.bl_len += l; + space.bp_data = pos; + } + + friend class list; + + public: + ~contiguous_appender() { + flush_and_continue(); + } + + size_t get_out_of_band_offset() const { + return out_of_band_offset; + } + void append(const char* __restrict__ p, size_t l) { + maybe_inline_memcpy(pos, p, l, 16); + pos += l; + } + char *get_pos_add(size_t len) { + char *r = pos; + pos += len; + return r; + } + char *get_pos() { + return pos; + } + + void append(const bufferptr& p) { + const auto plen = p.length(); + if (!plen) { + return; + } + if (deep) { + 
append(p.c_str(), plen); + } else { + flush_and_continue(); + bl.append(p); + space = bl.obtain_contiguous_space(0); + out_of_band_offset += plen; + } + } + void append(const bufferlist& l) { + if (deep) { + for (const auto &p : l._buffers) { + append(p.c_str(), p.length()); + } + } else { + flush_and_continue(); + bl.append(l); + space = bl.obtain_contiguous_space(0); + out_of_band_offset += l.length(); + } + } + + size_t get_logical_offset() { + return out_of_band_offset + (pos - space.bp_data); + } + }; + + contiguous_appender get_contiguous_appender(size_t len, bool deep=false) { + return contiguous_appender(*this, len, deep); + } + + class contiguous_filler { + friend buffer::list; + char* pos; + + contiguous_filler(char* const pos) : pos(pos) {} + + public: + void advance(const unsigned len) { + pos += len; + } + void copy_in(const unsigned len, const char* const src) { + memcpy(pos, src, len); + advance(len); + } + char* c_str() { + return pos; + } + }; + // The contiguous_filler is supposed to be not costlier than a single + // pointer. Keep it dumb, please. 
+ static_assert(sizeof(contiguous_filler) == sizeof(char*), + "contiguous_filler should be no costlier than pointer"); + + class page_aligned_appender { + bufferlist *pbl; + unsigned min_alloc; + ptr buffer; + char *pos, *end; + + page_aligned_appender(list *l, unsigned min_pages) + : pbl(l), + min_alloc(min_pages * CEPH_PAGE_SIZE), + pos(nullptr), end(nullptr) {} + + friend class list; + + public: + ~page_aligned_appender() { + flush(); + } + + void flush() { + if (pos && pos != buffer.c_str()) { + size_t len = pos - buffer.c_str(); + pbl->append(buffer, 0, len); + buffer.set_length(buffer.length() - len); + buffer.set_offset(buffer.offset() + len); + } + } + + void append(const char *buf, size_t len) { + while (len > 0) { + if (!pos) { + size_t alloc = (len + CEPH_PAGE_SIZE - 1) & CEPH_PAGE_MASK; + if (alloc < min_alloc) { + alloc = min_alloc; + } + buffer = create_page_aligned(alloc); + pos = buffer.c_str(); + end = buffer.end_c_str(); + } + size_t l = len; + if (l > (size_t)(end - pos)) { + l = end - pos; + } + memcpy(pos, buf, l); + pos += l; + buf += l; + len -= l; + if (pos == end) { + pbl->append(buffer, 0, buffer.length()); + pos = end = nullptr; + } + } + } + }; + + page_aligned_appender get_page_aligned_appender(unsigned min_pages=1) { + return page_aligned_appender(this, min_pages); + } + + private: + mutable iterator last_p; + + // always_empty_bptr has no underlying raw but its _len is always 0. + // This is useful for e.g. get_append_buffer_unused_tail_length() as + // it allows to avoid conditionals on hot paths. 
+ static ptr always_empty_bptr; + ptr_node& refill_append_space(const unsigned len); + + public: + // cons/des + list() + : _carriage(&always_empty_bptr), + _len(0), + _memcopy_count(0), + last_p(this) { + } + // cppcheck-suppress noExplicitConstructor + // cppcheck-suppress noExplicitConstructor + list(unsigned prealloc) + : _carriage(&always_empty_bptr), + _len(0), + _memcopy_count(0), + last_p(this) { + reserve(prealloc); + } + + list(const list& other) + : _carriage(&always_empty_bptr), + _len(other._len), + _memcopy_count(other._memcopy_count), + last_p(this) { + _buffers.clone_from(other._buffers); + } + list(list&& other) noexcept; + + ~list() { + _buffers.clear_and_dispose(); + } + + list& operator= (const list& other) { + if (this != &other) { + _carriage = &always_empty_bptr; + _buffers.clone_from(other._buffers); + _len = other._len; + last_p = begin(); + } + return *this; + } + list& operator= (list&& other) noexcept { + _buffers = std::move(other._buffers); + _carriage = other._carriage; + _len = other._len; + _memcopy_count = other._memcopy_count; + last_p = begin(); + other.clear(); + return *this; + } + + uint64_t get_wasted_space() const; + unsigned get_num_buffers() const { return _buffers.size(); } + const ptr_node& front() const { return _buffers.front(); } + const ptr_node& back() const { return _buffers.back(); } + + int get_mempool() const; + void reassign_to_mempool(int pool); + void try_assign_to_mempool(int pool); + + size_t get_append_buffer_unused_tail_length() const { + return _carriage->unused_tail_length(); + } + + unsigned get_memcopy_count() const {return _memcopy_count; } + const buffers_t& buffers() const { return _buffers; } + void swap(list& other) noexcept; + unsigned length() const { +#if 0 + // DEBUG: verify _len + unsigned len = 0; + for (std::list<ptr>::const_iterator it = _buffers.begin(); + it != _buffers.end(); + it++) { + len += (*it).length(); + } +#ifdef __CEPH__ + ceph_assert(len == _len); +#else + assert(len == 
_len); +#endif // __CEPH__ +#endif + return _len; + } + + bool contents_equal(const buffer::list& other) const; + + bool is_provided_buffer(const char *dst) const; + bool is_aligned(unsigned align) const; + bool is_page_aligned() const; + bool is_n_align_sized(unsigned align) const; + bool is_n_page_sized() const; + bool is_aligned_size_and_memory(unsigned align_size, + unsigned align_memory) const; + + bool is_zero() const; + + // modifiers + void clear() noexcept { + _carriage = &always_empty_bptr; + _buffers.clear_and_dispose(); + _len = 0; + _memcopy_count = 0; + last_p = begin(); + } + void push_back(const ptr& bp) { + if (bp.length() == 0) + return; + _buffers.push_back(*ptr_node::create(bp).release()); + _len += bp.length(); + } + void push_back(ptr&& bp) { + if (bp.length() == 0) + return; + _len += bp.length(); + _buffers.push_back(*ptr_node::create(std::move(bp)).release()); + _carriage = &always_empty_bptr; + } + void push_back(const ptr_node&) = delete; + void push_back(ptr_node&) = delete; + void push_back(ptr_node&&) = delete; + void push_back(std::unique_ptr<ptr_node, ptr_node::disposer> bp) { + if (bp->length() == 0) + return; + _carriage = bp.get(); + _len += bp->length(); + _buffers.push_back(*bp.release()); + } + void push_back(raw* const r) { + _buffers.push_back(*ptr_node::create(r).release()); + _carriage = &_buffers.back(); + _len += _buffers.back().length(); + } + void push_back(ceph::unique_leakable_ptr<raw> r) { + push_back(r.release()); + } + + void zero(); + void zero(unsigned o, unsigned l); + + bool is_contiguous() const; + void rebuild(); + void rebuild(std::unique_ptr<ptr_node, ptr_node::disposer> nb); + bool rebuild_aligned(unsigned align); + // max_buffers = 0 mean don't care _buffers.size(), other + // must make _buffers.size() <= max_buffers after rebuilding. 
+ bool rebuild_aligned_size_and_memory(unsigned align_size, + unsigned align_memory, + unsigned max_buffers = 0); + bool rebuild_page_aligned(); + + void reserve(size_t prealloc); + + // assignment-op with move semantics + const static unsigned int CLAIM_DEFAULT = 0; + const static unsigned int CLAIM_ALLOW_NONSHAREABLE = 1; + + void claim(list& bl, unsigned int flags = CLAIM_DEFAULT); + void claim_append(list& bl, unsigned int flags = CLAIM_DEFAULT); + // only for bl is bufferlist::page_aligned_appender + void claim_append_piecewise(list& bl); + + // copy with explicit volatile-sharing semantics + void share(const list& bl) + { + if (this != &bl) { + clear(); + for (const auto& bp : bl._buffers) { + _buffers.push_back(*ptr_node::create(bp).release()); + } + _len = bl._len; + } + } + +#ifdef HAVE_SEASTAR + /// convert the bufferlist into a network packet + operator seastar::net::packet() &&; +#endif + + iterator begin() { + return iterator(this, 0); + } + iterator end() { + return iterator(this, _len, _buffers.end(), 0); + } + + const_iterator begin() const { + return const_iterator(this, 0); + } + const_iterator cbegin() const { + return begin(); + } + const_iterator end() const { + return const_iterator(this, _len, _buffers.end(), 0); + } + + // crope lookalikes. + // **** WARNING: this are horribly inefficient for large bufferlists. 
void append(char c);
void append(const char *data, unsigned len);
/// Append the bytes of @s. Taken by const reference: the string is only
/// read, so there is no reason to copy it first.
void append(const std::string& s) {
  append(s.data(), s.length());
}
#if __cplusplus >= 201703L
// To forcibly disambiguate between string and string_view in the
// case of arrays
/// Appends all N chars of the array; for a string literal N counts the
/// terminating NUL as well.
template<std::size_t N>
void append(const char (&s)[N]) {
  append(s, N);
}
/// Append a NUL-terminated string, without the terminator.
void append(const char* s) {
  append(s, strlen(s));
}
void append(std::string_view s) {
  append(s.data(), s.length());
}
#endif // __cplusplus >= 201703L
*piov) const { +#ifdef __CEPH__ + ceph_assert(_buffers.size() <= IOV_MAX); +#else + assert(_buffers.size() <= IOV_MAX); +#endif + piov->resize(_buffers.size()); + unsigned n = 0; + for (auto& p : _buffers) { + (*piov)[n].iov_base = (void *)p.c_str(); + (*piov)[n].iov_len = p.length(); + ++n; + } + } + uint32_t crc32c(uint32_t crc) const; + void invalidate_crc(); + sha1_digest_t sha1(); + + // These functions return a bufferlist with a pointer to a single + // static buffer. They /must/ not outlive the memory they + // reference. + static list static_from_mem(char* c, size_t l); + static list static_from_cstring(char* c); + static list static_from_string(std::string& s); + }; + +} // inline namespace v14_2_0 + + /* + * efficient hash of one or more bufferlists + */ + + class hash { + uint32_t crc; + + public: + hash() : crc(0) { } + // cppcheck-suppress noExplicitConstructor + hash(uint32_t init) : crc(init) { } + + void update(const buffer::list& bl) { + crc = bl.crc32c(crc); + } + + uint32_t digest() { + return crc; + } + }; + +inline bool operator>(bufferlist& l, bufferlist& r) { + for (unsigned p = 0; ; p++) { + if (l.length() > p && r.length() == p) return true; + if (l.length() == p) return false; + if (l[p] > r[p]) return true; + if (l[p] < r[p]) return false; + } +} +inline bool operator>=(bufferlist& l, bufferlist& r) { + for (unsigned p = 0; ; p++) { + if (l.length() > p && r.length() == p) return true; + if (r.length() == p && l.length() == p) return true; + if (l.length() == p && r.length() > p) return false; + if (l[p] > r[p]) return true; + if (l[p] < r[p]) return false; + } +} + +inline bool operator==(const bufferlist &l, const bufferlist &r) { + if (l.length() != r.length()) + return false; + for (unsigned p = 0; p < l.length(); p++) { + if (l[p] != r[p]) + return false; + } + return true; +} +inline bool operator<(bufferlist& l, bufferlist& r) { + return r > l; +} +inline bool operator<=(bufferlist& l, bufferlist& r) { + return r >= l; +} + + 
  // Abstract base for the memory block behind buffer::ptr. It holds the
  // data pointer and length, keeps the owning mempool's item/byte counters
  // in step with its lifetime and length, and caches the most recently
  // computed crc pair for one (from, to) range.
  class raw {
  public:
    // In the future we might want to have a slab allocator here with few
    // embedded slots. This would allow to avoid the "if" in dtor of ptr_node.
    // Uninitialized storage with the size and alignment of one ptr_node.
    std::aligned_storage<sizeof(ptr_node),
			 alignof(ptr_node)>::type bptr_storage;
    char *data;
    unsigned len;
    // presumably the reference count maintained by buffer::ptr -- it is
    // only initialized (to 0) in this class
    std::atomic<unsigned> nref { 0 };
    // index of the mempool this raw is accounted in
    int mempool;

    // Range the cached crc belongs to; (max, max) means "nothing cached".
    std::pair<size_t, size_t> last_crc_offset {std::numeric_limits<size_t>::max(), std::numeric_limits<size_t>::max()};
    std::pair<uint32_t, uint32_t> last_crc_val;

    // guards last_crc_offset and last_crc_val
    mutable ceph::spinlock crc_spinlock;

    // Both constructors register one item of `len` bytes with the mempool;
    // the first one leaves `data` null, the second one adopts pointer `c`.
    explicit raw(unsigned l, int mempool=mempool::mempool_buffer_anon)
      : data(nullptr), len(l), nref(0), mempool(mempool) {
      mempool::get_pool(mempool::pool_index_t(mempool)).adjust_count(1, len);
    }
    raw(char *c, unsigned l, int mempool=mempool::mempool_buffer_anon)
      : data(c), len(l), nref(0), mempool(mempool) {
      mempool::get_pool(mempool::pool_index_t(mempool)).adjust_count(1, len);
    }
    // Unregisters the item from the mempool. `data` is not freed here.
    virtual ~raw() {
      mempool::get_pool(mempool::pool_index_t(mempool)).adjust_count(
	-1, -(int)len);
    }

    // Change the recorded length, re-registering with the mempool so that
    // its byte count follows the new length.
    void _set_len(unsigned l) {
      mempool::get_pool(mempool::pool_index_t(mempool)).adjust_count(
	-1, -(int)len);
      len = l;
      mempool::get_pool(mempool::pool_index_t(mempool)).adjust_count(1, len);
    }

    // Move the accounting of this raw from its current mempool to `pool`;
    // no-op if it is already accounted there.
    void reassign_to_mempool(int pool) {
      if (pool == mempool) {
	return;
      }
      mempool::get_pool(mempool::pool_index_t(mempool)).adjust_count(
	-1, -(int)len);
      mempool = pool;
      mempool::get_pool(mempool::pool_index_t(pool)).adjust_count(1, len);
    }

    // Reassign only if still accounted in the default (anon) mempool.
    void try_assign_to_mempool(int pool) {
      if (mempool == mempool::mempool_buffer_anon) {
	reassign_to_mempool(pool);
      }
    }

private:
    // no copying.
    // cppcheck-suppress noExplicitConstructor
    raw(const raw &other) = delete;
    const raw& operator=(const raw &other) = delete;
public:
    char *get_data() {
      return data;
    }
    // Implemented by subclasses; clone() copies `len` bytes into the
    // returned raw's data, so the result has to provide at least that much.
    virtual raw* clone_empty() = 0;
    // Deep copy: a new raw from clone_empty() filled with our bytes.
    ceph::unique_leakable_ptr<raw> clone() {
      raw* const c = clone_empty();
      memcpy(c->data, data, len);
      return ceph::unique_leakable_ptr<raw>(c);
    }
    virtual bool is_shareable() const {
      // true if safe to reference/share the existing buffer copy
      // false if it is not safe to share the buffer, e.g., due to special
      // and/or registered memory that is scarce
      return true;
    }
    // Look up the cached crc. Returns true and stores it in *crc only if
    // the cache was recorded for exactly the range `fromto`.
    bool get_crc(const std::pair<size_t, size_t> &fromto,
		 std::pair<uint32_t, uint32_t> *crc) const {
      std::lock_guard lg(crc_spinlock);
      if (last_crc_offset == fromto) {
        *crc = last_crc_val;
        return true;
      }
      return false;
    }
    // Remember `crc` as the cached value for the range `fromto`,
    // replacing whatever was cached before.
    void set_crc(const std::pair<size_t, size_t> &fromto,
		 const std::pair<uint32_t, uint32_t> &crc) {
      std::lock_guard lg(crc_spinlock);
      last_crc_offset = fromto;
      last_crc_val = crc;
    }
    // Drop the cached crc by resetting the range to the (max, max) marker.
    void invalidate_crc() {
      std::lock_guard lg(crc_spinlock);
      last_crc_offset.first = std::numeric_limits<size_t>::max();
      last_crc_offset.second = std::numeric_limits<size_t>::max();
    }
  };
std::enable_if<sizeof(T) == sizeof(uint64_t), T>::type +swab(T val) { + return __builtin_bswap64(val); +} +#else +template<typename T> +inline typename std::enable_if<sizeof(T) == sizeof(uint16_t), T>::type +swab(T val) { + return (val >> 8) | (val << 8); +} +template<typename T> +inline typename std::enable_if<sizeof(T) == sizeof(uint32_t), T>::type +swab(T val) { + return (( val >> 24) | + ((val >> 8) & 0xff00) | + ((val << 8) & 0xff0000) | + ((val << 24))); +} +template<typename T> +inline typename std::enable_if<sizeof(T) == sizeof(uint64_t), T>::type +swab(T val) { + return (( val >> 56) | + ((val >> 40) & 0xff00ull) | + ((val >> 24) & 0xff0000ull) | + ((val >> 8) & 0xff000000ull) | + ((val << 8) & 0xff00000000ull) | + ((val << 24) & 0xff0000000000ull) | + ((val << 40) & 0xff000000000000ull) | + ((val << 56))); +} +#endif + +// mswab == maybe swab (if not LE) +#ifdef CEPH_BIG_ENDIAN +template<typename T> +inline T mswab(T val) { + return swab(val); +} +#else +template<typename T> +inline T mswab(T val) { + return val; +} +#endif + +template<typename T> +struct ceph_le { + T v; + ceph_le<T>& operator=(T nv) { + v = mswab(nv); + return *this; + } + operator T() const { return mswab(v); } +} __attribute__ ((packed)); + +template<typename T> +inline bool operator==(ceph_le<T> a, ceph_le<T> b) { + return a.v == b.v; +} + +using ceph_le64 = ceph_le<__u64>; +using ceph_le32 = ceph_le<__u32>; +using ceph_le16 = ceph_le<__u16>; + +inline ceph_le64 init_le64(__u64 x) { + ceph_le64 v; + v = x; + return v; +} +inline ceph_le32 init_le32(__u32 x) { + ceph_le32 v; + v = x; + return v; +} +inline ceph_le16 init_le16(__u16 x) { + ceph_le16 v; + v = x; + return v; +} + + /* +#define cpu_to_le64(x) (x) +#define cpu_to_le32(x) (x) +#define cpu_to_le16(x) (x) + */ +#define le64_to_cpu(x) ((uint64_t)x) +#define le32_to_cpu(x) ((__u32)x) +#define le16_to_cpu(x) ((__u16)x) diff --git a/src/include/ceph_assert.h b/src/include/ceph_assert.h new file mode 100644 index 
00000000..36d6c430 --- /dev/null +++ b/src/include/ceph_assert.h @@ -0,0 +1,147 @@ +#ifndef CEPH_ASSERT_H +#define CEPH_ASSERT_H + +#include <cstdlib> +#include <string> + +#if defined(__linux__) +#include <features.h> + +#ifndef __STRING +# define __STRING(x) #x +#endif + +#elif defined(__FreeBSD__) +#include <sys/cdefs.h> +#define __GNUC_PREREQ(minor, major) __GNUC_PREREQ__(minor, major) +#elif defined(__sun) || defined(_AIX) +#include "include/compat.h" +#include <assert.h> +#endif + +#ifdef __CEPH__ +# include "acconfig.h" +#endif + +class CephContext; + +namespace ceph { + +struct BackTrace; + +/* + * Select a function-name variable based on compiler tests, and any compiler + * specific overrides. + */ +#if defined(HAVE_PRETTY_FUNC) +# define __CEPH_ASSERT_FUNCTION __PRETTY_FUNCTION__ +#elif defined(HAVE_FUNC) +# define __CEPH_ASSERT_FUNCTION __func__ +#else +# define __CEPH_ASSERT_FUNCTION ((__const char *) 0) +#endif + +extern void register_assert_context(CephContext *cct); + +struct assert_data { + const char *assertion; + const char *file; + const int line; + const char *function; +}; + +extern void __ceph_assert_fail(const char *assertion, const char *file, int line, const char *function) + __attribute__ ((__noreturn__)); +extern void __ceph_assert_fail(const assert_data &ctx) + __attribute__ ((__noreturn__)); + +extern void __ceph_assertf_fail(const char *assertion, const char *file, int line, const char *function, const char* msg, ...) + __attribute__ ((__noreturn__)); +extern void __ceph_assert_warn(const char *assertion, const char *file, int line, const char *function); + +[[noreturn]] void __ceph_abort(const char *file, int line, const char *func, + const std::string& msg); + +[[noreturn]] void __ceph_abortf(const char *file, int line, const char *func, + const char* msg, ...); + +#define _CEPH_ASSERT_VOID_CAST static_cast<void> + +#define assert_warn(expr) \ + ((expr) \ + ? 
/*
 * ceph_abort aborts the program with a nice backtrace.
 *
 * Currently, it's the same as assert(0), but we may one day make assert a
 * debug-only thing, like it is in many projects.
 *
 * Note that the arguments passed to ceph_abort are not used: the message
 * is always the fixed string "abort() called". Use ceph_abort_msg or
 * ceph_abort_msgf to supply a message.
 */
#define ceph_abort(msg, ...)                                            \
  __ceph_abort( __FILE__, __LINE__, __CEPH_ASSERT_FUNCTION, "abort() called")

// Abort with the caller-supplied message.
#define ceph_abort_msg(msg)                                             \
  __ceph_abort( __FILE__, __LINE__, __CEPH_ASSERT_FUNCTION, msg) 

// Abort with a printf-style format string and arguments.
#define ceph_abort_msgf(...)                                             \
  __ceph_abortf( __FILE__, __LINE__, __CEPH_ASSERT_FUNCTION, __VA_ARGS__)

// ceph_assert(expr): call __ceph_assert_fail (declared noreturn) when expr
// is false. When __SANITIZE_ADDRESS__ is defined the failure details are
// passed as separate arguments; otherwise they are collected in a
// function-local static assert_data object and passed by reference.
#ifdef __SANITIZE_ADDRESS__
#define ceph_assert(expr)                           \
  do {                                              \
    ((expr))                                        \
    ? _CEPH_ASSERT_VOID_CAST (0)                    \
    : __ceph_assert_fail(__STRING(expr), __FILE__, __LINE__, __CEPH_ASSERT_FUNCTION); \
  } while (false)
#else
#define ceph_assert(expr)							\
  do { static const ceph::assert_data assert_data_ctx = \
   {__STRING(expr), __FILE__, __LINE__, __CEPH_ASSERT_FUNCTION}; \
   ((expr) \
   ? _CEPH_ASSERT_VOID_CAST (0) \
   : __ceph_assert_fail(assert_data_ctx)); } while(false)
#endif

// this variant will *never* get compiled out to NDEBUG in the future.
// (ceph_assert currently doesn't either, but in the future it might.)
// Its definition is currently the same as that of ceph_assert above.
#ifdef __SANITIZE_ADDRESS__
#define ceph_assert_always(expr)                    \
  do {                                              \
    ((expr))                                        \
    ? _CEPH_ASSERT_VOID_CAST (0)                    \
    : __ceph_assert_fail(__STRING(expr), __FILE__, __LINE__, __CEPH_ASSERT_FUNCTION); \
  } while(false)
#else
#define ceph_assert_always(expr)							\
  do { static const ceph::assert_data assert_data_ctx = \
   {__STRING(expr), __FILE__, __LINE__, __CEPH_ASSERT_FUNCTION}; \
   ((expr) \
   ? _CEPH_ASSERT_VOID_CAST (0) \
   : __ceph_assert_fail(assert_data_ctx)); } while(false)
#endif
_CEPH_ASSERT_VOID_CAST (0) \ + : __ceph_assertf_fail (__STRING(expr), __FILE__, __LINE__, __CEPH_ASSERT_FUNCTION, __VA_ARGS__)) +#define ceph_assertf(expr, ...) \ + ((expr) \ + ? _CEPH_ASSERT_VOID_CAST (0) \ + : __ceph_assertf_fail (__STRING(expr), __FILE__, __LINE__, __CEPH_ASSERT_FUNCTION, __VA_ARGS__)) + +// this variant will *never* get compiled out to NDEBUG in the future. +// (ceph_assertf currently doesn't either, but in the future it might.) +#define ceph_assertf_always(expr, ...) \ + ((expr) \ + ? _CEPH_ASSERT_VOID_CAST (0) \ + : __ceph_assertf_fail (__STRING(expr), __FILE__, __LINE__, __CEPH_ASSERT_FUNCTION, __VA_ARGS__)) + +#endif diff --git a/src/include/ceph_features.h b/src/include/ceph_features.h new file mode 100644 index 00000000..6fec3a0c --- /dev/null +++ b/src/include/ceph_features.h @@ -0,0 +1,279 @@ +#ifndef __CEPH_FEATURES +#define __CEPH_FEATURES + +#include "sys/types.h" + +/* + * Each time we reclaim bits for reuse we need to specify another + * bitmask that, if all bits are set, indicates we have the new + * incarnation of that feature. 
Base case is 1 (first use) + */ +#define CEPH_FEATURE_INCARNATION_1 (0ull) +#define CEPH_FEATURE_INCARNATION_2 (1ull<<57) // SERVER_JEWEL +#define CEPH_FEATURE_INCARNATION_3 ((1ull<<57)|(1ull<<28)) // SERVER_MIMIC + +#define DEFINE_CEPH_FEATURE(bit, incarnation, name) \ + const static uint64_t CEPH_FEATURE_##name = (1ULL<<bit); \ + const static uint64_t CEPH_FEATUREMASK_##name = \ + (1ULL<<bit | CEPH_FEATURE_INCARNATION_##incarnation); + +// this bit is ignored but still advertised by release *when* +#define DEFINE_CEPH_FEATURE_DEPRECATED(bit, incarnation, name, when) \ + const static uint64_t DEPRECATED_CEPH_FEATURE_##name = (1ULL<<bit); \ + const static uint64_t DEPRECATED_CEPH_FEATUREMASK_##name = \ + (1ULL<<bit | CEPH_FEATURE_INCARNATION_##incarnation); + +// this bit is ignored by release *unused* and not advertised by +// release *unadvertised* +#define DEFINE_CEPH_FEATURE_RETIRED(bit, inc, name, unused, unadvertised) + + +// test for a feature. this test is safer than a typical mask against +// the bit because it ensures that we have the bit AND the marker for the +// bit's incarnation. this must be used in any case where the features +// bits may include an old meaning of the bit. +#define HAVE_FEATURE(x, name) \ + (((x) & (CEPH_FEATUREMASK_##name)) == (CEPH_FEATUREMASK_##name)) + + +/* + * Notes on deprecation: + * + * A *major* release is a release through which all upgrades must pass + * (e.g., jewel). For example, no pre-jewel server will ever talk to + * a post-jewel server (mon, osd, etc). + * + * For feature bits used *only* on the server-side: + * + * - In the first phase we indicate that a feature is DEPRECATED as of + * a particular release. This is the first major release X (say, + * jewel) that does not depend on its peers advertising the feature. + * That is, it safely assumes its peers all have the feature. We + * indicate this with the DEPRECATED macro. 
For example, + * + * DEFINE_CEPH_FEATURE_DEPRECATED( 2, 1, MONCLOCKCHECK, JEWEL) + * + * because 10.2.z (jewel) did not care if its peers advertised this + * feature bit. + * + * - In the second phase we stop advertising the bit and call it + * RETIRED. This can normally be done in the *next* major release + * following the one in which we marked the feature DEPRECATED. In + * the above example, for 12.0.z (luminous) we can say: + * + * DEFINE_CEPH_FEATURE_RETIRED( 2, 1, MONCLOCKCHECK, JEWEL, LUMINOUS) + * + * - The bit can be reused in the first post-luminous release, 13.0.z + * (m). + * + * This ensures that no two versions that have different meanings for + * the bit ever speak to each other. + */ + +/* + * Notes on the kernel client: + * + * - "X" means that the feature bit has been advertised and supported + * since kernel X + * + * - "X req" means that the feature bit has been advertised and required + * since kernel X + * + * The remaining feature bits are not and have never been used by the + * kernel client. 
+ */ + +/* Feature bit table. Arguments are (bit, incarnation, name); RETIRED entries add the release that ignores the bit and the release that no longer advertises it. */ +DEFINE_CEPH_FEATURE( 0, 1, UID) +DEFINE_CEPH_FEATURE( 1, 1, NOSRCADDR) // 2.6.35 req +DEFINE_CEPH_FEATURE_RETIRED( 2, 1, MONCLOCKCHECK, JEWEL, LUMINOUS) +DEFINE_CEPH_FEATURE( 2, 3, SERVER_NAUTILUS) +DEFINE_CEPH_FEATURE( 3, 1, FLOCK) // 2.6.36 +DEFINE_CEPH_FEATURE( 4, 1, SUBSCRIBE2) // 4.6 req +DEFINE_CEPH_FEATURE( 5, 1, MONNAMES) +DEFINE_CEPH_FEATURE( 6, 1, RECONNECT_SEQ) // 3.10 req +DEFINE_CEPH_FEATURE( 7, 1, DIRLAYOUTHASH) // 2.6.38 +DEFINE_CEPH_FEATURE( 8, 1, OBJECTLOCATOR) +DEFINE_CEPH_FEATURE( 9, 1, PGID64) // 3.9 req +DEFINE_CEPH_FEATURE(10, 1, INCSUBOSDMAP) +DEFINE_CEPH_FEATURE(11, 1, PGPOOL3) // 3.9 req +DEFINE_CEPH_FEATURE(12, 1, OSDREPLYMUX) +DEFINE_CEPH_FEATURE(13, 1, OSDENC) // 3.9 req +DEFINE_CEPH_FEATURE_RETIRED(14, 1, OMAP, HAMMER, JEWEL) +DEFINE_CEPH_FEATURE(14, 2, SERVER_KRAKEN) +DEFINE_CEPH_FEATURE(15, 1, MONENC) +DEFINE_CEPH_FEATURE_RETIRED(16, 1, QUERY_T, JEWEL, LUMINOUS) +DEFINE_CEPH_FEATURE(16, 3, SERVER_O) +DEFINE_CEPH_FEATURE_RETIRED(17, 1, INDEP_PG_MAP, JEWEL, LUMINOUS) +DEFINE_CEPH_FEATURE(17, 3, OS_PERF_STAT_NS) +DEFINE_CEPH_FEATURE(18, 1, CRUSH_TUNABLES) // 3.6 +DEFINE_CEPH_FEATURE_RETIRED(19, 1, CHUNKY_SCRUB, JEWEL, LUMINOUS) +DEFINE_CEPH_FEATURE(19, 2, OSD_PGLOG_HARDLIMIT) + +DEFINE_CEPH_FEATURE_RETIRED(20, 1, MON_NULLROUTE, JEWEL, LUMINOUS) + +DEFINE_CEPH_FEATURE_RETIRED(21, 1, MON_GV, HAMMER, JEWEL) +DEFINE_CEPH_FEATURE(21, 2, SERVER_LUMINOUS) // 4.13 +DEFINE_CEPH_FEATURE(21, 2, RESEND_ON_SPLIT) // overlap +DEFINE_CEPH_FEATURE(21, 2, RADOS_BACKOFF) // overlap +DEFINE_CEPH_FEATURE(21, 2, OSDMAP_PG_UPMAP) // overlap +DEFINE_CEPH_FEATURE(21, 2, CRUSH_CHOOSE_ARGS) // overlap +DEFINE_CEPH_FEATURE_RETIRED(22, 1, BACKFILL_RESERVATION, JEWEL, LUMINOUS) +DEFINE_CEPH_FEATURE(22, 2, OSD_FIXED_COLLECTION_LIST) +DEFINE_CEPH_FEATURE(23, 1, MSG_AUTH) // 3.19 req (unless nocephx_require_signatures) +DEFINE_CEPH_FEATURE_RETIRED(24, 1, RECOVERY_RESERVATION, JEWEL, LUMINOUS) +DEFINE_CEPH_FEATURE(24, 2, RECOVERY_RESERVATION_2) 
+DEFINE_CEPH_FEATURE(25, 1, CRUSH_TUNABLES2) // 3.9 +DEFINE_CEPH_FEATURE(26, 1, CREATEPOOLID) +DEFINE_CEPH_FEATURE(27, 1, REPLY_CREATE_INODE) // 3.9 +DEFINE_CEPH_FEATURE_RETIRED(28, 1, OSD_HBMSGS, HAMMER, JEWEL) +DEFINE_CEPH_FEATURE(28, 2, SERVER_MIMIC) +DEFINE_CEPH_FEATURE(29, 1, MDSENC) // 4.7 +DEFINE_CEPH_FEATURE(30, 1, OSDHASHPSPOOL) // 3.9 +DEFINE_CEPH_FEATURE_DEPRECATED(31, 1, MON_SINGLE_PAXOS, NAUTILUS) +DEFINE_CEPH_FEATURE_RETIRED(32, 1, OSD_SNAPMAPPER, JEWEL, LUMINOUS) + +DEFINE_CEPH_FEATURE_RETIRED(33, 1, MON_SCRUB, JEWEL, LUMINOUS) + +DEFINE_CEPH_FEATURE_RETIRED(34, 1, OSD_PACKED_RECOVERY, JEWEL, LUMINOUS) + +DEFINE_CEPH_FEATURE(35, 1, OSD_CACHEPOOL) // 3.14 +DEFINE_CEPH_FEATURE(36, 1, CRUSH_V2) // 3.14 +DEFINE_CEPH_FEATURE(37, 1, EXPORT_PEER) // 3.14 +DEFINE_CEPH_FEATURE_DEPRECATED(38, 1, OSD_ERASURE_CODES, MIMIC) +DEFINE_CEPH_FEATURE(39, 1, OSDMAP_ENC) // 3.15 +DEFINE_CEPH_FEATURE(40, 1, MDS_INLINE_DATA) // 3.19 +DEFINE_CEPH_FEATURE(41, 1, CRUSH_TUNABLES3) // 3.15 +DEFINE_CEPH_FEATURE(41, 1, OSD_PRIMARY_AFFINITY) // overlap +DEFINE_CEPH_FEATURE(42, 1, MSGR_KEEPALIVE2) // 4.3 (for consistency) +DEFINE_CEPH_FEATURE(43, 1, OSD_POOLRESEND) // 4.13 +DEFINE_CEPH_FEATURE_DEPRECATED(44, 1, ERASURE_CODE_PLUGINS_V2, MIMIC) +DEFINE_CEPH_FEATURE_RETIRED(45, 1, OSD_SET_ALLOC_HINT, JEWEL, LUMINOUS) + +DEFINE_CEPH_FEATURE(46, 1, OSD_FADVISE_FLAGS) +DEFINE_CEPH_FEATURE_RETIRED(46, 1, OSD_REPOP, JEWEL, LUMINOUS) // overlap +DEFINE_CEPH_FEATURE_RETIRED(46, 1, OSD_OBJECT_DIGEST, JEWEL, LUMINOUS) // overlap +DEFINE_CEPH_FEATURE_RETIRED(46, 1, OSD_TRANSACTION_MAY_LAYOUT, JEWEL, LUMINOUS) // overlap + +DEFINE_CEPH_FEATURE(47, 1, MDS_QUOTA) // 4.17 +DEFINE_CEPH_FEATURE(48, 1, CRUSH_V4) // 4.1 +DEFINE_CEPH_FEATURE_RETIRED(49, 1, OSD_MIN_SIZE_RECOVERY, JEWEL, LUMINOUS) +DEFINE_CEPH_FEATURE_RETIRED(49, 1, OSD_PROXY_FEATURES, JEWEL, LUMINOUS) // overlap + +DEFINE_CEPH_FEATURE_DEPRECATED(50, 1, MON_METADATA, MIMIC) +DEFINE_CEPH_FEATURE_DEPRECATED(51, 1, OSD_BITWISE_HOBJ_SORT, 
MIMIC) +DEFINE_CEPH_FEATURE_DEPRECATED(52, 1, OSD_PROXY_WRITE_FEATURES, MIMIC) +DEFINE_CEPH_FEATURE_DEPRECATED(53, 1, ERASURE_CODE_PLUGINS_V3, MIMIC) +DEFINE_CEPH_FEATURE_DEPRECATED(54, 1, OSD_HITSET_GMT, MIMIC) +DEFINE_CEPH_FEATURE_DEPRECATED(55, 1, HAMMER_0_94_4, MIMIC) +DEFINE_CEPH_FEATURE(56, 1, NEW_OSDOP_ENCODING) // 4.13 (for pg_pool_t >= v25) +DEFINE_CEPH_FEATURE(57, 1, MON_STATEFUL_SUB) // 4.13 +DEFINE_CEPH_FEATURE_DEPRECATED(57, 1, MON_ROUTE_OSDMAP, MIMIC) // overlap +DEFINE_CEPH_FEATURE(57, 1, SERVER_JEWEL) // overlap +DEFINE_CEPH_FEATURE(58, 1, CRUSH_TUNABLES5) // 4.5 +DEFINE_CEPH_FEATURE(58, 1, NEW_OSDOPREPLY_ENCODING) // overlap +DEFINE_CEPH_FEATURE(58, 1, FS_FILE_LAYOUT_V2) // overlap +DEFINE_CEPH_FEATURE(59, 1, FS_BTIME) +DEFINE_CEPH_FEATURE(59, 1, FS_CHANGE_ATTR) // overlap +DEFINE_CEPH_FEATURE(59, 1, MSG_ADDR2) // overlap +DEFINE_CEPH_FEATURE(60, 1, OSD_RECOVERY_DELETES) // *do not share this bit* +DEFINE_CEPH_FEATURE(61, 1, CEPHX_V2) // 4.19, *do not share this bit* + +DEFINE_CEPH_FEATURE(62, 1, RESERVED) // do not use; used as a sentinel +DEFINE_CEPH_FEATURE_DEPRECATED(63, 1, RESERVED_BROKEN, LUMINOUS) // client-facing + + +/* + * Features supported. Should be everything above. 
+ */ +#define CEPH_FEATURES_ALL \ + (CEPH_FEATURE_UID | \ + CEPH_FEATURE_NOSRCADDR | \ + CEPH_FEATURE_FLOCK | \ + CEPH_FEATURE_SUBSCRIBE2 | \ + CEPH_FEATURE_MONNAMES | \ + CEPH_FEATURE_RECONNECT_SEQ | \ + CEPH_FEATURE_DIRLAYOUTHASH | \ + CEPH_FEATURE_OBJECTLOCATOR | \ + CEPH_FEATURE_PGID64 | \ + CEPH_FEATURE_INCSUBOSDMAP | \ + CEPH_FEATURE_PGPOOL3 | \ + CEPH_FEATURE_OSDREPLYMUX | \ + CEPH_FEATURE_OSDENC | \ + CEPH_FEATURE_MONENC | \ + CEPH_FEATURE_CRUSH_TUNABLES | \ + CEPH_FEATURE_MSG_AUTH | \ + CEPH_FEATURE_CRUSH_TUNABLES2 | \ + CEPH_FEATURE_CREATEPOOLID | \ + CEPH_FEATURE_REPLY_CREATE_INODE | \ + CEPH_FEATURE_MDSENC | \ + CEPH_FEATURE_OSDHASHPSPOOL | \ + CEPH_FEATURE_NEW_OSDOP_ENCODING | \ + CEPH_FEATURE_NEW_OSDOPREPLY_ENCODING | \ + DEPRECATED_CEPH_FEATURE_MON_SINGLE_PAXOS | \ + CEPH_FEATURE_OSD_CACHEPOOL | \ + CEPH_FEATURE_CRUSH_V2 | \ + CEPH_FEATURE_EXPORT_PEER | \ + DEPRECATED_CEPH_FEATURE_OSD_ERASURE_CODES | \ + CEPH_FEATURE_OSDMAP_ENC | \ + CEPH_FEATURE_MDS_INLINE_DATA | \ + CEPH_FEATURE_CRUSH_TUNABLES3 | \ + CEPH_FEATURE_OSD_PRIMARY_AFFINITY | \ + CEPH_FEATURE_MSGR_KEEPALIVE2 | \ + CEPH_FEATURE_OSD_POOLRESEND | \ + DEPRECATED_CEPH_FEATURE_ERASURE_CODE_PLUGINS_V2 | \ + CEPH_FEATURE_OSD_FADVISE_FLAGS | \ + CEPH_FEATURE_MDS_QUOTA | \ + CEPH_FEATURE_CRUSH_V4 | \ + DEPRECATED_CEPH_FEATURE_MON_METADATA | \ + DEPRECATED_CEPH_FEATURE_OSD_BITWISE_HOBJ_SORT | \ + DEPRECATED_CEPH_FEATURE_ERASURE_CODE_PLUGINS_V3 | \ + DEPRECATED_CEPH_FEATURE_OSD_PROXY_WRITE_FEATURES | \ + DEPRECATED_CEPH_FEATURE_OSD_HITSET_GMT | \ + DEPRECATED_CEPH_FEATURE_HAMMER_0_94_4 | \ + CEPH_FEATURE_MON_STATEFUL_SUB | \ + DEPRECATED_CEPH_FEATURE_MON_ROUTE_OSDMAP | \ + CEPH_FEATURE_CRUSH_TUNABLES5 | \ + CEPH_FEATURE_SERVER_JEWEL | \ + CEPH_FEATURE_FS_FILE_LAYOUT_V2 | \ + CEPH_FEATURE_SERVER_KRAKEN | \ + CEPH_FEATURE_FS_BTIME | \ + CEPH_FEATURE_FS_CHANGE_ATTR | \ + CEPH_FEATURE_MSG_ADDR2 | \ + CEPH_FEATURE_SERVER_LUMINOUS | \ + CEPH_FEATURE_RESEND_ON_SPLIT | \ + CEPH_FEATURE_RADOS_BACKOFF | \ + 
CEPH_FEATURE_OSD_RECOVERY_DELETES | \ + CEPH_FEATURE_SERVER_MIMIC | \ + CEPH_FEATURE_RECOVERY_RESERVATION_2 | \ + CEPH_FEATURE_SERVER_NAUTILUS | \ + CEPH_FEATURE_CEPHX_V2 | \ + CEPH_FEATURE_OSD_PGLOG_HARDLIMIT | \ + CEPH_FEATURE_OSD_FIXED_COLLECTION_LIST | \ + 0ULL) + +#define CEPH_FEATURES_SUPPORTED_DEFAULT CEPH_FEATURES_ALL + +/* + * crush related features + */ +#define CEPH_FEATURES_CRUSH \ + (CEPH_FEATURE_CRUSH_TUNABLES | \ + CEPH_FEATURE_CRUSH_TUNABLES2 | \ + CEPH_FEATURE_CRUSH_TUNABLES3 | \ + CEPH_FEATURE_CRUSH_TUNABLES5 | \ + CEPH_FEATURE_CRUSH_V2 | \ + CEPH_FEATURE_CRUSH_V4 | \ + CEPH_FEATUREMASK_CRUSH_CHOOSE_ARGS) + +/* + * make sure we don't try to use the reserved features + */ +#define CEPH_STATIC_ASSERT(x) (void)(sizeof(int[((x)==0) ? -1 : 0])) + +static inline void ____build_time_check_for_reserved_bits(void) { + CEPH_STATIC_ASSERT((CEPH_FEATURES_ALL & + (CEPH_FEATURE_RESERVED | + DEPRECATED_CEPH_FEATURE_RESERVED_BROKEN)) == 0); +} + +#endif diff --git a/src/include/ceph_frag.h b/src/include/ceph_frag.h new file mode 100644 index 00000000..5babb8e9 --- /dev/null +++ b/src/include/ceph_frag.h @@ -0,0 +1,109 @@ +#ifndef FS_CEPH_FRAG_H +#define FS_CEPH_FRAG_H + +/* + * "Frags" are a way to describe a subset of a 32-bit number space, + * using a mask and a value to match against that mask. Any given frag + * (subset of the number space) can be partitioned into 2^n sub-frags. + * + * Frags are encoded into a 32-bit word: + * 8 upper bits = "bits" + * 24 lower bits = "value" + * (We could go to 5+27 bits, but who cares.) + * + * We use the _most_ significant bits of the 24 bit value. This makes + * values logically sort. + * + * Unfortunately, because the "bits" field is still in the high bits, we + * can't sort encoded frags numerically. However, it does allow you + * to feed encoded frags as values into frag_contains_value. 
+ */ +/* pack bit count b into the top 8 bits; keep only the b most-significant bits of the 24-bit value v */ +static inline __u32 ceph_frag_make(__u32 b, __u32 v) +{ + return (b << 24) | + (v & (0xffffffu << (24-b)) & 0xffffffu); +} +static inline __u32 ceph_frag_bits(__u32 f) +{ + return f >> 24; +} +static inline __u32 ceph_frag_value(__u32 f) +{ + return f & 0xffffffu; +} +static inline __u32 ceph_frag_mask(__u32 f) +{ + return (0xffffffu << (24-ceph_frag_bits(f))) & 0xffffffu; +} +static inline __u32 ceph_frag_mask_shift(__u32 f) +{ + return 24 - ceph_frag_bits(f); +} + +static inline int ceph_frag_contains_value(__u32 f, __u32 v) +{ + return (v & ceph_frag_mask(f)) == ceph_frag_value(f); +} +static inline int ceph_frag_contains_frag(__u32 f, __u32 sub) +{ + /* is sub as specific as us, and contained by us? */ + return ceph_frag_bits(sub) >= ceph_frag_bits(f) && + (ceph_frag_value(sub) & ceph_frag_mask(f)) == ceph_frag_value(f); +} + +static inline __u32 ceph_frag_parent(__u32 f) +{ + return ceph_frag_make(ceph_frag_bits(f) - 1, + ceph_frag_value(f) & (ceph_frag_mask(f) << 1)); +} +/* left child: the lowest bit selected by the mask, 0x1000000 >> bits, is clear */ +static inline int ceph_frag_is_left_child(__u32 f) +{ + return ceph_frag_bits(f) > 0 && + (ceph_frag_value(f) & (0x1000000 >> ceph_frag_bits(f))) == 0; +} +/* right child: that same bit is set. The masked result is the bit itself (not 1), so test for non-zero. */ +static inline int ceph_frag_is_right_child(__u32 f) +{ + return ceph_frag_bits(f) > 0 && + (ceph_frag_value(f) & (0x1000000 >> ceph_frag_bits(f))) != 0; +} +static inline __u32 ceph_frag_sibling(__u32 f) +{ + return ceph_frag_make(ceph_frag_bits(f), + ceph_frag_value(f) ^ (0x1000000 >> ceph_frag_bits(f))); +} +static inline __u32 ceph_frag_left_child(__u32 f) +{ + return ceph_frag_make(ceph_frag_bits(f)+1, ceph_frag_value(f)); +} +static inline __u32 ceph_frag_right_child(__u32 f) +{ + return ceph_frag_make(ceph_frag_bits(f)+1, + ceph_frag_value(f) | (0x1000000 >> (1+ceph_frag_bits(f)))); +} +static inline __u32 ceph_frag_make_child(__u32 f, int by, int i) +{ + int newbits = ceph_frag_bits(f) + by; + return ceph_frag_make(newbits, + ceph_frag_value(f) | (i << (24 - newbits))); +} +static inline int ceph_frag_is_leftmost(__u32 f) +{ 
+ return ceph_frag_value(f) == 0; +} +static inline int ceph_frag_is_rightmost(__u32 f) +{ + return ceph_frag_value(f) == ceph_frag_mask(f); +} +static inline __u32 ceph_frag_next(__u32 f) +{ + return ceph_frag_make(ceph_frag_bits(f), + ceph_frag_value(f) + (0x1000000 >> ceph_frag_bits(f))); +} + +/* + * comparator to sort frags logically, as when traversing the + * number space in ascending order... + */ +int ceph_frag_compare(__u32 a, __u32 b); + +#endif diff --git a/src/include/ceph_fs.h b/src/include/ceph_fs.h new file mode 100644 index 00000000..1c73ff37 --- /dev/null +++ b/src/include/ceph_fs.h @@ -0,0 +1,982 @@ +/* + * ceph_fs.h - Ceph constants and data types to share between kernel and + * user space. + * + * Most types in this file are defined as little-endian, and are + * primarily intended to describe data structures that pass over the + * wire or that are stored on disk. + * + * LGPL2.1 + */ + +#ifndef CEPH_FS_H +#define CEPH_FS_H + +#include "msgr.h" +#include "rados.h" + +/* + * The data structures defined here are shared between Linux kernel and + * user space. Also, those data structures are maintained always in + * little-endian byte order, even on big-endian systems. This is handled + * differently in kernel vs. user space. For use as kernel headers, the + * little-endian fields need to use the __le16/__le32/__le64 types. These + * are markers that indicate endian conversion routines must be used + * whenever such fields are accessed, which can be verified by checker + * tools like "sparse". For use as user-space headers, the little-endian + * fields instead use types ceph_le16/ceph_le32/ceph_le64, which are C++ + * classes that implement automatic endian conversion on every access. + * To still allow for header sharing, this file uses the __le types, but + * redefines those to the ceph_ types when compiled in user space. 
+ */ +#ifndef __KERNEL__ +#include "byteorder.h" +#define __le16 ceph_le16 +#define __le32 ceph_le32 +#define __le64 ceph_le64 +#endif + +/* + * subprotocol versions. when specific messages types or high-level + * protocols change, bump the affected components. we keep rev + * internal cluster protocols separately from the public, + * client-facing protocol. + */ +#define CEPH_OSDC_PROTOCOL 24 /* server/client */ +#define CEPH_MDSC_PROTOCOL 32 /* server/client */ +#define CEPH_MONC_PROTOCOL 15 /* server/client */ + + +#define CEPH_INO_ROOT 1 +#define CEPH_INO_CEPH 2 /* hidden .ceph dir */ +#define CEPH_INO_LOST_AND_FOUND 4 /* reserved ino for use in recovery */ + +/* arbitrary limit on max # of monitors (cluster of 3 is typical) */ +#define CEPH_MAX_MON 31 + +/* + * ceph_file_layout - describe data layout for a file/inode + */ +struct ceph_file_layout { + /* file -> object mapping */ + __le32 fl_stripe_unit; /* stripe unit, in bytes. must be multiple + of page size. */ + __le32 fl_stripe_count; /* over this many objects */ + __le32 fl_object_size; /* until objects are this big, then move to + new objects */ + __le32 fl_cas_hash; /* UNUSED. 0 = none; 1 = sha256 */ + + /* pg -> disk layout */ + __le32 fl_object_stripe_unit; /* UNUSED. 
for per-object parity, if any */ + + /* object -> pg layout */ + __le32 fl_unused; /* unused; used to be preferred primary for pg (-1 for none) */ + __le32 fl_pg_pool; /* namespace, crush ruleset, rep level */ +} __attribute__ ((packed)); + +#define CEPH_MIN_STRIPE_UNIT 65536 + +struct ceph_dir_layout { + __u8 dl_dir_hash; /* see ceph_hash.h for ids */ + __u8 dl_unused1; + __u16 dl_unused2; + __u32 dl_unused3; +} __attribute__ ((packed)); + +/* crypto algorithms */ +#define CEPH_CRYPTO_NONE 0x0 +#define CEPH_CRYPTO_AES 0x1 + +#define CEPH_AES_IV "cephsageyudagreg" + +/* security/authentication protocols */ +#define CEPH_AUTH_UNKNOWN 0x0 +#define CEPH_AUTH_NONE 0x1 +#define CEPH_AUTH_CEPHX 0x2 + +/* msgr2 protocol modes */ +#define CEPH_CON_MODE_UNKNOWN 0x0 +#define CEPH_CON_MODE_CRC 0x1 +#define CEPH_CON_MODE_SECURE 0x2 + +extern const char *ceph_con_mode_name(int con_mode); + +/* For options with "_", like: GSS_GSS + which means: Mode/Protocol to validate "authentication_authorization", + where: + - Authentication: Verifying the identity of an entity. + - Authorization: Verifying that an authenticated entity has + the right to access a particular resource. 
+*/ +#define CEPH_AUTH_GSS 0x4 +#define CEPH_AUTH_GSS_GSS CEPH_AUTH_GSS + +#define CEPH_AUTH_UID_DEFAULT ((__u64) -1) + + +/********************************************* + * message layer + */ + +/* + * message types + */ + +/* misc */ +#define CEPH_MSG_SHUTDOWN 1 +#define CEPH_MSG_PING 2 + +/* client <-> monitor */ +#define CEPH_MSG_MON_MAP 4 +#define CEPH_MSG_MON_GET_MAP 5 +#define CEPH_MSG_MON_GET_OSDMAP 6 +#define CEPH_MSG_MON_METADATA 7 +#define CEPH_MSG_STATFS 13 +#define CEPH_MSG_STATFS_REPLY 14 +#define CEPH_MSG_MON_SUBSCRIBE 15 +#define CEPH_MSG_MON_SUBSCRIBE_ACK 16 +#define CEPH_MSG_AUTH 17 +#define CEPH_MSG_AUTH_REPLY 18 +#define CEPH_MSG_MON_GET_VERSION 19 +#define CEPH_MSG_MON_GET_VERSION_REPLY 20 + +/* client <-> mds */ +#define CEPH_MSG_MDS_MAP 21 + +#define CEPH_MSG_CLIENT_SESSION 22 +#define CEPH_MSG_CLIENT_RECONNECT 23 + +#define CEPH_MSG_CLIENT_REQUEST 24 +#define CEPH_MSG_CLIENT_REQUEST_FORWARD 25 +#define CEPH_MSG_CLIENT_REPLY 26 +#define CEPH_MSG_CLIENT_RECLAIM 27 +#define CEPH_MSG_CLIENT_RECLAIM_REPLY 28 +#define CEPH_MSG_CLIENT_CAPS 0x310 +#define CEPH_MSG_CLIENT_LEASE 0x311 +#define CEPH_MSG_CLIENT_SNAP 0x312 +#define CEPH_MSG_CLIENT_CAPRELEASE 0x313 +#define CEPH_MSG_CLIENT_QUOTA 0x314 + +/* pool ops */ +#define CEPH_MSG_POOLOP_REPLY 48 +#define CEPH_MSG_POOLOP 49 + + +/* osd */ +#define CEPH_MSG_OSD_MAP 41 +#define CEPH_MSG_OSD_OP 42 +#define CEPH_MSG_OSD_OPREPLY 43 +#define CEPH_MSG_WATCH_NOTIFY 44 +#define CEPH_MSG_OSD_BACKOFF 61 + +/* FSMap subscribers (see all MDS clusters at once) */ +#define CEPH_MSG_FS_MAP 45 +/* FSMapUser subscribers (get MDS clusters name->ID mapping) */ +#define CEPH_MSG_FS_MAP_USER 103 + +/* watch-notify operations */ +enum { + CEPH_WATCH_EVENT_NOTIFY = 1, /* notifying watcher */ + CEPH_WATCH_EVENT_NOTIFY_COMPLETE = 2, /* notifier notified when done */ + CEPH_WATCH_EVENT_DISCONNECT = 3, /* we were disconnected */ +}; + +const char *ceph_watch_event_name(int o); + +/* pool operations */ +enum { + POOL_OP_CREATE 
= 0x01, + POOL_OP_DELETE = 0x02, + POOL_OP_AUID_CHANGE = 0x03, + POOL_OP_CREATE_SNAP = 0x11, + POOL_OP_DELETE_SNAP = 0x12, + POOL_OP_CREATE_UNMANAGED_SNAP = 0x21, + POOL_OP_DELETE_UNMANAGED_SNAP = 0x22, +}; + +struct ceph_mon_request_header { + __le64 have_version; + __le16 session_mon; + __le64 session_mon_tid; +} __attribute__ ((packed)); + +struct ceph_mon_statfs { + struct ceph_mon_request_header monhdr; + struct ceph_fsid fsid; +} __attribute__ ((packed)); + +struct ceph_statfs { + __le64 kb, kb_used, kb_avail; + __le64 num_objects; +} __attribute__ ((packed)); + +struct ceph_mon_statfs_reply { + struct ceph_fsid fsid; + __le64 version; + struct ceph_statfs st; +} __attribute__ ((packed)); + +const char *ceph_pool_op_name(int op); + +struct ceph_mon_poolop { + struct ceph_mon_request_header monhdr; + struct ceph_fsid fsid; + __le32 pool; + __le32 op; + __le64 __old_auid; // obsolete + __le64 snapid; + __le32 name_len; +} __attribute__ ((packed)); + +struct ceph_mon_poolop_reply { + struct ceph_mon_request_header monhdr; + struct ceph_fsid fsid; + __le32 reply_code; + __le32 epoch; + char has_data; + char data[0]; +} __attribute__ ((packed)); + +struct ceph_mon_unmanaged_snap { + __le64 snapid; +} __attribute__ ((packed)); + +struct ceph_osd_getmap { + struct ceph_mon_request_header monhdr; + struct ceph_fsid fsid; + __le32 start; +} __attribute__ ((packed)); + +struct ceph_mds_getmap { + struct ceph_mon_request_header monhdr; + struct ceph_fsid fsid; +} __attribute__ ((packed)); + +struct ceph_client_mount { + struct ceph_mon_request_header monhdr; +} __attribute__ ((packed)); + +#define CEPH_SUBSCRIBE_ONETIME 1 /* i want only 1 update after have */ + +struct ceph_mon_subscribe_item { + __le64 start; + __u8 flags; +} __attribute__ ((packed)); + +struct ceph_mon_subscribe_ack { + __le32 duration; /* seconds */ + struct ceph_fsid fsid; +} __attribute__ ((packed)); + +/* + * mdsmap flags + */ +#define CEPH_MDSMAP_NOT_JOINABLE (1<<0) /* standbys cannot join */ 
+#define CEPH_MDSMAP_DOWN (CEPH_MDSMAP_NOT_JOINABLE) /* backwards compat */ +#define CEPH_MDSMAP_ALLOW_SNAPS (1<<1) /* cluster allowed to create snapshots */ +/* deprecated #define CEPH_MDSMAP_ALLOW_MULTIMDS (1<<2) cluster allowed to have >1 active MDS */ +/* deprecated #define CEPH_MDSMAP_ALLOW_DIRFRAGS (1<<3) cluster allowed to fragment directories */ +#define CEPH_MDSMAP_ALLOW_MULTIMDS_SNAPS (1<<4) /* cluster allowed to enable MULTIMDS + and SNAPS at the same time */ +#define CEPH_MDSMAP_ALLOW_STANDBY_REPLAY (1<<5) /* cluster allowed to enable standby-replay -- NOTE(review): inferred from the flag name; the previous text said MULTIMDS */ + +#define CEPH_MDSMAP_DEFAULTS (CEPH_MDSMAP_ALLOW_SNAPS | \ + CEPH_MDSMAP_ALLOW_MULTIMDS_SNAPS) + +/* + * mds states + * > 0 -> in + * <= 0 -> out + */ +#define CEPH_MDS_STATE_DNE 0 /* down, does not exist. */ +#define CEPH_MDS_STATE_STOPPED -1 /* down, once existed, but no subtrees. + empty log. */ +#define CEPH_MDS_STATE_BOOT -4 /* up, boot announcement. */ +#define CEPH_MDS_STATE_STANDBY -5 /* up, idle. waiting for assignment. */ +#define CEPH_MDS_STATE_CREATING -6 /* up, creating MDS instance. */ +#define CEPH_MDS_STATE_STARTING -7 /* up, starting previously stopped mds */ +#define CEPH_MDS_STATE_STANDBY_REPLAY -8 /* up, tailing active node's journal */ +#define CEPH_MDS_STATE_REPLAYONCE -9 /* Legacy, unused */ +#define CEPH_MDS_STATE_NULL -10 + +#define CEPH_MDS_STATE_REPLAY 8 /* up, replaying journal. */ +#define CEPH_MDS_STATE_RESOLVE 9 /* up, disambiguating distributed + operations (import, rename, etc.) */ +#define CEPH_MDS_STATE_RECONNECT 10 /* up, reconnect to clients */ +#define CEPH_MDS_STATE_REJOIN 11 /* up, rejoining distributed cache */ +#define CEPH_MDS_STATE_CLIENTREPLAY 12 /* up, replaying client operations */ +#define CEPH_MDS_STATE_ACTIVE 13 /* up, active */ +#define CEPH_MDS_STATE_STOPPING 14 /* up, but exporting metadata */ +#define CEPH_MDS_STATE_DAMAGED 15 /* rank not replayable, need repair */ + +extern const char *ceph_mds_state_name(int s); + + +/* + * metadata lock types. 
+ * - these are bitmasks.. we can compose them + * - they also define the lock ordering by the MDS + * - a few of these are internal to the mds + */ +#define CEPH_LOCK_DVERSION 1 +#define CEPH_LOCK_DN 2 +#define CEPH_LOCK_IVERSION 16 /* mds internal */ +#define CEPH_LOCK_ISNAP 32 +#define CEPH_LOCK_IFILE 64 +#define CEPH_LOCK_IAUTH 128 +#define CEPH_LOCK_ILINK 256 +#define CEPH_LOCK_IDFT 512 /* dir frag tree */ +#define CEPH_LOCK_INEST 1024 /* mds internal */ +#define CEPH_LOCK_IXATTR 2048 +#define CEPH_LOCK_IFLOCK 4096 /* advisory file locks */ +#define CEPH_LOCK_INO 8192 /* immutable inode bits; not a lock */ +#define CEPH_LOCK_IPOLICY 16384 /* policy lock on dirs. MDS internal */ + +/* client_session ops */ +enum { + CEPH_SESSION_REQUEST_OPEN, + CEPH_SESSION_OPEN, + CEPH_SESSION_REQUEST_CLOSE, + CEPH_SESSION_CLOSE, + CEPH_SESSION_REQUEST_RENEWCAPS, + CEPH_SESSION_RENEWCAPS, + CEPH_SESSION_STALE, + CEPH_SESSION_RECALL_STATE, + CEPH_SESSION_FLUSHMSG, + CEPH_SESSION_FLUSHMSG_ACK, + CEPH_SESSION_FORCE_RO, + // A response to REQUEST_OPEN indicating that the client should + // permanently desist from contacting the MDS + CEPH_SESSION_REJECT, + CEPH_SESSION_REQUEST_FLUSH_MDLOG +}; + +// flags for state reclaim +#define CEPH_RECLAIM_RESET 1 + +extern const char *ceph_session_op_name(int op); + +struct ceph_mds_session_head { + __le32 op; + __le64 seq; + struct ceph_timespec stamp; + __le32 max_caps, max_leases; +} __attribute__ ((packed)); + +/* client_request */ +/* + * metadata ops. + * & 0x001000 -> write op + * & 0x010000 -> follow symlink (e.g. stat(), not lstat()). 
+ & & 0x100000 -> use weird ino/path trace + */ +#define CEPH_MDS_OP_WRITE 0x001000 +enum { + CEPH_MDS_OP_LOOKUP = 0x00100, + CEPH_MDS_OP_GETATTR = 0x00101, + CEPH_MDS_OP_LOOKUPHASH = 0x00102, + CEPH_MDS_OP_LOOKUPPARENT = 0x00103, + CEPH_MDS_OP_LOOKUPINO = 0x00104, + CEPH_MDS_OP_LOOKUPNAME = 0x00105, + + CEPH_MDS_OP_SETXATTR = 0x01105, + CEPH_MDS_OP_RMXATTR = 0x01106, + CEPH_MDS_OP_SETLAYOUT = 0x01107, + CEPH_MDS_OP_SETATTR = 0x01108, + CEPH_MDS_OP_SETFILELOCK= 0x01109, + CEPH_MDS_OP_GETFILELOCK= 0x00110, + CEPH_MDS_OP_SETDIRLAYOUT=0x0110a, + + CEPH_MDS_OP_MKNOD = 0x01201, + CEPH_MDS_OP_LINK = 0x01202, + CEPH_MDS_OP_UNLINK = 0x01203, + CEPH_MDS_OP_RENAME = 0x01204, + CEPH_MDS_OP_MKDIR = 0x01220, + CEPH_MDS_OP_RMDIR = 0x01221, + CEPH_MDS_OP_SYMLINK = 0x01222, + + CEPH_MDS_OP_CREATE = 0x01301, + CEPH_MDS_OP_OPEN = 0x00302, + CEPH_MDS_OP_READDIR = 0x00305, + + CEPH_MDS_OP_LOOKUPSNAP = 0x00400, + CEPH_MDS_OP_MKSNAP = 0x01400, + CEPH_MDS_OP_RMSNAP = 0x01401, + CEPH_MDS_OP_LSSNAP = 0x00402, + CEPH_MDS_OP_RENAMESNAP = 0x01403, + + // internal op + CEPH_MDS_OP_FRAGMENTDIR= 0x01500, + CEPH_MDS_OP_EXPORTDIR = 0x01501, + CEPH_MDS_OP_FLUSH = 0x01502, + CEPH_MDS_OP_ENQUEUE_SCRUB = 0x01503, + CEPH_MDS_OP_REPAIR_FRAGSTATS = 0x01504, + CEPH_MDS_OP_REPAIR_INODESTATS = 0x01505, + CEPH_MDS_OP_UPGRADE_SNAPREALM = 0x01506 +}; + +extern const char *ceph_mds_op_name(int op); + +#ifndef CEPH_SETATTR_MODE +#define CEPH_SETATTR_MODE (1 << 0) +#define CEPH_SETATTR_UID (1 << 1) +#define CEPH_SETATTR_GID (1 << 2) +#define CEPH_SETATTR_MTIME (1 << 3) +#define CEPH_SETATTR_ATIME (1 << 4) +#define CEPH_SETATTR_SIZE (1 << 5) +#define CEPH_SETATTR_CTIME (1 << 6) +#define CEPH_SETATTR_MTIME_NOW (1 << 7) +#define CEPH_SETATTR_ATIME_NOW (1 << 8) +#define CEPH_SETATTR_BTIME (1 << 9) +#endif +#define CEPH_SETATTR_KILL_SGUID (1 << 10) + +/* + * open request flags + */ +#define CEPH_O_RDONLY 00000000 +#define CEPH_O_WRONLY 00000001 +#define CEPH_O_RDWR 00000002 +#define CEPH_O_CREAT 00000100 +#define 
CEPH_O_EXCL 00000200 +#define CEPH_O_TRUNC 00001000 +#define CEPH_O_LAZY 00020000 +#define CEPH_O_DIRECTORY 00200000 +#define CEPH_O_NOFOLLOW 00400000 + +int ceph_flags_sys2wire(int flags); + +/* + * Ceph setxattr request flags. + */ +#define CEPH_XATTR_CREATE (1 << 0) +#define CEPH_XATTR_REPLACE (1 << 1) +#define CEPH_XATTR_REMOVE (1 << 31) + +/* + * readdir request flags; + */ +#define CEPH_READDIR_REPLY_BITFLAGS (1<<0) + +/* + * readdir reply flags. + */ +#define CEPH_READDIR_FRAG_END (1<<0) +#define CEPH_READDIR_FRAG_COMPLETE (1<<8) +#define CEPH_READDIR_HASH_ORDER (1<<9) +#define CEPH_READDIR_OFFSET_HASH (1<<10) + +/* Note that this is embedded wthin ceph_mds_request_head_legacy. */ +union ceph_mds_request_args_legacy { + struct { + __le32 mask; /* CEPH_CAP_* */ + } __attribute__ ((packed)) getattr; + struct { + __le32 mode; + __le32 uid; + __le32 gid; + struct ceph_timespec mtime; + struct ceph_timespec atime; + __le64 size, old_size; /* old_size needed by truncate */ + __le32 mask; /* CEPH_SETATTR_* */ + } __attribute__ ((packed)) setattr; + struct { + __le32 frag; /* which dir fragment */ + __le32 max_entries; /* how many dentries to grab */ + __le32 max_bytes; + __le16 flags; + __le32 offset_hash; + } __attribute__ ((packed)) readdir; + struct { + __le32 mode; + __le32 rdev; + } __attribute__ ((packed)) mknod; + struct { + __le32 mode; + } __attribute__ ((packed)) mkdir; + struct { + __le32 flags; + __le32 mode; + __le32 stripe_unit; /* layout for newly created file */ + __le32 stripe_count; /* ... 
*/ + __le32 object_size; + __le32 pool; /* if >= 0 and CREATEPOOLID feature */ + __le32 mask; /* CEPH_CAP_* */ + __le64 old_size; /* if O_TRUNC */ + } __attribute__ ((packed)) open; + struct { + __le32 flags; + __le32 osdmap_epoch; /* use for set file/dir layout */ + } __attribute__ ((packed)) setxattr; + struct { + struct ceph_file_layout layout; + } __attribute__ ((packed)) setlayout; + struct { + __u8 rule; /* currently fcntl or flock */ + __u8 type; /* shared, exclusive, remove*/ + __le64 owner; /* who requests/holds the lock */ + __le64 pid; /* process id requesting the lock */ + __le64 start; /* initial location to lock */ + __le64 length; /* num bytes to lock from start */ + __u8 wait; /* will caller wait for lock to become available? */ + } __attribute__ ((packed)) filelock_change; +} __attribute__ ((packed)); + +#define CEPH_MDS_FLAG_REPLAY 1 /* this is a replayed op */ +#define CEPH_MDS_FLAG_WANT_DENTRY 2 /* want dentry in reply */ + +struct ceph_mds_request_head_legacy { + __le64 oldest_client_tid; + __le32 mdsmap_epoch; /* on client */ + __le32 flags; /* CEPH_MDS_FLAG_* */ + __u8 num_retry, num_fwd; /* count retry, fwd attempts */ + __le16 num_releases; /* # include cap/lease release records */ + __le32 op; /* mds op code */ + __le32 caller_uid, caller_gid; + __le64 ino; /* use this ino for openc, mkdir, mknod, + etc. (if replaying) */ + union ceph_mds_request_args_legacy args; +} __attribute__ ((packed)); + +/* + * Note that this is embedded wthin ceph_mds_request_head. Also, compatibility + * with the ceph_mds_request_args_legacy must be maintained! 
+ */ +union ceph_mds_request_args { + struct { + __le32 mask; /* CEPH_CAP_* */ + } __attribute__ ((packed)) getattr; + struct { + __le32 mode; + __le32 uid; + __le32 gid; + struct ceph_timespec mtime; + struct ceph_timespec atime; + __le64 size, old_size; /* old_size needed by truncate */ + __le32 mask; /* CEPH_SETATTR_* */ + struct ceph_timespec btime; + } __attribute__ ((packed)) setattr; + struct { + __le32 frag; /* which dir fragment */ + __le32 max_entries; /* how many dentries to grab */ + __le32 max_bytes; + __le16 flags; + __le32 offset_hash; + } __attribute__ ((packed)) readdir; + struct { + __le32 mode; + __le32 rdev; + } __attribute__ ((packed)) mknod; + struct { + __le32 mode; + } __attribute__ ((packed)) mkdir; + struct { + __le32 flags; + __le32 mode; + __le32 stripe_unit; /* layout for newly created file */ + __le32 stripe_count; /* ... */ + __le32 object_size; + __le32 pool; /* if >= 0 and CREATEPOOLID feature */ + __le32 mask; /* CEPH_CAP_* */ + __le64 old_size; /* if O_TRUNC */ + } __attribute__ ((packed)) open; + struct { + __le32 flags; + __le32 osdmap_epoch; /* use for set file/dir layout */ + } __attribute__ ((packed)) setxattr; + struct { + struct ceph_file_layout layout; + } __attribute__ ((packed)) setlayout; + struct { + __u8 rule; /* currently fcntl or flock */ + __u8 type; /* shared, exclusive, remove*/ + __le64 owner; /* who requests/holds the lock */ + __le64 pid; /* process id requesting the lock */ + __le64 start; /* initial location to lock */ + __le64 length; /* num bytes to lock from start */ + __u8 wait; /* will caller wait for lock to become available? */ + } __attribute__ ((packed)) filelock_change; + struct { + __le32 mask; /* CEPH_CAP_* */ + __le64 snapid; + __le64 parent; + __le32 hash; + } __attribute__ ((packed)) lookupino; +} __attribute__ ((packed)); + +#define CEPH_MDS_REQUEST_HEAD_VERSION 1 + +/* + * Note that any change to this structure must ensure that it is compatible + * with ceph_mds_request_head_legacy. 
+ */ +struct ceph_mds_request_head { + __le16 version; + __le64 oldest_client_tid; + __le32 mdsmap_epoch; /* on client */ + __le32 flags; /* CEPH_MDS_FLAG_* */ + __u8 num_retry, num_fwd; /* count retry, fwd attempts */ + __le16 num_releases; /* # include cap/lease release records */ + __le32 op; /* mds op code */ + __le32 caller_uid, caller_gid; + __le64 ino; /* use this ino for openc, mkdir, mknod, + etc. (if replaying) */ + union ceph_mds_request_args args; +} __attribute__ ((packed)); + +/* cap/lease release record */ +struct ceph_mds_request_release { + __le64 ino, cap_id; /* ino and unique cap id */ + __le32 caps, wanted; /* new issued, wanted */ + __le32 seq, issue_seq, mseq; + __le32 dname_seq; /* if releasing a dentry lease, a */ + __le32 dname_len; /* string follows. */ +} __attribute__ ((packed)); + +static inline void +copy_from_legacy_head(struct ceph_mds_request_head *head, + struct ceph_mds_request_head_legacy *legacy) +{ + memcpy(&(head->oldest_client_tid), legacy, sizeof(*legacy)); +} + +static inline void +copy_to_legacy_head(struct ceph_mds_request_head_legacy *legacy, + struct ceph_mds_request_head *head) +{ + memcpy(legacy, &(head->oldest_client_tid), sizeof(*legacy)); +} + +/* client reply */ +struct ceph_mds_reply_head { + __le32 op; + __le32 result; + __le32 mdsmap_epoch; + __u8 safe; /* true if committed to disk */ + __u8 is_dentry, is_target; /* true if dentry, target inode records + are included with reply */ +} __attribute__ ((packed)); + +/* one for each node split */ +struct ceph_frag_tree_split { + __le32 frag; /* this frag splits... 
*/ + __le32 by; /* ...by this many bits */ +} __attribute__ ((packed)); + +struct ceph_frag_tree_head { + __le32 nsplits; /* num ceph_frag_tree_split records */ + struct ceph_frag_tree_split splits[]; +} __attribute__ ((packed)); + +/* capability issue, for bundling with mds reply */ +struct ceph_mds_reply_cap { + __le32 caps, wanted; /* caps issued, wanted */ + __le64 cap_id; + __le32 seq, mseq; + __le64 realm; /* snap realm */ + __u8 flags; /* CEPH_CAP_FLAG_* */ +} __attribute__ ((packed)); + +#define CEPH_CAP_FLAG_AUTH (1 << 0) /* cap is issued by auth mds */ +#define CEPH_CAP_FLAG_RELEASE (1 << 1) /* ask client to release the cap */ + +/* reply_lease follows dname, and reply_inode */ +struct ceph_mds_reply_lease { + __le16 mask; /* lease type(s) */ + __le32 duration_ms; /* lease duration */ + __le32 seq; +} __attribute__ ((packed)); + +struct ceph_mds_reply_dirfrag { + __le32 frag; /* fragment */ + __le32 auth; /* auth mds, if this is a delegation point */ + __le32 ndist; /* number of mds' this is replicated on */ + __le32 dist[]; +} __attribute__ ((packed)); + +#define CEPH_LOCK_FCNTL 1 +#define CEPH_LOCK_FLOCK 2 +#define CEPH_LOCK_FCNTL_INTR 3 +#define CEPH_LOCK_FLOCK_INTR 4 + +#define CEPH_LOCK_SHARED 1 +#define CEPH_LOCK_EXCL 2 +#define CEPH_LOCK_UNLOCK 4 + +struct ceph_filelock { + __le64 start;/* file offset to start lock at */ + __le64 length; /* num bytes to lock; 0 for all following start */ + __le64 client; /* which client holds the lock */ + __le64 owner; /* who requests/holds the lock */ + __le64 pid; /* process id holding the lock on the client */ + __u8 type; /* shared lock, exclusive lock, or unlock */ +} __attribute__ ((packed)); + + +/* file access modes */ +#define CEPH_FILE_MODE_PIN 0 +#define CEPH_FILE_MODE_RD 1 +#define CEPH_FILE_MODE_WR 2 +#define CEPH_FILE_MODE_RDWR 3 /* RD | WR */ +#define CEPH_FILE_MODE_LAZY 4 /* lazy io */ +#define CEPH_FILE_MODE_NUM 8 /* bc these are bit fields.. 
mostly */ + +int ceph_flags_to_mode(int flags); + +/* inline data state */ +#define CEPH_INLINE_NONE ((__u64)-1) +#define CEPH_INLINE_MAX_SIZE CEPH_MIN_STRIPE_UNIT + +/* capability bits */ +#define CEPH_CAP_PIN 1 /* no specific capabilities beyond the pin */ + +/* generic cap bits */ +/* note: these definitions are duplicated in mds/locks.c */ +#define CEPH_CAP_GSHARED 1 /* client can read */ +#define CEPH_CAP_GEXCL 2 /* client can read and update */ +#define CEPH_CAP_GCACHE 4 /* (file) client can cache reads */ +#define CEPH_CAP_GRD 8 /* (file) client can read */ +#define CEPH_CAP_GWR 16 /* (file) client can write */ +#define CEPH_CAP_GBUFFER 32 /* (file) client can buffer writes */ +#define CEPH_CAP_GWREXTEND 64 /* (file) client can extend EOF */ +#define CEPH_CAP_GLAZYIO 128 /* (file) client can perform lazy io */ + +#define CEPH_CAP_SIMPLE_BITS 2 +#define CEPH_CAP_FILE_BITS 8 + +/* per-lock shift */ +#define CEPH_CAP_SAUTH 2 +#define CEPH_CAP_SLINK 4 +#define CEPH_CAP_SXATTR 6 +#define CEPH_CAP_SFILE 8 + +/* composed values */ +#define CEPH_CAP_AUTH_SHARED (CEPH_CAP_GSHARED << CEPH_CAP_SAUTH) +#define CEPH_CAP_AUTH_EXCL (CEPH_CAP_GEXCL << CEPH_CAP_SAUTH) +#define CEPH_CAP_LINK_SHARED (CEPH_CAP_GSHARED << CEPH_CAP_SLINK) +#define CEPH_CAP_LINK_EXCL (CEPH_CAP_GEXCL << CEPH_CAP_SLINK) +#define CEPH_CAP_XATTR_SHARED (CEPH_CAP_GSHARED << CEPH_CAP_SXATTR) +#define CEPH_CAP_XATTR_EXCL (CEPH_CAP_GEXCL << CEPH_CAP_SXATTR) +#define CEPH_CAP_FILE(x) ((x) << CEPH_CAP_SFILE) /* shift generic bits x by the file per-lock shift; argument parenthesized so expressions like (a | b) shift as a whole */ +#define CEPH_CAP_FILE_SHARED (CEPH_CAP_GSHARED << CEPH_CAP_SFILE) +#define CEPH_CAP_FILE_EXCL (CEPH_CAP_GEXCL << CEPH_CAP_SFILE) +#define CEPH_CAP_FILE_CACHE (CEPH_CAP_GCACHE << CEPH_CAP_SFILE) +#define CEPH_CAP_FILE_RD (CEPH_CAP_GRD << CEPH_CAP_SFILE) +#define CEPH_CAP_FILE_WR (CEPH_CAP_GWR << CEPH_CAP_SFILE) +#define CEPH_CAP_FILE_BUFFER (CEPH_CAP_GBUFFER << CEPH_CAP_SFILE) +#define CEPH_CAP_FILE_WREXTEND (CEPH_CAP_GWREXTEND << CEPH_CAP_SFILE) +#define CEPH_CAP_FILE_LAZYIO (CEPH_CAP_GLAZYIO << 
CEPH_CAP_SFILE) + +/* cap masks (for getattr) */ +#define CEPH_STAT_CAP_INODE CEPH_CAP_PIN +#define CEPH_STAT_CAP_TYPE CEPH_CAP_PIN /* mode >> 12 */ +#define CEPH_STAT_CAP_SYMLINK CEPH_CAP_PIN +#define CEPH_STAT_CAP_UID CEPH_CAP_AUTH_SHARED +#define CEPH_STAT_CAP_GID CEPH_CAP_AUTH_SHARED +#define CEPH_STAT_CAP_MODE CEPH_CAP_AUTH_SHARED +#define CEPH_STAT_CAP_NLINK CEPH_CAP_LINK_SHARED +#define CEPH_STAT_CAP_LAYOUT CEPH_CAP_FILE_SHARED +#define CEPH_STAT_CAP_MTIME CEPH_CAP_FILE_SHARED +#define CEPH_STAT_CAP_SIZE CEPH_CAP_FILE_SHARED +#define CEPH_STAT_CAP_ATIME CEPH_CAP_FILE_SHARED /* fixme */ +#define CEPH_STAT_CAP_XATTR CEPH_CAP_XATTR_SHARED +#define CEPH_STAT_CAP_INODE_ALL (CEPH_CAP_PIN | \ + CEPH_CAP_AUTH_SHARED | \ + CEPH_CAP_LINK_SHARED | \ + CEPH_CAP_FILE_SHARED | \ + CEPH_CAP_XATTR_SHARED) +#define CEPH_STAT_CAP_INLINE_DATA (CEPH_CAP_FILE_SHARED | \ + CEPH_CAP_FILE_RD) +#define CEPH_STAT_RSTAT CEPH_CAP_FILE_WREXTEND + +#define CEPH_CAP_ANY_SHARED (CEPH_CAP_AUTH_SHARED | \ + CEPH_CAP_LINK_SHARED | \ + CEPH_CAP_XATTR_SHARED | \ + CEPH_CAP_FILE_SHARED) +#define CEPH_CAP_ANY_RD (CEPH_CAP_ANY_SHARED | CEPH_CAP_FILE_RD | \ + CEPH_CAP_FILE_CACHE) + +#define CEPH_CAP_ANY_EXCL (CEPH_CAP_AUTH_EXCL | \ + CEPH_CAP_LINK_EXCL | \ + CEPH_CAP_XATTR_EXCL | \ + CEPH_CAP_FILE_EXCL) +#define CEPH_CAP_ANY_FILE_RD (CEPH_CAP_FILE_RD | CEPH_CAP_FILE_CACHE | \ + CEPH_CAP_FILE_SHARED) +#define CEPH_CAP_ANY_FILE_WR (CEPH_CAP_FILE_WR | CEPH_CAP_FILE_BUFFER | \ + CEPH_CAP_FILE_EXCL) +#define CEPH_CAP_ANY_WR (CEPH_CAP_ANY_EXCL | CEPH_CAP_ANY_FILE_WR) +#define CEPH_CAP_ANY (CEPH_CAP_ANY_RD | CEPH_CAP_ANY_EXCL | \ + CEPH_CAP_ANY_FILE_WR | CEPH_CAP_FILE_LAZYIO | \ + CEPH_CAP_PIN) + +#define CEPH_CAP_LOCKS (CEPH_LOCK_IFILE | CEPH_LOCK_IAUTH | CEPH_LOCK_ILINK | \ + CEPH_LOCK_IXATTR) + +int ceph_caps_for_mode(int mode); + +enum { + CEPH_CAP_OP_GRANT, /* mds->client grant */ + CEPH_CAP_OP_REVOKE, /* mds->client revoke */ + CEPH_CAP_OP_TRUNC, /* mds->client trunc notify */ + CEPH_CAP_OP_EXPORT, 
/* mds has exported the cap */ + CEPH_CAP_OP_IMPORT, /* mds has imported the cap */ + CEPH_CAP_OP_UPDATE, /* client->mds update */ + CEPH_CAP_OP_DROP, /* client->mds drop cap bits */ + CEPH_CAP_OP_FLUSH, /* client->mds cap writeback */ + CEPH_CAP_OP_FLUSH_ACK, /* mds->client flushed */ + CEPH_CAP_OP_FLUSHSNAP, /* client->mds flush snapped metadata */ + CEPH_CAP_OP_FLUSHSNAP_ACK, /* mds->client flushed snapped metadata */ + CEPH_CAP_OP_RELEASE, /* client->mds release (clean) cap */ + CEPH_CAP_OP_RENEW, /* client->mds renewal request */ +}; + +extern const char *ceph_cap_op_name(int op); + +/* extra info for cap import/export */ +struct ceph_mds_cap_peer { + __le64 cap_id; + __le32 seq; + __le32 mseq; + __le32 mds; + __u8 flags; +} __attribute__ ((packed)); + +/* + * caps message, used for capability callbacks, acks, requests, etc. + */ +struct ceph_mds_caps_head { + __le32 op; /* CEPH_CAP_OP_* */ + __le64 ino, realm; + __le64 cap_id; + __le32 seq, issue_seq; + __le32 caps, wanted, dirty; /* latest issued/wanted/dirty */ + __le32 migrate_seq; + __le64 snap_follows; + __le32 snap_trace_len; + + /* authlock */ + __le32 uid, gid, mode; + + /* linklock */ + __le32 nlink; + + /* xattrlock */ + __le32 xattr_len; + __le64 xattr_version; +} __attribute__ ((packed)); + +struct ceph_mds_caps_body_legacy { + union { + /* all except export */ + struct { + /* filelock */ + __le64 size, max_size, truncate_size; + __le32 truncate_seq; + struct ceph_timespec mtime, atime, ctime; + struct ceph_file_layout layout; + __le32 time_warp_seq; + }; + /* export message */ + struct ceph_mds_cap_peer peer; + }; +} __attribute__ ((packed)); + +/* cap release msg head */ +struct ceph_mds_cap_release { + __le32 num; /* number of cap_items that follow */ +} __attribute__ ((packed)); + +struct ceph_mds_cap_item { + __le64 ino; + __le64 cap_id; + __le32 migrate_seq, seq; +} __attribute__ ((packed)); + +#define CEPH_MDS_LEASE_REVOKE 1 /* mds -> client */ +#define CEPH_MDS_LEASE_RELEASE 2 /* client -> 
mds */ +#define CEPH_MDS_LEASE_RENEW 3 /* client <-> mds */ +#define CEPH_MDS_LEASE_REVOKE_ACK 4 /* client -> mds */ + +extern const char *ceph_lease_op_name(int o); + +/* lease msg header */ +struct ceph_mds_lease { + __u8 action; /* CEPH_MDS_LEASE_* */ + __le16 mask; /* which lease */ + __le64 ino; + __le64 first, last; /* snap range */ + __le32 seq; + __le32 duration_ms; /* duration of renewal */ +} __attribute__ ((packed)); +/* followed by a __le32+string for dname */ + +/* client reconnect */ +struct ceph_mds_cap_reconnect { + __le64 cap_id; + __le32 wanted; + __le32 issued; + __le64 snaprealm; + __le64 pathbase; /* base ino for our path to this ino */ + __le32 flock_len; /* size of flock state blob, if any */ +} __attribute__ ((packed)); +/* followed by flock blob */ + +struct ceph_mds_cap_reconnect_v1 { + __le64 cap_id; + __le32 wanted; + __le32 issued; + __le64 size; + struct ceph_timespec mtime, atime; + __le64 snaprealm; + __le64 pathbase; /* base ino for our path to this ino */ +} __attribute__ ((packed)); + +struct ceph_mds_snaprealm_reconnect { + __le64 ino; /* snap realm base */ + __le64 seq; /* snap seq for this snap realm */ + __le64 parent; /* parent realm */ +} __attribute__ ((packed)); + +/* + * snaps + */ +enum { + CEPH_SNAP_OP_UPDATE, /* CREATE or DESTROY */ + CEPH_SNAP_OP_CREATE, + CEPH_SNAP_OP_DESTROY, + CEPH_SNAP_OP_SPLIT, +}; + +extern const char *ceph_snap_op_name(int o); + +/* snap msg header */ +struct ceph_mds_snap_head { + __le32 op; /* CEPH_SNAP_OP_* */ + __le64 split; /* ino to split off, if any */ + __le32 num_split_inos; /* # inos belonging to new child realm */ + __le32 num_split_realms; /* # child realms under new child realm */ + __le32 trace_len; /* size of snap trace blob */ +} __attribute__ ((packed)); +/* followed by split ino list, then split realms, then the trace blob */ + +/* + * encode info about a snaprealm, as viewed by a client + */ +struct ceph_mds_snap_realm { + __le64 ino; /* ino */ + __le64 created; /* snap: when 
created */ + __le64 parent; /* ino: parent realm */ + __le64 parent_since; /* snap: same parent since */ + __le64 seq; /* snap: version */ + __le32 num_snaps; + __le32 num_prior_parent_snaps; +} __attribute__ ((packed)); +/* followed by my snap list, then prior parent snap list */ + +#ifndef __KERNEL__ +#undef __le16 +#undef __le32 +#undef __le64 +#endif + +#endif diff --git a/src/include/ceph_fuse.h b/src/include/ceph_fuse.h new file mode 100644 index 00000000..45881930 --- /dev/null +++ b/src/include/ceph_fuse.h @@ -0,0 +1,32 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2012 Inktank Storage, Inc. + * Copyright (C) 2014 Red Hat <contact@redhat.com> + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. 
+ */ +#ifndef CEPH_FUSE_H +#define CEPH_FUSE_H + +#define FUSE_USE_VERSION 30 +#include "acconfig.h" +#include <fuse.h> + +static inline int filler_compat(fuse_fill_dir_t filler, + void *buf, const char *name, + const struct stat *stbuf, + off_t off) +{ + return filler(buf, name, stbuf, off +#if FUSE_VERSION >= FUSE_MAKE_VERSION(3, 0) + , static_cast<enum fuse_fill_dir_flags>(0) +#endif + ); +} +#endif /* CEPH_FUSE_H */ diff --git a/src/include/ceph_hash.h b/src/include/ceph_hash.h new file mode 100644 index 00000000..f9d80ac3 --- /dev/null +++ b/src/include/ceph_hash.h @@ -0,0 +1,14 @@ +#ifndef FS_CEPH_HASH_H +#define FS_CEPH_HASH_H + +#define CEPH_STR_HASH_LINUX 0x1 /* linux dcache hash */ +#define CEPH_STR_HASH_RJENKINS 0x2 /* robert jenkins' */ + +extern unsigned ceph_str_hash_linux(const char *s, unsigned len); +extern unsigned ceph_str_hash_rjenkins(const char *s, unsigned len); + +extern unsigned ceph_str_hash(int type, const char *s, unsigned len); +extern const char *ceph_str_hash_name(int type); +extern bool ceph_str_hash_valid(int type); + +#endif diff --git a/src/include/cephfs/ceph_ll_client.h b/src/include/cephfs/ceph_ll_client.h new file mode 100644 index 00000000..4f3d4235 --- /dev/null +++ b/src/include/cephfs/ceph_ll_client.h @@ -0,0 +1,144 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * scalable distributed file system + * + * Copyright (C) Jeff Layton <jlayton@redhat.com> + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. 
+ */ + +#ifndef CEPH_CEPH_LL_CLIENT_H +#define CEPH_CEPH_LL_CLIENT_H +#include <stdint.h> + +#ifdef __cplusplus +extern "C" { + +class Fh; + +struct inodeno_t; +struct vinodeno_t; +typedef struct vinodeno_t vinodeno; + +#else /* __cplusplus */ + +typedef struct Fh Fh; + +typedef struct inodeno_t { + uint64_t val; +} inodeno_t; + +typedef struct _snapid_t { + uint64_t val; +} snapid_t; + +typedef struct vinodeno_t { + inodeno_t ino; + snapid_t snapid; +} vinodeno_t; + +#endif /* __cplusplus */ + +/* + * Heavily borrowed from David Howells' draft statx patchset. + * + * Since the xstat patches are still a work in progress, we borrow its data + * structures and #defines to implement ceph_getattrx. Once the xstat stuff + * has been merged we should drop this and switch over to using that instead. + */ +struct ceph_statx { + uint32_t stx_mask; + uint32_t stx_blksize; + uint32_t stx_nlink; + uint32_t stx_uid; + uint32_t stx_gid; + uint16_t stx_mode; + uint64_t stx_ino; + uint64_t stx_size; + uint64_t stx_blocks; + dev_t stx_dev; + dev_t stx_rdev; + struct timespec stx_atime; + struct timespec stx_ctime; + struct timespec stx_mtime; + struct timespec stx_btime; + uint64_t stx_version; +}; + +#define CEPH_STATX_MODE 0x00000001U /* Want/got stx_mode */ +#define CEPH_STATX_NLINK 0x00000002U /* Want/got stx_nlink */ +#define CEPH_STATX_UID 0x00000004U /* Want/got stx_uid */ +#define CEPH_STATX_GID 0x00000008U /* Want/got stx_gid */ +#define CEPH_STATX_RDEV 0x00000010U /* Want/got stx_rdev */ +#define CEPH_STATX_ATIME 0x00000020U /* Want/got stx_atime */ +#define CEPH_STATX_MTIME 0x00000040U /* Want/got stx_mtime */ +#define CEPH_STATX_CTIME 0x00000080U /* Want/got stx_ctime */ +#define CEPH_STATX_INO 0x00000100U /* Want/got stx_ino */ +#define CEPH_STATX_SIZE 0x00000200U /* Want/got stx_size */ +#define CEPH_STATX_BLOCKS 0x00000400U /* Want/got stx_blocks */ +#define CEPH_STATX_BASIC_STATS 0x000007ffU /* The stuff in the normal stat struct */ +#define CEPH_STATX_BTIME 
0x00000800U /* Want/got stx_btime */ +#define CEPH_STATX_VERSION 0x00001000U /* Want/got stx_version */ +#define CEPH_STATX_ALL_STATS 0x00001fffU /* All supported stats */ + +/* + * Compatibility macros until these defines make their way into glibc + */ +#ifndef AT_NO_ATTR_SYNC +#define AT_NO_ATTR_SYNC 0x4000 /* Don't sync attributes with the server */ +#endif + +/* + * The statx interfaces only allow these flags. In order to allow us to add + * others in the future, we disallow setting any that aren't recognized. + */ +#define CEPH_REQ_FLAG_MASK (AT_SYMLINK_NOFOLLOW|AT_NO_ATTR_SYNC) + +/* delegation recalls */ +typedef void (*ceph_deleg_cb_t)(Fh *fh, void *priv); + +/* inode data/metadata invalidation */ +typedef void (*client_ino_callback_t)(void *handle, vinodeno_t ino, + int64_t off, int64_t len); + +/* dentry invalidation */ +typedef void (*client_dentry_callback_t)(void *handle, vinodeno_t dirino, + vinodeno_t ino, const char *name, + size_t len); + +/* remount entire fs */ +typedef int (*client_remount_callback_t)(void *handle); + +/* lock request interrupted */ +typedef void (*client_switch_interrupt_callback_t)(void *handle, void *data); + +/* fetch umask of actor */ +typedef mode_t (*client_umask_callback_t)(void *handle); + +/* request that application release Inode references */ +typedef void (*client_ino_release_t)(void *handle, vinodeno_t ino); + +/* + * The handle is an opaque value that gets passed to some callbacks. Any fields + * set to NULL will be left alone. There is no way to unregister callbacks. 
+ */ +struct ceph_client_callback_args { + void *handle; + client_ino_callback_t ino_cb; + client_dentry_callback_t dentry_cb; + client_switch_interrupt_callback_t switch_intr_cb; + client_remount_callback_t remount_cb; + client_umask_callback_t umask_cb; + client_ino_release_t ino_release_cb; +}; + +#ifdef __cplusplus +} +#endif + +#endif /* CEPH_STATX_H */ + diff --git a/src/include/cephfs/libcephfs.h b/src/include/cephfs/libcephfs.h new file mode 100755 index 00000000..c1668769 --- /dev/null +++ b/src/include/cephfs/libcephfs.h @@ -0,0 +1,1869 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2009-2011 New Dream Network + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. + * + */ + +#ifndef CEPH_LIB_H +#define CEPH_LIB_H + +#if defined(__linux__) +#include <features.h> +#endif +#include <utime.h> +#include <sys/stat.h> +#include <sys/types.h> +#include <sys/statvfs.h> +#include <sys/socket.h> +#include <stdint.h> +#include <stdbool.h> +#include <fcntl.h> + +#include "ceph_ll_client.h" + +#ifdef __cplusplus +extern "C" { +#endif + +#define LIBCEPHFS_VER_MAJOR 10 +#define LIBCEPHFS_VER_MINOR 0 +#define LIBCEPHFS_VER_EXTRA 2 + +#define LIBCEPHFS_VERSION(maj, min, extra) ((maj << 16) + (min << 8) + extra) +#define LIBCEPHFS_VERSION_CODE LIBCEPHFS_VERSION(LIBCEPHFS_VER_MAJOR, LIBCEPHFS_VER_MINOR, LIBCEPHFS_VER_EXTRA) + +/* + * If using glibc check that file offset is 64-bit. + */ +#if defined(__GLIBC__) && !defined(__USE_FILE_OFFSET64) +# error libceph: glibc must define __USE_FILE_OFFSET64 or readdir results will be corrupted +#endif + +/* + * XXXX redeclarations from ceph_fs.h, rados.h, etc. 
We need more of this + * in the interface, but shouldn't be re-typing it (and using different + * C data types). + */ +#ifndef __cplusplus + +#define CEPH_INO_ROOT 1 +#define CEPH_NOSNAP ((uint64_t)(-2)) + +struct ceph_file_layout { + /* file -> object mapping */ + uint32_t fl_stripe_unit; /* stripe unit, in bytes. must be multiple + of page size. */ + uint32_t fl_stripe_count; /* over this many objects */ + uint32_t fl_object_size; /* until objects are this big, then move to + new objects */ + uint32_t fl_cas_hash; /* 0 = none; 1 = sha256 */ + + /* pg -> disk layout */ + uint32_t fl_object_stripe_unit; /* for per-object parity, if any */ + + /* object -> pg layout */ + uint32_t fl_pg_preferred; /* preferred primary for pg (-1 for none) */ + uint32_t fl_pg_pool; /* namespace, crush ruleset, rep level */ +} __attribute__ ((packed)); + +#endif /* ! __cplusplus */ + +struct UserPerm; +typedef struct UserPerm UserPerm; + +struct Inode; +typedef struct Inode Inode; + +struct ceph_mount_info; +struct ceph_dir_result; +struct CephContext; + +/* setattr mask bits */ +#ifndef CEPH_SETATTR_MODE +# define CEPH_SETATTR_MODE 1 +# define CEPH_SETATTR_UID 2 +# define CEPH_SETATTR_GID 4 +# define CEPH_SETATTR_MTIME 8 +# define CEPH_SETATTR_ATIME 16 +# define CEPH_SETATTR_SIZE 32 +# define CEPH_SETATTR_CTIME 64 +# define CEPH_SETATTR_MTIME_NOW 128 +# define CEPH_SETATTR_ATIME_NOW 256 +# define CEPH_SETATTR_BTIME 512 +#endif + +/* define error codes for the mount function*/ +# define CEPHFS_ERROR_MON_MAP_BUILD 1000 +# define CEPHFS_ERROR_NEW_CLIENT 1002 +# define CEPHFS_ERROR_MESSENGER_START 1003 + +/** + * Create a UserPerm credential object. + * + * Some calls (most notably, the ceph_ll_* ones), take a credential object + * that represents the credentials that the calling program is using. This + * function creates a new credential object for this purpose. Returns a + * pointer to the object, or NULL if it can't be allocated. 
+ * + * Note that the gidlist array is used directly and is not copied. It must + * remain valid over the lifetime of the created UserPerm object. + * + * @param uid uid to be used + * @param gid gid to be used + * @param ngids number of gids in supplemental grouplist + * @param gidlist array of gid_t's in the list of groups + */ +UserPerm *ceph_userperm_new(uid_t uid, gid_t gid, int ngids, gid_t *gidlist); + +/** + * Destroy a UserPerm credential object. + * + * @param perm pointer to object to be destroyed + * + * Currently this just frees the object. Note that the gidlist array is not + * freed. The caller must do so if it's necessary. + */ +void ceph_userperm_destroy(UserPerm *perm); + +/** + * Get a pointer to the default UserPerm object for the mount. + * + * @param cmount the mount info handle + * + * Every cmount has a default set of credentials. This returns a pointer to + * that object. + * + * Unlike with ceph_userperm_new, this object should not be freed. + */ +struct UserPerm *ceph_mount_perms(struct ceph_mount_info *cmount); + +/** + * Set cmount's default permissions + * + * @param cmount the mount info handle + * @param perm permissions to set to default for mount + * + * Every cmount has a default set of credentials. This does a deep copy of + * the given permissions to the ones in the cmount. Must be done after + * ceph_init but before ceph_mount. + * + * Returns 0 on success, and -EISCONN if the cmount is already mounted. + */ +int ceph_mount_perms_set(struct ceph_mount_info *cmount, UserPerm *perm); + +/** + * @defgroup libcephfs_h_init Setup and Teardown + * These are the first and last functions that should be called + * when using libcephfs. + * + * @{ + */ + +/** + * Get the version of libcephfs. + * + * The version number is major.minor.patch. 
+ * + * @param major where to store the major version number + * @param minor where to store the minor version number + * @param patch where to store the extra version number + */ +const char *ceph_version(int *major, int *minor, int *patch); + +/** + * Create a mount handle for interacting with Ceph. All libcephfs + * functions operate on a mount info handle. + * + * @param cmount the mount info handle to initialize + * @param id the id of the client. This can be a unique id that identifies + * this client, and will get appended onto "client.". Callers can + * pass in NULL, and the id will be the process id of the client. + * @returns 0 on success, negative error code on failure + */ +int ceph_create(struct ceph_mount_info **cmount, const char * const id); + +/** + * Create a mount handle from a CephContext, which holds the configuration + * for the ceph cluster. A CephContext can be acquired from an existing ceph_mount_info + * handle, using the @ref ceph_get_mount_context call. Note that using the same CephContext + * for two different mount handles results in the same client entity id being used. + * + * @param cmount the mount info handle to initialize + * @param conf reuse this pre-existing CephContext config + * @returns 0 on success, negative error code on failure + */ +int ceph_create_with_context(struct ceph_mount_info **cmount, struct CephContext *conf); + + +#ifndef VOIDPTR_RADOS_T +#define VOIDPTR_RADOS_T +typedef void *rados_t; +#endif // VOIDPTR_RADOS_T + +/** + * Create a mount handle from a rados_t, for using libcephfs in the + * same process as librados. 
+ * + * @param cmount the mount info handle to initialize + * @param cluster reference to already-initialized librados handle + * @returns 0 on success, negative error code on failure + */ +int ceph_create_from_rados(struct ceph_mount_info **cmount, rados_t cluster); + +/** + * Initialize the filesystem client (but do not mount the filesystem yet) + * + * @returns 0 on success, negative error code on failure + */ +int ceph_init(struct ceph_mount_info *cmount); + +/** + * Optionally set which filesystem to mount, before calling mount. + * + * An error will be returned if this libcephfs instance is already + * mounted. This function is an alternative to setting the global + * client_mds_namespace setting. Using this function enables multiple + * libcephfs instances in the same process to mount different filesystems. + * + * The filesystem name is *not* validated in this function. That happens + * during mount(), where an ENOENT error will result if a non-existent + * filesystem was specified here. + * + * @param cmount the mount info handle + * @returns 0 on success, negative error code on failure + */ +int ceph_select_filesystem(struct ceph_mount_info *cmount, const char *fs_name); + + +/** + * Perform a mount using the path for the root of the mount. + * + * It is optional to call ceph_init before this. If ceph_init has + * not already been called, it will be called in the course of this operation. + * + * @param cmount the mount info handle + * @param root the path for the root of the mount. This can be an existing + * directory within the ceph cluster, but most likely it will + * be "/". Passing in NULL is equivalent to "/". + * @returns 0 on success, negative error code on failure + */ +int ceph_mount(struct ceph_mount_info *cmount, const char *root); + +/** + * Return cluster ID for a mounted ceph filesystem + * + * Every ceph filesystem has a filesystem ID associated with it. This + * function returns that value. 
If the ceph_mount_info does not refer to a + * mounted filesystem, this returns a negative error code. + */ +int64_t ceph_get_fs_cid(struct ceph_mount_info *cmount); + +/** + * Execute a management command remotely on an MDS. + * + * Must have called ceph_init or ceph_mount before calling this. + * + * @param mds_spec string representing rank, MDS name, GID or '*' + * @param cmd array of null-terminated strings + * @param cmdlen length of cmd array + * @param inbuf non-null-terminated input data to command + * @param inbuflen length in octets of inbuf + * @param outbuf populated with pointer to buffer (command output data) + * @param outbuflen length of allocated outbuf + * @param outs populated with pointer to buffer (command error strings) + * @param outslen length of allocated outs + * + * @return 0 on success, negative error code on failure + * + */ +int ceph_mds_command(struct ceph_mount_info *cmount, + const char *mds_spec, + const char **cmd, + size_t cmdlen, + const char *inbuf, size_t inbuflen, + char **outbuf, size_t *outbuflen, + char **outs, size_t *outslen); + +/** + * Free a buffer, such as those used for output arrays from ceph_mds_command + */ +void ceph_buffer_free(char *buf); + +/** + * Unmount a mount handle. + * + * @param cmount the mount handle + * @return 0 on success, negative error code on failure + */ +int ceph_unmount(struct ceph_mount_info *cmount); + +/** + * Abort mds connections + * + * @param cmount the mount handle + * @return 0 on success, negative error code on failure + */ +int ceph_abort_conn(struct ceph_mount_info *cmount); + +/** + * Destroy the mount handle. + * + * The handle should not be mounted. This should be called on completion of + * all libcephfs functions. + * + * @param cmount the mount handle + * @return 0 on success, negative error code on failure. + */ +int ceph_release(struct ceph_mount_info *cmount); + +/** + * Deprecated. Unmount and destroy the ceph mount handle. 
This should be + * called on completion of all libcephfs functions. + * + * Equivalent to ceph_unmount() + ceph_release() without error handling. + * + * @param cmount the mount handle to shutdown + */ +void ceph_shutdown(struct ceph_mount_info *cmount); + +/** + * Get a global id for current instance + * + * The handle should not be mounted. This should be called on completion of + * all libcephfs functions. (NOTE(review): same wording as ceph_release(); looks copy-pasted - confirm.) + * + * @param cmount the mount handle + * @returns instance global id + */ +uint64_t ceph_get_instance_id(struct ceph_mount_info *cmount); + +/** + * Extract the CephContext from the mount point handle. + * + * @param cmount the ceph mount handle to get the context from. + * @returns the CephContext associated with the mount handle. + */ +struct CephContext *ceph_get_mount_context(struct ceph_mount_info *cmount); + +/** + * Check mount status. + * + * Return non-zero value if mounted. Otherwise, zero. + */ +int ceph_is_mounted(struct ceph_mount_info *cmount); + +/** @} init */ + +/** + * @defgroup libcephfs_h_config Config + * Functions for manipulating the Ceph configuration at runtime. + * + * @{ + */ + +/** + * Load the ceph configuration from the specified config file. + * + * @param cmount the mount handle to load the configuration into. + * @param path_list the configuration file path + * @returns 0 on success, negative error code on failure + */ +int ceph_conf_read_file(struct ceph_mount_info *cmount, const char *path_list); + +/** + * Parse the command line arguments and load the configuration parameters. + * + * @param cmount the mount handle to load the configuration parameters into. 
+ * @param argc count of the arguments in argv + * @param argv the argument list + * @returns 0 on success, negative error code on failure + */ +int ceph_conf_parse_argv(struct ceph_mount_info *cmount, int argc, const char **argv); + +/** + * Configure the cluster handle based on an environment variable + * + * The contents of the environment variable are parsed as if they were + * Ceph command line options. If var is NULL, the CEPH_ARGS + * environment variable is used. + * + * @pre ceph_mount() has not been called on the handle + * + * @note BUG: this is not threadsafe - it uses a static buffer + * + * @param cmount handle to configure + * @param var name of the environment variable to read + * @returns 0 on success, negative error code on failure + */ +int ceph_conf_parse_env(struct ceph_mount_info *cmount, const char *var); + +/** Sets a configuration value from a string. + * + * @param cmount the mount handle to set the configuration value on + * @param option the configuration option to set + * @param value the value of the configuration option to set + * + * @returns 0 on success, negative error code otherwise. + */ +int ceph_conf_set(struct ceph_mount_info *cmount, const char *option, const char *value); + +/** + * Gets the configuration value as a string. + * + * @param cmount the mount handle to get the configuration value from + * @param option the config option to get + * @param buf the buffer to fill with the value + * @param len the length of the buffer. + * @returns the size of the buffer filled in with the value, or negative error code on failure + */ +int ceph_conf_get(struct ceph_mount_info *cmount, const char *option, char *buf, size_t len); + +/** @} config */ + +/** + * @defgroup libcephfs_h_fsops File System Operations. + * Functions for getting/setting file system wide information specific to a particular + * mount handle. + * + * @{ + */ + +/** + * Perform a statfs on the ceph file system.
This call fills in file system wide statistics + * into the passed in buffer. + * + * @param cmount the ceph mount handle to use for performing the statfs. + * @param path can be any path within the mounted filesystem + * @param stbuf the file system statistics filled in by this function. + * @return 0 on success, negative error code otherwise. + */ +int ceph_statfs(struct ceph_mount_info *cmount, const char *path, struct statvfs *stbuf); + +/** + * Synchronize all filesystem data to persistent media. + * + * @param cmount the ceph mount handle to use for performing the sync_fs. + * @returns 0 on success or negative error code on failure. + */ +int ceph_sync_fs(struct ceph_mount_info *cmount); + +/** + * Get the current working directory. + * + * @param cmount the ceph mount to get the current working directory for. + * @returns the path to the current working directory + */ +const char* ceph_getcwd(struct ceph_mount_info *cmount); + +/** + * Change the current working directory. + * + * @param cmount the ceph mount to change the current working directory for. + * @param path the path to the working directory to change into. + * @returns 0 on success, negative error code otherwise. + */ +int ceph_chdir(struct ceph_mount_info *cmount, const char *path); + +/** @} fsops */ + +/** + * @defgroup libcephfs_h_dir Directory Operations. + * Functions for manipulating and listing directories. + * + * @{ + */ + +/** + * Open the given directory. + * + * @param cmount the ceph mount handle to use to open the directory + * @param name the path name of the directory to open. Must be either an absolute path + * or a path relative to the current working directory. + * @param dirpp the directory result pointer structure to fill in. + * @returns 0 on success or negative error code otherwise. + */ +int ceph_opendir(struct ceph_mount_info *cmount, const char *name, struct ceph_dir_result **dirpp); + +/** + * Close the open directory. 
+ * + * @param cmount the ceph mount handle to use for closing the directory + * @param dirp the directory result pointer (set by ceph_opendir) to close + * @returns 0 on success or negative error code on failure. + */ +int ceph_closedir(struct ceph_mount_info *cmount, struct ceph_dir_result *dirp); + +/** + * Get the next entry in an open directory. + * + * @param cmount the ceph mount handle to use for performing the readdir. + * @param dirp the directory stream pointer from an opendir holding the state of the + * next entry to return. + * @returns the next directory entry or NULL if at the end of the directory (or the directory + * is empty). This pointer should not be freed by the caller, and is only safe to + * access between return and the next call to ceph_readdir or ceph_closedir. + */ +struct dirent * ceph_readdir(struct ceph_mount_info *cmount, struct ceph_dir_result *dirp); + +/** + * A safe version of ceph_readdir, where the directory entry struct is allocated by the caller. + * + * @param cmount the ceph mount handle to use for performing the readdir. + * @param dirp the directory stream pointer from an opendir holding the state of the + * next entry to return. + * @param de the directory entry pointer filled in with the next directory entry of the dirp state. + * @returns 1 if the next entry was filled in, 0 if the end of the directory stream was reached, + * and a negative error code on failure. + */ +int ceph_readdir_r(struct ceph_mount_info *cmount, struct ceph_dir_result *dirp, struct dirent *de); + +/** + * A safe version of ceph_readdir that also returns the file statistics (readdir+stat). + * + * @param cmount the ceph mount handle to use for performing the readdir_plus_r. + * @param dirp the directory stream pointer from an opendir holding the state of the + * next entry to return. + * @param de the directory entry pointer filled in with the next directory entry of the dirp state.
+ * @param stx the stats of the file/directory of the entry returned + * @param want mask showing desired inode attrs for returned entry + * @param flags bitmask of flags to use when filling out attributes + * @param out optional returned Inode argument. If non-NULL, then a reference will be taken on + * the inode and the pointer set on success. + * @returns 1 if the next entry was filled in, 0 if the end of the directory stream was reached, + * and a negative error code on failure. + */ +int ceph_readdirplus_r(struct ceph_mount_info *cmount, struct ceph_dir_result *dirp, struct dirent *de, + struct ceph_statx *stx, unsigned want, unsigned flags, struct Inode **out); + +/** + * Gets multiple directory entries. + * + * @param cmount the ceph mount handle to use for performing the getdents. + * @param dirp the directory stream pointer from an opendir holding the state of the + * next entry/entries to return. + * @param name a buffer, used as an array of struct dirent, that gets filled in with the returned directory entries. + * @param buflen the length of the buffer, which should be the number of dirent structs * sizeof(struct dirent). + * @returns the length of the buffer that was filled in, will always be multiples of sizeof(struct dirent), or a + * negative error code. If the buffer is not large enough for a single entry, -ERANGE is returned. + */ +int ceph_getdents(struct ceph_mount_info *cmount, struct ceph_dir_result *dirp, char *name, int buflen); + +/** + * Gets multiple directory names. + * + * @param cmount the ceph mount handle to use for performing the getdents. + * @param dirp the directory stream pointer from an opendir holding the state of the + * next entry/entries to return. + * @param name a buffer to fill in with directory entry names. + * @param buflen the length of the buffer that can be filled in. + * @returns the length of the buffer filled in with entry names, or a negative error code on failure.
+ * If the buffer isn't large enough for a single entry, -ERANGE is returned. + */ +int ceph_getdnames(struct ceph_mount_info *cmount, struct ceph_dir_result *dirp, char *name, int buflen); + +/** + * Rewind the directory stream to the beginning of the directory. + * + * @param cmount the ceph mount handle to use for performing the rewinddir. + * @param dirp the directory stream pointer to rewind. + */ +void ceph_rewinddir(struct ceph_mount_info *cmount, struct ceph_dir_result *dirp); + +/** + * Get the current position of a directory stream. + * + * @param cmount the ceph mount handle to use for performing the telldir. + * @param dirp the directory stream pointer to get the current position of. + * @returns the position of the directory stream. Note that the offsets returned + * by ceph_telldir do not have a particular order (cannot be compared with + * inequality). + */ +int64_t ceph_telldir(struct ceph_mount_info *cmount, struct ceph_dir_result *dirp); + +/** + * Move the directory stream to a position specified by the given offset. + * + * @param cmount the ceph mount handle to use for performing the seekdir. + * @param dirp the directory stream pointer to move. + * @param offset the position to move the directory stream to. This offset should be + * a value returned by ceph_telldir. Note that this value does not refer to the nth + * entry in a directory, and can not be manipulated with plus or minus. + */ +void ceph_seekdir(struct ceph_mount_info *cmount, struct ceph_dir_result *dirp, int64_t offset); + +/** + * Create a directory. + * + * @param cmount the ceph mount handle to use for making the directory. + * @param path the path of the directory to create. This must be either an + * absolute path or a relative path off of the current working directory. + * @param mode the permissions the directory should have once created. + * @returns 0 on success or a negative return code on error.
+ */ +int ceph_mkdir(struct ceph_mount_info *cmount, const char *path, mode_t mode); + +/** + * Create multiple directories at once. + * + * @param cmount the ceph mount handle to use for making the directories. + * @param path the full path of directories and sub-directories that should + * be created. + * @param mode the permissions the directory should have once created. + * @returns 0 on success or a negative return code on error. + */ +int ceph_mkdirs(struct ceph_mount_info *cmount, const char *path, mode_t mode); + +/** + * Remove a directory. + * + * @param cmount the ceph mount handle to use for removing directories. + * @param path the path of the directory to remove. + * @returns 0 on success or a negative return code on error. + */ +int ceph_rmdir(struct ceph_mount_info *cmount, const char *path); + +/** @} dir */ + +/** + * @defgroup libcephfs_h_links Links and Link Handling. + * Functions for creating and manipulating hard links and symbolic links. + * + * @{ + */ + +/** + * Create a link. + * + * @param cmount the ceph mount handle to use for creating the link. + * @param existing the path to the existing file/directory to link to. + * @param newname the path to the new file/directory to link from. + * @returns 0 on success or a negative return code on error. + */ +int ceph_link(struct ceph_mount_info *cmount, const char *existing, const char *newname); + +/** + * Read a symbolic link. + * + * @param cmount the ceph mount handle to use for reading the link. + * @param path the path to the symlink to read + * @param buf the buffer to hold the path of the file that the symlink points to. + * @param size the length of the buffer + * @returns number of bytes copied on success or negative error code on failure + */ +int ceph_readlink(struct ceph_mount_info *cmount, const char *path, char *buf, int64_t size); + +/** + * Creates a symbolic link. + * + * @param cmount the ceph mount handle to use for creating the symbolic link.
+ * @param existing the path to the existing file/directory to link to. + * @param newname the path to the new file/directory to link from. + * @returns 0 on success or a negative return code on failure. + */ +int ceph_symlink(struct ceph_mount_info *cmount, const char *existing, const char *newname); + +/** @} links */ + +/** + * @defgroup libcephfs_h_files File manipulation and handling. + * Functions for creating and manipulating files. + * + * @{ + */ + +/** + * Removes a file, link, or symbolic link. If the file/link has multiple links to it, the + * file will not disappear from the namespace until all references to it are removed. + * + * @param cmount the ceph mount handle to use for performing the unlink. + * @param path the path of the file or link to unlink. + * @returns 0 on success or negative error code on failure. + */ +int ceph_unlink(struct ceph_mount_info *cmount, const char *path); + +/** + * Rename a file or directory. + * + * @param cmount the ceph mount handle to use for performing the rename. + * @param from the path to the existing file or directory. + * @param to the new name of the file or directory + * @returns 0 on success or negative error code on failure. + */ +int ceph_rename(struct ceph_mount_info *cmount, const char *from, const char *to); + +/** + * Get an open file's extended statistics and attributes. + * + * @param cmount the ceph mount handle to use for performing the stat. + * @param fd the file descriptor of the file to get statistics of. + * @param stx the ceph_statx struct that will be filled in with the file's statistics. + * @param want bitfield of CEPH_STATX_* flags showing desired attributes + * @param flags bitfield that can be used to set AT_* modifier flags (only AT_NO_ATTR_SYNC and AT_SYMLINK_NOFOLLOW) + * @returns 0 on success or negative error code on failure.
+ */ +int ceph_fstatx(struct ceph_mount_info *cmount, int fd, struct ceph_statx *stx, + unsigned int want, unsigned int flags); + +/** + * Get a file's extended statistics and attributes. + * + * @param cmount the ceph mount handle to use for performing the stat. + * @param path the file or directory to get the statistics of. + * @param stx the ceph_statx struct that will be filled in with the file's statistics. + * @param want bitfield of CEPH_STATX_* flags showing desired attributes + * @param flags bitfield that can be used to set AT_* modifier flags (only AT_NO_ATTR_SYNC and AT_SYMLINK_NOFOLLOW) + * @returns 0 on success or negative error code on failure. + */ +int ceph_statx(struct ceph_mount_info *cmount, const char *path, struct ceph_statx *stx, + unsigned int want, unsigned int flags); + +/** + * Get a file's statistics and attributes. + * + * @param cmount the ceph mount handle to use for performing the stat. + * @param path the file or directory to get the statistics of. + * @param stbuf the stat struct that will be filled in with the file's statistics. + * @returns 0 on success or negative error code on failure. + */ +int ceph_stat(struct ceph_mount_info *cmount, const char *path, struct stat *stbuf); + +/** + * Get a file's statistics and attributes, without following symlinks. + * + * @param cmount the ceph mount handle to use for performing the stat. + * @param path the file or directory to get the statistics of. + * @param stbuf the stat struct that will be filled in with the file's statistics. + * @returns 0 on success or negative error code on failure. + */ +int ceph_lstat(struct ceph_mount_info *cmount, const char *path, struct stat *stbuf); + +/** + * Get the open file's statistics. + * + * @param cmount the ceph mount handle to use for performing the fstat. + * @param fd the file descriptor of the file to get statistics of. + * @param stbuf the stat struct of the file's statistics, filled in by the + * function.
+ * @returns 0 on success or a negative error code on failure + */ +int ceph_fstat(struct ceph_mount_info *cmount, int fd, struct stat *stbuf); + +/** + * Set a file's attributes. + * + * @param cmount the ceph mount handle to use for performing the setattr. + * @param relpath the path to the file/directory to set the attributes of. + * @param stx the statx struct that must include attribute values to set on the file. + * @param mask a mask of all the CEPH_SETATTR_* values that have been set in the statx struct. + * @param flags mask of AT_* flags (only AT_SYMLINK_NOFOLLOW is respected for now) + * @returns 0 on success or negative error code on failure. + */ +int ceph_setattrx(struct ceph_mount_info *cmount, const char *relpath, struct ceph_statx *stx, int mask, int flags); + +/** + * Set a file's attributes (extended version). + * + * @param cmount the ceph mount handle to use for performing the setattr. + * @param fd the fd of the open file/directory to set the attributes of. + * @param stx the statx struct that must include attribute values to set on the file. + * @param mask a mask of all the CEPH_SETATTR_* values that have been set in the statx struct. + * @returns 0 on success or negative error code on failure. + */ +int ceph_fsetattrx(struct ceph_mount_info *cmount, int fd, struct ceph_statx *stx, int mask); + +/** + * Change the mode bits (permissions) of a file/directory. + * + * @param cmount the ceph mount handle to use for performing the chmod. + * @param path the path to the file/directory to change the mode bits on. + * @param mode the new permissions to set. + * @returns 0 on success or a negative error code on failure. + */ +int ceph_chmod(struct ceph_mount_info *cmount, const char *path, mode_t mode); + +/** + * Change the mode bits (permissions) of an open file. + * + * @param cmount the ceph mount handle to use for performing the chmod. + * @param fd the open file descriptor to change the mode bits on. + * @param mode the new permissions to set.
+ * @returns 0 on success or a negative error code on failure. + */ +int ceph_fchmod(struct ceph_mount_info *cmount, int fd, mode_t mode); + +/** + * Change the ownership of a file/directory. + * + * @param cmount the ceph mount handle to use for performing the chown. + * @param path the path of the file/directory to change the ownership of. + * @param uid the user id to set on the file/directory. + * @param gid the group id to set on the file/directory. + * @returns 0 on success or negative error code on failure. + */ +int ceph_chown(struct ceph_mount_info *cmount, const char *path, int uid, int gid); + +/** + * Change the ownership of a file from an open file descriptor. + * + * @param cmount the ceph mount handle to use for performing the chown. + * @param fd the fd of the open file/directory to change the ownership of. + * @param uid the user id to set on the file/directory. + * @param gid the group id to set on the file/directory. + * @returns 0 on success or negative error code on failure. + */ +int ceph_fchown(struct ceph_mount_info *cmount, int fd, int uid, int gid); + +/** + * Change the ownership of a file/directory, don't follow symlinks. + * + * @param cmount the ceph mount handle to use for performing the chown. + * @param path the path of the file/directory to change the ownership of. + * @param uid the user id to set on the file/directory. + * @param gid the group id to set on the file/directory. + * @returns 0 on success or negative error code on failure. + */ +int ceph_lchown(struct ceph_mount_info *cmount, const char *path, int uid, int gid); + +/** + * Change file/directory last access and modification times. + * + * @param cmount the ceph mount handle to use for performing the utime. + * @param path the path to the file/directory to set the time values of. + * @param buf holding the access and modification times to set on the file. + * @returns 0 on success or negative error code on failure. 
+ */ +int ceph_utime(struct ceph_mount_info *cmount, const char *path, struct utimbuf *buf); + +/** + * Change file/directory last access and modification times. + * + * @param cmount the ceph mount handle to use for performing the utime. + * @param fd the fd of the open file/directory to set the time values of. + * @param buf holding the access and modification times to set on the file. + * @returns 0 on success or negative error code on failure. + */ +int ceph_futime(struct ceph_mount_info *cmount, int fd, struct utimbuf *buf); + +/** + * Change file/directory last access and modification times. + * + * @param cmount the ceph mount handle to use for performing the utime. + * @param path the path to the file/directory to set the time values of. + * @param times holding the access and modification times to set on the file. + * @returns 0 on success or negative error code on failure. + */ +int ceph_utimes(struct ceph_mount_info *cmount, const char *path, struct timeval times[2]); + +/** + * Change file/directory last access and modification times, don't follow symlinks. + * + * @param cmount the ceph mount handle to use for performing the utime. + * @param path the path to the file/directory to set the time values of. + * @param times holding the access and modification times to set on the file. + * @returns 0 on success or negative error code on failure. + */ +int ceph_lutimes(struct ceph_mount_info *cmount, const char *path, struct timeval times[2]); + +/** + * Change file/directory last access and modification times. + * + * @param cmount the ceph mount handle to use for performing the utime. + * @param fd the fd of the open file/directory to set the time values of. + * @param times holding the access and modification times to set on the file. + * @returns 0 on success or negative error code on failure. + */ +int ceph_futimes(struct ceph_mount_info *cmount, int fd, struct timeval times[2]); + +/** + * Change file/directory last access and modification times. 
+ * + * @param cmount the ceph mount handle to use for performing the utime. + * @param fd the fd of the open file/directory to set the time values of. + * @param times holding the access and modification times to set on the file. + * @returns 0 on success or negative error code on failure. + */ +int ceph_futimens(struct ceph_mount_info *cmount, int fd, struct timespec times[2]); + +/** + * Apply or remove an advisory lock. + * + * @param cmount the ceph mount handle to use for performing the lock. + * @param fd the open file descriptor to change advisory lock. + * @param operation the advisory lock operation to be performed on the file + * descriptor among LOCK_SH (shared lock), LOCK_EX (exclusive lock), + * or LOCK_UN (remove lock). The LOCK_NB value can be ORed to perform a + * non-blocking operation. + * @param owner the user-supplied owner identifier (an arbitrary integer) + * @returns 0 on success or negative error code on failure. + */ +int ceph_flock(struct ceph_mount_info *cmount, int fd, int operation, + uint64_t owner); + +/** + * Truncate the file to the given size. If this operation causes the + * file to expand, the empty bytes will be filled in with zeros. + * + * @param cmount the ceph mount handle to use for performing the truncate. + * @param path the path to the file to truncate. + * @param size the new size of the file. + * @returns 0 on success or a negative error code on failure. + */ +int ceph_truncate(struct ceph_mount_info *cmount, const char *path, int64_t size); + +/** + * Make a block or character special file. + * + * @param cmount the ceph mount handle to use for performing the mknod. + * @param path the path to the special file. + * @param mode the permissions to use and the type of special file. The type can be + * one of S_IFREG, S_IFCHR, S_IFBLK, S_IFIFO. + * @param rdev If the file type is S_IFCHR or S_IFBLK then this parameter specifies the + * major and minor numbers of the newly created device special file. 
Otherwise, + * it is ignored. + * @returns 0 on success or negative error code on failure. + */ +int ceph_mknod(struct ceph_mount_info *cmount, const char *path, mode_t mode, dev_t rdev); +/** + * Create and/or open a file. + * + * @param cmount the ceph mount handle to use for performing the open. + * @param path the path of the file to open. If the flags parameter includes O_CREAT, + * the file will first be created before opening. + * @param flags a set of option masks that control how the file is created/opened. + * @param mode the permissions to place on the file if the file does not exist and O_CREAT + * is specified in the flags. + * @returns a non-negative file descriptor number on success or a negative error code on failure. + */ +int ceph_open(struct ceph_mount_info *cmount, const char *path, int flags, mode_t mode); + +/** + * Create and/or open a file with a specific file layout. + * + * @param cmount the ceph mount handle to use for performing the open. + * @param path the path of the file to open. If the flags parameter includes O_CREAT, + * the file will first be created before opening. + * @param flags a set of option masks that control how the file is created/opened. + * @param mode the permissions to place on the file if the file does not exist and O_CREAT + * is specified in the flags. + * @param stripe_unit the stripe unit size (optional, 0 for default) + * @param stripe_count the stripe count (optional, 0 for default) + * @param object_size the object size (optional, 0 for default) + * @param data_pool name of the target data pool (optional, NULL or empty string for default) + * @returns a non-negative file descriptor number on success or a negative error code on failure. + */ +int ceph_open_layout(struct ceph_mount_info *cmount, const char *path, int flags, + mode_t mode, int stripe_unit, int stripe_count, int object_size, + const char *data_pool); + +/** + * Close the open file.
+ * + * @param cmount the ceph mount handle to use for performing the close. + * @param fd the file descriptor referring to the open file. + * @returns 0 on success or a negative error code on failure. + */ +int ceph_close(struct ceph_mount_info *cmount, int fd); + +/** + * Reposition the open file stream based on the given offset. + * + * @param cmount the ceph mount handle to use for performing the lseek. + * @param fd the open file descriptor referring to the open file and holding the + * current position of the stream. + * @param offset the offset to set the stream to + * @param whence the flag to indicate what type of seeking to perform: + * SEEK_SET: the offset is set to the given offset in the file. + * SEEK_CUR: the offset is set to the current location plus @e offset bytes. + * SEEK_END: the offset is set to the end of the file plus @e offset bytes. + * @returns the resulting offset on success or a negative error code on failure + * (NOTE(review): the previous text said 0 on success; the int64_t return type + * suggests the POSIX lseek convention of returning the new offset -- confirm). + */ +int64_t ceph_lseek(struct ceph_mount_info *cmount, int fd, int64_t offset, int whence); +/** + * Read data from the file. + * + * @param cmount the ceph mount handle to use for performing the read. + * @param fd the file descriptor of the open file to read from. + * @param buf the buffer to read data into + * @param size the initial size of the buffer + * @param offset the offset in the file to read from. If this value is negative, the + * function reads from the current offset of the file descriptor. + * @returns the number of bytes read into buf, or a negative error code on failure. + */ +int ceph_read(struct ceph_mount_info *cmount, int fd, char *buf, int64_t size, int64_t offset); + +/** + * Read data from the file. + * @param cmount the ceph mount handle to use for performing the read. + * @param fd the file descriptor of the open file to read from. + * @param iov the iov structure to read data into + * @param iovcnt the number of items that iov includes + * @param offset the offset in the file to read from.
If this value is negative, the + * function reads from the current offset of the file descriptor. + * @returns the number of bytes read into buf, or a negative error code on failure. + */ +int ceph_preadv(struct ceph_mount_info *cmount, int fd, const struct iovec *iov, int iovcnt, + int64_t offset); + +/** + * Write data to a file. + * + * @param cmount the ceph mount handle to use for performing the write. + * @param fd the file descriptor of the open file to write to + * @param buf the bytes to write to the file + * @param size the size of the buf array + * @param offset the offset in the file to write to. If this value is negative, the + * function writes to the current offset of the file descriptor. + * @returns the number of bytes written, or a negative error code + */ +int ceph_write(struct ceph_mount_info *cmount, int fd, const char *buf, int64_t size, + int64_t offset); + +/** + * Write data to a file. + * + * @param cmount the ceph mount handle to use for performing the write. + * @param fd the file descriptor of the open file to write to + * @param iov the iov structure holding the data to write + * @param iovcnt the number of items that iov includes + * @param offset the offset in the file to write to. If this value is negative, the + * function writes to the current offset of the file descriptor. + * @returns the number of bytes written, or a negative error code + */ +int ceph_pwritev(struct ceph_mount_info *cmount, int fd, const struct iovec *iov, int iovcnt, + int64_t offset); + +/** + * Truncate a file to the given size. + * + * @param cmount the ceph mount handle to use for performing the ftruncate. + * @param fd the file descriptor of the file to truncate + * @param size the new size of the file + * @returns 0 on success or a negative error code on failure. + */ +int ceph_ftruncate(struct ceph_mount_info *cmount, int fd, int64_t size); + +/** + * Synchronize an open file to persistent media.
+ * + * @param cmount the ceph mount handle to use for performing the fsync. + * @param fd the file descriptor of the file to sync. + * @param syncdataonly a boolean whether to synchronize metadata and data (0) + * or just data (1). + * @return 0 on success or a negative error code on failure. + */ +int ceph_fsync(struct ceph_mount_info *cmount, int fd, int syncdataonly); + +/** + * Preallocate or release disk space for the file for the byte range. + * + * @param cmount the ceph mount handle to use for performing the fallocate. + * @param fd the file descriptor of the file to fallocate. + * @param mode the flags determines the operation to be performed on the given range. + * default operation (0) allocate and initialize to zero the file in the byte range, + * and the file size will be changed if offset + length is greater than + * the file size. if the FALLOC_FL_KEEP_SIZE flag is specified in the mode, + * the file size will not be changed. if the FALLOC_FL_PUNCH_HOLE flag is + * specified in the mode, the operation is deallocate space and zero the byte range. + * @param offset the byte range starting. + * @param length the length of the range. + * @return 0 on success or a negative error code on failure. + */ +int ceph_fallocate(struct ceph_mount_info *cmount, int fd, int mode, + int64_t offset, int64_t length); + +/** + * Enable/disable lazyio for the file. + * + * @param cmount the ceph mount handle to use for enabling/disabling lazyio. + * @param fd the file descriptor of the file to enable/disable lazyio on. + * @param enable a boolean to enable lazyio or disable lazyio. + * @returns 0 on success or a negative error code on failure. + */ +int ceph_lazyio(struct ceph_mount_info *cmount, int fd, int enable); + + +/** + * Flushes the write buffer for the file thereby propagating the buffered write to the file. + * + * @param cmount the ceph mount handle to use for performing the fsync. + * @param fd the file descriptor of the file to sync.
+ * @param offset presumably the starting offset of the byte range to propagate (the previous text described a boolean and was copied from ceph_lazyio) -- TODO confirm + * @param count presumably the length in bytes of the range to propagate -- TODO confirm + * @returns 0 on success or a negative error code on failure. + */ +int ceph_lazyio_propagate(struct ceph_mount_info *cmount, int fd, int64_t offset, size_t count); + + +/** + * Flushes the write buffer for the file and invalidates the read cache. This allows a subsequent read operation to read and cache data directly from the file and hence everyone's propagated writes would be visible. + * + * @param cmount the ceph mount handle to use for performing the fsync. + * @param fd the file descriptor of the file to sync. + * @param offset presumably the starting offset of the byte range to synchronize (the previous text described a boolean and was copied from ceph_lazyio) -- TODO confirm + * @param count presumably the length in bytes of the range to synchronize -- TODO confirm + * @returns 0 on success or a negative error code on failure. + */ +int ceph_lazyio_synchronize(struct ceph_mount_info *cmount, int fd, int64_t offset, size_t count); + +/** @} file */ + +/** + * @defgroup libcephfs_h_xattr Extended Attribute manipulation and handling. + * Functions for creating and manipulating extended attributes on files. + * + * @{ + */ + +/** + * Get an extended attribute. + * + * @param cmount the ceph mount handle to use for performing the getxattr. + * @param path the path to the file + * @param name the name of the extended attribute to get + * @param value a pre-allocated buffer to hold the xattr's value + * @param size the size of the pre-allocated buffer + * @returns the size of the value or a negative error code on failure. + */ +int ceph_getxattr(struct ceph_mount_info *cmount, const char *path, const char *name, + void *value, size_t size); + +/** + * Get an extended attribute. + * + * @param cmount the ceph mount handle to use for performing the getxattr. + * @param fd the open file descriptor referring to the file to get extended attribute from. + * @param name the name of the extended attribute to get + * @param value a pre-allocated buffer to hold the xattr's value + * @param size the size of the pre-allocated buffer + * @returns the size of the value or a negative error code on failure.
+ */ +int ceph_fgetxattr(struct ceph_mount_info *cmount, int fd, const char *name, + void *value, size_t size); + +/** + * Get an extended attribute without following symbolic links. This function is + * identical to ceph_getxattr, but if the path refers to a symbolic link, + * we get the extended attributes of the symlink itself rather than the attributes + * of the file it refers to. + * + * @param cmount the ceph mount handle to use for performing the lgetxattr. + * @param path the path to the file + * @param name the name of the extended attribute to get + * @param value a pre-allocated buffer to hold the xattr's value + * @param size the size of the pre-allocated buffer + * @returns the size of the value or a negative error code on failure. + */ +int ceph_lgetxattr(struct ceph_mount_info *cmount, const char *path, const char *name, + void *value, size_t size); + +/** + * List the extended attribute keys on a file. + * + * @param cmount the ceph mount handle to use for performing the listxattr. + * @param path the path to the file. + * @param list a buffer to be filled in with the list of extended attribute keys. + * @param size the size of the list buffer. + * @returns the size of the resulting list filled in. + */ +int ceph_listxattr(struct ceph_mount_info *cmount, const char *path, char *list, size_t size); + +/** + * List the extended attribute keys on an open file. + * + * @param cmount the ceph mount handle to use for performing the listxattr. + * @param fd the open file descriptor referring to the file to list extended attributes on. + * @param list a buffer to be filled in with the list of extended attribute keys. + * @param size the size of the list buffer. + * @returns the size of the resulting list filled in. + */ +int ceph_flistxattr(struct ceph_mount_info *cmount, int fd, char *list, size_t size); + +/** + * Get the list of extended attribute keys on a file, but do not follow symbolic links. 
+ * + * @param cmount the ceph mount handle to use for performing the llistxattr. + * @param path the path to the file. + * @param list a buffer to be filled in with the list of extended attributes keys. + * @param size the size of the list buffer. + * @returns the size of the resulting list filled in. + */ +int ceph_llistxattr(struct ceph_mount_info *cmount, const char *path, char *list, size_t size); + +/** + * Remove an extended attribute from a file. + * + * @param cmount the ceph mount handle to use for performing the removexattr. + * @param path the path to the file. + * @param name the name of the extended attribute to remove. + * @returns 0 on success or a negative error code on failure. + */ +int ceph_removexattr(struct ceph_mount_info *cmount, const char *path, const char *name); + +/** + * Remove an extended attribute from a file. + * + * @param cmount the ceph mount handle to use for performing the removexattr. + * @param fd the open file descriptor referring to the file to remove extended attribute from. + * @param name the name of the extended attribute to remove. + * @returns 0 on success or a negative error code on failure. + */ +int ceph_fremovexattr(struct ceph_mount_info *cmount, int fd, const char *name); + +/** + * Remove the extended attribute from a file, do not follow symbolic links. + * + * @param cmount the ceph mount handle to use for performing the lremovexattr. + * @param path the path to the file. + * @param name the name of the extended attribute to remove. + * @returns 0 on success or a negative error code on failure. + */ +int ceph_lremovexattr(struct ceph_mount_info *cmount, const char *path, const char *name); + +/** + * Set an extended attribute on a file. + * + * @param cmount the ceph mount handle to use for performing the setxattr. + * @param path the path to the file. + * @param name the name of the extended attribute to set. 
+ * @param value the bytes of the extended attribute value + * @param size the size of the extended attribute value + * @param flags the flags can be: + * CEPH_XATTR_CREATE: create the extended attribute. Must not exist. + * CEPH_XATTR_REPLACE: replace the extended attribute, Must already exist. + * @returns 0 on success or a negative error code on failure. + */ +int ceph_setxattr(struct ceph_mount_info *cmount, const char *path, const char *name, + const void *value, size_t size, int flags); + +/** + * Set an extended attribute on a file. + * + * @param cmount the ceph mount handle to use for performing the setxattr. + * @param fd the open file descriptor referring to the file to set extended attribute on. + * @param name the name of the extended attribute to set. + * @param value the bytes of the extended attribute value + * @param size the size of the extended attribute value + * @param flags the flags can be: + * CEPH_XATTR_CREATE: create the extended attribute. Must not exist. + * CEPH_XATTR_REPLACE: replace the extended attribute, Must already exist. + * @returns 0 on success or a negative error code on failure. + */ +int ceph_fsetxattr(struct ceph_mount_info *cmount, int fd, const char *name, + const void *value, size_t size, int flags); + +/** + * Set an extended attribute on a file, do not follow symbolic links. + * + * @param cmount the ceph mount handle to use for performing the lsetxattr. + * @param path the path to the file. + * @param name the name of the extended attribute to set. + * @param value the bytes of the extended attribute value + * @param size the size of the extended attribute value + * @param flags the flags can be: + * CEPH_XATTR_CREATE: create the extended attribute. Must not exist. + * CEPH_XATTR_REPLACE: replace the extended attribute, Must already exist. + * @returns 0 on success or a negative error code on failure. 
+ */ +int ceph_lsetxattr(struct ceph_mount_info *cmount, const char *path, const char *name, + const void *value, size_t size, int flags); + +/** @} xattr */ + +/** + * @defgroup libcephfs_h_filelayout Control File Layout. + * Functions for setting and getting the file layout of existing files. + * + * @{ + */ + +/** + * Get the file striping unit from an open file descriptor. + * + * @param cmount the ceph mount handle to use. + * @param fh the open file descriptor referring to the file to get the striping unit of. + * @returns the striping unit of the file or a negative error code on failure. + */ +int ceph_get_file_stripe_unit(struct ceph_mount_info *cmount, int fh); + +/** + * Get the file striping unit. + * + * @param cmount the ceph mount handle to use. + * @param path the path of the file/directory get the striping unit of. + * @returns the striping unit of the file or a negative error code on failure. + */ +int ceph_get_path_stripe_unit(struct ceph_mount_info *cmount, const char *path); + +/** + * Get the file striping count from an open file descriptor. + * + * @param cmount the ceph mount handle to use. + * @param fh the open file descriptor referring to the file to get the striping count of. + * @returns the striping count of the file or a negative error code on failure. + */ +int ceph_get_file_stripe_count(struct ceph_mount_info *cmount, int fh); + +/** + * Get the file striping count. + * + * @param cmount the ceph mount handle to use. + * @param path the path of the file/directory get the striping count of. + * @returns the striping count of the file or a negative error code on failure. + */ +int ceph_get_path_stripe_count(struct ceph_mount_info *cmount, const char *path); + +/** + * Get the file object size from an open file descriptor. + * + * @param cmount the ceph mount handle to use. + * @param fh the open file descriptor referring to the file to get the object size of. + * @returns the object size of the file or a negative error code on failure. 
+ */ +int ceph_get_file_object_size(struct ceph_mount_info *cmount, int fh); + +/** + * Get the file object size. + * + * @param cmount the ceph mount handle to use. + * @param path the path of the file/directory get the object size of. + * @returns the object size of the file or a negative error code on failure. + */ +int ceph_get_path_object_size(struct ceph_mount_info *cmount, const char *path); + +/** + * Get the file pool information from an open file descriptor. + * + * @param cmount the ceph mount handle to use. + * @param fh the open file descriptor referring to the file to get the pool information of. + * @returns the ceph pool id that the file is in + */ +int ceph_get_file_pool(struct ceph_mount_info *cmount, int fh); + +/** + * Get the file pool information. + * + * @param cmount the ceph mount handle to use. + * @param path the path of the file/directory get the pool information of. + * @returns the ceph pool id that the file is in + */ +int ceph_get_path_pool(struct ceph_mount_info *cmount, const char *path); + +/** + * Get the name of the pool a opened file is stored in, + * + * Write the name of the file's pool to the buffer. If buflen is 0, return + * a suggested length for the buffer. + * + * @param cmount the ceph mount handle to use. + * @param fh the open file descriptor referring to the file + * @param buf buffer to store the name in + * @param buflen size of the buffer + * @returns length in bytes of the pool name, or -ERANGE if the buffer is not large enough. + */ +int ceph_get_file_pool_name(struct ceph_mount_info *cmount, int fh, char *buf, size_t buflen); + +/** + * get the name of a pool by id + * + * Given a pool's numeric identifier, get the pool's alphanumeric name. 
+ * + * @param cmount the ceph mount handle to use + * @param pool the numeric pool id + * @param buf buffer to store the name in + * @param buflen size of the buffer + * @returns length in bytes of the pool name, or -ERANGE if the buffer is not large enough + */ +int ceph_get_pool_name(struct ceph_mount_info *cmount, int pool, char *buf, size_t buflen); + +/** + * Get the name of the pool a file is stored in + * + * Write the name of the file's pool to the buffer. If buflen is 0, return + * a suggested length for the buffer. + * + * @param cmount the ceph mount handle to use. + * @param path the path of the file/directory + * @param buf buffer to store the name in + * @param buflen size of the buffer + * @returns length in bytes of the pool name, or -ERANGE if the buffer is not large enough. + */ +int ceph_get_path_pool_name(struct ceph_mount_info *cmount, const char *path, char *buf, size_t buflen); + +/** + * Get the default pool name of cephfs + * Write the name of the default pool to the buffer. If buflen is 0, return + * a suggested length for the buffer. + * @param cmount the ceph mount handle to use. + * @param buf buffer to store the name in + * @param buflen size of the buffer + * @returns length in bytes of the pool name, or -ERANGE if the buffer is not large enough. + */ +int ceph_get_default_data_pool_name(struct ceph_mount_info *cmount, char *buf, size_t buflen); + +/** + * Get the file layout from an open file descriptor. + * + * @param cmount the ceph mount handle to use. + * @param fh the open file descriptor referring to the file to get the layout of. + * @param stripe_unit where to store the striping unit of the file + * @param stripe_count where to store the striping count of the file + * @param object_size where to store the object size of the file + * @param pg_pool where to store the ceph pool id that the file is in + * @returns 0 on success or a negative error code on failure. 
+ */ +int ceph_get_file_layout(struct ceph_mount_info *cmount, int fh, int *stripe_unit, int *stripe_count, int *object_size, int *pg_pool); + +/** + * Get the file layout. + * + * @param cmount the ceph mount handle to use. + * @param path the path of the file/directory get the layout of. + * @param stripe_unit where to store the striping unit of the file + * @param stripe_count where to store the striping count of the file + * @param object_size where to store the object size of the file + * @param pg_pool where to store the ceph pool id that the file is in + * @returns 0 on success or a negative error code on failure. + */ +int ceph_get_path_layout(struct ceph_mount_info *cmount, const char *path, int *stripe_unit, int *stripe_count, int *object_size, int *pg_pool); + +/** + * Get the file replication information from an open file descriptor. + * + * @param cmount the ceph mount handle to use. + * @param fh the open file descriptor referring to the file to get the replication information of. + * @returns the replication factor of the file. + */ +int ceph_get_file_replication(struct ceph_mount_info *cmount, int fh); + +/** + * Get the file replication information. + * + * @param cmount the ceph mount handle to use. + * @param path the path of the file/directory get the replication information of. + * @returns the replication factor of the file. + */ +int ceph_get_path_replication(struct ceph_mount_info *cmount, const char *path); + +/** + * Get the id of the named pool. + * + * @param cmount the ceph mount handle to use. + * @param pool_name the name of the pool. + * @returns the pool id, or a negative error code on failure. + */ +int ceph_get_pool_id(struct ceph_mount_info *cmount, const char *pool_name); + +/** + * Get the pool replication factor. + * + * @param cmount the ceph mount handle to use. + * @param pool_id the pool id to look up + * @returns the replication factor, or a negative error code on failure. 
+ */ +int ceph_get_pool_replication(struct ceph_mount_info *cmount, int pool_id); + +/** + * Get the OSD address where the primary copy of a file stripe is located. + * + * @param cmount the ceph mount handle to use. + * @param fd the open file descriptor referring to the file to get the stripe address of. + * @param offset the offset into the file to specify the stripe. The offset can be + * anywhere within the stripe unit. + * @param addr the address of the OSD holding that stripe + * @param naddr the capacity of the address passed in. + * @returns the size of the address filled into the @e addr parameter, or a negative + * error code on failure. + */ +int ceph_get_file_stripe_address(struct ceph_mount_info *cmount, int fd, int64_t offset, + struct sockaddr_storage *addr, int naddr); + +/** + * Get the list of OSDs where the objects containing a file offset are located. + * + * @param cmount the ceph mount handle to use. + * @param fd the open file descriptor referring to the file. + * @param offset the offset within the file. + * @param length return the number of bytes between the offset and the end of + * the stripe unit (optional). + * @param osds an integer array to hold the OSD ids. + * @param nosds the size of the integer array. + * @returns the number of items stored in the output array, or -ERANGE if the + * array is not large enough. + */ +int ceph_get_file_extent_osds(struct ceph_mount_info *cmount, int fd, + int64_t offset, int64_t *length, int *osds, int nosds); + +/** + * Get the fully qualified CRUSH location of an OSD. + * + * Returns (type, name) string pairs for each device in the CRUSH bucket + * hierarchy starting from the given osd to the root. Each pair element is + * separated by a NUL character. + * + * @param cmount the ceph mount handle to use. + * @param osd the OSD id. + * @param path buffer to store location. + * @param len size of buffer. 
+ * @returns the number of bytes written into the buffer, or -ERANGE if the + * buffer is not large enough. + */ +int ceph_get_osd_crush_location(struct ceph_mount_info *cmount, + int osd, char *path, size_t len); + +/** + * Get the network address of an OSD. + * + * @param cmount the ceph mount handle. + * @param osd the OSD id. + * @param addr the OSD network address. + * @returns zero on success, otherwise a negative error code. + */ +int ceph_get_osd_addr(struct ceph_mount_info *cmount, int osd, + struct sockaddr_storage *addr); + +/** + * Get the file layout stripe unit granularity. + * @param cmount the ceph mount handle. + * @returns the stripe unit granularity or a negative error code on failure. + */ +int ceph_get_stripe_unit_granularity(struct ceph_mount_info *cmount); + +/** @} filelayout */ + +/** + * No longer available. Do not use. + * These functions will return -EOPNOTSUPP. + */ +int ceph_set_default_file_stripe_unit(struct ceph_mount_info *cmount, int stripe); +int ceph_set_default_file_stripe_count(struct ceph_mount_info *cmount, int count); +int ceph_set_default_object_size(struct ceph_mount_info *cmount, int size); +int ceph_set_default_preferred_pg(struct ceph_mount_info *cmount, int osd); +int ceph_set_default_file_replication(struct ceph_mount_info *cmount, int replication); + +/** + * Read from local replicas when possible. + * + * @param cmount the ceph mount handle to use. + * @param val a boolean to set (1) or clear (0) the option to favor local objects + * for reads. + * @returns 0 + */ +int ceph_localize_reads(struct ceph_mount_info *cmount, int val); + +/** + * Get the osd id of the local osd (if any) + * + * @param cmount the ceph mount handle to use. + * @returns the osd (if any) local to the node where this call is made, otherwise + * -1 is returned. + */ +int ceph_get_local_osd(struct ceph_mount_info *cmount); + +/** @} default_filelayout */ + +/** + * Get the capabilities currently issued to the client. 
+ * + * @param cmount the ceph mount handle to use. + * @param fd the file descriptor to get issued + * @returns the current capabilities issued to this client + * for the open file + */ +int ceph_debug_get_fd_caps(struct ceph_mount_info *cmount, int fd); + +/** + * Get the capabilities currently issued to the client. + * + * @param cmount the ceph mount handle to use. + * @param path the path to the file + * @returns the current capabilities issued to this client + * for the file + */ +int ceph_debug_get_file_caps(struct ceph_mount_info *cmount, const char *path); + +/* Low Level */ +struct Inode *ceph_ll_get_inode(struct ceph_mount_info *cmount, + vinodeno_t vino); +int ceph_ll_lookup_inode( + struct ceph_mount_info *cmount, + struct inodeno_t ino, + Inode **inode); + +/** + * Get the root inode of FS. Increase counter of references for root Inode. You must call ceph_ll_forget for it! + * + * @param cmount the ceph mount handle to use. + * @param parent pointer to pointer to Inode struct. 
Pointer to root inode will be returned + * @returns 0 if all good + */ +int ceph_ll_lookup_root(struct ceph_mount_info *cmount, + Inode **parent); +int ceph_ll_lookup(struct ceph_mount_info *cmount, Inode *parent, + const char *name, Inode **out, struct ceph_statx *stx, + unsigned want, unsigned flags, const UserPerm *perms); +int ceph_ll_put(struct ceph_mount_info *cmount, struct Inode *in); +int ceph_ll_forget(struct ceph_mount_info *cmount, struct Inode *in, + int count); +int ceph_ll_walk(struct ceph_mount_info *cmount, const char* name, Inode **i, + struct ceph_statx *stx, unsigned int want, unsigned int flags, + const UserPerm *perms); +int ceph_ll_getattr(struct ceph_mount_info *cmount, struct Inode *in, + struct ceph_statx *stx, unsigned int want, unsigned int flags, + const UserPerm *perms); +int ceph_ll_setattr(struct ceph_mount_info *cmount, struct Inode *in, + struct ceph_statx *stx, int mask, const UserPerm *perms); +int ceph_ll_open(struct ceph_mount_info *cmount, struct Inode *in, int flags, + struct Fh **fh, const UserPerm *perms); +off_t ceph_ll_lseek(struct ceph_mount_info *cmount, struct Fh* filehandle, + off_t offset, int whence); +int ceph_ll_read(struct ceph_mount_info *cmount, struct Fh* filehandle, + int64_t off, uint64_t len, char* buf); +int ceph_ll_fsync(struct ceph_mount_info *cmount, struct Fh *fh, + int syncdataonly); +int ceph_ll_sync_inode(struct ceph_mount_info *cmount, struct Inode *in, + int syncdataonly); +int ceph_ll_fallocate(struct ceph_mount_info *cmount, struct Fh *fh, + int mode, int64_t offset, int64_t length); +int ceph_ll_write(struct ceph_mount_info *cmount, struct Fh* filehandle, + int64_t off, uint64_t len, const char *data); +int64_t ceph_ll_readv(struct ceph_mount_info *cmount, struct Fh *fh, + const struct iovec *iov, int iovcnt, int64_t off); +int64_t ceph_ll_writev(struct ceph_mount_info *cmount, struct Fh *fh, + const struct iovec *iov, int iovcnt, int64_t off); +int ceph_ll_close(struct ceph_mount_info *cmount, 
struct Fh* filehandle); +int ceph_ll_iclose(struct ceph_mount_info *cmount, struct Inode *in, int mode); +/** + * Get xattr value by xattr name. + * + * @param cmount the ceph mount handle to use. + * @param in file handle + * @param name name of attribute + * @param value pointer to begin buffer + * @param size buffer size + * @param perms pointer to UserPerms object + * @returns size of returned buffer. Negative number in error case + */ +int ceph_ll_getxattr(struct ceph_mount_info *cmount, struct Inode *in, + const char *name, void *value, size_t size, + const UserPerm *perms); +int ceph_ll_setxattr(struct ceph_mount_info *cmount, struct Inode *in, + const char *name, const void *value, size_t size, + int flags, const UserPerm *perms); +int ceph_ll_listxattr(struct ceph_mount_info *cmount, struct Inode *in, + char *list, size_t buf_size, size_t *list_size, + const UserPerm *perms); +int ceph_ll_removexattr(struct ceph_mount_info *cmount, struct Inode *in, + const char *name, const UserPerm *perms); +int ceph_ll_create(struct ceph_mount_info *cmount, Inode *parent, + const char *name, mode_t mode, int oflags, Inode **outp, + Fh **fhp, struct ceph_statx *stx, unsigned want, + unsigned lflags, const UserPerm *perms); +int ceph_ll_mknod(struct ceph_mount_info *cmount, Inode *parent, + const char *name, mode_t mode, dev_t rdev, Inode **out, + struct ceph_statx *stx, unsigned want, unsigned flags, + const UserPerm *perms); +int ceph_ll_mkdir(struct ceph_mount_info *cmount, Inode *parent, + const char *name, mode_t mode, Inode **out, + struct ceph_statx *stx, unsigned want, + unsigned flags, const UserPerm *perms); +int ceph_ll_link(struct ceph_mount_info *cmount, struct Inode *in, + struct Inode *newparent, const char *name, + const UserPerm *perms); +int ceph_ll_opendir(struct ceph_mount_info *cmount, struct Inode *in, + struct ceph_dir_result **dirpp, const UserPerm *perms); +int ceph_ll_releasedir(struct ceph_mount_info *cmount, + struct ceph_dir_result* dir); +int 
ceph_ll_rename(struct ceph_mount_info *cmount, struct Inode *parent, + const char *name, struct Inode *newparent, + const char *newname, const UserPerm *perms); +int ceph_ll_unlink(struct ceph_mount_info *cmount, struct Inode *in, + const char *name, const UserPerm *perms); +int ceph_ll_statfs(struct ceph_mount_info *cmount, struct Inode *in, + struct statvfs *stbuf); +int ceph_ll_readlink(struct ceph_mount_info *cmount, struct Inode *in, + char *buf, size_t bufsize, const UserPerm *perms); +int ceph_ll_symlink(struct ceph_mount_info *cmount, + Inode *in, const char *name, const char *value, + Inode **out, struct ceph_statx *stx, + unsigned want, unsigned flags, + const UserPerm *perms); +int ceph_ll_rmdir(struct ceph_mount_info *cmount, struct Inode *in, + const char *name, const UserPerm *perms); +uint32_t ceph_ll_stripe_unit(struct ceph_mount_info *cmount, + struct Inode *in); +uint32_t ceph_ll_file_layout(struct ceph_mount_info *cmount, + struct Inode *in, + struct ceph_file_layout *layout); +uint64_t ceph_ll_snap_seq(struct ceph_mount_info *cmount, + struct Inode *in); +int ceph_ll_get_stripe_osd(struct ceph_mount_info *cmount, + struct Inode *in, + uint64_t blockno, + struct ceph_file_layout* layout); +int ceph_ll_num_osds(struct ceph_mount_info *cmount); +int ceph_ll_osdaddr(struct ceph_mount_info *cmount, + int osd, uint32_t *addr); +uint64_t ceph_ll_get_internal_offset(struct ceph_mount_info *cmount, + struct Inode *in, uint64_t blockno); +int ceph_ll_read_block(struct ceph_mount_info *cmount, + struct Inode *in, uint64_t blockid, + char* bl, uint64_t offset, uint64_t length, + struct ceph_file_layout* layout); +int ceph_ll_write_block(struct ceph_mount_info *cmount, + struct Inode *in, uint64_t blockid, + char* buf, uint64_t offset, + uint64_t length, struct ceph_file_layout* layout, + uint64_t snapseq, uint32_t sync); +int ceph_ll_commit_blocks(struct ceph_mount_info *cmount, + struct Inode *in, uint64_t offset, uint64_t range); + + +int 
ceph_ll_getlk(struct ceph_mount_info *cmount, + Fh *fh, struct flock *fl, uint64_t owner); +int ceph_ll_setlk(struct ceph_mount_info *cmount, + Fh *fh, struct flock *fl, uint64_t owner, int sleep); + +int ceph_ll_lazyio(struct ceph_mount_info *cmount, Fh *fh, int enable); + +/* + * Delegation support + * + * Delegations are a way for an application to request exclusive or + * semi-exclusive access to an Inode. The client requests the delegation and + * if it's successful it can reliably cache file data and metadata until the + * delegation is recalled. + * + * Recalls are issued via a callback function, provided by the application. + * Callback functions should act something like signal handlers. You want to + * do as little as possible in the callback. Any major work should be deferred + * in some fashion as it's difficult to predict the context in which this + * function will be called. + * + * Once the delegation has been recalled, the application should return it as + * soon as possible. The application has client_deleg_timeout seconds to + * return it, after which the cmount structure is forcibly unmounted and + * further calls into it fail. + * + * The application can set the client_deleg_timeout config option to suit its + * needs, but it should take care to choose a value that allows it to avoid + * forcible eviction from the cluster in the event of an application bug. + */ + +/* Commands for manipulating delegation state */ +#ifndef CEPH_DELEGATION_NONE +# define CEPH_DELEGATION_NONE 0 +# define CEPH_DELEGATION_RD 1 +# define CEPH_DELEGATION_WR 2 +#endif + +/** + * Get the amount of time that the client has to return caps + * @param cmount the ceph mount handle to use. + * + * In the event that a client does not return its caps, the MDS may blacklist + * it after this timeout. Applications should check this value and ensure + * that they set the delegation timeout to a value lower than this. 
+ * + * This call returns the cap return timeout (in seconds) for this cmount, or + * zero if it's not mounted. + */ +uint32_t ceph_get_cap_return_timeout(struct ceph_mount_info *cmount); + +/** + * Set the delegation timeout for the mount (thereby enabling delegations) + * @param cmount the ceph mount handle to use. + * @param timeout the delegation timeout (in seconds) + * + * Since the client could end up blacklisted if it doesn't return delegations + * in time, we mandate that any application wanting to use delegations + * explicitly set the timeout beforehand. Until this call is done on the + * mount, attempts to set a delegation will return -ETIME. + * + * Once a delegation is recalled, if it is not returned in this amount of + * time, the cmount will be forcibly unmounted and further access attempts + * will fail (usually with -ENOTCONN errors). + * + * This value is further vetted against the cap return timeout, and this call + * can fail with -EINVAL if the timeout value is too long. Delegations can be + * disabled again by setting the timeout to 0. + */ +int ceph_set_deleg_timeout(struct ceph_mount_info *cmount, uint32_t timeout); + +/** + * Request a delegation on an open Fh + * @param cmount the ceph mount handle to use. + * @param fh file handle + * @param cmd CEPH_DELEGATION_* command + * @param cb callback function for recalling delegation + * @param priv opaque token passed back during recalls + * + * Returns 0 if the delegation was granted, -EAGAIN if there was a conflict + * and other error codes if there is a fatal error of some sort (e.g. -ENOMEM, + * -ETIME) + */ +int ceph_ll_delegation(struct ceph_mount_info *cmount, Fh *fh, + unsigned int cmd, ceph_deleg_cb_t cb, void *priv); + +mode_t ceph_umask(struct ceph_mount_info *cmount, mode_t mode); + +/* state reclaim */ +#define CEPH_RECLAIM_RESET 1 + +/** + * Set ceph client uuid + * @param cmount the ceph mount handle to use. + * @param uuid the uuid to set + * + * Must be called before mount. 
+ */ +void ceph_set_uuid(struct ceph_mount_info *cmount, const char *uuid); + +/** + * Set ceph client session timeout + * @param cmount the ceph mount handle to use. + * @param timeout the timeout to set + * + * Must be called before mount. + */ +void ceph_set_session_timeout(struct ceph_mount_info *cmount, unsigned timeout); + +/** + * Start to reclaim states of other client + * @param cmount the ceph mount handle to use. + * @param uuid uuid of client whose states need to be reclaimed + * @param flags flags that control how states get reclaimed + * + * Returns 0 on success, -EOPNOTSUPP if mds does not support the operation, + * -ENOENT if CEPH_RECLAIM_RESET is specified and there is no client + * with the given uuid, -ENOTRECOVERABLE in all other error cases. + */ +int ceph_start_reclaim(struct ceph_mount_info *cmount, + const char *uuid, unsigned flags); + +/** + * Finish reclaiming states of other client + * @param cmount the ceph mount handle to use. + */ +void ceph_finish_reclaim(struct ceph_mount_info *cmount); + +/** + * Register a set of callbacks to be used with this cmount + * @param cmount the ceph mount handle on which the cb's should be registered + * @param args callback arguments to register with the cmount + * + * Any fields set to NULL will be ignored. There currently is no way to + * unregister these callbacks, so this is a one-way change. + */ +void ceph_ll_register_callbacks(struct ceph_mount_info *cmount, + struct ceph_client_callback_args *args); +#ifdef __cplusplus +} +#endif + +#endif diff --git a/src/include/cmp.h b/src/include/cmp.h new file mode 100644 index 00000000..79372fde --- /dev/null +++ b/src/include/cmp.h @@ -0,0 +1,205 @@ +#ifndef __CEPH_CMP_H +#define __CEPH_CMP_H + +/* + * macros to define comparison operators for classes with small numbers of members. 
+ */ + +#define WRITE_EQ_OPERATORS_1(type, a) \ + inline bool operator==(const type &l, const type &r) { \ + return l.a == r.a; \ + } \ + inline bool operator!=(const type &l, const type &r) { \ + return l.a != r.a; \ + } + +#define WRITE_CMP_OPERATORS_1(type, a) \ + inline bool operator>(const type &l, const type &r) { \ + return l.a > r.a; \ + } \ + inline bool operator<(const type &l, const type &r) { \ + return l.a < r.a; \ + } \ + inline bool operator>=(const type &l, const type &r) { \ + return l.a >= r.a; \ + } \ + inline bool operator<=(const type &l, const type &r) { \ + return l.a <= r.a; \ + } + +#define WRITE_EQ_OPERATORS_2(type, a, b) \ + inline bool operator==(const type &l, const type &r) { \ + return l.a == r.a && l.b == r.b; \ + } \ + inline bool operator!=(const type &l, const type &r) { \ + return l.a != r.a || l.b != r.b; \ + } + +#define WRITE_CMP_OPERATORS_2(type, a, b) \ + inline bool operator>(const type &l, const type &r) { \ + return l.a > r.a || \ + (l.a == r.a && (l.b > r.b)); \ + } \ + inline bool operator<(const type &l, const type &r) { \ + return l.a < r.a || \ + (l.a == r.a && (l.b < r.b)); \ + } \ + inline bool operator>=(const type &l, const type &r) { \ + return l.a > r.a || \ + (l.a == r.a && (l.b >= r.b)); \ + } \ + inline bool operator<=(const type &l, const type &r) { \ + return l.a < r.a || \ + (l.a == r.a && (l.b <= r.b)); \ + } + + +#define WRITE_EQ_OPERATORS_3(type, a, b, c) \ + inline bool operator==(const type &l, const type &r) { \ + return l.a == r.a && l.b == r.b && l.c == r.c; \ + } \ + inline bool operator!=(const type &l, const type &r) { \ + return l.a != r.a || l.b != r.b || l.c != r.c; \ + } + +#define WRITE_CMP_OPERATORS_3(type, a, b, c) \ + inline bool operator>(const type &l, const type &r) { \ + return l.a > r.a || \ + (l.a == r.a && (l.b > r.b || \ + (l.b == r.b && (l.c > r.c)))); \ + } \ + inline bool operator<(const type &l, const type &r) { \ + return l.a < r.a || \ + (l.a == r.a && (l.b < r.b || \ + 
(l.b == r.b && (l.c < r.c)))); \ + } \ + inline bool operator>=(const type &l, const type &r) { \ + return l.a > r.a || \ + (l.a == r.a && (l.b > r.b || \ + (l.b == r.b && (l.c >= r.c)))); \ + } \ + inline bool operator<=(const type &l, const type &r) { \ + return l.a < r.a || \ + (l.a == r.a && (l.b < r.b || \ + (l.b == r.b && (l.c <= r.c)))); \ + } + +#define WRITE_EQ_OPERATORS_4(type, a, b, c, d) \ + inline bool operator==(const type &l, const type &r) { \ + return l.a == r.a && l.b == r.b && l.c == r.c && l.d == r.d; \ + } \ + inline bool operator!=(const type &l, const type &r) { \ + return l.a != r.a || l.b != r.b || l.c != r.c || l.d != r.d; \ + } + +#define WRITE_CMP_OPERATORS_4(type, a, b, c, d) \ + inline bool operator>(const type &l, const type &r) { \ + return l.a > r.a || \ + (l.a == r.a && (l.b > r.b || \ + (l.b == r.b && (l.c > r.c || \ + (l.c == r.c && (l.d > r.d)))))); \ + } \ + inline bool operator<(const type &l, const type &r) { \ + return l.a < r.a || \ + (l.a == r.a && (l.b < r.b || \ + (l.b == r.b && (l.c < r.c || \ + (l.c == r.c && (l.d < r.d)))))); \ + } \ + inline bool operator>=(const type &l, const type &r) { \ + return l.a > r.a || \ + (l.a == r.a && (l.b > r.b || \ + (l.b == r.b && (l.c > r.c || \ + (l.c == r.c && (l.d >= r.d)))))); \ + } \ + inline bool operator<=(const type &l, const type &r) { \ + return l.a < r.a || \ + (l.a == r.a && (l.b < r.b || \ + (l.b == r.b && (l.c < r.c || \ + (l.c == r.c && (l.d <= r.d)))))); \ + } + + + +#define WRITE_EQ_OPERATORS_5(type, a, b, c, d, e) \ + inline bool operator==(const type &l, const type &r) { \ + return l.a == r.a && l.b == r.b && l.c == r.c && l.d == r.d && l.e == r.e; \ + } \ + inline bool operator!=(const type &l, const type &r) { \ + return l.a != r.a || l.b != r.b || l.c != r.c || l.d != r.d || l.e != r.e; \ + } + +#define WRITE_CMP_OPERATORS_5(type, a, b, c, d, e) \ + inline bool operator>(const type &l, const type &r) { \ + return l.a > r.a || \ + (l.a == r.a && (l.b > r.b || \ + 
(l.b == r.b && (l.c > r.c || \ + (l.c == r.c && (l.d > r.d || \ + (l.d == r.d && l.e > r.e))))))); \ + } \ + inline bool operator<(const type &l, const type &r) { \ + return l.a < r.a || \ + (l.a == r.a && (l.b < r.b || \ + (l.b == r.b && (l.c < r.c || \ + (l.c == r.c && (l.d < r.d || \ + (l.d == r.d && (l.e < r.e)))))))); \ + } \ + inline bool operator>=(const type &l, const type &r) { \ + return l.a > r.a || \ + (l.a == r.a && (l.b > r.b || \ + (l.b == r.b && (l.c > r.c || \ + (l.c == r.c && (l.d > r.d || \ + (l.d == r.d && l.e >= r.e))))))); \ + } \ + inline bool operator<=(const type &l, const type &r) { \ + return l.a < r.a || \ + (l.a == r.a && (l.b < r.b || \ + (l.b == r.b && (l.c < r.c || \ + (l.c == r.c && (l.d < r.d || \ + (l.d == r.d && l.e <= r.e))))))); \ + } + +#define WRITE_EQ_OPERATORS_7(type, a, b, c, d, e, f, g) \ + inline bool operator==(const type &l, const type &r) { \ + return l.a == r.a && l.b == r.b && l.c == r.c && l.d == r.d && l.e == r.e && l.f == r.f && l.g == r.g; \ + } \ + inline bool operator!=(const type &l, const type &r) { \ + return l.a != r.a || l.b != r.b || l.c != r.c || l.d != r.d || l.e != r.e || l.f != r.f || l.g != r.g; \ + } +#define WRITE_CMP_OPERATORS_7(type, a, b, c, d, e, f, g) \ + inline bool operator<=(const type &l, const type &r) { \ + return l.a < r.a || \ + (l.a == r.a && (l.b < r.b || \ + (l.b == r.b && (l.c < r.c || \ + (l.c == r.c && (l.d < r.d || \ + (l.d == r.d && (l.e < r.e || \ + (l.e == r.e && (l.f < r.f || \ + (l.f == r.f && l.g <= r.g))))))))))); \ + } \ + inline bool operator>=(const type &l, const type &r) { \ + return l.a > r.a || \ + (l.a == r.a && (l.b > r.b || \ + (l.b == r.b && (l.c > r.c || \ + (l.c == r.c && (l.d > r.d || \ + (l.d == r.d && (l.e > r.e || \ + (l.e == r.e && (l.f > r.f || \ + (l.f == r.f && l.g >= r.g))))))))))); \ + } \ + inline bool operator>(const type &l, const type &r) { \ + return l.a > r.a || \ + (l.a == r.a && (l.b > r.b || \ + (l.b == r.b && (l.c > r.c || \ + (l.c == r.c 
&& (l.d > r.d || \ + (l.d == r.d && (l.e > r.e || \ + (l.e == r.e && (l.f > r.f || \ + (l.f == r.f && l.g > r.g))))))))))); \ + } \ + inline bool operator<(const type &l, const type &r) { \ + return l.a < r.a || \ + (l.a == r.a && (l.b < r.b || \ + (l.b == r.b && (l.c < r.c || \ + (l.c == r.c && (l.d < r.d || \ + (l.d == r.d && (l.e < r.e || \ + (l.e == r.e && (l.f < r.f || \ + (l.f == r.f && l.g < r.g))))))))))); \ + } +#endif diff --git a/src/include/color.h b/src/include/color.h new file mode 100644 index 00000000..6c8df40e --- /dev/null +++ b/src/include/color.h @@ -0,0 +1,13 @@ +#ifndef CEPH_COLOR_H +#define CEPH_COLOR_H + +#define TEXT_NORMAL "\033[0m" +/*#define TEXT_HAZARD "\033[5;31m"*/ +#define TEXT_RED "\033[0;31m" +#define TEXT_GREEN "\033[0;32m" +#define TEXT_YELLOW "\033[0;33m" +#define TEXT_BLUE "\033[0;34m" +#define TEXT_MAGENTA "\033[0;35m" +#define TEXT_CYAN "\033[0;36m" + +#endif diff --git a/src/include/compact_map.h b/src/include/compact_map.h new file mode 100644 index 00000000..3ccb7982 --- /dev/null +++ b/src/include/compact_map.h @@ -0,0 +1,383 @@ +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2015 Red Hat, Inc + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. 
+ * + */ +#ifndef CEPH_COMPACT_MAP_H +#define CEPH_COMPACT_MAP_H + +#include "buffer.h" +#include "encoding.h" + +#include <map> +#include <memory> + +#include "include/encoding.h" + +template <class Key, class T, class Map> +class compact_map_base { +protected: + std::unique_ptr<Map> map; + void alloc_internal() { + if (!map) + map.reset(new Map); + } + void free_internal() { + map.reset(); + } + template <class It> + class const_iterator_base { + const compact_map_base *map; + It it; + const_iterator_base() : map(0) { } + const_iterator_base(const compact_map_base* m) : map(m) { } + const_iterator_base(const compact_map_base *m, const It& i) : map(m), it(i) { } + friend class compact_map_base; + friend class iterator_base; + public: + const_iterator_base(const const_iterator_base& o) { + map = o.map; + it = o.it; + } + bool operator==(const const_iterator_base& o) const { + return (map == o.map) && (!map->map || it == o.it); + } + bool operator!=(const const_iterator_base& o) const { + return !(*this == o);; + } + const_iterator_base& operator=(const const_iterator_base& o) { + map = o.map; + it = o.it; + return *this; + } + const_iterator_base& operator++() { + ++it; + return *this; + } + const_iterator_base& operator--() { + --it; + return *this; + } + const std::pair<const Key,T>& operator*() { + return *it; + } + const std::pair<const Key,T>* operator->() { + return it.operator->(); + } + }; + template <class It> + class iterator_base { + private: + const compact_map_base* map; + It it; + iterator_base() : map(0) { } + iterator_base(compact_map_base* m) : map(m) { } + iterator_base(compact_map_base* m, const It& i) : map(m), it(i) { } + friend class compact_map_base; + public: + iterator_base(const iterator_base& o) { + map = o.map; + it = o.it; + } + bool operator==(const iterator_base& o) const { + return (map == o.map) && (!map->map || it == o.it); + } + bool operator!=(const iterator_base& o) const { + return !(*this == o);; + } + iterator_base& 
operator=(const iterator_base& o) { + map = o.map; + it = o.it; + return *this; + } + iterator_base& operator++() { + ++it; + return *this; + } + iterator_base operator++(int) { + iterator_base tmp = *this; + ++it; + return tmp; + } + iterator_base& operator--() { + --it; + return *this; + } + std::pair<const Key,T>& operator*() { + return *it; + } + std::pair<const Key,T>* operator->() { + return it.operator->(); + } + operator const_iterator_base<It>() const { + return const_iterator_base<It>(map, it); + } + }; + +public: + class iterator : public iterator_base<typename Map::iterator> { + public: + iterator() { } + iterator(const iterator_base<typename Map::iterator>& o) + : iterator_base<typename Map::iterator>(o) { } + iterator(compact_map_base* m) : iterator_base<typename Map::iterator>(m) { } + iterator(compact_map_base* m, const typename Map::iterator& i) + : iterator_base<typename Map::iterator>(m, i) { } + }; + class const_iterator : public const_iterator_base<typename Map::const_iterator> { + public: + const_iterator() { } + const_iterator(const iterator_base<typename Map::const_iterator>& o) + : const_iterator_base<typename Map::const_iterator>(o) { } + const_iterator(const compact_map_base* m) : const_iterator_base<typename Map::const_iterator>(m) { } + const_iterator(const compact_map_base* m, const typename Map::const_iterator& i) + : const_iterator_base<typename Map::const_iterator>(m, i) { } + }; + class reverse_iterator : public iterator_base<typename Map::reverse_iterator> { + public: + reverse_iterator() { } + reverse_iterator(const iterator_base<typename Map::reverse_iterator>& o) + : iterator_base<typename Map::reverse_iterator>(o) { } + reverse_iterator(compact_map_base* m) : iterator_base<typename Map::reverse_iterator>(m) { } + reverse_iterator(compact_map_base* m, const typename Map::reverse_iterator& i) + : iterator_base<typename Map::reverse_iterator>(m, i) { } + }; + class const_reverse_iterator : public const_iterator_base<typename 
Map::const_reverse_iterator> { + public: + const_reverse_iterator() { } + const_reverse_iterator(const iterator_base<typename Map::const_reverse_iterator>& o) + : iterator_base<typename Map::const_reverse_iterator>(o) { } + const_reverse_iterator(const compact_map_base* m) : const_iterator_base<typename Map::const_reverse_iterator>(m) { } + const_reverse_iterator(const compact_map_base* m, const typename Map::const_reverse_iterator& i) + : const_iterator_base<typename Map::const_reverse_iterator>(m, i) { } + }; + compact_map_base(const compact_map_base& o) { + if (o.map) { + alloc_internal(); + *map = *o.map; + } + } + compact_map_base() {} + ~compact_map_base() {} + + bool empty() const { + return !map || map->empty(); + } + size_t size() const { + return map ? map->size() : 0; + } + bool operator==(const compact_map_base& o) const { + return (empty() && o.empty()) || (map && o.map && *map == *o.map); + } + bool operator!=(const compact_map_base& o) const { + return !(*this == o); + } + size_t count (const Key& k) const { + return map ? map->count(k) : 0; + } + iterator erase (iterator p) { + if (map) { + ceph_assert(this == p.map); + auto it = map->erase(p.it); + if (map->empty()) { + free_internal(); + return iterator(this); + } else { + return iterator(this, it); + } + } else { + return iterator(this); + } + } + size_t erase (const Key& k) { + if (!map) + return 0; + size_t r = map->erase(k); + if (map->empty()) + free_internal(); + return r; + } + void clear() { + free_internal(); + } + void swap(compact_map_base& o) { + map.swap(o.map); + } + compact_map_base& operator=(const compact_map_base& o) { + if (o.map) { + alloc_internal(); + *map = *o.map; + } else + free_internal(); + return *this; + } + iterator insert(const std::pair<const Key, T>& val) { + alloc_internal(); + return iterator(this, map->insert(val)); + } + template <class... Args> + std::pair<iterator,bool> emplace ( Args&&... 
args ) { + alloc_internal(); + auto em = map->emplace(std::forward<Args>(args)...); + return std::pair<iterator,bool>(iterator(this, em.first), em.second); + } + iterator begin() { + if (!map) + return iterator(this); + return iterator(this, map->begin()); + } + iterator end() { + if (!map) + return iterator(this); + return iterator(this, map->end()); + } + reverse_iterator rbegin() { + if (!map) + return reverse_iterator(this); + return reverse_iterator(this, map->rbegin()); + } + reverse_iterator rend() { + if (!map) + return reverse_iterator(this); + return reverse_iterator(this, map->rend()); + } + iterator find(const Key& k) { + if (!map) + return iterator(this); + return iterator(this, map->find(k)); + } + iterator lower_bound(const Key& k) { + if (!map) + return iterator(this); + return iterator(this, map->lower_bound(k)); + } + iterator upper_bound(const Key& k) { + if (!map) + return iterator(this); + return iterator(this, map->upper_bound(k)); + } + const_iterator begin() const { + if (!map) + return const_iterator(this); + return const_iterator(this, map->begin()); + } + const_iterator end() const { + if (!map) + return const_iterator(this); + return const_iterator(this, map->end()); + } + const_reverse_iterator rbegin() const { + if (!map) + return const_reverse_iterator(this); + return const_reverse_iterator(this, map->rbegin()); + } + const_reverse_iterator rend() const { + if (!map) + return const_reverse_iterator(this); + return const_reverse_iterator(this, map->rend()); + } + const_iterator find(const Key& k) const { + if (!map) + return const_iterator(this); + return const_iterator(this, map->find(k)); + } + const_iterator lower_bound(const Key& k) const { + if (!map) + return const_iterator(this); + return const_iterator(this, map->lower_bound(k)); + } + const_iterator upper_bound(const Key& k) const { + if (!map) + return const_iterator(this); + return const_iterator(this, map->upper_bound(k)); + } + void encode(bufferlist &bl) const { + using 
ceph::encode; + if (map) + encode(*map, bl); + else + encode((uint32_t)0, bl); + } + void encode(bufferlist &bl, uint64_t features) const { + using ceph::encode; + if (map) + encode(*map, bl, features); + else + encode((uint32_t)0, bl); + } + void decode(bufferlist::const_iterator& p) { + using ceph::decode; + using ceph::decode_nohead; + uint32_t n; + decode(n, p); + if (n > 0) { + alloc_internal(); + decode_nohead(n, *map, p); + } else + free_internal(); + } +}; + +template<class Key, class T, class Map> +inline void encode(const compact_map_base<Key, T, Map>& m, bufferlist& bl) { + m.encode(bl); +} +template<class Key, class T, class Map> +inline void encode(const compact_map_base<Key, T, Map>& m, bufferlist& bl, + uint64_t features) { + m.encode(bl, features); +} +template<class Key, class T, class Map> +inline void decode(compact_map_base<Key, T, Map>& m, bufferlist::const_iterator& p) { + m.decode(p); +} + +template <class Key, class T, class Compare = std::less<Key>, class Alloc = std::allocator< std::pair<const Key, T> > > +class compact_map : public compact_map_base<Key, T, std::map<Key,T,Compare,Alloc> > { +public: + T& operator[](const Key& k) { + this->alloc_internal(); + return (*(this->map))[k]; + } +}; + +template <class Key, class T, class Compare = std::less<Key>, class Alloc = std::allocator< std::pair<const Key, T> > > +inline std::ostream& operator<<(std::ostream& out, const compact_map<Key, T, Compare, Alloc>& m) +{ + out << "{"; + bool first = true; + for (const auto &p : m) { + if (!first) + out << ","; + out << p.first << "=" << p.second; + first = false; + } + out << "}"; + return out; +} + +template <class Key, class T, class Compare = std::less<Key>, class Alloc = std::allocator< std::pair<const Key, T> > > +class compact_multimap : public compact_map_base<Key, T, std::multimap<Key,T,Compare,Alloc> > { +}; + +template <class Key, class T, class Compare = std::less<Key>, class Alloc = std::allocator< std::pair<const Key, T> > > +inline 
std::ostream& operator<<(std::ostream& out, const compact_multimap<Key, T, Compare, Alloc>& m) +{ + out << "{{"; + bool first = true; + for (const auto &p : m) { + if (!first) + out << ","; + out << p.first << "=" << p.second; + first = false; + } + out << "}}"; + return out; +} +#endif diff --git a/src/include/compact_set.h b/src/include/compact_set.h new file mode 100644 index 00000000..ba743fb0 --- /dev/null +++ b/src/include/compact_set.h @@ -0,0 +1,305 @@ +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2015 Red Hat, Inc + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. + * + */ +#ifndef CEPH_COMPACT_SET_H +#define CEPH_COMPACT_SET_H + +#include "buffer.h" +#include "encoding.h" + +#include <memory> +#include <set> + +template <class T, class Set> +class compact_set_base { +protected: + std::unique_ptr<Set> set; + void alloc_internal() { + if (!set) + set.reset(new Set); + } + void free_internal() { + set.reset(); + } + template <class It> + class iterator_base { + private: + const compact_set_base* set; + It it; + iterator_base() : set(0) { } + iterator_base(const compact_set_base* s) : set(s) { } + iterator_base(const compact_set_base* s, const It& i) : set(s), it(i) { } + friend class compact_set_base; + public: + iterator_base(const iterator_base& o) { + set = o.set; + it = o.it; + } + bool operator==(const iterator_base& o) const { + return (set == o.set) && (!set->set || it == o.it); + } + bool operator!=(const iterator_base& o) const { + return !(*this == o);; + } + iterator_base& operator=(const iterator_base& o) { + set->set = o.set; + it = o.it; + return *this; + } + iterator_base& operator++() { + ++it; + return *this; + } + iterator_base operator++(int) { + iterator_base tmp = *this; + ++it; + return tmp; + } + iterator_base& operator--() { + --it; + return 
*this; + } + const T& operator*() { + return *it; + } + }; +public: + class const_iterator : public iterator_base<typename Set::const_iterator> { + public: + const_iterator() { } + const_iterator(const iterator_base<typename Set::const_iterator>& o) + : iterator_base<typename Set::const_iterator>(o) { } + const_iterator(const compact_set_base* s) : iterator_base<typename Set::const_iterator>(s) { } + const_iterator(const compact_set_base* s, const typename Set::const_iterator& i) + : iterator_base<typename Set::const_iterator>(s, i) { } + }; + class iterator : public iterator_base<typename Set::iterator> { + public: + iterator() { } + iterator(const iterator_base<typename Set::iterator>& o) + : iterator_base<typename Set::iterator>(o) { } + iterator(compact_set_base* s) : iterator_base<typename Set::iterator>(s) { } + iterator(compact_set_base* s, const typename Set::iterator& i) + : iterator_base<typename Set::iterator>(s, i) { } + operator const_iterator() const { + return const_iterator(this->set, this->it); + } + }; + class const_reverse_iterator : public iterator_base<typename Set::const_reverse_iterator> { + public: + const_reverse_iterator() { } + const_reverse_iterator(const iterator_base<typename Set::const_reverse_iterator>& o) + : iterator_base<typename Set::const_reverse_iterator>(o) { } + const_reverse_iterator(const compact_set_base* s) : iterator_base<typename Set::const_reverse_iterator>(s) { } + const_reverse_iterator(const compact_set_base* s, const typename Set::const_reverse_iterator& i) + : iterator_base<typename Set::const_reverse_iterator>(s, i) { } + }; + class reverse_iterator : public iterator_base<typename Set::reverse_iterator> { + public: + reverse_iterator() { } + reverse_iterator(const iterator_base<typename Set::reverse_iterator>& o) + : iterator_base<typename Set::reverse_iterator>(o) { } + reverse_iterator(compact_set_base* s) : iterator_base<typename Set::reverse_iterator>(s) { } + reverse_iterator(compact_set_base* s, const 
typename Set::reverse_iterator& i) + : iterator_base<typename Set::reverse_iterator>(s, i) { } + operator const_iterator() const { + return const_iterator(this->set, this->it); + } + }; + + compact_set_base() {} + compact_set_base(const compact_set_base& o) { + if (o.set) { + alloc_internal(); + *set = *o.set; + } + } + ~compact_set_base() {} + + + bool empty() const { + return !set || set->empty(); + } + size_t size() const { + return set ? set->size() : 0; + } + bool operator==(const compact_set_base& o) const { + return (empty() && o.empty()) || (set && o.set && *set == *o.set); + } + bool operator!=(const compact_set_base& o) const { + return !(*this == o); + } + size_t count(const T& t) const { + return set ? set->count(t) : 0; + } + iterator erase (iterator p) { + if (set) { + ceph_assert(this == p.set); + auto it = set->erase(p.it); + if (set->empty()) { + free_internal(); + return iterator(this); + } else { + return iterator(this, it); + } + } else { + return iterator(this); + } + } + size_t erase (const T& t) { + if (!set) + return 0; + size_t r = set->erase(t); + if (set->empty()) + free_internal(); + return r; + } + void clear() { + free_internal(); + } + void swap(compact_set_base& o) { + set.swap(o.set); + } + compact_set_base& operator=(const compact_set_base& o) { + if (o.set) { + alloc_internal(); + *set = *o.set; + } else + free_internal(); + return *this; + } + std::pair<iterator,bool> insert(const T& t) { + alloc_internal(); + std::pair<typename Set::iterator,bool> r = set->insert(t); + return std::make_pair(iterator(this, r.first), r.second); + } + template <class... Args> + std::pair<iterator,bool> emplace ( Args&&... 
args ) { + alloc_internal(); + auto em = set->emplace(std::forward<Args>(args)...); + return std::pair<iterator,bool>(iterator(this, em.first), em.second); + } + + iterator begin() { + if (!set) + return iterator(this); + return iterator(this, set->begin()); + } + iterator end() { + if (!set) + return iterator(this); + return iterator(this, set->end()); + } + reverse_iterator rbegin() { + if (!set) + return reverse_iterator(this); + return reverse_iterator(this, set->rbegin()); + } + reverse_iterator rend() { + if (!set) + return reverse_iterator(this); + return reverse_iterator(this, set->rend()); + } + iterator find(const T& t) { + if (!set) + return iterator(this); + return iterator(this, set->find(t)); + } + iterator lower_bound(const T& t) { + if (!set) + return iterator(this); + return iterator(this, set->lower_bound(t)); + } + iterator upper_bound(const T& t) { + if (!set) + return iterator(this); + return iterator(this, set->upper_bound(t)); + } + const_iterator begin() const { + if (!set) + return const_iterator(this); + return const_iterator(this, set->begin()); + } + const_iterator end() const { + if (!set) + return const_iterator(this); + return const_iterator(this, set->end()); + } + const_reverse_iterator rbegin() const { + if (!set) + return const_reverse_iterator(this); + return const_reverse_iterator(this, set->rbegin()); + } + const_reverse_iterator rend() const { + if (!set) + return const_reverse_iterator(this); + return const_reverse_iterator(this, set->rend()); + } + const_iterator find(const T& t) const { + if (!set) + return const_iterator(this); + return const_iterator(this, set->find(t)); + } + const_iterator lower_bound(const T& t) const { + if (!set) + return const_iterator(this); + return const_iterator(this, set->lower_bound(t)); + } + const_iterator upper_bound(const T& t) const { + if (!set) + return const_iterator(this); + return const_iterator(this, set->upper_bound(t)); + } + void encode(bufferlist &bl) const { + using 
ceph::encode; + if (set) + encode(*set, bl); + else + encode((uint32_t)0, bl); + } + void decode(bufferlist::const_iterator& p) { + using ceph::decode; + uint32_t n; + decode(n, p); + if (n > 0) { + alloc_internal(); + decode_nohead(n, *set, p); + } else + free_internal(); + } +}; + +template<class T, class Set> +inline void encode(const compact_set_base<T, Set>& m, bufferlist& bl) { + m.encode(bl); +} +template<class T, class Set> +inline void decode(compact_set_base<T, Set>& m, bufferlist::const_iterator& p) { + m.decode(p); +} + +template <class T, class Compare = std::less<T>, class Alloc = std::allocator<T> > +class compact_set : public compact_set_base<T, std::set<T, Compare, Alloc> > { +}; + +template <class T, class Compare = std::less<T>, class Alloc = std::allocator<T> > +inline std::ostream& operator<<(std::ostream& out, const compact_set<T,Compare,Alloc>& s) +{ + bool first = true; + for (auto &v : s) { + if (!first) + out << ","; + out << v; + first = false; + } + return out; +} +#endif diff --git a/src/include/compat.h b/src/include/compat.h new file mode 100644 index 00000000..7c75dac2 --- /dev/null +++ b/src/include/compat.h @@ -0,0 +1,198 @@ +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2011 Stanislav Sedov <stas@FreeBSD.org> + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. 
+ */ + +#ifndef CEPH_COMPAT_H +#define CEPH_COMPAT_H + +#include "acconfig.h" +#include <sys/types.h> + +#if defined(__linux__) +#define PROCPREFIX +#endif + +#include <sys/stat.h> +#ifndef ACCESSPERMS +#define ACCESSPERMS (S_IRWXU|S_IRWXG|S_IRWXO) +#endif + +#if defined(__FreeBSD__) + +// FreeBSD supports Linux procfs with its compatibility module +// And all compatibility stuff is standard mounted on this +#define PROCPREFIX "/compat/linux" + +#ifndef MSG_MORE +#define MSG_MORE 0 +#endif + +#ifndef O_DSYNC +#define O_DSYNC O_SYNC +#endif + +/* And include the extra required include file */ +#include <pthread_np.h> + +#include <sys/param.h> +#include <sys/cpuset.h> +#define cpu_set_t cpuset_t +int sched_setaffinity(pid_t pid, size_t cpusetsize, + cpu_set_t *mask); + +#endif /* __FreeBSD__ */ + +#if defined(__APPLE__) || defined(__FreeBSD__) +/* Make sure that ENODATA is defined in the correct way */ +#ifdef ENODATA +#if (ENODATA == 9919) +// #warning ENODATA already defined to be 9919, redefining to fix +// Silencing this warning because it fires at all files where compat.h +// is included after boost files. +// +// This value stems from the definition in the boost library +// And when this case occurs it is due to the fact that boost files +// are included before this file. Redefinition might not help in this +// case since already parsed code has evaluated to the wrong value. +// This would warrrant for d definition that would actually be evaluated +// at the location of usage and report a possible conflict. 
+// This is left up to a future improvement +#elif (ENODATA != 87) +// #warning ENODATA already defined to a value different from 87 (ENOATRR), refining to fix +#endif +#undef ENODATA +#endif +#define ENODATA ENOATTR + +// Fix clock accuracy +#if !defined(CLOCK_MONOTONIC_COARSE) +#if defined(CLOCK_MONOTONIC_FAST) +#define CLOCK_MONOTONIC_COARSE CLOCK_MONOTONIC_FAST +#else +#define CLOCK_MONOTONIC_COARSE CLOCK_MONOTONIC +#endif +#endif +#if !defined(CLOCK_REALTIME_COARSE) +#if defined(CLOCK_REALTIME_FAST) +#define CLOCK_REALTIME_COARSE CLOCK_REALTIME_FAST +#else +#define CLOCK_REALTIME_COARSE CLOCK_REALTIME +#endif +#endif + +/* get PATH_MAX */ +#include <limits.h> + +#ifndef EUCLEAN +#define EUCLEAN 117 +#endif +#ifndef EREMOTEIO +#define EREMOTEIO 121 +#endif +#ifndef EKEYREJECTED +#define EKEYREJECTED 129 +#endif +#ifndef XATTR_CREATE +#define XATTR_CREATE 1 +#endif + +#ifndef HOST_NAME_MAX +#ifdef MAXHOSTNAMELEN +#define HOST_NAME_MAX MAXHOSTNAMELEN +#else +#define HOST_NAME_MAX 255 +#endif +#endif + +#endif /* __APPLE__ */ + +/* O_LARGEFILE is not defined/required on OSX/FreeBSD */ +#ifndef O_LARGEFILE +#define O_LARGEFILE 0 +#endif + +/* Could be relevant for other platforms */ +#ifndef ERESTART +#define ERESTART EINTR +#endif + +#ifndef TEMP_FAILURE_RETRY +#define TEMP_FAILURE_RETRY(expression) ({ \ + __typeof(expression) __result; \ + do { \ + __result = (expression); \ + } while (__result == -1 && errno == EINTR); \ + __result; }) +#endif + +#ifdef __cplusplus +# define VOID_TEMP_FAILURE_RETRY(expression) \ + static_cast<void>(TEMP_FAILURE_RETRY(expression)) +#else +# define VOID_TEMP_FAILURE_RETRY(expression) \ + do { (void)TEMP_FAILURE_RETRY(expression); } while (0) +#endif + +#if defined(__FreeBSD__) || defined(__APPLE__) +#define lseek64(fd, offset, whence) lseek(fd, offset, whence) +#endif + +#if defined(__sun) || defined(_AIX) +#define LOG_AUTHPRIV (10<<3) +#define LOG_FTP (11<<3) +#define __STRING(x) "x" +#define IFTODT(mode) (((mode) & 0170000) >> 
12) +#endif + +#if defined(_AIX) +#define MSG_DONTWAIT MSG_NONBLOCK +#endif + +#if defined(HAVE_PTHREAD_SETNAME_NP) + #if defined(__APPLE__) + #define ceph_pthread_setname(thread, name) ({ \ + int __result = 0; \ + if (thread == pthread_self()) \ + __result = pthread_setname_np(name); \ + __result; }) + #else + #define ceph_pthread_setname pthread_setname_np + #endif +#elif defined(HAVE_PTHREAD_SET_NAME_NP) + /* Fix a small name diff and return 0 */ + #define ceph_pthread_setname(thread, name) ({ \ + pthread_set_name_np(thread, name); \ + 0; }) +#else + /* compiler warning free success noop */ + #define ceph_pthread_setname(thread, name) ({ \ + int __i = 0; \ + __i; }) +#endif + +#if defined(HAVE_PTHREAD_GETNAME_NP) + #define ceph_pthread_getname pthread_getname_np +#elif defined(HAVE_PTHREAD_GET_NAME_NP) + #define ceph_pthread_getname(thread, name, len) ({ \ + pthread_get_name_np(thread, name, len); \ + 0; }) +#else + /* compiler warning free success noop */ + #define ceph_pthread_getname(thread, name, len) ({ \ + if (name != NULL) \ + *name = '\0'; \ + 0; }) +#endif + +int ceph_posix_fallocate(int fd, off_t offset, off_t len); + +int pipe_cloexec(int pipefd[2]); + +#endif /* !CEPH_COMPAT_H */ diff --git a/src/include/config-h.in.cmake b/src/include/config-h.in.cmake new file mode 100644 index 00000000..acced696 --- /dev/null +++ b/src/include/config-h.in.cmake @@ -0,0 +1,366 @@ +/* config.h file expanded by Cmake for build */ + +#ifndef CONFIG_H +#define CONFIG_H + +/* fallocate(2) is supported */ +#cmakedefine CEPH_HAVE_FALLOCATE + +/* Define to 1 if you have the `posix_fadvise' function. */ +#cmakedefine HAVE_POSIX_FADVISE 1 + +/* Define to 1 if you have the `posix_fallocate' function. */ +#cmakedefine HAVE_POSIX_FALLOCATE 1 + +/* Define to 1 if you have the `syncfs' function. 
*/ +#cmakedefine HAVE_SYS_SYNCFS 1 + +/* sync_file_range(2) is supported */ +#cmakedefine HAVE_SYNC_FILE_RANGE + +/* Define if you have mallinfo */ +#cmakedefine HAVE_MALLINFO + +/* Define to 1 if you have the `pwritev' function. */ +#cmakedefine HAVE_PWRITEV 1 + +/* Define to 1 if you have the <sys/mount.h> header file. */ +#cmakedefine HAVE_SYS_MOUNT_H 1 + +/* Define to 1 if you have the <sys/param.h> header file. */ +#cmakedefine HAVE_SYS_PARAM_H 1 + +/* Define to 1 if you have the <sys/types.h> header file. */ +#cmakedefine HAVE_SYS_TYPES_H 1 + +/* Define to 1 if you have the <sys/vfs.h> header file. */ +#cmakedefine HAVE_SYS_VFS_H 1 + +/* Define to 1 if you have the <execinfo.h> header file. */ +#cmakedefine HAVE_EXECINFO_H 1 + +/* Define to 1 if the system has the type `__be16'. */ +#cmakedefine HAVE___BE16 1 + +/* Define to 1 if the system has the type `__be32'. */ +#cmakedefine HAVE___BE32 1 + +/* Define to 1 if the system has the type `__be64'. */ +#cmakedefine HAVE___BE64 1 + +/* Define to 1 if the system has the type `__le16'. */ +#cmakedefine HAVE___LE16 1 + +/* Define to 1 if the system has the type `__le32'. */ +#cmakedefine HAVE___LE32 1 + +/* Define to 1 if the system has the type `__le64'. */ +#cmakedefine HAVE___LE64 1 + +/* Define to 1 if the system has the type `__s16'. */ +#cmakedefine HAVE___S16 1 + +/* Define to 1 if the system has the type `__s32'. */ +#cmakedefine HAVE___S32 1 + +/* Define to 1 if the system has the type `__s64'. */ +#cmakedefine HAVE___S64 1 + +/* Define to 1 if the system has the type `__s8'. */ +#cmakedefine HAVE___S8 1 + +/* Define to 1 if the system has the type `__u16'. */ +#cmakedefine HAVE___U16 1 + +/* Define to 1 if the system has the type `__u32'. */ +#cmakedefine HAVE___U32 1 + +/* Define to 1 if the system has the type `__u64'. */ +#cmakedefine HAVE___U64 1 + +/* Define to 1 if the system has the type `__u8'. 
*/ +#cmakedefine HAVE___U8 1 + +/* Define if you have res_nquery */ +#cmakedefine HAVE_RES_NQUERY + +/* Defined if you have LZ4 */ +#cmakedefine HAVE_LZ4 + +/* Defined if you have BROTLI */ +#cmakedefine HAVE_BROTLI + +/* Defined if you have libaio */ +#cmakedefine HAVE_LIBAIO + +/* Defined if you have POSIX AIO */ +#cmakedefine HAVE_POSIXAIO + +/* Defined if OpenLDAP enabled */ +#cmakedefine HAVE_OPENLDAP + +/* Define if you have fuse */ +#cmakedefine HAVE_LIBFUSE + +/* Define to 1 if you have libxfs */ +#cmakedefine HAVE_LIBXFS 1 + +/* SPDK conditional compilation */ +#cmakedefine HAVE_SPDK + +/* DPDK conditional compilation */ +#cmakedefine HAVE_DPDK + +/* PMEM conditional compilation */ +#cmakedefine HAVE_PMEM + +/* Defined if LevelDB supports bloom filters */ +#cmakedefine HAVE_LEVELDB_FILTER_POLICY + +/* Define if you have tcmalloc */ +#cmakedefine HAVE_LIBTCMALLOC + +/* Define if you have curl_multi_wait() */ +#cmakedefine HAVE_CURL_MULTI_WAIT 1 + +/* Define if using NSS. */ +#cmakedefine USE_NSS + +/* Define if using OpenSSL.
*/ +#cmakedefine USE_OPENSSL + +/* Accelio conditional compilation */ +#cmakedefine HAVE_XIO + + +/* AsyncMessenger RDMA conditional compilation */ +#cmakedefine HAVE_RDMA + +/* ibverbs experimental conditional compilation */ +#cmakedefine HAVE_IBV_EXP + +/* define if bluestore enabled */ +#cmakedefine WITH_BLUESTORE + +/* define if cephfs enabled */ +#cmakedefine WITH_CEPHFS + +/* define if GSSAPI/KRB5 enabled */ +#cmakedefine HAVE_GSSAPI + +/* define if rbd enabled */ +#cmakedefine WITH_RBD + +/* define if kernel rbd enabled */ +#cmakedefine WITH_KRBD + +/* define if key-value-store is enabled */ +#cmakedefine WITH_KVS + +/* define if radosgw enabled */ +#cmakedefine WITH_RADOSGW + +/* define if radosgw's fcgi frontend enabled */ +#cmakedefine WITH_RADOSGW_FCGI_FRONTEND + +/* define if leveldb is enabled */ +#cmakedefine WITH_LEVELDB + +/* define if radosgw's beast frontend enabled */ +#cmakedefine WITH_RADOSGW_BEAST_FRONTEND + +/* define if radosgw has openssl support */ +#cmakedefine WITH_CURL_OPENSSL + +/* define if HAVE_THREAD_SAFE_RES_QUERY */ +#cmakedefine HAVE_THREAD_SAFE_RES_QUERY + +/* define if HAVE_REENTRANT_STRSIGNAL */ +#cmakedefine HAVE_REENTRANT_STRSIGNAL + +/* Define if you want to use LTTng */ +#cmakedefine WITH_LTTNG + +/* Define if you want to use OSD function instrumentation */ +#cmakedefine WITH_OSD_INSTRUMENT_FUNCTIONS + +/* Define if you want to use Babeltrace */ +#cmakedefine WITH_BABELTRACE + +/* Define to 1 if you have the <babeltrace/babeltrace.h> header file. */ +#cmakedefine HAVE_BABELTRACE_BABELTRACE_H 1 + +/* Define to 1 if you have the <babeltrace/ctf/events.h> header file. */ +#cmakedefine HAVE_BABELTRACE_CTF_EVENTS_H 1 + +/* Define to 1 if you have the <babeltrace/ctf/iterator.h> header file. */ +#cmakedefine HAVE_BABELTRACE_CTF_ITERATOR_H 1 + +/* Define to 1 if you have the <arpa/nameser_compat.h> header file.
*/ +#cmakedefine HAVE_ARPA_NAMESER_COMPAT_H 1 + +/* FastCGI headers are in /usr/include/fastcgi */ +#cmakedefine FASTCGI_INCLUDE_DIR + +/* splice(2) is supported */ +#cmakedefine CEPH_HAVE_SPLICE + +/* Define if you want C_Gather debugging */ +#cmakedefine DEBUG_GATHER + +/* Define to 1 if you have the `getgrouplist' function. */ +#cmakedefine HAVE_GETGROUPLIST 1 + +/* LTTng is disabled, so define this macro to be nothing. */ +#cmakedefine tracepoint + +/* Define to 1 if you have fdatasync. */ +#cmakedefine HAVE_FDATASYNC 1 + +/* Defined if you have librocksdb enabled */ +#cmakedefine HAVE_LIBROCKSDB + +/* Define to 1 if you have the <valgrind/helgrind.h> header file. */ +#cmakedefine HAVE_VALGRIND_HELGRIND_H 1 + +/* Define to 1 if you have the <sys/prctl.h> header file. */ +#cmakedefine HAVE_SYS_PRCTL_H 1 + +/* Define to 1 if you have the <linux/types.h> header file. */ +#cmakedefine HAVE_LINUX_TYPES_H 1 + +/* Define to 1 if you have the <linux/version.h> header file. */ +#cmakedefine HAVE_LINUX_VERSION_H 1 + +/* Define to 1 if you have sched.h. */ +#cmakedefine HAVE_SCHED 1 + +/* Define to 1 if you have sigdescr_np. */ +#cmakedefine HAVE_SIGDESCR_NP 1 + +/* Support SSE (Streaming SIMD Extensions) instructions */ +#cmakedefine HAVE_SSE + +/* Support SSE2 (Streaming SIMD Extensions 2) instructions */ +#cmakedefine HAVE_SSE2 + +/* Define to 1 if you have the `pipe2' function. */ +#cmakedefine HAVE_PIPE2 1 + +/* Support NEON instructions */ +#cmakedefine HAVE_NEON + +/* Define if you have pthread_spin_init */ +#cmakedefine HAVE_PTHREAD_SPINLOCK + +/* name_to_handle_at exists */ +#cmakedefine HAVE_NAME_TO_HANDLE_AT + +/* we have a recent yasm and are x86_64 */ +#cmakedefine HAVE_GOOD_YASM_ELF64 + +/* yasm can also build the isa-l */ +#cmakedefine HAVE_BETTER_YASM_ELF64 + +/* Define to 1 if strerror_r returns char *. 
*/ +#cmakedefine STRERROR_R_CHAR_P 1 + +/* Defined if you have libzfs enabled */ +#cmakedefine HAVE_LIBZFS + +/* Define if the C compiler supports __func__ */ +#cmakedefine HAVE_FUNC + +/* Define if the C compiler supports __PRETTY_FUNCTION__ */ +#cmakedefine HAVE_PRETTY_FUNC + +/* Have eventfd extension. */ +#cmakedefine HAVE_EVENTFD + +/* Define if enabling coverage. */ +#cmakedefine ENABLE_COVERAGE + +/* Defined if you want pg ref debugging */ +#cmakedefine PG_DEBUG_REFS + +/* Support ARMv8 CRC instructions */ +#cmakedefine HAVE_ARMV8_CRC + +/* Support ARMv8 CRYPTO instructions */ +#cmakedefine HAVE_ARMV8_CRYPTO + +/* Support ARMv8 CRC and CRYPTO intrinsics */ +#cmakedefine HAVE_ARMV8_CRC_CRYPTO_INTRINSICS + +/* Define if you have struct stat.st_mtimespec.tv_nsec */ +#cmakedefine HAVE_STAT_ST_MTIMESPEC_TV_NSEC + +/* Define if you have struct stat.st_mtim.tv_nsec */ +#cmakedefine HAVE_STAT_ST_MTIM_TV_NSEC + +/* Define if compiler supports static_cast<> */ +#cmakedefine HAVE_STATIC_CAST + +/* Version number of package */ +#cmakedefine VERSION "@VERSION@" + +/* Defined if pthread_setname_np() is available */ +#cmakedefine HAVE_PTHREAD_SETNAME_NP 1 + +/* Defined if pthread_rwlockattr_setkind_np() is available */ +#cmakedefine HAVE_PTHREAD_RWLOCKATTR_SETKIND_NP + +/* Defined if blkin enabled */ +#cmakedefine WITH_BLKIN + +/* Defined if pthread_set_name_np() is available */ +#cmakedefine HAVE_PTHREAD_SET_NAME_NP + +/* Defined if pthread_getname_np() is available */ +#cmakedefine HAVE_PTHREAD_GETNAME_NP 1 + +/* Support POWER8 instructions */ +#cmakedefine HAVE_POWER8 + +/* Define if endian type is big endian */ +#cmakedefine CEPH_BIG_ENDIAN + +/* Define if endian type is little endian */ +#cmakedefine CEPH_LITTLE_ENDIAN + +#cmakedefine MGR_PYTHON_EXECUTABLE "@MGR_PYTHON_EXECUTABLE@" + +/* Define to 1 if you have the `getprogname' function. 
*/ +#cmakedefine HAVE_GETPROGNAME 1 + +/* Defined if getentropy() is available */ +#cmakedefine HAVE_GETENTROPY + +/* Defined if boost::context is available */ +#cmakedefine HAVE_BOOST_CONTEXT + +/* Defined if libradosstriper is enabled: */ +#cmakedefine WITH_LIBRADOSSTRIPER + +/* Defined if OpenSSL is available for the rgw beast frontend */ +#cmakedefine WITH_RADOSGW_BEAST_OPENSSL + +/* Defined if rabbitmq-c is available for rgw amqp push endpoint */ +#cmakedefine WITH_RADOSGW_AMQP_ENDPOINT + +/* Defined if librdkafka is available for rgw kafka push endpoint */ +#cmakedefine WITH_RADOSGW_KAFKA_ENDPOINT + +/* Defined if std::map::merge() is supported */ +#cmakedefine HAVE_STDLIB_MAP_SPLICING + +/* Defined if Intel QAT compress/decompress is supported */ +#cmakedefine HAVE_QATZIP + +/* Define if seastar is available. */ +#cmakedefine HAVE_SEASTAR + +/* Define if unit tests are built. */ +#cmakedefine UNIT_TESTS_BUILT + +#endif /* CONFIG_H */ diff --git a/src/include/coredumpctl.h b/src/include/coredumpctl.h new file mode 100644 index 00000000..60fab432 --- /dev/null +++ b/src/include/coredumpctl.h @@ -0,0 +1,105 @@
#pragma once

#include "acconfig.h"

#ifdef HAVE_SYS_PRCTL_H
#include <cerrno>
#include <iostream>
#include <sys/prctl.h>
#include "common/errno.h"

/**
 * Scope guard for the process "dumpable" flag (prctl PR_GET_DUMPABLE /
 * PR_SET_DUMPABLE).
 *
 * The constructor switches the flag to @p new_state (0 by default, i.e.
 * core dumps off) and the destructor puts the previous value back, but
 * only when the constructor really changed it.  All failures are reported
 * on std::cerr and otherwise ignored (best effort).
 */
class PrCtl {
  // Flag value to restore on destruction; negative means "nothing to do".
  int saved_state = -1;

  /// @return the current dumpable flag (>= 0), or -errno on failure.
  static int get_dumpable() {
    int r = prctl(PR_GET_DUMPABLE);
    if (r == -1) {
      // Hand the failure back as a negative errno so it can never be
      // confused with a valid (non-negative) flag value.
      r = -errno;
      std::cerr << "warning: unable to get dumpable flag: " << cpp_strerror(r)
                << std::endl;
    }
    return r;
  }

  /// @return 0 on success, -errno on failure.
  static int set_dumpable(bool new_state) {
    int r = prctl(PR_SET_DUMPABLE, new_state);
    if (r) {
      r = -errno;
      std::cerr << "warning: unable to " << (new_state ? "set" : "unset")
                << " dumpable flag: " << cpp_strerror(r)
                << std::endl;
    }
    return r;
  }

public:
  /// @param new_state non-zero to allow core dumps, 0 to suppress them.
  PrCtl(int new_state = 0) {
    int r = get_dumpable();
    if (r < 0) {
      // Current state unknown: leave the flag alone and restore nothing.
      return;
    }
    if (r != new_state) {
      if (!set_dumpable(new_state)) {
        saved_state = r;
      }
    }
  }

  ~PrCtl() {
    if (saved_state < 0) {
      return;
    }
    set_dumpable(saved_state);
  }
};

#else
#include <sys/resource.h>
#ifdef RLIMIT_CORE
#include <cerrno>
#include <iostream>
#include "common/errno.h"

/**
 * Fallback scope guard for platforms without <sys/prctl.h>: core dumps are
 * switched on/off through the RLIMIT_CORE soft limit instead.
 *
 * Only the soft limit is modified.  Lowering the hard limit cannot be
 * undone by an unprivileged process, which would make the restore in the
 * destructor fail.
 */
class PrCtl {
  // Limits in force before the constructor ran.
  rlimit saved_lim{};
  // True only when the constructor actually changed the limit.
  bool restore_needed = false;

  /// @return 0 on success (with *saved filled in), -errno on failure.
  static int get_dumpable(rlimit* saved) {
    int r = getrlimit(RLIMIT_CORE, saved);
    if (r) {
      r = -errno;
      std::cerr << "warning: unable to getrlimit(): " << cpp_strerror(r)
                << std::endl;
    }
    return r;
  }

  /// @return 0 on success, -errno on failure.
  static int set_dumpable(const rlimit& rlim) {
    int r = setrlimit(RLIMIT_CORE, &rlim);
    if (r) {
      r = -errno;
      std::cerr << "warning: unable to setrlimit(): " << cpp_strerror(r)
                << std::endl;
    }
    return r;
  }

public:
  /// @param new_state non-zero to allow core dumps, 0 to suppress them.
  PrCtl(int new_state = 0) {
    if (get_dumpable(&saved_lim) < 0) {
      // saved_lim holds nothing meaningful; never write it back.
      return;
    }
    // Start from the current limits so that every field handed to
    // setrlimit() is initialised and the hard limit stays untouched.
    rlimit new_lim = saved_lim;
    new_lim.rlim_cur = new_state ? saved_lim.rlim_max : 0;
    if (new_lim.rlim_cur == saved_lim.rlim_cur) {
      return;
    }
    restore_needed = (set_dumpable(new_lim) == 0);
  }

  ~PrCtl() {
    if (restore_needed) {
      set_dumpable(saved_lim);
    }
  }
};
#else
struct PrCtl {
  // to silence the Wunused-variable warning
  PrCtl() {}
};

#endif	// RLIMIT_CORE
#endif
diff --git a/src/include/counter.h b/src/include/counter.h new file mode 100644 index 00000000..61ed7409 --- /dev/null +++ b/src/include/counter.h @@ -0,0 +1,56 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2017 Red Hat, Inc. 
// + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. + * + */

#ifndef CEPH_COUNTER_H
#define CEPH_COUNTER_H

#include <atomic>
#include <cstdint>

/**
 * Instance counter, one set of counters per tag type T.
 *
 * Every constructed Counter<T> (default, copy or move) adds one to both the
 * live count and the running total of constructions; every destruction
 * takes one off the live count.  The counters are function-local atomics
 * shared by all Counter<T> objects of the same T.
 */
template <typename T>
class Counter {
public:
  Counter() noexcept { note_construction(); }
  Counter(const Counter &) noexcept { note_construction(); }
  // A move-constructed object is a new live object and will be destroyed
  // separately from its source, so it has to be counted as well; otherwise
  // the unsigned live count underflows when both destructors have run.
  Counter(Counter &&) noexcept { note_construction(); }
  // Assignment neither creates nor destroys an object: nothing to record.
  // Declared explicitly because the user-declared move constructor would
  // otherwise leave copy assignment deleted.
  Counter &operator=(const Counter &) noexcept { return *this; }
  Counter &operator=(Counter &&) noexcept { return *this; }
  ~Counter() {
    _count()--;
  }

  /// Number of Counter<T> objects currently alive.
  static uint64_t count() {
    return _count();
  }
  /// Total number of Counter<T> objects ever constructed.
  static uint64_t increments() {
    return _increments();
  }
  /// Total number of Counter<T> objects destroyed so far.
  static uint64_t decrements() {
    return increments()-count();
  }

private:
  static void note_construction() noexcept {
    _count()++;
    _increments()++;
  }
  static std::atomic<uint64_t> &_count() {
    static std::atomic<uint64_t> c{0};
    return c;
  }
  static std::atomic<uint64_t> &_increments() {
    static std::atomic<uint64_t> i{0};
    return i;
  }
};

#endif
// diff --git a/src/include/cpp-btree/btree.h b/src/include/cpp-btree/btree.h new file mode 100644 index 00000000..0a40e0e1 --- /dev/null +++ b/src/include/cpp-btree/btree.h @@ -0,0 +1,2396 @@ +// Copyright 2013 Google Inc. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +// A btree implementation of the STL set and map interfaces. A btree is both +// smaller and faster than STL set/map. 
The red-black tree implementation of +// STL set/map has an overhead of 3 pointers (left, right and parent) plus the +// node color information for each stored value. So a set<int32> consumes 20 +// bytes for each value stored. This btree implementation stores multiple +// values on fixed size nodes (usually 256 bytes) and doesn't store child +// pointers for leaf nodes. The result is that a btree_set<int32> may use much +// less memory per stored value. For the random insertion benchmark in +// btree_test.cc, a btree_set<int32> with node-size of 256 uses 4.9 bytes per +// stored value. +// +// The packing of multiple values on to each node of a btree has another effect +// besides better space utilization: better cache locality due to fewer cache +// lines being accessed. Better cache locality translates into faster +// operations. +// +// CAVEATS +// +// Insertions and deletions on a btree can cause splitting, merging or +// rebalancing of btree nodes. And even without these operations, insertions +// and deletions on a btree will move values around within a node. In both +// cases, the result is that insertions and deletions can invalidate iterators +// pointing to values other than the one being inserted/deleted. This is +// notably different from STL set/map which takes care to not invalidate +// iterators on insert/erase except, of course, for iterators pointing to the +// value being erased. A partial workaround when erasing is available: +// erase() returns an iterator pointing to the item just after the one that was +// erased (or end() if none exists). See also safe_btree. + +// PERFORMANCE +// +// btree_bench --benchmarks=. 
2>&1 | ./benchmarks.awk +// +// Run on pmattis-warp.nyc (4 X 2200 MHz CPUs); 2010/03/04-15:23:06 +// Benchmark STL(ns) B-Tree(ns) @ <size> +// -------------------------------------------------------- +// BM_set_int32_insert 1516 608 +59.89% <256> [40.0, 5.2] +// BM_set_int32_lookup 1160 414 +64.31% <256> [40.0, 5.2] +// BM_set_int32_fulllookup 960 410 +57.29% <256> [40.0, 4.4] +// BM_set_int32_delete 1741 528 +69.67% <256> [40.0, 5.2] +// BM_set_int32_queueaddrem 3078 1046 +66.02% <256> [40.0, 5.5] +// BM_set_int32_mixedaddrem 3600 1384 +61.56% <256> [40.0, 5.3] +// BM_set_int32_fifo 227 113 +50.22% <256> [40.0, 4.4] +// BM_set_int32_fwditer 158 26 +83.54% <256> [40.0, 5.2] +// BM_map_int32_insert 1551 636 +58.99% <256> [48.0, 10.5] +// BM_map_int32_lookup 1200 508 +57.67% <256> [48.0, 10.5] +// BM_map_int32_fulllookup 989 487 +50.76% <256> [48.0, 8.8] +// BM_map_int32_delete 1794 628 +64.99% <256> [48.0, 10.5] +// BM_map_int32_queueaddrem 3189 1266 +60.30% <256> [48.0, 11.6] +// BM_map_int32_mixedaddrem 3822 1623 +57.54% <256> [48.0, 10.9] +// BM_map_int32_fifo 151 134 +11.26% <256> [48.0, 8.8] +// BM_map_int32_fwditer 161 32 +80.12% <256> [48.0, 10.5] +// BM_set_int64_insert 1546 636 +58.86% <256> [40.0, 10.5] +// BM_set_int64_lookup 1200 512 +57.33% <256> [40.0, 10.5] +// BM_set_int64_fulllookup 971 487 +49.85% <256> [40.0, 8.8] +// BM_set_int64_delete 1745 616 +64.70% <256> [40.0, 10.5] +// BM_set_int64_queueaddrem 3163 1195 +62.22% <256> [40.0, 11.6] +// BM_set_int64_mixedaddrem 3760 1564 +58.40% <256> [40.0, 10.9] +// BM_set_int64_fifo 146 103 +29.45% <256> [40.0, 8.8] +// BM_set_int64_fwditer 162 31 +80.86% <256> [40.0, 10.5] +// BM_map_int64_insert 1551 720 +53.58% <256> [48.0, 20.7] +// BM_map_int64_lookup 1214 612 +49.59% <256> [48.0, 20.7] +// BM_map_int64_fulllookup 994 592 +40.44% <256> [48.0, 17.2] +// BM_map_int64_delete 1778 764 +57.03% <256> [48.0, 20.7] +// BM_map_int64_queueaddrem 3189 1547 +51.49% <256> [48.0, 20.9] +// BM_map_int64_mixedaddrem 
3779 1887 +50.07% <256> [48.0, 21.6] +// BM_map_int64_fifo 147 145 +1.36% <256> [48.0, 17.2] +// BM_map_int64_fwditer 162 41 +74.69% <256> [48.0, 20.7] +// BM_set_string_insert 1989 1966 +1.16% <256> [64.0, 44.5] +// BM_set_string_lookup 1709 1600 +6.38% <256> [64.0, 44.5] +// BM_set_string_fulllookup 1573 1529 +2.80% <256> [64.0, 35.4] +// BM_set_string_delete 2520 1920 +23.81% <256> [64.0, 44.5] +// BM_set_string_queueaddrem 4706 4309 +8.44% <256> [64.0, 48.3] +// BM_set_string_mixedaddrem 5080 4654 +8.39% <256> [64.0, 46.7] +// BM_set_string_fifo 318 512 -61.01% <256> [64.0, 35.4] +// BM_set_string_fwditer 182 93 +48.90% <256> [64.0, 44.5] +// BM_map_string_insert 2600 2227 +14.35% <256> [72.0, 55.8] +// BM_map_string_lookup 2068 1730 +16.34% <256> [72.0, 55.8] +// BM_map_string_fulllookup 1859 1618 +12.96% <256> [72.0, 44.0] +// BM_map_string_delete 3168 2080 +34.34% <256> [72.0, 55.8] +// BM_map_string_queueaddrem 5840 4701 +19.50% <256> [72.0, 59.4] +// BM_map_string_mixedaddrem 6400 5200 +18.75% <256> [72.0, 57.8] +// BM_map_string_fifo 398 596 -49.75% <256> [72.0, 44.0] +// BM_map_string_fwditer 243 113 +53.50% <256> [72.0, 55.8] + +#ifndef UTIL_BTREE_BTREE_H__ +#define UTIL_BTREE_BTREE_H__ + +#include <stddef.h> +#include <string.h> +#include <sys/types.h> +#include <algorithm> +#include <functional> +#include <iostream> +#include <iterator> +#include <limits> +#include <type_traits> +#include <new> +#include <ostream> +#include <string> +#include <utility> + +#include "include/ceph_assert.h" + +namespace btree { + +// Inside a btree method, if we just call swap(), it will choose the +// btree::swap method, which we don't want. And we can't say ::swap +// because then MSVC won't pickup any std::swap() implementations. We +// can't just use std::swap() directly because then we don't get the +// specialization for types outside the std namespace. 
// So the solution is to have a special swap helper function whose name
// doesn't collide with other swap functions defined by the btree classes.
template <typename T>
inline void btree_swap_helper(T &a, T &b) {
  // Unqualified call with std::swap visible: a swap overload found through
  // argument-dependent lookup is preferred, std::swap is the fallback.
  using std::swap;
  swap(a, b);
}

// Compile-time type selection: if_<cond, A, B>::type is A for a true
// condition (primary template) and B for a false one (specialization).
template <bool cond, typename A, typename B>
struct if_ {
  using type = A;
};

template <typename A, typename B>
struct if_<false, A, B> {
  using type = B;
};

// Two types whose sizes are guaranteed to differ:
// sizeof(small_) < sizeof(big_).
using small_ = char;

struct big_ {
  char dummy[2];
};

// Helper template for the COMPILE_ASSERT macro below.
template <bool>
struct CompileAssert {};

// Compile-time assertion: a false expr yields an array typedef with a
// negative bound, which is ill-formed.  msg must be a valid identifier; it
// becomes the name of the typedef.
#define COMPILE_ASSERT(expr, msg) \
  typedef CompileAssert<(bool(expr))> msg[bool(expr) ? 1 : -1]

// Tag type marking a functor as a three-way "compare-to" functor.  Derive
// your comparer from it to opt in:
//
//   struct MyStringComparer
//       : public util::btree::btree_key_compare_to_tag {
//     int operator()(const string &a, const string &b) const {
//       return a.compare(b);
//     }
//   };
//
// The functor has to return an int, not a bool.  There is a COMPILE_ASSERT
// which enforces this return type.
struct btree_key_compare_to_tag {};

// Trait telling whether Compare is convertible to (i.e. derived from)
// btree_key_compare_to_tag.
template <typename Compare>
struct btree_is_key_compare_to
    : public std::is_convertible<Compare, btree_key_compare_to_tag> {};

// A helper class to convert a boolean comparison into a three-way
// "compare-to" comparison that returns a negative value to indicate
// less-than, zero to indicate equality and a positive value to
// indicate greater-than.  This helper class is specialized for
// less<string> and greater<string>.
The btree_key_compare_to_adapter +// class is provided so that btree users automatically get the more +// efficient compare-to code when using common google string types +// with common comparison functors. +template <typename Compare> +struct btree_key_compare_to_adapter : Compare { + btree_key_compare_to_adapter() { } + btree_key_compare_to_adapter(const Compare &c) : Compare(c) { } + btree_key_compare_to_adapter(const btree_key_compare_to_adapter<Compare> &c) + : Compare(c) { + } +}; + +template <> +struct btree_key_compare_to_adapter<std::less<std::string> > + : public btree_key_compare_to_tag { + btree_key_compare_to_adapter() {} + btree_key_compare_to_adapter(const std::less<std::string>&) {} + btree_key_compare_to_adapter( + const btree_key_compare_to_adapter<std::less<std::string> >&) {} + int operator()(const std::string &a, const std::string &b) const { + return a.compare(b); + } +}; + +template <> +struct btree_key_compare_to_adapter<std::greater<std::string> > + : public btree_key_compare_to_tag { + btree_key_compare_to_adapter() {} + btree_key_compare_to_adapter(const std::greater<std::string>&) {} + btree_key_compare_to_adapter( + const btree_key_compare_to_adapter<std::greater<std::string> >&) {} + int operator()(const std::string &a, const std::string &b) const { + return b.compare(a); + } +}; + +// A helper class that allows a compare-to functor to behave like a plain +// compare functor. This specialization is used when we do not have a +// compare-to functor. +template <typename Key, typename Compare, bool HaveCompareTo> +struct btree_key_comparer { + btree_key_comparer() {} + btree_key_comparer(Compare c) : comp(c) {} + static bool bool_compare(const Compare &comp, const Key &x, const Key &y) { + return comp(x, y); + } + bool operator()(const Key &x, const Key &y) const { + return bool_compare(comp, x, y); + } + Compare comp; +}; + +// A specialization of btree_key_comparer when a compare-to functor is +// present. 
We need a plain (boolean) comparison in some parts of the btree +// code, such as insert-with-hint. +template <typename Key, typename Compare> +struct btree_key_comparer<Key, Compare, true> { + btree_key_comparer() {} + btree_key_comparer(Compare c) : comp(c) {} + static bool bool_compare(const Compare &comp, const Key &x, const Key &y) { + return comp(x, y) < 0; + } + bool operator()(const Key &x, const Key &y) const { + return bool_compare(comp, x, y); + } + Compare comp; +}; + +// A helper function to compare to keys using the specified compare +// functor. This dispatches to the appropriate btree_key_comparer comparison, +// depending on whether we have a compare-to functor or not (which depends on +// whether Compare is derived from btree_key_compare_to_tag). +template <typename Key, typename Compare> +static bool btree_compare_keys( + const Compare &comp, const Key &x, const Key &y) { + typedef btree_key_comparer<Key, Compare, + btree_is_key_compare_to<Compare>::value> key_comparer; + return key_comparer::bool_compare(comp, x, y); +} + +template <typename Key, typename Compare, + typename Alloc, int TargetNodeSize, int ValueSize> +struct btree_common_params { + // If Compare is derived from btree_key_compare_to_tag then use it as the + // key_compare type. Otherwise, use btree_key_compare_to_adapter<> which will + // fall-back to Compare if we don't have an appropriate specialization. + typedef typename if_< + btree_is_key_compare_to<Compare>::value, + Compare, btree_key_compare_to_adapter<Compare> >::type key_compare; + // A type which indicates if we have a key-compare-to functor or a plain old + // key-compare functor. + typedef btree_is_key_compare_to<key_compare> is_key_compare_to; + + typedef Alloc allocator_type; + typedef Key key_type; + typedef ssize_t size_type; + typedef ptrdiff_t difference_type; + + enum { + kTargetNodeSize = TargetNodeSize, + + // Available space for values. 
This is largest for leaf nodes, + // which has overhead no fewer than two pointers. + kNodeValueSpace = TargetNodeSize - 2 * sizeof(void*), + }; + + // This is an integral type large enough to hold as many + // ValueSize-values as will fit a node of TargetNodeSize bytes. + typedef typename if_< + (kNodeValueSpace / ValueSize) >= 256, + uint16_t, + uint8_t>::type node_count_type; +}; + +// A parameters structure for holding the type parameters for a btree_map. +template <typename Key, typename Data, typename Compare, + typename Alloc, int TargetNodeSize> +struct btree_map_params + : public btree_common_params<Key, Compare, Alloc, TargetNodeSize, + sizeof(Key) + sizeof(Data)> { + typedef Data data_type; + typedef Data mapped_type; + typedef std::pair<const Key, data_type> value_type; + typedef std::pair<Key, data_type> mutable_value_type; + typedef value_type* pointer; + typedef const value_type* const_pointer; + typedef value_type& reference; + typedef const value_type& const_reference; + + enum { + kValueSize = sizeof(Key) + sizeof(data_type), + }; + + static const Key& key(const value_type &x) { return x.first; } + static const Key& key(const mutable_value_type &x) { return x.first; } + static void swap(mutable_value_type *a, mutable_value_type *b) { + btree_swap_helper(a->first, b->first); + btree_swap_helper(a->second, b->second); + } +}; + +// A parameters structure for holding the type parameters for a btree_set. 
+template <typename Key, typename Compare, typename Alloc, int TargetNodeSize> +struct btree_set_params + : public btree_common_params<Key, Compare, Alloc, TargetNodeSize, + sizeof(Key)> { + typedef std::false_type data_type; + typedef std::false_type mapped_type; + typedef Key value_type; + typedef value_type mutable_value_type; + typedef value_type* pointer; + typedef const value_type* const_pointer; + typedef value_type& reference; + typedef const value_type& const_reference; + + enum { + kValueSize = sizeof(Key), + }; + + static const Key& key(const value_type &x) { return x; } + static void swap(mutable_value_type *a, mutable_value_type *b) { + btree_swap_helper<mutable_value_type>(*a, *b); + } +}; + +// An adapter class that converts a lower-bound compare into an upper-bound +// compare. +template <typename Key, typename Compare> +struct btree_upper_bound_adapter : public Compare { + btree_upper_bound_adapter(Compare c) : Compare(c) {} + bool operator()(const Key &a, const Key &b) const { + return !static_cast<const Compare&>(*this)(b, a); + } +}; + +template <typename Key, typename CompareTo> +struct btree_upper_bound_compare_to_adapter : public CompareTo { + btree_upper_bound_compare_to_adapter(CompareTo c) : CompareTo(c) {} + int operator()(const Key &a, const Key &b) const { + return static_cast<const CompareTo&>(*this)(b, a); + } +}; + +// Dispatch helper class for using linear search with plain compare. 
+template <typename K, typename N, typename Compare> +struct btree_linear_search_plain_compare { + static int lower_bound(const K &k, const N &n, Compare comp) { + return n.linear_search_plain_compare(k, 0, n.count(), comp); + } + static int upper_bound(const K &k, const N &n, Compare comp) { + typedef btree_upper_bound_adapter<K, Compare> upper_compare; + return n.linear_search_plain_compare(k, 0, n.count(), upper_compare(comp)); + } +}; + +// Dispatch helper class for using linear search with compare-to +template <typename K, typename N, typename CompareTo> +struct btree_linear_search_compare_to { + static int lower_bound(const K &k, const N &n, CompareTo comp) { + return n.linear_search_compare_to(k, 0, n.count(), comp); + } + static int upper_bound(const K &k, const N &n, CompareTo comp) { + typedef btree_upper_bound_adapter<K, + btree_key_comparer<K, CompareTo, true> > upper_compare; + return n.linear_search_plain_compare(k, 0, n.count(), upper_compare(comp)); + } +}; + +// Dispatch helper class for using binary search with plain compare. +template <typename K, typename N, typename Compare> +struct btree_binary_search_plain_compare { + static int lower_bound(const K &k, const N &n, Compare comp) { + return n.binary_search_plain_compare(k, 0, n.count(), comp); + } + static int upper_bound(const K &k, const N &n, Compare comp) { + typedef btree_upper_bound_adapter<K, Compare> upper_compare; + return n.binary_search_plain_compare(k, 0, n.count(), upper_compare(comp)); + } +}; + +// Dispatch helper class for using binary search with compare-to. 
+template <typename K, typename N, typename CompareTo> +struct btree_binary_search_compare_to { + static int lower_bound(const K &k, const N &n, CompareTo comp) { + return n.binary_search_compare_to(k, 0, n.count(), CompareTo()); + } + static int upper_bound(const K &k, const N &n, CompareTo comp) { + typedef btree_upper_bound_adapter<K, + btree_key_comparer<K, CompareTo, true> > upper_compare; + return n.linear_search_plain_compare(k, 0, n.count(), upper_compare(comp)); + } +}; + +// A node in the btree holding. The same node type is used for both internal +// and leaf nodes in the btree, though the nodes are allocated in such a way +// that the children array is only valid in internal nodes. +template <typename Params> +class btree_node { + public: + typedef Params params_type; + typedef btree_node<Params> self_type; + typedef typename Params::key_type key_type; + typedef typename Params::data_type data_type; + typedef typename Params::value_type value_type; + typedef typename Params::mutable_value_type mutable_value_type; + typedef typename Params::pointer pointer; + typedef typename Params::const_pointer const_pointer; + typedef typename Params::reference reference; + typedef typename Params::const_reference const_reference; + typedef typename Params::key_compare key_compare; + typedef typename Params::size_type size_type; + typedef typename Params::difference_type difference_type; + // Typedefs for the various types of node searches. 
+ typedef btree_linear_search_plain_compare< + key_type, self_type, key_compare> linear_search_plain_compare_type; + typedef btree_linear_search_compare_to< + key_type, self_type, key_compare> linear_search_compare_to_type; + typedef btree_binary_search_plain_compare< + key_type, self_type, key_compare> binary_search_plain_compare_type; + typedef btree_binary_search_compare_to< + key_type, self_type, key_compare> binary_search_compare_to_type; + // If we have a valid key-compare-to type, use linear_search_compare_to, + // otherwise use linear_search_plain_compare. + typedef typename if_< + Params::is_key_compare_to::value, + linear_search_compare_to_type, + linear_search_plain_compare_type>::type linear_search_type; + // If we have a valid key-compare-to type, use binary_search_compare_to, + // otherwise use binary_search_plain_compare. + typedef typename if_< + Params::is_key_compare_to::value, + binary_search_compare_to_type, + binary_search_plain_compare_type>::type binary_search_type; + // If the key is an integral or floating point type, use linear search which + // is faster than binary search for such types. Might be wise to also + // configure linear search based on node-size. + typedef typename if_< + std::is_integral<key_type>::value || + std::is_floating_point<key_type>::value, + linear_search_type, binary_search_type>::type search_type; + + struct base_fields { + typedef typename Params::node_count_type field_type; + + // A boolean indicating whether the node is a leaf or not. + bool leaf; + // The position of the node in the node's parent. + field_type position; + // The maximum number of values the node can hold. + field_type max_count; + // The count of the number of values in the node. + field_type count; + // A pointer to the node's parent. + btree_node *parent; + }; + + enum { + kValueSize = params_type::kValueSize, + kTargetNodeSize = params_type::kTargetNodeSize, + + // Compute how many values we can fit onto a leaf node. 
+ kNodeTargetValues = (kTargetNodeSize - sizeof(base_fields)) / kValueSize, + // We need a minimum of 3 values per internal node in order to perform + // splitting (1 value for the two nodes involved in the split and 1 value + // propagated to the parent as the delimiter for the split). + kNodeValues = kNodeTargetValues >= 3 ? kNodeTargetValues : 3, + + kExactMatch = 1 << 30, + kMatchMask = kExactMatch - 1, + }; + + struct leaf_fields : public base_fields { + // The array of values. Only the first count of these values have been + // constructed and are valid. + mutable_value_type values[kNodeValues]; + }; + + struct internal_fields : public leaf_fields { + // The array of child pointers. The keys in children_[i] are all less than + // key(i). The keys in children_[i + 1] are all greater than key(i). There + // are always count + 1 children. + btree_node *children[kNodeValues + 1]; + }; + + struct root_fields : public internal_fields { + btree_node *rightmost; + size_type size; + }; + + public: + // Getter/setter for whether this is a leaf node or not. This value doesn't + // change after the node is created. + bool leaf() const { return fields_.leaf; } + + // Getter for the position of this node in its parent. + int position() const { return fields_.position; } + void set_position(int v) { fields_.position = v; } + + // Getter/setter for the number of values stored in this node. + int count() const { return fields_.count; } + void set_count(int v) { fields_.count = v; } + int max_count() const { return fields_.max_count; } + + // Getter for the parent of this node. + btree_node* parent() const { return fields_.parent; } + // Getter for whether the node is the root of the tree. The parent of the + // root of the tree is the leftmost node in the tree which is guaranteed to + // be a leaf. 
+ bool is_root() const { return parent()->leaf(); } + void make_root() { + ceph_assert(parent()->is_root()); + fields_.parent = fields_.parent->parent(); + } + + // Getter for the rightmost root node field. Only valid on the root node. + btree_node* rightmost() const { return fields_.rightmost; } + btree_node** mutable_rightmost() { return &fields_.rightmost; } + + // Getter for the size root node field. Only valid on the root node. + size_type size() const { return fields_.size; } + size_type* mutable_size() { return &fields_.size; } + + // Getters for the key/value at position i in the node. + const key_type& key(int i) const { + return params_type::key(fields_.values[i]); + } + reference value(int i) { + return reinterpret_cast<reference>(fields_.values[i]); + } + const_reference value(int i) const { + return reinterpret_cast<const_reference>(fields_.values[i]); + } + mutable_value_type* mutable_value(int i) { + return &fields_.values[i]; + } + + // Swap value i in this node with value j in node x. + void value_swap(int i, btree_node *x, int j) { + params_type::swap(mutable_value(i), x->mutable_value(j)); + } + + // Getters/setter for the child at position i in the node. + btree_node* child(int i) const { return fields_.children[i]; } + btree_node** mutable_child(int i) { return &fields_.children[i]; } + void set_child(int i, btree_node *c) { + *mutable_child(i) = c; + c->fields_.parent = this; + c->fields_.position = i; + } + + // Returns the position of the first value whose key is not less than k. + template <typename Compare> + int lower_bound(const key_type &k, const Compare &comp) const { + return search_type::lower_bound(k, *this, comp); + } + // Returns the position of the first value whose key is greater than k. 
+ template <typename Compare> + int upper_bound(const key_type &k, const Compare &comp) const { + return search_type::upper_bound(k, *this, comp); + } + + // Returns the position of the first value whose key is not less than k using + // linear search performed using plain compare. + template <typename Compare> + int linear_search_plain_compare( + const key_type &k, int s, int e, const Compare &comp) const { + while (s < e) { + if (!btree_compare_keys(comp, key(s), k)) { + break; + } + ++s; + } + return s; + } + + // Returns the position of the first value whose key is not less than k using + // linear search performed using compare-to. + template <typename Compare> + int linear_search_compare_to( + const key_type &k, int s, int e, const Compare &comp) const { + while (s < e) { + int c = comp(key(s), k); + if (c == 0) { + return s | kExactMatch; + } else if (c > 0) { + break; + } + ++s; + } + return s; + } + + // Returns the position of the first value whose key is not less than k using + // binary search performed using plain compare. + template <typename Compare> + int binary_search_plain_compare( + const key_type &k, int s, int e, const Compare &comp) const { + while (s != e) { + int mid = (s + e) / 2; + if (btree_compare_keys(comp, key(mid), k)) { + s = mid + 1; + } else { + e = mid; + } + } + return s; + } + + // Returns the position of the first value whose key is not less than k using + // binary search performed using compare-to. + template <typename CompareTo> + int binary_search_compare_to( + const key_type &k, int s, int e, const CompareTo &comp) const { + while (s != e) { + int mid = (s + e) / 2; + int c = comp(key(mid), k); + if (c < 0) { + s = mid + 1; + } else if (c > 0) { + e = mid; + } else { + // Need to return the first value whose key is not less than k, which + // requires continuing the binary search. 
Note that we are guaranteed + // that the result is an exact match because if "key(mid-1) < k" the + // call to binary_search_compare_to() will return "mid". + s = binary_search_compare_to(k, s, mid, comp); + return s | kExactMatch; + } + } + return s; + } + + // Inserts the value x at position i, shifting all existing values and + // children at positions >= i to the right by 1. + void insert_value(int i, const value_type &x); + + // Removes the value at position i, shifting all existing values and children + // at positions > i to the left by 1. + void remove_value(int i); + + // Rebalances a node with its right sibling. + void rebalance_right_to_left(btree_node *sibling, int to_move); + void rebalance_left_to_right(btree_node *sibling, int to_move); + + // Splits a node, moving a portion of the node's values to its right sibling. + void split(btree_node *sibling, int insert_position); + + // Merges a node with its right sibling, moving all of the values and the + // delimiting key in the parent node onto itself. + void merge(btree_node *sibling); + + // Swap the contents of "this" and "src". + void swap(btree_node *src); + +#ifdef NDEBUG + static constexpr auto no_debug = true; +#else + static constexpr auto no_debug = false; +#endif + // Node allocation/deletion routines. 
+ static btree_node* init_leaf( + leaf_fields *f, btree_node *parent, int max_count) { + btree_node *n = reinterpret_cast<btree_node*>(f); + f->leaf = 1; + f->position = 0; + f->max_count = max_count; + f->count = 0; + f->parent = parent; + if (!no_debug) { + memset(&f->values, 0, max_count * sizeof(value_type)); + } + return n; + } + static btree_node* init_internal(internal_fields *f, btree_node *parent) { + btree_node *n = init_leaf(f, parent, kNodeValues); + f->leaf = 0; + if (!no_debug) { + memset(f->children, 0, sizeof(f->children)); + } + return n; + } + static btree_node* init_root(root_fields *f, btree_node *parent) { + btree_node *n = init_internal(f, parent); + f->rightmost = parent; + f->size = parent->count(); + return n; + } + void destroy() { + for (int i = 0; i < count(); ++i) { + value_destroy(i); + } + } + + private: + void value_init(int i) { + new (&fields_.values[i]) mutable_value_type; + } + void value_init(int i, const value_type &x) { + new (&fields_.values[i]) mutable_value_type(x); + } + void value_destroy(int i) { + fields_.values[i].~mutable_value_type(); + } + + private: + root_fields fields_; + + private: + btree_node(const btree_node&); + void operator=(const btree_node&); +}; + +template <typename Node, typename Reference, typename Pointer> +struct btree_iterator { + typedef typename Node::key_type key_type; + typedef typename Node::size_type size_type; + typedef typename Node::difference_type difference_type; + typedef typename Node::params_type params_type; + + typedef Node node_type; + typedef typename std::remove_const<Node>::type normal_node; + typedef const Node const_node; + typedef typename params_type::value_type value_type; + typedef typename params_type::pointer normal_pointer; + typedef typename params_type::reference normal_reference; + typedef typename params_type::const_pointer const_pointer; + typedef typename params_type::const_reference const_reference; + + typedef Pointer pointer; + typedef Reference reference; + 
typedef std::bidirectional_iterator_tag iterator_category; + + typedef btree_iterator< + normal_node, normal_reference, normal_pointer> iterator; + typedef btree_iterator< + const_node, const_reference, const_pointer> const_iterator; + typedef btree_iterator<Node, Reference, Pointer> self_type; + + btree_iterator() + : node(NULL), + position(-1) { + } + btree_iterator(Node *n, int p) + : node(n), + position(p) { + } + btree_iterator(const iterator &x) + : node(x.node), + position(x.position) { + } + + // Increment/decrement the iterator. + void increment() { + if (node->leaf() && ++position < node->count()) { + return; + } + increment_slow(); + } + void increment_by(int count); + void increment_slow(); + + void decrement() { + if (node->leaf() && --position >= 0) { + return; + } + decrement_slow(); + } + void decrement_slow(); + + bool operator==(const const_iterator &x) const { + return node == x.node && position == x.position; + } + bool operator!=(const const_iterator &x) const { + return node != x.node || position != x.position; + } + + // Accessors for the key/value the iterator is pointing at. + const key_type& key() const { + return node->key(position); + } + reference operator*() const { + return node->value(position); + } + pointer operator->() const { + return &node->value(position); + } + + self_type& operator++() { + increment(); + return *this; + } + self_type& operator--() { + decrement(); + return *this; + } + self_type operator++(int) { + self_type tmp = *this; + ++*this; + return tmp; + } + self_type operator--(int) { + self_type tmp = *this; + --*this; + return tmp; + } + + // The node in the tree the iterator is pointing at. + Node *node; + // The position within the node of the tree the iterator is pointing at. + int position; +}; + +// Dispatch helper class for using btree::internal_locate with plain compare. 
+struct btree_internal_locate_plain_compare { + template <typename K, typename T, typename Iter> + static std::pair<Iter, int> dispatch(const K &k, const T &t, Iter iter) { + return t.internal_locate_plain_compare(k, iter); + } +}; + +// Dispatch helper class for using btree::internal_locate with compare-to. +struct btree_internal_locate_compare_to { + template <typename K, typename T, typename Iter> + static std::pair<Iter, int> dispatch(const K &k, const T &t, Iter iter) { + return t.internal_locate_compare_to(k, iter); + } +}; + +template <typename Params> +class btree : public Params::key_compare { + typedef btree<Params> self_type; + typedef btree_node<Params> node_type; + typedef typename node_type::base_fields base_fields; + typedef typename node_type::leaf_fields leaf_fields; + typedef typename node_type::internal_fields internal_fields; + typedef typename node_type::root_fields root_fields; + typedef typename Params::is_key_compare_to is_key_compare_to; + + friend class btree_internal_locate_plain_compare; + friend class btree_internal_locate_compare_to; + typedef typename if_< + is_key_compare_to::value, + btree_internal_locate_compare_to, + btree_internal_locate_plain_compare>::type internal_locate_type; + + enum { + kNodeValues = node_type::kNodeValues, + kMinNodeValues = kNodeValues / 2, + kValueSize = node_type::kValueSize, + kExactMatch = node_type::kExactMatch, + kMatchMask = node_type::kMatchMask, + }; + + // A helper class to get the empty base class optimization for 0-size + // allocators. Base is internal_allocator_type. + // (e.g. empty_base_handle<internal_allocator_type, node_type*>). If Base is + // 0-size, the compiler doesn't have to reserve any space for it and + // sizeof(empty_base_handle) will simply be sizeof(Data). Google [empty base + // class optimization] for more details. 
+ template <typename Base, typename Data> + struct empty_base_handle : public Base { + empty_base_handle(const Base &b, const Data &d) + : Base(b), + data(d) { + } + Data data; + }; + + struct node_stats { + node_stats(ssize_t l, ssize_t i) + : leaf_nodes(l), + internal_nodes(i) { + } + + node_stats& operator+=(const node_stats &x) { + leaf_nodes += x.leaf_nodes; + internal_nodes += x.internal_nodes; + return *this; + } + + ssize_t leaf_nodes; + ssize_t internal_nodes; + }; + + public: + typedef Params params_type; + typedef typename Params::key_type key_type; + typedef typename Params::data_type data_type; + typedef typename Params::mapped_type mapped_type; + typedef typename Params::value_type value_type; + typedef typename Params::key_compare key_compare; + typedef typename Params::pointer pointer; + typedef typename Params::const_pointer const_pointer; + typedef typename Params::reference reference; + typedef typename Params::const_reference const_reference; + typedef typename Params::size_type size_type; + typedef typename Params::difference_type difference_type; + typedef btree_iterator<node_type, reference, pointer> iterator; + typedef typename iterator::const_iterator const_iterator; + typedef std::reverse_iterator<const_iterator> const_reverse_iterator; + typedef std::reverse_iterator<iterator> reverse_iterator; + + typedef typename Params::allocator_type allocator_type; + typedef typename allocator_type::template rebind<char>::other + internal_allocator_type; + + public: + // Default constructor. + btree(const key_compare &comp, const allocator_type &alloc); + + // Copy constructor. + btree(const self_type &x); + + // Destructor. + ~btree() { + clear(); + } + + // Iterator routines. + iterator begin() { + return iterator(leftmost(), 0); + } + const_iterator begin() const { + return const_iterator(leftmost(), 0); + } + iterator end() { + return iterator(rightmost(), rightmost() ? 
rightmost()->count() : 0); + } + const_iterator end() const { + return const_iterator(rightmost(), rightmost() ? rightmost()->count() : 0); + } + reverse_iterator rbegin() { + return reverse_iterator(end()); + } + const_reverse_iterator rbegin() const { + return const_reverse_iterator(end()); + } + reverse_iterator rend() { + return reverse_iterator(begin()); + } + const_reverse_iterator rend() const { + return const_reverse_iterator(begin()); + } + + // Finds the first element whose key is not less than key. + iterator lower_bound(const key_type &key) { + return internal_end( + internal_lower_bound(key, iterator(root(), 0))); + } + const_iterator lower_bound(const key_type &key) const { + return internal_end( + internal_lower_bound(key, const_iterator(root(), 0))); + } + + // Finds the first element whose key is greater than key. + iterator upper_bound(const key_type &key) { + return internal_end( + internal_upper_bound(key, iterator(root(), 0))); + } + const_iterator upper_bound(const key_type &key) const { + return internal_end( + internal_upper_bound(key, const_iterator(root(), 0))); + } + + // Finds the range of values which compare equal to key. The first member of + // the returned pair is equal to lower_bound(key). The second member pair of + // the pair is equal to upper_bound(key). + std::pair<iterator,iterator> equal_range(const key_type &key) { + return std::make_pair(lower_bound(key), upper_bound(key)); + } + std::pair<const_iterator,const_iterator> equal_range(const key_type &key) const { + return std::make_pair(lower_bound(key), upper_bound(key)); + } + + // Inserts a value into the btree only if it does not already exist. The + // boolean return value indicates whether insertion succeeded or failed. The + // ValuePointer type is used to avoid instatiating the value unless the key + // is being inserted. Value is not dereferenced if the key already exists in + // the btree. See btree_map::operator[]. 
+ template <typename ValuePointer> + std::pair<iterator,bool> insert_unique(const key_type &key, ValuePointer value); + + // Inserts a value into the btree only if it does not already exist. The + // boolean return value indicates whether insertion succeeded or failed. + std::pair<iterator,bool> insert_unique(const value_type &v) { + return insert_unique(params_type::key(v), &v); + } + + // Insert with hint. Check to see if the value should be placed immediately + // before position in the tree. If it does, then the insertion will take + // amortized constant time. If not, the insertion will take amortized + // logarithmic time as if a call to insert_unique(v) were made. + iterator insert_unique(iterator position, const value_type &v); + + // Insert a range of values into the btree. + template <typename InputIterator> + void insert_unique(InputIterator b, InputIterator e); + + // Inserts a value into the btree. The ValuePointer type is used to avoid + // instatiating the value unless the key is being inserted. Value is not + // dereferenced if the key already exists in the btree. See + // btree_map::operator[]. + template <typename ValuePointer> + iterator insert_multi(const key_type &key, ValuePointer value); + + // Inserts a value into the btree. + iterator insert_multi(const value_type &v) { + return insert_multi(params_type::key(v), &v); + } + + // Insert with hint. Check to see if the value should be placed immediately + // before position in the tree. If it does, then the insertion will take + // amortized constant time. If not, the insertion will take amortized + // logarithmic time as if a call to insert_multi(v) were made. + iterator insert_multi(iterator position, const value_type &v); + + // Insert a range of values into the btree. + template <typename InputIterator> + void insert_multi(InputIterator b, InputIterator e); + + void assign(const self_type &x); + + // Erase the specified iterator from the btree. The iterator must be valid + // (i.e. 
not equal to end()). Return an iterator pointing to the node after + // the one that was erased (or end() if none exists). + iterator erase(iterator iter); + + // Erases range. Returns the number of keys erased. + int erase(iterator begin, iterator end); + + // Erases the specified key from the btree. Returns 1 if an element was + // erased and 0 otherwise. + int erase_unique(const key_type &key); + + // Erases all of the entries matching the specified key from the + // btree. Returns the number of elements erased. + int erase_multi(const key_type &key); + + // Finds the iterator corresponding to a key or returns end() if the key is + // not present. + iterator find_unique(const key_type &key) { + return internal_end( + internal_find_unique(key, iterator(root(), 0))); + } + const_iterator find_unique(const key_type &key) const { + return internal_end( + internal_find_unique(key, const_iterator(root(), 0))); + } + iterator find_multi(const key_type &key) { + return internal_end( + internal_find_multi(key, iterator(root(), 0))); + } + const_iterator find_multi(const key_type &key) const { + return internal_end( + internal_find_multi(key, const_iterator(root(), 0))); + } + + // Returns a count of the number of times the key appears in the btree. + size_type count_unique(const key_type &key) const { + const_iterator begin = internal_find_unique( + key, const_iterator(root(), 0)); + if (!begin.node) { + // The key doesn't exist in the tree. + return 0; + } + return 1; + } + // Returns a count of the number of times the key appears in the btree. + size_type count_multi(const key_type &key) const { + return distance(lower_bound(key), upper_bound(key)); + } + + // Clear the btree, deleting all of the values it contains. + void clear(); + + // Swap the contents of *this and x. + void swap(self_type &x); + + // Assign the contents of x to *this. + self_type& operator=(const self_type &x) { + if (&x == this) { + // Don't copy onto ourselves. 
+ return *this; + } + assign(x); + return *this; + } + + key_compare* mutable_key_comp() { + return this; + } + const key_compare& key_comp() const { + return *this; + } + bool compare_keys(const key_type &x, const key_type &y) const { + return btree_compare_keys(key_comp(), x, y); + } + + // Dump the btree to the specified ostream. Requires that operator<< is + // defined for Key and Value. + void dump(std::ostream &os) const { + if (root() != NULL) { + internal_dump(os, root(), 0); + } + } + + // Verifies the structure of the btree. + void verify() const; + + // Size routines. Note that empty() is slightly faster than doing size()==0. + size_type size() const { + if (empty()) return 0; + if (root()->leaf()) return root()->count(); + return root()->size(); + } + size_type max_size() const { return std::numeric_limits<size_type>::max(); } + bool empty() const { return root() == NULL; } + + // The height of the btree. An empty tree will have height 0. + size_type height() const { + size_type h = 0; + if (root()) { + // Count the length of the chain from the leftmost node up to the + // root. We actually count from the root back around to the level below + // the root, but the calculation is the same because of the circularity + // of that traversal. + const node_type *n = root(); + do { + ++h; + n = n->parent(); + } while (n != root()); + } + return h; + } + + // The number of internal, leaf and total nodes used by the btree. + size_type leaf_nodes() const { + return internal_stats(root()).leaf_nodes; + } + size_type internal_nodes() const { + return internal_stats(root()).internal_nodes; + } + size_type nodes() const { + node_stats stats = internal_stats(root()); + return stats.leaf_nodes + stats.internal_nodes; + } + + // The total number of bytes used by the btree. 
+ size_type bytes_used() const { + node_stats stats = internal_stats(root()); + if (stats.leaf_nodes == 1 && stats.internal_nodes == 0) { + return sizeof(*this) + + sizeof(base_fields) + root()->max_count() * sizeof(value_type); + } else { + return sizeof(*this) + + sizeof(root_fields) - sizeof(internal_fields) + + stats.leaf_nodes * sizeof(leaf_fields) + + stats.internal_nodes * sizeof(internal_fields); + } + } + + // The average number of bytes used per value stored in the btree. + static double average_bytes_per_value() { + // Returns the number of bytes per value on a leaf node that is 75% + // full. Experimentally, this matches up nicely with the computed number of + // bytes per value in trees that had their values inserted in random order. + return sizeof(leaf_fields) / (kNodeValues * 0.75); + } + + // The fullness of the btree. Computed as the number of elements in the btree + // divided by the maximum number of elements a tree with the current number + // of nodes could hold. A value of 1 indicates perfect space + // utilization. Smaller values indicate space wastage. + double fullness() const { + return double(size()) / (nodes() * kNodeValues); + } + // The overhead of the btree structure in bytes per node. Computed as the + // total number of bytes used by the btree minus the number of bytes used for + // storing elements divided by the number of elements. + double overhead() const { + if (empty()) { + return 0.0; + } + return (bytes_used() - size() * kValueSize) / double(size()); + } + + private: + // Internal accessor routines. + node_type* root() { return root_.data; } + const node_type* root() const { return root_.data; } + node_type** mutable_root() { return &root_.data; } + + // The rightmost node is stored in the root node. + node_type* rightmost() { + return (!root() || root()->leaf()) ? root() : root()->rightmost(); + } + const node_type* rightmost() const { + return (!root() || root()->leaf()) ? 
root() : root()->rightmost(); + } + node_type** mutable_rightmost() { return root()->mutable_rightmost(); } + + // The leftmost node is stored as the parent of the root node. + node_type* leftmost() { return root() ? root()->parent() : NULL; } + const node_type* leftmost() const { return root() ? root()->parent() : NULL; } + + // The size of the tree is stored in the root node. + size_type* mutable_size() { return root()->mutable_size(); } + + // Allocator routines. + internal_allocator_type* mutable_internal_allocator() { + return static_cast<internal_allocator_type*>(&root_); + } + const internal_allocator_type& internal_allocator() const { + return *static_cast<const internal_allocator_type*>(&root_); + } + + // Node creation/deletion routines. + node_type* new_internal_node(node_type *parent) { + internal_fields *p = reinterpret_cast<internal_fields*>( + mutable_internal_allocator()->allocate(sizeof(internal_fields))); + return node_type::init_internal(p, parent); + } + node_type* new_internal_root_node() { + root_fields *p = reinterpret_cast<root_fields*>( + mutable_internal_allocator()->allocate(sizeof(root_fields))); + return node_type::init_root(p, root()->parent()); + } + node_type* new_leaf_node(node_type *parent) { + leaf_fields *p = reinterpret_cast<leaf_fields*>( + mutable_internal_allocator()->allocate(sizeof(leaf_fields))); + return node_type::init_leaf(p, parent, kNodeValues); + } + node_type* new_leaf_root_node(int max_count) { + leaf_fields *p = reinterpret_cast<leaf_fields*>( + mutable_internal_allocator()->allocate( + sizeof(base_fields) + max_count * sizeof(value_type))); + return node_type::init_leaf(p, reinterpret_cast<node_type*>(p), max_count); + } + void delete_internal_node(node_type *node) { + node->destroy(); + ceph_assert(node != root()); + mutable_internal_allocator()->deallocate( + reinterpret_cast<char*>(node), sizeof(internal_fields)); + } + void delete_internal_root_node() { + root()->destroy(); + 
mutable_internal_allocator()->deallocate( + reinterpret_cast<char*>(root()), sizeof(root_fields)); + } + void delete_leaf_node(node_type *node) { + node->destroy(); + mutable_internal_allocator()->deallocate( + reinterpret_cast<char*>(node), + sizeof(base_fields) + node->max_count() * sizeof(value_type)); + } + + // Rebalances or splits the node iter points to. + void rebalance_or_split(iterator *iter); + + // Merges the values of left, right and the delimiting key on their parent + // onto left, removing the delimiting key and deleting right. + void merge_nodes(node_type *left, node_type *right); + + // Tries to merge node with its left or right sibling, and failing that, + // rebalance with its left or right sibling. Returns true if a merge + // occurred, at which point it is no longer valid to access node. Returns + // false if no merging took place. + bool try_merge_or_rebalance(iterator *iter); + + // Tries to shrink the height of the tree by 1. + void try_shrink(); + + iterator internal_end(iterator iter) { + return iter.node ? iter : end(); + } + const_iterator internal_end(const_iterator iter) const { + return iter.node ? iter : end(); + } + + // Inserts a value into the btree immediately before iter. Requires that + // key(v) <= iter.key() and (--iter).key() <= key(v). + iterator internal_insert(iterator iter, const value_type &v); + + // Returns an iterator pointing to the first value >= the value "iter" is + // pointing at. Note that "iter" might be pointing to an invalid location as + // iter.position == iter.node->count(). This routine simply moves iter up in + // the tree to a valid location. + template <typename IterType> + static IterType internal_last(IterType iter); + + // Returns an iterator pointing to the leaf position at which key would + // reside in the tree. We provide 2 versions of internal_locate. The first + // version (internal_locate_plain_compare) always returns 0 for the second + // field of the pair. 
The second version (internal_locate_compare_to) is for + // the key-compare-to specialization and returns either kExactMatch (if the + // key was found in the tree) or -kExactMatch (if it wasn't) in the second + // field of the pair. The compare_to specialization allows the caller to + // avoid a subsequent comparison to determine if an exact match was made, + // speeding up string keys. + template <typename IterType> + std::pair<IterType, int> internal_locate( + const key_type &key, IterType iter) const; + template <typename IterType> + std::pair<IterType, int> internal_locate_plain_compare( + const key_type &key, IterType iter) const; + template <typename IterType> + std::pair<IterType, int> internal_locate_compare_to( + const key_type &key, IterType iter) const; + + // Internal routine which implements lower_bound(). + template <typename IterType> + IterType internal_lower_bound( + const key_type &key, IterType iter) const; + + // Internal routine which implements upper_bound(). + template <typename IterType> + IterType internal_upper_bound( + const key_type &key, IterType iter) const; + + // Internal routine which implements find_unique(). + template <typename IterType> + IterType internal_find_unique( + const key_type &key, IterType iter) const; + + // Internal routine which implements find_multi(). + template <typename IterType> + IterType internal_find_multi( + const key_type &key, IterType iter) const; + + // Deletes a node and all of its children. + void internal_clear(node_type *node); + + // Dumps a node and all of its children to the specified ostream. + void internal_dump(std::ostream &os, const node_type *node, int level) const; + + // Verifies the tree structure of node. 
+ int internal_verify(const node_type *node, + const key_type *lo, const key_type *hi) const; + + node_stats internal_stats(const node_type *node) const { + if (!node) { + return node_stats(0, 0); + } + if (node->leaf()) { + return node_stats(1, 0); + } + node_stats res(0, 1); + for (int i = 0; i <= node->count(); ++i) { + res += internal_stats(node->child(i)); + } + return res; + } + + private: + empty_base_handle<internal_allocator_type, node_type*> root_; + + private: + // A never instantiated helper function that returns big_ if we have a + // key-compare-to functor or if R is bool and small_ otherwise. + template <typename R> + static typename if_< + if_<is_key_compare_to::value, + std::is_same<R, int>, + std::is_same<R, bool> >::type::value, + big_, small_>::type key_compare_checker(R); + + // A never instantiated helper function that returns the key comparison + // functor. + static key_compare key_compare_helper(); + + // Verify that key_compare returns a bool. This is similar to the way + // is_convertible in base/type_traits.h works. Note that key_compare_checker + // is never actually invoked. The compiler will select which + // key_compare_checker() to instantiate and then figure out the size of the + // return type of key_compare_checker() at compile time which we then check + // against the sizeof of big_. + COMPILE_ASSERT( + sizeof(key_compare_checker(key_compare_helper()(key_type(), key_type()))) == + sizeof(big_), + key_comparison_function_must_return_bool); + + // Note: We insist on kTargetValues, which is computed from + // Params::kTargetNodeSize, must fit the base_fields::field_type. + COMPILE_ASSERT(kNodeValues < + (1 << (8 * sizeof(typename base_fields::field_type))), + target_node_size_too_large); + + // Test the assumption made in setting kNodeValueSpace. 
+ COMPILE_ASSERT(sizeof(base_fields) >= 2 * sizeof(void*), + node_space_assumption_incorrect); +}; + +//// +// btree_node methods +template <typename P> +inline void btree_node<P>::insert_value(int i, const value_type &x) { + ceph_assert(i <= count()); + value_init(count(), x); + for (int j = count(); j > i; --j) { + value_swap(j, this, j - 1); + } + set_count(count() + 1); + + if (!leaf()) { + ++i; + for (int j = count(); j > i; --j) { + *mutable_child(j) = child(j - 1); + child(j)->set_position(j); + } + *mutable_child(i) = NULL; + } +} + +template <typename P> +inline void btree_node<P>::remove_value(int i) { + if (!leaf()) { + ceph_assert(child(i + 1)->count() == 0); + for (int j = i + 1; j < count(); ++j) { + *mutable_child(j) = child(j + 1); + child(j)->set_position(j); + } + *mutable_child(count()) = NULL; + } + + set_count(count() - 1); + for (; i < count(); ++i) { + value_swap(i, this, i + 1); + } + value_destroy(i); +} + +template <typename P> +void btree_node<P>::rebalance_right_to_left(btree_node *src, int to_move) { + ceph_assert(parent() == src->parent()); + ceph_assert(position() + 1 == src->position()); + ceph_assert(src->count() >= count()); + ceph_assert(to_move >= 1); + ceph_assert(to_move <= src->count()); + + // Make room in the left node for the new values. + for (int i = 0; i < to_move; ++i) { + value_init(i + count()); + } + + // Move the delimiting value to the left node and the new delimiting value + // from the right node. + value_swap(count(), parent(), position()); + parent()->value_swap(position(), src, to_move - 1); + + // Move the values from the right to the left node. + for (int i = 1; i < to_move; ++i) { + value_swap(count() + i, src, i - 1); + } + // Shift the values in the right node to their correct position. 
+ for (int i = to_move; i < src->count(); ++i) { + src->value_swap(i - to_move, src, i); + } + for (int i = 1; i <= to_move; ++i) { + src->value_destroy(src->count() - i); + } + + if (!leaf()) { + // Move the child pointers from the right to the left node. + for (int i = 0; i < to_move; ++i) { + set_child(1 + count() + i, src->child(i)); + } + for (int i = 0; i <= src->count() - to_move; ++i) { + ceph_assert(i + to_move <= src->max_count()); + src->set_child(i, src->child(i + to_move)); + *src->mutable_child(i + to_move) = NULL; + } + } + + // Fixup the counts on the src and dest nodes. + set_count(count() + to_move); + src->set_count(src->count() - to_move); +} + +template <typename P> +void btree_node<P>::rebalance_left_to_right(btree_node *dest, int to_move) { + ceph_assert(parent() == dest->parent()); + ceph_assert(position() + 1 == dest->position()); + ceph_assert(count() >= dest->count()); + ceph_assert(to_move >= 1); + ceph_assert(to_move <= count()); + + // Make room in the right node for the new values. + for (int i = 0; i < to_move; ++i) { + dest->value_init(i + dest->count()); + } + for (int i = dest->count() - 1; i >= 0; --i) { + dest->value_swap(i, dest, i + to_move); + } + + // Move the delimiting value to the right node and the new delimiting value + // from the left node. + dest->value_swap(to_move - 1, parent(), position()); + parent()->value_swap(position(), this, count() - to_move); + value_destroy(count() - to_move); + + // Move the values from the left to the right node. + for (int i = 1; i < to_move; ++i) { + value_swap(count() - to_move + i, dest, i - 1); + value_destroy(count() - to_move + i); + } + + if (!leaf()) { + // Move the child pointers from the left to the right node. 
+ for (int i = dest->count(); i >= 0; --i) { + dest->set_child(i + to_move, dest->child(i)); + *dest->mutable_child(i) = NULL; + } + for (int i = 1; i <= to_move; ++i) { + dest->set_child(i - 1, child(count() - to_move + i)); + *mutable_child(count() - to_move + i) = NULL; + } + } + + // Fixup the counts on the src and dest nodes. + set_count(count() - to_move); + dest->set_count(dest->count() + to_move); +} + +template <typename P> +void btree_node<P>::split(btree_node *dest, int insert_position) { + ceph_assert(dest->count() == 0); + + // We bias the split based on the position being inserted. If we're + // inserting at the beginning of the left node then bias the split to put + // more values on the right node. If we're inserting at the end of the + // right node then bias the split to put more values on the left node. + if (insert_position == 0) { + dest->set_count(count() - 1); + } else if (insert_position == max_count()) { + dest->set_count(0); + } else { + dest->set_count(count() / 2); + } + set_count(count() - dest->count()); + ceph_assert(count() >= 1); + + // Move values from the left sibling to the right sibling. + for (int i = 0; i < dest->count(); ++i) { + dest->value_init(i); + value_swap(count() + i, dest, i); + value_destroy(count() + i); + } + + // The split key is the largest value in the left sibling. + set_count(count() - 1); + parent()->insert_value(position(), value_type()); + value_swap(count(), parent(), position()); + value_destroy(count()); + parent()->set_child(position() + 1, dest); + + if (!leaf()) { + for (int i = 0; i <= dest->count(); ++i) { + ceph_assert(child(count() + i + 1) != NULL); + dest->set_child(i, child(count() + i + 1)); + *mutable_child(count() + i + 1) = NULL; + } + } +} + +template <typename P> +void btree_node<P>::merge(btree_node *src) { + ceph_assert(parent() == src->parent()); + ceph_assert(position() + 1 == src->position()); + + // Move the delimiting value to the left node. 
+ value_init(count()); + value_swap(count(), parent(), position()); + + // Move the values from the right to the left node. + for (int i = 0; i < src->count(); ++i) { + value_init(1 + count() + i); + value_swap(1 + count() + i, src, i); + src->value_destroy(i); + } + + if (!leaf()) { + // Move the child pointers from the right to the left node. + for (int i = 0; i <= src->count(); ++i) { + set_child(1 + count() + i, src->child(i)); + *src->mutable_child(i) = NULL; + } + } + + // Fixup the counts on the src and dest nodes. + set_count(1 + count() + src->count()); + src->set_count(0); + + // Remove the value on the parent node. + parent()->remove_value(position()); +} + +template <typename P> +void btree_node<P>::swap(btree_node *x) { + ceph_assert(leaf() == x->leaf()); + + // Swap the values. + for (int i = count(); i < x->count(); ++i) { + value_init(i); + } + for (int i = x->count(); i < count(); ++i) { + x->value_init(i); + } + int n = std::max(count(), x->count()); + for (int i = 0; i < n; ++i) { + value_swap(i, x, i); + } + for (int i = count(); i < x->count(); ++i) { + x->value_destroy(i); + } + for (int i = x->count(); i < count(); ++i) { + value_destroy(i); + } + + if (!leaf()) { + // Swap the child pointers. + for (int i = 0; i <= n; ++i) { + btree_swap_helper(*mutable_child(i), *x->mutable_child(i)); + } + for (int i = 0; i <= count(); ++i) { + x->child(i)->fields_.parent = x; + } + for (int i = 0; i <= x->count(); ++i) { + child(i)->fields_.parent = this; + } + } + + // Swap the counts. 
+ btree_swap_helper(fields_.count, x->fields_.count); +} + +//// +// btree_iterator methods +template <typename N, typename R, typename P> +void btree_iterator<N, R, P>::increment_slow() { + if (node->leaf()) { + ceph_assert(position >= node->count()); + self_type save(*this); + while (position == node->count() && !node->is_root()) { + ceph_assert(node->parent()->child(node->position()) == node); + position = node->position(); + node = node->parent(); + } + if (position == node->count()) { + *this = save; + } + } else { + ceph_assert(position < node->count()); + node = node->child(position + 1); + while (!node->leaf()) { + node = node->child(0); + } + position = 0; + } +} + +template <typename N, typename R, typename P> +void btree_iterator<N, R, P>::increment_by(int count) { + while (count > 0) { + if (node->leaf()) { + int rest = node->count() - position; + position += std::min(rest, count); + count = count - rest; + if (position < node->count()) { + return; + } + } else { + --count; + } + increment_slow(); + } +} + +template <typename N, typename R, typename P> +void btree_iterator<N, R, P>::decrement_slow() { + if (node->leaf()) { + ceph_assert(position <= -1); + self_type save(*this); + while (position < 0 && !node->is_root()) { + ceph_assert(node->parent()->child(node->position()) == node); + position = node->position() - 1; + node = node->parent(); + } + if (position < 0) { + *this = save; + } + } else { + ceph_assert(position >= 0); + node = node->child(position); + while (!node->leaf()) { + node = node->child(node->count()); + } + position = node->count() - 1; + } +} + +//// +// btree methods +template <typename P> +btree<P>::btree(const key_compare &comp, const allocator_type &alloc) + : key_compare(comp), + root_(alloc, NULL) { +} + +template <typename P> +btree<P>::btree(const self_type &x) + : key_compare(x.key_comp()), + root_(x.internal_allocator(), NULL) { + assign(x); +} + +template <typename P> template <typename ValuePointer> +std::pair<typename 
btree<P>::iterator, bool> +btree<P>::insert_unique(const key_type &key, ValuePointer value) { + if (empty()) { + *mutable_root() = new_leaf_root_node(1); + } + + std::pair<iterator, int> res = internal_locate(key, iterator(root(), 0)); + iterator &iter = res.first; + if (res.second == kExactMatch) { + // The key already exists in the tree, do nothing. + return std::make_pair(internal_last(iter), false); + } else if (!res.second) { + iterator last = internal_last(iter); + if (last.node && !compare_keys(key, last.key())) { + // The key already exists in the tree, do nothing. + return std::make_pair(last, false); + } + } + + return std::make_pair(internal_insert(iter, *value), true); +} + +template <typename P> +inline typename btree<P>::iterator +btree<P>::insert_unique(iterator position, const value_type &v) { + if (!empty()) { + const key_type &key = params_type::key(v); + if (position == end() || compare_keys(key, position.key())) { + iterator prev = position; + if (position == begin() || compare_keys((--prev).key(), key)) { + // prev.key() < key < position.key() + return internal_insert(position, v); + } + } else if (compare_keys(position.key(), key)) { + iterator next = position; + ++next; + if (next == end() || compare_keys(key, next.key())) { + // position.key() < key < next.key() + return internal_insert(next, v); + } + } else { + // position.key() == key + return position; + } + } + return insert_unique(v).first; +} + +template <typename P> template <typename InputIterator> +void btree<P>::insert_unique(InputIterator b, InputIterator e) { + for (; b != e; ++b) { + insert_unique(end(), *b); + } +} + +template <typename P> template <typename ValuePointer> +typename btree<P>::iterator +btree<P>::insert_multi(const key_type &key, ValuePointer value) { + if (empty()) { + *mutable_root() = new_leaf_root_node(1); + } + + iterator iter = internal_upper_bound(key, iterator(root(), 0)); + if (!iter.node) { + iter = end(); + } + return internal_insert(iter, *value); +} 

// Hinted insert for trees that allow duplicate keys. If `v` can legally be
// placed directly before `position` (or directly before the value following
// `position`), it is inserted there without searching the tree; otherwise
// the call falls back to the unhinted insert_multi(v).
template <typename P>
typename btree<P>::iterator
btree<P>::insert_multi(iterator position, const value_type &v) {
  if (!empty()) {
    const key_type &key = params_type::key(v);
    if (position == end() || !compare_keys(position.key(), key)) {
      iterator prev = position;
      if (position == begin() || !compare_keys(key, (--prev).key())) {
        // prev.key() <= key <= position.key()
        return internal_insert(position, v);
      }
    } else {
      iterator next = position;
      ++next;
      if (next == end() || !compare_keys(next.key(), key)) {
        // position.key() < key <= next.key()
        return internal_insert(next, v);
      }
    }
  }
  return insert_multi(v);
}

// Inserts every element of [b, e), using end() as the hint for each one.
template <typename P> template <typename InputIterator>
void btree<P>::insert_multi(InputIterator b, InputIterator e) {
  for (; b != e; ++b) {
    insert_multi(end(), *b);
  }
}

// Replaces the contents of this tree with a copy of `x`: clears the tree,
// copies the comparator and allocator, then inserts the values of `x` in
// iteration order.
template <typename P>
void btree<P>::assign(const self_type &x) {
  clear();

  *mutable_key_comp() = x.key_comp();
  *mutable_internal_allocator() = x.internal_allocator();

  // Assignment can avoid key comparisons because we know the order of the
  // values is the same order we'll store them in.
  for (const_iterator iter = x.begin(); iter != x.end(); ++iter) {
    if (empty()) {
      insert_multi(*iter);
    } else {
      // If the btree is not empty, we can just insert the new value at the end
      // of the tree!
      internal_insert(end(), *iter);
    }
  }
}

// Erases the value `iter` points at and returns an iterator to the value
// that followed it, or end() if the tree became empty. Nodes that drop below
// kMinNodeValues are merged or rebalanced on the way back up to the root.
template <typename P>
typename btree<P>::iterator btree<P>::erase(iterator iter) {
  bool internal_delete = false;
  if (!iter.node->leaf()) {
    // Deletion of a value on an internal node. Swap the key with the largest
    // value of our left child. This is easy, we just decrement iter.
    iterator tmp_iter(iter--);
    ceph_assert(iter.node->leaf());
    ceph_assert(!compare_keys(tmp_iter.key(), iter.key()));
    iter.node->value_swap(iter.position, tmp_iter.node, tmp_iter.position);
    internal_delete = true;
    --*mutable_size();
  } else if (!root()->leaf()) {
    --*mutable_size();
  }

  // Delete the key from the leaf.
  iter.node->remove_value(iter.position);

  // We want to return the next value after the one we just erased. If we
  // erased from an internal node (internal_delete == true), then the next
  // value is ++(++iter). If we erased from a leaf node (internal_delete ==
  // false) then the next value is ++iter. Note that ++iter may point to an
  // internal node and the value in the internal node may move to a leaf node
  // (iter.node) when rebalancing is performed at the leaf level.

  // Merge/rebalance as we walk back up the tree.
  iterator res(iter);
  for (;;) {
    if (iter.node == root()) {
      try_shrink();
      if (empty()) {
        return end();
      }
      break;
    }
    if (iter.node->count() >= kMinNodeValues) {
      break;
    }
    bool merged = try_merge_or_rebalance(&iter);
    // Only leaf-level adjustments can move the position of the result.
    if (iter.node->leaf()) {
      res = iter;
    }
    if (!merged) {
      break;
    }
    iter.node = iter.node->parent();
  }

  // Adjust our return value. If we're pointing at the end of a node, advance
  // the iterator.
  if (res.position == res.node->count()) {
    res.position = res.node->count() - 1;
    ++res;
  }
  // If we erased from an internal node, advance the iterator.
  if (internal_delete) {
    ++res;
  }
  return res;
}

// Erases the values in [begin, end) one at a time and returns how many were
// removed. The count is computed up front with distance(begin, end).
template <typename P>
int btree<P>::erase(iterator begin, iterator end) {
  int count = distance(begin, end);
  for (int i = 0; i < count; i++) {
    begin = erase(begin);
  }
  return count;
}

// Erases the value matching `key` in a unique-key tree. Returns 1 if a value
// was erased and 0 if the key was not found.
template <typename P>
int btree<P>::erase_unique(const key_type &key) {
  iterator iter = internal_find_unique(key, iterator(root(), 0));
  if (!iter.node) {
    // The key doesn't exist in the tree, return nothing done.
    return 0;
  }
  erase(iter);
  return 1;
}

// Erases every value in [lower_bound(key), upper_bound(key)) and returns the
// number of values erased.
template <typename P>
int btree<P>::erase_multi(const key_type &key) {
  iterator begin = internal_lower_bound(key, iterator(root(), 0));
  if (!begin.node) {
    // The key doesn't exist in the tree, return nothing done.
    return 0;
  }
  // Delete all of the keys between begin and upper_bound(key).
  iterator end = internal_end(
      internal_upper_bound(key, iterator(root(), 0)));
  return erase(begin, end);
}

// Releases every node via internal_clear() and resets the root to NULL.
template <typename P>
void btree<P>::clear() {
  if (root() != NULL) {
    internal_clear(root());
  }
  *mutable_root() = NULL;
}

// Exchanges the comparator (held as a base class subobject) and root_ with
// those of `x`.
template <typename P>
void btree<P>::swap(self_type &x) {
  std::swap(static_cast<key_compare&>(*this), static_cast<key_compare&>(x));
  std::swap(root_, x.root_);
}

// Consistency check: asserts that size() matches the number of values found
// by internal_verify() and that leftmost()/rightmost() are the leaf nodes
// reached from either end of the root. For an empty tree it asserts that
// size() is 0 and both pointers are NULL.
template <typename P>
void btree<P>::verify() const {
  if (root() != NULL) {
    ceph_assert(size() == internal_verify(root(), NULL, NULL));
    ceph_assert(leftmost() == (++const_iterator(root(), -1)).node);
    ceph_assert(rightmost() == (--const_iterator(root(), root()->count())).node);
    ceph_assert(leftmost()->leaf());
    ceph_assert(rightmost()->leaf());
  } else {
    ceph_assert(size() == 0);
    ceph_assert(leftmost() == NULL);
    ceph_assert(rightmost() == NULL);
  }
}

// Makes room for one more value in the full node referenced by `*iter`.
// First tries to shift values into a non-full left or right sibling; if that
// is not possible the node is split (recursing into the parent first when the
// parent is full as well). On return `*iter` (node and position) identifies
// the slot where the pending value should be inserted.
template <typename P>
void btree<P>::rebalance_or_split(iterator *iter) {
  node_type *&node = iter->node;
  int &insert_position = iter->position;
  ceph_assert(node->count() == node->max_count());

  // First try to make room on the node by rebalancing.
  node_type *parent = node->parent();
  if (node != root()) {
    if (node->position() > 0) {
      // Try rebalancing with our left sibling.
      node_type *left = parent->child(node->position() - 1);
      if (left->count() < left->max_count()) {
        // We bias rebalancing based on the position being inserted. If we're
        // inserting at the end of the right node then we bias rebalancing to
        // fill up the left node.
        int to_move = (left->max_count() - left->count()) /
            (1 + (insert_position < left->max_count()));
        to_move = std::max(1, to_move);

        if (((insert_position - to_move) >= 0) ||
            ((left->count() + to_move) < left->max_count())) {
          left->rebalance_right_to_left(node, to_move);

          ceph_assert(node->max_count() - node->count() == to_move);
          insert_position = insert_position - to_move;
          if (insert_position < 0) {
            // The insertion slot moved into the left sibling.
            insert_position = insert_position + left->count() + 1;
            node = left;
          }

          ceph_assert(node->count() < node->max_count());
          return;
        }
      }
    }

    if (node->position() < parent->count()) {
      // Try rebalancing with our right sibling.
      node_type *right = parent->child(node->position() + 1);
      if (right->count() < right->max_count()) {
        // We bias rebalancing based on the position being inserted. If we're
        // inserting at the beginning of the left node then we bias rebalancing
        // to fill up the right node.
        int to_move = (right->max_count() - right->count()) /
            (1 + (insert_position > 0));
        to_move = std::max(1, to_move);

        if ((insert_position <= (node->count() - to_move)) ||
            ((right->count() + to_move) < right->max_count())) {
          node->rebalance_left_to_right(right, to_move);

          if (insert_position > node->count()) {
            // The insertion slot moved into the right sibling.
            insert_position = insert_position - node->count() - 1;
            node = right;
          }

          ceph_assert(node->count() < node->max_count());
          return;
        }
      }
    }

    // Rebalancing failed, make sure there is room on the parent node for a new
    // value.
    if (parent->count() == parent->max_count()) {
      iterator parent_iter(node->parent(), node->position());
      rebalance_or_split(&parent_iter);
    }
  } else {
    // Rebalancing not possible because this is the root node.
    if (root()->leaf()) {
      // The root node is currently a leaf node: create a new root node and set
      // the current root node as the child of the new root.
      parent = new_internal_root_node();
      parent->set_child(0, root());
      *mutable_root() = parent;
      ceph_assert(*mutable_rightmost() == parent->child(0));
    } else {
      // The root node is an internal node. We do not want to create a new root
      // node because the root node is special and holds the size of the tree
      // and a pointer to the rightmost node. So we create a new internal node
      // and move all of the items on the current root into the new node.
      parent = new_internal_node(parent);
      parent->set_child(0, parent);
      parent->swap(root());
      node = parent;
    }
  }

  // Split the node.
  node_type *split_node;
  if (node->leaf()) {
    split_node = new_leaf_node(parent);
    node->split(split_node, insert_position);
    if (rightmost() == node) {
      *mutable_rightmost() = split_node;
    }
  } else {
    split_node = new_internal_node(parent);
    node->split(split_node, insert_position);
  }

  if (insert_position > node->count()) {
    // The insertion slot ended up in the newly created right-hand node.
    insert_position = insert_position - node->count() - 1;
    node = split_node;
  }
}

// Merges `right` into `left` and deletes `right`. If `right` was the
// rightmost leaf, the rightmost pointer is redirected to `left`.
template <typename P>
void btree<P>::merge_nodes(node_type *left, node_type *right) {
  left->merge(right);
  if (right->leaf()) {
    if (rightmost() == right) {
      *mutable_rightmost() = left;
    }
    delete_leaf_node(right);
  } else {
    delete_internal_node(right);
  }
}

// Tries to fix up an under-full node after an erase. Returns true if
// iter->node was merged with a sibling, and false if values were only
// rebalanced between siblings or nothing could be done. `*iter` is updated so
// that it keeps referring to the same logical position.
template <typename P>
bool btree<P>::try_merge_or_rebalance(iterator *iter) {
  node_type *parent = iter->node->parent();
  if (iter->node->position() > 0) {
    // Try merging with our left sibling.
    node_type *left = parent->child(iter->node->position() - 1);
    if ((1 + left->count() + iter->node->count()) <= left->max_count()) {
      iter->position += 1 + left->count();
      merge_nodes(left, iter->node);
      iter->node = left;
      return true;
    }
  }
  if (iter->node->position() < parent->count()) {
    // Try merging with our right sibling.
    node_type *right = parent->child(iter->node->position() + 1);
    if ((1 + iter->node->count() + right->count()) <= right->max_count()) {
      merge_nodes(iter->node, right);
      return true;
    }
    // Try rebalancing with our right sibling. We don't perform rebalancing if
    // we deleted the first element from iter->node and the node is not
    // empty. This is a small optimization for the common pattern of deleting
    // from the front of the tree.
    if ((right->count() > kMinNodeValues) &&
        ((iter->node->count() == 0) ||
         (iter->position > 0))) {
      int to_move = (right->count() - iter->node->count()) / 2;
      to_move = std::min(to_move, right->count() - 1);
      iter->node->rebalance_right_to_left(right, to_move);
      return false;
    }
  }
  if (iter->node->position() > 0) {
    // Try rebalancing with our left sibling. We don't perform rebalancing if
    // we deleted the last element from iter->node and the node is not
    // empty. This is a small optimization for the common pattern of deleting
    // from the back of the tree.
    node_type *left = parent->child(iter->node->position() - 1);
    if ((left->count() > kMinNodeValues) &&
        ((iter->node->count() == 0) ||
         (iter->position < iter->node->count()))) {
      int to_move = (left->count() - iter->node->count()) / 2;
      to_move = std::min(to_move, left->count() - 1);
      left->rebalance_left_to_right(iter->node, to_move);
      iter->position += to_move;
      return false;
    }
  }
  return false;
}

// Reduces the height of the tree when the root node holds no values. Does
// nothing while the root still contains at least one value.
template <typename P>
void btree<P>::try_shrink() {
  if (root()->count() > 0) {
    return;
  }
  // Deleted the last item on the root node, shrink the height of the tree.
  if (root()->leaf()) {
    ceph_assert(size() == 0);
    delete_leaf_node(root());
    *mutable_root() = NULL;
  } else {
    node_type *child = root()->child(0);
    if (child->leaf()) {
      // The child is a leaf node so simply make it the root node in the tree.
      child->make_root();
      delete_internal_root_node();
      *mutable_root() = child;
    } else {
      // The child is an internal node. We want to keep the existing root node
      // so we move all of the values from the child node into the existing
      // (empty) root node.
      child->swap(root());
      delete_internal_node(child);
    }
  }
}

// While `iter` sits one past the last value of its node, moves it up to the
// corresponding position in the parent. The result has node == NULL when the
// walk steps onto a parent that is a leaf.
// NOTE(review): presumably the root's parent link refers to a leaf node, which
// would make that check the "walked past the root" test -- confirm against
// btree_node.
template <typename P> template <typename IterType>
inline IterType btree<P>::internal_last(IterType iter) {
  while (iter.node && iter.position == iter.node->count()) {
    iter.position = iter.node->position();
    iter.node = iter.node->parent();
    if (iter.node->leaf()) {
      iter.node = NULL;
    }
  }
  return iter;
}

// Inserts `v` immediately before `iter` and returns an iterator to the newly
// inserted value. Grows a small root leaf, or rebalances/splits a full node,
// when the target leaf has no free slot.
template <typename P>
inline typename btree<P>::iterator
btree<P>::internal_insert(iterator iter, const value_type &v) {
  if (!iter.node->leaf()) {
    // We can't insert on an internal node. Instead, we'll insert after the
    // previous value which is guaranteed to be on a leaf node.
    --iter;
    ++iter.position;
  }
  if (iter.node->count() == iter.node->max_count()) {
    // Make room in the leaf for the new item.
    if (iter.node->max_count() < kNodeValues) {
      // Insertion into the root where the root is smaller than the full node
      // size. Simply grow the size of the root node.
      ceph_assert(iter.node == root());
      iter.node = new_leaf_root_node(
          std::min<int>(kNodeValues, 2 * iter.node->max_count()));
      iter.node->swap(root());
      delete_leaf_node(root());
      *mutable_root() = iter.node;
    } else {
      rebalance_or_split(&iter);
      ++*mutable_size();
    }
  } else if (!root()->leaf()) {
    ++*mutable_size();
  }
  iter.node->insert_value(iter.position, v);
  return iter;
}

// Locates `key` starting from `iter` by forwarding to
// internal_locate_type::dispatch.
template <typename P> template <typename IterType>
inline std::pair<IterType, int> btree<P>::internal_locate(
    const key_type &key, IterType iter) const {
  return internal_locate_type::dispatch(key, *this, iter);
}

// Descends to a leaf using lower_bound() at every level. The second member of
// the result is always 0, so callers must check for an exact match
// themselves.
template <typename P> template <typename IterType>
inline std::pair<IterType, int> btree<P>::internal_locate_plain_compare(
    const key_type &key, IterType iter) const {
  for (;;) {
    iter.position = iter.node->lower_bound(key, key_comp());
    if (iter.node->leaf()) {
      break;
    }
    iter.node = iter.node->child(iter.position);
  }
  return std::make_pair(iter, 0);
}

// Descends towards a leaf using a lower_bound() whose result carries a match
// flag. Returns (iter, kExactMatch) as soon as a node reports an exact match,
// and (iter, -kExactMatch) if the leaf level is reached without one.
template <typename P> template <typename IterType>
inline std::pair<IterType, int> btree<P>::internal_locate_compare_to(
    const key_type &key, IterType iter) const {
  for (;;) {
    int res = iter.node->lower_bound(key, key_comp());
    iter.position = res & kMatchMask;
    if (res & kExactMatch) {
      return std::make_pair(iter, static_cast<int>(kExactMatch));
    }
    if (iter.node->leaf()) {
      break;
    }
    iter.node = iter.node->child(iter.position);
  }
  return std::make_pair(iter, -kExactMatch);
}

// Descends to a leaf using the per-node lower_bound() and normalizes the
// result with internal_last(). A NULL-node `iter` is returned unchanged.
template <typename P> template <typename IterType>
IterType btree<P>::internal_lower_bound(
    const key_type &key, IterType iter) const {
  if (iter.node) {
    for (;;) {
      iter.position =
          iter.node->lower_bound(key, key_comp()) & kMatchMask;
      if (iter.node->leaf()) {
        break;
      }
      iter.node = iter.node->child(iter.position);
    }
    iter = internal_last(iter);
  }
  return iter;
}

// Same descent as internal_lower_bound(), but using the per-node
// upper_bound().
template <typename P> template <typename IterType>
IterType btree<P>::internal_upper_bound(
    const key_type &key, IterType iter) const {
  if (iter.node) {
    for (;;) {
      iter.position = iter.node->upper_bound(key, key_comp());
      if (iter.node->leaf()) {
        break;
      }
      iter.node = iter.node->child(iter.position);
    }
    iter = internal_last(iter);
  }
  return iter;
}

// Finds `key` in a unique-key tree. Returns an iterator to the matching value
// or IterType(NULL, 0) when the key is absent.
template <typename P> template <typename IterType>
IterType btree<P>::internal_find_unique(
    const key_type &key, IterType iter) const {
  if (iter.node) {
    std::pair<IterType, int> res = internal_locate(key, iter);
    if (res.second == kExactMatch) {
      return res.first;
    }
    if (!res.second) {
      // The locate step could not tell whether the match is exact: check it
      // with one extra comparison.
      iter = internal_last(res.first);
      if (iter.node && !compare_keys(key, iter.key())) {
        return iter;
      }
    }
  }
  return IterType(NULL, 0);
}

// Finds the first value matching `key` in a multi-key tree. Returns
// IterType(NULL, 0) when the key is absent.
template <typename P> template <typename IterType>
IterType btree<P>::internal_find_multi(
    const key_type &key, IterType iter) const {
  if (iter.node) {
    iter = internal_lower_bound(key, iter);
    if (iter.node) {
      iter = internal_last(iter);
      if (iter.node && !compare_keys(key, iter.key())) {
        return iter;
      }
    }
  }
  return IterType(NULL, 0);
}

// Recursively deletes `node` and, for internal nodes, all of its children
// first. The root is released with delete_internal_root_node().
template <typename P>
void btree<P>::internal_clear(node_type *node) {
  if (!node->leaf()) {
    for (int i = 0; i <= node->count(); ++i) {
      internal_clear(node->child(i));
    }
    if (node == root()) {
      delete_internal_root_node();
    } else {
      delete_internal_node(node);
    }
  } else {
    delete_leaf_node(node);
  }
}

// Writes the keys of the subtree rooted at `node` to `os` in order, one per
// line, each indented by `level` and followed by "[level]".
template <typename P>
void btree<P>::internal_dump(
    std::ostream &os, const node_type *node, int level) const {
  for (int i = 0; i < node->count(); ++i) {
    if (!node->leaf()) {
      internal_dump(os, node->child(i), level + 1);
    }
    for (int j = 0; j < level; ++j) {
      os << " ";
    }
    os << node->key(i) << " [" << level << "]\n";
  }
  if (!node->leaf()) {
    internal_dump(os, node->child(node->count()), level + 1);
  }
}

// Asserts the structural invariants of the subtree rooted at `node`: node
// counts are within bounds, keys are ordered and lie within [*lo, *hi] (a
// NULL bound is not checked), and child parent/position links are correct.
// Returns the number of values in the subtree.
template <typename P>
int btree<P>::internal_verify(
    const node_type *node, const key_type *lo, const key_type *hi) const {
  ceph_assert(node->count() > 0);
  ceph_assert(node->count() <= node->max_count());
  if (lo) {
    ceph_assert(!compare_keys(node->key(0), *lo));
  }
  if (hi) {
    ceph_assert(!compare_keys(*hi, node->key(node->count() - 1)));
  }
  for (int i = 1; i < node->count(); ++i) {
    ceph_assert(!compare_keys(node->key(i), node->key(i - 1)));
  }
  int count = node->count();
  if (!node->leaf()) {
    for (int i = 0; i <= node->count(); ++i) {
      ceph_assert(node->child(i) != NULL);
      ceph_assert(node->child(i)->parent() == node);
      ceph_assert(node->child(i)->position() == i);
      count += internal_verify(
          node->child(i),
          (i == 0) ? lo : &node->key(i - 1),
          (i == node->count()) ? hi : &node->key(i));
    }
  }
  return count;
}

} // namespace btree

#endif // UTIL_BTREE_BTREE_H__
diff --git a/src/include/cpp-btree/btree_container.h b/src/include/cpp-btree/btree_container.h
new file mode 100644
index 00000000..fb617abe
--- /dev/null
+++ b/src/include/cpp-btree/btree_container.h
@@ -0,0 +1,349 @@
// Copyright 2013 Google Inc. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#ifndef UTIL_BTREE_BTREE_CONTAINER_H__
#define UTIL_BTREE_BTREE_CONTAINER_H__

#include <iosfwd>
#include <utility>

#include "btree.h"

namespace btree {

// A common base class for btree_set, btree_map, btree_multiset and
// btree_multimap.
// Thin forwarding layer: every member below delegates to the wrapped Tree.
template <typename Tree>
class btree_container {
  typedef btree_container<Tree> self_type;

 public:
  // Member types re-exported from the underlying tree.
  typedef typename Tree::params_type params_type;
  typedef typename Tree::key_type key_type;
  typedef typename Tree::value_type value_type;
  typedef typename Tree::key_compare key_compare;
  typedef typename Tree::allocator_type allocator_type;
  typedef typename Tree::pointer pointer;
  typedef typename Tree::const_pointer const_pointer;
  typedef typename Tree::reference reference;
  typedef typename Tree::const_reference const_reference;
  typedef typename Tree::size_type size_type;
  typedef typename Tree::difference_type difference_type;
  typedef typename Tree::iterator iterator;
  typedef typename Tree::const_iterator const_iterator;
  typedef typename Tree::reverse_iterator reverse_iterator;
  typedef typename Tree::const_reverse_iterator const_reverse_iterator;

 public:
  // Builds an empty container that uses the given comparator and allocator.
  btree_container(const key_compare &comp, const allocator_type &alloc)
      : tree_(comp, alloc) {}

  // Copies the tree held by `x`.
  btree_container(const self_type &x) : tree_(x.tree_) {}

  // Forward and reverse iteration.
  iterator begin() {
    return tree_.begin();
  }
  const_iterator begin() const {
    return tree_.begin();
  }
  iterator end() {
    return tree_.end();
  }
  const_iterator end() const {
    return tree_.end();
  }
  reverse_iterator rbegin() {
    return tree_.rbegin();
  }
  const_reverse_iterator rbegin() const {
    return tree_.rbegin();
  }
  reverse_iterator rend() {
    return tree_.rend();
  }
  const_reverse_iterator rend() const {
    return tree_.rend();
  }

  // Ordered lookups.
  iterator lower_bound(const key_type &key) { return tree_.lower_bound(key); }
  const_iterator lower_bound(const key_type &key) const {
    return tree_.lower_bound(key);
  }
  iterator upper_bound(const key_type &key) { return tree_.upper_bound(key); }
  const_iterator upper_bound(const key_type &key) const {
    return tree_.upper_bound(key);
  }
  std::pair<iterator,iterator> equal_range(const key_type &key) {
    return tree_.equal_range(key);
  }
  std::pair<const_iterator,const_iterator> equal_range(const key_type &key) const {
    return tree_.equal_range(key);
  }

  // Whole-container utilities.
  void clear() { tree_.clear(); }
  void swap(self_type &x) { tree_.swap(x.tree_); }
  void dump(std::ostream &os) const { tree_.dump(os); }
  void verify() const { tree_.verify(); }

  // Size and node statistics reported by the tree.
  size_type size() const {
    return tree_.size();
  }
  size_type max_size() const {
    return tree_.max_size();
  }
  bool empty() const {
    return tree_.empty();
  }
  size_type height() const {
    return tree_.height();
  }
  size_type internal_nodes() const {
    return tree_.internal_nodes();
  }
  size_type leaf_nodes() const {
    return tree_.leaf_nodes();
  }
  size_type nodes() const {
    return tree_.nodes();
  }
  size_type bytes_used() const {
    return tree_.bytes_used();
  }
  static double average_bytes_per_value() {
    return Tree::average_bytes_per_value();
  }
  double fullness() const {
    return tree_.fullness();
  }
  double overhead() const {
    return tree_.overhead();
  }

  // Two containers are equal when they hold the same number of values and
  // the values compare equal pairwise in iteration order.
  bool operator==(const self_type& x) const {
    if (size() != x.size()) {
      return false;
    }
    const_iterator lhs = begin();
    const_iterator rhs = x.begin();
    while (lhs != end()) {
      if (*lhs != *rhs) {
        return false;
      }
      ++lhs;
      ++rhs;
    }
    return true;
  }

  bool operator!=(const self_type& other) const {
    return !(*this == other);
  }


 protected:
  Tree tree_;
};

// Streams a container by writing its dump() output to `os`.
template <typename T>
inline std::ostream& operator<<(std::ostream &os, const btree_container<T> &b) {
  b.dump(os);
  return os;
}

// A common base class for btree_set and safe_btree_set.
// Adds the unique-key flavour of find/count/insert/erase on top of
// btree_container.
template <typename Tree>
class btree_unique_container : public btree_container<Tree> {
  typedef btree_unique_container<Tree> self_type;
  typedef btree_container<Tree> super_type;

 public:
  typedef typename Tree::key_type key_type;
  typedef typename Tree::value_type value_type;
  typedef typename Tree::size_type size_type;
  typedef typename Tree::key_compare key_compare;
  typedef typename Tree::allocator_type allocator_type;
  typedef typename Tree::iterator iterator;
  typedef typename Tree::const_iterator const_iterator;

 public:
  // Builds an empty container.
  btree_unique_container(const key_compare &comp = key_compare(),
                         const allocator_type &alloc = allocator_type())
      : super_type(comp, alloc) {}

  // Copies `x`.
  btree_unique_container(const self_type &x) : super_type(x) {}

  // Builds a container holding the values of [b, e).
  template <class InputIterator>
  btree_unique_container(InputIterator b, InputIterator e,
                         const key_compare &comp = key_compare(),
                         const allocator_type &alloc = allocator_type())
      : super_type(comp, alloc) {
    this->insert(b, e);
  }

  // Key lookups.
  iterator find(const key_type &key) {
    return this->tree_.find_unique(key);
  }
  const_iterator find(const key_type &key) const {
    return this->tree_.find_unique(key);
  }
  size_type count(const key_type &key) const {
    return this->tree_.count_unique(key);
  }

  // Insertion: single value, single value with a position hint, and range.
  std::pair<iterator,bool> insert(const value_type &x) {
    return this->tree_.insert_unique(x);
  }
  iterator insert(iterator position, const value_type &x) {
    return this->tree_.insert_unique(position, x);
  }
  template <typename InputIterator>
  void insert(InputIterator b, InputIterator e) {
    this->tree_.insert_unique(b, e);
  }

  // Erases the value matching `key`; the result comes from erase_unique().
  int erase(const key_type &key) {
    return this->tree_.erase_unique(key);
  }
  // Erases the value `iter` refers to. `iter` must be valid (i.e. not equal
  // to end()). The returned iterator refers to the value after the erased
  // one, or is end() if there is none.
  iterator erase(const iterator &iter) {
    return this->tree_.erase(iter);
  }
  // Erases every value in [first, last).
  void erase(const iterator &first, const iterator &last) {
    this->tree_.erase(first, last);
  }
};

// A common base class for btree_map and safe_btree_map.
// Extends the unique container with operator[].
template <typename Tree>
class btree_map_container : public btree_unique_container<Tree> {
  typedef btree_map_container<Tree> self_type;
  typedef btree_unique_container<Tree> super_type;

 public:
  typedef typename Tree::key_type key_type;
  typedef typename Tree::data_type data_type;
  typedef typename Tree::value_type value_type;
  typedef typename Tree::mapped_type mapped_type;
  typedef typename Tree::key_compare key_compare;
  typedef typename Tree::allocator_type allocator_type;

 private:
  // Pointer-like helper whose value is only materialized on dereference.
  // operator[] passes it to insert_unique() so that no empty data_type is
  // constructed when the key is already present in the map.
  struct generate_value {
    generate_value(const key_type &k) : key(k) {}
    value_type operator*() const {
      return std::make_pair(key, data_type());
    }
    const key_type &key;
  };

 public:
  // Builds an empty map.
  btree_map_container(const key_compare &comp = key_compare(),
                      const allocator_type &alloc = allocator_type())
      : super_type(comp, alloc) {}

  // Copies `x`.
  btree_map_container(const self_type &x) : super_type(x) {}

  // Builds a map holding the values of [b, e).
  template <class InputIterator>
  btree_map_container(InputIterator b, InputIterator e,
                      const key_compare &comp = key_compare(),
                      const allocator_type &alloc = allocator_type())
      : super_type(b, e, comp, alloc) {}

  // Returns the data stored under `key`, inserting a default-constructed
  // data_type first if the key is not present yet.
  data_type& operator[](const key_type &key) {
    return this->tree_.insert_unique(key, generate_value(key)).first->second;
  }
};

// A common base class for btree_multiset and btree_multimap.
+template <typename Tree> +class btree_multi_container : public btree_container<Tree> { + typedef btree_multi_container<Tree> self_type; + typedef btree_container<Tree> super_type; + + public: + typedef typename Tree::key_type key_type; + typedef typename Tree::value_type value_type; + typedef typename Tree::size_type size_type; + typedef typename Tree::key_compare key_compare; + typedef typename Tree::allocator_type allocator_type; + typedef typename Tree::iterator iterator; + typedef typename Tree::const_iterator const_iterator; + + public: + // Default constructor. + btree_multi_container(const key_compare &comp = key_compare(), + const allocator_type &alloc = allocator_type()) + : super_type(comp, alloc) { + } + + // Copy constructor. + btree_multi_container(const self_type &x) + : super_type(x) { + } + + // Range constructor. + template <class InputIterator> + btree_multi_container(InputIterator b, InputIterator e, + const key_compare &comp = key_compare(), + const allocator_type &alloc = allocator_type()) + : super_type(comp, alloc) { + insert(b, e); + } + + // Lookup routines. + iterator find(const key_type &key) { + return this->tree_.find_multi(key); + } + const_iterator find(const key_type &key) const { + return this->tree_.find_multi(key); + } + size_type count(const key_type &key) const { + return this->tree_.count_multi(key); + } + + // Insertion routines. + iterator insert(const value_type &x) { + return this->tree_.insert_multi(x); + } + iterator insert(iterator position, const value_type &x) { + return this->tree_.insert_multi(position, x); + } + template <typename InputIterator> + void insert(InputIterator b, InputIterator e) { + this->tree_.insert_multi(b, e); + } + + // Deletion routines. + int erase(const key_type &key) { + return this->tree_.erase_multi(key); + } + // Erase the specified iterator from the btree. The iterator must be valid + // (i.e. not equal to end()). 
Return an iterator pointing to the node after + // the one that was erased (or end() if none exists). + iterator erase(const iterator &iter) { + return this->tree_.erase(iter); + } + void erase(const iterator &first, const iterator &last) { + this->tree_.erase(first, last); + } +}; + +} // namespace btree + +#endif // UTIL_BTREE_BTREE_CONTAINER_H__ diff --git a/src/include/cpp-btree/btree_map.h b/src/include/cpp-btree/btree_map.h new file mode 100644 index 00000000..b83489f0 --- /dev/null +++ b/src/include/cpp-btree/btree_map.h @@ -0,0 +1,130 @@ +// Copyright 2013 Google Inc. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +// A btree_map<> implements the STL unique sorted associative container +// interface and the pair associative container interface (a.k.a map<>) using a +// btree. A btree_multimap<> implements the STL multiple sorted associative +// container interface and the pair associtive container interface (a.k.a +// multimap<>) using a btree. See btree.h for details of the btree +// implementation and caveats. + +#ifndef UTIL_BTREE_BTREE_MAP_H__ +#define UTIL_BTREE_BTREE_MAP_H__ + +#include <algorithm> +#include <functional> +#include <memory> +#include <string> +#include <utility> + +#include "btree.h" +#include "btree_container.h" + +namespace btree { + +// The btree_map class is needed mainly for its constructors. 
+template <typename Key, typename Value, + typename Compare = std::less<Key>, + typename Alloc = std::allocator<std::pair<const Key, Value> >, + int TargetNodeSize = 256> +class btree_map : public btree_map_container< + btree<btree_map_params<Key, Value, Compare, Alloc, TargetNodeSize> > > { + + typedef btree_map<Key, Value, Compare, Alloc, TargetNodeSize> self_type; + typedef btree_map_params< + Key, Value, Compare, Alloc, TargetNodeSize> params_type; + typedef btree<params_type> btree_type; + typedef btree_map_container<btree_type> super_type; + + public: + typedef typename btree_type::key_compare key_compare; + typedef typename btree_type::allocator_type allocator_type; + + public: + // Default constructor. + btree_map(const key_compare &comp = key_compare(), + const allocator_type &alloc = allocator_type()) + : super_type(comp, alloc) { + } + + // Copy constructor. + btree_map(const self_type &x) + : super_type(x) { + } + + // Range constructor. + template <class InputIterator> + btree_map(InputIterator b, InputIterator e, + const key_compare &comp = key_compare(), + const allocator_type &alloc = allocator_type()) + : super_type(b, e, comp, alloc) { + } +}; + +template <typename K, typename V, typename C, typename A, int N> +inline void swap(btree_map<K, V, C, A, N> &x, + btree_map<K, V, C, A, N> &y) { + x.swap(y); +} + +// The btree_multimap class is needed mainly for its constructors. 
+template <typename Key, typename Value, + typename Compare = std::less<Key>, + typename Alloc = std::allocator<std::pair<const Key, Value> >, + int TargetNodeSize = 256> +class btree_multimap : public btree_multi_container< + btree<btree_map_params<Key, Value, Compare, Alloc, TargetNodeSize> > > { + + typedef btree_multimap<Key, Value, Compare, Alloc, TargetNodeSize> self_type; + typedef btree_map_params< + Key, Value, Compare, Alloc, TargetNodeSize> params_type; + typedef btree<params_type> btree_type; + typedef btree_multi_container<btree_type> super_type; + + public: + typedef typename btree_type::key_compare key_compare; + typedef typename btree_type::allocator_type allocator_type; + typedef typename btree_type::data_type data_type; + typedef typename btree_type::mapped_type mapped_type; + + public: + // Default constructor. + btree_multimap(const key_compare &comp = key_compare(), + const allocator_type &alloc = allocator_type()) + : super_type(comp, alloc) { + } + + // Copy constructor. + btree_multimap(const self_type &x) + : super_type(x) { + } + + // Range constructor. + template <class InputIterator> + btree_multimap(InputIterator b, InputIterator e, + const key_compare &comp = key_compare(), + const allocator_type &alloc = allocator_type()) + : super_type(b, e, comp, alloc) { + } +}; + +template <typename K, typename V, typename C, typename A, int N> +inline void swap(btree_multimap<K, V, C, A, N> &x, + btree_multimap<K, V, C, A, N> &y) { + x.swap(y); +} + +} // namespace btree + +#endif // UTIL_BTREE_BTREE_MAP_H__ diff --git a/src/include/cpp-btree/btree_set.h b/src/include/cpp-btree/btree_set.h new file mode 100644 index 00000000..f9b2e75d --- /dev/null +++ b/src/include/cpp-btree/btree_set.h @@ -0,0 +1,121 @@ +// Copyright 2013 Google Inc. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +// A btree_set<> implements the STL unique sorted associative container +// interface (a.k.a set<>) using a btree. A btree_multiset<> implements the STL +// multiple sorted associative container interface (a.k.a multiset<>) using a +// btree. See btree.h for details of the btree implementation and caveats. + +#ifndef UTIL_BTREE_BTREE_SET_H__ +#define UTIL_BTREE_BTREE_SET_H__ + +#include <functional> +#include <memory> +#include <string> + +#include "btree.h" +#include "btree_container.h" + +namespace btree { + +// The btree_set class is needed mainly for its constructors. +template <typename Key, + typename Compare = std::less<Key>, + typename Alloc = std::allocator<Key>, + int TargetNodeSize = 256> +class btree_set : public btree_unique_container< + btree<btree_set_params<Key, Compare, Alloc, TargetNodeSize> > > { + + typedef btree_set<Key, Compare, Alloc, TargetNodeSize> self_type; + typedef btree_set_params<Key, Compare, Alloc, TargetNodeSize> params_type; + typedef btree<params_type> btree_type; + typedef btree_unique_container<btree_type> super_type; + + public: + typedef typename btree_type::key_compare key_compare; + typedef typename btree_type::allocator_type allocator_type; + + public: + // Default constructor. + btree_set(const key_compare &comp = key_compare(), + const allocator_type &alloc = allocator_type()) + : super_type(comp, alloc) { + } + + // Copy constructor. + btree_set(const self_type &x) + : super_type(x) { + } + + // Range constructor. 
+ template <class InputIterator> + btree_set(InputIterator b, InputIterator e, + const key_compare &comp = key_compare(), + const allocator_type &alloc = allocator_type()) + : super_type(b, e, comp, alloc) { + } +}; + +template <typename K, typename C, typename A, int N> +inline void swap(btree_set<K, C, A, N> &x, btree_set<K, C, A, N> &y) { + x.swap(y); +} + +// The btree_multiset class is needed mainly for its constructors. +template <typename Key, + typename Compare = std::less<Key>, + typename Alloc = std::allocator<Key>, + int TargetNodeSize = 256> +class btree_multiset : public btree_multi_container< + btree<btree_set_params<Key, Compare, Alloc, TargetNodeSize> > > { + + typedef btree_multiset<Key, Compare, Alloc, TargetNodeSize> self_type; + typedef btree_set_params<Key, Compare, Alloc, TargetNodeSize> params_type; + typedef btree<params_type> btree_type; + typedef btree_multi_container<btree_type> super_type; + + public: + typedef typename btree_type::key_compare key_compare; + typedef typename btree_type::allocator_type allocator_type; + + public: + // Default constructor. + btree_multiset(const key_compare &comp = key_compare(), + const allocator_type &alloc = allocator_type()) + : super_type(comp, alloc) { + } + + // Copy constructor. + btree_multiset(const self_type &x) + : super_type(x) { + } + + // Range constructor. 
+ template <class InputIterator> + btree_multiset(InputIterator b, InputIterator e, + const key_compare &comp = key_compare(), + const allocator_type &alloc = allocator_type()) + : super_type(b, e, comp, alloc) { + } +}; + +template <typename K, typename C, typename A, int N> +inline void swap(btree_multiset<K, C, A, N> &x, + btree_multiset<K, C, A, N> &y) { + x.swap(y); +} + +} // namespace btree + +#endif // UTIL_BTREE_BTREE_SET_H__ diff --git a/src/include/crc32c.h b/src/include/crc32c.h new file mode 100644 index 00000000..dd4ede66 --- /dev/null +++ b/src/include/crc32c.h @@ -0,0 +1,57 @@ +#ifndef CEPH_CRC32C_H +#define CEPH_CRC32C_H + +#include <stdint.h> + +#ifdef __cplusplus +extern "C" { +#endif + +typedef uint32_t (*ceph_crc32c_func_t)(uint32_t crc, unsigned char const *data, unsigned length); + +/* + * this is a static global with the chosen crc32c implementation for + * the given architecture. + */ +extern ceph_crc32c_func_t ceph_crc32c_func; + +extern ceph_crc32c_func_t ceph_choose_crc32(void); + +/** + * calculate crc32c for data that is entirely 0 (ZERO) + * + * Note: works the same as ceph_crc32c_func for data == nullptr, + * but faster than the optimized assembly on certain architectures. + * This is faster than intel optimized assembly, but not as fast as + * ppc64le optimized assembly. + * + * @param crc initial value + * @param length length of buffer + */ +uint32_t ceph_crc32c_zeros(uint32_t crc, unsigned length); + +/** + * calculate crc32c + * + * Note: if the data pointer is NULL, we calculate a crc value as if + * it were zero-filled. 
+ * + * @param crc initial value + * @param data pointer to data buffer + * @param length length of buffer + */ +static inline uint32_t ceph_crc32c(uint32_t crc, unsigned char const *data, unsigned length) +{ +#ifndef HAVE_POWER8 + if (!data && length > 16) + return ceph_crc32c_zeros(crc, length); +#endif /* HAVE_POWER8 */ + + return ceph_crc32c_func(crc, data, length); +} + +#ifdef __cplusplus +} +#endif + +#endif diff --git a/src/include/demangle.h b/src/include/demangle.h new file mode 100644 index 00000000..9e46d952 --- /dev/null +++ b/src/include/demangle.h @@ -0,0 +1,48 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2016 Allen Samuels <allen.samuels@sandisk.com> + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. + * + */ + +#ifndef CEPH_INCLUDE_DEMANGLE +#define CEPH_INCLUDE_DEMANGLE + +//// Stole this code from http://stackoverflow.com/questions/281818/unmangling-the-result-of-stdtype-infoname +#ifdef __GNUG__ +#include <cstdlib> +#include <memory> +#include <cxxabi.h> + +static std::string ceph_demangle(const char* name) +{ + int status = -4; // some arbitrary value to eliminate the compiler warning + + // enable c++11 by passing the flag -std=c++11 to g++ + std::unique_ptr<char, void(*)(void*)> res { + abi::__cxa_demangle(name, NULL, NULL, &status), + std::free + }; + + return (status == 0) ? 
res.get() : name ; +} + +#else + +// does nothing if not g++ +static std::string demangle(const char* name) +{ + return name; +} + +#endif + + +#endif diff --git a/src/include/denc.h b/src/include/denc.h new file mode 100644 index 00000000..a6a0fcaa --- /dev/null +++ b/src/include/denc.h @@ -0,0 +1,1724 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2016 Allen Samuels <allen.samuels@sandisk.com> + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. + * + */ + +// If you #include "include/encoding.h" you get the old-style *and* +// the new-style definitions. (The old-style needs denc_traits<> in +// order to disable the container helpers when new-style traits are +// present.) + +// You can also just #include "include/denc.h" and get only the +// new-style helpers. The eventual goal is to drop the legacy +// definitions. 
+ +#ifndef _ENC_DEC_H +#define _ENC_DEC_H + +#include <array> +#include <cstring> +#include <map> +#include <optional> +#include <set> +#include <string> +#include <type_traits> +#include <vector> + +#include <boost/container/flat_map.hpp> +#include <boost/container/flat_set.hpp> +#include <boost/intrusive/set.hpp> +#include <boost/optional.hpp> + +#include "include/ceph_assert.h" // boost clobbers this +#include "include/intarith.h" +#include "include/int_types.h" + +#include "buffer.h" +#include "byteorder.h" + +#include "common/convenience.h" + +template<typename T, typename=void> +struct denc_traits { + static constexpr bool supported = false; + static constexpr bool featured = false; + static constexpr bool bounded = false; + static constexpr bool need_contiguous = true; +}; + +template<typename T> +inline constexpr bool denc_supported = denc_traits<T>::supported; + + +// hack for debug only; FIXME +//#include <iostream> +//using std::cout; + +// Define this to compile in a dump of all encoded objects to disk to +// populate ceph-object-corpus. Note that there is an almost +// identical implementation in encoding.h, but you only need to define +// ENCODE_DUMP_PATH here. +// +// See src/test/encoding/generate-corpus-objects.sh. +// +//#define ENCODE_DUMP_PATH /tmp/something + +#ifdef ENCODE_DUMP_PATH +# include <cstdio> +# include <sys/types.h> +# include <sys/stat.h> +# include <fcntl.h> +# define ENCODE_STR(x) #x +# define ENCODE_STRINGIFY(x) ENCODE_STR(x) +# define DENC_DUMP_PRE(Type) \ + char *__denc_dump_pre = p.get_pos(); + // this hackery with bits below is just to get a semi-reasonable + // distribution across time. it is somewhat exponential but not + // quite. 
// DENC_DUMP_POST(Type): write the bytes encoded since DENC_DUMP_PRE to
// ENCODE_DUMP_PATH/<Type>__<pid>.<counter in hex>.
//  - A per-expansion static counter is bumped on every invocation; the
//    clear-lowest-set-bit loop counts its set bits and the dump is skipped
//    when more than two bits are set (the sampling described above).
//  - The counter is incremented a second time when the file name is built.
//  - The result of ::write() is deliberately ignored (best effort).
// Comments cannot go inside the body because of the line continuations.
# define DENC_DUMP_POST(Type)			\
  do {									\
    static int i = 0;							\
    i++;								\
    int bits = 0;							\
    for (unsigned t = i; t; bits++)					\
      t &= t - 1;							\
    if (bits > 2)							\
      break;								\
    char fn[PATH_MAX];							\
    snprintf(fn, sizeof(fn),						\
	     ENCODE_STRINGIFY(ENCODE_DUMP_PATH) "/%s__%d.%x", #Type,		\
	     getpid(), i++);						\
    int fd = ::open(fn, O_WRONLY|O_TRUNC|O_CREAT|O_CLOEXEC, 0644);		\
    if (fd >= 0) {							\
      size_t len = p.get_pos() - __denc_dump_pre;			\
      int r = ::write(fd, __denc_dump_pre, len);			\
      (void)r;								\
      ::close(fd);							\
    }									\
  } while (0)
#else
# define DENC_DUMP_PRE(Type)
# define DENC_DUMP_POST(Type)
#endif


/*

  top level functions look like so
  ================================

    inline void denc(const T& o, size_t& p, uint64_t features=0);
    inline void denc(const T& o, buffer::list::contiguous_appender& p,
                     uint64_t features=0);
    inline void denc(T& o, buffer::ptr::const_iterator& p, uint64_t features=0);

  or (for featured objects)

    inline void denc(const T& o, size_t& p, uint64_t features);
    inline void denc(const T& o, buffer::list::contiguous_appender& p,
                     uint64_t features);
    inline void denc(T& o, buffer::ptr::const_iterator& p, uint64_t features);

  - These are symmetrical, so that they can be used from the magic DENC
  method of writing the bound_encode/encode/decode methods all in one go;
  they differ only in the type of p.

  - These are automatically fabricated via a template that calls into
  the denc_traits<> methods (see below), provided denc_traits<T>::supported
  is defined and true.  They never need to be written explicitly.


  static denc_traits<> definitions look like so
  =============================================

    template<>
    struct denc_traits<T> {
      static constexpr bool supported = true;
      static constexpr bool bounded = false;
      static constexpr bool featured = false;
      static constexpr bool need_contiguous = true;
      static void bound_encode(const T &o, size_t& p, uint64_t f=0);
      static void encode(const T &o, buffer::list::contiguous_appender& p,
		         uint64_t f=0);
      static void decode(T& o, buffer::ptr::const_iterator &p, uint64_t f=0);
    };

  or (for featured objects)

    template<>
    struct denc_traits<T> {
      static constexpr bool supported = true;
      static constexpr bool bounded = false;
      static constexpr bool featured = true;
      static constexpr bool need_contiguous = true;
      static void bound_encode(const T &o, size_t& p, uint64_t f);
      static void encode(const T &o, buffer::list::contiguous_appender& p,
		         uint64_t f);
      static void decode(T& o, buffer::ptr::const_iterator &p, uint64_t f=0);
    };

  - denc_traits<T> is normally declared via the WRITE_CLASS_DENC(type) macro,
  which is used in place of the old-style WRITE_CLASS_ENCODER(type) macro.
  There are _FEATURED and _BOUNDED variants.  The class traits simply call
  into class methods of the same name (see below).

  - denc_traits<T> can also be written explicitly for some type to indicate
  how it should be encoded.  This is the "source of truth" for how a type
  is encoded.

  - denc_traits<T> are declared for the base integer types, string, bufferptr,
  and bufferlist base types.

  - denc_traits<std::foo<T>>-like traits are declared for standard container
  types.
+ + + class methods look like so + ========================== + + void bound_encode(size_t& p) const; + void encode(buffer::list::contiguous_appender& p) const; + void decode(buffer::ptr::const_iterator &p); + + or (for featured objects) + + void bound_encode(size_t& p, uint64_t f) const; + void encode(buffer::list::contiguous_appender& p, uint64_t f) const; + void decode(buffer::ptr::const_iterator &p); + + - These are normally invoked by the denc_traits<> methods that are + declared via WRITE_CLASS_DENC, although you can also invoke them explicitly + in your code. + + - These methods are optimised for contiguous buffer, but denc() will try + rebuild a contigous one if the decoded bufferlist is segmented. If you are + concerned about the cost, you might want to define yet another method: + + void decode(buffer::list::iterator &p); + + - These can be defined either explicitly (as above), or can be "magically" + defined all in one go using the DENC macro and DENC_{START,FINISH} helpers + (which work like the legacy {ENCODE,DECODE}_{START,FINISH} macros): + + class foo_t { + ... + DENC(foo_t, v, p) { + DENC_START(1, 1, p); + denc(v.foo, p); + denc(v.bar, p); + denc(v.baz, p); + DENC_FINISH(p); + } + ... + }; + WRITE_CLASS_DENC(foo_t) + + */ + +// --------------------------------------------------------------------- +// raw types +namespace _denc { +template<typename T, typename... Us> +inline constexpr bool is_any_of = (... 
|| std::is_same_v<T, Us>); + +template<typename T, typename=void> struct underlying_type { + using type = T; +}; +template<typename T> +struct underlying_type<T, std::enable_if_t<std::is_enum_v<T>>> { + using type = std::underlying_type_t<T>; +}; +template<typename T> +using underlying_type_t = typename underlying_type<T>::type; +} + +template<class It> +struct is_const_iterator + : std::conditional_t<std::is_const_v<std::remove_pointer_t<typename It::pointer>>, + std::true_type, + std::false_type> +{}; +template<> +struct is_const_iterator<size_t> : std::false_type {}; +template<> +struct is_const_iterator<buffer::list::contiguous_appender> : std::false_type { + // appender is used for *changing* the buffer +}; +template<class It> +inline constexpr bool is_const_iterator_v = is_const_iterator<It>::value; + +template<typename T, class It> +std::enable_if_t<is_const_iterator_v<It>, const T&> +get_pos_add(It& i) { + return *reinterpret_cast<const T*>(i.get_pos_add(sizeof(T))); +} + +template<typename T, class It> +std::enable_if_t<!is_const_iterator_v<It>, T&> +get_pos_add(It& i) { + return *reinterpret_cast<T*>(i.get_pos_add(sizeof(T))); +} + +template<typename T> +struct denc_traits< + T, + std::enable_if_t< + _denc::is_any_of<_denc::underlying_type_t<T>, + ceph_le64, ceph_le32, ceph_le16, uint8_t +#ifndef _CHAR_IS_SIGNED + , int8_t +#endif + >>> { + static constexpr bool supported = true; + static constexpr bool featured = false; + static constexpr bool bounded = true; + static constexpr bool need_contiguous = false; + static void bound_encode(const T &o, size_t& p, uint64_t f=0) { + p += sizeof(T); + } + template<class It> + static std::enable_if_t<!is_const_iterator_v<It>> + encode(const T &o, It& p, uint64_t f=0) { + get_pos_add<T>(p) = o; + } + template<class It> + static std::enable_if_t<is_const_iterator_v<It>> + decode(T& o, It& p, uint64_t f=0) { + o = get_pos_add<T>(p); + } + static void decode(T& o, buffer::list::const_iterator &p) { + p.copy(sizeof(T), 
reinterpret_cast<char*>(&o)); + } +}; + + +// ----------------------------------------------------------------------- +// integer types + +// itype == internal type +// otype == external type, i.e., the type on the wire + +// NOTE: the overload resolution ensures that the legacy encode/decode methods +// defined for int types is preferred to the ones defined using the specialized +// template, and hence get selected. This machinery prevents these these from +// getting glued into the legacy encode/decode methods; the overhead of setting +// up a contiguous_appender etc is likely to be slower. +namespace _denc { + +template<typename T, typename=void> struct ExtType { + using type = void; +}; + +template<typename T> +struct ExtType<T, std::enable_if_t<std::is_same_v<T, int16_t> || + std::is_same_v<T, uint16_t>>> { + using type = ceph_le16; +}; + +template<typename T> +struct ExtType<T, std::enable_if_t<std::is_same_v<T, int32_t> || + std::is_same_v<T, uint32_t>>> { + using type = ceph_le32; +}; + +template<typename T> +struct ExtType<T, std::enable_if_t<std::is_same_v<T, int64_t> || + std::is_same_v<T, uint64_t>>> { + using type = ceph_le64; +}; + +template<> +struct ExtType<bool> { + using type = uint8_t; +}; +template<typename T> +using ExtType_t = typename ExtType<T>::type; +} // namespace _denc + +template<typename T> +struct denc_traits<T, std::enable_if_t<!std::is_void_v<_denc::ExtType_t<T>>>> +{ + static constexpr bool supported = true; + static constexpr bool featured = false; + static constexpr bool bounded = true; + static constexpr bool need_contiguous = false; + using etype = _denc::ExtType_t<T>; + static void bound_encode(const T &o, size_t& p, uint64_t f=0) { + p += sizeof(etype); + } + template<class It> + static std::enable_if_t<!is_const_iterator_v<It>> + encode(const T &o, It& p, uint64_t f=0) { + get_pos_add<etype>(p) = o; + } + template<class It> + static std::enable_if_t<is_const_iterator_v<It>> + decode(T& o, It &p, uint64_t f=0) { + o = 
get_pos_add<etype>(p); + } + static void decode(T& o, buffer::list::const_iterator &p) { + etype e; + p.copy(sizeof(etype), reinterpret_cast<char*>(&e)); + o = e; + } +}; + +// varint +// +// high bit of each byte indicates another byte follows. +template<typename T> +inline void denc_varint(T v, size_t& p) { + p += sizeof(T) + 1; +} + +template<typename T> +inline void denc_varint(T v, bufferlist::contiguous_appender& p) { + uint8_t byte = v & 0x7f; + v >>= 7; + while (v) { + byte |= 0x80; + get_pos_add<__u8>(p) = byte; + byte = (v & 0x7f); + v >>= 7; + } + get_pos_add<__u8>(p) = byte; +} + +template<typename T> +inline void denc_varint(T& v, bufferptr::const_iterator& p) { + uint8_t byte = *(__u8*)p.get_pos_add(1); + v = byte & 0x7f; + int shift = 7; + while (byte & 0x80) { + byte = get_pos_add<__u8>(p); + v |= (T)(byte & 0x7f) << shift; + shift += 7; + } +} + + +// signed varint encoding +// +// low bit = 1 = negative, 0 = positive +// high bit of every byte indicates whether another byte follows. +inline void denc_signed_varint(int64_t v, size_t& p) { + p += sizeof(v) + 2; +} +template<class It> +inline std::enable_if_t<!is_const_iterator_v<It>> +denc_signed_varint(int64_t v, It& p) { + if (v < 0) { + v = (-v << 1) | 1; + } else { + v <<= 1; + } + denc_varint(v, p); +} + +template<typename T, class It> +inline std::enable_if_t<is_const_iterator_v<It>> +denc_signed_varint(T& v, It& p) +{ + int64_t i = 0; + denc_varint(i, p); + if (i & 1) { + v = -(i >> 1); + } else { + v = i >> 1; + } +} + +// varint + lowz encoding +// +// first(low) 2 bits = how many low zero bits (nibbles) +// high bit of each byte = another byte follows +// (so, 5 bits data in first byte, 7 bits data thereafter) +inline void denc_varint_lowz(uint64_t v, size_t& p) { + p += sizeof(v) + 2; +} +inline void denc_varint_lowz(uint64_t v, bufferlist::contiguous_appender& p) { + int lowznib = v ? 
(ctz(v) / 4) : 0; + if (lowznib > 3) + lowznib = 3; + v >>= lowznib * 4; + v <<= 2; + v |= lowznib; + denc_varint(v, p); +} + +template<typename T> +inline void denc_varint_lowz(T& v, bufferptr::const_iterator& p) +{ + uint64_t i = 0; + denc_varint(i, p); + int lowznib = (i & 3); + i >>= 2; + i <<= lowznib * 4; + v = i; +} + +// signed varint + lowz encoding +// +// first low bit = 1 for negative, 0 for positive +// next 2 bits = how many low zero bits (nibbles) +// high bit of each byte = another byte follows +// (so, 4 bits data in first byte, 7 bits data thereafter) +inline void denc_signed_varint_lowz(int64_t v, size_t& p) { + p += sizeof(v) + 2; +} +template<class It> +inline std::enable_if_t<!is_const_iterator_v<It>> +denc_signed_varint_lowz(int64_t v, It& p) { + bool negative = false; + if (v < 0) { + v = -v; + negative = true; + } + unsigned lowznib = v ? (ctz(v) / 4) : 0u; + if (lowznib > 3) + lowznib = 3; + v >>= lowznib * 4; + v <<= 3; + v |= lowznib << 1; + v |= (int)negative; + denc_varint(v, p); +} + +template<typename T, class It> +inline std::enable_if_t<is_const_iterator_v<It>> +denc_signed_varint_lowz(T& v, It& p) +{ + int64_t i = 0; + denc_varint(i, p); + int lowznib = (i & 6) >> 1; + if (i & 1) { + i >>= 3; + i <<= lowznib * 4; + v = -i; + } else { + i >>= 3; + i <<= lowznib * 4; + v = i; + } +} + + +// LBA +// +// first 1-3 bits = how many low zero bits +// *0 = 12 (common 4 K alignment case) +// *01 = 16 +// *011 = 20 +// *111 = byte +// then 28-30 bits of data +// then last bit = another byte follows +// high bit of each subsequent byte = another byte follows +inline void denc_lba(uint64_t v, size_t& p) { + p += sizeof(v) + 2; +} + +template<class It> +inline std::enable_if_t<!is_const_iterator_v<It>> +denc_lba(uint64_t v, It& p) { + int low_zero_nibbles = v ? 
(int)(ctz(v) / 4) : 0; + int pos; + uint32_t word; + int t = low_zero_nibbles - 3; + if (t < 0) { + pos = 3; + word = 0x7; + } else if (t < 3) { + v >>= (low_zero_nibbles * 4); + pos = t + 1; + word = (1 << t) - 1; + } else { + v >>= 20; + pos = 3; + word = 0x3; + } + word |= (v << pos) & 0x7fffffff; + v >>= 31 - pos; + if (!v) { + *(ceph_le32*)p.get_pos_add(sizeof(uint32_t)) = word; + return; + } + word |= 0x80000000; + *(ceph_le32*)p.get_pos_add(sizeof(uint32_t)) = word; + uint8_t byte = v & 0x7f; + v >>= 7; + while (v) { + byte |= 0x80; + *(__u8*)p.get_pos_add(1) = byte; + byte = (v & 0x7f); + v >>= 7; + } + *(__u8*)p.get_pos_add(1) = byte; +} + +template<class It> +inline std::enable_if_t<is_const_iterator_v<It>> +denc_lba(uint64_t& v, It& p) { + uint32_t word = *(ceph_le32*)p.get_pos_add(sizeof(uint32_t)); + int shift; + switch (word & 7) { + case 0: + case 2: + case 4: + case 6: + v = (uint64_t)(word & 0x7ffffffe) << (12 - 1); + shift = 12 + 30; + break; + case 1: + case 5: + v = (uint64_t)(word & 0x7ffffffc) << (16 - 2); + shift = 16 + 29; + break; + case 3: + v = (uint64_t)(word & 0x7ffffff8) << (20 - 3); + shift = 20 + 28; + break; + case 7: + v = (uint64_t)(word & 0x7ffffff8) >> 3; + shift = 28; + } + uint8_t byte = word >> 24; + while (byte & 0x80) { + byte = *(__u8*)p.get_pos_add(1); + v |= (uint64_t)(byte & 0x7f) << shift; + shift += 7; + } +} + + +// --------------------------------------------------------------------- +// denc top-level methods that call into denc_traits<T> methods + +template<typename T, typename traits=denc_traits<T>> +inline std::enable_if_t<traits::supported> denc( + const T& o, + size_t& p, + uint64_t f=0) +{ + if constexpr (traits::featured) { + traits::bound_encode(o, p, f); + } else { + traits::bound_encode(o, p); + } +} + +template<typename T, class It, typename traits=denc_traits<T>> +inline std::enable_if_t<traits::supported && !is_const_iterator_v<It>> +denc(const T& o, + It& p, + uint64_t features=0) +{ + if constexpr 
(traits::featured) { + traits::encode(o, p, features); + } else { + traits::encode(o, p); + } +} + +template<typename T, class It, typename traits=denc_traits<T>> +inline std::enable_if_t<traits::supported && is_const_iterator_v<It>> +denc(T& o, + It& p, + uint64_t features=0) +{ + if constexpr (traits::featured) { + traits::decode(o, p, features); + } else { + traits::decode(o, p); + } +} + +namespace _denc { +template<typename T, typename = void> +struct has_legacy_denc : std::false_type {}; +template<typename T> +struct has_legacy_denc<T, decltype(std::declval<T&>() + .decode(std::declval< + bufferlist::const_iterator&>()))> + : std::true_type { + static void decode(T& v, bufferlist::const_iterator& p) { + v.decode(p); + } +}; +template<typename T> +struct has_legacy_denc<T, + std::enable_if_t< + !denc_traits<T>::need_contiguous>> : std::true_type { + static void decode(T& v, bufferlist::const_iterator& p) { + denc_traits<T>::decode(v, p); + } +}; +} + +template<typename T, + typename traits=denc_traits<T>, + typename has_legacy_denc=_denc::has_legacy_denc<T>> +inline std::enable_if_t<traits::supported && + has_legacy_denc::value> denc( + T& o, + buffer::list::const_iterator& p) +{ + has_legacy_denc::decode(o, p); +} + +// --------------------------------------------------------------------- +// base types and containers + +// +// std::string +// +template<typename A> +struct denc_traits<std::basic_string<char,std::char_traits<char>,A>> { +private: + using value_type = std::basic_string<char,std::char_traits<char>,A>; + +public: + static constexpr bool supported = true; + static constexpr bool featured = false; + static constexpr bool bounded = false; + static constexpr bool need_contiguous = false; + + static void bound_encode(const value_type& s, size_t& p, uint64_t f=0) { + p += sizeof(uint32_t) + s.size(); + } + template<class It> + static void encode(const value_type& s, + It& p, + uint64_t f=0) { + denc((uint32_t)s.size(), p); + 
memcpy(p.get_pos_add(s.size()), s.data(), s.size()); + } + template<class It> + static void decode(value_type& s, + It& p, + uint64_t f=0) { + uint32_t len; + denc(len, p); + decode_nohead(len, s, p); + } + static void decode(value_type& s, buffer::list::const_iterator& p) + { + uint32_t len; + denc(len, p); + decode_nohead(len, s, p); + } + template<class It> + static void decode_nohead(size_t len, value_type& s, It& p) { + s.clear(); + if (len) { + s.append(p.get_pos_add(len), len); + } + } + static void decode_nohead(size_t len, value_type& s, + buffer::list::const_iterator& p) { + if (len) { + if constexpr (std::is_same_v<value_type, std::string>) { + s.clear(); + p.copy(len, s); + } else { + s.resize(len); + p.copy(len, s.data()); + } + } else { + s.clear(); + } + } + template<class It> + static std::enable_if_t<!is_const_iterator_v<It>> + encode_nohead(const value_type& s, It& p) { + auto len = s.length(); + maybe_inline_memcpy(p.get_pos_add(len), s.data(), len, 16); + } +}; + +// +// bufferptr +// +template<> +struct denc_traits<bufferptr> { + static constexpr bool supported = true; + static constexpr bool featured = false; + static constexpr bool bounded = false; + static constexpr bool need_contiguous = false; + static void bound_encode(const bufferptr& v, size_t& p, uint64_t f=0) { + p += sizeof(uint32_t) + v.length(); + } + template <class It> + static std::enable_if_t<!is_const_iterator_v<It>> + encode(const bufferptr& v, It& p, uint64_t f=0) { + denc((uint32_t)v.length(), p); + p.append(v); + } + template <class It> + static std::enable_if_t<is_const_iterator_v<It>> + decode(bufferptr& v, It& p, uint64_t f=0) { + uint32_t len; + denc(len, p); + v = p.get_ptr(len); + } + static void decode(bufferptr& v, buffer::list::const_iterator& p) { + uint32_t len; + denc(len, p); + bufferlist s; + p.copy(len, s); + if (len) { + if (s.get_num_buffers() == 1) + v = s.front(); + else + v = buffer::copy(s.c_str(), s.length()); + } + } +}; + +// +// bufferlist +// 
+template<> +struct denc_traits<bufferlist> { + static constexpr bool supported = true; + static constexpr bool featured = false; + static constexpr bool bounded = false; + static constexpr bool need_contiguous = false; + static void bound_encode(const bufferlist& v, size_t& p, uint64_t f=0) { + p += sizeof(uint32_t) + v.length(); + } + static void encode(const bufferlist& v, buffer::list::contiguous_appender& p, + uint64_t f=0) { + denc((uint32_t)v.length(), p); + p.append(v); + } + static void decode(bufferlist& v, buffer::ptr::const_iterator& p, uint64_t f=0) { + uint32_t len; + denc(len, p); + v.clear(); + v.push_back(p.get_ptr(len)); + } + static void decode(bufferlist& v, buffer::list::const_iterator& p) { + uint32_t len; + denc(len, p); + v.clear(); + p.copy(len, v); + } + static void encode_nohead(const bufferlist& v, + buffer::list::contiguous_appender& p) { + p.append(v); + } + static void decode_nohead(size_t len, bufferlist& v, + buffer::ptr::const_iterator& p) { + v.clear(); + if (len) { + v.append(p.get_ptr(len)); + } + } + static void decode_nohead(size_t len, bufferlist& v, + buffer::list::const_iterator& p) { + v.clear(); + p.copy(len, v); + } +}; + +// +// std::pair<A, B> +// +template<typename A, typename B> +struct denc_traits< + std::pair<A, B>, + std::enable_if_t<denc_supported<A> && denc_supported<B>>> { + typedef denc_traits<A> a_traits; + typedef denc_traits<B> b_traits; + + static constexpr bool supported = true; + static constexpr bool featured = a_traits::featured || b_traits::featured ; + static constexpr bool bounded = a_traits::bounded && b_traits::bounded; + static constexpr bool need_contiguous = (a_traits::need_contiguous || + b_traits::need_contiguous); + + static void bound_encode(const std::pair<A,B>& v, size_t& p, uint64_t f = 0) { + if constexpr (featured) { + denc(v.first, p, f); + denc(v.second, p, f); + } else { + denc(v.first, p); + denc(v.second, p); + } + } + + static void encode(const std::pair<A,B>& v, 
bufferlist::contiguous_appender& p, + uint64_t f = 0) { + if constexpr (featured) { + denc(v.first, p, f); + denc(v.second, p, f); + } else { + denc(v.first, p); + denc(v.second, p); + } + } + + static void decode(std::pair<A,B>& v, buffer::ptr::const_iterator& p, uint64_t f=0) { + denc(v.first, p, f); + denc(v.second, p, f); + } + template<typename AA=A> + static std::enable_if_t<!!sizeof(AA) && !need_contiguous> + decode(std::pair<A,B>& v, buffer::list::const_iterator& p, + uint64_t f = 0) { + denc(v.first, p); + denc(v.second, p); + } +}; + +namespace _denc { + template<template<class...> class C, typename Details, typename ...Ts> + struct container_base { + private: + using container = C<Ts...>; + using T = typename Details::T; + + public: + using traits = denc_traits<T>; + + static constexpr bool supported = true; + static constexpr bool featured = traits::featured; + static constexpr bool bounded = false; + static constexpr bool need_contiguous = traits::need_contiguous; + + template<typename U=T> + static void bound_encode(const container& s, size_t& p, uint64_t f = 0) { + p += sizeof(uint32_t); + if constexpr (traits::bounded) { + if (!s.empty()) { + // STL containers use weird element types like std::pair<const K, V>; + // cast to something we have denc_traits for. 
+ size_t elem_size = 0; + if constexpr (traits::featured) { + denc(static_cast<const T&>(*s.begin()), elem_size, f); + } else { + denc(static_cast<const T&>(*s.begin()), elem_size); + } + p += sizeof(uint32_t) + elem_size * s.size(); + } + } else { + for (const T& e : s) { + if constexpr (traits::featured) { + denc(e, p, f); + } else { + denc(e, p); + } + } + } + } + + template<typename U=T> + static void encode(const container& s, buffer::list::contiguous_appender& p, + uint64_t f = 0) { + denc((uint32_t)s.size(), p); + if constexpr (traits::featured) { + encode_nohead(s, p, f); + } else { + encode_nohead(s, p); + } + } + static void decode(container& s, buffer::ptr::const_iterator& p, uint64_t f = 0) { + uint32_t num; + denc(num, p); + decode_nohead(num, s, p, f); + } + template<typename U=T> + static std::enable_if_t<!!sizeof(U) && !need_contiguous> + decode(container& s, buffer::list::const_iterator& p) { + uint32_t num; + denc(num, p); + decode_nohead(num, s, p); + } + + // nohead + static void encode_nohead(const container& s, buffer::list::contiguous_appender& p, + uint64_t f = 0) { + for (const T& e : s) { + if constexpr (traits::featured) { + denc(e, p, f); + } else { + denc(e, p); + } + } + } + static void decode_nohead(size_t num, container& s, + buffer::ptr::const_iterator& p, uint64_t f=0) { + s.clear(); + Details::reserve(s, num); + while (num--) { + T t; + denc(t, p, f); + Details::insert(s, std::move(t)); + } + } + template<typename U=T> + static std::enable_if_t<!!sizeof(U) && !need_contiguous> + decode_nohead(size_t num, container& s, + buffer::list::const_iterator& p) { + s.clear(); + Details::reserve(s, num); + while (num--) { + T t; + denc(t, p); + Details::insert(s, std::move(t)); + } + } + }; + + template<typename T> + class container_has_reserve { + template<typename U, U> struct SFINAE_match; + template<typename U> + static std::true_type test(SFINAE_match<T(*)(typename T::size_type), + &U::reserve>*); + + template<typename U> + static 
std::false_type test(...); + + public: + static constexpr bool value = decltype( + test<denc_traits<T>>(0))::value; + }; + template<typename T> + inline constexpr bool container_has_reserve_v = + container_has_reserve<T>::value; + + + template<typename Container> + struct container_details_base { + using T = typename Container::value_type; + static void reserve(Container& c, size_t s) { + if constexpr (container_has_reserve_v<Container>) { + c.reserve(s); + } + } + }; + + template<typename Container> + struct pushback_details : public container_details_base<Container> { + template<typename ...Args> + static void insert(Container& c, Args&& ...args) { + c.emplace_back(std::forward<Args>(args)...); + } + }; +} + +template<typename T, typename ...Ts> +struct denc_traits< + std::list<T, Ts...>, + typename std::enable_if_t<denc_traits<T>::supported>> + : public _denc::container_base<std::list, + _denc::pushback_details<std::list<T, Ts...>>, + T, Ts...> {}; + +template<typename T, typename ...Ts> +struct denc_traits< + std::vector<T, Ts...>, + typename std::enable_if_t<denc_traits<T>::supported>> + : public _denc::container_base<std::vector, + _denc::pushback_details<std::vector<T, Ts...>>, + T, Ts...> {}; + +namespace _denc { + template<typename Container> + struct setlike_details : public container_details_base<Container> { + using T = typename Container::value_type; + template<typename ...Args> + static void insert(Container& c, Args&& ...args) { + c.emplace_hint(c.cend(), std::forward<Args>(args)...); + } + }; +} + +template<typename T, typename ...Ts> +struct denc_traits< + std::set<T, Ts...>, + std::enable_if_t<denc_traits<T>::supported>> + : public _denc::container_base<std::set, + _denc::setlike_details<std::set<T, Ts...>>, + T, Ts...> {}; + +template<typename T, typename ...Ts> +struct denc_traits< + boost::container::flat_set<T, Ts...>, + std::enable_if_t<denc_traits<T>::supported>> + : public _denc::container_base< + boost::container::flat_set, + 
_denc::setlike_details<boost::container::flat_set<T, Ts...>>, + T, Ts...> {}; + +namespace _denc { + template<typename Container> + struct maplike_details : public container_details_base<Container> { + using T = std::pair<typename Container::key_type, + typename Container::mapped_type>; + template<typename ...Args> + static void insert(Container& c, Args&& ...args) { + c.emplace_hint(c.cend(), std::forward<Args>(args)...); + } + }; +} + +template<typename A, typename B, typename ...Ts> +struct denc_traits< + std::map<A, B, Ts...>, + std::enable_if_t<denc_traits<A>::supported && + denc_traits<B>::supported>> + : public _denc::container_base<std::map, + _denc::maplike_details<std::map<A, B, Ts...>>, + A, B, Ts...> {}; + +template<typename A, typename B, typename ...Ts> +struct denc_traits< + boost::container::flat_map<A, B, Ts...>, + std::enable_if_t<denc_traits<A>::supported && + denc_traits<B>::supported>> + : public _denc::container_base< + boost::container::flat_map, + _denc::maplike_details<boost::container::flat_map< + A, B, Ts...>>, + A, B, Ts...> {}; + +template<typename T, size_t N> +struct denc_traits< + std::array<T, N>, + std::enable_if_t<denc_traits<T>::supported>> { +private: + using container = std::array<T, N>; +public: + using traits = denc_traits<T>; + + static constexpr bool supported = true; + static constexpr bool featured = traits::featured; + static constexpr bool bounded = traits::bounded; + static constexpr bool need_contiguous = traits::need_contiguous; + + static void bound_encode(const container& s, size_t& p, uint64_t f = 0) { + if constexpr (traits::bounded) { + if constexpr (traits::featured) { + if (!s.empty()) { + size_t elem_size = 0; + denc(*s.begin(), elem_size, f); + p += elem_size * s.size(); + } + } else { + size_t elem_size = 0; + denc(*s.begin(), elem_size); + p += elem_size * N; + } + } else { + for (const auto& e : s) { + if constexpr (traits::featured) { + denc(e, p, f); + } else { + denc(e, p); + } + } + } + } + + static 
void encode(const container& s, buffer::list::contiguous_appender& p, + uint64_t f = 0) { + for (const auto& e : s) { + if constexpr (traits::featured) { + denc(e, p, f); + } else { + denc(e, p); + } + } + } + static void decode(container& s, buffer::ptr::const_iterator& p, uint64_t f = 0) { + for (auto& e : s) + denc(e, p, f); + } + template<typename U=T> + static std::enable_if_t<!!sizeof(U) && + !need_contiguous> + decode(container& s, buffer::list::const_iterator& p) { + for (auto& e : s) { + denc(e, p); + } + } +}; + +template<typename... Ts> +struct denc_traits< + std::tuple<Ts...>, + std::enable_if_t<(denc_traits<Ts>::supported && ...)>> { + +private: + static_assert(sizeof...(Ts) > 0, + "Zero-length tuples are not supported."); + using container = std::tuple<Ts...>; + +public: + + static constexpr bool supported = true; + static constexpr bool featured = (denc_traits<Ts>::featured || ...); + static constexpr bool bounded = (denc_traits<Ts>::bounded && ...); + static constexpr bool need_contiguous = + (denc_traits<Ts>::need_contiguous || ...); + + template<typename U = container> + static std::enable_if_t<denc_traits<U>::featured> + bound_encode(const container& s, size_t& p, uint64_t f) { + ceph::for_each(s, [&p, f] (const auto& e) { + if constexpr (denc_traits<std::decay_t<decltype(e)>>::featured) { + denc(e, p, f); + } else { + denc(e, p); + } + }); + } + template<typename U = container> + static std::enable_if_t<!denc_traits<U>::featured> + bound_encode(const container& s, size_t& p) { + ceph::for_each(s, [&p] (const auto& e) { + denc(e, p); + }); + } + + template<typename U = container> + static std::enable_if_t<denc_traits<U>::featured> + encode(const container& s, buffer::list::contiguous_appender& p, uint64_t f) { + ceph::for_each(s, [&p, f] (const auto& e) { + if constexpr (denc_traits<std::decay_t<decltype(e)>>::featured) { + denc(e, p, f); + } else { + denc(e, p); + } + }); + } + template<typename U = container> + static 
std::enable_if_t<!denc_traits<U>::featured> + encode(const container& s, buffer::list::contiguous_appender& p) { + ceph::for_each(s, [&p] (const auto& e) { + denc(e, p); + }); + } + + static void decode(container& s, buffer::ptr::const_iterator& p, uint64_t f = 0) { + ceph::for_each(s, [&p] (auto& e) { + denc(e, p); + }); + } + + template<typename U = container> + static std::enable_if_t<!denc_traits<U>::need_contiguous> + decode(container& s, buffer::list::const_iterator& p, uint64_t f = 0) { + ceph::for_each(s, [&p] (auto& e) { + denc(e, p); + }); + } +}; + +// +// boost::optional<T> +// +template<typename T> +struct denc_traits< + boost::optional<T>, + std::enable_if_t<denc_traits<T>::supported>> { + using traits = denc_traits<T>; + + static constexpr bool supported = true; + static constexpr bool featured = traits::featured; + static constexpr bool bounded = false; + static constexpr bool need_contiguous = traits::need_contiguous; + + static void bound_encode(const boost::optional<T>& v, size_t& p, + uint64_t f = 0) { + p += sizeof(bool); + if (v) { + if constexpr (featured) { + denc(*v, p, f); + } else { + denc(*v, p); + } + } + } + + static void encode(const boost::optional<T>& v, + bufferlist::contiguous_appender& p, + uint64_t f = 0) { + denc((bool)v, p); + if (v) { + if constexpr (featured) { + denc(*v, p, f); + } else { + denc(*v, p); + } + } + } + + static void decode(boost::optional<T>& v, buffer::ptr::const_iterator& p, + uint64_t f = 0) { + bool x; + denc(x, p, f); + if (x) { + v = T{}; + denc(*v, p, f); + } else { + v = boost::none; + } + } + + template<typename U = T> + static std::enable_if_t<!!sizeof(U) && !need_contiguous> + decode(boost::optional<T>& v, buffer::list::const_iterator& p) { + bool x; + denc(x, p); + if (x) { + v = T{}; + denc(*v, p); + } else { + v = boost::none; + } + } + + template<typename U = T> + static void encode_nohead(const boost::optional<T>& v, + bufferlist::contiguous_appender& p, + uint64_t f = 0) { + if (v) { + if 
constexpr (featured) { + denc(*v, p, f); + } else { + denc(*v, p); + } + } + } + + static void decode_nohead(bool num, boost::optional<T>& v, + buffer::ptr::const_iterator& p, uint64_t f = 0) { + if (num) { + v = T(); + denc(*v, p, f); + } else { + v = boost::none; + } + } +}; + +template<> +struct denc_traits<boost::none_t> { + static constexpr bool supported = true; + static constexpr bool featured = false; + static constexpr bool bounded = true; + static constexpr bool need_contiguous = false; + + static void bound_encode(const boost::none_t& v, size_t& p) { + p += sizeof(bool); + } + + static void encode(const boost::none_t& v, + bufferlist::contiguous_appender& p) { + denc(false, p); + } +}; + +// +// std::optional<T> +// +template<typename T> +struct denc_traits< + std::optional<T>, + std::enable_if_t<denc_traits<T>::supported>> { + using traits = denc_traits<T>; + + static constexpr bool supported = true; + static constexpr bool featured = traits::featured; + static constexpr bool bounded = false; + static constexpr bool need_contiguous = traits::need_contiguous; + + static void bound_encode(const std::optional<T>& v, size_t& p, + uint64_t f = 0) { + p += sizeof(bool); + if (v) { + if constexpr (featured) { + denc(*v, p, f); + } else { + denc(*v, p); + } + } + } + + static void encode(const std::optional<T>& v, + bufferlist::contiguous_appender& p, + uint64_t f = 0) { + denc((bool)v, p); + if (v) { + if constexpr (featured) { + denc(*v, p, f); + } else { + denc(*v, p); + } + } + } + + static void decode(std::optional<T>& v, buffer::ptr::const_iterator& p, + uint64_t f = 0) { + bool x; + denc(x, p, f); + if (x) { + v = T{}; + denc(*v, p, f); + } else { + v = std::nullopt; + } + } + + template<typename U = T> + static std::enable_if_t<!!sizeof(U) && !need_contiguous> + decode(std::optional<T>& v, buffer::list::const_iterator& p) { + bool x; + denc(x, p); + if (x) { + v = T{}; + denc(*v, p); + } else { + v = std::nullopt; + } + } + + static void 
encode_nohead(const std::optional<T>& v, + bufferlist::contiguous_appender& p, + uint64_t f = 0) { + if (v) { + if constexpr (featured) { + denc(*v, p, f); + } else { + denc(*v, p); + } + } + } + + static void decode_nohead(bool num, std::optional<T>& v, + buffer::ptr::const_iterator& p, uint64_t f = 0) { + if (num) { + v = T(); + denc(*v, p, f); + } else { + v = std::nullopt; + } + } +}; + +template<> +struct denc_traits<std::nullopt_t> { + static constexpr bool supported = true; + static constexpr bool featured = false; + static constexpr bool bounded = true; + static constexpr bool need_contiguous = false; + + static void bound_encode(const std::nullopt_t& v, size_t& p) { + p += sizeof(bool); + } + + static void encode(const std::nullopt_t& v, + bufferlist::contiguous_appender& p) { + denc(false, p); + } +}; + +// ---------------------------------------------------------------------- +// class helpers + +// Write denc_traits<> for a class that defines bound_encode/encode/decode +// methods. 
// Unfeatured flavour: T provides bound_encode(size_t&),
// encode(contiguous_appender&) and decode(ptr::const_iterator&).
// The generated traits accept a feature argument but do not pass it on.
#define WRITE_CLASS_DENC(T) _DECLARE_CLASS_DENC(T, false)
#define WRITE_CLASS_DENC_BOUNDED(T) _DECLARE_CLASS_DENC(T, true)
#define _DECLARE_CLASS_DENC(T, is_bounded) \
  template<> struct denc_traits<T> { \
    static constexpr bool supported = true; \
    static constexpr bool featured = false; \
    static constexpr bool bounded = is_bounded; \
    static constexpr bool need_contiguous = !_denc::has_legacy_denc<T>::value; \
    static void bound_encode(const T& obj, size_t& bound, uint64_t = 0) { \
      obj.bound_encode(bound); \
    } \
    static void encode(const T& obj, buffer::list::contiguous_appender& app, \
                       uint64_t = 0) { \
      obj.encode(app); \
    } \
    static void decode(T& obj, buffer::ptr::const_iterator& it, uint64_t = 0) { \
      obj.decode(it); \
    } \
  };

// Featured flavour: the feature bits are mandatory for bound_encode/encode
// and are forwarded to the corresponding member functions of T.
#define WRITE_CLASS_DENC_FEATURED(T) _DECLARE_CLASS_DENC_FEATURED(T, false)
#define WRITE_CLASS_DENC_FEATURED_BOUNDED(T) _DECLARE_CLASS_DENC_FEATURED(T, true)
#define _DECLARE_CLASS_DENC_FEATURED(T, is_bounded) \
  template<> struct denc_traits<T> { \
    static constexpr bool supported = true; \
    static constexpr bool featured = true; \
    static constexpr bool bounded = is_bounded; \
    static constexpr bool need_contiguous = !_denc::has_legacy_denc<T>::value; \
    static void bound_encode(const T& obj, size_t& bound, uint64_t feat) { \
      obj.bound_encode(bound, feat); \
    } \
    static void encode(const T& obj, buffer::list::contiguous_appender& app, \
                       uint64_t feat) { \
      obj.encode(app, feat); \
    } \
    static void decode(T& obj, buffer::ptr::const_iterator& it, uint64_t feat = 0) { \
      obj.decode(it, feat); \
    } \
  };


// ----------------------------------------------------------------------
// encode/decode wrappers

// These glue the new-style denc world into old-style calls to encode
// and decode by calling into denc_traits<> methods (when present).
+ +namespace ceph { +template<typename T, typename traits=denc_traits<T>> +inline std::enable_if_t<traits::supported && !traits::featured> encode( + const T& o, + bufferlist& bl, + uint64_t features_unused=0) +{ + size_t len = 0; + traits::bound_encode(o, len); + auto a = bl.get_contiguous_appender(len); + traits::encode(o, a); +} + +template<typename T, typename traits=denc_traits<T>> +inline std::enable_if_t<traits::supported && traits::featured> encode( + const T& o, bufferlist& bl, + uint64_t features) +{ + size_t len = 0; + traits::bound_encode(o, len, features); + auto a = bl.get_contiguous_appender(len); + traits::encode(o, a, features); +} + +template<typename T, + typename traits=denc_traits<T>> +inline std::enable_if_t<traits::supported && !traits::need_contiguous> decode( + T& o, + bufferlist::const_iterator& p) +{ + if (p.end()) + throw buffer::end_of_buffer(); + const auto& bl = p.get_bl(); + const auto remaining = bl.length() - p.get_off(); + // it is expensive to rebuild a contigous buffer and drop it, so avoid this. + if (!p.is_pointing_same_raw(bl.back()) && remaining > CEPH_PAGE_SIZE) { + traits::decode(o, p); + } else { + // ensure we get a contigous buffer... until the end of the + // bufferlist. we don't really know how much we'll need here, + // unfortunately. hopefully it is already contiguous and we're just + // bumping the raw ref and initializing the ptr tmp fields. + bufferptr tmp; + auto t = p; + t.copy_shallow(remaining, tmp); + auto cp = std::cbegin(tmp); + traits::decode(o, cp); + p.advance(cp.get_offset()); + } +} + +template<typename T, + typename traits=denc_traits<T>> +inline std::enable_if_t<traits::supported && traits::need_contiguous> decode( + T& o, + bufferlist::const_iterator& p) +{ + if (p.end()) + throw buffer::end_of_buffer(); + // ensure we get a contigous buffer... until the end of the + // bufferlist. we don't really know how much we'll need here, + // unfortunately. 
hopefully it is already contiguous and we're just + // bumping the raw ref and initializing the ptr tmp fields. + bufferptr tmp; + auto t = p; + t.copy_shallow(p.get_bl().length() - p.get_off(), tmp); + auto cp = std::cbegin(tmp); + traits::decode(o, cp); + p.advance(cp.get_offset()); +} + +// nohead variants +template<typename T, typename traits=denc_traits<T>> +inline std::enable_if_t<traits::supported && + !traits::featured> encode_nohead( + const T& o, + bufferlist& bl) +{ + size_t len = 0; + traits::bound_encode(o, len); + auto a = bl.get_contiguous_appender(len); + traits::encode_nohead(o, a); +} + +template<typename T, typename traits=denc_traits<T>> +inline std::enable_if_t<traits::supported && !traits::featured> decode_nohead( + size_t num, + T& o, + bufferlist::const_iterator& p) +{ + if (!num) + return; + if (p.end()) + throw buffer::end_of_buffer(); + if constexpr (traits::need_contiguous) { + bufferptr tmp; + auto t = p; + if constexpr (denc_traits<typename T::value_type>::bounded) { + size_t element_size = 0; + typename T::value_type v; + denc_traits<typename T::value_type>::bound_encode(v, element_size); + t.copy_shallow(num * element_size, tmp); + } else { + t.copy_shallow(p.get_bl().length() - p.get_off(), tmp); + } + auto cp = std::cbegin(tmp); + traits::decode_nohead(num, o, cp); + p.advance(cp.get_offset()); + } else { + traits::decode_nohead(num, o, p); + } +} +} + + +// ---------------------------------------------------------------- +// DENC + +// These are some class methods we need to do the version and length +// wrappers for DENC_{START,FINISH} for inter-version +// interoperability. 
// Static helpers pulled into a class by DENC()/DENC_FEATURED().  One
// _denc_start/_denc_finish pair exists per pass, selected by the type of `p`:
//   size_t&              -> bound_encode: account for the 2 + 4 header bytes
//   contiguous_appender& -> encode: write struct_v and struct_compat, leave
//                           4 bytes for the length and patch them in at finish
//   ptr::const_iterator& -> decode: read struct_v, struct_compat and the
//                           length, then skip any unread trailing bytes at
//                           finish
// The last two pointer arguments are scratch slots owned by DENC_START.
#define DENC_HELPERS \
  /* bound_encode */ \
  static void _denc_start(size_t& p, \
                          __u8 *struct_v, \
                          __u8 *struct_compat, \
                          char **, uint32_t *) { \
    p += 2 + 4; \
  } \
  static void _denc_finish(size_t& p, \
                           __u8 *struct_v, \
                           __u8 *struct_compat, \
                           char **, uint32_t *) { } \
  /* encode */ \
  static void _denc_start(bufferlist::contiguous_appender& p, \
                          __u8 *struct_v, \
                          __u8 *struct_compat, \
                          char **len_pos, \
                          uint32_t *start_oob_off) { \
    denc(*struct_v, p); \
    denc(*struct_compat, p); \
    /* remember where the length goes; it is only known at finish */ \
    *len_pos = p.get_pos_add(4); \
    *start_oob_off = p.get_out_of_band_offset(); \
  } \
  static void _denc_finish(bufferlist::contiguous_appender& p, \
                           __u8 *struct_v, \
                           __u8 *struct_compat, \
                           char **len_pos, \
                           uint32_t *start_oob_off) { \
    /* payload = bytes written after the length field, plus whatever */ \
    /* the appender accounted out of band since _denc_start */ \
    *(ceph_le32*)*len_pos = p.get_pos() - *len_pos - sizeof(uint32_t) + \
      p.get_out_of_band_offset() - *start_oob_off; \
  } \
  /* decode */ \
  static void _denc_start(buffer::ptr::const_iterator& p, \
                          __u8 *struct_v, \
                          __u8 *struct_compat, \
                          char **start_pos, \
                          uint32_t *struct_len) { \
    denc(*struct_v, p); \
    denc(*struct_compat, p); \
    denc(*struct_len, p); \
    *start_pos = const_cast<char*>(p.get_pos()); \
  } \
  static void _denc_finish(buffer::ptr::const_iterator& p, \
                           __u8 *struct_v, __u8 *struct_compat, \
                           char **start_pos, \
                           uint32_t *struct_len) { \
    const char *pos = p.get_pos(); \
    char *end = *start_pos + *struct_len; \
    /* struct_len comes from the encoded data: if the decoder consumed */ \
    /* more than it announced, the input is malformed.  Report that as */ \
    /* a decode error instead of asserting. */ \
    if (pos > end) { \
      throw buffer::malformed_input(__PRETTY_FUNCTION__); \
    } \
    /* skip trailing bytes this decoder did not read */ \
    if (pos < end) { \
      p.advance(end - pos); \
    } \
  }

// Helpers for versioning the encoding.  These correspond to the
// {ENCODE,DECODE}_{START,FINISH} macros.
+ +#define DENC_START(v, compat, p) \ + __u8 struct_v = v; \ + __u8 struct_compat = compat; \ + char *_denc_pchar; \ + uint32_t _denc_u32; \ + _denc_start(p, &struct_v, &struct_compat, &_denc_pchar, &_denc_u32); \ + do { + +#define DENC_FINISH(p) \ + } while (false); \ + _denc_finish(p, &struct_v, &struct_compat, &_denc_pchar, &_denc_u32); + + +// ---------------------------------------------------------------------- + +// Helpers for writing a unified bound_encode/encode/decode +// implementation that won't screw up buffer size estimations. + +#define DENC(Type, v, p) \ + DENC_HELPERS \ + void bound_encode(size_t& p) const { \ + _denc_friend(*this, p); \ + } \ + void encode(bufferlist::contiguous_appender& p) const { \ + DENC_DUMP_PRE(Type); \ + _denc_friend(*this, p); \ + DENC_DUMP_POST(Type); \ + } \ + void decode(buffer::ptr::const_iterator& p) { \ + _denc_friend(*this, p); \ + } \ + template<typename T, typename P> \ + friend std::enable_if_t<std::is_same_v<T, Type> || \ + std::is_same_v<T, const Type>> \ + _denc_friend(T& v, P& p) + +#define DENC_FEATURED(Type, v, p, f) \ + DENC_HELPERS \ + void bound_encode(size_t& p, uint64_t f) const { \ + _denc_friend(*this, p, f); \ + } \ + void encode(bufferlist::contiguous_appender& p, uint64_t f) const { \ + DENC_DUMP_PRE(Type); \ + _denc_friend(*this, p, f); \ + DENC_DUMP_POST(Type); \ + } \ + void decode(buffer::ptr::const_iterator& p, uint64_t f=0) { \ + _denc_friend(*this, p, f); \ + } \ + template<typename T, typename P> \ + friend std::enable_if_t<std::is_same_v<T, Type> || \ + std::is_same_v<T, const Type>> \ + _denc_friend(T& v, P& p, uint64_t f) + +#endif diff --git a/src/include/elist.h b/src/include/elist.h new file mode 100644 index 00000000..38be35db --- /dev/null +++ b/src/include/elist.h @@ -0,0 +1,193 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2004-2006 Sage Weil 
<sage@newdream.net> + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. + * + */ + +#ifndef CEPH_ELIST_H +#define CEPH_ELIST_H + +/* + * elist: embedded list. + * + * requirements: + * - elist<T>::item be embedded in the parent class + * - items are _always_ added to the list via the same elist<T>::item at the same + * fixed offset in the class. + * - begin(), front(), back() methods take the member offset as an argument for traversal. + * + */ + +#define member_offset(cls, member) ((size_t)(&((cls*)1)->member) - 1) + +template<typename T> +class elist { +public: + struct item { + item *_prev, *_next; + + item(T i=0) : _prev(this), _next(this) {} + ~item() { + ceph_assert(!is_on_list()); + } + + item(const item& other) = delete; + const item& operator= (const item& right) = delete; + + + bool empty() const { return _prev == this; } + bool is_on_list() const { return !empty(); } + + bool remove_myself() { + if (_next == this) { + ceph_assert(_prev == this); + return false; + } + _next->_prev = _prev; + _prev->_next = _next; + _prev = _next = this; + return true; + } + + void insert_after(item *other) { + ceph_assert(other->empty()); + other->_prev = this; + other->_next = _next; + _next->_prev = other; + _next = other; + } + void insert_before(item *other) { + ceph_assert(other->empty()); + other->_next = this; + other->_prev = _prev; + _prev->_next = other; + _prev = other; + } + + T get_item(size_t offset) { + ceph_assert(offset); + return (T)(((char *)this) - offset); + } + }; + +private: + item _head; + size_t item_offset; + +public: + elist(const elist& other); + const elist& operator=(const elist& other); + + elist(size_t o) : _head(NULL), item_offset(o) {} + ~elist() { + ceph_assert(_head.empty()); + } + + bool empty() const { + return _head.empty(); + } + + void clear() { + while 
(!_head.empty()) + pop_front(); + } + + void push_front(item *i) { + if (!i->empty()) + i->remove_myself(); + _head.insert_after(i); + } + void push_back(item *i) { + if (!i->empty()) + i->remove_myself(); + _head.insert_before(i); + } + + T front(size_t o=0) { + ceph_assert(!_head.empty()); + return _head._next->get_item(o ? o : item_offset); + } + T back(size_t o=0) { + ceph_assert(!_head.empty()); + return _head._prev->get_item(o ? o : item_offset); + } + + void pop_front() { + ceph_assert(!empty()); + _head._next->remove_myself(); + } + void pop_back() { + ceph_assert(!empty()); + _head._prev->remove_myself(); + } + + void clear_list() { + while (!empty()) + pop_front(); + } + + enum mode_t { + MAGIC, CURRENT, CACHE_NEXT + }; + + class iterator { + private: + item *head; + item *cur, *next; + size_t item_offset; + mode_t mode; + public: + iterator(item *h, size_t o, mode_t m) : + head(h), cur(h->_next), next(cur->_next), item_offset(o), + mode(m) { + ceph_assert(item_offset > 0); + } + T operator*() { + return cur->get_item(item_offset); + } + iterator& operator++() { + ceph_assert(cur); + ceph_assert(cur != head); + if (mode == MAGIC) { + // if 'cur' appears to be valid, use that. otherwise, + // use cached 'next'. + // this is a bit magic, and probably a bad idea... :/ + if (cur->empty()) + cur = next; + else + cur = cur->_next; + } else if (mode == CURRENT) + cur = cur->_next; + else if (mode == CACHE_NEXT) + cur = next; + else + ceph_abort(); + next = cur->_next; + return *this; + } + bool end() const { + return cur == head; + } + }; + + iterator begin(size_t o=0) { + return iterator(&_head, o ? o : item_offset, MAGIC); + } + iterator begin_use_current(size_t o=0) { + return iterator(&_head, o ? o : item_offset, CURRENT); + } + iterator begin_cache_next(size_t o=0) { + return iterator(&_head, o ? 
o : item_offset, CACHE_NEXT); + } +}; + + +#endif diff --git a/src/include/encoding.h b/src/include/encoding.h new file mode 100644 index 00000000..61219024 --- /dev/null +++ b/src/include/encoding.h @@ -0,0 +1,1505 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net> + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. + * + */ +#ifndef CEPH_ENCODING_H +#define CEPH_ENCODING_H + +#include <set> +#include <map> +#include <deque> +#include <vector> +#include <string> +#include <string_view> +#include <tuple> +#include <boost/container/small_vector.hpp> +#include <boost/optional/optional_io.hpp> +#include <boost/tuple/tuple.hpp> + +#include "include/unordered_map.h" +#include "include/unordered_set.h" +#include "common/ceph_time.h" + +#include "include/int_types.h" + +#include "common/convenience.h" + +#include "byteorder.h" +#include "buffer.h" + +// pull in the new-style encoding so that we get the denc_traits<> definition. +#include "denc.h" + +#include "assert.h" + +using namespace ceph; + +namespace ceph { + +/* + * Notes on feature encoding: + * + * - The default encode() methods have a features argument with a default parameter + * (which goes to zero). + * - Normal classes will use WRITE_CLASS_ENCODER, with that features=0 default. + * - Classes that _require_ features will use WRITE_CLASS_ENCODER_FEATURES, which + * does not define the default. Any caller must explicitly pass it in. + * - STL container macros have two encode variants: one with a features arg, and one + * without. + * + * The result: + * - A feature encode() method will fail to compile if a value is not + * passed in. 
+ * - The feature varianet of the STL templates will be used when the feature arg is + * provided. It will be passed through to any template arg types, but it will be + * ignored when not needed. + */ + +// -------------------------------------- +// base types + +template<class T> +inline void encode_raw(const T& t, bufferlist& bl) +{ + bl.append((char*)&t, sizeof(t)); +} +template<class T> +inline void decode_raw(T& t, bufferlist::const_iterator &p) +{ + p.copy(sizeof(t), (char*)&t); +} + +#define WRITE_RAW_ENCODER(type) \ + inline void encode(const type &v, ::ceph::bufferlist& bl, uint64_t features=0) { ::ceph::encode_raw(v, bl); } \ + inline void decode(type &v, ::ceph::bufferlist::const_iterator& p) { ::ceph::decode_raw(v, p); } + +WRITE_RAW_ENCODER(__u8) +#ifndef _CHAR_IS_SIGNED +WRITE_RAW_ENCODER(__s8) +#endif +WRITE_RAW_ENCODER(char) +WRITE_RAW_ENCODER(ceph_le64) +WRITE_RAW_ENCODER(ceph_le32) +WRITE_RAW_ENCODER(ceph_le16) + +inline void encode(const bool &v, bufferlist& bl) { + __u8 vv = v; + encode_raw(vv, bl); +} +inline void decode(bool &v, bufferlist::const_iterator& p) { + __u8 vv; + decode_raw(vv, p); + v = vv; +} + + +// ----------------------------------- +// int types + +#define WRITE_INTTYPE_ENCODER(type, etype) \ + inline void encode(type v, ::ceph::bufferlist& bl, uint64_t features=0) { \ + ceph_##etype e; \ + e = v; \ + ::ceph::encode_raw(e, bl); \ + } \ + inline void decode(type &v, ::ceph::bufferlist::const_iterator& p) { \ + ceph_##etype e; \ + ::ceph::decode_raw(e, p); \ + v = e; \ + } + +WRITE_INTTYPE_ENCODER(uint64_t, le64) +WRITE_INTTYPE_ENCODER(int64_t, le64) +WRITE_INTTYPE_ENCODER(uint32_t, le32) +WRITE_INTTYPE_ENCODER(int32_t, le32) +WRITE_INTTYPE_ENCODER(uint16_t, le16) +WRITE_INTTYPE_ENCODER(int16_t, le16) + +// ----------------------------------- +// float types +// +// NOTE: The following code assumes all supported platforms use IEEE binary32 +// as float and IEEE binary64 as double floating-point format. 
The assumption +// is verified by the assertions below. +// +// Under this assumption, we can use raw encoding of floating-point types +// on little-endian machines, but we still need to perform a byte swap +// on big-endian machines to ensure cross-architecture compatibility. +// To achive that, we reinterpret the values as integers first, which are +// byte-swapped via the ceph_le types as above. The extra conversions +// are optimized away on little-endian machines by the compiler. +#define WRITE_FLTTYPE_ENCODER(type, itype, etype) \ + static_assert(sizeof(type) == sizeof(itype)); \ + static_assert(std::numeric_limits<type>::is_iec559, \ + "floating-point type not using IEEE754 format"); \ + inline void encode(type v, ::ceph::bufferlist& bl, uint64_t features=0) { \ + ceph_##etype e; \ + e = *reinterpret_cast<itype *>(&v); \ + ::ceph::encode_raw(e, bl); \ + } \ + inline void decode(type &v, ::ceph::bufferlist::const_iterator& p) { \ + ceph_##etype e; \ + ::ceph::decode_raw(e, p); \ + *reinterpret_cast<itype *>(&v) = e; \ + } + +WRITE_FLTTYPE_ENCODER(float, uint32_t, le32) +WRITE_FLTTYPE_ENCODER(double, uint64_t, le64) + +// see denc.h for ENCODE_DUMP_PATH discussion and definition. 
+#ifdef ENCODE_DUMP_PATH +# define ENCODE_DUMP_PRE() \ + unsigned pre_off = bl.length() +# define ENCODE_DUMP_POST(cl) \ + do { \ + static int i = 0; \ + i++; \ + int bits = 0; \ + for (unsigned t = i; t; bits++) \ + t &= t - 1; \ + if (bits > 2) \ + break; \ + char fn[PATH_MAX]; \ + snprintf(fn, sizeof(fn), ENCODE_STRINGIFY(ENCODE_DUMP_PATH) "/%s__%d.%x", #cl, getpid(), i++); \ + int fd = ::open(fn, O_WRONLY|O_TRUNC|O_CREAT|O_CLOEXEC, 0644); \ + if (fd >= 0) { \ + ::ceph::bufferlist sub; \ + sub.substr_of(bl, pre_off, bl.length() - pre_off); \ + sub.write_fd(fd); \ + ::close(fd); \ + } \ + } while (0) +#else +# define ENCODE_DUMP_PRE() +# define ENCODE_DUMP_POST(cl) +#endif + + +#define WRITE_CLASS_ENCODER(cl) \ + inline void encode(const cl &c, ::ceph::bufferlist &bl, uint64_t features=0) { \ + ENCODE_DUMP_PRE(); c.encode(bl); ENCODE_DUMP_POST(cl); } \ + inline void decode(cl &c, ::ceph::bufferlist::const_iterator &p) { c.decode(p); } + +#define WRITE_CLASS_MEMBER_ENCODER(cl) \ + inline void encode(const cl &c, ::ceph::bufferlist &bl) const { \ + ENCODE_DUMP_PRE(); c.encode(bl); ENCODE_DUMP_POST(cl); } \ + inline void decode(cl &c, ::ceph::bufferlist::const_iterator &p) { c.decode(p); } + +#define WRITE_CLASS_ENCODER_FEATURES(cl) \ + inline void encode(const cl &c, ::ceph::bufferlist &bl, uint64_t features) { \ + ENCODE_DUMP_PRE(); c.encode(bl, features); ENCODE_DUMP_POST(cl); } \ + inline void decode(cl &c, ::ceph::bufferlist::const_iterator &p) { c.decode(p); } + +#define WRITE_CLASS_ENCODER_OPTIONAL_FEATURES(cl) \ + inline void encode(const cl &c, ::ceph::bufferlist &bl, uint64_t features = 0) { \ + ENCODE_DUMP_PRE(); c.encode(bl, features); ENCODE_DUMP_POST(cl); } \ + inline void decode(cl &c, ::ceph::bufferlist::const_iterator &p) { c.decode(p); } + + +// string +inline void encode(std::string_view s, bufferlist& bl, uint64_t features=0) +{ + __u32 len = s.length(); + encode(len, bl); + if (len) + bl.append(s.data(), len); +} +inline void encode(const 
std::string& s, bufferlist& bl, uint64_t features=0) +{ + return encode(std::string_view(s), bl, features); +} +inline void decode(std::string& s, bufferlist::const_iterator& p) +{ + __u32 len; + decode(len, p); + s.clear(); + p.copy(len, s); +} + +inline void encode_nohead(std::string_view s, bufferlist& bl) +{ + bl.append(s.data(), s.length()); +} +inline void encode_nohead(const std::string& s, bufferlist& bl) +{ + encode_nohead(std::string_view(s), bl); +} +inline void decode_nohead(int len, std::string& s, bufferlist::const_iterator& p) +{ + s.clear(); + p.copy(len, s); +} + +// const char* (encode only, string compatible) +inline void encode(const char *s, bufferlist& bl) +{ + encode(std::string_view(s, strlen(s)), bl); +} + + +// ----------------------------- +// buffers + +// bufferptr (encapsulated) +inline void encode(const buffer::ptr& bp, bufferlist& bl) +{ + __u32 len = bp.length(); + encode(len, bl); + if (len) + bl.append(bp); +} +inline void decode(buffer::ptr& bp, bufferlist::const_iterator& p) +{ + __u32 len; + decode(len, p); + + bufferlist s; + p.copy(len, s); + + if (len) { + if (s.get_num_buffers() == 1) + bp = s.front(); + else + bp = buffer::copy(s.c_str(), s.length()); + } +} + +// bufferlist (encapsulated) +inline void encode(const bufferlist& s, bufferlist& bl) +{ + __u32 len = s.length(); + encode(len, bl); + bl.append(s); +} +inline void encode_destructively(bufferlist& s, bufferlist& bl) +{ + __u32 len = s.length(); + encode(len, bl); + bl.claim_append(s); +} +inline void decode(bufferlist& s, bufferlist::const_iterator& p) +{ + __u32 len; + decode(len, p); + s.clear(); + p.copy(len, s); +} + +inline void encode_nohead(const bufferlist& s, bufferlist& bl) +{ + bl.append(s); +} +inline void decode_nohead(int len, bufferlist& s, bufferlist::const_iterator& p) +{ + s.clear(); + p.copy(len, s); +} + +// Time, since the templates are defined in std::chrono + +template<typename Clock, typename Duration, + typename 
std::enable_if_t<converts_to_timespec_v<Clock>>* = nullptr> +void encode(const std::chrono::time_point<Clock, Duration>& t, + ceph::bufferlist &bl) { + auto ts = Clock::to_timespec(t); + // A 32 bit count of seconds causes me vast unhappiness. + uint32_t s = ts.tv_sec; + uint32_t ns = ts.tv_nsec; + encode(s, bl); + encode(ns, bl); +} + +template<typename Clock, typename Duration, + typename std::enable_if_t<converts_to_timespec_v<Clock>>* = nullptr> +void decode(std::chrono::time_point<Clock, Duration>& t, + bufferlist::const_iterator& p) { + uint32_t s; + uint32_t ns; + decode(s, p); + decode(ns, p); + struct timespec ts = { + static_cast<time_t>(s), + static_cast<long int>(ns)}; + + t = Clock::from_timespec(ts); +} + +template<typename Rep, typename Period, + typename std::enable_if_t<std::is_integral_v<Rep>>* = nullptr> +void encode(const std::chrono::duration<Rep, Period>& d, + ceph::bufferlist &bl) { + using namespace std::chrono; + uint32_t s = duration_cast<seconds>(d).count(); + uint32_t ns = (duration_cast<nanoseconds>(d) % seconds(1)).count(); + encode(s, bl); + encode(ns, bl); +} + +template<typename Rep, typename Period, + typename std::enable_if_t<std::is_integral_v<Rep>>* = nullptr> +void decode(std::chrono::duration<Rep, Period>& d, + bufferlist::const_iterator& p) { + uint32_t s; + uint32_t ns; + decode(s, p); + decode(ns, p); + d = std::chrono::seconds(s) + std::chrono::nanoseconds(ns); +} + +// ----------------------------- +// STL container types + +template<typename T> +inline void encode(const boost::optional<T> &p, bufferlist &bl); +template<typename T> +inline void decode(boost::optional<T> &p, bufferlist::const_iterator &bp); +template<class A, class B, class C> +inline void encode(const boost::tuple<A, B, C> &t, bufferlist& bl); +template<class A, class B, class C> +inline void decode(boost::tuple<A, B, C> &t, bufferlist::const_iterator &bp); +template<class A, class B, + typename a_traits=denc_traits<A>, typename b_traits=denc_traits<B>> 
+inline std::enable_if_t<!a_traits::supported || !b_traits::supported> +encode(const std::pair<A,B> &p, bufferlist &bl, uint64_t features); +template<class A, class B, + typename a_traits=denc_traits<A>, typename b_traits=denc_traits<B>> +inline std::enable_if_t<!a_traits::supported || + !b_traits::supported> +encode(const std::pair<A,B> &p, bufferlist &bl); +template<class A, class B, + typename a_traits=denc_traits<A>, typename b_traits=denc_traits<B>> +inline std::enable_if_t<!a_traits::supported || + !b_traits::supported> +decode(std::pair<A,B> &pa, bufferlist::const_iterator &p); +template<class T, class Alloc, typename traits=denc_traits<T>> +inline std::enable_if_t<!traits::supported> +encode(const std::list<T, Alloc>& ls, bufferlist& bl); +template<class T, class Alloc, typename traits=denc_traits<T>> +inline std::enable_if_t<!traits::supported> +encode(const std::list<T,Alloc>& ls, bufferlist& bl, uint64_t features); +template<class T, class Alloc, typename traits=denc_traits<T>> +inline std::enable_if_t<!traits::supported> +decode(std::list<T,Alloc>& ls, bufferlist::const_iterator& p); +template<class T, class Alloc> +inline void encode(const std::list<std::shared_ptr<T>, Alloc>& ls, + bufferlist& bl); +template<class T, class Alloc> +inline void encode(const std::list<std::shared_ptr<T>, Alloc>& ls, + bufferlist& bl, uint64_t features); +template<class T, class Alloc> +inline void decode(std::list<std::shared_ptr<T>, Alloc>& ls, + bufferlist::const_iterator& p); +template<class T, class Comp, class Alloc, typename traits=denc_traits<T>> +inline std::enable_if_t<!traits::supported> +encode(const std::set<T,Comp,Alloc>& s, bufferlist& bl); +template<class T, class Comp, class Alloc, typename traits=denc_traits<T>> +inline std::enable_if_t<!traits::supported> +decode(std::set<T,Comp,Alloc>& s, bufferlist::const_iterator& p); +template<class T, class Comp, class Alloc, typename traits=denc_traits<T>> +inline std::enable_if_t<!traits::supported> 
+encode_nohead(const std::set<T,Comp,Alloc>& s, bufferlist& bl); +template<class T, class Comp, class Alloc, typename traits=denc_traits<T>> +inline std::enable_if_t<!traits::supported> +decode_nohead(int len, std::set<T,Comp,Alloc>& s, bufferlist::iterator& p); +template<class T, class Comp, class Alloc, typename traits=denc_traits<T>> +inline std::enable_if_t<!traits::supported> +encode(const boost::container::flat_set<T, Comp, Alloc>& s, bufferlist& bl); +template<class T, class Comp, class Alloc, typename traits=denc_traits<T>> +inline std::enable_if_t<!traits::supported> +decode(boost::container::flat_set<T, Comp, Alloc>& s, bufferlist::const_iterator& p); +template<class T, class Comp, class Alloc, typename traits=denc_traits<T>> +inline std::enable_if_t<!traits::supported> +encode_nohead(const boost::container::flat_set<T, Comp, Alloc>& s, + bufferlist& bl); +template<class T, class Comp, class Alloc, typename traits=denc_traits<T>> +inline std::enable_if_t<!traits::supported> +decode_nohead(int len, boost::container::flat_set<T, Comp, Alloc>& s, + bufferlist::iterator& p); +template<class T, class Comp, class Alloc> +inline void encode(const std::multiset<T,Comp,Alloc>& s, bufferlist& bl); +template<class T, class Comp, class Alloc> +inline void decode(std::multiset<T,Comp,Alloc>& s, bufferlist::const_iterator& p); +template<class T, class Alloc, typename traits=denc_traits<T>> +inline std::enable_if_t<!traits::supported> +encode(const std::vector<T,Alloc>& v, bufferlist& bl, uint64_t features); +template<class T, class Alloc, typename traits=denc_traits<T>> +inline std::enable_if_t<!traits::supported> +encode(const std::vector<T,Alloc>& v, bufferlist& bl); +template<class T, class Alloc, typename traits=denc_traits<T>> +inline std::enable_if_t<!traits::supported> +decode(std::vector<T,Alloc>& v, bufferlist::const_iterator& p); +template<class T, class Alloc, typename traits=denc_traits<T>> +inline std::enable_if_t<!traits::supported> +encode_nohead(const 
std::vector<T,Alloc>& v, bufferlist& bl); +template<class T, class Alloc, typename traits=denc_traits<T>> +inline std::enable_if_t<!traits::supported> +decode_nohead(int len, std::vector<T,Alloc>& v, bufferlist::const_iterator& p); +template<class T,class Alloc> +inline void encode(const std::vector<std::shared_ptr<T>,Alloc>& v, + bufferlist& bl, + uint64_t features); +template<class T, class Alloc> +inline void encode(const std::vector<std::shared_ptr<T>,Alloc>& v, + bufferlist& bl); +template<class T, class Alloc> +inline void decode(std::vector<std::shared_ptr<T>,Alloc>& v, + bufferlist::const_iterator& p); +// small_vector +template<class T, std::size_t N, class Alloc, typename traits=denc_traits<T>> +inline std::enable_if_t<!traits::supported> +encode(const boost::container::small_vector<T,N,Alloc>& v, bufferlist& bl, uint64_t features); +template<class T, std::size_t N, class Alloc, typename traits=denc_traits<T>> +inline std::enable_if_t<!traits::supported> +encode(const boost::container::small_vector<T,N,Alloc>& v, bufferlist& bl); +template<class T, std::size_t N, class Alloc, typename traits=denc_traits<T>> +inline std::enable_if_t<!traits::supported> +decode(boost::container::small_vector<T,N,Alloc>& v, bufferlist::const_iterator& p); +template<class T, std::size_t N, class Alloc, typename traits=denc_traits<T>> +inline std::enable_if_t<!traits::supported> +encode_nohead(const boost::container::small_vector<T,N,Alloc>& v, bufferlist& bl); +template<class T, std::size_t N, class Alloc, typename traits=denc_traits<T>> +inline std::enable_if_t<!traits::supported> +decode_nohead(int len, boost::container::small_vector<T,N,Alloc>& v, bufferlist::const_iterator& p); +// std::map +template<class T, class U, class Comp, class Alloc, + typename t_traits=denc_traits<T>, typename u_traits=denc_traits<U>> +inline std::enable_if_t<!t_traits::supported || + !u_traits::supported> +encode(const std::map<T,U,Comp,Alloc>& m, bufferlist& bl); +template<class T, class U, 
class Comp, class Alloc, + typename t_traits=denc_traits<T>, typename u_traits=denc_traits<U>> +inline std::enable_if_t<!t_traits::supported || !u_traits::supported> +encode(const std::map<T,U,Comp,Alloc>& m, bufferlist& bl, uint64_t features); +template<class T, class U, class Comp, class Alloc, + typename t_traits=denc_traits<T>, typename u_traits=denc_traits<U>> +inline std::enable_if_t<!t_traits::supported || !u_traits::supported> +decode(std::map<T,U,Comp,Alloc>& m, bufferlist::const_iterator& p); +template<class T, class U, class Comp, class Alloc> +inline void decode_noclear(std::map<T,U,Comp,Alloc>& m, bufferlist::const_iterator& p); +template<class T, class U, class Comp, class Alloc, + typename t_traits=denc_traits<T>, typename u_traits=denc_traits<U>> +inline std::enable_if_t<!t_traits::supported || !u_traits::supported> +encode_nohead(const std::map<T,U,Comp,Alloc>& m, bufferlist& bl); +template<class T, class U, class Comp, class Alloc, + typename t_traits=denc_traits<T>, typename u_traits=denc_traits<U>> +inline std::enable_if_t<!t_traits::supported || !u_traits::supported> +encode_nohead(const std::map<T,U,Comp,Alloc>& m, bufferlist& bl, uint64_t features); +template<class T, class U, class Comp, class Alloc, + typename t_traits=denc_traits<T>, typename u_traits=denc_traits<U>> +inline std::enable_if_t<!t_traits::supported || !u_traits::supported> +decode_nohead(int n, std::map<T,U,Comp,Alloc>& m, bufferlist::const_iterator& p); +template<class T, class U, class Comp, class Alloc, + typename t_traits=denc_traits<T>, typename u_traits=denc_traits<U>> + inline std::enable_if_t<!t_traits::supported || !u_traits::supported> +encode(const boost::container::flat_map<T,U,Comp,Alloc>& m, bufferlist& bl); +template<class T, class U, class Comp, class Alloc, + typename t_traits=denc_traits<T>, typename u_traits=denc_traits<U>> +inline std::enable_if_t<!t_traits::supported || !u_traits::supported> +encode(const boost::container::flat_map<T,U,Comp,Alloc>& m, 
bufferlist& bl, + uint64_t features); +template<class T, class U, class Comp, class Alloc, + typename t_traits=denc_traits<T>, typename u_traits=denc_traits<U>> +inline std::enable_if_t<!t_traits::supported || !u_traits::supported> +decode(boost::container::flat_map<T,U,Comp,Alloc>& m, bufferlist::const_iterator& p); +template<class T, class U, class Comp, class Alloc> +inline void decode_noclear(boost::container::flat_map<T,U,Comp,Alloc>& m, + bufferlist::const_iterator& p); +template<class T, class U, class Comp, class Alloc, + typename t_traits=denc_traits<T>, typename u_traits=denc_traits<U>> +inline std::enable_if_t<!t_traits::supported || !u_traits::supported> +encode_nohead(const boost::container::flat_map<T,U,Comp,Alloc>& m, + bufferlist& bl); +template<class T, class U, class Comp, class Alloc, + typename t_traits=denc_traits<T>, typename u_traits=denc_traits<U>> +inline std::enable_if_t<!t_traits::supported || !u_traits::supported> +encode_nohead(const boost::container::flat_map<T,U,Comp,Alloc>& m, + bufferlist& bl, uint64_t features); +template<class T, class U, class Comp, class Alloc, + typename t_traits=denc_traits<T>, typename u_traits=denc_traits<U>> +inline std::enable_if_t<!t_traits::supported || !u_traits::supported> +decode_nohead(int n, boost::container::flat_map<T,U,Comp,Alloc>& m, + bufferlist::const_iterator& p); +template<class T, class U, class Comp, class Alloc> +inline void encode(const std::multimap<T,U,Comp,Alloc>& m, bufferlist& bl); +template<class T, class U, class Comp, class Alloc> +inline void decode(std::multimap<T,U,Comp,Alloc>& m, bufferlist::const_iterator& p); +template<class T, class U, class Hash, class Pred, class Alloc> +inline void encode(const unordered_map<T,U,Hash,Pred,Alloc>& m, bufferlist& bl, + uint64_t features); +template<class T, class U, class Hash, class Pred, class Alloc> +inline void encode(const unordered_map<T,U,Hash,Pred,Alloc>& m, bufferlist& bl); +template<class T, class U, class Hash, class Pred, 
class Alloc> +inline void decode(unordered_map<T,U,Hash,Pred,Alloc>& m, bufferlist::const_iterator& p); +template<class T, class Hash, class Pred, class Alloc> +inline void encode(const ceph::unordered_set<T,Hash,Pred,Alloc>& m, bufferlist& bl); +template<class T, class Hash, class Pred, class Alloc> +inline void decode(ceph::unordered_set<T,Hash,Pred,Alloc>& m, bufferlist::const_iterator& p); +template<class T, class Alloc> +inline void encode(const std::deque<T,Alloc>& ls, bufferlist& bl, uint64_t features); +template<class T, class Alloc> +inline void encode(const std::deque<T,Alloc>& ls, bufferlist& bl); +template<class T, class Alloc> +inline void decode(std::deque<T,Alloc>& ls, bufferlist::const_iterator& p); +template<class T, size_t N, typename traits = denc_traits<T>> +inline std::enable_if_t<!traits::supported> +encode(const std::array<T, N>& v, bufferlist& bl, uint64_t features); +template<class T, size_t N, typename traits = denc_traits<T>> +inline std::enable_if_t<!traits::supported> +encode(const std::array<T, N>& v, bufferlist& bl); +template<class T, size_t N, typename traits = denc_traits<T>> +inline std::enable_if_t<!traits::supported> +decode(std::array<T, N>& v, bufferlist::const_iterator& p); + +// full bl decoder +template<class T> +inline void decode(T &o, const bufferlist& bl) +{ + auto p = bl.begin(); + decode(o, p); + ceph_assert(p.end()); +} + +// boost optional +template<typename T> +inline void encode(const boost::optional<T> &p, bufferlist &bl) +{ + __u8 present = static_cast<bool>(p); + encode(present, bl); + if (p) + encode(p.get(), bl); +} + +#pragma GCC diagnostic ignored "-Wpragmas" +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wuninitialized" +template<typename T> +inline void decode(boost::optional<T> &p, bufferlist::const_iterator &bp) +{ + __u8 present; + decode(present, bp); + if (present) { + p = T{}; + decode(p.get(), bp); + } else { + p = boost::none; + } +} +#pragma GCC diagnostic pop +#pragma GCC 
diagnostic warning "-Wpragmas" + +// std::tuple +template<typename... Ts> +inline void encode(const std::tuple<Ts...> &t, bufferlist& bl) +{ + ceph::for_each(t, [&bl](const auto& e) { + encode(e, bl); + }); +} +template<typename... Ts> +inline void decode(std::tuple<Ts...> &t, bufferlist::const_iterator &bp) +{ + ceph::for_each(t, [&bp](auto& e) { + decode(e, bp); + }); +} + +//triple boost::tuple +template<class A, class B, class C> +inline void encode(const boost::tuple<A, B, C> &t, bufferlist& bl) +{ + encode(boost::get<0>(t), bl); + encode(boost::get<1>(t), bl); + encode(boost::get<2>(t), bl); +} +template<class A, class B, class C> +inline void decode(boost::tuple<A, B, C> &t, bufferlist::const_iterator &bp) +{ + decode(boost::get<0>(t), bp); + decode(boost::get<1>(t), bp); + decode(boost::get<2>(t), bp); +} + +// std::pair<A,B> +template<class A, class B, + typename a_traits, typename b_traits> +inline std::enable_if_t<!a_traits::supported || !b_traits::supported> + encode(const std::pair<A,B> &p, bufferlist &bl, uint64_t features) +{ + encode(p.first, bl, features); + encode(p.second, bl, features); +} +template<class A, class B, + typename a_traits, typename b_traits> +inline std::enable_if_t<!a_traits::supported || + !b_traits::supported> + encode(const std::pair<A,B> &p, bufferlist &bl) +{ + encode(p.first, bl); + encode(p.second, bl); +} +template<class A, class B, typename a_traits, typename b_traits> +inline std::enable_if_t<!a_traits::supported || + !b_traits::supported> + decode(std::pair<A,B> &pa, bufferlist::const_iterator &p) +{ + decode(pa.first, p); + decode(pa.second, p); +} + +// std::list<T> +template<class T, class Alloc, typename traits> +inline std::enable_if_t<!traits::supported> + encode(const std::list<T, Alloc>& ls, bufferlist& bl) +{ + __u32 n = (__u32)(ls.size()); // c++11 std::list::size() is O(1) + encode(n, bl); + for (auto p = ls.begin(); p != ls.end(); ++p) + encode(*p, bl); +} +template<class T, class Alloc, typename traits> 
+inline std::enable_if_t<!traits::supported> + encode(const std::list<T,Alloc>& ls, bufferlist& bl, uint64_t features) +{ + // should i pre- or post- count? + if (!ls.empty()) { + unsigned pos = bl.length(); + unsigned n = 0; + encode(n, bl); + for (auto p = ls.begin(); p != ls.end(); ++p) { + n++; + encode(*p, bl, features); + } + ceph_le32 en; + en = n; + bl.copy_in(pos, sizeof(en), (char*)&en); + } else { + __u32 n = (__u32)(ls.size()); // FIXME: this is slow on a list. + encode(n, bl); + for (auto p = ls.begin(); p != ls.end(); ++p) + encode(*p, bl, features); + } +} +template<class T, class Alloc, typename traits> +inline std::enable_if_t<!traits::supported> + decode(std::list<T,Alloc>& ls, bufferlist::const_iterator& p) +{ + __u32 n; + decode(n, p); + ls.clear(); + while (n--) { + ls.emplace_back(); + decode(ls.back(), p); + } +} + +// std::list<std::shared_ptr<T>> +template<class T, class Alloc> +inline void encode(const std::list<std::shared_ptr<T>, Alloc>& ls, + bufferlist& bl) +{ + __u32 n = (__u32)(ls.size()); // c++11 std::list::size() is O(1) + encode(n, bl); + for (const auto& ref : ls) { + encode(*ref, bl); + } +} +template<class T, class Alloc> +inline void encode(const std::list<std::shared_ptr<T>, Alloc>& ls, + bufferlist& bl, uint64_t features) +{ + __u32 n = (__u32)(ls.size()); // c++11 std::list::size() is O(1) + encode(n, bl); + for (const auto& ref : ls) { + encode(*ref, bl, features); + } +} +template<class T, class Alloc> +inline void decode(std::list<std::shared_ptr<T>, Alloc>& ls, + bufferlist::const_iterator& p) +{ + __u32 n; + decode(n, p); + ls.clear(); + while (n--) { + auto ref = std::make_shared<T>(); + decode(*ref, p); + ls.emplace_back(std::move(ref)); + } +} + +// std::set<T> +template<class T, class Comp, class Alloc, typename traits> +inline std::enable_if_t<!traits::supported> + encode(const std::set<T,Comp,Alloc>& s, bufferlist& bl) +{ + __u32 n = (__u32)(s.size()); + encode(n, bl); + for (auto p = s.begin(); p != s.end(); 
++p) + encode(*p, bl); +} +template<class T, class Comp, class Alloc, typename traits> +inline std::enable_if_t<!traits::supported> + decode(std::set<T,Comp,Alloc>& s, bufferlist::const_iterator& p) +{ + __u32 n; + decode(n, p); + s.clear(); + while (n--) { + T v; + decode(v, p); + s.insert(v); + } +} + +template<class T, class Comp, class Alloc, typename traits> +inline typename std::enable_if<!traits::supported>::type + encode_nohead(const std::set<T,Comp,Alloc>& s, bufferlist& bl) +{ + for (auto p = s.begin(); p != s.end(); ++p) + encode(*p, bl); +} +template<class T, class Comp, class Alloc, typename traits> +inline std::enable_if_t<!traits::supported> + decode_nohead(int len, std::set<T,Comp,Alloc>& s, bufferlist::const_iterator& p) +{ + for (int i=0; i<len; i++) { + T v; + decode(v, p); + s.insert(v); + } +} + +// boost::container::flat_set<T> +template<class T, class Comp, class Alloc, typename traits> +inline std::enable_if_t<!traits::supported> +encode(const boost::container::flat_set<T, Comp, Alloc>& s, bufferlist& bl) +{ + __u32 n = (__u32)(s.size()); + encode(n, bl); + for (const auto& e : s) + encode(e, bl); +} +template<class T, class Comp, class Alloc, typename traits> +inline std::enable_if_t<!traits::supported> +decode(boost::container::flat_set<T, Comp, Alloc>& s, bufferlist::const_iterator& p) +{ + __u32 n; + decode(n, p); + s.clear(); + s.reserve(n); + while (n--) { + T v; + decode(v, p); + s.insert(v); + } +} + +template<class T, class Comp, class Alloc, typename traits> +inline std::enable_if_t<!traits::supported> +encode_nohead(const boost::container::flat_set<T, Comp, Alloc>& s, + bufferlist& bl) +{ + for (const auto& e : s) + encode(e, bl); +} +template<class T, class Comp, class Alloc, typename traits> +inline std::enable_if_t<!traits::supported> +decode_nohead(int len, boost::container::flat_set<T, Comp, Alloc>& s, + bufferlist::iterator& p) +{ + s.reserve(len); + for (int i=0; i<len; i++) { + T v; + decode(v, p); + s.insert(v); + } +} + 
+// multiset +template<class T, class Comp, class Alloc> +inline void encode(const std::multiset<T,Comp,Alloc>& s, bufferlist& bl) +{ + __u32 n = (__u32)(s.size()); + encode(n, bl); + for (auto p = s.begin(); p != s.end(); ++p) + encode(*p, bl); +} +template<class T, class Comp, class Alloc> +inline void decode(std::multiset<T,Comp,Alloc>& s, bufferlist::const_iterator& p) +{ + __u32 n; + decode(n, p); + s.clear(); + while (n--) { + T v; + decode(v, p); + s.insert(v); + } +} + +template<class T, class Alloc, typename traits> +inline std::enable_if_t<!traits::supported> + encode(const std::vector<T,Alloc>& v, bufferlist& bl, uint64_t features) +{ + __u32 n = (__u32)(v.size()); + encode(n, bl); + for (auto p = v.begin(); p != v.end(); ++p) + encode(*p, bl, features); +} +template<class T, class Alloc, typename traits> +inline std::enable_if_t<!traits::supported> + encode(const std::vector<T,Alloc>& v, bufferlist& bl) +{ + __u32 n = (__u32)(v.size()); + encode(n, bl); + for (auto p = v.begin(); p != v.end(); ++p) + encode(*p, bl); +} +template<class T, class Alloc, typename traits> +inline std::enable_if_t<!traits::supported> + decode(std::vector<T,Alloc>& v, bufferlist::const_iterator& p) +{ + __u32 n; + decode(n, p); + v.resize(n); + for (__u32 i=0; i<n; i++) + decode(v[i], p); +} + +template<class T, class Alloc, typename traits> +inline std::enable_if_t<!traits::supported> + encode_nohead(const std::vector<T,Alloc>& v, bufferlist& bl) +{ + for (auto p = v.begin(); p != v.end(); ++p) + encode(*p, bl); +} +template<class T, class Alloc, typename traits> +inline std::enable_if_t<!traits::supported> + decode_nohead(int len, std::vector<T,Alloc>& v, bufferlist::const_iterator& p) +{ + v.resize(len); + for (__u32 i=0; i<v.size(); i++) + decode(v[i], p); +} + +// small vector +template<class T, std::size_t N, class Alloc, typename traits> +inline std::enable_if_t<!traits::supported> + encode(const boost::container::small_vector<T,N,Alloc>& v, bufferlist& bl, uint64_t 
features) +{ + __u32 n = (__u32)(v.size()); + encode(n, bl); + for (const auto& i : v) + encode(i, bl, features); +} +template<class T, std::size_t N, class Alloc, typename traits> +inline std::enable_if_t<!traits::supported> + encode(const boost::container::small_vector<T,N,Alloc>& v, bufferlist& bl) +{ + __u32 n = (__u32)(v.size()); + encode(n, bl); + for (const auto& i : v) + encode(i, bl); +} +template<class T, std::size_t N, class Alloc, typename traits> +inline std::enable_if_t<!traits::supported> + decode(boost::container::small_vector<T,N,Alloc>& v, bufferlist::const_iterator& p) +{ + __u32 n; + decode(n, p); + v.resize(n); + for (auto& i : v) + decode(i, p); +} + +template<class T, std::size_t N, class Alloc, typename traits> +inline std::enable_if_t<!traits::supported> + encode_nohead(const boost::container::small_vector<T,N,Alloc>& v, bufferlist& bl) +{ + for (const auto& i : v) + encode(i, bl); +} +template<class T, std::size_t N, class Alloc, typename traits> +inline std::enable_if_t<!traits::supported> + decode_nohead(int len, boost::container::small_vector<T,N,Alloc>& v, bufferlist::const_iterator& p) +{ + v.resize(len); + for (auto& i : v) + decode(i, p); +} + + +// vector (shared_ptr) +template<class T,class Alloc> +inline void encode(const std::vector<std::shared_ptr<T>,Alloc>& v, + bufferlist& bl, + uint64_t features) +{ + __u32 n = (__u32)(v.size()); + encode(n, bl); + for (const auto& ref : v) { + if (ref) + encode(*ref, bl, features); + else + encode(T(), bl, features); + } +} +template<class T, class Alloc> +inline void encode(const std::vector<std::shared_ptr<T>,Alloc>& v, + bufferlist& bl) +{ + __u32 n = (__u32)(v.size()); + encode(n, bl); + for (const auto& ref : v) { + if (ref) + encode(*ref, bl); + else + encode(T(), bl); + } +} +template<class T, class Alloc> +inline void decode(std::vector<std::shared_ptr<T>,Alloc>& v, + bufferlist::const_iterator& p) +{ + __u32 n; + decode(n, p); + v.clear(); + v.reserve(n); + while (n--) { + auto ref 
= std::make_shared<T>(); + decode(*ref, p); + v.emplace_back(std::move(ref)); + } +} + +// map +template<class T, class U, class Comp, class Alloc, + typename t_traits, typename u_traits> +inline std::enable_if_t<!t_traits::supported || + !u_traits::supported> + encode(const std::map<T,U,Comp,Alloc>& m, bufferlist& bl) +{ + __u32 n = (__u32)(m.size()); + encode(n, bl); + for (auto p = m.begin(); p != m.end(); ++p) { + encode(p->first, bl); + encode(p->second, bl); + } +} +template<class T, class U, class Comp, class Alloc, + typename t_traits, typename u_traits> +inline std::enable_if_t<!t_traits::supported || !u_traits::supported> + encode(const std::map<T,U,Comp,Alloc>& m, bufferlist& bl, uint64_t features) +{ + __u32 n = (__u32)(m.size()); + encode(n, bl); + for (auto p = m.begin(); p != m.end(); ++p) { + encode(p->first, bl, features); + encode(p->second, bl, features); + } +} +template<class T, class U, class Comp, class Alloc, + typename t_traits, typename u_traits> +inline std::enable_if_t<!t_traits::supported || !u_traits::supported> + decode(std::map<T,U,Comp,Alloc>& m, bufferlist::const_iterator& p) +{ + __u32 n; + decode(n, p); + m.clear(); + while (n--) { + T k; + decode(k, p); + decode(m[k], p); + } +} +template<class T, class U, class Comp, class Alloc> +inline void decode_noclear(std::map<T,U,Comp,Alloc>& m, bufferlist::const_iterator& p) +{ + __u32 n; + decode(n, p); + while (n--) { + T k; + decode(k, p); + decode(m[k], p); + } +} +template<class T, class U, class Comp, class Alloc, + typename t_traits, typename u_traits> +inline std::enable_if_t<!t_traits::supported || !u_traits::supported> + encode_nohead(const std::map<T,U,Comp,Alloc>& m, bufferlist& bl) +{ + for (auto p = m.begin(); p != m.end(); ++p) { + encode(p->first, bl); + encode(p->second, bl); + } +} +template<class T, class U, class Comp, class Alloc, + typename t_traits, typename u_traits> +inline std::enable_if_t<!t_traits::supported || !u_traits::supported> + encode_nohead(const 
std::map<T,U,Comp,Alloc>& m, bufferlist& bl, uint64_t features) +{ + for (auto p = m.begin(); p != m.end(); ++p) { + encode(p->first, bl, features); + encode(p->second, bl, features); + } +} +template<class T, class U, class Comp, class Alloc, + typename t_traits, typename u_traits> +inline std::enable_if_t<!t_traits::supported || !u_traits::supported> + decode_nohead(int n, std::map<T,U,Comp,Alloc>& m, bufferlist::const_iterator& p) +{ + m.clear(); + while (n--) { + T k; + decode(k, p); + decode(m[k], p); + } +} + +// boost::container::flat-map +template<class T, class U, class Comp, class Alloc, + typename t_traits, typename u_traits> + inline std::enable_if_t<!t_traits::supported || !u_traits::supported> + encode(const boost::container::flat_map<T,U,Comp,Alloc>& m, bufferlist& bl) +{ + __u32 n = (__u32)(m.size()); + encode(n, bl); + for (typename boost::container::flat_map<T,U,Comp>::const_iterator p + = m.begin(); p != m.end(); ++p) { + encode(p->first, bl); + encode(p->second, bl); + } +} +template<class T, class U, class Comp, class Alloc, + typename t_traits, typename u_traits> + inline std::enable_if_t<!t_traits::supported || !u_traits::supported> + encode(const boost::container::flat_map<T,U,Comp,Alloc>& m, bufferlist& bl, + uint64_t features) +{ + __u32 n = (__u32)(m.size()); + encode(n, bl); + for (auto p = m.begin(); p != m.end(); ++p) { + encode(p->first, bl, features); + encode(p->second, bl, features); + } +} +template<class T, class U, class Comp, class Alloc, + typename t_traits, typename u_traits> + inline std::enable_if_t<!t_traits::supported || !u_traits::supported> + decode(boost::container::flat_map<T,U,Comp,Alloc>& m, bufferlist::const_iterator& p) +{ + __u32 n; + decode(n, p); + m.clear(); + m.reserve(n); + while (n--) { + T k; + decode(k, p); + decode(m[k], p); + } +} +template<class T, class U, class Comp, class Alloc> +inline void decode_noclear(boost::container::flat_map<T,U,Comp,Alloc>& m, + bufferlist::const_iterator& p) +{ + __u32 n; 
+ decode(n, p); + m.reserve(m.size() + n); + while (n--) { + T k; + decode(k, p); + decode(m[k], p); + } +} +template<class T, class U, class Comp, class Alloc, + typename t_traits, typename u_traits> + inline std::enable_if_t<!t_traits::supported || !u_traits::supported> + encode_nohead(const boost::container::flat_map<T,U,Comp,Alloc>& m, + bufferlist& bl) +{ + for (auto p = m.begin(); p != m.end(); ++p) { + encode(p->first, bl); + encode(p->second, bl); + } +} +template<class T, class U, class Comp, class Alloc, + typename t_traits, typename u_traits> + inline std::enable_if_t<!t_traits::supported || !u_traits::supported> + encode_nohead(const boost::container::flat_map<T,U,Comp,Alloc>& m, + bufferlist& bl, uint64_t features) +{ + for (auto p = m.begin(); p != m.end(); ++p) { + encode(p->first, bl, features); + encode(p->second, bl, features); + } +} +template<class T, class U, class Comp, class Alloc, + typename t_traits, typename u_traits> +inline std::enable_if_t<!t_traits::supported || !u_traits::supported> + decode_nohead(int n, boost::container::flat_map<T,U,Comp,Alloc>& m, + bufferlist::const_iterator& p) +{ + m.clear(); + while (n--) { + T k; + decode(k, p); + decode(m[k], p); + } +} + +// multimap +template<class T, class U, class Comp, class Alloc> +inline void encode(const std::multimap<T,U,Comp,Alloc>& m, bufferlist& bl) +{ + __u32 n = (__u32)(m.size()); + encode(n, bl); + for (auto p = m.begin(); p != m.end(); ++p) { + encode(p->first, bl); + encode(p->second, bl); + } +} +template<class T, class U, class Comp, class Alloc> +inline void decode(std::multimap<T,U,Comp,Alloc>& m, bufferlist::const_iterator& p) +{ + __u32 n; + decode(n, p); + m.clear(); + while (n--) { + typename std::pair<T,U> tu = std::pair<T,U>(); + decode(tu.first, p); + typename std::multimap<T,U,Comp,Alloc>::iterator it = m.insert(tu); + decode(it->second, p); + } +} + +// ceph::unordered_map +template<class T, class U, class Hash, class Pred, class Alloc> +inline void encode(const 
unordered_map<T,U,Hash,Pred,Alloc>& m, bufferlist& bl, + uint64_t features) +{ + __u32 n = (__u32)(m.size()); + encode(n, bl); + for (auto p = m.begin(); p != m.end(); ++p) { + encode(p->first, bl, features); + encode(p->second, bl, features); + } +} +template<class T, class U, class Hash, class Pred, class Alloc> +inline void encode(const unordered_map<T,U,Hash,Pred,Alloc>& m, bufferlist& bl) +{ + __u32 n = (__u32)(m.size()); + encode(n, bl); + for (auto p = m.begin(); p != m.end(); ++p) { + encode(p->first, bl); + encode(p->second, bl); + } +} +template<class T, class U, class Hash, class Pred, class Alloc> +inline void decode(unordered_map<T,U,Hash,Pred,Alloc>& m, bufferlist::const_iterator& p) +{ + __u32 n; + decode(n, p); + m.clear(); + while (n--) { + T k; + decode(k, p); + decode(m[k], p); + } +} + +// ceph::unordered_set +template<class T, class Hash, class Pred, class Alloc> +inline void encode(const ceph::unordered_set<T,Hash,Pred,Alloc>& m, bufferlist& bl) +{ + __u32 n = (__u32)(m.size()); + encode(n, bl); + for (auto p = m.begin(); p != m.end(); ++p) + encode(*p, bl); +} +template<class T, class Hash, class Pred, class Alloc> +inline void decode(ceph::unordered_set<T,Hash,Pred,Alloc>& m, bufferlist::const_iterator& p) +{ + __u32 n; + decode(n, p); + m.clear(); + while (n--) { + T k; + decode(k, p); + m.insert(k); + } +} + +// deque +template<class T, class Alloc> +inline void encode(const std::deque<T,Alloc>& ls, bufferlist& bl, uint64_t features) +{ + __u32 n = ls.size(); + encode(n, bl); + for (auto p = ls.begin(); p != ls.end(); ++p) + encode(*p, bl, features); +} +template<class T, class Alloc> +inline void encode(const std::deque<T,Alloc>& ls, bufferlist& bl) +{ + __u32 n = ls.size(); + encode(n, bl); + for (auto p = ls.begin(); p != ls.end(); ++p) + encode(*p, bl); +} +template<class T, class Alloc> +inline void decode(std::deque<T,Alloc>& ls, bufferlist::const_iterator& p) +{ + __u32 n; + decode(n, p); + ls.clear(); + while (n--) { + 
ls.emplace_back(); + decode(ls.back(), p); + } +} + +// std::array<T, N> +template<class T, size_t N, typename traits> +inline std::enable_if_t<!traits::supported> +encode(const std::array<T, N>& v, bufferlist& bl, uint64_t features) +{ + for (const auto& e : v) + encode(e, bl, features); +} +template<class T, size_t N, typename traits> +inline std::enable_if_t<!traits::supported> +encode(const std::array<T, N>& v, bufferlist& bl) +{ + for (const auto& e : v) + encode(e, bl); +} +template<class T, size_t N, typename traits> +inline std::enable_if_t<!traits::supported> +decode(std::array<T, N>& v, bufferlist::const_iterator& p) +{ + for (auto& e : v) + decode(e, p); +} +} + +/* + * guards + */ + +/** + * start encoding block + * + * @param v current (code) version of the encoding + * @param compat oldest code version that can decode it + * @param bl bufferlist to encode to + * + */ +#define ENCODE_START(v, compat, bl) \ + __u8 struct_v = v; \ + __u8 struct_compat = compat; \ + ceph_le32 struct_len; \ + auto filler = (bl).append_hole(sizeof(struct_v) + \ + sizeof(struct_compat) + sizeof(struct_len)); \ + const auto starting_bl_len = (bl).length(); \ + using ::ceph::encode; \ + do { + +/** + * finish encoding block + * + * @param bl bufferlist we were encoding to + * @param new_struct_compat struct-compat value to use + */ +#define ENCODE_FINISH_NEW_COMPAT(bl, new_struct_compat) \ + } while (false); \ + if (new_struct_compat) { \ + struct_compat = new_struct_compat; \ + } \ + struct_len = (bl).length() - starting_bl_len; \ + filler.copy_in(sizeof(struct_v), (char *)&struct_v); \ + filler.copy_in(sizeof(struct_compat), \ + (char *)&struct_compat); \ + filler.copy_in(sizeof(struct_len), (char *)&struct_len); + +#define ENCODE_FINISH(bl) ENCODE_FINISH_NEW_COMPAT(bl, 0) + +#define DECODE_ERR_OLDVERSION(func, v, compatv) \ + (std::string(func) + " no longer understand old encoding version " #v " < " + std::to_string(compatv)) + +#define DECODE_ERR_PAST(func) \ + 
(std::string(func) + " decode past end of struct encoding") + +/** + * check for very old encoding + * + * If the encoded data is older than oldestv, raise an exception. + * + * @param oldestv oldest version of the code we can successfully decode. + */ +#define DECODE_OLDEST(oldestv) \ + if (struct_v < oldestv) \ + throw ::ceph::buffer::malformed_input(DECODE_ERR_OLDVERSION(__PRETTY_FUNCTION__, v, oldestv)); + +/** + * start a decoding block + * + * @param v current version of the encoding that the code supports/encodes + * @param bl bufferlist::iterator for the encoded data + */ +#define DECODE_START(v, bl) \ + __u8 struct_v, struct_compat; \ + using ::ceph::decode; \ + decode(struct_v, bl); \ + decode(struct_compat, bl); \ + if (v < struct_compat) \ + throw buffer::malformed_input(DECODE_ERR_OLDVERSION(__PRETTY_FUNCTION__, v, struct_compat)); \ + __u32 struct_len; \ + decode(struct_len, bl); \ + if (struct_len > bl.get_remaining()) \ + throw ::ceph::buffer::malformed_input(DECODE_ERR_PAST(__PRETTY_FUNCTION__)); \ + unsigned struct_end = bl.get_off() + struct_len; \ + do { + +/* BEWARE: any change to this macro MUST be also reflected in the duplicative + * DECODE_START_LEGACY_COMPAT_LEN! 
*/ +#define __DECODE_START_LEGACY_COMPAT_LEN(v, compatv, lenv, skip_v, bl) \ + using ::ceph::decode; \ + __u8 struct_v; \ + decode(struct_v, bl); \ + if (struct_v >= compatv) { \ + __u8 struct_compat; \ + decode(struct_compat, bl); \ + if (v < struct_compat) \ + throw buffer::malformed_input(DECODE_ERR_OLDVERSION(__PRETTY_FUNCTION__, v, struct_compat)); \ + } else if (skip_v) { \ + if (bl.get_remaining() < skip_v) \ + throw buffer::malformed_input(DECODE_ERR_PAST(__PRETTY_FUNCTION__)); \ + bl.advance(skip_v); \ + } \ + unsigned struct_end = 0; \ + if (struct_v >= lenv) { \ + __u32 struct_len; \ + decode(struct_len, bl); \ + if (struct_len > bl.get_remaining()) \ + throw buffer::malformed_input(DECODE_ERR_PAST(__PRETTY_FUNCTION__)); \ + struct_end = bl.get_off() + struct_len; \ + } \ + do { + +/** + * start a decoding block with legacy support for older encoding schemes + * + * The old encoding schemes has a __u8 struct_v only, or lacked either + * the compat version or length. Skip those fields conditionally. + * + * Most of the time, v, compatv, and lenv will all match the version + * where the structure was switched over to the new macros. + * + * @param v current version of the encoding that the code supports/encodes + * @param compatv oldest version that includes a __u8 compat version field + * @param lenv oldest version that includes a __u32 length wrapper + * @param bl bufferlist::iterator containing the encoded data + */ + +/* BEWARE: this is duplication of __DECODE_START_LEGACY_COMPAT_LEN which + * MUST be changed altogether. For the rationale behind code duplication, + * please `git blame` and refer to the commit message. 
*/ +#define DECODE_START_LEGACY_COMPAT_LEN(v, compatv, lenv, bl) \ + using ::ceph::decode; \ + __u8 struct_v; \ + decode(struct_v, bl); \ + if (struct_v >= compatv) { \ + __u8 struct_compat; \ + decode(struct_compat, bl); \ + if (v < struct_compat) \ + throw buffer::malformed_input(DECODE_ERR_OLDVERSION( \ + __PRETTY_FUNCTION__, v, struct_compat)); \ + } \ + unsigned struct_end = 0; \ + if (struct_v >= lenv) { \ + __u32 struct_len; \ + decode(struct_len, bl); \ + if (struct_len > bl.get_remaining()) \ + throw buffer::malformed_input(DECODE_ERR_PAST(__PRETTY_FUNCTION__)); \ + struct_end = bl.get_off() + struct_len; \ + } \ + do { + +/** + * start a decoding block with legacy support for older encoding schemes + * + * This version of the macro assumes the legacy encoding had a 32 bit + * version + * + * The old encoding schemes has a __u8 struct_v only, or lacked either + * the compat version or length. Skip those fields conditionally. + * + * Most of the time, v, compatv, and lenv will all match the version + * where the structure was switched over to the new macros. 
+ * + * @param v current version of the encoding that the code supports/encodes + * @param compatv oldest version that includes a __u8 compat version field + * @param lenv oldest version that includes a __u32 length wrapper + * @param bl bufferlist::iterator containing the encoded data + */ +#define DECODE_START_LEGACY_COMPAT_LEN_32(v, compatv, lenv, bl) \ + __DECODE_START_LEGACY_COMPAT_LEN(v, compatv, lenv, 3u, bl) + +#define DECODE_START_LEGACY_COMPAT_LEN_16(v, compatv, lenv, bl) \ + __DECODE_START_LEGACY_COMPAT_LEN(v, compatv, lenv, 1u, bl) + +/** + * finish decode block + * + * @param bl bufferlist::iterator we were decoding from + */ +#define DECODE_FINISH(bl) \ + } while (false); \ + if (struct_end) { \ + if (bl.get_off() > struct_end) \ + throw buffer::malformed_input(DECODE_ERR_PAST(__PRETTY_FUNCTION__)); \ + if (bl.get_off() < struct_end) \ + bl.advance(struct_end - bl.get_off()); \ + } + +namespace ceph { + +/* + * Encoders/decoders to read from current offset in a file handle and + * encode/decode the data according to argument types. 
+ */ +inline ssize_t decode_file(int fd, std::string &str) +{ + bufferlist bl; + __u32 len = 0; + bl.read_fd(fd, sizeof(len)); + decode(len, bl); + bl.read_fd(fd, len); + decode(str, bl); + return bl.length(); +} + +inline ssize_t decode_file(int fd, bufferptr &bp) +{ + bufferlist bl; + __u32 len = 0; + bl.read_fd(fd, sizeof(len)); + decode(len, bl); + bl.read_fd(fd, len); + auto bli = std::cbegin(bl); + + decode(bp, bli); + return bl.length(); +} +} + +#endif diff --git a/src/include/err.h b/src/include/err.h new file mode 100644 index 00000000..ba4b32ae --- /dev/null +++ b/src/include/err.h @@ -0,0 +1,29 @@ +#ifndef CEPH_ERR_H +#define CEPH_ERR_H + +/* + * adapted from linux 2.6.24 include/linux/err.h + */ +#define MAX_ERRNO 4095 +#define IS_ERR_VALUE(x) ((x) >= (unsigned long)-MAX_ERRNO) + +#include <errno.h> + +/* this generates a warning in c++; caller can do the cast manually +static inline void *ERR_PTR(long error) +{ + return (void *) error; +} +*/ + +static inline long PTR_ERR(const void *ptr) +{ + return (long) ptr; +} + +static inline long IS_ERR(const void *ptr) +{ + return IS_ERR_VALUE((unsigned long)ptr); +} + +#endif diff --git a/src/include/error.h b/src/include/error.h new file mode 100644 index 00000000..a548d975 --- /dev/null +++ b/src/include/error.h @@ -0,0 +1,41 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net> + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. 
+ * + */ + + +#include <stdarg.h> + +#ifdef __cplusplus +extern "C" { +#endif + +#define SYSERROR() syserror("At %s:%d", __FILE__, __LINE__) + +#define ASSERT(c) \ + ((c) || (exiterror("Assertion failed at %s:%d", __FILE__, __LINE__), 1)) + +/* print usage error message and exit */ +extern void userror(const char *use, const char *fmt, ...); + +/* print system error message and exit */ +extern void syserror(const char *fmt, ...); + +/* print error message and exit */ +extern void exiterror(const char *fmt, ...); + +/* print error message */ +extern void error(const char *fmt, ...); + +#ifdef __cplusplus +} // extern "C" +#endif diff --git a/src/include/event_type.h b/src/include/event_type.h new file mode 100644 index 00000000..aa6ddedb --- /dev/null +++ b/src/include/event_type.h @@ -0,0 +1,24 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2015 XSky <haomai@xsky.com> + * + * Author: Haomai Wang <haomaiwang@gmail.com> + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. 
+ * + */ + +#ifndef CEPH_COMMON_EVENT_TYPE_H +#define CEPH_COMMON_EVENT_TYPE_H + +#define EVENT_SOCKET_TYPE_NONE 0 +#define EVENT_SOCKET_TYPE_PIPE 1 +#define EVENT_SOCKET_TYPE_EVENTFD 2 + +#endif diff --git a/src/include/filepath.h b/src/include/filepath.h new file mode 100644 index 00000000..832016ac --- /dev/null +++ b/src/include/filepath.h @@ -0,0 +1,247 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net> + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. + * + */ + + +#ifndef CEPH_FILEPATH_H +#define CEPH_FILEPATH_H + +/* + * BUG: /a/b/c is equivalent to a/b/c in dentry-breakdown, but not string. + * -> should it be different? how? should this[0] be "", with depth 4? + * + */ + + +#include <iosfwd> +#include <string> +#include <string_view> +#include <vector> + +#include "buffer.h" +#include "encoding.h" +#include "include/types.h" +#include "include/fs_types.h" + +#include "common/Formatter.h" + + +class filepath { + inodeno_t ino; // base inode. ino=0 implies pure relative path. + string path; // relative path. + + /** bits - path segments + * this is ['a', 'b', 'c'] for both the aboslute and relative case. + * + * NOTE: this value is LAZILY maintained... i.e. 
it's a cache + */ + mutable vector<string> bits; + bool encoded; + + void rebuild_path() { + path.clear(); + for (unsigned i=0; i<bits.size(); i++) { + if (i) path += "/"; + path += bits[i]; + } + } + void parse_bits() const { + bits.clear(); + int off = 0; + while (off < (int)path.length()) { + int nextslash = path.find('/', off); + if (nextslash < 0) + nextslash = path.length(); // no more slashes + if (((nextslash - off) > 0) || encoded) { + // skip empty components unless they were introduced deliberately + // see commit message for more detail + bits.push_back( path.substr(off,nextslash-off) ); + } + off = nextslash+1; + } + } + + public: + filepath() : ino(0), encoded(false) { } + filepath(std::string_view s, inodeno_t i) : ino(i), path(s), encoded(false) { } + filepath(const string& s, inodeno_t i) : ino(i), path(s), encoded(false) { } + filepath(const char* s, inodeno_t i) : ino(i), path(s), encoded(false) { } + filepath(const filepath& o) { + ino = o.ino; + path = o.path; + bits = o.bits; + encoded = o.encoded; + } + filepath(inodeno_t i) : ino(i), encoded(false) { } + + /* + * if we are fed a relative path as a string, either set ino=0 (strictly + * relative) or 1 (absolute). throw out any leading '/'. 
+ */ + filepath(std::string_view s) : encoded(false) { + set_path(s); + } + filepath(const char *s) : encoded(false) { + set_path(std::string_view(s)); + } + + void set_path(std::string_view s, inodeno_t b) { + path = s; + ino = b; + } + void set_path(std::string_view s) { + if (s[0] == '/') { + path = s.substr(1); + ino = 1; + } else { + ino = 0; + path = s; + } + bits.clear(); + } + + + // accessors + inodeno_t get_ino() const { return ino; } + const string& get_path() const { return path; } + const char *c_str() const { return path.c_str(); } + + int length() const { return path.length(); } + unsigned depth() const { + if (bits.empty() && path.length() > 0) parse_bits(); + return bits.size(); + } + bool empty() const { return path.length() == 0 && ino == 0; } + + bool absolute() const { return ino == 1; } + bool pure_relative() const { return ino == 0; } + bool ino_relative() const { return ino > 0; } + + const string& operator[](int i) const { + if (bits.empty() && path.length() > 0) parse_bits(); + return bits[i]; + } + + const string& last_dentry() const { + if (bits.empty() && path.length() > 0) parse_bits(); + ceph_assert(!bits.empty()); + return bits[ bits.size()-1 ]; + } + + filepath prefixpath(int s) const { + filepath t(ino); + for (int i=0; i<s; i++) + t.push_dentry(bits[i]); + return t; + } + filepath postfixpath(int s) const { + filepath t; + for (unsigned i=s; i<bits.size(); i++) + t.push_dentry(bits[i]); + return t; + } + + + // modifiers + // string can be relative "a/b/c" (ino=0) or absolute "/a/b/c" (ino=1) + void _set_ino(inodeno_t i) { ino = i; } + void clear() { + ino = 0; + path = ""; + bits.clear(); + } + + void pop_dentry() { + if (bits.empty() && path.length() > 0) + parse_bits(); + bits.pop_back(); + rebuild_path(); + } + void push_dentry(std::string_view s) { + if (bits.empty() && path.length() > 0) + parse_bits(); + if (!bits.empty()) + path += "/"; + path += s; + bits.emplace_back(s); + } + void push_dentry(const string& s) { + 
push_dentry(std::string_view(s)); + } + void push_dentry(const char *cs) { + push_dentry(std::string_view(cs, strlen(cs))); + } + void push_front_dentry(const string& s) { + bits.insert(bits.begin(), s); + rebuild_path(); + } + void append(const filepath& a) { + ceph_assert(a.pure_relative()); + for (unsigned i=0; i<a.depth(); i++) + push_dentry(a[i]); + } + + // encoding + void encode(bufferlist& bl) const { + using ceph::encode; + __u8 struct_v = 1; + encode(struct_v, bl); + encode(ino, bl); + encode(path, bl); + } + void decode(bufferlist::const_iterator& blp) { + using ceph::decode; + bits.clear(); + __u8 struct_v; + decode(struct_v, blp); + decode(ino, blp); + decode(path, blp); + encoded = true; + } + void dump(Formatter *f) const { + f->dump_unsigned("base_ino", ino); + f->dump_string("relative_path", path); + } + static void generate_test_instances(list<filepath*>& o) { + o.push_back(new filepath); + o.push_back(new filepath("/usr/bin", 0)); + o.push_back(new filepath("/usr/sbin", 1)); + o.push_back(new filepath("var/log", 1)); + o.push_back(new filepath("foo/bar", 101)); + } + + bool is_last_dot_or_dotdot() const { + if (depth() > 0) { + std::string dname = last_dentry(); + if (dname == "." 
|| dname == "..") { + return true; + } + } + + return false; + } +}; + +WRITE_CLASS_ENCODER(filepath) + +inline ostream& operator<<(ostream& out, const filepath& path) +{ + if (path.get_ino()) { + out << '#' << path.get_ino(); + if (path.length()) + out << '/'; + } + return out << path.get_path(); +} + +#endif diff --git a/src/include/frag.h b/src/include/frag.h new file mode 100644 index 00000000..5e8b154f --- /dev/null +++ b/src/include/frag.h @@ -0,0 +1,602 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net> + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. + * + */ + +#ifndef CEPH_FRAG_H +#define CEPH_FRAG_H + +#include <boost/container/small_vector.hpp> + +#include <iostream> + +#include <stdint.h> +#include <stdio.h> + +#include "buffer.h" +#include "compact_map.h" + +#include "ceph_frag.h" +#include "include/encoding.h" +#include "include/ceph_assert.h" + +#include "common/dout.h" + +/* + * + * the goal here is to use a binary split strategy to partition a namespace. + * frag_t represents a particular fragment. bits() tells you the size of the + * fragment, and value() it's name. this is roughly analogous to an ip address + * and netmask. + * + * fragtree_t represents an entire namespace and it's partition. it essentially + * tells you where fragments are split into other fragments, and by how much + * (i.e. by how many bits, resulting in a power of 2 number of child fragments). + * + * this vaguely resembles a btree, in that when a fragment becomes large or small + * we can split or merge, except that there is no guarantee of being balanced. 
+ * + * presumably we are partitioning the output of a (perhaps specialized) hash + * function. + */ + +/** + * frag_t + * + * description of an individual fragment. that is, a particular piece + * of the overall namespace. + * + * this is conceptually analogous to an ip address and netmask. + * + * a value v falls "within" fragment f iff (v & f.mask()) == f.value(). + * + * we write it as v/b, where v is a value and b is the number of bits. + * 0/0 (bits==0) corresponds to the entire namespace. if we bisect that, + * we get 0/1 and 1/1. quartering gives us 0/2, 1/2, 2/2, 3/2. and so on. + * + * this makes the right most bit of v the "most significant", which is the + * opposite of what we usually see. + */ + +/* + * TODO: + * - get_first_child(), next_sibling(int parent_bits) to make (possibly partial) + * iteration efficient (see, e.g., try_assimilate_children() + * - rework frag_t so that we mask the left-most (most significant) bits instead of + * the right-most (least significant) bits. just because it's more intuitive, and + * matches the network/netmask concept. + */ + +class frag_t { + /* + * encoding is dictated by frag_* functions in ceph_fs.h. use those + * helpers _exclusively_. 
+ */ +public: + using _frag_t = uint32_t; + + frag_t() = default; + frag_t(unsigned v, unsigned b) : _enc(ceph_frag_make(b, v)) { } + frag_t(_frag_t e) : _enc(e) { } + + // constructors + void from_unsigned(unsigned e) { _enc = e; } + + // accessors + unsigned value() const { return ceph_frag_value(_enc); } + unsigned bits() const { return ceph_frag_bits(_enc); } + unsigned mask() const { return ceph_frag_mask(_enc); } + unsigned mask_shift() const { return ceph_frag_mask_shift(_enc); } + + operator _frag_t() const { return _enc; } + + // tests + bool contains(unsigned v) const { return ceph_frag_contains_value(_enc, v); } + bool contains(frag_t sub) const { return ceph_frag_contains_frag(_enc, sub._enc); } + bool is_root() const { return bits() == 0; } + frag_t parent() const { + ceph_assert(bits() > 0); + return frag_t(ceph_frag_parent(_enc)); + } + + // splitting + frag_t make_child(int i, int nb) const { + ceph_assert(i < (1<<nb)); + return frag_t(ceph_frag_make_child(_enc, nb, i)); + } + template<typename T> + void split(int nb, T& fragments) const { + ceph_assert(nb > 0); + unsigned nway = 1 << nb; + for (unsigned i=0; i<nway; i++) + fragments.push_back(make_child(i, nb)); + } + + // binary splitting + frag_t left_child() const { return frag_t(ceph_frag_left_child(_enc)); } + frag_t right_child() const { return frag_t(ceph_frag_right_child(_enc)); } + + bool is_left() const { return ceph_frag_is_left_child(_enc); } + bool is_right() const { return ceph_frag_is_right_child(_enc); } + frag_t get_sibling() const { + ceph_assert(!is_root()); + return frag_t(ceph_frag_sibling(_enc)); + } + + // sequencing + bool is_leftmost() const { return ceph_frag_is_leftmost(_enc); } + bool is_rightmost() const { return ceph_frag_is_rightmost(_enc); } + frag_t next() const { + ceph_assert(!is_rightmost()); + return frag_t(ceph_frag_next(_enc)); + } + + // parse + bool parse(const char *s) { + int pvalue, pbits; + int r = sscanf(s, "%x/%d", &pvalue, &pbits); + if (r == 2) { + 
*this = frag_t(pvalue, pbits); + return true; + } + return false; + } + + void encode(bufferlist& bl) const { + encode_raw(_enc, bl); + } + void decode(bufferlist::const_iterator& p) { + __u32 v; + decode_raw(v, p); + _enc = v; + } + +private: + _frag_t _enc = 0; +}; + +inline std::ostream& operator<<(std::ostream& out, const frag_t& hb) +{ + //out << std::hex << hb.value() << std::dec << "/" << hb.bits() << '='; + unsigned num = hb.bits(); + if (num) { + unsigned val = hb.value(); + for (unsigned bit = 23; num; num--, bit--) + out << ((val & (1<<bit)) ? '1':'0'); + } + return out << '*'; +} + +inline void encode(const frag_t &f, bufferlist& bl) { f.encode(bl); } +inline void decode(frag_t &f, bufferlist::const_iterator& p) { f.decode(p); } + +using frag_vec_t = boost::container::small_vector<frag_t, 4>; + +/** + * fragtree_t -- partition an entire namespace into one or more frag_t's. + */ +class fragtree_t { + // pairs <f, b>: + // frag_t f is split by b bits. + // if child frag_t does not appear, it is not split. 
+public: + compact_map<frag_t,int32_t> _splits; + +public: + // ------------- + // basics + void swap(fragtree_t& other) { + _splits.swap(other._splits); + } + void clear() { + _splits.clear(); + } + + // ------------- + // accessors + bool empty() const { + return _splits.empty(); + } + int get_split(const frag_t hb) const { + compact_map<frag_t,int32_t>::const_iterator p = _splits.find(hb); + if (p == _splits.end()) + return 0; + else + return p->second; + } + + + bool is_leaf(frag_t x) const { + frag_vec_t s; + get_leaves_under(x, s); + //generic_dout(10) << "is_leaf(" << x << ") -> " << ls << dendl; + return s.size() == 1 && s.front() == x; + } + + /** + * get_leaves -- list all leaves + */ + template<typename T> + void get_leaves(T& c) const { + return get_leaves_under_split(frag_t(), c); + } + + /** + * get_leaves_under_split -- list all leaves under a known split point (or root) + */ + template<typename T> + void get_leaves_under_split(frag_t under, T& c) const { + frag_vec_t s; + s.push_back(under); + while (!s.empty()) { + frag_t t = s.back(); + s.pop_back(); + int nb = get_split(t); + if (nb) + t.split(nb, s); // queue up children + else + c.push_back(t); // not spit, it's a leaf. + } + } + + /** + * get_branch -- get branch point at OR above frag @a x + * - may be @a x itself, if @a x is a split + * - may be root (frag_t()) + */ + frag_t get_branch(frag_t x) const { + while (1) { + if (x == frag_t()) return x; // root + if (get_split(x)) return x; // found it! + x = x.parent(); + } + } + + /** + * get_branch_above -- get a branch point above frag @a x + * - may be root (frag_t()) + * - may NOT be @a x, even if @a x is a split. + */ + frag_t get_branch_above(frag_t x) const { + while (1) { + if (x == frag_t()) return x; // root + x = x.parent(); + if (get_split(x)) return x; // found it! 
+ } + } + + + /** + * get_branch_or_leaf -- get branch or leaf point parent for frag @a x + * - may be @a x itself, if @a x is a split or leaf + * - may be root (frag_t()) + */ + frag_t get_branch_or_leaf(frag_t x) const { + frag_t branch = get_branch(x); + int nb = get_split(branch); + if (nb > 0 && // if branch is a split, and + branch.bits() + nb <= x.bits()) // one of the children is or contains x + return frag_t(x.value(), branch.bits()+nb); // then return that child (it's a leaf) + else + return branch; + } + + /** + * get_leaves_under(x, ls) -- search for any leaves fully contained by x + */ + template<typename T> + void get_leaves_under(frag_t x, T& c) const { + frag_vec_t s; + s.push_back(get_branch_or_leaf(x)); + while (!s.empty()) { + frag_t t = s.back(); + s.pop_back(); + if (t.bits() >= x.bits() && // if t is more specific than x, and + !x.contains(t)) // x does not contain t, + continue; // then skip + int nb = get_split(t); + if (nb) + t.split(nb, s); // queue up children + else if (x.contains(t)) + c.push_back(t); // not spit, it's a leaf. + } + } + + /** + * contains(fg) -- does fragtree contain the specific frag @a x + */ + bool contains(frag_t x) const { + frag_vec_t s; + s.push_back(get_branch(x)); + while (!s.empty()) { + frag_t t = s.back(); + s.pop_back(); + if (t.bits() >= x.bits() && // if t is more specific than x, and + !x.contains(t)) // x does not contain t, + continue; // then skip + int nb = get_split(t); + if (nb) { + if (t == x) return false; // it's split. + t.split(nb, s); // queue up children + } else { + if (t == x) return true; // it's there. + } + } + return false; + } + + /** + * operator[] -- map a (hash?) value to a frag + */ + frag_t operator[](unsigned v) const { + frag_t t; + while (1) { + ceph_assert(t.contains(v)); + int nb = get_split(t); + + // is this a leaf? + if (nb == 0) return t; // done. + + // pick appropriate child fragment. 
+ unsigned nway = 1 << nb; + unsigned i; + for (i=0; i<nway; i++) { + frag_t n = t.make_child(i, nb); + if (n.contains(v)) { + t = n; + break; + } + } + ceph_assert(i < nway); + } + } + + + // --------------- + // modifiers + void split(frag_t x, int b, bool simplify=true) { + ceph_assert(is_leaf(x)); + _splits[x] = b; + + if (simplify) + try_assimilate_children(get_branch_above(x)); + } + void merge(frag_t x, int b, bool simplify=true) { + ceph_assert(!is_leaf(x)); + ceph_assert(_splits[x] == b); + _splits.erase(x); + + if (simplify) + try_assimilate_children(get_branch_above(x)); + } + + /* + * if all of a given split's children are identically split, + * then the children can be assimilated. + */ + void try_assimilate_children(frag_t x) { + int nb = get_split(x); + if (!nb) return; + frag_vec_t children; + x.split(nb, children); + int childbits = 0; + for (auto& frag : children) { + int cb = get_split(frag); + if (!cb) return; // nope. + if (childbits && cb != childbits) return; // not the same + childbits = cb; + } + // all children are split with childbits! + for (auto& frag : children) + _splits.erase(frag); + _splits[x] += childbits; + } + + bool force_to_leaf(CephContext *cct, frag_t x) { + if (is_leaf(x)) + return false; + + lgeneric_dout(cct, 10) << "force_to_leaf " << x << " on " << _splits << dendl; + + frag_t parent = get_branch_or_leaf(x); + ceph_assert(parent.bits() <= x.bits()); + lgeneric_dout(cct, 10) << "parent is " << parent << dendl; + + // do we need to split from parent to x? 
+ if (parent.bits() < x.bits()) { + int spread = x.bits() - parent.bits(); + int nb = get_split(parent); + lgeneric_dout(cct, 10) << "spread " << spread << ", parent splits by " << nb << dendl; + if (nb == 0) { + // easy: split parent (a leaf) by the difference + lgeneric_dout(cct, 10) << "splitting parent " << parent << " by spread " << spread << dendl; + split(parent, spread); + ceph_assert(is_leaf(x)); + return true; + } + ceph_assert(nb > spread); + + // add an intermediary split + merge(parent, nb, false); + split(parent, spread, false); + + frag_vec_t subs; + parent.split(spread, subs); + for (auto& frag : subs) { + lgeneric_dout(cct, 10) << "splitting intermediate " << frag << " by " << (nb-spread) << dendl; + split(frag, nb - spread, false); + } + } + + // x is now a leaf or split. + // hoover up any children. + frag_vec_t s; + s.push_back(x); + while (!s.empty()) { + frag_t t = s.back(); + s.pop_back(); + int nb = get_split(t); + if (nb) { + lgeneric_dout(cct, 10) << "merging child " << t << " by " << nb << dendl; + merge(t, nb, false); // merge this point, and + t.split(nb, s); // queue up children + } + } + + lgeneric_dout(cct, 10) << "force_to_leaf done" << dendl; + ceph_assert(is_leaf(x)); + return true; + } + + // encoding + void encode(bufferlist& bl) const { + using ceph::encode; + encode(_splits, bl); + } + void decode(bufferlist::const_iterator& p) { + using ceph::decode; + decode(_splits, p); + } + void encode_nohead(bufferlist& bl) const { + using ceph::encode; + for (compact_map<frag_t,int32_t>::const_iterator p = _splits.begin(); + p != _splits.end(); + ++p) { + encode(p->first, bl); + encode(p->second, bl); + } + } + void decode_nohead(int n, bufferlist::const_iterator& p) { + using ceph::decode; + _splits.clear(); + while (n-- > 0) { + frag_t f; + decode(f, p); + decode(_splits[f], p); + } + } + + void print(std::ostream& out) { + out << "fragtree_t("; + frag_vec_t s; + s.push_back(frag_t()); + while (!s.empty()) { + frag_t t = s.back(); + 
s.pop_back(); + // newline + indent? + if (t.bits()) { + out << std::endl; + for (unsigned i=0; i<t.bits(); i++) out << ' '; + } + int nb = get_split(t); + if (nb) { + out << t << " %" << nb; + t.split(nb, s); // queue up children + } else { + out << t; + } + } + out << ")"; + } + + void dump(Formatter *f) const { + f->open_array_section("splits"); + for (compact_map<frag_t,int32_t>::const_iterator p = _splits.begin(); + p != _splits.end(); + ++p) { + f->open_object_section("split"); + std::ostringstream frag_str; + frag_str << p->first; + f->dump_string("frag", frag_str.str()); + f->dump_int("children", p->second); + f->close_section(); // split + } + f->close_section(); // splits + } +}; +WRITE_CLASS_ENCODER(fragtree_t) + +inline bool operator==(const fragtree_t& l, const fragtree_t& r) { + return l._splits == r._splits; +} +inline bool operator!=(const fragtree_t& l, const fragtree_t& r) { + return l._splits != r._splits; +} + +inline std::ostream& operator<<(std::ostream& out, const fragtree_t& ft) +{ + out << "fragtree_t("; + + for (compact_map<frag_t,int32_t>::const_iterator p = ft._splits.begin(); + p != ft._splits.end(); + ++p) { + if (p != ft._splits.begin()) + out << " "; + out << p->first << "^" << p->second; + } + return out << ")"; +} + + +/** + * fragset_t -- a set of fragments + */ +class fragset_t { + std::set<frag_t> _set; + +public: + const std::set<frag_t> &get() const { return _set; } + std::set<frag_t>::iterator begin() { return _set.begin(); } + std::set<frag_t>::iterator end() { return _set.end(); } + + bool empty() const { return _set.empty(); } + + bool contains(frag_t f) const { + while (1) { + if (_set.count(f)) return true; + if (f.bits() == 0) return false; + f = f.parent(); + } + } + + void insert(frag_t f) { + _set.insert(f); + simplify(); + } + + void simplify() { + while (1) { + bool clean = true; + std::set<frag_t>::iterator p = _set.begin(); + while (p != _set.end()) { + if (!p->is_root() && + _set.count(p->get_sibling())) { + 
_set.erase(p->get_sibling()); + _set.insert(p->parent()); + _set.erase(p++); + clean = false; + } else { + p++; + } + } + if (clean) + break; + } + } +}; + +inline std::ostream& operator<<(std::ostream& out, const fragset_t& fs) +{ + return out << "fragset_t(" << fs.get() << ")"; +} + +#endif diff --git a/src/include/fs_types.h b/src/include/fs_types.h new file mode 100644 index 00000000..2132db9a --- /dev/null +++ b/src/include/fs_types.h @@ -0,0 +1,126 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +#ifndef CEPH_INCLUDE_FS_TYPES_H +#define CEPH_INCLUDE_FS_TYPES_H + +#include "types.h" + +// -------------------------------------- +// ino + +typedef uint64_t _inodeno_t; + +struct inodeno_t { + _inodeno_t val; + inodeno_t() : val(0) {} + // cppcheck-suppress noExplicitConstructor + inodeno_t(_inodeno_t v) : val(v) {} + inodeno_t operator+=(inodeno_t o) { val += o.val; return *this; } + operator _inodeno_t() const { return val; } + + void encode(bufferlist& bl) const { + using ceph::encode; + encode(val, bl); + } + void decode(bufferlist::const_iterator& p) { + using ceph::decode; + decode(val, p); + } +} __attribute__ ((__may_alias__)); +WRITE_CLASS_ENCODER(inodeno_t) + +template<> +struct denc_traits<inodeno_t> { + static constexpr bool supported = true; + static constexpr bool featured = false; + static constexpr bool bounded = true; + static constexpr bool need_contiguous = true; + static void bound_encode(const inodeno_t &o, size_t& p) { + denc(o.val, p); + } + static void encode(const inodeno_t &o, buffer::list::contiguous_appender& p) { + denc(o.val, p); + } + static void decode(inodeno_t& o, buffer::ptr::const_iterator &p) { + denc(o.val, p); + } +}; + +inline ostream& operator<<(ostream& out, const inodeno_t& ino) { + return out << hex << "0x" << ino.val << dec; +} + +namespace std { + template<> struct hash< inodeno_t > + { + size_t operator()( const inodeno_t& x ) const + { + static rjhash<uint64_t> 
H; + return H(x.val); + } + }; +} // namespace std + + +// file modes + +inline bool file_mode_is_readonly(int mode) { + return (mode & CEPH_FILE_MODE_WR) == 0; +} + + +// dentries +#define MAX_DENTRY_LEN 255 + +// -- +namespace ceph { + class Formatter; +} +void dump(const ceph_file_layout& l, ceph::Formatter *f); +void dump(const ceph_dir_layout& l, ceph::Formatter *f); + + + +// file_layout_t + +struct file_layout_t { + // file -> object mapping + uint32_t stripe_unit; ///< stripe unit, in bytes, + uint32_t stripe_count; ///< over this many objects + uint32_t object_size; ///< until objects are this big + + int64_t pool_id; ///< rados pool id + string pool_ns; ///< rados pool namespace + + file_layout_t(uint32_t su=0, uint32_t sc=0, uint32_t os=0) + : stripe_unit(su), + stripe_count(sc), + object_size(os), + pool_id(-1) { + } + + static file_layout_t get_default() { + return file_layout_t(1<<22, 1, 1<<22); + } + + uint64_t get_period() const { + return static_cast<uint64_t>(stripe_count) * object_size; + } + + void from_legacy(const ceph_file_layout& fl); + void to_legacy(ceph_file_layout *fl) const; + + bool is_valid() const; + + void encode(bufferlist& bl, uint64_t features) const; + void decode(bufferlist::const_iterator& p); + void dump(Formatter *f) const; + static void generate_test_instances(list<file_layout_t*>& o); +}; +WRITE_CLASS_ENCODER_FEATURES(file_layout_t) + +WRITE_EQ_OPERATORS_5(file_layout_t, stripe_unit, stripe_count, object_size, pool_id, pool_ns); + +ostream& operator<<(ostream& out, const file_layout_t &layout); + +#endif diff --git a/src/include/hash.h b/src/include/hash.h new file mode 100644 index 00000000..2ab95448 --- /dev/null +++ b/src/include/hash.h @@ -0,0 +1,64 @@ +#ifndef CEPH_HASH_H +#define CEPH_HASH_H + +#include "acconfig.h" + +// Robert Jenkins' function for mixing 32-bit values +// http://burtleburtle.net/bob/hash/evahash.html +// a, b = random bits, c = input and output + +#define hashmix(a,b,c) \ + a=a-b; a=a-c; 
a=a^(c>>13); \ + b=b-c; b=b-a; b=b^(a<<8); \ + c=c-a; c=c-b; c=c^(b>>13); \ + a=a-b; a=a-c; a=a^(c>>12); \ + b=b-c; b=b-a; b=b^(a<<16); \ + c=c-a; c=c-b; c=c^(b>>5); \ + a=a-b; a=a-c; a=a^(c>>3); \ + b=b-c; b=b-a; b=b^(a<<10); \ + c=c-a; c=c-b; c=c^(b>>15); + + +//namespace ceph { + +template <class _Key> struct rjhash { }; + +inline uint64_t rjhash64(uint64_t key) { + key = (~key) + (key << 21); // key = (key << 21) - key - 1; + key = key ^ (key >> 24); + key = (key + (key << 3)) + (key << 8); // key * 265 + key = key ^ (key >> 14); + key = (key + (key << 2)) + (key << 4); // key * 21 + key = key ^ (key >> 28); + key = key + (key << 31); + return key; +} + +inline uint32_t rjhash32(uint32_t a) { + a = (a+0x7ed55d16) + (a<<12); + a = (a^0xc761c23c) ^ (a>>19); + a = (a+0x165667b1) + (a<<5); + a = (a+0xd3a2646c) ^ (a<<9); + a = (a+0xfd7046c5) + (a<<3); + a = (a^0xb55a4f09) ^ (a>>16); + return a; +} + + +template<> struct rjhash<uint32_t> { + inline size_t operator()(const uint32_t x) const { + return rjhash32(x); + } +}; + +template<> struct rjhash<uint64_t> { + inline size_t operator()(const uint64_t x) const { + return rjhash64(x); + } +}; + +//} + + + +#endif diff --git a/src/include/health.h b/src/include/health.h new file mode 100644 index 00000000..5c00225e --- /dev/null +++ b/src/include/health.h @@ -0,0 +1,70 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#pragma once + +#include <ostream> +#include <string> + +#include "include/encoding.h" + +// health_status_t +enum health_status_t { + HEALTH_ERR = 0, + HEALTH_WARN = 1, + HEALTH_OK = 2, +}; + +inline void encode(health_status_t hs, bufferlist& bl) { + using ceph::encode; + uint8_t v = hs; + encode(v, bl); +} +inline void decode(health_status_t& hs, bufferlist::const_iterator& p) { + using ceph::decode; + uint8_t v; + decode(v, p); + hs = health_status_t(v); +} +template<> +struct denc_traits<health_status_t> { + static constexpr bool supported = 
true; + static constexpr bool featured = false; + static constexpr bool bounded = true; + static constexpr bool need_contiguous = false; + static void bound_encode(const bufferptr& v, size_t& p, uint64_t f=0) { + p++; + } + static void encode(const health_status_t& v, + buffer::list::contiguous_appender& p, + uint64_t f=0) { + ::denc((uint8_t)v, p); + } + static void decode(health_status_t& v, buffer::ptr::const_iterator& p, + uint64_t f=0) { + uint8_t tmp; + ::denc(tmp, p); + v = health_status_t(tmp); + } + static void decode(health_status_t& v, buffer::list::const_iterator& p, + uint64_t f=0) { + uint8_t tmp; + ::denc(tmp, p); + v = health_status_t(tmp); + } +}; + +inline std::ostream& operator<<(std::ostream &oss, const health_status_t status) { + switch (status) { + case HEALTH_ERR: + oss << "HEALTH_ERR"; + break; + case HEALTH_WARN: + oss << "HEALTH_WARN"; + break; + case HEALTH_OK: + oss << "HEALTH_OK"; + break; + } + return oss; +} diff --git a/src/include/inline_memory.h b/src/include/inline_memory.h new file mode 100644 index 00000000..48d88976 --- /dev/null +++ b/src/include/inline_memory.h @@ -0,0 +1,150 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net> + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. 
+ * + */ +#ifndef CEPH_INLINE_MEMORY_H +#define CEPH_INLINE_MEMORY_H + +#if defined(__GNUC__) + +// optimize for the common case, which is very small copies +static inline void *maybe_inline_memcpy(void *dest, const void *src, size_t l, + size_t inline_len) + __attribute__((always_inline)); + +void *maybe_inline_memcpy(void *dest, const void *src, size_t l, + size_t inline_len) +{ + if (l > inline_len) { + return memcpy(dest, src, l); + } + switch (l) { + case 8: + return __builtin_memcpy(dest, src, 8); + case 4: + return __builtin_memcpy(dest, src, 4); + case 3: + return __builtin_memcpy(dest, src, 3); + case 2: + return __builtin_memcpy(dest, src, 2); + case 1: + return __builtin_memcpy(dest, src, 1); + default: + int cursor = 0; + while (l >= sizeof(uint64_t)) { + __builtin_memcpy((char*)dest + cursor, (char*)src + cursor, + sizeof(uint64_t)); + cursor += sizeof(uint64_t); + l -= sizeof(uint64_t); + } + while (l >= sizeof(uint32_t)) { + __builtin_memcpy((char*)dest + cursor, (char*)src + cursor, + sizeof(uint32_t)); + cursor += sizeof(uint32_t); + l -= sizeof(uint32_t); + } + while (l > 0) { + *((char*)dest + cursor) = *((char*)src + cursor); + cursor++; + l--; + } + } + return dest; +} + +#else + +#define maybe_inline_memcpy(d, s, l, x) memcpy(d, s, l) + +#endif + + +#if defined(__GNUC__) && defined(__x86_64__) + +namespace ceph { +typedef unsigned uint128_t __attribute__ ((mode (TI))); +} +using ceph::uint128_t; + +static inline bool mem_is_zero(const char *data, size_t len) + __attribute__((always_inline)); + +bool mem_is_zero(const char *data, size_t len) +{ + // we do have XMM registers in x86-64, so if we need to check at least + // 16 bytes, make use of them + if (len / sizeof(uint128_t) > 0) { + // align data pointer to 16 bytes, otherwise it'll segfault due to bug + // in (at least some) GCC versions (using MOVAPS instead of MOVUPS). + // check up to 15 first bytes while at it. 
+ while (((unsigned long long)data) & 15) { + if (*(uint8_t*)data != 0) { + return false; + } + data += sizeof(uint8_t); + --len; + } + + const char* data_start = data; + const char* max128 = data + (len / sizeof(uint128_t))*sizeof(uint128_t); + + while (data < max128) { + if (*(uint128_t*)data != 0) { + return false; + } + data += sizeof(uint128_t); + } + len -= (data - data_start); + } + + const char* max = data + len; + const char* max32 = data + (len / sizeof(uint32_t))*sizeof(uint32_t); + while (data < max32) { + if (*(uint32_t*)data != 0) { + return false; + } + data += sizeof(uint32_t); + } + while (data < max) { + if (*(uint8_t*)data != 0) { + return false; + } + data += sizeof(uint8_t); + } + return true; +} + +#else // gcc and x86_64 + +static inline bool mem_is_zero(const char *data, size_t len) { + const char *end = data + len; + const char* end64 = data + (len / sizeof(uint64_t))*sizeof(uint64_t); + + while (data < end64) { + if (*(uint64_t*)data != 0) { + return false; + } + data += sizeof(uint64_t); + } + + while (data < end) { + if (*data != 0) { + return false; + } + ++data; + } + return true; +} + +#endif // !x86_64 + +#endif diff --git a/src/include/int_types.h b/src/include/int_types.h new file mode 100644 index 00000000..56b2723f --- /dev/null +++ b/src/include/int_types.h @@ -0,0 +1,65 @@ +#ifndef CEPH_INTTYPES_H +#define CEPH_INTTYPES_H + +#include "acconfig.h" + +#include <inttypes.h> + +#ifdef HAVE_LINUX_TYPES_H +#include <linux/types.h> +#else +#ifndef HAVE___U8 +typedef uint8_t __u8; +#endif + +#ifndef HAVE___S8 +typedef int8_t __s8; +#endif + +#ifndef HAVE___U16 +typedef uint16_t __u16; +#endif + +#ifndef HAVE___S16 +typedef int16_t __s16; +#endif + +#ifndef HAVE___U32 +typedef uint32_t __u32; +#endif + +#ifndef HAVE___S32 +typedef int32_t __s32; +#endif + +#ifndef HAVE___U64 +typedef uint64_t __u64; +#endif + +#ifndef HAVE___S64 +typedef int64_t __s64; +#endif +#endif /* LINUX_TYPES_H */ + +#define __bitwise__ + +typedef __u16 
__bitwise__ __le16; +typedef __u16 __bitwise__ __be16; +typedef __u32 __bitwise__ __le32; +typedef __u32 __bitwise__ __be32; +typedef __u64 __bitwise__ __le64; +typedef __u64 __bitwise__ __be64; + +#ifndef BOOST_MPL_CFG_NO_PREPROCESSED_HEADERS +#define BOOST_MPL_CFG_NO_PREPROCESSED_HEADERS +#endif + +#ifndef BOOST_MPL_LIMIT_VECTOR_SIZE +#define BOOST_MPL_LIMIT_VECTOR_SIZE 30 // or whatever you need +#endif + +#ifndef BOOST_MPL_LIMIT_MAP_SIZE +#define BOOST_MPL_LIMIT_MAP_SIZE 30 // or whatever you need +#endif + +#endif diff --git a/src/include/intarith.h b/src/include/intarith.h new file mode 100644 index 00000000..e912cbe7 --- /dev/null +++ b/src/include/intarith.h @@ -0,0 +1,193 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net> + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. + * + */ + +#ifndef CEPH_INTARITH_H +#define CEPH_INTARITH_H + +#include <type_traits> + +template<typename T, typename U> +constexpr inline std::make_unsigned_t<std::common_type_t<T, U>> div_round_up(T n, U d) { + return (n + d - 1) / d; +} + + +template<typename T, typename U> +constexpr inline std::make_unsigned_t<std::common_type_t<T, U>> round_up_to(T n, U d) { + return (n % d ? (n + d - n % d) : n); +} + +template<typename T, typename U> +constexpr inline std::make_unsigned_t<std::common_type_t<T, U>> shift_round_up(T x, U y) { + return (x + (1 << y) - 1) >> y; +} + +/* + * Wrapper to determine if value is a power of 2 + */ +template<typename T> +constexpr inline bool isp2(T x) { + return (x & (x - 1)) == 0; +} + +/* + * Wrappers for various sorts of alignment and rounding. The "align" must + * be a power of 2. Often times it is a block, sector, or page. 
+ */ + +/* + * return x rounded down to an align boundary + * eg, p2align(1200, 1024) == 1024 (1*align) + * eg, p2align(1024, 1024) == 1024 (1*align) + * eg, p2align(0x1234, 0x100) == 0x1200 (0x12*align) + * eg, p2align(0x5600, 0x100) == 0x5600 (0x56*align) + */ +template<typename T> +constexpr inline T p2align(T x, T align) { + return x & -align; +} + +/* + * return x % (mod) align + * eg, p2phase(0x1234, 0x100) == 0x34 (x-0x12*align) + * eg, p2phase(0x5600, 0x100) == 0x00 (x-0x56*align) + */ +template<typename T> +constexpr inline T p2phase(T x, T align) { + return x & (align - 1); +} + +/* + * return how much space is left in this block (but if it's perfectly + * aligned, return 0). + * eg, p2nphase(0x1234, 0x100) == 0xcc (0x13*align-x) + * eg, p2nphase(0x5600, 0x100) == 0x00 (0x56*align-x) + */ +template<typename T> +constexpr inline T p2nphase(T x, T align) { + return -x & (align - 1); +} + +/* + * return x rounded up to an align boundary + * eg, p2roundup(0x1234, 0x100) == 0x1300 (0x13*align) + * eg, p2roundup(0x5600, 0x100) == 0x5600 (0x56*align) + */ +template<typename T> +constexpr inline T p2roundup(T x, T align) { + return -(-x & -align); +} + +// count trailing zeros. 
+// NOTE: the builtin is nondeterministic on 0 input +template<class T> + inline typename std::enable_if< + (std::is_integral<T>::value && + sizeof(T) <= sizeof(unsigned)), + unsigned>::type ctz(T v) { + if (v == 0) + return sizeof(v) * 8; + return __builtin_ctz(v); +} + +template<class T> + inline typename std::enable_if< + (std::is_integral<T>::value && + sizeof(T) > sizeof(unsigned int) && + sizeof(T) <= sizeof(unsigned long)), + unsigned>::type ctz(T v) { + if (v == 0) + return sizeof(v) * 8; + return __builtin_ctzl(v); +} + +template<class T> + inline typename std::enable_if< + (std::is_integral<T>::value && + sizeof(T) > sizeof(unsigned long) && + sizeof(T) <= sizeof(unsigned long long)), + unsigned>::type ctz(T v) { + if (v == 0) + return sizeof(v) * 8; + return __builtin_ctzll(v); +} + +// count leading zeros +// NOTE: the builtin is nondeterministic on 0 input +template<class T> + inline typename std::enable_if< + (std::is_integral<T>::value && + sizeof(T) <= sizeof(unsigned)), + unsigned>::type clz(T v) { + if (v == 0) + return sizeof(v) * 8; + return __builtin_clz(v); +} + +template<class T> + inline typename std::enable_if< + (std::is_integral<T>::value && + sizeof(T) > sizeof(unsigned int) && + sizeof(T) <= sizeof(unsigned long)), + unsigned>::type clz(T v) { + if (v == 0) + return sizeof(v) * 8; + return __builtin_clzl(v); +} + +template<class T> + inline typename std::enable_if< + (std::is_integral<T>::value && + sizeof(T) > sizeof(unsigned long) && + sizeof(T) <= sizeof(unsigned long long)), + unsigned>::type clz(T v) { + if (v == 0) + return sizeof(v) * 8; + return __builtin_clzll(v); +} + +// count bits (set + any 0's that follow) +template<class T> + inline typename std::enable_if< + (std::is_integral<T>::value && + sizeof(T) <= sizeof(unsigned)), + unsigned>::type cbits(T v) { + if (v == 0) + return 0; + return (sizeof(v) * 8) - __builtin_clz(v); +} + +template<class T> + inline typename std::enable_if< + (std::is_integral<T>::value && + 
sizeof(T) > sizeof(unsigned int) && + sizeof(T) <= sizeof(unsigned long)), + unsigned>::type cbits(T v) { + if (v == 0) + return 0; + return (sizeof(v) * 8) - __builtin_clzl(v); +} + +template<class T> + inline typename std::enable_if< + (std::is_integral<T>::value && + sizeof(T) > sizeof(unsigned long) && + sizeof(T) <= sizeof(unsigned long long)), + unsigned>::type cbits(T v) { + if (v == 0) + return 0; + return (sizeof(v) * 8) - __builtin_clzll(v); +} + +#endif diff --git a/src/include/interval_set.h b/src/include/interval_set.h new file mode 100644 index 00000000..4fb6be45 --- /dev/null +++ b/src/include/interval_set.h @@ -0,0 +1,783 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net> + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. + * + */ + + +#ifndef CEPH_INTERVAL_SET_H +#define CEPH_INTERVAL_SET_H + +#include <iterator> +#include <map> +#include <ostream> + +#include "encoding.h" + +/* + * *** NOTE *** + * + * This class is written to work with a variety of map-like containers, + * *include* ones that invalidate iterators when they are modified (e.g., + * flat_map and btree_map). + */ + +template<typename T, typename Map = std::map<T,T>> +class interval_set { + public: + using value_type = T; + + class const_iterator; + + class iterator : public std::iterator <std::forward_iterator_tag, T> + { + public: + explicit iterator(typename Map::iterator iter) + : _iter(iter) + { } + + // For the copy constructor and assignment operator, the compiler-generated functions, which + // perform simple bitwise copying, should be fine. 
+ + bool operator==(const iterator& rhs) const { + return (_iter == rhs._iter); + } + + bool operator!=(const iterator& rhs) const { + return (_iter != rhs._iter); + } + + // Dereference this iterator to get a pair. + std::pair < T, T > &operator*() { + return *_iter; + } + + // Return the interval start. + T get_start() const { + return _iter->first; + } + + // Return the interval length. + T get_len() const { + return _iter->second; + } + T get_end() const { + return _iter->first + _iter->second; + } + + // Set the interval length. + void set_len(T len) { + _iter->second = len; + } + + // Preincrement + iterator &operator++() + { + ++_iter; + return *this; + } + + // Postincrement + iterator operator++(int) + { + iterator prev(_iter); + ++_iter; + return prev; + } + + friend class interval_set<T,Map>::const_iterator; + + protected: + typename Map::iterator _iter; + friend class interval_set<T,Map>; + }; + + class const_iterator : public std::iterator <std::forward_iterator_tag, T> + { + public: + explicit const_iterator(typename Map::const_iterator iter) + : _iter(iter) + { } + + const_iterator(const iterator &i) + : _iter(i._iter) + { } + + // For the copy constructor and assignment operator, the compiler-generated functions, which + // perform simple bitwise copying, should be fine. + + bool operator==(const const_iterator& rhs) const { + return (_iter == rhs._iter); + } + + bool operator!=(const const_iterator& rhs) const { + return (_iter != rhs._iter); + } + + // Dereference this iterator to get a pair. + std::pair < T, T > operator*() const { + return *_iter; + } + + // Return the interval start. + T get_start() const { + return _iter->first; + } + T get_end() const { + return _iter->first + _iter->second; + } + + // Return the interval length. 
+ T get_len() const { + return _iter->second; + } + + // Preincrement + const_iterator &operator++() + { + ++_iter; + return *this; + } + + // Postincrement + const_iterator operator++(int) + { + const_iterator prev(_iter); + ++_iter; + return prev; + } + + protected: + typename Map::const_iterator _iter; + }; + + interval_set() : _size(0) {} + interval_set(Map& other) { + m.swap(other); + _size = 0; + for (auto& i : m) { + _size += i.second; + } + } + + int num_intervals() const + { + return m.size(); + } + + typename interval_set<T,Map>::iterator begin() { + return typename interval_set<T,Map>::iterator(m.begin()); + } + + typename interval_set<T,Map>::iterator lower_bound(T start) { + return typename interval_set<T,Map>::iterator(find_inc_m(start)); + } + + typename interval_set<T,Map>::iterator end() { + return typename interval_set<T,Map>::iterator(m.end()); + } + + typename interval_set<T,Map>::const_iterator begin() const { + return typename interval_set<T,Map>::const_iterator(m.begin()); + } + + typename interval_set<T,Map>::const_iterator lower_bound(T start) const { + return typename interval_set<T,Map>::const_iterator(find_inc(start)); + } + + typename interval_set<T,Map>::const_iterator end() const { + return typename interval_set<T,Map>::const_iterator(m.end()); + } + + // helpers + private: + typename Map::const_iterator find_inc(T start) const { + typename Map::const_iterator p = m.lower_bound(start); // p->first >= start + if (p != m.begin() && + (p == m.end() || p->first > start)) { + p--; // might overlap? + if (p->first + p->second <= start) + p++; // it doesn't. + } + return p; + } + + typename Map::iterator find_inc_m(T start) { + typename Map::iterator p = m.lower_bound(start); + if (p != m.begin() && + (p == m.end() || p->first > start)) { + p--; // might overlap? + if (p->first + p->second <= start) + p++; // it doesn't. 
+ } + return p; + } + + typename Map::const_iterator find_adj(T start) const { + typename Map::const_iterator p = m.lower_bound(start); + if (p != m.begin() && + (p == m.end() || p->first > start)) { + p--; // might touch? + if (p->first + p->second < start) + p++; // it doesn't. + } + return p; + } + + typename Map::iterator find_adj_m(T start) { + typename Map::iterator p = m.lower_bound(start); + if (p != m.begin() && + (p == m.end() || p->first > start)) { + p--; // might touch? + if (p->first + p->second < start) + p++; // it doesn't. + } + return p; + } + + void intersection_size_asym(const interval_set &s, const interval_set &l) { + typename decltype(m)::const_iterator ps = s.m.begin(), pl; + ceph_assert(ps != s.m.end()); + T offset = ps->first; + bool first = true; + typename decltype(m)::iterator mi = m.begin(); + + while (1) { + if (first) + first = false; + pl = l.find_inc(offset); + if (pl == l.m.end()) + break; + while (ps != s.m.end() && ps->first + ps->second <= pl->first) + ++ps; + if (ps == s.m.end()) + break; + offset = pl->first + pl->second; + if (offset <= ps->first) { + offset = ps->first; + continue; + } + + if (*ps == *pl) { + do { + mi = m.insert(mi, *ps); + _size += ps->second; + ++ps; + ++pl; + } while (ps != s.m.end() && pl != l.m.end() && *ps == *pl); + if (ps == s.m.end()) + break; + offset = ps->first; + continue; + } + + T start = std::max<T>(ps->first, pl->first); + T en = std::min<T>(ps->first + ps->second, offset); + ceph_assert(en > start); + typename decltype(m)::value_type i{start, en - start}; + mi = m.insert(mi, i); + _size += i.second; + if (ps->first + ps->second <= offset) { + ++ps; + if (ps == s.m.end()) + break; + offset = ps->first; + } + } + } + + bool subset_size_sym(const interval_set &b) const { + auto pa = m.begin(), pb = b.m.begin(); + const auto a_end = m.end(), b_end = b.m.end(); + + while (pa != a_end && pb != b_end) { + while (pb->first + pb->second <= pa->first) { + ++pb; + if (pb == b_end) + return false; + 
} + + if (*pa == *pb) { + do { + ++pa; + ++pb; + } while (pa != a_end && pb != b_end && *pa == *pb); + continue; + } + + // interval begins before other + if (pa->first < pb->first) + return false; + // interval is longer than other + if (pa->first + pa->second > pb->first + pb->second) + return false; + + ++pa; + } + + return pa == a_end; + } + + public: + bool operator==(const interval_set& other) const { + return _size == other._size && m == other.m; + } + + int64_t size() const { + return _size; + } + + void bound_encode(size_t& p) const { + denc_traits<Map>::bound_encode(m, p); + } + void encode(bufferlist::contiguous_appender& p) const { + denc(m, p); + } + void decode(bufferptr::const_iterator& p) { + denc(m, p); + _size = 0; + for (const auto& i : m) { + _size += i.second; + } + } + void decode(bufferlist::iterator& p) { + denc(m, p); + _size = 0; + for (const auto& i : m) { + _size += i.second; + } + } + + void encode_nohead(bufferlist::contiguous_appender& p) const { + denc_traits<Map>::encode_nohead(m, p); + } + void decode_nohead(int n, bufferptr::const_iterator& p) { + denc_traits<Map>::decode_nohead(n, m, p); + _size = 0; + for (const auto& i : m) { + _size += i.second; + } + } + + void clear() { + m.clear(); + _size = 0; + } + + bool contains(T i, T *pstart=0, T *plen=0) const { + typename Map::const_iterator p = find_inc(i); + if (p == m.end()) return false; + if (p->first > i) return false; + if (p->first+p->second <= i) return false; + ceph_assert(p->first <= i && p->first+p->second > i); + if (pstart) + *pstart = p->first; + if (plen) + *plen = p->second; + return true; + } + bool contains(T start, T len) const { + typename Map::const_iterator p = find_inc(start); + if (p == m.end()) return false; + if (p->first > start) return false; + if (p->first+p->second <= start) return false; + ceph_assert(p->first <= start && p->first+p->second > start); + if (p->first+p->second < start+len) return false; + return true; + } + bool intersects(T start, T 
len) const { + interval_set a; + a.insert(start, len); + interval_set i; + i.intersection_of( *this, a ); + if (i.empty()) return false; + return true; + } + + // outer range of set + bool empty() const { + return m.empty(); + } + T range_start() const { + ceph_assert(!empty()); + typename Map::const_iterator p = m.begin(); + return p->first; + } + T range_end() const { + ceph_assert(!empty()); + typename Map::const_iterator p = m.end(); + p--; + return p->first+p->second; + } + + // interval start after p (where p not in set) + bool starts_after(T i) const { + ceph_assert(!contains(i)); + typename Map::const_iterator p = find_inc(i); + if (p == m.end()) return false; + return true; + } + T start_after(T i) const { + ceph_assert(!contains(i)); + typename Map::const_iterator p = find_inc(i); + return p->first; + } + + // interval end that contains start + T end_after(T start) const { + ceph_assert(contains(start)); + typename Map::const_iterator p = find_inc(start); + return p->first+p->second; + } + + void insert(T val) { + insert(val, 1); + } + + void insert(T start, T len, T *pstart=0, T *plen=0) { + //cout << "insert " << start << "~" << len << endl; + ceph_assert(len > 0); + _size += len; + typename Map::iterator p = find_adj_m(start); + if (p == m.end()) { + m[start] = len; // new interval + if (pstart) + *pstart = start; + if (plen) + *plen = len; + } else { + if (p->first < start) { + + if (p->first + p->second != start) { + //cout << "p is " << p->first << "~" << p->second << ", start is " << start << ", len is " << len << endl; + ceph_abort(); + } + + p->second += len; // append to end + + typename Map::iterator n = p; + n++; + if (pstart) + *pstart = p->first; + if (n != m.end() && + start+len == n->first) { // combine with next, too! 
+ p->second += n->second; + if (plen) + *plen = p->second; + m.erase(n); + } else { + if (plen) + *plen = p->second; + } + } else { + if (start+len == p->first) { + if (pstart) + *pstart = start; + if (plen) + *plen = len + p->second; + T psecond = p->second; + m.erase(p); + m[start] = len + psecond; // append to front + } else { + ceph_assert(p->first > start+len); + if (pstart) + *pstart = start; + if (plen) + *plen = len; + m[start] = len; // new interval + } + } + } + } + + void swap(interval_set<T,Map>& other) { + m.swap(other.m); + std::swap(_size, other._size); + } + + void erase(iterator &i) { + _size -= i.get_len(); + ceph_assert(_size >= 0); + m.erase(i._iter); + } + + void erase(T val) { + erase(val, 1); + } + + void erase(T start, T len, + std::function<bool(T, T)> claim = {}) { + typename Map::iterator p = find_inc_m(start); + + _size -= len; + ceph_assert(_size >= 0); + + ceph_assert(p != m.end()); + ceph_assert(p->first <= start); + + T before = start - p->first; + ceph_assert(p->second >= before+len); + T after = p->second - before - len; + if (before) { + if (claim && claim(p->first, before)) { + _size -= before; + m.erase(p); + } else { + p->second = before; // shorten bit before + } + } else { + m.erase(p); + } + if (after) { + if (claim && claim(start + len, after)) { + _size -= after; + } else { + m[start + len] = after; + } + } + } + + void subtract(const interval_set &a) { + for (typename Map::const_iterator p = a.m.begin(); + p != a.m.end(); + p++) + erase(p->first, p->second); + } + + void insert(const interval_set &a) { + for (typename Map::const_iterator p = a.m.begin(); + p != a.m.end(); + p++) + insert(p->first, p->second); + } + + + void intersection_of(const interval_set &a, const interval_set &b) { + ceph_assert(&a != this); + ceph_assert(&b != this); + clear(); + + const interval_set *s, *l; + + if (a.size() < b.size()) { + s = &a; + l = &b; + } else { + s = &b; + l = &a; + } + + if (!s->size()) + return; + + /* + * Use the 
lower_bound algorithm for larger size ratios + * where it performs better, but not for smaller size + * ratios where sequential search performs better. + */ + if (l->size() / s->size() >= 10) { + intersection_size_asym(*s, *l); + return; + } + + typename Map::const_iterator pa = a.m.begin(); + typename Map::const_iterator pb = b.m.begin(); + typename decltype(m)::iterator mi = m.begin(); + + while (pa != a.m.end() && pb != b.m.end()) { + // passing? + if (pa->first + pa->second <= pb->first) + { pa++; continue; } + if (pb->first + pb->second <= pa->first) + { pb++; continue; } + + if (*pa == *pb) { + do { + mi = m.insert(mi, *pa); + _size += pa->second; + ++pa; + ++pb; + } while (pa != a.m.end() && pb != b.m.end() && *pa == *pb); + continue; + } + + T start = std::max(pa->first, pb->first); + T en = std::min(pa->first+pa->second, pb->first+pb->second); + ceph_assert(en > start); + typename decltype(m)::value_type i{start, en - start}; + mi = m.insert(mi, i); + _size += i.second; + if (pa->first+pa->second > pb->first+pb->second) + pb++; + else + pa++; + } + } + void intersection_of(const interval_set& b) { + interval_set a; + swap(a); + intersection_of(a, b); + } + + void union_of(const interval_set &a, const interval_set &b) { + ceph_assert(&a != this); + ceph_assert(&b != this); + clear(); + + //cout << "union_of" << endl; + + // a + m = a.m; + _size = a._size; + + // - (a*b) + interval_set ab; + ab.intersection_of(a, b); + subtract(ab); + + // + b + insert(b); + return; + } + void union_of(const interval_set &b) { + interval_set a; + swap(a); + union_of(a, b); + } + void union_insert(T off, T len) { + interval_set a; + a.insert(off, len); + union_of(a); + } + + bool subset_of(const interval_set &big) const { + if (!size()) + return true; + if (size() > big.size()) + return false; + if (range_end() > big.range_end()) + return false; + + /* + * Use the lower_bound algorithm for larger size ratios + * where it performs better, but not for smaller size + * ratios 
where sequential search performs better. + */ + if (big.size() / size() < 10) + return subset_size_sym(big); + + for (typename Map::const_iterator i = m.begin(); + i != m.end(); + i++) + if (!big.contains(i->first, i->second)) return false; + return true; + } + + /* + * build a subset of @other, starting at or after @start, and including + * @len worth of values, skipping holes. e.g., + * span_of([5~10,20~5], 8, 5) -> [8~2,20~3] + */ + void span_of(const interval_set &other, T start, T len) { + clear(); + typename Map::const_iterator p = other.find_inc(start); + if (p == other.m.end()) + return; + if (p->first < start) { + if (p->first + p->second < start) + return; + if (p->first + p->second < start + len) { + T howmuch = p->second - (start - p->first); + insert(start, howmuch); + len -= howmuch; + p++; + } else { + insert(start, len); + return; + } + } + while (p != other.m.end() && len > 0) { + if (p->second < len) { + insert(p->first, p->second); + len -= p->second; + p++; + } else { + insert(p->first, len); + return; + } + } + } + + /* + * Move contents of m into another Map. Use that instead of + * encoding interval_set into bufferlist then decoding it back into Map. + */ + void move_into(Map& other) { + other = std::move(m); + } + +private: + // data + int64_t _size; + Map m; // map start -> len +}; + +// declare traits explicitly because (1) it's templatized, and (2) we +// want to include _nohead variants. 
+template<typename T, typename Map> +struct denc_traits<interval_set<T,Map>> { + static constexpr bool supported = true; + static constexpr bool bounded = false; + static constexpr bool featured = false; + static constexpr bool need_contiguous = denc_traits<T,Map>::need_contiguous; + static void bound_encode(const interval_set<T,Map>& v, size_t& p) { + v.bound_encode(p); + } + static void encode(const interval_set<T,Map>& v, + bufferlist::contiguous_appender& p) { + v.encode(p); + } + static void decode(interval_set<T,Map>& v, bufferptr::const_iterator& p) { + v.decode(p); + } + template<typename U=T> + static typename std::enable_if<sizeof(U) && !need_contiguous>::type + decode(interval_set<T,Map>& v, bufferlist::iterator& p) { + v.decode(p); + } + static void encode_nohead(const interval_set<T,Map>& v, + bufferlist::contiguous_appender& p) { + v.encode_nohead(p); + } + static void decode_nohead(size_t n, interval_set<T,Map>& v, + bufferptr::const_iterator& p) { + v.decode_nohead(n, p); + } +}; + + +template<class T, typename Map> +inline std::ostream& operator<<(std::ostream& out, const interval_set<T,Map> &s) { + out << "["; + const char *prequel = ""; + for (typename interval_set<T,Map>::const_iterator i = s.begin(); + i != s.end(); + ++i) + { + out << prequel << i.get_start() << "~" << i.get_len(); + prequel = ","; + } + out << "]"; + return out; +} + + +#endif diff --git a/src/include/ipaddr.h b/src/include/ipaddr.h new file mode 100644 index 00000000..e8bed829 --- /dev/null +++ b/src/include/ipaddr.h @@ -0,0 +1,48 @@ +#ifndef CEPH_IPADDR_H +#define CEPH_IPADDR_H + +class entity_addr_t; + +/* + * Find an IP address that is in the wanted subnet. + * + * If there are multiple matches, the first one is returned; this order + * is system-dependent and should not be relied on. 
+ */ +const struct ifaddrs *find_ip_in_subnet(const struct ifaddrs *addrs, + const struct sockaddr *net, + unsigned int prefix_len, + int numa_node = -1); + +/* + * Validate and parse IPv4 or IPv6 network + * + * Given a network (e.g. "192.168.0.0/24") and pointers to a sockaddr_storage + * struct and an unsigned int: + * + * if the network string is valid, return true and populate sockaddr_storage + * and prefix_len; + * + * if the network string is invalid, return false. + */ +bool parse_network(const char *s, + struct sockaddr_storage *network, + unsigned int *prefix_len); +bool parse_network(const char *s, + entity_addr_t *network, + unsigned int *prefix_len); + +void netmask_ipv6(const struct in6_addr *addr, + unsigned int prefix_len, + struct in6_addr *out); + +void netmask_ipv4(const struct in_addr *addr, + unsigned int prefix_len, + struct in_addr *out); + +bool network_contains( + const struct entity_addr_t& network, + unsigned int prefix_len, + const struct entity_addr_t& addr); + +#endif diff --git a/src/include/krbd.h b/src/include/krbd.h new file mode 100644 index 00000000..977d45fe --- /dev/null +++ b/src/include/krbd.h @@ -0,0 +1,97 @@ +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2014 Inktank Storage, Inc. + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. + * + */ + +#ifndef CEPH_KRBD_H +#define CEPH_KRBD_H + +#include "rados/librados.h" + +/* + * Don't wait for udev add uevents in krbd_map() and udev remove + * uevents in krbd_unmap*(). Instead, make do with the respective + * kernel uevents and return as soon as they are received. + * + * systemd-udevd sends out udev uevents after it finishes processing + * the respective kernel uevents, which mostly boils down to executing + * all matching udev rules. 
With this flag set, on return from + * krbd_map() systemd-udevd may still be poking at the device: it + * may still be open with tools such as blkid and various ioctls to + * be run against it, none of the persistent symlinks to the device + * node may be there, etc. udev used to be responsible for creating + * the device node as well, but that has been handled by devtmpfs in + * the kernel for many years now, so the device node (as returned + * through @pdevnode) is guaranteed to be there. + * + * If set, krbd_map() and krbd_unmap*() can be invoked from any + * network namespace that is owned by the initial user namespace + * (which is a formality because things like loading kernel modules + * and creating block devices are not namespaced and require global + * privileges, i.e. capabilities in the initial user namespace). + * Otherwise, krbd_map() and krbd_unmap*() must be invoked from + * the initial network namespace. + * + * If set, krbd_unmap*() doesn't attempt to settle the udev queue + * before retrying unmap for the last time. Some EBUSY errors due + * to systemd-udevd poking at the device at the time krbd_unmap*() + * is invoked that are otherwise covered by the retry logic may be + * returned. 
+ */ +#define KRBD_CTX_F_NOUDEV (1U << 0) + +#ifdef __cplusplus +extern "C" { +#endif + +struct krbd_ctx; + +int krbd_create_from_context(rados_config_t cct, uint32_t flags, + struct krbd_ctx **pctx); +void krbd_destroy(struct krbd_ctx *ctx); + +int krbd_map(struct krbd_ctx *ctx, + const char *pool_name, + const char *nspace_name, + const char *image_name, + const char *snap_name, + const char *options, + char **pdevnode); +int krbd_is_mapped(struct krbd_ctx *ctx, + const char *pool_name, + const char *nspace_name, + const char *image_name, + const char *snap_name, + char **pdevnode); + +int krbd_unmap(struct krbd_ctx *ctx, const char *devnode, + const char *options); +int krbd_unmap_by_spec(struct krbd_ctx *ctx, + const char *pool_name, + const char *nspace_name, + const char *image_name, + const char *snap_name, + const char *options); + +#ifdef __cplusplus +} +#endif + +#ifdef __cplusplus + +namespace ceph { + class Formatter; +} + +int krbd_showmapped(struct krbd_ctx *ctx, ceph::Formatter *f); + +#endif /* __cplusplus */ + +#endif /* CEPH_KRBD_H */ diff --git a/src/include/linux_fiemap.h b/src/include/linux_fiemap.h new file mode 100644 index 00000000..36046b5c --- /dev/null +++ b/src/include/linux_fiemap.h @@ -0,0 +1,73 @@ +/* + * FS_IOC_FIEMAP ioctl infrastructure. 
+ * + * Some portions copyright (C) 2007 Cluster File Systems, Inc + * + * Authors: Mark Fasheh <mfasheh@suse.com> + * Kalpak Shah <kalpak.shah@sun.com> + * Andreas Dilger <adilger@sun.com> + */ +#ifndef _LINUX_FIEMAP_H +#define _LINUX_FIEMAP_H + +#if defined(__linux__) +#include <linux/types.h> +#elif defined(__FreeBSD__) +#include <sys/types.h> +#endif + +#include "include/int_types.h" + +/* Describes one mapped extent (an element of fiemap::fm_extents). */ +struct fiemap_extent { + __u64 fe_logical; /* logical offset in bytes for the start of + * the extent from the beginning of the file */ + __u64 fe_physical; /* physical offset in bytes for the start + * of the extent from the beginning of the disk */ + __u64 fe_length; /* length in bytes for this extent */ + __u64 fe_reserved64[2]; + __u32 fe_flags; /* FIEMAP_EXTENT_* flags for this extent */ + __u32 fe_reserved[3]; +}; + +/* Mapping request header followed by the extent array; each field is + * annotated (in), (out) or (in/out) below. */ +struct fiemap { + __u64 fm_start; /* logical offset (inclusive) at + * which to start mapping (in) */ + __u64 fm_length; /* logical length of mapping which + * userspace wants (in) */ + __u32 fm_flags; /* FIEMAP_FLAG_* flags for request (in/out) */ + __u32 fm_mapped_extents;/* number of extents that were mapped (out) */ + __u32 fm_extent_count; /* size of fm_extents array (in) */ + __u32 fm_reserved; + struct fiemap_extent fm_extents[0]; /* array of mapped extents (out) */ +}; + +#define FIEMAP_MAX_OFFSET (~0ULL) + +#define FIEMAP_FLAG_SYNC 0x00000001 /* sync file data before map */ +#define FIEMAP_FLAG_XATTR 0x00000002 /* map extended attribute tree */ + +#define FIEMAP_FLAGS_COMPAT (FIEMAP_FLAG_SYNC | FIEMAP_FLAG_XATTR) + +#define FIEMAP_EXTENT_LAST 0x00000001 /* Last extent in file. */ +#define FIEMAP_EXTENT_UNKNOWN 0x00000002 /* Data location unknown. */ +#define FIEMAP_EXTENT_DELALLOC 0x00000004 /* Location still pending. + * Sets EXTENT_UNKNOWN. */ +#define FIEMAP_EXTENT_ENCODED 0x00000008 /* Data can not be read + * while fs is unmounted */ +#define FIEMAP_EXTENT_DATA_ENCRYPTED 0x00000080 /* Data is encrypted by fs. + * Sets EXTENT_NO_BYPASS. 
*/ +#define FIEMAP_EXTENT_NOT_ALIGNED 0x00000100 /* Extent offsets may not be + * block aligned. */ +#define FIEMAP_EXTENT_DATA_INLINE 0x00000200 /* Data mixed with metadata. + * Sets EXTENT_NOT_ALIGNED.*/ +#define FIEMAP_EXTENT_DATA_TAIL 0x00000400 /* Multiple files in block. + * Sets EXTENT_NOT_ALIGNED.*/ +#define FIEMAP_EXTENT_UNWRITTEN 0x00000800 /* Space allocated, but + * no data (i.e. zero). */ +#define FIEMAP_EXTENT_MERGED 0x00001000 /* File does not natively + * support extents. Result + * merged for efficiency. */ +#define FIEMAP_EXTENT_SHARED 0x00002000 /* Space shared with other + * files. */ + +#endif /* _LINUX_FIEMAP_H */ diff --git a/src/include/lru.h b/src/include/lru.h new file mode 100644 index 00000000..1e30cdfe --- /dev/null +++ b/src/include/lru.h @@ -0,0 +1,243 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net> + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. 
+ * + */ + + + +#ifndef CEPH_LRU_H +#define CEPH_LRU_H + +#include <math.h> +#include <stdint.h> + +#include "common/config.h" +#include "xlist.h" + +class LRUObject { +public: + LRUObject() : lru(), lru_link(this), lru_pinned(false) { } + ~LRUObject(); + + // pin/unpin item in cache + void lru_pin(); + void lru_unpin(); + bool lru_is_expireable() const { return !lru_pinned; } + + friend class LRU; +private: + class LRU *lru; + xlist<LRUObject *>::item lru_link; + bool lru_pinned; +}; + +class LRU { +public: + LRU() : num_pinned(0), midpoint(0.6) {} + + uint64_t lru_get_size() const { return lru_get_top()+lru_get_bot()+lru_get_pintail(); } + uint64_t lru_get_top() const { return top.size(); } + uint64_t lru_get_bot() const{ return bottom.size(); } + uint64_t lru_get_pintail() const { return pintail.size(); } + uint64_t lru_get_num_pinned() const { return num_pinned; } + + void lru_set_midpoint(double f) { midpoint = fmin(1.0, fmax(0.0, f)); } + + void lru_clear() { + while (!top.empty()) { + lru_remove(top.front()); + } + while (!bottom.empty()) { + lru_remove(bottom.front()); + } + while (!pintail.empty()) { + lru_remove(pintail.front()); + } + ceph_assert(num_pinned == 0); + } + + // insert at top of lru + void lru_insert_top(LRUObject *o) { + ceph_assert(!o->lru); + o->lru = this; + top.push_front(&o->lru_link); + if (o->lru_pinned) num_pinned++; + adjust(); + } + + // insert at mid point in lru + void lru_insert_mid(LRUObject *o) { + ceph_assert(!o->lru); + o->lru = this; + bottom.push_front(&o->lru_link); + if (o->lru_pinned) num_pinned++; + adjust(); + } + + // insert at bottom of lru + void lru_insert_bot(LRUObject *o) { + ceph_assert(!o->lru); + o->lru = this; + bottom.push_back(&o->lru_link); + if (o->lru_pinned) num_pinned++; + adjust(); + } + + // remove an item + LRUObject *lru_remove(LRUObject *o) { + if (!o->lru) return o; + auto list = o->lru_link.get_list(); + ceph_assert(list == &top || list == &bottom || list == &pintail); + 
o->lru_link.remove_myself(); + if (o->lru_pinned) num_pinned--; + o->lru = nullptr; + adjust(); + return o; + } + + // touch item -- move to head of lru + bool lru_touch(LRUObject *o) { + if (!o->lru) { + lru_insert_top(o); + } else { + ceph_assert(o->lru == this); + auto list = o->lru_link.get_list(); + ceph_assert(list == &top || list == &bottom || list == &pintail); + top.push_front(&o->lru_link); + adjust(); + } + return true; + } + + // touch item -- move to midpoint (unless already higher) + bool lru_midtouch(LRUObject *o) { + if (!o->lru) { + lru_insert_mid(o); + } else { + ceph_assert(o->lru == this); + auto list = o->lru_link.get_list(); + ceph_assert(list == &top || list == &bottom || list == &pintail); + if (list == &top) return false; + bottom.push_front(&o->lru_link); + adjust(); + } + return true; + } + + // touch item -- move to bottom + bool lru_bottouch(LRUObject *o) { + if (!o->lru) { + lru_insert_bot(o); + } else { + ceph_assert(o->lru == this); + auto list = o->lru_link.get_list(); + ceph_assert(list == &top || list == &bottom || list == &pintail); + bottom.push_back(&o->lru_link); + adjust(); + } + return true; + } + + void lru_touch_entire_pintail() { + // promote entire pintail to the top lru + while (pintail.size() > 0) { + top.push_back(&pintail.front()->lru_link); + adjust(); + } + } + + // expire -- expire a single item + LRUObject *lru_get_next_expire() { + adjust(); + // look through tail of bot + while (bottom.size()) { + LRUObject *p = bottom.back(); + if (!p->lru_pinned) return p; + + // move to pintail + pintail.push_front(&p->lru_link); + } + + // ok, try head then + while (top.size()) { + LRUObject *p = top.back(); + if (!p->lru_pinned) return p; + + // move to pintail + pintail.push_front(&p->lru_link); + } + + // no luck! 
+ return NULL; + } + + LRUObject *lru_expire() { + LRUObject *p = lru_get_next_expire(); + if (p) + return lru_remove(p); + return NULL; + } + + void lru_status() { + //generic_dout(10) << "lru: " << lru_get_size() << " items, " << top.size() << " top, " << bottom.size() << " bot, " << pintail.size() << " pintail" << dendl; + } + +protected: + // adjust top/bot balance, as necessary + void adjust() { + uint64_t toplen = top.size(); + uint64_t topwant = (midpoint * (double)(lru_get_size() - num_pinned)); + /* move items from below midpoint (bottom) to top: move midpoint forward */ + for (uint64_t i = toplen; i < topwant; i++) { + top.push_back(&bottom.front()->lru_link); + } + /* or: move items from above midpoint (top) to bottom: move midpoint backwards */ + for (uint64_t i = toplen; i > topwant; i--) { + bottom.push_front(&top.back()->lru_link); + } + } + + uint64_t num_pinned; + double midpoint; + + friend class LRUObject; +private: + typedef xlist<LRUObject *> LRUList; + LRUList top, bottom, pintail; +}; + +inline LRUObject::~LRUObject() { + if (lru) { + lru->lru_remove(this); + } +} + +inline void LRUObject::lru_pin() { + if (lru && !lru_pinned) { + lru->num_pinned++; + } + lru_pinned = true; +} + +inline void LRUObject::lru_unpin() { + if (lru && lru_pinned) { + lru->num_pinned--; + + // move from pintail -> bot + if (lru_link.get_list() == &lru->pintail) { + lru->lru_bottouch(this); + } + } + lru_pinned = false; +} + +#endif diff --git a/src/include/mempool.h b/src/include/mempool.h new file mode 100644 index 00000000..9cee3825 --- /dev/null +++ b/src/include/mempool.h @@ -0,0 +1,547 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2016 Allen Samuels <allen.samuels@sandisk.com> + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published 
by the Free Software + * Foundation. See file COPYING. + * + */ + +#ifndef _CEPH_INCLUDE_MEMPOOL_H +#define _CEPH_INCLUDE_MEMPOOL_H + +#include <cstddef> +#include <map> +#include <unordered_map> +#include <set> +#include <vector> +#include <list> +#include <mutex> +#include <atomic> +#include <typeinfo> +#include <boost/container/flat_set.hpp> +#include <boost/container/flat_map.hpp> + +#include <common/Formatter.h> +#include "include/ceph_assert.h" +#include "include/compact_map.h" +#include "include/compact_set.h" + + +/* + +Memory Pools +============ + +A memory pool is a method for accounting the consumption of memory of +a set of containers. + +Memory pools are statically declared (see pool_index_t). + +Each memory pool tracks the number of bytes and items it contains. + +Allocators can be declared and associated with a type so that they are +tracked independently of the pool total. This additional accounting +is optional and only incurs an overhead if the debugging is enabled at +runtime. This allows developers to see what types are consuming the +pool resources. + + +Declaring +--------- + +Using memory pools is very easy. + +To create a new memory pool, simply add a new name into the list of +memory pools that's defined in "DEFINE_MEMORY_POOLS_HELPER". That's +it. :) + +For each memory pool that's created a C++ namespace is also +automatically created (name is same as in DEFINE_MEMORY_POOLS_HELPER). +That namespace contains a set of common STL containers that are predefined +with the appropriate allocators. + +Thus for mempool "osd" we have automatically available to us: + + mempool::osd::map + mempool::osd::multimap + mempool::osd::set + mempool::osd::multiset + mempool::osd::list + mempool::osd::vector + mempool::osd::unordered_map + + +Putting objects in a mempool +---------------------------- + +In order to use a memory pool with a particular type, a few additional +declarations are needed. 
+ +For a class: + + struct Foo { + MEMPOOL_CLASS_HELPERS(); + ... + }; + +Then, in an appropriate .cc file, + + MEMPOOL_DEFINE_OBJECT_FACTORY(Foo, foo, osd); + +The second argument can generally be identical to the first, except +when the type contains a nested scope. For example, for +BlueStore::Onode, we need to do + + MEMPOOL_DEFINE_OBJECT_FACTORY(BlueStore::Onode, bluestore_onode, + bluestore_meta); + +(This is just because we need to name some static variables and we +can't use :: in a variable name.) + +XXX Note: the new operator hard-codes the allocation size to the size of the +object given in MEMPOOL_DEFINE_OBJECT_FACTORY. For this reason, you cannot +incorporate mempools into a base class without also defining a helper/factory +for the child class as well (as the base class is usually smaller than the +child class). + +In order to use the STL containers, simply use the namespaced variant +of the container type. For example, + + mempool::osd::vector<int> myvec; + +Introspection +------------- + +The simplest way to interrogate the process is with + + Formatter *f = ... + mempool::dump(f); + +This will dump information about *all* memory pools. When debug mode +is enabled, the runtime complexity of dump is O(num_shards * +num_types). When debug mode is disabled it is O(num_shards). + +You can also interrogate a specific pool programmatically with + + size_t bytes = mempool::unittest_2::allocated_bytes(); + size_t items = mempool::unittest_2::allocated_items(); + +The runtime complexity is O(num_shards). + +Note that you cannot easily query per-type, primarily because debug +mode is optional and you should not rely on that information being +available. 
+ +*/ + +namespace mempool { + +// -------------------------------------------------------------- +// define memory pools + +#define DEFINE_MEMORY_POOLS_HELPER(f) \ + f(bloom_filter) \ + f(bluestore_alloc) \ + f(bluestore_cache_data) \ + f(bluestore_cache_onode) \ + f(bluestore_cache_meta) \ + f(bluestore_cache_other) \ + f(bluestore_Buffer) \ + f(bluestore_Extent) \ + f(bluestore_Blob) \ + f(bluestore_SharedBlob) \ + f(bluestore_inline_bl) \ + f(bluestore_fsck) \ + f(bluestore_txc) \ + f(bluestore_writing_deferred) \ + f(bluestore_writing) \ + f(bluefs) \ + f(bluefs_file_reader) \ + f(bluefs_file_writer) \ + f(buffer_anon) \ + f(buffer_meta) \ + f(osd) \ + f(osd_mapbl) \ + f(osd_pglog) \ + f(osdmap) \ + f(osdmap_mapping) \ + f(pgmap) \ + f(mds_co) \ + f(unittest_1) \ + f(unittest_2) + + +// give them integer ids +#define P(x) mempool_##x, +enum pool_index_t { + DEFINE_MEMORY_POOLS_HELPER(P) + num_pools // Must be last. +}; +#undef P + +extern bool debug_mode; +extern void set_debug_mode(bool d); + +// -------------------------------------------------------------- +class pool_t; + +// we shard pool stats across many shard_t's to reduce the amount +// of cacheline ping pong. 
+enum { + num_shard_bits = 5 +}; +enum { + num_shards = 1 << num_shard_bits +}; + +// align shard to a cacheline +struct shard_t { + std::atomic<size_t> bytes = {0}; + std::atomic<size_t> items = {0}; + char __padding[128 - sizeof(std::atomic<size_t>)*2]; +} __attribute__ ((aligned (128))); + +static_assert(sizeof(shard_t) == 128, "shard_t should be cacheline-sized"); + +struct stats_t { + ssize_t items = 0; + ssize_t bytes = 0; + void dump(ceph::Formatter *f) const { + f->dump_int("items", items); + f->dump_int("bytes", bytes); + } + + stats_t& operator+=(const stats_t& o) { + items += o.items; + bytes += o.bytes; + return *this; + } +}; + +pool_t& get_pool(pool_index_t ix); +const char *get_pool_name(pool_index_t ix); + +struct type_t { + const char *type_name; + size_t item_size; + std::atomic<ssize_t> items = {0}; // signed +}; + +struct type_info_hash { + std::size_t operator()(const std::type_info& k) const { + return k.hash_code(); + } +}; + +class pool_t { + shard_t shard[num_shards]; + + mutable std::mutex lock; // only used for types list + std::unordered_map<const char *, type_t> type_map; + +public: + // + // How much this pool consumes. O(<num_shards>) + // + size_t allocated_bytes() const; + size_t allocated_items() const; + + void adjust_count(ssize_t items, ssize_t bytes); + + static size_t pick_a_shard_int() { + // Dirt cheap, see: + // https://fossies.org/dox/glibc-2.32/pthread__self_8c_source.html + size_t me = (size_t)pthread_self(); + size_t i = (me >> 12) & ((1 << num_shard_bits) - 1); + return i; + } + + shard_t* pick_a_shard() { + size_t i = pick_a_shard_int(); + return &shard[i]; + } + + type_t *get_type(const std::type_info& ti, size_t size) { + std::lock_guard<std::mutex> l(lock); + auto p = type_map.find(ti.name()); + if (p != type_map.end()) { + return &p->second; + } + type_t &t = type_map[ti.name()]; + t.type_name = ti.name(); + t.item_size = size; + return &t; + } + + // get pool stats. 
by_type is not populated if !debug + void get_stats(stats_t *total, + std::map<std::string, stats_t> *by_type) const; + + void dump(ceph::Formatter *f, stats_t *ptotal=0) const; +}; + +void dump(ceph::Formatter *f); + + +// STL allocator for use with containers. The pool is looked up from +// the pool_ix template argument (see init()), which saves us from +// passing the allocator to container constructors. All instances for +// the same pool compare equal. + +template<pool_index_t pool_ix, typename T> +class pool_allocator { + pool_t *pool; + type_t *type = nullptr; // per-type counters; only set in debug mode or when forced + +public: + typedef pool_allocator<pool_ix, T> allocator_type; + typedef T value_type; + typedef value_type *pointer; + typedef const value_type * const_pointer; + typedef value_type& reference; + typedef const value_type& const_reference; + typedef std::size_t size_type; + typedef std::ptrdiff_t difference_type; + + template<typename U> struct rebind { + typedef pool_allocator<pool_ix,U> other; + }; + + // bind to the pool; register per-type accounting if debug_mode or force_register + void init(bool force_register) { + pool = &get_pool(pool_ix); + if (debug_mode || force_register) { + type = pool->get_type(typeid(T), sizeof(T)); + } + } + + pool_allocator(bool force_register=false) { + init(force_register); + } + template<typename U> + pool_allocator(const pool_allocator<pool_ix,U>&) { + init(false); + } + + // allocate storage for n objects of T and charge it to the pool; + // throws std::bad_alloc on failure (p is an unused hint) + T* allocate(size_t n, void *p = nullptr) { + if (n > static_cast<size_t>(-1) / sizeof(T)) + throw std::bad_alloc(); // sizeof(T) * n would overflow + size_t total = sizeof(T) * n; + // allocate first: if new throws, the pool counters must stay untouched + T* r = reinterpret_cast<T*>(new char[total]); + shard_t *shard = pool->pick_a_shard(); + shard->bytes += total; + shard->items += n; + if (type) { + type->items += n; + } + return r; + } + + // release storage obtained from allocate(n) and credit the pool + void deallocate(T* p, size_t n) { + size_t total = sizeof(T) * n; + shard_t *shard = pool->pick_a_shard(); + shard->bytes -= total; + shard->items -= n; + if (type) { + type->items -= n; + } + delete[] reinterpret_cast<char*>(p); + } + + // like allocate(), but the storage is aligned to align via posix_memalign; + // must be released with deallocate_aligned() + T* allocate_aligned(size_t n, size_t align, void *p = nullptr) { + if (n > static_cast<size_t>(-1) / sizeof(T)) + throw std::bad_alloc(); // sizeof(T) * n would overflow + size_t total = sizeof(T) * n; + char *ptr; 
+ int rc = ::posix_memalign((void**)(void*)&ptr, align, total); + if (rc) + throw std::bad_alloc(); + // account only after the allocation succeeded + shard_t *shard = pool->pick_a_shard(); + shard->bytes += total; + shard->items += n; + if (type) { + type->items += n; + } + T* r = reinterpret_cast<T*>(ptr); + return r; + } + + // release storage obtained from allocate_aligned(n, ...) and credit the pool + void deallocate_aligned(T* p, size_t n) { + size_t total = sizeof(T) * n; + shard_t *shard = pool->pick_a_shard(); + shard->bytes -= total; + shard->items -= n; + if (type) { + type->items -= n; + } + ::free(p); + } + + void destroy(T* p) { + p->~T(); + } + + template<class U> + void destroy(U *p) { + p->~U(); + } + + void construct(T* p, const T& val) { + ::new ((void *)p) T(val); + } + + template<class U, class... Args> void construct(U* p,Args&&... args) { + ::new((void *)p) U(std::forward<Args>(args)...); + } + + bool operator==(const pool_allocator&) const { return true; } + bool operator!=(const pool_allocator&) const { return false; } +}; + + +// Namespace mempool + +#define P(x) \ + namespace x { \ + static const mempool::pool_index_t id = mempool::mempool_##x; \ + template<typename v> \ + using pool_allocator = mempool::pool_allocator<id,v>; \ + \ + using string = std::basic_string<char,std::char_traits<char>, \ + pool_allocator<char>>; \ + \ + template<typename k,typename v, typename cmp = std::less<k> > \ + using map = std::map<k, v, cmp, \ + pool_allocator<std::pair<const k,v>>>; \ + \ + template<typename k,typename v, typename cmp = std::less<k> > \ + using compact_map = compact_map<k, v, cmp, \ + pool_allocator<std::pair<const k,v>>>; \ + \ + template<typename k,typename v, typename cmp = std::less<k> > \ + using compact_multimap = compact_multimap<k, v, cmp, \ + pool_allocator<std::pair<const k,v>>>; \ + \ + template<typename k, typename cmp = std::less<k> > \ + using compact_set = compact_set<k, cmp, pool_allocator<k>>; \ + \ + template<typename k,typename v, typename cmp = std::less<k> > \ + using multimap = std::multimap<k,v,cmp, \ + pool_allocator<std::pair<const k, \ + v>>>; \ + \ + template<typename k, typename cmp = std::less<k> > \ + using set = std::set<k,cmp,pool_allocator<k>>; \ + \ + 
template<typename k, typename cmp = std::less<k> > \ + using flat_set = boost::container::flat_set<k,cmp,pool_allocator<k>>; \ + \ + template<typename k, typename v, typename cmp = std::less<k> > \ + using flat_map = boost::container::flat_map<k,v,cmp, \ + pool_allocator<std::pair<k,v>>>; \ + \ + template<typename v> \ + using list = std::list<v,pool_allocator<v>>; \ + \ + template<typename v> \ + using vector = std::vector<v,pool_allocator<v>>; \ + \ + template<typename k, typename v, \ + typename h=std::hash<k>, \ + typename eq = std::equal_to<k>> \ + using unordered_map = \ + std::unordered_map<k,v,h,eq,pool_allocator<std::pair<const k,v>>>;\ + \ + inline size_t allocated_bytes() { \ + return mempool::get_pool(id).allocated_bytes(); \ + } \ + inline size_t allocated_items() { \ + return mempool::get_pool(id).allocated_items(); \ + } \ + }; + +DEFINE_MEMORY_POOLS_HELPER(P) + +#undef P + +}; + +// the elements allocated by mempool is in the same memory space as the ones +// allocated by the default allocator. so compare them in an efficient way: +// libstdc++'s std::equal is specialized to use memcmp if T is integer or +// pointer. this is good enough for our usecase. use +// std::is_trivially_copyable<T> to expand the support to more types if +// nececssary. 
+template<typename T, mempool::pool_index_t pool_index> +bool operator==(const std::vector<T, std::allocator<T>>& lhs, + const std::vector<T, mempool::pool_allocator<pool_index, T>>& rhs) +{ + return (lhs.size() == rhs.size() && + std::equal(lhs.begin(), lhs.end(), rhs.begin())); +} + +template<typename T, mempool::pool_index_t pool_index> +bool operator!=(const std::vector<T, std::allocator<T>>& lhs, + const std::vector<T, mempool::pool_allocator<pool_index, T>>& rhs) +{ + return !(lhs == rhs); +} + +template<typename T, mempool::pool_index_t pool_index> +bool operator==(const std::vector<T, mempool::pool_allocator<pool_index, T>>& lhs, + const std::vector<T, std::allocator<T>>& rhs) +{ + return rhs == lhs; +} + +template<typename T, mempool::pool_index_t pool_index> +bool operator!=(const std::vector<T, mempool::pool_allocator<pool_index, T>>& lhs, + const std::vector<T, std::allocator<T>>& rhs) +{ + return !(lhs == rhs); +} + +// Use this for any type that is contained by a container (unless it +// is a class you defined; see below). +#define MEMPOOL_DECLARE_FACTORY(obj, factoryname, pool) \ + namespace mempool { \ + namespace pool { \ + extern pool_allocator<obj> alloc_##factoryname; \ + } \ + } + +#define MEMPOOL_DEFINE_FACTORY(obj, factoryname, pool) \ + namespace mempool { \ + namespace pool { \ + pool_allocator<obj> alloc_##factoryname = {true}; \ + } \ + } + +// Use this for each class that belongs to a mempool. For example, +// +// class T { +// MEMPOOL_CLASS_HELPERS(); +// ... +// }; +// +#define MEMPOOL_CLASS_HELPERS() \ + void *operator new(size_t size); \ + void *operator new[](size_t size) noexcept { \ + ceph_abort_msg("no array new"); \ + return nullptr; } \ + void operator delete(void *); \ + void operator delete[](void *) { ceph_abort_msg("no array delete"); } + + +// Use this in some particular .cc file to match each class with a +// MEMPOOL_CLASS_HELPERS(). 
+#define MEMPOOL_DEFINE_OBJECT_FACTORY(obj,factoryname,pool) \ + MEMPOOL_DEFINE_FACTORY(obj, factoryname, pool) \ + void *obj::operator new(size_t size) { \ + return mempool::pool::alloc_##factoryname.allocate(1); \ + } \ + void obj::operator delete(void *p) { \ + return mempool::pool::alloc_##factoryname.deallocate((obj*)p, 1); \ + } + +#endif diff --git a/src/include/msgr.h b/src/include/msgr.h new file mode 100644 index 00000000..f7b2a078 --- /dev/null +++ b/src/include/msgr.h @@ -0,0 +1,254 @@ +#ifndef CEPH_MSGR_H +#define CEPH_MSGR_H + +#ifndef __KERNEL__ +#include <sys/socket.h> // for struct sockaddr_storage +#endif + +#include "include/int_types.h" + +/* See comment in ceph_fs.h. */ +#ifndef __KERNEL__ +#include "byteorder.h" +#define __le16 ceph_le16 +#define __le32 ceph_le32 +#define __le64 ceph_le64 +#endif + +/* + * Data types for message passing layer used by Ceph. + */ + +#define CEPH_MON_PORT_LEGACY 6789 /* legacy default monitor port */ +#define CEPH_MON_PORT_IANA 3300 /* IANA monitor port */ + +/* + * client-side processes will try to bind to ports in this + * range, simply for the benefit of tools like nmap or wireshark + * that would like to identify the protocol. + */ +#define CEPH_PORT_FIRST 6789 + +/* + * tcp connection banner. include a protocol version. and adjust + * whenever the wire protocol changes. try to keep this string length + * constant. + */ +#define CEPH_BANNER "ceph v027" + + +/* + * messenger V2 connection banner prefix. + * The full banner string should have the form: "ceph v2\n<le16>" + * the 2 bytes are the length of the remaining banner. 
+ */ +#define CEPH_BANNER_V2_PREFIX "ceph v2\n" + +/* + * messenger V2 features + */ +#define CEPH_MSGR2_INCARNATION_1 (0ull) + +#define DEFINE_MSGR2_FEATURE(bit, incarnation, name) \ + const static uint64_t CEPH_MSGR2_FEATURE_##name = (1ULL << (bit)); \ + const static uint64_t CEPH_MSGR2_FEATUREMASK_##name = \ + (1ULL << (bit) | CEPH_MSGR2_INCARNATION_##incarnation); + +#define HAVE_MSGR2_FEATURE(x, name) \ + (((x) & (CEPH_MSGR2_FEATUREMASK_##name)) == (CEPH_MSGR2_FEATUREMASK_##name)) + +DEFINE_MSGR2_FEATURE( 0, 1, REVISION_1) // msgr2.1 + +#define CEPH_MSGR2_SUPPORTED_FEATURES (CEPH_MSGR2_FEATURE_REVISION_1) + +#define CEPH_MSGR2_REQUIRED_FEATURES (0ull) + + +/* + * Rollover-safe type and comparator for 32-bit sequence numbers. + * Comparator returns a negative value, zero, or a positive value when + * a is respectively before, equal to, or after b (modulo 2^32). + */ +typedef __u32 ceph_seq_t; + +static inline __s32 ceph_seq_cmp(__u32 a, __u32 b) +{ + /* subtract as unsigned (wraps, well defined), then reinterpret as + * signed; subtracting two signed values could overflow */ + return (__s32)(a - b); +} + + +/* + * entity_name -- logical name for a process participating in the + * network, e.g. 'mds0' or 'osd3'. + */ +struct ceph_entity_name { + __u8 type; /* CEPH_ENTITY_TYPE_* */ + __le64 num; +} __attribute__ ((packed)); + +#define CEPH_ENTITY_TYPE_MON 0x01 +#define CEPH_ENTITY_TYPE_MDS 0x02 +#define CEPH_ENTITY_TYPE_OSD 0x04 +#define CEPH_ENTITY_TYPE_CLIENT 0x08 +#define CEPH_ENTITY_TYPE_MGR 0x10 +#define CEPH_ENTITY_TYPE_AUTH 0x20 + +#define CEPH_ENTITY_TYPE_ANY 0xFF + +extern const char *ceph_entity_type_name(int type); + +/* + * entity_addr -- network address + */ +struct ceph_entity_addr { + __le32 type; + __le32 nonce; /* unique id for process (e.g. 
pid) */ + struct sockaddr_storage in_addr; +} __attribute__ ((packed)); + +struct ceph_entity_inst { + struct ceph_entity_name name; + struct ceph_entity_addr addr; +} __attribute__ ((packed)); + + +/* used by message exchange protocol */ +#define CEPH_MSGR_TAG_READY 1 /* server->client: ready for messages */ +#define CEPH_MSGR_TAG_RESETSESSION 2 /* server->client: reset, try again */ +#define CEPH_MSGR_TAG_WAIT 3 /* server->client: wait for racing + incoming connection */ +#define CEPH_MSGR_TAG_RETRY_SESSION 4 /* server->client + cseq: try again + with higher cseq */ +#define CEPH_MSGR_TAG_RETRY_GLOBAL 5 /* server->client + gseq: try again + with higher gseq */ +#define CEPH_MSGR_TAG_CLOSE 6 /* closing pipe */ +#define CEPH_MSGR_TAG_MSG 7 /* message */ +#define CEPH_MSGR_TAG_ACK 8 /* message ack */ +#define CEPH_MSGR_TAG_KEEPALIVE 9 /* just a keepalive byte! */ +#define CEPH_MSGR_TAG_BADPROTOVER 10 /* bad protocol version */ +#define CEPH_MSGR_TAG_BADAUTHORIZER 11 /* bad authorizer */ +#define CEPH_MSGR_TAG_FEATURES 12 /* insufficient features */ +#define CEPH_MSGR_TAG_SEQ 13 /* 64-bit int follows with seen seq number */ +#define CEPH_MSGR_TAG_KEEPALIVE2 14 +#define CEPH_MSGR_TAG_KEEPALIVE2_ACK 15 /* keepalive reply */ +#define CEPH_MSGR_TAG_CHALLENGE_AUTHORIZER 16 /* ceph v2 doing server challenge */ + +/* + * connection negotiation + */ +struct ceph_msg_connect { + __le64 features; /* supported feature bits */ + __le32 host_type; /* CEPH_ENTITY_TYPE_* */ + __le32 global_seq; /* count connections initiated by this host */ + __le32 connect_seq; /* count connections initiated in this session */ + __le32 protocol_version; + __le32 authorizer_protocol; + __le32 authorizer_len; + __u8 flags; /* CEPH_MSG_CONNECT_* */ +} __attribute__ ((packed)); + +struct ceph_msg_connect_reply { + __u8 tag; + __le64 features; /* feature bits for this session */ + __le32 global_seq; + __le32 connect_seq; + __le32 protocol_version; + __le32 authorizer_len; + __u8 flags; +} __attribute__ 
((packed)); + +#define CEPH_MSG_CONNECT_LOSSY 1 /* messages i send may be safely dropped */ + + +/* + * message header + */ +struct ceph_msg_header_old { + __le64 seq; /* message seq# for this session */ + __le64 tid; /* transaction id */ + __le16 type; /* message type */ + __le16 priority; /* priority. higher value == higher priority */ + __le16 version; /* version of message encoding */ + + __le32 front_len; /* bytes in main payload */ + __le32 middle_len;/* bytes in middle payload */ + __le32 data_len; /* bytes of data payload */ + __le16 data_off; /* sender: include full offset; + receiver: mask against ~PAGE_MASK */ + + struct ceph_entity_inst src, orig_src; + __le32 reserved; + __le32 crc; /* header crc32c */ +} __attribute__ ((packed)); + +struct ceph_msg_header { + __le64 seq; /* message seq# for this session */ + __le64 tid; /* transaction id */ + __le16 type; /* message type */ + __le16 priority; /* priority. higher value == higher priority */ + __le16 version; /* version of message encoding */ + + __le32 front_len; /* bytes in main payload */ + __le32 middle_len;/* bytes in middle payload */ + __le32 data_len; /* bytes of data payload */ + __le16 data_off; /* sender: include full offset; + receiver: mask against ~PAGE_MASK */ + + struct ceph_entity_name src; + + /* oldest code we think can decode this. unknown if zero. */ + __le16 compat_version; + __le16 reserved; + __le32 crc; /* header crc32c */ +} __attribute__ ((packed)); + +struct ceph_msg_header2 { + __le64 seq; /* message seq# for this session */ + __le64 tid; /* transaction id */ + __le16 type; /* message type */ + __le16 priority; /* priority. higher value == higher priority */ + __le16 version; /* version of message encoding */ + + __le32 data_pre_padding_len; + __le16 data_off; /* sender: include full offset; + receiver: mask against ~PAGE_MASK */ + + __le64 ack_seq; + __u8 flags; + /* oldest code we think can decode this. unknown if zero. 
*/ + __le16 compat_version; + __le16 reserved; +} __attribute__ ((packed)); + +#define CEPH_MSG_PRIO_LOW 64 +#define CEPH_MSG_PRIO_DEFAULT 127 +#define CEPH_MSG_PRIO_HIGH 196 +#define CEPH_MSG_PRIO_HIGHEST 255 + +/* + * follows data payload + * ceph_msg_footer_old does not support digital signatures on messages PLR + */ + +struct ceph_msg_footer_old { + __le32 front_crc, middle_crc, data_crc; + __u8 flags; +} __attribute__ ((packed)); + +struct ceph_msg_footer { + __le32 front_crc, middle_crc, data_crc; + // sig holds the 64 bits of the digital signature for the message PLR + __le64 sig; + __u8 flags; +} __attribute__ ((packed)); + +#define CEPH_MSG_FOOTER_COMPLETE (1<<0) /* msg wasn't aborted */ +#define CEPH_MSG_FOOTER_NOCRC (1<<1) /* no data crc */ +#define CEPH_MSG_FOOTER_SIGNED (1<<2) /* msg was signed */ + +#ifndef __KERNEL__ +#undef __le16 +#undef __le32 +#undef __le64 +#endif + +#endif diff --git a/src/include/object.h b/src/include/object.h new file mode 100644 index 00000000..99ca58f9 --- /dev/null +++ b/src/include/object.h @@ -0,0 +1,214 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net> + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. 
+ * + */ + +#ifndef CEPH_OBJECT_H +#define CEPH_OBJECT_H + +#include <stdint.h> +#include <stdio.h> + +#include <iosfwd> +#include <iomanip> + +#include "include/rados.h" +#include "include/unordered_map.h" + +#include "hash.h" +#include "encoding.h" +#include "ceph_hash.h" +#include "cmp.h" + +using namespace std; + +struct object_t { + string name; + + object_t() {} + // cppcheck-suppress noExplicitConstructor + object_t(const char *s) : name(s) {} + // cppcheck-suppress noExplicitConstructor + object_t(const string& s) : name(s) {} + + void swap(object_t& o) { + name.swap(o.name); + } + void clear() { + name.clear(); + } + + void encode(bufferlist &bl) const { + using ceph::encode; + encode(name, bl); + } + void decode(bufferlist::const_iterator &bl) { + using ceph::decode; + decode(name, bl); + } +}; +WRITE_CLASS_ENCODER(object_t) + +inline bool operator==(const object_t& l, const object_t& r) { + return l.name == r.name; +} +inline bool operator!=(const object_t& l, const object_t& r) { + return l.name != r.name; +} +inline bool operator>(const object_t& l, const object_t& r) { + return l.name > r.name; +} +inline bool operator<(const object_t& l, const object_t& r) { + return l.name < r.name; +} +inline bool operator>=(const object_t& l, const object_t& r) { + return l.name >= r.name; +} +inline bool operator<=(const object_t& l, const object_t& r) { + return l.name <= r.name; +} +inline ostream& operator<<(ostream& out, const object_t& o) { + return out << o.name; +} + +namespace std { + template<> struct hash<object_t> { + size_t operator()(const object_t& r) const { + //static hash<string> H; + //return H(r.name); + return ceph_str_hash_linux(r.name.c_str(), r.name.length()); + } + }; +} // namespace std + + +struct file_object_t { + uint64_t ino, bno; + mutable char buf[34]; + + file_object_t(uint64_t i=0, uint64_t b=0) : ino(i), bno(b) { + buf[0] = 0; + } + + const char *c_str() const { + if (!buf[0]) + snprintf(buf, sizeof(buf), "%llx.%08llx", (long 
long unsigned)ino, (long long unsigned)bno); + return buf; + } + + operator object_t() { + return object_t(c_str()); + } +}; + + +// --------------------------- +// snaps + +struct snapid_t { + uint64_t val; + // cppcheck-suppress noExplicitConstructor + snapid_t(uint64_t v=0) : val(v) {} + snapid_t operator+=(snapid_t o) { val += o.val; return *this; } + snapid_t operator++() { ++val; return *this; } + operator uint64_t() const { return val; } +}; + +inline void encode(snapid_t i, bufferlist &bl) { encode(i.val, bl); } +inline void decode(snapid_t &i, bufferlist::const_iterator &p) { decode(i.val, p); } + +template<> +struct denc_traits<snapid_t> { + static constexpr bool supported = true; + static constexpr bool featured = false; + static constexpr bool bounded = true; + static constexpr bool need_contiguous = true; + static void bound_encode(const snapid_t& o, size_t& p) { + denc(o.val, p); + } + static void encode(const snapid_t &o, buffer::list::contiguous_appender& p) { + denc(o.val, p); + } + static void decode(snapid_t& o, buffer::ptr::const_iterator &p) { + denc(o.val, p); + } +}; + +inline ostream& operator<<(ostream& out, const snapid_t& s) { + if (s == CEPH_NOSNAP) + return out << "head"; + else if (s == CEPH_SNAPDIR) + return out << "snapdir"; + else + return out << hex << s.val << dec; +} + + +struct sobject_t { + object_t oid; + snapid_t snap; + + sobject_t() : snap(0) {} + sobject_t(object_t o, snapid_t s) : oid(o), snap(s) {} + + void swap(sobject_t& o) { + oid.swap(o.oid); + snapid_t t = snap; + snap = o.snap; + o.snap = t; + } + + void encode(bufferlist& bl) const { + using ceph::encode; + encode(oid, bl); + encode(snap, bl); + } + void decode(bufferlist::const_iterator& bl) { + using ceph::decode; + decode(oid, bl); + decode(snap, bl); + } +}; +WRITE_CLASS_ENCODER(sobject_t) + +inline bool operator==(const sobject_t &l, const sobject_t &r) { + return l.oid == r.oid && l.snap == r.snap; +} +inline bool operator!=(const sobject_t &l, const 
sobject_t &r) { + return l.oid != r.oid || l.snap != r.snap; +} +inline bool operator>(const sobject_t &l, const sobject_t &r) { + return l.oid > r.oid || (l.oid == r.oid && l.snap > r.snap); +} +inline bool operator<(const sobject_t &l, const sobject_t &r) { + return l.oid < r.oid || (l.oid == r.oid && l.snap < r.snap); +} +inline bool operator>=(const sobject_t &l, const sobject_t &r) { + return l.oid > r.oid || (l.oid == r.oid && l.snap >= r.snap); +} +inline bool operator<=(const sobject_t &l, const sobject_t &r) { + return l.oid < r.oid || (l.oid == r.oid && l.snap <= r.snap); +} +inline ostream& operator<<(ostream& out, const sobject_t &o) { + return out << o.oid << "/" << o.snap; +} +namespace std { + template<> struct hash<sobject_t> { + size_t operator()(const sobject_t &r) const { + static hash<object_t> H; + static rjhash<uint64_t> I; + return H(r.oid) ^ I(r.snap); + } + }; +} // namespace std + +#endif diff --git a/src/include/on_exit.h b/src/include/on_exit.h new file mode 100644 index 00000000..c412ab33 --- /dev/null +++ b/src/include/on_exit.h @@ -0,0 +1,49 @@ +#ifndef CEPH_ON_EXIT_H +#define CEPH_ON_EXIT_H + +#include <pthread.h> +#include <vector> + +#include "include/ceph_assert.h" +/* + * Create a static instance at the file level to get callbacks called when the + * process exits via main() or exit(). 
+ */ + +class OnExitManager { + public: + typedef void (*callback_t)(void *arg); + + OnExitManager() { + int ret = pthread_mutex_init(&lock_, NULL); + ceph_assert(ret == 0); + } + + ~OnExitManager() { + pthread_mutex_lock(&lock_); + std::vector<struct cb>::iterator it; + for (it = funcs_.begin(); it != funcs_.end(); it++) { + it->func(it->arg); + } + funcs_.clear(); + pthread_mutex_unlock(&lock_); + } + + void add_callback(callback_t func, void *arg) { + pthread_mutex_lock(&lock_); + struct cb callback = { func, arg }; + funcs_.push_back(callback); + pthread_mutex_unlock(&lock_); + } + + private: + struct cb { + callback_t func; + void *arg; + }; + + std::vector<struct cb> funcs_; + pthread_mutex_t lock_; +}; + +#endif diff --git a/src/include/page.h b/src/include/page.h new file mode 100644 index 00000000..db6e2058 --- /dev/null +++ b/src/include/page.h @@ -0,0 +1,18 @@ +#ifndef CEPH_PAGE_H +#define CEPH_PAGE_H + +namespace ceph { + // these are in common/page.cc + extern unsigned _page_size; + extern unsigned long _page_mask; + extern unsigned _page_shift; +} + +#endif + + +#define CEPH_PAGE_SIZE ceph::_page_size +#define CEPH_PAGE_MASK ceph::_page_mask +#define CEPH_PAGE_SHIFT ceph::_page_shift + + diff --git a/src/include/rados.h b/src/include/rados.h new file mode 100644 index 00000000..bbcf0867 --- /dev/null +++ b/src/include/rados.h @@ -0,0 +1,681 @@ +#ifndef CEPH_RADOS_H +#define CEPH_RADOS_H + +/* + * Data types for the Ceph distributed object storage layer RADOS + * (Reliable Autonomic Distributed Object Store). + */ + +#include <string.h> +#include <stdbool.h> +#include "msgr.h" + +/* See comment in ceph_fs.h. 
*/ +#ifndef __KERNEL__ +#include "byteorder.h" +#define __le16 ceph_le16 +#define __le32 ceph_le32 +#define __le64 ceph_le64 +#endif + +/* + * fs id + */ +struct ceph_fsid { + unsigned char fsid[16]; +}; + +static inline int ceph_fsid_compare(const struct ceph_fsid *a, + const struct ceph_fsid *b) +{ + return memcmp(a, b, sizeof(*a)); +} + +/* + * ino, object, etc. + */ +typedef __le64 ceph_snapid_t; +#define CEPH_SNAPDIR ((__u64)(-1)) /* reserved for hidden .snap dir */ +#define CEPH_NOSNAP ((__u64)(-2)) /* "head", "live" revision */ +#define CEPH_MAXSNAP ((__u64)(-3)) /* largest valid snapid */ + +struct ceph_timespec { + __le32 tv_sec; + __le32 tv_nsec; +} __attribute__ ((packed)); + + +/* + * object layout - how objects are mapped into PGs + */ +#define CEPH_OBJECT_LAYOUT_HASH 1 +#define CEPH_OBJECT_LAYOUT_LINEAR 2 +#define CEPH_OBJECT_LAYOUT_HASHINO 3 + +/* + * pg layout -- how PGs are mapped onto (sets of) OSDs + */ +#define CEPH_PG_LAYOUT_CRUSH 0 +#define CEPH_PG_LAYOUT_HASH 1 +#define CEPH_PG_LAYOUT_LINEAR 2 +#define CEPH_PG_LAYOUT_HYBRID 3 + +#define CEPH_PG_MAX_SIZE 16 /* max # osds in a single pg */ + +/* + * placement group. + * we encode this into one __le64. + */ +struct ceph_pg { + __le16 preferred; /* preferred primary osd */ + __le16 ps; /* placement seed */ + __le32 pool; /* object pool */ +} __attribute__ ((packed)); + +/* + * pg pool types + * + * NOTE: These map 1:1 on to the pg_pool_t::TYPE_* values. They are + * duplicated here only for CrushCompiler's benefit. + */ +#define CEPH_PG_TYPE_REPLICATED 1 +/* #define CEPH_PG_TYPE_RAID4 2 never implemented */ +#define CEPH_PG_TYPE_ERASURE 3 + +/* + * stable_mod func is used to control number of placement groups. + * similar to straight-up modulo, but produces a stable mapping as b + * increases over time. b is the number of bins, and bmask is the + * containing power of 2 minus 1. 
+ * + * b <= bmask and bmask=(2**n)-1 + * e.g., b=12 -> bmask=15, b=123 -> bmask=127 + */ +static inline int ceph_stable_mod(int x, int b, int bmask) +{ + if ((x & bmask) < b) + return x & bmask; + else + return x & (bmask >> 1); +} + +/* + * object layout - how a given object should be stored. + */ +struct ceph_object_layout { + struct ceph_pg ol_pgid; /* raw pg, with _full_ ps precision. */ + __le32 ol_stripe_unit; /* for per-object parity, if any */ +} __attribute__ ((packed)); + +/* + * compound epoch+version, used by storage layer to serialize mutations + */ +struct ceph_eversion { + __le32 epoch; + __le64 version; +} __attribute__ ((packed)); + +/* + * osd map bits + */ + +/* status bits */ +#define CEPH_OSD_EXISTS (1<<0) +#define CEPH_OSD_UP (1<<1) +#define CEPH_OSD_AUTOOUT (1<<2) /* osd was automatically marked out */ +#define CEPH_OSD_NEW (1<<3) /* osd is new, never marked in */ +#define CEPH_OSD_FULL (1<<4) /* osd is at or above full threshold */ +#define CEPH_OSD_NEARFULL (1<<5) /* osd is at or above nearfull threshold */ +#define CEPH_OSD_BACKFILLFULL (1<<6) /* osd is at or above backfillfull threshold */ +#define CEPH_OSD_DESTROYED (1<<7) /* osd has been destroyed */ +#define CEPH_OSD_NOUP (1<<8) /* osd can not be marked up */ +#define CEPH_OSD_NODOWN (1<<9) /* osd can not be marked down */ +#define CEPH_OSD_NOIN (1<<10) /* osd can not be marked in */ +#define CEPH_OSD_NOOUT (1<<11) /* osd can not be marked out */ + +extern const char *ceph_osd_state_name(int s); + +/* osd weights. 
fixed point value: 0x10000 == 1.0 ("in"), 0 == "out" */ +#define CEPH_OSD_IN 0x10000 +#define CEPH_OSD_OUT 0 + +#define CEPH_OSD_MAX_PRIMARY_AFFINITY 0x10000 +#define CEPH_OSD_DEFAULT_PRIMARY_AFFINITY 0x10000 + + +/* + * osd map flag bits + */ +#define CEPH_OSDMAP_NEARFULL (1<<0) /* sync writes (near ENOSPC) */ +#define CEPH_OSDMAP_FULL (1<<1) /* no data writes (ENOSPC) */ +#define CEPH_OSDMAP_PAUSERD (1<<2) /* pause all reads */ +#define CEPH_OSDMAP_PAUSEWR (1<<3) /* pause all writes */ +#define CEPH_OSDMAP_PAUSEREC (1<<4) /* pause recovery */ +#define CEPH_OSDMAP_NOUP (1<<5) /* block osd boot */ +#define CEPH_OSDMAP_NODOWN (1<<6) /* block osd mark-down/failure */ +#define CEPH_OSDMAP_NOOUT (1<<7) /* block osd auto mark-out */ +#define CEPH_OSDMAP_NOIN (1<<8) /* block osd auto mark-in */ +#define CEPH_OSDMAP_NOBACKFILL (1<<9) /* block osd backfill */ +#define CEPH_OSDMAP_NORECOVER (1<<10) /* block osd recovery and backfill */ +#define CEPH_OSDMAP_NOSCRUB (1<<11) /* block periodic scrub */ +#define CEPH_OSDMAP_NODEEP_SCRUB (1<<12) /* block periodic deep-scrub */ +#define CEPH_OSDMAP_NOTIERAGENT (1<<13) /* disable tiering agent */ +#define CEPH_OSDMAP_NOREBALANCE (1<<14) /* block osd backfill unless pg is degraded */ +#define CEPH_OSDMAP_SORTBITWISE (1<<15) /* use bitwise hobject_t sort */ +#define CEPH_OSDMAP_REQUIRE_JEWEL (1<<16) /* require jewel for booting osds */ +#define CEPH_OSDMAP_REQUIRE_KRAKEN (1<<17) /* require kraken for booting osds */ +#define CEPH_OSDMAP_REQUIRE_LUMINOUS (1<<18) /* require luminous for booting osds */ +#define CEPH_OSDMAP_RECOVERY_DELETES (1<<19) /* deletes performed during recovery instead of peering */ +#define CEPH_OSDMAP_PURGED_SNAPDIRS (1<<20) /* osds have converted snapsets */ +#define CEPH_OSDMAP_NOSNAPTRIM (1<<21) /* disable snap trimming */ +#define CEPH_OSDMAP_PGLOG_HARDLIMIT (1<<22) /* put a hard limit on pg log length */ + +/* these are hidden in 'ceph status' view */ +#define CEPH_OSDMAP_SEMIHIDDEN_FLAGS 
(CEPH_OSDMAP_REQUIRE_JEWEL| \ + CEPH_OSDMAP_REQUIRE_KRAKEN | \ + CEPH_OSDMAP_REQUIRE_LUMINOUS | \ + CEPH_OSDMAP_RECOVERY_DELETES | \ + CEPH_OSDMAP_SORTBITWISE | \ + CEPH_OSDMAP_PURGED_SNAPDIRS | \ + CEPH_OSDMAP_PGLOG_HARDLIMIT) +#define CEPH_OSDMAP_LEGACY_REQUIRE_FLAGS (CEPH_OSDMAP_REQUIRE_JEWEL | \ + CEPH_OSDMAP_REQUIRE_KRAKEN | \ + CEPH_OSDMAP_REQUIRE_LUMINOUS) + +/* + * major ceph release numbers + */ +#define CEPH_RELEASE_ARGONAUT 1 +#define CEPH_RELEASE_BOBTAIL 2 +#define CEPH_RELEASE_CUTTLEFISH 3 +#define CEPH_RELEASE_DUMPLING 4 +#define CEPH_RELEASE_EMPEROR 5 +#define CEPH_RELEASE_FIREFLY 6 +#define CEPH_RELEASE_GIANT 7 +#define CEPH_RELEASE_HAMMER 8 +#define CEPH_RELEASE_INFERNALIS 9 +#define CEPH_RELEASE_JEWEL 10 +#define CEPH_RELEASE_KRAKEN 11 +#define CEPH_RELEASE_LUMINOUS 12 +#define CEPH_RELEASE_MIMIC 13 +#define CEPH_RELEASE_NAUTILUS 14 +#define CEPH_RELEASE_MAX 15 /* highest + 1 */ + +extern const char *ceph_release_name(int r); +extern int ceph_release_from_name(const char *s); +extern uint64_t ceph_release_features(int r); +extern int ceph_release_from_features(uint64_t features); + +/* + * The error code to return when an OSD can't handle a write + * because it is too large. + */ +#define OSD_WRITETOOBIG EMSGSIZE + +/* + * osd ops + * + * WARNING: do not use these op codes directly. Use the helpers + * defined below instead. In certain cases, op code behavior was + * redefined, resulting in special-cases in the helpers. 
+ */ +#define CEPH_OSD_OP_MODE 0xf000 +#define CEPH_OSD_OP_MODE_RD 0x1000 +#define CEPH_OSD_OP_MODE_WR 0x2000 +#define CEPH_OSD_OP_MODE_RMW 0x3000 +#define CEPH_OSD_OP_MODE_SUB 0x4000 +#define CEPH_OSD_OP_MODE_CACHE 0x8000 + +#define CEPH_OSD_OP_TYPE 0x0f00 +#define CEPH_OSD_OP_TYPE_DATA 0x0200 +#define CEPH_OSD_OP_TYPE_ATTR 0x0300 +#define CEPH_OSD_OP_TYPE_EXEC 0x0400 +#define CEPH_OSD_OP_TYPE_PG 0x0500 +// LEAVE UNUSED 0x0600 used to be multiobject ops + +#define __CEPH_OSD_OP1(mode, nr) \ + (CEPH_OSD_OP_MODE_##mode | (nr)) + +#define __CEPH_OSD_OP(mode, type, nr) \ + (CEPH_OSD_OP_MODE_##mode | CEPH_OSD_OP_TYPE_##type | (nr)) + +#define __CEPH_FORALL_OSD_OPS(f) \ + /** data **/ \ + /* read */ \ + f(READ, __CEPH_OSD_OP(RD, DATA, 1), "read") \ + f(STAT, __CEPH_OSD_OP(RD, DATA, 2), "stat") \ + f(MAPEXT, __CEPH_OSD_OP(RD, DATA, 3), "mapext") \ + f(CHECKSUM, __CEPH_OSD_OP(RD, DATA, 31), "checksum") \ + \ + /* fancy read */ \ + f(MASKTRUNC, __CEPH_OSD_OP(RD, DATA, 4), "masktrunc") \ + f(SPARSE_READ, __CEPH_OSD_OP(RD, DATA, 5), "sparse-read") \ + \ + f(NOTIFY, __CEPH_OSD_OP(RD, DATA, 6), "notify") \ + f(NOTIFY_ACK, __CEPH_OSD_OP(RD, DATA, 7), "notify-ack") \ + \ + /* versioning */ \ + f(ASSERT_VER, __CEPH_OSD_OP(RD, DATA, 8), "assert-version") \ + \ + f(LIST_WATCHERS, __CEPH_OSD_OP(RD, DATA, 9), "list-watchers") \ + \ + f(LIST_SNAPS, __CEPH_OSD_OP(RD, DATA, 10), "list-snaps") \ + \ + /* sync */ \ + f(SYNC_READ, __CEPH_OSD_OP(RD, DATA, 11), "sync_read") \ + \ + /* write */ \ + f(WRITE, __CEPH_OSD_OP(WR, DATA, 1), "write") \ + f(WRITEFULL, __CEPH_OSD_OP(WR, DATA, 2), "writefull") \ + f(TRUNCATE, __CEPH_OSD_OP(WR, DATA, 3), "truncate") \ + f(ZERO, __CEPH_OSD_OP(WR, DATA, 4), "zero") \ + f(DELETE, __CEPH_OSD_OP(WR, DATA, 5), "delete") \ + \ + /* fancy write */ \ + f(APPEND, __CEPH_OSD_OP(WR, DATA, 6), "append") \ + f(STARTSYNC, __CEPH_OSD_OP(WR, DATA, 7), "startsync") \ + f(SETTRUNC, __CEPH_OSD_OP(WR, DATA, 8), "settrunc") \ + f(TRIMTRUNC, __CEPH_OSD_OP(WR, DATA, 9), 
"trimtrunc") \ + \ + f(TMAPUP, __CEPH_OSD_OP(RMW, DATA, 10), "tmapup") \ + f(TMAPPUT, __CEPH_OSD_OP(WR, DATA, 11), "tmapput") \ + f(TMAPGET, __CEPH_OSD_OP(RD, DATA, 12), "tmapget") \ + \ + f(CREATE, __CEPH_OSD_OP(WR, DATA, 13), "create") \ + f(ROLLBACK, __CEPH_OSD_OP(WR, DATA, 14), "rollback") \ + \ + f(WATCH, __CEPH_OSD_OP(WR, DATA, 15), "watch") \ + \ + /* omap */ \ + f(OMAPGETKEYS, __CEPH_OSD_OP(RD, DATA, 17), "omap-get-keys") \ + f(OMAPGETVALS, __CEPH_OSD_OP(RD, DATA, 18), "omap-get-vals") \ + f(OMAPGETHEADER, __CEPH_OSD_OP(RD, DATA, 19), "omap-get-header") \ + f(OMAPGETVALSBYKEYS, __CEPH_OSD_OP(RD, DATA, 20), "omap-get-vals-by-keys") \ + f(OMAPSETVALS, __CEPH_OSD_OP(WR, DATA, 21), "omap-set-vals") \ + f(OMAPSETHEADER, __CEPH_OSD_OP(WR, DATA, 22), "omap-set-header") \ + f(OMAPCLEAR, __CEPH_OSD_OP(WR, DATA, 23), "omap-clear") \ + f(OMAPRMKEYS, __CEPH_OSD_OP(WR, DATA, 24), "omap-rm-keys") \ + f(OMAP_CMP, __CEPH_OSD_OP(RD, DATA, 25), "omap-cmp") \ + \ + /* tiering */ \ + f(COPY_FROM, __CEPH_OSD_OP(WR, DATA, 26), "copy-from") \ + /* was copy-get-classic */ \ + f(UNDIRTY, __CEPH_OSD_OP(WR, DATA, 28), "undirty") \ + f(ISDIRTY, __CEPH_OSD_OP(RD, DATA, 29), "isdirty") \ + f(COPY_GET, __CEPH_OSD_OP(RD, DATA, 30), "copy-get") \ + f(CACHE_FLUSH, __CEPH_OSD_OP(CACHE, DATA, 31), "cache-flush") \ + f(CACHE_EVICT, __CEPH_OSD_OP(CACHE, DATA, 32), "cache-evict") \ + f(CACHE_TRY_FLUSH, __CEPH_OSD_OP(CACHE, DATA, 33), "cache-try-flush") \ + \ + /* convert tmap to omap */ \ + f(TMAP2OMAP, __CEPH_OSD_OP(RMW, DATA, 34), "tmap2omap") \ + \ + /* hints */ \ + f(SETALLOCHINT, __CEPH_OSD_OP(WR, DATA, 35), "set-alloc-hint") \ + \ + /* cache pin/unpin */ \ + f(CACHE_PIN, __CEPH_OSD_OP(WR, DATA, 36), "cache-pin") \ + f(CACHE_UNPIN, __CEPH_OSD_OP(WR, DATA, 37), "cache-unpin") \ + \ + /* ESX/SCSI */ \ + f(WRITESAME, __CEPH_OSD_OP(WR, DATA, 38), "write-same") \ + f(CMPEXT, __CEPH_OSD_OP(RD, DATA, 32), "cmpext") \ + \ + /* Extensible */ \ + f(SET_REDIRECT, __CEPH_OSD_OP(WR, DATA, 39), 
"set-redirect") \ + f(SET_CHUNK, __CEPH_OSD_OP(WR, DATA, 40), "set-chunk") \ + f(TIER_PROMOTE, __CEPH_OSD_OP(WR, DATA, 41), "tier-promote") \ + f(UNSET_MANIFEST, __CEPH_OSD_OP(WR, DATA, 42), "unset-manifest") \ + \ + /** attrs **/ \ + /* read */ \ + f(GETXATTR, __CEPH_OSD_OP(RD, ATTR, 1), "getxattr") \ + f(GETXATTRS, __CEPH_OSD_OP(RD, ATTR, 2), "getxattrs") \ + f(CMPXATTR, __CEPH_OSD_OP(RD, ATTR, 3), "cmpxattr") \ + \ + /* write */ \ + f(SETXATTR, __CEPH_OSD_OP(WR, ATTR, 1), "setxattr") \ + f(SETXATTRS, __CEPH_OSD_OP(WR, ATTR, 2), "setxattrs") \ + f(RESETXATTRS, __CEPH_OSD_OP(WR, ATTR, 3), "resetxattrs") \ + f(RMXATTR, __CEPH_OSD_OP(WR, ATTR, 4), "rmxattr") \ + \ + /** subop **/ \ + f(PULL, __CEPH_OSD_OP1(SUB, 1), "pull") \ + f(PUSH, __CEPH_OSD_OP1(SUB, 2), "push") \ + f(BALANCEREADS, __CEPH_OSD_OP1(SUB, 3), "balance-reads") \ + f(UNBALANCEREADS, __CEPH_OSD_OP1(SUB, 4), "unbalance-reads") \ + f(SCRUB, __CEPH_OSD_OP1(SUB, 5), "scrub") \ + f(SCRUB_RESERVE, __CEPH_OSD_OP1(SUB, 6), "scrub-reserve") \ + f(SCRUB_UNRESERVE, __CEPH_OSD_OP1(SUB, 7), "scrub-unreserve") \ + /* 8 used to be scrub-stop */ \ + f(SCRUB_MAP, __CEPH_OSD_OP1(SUB, 9), "scrub-map") \ + \ + /** exec **/ \ + /* note: the RD bit here is wrong; see special-case below in helper */ \ + f(CALL, __CEPH_OSD_OP(RD, EXEC, 1), "call") \ + \ + /** pg **/ \ + f(PGLS, __CEPH_OSD_OP(RD, PG, 1), "pgls") \ + f(PGLS_FILTER, __CEPH_OSD_OP(RD, PG, 2), "pgls-filter") \ + f(PG_HITSET_LS, __CEPH_OSD_OP(RD, PG, 3), "pg-hitset-ls") \ + f(PG_HITSET_GET, __CEPH_OSD_OP(RD, PG, 4), "pg-hitset-get") \ + f(PGNLS, __CEPH_OSD_OP(RD, PG, 5), "pgnls") \ + f(PGNLS_FILTER, __CEPH_OSD_OP(RD, PG, 6), "pgnls-filter") \ + f(SCRUBLS, __CEPH_OSD_OP(RD, PG, 7), "scrubls") + +enum { +#define GENERATE_ENUM_ENTRY(op, opcode, str) CEPH_OSD_OP_##op = (opcode), +__CEPH_FORALL_OSD_OPS(GENERATE_ENUM_ENTRY) +#undef GENERATE_ENUM_ENTRY +}; + +static inline int ceph_osd_op_type_data(int op) +{ + return (op & CEPH_OSD_OP_TYPE) == CEPH_OSD_OP_TYPE_DATA; +} 
+static inline int ceph_osd_op_type_attr(int op) +{ + return (op & CEPH_OSD_OP_TYPE) == CEPH_OSD_OP_TYPE_ATTR; +} +static inline int ceph_osd_op_type_exec(int op) +{ + return (op & CEPH_OSD_OP_TYPE) == CEPH_OSD_OP_TYPE_EXEC; +} +static inline int ceph_osd_op_type_pg(int op) +{ + return (op & CEPH_OSD_OP_TYPE) == CEPH_OSD_OP_TYPE_PG; +} + +static inline int ceph_osd_op_mode_subop(int op) +{ + return (op & CEPH_OSD_OP_MODE) == CEPH_OSD_OP_MODE_SUB; +} +static inline int ceph_osd_op_mode_read(int op) +{ + return (op & CEPH_OSD_OP_MODE_RD) && + op != CEPH_OSD_OP_CALL; +} +static inline int ceph_osd_op_mode_modify(int op) +{ + return op & CEPH_OSD_OP_MODE_WR; +} +static inline int ceph_osd_op_mode_cache(int op) +{ + return op & CEPH_OSD_OP_MODE_CACHE; +} +static inline bool ceph_osd_op_uses_extent(int op) +{ + switch(op) { + case CEPH_OSD_OP_READ: + case CEPH_OSD_OP_MAPEXT: + case CEPH_OSD_OP_MASKTRUNC: + case CEPH_OSD_OP_SPARSE_READ: + case CEPH_OSD_OP_SYNC_READ: + case CEPH_OSD_OP_WRITE: + case CEPH_OSD_OP_WRITEFULL: + case CEPH_OSD_OP_TRUNCATE: + case CEPH_OSD_OP_ZERO: + case CEPH_OSD_OP_APPEND: + case CEPH_OSD_OP_TRIMTRUNC: + case CEPH_OSD_OP_CMPEXT: + return true; + default: + return false; + } +} + +/* + * note that the following tmap stuff is also defined in the ceph librados.h + * and objclass.h. Any modification here needs to be updated there + */ +#define CEPH_OSD_TMAP_HDR 'h' +#define CEPH_OSD_TMAP_SET 's' +#define CEPH_OSD_TMAP_CREATE 'c' /* create key */ +#define CEPH_OSD_TMAP_RM 'r' +#define CEPH_OSD_TMAP_RMSLOPPY 'R' + +extern const char *ceph_osd_op_name(int op); + +/* + * osd op flags + * + * An op may be READ, WRITE, or READ|WRITE. 
+ */ +enum { + CEPH_OSD_FLAG_ACK = 0x0001, /* want (or is) "ack" ack */ + CEPH_OSD_FLAG_ONNVRAM = 0x0002, /* want (or is) "onnvram" ack */ + CEPH_OSD_FLAG_ONDISK = 0x0004, /* want (or is) "ondisk" ack */ + CEPH_OSD_FLAG_RETRY = 0x0008, /* resend attempt */ + CEPH_OSD_FLAG_READ = 0x0010, /* op may read */ + CEPH_OSD_FLAG_WRITE = 0x0020, /* op may write */ + CEPH_OSD_FLAG_ORDERSNAP = 0x0040, /* EOLDSNAP if snapc is out of order */ + CEPH_OSD_FLAG_PEERSTAT_OLD = 0x0080, /* DEPRECATED msg includes osd_peer_stat */ + CEPH_OSD_FLAG_BALANCE_READS = 0x0100, + CEPH_OSD_FLAG_PARALLELEXEC = 0x0200, /* execute op in parallel */ + CEPH_OSD_FLAG_PGOP = 0x0400, /* pg op, no object */ + CEPH_OSD_FLAG_EXEC = 0x0800, /* op may exec */ + CEPH_OSD_FLAG_EXEC_PUBLIC = 0x1000, /* DEPRECATED op may exec (public) */ + CEPH_OSD_FLAG_LOCALIZE_READS = 0x2000, /* read from nearby replica, if any */ + CEPH_OSD_FLAG_RWORDERED = 0x4000, /* order wrt concurrent reads */ + CEPH_OSD_FLAG_IGNORE_CACHE = 0x8000, /* ignore cache logic */ + CEPH_OSD_FLAG_SKIPRWLOCKS = 0x10000, /* skip rw locks */ + CEPH_OSD_FLAG_IGNORE_OVERLAY =0x20000, /* ignore pool overlay */ + CEPH_OSD_FLAG_FLUSH = 0x40000, /* this is part of flush */ + CEPH_OSD_FLAG_MAP_SNAP_CLONE =0x80000, /* map snap direct to clone id + */ + CEPH_OSD_FLAG_ENFORCE_SNAPC =0x100000, /* use snapc provided even if + pool uses pool snaps */ + CEPH_OSD_FLAG_REDIRECTED = 0x200000, /* op has been redirected */ + CEPH_OSD_FLAG_KNOWN_REDIR = 0x400000, /* redirect bit is authoritative */ + CEPH_OSD_FLAG_FULL_TRY = 0x800000, /* try op despite full flag */ + CEPH_OSD_FLAG_FULL_FORCE = 0x1000000, /* force op despite full flag */ + CEPH_OSD_FLAG_IGNORE_REDIRECT = 0x2000000, /* ignore redirection */ +}; + +enum { + CEPH_OSD_OP_FLAG_EXCL = 0x1, /* EXCL object create */ + CEPH_OSD_OP_FLAG_FAILOK = 0x2, /* continue despite failure */ + CEPH_OSD_OP_FLAG_FADVISE_RANDOM = 0x4, /* the op is random */ + CEPH_OSD_OP_FLAG_FADVISE_SEQUENTIAL = 0x8, /* the op is sequential 
*/ + CEPH_OSD_OP_FLAG_FADVISE_WILLNEED = 0x10,/* data will be accessed in the near future */ + CEPH_OSD_OP_FLAG_FADVISE_DONTNEED = 0x20,/* data will not be accessed in the near future */ + CEPH_OSD_OP_FLAG_FADVISE_NOCACHE = 0x40, /* data will be accessed only once by this client */ + CEPH_OSD_OP_FLAG_WITH_REFERENCE = 0x80, /* need reference counting */ + CEPH_OSD_OP_FLAG_BYPASS_CLEAN_CACHE = 0x100, /* bypass ObjectStore cache, mainly for deep-scrub */ +}; + +#define EOLDSNAPC 85 /* ORDERSNAP flag set; writer has old snapc*/ +#define EBLACKLISTED 108 /* blacklisted */ + +/* xattr comparison */ +enum { + CEPH_OSD_CMPXATTR_OP_EQ = 1, + CEPH_OSD_CMPXATTR_OP_NE = 2, + CEPH_OSD_CMPXATTR_OP_GT = 3, + CEPH_OSD_CMPXATTR_OP_GTE = 4, + CEPH_OSD_CMPXATTR_OP_LT = 5, + CEPH_OSD_CMPXATTR_OP_LTE = 6 +}; + +enum { + CEPH_OSD_CMPXATTR_MODE_STRING = 1, + CEPH_OSD_CMPXATTR_MODE_U64 = 2 +}; + +enum { + CEPH_OSD_COPY_FROM_FLAG_FLUSH = 1, /* part of a flush operation */ + CEPH_OSD_COPY_FROM_FLAG_IGNORE_OVERLAY = 2, /* ignore pool overlay */ + CEPH_OSD_COPY_FROM_FLAG_IGNORE_CACHE = 4, /* ignore osd cache logic */ + CEPH_OSD_COPY_FROM_FLAG_MAP_SNAP_CLONE = 8, /* map snap direct to + * cloneid */ + CEPH_OSD_COPY_FROM_FLAG_RWORDERED = 16, /* order with write */ +}; + +enum { + CEPH_OSD_TMAP2OMAP_NULLOK = 1, +}; + +enum { + CEPH_OSD_WATCH_OP_UNWATCH = 0, + CEPH_OSD_WATCH_OP_LEGACY_WATCH = 1, + /* note: use only ODD ids to prevent pre-giant code from + interpreting the op as UNWATCH */ + CEPH_OSD_WATCH_OP_WATCH = 3, + CEPH_OSD_WATCH_OP_RECONNECT = 5, + CEPH_OSD_WATCH_OP_PING = 7, +}; + +enum { + CEPH_OSD_CHECKSUM_OP_TYPE_XXHASH32 = 0, + CEPH_OSD_CHECKSUM_OP_TYPE_XXHASH64 = 1, + CEPH_OSD_CHECKSUM_OP_TYPE_CRC32C = 2 +}; + +const char *ceph_osd_watch_op_name(int o); + +enum { + CEPH_OSD_ALLOC_HINT_FLAG_SEQUENTIAL_WRITE = 1, + CEPH_OSD_ALLOC_HINT_FLAG_RANDOM_WRITE = 2, + CEPH_OSD_ALLOC_HINT_FLAG_SEQUENTIAL_READ = 4, + CEPH_OSD_ALLOC_HINT_FLAG_RANDOM_READ = 8, + CEPH_OSD_ALLOC_HINT_FLAG_APPEND_ONLY 
= 16, + CEPH_OSD_ALLOC_HINT_FLAG_IMMUTABLE = 32, + CEPH_OSD_ALLOC_HINT_FLAG_SHORTLIVED = 64, + CEPH_OSD_ALLOC_HINT_FLAG_LONGLIVED = 128, + CEPH_OSD_ALLOC_HINT_FLAG_COMPRESSIBLE = 256, + CEPH_OSD_ALLOC_HINT_FLAG_INCOMPRESSIBLE = 512, +}; + +const char *ceph_osd_alloc_hint_flag_name(int f); + +enum { + CEPH_OSD_BACKOFF_OP_BLOCK = 1, + CEPH_OSD_BACKOFF_OP_ACK_BLOCK = 2, + CEPH_OSD_BACKOFF_OP_UNBLOCK = 3, +}; + +const char *ceph_osd_backoff_op_name(int op); + +/* + * an individual object operation. each may be accompanied by some data + * payload + */ +struct ceph_osd_op { + __le16 op; /* CEPH_OSD_OP_* */ + __le32 flags; /* CEPH_OSD_OP_FLAG_* */ + union { + struct { + __le64 offset, length; + __le64 truncate_size; + __le32 truncate_seq; + } __attribute__ ((packed)) extent; + struct { + __le32 name_len; + __le32 value_len; + __u8 cmp_op; /* CEPH_OSD_CMPXATTR_OP_* */ + __u8 cmp_mode; /* CEPH_OSD_CMPXATTR_MODE_* */ + } __attribute__ ((packed)) xattr; + struct { + __u8 class_len; + __u8 method_len; + __u8 argc; + __le32 indata_len; + } __attribute__ ((packed)) cls; + struct { + __le64 count; + __le32 start_epoch; /* for the pgls sequence */ + } __attribute__ ((packed)) pgls; + struct { + __le64 snapid; + } __attribute__ ((packed)) snap; + struct { + __le64 cookie; + __le64 ver; /* no longer used */ + __u8 op; /* CEPH_OSD_WATCH_OP_* */ + __u32 gen; /* registration generation */ + __u32 timeout; /* connection timeout */ + } __attribute__ ((packed)) watch; + struct { + __le64 cookie; + } __attribute__ ((packed)) notify; + struct { + __le64 unused; + __le64 ver; + } __attribute__ ((packed)) assert_ver; + struct { + __le64 offset, length; + __le64 src_offset; + } __attribute__ ((packed)) clonerange; + struct { + __le64 max; /* max data in reply */ + } __attribute__ ((packed)) copy_get; + struct { + __le64 snapid; + __le64 src_version; + __u8 flags; /* CEPH_OSD_COPY_FROM_FLAG_* */ + /* + * CEPH_OSD_OP_FLAG_FADVISE_*: fadvise flags + * for src object, flags for dest object are in 
+ * ceph_osd_op::flags. + */ + __le32 src_fadvise_flags; + } __attribute__ ((packed)) copy_from; + struct { + struct ceph_timespec stamp; + } __attribute__ ((packed)) hit_set_get; + struct { + __u8 flags; + } __attribute__ ((packed)) tmap2omap; + struct { + __le64 expected_object_size; + __le64 expected_write_size; + __le32 flags; /* CEPH_OSD_OP_ALLOC_HINT_FLAG_* */ + } __attribute__ ((packed)) alloc_hint; + struct { + __le64 offset; + __le64 length; + __le64 data_length; + } __attribute__ ((packed)) writesame; + struct { + __le64 offset; + __le64 length; + __le32 chunk_size; + __u8 type; /* CEPH_OSD_CHECKSUM_OP_TYPE_* */ + } __attribute__ ((packed)) checksum; + }; + __le32 payload_len; +} __attribute__ ((packed)); + +/* + * Check the compatibility of struct ceph_osd_op + * (2+4+(2*8+8+4)+4) = (sizeof(ceph_osd_op::op) + + * sizeof(ceph_osd_op::flags) + + * sizeof(ceph_osd_op::extent) + + * sizeof(ceph_osd_op::payload_len)) + */ +#ifdef __cplusplus +static_assert(sizeof(ceph_osd_op) == (2+4+(2*8+8+4)+4), + "sizeof(ceph_osd_op) breaks the compatibility"); +#endif + +struct ceph_osd_reply_head { + __le32 client_inc; /* client incarnation */ + __le32 flags; + struct ceph_object_layout layout; + __le32 osdmap_epoch; + struct ceph_eversion reassert_version; /* for replaying uncommitted */ + + __le32 result; /* result code */ + + __le32 object_len; /* length of object name */ + __le32 num_ops; + struct ceph_osd_op ops[0]; /* ops[], object */ +} __attribute__ ((packed)); + +#ifndef __KERNEL__ +#undef __le16 +#undef __le32 +#undef __le64 +#endif + +#endif diff --git a/src/include/rados/buffer.h b/src/include/rados/buffer.h new file mode 120000 index 00000000..51fc03be --- /dev/null +++ b/src/include/rados/buffer.h @@ -0,0 +1 @@ +../buffer.h
\ No newline at end of file diff --git a/src/include/rados/buffer_fwd.h b/src/include/rados/buffer_fwd.h new file mode 120000 index 00000000..bd1f6f1b --- /dev/null +++ b/src/include/rados/buffer_fwd.h @@ -0,0 +1 @@ +../buffer_fwd.h
\ No newline at end of file diff --git a/src/include/rados/crc32c.h b/src/include/rados/crc32c.h new file mode 120000 index 00000000..19ef4317 --- /dev/null +++ b/src/include/rados/crc32c.h @@ -0,0 +1 @@ +../crc32c.h
\ No newline at end of file diff --git a/src/include/rados/inline_memory.h b/src/include/rados/inline_memory.h new file mode 120000 index 00000000..48f0d443 --- /dev/null +++ b/src/include/rados/inline_memory.h @@ -0,0 +1 @@ +../inline_memory.h
\ No newline at end of file diff --git a/src/include/rados/librados.h b/src/include/rados/librados.h new file mode 100644 index 00000000..58a65afa --- /dev/null +++ b/src/include/rados/librados.h @@ -0,0 +1,4015 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2004-2012 Sage Weil <sage@newdream.net> + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. + * + */ + +#ifndef CEPH_LIBRADOS_H +#define CEPH_LIBRADOS_H + +#ifdef __cplusplus +extern "C" { +#endif + +#include <netinet/in.h> +#if defined(__linux__) +#include <linux/types.h> +#elif defined(__FreeBSD__) +#include <sys/types.h> +#endif +#include <unistd.h> +#include <string.h> +#include "rados_types.h" + +#include <sys/time.h> + +#ifndef CEPH_OSD_TMAP_SET +/* These are also defined in rados.h and objclass.h. Keep them in sync! */ +#define CEPH_OSD_TMAP_HDR 'h' +#define CEPH_OSD_TMAP_SET 's' +#define CEPH_OSD_TMAP_CREATE 'c' +#define CEPH_OSD_TMAP_RM 'r' +#endif + +#define LIBRADOS_VER_MAJOR 3 +#define LIBRADOS_VER_MINOR 0 +#define LIBRADOS_VER_EXTRA 0 + +#define LIBRADOS_VERSION(maj, min, extra) ((maj << 16) + (min << 8) + extra) + +#define LIBRADOS_VERSION_CODE LIBRADOS_VERSION(LIBRADOS_VER_MAJOR, LIBRADOS_VER_MINOR, LIBRADOS_VER_EXTRA) + +#define LIBRADOS_SUPPORTS_WATCH 1 +#define LIBRADOS_SUPPORTS_SERVICES 1 +#define LIBRADOS_SUPPORTS_GETADDRS 1 +#define LIBRADOS_SUPPORTS_APP_METADATA 1 + +/* RADOS lock flags + * They are also defined in cls_lock_types.h. Keep them in sync! + */ +#define LIBRADOS_LOCK_FLAG_RENEW 0x1 + +/* + * Constants for rados_write_op_create(). 
+ */ +#define LIBRADOS_CREATE_EXCLUSIVE 1 +#define LIBRADOS_CREATE_IDEMPOTENT 0 + +/* + * Flags that can be set on a per-op basis via + * rados_read_op_set_flags() and rados_write_op_set_flags(). + */ +enum { + // fail a create operation if the object already exists + LIBRADOS_OP_FLAG_EXCL = 0x1, + // allow the transaction to succeed even if the flagged op fails + LIBRADOS_OP_FLAG_FAILOK = 0x2, + // indicate read/write op random + LIBRADOS_OP_FLAG_FADVISE_RANDOM = 0x4, + // indicate read/write op sequential + LIBRADOS_OP_FLAG_FADVISE_SEQUENTIAL = 0x8, + // indicate read/write data will be accessed in the near future (by someone) + LIBRADOS_OP_FLAG_FADVISE_WILLNEED = 0x10, + // indicate read/write data will not be accessed in the near future (by anyone) + LIBRADOS_OP_FLAG_FADVISE_DONTNEED = 0x20, + // indicate read/write data will not be accessed again (by *this* client) + LIBRADOS_OP_FLAG_FADVISE_NOCACHE = 0x40, + // optionally support FUA (force unit access) on write requests + LIBRADOS_OP_FLAG_FADVISE_FUA = 0x80, +}; + +#define CEPH_RADOS_API + +/** + * @name xattr comparison operations + * Operators for comparing xattrs on objects, and aborting the + * rados_read_op or rados_write_op transaction if the comparison + * fails. + * + * @{ + */ +enum { + LIBRADOS_CMPXATTR_OP_EQ = 1, + LIBRADOS_CMPXATTR_OP_NE = 2, + LIBRADOS_CMPXATTR_OP_GT = 3, + LIBRADOS_CMPXATTR_OP_GTE = 4, + LIBRADOS_CMPXATTR_OP_LT = 5, + LIBRADOS_CMPXATTR_OP_LTE = 6 +}; +/** @} */ + +/** + * @name Operation Flags + * Flags for rados_read_op_operate(), rados_write_op_operate(), + * rados_aio_read_op_operate(), and rados_aio_write_op_operate(). + * See librados.hpp for details.
+ * @{ + */ +enum { + LIBRADOS_OPERATION_NOFLAG = 0, + LIBRADOS_OPERATION_BALANCE_READS = 1, + LIBRADOS_OPERATION_LOCALIZE_READS = 2, + LIBRADOS_OPERATION_ORDER_READS_WRITES = 4, + LIBRADOS_OPERATION_IGNORE_CACHE = 8, + LIBRADOS_OPERATION_SKIPRWLOCKS = 16, + LIBRADOS_OPERATION_IGNORE_OVERLAY = 32, + /* send requests to cluster despite the cluster or pool being marked + full; ops will either succeed (e.g., delete) or return EDQUOT or + ENOSPC. */ + LIBRADOS_OPERATION_FULL_TRY = 64, + /* + * Mainly for delete op + */ + LIBRADOS_OPERATION_FULL_FORCE = 128, + LIBRADOS_OPERATION_IGNORE_REDIRECT = 256, + LIBRADOS_OPERATION_ORDERSNAP = 512, +}; +/** @} */ + +/** + * @name Alloc hint flags + * Flags for rados_write_op_alloc_hint2() and rados_set_alloc_hint2() + * indicating future IO patterns. + * @{ + */ +enum { + LIBRADOS_ALLOC_HINT_FLAG_SEQUENTIAL_WRITE = 1, + LIBRADOS_ALLOC_HINT_FLAG_RANDOM_WRITE = 2, + LIBRADOS_ALLOC_HINT_FLAG_SEQUENTIAL_READ = 4, + LIBRADOS_ALLOC_HINT_FLAG_RANDOM_READ = 8, + LIBRADOS_ALLOC_HINT_FLAG_APPEND_ONLY = 16, + LIBRADOS_ALLOC_HINT_FLAG_IMMUTABLE = 32, + LIBRADOS_ALLOC_HINT_FLAG_SHORTLIVED = 64, + LIBRADOS_ALLOC_HINT_FLAG_LONGLIVED = 128, + LIBRADOS_ALLOC_HINT_FLAG_COMPRESSIBLE = 256, + LIBRADOS_ALLOC_HINT_FLAG_INCOMPRESSIBLE = 512, +}; +/** @} */ + +typedef enum { + LIBRADOS_CHECKSUM_TYPE_XXHASH32 = 0, + LIBRADOS_CHECKSUM_TYPE_XXHASH64 = 1, + LIBRADOS_CHECKSUM_TYPE_CRC32C = 2 +} rados_checksum_type_t; + +/* + * snap id constants + */ +#define LIBRADOS_SNAP_HEAD ((uint64_t)(-2)) +#define LIBRADOS_SNAP_DIR ((uint64_t)(-1)) + +/** + * @typedef rados_t + * + * A handle for interacting with a RADOS cluster. It encapsulates all + * RADOS client configuration, including username, key for + * authentication, logging, and debugging. Talking to different clusters + * -- or to the same cluster with different users -- requires + * different cluster handles.
+ */ +#ifndef VOIDPTR_RADOS_T +#define VOIDPTR_RADOS_T +typedef void *rados_t; +#endif //VOIDPTR_RADOS_T + +/** + * @typedef rados_config_t + * + * A handle for the ceph configuration context for the rados_t cluster + * instance. This can be used to share configuration context/state + * (e.g., logging configuration) between librados instance. + * + * @warning The config context does not have independent reference + * counting. As such, a rados_config_t handle retrieved from a given + * rados_t is only valid as long as that rados_t. + */ +typedef void *rados_config_t; + +/** + * @typedef rados_ioctx_t + * + * An io context encapsulates a few settings for all I/O operations + * done on it: + * - pool - set when the io context is created (see rados_ioctx_create()) + * - snapshot context for writes (see + * rados_ioctx_selfmanaged_snap_set_write_ctx()) + * - snapshot id to read from (see rados_ioctx_snap_set_read()) + * - object locator for all single-object operations (see + * rados_ioctx_locator_set_key()) + * - namespace for all single-object operations (see + * rados_ioctx_set_namespace()). Set to LIBRADOS_ALL_NSPACES + * before rados_nobjects_list_open() will list all objects in all + * namespaces. + * + * @warning Changing any of these settings is not thread-safe - + * librados users must synchronize any of these changes on their own, + * or use separate io contexts for each thread + */ +typedef void *rados_ioctx_t; + +/** + * @typedef rados_list_ctx_t + * + * An iterator for listing the objects in a pool. + * Used with rados_nobjects_list_open(), + * rados_nobjects_list_next(), rados_nobjects_list_next2(), and + * rados_nobjects_list_close(). + */ +typedef void *rados_list_ctx_t; + +/** + * @typedef rados_object_list_cursor + * + * The cursor used with rados_enumerate_objects + * and accompanying methods. 
+ */ +typedef void * rados_object_list_cursor; + +/** + * @struct rados_object_list_item + * + * The item populated by rados_object_list in + * the results array. + */ +typedef struct rados_object_list_item { + + /// oid length + size_t oid_length; + /// name of the object + char *oid; + /// namespace length + size_t nspace_length; + /// the object namespace + char *nspace; + /// locator length + size_t locator_length; + /// object locator + char *locator; +} rados_object_list_item; + +/** + * @typedef rados_snap_t + * The id of a snapshot. + */ +typedef uint64_t rados_snap_t; + +/** + * @typedef rados_xattrs_iter_t + * An iterator for listing extended attributes on an object. + * Used with rados_getxattrs(), rados_getxattrs_next(), and + * rados_getxattrs_end(). + */ +typedef void *rados_xattrs_iter_t; + +/** + * @typedef rados_omap_iter_t + * An iterator for listing omap key/value pairs on an object. + * Used with rados_read_op_omap_get_keys(), rados_read_op_omap_get_vals(), + * rados_read_op_omap_get_vals_by_keys(), rados_omap_get_next(), and + * rados_omap_get_end(). + */ +typedef void *rados_omap_iter_t; + +/** + * @struct rados_pool_stat_t + * Usage information for a pool.
+ */ +struct rados_pool_stat_t { + /// space used in bytes + uint64_t num_bytes; + /// space used in KB + uint64_t num_kb; + /// number of objects in the pool + uint64_t num_objects; + /// number of clones of objects + uint64_t num_object_clones; + /// num_objects * num_replicas + uint64_t num_object_copies; + /// number of objects missing on primary + uint64_t num_objects_missing_on_primary; + /// number of objects found on no OSDs + uint64_t num_objects_unfound; + /// number of objects replicated fewer times than they should be + /// (but found on at least one OSD) + uint64_t num_objects_degraded; + /// number of objects read + uint64_t num_rd; + /// objects read in KB + uint64_t num_rd_kb; + /// number of objects written + uint64_t num_wr; + /// objects written in KB + uint64_t num_wr_kb; + /// bytes originally provided by user + uint64_t num_user_bytes; + /// bytes passed compression + uint64_t compressed_bytes_orig; + /// bytes resulted after compression + uint64_t compressed_bytes; + /// bytes allocated at storage + uint64_t compressed_bytes_alloc; +}; + +/** + * @struct rados_cluster_stat_t + * Cluster-wide usage information + */ +struct rados_cluster_stat_t { + /// total device size + uint64_t kb; + /// total used + uint64_t kb_used; + /// total available/free + uint64_t kb_avail; + /// number of objects + uint64_t num_objects; +}; + +/** + * @typedef rados_write_op_t + * + * An object write operation stores a number of operations which can be + * executed atomically. 
For usage, see: + * - Creation and deletion: rados_create_write_op() rados_release_write_op() + * - Extended attribute manipulation: rados_write_op_cmpxattr() + * rados_write_op_cmpxattr(), rados_write_op_setxattr(), + * rados_write_op_rmxattr() + * - Object map key/value pairs: rados_write_op_omap_set(), + * rados_write_op_omap_rm_keys(), rados_write_op_omap_clear(), + * rados_write_op_omap_cmp() + * - Object properties: rados_write_op_assert_exists(), + * rados_write_op_assert_version() + * - Creating objects: rados_write_op_create() + * - IO on objects: rados_write_op_append(), rados_write_op_write(), rados_write_op_zero + * rados_write_op_write_full(), rados_write_op_writesame(), rados_write_op_remove, + * rados_write_op_truncate(), rados_write_op_zero(), rados_write_op_cmpext() + * - Hints: rados_write_op_set_alloc_hint() + * - Performing the operation: rados_write_op_operate(), rados_aio_write_op_operate() + */ +typedef void *rados_write_op_t; + +/** + * @typedef rados_read_op_t + * + * An object read operation stores a number of operations which can be + * executed atomically. 
For usage, see: + * - Creation and deletion: rados_create_read_op() rados_release_read_op() + * - Extended attribute manipulation: rados_read_op_cmpxattr(), + * rados_read_op_getxattr(), rados_read_op_getxattrs() + * - Object map key/value pairs: rados_read_op_omap_get_vals(), + * rados_read_op_omap_get_keys(), rados_read_op_omap_get_vals_by_keys(), + * rados_read_op_omap_cmp() + * - Object properties: rados_read_op_stat(), rados_read_op_assert_exists(), + * rados_read_op_assert_version() + * - IO on objects: rados_read_op_read(), rados_read_op_checksum(), + * rados_read_op_cmpext() + * - Custom operations: rados_read_op_exec(), rados_read_op_exec_user_buf() + * - Request properties: rados_read_op_set_flags() + * - Performing the operation: rados_read_op_operate(), + * rados_aio_read_op_operate() + */ +typedef void *rados_read_op_t; + +/** + * @typedef rados_completion_t + * Represents the state of an asynchronous operation - it contains the + * return value once the operation completes, and can be used to block + * until the operation is complete or safe. + */ +typedef void *rados_completion_t; + +/** + * @struct blkin_trace_info + * blkin trace information for Zipkin tracing + */ +struct blkin_trace_info; + +/** + * Get the version of librados. + * + * The version number is major.minor.extra. Note that this is + * unrelated to the Ceph version number. + * + * TODO: define version semantics, i.e.: + * - incrementing major is for backwards-incompatible changes + * - incrementing minor is for backwards-compatible changes + * - incrementing extra is for bug fixes + * + * @param major where to store the major version number + * @param minor where to store the minor version number + * @param extra where to store the extra version number + */ +CEPH_RADOS_API void rados_version(int *major, int *minor, int *extra); + +/** + * @name Setup and Teardown + * These are the first and last functions that should be called + * when using librados.
+ * + * @{ + */ + +/** + * Create a handle for communicating with a RADOS cluster. + * + * Ceph environment variables are read when this is called, so if + * $CEPH_ARGS specifies everything you need to connect, no further + * configuration is necessary. + * + * @param cluster where to store the handle + * @param id the user to connect as (i.e. admin, not client.admin) + * @returns 0 on success, negative error code on failure + */ +CEPH_RADOS_API int rados_create(rados_t *cluster, const char * const id); + +/** + * Extended version of rados_create. + * + * Like rados_create, but + * 1) don't assume 'client\.'+id; allow full specification of name + * 2) allow specification of cluster name + * 3) flags for future expansion + */ +CEPH_RADOS_API int rados_create2(rados_t *pcluster, + const char *const clustername, + const char * const name, uint64_t flags); + +/** + * Initialize a cluster handle from an existing configuration. + * + * Share configuration state with another rados_t instance. + * + * @param cluster where to store the handle + * @param cct the existing configuration to use + * @returns 0 on success, negative error code on failure + */ +CEPH_RADOS_API int rados_create_with_context(rados_t *cluster, + rados_config_t cct); + +/** + * Ping the monitor with ID mon_id, storing the resulting reply in + * buf (if specified) with a maximum size of len. + * + * The result buffer is allocated on the heap; the caller is + * expected to release that memory with rados_buffer_free(). The + * buffer and length pointers can be NULL, in which case they are + * not filled in. + * + * @param cluster cluster handle + * @param[in] mon_id ID of the monitor to ping + * @param[out] outstr double pointer with the resulting reply + * @param[out] outstrlen pointer with the size of the reply in outstr + */ +CEPH_RADOS_API int rados_ping_monitor(rados_t cluster, const char *mon_id, + char **outstr, size_t *outstrlen); + +/** + * Connect to the cluster. 
+ * + * @note BUG: Before calling this, calling a function that communicates with the + * cluster will crash. + * + * @pre The cluster handle is configured with at least a monitor + * address. If cephx is enabled, a client name and secret must also be + * set. + * + * @post If this succeeds, any function in librados may be used + * + * @param cluster The cluster to connect to. + * @returns 0 on success, negative error code on failure + */ +CEPH_RADOS_API int rados_connect(rados_t cluster); + +/** + * Disconnects from the cluster. + * + * For clean up, this is only necessary after rados_connect() has + * succeeded. + * + * @warning This does not guarantee any asynchronous writes have + * completed. To do that, you must call rados_aio_flush() on all open + * io contexts. + * + * @warning We implicitly call rados_watch_flush() on shutdown. If + * there are watches being used, this should be done explicitly before + * destroying the relevant IoCtx. We do it here as a safety measure. + * + * @post the cluster handle cannot be used again + * + * @param cluster the cluster to shut down + */ +CEPH_RADOS_API void rados_shutdown(rados_t cluster); + +/** @} init */ + +/** + * @name Configuration + * These functions read and update Ceph configuration for a cluster + * handle. Any configuration changes must be done before connecting to + * the cluster. + * + * Options that librados users might want to set include: + * - mon_host + * - auth_supported + * - key, keyfile, or keyring when using cephx + * - log_file, log_to_stderr, err_to_stderr, and log_to_syslog + * - debug_rados, debug_objecter, debug_monc, debug_auth, or debug_ms + * + * See docs.ceph.com for information about available configuration options. + * + * @{ + */ + +/** + * Configure the cluster handle using a Ceph config file + * + * If path is NULL, the default locations are searched, and the first + * found is used.
The locations are: + * - $CEPH_CONF (environment variable) + * - /etc/ceph/ceph.conf + * - ~/.ceph/config + * - ceph.conf (in the current working directory) + * + * @pre rados_connect() has not been called on the cluster handle + * + * @param cluster cluster handle to configure + * @param path path to a Ceph configuration file + * @returns 0 on success, negative error code on failure + */ +CEPH_RADOS_API int rados_conf_read_file(rados_t cluster, const char *path); + +/** + * Configure the cluster handle with command line arguments + * + * argv can contain any common Ceph command line option, including any + * configuration parameter prefixed by '--' and replacing spaces with + * dashes or underscores. For example, the following options are equivalent: + * - --mon-host 10.0.0.1:6789 + * - --mon_host 10.0.0.1:6789 + * - -m 10.0.0.1:6789 + * + * @pre rados_connect() has not been called on the cluster handle + * + * @param cluster cluster handle to configure + * @param argc number of arguments in argv + * @param argv arguments to parse + * @returns 0 on success, negative error code on failure + */ +CEPH_RADOS_API int rados_conf_parse_argv(rados_t cluster, int argc, + const char **argv); + + +/** + * Configure the cluster handle with command line arguments, returning + * any remainders. Same rados_conf_parse_argv, except for extra + * remargv argument to hold returns unrecognized arguments. 
+ * + * @pre rados_connect() has not been called on the cluster handle + * + * @param cluster cluster handle to configure + * @param argc number of arguments in argv + * @param argv arguments to parse + * @param remargv char* array for returned unrecognized arguments + * @returns 0 on success, negative error code on failure + */ +CEPH_RADOS_API int rados_conf_parse_argv_remainder(rados_t cluster, int argc, + const char **argv, + const char **remargv); +/** + * Configure the cluster handle based on an environment variable + * + * The contents of the environment variable are parsed as if they were + * Ceph command line options. If var is NULL, the CEPH_ARGS + * environment variable is used. + * + * @pre rados_connect() has not been called on the cluster handle + * + * @note BUG: this is not threadsafe - it uses a static buffer + * + * @param cluster cluster handle to configure + * @param var name of the environment variable to read + * @returns 0 on success, negative error code on failure + */ +CEPH_RADOS_API int rados_conf_parse_env(rados_t cluster, const char *var); + +/** + * Set a configuration option + * + * @pre rados_connect() has not been called on the cluster handle + * + * @param cluster cluster handle to configure + * @param option option to set + * @param value value of the option + * @returns 0 on success, negative error code on failure + * @returns -ENOENT when the option is not a Ceph configuration option + */ +CEPH_RADOS_API int rados_conf_set(rados_t cluster, const char *option, + const char *value); + +/** + * Get the value of a configuration option + * + * @param cluster configuration to read + * @param option which option to read + * @param buf where to write the configuration value + * @param len the size of buf in bytes + * @returns 0 on success, negative error code on failure + * @returns -ENAMETOOLONG if the buffer is too short to contain the + * requested value + */ +CEPH_RADOS_API int rados_conf_get(rados_t cluster, const char *option, + 
char *buf, size_t len); + +/** @} config */ + +/** + * Read usage info about the cluster + * + * This tells you total space, space used, space available, and number + * of objects. These are not updated immediately when data is written, + * they are eventually consistent. + * + * @param cluster cluster to query + * @param result where to store the results + * @returns 0 on success, negative error code on failure + */ +CEPH_RADOS_API int rados_cluster_stat(rados_t cluster, + struct rados_cluster_stat_t *result); + +/** + * Get the fsid of the cluster as a hexadecimal string. + * + * The fsid is a unique id of an entire Ceph cluster. + * + * @param cluster where to get the fsid + * @param buf where to write the fsid + * @param len the size of buf in bytes (should be 37) + * @returns 0 on success, negative error code on failure + * @returns -ERANGE if the buffer is too short to contain the + * fsid + */ +CEPH_RADOS_API int rados_cluster_fsid(rados_t cluster, char *buf, size_t len); + +/** + * Get/wait for the most recent osdmap + * + * @param cluster cluster handle + * @returns 0 on success, negative error code on failure + */ +CEPH_RADOS_API int rados_wait_for_latest_osdmap(rados_t cluster); + +/** + * @name Pools + * + * RADOS pools are separate namespaces for objects. Pools may have + * different crush rules associated with them, so they could have + * differing replication levels or placement strategies. RADOS + * permissions are also tied to pools - users can have different read, + * write, and execute permissions on a per-pool basis. + * + * @{ + */ + +/** + * List pools + * + * Gets a list of pool names as NULL-terminated strings. The pool + * names will be placed in the supplied buffer one after another. + * After the last pool name, there will be two 0 bytes in a row. + * + * If len is too short to fit all the pool name entries we need, we will fill + * as much as we can. + * + * Buf may be null to determine the buffer size needed to list all pools.
+ * + * @param cluster cluster handle + * @param buf output buffer + * @param len output buffer length + * @returns length of the buffer we would need to list all pools + */ +CEPH_RADOS_API int rados_pool_list(rados_t cluster, char *buf, size_t len); + +/** + * List inconsistent placement groups of the given pool + * + * Gets a list of inconsistent placement groups as NULL-terminated strings. + * The placement group names will be placed in the supplied buffer one after + * another. After the last name, there will be two 0 bytes in a row. + * + * If len is too short to fit all the placement group entries we need, we will + * fill as much as we can. + * + * @param cluster cluster handle + * @param pool pool ID + * @param buf output buffer + * @param len output buffer length + * @returns length of the buffer we would need to list all pools + */ +CEPH_RADOS_API int rados_inconsistent_pg_list(rados_t cluster, int64_t pool, + char *buf, size_t len); + +/** + * Get a configuration handle for a rados cluster handle + * + * This handle is valid only as long as the cluster handle is valid.
+ * + * @param cluster cluster handle + * @returns config handle for this cluster + */ +CEPH_RADOS_API rados_config_t rados_cct(rados_t cluster); + +/** + * Get a global id for current instance + * + * This id is a unique representation of current connection to the cluster + * + * @param cluster cluster handle + * @returns instance global id + */ +CEPH_RADOS_API uint64_t rados_get_instance_id(rados_t cluster); + +/** + * Gets the minimum compatible OSD version + * + * @param cluster cluster handle + * @param[out] require_osd_release minimum compatible OSD version + * based upon the current features + * @returns 0 on success, negative error code on failure + */ +CEPH_RADOS_API int rados_get_min_compatible_osd(rados_t cluster, + int8_t* require_osd_release); + +/** + * Gets the minimum compatible client version + * + * @param cluster cluster handle + * @param[out] min_compat_client minimum compatible client version + * based upon the current features + * @param[out] require_min_compat_client required minimum client version + * based upon explicit setting + * @returns 0 on success, negative error code on failure + */ +CEPH_RADOS_API int rados_get_min_compatible_client(rados_t cluster, + int8_t* min_compat_client, + int8_t* require_min_compat_client); + +/** + * Create an io context + * + * The io context allows you to perform operations within a particular + * pool. For more details see rados_ioctx_t. + * + * @param cluster which cluster the pool is in + * @param pool_name name of the pool + * @param ioctx where to store the io context + * @returns 0 on success, negative error code on failure + */ +CEPH_RADOS_API int rados_ioctx_create(rados_t cluster, const char *pool_name, + rados_ioctx_t *ioctx); +CEPH_RADOS_API int rados_ioctx_create2(rados_t cluster, int64_t pool_id, + rados_ioctx_t *ioctx); + +/** + * The opposite of rados_ioctx_create + * + * This just tells librados that you no longer need to use the io context.
+ * It may not be freed immediately if there are pending asynchronous + * requests on it, but you should not use an io context again after + * calling this function on it. + * + * @warning This does not guarantee any asynchronous + * writes have completed. You must call rados_aio_flush() + * on the io context before destroying it to do that. + * + * @warning If this ioctx is used by rados_watch, the caller needs to + * be sure that all registered watches are disconnected via + * rados_unwatch() and that rados_watch_flush() is called. This + * ensures that a racing watch callback does not make use of a + * destroyed ioctx. + * + * @param io the io context to dispose of + */ +CEPH_RADOS_API void rados_ioctx_destroy(rados_ioctx_t io); + +/** + * Get configuration handle for a pool handle + * + * @param io pool handle + * @returns rados_config_t for this cluster + */ +CEPH_RADOS_API rados_config_t rados_ioctx_cct(rados_ioctx_t io); + +/** + * Get the cluster handle used by this rados_ioctx_t + * Note that this is a weak reference, and should not + * be destroyed via rados_shutdown(). + * + * @param io the io context + * @returns the cluster handle for this io context + */ +CEPH_RADOS_API rados_t rados_ioctx_get_cluster(rados_ioctx_t io); + +/** + * Get pool usage statistics + * + * Fills in a rados_pool_stat_t after querying the cluster. 
+ * + * @param io determines which pool to query + * @param stats where to store the results + * @returns 0 on success, negative error code on failure + */ +CEPH_RADOS_API int rados_ioctx_pool_stat(rados_ioctx_t io, + struct rados_pool_stat_t *stats); + +/** + * Get the id of a pool + * + * @param cluster which cluster the pool is in + * @param pool_name which pool to look up + * @returns id of the pool + * @returns -ENOENT if the pool is not found + */ +CEPH_RADOS_API int64_t rados_pool_lookup(rados_t cluster, + const char *pool_name); + +/** + * Get the name of a pool + * + * @param cluster which cluster the pool is in + * @param id the id of the pool + * @param buf where to store the pool name + * @param maxlen size of buffer where name will be stored + * @returns length of string stored, or -ERANGE if buffer too small + */ +CEPH_RADOS_API int rados_pool_reverse_lookup(rados_t cluster, int64_t id, + char *buf, size_t maxlen); + +/** + * Create a pool with default settings + * + * The default crush rule is rule 0. + * + * @param cluster the cluster in which the pool will be created + * @param pool_name the name of the new pool + * @returns 0 on success, negative error code on failure + */ +CEPH_RADOS_API int rados_pool_create(rados_t cluster, const char *pool_name); + +/** + * Create a pool owned by a specific auid. + * + * DEPRECATED: auid support has been removed, and this call will be removed in a future + * release. 
+ * + * @param cluster the cluster in which the pool will be created + * @param pool_name the name of the new pool + * @param auid the id of the owner of the new pool + * @returns 0 on success, negative error code on failure + */ +CEPH_RADOS_API int rados_pool_create_with_auid(rados_t cluster, + const char *pool_name, + uint64_t auid) + __attribute__((deprecated)); + +/** + * Create a pool with a specific CRUSH rule + * + * @param cluster the cluster in which the pool will be created + * @param pool_name the name of the new pool + * @param crush_rule_num which rule to use for placement in the new pool + * @returns 0 on success, negative error code on failure + */ +CEPH_RADOS_API int rados_pool_create_with_crush_rule(rados_t cluster, + const char *pool_name, + uint8_t crush_rule_num); + +/** + * Create a pool with a specific CRUSH rule and auid + * + * DEPRECATED: auid support has been removed and this call will be removed + * in a future release. + * + * This is a combination of rados_pool_create_with_crush_rule() and + * rados_pool_create_with_auid(). + * + * @param cluster the cluster in which the pool will be created + * @param pool_name the name of the new pool + * @param crush_rule_num which rule to use for placement in the new pool + * @param auid the id of the owner of the new pool + * @returns 0 on success, negative error code on failure + */ +CEPH_RADOS_API int rados_pool_create_with_all(rados_t cluster, + const char *pool_name, + uint64_t auid, + uint8_t crush_rule_num) + __attribute__((deprecated)); + +/** + * Returns the pool that is the base tier for this pool. + * + * The return value is the ID of the pool that should be used to read from/write to. + * If tiering is not set up for the pool, returns \c pool.
+ * + * @param cluster the cluster the pool is in + * @param pool ID of the pool to query + * @param[out] base_tier base tier, or \c pool if tiering is not configured + * @returns 0 on success, negative error code on failure + */ +CEPH_RADOS_API int rados_pool_get_base_tier(rados_t cluster, int64_t pool, + int64_t* base_tier); + +/** + * Delete a pool and all data inside it + * + * The pool is removed from the cluster immediately, + * but the actual data is deleted in the background. + * + * @param cluster the cluster the pool is in + * @param pool_name which pool to delete + * @returns 0 on success, negative error code on failure + */ +CEPH_RADOS_API int rados_pool_delete(rados_t cluster, const char *pool_name); + +/** + * Attempt to change an io context's associated auid "owner" + * + * DEPRECATED: auid support has been removed and this call has no effect. + * + * Requires that you have write permission on both the current and new + * auid. + * + * @param io reference to the pool to change. + * @param auid the auid you wish the io to have. + * @returns 0 on success, negative error code on failure + */ +CEPH_RADOS_API int rados_ioctx_pool_set_auid(rados_ioctx_t io, uint64_t auid) + __attribute__((deprecated)); + + +/** + * Get the auid of a pool + * + * DEPRECATED: auid support has been removed and this call always reports + * CEPH_AUTH_UID_DEFAULT (-1). + + * @param io pool to query + * @param auid where to store the auid + * @returns 0 on success, negative error code on failure + */ +CEPH_RADOS_API int rados_ioctx_pool_get_auid(rados_ioctx_t io, uint64_t *auid) + __attribute__((deprecated)); + +/* deprecated, use rados_ioctx_pool_requires_alignment2 instead */ +CEPH_RADOS_API int rados_ioctx_pool_requires_alignment(rados_ioctx_t io) + __attribute__((deprecated)); + +/** + * Test whether the specified pool requires alignment or not. + * + * @param io pool to query + * @param req 1 if alignment is supported, 0 if not. 
+ * @returns 0 on success, negative error code on failure + */ +CEPH_RADOS_API int rados_ioctx_pool_requires_alignment2(rados_ioctx_t io, + int *req); + +/* deprecated, use rados_ioctx_pool_required_alignment2 instead */ +CEPH_RADOS_API uint64_t rados_ioctx_pool_required_alignment(rados_ioctx_t io) + __attribute__((deprecated)); + +/** + * Get the alignment flavor of a pool + * + * @param io pool to query + * @param alignment where to store the alignment flavor + * @returns 0 on success, negative error code on failure + */ +CEPH_RADOS_API int rados_ioctx_pool_required_alignment2(rados_ioctx_t io, + uint64_t *alignment); + +/** + * Get the pool id of the io context + * + * @param io the io context to query + * @returns the id of the pool the io context uses + */ +CEPH_RADOS_API int64_t rados_ioctx_get_id(rados_ioctx_t io); + +/** + * Get the pool name of the io context + * + * @param io the io context to query + * @param buf pointer to buffer where name will be stored + * @param maxlen size of buffer where name will be stored + * @returns length of string stored, or -ERANGE if buffer too small + */ +CEPH_RADOS_API int rados_ioctx_get_pool_name(rados_ioctx_t io, char *buf, + unsigned maxlen); + +/** @} pools */ + +/** + * @name Object Locators + * + * @{ + */ + +/** + * Set the key for mapping objects to pgs within an io context. + * + * The key is used instead of the object name to determine which + * placement groups an object is put in. This affects all subsequent + * operations of the io context - until a different locator key is + * set, all objects in this io context will be placed in the same pg. 
+ * + * @param io the io context to change + * @param key the key to use as the object locator, or NULL to discard + * any previously set key + */ +CEPH_RADOS_API void rados_ioctx_locator_set_key(rados_ioctx_t io, + const char *key); + +/** + * Set the namespace for objects within an io context + * + * The namespace specification further refines a pool into different + * domains. The mapping of objects to pgs is also based on this + * value. + * + * @param io the io context to change + * @param nspace the name to use as the namespace, or NULL use the + * default namespace + */ +CEPH_RADOS_API void rados_ioctx_set_namespace(rados_ioctx_t io, + const char *nspace); + +/** + * Get the namespace for objects within the io context + * + * @param io the io context to query + * @param buf pointer to buffer where name will be stored + * @param maxlen size of buffer where name will be stored + * @returns length of string stored, or -ERANGE if buffer too small + */ +CEPH_RADOS_API int rados_ioctx_get_namespace(rados_ioctx_t io, char *buf, + unsigned maxlen); + +/** @} obj_loc */ + +/** + * @name Listing Objects + * @{ + */ +/** + * Start listing objects in a pool + * + * @param io the pool to list from + * @param ctx the handle to store list context in + * @returns 0 on success, negative error code on failure + */ +CEPH_RADOS_API int rados_nobjects_list_open(rados_ioctx_t io, + rados_list_ctx_t *ctx); + +/** + * Return hash position of iterator, rounded to the current PG + * + * @param ctx iterator marking where you are in the listing + * @returns current hash position, rounded to the current pg + */ +CEPH_RADOS_API uint32_t rados_nobjects_list_get_pg_hash_position(rados_list_ctx_t ctx); + +/** + * Reposition object iterator to a different hash position + * + * @param ctx iterator marking where you are in the listing + * @param pos hash position to move to + * @returns actual (rounded) position we moved to + */ +CEPH_RADOS_API uint32_t 
rados_nobjects_list_seek(rados_list_ctx_t ctx, + uint32_t pos); + +/** + * Reposition object iterator to a different position + * + * @param ctx iterator marking where you are in the listing + * @param cursor position to move to + * @returns rounded position we moved to + */ +CEPH_RADOS_API uint32_t rados_nobjects_list_seek_cursor(rados_list_ctx_t ctx, + rados_object_list_cursor cursor); + +/** + * Get a cursor handle for the position of the object iterator + * + * The returned handle must be released with rados_object_list_cursor_free(). + * + * @param ctx iterator marking where you are in the listing + * @param cursor where to store cursor + * @returns 0 on success, negative error code on failure + */ +CEPH_RADOS_API int rados_nobjects_list_get_cursor(rados_list_ctx_t ctx, + rados_object_list_cursor *cursor); + +/** + * Get the next object name and locator in the pool + * + * *entry and *key are valid until next call to rados_nobjects_list_* + * + * @param ctx iterator marking where you are in the listing + * @param entry where to store the name of the entry + * @param key where to store the object locator (set to NULL to ignore) + * @param nspace where to store the object namespace (set to NULL to ignore) + * @returns 0 on success, negative error code on failure + * @returns -ENOENT when there are no more objects to list + */ +CEPH_RADOS_API int rados_nobjects_list_next(rados_list_ctx_t ctx, + const char **entry, + const char **key, + const char **nspace); + +/** + * Get the next object name, locator and their sizes in the pool + * + * The sizes allow listing objects with \0 (the NUL character) + * in e.g. *entry. It is unusual to see such object names but a bug + * in a client has raised the need to handle them as well. 
+ * *entry and *key are valid until next call to rados_nobjects_list_* + * + * @param ctx iterator marking where you are in the listing + * @param entry where to store the name of the entry + * @param key where to store the object locator (set to NULL to ignore) + * @param nspace where to store the object namespace (set to NULL to ignore) + * @param entry_size where to store the size of the name of the entry + * @param key_size where to store the size of object locator (set to NULL to ignore) + * @param nspace_size where to store the size of object namespace (set to NULL to ignore) + * @returns 0 on success, negative error code on failure + * @returns -ENOENT when there are no more objects to list + */ +CEPH_RADOS_API int rados_nobjects_list_next2(rados_list_ctx_t ctx, + const char **entry, + const char **key, + const char **nspace, + size_t *entry_size, + size_t *key_size, + size_t *nspace_size); + +/** + * Close the object listing handle. + * + * This should be called when the handle is no longer needed. + * The handle should not be used after it has been closed. + * + * @param ctx the handle to close + */ +CEPH_RADOS_API void rados_nobjects_list_close(rados_list_ctx_t ctx); + +/** + * Get cursor handle pointing to the *beginning* of a pool. + * + * This is an opaque handle pointing to the start of a pool. It must + * be released with rados_object_list_cursor_free(). + * + * @param io ioctx for the pool + * @returns handle for the pool, NULL on error (pool does not exist) + */ +CEPH_RADOS_API rados_object_list_cursor rados_object_list_begin( + rados_ioctx_t io); + +/** + * Get cursor handle pointing to the *end* of a pool. + * + * This is an opaque handle pointing to the end of a pool. It must + * be released with rados_object_list_cursor_free(). 
+ * + * @param io ioctx for the pool + * @returns handle for the pool, NULL on error (pool does not exist) + */ +CEPH_RADOS_API rados_object_list_cursor rados_object_list_end(rados_ioctx_t io); + +/** + * Check if a cursor has reached the end of a pool + * + * @param io ioctx + * @param cur cursor + * @returns 1 if the cursor has reached the end of the pool, 0 otherwise + */ +CEPH_RADOS_API int rados_object_list_is_end(rados_ioctx_t io, + rados_object_list_cursor cur); + +/** + * Release a cursor + * + * Release a cursor. The handle may not be used after this point. + * + * @param io ioctx + * @param cur cursor + */ +CEPH_RADOS_API void rados_object_list_cursor_free(rados_ioctx_t io, + rados_object_list_cursor cur); + +/** + * Compare two cursor positions + * + * Compare two cursors, and indicate whether the first cursor precedes, + * matches, or follows the second. + * + * @param io ioctx + * @param lhs first cursor + * @param rhs second cursor + * @returns -1, 0, or 1 for lhs < rhs, lhs == rhs, or lhs > rhs + */ +CEPH_RADOS_API int rados_object_list_cursor_cmp(rados_ioctx_t io, + rados_object_list_cursor lhs, rados_object_list_cursor rhs); + +/** + * @return the number of items set in the results array + */ +CEPH_RADOS_API int rados_object_list(rados_ioctx_t io, + const rados_object_list_cursor start, + const rados_object_list_cursor finish, + const size_t result_size, + const char *filter_buf, + const size_t filter_buf_len, + rados_object_list_item *results, + rados_object_list_cursor *next); + +CEPH_RADOS_API void rados_object_list_free( + const size_t result_size, + rados_object_list_item *results); + +/** + * Obtain cursors delineating a subset of a range. Use this + * when you want to split up the work of iterating over the + * global namespace. Expected use case is when you are iterating + * in parallel, with `m` workers, and each worker taking an id `n`. 
+ * + * @param io ioctx + * @param start start of the range to be sliced up (inclusive) + * @param finish end of the range to be sliced up (exclusive) + * @param n which of the m chunks you would like to get cursors for + * @param m how many chunks to divide start-finish into + * @param split_start cursor populated with start of the subrange (inclusive) + * @param split_finish cursor populated with end of the subrange (exclusive) + */ +CEPH_RADOS_API void rados_object_list_slice(rados_ioctx_t io, + const rados_object_list_cursor start, + const rados_object_list_cursor finish, + const size_t n, + const size_t m, + rados_object_list_cursor *split_start, + rados_object_list_cursor *split_finish); + + +/** @} Listing Objects */ + +/** + * @name Snapshots + * + * RADOS snapshots are based upon sequence numbers that form a + * snapshot context. They are pool-specific. The snapshot context + * consists of the current snapshot sequence number for a pool, and an + * array of sequence numbers at which snapshots were taken, in + * descending order. Whenever a snapshot is created or deleted, the + * snapshot sequence number for the pool is increased. To add a new + * snapshot, the new snapshot sequence number must be increased and + * added to the snapshot context. + * + * There are two ways to manage these snapshot contexts: + * -# within the RADOS cluster + * These are called pool snapshots, and store the snapshot context + * in the OSDMap. These represent a snapshot of all the objects in + * a pool. + * -# within the RADOS clients + * These are called self-managed snapshots, and push the + * responsibility for keeping track of the snapshot context to the + * clients. For every write, the client must send the snapshot + * context. In librados, this is accomplished with + * rados_selfmanaged_snap_set_write_ctx(). These are more + * difficult to manage, but are restricted to specific objects + * instead of applying to an entire pool. 
+ * + * @{ + */ + +/** + * Create a pool-wide snapshot + * + * @param io the pool to snapshot + * @param snapname the name of the snapshot + * @returns 0 on success, negative error code on failure + */ +CEPH_RADOS_API int rados_ioctx_snap_create(rados_ioctx_t io, + const char *snapname); + +/** + * Delete a pool snapshot + * + * @param io the pool to delete the snapshot from + * @param snapname which snapshot to delete + * @returns 0 on success, negative error code on failure + */ +CEPH_RADOS_API int rados_ioctx_snap_remove(rados_ioctx_t io, + const char *snapname); + +/** + * Rollback an object to a pool snapshot + * + * The contents of the object will be the same as + * when the snapshot was taken. + * + * @param io the pool in which the object is stored + * @param oid the name of the object to rollback + * @param snapname which snapshot to rollback to + * @returns 0 on success, negative error code on failure + */ +CEPH_RADOS_API int rados_ioctx_snap_rollback(rados_ioctx_t io, const char *oid, + const char *snapname); + +/** + * @warning Deprecated: Use rados_ioctx_snap_rollback() instead + */ +CEPH_RADOS_API int rados_rollback(rados_ioctx_t io, const char *oid, + const char *snapname) + __attribute__((deprecated)); + +/** + * Set the snapshot from which reads are performed. + * + * Subsequent reads will return data as it was at the time of that + * snapshot. + * + * @param io the io context to change + * @param snap the id of the snapshot to set, or LIBRADOS_SNAP_HEAD for no + * snapshot (i.e. normal operation) + */ +CEPH_RADOS_API void rados_ioctx_snap_set_read(rados_ioctx_t io, + rados_snap_t snap); + +/** + * Allocate an ID for a self-managed snapshot + * + * Get a unique ID to put in the snapshot context to create a + * snapshot. A clone of an object is not created until a write with + * the new snapshot context is completed. 
+ * + * @param io the pool in which the snapshot will exist + * @param snapid where to store the newly allocated snapshot ID + * @returns 0 on success, negative error code on failure + */ +CEPH_RADOS_API int rados_ioctx_selfmanaged_snap_create(rados_ioctx_t io, + rados_snap_t *snapid); +CEPH_RADOS_API void +rados_aio_ioctx_selfmanaged_snap_create(rados_ioctx_t io, + rados_snap_t *snapid, + rados_completion_t completion); + +/** + * Remove a self-managed snapshot + * + * This increases the snapshot sequence number, which will cause + * snapshots to be removed lazily. + * + * @param io the pool in which the snapshot exists + * @param snapid the ID of the snapshot to remove + * @returns 0 on success, negative error code on failure + */ +CEPH_RADOS_API int rados_ioctx_selfmanaged_snap_remove(rados_ioctx_t io, + rados_snap_t snapid); +CEPH_RADOS_API void +rados_aio_ioctx_selfmanaged_snap_remove(rados_ioctx_t io, + rados_snap_t snapid, + rados_completion_t completion); + +/** + * Rollback an object to a self-managed snapshot + * + * The contents of the object will be the same as + * when the snapshot was taken. + * + * @param io the pool in which the object is stored + * @param oid the name of the object to rollback + * @param snapid which snapshot to rollback to + * @returns 0 on success, negative error code on failure + */ +CEPH_RADOS_API int rados_ioctx_selfmanaged_snap_rollback(rados_ioctx_t io, + const char *oid, + rados_snap_t snapid); + +/** + * Set the snapshot context for use when writing to objects + * + * This is stored in the io context, and applies to all future writes. 
+ * + * @param io the io context to change + * @param seq the newest snapshot sequence number for the pool + * @param snaps array of snapshots sorted by descending id + * @param num_snaps how many snapshots are in the snaps array + * @returns 0 on success, negative error code on failure + * @returns -EINVAL if snaps are not in descending order + */ +CEPH_RADOS_API int rados_ioctx_selfmanaged_snap_set_write_ctx(rados_ioctx_t io, + rados_snap_t seq, + rados_snap_t *snaps, + int num_snaps); + +/** + * List all the ids of pool snapshots + * + * If the output array does not have enough space to fit all the + * snapshots, -ERANGE is returned and the caller should retry with a + * larger array. + * + * @param io the pool to read from + * @param snaps where to store the results + * @param maxlen the number of rados_snap_t that fit in the snaps array + * @returns number of snapshots on success, negative error code on failure + * @returns -ERANGE if the snaps array is too short + */ +CEPH_RADOS_API int rados_ioctx_snap_list(rados_ioctx_t io, rados_snap_t *snaps, + int maxlen); + +/** + * Get the id of a pool snapshot + * + * @param io the pool to read from + * @param name the snapshot to find + * @param id where to store the result + * @returns 0 on success, negative error code on failure + */ +CEPH_RADOS_API int rados_ioctx_snap_lookup(rados_ioctx_t io, const char *name, + rados_snap_t *id); + +/** + * Get the name of a pool snapshot + * + * @param io the pool to read from + * @param id the snapshot to find + * @param name where to store the result + * @param maxlen the size of the name array + * @returns 0 on success, negative error code on failure + * @returns -ERANGE if the name array is too small + */ +CEPH_RADOS_API int rados_ioctx_snap_get_name(rados_ioctx_t io, rados_snap_t id, + char *name, int maxlen); + +/** + * Find when a pool snapshot occurred + * + * @param io the pool the snapshot was taken in + * @param id the snapshot to look up + * @param t 
where to store the result + * @returns 0 on success, negative error code on failure + */ +CEPH_RADOS_API int rados_ioctx_snap_get_stamp(rados_ioctx_t io, rados_snap_t id, + time_t *t); + +/** @} Snapshots */ + +/** + * @name Synchronous I/O + * Writes are replicated to a number of OSDs based on the + * configuration of the pool they are in. These write functions block + * until data is in memory on all replicas of the object they're + * writing to - they are equivalent to doing the corresponding + * asynchronous write, and then calling + * rados_aio_wait_for_complete(). For greater data safety, use the + * asynchronous functions and rados_aio_wait_for_safe(). + * + * @{ + */ + +/** + * Return the version of the last object read or written to. + * + * This exposes the internal version number of the last object read or + * written via this io context + * + * @param io the io context to check + * @returns last read or written object version + */ +CEPH_RADOS_API uint64_t rados_get_last_version(rados_ioctx_t io); + +/** + * Write *len* bytes from *buf* into the *oid* object, starting at + * offset *off*. The value of *len* must be <= UINT_MAX/2. + * + * @note This will never return a positive value not equal to len. + * @param io the io context in which the write will occur + * @param oid name of the object + * @param buf data to write + * @param len length of the data, in bytes + * @param off byte offset in the object to begin writing at + * @returns 0 on success, negative error code on failure + */ +CEPH_RADOS_API int rados_write(rados_ioctx_t io, const char *oid, + const char *buf, size_t len, uint64_t off); + +/** + * Write *len* bytes from *buf* into the *oid* object. The value of + * *len* must be <= UINT_MAX/2. + * + * The object is filled with the provided data. If the object exists, + * it is atomically truncated and then written. 
+ * + * @param io the io context in which the write will occur + * @param oid name of the object + * @param buf data to write + * @param len length of the data, in bytes + * @returns 0 on success, negative error code on failure + */ +CEPH_RADOS_API int rados_write_full(rados_ioctx_t io, const char *oid, + const char *buf, size_t len); + +/** + * Write the same *data_len* bytes from *buf* multiple times into the + * *oid* object. *write_len* bytes are written in total, which must be + * a multiple of *data_len*. The value of *write_len* and *data_len* + * must be <= UINT_MAX/2. + * + * @param io the io context in which the write will occur + * @param oid name of the object + * @param buf data to write + * @param data_len length of the data, in bytes + * @param write_len the total number of bytes to write + * @param off byte offset in the object to begin writing at + * @returns 0 on success, negative error code on failure + */ +CEPH_RADOS_API int rados_writesame(rados_ioctx_t io, const char *oid, + const char *buf, size_t data_len, + size_t write_len, uint64_t off); + +/** + * Append *len* bytes from *buf* into the *oid* object. The value of + * *len* must be <= UINT_MAX/2. + * + * @param io the context to operate in + * @param oid the name of the object + * @param buf the data to append + * @param len length of buf (in bytes) + * @returns 0 on success, negative error code on failure + */ +CEPH_RADOS_API int rados_append(rados_ioctx_t io, const char *oid, + const char *buf, size_t len); + +/** + * Read data from an object + * + * The io context determines the snapshot to read from, if any was set + * by rados_ioctx_snap_set_read(). 
+ * + * @param io the context in which to perform the read + * @param oid the name of the object to read from + * @param buf where to store the results + * @param len the number of bytes to read + * @param off the offset to start reading from in the object + * @returns number of bytes read on success, negative error code on + * failure + */ +CEPH_RADOS_API int rados_read(rados_ioctx_t io, const char *oid, char *buf, + size_t len, uint64_t off); + +/** + * Compute checksum from object data + * + * The io context determines the snapshot to checksum, if any was set + * by rados_ioctx_snap_set_read(). The length of the init_value and + * resulting checksum are dependent upon the checksum type: + * + * XXHASH64: le64 + * XXHASH32: le32 + * CRC32C: le32 + * + * The checksum result is encoded the following manner: + * + * le32 num_checksum_chunks + * { + * leXX checksum for chunk (where XX = appropriate size for the checksum type) + * } * num_checksum_chunks + * + * @param io the context in which to perform the checksum + * @param oid the name of the object to checksum + * @param type the checksum algorithm to utilize + * @param init_value the init value for the algorithm + * @param init_value_len the length of the init value + * @param len the number of bytes to checksum + * @param off the offset to start checksumming in the object + * @param chunk_size optional length-aligned chunk size for checksums + * @param pchecksum where to store the checksum result + * @param checksum_len the number of bytes available for the result + * @return negative error code on failure + */ +CEPH_RADOS_API int rados_checksum(rados_ioctx_t io, const char *oid, + rados_checksum_type_t type, + const char *init_value, size_t init_value_len, + size_t len, uint64_t off, size_t chunk_size, + char *pchecksum, size_t checksum_len); + +/** + * Delete an object + * + * @note This does not delete any snapshots of the object. 
+ * + * @param io the pool to delete the object from + * @param oid the name of the object to delete + * @returns 0 on success, negative error code on failure + */ +CEPH_RADOS_API int rados_remove(rados_ioctx_t io, const char *oid); + +/** + * Resize an object + * + * If this enlarges the object, the new area is logically filled with + * zeroes. If this shrinks the object, the excess data is removed. + * + * @param io the context in which to truncate + * @param oid the name of the object + * @param size the new size of the object in bytes + * @returns 0 on success, negative error code on failure + */ +CEPH_RADOS_API int rados_trunc(rados_ioctx_t io, const char *oid, + uint64_t size); + +/** + * Compare an on-disk object range with a buffer + * + * @param io the context in which to perform the comparison + * @param o name of the object + * @param cmp_buf buffer containing bytes to be compared with object contents + * @param cmp_len length to compare and size of @c cmp_buf in bytes + * @param off object byte offset at which to start the comparison + * @returns 0 on success, negative error code on failure, + * (-MAX_ERRNO - mismatch_off) on mismatch + */ +CEPH_RADOS_API int rados_cmpext(rados_ioctx_t io, const char *o, + const char *cmp_buf, size_t cmp_len, + uint64_t off); + +/** + * @name Xattrs + * Extended attributes are stored as extended attributes on the files + * representing an object on the OSDs. Thus, they have the same + * limitations as the underlying filesystem. On ext4, this means that + * the total data stored in xattrs cannot exceed 4KB. + * + * @{ + */ + +/** + * Get the value of an extended attribute on an object. 
+ * + * @param io the context in which the attribute is read + * @param o name of the object + * @param name which extended attribute to read + * @param buf where to store the result + * @param len size of buf in bytes + * @returns length of xattr value on success, negative error code on failure + */ +CEPH_RADOS_API int rados_getxattr(rados_ioctx_t io, const char *o, + const char *name, char *buf, size_t len); + +/** + * Set an extended attribute on an object. + * + * @param io the context in which xattr is set + * @param o name of the object + * @param name which extended attribute to set + * @param buf what to store in the xattr + * @param len the number of bytes in buf + * @returns 0 on success, negative error code on failure + */ +CEPH_RADOS_API int rados_setxattr(rados_ioctx_t io, const char *o, + const char *name, const char *buf, + size_t len); + +/** + * Delete an extended attribute from an object. + * + * @param io the context in which to delete the xattr + * @param o the name of the object + * @param name which xattr to delete + * @returns 0 on success, negative error code on failure + */ +CEPH_RADOS_API int rados_rmxattr(rados_ioctx_t io, const char *o, + const char *name); + +/** + * Start iterating over xattrs on an object. + * + * @post iter is a valid iterator + * + * @param io the context in which to list xattrs + * @param oid name of the object + * @param iter where to store the iterator + * @returns 0 on success, negative error code on failure + */ +CEPH_RADOS_API int rados_getxattrs(rados_ioctx_t io, const char *oid, + rados_xattrs_iter_t *iter); + +/** + * Get the next xattr on the object + * + * @pre iter is a valid iterator + * + * @post name is the NULL-terminated name of the next xattr, and val + * contains the value of the xattr, which is of length len. If the end + * of the list has been reached, name and val are NULL, and len is 0. 
+ * + * @param iter iterator to advance + * @param name where to store the name of the next xattr + * @param val where to store the value of the next xattr + * @param len where to store the number of bytes in val + * @returns 0 on success, negative error code on failure + */ +CEPH_RADOS_API int rados_getxattrs_next(rados_xattrs_iter_t iter, + const char **name, const char **val, + size_t *len); + +/** + * Close the xattr iterator. + * + * iter should not be used after this is called. + * + * @param iter the iterator to close + */ +CEPH_RADOS_API void rados_getxattrs_end(rados_xattrs_iter_t iter); + +/** @} Xattrs */ + +/** + * Get the next omap key/value pair on the object + * + * @pre iter is a valid iterator + * + * @post key and val are the next key/value pair. key is + * null-terminated, and val has length len. If the end of the list has + * been reached, key and val are NULL, and len is 0. key and val will + * not be accessible after rados_omap_get_end() is called on iter, so + * if they are needed after that they should be copied. + * + * @param iter iterator to advance + * @param key where to store the key of the next omap entry + * @param val where to store the value of the next omap entry + * @param len where to store the number of bytes in val + * @returns 0 on success, negative error code on failure + */ +CEPH_RADOS_API int rados_omap_get_next(rados_omap_iter_t iter, + char **key, + char **val, + size_t *len); + +/** + * Get the next omap key/value pair on the object. Note that it's + * perfectly safe to mix calls to rados_omap_get_next and + * rados_omap_get_next2. + * + * @pre iter is a valid iterator + * + * @post key and val are the next key/value pair. key has length + * key_len and val has length val_len. If the end of the list has + * been reached, key and val are NULL, and key_len and val_len are 0. + * key and val will not be accessible after rados_omap_get_end() + * is called on iter, so if they are needed after that they + * should be copied. 
+ * + * @param iter iterator to advance + * @param key where to store the key of the next omap entry + * @param val where to store the value of the next omap entry + * @param key_len where to store the number of bytes in key + * @param val_len where to store the number of bytes in val + * @returns 0 on success, negative error code on failure + */ +CEPH_RADOS_API int rados_omap_get_next2(rados_omap_iter_t iter, + char **key, + char **val, + size_t *key_len, + size_t *val_len); + +/** + * Return number of elements in the iterator + * + * @param iter the iterator of which to return the size + */ +CEPH_RADOS_API unsigned int rados_omap_iter_size(rados_omap_iter_t iter); + +/** + * Close the omap iterator. + * + * iter should not be used after this is called. + * + * @param iter the iterator to close + */ +CEPH_RADOS_API void rados_omap_get_end(rados_omap_iter_t iter); + +/** + * Get object stats (size/mtime) + * + * TODO: when are these set, and by whom? can they be out of date? + * + * @param io ioctx + * @param o object name + * @param psize where to store object size + * @param pmtime where to store modification time + * @returns 0 on success, negative error code on failure + */ +CEPH_RADOS_API int rados_stat(rados_ioctx_t io, const char *o, uint64_t *psize, + time_t *pmtime); +/** + * Execute an OSD class method on an object + * + * The OSD has a plugin mechanism for performing complicated + * operations on an object atomically. These plugins are called + * classes. This function allows librados users to call the custom + * methods. The input and output formats are defined by the class. 
+ * Classes in ceph.git can be found in src/cls subdirectories + * + * @param io the context in which to call the method + * @param oid the object to call the method on + * @param cls the name of the class + * @param method the name of the method + * @param in_buf where to find input + * @param in_len length of in_buf in bytes + * @param buf where to store output + * @param out_len length of buf in bytes + * @returns the length of the output, or + * -ERANGE if buf does not have enough space to store it (for methods that return data). For + * methods that don't return data, the return value is + * method-specific. + */ +CEPH_RADOS_API int rados_exec(rados_ioctx_t io, const char *oid, + const char *cls, const char *method, + const char *in_buf, size_t in_len, char *buf, + size_t out_len); + + +/** @} Synchronous I/O */ + +/** + * @name Asynchronous I/O + * Read and write to objects without blocking. + * + * @{ + */ + +/** + * @typedef rados_callback_t + * Callbacks for asynchronous operations take two parameters: + * - cb the completion that has finished + * - arg application defined data made available to the callback function + */ +typedef void (*rados_callback_t)(rados_completion_t cb, void *arg); + +/** + * Constructs a completion to use with asynchronous operations + * + * The complete and safe callbacks correspond to operations being + * acked and committed, respectively. The callbacks are called in + * order of receipt, so the safe callback may be triggered before the + * complete callback, and vice versa. This is affected by journalling + * on the OSDs. + * + * TODO: more complete documentation of this elsewhere (in the RADOS docs?) + * + * @note Read operations only get a complete callback. 
+ * @note BUG: this should check for ENOMEM instead of throwing an exception + * + * @param cb_arg application-defined data passed to the callback functions + * @param cb_complete the function to be called when the operation is + * in memory on all replicas + * @param cb_safe the function to be called when the operation is on + * stable storage on all replicas + * @param pc where to store the completion + * @returns 0 + */ +CEPH_RADOS_API int rados_aio_create_completion(void *cb_arg, + rados_callback_t cb_complete, + rados_callback_t cb_safe, + rados_completion_t *pc); + +/** + * Block until an operation completes + * + * This means it is in memory on all replicas. + * + * @note BUG: this should be void + * + * @param c operation to wait for + * @returns 0 + */ +CEPH_RADOS_API int rados_aio_wait_for_complete(rados_completion_t c); + +/** + * Block until an operation is safe + * + * This means it is on stable storage on all replicas. + * + * @note BUG: this should be void + * + * @param c operation to wait for + * @returns 0 + */ +CEPH_RADOS_API int rados_aio_wait_for_safe(rados_completion_t c); + +/** + * Has an asynchronous operation completed? + * + * @warning This does not imply that the complete callback has + * finished + * + * @param c async operation to inspect + * @returns whether c is complete + */ +CEPH_RADOS_API int rados_aio_is_complete(rados_completion_t c); + +/** + * Is an asynchronous operation safe? + * + * @warning This does not imply that the safe callback has + * finished + * + * @param c async operation to inspect + * @returns whether c is safe + */ +CEPH_RADOS_API int rados_aio_is_safe(rados_completion_t c); + +/** + * Block until an operation completes and callback completes + * + * This means it is in memory on all replicas and can be read.
+ * + * @note BUG: this should be void + * + * @param c operation to wait for + * @returns 0 + */ +CEPH_RADOS_API int rados_aio_wait_for_complete_and_cb(rados_completion_t c); + +/** + * Block until an operation is safe and callback has completed + * + * This means it is on stable storage on all replicas. + * + * @note BUG: this should be void + * + * @param c operation to wait for + * @returns 0 + */ +CEPH_RADOS_API int rados_aio_wait_for_safe_and_cb(rados_completion_t c); + +/** + * Has an asynchronous operation and callback completed + * + * @param c async operation to inspect + * @returns whether c is complete + */ +CEPH_RADOS_API int rados_aio_is_complete_and_cb(rados_completion_t c); + +/** + * Is an asynchronous operation safe and has the callback completed + * + * @param c async operation to inspect + * @returns whether c is safe + */ +CEPH_RADOS_API int rados_aio_is_safe_and_cb(rados_completion_t c); + +/** + * Get the return value of an asynchronous operation + * + * The return value is set when the operation is complete or safe, + * whichever comes first. + * + * @pre The operation is safe or complete + * + * @note BUG: complete callback may never be called when the safe + * message is received before the complete message + * + * @param c async operation to inspect + * @returns return value of the operation + */ +CEPH_RADOS_API int rados_aio_get_return_value(rados_completion_t c); + +/** + * Get the internal object version of the target of an asynchronous operation + * + * The return value is set when the operation is complete or safe, + * whichever comes first.
+ * + * @pre The operation is safe or complete + * + * @note BUG: complete callback may never be called when the safe + * message is received before the complete message + * + * @param c async operation to inspect + * @returns version number of the asynchronous operation's target + */ +CEPH_RADOS_API uint64_t rados_aio_get_version(rados_completion_t c); + +/** + * Release a completion + * + * Call this when you no longer need the completion. It may not be + * freed immediately if the operation is not acked and committed. + * + * @param c completion to release + */ +CEPH_RADOS_API void rados_aio_release(rados_completion_t c); + +/** + * Write data to an object asynchronously + * + * Queues the write and returns. The return value of the completion + * will be 0 on success, negative error code on failure. + * + * @param io the context in which the write will occur + * @param oid name of the object + * @param completion what to do when the write is safe and complete + * @param buf data to write + * @param len length of the data, in bytes + * @param off byte offset in the object to begin writing at + * @returns 0 on success, -EROFS if the io context specifies a snap_seq + * other than LIBRADOS_SNAP_HEAD + */ +CEPH_RADOS_API int rados_aio_write(rados_ioctx_t io, const char *oid, + rados_completion_t completion, + const char *buf, size_t len, uint64_t off); + +/** + * Asynchronously append data to an object + * + * Queues the append and returns. + * + * The return value of the completion will be 0 on success, negative + * error code on failure.
+ * + * @param io the context to operate in + * @param oid the name of the object + * @param completion what to do when the append is safe and complete + * @param buf the data to append + * @param len length of buf (in bytes) + * @returns 0 on success, -EROFS if the io context specifies a snap_seq + * other than LIBRADOS_SNAP_HEAD + */ +CEPH_RADOS_API int rados_aio_append(rados_ioctx_t io, const char *oid, + rados_completion_t completion, + const char *buf, size_t len); + +/** + * Asynchronously write an entire object + * + * The object is filled with the provided data. If the object exists, + * it is atomically truncated and then written. + * Queues the write_full and returns. + * + * The return value of the completion will be 0 on success, negative + * error code on failure. + * + * @param io the io context in which the write will occur + * @param oid name of the object + * @param completion what to do when the write_full is safe and complete + * @param buf data to write + * @param len length of the data, in bytes + * @returns 0 on success, -EROFS if the io context specifies a snap_seq + * other than LIBRADOS_SNAP_HEAD + */ +CEPH_RADOS_API int rados_aio_write_full(rados_ioctx_t io, const char *oid, + rados_completion_t completion, + const char *buf, size_t len); + +/** + * Asynchronously write the same buffer multiple times + * + * Queues the writesame and returns. + * + * The return value of the completion will be 0 on success, negative + * error code on failure. 
+ * + * @param io the io context in which the write will occur + * @param oid name of the object + * @param completion what to do when the writesame is safe and complete + * @param buf data to write + * @param data_len length of the data, in bytes + * @param write_len the total number of bytes to write + * @param off byte offset in the object to begin writing at + * @returns 0 on success, -EROFS if the io context specifies a snap_seq + * other than LIBRADOS_SNAP_HEAD + */ +CEPH_RADOS_API int rados_aio_writesame(rados_ioctx_t io, const char *oid, + rados_completion_t completion, + const char *buf, size_t data_len, + size_t write_len, uint64_t off); + +/** + * Asynchronously remove an object + * + * Queues the remove and returns. + * + * The return value of the completion will be 0 on success, negative + * error code on failure. + * + * @param io the context to operate in + * @param oid the name of the object + * @param completion what to do when the remove is safe and complete + * @returns 0 on success, -EROFS if the io context specifies a snap_seq + * other than LIBRADOS_SNAP_HEAD + */ +CEPH_RADOS_API int rados_aio_remove(rados_ioctx_t io, const char *oid, + rados_completion_t completion); + +/** + * Asynchronously read data from an object + * + * The io context determines the snapshot to read from, if any was set + * by rados_ioctx_snap_set_read(). + * + * The return value of the completion will be number of bytes read on + * success, negative error code on failure. + * + * @note only the 'complete' callback of the completion will be called. 
+ * + * @param io the context in which to perform the read + * @param oid the name of the object to read from + * @param completion what to do when the read is complete + * @param buf where to store the results + * @param len the number of bytes to read + * @param off the offset to start reading from in the object + * @returns 0 on success, negative error code on failure + */ +CEPH_RADOS_API int rados_aio_read(rados_ioctx_t io, const char *oid, + rados_completion_t completion, + char *buf, size_t len, uint64_t off); + +/** + * Block until all pending writes in an io context are safe + * + * This is not equivalent to calling rados_aio_wait_for_safe() on all + * write completions, since this waits for the associated callbacks to + * complete as well. + * + * @note BUG: always returns 0, should be void or accept a timeout + * + * @param io the context to flush + * @returns 0 on success, negative error code on failure + */ +CEPH_RADOS_API int rados_aio_flush(rados_ioctx_t io); + + +/** + * Schedule a callback for when all currently pending + * aio writes are safe. This is a non-blocking version of + * rados_aio_flush(). 
+ * + * @param io the context to flush + * @param completion what to do when the writes are safe + * @returns 0 on success, negative error code on failure + */ +CEPH_RADOS_API int rados_aio_flush_async(rados_ioctx_t io, + rados_completion_t completion); + + +/** + * Asynchronously get object stats (size/mtime) + * + * @param io ioctx + * @param o object name + * @param completion what to do when the stat is complete + * @param psize where to store object size + * @param pmtime where to store modification time + * @returns 0 on success, negative error code on failure + */ +CEPH_RADOS_API int rados_aio_stat(rados_ioctx_t io, const char *o, + rados_completion_t completion, + uint64_t *psize, time_t *pmtime); + +/** + * Asynchronously compare an on-disk object range with a buffer + * + * @param io the context in which to perform the comparison + * @param o the name of the object to compare with + * @param completion what to do when the comparison is complete + * @param cmp_buf buffer containing bytes to be compared with object contents + * @param cmp_len length to compare and size of @c cmp_buf in bytes + * @param off object byte offset at which to start the comparison + * @returns 0 on success, negative error code on failure, + * (-MAX_ERRNO - mismatch_off) on mismatch + */ +CEPH_RADOS_API int rados_aio_cmpext(rados_ioctx_t io, const char *o, + rados_completion_t completion, + const char *cmp_buf, + size_t cmp_len, + uint64_t off); + +/** + * Cancel async operation + * + * @param io ioctx + * @param completion completion handle + * @returns 0 on success, negative error code on failure + */ +CEPH_RADOS_API int rados_aio_cancel(rados_ioctx_t io, + rados_completion_t completion); + +/** + * Asynchronously execute an OSD class method on an object + * + * The OSD has a plugin mechanism for performing complicated + * operations on an object atomically. These plugins are called + * classes. This function allows librados users to call the custom + * methods. 
The input and output formats are defined by the class. + * Classes in ceph.git can be found in src/cls subdirectories + * + * @param io the context in which to call the method + * @param o name of the object + * @param completion what to do when the exec completes + * @param cls the name of the class + * @param method the name of the method + * @param in_buf where to find input + * @param in_len length of in_buf in bytes + * @param buf where to store output + * @param out_len length of buf in bytes + * @returns 0 on success, negative error code on failure + */ +CEPH_RADOS_API int rados_aio_exec(rados_ioctx_t io, const char *o, + rados_completion_t completion, + const char *cls, const char *method, + const char *in_buf, size_t in_len, + char *buf, size_t out_len); + +/** @} Asynchronous I/O */ + +/** + * @name Asynchronous Xattrs + * Extended attributes are stored as extended attributes on the files + * representing an object on the OSDs. Thus, they have the same + * limitations as the underlying filesystem. On ext4, this means that + * the total data stored in xattrs cannot exceed 4KB. + * + * @{ + */ + +/** + * Asynchronously get the value of an extended attribute on an object. + * + * @param io the context in which the attribute is read + * @param o name of the object + * @param completion what to do when the getxattr completes + * @param name which extended attribute to read + * @param buf where to store the result + * @param len size of buf in bytes + * @returns length of xattr value on success, negative error code on failure + */ +CEPH_RADOS_API int rados_aio_getxattr(rados_ioctx_t io, const char *o, + rados_completion_t completion, + const char *name, char *buf, size_t len); + +/** + * Asynchronously set an extended attribute on an object. 
+ * + * @param io the context in which xattr is set + * @param o name of the object + * @param completion what to do when the setxattr completes + * @param name which extended attribute to set + * @param buf what to store in the xattr + * @param len the number of bytes in buf + * @returns 0 on success, negative error code on failure + */ +CEPH_RADOS_API int rados_aio_setxattr(rados_ioctx_t io, const char *o, + rados_completion_t completion, + const char *name, const char *buf, + size_t len); + +/** + * Asynchronously delete an extended attribute from an object. + * + * @param io the context in which to delete the xattr + * @param o the name of the object + * @param completion what to do when the rmxattr completes + * @param name which xattr to delete + * @returns 0 on success, negative error code on failure + */ +CEPH_RADOS_API int rados_aio_rmxattr(rados_ioctx_t io, const char *o, + rados_completion_t completion, + const char *name); + +/** + * Asynchronously start iterating over xattrs on an object. + * + * @post iter is a valid iterator + * + * @param io the context in which to list xattrs + * @param oid name of the object + * @param completion what to do when the getxattrs completes + * @param iter where to store the iterator + * @returns 0 on success, negative error code on failure + */ +CEPH_RADOS_API int rados_aio_getxattrs(rados_ioctx_t io, const char *oid, + rados_completion_t completion, + rados_xattrs_iter_t *iter); + +/** @} Asynchronous Xattrs */ + +/** + * @name Watch/Notify + * + * Watch/notify is a protocol to help communicate among clients. It + * can be used to synchronize client state. All that's needed is a + * well-known object name (for example, rbd uses the header object of + * an image). + * + * Watchers register an interest in an object, and receive all + * notifies on that object. A notify attempts to communicate with all + * clients watching an object, and blocks on the notifier until each + * client responds or a timeout is reached.
+ * + * See rados_watch() and rados_notify() for more details. + * + * @{ + */ + +/** + * @typedef rados_watchcb_t + * + * Callback activated when a notify is received on a watched + * object. + * + * @param opcode undefined + * @param ver version of the watched object + * @param arg application-specific data + * + * @note BUG: opcode is an internal detail that shouldn't be exposed + * @note BUG: ver is unused + */ +typedef void (*rados_watchcb_t)(uint8_t opcode, uint64_t ver, void *arg); + +/** + * @typedef rados_watchcb2_t + * + * Callback activated when a notify is received on a watched + * object. + * + * @param arg opaque user-defined value provided to rados_watch2() + * @param notify_id an id for this notify event + * @param handle the watcher handle we are notifying + * @param notifier_id the unique client id for the notifier + * @param data payload from the notifier + * @param data_len length of payload buffer + */ +typedef void (*rados_watchcb2_t)(void *arg, + uint64_t notify_id, + uint64_t handle, + uint64_t notifier_id, + void *data, + size_t data_len); + +/** + * @typedef rados_watcherrcb_t + * + * Callback activated when we encounter an error with the watch session. + * This can happen when the location of the objects moves within the + * cluster and we fail to register our watch with the new object location, + * or when our connection with the object OSD is otherwise interrupted and + * we may have missed notify events. + * + * @param pre opaque user-defined value provided to rados_watch2() + * @param cookie the internal id assigned to the watch session + * @param err error code + */ + typedef void (*rados_watcherrcb_t)(void *pre, uint64_t cookie, int err); + +/** + * Register an interest in an object + * + * A watch operation registers the client as being interested in + * notifications on an object. OSDs keep track of watches on + * persistent storage, so they are preserved across cluster changes by + * the normal recovery process.
If the client loses its connection to + * the primary OSD for a watched object, the watch will be removed + * after 30 seconds. Watches are automatically reestablished when a new + * connection is made, or a placement group switches OSDs. + * + * @note BUG: librados should provide a way for watchers to notice connection resets + * @note BUG: the ver parameter does not work, and -ERANGE will never be returned + * (See URL tracker.ceph.com/issues/2592) + * + * @param io the pool the object is in + * @param o the object to watch + * @param ver expected version of the object + * @param cookie where to store the internal id assigned to this watch + * @param watchcb what to do when a notify is received on this object + * @param arg application defined data to pass when watchcb is called + * @returns 0 on success, negative error code on failure + * @returns -ERANGE if the version of the object is greater than ver + */ +CEPH_RADOS_API int rados_watch(rados_ioctx_t io, const char *o, uint64_t ver, + uint64_t *cookie, + rados_watchcb_t watchcb, void *arg) + __attribute__((deprecated)); + + +/** + * Register an interest in an object + * + * A watch operation registers the client as being interested in + * notifications on an object. OSDs keep track of watches on + * persistent storage, so they are preserved across cluster changes by + * the normal recovery process. If the client loses its connection to the + * primary OSD for a watched object, the watch will be removed after + * a timeout configured with osd_client_watch_timeout. + * Watches are automatically reestablished when a new + * connection is made, or a placement group switches OSDs. 
+ * + * @param io the pool the object is in + * @param o the object to watch + * @param cookie where to store the internal id assigned to this watch + * @param watchcb what to do when a notify is received on this object + * @param watcherrcb what to do when the watch session encounters an error + * @param arg opaque value to pass to the callback + * @returns 0 on success, negative error code on failure + */ +CEPH_RADOS_API int rados_watch2(rados_ioctx_t io, const char *o, uint64_t *cookie, + rados_watchcb2_t watchcb, + rados_watcherrcb_t watcherrcb, + void *arg); + +/** + * Register an interest in an object + * + * A watch operation registers the client as being interested in + * notifications on an object. OSDs keep track of watches on + * persistent storage, so they are preserved across cluster changes by + * the normal recovery process. Watches are automatically reestablished when a new + * connection is made, or a placement group switches OSDs. + * + * @param io the pool the object is in + * @param o the object to watch + * @param cookie where to store the internal id assigned to this watch + * @param watchcb what to do when a notify is received on this object + * @param watcherrcb what to do when the watch session encounters an error + * @param timeout how many seconds the connection will keep after disconnection + * @param arg opaque value to pass to the callback + * @returns 0 on success, negative error code on failure + */ +CEPH_RADOS_API int rados_watch3(rados_ioctx_t io, const char *o, uint64_t *cookie, + rados_watchcb2_t watchcb, + rados_watcherrcb_t watcherrcb, + uint32_t timeout, + void *arg); + +/** + * Asynchronously register an interest in an object + * + * A watch operation registers the client as being interested in + * notifications on an object. OSDs keep track of watches on + * persistent storage, so they are preserved across cluster changes by + * the normal recovery process.
If the client loses its connection to + * the primary OSD for a watched object, the watch will be removed + * after 30 seconds. Watches are automatically reestablished when a new + * connection is made, or a placement group switches OSDs. + * + * @param io the pool the object is in + * @param o the object to watch + * @param completion what to do when operation has been attempted + * @param handle where to store the internal id assigned to this watch + * @param watchcb what to do when a notify is received on this object + * @param watcherrcb what to do when the watch session encounters an error + * @param arg opaque value to pass to the callback + * @returns 0 on success, negative error code on failure + */ +CEPH_RADOS_API int rados_aio_watch(rados_ioctx_t io, const char *o, + rados_completion_t completion, uint64_t *handle, + rados_watchcb2_t watchcb, + rados_watcherrcb_t watcherrcb, + void *arg); + +/** + * Asynchronously register an interest in an object + * + * A watch operation registers the client as being interested in + * notifications on an object. OSDs keep track of watches on + * persistent storage, so they are preserved across cluster changes by + * the normal recovery process. If the client loses its connection to + * the primary OSD for a watched object, the watch will be removed + * after the number of seconds configured in the timeout parameter. + * Watches are automatically reestablished when a new + * connection is made, or a placement group switches OSDs.
+ * + * @param io the pool the object is in + * @param o the object to watch + * @param completion what to do when operation has been attempted + * @param handle where to store the internal id assigned to this watch + * @param watchcb what to do when a notify is received on this object + * @param watcherrcb what to do when the watch session encounters an error + * @param timeout how many seconds the connection will keep after disconnection + * @param arg opaque value to pass to the callback + * @returns 0 on success, negative error code on failure + */ +CEPH_RADOS_API int rados_aio_watch2(rados_ioctx_t io, const char *o, + rados_completion_t completion, uint64_t *handle, + rados_watchcb2_t watchcb, + rados_watcherrcb_t watcherrcb, + uint32_t timeout, + void *arg); + +/** + * Check on the status of a watch + * + * Return the number of milliseconds since the watch was last confirmed. + * Or, if there has been an error, return that. + * + * If there is an error, the watch is no longer valid, and should be + * destroyed with rados_unwatch2(). If the user is still interested + * in the object, a new watch should be created with rados_watch2(). + * + * @param io the pool the object is in + * @param cookie the watch handle + * @returns ms since last confirmed on success, negative error code on failure + */ +CEPH_RADOS_API int rados_watch_check(rados_ioctx_t io, uint64_t cookie); + +/** + * Unregister an interest in an object + * + * Once this completes, no more notifies will be sent to us for this + * watch. This should be called to clean up unneeded watchers.
+ * + * @param io the pool the object is in + * @param o the name of the watched object (ignored) + * @param cookie which watch to unregister + * @returns 0 on success, negative error code on failure + */ +CEPH_RADOS_API int rados_unwatch(rados_ioctx_t io, const char *o, uint64_t cookie) + __attribute__((deprecated)); + +/** + * Unregister an interest in an object + * + * Once this completes, no more notifies will be sent to us for this + * watch. This should be called to clean up unneeded watchers. + * + * @param io the pool the object is in + * @param cookie which watch to unregister + * @returns 0 on success, negative error code on failure + */ +CEPH_RADOS_API int rados_unwatch2(rados_ioctx_t io, uint64_t cookie); + +/** + * Asynchronously unregister an interest in an object + * + * Once this completes, no more notifies will be sent to us for this + * watch. This should be called to clean up unneeded watchers. + * + * @param io the pool the object is in + * @param completion what to do when operation has been attempted + * @param cookie which watch to unregister + * @returns 0 on success, negative error code on failure + */ +CEPH_RADOS_API int rados_aio_unwatch(rados_ioctx_t io, uint64_t cookie, + rados_completion_t completion); + +/** + * Synchronously notify watchers of an object + * + * This blocks until all watchers of the object have received and + * reacted to the notify, or a timeout is reached.
+ * + * @note BUG: the timeout is not changeable via the C API + * @note BUG: the bufferlist is inaccessible in a rados_watchcb_t + * + * @param io the pool the object is in + * @param o the name of the object + * @param ver obsolete - just pass zero + * @param buf data to send to watchers + * @param buf_len length of buf in bytes + * @returns 0 on success, negative error code on failure + */ +CEPH_RADOS_API int rados_notify(rados_ioctx_t io, const char *o, uint64_t ver, + const char *buf, int buf_len) + __attribute__((deprecated)); + +/** + * Synchronously notify watchers of an object + * + * This blocks until all watchers of the object have received and + * reacted to the notify, or a timeout is reached. + * + * The reply buffer is optional. If specified, the client will get + * back an encoded buffer that includes the ids of the clients that + * acknowledged the notify as well as their notify ack payloads (if + * any). Clients that timed out are not included. Even clients that + * do not include a notify ack payload are included in the list but + * have a 0-length payload associated with them. The format: + * + * le32 num_acks + * { + * le64 gid global id for the client (for client.1234 that's 1234) + * le64 cookie cookie for the client + * le32 buflen length of reply message buffer + * u8 * buflen payload + * } * num_acks + * le32 num_timeouts + * { + * le64 gid global id for the client + * le64 cookie cookie for the client + * } * num_timeouts + * + * Note: There may be multiple instances of the same gid if there are + * multiple watchers registered via the same client. + * + * Note: The buffer must be released with rados_buffer_free() when the + * user is done with it. + * + * Note: Since the result buffer includes clients that time out, it + * will be set even when rados_notify() returns an error code (like + * -ETIMEDOUT).
+ * + * @param io the pool the object is in + * @param completion what to do when operation has been attempted + * @param o the name of the object + * @param buf data to send to watchers + * @param buf_len length of buf in bytes + * @param timeout_ms notify timeout (in ms) + * @param reply_buffer pointer to reply buffer pointer (free with rados_buffer_free) + * @param reply_buffer_len pointer to size of reply buffer + * @returns 0 on success, negative error code on failure + */ +CEPH_RADOS_API int rados_aio_notify(rados_ioctx_t io, const char *o, + rados_completion_t completion, + const char *buf, int buf_len, + uint64_t timeout_ms, char **reply_buffer, + size_t *reply_buffer_len); +CEPH_RADOS_API int rados_notify2(rados_ioctx_t io, const char *o, + const char *buf, int buf_len, + uint64_t timeout_ms, + char **reply_buffer, size_t *reply_buffer_len); + +/** + * Acknowledge receipt of a notify + * + * @param io the pool the object is in + * @param o the name of the object + * @param notify_id the notify_id we got on the watchcb2_t callback + * @param cookie the watcher handle + * @param buf payload to return to notifier (optional) + * @param buf_len payload length + * @returns 0 on success + */ +CEPH_RADOS_API int rados_notify_ack(rados_ioctx_t io, const char *o, + uint64_t notify_id, uint64_t cookie, + const char *buf, int buf_len); + +/** + * Flush watch/notify callbacks + * + * This call will block until all pending watch/notify callbacks have + * been executed and the queue is empty. It should usually be called + * after shutting down any watches before shutting down the ioctx or + * librados to ensure that any callbacks do not misuse the ioctx (for + * example by calling rados_notify_ack after the ioctx has been + * destroyed).
+ * + * @param cluster the cluster handle + */ +CEPH_RADOS_API int rados_watch_flush(rados_t cluster); +/** + * Flush watch/notify callbacks + * + * This call is non-blocking, and the completion will be called + * once all pending watch/notify callbacks have been executed and + * the queue is empty. It should usually be called after shutting + * down any watches before shutting down the ioctx or + * librados to ensure that any callbacks do not misuse the ioctx (for + * example by calling rados_notify_ack after the ioctx has been + * destroyed). + * + * @param cluster the cluster handle + * @param completion what to do when operation has been attempted + */ +CEPH_RADOS_API int rados_aio_watch_flush(rados_t cluster, rados_completion_t completion); + +/** @} Watch/Notify */ + +/** + * Pin an object in the cache tier + * + * When an object is pinned in the cache tier, it stays in the cache + * tier, and won't be flushed out. + * + * @param io the pool the object is in + * @param o the object id + * @returns 0 on success, negative error code on failure + */ +CEPH_RADOS_API int rados_cache_pin(rados_ioctx_t io, const char *o); + +/** + * Unpin an object in the cache tier + * + * After an object is unpinned in the cache tier, it can be flushed out + * + * @param io the pool the object is in + * @param o the object id + * @returns 0 on success, negative error code on failure + */ +CEPH_RADOS_API int rados_cache_unpin(rados_ioctx_t io, const char *o); + +/** + * @name Hints + * + * @{ + */ + +/** + * Set allocation hint for an object + * + * This is an advisory operation, it will always succeed (as if it was + * submitted with a LIBRADOS_OP_FLAG_FAILOK flag set) and is not + * guaranteed to do anything on the backend.
+ * + * @param io the pool the object is in + * @param o the name of the object + * @param expected_object_size expected size of the object, in bytes + * @param expected_write_size expected size of writes to the object, in bytes + * @returns 0 on success, negative error code on failure + */ +CEPH_RADOS_API int rados_set_alloc_hint(rados_ioctx_t io, const char *o, + uint64_t expected_object_size, + uint64_t expected_write_size); + +/** + * Set allocation hint for an object + * + * This is an advisory operation, it will always succeed (as if it was + * submitted with a LIBRADOS_OP_FLAG_FAILOK flag set) and is not + * guaranteed to do anything on the backend. + * + * @param io the pool the object is in + * @param o the name of the object + * @param expected_object_size expected size of the object, in bytes + * @param expected_write_size expected size of writes to the object, in bytes + * @param flags hints about future IO patterns + * @returns 0 on success, negative error code on failure + */ +CEPH_RADOS_API int rados_set_alloc_hint2(rados_ioctx_t io, const char *o, + uint64_t expected_object_size, + uint64_t expected_write_size, + uint32_t flags); + +/** @} Hints */ + +/** + * @name Object Operations + * + * A single rados operation can do multiple operations on one object + * atomically. The whole operation will succeed or fail, and no partial + * results will be visible. + * + * Operations may be either reads, which can return data, or writes, + * which cannot. The effects of writes are applied and visible all at + * once, so an operation that sets an xattr and then checks its value + * will not see the updated value. + * + * @{ + */ + +/** + * Create a new rados_write_op_t write operation. This will store all actions + * to be performed atomically. You must call rados_release_write_op when you are + * finished with it. + * + * @returns non-NULL on success, NULL on memory allocation error. 
+ */ +CEPH_RADOS_API rados_write_op_t rados_create_write_op(void); + +/** + * Free a rados_write_op_t, must be called when you're done with it. + * @param write_op operation to deallocate, created with rados_create_write_op + */ +CEPH_RADOS_API void rados_release_write_op(rados_write_op_t write_op); + +/** + * Set flags for the last operation added to this write_op. + * At least one op must have been added to the write_op. + * @param write_op operation to add this action to + * @param flags see librados.h constants beginning with LIBRADOS_OP_FLAG + */ +CEPH_RADOS_API void rados_write_op_set_flags(rados_write_op_t write_op, + int flags); + +/** + * Ensure that the object exists before writing + * @param write_op operation to add this action to + */ +CEPH_RADOS_API void rados_write_op_assert_exists(rados_write_op_t write_op); + +/** + * Ensure that the object exists and that its internal version + * number is equal to "ver" before writing. "ver" should be a + * version number previously obtained with rados_get_last_version(). + * - If the object's version is greater than the asserted version + * then rados_write_op_operate will return -ERANGE instead of + * executing the op. + * - If the object's version is less than the asserted version + * then rados_write_op_operate will return -EOVERFLOW instead + * of executing the op. + * @param write_op operation to add this action to + * @param ver object version number + */ +CEPH_RADOS_API void rados_write_op_assert_version(rados_write_op_t write_op, uint64_t ver); + +/** + * Ensure that given object range (extent) satisfies comparison. 
+ * + * @param write_op operation to add this action to + * @param cmp_buf buffer containing bytes to be compared with object contents + * @param cmp_len length to compare and size of @c cmp_buf in bytes + * @param off object byte offset at which to start the comparison + * @param prval returned result of comparison, 0 on success, negative error code + * on failure, (-MAX_ERRNO - mismatch_off) on mismatch + */ +CEPH_RADOS_API void rados_write_op_cmpext(rados_write_op_t write_op, + const char *cmp_buf, + size_t cmp_len, + uint64_t off, + int *prval); + +/** + * Ensure that the given xattr satisfies a comparison. + * If the comparison is not satisfied, the return code of the + * operation will be -ECANCELED + * @param write_op operation to add this action to + * @param name name of the xattr to look up + * @param comparison_operator currently undocumented, look for + * LIBRADOS_CMPXATTR_OP_EQ in librados.h + * @param value buffer to compare actual xattr value to + * @param value_len length of buffer to compare actual xattr value to + */ +CEPH_RADOS_API void rados_write_op_cmpxattr(rados_write_op_t write_op, + const char *name, + uint8_t comparison_operator, + const char *value, + size_t value_len); + +/** + * Ensure that an omap value satisfies a comparison, + * with the supplied value on the right hand side (i.e. + * for OP_LT, the comparison is actual_value < value).
+ * + * @param write_op operation to add this action to + * @param key which omap value to compare + * @param comparison_operator one of LIBRADOS_CMPXATTR_OP_EQ, + LIBRADOS_CMPXATTR_OP_LT, or LIBRADOS_CMPXATTR_OP_GT + * @param val value to compare with + * @param val_len length of value in bytes + * @param prval where to store the return value from this action + */ +CEPH_RADOS_API void rados_write_op_omap_cmp(rados_write_op_t write_op, + const char *key, + uint8_t comparison_operator, + const char *val, + size_t val_len, + int *prval); + +/** + * Ensure that an omap value satisfies a comparison, + * with the supplied value on the right hand side (i.e. + * for OP_LT, the comparison is actual_value < value). + * + * @param write_op operation to add this action to + * @param key which omap value to compare + * @param comparison_operator one of LIBRADOS_CMPXATTR_OP_EQ, + LIBRADOS_CMPXATTR_OP_LT, or LIBRADOS_CMPXATTR_OP_GT + * @param val value to compare with + * @param key_len length of key in bytes + * @param val_len length of value in bytes + * @param prval where to store the return value from this action + */ +CEPH_RADOS_API void rados_write_op_omap_cmp2(rados_write_op_t write_op, + const char *key, + uint8_t comparison_operator, + const char *val, + size_t key_len, + size_t val_len, + int *prval); + +/** + * Set an xattr + * @param write_op operation to add this action to + * @param name name of the xattr + * @param value buffer to set xattr to + * @param value_len length of buffer to set xattr to + */ +CEPH_RADOS_API void rados_write_op_setxattr(rados_write_op_t write_op, + const char *name, + const char *value, + size_t value_len); + +/** + * Remove an xattr + * @param write_op operation to add this action to + * @param name name of the xattr to remove + */ +CEPH_RADOS_API void rados_write_op_rmxattr(rados_write_op_t write_op, + const char *name); + +/** + * Create the object + * @param write_op operation to add this action to + * @param exclusive set to
either LIBRADOS_CREATE_EXCLUSIVE or + LIBRADOS_CREATE_IDEMPOTENT + * will error if the object already exists. + * @param category category string (DEPRECATED, HAS NO EFFECT) + */ +CEPH_RADOS_API void rados_write_op_create(rados_write_op_t write_op, + int exclusive, + const char* category); + +/** + * Write to offset + * @param write_op operation to add this action to + * @param offset offset to write to + * @param buffer bytes to write + * @param len length of buffer + */ +CEPH_RADOS_API void rados_write_op_write(rados_write_op_t write_op, + const char *buffer, + size_t len, + uint64_t offset); + +/** + * Write whole object, atomically replacing it. + * @param write_op operation to add this action to + * @param buffer bytes to write + * @param len length of buffer + */ +CEPH_RADOS_API void rados_write_op_write_full(rados_write_op_t write_op, + const char *buffer, + size_t len); + +/** + * Write the same buffer multiple times + * @param write_op operation to add this action to + * @param buffer bytes to write + * @param data_len length of buffer + * @param write_len total number of bytes to write, as a multiple of @c data_len + * @param offset offset to write to + */ +CEPH_RADOS_API void rados_write_op_writesame(rados_write_op_t write_op, + const char *buffer, + size_t data_len, + size_t write_len, + uint64_t offset); + +/** + * Append to end of object. 
+ * @param write_op operation to add this action to + * @param buffer bytes to write + * @param len length of buffer + */ +CEPH_RADOS_API void rados_write_op_append(rados_write_op_t write_op, + const char *buffer, + size_t len); +/** + * Remove object + * @param write_op operation to add this action to + */ +CEPH_RADOS_API void rados_write_op_remove(rados_write_op_t write_op); + +/** + * Truncate an object + * @param write_op operation to add this action to + * @param offset Offset to truncate to + */ +CEPH_RADOS_API void rados_write_op_truncate(rados_write_op_t write_op, + uint64_t offset); + +/** + * Zero part of an object + * @param write_op operation to add this action to + * @param offset Offset to zero + * @param len length to zero + */ +CEPH_RADOS_API void rados_write_op_zero(rados_write_op_t write_op, + uint64_t offset, + uint64_t len); + +/** + * Execute an OSD class method on an object + * See rados_exec() for general description. + * + * @param write_op operation to add this action to + * @param cls the name of the class + * @param method the name of the method + * @param in_buf where to find input + * @param in_len length of in_buf in bytes + * @param prval where to store the return value from the method + */ +CEPH_RADOS_API void rados_write_op_exec(rados_write_op_t write_op, + const char *cls, + const char *method, + const char *in_buf, + size_t in_len, + int *prval); + +/** + * Set key/value pairs on an object + * + * @param write_op operation to add this action to + * @param keys array of null-terminated char arrays representing keys to set + * @param vals array of pointers to values to set + * @param lens array of lengths corresponding to each value + * @param num number of key/value pairs to set + */ +CEPH_RADOS_API void rados_write_op_omap_set(rados_write_op_t write_op, + char const* const* keys, + char const* const* vals, + const size_t *lens, + size_t num); + +/** + * Set key/value pairs on an object + * + * @param write_op operation to add this 
action to + * @param keys array of null-terminated char arrays representing keys to set + * @param vals array of pointers to values to set + * @param key_lens array of lengths corresponding to each key + * @param val_lens array of lengths corresponding to each value + * @param num number of key/value pairs to set + */ +CEPH_RADOS_API void rados_write_op_omap_set2(rados_write_op_t write_op, + char const* const* keys, + char const* const* vals, + const size_t *key_lens, + const size_t *val_lens, + size_t num); + +/** + * Remove key/value pairs from an object + * + * @param write_op operation to add this action to + * @param keys array of null-terminated char arrays representing keys to remove + * @param keys_len number of key/value pairs to remove + */ +CEPH_RADOS_API void rados_write_op_omap_rm_keys(rados_write_op_t write_op, + char const* const* keys, + size_t keys_len); + +/** + * Remove key/value pairs from an object + * + * @param write_op operation to add this action to + * @param keys array of char arrays representing keys to remove + * @param key_lens array of size_t values representing length of each key + * @param keys_len number of key/value pairs to remove + */ +CEPH_RADOS_API void rados_write_op_omap_rm_keys2(rados_write_op_t write_op, + char const* const* keys, + const size_t* key_lens, + size_t keys_len); + +/** + * Remove all key/value pairs from an object + * + * @param write_op operation to add this action to + */ +CEPH_RADOS_API void rados_write_op_omap_clear(rados_write_op_t write_op); + +/** + * Set allocation hint for an object + * + * @param write_op operation to add this action to + * @param expected_object_size expected size of the object, in bytes + * @param expected_write_size expected size of writes to the object, in bytes + */ +CEPH_RADOS_API void rados_write_op_set_alloc_hint(rados_write_op_t write_op, + uint64_t expected_object_size, + uint64_t expected_write_size); + +/** + * Set allocation hint for an object + * + * @param write_op 
operation to add this action to + * @param expected_object_size expected size of the object, in bytes + * @param expected_write_size expected size of writes to the object, in bytes + * @param flags hints about future IO patterns + */ +CEPH_RADOS_API void rados_write_op_set_alloc_hint2(rados_write_op_t write_op, + uint64_t expected_object_size, + uint64_t expected_write_size, + uint32_t flags); + +/** + * Perform a write operation synchronously + * @param write_op operation to perform + * @param io the ioctx that the object is in + * @param oid the object id + * @param mtime the time to set the mtime to, NULL for the current time + * @param flags flags to apply to the entire operation (LIBRADOS_OPERATION_*) + */ +CEPH_RADOS_API int rados_write_op_operate(rados_write_op_t write_op, + rados_ioctx_t io, + const char *oid, + time_t *mtime, + int flags); +/** + * Perform a write operation synchronously (mtime given as a struct timespec) + * @param write_op operation to perform + * @param io the ioctx that the object is in + * @param oid the object id + * @param mtime the time to set the mtime to, NULL for the current time + * @param flags flags to apply to the entire operation (LIBRADOS_OPERATION_*) + */ + +CEPH_RADOS_API int rados_write_op_operate2(rados_write_op_t write_op, + rados_ioctx_t io, + const char *oid, + struct timespec *mtime, + int flags); + +/** + * Perform a write operation asynchronously + * @param write_op operation to perform + * @param io the ioctx that the object is in + * @param completion what to do when operation has been attempted + * @param oid the object id + * @param mtime the time to set the mtime to, NULL for the current time + * @param flags flags to apply to the entire operation (LIBRADOS_OPERATION_*) + */ +CEPH_RADOS_API int rados_aio_write_op_operate(rados_write_op_t write_op, + rados_ioctx_t io, + rados_completion_t completion, + const char *oid, + time_t *mtime, + int flags); + +/** + * Create a new rados_read_op_t read operation.
This will store all + * actions to be performed atomically. You must call + * rados_release_read_op when you are finished with it (after it + * completes, or you decide not to send it in the first place). + * + * @returns non-NULL on success, NULL on memory allocation error. + */ +CEPH_RADOS_API rados_read_op_t rados_create_read_op(void); + +/** + * Free a rados_read_op_t, must be called when you're done with it. + * @param read_op operation to deallocate, created with rados_create_read_op + */ +CEPH_RADOS_API void rados_release_read_op(rados_read_op_t read_op); + +/** + * Set flags for the last operation added to this read_op. + * At least one op must have been added to the read_op. + * @param read_op operation to add this action to + * @param flags see librados.h constants beginning with LIBRADOS_OP_FLAG + */ +CEPH_RADOS_API void rados_read_op_set_flags(rados_read_op_t read_op, int flags); + +/** + * Ensure that the object exists before reading + * @param read_op operation to add this action to + */ +CEPH_RADOS_API void rados_read_op_assert_exists(rados_read_op_t read_op); + +/** + * Ensure that the object exists and that its internal version + * number is equal to "ver" before reading. "ver" should be a + * version number previously obtained with rados_get_last_version(). + * - If the object's version is greater than the asserted version + * then rados_read_op_operate will return -ERANGE instead of + * executing the op. + * - If the object's version is less than the asserted version + * then rados_read_op_operate will return -EOVERFLOW instead + * of executing the op. + * @param read_op operation to add this action to + * @param ver object version number + */ +CEPH_RADOS_API void rados_read_op_assert_version(rados_read_op_t read_op, uint64_t ver); + +/** + * Ensure that given object range (extent) satisfies comparison. 
+ * + * @param read_op operation to add this action to + * @param cmp_buf buffer containing bytes to be compared with object contents + * @param cmp_len length to compare and size of @c cmp_buf in bytes + * @param off object byte offset at which to start the comparison + * @param prval returned result of comparison, 0 on success, negative error code + * on failure, (-MAX_ERRNO - mismatch_off) on mismatch + */ +CEPH_RADOS_API void rados_read_op_cmpext(rados_read_op_t read_op, + const char *cmp_buf, + size_t cmp_len, + uint64_t off, + int *prval); + +/** + * Ensure that an xattr satisfies a comparison. + * If the comparison is not satisfied, the return code of the + * operation will be -ECANCELED + * @param read_op operation to add this action to + * @param name name of the xattr to look up + * @param comparison_operator currently undocumented, look for + * LIBRADOS_CMPXATTR_OP_EQ in librados.h + * @param value buffer to compare actual xattr value to + * @param value_len length of buffer to compare actual xattr value to + */ +CEPH_RADOS_API void rados_read_op_cmpxattr(rados_read_op_t read_op, + const char *name, + uint8_t comparison_operator, + const char *value, + size_t value_len); + +/** + * Start iterating over xattrs on an object. + * + * @param read_op operation to add this action to + * @param iter where to store the iterator + * @param prval where to store the return value of this action + */ +CEPH_RADOS_API void rados_read_op_getxattrs(rados_read_op_t read_op, + rados_xattrs_iter_t *iter, + int *prval); + +/** + * Ensure that an omap value satisfies a comparison, + * with the supplied value on the right hand side (i.e. + * for OP_LT, the comparison is actual_value < value).
+ * + * @param read_op operation to add this action to + * @param key which omap value to compare + * @param comparison_operator one of LIBRADOS_CMPXATTR_OP_EQ, + LIBRADOS_CMPXATTR_OP_LT, or LIBRADOS_CMPXATTR_OP_GT + * @param val value to compare with + * @param val_len length of value in bytes + * @param prval where to store the return value from this action + */ +CEPH_RADOS_API void rados_read_op_omap_cmp(rados_read_op_t read_op, + const char *key, + uint8_t comparison_operator, + const char *val, + size_t val_len, + int *prval); + +/** + * Ensure that an omap value satisfies a comparison, + * with the supplied value on the right hand side (i.e. + * for OP_LT, the comparison is actual_value < value). + * + * @param read_op operation to add this action to + * @param key which omap value to compare + * @param comparison_operator one of LIBRADOS_CMPXATTR_OP_EQ, + LIBRADOS_CMPXATTR_OP_LT, or LIBRADOS_CMPXATTR_OP_GT + * @param val value to compare with + * @param key_len length of key in bytes + * @param val_len length of value in bytes + * @param prval where to store the return value from this action + */ +CEPH_RADOS_API void rados_read_op_omap_cmp2(rados_read_op_t read_op, + const char *key, + uint8_t comparison_operator, + const char *val, + size_t key_len, + size_t val_len, + int *prval); + +/** + * Get object size and mtime + * @param read_op operation to add this action to + * @param psize where to store object size + * @param pmtime where to store modification time + * @param prval where to store the return value of this action + */ +CEPH_RADOS_API void rados_read_op_stat(rados_read_op_t read_op, + uint64_t *psize, + time_t *pmtime, + int *prval); + +/** + * Read bytes from offset into buffer. + * + * bytes_read will be filled with the number of bytes read if successful. + * A short read can only occur if the read reaches the end of the + * object.
+ * + * @param read_op operation to add this action to + * @param offset offset to read from + * @param len length of buffer + * @param buffer where to put the data + * @param bytes_read where to store the number of bytes read by this action + * @param prval where to store the return value of this action + */ +CEPH_RADOS_API void rados_read_op_read(rados_read_op_t read_op, + uint64_t offset, + size_t len, + char *buffer, + size_t *bytes_read, + int *prval); + +/** + * Compute checksum from object data + * + * @param read_op operation to add this action to + * @param type the checksum algorithm to utilize + * @param init_value the init value for the algorithm + * @param init_value_len the length of the init value + * @param offset the offset to start checksumming in the object + * @param len the number of bytes to checksum + * @param chunk_size optional length-aligned chunk size for checksums + * @param pchecksum where to store the checksum result for this action + * @param checksum_len the number of bytes available for the result + * @param prval where to store the return value for this action + */ +CEPH_RADOS_API void rados_read_op_checksum(rados_read_op_t read_op, + rados_checksum_type_t type, + const char *init_value, + size_t init_value_len, + uint64_t offset, size_t len, + size_t chunk_size, char *pchecksum, + size_t checksum_len, int *prval); + +/** + * Execute an OSD class method on an object + * See rados_exec() for general description. + * + * The output buffer is allocated on the heap; the caller is + * expected to release that memory with rados_buffer_free(). The + * buffer and length pointers can all be NULL, in which case they are + * not filled in. 
+ * + * @param read_op operation to add this action to + * @param cls the name of the class + * @param method the name of the method + * @param in_buf where to find input + * @param in_len length of in_buf in bytes + * @param out_buf where to put librados-allocated output buffer + * @param out_len length of out_buf in bytes + * @param prval where to store the return value from the method + */ +CEPH_RADOS_API void rados_read_op_exec(rados_read_op_t read_op, + const char *cls, + const char *method, + const char *in_buf, + size_t in_len, + char **out_buf, + size_t *out_len, + int *prval); + +/** + * Execute an OSD class method on an object + * See rados_exec() for general description. + * + * If the output buffer is too small, prval will + * be set to -ERANGE and used_len will be 0. + * + * @param read_op operation to add this action to + * @param cls the name of the class + * @param method the name of the method + * @param in_buf where to find input + * @param in_len length of in_buf in bytes + * @param out_buf user-provided buffer to read into + * @param out_len length of out_buf in bytes + * @param used_len where to store the number of bytes read into out_buf + * @param prval where to store the return value from the method + */ +CEPH_RADOS_API void rados_read_op_exec_user_buf(rados_read_op_t read_op, + const char *cls, + const char *method, + const char *in_buf, + size_t in_len, + char *out_buf, + size_t out_len, + size_t *used_len, + int *prval); + +/** + * Start iterating over key/value pairs on an object. + * + * They will be returned sorted by key. 
+ * + * @param read_op operation to add this action to + * @param start_after list keys starting after start_after + * @param filter_prefix list only keys beginning with filter_prefix + * @param max_return list no more than max_return key/value pairs + * @param iter where to store the iterator + * @param prval where to store the return value from this action + */ +CEPH_RADOS_API void rados_read_op_omap_get_vals(rados_read_op_t read_op, + const char *start_after, + const char *filter_prefix, + uint64_t max_return, + rados_omap_iter_t *iter, + int *prval) + __attribute__((deprecated)); /* use v2 below */ + +/** + * Start iterating over key/value pairs on an object. + * + * They will be returned sorted by key. + * + * @param read_op operation to add this action to + * @param start_after list keys starting after start_after + * @param filter_prefix list only keys beginning with filter_prefix + * @param max_return list no more than max_return key/value pairs + * @param iter where to store the iterator + * @param pmore flag indicating whether there are more keys to fetch + * @param prval where to store the return value from this action + */ +CEPH_RADOS_API void rados_read_op_omap_get_vals2(rados_read_op_t read_op, + const char *start_after, + const char *filter_prefix, + uint64_t max_return, + rados_omap_iter_t *iter, + unsigned char *pmore, + int *prval); + +/** + * Start iterating over keys on an object. + * + * They will be returned sorted by key, and the iterator + * will fill in NULL for all values if specified. 
+ * + * @param read_op operation to add this action to + * @param start_after list keys starting after start_after + * @param max_return list no more than max_return keys + * @param iter where to store the iterator + * @param prval where to store the return value from this action + */ +CEPH_RADOS_API void rados_read_op_omap_get_keys(rados_read_op_t read_op, + const char *start_after, + uint64_t max_return, + rados_omap_iter_t *iter, + int *prval) + __attribute__((deprecated)); /* use v2 below */ + +/** + * Start iterating over keys on an object. + * + * They will be returned sorted by key, and the iterator + * will fill in NULL for all values if specified. + * + * @param read_op operation to add this action to + * @param start_after list keys starting after start_after + * @param max_return list no more than max_return keys + * @param iter where to store the iterator + * @param pmore flag indicating whether there are more keys to fetch + * @param prval where to store the return value from this action + */ +CEPH_RADOS_API void rados_read_op_omap_get_keys2(rados_read_op_t read_op, + const char *start_after, + uint64_t max_return, + rados_omap_iter_t *iter, + unsigned char *pmore, + int *prval); + +/** + * Start iterating over specific key/value pairs + * + * They will be returned sorted by key. + * + * @param read_op operation to add this action to + * @param keys array of pointers to null-terminated keys to get + * @param keys_len the number of strings in keys + * @param iter where to store the iterator + * @param prval where to store the return value from this action + */ +CEPH_RADOS_API void rados_read_op_omap_get_vals_by_keys(rados_read_op_t read_op, + char const* const* keys, + size_t keys_len, + rados_omap_iter_t *iter, + int *prval); + +/** + * Start iterating over specific key/value pairs + * + * They will be returned sorted by key. 
+ * + * @param read_op operation to add this action to + * @param keys array of pointers to keys to get + * @param num_keys the number of strings in keys + * @param key_lens array of size_t's describing each key len (in bytes) + * @param iter where to store the iterator + * @param prval where to store the return value from this action + */ +CEPH_RADOS_API void rados_read_op_omap_get_vals_by_keys2(rados_read_op_t read_op, + char const* const* keys, + size_t num_keys, + const size_t* key_lens, + rados_omap_iter_t *iter, + int *prval); + +/** + * Perform a read operation synchronously + * @param read_op operation to perform + * @param io the ioctx that the object is in + * @param oid the object id + * @param flags flags to apply to the entire operation (LIBRADOS_OPERATION_*) + */ +CEPH_RADOS_API int rados_read_op_operate(rados_read_op_t read_op, + rados_ioctx_t io, + const char *oid, + int flags); + +/** + * Perform a read operation asynchronously + * @param read_op operation to perform + * @param io the ioctx that the object is in + * @param completion what to do when operation has been attempted + * @param oid the object id + * @param flags flags to apply to the entire operation (LIBRADOS_OPERATION_*) + */ +CEPH_RADOS_API int rados_aio_read_op_operate(rados_read_op_t read_op, + rados_ioctx_t io, + rados_completion_t completion, + const char *oid, + int flags); + +/** @} Object Operations */ + +/** + * Take an exclusive lock on an object. + * + * @param io the context to operate in + * @param oid the name of the object + * @param name the name of the lock + * @param cookie user-defined identifier for this instance of the lock + * @param desc user-defined lock description + * @param duration the duration of the lock. Set to NULL for infinite duration. 
+ * @param flags lock flags + * @returns 0 on success, negative error code on failure + * @returns -EBUSY if the lock is already held by another (client, cookie) pair + * @returns -EEXIST if the lock is already held by the same (client, cookie) pair + */ +CEPH_RADOS_API int rados_lock_exclusive(rados_ioctx_t io, const char * oid, + const char * name, const char * cookie, + const char * desc, + struct timeval * duration, + uint8_t flags); + +/** + * Take a shared lock on an object. + * + * @param io the context to operate in + * @param o the name of the object + * @param name the name of the lock + * @param cookie user-defined identifier for this instance of the lock + * @param tag The tag of the lock + * @param desc user-defined lock description + * @param duration the duration of the lock. Set to NULL for infinite duration. + * @param flags lock flags + * @returns 0 on success, negative error code on failure + * @returns -EBUSY if the lock is already held by another (client, cookie) pair + * @returns -EEXIST if the lock is already held by the same (client, cookie) pair + */ +CEPH_RADOS_API int rados_lock_shared(rados_ioctx_t io, const char * o, + const char * name, const char * cookie, + const char * tag, const char * desc, + struct timeval * duration, uint8_t flags); + +/** + * Release a shared or exclusive lock on an object. + * + * @param io the context to operate in + * @param o the name of the object + * @param name the name of the lock + * @param cookie user-defined identifier for the instance of the lock + * @returns 0 on success, negative error code on failure + * @returns -ENOENT if the lock is not held by the specified (client, cookie) pair + */ +CEPH_RADOS_API int rados_unlock(rados_ioctx_t io, const char *o, + const char *name, const char *cookie); + +/** + * Asynchronously release a shared or exclusive lock on an object.
+ * + * @param io the context to operate in + * @param o the name of the object + * @param name the name of the lock + * @param cookie user-defined identifier for the instance of the lock + * @param completion what to do when operation has been attempted + * @returns 0 on success, negative error code on failure + */ +CEPH_RADOS_API int rados_aio_unlock(rados_ioctx_t io, const char *o, + const char *name, const char *cookie, + rados_completion_t completion); + +/** + * List clients that have locked the named object lock and information about + * the lock. + * + * The number of bytes required in each buffer is put in the + * corresponding size out parameter. If any of the provided buffers + * are too short, -ERANGE is returned after these sizes are filled in. + * + * @param io the context to operate in + * @param o the name of the object + * @param name the name of the lock + * @param exclusive where to store whether the lock is exclusive (1) or shared (0) + * @param tag where to store the tag associated with the object lock + * @param tag_len number of bytes in tag buffer + * @param clients buffer in which locker clients are stored, separated by '\0' + * @param clients_len number of bytes in the clients buffer + * @param cookies buffer in which locker cookies are stored, separated by '\0' + * @param cookies_len number of bytes in the cookies buffer + * @param addrs buffer in which locker addresses are stored, separated by '\0' + * @param addrs_len number of bytes in the addrs buffer + * @returns number of lockers on success, negative error code on failure + * @returns -ERANGE if any of the buffers are too short + */ +CEPH_RADOS_API ssize_t rados_list_lockers(rados_ioctx_t io, const char *o, + const char *name, int *exclusive, + char *tag, size_t *tag_len, + char *clients, size_t *clients_len, + char *cookies, size_t *cookies_len, + char *addrs, size_t *addrs_len); + +/** + * Releases a shared or exclusive lock on an object, which was taken by the + * specified
client. + * + * @param io the context to operate in + * @param o the name of the object + * @param name the name of the lock + * @param client the client currently holding the lock + * @param cookie user-defined identifier for the instance of the lock + * @returns 0 on success, negative error code on failure + * @returns -ENOENT if the lock is not held by the specified (client, cookie) pair + * @returns -EINVAL if the client cannot be parsed + */ +CEPH_RADOS_API int rados_break_lock(rados_ioctx_t io, const char *o, + const char *name, const char *client, + const char *cookie); + +/** + * Blacklists the specified client from the OSDs + * + * @param cluster cluster handle + * @param client_address client address + * @param expire_seconds number of seconds to blacklist (0 for default) + * @returns 0 on success, negative error code on failure + */ +CEPH_RADOS_API int rados_blacklist_add(rados_t cluster, + char *client_address, + uint32_t expire_seconds); + +/** + * Gets addresses of the RADOS session, suitable for blacklisting. + * + * @param cluster cluster handle + * @param addrs the output string. + * @returns 0 on success, negative error code on failure + */ +CEPH_RADOS_API int rados_getaddrs(rados_t cluster, char** addrs); + +CEPH_RADOS_API void rados_set_osdmap_full_try(rados_ioctx_t io); + +CEPH_RADOS_API void rados_unset_osdmap_full_try(rados_ioctx_t io); + +/** + * Enable an application on a pool + * + * @param io pool ioctx + * @param app_name application name + * @param force 0 if only single application per pool + * @returns 0 on success, negative error code on failure + */ +CEPH_RADOS_API int rados_application_enable(rados_ioctx_t io, + const char *app_name, int force); + +/** + * List all enabled applications + * + * If the provided buffer is too short, the required length is filled in and + * -ERANGE is returned. Otherwise, the buffers are filled with the application + * names, with a '\0' after each. 
+ * + * @param io pool ioctx + * @param values buffer in which to store application names + * @param values_len number of bytes in values buffer + * @returns 0 on success, negative error code on failure + * @returns -ERANGE if the buffer is too short + */ +CEPH_RADOS_API int rados_application_list(rados_ioctx_t io, char *values, + size_t *values_len); + +/** + * Get application metadata value from pool + * + * @param io pool ioctx + * @param app_name application name + * @param key metadata key + * @param value result buffer + * @param value_len maximum len of value + * @returns 0 on success, negative error code on failure + */ +CEPH_RADOS_API int rados_application_metadata_get(rados_ioctx_t io, + const char *app_name, + const char *key, char *value, + size_t *value_len); + +/** + * Set application metadata on a pool + * + * @param io pool ioctx + * @param app_name application name + * @param key metadata key + * @param value metadata value + * @returns 0 on success, negative error code on failure + */ +CEPH_RADOS_API int rados_application_metadata_set(rados_ioctx_t io, + const char *app_name, + const char *key, + const char *value); + +/** + * Remove application metadata from a pool + * + * @param io pool ioctx + * @param app_name application name + * @param key metadata key + * @returns 0 on success, negative error code on failure + */ +CEPH_RADOS_API int rados_application_metadata_remove(rados_ioctx_t io, + const char *app_name, + const char *key); + +/** + * List all metadata key/value pairs associated with an application. + * + * This iterates over all metadata, key_len and vals_len are filled in + * with the number of bytes put into the keys and values buffers. + * + * If the provided buffers are too short, the required lengths are filled + * in and -ERANGE is returned. Otherwise, the buffers are filled with + * the keys and values of the metadata, with a '\0' after each. 
+ * + * @param io pool ioctx + * @param app_name application name + * @param keys buffer in which to store key names + * @param key_len number of bytes in keys buffer + * @param values buffer in which to store values + * @param vals_len number of bytes in values buffer + * @returns 0 on success, negative error code on failure + * @returns -ERANGE if either buffer is too short + */ +CEPH_RADOS_API int rados_application_metadata_list(rados_ioctx_t io, + const char *app_name, + char *keys, size_t *key_len, + char *values, + size_t *vals_len); + +/** + * @name Mon/OSD/PG Commands + * + * These interfaces send commands relating to the monitor, OSD, or PGs. + * + * @{ + */ + +/** + * Send monitor command. + * + * @note Takes command string in carefully-formatted JSON; must match + * defined commands, types, etc. + * + * The result buffers are allocated on the heap; the caller is + * expected to release that memory with rados_buffer_free(). The + * buffer and length pointers can all be NULL, in which case they are + * not filled in. + * + * @param cluster cluster handle + * @param cmd an array of char *'s representing the command + * @param cmdlen count of valid entries in cmd + * @param inbuf any bulk input data (crush map, etc.) + * @param inbuflen input buffer length + * @param outbuf double pointer to output buffer + * @param outbuflen pointer to output buffer length + * @param outs double pointer to status string + * @param outslen pointer to status string length + * @returns 0 on success, negative error code on failure + */ +CEPH_RADOS_API int rados_mon_command(rados_t cluster, const char **cmd, + size_t cmdlen, const char *inbuf, + size_t inbuflen, char **outbuf, + size_t *outbuflen, char **outs, + size_t *outslen); + +/** + * Send ceph-mgr command. + * + * @note Takes command string in carefully-formatted JSON; must match + * defined commands, types, etc. 
+ * + * The result buffers are allocated on the heap; the caller is + * expected to release that memory with rados_buffer_free(). The + * buffer and length pointers can all be NULL, in which case they are + * not filled in. + * + * @param cluster cluster handle + * @param cmd an array of char *'s representing the command + * @param cmdlen count of valid entries in cmd + * @param inbuf any bulk input data (crush map, etc.) + * @param inbuflen input buffer length + * @param outbuf double pointer to output buffer + * @param outbuflen pointer to output buffer length + * @param outs double pointer to status string + * @param outslen pointer to status string length + * @returns 0 on success, negative error code on failure + */ +CEPH_RADOS_API int rados_mgr_command(rados_t cluster, const char **cmd, + size_t cmdlen, const char *inbuf, + size_t inbuflen, char **outbuf, + size_t *outbuflen, char **outs, + size_t *outslen); + +/** + * Send monitor command to a specific monitor. + * + * @note Takes command string in carefully-formatted JSON; must match + * defined commands, types, etc. + * + * The result buffers are allocated on the heap; the caller is + * expected to release that memory with rados_buffer_free(). The + * buffer and length pointers can all be NULL, in which case they are + * not filled in. + * + * @param cluster cluster handle + * @param name target monitor's name + * @param cmd an array of char *'s representing the command + * @param cmdlen count of valid entries in cmd + * @param inbuf any bulk input data (crush map, etc.) 
+ * @param inbuflen input buffer length + * @param outbuf double pointer to output buffer + * @param outbuflen pointer to output buffer length + * @param outs double pointer to status string + * @param outslen pointer to status string length + * @returns 0 on success, negative error code on failure + */ +CEPH_RADOS_API int rados_mon_command_target(rados_t cluster, const char *name, + const char **cmd, size_t cmdlen, + const char *inbuf, size_t inbuflen, + char **outbuf, size_t *outbuflen, + char **outs, size_t *outslen); + +/** + * free a rados-allocated buffer + * + * Release memory allocated by librados calls like rados_mon_command(). + * + * @param buf buffer pointer + */ +CEPH_RADOS_API void rados_buffer_free(char *buf); + +CEPH_RADOS_API int rados_osd_command(rados_t cluster, int osdid, + const char **cmd, size_t cmdlen, + const char *inbuf, size_t inbuflen, + char **outbuf, size_t *outbuflen, + char **outs, size_t *outslen); + +CEPH_RADOS_API int rados_pg_command(rados_t cluster, const char *pgstr, + const char **cmd, size_t cmdlen, + const char *inbuf, size_t inbuflen, + char **outbuf, size_t *outbuflen, + char **outs, size_t *outslen); + +CEPH_RADOS_API int rados_mgr_command(rados_t cluster, + const char **cmd, size_t cmdlen, + const char *inbuf, size_t inbuflen, + char **outbuf, size_t *outbuflen, + char **outs, size_t *outslen); + +/* + * This is not a doxygen comment leadin, because doxygen breaks on + * a typedef with function params and returns, and I can't figure out + * how to fix it. + * + * Monitor cluster log + * + * Monitor events logged to the cluster log. The callback get each + * log entry both as a single formatted line and with each field in a + * separate arg. + * + * Calling with a cb argument of NULL will deregister any previously + * registered callback. + * + * @param cluster cluster handle + * @param level minimum log level (debug, info, warn|warning, err|error) + * @param cb callback to run for each log message. 
It MUST NOT block + * nor call back into librados. + * @param arg void argument to pass to cb + * + * @returns 0 on success, negative code on error + */ +typedef void (*rados_log_callback_t)(void *arg, + const char *line, + const char *who, + uint64_t sec, uint64_t nsec, + uint64_t seq, const char *level, + const char *msg); + +/* + * This is not a doxygen comment leadin, because doxygen breaks on + * a typedef with function params and returns, and I can't figure out + * how to fix it. + * + * Monitor cluster log + * + * Monitor events logged to the cluster log. The callback gets each + * log entry both as a single formatted line and with each field in a + * separate arg. + * + * Calling with a cb argument of NULL will deregister any previously + * registered callback. + * + * @param cluster cluster handle + * @param level minimum log level (debug, info, warn|warning, err|error) + * @param cb callback to run for each log message. It MUST NOT block + * nor call back into librados. + * @param arg void argument to pass to cb + * + * @returns 0 on success, negative code on error + */ +typedef void (*rados_log_callback2_t)(void *arg, + const char *line, + const char *channel, + const char *who, + const char *name, + uint64_t sec, uint64_t nsec, + uint64_t seq, const char *level, + const char *msg); + +CEPH_RADOS_API int rados_monitor_log(rados_t cluster, const char *level, + rados_log_callback_t cb, void *arg); +CEPH_RADOS_API int rados_monitor_log2(rados_t cluster, const char *level, + rados_log_callback2_t cb, void *arg); + + +/** + * register daemon instance for a service + * + * Register us as a daemon providing a particular service. We identify + * the service (e.g., 'rgw') and our instance name (e.g., 'rgw.$hostname'). + * The metadata is a map of keys and values with arbitrary static metadata + * for this instance. The encoding is a series of NULL-terminated strings, + * alternating key names and values, terminating with an empty key name. 
+ * For example, "foo\0bar\0this\0that\0\0" is the dict {foo=bar,this=that}. + * + * For the lifetime of the librados instance, regular beacons will be sent + * to the cluster to maintain our registration in the service map. + * + * @param cluster handle + * @param service service name + * @param daemon daemon instance name + * @param metadata_dict static daemon metadata dict + */ +CEPH_RADOS_API int rados_service_register( + rados_t cluster, + const char *service, + const char *daemon, + const char *metadata_dict); + +/** + * update daemon status + * + * Update our mutable status information in the service map. + * + * The status dict is encoded the same way the daemon metadata is encoded + * for rados_service_register. For example, "foo\0bar\0this\0that\0\0" is + * {foo=bar,this=that}. + * + * @param cluster rados cluster handle + * @param status_dict status dict + */ +CEPH_RADOS_API int rados_service_update_status( + rados_t cluster, + const char *status_dict); + +/** @} Mon/OSD/PG commands */ + +/* + * These methods are no longer supported and return -ENOTSUP where possible. 
+ */ +CEPH_RADOS_API int rados_objects_list_open( + rados_ioctx_t io, + rados_list_ctx_t *ctx) __attribute__((deprecated)); +CEPH_RADOS_API uint32_t rados_objects_list_get_pg_hash_position( + rados_list_ctx_t ctx) __attribute__((deprecated)); +CEPH_RADOS_API uint32_t rados_objects_list_seek( + rados_list_ctx_t ctx, + uint32_t pos) __attribute__((deprecated)); +CEPH_RADOS_API int rados_objects_list_next( + rados_list_ctx_t ctx, + const char **entry, + const char **key) __attribute__((deprecated)); +CEPH_RADOS_API void rados_objects_list_close( + rados_list_ctx_t ctx) __attribute__((deprecated)); + + +#ifdef __cplusplus +} +#endif + +#endif diff --git a/src/include/rados/librados.hpp b/src/include/rados/librados.hpp new file mode 100644 index 00000000..0c047c43 --- /dev/null +++ b/src/include/rados/librados.hpp @@ -0,0 +1,1468 @@ +#ifndef __LIBRADOS_HPP +#define __LIBRADOS_HPP + +#include <string> +#include <list> +#include <map> +#include <memory> +#include <set> +#include <vector> +#include <utility> +#include "buffer.h" + +#include "librados.h" +#include "librados_fwd.hpp" +#include "rados_types.hpp" + +namespace libradosstriper +{ + class RadosStriper; +} + +namespace librados { + +using ceph::bufferlist; + +struct AioCompletionImpl; +struct IoCtxImpl; +struct ListObjectImpl; +class NObjectIteratorImpl; +struct ObjListCtx; +class ObjectOperationImpl; +struct PlacementGroupImpl; +struct PoolAsyncCompletionImpl; + +typedef struct rados_cluster_stat_t cluster_stat_t; +typedef struct rados_pool_stat_t pool_stat_t; + +typedef void *list_ctx_t; +typedef uint64_t auid_t; +typedef void *config_t; + +typedef struct { + std::string client; + std::string cookie; + std::string address; +} locker_t; + +typedef std::map<std::string, pool_stat_t> stats_map; + +typedef void *completion_t; +typedef void (*callback_t)(completion_t cb, void *arg); + +inline namespace v14_2_0 { + + class IoCtx; + class RadosClient; + + class CEPH_RADOS_API ListObject + { + public: + const 
std::string& get_nspace() const; + const std::string& get_oid() const; + const std::string& get_locator() const; + + ListObject(); + ~ListObject(); + ListObject( const ListObject&); + ListObject& operator=(const ListObject& rhs); + private: + ListObject(ListObjectImpl *impl); + + friend class librados::NObjectIteratorImpl; + friend std::ostream& operator<<(std::ostream& out, const ListObject& lop); + + ListObjectImpl *impl; + }; + CEPH_RADOS_API std::ostream& operator<<(std::ostream& out, const librados::ListObject& lop); + + class CEPH_RADOS_API NObjectIterator; + + class CEPH_RADOS_API ObjectCursor + { + public: + ObjectCursor(); + ObjectCursor(const ObjectCursor &rhs); + explicit ObjectCursor(rados_object_list_cursor c); + ~ObjectCursor(); + ObjectCursor& operator=(const ObjectCursor& rhs); + bool operator<(const ObjectCursor &rhs) const; + bool operator==(const ObjectCursor &rhs) const; + void set(rados_object_list_cursor c); + + friend class IoCtx; + friend class librados::NObjectIteratorImpl; + friend std::ostream& operator<<(std::ostream& os, const librados::ObjectCursor& oc); + + std::string to_str() const; + bool from_str(const std::string& s); + + protected: + rados_object_list_cursor c_cursor; + }; + CEPH_RADOS_API std::ostream& operator<<(std::ostream& os, const librados::ObjectCursor& oc); + + class CEPH_RADOS_API NObjectIterator : public std::iterator <std::forward_iterator_tag, ListObject> { + public: + static const NObjectIterator __EndObjectIterator; + NObjectIterator(): impl(NULL) {} + ~NObjectIterator(); + NObjectIterator(const NObjectIterator &rhs); + NObjectIterator& operator=(const NObjectIterator& rhs); + + bool operator==(const NObjectIterator& rhs) const; + bool operator!=(const NObjectIterator& rhs) const; + const ListObject& operator*() const; + const ListObject* operator->() const; + NObjectIterator &operator++(); //< Preincrement; errors are thrown as exceptions + NObjectIterator operator++(int); //< Postincrement; errors are thrown as 
exceptions + friend class IoCtx; + friend class librados::NObjectIteratorImpl; + + /// get current hash position of the iterator, rounded to the current pg + uint32_t get_pg_hash_position() const; + + /// move the iterator to a given hash position. this may (will!) be rounded + /// to the nearest pg. errors are thrown as exceptions + uint32_t seek(uint32_t pos); + + /// move the iterator to a given cursor position. errors are thrown as exceptions + uint32_t seek(const ObjectCursor& cursor); + + /// get current cursor position + ObjectCursor get_cursor(); + + /** + * Configure PGLS filter to be applied OSD-side (requires caller + * to know/understand the format expected by the OSD) + */ + void set_filter(const bufferlist &bl); + + private: + NObjectIterator(ObjListCtx *ctx_); + void get_next(); + NObjectIteratorImpl *impl; + }; + + class CEPH_RADOS_API ObjectItem + { + public: + std::string oid; + std::string nspace; + std::string locator; + }; + + /// DEPRECATED; do not use + class CEPH_RADOS_API WatchCtx { + public: + virtual ~WatchCtx(); + virtual void notify(uint8_t opcode, uint64_t ver, bufferlist& bl) = 0; + }; + + class CEPH_RADOS_API WatchCtx2 { + public: + virtual ~WatchCtx2(); + /** + * Callback activated when we receive a notify event. + * + * @param notify_id unique id for this notify event + * @param cookie the watcher we are notifying + * @param notifier_id the unique client id of the notifier + * @param bl opaque notify payload (from the notifier) + */ + virtual void handle_notify(uint64_t notify_id, + uint64_t cookie, + uint64_t notifier_id, + bufferlist& bl) = 0; + + /** + * Callback activated when we encounter an error with the watch. + * + * Errors we may see: + * -ENOTCONN : our watch was disconnected + * -ETIMEDOUT : our watch is still valid, but we may have missed + * a notify event. 
+ * + * @param cookie the watcher with the problem + * @param err error + */ + virtual void handle_error(uint64_t cookie, int err) = 0; + }; + + struct CEPH_RADOS_API AioCompletion { + AioCompletion(AioCompletionImpl *pc_) : pc(pc_) {} + int set_complete_callback(void *cb_arg, callback_t cb); + int set_safe_callback(void *cb_arg, callback_t cb); + int wait_for_complete(); + int wait_for_safe(); + int wait_for_complete_and_cb(); + int wait_for_safe_and_cb(); + bool is_complete(); + bool is_safe(); + bool is_complete_and_cb(); + bool is_safe_and_cb(); + int get_return_value(); + int get_version() __attribute__ ((deprecated)); + uint64_t get_version64(); + void release(); + AioCompletionImpl *pc; + }; + + struct CEPH_RADOS_API PoolAsyncCompletion { + PoolAsyncCompletion(PoolAsyncCompletionImpl *pc_) : pc(pc_) {} + int set_callback(void *cb_arg, callback_t cb); + int wait(); + bool is_complete(); + int get_return_value(); + void release(); + PoolAsyncCompletionImpl *pc; + }; + + /** + * These are per-op flags which may be different among + * ops added to an ObjectOperation. + */ + enum ObjectOperationFlags { + OP_EXCL = LIBRADOS_OP_FLAG_EXCL, + OP_FAILOK = LIBRADOS_OP_FLAG_FAILOK, + OP_FADVISE_RANDOM = LIBRADOS_OP_FLAG_FADVISE_RANDOM, + OP_FADVISE_SEQUENTIAL = LIBRADOS_OP_FLAG_FADVISE_SEQUENTIAL, + OP_FADVISE_WILLNEED = LIBRADOS_OP_FLAG_FADVISE_WILLNEED, + OP_FADVISE_DONTNEED = LIBRADOS_OP_FLAG_FADVISE_DONTNEED, + OP_FADVISE_NOCACHE = LIBRADOS_OP_FLAG_FADVISE_NOCACHE, + }; + + class CEPH_RADOS_API ObjectOperationCompletion { + public: + virtual ~ObjectOperationCompletion() {} + virtual void handle_completion(int r, bufferlist& outbl) = 0; + }; + + /** + * These flags apply to the ObjectOperation as a whole. + * + * BALANCE_READS and LOCALIZE_READS should only be used + * when reading from data you're certain won't change, + * like a snapshot, or where eventual consistency is ok. 
+ * + * ORDER_READS_WRITES will order reads the same way writes are + * ordered (e.g., waiting for degraded objects). In particular, it + * will make a write followed by a read sequence be preserved. + * + * IGNORE_CACHE will skip the caching logic on the OSD that normally + * handles promotion of objects between tiers. This allows an operation + * to operate (or read) the cached (or uncached) object, even if it is + * not coherent. + * + * IGNORE_OVERLAY will ignore the pool overlay tiering metadata and + * process the op directly on the destination pool. This is useful + * for CACHE_FLUSH and CACHE_EVICT operations. + */ + enum ObjectOperationGlobalFlags { + OPERATION_NOFLAG = LIBRADOS_OPERATION_NOFLAG, + OPERATION_BALANCE_READS = LIBRADOS_OPERATION_BALANCE_READS, + OPERATION_LOCALIZE_READS = LIBRADOS_OPERATION_LOCALIZE_READS, + OPERATION_ORDER_READS_WRITES = LIBRADOS_OPERATION_ORDER_READS_WRITES, + OPERATION_IGNORE_CACHE = LIBRADOS_OPERATION_IGNORE_CACHE, + OPERATION_SKIPRWLOCKS = LIBRADOS_OPERATION_SKIPRWLOCKS, + OPERATION_IGNORE_OVERLAY = LIBRADOS_OPERATION_IGNORE_OVERLAY, + // send requests to cluster despite the cluster or pool being + // marked full; ops will either succeed (e.g., delete) or return + // EDQUOT or ENOSPC + OPERATION_FULL_TRY = LIBRADOS_OPERATION_FULL_TRY, + //mainly for delete + OPERATION_FULL_FORCE = LIBRADOS_OPERATION_FULL_FORCE, + OPERATION_IGNORE_REDIRECT = LIBRADOS_OPERATION_IGNORE_REDIRECT, + OPERATION_ORDERSNAP = LIBRADOS_OPERATION_ORDERSNAP, + }; + + /* + * Alloc hint flags for the alloc_hint operation. 
+ */ + enum AllocHintFlags { + ALLOC_HINT_FLAG_SEQUENTIAL_WRITE = 1, + ALLOC_HINT_FLAG_RANDOM_WRITE = 2, + ALLOC_HINT_FLAG_SEQUENTIAL_READ = 4, + ALLOC_HINT_FLAG_RANDOM_READ = 8, + ALLOC_HINT_FLAG_APPEND_ONLY = 16, + ALLOC_HINT_FLAG_IMMUTABLE = 32, + ALLOC_HINT_FLAG_SHORTLIVED = 64, + ALLOC_HINT_FLAG_LONGLIVED = 128, + ALLOC_HINT_FLAG_COMPRESSIBLE = 256, + ALLOC_HINT_FLAG_INCOMPRESSIBLE = 512, + }; + + /* + * ObjectOperation : compound object operation + * Batch multiple object operations into a single request, to be applied + * atomically. + */ + class CEPH_RADOS_API ObjectOperation + { + public: + ObjectOperation(); + virtual ~ObjectOperation(); + + size_t size(); + void set_op_flags(ObjectOperationFlags flags) __attribute__((deprecated)); + //flag mean ObjectOperationFlags + void set_op_flags2(int flags); + + void cmpext(uint64_t off, const bufferlist& cmp_bl, int *prval); + void cmpxattr(const char *name, uint8_t op, const bufferlist& val); + void cmpxattr(const char *name, uint8_t op, uint64_t v); + void exec(const char *cls, const char *method, bufferlist& inbl); + void exec(const char *cls, const char *method, bufferlist& inbl, bufferlist *obl, int *prval); + void exec(const char *cls, const char *method, bufferlist& inbl, ObjectOperationCompletion *completion); + /** + * Guard operation with a check that object version == ver + * + * @param ver [in] version to check + */ + void assert_version(uint64_t ver); + + /** + * Guard operation with a check that the object already exists + */ + void assert_exists(); + + /** + * get key/value pairs for specified keys + * + * @param assertions [in] comparison assertions + * @param prval [out] place error code in prval upon completion + * + * assertions has the form of mappings from keys to (comparison rval, assertion) + * The assertion field may be CEPH_OSD_CMPXATTR_OP_[GT|LT|EQ]. 
+ * + * That is, to assert that the value at key 'foo' is greater than 'bar': + * + * ObjectReadOperation op; + * int r; + * map<string, pair<bufferlist, int> > assertions; + * bufferlist bar(string('bar')); + * assertions['foo'] = make_pair(bar, CEPH_OSD_CMPXATTR_OP_GT); + * op.omap_cmp(assertions, &r); + */ + void omap_cmp( + const std::map<std::string, std::pair<bufferlist, int> > &assertions, + int *prval); + + protected: + ObjectOperationImpl *impl; + ObjectOperation(const ObjectOperation& rhs); + ObjectOperation& operator=(const ObjectOperation& rhs); + friend class IoCtx; + friend class Rados; + }; + + /* + * ObjectWriteOperation : compound object write operation + * Batch multiple object operations into a single request, to be applied + * atomically. + */ + class CEPH_RADOS_API ObjectWriteOperation : public ObjectOperation + { + protected: + time_t *unused; + public: + ObjectWriteOperation() : unused(NULL) {} + ~ObjectWriteOperation() override {} + + void mtime(time_t *pt); + void mtime2(struct timespec *pts); + + void create(bool exclusive); + void create(bool exclusive, + const std::string& category); ///< NOTE: category is unused + + void write(uint64_t off, const bufferlist& bl); + void write_full(const bufferlist& bl); + void writesame(uint64_t off, uint64_t write_len, + const bufferlist& bl); + void append(const bufferlist& bl); + void remove(); + void truncate(uint64_t off); + void zero(uint64_t off, uint64_t len); + void rmxattr(const char *name); + void setxattr(const char *name, const bufferlist& bl); + void setxattr(const char *name, const bufferlist&& bl); + void tmap_update(const bufferlist& cmdbl); + void tmap_put(const bufferlist& bl); + void selfmanaged_snap_rollback(uint64_t snapid); + + /** + * Rollback an object to the specified snapshot id + * + * Used with pool snapshots + * + * @param snapid [in] snapshot id specified + */ + void snap_rollback(uint64_t snapid); + + /** + * set keys and values according to map + * + * @param map [in] 
keys and values to set + */ + void omap_set(const std::map<std::string, bufferlist> &map); + + /** + * set header + * + * @param bl [in] header to set + */ + void omap_set_header(const bufferlist &bl); + + /** + * Clears omap contents + */ + void omap_clear(); + + /** + * Clears keys in to_rm + * + * @param to_rm [in] keys to remove + */ + void omap_rm_keys(const std::set<std::string> &to_rm); + + /** + * Copy an object + * + * Copies an object from another location. The operation is atomic in that + * the copy either succeeds in its entirety or fails (e.g., because the + * source object was modified while the copy was in progress). + * + * @param src source object name + * @param src_ioctx ioctx for the source object + * @param src_version current version of the source object + * @param src_fadvise_flags the fadvise flags for source object + */ + void copy_from(const std::string& src, const IoCtx& src_ioctx, + uint64_t src_version, uint32_t src_fadvise_flags); + + /** + * undirty an object + * + * Clear an objects dirty flag + */ + void undirty(); + + /** + * Set allocation hint for an object + * + * @param expected_object_size expected size of the object, in bytes + * @param expected_write_size expected size of writes to the object, in bytes + * @param flags flags () + */ + void set_alloc_hint(uint64_t expected_object_size, + uint64_t expected_write_size); + void set_alloc_hint2(uint64_t expected_object_size, + uint64_t expected_write_size, + uint32_t flags); + + /** + * Pin/unpin an object in cache tier + * + * @returns 0 on success, negative error code on failure + */ + void cache_pin(); + void cache_unpin(); + + /** + * Extensible tier + * + * Set redirect target + */ + void set_redirect(const std::string& tgt_obj, const IoCtx& tgt_ioctx, + uint64_t tgt_version, int flag = 0); + void set_chunk(uint64_t src_offset, uint64_t src_length, const IoCtx& tgt_ioctx, + std::string tgt_oid, uint64_t tgt_offset, int flag = 0); + void tier_promote(); + void 
unset_manifest(); + + + friend class IoCtx; + }; + + /* + * ObjectReadOperation : compound object operation that returns values + * Batch multiple object operations into a single request, to be applied + * atomically. + */ + class CEPH_RADOS_API ObjectReadOperation : public ObjectOperation + { + public: + ObjectReadOperation() {} + ~ObjectReadOperation() override {} + + void stat(uint64_t *psize, time_t *pmtime, int *prval); + void stat2(uint64_t *psize, struct timespec *pts, int *prval); + void getxattr(const char *name, bufferlist *pbl, int *prval); + void getxattrs(std::map<std::string, bufferlist> *pattrs, int *prval); + void read(size_t off, uint64_t len, bufferlist *pbl, int *prval); + void checksum(rados_checksum_type_t type, const bufferlist &init_value_bl, + uint64_t off, size_t len, size_t chunk_size, bufferlist *pbl, + int *prval); + + /** + * see aio_sparse_read() + */ + void sparse_read(uint64_t off, uint64_t len, std::map<uint64_t,uint64_t> *m, + bufferlist *data_bl, int *prval); + + /** + * omap_get_vals: keys and values from the object omap + * + * Get up to max_return keys and values beginning after start_after + * + * @param start_after [in] list no keys smaller than start_after + * @param max_return [in] list no more than max_return key/value pairs + * @param out_vals [out] place returned values in out_vals on completion + * @param prval [out] place error code in prval upon completion + */ + void omap_get_vals( + const std::string &start_after, + uint64_t max_return, + std::map<std::string, bufferlist> *out_vals, + int *prval) __attribute__ ((deprecated)); // use v2 + + /** + * omap_get_vals2: keys and values from the object omap + * + * Get up to max_return keys and values beginning after start_after + * + * @param start_after [in] list no keys smaller than start_after + * @param max_return [in] list no more than max_return key/value pairs + * @param out_vals [out] place returned values in out_vals on completion + * @param prval [out] place error 
code in prval upon completion + */ + void omap_get_vals2( + const std::string &start_after, + uint64_t max_return, + std::map<std::string, bufferlist> *out_vals, + bool *pmore, + int *prval); + + /** + * omap_get_vals: keys and values from the object omap + * + * Get up to max_return keys and values beginning after start_after + * + * @param start_after [in] list keys starting after start_after + * @param filter_prefix [in] list only keys beginning with filter_prefix + * @param max_return [in] list no more than max_return key/value pairs + * @param out_vals [out] place returned values in out_vals on completion + * @param prval [out] place error code in prval upon completion + */ + void omap_get_vals( + const std::string &start_after, + const std::string &filter_prefix, + uint64_t max_return, + std::map<std::string, bufferlist> *out_vals, + int *prval) __attribute__ ((deprecated)); // use v2 + + /** + * omap_get_vals2: keys and values from the object omap + * + * Get up to max_return keys and values beginning after start_after + * + * @param start_after [in] list keys starting after start_after + * @param filter_prefix [in] list only keys beginning with filter_prefix + * @param max_return [in] list no more than max_return key/value pairs + * @param out_vals [out] place returned values in out_vals on completion + * @param pmore [out] pointer to bool indicating whether there are more keys + * @param prval [out] place error code in prval upon completion + */ + void omap_get_vals2( + const std::string &start_after, + const std::string &filter_prefix, + uint64_t max_return, + std::map<std::string, bufferlist> *out_vals, + bool *pmore, + int *prval); + + + /** + * omap_get_keys: keys from the object omap + * + * Get up to max_return keys beginning after start_after + * + * @param start_after [in] list keys starting after start_after + * @param max_return [in] list no more than max_return keys + * @param out_keys [out] place returned values in out_keys on completion + * 
@param prval [out] place error code in prval upon completion + */ + void omap_get_keys(const std::string &start_after, + uint64_t max_return, + std::set<std::string> *out_keys, + int *prval) __attribute__ ((deprecated)); // use v2 + + /** + * omap_get_keys2: keys from the object omap + * + * Get up to max_return keys beginning after start_after + * + * @param start_after [in] list keys starting after start_after + * @param max_return [in] list no more than max_return keys + * @param out_keys [out] place returned values in out_keys on completion + * @param pmore [out] pointer to bool indicating whether there are more keys + * @param prval [out] place error code in prval upon completion + */ + void omap_get_keys2(const std::string &start_after, + uint64_t max_return, + std::set<std::string> *out_keys, + bool *pmore, + int *prval); + + /** + * omap_get_header: get header from object omap + * + * @param header [out] place header here upon completion + * @param prval [out] place error code in prval upon completion + */ + void omap_get_header(bufferlist *header, int *prval); + + /** + * get key/value pairs for specified keys + * + * @param keys [in] keys to get + * @param map [out] place key/value pairs found here on completion + * @param prval [out] place error code in prval upon completion + */ + void omap_get_vals_by_keys(const std::set<std::string> &keys, + std::map<std::string, bufferlist> *map, + int *prval); + + /** + * list_watchers: Get list watchers of object + * + * @param out_watchers [out] place returned values in out_watchers on completion + * @param prval [out] place error code in prval upon completion + */ + void list_watchers(std::list<obj_watch_t> *out_watchers, int *prval); + + /** + * list snapshot clones associated with a logical object + * + * This will include a record for each version of the object, + * include the "HEAD" (which will have a cloneid of SNAP_HEAD). + * Each clone includes a vector of snap ids for which it is + * defined to exist. 
+ * + * NOTE: this operation must be submitted from an IoCtx with a + * read snapid of SNAP_DIR for reliable results. + * + * @param out_snaps [out] pointer to resulting snap_set_t + * @param prval [out] place error code in prval upon completion + */ + void list_snaps(snap_set_t *out_snaps, int *prval); + + /** + * query dirty state of an object + * + * @param isdirty [out] pointer to resulting bool + * @param prval [out] place error code in prval upon completion + */ + void is_dirty(bool *isdirty, int *prval); + + /** + * flush a cache tier object to backing tier; will block racing + * updates. + * + * This should be used in concert with OPERATION_IGNORE_CACHE to avoid + * triggering a promotion. + */ + void cache_flush(); + + /** + * Flush a cache tier object to backing tier; will EAGAIN if we race + * with an update. Must be used with the SKIPRWLOCKS flag. + * + * This should be used in concert with OPERATION_IGNORE_CACHE to avoid + * triggering a promotion. + */ + void cache_try_flush(); + + /** + * evict a clean cache tier object + * + * This should be used in concert with OPERATION_IGNORE_CACHE to avoid + * triggering a promote on the OSD (that is then evicted). + */ + void cache_evict(); + }; + + /* IoCtx : This is a context in which we can perform I/O. + * It includes a Pool. + * + * Typical use (error checking omitted): + * + * IoCtx p; + * rados.ioctx_create("my_pool", p); + * p.stat("my_object", &size, &mtime); + * ... etc ... + * + * NOTE: be sure to call Rados::watch_flush() prior to destroying any IoCtx + * that is used for watch events to ensure that racing callbacks + * have completed. 
+ */ + class CEPH_RADOS_API IoCtx + { + public: + IoCtx(); + static void from_rados_ioctx_t(rados_ioctx_t p, IoCtx &pool); + IoCtx(const IoCtx& rhs); + IoCtx& operator=(const IoCtx& rhs); + IoCtx(IoCtx&& rhs) noexcept; + IoCtx& operator=(IoCtx&& rhs) noexcept; + + ~IoCtx(); + + bool is_valid() const; + + // Close our pool handle + void close(); + + // deep copy + void dup(const IoCtx& rhs); + + // set pool auid + int set_auid(uint64_t auid_) + __attribute__ ((deprecated)); + + // set pool auid + int set_auid_async(uint64_t auid_, PoolAsyncCompletion *c) + __attribute__ ((deprecated)); + + // get pool auid + int get_auid(uint64_t *auid_) + __attribute__ ((deprecated)); + + uint64_t get_instance_id() const; + + std::string get_pool_name(); + + bool pool_requires_alignment(); + int pool_requires_alignment2(bool * req); + uint64_t pool_required_alignment(); + int pool_required_alignment2(uint64_t * alignment); + + // create an object + int create(const std::string& oid, bool exclusive); + int create(const std::string& oid, bool exclusive, + const std::string& category); ///< category is unused + + /** + * write bytes to an object at a specified offset + * + * NOTE: this call steals the contents of @param bl. + */ + int write(const std::string& oid, bufferlist& bl, size_t len, uint64_t off); + /** + * append bytes to an object + * + * NOTE: this call steals the contents of @param bl. + */ + int append(const std::string& oid, bufferlist& bl, size_t len); + /** + * replace object contents with provided data + * + * NOTE: this call steals the contents of @param bl. 
+ */ + int write_full(const std::string& oid, bufferlist& bl); + int writesame(const std::string& oid, bufferlist& bl, + size_t write_len, uint64_t off); + int read(const std::string& oid, bufferlist& bl, size_t len, uint64_t off); + int checksum(const std::string& o, rados_checksum_type_t type, + const bufferlist &init_value_bl, size_t len, uint64_t off, + size_t chunk_size, bufferlist *pbl); + int remove(const std::string& oid); + int remove(const std::string& oid, int flags); + int trunc(const std::string& oid, uint64_t size); + int mapext(const std::string& o, uint64_t off, size_t len, std::map<uint64_t,uint64_t>& m); + int cmpext(const std::string& o, uint64_t off, bufferlist& cmp_bl); + int sparse_read(const std::string& o, std::map<uint64_t,uint64_t>& m, bufferlist& bl, size_t len, uint64_t off); + int getxattr(const std::string& oid, const char *name, bufferlist& bl); + int getxattrs(const std::string& oid, std::map<std::string, bufferlist>& attrset); + int setxattr(const std::string& oid, const char *name, bufferlist& bl); + int rmxattr(const std::string& oid, const char *name); + int stat(const std::string& oid, uint64_t *psize, time_t *pmtime); + int stat2(const std::string& oid, uint64_t *psize, struct timespec *pts); + int exec(const std::string& oid, const char *cls, const char *method, + bufferlist& inbl, bufferlist& outbl); + /** + * modify object tmap based on encoded update sequence + * + * NOTE: this call steals the contents of @param bl + */ + int tmap_update(const std::string& oid, bufferlist& cmdbl); + + int omap_get_vals(const std::string& oid, + const std::string& start_after, + uint64_t max_return, + std::map<std::string, bufferlist> *out_vals); + int omap_get_vals2(const std::string& oid, + const std::string& start_after, + uint64_t max_return, + std::map<std::string, bufferlist> *out_vals, + bool *pmore); + int omap_get_vals(const std::string& oid, + const std::string& start_after, + const std::string& filter_prefix, + uint64_t 
max_return, + std::map<std::string, bufferlist> *out_vals); + int omap_get_vals2(const std::string& oid, + const std::string& start_after, + const std::string& filter_prefix, + uint64_t max_return, + std::map<std::string, bufferlist> *out_vals, + bool *pmore); + int omap_get_keys(const std::string& oid, + const std::string& start_after, + uint64_t max_return, + std::set<std::string> *out_keys); + int omap_get_keys2(const std::string& oid, + const std::string& start_after, + uint64_t max_return, + std::set<std::string> *out_keys, + bool *pmore); + int omap_get_header(const std::string& oid, + bufferlist *bl); + int omap_get_vals_by_keys(const std::string& oid, + const std::set<std::string>& keys, + std::map<std::string, bufferlist> *vals); + int omap_set(const std::string& oid, + const std::map<std::string, bufferlist>& map); + int omap_set_header(const std::string& oid, + const bufferlist& bl); + int omap_clear(const std::string& oid); + int omap_rm_keys(const std::string& oid, + const std::set<std::string>& keys); + + void snap_set_read(snap_t seq); + int selfmanaged_snap_set_write_ctx(snap_t seq, std::vector<snap_t>& snaps); + + // Create a snapshot with a given name + int snap_create(const char *snapname); + + // Look up a snapshot by name. 
+ // Returns 0 on success; error code otherwise + int snap_lookup(const char *snapname, snap_t *snap); + + // Gets a timestamp for a snap + int snap_get_stamp(snap_t snapid, time_t *t); + + // Gets the name of a snap + int snap_get_name(snap_t snapid, std::string *s); + + // Remove a snapshot from this pool + int snap_remove(const char *snapname); + + int snap_list(std::vector<snap_t> *snaps); + + int snap_rollback(const std::string& oid, const char *snapname); + + // Deprecated name kept for backward compatibility - same as snap_rollback() + int rollback(const std::string& oid, const char *snapname) + __attribute__ ((deprecated)); + + int selfmanaged_snap_create(uint64_t *snapid); + void aio_selfmanaged_snap_create(uint64_t *snapid, AioCompletion *c); + + int selfmanaged_snap_remove(uint64_t snapid); + void aio_selfmanaged_snap_remove(uint64_t snapid, AioCompletion *c); + + int selfmanaged_snap_rollback(const std::string& oid, uint64_t snapid); + + // Advisory locking on rados objects. + int lock_exclusive(const std::string &oid, const std::string &name, + const std::string &cookie, + const std::string &description, + struct timeval * duration, uint8_t flags); + + int lock_shared(const std::string &oid, const std::string &name, + const std::string &cookie, const std::string &tag, + const std::string &description, + struct timeval * duration, uint8_t flags); + + int unlock(const std::string &oid, const std::string &name, + const std::string &cookie); + + int break_lock(const std::string &oid, const std::string &name, + const std::string &client, const std::string &cookie); + + int list_lockers(const std::string &oid, const std::string &name, + int *exclusive, + std::string *tag, + std::list<librados::locker_t> *lockers); + + + /// Start enumerating objects for a pool. Errors are thrown as exceptions. + NObjectIterator nobjects_begin(const bufferlist &filter=bufferlist()); + /// Start enumerating objects for a pool starting from a hash position. 
+ /// Errors are thrown as exceptions. + NObjectIterator nobjects_begin(uint32_t start_hash_position, + const bufferlist &filter=bufferlist()); + /// Start enumerating objects for a pool starting from cursor. Errors are + /// thrown as exceptions. + NObjectIterator nobjects_begin(const librados::ObjectCursor& cursor, + const bufferlist &filter=bufferlist()); + /// Iterator indicating the end of a pool + const NObjectIterator& nobjects_end() const; + + /// Get cursor for pool beginning + ObjectCursor object_list_begin(); + + /// Get cursor for pool end + ObjectCursor object_list_end(); + + /// Check whether a cursor is at the end of a pool + bool object_list_is_end(const ObjectCursor &oc); + + /// List some objects between two cursors + int object_list(const ObjectCursor &start, const ObjectCursor &finish, + const size_t result_count, + const bufferlist &filter, + std::vector<ObjectItem> *result, + ObjectCursor *next); + + /// Generate cursors that include the N out of Mth slice of the pool + void object_list_slice( + const ObjectCursor start, + const ObjectCursor finish, + const size_t n, + const size_t m, + ObjectCursor *split_start, + ObjectCursor *split_finish); + + /** + * List available hit set objects + * + * @param hash [in] hash position to query + * @param c [in] completion + * @param pls [out] list of available intervals + */ + int hit_set_list(uint32_t hash, AioCompletion *c, + std::list< std::pair<time_t, time_t> > *pls); + + /** + * Retrieve hit set for a given hash, and time + * + * @param hash [in] hash position + * @param c [in] completion + * @param stamp [in] time interval that falls within the hit set's interval + * @param pbl [out] buffer to store the result in + */ + int hit_set_get(uint32_t hash, AioCompletion *c, time_t stamp, + bufferlist *pbl); + + uint64_t get_last_version(); + + int aio_read(const std::string& oid, AioCompletion *c, + bufferlist *pbl, size_t len, uint64_t off); + /** + * Asynchronously read from an object at a 
particular snapshot + * + * This is the same as normal aio_read, except that it chooses + * the snapshot to read from from its arguments instead of the + * internal IoCtx state. + * + * The return value of the completion will be number of bytes read on + * success, negative error code on failure. + * + * @param oid the name of the object to read from + * @param c what to do when the read is complete + * @param pbl where to store the results + * @param len the number of bytes to read + * @param off the offset to start reading from in the object + * @param snapid the id of the snapshot to read from + * @returns 0 on success, negative error code on failure + */ + int aio_read(const std::string& oid, AioCompletion *c, + bufferlist *pbl, size_t len, uint64_t off, uint64_t snapid); + int aio_sparse_read(const std::string& oid, AioCompletion *c, + std::map<uint64_t,uint64_t> *m, bufferlist *data_bl, + size_t len, uint64_t off); + /** + * Asynchronously read existing extents from an object at a + * particular snapshot + * + * This is the same as normal aio_sparse_read, except that it chooses + * the snapshot to read from from its arguments instead of the + * internal IoCtx state. + * + * m will be filled in with a map of extents in the object, + * mapping offsets to lengths (in bytes) within the range + * requested. The data for all of the extents are stored + * back-to-back in offset order in data_bl. 
+ * + * @param oid the name of the object to read from + * @param c what to do when the read is complete + * @param m where to store the map of extents + * @param data_bl where to store the data + * @param len the number of bytes to read + * @param off the offset to start reading from in the object + * @param snapid the id of the snapshot to read from + * @returns 0 on success, negative error code on failure + */ + int aio_sparse_read(const std::string& oid, AioCompletion *c, + std::map<uint64_t,uint64_t> *m, bufferlist *data_bl, + size_t len, uint64_t off, uint64_t snapid); + /** + * Asynchronously compare an on-disk object range with a buffer + * + * @param oid the name of the object to read from + * @param c what to do when the read is complete + * @param off object byte offset at which to start the comparison + * @param cmp_bl buffer containing bytes to be compared with object contents + * @returns 0 on success, negative error code on failure, + * (-MAX_ERRNO - mismatch_off) on mismatch + */ + int aio_cmpext(const std::string& oid, + librados::AioCompletion *c, + uint64_t off, + bufferlist& cmp_bl); + int aio_write(const std::string& oid, AioCompletion *c, const bufferlist& bl, + size_t len, uint64_t off); + int aio_append(const std::string& oid, AioCompletion *c, const bufferlist& bl, + size_t len); + int aio_write_full(const std::string& oid, AioCompletion *c, const bufferlist& bl); + int aio_writesame(const std::string& oid, AioCompletion *c, const bufferlist& bl, + size_t write_len, uint64_t off); + + /** + * Asynchronously remove an object + * + * Queues the remove and returns. + * + * The return value of the completion will be 0 on success, negative + * error code on failure. 
+ * + * @param oid the name of the object + * @param c what to do when the remove is safe and complete + * @returns 0 on success, -EROFS if the io context specifies a snap_seq + * other than SNAP_HEAD + */ + int aio_remove(const std::string& oid, AioCompletion *c); + int aio_remove(const std::string& oid, AioCompletion *c, int flags); + + /** + * Wait for all currently pending aio writes to be safe. + * + * @returns 0 on success, negative error code on failure + */ + int aio_flush(); + + /** + * Schedule a callback for when all currently pending + * aio writes are safe. This is a non-blocking version of + * aio_flush(). + * + * @param c what to do when the writes are safe + * @returns 0 on success, negative error code on failure + */ + int aio_flush_async(AioCompletion *c); + int aio_getxattr(const std::string& oid, AioCompletion *c, const char *name, bufferlist& bl); + int aio_getxattrs(const std::string& oid, AioCompletion *c, std::map<std::string, bufferlist>& attrset); + int aio_setxattr(const std::string& oid, AioCompletion *c, const char *name, bufferlist& bl); + int aio_rmxattr(const std::string& oid, AioCompletion *c, const char *name); + int aio_stat(const std::string& oid, AioCompletion *c, uint64_t *psize, time_t *pmtime); + int aio_stat2(const std::string& oid, AioCompletion *c, uint64_t *psize, struct timespec *pts); + + /** + * Cancel aio operation + * + * @param c completion handle + * @returns 0 on success, negative error code on failure + */ + int aio_cancel(AioCompletion *c); + + int aio_exec(const std::string& oid, AioCompletion *c, const char *cls, const char *method, + bufferlist& inbl, bufferlist *outbl); + + /* + * asynchronous version of unlock + */ + int aio_unlock(const std::string &oid, const std::string &name, + const std::string &cookie, AioCompletion *c); + + // compound object operations + int operate(const std::string& oid, ObjectWriteOperation *op); + int operate(const std::string& oid, ObjectReadOperation *op, bufferlist *pbl); + 
int aio_operate(const std::string& oid, AioCompletion *c, ObjectWriteOperation *op); + int aio_operate(const std::string& oid, AioCompletion *c, ObjectWriteOperation *op, int flags); + /** + * Schedule an async write operation with explicit snapshot parameters + * + * This is the same as the first aio_operate(), except that it + * gets the snapshot context from its arguments instead of the + * IoCtx internal state. + * + * @param oid the object to operate on + * @param c what to do when the operation is complete and safe + * @param op which operations to perform + * @param seq latest selfmanaged snapshot sequence number for this object + * @param snaps currently existing selfmanaged snapshot ids for this object + * @returns 0 on success, negative error code on failure + */ + int aio_operate(const std::string& oid, AioCompletion *c, + ObjectWriteOperation *op, snap_t seq, + std::vector<snap_t>& snaps); + int aio_operate(const std::string& oid, AioCompletion *c, + ObjectWriteOperation *op, snap_t seq, + std::vector<snap_t>& snaps, + const blkin_trace_info *trace_info); + int aio_operate(const std::string& oid, AioCompletion *c, + ObjectWriteOperation *op, snap_t seq, + std::vector<snap_t>& snaps, int flags, + const blkin_trace_info *trace_info); + int aio_operate(const std::string& oid, AioCompletion *c, + ObjectReadOperation *op, bufferlist *pbl); + + int aio_operate(const std::string& oid, AioCompletion *c, + ObjectReadOperation *op, snap_t snapid, int flags, + bufferlist *pbl) + __attribute__ ((deprecated)); + + int aio_operate(const std::string& oid, AioCompletion *c, + ObjectReadOperation *op, int flags, + bufferlist *pbl); + int aio_operate(const std::string& oid, AioCompletion *c, + ObjectReadOperation *op, int flags, + bufferlist *pbl, const blkin_trace_info *trace_info); + + // watch/notify + int watch2(const std::string& o, uint64_t *handle, + librados::WatchCtx2 *ctx); + int watch3(const std::string& o, uint64_t *handle, + librados::WatchCtx2 *ctx, 
uint32_t timeout); + int aio_watch(const std::string& o, AioCompletion *c, uint64_t *handle, + librados::WatchCtx2 *ctx); + int aio_watch2(const std::string& o, AioCompletion *c, uint64_t *handle, + librados::WatchCtx2 *ctx, uint32_t timeout); + int unwatch2(uint64_t handle); + int aio_unwatch(uint64_t handle, AioCompletion *c); + /** + * Send a notify event to watchers + * + * Upon completion the pbl bufferlist reply payload will be + * encoded like so: + * + * le32 num_acks + * { + * le64 gid global id for the client (for client.1234 that's 1234) + * le64 cookie cookie for the client + * le32 buflen length of reply message buffer + * u8 * buflen payload + * } * num_acks + * le32 num_timeouts + * { + * le64 gid global id for the client + * le64 cookie cookie for the client + * } * num_timeouts + * + * + */ + int notify2(const std::string& o, ///< object + bufferlist& bl, ///< optional broadcast payload + uint64_t timeout_ms, ///< timeout (in ms) + bufferlist *pbl); ///< reply buffer + int aio_notify(const std::string& o, ///< object + AioCompletion *c, ///< completion when notify completes + bufferlist& bl, ///< optional broadcast payload + uint64_t timeout_ms, ///< timeout (in ms) + bufferlist *pbl); ///< reply buffer + + int list_watchers(const std::string& o, std::list<obj_watch_t> *out_watchers); + int list_snaps(const std::string& o, snap_set_t *out_snaps); + void set_notify_timeout(uint32_t timeout); + + /// acknowledge a notify we received. + void notify_ack(const std::string& o, ///< watched object + uint64_t notify_id, ///< notify id + uint64_t cookie, ///< our watch handle + bufferlist& bl); ///< optional reply payload + + /*** + * check on watch validity + * + * Check if a watch is valid. If so, return the number of + * milliseconds since we last confirmed its liveness. If there is + * a known error, return it. + * + * If there is an error, the watch is no longer valid, and should + * be destroyed with unwatch(). 
If the user is still interested in + * the object, a new watch should be created with watch(). + * + * @param cookie watch handle + * @returns ms since last confirmed valid, or error + */ + int watch_check(uint64_t cookie); + + // old, deprecated versions + int watch(const std::string& o, uint64_t ver, uint64_t *cookie, + librados::WatchCtx *ctx) __attribute__ ((deprecated)); + int notify(const std::string& o, uint64_t ver, bufferlist& bl) + __attribute__ ((deprecated)); + int unwatch(const std::string& o, uint64_t cookie) + __attribute__ ((deprecated)); + + /** + * Set allocation hint for an object + * + * This is an advisory operation, it will always succeed (as if it + * was submitted with a OP_FAILOK flag set) and is not guaranteed + * to do anything on the backend. + * + * @param o the name of the object + * @param expected_object_size expected size of the object, in bytes + * @param expected_write_size expected size of writes to the object, in bytes + * @returns 0 on success, negative error code on failure + */ + int set_alloc_hint(const std::string& o, + uint64_t expected_object_size, + uint64_t expected_write_size); + int set_alloc_hint2(const std::string& o, + uint64_t expected_object_size, + uint64_t expected_write_size, + uint32_t flags); + + // assert version for next sync operations + void set_assert_version(uint64_t ver); + + /** + * Pin/unpin an object in cache tier + * + * @param o the name of the object + * @returns 0 on success, negative error code on failure + */ + int cache_pin(const std::string& o); + int cache_unpin(const std::string& o); + + std::string get_pool_name() const; + + void locator_set_key(const std::string& key); + void set_namespace(const std::string& nspace); + std::string get_namespace() const; + + int64_t get_id(); + + // deprecated versions + uint32_t get_object_hash_position(const std::string& oid) + __attribute__ ((deprecated)); + uint32_t get_object_pg_hash_position(const std::string& oid) + __attribute__ ((deprecated)); + + 
int get_object_hash_position2(const std::string& oid, uint32_t *hash_position); + int get_object_pg_hash_position2(const std::string& oid, uint32_t *pg_hash_position); + + config_t cct(); + + void set_osdmap_full_try(); + void unset_osdmap_full_try(); + + int application_enable(const std::string& app_name, bool force); + int application_enable_async(const std::string& app_name, + bool force, PoolAsyncCompletion *c); + int application_list(std::set<std::string> *app_names); + int application_metadata_get(const std::string& app_name, + const std::string &key, + std::string *value); + int application_metadata_set(const std::string& app_name, + const std::string &key, + const std::string& value); + int application_metadata_remove(const std::string& app_name, + const std::string &key); + int application_metadata_list(const std::string& app_name, + std::map<std::string, std::string> *values); + + private: + /* You can only get IoCtx instances from Rados */ + IoCtx(IoCtxImpl *io_ctx_impl_); + + friend class Rados; // Only Rados can use our private constructor to create IoCtxes. 
+ friend class libradosstriper::RadosStriper; // Striper needs to see our IoCtxImpl + friend class ObjectWriteOperation; // copy_from needs to see our IoCtxImpl + + IoCtxImpl *io_ctx_impl; + }; + + struct CEPH_RADOS_API PlacementGroup { + PlacementGroup(); + PlacementGroup(const PlacementGroup&); + ~PlacementGroup(); + bool parse(const char*); + std::unique_ptr<PlacementGroupImpl> impl; + }; + + CEPH_RADOS_API std::ostream& operator<<(std::ostream&, const PlacementGroup&); + + class CEPH_RADOS_API Rados + { + public: + static void version(int *major, int *minor, int *extra); + + Rados(); + explicit Rados(IoCtx& ioctx); + ~Rados(); + static void from_rados_t(rados_t cluster, Rados &rados); + + int init(const char * const id); + int init2(const char * const name, const char * const clustername, + uint64_t flags); + int init_with_context(config_t cct_); + config_t cct(); + int connect(); + void shutdown(); + int watch_flush(); + int aio_watch_flush(AioCompletion*); + int conf_read_file(const char * const path) const; + int conf_parse_argv(int argc, const char ** argv) const; + int conf_parse_argv_remainder(int argc, const char ** argv, + const char ** remargv) const; + int conf_parse_env(const char *env) const; + int conf_set(const char *option, const char *value); + int conf_get(const char *option, std::string &val); + + int service_daemon_register( + const std::string& service, ///< service name (e.g., 'rgw') + const std::string& name, ///< daemon name (e.g., 'gwfoo') + const std::map<std::string,std::string>& metadata); ///< static metadata about daemon + int service_daemon_update_status( + std::map<std::string,std::string>&& status); + + int pool_create(const char *name); + int pool_create(const char *name, uint64_t auid) + __attribute__ ((deprecated)); + int pool_create(const char *name, uint64_t auid, uint8_t crush_rule) + __attribute__ ((deprecated)); + int pool_create_with_rule(const char *name, uint8_t crush_rule); + int pool_create_async(const char *name, 
PoolAsyncCompletion *c); + int pool_create_async(const char *name, uint64_t auid, PoolAsyncCompletion *c) + __attribute__ ((deprecated)); + int pool_create_async(const char *name, uint64_t auid, uint8_t crush_rule, PoolAsyncCompletion *c) + __attribute__ ((deprecated)); + int pool_create_with_rule_async(const char *name, uint8_t crush_rule, PoolAsyncCompletion *c); + int pool_get_base_tier(int64_t pool, int64_t* base_tier); + int pool_delete(const char *name); + int pool_delete_async(const char *name, PoolAsyncCompletion *c); + int64_t pool_lookup(const char *name); + int pool_reverse_lookup(int64_t id, std::string *name); + + uint64_t get_instance_id(); + + int get_min_compatible_osd(int8_t* require_osd_release); + int get_min_compatible_client(int8_t* min_compat_client, + int8_t* require_min_compat_client); + + int mon_command(std::string cmd, const bufferlist& inbl, + bufferlist *outbl, std::string *outs); + int mgr_command(std::string cmd, const bufferlist& inbl, + bufferlist *outbl, std::string *outs); + int osd_command(int osdid, std::string cmd, const bufferlist& inbl, + bufferlist *outbl, std::string *outs); + int pg_command(const char *pgstr, std::string cmd, const bufferlist& inbl, + bufferlist *outbl, std::string *outs); + + int ioctx_create(const char *name, IoCtx &pioctx); + int ioctx_create2(int64_t pool_id, IoCtx &pioctx); + + // Features useful for test cases + void test_blacklist_self(bool set); + + /* pool info */ + int pool_list(std::list<std::string>& v); + int pool_list2(std::list<std::pair<int64_t, std::string> >& v); + int get_pool_stats(std::list<std::string>& v, + stats_map& result); + /// deprecated; use simpler form. categories no longer supported. 
+ int get_pool_stats(std::list<std::string>& v, + std::map<std::string, stats_map>& stats); + /// deprecated; categories no longer supported + int get_pool_stats(std::list<std::string>& v, + std::string& category, + std::map<std::string, stats_map>& stats); + /// check if pool has selfmanaged snaps + bool get_pool_is_selfmanaged_snaps_mode(const std::string& poolname); + + int cluster_stat(cluster_stat_t& result); + int cluster_fsid(std::string *fsid); + + /** + * List inconsistent placement groups in the given pool + * + * @param pool_id the pool id + * @param pgs [out] the inconsistent PGs + */ + int get_inconsistent_pgs(int64_t pool_id, + std::vector<PlacementGroup>* pgs); + /** + * List the inconsistent objects found in a given PG by last scrub + * + * @param pg the placement group returned by @c pg_list() + * @param start_after the first returned @c objects + * @param max_return the max number of the returned @c objects + * @param c what to do when the operation is complete and safe + * @param objects [out] the objects where inconsistencies are found + * @param interval [in,out] an epoch indicating current interval + * @returns if a non-zero @c interval is specified, will return -EAGAIN if + * the current interval begin epoch is different. 
+ */ + int get_inconsistent_objects(const PlacementGroup& pg, + const object_id_t &start_after, + unsigned max_return, + AioCompletion *c, + std::vector<inconsistent_obj_t>* objects, + uint32_t* interval); + /** + * List the inconsistent snapsets found in a given PG by last scrub + * + * @param pg the placement group returned by @c pg_list() + * @param start_after the first returned @c objects + * @param max_return the max number of the returned @c objects + * @param c what to do when the operation is complete and safe + * @param snapset [out] the objects where inconsistencies are found + * @param interval [in,out] an epoch indicating current interval + * @returns if a non-zero @c interval is specified, will return -EAGAIN if + * the current interval begin epoch is different. + */ + int get_inconsistent_snapsets(const PlacementGroup& pg, + const object_id_t &start_after, + unsigned max_return, + AioCompletion *c, + std::vector<inconsistent_snapset_t>* snapset, + uint32_t* interval); + + /// get/wait for the most recent osdmap + int wait_for_latest_osdmap(); + + int blacklist_add(const std::string& client_address, + uint32_t expire_seconds); + + /* + * pool aio + * + * It is up to the caller to release the completion handler, even if the pool_create_async() + * and/or pool_delete_async() fails and does not send the async request + */ + static PoolAsyncCompletion *pool_async_create_completion(); + + // -- aio -- + static AioCompletion *aio_create_completion(); + static AioCompletion *aio_create_completion(void *cb_arg, callback_t cb_complete, + callback_t cb_safe); + + friend std::ostream& operator<<(std::ostream &oss, const Rados& r); + private: + // We don't allow assignment or copying + Rados(const Rados& rhs); + const Rados& operator=(const Rados& rhs); + RadosClient *client; + }; + +} // namespace v14_2_0 +} // namespace librados + +#endif + diff --git a/src/include/rados/librados_fwd.hpp b/src/include/rados/librados_fwd.hpp new file mode 100644 index 
00000000..8926d097 --- /dev/null +++ b/src/include/rados/librados_fwd.hpp @@ -0,0 +1,32 @@ +#ifndef __LIBRADOS_FWD_HPP +#define __LIBRADOS_FWD_HPP + +namespace libradosstriper { + +class RadosStriper; + +} // namespace libradosstriper + +namespace librados { +inline namespace v14_2_0 { + +class AioCompletion; +class IoCtx; +class ListObject; +class NObjectIterator; +class ObjectCursor; +class ObjectItem; +class ObjectOperation; +class ObjectOperationCompletion; +class ObjectReadOperation; +class ObjectWriteOperation; +class PlacementGroup; +class PoolAsyncCompletion; +class Rados; +class WatchCtx; +class WatchCtx2; + +} // inline namespace v14_2_0 +} // namespace librados + +#endif // __LIBRADOS_FWD_HPP diff --git a/src/include/rados/librgw.h b/src/include/rados/librgw.h new file mode 100644 index 00000000..c20e96be --- /dev/null +++ b/src/include/rados/librgw.h @@ -0,0 +1,36 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2011 New Dream Network + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. 
+ * + */ +#ifndef CEPH_LIBRGW_H +#define CEPH_LIBRGW_H + +#ifdef __cplusplus +extern "C" { +#endif + +#define LIBRGW_VER_MAJOR 1 +#define LIBRGW_VER_MINOR 1 +#define LIBRGW_VER_EXTRA 0 + +#define LIBRGW_VERSION(maj, min, extra) ((maj << 16) + (min << 8) + extra) +#define LIBRGW_VERSION_CODE LIBRGW_VERSION(LIBRGW_VER_MAJOR, LIBRGW_VER_MINOR, LIBRGW_VER_EXTRA) + +typedef void* librgw_t; +int librgw_create(librgw_t *rgw, int argc, char **argv); +void librgw_shutdown(librgw_t rgw); + +#ifdef __cplusplus +} +#endif + +#endif /* CEPH_LIBRGW_H */ diff --git a/src/include/rados/objclass.h b/src/include/rados/objclass.h new file mode 100644 index 00000000..80ae69d2 --- /dev/null +++ b/src/include/rados/objclass.h @@ -0,0 +1,177 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#ifndef CEPH_OBJCLASS_OBJCLASS_PUBLIC_H +#define CEPH_OBJCLASS_OBJCLASS_PUBLIC_H + +#ifdef __cplusplus + +#include "buffer.h" + +extern "C" { +#endif + +#define CEPH_CLS_API [[gnu::visibility("default")]] + +#define CLS_VER(maj,min) \ +int __cls_ver__## maj ## _ ##min = 0; \ +int __cls_ver_maj = maj; \ +int __cls_ver_min = min; + +#define CLS_NAME(name) \ +int __cls_name__## name = 0; \ +const char *__cls_name = #name; + +#define CLS_INIT(name) \ +CEPH_CLS_API void __cls_init() + +#define CLS_METHOD_RD 0x1 /// method executes read operations +#define CLS_METHOD_WR 0x2 /// method executes write operations +#define CLS_METHOD_PROMOTE 0x8 /// method cannot be proxied to base tier + +#define CLS_LOG(level, fmt, ...) \ + cls_log(level, "<cls> %s:%d: " fmt, __FILE__, __LINE__, ##__VA_ARGS__) +#define CLS_ERR(fmt, ...) CLS_LOG(0, fmt, ##__VA_ARGS__) + +/** + * Initialize a class. + */ +void __cls_init(); + +/** + * @typedef cls_handle_t + * + * A handle for interacting with the object class. + */ +typedef void *cls_handle_t; + +/** + * @typedef cls_method_handle_t + * + * A handle for interacting with the method of the object class.
+ */ +typedef void *cls_method_handle_t; + +/** + * @typedef cls_method_context_t + * + * A context for the method of the object class. + */ +typedef void* cls_method_context_t; + +/*class utils*/ +extern int cls_log(int level, const char *format, ...) + __attribute__((__format__(printf, 2, 3))); + +/* class registration api */ +extern int cls_register(const char *name, cls_handle_t *handle); + +#ifdef __cplusplus +} + +/** + * @typedef cls_method_cxx_call_t + * + */ +typedef int (*cls_method_cxx_call_t)(cls_method_context_t ctx, + class ceph::buffer::list *inbl, class ceph::buffer::list *outbl); + +/** + * Register a method. + * + * @param hclass + * @param method + * @param flags + * @param class_call + * @param handle + */ +extern int cls_register_cxx_method(cls_handle_t hclass, const char *method, int flags, + cls_method_cxx_call_t class_call, cls_method_handle_t *handle); + +/** + * Create an object. + * + * @param hctx + * @param exclusive + */ +extern int cls_cxx_create(cls_method_context_t hctx, bool exclusive); + +/** + * Remove an object. + * + * @param hctx + */ +extern int cls_cxx_remove(cls_method_context_t hctx); + +/** + * Check on the status of an object. + * + * @param hctx + * @param size + * @param mtime + */ +extern int cls_cxx_stat(cls_method_context_t hctx, uint64_t *size, time_t *mtime); + +/** + * Read contents of an object. + * + * @param hctx + * @param ofs + * @param len + * @param bl + */ +extern int cls_cxx_read(cls_method_context_t hctx, int ofs, int len, ceph::bufferlist *bl); + +/** + * Write to the object. + * + * @param hctx + * @param ofs + * @param len + * @param bl + */ +extern int cls_cxx_write(cls_method_context_t hctx, int ofs, int len, ceph::bufferlist *bl); + +/** + * Get xattr of the object. + * + * @param hctx + * @param name + * @param outbl + */ +extern int cls_cxx_getxattr(cls_method_context_t hctx, const char *name, + ceph::bufferlist *outbl); + +/** + * Set xattr of the object. 
+ * + * @param hctx + * @param name + * @param inbl + */ +extern int cls_cxx_setxattr(cls_method_context_t hctx, const char *name, + ceph::bufferlist *inbl); + +/** + * Get value corresponding to a key from the map. + * + * @param hctx + * @param key + * @param outbl + */ +extern int cls_cxx_map_get_val(cls_method_context_t hctx, + const std::string &key, ceph::bufferlist *outbl); + +/** + * Set value corresponding to a key in the map. + * + * @param hctx + * @param key + * @param inbl + */ +extern int cls_cxx_map_set_val(cls_method_context_t hctx, + const std::string &key, ceph::bufferlist *inbl); + +#endif + +#endif diff --git a/src/include/rados/page.h b/src/include/rados/page.h new file mode 120000 index 00000000..cf983e83 --- /dev/null +++ b/src/include/rados/page.h @@ -0,0 +1 @@ +../page.h
\ No newline at end of file diff --git a/src/include/rados/rados_types.h b/src/include/rados/rados_types.h new file mode 100644 index 00000000..0712f489 --- /dev/null +++ b/src/include/rados/rados_types.h @@ -0,0 +1,29 @@ +#ifndef CEPH_RADOS_TYPES_H +#define CEPH_RADOS_TYPES_H + +#include <stdint.h> + +/** + * @struct obj_watch_t + * One item from list_watchers + */ +struct obj_watch_t { + /// Address of the Watcher + char addr[256]; + /// Watcher ID + int64_t watcher_id; + /// Cookie + uint64_t cookie; + /// Timeout in Seconds + uint32_t timeout_seconds; +}; + +/** + * + * Pass as nspace argument to rados_ioctx_set_namespace() + * before calling rados_nobjects_list_open() to return + * all objects in all namespaces. + */ +#define LIBRADOS_ALL_NSPACES "\001" + +#endif diff --git a/src/include/rados/rados_types.hpp b/src/include/rados/rados_types.hpp new file mode 100644 index 00000000..8c02dd83 --- /dev/null +++ b/src/include/rados/rados_types.hpp @@ -0,0 +1,331 @@ +#ifndef CEPH_RADOS_TYPES_HPP +#define CEPH_RADOS_TYPES_HPP + +#include <map> +#include <utility> +#include <vector> +#include <stdint.h> +#include <string> + +#include "buffer.h" +#include "rados_types.h" + +namespace librados { + +typedef uint64_t snap_t; + +enum { + SNAP_HEAD = (uint64_t)(-2), + SNAP_DIR = (uint64_t)(-1) +}; + +struct clone_info_t { + snap_t cloneid; + std::vector<snap_t> snaps; // ascending + std::vector< std::pair<uint64_t,uint64_t> > overlap; // with next newest + uint64_t size; + clone_info_t() : cloneid(0), size(0) {} +}; + +struct snap_set_t { + std::vector<clone_info_t> clones; // ascending + snap_t seq; // newest snapid seen by the object + snap_set_t() : seq(0) {} +}; + +struct object_id_t { + std::string name; + std::string nspace; + std::string locator; + snap_t snap = 0; + object_id_t() = default; + object_id_t(const std::string& name, + const std::string& nspace, + const std::string& locator, + snap_t snap) + : name(name), + nspace(nspace), + locator(locator), + 
snap(snap) + {} +}; + +struct err_t { + enum : uint64_t { + SHARD_MISSING = 1 << 1, + SHARD_STAT_ERR = 1 << 2, + SHARD_READ_ERR = 1 << 3, + DATA_DIGEST_MISMATCH_OI = 1 << 9, // Old + DATA_DIGEST_MISMATCH_INFO = 1 << 9, + OMAP_DIGEST_MISMATCH_OI = 1 << 10, // Old + OMAP_DIGEST_MISMATCH_INFO = 1 << 10, + SIZE_MISMATCH_OI = 1 << 11, // Old + SIZE_MISMATCH_INFO = 1 << 11, + SHARD_EC_HASH_MISMATCH = 1 << 12, + SHARD_EC_SIZE_MISMATCH = 1 << 13, + OI_ATTR_MISSING = 1 << 14, // Old + INFO_MISSING = 1 << 14, + OI_ATTR_CORRUPTED = 1 << 15, // Old + INFO_CORRUPTED = 1 << 15, + SS_ATTR_MISSING = 1 << 16, // Old + SNAPSET_MISSING = 1 << 16, + SS_ATTR_CORRUPTED = 1 << 17, // Old + SNAPSET_CORRUPTED = 1 << 17, + OBJ_SIZE_OI_MISMATCH = 1 << 18, // Old + OBJ_SIZE_INFO_MISMATCH = 1 << 18, + HINFO_MISSING = 1 << 19, + HINFO_CORRUPTED = 1 << 20 + // When adding more here add to either SHALLOW_ERRORS or DEEP_ERRORS + }; + uint64_t errors = 0; + static constexpr uint64_t SHALLOW_ERRORS = SHARD_MISSING|SHARD_STAT_ERR|SIZE_MISMATCH_INFO|INFO_MISSING|INFO_CORRUPTED|SNAPSET_MISSING|SNAPSET_CORRUPTED|OBJ_SIZE_INFO_MISMATCH|HINFO_MISSING|HINFO_CORRUPTED; + static constexpr uint64_t DEEP_ERRORS = SHARD_READ_ERR|DATA_DIGEST_MISMATCH_INFO|OMAP_DIGEST_MISMATCH_INFO|SHARD_EC_HASH_MISMATCH|SHARD_EC_SIZE_MISMATCH; + bool has_shard_missing() const { + return errors & SHARD_MISSING; + } + bool has_stat_error() const { + return errors & SHARD_STAT_ERR; + } + bool has_read_error() const { + return errors & SHARD_READ_ERR; + } + bool has_data_digest_mismatch_oi() const { // Compatibility + return errors & DATA_DIGEST_MISMATCH_OI; + } + bool has_data_digest_mismatch_info() const { + return errors & DATA_DIGEST_MISMATCH_INFO; + } + bool has_omap_digest_mismatch_oi() const { // Compatibility + return errors & OMAP_DIGEST_MISMATCH_OI; + } + bool has_omap_digest_mismatch_info() const { + return errors & OMAP_DIGEST_MISMATCH_INFO; + } + bool has_size_mismatch_oi() const { // Compatibility + return errors & 
SIZE_MISMATCH_OI; + } + bool has_size_mismatch_info() const { + return errors & SIZE_MISMATCH_INFO; + } + bool has_ec_hash_error() const { + return errors & SHARD_EC_HASH_MISMATCH; + } + bool has_ec_size_error() const { + return errors & SHARD_EC_SIZE_MISMATCH; + } + bool has_oi_attr_missing() const { // Compatibility + return errors & OI_ATTR_MISSING; + } + bool has_info_missing() const { + return errors & INFO_MISSING; + } + bool has_oi_attr_corrupted() const { // Compatibility + return errors & OI_ATTR_CORRUPTED; + } + bool has_info_corrupted() const { + return errors & INFO_CORRUPTED; + } + bool has_ss_attr_missing() const { // Compatibility + return errors & SS_ATTR_MISSING; + } + bool has_snapset_missing() const { + return errors & SNAPSET_MISSING; + } + bool has_ss_attr_corrupted() const { // Compatibility + return errors & SS_ATTR_CORRUPTED; + } + bool has_snapset_corrupted() const { + return errors & SNAPSET_CORRUPTED; + } + bool has_shallow_errors() const { + return errors & SHALLOW_ERRORS; + } + bool has_deep_errors() const { + return errors & DEEP_ERRORS; + } + bool has_obj_size_oi_mismatch() const { // Compatibility + return errors & OBJ_SIZE_OI_MISMATCH; + } + bool has_obj_size_info_mismatch() const { + return errors & OBJ_SIZE_INFO_MISMATCH; + } + bool has_hinfo_missing() const { + return errors & HINFO_MISSING; + } + bool has_hinfo_corrupted() const { + return errors & HINFO_CORRUPTED; + } +}; + +struct shard_info_t : err_t { + std::map<std::string, ceph::bufferlist> attrs; + uint64_t size = -1; + bool omap_digest_present = false; + uint32_t omap_digest = 0; + bool data_digest_present = false; + uint32_t data_digest = 0; + bool selected_oi = false; + bool primary = false; +}; + +struct osd_shard_t { + int32_t osd; + int8_t shard; +}; + +inline bool operator<(const osd_shard_t &lhs, const osd_shard_t &rhs) { + if (lhs.osd < rhs.osd) + return true; + else if (lhs.osd > rhs.osd) + return false; + else + return lhs.shard < rhs.shard; +} + +struct 
obj_err_t { + enum : uint64_t { + OBJECT_INFO_INCONSISTENCY = 1 << 1, + // XXX: Can an older rados binary work if these bits stay the same? + DATA_DIGEST_MISMATCH = 1 << 4, + OMAP_DIGEST_MISMATCH = 1 << 5, + SIZE_MISMATCH = 1 << 6, + ATTR_VALUE_MISMATCH = 1 << 7, + ATTR_NAME_MISMATCH = 1 << 8, + SNAPSET_INCONSISTENCY = 1 << 9, + HINFO_INCONSISTENCY = 1 << 10, + SIZE_TOO_LARGE = 1 << 11, + // When adding more here add to either SHALLOW_ERRORS or DEEP_ERRORS + }; + uint64_t errors = 0; + static constexpr uint64_t SHALLOW_ERRORS = OBJECT_INFO_INCONSISTENCY|SIZE_MISMATCH|ATTR_VALUE_MISMATCH + |ATTR_NAME_MISMATCH|SNAPSET_INCONSISTENCY|HINFO_INCONSISTENCY|SIZE_TOO_LARGE; + static constexpr uint64_t DEEP_ERRORS = DATA_DIGEST_MISMATCH|OMAP_DIGEST_MISMATCH; + bool has_object_info_inconsistency() const { + return errors & OBJECT_INFO_INCONSISTENCY; + } + bool has_data_digest_mismatch() const { + return errors & DATA_DIGEST_MISMATCH; + } + bool has_omap_digest_mismatch() const { + return errors & OMAP_DIGEST_MISMATCH; + } + bool has_size_mismatch() const { + return errors & SIZE_MISMATCH; + } + bool has_attr_value_mismatch() const { + return errors & ATTR_VALUE_MISMATCH; + } + bool has_attr_name_mismatch() const { + return errors & ATTR_NAME_MISMATCH; + } + bool has_shallow_errors() const { + return errors & SHALLOW_ERRORS; + } + bool has_deep_errors() const { + return errors & DEEP_ERRORS; + } + bool has_snapset_inconsistency() const { + return errors & SNAPSET_INCONSISTENCY; + } + bool has_hinfo_inconsistency() const { + return errors & HINFO_INCONSISTENCY; + } + bool has_size_too_large() const { + return errors & SIZE_TOO_LARGE; + } +}; + +struct inconsistent_obj_t : obj_err_t { + inconsistent_obj_t() = default; + inconsistent_obj_t(const object_id_t& object) + : object{object}, version(0) + {} + object_id_t object; + uint64_t version; // XXX: Redundant with object info attr + std::map<osd_shard_t, shard_info_t> shards; + err_t union_shards; +}; + +struct 
inconsistent_snapset_t { + inconsistent_snapset_t() = default; + inconsistent_snapset_t(const object_id_t& head) + : object{head} + {} + enum { + SNAPSET_MISSING = 1 << 0, + SNAPSET_CORRUPTED = 1 << 1, + CLONE_MISSING = 1 << 2, + SNAP_ERROR = 1 << 3, + HEAD_MISMATCH = 1 << 4, // Unused + HEADLESS_CLONE = 1 << 5, + SIZE_MISMATCH = 1 << 6, + OI_MISSING = 1 << 7, // Old + INFO_MISSING = 1 << 7, + OI_CORRUPTED = 1 << 8, // Old + INFO_CORRUPTED = 1 << 8, + EXTRA_CLONES = 1 << 9, + }; + uint64_t errors = 0; + object_id_t object; + // Extra clones + std::vector<snap_t> clones; + std::vector<snap_t> missing; + ceph::bufferlist ss_bl; + + bool ss_attr_missing() const { // Compatibility + return errors & SNAPSET_MISSING; + } + bool snapset_missing() const { + return errors & SNAPSET_MISSING; + } + bool ss_attr_corrupted() const { // Compatibility + return errors & SNAPSET_CORRUPTED; + } + bool snapset_corrupted() const { + return errors & SNAPSET_CORRUPTED; + } + bool clone_missing() const { + return errors & CLONE_MISSING; + } + bool snapset_mismatch() const { // Compatibility + return errors & SNAP_ERROR; + } + bool snapset_error() const { + return errors & SNAP_ERROR; + } + bool head_mismatch() const { // Compatibility + return false; + } + bool headless() const { + return errors & HEADLESS_CLONE; + } + bool size_mismatch() const { + return errors & SIZE_MISMATCH; + } + bool oi_attr_missing() const { // Compatibility + return errors & OI_MISSING; + } + bool info_missing() const { + return errors & INFO_MISSING; + } + bool oi_attr_corrupted() const { // Compatibility + return errors & OI_CORRUPTED; + } + bool info_corrupted() const { + return errors & INFO_CORRUPTED; + } + bool extra_clones() const { + return errors & EXTRA_CLONES; + } +}; + +/** + * @var all_nspaces + * Pass as nspace argument to IoCtx::set_namespace() + * before calling nobjects_begin() to iterate + * through all objects in all namespaces. 
+ */ +const std::string all_nspaces(LIBRADOS_ALL_NSPACES); + +} +#endif diff --git a/src/include/rados/rgw_file.h b/src/include/rados/rgw_file.h new file mode 100644 index 00000000..66cf627a --- /dev/null +++ b/src/include/rados/rgw_file.h @@ -0,0 +1,384 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * convert RGW commands to file commands + * + * Copyright (C) 2015 Red Hat, Inc. + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. + * + */ +#ifndef RADOS_RGW_FILE_H +#define RADOS_RGW_FILE_H + +#include <sys/stat.h> +#include <sys/types.h> +#include <stdint.h> +#include <stdbool.h> + +#include "librgw.h" + +#ifdef __cplusplus +extern "C" { +#endif + +#define LIBRGW_FILE_VER_MAJOR 1 +#define LIBRGW_FILE_VER_MINOR 1 +#define LIBRGW_FILE_VER_EXTRA 7 + +#define LIBRGW_FILE_VERSION(maj, min, extra) ((maj << 16) + (min << 8) + extra) +#define LIBRGW_FILE_VERSION_CODE LIBRGW_FILE_VERSION(LIBRGW_FILE_VER_MAJOR, LIBRGW_FILE_VER_MINOR, LIBRGW_FILE_VER_EXTRA) + +/* + * object types + */ +enum rgw_fh_type { + RGW_FS_TYPE_NIL = 0, + RGW_FS_TYPE_FILE, + RGW_FS_TYPE_DIRECTORY, + RGW_FS_TYPE_SYMBOLIC_LINK, +}; + +/* + * dynamic allocated handle to support nfs handle + */ + +/* content-addressable hash */ +struct rgw_fh_hk { + uint64_t bucket; + uint64_t object; +}; + +struct rgw_file_handle +{ + /* content-addressable hash */ + struct rgw_fh_hk fh_hk; + void *fh_private; /* librgw private data */ + /* object type */ + enum rgw_fh_type fh_type; +}; + +struct rgw_fs +{ + librgw_t rgw; + void *fs_private; + struct rgw_file_handle* root_fh; +}; + + +/* XXX mount info hypothetical--emulate Unix, support at least + * UUID-length fsid */ +struct rgw_statvfs { + uint64_t f_bsize; /* file system block size */ + uint64_t f_frsize; /* fragment size */ + uint64_t 
f_blocks; /* size of fs in f_frsize units */ + uint64_t f_bfree; /* # free blocks */ + uint64_t f_bavail; /* # free blocks for unprivileged users */ + uint64_t f_files; /* # inodes */ + uint64_t f_ffree; /* # free inodes */ + uint64_t f_favail; /* # free inodes for unprivileged users */ + uint64_t f_fsid[2]; /* file system ID */ + uint64_t f_flag; /* mount flags */ + uint64_t f_namemax; /* maximum filename length */ +}; + + +void rgwfile_version(int *major, int *minor, int *extra); + +/* + lookup object by name (POSIX style) +*/ +#define RGW_LOOKUP_FLAG_NONE 0x0000 +#define RGW_LOOKUP_FLAG_CREATE 0x0001 +#define RGW_LOOKUP_FLAG_RCB 0x0002 /* readdir callback hint */ +#define RGW_LOOKUP_FLAG_DIR 0x0004 +#define RGW_LOOKUP_FLAG_FILE 0x0008 + +#define RGW_LOOKUP_TYPE_FLAGS \ + (RGW_LOOKUP_FLAG_DIR|RGW_LOOKUP_FLAG_FILE) + +int rgw_lookup(struct rgw_fs *rgw_fs, + struct rgw_file_handle *parent_fh, const char *path, + struct rgw_file_handle **fh, + struct stat *st, uint32_t mask, uint32_t flags); + +/* + lookup object by handle (NFS style) +*/ +int rgw_lookup_handle(struct rgw_fs *rgw_fs, struct rgw_fh_hk *fh_hk, + struct rgw_file_handle **fh, uint32_t flags); + +/* + * release file handle + */ +#define RGW_FH_RELE_FLAG_NONE 0x0000 + +int rgw_fh_rele(struct rgw_fs *rgw_fs, struct rgw_file_handle *fh, + uint32_t flags); + +/* + attach rgw namespace +*/ +#define RGW_MOUNT_FLAG_NONE 0x0000 + +int rgw_mount(librgw_t rgw, const char *uid, const char *key, + const char *secret, struct rgw_fs **rgw_fs, + uint32_t flags); + +int rgw_mount2(librgw_t rgw, const char *uid, const char *key, + const char *secret, const char *root, struct rgw_fs **rgw_fs, + uint32_t flags); + +/* + register invalidate callbacks +*/ +#define RGW_REG_INVALIDATE_FLAG_NONE 0x0000 + +typedef void (*rgw_fh_callback_t)(void *handle, struct rgw_fh_hk fh_hk); + +int rgw_register_invalidate(struct rgw_fs *rgw_fs, rgw_fh_callback_t cb, + void *arg, uint32_t flags); + +/* + detach rgw namespace +*/ +#define 
RGW_UMOUNT_FLAG_NONE 0x0000 + +int rgw_umount(struct rgw_fs *rgw_fs, uint32_t flags); + + +/* + get filesystem attributes +*/ +#define RGW_STATFS_FLAG_NONE 0x0000 + +int rgw_statfs(struct rgw_fs *rgw_fs, + struct rgw_file_handle *parent_fh, + struct rgw_statvfs *vfs_st, + uint32_t flags); + + +/* XXX (get|set)attr mask bits */ +#define RGW_SETATTR_MODE 1 +#define RGW_SETATTR_UID 2 +#define RGW_SETATTR_GID 4 +#define RGW_SETATTR_MTIME 8 +#define RGW_SETATTR_ATIME 16 +#define RGW_SETATTR_SIZE 32 +#define RGW_SETATTR_CTIME 64 + +/* + create file +*/ +#define RGW_CREATE_FLAG_NONE 0x0000 + +int rgw_create(struct rgw_fs *rgw_fs, struct rgw_file_handle *parent_fh, + const char *name, struct stat *st, uint32_t mask, + struct rgw_file_handle **fh, uint32_t posix_flags, + uint32_t flags); + +/* + create a symbolic link + */ +#define RGW_CREATELINK_FLAG_NONE 0x0000 +int rgw_symlink(struct rgw_fs *rgw_fs, struct rgw_file_handle *parent_fh, + const char *name, const char *link_path, struct stat *st, + uint32_t mask, struct rgw_file_handle **fh, uint32_t posix_flags, + uint32_t flags); + +/* + create a new directory +*/ +#define RGW_MKDIR_FLAG_NONE 0x0000 + +int rgw_mkdir(struct rgw_fs *rgw_fs, + struct rgw_file_handle *parent_fh, + const char *name, struct stat *st, uint32_t mask, + struct rgw_file_handle **fh, uint32_t flags); + +/* + rename object +*/ +#define RGW_RENAME_FLAG_NONE 0x0000 + +int rgw_rename(struct rgw_fs *rgw_fs, + struct rgw_file_handle *olddir, const char* old_name, + struct rgw_file_handle *newdir, const char* new_name, + uint32_t flags); + +/* + remove file or directory +*/ +#define RGW_UNLINK_FLAG_NONE 0x0000 + +int rgw_unlink(struct rgw_fs *rgw_fs, + struct rgw_file_handle *parent_fh, const char* path, + uint32_t flags); + +/* + read directory content +*/ +typedef bool (*rgw_readdir_cb)(const char *name, void *arg, uint64_t offset, + struct stat *st, uint32_t mask, + uint32_t flags); + +#define RGW_READDIR_FLAG_NONE 0x0000 +#define RGW_READDIR_FLAG_DOTDOT 
0x0001 /* send dot names */ + +int rgw_readdir(struct rgw_fs *rgw_fs, + struct rgw_file_handle *parent_fh, uint64_t *offset, + rgw_readdir_cb rcb, void *cb_arg, bool *eof, + uint32_t flags); + +/* enumeration continuing from name */ +int rgw_readdir2(struct rgw_fs *rgw_fs, + struct rgw_file_handle *parent_fh, const char *name, + rgw_readdir_cb rcb, void *cb_arg, bool *eof, + uint32_t flags); + +/* project offset of dirent name */ +#define RGW_DIRENT_OFFSET_FLAG_NONE 0x0000 + +int rgw_dirent_offset(struct rgw_fs *rgw_fs, + struct rgw_file_handle *parent_fh, + const char *name, int64_t *offset, + uint32_t flags); + +/* + get unix attributes for object +*/ +#define RGW_GETATTR_FLAG_NONE 0x0000 + +int rgw_getattr(struct rgw_fs *rgw_fs, + struct rgw_file_handle *fh, struct stat *st, + uint32_t flags); + +/* + set unix attributes for object +*/ +#define RGW_SETATTR_FLAG_NONE 0x0000 + +int rgw_setattr(struct rgw_fs *rgw_fs, + struct rgw_file_handle *fh, struct stat *st, + uint32_t mask, uint32_t flags); + +/* + truncate file +*/ +#define RGW_TRUNCATE_FLAG_NONE 0x0000 + +int rgw_truncate(struct rgw_fs *rgw_fs, + struct rgw_file_handle *fh, uint64_t size, + uint32_t flags); + +/* + open file +*/ +#define RGW_OPEN_FLAG_NONE 0x0000 +#define RGW_OPEN_FLAG_CREATE 0x0001 +#define RGW_OPEN_FLAG_V3 0x0002 /* ops have v3 semantics */ +#define RGW_OPEN_FLAG_STATELESS 0x0002 /* alias it */ + +int rgw_open(struct rgw_fs *rgw_fs, struct rgw_file_handle *parent_fh, + uint32_t posix_flags, uint32_t flags); + +/* + close file +*/ + +#define RGW_CLOSE_FLAG_NONE 0x0000 +#define RGW_CLOSE_FLAG_RELE 0x0001 + +int rgw_close(struct rgw_fs *rgw_fs, struct rgw_file_handle *fh, + uint32_t flags); + +/* + read data from file +*/ +#define RGW_READ_FLAG_NONE 0x0000 + +int rgw_read(struct rgw_fs *rgw_fs, + struct rgw_file_handle *fh, uint64_t offset, + size_t length, size_t *bytes_read, void *buffer, + uint32_t flags); + +/* + read symbolic link +*/ +#define RGW_READLINK_FLAG_NONE 0x0000 + +int 
rgw_readlink(struct rgw_fs *rgw_fs, + struct rgw_file_handle *fh, uint64_t offset, + size_t length, size_t *bytes_read, void *buffer, + uint32_t flags); + +/* + write data to file +*/ +#define RGW_WRITE_FLAG_NONE 0x0000 + +int rgw_write(struct rgw_fs *rgw_fs, + struct rgw_file_handle *fh, uint64_t offset, + size_t length, size_t *bytes_written, void *buffer, + uint32_t flags); + +#define RGW_UIO_NONE 0x0000 +#define RGW_UIO_GIFT 0x0001 +#define RGW_UIO_FREE 0x0002 +#define RGW_UIO_BUFQ 0x0004 + +struct rgw_uio; +typedef void (*rgw_uio_release)(struct rgw_uio *, uint32_t); + +/* buffer vector descriptors */ +struct rgw_vio { + void *vio_p1; + void *vio_u1; + void *vio_base; + int32_t vio_len; +}; + +struct rgw_uio { + rgw_uio_release uio_rele; + void *uio_p1; + void *uio_u1; + uint64_t uio_offset; + uint64_t uio_resid; + uint32_t uio_cnt; + uint32_t uio_flags; + struct rgw_vio *uio_vio; /* appended vectors */ +}; + +typedef struct rgw_uio rgw_uio; + +int rgw_readv(struct rgw_fs *rgw_fs, + struct rgw_file_handle *fh, rgw_uio *uio, uint32_t flags); + +int rgw_writev(struct rgw_fs *rgw_fs, + struct rgw_file_handle *fh, rgw_uio *uio, uint32_t flags); + +/* + sync written data +*/ +#define RGW_FSYNC_FLAG_NONE 0x0000 + +int rgw_fsync(struct rgw_fs *rgw_fs, struct rgw_file_handle *fh, + uint32_t flags); + +/* + NFS commit operation +*/ + +#define RGW_COMMIT_FLAG_NONE 0x0000 + +int rgw_commit(struct rgw_fs *rgw_fs, struct rgw_file_handle *fh, + uint64_t offset, uint64_t length, uint32_t flags); + +#ifdef __cplusplus +} +#endif + +#endif /* RADOS_RGW_FILE_H */ diff --git a/src/include/radosstriper/libradosstriper.h b/src/include/radosstriper/libradosstriper.h new file mode 100644 index 00000000..7eb33596 --- /dev/null +++ b/src/include/radosstriper/libradosstriper.h @@ -0,0 +1,610 @@ +#ifndef CEPH_LIBRADOSSTRIPER_H +#define CEPH_LIBRADOSSTRIPER_H + +#ifdef __cplusplus +extern "C" { +#endif + +#include <string.h> + +#include "../rados/librados.h" + +#define 
LIBRADOSSTRIPER_VER_MAJOR 0 +#define LIBRADOSSTRIPER_VER_MINOR 0 +#define LIBRADOSSTRIPER_VER_EXTRA 0 + +#define LIBRADOSSTRIPER_VERSION(maj, min, extra) ((maj << 16) + (min << 8) + extra) + +#define LIBRADOSSTRIPER_VERSION_CODE LIBRADOSSTRIPER_VERSION(LIBRADOSSTRIPER_VER_MAJOR, LIBRADOSSTRIPER_VER_MINOR, LIBRADOSSTRIPER_VER_EXTRA) + +/** + * @typedef rados_striper_t + * + * A handle for interacting with striped objects in a RADOS cluster. + */ +typedef void *rados_striper_t; + +/** + * @defgroup libradosstriper_h_init Setup and Teardown + * These are the first and last functions that should be called + * when using libradosstriper. + * + * @{ + */ + +/** + * Creates a rados striper using the given io context + * The striper initially has the default object layout. + * See rados_striper_set_object_layout_*() to change this + * + * @param ioctx the rados context to use + * @param striper where to store the rados striper + * @returns 0 on success, negative error code on failure + */ + int rados_striper_create(rados_ioctx_t ioctx, + rados_striper_t *striper); + +/** + * Destroys a rados striper + * + * @param striper the striper to destroy + */ +void rados_striper_destroy(rados_striper_t striper); + +/** + * Sets the object layout's stripe unit of a rados striper for future objects. + * This layout will be used when new objects are created (by writing to them) + * Already existing objects will be opened with their own layout. + * + * @param striper the targeted striper + * @param stripe_unit the stripe_unit value of the new object layout + * @returns 0 on success, negative error code on failure + */ +int rados_striper_set_object_layout_stripe_unit(rados_striper_t striper, + unsigned int stripe_unit); + +/** + * Sets the object layout's stripe count of a rados striper for future objects. + * This layout will be used when new objects are created (by writing to them) + * Already existing objects will be opened with their own layout.
+ * + * @param striper the targeted striper + * @param stripe_count the stripe_count value of the new object layout + * @returns 0 on success, negative error code on failure + */ +int rados_striper_set_object_layout_stripe_count(rados_striper_t striper, + unsigned int stripe_count); + +/** + * Sets the object layout's object_size of a rados striper for future objects. + * This layout will be used when new objects are created (by writing to them) + * Already existing objects will be opened with their own layout. + * + * @param striper the targeted striper + * @param object_size the object_size value of the new object layout + * @returns 0 on success, negative error code on failure + */ +int rados_striper_set_object_layout_object_size(rados_striper_t striper, + unsigned int object_size); + +/** @} init */ + +/** + * @defgroup libradosstriper_h_synch_io Synchronous I/O + * Writes are striped to several rados objects which are then + * replicated to a number of OSDs based on the configuration + * of the pool they are in. These write functions block + * until data is in memory on all replicas of the object they're + * writing to - they are equivalent to doing the corresponding + * asynchronous write, and then calling + * rados_striper_ioctx_wait_for_complete(). + * + * @{ + */ + +/** + * Synchronously write data to a striped object at the specified offset + * + * @param striper the striper in which the write will occur + * @param soid the name of the striped object + * @param buf data to write + * @param len length of the data, in bytes + * @param off byte offset in the object to begin writing at + * @returns 0 on success, negative error code on + * failure + */ +int rados_striper_write(rados_striper_t striper, + const char *soid, + const char *buf, + size_t len, + uint64_t off); + +/** + * Synchronously write an entire striped object + * + * The striped object is filled with the provided data. If the striped object exists, + * it is truncated and then written.
+ * + * @param striper the striper in which the write will occur + * @param soid the name of the striped object + * @param buf data to write + * @param len length of the data, in bytes + * @returns 0 on success, negative error code on failure + */ +int rados_striper_write_full(rados_striper_t striper, + const char *soid, + const char *buf, + size_t len); + +/** + * Append data to an object + * + * @param striper the striper in which the write will occur + * @param soid the name of the striped object + * @param buf the data to append + * @param len length of buf (in bytes) + * @returns 0 on success, negative error code on + * failure + */ +int rados_striper_append(rados_striper_t striper, + const char *soid, + const char *buf, + size_t len); + +/** + * Synchronously read data from a striped object at the specified offset + * + * @param striper the striper in which the read will occur + * @param soid the name of the striped object + * @param buf where to store the results + * @param len the number of bytes to read + * @param off the offset to start reading from in the object + * @returns number of bytes read on success, negative error code on + * failure + */ +int rados_striper_read(rados_striper_t striper, + const char *soid, + char *buf, + size_t len, + uint64_t off); + +/** + * Synchronously removes a striped object + * + * @note There is no atomicity of the deletion and the striped + * object may be left incomplete if an error is returned (metadata + * all present, but some stripes missing) + * However, there is atomicity of the metadata deletion and + * the deletion can not happen if any I/O is ongoing (it + * will return EBUSY).
Identically, no I/O will be able to start + * during deletion (same EBUSY return code) + * @param striper the striper in which the remove will occur + * @param soid the name of the striped object + * @returns 0 on success, negative error code on failure + */ +int rados_striper_remove(rados_striper_t striper, + const char* soid); + +/** + * Resize an object + * + * If this enlarges the object, the new area is logically filled with + * zeroes. If this shrinks the object, the excess data is removed. + * + * @note the truncation is not fully atomic. The metadata part is, + * so the behavior will be atomic from user point of view when + * the object size is reduced. However, in case of failure, old data + * may stay around, hidden. It may reappear if the object size is + * later grown, instead of the expected 0s. When growing the + * object and in case of failure, the new 0 data may not be + * fully created. This can lead to ENOENT errors when + * writing/reading the missing parts. + * @note the truncation can not happen if any I/O is ongoing (it + * will return EBUSY). Identically, no I/O will be able to start + * during truncation (same EBUSY return code) + * @param striper the striper in which the truncation will occur + * @param soid the name of the striped object + * @param size the new size of the object in bytes + * @returns 0 on success, negative error code on failure + */ +int rados_striper_trunc(rados_striper_t striper, const char *soid, uint64_t size); + +/** @} Synchronous I/O */ + +/** + * @defgroup libradosstriper_h_xattrs Xattrs + * Extended attributes are stored as extended attributes on the + * first rados regular object of the striped object. + * Thus, they have the same limitations as the underlying + * rados extended attributes. + * + * @{ + */ + +/** + * Get the value of an extended attribute on a striped object.
+ * + * @param striper the striper in which the getxattr will occur + * @param oid name of the striped object + * @param name which extended attribute to read + * @param buf where to store the result + * @param len size of buf in bytes + * @returns length of xattr value on success, negative error code on failure + */ +int rados_striper_getxattr(rados_striper_t striper, + const char *oid, + const char *name, + char *buf, + size_t len); + +/** + * Set an extended attribute on a striped object. + * + * @param striper the striper in which the setxattr will occur + * @param oid name of the object + * @param name which extended attribute to set + * @param buf what to store in the xattr + * @param len the number of bytes in buf + * @returns 0 on success, negative error code on failure + */ +int rados_striper_setxattr(rados_striper_t striper, + const char *oid, + const char *name, + const char *buf, + size_t len); + +/** + * Delete an extended attribute from a striped object. + * + * @param striper the striper in which the rmxattr will occur + * @param oid name of the object + * @param name which xattr to delete + * @returns 0 on success, negative error code on failure + */ +int rados_striper_rmxattr(rados_striper_t striper, + const char *oid, + const char *name); + +/** + * Start iterating over xattrs on a striped object. + * + * @post iter is a valid iterator + * + * @param striper the striper in which the getxattrs will occur + * @param oid name of the object + * @param iter where to store the iterator + * @returns 0 on success, negative error code on failure + */ +int rados_striper_getxattrs(rados_striper_t striper, + const char *oid, + rados_xattrs_iter_t *iter); + +/** + * Get the next xattr on the striped object + * + * @pre iter is a valid iterator + * + * @post name is the NULL-terminated name of the next xattr, and val + * contains the value of the xattr, which is of length len. If the end + * of the list has been reached, name and val are NULL, and len is 0. 
+ * + * @param iter iterator to advance + * @param name where to store the name of the next xattr + * @param val where to store the value of the next xattr + * @param len the number of bytes in val + * @returns 0 on success, negative error code on failure + */ +int rados_striper_getxattrs_next(rados_xattrs_iter_t iter, + const char **name, + const char **val, + size_t *len); + +/** + * Close the xattr iterator. + * + * iter should not be used after this is called. + * + * @param iter the iterator to close + */ +void rados_striper_getxattrs_end(rados_xattrs_iter_t iter); + +/** @} Xattrs */ + +/** + * Synchronously get object stats (size/mtime) + * + * @param striper the striper in which the stat will occur + * @param soid the id of the striped object + * @param psize where to store object size + * @param pmtime where to store modification time + * @returns 0 on success, negative error code on failure + */ +int rados_striper_stat(rados_striper_t striper, + const char* soid, + uint64_t *psize, + time_t *pmtime); + +/** + * @defgroup libradosstriper_h_asynch_io Asynchronous I/O + * Read and write to objects without blocking. + * + * @{ + */ + +/** + * @typedef rados_striper_multi_completion_t + * Represents the state of a set of asynchronous operations + * it contains the aggregated return value once the operations complete + * and can be used to block until all operations are complete and/or safe. + */ +typedef void *rados_striper_multi_completion_t; + +/** + * Constructs a multi completion to use with asynchronous operations + * + * The complete and safe callbacks correspond to operations being + * acked and committed, respectively. The callbacks are called in + * order of receipt, so the safe callback may be triggered before the + * complete callback, and vice versa. This is affected by journalling + * on the OSDs. + * + * @note Read operations only get a complete callback. 
+ * @note BUG: this should check for ENOMEM instead of throwing an exception + * + * @param cb_arg application-defined data passed to the callback functions + * @param cb_complete the function to be called when the operation is + * in memory on all relpicas + * @param cb_safe the function to be called when the operation is on + * stable storage on all replicas + * @param pc where to store the completion + * @returns 0 + */ +int rados_striper_multi_aio_create_completion(void *cb_arg, + rados_callback_t cb_complete, + rados_callback_t cb_safe, + rados_striper_multi_completion_t *pc); + +/** + * Block until all operation complete + * + * This means data is in memory on all replicas. + * + * @param c operations to wait for + * @returns 0 + */ +void rados_striper_multi_aio_wait_for_complete(rados_striper_multi_completion_t c); + +/** + * Block until all operation are safe + * + * This means data is on stable storage on all replicas. + * + * @param c operations to wait for + * @returns 0 + */ +void rados_striper_multi_aio_wait_for_safe(rados_striper_multi_completion_t c); + +/** + * Has a multi asynchronous operation completed? + * + * @warning This does not imply that the complete callback has + * finished + * + * @param c async operations to inspect + * @returns whether c is complete + */ +int rados_striper_multi_aio_is_complete(rados_striper_multi_completion_t c); + +/** + * Is a multi asynchronous operation safe? + * + * @warning This does not imply that the safe callback has + * finished + * + * @param c async operations to inspect + * @returns whether c is safe + */ +int rados_striper_multi_aio_is_safe(rados_striper_multi_completion_t c); + +/** + * Block until all operations complete and callback completes + * + * This means data is in memory on all replicas and can be read. 
+ * + * @param c operations to wait for + * @returns 0 + */ +void rados_striper_multi_aio_wait_for_complete_and_cb(rados_striper_multi_completion_t c); + +/** + * Block until all operations are safe and callback has completed + * + * This means data is on stable storage on all replicas. + * + * @param c operations to wait for + * @returns 0 + */ +void rados_striper_multi_aio_wait_for_safe_and_cb(rados_striper_multi_completion_t c); + +/** + * Has a multi asynchronous operation and callback completed + * + * @param c async operations to inspect + * @returns whether c is complete + */ +int rados_striper_multi_aio_is_complete_and_cb(rados_striper_multi_completion_t c); + +/** + * Is a multi asynchronous operation safe and has the callback completed + * + * @param c async operations to inspect + * @returns whether c is safe + */ +int rados_striper_multi_aio_is_safe_and_cb(rados_striper_multi_completion_t c); + +/** + * Get the return value of a multi asychronous operation + * + * The return value is set when all operations are complete or safe, + * whichever comes first. + * + * @pre The operation is safe or complete + * + * @note BUG: complete callback may never be called when the safe + * message is received before the complete message + * + * @param c async operations to inspect + * @returns aggregated return value of the operations + */ +int rados_striper_multi_aio_get_return_value(rados_striper_multi_completion_t c); + +/** + * Release a multi asynchrnous IO completion + * + * Call this when you no longer need the completion. It may not be + * freed immediately if the operation is not acked and committed. + * + * @param c multi completion to release + */ +void rados_striper_multi_aio_release(rados_striper_multi_completion_t c); + +/** + * Asynchronously write data to a striped object at the specified offset + * + * The return value of the completion will be 0 on success, negative + * error code on failure. 
+ * + * @param striper the striper in which the write will occur + * @param soid the name of the striped object + * @param completion what to do when the write is safe and complete + * @param buf data to write + * @param len length of the data, in bytes + * @param off byte offset in the object to begin writing at + * @returns 0 on success, negative error code on + * failure + */ +int rados_striper_aio_write(rados_striper_t striper, + const char *soid, + rados_completion_t completion, + const char *buf, + size_t len, + uint64_t off); + +/** + * Asynchronously appends data to a striped object + * + * The return value of the completion will be 0 on success, negative + * error code on failure. + * + * @param striper the striper in which the write will occur + * @param soid the name of the striped object + * @param completion what to do when the write is safe and complete + * @param buf data to write + * @param len length of the data, in bytes + * @returns 0 on success, negative error code on + * failure + */ +int rados_striper_aio_append(rados_striper_t striper, + const char *soid, + rados_completion_t completion, + const char *buf, + size_t len); + +/** + * Asynchronously fills and object with the provided data. + * If the object exists, it is truncated and then written. + * + * The return value of the completion will be 0 on success, negative + * error code on failure. 
+ * + * @param striper the striper in which the write will occur + * @param soid the name of the striped object + * @param completion what to do when the write is safe and complete + * @param buf data to write + * @param len length of the data, in bytes + * @returns 0 on success, negative error code on + * failure + */ +int rados_striper_aio_write_full(rados_striper_t striper, + const char *soid, + rados_completion_t completion, + const char *buf, + size_t len); + +/** + * Asynchronously read data from a striped object at the specified offset + * + * The return value of the completion will be number of bytes read on + * success, negative error code on failure. + * + * @param striper the striper in which the read will occur + * @param soid the name of the striped object + * @param completion what to do when the read is safe and complete + * @param buf where to store the results + * @param len the number of bytes to read + * @param off the offset to start reading from in the object + * @returns 0 on success, negative error code on + * failure + */ +int rados_striper_aio_read(rados_striper_t striper, + const char *soid, + rados_completion_t completion, + char *buf, + const size_t len, + uint64_t off); + +/** + * Asynchronously removes a striped object + * + * @note There is no atomicity of the deletion and the striped + * object may be left incomplete if an error is returned (metadata + * all present, but some stripes missing) + * However, there is a atomicity of the metadata deletion and + * the deletion can not happen if any I/O is ongoing (it + * will return EBUSY). 
Identically, no I/O will be able to start + * during deletion (same EBUSY return code) + * @param striper the striper in which the remove will occur + * @param soid the name of the striped object + * @param completion what to do when the remove is safe and complete + * @returns 0 on success, negative error code on failure + */ + +int rados_striper_aio_remove(rados_striper_t striper, + const char* soid, + rados_completion_t completion); + +/** + * Block until all pending writes in a striper are safe + * + * This is not equivalent to calling rados_striper_multi_aio_wait_for_safe() on all + * write completions, since this waits for the associated callbacks to + * complete as well. + * + * @param striper the striper in which the flush will occur + * @returns 0 on success, negative error code on failure +*/ +void rados_striper_aio_flush(rados_striper_t striper); + +/** + * Asynchronously get object stats (size/mtime) + * + * @param striper the striper in which the stat will occur + * @param soid the id of the striped object + * @param psize where to store object size + * @param pmtime where to store modification time + * @param completion what to do when the stats is complete + * @returns 0 on success, negative error code on failure + */ +int rados_striper_aio_stat(rados_striper_t striper, + const char* soid, + rados_completion_t completion, + uint64_t *psize, + time_t *pmtime); + +/** @} Asynchronous I/O */ + +#ifdef __cplusplus +} +#endif + +#endif diff --git a/src/include/radosstriper/libradosstriper.hpp b/src/include/radosstriper/libradosstriper.hpp new file mode 100644 index 00000000..674a56b7 --- /dev/null +++ b/src/include/radosstriper/libradosstriper.hpp @@ -0,0 +1,241 @@ +#ifndef __LIBRADOSSTRIPER_HPP +#define __LIBRADOSSTRIPER_HPP + +#include <string.h> +#include <string> +#include <map> +#include "../rados/buffer.h" +#include "../rados/librados.hpp" + +#include "libradosstriper.h" + +namespace libradosstriper +{ + struct RadosStriperImpl; + struct 
MultiAioCompletionImpl; + + /* + * Completion object for multiple asynchronous IO + * It allows to internally handle several "requests" + */ + struct MultiAioCompletion { + MultiAioCompletion(MultiAioCompletionImpl *pc_) : pc(pc_) {} + ~MultiAioCompletion(); + int set_complete_callback(void *cb_arg, librados::callback_t cb); + int set_safe_callback(void *cb_arg, librados::callback_t cb); + void wait_for_complete(); + void wait_for_safe(); + void wait_for_complete_and_cb(); + void wait_for_safe_and_cb(); + bool is_complete(); + bool is_safe(); + bool is_complete_and_cb(); + bool is_safe_and_cb(); + int get_return_value(); + void release(); + MultiAioCompletionImpl *pc; + }; + + /* RadosStriper : This class allows to perform read/writes on striped objects + * + * Typical use (error checking omitted): + * + * RadosStriper rs; + * RadosStriper.striper_create("my_cluster", rs); + * bufferlist bl; + * ... put data in bl ... + * rs.write(object_name, bl, len, offset); + * bufferlist bl2; + * rs.read(object_name, &bl2, len, offset); + * ... + */ + class RadosStriper + { + public: + + /* + * constructor + */ + RadosStriper(); + + /* + * builds the C counter part of a RadosStriper + */ + static void to_rados_striper_t(RadosStriper &striper, + rados_striper_t *s); + + /* + * copy constructor + */ + RadosStriper(const RadosStriper& rs); + + /* + * operator= + */ + RadosStriper& operator=(const RadosStriper& rs); + + /* + * destructor + * Internally calling close() if an object is currently opened + */ + ~RadosStriper(); + + /* + * create method + */ + static int striper_create(librados::IoCtx& ioctx, + RadosStriper *striper); + + /* + * set object layout's stripe unit + * This layout will be used when new objects are created (by writing to them) + * Already existing objects will be opened with their own layout. 
+ */ + int set_object_layout_stripe_unit(unsigned int stripe_unit); + + /* + * set object layout's stripe count + * This layout will be used when new objects are created (by writing to them) + * Already existing objects will be opened with their own layout. + */ + int set_object_layout_stripe_count(unsigned int stripe_count); + + /* + * set object layout's object size + * This layout will be used when new objects are created (by writing to them) + * Already existing objects will be opened with their own layout. + */ + int set_object_layout_object_size(unsigned int object_size); + + /** + * Get the value of an extended attribute on a striped object + */ + int getxattr(const std::string& oid, const char *name, ceph::bufferlist& bl); + + /** + * Set the value of an extended attribute on a striped object + */ + int setxattr(const std::string& oid, const char *name, ceph::bufferlist& bl); + + /** + * Delete an extended attribute from a striped object + */ + int rmxattr(const std::string& oid, const char *name); + + /** + * Start iterating over xattrs on a striped object. + */ + int getxattrs(const std::string& oid, + std::map<std::string, ceph::bufferlist>& attrset); + + /** + * synchronously write to the striped object at the specified offset. + * NOTE: this call steals the contents of @param bl. + */ + int write(const std::string& soid, const ceph::bufferlist& bl, size_t len, uint64_t off); + + /** + * synchronously fill the striped object with the specified data + * NOTE: this call steals the contents of @param bl. + */ + int write_full(const std::string& soid, const ceph::bufferlist& bl); + + /** + * synchronously append data to the striped object + * NOTE: this call steals the contents of @p bl. + */ + int append(const std::string& soid, const ceph::bufferlist& bl, size_t len); + + /** + * asynchronously write to the striped object at the specified offset. + * NOTE: this call steals the contents of @p bl. 
+ */ + int aio_write(const std::string& soid, librados::AioCompletion *c, const ceph::bufferlist& bl, size_t len, uint64_t off); + + /** + * asynchronously fill the striped object with the specified data + * NOTE: this call steals the contents of @p bl. + */ + int aio_write_full(const std::string& soid, librados::AioCompletion *c, const ceph::bufferlist& bl); + + /** + * asynchronously append data to the striped object + * NOTE: this call steals the contents of @p bl. + */ + int aio_append(const std::string& soid, librados::AioCompletion *c, const ceph::bufferlist& bl, size_t len); + + /** + * synchronously read from the striped object at the specified offset. + */ + int read(const std::string& soid, ceph::bufferlist* pbl, size_t len, uint64_t off); + + /** + * asynchronously read from the striped object at the specified offset. + */ + int aio_read(const std::string& soid, librados::AioCompletion *c, ceph::bufferlist *pbl, size_t len, uint64_t off); + + /** + * synchronously get striped object stats (size/mtime) + */ + int stat(const std::string& soid, uint64_t *psize, time_t *pmtime); + int stat2(const std::string& soid, uint64_t *psize, struct timespec *pts); + + /** + * asynchronously get striped object stats (size/mtime) + */ + int aio_stat(const std::string& soid, librados::AioCompletion *c, + uint64_t *psize, time_t *pmtime); + int aio_stat2(const std::string& soid, librados::AioCompletion *c, + uint64_t *psize, struct timespec *pts); + + /** + * deletes a striped object. + * There is no atomicity of the deletion and the striped + * object may be left incomplete if an error is returned (metadata + * all present, but some stripes missing) + * However, there is a atomicity of the metadata deletion and + * the deletion can not happen if any I/O is ongoing (it + * will return EBUSY). 
Identically, no I/O will be able to start + * during deletion (same EBUSY return code) + */ + int remove(const std::string& soid); + int remove(const std::string& soid, int flags); + + /** + * asynchronous remove of striped objects + * See synchronous version for comments on (lack of) atomicity + */ + int aio_remove(const std::string& soid, librados::AioCompletion *c); + int aio_remove(const std::string& soid, librados::AioCompletion *c, int flags); + + /** + * Resizes a striped object + * the truncation can not happen if any I/O is ongoing (it + * will return EBUSY). Identically, no I/O will be able to start + * during truncation (same EBUSY return code) + */ + int trunc(const std::string& oid, uint64_t size); + + /** + * Wait for all currently pending aio writes to be safe. + * + * @returns 0 on success, negative error code on failure + */ + int aio_flush(); + + /** + * creation of multi aio completion objects + */ + static MultiAioCompletion *multi_aio_create_completion(); + static MultiAioCompletion *multi_aio_create_completion(void *cb_arg, + librados::callback_t cb_complete, + librados::callback_t cb_safe); + + private: + RadosStriperImpl *rados_striper_impl; + + }; + +} + +#endif diff --git a/src/include/random.h b/src/include/random.h new file mode 100644 index 00000000..b3cb80c3 --- /dev/null +++ b/src/include/random.h @@ -0,0 +1,289 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2017 SUSE LINUX GmbH + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. 
+ * +*/ + +#ifndef CEPH_RANDOM_H +#define CEPH_RANDOM_H 1 + +#include <mutex> +#include <random> +#include <type_traits> +#include <boost/optional.hpp> + +// Basic random number facility, adapted from N3551: +namespace ceph::util { + +inline namespace version_1_0_2 { + +namespace detail { + +template <typename T0, typename T1> +using larger_of = typename std::conditional< + sizeof(T0) >= sizeof(T1), + T0, T1> + ::type; + +// avoid mixing floating point and integers: +template <typename NumberT0, typename NumberT1> +using has_compatible_numeric_types = + std::disjunction< + std::conjunction< + std::is_floating_point<NumberT0>, std::is_floating_point<NumberT1> + >, + std::conjunction< + std::is_integral<NumberT0>, std::is_integral<NumberT1> + > + >; + + +// Select the larger of type compatible numeric types: +template <typename NumberT0, typename NumberT1> +using select_number_t = std::enable_if_t<detail::has_compatible_numeric_types<NumberT0, NumberT1>::value, + detail::larger_of<NumberT0, NumberT1>>; + +} // namespace detail + +namespace detail { + +// Choose default distribution for appropriate types: +template <typename NumberT, + bool IsIntegral> +struct select_distribution +{ + using type = std::uniform_int_distribution<NumberT>; +}; + +template <typename NumberT> +struct select_distribution<NumberT, false> +{ + using type = std::uniform_real_distribution<NumberT>; +}; + +template <typename NumberT> +using default_distribution = typename + select_distribution<NumberT, std::is_integral<NumberT>::value>::type; + +} // namespace detail + +namespace detail { + +template <typename EngineT> +EngineT& engine(); + +template <typename MutexT, typename EngineT, + typename SeedT = typename EngineT::result_type> +void randomize_rng(const SeedT seed, MutexT& m, EngineT& e) +{ + std::lock_guard<MutexT> lg(m); + e.seed(seed); +} + +template <typename MutexT, typename EngineT> +void randomize_rng(MutexT& m, EngineT& e) +{ + std::random_device rd; + + std::lock_guard<MutexT> 
lg(m); + e.seed(rd()); +} + +template <typename EngineT = std::default_random_engine, + typename SeedT = typename EngineT::result_type> +void randomize_rng(const SeedT n) +{ + detail::engine<EngineT>().seed(n); +} + +template <typename EngineT = std::default_random_engine> +void randomize_rng() +{ + std::random_device rd; + detail::engine<EngineT>().seed(rd()); +} + +template <typename EngineT> +EngineT& engine() +{ + thread_local boost::optional<EngineT> rng_engine; + + if (!rng_engine) { + rng_engine.emplace(EngineT()); + randomize_rng<EngineT>(); + } + + return *rng_engine; +} + +} // namespace detail + +namespace detail { + +template <typename NumberT, + typename DistributionT = detail::default_distribution<NumberT>, + typename EngineT> +NumberT generate_random_number(const NumberT min, const NumberT max, + EngineT& e) +{ + DistributionT d { min, max }; + + using param_type = typename DistributionT::param_type; + return d(e, param_type { min, max }); +} + +template <typename NumberT, + typename MutexT, + typename DistributionT = detail::default_distribution<NumberT>, + typename EngineT> +NumberT generate_random_number(const NumberT min, const NumberT max, + MutexT& m, EngineT& e) +{ + DistributionT d { min, max }; + + using param_type = typename DistributionT::param_type; + + std::lock_guard<MutexT> lg(m); + return d(e, param_type { min, max }); +} + +template <typename NumberT, + typename DistributionT = detail::default_distribution<NumberT>, + typename EngineT> +NumberT generate_random_number(const NumberT min, const NumberT max) +{ + return detail::generate_random_number<NumberT, DistributionT, EngineT> + (min, max, detail::engine<EngineT>()); +} + +template <typename MutexT, + typename EngineT, + typename NumberT = int, + typename DistributionT = detail::default_distribution<NumberT>> +NumberT generate_random_number(MutexT& m, EngineT& e) +{ + return detail::generate_random_number<NumberT, MutexT, DistributionT, EngineT> + (0, 
std::numeric_limits<NumberT>::max(), m, e); +} + +template <typename NumberT, typename MutexT, typename EngineT> +NumberT generate_random_number(const NumberT max, MutexT& m, EngineT& e) +{ + return generate_random_number<NumberT>(0, max, m, e); +} + +} // namespace detail + +template <typename EngineT = std::default_random_engine> +void randomize_rng() +{ + detail::randomize_rng<EngineT>(); +} + +template <typename NumberT = int, + typename DistributionT = detail::default_distribution<NumberT>, + typename EngineT = std::default_random_engine> +NumberT generate_random_number() +{ + return detail::generate_random_number<NumberT, DistributionT, EngineT> + (0, std::numeric_limits<NumberT>::max()); +} + +template <typename NumberT0, typename NumberT1, + typename NumberT = detail::select_number_t<NumberT0, NumberT1> + > +NumberT generate_random_number(const NumberT0 min, const NumberT1 max) +{ + return detail::generate_random_number<NumberT, + detail::default_distribution<NumberT>, + std::default_random_engine> + (static_cast<NumberT>(min), static_cast<NumberT>(max)); +} + +template <typename NumberT0, typename NumberT1, + typename DistributionT, + typename EngineT, + typename NumberT = detail::select_number_t<NumberT0, NumberT1> + > +NumberT generate_random_number(const NumberT min, const NumberT max, + EngineT& e) +{ + return detail::generate_random_number<NumberT, + DistributionT, + EngineT>(static_cast<NumberT>(min), static_cast<NumberT>(max), e); +} + +template <typename NumberT> +NumberT generate_random_number(const NumberT max) +{ + return generate_random_number<NumberT>(0, max); +} + +// Function object: +template <typename NumberT> +class random_number_generator final +{ + std::mutex l; + std::random_device rd; + std::default_random_engine e; + + using seed_type = typename decltype(e)::result_type; + + public: + using number_type = NumberT; + using random_engine_type = decltype(e); + using random_device_type = decltype(rd); + + public: + random_device_type& 
random_device() noexcept { return rd; } + random_engine_type& random_engine() noexcept { return e; } + + public: + random_number_generator() { + detail::randomize_rng(l, e); + } + + explicit random_number_generator(const seed_type seed) { + detail::randomize_rng(seed, l, e); + } + + random_number_generator(random_number_generator&& rhs) + : e(std::move(rhs.e)) + {} + + public: + random_number_generator(const random_number_generator&) = delete; + random_number_generator& operator=(const random_number_generator&) = delete; + + public: + NumberT operator()() { + return detail::generate_random_number(l, e); + } + + NumberT operator()(const NumberT max) { + return detail::generate_random_number<NumberT>(max, l, e); + } + + NumberT operator()(const NumberT min, const NumberT max) { + return detail::generate_random_number<NumberT>(min, max, l, e); + } + + public: + void seed(const seed_type n) { + detail::randomize_rng(n, l, e); + } +}; + +} // inline namespace version_* + +} // namespace ceph::util + +#endif diff --git a/src/include/rangeset.h b/src/include/rangeset.h new file mode 100644 index 00000000..e7e3d047 --- /dev/null +++ b/src/include/rangeset.h @@ -0,0 +1,250 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net> + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. + * + */ + + +#ifndef CEPH_RANGESET_H +#define CEPH_RANGESET_H + +/* + * + * my first container with iterator! it's pretty ugly. + * + */ + +#include <map> + +//typedef int T; + +template <class T> +struct _rangeset_base { + map<T,T> ranges; // pair(first,last) (inclusive, e.g. [first,last]) + + typedef typename map<T,T>::iterator mapit; + + // get iterator for range including val. 
or ranges.end(). + mapit get_range_for(T val) { + mapit it = ranges.lower_bound(val); + if (it == ranges.end()) { + // search backwards + typename map<T,T>::reverse_iterator it = ranges.rbegin(); + if (it == ranges.rend()) return ranges.end(); + if (it->first <= val && it->second >= val) + return ranges.find(it->first); + return ranges.end(); + } else { + if (it->first == val) return + it--; + if (it->first <= val && it->second >= val) + return it; + return ranges.end(); + } + } + +}; + + +template <class T> +class rangeset_iterator : + public std::iterator<std::input_iterator_tag, T> +{ + //typedef typename map<T,T>::iterator mapit; + + map<T,T> ranges; + typename map<T,T>::iterator it; + T current; + +public: + // cons + rangeset_iterator() {} + + rangeset_iterator(typename map<T,T>::iterator& it, map<T,T>& ranges) { + this->ranges = ranges; + this->it = it; + if (this->it != ranges.end()) + current = it->first; + } + + bool operator==(rangeset_iterator<T> rit) { + return (it == rit.it && rit.current == current); + } + bool operator!=(rangeset_iterator<T> rit) { + return (it != rit.it) || (rit.current != current); + } + + T& operator*() { + return current; + } + + rangeset_iterator<T> operator++(int) { + if (current < it->second) + current++; + else { + it++; + if (it != ranges.end()) + current = it->first; + } + + return *this; + } +}; + + +template <class T> +class rangeset +{ + typedef typename map<T,T>::iterator map_iterator; + + _rangeset_base<T> theset; + inodeno_t _size; + +public: + rangeset() { _size = 0; } + typedef rangeset_iterator<T> iterator; + + iterator begin() { + map_iterator it = theset.ranges.begin(); + return iterator(it, theset.ranges); + } + + iterator end() { + map_iterator it = theset.ranges.end(); + return iterator(it, theset.ranges); + } + + map_iterator map_begin() { + return theset.ranges.begin(); + } + map_iterator map_end() { + return theset.ranges.end(); + } + int map_size() { + return theset.ranges.size(); + } + + void 
map_insert(T v1, T v2) { + theset.ranges.insert(pair<T,T>(v1,v2)); + _size += v2 - v1+1; + } + + + // ... + bool contains(T val) { + if (theset.get_range_for(val) == theset.ranges.end()) return false; + ceph_assert(!empty()); + return true; + } + + void insert(T val) { + ceph_assert(!contains(val)); + + map_iterator left = theset.get_range_for(val-1); + map_iterator right = theset.get_range_for(val+1); + + if (left != theset.ranges.end() && + right != theset.ranges.end()) { + // join! + left->second = right->second; + theset.ranges.erase(right); + _size++; + return; + } + + if (left != theset.ranges.end()) { + // add to left range + left->second = val; + _size++; + return; + } + + if (right != theset.ranges.end()) { + // add to right range + theset.ranges.insert(pair<T,T>(val, right->second)); + theset.ranges.erase(val+1); + _size++; + return; + } + + // new range + theset.ranges.insert(pair<T,T>(val,val)); + _size++; + return; + } + + unsigned size() { + return size(); + } + + bool empty() { + if (theset.ranges.empty()) { + ceph_assert(_size == 0); + return true; + } + ceph_assert(_size>0); + return false; + } + + + T first() { + ceph_assert(!empty()); + map_iterator it = theset.ranges.begin(); + return it->first; + } + + void erase(T val) { + ceph_assert(contains(val)); + map_iterator it = theset.get_range_for(val); + ceph_assert(it != theset.ranges.end()); + + // entire range + if (val == it->first && val == it->second) { + theset.ranges.erase(it); + _size--; + return; + } + + // beginning + if (val == it->first) { + theset.ranges.insert(pair<T,T>(val+1, it->second)); + theset.ranges.erase(it); + _size--; + return; + } + + // end + if (val == it->second) { + it->second = val-1; + _size--; + return; + } + + // middle split + theset.ranges.insert(pair<T,T>(it->first, val-1)); + theset.ranges.insert(pair<T,T>(val+1, it->second)); + theset.ranges.erase(it); + _size--; + return; + } + + void dump() { + for (typename map<T,T>::iterator it = theset.ranges.begin(); + it 
!= theset.ranges.end(); + it++) { + cout << " " << it->first << "-" << it->second << endl; + } + } + +}; + + +#endif diff --git a/src/include/rbd/features.h b/src/include/rbd/features.h new file mode 100644 index 00000000..89c54a36 --- /dev/null +++ b/src/include/rbd/features.h @@ -0,0 +1,102 @@ +#ifndef CEPH_RBD_FEATURES_H +#define CEPH_RBD_FEATURES_H + +#define RBD_FEATURE_LAYERING (1ULL<<0) +#define RBD_FEATURE_STRIPINGV2 (1ULL<<1) +#define RBD_FEATURE_EXCLUSIVE_LOCK (1ULL<<2) +#define RBD_FEATURE_OBJECT_MAP (1ULL<<3) +#define RBD_FEATURE_FAST_DIFF (1ULL<<4) +#define RBD_FEATURE_DEEP_FLATTEN (1ULL<<5) +#define RBD_FEATURE_JOURNALING (1ULL<<6) +#define RBD_FEATURE_DATA_POOL (1ULL<<7) +#define RBD_FEATURE_OPERATIONS (1ULL<<8) +#define RBD_FEATURE_MIGRATING (1ULL<<9) + +#define RBD_FEATURES_DEFAULT (RBD_FEATURE_LAYERING | \ + RBD_FEATURE_EXCLUSIVE_LOCK | \ + RBD_FEATURE_OBJECT_MAP | \ + RBD_FEATURE_FAST_DIFF | \ + RBD_FEATURE_DEEP_FLATTEN) + +#define RBD_FEATURE_NAME_LAYERING "layering" +#define RBD_FEATURE_NAME_STRIPINGV2 "striping" +#define RBD_FEATURE_NAME_EXCLUSIVE_LOCK "exclusive-lock" +#define RBD_FEATURE_NAME_OBJECT_MAP "object-map" +#define RBD_FEATURE_NAME_FAST_DIFF "fast-diff" +#define RBD_FEATURE_NAME_DEEP_FLATTEN "deep-flatten" +#define RBD_FEATURE_NAME_JOURNALING "journaling" +#define RBD_FEATURE_NAME_DATA_POOL "data-pool" +#define RBD_FEATURE_NAME_OPERATIONS "operations" +#define RBD_FEATURE_NAME_MIGRATING "migrating" + +/// features that make an image inaccessible for read or write by +/// clients that don't understand them +#define RBD_FEATURES_INCOMPATIBLE (RBD_FEATURE_LAYERING | \ + RBD_FEATURE_STRIPINGV2 | \ + RBD_FEATURE_DATA_POOL) + +/// features that make an image unwritable by clients that don't understand them +#define RBD_FEATURES_RW_INCOMPATIBLE (RBD_FEATURES_INCOMPATIBLE | \ + RBD_FEATURE_EXCLUSIVE_LOCK | \ + RBD_FEATURE_OBJECT_MAP | \ + RBD_FEATURE_FAST_DIFF | \ + RBD_FEATURE_DEEP_FLATTEN | \ + RBD_FEATURE_JOURNALING | \ + 
RBD_FEATURE_OPERATIONS | \ + RBD_FEATURE_MIGRATING) + +#define RBD_FEATURES_ALL (RBD_FEATURE_LAYERING | \ + RBD_FEATURE_STRIPINGV2 | \ + RBD_FEATURE_EXCLUSIVE_LOCK | \ + RBD_FEATURE_OBJECT_MAP | \ + RBD_FEATURE_FAST_DIFF | \ + RBD_FEATURE_DEEP_FLATTEN | \ + RBD_FEATURE_JOURNALING | \ + RBD_FEATURE_DATA_POOL | \ + RBD_FEATURE_OPERATIONS | \ + RBD_FEATURE_MIGRATING) + +/// features that may be dynamically enabled or disabled +#define RBD_FEATURES_MUTABLE (RBD_FEATURE_EXCLUSIVE_LOCK | \ + RBD_FEATURE_OBJECT_MAP | \ + RBD_FEATURE_FAST_DIFF | \ + RBD_FEATURE_JOURNALING) + +/// features that may be dynamically disabled +#define RBD_FEATURES_DISABLE_ONLY (RBD_FEATURE_DEEP_FLATTEN) + +/// features that only work when used with a single client +/// using the image for writes +#define RBD_FEATURES_SINGLE_CLIENT (RBD_FEATURE_EXCLUSIVE_LOCK | \ + RBD_FEATURE_OBJECT_MAP | \ + RBD_FEATURE_FAST_DIFF | \ + RBD_FEATURE_JOURNALING) + +/// features that will be implicitly enabled +#define RBD_FEATURES_IMPLICIT_ENABLE (RBD_FEATURE_STRIPINGV2 | \ + RBD_FEATURE_DATA_POOL | \ + RBD_FEATURE_FAST_DIFF | \ + RBD_FEATURE_OPERATIONS | \ + RBD_FEATURE_MIGRATING) + +/// features that cannot be controlled by the user +#define RBD_FEATURES_INTERNAL (RBD_FEATURE_OPERATIONS | \ + RBD_FEATURE_MIGRATING) + +#define RBD_OPERATION_FEATURE_CLONE_PARENT (1ULL<<0) +#define RBD_OPERATION_FEATURE_CLONE_CHILD (1ULL<<1) +#define RBD_OPERATION_FEATURE_GROUP (1ULL<<2) +#define RBD_OPERATION_FEATURE_SNAP_TRASH (1ULL<<3) + +#define RBD_OPERATION_FEATURE_NAME_CLONE_PARENT "clone-parent" +#define RBD_OPERATION_FEATURE_NAME_CLONE_CHILD "clone-child" +#define RBD_OPERATION_FEATURE_NAME_GROUP "group" +#define RBD_OPERATION_FEATURE_NAME_SNAP_TRASH "snap-trash" + +/// all valid operation features +#define RBD_OPERATION_FEATURES_ALL (RBD_OPERATION_FEATURE_CLONE_PARENT | \ + RBD_OPERATION_FEATURE_CLONE_CHILD | \ + RBD_OPERATION_FEATURE_GROUP | \ + RBD_OPERATION_FEATURE_SNAP_TRASH) + +#endif diff --git 
a/src/include/rbd/librbd.h b/src/include/rbd/librbd.h new file mode 100644 index 00000000..522a6fb6 --- /dev/null +++ b/src/include/rbd/librbd.h @@ -0,0 +1,1243 @@ +// -*- mode:C; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2011 New Dream Network + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. + * + */ + +#ifndef CEPH_LIBRBD_H +#define CEPH_LIBRBD_H + +#ifdef __cplusplus +extern "C" { +#endif + +#include <netinet/in.h> +#if defined(__linux__) +#include <linux/types.h> +#elif defined(__FreeBSD__) +#include <sys/types.h> +#endif +#include <stdbool.h> +#include <string.h> +#include <sys/uio.h> +#include "../rados/librados.h" +#include "features.h" + +#define LIBRBD_VER_MAJOR 1 +#define LIBRBD_VER_MINOR 12 +#define LIBRBD_VER_EXTRA 0 + +/* Pack (major, minor, extra) into one integer: major << 16, minor << 8, extra in the low bits. Every argument is parenthesized so that expression arguments expand with the intended precedence. */ +#define LIBRBD_VERSION(maj, min, extra) (((maj) << 16) + ((min) << 8) + (extra)) + +#define LIBRBD_VERSION_CODE LIBRBD_VERSION(LIBRBD_VER_MAJOR, LIBRBD_VER_MINOR, LIBRBD_VER_EXTRA) + +#define LIBRBD_SUPPORTS_AIO_FLUSH 1 +#define LIBRBD_SUPPORTS_AIO_OPEN 1 +#define LIBRBD_SUPPORTS_COMPARE_AND_WRITE 1 +#define LIBRBD_SUPPORTS_LOCKING 1 +#define LIBRBD_SUPPORTS_INVALIDATE 1 +#define LIBRBD_SUPPORTS_IOVEC 1 +#define LIBRBD_SUPPORTS_WATCH 0 +#define LIBRBD_SUPPORTS_WRITESAME 1 +#define LIBRBD_SUPPORTS_WRITE_ZEROES 1 + +#if __GNUC__ >= 4 + #define CEPH_RBD_API __attribute__ ((visibility ("default"))) +#else + #define CEPH_RBD_API +#endif + +#define RBD_FLAG_OBJECT_MAP_INVALID (1<<0) +#define RBD_FLAG_FAST_DIFF_INVALID (1<<1) + +typedef void *rbd_image_t; +typedef void *rbd_image_options_t; +typedef void *rbd_pool_stats_t; + +typedef void *rbd_completion_t; +typedef void (*rbd_callback_t)(rbd_completion_t cb, void *arg); + +typedef int (*librbd_progress_fn_t)(uint64_t offset, 
uint64_t total, void *ptr); + +typedef void (*rbd_update_callback_t)(void *arg); + +typedef enum { + RBD_SNAP_NAMESPACE_TYPE_USER = 0, + RBD_SNAP_NAMESPACE_TYPE_GROUP = 1, + RBD_SNAP_NAMESPACE_TYPE_TRASH = 2 +} rbd_snap_namespace_type_t; + +typedef struct { + char *id; + char *name; +} rbd_image_spec_t; + +typedef struct { + int64_t pool_id; + char *pool_name; + char *pool_namespace; + char *image_id; + char *image_name; + bool trash; +} rbd_linked_image_spec_t; + +typedef struct { + uint64_t id; + rbd_snap_namespace_type_t namespace_type; + char *name; +} rbd_snap_spec_t; + +typedef struct { + uint64_t id; + uint64_t size; + const char *name; +} rbd_snap_info_t; + +typedef struct { + const char *pool_name; + const char *image_name; + const char *image_id; + bool trash; +} rbd_child_info_t; + +#define RBD_MAX_IMAGE_NAME_SIZE 96 +#define RBD_MAX_BLOCK_NAME_SIZE 24 + +/* Snapshot-removal flag bits. Each value is parenthesized so the macro expands as a single operand next to tighter-binding operators such as !, * or +. */ +#define RBD_SNAP_REMOVE_UNPROTECT (1 << 0) +#define RBD_SNAP_REMOVE_FLATTEN (1 << 1) +#define RBD_SNAP_REMOVE_FORCE (RBD_SNAP_REMOVE_UNPROTECT | RBD_SNAP_REMOVE_FLATTEN) + +/** + * These types are used in set_image_notification to indicate the type of event + * socket passed in. 
+ */ +enum { + EVENT_TYPE_PIPE = 1, + EVENT_TYPE_EVENTFD = 2 +}; + +typedef struct { + uint64_t size; + uint64_t obj_size; + uint64_t num_objs; + int order; + char block_name_prefix[RBD_MAX_BLOCK_NAME_SIZE]; /* deprecated */ + int64_t parent_pool; /* deprecated */ + char parent_name[RBD_MAX_IMAGE_NAME_SIZE]; /* deprecated */ +} rbd_image_info_t; + +typedef enum { + RBD_MIRROR_MODE_DISABLED, /* mirroring is disabled */ + RBD_MIRROR_MODE_IMAGE, /* mirroring enabled on a per-image basis */ + RBD_MIRROR_MODE_POOL /* mirroring enabled on all journaled images */ +} rbd_mirror_mode_t; + +typedef enum { + RBD_MIRROR_PEER_DIRECTION_RX = 0, + RBD_MIRROR_PEER_DIRECTION_TX = 1, + RBD_MIRROR_PEER_DIRECTION_RX_TX = 2 +} rbd_mirror_peer_direction_t; + +typedef struct { + char *uuid; + char *cluster_name; + char *client_name; +} rbd_mirror_peer_t; + +#define RBD_MIRROR_PEER_ATTRIBUTE_NAME_MON_HOST "mon_host" +#define RBD_MIRROR_PEER_ATTRIBUTE_NAME_KEY "key" + +typedef enum { + RBD_MIRROR_IMAGE_DISABLING = 0, + RBD_MIRROR_IMAGE_ENABLED = 1, + RBD_MIRROR_IMAGE_DISABLED = 2 +} rbd_mirror_image_state_t; + +typedef struct { + char *global_id; + rbd_mirror_image_state_t state; + bool primary; +} rbd_mirror_image_info_t; + +typedef enum { + MIRROR_IMAGE_STATUS_STATE_UNKNOWN = 0, + MIRROR_IMAGE_STATUS_STATE_ERROR = 1, + MIRROR_IMAGE_STATUS_STATE_SYNCING = 2, + MIRROR_IMAGE_STATUS_STATE_STARTING_REPLAY = 3, + MIRROR_IMAGE_STATUS_STATE_REPLAYING = 4, + MIRROR_IMAGE_STATUS_STATE_STOPPING_REPLAY = 5, + MIRROR_IMAGE_STATUS_STATE_STOPPED = 6, +} rbd_mirror_image_status_state_t; + +typedef struct { + char *name; + rbd_mirror_image_info_t info; + rbd_mirror_image_status_state_t state; + char *description; + time_t last_update; + bool up; +} rbd_mirror_image_status_t; + +typedef enum { + RBD_GROUP_IMAGE_STATE_ATTACHED, + RBD_GROUP_IMAGE_STATE_INCOMPLETE +} rbd_group_image_state_t; + +typedef struct { + char *name; + int64_t pool; + rbd_group_image_state_t state; +} rbd_group_image_info_t; + 
+typedef struct { + char *name; + int64_t pool; +} rbd_group_info_t; + +typedef enum { + RBD_GROUP_SNAP_STATE_INCOMPLETE, + RBD_GROUP_SNAP_STATE_COMPLETE +} rbd_group_snap_state_t; + +typedef struct { + char *name; + rbd_group_snap_state_t state; +} rbd_group_snap_info_t; + +typedef struct { + int64_t group_pool; + char *group_name; + char *group_snap_name; +} rbd_snap_group_namespace_t; + +typedef enum { + RBD_LOCK_MODE_EXCLUSIVE = 0, + RBD_LOCK_MODE_SHARED = 1, +} rbd_lock_mode_t; + +CEPH_RBD_API void rbd_version(int *major, int *minor, int *extra); + +/* image options */ +enum { + RBD_IMAGE_OPTION_FORMAT = 0, + RBD_IMAGE_OPTION_FEATURES = 1, + RBD_IMAGE_OPTION_ORDER = 2, + RBD_IMAGE_OPTION_STRIPE_UNIT = 3, + RBD_IMAGE_OPTION_STRIPE_COUNT = 4, + RBD_IMAGE_OPTION_JOURNAL_ORDER = 5, + RBD_IMAGE_OPTION_JOURNAL_SPLAY_WIDTH = 6, + RBD_IMAGE_OPTION_JOURNAL_POOL = 7, + RBD_IMAGE_OPTION_FEATURES_SET = 8, + RBD_IMAGE_OPTION_FEATURES_CLEAR = 9, + RBD_IMAGE_OPTION_DATA_POOL = 10, + RBD_IMAGE_OPTION_FLATTEN = 11, + RBD_IMAGE_OPTION_CLONE_FORMAT = 12, +}; + +typedef enum { + RBD_TRASH_IMAGE_SOURCE_USER = 0, + RBD_TRASH_IMAGE_SOURCE_MIRRORING = 1, + RBD_TRASH_IMAGE_SOURCE_MIGRATION = 2, + RBD_TRASH_IMAGE_SOURCE_REMOVING = 3 +} rbd_trash_image_source_t; + +typedef struct { + char *id; + char *name; + rbd_trash_image_source_t source; + time_t deletion_time; + time_t deferment_end_time; +} rbd_trash_image_info_t; + +typedef struct { + char *addr; + int64_t id; + uint64_t cookie; +} rbd_image_watcher_t; + +typedef enum { + RBD_IMAGE_MIGRATION_STATE_UNKNOWN = -1, + RBD_IMAGE_MIGRATION_STATE_ERROR = 0, + RBD_IMAGE_MIGRATION_STATE_PREPARING = 1, + RBD_IMAGE_MIGRATION_STATE_PREPARED = 2, + RBD_IMAGE_MIGRATION_STATE_EXECUTING = 3, + RBD_IMAGE_MIGRATION_STATE_EXECUTED = 4, + RBD_IMAGE_MIGRATION_STATE_ABORTING = 5, +} rbd_image_migration_state_t; + +typedef struct { + int64_t source_pool_id; + char *source_pool_namespace; + char *source_image_name; + char *source_image_id; + int64_t 
dest_pool_id; + char *dest_pool_namespace; + char *dest_image_name; + char *dest_image_id; + rbd_image_migration_state_t state; + char *state_description; +} rbd_image_migration_status_t; + +typedef enum { + RBD_CONFIG_SOURCE_CONFIG = 0, + RBD_CONFIG_SOURCE_POOL = 1, + RBD_CONFIG_SOURCE_IMAGE = 2, +} rbd_config_source_t; + +typedef struct { + char *name; + char *value; + rbd_config_source_t source; +} rbd_config_option_t; + +typedef enum { + RBD_POOL_STAT_OPTION_IMAGES, + RBD_POOL_STAT_OPTION_IMAGE_PROVISIONED_BYTES, + RBD_POOL_STAT_OPTION_IMAGE_MAX_PROVISIONED_BYTES, + RBD_POOL_STAT_OPTION_IMAGE_SNAPSHOTS, + RBD_POOL_STAT_OPTION_TRASH_IMAGES, + RBD_POOL_STAT_OPTION_TRASH_PROVISIONED_BYTES, + RBD_POOL_STAT_OPTION_TRASH_MAX_PROVISIONED_BYTES, + RBD_POOL_STAT_OPTION_TRASH_SNAPSHOTS +} rbd_pool_stat_option_t; + +CEPH_RBD_API void rbd_image_options_create(rbd_image_options_t* opts); +CEPH_RBD_API void rbd_image_options_destroy(rbd_image_options_t opts); +CEPH_RBD_API int rbd_image_options_set_string(rbd_image_options_t opts, + int optname, const char* optval); +CEPH_RBD_API int rbd_image_options_set_uint64(rbd_image_options_t opts, + int optname, uint64_t optval); +CEPH_RBD_API int rbd_image_options_get_string(rbd_image_options_t opts, + int optname, char* optval, + size_t maxlen); +CEPH_RBD_API int rbd_image_options_get_uint64(rbd_image_options_t opts, + int optname, uint64_t* optval); +CEPH_RBD_API int rbd_image_options_is_set(rbd_image_options_t opts, + int optname, bool* is_set); +CEPH_RBD_API int rbd_image_options_unset(rbd_image_options_t opts, int optname); +CEPH_RBD_API void rbd_image_options_clear(rbd_image_options_t opts); +CEPH_RBD_API int rbd_image_options_is_empty(rbd_image_options_t opts); + +/* helpers */ +CEPH_RBD_API void rbd_image_spec_cleanup(rbd_image_spec_t *image); +CEPH_RBD_API void rbd_image_spec_list_cleanup(rbd_image_spec_t *images, + size_t num_images); +CEPH_RBD_API void rbd_linked_image_spec_cleanup(rbd_linked_image_spec_t *image); 
+CEPH_RBD_API void rbd_linked_image_spec_list_cleanup( + rbd_linked_image_spec_t *images, size_t num_images); +CEPH_RBD_API void rbd_snap_spec_cleanup(rbd_snap_spec_t *snap); + +/* images */ +CEPH_RBD_API int rbd_list(rados_ioctx_t io, char *names, size_t *size) + __attribute__((deprecated)); +CEPH_RBD_API int rbd_list2(rados_ioctx_t io, rbd_image_spec_t* images, + size_t *max_images); + +CEPH_RBD_API int rbd_create(rados_ioctx_t io, const char *name, uint64_t size, + int *order); +CEPH_RBD_API int rbd_create2(rados_ioctx_t io, const char *name, uint64_t size, + uint64_t features, int *order); +/** + * create new rbd image + * + * The stripe_unit must be a factor of the object size (1 << order). + * The stripe_count can be one (no intra-object striping) or greater + * than one. The RBD_FEATURE_STRIPINGV2 must be specified if the + * stripe_unit != the object size and the stripe_count is != 1. + * + * @param io ioctx + * @param name image name + * @param size image size in bytes + * @param features initial feature bits + * @param order object/block size, as a power of two (object size == 1 << order) + * @param stripe_unit stripe unit size, in bytes. 
+ * @param stripe_count number of objects to stripe over before looping + * @return 0 on success, or negative error code + */ +CEPH_RBD_API int rbd_create3(rados_ioctx_t io, const char *name, uint64_t size, + uint64_t features, int *order, + uint64_t stripe_unit, uint64_t stripe_count); +CEPH_RBD_API int rbd_create4(rados_ioctx_t io, const char *name, uint64_t size, + rbd_image_options_t opts); +CEPH_RBD_API int rbd_clone(rados_ioctx_t p_ioctx, const char *p_name, + const char *p_snapname, rados_ioctx_t c_ioctx, + const char *c_name, uint64_t features, int *c_order); +CEPH_RBD_API int rbd_clone2(rados_ioctx_t p_ioctx, const char *p_name, + const char *p_snapname, rados_ioctx_t c_ioctx, + const char *c_name, uint64_t features, int *c_order, + uint64_t stripe_unit, int stripe_count); +CEPH_RBD_API int rbd_clone3(rados_ioctx_t p_ioctx, const char *p_name, + const char *p_snapname, rados_ioctx_t c_ioctx, + const char *c_name, rbd_image_options_t c_opts); +CEPH_RBD_API int rbd_remove(rados_ioctx_t io, const char *name); +CEPH_RBD_API int rbd_remove_with_progress(rados_ioctx_t io, const char *name, + librbd_progress_fn_t cb, + void *cbdata); +CEPH_RBD_API int rbd_rename(rados_ioctx_t src_io_ctx, const char *srcname, + const char *destname); + +CEPH_RBD_API int rbd_trash_move(rados_ioctx_t io, const char *name, + uint64_t delay); +CEPH_RBD_API int rbd_trash_get(rados_ioctx_t io, const char *id, + rbd_trash_image_info_t *info); +CEPH_RBD_API void rbd_trash_get_cleanup(rbd_trash_image_info_t *info); +CEPH_RBD_API int rbd_trash_list(rados_ioctx_t io, + rbd_trash_image_info_t *trash_entries, + size_t *num_entries); +CEPH_RBD_API void rbd_trash_list_cleanup(rbd_trash_image_info_t *trash_entries, + size_t num_entries); +CEPH_RBD_API int rbd_trash_purge(rados_ioctx_t io, time_t expire_ts, float threshold); +CEPH_RBD_API int rbd_trash_purge_with_progress(rados_ioctx_t io, time_t expire_ts, + float threshold, librbd_progress_fn_t cb, + void* cbdata); +CEPH_RBD_API int 
rbd_trash_remove(rados_ioctx_t io, const char *id, bool force); +CEPH_RBD_API int rbd_trash_remove_with_progress(rados_ioctx_t io, + const char *id, + bool force, + librbd_progress_fn_t cb, + void *cbdata); +CEPH_RBD_API int rbd_trash_restore(rados_ioctx_t io, const char *id, + const char *name); + +/* migration */ +CEPH_RBD_API int rbd_migration_prepare(rados_ioctx_t ioctx, + const char *image_name, + rados_ioctx_t dest_ioctx, + const char *dest_image_name, + rbd_image_options_t opts); +CEPH_RBD_API int rbd_migration_execute(rados_ioctx_t ioctx, + const char *image_name); +CEPH_RBD_API int rbd_migration_execute_with_progress(rados_ioctx_t ioctx, + const char *image_name, + librbd_progress_fn_t cb, + void *cbdata); +CEPH_RBD_API int rbd_migration_abort(rados_ioctx_t ioctx, + const char *image_name); +CEPH_RBD_API int rbd_migration_abort_with_progress(rados_ioctx_t ioctx, + const char *image_name, + librbd_progress_fn_t cb, + void *cbdata); +CEPH_RBD_API int rbd_migration_commit(rados_ioctx_t ioctx, + const char *image_name); +CEPH_RBD_API int rbd_migration_commit_with_progress(rados_ioctx_t ioctx, + const char *image_name, + librbd_progress_fn_t cb, + void *cbdata); +CEPH_RBD_API int rbd_migration_status(rados_ioctx_t ioctx, + const char *image_name, + rbd_image_migration_status_t *status, + size_t status_size); +CEPH_RBD_API void rbd_migration_status_cleanup( + rbd_image_migration_status_t *status); + +/* pool mirroring */ +CEPH_RBD_API int rbd_mirror_site_name_get(rados_t cluster, + char *name, size_t *max_len); +CEPH_RBD_API int rbd_mirror_site_name_set(rados_t cluster, + const char *name); + +CEPH_RBD_API int rbd_mirror_mode_get(rados_ioctx_t io_ctx, + rbd_mirror_mode_t *mirror_mode); +CEPH_RBD_API int rbd_mirror_mode_set(rados_ioctx_t io_ctx, + rbd_mirror_mode_t mirror_mode); + +CEPH_RBD_API int rbd_mirror_peer_bootstrap_create(rados_ioctx_t io_ctx, + char *token, size_t *max_len); +CEPH_RBD_API int rbd_mirror_peer_bootstrap_import( + rados_ioctx_t io_ctx, 
rbd_mirror_peer_direction_t direction, + const char *token); + +CEPH_RBD_API int rbd_mirror_peer_add(rados_ioctx_t io_ctx, + char *uuid, size_t uuid_max_length, + const char *cluster_name, + const char *client_name); +CEPH_RBD_API int rbd_mirror_peer_remove(rados_ioctx_t io_ctx, + const char *uuid); +CEPH_RBD_API int rbd_mirror_peer_list(rados_ioctx_t io_ctx, + rbd_mirror_peer_t *peers, int *max_peers); +CEPH_RBD_API void rbd_mirror_peer_list_cleanup(rbd_mirror_peer_t *peers, + int max_peers); +CEPH_RBD_API int rbd_mirror_peer_set_client(rados_ioctx_t io_ctx, + const char *uuid, + const char *client_name); +CEPH_RBD_API int rbd_mirror_peer_set_cluster(rados_ioctx_t io_ctx, + const char *uuid, + const char *cluster_name); +CEPH_RBD_API int rbd_mirror_peer_get_attributes( + rados_ioctx_t p, const char *uuid, char *keys, size_t *max_key_len, + char *values, size_t *max_value_len, size_t *key_value_count); +CEPH_RBD_API int rbd_mirror_peer_set_attributes( + rados_ioctx_t p, const char *uuid, const char *keys, const char *values, + size_t key_value_count); + +CEPH_RBD_API int rbd_mirror_image_status_list(rados_ioctx_t io_ctx, + const char *start_id, size_t max, + char **image_ids, + rbd_mirror_image_status_t *images, + size_t *len); +CEPH_RBD_API void rbd_mirror_image_status_list_cleanup(char **image_ids, + rbd_mirror_image_status_t *images, size_t len); +CEPH_RBD_API int rbd_mirror_image_status_summary(rados_ioctx_t io_ctx, + rbd_mirror_image_status_state_t *states, int *counts, size_t *maxlen); + +CEPH_RBD_API int rbd_mirror_image_instance_id_list(rados_ioctx_t io_ctx, + const char *start_id, + size_t max, char **image_ids, + char **instance_ids, + size_t *len); +CEPH_RBD_API void rbd_mirror_image_instance_id_list_cleanup(char **image_ids, + char **instance_ids, + size_t len); + +/* pool metadata */ +CEPH_RBD_API int rbd_pool_metadata_get(rados_ioctx_t io_ctx, const char *key, + char *value, size_t *val_len); +CEPH_RBD_API int rbd_pool_metadata_set(rados_ioctx_t 
io_ctx, const char *key, + const char *value); +CEPH_RBD_API int rbd_pool_metadata_remove(rados_ioctx_t io_ctx, + const char *key); +CEPH_RBD_API int rbd_pool_metadata_list(rados_ioctx_t io_ctx, const char *start, + uint64_t max, char *keys, + size_t *key_len, char *values, + size_t *vals_len); + +CEPH_RBD_API int rbd_config_pool_list(rados_ioctx_t io_ctx, + rbd_config_option_t *options, + int *max_options); +CEPH_RBD_API void rbd_config_pool_list_cleanup(rbd_config_option_t *options, + int max_options); + +CEPH_RBD_API int rbd_open(rados_ioctx_t io, const char *name, + rbd_image_t *image, const char *snap_name); +CEPH_RBD_API int rbd_open_by_id(rados_ioctx_t io, const char *id, + rbd_image_t *image, const char *snap_name); + +CEPH_RBD_API int rbd_aio_open(rados_ioctx_t io, const char *name, + rbd_image_t *image, const char *snap_name, + rbd_completion_t c); +CEPH_RBD_API int rbd_aio_open_by_id(rados_ioctx_t io, const char *id, + rbd_image_t *image, const char *snap_name, + rbd_completion_t c); + +/** + * Open an image in read-only mode. + * + * This is intended for use by clients that cannot write to a block + * device due to cephx restrictions. There will be no watch + * established on the header object, since a watch is a write. This + * means the metadata reported about this image (parents, snapshots, + * size, etc.) may become stale. This should not be used for + * long-running operations, unless you can be sure that one of these + * properties changing is safe. + * + * Attempting to write to a read-only image will return -EROFS. 
+ * + * @param io ioctx to determine the pool the image is in + * @param name image name + * @param image where to store newly opened image handle + * @param snap_name name of snapshot to open at, or NULL for no snapshot + * @returns 0 on success, negative error code on failure + */ +CEPH_RBD_API int rbd_open_read_only(rados_ioctx_t io, const char *name, + rbd_image_t *image, const char *snap_name); +CEPH_RBD_API int rbd_open_by_id_read_only(rados_ioctx_t io, const char *id, + rbd_image_t *image, const char *snap_name); +CEPH_RBD_API int rbd_aio_open_read_only(rados_ioctx_t io, const char *name, + rbd_image_t *image, const char *snap_name, + rbd_completion_t c); +CEPH_RBD_API int rbd_aio_open_by_id_read_only(rados_ioctx_t io, const char *id, + rbd_image_t *image, const char *snap_name, + rbd_completion_t c); +CEPH_RBD_API int rbd_close(rbd_image_t image); +CEPH_RBD_API int rbd_aio_close(rbd_image_t image, rbd_completion_t c); +CEPH_RBD_API int rbd_resize(rbd_image_t image, uint64_t size); +CEPH_RBD_API int rbd_resize2(rbd_image_t image, uint64_t size, bool allow_shrink, + librbd_progress_fn_t cb, void *cbdata); +CEPH_RBD_API int rbd_resize_with_progress(rbd_image_t image, uint64_t size, + librbd_progress_fn_t cb, void *cbdata); +CEPH_RBD_API int rbd_stat(rbd_image_t image, rbd_image_info_t *info, + size_t infosize); +CEPH_RBD_API int rbd_get_old_format(rbd_image_t image, uint8_t *old); +CEPH_RBD_API int rbd_get_size(rbd_image_t image, uint64_t *size); +CEPH_RBD_API int rbd_get_features(rbd_image_t image, uint64_t *features); +CEPH_RBD_API int rbd_update_features(rbd_image_t image, uint64_t features, + uint8_t enabled); +CEPH_RBD_API int rbd_get_op_features(rbd_image_t image, uint64_t *op_features); +CEPH_RBD_API int rbd_get_stripe_unit(rbd_image_t image, uint64_t *stripe_unit); +CEPH_RBD_API int rbd_get_stripe_count(rbd_image_t image, + uint64_t *stripe_count); + +CEPH_RBD_API int rbd_get_create_timestamp(rbd_image_t image, + struct timespec *timestamp); 
+CEPH_RBD_API int rbd_get_access_timestamp(rbd_image_t image, + struct timespec *timestamp); +CEPH_RBD_API int rbd_get_modify_timestamp(rbd_image_t image, + struct timespec *timestamp); + +CEPH_RBD_API int rbd_get_overlap(rbd_image_t image, uint64_t *overlap); +CEPH_RBD_API int rbd_get_name(rbd_image_t image, char *name, size_t *name_len); +CEPH_RBD_API int rbd_get_id(rbd_image_t image, char *id, size_t id_len); +CEPH_RBD_API int rbd_get_block_name_prefix(rbd_image_t image, + char *prefix, size_t prefix_len); +CEPH_RBD_API int64_t rbd_get_data_pool_id(rbd_image_t image); + +CEPH_RBD_API int rbd_get_parent_info(rbd_image_t image, + char *parent_poolname, size_t ppoolnamelen, + char *parent_name, size_t pnamelen, + char *parent_snapname, + size_t psnapnamelen) + __attribute__((deprecated)); +CEPH_RBD_API int rbd_get_parent_info2(rbd_image_t image, + char *parent_poolname, + size_t ppoolnamelen, + char *parent_name, size_t pnamelen, + char *parent_id, size_t pidlen, + char *parent_snapname, + size_t psnapnamelen) + __attribute__((deprecated)); +CEPH_RBD_API int rbd_get_parent(rbd_image_t image, + rbd_linked_image_spec_t *parent_image, + rbd_snap_spec_t *parent_snap); + +CEPH_RBD_API int rbd_get_flags(rbd_image_t image, uint64_t *flags); +CEPH_RBD_API int rbd_get_group(rbd_image_t image, rbd_group_info_t *group_info, + size_t group_info_size); +CEPH_RBD_API int rbd_set_image_notification(rbd_image_t image, int fd, int type); + +/* exclusive lock feature */ +CEPH_RBD_API int rbd_is_exclusive_lock_owner(rbd_image_t image, int *is_owner); +CEPH_RBD_API int rbd_lock_acquire(rbd_image_t image, rbd_lock_mode_t lock_mode); +CEPH_RBD_API int rbd_lock_release(rbd_image_t image); +CEPH_RBD_API int rbd_lock_get_owners(rbd_image_t image, + rbd_lock_mode_t *lock_mode, + char **lock_owners, + size_t *max_lock_owners); +CEPH_RBD_API void rbd_lock_get_owners_cleanup(char **lock_owners, + size_t lock_owner_count); +CEPH_RBD_API int rbd_lock_break(rbd_image_t image, rbd_lock_mode_t 
lock_mode, + const char *lock_owner); + +/* object map feature */ +CEPH_RBD_API int rbd_rebuild_object_map(rbd_image_t image, + librbd_progress_fn_t cb, void *cbdata); + +CEPH_RBD_API int rbd_copy(rbd_image_t image, rados_ioctx_t dest_io_ctx, + const char *destname); +CEPH_RBD_API int rbd_copy2(rbd_image_t src, rbd_image_t dest); +CEPH_RBD_API int rbd_copy3(rbd_image_t src, rados_ioctx_t dest_io_ctx, + const char *destname, rbd_image_options_t dest_opts); +CEPH_RBD_API int rbd_copy4(rbd_image_t src, rados_ioctx_t dest_io_ctx, + const char *destname, rbd_image_options_t dest_opts, + size_t sparse_size); +CEPH_RBD_API int rbd_copy_with_progress(rbd_image_t image, rados_ioctx_t dest_p, + const char *destname, + librbd_progress_fn_t cb, void *cbdata); +CEPH_RBD_API int rbd_copy_with_progress2(rbd_image_t src, rbd_image_t dest, + librbd_progress_fn_t cb, void *cbdata); +CEPH_RBD_API int rbd_copy_with_progress3(rbd_image_t image, + rados_ioctx_t dest_p, + const char *destname, + rbd_image_options_t dest_opts, + librbd_progress_fn_t cb, void *cbdata); +CEPH_RBD_API int rbd_copy_with_progress4(rbd_image_t image, + rados_ioctx_t dest_p, + const char *destname, + rbd_image_options_t dest_opts, + librbd_progress_fn_t cb, void *cbdata, + size_t sparse_size); + +/* deep copy */ +CEPH_RBD_API int rbd_deep_copy(rbd_image_t src, rados_ioctx_t dest_io_ctx, + const char *destname, + rbd_image_options_t dest_opts); +CEPH_RBD_API int rbd_deep_copy_with_progress(rbd_image_t image, + rados_ioctx_t dest_io_ctx, + const char *destname, + rbd_image_options_t dest_opts, + librbd_progress_fn_t cb, + void *cbdata); + +/* snapshots */ +CEPH_RBD_API int rbd_snap_list(rbd_image_t image, rbd_snap_info_t *snaps, + int *max_snaps); +CEPH_RBD_API void rbd_snap_list_end(rbd_snap_info_t *snaps); +CEPH_RBD_API int rbd_snap_create(rbd_image_t image, const char *snapname); +CEPH_RBD_API int rbd_snap_remove(rbd_image_t image, const char *snapname); +CEPH_RBD_API int rbd_snap_remove2(rbd_image_t image, 
const char *snap_name, + uint32_t flags, librbd_progress_fn_t cb, + void *cbdata); +CEPH_RBD_API int rbd_snap_remove_by_id(rbd_image_t image, uint64_t snap_id); +CEPH_RBD_API int rbd_snap_rollback(rbd_image_t image, const char *snapname); +CEPH_RBD_API int rbd_snap_rollback_with_progress(rbd_image_t image, + const char *snapname, + librbd_progress_fn_t cb, + void *cbdata); +CEPH_RBD_API int rbd_snap_rename(rbd_image_t image, const char *snapname, + const char* dstsnapsname); +/** + * Prevent a snapshot from being deleted until it is unprotected. + * + * @param snap_name which snapshot to protect + * @returns 0 on success, negative error code on failure + * @returns -EBUSY if snap is already protected + */ +CEPH_RBD_API int rbd_snap_protect(rbd_image_t image, const char *snap_name); +/** + * Allow a snapshot to be deleted. + * + * @param snap_name which snapshot to unprotect + * @returns 0 on success, negative error code on failure + * @returns -EINVAL if snap is not protected + */ +CEPH_RBD_API int rbd_snap_unprotect(rbd_image_t image, const char *snap_name); +/** + * Determine whether a snapshot is protected. + * + * @param snap_name which snapshot to query + * @param is_protected where to store the result (0 or 1) + * @returns 0 on success, negative error code on failure + */ +CEPH_RBD_API int rbd_snap_is_protected(rbd_image_t image, const char *snap_name, + int *is_protected); +/** + * Get the current snapshot limit for an image. If no limit is set, + * UINT64_MAX is returned. + * + * @param limit pointer where the limit will be stored on success + * @returns 0 on success, negative error code on failure + */ +CEPH_RBD_API int rbd_snap_get_limit(rbd_image_t image, uint64_t *limit); + +/** + * Set a limit for the number of snapshots that may be taken of an image. + * + * @param limit the maximum number of snapshots allowed in the future. 
+ * @returns 0 on success, negative error code on failure + */ +CEPH_RBD_API int rbd_snap_set_limit(rbd_image_t image, uint64_t limit); + +/** + * Get the timestamp of a snapshot for an image. + * + * @param snap_id the snap id of a snapshot of input image. + * @param timestamp the timestamp of input snapshot. + * @returns 0 on success, negative error code on failure + */ +CEPH_RBD_API int rbd_snap_get_timestamp(rbd_image_t image, uint64_t snap_id, struct timespec *timestamp); + +CEPH_RBD_API int rbd_snap_set(rbd_image_t image, const char *snapname); +CEPH_RBD_API int rbd_snap_set_by_id(rbd_image_t image, uint64_t snap_id); + +CEPH_RBD_API int rbd_snap_get_namespace_type(rbd_image_t image, + uint64_t snap_id, + rbd_snap_namespace_type_t *namespace_type); +CEPH_RBD_API int rbd_snap_get_group_namespace(rbd_image_t image, + uint64_t snap_id, + rbd_snap_group_namespace_t *group_snap, + size_t group_snap_size); +CEPH_RBD_API int rbd_snap_group_namespace_cleanup(rbd_snap_group_namespace_t *group_snap, + size_t group_snap_size); +CEPH_RBD_API int rbd_snap_get_trash_namespace(rbd_image_t image, + uint64_t snap_id, + char* original_name, + size_t max_length); + +CEPH_RBD_API int rbd_flatten(rbd_image_t image); + +CEPH_RBD_API int rbd_flatten_with_progress(rbd_image_t image, + librbd_progress_fn_t cb, + void *cbdata); + +CEPH_RBD_API int rbd_sparsify(rbd_image_t image, size_t sparse_size); + +CEPH_RBD_API int rbd_sparsify_with_progress(rbd_image_t image, + size_t sparse_size, + librbd_progress_fn_t cb, + void *cbdata); + +/** + * List all images that are cloned from the image at the + * snapshot that is set via rbd_snap_set(). + * + * This iterates over all pools, so it should be run by a user with + * read access to all of them. pools_len and images_len are filled in + * with the number of bytes put into the pools and images buffers. + * + * If the provided buffers are too short, the required lengths are + * still filled in, but the data is not and -ERANGE is returned. 
+ * Otherwise, the buffers are filled with the pool and image names + * of the children, with a '\0' after each. + * + * @param image which image (and implicitly snapshot) to list clones of + * @param pools buffer in which to store pool names + * @param pools_len number of bytes in pools buffer + * @param images buffer in which to store image names + * @param images_len number of bytes in images buffer + * @returns number of children on success, negative error code on failure + * @returns -ERANGE if either buffer is too short + */ +CEPH_RBD_API ssize_t rbd_list_children(rbd_image_t image, char *pools, + size_t *pools_len, char *images, + size_t *images_len) + __attribute__((deprecated)); +CEPH_RBD_API int rbd_list_children2(rbd_image_t image, + rbd_child_info_t *children, + int *max_children) + __attribute__((deprecated)); +CEPH_RBD_API void rbd_list_child_cleanup(rbd_child_info_t *child) + __attribute__((deprecated)); +CEPH_RBD_API void rbd_list_children_cleanup(rbd_child_info_t *children, + size_t num_children) + __attribute__((deprecated)); + +CEPH_RBD_API int rbd_list_children3(rbd_image_t image, + rbd_linked_image_spec_t *images, + size_t *max_images); + +CEPH_RBD_API int rbd_list_descendants(rbd_image_t image, + rbd_linked_image_spec_t *images, + size_t *max_images); + +/** + * @defgroup librbd_h_locking Advisory Locking + * + * An rbd image may be locked exclusively, or shared, to facilitate + * e.g. live migration where the image may be open in two places at once. + * These locks are intended to guard against more than one client + * writing to an image without coordination. They don't need to + * be used for snapshots, since snapshots are read-only. + * + * Currently locks only guard against locks being acquired. + * They do not prevent anything else. + * + * A locker is identified by the internal rados client id of the + * holder and a user-defined cookie. This (client id, cookie) pair + * must be unique for each locker. 
+ * + * A shared lock also has a user-defined tag associated with it. Each + * additional shared lock must specify the same tag or lock + * acquisition will fail. This can be used by e.g. groups of hosts + * using a clustered filesystem on top of an rbd image to make sure + * they're accessing the correct image. + * + * @{ + */ +/** + * List clients that have locked the image and information about the lock. + * + * The number of bytes required in each buffer is put in the + * corresponding size out parameter. If any of the provided buffers + * are too short, -ERANGE is returned after these sizes are filled in. + * + * @param exclusive where to store whether the lock is exclusive (1) or shared (0) + * @param tag where to store the tag associated with the image + * @param tag_len number of bytes in tag buffer + * @param clients buffer in which locker clients are stored, separated by '\0' + * @param clients_len number of bytes in the clients buffer + * @param cookies buffer in which locker cookies are stored, separated by '\0' + * @param cookies_len number of bytes in the cookies buffer + * @param addrs buffer in which locker addresses are stored, separated by '\0' + * @param addrs_len number of bytes in the addrs buffer + * @returns number of lockers on success, negative error code on failure + * @returns -ERANGE if any of the buffers are too short + */ +CEPH_RBD_API ssize_t rbd_list_lockers(rbd_image_t image, int *exclusive, + char *tag, size_t *tag_len, + char *clients, size_t *clients_len, + char *cookies, size_t *cookies_len, + char *addrs, size_t *addrs_len); + +/** + * Take an exclusive lock on the image. 
+ * + * @param image the image to lock + * @param cookie user-defined identifier for this instance of the lock + * @returns 0 on success, negative error code on failure + * @returns -EBUSY if the lock is already held by another (client, cookie) pair + * @returns -EEXIST if the lock is already held by the same (client, cookie) pair + */ +CEPH_RBD_API int rbd_lock_exclusive(rbd_image_t image, const char *cookie); + +/** + * Take a shared lock on the image. + * + * Other clients may also take a shared lock, as long as they use the + * same tag. + * + * @param image the image to lock + * @param cookie user-defined identifier for this instance of the lock + * @param tag user-defined identifier for this shared use of the lock + * @returns 0 on success, negative error code on failure + * @returns -EBUSY if the lock is already held by another (client, cookie) pair + * @returns -EEXIST if the lock is already held by the same (client, cookie) pair + */ +CEPH_RBD_API int rbd_lock_shared(rbd_image_t image, const char *cookie, + const char *tag); + +/** + * Release a shared or exclusive lock on the image. + * + * @param image the image to unlock + * @param cookie user-defined identifier for the instance of the lock + * @returns 0 on success, negative error code on failure + * @returns -ENOENT if the lock is not held by the specified (client, cookie) pair + */ +CEPH_RBD_API int rbd_unlock(rbd_image_t image, const char *cookie); + +/** + * Release a shared or exclusive lock that was taken by the specified client. 
+ * + * @param image the image to unlock + * @param client the entity holding the lock (as given by rbd_list_lockers()) + * @param cookie user-defined identifier for the instance of the lock to break + * @returns 0 on success, negative error code on failure + * @returns -ENOENT if the lock is not held by the specified (client, cookie) pair + */ +CEPH_RBD_API int rbd_break_lock(rbd_image_t image, const char *client, + const char *cookie); + +/** @} locking */ + +/* I/O */ +CEPH_RBD_API ssize_t rbd_read(rbd_image_t image, uint64_t ofs, size_t len, + char *buf); +/* + * @param op_flags: see librados.h constants beginning with LIBRADOS_OP_FLAG + */ +CEPH_RBD_API ssize_t rbd_read2(rbd_image_t image, uint64_t ofs, size_t len, + char *buf, int op_flags); +/* DEPRECATED; use rbd_read_iterate2 */ +CEPH_RBD_API int64_t rbd_read_iterate(rbd_image_t image, uint64_t ofs, size_t len, + int (*cb)(uint64_t, size_t, const char *, void *), + void *arg); + +/** + * iterate read over an image + * + * Reads each region of the image and calls the callback. If the + * buffer pointer passed to the callback is NULL, the given extent is + * defined to be zeros (a hole). Normally the granularity for the + * callback is the image stripe size. + * + * @param image image to read + * @param ofs offset to start from + * @param len bytes of source image to cover + * @param cb callback for each region + * @param arg opaque pointer for the callback (presumably forwarded as + * its final void * argument -- confirm against the implementation) + * @returns 0 on success, error otherwise + */ +CEPH_RBD_API int rbd_read_iterate2(rbd_image_t image, uint64_t ofs, uint64_t len, + int (*cb)(uint64_t, size_t, const char *, void *), + void *arg); +/** + * get difference between two versions of an image + * + * This will return the differences between two versions of an image + * via a callback, which gets the offset and length and a flag + * indicating whether the extent exists (1), or is known/defined to + * be zeros (a hole, 0). 
If the source snapshot name is NULL, we + * interpret that as the beginning of time and return all allocated + * regions of the image. The end version is whatever is currently + * selected for the image handle (either a snapshot or the writeable + * head). + * + * @param image image handle whose currently selected version is the end version + * @param fromsnapname start snapshot name, or NULL + * @param ofs start offset + * @param len len in bytes of region to report on + * @param include_parent 1 if full history diff should include parent + * (rbd_diff_iterate2 only) + * @param whole_object 1 if diff extents should cover whole object + * (rbd_diff_iterate2 only) + * @param cb callback to call for each allocated region + * @param arg argument to pass to the callback + * @returns 0 on success, or negative error code on error + */ +CEPH_RBD_API int rbd_diff_iterate(rbd_image_t image, + const char *fromsnapname, + uint64_t ofs, uint64_t len, + int (*cb)(uint64_t, size_t, int, void *), + void *arg); +CEPH_RBD_API int rbd_diff_iterate2(rbd_image_t image, + const char *fromsnapname, + uint64_t ofs, uint64_t len, + uint8_t include_parent, uint8_t whole_object, + int (*cb)(uint64_t, size_t, int, void *), + void *arg); +CEPH_RBD_API ssize_t rbd_write(rbd_image_t image, uint64_t ofs, size_t len, + const char *buf); +/* + * @param op_flags: see librados.h constants beginning with LIBRADOS_OP_FLAG + */ +CEPH_RBD_API ssize_t rbd_write2(rbd_image_t image, uint64_t ofs, size_t len, + const char *buf, int op_flags); +CEPH_RBD_API int rbd_discard(rbd_image_t image, uint64_t ofs, uint64_t len); +CEPH_RBD_API ssize_t rbd_writesame(rbd_image_t image, uint64_t ofs, size_t len, + const char *buf, size_t data_len, + int op_flags); +CEPH_RBD_API ssize_t rbd_write_zeroes(rbd_image_t image, uint64_t ofs, + size_t len, int zero_flags, + int op_flags); +CEPH_RBD_API ssize_t rbd_compare_and_write(rbd_image_t image, uint64_t ofs, + size_t len, const char *cmp_buf, + const char *buf, + uint64_t *mismatch_off, + int op_flags); + +CEPH_RBD_API int rbd_aio_write(rbd_image_t image, uint64_t off, size_t len, + const char *buf, 
rbd_completion_t c); + +/* + * @param op_flags: see librados.h constants beginning with LIBRADOS_OP_FLAG + */ +CEPH_RBD_API int rbd_aio_write2(rbd_image_t image, uint64_t off, size_t len, + const char *buf, rbd_completion_t c, + int op_flags); +CEPH_RBD_API int rbd_aio_writev(rbd_image_t image, const struct iovec *iov, + int iovcnt, uint64_t off, rbd_completion_t c); +CEPH_RBD_API int rbd_aio_read(rbd_image_t image, uint64_t off, size_t len, + char *buf, rbd_completion_t c); +/* + * @param op_flags: see librados.h constants beginning with LIBRADOS_OP_FLAG + */ +CEPH_RBD_API int rbd_aio_read2(rbd_image_t image, uint64_t off, size_t len, + char *buf, rbd_completion_t c, int op_flags); +CEPH_RBD_API int rbd_aio_readv(rbd_image_t image, const struct iovec *iov, + int iovcnt, uint64_t off, rbd_completion_t c); +CEPH_RBD_API int rbd_aio_discard(rbd_image_t image, uint64_t off, uint64_t len, + rbd_completion_t c); +CEPH_RBD_API int rbd_aio_writesame(rbd_image_t image, uint64_t off, size_t len, + const char *buf, size_t data_len, + rbd_completion_t c, int op_flags); +CEPH_RBD_API int rbd_aio_write_zeroes(rbd_image_t image, uint64_t off, + size_t len, rbd_completion_t c, + int zero_flags, int op_flags); +CEPH_RBD_API ssize_t rbd_aio_compare_and_write(rbd_image_t image, + uint64_t off, size_t len, + const char *cmp_buf, + const char *buf, + rbd_completion_t c, + uint64_t *mismatch_off, + int op_flags); + +CEPH_RBD_API int rbd_aio_create_completion(void *cb_arg, + rbd_callback_t complete_cb, + rbd_completion_t *c); +CEPH_RBD_API int rbd_aio_is_complete(rbd_completion_t c); +CEPH_RBD_API int rbd_aio_wait_for_complete(rbd_completion_t c); +CEPH_RBD_API ssize_t rbd_aio_get_return_value(rbd_completion_t c); +CEPH_RBD_API void *rbd_aio_get_arg(rbd_completion_t c); +CEPH_RBD_API void rbd_aio_release(rbd_completion_t c); +CEPH_RBD_API int rbd_flush(rbd_image_t image); +/** + * Start a flush if caching is enabled. Get a callback when + * the currently pending writes are on disk. 
+ * + * @param image the image to flush writes to + * @param c what to call when flushing is complete + * @returns 0 on success, negative error code on failure + */ +CEPH_RBD_API int rbd_aio_flush(rbd_image_t image, rbd_completion_t c); + +/** + * Drop any cached data for an image + * + * @param image the image to invalidate cached data for + * @returns 0 on success, negative error code on failure + */ +CEPH_RBD_API int rbd_invalidate_cache(rbd_image_t image); + +CEPH_RBD_API int rbd_poll_io_events(rbd_image_t image, rbd_completion_t *comps, int numcomp); + +CEPH_RBD_API int rbd_metadata_get(rbd_image_t image, const char *key, char *value, size_t *val_len); +CEPH_RBD_API int rbd_metadata_set(rbd_image_t image, const char *key, const char *value); +CEPH_RBD_API int rbd_metadata_remove(rbd_image_t image, const char *key); +/** + * List all metadata entries associated with this image. + * + * This iterates over all metadata entries; key_len and vals_len are filled in + * with the number of bytes put into the keys and values buffers. + * + * If the provided buffers are too short, the required lengths are + * still filled in, but the data is not and -ERANGE is returned. + * Otherwise, the buffers are filled with the keys and values + * of the image, with a '\0' after each. 
+ * + * @param image the image whose metadata is listed + * @param start which key to begin listing after + * (use the empty string to start at the beginning) + * @param max the maximum number of keys to list (0 means no limit) + * @param keys buffer in which to store the keys + * @param key_len number of bytes in the keys buffer + * @param values buffer in which to store the values + * @param vals_len number of bytes in the values buffer + * @returns non-negative value on success, negative error code on failure + * (NOTE(review): earlier text said "number of children"; confirm + * whether success is 0 or an entry count) + * @returns -ERANGE if either buffer is too short + */ +CEPH_RBD_API int rbd_metadata_list(rbd_image_t image, const char *start, uint64_t max, + char *keys, size_t *key_len, char *values, size_t *vals_len); + +// RBD image mirroring support functions +CEPH_RBD_API int rbd_mirror_image_enable(rbd_image_t image); +CEPH_RBD_API int rbd_mirror_image_disable(rbd_image_t image, bool force); +CEPH_RBD_API int rbd_mirror_image_promote(rbd_image_t image, bool force); +CEPH_RBD_API int rbd_mirror_image_demote(rbd_image_t image); +CEPH_RBD_API int rbd_mirror_image_resync(rbd_image_t image); +CEPH_RBD_API int rbd_mirror_image_get_info(rbd_image_t image, + rbd_mirror_image_info_t *mirror_image_info, + size_t info_size); +CEPH_RBD_API int rbd_mirror_image_get_status(rbd_image_t image, + rbd_mirror_image_status_t *mirror_image_status, + size_t status_size); +CEPH_RBD_API int rbd_mirror_image_get_instance_id(rbd_image_t image, + char *instance_id, + size_t *id_max_length); +CEPH_RBD_API int rbd_aio_mirror_image_promote(rbd_image_t image, bool force, + rbd_completion_t c); +CEPH_RBD_API int rbd_aio_mirror_image_demote(rbd_image_t image, + rbd_completion_t c); +CEPH_RBD_API int rbd_aio_mirror_image_get_info(rbd_image_t image, + rbd_mirror_image_info_t *mirror_image_info, + size_t info_size, + rbd_completion_t c); +CEPH_RBD_API int rbd_aio_mirror_image_get_status(rbd_image_t image, + rbd_mirror_image_status_t *mirror_image_status, + size_t 
status_size, + rbd_completion_t c); + +// RBD groups support functions +CEPH_RBD_API int rbd_group_create(rados_ioctx_t p, const char *name); +CEPH_RBD_API int rbd_group_remove(rados_ioctx_t p, const char *name); +CEPH_RBD_API int rbd_group_list(rados_ioctx_t p, char *names, size_t *size); +CEPH_RBD_API int rbd_group_rename(rados_ioctx_t p, const char *src_name, + const char *dest_name); +CEPH_RBD_API int rbd_group_info_cleanup(rbd_group_info_t *group_info, + size_t group_info_size); + +/** + * Register an image metadata change watcher. + * + * @param image the image to watch + * @param handle where to store the internal id assigned to this watch + * @param watch_cb what to do when a notify is received on this image + * @param arg opaque value to pass to the callback + * @returns 0 on success, negative error code on failure + */ +CEPH_RBD_API int rbd_update_watch(rbd_image_t image, uint64_t *handle, + rbd_update_callback_t watch_cb, void *arg); + +/** + * Unregister an image watcher. + * + * @param image the image to unwatch + * @param handle which watch to unregister + * @returns 0 on success, negative error code on failure + */ +CEPH_RBD_API int rbd_update_unwatch(rbd_image_t image, uint64_t handle); + +/** + * List any watchers of an image. + * + * Watchers will be allocated and stored in the passed watchers array. If there + * are more watchers than max_watchers, -ERANGE will be returned and the number + * of watchers will be stored in max_watchers. + * + * The caller should call rbd_watchers_list_cleanup when finished with the list + * of watchers. + * + * @param image the image to list watchers for. + * @param watchers an array to store watchers in. + * @param max_watchers in: capacity of the watchers array; + * out: the number of watchers. + * @returns 0 on success, negative error code on failure. + * @returns -ERANGE if there are too many watchers for the passed array. + * @returns the number of watchers via the max_watchers out parameter. 
+ */ +CEPH_RBD_API int rbd_watchers_list(rbd_image_t image, + rbd_image_watcher_t *watchers, + size_t *max_watchers); + +CEPH_RBD_API void rbd_watchers_list_cleanup(rbd_image_watcher_t *watchers, + size_t num_watchers); + +CEPH_RBD_API int rbd_config_image_list(rbd_image_t image, + rbd_config_option_t *options, + int *max_options); +CEPH_RBD_API void rbd_config_image_list_cleanup(rbd_config_option_t *options, + int max_options); + +CEPH_RBD_API int rbd_group_image_add(rados_ioctx_t group_p, + const char *group_name, + rados_ioctx_t image_p, + const char *image_name); +CEPH_RBD_API int rbd_group_image_remove(rados_ioctx_t group_p, + const char *group_name, + rados_ioctx_t image_p, + const char *image_name); +CEPH_RBD_API int rbd_group_image_remove_by_id(rados_ioctx_t group_p, + const char *group_name, + rados_ioctx_t image_p, + const char *image_id); +CEPH_RBD_API int rbd_group_image_list(rados_ioctx_t group_p, + const char *group_name, + rbd_group_image_info_t *images, + size_t group_image_info_size, + size_t *num_entries); +CEPH_RBD_API int rbd_group_image_list_cleanup(rbd_group_image_info_t *images, + size_t group_image_info_size, + size_t num_entries); + +CEPH_RBD_API int rbd_group_snap_create(rados_ioctx_t group_p, + const char *group_name, + const char *snap_name); +CEPH_RBD_API int rbd_group_snap_remove(rados_ioctx_t group_p, + const char *group_name, + const char *snap_name); +CEPH_RBD_API int rbd_group_snap_rename(rados_ioctx_t group_p, + const char *group_name, + const char *old_snap_name, + const char *new_snap_name); +CEPH_RBD_API int rbd_group_snap_list(rados_ioctx_t group_p, + const char *group_name, + rbd_group_snap_info_t *snaps, + size_t group_snap_info_size, + size_t *num_entries); +CEPH_RBD_API int rbd_group_snap_list_cleanup(rbd_group_snap_info_t *snaps, + size_t group_snap_info_size, + size_t num_entries); +CEPH_RBD_API int rbd_group_snap_rollback(rados_ioctx_t group_p, + const char *group_name, + const char *snap_name); +CEPH_RBD_API int 
rbd_group_snap_rollback_with_progress(rados_ioctx_t group_p, + const char *group_name, + const char *snap_name, + librbd_progress_fn_t cb, + void *cbdata); + +CEPH_RBD_API int rbd_namespace_create(rados_ioctx_t io, + const char *namespace_name); +CEPH_RBD_API int rbd_namespace_remove(rados_ioctx_t io, + const char *namespace_name); +CEPH_RBD_API int rbd_namespace_list(rados_ioctx_t io, char *namespace_names, + size_t *size); +CEPH_RBD_API int rbd_namespace_exists(rados_ioctx_t io, + const char *namespace_name, + bool *exists); + +CEPH_RBD_API int rbd_pool_init(rados_ioctx_t io, bool force); + +CEPH_RBD_API void rbd_pool_stats_create(rbd_pool_stats_t *stats); +CEPH_RBD_API void rbd_pool_stats_destroy(rbd_pool_stats_t stats); +CEPH_RBD_API int rbd_pool_stats_option_add_uint64(rbd_pool_stats_t stats, + int stat_option, + uint64_t* stat_val); +CEPH_RBD_API int rbd_pool_stats_get(rados_ioctx_t io, rbd_pool_stats_t stats); + +#ifdef __cplusplus +} +#endif + +#endif diff --git a/src/include/rbd/librbd.hpp b/src/include/rbd/librbd.hpp new file mode 100644 index 00000000..646c6bb3 --- /dev/null +++ b/src/include/rbd/librbd.hpp @@ -0,0 +1,686 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2011 New Dream Network + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. 
+ * + */ + +#ifndef __LIBRBD_HPP +#define __LIBRBD_HPP + +#include <string> +#include <list> +#include <map> +#include <vector> +#include "../rados/buffer.h" +#include "../rados/librados.hpp" +#include "librbd.h" + +namespace librbd { + + using librados::IoCtx; + + class Image; + class ImageOptions; + class PoolStats; + typedef void *image_ctx_t; + typedef void *completion_t; + typedef void (*callback_t)(completion_t cb, void *arg); + + typedef struct { + std::string id; + std::string name; + } image_spec_t; + + typedef struct { + int64_t pool_id; + std::string pool_name; + std::string pool_namespace; + std::string image_id; + std::string image_name; + bool trash; + } linked_image_spec_t; + + typedef rbd_snap_namespace_type_t snap_namespace_type_t; + + typedef struct { + uint64_t id; + snap_namespace_type_t namespace_type; + std::string name; + } snap_spec_t; + + typedef struct { + uint64_t id; + uint64_t size; + std::string name; + } snap_info_t; + + typedef struct { + int64_t group_pool; + std::string group_name; + std::string group_snap_name; + } snap_group_namespace_t; + + typedef struct { + std::string client; + std::string cookie; + std::string address; + } locker_t; + + typedef rbd_mirror_peer_direction_t mirror_peer_direction_t; + + typedef struct { + std::string uuid; + std::string cluster_name; + std::string client_name; + } mirror_peer_t; + + typedef rbd_mirror_image_state_t mirror_image_state_t; + + typedef struct { + std::string global_id; + mirror_image_state_t state; + bool primary; + } mirror_image_info_t; + + typedef rbd_mirror_image_status_state_t mirror_image_status_state_t; + + typedef struct { + std::string name; + mirror_image_info_t info; + mirror_image_status_state_t state; + std::string description; + time_t last_update; + bool up; + } mirror_image_status_t; + + typedef rbd_group_image_state_t group_image_state_t; + + typedef struct { + std::string name; + int64_t pool; + group_image_state_t state; + } group_image_info_t; + + typedef struct 
{ + std::string name; + int64_t pool; + } group_info_t; + + typedef rbd_group_snap_state_t group_snap_state_t; + + typedef struct { + std::string name; + group_snap_state_t state; + } group_snap_info_t; + + typedef rbd_image_info_t image_info_t; + + class CEPH_RBD_API ProgressContext + { + public: + virtual ~ProgressContext(); + virtual int update_progress(uint64_t offset, uint64_t total) = 0; + }; + + typedef struct { + std::string id; + std::string name; + rbd_trash_image_source_t source; + time_t deletion_time; + time_t deferment_end_time; + } trash_image_info_t; + + typedef struct { + std::string pool_name; + std::string image_name; + std::string image_id; + bool trash; + } child_info_t; + + typedef struct { + std::string addr; + int64_t id; + uint64_t cookie; + } image_watcher_t; + + typedef rbd_image_migration_state_t image_migration_state_t; + + typedef struct { + int64_t source_pool_id; + std::string source_pool_namespace; + std::string source_image_name; + std::string source_image_id; + int64_t dest_pool_id; + std::string dest_pool_namespace; + std::string dest_image_name; + std::string dest_image_id; + image_migration_state_t state; + std::string state_description; + } image_migration_status_t; + + typedef rbd_config_source_t config_source_t; + + typedef struct { + std::string name; + std::string value; + config_source_t source; + } config_option_t; + +class CEPH_RBD_API RBD +{ +public: + RBD(); + ~RBD(); + + // This must be dynamically allocated with new, and + // must be released with release(). + // Do not use delete. 
+ struct AioCompletion { + void *pc; + AioCompletion(void *cb_arg, callback_t complete_cb); + bool is_complete(); + int wait_for_complete(); + ssize_t get_return_value(); + void *get_arg(); + void release(); + }; + + void version(int *major, int *minor, int *extra); + + int open(IoCtx& io_ctx, Image& image, const char *name); + int open(IoCtx& io_ctx, Image& image, const char *name, const char *snapname); + int open_by_id(IoCtx& io_ctx, Image& image, const char *id); + int open_by_id(IoCtx& io_ctx, Image& image, const char *id, const char *snapname); + int aio_open(IoCtx& io_ctx, Image& image, const char *name, + const char *snapname, RBD::AioCompletion *c); + int aio_open_by_id(IoCtx& io_ctx, Image& image, const char *id, + const char *snapname, RBD::AioCompletion *c); + // see librbd.h + int open_read_only(IoCtx& io_ctx, Image& image, const char *name, + const char *snapname); + int open_by_id_read_only(IoCtx& io_ctx, Image& image, const char *id, + const char *snapname); + int aio_open_read_only(IoCtx& io_ctx, Image& image, const char *name, + const char *snapname, RBD::AioCompletion *c); + int aio_open_by_id_read_only(IoCtx& io_ctx, Image& image, const char *id, + const char *snapname, RBD::AioCompletion *c); + + int list(IoCtx& io_ctx, std::vector<std::string>& names) + __attribute__((deprecated)); + int list2(IoCtx& io_ctx, std::vector<image_spec_t>* images); + + int create(IoCtx& io_ctx, const char *name, uint64_t size, int *order); + int create2(IoCtx& io_ctx, const char *name, uint64_t size, + uint64_t features, int *order); + int create3(IoCtx& io_ctx, const char *name, uint64_t size, + uint64_t features, int *order, + uint64_t stripe_unit, uint64_t stripe_count); + int create4(IoCtx& io_ctx, const char *name, uint64_t size, + ImageOptions& opts); + int clone(IoCtx& p_ioctx, const char *p_name, const char *p_snapname, + IoCtx& c_ioctx, const char *c_name, uint64_t features, + int *c_order); + int clone2(IoCtx& p_ioctx, const char *p_name, const char 
*p_snapname, + IoCtx& c_ioctx, const char *c_name, uint64_t features, + int *c_order, uint64_t stripe_unit, int stripe_count); + int clone3(IoCtx& p_ioctx, const char *p_name, const char *p_snapname, + IoCtx& c_ioctx, const char *c_name, ImageOptions& opts); + int remove(IoCtx& io_ctx, const char *name); + int remove_with_progress(IoCtx& io_ctx, const char *name, ProgressContext& pctx); + int rename(IoCtx& src_io_ctx, const char *srcname, const char *destname); + + int trash_move(IoCtx &io_ctx, const char *name, uint64_t delay); + int trash_get(IoCtx &io_ctx, const char *id, trash_image_info_t *info); + int trash_list(IoCtx &io_ctx, std::vector<trash_image_info_t> &entries); + int trash_purge(IoCtx &io_ctx, time_t expire_ts, float threshold); + int trash_purge_with_progress(IoCtx &io_ctx, time_t expire_ts, float threshold, + ProgressContext &pctx); + int trash_remove(IoCtx &io_ctx, const char *image_id, bool force); + int trash_remove_with_progress(IoCtx &io_ctx, const char *image_id, + bool force, ProgressContext &pctx); + int trash_restore(IoCtx &io_ctx, const char *id, const char *name); + + // Migration + int migration_prepare(IoCtx& io_ctx, const char *image_name, + IoCtx& dest_io_ctx, const char *dest_image_name, + ImageOptions& opts); + int migration_execute(IoCtx& io_ctx, const char *image_name); + int migration_execute_with_progress(IoCtx& io_ctx, const char *image_name, + ProgressContext &prog_ctx); + int migration_abort(IoCtx& io_ctx, const char *image_name); + int migration_abort_with_progress(IoCtx& io_ctx, const char *image_name, + ProgressContext &prog_ctx); + int migration_commit(IoCtx& io_ctx, const char *image_name); + int migration_commit_with_progress(IoCtx& io_ctx, const char *image_name, + ProgressContext &prog_ctx); + int migration_status(IoCtx& io_ctx, const char *image_name, + image_migration_status_t *status, size_t status_size); + + // RBD pool mirroring support functions + int mirror_site_name_get(librados::Rados& rados, std::string* 
site_name); + int mirror_site_name_set(librados::Rados& rados, + const std::string& site_name); + + int mirror_mode_get(IoCtx& io_ctx, rbd_mirror_mode_t *mirror_mode); + int mirror_mode_set(IoCtx& io_ctx, rbd_mirror_mode_t mirror_mode); + + int mirror_peer_bootstrap_create(IoCtx& io_ctx, std::string* token); + int mirror_peer_bootstrap_import(IoCtx& io_ctx, + mirror_peer_direction_t direction, + const std::string &token); + + int mirror_peer_add(IoCtx& io_ctx, std::string *uuid, + const std::string &cluster_name, + const std::string &client_name); + int mirror_peer_remove(IoCtx& io_ctx, const std::string &uuid); + int mirror_peer_list(IoCtx& io_ctx, std::vector<mirror_peer_t> *peers); + int mirror_peer_set_client(IoCtx& io_ctx, const std::string &uuid, + const std::string &client_name); + int mirror_peer_set_cluster(IoCtx& io_ctx, const std::string &uuid, + const std::string &cluster_name); + int mirror_peer_get_attributes( + IoCtx& io_ctx, const std::string &uuid, + std::map<std::string, std::string> *key_vals); + int mirror_peer_set_attributes( + IoCtx& io_ctx, const std::string &uuid, + const std::map<std::string, std::string>& key_vals); + + int mirror_image_status_list(IoCtx& io_ctx, const std::string &start_id, + size_t max, std::map<std::string, mirror_image_status_t> *images); + int mirror_image_status_summary(IoCtx& io_ctx, + std::map<mirror_image_status_state_t, int> *states); + int mirror_image_instance_id_list(IoCtx& io_ctx, const std::string &start_id, + size_t max, std::map<std::string, std::string> *sevice_ids); + + // RBD groups support functions + int group_create(IoCtx& io_ctx, const char *group_name); + int group_remove(IoCtx& io_ctx, const char *group_name); + int group_list(IoCtx& io_ctx, std::vector<std::string> *names); + int group_rename(IoCtx& io_ctx, const char *src_group_name, + const char *dest_group_name); + + int group_image_add(IoCtx& io_ctx, const char *group_name, + IoCtx& image_io_ctx, const char *image_name); + int 
group_image_remove(IoCtx& io_ctx, const char *group_name, + IoCtx& image_io_ctx, const char *image_name); + int group_image_remove_by_id(IoCtx& io_ctx, const char *group_name, + IoCtx& image_io_ctx, const char *image_id); + int group_image_list(IoCtx& io_ctx, const char *group_name, + std::vector<group_image_info_t> *images, + size_t group_image_info_size); + + int group_snap_create(IoCtx& io_ctx, const char *group_name, + const char *snap_name); + int group_snap_remove(IoCtx& io_ctx, const char *group_name, + const char *snap_name); + int group_snap_rename(IoCtx& group_ioctx, const char *group_name, + const char *old_snap_name, const char *new_snap_name); + int group_snap_list(IoCtx& group_ioctx, const char *group_name, + std::vector<group_snap_info_t> *snaps, + size_t group_snap_info_size); + int group_snap_rollback(IoCtx& io_ctx, const char *group_name, + const char *snap_name); + int group_snap_rollback_with_progress(IoCtx& io_ctx, const char *group_name, + const char *snap_name, + ProgressContext& pctx); + + int namespace_create(IoCtx& ioctx, const char *namespace_name); + int namespace_remove(IoCtx& ioctx, const char *namespace_name); + int namespace_list(IoCtx& io_ctx, std::vector<std::string>* namespace_names); + int namespace_exists(IoCtx& io_ctx, const char *namespace_name, bool *exists); + + int pool_init(IoCtx& io_ctx, bool force); + int pool_stats_get(IoCtx& io_ctx, PoolStats *pool_stats); + + int pool_metadata_get(IoCtx &io_ctx, const std::string &key, + std::string *value); + int pool_metadata_set(IoCtx &io_ctx, const std::string &key, + const std::string &value); + int pool_metadata_remove(IoCtx &io_ctx, const std::string &key); + int pool_metadata_list(IoCtx &io_ctx, const std::string &start, uint64_t max, + std::map<std::string, ceph::bufferlist> *pairs); + + int config_list(IoCtx& io_ctx, std::vector<config_option_t> *options); + +private: + /* We don't allow assignment or copying */ + RBD(const RBD& rhs); + const RBD& operator=(const RBD& rhs); 
+}; + +class CEPH_RBD_API ImageOptions { +public: + ImageOptions(); + ImageOptions(rbd_image_options_t opts); + ImageOptions(const ImageOptions &imgopts); + ~ImageOptions(); + + int set(int optname, const std::string& optval); + int set(int optname, uint64_t optval); + int get(int optname, std::string* optval) const; + int get(int optname, uint64_t* optval) const; + int is_set(int optname, bool* is_set); + int unset(int optname); + void clear(); + bool empty() const; + +private: + friend class RBD; + friend class Image; + + rbd_image_options_t opts; +}; + +class CEPH_RBD_API PoolStats { +public: + PoolStats(); + ~PoolStats(); + + PoolStats(const PoolStats&) = delete; + PoolStats& operator=(const PoolStats&) = delete; + + int add(rbd_pool_stat_option_t option, uint64_t* opt_val); + +private: + friend class RBD; + + rbd_pool_stats_t pool_stats; +}; + +class CEPH_RBD_API UpdateWatchCtx { +public: + virtual ~UpdateWatchCtx() {} + /** + * Callback activated when we receive a notify event. + */ + virtual void handle_notify() = 0; +}; + +class CEPH_RBD_API Image +{ +public: + Image(); + ~Image(); + + int close(); + int aio_close(RBD::AioCompletion *c); + + int resize(uint64_t size); + int resize2(uint64_t size, bool allow_shrink, ProgressContext& pctx); + int resize_with_progress(uint64_t size, ProgressContext& pctx); + int stat(image_info_t &info, size_t infosize); + int get_name(std::string *name); + int get_id(std::string *id); + std::string get_block_name_prefix(); + int64_t get_data_pool_id(); + int parent_info(std::string *parent_poolname, std::string *parent_name, + std::string *parent_snapname) + __attribute__((deprecated)); + int parent_info2(std::string *parent_poolname, std::string *parent_name, + std::string *parent_id, std::string *parent_snapname) + __attribute__((deprecated)); + int get_parent(linked_image_spec_t *parent_image, snap_spec_t *parent_snap); + + int old_format(uint8_t *old); + int size(uint64_t *size); + int get_group(group_info_t *group_info, 
size_t group_info_size); + int features(uint64_t *features); + int update_features(uint64_t features, bool enabled); + int get_op_features(uint64_t *op_features); + int overlap(uint64_t *overlap); + int get_flags(uint64_t *flags); + int set_image_notification(int fd, int type); + + /* exclusive lock feature */ + int is_exclusive_lock_owner(bool *is_owner); + int lock_acquire(rbd_lock_mode_t lock_mode); + int lock_release(); + int lock_get_owners(rbd_lock_mode_t *lock_mode, + std::list<std::string> *lock_owners); + int lock_break(rbd_lock_mode_t lock_mode, const std::string &lock_owner); + + /* object map feature */ + int rebuild_object_map(ProgressContext &prog_ctx); + + int check_object_map(ProgressContext &prog_ctx); + + int copy(IoCtx& dest_io_ctx, const char *destname); + int copy2(Image& dest); + int copy3(IoCtx& dest_io_ctx, const char *destname, ImageOptions& opts); + int copy4(IoCtx& dest_io_ctx, const char *destname, ImageOptions& opts, + size_t sparse_size); + int copy_with_progress(IoCtx& dest_io_ctx, const char *destname, + ProgressContext &prog_ctx); + int copy_with_progress2(Image& dest, ProgressContext &prog_ctx); + int copy_with_progress3(IoCtx& dest_io_ctx, const char *destname, + ImageOptions& opts, ProgressContext &prog_ctx); + int copy_with_progress4(IoCtx& dest_io_ctx, const char *destname, + ImageOptions& opts, ProgressContext &prog_ctx, + size_t sparse_size); + + /* deep copy */ + int deep_copy(IoCtx& dest_io_ctx, const char *destname, ImageOptions& opts); + int deep_copy_with_progress(IoCtx& dest_io_ctx, const char *destname, + ImageOptions& opts, ProgressContext &prog_ctx); + + /* striping */ + uint64_t get_stripe_unit() const; + uint64_t get_stripe_count() const; + + int get_create_timestamp(struct timespec *timestamp); + int get_access_timestamp(struct timespec *timestamp); + int get_modify_timestamp(struct timespec *timestamp); + + int flatten(); + int flatten_with_progress(ProgressContext &prog_ctx); + + int sparsify(size_t 
sparse_size); + int sparsify_with_progress(size_t sparse_size, ProgressContext &prog_ctx); + /** + * Returns a pair of poolname, imagename for each clone + * of this image at the currently set snapshot. + */ + int list_children(std::set<std::pair<std::string, std::string> > *children) + __attribute__((deprecated)); + /** + * Returns a structure of poolname, imagename, imageid and trash flag + * for each clone of this image at the currently set snapshot. + */ + int list_children2(std::vector<librbd::child_info_t> *children) + __attribute__((deprecated)); + int list_children3(std::vector<linked_image_spec_t> *images); + int list_descendants(std::vector<linked_image_spec_t> *images); + + /* advisory locking (see librbd.h for details) */ + int list_lockers(std::list<locker_t> *lockers, + bool *exclusive, std::string *tag); + int lock_exclusive(const std::string& cookie); + int lock_shared(const std::string& cookie, const std::string& tag); + int unlock(const std::string& cookie); + int break_lock(const std::string& client, const std::string& cookie); + + /* snapshots */ + int snap_list(std::vector<snap_info_t>& snaps); + /* DEPRECATED; use snap_exists2 */ + bool snap_exists(const char *snapname) __attribute__ ((deprecated)); + int snap_exists2(const char *snapname, bool *exists); + int snap_create(const char *snapname); + int snap_remove(const char *snapname); + int snap_remove2(const char *snapname, uint32_t flags, ProgressContext& pctx); + int snap_remove_by_id(uint64_t snap_id); + int snap_rollback(const char *snap_name); + int snap_rollback_with_progress(const char *snap_name, ProgressContext& pctx); + int snap_protect(const char *snap_name); + int snap_unprotect(const char *snap_name); + int snap_is_protected(const char *snap_name, bool *is_protected); + int snap_set(const char *snap_name); + int snap_set_by_id(uint64_t snap_id); + int snap_rename(const char *srcname, const char *dstname); + int snap_get_limit(uint64_t *limit); + int snap_set_limit(uint64_t 
limit); + int snap_get_timestamp(uint64_t snap_id, struct timespec *timestamp); + int snap_get_namespace_type(uint64_t snap_id, + snap_namespace_type_t *namespace_type); + int snap_get_group_namespace(uint64_t snap_id, + snap_group_namespace_t *group_namespace, + size_t snap_group_namespace_size); + int snap_get_trash_namespace(uint64_t snap_id, std::string* original_name); + + /* I/O */ + ssize_t read(uint64_t ofs, size_t len, ceph::bufferlist& bl); + /* @param op_flags see librados.h constants beginning with LIBRADOS_OP_FLAG */ + ssize_t read2(uint64_t ofs, size_t len, ceph::bufferlist& bl, int op_flags); + int64_t read_iterate(uint64_t ofs, size_t len, + int (*cb)(uint64_t, size_t, const char *, void *), void *arg); + int read_iterate2(uint64_t ofs, uint64_t len, + int (*cb)(uint64_t, size_t, const char *, void *), void *arg); + /** + * get difference between two versions of an image + * + * This will return the differences between two versions of an image + * via a callback, which gets the offset and length and a flag + * indicating whether the extent exists (1), or is known/defined to + * be zeros (a hole, 0). If the source snapshot name is NULL, we + * interpret that as the beginning of time and return all allocated + * regions of the image. The end version is whatever is currently + * selected for the image handle (either a snapshot or the writeable + * head). 
+ * + * @param fromsnapname start snapshot name, or NULL + * @param ofs start offset + * @param len len in bytes of region to report on + * @param include_parent true if full history diff should include parent + * @param whole_object 1 if diff extents should cover whole object + * @param cb callback to call for each allocated region + * @param arg argument to pass to the callback + * @returns 0 on success, or negative error code on error + */ + int diff_iterate(const char *fromsnapname, + uint64_t ofs, uint64_t len, + int (*cb)(uint64_t, size_t, int, void *), void *arg); + int diff_iterate2(const char *fromsnapname, + uint64_t ofs, uint64_t len, + bool include_parent, bool whole_object, + int (*cb)(uint64_t, size_t, int, void *), void *arg); + + ssize_t write(uint64_t ofs, size_t len, ceph::bufferlist& bl); + /* @param op_flags see librados.h constants beginning with LIBRADOS_OP_FLAG */ + ssize_t write2(uint64_t ofs, size_t len, ceph::bufferlist& bl, int op_flags); + + int discard(uint64_t ofs, uint64_t len); + ssize_t writesame(uint64_t ofs, size_t len, ceph::bufferlist &bl, int op_flags); + ssize_t write_zeroes(uint64_t ofs, size_t len, int zero_flags, int op_flags); + + ssize_t compare_and_write(uint64_t ofs, size_t len, ceph::bufferlist &cmp_bl, + ceph::bufferlist& bl, uint64_t *mismatch_off, int op_flags); + + int aio_write(uint64_t off, size_t len, ceph::bufferlist& bl, RBD::AioCompletion *c); + /* @param op_flags see librados.h constants beginning with LIBRADOS_OP_FLAG */ + int aio_write2(uint64_t off, size_t len, ceph::bufferlist& bl, + RBD::AioCompletion *c, int op_flags); + + int aio_discard(uint64_t off, uint64_t len, RBD::AioCompletion *c); + int aio_writesame(uint64_t off, size_t len, ceph::bufferlist& bl, + RBD::AioCompletion *c, int op_flags); + int aio_write_zeroes(uint64_t ofs, size_t len, RBD::AioCompletion *c, + int zero_flags, int op_flags); + + int aio_compare_and_write(uint64_t off, size_t len, ceph::bufferlist& cmp_bl, + ceph::bufferlist& bl, 
RBD::AioCompletion *c, + uint64_t *mismatch_off, int op_flags); + + /** + * read async from image + * + * The target bufferlist is populated with references to buffers + * that contain the data for the given extent of the image. + * + * NOTE: If caching is enabled, the bufferlist will directly + * reference buffers in the cache to avoid an unnecessary data copy. + * As a result, if the user intends to modify the buffer contents + * directly, they should make a copy first (unconditionally, or when + * the reference count on the underlying buffer is more than 1). + * + * @param off offset in image + * @param len length of read + * @param bl bufferlist to read into + * @param c aio completion to notify when read is complete + */ + int aio_read(uint64_t off, size_t len, ceph::bufferlist& bl, RBD::AioCompletion *c); + /* @param op_flags see librados.h constants beginning with LIBRADOS_OP_FLAG */ + int aio_read2(uint64_t off, size_t len, ceph::bufferlist& bl, + RBD::AioCompletion *c, int op_flags); + + int flush(); + /** + * Start a flush if caching is enabled. Get a callback when + * the currently pending writes are on disk. 
+ * + * @param image the image to flush writes to + * @param c what to call when flushing is complete + * @returns 0 on success, negative error code on failure + */ + int aio_flush(RBD::AioCompletion *c); + + /** + * Drop any cached data for this image + * + * @returns 0 on success, negative error code on failure + */ + int invalidate_cache(); + + int poll_io_events(RBD::AioCompletion **comps, int numcomp); + + int metadata_get(const std::string &key, std::string *value); + int metadata_set(const std::string &key, const std::string &value); + int metadata_remove(const std::string &key); + /** + * Returns a pair of key/value for this image + */ + int metadata_list(const std::string &start, uint64_t max, std::map<std::string, ceph::bufferlist> *pairs); + + // RBD image mirroring support functions + int mirror_image_enable(); + int mirror_image_disable(bool force); + int mirror_image_promote(bool force); + int mirror_image_demote(); + int mirror_image_resync(); + int mirror_image_get_info(mirror_image_info_t *mirror_image_info, + size_t info_size); + int mirror_image_get_status(mirror_image_status_t *mirror_image_status, + size_t status_size); + int mirror_image_get_instance_id(std::string *instance_id); + int aio_mirror_image_promote(bool force, RBD::AioCompletion *c); + int aio_mirror_image_demote(RBD::AioCompletion *c); + int aio_mirror_image_get_info(mirror_image_info_t *mirror_image_info, + size_t info_size, RBD::AioCompletion *c); + int aio_mirror_image_get_status(mirror_image_status_t *mirror_image_status, + size_t status_size, RBD::AioCompletion *c); + + int update_watch(UpdateWatchCtx *ctx, uint64_t *handle); + int update_unwatch(uint64_t handle); + + int list_watchers(std::list<image_watcher_t> &watchers); + + int config_list(std::vector<config_option_t> *options); + +private: + friend class RBD; + + Image(const Image& rhs); + const Image& operator=(const Image& rhs); + + image_ctx_t ctx; +}; + +} + +#endif diff --git a/src/include/rbd/object_map_types.h 
b/src/include/rbd/object_map_types.h new file mode 100644 index 00000000..54852caa --- /dev/null +++ b/src/include/rbd/object_map_types.h @@ -0,0 +1,13 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +#ifndef CEPH_RBD_OBJECT_MAP_TYPES_H +#define CEPH_RBD_OBJECT_MAP_TYPES_H + +#include "include/int_types.h" + +static const uint8_t OBJECT_NONEXISTENT = 0; +static const uint8_t OBJECT_EXISTS = 1; +static const uint8_t OBJECT_PENDING = 2; +static const uint8_t OBJECT_EXISTS_CLEAN = 3; + +#endif // CEPH_RBD_OBJECT_MAP_TYPES_H diff --git a/src/include/rbd_types.h b/src/include/rbd_types.h new file mode 100644 index 00000000..35a1a8bc --- /dev/null +++ b/src/include/rbd_types.h @@ -0,0 +1,159 @@ +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2004-2010 Sage Weil <sage@newdream.net> + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. + * + */ + +#ifndef CEPH_RBD_TYPES_H +#define CEPH_RBD_TYPES_H + +#include "include/types.h" +#include "rbd/features.h" + +/* New-style rbd image 'foo' consists of objects + * rbd_id.foo - id of image + * rbd_header.<id> - image metadata + * rbd_object_map.<id> - optional image object map + * rbd_data.<id>.00000000 + * rbd_data.<id>.00000001 + * ... - data + */ + +#define RBD_HEADER_PREFIX "rbd_header." +#define RBD_OBJECT_MAP_PREFIX "rbd_object_map." +#define RBD_DATA_PREFIX "rbd_data." +#define RBD_ID_PREFIX "rbd_id." + +/* + * old-style rbd image 'foo' consists of objects + * foo.rbd - image metadata + * rb.<idhi>.<idlo>.00000000 + * rb.<idhi>.<idlo>.00000001 + * ... 
- data + */ + +#define RBD_SUFFIX ".rbd" +#define RBD_DIRECTORY "rbd_directory" +#define RBD_INFO "rbd_info" +#define RBD_NAMESPACE "rbd_namespace" +#define RBD_TASK "rbd_task" + +/* + * rbd_children object in each pool contains omap entries + * that map parent (poolid, imageid, snapid) to a list of children + * (imageids; snapids aren't required because we get all the snapshot + * info from a read of the child's header object anyway). + * + * The clone operation writes a new item to this child list, and rm or + * flatten removes an item, and may remove the whole entry if no children + * exist after the rm/flatten. + * + * When attempting to remove a parent, all pools are searched for + * rbd_children objects with entries referring to that parent; if any + * exist (and those children exist), the parent removal is prevented. + */ +#define RBD_CHILDREN "rbd_children" +#define RBD_LOCK_NAME "rbd_lock" + +/** + * rbd_mirroring object in each pool contains pool-specific settings + * for configuring mirroring. + */ +#define RBD_MIRRORING "rbd_mirroring" + +/** + * rbd_mirror_leader and rbd_mirror_instance.<instance id> objects are used + * for pool-level coordination between rbd-mirror daemons. + */ +#define RBD_MIRROR_LEADER "rbd_mirror_leader" +#define RBD_MIRROR_INSTANCE_PREFIX "rbd_mirror_instance." + +#define RBD_MAX_OBJ_NAME_SIZE 96 +#define RBD_MAX_BLOCK_NAME_SIZE 24 + +/** + * Maximum string length of the RBD v2 image id (not including + * null termination). This limit was derived from the existing + * RBD_MAX_BLOCK_NAME_SIZE limit which needs to hold the "rbd_data." + * prefix and null termination. + */ +#define RBD_MAX_IMAGE_ID_LENGTH 14 + +/** + * Maximum string length of the RBD block object name prefix (not including + * null termination). 
+ * + * v1 format: rb.<max 8-byte high id>.<max 8-byte low id>.<max 8-byte extra> + * v2 format: rbd_data.[<max 19-byte pool id>.]<max 14-byte image id> + * + * Note: new features might require increasing this maximum prefix length. + */ +#define RBD_MAX_BLOCK_NAME_PREFIX_LENGTH 43 + +#define RBD_COMP_NONE 0 +#define RBD_CRYPT_NONE 0 + +#define RBD_HEADER_TEXT "<<< Rados Block Device Image >>>\n" +#define RBD_MIGRATE_HEADER_TEXT "<<< Migrating RBD Image >>>\n" +#define RBD_HEADER_SIGNATURE "RBD" +#define RBD_HEADER_VERSION "001.005" + +#define RBD_GROUP_INVALID_POOL (-1) + +#define RBD_GROUP_HEADER_PREFIX "rbd_group_header." + +#define RBD_GROUP_DIRECTORY "rbd_group_directory" + +#define RBD_TRASH "rbd_trash" + +/** + * MON config-key prefix for storing optional remote cluster connectivity + * parameters + */ +#define RBD_MIRROR_CONFIG_KEY_PREFIX "rbd/mirror/" +#define RBD_MIRROR_SITE_NAME_CONFIG_KEY RBD_MIRROR_CONFIG_KEY_PREFIX "site_name" +#define RBD_MIRROR_PEER_CLIENT_ID_CONFIG_KEY RBD_MIRROR_CONFIG_KEY_PREFIX "peer_client_id" +#define RBD_MIRROR_PEER_CONFIG_KEY_PREFIX RBD_MIRROR_CONFIG_KEY_PREFIX "peer/" + +struct rbd_info { + ceph_le64 max_id; +} __attribute__ ((packed)); + +struct rbd_obj_snap_ondisk { + ceph_le64 id; + ceph_le64 image_size; +} __attribute__((packed)); + +struct rbd_obj_header_ondisk { + char text[40]; + char block_name[RBD_MAX_BLOCK_NAME_SIZE]; + char signature[4]; + char version[8]; + struct { + __u8 order; + __u8 crypt_type; + __u8 comp_type; + __u8 unused; + } __attribute__((packed)) options; + ceph_le64 image_size; + ceph_le64 snap_seq; + ceph_le32 snap_count; + ceph_le32 reserved; + ceph_le64 snap_names_len; + struct rbd_obj_snap_ondisk snaps[0]; +} __attribute__((packed)); + +enum { + RBD_PROTECTION_STATUS_UNPROTECTED = 0, + RBD_PROTECTION_STATUS_UNPROTECTING = 1, + RBD_PROTECTION_STATUS_PROTECTED = 2, + RBD_PROTECTION_STATUS_LAST = 3 +}; + +#endif diff --git a/src/include/rgw/librgw_admin_user.h b/src/include/rgw/librgw_admin_user.h 
new file mode 100644 index 00000000..e1dd5a29 --- /dev/null +++ b/src/include/rgw/librgw_admin_user.h @@ -0,0 +1,63 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * create rgw admin user + * + * Copyright (C) 2015 Red Hat, Inc. + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. + * + */ + +#ifndef LIB_RGW_ADMIN_USER_H +#define LIB_RGW_ADMIN_USER_H + +#ifdef __cplusplus +extern "C" { +#endif + +#define LIBRGW_ADMIN_USER_VER_MAJOR 1 +#define LIBRGW_ADMIN_USER_VER_MINOR 0 +#define LIBRGW_ADMIN_USER_VER_EXTRA 0 + +#define LIBRGW_ADMIN_USER_VERSION(maj, min, extra) ((maj << 16) + (min << 8) + extra) +#define LIBRGW_ADMIN_USER_VERSION_CODE LIBRGW_ADMIN_USER_VERSION(LIBRGW_ADMIN_USER_VER_MAJOR, LIBRGW_ADMIN_USER_VER_MINOR, LIBRGW_ADMIN_USER_VER_EXTRA) + +typedef void* librgw_admin_user_t; +int librgw_admin_user_create(librgw_admin_user_t *rgw_admin_user, int argc, char **argv); +void librgw_admin_user_shutdown(librgw_admin_user_t rgw_admin_user); + +struct rgw_user_info +{ + const char *uid; + const char *display_name; + const char *access_key; + const char* secret_key; + const char* email; + const char *caps; + const char *access; + bool admin; + bool system; +}; + + /* + * create a new rgw user + */ +int rgw_admin_create_user(librgw_admin_user_t rgw_admin_user, const char *uid, + const char *display_name, const char *access_key, const char* secret_key, + const char *email, const char *caps, + const char *access, bool admin, bool system); + +/* + * get rgw user info + */ +int rgw_admin_user_info(librgw_admin_user_t rgw_admin_user,const char * uid, rgw_user_info* user_info); + +#ifdef __cplusplus +} +#endif + +#endif /* LIBRGW_ADMIN_USER */ diff --git a/src/include/scope_guard.h b/src/include/scope_guard.h new file mode 100644 index 
00000000..878d8c16 --- /dev/null +++ b/src/include/scope_guard.h @@ -0,0 +1,47 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2016 Red Hat + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. + * + */ + +#ifndef SCOPE_GUARD +#define SCOPE_GUARD + +#include <utility> + +template <typename F> +struct scope_guard { + F f; + scope_guard() = delete; + scope_guard(const scope_guard &) = delete; + scope_guard(scope_guard &&) = default; + scope_guard & operator=(const scope_guard &) = delete; + scope_guard & operator=(scope_guard &&) = default; + scope_guard(const F& f) : f(f) {} + scope_guard(F &&f) : f(std::move(f)) {} + template<typename... Args> + scope_guard(std::in_place_t, Args&& ...args) : f(std::forward<Args>(args)...) {} + ~scope_guard() { + std::move(f)(); // Support at-most-once functions + } +}; + +template <typename F> +scope_guard<F> make_scope_guard(F &&f) { + return scope_guard<F>(std::forward<F>(f)); +} + +template<typename F, typename... Args> +scope_guard<F> make_scope_guard(std::in_place_type_t<F>, Args&& ...args) { + return { std::in_place, std::forward<Args>(args)... }; +} + +#endif diff --git a/src/include/sock_compat.h b/src/include/sock_compat.h new file mode 100644 index 00000000..14b5efa1 --- /dev/null +++ b/src/include/sock_compat.h @@ -0,0 +1,43 @@ +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2018 Red Hat, Inc. + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. 
+ */ + +#ifndef CEPH_SOCK_COMPAT_H +#define CEPH_SOCK_COMPAT_H + +#include "include/compat.h" +#include <sys/socket.h> + +/* + * This optimization may not be available on all platforms (e.g. OSX). + * Apparently a similar approach based on TCP_CORK can be used. + */ +#ifndef MSG_MORE +# define MSG_MORE 0 +#endif + +/* + * On BSD SO_NOSIGPIPE can be set via setsockopt to block SIGPIPE. + */ +#ifndef MSG_NOSIGNAL +# define MSG_NOSIGNAL 0 +# ifdef SO_NOSIGPIPE +# define CEPH_USE_SO_NOSIGPIPE +# else +# define CEPH_USE_SIGPIPE_BLOCKER +# warning "Using SIGPIPE blocking instead of suppression; this is not well-tested upstream!" +# endif +#endif + +int socket_cloexec(int domain, int type, int protocol); +int socketpair_cloexec(int domain, int type, int protocol, int sv[2]); +int accept_cloexec(int sockfd, struct sockaddr* addr, socklen_t* addrlen); + +#endif diff --git a/src/include/spinlock.h b/src/include/spinlock.h new file mode 100644 index 00000000..3f12bdc0 --- /dev/null +++ b/src/include/spinlock.h @@ -0,0 +1,92 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2017 SUSE LINUX GmbH + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. 
+ * + * @author Jesse Williamson <jwilliamson@suse.de> + * +*/ + +#ifndef CEPH_SPINLOCK_HPP +#define CEPH_SPINLOCK_HPP + +#include <atomic> + +namespace ceph { +inline namespace version_1_0 { + +class spinlock; + +inline void spin_lock(std::atomic_flag& lock); +inline void spin_unlock(std::atomic_flag& lock); +inline void spin_lock(ceph::spinlock& lock); +inline void spin_unlock(ceph::spinlock& lock); + +/* A pre-packaged spinlock type modelling BasicLockable: */ +class spinlock final +{ + std::atomic_flag af = ATOMIC_FLAG_INIT; + + public: + void lock() { + ceph::spin_lock(af); + } + + void unlock() noexcept { + ceph::spin_unlock(af); + } +}; + +// Free functions: +inline void spin_lock(std::atomic_flag& lock) +{ + while(lock.test_and_set(std::memory_order_acquire)) + ; +} + +inline void spin_unlock(std::atomic_flag& lock) +{ + lock.clear(std::memory_order_release); +} + +inline void spin_lock(std::atomic_flag *lock) +{ + spin_lock(*lock); +} + +inline void spin_unlock(std::atomic_flag *lock) +{ + spin_unlock(*lock); +} + +inline void spin_lock(ceph::spinlock& lock) +{ + lock.lock(); +} + +inline void spin_unlock(ceph::spinlock& lock) +{ + lock.unlock(); +} + +inline void spin_lock(ceph::spinlock *lock) +{ + spin_lock(*lock); +} + +inline void spin_unlock(ceph::spinlock *lock) +{ + spin_unlock(*lock); +} + +} // inline namespace (version) +} // namespace ceph + +#endif diff --git a/src/include/stat.h b/src/include/stat.h new file mode 100644 index 00000000..19398758 --- /dev/null +++ b/src/include/stat.h @@ -0,0 +1,145 @@ +#ifndef CEPH_STAT_H +#define CEPH_STAT_H + +#include <acconfig.h> + +#include <sys/stat.h> + +/* + * Access time-related `struct stat` members. + * + * Note that for each of the stat member get/set functions below, setting a + * high-res value (stat_set_*_nsec) on a platform without high-res support is + * a no-op. 
+ */ + +#ifdef HAVE_STAT_ST_MTIM_TV_NSEC + +static inline uint32_t stat_get_mtime_nsec(struct stat *st) +{ + return st->st_mtim.tv_nsec; +} + +static inline void stat_set_mtime_nsec(struct stat *st, uint32_t nsec) +{ + st->st_mtim.tv_nsec = nsec; +} + +static inline uint32_t stat_get_atime_nsec(struct stat *st) +{ + return st->st_atim.tv_nsec; +} + +static inline void stat_set_atime_nsec(struct stat *st, uint32_t nsec) +{ + st->st_atim.tv_nsec = nsec; +} + +static inline uint32_t stat_get_ctime_nsec(struct stat *st) +{ + return st->st_ctim.tv_nsec; +} + +static inline void stat_set_ctime_nsec(struct stat *st, uint32_t nsec) +{ + st->st_ctim.tv_nsec = nsec; +} + +#elif defined(HAVE_STAT_ST_MTIMESPEC_TV_NSEC) + +static inline uint32_t stat_get_mtime_nsec(struct stat *st) +{ + return st->st_mtimespec.tv_nsec; +} + +static inline void stat_set_mtime_nsec(struct stat *st, uint32_t nsec) +{ + st->st_mtimespec.tv_nsec = nsec; +} + +static inline uint32_t stat_get_atime_nsec(struct stat *st) +{ + return st->st_atimespec.tv_nsec; +} + +static inline void stat_set_atime_nsec(struct stat *st, uint32_t nsec) +{ + st->st_atimespec.tv_nsec = nsec; +} + +static inline uint32_t stat_get_ctime_nsec(struct stat *st) +{ + return st->st_ctimespec.tv_nsec; +} + +static inline void stat_set_ctime_nsec(struct stat *st, uint32_t nsec) +{ + st->st_ctimespec.tv_nsec = nsec; +} + +#else + +static inline uint32_t stat_get_mtime_nsec(struct stat *st) +{ + return 0; +} + +static inline void stat_set_mtime_nsec(struct stat *st, uint32_t nsec) +{ +} + +static inline uint32_t stat_get_atime_nsec(struct stat *st) +{ + return 0; +} + +static inline void stat_set_atime_nsec(struct stat *st, uint32_t nsec) +{ +} + +static inline uint32_t stat_get_ctime_nsec(struct stat *st) +{ + return 0; +} + +static inline void stat_set_ctime_nsec(struct stat *st, uint32_t nsec) +{ +} + +#endif + +/* + * Access second-resolution `struct stat` members. 
+ */ + +static inline uint32_t stat_get_mtime_sec(struct stat *st) +{ + return st->st_mtime; +} + +static inline void stat_set_mtime_sec(struct stat *st, uint32_t sec) +{ + st->st_mtime = sec; +} + +static inline uint32_t stat_get_atime_sec(struct stat *st) +{ + return st->st_atime; +} + +static inline void stat_set_atime_sec(struct stat *st, uint32_t sec) +{ + st->st_atime = sec; +} + +static inline uint32_t stat_get_ctime_sec(struct stat *st) +{ + return st->st_ctime; +} + +static inline void stat_set_ctime_sec(struct stat *st, uint32_t sec) +{ + st->st_ctime = sec; +} + +#endif diff --git a/src/include/statlite.h b/src/include/statlite.h new file mode 100644 index 00000000..2ab3a940 --- /dev/null +++ b/src/include/statlite.h @@ -0,0 +1,72 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +#ifndef CEPH_STATLITE_H +#define CEPH_STATLITE_H + +extern "C" { + +#include <time.h> +#include <sys/types.h> +#include <sys/stat.h> +#include <unistd.h> +#include <dirent.h> + +struct statlite { + dev_t st_dev; /* device */ + ino_t st_ino; /* inode */ + mode_t st_mode; /* protection */ + nlink_t st_nlink; /* number of hard links */ + uid_t st_uid; /* user ID of owner */ + gid_t st_gid; /* group ID of owner */ + dev_t st_rdev; /* device type (if inode device)*/ + unsigned long st_litemask; /* bit mask for optional fields */ + /***************************************************************/ + /**** Remaining fields are optional according to st_litemask ***/ + off_t st_size; /* total size, in bytes */ + blksize_t st_blksize; /* blocksize for filesystem I/O */ + blkcnt_t st_blocks; /* number of blocks allocated */ + struct timespec st_atim; /* Time of last access. */ + struct timespec st_mtim; /* Time of last modification. */ + struct timespec st_ctim; /* Time of last status change. 
*/ + //time_t st_atime; /* time of last access */ + //time_t st_mtime; /* time of last modification */ + //time_t st_ctime; /* time of last change */ +}; + +#define S_STATLITE_SIZE 1 +#define S_STATLITE_BLKSIZE 2 +#define S_STATLITE_BLOCKS 4 +#define S_STATLITE_ATIME 8 +#define S_STATLITE_MTIME 16 +#define S_STATLITE_CTIME 32 + +#define S_REQUIRESIZE(m) (m | S_STATLITE_SIZE) +#define S_REQUIREBLKSIZE(m) (m | S_STATLITE_BLKSIZE) +#define S_REQUIREBLOCKS(m) (m | S_STATLITE_BLOCKS) +#define S_REQUIREATIME(m) (m | S_STATLITE_ATIME) +#define S_REQUIREMTIME(m) (m | S_STATLITE_MTIME) +#define S_REQUIRECTIME(m) (m | S_STATLITE_CTIME) + +#define S_ISVALIDSIZE(m) (m & S_STATLITE_SIZE) +#define S_ISVALIDBLKSIZE(m) (m & S_STATLITE_BLKSIZE) +#define S_ISVALIDBLOCKS(m) (m & S_STATLITE_BLOCKS) +#define S_ISVALIDATIME(m) (m & S_STATLITE_ATIME) +#define S_ISVALIDMTIME(m) (m & S_STATLITE_MTIME) +#define S_ISVALIDCTIME(m) (m & S_STATLITE_CTIME) + + +// readdirplus etc. + +struct dirent_plus { + struct dirent d_dirent; /* dirent struct for this entry */ + struct stat d_stat; /* attributes for this entry */ + int d_stat_err;/* errno for d_stat, or 0 */ +}; +struct dirent_lite { + struct dirent d_dirent; /* dirent struct for this entry */ + struct statlite d_stat; /* attributes for this entry */ + int d_stat_err;/* errno for d_stat, or 0 */ +}; + +} +#endif diff --git a/src/include/str_list.h b/src/include/str_list.h new file mode 100644 index 00000000..518db1ca --- /dev/null +++ b/src/include/str_list.h @@ -0,0 +1,129 @@ +#ifndef CEPH_STRLIST_H +#define CEPH_STRLIST_H + +#include <list> +#include <set> +#include <string> +#include <string_view> +#include <vector> + +namespace ceph { + +/// Split a string using the given delimiters, passing each piece as a +/// (non-null-terminated) std::string_view to the callback. 
+template <typename Func> // where Func(std::string_view) is a valid call +void for_each_substr(std::string_view s, const char *delims, Func&& f) +{ + auto pos = s.find_first_not_of(delims); + while (pos != s.npos) { + s.remove_prefix(pos); // trim delims from the front + auto end = s.find_first_of(delims); + f(s.substr(0, end)); + pos = s.find_first_not_of(delims, end); + } +} + +} // namespace ceph + +/** + * Split **str** into a list of strings, using the ";,= \t" delimiters and output the result in **str_list**. + * + * @param [in] str String to split and save as list + * @param [out] str_list List modified containing str after it has been split +**/ +extern void get_str_list(const std::string& str, + std::list<std::string>& str_list); + +/** + * Split **str** into a list of strings, using the **delims** delimiters and output the result in **str_list**. + * + * @param [in] str String to split and save as list + * @param [in] delims characters used to split **str** + * @param [out] str_list List modified containing str after it has been split +**/ +extern void get_str_list(const std::string& str, + const char *delims, + std::list<std::string>& str_list); + +std::list<std::string> get_str_list(const std::string& str, + const char *delims = ";,= \t"); + +/** + * Split **str** into a list of strings, using the ";,= \t" delimiters and output the result in **str_vec**. + * + * @param [in] str String to split and save as Vector + * @param [out] str_vec Vector modified containing str after it has been split +**/ +extern void get_str_vec(const std::string& str, + std::vector<std::string>& str_vec); + +/** + * Split **str** into a list of strings, using the **delims** delimiters and output the result in **str_vec**. 
+ * + * @param [in] str String to split and save as Vector + * @param [in] delims characters used to split **str** + * @param [out] str_vec Vector modified containing str after it has been split +**/ +extern void get_str_vec(const std::string& str, + const char *delims, + std::vector<std::string>& str_vec); + +std::vector<std::string> get_str_vec(const std::string& str, + const char *delims = ";,= \t"); +/** + * Split **str** into a list of strings, using the ";,= \t" delimiters and output the result in **str_list**. + * + * @param [in] str String to split and save as Set + * @param [out] str_list Set modified containing str after it has been split +**/ +extern void get_str_set(const std::string& str, + std::set<std::string>& str_list); + +/** + * Split **str** into a list of strings, using the **delims** delimiters and output the result in **str_list**. + * + * @param [in] str String to split and save as Set + * @param [in] delims characters used to split **str** + * @param [out] str_list Set modified containing str after it has been split +**/ +template<class Compare = std::less<std::string> > +void get_str_set(const std::string& str, + const char *delims, + std::set<std::string, Compare>& str_list) +{ + str_list.clear(); + for_each_substr(str, delims, [&str_list] (auto token) { + str_list.emplace(token.begin(), token.end()); + }); +} + +std::set<std::string> get_str_set(const std::string& str, + const char *delims = ";,= \t"); + + + +/** + * Return a String containing the vector **v** joined with **sep** + * + * If **v** is empty, the function returns an empty string + * For each element in **v**, + * it will concatenate this element and **sep** with result + * + * @param [in] v Vector to join as a String + * @param [in] sep String used to join each element from **v** + * @return empty string if **v** is empty or concatenated string +**/ +inline std::string str_join(const std::vector<std::string>& v, const std::string& sep) +{ + if (v.empty()) + return 
std::string(); + std::vector<std::string>::const_iterator i = v.begin(); + std::string r = *i; + for (++i; i != v.end(); ++i) { + r += sep; + r += *i; + } + return r; +} + +#endif diff --git a/src/include/str_map.h b/src/include/str_map.h new file mode 100644 index 00000000..6a0370d1 --- /dev/null +++ b/src/include/str_map.h @@ -0,0 +1,148 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2013 Cloudwatt <libre.licensing@cloudwatt.com> + * + * Author: Loic Dachary <loic@dachary.org> + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + */ + +#ifndef CEPH_STRMAP_H +#define CEPH_STRMAP_H + +#define CONST_DELIMS ",;\t\n " + +#include <map> +#include <string> +#include <sstream> + +/** + * Parse **str** and set **str_map** with the key/value pairs read + * from it. The format of **str** is either a well formed JSON object + * or a custom key[=value] plain text format. + * + * JSON is tried first. If successfully parsed into a JSON object, it + * is copied into **str_map** verbatim. If it is not a JSON object ( a + * string, integer etc. ), -EINVAL is returned and **ss** is set to + * a human readable error message. + * + * If **str** is not valid JSON and if **fallback_to_plain** is set to true + * (default: true) it is assumed to be a string containing white space + * separated key=value pairs. A white space is either space, tab or newline. + * Function **get_str_map** will be leveraged to parse the plain-text + * key/value pairs. 
+ * + * @param [in] str JSON or plain text key/value pairs + * @param [out] ss human readable message on error + * @param [out] str_map key/value pairs read from str + * @param [in] fallback_to_plain attempt parsing as plain-text if json fails + * @return **0** on success or -EINVAL on error. + */ +extern int get_json_str_map( + const std::string &str, + std::ostream &ss, + std::map<std::string,std::string> *str_map, + bool fallback_to_plain = true); + +/** + * Parse **str** and set **str_map** with the key/value pairs read from + * it. The format of **str** is a number of custom key[=value] pairs in + * plain text format. + * + * The string will be parsed taking **delims** as field delimiters for + * key/values. The value is optional resulting in an empty string when + * not provided. For example, using white space as delimiters: + * + * insert your own=political/ideological statement=here + * + * will be parsed into: + * + * { "insert": "", + * "your": "", + * "own": "political/ideological", + * "statement": "here" } + * + * Alternative delimiters may be provided. For instance, when specifying + * "white space and slash", the above statement would be parsed + * into: + * + * { "insert": "", + * "your": "", + * "own": "political", + * "ideological": "", + * "statement": "here" } + * + * See how adding '/' to the delimiters field will spawn a new key without + * a set value. + * + * Always returns 0, as there is no condition for failure. + * + * @param [in] str plain text key/value pairs + * @param [in] delims field delimiters to be used for parsing str + * @param [out] str_map key/value pairs parsed from str + * @return **0** + */ +extern int get_str_map( + const std::string &str, + std::map<std::string,std::string> *str_map, + const char *delims = CONST_DELIMS); + +/** + * Returns the value of **key** in **str_map** if available. + * + * If **key** is not available in **str_map**, and if **def_val** is + * not-NULL then returns **def_val**. 
/**
 * Returns the value of **key** in **str_map** if available.
 *
 * If **key** is available in **str_map** returns the value of **key**.
 *
 * If **key** is not available in **str_map**, and if **fallback_key**
 * is not-NULL and available in **str_map**, then returns the value
 * of **fallback_key**.
 *
 * Otherwise returns an empty string.
 *
 * @param[in] str_map Map to obtain **key** or **fallback_key** from
 * @param[in] key Key to obtain the value of from **str_map**
 * @param[in] fallback_key Key to fallback to if **key** is not present
 *                         in **str_map**; may be NULL (the default)
 * @return the value found as described above, or an empty string
 */
extern std::string get_str_map_key(
    const std::map<std::string,std::string> &str_map,
    const std::string &key,
    const std::string *fallback_key = NULL);
/**
 * Render @p a as a std::string using its stream insertion operator.
 *
 * On GCC a function-local, per-thread stream (one per instantiated T) is
 * reused between calls; other compilers build a fresh stream on each call.
 *
 * @param a value to format; must be insertable into a std::ostream
 * @return the text produced by `stream << a`
 */
template<typename T>
inline std::string stringify(const T& a) {
#if defined(__GNUC__) && !(defined(__clang__) || defined(__INTEL_COMPILER))
  static __thread std::ostringstream ss;
  ss.str("");
  // str("") only discards the buffered text. Without clear(), a failbit or
  // badbit left behind by an earlier insertion would stick to the reused
  // stream and every later call for this T would return an empty string.
  ss.clear();
#else
  std::ostringstream ss;
#endif
  ss << a;
  return ss.str();
}

/**
 * Concatenate the elements of [begin, end), placing @p t between each pair
 * of neighbouring elements.
 *
 * @param begin iterator to the first element
 * @param end   iterator one past the last element
 * @param t     separator; its type is also the result type and must
 *              provide append()
 * @return the joined sequence, or an empty T for an empty range
 */
template <class T, class A>
T joinify(const A &begin, const A &end, const T &t)
{
  T result;
  // Track position explicitly: testing result.empty() would swallow the
  // separator that belongs after a leading empty element.
  bool first = true;
  for (A it = begin; it != end; ++it) {
    if (!first)
      result.append(t);
    result.append(*it);
    first = false;
  }
  return result;
}
/// Return 1 when @p year is a Gregorian leap year, otherwise 0.
static int32_t is_leap(int32_t year) {
  const bool by_4 = (year % 4 == 0);
  const bool by_100 = (year % 100 == 0);
  const bool by_400 = (year % 400 == 0);
  return (by_400 || (by_4 && !by_100)) ? 1 : 0;
}

/// Number of days between 1 January of year 1 and 1 January of @p year.
static int32_t days_from_0(int32_t year) {
  const int32_t elapsed = year - 1;
  return 365 * elapsed + (elapsed / 400) - (elapsed / 100) + (elapsed / 4);
}

/// Number of days between 1 January 1970 and 1 January of @p year.
static int32_t days_from_1970(int32_t year) {
  static const int epoch_offset = days_from_0(1970);
  return days_from_0(year) - epoch_offset;
}

/// Zero-based day of the year for @p day (1-based) of @p month (1..12).
static int32_t days_from_1jan(int32_t year, int32_t month, int32_t day) {
  // Cumulative day count before each month; row 0 = common year, row 1 = leap.
  static const int32_t days_before_month[2][12] = {
    { 0, 31, 59, 90, 120, 151, 181, 212, 243, 273, 304, 334 },
    { 0, 31, 60, 91, 121, 152, 182, 213, 244, 274, 305, 335 }
  };
  const int32_t *row = days_before_month[is_leap(year)];
  return row[month - 1] + day - 1;
}

/**
 * Convert a broken-down UTC time to seconds since 1970-01-01 00:00:00.
 *
 * Only tm_year, tm_mon, tm_mday, tm_hour, tm_min and tm_sec are read.
 * A tm_mon outside 0..11 is folded into the year before the lookup.
 */
static time_t internal_timegm(tm const *t) {
  int year = t->tm_year + 1900;
  int month = t->tm_mon;
  if (month < 0) {
    // Borrow enough whole years to bring the month back into 0..11.
    const int borrowed_years = (-month + 11) / 12;
    year -= borrowed_years;
    month += 12 * borrowed_years;
  } else if (month > 11) {
    year += month / 12;
    month %= 12;
  }

  const int day_of_year = days_from_1jan(year, month + 1, t->tm_mday);
  const int days_since_epoch = days_from_1970(year) + day_of_year;

  const time_t seconds_in_day = 3600 * 24;
  return seconds_in_day * days_since_epoch + 3600 * t->tm_hour + 60 * t->tm_min + t->tm_sec;
}
* Copyright (C) 2004-2006 Sage Weil <sage@newdream.net> + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. + * + */ +#ifndef CEPH_TYPES_H +#define CEPH_TYPES_H + +// this is needed for ceph_fs to compile in userland +#include "int_types.h" +#include "byteorder.h" + +#include "uuid.h" + +#include <netinet/in.h> +#include <fcntl.h> +#include <string.h> + +#include "ceph_fs.h" +#include "ceph_frag.h" +#include "rbd_types.h" + +#ifdef __cplusplus +#ifndef _BACKWARD_BACKWARD_WARNING_H +#define _BACKWARD_BACKWARD_WARNING_H // make gcc 4.3 shut up about hash_* +#endif +#endif + +extern "C" { +#include <stdint.h> +#include <sys/types.h> +#include <sys/stat.h> +#include "statlite.h" +} + +#include <string> +#include <list> +#include <set> +#include <boost/container/flat_set.hpp> +#include <boost/container/flat_map.hpp> +#include <map> +#include <vector> +#include <iostream> +#include <iomanip> + + +#include "include/unordered_map.h" + +#include "object.h" +#include "intarith.h" + +#include "acconfig.h" + +#include "assert.h" + +// DARWIN compatibility +#ifdef __APPLE__ +typedef long long loff_t; +typedef long long off64_t; +#define O_DIRECT 00040000 +#endif + +// FreeBSD compatibility +#ifdef __FreeBSD__ +typedef off_t loff_t; +typedef off_t off64_t; +#endif + +#if defined(__sun) || defined(_AIX) +typedef off_t loff_t; +#endif + + +// -- io helpers -- + +// Forward declare all the I/O helpers so strict ADL can find them in +// the case of containers of containers. I'm tempted to abstract this +// stuff using template templates like I did for denc. 
+ +namespace std { +template<class A, class B> +inline std::ostream& operator<<(std::ostream&out, const std::pair<A,B>& v); +template<class A, class Alloc> +inline std::ostream& operator<<(std::ostream& out, const std::vector<A,Alloc>& v); +template<class A, std::size_t N, class Alloc> +inline std::ostream& operator<<(std::ostream& out, const boost::container::small_vector<A,N,Alloc>& v); +template<class A, class Comp, class Alloc> +inline std::ostream& operator<<(std::ostream& out, const std::deque<A,Alloc>& v); +template<typename... Ts> +inline std::ostream& operator<<(std::ostream& out, const std::tuple<Ts...> &t); +template<class A, class Alloc> +inline std::ostream& operator<<(std::ostream& out, const std::list<A,Alloc>& ilist); +template<class A, class Comp, class Alloc> +inline std::ostream& operator<<(std::ostream& out, const std::set<A, Comp, Alloc>& iset); +template<class A, class Comp, class Alloc> +inline std::ostream& operator<<(std::ostream& out, const std::multiset<A,Comp,Alloc>& iset); +template<class A, class B, class Comp, class Alloc> +inline std::ostream& operator<<(std::ostream& out, const std::map<A,B,Comp,Alloc>& m); +template<class A, class B, class Comp, class Alloc> +inline std::ostream& operator<<(std::ostream& out, const std::multimap<A,B,Comp,Alloc>& m); +} + +namespace boost { +template<typename... 
/// Print a pair as "first,second" (no surrounding brackets).
template<class A, class B>
inline std::ostream& operator<<(std::ostream& out, const std::pair<A,B>& v) {
  out << v.first;
  out << ",";
  out << v.second;
  return out;
}

/// Print a vector as "[e0,e1,...]"; an empty vector prints "[]".
template<class A, class Alloc>
inline std::ostream& operator<<(std::ostream& out, const std::vector<A,Alloc>& v) {
  out << "[";
  for (auto it = v.begin(); it != v.end(); ++it) {
    if (it != v.begin())
      out << ",";
    out << *it;
  }
  out << "]";
  return out;
}
/// Print a list as comma separated elements, without brackets.
template<class A, class Alloc>
inline std::ostream& operator<<(std::ostream& out, const std::list<A,Alloc>& ilist) {
  bool first = true;
  for (const auto& item : ilist) {
    if (!first) out << ",";
    out << item;
    first = false;
  }
  return out;
}

/// Print a set as comma separated elements, without brackets.
template<class A, class Comp, class Alloc>
inline std::ostream& operator<<(std::ostream& out, const std::set<A, Comp, Alloc>& iset) {
  bool first = true;
  for (const auto& item : iset) {
    if (!first) out << ",";
    out << item;
    first = false;
  }
  return out;
}

/// Print a multiset as comma separated elements, without brackets.
template<class A, class Comp, class Alloc>
inline std::ostream& operator<<(std::ostream& out, const std::multiset<A,Comp,Alloc>& iset) {
  bool first = true;
  for (const auto& item : iset) {
    if (!first) out << ",";
    out << item;
    first = false;
  }
  return out;
}

/// Print a map as "{k=v,k=v}".
template<class A, class B, class Comp, class Alloc>
inline std::ostream& operator<<(std::ostream& out, const std::map<A,B,Comp,Alloc>& m)
{
  out << "{";
  bool first = true;
  for (const auto& entry : m) {
    if (!first) out << ",";
    out << entry.first << "=" << entry.second;
    first = false;
  }
  out << "}";
  return out;
}

/// Print a multimap as "{{k=v,k=v}}" (double braces distinguish it from map).
template<class A, class B, class Comp, class Alloc>
inline std::ostream& operator<<(std::ostream& out, const std::multimap<A,B,Comp,Alloc>& m)
{
  out << "{{";
  bool first = true;
  for (const auto& entry : m) {
    if (!first) out << ",";
    out << entry.first << "=" << entry.second;
    first = false;
  }
  out << "}}";
  return out;
}
// Equality predicate over C strings, e.g. for
//   ceph::unordered_map<const char*, long, hash<const char*>, eqstr> vals;
struct eqstr
{
  bool operator()(const char* s1, const char* s2) const
  {
    const int order = strcmp(s1, s2);
    return order == 0;
  }
};

// Strict-weak "less than" over C strings, for set and map.
struct ltstr
{
  bool operator()(const char* s1, const char* s2) const
  {
    const int order = strcmp(s1, s2);
    return order < 0;
  }
};
namespace {
  /**
   * Write a scaled number followed by its unit suffix to @p out.
   *
   * @param out   destination stream
   * @param v     the original, unscaled value
   * @param n     integer value printed when no fraction is shown
   * @param index number of scaling steps the caller applied; 0 = unscaled
   * @param mult  divisor applied to @p v when a fraction is shown
   * @param u     unit suffix appended directly after the number
   * @return @p out
   */
  inline std::ostream& format_u(std::ostream& out, const uint64_t v, const uint64_t n,
                                const int index, const uint64_t mult, const char* u)
  {
    char buffer[32];

    if (index == 0) {
      // n is unsigned, so it needs the unsigned conversion specifier.
      (void) snprintf(buffer, sizeof(buffer), "%" PRIu64 "%s", n, u);
    } else if ((v % mult) == 0) {
      // If this is an even multiple of the base, always display
      // without any decimal fraction.
      (void) snprintf(buffer, sizeof(buffer), "%" PRIu64 "%s", n, u);
    } else {
      // Try 2, 1 and then 0 fractional digits and keep the first rendering
      // whose total length (number plus suffix) is at most 7 characters.
      // This matters for numbers very close to an order of magnitude:
      // 10239 (really 9.999K) should come out with a single place of
      // precision. If nothing fits, the 0-digit rendering stays in buffer.
      int i;
      for (i = 2; i >= 0; i--) {
        if (snprintf(buffer, sizeof(buffer), "%.*f%s", i,
                     static_cast<double>(v) / mult, u) <= 7)
          break;
      }
    }

    return out << buffer;
  }
}

/*
 * Use this struct to pretty print values that should be formatted with a
 * decimal unit prefix (the classic SI units). No actual unit will be added.
 */
struct si_u_t {
  uint64_t v;
  explicit si_u_t(uint64_t _v) : v(_v) {};
};

/// Print @p b scaled by powers of 1000 with the suffix k, M, G, T, P or E.
inline std::ostream& operator<<(std::ostream& out, const si_u_t& b)
{
  uint64_t n = b.v;
  int index = 0;
  uint64_t mult = 1;
  const char* u[] = {"", "k", "M", "G", "T", "P", "E"};
  // Highest valid index into u[]; keeps the loop from stepping past the table.
  const int last = static_cast<int>(sizeof(u) / sizeof(u[0])) - 1;

  while (n >= 1000 && index < last) {
    n /= 1000;
    index++;
    mult *= 1000;
  }

  return format_u(out, b.v, n, index, mult, u[index]);
}
/// Thin wrapper around a float weight, selecting the operator<< below.
struct weightf_t {
  float v;
  // cppcheck-suppress noExplicitConstructor
  weightf_t(float _v) : v(_v) {}
};

/**
 * Print a weight: "-" for values below -0.01, "0" for values below
 * 0.000001, otherwise fixed notation with five decimals.
 *
 * The stream's format flags and precision are put back afterwards so that
 * std::fixed does not leak into whatever the caller prints next.
 */
inline std::ostream& operator<<(std::ostream& out, const weightf_t& w)
{
  if (w.v < -0.01F) {
    return out << "-";
  } else if (w.v < 0.000001F) {
    return out << "0";
  } else {
    const std::ios_base::fmtflags saved_flags = out.flags();
    const std::streamsize saved_precision = out.precision();
    out << std::fixed << std::setprecision(5) << w.v;
    out.precision(saved_precision);
    out.flags(saved_flags);
    return out;
  }
}
{ return &code; } + int operator==(int i) { return code == i; } + int operator>(int i) { return code > i; } + int operator>=(int i) { return code >= i; } + int operator<(int i) { return code < i; } + int operator<=(int i) { return code <= i; } + + void encode(bufferlist &bl) const { + using ceph::encode; + __s32 newcode = hostos_to_ceph_errno(code); + encode(newcode, bl); + } + void decode(bufferlist::const_iterator &bl) { + using ceph::decode; + decode(code, bl); + code = ceph_to_hostos_errno(code); + } +}; +WRITE_CLASS_ENCODER(errorcode32_t) +WRITE_EQ_OPERATORS_1(errorcode32_t, code) +WRITE_CMP_OPERATORS_1(errorcode32_t, code) + +template <uint8_t S> +struct sha_digest_t { + constexpr static uint32_t SIZE = S; + // TODO: we might consider std::array in the future. Avoiding it for now + // as sha_digest_t is a part of our public API. + unsigned char v[S] = {0}; + + string to_str() const { + char str[S * 2 + 1] = {0}; + str[0] = '\0'; + for (size_t i = 0; i < S; i++) { + ::sprintf(&str[i * 2], "%02x", static_cast<int>(v[i])); + } + return string(str); + } + sha_digest_t(const unsigned char *_v) { memcpy(v, _v, SIZE); }; + sha_digest_t() {} + + bool operator==(const sha_digest_t& r) const { + return ::memcmp(v, r.v, SIZE) == 0; + } + bool operator!=(const sha_digest_t& r) const { + return ::memcmp(v, r.v, SIZE) != 0; + } + + void encode(bufferlist &bl) const { + // copy to avoid reinterpret_cast, is_pod and other nasty things + using ceph::encode; + std::array<unsigned char, SIZE> tmparr; + memcpy(tmparr.data(), v, SIZE); + encode(tmparr, bl); + } + void decode(bufferlist::const_iterator &bl) { + using ceph::decode; + std::array<unsigned char, SIZE> tmparr; + decode(tmparr, bl); + memcpy(v, tmparr.data(), SIZE); + } +}; + +template <uint8_t S> +inline ostream &operator<<(ostream &out, const sha_digest_t<S> &b) { + string str = b.to_str(); + return out << str; +} + +using sha1_digest_t = sha_digest_t<20>; +WRITE_CLASS_ENCODER(sha1_digest_t) + +using sha256_digest_t = 
sha_digest_t<32>; +WRITE_CLASS_ENCODER(sha256_digest_t) + + +#endif diff --git a/src/include/unordered_map.h b/src/include/unordered_map.h new file mode 100644 index 00000000..aee5f5a7 --- /dev/null +++ b/src/include/unordered_map.h @@ -0,0 +1,11 @@ +#ifndef CEPH_UNORDERED_MAP_H +#define CEPH_UNORDERED_MAP_H + +#include <unordered_map> + +namespace ceph { + using std::unordered_map; + using std::unordered_multimap; +} + +#endif diff --git a/src/include/unordered_set.h b/src/include/unordered_set.h new file mode 100644 index 00000000..e30e1799 --- /dev/null +++ b/src/include/unordered_set.h @@ -0,0 +1,10 @@ +#ifndef CEPH_UNORDERED_SET_H +#define CEPH_UNORDERED_SET_H + +#include <unordered_set> + +namespace ceph { + using std::unordered_set; +} + +#endif diff --git a/src/include/util.h b/src/include/util.h new file mode 100644 index 00000000..18aa51ad --- /dev/null +++ b/src/include/util.h @@ -0,0 +1,106 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2012 Inktank Storage, Inc. + * Copyright (C) 2014 Red Hat <contact@redhat.com> + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. 
/**
 * Plain value object holding four usage counters. All of them start at
 * zero; the member names suggest byte counts plus an availability
 * percentage. It can be dumped to a Formatter and encoded to / decoded
 * from a bufferlist.
 */
struct ceph_data_stats
{
  uint64_t byte_total;
  uint64_t byte_used;
  uint64_t byte_avail;
  int avail_percent;

  ceph_data_stats() :
    byte_total(0),
    byte_used(0),
    byte_avail(0),
    avail_percent(0)
  { }

  /// Emit the four fields to @p f under the keys "total", "used", "avail"
  /// and "avail_percent". @p f must not be NULL (asserted).
  void dump(Formatter *f) const {
    ceph_assert(f != NULL);
    f->dump_int("total", byte_total);
    f->dump_int("used", byte_used);
    f->dump_int("avail", byte_avail);
    f->dump_int("avail_percent", avail_percent);
  }

  /// Append the fields to @p bl in declaration order.
  void encode(bufferlist &bl) const {
    // NOTE(review): the macro arguments (1, 1) look like struct version and
    // compat version - confirm against the ENCODE_START definition.
    ENCODE_START(1, 1, bl);
    encode(byte_total, bl);
    encode(byte_used, bl);
    encode(byte_avail, bl);
    encode(avail_percent, bl);
    ENCODE_FINISH(bl);
  }

  /// Read the fields from @p p in the same order encode() wrote them.
  void decode(bufferlist::const_iterator &p) {
    DECODE_START(1, p);
    decode(byte_total, p);
    decode(byte_used, p);
    decode(byte_avail, p);
    decode(avail_percent, p);
    DECODE_FINISH(p);
  }

  /// Append two heap-allocated samples to @p ls: a default (all zero)
  /// instance and one with total = 1024*1024, used = avail = 512*1024 and
  /// avail_percent = 50. Nothing here frees them.
  static void generate_test_instances(list<ceph_data_stats*>& ls) {
    ls.push_back(new ceph_data_stats);
    ls.push_back(new ceph_data_stats);
    ls.back()->byte_total = 1024*1024;
    ls.back()->byte_used = 512*1024;
    ls.back()->byte_avail = 512*1024;
    ls.back()->avail_percent = 50;
  }
};
namespace ceph::util {

/// Returns true when @p s compares equal (operator==) to at least one of
/// @p xs; candidates are tried left to right and an empty @p xs yields false.
template <typename ...XS>
bool match_str(const std::string& s, const XS& ...xs)
{
  const auto equals_s = [&s](const auto& candidate) -> bool {
    return s == candidate;
  };
  return (equals_s(xs) || ...);
}

} // namespace ceph::util
For little-endian machine, we should make sure there is no padding + * in 32-bit machine and 64-bit machine. + * You should also modify the padding_check function. + */ +class utime_t { +public: + struct { + __u32 tv_sec, tv_nsec; + } tv; + + public: + bool is_zero() const { + return (tv.tv_sec == 0) && (tv.tv_nsec == 0); + } + + void normalize() { + if (tv.tv_nsec > 1000000000ul) { + tv.tv_sec = cap_to_u32_max(tv.tv_sec + tv.tv_nsec / (1000000000ul)); + tv.tv_nsec %= 1000000000ul; + } + } + + // cons + utime_t() { tv.tv_sec = 0; tv.tv_nsec = 0; } + utime_t(time_t s, int n) { tv.tv_sec = s; tv.tv_nsec = n; normalize(); } + utime_t(const struct ceph_timespec &v) { + decode_timeval(&v); + } + utime_t(const struct timespec v) + { + // NOTE: this is used by ceph_clock_now() so should be kept + // as thin as possible. + tv.tv_sec = v.tv_sec; + tv.tv_nsec = v.tv_nsec; + } + // conversion from ceph::real_time/coarse_real_time + template <typename Clock, typename std::enable_if_t< + ceph::converts_to_timespec_v<Clock>>* = nullptr> + explicit utime_t(const std::chrono::time_point<Clock>& t) + : utime_t(Clock::to_timespec(t)) {} // forward to timespec ctor + + utime_t(const struct timeval &v) { + set_from_timeval(&v); + } + utime_t(const struct timeval *v) { + set_from_timeval(v); + } + void to_timespec(struct timespec *ts) const { + ts->tv_sec = tv.tv_sec; + ts->tv_nsec = tv.tv_nsec; + } + void set_from_double(double d) { + tv.tv_sec = (__u32)trunc(d); + tv.tv_nsec = (__u32)((d - (double)tv.tv_sec) * 1000000000.0); + } + + real_time to_real_time() const { + ceph_timespec ts; + encode_timeval(&ts); + return ceph::real_clock::from_ceph_timespec(ts); + } + + // accessors + time_t sec() const { return tv.tv_sec; } + long usec() const { return tv.tv_nsec/1000; } + int nsec() const { return tv.tv_nsec; } + + // ref accessors/modifiers + __u32& sec_ref() { return tv.tv_sec; } + __u32& nsec_ref() { return tv.tv_nsec; } + + uint64_t to_nsec() const { + return (uint64_t)tv.tv_nsec + 
(uint64_t)tv.tv_sec * 1000000000ull;
  }
  // Whole milliseconds represented by this value: tv_sec * 1000 plus
  // tv_nsec / 1000000 (integer division drops the sub-millisecond part).
  uint64_t to_msec() const {
    return (uint64_t)tv.tv_nsec / 1000000ull + (uint64_t)tv.tv_sec * 1000ull;
  }

  // Fill *v from this value.  Nanoseconds are turned into microseconds by
  // integer division, so anything below one microsecond is dropped.
  void copy_to_timeval(struct timeval *v) const {
    v->tv_sec = tv.tv_sec;
    v->tv_usec = tv.tv_nsec/1000;
  }
  // Overwrite this value from *v, scaling microseconds up to nanoseconds.
  void set_from_timeval(const struct timeval *v) {
    tv.tv_sec = v->tv_sec;
    tv.tv_nsec = v->tv_usec*1000;
  }
  // Compile-time check that utime_t is exactly as large as its two fields,
  // i.e. that the compiler inserted no padding.  The static_assert is
  // evaluated when the header is compiled; calling the function does nothing.
  // NOTE(review): presumably this protects the raw-byte copies in
  // encode()/decode() below -- confirm.
  void padding_check() {
    static_assert(
      sizeof(utime_t) ==
        sizeof(tv.tv_sec) +
        sizeof(tv.tv_nsec)
      ,
      "utime_t have padding");
  }
  // Append this value to bl.  When CEPH_LITTLE_ENDIAN is defined the first
  // sizeof(__u32) * 2 bytes of the object are appended as-is; otherwise
  // tv_sec and tv_nsec are encoded one after the other with ceph::encode.
  void encode(bufferlist &bl) const {
#if defined(CEPH_LITTLE_ENDIAN)
    bl.append((char *)(this), sizeof(__u32) + sizeof(__u32));
#else
    using ceph::encode;
    encode(tv.tv_sec, bl);
    encode(tv.tv_nsec, bl);
#endif
  }
  // Inverse of encode(): read tv_sec then tv_nsec from p, either as one raw
  // copy over the start of this object (little-endian build) or field by
  // field with ceph::decode.
  void decode(bufferlist::const_iterator &p) {
#if defined(CEPH_LITTLE_ENDIAN)
    p.copy(sizeof(__u32) + sizeof(__u32), (char *)(this));
#else
    using ceph::decode;
    decode(tv.tv_sec, p);
    decode(tv.tv_nsec, p);
#endif
  }

  // denc description of the same two fields, in the same order as
  // encode()/decode().  DENC is a project macro defined outside this chunk.
  DENC(utime_t, v, p) {
    denc(v.tv.tv_sec, p);
    denc(v.tv.tv_nsec, p);
  }


  // Copy both fields into a ceph_timespec.  Despite the "timeval" in the
  // name, the target carries nanoseconds, so nothing is scaled.
  void encode_timeval(struct ceph_timespec *t) const {
    t->tv_sec = tv.tv_sec;
    t->tv_nsec = tv.tv_nsec;
  }
  // Load both fields from a ceph_timespec (counterpart of encode_timeval()).
  void decode_timeval(const struct ceph_timespec *t) {
    tv.tv_sec = t->tv_sec;
    tv.tv_nsec = t->tv_nsec;
  }

  // Return a new utime_t at the start of the local-time minute containing
  // this value: the seconds are broken down with localtime_r, tm_sec is
  // zeroed, and mktime converts back.  This truncates (never rounds up) and
  // the result has zero nanoseconds.  *this is not modified.
  utime_t round_to_minute() {
    struct tm bdt;
    time_t tt = sec();
    localtime_r(&tt, &bdt);
    bdt.tm_sec = 0;
    tt = mktime(&bdt);
    return utime_t(tt, 0);
  }

  // As round_to_minute(), but also zeroes tm_min: start of the local hour.
  utime_t round_to_hour() {
    struct tm bdt;
    time_t tt = sec();
    localtime_r(&tt, &bdt);
    bdt.tm_sec = 0;
    bdt.tm_min = 0;
    tt = mktime(&bdt);
    return utime_t(tt, 0);
  }

  // As round_to_hour(), but also zeroes tm_hour: local midnight of the day
  // containing this value.
  utime_t round_to_day() {
    struct tm bdt;
    time_t tt = sec();
    localtime_r(&tt, &bdt);
    bdt.tm_sec = 0;
    bdt.tm_min = 0;
    bdt.tm_hour = 0;
    tt = mktime(&bdt);
    return utime_t(tt, 0);
  }

  // cast to double: seconds plus nsec()/1e9.  The divisor is a long double
  // literal, so the division itself is carried out in long double.
  operator double() const {
    return (double)sec() + ((double)nsec() / 1000000000.0L);
  }
  // Conversion to the ceph_timespec struct; both fields are copied from
  // sec() and nsec().
  operator ceph_timespec() const {
    ceph_timespec ts;
    ts.tv_sec = sec();
    ts.tv_nsec = nsec();
    return ts;
  }

  // Block the calling thread for the duration held in this value, treating
  // it as an interval.  The nanosleep() result is ignored and no "remaining"
  // buffer is passed, so an interrupted sleep is not resumed.
  void sleep() const {
    struct timespec ts;
    to_timespec(&ts);
    nanosleep(&ts, NULL);
  }

  // output
  //
  // Write this value to `out` in UTC.  Values below 60*60*24*365*10 seconds
  // (ten 365-day years) are assumed to be durations and are printed as
  // "<seconds>.<6-digit microseconds>"; anything else is printed as
  // "YYYY-MM-DD HH:MM:SS.uuuuuuZ".  The stream's fill character is restored
  // before returning and the `right` flag is cleared.
  ostream& gmtime(ostream& out) const {
    out.setf(std::ios::right);
    char oldfill = out.fill();
    out.fill('0');
    if (sec() < ((time_t)(60*60*24*365*10))) {
      // raw seconds.  this looks like a relative time.
      out << (long)sec() << "." << std::setw(6) << usec();
    } else {
      // this looks like an absolute time.
      //  aim for http://en.wikipedia.org/wiki/ISO_8601
      struct tm bdt;
      time_t tt = sec();
      gmtime_r(&tt, &bdt);
      out << std::setw(4) << (bdt.tm_year+1900)  // four-digit year
          << '-' << std::setw(2) << (bdt.tm_mon+1)
          << '-' << std::setw(2) << bdt.tm_mday
          << ' '
          << std::setw(2) << bdt.tm_hour
          << ':' << std::setw(2) << bdt.tm_min
          << ':' << std::setw(2) << bdt.tm_sec;
      out << "." << std::setw(6) << usec();
      out << "Z";
    }
    out.fill(oldfill);
    out.unsetf(std::ios::right);
    return out;
  }

  // output
  //
  // Same as gmtime(), except that absolute times carry nine fractional
  // digits taken from nsec().
  // NOTE(review): the relative-time branch still prints six digits from
  // usec(); confirm whether that asymmetry is intended.
  ostream& gmtime_nsec(ostream& out) const {
    out.setf(std::ios::right);
    char oldfill = out.fill();
    out.fill('0');
    if (sec() < ((time_t)(60*60*24*365*10))) {
      // raw seconds.  this looks like a relative time.
      out << (long)sec() << "." << std::setw(6) << usec();
    } else {
      // this looks like an absolute time.
      //  aim for http://en.wikipedia.org/wiki/ISO_8601
      struct tm bdt;
      time_t tt = sec();
      gmtime_r(&tt, &bdt);
      out << std::setw(4) << (bdt.tm_year+1900)  // four-digit year
          << '-' << std::setw(2) << (bdt.tm_mon+1)
          << '-' << std::setw(2) << bdt.tm_mday
          << ' '
          << std::setw(2) << bdt.tm_hour
          << ':' << std::setw(2) << bdt.tm_min
          << ':' << std::setw(2) << bdt.tm_sec;
      out << "." << std::setw(9) << nsec();
      out << "Z";
    }
    out.fill(oldfill);
    out.unsetf(std::ios::right);
    return out;
  }

  // output
  //
  // Write this value in the textual layout produced by asctime_r().  The
  // broken-down time comes from gmtime_r(), so the text is in UTC, and the
  // newline asctime_r() appends is removed.  Values that look like durations
  // are printed as raw seconds, exactly as in gmtime().
  ostream& asctime(ostream& out) const {
    out.setf(std::ios::right);
    char oldfill = out.fill();
    out.fill('0');
    if (sec() < ((time_t)(60*60*24*365*10))) {
      // raw seconds.  this looks like a relative time.
      out << (long)sec() << "." << std::setw(6) << usec();
    } else {
      // this looks like an absolute time; format it with asctime_r()
      struct tm bdt;
      time_t tt = sec();
      gmtime_r(&tt, &bdt);

      char buf[128];
      asctime_r(&bdt, buf);
      int len = strlen(buf);
      // drop the trailing newline that asctime_r() writes
      if (buf[len - 1] == '\n')
        buf[len - 1] = '\0';
      out << buf;
    }
    out.fill(oldfill);
    out.unsetf(std::ios::right);
    return out;
  }

  // Write this value in the local time zone (localtime_r) as
  // "YYYY-MM-DD HH:MM:SS.uuuuuu"; no zone suffix is printed.  Values that
  // look like durations are printed as raw seconds, exactly as in gmtime().
  ostream& localtime(ostream& out) const {
    out.setf(std::ios::right);
    char oldfill = out.fill();
    out.fill('0');
    if (sec() < ((time_t)(60*60*24*365*10))) {
      // raw seconds.  this looks like a relative time.
      out << (long)sec() << "." << std::setw(6) << usec();
    } else {
      // this looks like an absolute time.
      //  aim for http://en.wikipedia.org/wiki/ISO_8601
      struct tm bdt;
      time_t tt = sec();
      localtime_r(&tt, &bdt);
      out << std::setw(4) << (bdt.tm_year+1900)  // four-digit year
          << '-' << std::setw(2) << (bdt.tm_mon+1)
          << '-' << std::setw(2) << bdt.tm_mday
          << ' '
          << std::setw(2) << bdt.tm_hour
          << ':' << std::setw(2) << bdt.tm_min
          << ':' << std::setw(2) << bdt.tm_sec;
      out << "."
<< std::setw(6) << usec(); + //out << '_' << bdt.tm_zone; + } + out.fill(oldfill); + out.unsetf(std::ios::right); + return out; + } + + int sprintf(char *out, int outlen) const { + struct tm bdt; + time_t tt = sec(); + localtime_r(&tt, &bdt); + + return ::snprintf(out, outlen, + "%04d-%02d-%02d %02d:%02d:%02d.%06ld", + bdt.tm_year + 1900, bdt.tm_mon + 1, bdt.tm_mday, + bdt.tm_hour, bdt.tm_min, bdt.tm_sec, usec()); + } + + static int snprintf(char *out, int outlen, time_t tt) { + struct tm bdt; + localtime_r(&tt, &bdt); + + return ::snprintf(out, outlen, + "%04d-%02d-%02d %02d:%02d:%02d", + bdt.tm_year + 1900, bdt.tm_mon + 1, bdt.tm_mday, + bdt.tm_hour, bdt.tm_min, bdt.tm_sec); + } + + static int invoke_date(const std::string& date_str, utime_t *result) { + char buf[256]; + + SubProcess bin_date("/bin/date", SubProcess::CLOSE, SubProcess::PIPE, + SubProcess::KEEP); + bin_date.add_cmd_args("-d", date_str.c_str(), "+%s %N", NULL); + + int r = bin_date.spawn(); + if (r < 0) return r; + + ssize_t n = safe_read(bin_date.get_stdout(), buf, sizeof(buf)); + + r = bin_date.join(); + if (r || n <= 0) return -EINVAL; + + uint64_t epoch, nsec; + std::istringstream iss(buf); + + iss >> epoch; + iss >> nsec; + + *result = utime_t(epoch, nsec); + + return 0; + } + + + static int parse_date(const string& date, uint64_t *epoch, uint64_t *nsec, + string *out_date=NULL, string *out_time=NULL) { + struct tm tm; + memset(&tm, 0, sizeof(tm)); + + if (nsec) + *nsec = 0; + + const char *p = strptime(date.c_str(), "%Y-%m-%d", &tm); + if (p) { + if (*p == ' ' || *p == 'T') { + p++; + // strptime doesn't understand fractional/decimal seconds, and + // it also only takes format chars or literals, so we have to + // get creative. 
+ char fmt[32] = {0}; + strncpy(fmt, p, sizeof(fmt) - 1); + fmt[0] = '%'; + fmt[1] = 'H'; + fmt[2] = ':'; + fmt[3] = '%'; + fmt[4] = 'M'; + fmt[6] = '%'; + fmt[7] = 'S'; + const char *subsec = 0; + char *q = fmt + 8; + if (*q == '.') { + ++q; + subsec = p + 9; + q = fmt + 9; + while (*q && isdigit(*q)) { + ++q; + } + } + // look for tz... + if (*q == '-' || *q == '+') { + *q = '%'; + *(q+1) = 'z'; + *(q+2) = 0; + } + p = strptime(p, fmt, &tm); + if (!p) { + return -EINVAL; + } + if (nsec && subsec) { + unsigned i; + char buf[10]; /* 9 digit + null termination */ + for (i = 0; (i < sizeof(buf) - 1) && isdigit(*subsec); ++i, ++subsec) { + buf[i] = *subsec; + } + for (; i < sizeof(buf) - 1; ++i) { + buf[i] = '0'; + } + buf[i] = '\0'; + string err; + *nsec = (uint64_t)strict_strtol(buf, 10, &err); + if (!err.empty()) { + return -EINVAL; + } + } + } + } else { + int sec, usec; + int r = sscanf(date.c_str(), "%d.%d", &sec, &usec); + if (r != 2) { + return -EINVAL; + } + + time_t tt = sec; + gmtime_r(&tt, &tm); + + if (nsec) { + *nsec = (uint64_t)usec * 1000; + } + } + + // apply the tm_gmtoff manually below, since none of mktime, + // gmtime, and localtime seem to do it. zero it out here just in + // case some other libc *does* apply it. 
:( + auto gmtoff = tm.tm_gmtoff; + tm.tm_gmtoff = 0; + + time_t t = internal_timegm(&tm); + if (epoch) + *epoch = (uint64_t)t; + + *epoch -= gmtoff; + + if (out_date) { + char buf[32]; + strftime(buf, sizeof(buf), "%F", &tm); + *out_date = buf; + } + if (out_time) { + char buf[32]; + strftime(buf, sizeof(buf), "%T", &tm); + *out_time = buf; + } + + return 0; + } + + bool parse(const string& s) { + uint64_t epoch, nsec; + int r = parse_date(s, &epoch, &nsec); + if (r < 0) { + return false; + } + *this = utime_t(epoch, nsec); + return true; + } +}; +WRITE_CLASS_ENCODER(utime_t) +WRITE_CLASS_DENC(utime_t) + +// arithmetic operators +inline utime_t operator+(const utime_t& l, const utime_t& r) { + __u64 sec = (__u64)l.sec() + r.sec(); + return utime_t(cap_to_u32_max(sec), l.nsec() + r.nsec()); +} +inline utime_t& operator+=(utime_t& l, const utime_t& r) { + l.sec_ref() = cap_to_u32_max((__u64)l.sec() + r.sec()); + l.nsec_ref() += r.nsec(); + l.normalize(); + return l; +} +inline utime_t& operator+=(utime_t& l, double f) { + double fs = trunc(f); + double ns = (f - fs) * 1000000000.0; + l.sec_ref() = cap_to_u32_max(l.sec() + (__u64)fs); + l.nsec_ref() += (long)ns; + l.normalize(); + return l; +} + +inline utime_t operator-(const utime_t& l, const utime_t& r) { + return utime_t( l.sec() - r.sec() - (l.nsec()<r.nsec() ? 1:0), + l.nsec() - r.nsec() + (l.nsec()<r.nsec() ? 
1000000000:0) ); +} +inline utime_t& operator-=(utime_t& l, const utime_t& r) { + l.sec_ref() -= r.sec(); + if (l.nsec() >= r.nsec()) + l.nsec_ref() -= r.nsec(); + else { + l.nsec_ref() += 1000000000L - r.nsec(); + l.sec_ref()--; + } + return l; +} +inline utime_t& operator-=(utime_t& l, double f) { + double fs = trunc(f); + double ns = (f - fs) * 1000000000.0; + l.sec_ref() -= (long)fs; + long nsl = (long)ns; + if (nsl) { + l.sec_ref()--; + l.nsec_ref() = 1000000000L + l.nsec_ref() - nsl; + } + l.normalize(); + return l; +} + + +// comparators +inline bool operator>(const utime_t& a, const utime_t& b) +{ + return (a.sec() > b.sec()) || (a.sec() == b.sec() && a.nsec() > b.nsec()); +} +inline bool operator<=(const utime_t& a, const utime_t& b) +{ + return !(operator>(a, b)); +} +inline bool operator<(const utime_t& a, const utime_t& b) +{ + return (a.sec() < b.sec()) || (a.sec() == b.sec() && a.nsec() < b.nsec()); +} +inline bool operator>=(const utime_t& a, const utime_t& b) +{ + return !(operator<(a, b)); +} + +inline bool operator==(const utime_t& a, const utime_t& b) +{ + return a.sec() == b.sec() && a.nsec() == b.nsec(); +} +inline bool operator!=(const utime_t& a, const utime_t& b) +{ + return a.sec() != b.sec() || a.nsec() != b.nsec(); +} + + +// output + +// ostream +inline std::ostream& operator<<(std::ostream& out, const utime_t& t) +{ + return t.localtime(out); +} + +inline std::string utimespan_str(const utime_t& age) { + auto age_ts = ceph::timespan(age.nsec()) + std::chrono::seconds(age.sec()); + return timespan_str(age_ts); +} + +#endif diff --git a/src/include/uuid.h b/src/include/uuid.h new file mode 100644 index 00000000..f957f87a --- /dev/null +++ b/src/include/uuid.h @@ -0,0 +1,83 @@ +#ifndef _CEPH_UUID_H +#define _CEPH_UUID_H + +/* + * Thin C++ wrapper around libuuid. 
+ */ + +#include "encoding.h" + +#include <ostream> +#include <random> + +#include <boost/uuid/uuid.hpp> +#include <boost/uuid/uuid_generators.hpp> +#include <boost/uuid/uuid_io.hpp> + +struct uuid_d { + boost::uuids::uuid uuid; + + uuid_d() { + boost::uuids::nil_generator gen; + uuid = gen(); + } + + bool is_zero() const { + return uuid.is_nil(); + } + + void generate_random() { + std::random_device rng; + boost::uuids::basic_random_generator gen(rng); + uuid = gen(); + } + + bool parse(const char *s) { + try { + boost::uuids::string_generator gen; + uuid = gen(s); + return true; + } catch (std::runtime_error& e) { + return false; + } + } + void print(char *s) const { + memcpy(s, boost::uuids::to_string(uuid).c_str(), 37); + } + + std::string to_string() const { + return boost::uuids::to_string(uuid); + } + + char *bytes() const { + return (char*)uuid.data; + } + + void encode(bufferlist& bl) const { + ::encode_raw(uuid, bl); + } + + void decode(bufferlist::const_iterator& p) const { + ::decode_raw(uuid, p); + } +}; +WRITE_CLASS_ENCODER(uuid_d) + +inline std::ostream& operator<<(std::ostream& out, const uuid_d& u) { + char b[37]; + u.print(b); + return out << b; +} + +inline bool operator==(const uuid_d& l, const uuid_d& r) { + return l.uuid == r.uuid; +} +inline bool operator!=(const uuid_d& l, const uuid_d& r) { + return l.uuid != r.uuid; +} +inline bool operator<(const uuid_d& l, const uuid_d& r) { + return l.to_string() < r.to_string(); +} + + +#endif diff --git a/src/include/xlist.h b/src/include/xlist.h new file mode 100644 index 00000000..733a318a --- /dev/null +++ b/src/include/xlist.h @@ -0,0 +1,224 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net> + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, 
as published by the Free Software + * Foundation. See file COPYING. + * + */ + +#ifndef CEPH_XLIST_H +#define CEPH_XLIST_H + +#include <iterator> +#include <cstdlib> +#include <ostream> + +#include "include/ceph_assert.h" + +template<typename T> +class xlist { +public: + class item { + public: + item(T i) : _item(i) {} + ~item() { + ceph_assert(!is_on_list()); + } + + item(const item& other) = delete; + item(item&& other) = delete; + const item& operator= (const item& right) = delete; + item& operator= (item&& right) = delete; + + xlist* get_list() { return _list; } + bool is_on_list() const { return _list ? true:false; } + bool remove_myself() { + if (_list) { + _list->remove(this); + ceph_assert(_list == 0); + return true; + } else + return false; + } + void move_to_front() { + ceph_assert(_list); + _list->push_front(this); + } + void move_to_back() { + ceph_assert(_list); + _list->push_back(this); + } + + private: + friend xlist; + T _item; + item *_prev = nullptr, *_next = nullptr; + xlist *_list = nullptr; + }; + + typedef item* value_type; + typedef item* const_reference; + +private: + item *_front, *_back; + size_t _size; + +public: + xlist(const xlist& other) { + _front = other._front; + _back = other._back; + _size = other._size; + } + + xlist() : _front(0), _back(0), _size(0) {} + ~xlist() { + ceph_assert(_size == 0); + ceph_assert(_front == 0); + ceph_assert(_back == 0); + } + + size_t size() const { + ceph_assert((bool)_front == (bool)_size); + return _size; + } + bool empty() const { + ceph_assert((bool)_front == (bool)_size); + return _front == 0; + } + + void clear() { + while (_front) + remove(_front); + ceph_assert((bool)_front == (bool)_size); + } + + void push_front(item *i) { + if (i->_list) + i->_list->remove(i); + + i->_list = this; + i->_next = _front; + i->_prev = 0; + if (_front) + _front->_prev = i; + else + _back = i; + _front = i; + _size++; + } + void push_back(item *i) { + if (i->_list) + i->_list->remove(i); + + i->_list = this; + 
i->_next = 0; + i->_prev = _back; + if (_back) + _back->_next = i; + else + _front = i; + _back = i; + _size++; + } + void remove(item *i) { + ceph_assert(i->_list == this); + + if (i->_prev) + i->_prev->_next = i->_next; + else + _front = i->_next; + if (i->_next) + i->_next->_prev = i->_prev; + else + _back = i->_prev; + _size--; + + i->_list = 0; + i->_next = i->_prev = 0; + ceph_assert((bool)_front == (bool)_size); + } + + T front() { return static_cast<T>(_front->_item); } + const T front() const { return static_cast<const T>(_front->_item); } + + T back() { return static_cast<T>(_back->_item); } + const T back() const { return static_cast<const T>(_back->_item); } + + void pop_front() { + ceph_assert(!empty()); + remove(_front); + } + void pop_back() { + ceph_assert(!empty()); + remove(_back); + } + + class iterator: std::iterator<std::forward_iterator_tag, T> { + private: + item *cur; + public: + iterator(item *i = 0) : cur(i) {} + T operator*() { return static_cast<T>(cur->_item); } + iterator& operator++() { + ceph_assert(cur); + ceph_assert(cur->_list); + cur = cur->_next; + return *this; + } + bool end() const { return cur == 0; } + bool operator==(const iterator& rhs) const { + return cur == rhs.cur; + } + bool operator!=(const iterator& rhs) const { + return cur != rhs.cur; + } + }; + + iterator begin() { return iterator(_front); } + iterator end() { return iterator(NULL); } + + class const_iterator: std::iterator<std::forward_iterator_tag, T> { + private: + item *cur; + public: + const_iterator(item *i = 0) : cur(i) {} + const T operator*() { return static_cast<const T>(cur->_item); } + const_iterator& operator++() { + ceph_assert(cur); + ceph_assert(cur->_list); + cur = cur->_next; + return *this; + } + bool end() const { return cur == 0; } + bool operator==(const_iterator& rhs) const { + return cur == rhs.cur; + } + bool operator!=(const_iterator& rhs) const { + return cur != rhs.cur; + } + }; + + const_iterator begin() const { return 
const_iterator(_front); } + const_iterator end() const { return const_iterator(NULL); } + + friend std::ostream &operator<<(std::ostream &oss, const xlist<T> &list) { + bool first = true; + for (const auto &item : list) { + if (!first) { + oss << ", "; + } + oss << *item; /* item should be a pointer */ + first = false; + } + return oss; + } +}; + + +#endif |