summaryrefslogtreecommitdiffstats
path: root/src/include/rados
diff options
context:
space:
mode:
authorDaniel Baumann <daniel.baumann@progress-linux.org>2024-04-21 11:54:28 +0000
committerDaniel Baumann <daniel.baumann@progress-linux.org>2024-04-21 11:54:28 +0000
commite6918187568dbd01842d8d1d2c808ce16a894239 (patch)
tree64f88b554b444a49f656b6c656111a145cbbaa28 /src/include/rados
parentInitial commit. (diff)
downloadceph-e6918187568dbd01842d8d1d2c808ce16a894239.tar.xz
ceph-e6918187568dbd01842d8d1d2c808ce16a894239.zip
Adding upstream version 18.2.2.upstream/18.2.2
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to '')
-rw-r--r--src/include/rados.h700
l---------src/include/rados/buffer.h1
l---------src/include/rados/buffer_fwd.h1
l---------src/include/rados/crc32c.h1
l---------src/include/rados/inline_memory.h1
-rw-r--r--src/include/rados/librados.h4156
-rw-r--r--src/include/rados/librados.hpp1568
-rw-r--r--src/include/rados/librados_fwd.hpp34
-rw-r--r--src/include/rados/librgw.h36
-rw-r--r--src/include/rados/objclass.h177
l---------src/include/rados/page.h1
-rw-r--r--src/include/rados/rados_types.h41
-rw-r--r--src/include/rados/rados_types.hpp341
-rw-r--r--src/include/rados/rgw_file.h431
-rw-r--r--src/include/radosstriper/libradosstriper.h620
-rw-r--r--src/include/radosstriper/libradosstriper.hpp241
16 files changed, 8350 insertions, 0 deletions
diff --git a/src/include/rados.h b/src/include/rados.h
new file mode 100644
index 000000000..eac3a2159
--- /dev/null
+++ b/src/include/rados.h
@@ -0,0 +1,700 @@
+#ifndef CEPH_RADOS_H
+#define CEPH_RADOS_H
+
+/*
+ * Data types for the Ceph distributed object storage layer RADOS
+ * (Reliable Autonomic Distributed Object Store).
+ */
+
+#include <string.h>
+#include <stdbool.h>
+#include "msgr.h"
+
+/* See comment in ceph_fs.h. */
+#ifndef __KERNEL__
+#include "byteorder.h"
+#define __le16 ceph_le16
+#define __le32 ceph_le32
+#define __le64 ceph_le64
+#endif
+
+/*
+ * fs id
+ */
+struct ceph_fsid {
+ unsigned char fsid[16];
+};
+
+static inline int ceph_fsid_compare(const struct ceph_fsid *a,
+ const struct ceph_fsid *b)
+{
+ /* byte-wise comparison over the whole struct; sign follows memcmp() */
+ const size_t nbytes = sizeof(*a);
+
+ return memcmp(a, b, nbytes);
+}
+
+/*
+ * ino, object, etc.
+ */
+typedef __le64 ceph_snapid_t;
+#define CEPH_SNAPDIR ((__u64)(-1)) /* reserved for hidden .snap dir */
+#define CEPH_NOSNAP ((__u64)(-2)) /* "head", "live" revision */
+#define CEPH_MAXSNAP ((__u64)(-3)) /* largest valid snapid */
+
+struct ceph_timespec {
+ __le32 tv_sec;
+ __le32 tv_nsec;
+} __attribute__ ((packed));
+
+
+/*
+ * object layout - how objects are mapped into PGs
+ */
+#define CEPH_OBJECT_LAYOUT_HASH 1
+#define CEPH_OBJECT_LAYOUT_LINEAR 2
+#define CEPH_OBJECT_LAYOUT_HASHINO 3
+
+/*
+ * pg layout -- how PGs are mapped onto (sets of) OSDs
+ */
+#define CEPH_PG_LAYOUT_CRUSH 0
+#define CEPH_PG_LAYOUT_HASH 1
+#define CEPH_PG_LAYOUT_LINEAR 2
+#define CEPH_PG_LAYOUT_HYBRID 3
+
+#define CEPH_PG_MAX_SIZE 16 /* max # osds in a single pg */
+
+/*
+ * placement group.
+ * we encode this into one __le64.
+ */
+struct ceph_pg {
+ __le16 preferred; /* preferred primary osd */
+ __le16 ps; /* placement seed */
+ __le32 pool; /* object pool */
+} __attribute__ ((packed));
+
+/*
+ * pg pool types
+ *
+ * NOTE: These map 1:1 on to the pg_pool_t::TYPE_* values. They are
+ * duplicated here only for CrushCompiler's benefit.
+ */
+#define CEPH_PG_TYPE_REPLICATED 1
+/* #define CEPH_PG_TYPE_RAID4 2 never implemented */
+#define CEPH_PG_TYPE_ERASURE 3
+
+/*
+ * stable_mod func is used to control number of placement groups.
+ * similar to straight-up modulo, but produces a stable mapping as b
+ * increases over time. b is the number of bins, and bmask is the
+ * containing power of 2 minus 1.
+ *
+ * b <= bmask and bmask=(2**n)-1
+ * e.g., b=12 -> bmask=15, b=123 -> bmask=127
+ *
+ * ** This function is released to the public domain by the author. **
+ */
+static inline int ceph_stable_mod(int x, int b, int bmask)
+{
+ /* try the full mask first; values landing at or past b use the half mask */
+ const int full = x & bmask;
+ const int half = x & (bmask >> 1);
+
+ return (full < b) ? full : half;
+}
+
+/*
+ * object layout - how a given object should be stored.
+ */
+struct ceph_object_layout {
+ struct ceph_pg ol_pgid; /* raw pg, with _full_ ps precision. */
+ __le32 ol_stripe_unit; /* for per-object parity, if any */
+} __attribute__ ((packed));
+
+/*
+ * compound epoch+version, used by storage layer to serialize mutations
+ */
+struct ceph_eversion {
+ __le32 epoch;
+ __le64 version;
+} __attribute__ ((packed));
+
+/*
+ * osd map bits
+ */
+
+/* status bits */
+#define CEPH_OSD_EXISTS (1<<0)
+#define CEPH_OSD_UP (1<<1)
+#define CEPH_OSD_AUTOOUT (1<<2) /* osd was automatically marked out */
+#define CEPH_OSD_NEW (1<<3) /* osd is new, never marked in */
+#define CEPH_OSD_FULL (1<<4) /* osd is at or above full threshold */
+#define CEPH_OSD_NEARFULL (1<<5) /* osd is at or above nearfull threshold */
+#define CEPH_OSD_BACKFILLFULL (1<<6) /* osd is at or above backfillfull threshold */
+#define CEPH_OSD_DESTROYED (1<<7) /* osd has been destroyed */
+#define CEPH_OSD_NOUP (1<<8) /* osd can not be marked up */
+#define CEPH_OSD_NODOWN (1<<9) /* osd can not be marked down */
+#define CEPH_OSD_NOIN (1<<10) /* osd can not be marked in */
+#define CEPH_OSD_NOOUT (1<<11) /* osd can not be marked out */
+#define CEPH_OSD_STOP (1<<12) /* osd has been stopped by admin */
+
+extern const char *ceph_osd_state_name(int s);
+
+/* osd weights. fixed point value: 0x10000 == 1.0 ("in"), 0 == "out" */
+#define CEPH_OSD_IN 0x10000
+#define CEPH_OSD_OUT 0
+
+#define CEPH_OSD_MAX_PRIMARY_AFFINITY 0x10000
+#define CEPH_OSD_DEFAULT_PRIMARY_AFFINITY 0x10000
+
+
+/*
+ * osd map flag bits
+ */
+#define CEPH_OSDMAP_NEARFULL (1<<0) /* sync writes (near ENOSPC), deprecated since mimic */
+#define CEPH_OSDMAP_FULL (1<<1) /* no data writes (ENOSPC), deprecated since mimic */
+#define CEPH_OSDMAP_PAUSERD (1<<2) /* pause all reads */
+#define CEPH_OSDMAP_PAUSEWR (1<<3) /* pause all writes */
+#define CEPH_OSDMAP_PAUSEREC (1<<4) /* pause recovery */
+#define CEPH_OSDMAP_NOUP (1<<5) /* block osd boot */
+#define CEPH_OSDMAP_NODOWN (1<<6) /* block osd mark-down/failure */
+#define CEPH_OSDMAP_NOOUT (1<<7) /* block osd auto mark-out */
+#define CEPH_OSDMAP_NOIN (1<<8) /* block osd auto mark-in */
+#define CEPH_OSDMAP_NOBACKFILL (1<<9) /* block osd backfill */
+#define CEPH_OSDMAP_NORECOVER (1<<10) /* block osd recovery and backfill */
+#define CEPH_OSDMAP_NOSCRUB (1<<11) /* block periodic scrub */
+#define CEPH_OSDMAP_NODEEP_SCRUB (1<<12) /* block periodic deep-scrub */
+#define CEPH_OSDMAP_NOTIERAGENT (1<<13) /* disable tiering agent */
+#define CEPH_OSDMAP_NOREBALANCE (1<<14) /* block osd backfill unless pg is degraded */
+#define CEPH_OSDMAP_SORTBITWISE (1<<15) /* use bitwise hobject_t sort */
+#define CEPH_OSDMAP_REQUIRE_JEWEL (1<<16) /* require jewel for booting osds */
+#define CEPH_OSDMAP_REQUIRE_KRAKEN (1<<17) /* require kraken for booting osds */
+#define CEPH_OSDMAP_REQUIRE_LUMINOUS (1<<18) /* require luminous for booting osds */
+#define CEPH_OSDMAP_RECOVERY_DELETES (1<<19) /* deletes performed during recovery instead of peering */
+#define CEPH_OSDMAP_PURGED_SNAPDIRS (1<<20) /* osds have converted snapsets */
+#define CEPH_OSDMAP_NOSNAPTRIM (1<<21) /* disable snap trimming */
+#define CEPH_OSDMAP_PGLOG_HARDLIMIT (1<<22) /* put a hard limit on pg log length */
+#define CEPH_OSDMAP_NOAUTOSCALE (1<<23) /* block pg autoscale */
+
+/* these are hidden in 'ceph status' view */
+#define CEPH_OSDMAP_SEMIHIDDEN_FLAGS (CEPH_OSDMAP_REQUIRE_JEWEL| \
+ CEPH_OSDMAP_REQUIRE_KRAKEN | \
+ CEPH_OSDMAP_REQUIRE_LUMINOUS | \
+ CEPH_OSDMAP_RECOVERY_DELETES | \
+ CEPH_OSDMAP_SORTBITWISE | \
+ CEPH_OSDMAP_PURGED_SNAPDIRS | \
+ CEPH_OSDMAP_PGLOG_HARDLIMIT)
+#define CEPH_OSDMAP_LEGACY_REQUIRE_FLAGS (CEPH_OSDMAP_REQUIRE_JEWEL | \
+ CEPH_OSDMAP_REQUIRE_KRAKEN | \
+ CEPH_OSDMAP_REQUIRE_LUMINOUS)
+
+/*
+ * major ceph release numbers
+ */
+#define CEPH_RELEASE_ARGONAUT 1
+#define CEPH_RELEASE_BOBTAIL 2
+#define CEPH_RELEASE_CUTTLEFISH 3
+#define CEPH_RELEASE_DUMPLING 4
+#define CEPH_RELEASE_EMPEROR 5
+#define CEPH_RELEASE_FIREFLY 6
+#define CEPH_RELEASE_GIANT 7
+#define CEPH_RELEASE_HAMMER 8
+#define CEPH_RELEASE_INFERNALIS 9
+#define CEPH_RELEASE_JEWEL 10
+#define CEPH_RELEASE_KRAKEN 11
+#define CEPH_RELEASE_LUMINOUS 12
+#define CEPH_RELEASE_MIMIC 13
+#define CEPH_RELEASE_NAUTILUS 14
+#define CEPH_RELEASE_OCTOPUS 15
+#define CEPH_RELEASE_PACIFIC 16
+#define CEPH_RELEASE_QUINCY 17
+#define CEPH_RELEASE_REEF 18
+#define CEPH_RELEASE_MAX 19 /* highest + 1 */
+
+/*
+ * The error code to return when an OSD can't handle a write
+ * because it is too large.
+ */
+#define OSD_WRITETOOBIG EMSGSIZE
+
+/*
+ * osd ops
+ *
+ * WARNING: do not use these op codes directly. Use the helpers
+ * defined below instead. In certain cases, op code behavior was
+ * redefined, resulting in special-cases in the helpers.
+ */
+#define CEPH_OSD_OP_MODE 0xf000
+#define CEPH_OSD_OP_MODE_RD 0x1000
+#define CEPH_OSD_OP_MODE_WR 0x2000
+#define CEPH_OSD_OP_MODE_RMW 0x3000
+#define CEPH_OSD_OP_MODE_SUB 0x4000
+#define CEPH_OSD_OP_MODE_CACHE 0x8000
+
+#define CEPH_OSD_OP_TYPE 0x0f00
+#define CEPH_OSD_OP_TYPE_DATA 0x0200
+#define CEPH_OSD_OP_TYPE_ATTR 0x0300
+#define CEPH_OSD_OP_TYPE_EXEC 0x0400
+#define CEPH_OSD_OP_TYPE_PG 0x0500
+// LEAVE UNUSED 0x0600 used to be multiobject ops
+
+#define __CEPH_OSD_OP1(mode, nr) \
+ (CEPH_OSD_OP_MODE_##mode | (nr))
+
+#define __CEPH_OSD_OP(mode, type, nr) \
+ (CEPH_OSD_OP_MODE_##mode | CEPH_OSD_OP_TYPE_##type | (nr))
+
+#define __CEPH_FORALL_OSD_OPS(f) \
+ /** data **/ \
+ /* read */ \
+ f(READ, __CEPH_OSD_OP(RD, DATA, 1), "read") \
+ f(STAT, __CEPH_OSD_OP(RD, DATA, 2), "stat") \
+ f(MAPEXT, __CEPH_OSD_OP(RD, DATA, 3), "mapext") \
+ f(CHECKSUM, __CEPH_OSD_OP(RD, DATA, 31), "checksum") \
+ \
+ /* fancy read */ \
+ f(MASKTRUNC, __CEPH_OSD_OP(RD, DATA, 4), "masktrunc") \
+ f(SPARSE_READ, __CEPH_OSD_OP(RD, DATA, 5), "sparse-read") \
+ \
+ f(NOTIFY, __CEPH_OSD_OP(RD, DATA, 6), "notify") \
+ f(NOTIFY_ACK, __CEPH_OSD_OP(RD, DATA, 7), "notify-ack") \
+ \
+ /* versioning */ \
+ f(ASSERT_VER, __CEPH_OSD_OP(RD, DATA, 8), "assert-version") \
+ \
+ f(LIST_WATCHERS, __CEPH_OSD_OP(RD, DATA, 9), "list-watchers") \
+ \
+ f(LIST_SNAPS, __CEPH_OSD_OP(RD, DATA, 10), "list-snaps") \
+ \
+ /* sync */ \
+ f(SYNC_READ, __CEPH_OSD_OP(RD, DATA, 11), "sync_read") \
+ \
+ /* write */ \
+ f(WRITE, __CEPH_OSD_OP(WR, DATA, 1), "write") \
+ f(WRITEFULL, __CEPH_OSD_OP(WR, DATA, 2), "writefull") \
+ f(TRUNCATE, __CEPH_OSD_OP(WR, DATA, 3), "truncate") \
+ f(ZERO, __CEPH_OSD_OP(WR, DATA, 4), "zero") \
+ f(DELETE, __CEPH_OSD_OP(WR, DATA, 5), "delete") \
+ \
+ /* fancy write */ \
+ f(APPEND, __CEPH_OSD_OP(WR, DATA, 6), "append") \
+ f(STARTSYNC, __CEPH_OSD_OP(WR, DATA, 7), "startsync") \
+ f(SETTRUNC, __CEPH_OSD_OP(WR, DATA, 8), "settrunc") \
+ f(TRIMTRUNC, __CEPH_OSD_OP(WR, DATA, 9), "trimtrunc") \
+ \
+ f(TMAPUP, __CEPH_OSD_OP(RMW, DATA, 10), "tmapup") \
+ f(TMAPPUT, __CEPH_OSD_OP(WR, DATA, 11), "tmapput") \
+ f(TMAPGET, __CEPH_OSD_OP(RD, DATA, 12), "tmapget") \
+ \
+ f(CREATE, __CEPH_OSD_OP(WR, DATA, 13), "create") \
+ f(ROLLBACK, __CEPH_OSD_OP(WR, DATA, 14), "rollback") \
+ \
+ f(WATCH, __CEPH_OSD_OP(WR, DATA, 15), "watch") \
+ \
+ /* omap */ \
+ f(OMAPGETKEYS, __CEPH_OSD_OP(RD, DATA, 17), "omap-get-keys") \
+ f(OMAPGETVALS, __CEPH_OSD_OP(RD, DATA, 18), "omap-get-vals") \
+ f(OMAPGETHEADER, __CEPH_OSD_OP(RD, DATA, 19), "omap-get-header") \
+ f(OMAPGETVALSBYKEYS, __CEPH_OSD_OP(RD, DATA, 20), "omap-get-vals-by-keys") \
+ f(OMAPSETVALS, __CEPH_OSD_OP(WR, DATA, 21), "omap-set-vals") \
+ f(OMAPSETHEADER, __CEPH_OSD_OP(WR, DATA, 22), "omap-set-header") \
+ f(OMAPCLEAR, __CEPH_OSD_OP(WR, DATA, 23), "omap-clear") \
+ f(OMAPRMKEYS, __CEPH_OSD_OP(WR, DATA, 24), "omap-rm-keys") \
+ f(OMAPRMKEYRANGE, __CEPH_OSD_OP(WR, DATA, 44), "omap-rm-key-range") \
+ f(OMAP_CMP, __CEPH_OSD_OP(RD, DATA, 25), "omap-cmp") \
+ \
+ /* tiering */ \
+ f(COPY_FROM, __CEPH_OSD_OP(WR, DATA, 26), "copy-from") \
+ f(COPY_FROM2, __CEPH_OSD_OP(WR, DATA, 45), "copy-from2") \
+ /* was copy-get-classic */ \
+ f(UNDIRTY, __CEPH_OSD_OP(WR, DATA, 28), "undirty") \
+ f(ISDIRTY, __CEPH_OSD_OP(RD, DATA, 29), "isdirty") \
+ f(COPY_GET, __CEPH_OSD_OP(RD, DATA, 30), "copy-get") \
+ f(CACHE_FLUSH, __CEPH_OSD_OP(CACHE, DATA, 31), "cache-flush") \
+ f(CACHE_EVICT, __CEPH_OSD_OP(CACHE, DATA, 32), "cache-evict") \
+ f(CACHE_TRY_FLUSH, __CEPH_OSD_OP(CACHE, DATA, 33), "cache-try-flush") \
+ \
+ /* convert tmap to omap */ \
+ f(TMAP2OMAP, __CEPH_OSD_OP(RMW, DATA, 34), "tmap2omap") \
+ \
+ /* hints */ \
+ f(SETALLOCHINT, __CEPH_OSD_OP(WR, DATA, 35), "set-alloc-hint") \
+ \
+ /* cache pin/unpin */ \
+ f(CACHE_PIN, __CEPH_OSD_OP(WR, DATA, 36), "cache-pin") \
+ f(CACHE_UNPIN, __CEPH_OSD_OP(WR, DATA, 37), "cache-unpin") \
+ \
+ /* ESX/SCSI */ \
+ f(WRITESAME, __CEPH_OSD_OP(WR, DATA, 38), "write-same") \
+ f(CMPEXT, __CEPH_OSD_OP(RD, DATA, 32), "cmpext") \
+ \
+ /* Extensible */ \
+ f(SET_REDIRECT, __CEPH_OSD_OP(WR, DATA, 39), "set-redirect") \
+ f(SET_CHUNK, __CEPH_OSD_OP(CACHE, DATA, 40), "set-chunk") \
+ f(TIER_PROMOTE, __CEPH_OSD_OP(WR, DATA, 41), "tier-promote") \
+ f(UNSET_MANIFEST, __CEPH_OSD_OP(WR, DATA, 42), "unset-manifest") \
+ f(TIER_FLUSH, __CEPH_OSD_OP(CACHE, DATA, 43), "tier-flush") \
+ f(TIER_EVICT, __CEPH_OSD_OP(CACHE, DATA, 44), "tier-evict") \
+ \
+ /** attrs **/ \
+ /* read */ \
+ f(GETXATTR, __CEPH_OSD_OP(RD, ATTR, 1), "getxattr") \
+ f(GETXATTRS, __CEPH_OSD_OP(RD, ATTR, 2), "getxattrs") \
+ f(CMPXATTR, __CEPH_OSD_OP(RD, ATTR, 3), "cmpxattr") \
+ \
+ /* write */ \
+ f(SETXATTR, __CEPH_OSD_OP(WR, ATTR, 1), "setxattr") \
+ f(SETXATTRS, __CEPH_OSD_OP(WR, ATTR, 2), "setxattrs") \
+ f(RESETXATTRS, __CEPH_OSD_OP(WR, ATTR, 3), "resetxattrs") \
+ f(RMXATTR, __CEPH_OSD_OP(WR, ATTR, 4), "rmxattr") \
+ \
+ /** subop **/ \
+ f(PULL, __CEPH_OSD_OP1(SUB, 1), "pull") \
+ f(PUSH, __CEPH_OSD_OP1(SUB, 2), "push") \
+ f(BALANCEREADS, __CEPH_OSD_OP1(SUB, 3), "balance-reads") \
+ f(UNBALANCEREADS, __CEPH_OSD_OP1(SUB, 4), "unbalance-reads") \
+ f(SCRUB, __CEPH_OSD_OP1(SUB, 5), "scrub") \
+ f(SCRUB_RESERVE, __CEPH_OSD_OP1(SUB, 6), "scrub-reserve") \
+ f(SCRUB_UNRESERVE, __CEPH_OSD_OP1(SUB, 7), "scrub-unreserve") \
+ /* 8 used to be scrub-stop */ \
+ f(SCRUB_MAP, __CEPH_OSD_OP1(SUB, 9), "scrub-map") \
+ \
+ /** exec **/ \
+ /* note: the RD bit here is wrong; see special-case below in helper */ \
+ f(CALL, __CEPH_OSD_OP(RD, EXEC, 1), "call") \
+ \
+ /** pg **/ \
+ f(PGLS, __CEPH_OSD_OP(RD, PG, 1), "pgls") \
+ f(PGLS_FILTER, __CEPH_OSD_OP(RD, PG, 2), "pgls-filter") \
+ f(PG_HITSET_LS, __CEPH_OSD_OP(RD, PG, 3), "pg-hitset-ls") \
+ f(PG_HITSET_GET, __CEPH_OSD_OP(RD, PG, 4), "pg-hitset-get") \
+ f(PGNLS, __CEPH_OSD_OP(RD, PG, 5), "pgnls") \
+ f(PGNLS_FILTER, __CEPH_OSD_OP(RD, PG, 6), "pgnls-filter") \
+ f(SCRUBLS, __CEPH_OSD_OP(RD, PG, 7), "scrubls")
+
+enum { /* one CEPH_OSD_OP_<name> enumerator per f(...) row of __CEPH_FORALL_OSD_OPS */
+#define GENERATE_ENUM_ENTRY(op, opcode, str) CEPH_OSD_OP_##op = (opcode),
+__CEPH_FORALL_OSD_OPS(GENERATE_ENUM_ENTRY)
+#undef GENERATE_ENUM_ENTRY /* helper is local to this enum; its str argument is unused here */
+};
+
+static inline int ceph_osd_op_type_data(int op)
+{
+ /* 1 when the bits under the CEPH_OSD_OP_TYPE mask equal TYPE_DATA, else 0 */
+ const int type_bits = op & CEPH_OSD_OP_TYPE;
+
+ return type_bits == CEPH_OSD_OP_TYPE_DATA;
+}
+static inline int ceph_osd_op_type_attr(int op)
+{
+ /* 1 when the bits under the CEPH_OSD_OP_TYPE mask equal TYPE_ATTR, else 0 */
+ const int type_bits = op & CEPH_OSD_OP_TYPE;
+
+ return type_bits == CEPH_OSD_OP_TYPE_ATTR;
+}
+static inline int ceph_osd_op_type_exec(int op)
+{
+ /* 1 when the bits under the CEPH_OSD_OP_TYPE mask equal TYPE_EXEC, else 0 */
+ const int type_bits = op & CEPH_OSD_OP_TYPE;
+
+ return type_bits == CEPH_OSD_OP_TYPE_EXEC;
+}
+static inline int ceph_osd_op_type_pg(int op)
+{
+ /* 1 when the bits under the CEPH_OSD_OP_TYPE mask equal TYPE_PG, else 0 */
+ const int type_bits = op & CEPH_OSD_OP_TYPE;
+
+ return type_bits == CEPH_OSD_OP_TYPE_PG;
+}
+
+static inline int ceph_osd_op_mode_subop(int op)
+{
+ /* 1 when the bits under the CEPH_OSD_OP_MODE mask are exactly MODE_SUB, else 0 */
+ const int mode_bits = op & CEPH_OSD_OP_MODE;
+
+ return mode_bits == CEPH_OSD_OP_MODE_SUB;
+}
+static inline int ceph_osd_op_mode_read(int op)
+{
+ /* CALL is encoded with the RD mode bit but is never reported as a read */
+ if (op == CEPH_OSD_OP_CALL)
+  return 0;
+
+ return (op & CEPH_OSD_OP_MODE_RD) != 0;
+}
+static inline int ceph_osd_op_mode_modify(int op)
+{
+ /* yields the raw WR mode bit (CEPH_OSD_OP_MODE_WR or 0), not a normalized 0/1 */
+ const int wr_bit = CEPH_OSD_OP_MODE_WR & op;
+
+ return wr_bit;
+}
+static inline int ceph_osd_op_mode_cache(int op)
+{
+ /* yields the raw CACHE mode bit (CEPH_OSD_OP_MODE_CACHE or 0), not a normalized 0/1 */
+ const int cache_bit = CEPH_OSD_OP_MODE_CACHE & op;
+
+ return cache_bit;
+}
+static inline bool ceph_osd_op_uses_extent(int op)
+{
+ /* true only for the opcodes listed below; every other value yields false */
+ bool uses_extent = false;
+
+ switch (op) {
+ case CEPH_OSD_OP_READ:
+ case CEPH_OSD_OP_MAPEXT:
+ case CEPH_OSD_OP_MASKTRUNC:
+ case CEPH_OSD_OP_SPARSE_READ:
+ case CEPH_OSD_OP_SYNC_READ:
+ case CEPH_OSD_OP_WRITE:
+ case CEPH_OSD_OP_WRITEFULL:
+ case CEPH_OSD_OP_TRUNCATE:
+ case CEPH_OSD_OP_ZERO:
+ case CEPH_OSD_OP_APPEND:
+ case CEPH_OSD_OP_TRIMTRUNC:
+ case CEPH_OSD_OP_CMPEXT:
+  uses_extent = true;
+  break;
+ default:
+  break;
+ }
+
+ return uses_extent;
+}
+
+/*
+ * Note that the following tmap constants are also defined in the ceph
+ * librados.h and objclass.h. Any modification here must be mirrored there.
+ */
+#define CEPH_OSD_TMAP_HDR 'h'
+#define CEPH_OSD_TMAP_SET 's'
+#define CEPH_OSD_TMAP_CREATE 'c' /* create key */
+#define CEPH_OSD_TMAP_RM 'r'
+#define CEPH_OSD_TMAP_RMSLOPPY 'R'
+
+extern const char *ceph_osd_op_name(int op);
+
+/*
+ * osd op flags
+ *
+ * An op may be READ, WRITE, or READ|WRITE.
+ */
+enum {
+ CEPH_OSD_FLAG_ACK = 0x0001, /* want (or is) "ack" ack */
+ CEPH_OSD_FLAG_ONNVRAM = 0x0002, /* want (or is) "onnvram" ack */
+ CEPH_OSD_FLAG_ONDISK = 0x0004, /* want (or is) "ondisk" ack */
+ CEPH_OSD_FLAG_RETRY = 0x0008, /* resend attempt */
+ CEPH_OSD_FLAG_READ = 0x0010, /* op may read */
+ CEPH_OSD_FLAG_WRITE = 0x0020, /* op may write */
+ CEPH_OSD_FLAG_ORDERSNAP = 0x0040, /* EOLDSNAP if snapc is out of order */
+ CEPH_OSD_FLAG_PEERSTAT_OLD = 0x0080, /* DEPRECATED msg includes osd_peer_stat */
+ CEPH_OSD_FLAG_BALANCE_READS = 0x0100,
+ CEPH_OSD_FLAG_PARALLELEXEC = 0x0200, /* execute op in parallel */
+ CEPH_OSD_FLAG_PGOP = 0x0400, /* pg op, no object */
+ CEPH_OSD_FLAG_EXEC = 0x0800, /* op may exec */
+ CEPH_OSD_FLAG_EXEC_PUBLIC = 0x1000, /* DEPRECATED op may exec (public) */
+ CEPH_OSD_FLAG_LOCALIZE_READS = 0x2000, /* read from nearby replica, if any */
+ CEPH_OSD_FLAG_RWORDERED = 0x4000, /* order wrt concurrent reads */
+ CEPH_OSD_FLAG_IGNORE_CACHE = 0x8000, /* ignore cache logic */
+ CEPH_OSD_FLAG_SKIPRWLOCKS = 0x10000, /* skip rw locks */
+ CEPH_OSD_FLAG_IGNORE_OVERLAY =0x20000, /* ignore pool overlay */
+ CEPH_OSD_FLAG_FLUSH = 0x40000, /* this is part of flush */
+ CEPH_OSD_FLAG_MAP_SNAP_CLONE =0x80000, /* map snap direct to clone id
+ */
+ CEPH_OSD_FLAG_ENFORCE_SNAPC =0x100000, /* use snapc provided even if
+ pool uses pool snaps */
+ CEPH_OSD_FLAG_REDIRECTED = 0x200000, /* op has been redirected */
+ CEPH_OSD_FLAG_KNOWN_REDIR = 0x400000, /* redirect bit is authoritative */
+ CEPH_OSD_FLAG_FULL_TRY = 0x800000, /* try op despite full flag */
+ CEPH_OSD_FLAG_FULL_FORCE = 0x1000000, /* force op despite full flag */
+ CEPH_OSD_FLAG_IGNORE_REDIRECT = 0x2000000, /* ignore redirection */
+ CEPH_OSD_FLAG_RETURNVEC = 0x4000000, /* allow overall result >= 0, and return >= 0 and buffer for each op in opvec */
+ CEPH_OSD_FLAG_SUPPORTSPOOLEIO = 0x8000000, /* client understands pool EIO flag */
+};
+
+enum { /* per-op flag bits; ceph_osd_op::flags is annotated as holding CEPH_OSD_OP_FLAG_* */
+ CEPH_OSD_OP_FLAG_EXCL = 0x1, /* EXCL object create */
+ CEPH_OSD_OP_FLAG_FAILOK = 0x2, /* continue despite failure */
+ CEPH_OSD_OP_FLAG_FADVISE_RANDOM = 0x4, /* the op is random */
+ CEPH_OSD_OP_FLAG_FADVISE_SEQUENTIAL = 0x8, /* the op is sequential */
+ CEPH_OSD_OP_FLAG_FADVISE_WILLNEED = 0x10,/* data will be accessed in the near future */
+ CEPH_OSD_OP_FLAG_FADVISE_DONTNEED = 0x20,/* data will not be accessed in the near future */
+ CEPH_OSD_OP_FLAG_FADVISE_NOCACHE = 0x40, /* data will be accessed only once by this client */
+ CEPH_OSD_OP_FLAG_WITH_REFERENCE = 0x80, /* need reference counting */
+ CEPH_OSD_OP_FLAG_BYPASS_CLEAN_CACHE = 0x100, /* bypass ObjectStore cache, mainly for deep-scrub */
+};
+
+#define EOLDSNAPC 85 /* ORDERSNAP flag set; writer has old snapc*/
+#define EBLOCKLISTED 108 /* blocklisted */
+#define EBLACKLISTED 108 /* deprecated */
+
+/* xattr comparison */
+enum {
+ CEPH_OSD_CMPXATTR_OP_EQ = 1,
+ CEPH_OSD_CMPXATTR_OP_NE = 2,
+ CEPH_OSD_CMPXATTR_OP_GT = 3,
+ CEPH_OSD_CMPXATTR_OP_GTE = 4,
+ CEPH_OSD_CMPXATTR_OP_LT = 5,
+ CEPH_OSD_CMPXATTR_OP_LTE = 6
+};
+
+enum {
+ CEPH_OSD_CMPXATTR_MODE_STRING = 1,
+ CEPH_OSD_CMPXATTR_MODE_U64 = 2
+};
+
+enum {
+ CEPH_OSD_COPY_FROM_FLAG_FLUSH = 1, /* part of a flush operation */
+ CEPH_OSD_COPY_FROM_FLAG_IGNORE_OVERLAY = 2, /* ignore pool overlay */
+ CEPH_OSD_COPY_FROM_FLAG_IGNORE_CACHE = 4, /* ignore osd cache logic */
+ CEPH_OSD_COPY_FROM_FLAG_MAP_SNAP_CLONE = 8, /* map snap direct to
+ * cloneid */
+ CEPH_OSD_COPY_FROM_FLAG_RWORDERED = 16, /* order with write */
+ CEPH_OSD_COPY_FROM_FLAG_TRUNCATE_SEQ = 32, /* use provided truncate_{seq,size} (copy-from2 only) */
+};
+
+#define CEPH_OSD_COPY_FROM_FLAGS \
+ (CEPH_OSD_COPY_FROM_FLAG_FLUSH | \
+ CEPH_OSD_COPY_FROM_FLAG_IGNORE_OVERLAY | \
+ CEPH_OSD_COPY_FROM_FLAG_IGNORE_CACHE | \
+ CEPH_OSD_COPY_FROM_FLAG_MAP_SNAP_CLONE | \
+ CEPH_OSD_COPY_FROM_FLAG_RWORDERED | \
+ CEPH_OSD_COPY_FROM_FLAG_TRUNCATE_SEQ)
+
+enum {
+ CEPH_OSD_TMAP2OMAP_NULLOK = 1,
+};
+
+enum {
+ CEPH_OSD_WATCH_OP_UNWATCH = 0,
+ CEPH_OSD_WATCH_OP_LEGACY_WATCH = 1,
+ /* note: use only ODD ids to prevent pre-giant code from
+ interpreting the op as UNWATCH */
+ CEPH_OSD_WATCH_OP_WATCH = 3,
+ CEPH_OSD_WATCH_OP_RECONNECT = 5,
+ CEPH_OSD_WATCH_OP_PING = 7,
+};
+
+enum {
+ CEPH_OSD_CHECKSUM_OP_TYPE_XXHASH32 = 0,
+ CEPH_OSD_CHECKSUM_OP_TYPE_XXHASH64 = 1,
+ CEPH_OSD_CHECKSUM_OP_TYPE_CRC32C = 2
+};
+
+const char *ceph_osd_watch_op_name(int o);
+
+enum {
+ CEPH_OSD_ALLOC_HINT_FLAG_SEQUENTIAL_WRITE = 1,
+ CEPH_OSD_ALLOC_HINT_FLAG_RANDOM_WRITE = 2,
+ CEPH_OSD_ALLOC_HINT_FLAG_SEQUENTIAL_READ = 4,
+ CEPH_OSD_ALLOC_HINT_FLAG_RANDOM_READ = 8,
+ CEPH_OSD_ALLOC_HINT_FLAG_APPEND_ONLY = 16,
+ CEPH_OSD_ALLOC_HINT_FLAG_IMMUTABLE = 32,
+ CEPH_OSD_ALLOC_HINT_FLAG_SHORTLIVED = 64,
+ CEPH_OSD_ALLOC_HINT_FLAG_LONGLIVED = 128,
+ CEPH_OSD_ALLOC_HINT_FLAG_COMPRESSIBLE = 256,
+ CEPH_OSD_ALLOC_HINT_FLAG_INCOMPRESSIBLE = 512,
+};
+
+const char *ceph_osd_alloc_hint_flag_name(int f);
+
+enum {
+ CEPH_OSD_BACKOFF_OP_BLOCK = 1,
+ CEPH_OSD_BACKOFF_OP_ACK_BLOCK = 2,
+ CEPH_OSD_BACKOFF_OP_UNBLOCK = 3,
+};
+
+const char *ceph_osd_backoff_op_name(int op);
+
+/*
+ * An individual object operation; each may be accompanied by some data
+ * payload. Packed layout: 2 + 4 + 28 (union) + 4 = 38 bytes.
+ */
+struct ceph_osd_op {
+ __le16 op; /* CEPH_OSD_OP_* */
+ __le32 flags; /* CEPH_OSD_OP_FLAG_* */
+ union { /* per-op arguments; extent (28 bytes) is the largest member */
+ struct {
+ __le64 offset, length;
+ __le64 truncate_size;
+ __le32 truncate_seq;
+ } __attribute__ ((packed)) extent;
+ struct {
+ __le32 name_len;
+ __le32 value_len;
+ __u8 cmp_op; /* CEPH_OSD_CMPXATTR_OP_* */
+ __u8 cmp_mode; /* CEPH_OSD_CMPXATTR_MODE_* */
+ } __attribute__ ((packed)) xattr;
+ struct {
+ __u8 class_len;
+ __u8 method_len;
+ __u8 argc;
+ __le32 indata_len;
+ } __attribute__ ((packed)) cls;
+ struct {
+ __le64 count;
+ __le32 start_epoch; /* for the pgls sequence */
+ } __attribute__ ((packed)) pgls;
+ struct {
+ __le64 snapid;
+ } __attribute__ ((packed)) snap;
+ struct {
+ __le64 cookie;
+ __le64 ver; /* no longer used */
+ __u8 op; /* CEPH_OSD_WATCH_OP_* */
+ __u32 gen; /* registration generation */
+ __u32 timeout; /* connection timeout */
+ } __attribute__ ((packed)) watch;
+ struct {
+ __le64 cookie;
+ } __attribute__ ((packed)) notify;
+ struct {
+ __le64 unused;
+ __le64 ver;
+ } __attribute__ ((packed)) assert_ver;
+ struct {
+ __le64 offset, length;
+ __le64 src_offset;
+ } __attribute__ ((packed)) clonerange;
+ struct {
+ __le64 max; /* max data in reply */
+ } __attribute__ ((packed)) copy_get;
+ struct {
+ __le64 snapid;
+ __le64 src_version;
+ __u8 flags; /* CEPH_OSD_COPY_FROM_FLAG_* */
+ /*
+ * CEPH_OSD_OP_FLAG_FADVISE_*: fadvise flags
+ * for src object, flags for dest object are in
+ * ceph_osd_op::flags.
+ */
+ __le32 src_fadvise_flags;
+ } __attribute__ ((packed)) copy_from;
+ struct {
+ struct ceph_timespec stamp;
+ } __attribute__ ((packed)) hit_set_get;
+ struct {
+ __u8 flags;
+ } __attribute__ ((packed)) tmap2omap;
+ struct {
+ __le64 expected_object_size;
+ __le64 expected_write_size;
+ __le32 flags; /* CEPH_OSD_ALLOC_HINT_FLAG_* */
+ } __attribute__ ((packed)) alloc_hint;
+ struct {
+ __le64 offset;
+ __le64 length;
+ __le64 data_length;
+ } __attribute__ ((packed)) writesame;
+ struct {
+ __le64 offset;
+ __le64 length;
+ __le32 chunk_size;
+ __u8 type; /* CEPH_OSD_CHECKSUM_OP_TYPE_* */
+ } __attribute__ ((packed)) checksum;
+ } __attribute__ ((packed));
+ __le32 payload_len;
+} __attribute__ ((packed));
+
+/*
+ * Check the compatibility of struct ceph_osd_op
+ * (2+4+(2*8+8+4)+4) = (sizeof(ceph_osd_op::op) +
+ * sizeof(ceph_osd_op::flags) +
+ * sizeof(ceph_osd_op::extent) +
+ * sizeof(ceph_osd_op::payload_len))
+ */
+#ifdef __cplusplus
+static_assert(sizeof(ceph_osd_op) == (2+4+(2*8+8+4)+4),
+ "sizeof(ceph_osd_op) breaks the compatibility");
+#endif
+
+struct ceph_osd_reply_head {
+ __le32 client_inc; /* client incarnation */
+ __le32 flags;
+ struct ceph_object_layout layout;
+ __le32 osdmap_epoch;
+ struct ceph_eversion reassert_version; /* for replaying uncommitted */
+
+ __le32 result; /* result code */
+
+ __le32 object_len; /* length of object name */
+ __le32 num_ops;
+ struct ceph_osd_op ops[0]; /* ops[], object */
+} __attribute__ ((packed));
+
+#ifndef __KERNEL__
+#undef __le16
+#undef __le32
+#undef __le64
+#endif
+
+#endif
diff --git a/src/include/rados/buffer.h b/src/include/rados/buffer.h
new file mode 120000
index 000000000..51fc03be1
--- /dev/null
+++ b/src/include/rados/buffer.h
@@ -0,0 +1 @@
+../buffer.h \ No newline at end of file
diff --git a/src/include/rados/buffer_fwd.h b/src/include/rados/buffer_fwd.h
new file mode 120000
index 000000000..bd1f6f1b0
--- /dev/null
+++ b/src/include/rados/buffer_fwd.h
@@ -0,0 +1 @@
+../buffer_fwd.h \ No newline at end of file
diff --git a/src/include/rados/crc32c.h b/src/include/rados/crc32c.h
new file mode 120000
index 000000000..19ef4317e
--- /dev/null
+++ b/src/include/rados/crc32c.h
@@ -0,0 +1 @@
+../crc32c.h \ No newline at end of file
diff --git a/src/include/rados/inline_memory.h b/src/include/rados/inline_memory.h
new file mode 120000
index 000000000..48f0d4436
--- /dev/null
+++ b/src/include/rados/inline_memory.h
@@ -0,0 +1 @@
+../inline_memory.h \ No newline at end of file
diff --git a/src/include/rados/librados.h b/src/include/rados/librados.h
new file mode 100644
index 000000000..858804c3a
--- /dev/null
+++ b/src/include/rados/librados.h
@@ -0,0 +1,4156 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2012 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#ifndef CEPH_LIBRADOS_H
+#define CEPH_LIBRADOS_H
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include <netinet/in.h>
+#if defined(__linux__)
+#include <linux/types.h>
+#elif defined(__FreeBSD__)
+#include <sys/types.h>
+#endif
+#include <unistd.h>
+#include <string.h>
+#include "rados_types.h"
+
+#include <sys/time.h>
+
+#ifndef CEPH_OSD_TMAP_SET
+/* These are also defined in rados.h and objclass.h. Keep them in sync! */
+#define CEPH_OSD_TMAP_HDR 'h'
+#define CEPH_OSD_TMAP_SET 's'
+#define CEPH_OSD_TMAP_CREATE 'c'
+#define CEPH_OSD_TMAP_RM 'r'
+#endif
+
+#define LIBRADOS_VER_MAJOR 3
+#define LIBRADOS_VER_MINOR 0
+#define LIBRADOS_VER_EXTRA 0
+
+/*
+ * Pack a (major, minor, extra) triple into one integer version code:
+ * major is shifted left by 16 bits, minor by 8 bits, and extra is added
+ * unshifted. Every argument is parenthesized so that compound
+ * expressions (e.g. "a | b") expand with the intended precedence.
+ */
+#define LIBRADOS_VERSION(maj, min, extra) (((maj) << 16) + ((min) << 8) + (extra))
+
+#define LIBRADOS_VERSION_CODE LIBRADOS_VERSION(LIBRADOS_VER_MAJOR, LIBRADOS_VER_MINOR, LIBRADOS_VER_EXTRA)
+
+#define LIBRADOS_SUPPORTS_WATCH 1
+#define LIBRADOS_SUPPORTS_SERVICES 1
+#define LIBRADOS_SUPPORTS_GETADDRS 1
+#define LIBRADOS_SUPPORTS_APP_METADATA 1
+
+/* RADOS lock flags
+ * They are also defined in cls_lock_types.h. Keep them in sync!
+ */
+#define LIBRADOS_LOCK_FLAG_RENEW (1u<<0)
+#define LIBRADOS_LOCK_FLAG_MAY_RENEW LIBRADOS_LOCK_FLAG_RENEW
+#define LIBRADOS_LOCK_FLAG_MUST_RENEW (1u<<1)
+
+/*
+ * Constants for rados_write_op_create().
+ */
+#define LIBRADOS_CREATE_EXCLUSIVE 1
+#define LIBRADOS_CREATE_IDEMPOTENT 0
+
+/*
+ * Flags that can be set on a per-op basis via
+ * rados_read_op_set_flags() and rados_write_op_set_flags().
+ */
+enum {
+ // fail a create operation if the object already exists
+ LIBRADOS_OP_FLAG_EXCL = 0x1,
+ // allow the transaction to succeed even if the flagged op fails
+ LIBRADOS_OP_FLAG_FAILOK = 0x2,
+ // indicate read/write op random
+ LIBRADOS_OP_FLAG_FADVISE_RANDOM = 0x4,
+ // indicate read/write op sequential
+ LIBRADOS_OP_FLAG_FADVISE_SEQUENTIAL = 0x8,
+ // indicate read/write data will be accessed in the near future (by someone)
+ LIBRADOS_OP_FLAG_FADVISE_WILLNEED = 0x10,
+ // indicate read/write data will not be accessed in the near future (by anyone)
+ LIBRADOS_OP_FLAG_FADVISE_DONTNEED = 0x20,
+ // indicate read/write data will not be accessed again (by *this* client)
+ LIBRADOS_OP_FLAG_FADVISE_NOCACHE = 0x40,
+ // optionally support FUA (force unit access) on write requests
+ LIBRADOS_OP_FLAG_FADVISE_FUA = 0x80,
+};
+
+#define CEPH_RADOS_API
+
+/**
+ * @name xattr comparison operations
+ * Operators for comparing xattrs on objects, and aborting the
+ * rados_read_op or rados_write_op transaction if the comparison
+ * fails.
+ *
+ * @{
+ */
+enum {
+ LIBRADOS_CMPXATTR_OP_EQ = 1,
+ LIBRADOS_CMPXATTR_OP_NE = 2,
+ LIBRADOS_CMPXATTR_OP_GT = 3,
+ LIBRADOS_CMPXATTR_OP_GTE = 4,
+ LIBRADOS_CMPXATTR_OP_LT = 5,
+ LIBRADOS_CMPXATTR_OP_LTE = 6
+};
+/** @} */
+
+/**
+ * @name Operation Flags
+ * Flags for rados_read_op_operate(), rados_write_op_operate(),
+ * rados_aio_read_op_operate(), and rados_aio_write_op_operate().
+ * See librados.hpp for details.
+ * @{
+ */
+enum {
+ LIBRADOS_OPERATION_NOFLAG = 0,
+ LIBRADOS_OPERATION_BALANCE_READS = 1,
+ LIBRADOS_OPERATION_LOCALIZE_READS = 2,
+ LIBRADOS_OPERATION_ORDER_READS_WRITES = 4,
+ LIBRADOS_OPERATION_IGNORE_CACHE = 8,
+ LIBRADOS_OPERATION_SKIPRWLOCKS = 16,
+ LIBRADOS_OPERATION_IGNORE_OVERLAY = 32,
+ /* send requests to cluster despite the cluster or pool being marked
+ full; ops will either succeed (e.g., delete) or return EDQUOT or
+ ENOSPC. */
+ LIBRADOS_OPERATION_FULL_TRY = 64,
+ /*
+ * Mainly for delete op
+ */
+ LIBRADOS_OPERATION_FULL_FORCE = 128,
+ LIBRADOS_OPERATION_IGNORE_REDIRECT = 256,
+ LIBRADOS_OPERATION_ORDERSNAP = 512,
+ /* enable/allow >0 return values and payloads on write/update */
+ LIBRADOS_OPERATION_RETURNVEC = 1024,
+};
+/** @} */
+
+/**
+ * @name Alloc hint flags
+ * Flags for rados_write_op_alloc_hint2() and rados_set_alloc_hint2()
+ * indicating future IO patterns.
+ * @{
+ */
+enum {
+ LIBRADOS_ALLOC_HINT_FLAG_SEQUENTIAL_WRITE = 1,
+ LIBRADOS_ALLOC_HINT_FLAG_RANDOM_WRITE = 2,
+ LIBRADOS_ALLOC_HINT_FLAG_SEQUENTIAL_READ = 4,
+ LIBRADOS_ALLOC_HINT_FLAG_RANDOM_READ = 8,
+ LIBRADOS_ALLOC_HINT_FLAG_APPEND_ONLY = 16,
+ LIBRADOS_ALLOC_HINT_FLAG_IMMUTABLE = 32,
+ LIBRADOS_ALLOC_HINT_FLAG_SHORTLIVED = 64,
+ LIBRADOS_ALLOC_HINT_FLAG_LONGLIVED = 128,
+ LIBRADOS_ALLOC_HINT_FLAG_COMPRESSIBLE = 256,
+ LIBRADOS_ALLOC_HINT_FLAG_INCOMPRESSIBLE = 512,
+};
+/** @} */
+
+/**
+ * @typedef rados_checksum_type_t
+ *
+ * Selects a checksum algorithm; going by the enumerator names these are
+ * xxhash32, xxhash64 and crc32c.
+ * NOTE(review): presumably the type argument of the checksum read op
+ * (rados_read_op_checksum()) -- confirm against its declaration.
+ */
+typedef enum {
+ LIBRADOS_CHECKSUM_TYPE_XXHASH32 = 0,
+ LIBRADOS_CHECKSUM_TYPE_XXHASH64 = 1,
+ LIBRADOS_CHECKSUM_TYPE_CRC32C = 2
+} rados_checksum_type_t;
+
+/*
+ * snap id constants
+ */
+#define LIBRADOS_SNAP_HEAD UINT64_C(-2)
+#define LIBRADOS_SNAP_DIR UINT64_C(-1)
+
+/**
+ * @typedef rados_t
+ *
+ * A handle for interacting with a RADOS cluster. It encapsulates all
+ * RADOS client configuration, including username, key for
+ * authentication, logging, and debugging. Talking to different clusters
+ * -- or to the same cluster with different users -- requires
+ * different cluster handles.
+ */
+#ifndef VOIDPTR_RADOS_T
+#define VOIDPTR_RADOS_T
+typedef void *rados_t;
+#endif //VOIDPTR_RADOS_T
+
+/**
+ * @typedef rados_config_t
+ *
+ * A handle for the ceph configuration context for the rados_t cluster
+ * instance. This can be used to share configuration context/state
+ * (e.g., logging configuration) between librados instances.
+ *
+ * @warning The config context does not have independent reference
+ * counting. As such, a rados_config_t handle retrieved from a given
+ * rados_t is only valid as long as that rados_t is valid.
+ */
+typedef void *rados_config_t;
+
+/**
+ * @typedef rados_ioctx_t
+ *
+ * An io context encapsulates a few settings for all I/O operations
+ * done on it:
+ * - pool - set when the io context is created (see rados_ioctx_create())
+ * - snapshot context for writes (see
+ * rados_ioctx_selfmanaged_snap_set_write_ctx())
+ * - snapshot id to read from (see rados_ioctx_snap_set_read())
+ * - object locator for all single-object operations (see
+ * rados_ioctx_locator_set_key())
+ * - namespace for all single-object operations (see
+ * rados_ioctx_set_namespace()). Set to LIBRADOS_ALL_NSPACES
+ * before rados_nobjects_list_open() will list all objects in all
+ * namespaces.
+ *
+ * @warning Changing any of these settings is not thread-safe -
+ * librados users must synchronize any of these changes on their own,
+ * or use separate io contexts for each thread
+ */
+typedef void *rados_ioctx_t;
+
+/**
+ * @typedef rados_list_ctx_t
+ *
+ * An iterator for listing the objects in a pool.
+ * Used with rados_nobjects_list_open(),
+ * rados_nobjects_list_next(), rados_nobjects_list_next2(), and
+ * rados_nobjects_list_close().
+ */
+typedef void *rados_list_ctx_t;
+
+/**
+ * @typedef rados_object_list_cursor
+ *
+ * The cursor used with rados_enumerate_objects
+ * and accompanying methods.
+ */
+typedef void * rados_object_list_cursor;
+
+/**
+ * @struct rados_object_list_item
+ *
+ * The item populated by rados_object_list in
+ * the results array.
+ */
+typedef struct {
+
+ /// oid length
+ size_t oid_length;
+ /// name of the object
+ char *oid;
+ /// namespace length
+ size_t nspace_length;
+ /// the object namespace
+ char *nspace;
+ /// locator length
+ size_t locator_length;
+ /// object locator
+ char *locator;
+} rados_object_list_item;
+
+/**
+ * @typedef rados_snap_t
+ * The id of a snapshot.
+ */
+typedef uint64_t rados_snap_t;
+
+/**
+ * @typedef rados_xattrs_iter_t
+ * An iterator for listing extended attributes on an object.
+ * Used with rados_getxattrs(), rados_getxattrs_next(), and
+ * rados_getxattrs_end().
+ */
+typedef void *rados_xattrs_iter_t;
+
+/**
+ * @typedef rados_omap_iter_t
+ * An iterator for listing omap key/value pairs on an object.
+ * Used with rados_read_op_omap_get_keys(), rados_read_op_omap_get_vals(),
+ * rados_read_op_omap_get_vals_by_keys(), rados_omap_get_next(), and
+ * rados_omap_get_end().
+ */
+typedef void *rados_omap_iter_t;
+
+/**
+ * @struct rados_pool_stat_t
+ * Usage information for a pool.
+ */
+struct rados_pool_stat_t {
+ /// space used in bytes
+ uint64_t num_bytes;
+ /// space used in KB
+ uint64_t num_kb;
+ /// number of objects in the pool
+ uint64_t num_objects;
+ /// number of clones of objects
+ uint64_t num_object_clones;
+ /// num_objects * num_replicas
+ uint64_t num_object_copies;
+ /// number of objects missing on primary
+ uint64_t num_objects_missing_on_primary;
+ /// number of objects found on no OSDs
+ uint64_t num_objects_unfound;
+ /// number of objects replicated fewer times than they should be
+ /// (but found on at least one OSD)
+ uint64_t num_objects_degraded;
+ /// number of objects read
+ uint64_t num_rd;
+ /// objects read in KB
+ uint64_t num_rd_kb;
+ /// number of objects written
+ uint64_t num_wr;
+ /// objects written in KB
+ uint64_t num_wr_kb;
+ /// bytes originally provided by user
+ uint64_t num_user_bytes;
+ /// bytes passed compression
+ uint64_t compressed_bytes_orig;
+ /// bytes resulted after compression
+ uint64_t compressed_bytes;
+ /// bytes allocated at storage
+ uint64_t compressed_bytes_alloc;
+};
+
+/**
+ * @struct rados_cluster_stat_t
+ * Cluster-wide usage information
+ */
+struct rados_cluster_stat_t {
+ /// total device size
+ uint64_t kb;
+ /// total used
+ uint64_t kb_used;
+ /// total available/free
+ uint64_t kb_avail;
+ /// number of objects
+ uint64_t num_objects;
+};
+
+/**
+ * @typedef rados_write_op_t
+ *
+ * An object write operation stores a number of operations which can be
+ * executed atomically. For usage, see:
+ * - Creation and deletion: rados_create_write_op() rados_release_write_op()
+ * - Extended attribute manipulation:
+ * rados_write_op_cmpxattr(), rados_write_op_setxattr(),
+ * rados_write_op_rmxattr()
+ * - Object map key/value pairs: rados_write_op_omap_set(),
+ * rados_write_op_omap_rm_keys(), rados_write_op_omap_clear(),
+ * rados_write_op_omap_cmp()
+ * - Object properties: rados_write_op_assert_exists(),
+ * rados_write_op_assert_version()
+ * - Creating objects: rados_write_op_create()
+ * - IO on objects: rados_write_op_append(), rados_write_op_write(),
+ * rados_write_op_write_full(), rados_write_op_writesame(), rados_write_op_remove(),
+ * rados_write_op_truncate(), rados_write_op_zero(), rados_write_op_cmpext()
+ * - Hints: rados_write_op_set_alloc_hint()
+ * - Performing the operation: rados_write_op_operate(), rados_aio_write_op_operate()
+ */
+typedef void *rados_write_op_t;
+
+/**
+ * @typedef rados_read_op_t
+ *
+ * An object read operation stores a number of operations which can be
+ * executed atomically. For usage, see:
+ * - Creation and deletion: rados_create_read_op() rados_release_read_op()
+ * - Extended attribute manipulation: rados_read_op_cmpxattr(),
+ * rados_read_op_getxattr(), rados_read_op_getxattrs()
+ * - Object map key/value pairs: rados_read_op_omap_get_vals(),
+ * rados_read_op_omap_get_keys(), rados_read_op_omap_get_vals_by_keys(),
+ * rados_read_op_omap_cmp()
+ * - Object properties: rados_read_op_stat(), rados_read_op_assert_exists(),
+ * rados_read_op_assert_version()
+ * - IO on objects: rados_read_op_read(), rados_read_op_checksum(),
+ * rados_read_op_cmpext()
+ * - Custom operations: rados_read_op_exec(), rados_read_op_exec_user_buf()
+ * - Request properties: rados_read_op_set_flags()
+ * - Performing the operation: rados_read_op_operate(),
+ * rados_aio_read_op_operate()
+ */
+typedef void *rados_read_op_t;
+
+/**
+ * @typedef rados_completion_t
+ * Represents the state of an asynchronous operation - it contains the
+ * return value once the operation completes, and can be used to block
+ * until the operation is complete or safe.
+ */
+typedef void *rados_completion_t;
+
+/**
+ * @struct blkin_trace_info
+ * blkin trace information for Zipkin tracing
+ */
+struct blkin_trace_info;
+
+/**
+ * Get the version of librados.
+ *
+ * The version number is major.minor.extra. Note that this is
+ * unrelated to the Ceph version number.
+ *
+ * TODO: define version semantics, i.e.:
+ * - incrementing major is for backwards-incompatible changes
+ * - incrementing minor is for backwards-compatible changes
+ * - incrementing extra is for bug fixes
+ *
+ * @param major where to store the major version number
+ * @param minor where to store the minor version number
+ * @param extra where to store the extra version number
+ */
+CEPH_RADOS_API void rados_version(int *major, int *minor, int *extra);
+
+/**
+ * @name Setup and Teardown
+ * These are the first and last functions that should be called
+ * when using librados.
+ *
+ * @{
+ */
+
+/**
+ * Create a handle for communicating with a RADOS cluster.
+ *
+ * Ceph environment variables are read when this is called, so if
+ * $CEPH_ARGS specifies everything you need to connect, no further
+ * configuration is necessary.
+ *
+ * @param cluster where to store the handle
+ * @param id the user to connect as (i.e. admin, not client.admin)
+ * @returns 0 on success, negative error code on failure
+ */
+CEPH_RADOS_API int rados_create(rados_t *cluster, const char * const id);
+
+/**
+ * Extended version of rados_create.
+ *
+ * Like rados_create, but
+ * 1) don't assume 'client\.'+id; allow full specification of name
+ * 2) allow specification of cluster name
+ * 3) flags for future expansion
+ */
+CEPH_RADOS_API int rados_create2(rados_t *pcluster,
+ const char *const clustername,
+ const char * const name, uint64_t flags);
+
+/**
+ * Initialize a cluster handle from an existing configuration.
+ *
+ * Share configuration state with another rados_t instance.
+ *
+ * @param cluster where to store the handle
+ * @param cct the existing configuration to use
+ * @returns 0 on success, negative error code on failure
+ */
+CEPH_RADOS_API int rados_create_with_context(rados_t *cluster,
+ rados_config_t cct);
+
+/**
+ * Ping the monitor with ID mon_id, storing the resulting reply in
+ * outstr (if specified) and its length in outstrlen.
+ *
+ * The result buffer is allocated on the heap; the caller is
+ * expected to release that memory with rados_buffer_free(). The
+ * buffer and length pointers can be NULL, in which case they are
+ * not filled in.
+ *
+ * @param cluster cluster handle
+ * @param mon_id [in] ID of the monitor to ping
+ * @param outstr [out] double pointer with the resulting reply
+ * @param outstrlen [out] pointer with the size of the reply in outstr
+ */
+CEPH_RADOS_API int rados_ping_monitor(rados_t cluster, const char *mon_id,
+ char **outstr, size_t *outstrlen);
+
+/**
+ * Connect to the cluster.
+ *
+ * @note BUG: Before calling this, calling a function that communicates with the
+ * cluster will crash.
+ *
+ * @pre The cluster handle is configured with at least a monitor
+ * address. If cephx is enabled, a client name and secret must also be
+ * set.
+ *
+ * @post If this succeeds, any function in librados may be used
+ *
+ * @param cluster The cluster to connect to.
+ * @returns 0 on success, negative error code on failure
+ */
+CEPH_RADOS_API int rados_connect(rados_t cluster);
+
+/**
+ * Disconnects from the cluster.
+ *
+ * For clean up, this is only necessary after rados_connect() has
+ * succeeded.
+ *
+ * @warning This does not guarantee any asynchronous writes have
+ * completed. To do that, you must call rados_aio_flush() on all open
+ * io contexts.
+ *
+ * @warning We implicitly call rados_watch_flush() on shutdown. If
+ * there are watches being used, this should be done explicitly before
+ * destroying the relevant IoCtx. We do it here as a safety measure.
+ *
+ * @post the cluster handle cannot be used again
+ *
+ * @param cluster the cluster to shutdown
+ */
+CEPH_RADOS_API void rados_shutdown(rados_t cluster);
+
+/** @} init */
+
+/**
+ * @name Configuration
+ * These functions read and update Ceph configuration for a cluster
+ * handle. Any configuration changes must be done before connecting to
+ * the cluster.
+ *
+ * Options that librados users might want to set include:
+ * - mon_host
+ * - auth_supported
+ * - key, keyfile, or keyring when using cephx
+ * - log_file, log_to_stderr, err_to_stderr, and log_to_syslog
+ * - debug_rados, debug_objecter, debug_monc, debug_auth, or debug_ms
+ *
+ * See docs.ceph.com for information about available configuration options.
+ *
+ * @{
+ */
+
+/**
+ * Configure the cluster handle using a Ceph config file
+ *
+ * If path is NULL, the default locations are searched, and the first
+ * found is used. The locations are:
+ * - $CEPH_CONF (environment variable)
+ * - /etc/ceph/ceph.conf
+ * - ~/.ceph/config
+ * - ceph.conf (in the current working directory)
+ *
+ * @pre rados_connect() has not been called on the cluster handle
+ *
+ * @param cluster cluster handle to configure
+ * @param path path to a Ceph configuration file
+ * @returns 0 on success, negative error code on failure
+ */
+CEPH_RADOS_API int rados_conf_read_file(rados_t cluster, const char *path);
+
+/**
+ * Configure the cluster handle with command line arguments
+ *
+ * argv can contain any common Ceph command line option, including any
+ * configuration parameter prefixed by '--' and replacing spaces with
+ * dashes or underscores. For example, the following options are equivalent:
+ * - --mon-host 10.0.0.1:6789
+ * - --mon_host 10.0.0.1:6789
+ * - -m 10.0.0.1:6789
+ *
+ * @pre rados_connect() has not been called on the cluster handle
+ *
+ * @param cluster cluster handle to configure
+ * @param argc number of arguments in argv
+ * @param argv arguments to parse
+ * @returns 0 on success, negative error code on failure
+ */
+CEPH_RADOS_API int rados_conf_parse_argv(rados_t cluster, int argc,
+ const char **argv);
+
+
+/**
+ * Configure the cluster handle with command line arguments, returning
+ * any remainders. Same as rados_conf_parse_argv(), except for the extra
+ * remargv argument, which holds the returned unrecognized arguments.
+ *
+ * @pre rados_connect() has not been called on the cluster handle
+ *
+ * @param cluster cluster handle to configure
+ * @param argc number of arguments in argv
+ * @param argv arguments to parse
+ * @param remargv char* array for returned unrecognized arguments
+ * @returns 0 on success, negative error code on failure
+ */
+CEPH_RADOS_API int rados_conf_parse_argv_remainder(rados_t cluster, int argc,
+ const char **argv,
+ const char **remargv);
+/**
+ * Configure the cluster handle based on an environment variable
+ *
+ * The contents of the environment variable are parsed as if they were
+ * Ceph command line options. If var is NULL, the CEPH_ARGS
+ * environment variable is used.
+ *
+ * @pre rados_connect() has not been called on the cluster handle
+ *
+ * @note BUG: this is not threadsafe - it uses a static buffer
+ *
+ * @param cluster cluster handle to configure
+ * @param var name of the environment variable to read
+ * @returns 0 on success, negative error code on failure
+ */
+CEPH_RADOS_API int rados_conf_parse_env(rados_t cluster, const char *var);
+
+/**
+ * Set a configuration option
+ *
+ * @pre rados_connect() has not been called on the cluster handle
+ *
+ * @param cluster cluster handle to configure
+ * @param option option to set
+ * @param value value of the option
+ * @returns 0 on success, negative error code on failure
+ * @returns -ENOENT when the option is not a Ceph configuration option
+ */
+CEPH_RADOS_API int rados_conf_set(rados_t cluster, const char *option,
+ const char *value);
+
+/**
+ * Get the value of a configuration option
+ *
+ * @param cluster configuration to read
+ * @param option which option to read
+ * @param buf where to write the configuration value
+ * @param len the size of buf in bytes
+ * @returns 0 on success, negative error code on failure
+ * @returns -ENAMETOOLONG if the buffer is too short to contain the
+ * requested value
+ */
+CEPH_RADOS_API int rados_conf_get(rados_t cluster, const char *option,
+ char *buf, size_t len);
+
+/** @} config */
+
+/**
+ * Read usage info about the cluster
+ *
+ * This tells you total space, space used, space available, and number
+ * of objects. These are not updated immediately when data is written,
+ * they are eventually consistent.
+ *
+ * @param cluster cluster to query
+ * @param result where to store the results
+ * @returns 0 on success, negative error code on failure
+ */
+CEPH_RADOS_API int rados_cluster_stat(rados_t cluster,
+ struct rados_cluster_stat_t *result);
+
+/**
+ * Get the fsid of the cluster as a hexadecimal string.
+ *
+ * The fsid is a unique id of an entire Ceph cluster.
+ *
+ * @param cluster where to get the fsid
+ * @param buf where to write the fsid
+ * @param len the size of buf in bytes (should be 37)
+ * @returns 0 on success, negative error code on failure
+ * @returns -ERANGE if the buffer is too short to contain the
+ * fsid
+ */
+CEPH_RADOS_API int rados_cluster_fsid(rados_t cluster, char *buf, size_t len);
+
+/**
+ * Get/wait for the most recent osdmap
+ *
+ * @param cluster the cluster to get the latest osdmap for
+ * @returns 0 on success, negative error code on failure
+ */
+CEPH_RADOS_API int rados_wait_for_latest_osdmap(rados_t cluster);
+
+/**
+ * @name Pools
+ *
+ * RADOS pools are separate namespaces for objects. Pools may have
+ * different crush rules associated with them, so they could have
+ * differing replication levels or placement strategies. RADOS
+ * permissions are also tied to pools - users can have different read,
+ * write, and execute permissions on a per-pool basis.
+ *
+ * @{
+ */
+
+/**
+ * List pools
+ *
+ * Gets a list of pool names as NULL-terminated strings. The pool
+ * names will be placed in the supplied buffer one after another.
+ * After the last pool name, there will be two 0 bytes in a row.
+ *
+ * If len is too short to fit all the pool name entries we need, we will fill
+ * as much as we can.
+ *
+ * Buf may be null to determine the buffer size needed to list all pools.
+ *
+ * @param cluster cluster handle
+ * @param buf output buffer
+ * @param len output buffer length
+ * @returns length of the buffer we would need to list all pools
+ */
+CEPH_RADOS_API int rados_pool_list(rados_t cluster, char *buf, size_t len);
+
+/**
+ * List inconsistent placement groups of the given pool
+ *
+ * Gets a list of inconsistent placement groups as NULL-terminated strings.
+ * The placement group names will be placed in the supplied buffer one after
+ * another. After the last name, there will be two 0 bytes in a row.
+ *
+ * If len is too short to fit all the placement group entries we need, we will
+ * fill as much as we can.
+ *
+ * @param cluster cluster handle
+ * @param pool pool ID
+ * @param buf output buffer
+ * @param len output buffer length
+ * @returns length of the buffer we would need to list all inconsistent
+ * placement groups
+ */
+CEPH_RADOS_API int rados_inconsistent_pg_list(rados_t cluster, int64_t pool,
+ char *buf, size_t len);
+
+/**
+ * Get a configuration handle for a rados cluster handle
+ *
+ * This handle is valid only as long as the cluster handle is valid.
+ *
+ * @param cluster cluster handle
+ * @returns config handle for this cluster
+ */
+CEPH_RADOS_API rados_config_t rados_cct(rados_t cluster);
+
+/**
+ * Get a global id for current instance
+ *
+ * This id is a unique representation of current connection to the cluster
+ *
+ * @param cluster cluster handle
+ * @returns instance global id
+ */
+CEPH_RADOS_API uint64_t rados_get_instance_id(rados_t cluster);
+
+/**
+ * Gets the minimum compatible OSD version
+ *
+ * @param cluster cluster handle
+ * @param require_osd_release [out] minimum compatible OSD version
+ * based upon the current features
+ * @returns 0 on success, negative error code on failure
+ */
+CEPH_RADOS_API int rados_get_min_compatible_osd(rados_t cluster,
+ int8_t* require_osd_release);
+
+/**
+ * Gets the minimum compatible client version
+ *
+ * @param cluster cluster handle
+ * @param min_compat_client [out] minimum compatible client version
+ * based upon the current features
+ * @param require_min_compat_client [out] required minimum client version
+ * based upon explicit setting
+ * @returns 0 on success, negative error code on failure
+ */
+CEPH_RADOS_API int rados_get_min_compatible_client(rados_t cluster,
+ int8_t* min_compat_client,
+ int8_t* require_min_compat_client);
+
+/**
+ * Create an io context
+ *
+ * The io context allows you to perform operations within a particular
+ * pool. For more details see rados_ioctx_t.
+ *
+ * @param cluster which cluster the pool is in
+ * @param pool_name name of the pool
+ * @param ioctx where to store the io context
+ * @returns 0 on success, negative error code on failure
+ */
+CEPH_RADOS_API int rados_ioctx_create(rados_t cluster, const char *pool_name,
+ rados_ioctx_t *ioctx);
+CEPH_RADOS_API int rados_ioctx_create2(rados_t cluster, int64_t pool_id,
+ rados_ioctx_t *ioctx);
+
+/**
+ * The opposite of rados_ioctx_create
+ *
+ * This just tells librados that you no longer need to use the io context.
+ * It may not be freed immediately if there are pending asynchronous
+ * requests on it, but you should not use an io context again after
+ * calling this function on it.
+ *
+ * @warning This does not guarantee any asynchronous
+ * writes have completed. You must call rados_aio_flush()
+ * on the io context before destroying it to do that.
+ *
+ * @warning If this ioctx is used by rados_watch, the caller needs to
+ * be sure that all registered watches are disconnected via
+ * rados_unwatch() and that rados_watch_flush() is called. This
+ * ensures that a racing watch callback does not make use of a
+ * destroyed ioctx.
+ *
+ * @param io the io context to dispose of
+ */
+CEPH_RADOS_API void rados_ioctx_destroy(rados_ioctx_t io);
+
+/**
+ * Get configuration handle for a pool handle
+ *
+ * @param io pool handle
+ * @returns rados_config_t for this cluster
+ */
+CEPH_RADOS_API rados_config_t rados_ioctx_cct(rados_ioctx_t io);
+
+/**
+ * Get the cluster handle used by this rados_ioctx_t
+ * Note that this is a weak reference, and should not
+ * be destroyed via rados_shutdown().
+ *
+ * @param io the io context
+ * @returns the cluster handle for this io context
+ */
+CEPH_RADOS_API rados_t rados_ioctx_get_cluster(rados_ioctx_t io);
+
+/**
+ * Get pool usage statistics
+ *
+ * Fills in a rados_pool_stat_t after querying the cluster.
+ *
+ * @param io determines which pool to query
+ * @param stats where to store the results
+ * @returns 0 on success, negative error code on failure
+ */
+CEPH_RADOS_API int rados_ioctx_pool_stat(rados_ioctx_t io,
+ struct rados_pool_stat_t *stats);
+
+/**
+ * Get the id of a pool
+ *
+ * @param cluster which cluster the pool is in
+ * @param pool_name which pool to look up
+ * @returns id of the pool
+ * @returns -ENOENT if the pool is not found
+ */
+CEPH_RADOS_API int64_t rados_pool_lookup(rados_t cluster,
+ const char *pool_name);
+
+/**
+ * Get the name of a pool
+ *
+ * @param cluster which cluster the pool is in
+ * @param id the id of the pool
+ * @param buf where to store the pool name
+ * @param maxlen size of buffer where name will be stored
+ * @returns length of string stored, or -ERANGE if buffer too small
+ */
+CEPH_RADOS_API int rados_pool_reverse_lookup(rados_t cluster, int64_t id,
+ char *buf, size_t maxlen);
+
+/**
+ * Create a pool with default settings
+ *
+ * The default crush rule is rule 0.
+ *
+ * @param cluster the cluster in which the pool will be created
+ * @param pool_name the name of the new pool
+ * @returns 0 on success, negative error code on failure
+ */
+CEPH_RADOS_API int rados_pool_create(rados_t cluster, const char *pool_name);
+
+/**
+ * Create a pool owned by a specific auid.
+ *
+ * DEPRECATED: auid support has been removed, and this call will be removed in a future
+ * release.
+ *
+ * @param cluster the cluster in which the pool will be created
+ * @param pool_name the name of the new pool
+ * @param auid the id of the owner of the new pool
+ * @returns 0 on success, negative error code on failure
+ */
+CEPH_RADOS_API int rados_pool_create_with_auid(rados_t cluster,
+ const char *pool_name,
+ uint64_t auid)
+ __attribute__((deprecated));
+
+/**
+ * Create a pool with a specific CRUSH rule
+ *
+ * @param cluster the cluster in which the pool will be created
+ * @param pool_name the name of the new pool
+ * @param crush_rule_num which rule to use for placement in the new pool
+ * @returns 0 on success, negative error code on failure
+ */
+CEPH_RADOS_API int rados_pool_create_with_crush_rule(rados_t cluster,
+ const char *pool_name,
+ uint8_t crush_rule_num);
+
+/**
+ * Create a pool with a specific CRUSH rule and auid
+ *
+ * DEPRECATED: auid support has been removed and this call will be removed
+ * in a future release.
+ *
+ * This is a combination of rados_pool_create_with_crush_rule() and
+ * rados_pool_create_with_auid().
+ *
+ * @param cluster the cluster in which the pool will be created
+ * @param pool_name the name of the new pool
+ * @param crush_rule_num which rule to use for placement in the new pool
+ * @param auid the id of the owner of the new pool
+ * @returns 0 on success, negative error code on failure
+ */
+CEPH_RADOS_API int rados_pool_create_with_all(rados_t cluster,
+ const char *pool_name,
+ uint64_t auid,
+ uint8_t crush_rule_num)
+ __attribute__((deprecated));
+
+/**
+ * Returns the pool that is the base tier for this pool.
+ *
+ * The return value is the ID of the pool that should be used to read from/write to.
+ * If tiering is not set up for the pool, returns \c pool.
+ *
+ * @param cluster the cluster the pool is in
+ * @param pool ID of the pool to query
+ * @param base_tier [out] base tier, or \c pool if tiering is not configured
+ * @returns 0 on success, negative error code on failure
+ */
+CEPH_RADOS_API int rados_pool_get_base_tier(rados_t cluster, int64_t pool,
+ int64_t* base_tier);
+
+/**
+ * Delete a pool and all data inside it
+ *
+ * The pool is removed from the cluster immediately,
+ * but the actual data is deleted in the background.
+ *
+ * @param cluster the cluster the pool is in
+ * @param pool_name which pool to delete
+ * @returns 0 on success, negative error code on failure
+ */
+CEPH_RADOS_API int rados_pool_delete(rados_t cluster, const char *pool_name);
+
+/**
+ * Attempt to change an io context's associated auid "owner"
+ *
+ * DEPRECATED: auid support has been removed and this call has no effect.
+ *
+ * Requires that you have write permission on both the current and new
+ * auid.
+ *
+ * @param io reference to the pool to change.
+ * @param auid the auid you wish the io to have.
+ * @returns 0 on success, negative error code on failure
+ */
+CEPH_RADOS_API int rados_ioctx_pool_set_auid(rados_ioctx_t io, uint64_t auid)
+ __attribute__((deprecated));
+
+
+/**
+ * Get the auid of a pool
+ *
+ * DEPRECATED: auid support has been removed and this call always reports
+ * CEPH_AUTH_UID_DEFAULT (-1).
+ *
+ * @param io pool to query
+ * @param auid where to store the auid
+ * @returns 0 on success, negative error code on failure
+ */
+CEPH_RADOS_API int rados_ioctx_pool_get_auid(rados_ioctx_t io, uint64_t *auid)
+ __attribute__((deprecated));
+
+/* deprecated, use rados_ioctx_pool_requires_alignment2 instead */
+CEPH_RADOS_API int rados_ioctx_pool_requires_alignment(rados_ioctx_t io)
+ __attribute__((deprecated));
+
+/**
+ * Test whether the specified pool requires alignment or not.
+ *
+ * @param io pool to query
+ * @param req 1 if alignment is required, 0 if not.
+ * @returns 0 on success, negative error code on failure
+ */
+CEPH_RADOS_API int rados_ioctx_pool_requires_alignment2(rados_ioctx_t io,
+ int *req);
+
+/* deprecated, use rados_ioctx_pool_required_alignment2 instead */
+CEPH_RADOS_API uint64_t rados_ioctx_pool_required_alignment(rados_ioctx_t io)
+ __attribute__((deprecated));
+
+/**
+ * Get the alignment flavor of a pool
+ *
+ * @param io pool to query
+ * @param alignment where to store the alignment flavor
+ * @returns 0 on success, negative error code on failure
+ */
+CEPH_RADOS_API int rados_ioctx_pool_required_alignment2(rados_ioctx_t io,
+ uint64_t *alignment);
+
+/**
+ * Get the pool id of the io context
+ *
+ * @param io the io context to query
+ * @returns the id of the pool the io context uses
+ */
+CEPH_RADOS_API int64_t rados_ioctx_get_id(rados_ioctx_t io);
+
+/**
+ * Get the pool name of the io context
+ *
+ * @param io the io context to query
+ * @param buf pointer to buffer where name will be stored
+ * @param maxlen size of buffer where name will be stored
+ * @returns length of string stored, or -ERANGE if buffer too small
+ */
+CEPH_RADOS_API int rados_ioctx_get_pool_name(rados_ioctx_t io, char *buf,
+ unsigned maxlen);
+
+/** @} pools */
+
+/**
+ * @name Object Locators
+ *
+ * @{
+ */
+
+/**
+ * Set the key for mapping objects to pgs within an io context.
+ *
+ * The key is used instead of the object name to determine which
+ * placement groups an object is put in. This affects all subsequent
+ * operations of the io context - until a different locator key is
+ * set, all objects in this io context will be placed in the same pg.
+ *
+ * @param io the io context to change
+ * @param key the key to use as the object locator, or NULL to discard
+ * any previously set key
+ */
+CEPH_RADOS_API void rados_ioctx_locator_set_key(rados_ioctx_t io,
+ const char *key);
+
+/**
+ * Set the namespace for objects within an io context
+ *
+ * The namespace specification further refines a pool into different
+ * domains. The mapping of objects to pgs is also based on this
+ * value.
+ *
+ * @param io the io context to change
+ * @param nspace the name to use as the namespace, or NULL to use the
+ * default namespace
+ */
+CEPH_RADOS_API void rados_ioctx_set_namespace(rados_ioctx_t io,
+ const char *nspace);
+
+/**
+ * Get the namespace for objects within the io context
+ *
+ * @param io the io context to query
+ * @param buf pointer to buffer where name will be stored
+ * @param maxlen size of buffer where name will be stored
+ * @returns length of string stored, or -ERANGE if buffer too small
+ */
+CEPH_RADOS_API int rados_ioctx_get_namespace(rados_ioctx_t io, char *buf,
+ unsigned maxlen);
+
+/** @} obj_loc */
+
+/**
+ * @name Listing Objects
+ * @{
+ */
+/**
+ * Start listing objects in a pool
+ *
+ * @param io the pool to list from
+ * @param ctx the handle to store list context in
+ * @returns 0 on success, negative error code on failure
+ */
+CEPH_RADOS_API int rados_nobjects_list_open(rados_ioctx_t io,
+ rados_list_ctx_t *ctx);
+
+/**
+ * Return hash position of iterator, rounded to the current PG
+ *
+ * @param ctx iterator marking where you are in the listing
+ * @returns current hash position, rounded to the current pg
+ */
+CEPH_RADOS_API uint32_t rados_nobjects_list_get_pg_hash_position(rados_list_ctx_t ctx);
+
+/**
+ * Reposition object iterator to a different hash position
+ *
+ * @param ctx iterator marking where you are in the listing
+ * @param pos hash position to move to
+ * @returns actual (rounded) position we moved to
+ */
+CEPH_RADOS_API uint32_t rados_nobjects_list_seek(rados_list_ctx_t ctx,
+ uint32_t pos);
+
+/**
+ * Reposition object iterator to a different position
+ *
+ * @param ctx iterator marking where you are in the listing
+ * @param cursor position to move to
+ * @returns rounded position we moved to
+ */
+CEPH_RADOS_API uint32_t rados_nobjects_list_seek_cursor(rados_list_ctx_t ctx,
+ rados_object_list_cursor cursor);
+
+/**
+ * Get a cursor for the current position of the object iterator
+ *
+ * The returned handle must be released with rados_object_list_cursor_free().
+ *
+ * @param ctx iterator marking where you are in the listing
+ * @param cursor where to store cursor
+ * @returns 0 on success, negative error code on failure
+ */
+CEPH_RADOS_API int rados_nobjects_list_get_cursor(rados_list_ctx_t ctx,
+ rados_object_list_cursor *cursor);
+
+/**
+ * Get the next object name and locator in the pool
+ *
+ * *entry and *key are valid until next call to rados_nobjects_list_*
+ *
+ * @param ctx iterator marking where you are in the listing
+ * @param entry where to store the name of the entry
+ * @param key where to store the object locator (set to NULL to ignore)
+ * @param nspace where to store the object namespace (set to NULL to ignore)
+ * @returns 0 on success, negative error code on failure
+ * @returns -ENOENT when there are no more objects to list
+ */
+CEPH_RADOS_API int rados_nobjects_list_next(rados_list_ctx_t ctx,
+ const char **entry,
+ const char **key,
+ const char **nspace);
+
+/**
+ * Get the next object name, locator and their sizes in the pool
+ *
+ * The sizes make it possible to list objects with \0 (the NUL character)
+ * in e.g. *entry. It is unusual to see such object names, but a bug
+ * in a client has raised the need to handle them as well.
+ * *entry and *key are valid until next call to rados_nobjects_list_*
+ *
+ * @param ctx iterator marking where you are in the listing
+ * @param entry where to store the name of the entry
+ * @param key where to store the object locator (set to NULL to ignore)
+ * @param nspace where to store the object namespace (set to NULL to ignore)
+ * @param entry_size where to store the size of name of the entry
+ * @param key_size where to store the size of object locator (set to NULL to ignore)
+ * @param nspace_size where to store the size of object namespace (set to NULL to ignore)
+ * @returns 0 on success, negative error code on failure
+ * @returns -ENOENT when there are no more objects to list
+ */
+CEPH_RADOS_API int rados_nobjects_list_next2(rados_list_ctx_t ctx,
+ const char **entry,
+ const char **key,
+ const char **nspace,
+ size_t *entry_size,
+ size_t *key_size,
+ size_t *nspace_size);
+
+/**
+ * Close the object listing handle.
+ *
+ * This should be called when the handle is no longer needed.
+ * The handle should not be used after it has been closed.
+ *
+ * @param ctx the handle to close
+ */
+CEPH_RADOS_API void rados_nobjects_list_close(rados_list_ctx_t ctx);
+
+/**
+ * Get cursor handle pointing to the *beginning* of a pool.
+ *
+ * This is an opaque handle pointing to the start of a pool. It must
+ * be released with rados_object_list_cursor_free().
+ *
+ * @param io ioctx for the pool
+ * @returns handle for the pool, NULL on error (pool does not exist)
+ */
+CEPH_RADOS_API rados_object_list_cursor rados_object_list_begin(
+ rados_ioctx_t io);
+
+/**
+ * Get cursor handle pointing to the *end* of a pool.
+ *
+ * This is an opaque handle pointing to the end of a pool. It must
+ * be released with rados_object_list_cursor_free().
+ *
+ * @param io ioctx for the pool
+ * @returns handle for the pool, NULL on error (pool does not exist)
+ */
+CEPH_RADOS_API rados_object_list_cursor rados_object_list_end(rados_ioctx_t io);
+
+/**
+ * Check if a cursor has reached the end of a pool
+ *
+ * @param io ioctx
+ * @param cur cursor
+ * @returns 1 if the cursor has reached the end of the pool, 0 otherwise
+ */
+CEPH_RADOS_API int rados_object_list_is_end(rados_ioctx_t io,
+ rados_object_list_cursor cur);
+
+/**
+ * Release a cursor
+ *
+ * Release a cursor. The handle may not be used after this point.
+ *
+ * @param io ioctx
+ * @param cur cursor
+ */
+CEPH_RADOS_API void rados_object_list_cursor_free(rados_ioctx_t io,
+ rados_object_list_cursor cur);
+
+/**
+ * Compare two cursor positions
+ *
+ * Compare two cursors, and indicate whether the first cursor precedes,
+ * matches, or follows the second.
+ *
+ * @param io ioctx
+ * @param lhs first cursor
+ * @param rhs second cursor
+ * @returns -1, 0, or 1 for lhs < rhs, lhs == rhs, or lhs > rhs
+ */
+CEPH_RADOS_API int rados_object_list_cursor_cmp(rados_ioctx_t io,
+ rados_object_list_cursor lhs, rados_object_list_cursor rhs);
+
+/**
+ * @return the number of items set in the results array
+ */
+CEPH_RADOS_API int rados_object_list(rados_ioctx_t io,
+ const rados_object_list_cursor start,
+ const rados_object_list_cursor finish,
+ const size_t result_size,
+ const char *filter_buf,
+ const size_t filter_buf_len,
+ rados_object_list_item *results,
+ rados_object_list_cursor *next);
+
+CEPH_RADOS_API void rados_object_list_free(
+ const size_t result_size,
+ rados_object_list_item *results);
+
+/**
+ * Obtain cursors delineating a subset of a range. Use this
+ * when you want to split up the work of iterating over the
+ * global namespace. Expected use case is when you are iterating
+ * in parallel, with `m` workers, and each worker taking an id `n`.
+ *
+ * @param io ioctx
+ * @param start start of the range to be sliced up (inclusive)
+ * @param finish end of the range to be sliced up (exclusive)
+ * @param n which of the m chunks you would like to get cursors for
+ * @param m how many chunks to divide start-finish into
+ * @param split_start cursor populated with start of the subrange (inclusive)
+ * @param split_finish cursor populated with end of the subrange (exclusive)
+ */
+CEPH_RADOS_API void rados_object_list_slice(rados_ioctx_t io,
+ const rados_object_list_cursor start,
+ const rados_object_list_cursor finish,
+ const size_t n,
+ const size_t m,
+ rados_object_list_cursor *split_start,
+ rados_object_list_cursor *split_finish);
+
+
+/** @} Listing Objects */
+
+/**
+ * @name Snapshots
+ *
+ * RADOS snapshots are based upon sequence numbers that form a
+ * snapshot context. They are pool-specific. The snapshot context
+ * consists of the current snapshot sequence number for a pool, and an
+ * array of sequence numbers at which snapshots were taken, in
+ * descending order. Whenever a snapshot is created or deleted, the
+ * snapshot sequence number for the pool is increased. To add a new
+ * snapshot, the new snapshot sequence number must be increased and
+ * added to the snapshot context.
+ *
+ * There are two ways to manage these snapshot contexts:
+ * -# within the RADOS cluster
+ * These are called pool snapshots, and store the snapshot context
+ * in the OSDMap. These represent a snapshot of all the objects in
+ * a pool.
+ * -# within the RADOS clients
+ * These are called self-managed snapshots, and push the
+ * responsibility for keeping track of the snapshot context to the
+ * clients. For every write, the client must send the snapshot
+ * context. In librados, this is accomplished with
+ * rados_selfmanaged_snap_set_write_ctx(). These are more
+ * difficult to manage, but are restricted to specific objects
+ * instead of applying to an entire pool.
+ *
+ * @{
+ */
+
+/**
+ * Create a pool-wide snapshot
+ *
+ * @param io the pool to snapshot
+ * @param snapname the name of the snapshot
+ * @returns 0 on success, negative error code on failure
+ */
+CEPH_RADOS_API int rados_ioctx_snap_create(rados_ioctx_t io,
+ const char *snapname);
+
+/**
+ * Delete a pool snapshot
+ *
+ * @param io the pool to delete the snapshot from
+ * @param snapname which snapshot to delete
+ * @returns 0 on success, negative error code on failure
+ */
+CEPH_RADOS_API int rados_ioctx_snap_remove(rados_ioctx_t io,
+ const char *snapname);
+
+/**
+ * Rollback an object to a pool snapshot
+ *
+ * The contents of the object will be the same as
+ * when the snapshot was taken.
+ *
+ * @param io the pool in which the object is stored
+ * @param oid the name of the object to rollback
+ * @param snapname which snapshot to rollback to
+ * @returns 0 on success, negative error code on failure
+ */
+CEPH_RADOS_API int rados_ioctx_snap_rollback(rados_ioctx_t io, const char *oid,
+ const char *snapname);
+
+/**
+ * @warning Deprecated: Use rados_ioctx_snap_rollback() instead
+ */
+CEPH_RADOS_API int rados_rollback(rados_ioctx_t io, const char *oid,
+ const char *snapname)
+ __attribute__((deprecated));
+
+/**
+ * Set the snapshot from which reads are performed.
+ *
+ * Subsequent reads will return data as it was at the time of that
+ * snapshot.
+ *
+ * @param io the io context to change
+ * @param snap the id of the snapshot to set, or LIBRADOS_SNAP_HEAD for no
+ * snapshot (i.e. normal operation)
+ */
+CEPH_RADOS_API void rados_ioctx_snap_set_read(rados_ioctx_t io,
+ rados_snap_t snap);
+
+/**
+ * Allocate an ID for a self-managed snapshot
+ *
+ * Get a unique ID to put in the snapshot context to create a
+ * snapshot. A clone of an object is not created until a write with
+ * the new snapshot context is completed.
+ *
+ * @param io the pool in which the snapshot will exist
+ * @param snapid where to store the newly allocated snapshot ID
+ * @returns 0 on success, negative error code on failure
+ */
+CEPH_RADOS_API int rados_ioctx_selfmanaged_snap_create(rados_ioctx_t io,
+ rados_snap_t *snapid);
+CEPH_RADOS_API void
+rados_aio_ioctx_selfmanaged_snap_create(rados_ioctx_t io,
+ rados_snap_t *snapid,
+ rados_completion_t completion);
+
+/**
+ * Remove a self-managed snapshot
+ *
+ * This increases the snapshot sequence number, which will cause
+ * snapshots to be removed lazily.
+ *
+ * @param io the pool in which the snapshot exists
+ * @param snapid the ID of the snapshot to remove
+ * @returns 0 on success, negative error code on failure
+ */
+CEPH_RADOS_API int rados_ioctx_selfmanaged_snap_remove(rados_ioctx_t io,
+ rados_snap_t snapid);
+CEPH_RADOS_API void
+rados_aio_ioctx_selfmanaged_snap_remove(rados_ioctx_t io,
+ rados_snap_t snapid,
+ rados_completion_t completion);
+
+/**
+ * Rollback an object to a self-managed snapshot
+ *
+ * The contents of the object will be the same as
+ * when the snapshot was taken.
+ *
+ * @param io the pool in which the object is stored
+ * @param oid the name of the object to rollback
+ * @param snapid which snapshot to rollback to
+ * @returns 0 on success, negative error code on failure
+ */
+CEPH_RADOS_API int rados_ioctx_selfmanaged_snap_rollback(rados_ioctx_t io,
+ const char *oid,
+ rados_snap_t snapid);
+
+/**
+ * Set the snapshot context for use when writing to objects
+ *
+ * This is stored in the io context, and applies to all future writes.
+ *
+ * @param io the io context to change
+ * @param seq the newest snapshot sequence number for the pool
+ * @param snaps array of snapshots sorted by descending id
+ * @param num_snaps how many snapshots are in the snaps array
+ * @returns 0 on success, negative error code on failure
+ * @returns -EINVAL if snaps are not in descending order
+ */
+CEPH_RADOS_API int rados_ioctx_selfmanaged_snap_set_write_ctx(rados_ioctx_t io,
+ rados_snap_t seq,
+ rados_snap_t *snaps,
+ int num_snaps);
+
+/**
+ * List all the ids of pool snapshots
+ *
+ * If the output array does not have enough space to fit all the
+ * snapshots, -ERANGE is returned and the caller should retry with a
+ * larger array.
+ *
+ * @param io the pool to read from
+ * @param snaps where to store the results
+ * @param maxlen the number of rados_snap_t that fit in the snaps array
+ * @returns number of snapshots on success, negative error code on failure
+ * @returns -ERANGE if the snaps array is too short
+ */
+CEPH_RADOS_API int rados_ioctx_snap_list(rados_ioctx_t io, rados_snap_t *snaps,
+ int maxlen);
+
+/**
+ * Get the id of a pool snapshot
+ *
+ * @param io the pool to read from
+ * @param name the snapshot to find
+ * @param id where to store the result
+ * @returns 0 on success, negative error code on failure
+ */
+CEPH_RADOS_API int rados_ioctx_snap_lookup(rados_ioctx_t io, const char *name,
+ rados_snap_t *id);
+
+/**
+ * Get the name of a pool snapshot
+ *
+ * @param io the pool to read from
+ * @param id the snapshot to find
+ * @param name where to store the result
+ * @param maxlen the size of the name array
+ * @returns 0 on success, negative error code on failure
+ * @returns -ERANGE if the name array is too small
+ */
+CEPH_RADOS_API int rados_ioctx_snap_get_name(rados_ioctx_t io, rados_snap_t id,
+ char *name, int maxlen);
+
+/**
+ * Find when a pool snapshot occurred
+ *
+ * @param io the pool the snapshot was taken in
+ * @param id the snapshot to lookup
+ * @param t where to store the result
+ * @returns 0 on success, negative error code on failure
+ */
+CEPH_RADOS_API int rados_ioctx_snap_get_stamp(rados_ioctx_t io, rados_snap_t id,
+ time_t *t);
+
+/** @} Snapshots */
+
+/**
+ * @name Synchronous I/O
+ * Writes are replicated to a number of OSDs based on the
+ * configuration of the pool they are in. These write functions block
+ * until data is in memory on all replicas of the object they're
+ * writing to - they are equivalent to doing the corresponding
+ * asynchronous write, and then calling
+ * rados_aio_wait_for_complete(). For greater data safety, use the
+ * asynchronous functions and rados_aio_wait_for_safe().
+ *
+ * @{
+ */
+
+/**
+ * Return the version of the last object read or written to.
+ *
+ * This exposes the internal version number of the last object read or
+ * written via this io context
+ *
+ * @param io the io context to check
+ * @returns last read or written object version
+ */
+CEPH_RADOS_API uint64_t rados_get_last_version(rados_ioctx_t io);
+
+/**
+ * Write *len* bytes from *buf* into the *oid* object, starting at
+ * offset *off*. The value of *len* must be <= UINT_MAX/2.
+ *
+ * @note This will never return a positive value not equal to len.
+ * @param io the io context in which the write will occur
+ * @param oid name of the object
+ * @param buf data to write
+ * @param len length of the data, in bytes
+ * @param off byte offset in the object to begin writing at
+ * @returns 0 on success, negative error code on failure
+ */
+CEPH_RADOS_API int rados_write(rados_ioctx_t io, const char *oid,
+ const char *buf, size_t len, uint64_t off);
+
+/**
+ * Write *len* bytes from *buf* into the *oid* object. The value of
+ * *len* must be <= UINT_MAX/2.
+ *
+ * The object is filled with the provided data. If the object exists,
+ * it is atomically truncated and then written.
+ *
+ * @param io the io context in which the write will occur
+ * @param oid name of the object
+ * @param buf data to write
+ * @param len length of the data, in bytes
+ * @returns 0 on success, negative error code on failure
+ */
+CEPH_RADOS_API int rados_write_full(rados_ioctx_t io, const char *oid,
+ const char *buf, size_t len);
+
+/**
+ * Write the same *data_len* bytes from *buf* multiple times into the
+ * *oid* object. *write_len* bytes are written in total, which must be
+ * a multiple of *data_len*. The value of *write_len* and *data_len*
+ * must be <= UINT_MAX/2.
+ *
+ * @param io the io context in which the write will occur
+ * @param oid name of the object
+ * @param buf data to write
+ * @param data_len length of the data, in bytes
+ * @param write_len the total number of bytes to write
+ * @param off byte offset in the object to begin writing at
+ * @returns 0 on success, negative error code on failure
+ */
+CEPH_RADOS_API int rados_writesame(rados_ioctx_t io, const char *oid,
+ const char *buf, size_t data_len,
+ size_t write_len, uint64_t off);
+
+/**
+ * Append *len* bytes from *buf* into the *oid* object. The value of
+ * *len* must be <= UINT_MAX/2.
+ *
+ * @param io the context to operate in
+ * @param oid the name of the object
+ * @param buf the data to append
+ * @param len length of buf (in bytes)
+ * @returns 0 on success, negative error code on failure
+ */
+CEPH_RADOS_API int rados_append(rados_ioctx_t io, const char *oid,
+ const char *buf, size_t len);
+
+/**
+ * Read data from an object
+ *
+ * The io context determines the snapshot to read from, if any was set
+ * by rados_ioctx_snap_set_read().
+ *
+ * @param io the context in which to perform the read
+ * @param oid the name of the object to read from
+ * @param buf where to store the results
+ * @param len the number of bytes to read
+ * @param off the offset to start reading from in the object
+ * @returns number of bytes read on success, negative error code on
+ * failure
+ */
+CEPH_RADOS_API int rados_read(rados_ioctx_t io, const char *oid, char *buf,
+ size_t len, uint64_t off);
+
+/**
+ * Compute checksum from object data
+ *
+ * The io context determines the snapshot to checksum, if any was set
+ * by rados_ioctx_snap_set_read(). The length of the init_value and
+ * resulting checksum are dependent upon the checksum type:
+ *
+ * XXHASH64: le64
+ * XXHASH32: le32
+ * CRC32C: le32
+ *
+ * The checksum result is encoded in the following manner:
+ *
+ * le32 num_checksum_chunks
+ * {
+ * leXX checksum for chunk (where XX = appropriate size for the checksum type)
+ * } * num_checksum_chunks
+ *
+ * @param io the context in which to perform the checksum
+ * @param oid the name of the object to checksum
+ * @param type the checksum algorithm to utilize
+ * @param init_value the init value for the algorithm
+ * @param init_value_len the length of the init value
+ * @param len the number of bytes to checksum
+ * @param off the offset to start checksumming in the object
+ * @param chunk_size optional length-aligned chunk size for checksums
+ * @param pchecksum where to store the checksum result
+ * @param checksum_len the number of bytes available for the result
+ * @return negative error code on failure
+ */
+CEPH_RADOS_API int rados_checksum(rados_ioctx_t io, const char *oid,
+ rados_checksum_type_t type,
+ const char *init_value, size_t init_value_len,
+ size_t len, uint64_t off, size_t chunk_size,
+ char *pchecksum, size_t checksum_len);
+
+/**
+ * Delete an object
+ *
+ * @note This does not delete any snapshots of the object.
+ *
+ * @param io the pool to delete the object from
+ * @param oid the name of the object to delete
+ * @returns 0 on success, negative error code on failure
+ */
+CEPH_RADOS_API int rados_remove(rados_ioctx_t io, const char *oid);
+
+/**
+ * Resize an object
+ *
+ * If this enlarges the object, the new area is logically filled with
+ * zeroes. If this shrinks the object, the excess data is removed.
+ *
+ * @param io the context in which to truncate
+ * @param oid the name of the object
+ * @param size the new size of the object in bytes
+ * @returns 0 on success, negative error code on failure
+ */
+CEPH_RADOS_API int rados_trunc(rados_ioctx_t io, const char *oid,
+ uint64_t size);
+
+/**
+ * Compare an on-disk object range with a buffer
+ *
+ * @param io the context in which to perform the comparison
+ * @param o name of the object
+ * @param cmp_buf buffer containing bytes to be compared with object contents
+ * @param cmp_len length to compare and size of @c cmp_buf in bytes
+ * @param off object byte offset at which to start the comparison
+ * @returns 0 on success, negative error code on failure,
+ * (-MAX_ERRNO - mismatch_off) on mismatch
+ */
+CEPH_RADOS_API int rados_cmpext(rados_ioctx_t io, const char *o,
+ const char *cmp_buf, size_t cmp_len,
+ uint64_t off);
+
+/**
+ * @name Xattrs
+ * Extended attributes are stored as extended attributes on the files
+ * representing an object on the OSDs. Thus, they have the same
+ * limitations as the underlying filesystem. On ext4, this means that
+ * the total data stored in xattrs cannot exceed 4KB.
+ *
+ * @{
+ */
+
+/**
+ * Get the value of an extended attribute on an object.
+ *
+ * @param io the context in which the attribute is read
+ * @param o name of the object
+ * @param name which extended attribute to read
+ * @param buf where to store the result
+ * @param len size of buf in bytes
+ * @returns length of xattr value on success, negative error code on failure
+ */
+CEPH_RADOS_API int rados_getxattr(rados_ioctx_t io, const char *o,
+ const char *name, char *buf, size_t len);
+
+/**
+ * Set an extended attribute on an object.
+ *
+ * @param io the context in which xattr is set
+ * @param o name of the object
+ * @param name which extended attribute to set
+ * @param buf what to store in the xattr
+ * @param len the number of bytes in buf
+ * @returns 0 on success, negative error code on failure
+ */
+CEPH_RADOS_API int rados_setxattr(rados_ioctx_t io, const char *o,
+ const char *name, const char *buf,
+ size_t len);
+
+/**
+ * Delete an extended attribute from an object.
+ *
+ * @param io the context in which to delete the xattr
+ * @param o the name of the object
+ * @param name which xattr to delete
+ * @returns 0 on success, negative error code on failure
+ */
+CEPH_RADOS_API int rados_rmxattr(rados_ioctx_t io, const char *o,
+ const char *name);
+
+/**
+ * Start iterating over xattrs on an object.
+ *
+ * @post iter is a valid iterator
+ *
+ * @param io the context in which to list xattrs
+ * @param oid name of the object
+ * @param iter where to store the iterator
+ * @returns 0 on success, negative error code on failure
+ */
+CEPH_RADOS_API int rados_getxattrs(rados_ioctx_t io, const char *oid,
+ rados_xattrs_iter_t *iter);
+
+/**
+ * Get the next xattr on the object
+ *
+ * @pre iter is a valid iterator
+ *
+ * @post name is the null-terminated name of the next xattr, and val
+ * contains the value of the xattr, which is of length len. If the end
+ * of the list has been reached, name and val are NULL, and len is 0.
+ *
+ * @param iter iterator to advance
+ * @param name where to store the name of the next xattr
+ * @param val where to store the value of the next xattr
+ * @param len where to store the number of bytes in val
+ * @returns 0 on success, negative error code on failure
+ */
+CEPH_RADOS_API int rados_getxattrs_next(rados_xattrs_iter_t iter,
+ const char **name, const char **val,
+ size_t *len);
+
+/**
+ * Close the xattr iterator.
+ *
+ * iter should not be used after this is called.
+ *
+ * @param iter the iterator to close
+ */
+CEPH_RADOS_API void rados_getxattrs_end(rados_xattrs_iter_t iter);
+
+/** @} Xattrs */
+
+/**
+ * Get the next omap key/value pair on the object
+ *
+ * @pre iter is a valid iterator
+ *
+ * @post key and val are the next key/value pair. key is
+ * null-terminated, and val has length len. If the end of the list has
+ * been reached, key and val are NULL, and len is 0. key and val will
+ * not be accessible after rados_omap_get_end() is called on iter, so
+ * if they are needed after that they should be copied.
+ *
+ * @param iter iterator to advance
+ * @param key where to store the key of the next omap entry
+ * @param val where to store the value of the next omap entry
+ * @param len where to store the number of bytes in val
+ * @returns 0 on success, negative error code on failure
+ */
+CEPH_RADOS_API int rados_omap_get_next(rados_omap_iter_t iter,
+ char **key,
+ char **val,
+ size_t *len);
+
+/**
+ * Get the next omap key/value pair on the object. Note that it's
+ * perfectly safe to mix calls to rados_omap_get_next and
+ * rados_omap_get_next2.
+ *
+ * @pre iter is a valid iterator
+ *
+ * @post key and val are the next key/value pair. key has length
+ * key_len and val has length val_len. If the end of the list has
+ * been reached, key and val are NULL, and key_len and val_len are 0.
+ * key and val will not be accessible after rados_omap_get_end()
+ * is called on iter, so if they are needed after that they
+ * should be copied.
+ *
+ * @param iter iterator to advance
+ * @param key where to store the key of the next omap entry
+ * @param val where to store the value of the next omap entry
+ * @param key_len where to store the number of bytes in key
+ * @param val_len where to store the number of bytes in val
+ * @returns 0 on success, negative error code on failure
+ */
+CEPH_RADOS_API int rados_omap_get_next2(rados_omap_iter_t iter,
+ char **key,
+ char **val,
+ size_t *key_len,
+ size_t *val_len);
+
+/**
+ * Return number of elements in the iterator
+ *
+ * @param iter the iterator of which to return the size
+ */
+CEPH_RADOS_API unsigned int rados_omap_iter_size(rados_omap_iter_t iter);
+
+/**
+ * Close the omap iterator.
+ *
+ * iter should not be used after this is called.
+ *
+ * @param iter the iterator to close
+ */
+CEPH_RADOS_API void rados_omap_get_end(rados_omap_iter_t iter);
+
+/**
+ * Get object size and most recent update time from the OSD.
+ *
+ * @param io ioctx
+ * @param o object name
+ * @param psize where to store object size
+ * @param pmtime where to store modification time
+ * @returns 0 on success, negative error code on failure
+ */
+CEPH_RADOS_API int rados_stat(rados_ioctx_t io, const char *o, uint64_t *psize,
+ time_t *pmtime);
+
+CEPH_RADOS_API int rados_stat2(rados_ioctx_t io, const char *o, uint64_t *psize,
+ struct timespec *pmtime);
+
+/**
+ * Execute an OSD class method on an object
+ *
+ * The OSD has a plugin mechanism for performing complicated
+ * operations on an object atomically. These plugins are called
+ * classes. This function allows librados users to call the custom
+ * methods. The input and output formats are defined by the class.
+ * Classes in ceph.git can be found in src/cls subdirectories
+ *
+ * @param io the context in which to call the method
+ * @param oid the object to call the method on
+ * @param cls the name of the class
+ * @param method the name of the method
+ * @param in_buf where to find input
+ * @param in_len length of in_buf in bytes
+ * @param buf where to store output
+ * @param out_len length of buf in bytes
+ * @returns the length of the output, or
+ * -ERANGE if buf does not have enough space to store it (for methods that return data). For
+ * methods that don't return data, the return value is
+ * method-specific.
+ */
+CEPH_RADOS_API int rados_exec(rados_ioctx_t io, const char *oid,
+ const char *cls, const char *method,
+ const char *in_buf, size_t in_len, char *buf,
+ size_t out_len);
+
+
+/** @} Synchronous I/O */
+
+/**
+ * @name Asynchronous I/O
+ * Read and write to objects without blocking.
+ *
+ * @{
+ */
+
+/**
+ * @typedef rados_callback_t
+ * Callbacks for asynchronous operations take two parameters:
+ * - cb the completion that has finished
+ * - arg application defined data made available to the callback function
+ */
+typedef void (*rados_callback_t)(rados_completion_t cb, void *arg);
+
+/**
+ * Constructs a completion to use with asynchronous operations
+ *
+ * The complete and safe callbacks correspond to operations being
+ * acked and committed, respectively. The callbacks are called in
+ * order of receipt, so the safe callback may be triggered before the
+ * complete callback, and vice versa. This is affected by journalling
+ * on the OSDs.
+ *
+ * TODO: more complete documentation of this elsewhere (in the RADOS docs?)
+ *
+ * @note Read operations only get a complete callback.
+ * @note BUG: this should check for ENOMEM instead of throwing an exception
+ *
+ * @param cb_arg application-defined data passed to the callback functions
+ * @param cb_complete the function to be called when the operation is
+ * in memory on all replicas
+ * @param cb_safe the function to be called when the operation is on
+ * stable storage on all replicas
+ * @param pc where to store the completion
+ * @returns 0
+ */
+CEPH_RADOS_API int rados_aio_create_completion(void *cb_arg,
+ rados_callback_t cb_complete,
+ rados_callback_t cb_safe,
+ rados_completion_t *pc);
+
+/**
+ * Constructs a completion to use with asynchronous operations
+ *
+ * The complete callback corresponds to operation being acked.
+ *
+ * @note BUG: this should check for ENOMEM instead of throwing an exception
+ *
+ * @param cb_arg application-defined data passed to the callback functions
+ * @param cb_complete the function to be called when the operation is committed
+ * on all replicas
+ * @param pc where to store the completion
+ * @returns 0
+ */
+CEPH_RADOS_API int rados_aio_create_completion2(void *cb_arg,
+ rados_callback_t cb_complete,
+ rados_completion_t *pc);
+
+/**
+ * Block until an operation completes
+ *
+ * This means it is in memory on all replicas.
+ *
+ * @note BUG: this should be void
+ *
+ * @param c operation to wait for
+ * @returns 0
+ */
+CEPH_RADOS_API int rados_aio_wait_for_complete(rados_completion_t c);
+
+/**
+ * Block until an operation is safe
+ *
+ * This means it is on stable storage on all replicas.
+ *
+ * @note BUG: this should be void
+ *
+ * @param c operation to wait for
+ * @returns 0
+ */
+CEPH_RADOS_API int rados_aio_wait_for_safe(rados_completion_t c)
+ __attribute__((deprecated));
+
+/**
+ * Has an asynchronous operation completed?
+ *
+ * @warning This does not imply that the complete callback has
+ * finished
+ *
+ * @param c async operation to inspect
+ * @returns whether c is complete
+ */
+CEPH_RADOS_API int rados_aio_is_complete(rados_completion_t c);
+
+/**
+ * Is an asynchronous operation safe?
+ *
+ * @warning This does not imply that the safe callback has
+ * finished
+ *
+ * @param c async operation to inspect
+ * @returns whether c is safe
+ */
+CEPH_RADOS_API int rados_aio_is_safe(rados_completion_t c);
+
+/**
+ * Block until an operation completes and callback completes
+ *
+ * This means it is in memory on all replicas and can be read.
+ *
+ * @note BUG: this should be void
+ *
+ * @param c operation to wait for
+ * @returns 0
+ */
+CEPH_RADOS_API int rados_aio_wait_for_complete_and_cb(rados_completion_t c);
+
+/**
+ * Block until an operation is safe and callback has completed
+ *
+ * This means it is on stable storage on all replicas.
+ *
+ * @note BUG: this should be void
+ *
+ * @param c operation to wait for
+ * @returns 0
+ */
+CEPH_RADOS_API int rados_aio_wait_for_safe_and_cb(rados_completion_t c)
+ __attribute__((deprecated));
+
+/**
+ * Has an asynchronous operation and callback completed
+ *
+ * @param c async operation to inspect
+ * @returns whether c is complete
+ */
+CEPH_RADOS_API int rados_aio_is_complete_and_cb(rados_completion_t c);
+
+/**
+ * Is an asynchronous operation safe and has the callback completed
+ *
+ * @param c async operation to inspect
+ * @returns whether c is safe
+ */
+CEPH_RADOS_API int rados_aio_is_safe_and_cb(rados_completion_t c);
+
+/**
+ * Get the return value of an asynchronous operation
+ *
+ * The return value is set when the operation is complete or safe,
+ * whichever comes first.
+ *
+ * @pre The operation is safe or complete
+ *
+ * @note BUG: complete callback may never be called when the safe
+ * message is received before the complete message
+ *
+ * @param c async operation to inspect
+ * @returns return value of the operation
+ */
+CEPH_RADOS_API int rados_aio_get_return_value(rados_completion_t c);
+
+/**
+ * Get the internal object version of the target of an asynchronous operation
+ *
+ * The return value is set when the operation is complete or safe,
+ * whichever comes first.
+ *
+ * @pre The operation is safe or complete
+ *
+ * @note BUG: complete callback may never be called when the safe
+ * message is received before the complete message
+ *
+ * @param c async operation to inspect
+ * @returns version number of the asynchronous operation's target
+ */
+CEPH_RADOS_API uint64_t rados_aio_get_version(rados_completion_t c);
+
+/**
+ * Release a completion
+ *
+ * Call this when you no longer need the completion. It may not be
+ * freed immediately if the operation is not acked and committed.
+ *
+ * @param c completion to release
+ */
+CEPH_RADOS_API void rados_aio_release(rados_completion_t c);
+
+/**
+ * Write data to an object asynchronously
+ *
+ * Queues the write and returns. The return value of the completion
+ * will be 0 on success, negative error code on failure.
+ *
+ * @param io the context in which the write will occur
+ * @param oid name of the object
+ * @param completion what to do when the write is safe and complete
+ * @param buf data to write
+ * @param len length of the data, in bytes
+ * @param off byte offset in the object to begin writing at
+ * @returns 0 on success, -EROFS if the io context specifies a snap_seq
+ * other than LIBRADOS_SNAP_HEAD
+ */
+CEPH_RADOS_API int rados_aio_write(rados_ioctx_t io, const char *oid,
+ rados_completion_t completion,
+ const char *buf, size_t len, uint64_t off);
+
+/**
+ * Asynchronously append data to an object
+ *
+ * Queues the append and returns.
+ *
+ * The return value of the completion will be 0 on success, negative
+ * error code on failure.
+ *
+ * @param io the context to operate in
+ * @param oid the name of the object
+ * @param completion what to do when the append is safe and complete
+ * @param buf the data to append
+ * @param len length of buf (in bytes)
+ * @returns 0 on success, -EROFS if the io context specifies a snap_seq
+ * other than LIBRADOS_SNAP_HEAD
+ */
+CEPH_RADOS_API int rados_aio_append(rados_ioctx_t io, const char *oid,
+ rados_completion_t completion,
+ const char *buf, size_t len);
+
+/**
+ * Asynchronously write an entire object
+ *
+ * The object is filled with the provided data. If the object exists,
+ * it is atomically truncated and then written.
+ * Queues the write_full and returns.
+ *
+ * The return value of the completion will be 0 on success, negative
+ * error code on failure.
+ *
+ * @param io the io context in which the write will occur
+ * @param oid name of the object
+ * @param completion what to do when the write_full is safe and complete
+ * @param buf data to write
+ * @param len length of the data, in bytes
+ * @returns 0 on success, -EROFS if the io context specifies a snap_seq
+ * other than LIBRADOS_SNAP_HEAD
+ */
+CEPH_RADOS_API int rados_aio_write_full(rados_ioctx_t io, const char *oid,
+ rados_completion_t completion,
+ const char *buf, size_t len);
+
+/**
+ * Asynchronously write the same buffer multiple times
+ *
+ * Queues the writesame and returns.
+ *
+ * The return value of the completion will be 0 on success, negative
+ * error code on failure.
+ *
+ * @param io the io context in which the write will occur
+ * @param oid name of the object
+ * @param completion what to do when the writesame is safe and complete
+ * @param buf data to write
+ * @param data_len length of the data, in bytes
+ * @param write_len the total number of bytes to write
+ * @param off byte offset in the object to begin writing at
+ * @returns 0 on success, -EROFS if the io context specifies a snap_seq
+ * other than LIBRADOS_SNAP_HEAD
+ */
+CEPH_RADOS_API int rados_aio_writesame(rados_ioctx_t io, const char *oid,
+ rados_completion_t completion,
+ const char *buf, size_t data_len,
+ size_t write_len, uint64_t off);
+
+/**
+ * Asynchronously remove an object
+ *
+ * Queues the remove and returns.
+ *
+ * The return value of the completion will be 0 on success, negative
+ * error code on failure.
+ *
+ * @param io the context to operate in
+ * @param oid the name of the object
+ * @param completion what to do when the remove is safe and complete
+ * @returns 0 on success, -EROFS if the io context specifies a snap_seq
+ * other than LIBRADOS_SNAP_HEAD
+ */
+CEPH_RADOS_API int rados_aio_remove(rados_ioctx_t io, const char *oid,
+ rados_completion_t completion);
+
+/**
+ * Asynchronously read data from an object
+ *
+ * The io context determines the snapshot to read from, if any was set
+ * by rados_ioctx_snap_set_read().
+ *
+ * The return value of the completion will be number of bytes read on
+ * success, negative error code on failure.
+ *
+ * @note only the 'complete' callback of the completion will be called.
+ *
+ * @param io the context in which to perform the read
+ * @param oid the name of the object to read from
+ * @param completion what to do when the read is complete
+ * @param buf where to store the results
+ * @param len the number of bytes to read
+ * @param off the offset to start reading from in the object
+ * @returns 0 on success, negative error code on failure
+ */
+CEPH_RADOS_API int rados_aio_read(rados_ioctx_t io, const char *oid,
+ rados_completion_t completion,
+ char *buf, size_t len, uint64_t off);
+
+/**
+ * Block until all pending writes in an io context are safe
+ *
+ * This is not equivalent to calling rados_aio_wait_for_safe() on all
+ * write completions, since this waits for the associated callbacks to
+ * complete as well.
+ *
+ * @note BUG: always returns 0, should be void or accept a timeout
+ *
+ * @param io the context to flush
+ * @returns 0 on success, negative error code on failure
+ */
+CEPH_RADOS_API int rados_aio_flush(rados_ioctx_t io);
+
+
+/**
+ * Schedule a callback for when all currently pending
+ * aio writes are safe. This is a non-blocking version of
+ * rados_aio_flush().
+ *
+ * @param io the context to flush
+ * @param completion what to do when the writes are safe
+ * @returns 0 on success, negative error code on failure
+ */
+CEPH_RADOS_API int rados_aio_flush_async(rados_ioctx_t io,
+ rados_completion_t completion);
+
+
+/**
+ * Asynchronously get object stats (size/mtime)
+ *
+ * @param io ioctx
+ * @param o object name
+ * @param completion what to do when the stat is complete
+ * @param psize where to store object size
+ * @param pmtime where to store modification time
+ * @returns 0 on success, negative error code on failure
+ */
+CEPH_RADOS_API int rados_aio_stat(rados_ioctx_t io, const char *o,
+ rados_completion_t completion,
+ uint64_t *psize, time_t *pmtime);
+
+CEPH_RADOS_API int rados_aio_stat2(rados_ioctx_t io, const char *o,
+ rados_completion_t completion,
+ uint64_t *psize, struct timespec *pmtime);
+
+/**
+ * Asynchronously compare an on-disk object range with a buffer
+ *
+ * @param io the context in which to perform the comparison
+ * @param o the name of the object to compare with
+ * @param completion what to do when the comparison is complete
+ * @param cmp_buf buffer containing bytes to be compared with object contents
+ * @param cmp_len length to compare and size of @c cmp_buf in bytes
+ * @param off object byte offset at which to start the comparison
+ * @returns 0 on success, negative error code on failure,
+ * (-MAX_ERRNO - mismatch_off) on mismatch
+ */
+CEPH_RADOS_API int rados_aio_cmpext(rados_ioctx_t io, const char *o,
+ rados_completion_t completion,
+ const char *cmp_buf,
+ size_t cmp_len,
+ uint64_t off);
+
+/**
+ * Cancel async operation
+ *
+ * @param io ioctx
+ * @param completion completion handle
+ * @returns 0 on success, negative error code on failure
+ */
+CEPH_RADOS_API int rados_aio_cancel(rados_ioctx_t io,
+ rados_completion_t completion);
+
+/**
+ * Asynchronously execute an OSD class method on an object
+ *
+ * The OSD has a plugin mechanism for performing complicated
+ * operations on an object atomically. These plugins are called
+ * classes. This function allows librados users to call the custom
+ * methods. The input and output formats are defined by the class.
+ * Classes in ceph.git can be found in src/cls subdirectories
+ *
+ * @param io the context in which to call the method
+ * @param o name of the object
+ * @param completion what to do when the exec completes
+ * @param cls the name of the class
+ * @param method the name of the method
+ * @param in_buf where to find input
+ * @param in_len length of in_buf in bytes
+ * @param buf where to store output
+ * @param out_len length of buf in bytes
+ * @returns 0 on success, negative error code on failure
+ */
+CEPH_RADOS_API int rados_aio_exec(rados_ioctx_t io, const char *o,
+ rados_completion_t completion,
+ const char *cls, const char *method,
+ const char *in_buf, size_t in_len,
+ char *buf, size_t out_len);
+
+/** @} Asynchronous I/O */
+
+/**
+ * @name Asynchronous Xattrs
+ * Extended attributes are stored as extended attributes on the files
+ * representing an object on the OSDs. Thus, they have the same
+ * limitations as the underlying filesystem. On ext4, this means that
+ * the total data stored in xattrs cannot exceed 4KB.
+ *
+ * @{
+ */
+
+/**
+ * Asynchronously get the value of an extended attribute on an object.
+ *
+ * @param io the context in which the attribute is read
+ * @param o name of the object
+ * @param completion what to do when the getxattr completes
+ * @param name which extended attribute to read
+ * @param buf where to store the result
+ * @param len size of buf in bytes
+ * @returns length of xattr value on success, negative error code on failure
+ */
+CEPH_RADOS_API int rados_aio_getxattr(rados_ioctx_t io, const char *o,
+ rados_completion_t completion,
+ const char *name, char *buf, size_t len);
+
+/**
+ * Asynchronously set an extended attribute on an object.
+ *
+ * @param io the context in which xattr is set
+ * @param o name of the object
+ * @param completion what to do when the setxattr completes
+ * @param name which extended attribute to set
+ * @param buf what to store in the xattr
+ * @param len the number of bytes in buf
+ * @returns 0 on success, negative error code on failure
+ */
+CEPH_RADOS_API int rados_aio_setxattr(rados_ioctx_t io, const char *o,
+ rados_completion_t completion,
+ const char *name, const char *buf,
+ size_t len);
+
+/**
+ * Asynchronously delete an extended attribute from an object.
+ *
+ * @param io the context in which to delete the xattr
+ * @param o the name of the object
+ * @param completion what to do when the rmxattr completes
+ * @param name which xattr to delete
+ * @returns 0 on success, negative error code on failure
+ */
+CEPH_RADOS_API int rados_aio_rmxattr(rados_ioctx_t io, const char *o,
+ rados_completion_t completion,
+ const char *name);
+
+/**
+ * Asynchronously start iterating over xattrs on an object.
+ *
+ * @post iter is a valid iterator
+ *
+ * @param io the context in which to list xattrs
+ * @param oid name of the object
+ * @param completion what to do when the getxattrs completes
+ * @param iter where to store the iterator
+ * @returns 0 on success, negative error code on failure
+ */
+CEPH_RADOS_API int rados_aio_getxattrs(rados_ioctx_t io, const char *oid,
+ rados_completion_t completion,
+ rados_xattrs_iter_t *iter);
+
+/** @} Asynchronous Xattrs */
+
+/**
+ * @name Watch/Notify
+ *
+ * Watch/notify is a protocol to help communicate among clients. It
+ * can be used to synchronize client state. All that's needed is a
+ * well-known object name (for example, rbd uses the header object of
+ * an image).
+ *
+ * Watchers register an interest in an object, and receive all
+ * notifies on that object. A notify attempts to communicate with all
+ * clients watching an object, and blocks on the notifier until each
+ * client responds or a timeout is reached.
+ *
+ * See rados_watch() and rados_notify() for more details.
+ *
+ * @{
+ */
+
+/**
+ * @typedef rados_watchcb_t
+ *
+ * Callback activated when a notify is received on a watched
+ * object.
+ *
+ * @param opcode undefined
+ * @param ver version of the watched object
+ * @param arg application-specific data
+ *
+ * @note BUG: opcode is an internal detail that shouldn't be exposed
+ * @note BUG: ver is unused
+ */
+typedef void (*rados_watchcb_t)(uint8_t opcode, uint64_t ver, void *arg);
+
+/**
+ * @typedef rados_watchcb2_t
+ *
+ * Callback activated when a notify is received on a watched
+ * object.
+ *
+ * @param arg opaque user-defined value provided to rados_watch2()
+ * @param notify_id an id for this notify event
+ * @param handle the watcher handle we are notifying
+ * @param notifier_id the unique client id for the notifier
+ * @param data payload from the notifier
+ * @param data_len length of payload buffer
+ */
+typedef void (*rados_watchcb2_t)(void *arg,
+ uint64_t notify_id,
+ uint64_t handle,
+ uint64_t notifier_id,
+ void *data,
+ size_t data_len);
+
+/**
+ * @typedef rados_watcherrcb_t
+ *
+ * Callback activated when we encounter an error with the watch session.
+ * This can happen when the location of the objects moves within the
+ * cluster and we fail to register our watch with the new object location,
+ * or when our connection with the object OSD is otherwise interrupted and
+ * we may have missed notify events.
+ *
+ * @param pre opaque user-defined value provided to rados_watch2()
+ * @param cookie the internal id assigned to the watch session
+ * @param err error code
+ */
+ typedef void (*rados_watcherrcb_t)(void *pre, uint64_t cookie, int err);
+
+/**
+ * Register an interest in an object
+ *
+ * A watch operation registers the client as being interested in
+ * notifications on an object. OSDs keep track of watches on
+ * persistent storage, so they are preserved across cluster changes by
+ * the normal recovery process. If the client loses its connection to
+ * the primary OSD for a watched object, the watch will be removed
+ * after 30 seconds. Watches are automatically reestablished when a new
+ * connection is made, or a placement group switches OSDs.
+ *
+ * @note BUG: librados should provide a way for watchers to notice connection resets
+ * @note BUG: the ver parameter does not work, and -ERANGE will never be returned
+ * (See URL tracker.ceph.com/issues/2592)
+ *
+ * @param io the pool the object is in
+ * @param o the object to watch
+ * @param ver expected version of the object
+ * @param cookie where to store the internal id assigned to this watch
+ * @param watchcb what to do when a notify is received on this object
+ * @param arg application defined data to pass when watchcb is called
+ * @returns 0 on success, negative error code on failure
+ * @returns -ERANGE if the version of the object is greater than ver
+ */
+CEPH_RADOS_API int rados_watch(rados_ioctx_t io, const char *o, uint64_t ver,
+ uint64_t *cookie,
+ rados_watchcb_t watchcb, void *arg)
+ __attribute__((deprecated));
+
+
+/**
+ * Register an interest in an object
+ *
+ * A watch operation registers the client as being interested in
+ * notifications on an object. OSDs keep track of watches on
+ * persistent storage, so they are preserved across cluster changes by
+ * the normal recovery process. If the client loses its connection to the
+ * primary OSD for a watched object, the watch will be removed after
+ * a timeout configured with osd_client_watch_timeout.
+ * Watches are automatically reestablished when a new
+ * connection is made, or a placement group switches OSDs.
+ *
+ * @param io the pool the object is in
+ * @param o the object to watch
+ * @param cookie where to store the internal id assigned to this watch
+ * @param watchcb what to do when a notify is received on this object
+ * @param watcherrcb what to do when the watch session encounters an error
+ * @param arg opaque value to pass to the callback
+ * @returns 0 on success, negative error code on failure
+ */
+CEPH_RADOS_API int rados_watch2(rados_ioctx_t io, const char *o, uint64_t *cookie,
+ rados_watchcb2_t watchcb,
+ rados_watcherrcb_t watcherrcb,
+ void *arg);
+
+/**
+ * Register an interest in an object
+ *
+ * A watch operation registers the client as being interested in
+ * notifications on an object. OSDs keep track of watches on
+ * persistent storage, so they are preserved across cluster changes by
+ * the normal recovery process. Watches are automatically reestablished when a new
+ * connection is made, or a placement group switches OSDs.
+ *
+ * @param io the pool the object is in
+ * @param o the object to watch
+ * @param cookie where to store the internal id assigned to this watch
+ * @param watchcb what to do when a notify is received on this object
+ * @param watcherrcb what to do when the watch session encounters an error
+ * @param timeout how many seconds the watch will be kept after disconnection
+ * @param arg opaque value to pass to the callback
+ * @returns 0 on success, negative error code on failure
+ */
+CEPH_RADOS_API int rados_watch3(rados_ioctx_t io, const char *o, uint64_t *cookie,
+ rados_watchcb2_t watchcb,
+ rados_watcherrcb_t watcherrcb,
+ uint32_t timeout,
+ void *arg);
+
+/**
+ * Asynchronously register an interest in an object
+ *
+ * A watch operation registers the client as being interested in
+ * notifications on an object. OSDs keep track of watches on
+ * persistent storage, so they are preserved across cluster changes by
+ * the normal recovery process. If the client loses its connection to
+ * the primary OSD for a watched object, the watch will be removed
+ * after 30 seconds. Watches are automatically reestablished when a new
+ * connection is made, or a placement group switches OSDs.
+ *
+ * @param io the pool the object is in
+ * @param o the object to watch
+ * @param completion what to do when operation has been attempted
+ * @param handle where to store the internal id assigned to this watch
+ * @param watchcb what to do when a notify is received on this object
+ * @param watcherrcb what to do when the watch session encounters an error
+ * @param arg opaque value to pass to the callback
+ * @returns 0 on success, negative error code on failure
+ */
+CEPH_RADOS_API int rados_aio_watch(rados_ioctx_t io, const char *o,
+ rados_completion_t completion, uint64_t *handle,
+ rados_watchcb2_t watchcb,
+ rados_watcherrcb_t watcherrcb,
+ void *arg);
+
+/**
+ * Asynchronously register an interest in an object
+ *
+ * A watch operation registers the client as being interested in
+ * notifications on an object. OSDs keep track of watches on
+ * persistent storage, so they are preserved across cluster changes by
+ * the normal recovery process. If the client loses its connection to
+ * the primary OSD for a watched object, the watch will be removed
+ * after the number of seconds configured in the timeout parameter.
+ * Watches are automatically reestablished when a new
+ * connection is made, or a placement group switches OSDs.
+ *
+ * @param io the pool the object is in
+ * @param o the object to watch
+ * @param completion what to do when operation has been attempted
+ * @param handle where to store the internal id assigned to this watch
+ * @param watchcb what to do when a notify is received on this object
+ * @param watcherrcb what to do when the watch session encounters an error
+ * @param timeout how many seconds the watch will be kept after disconnection
+ * @param arg opaque value to pass to the callback
+ * @returns 0 on success, negative error code on failure
+ */
+CEPH_RADOS_API int rados_aio_watch2(rados_ioctx_t io, const char *o,
+ rados_completion_t completion, uint64_t *handle,
+ rados_watchcb2_t watchcb,
+ rados_watcherrcb_t watcherrcb,
+ uint32_t timeout,
+ void *arg);
+
+/**
+ * Check on the status of a watch
+ *
+ * Return the number of milliseconds since the watch was last confirmed.
+ * Or, if there has been an error, return that.
+ *
+ * If there is an error, the watch is no longer valid, and should be
+ * destroyed with rados_unwatch2(). If the user is still interested
+ * in the object, a new watch should be created with rados_watch2().
+ *
+ * @param io the pool the object is in
+ * @param cookie the watch handle
+ * @returns ms since last confirmed on success, negative error code on failure
+ */
+CEPH_RADOS_API int rados_watch_check(rados_ioctx_t io, uint64_t cookie);
+
+/**
+ * Unregister an interest in an object
+ *
+ * Once this completes, no more notifies will be sent to us for this
+ * watch. This should be called to clean up unneeded watchers.
+ *
+ * @param io the pool the object is in
+ * @param o the name of the watched object (ignored)
+ * @param cookie which watch to unregister
+ * @returns 0 on success, negative error code on failure
+ */
+CEPH_RADOS_API int rados_unwatch(rados_ioctx_t io, const char *o, uint64_t cookie)
+ __attribute__((deprecated));
+
+/**
+ * Unregister an interest in an object
+ *
+ * Once this completes, no more notifies will be sent to us for this
+ * watch. This should be called to clean up unneeded watchers.
+ *
+ * @param io the pool the object is in
+ * @param cookie which watch to unregister
+ * @returns 0 on success, negative error code on failure
+ */
+CEPH_RADOS_API int rados_unwatch2(rados_ioctx_t io, uint64_t cookie);
+
+/**
+ * Asynchronously unregister an interest in an object
+ *
+ * Once this completes, no more notifies will be sent to us for this
+ * watch. This should be called to clean up unneeded watchers.
+ *
+ * @param io the pool the object is in
+ * @param completion what to do when operation has been attempted
+ * @param cookie which watch to unregister
+ * @returns 0 on success, negative error code on failure
+ */
+CEPH_RADOS_API int rados_aio_unwatch(rados_ioctx_t io, uint64_t cookie,
+ rados_completion_t completion);
+
+/**
+ * Synchronously notify watchers of an object
+ *
+ * This blocks until all watchers of the object have received and
+ * reacted to the notify, or a timeout is reached.
+ *
+ * @note BUG: the timeout is not changeable via the C API
+ * @note BUG: the bufferlist is inaccessible in a rados_watchcb_t
+ *
+ * @param io the pool the object is in
+ * @param o the name of the object
+ * @param ver obsolete - just pass zero
+ * @param buf data to send to watchers
+ * @param buf_len length of buf in bytes
+ * @returns 0 on success, negative error code on failure
+ */
+CEPH_RADOS_API int rados_notify(rados_ioctx_t io, const char *o, uint64_t ver,
+ const char *buf, int buf_len)
+ __attribute__((deprecated));
+
+/**
+ * Synchronously notify watchers of an object
+ *
+ * This blocks until all watchers of the object have received and
+ * reacted to the notify, or a timeout is reached.
+ *
+ * The reply buffer is optional. If specified, the client will get
+ * back an encoded buffer that includes the ids of the clients that
+ * acknowledged the notify as well as their notify ack payloads (if
+ * any). Clients that timed out are not included. Even clients that
+ * do not include a notify ack payload are included in the list but
+ * have a 0-length payload associated with them. The format:
+ *
+ * le32 num_acks
+ * {
+ * le64 gid global id for the client (for client.1234 that's 1234)
+ * le64 cookie cookie for the client
+ * le32 buflen length of reply message buffer
+ * u8 * buflen payload
+ * } * num_acks
+ * le32 num_timeouts
+ * {
+ * le64 gid global id for the client
+ * le64 cookie cookie for the client
+ * } * num_timeouts
+ *
+ * Note: There may be multiple instances of the same gid if there are
+ * multiple watchers registered via the same client.
+ *
+ * Note: The buffer must be released with rados_buffer_free() when the
+ * user is done with it.
+ *
+ * Note: Since the result buffer includes clients that time out, it
+ * will be set even when rados_notify2() returns an error code (like
+ * -ETIMEDOUT).
+ *
+ * @param io the pool the object is in
+ * @param completion what to do when operation has been attempted
+ * @param o the name of the object
+ * @param buf data to send to watchers
+ * @param buf_len length of buf in bytes
+ * @param timeout_ms notify timeout (in ms)
+ * @param reply_buffer pointer to reply buffer pointer (free with rados_buffer_free)
+ * @param reply_buffer_len pointer to size of reply buffer
+ * @returns 0 on success, negative error code on failure
+ */
+CEPH_RADOS_API int rados_aio_notify(rados_ioctx_t io, const char *o,
+ rados_completion_t completion,
+ const char *buf, int buf_len,
+ uint64_t timeout_ms, char **reply_buffer,
+ size_t *reply_buffer_len);
+CEPH_RADOS_API int rados_notify2(rados_ioctx_t io, const char *o,
+ const char *buf, int buf_len,
+ uint64_t timeout_ms,
+ char **reply_buffer, size_t *reply_buffer_len);
+
+/**
+ * Decode a notify response
+ *
+ * Decode a notify response (from rados_aio_notify() call) into acks and
+ * timeout arrays.
+ *
+ * @param reply_buffer buffer from rados_aio_notify() call
+ * @param reply_buffer_len reply_buffer length
+ * @param acks pointer to struct notify_ack_t pointer
+ * @param nr_acks pointer to ack count
+ * @param timeouts pointer to notify_timeout_t pointer
+ * @param nr_timeouts pointer to timeout count
+ * @returns 0 on success
+ */
+CEPH_RADOS_API int rados_decode_notify_response(char *reply_buffer, size_t reply_buffer_len,
+ struct notify_ack_t **acks, size_t *nr_acks,
+ struct notify_timeout_t **timeouts, size_t *nr_timeouts);
+
+/**
+ * Free notify allocated buffer
+ *
+ * Release memory allocated by rados_decode_notify_response() call
+ *
+ * @param acks notify_ack_t struct (from rados_decode_notify_response())
+ * @param nr_acks ack count
+ * @param timeouts notify_timeout_t struct (from rados_decode_notify_response())
+ */
+CEPH_RADOS_API void rados_free_notify_response(struct notify_ack_t *acks, size_t nr_acks,
+ struct notify_timeout_t *timeouts);
+
+/**
+ * Acknowledge receipt of a notify
+ *
+ * @param io the pool the object is in
+ * @param o the name of the object
+ * @param notify_id the notify_id we got on the watchcb2_t callback
+ * @param cookie the watcher handle
+ * @param buf payload to return to notifier (optional)
+ * @param buf_len payload length
+ * @returns 0 on success
+ */
+CEPH_RADOS_API int rados_notify_ack(rados_ioctx_t io, const char *o,
+ uint64_t notify_id, uint64_t cookie,
+ const char *buf, int buf_len);
+
+/**
+ * Flush watch/notify callbacks
+ *
+ * This call will block until all pending watch/notify callbacks have
+ * been executed and the queue is empty. It should usually be called
+ * after shutting down any watches before shutting down the ioctx or
+ * librados to ensure that any callbacks do not misuse the ioctx (for
+ * example by calling rados_notify_ack after the ioctx has been
+ * destroyed).
+ *
+ * @param cluster the cluster handle
+ */
+CEPH_RADOS_API int rados_watch_flush(rados_t cluster);
+/**
+ * Flush watch/notify callbacks
+ *
+ * This call is non-blocking; the completion will be called
+ * once all pending watch/notify callbacks have been executed and
+ * the queue is empty. It should usually be called after shutting
+ * down any watches before shutting down the ioctx or
+ * librados to ensure that any callbacks do not misuse the ioctx (for
+ * example by calling rados_notify_ack after the ioctx has been
+ * destroyed).
+ *
+ * @param cluster the cluster handle
+ * @param completion what to do when operation has been attempted
+ */
+CEPH_RADOS_API int rados_aio_watch_flush(rados_t cluster, rados_completion_t completion);
+
+/** @} Watch/Notify */
+
+/**
+ * Pin an object in the cache tier
+ *
+ * When an object is pinned in the cache tier, it stays in the cache
+ * tier, and won't be flushed out.
+ *
+ * @param io the pool the object is in
+ * @param o the object id
+ * @returns 0 on success, negative error code on failure
+ */
+CEPH_RADOS_API int rados_cache_pin(rados_ioctx_t io, const char *o);
+
+/**
+ * Unpin an object in the cache tier
+ *
+ * After an object is unpinned in the cache tier, it can be flushed out
+ *
+ * @param io the pool the object is in
+ * @param o the object id
+ * @returns 0 on success, negative error code on failure
+ */
+CEPH_RADOS_API int rados_cache_unpin(rados_ioctx_t io, const char *o);
+
+/**
+ * @name Hints
+ *
+ * @{
+ */
+
+/**
+ * Set allocation hint for an object
+ *
+ * This is an advisory operation, it will always succeed (as if it was
+ * submitted with a LIBRADOS_OP_FLAG_FAILOK flag set) and is not
+ * guaranteed to do anything on the backend.
+ *
+ * @param io the pool the object is in
+ * @param o the name of the object
+ * @param expected_object_size expected size of the object, in bytes
+ * @param expected_write_size expected size of writes to the object, in bytes
+ * @returns 0 on success, negative error code on failure
+ */
+CEPH_RADOS_API int rados_set_alloc_hint(rados_ioctx_t io, const char *o,
+ uint64_t expected_object_size,
+ uint64_t expected_write_size);
+
+/**
+ * Set allocation hint for an object
+ *
+ * This is an advisory operation, it will always succeed (as if it was
+ * submitted with a LIBRADOS_OP_FLAG_FAILOK flag set) and is not
+ * guaranteed to do anything on the backend.
+ *
+ * @param io the pool the object is in
+ * @param o the name of the object
+ * @param expected_object_size expected size of the object, in bytes
+ * @param expected_write_size expected size of writes to the object, in bytes
+ * @param flags hints about future IO patterns
+ * @returns 0 on success, negative error code on failure
+ */
+CEPH_RADOS_API int rados_set_alloc_hint2(rados_ioctx_t io, const char *o,
+ uint64_t expected_object_size,
+ uint64_t expected_write_size,
+ uint32_t flags);
+
+/** @} Hints */
+
+/**
+ * @name Object Operations
+ *
+ * A single rados operation can do multiple operations on one object
+ * atomically. The whole operation will succeed or fail, and no partial
+ * results will be visible.
+ *
+ * Operations may be either reads, which can return data, or writes,
+ * which cannot. The effects of writes are applied and visible all at
+ * once, so an operation that sets an xattr and then checks its value
+ * will not see the updated value.
+ *
+ * @{
+ */
+
+/**
+ * Create a new rados_write_op_t write operation. This will store all actions
+ * to be performed atomically. You must call rados_release_write_op when you are
+ * finished with it.
+ *
+ * @note the ownership of a write operation is passed to the function
+ * performing the operation, so the same instance of @c rados_write_op_t
+ * cannot be used again after being performed.
+ *
+ * @returns non-NULL on success, NULL on memory allocation error.
+ */
+CEPH_RADOS_API rados_write_op_t rados_create_write_op(void);
+
+/**
+ * Free a rados_write_op_t, must be called when you're done with it.
+ * @param write_op operation to deallocate, created with rados_create_write_op
+ */
+CEPH_RADOS_API void rados_release_write_op(rados_write_op_t write_op);
+
+/**
+ * Set flags for the last operation added to this write_op.
+ * At least one op must have been added to the write_op.
+ * @param write_op operation to add this action to
+ * @param flags see librados.h constants beginning with LIBRADOS_OP_FLAG
+ */
+CEPH_RADOS_API void rados_write_op_set_flags(rados_write_op_t write_op,
+ int flags);
+
+/**
+ * Ensure that the object exists before writing
+ * @param write_op operation to add this action to
+ */
+CEPH_RADOS_API void rados_write_op_assert_exists(rados_write_op_t write_op);
+
+/**
+ * Ensure that the object exists and that its internal version
+ * number is equal to "ver" before writing. "ver" should be a
+ * version number previously obtained with rados_get_last_version().
+ * - If the object's version is greater than the asserted version
+ * then rados_write_op_operate will return -ERANGE instead of
+ * executing the op.
+ * - If the object's version is less than the asserted version
+ * then rados_write_op_operate will return -EOVERFLOW instead
+ * of executing the op.
+ * @param write_op operation to add this action to
+ * @param ver object version number
+ */
+CEPH_RADOS_API void rados_write_op_assert_version(rados_write_op_t write_op, uint64_t ver);
+
+/**
+ * Ensure that given object range (extent) satisfies comparison.
+ *
+ * @param write_op operation to add this action to
+ * @param cmp_buf buffer containing bytes to be compared with object contents
+ * @param cmp_len length to compare and size of @c cmp_buf in bytes
+ * @param off object byte offset at which to start the comparison
+ * @param prval returned result of comparison, 0 on success, negative error code
+ * on failure, (-MAX_ERRNO - mismatch_off) on mismatch
+ */
+CEPH_RADOS_API void rados_write_op_cmpext(rados_write_op_t write_op,
+ const char *cmp_buf,
+ size_t cmp_len,
+ uint64_t off,
+ int *prval);
+
+/**
+ * Ensure that given xattr satisfies comparison.
+ * If the comparison is not satisfied, the return code of the
+ * operation will be -ECANCELED
+ * @param write_op operation to add this action to
+ * @param name name of the xattr to look up
+ * @param comparison_operator currently undocumented, look for
+ * LIBRADOS_CMPXATTR_OP_EQ in librados.h
+ * @param value buffer to compare actual xattr value to
+ * @param value_len length of buffer to compare actual xattr value to
+ */
+CEPH_RADOS_API void rados_write_op_cmpxattr(rados_write_op_t write_op,
+ const char *name,
+ uint8_t comparison_operator,
+ const char *value,
+ size_t value_len);
+
+/**
+ * Ensure that an omap value satisfies a comparison,
+ * with the supplied value on the right hand side (i.e.
+ * for OP_LT, the comparison is actual_value < value).
+ *
+ * @param write_op operation to add this action to
+ * @param key which omap value to compare
+ * @param comparison_operator one of LIBRADOS_CMPXATTR_OP_EQ,
+ * LIBRADOS_CMPXATTR_OP_LT, or LIBRADOS_CMPXATTR_OP_GT
+ * @param val value to compare with
+ * @param val_len length of value in bytes
+ * @param prval where to store the return value from this action
+ */
+CEPH_RADOS_API void rados_write_op_omap_cmp(rados_write_op_t write_op,
+ const char *key,
+ uint8_t comparison_operator,
+ const char *val,
+ size_t val_len,
+ int *prval);
+
+/**
+ * Ensure that an omap value satisfies a comparison,
+ * with the supplied value on the right hand side (i.e.
+ * for OP_LT, the comparison is actual_value < value).
+ *
+ * @param write_op operation to add this action to
+ * @param key which omap value to compare
+ * @param comparison_operator one of LIBRADOS_CMPXATTR_OP_EQ,
+ * LIBRADOS_CMPXATTR_OP_LT, or LIBRADOS_CMPXATTR_OP_GT
+ * @param val value to compare with
+ * @param key_len length of key in bytes
+ * @param val_len length of value in bytes
+ * @param prval where to store the return value from this action
+ */
+CEPH_RADOS_API void rados_write_op_omap_cmp2(rados_write_op_t write_op,
+ const char *key,
+ uint8_t comparison_operator,
+ const char *val,
+ size_t key_len,
+ size_t val_len,
+ int *prval);
+
+/**
+ * Set an xattr
+ * @param write_op operation to add this action to
+ * @param name name of the xattr
+ * @param value buffer to set xattr to
+ * @param value_len length of buffer to set xattr to
+ */
+CEPH_RADOS_API void rados_write_op_setxattr(rados_write_op_t write_op,
+ const char *name,
+ const char *value,
+ size_t value_len);
+
+/**
+ * Remove an xattr
+ * @param write_op operation to add this action to
+ * @param name name of the xattr to remove
+ */
+CEPH_RADOS_API void rados_write_op_rmxattr(rados_write_op_t write_op,
+ const char *name);
+
+/**
+ * Create the object
+ * @param write_op operation to add this action to
+ * @param exclusive set to either LIBRADOS_CREATE_EXCLUSIVE or
+ * LIBRADOS_CREATE_IDEMPOTENT; with LIBRADOS_CREATE_EXCLUSIVE the
+ * create will error if the object already exists.
+ * @param category category string (DEPRECATED, HAS NO EFFECT)
+ */
+CEPH_RADOS_API void rados_write_op_create(rados_write_op_t write_op,
+ int exclusive,
+ const char* category);
+
+/**
+ * Write to offset
+ * @param write_op operation to add this action to
+ * @param offset offset to write to
+ * @param buffer bytes to write
+ * @param len length of buffer
+ */
+CEPH_RADOS_API void rados_write_op_write(rados_write_op_t write_op,
+ const char *buffer,
+ size_t len,
+ uint64_t offset);
+
+/**
+ * Write whole object, atomically replacing it.
+ * @param write_op operation to add this action to
+ * @param buffer bytes to write
+ * @param len length of buffer
+ */
+CEPH_RADOS_API void rados_write_op_write_full(rados_write_op_t write_op,
+ const char *buffer,
+ size_t len);
+
+/**
+ * Write the same buffer multiple times
+ * @param write_op operation to add this action to
+ * @param buffer bytes to write
+ * @param data_len length of buffer
+ * @param write_len total number of bytes to write, as a multiple of @c data_len
+ * @param offset offset to write to
+ */
+CEPH_RADOS_API void rados_write_op_writesame(rados_write_op_t write_op,
+ const char *buffer,
+ size_t data_len,
+ size_t write_len,
+ uint64_t offset);
+
+/**
+ * Append to end of object.
+ * @param write_op operation to add this action to
+ * @param buffer bytes to write
+ * @param len length of buffer
+ */
+CEPH_RADOS_API void rados_write_op_append(rados_write_op_t write_op,
+ const char *buffer,
+ size_t len);
+/**
+ * Remove object
+ * @param write_op operation to add this action to
+ */
+CEPH_RADOS_API void rados_write_op_remove(rados_write_op_t write_op);
+
+/**
+ * Truncate an object
+ * @param write_op operation to add this action to
+ * @param offset Offset to truncate to
+ */
+CEPH_RADOS_API void rados_write_op_truncate(rados_write_op_t write_op,
+ uint64_t offset);
+
+/**
+ * Zero part of an object
+ * @param write_op operation to add this action to
+ * @param offset Offset to zero
+ * @param len length to zero
+ */
+CEPH_RADOS_API void rados_write_op_zero(rados_write_op_t write_op,
+ uint64_t offset,
+ uint64_t len);
+
+/**
+ * Execute an OSD class method on an object
+ * See rados_exec() for general description.
+ *
+ * @param write_op operation to add this action to
+ * @param cls the name of the class
+ * @param method the name of the method
+ * @param in_buf where to find input
+ * @param in_len length of in_buf in bytes
+ * @param prval where to store the return value from the method
+ */
+CEPH_RADOS_API void rados_write_op_exec(rados_write_op_t write_op,
+ const char *cls,
+ const char *method,
+ const char *in_buf,
+ size_t in_len,
+ int *prval);
+
+/**
+ * Set key/value pairs on an object
+ *
+ * @param write_op operation to add this action to
+ * @param keys array of null-terminated char arrays representing keys to set
+ * @param vals array of pointers to values to set
+ * @param lens array of lengths corresponding to each value
+ * @param num number of key/value pairs to set
+ */
+CEPH_RADOS_API void rados_write_op_omap_set(rados_write_op_t write_op,
+ char const* const* keys,
+ char const* const* vals,
+ const size_t *lens,
+ size_t num);
+
+/**
+ * Set key/value pairs on an object
+ *
+ * @param write_op operation to add this action to
+ * @param keys array of null-terminated char arrays representing keys to set
+ * @param vals array of pointers to values to set
+ * @param key_lens array of lengths corresponding to each key
+ * @param val_lens array of lengths corresponding to each value
+ * @param num number of key/value pairs to set
+ */
+CEPH_RADOS_API void rados_write_op_omap_set2(rados_write_op_t write_op,
+ char const* const* keys,
+ char const* const* vals,
+ const size_t *key_lens,
+ const size_t *val_lens,
+ size_t num);
+
+/**
+ * Remove key/value pairs from an object
+ *
+ * @param write_op operation to add this action to
+ * @param keys array of null-terminated char arrays representing keys to remove
+ * @param keys_len number of key/value pairs to remove
+ */
+CEPH_RADOS_API void rados_write_op_omap_rm_keys(rados_write_op_t write_op,
+ char const* const* keys,
+ size_t keys_len);
+
+/**
+ * Remove key/value pairs from an object
+ *
+ * @param write_op operation to add this action to
+ * @param keys array of char arrays representing keys to remove
+ * @param key_lens array of size_t values representing length of each key
+ * @param keys_len number of key/value pairs to remove
+ */
+CEPH_RADOS_API void rados_write_op_omap_rm_keys2(rados_write_op_t write_op,
+ char const* const* keys,
+ const size_t* key_lens,
+ size_t keys_len);
+
+
+/**
+ * Remove key/value pairs from an object whose keys are in the range
+ * [key_begin, key_end)
+ *
+ * @param write_op operation to add this action to
+ * @param key_begin the lower bound of the key range to remove
+ * @param key_begin_len length of key_begin
+ * @param key_end the upper bound of the key range to remove
+ * @param key_end_len length of key_end
+ */
+CEPH_RADOS_API void rados_write_op_omap_rm_range2(rados_write_op_t write_op,
+ const char *key_begin,
+ size_t key_begin_len,
+ const char *key_end,
+ size_t key_end_len);
+
+/**
+ * Remove all key/value pairs from an object
+ *
+ * @param write_op operation to add this action to
+ */
+CEPH_RADOS_API void rados_write_op_omap_clear(rados_write_op_t write_op);
+
+/**
+ * Set allocation hint for an object
+ *
+ * @param write_op operation to add this action to
+ * @param expected_object_size expected size of the object, in bytes
+ * @param expected_write_size expected size of writes to the object, in bytes
+ */
+CEPH_RADOS_API void rados_write_op_set_alloc_hint(rados_write_op_t write_op,
+ uint64_t expected_object_size,
+ uint64_t expected_write_size);
+
+/**
+ * Set allocation hint for an object
+ *
+ * @param write_op operation to add this action to
+ * @param expected_object_size expected size of the object, in bytes
+ * @param expected_write_size expected size of writes to the object, in bytes
+ * @param flags hints about future IO patterns
+ */
+CEPH_RADOS_API void rados_write_op_set_alloc_hint2(rados_write_op_t write_op,
+ uint64_t expected_object_size,
+ uint64_t expected_write_size,
+ uint32_t flags);
+
+/**
+ * Perform a write operation synchronously
+ * @param write_op operation to perform
+ * @param io the ioctx that the object is in
+ * @param oid the object id
+ * @param mtime the time to set the mtime to, NULL for the current time
+ * @param flags flags to apply to the entire operation (LIBRADOS_OPERATION_*)
+ */
+CEPH_RADOS_API int rados_write_op_operate(rados_write_op_t write_op,
+ rados_ioctx_t io,
+ const char *oid,
+ time_t *mtime,
+ int flags);
+/**
+ * Perform a write operation synchronously
+ *
+ * This variant takes mtime as a struct timespec, where
+ * rados_write_op_operate() takes a time_t.
+ *
+ * @param write_op operation to perform
+ * @param io the ioctx that the object is in
+ * @param oid the object id
+ * @param mtime the time to set the mtime to (as a struct timespec),
+ * NULL for the current time
+ * @param flags flags to apply to the entire operation (LIBRADOS_OPERATION_*)
+ */
+
+CEPH_RADOS_API int rados_write_op_operate2(rados_write_op_t write_op,
+ rados_ioctx_t io,
+ const char *oid,
+ struct timespec *mtime,
+ int flags);
+
+/**
+ * Perform a write operation asynchronously
+ * @param write_op operation to perform
+ * @param io the ioctx that the object is in
+ * @param completion what to do when operation has been attempted
+ * @param oid the object id
+ * @param mtime the time to set the mtime to, NULL for the current time
+ * @param flags flags to apply to the entire operation (LIBRADOS_OPERATION_*)
+ */
+CEPH_RADOS_API int rados_aio_write_op_operate(rados_write_op_t write_op,
+ rados_ioctx_t io,
+ rados_completion_t completion,
+ const char *oid,
+ time_t *mtime,
+ int flags);
+
+/**
+ * Perform a write operation asynchronously
+ *
+ * This variant takes mtime as a struct timespec, where
+ * rados_aio_write_op_operate() takes a time_t.
+ *
+ * @param write_op operation to perform
+ * @param io the ioctx that the object is in
+ * @param completion what to do when operation has been attempted
+ * @param oid the object id
+ * @param mtime the time to set the mtime to (as a struct timespec),
+ * NULL for the current time
+ * @param flags flags to apply to the entire operation (LIBRADOS_OPERATION_*)
+ */
+CEPH_RADOS_API int rados_aio_write_op_operate2(rados_write_op_t write_op,
+ rados_ioctx_t io,
+ rados_completion_t completion,
+ const char *oid,
+ struct timespec *mtime,
+ int flags);
+
+/**
+ * Create a new rados_read_op_t read operation. This will store all
+ * actions to be performed atomically. You must call
+ * rados_release_read_op when you are finished with it (after it
+ * completes, or you decide not to send it in the first place).
+ *
+ * @note the ownership of a read operation is passed to the function
+ * performing the operation, so the same instance of @c rados_read_op_t
+ * cannot be used again after being performed.
+ *
+ * @returns non-NULL on success, NULL on memory allocation error.
+ */
+CEPH_RADOS_API rados_read_op_t rados_create_read_op(void);
+
+/**
+ * Free a rados_read_op_t, must be called when you're done with it.
+ * @param read_op operation to deallocate, created with rados_create_read_op
+ */
+CEPH_RADOS_API void rados_release_read_op(rados_read_op_t read_op);
+
+/**
+ * Set flags for the last operation added to this read_op.
+ * At least one op must have been added to the read_op.
+ * @param read_op operation to add this action to
+ * @param flags see librados.h constants beginning with LIBRADOS_OP_FLAG
+ */
+CEPH_RADOS_API void rados_read_op_set_flags(rados_read_op_t read_op, int flags);
+
+/**
+ * Ensure that the object exists before reading
+ * @param read_op operation to add this action to
+ */
+CEPH_RADOS_API void rados_read_op_assert_exists(rados_read_op_t read_op);
+
+/**
+ * Ensure that the object exists and that its internal version
+ * number is equal to "ver" before reading. "ver" should be a
+ * version number previously obtained with rados_get_last_version().
+ * - If the object's version is greater than the asserted version
+ * then rados_read_op_operate will return -ERANGE instead of
+ * executing the op.
+ * - If the object's version is less than the asserted version
+ * then rados_read_op_operate will return -EOVERFLOW instead
+ * of executing the op.
+ * @param read_op operation to add this action to
+ * @param ver object version number
+ */
+CEPH_RADOS_API void rados_read_op_assert_version(rados_read_op_t read_op, uint64_t ver);
+
+/**
+ * Ensure that given object range (extent) satisfies comparison.
+ *
+ * @param read_op operation to add this action to
+ * @param cmp_buf buffer containing bytes to be compared with object contents
+ * @param cmp_len length to compare and size of @c cmp_buf in bytes
+ * @param off object byte offset at which to start the comparison
+ * @param prval returned result of comparison, 0 on success, negative error code
+ * on failure, (-MAX_ERRNO - mismatch_off) on mismatch
+ */
+CEPH_RADOS_API void rados_read_op_cmpext(rados_read_op_t read_op,
+ const char *cmp_buf,
+ size_t cmp_len,
+ uint64_t off,
+ int *prval);
+
+/**
+ * Ensure that an xattr satisfies a comparison.
+ * If the comparison is not satisfied, the return code of the
+ * operation will be -ECANCELED
+ * @param read_op operation to add this action to
+ * @param name name of the xattr to look up
+ * @param comparison_operator currently undocumented, look for
+ * LIBRADOS_CMPXATTR_OP_EQ in librados.h
+ * @param value buffer to compare actual xattr value to
+ * @param value_len length of buffer to compare actual xattr value to
+ */
+CEPH_RADOS_API void rados_read_op_cmpxattr(rados_read_op_t read_op,
+ const char *name,
+ uint8_t comparison_operator,
+ const char *value,
+ size_t value_len);
+
+/**
+ * Start iterating over xattrs on an object.
+ *
+ * @param read_op operation to add this action to
+ * @param iter where to store the iterator
+ * @param prval where to store the return value of this action
+ */
+CEPH_RADOS_API void rados_read_op_getxattrs(rados_read_op_t read_op,
+ rados_xattrs_iter_t *iter,
+ int *prval);
+
+/**
+ * Ensure that an omap value satisfies a comparison,
+ * with the supplied value on the right hand side (i.e.
+ * for OP_LT, the comparison is actual_value < value).
+ *
+ * @param read_op operation to add this action to
+ * @param key which omap value to compare
+ * @param comparison_operator one of LIBRADOS_CMPXATTR_OP_EQ,
+ * LIBRADOS_CMPXATTR_OP_LT, or LIBRADOS_CMPXATTR_OP_GT
+ * @param val value to compare with
+ * @param val_len length of value in bytes
+ * @param prval where to store the return value from this action
+ */
+CEPH_RADOS_API void rados_read_op_omap_cmp(rados_read_op_t read_op,
+ const char *key,
+ uint8_t comparison_operator,
+ const char *val,
+ size_t val_len,
+ int *prval);
+
+/**
+ * Ensure that an omap value satisfies a comparison,
+ * with the supplied value on the right hand side (i.e.
+ * for OP_LT, the comparison is actual_value < value).
+ *
+ * @param read_op operation to add this action to
+ * @param key which omap value to compare
+ * @param comparison_operator one of LIBRADOS_CMPXATTR_OP_EQ,
+ * LIBRADOS_CMPXATTR_OP_LT, or LIBRADOS_CMPXATTR_OP_GT
+ * @param val value to compare with
+ * @param key_len length of key in bytes
+ * @param val_len length of value in bytes
+ * @param prval where to store the return value from this action
+ */
+CEPH_RADOS_API void rados_read_op_omap_cmp2(rados_read_op_t read_op,
+ const char *key,
+ uint8_t comparison_operator,
+ const char *val,
+ size_t key_len,
+ size_t val_len,
+ int *prval);
+
+/**
+ * Get object size and mtime
+ * @param read_op operation to add this action to
+ * @param psize where to store object size
+ * @param pmtime where to store modification time
+ * @param prval where to store the return value of this action
+ */
+CEPH_RADOS_API void rados_read_op_stat(rados_read_op_t read_op,
+ uint64_t *psize,
+ time_t *pmtime,
+ int *prval);
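+
+/**
+ * Get object size and mtime, with the mtime stored as a struct timespec
+ * @param read_op operation to add this action to
+ * @param psize where to store object size
+ * @param pmtime where to store modification time
+ * @param prval where to store the return value of this action
+ */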
+CEPH_RADOS_API void rados_read_op_stat2(rados_read_op_t read_op,
+ uint64_t *psize,
+ struct timespec *pmtime,
+ int *prval);
+/**
+ * Read bytes from offset into buffer.
+ *
+ * bytes_read will be filled with the number of bytes read if successful.
+ * A short read can only occur if the read reaches the end of the
+ * object.
+ *
+ * @param read_op operation to add this action to
+ * @param offset offset to read from
+ * @param len length of buffer
+ * @param buffer where to put the data
+ * @param bytes_read where to store the number of bytes read by this action
+ * @param prval where to store the return value of this action
+ */
+CEPH_RADOS_API void rados_read_op_read(rados_read_op_t read_op,
+ uint64_t offset,
+ size_t len,
+ char *buffer,
+ size_t *bytes_read,
+ int *prval);
+
+/**
+ * Compute checksum from object data
+ *
+ * @param read_op operation to add this action to
+ * @param type the checksum algorithm to utilize
+ * @param init_value the init value for the algorithm
+ * @param init_value_len the length of the init value
+ * @param offset the offset to start checksumming in the object
+ * @param len the number of bytes to checksum
+ * @param chunk_size optional length-aligned chunk size for checksums
+ * @param pchecksum where to store the checksum result for this action
+ * @param checksum_len the number of bytes available for the result
+ * @param prval where to store the return value for this action
+ */
+CEPH_RADOS_API void rados_read_op_checksum(rados_read_op_t read_op,
+ rados_checksum_type_t type,
+ const char *init_value,
+ size_t init_value_len,
+ uint64_t offset, size_t len,
+ size_t chunk_size, char *pchecksum,
+ size_t checksum_len, int *prval);
+
+/**
+ * Execute an OSD class method on an object
+ * See rados_exec() for general description.
+ *
+ * The output buffer is allocated on the heap; the caller is
+ * expected to release that memory with rados_buffer_free(). The
+ * buffer and length pointers can all be NULL, in which case they are
+ * not filled in.
+ *
+ * @param read_op operation to add this action to
+ * @param cls the name of the class
+ * @param method the name of the method
+ * @param in_buf where to find input
+ * @param in_len length of in_buf in bytes
+ * @param out_buf where to put librados-allocated output buffer
+ * @param out_len where to store the length of out_buf in bytes
+ * @param prval where to store the return value from the method
+ */
+CEPH_RADOS_API void rados_read_op_exec(rados_read_op_t read_op,
+ const char *cls,
+ const char *method,
+ const char *in_buf,
+ size_t in_len,
+ char **out_buf,
+ size_t *out_len,
+ int *prval);
+
+/**
+ * Execute an OSD class method on an object
+ * See rados_exec() for general description.
+ *
+ * If the output buffer is too small, prval will
+ * be set to -ERANGE and used_len will be 0.
+ *
+ * @param read_op operation to add this action to
+ * @param cls the name of the class
+ * @param method the name of the method
+ * @param in_buf where to find input
+ * @param in_len length of in_buf in bytes
+ * @param out_buf user-provided buffer to read into
+ * @param out_len length of out_buf in bytes
+ * @param used_len where to store the number of bytes read into out_buf
+ * @param prval where to store the return value from the method
+ */
+CEPH_RADOS_API void rados_read_op_exec_user_buf(rados_read_op_t read_op,
+ const char *cls,
+ const char *method,
+ const char *in_buf,
+ size_t in_len,
+ char *out_buf,
+ size_t out_len,
+ size_t *used_len,
+ int *prval);
+
+/**
+ * Start iterating over key/value pairs on an object.
+ *
+ * They will be returned sorted by key.
+ *
+ * @param read_op operation to add this action to
+ * @param start_after list keys starting after start_after
+ * @param filter_prefix list only keys beginning with filter_prefix
+ * @param max_return list no more than max_return key/value pairs
+ * @param iter where to store the iterator
+ * @param prval where to store the return value from this action
+ */
+CEPH_RADOS_API void rados_read_op_omap_get_vals(rados_read_op_t read_op,
+ const char *start_after,
+ const char *filter_prefix,
+ uint64_t max_return,
+ rados_omap_iter_t *iter,
+ int *prval)
+ __attribute__((deprecated)); /* use v2 below */
+
+/**
+ * Start iterating over key/value pairs on an object.
+ *
+ * They will be returned sorted by key.
+ *
+ * @param read_op operation to add this action to
+ * @param start_after list keys starting after start_after
+ * @param filter_prefix list only keys beginning with filter_prefix
+ * @param max_return list no more than max_return key/value pairs
+ * @param iter where to store the iterator
+ * @param pmore flag indicating whether there are more keys to fetch
+ * @param prval where to store the return value from this action
+ */
+CEPH_RADOS_API void rados_read_op_omap_get_vals2(rados_read_op_t read_op,
+ const char *start_after,
+ const char *filter_prefix,
+ uint64_t max_return,
+ rados_omap_iter_t *iter,
+ unsigned char *pmore,
+ int *prval);
+
+/**
+ * Start iterating over keys on an object.
+ *
+ * They will be returned sorted by key, and the iterator
+ * will fill in NULL for all values if specified.
+ *
+ * @param read_op operation to add this action to
+ * @param start_after list keys starting after start_after
+ * @param max_return list no more than max_return keys
+ * @param iter where to store the iterator
+ * @param prval where to store the return value from this action
+ */
+CEPH_RADOS_API void rados_read_op_omap_get_keys(rados_read_op_t read_op,
+ const char *start_after,
+ uint64_t max_return,
+ rados_omap_iter_t *iter,
+ int *prval)
+ __attribute__((deprecated)); /* use v2 below */
+
+/**
+ * Start iterating over keys on an object.
+ *
+ * They will be returned sorted by key, and the iterator
+ * will fill in NULL for all values if specified.
+ *
+ * @param read_op operation to add this action to
+ * @param start_after list keys starting after start_after
+ * @param max_return list no more than max_return keys
+ * @param iter where to store the iterator
+ * @param pmore flag indicating whether there are more keys to fetch
+ * @param prval where to store the return value from this action
+ */
+CEPH_RADOS_API void rados_read_op_omap_get_keys2(rados_read_op_t read_op,
+ const char *start_after,
+ uint64_t max_return,
+ rados_omap_iter_t *iter,
+ unsigned char *pmore,
+ int *prval);
+
+/**
+ * Start iterating over specific key/value pairs
+ *
+ * They will be returned sorted by key.
+ *
+ * @param read_op operation to add this action to
+ * @param keys array of pointers to null-terminated keys to get
+ * @param keys_len the number of strings in keys
+ * @param iter where to store the iterator
+ * @param prval where to store the return value from this action
+ */
+CEPH_RADOS_API void rados_read_op_omap_get_vals_by_keys(rados_read_op_t read_op,
+ char const* const* keys,
+ size_t keys_len,
+ rados_omap_iter_t *iter,
+ int *prval);
+
+/**
+ * Start iterating over specific key/value pairs
+ *
+ * They will be returned sorted by key.
+ *
+ * @param read_op operation to add this action to
+ * @param keys array of pointers to keys to get
+ * @param num_keys the number of strings in keys
+ * @param key_lens array of size_t's describing each key len (in bytes)
+ * @param iter where to store the iterator
+ * @param prval where to store the return value from this action
+ */
+CEPH_RADOS_API void rados_read_op_omap_get_vals_by_keys2(rados_read_op_t read_op,
+ char const* const* keys,
+ size_t num_keys,
+ const size_t* key_lens,
+ rados_omap_iter_t *iter,
+ int *prval);
+
+/**
+ * Perform a read operation synchronously
+ * @param read_op operation to perform
+ * @param io the ioctx that the object is in
+ * @param oid the object id
+ * @param flags flags to apply to the entire operation (LIBRADOS_OPERATION_*)
+ */
+CEPH_RADOS_API int rados_read_op_operate(rados_read_op_t read_op,
+ rados_ioctx_t io,
+ const char *oid,
+ int flags);
+
+/**
+ * Perform a read operation asynchronously
+ * @param read_op operation to perform
+ * @param io the ioctx that the object is in
+ * @param completion what to do when operation has been attempted
+ * @param oid the object id
+ * @param flags flags to apply to the entire operation (LIBRADOS_OPERATION_*)
+ */
+CEPH_RADOS_API int rados_aio_read_op_operate(rados_read_op_t read_op,
+ rados_ioctx_t io,
+ rados_completion_t completion,
+ const char *oid,
+ int flags);
+
+/** @} Object Operations */
+
+/**
+ * Take an exclusive lock on an object.
+ *
+ * @param io the context to operate in
+ * @param oid the name of the object
+ * @param name the name of the lock
+ * @param cookie user-defined identifier for this instance of the lock
+ * @param desc user-defined lock description
+ * @param duration the duration of the lock. Set to NULL for infinite duration.
+ * @param flags lock flags
+ * @returns 0 on success, negative error code on failure
+ * @returns -EBUSY if the lock is already held by another (client, cookie) pair
+ * @returns -EEXIST if the lock is already held by the same (client, cookie) pair
+ */
+CEPH_RADOS_API int rados_lock_exclusive(rados_ioctx_t io, const char * oid,
+ const char * name, const char * cookie,
+ const char * desc,
+ struct timeval * duration,
+ uint8_t flags);
+
+/**
+ * Take a shared lock on an object.
+ *
+ * @param io the context to operate in
+ * @param o the name of the object
+ * @param name the name of the lock
+ * @param cookie user-defined identifier for this instance of the lock
+ * @param tag The tag of the lock
+ * @param desc user-defined lock description
+ * @param duration the duration of the lock. Set to NULL for infinite duration.
+ * @param flags lock flags
+ * @returns 0 on success, negative error code on failure
+ * @returns -EBUSY if the lock is already held by another (client, cookie) pair
+ * @returns -EEXIST if the lock is already held by the same (client, cookie) pair
+ */
+CEPH_RADOS_API int rados_lock_shared(rados_ioctx_t io, const char * o,
+ const char * name, const char * cookie,
+ const char * tag, const char * desc,
+ struct timeval * duration, uint8_t flags);
+
+/**
+ * Release a shared or exclusive lock on an object.
+ *
+ * @param io the context to operate in
+ * @param o the name of the object
+ * @param name the name of the lock
+ * @param cookie user-defined identifier for the instance of the lock
+ * @returns 0 on success, negative error code on failure
+ * @returns -ENOENT if the lock is not held by the specified (client, cookie) pair
+ */
+CEPH_RADOS_API int rados_unlock(rados_ioctx_t io, const char *o,
+ const char *name, const char *cookie);
+
+/**
+ * Asynchronously release a shared or exclusive lock on an object.
+ *
+ * @param io the context to operate in
+ * @param o the name of the object
+ * @param name the name of the lock
+ * @param cookie user-defined identifier for the instance of the lock
+ * @param completion what to do when operation has been attempted
+ * @returns 0 on success, negative error code on failure
+ */
+CEPH_RADOS_API int rados_aio_unlock(rados_ioctx_t io, const char *o,
+ const char *name, const char *cookie,
+ rados_completion_t completion);
+
+/**
+ * List clients that have locked the named object lock and information about
+ * the lock.
+ *
+ * The number of bytes required in each buffer is put in the
+ * corresponding size out parameter. If any of the provided buffers
+ * are too short, -ERANGE is returned after these sizes are filled in.
+ *
+ * @param io the context to operate in
+ * @param o the name of the object
+ * @param name the name of the lock
+ * @param exclusive where to store whether the lock is exclusive (1) or shared (0)
+ * @param tag where to store the tag associated with the object lock
+ * @param tag_len number of bytes in tag buffer
+ * @param clients buffer in which locker clients are stored, separated by '\0'
+ * @param clients_len number of bytes in the clients buffer
+ * @param cookies buffer in which locker cookies are stored, separated by '\0'
+ * @param cookies_len number of bytes in the cookies buffer
+ * @param addrs buffer in which locker addresses are stored, separated by '\0'
+ * @param addrs_len number of bytes in the addrs buffer
+ * @returns number of lockers on success, negative error code on failure
+ * @returns -ERANGE if any of the buffers are too short
+ */
+CEPH_RADOS_API ssize_t rados_list_lockers(rados_ioctx_t io, const char *o,
+ const char *name, int *exclusive,
+ char *tag, size_t *tag_len,
+ char *clients, size_t *clients_len,
+ char *cookies, size_t *cookies_len,
+ char *addrs, size_t *addrs_len);
+
+/**
+ * Releases a shared or exclusive lock on an object, which was taken by the
+ * specified client.
+ *
+ * @param io the context to operate in
+ * @param o the name of the object
+ * @param name the name of the lock
+ * @param client the client currently holding the lock
+ * @param cookie user-defined identifier for the instance of the lock
+ * @returns 0 on success, negative error code on failure
+ * @returns -ENOENT if the lock is not held by the specified (client, cookie) pair
+ * @returns -EINVAL if the client cannot be parsed
+ */
+CEPH_RADOS_API int rados_break_lock(rados_ioctx_t io, const char *o,
+ const char *name, const char *client,
+ const char *cookie);
+
+/**
+ * Blocklists the specified client from the OSDs
+ *
+ * @param cluster cluster handle
+ * @param client_address client address
+ * @param expire_seconds number of seconds to blocklist (0 for default)
+ * @returns 0 on success, negative error code on failure
+ */
+CEPH_RADOS_API int rados_blocklist_add(rados_t cluster,
+ char *client_address,
+ uint32_t expire_seconds);
+CEPH_RADOS_API int rados_blacklist_add(rados_t cluster,
+ char *client_address,
+ uint32_t expire_seconds)
+ __attribute__((deprecated));
+
+/**
+ * Gets addresses of the RADOS session, suitable for blocklisting.
+ *
+ * @param cluster cluster handle
+ * @param addrs the output string.
+ * @returns 0 on success, negative error code on failure
+ */
+CEPH_RADOS_API int rados_getaddrs(rados_t cluster, char** addrs);
+
+CEPH_RADOS_API void rados_set_osdmap_full_try(rados_ioctx_t io)
+ __attribute__((deprecated));
+
+CEPH_RADOS_API void rados_unset_osdmap_full_try(rados_ioctx_t io)
+ __attribute__((deprecated));
+
+CEPH_RADOS_API void rados_set_pool_full_try(rados_ioctx_t io);
+
+CEPH_RADOS_API void rados_unset_pool_full_try(rados_ioctx_t io);
+
+/**
+ * Enable an application on a pool
+ *
+ * @param io pool ioctx
+ * @param app_name application name
+ * @param force 0 if only single application per pool
+ * @returns 0 on success, negative error code on failure
+ */
+CEPH_RADOS_API int rados_application_enable(rados_ioctx_t io,
+ const char *app_name, int force);
+
+/**
+ * List all enabled applications
+ *
+ * If the provided buffer is too short, the required length is filled in and
+ * -ERANGE is returned. Otherwise, the buffer is filled with the application
+ * names, with a '\0' after each.
+ *
+ * @param io pool ioctx
+ * @param values buffer in which to store application names
+ * @param values_len number of bytes in values buffer
+ * @returns 0 on success, negative error code on failure
+ * @returns -ERANGE if the buffer is too short
+ */
+CEPH_RADOS_API int rados_application_list(rados_ioctx_t io, char *values,
+ size_t *values_len);
+
+/**
+ * Get application metadata value from pool
+ *
+ * @param io pool ioctx
+ * @param app_name application name
+ * @param key metadata key
+ * @param value result buffer
+ * @param value_len maximum len of value
+ * @returns 0 on success, negative error code on failure
+ */
+CEPH_RADOS_API int rados_application_metadata_get(rados_ioctx_t io,
+ const char *app_name,
+ const char *key, char *value,
+ size_t *value_len);
+
+/**
+ * Set application metadata on a pool
+ *
+ * @param io pool ioctx
+ * @param app_name application name
+ * @param key metadata key
+ * @param value metadata value
+ * @returns 0 on success, negative error code on failure
+ */
+CEPH_RADOS_API int rados_application_metadata_set(rados_ioctx_t io,
+ const char *app_name,
+ const char *key,
+ const char *value);
+
+/**
+ * Remove application metadata from a pool
+ *
+ * @param io pool ioctx
+ * @param app_name application name
+ * @param key metadata key
+ * @returns 0 on success, negative error code on failure
+ */
+CEPH_RADOS_API int rados_application_metadata_remove(rados_ioctx_t io,
+ const char *app_name,
+ const char *key);
+
+/**
+ * List all metadata key/value pairs associated with an application.
+ *
+ * This iterates over all metadata; key_len and vals_len are filled in
+ * with the number of bytes put into the keys and values buffers.
+ *
+ * If the provided buffers are too short, the required lengths are filled
+ * in and -ERANGE is returned. Otherwise, the buffers are filled with
+ * the keys and values of the metadata, with a '\0' after each.
+ *
+ * @param io pool ioctx
+ * @param app_name application name
+ * @param keys buffer in which to store key names
+ * @param key_len number of bytes in keys buffer
+ * @param values buffer in which to store values
+ * @param vals_len number of bytes in values buffer
+ * @returns 0 on success, negative error code on failure
+ * @returns -ERANGE if either buffer is too short
+ */
+CEPH_RADOS_API int rados_application_metadata_list(rados_ioctx_t io,
+ const char *app_name,
+ char *keys, size_t *key_len,
+ char *values,
+ size_t *vals_len);
+
+/**
+ * @name Mon/OSD/PG Commands
+ *
+ * These interfaces send commands relating to the monitor, OSD, or PGs.
+ *
+ * @{
+ */
+
+/**
+ * Send monitor command.
+ *
+ * @note Takes command string in carefully-formatted JSON; must match
+ * defined commands, types, etc.
+ *
+ * The result buffers are allocated on the heap; the caller is
+ * expected to release that memory with rados_buffer_free(). The
+ * buffer and length pointers can all be NULL, in which case they are
+ * not filled in.
+ *
+ * @param cluster cluster handle
+ * @param cmd an array of char *'s representing the command
+ * @param cmdlen count of valid entries in cmd
+ * @param inbuf any bulk input data (crush map, etc.)
+ * @param inbuflen input buffer length
+ * @param outbuf double pointer to output buffer
+ * @param outbuflen pointer to output buffer length
+ * @param outs double pointer to status string
+ * @param outslen pointer to status string length
+ * @returns 0 on success, negative error code on failure
+ */
+CEPH_RADOS_API int rados_mon_command(rados_t cluster, const char **cmd,
+ size_t cmdlen, const char *inbuf,
+ size_t inbuflen, char **outbuf,
+ size_t *outbuflen, char **outs,
+ size_t *outslen);
+
+/**
+ * Send ceph-mgr command.
+ *
+ * @note Takes command string in carefully-formatted JSON; must match
+ * defined commands, types, etc.
+ *
+ * The result buffers are allocated on the heap; the caller is
+ * expected to release that memory with rados_buffer_free(). The
+ * buffer and length pointers can all be NULL, in which case they are
+ * not filled in.
+ *
+ * @param cluster cluster handle
+ * @param cmd an array of char *'s representing the command
+ * @param cmdlen count of valid entries in cmd
+ * @param inbuf any bulk input data (crush map, etc.)
+ * @param inbuflen input buffer length
+ * @param outbuf double pointer to output buffer
+ * @param outbuflen pointer to output buffer length
+ * @param outs double pointer to status string
+ * @param outslen pointer to status string length
+ * @returns 0 on success, negative error code on failure
+ */
+CEPH_RADOS_API int rados_mgr_command(rados_t cluster, const char **cmd,
+ size_t cmdlen, const char *inbuf,
+ size_t inbuflen, char **outbuf,
+ size_t *outbuflen, char **outs,
+ size_t *outslen);
+
+/**
+ * Send ceph-mgr tell command.
+ *
+ * @note Takes command string in carefully-formatted JSON; must match
+ * defined commands, types, etc.
+ *
+ * The result buffers are allocated on the heap; the caller is
+ * expected to release that memory with rados_buffer_free(). The
+ * buffer and length pointers can all be NULL, in which case they are
+ * not filled in.
+ *
+ * @param cluster cluster handle
+ * @param name mgr name to target
+ * @param cmd an array of char *'s representing the command
+ * @param cmdlen count of valid entries in cmd
+ * @param inbuf any bulk input data (crush map, etc.)
+ * @param inbuflen input buffer length
+ * @param outbuf double pointer to output buffer
+ * @param outbuflen pointer to output buffer length
+ * @param outs double pointer to status string
+ * @param outslen pointer to status string length
+ * @returns 0 on success, negative error code on failure
+ */
+CEPH_RADOS_API int rados_mgr_command_target(
+ rados_t cluster,
+ const char *name,
+ const char **cmd,
+ size_t cmdlen, const char *inbuf,
+ size_t inbuflen, char **outbuf,
+ size_t *outbuflen, char **outs,
+ size_t *outslen);
+
+/**
+ * Send monitor command to a specific monitor.
+ *
+ * @note Takes command string in carefully-formatted JSON; must match
+ * defined commands, types, etc.
+ *
+ * The result buffers are allocated on the heap; the caller is
+ * expected to release that memory with rados_buffer_free(). The
+ * buffer and length pointers can all be NULL, in which case they are
+ * not filled in.
+ *
+ * @param cluster cluster handle
+ * @param name target monitor's name
+ * @param cmd an array of char *'s representing the command
+ * @param cmdlen count of valid entries in cmd
+ * @param inbuf any bulk input data (crush map, etc.)
+ * @param inbuflen input buffer length
+ * @param outbuf double pointer to output buffer
+ * @param outbuflen pointer to output buffer length
+ * @param outs double pointer to status string
+ * @param outslen pointer to status string length
+ * @returns 0 on success, negative error code on failure
+ */
+CEPH_RADOS_API int rados_mon_command_target(rados_t cluster, const char *name,
+ const char **cmd, size_t cmdlen,
+ const char *inbuf, size_t inbuflen,
+ char **outbuf, size_t *outbuflen,
+ char **outs, size_t *outslen);
+
+/**
+ * free a rados-allocated buffer
+ *
+ * Release memory allocated by librados calls like rados_mon_command().
+ *
+ * @param buf buffer pointer
+ */
+CEPH_RADOS_API void rados_buffer_free(char *buf);
+
+CEPH_RADOS_API int rados_osd_command(rados_t cluster, int osdid,
+ const char **cmd, size_t cmdlen,
+ const char *inbuf, size_t inbuflen,
+ char **outbuf, size_t *outbuflen,
+ char **outs, size_t *outslen);
+
+CEPH_RADOS_API int rados_pg_command(rados_t cluster, const char *pgstr,
+ const char **cmd, size_t cmdlen,
+ const char *inbuf, size_t inbuflen,
+ char **outbuf, size_t *outbuflen,
+ char **outs, size_t *outslen);
+
+/*
+ * This is not a doxygen comment leadin, because doxygen breaks on
+ * a typedef with function params and returns, and I can't figure out
+ * how to fix it.
+ *
+ * Monitor cluster log
+ *
+ * Monitor events logged to the cluster log. The callback gets each
+ * log entry both as a single formatted line and with each field in a
+ * separate arg.
+ *
+ * Calling with a cb argument of NULL will deregister any previously
+ * registered callback.
+ *
+ * @param cluster cluster handle
+ * @param level minimum log level (debug, info, warn|warning, err|error)
+ * @param cb callback to run for each log message. It MUST NOT block
+ * nor call back into librados.
+ * @param arg void argument to pass to cb
+ *
+ * @returns 0 on success, negative code on error
+ */
+typedef void (*rados_log_callback_t)(void *arg,
+ const char *line,
+ const char *who,
+ uint64_t sec, uint64_t nsec,
+ uint64_t seq, const char *level,
+ const char *msg);
+
+/*
+ * This is not a doxygen comment leadin, because doxygen breaks on
+ * a typedef with function params and returns, and I can't figure out
+ * how to fix it.
+ *
+ * Monitor cluster log
+ *
+ * Monitor events logged to the cluster log. The callback gets each
+ * log entry both as a single formatted line and with each field in a
+ * separate arg.
+ *
+ * Calling with a cb argument of NULL will deregister any previously
+ * registered callback.
+ *
+ * @param cluster cluster handle
+ * @param level minimum log level (debug, info, warn|warning, err|error)
+ * @param cb callback to run for each log message. It MUST NOT block
+ * nor call back into librados.
+ * @param arg void argument to pass to cb
+ *
+ * @returns 0 on success, negative code on error
+ */
+typedef void (*rados_log_callback2_t)(void *arg,
+ const char *line,
+ const char *channel,
+ const char *who,
+ const char *name,
+ uint64_t sec, uint64_t nsec,
+ uint64_t seq, const char *level,
+ const char *msg);
+
+CEPH_RADOS_API int rados_monitor_log(rados_t cluster, const char *level,
+ rados_log_callback_t cb, void *arg);
+CEPH_RADOS_API int rados_monitor_log2(rados_t cluster, const char *level,
+ rados_log_callback2_t cb, void *arg);
+
+
+/**
+ * register daemon instance for a service
+ *
+ * Register us as a daemon providing a particular service. We identify
+ * the service (e.g., 'rgw') and our instance name (e.g., 'rgw.$hostname').
+ * The metadata is a map of keys and values with arbitrary static metadata
+ * for this instance. The encoding is a series of NULL-terminated strings,
+ * alternating key names and values, terminating with an empty key name.
+ * For example, "foo\0bar\0this\0that\0\0" is the dict {foo=bar,this=that}.
+ *
+ * For the lifetime of the librados instance, regular beacons will be sent
+ * to the cluster to maintain our registration in the service map.
+ *
+ * @param cluster handle
+ * @param service service name
+ * @param daemon daemon instance name
+ * @param metadata_dict static daemon metadata dict
+ */
+CEPH_RADOS_API int rados_service_register(
+ rados_t cluster,
+ const char *service,
+ const char *daemon,
+ const char *metadata_dict);
+
+/**
+ * update daemon status
+ *
+ * Update our mutable status information in the service map.
+ *
+ * The status dict is encoded the same way the daemon metadata is encoded
+ * for rados_service_register. For example, "foo\0bar\0this\0that\0\0" is
+ * {foo=bar,this=that}.
+ *
+ * @param cluster rados cluster handle
+ * @param status_dict status dict
+ */
+CEPH_RADOS_API int rados_service_update_status(
+ rados_t cluster,
+ const char *status_dict);
+
+/** @} Mon/OSD/PG commands */
+
+/*
+ * These methods are no longer supported and return -ENOTSUP where possible.
+ */
+CEPH_RADOS_API int rados_objects_list_open(
+ rados_ioctx_t io,
+ rados_list_ctx_t *ctx) __attribute__((deprecated));
+CEPH_RADOS_API uint32_t rados_objects_list_get_pg_hash_position(
+ rados_list_ctx_t ctx) __attribute__((deprecated));
+CEPH_RADOS_API uint32_t rados_objects_list_seek(
+ rados_list_ctx_t ctx,
+ uint32_t pos) __attribute__((deprecated));
+CEPH_RADOS_API int rados_objects_list_next(
+ rados_list_ctx_t ctx,
+ const char **entry,
+ const char **key) __attribute__((deprecated));
+CEPH_RADOS_API void rados_objects_list_close(
+ rados_list_ctx_t ctx) __attribute__((deprecated));
+
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif
diff --git a/src/include/rados/librados.hpp b/src/include/rados/librados.hpp
new file mode 100644
index 000000000..cb8261af1
--- /dev/null
+++ b/src/include/rados/librados.hpp
@@ -0,0 +1,1568 @@
+#ifndef __LIBRADOS_HPP
+#define __LIBRADOS_HPP
+
+#include <string>
+#include <list>
+#include <map>
+#include <memory>
+#include <set>
+#include <vector>
+#include <utility>
+#include "buffer.h"
+
+#include "librados.h"
+#include "librados_fwd.hpp"
+#include "rados_types.hpp"
+
+namespace libradosstriper
+{
+ class RadosStriper;
+}
+
+namespace neorados { class RADOS; }
+
+namespace librados {
+
+using ceph::bufferlist;
+
+struct AioCompletionImpl;
+struct IoCtxImpl;
+struct ListObjectImpl;
+class NObjectIteratorImpl;
+struct ObjListCtx;
+class ObjectOperationImpl;
+struct PlacementGroupImpl;
+struct PoolAsyncCompletionImpl;
+
+typedef struct rados_cluster_stat_t cluster_stat_t;
+typedef struct rados_pool_stat_t pool_stat_t;
+
+typedef void *list_ctx_t;
+typedef uint64_t auid_t;
+typedef void *config_t;
+
+typedef struct {
+ std::string client;
+ std::string cookie;
+ std::string address;
+} locker_t;
+
+typedef std::map<std::string, pool_stat_t> stats_map;
+
+typedef void *completion_t;
+typedef void (*callback_t)(completion_t cb, void *arg);
+
+inline namespace v14_2_0 {
+
+ class IoCtx;
+ class RadosClient;
+
+ class CEPH_RADOS_API ListObject
+ {
+ public:
+ const std::string& get_nspace() const;
+ const std::string& get_oid() const;
+ const std::string& get_locator() const;
+
+ ListObject();
+ ~ListObject();
+ ListObject( const ListObject&);
+ ListObject& operator=(const ListObject& rhs);
+ private:
+ ListObject(ListObjectImpl *impl);
+
+ friend class librados::NObjectIteratorImpl;
+ friend std::ostream& operator<<(std::ostream& out, const ListObject& lop);
+
+ ListObjectImpl *impl;
+ };
+ CEPH_RADOS_API std::ostream& operator<<(std::ostream& out, const librados::ListObject& lop);
+
+ class CEPH_RADOS_API NObjectIterator;
+
+ class CEPH_RADOS_API ObjectCursor
+ {
+ public:
+ ObjectCursor();
+ ObjectCursor(const ObjectCursor &rhs);
+ explicit ObjectCursor(rados_object_list_cursor c);
+ ~ObjectCursor();
+ ObjectCursor& operator=(const ObjectCursor& rhs);
+ bool operator<(const ObjectCursor &rhs) const;
+ bool operator==(const ObjectCursor &rhs) const;
+ void set(rados_object_list_cursor c);
+
+ friend class IoCtx;
+ friend class librados::NObjectIteratorImpl;
+ friend std::ostream& operator<<(std::ostream& os, const librados::ObjectCursor& oc);
+
+ std::string to_str() const;
+ bool from_str(const std::string& s);
+
+ protected:
+ rados_object_list_cursor c_cursor;
+ };
+ CEPH_RADOS_API std::ostream& operator<<(std::ostream& os, const librados::ObjectCursor& oc);
+
+ class CEPH_RADOS_API NObjectIterator {
+ public:
+ using iterator_category = std::forward_iterator_tag;
+ using value_type = ListObject;
+ using difference_type = std::ptrdiff_t;
+ using pointer = ListObject*;
+ using reference = ListObject&;
+ static const NObjectIterator __EndObjectIterator;
+ NObjectIterator(): impl(NULL) {}
+ ~NObjectIterator();
+ NObjectIterator(const NObjectIterator &rhs);
+ NObjectIterator& operator=(const NObjectIterator& rhs);
+
+ bool operator==(const NObjectIterator& rhs) const;
+ bool operator!=(const NObjectIterator& rhs) const;
+ const ListObject& operator*() const;
+ const ListObject* operator->() const;
+ NObjectIterator &operator++(); ///< Preincrement; errors are thrown as exceptions
+ NObjectIterator operator++(int); ///< Postincrement; errors are thrown as exceptions
+ friend class IoCtx;
+ friend class librados::NObjectIteratorImpl;
+
+ /// get current hash position of the iterator, rounded to the current pg
+ uint32_t get_pg_hash_position() const;
+
+ /// move the iterator to a given hash position. this may (will!) be rounded
+ /// to the nearest pg. errors are thrown as exceptions
+ uint32_t seek(uint32_t pos);
+
+ /// move the iterator to a given cursor position. errors are thrown as exceptions
+ uint32_t seek(const ObjectCursor& cursor);
+
+ /// get current cursor position
+ ObjectCursor get_cursor();
+
+ /**
+ * Configure PGLS filter to be applied OSD-side (requires caller
+ * to know/understand the format expected by the OSD)
+ */
+ void set_filter(const bufferlist &bl);
+
+ private:
+ NObjectIterator(ObjListCtx *ctx_);
+ void get_next();
+ NObjectIteratorImpl *impl;
+ };
+
+ class CEPH_RADOS_API ObjectItem
+ {
+ public:
+ std::string oid;
+ std::string nspace;
+ std::string locator;
+ };
+
+ /// DEPRECATED; do not use
+ class CEPH_RADOS_API WatchCtx {
+ public:
+ virtual ~WatchCtx();
+ virtual void notify(uint8_t opcode, uint64_t ver, bufferlist& bl) = 0;
+ };
+
+ class CEPH_RADOS_API WatchCtx2 {
+ public:
+ virtual ~WatchCtx2();
+ /**
+ * Callback activated when we receive a notify event.
+ *
+ * @param notify_id unique id for this notify event
+ * @param cookie the watcher we are notifying
+ * @param notifier_id the unique client id of the notifier
+ * @param bl opaque notify payload (from the notifier)
+ */
+ virtual void handle_notify(uint64_t notify_id,
+ uint64_t cookie,
+ uint64_t notifier_id,
+ bufferlist& bl) = 0;
+
+ /**
+ * Callback activated when we encounter an error with the watch.
+ *
+ * Errors we may see:
+ * -ENOTCONN : our watch was disconnected
+ * -ETIMEDOUT : our watch is still valid, but we may have missed
+ * a notify event.
+ *
+ * @param cookie the watcher with the problem
+ * @param err error
+ */
+ virtual void handle_error(uint64_t cookie, int err) = 0;
+ };
+
+ struct CEPH_RADOS_API AioCompletion {
+ AioCompletion(AioCompletionImpl *pc_) : pc(pc_) {}
+ ~AioCompletion();
+ int set_complete_callback(void *cb_arg, callback_t cb);
+ int set_safe_callback(void *cb_arg, callback_t cb)
+ __attribute__ ((deprecated));
+ int wait_for_complete();
+ int wait_for_safe() __attribute__ ((deprecated));
+ int wait_for_complete_and_cb();
+ int wait_for_safe_and_cb() __attribute__ ((deprecated));
+ bool is_complete();
+ bool is_safe() __attribute__ ((deprecated));
+ bool is_complete_and_cb();
+ bool is_safe_and_cb() __attribute__ ((deprecated));
+ int get_return_value();
+ int get_version() __attribute__ ((deprecated));
+ uint64_t get_version64();
+ void release();
+ AioCompletionImpl *pc;
+ };
+
+ struct CEPH_RADOS_API PoolAsyncCompletion {
+ PoolAsyncCompletion(PoolAsyncCompletionImpl *pc_) : pc(pc_) {}
+ ~PoolAsyncCompletion();
+ int set_callback(void *cb_arg, callback_t cb);
+ int wait();
+ bool is_complete();
+ int get_return_value();
+ void release();
+ PoolAsyncCompletionImpl *pc;
+ };
+
+ /**
+ * These are per-op flags which may be different among
+ * ops added to an ObjectOperation.
+ */
+ enum ObjectOperationFlags {
+ OP_EXCL = LIBRADOS_OP_FLAG_EXCL,
+ OP_FAILOK = LIBRADOS_OP_FLAG_FAILOK,
+ OP_FADVISE_RANDOM = LIBRADOS_OP_FLAG_FADVISE_RANDOM,
+ OP_FADVISE_SEQUENTIAL = LIBRADOS_OP_FLAG_FADVISE_SEQUENTIAL,
+ OP_FADVISE_WILLNEED = LIBRADOS_OP_FLAG_FADVISE_WILLNEED,
+ OP_FADVISE_DONTNEED = LIBRADOS_OP_FLAG_FADVISE_DONTNEED,
+ OP_FADVISE_NOCACHE = LIBRADOS_OP_FLAG_FADVISE_NOCACHE,
+ };
+
+ class CEPH_RADOS_API ObjectOperationCompletion {
+ public:
+ virtual ~ObjectOperationCompletion() {}
+ virtual void handle_completion(int r, bufferlist& outbl) = 0;
+ };
+
+ /**
+ * These flags apply to the ObjectOperation as a whole.
+ *
+ * Prior to octopus BALANCE_READS and LOCALIZE_READS should only
+ * be used when reading from data you're certain won't change, like
+ * a snapshot, or where eventual consistency is ok. Since octopus
+ * (get_min_compatible_osd() >= CEPH_RELEASE_OCTOPUS) both are safe
+ * for general use.
+ *
+ * ORDER_READS_WRITES will order reads the same way writes are
+ * ordered (e.g., waiting for degraded objects). In particular, it
+ * will make a write followed by a read sequence be preserved.
+ *
+ * IGNORE_CACHE will skip the caching logic on the OSD that normally
+ * handles promotion of objects between tiers. This allows an operation
+ * to operate (or read) the cached (or uncached) object, even if it is
+ * not coherent.
+ *
+ * IGNORE_OVERLAY will ignore the pool overlay tiering metadata and
+ * process the op directly on the destination pool. This is useful
+ * for CACHE_FLUSH and CACHE_EVICT operations.
+ */
+ enum ObjectOperationGlobalFlags {
+ OPERATION_NOFLAG = LIBRADOS_OPERATION_NOFLAG,
+ OPERATION_BALANCE_READS = LIBRADOS_OPERATION_BALANCE_READS,
+ OPERATION_LOCALIZE_READS = LIBRADOS_OPERATION_LOCALIZE_READS,
+ OPERATION_ORDER_READS_WRITES = LIBRADOS_OPERATION_ORDER_READS_WRITES,
+ OPERATION_IGNORE_CACHE = LIBRADOS_OPERATION_IGNORE_CACHE,
+ OPERATION_SKIPRWLOCKS = LIBRADOS_OPERATION_SKIPRWLOCKS,
+ OPERATION_IGNORE_OVERLAY = LIBRADOS_OPERATION_IGNORE_OVERLAY,
+ // send requests to cluster despite the cluster or pool being
+ // marked full; ops will either succeed (e.g., delete) or return
+ // EDQUOT or ENOSPC
+ OPERATION_FULL_TRY = LIBRADOS_OPERATION_FULL_TRY,
+ // mainly for delete
+ OPERATION_FULL_FORCE = LIBRADOS_OPERATION_FULL_FORCE,
+ OPERATION_IGNORE_REDIRECT = LIBRADOS_OPERATION_IGNORE_REDIRECT,
+ OPERATION_ORDERSNAP = LIBRADOS_OPERATION_ORDERSNAP,
+ // enable/allow return value and per-op return code/buffers
+ OPERATION_RETURNVEC = LIBRADOS_OPERATION_RETURNVEC,
+ };
+
+ /*
+ * Alloc hint flags for the alloc_hint operation.
+ */
+ enum AllocHintFlags {
+ ALLOC_HINT_FLAG_SEQUENTIAL_WRITE = 1,
+ ALLOC_HINT_FLAG_RANDOM_WRITE = 2,
+ ALLOC_HINT_FLAG_SEQUENTIAL_READ = 4,
+ ALLOC_HINT_FLAG_RANDOM_READ = 8,
+ ALLOC_HINT_FLAG_APPEND_ONLY = 16,
+ ALLOC_HINT_FLAG_IMMUTABLE = 32,
+ ALLOC_HINT_FLAG_SHORTLIVED = 64,
+ ALLOC_HINT_FLAG_LONGLIVED = 128,
+ ALLOC_HINT_FLAG_COMPRESSIBLE = 256,
+ ALLOC_HINT_FLAG_INCOMPRESSIBLE = 512,
+ };
+
+ /*
+ * ObjectOperation : compound object operation
+ * Batch multiple object operations into a single request, to be applied
+ * atomically.
+ */
+ class CEPH_RADOS_API ObjectOperation
+ {
+ public:
+ ObjectOperation();
+ virtual ~ObjectOperation();
+
+ ObjectOperation(const ObjectOperation&) = delete;
+ ObjectOperation& operator=(const ObjectOperation&) = delete;
+
+ /**
+ * Move constructor.
+ * \warning A moved from ObjectOperation is invalid and may not be used for
+ * any purpose. This is a hard contract violation and will
+ * kill your program.
+ */
+ ObjectOperation(ObjectOperation&&);
+ ObjectOperation& operator =(ObjectOperation&&);
+
+ size_t size();
+ void set_op_flags(ObjectOperationFlags flags) __attribute__((deprecated));
+ // flags holds ObjectOperationFlags value(s), passed as a plain int
+ void set_op_flags2(int flags);
+
+ void cmpext(uint64_t off, const bufferlist& cmp_bl, int *prval);
+ void cmpxattr(const char *name, uint8_t op, const bufferlist& val);
+ void cmpxattr(const char *name, uint8_t op, uint64_t v);
+ void exec(const char *cls, const char *method, bufferlist& inbl);
+ void exec(const char *cls, const char *method, bufferlist& inbl, bufferlist *obl, int *prval);
+ void exec(const char *cls, const char *method, bufferlist& inbl, ObjectOperationCompletion *completion);
+ /**
+ * Guard operation with a check that object version == ver
+ *
+ * @param ver [in] version to check
+ */
+ void assert_version(uint64_t ver);
+
+ /**
+ * Guard operation with a check that the object already exists
+ */
+ void assert_exists();
+
+ /**
+ * compare the values at the specified omap keys against assertions
+ *
+ * @param assertions [in] comparison assertions
+ * @param prval [out] place error code in prval upon completion
+ *
+ * assertions has the form of mappings from keys to (comparison rval, assertion)
+ * The assertion field may be CEPH_OSD_CMPXATTR_OP_[GT|LT|EQ].
+ *
+ * That is, to assert that the value at key 'foo' is greater than 'bar':
+ *
+ * ObjectReadOperation op;
+ * int r;
+ * map<string, pair<bufferlist, int> > assertions;
+ * bufferlist bar(string("bar"));
+ * assertions["foo"] = make_pair(bar, CEPH_OSD_CMPXATTR_OP_GT);
+ * op.omap_cmp(assertions, &r);
+ */
+ void omap_cmp(
+ const std::map<std::string, std::pair<bufferlist, int> > &assertions,
+ int *prval);
+
+ protected:
+ ObjectOperationImpl* impl;
+ friend class IoCtx;
+ friend class Rados;
+ };
+
+ /*
+ * ObjectWriteOperation : compound object write operation
+ * Batch multiple object operations into a single request, to be applied
+ * atomically.
+ */
+ class CEPH_RADOS_API ObjectWriteOperation : public ObjectOperation
+ {
+ protected:
+ time_t *unused;
+ public:
+ ObjectWriteOperation() : unused(NULL) {}
+ ~ObjectWriteOperation() override {}
+
+ ObjectWriteOperation(ObjectWriteOperation&&) = default;
+ ObjectWriteOperation& operator =(ObjectWriteOperation&&) = default;
+
+ void mtime(time_t *pt);
+ void mtime2(struct timespec *pts);
+
+ void create(bool exclusive);
+ void create(bool exclusive,
+ const std::string& category); ///< NOTE: category is unused
+
+ void write(uint64_t off, const bufferlist& bl);
+ void write_full(const bufferlist& bl);
+ void writesame(uint64_t off, uint64_t write_len,
+ const bufferlist& bl);
+ void append(const bufferlist& bl);
+ void remove();
+ void truncate(uint64_t off);
+ void zero(uint64_t off, uint64_t len);
+ void rmxattr(const char *name);
+ void setxattr(const char *name, const bufferlist& bl);
+ void setxattr(const char *name, const bufferlist&& bl);
+ void tmap_update(const bufferlist& cmdbl);
+ void tmap_put(const bufferlist& bl);
+ void selfmanaged_snap_rollback(uint64_t snapid);
+
+ /**
+ * Rollback an object to the specified snapshot id
+ *
+ * Used with pool snapshots
+ *
+ * @param snapid [in] snapshot id specified
+ */
+ void snap_rollback(uint64_t snapid);
+
+ /**
+ * set keys and values according to map
+ *
+ * @param map [in] keys and values to set
+ */
+ void omap_set(const std::map<std::string, bufferlist> &map);
+
+ /**
+ * set header
+ *
+ * @param bl [in] header to set
+ */
+ void omap_set_header(const bufferlist &bl);
+
+ /**
+ * Clears omap contents
+ */
+ void omap_clear();
+
+ /**
+ * Clears keys in to_rm
+ *
+ * @param to_rm [in] keys to remove
+ */
+ void omap_rm_keys(const std::set<std::string> &to_rm);
+
+ /**
+ * Copy an object
+ *
+ * Copies an object from another location. The operation is atomic in that
+ * the copy either succeeds in its entirety or fails (e.g., because the
+ * source object was modified while the copy was in progress).
+ *
+ * @param src source object name
+ * @param src_ioctx ioctx for the source object
+ * @param src_version current version of the source object
+ * @param src_fadvise_flags the fadvise flags for source object
+ */
+ void copy_from(const std::string& src, const IoCtx& src_ioctx,
+ uint64_t src_version, uint32_t src_fadvise_flags);
+
+ /**
+ * Copy an object
+ *
+ * Copies an object from another location. The operation is atomic in that
+ * the copy either succeeds in its entirety or fails (e.g., because the
+ * source object was modified while the copy was in progress). Instead of
+ * copying truncate_seq and truncate_size from the source object it receives
+ * these values as parameters.
+ *
+ * @param src source object name
+ * @param src_ioctx ioctx for the source object
+ * @param src_version current version of the source object
+ * @param truncate_seq truncate sequence for the destination object
+ * @param truncate_size truncate size for the destination object
+ * @param src_fadvise_flags the fadvise flags for source object
+ */
+ void copy_from2(const std::string& src, const IoCtx& src_ioctx,
+ uint64_t src_version, uint32_t truncate_seq,
+ uint64_t truncate_size, uint32_t src_fadvise_flags);
+
+ /**
+ * undirty an object
+ *
+ * Clear an object's dirty flag
+ */
+ void undirty();
+
+ /**
+ * Set allocation hint for an object
+ *
+ * @param expected_object_size expected size of the object, in bytes
+ * @param expected_write_size expected size of writes to the object, in bytes
+ * @param flags hint flags (set_alloc_hint2 only); presumably AllocHintFlags values -- TODO confirm
+ */
+ void set_alloc_hint(uint64_t expected_object_size,
+ uint64_t expected_write_size);
+ void set_alloc_hint2(uint64_t expected_object_size,
+ uint64_t expected_write_size,
+ uint32_t flags);
+
+ /**
+ * Pin/unpin an object in cache tier
+ *
+ * NOTE(review): both methods are declared void, so no status is returned here
+ */
+ void cache_pin();
+ void cache_unpin();
+
+ /**
+ * Extensible tier
+ *
+ * Set redirect target
+ */
+ void set_redirect(const std::string& tgt_obj, const IoCtx& tgt_ioctx,
+ uint64_t tgt_version, int flag = 0);
+ void tier_promote();
+ void unset_manifest();
+
+ friend class IoCtx;
+ };
+
+ /*
+ * ObjectReadOperation : compound object operation that returns values
+ * Batch multiple object operations into a single request, to be applied
+ * atomically.
+ */
+ class CEPH_RADOS_API ObjectReadOperation : public ObjectOperation
+ {
+ public:
+ ObjectReadOperation() {}
+ ~ObjectReadOperation() override {}
+
+ ObjectReadOperation(ObjectReadOperation&&) = default; // movable; user-declared moves suppress the implicit copies
+ ObjectReadOperation& operator =(ObjectReadOperation&&) = default;
+
+ void stat(uint64_t *psize, time_t *pmtime, int *prval);
+ void stat2(uint64_t *psize, struct timespec *pts, int *prval);
+ void getxattr(const char *name, bufferlist *pbl, int *prval);
+ void getxattrs(std::map<std::string, bufferlist> *pattrs, int *prval);
+ void read(size_t off, uint64_t len, bufferlist *pbl, int *prval);
+ void checksum(rados_checksum_type_t type, const bufferlist &init_value_bl,
+ uint64_t off, size_t len, size_t chunk_size, bufferlist *pbl,
+ int *prval);
+
+ /**
+ * see IoCtx::aio_sparse_read()
+ */
+ void sparse_read(uint64_t off, uint64_t len, std::map<uint64_t,uint64_t> *m,
+ bufferlist *data_bl, int *prval,
+ uint64_t truncate_size = 0,
+ uint32_t truncate_seq = 0);
+
+ /**
+ * omap_get_vals: keys and values from the object omap
+ *
+ * Get up to max_return keys and values beginning after start_after
+ *
+ * @param start_after [in] list no keys smaller than start_after
+ * @param max_return [in] list no more than max_return key/value pairs
+ * @param out_vals [out] place returned values in out_vals on completion
+ * @param prval [out] place error code in prval upon completion
+ */
+ void omap_get_vals(
+ const std::string &start_after,
+ uint64_t max_return,
+ std::map<std::string, bufferlist> *out_vals,
+ int *prval) __attribute__ ((deprecated)); // use v2
+
+ /**
+ * omap_get_vals2: get up to max_return keys and values from the object
+ * omap, beginning after start_after
+ *
+ * @param start_after [in] list no keys smaller than start_after
+ * @param max_return [in] list no more than max_return key/value pairs
+ * @param out_vals [out] place returned values in out_vals on completion
+ * @param pmore [out] pointer to bool indicating whether there are more keys
+ * @param prval [out] place error code in prval upon completion
+ */
+ void omap_get_vals2(
+ const std::string &start_after,
+ uint64_t max_return,
+ std::map<std::string, bufferlist> *out_vals,
+ bool *pmore,
+ int *prval);
+
+ /**
+ * omap_get_vals: keys and values from the object omap
+ *
+ * Get up to max_return keys and values beginning after start_after
+ *
+ * @param start_after [in] list keys starting after start_after
+ * @param filter_prefix [in] list only keys beginning with filter_prefix
+ * @param max_return [in] list no more than max_return key/value pairs
+ * @param out_vals [out] place returned values in out_vals on completion
+ * @param prval [out] place error code in prval upon completion
+ */
+ void omap_get_vals(
+ const std::string &start_after,
+ const std::string &filter_prefix,
+ uint64_t max_return,
+ std::map<std::string, bufferlist> *out_vals,
+ int *prval) __attribute__ ((deprecated)); // use v2
+
+ /**
+ * omap_get_vals2: keys and values from the object omap
+ *
+ * Get up to max_return keys and values beginning after start_after
+ *
+ * @param start_after [in] list keys starting after start_after
+ * @param filter_prefix [in] list only keys beginning with filter_prefix
+ * @param max_return [in] list no more than max_return key/value pairs
+ * @param out_vals [out] place returned values in out_vals on completion
+ * @param pmore [out] pointer to bool indicating whether there are more keys
+ * @param prval [out] place error code in prval upon completion
+ */
+ void omap_get_vals2(
+ const std::string &start_after,
+ const std::string &filter_prefix,
+ uint64_t max_return,
+ std::map<std::string, bufferlist> *out_vals,
+ bool *pmore,
+ int *prval);
+
+
+ /**
+ * omap_get_keys: keys from the object omap
+ *
+ * Get up to max_return keys beginning after start_after
+ *
+ * @param start_after [in] list keys starting after start_after
+ * @param max_return [in] list no more than max_return keys
+ * @param out_keys [out] place returned values in out_keys on completion
+ * @param prval [out] place error code in prval upon completion
+ */
+ void omap_get_keys(const std::string &start_after,
+ uint64_t max_return,
+ std::set<std::string> *out_keys,
+ int *prval) __attribute__ ((deprecated)); // use v2
+
+ /**
+ * omap_get_keys2: keys from the object omap
+ *
+ * Get up to max_return keys beginning after start_after
+ *
+ * @param start_after [in] list keys starting after start_after
+ * @param max_return [in] list no more than max_return keys
+ * @param out_keys [out] place returned values in out_keys on completion
+ * @param pmore [out] pointer to bool indicating whether there are more keys
+ * @param prval [out] place error code in prval upon completion
+ */
+ void omap_get_keys2(const std::string &start_after,
+ uint64_t max_return,
+ std::set<std::string> *out_keys,
+ bool *pmore,
+ int *prval);
+
+ /**
+ * omap_get_header: get header from object omap
+ *
+ * @param header [out] place header here upon completion
+ * @param prval [out] place error code in prval upon completion
+ */
+ void omap_get_header(bufferlist *header, int *prval);
+
+ /**
+ * get key/value pairs for specified keys
+ *
+ * @param keys [in] keys to get
+ * @param map [out] place key/value pairs found here on completion
+ * @param prval [out] place error code in prval upon completion
+ */
+ void omap_get_vals_by_keys(const std::set<std::string> &keys,
+ std::map<std::string, bufferlist> *map,
+ int *prval);
+
+ /**
+ * list_watchers: Get the list of watchers of an object
+ *
+ * @param out_watchers [out] place returned values in out_watchers on completion
+ * @param prval [out] place error code in prval upon completion
+ */
+ void list_watchers(std::list<obj_watch_t> *out_watchers, int *prval);
+
+ /**
+ * list snapshot clones associated with a logical object
+ *
+ * This will include a record for each version of the object,
+ * including the "HEAD" (which will have a cloneid of SNAP_HEAD).
+ * Each clone includes a vector of snap ids for which it is
+ * defined to exist.
+ *
+ * NOTE: this operation must be submitted from an IoCtx with a
+ * read snapid of SNAP_DIR for reliable results.
+ *
+ * @param out_snaps [out] pointer to resulting snap_set_t
+ * @param prval [out] place error code in prval upon completion
+ */
+ void list_snaps(snap_set_t *out_snaps, int *prval);
+
+ /**
+ * query dirty state of an object
+ *
+ * @param isdirty [out] pointer to resulting bool
+ * @param prval [out] place error code in prval upon completion
+ */
+ void is_dirty(bool *isdirty, int *prval);
+
+ /**
+ * flush a cache tier object to backing tier; will block racing
+ * updates.
+ *
+ * This should be used in concert with OPERATION_IGNORE_CACHE to avoid
+ * triggering a promotion.
+ */
+ void cache_flush();
+
+ /**
+ * Flush a cache tier object to backing tier; will EAGAIN if we race
+ * with an update. Must be used with the SKIPRWLOCKS flag.
+ *
+ * This should be used in concert with OPERATION_IGNORE_CACHE to avoid
+ * triggering a promotion.
+ */
+ void cache_try_flush();
+
+ /**
+ * evict a clean cache tier object
+ *
+ * This should be used in concert with OPERATION_IGNORE_CACHE to avoid
+ * triggering a promote on the OSD (that is then evicted).
+ */
+ void cache_evict();
+
+ /**
+ * Extensible tier
+ *
+ * set_chunk: make a chunk of the source object point at a part of the
+ * target object
+ *
+ * @param src_offset [in] start position of the chunk in the source object
+ * @param src_length [in] source length to set the length of the chunk
+ * @param tgt_ioctx [in] io context for the target object (presumably selects its pool; TODO confirm)
+ * @param tgt_oid [in] target object's id to set a chunk
+ * @param tgt_offset [in] the start position of the target object
+ * @param flag [in] flag for the source object
+ *
+ */
+ void set_chunk(uint64_t src_offset, uint64_t src_length, const IoCtx& tgt_ioctx,
+ std::string tgt_oid, uint64_t tgt_offset, int flag = 0);
+ /**
+ * flush a manifest tier object to backing tier, performing deduplication;
+ * will block racing updates.
+ *
+ * Invoking tier_flush() implicitly makes a manifest object even if
+ * the target object is not manifest.
+ */
+ void tier_flush();
+ /**
+ * evict a manifest tier object to backing tier; will block racing
+ * updates.
+ */
+ void tier_evict();
+ };
+
+ /* IoCtx : This is a context in which we can perform I/O.
+ * It is associated with a pool.
+ *
+ * Typical use (error checking omitted):
+ *
+ * IoCtx p;
+ * rados.ioctx_create("my_pool", p);
+ * p.stat("my_object", &size, &mtime);
+ * ... etc ...
+ *
+ * NOTE: be sure to call watch_flush() prior to destroying any IoCtx
+ * that is used for watch events to ensure that racing callbacks
+ * have completed.
+ */
+ class CEPH_RADOS_API IoCtx
+ {
+ public:
+ IoCtx();
+ static void from_rados_ioctx_t(rados_ioctx_t p, IoCtx &pool);
+ IoCtx(const IoCtx& rhs);
+ IoCtx& operator=(const IoCtx& rhs);
+ IoCtx(IoCtx&& rhs) noexcept;
+ IoCtx& operator=(IoCtx&& rhs) noexcept;
+
+ ~IoCtx();
+
+ bool is_valid() const;
+
+ // Close our pool handle
+ void close();
+
+ // deep copy
+ void dup(const IoCtx& rhs);
+
+ // set pool auid
+ int set_auid(uint64_t auid_)
+ __attribute__ ((deprecated));
+
+ // set pool auid, asynchronous variant taking a PoolAsyncCompletion
+ int set_auid_async(uint64_t auid_, PoolAsyncCompletion *c)
+ __attribute__ ((deprecated));
+
+ // get pool auid
+ int get_auid(uint64_t *auid_)
+ __attribute__ ((deprecated));
+
+ uint64_t get_instance_id() const;
+
+ std::string get_pool_name(); ///< non-const overload; a const overload is declared further down
+
+ bool pool_requires_alignment();
+ int pool_requires_alignment2(bool * req);
+ uint64_t pool_required_alignment();
+ int pool_required_alignment2(uint64_t * alignment);
+
+ // create an object
+ int create(const std::string& oid, bool exclusive);
+ int create(const std::string& oid, bool exclusive,
+ const std::string& category); ///< category is unused
+
+ /**
+ * write bytes to an object at a specified offset
+ *
+ * NOTE: this call steals the contents of @param bl.
+ */
+ int write(const std::string& oid, bufferlist& bl, size_t len, uint64_t off);
+ /**
+ * append bytes to an object
+ *
+ * NOTE: this call steals the contents of @param bl.
+ */
+ int append(const std::string& oid, bufferlist& bl, size_t len);
+ /**
+ * replace object contents with provided data
+ *
+ * NOTE: this call steals the contents of @param bl.
+ */
+ int write_full(const std::string& oid, bufferlist& bl);
+ int writesame(const std::string& oid, bufferlist& bl,
+ size_t write_len, uint64_t off);
+ int read(const std::string& oid, bufferlist& bl, size_t len, uint64_t off);
+ int checksum(const std::string& o, rados_checksum_type_t type,
+ const bufferlist &init_value_bl, size_t len, uint64_t off,
+ size_t chunk_size, bufferlist *pbl);
+ int remove(const std::string& oid);
+ int remove(const std::string& oid, int flags);
+ int trunc(const std::string& oid, uint64_t size);
+ int mapext(const std::string& o, uint64_t off, size_t len, std::map<uint64_t,uint64_t>& m);
+ int cmpext(const std::string& o, uint64_t off, bufferlist& cmp_bl);
+ int sparse_read(const std::string& o, std::map<uint64_t,uint64_t>& m, bufferlist& bl, size_t len, uint64_t off);
+ int getxattr(const std::string& oid, const char *name, bufferlist& bl);
+ int getxattrs(const std::string& oid, std::map<std::string, bufferlist>& attrset);
+ int setxattr(const std::string& oid, const char *name, bufferlist& bl);
+ int rmxattr(const std::string& oid, const char *name);
+ int stat(const std::string& oid, uint64_t *psize, time_t *pmtime);
+ int stat2(const std::string& oid, uint64_t *psize, struct timespec *pts);
+ int exec(const std::string& oid, const char *cls, const char *method,
+ bufferlist& inbl, bufferlist& outbl);
+ /**
+ * modify object tmap based on encoded update sequence
+ *
+ * NOTE: this call steals the contents of @param cmdbl
+ */
+ int tmap_update(const std::string& oid, bufferlist& cmdbl);
+
+ int omap_get_vals(const std::string& oid,
+ const std::string& start_after,
+ uint64_t max_return,
+ std::map<std::string, bufferlist> *out_vals);
+ int omap_get_vals2(const std::string& oid,
+ const std::string& start_after,
+ uint64_t max_return,
+ std::map<std::string, bufferlist> *out_vals,
+ bool *pmore);
+ int omap_get_vals(const std::string& oid,
+ const std::string& start_after,
+ const std::string& filter_prefix,
+ uint64_t max_return,
+ std::map<std::string, bufferlist> *out_vals);
+ int omap_get_vals2(const std::string& oid,
+ const std::string& start_after,
+ const std::string& filter_prefix,
+ uint64_t max_return,
+ std::map<std::string, bufferlist> *out_vals,
+ bool *pmore);
+ int omap_get_keys(const std::string& oid,
+ const std::string& start_after,
+ uint64_t max_return,
+ std::set<std::string> *out_keys);
+ int omap_get_keys2(const std::string& oid,
+ const std::string& start_after,
+ uint64_t max_return,
+ std::set<std::string> *out_keys,
+ bool *pmore);
+ int omap_get_header(const std::string& oid,
+ bufferlist *bl);
+ int omap_get_vals_by_keys(const std::string& oid,
+ const std::set<std::string>& keys,
+ std::map<std::string, bufferlist> *vals);
+ int omap_set(const std::string& oid,
+ const std::map<std::string, bufferlist>& map);
+ int omap_set_header(const std::string& oid,
+ const bufferlist& bl);
+ int omap_clear(const std::string& oid);
+ int omap_rm_keys(const std::string& oid,
+ const std::set<std::string>& keys);
+
+ void snap_set_read(snap_t seq);
+ int selfmanaged_snap_set_write_ctx(snap_t seq, std::vector<snap_t>& snaps);
+
+ // Create a snapshot with a given name
+ int snap_create(const char *snapname);
+
+ // Look up a snapshot by name.
+ // Returns 0 on success; error code otherwise
+ int snap_lookup(const char *snapname, snap_t *snap);
+
+ // Gets a timestamp for a snap
+ int snap_get_stamp(snap_t snapid, time_t *t);
+
+ // Gets the name of a snap
+ int snap_get_name(snap_t snapid, std::string *s);
+
+ // Remove a snapshot from this pool
+ int snap_remove(const char *snapname);
+
+ int snap_list(std::vector<snap_t> *snaps);
+
+ int snap_rollback(const std::string& oid, const char *snapname);
+
+ // Deprecated name kept for backward compatibility - same as snap_rollback()
+ int rollback(const std::string& oid, const char *snapname)
+ __attribute__ ((deprecated));
+
+ int selfmanaged_snap_create(uint64_t *snapid);
+ void aio_selfmanaged_snap_create(uint64_t *snapid, AioCompletion *c);
+
+ int selfmanaged_snap_remove(uint64_t snapid);
+ void aio_selfmanaged_snap_remove(uint64_t snapid, AioCompletion *c);
+
+ int selfmanaged_snap_rollback(const std::string& oid, uint64_t snapid);
+
+ // Advisory locking on rados objects.
+ int lock_exclusive(const std::string &oid, const std::string &name,
+ const std::string &cookie,
+ const std::string &description,
+ struct timeval * duration, uint8_t flags);
+
+ int lock_shared(const std::string &oid, const std::string &name,
+ const std::string &cookie, const std::string &tag,
+ const std::string &description,
+ struct timeval * duration, uint8_t flags);
+
+ int unlock(const std::string &oid, const std::string &name,
+ const std::string &cookie);
+
+ int break_lock(const std::string &oid, const std::string &name,
+ const std::string &client, const std::string &cookie);
+
+ int list_lockers(const std::string &oid, const std::string &name,
+ int *exclusive,
+ std::string *tag,
+ std::list<librados::locker_t> *lockers);
+
+
+ /// Start enumerating objects for a pool. Errors are thrown as exceptions.
+ NObjectIterator nobjects_begin(const bufferlist &filter=bufferlist());
+ /// Start enumerating objects for a pool starting from a hash position.
+ /// Errors are thrown as exceptions.
+ NObjectIterator nobjects_begin(uint32_t start_hash_position,
+ const bufferlist &filter=bufferlist());
+ /// Start enumerating objects for a pool starting from cursor. Errors are
+ /// thrown as exceptions.
+ NObjectIterator nobjects_begin(const librados::ObjectCursor& cursor,
+ const bufferlist &filter=bufferlist());
+ /// Iterator indicating the end of a pool
+ const NObjectIterator& nobjects_end() const;
+
+ /// Get cursor for pool beginning
+ ObjectCursor object_list_begin();
+
+ /// Get cursor for pool end
+ ObjectCursor object_list_end();
+
+ /// Check whether a cursor is at the end of a pool
+ bool object_list_is_end(const ObjectCursor &oc);
+
+ /// List some objects between two cursors
+ int object_list(const ObjectCursor &start, const ObjectCursor &finish,
+ const size_t result_count,
+ const bufferlist &filter,
+ std::vector<ObjectItem> *result,
+ ObjectCursor *next);
+
+ /// Generate cursors that include the N out of Mth slice of the pool
+ void object_list_slice(
+ const ObjectCursor start,
+ const ObjectCursor finish,
+ const size_t n,
+ const size_t m,
+ ObjectCursor *split_start,
+ ObjectCursor *split_finish);
+
+ /**
+ * List available hit set objects
+ *
+ * @param hash [in] hash position to query
+ * @param c [in] completion
+ * @param pls [out] list of available intervals
+ */
+ int hit_set_list(uint32_t hash, AioCompletion *c,
+ std::list< std::pair<time_t, time_t> > *pls);
+
+ /**
+ * Retrieve hit set for a given hash, and time
+ *
+ * @param hash [in] hash position
+ * @param c [in] completion
+ * @param stamp [in] time interval that falls within the hit set's interval
+ * @param pbl [out] buffer to store the result in
+ */
+ int hit_set_get(uint32_t hash, AioCompletion *c, time_t stamp,
+ bufferlist *pbl);
+
+ uint64_t get_last_version();
+
+ int aio_read(const std::string& oid, AioCompletion *c,
+ bufferlist *pbl, size_t len, uint64_t off);
+ /**
+ * Asynchronously read from an object at a particular snapshot
+ *
+ * This is the same as normal aio_read, except that it chooses
+ * the snapshot to read from from its arguments instead of the
+ * internal IoCtx state.
+ *
+ * The return value of the completion will be number of bytes read on
+ * success, negative error code on failure.
+ *
+ * @param oid the name of the object to read from
+ * @param c what to do when the read is complete
+ * @param pbl where to store the results
+ * @param len the number of bytes to read
+ * @param off the offset to start reading from in the object
+ * @param snapid the id of the snapshot to read from
+ * @returns 0 on success, negative error code on failure
+ */
+ int aio_read(const std::string& oid, AioCompletion *c,
+ bufferlist *pbl, size_t len, uint64_t off, uint64_t snapid);
+ int aio_sparse_read(const std::string& oid, AioCompletion *c,
+ std::map<uint64_t,uint64_t> *m, bufferlist *data_bl,
+ size_t len, uint64_t off);
+ /**
+ * Asynchronously read existing extents from an object at a
+ * particular snapshot
+ *
+ * This is the same as normal aio_sparse_read, except that it chooses
+ * the snapshot to read from from its arguments instead of the
+ * internal IoCtx state.
+ *
+ * m will be filled in with a map of extents in the object,
+ * mapping offsets to lengths (in bytes) within the range
+ * requested. The data for all of the extents are stored
+ * back-to-back in offset order in data_bl.
+ *
+ * @param oid the name of the object to read from
+ * @param c what to do when the read is complete
+ * @param m where to store the map of extents
+ * @param data_bl where to store the data
+ * @param len the number of bytes to read
+ * @param off the offset to start reading from in the object
+ * @param snapid the id of the snapshot to read from
+ * @returns 0 on success, negative error code on failure
+ */
+ int aio_sparse_read(const std::string& oid, AioCompletion *c,
+ std::map<uint64_t,uint64_t> *m, bufferlist *data_bl,
+ size_t len, uint64_t off, uint64_t snapid);
+ /**
+ * Asynchronously compare an on-disk object range with a buffer
+ *
+ * @param oid the name of the object to read from
+ * @param c what to do when the read is complete
+ * @param off object byte offset at which to start the comparison
+ * @param cmp_bl buffer containing bytes to be compared with object contents
+ * @returns 0 on success, negative error code on failure,
+ * (-MAX_ERRNO - mismatch_off) on mismatch
+ */
+ int aio_cmpext(const std::string& oid,
+ librados::AioCompletion *c,
+ uint64_t off,
+ bufferlist& cmp_bl);
+ int aio_write(const std::string& oid, AioCompletion *c, const bufferlist& bl,
+ size_t len, uint64_t off);
+ int aio_append(const std::string& oid, AioCompletion *c, const bufferlist& bl,
+ size_t len);
+ int aio_write_full(const std::string& oid, AioCompletion *c, const bufferlist& bl);
+ int aio_writesame(const std::string& oid, AioCompletion *c, const bufferlist& bl,
+ size_t write_len, uint64_t off);
+
+ /**
+ * Asynchronously remove an object
+ *
+ * Queues the remove and returns.
+ *
+ * The return value of the completion will be 0 on success, negative
+ * error code on failure.
+ *
+ * @param oid the name of the object
+ * @param c what to do when the remove is safe and complete
+ * @returns 0 on success, -EROFS if the io context specifies a snap_seq
+ * other than SNAP_HEAD
+ */
+ int aio_remove(const std::string& oid, AioCompletion *c);
+ int aio_remove(const std::string& oid, AioCompletion *c, int flags);
+
+ /**
+ * Wait for all currently pending aio writes to be safe.
+ *
+ * @returns 0 on success, negative error code on failure
+ */
+ int aio_flush();
+
+ /**
+ * Schedule a callback for when all currently pending
+ * aio writes are safe. This is a non-blocking version of
+ * aio_flush().
+ *
+ * @param c what to do when the writes are safe
+ * @returns 0 on success, negative error code on failure
+ */
+ int aio_flush_async(AioCompletion *c);
+ int aio_getxattr(const std::string& oid, AioCompletion *c, const char *name, bufferlist& bl);
+ int aio_getxattrs(const std::string& oid, AioCompletion *c, std::map<std::string, bufferlist>& attrset);
+ int aio_setxattr(const std::string& oid, AioCompletion *c, const char *name, bufferlist& bl);
+ int aio_rmxattr(const std::string& oid, AioCompletion *c, const char *name);
+ int aio_stat(const std::string& oid, AioCompletion *c, uint64_t *psize, time_t *pmtime);
+ int aio_stat2(const std::string& oid, AioCompletion *c, uint64_t *psize, struct timespec *pts);
+
+ /**
+ * Cancel aio operation
+ *
+ * @param c completion handle
+ * @returns 0 on success, negative error code on failure
+ */
+ int aio_cancel(AioCompletion *c);
+
+ int aio_exec(const std::string& oid, AioCompletion *c, const char *cls, const char *method,
+ bufferlist& inbl, bufferlist *outbl);
+
+ /*
+ * asynchronous version of unlock
+ */
+ int aio_unlock(const std::string &oid, const std::string &name,
+ const std::string &cookie, AioCompletion *c);
+
+ // compound object operations
+ int operate(const std::string& oid, ObjectWriteOperation *op);
+ int operate(const std::string& oid, ObjectWriteOperation *op, int flags);
+ int operate(const std::string& oid, ObjectReadOperation *op, bufferlist *pbl);
+ int operate(const std::string& oid, ObjectReadOperation *op, bufferlist *pbl, int flags);
+ int aio_operate(const std::string& oid, AioCompletion *c, ObjectWriteOperation *op);
+ int aio_operate(const std::string& oid, AioCompletion *c, ObjectWriteOperation *op, int flags);
+ /**
+ * Schedule an async write operation with explicit snapshot parameters
+ *
+ * This is the same as the first aio_operate(), except that it
+ * gets the snapshot context from its arguments instead of the
+ * IoCtx internal state.
+ *
+ * @param oid the object to operate on
+ * @param c what to do when the operation is complete and safe
+ * @param op which operations to perform
+ * @param seq latest selfmanaged snapshot sequence number for this object
+ * @param snaps currently existing selfmanaged snapshot ids for this object
+ * @returns 0 on success, negative error code on failure
+ */
+ int aio_operate(const std::string& oid, AioCompletion *c,
+ ObjectWriteOperation *op, snap_t seq,
+ std::vector<snap_t>& snaps);
+ int aio_operate(const std::string& oid, AioCompletion *c,
+ ObjectWriteOperation *op, snap_t seq,
+ std::vector<snap_t>& snaps,
+ const blkin_trace_info *trace_info);
+ int aio_operate(const std::string& oid, AioCompletion *c,
+ ObjectWriteOperation *op, snap_t seq,
+ std::vector<snap_t>& snaps, int flags,
+ const blkin_trace_info *trace_info);
+ int aio_operate(const std::string& oid, AioCompletion *c,
+ ObjectReadOperation *op, bufferlist *pbl);
+
+ int aio_operate(const std::string& oid, AioCompletion *c,
+ ObjectReadOperation *op, snap_t snapid, int flags,
+ bufferlist *pbl)
+ __attribute__ ((deprecated));
+
+ int aio_operate(const std::string& oid, AioCompletion *c,
+ ObjectReadOperation *op, int flags,
+ bufferlist *pbl);
+ int aio_operate(const std::string& oid, AioCompletion *c,
+ ObjectReadOperation *op, int flags,
+ bufferlist *pbl, const blkin_trace_info *trace_info);
+
+ // watch/notify
+ int watch2(const std::string& o, uint64_t *handle,
+ librados::WatchCtx2 *ctx);
+ int watch3(const std::string& o, uint64_t *handle,
+ librados::WatchCtx2 *ctx, uint32_t timeout);
+ int aio_watch(const std::string& o, AioCompletion *c, uint64_t *handle,
+ librados::WatchCtx2 *ctx);
+ int aio_watch2(const std::string& o, AioCompletion *c, uint64_t *handle,
+ librados::WatchCtx2 *ctx, uint32_t timeout);
+ int unwatch2(uint64_t handle);
+ int aio_unwatch(uint64_t handle, AioCompletion *c);
+ /**
+ * Send a notify event to watchers
+ *
+ * Upon completion the pbl bufferlist reply payload will be
+ * encoded like so:
+ *
+ * le32 num_acks
+ * {
+ * le64 gid global id for the client (for client.1234 that's 1234)
+ * le64 cookie cookie for the client
+ * le32 buflen length of reply message buffer
+ * u8 * buflen payload
+ * } * num_acks
+ * le32 num_timeouts
+ * {
+ * le64 gid global id for the client
+ * le64 cookie cookie for the client
+ * } * num_timeouts
+ *
+ *
+ */
+ int notify2(const std::string& o, ///< object
+ bufferlist& bl, ///< optional broadcast payload
+ uint64_t timeout_ms, ///< timeout (in ms)
+ bufferlist *pbl); ///< reply buffer
+ int aio_notify(const std::string& o, ///< object
+ AioCompletion *c, ///< completion when notify completes
+ bufferlist& bl, ///< optional broadcast payload
+ uint64_t timeout_ms, ///< timeout (in ms)
+ bufferlist *pbl); ///< reply buffer
+ /*
+ * Decode a notify response into acks and timeout vectors.
+ */
+ void decode_notify_response(bufferlist &bl,
+ std::vector<librados::notify_ack_t> *acks,
+ std::vector<librados::notify_timeout_t> *timeouts);
+
+ int list_watchers(const std::string& o, std::list<obj_watch_t> *out_watchers);
+ int list_snaps(const std::string& o, snap_set_t *out_snaps);
+ void set_notify_timeout(uint32_t timeout);
+
+ /// acknowledge a notify we received.
+ void notify_ack(const std::string& o, ///< watched object
+ uint64_t notify_id, ///< notify id
+ uint64_t cookie, ///< our watch handle
+ bufferlist& bl); ///< optional reply payload
+
+ /**
+ * check on watch validity
+ *
+ * Check if a watch is valid. If so, return the number of
+ * milliseconds since we last confirmed its liveness. If there is
+ * a known error, return it.
+ *
+ * If there is an error, the watch is no longer valid, and should
+ * be destroyed with unwatch(). If the user is still interested in
+ * the object, a new watch should be created with watch().
+ *
+ * @param cookie watch handle
+ * @returns ms since last confirmed valid, or error
+ */
+ int watch_check(uint64_t cookie);
+
+ // old, deprecated versions
+ int watch(const std::string& o, uint64_t ver, uint64_t *cookie,
+ librados::WatchCtx *ctx) __attribute__ ((deprecated));
+ int notify(const std::string& o, uint64_t ver, bufferlist& bl)
+ __attribute__ ((deprecated));
+ int unwatch(const std::string& o, uint64_t cookie)
+ __attribute__ ((deprecated));
+
+ /**
+ * Set allocation hint for an object
+ *
+ * This is an advisory operation, it will always succeed (as if it
+ * was submitted with an OP_FAILOK flag set) and is not guaranteed
+ * to do anything on the backend.
+ *
+ * @param o the name of the object
+ * @param expected_object_size expected size of the object, in bytes
+ * @param expected_write_size expected size of writes to the object, in bytes
+ * @returns 0 on success, negative error code on failure
+ */
+ int set_alloc_hint(const std::string& o,
+ uint64_t expected_object_size,
+ uint64_t expected_write_size);
+ int set_alloc_hint2(const std::string& o,
+ uint64_t expected_object_size,
+ uint64_t expected_write_size,
+ uint32_t flags);
+
+ // assert version for next sync operations
+ void set_assert_version(uint64_t ver);
+
+ /**
+ * Pin/unpin an object in cache tier
+ *
+ * @param o the name of the object
+ * @returns 0 on success, negative error code on failure
+ */
+ int cache_pin(const std::string& o);
+ int cache_unpin(const std::string& o);
+
+ std::string get_pool_name() const; ///< const overload of the get_pool_name() declared earlier in this class
+
+ void locator_set_key(const std::string& key);
+ void set_namespace(const std::string& nspace);
+ std::string get_namespace() const;
+
+ int64_t get_id();
+
+ // deprecated versions
+ uint32_t get_object_hash_position(const std::string& oid)
+ __attribute__ ((deprecated));
+ uint32_t get_object_pg_hash_position(const std::string& oid)
+ __attribute__ ((deprecated));
+
+ int get_object_hash_position2(const std::string& oid, uint32_t *hash_position);
+ int get_object_pg_hash_position2(const std::string& oid, uint32_t *pg_hash_position);
+
+ config_t cct();
+
+ void set_osdmap_full_try()
+ __attribute__ ((deprecated));
+ void unset_osdmap_full_try()
+ __attribute__ ((deprecated));
+
+ bool get_pool_full_try();
+ void set_pool_full_try();
+ void unset_pool_full_try();
+
+ int application_enable(const std::string& app_name, bool force);
+ int application_enable_async(const std::string& app_name,
+ bool force, PoolAsyncCompletion *c);
+ int application_list(std::set<std::string> *app_names);
+ int application_metadata_get(const std::string& app_name,
+ const std::string &key,
+ std::string *value);
+ int application_metadata_set(const std::string& app_name,
+ const std::string &key,
+ const std::string& value);
+ int application_metadata_remove(const std::string& app_name,
+ const std::string &key);
+ int application_metadata_list(const std::string& app_name,
+ std::map<std::string, std::string> *values);
+
+ private:
+ /* You can only get IoCtx instances from Rados */
+ IoCtx(IoCtxImpl *io_ctx_impl_);
+
+ friend class Rados; // Only Rados can use our private constructor to create IoCtxes.
+ friend class libradosstriper::RadosStriper; // Striper needs to see our IoCtxImpl
+ friend class ObjectWriteOperation; // copy_from needs to see our IoCtxImpl
+ friend class ObjectReadOperation; // set_chunk needs to see our IoCtxImpl
+
+ IoCtxImpl *io_ctx_impl;
+ };
+
+ struct CEPH_RADOS_API PlacementGroup { // placement-group handle; its state is held behind impl
+ PlacementGroup();
+ PlacementGroup(const PlacementGroup&); // user-declared copy: the std::unique_ptr member is not implicitly copyable
+ ~PlacementGroup();
+ bool parse(const char*); // fill from a C string; presumably returns true on success -- TODO confirm
+ std::unique_ptr<PlacementGroupImpl> impl; // owning pointer to the implementation object
+ };
+
+ CEPH_RADOS_API std::ostream& operator<<(std::ostream&, const PlacementGroup&); ///< stream-insertion operator for PlacementGroup
+
+ class CEPH_RADOS_API Rados
+ {
+ public:
+ static void version(int *major, int *minor, int *extra);
+
+ Rados();
+ explicit Rados(IoCtx& ioctx);
+ ~Rados();
+ static void from_rados_t(rados_t cluster, Rados &rados);
+
+ int init(const char * const id);
+ int init2(const char * const name, const char * const clustername,
+ uint64_t flags);
+ int init_with_context(config_t cct_);
+ config_t cct();
+ int connect();
+ void shutdown();
+ int watch_flush();
+ int aio_watch_flush(AioCompletion*);
+ int conf_read_file(const char * const path) const;
+ int conf_parse_argv(int argc, const char ** argv) const;
+ int conf_parse_argv_remainder(int argc, const char ** argv,
+ const char ** remargv) const;
+ int conf_parse_env(const char *env) const;
+ int conf_set(const char *option, const char *value);
+ int conf_get(const char *option, std::string &val);
+
+ int service_daemon_register(
+ const std::string& service, ///< service name (e.g., 'rgw')
+ const std::string& name, ///< daemon name (e.g., 'gwfoo')
+ const std::map<std::string,std::string>& metadata); ///< static metadata about daemon
+ int service_daemon_update_status(
+ std::map<std::string,std::string>&& status);
+
+ int pool_create(const char *name);
+ int pool_create(const char *name, uint64_t auid)
+ __attribute__ ((deprecated));
+ int pool_create(const char *name, uint64_t auid, uint8_t crush_rule)
+ __attribute__ ((deprecated));
+ int pool_create_with_rule(const char *name, uint8_t crush_rule);
+ int pool_create_async(const char *name, PoolAsyncCompletion *c);
+ int pool_create_async(const char *name, uint64_t auid, PoolAsyncCompletion *c)
+ __attribute__ ((deprecated));
+ int pool_create_async(const char *name, uint64_t auid, uint8_t crush_rule, PoolAsyncCompletion *c)
+ __attribute__ ((deprecated));
+ int pool_create_with_rule_async(const char *name, uint8_t crush_rule, PoolAsyncCompletion *c);
+ int pool_get_base_tier(int64_t pool, int64_t* base_tier);
+ int pool_delete(const char *name);
+ int pool_delete_async(const char *name, PoolAsyncCompletion *c);
+ int64_t pool_lookup(const char *name);
+ int pool_reverse_lookup(int64_t id, std::string *name);
+
+ uint64_t get_instance_id();
+
+ int get_min_compatible_osd(int8_t* require_osd_release);
+ int get_min_compatible_client(int8_t* min_compat_client,
+ int8_t* require_min_compat_client);
+
+ int mon_command(std::string cmd, const bufferlist& inbl,
+ bufferlist *outbl, std::string *outs);
+ int mgr_command(std::string cmd, const bufferlist& inbl,
+ bufferlist *outbl, std::string *outs);
+ int osd_command(int osdid, std::string cmd, const bufferlist& inbl,
+ bufferlist *outbl, std::string *outs);
+ int pg_command(const char *pgstr, std::string cmd, const bufferlist& inbl,
+ bufferlist *outbl, std::string *outs);
+
+ int ioctx_create(const char *name, IoCtx &pioctx);
+ int ioctx_create2(int64_t pool_id, IoCtx &pioctx);
+
+ // Features useful for test cases
+ void test_blocklist_self(bool set);
+
+ /* pool info */
+ int pool_list(std::list<std::string>& v);
+ int pool_list2(std::list<std::pair<int64_t, std::string> >& v);
+ int get_pool_stats(std::list<std::string>& v,
+ stats_map& result);
+ /// deprecated; use simpler form. categories no longer supported.
+ int get_pool_stats(std::list<std::string>& v,
+ std::map<std::string, stats_map>& stats);
+ /// deprecated; categories no longer supported
+ int get_pool_stats(std::list<std::string>& v,
+ std::string& category,
+ std::map<std::string, stats_map>& stats);
+ /// check if pool has selfmanaged snaps
+ bool get_pool_is_selfmanaged_snaps_mode(const std::string& poolname);
+
+ int cluster_stat(cluster_stat_t& result);
+ int cluster_fsid(std::string *fsid);
+
+ /**
+ * List inconsistent placement groups in the given pool
+ *
+ * @param pool_id the pool id
+ * @param pgs [out] the inconsistent PGs
+ */
+ int get_inconsistent_pgs(int64_t pool_id,
+ std::vector<PlacementGroup>* pgs);
+ /**
+ * List the inconsistent objects found in a given PG by last scrub
+ *
+ * @param pg the placement group returned by @c pg_list()
+ * @param start_after the first returned @c objects
+ * @param max_return the max number of the returned @c objects
+ * @param c what to do when the operation is complete and safe
+ * @param objects [out] the objects where inconsistencies are found
+ * @param interval [in,out] an epoch indicating current interval
+ * @returns if a non-zero @c interval is specified, will return -EAGAIN if
+ * the current interval begin epoch is different.
+ */
+ int get_inconsistent_objects(const PlacementGroup& pg,
+ const object_id_t &start_after,
+ unsigned max_return,
+ AioCompletion *c,
+ std::vector<inconsistent_obj_t>* objects,
+ uint32_t* interval);
+ /**
+ * List the inconsistent snapsets found in a given PG by last scrub
+ *
+ * @param pg the placement group returned by @c pg_list()
+ * @param start_after the first returned @c objects
+ * @param max_return the max number of the returned @c objects
+ * @param c what to do when the operation is complete and safe
+ * @param snapset [out] the snapsets where inconsistencies are found
+ * @param interval [in,out] an epoch indicating current interval
+ * @returns if a non-zero @c interval is specified, will return -EAGAIN if
+ * the current interval begin epoch is different.
+ */
+ int get_inconsistent_snapsets(const PlacementGroup& pg,
+ const object_id_t &start_after,
+ unsigned max_return,
+ AioCompletion *c,
+ std::vector<inconsistent_snapset_t>* snapset,
+ uint32_t* interval);
+
+ /// get/wait for the most recent osdmap
+ int wait_for_latest_osdmap();
+
+ int blocklist_add(const std::string& client_address,
+ uint32_t expire_seconds);
+
+ std::string get_addrs() const;
+
+ /*
+ * pool aio
+ *
+ * It is up to the caller to release the completion handler, even if the pool_create_async()
+ * and/or pool_delete_async() fails and does not send the async request
+ */
+ static PoolAsyncCompletion *pool_async_create_completion();
+
+ // -- aio --
+ static AioCompletion *aio_create_completion();
+ static AioCompletion *aio_create_completion(void *cb_arg, callback_t cb_complete,
+ callback_t cb_safe)
+ __attribute__ ((deprecated));
+ static AioCompletion *aio_create_completion(void *cb_arg, callback_t cb_complete);
+
+ friend std::ostream& operator<<(std::ostream &oss, const Rados& r);
+ private:
+ friend class neorados::RADOS;
+
+ // We don't allow assignment or copying
+ Rados(const Rados& rhs);
+ const Rados& operator=(const Rados& rhs);
+ RadosClient *client;
+ };
+
+} // namespace v14_2_0
+} // namespace librados
+
+#endif
+
diff --git a/src/include/rados/librados_fwd.hpp b/src/include/rados/librados_fwd.hpp
new file mode 100644
index 000000000..396f3a838
--- /dev/null
+++ b/src/include/rados/librados_fwd.hpp
@@ -0,0 +1,34 @@
+#ifndef __LIBRADOS_FWD_HPP
+#define __LIBRADOS_FWD_HPP
+
+struct blkin_trace_info;
+
+namespace libradosstriper {
+
+class RadosStriper;
+
+} // namespace libradosstriper
+
+namespace librados {
+inline namespace v14_2_0 {
+
+class AioCompletion;
+class IoCtx;
+class ListObject;
+class NObjectIterator;
+class ObjectCursor;
+class ObjectItem;
+class ObjectOperation;
+class ObjectOperationCompletion;
+class ObjectReadOperation;
+class ObjectWriteOperation;
+class PlacementGroup;
+class PoolAsyncCompletion;
+class Rados;
+class WatchCtx;
+class WatchCtx2;
+
+} // inline namespace v14_2_0
+} // namespace librados
+
+#endif // __LIBRADOS_FWD_HPP
diff --git a/src/include/rados/librgw.h b/src/include/rados/librgw.h
new file mode 100644
index 000000000..c20e96bed
--- /dev/null
+++ b/src/include/rados/librgw.h
@@ -0,0 +1,36 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2011 New Dream Network
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+#ifndef CEPH_LIBRGW_H
+#define CEPH_LIBRGW_H
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#define LIBRGW_VER_MAJOR 1
+#define LIBRGW_VER_MINOR 1
+#define LIBRGW_VER_EXTRA 0
+
+#define LIBRGW_VERSION(maj, min, extra) (((maj) << 16) + ((min) << 8) + (extra)) /* each argument parenthesized so expression arguments keep their grouping */
+#define LIBRGW_VERSION_CODE LIBRGW_VERSION(LIBRGW_VER_MAJOR, LIBRGW_VER_MINOR, LIBRGW_VER_EXTRA)
+
+typedef void* librgw_t;
+int librgw_create(librgw_t *rgw, int argc, char **argv);
+void librgw_shutdown(librgw_t rgw);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* CEPH_LIBRGW_H */
diff --git a/src/include/rados/objclass.h b/src/include/rados/objclass.h
new file mode 100644
index 000000000..80ae69d25
--- /dev/null
+++ b/src/include/rados/objclass.h
@@ -0,0 +1,177 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_OBJCLASS_OBJCLASS_PUBLIC_H
+#define CEPH_OBJCLASS_OBJCLASS_PUBLIC_H
+
+#ifdef __cplusplus
+
+#include "buffer.h"
+
+extern "C" {
+#endif
+
+#define CEPH_CLS_API [[gnu::visibility("default")]]
+
+#define CLS_VER(maj,min) \
+int __cls_ver__## maj ## _ ##min = 0; \
+int __cls_ver_maj = maj; \
+int __cls_ver_min = min;
+
+#define CLS_NAME(name) \
+int __cls_name__## name = 0; \
+const char *__cls_name = #name;
+
+#define CLS_INIT(name) \
+CEPH_CLS_API void __cls_init()
+
+#define CLS_METHOD_RD 0x1 /// method executes read operations
+#define CLS_METHOD_WR 0x2 /// method executes write operations
+#define CLS_METHOD_PROMOTE 0x8 /// method cannot be proxied to base tier
+
+#define CLS_LOG(level, fmt, ...) \
+ cls_log(level, "<cls> %s:%d: " fmt, __FILE__, __LINE__, ##__VA_ARGS__)
+#define CLS_ERR(fmt, ...) CLS_LOG(0, fmt, ##__VA_ARGS__)
+
+/**
+ * Initialize a class.
+ */
+void __cls_init();
+
+/**
+ * @typedef cls_handle_t
+ *
+ * A handle for interacting with the object class.
+ */
+typedef void *cls_handle_t;
+
+/**
+ * @typedef cls_method_handle_t
+ *
+ * A handle for interacting with the method of the object class.
+ */
+typedef void *cls_method_handle_t;
+
+/**
+ * @typedef cls_method_context_t
+ *
+ * A context for the method of the object class.
+ */
+typedef void* cls_method_context_t;
+
+/*class utils*/
+extern int cls_log(int level, const char *format, ...)
+ __attribute__((__format__(printf, 2, 3)));
+
+/* class registration api */
+extern int cls_register(const char *name, cls_handle_t *handle);
+
+#ifdef __cplusplus
+}
+
+/**
+ * @typedef cls_method_cxx_call_t
+ *
+ */
+typedef int (*cls_method_cxx_call_t)(cls_method_context_t ctx,
+ class ceph::buffer::list *inbl, class ceph::buffer::list *outbl);
+
+/**
+ * Register a method.
+ *
+ * @param hclass
+ * @param method
+ * @param flags
+ * @param class_call
+ * @param handle
+ */
+extern int cls_register_cxx_method(cls_handle_t hclass, const char *method, int flags,
+ cls_method_cxx_call_t class_call, cls_method_handle_t *handle);
+
+/**
+ * Create an object.
+ *
+ * @param hctx
+ * @param exclusive
+ */
+extern int cls_cxx_create(cls_method_context_t hctx, bool exclusive);
+
+/**
+ * Remove an object.
+ *
+ * @param hctx
+ */
+extern int cls_cxx_remove(cls_method_context_t hctx);
+
+/**
+ * Check on the status of an object.
+ *
+ * @param hctx
+ * @param size
+ * @param mtime
+ */
+extern int cls_cxx_stat(cls_method_context_t hctx, uint64_t *size, time_t *mtime);
+
+/**
+ * Read contents of an object.
+ *
+ * @param hctx
+ * @param ofs
+ * @param len
+ * @param bl
+ */
+extern int cls_cxx_read(cls_method_context_t hctx, int ofs, int len, ceph::bufferlist *bl);
+
+/**
+ * Write to the object.
+ *
+ * @param hctx
+ * @param ofs
+ * @param len
+ * @param bl
+ */
+extern int cls_cxx_write(cls_method_context_t hctx, int ofs, int len, ceph::bufferlist *bl);
+
+/**
+ * Get xattr of the object.
+ *
+ * @param hctx
+ * @param name
+ * @param outbl
+ */
+extern int cls_cxx_getxattr(cls_method_context_t hctx, const char *name,
+ ceph::bufferlist *outbl);
+
+/**
+ * Set xattr of the object.
+ *
+ * @param hctx
+ * @param name
+ * @param inbl
+ */
+extern int cls_cxx_setxattr(cls_method_context_t hctx, const char *name,
+ ceph::bufferlist *inbl);
+
+/**
+ * Get value corresponding to a key from the map.
+ *
+ * @param hctx
+ * @param key
+ * @param outbl
+ */
+extern int cls_cxx_map_get_val(cls_method_context_t hctx,
+ const std::string &key, ceph::bufferlist *outbl);
+
+/**
+ * Set value corresponding to a key in the map.
+ *
+ * @param hctx
+ * @param key
+ * @param inbl
+ */
+extern int cls_cxx_map_set_val(cls_method_context_t hctx,
+ const std::string &key, ceph::bufferlist *inbl);
+
+#endif
+
+#endif
diff --git a/src/include/rados/page.h b/src/include/rados/page.h
new file mode 120000
index 000000000..cf983e838
--- /dev/null
+++ b/src/include/rados/page.h
@@ -0,0 +1 @@
+../page.h \ No newline at end of file
diff --git a/src/include/rados/rados_types.h b/src/include/rados/rados_types.h
new file mode 100644
index 000000000..d308341ec
--- /dev/null
+++ b/src/include/rados/rados_types.h
@@ -0,0 +1,41 @@
+#ifndef CEPH_RADOS_TYPES_H
+#define CEPH_RADOS_TYPES_H
+
+#include <stdint.h>
+
+/**
+ * @struct obj_watch_t
+ * One item from list_watchers
+ */
+struct obj_watch_t {
+ /// Address of the Watcher
+ char addr[256];
+ /// Watcher ID
+ int64_t watcher_id;
+ /// Cookie
+ uint64_t cookie;
+ /// Timeout in Seconds
+ uint32_t timeout_seconds;
+};
+
+struct notify_ack_t {
+ uint64_t notifier_id;
+ uint64_t cookie;
+ char *payload;
+ uint64_t payload_len;
+};
+
+struct notify_timeout_t {
+ uint64_t notifier_id;
+ uint64_t cookie;
+};
+
+/**
+ *
+ * Pass as nspace argument to rados_ioctx_set_namespace()
+ * before calling rados_nobjects_list_open() to return
+ * all objects in all namespaces.
+ */
+#define LIBRADOS_ALL_NSPACES "\001"
+
+#endif
diff --git a/src/include/rados/rados_types.hpp b/src/include/rados/rados_types.hpp
new file mode 100644
index 000000000..84023579b
--- /dev/null
+++ b/src/include/rados/rados_types.hpp
@@ -0,0 +1,341 @@
+#ifndef CEPH_RADOS_TYPES_HPP
+#define CEPH_RADOS_TYPES_HPP
+
+#include <map>
+#include <utility>
+#include <vector>
+#include <stdint.h>
+#include <string>
+
+#include "buffer.h"
+#include "rados_types.h"
+
+namespace librados {
+
+typedef uint64_t snap_t;
+
+enum {
+ SNAP_HEAD = (uint64_t)(-2),
+ SNAP_DIR = (uint64_t)(-1)
+};
+
+struct clone_info_t {
+ snap_t cloneid;
+ std::vector<snap_t> snaps; // ascending
+ std::vector< std::pair<uint64_t,uint64_t> > overlap; // with next newest
+ uint64_t size;
+ clone_info_t() : cloneid(0), size(0) {}
+};
+
+struct snap_set_t {
+ std::vector<clone_info_t> clones; // ascending
+ snap_t seq; // newest snapid seen by the object
+ snap_set_t() : seq(0) {}
+};
+
+struct object_id_t {
+ std::string name;
+ std::string nspace;
+ std::string locator;
+ snap_t snap = 0;
+ object_id_t() = default;
+ object_id_t(const std::string& name,
+ const std::string& nspace,
+ const std::string& locator,
+ snap_t snap)
+ : name(name),
+ nspace(nspace),
+ locator(locator),
+ snap(snap)
+ {}
+};
+
+struct err_t {
+ enum : uint64_t {
+ SHARD_MISSING = 1 << 1,
+ SHARD_STAT_ERR = 1 << 2,
+ SHARD_READ_ERR = 1 << 3,
+ DATA_DIGEST_MISMATCH_OI = 1 << 9, // Old
+ DATA_DIGEST_MISMATCH_INFO = 1 << 9,
+ OMAP_DIGEST_MISMATCH_OI = 1 << 10, // Old
+ OMAP_DIGEST_MISMATCH_INFO = 1 << 10,
+ SIZE_MISMATCH_OI = 1 << 11, // Old
+ SIZE_MISMATCH_INFO = 1 << 11,
+ SHARD_EC_HASH_MISMATCH = 1 << 12,
+ SHARD_EC_SIZE_MISMATCH = 1 << 13,
+ OI_ATTR_MISSING = 1 << 14, // Old
+ INFO_MISSING = 1 << 14,
+ OI_ATTR_CORRUPTED = 1 << 15, // Old
+ INFO_CORRUPTED = 1 << 15,
+ SS_ATTR_MISSING = 1 << 16, // Old
+ SNAPSET_MISSING = 1 << 16,
+ SS_ATTR_CORRUPTED = 1 << 17, // Old
+ SNAPSET_CORRUPTED = 1 << 17,
+ OBJ_SIZE_OI_MISMATCH = 1 << 18, // Old
+ OBJ_SIZE_INFO_MISMATCH = 1 << 18,
+ HINFO_MISSING = 1 << 19,
+ HINFO_CORRUPTED = 1 << 20
+ // When adding more here add to either SHALLOW_ERRORS or DEEP_ERRORS
+ };
+ uint64_t errors = 0;
+ static constexpr uint64_t SHALLOW_ERRORS = SHARD_MISSING|SHARD_STAT_ERR|SIZE_MISMATCH_INFO|INFO_MISSING|INFO_CORRUPTED|SNAPSET_MISSING|SNAPSET_CORRUPTED|OBJ_SIZE_INFO_MISMATCH|HINFO_MISSING|HINFO_CORRUPTED;
+ static constexpr uint64_t DEEP_ERRORS = SHARD_READ_ERR|DATA_DIGEST_MISMATCH_INFO|OMAP_DIGEST_MISMATCH_INFO|SHARD_EC_HASH_MISMATCH|SHARD_EC_SIZE_MISMATCH;
+ bool has_shard_missing() const {
+ return errors & SHARD_MISSING;
+ }
+ bool has_stat_error() const {
+ return errors & SHARD_STAT_ERR;
+ }
+ bool has_read_error() const {
+ return errors & SHARD_READ_ERR;
+ }
+ bool has_data_digest_mismatch_oi() const { // Compatibility
+ return errors & DATA_DIGEST_MISMATCH_OI;
+ }
+ bool has_data_digest_mismatch_info() const {
+ return errors & DATA_DIGEST_MISMATCH_INFO;
+ }
+ bool has_omap_digest_mismatch_oi() const { // Compatibility
+ return errors & OMAP_DIGEST_MISMATCH_OI;
+ }
+ bool has_omap_digest_mismatch_info() const {
+ return errors & OMAP_DIGEST_MISMATCH_INFO;
+ }
+ bool has_size_mismatch_oi() const { // Compatibility
+ return errors & SIZE_MISMATCH_OI;
+ }
+ bool has_size_mismatch_info() const {
+ return errors & SIZE_MISMATCH_INFO;
+ }
+ bool has_ec_hash_error() const {
+ return errors & SHARD_EC_HASH_MISMATCH;
+ }
+ bool has_ec_size_error() const {
+ return errors & SHARD_EC_SIZE_MISMATCH;
+ }
+ bool has_oi_attr_missing() const { // Compatibility
+ return errors & OI_ATTR_MISSING;
+ }
+ bool has_info_missing() const {
+ return errors & INFO_MISSING;
+ }
+ bool has_oi_attr_corrupted() const { // Compatibility
+ return errors & OI_ATTR_CORRUPTED;
+ }
+ bool has_info_corrupted() const {
+ return errors & INFO_CORRUPTED;
+ }
+ bool has_ss_attr_missing() const { // Compatibility
+ return errors & SS_ATTR_MISSING;
+ }
+ bool has_snapset_missing() const {
+ return errors & SNAPSET_MISSING;
+ }
+ bool has_ss_attr_corrupted() const { // Compatibility
+ return errors & SS_ATTR_CORRUPTED;
+ }
+ bool has_snapset_corrupted() const {
+ return errors & SNAPSET_CORRUPTED;
+ }
+ bool has_shallow_errors() const {
+ return errors & SHALLOW_ERRORS;
+ }
+ bool has_deep_errors() const {
+ return errors & DEEP_ERRORS;
+ }
+ bool has_obj_size_oi_mismatch() const { // Compatibility
+ return errors & OBJ_SIZE_OI_MISMATCH;
+ }
+ bool has_obj_size_info_mismatch() const {
+ return errors & OBJ_SIZE_INFO_MISMATCH;
+ }
+ bool has_hinfo_missing() const {
+ return errors & HINFO_MISSING;
+ }
+ bool has_hinfo_corrupted() const {
+ return errors & HINFO_CORRUPTED;
+ }
+};
+
+struct shard_info_t : err_t {
+ std::map<std::string, ceph::bufferlist> attrs;
+ uint64_t size = -1;
+ bool omap_digest_present = false;
+ uint32_t omap_digest = 0;
+ bool data_digest_present = false;
+ uint32_t data_digest = 0;
+ bool selected_oi = false;
+ bool primary = false;
+};
+
+struct osd_shard_t {
+ int32_t osd;
+ int8_t shard;
+};
+
+inline bool operator<(const osd_shard_t &lhs, const osd_shard_t &rhs) {
+  // Lexicographic ordering on (osd, shard): the OSD id decides unless both
+  // sides name the same OSD, in which case the shard id breaks the tie.
+  if (lhs.osd != rhs.osd) {
+    return lhs.osd < rhs.osd;
+  }
+  return lhs.shard < rhs.shard;
+}
+
+struct obj_err_t {
+ enum : uint64_t {
+ OBJECT_INFO_INCONSISTENCY = 1 << 1,
+ // XXX: Can an older rados binary work if these bits stay the same?
+ DATA_DIGEST_MISMATCH = 1 << 4,
+ OMAP_DIGEST_MISMATCH = 1 << 5,
+ SIZE_MISMATCH = 1 << 6,
+ ATTR_VALUE_MISMATCH = 1 << 7,
+ ATTR_NAME_MISMATCH = 1 << 8,
+ SNAPSET_INCONSISTENCY = 1 << 9,
+ HINFO_INCONSISTENCY = 1 << 10,
+ SIZE_TOO_LARGE = 1 << 11,
+ // When adding more here add to either SHALLOW_ERRORS or DEEP_ERRORS
+ };
+ uint64_t errors = 0;
+ static constexpr uint64_t SHALLOW_ERRORS = OBJECT_INFO_INCONSISTENCY|SIZE_MISMATCH|ATTR_VALUE_MISMATCH
+ |ATTR_NAME_MISMATCH|SNAPSET_INCONSISTENCY|HINFO_INCONSISTENCY|SIZE_TOO_LARGE;
+ static constexpr uint64_t DEEP_ERRORS = DATA_DIGEST_MISMATCH|OMAP_DIGEST_MISMATCH;
+ bool has_object_info_inconsistency() const {
+ return errors & OBJECT_INFO_INCONSISTENCY;
+ }
+ bool has_data_digest_mismatch() const {
+ return errors & DATA_DIGEST_MISMATCH;
+ }
+ bool has_omap_digest_mismatch() const {
+ return errors & OMAP_DIGEST_MISMATCH;
+ }
+ bool has_size_mismatch() const {
+ return errors & SIZE_MISMATCH;
+ }
+ bool has_attr_value_mismatch() const {
+ return errors & ATTR_VALUE_MISMATCH;
+ }
+ bool has_attr_name_mismatch() const {
+ return errors & ATTR_NAME_MISMATCH;
+ }
+ bool has_shallow_errors() const {
+ return errors & SHALLOW_ERRORS;
+ }
+ bool has_deep_errors() const {
+ return errors & DEEP_ERRORS;
+ }
+ bool has_snapset_inconsistency() const {
+ return errors & SNAPSET_INCONSISTENCY;
+ }
+ bool has_hinfo_inconsistency() const {
+ return errors & HINFO_INCONSISTENCY;
+ }
+ bool has_size_too_large() const {
+ return errors & SIZE_TOO_LARGE;
+ }
+};
+
+struct inconsistent_obj_t : obj_err_t { // object-level error flags live in the obj_err_t base
+ inconsistent_obj_t() = default;
+ inconsistent_obj_t(const object_id_t& object)
+ : object{object}, version(0)
+ {}
+ object_id_t object; // identity (name/nspace/locator/snap) of the inconsistent object
+ uint64_t version = 0; // XXX: Redundant with object info attr; initialized here so inconsistent_obj_t() never leaves it indeterminate
+ std::map<osd_shard_t, shard_info_t> shards; // per-shard details keyed by (osd, shard)
+ err_t union_shards; // NOTE(review): presumably the OR of every shard's error bits -- confirm against the producer
+};
+
+struct inconsistent_snapset_t {
+ inconsistent_snapset_t() = default;
+ inconsistent_snapset_t(const object_id_t& head)
+ : object{head}
+ {}
+ enum {
+ SNAPSET_MISSING = 1 << 0,
+ SNAPSET_CORRUPTED = 1 << 1,
+ CLONE_MISSING = 1 << 2,
+ SNAP_ERROR = 1 << 3,
+ HEAD_MISMATCH = 1 << 4, // Unused
+ HEADLESS_CLONE = 1 << 5,
+ SIZE_MISMATCH = 1 << 6,
+ OI_MISSING = 1 << 7, // Old
+ INFO_MISSING = 1 << 7,
+ OI_CORRUPTED = 1 << 8, // Old
+ INFO_CORRUPTED = 1 << 8,
+ EXTRA_CLONES = 1 << 9,
+ };
+ uint64_t errors = 0;
+ object_id_t object;
+ // Extra clones
+ std::vector<snap_t> clones;
+ std::vector<snap_t> missing;
+ ceph::bufferlist ss_bl;
+
+ bool ss_attr_missing() const { // Compatibility
+ return errors & SNAPSET_MISSING;
+ }
+ bool snapset_missing() const {
+ return errors & SNAPSET_MISSING;
+ }
+ bool ss_attr_corrupted() const { // Compatibility
+ return errors & SNAPSET_CORRUPTED;
+ }
+ bool snapset_corrupted() const {
+ return errors & SNAPSET_CORRUPTED;
+ }
+ bool clone_missing() const {
+ return errors & CLONE_MISSING;
+ }
+ bool snapset_mismatch() const { // Compatibility
+ return errors & SNAP_ERROR;
+ }
+ bool snapset_error() const {
+ return errors & SNAP_ERROR;
+ }
+ bool head_mismatch() const { // Compatibility
+ return false;
+ }
+ bool headless() const {
+ return errors & HEADLESS_CLONE;
+ }
+ bool size_mismatch() const {
+ return errors & SIZE_MISMATCH;
+ }
+ bool oi_attr_missing() const { // Compatibility
+ return errors & OI_MISSING;
+ }
+ bool info_missing() const {
+ return errors & INFO_MISSING;
+ }
+ bool oi_attr_corrupted() const { // Compatibility
+ return errors & OI_CORRUPTED;
+ }
+ bool info_corrupted() const {
+ return errors & INFO_CORRUPTED;
+ }
+ bool extra_clones() const {
+ return errors & EXTRA_CLONES;
+ }
+};
+
+/**
+ * @var all_nspaces
+ * Pass as nspace argument to IoCtx::set_namespace()
+ * before calling nobjects_begin() to iterate
+ * through all objects in all namespaces.
+ */
+const std::string all_nspaces(LIBRADOS_ALL_NSPACES);
+
+struct notify_ack_t {
+ uint64_t notifier_id;
+ uint64_t cookie;
+ ceph::bufferlist payload_bl;
+};
+
+struct notify_timeout_t {
+ uint64_t notifier_id;
+ uint64_t cookie;
+};
+}
+#endif
diff --git a/src/include/rados/rgw_file.h b/src/include/rados/rgw_file.h
new file mode 100644
index 000000000..e1ea45593
--- /dev/null
+++ b/src/include/rados/rgw_file.h
@@ -0,0 +1,431 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * convert RGW commands to file commands
+ *
+ * Copyright (C) 2015 Red Hat, Inc.
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+#ifndef RADOS_RGW_FILE_H
+#define RADOS_RGW_FILE_H
+
+#include <sys/stat.h>
+#include <sys/types.h>
+#include <stdint.h>
+#include <stdbool.h>
+
+#include "librgw.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#define LIBRGW_FILE_VER_MAJOR 1
+#define LIBRGW_FILE_VER_MINOR 2
+#define LIBRGW_FILE_VER_EXTRA 0
+
+#define LIBRGW_FILE_VERSION(maj, min, extra) (((maj) << 16) + ((min) << 8) + (extra)) /* each argument parenthesized so expression arguments keep their grouping */
+#define LIBRGW_FILE_VERSION_CODE LIBRGW_FILE_VERSION(LIBRGW_FILE_VER_MAJOR, LIBRGW_FILE_VER_MINOR, LIBRGW_FILE_VER_EXTRA)
+
+/*
+ * object types
+ */
+enum rgw_fh_type {
+ RGW_FS_TYPE_NIL = 0,
+ RGW_FS_TYPE_FILE,
+ RGW_FS_TYPE_DIRECTORY,
+ RGW_FS_TYPE_SYMBOLIC_LINK,
+};
+
+/*
+ * dynamically allocated handle to support NFS handles
+ */
+
+/* content-addressable hash */
+struct rgw_fh_hk {
+ uint64_t bucket;
+ uint64_t object;
+};
+
+struct rgw_file_handle
+{
+ /* content-addressable hash */
+ struct rgw_fh_hk fh_hk;
+ void *fh_private; /* librgw private data */
+ /* object type */
+ enum rgw_fh_type fh_type;
+};
+
+struct rgw_fs
+{
+ librgw_t rgw;
+ void *fs_private;
+ struct rgw_file_handle* root_fh;
+};
+
+
+/* XXX mount info hypothetical--emulate Unix, support at least
+ * UUID-length fsid */
+struct rgw_statvfs {
+ uint64_t f_bsize; /* file system block size */
+ uint64_t f_frsize; /* fragment size */
+ uint64_t f_blocks; /* size of fs in f_frsize units */
+ uint64_t f_bfree; /* # free blocks */
+ uint64_t f_bavail; /* # free blocks for unprivileged users */
+ uint64_t f_files; /* # inodes */
+ uint64_t f_ffree; /* # free inodes */
+ uint64_t f_favail; /* # free inodes for unprivileged users */
+ uint64_t f_fsid[2]; /* file system ID */
+ uint64_t f_flag; /* mount flags */
+ uint64_t f_namemax; /* maximum filename length */
+};
+
+
+void rgwfile_version(int *major, int *minor, int *extra);
+
+/*
+ lookup object by name (POSIX style)
+*/
+#define RGW_LOOKUP_FLAG_NONE 0x0000
+#define RGW_LOOKUP_FLAG_CREATE 0x0001
+#define RGW_LOOKUP_FLAG_RCB 0x0002 /* readdir callback hint */
+#define RGW_LOOKUP_FLAG_DIR 0x0004
+#define RGW_LOOKUP_FLAG_FILE 0x0008
+
+#define RGW_LOOKUP_TYPE_FLAGS \
+ (RGW_LOOKUP_FLAG_DIR|RGW_LOOKUP_FLAG_FILE)
+
+int rgw_lookup(struct rgw_fs *rgw_fs,
+ struct rgw_file_handle *parent_fh, const char *path,
+ struct rgw_file_handle **fh,
+ struct stat *st, uint32_t mask, uint32_t flags);
+
+/*
+ lookup object by handle (NFS style)
+*/
+int rgw_lookup_handle(struct rgw_fs *rgw_fs, struct rgw_fh_hk *fh_hk,
+ struct rgw_file_handle **fh, uint32_t flags);
+
+/*
+ * release file handle
+ */
+#define RGW_FH_RELE_FLAG_NONE 0x0000
+
+int rgw_fh_rele(struct rgw_fs *rgw_fs, struct rgw_file_handle *fh,
+ uint32_t flags);
+
+/*
+ attach rgw namespace
+*/
+#define RGW_MOUNT_FLAG_NONE 0x0000
+
+int rgw_mount(librgw_t rgw, const char *uid, const char *key,
+ const char *secret, struct rgw_fs **rgw_fs,
+ uint32_t flags);
+
+int rgw_mount2(librgw_t rgw, const char *uid, const char *key,
+ const char *secret, const char *root, struct rgw_fs **rgw_fs,
+ uint32_t flags);
+
+/*
+ register invalidate callbacks
+*/
+#define RGW_REG_INVALIDATE_FLAG_NONE 0x0000
+
+typedef void (*rgw_fh_callback_t)(void *handle, struct rgw_fh_hk fh_hk);
+
+int rgw_register_invalidate(struct rgw_fs *rgw_fs, rgw_fh_callback_t cb,
+ void *arg, uint32_t flags);
+
+/*
+ detach rgw namespace
+*/
+#define RGW_UMOUNT_FLAG_NONE 0x0000
+
+int rgw_umount(struct rgw_fs *rgw_fs, uint32_t flags);
+
+
+/*
+ get filesystem attributes
+*/
+#define RGW_STATFS_FLAG_NONE 0x0000
+
+int rgw_statfs(struct rgw_fs *rgw_fs,
+ struct rgw_file_handle *parent_fh,
+ struct rgw_statvfs *vfs_st,
+ uint32_t flags);
+
+
+/* XXX (get|set)attr mask bits */
+#define RGW_SETATTR_MODE 1
+#define RGW_SETATTR_UID 2
+#define RGW_SETATTR_GID 4
+#define RGW_SETATTR_MTIME 8
+#define RGW_SETATTR_ATIME 16
+#define RGW_SETATTR_SIZE 32
+#define RGW_SETATTR_CTIME 64
+
+/*
+ create file
+*/
+#define RGW_CREATE_FLAG_NONE 0x0000
+
+int rgw_create(struct rgw_fs *rgw_fs, struct rgw_file_handle *parent_fh,
+ const char *name, struct stat *st, uint32_t mask,
+ struct rgw_file_handle **fh, uint32_t posix_flags,
+ uint32_t flags);
+
+/*
+ create a symbolic link
+ */
+#define RGW_CREATELINK_FLAG_NONE 0x0000
+int rgw_symlink(struct rgw_fs *rgw_fs, struct rgw_file_handle *parent_fh,
+ const char *name, const char *link_path, struct stat *st,
+ uint32_t mask, struct rgw_file_handle **fh, uint32_t posix_flags,
+ uint32_t flags);
+
+/*
+ create a new directory
+*/
+#define RGW_MKDIR_FLAG_NONE 0x0000
+
+int rgw_mkdir(struct rgw_fs *rgw_fs,
+ struct rgw_file_handle *parent_fh,
+ const char *name, struct stat *st, uint32_t mask,
+ struct rgw_file_handle **fh, uint32_t flags);
+
+/*
+ rename object
+*/
+#define RGW_RENAME_FLAG_NONE 0x0000
+
+int rgw_rename(struct rgw_fs *rgw_fs,
+ struct rgw_file_handle *olddir, const char* old_name,
+ struct rgw_file_handle *newdir, const char* new_name,
+ uint32_t flags);
+
+/*
+ remove file or directory
+*/
+#define RGW_UNLINK_FLAG_NONE 0x0000
+
+int rgw_unlink(struct rgw_fs *rgw_fs,
+ struct rgw_file_handle *parent_fh, const char* path,
+ uint32_t flags);
+
+/*
+ read directory content
+*/
+typedef int (*rgw_readdir_cb)(const char *name, void *arg, uint64_t offset,
+ struct stat *st, uint32_t mask,
+ uint32_t flags);
+
+#define RGW_READDIR_FLAG_NONE 0x0000
+#define RGW_READDIR_FLAG_DOTDOT 0x0001 /* send dot names */
+
+int rgw_readdir(struct rgw_fs *rgw_fs,
+ struct rgw_file_handle *parent_fh, uint64_t *offset,
+ rgw_readdir_cb rcb, void *cb_arg, bool *eof,
+ uint32_t flags);
+
+/* enumeration continuing from name */
+int rgw_readdir2(struct rgw_fs *rgw_fs,
+ struct rgw_file_handle *parent_fh, const char *name,
+ rgw_readdir_cb rcb, void *cb_arg, bool *eof,
+ uint32_t flags);
+
+/* project offset of dirent name */
+#define RGW_DIRENT_OFFSET_FLAG_NONE 0x0000
+
+int rgw_dirent_offset(struct rgw_fs *rgw_fs,
+ struct rgw_file_handle *parent_fh,
+ const char *name, int64_t *offset,
+ uint32_t flags);
+
+/*
+ get unix attributes for object
+*/
+#define RGW_GETATTR_FLAG_NONE 0x0000
+
+int rgw_getattr(struct rgw_fs *rgw_fs,
+ struct rgw_file_handle *fh, struct stat *st,
+ uint32_t flags);
+
+/*
+ set unix attributes for object
+*/
+#define RGW_SETATTR_FLAG_NONE 0x0000
+
+int rgw_setattr(struct rgw_fs *rgw_fs,
+ struct rgw_file_handle *fh, struct stat *st,
+ uint32_t mask, uint32_t flags);
+
+/*
+ truncate file
+*/
+#define RGW_TRUNCATE_FLAG_NONE 0x0000
+
+int rgw_truncate(struct rgw_fs *rgw_fs,
+ struct rgw_file_handle *fh, uint64_t size,
+ uint32_t flags);
+
+/*
+ open file
+*/
+#define RGW_OPEN_FLAG_NONE 0x0000
+#define RGW_OPEN_FLAG_CREATE 0x0001
+#define RGW_OPEN_FLAG_V3 0x0002 /* ops have v3 semantics */
+#define RGW_OPEN_FLAG_STATELESS 0x0002 /* alias it */
+
+int rgw_open(struct rgw_fs *rgw_fs, struct rgw_file_handle *parent_fh,
+ uint32_t posix_flags, uint32_t flags);
+
+/*
+ close file
+*/
+
+#define RGW_CLOSE_FLAG_NONE 0x0000
+#define RGW_CLOSE_FLAG_RELE 0x0001
+
+int rgw_close(struct rgw_fs *rgw_fs, struct rgw_file_handle *fh,
+ uint32_t flags);
+
+/*
+ read data from file
+*/
+#define RGW_READ_FLAG_NONE 0x0000
+
+int rgw_read(struct rgw_fs *rgw_fs,
+ struct rgw_file_handle *fh, uint64_t offset,
+ size_t length, size_t *bytes_read, void *buffer,
+ uint32_t flags);
+
+/*
+ read symbolic link
+*/
+#define RGW_READLINK_FLAG_NONE 0x0000
+
+int rgw_readlink(struct rgw_fs *rgw_fs,
+ struct rgw_file_handle *fh, uint64_t offset,
+ size_t length, size_t *bytes_read, void *buffer,
+ uint32_t flags);
+
+/*
+ write data to file
+*/
+#define RGW_WRITE_FLAG_NONE 0x0000
+
+int rgw_write(struct rgw_fs *rgw_fs,
+ struct rgw_file_handle *fh, uint64_t offset,
+ size_t length, size_t *bytes_written, void *buffer,
+ uint32_t flags);
+
+#define RGW_UIO_NONE 0x0000
+#define RGW_UIO_GIFT 0x0001
+#define RGW_UIO_FREE 0x0002
+#define RGW_UIO_BUFQ 0x0004
+
+struct rgw_uio;
+typedef void (*rgw_uio_release)(struct rgw_uio *, uint32_t);
+
+/* buffer vector descriptors */
+struct rgw_vio {
+ void *vio_p1;
+ void *vio_u1;
+ void *vio_base;
+ int32_t vio_len;
+};
+
+struct rgw_uio {
+ rgw_uio_release uio_rele;
+ void *uio_p1;
+ void *uio_u1;
+ uint64_t uio_offset;
+ uint64_t uio_resid;
+ uint32_t uio_cnt;
+ uint32_t uio_flags;
+ struct rgw_vio *uio_vio; /* appended vectors */
+};
+
+typedef struct rgw_uio rgw_uio;
+
+int rgw_readv(struct rgw_fs *rgw_fs,
+ struct rgw_file_handle *fh, rgw_uio *uio, uint32_t flags);
+
+int rgw_writev(struct rgw_fs *rgw_fs,
+ struct rgw_file_handle *fh, rgw_uio *uio, uint32_t flags);
+
+/*
+ sync written data
+*/
+#define RGW_FSYNC_FLAG_NONE 0x0000
+
+int rgw_fsync(struct rgw_fs *rgw_fs, struct rgw_file_handle *fh,
+ uint32_t flags);
+
+/*
+ NFS commit operation
+*/
+
+#define RGW_COMMIT_FLAG_NONE 0x0000
+
+int rgw_commit(struct rgw_fs *rgw_fs, struct rgw_file_handle *fh,
+ uint64_t offset, uint64_t length, uint32_t flags);
+
+/*
+ extended attributes
+ */
+typedef struct rgw_xattrstr
+{
+ char *val;
+ uint32_t len;
+} rgw_xattrstr;
+
+typedef struct rgw_xattr
+{
+ rgw_xattrstr key;
+ rgw_xattrstr val;
+} rgw_xattr;
+
+typedef struct rgw_xattrlist
+{
+ rgw_xattr *xattrs;
+ uint32_t xattr_cnt;
+} rgw_xattrlist;
+
+#define RGW_GETXATTR_FLAG_NONE 0x0000
+
+typedef int (*rgw_getxattr_cb)(rgw_xattrlist *attrs, void *arg,
+ uint32_t flags);
+
+int rgw_getxattrs(struct rgw_fs *rgw_fs, struct rgw_file_handle *fh,
+ rgw_xattrlist *attrs, rgw_getxattr_cb cb, void *cb_arg,
+ uint32_t flags);
+
+#define RGW_LSXATTR_FLAG_NONE 0x0000
+#define RGW_LSXATTR_FLAG_STOP 0x0001
+
+int rgw_lsxattrs(struct rgw_fs *rgw_fs, struct rgw_file_handle *fh,
+ rgw_xattrstr *filter_prefix /* unimplemented for now */,
+ rgw_getxattr_cb cb, void *cb_arg, uint32_t flags);
+
+#define RGW_SETXATTR_FLAG_NONE 0x0000
+
+int rgw_setxattrs(struct rgw_fs *rgw_fs, struct rgw_file_handle *fh,
+ rgw_xattrlist *attrs, uint32_t flags);
+
+#define RGW_RMXATTR_FLAG_NONE 0x0000
+
+int rgw_rmxattrs(struct rgw_fs *rgw_fs, struct rgw_file_handle *fh,
+ rgw_xattrlist *attrs, uint32_t flags);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* RADOS_RGW_FILE_H */
diff --git a/src/include/radosstriper/libradosstriper.h b/src/include/radosstriper/libradosstriper.h
new file mode 100644
index 000000000..a35345f7d
--- /dev/null
+++ b/src/include/radosstriper/libradosstriper.h
@@ -0,0 +1,620 @@
+#ifndef CEPH_LIBRADOSSTRIPER_H
+#define CEPH_LIBRADOSSTRIPER_H
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include <string.h>
+
+#include "../rados/librados.h"
+
+#define LIBRADOSSTRIPER_VER_MAJOR 0
+#define LIBRADOSSTRIPER_VER_MINOR 0
+#define LIBRADOSSTRIPER_VER_EXTRA 0
+
+#define LIBRADOSSTRIPER_VERSION(maj, min, extra) (((maj) << 16) + ((min) << 8) + (extra)) /* packs maj/min/extra into one integer; each argument is parenthesized so expression arguments expand with the intended precedence */
+
+#define LIBRADOSSTRIPER_VERSION_CODE LIBRADOSSTRIPER_VERSION(LIBRADOSSTRIPER_VER_MAJOR, LIBRADOSSTRIPER_VER_MINOR, LIBRADOSSTRIPER_VER_EXTRA)
+
+/**
+ * @typedef rados_striper_t
+ *
+ * A handle for interacting with striped objects in a RADOS cluster.
+ */
+typedef void *rados_striper_t;
+
+/**
+ * @defgroup libradosstriper_h_init Setup and Teardown
+ * These are the first and last functions that should be called
+ * when using libradosstriper.
+ *
+ * @{
+ */
+
+/**
+ * Creates a rados striper using the given io context
+ * Striper has initially default object layout.
+ * See rados_striper_set_object_layout_*() to change this
+ *
+ * @param ioctx the rados context to use
+ * @param striper where to store the rados striper
+ * @returns 0 on success, negative error code on failure
+ */
+ int rados_striper_create(rados_ioctx_t ioctx,
+ rados_striper_t *striper);
+
+/**
+ * Destroys a rados striper
+ *
+ * @param striper the striper to destroy
+ */
+void rados_striper_destroy(rados_striper_t striper);
+
+/**
+ * Sets the object layout's stripe unit of a rados striper for future objects.
+ * This layout will be used when new objects are created (by writing to them)
+ * Already existing objects will be opened with their own layout.
+ *
+ * @param striper the targeted striper
+ * @param stripe_unit the stripe_unit value of the new object layout
+ * @returns 0 on success, negative error code on failure
+ */
+int rados_striper_set_object_layout_stripe_unit(rados_striper_t striper,
+ unsigned int stripe_unit);
+
+/**
+ * Sets the object layout's stripe count of a rados striper for future objects.
+ * This layout will be used when new objects are created (by writing to them)
+ * Already existing objects will be opened with their own layout.
+ *
+ * @param striper the targeted striper
+ * @param stripe_count the stripe_count value of the new object layout
+ * @returns 0 on success, negative error code on failure
+ */
+int rados_striper_set_object_layout_stripe_count(rados_striper_t striper,
+ unsigned int stripe_count);
+
+/**
+ * Sets the object layout's object_size of a rados striper for future objects.
+ * This layout will be used when new objects are created (by writing to them)
+ * Already existing objects will be opened with their own layout.
+ *
+ * @param striper the targeted striper
+ * @param object_size the object_size value of the new object layout
+ * @returns 0 on success, negative error code on failure
+ */
+int rados_striper_set_object_layout_object_size(rados_striper_t striper,
+ unsigned int object_size);
+
+/** @} init */
+
+/**
+ * @defgroup libradosstriper_h_synch_io Synchronous I/O
+ * Writes are striped to several rados objects which are then
+ * replicated to a number of OSDs based on the configuration
+ * of the pool they are in. These write functions block
+ * until data is in memory on all replicas of the object they're
+ * writing to - they are equivalent to doing the corresponding
+ * asynchronous write, and then calling
+ * rados_striper_ioctx_wait_for_complete().
+ *
+ * @{
+ */
+
+/**
+ * Synchronously write data to a striped object at the specified offset
+ *
+ * @param striper the striper in which the write will occur
+ * @param soid the name of the striped object
+ * @param buf data to write
+ * @param len length of the data, in bytes
+ * @param off byte offset in the object to begin writing at
+ * @returns 0 on success, negative error code on
+ * failure
+ */
+int rados_striper_write(rados_striper_t striper,
+ const char *soid,
+ const char *buf,
+ size_t len,
+ uint64_t off);
+
+/**
+ * Synchronously write an entire striped object
+ *
+ * The striped object is filled with the provided data. If the striped object exists,
+ * it is truncated and then written.
+ *
+ * @param striper the striper in which the write will occur
+ * @param soid the name of the striped object
+ * @param buf data to write
+ * @param len length of the data, in bytes
+ * @returns 0 on success, negative error code on failure
+ */
+int rados_striper_write_full(rados_striper_t striper,
+ const char *soid,
+ const char *buf,
+ size_t len);
+
+/**
+ * Synchronously append data to a striped object
+ *
+ * @param striper the striper in which the write will occur
+ * @param soid the name of the striped object
+ * @param buf the data to append
+ * @param len length of buf (in bytes)
+ * @returns 0 on success, negative error code on
+ * failure
+ */
+int rados_striper_append(rados_striper_t striper,
+ const char *soid,
+ const char *buf,
+ size_t len);
+
+/**
+ * Synchronously read data from a striped object at the specified offset
+ *
+ * @param striper the striper in which the read will occur
+ * @param soid the name of the striped object
+ * @param buf where to store the results
+ * @param len the number of bytes to read
+ * @param off the offset to start reading from in the object
+ * @returns number of bytes read on success, negative error code on
+ * failure
+ */
+int rados_striper_read(rados_striper_t striper,
+ const char *soid,
+ char *buf,
+ size_t len,
+ uint64_t off);
+
+/**
+ * Synchronously removes a striped object
+ *
+ * @note There is no atomicity of the deletion and the striped
+ * object may be left incomplete if an error is returned (metadata
+ * all present, but some stripes missing)
+ * However, there is atomicity of the metadata deletion and
+ * the deletion can not happen if any I/O is ongoing (it
+ * will return EBUSY). Identically, no I/O will be able to start
+ * during deletion (same EBUSY return code)
+ * @param striper the striper in which the remove will occur
+ * @param soid the name of the striped object
+ * @returns 0 on success, negative error code on failure
+ */
+int rados_striper_remove(rados_striper_t striper,
+ const char* soid);
+
+/**
+ * Resize a striped object
+ *
+ * If this enlarges the object, the new area is logically filled with
+ * zeroes. If this shrinks the object, the excess data is removed.
+ *
+ * @note the truncation is not fully atomic. The metadata part is,
+ * so the behavior will be atomic from user point of view when
+ * the object size is reduced. However, in case of failure, old data
+ * may stay around, hidden. They may reappear if the object size is
+ * later grown, instead of the expected 0s. When growing the
+ * object and in case of failure, the new 0 data may not be
+ * fully created. This can lead to ENOENT errors when
+ * writing/reading the missing parts.
+ * @note the truncation can not happen if any I/O is ongoing (it
+ * will return EBUSY). Identically, no I/O will be able to start
+ * during truncation (same EBUSY return code)
+ * @param striper the striper in which the truncation will occur
+ * @param soid the name of the striped object
+ * @param size the new size of the object in bytes
+ * @returns 0 on success, negative error code on failure
+ */
+int rados_striper_trunc(rados_striper_t striper, const char *soid, uint64_t size);
+
+/** @} Synchronous I/O */
+
+/**
+ * @defgroup libradosstriper_h_xattrs Xattrs
+ * Extended attributes are stored as extended attributes on the
+ * first rados regular object of the striped object.
+ * Thus, they have the same limitations as the underlying
+ * rados extended attributes.
+ *
+ * @{
+ */
+
+/**
+ * Get the value of an extended attribute on a striped object.
+ *
+ * @param striper the striper in which the getxattr will occur
+ * @param oid name of the striped object
+ * @param name which extended attribute to read
+ * @param buf where to store the result
+ * @param len size of buf in bytes
+ * @returns length of xattr value on success, negative error code on failure
+ */
+int rados_striper_getxattr(rados_striper_t striper,
+ const char *oid,
+ const char *name,
+ char *buf,
+ size_t len);
+
+/**
+ * Set an extended attribute on a striped object.
+ *
+ * @param striper the striper in which the setxattr will occur
+ * @param oid name of the object
+ * @param name which extended attribute to set
+ * @param buf what to store in the xattr
+ * @param len the number of bytes in buf
+ * @returns 0 on success, negative error code on failure
+ */
+int rados_striper_setxattr(rados_striper_t striper,
+ const char *oid,
+ const char *name,
+ const char *buf,
+ size_t len);
+
+/**
+ * Delete an extended attribute from a striped object.
+ *
+ * @param striper the striper in which the rmxattr will occur
+ * @param oid name of the object
+ * @param name which xattr to delete
+ * @returns 0 on success, negative error code on failure
+ */
+int rados_striper_rmxattr(rados_striper_t striper,
+ const char *oid,
+ const char *name);
+
+/**
+ * Start iterating over xattrs on a striped object.
+ *
+ * @post iter is a valid iterator
+ *
+ * @param striper the striper in which the getxattrs will occur
+ * @param oid name of the object
+ * @param iter where to store the iterator
+ * @returns 0 on success, negative error code on failure
+ */
+int rados_striper_getxattrs(rados_striper_t striper,
+ const char *oid,
+ rados_xattrs_iter_t *iter);
+
+/**
+ * Get the next xattr on the striped object
+ *
+ * @pre iter is a valid iterator
+ *
+ * @post name is the NULL-terminated name of the next xattr, and val
+ * contains the value of the xattr, which is of length len. If the end
+ * of the list has been reached, name and val are NULL, and len is 0.
+ *
+ * @param iter iterator to advance
+ * @param name where to store the name of the next xattr
+ * @param val where to store the value of the next xattr
+ * @param len the number of bytes in val
+ * @returns 0 on success, negative error code on failure
+ */
+int rados_striper_getxattrs_next(rados_xattrs_iter_t iter,
+ const char **name,
+ const char **val,
+ size_t *len);
+
+/**
+ * Close the xattr iterator.
+ *
+ * iter should not be used after this is called.
+ *
+ * @param iter the iterator to close
+ */
+void rados_striper_getxattrs_end(rados_xattrs_iter_t iter);
+
+/** @} Xattrs */
+
+/**
+ * Synchronously get object stats (size/mtime)
+ *
+ * @param striper the striper in which the stat will occur
+ * @param soid the id of the striped object
+ * @param psize where to store object size
+ * @param pmtime where to store modification time
+ * @returns 0 on success, negative error code on failure
+ */
+int rados_striper_stat(rados_striper_t striper,
+ const char* soid,
+ uint64_t *psize,
+ time_t *pmtime);
+
+int rados_striper_stat2(rados_striper_t striper,
+ const char* soid,
+ uint64_t *psize,
+ struct timespec *pmtime);
+
+/**
+ * @defgroup libradosstriper_h_asynch_io Asynchronous I/O
+ * Read and write to objects without blocking.
+ *
+ * @{
+ */
+
+/**
+ * @typedef rados_striper_multi_completion_t
+ * Represents the state of a set of asynchronous operations
+ * it contains the aggregated return value once the operations complete
+ * and can be used to block until all operations are complete and/or safe.
+ */
+typedef void *rados_striper_multi_completion_t;
+
+/**
+ * Constructs a multi completion to use with asynchronous operations
+ *
+ * The complete and safe callbacks correspond to operations being
+ * acked and committed, respectively. The callbacks are called in
+ * order of receipt, so the safe callback may be triggered before the
+ * complete callback, and vice versa. This is affected by journalling
+ * on the OSDs.
+ *
+ * @note Read operations only get a complete callback.
+ * @note BUG: this should check for ENOMEM instead of throwing an exception
+ *
+ * @param cb_arg application-defined data passed to the callback functions
+ * @param cb_complete the function to be called when the operation is
+ * in memory on all replicas
+ * @param cb_safe the function to be called when the operation is on
+ * stable storage on all replicas
+ * @param pc where to store the completion
+ * @returns 0
+ */
+int rados_striper_multi_aio_create_completion(void *cb_arg,
+ rados_callback_t cb_complete,
+ rados_callback_t cb_safe,
+ rados_striper_multi_completion_t *pc);
+
+/**
+ * Block until all operations complete
+ *
+ * This means data is in memory on all replicas.
+ *
+ * @param c operations to wait for
+ * @note No value is returned (the function is declared void).
+ */
+void rados_striper_multi_aio_wait_for_complete(rados_striper_multi_completion_t c);
+
+/**
+ * Block until all operations are safe
+ *
+ * This means data is on stable storage on all replicas.
+ *
+ * @param c operations to wait for
+ * @note No value is returned (the function is declared void).
+ */
+void rados_striper_multi_aio_wait_for_safe(rados_striper_multi_completion_t c);
+
+/**
+ * Has a multi asynchronous operation completed?
+ *
+ * @warning This does not imply that the complete callback has
+ * finished
+ *
+ * @param c async operations to inspect
+ * @returns whether c is complete
+ */
+int rados_striper_multi_aio_is_complete(rados_striper_multi_completion_t c);
+
+/**
+ * Is a multi asynchronous operation safe?
+ *
+ * @warning This does not imply that the safe callback has
+ * finished
+ *
+ * @param c async operations to inspect
+ * @returns whether c is safe
+ */
+int rados_striper_multi_aio_is_safe(rados_striper_multi_completion_t c);
+
+/**
+ * Block until all operations complete and callback completes
+ *
+ * This means data is in memory on all replicas and can be read.
+ *
+ * @param c operations to wait for
+ * @note No value is returned (the function is declared void).
+ */
+void rados_striper_multi_aio_wait_for_complete_and_cb(rados_striper_multi_completion_t c);
+
+/**
+ * Block until all operations are safe and callback has completed
+ *
+ * This means data is on stable storage on all replicas.
+ *
+ * @param c operations to wait for
+ * @note No value is returned (the function is declared void).
+ */
+void rados_striper_multi_aio_wait_for_safe_and_cb(rados_striper_multi_completion_t c);
+
+/**
+ * Has a multi asynchronous operation and callback completed
+ *
+ * @param c async operations to inspect
+ * @returns whether c is complete
+ */
+int rados_striper_multi_aio_is_complete_and_cb(rados_striper_multi_completion_t c);
+
+/**
+ * Is a multi asynchronous operation safe and has the callback completed
+ *
+ * @param c async operations to inspect
+ * @returns whether c is safe
+ */
+int rados_striper_multi_aio_is_safe_and_cb(rados_striper_multi_completion_t c);
+
+/**
+ * Get the return value of a multi asynchronous operation
+ *
+ * The return value is set when all operations are complete or safe,
+ * whichever comes first.
+ *
+ * @pre The operation is safe or complete
+ *
+ * @note BUG: complete callback may never be called when the safe
+ * message is received before the complete message
+ *
+ * @param c async operations to inspect
+ * @returns aggregated return value of the operations
+ */
+int rados_striper_multi_aio_get_return_value(rados_striper_multi_completion_t c);
+
+/**
+ * Release a multi asynchronous IO completion
+ *
+ * Call this when you no longer need the completion. It may not be
+ * freed immediately if the operation is not acked and committed.
+ *
+ * @param c multi completion to release
+ */
+void rados_striper_multi_aio_release(rados_striper_multi_completion_t c);
+
+/**
+ * Asynchronously write data to a striped object at the specified offset
+ *
+ * The return value of the completion will be 0 on success, negative
+ * error code on failure.
+ *
+ * @param striper the striper in which the write will occur
+ * @param soid the name of the striped object
+ * @param completion what to do when the write is safe and complete
+ * @param buf data to write
+ * @param len length of the data, in bytes
+ * @param off byte offset in the object to begin writing at
+ * @returns 0 on success, negative error code on
+ * failure
+ */
+int rados_striper_aio_write(rados_striper_t striper,
+ const char *soid,
+ rados_completion_t completion,
+ const char *buf,
+ size_t len,
+ uint64_t off);
+
+/**
+ * Asynchronously appends data to a striped object
+ *
+ * The return value of the completion will be 0 on success, negative
+ * error code on failure.
+ *
+ * @param striper the striper in which the write will occur
+ * @param soid the name of the striped object
+ * @param completion what to do when the write is safe and complete
+ * @param buf data to write
+ * @param len length of the data, in bytes
+ * @returns 0 on success, negative error code on
+ * failure
+ */
+int rados_striper_aio_append(rados_striper_t striper,
+ const char *soid,
+ rados_completion_t completion,
+ const char *buf,
+ size_t len);
+
+/**
+ * Asynchronously fills an object with the provided data.
+ * If the object exists, it is truncated and then written.
+ *
+ * The return value of the completion will be 0 on success, negative
+ * error code on failure.
+ *
+ * @param striper the striper in which the write will occur
+ * @param soid the name of the striped object
+ * @param completion what to do when the write is safe and complete
+ * @param buf data to write
+ * @param len length of the data, in bytes
+ * @returns 0 on success, negative error code on
+ * failure
+ */
+int rados_striper_aio_write_full(rados_striper_t striper,
+ const char *soid,
+ rados_completion_t completion,
+ const char *buf,
+ size_t len);
+
+/**
+ * Asynchronously read data from a striped object at the specified offset
+ *
+ * The return value of the completion will be number of bytes read on
+ * success, negative error code on failure.
+ *
+ * @param striper the striper in which the read will occur
+ * @param soid the name of the striped object
+ * @param completion what to do when the read is safe and complete
+ * @param buf where to store the results
+ * @param len the number of bytes to read
+ * @param off the offset to start reading from in the object
+ * @returns 0 on success, negative error code on
+ * failure
+ */
+int rados_striper_aio_read(rados_striper_t striper,
+ const char *soid,
+ rados_completion_t completion,
+ char *buf,
+ const size_t len,
+ uint64_t off);
+
+/**
+ * Asynchronously removes a striped object
+ *
+ * @note There is no atomicity of the deletion and the striped
+ * object may be left incomplete if an error is returned (metadata
+ * all present, but some stripes missing)
+ * However, there is atomicity of the metadata deletion and
+ * the deletion can not happen if any I/O is ongoing (it
+ * will return EBUSY). Identically, no I/O will be able to start
+ * during deletion (same EBUSY return code)
+ * @param striper the striper in which the remove will occur
+ * @param soid the name of the striped object
+ * @param completion what to do when the remove is safe and complete
+ * @returns 0 on success, negative error code on failure
+ */
+
+int rados_striper_aio_remove(rados_striper_t striper,
+ const char* soid,
+ rados_completion_t completion);
+
+/**
+ * Block until all pending writes in a striper are safe
+ *
+ * This is not equivalent to calling rados_striper_multi_aio_wait_for_safe() on all
+ * write completions, since this waits for the associated callbacks to
+ * complete as well.
+ *
+ * @param striper the striper in which the flush will occur
+ * @note No value is returned (the function is declared void).
+*/
+void rados_striper_aio_flush(rados_striper_t striper);
+
+/**
+ * Asynchronously get object stats (size/mtime)
+ *
+ * @param striper the striper in which the stat will occur
+ * @param soid the id of the striped object
+ * @param psize where to store object size
+ * @param pmtime where to store modification time
+ * @param completion what to do when the stats is complete
+ * @returns 0 on success, negative error code on failure
+ */
+int rados_striper_aio_stat(rados_striper_t striper,
+ const char* soid,
+ rados_completion_t completion,
+ uint64_t *psize,
+ time_t *pmtime);
+
+int rados_striper_aio_stat2(rados_striper_t striper,
+ const char* soid,
+ rados_completion_t completion,
+ uint64_t *psize,
+ struct timespec *pmtime);
+/** @} Asynchronous I/O */
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif
diff --git a/src/include/radosstriper/libradosstriper.hpp b/src/include/radosstriper/libradosstriper.hpp
new file mode 100644
index 000000000..fb790b0d7
--- /dev/null
+++ b/src/include/radosstriper/libradosstriper.hpp
@@ -0,0 +1,241 @@
+#ifndef __LIBRADOSSTRIPER_HPP
+#define __LIBRADOSSTRIPER_HPP
+
+#include <string.h>
+#include <string>
+#include <map>
+#include "../rados/buffer.h"
+#include "../rados/librados.hpp"
+
+#include "libradosstriper.h"
+
+namespace libradosstriper
+{
+ struct RadosStriperImpl;
+ struct MultiAioCompletionImpl;
+
+ /*
+ * Completion object for multiple asynchronous I/O operations.
+ * It internally handles several "requests"; the safe-variant members are marked deprecated.
+ */
+ struct MultiAioCompletion {
+ MultiAioCompletion(MultiAioCompletionImpl *pc_) : pc(pc_) {}
+ ~MultiAioCompletion();
+ int set_complete_callback(void *cb_arg, librados::callback_t cb);
+ int set_safe_callback(void *cb_arg, librados::callback_t cb) __attribute__ ((deprecated));
+ void wait_for_complete();
+ void wait_for_safe() __attribute__ ((deprecated));
+ void wait_for_complete_and_cb();
+ void wait_for_safe_and_cb() __attribute__ ((deprecated));
+ bool is_complete();
+ bool is_safe() __attribute__ ((deprecated));
+ bool is_complete_and_cb();
+ bool is_safe_and_cb() __attribute__ ((deprecated));
+ int get_return_value();
+ void release();
+ MultiAioCompletionImpl *pc; // implementation pointer supplied to the constructor
+ };
+
+ /* RadosStriper : This class performs reads/writes on striped objects
+ *
+ * Typical use (error checking omitted):
+ *
+ * RadosStriper rs;
+ * RadosStriper::striper_create(ioctx, &rs); // ioctx: a librados::IoCtx
+ * bufferlist bl;
+ * ... put data in bl ...
+ * rs.write(object_name, bl, len, offset);
+ * bufferlist bl2;
+ * rs.read(object_name, &bl2, len, offset);
+ * ...
+ */
+ class RadosStriper
+ {
+ public:
+
+ /*
+ * constructor
+ */
+ RadosStriper();
+
+ /*
+ * builds the C counterpart of a RadosStriper
+ */
+ static void to_rados_striper_t(RadosStriper &striper,
+ rados_striper_t *s);
+
+ /*
+ * copy constructor
+ */
+ RadosStriper(const RadosStriper& rs);
+
+ /*
+ * operator=
+ */
+ RadosStriper& operator=(const RadosStriper& rs);
+
+ /*
+ * destructor
+ * Internally calling close() if an object is currently opened
+ */
+ ~RadosStriper();
+
+ /*
+ * create method
+ */
+ static int striper_create(librados::IoCtx& ioctx,
+ RadosStriper *striper);
+
+ /*
+ * set object layout's stripe unit
+ * This layout will be used when new objects are created (by writing to them)
+ * Already existing objects will be opened with their own layout.
+ */
+ int set_object_layout_stripe_unit(unsigned int stripe_unit);
+
+ /*
+ * set object layout's stripe count
+ * This layout will be used when new objects are created (by writing to them)
+ * Already existing objects will be opened with their own layout.
+ */
+ int set_object_layout_stripe_count(unsigned int stripe_count);
+
+ /*
+ * set object layout's object size
+ * This layout will be used when new objects are created (by writing to them)
+ * Already existing objects will be opened with their own layout.
+ */
+ int set_object_layout_object_size(unsigned int object_size);
+
+ /**
+ * Get the value of an extended attribute on a striped object
+ */
+ int getxattr(const std::string& oid, const char *name, ceph::bufferlist& bl);
+
+ /**
+ * Set the value of an extended attribute on a striped object
+ */
+ int setxattr(const std::string& oid, const char *name, ceph::bufferlist& bl);
+
+ /**
+ * Delete an extended attribute from a striped object
+ */
+ int rmxattr(const std::string& oid, const char *name);
+
+ /**
+ * Get the xattrs of a striped object into @p attrset (name -> value).
+ */
+ int getxattrs(const std::string& oid,
+ std::map<std::string, ceph::bufferlist>& attrset);
+
+ /**
+ * synchronously write to the striped object at the specified offset.
+ * NOTE: this call steals the contents of @p bl.
+ */
+ int write(const std::string& soid, const ceph::bufferlist& bl, size_t len, uint64_t off);
+
+ /**
+ * synchronously fill the striped object with the specified data
+ * NOTE: this call steals the contents of @p bl.
+ */
+ int write_full(const std::string& soid, const ceph::bufferlist& bl);
+
+ /**
+ * synchronously append data to the striped object
+ * NOTE: this call steals the contents of @p bl.
+ */
+ int append(const std::string& soid, const ceph::bufferlist& bl, size_t len);
+
+ /**
+ * asynchronously write to the striped object at the specified offset.
+ * NOTE: this call steals the contents of @p bl.
+ */
+ int aio_write(const std::string& soid, librados::AioCompletion *c, const ceph::bufferlist& bl, size_t len, uint64_t off);
+
+ /**
+ * asynchronously fill the striped object with the specified data
+ * NOTE: this call steals the contents of @p bl.
+ */
+ int aio_write_full(const std::string& soid, librados::AioCompletion *c, const ceph::bufferlist& bl);
+
+ /**
+ * asynchronously append data to the striped object
+ * NOTE: this call steals the contents of @p bl.
+ */
+ int aio_append(const std::string& soid, librados::AioCompletion *c, const ceph::bufferlist& bl, size_t len);
+
+ /**
+ * synchronously read from the striped object at the specified offset.
+ */
+ int read(const std::string& soid, ceph::bufferlist* pbl, size_t len, uint64_t off);
+
+ /**
+ * asynchronously read from the striped object at the specified offset.
+ */
+ int aio_read(const std::string& soid, librados::AioCompletion *c, ceph::bufferlist *pbl, size_t len, uint64_t off);
+
+ /**
+ * synchronously get striped object stats (size/mtime)
+ */
+ int stat(const std::string& soid, uint64_t *psize, time_t *pmtime);
+ int stat2(const std::string& soid, uint64_t *psize, struct timespec *pts);
+
+ /**
+ * asynchronously get striped object stats (size/mtime)
+ */
+ int aio_stat(const std::string& soid, librados::AioCompletion *c,
+ uint64_t *psize, time_t *pmtime);
+ int aio_stat2(const std::string& soid, librados::AioCompletion *c,
+ uint64_t *psize, struct timespec *pts);
+
+ /**
+ * deletes a striped object.
+ * There is no atomicity of the deletion and the striped
+ * object may be left incomplete if an error is returned (metadata
+ * all present, but some stripes missing)
+ * However, there is atomicity of the metadata deletion and
+ * the deletion can not happen if any I/O is ongoing (it
+ * will return EBUSY). Identically, no I/O will be able to start
+ * during deletion (same EBUSY return code)
+ */
+ int remove(const std::string& soid);
+ int remove(const std::string& soid, int flags);
+
+ /**
+ * asynchronous remove of striped objects
+ * See synchronous version for comments on (lack of) atomicity
+ */
+ int aio_remove(const std::string& soid, librados::AioCompletion *c);
+ int aio_remove(const std::string& soid, librados::AioCompletion *c, int flags);
+
+ /**
+ * Resizes a striped object
+ * the truncation can not happen if any I/O is ongoing (it
+ * will return EBUSY). Identically, no I/O will be able to start
+ * during truncation (same EBUSY return code)
+ */
+ int trunc(const std::string& oid, uint64_t size);
+
+ /**
+ * Wait for all currently pending aio writes to be safe.
+ *
+ * @returns 0 on success, negative error code on failure
+ */
+ int aio_flush();
+
+ /**
+ * creation of multi aio completion objects
+ */
+ static MultiAioCompletion *multi_aio_create_completion();
+ static MultiAioCompletion *multi_aio_create_completion(void *cb_arg,
+ librados::callback_t cb_complete,
+ librados::callback_t cb_safe);
+
+ private:
+ RadosStriperImpl *rados_striper_impl;
+
+ };
+
+}
+
+#endif