diff options
author | Daniel Baumann <daniel.baumann@progress-linux.org> | 2024-04-21 11:54:28 +0000 |
---|---|---|
committer | Daniel Baumann <daniel.baumann@progress-linux.org> | 2024-04-21 11:54:28 +0000 |
commit | e6918187568dbd01842d8d1d2c808ce16a894239 (patch) | |
tree | 64f88b554b444a49f656b6c656111a145cbbaa28 /src/include/rados | |
parent | Initial commit. (diff) | |
download | ceph-e6918187568dbd01842d8d1d2c808ce16a894239.tar.xz ceph-e6918187568dbd01842d8d1d2c808ce16a894239.zip |
Adding upstream version 18.2.2. (upstream/18.2.2)
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to '')
-rw-r--r-- | src/include/rados.h | 700 | ||||
l--------- | src/include/rados/buffer.h | 1 | ||||
l--------- | src/include/rados/buffer_fwd.h | 1 | ||||
l--------- | src/include/rados/crc32c.h | 1 | ||||
l--------- | src/include/rados/inline_memory.h | 1 | ||||
-rw-r--r-- | src/include/rados/librados.h | 4156 | ||||
-rw-r--r-- | src/include/rados/librados.hpp | 1568 | ||||
-rw-r--r-- | src/include/rados/librados_fwd.hpp | 34 | ||||
-rw-r--r-- | src/include/rados/librgw.h | 36 | ||||
-rw-r--r-- | src/include/rados/objclass.h | 177 | ||||
l--------- | src/include/rados/page.h | 1 | ||||
-rw-r--r-- | src/include/rados/rados_types.h | 41 | ||||
-rw-r--r-- | src/include/rados/rados_types.hpp | 341 | ||||
-rw-r--r-- | src/include/rados/rgw_file.h | 431 | ||||
-rw-r--r-- | src/include/radosstriper/libradosstriper.h | 620 | ||||
-rw-r--r-- | src/include/radosstriper/libradosstriper.hpp | 241 |
16 files changed, 8350 insertions, 0 deletions
diff --git a/src/include/rados.h b/src/include/rados.h new file mode 100644 index 000000000..eac3a2159 --- /dev/null +++ b/src/include/rados.h @@ -0,0 +1,700 @@ +#ifndef CEPH_RADOS_H +#define CEPH_RADOS_H + +/* + * Data types for the Ceph distributed object storage layer RADOS + * (Reliable Autonomic Distributed Object Store). + */ + +#include <string.h> +#include <stdbool.h> +#include "msgr.h" + +/* See comment in ceph_fs.h. */ +#ifndef __KERNEL__ +#include "byteorder.h" +#define __le16 ceph_le16 +#define __le32 ceph_le32 +#define __le64 ceph_le64 +#endif + +/* + * fs id + */ +struct ceph_fsid { + unsigned char fsid[16]; +}; + +static inline int ceph_fsid_compare(const struct ceph_fsid *a, + const struct ceph_fsid *b) +{ + return memcmp(a, b, sizeof(*a)); +} + +/* + * ino, object, etc. + */ +typedef __le64 ceph_snapid_t; +#define CEPH_SNAPDIR ((__u64)(-1)) /* reserved for hidden .snap dir */ +#define CEPH_NOSNAP ((__u64)(-2)) /* "head", "live" revision */ +#define CEPH_MAXSNAP ((__u64)(-3)) /* largest valid snapid */ + +struct ceph_timespec { + __le32 tv_sec; + __le32 tv_nsec; +} __attribute__ ((packed)); + + +/* + * object layout - how objects are mapped into PGs + */ +#define CEPH_OBJECT_LAYOUT_HASH 1 +#define CEPH_OBJECT_LAYOUT_LINEAR 2 +#define CEPH_OBJECT_LAYOUT_HASHINO 3 + +/* + * pg layout -- how PGs are mapped onto (sets of) OSDs + */ +#define CEPH_PG_LAYOUT_CRUSH 0 +#define CEPH_PG_LAYOUT_HASH 1 +#define CEPH_PG_LAYOUT_LINEAR 2 +#define CEPH_PG_LAYOUT_HYBRID 3 + +#define CEPH_PG_MAX_SIZE 16 /* max # osds in a single pg */ + +/* + * placement group. + * we encode this into one __le64. + */ +struct ceph_pg { + __le16 preferred; /* preferred primary osd */ + __le16 ps; /* placement seed */ + __le32 pool; /* object pool */ +} __attribute__ ((packed)); + +/* + * pg pool types + * + * NOTE: These map 1:1 on to the pg_pool_t::TYPE_* values. They are + * duplicated here only for CrushCompiler's benefit. 
+ */ +#define CEPH_PG_TYPE_REPLICATED 1 +/* #define CEPH_PG_TYPE_RAID4 2 never implemented */ +#define CEPH_PG_TYPE_ERASURE 3 + +/* + * stable_mod func is used to control number of placement groups. + * similar to straight-up modulo, but produces a stable mapping as b + * increases over time. b is the number of bins, and bmask is the + * containing power of 2 minus 1. + * + * b <= bmask and bmask=(2**n)-1 + * e.g., b=12 -> bmask=15, b=123 -> bmask=127 + * + * ** This function is released to the public domain by the author. ** + */ +static inline int ceph_stable_mod(int x, int b, int bmask) +{ + if ((x & bmask) < b) + return x & bmask; + else + return x & (bmask >> 1); +} + +/* + * object layout - how a given object should be stored. + */ +struct ceph_object_layout { + struct ceph_pg ol_pgid; /* raw pg, with _full_ ps precision. */ + __le32 ol_stripe_unit; /* for per-object parity, if any */ +} __attribute__ ((packed)); + +/* + * compound epoch+version, used by storage layer to serialize mutations + */ +struct ceph_eversion { + __le32 epoch; + __le64 version; +} __attribute__ ((packed)); + +/* + * osd map bits + */ + +/* status bits */ +#define CEPH_OSD_EXISTS (1<<0) +#define CEPH_OSD_UP (1<<1) +#define CEPH_OSD_AUTOOUT (1<<2) /* osd was automatically marked out */ +#define CEPH_OSD_NEW (1<<3) /* osd is new, never marked in */ +#define CEPH_OSD_FULL (1<<4) /* osd is at or above full threshold */ +#define CEPH_OSD_NEARFULL (1<<5) /* osd is at or above nearfull threshold */ +#define CEPH_OSD_BACKFILLFULL (1<<6) /* osd is at or above backfillfull threshold */ +#define CEPH_OSD_DESTROYED (1<<7) /* osd has been destroyed */ +#define CEPH_OSD_NOUP (1<<8) /* osd can not be marked up */ +#define CEPH_OSD_NODOWN (1<<9) /* osd can not be marked down */ +#define CEPH_OSD_NOIN (1<<10) /* osd can not be marked in */ +#define CEPH_OSD_NOOUT (1<<11) /* osd can not be marked out */ +#define CEPH_OSD_STOP (1<<12) /* osd has been stopped by admin */ + +extern const char 
*ceph_osd_state_name(int s); + +/* osd weights. fixed point value: 0x10000 == 1.0 ("in"), 0 == "out" */ +#define CEPH_OSD_IN 0x10000 +#define CEPH_OSD_OUT 0 + +#define CEPH_OSD_MAX_PRIMARY_AFFINITY 0x10000 +#define CEPH_OSD_DEFAULT_PRIMARY_AFFINITY 0x10000 + + +/* + * osd map flag bits + */ +#define CEPH_OSDMAP_NEARFULL (1<<0) /* sync writes (near ENOSPC), deprecated since mimic*/ +#define CEPH_OSDMAP_FULL (1<<1) /* no data writes (ENOSPC), deprecated since mimic */ +#define CEPH_OSDMAP_PAUSERD (1<<2) /* pause all reads */ +#define CEPH_OSDMAP_PAUSEWR (1<<3) /* pause all writes */ +#define CEPH_OSDMAP_PAUSEREC (1<<4) /* pause recovery */ +#define CEPH_OSDMAP_NOUP (1<<5) /* block osd boot */ +#define CEPH_OSDMAP_NODOWN (1<<6) /* block osd mark-down/failure */ +#define CEPH_OSDMAP_NOOUT (1<<7) /* block osd auto mark-out */ +#define CEPH_OSDMAP_NOIN (1<<8) /* block osd auto mark-in */ +#define CEPH_OSDMAP_NOBACKFILL (1<<9) /* block osd backfill */ +#define CEPH_OSDMAP_NORECOVER (1<<10) /* block osd recovery and backfill */ +#define CEPH_OSDMAP_NOSCRUB (1<<11) /* block periodic scrub */ +#define CEPH_OSDMAP_NODEEP_SCRUB (1<<12) /* block periodic deep-scrub */ +#define CEPH_OSDMAP_NOTIERAGENT (1<<13) /* disable tiering agent */ +#define CEPH_OSDMAP_NOREBALANCE (1<<14) /* block osd backfill unless pg is degraded */ +#define CEPH_OSDMAP_SORTBITWISE (1<<15) /* use bitwise hobject_t sort */ +#define CEPH_OSDMAP_REQUIRE_JEWEL (1<<16) /* require jewel for booting osds */ +#define CEPH_OSDMAP_REQUIRE_KRAKEN (1<<17) /* require kraken for booting osds */ +#define CEPH_OSDMAP_REQUIRE_LUMINOUS (1<<18) /* require l for booting osds */ +#define CEPH_OSDMAP_RECOVERY_DELETES (1<<19) /* deletes performed during recovery instead of peering */ +#define CEPH_OSDMAP_PURGED_SNAPDIRS (1<<20) /* osds have converted snapsets */ +#define CEPH_OSDMAP_NOSNAPTRIM (1<<21) /* disable snap trimming */ +#define CEPH_OSDMAP_PGLOG_HARDLIMIT (1<<22) /* put a hard limit on pg log length */ +#define 
CEPH_OSDMAP_NOAUTOSCALE (1<<23) /* block pg autoscale */ + +/* these are hidden in 'ceph status' view */ +#define CEPH_OSDMAP_SEMIHIDDEN_FLAGS (CEPH_OSDMAP_REQUIRE_JEWEL| \ + CEPH_OSDMAP_REQUIRE_KRAKEN | \ + CEPH_OSDMAP_REQUIRE_LUMINOUS | \ + CEPH_OSDMAP_RECOVERY_DELETES | \ + CEPH_OSDMAP_SORTBITWISE | \ + CEPH_OSDMAP_PURGED_SNAPDIRS | \ + CEPH_OSDMAP_PGLOG_HARDLIMIT) +#define CEPH_OSDMAP_LEGACY_REQUIRE_FLAGS (CEPH_OSDMAP_REQUIRE_JEWEL | \ + CEPH_OSDMAP_REQUIRE_KRAKEN | \ + CEPH_OSDMAP_REQUIRE_LUMINOUS) + +/* + * major ceph release numbers + */ +#define CEPH_RELEASE_ARGONAUT 1 +#define CEPH_RELEASE_BOBTAIL 2 +#define CEPH_RELEASE_CUTTLEFISH 3 +#define CEPH_RELEASE_DUMPLING 4 +#define CEPH_RELEASE_EMPEROR 5 +#define CEPH_RELEASE_FIREFLY 6 +#define CEPH_RELEASE_GIANT 7 +#define CEPH_RELEASE_HAMMER 8 +#define CEPH_RELEASE_INFERNALIS 9 +#define CEPH_RELEASE_JEWEL 10 +#define CEPH_RELEASE_KRAKEN 11 +#define CEPH_RELEASE_LUMINOUS 12 +#define CEPH_RELEASE_MIMIC 13 +#define CEPH_RELEASE_NAUTILUS 14 +#define CEPH_RELEASE_OCTOPUS 15 +#define CEPH_RELEASE_PACIFIC 16 +#define CEPH_RELEASE_QUINCY 17 +#define CEPH_RELEASE_REEF 18 +#define CEPH_RELEASE_MAX 19 /* highest + 1 */ + +/* + * The error code to return when an OSD can't handle a write + * because it is too large. + */ +#define OSD_WRITETOOBIG EMSGSIZE + +/* + * osd ops + * + * WARNING: do not use these op codes directly. Use the helpers + * defined below instead. In certain cases, op code behavior was + * redefined, resulting in special-cases in the helpers. 
+ */ +#define CEPH_OSD_OP_MODE 0xf000 +#define CEPH_OSD_OP_MODE_RD 0x1000 +#define CEPH_OSD_OP_MODE_WR 0x2000 +#define CEPH_OSD_OP_MODE_RMW 0x3000 +#define CEPH_OSD_OP_MODE_SUB 0x4000 +#define CEPH_OSD_OP_MODE_CACHE 0x8000 + +#define CEPH_OSD_OP_TYPE 0x0f00 +#define CEPH_OSD_OP_TYPE_DATA 0x0200 +#define CEPH_OSD_OP_TYPE_ATTR 0x0300 +#define CEPH_OSD_OP_TYPE_EXEC 0x0400 +#define CEPH_OSD_OP_TYPE_PG 0x0500 +// LEAVE UNUSED 0x0600 used to be multiobject ops + +#define __CEPH_OSD_OP1(mode, nr) \ + (CEPH_OSD_OP_MODE_##mode | (nr)) + +#define __CEPH_OSD_OP(mode, type, nr) \ + (CEPH_OSD_OP_MODE_##mode | CEPH_OSD_OP_TYPE_##type | (nr)) + +#define __CEPH_FORALL_OSD_OPS(f) \ + /** data **/ \ + /* read */ \ + f(READ, __CEPH_OSD_OP(RD, DATA, 1), "read") \ + f(STAT, __CEPH_OSD_OP(RD, DATA, 2), "stat") \ + f(MAPEXT, __CEPH_OSD_OP(RD, DATA, 3), "mapext") \ + f(CHECKSUM, __CEPH_OSD_OP(RD, DATA, 31), "checksum") \ + \ + /* fancy read */ \ + f(MASKTRUNC, __CEPH_OSD_OP(RD, DATA, 4), "masktrunc") \ + f(SPARSE_READ, __CEPH_OSD_OP(RD, DATA, 5), "sparse-read") \ + \ + f(NOTIFY, __CEPH_OSD_OP(RD, DATA, 6), "notify") \ + f(NOTIFY_ACK, __CEPH_OSD_OP(RD, DATA, 7), "notify-ack") \ + \ + /* versioning */ \ + f(ASSERT_VER, __CEPH_OSD_OP(RD, DATA, 8), "assert-version") \ + \ + f(LIST_WATCHERS, __CEPH_OSD_OP(RD, DATA, 9), "list-watchers") \ + \ + f(LIST_SNAPS, __CEPH_OSD_OP(RD, DATA, 10), "list-snaps") \ + \ + /* sync */ \ + f(SYNC_READ, __CEPH_OSD_OP(RD, DATA, 11), "sync_read") \ + \ + /* write */ \ + f(WRITE, __CEPH_OSD_OP(WR, DATA, 1), "write") \ + f(WRITEFULL, __CEPH_OSD_OP(WR, DATA, 2), "writefull") \ + f(TRUNCATE, __CEPH_OSD_OP(WR, DATA, 3), "truncate") \ + f(ZERO, __CEPH_OSD_OP(WR, DATA, 4), "zero") \ + f(DELETE, __CEPH_OSD_OP(WR, DATA, 5), "delete") \ + \ + /* fancy write */ \ + f(APPEND, __CEPH_OSD_OP(WR, DATA, 6), "append") \ + f(STARTSYNC, __CEPH_OSD_OP(WR, DATA, 7), "startsync") \ + f(SETTRUNC, __CEPH_OSD_OP(WR, DATA, 8), "settrunc") \ + f(TRIMTRUNC, __CEPH_OSD_OP(WR, DATA, 9), 
"trimtrunc") \ + \ + f(TMAPUP, __CEPH_OSD_OP(RMW, DATA, 10), "tmapup") \ + f(TMAPPUT, __CEPH_OSD_OP(WR, DATA, 11), "tmapput") \ + f(TMAPGET, __CEPH_OSD_OP(RD, DATA, 12), "tmapget") \ + \ + f(CREATE, __CEPH_OSD_OP(WR, DATA, 13), "create") \ + f(ROLLBACK, __CEPH_OSD_OP(WR, DATA, 14), "rollback") \ + \ + f(WATCH, __CEPH_OSD_OP(WR, DATA, 15), "watch") \ + \ + /* omap */ \ + f(OMAPGETKEYS, __CEPH_OSD_OP(RD, DATA, 17), "omap-get-keys") \ + f(OMAPGETVALS, __CEPH_OSD_OP(RD, DATA, 18), "omap-get-vals") \ + f(OMAPGETHEADER, __CEPH_OSD_OP(RD, DATA, 19), "omap-get-header") \ + f(OMAPGETVALSBYKEYS, __CEPH_OSD_OP(RD, DATA, 20), "omap-get-vals-by-keys") \ + f(OMAPSETVALS, __CEPH_OSD_OP(WR, DATA, 21), "omap-set-vals") \ + f(OMAPSETHEADER, __CEPH_OSD_OP(WR, DATA, 22), "omap-set-header") \ + f(OMAPCLEAR, __CEPH_OSD_OP(WR, DATA, 23), "omap-clear") \ + f(OMAPRMKEYS, __CEPH_OSD_OP(WR, DATA, 24), "omap-rm-keys") \ + f(OMAPRMKEYRANGE, __CEPH_OSD_OP(WR, DATA, 44), "omap-rm-key-range") \ + f(OMAP_CMP, __CEPH_OSD_OP(RD, DATA, 25), "omap-cmp") \ + \ + /* tiering */ \ + f(COPY_FROM, __CEPH_OSD_OP(WR, DATA, 26), "copy-from") \ + f(COPY_FROM2, __CEPH_OSD_OP(WR, DATA, 45), "copy-from2") \ + /* was copy-get-classic */ \ + f(UNDIRTY, __CEPH_OSD_OP(WR, DATA, 28), "undirty") \ + f(ISDIRTY, __CEPH_OSD_OP(RD, DATA, 29), "isdirty") \ + f(COPY_GET, __CEPH_OSD_OP(RD, DATA, 30), "copy-get") \ + f(CACHE_FLUSH, __CEPH_OSD_OP(CACHE, DATA, 31), "cache-flush") \ + f(CACHE_EVICT, __CEPH_OSD_OP(CACHE, DATA, 32), "cache-evict") \ + f(CACHE_TRY_FLUSH, __CEPH_OSD_OP(CACHE, DATA, 33), "cache-try-flush") \ + \ + /* convert tmap to omap */ \ + f(TMAP2OMAP, __CEPH_OSD_OP(RMW, DATA, 34), "tmap2omap") \ + \ + /* hints */ \ + f(SETALLOCHINT, __CEPH_OSD_OP(WR, DATA, 35), "set-alloc-hint") \ + \ + /* cache pin/unpin */ \ + f(CACHE_PIN, __CEPH_OSD_OP(WR, DATA, 36), "cache-pin") \ + f(CACHE_UNPIN, __CEPH_OSD_OP(WR, DATA, 37), "cache-unpin") \ + \ + /* ESX/SCSI */ \ + f(WRITESAME, __CEPH_OSD_OP(WR, DATA, 38), "write-same") \ + 
f(CMPEXT, __CEPH_OSD_OP(RD, DATA, 32), "cmpext") \ + \ + /* Extensible */ \ + f(SET_REDIRECT, __CEPH_OSD_OP(WR, DATA, 39), "set-redirect") \ + f(SET_CHUNK, __CEPH_OSD_OP(CACHE, DATA, 40), "set-chunk") \ + f(TIER_PROMOTE, __CEPH_OSD_OP(WR, DATA, 41), "tier-promote") \ + f(UNSET_MANIFEST, __CEPH_OSD_OP(WR, DATA, 42), "unset-manifest") \ + f(TIER_FLUSH, __CEPH_OSD_OP(CACHE, DATA, 43), "tier-flush") \ + f(TIER_EVICT, __CEPH_OSD_OP(CACHE, DATA, 44), "tier-evict") \ + \ + /** attrs **/ \ + /* read */ \ + f(GETXATTR, __CEPH_OSD_OP(RD, ATTR, 1), "getxattr") \ + f(GETXATTRS, __CEPH_OSD_OP(RD, ATTR, 2), "getxattrs") \ + f(CMPXATTR, __CEPH_OSD_OP(RD, ATTR, 3), "cmpxattr") \ + \ + /* write */ \ + f(SETXATTR, __CEPH_OSD_OP(WR, ATTR, 1), "setxattr") \ + f(SETXATTRS, __CEPH_OSD_OP(WR, ATTR, 2), "setxattrs") \ + f(RESETXATTRS, __CEPH_OSD_OP(WR, ATTR, 3), "resetxattrs") \ + f(RMXATTR, __CEPH_OSD_OP(WR, ATTR, 4), "rmxattr") \ + \ + /** subop **/ \ + f(PULL, __CEPH_OSD_OP1(SUB, 1), "pull") \ + f(PUSH, __CEPH_OSD_OP1(SUB, 2), "push") \ + f(BALANCEREADS, __CEPH_OSD_OP1(SUB, 3), "balance-reads") \ + f(UNBALANCEREADS, __CEPH_OSD_OP1(SUB, 4), "unbalance-reads") \ + f(SCRUB, __CEPH_OSD_OP1(SUB, 5), "scrub") \ + f(SCRUB_RESERVE, __CEPH_OSD_OP1(SUB, 6), "scrub-reserve") \ + f(SCRUB_UNRESERVE, __CEPH_OSD_OP1(SUB, 7), "scrub-unreserve") \ + /* 8 used to be scrub-stop */ \ + f(SCRUB_MAP, __CEPH_OSD_OP1(SUB, 9), "scrub-map") \ + \ + /** exec **/ \ + /* note: the RD bit here is wrong; see special-case below in helper */ \ + f(CALL, __CEPH_OSD_OP(RD, EXEC, 1), "call") \ + \ + /** pg **/ \ + f(PGLS, __CEPH_OSD_OP(RD, PG, 1), "pgls") \ + f(PGLS_FILTER, __CEPH_OSD_OP(RD, PG, 2), "pgls-filter") \ + f(PG_HITSET_LS, __CEPH_OSD_OP(RD, PG, 3), "pg-hitset-ls") \ + f(PG_HITSET_GET, __CEPH_OSD_OP(RD, PG, 4), "pg-hitset-get") \ + f(PGNLS, __CEPH_OSD_OP(RD, PG, 5), "pgnls") \ + f(PGNLS_FILTER, __CEPH_OSD_OP(RD, PG, 6), "pgnls-filter") \ + f(SCRUBLS, __CEPH_OSD_OP(RD, PG, 7), "scrubls") + +enum { +#define 
GENERATE_ENUM_ENTRY(op, opcode, str) CEPH_OSD_OP_##op = (opcode), +__CEPH_FORALL_OSD_OPS(GENERATE_ENUM_ENTRY) +#undef GENERATE_ENUM_ENTRY +}; + +static inline int ceph_osd_op_type_data(int op) +{ + return (op & CEPH_OSD_OP_TYPE) == CEPH_OSD_OP_TYPE_DATA; +} +static inline int ceph_osd_op_type_attr(int op) +{ + return (op & CEPH_OSD_OP_TYPE) == CEPH_OSD_OP_TYPE_ATTR; +} +static inline int ceph_osd_op_type_exec(int op) +{ + return (op & CEPH_OSD_OP_TYPE) == CEPH_OSD_OP_TYPE_EXEC; +} +static inline int ceph_osd_op_type_pg(int op) +{ + return (op & CEPH_OSD_OP_TYPE) == CEPH_OSD_OP_TYPE_PG; +} + +static inline int ceph_osd_op_mode_subop(int op) +{ + return (op & CEPH_OSD_OP_MODE) == CEPH_OSD_OP_MODE_SUB; +} +static inline int ceph_osd_op_mode_read(int op) +{ + return (op & CEPH_OSD_OP_MODE_RD) && + op != CEPH_OSD_OP_CALL; +} +static inline int ceph_osd_op_mode_modify(int op) +{ + return op & CEPH_OSD_OP_MODE_WR; +} +static inline int ceph_osd_op_mode_cache(int op) +{ + return op & CEPH_OSD_OP_MODE_CACHE; +} +static inline bool ceph_osd_op_uses_extent(int op) +{ + switch(op) { + case CEPH_OSD_OP_READ: + case CEPH_OSD_OP_MAPEXT: + case CEPH_OSD_OP_MASKTRUNC: + case CEPH_OSD_OP_SPARSE_READ: + case CEPH_OSD_OP_SYNC_READ: + case CEPH_OSD_OP_WRITE: + case CEPH_OSD_OP_WRITEFULL: + case CEPH_OSD_OP_TRUNCATE: + case CEPH_OSD_OP_ZERO: + case CEPH_OSD_OP_APPEND: + case CEPH_OSD_OP_TRIMTRUNC: + case CEPH_OSD_OP_CMPEXT: + return true; + default: + return false; + } +} + +/* + * note that the following tmap stuff is also defined in the ceph librados.h + * and objclass.h. Any modification here needs to be updated there + */ +#define CEPH_OSD_TMAP_HDR 'h' +#define CEPH_OSD_TMAP_SET 's' +#define CEPH_OSD_TMAP_CREATE 'c' /* create key */ +#define CEPH_OSD_TMAP_RM 'r' +#define CEPH_OSD_TMAP_RMSLOPPY 'R' + +extern const char *ceph_osd_op_name(int op); + +/* + * osd op flags + * + * An op may be READ, WRITE, or READ|WRITE. 
+ */ +enum { + CEPH_OSD_FLAG_ACK = 0x0001, /* want (or is) "ack" ack */ + CEPH_OSD_FLAG_ONNVRAM = 0x0002, /* want (or is) "onnvram" ack */ + CEPH_OSD_FLAG_ONDISK = 0x0004, /* want (or is) "ondisk" ack */ + CEPH_OSD_FLAG_RETRY = 0x0008, /* resend attempt */ + CEPH_OSD_FLAG_READ = 0x0010, /* op may read */ + CEPH_OSD_FLAG_WRITE = 0x0020, /* op may write */ + CEPH_OSD_FLAG_ORDERSNAP = 0x0040, /* EOLDSNAP if snapc is out of order */ + CEPH_OSD_FLAG_PEERSTAT_OLD = 0x0080, /* DEPRECATED msg includes osd_peer_stat */ + CEPH_OSD_FLAG_BALANCE_READS = 0x0100, + CEPH_OSD_FLAG_PARALLELEXEC = 0x0200, /* execute op in parallel */ + CEPH_OSD_FLAG_PGOP = 0x0400, /* pg op, no object */ + CEPH_OSD_FLAG_EXEC = 0x0800, /* op may exec */ + CEPH_OSD_FLAG_EXEC_PUBLIC = 0x1000, /* DEPRECATED op may exec (public) */ + CEPH_OSD_FLAG_LOCALIZE_READS = 0x2000, /* read from nearby replica, if any */ + CEPH_OSD_FLAG_RWORDERED = 0x4000, /* order wrt concurrent reads */ + CEPH_OSD_FLAG_IGNORE_CACHE = 0x8000, /* ignore cache logic */ + CEPH_OSD_FLAG_SKIPRWLOCKS = 0x10000, /* skip rw locks */ + CEPH_OSD_FLAG_IGNORE_OVERLAY =0x20000, /* ignore pool overlay */ + CEPH_OSD_FLAG_FLUSH = 0x40000, /* this is part of flush */ + CEPH_OSD_FLAG_MAP_SNAP_CLONE =0x80000, /* map snap direct to clone id + */ + CEPH_OSD_FLAG_ENFORCE_SNAPC =0x100000, /* use snapc provided even if + pool uses pool snaps */ + CEPH_OSD_FLAG_REDIRECTED = 0x200000, /* op has been redirected */ + CEPH_OSD_FLAG_KNOWN_REDIR = 0x400000, /* redirect bit is authoritative */ + CEPH_OSD_FLAG_FULL_TRY = 0x800000, /* try op despite full flag */ + CEPH_OSD_FLAG_FULL_FORCE = 0x1000000, /* force op despite full flag */ + CEPH_OSD_FLAG_IGNORE_REDIRECT = 0x2000000, /* ignore redirection */ + CEPH_OSD_FLAG_RETURNVEC = 0x4000000, /* allow overall result >= 0, and return >= 0 and buffer for each op in opvec */ + CEPH_OSD_FLAG_SUPPORTSPOOLEIO = 0x8000000, /* client understands pool EIO flag */ +}; + +enum { + CEPH_OSD_OP_FLAG_EXCL = 0x1, /* EXCL object 
create */ + CEPH_OSD_OP_FLAG_FAILOK = 0x2, /* continue despite failure */ + CEPH_OSD_OP_FLAG_FADVISE_RANDOM = 0x4, /* the op is random */ + CEPH_OSD_OP_FLAG_FADVISE_SEQUENTIAL = 0x8, /* the op is sequential */ + CEPH_OSD_OP_FLAG_FADVISE_WILLNEED = 0x10,/* data will be accessed in the near future */ + CEPH_OSD_OP_FLAG_FADVISE_DONTNEED = 0x20,/* data will not be accessed in the near future */ + CEPH_OSD_OP_FLAG_FADVISE_NOCACHE = 0x40, /* data will be accessed only once by this client */ + CEPH_OSD_OP_FLAG_WITH_REFERENCE = 0x80, /* need reference counting */ + CEPH_OSD_OP_FLAG_BYPASS_CLEAN_CACHE = 0x100, /* bypass ObjectStore cache, mainly for deep-scrub */ +}; + +#define EOLDSNAPC 85 /* ORDERSNAP flag set; writer has old snapc*/ +#define EBLOCKLISTED 108 /* blocklisted */ +#define EBLACKLISTED 108 /* deprecated */ + +/* xattr comparison */ +enum { + CEPH_OSD_CMPXATTR_OP_EQ = 1, + CEPH_OSD_CMPXATTR_OP_NE = 2, + CEPH_OSD_CMPXATTR_OP_GT = 3, + CEPH_OSD_CMPXATTR_OP_GTE = 4, + CEPH_OSD_CMPXATTR_OP_LT = 5, + CEPH_OSD_CMPXATTR_OP_LTE = 6 +}; + +enum { + CEPH_OSD_CMPXATTR_MODE_STRING = 1, + CEPH_OSD_CMPXATTR_MODE_U64 = 2 +}; + +enum { + CEPH_OSD_COPY_FROM_FLAG_FLUSH = 1, /* part of a flush operation */ + CEPH_OSD_COPY_FROM_FLAG_IGNORE_OVERLAY = 2, /* ignore pool overlay */ + CEPH_OSD_COPY_FROM_FLAG_IGNORE_CACHE = 4, /* ignore osd cache logic */ + CEPH_OSD_COPY_FROM_FLAG_MAP_SNAP_CLONE = 8, /* map snap direct to + * cloneid */ + CEPH_OSD_COPY_FROM_FLAG_RWORDERED = 16, /* order with write */ + CEPH_OSD_COPY_FROM_FLAG_TRUNCATE_SEQ = 32, /* use provided truncate_{seq,size} (copy-from2 only) */ +}; + +#define CEPH_OSD_COPY_FROM_FLAGS \ + (CEPH_OSD_COPY_FROM_FLAG_FLUSH | \ + CEPH_OSD_COPY_FROM_FLAG_IGNORE_OVERLAY | \ + CEPH_OSD_COPY_FROM_FLAG_IGNORE_CACHE | \ + CEPH_OSD_COPY_FROM_FLAG_MAP_SNAP_CLONE | \ + CEPH_OSD_COPY_FROM_FLAG_RWORDERED | \ + CEPH_OSD_COPY_FROM_FLAG_TRUNCATE_SEQ) + +enum { + CEPH_OSD_TMAP2OMAP_NULLOK = 1, +}; + +enum { + CEPH_OSD_WATCH_OP_UNWATCH = 0, + 
CEPH_OSD_WATCH_OP_LEGACY_WATCH = 1, + /* note: use only ODD ids to prevent pre-giant code from + interpreting the op as UNWATCH */ + CEPH_OSD_WATCH_OP_WATCH = 3, + CEPH_OSD_WATCH_OP_RECONNECT = 5, + CEPH_OSD_WATCH_OP_PING = 7, +}; + +enum { + CEPH_OSD_CHECKSUM_OP_TYPE_XXHASH32 = 0, + CEPH_OSD_CHECKSUM_OP_TYPE_XXHASH64 = 1, + CEPH_OSD_CHECKSUM_OP_TYPE_CRC32C = 2 +}; + +const char *ceph_osd_watch_op_name(int o); + +enum { + CEPH_OSD_ALLOC_HINT_FLAG_SEQUENTIAL_WRITE = 1, + CEPH_OSD_ALLOC_HINT_FLAG_RANDOM_WRITE = 2, + CEPH_OSD_ALLOC_HINT_FLAG_SEQUENTIAL_READ = 4, + CEPH_OSD_ALLOC_HINT_FLAG_RANDOM_READ = 8, + CEPH_OSD_ALLOC_HINT_FLAG_APPEND_ONLY = 16, + CEPH_OSD_ALLOC_HINT_FLAG_IMMUTABLE = 32, + CEPH_OSD_ALLOC_HINT_FLAG_SHORTLIVED = 64, + CEPH_OSD_ALLOC_HINT_FLAG_LONGLIVED = 128, + CEPH_OSD_ALLOC_HINT_FLAG_COMPRESSIBLE = 256, + CEPH_OSD_ALLOC_HINT_FLAG_INCOMPRESSIBLE = 512, +}; + +const char *ceph_osd_alloc_hint_flag_name(int f); + +enum { + CEPH_OSD_BACKOFF_OP_BLOCK = 1, + CEPH_OSD_BACKOFF_OP_ACK_BLOCK = 2, + CEPH_OSD_BACKOFF_OP_UNBLOCK = 3, +}; + +const char *ceph_osd_backoff_op_name(int op); + +/* + * an individual object operation. 
each may be accompanied by some data + * payload + */ +struct ceph_osd_op { + __le16 op; /* CEPH_OSD_OP_* */ + __le32 flags; /* CEPH_OSD_OP_FLAG_* */ + union { + struct { + __le64 offset, length; + __le64 truncate_size; + __le32 truncate_seq; + } __attribute__ ((packed)) extent; + struct { + __le32 name_len; + __le32 value_len; + __u8 cmp_op; /* CEPH_OSD_CMPXATTR_OP_* */ + __u8 cmp_mode; /* CEPH_OSD_CMPXATTR_MODE_* */ + } __attribute__ ((packed)) xattr; + struct { + __u8 class_len; + __u8 method_len; + __u8 argc; + __le32 indata_len; + } __attribute__ ((packed)) cls; + struct { + __le64 count; + __le32 start_epoch; /* for the pgls sequence */ + } __attribute__ ((packed)) pgls; + struct { + __le64 snapid; + } __attribute__ ((packed)) snap; + struct { + __le64 cookie; + __le64 ver; /* no longer used */ + __u8 op; /* CEPH_OSD_WATCH_OP_* */ + __u32 gen; /* registration generation */ + __u32 timeout; /* connection timeout */ + } __attribute__ ((packed)) watch; + struct { + __le64 cookie; + } __attribute__ ((packed)) notify; + struct { + __le64 unused; + __le64 ver; + } __attribute__ ((packed)) assert_ver; + struct { + __le64 offset, length; + __le64 src_offset; + } __attribute__ ((packed)) clonerange; + struct { + __le64 max; /* max data in reply */ + } __attribute__ ((packed)) copy_get; + struct { + __le64 snapid; + __le64 src_version; + __u8 flags; /* CEPH_OSD_COPY_FROM_FLAG_* */ + /* + * CEPH_OSD_OP_FLAG_FADVISE_*: fadvise flags + * for src object, flags for dest object are in + * ceph_osd_op::flags. 
+ */ + __le32 src_fadvise_flags; + } __attribute__ ((packed)) copy_from; + struct { + struct ceph_timespec stamp; + } __attribute__ ((packed)) hit_set_get; + struct { + __u8 flags; + } __attribute__ ((packed)) tmap2omap; + struct { + __le64 expected_object_size; + __le64 expected_write_size; + __le32 flags; /* CEPH_OSD_OP_ALLOC_HINT_FLAG_* */ + } __attribute__ ((packed)) alloc_hint; + struct { + __le64 offset; + __le64 length; + __le64 data_length; + } __attribute__ ((packed)) writesame; + struct { + __le64 offset; + __le64 length; + __le32 chunk_size; + __u8 type; /* CEPH_OSD_CHECKSUM_OP_TYPE_* */ + } __attribute__ ((packed)) checksum; + } __attribute__ ((packed)); + __le32 payload_len; +} __attribute__ ((packed)); + +/* + * Check the compatibility of struct ceph_osd_op + * (2+4+(2*8+8+4)+4) = (sizeof(ceph_osd_op::op) + + * sizeof(ceph_osd_op::flags) + + * sizeof(ceph_osd_op::extent) + + * sizeof(ceph_osd_op::payload_len)) + */ +#ifdef __cplusplus +static_assert(sizeof(ceph_osd_op) == (2+4+(2*8+8+4)+4), + "sizeof(ceph_osd_op) breaks the compatibility"); +#endif + +struct ceph_osd_reply_head { + __le32 client_inc; /* client incarnation */ + __le32 flags; + struct ceph_object_layout layout; + __le32 osdmap_epoch; + struct ceph_eversion reassert_version; /* for replaying uncommitted */ + + __le32 result; /* result code */ + + __le32 object_len; /* length of object name */ + __le32 num_ops; + struct ceph_osd_op ops[0]; /* ops[], object */ +} __attribute__ ((packed)); + +#ifndef __KERNEL__ +#undef __le16 +#undef __le32 +#undef __le64 +#endif + +#endif diff --git a/src/include/rados/buffer.h b/src/include/rados/buffer.h new file mode 120000 index 000000000..51fc03be1 --- /dev/null +++ b/src/include/rados/buffer.h @@ -0,0 +1 @@ +../buffer.h
\ No newline at end of file diff --git a/src/include/rados/buffer_fwd.h b/src/include/rados/buffer_fwd.h new file mode 120000 index 000000000..bd1f6f1b0 --- /dev/null +++ b/src/include/rados/buffer_fwd.h @@ -0,0 +1 @@ +../buffer_fwd.h
\ No newline at end of file diff --git a/src/include/rados/crc32c.h b/src/include/rados/crc32c.h new file mode 120000 index 000000000..19ef4317e --- /dev/null +++ b/src/include/rados/crc32c.h @@ -0,0 +1 @@ +../crc32c.h
\ No newline at end of file diff --git a/src/include/rados/inline_memory.h b/src/include/rados/inline_memory.h new file mode 120000 index 000000000..48f0d4436 --- /dev/null +++ b/src/include/rados/inline_memory.h @@ -0,0 +1 @@ +../inline_memory.h
\ No newline at end of file diff --git a/src/include/rados/librados.h b/src/include/rados/librados.h new file mode 100644 index 000000000..858804c3a --- /dev/null +++ b/src/include/rados/librados.h @@ -0,0 +1,4156 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2004-2012 Sage Weil <sage@newdream.net> + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. + * + */ + +#ifndef CEPH_LIBRADOS_H +#define CEPH_LIBRADOS_H + +#ifdef __cplusplus +extern "C" { +#endif + +#include <netinet/in.h> +#if defined(__linux__) +#include <linux/types.h> +#elif defined(__FreeBSD__) +#include <sys/types.h> +#endif +#include <unistd.h> +#include <string.h> +#include "rados_types.h" + +#include <sys/time.h> + +#ifndef CEPH_OSD_TMAP_SET +/* These are also defined in rados.h and objclass.h. Keep them in sync! */ +#define CEPH_OSD_TMAP_HDR 'h' +#define CEPH_OSD_TMAP_SET 's' +#define CEPH_OSD_TMAP_CREATE 'c' +#define CEPH_OSD_TMAP_RM 'r' +#endif + +#define LIBRADOS_VER_MAJOR 3 +#define LIBRADOS_VER_MINOR 0 +#define LIBRADOS_VER_EXTRA 0 + +#define LIBRADOS_VERSION(maj, min, extra) ((maj << 16) + (min << 8) + extra) + +#define LIBRADOS_VERSION_CODE LIBRADOS_VERSION(LIBRADOS_VER_MAJOR, LIBRADOS_VER_MINOR, LIBRADOS_VER_EXTRA) + +#define LIBRADOS_SUPPORTS_WATCH 1 +#define LIBRADOS_SUPPORTS_SERVICES 1 +#define LIBRADOS_SUPPORTS_GETADDRS 1 +#define LIBRADOS_SUPPORTS_APP_METADATA 1 + +/* RADOS lock flags + * They are also defined in cls_lock_types.h. Keep them in sync! + */ +#define LIBRADOS_LOCK_FLAG_RENEW (1u<<0) +#define LIBRADOS_LOCK_FLAG_MAY_RENEW LIBRADOS_LOCK_FLAG_RENEW +#define LIBRADOS_LOCK_FLAG_MUST_RENEW (1u<<1) + +/* + * Constants for rados_write_op_create(). 
+ */ +#define LIBRADOS_CREATE_EXCLUSIVE 1 +#define LIBRADOS_CREATE_IDEMPOTENT 0 + +/* + * Flags that can be set on a per-op basis via + * rados_read_op_set_flags() and rados_write_op_set_flags(). + */ +enum { + // fail a create operation if the object already exists + LIBRADOS_OP_FLAG_EXCL = 0x1, + // allow the transaction to succeed even if the flagged op fails + LIBRADOS_OP_FLAG_FAILOK = 0x2, + // indicate read/write op random + LIBRADOS_OP_FLAG_FADVISE_RANDOM = 0x4, + // indicate read/write op sequential + LIBRADOS_OP_FLAG_FADVISE_SEQUENTIAL = 0x8, + // indicate read/write data will be accessed in the near future (by someone) + LIBRADOS_OP_FLAG_FADVISE_WILLNEED = 0x10, + // indicate read/write data will not be accessed in the near future (by anyone) + LIBRADOS_OP_FLAG_FADVISE_DONTNEED = 0x20, + // indicate read/write data will not be accessed again (by *this* client) + LIBRADOS_OP_FLAG_FADVISE_NOCACHE = 0x40, + // optionally support FUA (force unit access) on write requests + LIBRADOS_OP_FLAG_FADVISE_FUA = 0x80, +}; + +#define CEPH_RADOS_API + +/** + * @name xattr comparison operations + * Operators for comparing xattrs on objects, and aborting the + * rados_read_op or rados_write_op transaction if the comparison + * fails. + * + * @{ + */ +enum { + LIBRADOS_CMPXATTR_OP_EQ = 1, + LIBRADOS_CMPXATTR_OP_NE = 2, + LIBRADOS_CMPXATTR_OP_GT = 3, + LIBRADOS_CMPXATTR_OP_GTE = 4, + LIBRADOS_CMPXATTR_OP_LT = 5, + LIBRADOS_CMPXATTR_OP_LTE = 6 +}; +/** @} */ + +/** + * @name Operation Flags + * Flags for rados_read_op_operate(), rados_write_op_operate(), + * rados_aio_read_op_operate(), and rados_aio_write_op_operate(). + * See librados.hpp for details. 
+ * @{ + */ +enum { + LIBRADOS_OPERATION_NOFLAG = 0, + LIBRADOS_OPERATION_BALANCE_READS = 1, + LIBRADOS_OPERATION_LOCALIZE_READS = 2, + LIBRADOS_OPERATION_ORDER_READS_WRITES = 4, + LIBRADOS_OPERATION_IGNORE_CACHE = 8, + LIBRADOS_OPERATION_SKIPRWLOCKS = 16, + LIBRADOS_OPERATION_IGNORE_OVERLAY = 32, + /* send requests to cluster despite the cluster or pool being marked + full; ops will either succeed (e.g., delete) or return EDQUOT or + ENOSPC. */ + LIBRADOS_OPERATION_FULL_TRY = 64, + /* + * Mainly for delete op + */ + LIBRADOS_OPERATION_FULL_FORCE = 128, + LIBRADOS_OPERATION_IGNORE_REDIRECT = 256, + LIBRADOS_OPERATION_ORDERSNAP = 512, + /* enable/allow >0 return values and payloads on write/update */ + LIBRADOS_OPERATION_RETURNVEC = 1024, +}; +/** @} */ + +/** + * @name Alloc hint flags + * Flags for rados_write_op_alloc_hint2() and rados_set_alloc_hint2() + * indicating future IO patterns. + * @{ + */ +enum { + LIBRADOS_ALLOC_HINT_FLAG_SEQUENTIAL_WRITE = 1, + LIBRADOS_ALLOC_HINT_FLAG_RANDOM_WRITE = 2, + LIBRADOS_ALLOC_HINT_FLAG_SEQUENTIAL_READ = 4, + LIBRADOS_ALLOC_HINT_FLAG_RANDOM_READ = 8, + LIBRADOS_ALLOC_HINT_FLAG_APPEND_ONLY = 16, + LIBRADOS_ALLOC_HINT_FLAG_IMMUTABLE = 32, + LIBRADOS_ALLOC_HINT_FLAG_SHORTLIVED = 64, + LIBRADOS_ALLOC_HINT_FLAG_LONGLIVED = 128, + LIBRADOS_ALLOC_HINT_FLAG_COMPRESSIBLE = 256, + LIBRADOS_ALLOC_HINT_FLAG_INCOMPRESSIBLE = 512, +}; +/** @} */ + +typedef enum { + LIBRADOS_CHECKSUM_TYPE_XXHASH32 = 0, + LIBRADOS_CHECKSUM_TYPE_XXHASH64 = 1, + LIBRADOS_CHECKSUM_TYPE_CRC32C = 2 +} rados_checksum_type_t; + +/* + * snap id constants + */ +#define LIBRADOS_SNAP_HEAD UINT64_C(-2) +#define LIBRADOS_SNAP_DIR UINT64_C(-1) + +/** + * @typedef rados_t + * + * A handle for interacting with a RADOS cluster. It encapsulates all + * RADOS client configuration, including username, key for + * authentication, logging, and debugging. Talking to different clusters + * -- or to the same cluster with different users -- requires + * different cluster handles. 
+ */ +#ifndef VOIDPTR_RADOS_T +#define VOIDPTR_RADOS_T +typedef void *rados_t; +#endif //VOIDPTR_RADOS_T + +/** + * @typedef rados_config_t + * + * A handle for the ceph configuration context for the rados_t cluster + * instance. This can be used to share configuration context/state + * (e.g., logging configuration) between librados instances. + * + * @warning The config context does not have independent reference + * counting. As such, a rados_config_t handle retrieved from a given + * rados_t is only valid as long as that rados_t. + */ +typedef void *rados_config_t; + +/** + * @typedef rados_ioctx_t + * + * An io context encapsulates a few settings for all I/O operations + * done on it: + * - pool - set when the io context is created (see rados_ioctx_create()) + * - snapshot context for writes (see + * rados_ioctx_selfmanaged_snap_set_write_ctx()) + * - snapshot id to read from (see rados_ioctx_snap_set_read()) + * - object locator for all single-object operations (see + * rados_ioctx_locator_set_key()) + * - namespace for all single-object operations (see + * rados_ioctx_set_namespace()). Set to LIBRADOS_ALL_NSPACES + * before rados_nobjects_list_open() will list all objects in all + * namespaces. + * + * @warning Changing any of these settings is not thread-safe - + * librados users must synchronize any of these changes on their own, + * or use separate io contexts for each thread + */ +typedef void *rados_ioctx_t; + +/** + * @typedef rados_list_ctx_t + * + * An iterator for listing the objects in a pool. + * Used with rados_nobjects_list_open(), + * rados_nobjects_list_next(), rados_nobjects_list_next2(), and + * rados_nobjects_list_close(). + */ +typedef void *rados_list_ctx_t; + +/** + * @typedef rados_object_list_cursor + * + * The cursor used with rados_enumerate_objects + * and accompanying methods. 
+ */ +typedef void * rados_object_list_cursor; + +/** + * @struct rados_object_list_item + * + * The item populated by rados_object_list in + * the results array. + */ +typedef struct { + + /// oid length + size_t oid_length; + /// name of the object + char *oid; + /// namespace length + size_t nspace_length; + /// the object namespace + char *nspace; + /// locator length + size_t locator_length; + /// object locator + char *locator; +} rados_object_list_item; + +/** + * @typedef rados_snap_t + * The id of a snapshot. + */ +typedef uint64_t rados_snap_t; + +/** + * @typedef rados_xattrs_iter_t + * An iterator for listing extended attributes on an object. + * Used with rados_getxattrs(), rados_getxattrs_next(), and + * rados_getxattrs_end(). + */ +typedef void *rados_xattrs_iter_t; + +/** + * @typedef rados_omap_iter_t + * An iterator for listing omap key/value pairs on an object. + * Used with rados_read_op_omap_get_keys(), rados_read_op_omap_get_vals(), + * rados_read_op_omap_get_vals_by_keys(), rados_omap_get_next(), and + * rados_omap_get_end(). + */ +typedef void *rados_omap_iter_t; + +/** + * @struct rados_pool_stat_t + * Usage information for a pool. 
+ */ +struct rados_pool_stat_t { + /// space used in bytes + uint64_t num_bytes; + /// space used in KB + uint64_t num_kb; + /// number of objects in the pool + uint64_t num_objects; + /// number of clones of objects + uint64_t num_object_clones; + /// num_objects * num_replicas + uint64_t num_object_copies; + /// number of objects missing on primary + uint64_t num_objects_missing_on_primary; + /// number of objects found on no OSDs + uint64_t num_objects_unfound; + /// number of objects replicated fewer times than they should be + /// (but found on at least one OSD) + uint64_t num_objects_degraded; + /// number of objects read + uint64_t num_rd; + /// objects read in KB + uint64_t num_rd_kb; + /// number of objects written + uint64_t num_wr; + /// objects written in KB + uint64_t num_wr_kb; + /// bytes originally provided by user + uint64_t num_user_bytes; + /// bytes passed compression + uint64_t compressed_bytes_orig; + /// bytes resulted after compression + uint64_t compressed_bytes; + /// bytes allocated at storage + uint64_t compressed_bytes_alloc; +}; + +/** + * @struct rados_cluster_stat_t + * Cluster-wide usage information + */ +struct rados_cluster_stat_t { + /// total device size + uint64_t kb; + /// total used + uint64_t kb_used; + /// total available/free + uint64_t kb_avail; + /// number of objects + uint64_t num_objects; +}; + +/** + * @typedef rados_write_op_t + * + * An object write operation stores a number of operations which can be + * executed atomically. 
For usage, see: + * - Creation and deletion: rados_create_write_op(), rados_release_write_op() + * - Extended attribute manipulation: rados_write_op_cmpxattr(), + * rados_write_op_setxattr(), + * rados_write_op_rmxattr() + * - Object map key/value pairs: rados_write_op_omap_set(), + * rados_write_op_omap_rm_keys(), rados_write_op_omap_clear(), + * rados_write_op_omap_cmp() + * - Object properties: rados_write_op_assert_exists(), + * rados_write_op_assert_version() + * - Creating objects: rados_write_op_create() + * - IO on objects: rados_write_op_append(), rados_write_op_write(), + * rados_write_op_write_full(), rados_write_op_writesame(), rados_write_op_remove(), + * rados_write_op_truncate(), rados_write_op_zero(), rados_write_op_cmpext() + * - Hints: rados_write_op_set_alloc_hint() + * - Performing the operation: rados_write_op_operate(), rados_aio_write_op_operate() + */ +typedef void *rados_write_op_t; + +/** + * @typedef rados_read_op_t + * + * An object read operation stores a number of operations which can be + * executed atomically. 
For usage, see: + * - Creation and deletion: rados_create_read_op(), rados_release_read_op() + * - Extended attribute manipulation: rados_read_op_cmpxattr(), + * rados_read_op_getxattr(), rados_read_op_getxattrs() + * - Object map key/value pairs: rados_read_op_omap_get_vals(), + * rados_read_op_omap_get_keys(), rados_read_op_omap_get_vals_by_keys(), + * rados_read_op_omap_cmp() + * - Object properties: rados_read_op_stat(), rados_read_op_assert_exists(), + * rados_read_op_assert_version() + * - IO on objects: rados_read_op_read(), rados_read_op_checksum(), + * rados_read_op_cmpext() + * - Custom operations: rados_read_op_exec(), rados_read_op_exec_user_buf() + * - Request properties: rados_read_op_set_flags() + * - Performing the operation: rados_read_op_operate(), + * rados_aio_read_op_operate() + */ +typedef void *rados_read_op_t; + +/** + * @typedef rados_completion_t + * Represents the state of an asynchronous operation - it contains the + * return value once the operation completes, and can be used to block + * until the operation is complete or safe. + */ +typedef void *rados_completion_t; + +/** + * @struct blkin_trace_info + * blkin trace information for Zipkin tracing + */ +struct blkin_trace_info; + +/** + * Get the version of librados. + * + * The version number is major.minor.extra. Note that this is + * unrelated to the Ceph version number. + * + * TODO: define version semantics, i.e.: + * - incrementing major is for backwards-incompatible changes + * - incrementing minor is for backwards-compatible changes + * - incrementing extra is for bug fixes + * + * @param major where to store the major version number + * @param minor where to store the minor version number + * @param extra where to store the extra version number + */ +CEPH_RADOS_API void rados_version(int *major, int *minor, int *extra); + +/** + * @name Setup and Teardown + * These are the first and last functions that should be called + * when using librados. 
+ * + * @{ + */ + +/** + * Create a handle for communicating with a RADOS cluster. + * + * Ceph environment variables are read when this is called, so if + * $CEPH_ARGS specifies everything you need to connect, no further + * configuration is necessary. + * + * @param cluster where to store the handle + * @param id the user to connect as (i.e. admin, not client.admin) + * @returns 0 on success, negative error code on failure + */ +CEPH_RADOS_API int rados_create(rados_t *cluster, const char * const id); + +/** + * Extended version of rados_create. + * + * Like rados_create, but + * 1) don't assume 'client\.'+id; allow full specification of name + * 2) allow specification of cluster name + * 3) flags for future expansion + */ +CEPH_RADOS_API int rados_create2(rados_t *pcluster, + const char *const clustername, + const char * const name, uint64_t flags); + +/** + * Initialize a cluster handle from an existing configuration. + * + * Share configuration state with another rados_t instance. + * + * @param cluster where to store the handle + * @param cct the existing configuration to use + * @returns 0 on success, negative error code on failure + */ +CEPH_RADOS_API int rados_create_with_context(rados_t *cluster, + rados_config_t cct); + +/** + * Ping the monitor with ID mon_id, storing the resulting reply in + * buf (if specified) with a maximum size of len. + * + * The result buffer is allocated on the heap; the caller is + * expected to release that memory with rados_buffer_free(). The + * buffer and length pointers can be NULL, in which case they are + * not filled in. + * + * @param cluster cluster handle + * @param mon_id [in] ID of the monitor to ping + * @param outstr [out] double pointer with the resulting reply + * @param outstrlen [out] pointer with the size of the reply in outstr + */ +CEPH_RADOS_API int rados_ping_monitor(rados_t cluster, const char *mon_id, + char **outstr, size_t *outstrlen); + +/** + * Connect to the cluster. 
+ * + * @note BUG: Before calling this, calling a function that communicates with the + * cluster will crash. + * + * @pre The cluster handle is configured with at least a monitor + * address. If cephx is enabled, a client name and secret must also be + * set. + * + * @post If this succeeds, any function in librados may be used + * + * @param cluster The cluster to connect to. + * @returns 0 on success, negative error code on failure + */ +CEPH_RADOS_API int rados_connect(rados_t cluster); + +/** + * Disconnects from the cluster. + * + * For clean up, this is only necessary after rados_connect() has + * succeeded. + * + * @warning This does not guarantee any asynchronous writes have + * completed. To do that, you must call rados_aio_flush() on all open + * io contexts. + * + * @warning We implicitly call rados_watch_flush() on shutdown. If + * there are watches being used, this should be done explicitly before + * destroying the relevant IoCtx. We do it here as a safety measure. + * + * @post the cluster handle cannot be used again + * + * @param cluster the cluster to shut down + */ +CEPH_RADOS_API void rados_shutdown(rados_t cluster); + +/** @} init */ + +/** + * @name Configuration + * These functions read and update Ceph configuration for a cluster + * handle. Any configuration changes must be done before connecting to + * the cluster. + * + * Options that librados users might want to set include: + * - mon_host + * - auth_supported + * - key, keyfile, or keyring when using cephx + * - log_file, log_to_stderr, err_to_stderr, and log_to_syslog + * - debug_rados, debug_objecter, debug_monc, debug_auth, or debug_ms + * + * See docs.ceph.com for information about available configuration options. + * + * @{ + */ + +/** + * Configure the cluster handle using a Ceph config file + * + * If path is NULL, the default locations are searched, and the first + * found is used. 
The locations are: + * - $CEPH_CONF (environment variable) + * - /etc/ceph/ceph.conf + * - ~/.ceph/config + * - ceph.conf (in the current working directory) + * + * @pre rados_connect() has not been called on the cluster handle + * + * @param cluster cluster handle to configure + * @param path path to a Ceph configuration file + * @returns 0 on success, negative error code on failure + */ +CEPH_RADOS_API int rados_conf_read_file(rados_t cluster, const char *path); + +/** + * Configure the cluster handle with command line arguments + * + * argv can contain any common Ceph command line option, including any + * configuration parameter prefixed by '--' and replacing spaces with + * dashes or underscores. For example, the following options are equivalent: + * - --mon-host 10.0.0.1:6789 + * - --mon_host 10.0.0.1:6789 + * - -m 10.0.0.1:6789 + * + * @pre rados_connect() has not been called on the cluster handle + * + * @param cluster cluster handle to configure + * @param argc number of arguments in argv + * @param argv arguments to parse + * @returns 0 on success, negative error code on failure + */ +CEPH_RADOS_API int rados_conf_parse_argv(rados_t cluster, int argc, + const char **argv); + + +/** + * Configure the cluster handle with command line arguments, returning + * any remainders. Same as rados_conf_parse_argv(), except for an extra + * remargv argument that holds the returned unrecognized arguments. 
+ * + * @pre rados_connect() has not been called on the cluster handle + * + * @param cluster cluster handle to configure + * @param argc number of arguments in argv + * @param argv arguments to parse + * @param remargv char* array for returned unrecognized arguments + * @returns 0 on success, negative error code on failure + */ +CEPH_RADOS_API int rados_conf_parse_argv_remainder(rados_t cluster, int argc, + const char **argv, + const char **remargv); +/** + * Configure the cluster handle based on an environment variable + * + * The contents of the environment variable are parsed as if they were + * Ceph command line options. If var is NULL, the CEPH_ARGS + * environment variable is used. + * + * @pre rados_connect() has not been called on the cluster handle + * + * @note BUG: this is not threadsafe - it uses a static buffer + * + * @param cluster cluster handle to configure + * @param var name of the environment variable to read + * @returns 0 on success, negative error code on failure + */ +CEPH_RADOS_API int rados_conf_parse_env(rados_t cluster, const char *var); + +/** + * Set a configuration option + * + * @pre rados_connect() has not been called on the cluster handle + * + * @param cluster cluster handle to configure + * @param option option to set + * @param value value of the option + * @returns 0 on success, negative error code on failure + * @returns -ENOENT when the option is not a Ceph configuration option + */ +CEPH_RADOS_API int rados_conf_set(rados_t cluster, const char *option, + const char *value); + +/** + * Get the value of a configuration option + * + * @param cluster configuration to read + * @param option which option to read + * @param buf where to write the configuration value + * @param len the size of buf in bytes + * @returns 0 on success, negative error code on failure + * @returns -ENAMETOOLONG if the buffer is too short to contain the + * requested value + */ +CEPH_RADOS_API int rados_conf_get(rados_t cluster, const char *option, + 
char *buf, size_t len); + +/** @} config */ + +/** + * Read usage info about the cluster + * + * This tells you total space, space used, space available, and number + * of objects. These are not updated immediately when data is written, + * they are eventually consistent. + * + * @param cluster cluster to query + * @param result where to store the results + * @returns 0 on success, negative error code on failure + */ +CEPH_RADOS_API int rados_cluster_stat(rados_t cluster, + struct rados_cluster_stat_t *result); + +/** + * Get the fsid of the cluster as a hexadecimal string. + * + * The fsid is a unique id of an entire Ceph cluster. + * + * @param cluster where to get the fsid + * @param buf where to write the fsid + * @param len the size of buf in bytes (should be 37) + * @returns 0 on success, negative error code on failure + * @returns -ERANGE if the buffer is too short to contain the + * fsid + */ +CEPH_RADOS_API int rados_cluster_fsid(rados_t cluster, char *buf, size_t len); + +/** + * Get/wait for the most recent osdmap + * + * @param cluster cluster handle + * @returns 0 on success, negative error code on failure + */ +CEPH_RADOS_API int rados_wait_for_latest_osdmap(rados_t cluster); + +/** + * @name Pools + * + * RADOS pools are separate namespaces for objects. Pools may have + * different crush rules associated with them, so they could have + * differing replication levels or placement strategies. RADOS + * permissions are also tied to pools - users can have different read, + * write, and execute permissions on a per-pool basis. + * + * @{ + */ + +/** + * List pools + * + * Gets a list of pool names as NULL-terminated strings. The pool + * names will be placed in the supplied buffer one after another. + * After the last pool name, there will be two 0 bytes in a row. + * + * If len is too short to fit all the pool name entries we need, we will fill + * as much as we can. + * + * Buf may be null to determine the buffer size needed to list all pools. 
+ * + * @param cluster cluster handle + * @param buf output buffer + * @param len output buffer length + * @returns length of the buffer we would need to list all pools + */ +CEPH_RADOS_API int rados_pool_list(rados_t cluster, char *buf, size_t len); + +/** + * List inconsistent placement groups of the given pool + * + * Gets a list of inconsistent placement groups as NULL-terminated strings. + * The placement group names will be placed in the supplied buffer one after + * another. After the last name, there will be two 0 bytes in a row. + * + * If len is too short to fit all the placement group entries we need, we will + * fill as much as we can. + * + * @param cluster cluster handle + * @param pool pool ID + * @param buf output buffer + * @param len output buffer length + * @returns length of the buffer we would need to list all inconsistent + * placement groups + */ +CEPH_RADOS_API int rados_inconsistent_pg_list(rados_t cluster, int64_t pool, + char *buf, size_t len); + +/** + * Get a configuration handle for a rados cluster handle + * + * This handle is valid only as long as the cluster handle is valid. 
+ * + * @param cluster cluster handle + * @returns config handle for this cluster + */ +CEPH_RADOS_API rados_config_t rados_cct(rados_t cluster); + +/** + * Get a global id for the current instance + * + * This id is a unique representation of the current connection to the cluster + * + * @param cluster cluster handle + * @returns instance global id + */ +CEPH_RADOS_API uint64_t rados_get_instance_id(rados_t cluster); + +/** + * Gets the minimum compatible OSD version + * + * @param cluster cluster handle + * @param require_osd_release [out] minimum compatible OSD version + * based upon the current features + * @returns 0 on success, negative error code on failure + */ +CEPH_RADOS_API int rados_get_min_compatible_osd(rados_t cluster, + int8_t* require_osd_release); + +/** + * Gets the minimum compatible client version + * + * @param cluster cluster handle + * @param min_compat_client [out] minimum compatible client version + * based upon the current features + * @param require_min_compat_client [out] required minimum client version + * based upon explicit setting + * @returns 0 on success, negative error code on failure + */ +CEPH_RADOS_API int rados_get_min_compatible_client(rados_t cluster, + int8_t* min_compat_client, + int8_t* require_min_compat_client); + +/** + * Create an io context + * + * The io context allows you to perform operations within a particular + * pool. For more details see rados_ioctx_t. + * + * @param cluster which cluster the pool is in + * @param pool_name name of the pool + * @param ioctx where to store the io context + * @returns 0 on success, negative error code on failure + */ +CEPH_RADOS_API int rados_ioctx_create(rados_t cluster, const char *pool_name, + rados_ioctx_t *ioctx); +CEPH_RADOS_API int rados_ioctx_create2(rados_t cluster, int64_t pool_id, + rados_ioctx_t *ioctx); + +/** + * The opposite of rados_ioctx_create + * + * This just tells librados that you no longer need to use the io context. 
+ * It may not be freed immediately if there are pending asynchronous + * requests on it, but you should not use an io context again after + * calling this function on it. + * + * @warning This does not guarantee any asynchronous + * writes have completed. You must call rados_aio_flush() + * on the io context before destroying it to do that. + * + * @warning If this ioctx is used by rados_watch, the caller needs to + * be sure that all registered watches are disconnected via + * rados_unwatch() and that rados_watch_flush() is called. This + * ensures that a racing watch callback does not make use of a + * destroyed ioctx. + * + * @param io the io context to dispose of + */ +CEPH_RADOS_API void rados_ioctx_destroy(rados_ioctx_t io); + +/** + * Get configuration handle for a pool handle + * + * @param io pool handle + * @returns rados_config_t for this cluster + */ +CEPH_RADOS_API rados_config_t rados_ioctx_cct(rados_ioctx_t io); + +/** + * Get the cluster handle used by this rados_ioctx_t + * Note that this is a weak reference, and should not + * be destroyed via rados_shutdown(). + * + * @param io the io context + * @returns the cluster handle for this io context + */ +CEPH_RADOS_API rados_t rados_ioctx_get_cluster(rados_ioctx_t io); + +/** + * Get pool usage statistics + * + * Fills in a rados_pool_stat_t after querying the cluster. 
+ * + * @param io determines which pool to query + * @param stats where to store the results + * @returns 0 on success, negative error code on failure + */ +CEPH_RADOS_API int rados_ioctx_pool_stat(rados_ioctx_t io, + struct rados_pool_stat_t *stats); + +/** + * Get the id of a pool + * + * @param cluster which cluster the pool is in + * @param pool_name which pool to look up + * @returns id of the pool + * @returns -ENOENT if the pool is not found + */ +CEPH_RADOS_API int64_t rados_pool_lookup(rados_t cluster, + const char *pool_name); + +/** + * Get the name of a pool + * + * @param cluster which cluster the pool is in + * @param id the id of the pool + * @param buf where to store the pool name + * @param maxlen size of buffer where name will be stored + * @returns length of string stored, or -ERANGE if buffer too small + */ +CEPH_RADOS_API int rados_pool_reverse_lookup(rados_t cluster, int64_t id, + char *buf, size_t maxlen); + +/** + * Create a pool with default settings + * + * The default crush rule is rule 0. + * + * @param cluster the cluster in which the pool will be created + * @param pool_name the name of the new pool + * @returns 0 on success, negative error code on failure + */ +CEPH_RADOS_API int rados_pool_create(rados_t cluster, const char *pool_name); + +/** + * Create a pool owned by a specific auid. + * + * DEPRECATED: auid support has been removed, and this call will be removed in a future + * release. 
+ * + * @param cluster the cluster in which the pool will be created + * @param pool_name the name of the new pool + * @param auid the id of the owner of the new pool + * @returns 0 on success, negative error code on failure + */ +CEPH_RADOS_API int rados_pool_create_with_auid(rados_t cluster, + const char *pool_name, + uint64_t auid) + __attribute__((deprecated)); + +/** + * Create a pool with a specific CRUSH rule + * + * @param cluster the cluster in which the pool will be created + * @param pool_name the name of the new pool + * @param crush_rule_num which rule to use for placement in the new pool + * @returns 0 on success, negative error code on failure + */ +CEPH_RADOS_API int rados_pool_create_with_crush_rule(rados_t cluster, + const char *pool_name, + uint8_t crush_rule_num); + +/** + * Create a pool with a specific CRUSH rule and auid + * + * DEPRECATED: auid support has been removed and this call will be removed + * in a future release. + * + * This is a combination of rados_pool_create_with_crush_rule() and + * rados_pool_create_with_auid(). + * + * @param cluster the cluster in which the pool will be created + * @param pool_name the name of the new pool + * @param crush_rule_num which rule to use for placement in the new pool + * @param auid the id of the owner of the new pool + * @returns 0 on success, negative error code on failure + */ +CEPH_RADOS_API int rados_pool_create_with_all(rados_t cluster, + const char *pool_name, + uint64_t auid, + uint8_t crush_rule_num) + __attribute__((deprecated)); + +/** + * Returns the pool that is the base tier for this pool. + * + * The return value is the ID of the pool that should be used to read from/write to. + * If tiering is not set up for the pool, returns \c pool. 
+ * + * @param cluster the cluster the pool is in + * @param pool ID of the pool to query + * @param base_tier [out] base tier, or \c pool if tiering is not configured + * @returns 0 on success, negative error code on failure + */ +CEPH_RADOS_API int rados_pool_get_base_tier(rados_t cluster, int64_t pool, + int64_t* base_tier); + +/** + * Delete a pool and all data inside it + * + * The pool is removed from the cluster immediately, + * but the actual data is deleted in the background. + * + * @param cluster the cluster the pool is in + * @param pool_name which pool to delete + * @returns 0 on success, negative error code on failure + */ +CEPH_RADOS_API int rados_pool_delete(rados_t cluster, const char *pool_name); + +/** + * Attempt to change an io context's associated auid "owner" + * + * DEPRECATED: auid support has been removed and this call has no effect. + * + * Requires that you have write permission on both the current and new + * auid. + * + * @param io reference to the pool to change. + * @param auid the auid you wish the io to have. + * @returns 0 on success, negative error code on failure + */ +CEPH_RADOS_API int rados_ioctx_pool_set_auid(rados_ioctx_t io, uint64_t auid) + __attribute__((deprecated)); + + +/** + * Get the auid of a pool + * + * DEPRECATED: auid support has been removed and this call always reports + * CEPH_AUTH_UID_DEFAULT (-1). + * + * @param io pool to query + * @param auid where to store the auid + * @returns 0 on success, negative error code on failure + */ +CEPH_RADOS_API int rados_ioctx_pool_get_auid(rados_ioctx_t io, uint64_t *auid) + __attribute__((deprecated)); + +/* deprecated, use rados_ioctx_pool_requires_alignment2 instead */ +CEPH_RADOS_API int rados_ioctx_pool_requires_alignment(rados_ioctx_t io) + __attribute__((deprecated)); + +/** + * Test whether the specified pool requires alignment or not. + * + * @param io pool to query + * @param req where to store the result: 1 if alignment is required, 0 if not. 
+ * @returns 0 on success, negative error code on failure + */ +CEPH_RADOS_API int rados_ioctx_pool_requires_alignment2(rados_ioctx_t io, + int *req); + +/* deprecated, use rados_ioctx_pool_required_alignment2 instead */ +CEPH_RADOS_API uint64_t rados_ioctx_pool_required_alignment(rados_ioctx_t io) + __attribute__((deprecated)); + +/** + * Get the alignment flavor of a pool + * + * @param io pool to query + * @param alignment where to store the alignment flavor + * @returns 0 on success, negative error code on failure + */ +CEPH_RADOS_API int rados_ioctx_pool_required_alignment2(rados_ioctx_t io, + uint64_t *alignment); + +/** + * Get the pool id of the io context + * + * @param io the io context to query + * @returns the id of the pool the io context uses + */ +CEPH_RADOS_API int64_t rados_ioctx_get_id(rados_ioctx_t io); + +/** + * Get the pool name of the io context + * + * @param io the io context to query + * @param buf pointer to buffer where name will be stored + * @param maxlen size of buffer where name will be stored + * @returns length of string stored, or -ERANGE if buffer too small + */ +CEPH_RADOS_API int rados_ioctx_get_pool_name(rados_ioctx_t io, char *buf, + unsigned maxlen); + +/** @} pools */ + +/** + * @name Object Locators + * + * @{ + */ + +/** + * Set the key for mapping objects to pgs within an io context. + * + * The key is used instead of the object name to determine which + * placement groups an object is put in. This affects all subsequent + * operations of the io context - until a different locator key is + * set, all objects in this io context will be placed in the same pg. 
+ * + * @param io the io context to change + * @param key the key to use as the object locator, or NULL to discard + * any previously set key + */ +CEPH_RADOS_API void rados_ioctx_locator_set_key(rados_ioctx_t io, + const char *key); + +/** + * Set the namespace for objects within an io context + * + * The namespace specification further refines a pool into different + * domains. The mapping of objects to pgs is also based on this + * value. + * + * @param io the io context to change + * @param nspace the name to use as the namespace, or NULL use the + * default namespace + */ +CEPH_RADOS_API void rados_ioctx_set_namespace(rados_ioctx_t io, + const char *nspace); + +/** + * Get the namespace for objects within the io context + * + * @param io the io context to query + * @param buf pointer to buffer where name will be stored + * @param maxlen size of buffer where name will be stored + * @returns length of string stored, or -ERANGE if buffer too small + */ +CEPH_RADOS_API int rados_ioctx_get_namespace(rados_ioctx_t io, char *buf, + unsigned maxlen); + +/** @} obj_loc */ + +/** + * @name Listing Objects + * @{ + */ +/** + * Start listing objects in a pool + * + * @param io the pool to list from + * @param ctx the handle to store list context in + * @returns 0 on success, negative error code on failure + */ +CEPH_RADOS_API int rados_nobjects_list_open(rados_ioctx_t io, + rados_list_ctx_t *ctx); + +/** + * Return hash position of iterator, rounded to the current PG + * + * @param ctx iterator marking where you are in the listing + * @returns current hash position, rounded to the current pg + */ +CEPH_RADOS_API uint32_t rados_nobjects_list_get_pg_hash_position(rados_list_ctx_t ctx); + +/** + * Reposition object iterator to a different hash position + * + * @param ctx iterator marking where you are in the listing + * @param pos hash position to move to + * @returns actual (rounded) position we moved to + */ +CEPH_RADOS_API uint32_t 
rados_nobjects_list_seek(rados_list_ctx_t ctx, + uint32_t pos); + +/** + * Reposition object iterator to a different position + * + * @param ctx iterator marking where you are in the listing + * @param cursor position to move to + * @returns rounded position we moved to + */ +CEPH_RADOS_API uint32_t rados_nobjects_list_seek_cursor(rados_list_ctx_t ctx, + rados_object_list_cursor cursor); + +/** + * Get a cursor for the current position of the object iterator + * + * The returned handle must be released with rados_object_list_cursor_free(). + * + * @param ctx iterator marking where you are in the listing + * @param cursor where to store cursor + * @returns 0 on success, negative error code on failure + */ +CEPH_RADOS_API int rados_nobjects_list_get_cursor(rados_list_ctx_t ctx, + rados_object_list_cursor *cursor); + +/** + * Get the next object name and locator in the pool + * + * *entry and *key are valid until next call to rados_nobjects_list_* + * + * @param ctx iterator marking where you are in the listing + * @param entry where to store the name of the entry + * @param key where to store the object locator (set to NULL to ignore) + * @param nspace where to store the object namespace (set to NULL to ignore) + * @returns 0 on success, negative error code on failure + * @returns -ENOENT when there are no more objects to list + */ +CEPH_RADOS_API int rados_nobjects_list_next(rados_list_ctx_t ctx, + const char **entry, + const char **key, + const char **nspace); + +/** + * Get the next object name, locator and their sizes in the pool + * + * The sizes allow listing objects with \0 (the NUL character) + * in e.g. *entry. It is unusual to see such object names, but a bug + * in a client has raised the need to handle them as well. 
+ * *entry and *key are valid until next call to rados_nobjects_list_* + * + * @param ctx iterator marking where you are in the listing + * @param entry where to store the name of the entry + * @param key where to store the object locator (set to NULL to ignore) + * @param nspace where to store the object namespace (set to NULL to ignore) + * @param entry_size where to store the size of name of the entry + * @param key_size where to store the size of object locator (set to NULL to ignore) + * @param nspace_size where to store the size of object namespace (set to NULL to ignore) + * @returns 0 on success, negative error code on failure + * @returns -ENOENT when there are no more objects to list + */ +CEPH_RADOS_API int rados_nobjects_list_next2(rados_list_ctx_t ctx, + const char **entry, + const char **key, + const char **nspace, + size_t *entry_size, + size_t *key_size, + size_t *nspace_size); + +/** + * Close the object listing handle. + * + * This should be called when the handle is no longer needed. + * The handle should not be used after it has been closed. + * + * @param ctx the handle to close + */ +CEPH_RADOS_API void rados_nobjects_list_close(rados_list_ctx_t ctx); + +/** + * Get cursor handle pointing to the *beginning* of a pool. + * + * This is an opaque handle pointing to the start of a pool. It must + * be released with rados_object_list_cursor_free(). + * + * @param io ioctx for the pool + * @returns handle for the pool, NULL on error (pool does not exist) + */ +CEPH_RADOS_API rados_object_list_cursor rados_object_list_begin( + rados_ioctx_t io); + +/** + * Get cursor handle pointing to the *end* of a pool. + * + * This is an opaque handle pointing to the end of a pool. It must + * be released with rados_object_list_cursor_free(). 
+ * + * @param io ioctx for the pool + * @returns handle for the pool, NULL on error (pool does not exist) + */ +CEPH_RADOS_API rados_object_list_cursor rados_object_list_end(rados_ioctx_t io); + +/** + * Check if a cursor has reached the end of a pool + * + * @param io ioctx + * @param cur cursor + * @returns 1 if the cursor has reached the end of the pool, 0 otherwise + */ +CEPH_RADOS_API int rados_object_list_is_end(rados_ioctx_t io, + rados_object_list_cursor cur); + +/** + * Release a cursor + * + * Release a cursor. The handle may not be used after this point. + * + * @param io ioctx + * @param cur cursor + */ +CEPH_RADOS_API void rados_object_list_cursor_free(rados_ioctx_t io, + rados_object_list_cursor cur); + +/** + * Compare two cursor positions + * + * Compare two cursors, and indicate whether the first cursor precedes, + * matches, or follows the second. + * + * @param io ioctx + * @param lhs first cursor + * @param rhs second cursor + * @returns -1, 0, or 1 for lhs < rhs, lhs == rhs, or lhs > rhs + */ +CEPH_RADOS_API int rados_object_list_cursor_cmp(rados_ioctx_t io, + rados_object_list_cursor lhs, rados_object_list_cursor rhs); + +/** + * @return the number of items set in the results array + */ +CEPH_RADOS_API int rados_object_list(rados_ioctx_t io, + const rados_object_list_cursor start, + const rados_object_list_cursor finish, + const size_t result_size, + const char *filter_buf, + const size_t filter_buf_len, + rados_object_list_item *results, + rados_object_list_cursor *next); + +CEPH_RADOS_API void rados_object_list_free( + const size_t result_size, + rados_object_list_item *results); + +/** + * Obtain cursors delineating a subset of a range. Use this + * when you want to split up the work of iterating over the + * global namespace. Expected use case is when you are iterating + * in parallel, with `m` workers, and each worker taking an id `n`. 
+ * + * @param io ioctx + * @param start start of the range to be sliced up (inclusive) + * @param finish end of the range to be sliced up (exclusive) + * @param n which of the m chunks you would like to get cursors for + * @param m how many chunks to divide start-finish into + * @param split_start cursor populated with start of the subrange (inclusive) + * @param split_finish cursor populated with end of the subrange (exclusive) + */ +CEPH_RADOS_API void rados_object_list_slice(rados_ioctx_t io, + const rados_object_list_cursor start, + const rados_object_list_cursor finish, + const size_t n, + const size_t m, + rados_object_list_cursor *split_start, + rados_object_list_cursor *split_finish); + + +/** @} Listing Objects */ + +/** + * @name Snapshots + * + * RADOS snapshots are based upon sequence numbers that form a + * snapshot context. They are pool-specific. The snapshot context + * consists of the current snapshot sequence number for a pool, and an + * array of sequence numbers at which snapshots were taken, in + * descending order. Whenever a snapshot is created or deleted, the + * snapshot sequence number for the pool is increased. To add a new + * snapshot, the new snapshot sequence number must be increased and + * added to the snapshot context. + * + * There are two ways to manage these snapshot contexts: + * -# within the RADOS cluster + * These are called pool snapshots, and store the snapshot context + * in the OSDMap. These represent a snapshot of all the objects in + * a pool. + * -# within the RADOS clients + * These are called self-managed snapshots, and push the + * responsibility for keeping track of the snapshot context to the + * clients. For every write, the client must send the snapshot + * context. In librados, this is accomplished with + * rados_selfmanaged_snap_set_write_ctx(). These are more + * difficult to manage, but are restricted to specific objects + * instead of applying to an entire pool. 
+ * + * @{ + */ + +/** + * Create a pool-wide snapshot + * + * @param io the pool to snapshot + * @param snapname the name of the snapshot + * @returns 0 on success, negative error code on failure + */ +CEPH_RADOS_API int rados_ioctx_snap_create(rados_ioctx_t io, + const char *snapname); + +/** + * Delete a pool snapshot + * + * @param io the pool to delete the snapshot from + * @param snapname which snapshot to delete + * @returns 0 on success, negative error code on failure + */ +CEPH_RADOS_API int rados_ioctx_snap_remove(rados_ioctx_t io, + const char *snapname); + +/** + * Rollback an object to a pool snapshot + * + * The contents of the object will be the same as + * when the snapshot was taken. + * + * @param io the pool in which the object is stored + * @param oid the name of the object to rollback + * @param snapname which snapshot to rollback to + * @returns 0 on success, negative error code on failure + */ +CEPH_RADOS_API int rados_ioctx_snap_rollback(rados_ioctx_t io, const char *oid, + const char *snapname); + +/** + * @warning Deprecated: Use rados_ioctx_snap_rollback() instead + */ +CEPH_RADOS_API int rados_rollback(rados_ioctx_t io, const char *oid, + const char *snapname) + __attribute__((deprecated)); + +/** + * Set the snapshot from which reads are performed. + * + * Subsequent reads will return data as it was at the time of that + * snapshot. + * + * @param io the io context to change + * @param snap the id of the snapshot to set, or LIBRADOS_SNAP_HEAD for no + * snapshot (i.e. normal operation) + */ +CEPH_RADOS_API void rados_ioctx_snap_set_read(rados_ioctx_t io, + rados_snap_t snap); + +/** + * Allocate an ID for a self-managed snapshot + * + * Get a unique ID to put in the snapshot context to create a + * snapshot. A clone of an object is not created until a write with + * the new snapshot context is completed. 
+ * + * @param io the pool in which the snapshot will exist + * @param snapid where to store the newly allocated snapshot ID + * @returns 0 on success, negative error code on failure + */ +CEPH_RADOS_API int rados_ioctx_selfmanaged_snap_create(rados_ioctx_t io, + rados_snap_t *snapid); +CEPH_RADOS_API void +rados_aio_ioctx_selfmanaged_snap_create(rados_ioctx_t io, + rados_snap_t *snapid, + rados_completion_t completion); + +/** + * Remove a self-managed snapshot + * + * This increases the snapshot sequence number, which will cause + * snapshots to be removed lazily. + * + * @param io the pool in which the snapshot exists + * @param snapid the ID of the snapshot to remove + * @returns 0 on success, negative error code on failure + */ +CEPH_RADOS_API int rados_ioctx_selfmanaged_snap_remove(rados_ioctx_t io, + rados_snap_t snapid); +CEPH_RADOS_API void +rados_aio_ioctx_selfmanaged_snap_remove(rados_ioctx_t io, + rados_snap_t snapid, + rados_completion_t completion); + +/** + * Rollback an object to a self-managed snapshot + * + * The contents of the object will be the same as + * when the snapshot was taken. + * + * @param io the pool in which the object is stored + * @param oid the name of the object to rollback + * @param snapid which snapshot to rollback to + * @returns 0 on success, negative error code on failure + */ +CEPH_RADOS_API int rados_ioctx_selfmanaged_snap_rollback(rados_ioctx_t io, + const char *oid, + rados_snap_t snapid); + +/** + * Set the snapshot context for use when writing to objects + * + * This is stored in the io context, and applies to all future writes. 
+ * + * @param io the io context to change + * @param seq the newest snapshot sequence number for the pool + * @param snaps array of snapshots sorted by descending id + * @param num_snaps how many snapshots are in the snaps array + * @returns 0 on success, negative error code on failure + * @returns -EINVAL if snaps are not in descending order + */ +CEPH_RADOS_API int rados_ioctx_selfmanaged_snap_set_write_ctx(rados_ioctx_t io, + rados_snap_t seq, + rados_snap_t *snaps, + int num_snaps); + +/** + * List all the ids of pool snapshots + * + * If the output array does not have enough space to fit all the + * snapshots, -ERANGE is returned and the caller should retry with a + * larger array. + * + * @param io the pool to read from + * @param snaps where to store the results + * @param maxlen the number of rados_snap_t that fit in the snaps array + * @returns number of snapshots on success, negative error code on failure + * @returns -ERANGE is returned if the snaps array is too short + */ +CEPH_RADOS_API int rados_ioctx_snap_list(rados_ioctx_t io, rados_snap_t *snaps, + int maxlen); + +/** + * Get the id of a pool snapshot + * + * @param io the pool to read from + * @param name the snapshot to find + * @param id where to store the result + * @returns 0 on success, negative error code on failure + */ +CEPH_RADOS_API int rados_ioctx_snap_lookup(rados_ioctx_t io, const char *name, + rados_snap_t *id); + +/** + * Get the name of a pool snapshot + * + * @param io the pool to read from + * @param id the snapshot to find + * @param name where to store the result + * @param maxlen the size of the name array + * @returns 0 on success, negative error code on failure + * @returns -ERANGE if the name array is too small + */ +CEPH_RADOS_API int rados_ioctx_snap_get_name(rados_ioctx_t io, rados_snap_t id, + char *name, int maxlen); + +/** + * Find when a pool snapshot occurred + * + * @param io the pool the snapshot was taken in + * @param id the snapshot to lookup + * @param t 
where to store the result + * @returns 0 on success, negative error code on failure + */ +CEPH_RADOS_API int rados_ioctx_snap_get_stamp(rados_ioctx_t io, rados_snap_t id, + time_t *t); + +/** @} Snapshots */ + +/** + * @name Synchronous I/O + * Writes are replicated to a number of OSDs based on the + * configuration of the pool they are in. These write functions block + * until data is in memory on all replicas of the object they're + * writing to - they are equivalent to doing the corresponding + * asynchronous write, and then calling + * rados_aio_wait_for_complete(). For greater data safety, use the + * asynchronous functions and rados_aio_wait_for_safe(). + * + * @{ + */ + +/** + * Return the version of the last object read or written to. + * + * This exposes the internal version number of the last object read or + * written via this io context + * + * @param io the io context to check + * @returns last read or written object version + */ +CEPH_RADOS_API uint64_t rados_get_last_version(rados_ioctx_t io); + +/** + * Write *len* bytes from *buf* into the *oid* object, starting at + * offset *off*. The value of *len* must be <= UINT_MAX/2. + * + * @note This will never return a positive value not equal to len. + * @param io the io context in which the write will occur + * @param oid name of the object + * @param buf data to write + * @param len length of the data, in bytes + * @param off byte offset in the object to begin writing at + * @returns 0 on success, negative error code on failure + */ +CEPH_RADOS_API int rados_write(rados_ioctx_t io, const char *oid, + const char *buf, size_t len, uint64_t off); + +/** + * Write *len* bytes from *buf* into the *oid* object. The value of + * *len* must be <= UINT_MAX/2. + * + * The object is filled with the provided data. If the object exists, + * it is atomically truncated and then written. 
+ * + * @param io the io context in which the write will occur + * @param oid name of the object + * @param buf data to write + * @param len length of the data, in bytes + * @returns 0 on success, negative error code on failure + */ +CEPH_RADOS_API int rados_write_full(rados_ioctx_t io, const char *oid, + const char *buf, size_t len); + +/** + * Write the same *data_len* bytes from *buf* multiple times into the + * *oid* object. *write_len* bytes are written in total, which must be + * a multiple of *data_len*. The value of *write_len* and *data_len* + * must be <= UINT_MAX/2. + * + * @param io the io context in which the write will occur + * @param oid name of the object + * @param buf data to write + * @param data_len length of the data, in bytes + * @param write_len the total number of bytes to write + * @param off byte offset in the object to begin writing at + * @returns 0 on success, negative error code on failure + */ +CEPH_RADOS_API int rados_writesame(rados_ioctx_t io, const char *oid, + const char *buf, size_t data_len, + size_t write_len, uint64_t off); + +/** + * Append *len* bytes from *buf* into the *oid* object. The value of + * *len* must be <= UINT_MAX/2. + * + * @param io the context to operate in + * @param oid the name of the object + * @param buf the data to append + * @param len length of buf (in bytes) + * @returns 0 on success, negative error code on failure + */ +CEPH_RADOS_API int rados_append(rados_ioctx_t io, const char *oid, + const char *buf, size_t len); + +/** + * Read data from an object + * + * The io context determines the snapshot to read from, if any was set + * by rados_ioctx_snap_set_read(). 
+ * + * @param io the context in which to perform the read + * @param oid the name of the object to read from + * @param buf where to store the results + * @param len the number of bytes to read + * @param off the offset to start reading from in the object + * @returns number of bytes read on success, negative error code on + * failure + */ +CEPH_RADOS_API int rados_read(rados_ioctx_t io, const char *oid, char *buf, + size_t len, uint64_t off); + +/** + * Compute checksum from object data + * + * The io context determines the snapshot to checksum, if any was set + * by rados_ioctx_snap_set_read(). The length of the init_value and + * resulting checksum are dependent upon the checksum type: + * + * XXHASH64: le64 + * XXHASH32: le32 + * CRC32C: le32 + * + * The checksum result is encoded the following manner: + * + * le32 num_checksum_chunks + * { + * leXX checksum for chunk (where XX = appropriate size for the checksum type) + * } * num_checksum_chunks + * + * @param io the context in which to perform the checksum + * @param oid the name of the object to checksum + * @param type the checksum algorithm to utilize + * @param init_value the init value for the algorithm + * @param init_value_len the length of the init value + * @param len the number of bytes to checksum + * @param off the offset to start checksumming in the object + * @param chunk_size optional length-aligned chunk size for checksums + * @param pchecksum where to store the checksum result + * @param checksum_len the number of bytes available for the result + * @return negative error code on failure + */ +CEPH_RADOS_API int rados_checksum(rados_ioctx_t io, const char *oid, + rados_checksum_type_t type, + const char *init_value, size_t init_value_len, + size_t len, uint64_t off, size_t chunk_size, + char *pchecksum, size_t checksum_len); + +/** + * Delete an object + * + * @note This does not delete any snapshots of the object. 
+ * + * @param io the pool to delete the object from + * @param oid the name of the object to delete + * @returns 0 on success, negative error code on failure + */ +CEPH_RADOS_API int rados_remove(rados_ioctx_t io, const char *oid); + +/** + * Resize an object + * + * If this enlarges the object, the new area is logically filled with + * zeroes. If this shrinks the object, the excess data is removed. + * + * @param io the context in which to truncate + * @param oid the name of the object + * @param size the new size of the object in bytes + * @returns 0 on success, negative error code on failure + */ +CEPH_RADOS_API int rados_trunc(rados_ioctx_t io, const char *oid, + uint64_t size); + +/** + * Compare an on-disk object range with a buffer + * + * @param io the context in which to perform the comparison + * @param o name of the object + * @param cmp_buf buffer containing bytes to be compared with object contents + * @param cmp_len length to compare and size of @c cmp_buf in bytes + * @param off object byte offset at which to start the comparison + * @returns 0 on success, negative error code on failure, + * (-MAX_ERRNO - mismatch_off) on mismatch + */ +CEPH_RADOS_API int rados_cmpext(rados_ioctx_t io, const char *o, + const char *cmp_buf, size_t cmp_len, + uint64_t off); + +/** + * @name Xattrs + * Extended attributes are stored as extended attributes on the files + * representing an object on the OSDs. Thus, they have the same + * limitations as the underlying filesystem. On ext4, this means that + * the total data stored in xattrs cannot exceed 4KB. + * + * @{ + */ + +/** + * Get the value of an extended attribute on an object. 
+ * + * @param io the context in which the attribute is read + * @param o name of the object + * @param name which extended attribute to read + * @param buf where to store the result + * @param len size of buf in bytes + * @returns length of xattr value on success, negative error code on failure + */ +CEPH_RADOS_API int rados_getxattr(rados_ioctx_t io, const char *o, + const char *name, char *buf, size_t len); + +/** + * Set an extended attribute on an object. + * + * @param io the context in which xattr is set + * @param o name of the object + * @param name which extended attribute to set + * @param buf what to store in the xattr + * @param len the number of bytes in buf + * @returns 0 on success, negative error code on failure + */ +CEPH_RADOS_API int rados_setxattr(rados_ioctx_t io, const char *o, + const char *name, const char *buf, + size_t len); + +/** + * Delete an extended attribute from an object. + * + * @param io the context in which to delete the xattr + * @param o the name of the object + * @param name which xattr to delete + * @returns 0 on success, negative error code on failure + */ +CEPH_RADOS_API int rados_rmxattr(rados_ioctx_t io, const char *o, + const char *name); + +/** + * Start iterating over xattrs on an object. + * + * @post iter is a valid iterator + * + * @param io the context in which to list xattrs + * @param oid name of the object + * @param iter where to store the iterator + * @returns 0 on success, negative error code on failure + */ +CEPH_RADOS_API int rados_getxattrs(rados_ioctx_t io, const char *oid, + rados_xattrs_iter_t *iter); + +/** + * Get the next xattr on the object + * + * @pre iter is a valid iterator + * + * @post name is the NULL-terminated name of the next xattr, and val + * contains the value of the xattr, which is of length len. If the end + * of the list has been reached, name and val are NULL, and len is 0. 
+ * + * @param iter iterator to advance + * @param name where to store the name of the next xattr + * @param val where to store the value of the next xattr + * @param len the number of bytes in val + * @returns 0 on success, negative error code on failure + */ +CEPH_RADOS_API int rados_getxattrs_next(rados_xattrs_iter_t iter, + const char **name, const char **val, + size_t *len); + +/** + * Close the xattr iterator. + * + * iter should not be used after this is called. + * + * @param iter the iterator to close + */ +CEPH_RADOS_API void rados_getxattrs_end(rados_xattrs_iter_t iter); + +/** @} Xattrs */ + +/** + * Get the next omap key/value pair on the object + * + * @pre iter is a valid iterator + * + * @post key and val are the next key/value pair. key is + * null-terminated, and val has length len. If the end of the list has + * been reached, key and val are NULL, and len is 0. key and val will + * not be accessible after rados_omap_get_end() is called on iter, so + * if they are needed after that they should be copied. + * + * @param iter iterator to advance + * @param key where to store the key of the next omap entry + * @param val where to store the value of the next omap entry + * @param len where to store the number of bytes in val + * @returns 0 on success, negative error code on failure + */ +CEPH_RADOS_API int rados_omap_get_next(rados_omap_iter_t iter, + char **key, + char **val, + size_t *len); + +/** + * Get the next omap key/value pair on the object. Note that it's + * perfectly safe to mix calls to rados_omap_get_next and + * rados_omap_get_next2. + * + * @pre iter is a valid iterator + * + * @post key and val are the next key/value pair. key has length + * keylen and val has length vallen. If the end of the list has + * been reached, key and val are NULL, and keylen and vallen is 0. + * key and val will not be accessible after rados_omap_get_end() + * is called on iter, so if they are needed after that they + * should be copied. 
+ * + * @param iter iterator to advance + * @param key where to store the key of the next omap entry + * @param val where to store the value of the next omap entry + * @param key_len where to store the number of bytes in key + * @param val_len where to store the number of bytes in val + * @returns 0 on success, negative error code on failure + */ +CEPH_RADOS_API int rados_omap_get_next2(rados_omap_iter_t iter, + char **key, + char **val, + size_t *key_len, + size_t *val_len); + +/** + * Return number of elements in the iterator + * + * @param iter the iterator of which to return the size + */ +CEPH_RADOS_API unsigned int rados_omap_iter_size(rados_omap_iter_t iter); + +/** + * Close the omap iterator. + * + * iter should not be used after this is called. + * + * @param iter the iterator to close + */ +CEPH_RADOS_API void rados_omap_get_end(rados_omap_iter_t iter); + +/** + * Get object size and most recent update time from the OSD. + * + * @param io ioctx + * @param o object name + * @param psize where to store object size + * @param pmtime where to store modification time + * @returns 0 on success, negative error code on failure + */ +CEPH_RADOS_API int rados_stat(rados_ioctx_t io, const char *o, uint64_t *psize, + time_t *pmtime); + +CEPH_RADOS_API int rados_stat2(rados_ioctx_t io, const char *o, uint64_t *psize, + struct timespec *pmtime); + +/** + * Execute an OSD class method on an object + * + * The OSD has a plugin mechanism for performing complicated + * operations on an object atomically. These plugins are called + * classes. This function allows librados users to call the custom + * methods. The input and output formats are defined by the class. 
+ * Classes in ceph.git can be found in src/cls subdirectories + * + * @param io the context in which to call the method + * @param oid the object to call the method on + * @param cls the name of the class + * @param method the name of the method + * @param in_buf where to find input + * @param in_len length of in_buf in bytes + * @param buf where to store output + * @param out_len length of buf in bytes + * @returns the length of the output, or + * -ERANGE if buf does not have enough space to store it (For methods that return data). For + * methods that don't return data, the return value is + * method-specific. + */ +CEPH_RADOS_API int rados_exec(rados_ioctx_t io, const char *oid, + const char *cls, const char *method, + const char *in_buf, size_t in_len, char *buf, + size_t out_len); + + +/** @} Synchronous I/O */ + +/** + * @name Asynchronous I/O + * Read and write to objects without blocking. + * + * @{ + */ + +/** + * @typedef rados_callback_t + * Callbacks for asynchronous operations take two parameters: + * - cb the completion that has finished + * - arg application defined data made available to the callback function + */ +typedef void (*rados_callback_t)(rados_completion_t cb, void *arg); + +/** + * Constructs a completion to use with asynchronous operations + * + * The complete and safe callbacks correspond to operations being + * acked and committed, respectively. The callbacks are called in + * order of receipt, so the safe callback may be triggered before the + * complete callback, and vice versa. This is affected by journalling + * on the OSDs. + * + * TODO: more complete documentation of this elsewhere (in the RADOS docs?) + * + * @note Read operations only get a complete callback. 
+ * @note BUG: this should check for ENOMEM instead of throwing an exception + * + * @param cb_arg application-defined data passed to the callback functions + * @param cb_complete the function to be called when the operation is + * in memory on all replicas + * @param cb_safe the function to be called when the operation is on + * stable storage on all replicas + * @param pc where to store the completion + * @returns 0 + */ +CEPH_RADOS_API int rados_aio_create_completion(void *cb_arg, + rados_callback_t cb_complete, + rados_callback_t cb_safe, + rados_completion_t *pc); + +/** + * Constructs a completion to use with asynchronous operations + * + * The complete callback corresponds to operation being acked. + * + * @note BUG: this should check for ENOMEM instead of throwing an exception + * + * @param cb_arg application-defined data passed to the callback functions + * @param cb_complete the function to be called when the operation is committed + * on all replicas + * @param pc where to store the completion + * @returns 0 + */ +CEPH_RADOS_API int rados_aio_create_completion2(void *cb_arg, + rados_callback_t cb_complete, + rados_completion_t *pc); + +/** + * Block until an operation completes + * + * This means it is in memory on all replicas. + * + * @note BUG: this should be void + * + * @param c operation to wait for + * @returns 0 + */ +CEPH_RADOS_API int rados_aio_wait_for_complete(rados_completion_t c); + +/** + * Block until an operation is safe + * + * This means it is on stable storage on all replicas. + * + * @note BUG: this should be void + * + * @param c operation to wait for + * @returns 0 + */ +CEPH_RADOS_API int rados_aio_wait_for_safe(rados_completion_t c) + __attribute__((deprecated)); + +/** + * Has an asynchronous operation completed? 
+ * + * @warning This does not imply that the complete callback has + * finished + * + * @param c async operation to inspect + * @returns whether c is complete + */ +CEPH_RADOS_API int rados_aio_is_complete(rados_completion_t c); + +/** + * Is an asynchronous operation safe? + * + * @warning This does not imply that the safe callback has + * finished + * + * @param c async operation to inspect + * @returns whether c is safe + */ +CEPH_RADOS_API int rados_aio_is_safe(rados_completion_t c); + +/** + * Block until an operation completes and callback completes + * + * This means it is in memory on all replicas and can be read. + * + * @note BUG: this should be void + * + * @param c operation to wait for + * @returns 0 + */ +CEPH_RADOS_API int rados_aio_wait_for_complete_and_cb(rados_completion_t c); + +/** + * Block until an operation is safe and callback has completed + * + * This means it is on stable storage on all replicas. + * + * @note BUG: this should be void + * + * @param c operation to wait for + * @returns 0 + */ +CEPH_RADOS_API int rados_aio_wait_for_safe_and_cb(rados_completion_t c) + __attribute__((deprecated)); + +/** + * Has an asynchronous operation and callback completed + * + * @param c async operation to inspect + * @returns whether c is complete + */ +CEPH_RADOS_API int rados_aio_is_complete_and_cb(rados_completion_t c); + +/** + * Is an asynchronous operation safe and has the callback completed + * + * @param c async operation to inspect + * @returns whether c is safe + */ +CEPH_RADOS_API int rados_aio_is_safe_and_cb(rados_completion_t c); + +/** + * Get the return value of an asynchronous operation + * + * The return value is set when the operation is complete or safe, + * whichever comes first. 
+ * + * @pre The operation is safe or complete + * + * @note BUG: complete callback may never be called when the safe + * message is received before the complete message + * + * @param c async operation to inspect + * @returns return value of the operation + */ +CEPH_RADOS_API int rados_aio_get_return_value(rados_completion_t c); + +/** + * Get the internal object version of the target of an asynchronous operation + * + * The return value is set when the operation is complete or safe, + * whichever comes first. + * + * @pre The operation is safe or complete + * + * @note BUG: complete callback may never be called when the safe + * message is received before the complete message + * + * @param c async operation to inspect + * @returns version number of the asynchronous operation's target + */ +CEPH_RADOS_API uint64_t rados_aio_get_version(rados_completion_t c); + +/** + * Release a completion + * + * Call this when you no longer need the completion. It may not be + * freed immediately if the operation is not acked and committed. + * + * @param c completion to release + */ +CEPH_RADOS_API void rados_aio_release(rados_completion_t c); + +/** + * Write data to an object asynchronously + * + * Queues the write and returns. The return value of the completion + * will be 0 on success, negative error code on failure. + * + * @param io the context in which the write will occur + * @param oid name of the object + * @param completion what to do when the write is safe and complete + * @param buf data to write + * @param len length of the data, in bytes + * @param off byte offset in the object to begin writing at + * @returns 0 on success, -EROFS if the io context specifies a snap_seq + * other than LIBRADOS_SNAP_HEAD + */ +CEPH_RADOS_API int rados_aio_write(rados_ioctx_t io, const char *oid, + rados_completion_t completion, + const char *buf, size_t len, uint64_t off); + +/** + * Asynchronously append data to an object + * + * Queues the append and returns. 
+ * + * The return value of the completion will be 0 on success, negative + * error code on failure. + * + * @param io the context to operate in + * @param oid the name of the object + * @param completion what to do when the append is safe and complete + * @param buf the data to append + * @param len length of buf (in bytes) + * @returns 0 on success, -EROFS if the io context specifies a snap_seq + * other than LIBRADOS_SNAP_HEAD + */ +CEPH_RADOS_API int rados_aio_append(rados_ioctx_t io, const char *oid, + rados_completion_t completion, + const char *buf, size_t len); + +/** + * Asynchronously write an entire object + * + * The object is filled with the provided data. If the object exists, + * it is atomically truncated and then written. + * Queues the write_full and returns. + * + * The return value of the completion will be 0 on success, negative + * error code on failure. + * + * @param io the io context in which the write will occur + * @param oid name of the object + * @param completion what to do when the write_full is safe and complete + * @param buf data to write + * @param len length of the data, in bytes + * @returns 0 on success, -EROFS if the io context specifies a snap_seq + * other than LIBRADOS_SNAP_HEAD + */ +CEPH_RADOS_API int rados_aio_write_full(rados_ioctx_t io, const char *oid, + rados_completion_t completion, + const char *buf, size_t len); + +/** + * Asynchronously write the same buffer multiple times + * + * Queues the writesame and returns. + * + * The return value of the completion will be 0 on success, negative + * error code on failure. 
+ * + * @param io the io context in which the write will occur + * @param oid name of the object + * @param completion what to do when the writesame is safe and complete + * @param buf data to write + * @param data_len length of the data, in bytes + * @param write_len the total number of bytes to write + * @param off byte offset in the object to begin writing at + * @returns 0 on success, -EROFS if the io context specifies a snap_seq + * other than LIBRADOS_SNAP_HEAD + */ +CEPH_RADOS_API int rados_aio_writesame(rados_ioctx_t io, const char *oid, + rados_completion_t completion, + const char *buf, size_t data_len, + size_t write_len, uint64_t off); + +/** + * Asynchronously remove an object + * + * Queues the remove and returns. + * + * The return value of the completion will be 0 on success, negative + * error code on failure. + * + * @param io the context to operate in + * @param oid the name of the object + * @param completion what to do when the remove is safe and complete + * @returns 0 on success, -EROFS if the io context specifies a snap_seq + * other than LIBRADOS_SNAP_HEAD + */ +CEPH_RADOS_API int rados_aio_remove(rados_ioctx_t io, const char *oid, + rados_completion_t completion); + +/** + * Asynchronously read data from an object + * + * The io context determines the snapshot to read from, if any was set + * by rados_ioctx_snap_set_read(). + * + * The return value of the completion will be number of bytes read on + * success, negative error code on failure. + * + * @note only the 'complete' callback of the completion will be called. 
+ * + * @param io the context in which to perform the read + * @param oid the name of the object to read from + * @param completion what to do when the read is complete + * @param buf where to store the results + * @param len the number of bytes to read + * @param off the offset to start reading from in the object + * @returns 0 on success, negative error code on failure + */ +CEPH_RADOS_API int rados_aio_read(rados_ioctx_t io, const char *oid, + rados_completion_t completion, + char *buf, size_t len, uint64_t off); + +/** + * Block until all pending writes in an io context are safe + * + * This is not equivalent to calling rados_aio_wait_for_safe() on all + * write completions, since this waits for the associated callbacks to + * complete as well. + * + * @note BUG: always returns 0, should be void or accept a timeout + * + * @param io the context to flush + * @returns 0 on success, negative error code on failure + */ +CEPH_RADOS_API int rados_aio_flush(rados_ioctx_t io); + + +/** + * Schedule a callback for when all currently pending + * aio writes are safe. This is a non-blocking version of + * rados_aio_flush(). 
+ * + * @param io the context to flush + * @param completion what to do when the writes are safe + * @returns 0 on success, negative error code on failure + */ +CEPH_RADOS_API int rados_aio_flush_async(rados_ioctx_t io, + rados_completion_t completion); + + +/** + * Asynchronously get object stats (size/mtime) + * + * @param io ioctx + * @param o object name + * @param completion what to do when the stat is complete + * @param psize where to store object size + * @param pmtime where to store modification time + * @returns 0 on success, negative error code on failure + */ +CEPH_RADOS_API int rados_aio_stat(rados_ioctx_t io, const char *o, + rados_completion_t completion, + uint64_t *psize, time_t *pmtime); + +CEPH_RADOS_API int rados_aio_stat2(rados_ioctx_t io, const char *o, + rados_completion_t completion, + uint64_t *psize, struct timespec *pmtime); + +/** + * Asynchronously compare an on-disk object range with a buffer + * + * @param io the context in which to perform the comparison + * @param o the name of the object to compare with + * @param completion what to do when the comparison is complete + * @param cmp_buf buffer containing bytes to be compared with object contents + * @param cmp_len length to compare and size of @c cmp_buf in bytes + * @param off object byte offset at which to start the comparison + * @returns 0 on success, negative error code on failure, + * (-MAX_ERRNO - mismatch_off) on mismatch + */ +CEPH_RADOS_API int rados_aio_cmpext(rados_ioctx_t io, const char *o, + rados_completion_t completion, + const char *cmp_buf, + size_t cmp_len, + uint64_t off); + +/** + * Cancel async operation + * + * @param io ioctx + * @param completion completion handle + * @returns 0 on success, negative error code on failure + */ +CEPH_RADOS_API int rados_aio_cancel(rados_ioctx_t io, + rados_completion_t completion); + +/** + * Asynchronously execute an OSD class method on an object + * + * The OSD has a plugin mechanism for performing complicated + * operations 
on an object atomically. These plugins are called + * classes. This function allows librados users to call the custom + * methods. The input and output formats are defined by the class. + * Classes in ceph.git can be found in src/cls subdirectories + * + * @param io the context in which to call the method + * @param o name of the object + * @param completion what to do when the exec completes + * @param cls the name of the class + * @param method the name of the method + * @param in_buf where to find input + * @param in_len length of in_buf in bytes + * @param buf where to store output + * @param out_len length of buf in bytes + * @returns 0 on success, negative error code on failure + */ +CEPH_RADOS_API int rados_aio_exec(rados_ioctx_t io, const char *o, + rados_completion_t completion, + const char *cls, const char *method, + const char *in_buf, size_t in_len, + char *buf, size_t out_len); + +/** @} Asynchronous I/O */ + +/** + * @name Asynchronous Xattrs + * Extended attributes are stored as extended attributes on the files + * representing an object on the OSDs. Thus, they have the same + * limitations as the underlying filesystem. On ext4, this means that + * the total data stored in xattrs cannot exceed 4KB. + * + * @{ + */ + +/** + * Asynchronously get the value of an extended attribute on an object. + * + * @param io the context in which the attribute is read + * @param o name of the object + * @param completion what to do when the getxattr completes + * @param name which extended attribute to read + * @param buf where to store the result + * @param len size of buf in bytes + * @returns length of xattr value on success, negative error code on failure + */ +CEPH_RADOS_API int rados_aio_getxattr(rados_ioctx_t io, const char *o, + rados_completion_t completion, + const char *name, char *buf, size_t len); + +/** + * Asynchronously set an extended attribute on an object. 
+ * + * @param io the context in which xattr is set + * @param o name of the object + * @param completion what to do when the setxattr completes + * @param name which extended attribute to set + * @param buf what to store in the xattr + * @param len the number of bytes in buf + * @returns 0 on success, negative error code on failure + */ +CEPH_RADOS_API int rados_aio_setxattr(rados_ioctx_t io, const char *o, + rados_completion_t completion, + const char *name, const char *buf, + size_t len); + +/** + * Asynchronously delete an extended attribute from an object. + * + * @param io the context in which to delete the xattr + * @param o the name of the object + * @param completion what to do when the rmxattr completes + * @param name which xattr to delete + * @returns 0 on success, negative error code on failure + */ +CEPH_RADOS_API int rados_aio_rmxattr(rados_ioctx_t io, const char *o, + rados_completion_t completion, + const char *name); + +/** + * Asynchronously start iterating over xattrs on an object. + * + * @post iter is a valid iterator + * + * @param io the context in which to list xattrs + * @param oid name of the object + * @param completion what to do when the getxattrs completes + * @param iter where to store the iterator + * @returns 0 on success, negative error code on failure + */ +CEPH_RADOS_API int rados_aio_getxattrs(rados_ioctx_t io, const char *oid, + rados_completion_t completion, + rados_xattrs_iter_t *iter); + +/** @} Asynchronous Xattrs */ + +/** + * @name Watch/Notify + * + * Watch/notify is a protocol to help communicate among clients. It + * can be used to synchronize client state. All that's needed is a + * well-known object name (for example, rbd uses the header object of + * an image). + * + * Watchers register an interest in an object, and receive all + * notifies on that object. A notify attempts to communicate with all + * clients watching an object, and blocks on the notifier until each + * client responds or a timeout is reached. 
+ * + * See rados_watch() and rados_notify() for more details. + * + * @{ + */ + +/** + * @typedef rados_watchcb_t + * + * Callback activated when a notify is received on a watched + * object. + * + * @param opcode undefined + * @param ver version of the watched object + * @param arg application-specific data + * + * @note BUG: opcode is an internal detail that shouldn't be exposed + * @note BUG: ver is unused + */ +typedef void (*rados_watchcb_t)(uint8_t opcode, uint64_t ver, void *arg); + +/** + * @typedef rados_watchcb2_t + * + * Callback activated when a notify is received on a watched + * object. + * + * @param arg opaque user-defined value provided to rados_watch2() + * @param notify_id an id for this notify event + * @param handle the watcher handle we are notifying + * @param notifier_id the unique client id for the notifier + * @param data payload from the notifier + * @param data_len length of payload buffer + */ +typedef void (*rados_watchcb2_t)(void *arg, + uint64_t notify_id, + uint64_t handle, + uint64_t notifier_id, + void *data, + size_t data_len); + +/** + * @typedef rados_watcherrcb_t + * + * Callback activated when we encounter an error with the watch session. + * This can happen when the location of the objects moves within the + * cluster and we fail to register our watch with the new object location, + * or when our connection with the object OSD is otherwise interrupted and + * we may have missed notify events. + * + * @param pre opaque user-defined value provided to rados_watch2() + * @param cookie the internal id assigned to the watch session + * @param err error code + */ + typedef void (*rados_watcherrcb_t)(void *pre, uint64_t cookie, int err); + +/** + * Register an interest in an object + * + * A watch operation registers the client as being interested in + * notifications on an object. OSDs keep track of watches on + * persistent storage, so they are preserved across cluster changes by + * the normal recovery process. 
If the client loses its connection to + * the primary OSD for a watched object, the watch will be removed + * after 30 seconds. Watches are automatically reestablished when a new + * connection is made, or a placement group switches OSDs. + * + * @note BUG: librados should provide a way for watchers to notice connection resets + * @note BUG: the ver parameter does not work, and -ERANGE will never be returned + * (See URL tracker.ceph.com/issues/2592) + * + * @param io the pool the object is in + * @param o the object to watch + * @param ver expected version of the object + * @param cookie where to store the internal id assigned to this watch + * @param watchcb what to do when a notify is received on this object + * @param arg application defined data to pass when watchcb is called + * @returns 0 on success, negative error code on failure + * @returns -ERANGE if the version of the object is greater than ver + */ +CEPH_RADOS_API int rados_watch(rados_ioctx_t io, const char *o, uint64_t ver, + uint64_t *cookie, + rados_watchcb_t watchcb, void *arg) + __attribute__((deprecated)); + + +/** + * Register an interest in an object + * + * A watch operation registers the client as being interested in + * notifications on an object. OSDs keep track of watches on + * persistent storage, so they are preserved across cluster changes by + * the normal recovery process. If the client loses its connection to the + * primary OSD for a watched object, the watch will be removed after + * a timeout configured with osd_client_watch_timeout. + * Watches are automatically reestablished when a new + * connection is made, or a placement group switches OSDs. 
+ * + * @param io the pool the object is in + * @param o the object to watch + * @param cookie where to store the internal id assigned to this watch + * @param watchcb what to do when a notify is received on this object + * @param watcherrcb what to do when the watch session encounters an error + * @param arg opaque value to pass to the callback + * @returns 0 on success, negative error code on failure + */ +CEPH_RADOS_API int rados_watch2(rados_ioctx_t io, const char *o, uint64_t *cookie, + rados_watchcb2_t watchcb, + rados_watcherrcb_t watcherrcb, + void *arg); + +/** + * Register an interest in an object + * + * A watch operation registers the client as being interested in + * notifications on an object. OSDs keep track of watches on + * persistent storage, so they are preserved across cluster changes by + * the normal recovery process. Watches are automatically reestablished when a new + * connection is made, or a placement group switches OSDs. + * + * @param io the pool the object is in + * @param o the object to watch + * @param cookie where to store the internal id assigned to this watch + * @param watchcb what to do when a notify is received on this object + * @param watcherrcb what to do when the watch session encounters an error + * @param timeout how many seconds the connection will keep after disconnection + * @param arg opaque value to pass to the callback + * @returns 0 on success, negative error code on failure + */ +CEPH_RADOS_API int rados_watch3(rados_ioctx_t io, const char *o, uint64_t *cookie, + rados_watchcb2_t watchcb, + rados_watcherrcb_t watcherrcb, + uint32_t timeout, + void *arg); + +/** + * Asynchronously register an interest in an object + * + * A watch operation registers the client as being interested in + * notifications on an object. OSDs keep track of watches on + * persistent storage, so they are preserved across cluster changes by + * the normal recovery process. 
If the client loses its connection to + * the primary OSD for a watched object, the watch will be removed + * after 30 seconds. Watches are automatically reestablished when a new + * connection is made, or a placement group switches OSDs. + * + * @param io the pool the object is in + * @param o the object to watch + * @param completion what to do when operation has been attempted + * @param handle where to store the internal id assigned to this watch + * @param watchcb what to do when a notify is received on this object + * @param watcherrcb what to do when the watch session encounters an error + * @param arg opaque value to pass to the callback + * @returns 0 on success, negative error code on failure + */ +CEPH_RADOS_API int rados_aio_watch(rados_ioctx_t io, const char *o, + rados_completion_t completion, uint64_t *handle, + rados_watchcb2_t watchcb, + rados_watcherrcb_t watcherrcb, + void *arg); + +/** + * Asynchronously register an interest in an object + * + * A watch operation registers the client as being interested in + * notifications on an object. OSDs keep track of watches on + * persistent storage, so they are preserved across cluster changes by + * the normal recovery process. If the client loses its connection to + * the primary OSD for a watched object, the watch will be removed + * after the number of seconds configured in the timeout parameter. + * Watches are automatically reestablished when a new + * connection is made, or a placement group switches OSDs. 
+ * + * @param io the pool the object is in + * @param o the object to watch + * @param completion what to do when operation has been attempted + * @param handle where to store the internal id assigned to this watch + * @param watchcb what to do when a notify is received on this object + * @param watcherrcb what to do when the watch session encounters an error + * @param timeout how many seconds the connection will keep after disconnection + * @param arg opaque value to pass to the callback + * @returns 0 on success, negative error code on failure + */ +CEPH_RADOS_API int rados_aio_watch2(rados_ioctx_t io, const char *o, + rados_completion_t completion, uint64_t *handle, + rados_watchcb2_t watchcb, + rados_watcherrcb_t watcherrcb, + uint32_t timeout, + void *arg); + +/** + * Check on the status of a watch + * + * Return the number of milliseconds since the watch was last confirmed. + * Or, if there has been an error, return that. + * + * If there is an error, the watch is no longer valid, and should be + * destroyed with rados_unwatch2(). If the user is still interested + * in the object, a new watch should be created with rados_watch2(). + * + * @param io the pool the object is in + * @param cookie the watch handle + * @returns ms since last confirmed on success, negative error code on failure + */ +CEPH_RADOS_API int rados_watch_check(rados_ioctx_t io, uint64_t cookie); + +/** + * Unregister an interest in an object + * + * Once this completes, no more notifies will be sent to us for this + * watch. This should be called to clean up unneeded watchers. 
+ * + * @param io the pool the object is in + * @param o the name of the watched object (ignored) + * @param cookie which watch to unregister + * @returns 0 on success, negative error code on failure + */ +CEPH_RADOS_API int rados_unwatch(rados_ioctx_t io, const char *o, uint64_t cookie) + __attribute__((deprecated)); + +/** + * Unregister an interest in an object + * + * Once this completes, no more notifies will be sent to us for this + * watch. This should be called to clean up unneeded watchers. + * + * @param io the pool the object is in + * @param cookie which watch to unregister + * @returns 0 on success, negative error code on failure + */ +CEPH_RADOS_API int rados_unwatch2(rados_ioctx_t io, uint64_t cookie); + +/** + * Asynchronously unregister an interest in an object + * + * Once this completes, no more notifies will be sent to us for this + * watch. This should be called to clean up unneeded watchers. + * + * @param io the pool the object is in + * @param completion what to do when operation has been attempted + * @param cookie which watch to unregister + * @returns 0 on success, negative error code on failure + */ +CEPH_RADOS_API int rados_aio_unwatch(rados_ioctx_t io, uint64_t cookie, + rados_completion_t completion); + +/** + * Synchronously notify watchers of an object + * + * This blocks until all watchers of the object have received and + * reacted to the notify, or a timeout is reached. 
+ * + * @note BUG: the timeout is not changeable via the C API + * @note BUG: the bufferlist is inaccessible in a rados_watchcb_t + * + * @param io the pool the object is in + * @param o the name of the object + * @param ver obsolete - just pass zero + * @param buf data to send to watchers + * @param buf_len length of buf in bytes + * @returns 0 on success, negative error code on failure + */ +CEPH_RADOS_API int rados_notify(rados_ioctx_t io, const char *o, uint64_t ver, + const char *buf, int buf_len) + __attribute__((deprecated)); + +/** + * Synchronously notify watchers of an object + * + * This blocks until all watchers of the object have received and + * reacted to the notify, or a timeout is reached. + * + * The reply buffer is optional. If specified, the client will get + * back an encoded buffer that includes the ids of the clients that + * acknowledged the notify as well as their notify ack payloads (if + * any). Clients that timed out are not included. Even clients that + * do not include a notify ack payload are included in the list but + * have a 0-length payload associated with them. The format: + * + * le32 num_acks + * { + * le64 gid global id for the client (for client.1234 that's 1234) + * le64 cookie cookie for the client + * le32 buflen length of reply message buffer + * u8 * buflen payload + * } * num_acks + * le32 num_timeouts + * { + * le64 gid global id for the client + * le64 cookie cookie for the client + * } * num_timeouts + * + * Note: There may be multiple instances of the same gid if there are + * multiple watchers registered via the same client. + * + * Note: The buffer must be released with rados_buffer_free() when the + * user is done with it. + * + * Note: Since the result buffer includes clients that time out, it + * will be set even when rados_notify() returns an error code (like + * -ETIMEDOUT). 
+ * + * @param io the pool the object is in + * @param completion what to do when operation has been attempted + * @param o the name of the object + * @param buf data to send to watchers + * @param buf_len length of buf in bytes + * @param timeout_ms notify timeout (in ms) + * @param reply_buffer pointer to reply buffer pointer (free with rados_buffer_free) + * @param reply_buffer_len pointer to size of reply buffer + * @returns 0 on success, negative error code on failure + */ +CEPH_RADOS_API int rados_aio_notify(rados_ioctx_t io, const char *o, + rados_completion_t completion, + const char *buf, int buf_len, + uint64_t timeout_ms, char **reply_buffer, + size_t *reply_buffer_len); +CEPH_RADOS_API int rados_notify2(rados_ioctx_t io, const char *o, + const char *buf, int buf_len, + uint64_t timeout_ms, + char **reply_buffer, size_t *reply_buffer_len); + +/** + * Decode a notify response + * + * Decode a notify response (from rados_aio_notify() call) into acks and + * timeout arrays. + * + * @param reply_buffer buffer from rados_aio_notify() call + * @param reply_buffer_len reply_buffer length + * @param acks pointer to struct notify_ack_t pointer + * @param nr_acks pointer to ack count + * @param timeouts pointer to notify_timeout_t pointer + * @param nr_timeouts pointer to timeout count + * @returns 0 on success + */ +CEPH_RADOS_API int rados_decode_notify_response(char *reply_buffer, size_t reply_buffer_len, + struct notify_ack_t **acks, size_t *nr_acks, + struct notify_timeout_t **timeouts, size_t *nr_timeouts); + +/** + * Free notify allocated buffer + * + * Release memory allocated by rados_decode_notify_response() call + * + * @param acks notify_ack_t struct (from rados_decode_notify_response()) + * @param nr_acks ack count + * @param timeouts notify_timeout_t struct (from rados_decode_notify_response()) + */ +CEPH_RADOS_API void rados_free_notify_response(struct notify_ack_t *acks, size_t nr_acks, + struct notify_timeout_t *timeouts); + +/** + * Acknowledge 
receipt of a notify + * + * @param io the pool the object is in + * @param o the name of the object + * @param notify_id the notify_id we got on the watchcb2_t callback + * @param cookie the watcher handle + * @param buf payload to return to notifier (optional) + * @param buf_len payload length + * @returns 0 on success + */ +CEPH_RADOS_API int rados_notify_ack(rados_ioctx_t io, const char *o, + uint64_t notify_id, uint64_t cookie, + const char *buf, int buf_len); + +/** + * Flush watch/notify callbacks + * + * This call will block until all pending watch/notify callbacks have + * been executed and the queue is empty. It should usually be called + * after shutting down any watches before shutting down the ioctx or + * librados to ensure that any callbacks do not misuse the ioctx (for + * example by calling rados_notify_ack after the ioctx has been + * destroyed). + * + * @param cluster the cluster handle + */ +CEPH_RADOS_API int rados_watch_flush(rados_t cluster); +/** + * Flush watch/notify callbacks + * + * This call is non-blocking, and the completion will be called + * once all pending watch/notify callbacks have been executed and + * the queue is empty. It should usually be called after shutting + * down any watches before shutting down the ioctx or + * librados to ensure that any callbacks do not misuse the ioctx (for + * example by calling rados_notify_ack after the ioctx has been + * destroyed). + * + * @param cluster the cluster handle + * @param completion what to do when operation has been attempted + */ +CEPH_RADOS_API int rados_aio_watch_flush(rados_t cluster, rados_completion_t completion); + +/** @} Watch/Notify */ + +/** + * Pin an object in the cache tier + * + * When an object is pinned in the cache tier, it stays in the cache + * tier, and won't be flushed out. 
+ * + * @param io the pool the object is in + * @param o the object id + * @returns 0 on success, negative error code on failure + */ +CEPH_RADOS_API int rados_cache_pin(rados_ioctx_t io, const char *o); + +/** + * Unpin an object in the cache tier + * + * After an object is unpinned in the cache tier, it can be flushed out + * + * @param io the pool the object is in + * @param o the object id + * @returns 0 on success, negative error code on failure + */ +CEPH_RADOS_API int rados_cache_unpin(rados_ioctx_t io, const char *o); + +/** + * @name Hints + * + * @{ + */ + +/** + * Set allocation hint for an object + * + * This is an advisory operation, it will always succeed (as if it was + * submitted with a LIBRADOS_OP_FLAG_FAILOK flag set) and is not + * guaranteed to do anything on the backend. + * + * @param io the pool the object is in + * @param o the name of the object + * @param expected_object_size expected size of the object, in bytes + * @param expected_write_size expected size of writes to the object, in bytes + * @returns 0 on success, negative error code on failure + */ +CEPH_RADOS_API int rados_set_alloc_hint(rados_ioctx_t io, const char *o, + uint64_t expected_object_size, + uint64_t expected_write_size); + +/** + * Set allocation hint for an object + * + * This is an advisory operation, it will always succeed (as if it was + * submitted with a LIBRADOS_OP_FLAG_FAILOK flag set) and is not + * guaranteed to do anything on the backend. 
+ * + * @param io the pool the object is in + * @param o the name of the object + * @param expected_object_size expected size of the object, in bytes + * @param expected_write_size expected size of writes to the object, in bytes + * @param flags hints about future IO patterns + * @returns 0 on success, negative error code on failure + */ +CEPH_RADOS_API int rados_set_alloc_hint2(rados_ioctx_t io, const char *o, + uint64_t expected_object_size, + uint64_t expected_write_size, + uint32_t flags); + +/** @} Hints */ + +/** + * @name Object Operations + * + * A single rados operation can do multiple operations on one object + * atomically. The whole operation will succeed or fail, and no partial + * results will be visible. + * + * Operations may be either reads, which can return data, or writes, + * which cannot. The effects of writes are applied and visible all at + * once, so an operation that sets an xattr and then checks its value + * will not see the updated value. + * + * @{ + */ + +/** + * Create a new rados_write_op_t write operation. This will store all actions + * to be performed atomically. You must call rados_release_write_op when you are + * finished with it. + * + * @note the ownership of a write operation is passed to the function + * performing the operation, so the same instance of @c rados_write_op_t + * cannot be used again after being performed. + * + * @returns non-NULL on success, NULL on memory allocation error. + */ +CEPH_RADOS_API rados_write_op_t rados_create_write_op(void); + +/** + * Free a rados_write_op_t, must be called when you're done with it. + * @param write_op operation to deallocate, created with rados_create_write_op + */ +CEPH_RADOS_API void rados_release_write_op(rados_write_op_t write_op); + +/** + * Set flags for the last operation added to this write_op. + * At least one op must have been added to the write_op. 
+ * @param write_op operation to add this action to + * @param flags see librados.h constants beginning with LIBRADOS_OP_FLAG + */ +CEPH_RADOS_API void rados_write_op_set_flags(rados_write_op_t write_op, + int flags); + +/** + * Ensure that the object exists before writing + * @param write_op operation to add this action to + */ +CEPH_RADOS_API void rados_write_op_assert_exists(rados_write_op_t write_op); + +/** + * Ensure that the object exists and that its internal version + * number is equal to "ver" before writing. "ver" should be a + * version number previously obtained with rados_get_last_version(). + * - If the object's version is greater than the asserted version + * then rados_write_op_operate will return -ERANGE instead of + * executing the op. + * - If the object's version is less than the asserted version + * then rados_write_op_operate will return -EOVERFLOW instead + * of executing the op. + * @param write_op operation to add this action to + * @param ver object version number + */ +CEPH_RADOS_API void rados_write_op_assert_version(rados_write_op_t write_op, uint64_t ver); + +/** + * Ensure that given object range (extent) satisfies comparison. + * + * @param write_op operation to add this action to + * @param cmp_buf buffer containing bytes to be compared with object contents + * @param cmp_len length to compare and size of @c cmp_buf in bytes + * @param off object byte offset at which to start the comparison + * @param prval returned result of comparison, 0 on success, negative error code + * on failure, (-MAX_ERRNO - mismatch_off) on mismatch + */ +CEPH_RADOS_API void rados_write_op_cmpext(rados_write_op_t write_op, + const char *cmp_buf, + size_t cmp_len, + uint64_t off, + int *prval); + +/** + * Ensure that given xattr satisfies comparison. 
+ * If the comparison is not satisfied, the return code of the + * operation will be -ECANCELED + * @param write_op operation to add this action to + * @param name name of the xattr to look up + * @param comparison_operator currently undocumented, look for + * LIBRADOS_CMPXATTR_OP_EQ in librados.h + * @param value buffer to compare actual xattr value to + * @param value_len length of buffer to compare actual xattr value to + */ +CEPH_RADOS_API void rados_write_op_cmpxattr(rados_write_op_t write_op, + const char *name, + uint8_t comparison_operator, + const char *value, + size_t value_len); + +/** + * Ensure that an omap value satisfies a comparison, + * with the supplied value on the right hand side (i.e. + * for OP_LT, the comparison is actual_value < value). + * + * @param write_op operation to add this action to + * @param key which omap value to compare + * @param comparison_operator one of LIBRADOS_CMPXATTR_OP_EQ, + LIBRADOS_CMPXATTR_OP_LT, or LIBRADOS_CMPXATTR_OP_GT + * @param val value to compare with + * @param val_len length of value in bytes + * @param prval where to store the return value from this action + */ +CEPH_RADOS_API void rados_write_op_omap_cmp(rados_write_op_t write_op, + const char *key, + uint8_t comparison_operator, + const char *val, + size_t val_len, + int *prval); + +/** + * Ensure that an omap value satisfies a comparison, + * with the supplied value on the right hand side (i.e. + * for OP_LT, the comparison is actual_value < value). 
+ * + * @param write_op operation to add this action to + * @param key which omap value to compare + * @param comparison_operator one of LIBRADOS_CMPXATTR_OP_EQ, + LIBRADOS_CMPXATTR_OP_LT, or LIBRADOS_CMPXATTR_OP_GT + * @param val value to compare with + * @param key_len length of key in bytes + * @param val_len length of value in bytes + * @param prval where to store the return value from this action + */ +CEPH_RADOS_API void rados_write_op_omap_cmp2(rados_write_op_t write_op, + const char *key, + uint8_t comparison_operator, + const char *val, + size_t key_len, + size_t val_len, + int *prval); + +/** + * Set an xattr + * @param write_op operation to add this action to + * @param name name of the xattr + * @param value buffer to set xattr to + * @param value_len length of buffer to set xattr to + */ +CEPH_RADOS_API void rados_write_op_setxattr(rados_write_op_t write_op, + const char *name, + const char *value, + size_t value_len); + +/** + * Remove an xattr + * @param write_op operation to add this action to + * @param name name of the xattr to remove + */ +CEPH_RADOS_API void rados_write_op_rmxattr(rados_write_op_t write_op, + const char *name); + +/** + * Create the object + * @param write_op operation to add this action to + * @param exclusive set to either LIBRADOS_CREATE_EXCLUSIVE or + LIBRADOS_CREATE_IDEMPOTENT + * will error if the object already exists. + * @param category category string (DEPRECATED, HAS NO EFFECT) + */ +CEPH_RADOS_API void rados_write_op_create(rados_write_op_t write_op, + int exclusive, + const char* category); + +/** + * Write to offset + * @param write_op operation to add this action to + * @param offset offset to write to + * @param buffer bytes to write + * @param len length of buffer + */ +CEPH_RADOS_API void rados_write_op_write(rados_write_op_t write_op, + const char *buffer, + size_t len, + uint64_t offset); + +/** + * Write whole object, atomically replacing it. 
+ * @param write_op operation to add this action to + * @param buffer bytes to write + * @param len length of buffer + */ +CEPH_RADOS_API void rados_write_op_write_full(rados_write_op_t write_op, + const char *buffer, + size_t len); + +/** + * Write the same buffer multiple times + * @param write_op operation to add this action to + * @param buffer bytes to write + * @param data_len length of buffer + * @param write_len total number of bytes to write, as a multiple of @c data_len + * @param offset offset to write to + */ +CEPH_RADOS_API void rados_write_op_writesame(rados_write_op_t write_op, + const char *buffer, + size_t data_len, + size_t write_len, + uint64_t offset); + +/** + * Append to end of object. + * @param write_op operation to add this action to + * @param buffer bytes to write + * @param len length of buffer + */ +CEPH_RADOS_API void rados_write_op_append(rados_write_op_t write_op, + const char *buffer, + size_t len); +/** + * Remove object + * @param write_op operation to add this action to + */ +CEPH_RADOS_API void rados_write_op_remove(rados_write_op_t write_op); + +/** + * Truncate an object + * @param write_op operation to add this action to + * @param offset Offset to truncate to + */ +CEPH_RADOS_API void rados_write_op_truncate(rados_write_op_t write_op, + uint64_t offset); + +/** + * Zero part of an object + * @param write_op operation to add this action to + * @param offset Offset to zero + * @param len length to zero + */ +CEPH_RADOS_API void rados_write_op_zero(rados_write_op_t write_op, + uint64_t offset, + uint64_t len); + +/** + * Execute an OSD class method on an object + * See rados_exec() for general description. 
+ * + * @param write_op operation to add this action to + * @param cls the name of the class + * @param method the name of the method + * @param in_buf where to find input + * @param in_len length of in_buf in bytes + * @param prval where to store the return value from the method + */ +CEPH_RADOS_API void rados_write_op_exec(rados_write_op_t write_op, + const char *cls, + const char *method, + const char *in_buf, + size_t in_len, + int *prval); + +/** + * Set key/value pairs on an object + * + * @param write_op operation to add this action to + * @param keys array of null-terminated char arrays representing keys to set + * @param vals array of pointers to values to set + * @param lens array of lengths corresponding to each value + * @param num number of key/value pairs to set + */ +CEPH_RADOS_API void rados_write_op_omap_set(rados_write_op_t write_op, + char const* const* keys, + char const* const* vals, + const size_t *lens, + size_t num); + +/** + * Set key/value pairs on an object + * + * @param write_op operation to add this action to + * @param keys array of null-terminated char arrays representing keys to set + * @param vals array of pointers to values to set + * @param key_lens array of lengths corresponding to each key + * @param val_lens array of lengths corresponding to each value + * @param num number of key/value pairs to set + */ +CEPH_RADOS_API void rados_write_op_omap_set2(rados_write_op_t write_op, + char const* const* keys, + char const* const* vals, + const size_t *key_lens, + const size_t *val_lens, + size_t num); + +/** + * Remove key/value pairs from an object + * + * @param write_op operation to add this action to + * @param keys array of null-terminated char arrays representing keys to remove + * @param keys_len number of key/value pairs to remove + */ +CEPH_RADOS_API void rados_write_op_omap_rm_keys(rados_write_op_t write_op, + char const* const* keys, + size_t keys_len); + +/** + * Remove key/value pairs from an object + * + * @param 
write_op operation to add this action to + * @param keys array of char arrays representing keys to remove + * @param key_lens array of size_t values representing length of each key + * @param keys_len number of key/value pairs to remove + */ +CEPH_RADOS_API void rados_write_op_omap_rm_keys2(rados_write_op_t write_op, + char const* const* keys, + const size_t* key_lens, + size_t keys_len); + + +/** + * Remove key/value pairs from an object whose keys are in the range + * [key_begin, key_end) + * + * @param write_op operation to add this action to + * @param key_begin the lower bound of the key range to remove + * @param key_begin_len length of key_begin + * @param key_end the upper bound of the key range to remove + * @param key_end_len length of key_end + */ +CEPH_RADOS_API void rados_write_op_omap_rm_range2(rados_write_op_t write_op, + const char *key_begin, + size_t key_begin_len, + const char *key_end, + size_t key_end_len); + +/** + * Remove all key/value pairs from an object + * + * @param write_op operation to add this action to + */ +CEPH_RADOS_API void rados_write_op_omap_clear(rados_write_op_t write_op); + +/** + * Set allocation hint for an object + * + * @param write_op operation to add this action to + * @param expected_object_size expected size of the object, in bytes + * @param expected_write_size expected size of writes to the object, in bytes + */ +CEPH_RADOS_API void rados_write_op_set_alloc_hint(rados_write_op_t write_op, + uint64_t expected_object_size, + uint64_t expected_write_size); + +/** + * Set allocation hint for an object + * + * @param write_op operation to add this action to + * @param expected_object_size expected size of the object, in bytes + * @param expected_write_size expected size of writes to the object, in bytes + * @param flags hints about future IO patterns + */ +CEPH_RADOS_API void rados_write_op_set_alloc_hint2(rados_write_op_t write_op, + uint64_t expected_object_size, + uint64_t expected_write_size, + uint32_t flags); + 
+/** + * Perform a write operation synchronously + * @param write_op operation to perform + * @param io the ioctx that the object is in + * @param oid the object id + * @param mtime the time to set the mtime to, NULL for the current time + * @param flags flags to apply to the entire operation (LIBRADOS_OPERATION_*) + */ +CEPH_RADOS_API int rados_write_op_operate(rados_write_op_t write_op, + rados_ioctx_t io, + const char *oid, + time_t *mtime, + int flags); +/** + * Perform a write operation synchronously + * @param write_op operation to perform + * @param io the ioctx that the object is in + * @param oid the object id + * @param mtime the time to set the mtime to, NULL for the current time + * @param flags flags to apply to the entire operation (LIBRADOS_OPERATION_*) + */ + +CEPH_RADOS_API int rados_write_op_operate2(rados_write_op_t write_op, + rados_ioctx_t io, + const char *oid, + struct timespec *mtime, + int flags); + +/** + * Perform a write operation asynchronously + * @param write_op operation to perform + * @param io the ioctx that the object is in + * @param completion what to do when operation has been attempted + * @param oid the object id + * @param mtime the time to set the mtime to, NULL for the current time + * @param flags flags to apply to the entire operation (LIBRADOS_OPERATION_*) + */ +CEPH_RADOS_API int rados_aio_write_op_operate(rados_write_op_t write_op, + rados_ioctx_t io, + rados_completion_t completion, + const char *oid, + time_t *mtime, + int flags); + +/** + * Perform a write operation asynchronously + * @param write_op operation to perform + * @param io the ioctx that the object is in + * @param completion what to do when operation has been attempted + * @param oid the object id + * @param mtime the time to set the mtime to, NULL for the current time + * @param flags flags to apply to the entire operation (LIBRADOS_OPERATION_*) + */ +CEPH_RADOS_API int rados_aio_write_op_operate2(rados_write_op_t write_op, + rados_ioctx_t io, + 
rados_completion_t completion, + const char *oid, + struct timespec *mtime, + int flags); + +/** + * Create a new rados_read_op_t read operation. This will store all + * actions to be performed atomically. You must call + * rados_release_read_op when you are finished with it (after it + * completes, or you decide not to send it in the first place). + * + * @note the ownership of a read operation is passed to the function + * performing the operation, so the same instance of @c rados_read_op_t + * cannot be used again after being performed. + * + * @returns non-NULL on success, NULL on memory allocation error. + */ +CEPH_RADOS_API rados_read_op_t rados_create_read_op(void); + +/** + * Free a rados_read_op_t, must be called when you're done with it. + * @param read_op operation to deallocate, created with rados_create_read_op + */ +CEPH_RADOS_API void rados_release_read_op(rados_read_op_t read_op); + +/** + * Set flags for the last operation added to this read_op. + * At least one op must have been added to the read_op. + * @param read_op operation to add this action to + * @param flags see librados.h constants beginning with LIBRADOS_OP_FLAG + */ +CEPH_RADOS_API void rados_read_op_set_flags(rados_read_op_t read_op, int flags); + +/** + * Ensure that the object exists before reading + * @param read_op operation to add this action to + */ +CEPH_RADOS_API void rados_read_op_assert_exists(rados_read_op_t read_op); + +/** + * Ensure that the object exists and that its internal version + * number is equal to "ver" before reading. "ver" should be a + * version number previously obtained with rados_get_last_version(). + * - If the object's version is greater than the asserted version + * then rados_read_op_operate will return -ERANGE instead of + * executing the op. + * - If the object's version is less than the asserted version + * then rados_read_op_operate will return -EOVERFLOW instead + * of executing the op.
+ * @param read_op operation to add this action to + * @param ver object version number + */ +CEPH_RADOS_API void rados_read_op_assert_version(rados_read_op_t read_op, uint64_t ver); + +/** + * Ensure that the given object range (extent) satisfies a comparison. + * + * @param read_op operation to add this action to + * @param cmp_buf buffer containing bytes to be compared with object contents + * @param cmp_len length to compare and size of @c cmp_buf in bytes + * @param off object byte offset at which to start the comparison + * @param prval returned result of comparison, 0 on success, negative error code + * on failure, (-MAX_ERRNO - mismatch_off) on mismatch + */ +CEPH_RADOS_API void rados_read_op_cmpext(rados_read_op_t read_op, + const char *cmp_buf, + size_t cmp_len, + uint64_t off, + int *prval); + +/** + * Ensure that an xattr satisfies a comparison + * If the comparison is not satisfied, the return code of the + * operation will be -ECANCELED + * @param read_op operation to add this action to + * @param name name of the xattr to look up + * @param comparison_operator currently undocumented, look for + * LIBRADOS_CMPXATTR_OP_EQ in librados.h + * @param value buffer to compare actual xattr value to + * @param value_len length of buffer to compare actual xattr value to + */ +CEPH_RADOS_API void rados_read_op_cmpxattr(rados_read_op_t read_op, + const char *name, + uint8_t comparison_operator, + const char *value, + size_t value_len); + +/** + * Start iterating over xattrs on an object. + * + * @param read_op operation to add this action to + * @param iter where to store the iterator + * @param prval where to store the return value of this action + */ +CEPH_RADOS_API void rados_read_op_getxattrs(rados_read_op_t read_op, + rados_xattrs_iter_t *iter, + int *prval); + +/** + * Ensure that an omap value satisfies a comparison, + * with the supplied value on the right hand side (i.e. + * for OP_LT, the comparison is actual_value < value).
+ * + * @param read_op operation to add this action to + * @param key which omap value to compare + * @param comparison_operator one of LIBRADOS_CMPXATTR_OP_EQ, + LIBRADOS_CMPXATTR_OP_LT, or LIBRADOS_CMPXATTR_OP_GT + * @param val value to compare with + * @param val_len length of value in bytes + * @param prval where to store the return value from this action + */ +CEPH_RADOS_API void rados_read_op_omap_cmp(rados_read_op_t read_op, + const char *key, + uint8_t comparison_operator, + const char *val, + size_t val_len, + int *prval); + +/** + * Ensure that an omap value satisfies a comparison, + * with the supplied value on the right hand side (i.e. + * for OP_LT, the comparison is actual_value < value). + * + * @param read_op operation to add this action to + * @param key which omap value to compare + * @param comparison_operator one of LIBRADOS_CMPXATTR_OP_EQ, + LIBRADOS_CMPXATTR_OP_LT, or LIBRADOS_CMPXATTR_OP_GT + * @param val value to compare with + * @param key_len length of key in bytes + * @param val_len length of value in bytes + * @param prval where to store the return value from this action + */ +CEPH_RADOS_API void rados_read_op_omap_cmp2(rados_read_op_t read_op, + const char *key, + uint8_t comparison_operator, + const char *val, + size_t key_len, + size_t val_len, + int *prval); + +/** + * Get object size and mtime + * @param read_op operation to add this action to + * @param psize where to store object size + * @param pmtime where to store modification time + * @param prval where to store the return value of this action + */ +CEPH_RADOS_API void rados_read_op_stat(rados_read_op_t read_op, + uint64_t *psize, + time_t *pmtime, + int *prval); + +CEPH_RADOS_API void rados_read_op_stat2(rados_read_op_t read_op, + uint64_t *psize, + struct timespec *pmtime, + int *prval); +/** + * Read bytes from offset into buffer. + * + * bytes_read will be filled with the number of bytes read if successful.
+ * A short read can only occur if the read reaches the end of the + * object. + * + * @param read_op operation to add this action to + * @param offset offset to read from + * @param len length of buffer + * @param buffer where to put the data + * @param bytes_read where to store the number of bytes read by this action + * @param prval where to store the return value of this action + */ +CEPH_RADOS_API void rados_read_op_read(rados_read_op_t read_op, + uint64_t offset, + size_t len, + char *buffer, + size_t *bytes_read, + int *prval); + +/** + * Compute checksum from object data + * + * @param read_op operation to add this action to + * @param type the checksum algorithm to utilize + * @param init_value the init value for the algorithm + * @param init_value_len the length of the init value + * @param offset the offset to start checksumming in the object + * @param len the number of bytes to checksum + * @param chunk_size optional length-aligned chunk size for checksums + * @param pchecksum where to store the checksum result for this action + * @param checksum_len the number of bytes available for the result + * @param prval where to store the return value for this action + */ +CEPH_RADOS_API void rados_read_op_checksum(rados_read_op_t read_op, + rados_checksum_type_t type, + const char *init_value, + size_t init_value_len, + uint64_t offset, size_t len, + size_t chunk_size, char *pchecksum, + size_t checksum_len, int *prval); + +/** + * Execute an OSD class method on an object + * See rados_exec() for general description. + * + * The output buffer is allocated on the heap; the caller is + * expected to release that memory with rados_buffer_free(). The + * buffer and length pointers can all be NULL, in which case they are + * not filled in. 
+ * + * @param read_op operation to add this action to + * @param cls the name of the class + * @param method the name of the method + * @param in_buf where to find input + * @param in_len length of in_buf in bytes + * @param out_buf where to put librados-allocated output buffer + * @param out_len length of out_buf in bytes + * @param prval where to store the return value from the method + */ +CEPH_RADOS_API void rados_read_op_exec(rados_read_op_t read_op, + const char *cls, + const char *method, + const char *in_buf, + size_t in_len, + char **out_buf, + size_t *out_len, + int *prval); + +/** + * Execute an OSD class method on an object + * See rados_exec() for general description. + * + * If the output buffer is too small, prval will + * be set to -ERANGE and used_len will be 0. + * + * @param read_op operation to add this action to + * @param cls the name of the class + * @param method the name of the method + * @param in_buf where to find input + * @param in_len length of in_buf in bytes + * @param out_buf user-provided buffer to read into + * @param out_len length of out_buf in bytes + * @param used_len where to store the number of bytes read into out_buf + * @param prval where to store the return value from the method + */ +CEPH_RADOS_API void rados_read_op_exec_user_buf(rados_read_op_t read_op, + const char *cls, + const char *method, + const char *in_buf, + size_t in_len, + char *out_buf, + size_t out_len, + size_t *used_len, + int *prval); + +/** + * Start iterating over key/value pairs on an object. + * + * They will be returned sorted by key. 
+ * + * @param read_op operation to add this action to + * @param start_after list keys starting after start_after + * @param filter_prefix list only keys beginning with filter_prefix + * @param max_return list no more than max_return key/value pairs + * @param iter where to store the iterator + * @param prval where to store the return value from this action + */ +CEPH_RADOS_API void rados_read_op_omap_get_vals(rados_read_op_t read_op, + const char *start_after, + const char *filter_prefix, + uint64_t max_return, + rados_omap_iter_t *iter, + int *prval) + __attribute__((deprecated)); /* use v2 below */ + +/** + * Start iterating over key/value pairs on an object. + * + * They will be returned sorted by key. + * + * @param read_op operation to add this action to + * @param start_after list keys starting after start_after + * @param filter_prefix list only keys beginning with filter_prefix + * @param max_return list no more than max_return key/value pairs + * @param iter where to store the iterator + * @param pmore flag indicating whether there are more keys to fetch + * @param prval where to store the return value from this action + */ +CEPH_RADOS_API void rados_read_op_omap_get_vals2(rados_read_op_t read_op, + const char *start_after, + const char *filter_prefix, + uint64_t max_return, + rados_omap_iter_t *iter, + unsigned char *pmore, + int *prval); + +/** + * Start iterating over keys on an object. + * + * They will be returned sorted by key, and the iterator + * will fill in NULL for all values if specified. 
+ * + * @param read_op operation to add this action to + * @param start_after list keys starting after start_after + * @param max_return list no more than max_return keys + * @param iter where to store the iterator + * @param prval where to store the return value from this action + */ +CEPH_RADOS_API void rados_read_op_omap_get_keys(rados_read_op_t read_op, + const char *start_after, + uint64_t max_return, + rados_omap_iter_t *iter, + int *prval) + __attribute__((deprecated)); /* use v2 below */ + +/** + * Start iterating over keys on an object. + * + * They will be returned sorted by key, and the iterator + * will fill in NULL for all values if specified. + * + * @param read_op operation to add this action to + * @param start_after list keys starting after start_after + * @param max_return list no more than max_return keys + * @param iter where to store the iterator + * @param pmore flag indicating whether there are more keys to fetch + * @param prval where to store the return value from this action + */ +CEPH_RADOS_API void rados_read_op_omap_get_keys2(rados_read_op_t read_op, + const char *start_after, + uint64_t max_return, + rados_omap_iter_t *iter, + unsigned char *pmore, + int *prval); + +/** + * Start iterating over specific key/value pairs + * + * They will be returned sorted by key. + * + * @param read_op operation to add this action to + * @param keys array of pointers to null-terminated keys to get + * @param keys_len the number of strings in keys + * @param iter where to store the iterator + * @param prval where to store the return value from this action + */ +CEPH_RADOS_API void rados_read_op_omap_get_vals_by_keys(rados_read_op_t read_op, + char const* const* keys, + size_t keys_len, + rados_omap_iter_t *iter, + int *prval); + +/** + * Start iterating over specific key/value pairs + * + * They will be returned sorted by key. 
+ * + * @param read_op operation to add this action to + * @param keys array of pointers to keys to get + * @param num_keys the number of strings in keys + * @param key_lens array of size_t's describing each key len (in bytes) + * @param iter where to store the iterator + * @param prval where to store the return value from this action + */ +CEPH_RADOS_API void rados_read_op_omap_get_vals_by_keys2(rados_read_op_t read_op, + char const* const* keys, + size_t num_keys, + const size_t* key_lens, + rados_omap_iter_t *iter, + int *prval); + +/** + * Perform a read operation synchronously + * @param read_op operation to perform + * @param io the ioctx that the object is in + * @param oid the object id + * @param flags flags to apply to the entire operation (LIBRADOS_OPERATION_*) + */ +CEPH_RADOS_API int rados_read_op_operate(rados_read_op_t read_op, + rados_ioctx_t io, + const char *oid, + int flags); + +/** + * Perform a read operation asynchronously + * @param read_op operation to perform + * @param io the ioctx that the object is in + * @param completion what to do when operation has been attempted + * @param oid the object id + * @param flags flags to apply to the entire operation (LIBRADOS_OPERATION_*) + */ +CEPH_RADOS_API int rados_aio_read_op_operate(rados_read_op_t read_op, + rados_ioctx_t io, + rados_completion_t completion, + const char *oid, + int flags); + +/** @} Object Operations */ + +/** + * Take an exclusive lock on an object. + * + * @param io the context to operate in + * @param oid the name of the object + * @param name the name of the lock + * @param cookie user-defined identifier for this instance of the lock + * @param desc user-defined lock description + * @param duration the duration of the lock. Set to NULL for infinite duration. 
+ * @param flags lock flags + * @returns 0 on success, negative error code on failure + * @returns -EBUSY if the lock is already held by another (client, cookie) pair + * @returns -EEXIST if the lock is already held by the same (client, cookie) pair + */ +CEPH_RADOS_API int rados_lock_exclusive(rados_ioctx_t io, const char * oid, + const char * name, const char * cookie, + const char * desc, + struct timeval * duration, + uint8_t flags); + +/** + * Take a shared lock on an object. + * + * @param io the context to operate in + * @param o the name of the object + * @param name the name of the lock + * @param cookie user-defined identifier for this instance of the lock + * @param tag the tag of the lock + * @param desc user-defined lock description + * @param duration the duration of the lock. Set to NULL for infinite duration. + * @param flags lock flags + * @returns 0 on success, negative error code on failure + * @returns -EBUSY if the lock is already held by another (client, cookie) pair + * @returns -EEXIST if the lock is already held by the same (client, cookie) pair + */ +CEPH_RADOS_API int rados_lock_shared(rados_ioctx_t io, const char * o, + const char * name, const char * cookie, + const char * tag, const char * desc, + struct timeval * duration, uint8_t flags); + +/** + * Release a shared or exclusive lock on an object. + * + * @param io the context to operate in + * @param o the name of the object + * @param name the name of the lock + * @param cookie user-defined identifier for the instance of the lock + * @returns 0 on success, negative error code on failure + * @returns -ENOENT if the lock is not held by the specified (client, cookie) pair + */ +CEPH_RADOS_API int rados_unlock(rados_ioctx_t io, const char *o, + const char *name, const char *cookie); + +/** + * Asynchronously release a shared or exclusive lock on an object.
+ * + * @param io the context to operate in + * @param o the name of the object + * @param name the name of the lock + * @param cookie user-defined identifier for the instance of the lock + * @param completion what to do when operation has been attempted + * @returns 0 on success, negative error code on failure + */ +CEPH_RADOS_API int rados_aio_unlock(rados_ioctx_t io, const char *o, + const char *name, const char *cookie, + rados_completion_t completion); + +/** + * List clients that have locked the named object lock and information about + * the lock. + * + * The number of bytes required in each buffer is put in the + * corresponding size out parameter. If any of the provided buffers + * are too short, -ERANGE is returned after these sizes are filled in. + * + * @param io the context to operate in + * @param o the name of the object + * @param name the name of the lock + * @param exclusive where to store whether the lock is exclusive (1) or shared (0) + * @param tag where to store the tag associated with the object lock + * @param tag_len number of bytes in tag buffer + * @param clients buffer in which locker clients are stored, separated by '\0' + * @param clients_len number of bytes in the clients buffer + * @param cookies buffer in which locker cookies are stored, separated by '\0' + * @param cookies_len number of bytes in the cookies buffer + * @param addrs buffer in which locker addresses are stored, separated by '\0' + * @param addrs_len number of bytes in the addrs buffer + * @returns number of lockers on success, negative error code on failure + * @returns -ERANGE if any of the buffers are too short + */ +CEPH_RADOS_API ssize_t rados_list_lockers(rados_ioctx_t io, const char *o, + const char *name, int *exclusive, + char *tag, size_t *tag_len, + char *clients, size_t *clients_len, + char *cookies, size_t *cookies_len, + char *addrs, size_t *addrs_len); + +/** + * Releases a shared or exclusive lock on an object, which was taken by the + * specified
client. + * + * @param io the context to operate in + * @param o the name of the object + * @param name the name of the lock + * @param client the client currently holding the lock + * @param cookie user-defined identifier for the instance of the lock + * @returns 0 on success, negative error code on failure + * @returns -ENOENT if the lock is not held by the specified (client, cookie) pair + * @returns -EINVAL if the client cannot be parsed + */ +CEPH_RADOS_API int rados_break_lock(rados_ioctx_t io, const char *o, + const char *name, const char *client, + const char *cookie); + +/** + * Blocklists the specified client from the OSDs + * + * @param cluster cluster handle + * @param client_address client address + * @param expire_seconds number of seconds to blocklist (0 for default) + * @returns 0 on success, negative error code on failure + */ +CEPH_RADOS_API int rados_blocklist_add(rados_t cluster, + char *client_address, + uint32_t expire_seconds); +CEPH_RADOS_API int rados_blacklist_add(rados_t cluster, + char *client_address, + uint32_t expire_seconds) + __attribute__((deprecated)); + +/** + * Gets addresses of the RADOS session, suitable for blocklisting. + * + * @param cluster cluster handle + * @param addrs the output string. 
+ * @returns 0 on success, negative error code on failure + */ +CEPH_RADOS_API int rados_getaddrs(rados_t cluster, char** addrs); + +CEPH_RADOS_API void rados_set_osdmap_full_try(rados_ioctx_t io) + __attribute__((deprecated)); + +CEPH_RADOS_API void rados_unset_osdmap_full_try(rados_ioctx_t io) + __attribute__((deprecated)); + +CEPH_RADOS_API void rados_set_pool_full_try(rados_ioctx_t io); + +CEPH_RADOS_API void rados_unset_pool_full_try(rados_ioctx_t io); + +/** + * Enable an application on a pool + * + * @param io pool ioctx + * @param app_name application name + * @param force 0 if only single application per pool + * @returns 0 on success, negative error code on failure + */ +CEPH_RADOS_API int rados_application_enable(rados_ioctx_t io, + const char *app_name, int force); + +/** + * List all enabled applications + * + * If the provided buffer is too short, the required length is filled in and + * -ERANGE is returned. Otherwise, the buffers are filled with the application + * names, with a '\0' after each. 
+ * + * @param io pool ioctx + * @param values buffer in which to store application names + * @param values_len number of bytes in values buffer + * @returns 0 on success, negative error code on failure + * @returns -ERANGE if the buffer is too short + */ +CEPH_RADOS_API int rados_application_list(rados_ioctx_t io, char *values, + size_t *values_len); + +/** + * Get application metadata value from pool + * + * @param io pool ioctx + * @param app_name application name + * @param key metadata key + * @param value result buffer + * @param value_len maximum len of value + * @returns 0 on success, negative error code on failure + */ +CEPH_RADOS_API int rados_application_metadata_get(rados_ioctx_t io, + const char *app_name, + const char *key, char *value, + size_t *value_len); + +/** + * Set application metadata on a pool + * + * @param io pool ioctx + * @param app_name application name + * @param key metadata key + * @param value metadata value + * @returns 0 on success, negative error code on failure + */ +CEPH_RADOS_API int rados_application_metadata_set(rados_ioctx_t io, + const char *app_name, + const char *key, + const char *value); + +/** + * Remove application metadata from a pool + * + * @param io pool ioctx + * @param app_name application name + * @param key metadata key + * @returns 0 on success, negative error code on failure + */ +CEPH_RADOS_API int rados_application_metadata_remove(rados_ioctx_t io, + const char *app_name, + const char *key); + +/** + * List all metadata key/value pairs associated with an application. + * + * This iterates over all metadata; key_len and vals_len are filled in + * with the number of bytes put into the keys and values buffers. + * + * If the provided buffers are too short, the required lengths are filled + * in and -ERANGE is returned. Otherwise, the buffers are filled with + * the keys and values of the metadata, with a '\0' after each.
+ * + * @param io pool ioctx + * @param app_name application name + * @param keys buffer in which to store key names + * @param key_len number of bytes in keys buffer + * @param values buffer in which to store values + * @param vals_len number of bytes in values buffer + * @returns 0 on success, negative error code on failure + * @returns -ERANGE if either buffer is too short + */ +CEPH_RADOS_API int rados_application_metadata_list(rados_ioctx_t io, + const char *app_name, + char *keys, size_t *key_len, + char *values, + size_t *vals_len); + +/** + * @name Mon/OSD/PG Commands + * + * These interfaces send commands relating to the monitor, OSD, or PGs. + * + * @{ + */ + +/** + * Send monitor command. + * + * @note Takes command string in carefully-formatted JSON; must match + * defined commands, types, etc. + * + * The result buffers are allocated on the heap; the caller is + * expected to release that memory with rados_buffer_free(). The + * buffer and length pointers can all be NULL, in which case they are + * not filled in. + * + * @param cluster cluster handle + * @param cmd an array of char *'s representing the command + * @param cmdlen count of valid entries in cmd + * @param inbuf any bulk input data (crush map, etc.) + * @param inbuflen input buffer length + * @param outbuf double pointer to output buffer + * @param outbuflen pointer to output buffer length + * @param outs double pointer to status string + * @param outslen pointer to status string length + * @returns 0 on success, negative error code on failure + */ +CEPH_RADOS_API int rados_mon_command(rados_t cluster, const char **cmd, + size_t cmdlen, const char *inbuf, + size_t inbuflen, char **outbuf, + size_t *outbuflen, char **outs, + size_t *outslen); + +/** + * Send ceph-mgr command. + * + * @note Takes command string in carefully-formatted JSON; must match + * defined commands, types, etc. 
+ * + * The result buffers are allocated on the heap; the caller is + * expected to release that memory with rados_buffer_free(). The + * buffer and length pointers can all be NULL, in which case they are + * not filled in. + * + * @param cluster cluster handle + * @param cmd an array of char *'s representing the command + * @param cmdlen count of valid entries in cmd + * @param inbuf any bulk input data (crush map, etc.) + * @param inbuflen input buffer length + * @param outbuf double pointer to output buffer + * @param outbuflen pointer to output buffer length + * @param outs double pointer to status string + * @param outslen pointer to status string length + * @returns 0 on success, negative error code on failure + */ +CEPH_RADOS_API int rados_mgr_command(rados_t cluster, const char **cmd, + size_t cmdlen, const char *inbuf, + size_t inbuflen, char **outbuf, + size_t *outbuflen, char **outs, + size_t *outslen); + +/** + * Send ceph-mgr tell command. + * + * @note Takes command string in carefully-formatted JSON; must match + * defined commands, types, etc. + * + * The result buffers are allocated on the heap; the caller is + * expected to release that memory with rados_buffer_free(). The + * buffer and length pointers can all be NULL, in which case they are + * not filled in. + * + * @param cluster cluster handle + * @param name mgr name to target + * @param cmd an array of char *'s representing the command + * @param cmdlen count of valid entries in cmd + * @param inbuf any bulk input data (crush map, etc.) 
+ * @param inbuflen input buffer length + * @param outbuf double pointer to output buffer + * @param outbuflen pointer to output buffer length + * @param outs double pointer to status string + * @param outslen pointer to status string length + * @returns 0 on success, negative error code on failure + */ +CEPH_RADOS_API int rados_mgr_command_target( + rados_t cluster, + const char *name, + const char **cmd, + size_t cmdlen, const char *inbuf, + size_t inbuflen, char **outbuf, + size_t *outbuflen, char **outs, + size_t *outslen); + +/** + * Send monitor command to a specific monitor. + * + * @note Takes command string in carefully-formatted JSON; must match + * defined commands, types, etc. + * + * The result buffers are allocated on the heap; the caller is + * expected to release that memory with rados_buffer_free(). The + * buffer and length pointers can all be NULL, in which case they are + * not filled in. + * + * @param cluster cluster handle + * @param name target monitor's name + * @param cmd an array of char *'s representing the command + * @param cmdlen count of valid entries in cmd + * @param inbuf any bulk input data (crush map, etc.) + * @param inbuflen input buffer length + * @param outbuf double pointer to output buffer + * @param outbuflen pointer to output buffer length + * @param outs double pointer to status string + * @param outslen pointer to status string length + * @returns 0 on success, negative error code on failure + */ +CEPH_RADOS_API int rados_mon_command_target(rados_t cluster, const char *name, + const char **cmd, size_t cmdlen, + const char *inbuf, size_t inbuflen, + char **outbuf, size_t *outbuflen, + char **outs, size_t *outslen); + +/** + * free a rados-allocated buffer + * + * Release memory allocated by librados calls like rados_mon_command(). 
+ * + * @param buf buffer pointer + */ +CEPH_RADOS_API void rados_buffer_free(char *buf); + +CEPH_RADOS_API int rados_osd_command(rados_t cluster, int osdid, + const char **cmd, size_t cmdlen, + const char *inbuf, size_t inbuflen, + char **outbuf, size_t *outbuflen, + char **outs, size_t *outslen); + +CEPH_RADOS_API int rados_pg_command(rados_t cluster, const char *pgstr, + const char **cmd, size_t cmdlen, + const char *inbuf, size_t inbuflen, + char **outbuf, size_t *outbuflen, + char **outs, size_t *outslen); + +/* + * This is not a doxygen comment lead-in, because doxygen breaks on + * a typedef with function params and returns, and I can't figure out + * how to fix it. + * + * Monitor cluster log + * + * Monitor events logged to the cluster log. The callback gets each + * log entry both as a single formatted line and with each field in a + * separate arg. + * + * Calling with a cb argument of NULL will deregister any previously + * registered callback. + * + * @param cluster cluster handle + * @param level minimum log level (debug, info, warn|warning, err|error) + * @param cb callback to run for each log message. It MUST NOT block + * nor call back into librados. + * @param arg void argument to pass to cb + * + * @returns 0 on success, negative code on error + */ +typedef void (*rados_log_callback_t)(void *arg, + const char *line, + const char *who, + uint64_t sec, uint64_t nsec, + uint64_t seq, const char *level, + const char *msg); + +/* + * This is not a doxygen comment lead-in, because doxygen breaks on + * a typedef with function params and returns, and I can't figure out + * how to fix it. + * + * Monitor cluster log + * + * Monitor events logged to the cluster log. The callback gets each + * log entry both as a single formatted line and with each field in a + * separate arg. + * + * Calling with a cb argument of NULL will deregister any previously + * registered callback.
+ * + * @param cluster cluster handle + * @param level minimum log level (debug, info, warn|warning, err|error) + * @param cb callback to run for each log message. It MUST NOT block + * nor call back into librados. + * @param arg void argument to pass to cb + * + * @returns 0 on success, negative code on error + */ +typedef void (*rados_log_callback2_t)(void *arg, + const char *line, + const char *channel, + const char *who, + const char *name, + uint64_t sec, uint64_t nsec, + uint64_t seq, const char *level, + const char *msg); + +CEPH_RADOS_API int rados_monitor_log(rados_t cluster, const char *level, + rados_log_callback_t cb, void *arg); +CEPH_RADOS_API int rados_monitor_log2(rados_t cluster, const char *level, + rados_log_callback2_t cb, void *arg); + + +/** + * register daemon instance for a service + * + * Register us as a daemon providing a particular service. We identify + * the service (e.g., 'rgw') and our instance name (e.g., 'rgw.$hostname'). + * The metadata is a map of keys and values with arbitrary static metadata + * for this instance. The encoding is a series of NUL-terminated strings, + * alternating key names and values, terminating with an empty key name. + * For example, "foo\0bar\0this\0that\0\0" is the dict {foo=bar,this=that}. + * + * For the lifetime of the librados instance, regular beacons will be sent + * to the cluster to maintain our registration in the service map. + * + * @param cluster handle + * @param service service name + * @param daemon daemon instance name + * @param metadata_dict static daemon metadata dict + */ +CEPH_RADOS_API int rados_service_register( + rados_t cluster, + const char *service, + const char *daemon, + const char *metadata_dict); + +/** + * update daemon status + * + * Update our mutable status information in the service map. + * + * The status dict is encoded the same way the daemon metadata is encoded + * for rados_service_register. For example, "foo\0bar\0this\0that\0\0" is + * {foo=bar,this=that}.
+ * + * @param cluster rados cluster handle + * @param status_dict status dict + */ +CEPH_RADOS_API int rados_service_update_status( + rados_t cluster, + const char *status_dict); + +/** @} Mon/OSD/PG commands */ + +/* + * These methods are no longer supported and return -ENOTSUP where possible. + */ +CEPH_RADOS_API int rados_objects_list_open( + rados_ioctx_t io, + rados_list_ctx_t *ctx) __attribute__((deprecated)); +CEPH_RADOS_API uint32_t rados_objects_list_get_pg_hash_position( + rados_list_ctx_t ctx) __attribute__((deprecated)); +CEPH_RADOS_API uint32_t rados_objects_list_seek( + rados_list_ctx_t ctx, + uint32_t pos) __attribute__((deprecated)); +CEPH_RADOS_API int rados_objects_list_next( + rados_list_ctx_t ctx, + const char **entry, + const char **key) __attribute__((deprecated)); +CEPH_RADOS_API void rados_objects_list_close( + rados_list_ctx_t ctx) __attribute__((deprecated)); + + +#ifdef __cplusplus +} +#endif + +#endif diff --git a/src/include/rados/librados.hpp b/src/include/rados/librados.hpp new file mode 100644 index 000000000..cb8261af1 --- /dev/null +++ b/src/include/rados/librados.hpp @@ -0,0 +1,1568 @@ +#ifndef __LIBRADOS_HPP +#define __LIBRADOS_HPP + +#include <string> +#include <list> +#include <map> +#include <memory> +#include <set> +#include <vector> +#include <utility> +#include "buffer.h" + +#include "librados.h" +#include "librados_fwd.hpp" +#include "rados_types.hpp" + +namespace libradosstriper +{ + class RadosStriper; +} + +namespace neorados { class RADOS; } + +namespace librados { + +using ceph::bufferlist; + +struct AioCompletionImpl; +struct IoCtxImpl; +struct ListObjectImpl; +class NObjectIteratorImpl; +struct ObjListCtx; +class ObjectOperationImpl; +struct PlacementGroupImpl; +struct PoolAsyncCompletionImpl; + +typedef struct rados_cluster_stat_t cluster_stat_t; +typedef struct rados_pool_stat_t pool_stat_t; + +typedef void *list_ctx_t; +typedef uint64_t auid_t; +typedef void *config_t; + +typedef struct { + std::string client; + 
std::string cookie; + std::string address; +} locker_t; + +typedef std::map<std::string, pool_stat_t> stats_map; + +typedef void *completion_t; +typedef void (*callback_t)(completion_t cb, void *arg); + +inline namespace v14_2_0 { + + class IoCtx; + class RadosClient; + + class CEPH_RADOS_API ListObject + { + public: + const std::string& get_nspace() const; + const std::string& get_oid() const; + const std::string& get_locator() const; + + ListObject(); + ~ListObject(); + ListObject( const ListObject&); + ListObject& operator=(const ListObject& rhs); + private: + ListObject(ListObjectImpl *impl); + + friend class librados::NObjectIteratorImpl; + friend std::ostream& operator<<(std::ostream& out, const ListObject& lop); + + ListObjectImpl *impl; + }; + CEPH_RADOS_API std::ostream& operator<<(std::ostream& out, const librados::ListObject& lop); + + class CEPH_RADOS_API NObjectIterator; + + class CEPH_RADOS_API ObjectCursor + { + public: + ObjectCursor(); + ObjectCursor(const ObjectCursor &rhs); + explicit ObjectCursor(rados_object_list_cursor c); + ~ObjectCursor(); + ObjectCursor& operator=(const ObjectCursor& rhs); + bool operator<(const ObjectCursor &rhs) const; + bool operator==(const ObjectCursor &rhs) const; + void set(rados_object_list_cursor c); + + friend class IoCtx; + friend class librados::NObjectIteratorImpl; + friend std::ostream& operator<<(std::ostream& os, const librados::ObjectCursor& oc); + + std::string to_str() const; + bool from_str(const std::string& s); + + protected: + rados_object_list_cursor c_cursor; + }; + CEPH_RADOS_API std::ostream& operator<<(std::ostream& os, const librados::ObjectCursor& oc); + + class CEPH_RADOS_API NObjectIterator { + public: + using iterator_category = std::forward_iterator_tag; + using value_type = ListObject; + using difference_type = std::ptrdiff_t; + using pointer = ListObject*; + using reference = ListObject&; + static const NObjectIterator __EndObjectIterator; + NObjectIterator(): impl(NULL) {} + 
~NObjectIterator(); + NObjectIterator(const NObjectIterator &rhs); + NObjectIterator& operator=(const NObjectIterator& rhs); + + bool operator==(const NObjectIterator& rhs) const; + bool operator!=(const NObjectIterator& rhs) const; + const ListObject& operator*() const; + const ListObject* operator->() const; + NObjectIterator &operator++(); //< Preincrement; errors are thrown as exceptions + NObjectIterator operator++(int); //< Postincrement; errors are thrown as exceptions + friend class IoCtx; + friend class librados::NObjectIteratorImpl; + + /// get current hash position of the iterator, rounded to the current pg + uint32_t get_pg_hash_position() const; + + /// move the iterator to a given hash position. this may (will!) be rounded + /// to the nearest pg. errors are thrown as exceptions + uint32_t seek(uint32_t pos); + + /// move the iterator to a given cursor position. errors are thrown as exceptions + uint32_t seek(const ObjectCursor& cursor); + + /// get current cursor position + ObjectCursor get_cursor(); + + /** + * Configure PGLS filter to be applied OSD-side (requires caller + * to know/understand the format expected by the OSD) + */ + void set_filter(const bufferlist &bl); + + private: + NObjectIterator(ObjListCtx *ctx_); + void get_next(); + NObjectIteratorImpl *impl; + }; + + class CEPH_RADOS_API ObjectItem + { + public: + std::string oid; + std::string nspace; + std::string locator; + }; + + /// DEPRECATED; do not use + class CEPH_RADOS_API WatchCtx { + public: + virtual ~WatchCtx(); + virtual void notify(uint8_t opcode, uint64_t ver, bufferlist& bl) = 0; + }; + + class CEPH_RADOS_API WatchCtx2 { + public: + virtual ~WatchCtx2(); + /** + * Callback activated when we receive a notify event. 
+ * + * @param notify_id unique id for this notify event + * @param cookie the watcher we are notifying + * @param notifier_id the unique client id of the notifier + * @param bl opaque notify payload (from the notifier) + */ + virtual void handle_notify(uint64_t notify_id, + uint64_t cookie, + uint64_t notifier_id, + bufferlist& bl) = 0; + + /** + * Callback activated when we encounter an error with the watch. + * + * Errors we may see: + * -ENOTCONN : our watch was disconnected + * -ETIMEDOUT : our watch is still valid, but we may have missed + * a notify event. + * + * @param cookie the watcher with the problem + * @param err error + */ + virtual void handle_error(uint64_t cookie, int err) = 0; + }; + + struct CEPH_RADOS_API AioCompletion { + AioCompletion(AioCompletionImpl *pc_) : pc(pc_) {} + ~AioCompletion(); + int set_complete_callback(void *cb_arg, callback_t cb); + int set_safe_callback(void *cb_arg, callback_t cb) + __attribute__ ((deprecated)); + int wait_for_complete(); + int wait_for_safe() __attribute__ ((deprecated)); + int wait_for_complete_and_cb(); + int wait_for_safe_and_cb() __attribute__ ((deprecated)); + bool is_complete(); + bool is_safe() __attribute__ ((deprecated)); + bool is_complete_and_cb(); + bool is_safe_and_cb() __attribute__ ((deprecated)); + int get_return_value(); + int get_version() __attribute__ ((deprecated)); + uint64_t get_version64(); + void release(); + AioCompletionImpl *pc; + }; + + struct CEPH_RADOS_API PoolAsyncCompletion { + PoolAsyncCompletion(PoolAsyncCompletionImpl *pc_) : pc(pc_) {} + ~PoolAsyncCompletion(); + int set_callback(void *cb_arg, callback_t cb); + int wait(); + bool is_complete(); + int get_return_value(); + void release(); + PoolAsyncCompletionImpl *pc; + }; + + /** + * These are per-op flags which may be different among + * ops added to an ObjectOperation. 
+ */ + enum ObjectOperationFlags { + OP_EXCL = LIBRADOS_OP_FLAG_EXCL, + OP_FAILOK = LIBRADOS_OP_FLAG_FAILOK, + OP_FADVISE_RANDOM = LIBRADOS_OP_FLAG_FADVISE_RANDOM, + OP_FADVISE_SEQUENTIAL = LIBRADOS_OP_FLAG_FADVISE_SEQUENTIAL, + OP_FADVISE_WILLNEED = LIBRADOS_OP_FLAG_FADVISE_WILLNEED, + OP_FADVISE_DONTNEED = LIBRADOS_OP_FLAG_FADVISE_DONTNEED, + OP_FADVISE_NOCACHE = LIBRADOS_OP_FLAG_FADVISE_NOCACHE, + }; + + class CEPH_RADOS_API ObjectOperationCompletion { + public: + virtual ~ObjectOperationCompletion() {} + virtual void handle_completion(int r, bufferlist& outbl) = 0; + }; + + /** + * These flags apply to the ObjectOperation as a whole. + * + * Prior to octopus BALANCE_READS and LOCALIZE_READS should only + * be used when reading from data you're certain won't change, like + * a snapshot, or where eventual consistency is ok. Since octopus + * (get_min_compatible_osd() >= CEPH_RELEASE_OCTOPUS) both are safe + * for general use. + * + * ORDER_READS_WRITES will order reads the same way writes are + * ordered (e.g., waiting for degraded objects). In particular, it + * will make a write followed by a read sequence be preserved. + * + * IGNORE_CACHE will skip the caching logic on the OSD that normally + * handles promotion of objects between tiers. This allows an operation + * to operate (or read) the cached (or uncached) object, even if it is + * not coherent. + * + * IGNORE_OVERLAY will ignore the pool overlay tiering metadata and + * process the op directly on the destination pool. This is useful + * for CACHE_FLUSH and CACHE_EVICT operations. 
+ */ + enum ObjectOperationGlobalFlags { + OPERATION_NOFLAG = LIBRADOS_OPERATION_NOFLAG, + OPERATION_BALANCE_READS = LIBRADOS_OPERATION_BALANCE_READS, + OPERATION_LOCALIZE_READS = LIBRADOS_OPERATION_LOCALIZE_READS, + OPERATION_ORDER_READS_WRITES = LIBRADOS_OPERATION_ORDER_READS_WRITES, + OPERATION_IGNORE_CACHE = LIBRADOS_OPERATION_IGNORE_CACHE, + OPERATION_SKIPRWLOCKS = LIBRADOS_OPERATION_SKIPRWLOCKS, + OPERATION_IGNORE_OVERLAY = LIBRADOS_OPERATION_IGNORE_OVERLAY, + // send requests to cluster despite the cluster or pool being + // marked full; ops will either succeed (e.g., delete) or return + // EDQUOT or ENOSPC + OPERATION_FULL_TRY = LIBRADOS_OPERATION_FULL_TRY, + // mainly for delete + OPERATION_FULL_FORCE = LIBRADOS_OPERATION_FULL_FORCE, + OPERATION_IGNORE_REDIRECT = LIBRADOS_OPERATION_IGNORE_REDIRECT, + OPERATION_ORDERSNAP = LIBRADOS_OPERATION_ORDERSNAP, + // enable/allow return value and per-op return code/buffers + OPERATION_RETURNVEC = LIBRADOS_OPERATION_RETURNVEC, + }; + + /* + * Alloc hint flags for the alloc_hint operation. + */ + enum AllocHintFlags { + ALLOC_HINT_FLAG_SEQUENTIAL_WRITE = 1, + ALLOC_HINT_FLAG_RANDOM_WRITE = 2, + ALLOC_HINT_FLAG_SEQUENTIAL_READ = 4, + ALLOC_HINT_FLAG_RANDOM_READ = 8, + ALLOC_HINT_FLAG_APPEND_ONLY = 16, + ALLOC_HINT_FLAG_IMMUTABLE = 32, + ALLOC_HINT_FLAG_SHORTLIVED = 64, + ALLOC_HINT_FLAG_LONGLIVED = 128, + ALLOC_HINT_FLAG_COMPRESSIBLE = 256, + ALLOC_HINT_FLAG_INCOMPRESSIBLE = 512, + }; + + /* + * ObjectOperation : compound object operation + * Batch multiple object operations into a single request, to be applied + * atomically. + */ + class CEPH_RADOS_API ObjectOperation + { + public: + ObjectOperation(); + virtual ~ObjectOperation(); + + ObjectOperation(const ObjectOperation&) = delete; + ObjectOperation& operator=(const ObjectOperation&) = delete; + + /** + * Move constructor. + * \warning A moved from ObjectOperation is invalid and may not be used for + * any purpose. 
This is a hard contract violation and will + * kill your program. + */ + ObjectOperation(ObjectOperation&&); + ObjectOperation& operator =(ObjectOperation&&); + + size_t size(); + void set_op_flags(ObjectOperationFlags flags) __attribute__((deprecated)); + //flag mean ObjectOperationFlags + void set_op_flags2(int flags); + + void cmpext(uint64_t off, const bufferlist& cmp_bl, int *prval); + void cmpxattr(const char *name, uint8_t op, const bufferlist& val); + void cmpxattr(const char *name, uint8_t op, uint64_t v); + void exec(const char *cls, const char *method, bufferlist& inbl); + void exec(const char *cls, const char *method, bufferlist& inbl, bufferlist *obl, int *prval); + void exec(const char *cls, const char *method, bufferlist& inbl, ObjectOperationCompletion *completion); + /** + * Guard operation with a check that object version == ver + * + * @param ver [in] version to check + */ + void assert_version(uint64_t ver); + + /** + * Guard operation with a check that the object already exists + */ + void assert_exists(); + + /** + * get key/value pairs for specified keys + * + * @param assertions [in] comparison assertions + * @param prval [out] place error code in prval upon completion + * + * assertions has the form of mappings from keys to (comparison rval, assertion) + * The assertion field may be CEPH_OSD_CMPXATTR_OP_[GT|LT|EQ]. 
+ * + * That is, to assert that the value at key 'foo' is greater than 'bar': + * + * ObjectReadOperation op; + * int r; + * map<string, pair<bufferlist, int> > assertions; + * bufferlist bar(string('bar')); + * assertions['foo'] = make_pair(bar, CEPH_OSD_CMPXATTR_OP_GT); + * op.omap_cmp(assertions, &r); + */ + void omap_cmp( + const std::map<std::string, std::pair<bufferlist, int> > &assertions, + int *prval); + + protected: + ObjectOperationImpl* impl; + friend class IoCtx; + friend class Rados; + }; + + /* + * ObjectWriteOperation : compound object write operation + * Batch multiple object operations into a single request, to be applied + * atomically. + */ + class CEPH_RADOS_API ObjectWriteOperation : public ObjectOperation + { + protected: + time_t *unused; + public: + ObjectWriteOperation() : unused(NULL) {} + ~ObjectWriteOperation() override {} + + ObjectWriteOperation(ObjectWriteOperation&&) = default; + ObjectWriteOperation& operator =(ObjectWriteOperation&&) = default; + + void mtime(time_t *pt); + void mtime2(struct timespec *pts); + + void create(bool exclusive); + void create(bool exclusive, + const std::string& category); ///< NOTE: category is unused + + void write(uint64_t off, const bufferlist& bl); + void write_full(const bufferlist& bl); + void writesame(uint64_t off, uint64_t write_len, + const bufferlist& bl); + void append(const bufferlist& bl); + void remove(); + void truncate(uint64_t off); + void zero(uint64_t off, uint64_t len); + void rmxattr(const char *name); + void setxattr(const char *name, const bufferlist& bl); + void setxattr(const char *name, const bufferlist&& bl); + void tmap_update(const bufferlist& cmdbl); + void tmap_put(const bufferlist& bl); + void selfmanaged_snap_rollback(uint64_t snapid); + + /** + * Rollback an object to the specified snapshot id + * + * Used with pool snapshots + * + * @param snapid [in] snapshot id to roll back to + */ + void snap_rollback(uint64_t snapid); + + /** + * set keys and values according to map 
+ * + * @param map [in] keys and values to set + */ + void omap_set(const std::map<std::string, bufferlist> &map); + + /** + * set header + * + * @param bl [in] header to set + */ + void omap_set_header(const bufferlist &bl); + + /** + * Clears omap contents + */ + void omap_clear(); + + /** + * Clears keys in to_rm + * + * @param to_rm [in] keys to remove + */ + void omap_rm_keys(const std::set<std::string> &to_rm); + + /** + * Copy an object + * + * Copies an object from another location. The operation is atomic in that + * the copy either succeeds in its entirety or fails (e.g., because the + * source object was modified while the copy was in progress). + * + * @param src source object name + * @param src_ioctx ioctx for the source object + * @param src_version current version of the source object + * @param src_fadvise_flags the fadvise flags for source object + */ + void copy_from(const std::string& src, const IoCtx& src_ioctx, + uint64_t src_version, uint32_t src_fadvise_flags); + + /** + * Copy an object + * + * Copies an object from another location. The operation is atomic in that + * the copy either succeeds in its entirety or fails (e.g., because the + * source object was modified while the copy was in progress). Instead of + * copying truncate_seq and truncate_size from the source object it receives + * these values as parameters. 
+ * + * @param src source object name + * @param src_ioctx ioctx for the source object + * @param src_version current version of the source object + * @param truncate_seq truncate sequence for the destination object + * @param truncate_size truncate size for the destination object + * @param src_fadvise_flags the fadvise flags for source object + */ + void copy_from2(const std::string& src, const IoCtx& src_ioctx, + uint64_t src_version, uint32_t truncate_seq, + uint64_t truncate_size, uint32_t src_fadvise_flags); + + /** + * undirty an object + * + * Clear an objects dirty flag + */ + void undirty(); + + /** + * Set allocation hint for an object + * + * @param expected_object_size expected size of the object, in bytes + * @param expected_write_size expected size of writes to the object, in bytes + * @param flags flags () + */ + void set_alloc_hint(uint64_t expected_object_size, + uint64_t expected_write_size); + void set_alloc_hint2(uint64_t expected_object_size, + uint64_t expected_write_size, + uint32_t flags); + + /** + * Pin/unpin an object in cache tier + * + * @returns 0 on success, negative error code on failure + */ + void cache_pin(); + void cache_unpin(); + + /** + * Extensible tier + * + * Set redirect target + */ + void set_redirect(const std::string& tgt_obj, const IoCtx& tgt_ioctx, + uint64_t tgt_version, int flag = 0); + void tier_promote(); + void unset_manifest(); + + friend class IoCtx; + }; + + /* + * ObjectReadOperation : compound object operation that return value + * Batch multiple object operations into a single request, to be applied + * atomically. 
+ */ + class CEPH_RADOS_API ObjectReadOperation : public ObjectOperation + { + public: + ObjectReadOperation() {} + ~ObjectReadOperation() override {} + + ObjectReadOperation(ObjectReadOperation&&) = default; + ObjectReadOperation& operator =(ObjectReadOperation&&) = default; + + void stat(uint64_t *psize, time_t *pmtime, int *prval); + void stat2(uint64_t *psize, struct timespec *pts, int *prval); + void getxattr(const char *name, bufferlist *pbl, int *prval); + void getxattrs(std::map<std::string, bufferlist> *pattrs, int *prval); + void read(size_t off, uint64_t len, bufferlist *pbl, int *prval); + void checksum(rados_checksum_type_t type, const bufferlist &init_value_bl, + uint64_t off, size_t len, size_t chunk_size, bufferlist *pbl, + int *prval); + + /** + * see aio_sparse_read() + */ + void sparse_read(uint64_t off, uint64_t len, std::map<uint64_t,uint64_t> *m, + bufferlist *data_bl, int *prval, + uint64_t truncate_size = 0, + uint32_t truncate_seq = 0); + + /** + * omap_get_vals: keys and values from the object omap + * + * Get up to max_return keys and values beginning after start_after + * + * @param start_after [in] list no keys smaller than start_after + * @param max_return [in] list no more than max_return key/value pairs + * @param out_vals [out] place returned values in out_vals on completion + * @param prval [out] place error code in prval upon completion + */ + void omap_get_vals( + const std::string &start_after, + uint64_t max_return, + std::map<std::string, bufferlist> *out_vals, + int *prval) __attribute__ ((deprecated)); // use v2 + + /** + * omap_get_vals: keys and values from the object omap + * + * Get up to max_return keys and values beginning after start_after + * + * @param start_after [in] list no keys smaller than start_after + * @param max_return [in] list no more than max_return key/value pairs + * @param out_vals [out] place returned values in out_vals on completion + * @param prval [out] place error code in prval upon completion + 
*/ + void omap_get_vals2( + const std::string &start_after, + uint64_t max_return, + std::map<std::string, bufferlist> *out_vals, + bool *pmore, + int *prval); + + /** + * omap_get_vals: keys and values from the object omap + * + * Get up to max_return keys and values beginning after start_after + * + * @param start_after [in] list keys starting after start_after + * @param filter_prefix [in] list only keys beginning with filter_prefix + * @param max_return [in] list no more than max_return key/value pairs + * @param out_vals [out] place returned values in out_vals on completion + * @param prval [out] place error code in prval upon completion + */ + void omap_get_vals( + const std::string &start_after, + const std::string &filter_prefix, + uint64_t max_return, + std::map<std::string, bufferlist> *out_vals, + int *prval) __attribute__ ((deprecated)); // use v2 + + /** + * omap_get_vals2: keys and values from the object omap + * + * Get up to max_return keys and values beginning after start_after + * + * @param start_after [in] list keys starting after start_after + * @param filter_prefix [in] list only keys beginning with filter_prefix + * @param max_return [in] list no more than max_return key/value pairs + * @param out_vals [out] place returned values in out_vals on completion + * @param pmore [out] pointer to bool indicating whether there are more keys + * @param prval [out] place error code in prval upon completion + */ + void omap_get_vals2( + const std::string &start_after, + const std::string &filter_prefix, + uint64_t max_return, + std::map<std::string, bufferlist> *out_vals, + bool *pmore, + int *prval); + + + /** + * omap_get_keys: keys from the object omap + * + * Get up to max_return keys beginning after start_after + * + * @param start_after [in] list keys starting after start_after + * @param max_return [in] list no more than max_return keys + * @param out_keys [out] place returned values in out_keys on completion + * @param prval [out] place error 
code in prval upon completion + */ + void omap_get_keys(const std::string &start_after, + uint64_t max_return, + std::set<std::string> *out_keys, + int *prval) __attribute__ ((deprecated)); // use v2 + + /** + * omap_get_keys2: keys from the object omap + * + * Get up to max_return keys beginning after start_after + * + * @param start_after [in] list keys starting after start_after + * @param max_return [in] list no more than max_return keys + * @param out_keys [out] place returned values in out_keys on completion + * @param pmore [out] pointer to bool indicating whether there are more keys + * @param prval [out] place error code in prval upon completion + */ + void omap_get_keys2(const std::string &start_after, + uint64_t max_return, + std::set<std::string> *out_keys, + bool *pmore, + int *prval); + + /** + * omap_get_header: get header from object omap + * + * @param header [out] place header here upon completion + * @param prval [out] place error code in prval upon completion + */ + void omap_get_header(bufferlist *header, int *prval); + + /** + * get key/value pairs for specified keys + * + * @param keys [in] keys to get + * @param map [out] place key/value pairs found here on completion + * @param prval [out] place error code in prval upon completion + */ + void omap_get_vals_by_keys(const std::set<std::string> &keys, + std::map<std::string, bufferlist> *map, + int *prval); + + /** + * list_watchers: Get list watchers of object + * + * @param out_watchers [out] place returned values in out_watchers on completion + * @param prval [out] place error code in prval upon completion + */ + void list_watchers(std::list<obj_watch_t> *out_watchers, int *prval); + + /** + * list snapshot clones associated with a logical object + * + * This will include a record for each version of the object, + * include the "HEAD" (which will have a cloneid of SNAP_HEAD). + * Each clone includes a vector of snap ids for which it is + * defined to exist. 
+ * + * NOTE: this operation must be submitted from an IoCtx with a + * read snapid of SNAP_DIR for reliable results. + * + * @param out_snaps [out] pointer to resulting snap_set_t + * @param prval [out] place error code in prval upon completion + */ + void list_snaps(snap_set_t *out_snaps, int *prval); + + /** + * query dirty state of an object + * + * @param isdirty [out] pointer to resulting bool + * @param prval [out] place error code in prval upon completion + */ + void is_dirty(bool *isdirty, int *prval); + + /** + * flush a cache tier object to backing tier; will block racing + * updates. + * + * This should be used in concert with OPERATION_IGNORE_CACHE to avoid + * triggering a promotion. + */ + void cache_flush(); + + /** + * Flush a cache tier object to backing tier; will EAGAIN if we race + * with an update. Must be used with the SKIPRWLOCKS flag. + * + * This should be used in concert with OPERATION_IGNORE_CACHE to avoid + * triggering a promotion. + */ + void cache_try_flush(); + + /** + * evict a clean cache tier object + * + * This should be used in concert with OPERATION_IGNORE_CACHE to avoid + * triggering a promote on the OSD (that is then evicted). + */ + void cache_evict(); + + /** + * Extensible tier + * + * set_chunk: make a chunk pointing a part of the source object at the target + * object + * + * @param src_offset [in] source offset to indicate the start position of + * a chunk in the source object + * @param src_length [in] source length to set the length of the chunk + * @param tgt_oid [in] target object's id to set a chunk + * @param tgt_offset [in] the start position of the target object + * @param flag [in] flag for the source object + * + */ + void set_chunk(uint64_t src_offset, uint64_t src_length, const IoCtx& tgt_ioctx, + std::string tgt_oid, uint64_t tgt_offset, int flag = 0); + /** + * flush a manifest tier object to backing tier, performing deduplication; + * will block racing updates. 
+ * + * Invoking tier_flush() implicitly makes a manifest object even if + * the target object is not manifest. + */ + void tier_flush(); + /** + * evict a manifest tier object to backing tier; will block racing + * updates. + */ + void tier_evict(); + }; + + /* IoCtx : This is a context in which we can perform I/O. + * It includes a Pool, + * + * Typical use (error checking omitted): + * + * IoCtx p; + * rados.ioctx_create("my_pool", p); + * p->stat(&stats); + * ... etc ... + * + * NOTE: be sure to call watch_flush() prior to destroying any IoCtx + * that is used for watch events to ensure that racing callbacks + * have completed. + */ + class CEPH_RADOS_API IoCtx + { + public: + IoCtx(); + static void from_rados_ioctx_t(rados_ioctx_t p, IoCtx &pool); + IoCtx(const IoCtx& rhs); + IoCtx& operator=(const IoCtx& rhs); + IoCtx(IoCtx&& rhs) noexcept; + IoCtx& operator=(IoCtx&& rhs) noexcept; + + ~IoCtx(); + + bool is_valid() const; + + // Close our pool handle + void close(); + + // deep copy + void dup(const IoCtx& rhs); + + // set pool auid + int set_auid(uint64_t auid_) + __attribute__ ((deprecated)); + + // set pool auid + int set_auid_async(uint64_t auid_, PoolAsyncCompletion *c) + __attribute__ ((deprecated)); + + // get pool auid + int get_auid(uint64_t *auid_) + __attribute__ ((deprecated)); + + uint64_t get_instance_id() const; + + std::string get_pool_name(); + + bool pool_requires_alignment(); + int pool_requires_alignment2(bool * req); + uint64_t pool_required_alignment(); + int pool_required_alignment2(uint64_t * alignment); + + // create an object + int create(const std::string& oid, bool exclusive); + int create(const std::string& oid, bool exclusive, + const std::string& category); ///< category is unused + + /** + * write bytes to an object at a specified offset + * + * NOTE: this call steals the contents of @param bl. 
+ */ + int write(const std::string& oid, bufferlist& bl, size_t len, uint64_t off); + /** + * append bytes to an object + * + * NOTE: this call steals the contents of @param bl. + */ + int append(const std::string& oid, bufferlist& bl, size_t len); + /** + * replace object contents with provided data + * + * NOTE: this call steals the contents of @param bl. + */ + int write_full(const std::string& oid, bufferlist& bl); + int writesame(const std::string& oid, bufferlist& bl, + size_t write_len, uint64_t off); + int read(const std::string& oid, bufferlist& bl, size_t len, uint64_t off); + int checksum(const std::string& o, rados_checksum_type_t type, + const bufferlist &init_value_bl, size_t len, uint64_t off, + size_t chunk_size, bufferlist *pbl); + int remove(const std::string& oid); + int remove(const std::string& oid, int flags); + int trunc(const std::string& oid, uint64_t size); + int mapext(const std::string& o, uint64_t off, size_t len, std::map<uint64_t,uint64_t>& m); + int cmpext(const std::string& o, uint64_t off, bufferlist& cmp_bl); + int sparse_read(const std::string& o, std::map<uint64_t,uint64_t>& m, bufferlist& bl, size_t len, uint64_t off); + int getxattr(const std::string& oid, const char *name, bufferlist& bl); + int getxattrs(const std::string& oid, std::map<std::string, bufferlist>& attrset); + int setxattr(const std::string& oid, const char *name, bufferlist& bl); + int rmxattr(const std::string& oid, const char *name); + int stat(const std::string& oid, uint64_t *psize, time_t *pmtime); + int stat2(const std::string& oid, uint64_t *psize, struct timespec *pts); + int exec(const std::string& oid, const char *cls, const char *method, + bufferlist& inbl, bufferlist& outbl); + /** + * modify object tmap based on encoded update sequence + * + * NOTE: this call steals the contents of @param bl + */ + int tmap_update(const std::string& oid, bufferlist& cmdbl); + + int omap_get_vals(const std::string& oid, + const std::string& start_after, + uint64_t 
max_return, + std::map<std::string, bufferlist> *out_vals); + int omap_get_vals2(const std::string& oid, + const std::string& start_after, + uint64_t max_return, + std::map<std::string, bufferlist> *out_vals, + bool *pmore); + int omap_get_vals(const std::string& oid, + const std::string& start_after, + const std::string& filter_prefix, + uint64_t max_return, + std::map<std::string, bufferlist> *out_vals); + int omap_get_vals2(const std::string& oid, + const std::string& start_after, + const std::string& filter_prefix, + uint64_t max_return, + std::map<std::string, bufferlist> *out_vals, + bool *pmore); + int omap_get_keys(const std::string& oid, + const std::string& start_after, + uint64_t max_return, + std::set<std::string> *out_keys); + int omap_get_keys2(const std::string& oid, + const std::string& start_after, + uint64_t max_return, + std::set<std::string> *out_keys, + bool *pmore); + int omap_get_header(const std::string& oid, + bufferlist *bl); + int omap_get_vals_by_keys(const std::string& oid, + const std::set<std::string>& keys, + std::map<std::string, bufferlist> *vals); + int omap_set(const std::string& oid, + const std::map<std::string, bufferlist>& map); + int omap_set_header(const std::string& oid, + const bufferlist& bl); + int omap_clear(const std::string& oid); + int omap_rm_keys(const std::string& oid, + const std::set<std::string>& keys); + + void snap_set_read(snap_t seq); + int selfmanaged_snap_set_write_ctx(snap_t seq, std::vector<snap_t>& snaps); + + // Create a snapshot with a given name + int snap_create(const char *snapname); + + // Look up a snapshot by name. 
+ // Returns 0 on success; error code otherwise + int snap_lookup(const char *snapname, snap_t *snap); + + // Gets a timestamp for a snap + int snap_get_stamp(snap_t snapid, time_t *t); + + // Gets the name of a snap + int snap_get_name(snap_t snapid, std::string *s); + + // Remove a snapshot from this pool + int snap_remove(const char *snapname); + + int snap_list(std::vector<snap_t> *snaps); + + int snap_rollback(const std::string& oid, const char *snapname); + + // Deprecated name kept for backward compatibility - same as snap_rollback() + int rollback(const std::string& oid, const char *snapname) + __attribute__ ((deprecated)); + + int selfmanaged_snap_create(uint64_t *snapid); + void aio_selfmanaged_snap_create(uint64_t *snapid, AioCompletion *c); + + int selfmanaged_snap_remove(uint64_t snapid); + void aio_selfmanaged_snap_remove(uint64_t snapid, AioCompletion *c); + + int selfmanaged_snap_rollback(const std::string& oid, uint64_t snapid); + + // Advisory locking on rados objects. + int lock_exclusive(const std::string &oid, const std::string &name, + const std::string &cookie, + const std::string &description, + struct timeval * duration, uint8_t flags); + + int lock_shared(const std::string &oid, const std::string &name, + const std::string &cookie, const std::string &tag, + const std::string &description, + struct timeval * duration, uint8_t flags); + + int unlock(const std::string &oid, const std::string &name, + const std::string &cookie); + + int break_lock(const std::string &oid, const std::string &name, + const std::string &client, const std::string &cookie); + + int list_lockers(const std::string &oid, const std::string &name, + int *exclusive, + std::string *tag, + std::list<librados::locker_t> *lockers); + + + /// Start enumerating objects for a pool. Errors are thrown as exceptions. + NObjectIterator nobjects_begin(const bufferlist &filter=bufferlist()); + /// Start enumerating objects for a pool starting from a hash position. 
+ /// Errors are thrown as exceptions. + NObjectIterator nobjects_begin(uint32_t start_hash_position, + const bufferlist &filter=bufferlist()); + /// Start enumerating objects for a pool starting from cursor. Errors are + /// thrown as exceptions. + NObjectIterator nobjects_begin(const librados::ObjectCursor& cursor, + const bufferlist &filter=bufferlist()); + /// Iterator indicating the end of a pool + const NObjectIterator& nobjects_end() const; + + /// Get cursor for pool beginning + ObjectCursor object_list_begin(); + + /// Get cursor for pool end + ObjectCursor object_list_end(); + + /// Check whether a cursor is at the end of a pool + bool object_list_is_end(const ObjectCursor &oc); + + /// List some objects between two cursors + int object_list(const ObjectCursor &start, const ObjectCursor &finish, + const size_t result_count, + const bufferlist &filter, + std::vector<ObjectItem> *result, + ObjectCursor *next); + + /// Generate cursors that include slice n out of m slices of the pool + void object_list_slice( + const ObjectCursor start, + const ObjectCursor finish, + const size_t n, + const size_t m, + ObjectCursor *split_start, + ObjectCursor *split_finish); + + /** + * List available hit set objects + * + * @param hash [in] hash position to query + * @param c [in] completion + * @param pls [out] list of available intervals + */ + int hit_set_list(uint32_t hash, AioCompletion *c, + std::list< std::pair<time_t, time_t> > *pls); + + /** + * Retrieve hit set for a given hash and time + * + * @param hash [in] hash position + * @param c [in] completion + * @param stamp [in] time interval that falls within the hit set's interval + * @param pbl [out] buffer to store the result in + */ + int hit_set_get(uint32_t hash, AioCompletion *c, time_t stamp, + bufferlist *pbl); + + uint64_t get_last_version(); + + int aio_read(const std::string& oid, AioCompletion *c, + bufferlist *pbl, size_t len, uint64_t off); + /** + * Asynchronously read from an object at a
particular snapshot + * + * This is the same as normal aio_read, except that it chooses + * the snapshot to read from from its arguments instead of the + * internal IoCtx state. + * + * The return value of the completion will be number of bytes read on + * success, negative error code on failure. + * + * @param oid the name of the object to read from + * @param c what to do when the read is complete + * @param pbl where to store the results + * @param len the number of bytes to read + * @param off the offset to start reading from in the object + * @param snapid the id of the snapshot to read from + * @returns 0 on success, negative error code on failure + */ + int aio_read(const std::string& oid, AioCompletion *c, + bufferlist *pbl, size_t len, uint64_t off, uint64_t snapid); + int aio_sparse_read(const std::string& oid, AioCompletion *c, + std::map<uint64_t,uint64_t> *m, bufferlist *data_bl, + size_t len, uint64_t off); + /** + * Asynchronously read existing extents from an object at a + * particular snapshot + * + * This is the same as normal aio_sparse_read, except that it chooses + * the snapshot to read from from its arguments instead of the + * internal IoCtx state. + * + * m will be filled in with a map of extents in the object, + * mapping offsets to lengths (in bytes) within the range + * requested. The data for all of the extents are stored + * back-to-back in offset order in data_bl. 
+ * + * @param oid the name of the object to read from + * @param c what to do when the read is complete + * @param m where to store the map of extents + * @param data_bl where to store the data + * @param len the number of bytes to read + * @param off the offset to start reading from in the object + * @param snapid the id of the snapshot to read from + * @returns 0 on success, negative error code on failure + */ + int aio_sparse_read(const std::string& oid, AioCompletion *c, + std::map<uint64_t,uint64_t> *m, bufferlist *data_bl, + size_t len, uint64_t off, uint64_t snapid); + /** + * Asynchronously compare an on-disk object range with a buffer + * + * @param oid the name of the object to read from + * @param c what to do when the read is complete + * @param off object byte offset at which to start the comparison + * @param cmp_bl buffer containing bytes to be compared with object contents + * @returns 0 on success, negative error code on failure, + * (-MAX_ERRNO - mismatch_off) on mismatch + */ + int aio_cmpext(const std::string& oid, + librados::AioCompletion *c, + uint64_t off, + bufferlist& cmp_bl); + int aio_write(const std::string& oid, AioCompletion *c, const bufferlist& bl, + size_t len, uint64_t off); + int aio_append(const std::string& oid, AioCompletion *c, const bufferlist& bl, + size_t len); + int aio_write_full(const std::string& oid, AioCompletion *c, const bufferlist& bl); + int aio_writesame(const std::string& oid, AioCompletion *c, const bufferlist& bl, + size_t write_len, uint64_t off); + + /** + * Asynchronously remove an object + * + * Queues the remove and returns. + * + * The return value of the completion will be 0 on success, negative + * error code on failure. 
+ * + * @param oid the name of the object + * @param c what to do when the remove is safe and complete + * @returns 0 on success, -EROFS if the io context specifies a snap_seq + * other than SNAP_HEAD + */ + int aio_remove(const std::string& oid, AioCompletion *c); + int aio_remove(const std::string& oid, AioCompletion *c, int flags); + + /** + * Wait for all currently pending aio writes to be safe. + * + * @returns 0 on success, negative error code on failure + */ + int aio_flush(); + + /** + * Schedule a callback for when all currently pending + * aio writes are safe. This is a non-blocking version of + * aio_flush(). + * + * @param c what to do when the writes are safe + * @returns 0 on success, negative error code on failure + */ + int aio_flush_async(AioCompletion *c); + int aio_getxattr(const std::string& oid, AioCompletion *c, const char *name, bufferlist& bl); + int aio_getxattrs(const std::string& oid, AioCompletion *c, std::map<std::string, bufferlist>& attrset); + int aio_setxattr(const std::string& oid, AioCompletion *c, const char *name, bufferlist& bl); + int aio_rmxattr(const std::string& oid, AioCompletion *c, const char *name); + int aio_stat(const std::string& oid, AioCompletion *c, uint64_t *psize, time_t *pmtime); + int aio_stat2(const std::string& oid, AioCompletion *c, uint64_t *psize, struct timespec *pts); + + /** + * Cancel aio operation + * + * @param c completion handle + * @returns 0 on success, negative error code on failure + */ + int aio_cancel(AioCompletion *c); + + int aio_exec(const std::string& oid, AioCompletion *c, const char *cls, const char *method, + bufferlist& inbl, bufferlist *outbl); + + /* + * asynchronous version of unlock + */ + int aio_unlock(const std::string &oid, const std::string &name, + const std::string &cookie, AioCompletion *c); + + // compound object operations + int operate(const std::string& oid, ObjectWriteOperation *op); + int operate(const std::string& oid, ObjectWriteOperation *op, int flags); + int 
operate(const std::string& oid, ObjectReadOperation *op, bufferlist *pbl); + int operate(const std::string& oid, ObjectReadOperation *op, bufferlist *pbl, int flags); + int aio_operate(const std::string& oid, AioCompletion *c, ObjectWriteOperation *op); + int aio_operate(const std::string& oid, AioCompletion *c, ObjectWriteOperation *op, int flags); + /** + * Schedule an async write operation with explicit snapshot parameters + * + * This is the same as the first aio_operate(), except that it + * gets the snapshot context from its arguments instead of the + * IoCtx internal state. + * + * @param oid the object to operate on + * @param c what to do when the operation is complete and safe + * @param op which operations to perform + * @param seq latest selfmanaged snapshot sequence number for this object + * @param snaps currently existing selfmanaged snapshot ids for this object + * @returns 0 on success, negative error code on failure + */ + int aio_operate(const std::string& oid, AioCompletion *c, + ObjectWriteOperation *op, snap_t seq, + std::vector<snap_t>& snaps); + int aio_operate(const std::string& oid, AioCompletion *c, + ObjectWriteOperation *op, snap_t seq, + std::vector<snap_t>& snaps, + const blkin_trace_info *trace_info); + int aio_operate(const std::string& oid, AioCompletion *c, + ObjectWriteOperation *op, snap_t seq, + std::vector<snap_t>& snaps, int flags, + const blkin_trace_info *trace_info); + int aio_operate(const std::string& oid, AioCompletion *c, + ObjectReadOperation *op, bufferlist *pbl); + + int aio_operate(const std::string& oid, AioCompletion *c, + ObjectReadOperation *op, snap_t snapid, int flags, + bufferlist *pbl) + __attribute__ ((deprecated)); + + int aio_operate(const std::string& oid, AioCompletion *c, + ObjectReadOperation *op, int flags, + bufferlist *pbl); + int aio_operate(const std::string& oid, AioCompletion *c, + ObjectReadOperation *op, int flags, + bufferlist *pbl, const blkin_trace_info *trace_info); + + // watch/notify + 
int watch2(const std::string& o, uint64_t *handle, + librados::WatchCtx2 *ctx); + int watch3(const std::string& o, uint64_t *handle, + librados::WatchCtx2 *ctx, uint32_t timeout); + int aio_watch(const std::string& o, AioCompletion *c, uint64_t *handle, + librados::WatchCtx2 *ctx); + int aio_watch2(const std::string& o, AioCompletion *c, uint64_t *handle, + librados::WatchCtx2 *ctx, uint32_t timeout); + int unwatch2(uint64_t handle); + int aio_unwatch(uint64_t handle, AioCompletion *c); + /** + * Send a notify event to watchers + * + * Upon completion the pbl bufferlist reply payload will be + * encoded like so: + * + * le32 num_acks + * { + * le64 gid global id for the client (for client.1234 that's 1234) + * le64 cookie cookie for the client + * le32 buflen length of reply message buffer + * u8 * buflen payload + * } * num_acks + * le32 num_timeouts + * { + * le64 gid global id for the client + * le64 cookie cookie for the client + * } * num_timeouts + * + * + */ + int notify2(const std::string& o, ///< object + bufferlist& bl, ///< optional broadcast payload + uint64_t timeout_ms, ///< timeout (in ms) + bufferlist *pbl); ///< reply buffer + int aio_notify(const std::string& o, ///< object + AioCompletion *c, ///< completion when notify completes + bufferlist& bl, ///< optional broadcast payload + uint64_t timeout_ms, ///< timeout (in ms) + bufferlist *pbl); ///< reply buffer + /* + * Decode a notify response into acks and timeout vectors. + */ + void decode_notify_response(bufferlist &bl, + std::vector<librados::notify_ack_t> *acks, + std::vector<librados::notify_timeout_t> *timeouts); + + int list_watchers(const std::string& o, std::list<obj_watch_t> *out_watchers); + int list_snaps(const std::string& o, snap_set_t *out_snaps); + void set_notify_timeout(uint32_t timeout); + + /// acknowledge a notify we received. 
+ void notify_ack(const std::string& o, ///< watched object + uint64_t notify_id, ///< notify id + uint64_t cookie, ///< our watch handle + bufferlist& bl); ///< optional reply payload + + /*** + * check on watch validity + * + * Check if a watch is valid. If so, return the number of + * milliseconds since we last confirmed its liveness. If there is + * a known error, return it. + * + * If there is an error, the watch is no longer valid, and should + * be destroyed with unwatch(). If the user is still interested in + * the object, a new watch should be created with watch(). + * + * @param cookie watch handle + * @returns ms since last confirmed valid, or error + */ + int watch_check(uint64_t cookie); + + // old, deprecated versions + int watch(const std::string& o, uint64_t ver, uint64_t *cookie, + librados::WatchCtx *ctx) __attribute__ ((deprecated)); + int notify(const std::string& o, uint64_t ver, bufferlist& bl) + __attribute__ ((deprecated)); + int unwatch(const std::string& o, uint64_t cookie) + __attribute__ ((deprecated)); + + /** + * Set allocation hint for an object + * + * This is an advisory operation, it will always succeed (as if it + * was submitted with an OP_FAILOK flag set) and is not guaranteed + * to do anything on the backend.
+ * + * @param o the name of the object + * @param expected_object_size expected size of the object, in bytes + * @param expected_write_size expected size of writes to the object, in bytes + * @returns 0 on success, negative error code on failure + */ + int set_alloc_hint(const std::string& o, + uint64_t expected_object_size, + uint64_t expected_write_size); + int set_alloc_hint2(const std::string& o, + uint64_t expected_object_size, + uint64_t expected_write_size, + uint32_t flags); + + // assert version for next sync operations + void set_assert_version(uint64_t ver); + + /** + * Pin/unpin an object in cache tier + * + * @param o the name of the object + * @returns 0 on success, negative error code on failure + */ + int cache_pin(const std::string& o); + int cache_unpin(const std::string& o); + + std::string get_pool_name() const; + + void locator_set_key(const std::string& key); + void set_namespace(const std::string& nspace); + std::string get_namespace() const; + + int64_t get_id(); + + // deprecated versions + uint32_t get_object_hash_position(const std::string& oid) + __attribute__ ((deprecated)); + uint32_t get_object_pg_hash_position(const std::string& oid) + __attribute__ ((deprecated)); + + int get_object_hash_position2(const std::string& oid, uint32_t *hash_position); + int get_object_pg_hash_position2(const std::string& oid, uint32_t *pg_hash_position); + + config_t cct(); + + void set_osdmap_full_try() + __attribute__ ((deprecated)); + void unset_osdmap_full_try() + __attribute__ ((deprecated)); + + bool get_pool_full_try(); + void set_pool_full_try(); + void unset_pool_full_try(); + + int application_enable(const std::string& app_name, bool force); + int application_enable_async(const std::string& app_name, + bool force, PoolAsyncCompletion *c); + int application_list(std::set<std::string> *app_names); + int application_metadata_get(const std::string& app_name, + const std::string &key, + std::string *value); + int application_metadata_set(const 
std::string& app_name, + const std::string &key, + const std::string& value); + int application_metadata_remove(const std::string& app_name, + const std::string &key); + int application_metadata_list(const std::string& app_name, + std::map<std::string, std::string> *values); + + private: + /* You can only get IoCtx instances from Rados */ + IoCtx(IoCtxImpl *io_ctx_impl_); + + friend class Rados; // Only Rados can use our private constructor to create IoCtxes. + friend class libradosstriper::RadosStriper; // Striper needs to see our IoCtxImpl + friend class ObjectWriteOperation; // copy_from needs to see our IoCtxImpl + friend class ObjectReadOperation; // set_chunk needs to see our IoCtxImpl + + IoCtxImpl *io_ctx_impl; + }; + + struct CEPH_RADOS_API PlacementGroup { + PlacementGroup(); + PlacementGroup(const PlacementGroup&); + ~PlacementGroup(); + bool parse(const char*); + std::unique_ptr<PlacementGroupImpl> impl; + }; + + CEPH_RADOS_API std::ostream& operator<<(std::ostream&, const PlacementGroup&); + + class CEPH_RADOS_API Rados + { + public: + static void version(int *major, int *minor, int *extra); + + Rados(); + explicit Rados(IoCtx& ioctx); + ~Rados(); + static void from_rados_t(rados_t cluster, Rados &rados); + + int init(const char * const id); + int init2(const char * const name, const char * const clustername, + uint64_t flags); + int init_with_context(config_t cct_); + config_t cct(); + int connect(); + void shutdown(); + int watch_flush(); + int aio_watch_flush(AioCompletion*); + int conf_read_file(const char * const path) const; + int conf_parse_argv(int argc, const char ** argv) const; + int conf_parse_argv_remainder(int argc, const char ** argv, + const char ** remargv) const; + int conf_parse_env(const char *env) const; + int conf_set(const char *option, const char *value); + int conf_get(const char *option, std::string &val); + + int service_daemon_register( + const std::string& service, ///< service name (e.g., 'rgw') + const std::string& name, 
///< daemon name (e.g., 'gwfoo') + const std::map<std::string,std::string>& metadata); ///< static metadata about daemon + int service_daemon_update_status( + std::map<std::string,std::string>&& status); + + int pool_create(const char *name); + int pool_create(const char *name, uint64_t auid) + __attribute__ ((deprecated)); + int pool_create(const char *name, uint64_t auid, uint8_t crush_rule) + __attribute__ ((deprecated)); + int pool_create_with_rule(const char *name, uint8_t crush_rule); + int pool_create_async(const char *name, PoolAsyncCompletion *c); + int pool_create_async(const char *name, uint64_t auid, PoolAsyncCompletion *c) + __attribute__ ((deprecated)); + int pool_create_async(const char *name, uint64_t auid, uint8_t crush_rule, PoolAsyncCompletion *c) + __attribute__ ((deprecated)); + int pool_create_with_rule_async(const char *name, uint8_t crush_rule, PoolAsyncCompletion *c); + int pool_get_base_tier(int64_t pool, int64_t* base_tier); + int pool_delete(const char *name); + int pool_delete_async(const char *name, PoolAsyncCompletion *c); + int64_t pool_lookup(const char *name); + int pool_reverse_lookup(int64_t id, std::string *name); + + uint64_t get_instance_id(); + + int get_min_compatible_osd(int8_t* require_osd_release); + int get_min_compatible_client(int8_t* min_compat_client, + int8_t* require_min_compat_client); + + int mon_command(std::string cmd, const bufferlist& inbl, + bufferlist *outbl, std::string *outs); + int mgr_command(std::string cmd, const bufferlist& inbl, + bufferlist *outbl, std::string *outs); + int osd_command(int osdid, std::string cmd, const bufferlist& inbl, + bufferlist *outbl, std::string *outs); + int pg_command(const char *pgstr, std::string cmd, const bufferlist& inbl, + bufferlist *outbl, std::string *outs); + + int ioctx_create(const char *name, IoCtx &pioctx); + int ioctx_create2(int64_t pool_id, IoCtx &pioctx); + + // Features useful for test cases + void test_blocklist_self(bool set); + + /* pool info */ + int 
pool_list(std::list<std::string>& v); + int pool_list2(std::list<std::pair<int64_t, std::string> >& v); + int get_pool_stats(std::list<std::string>& v, + stats_map& result); + /// deprecated; use simpler form. categories no longer supported. + int get_pool_stats(std::list<std::string>& v, + std::map<std::string, stats_map>& stats); + /// deprecated; categories no longer supported + int get_pool_stats(std::list<std::string>& v, + std::string& category, + std::map<std::string, stats_map>& stats); + /// check if pool has selfmanaged snaps + bool get_pool_is_selfmanaged_snaps_mode(const std::string& poolname); + + int cluster_stat(cluster_stat_t& result); + int cluster_fsid(std::string *fsid); + + /** + * List inconsistent placement groups in the given pool + * + * @param pool_id the pool id + * @param pgs [out] the inconsistent PGs + */ + int get_inconsistent_pgs(int64_t pool_id, + std::vector<PlacementGroup>* pgs); + /** + * List the inconsistent objects found in a given PG by last scrub + * + * @param pg the placement group returned by @c pg_list() + * @param start_after the first returned @c objects + * @param max_return the max number of the returned @c objects + * @param c what to do when the operation is complete and safe + * @param objects [out] the objects where inconsistencies are found + * @param interval [in,out] an epoch indicating current interval + * @returns if a non-zero @c interval is specified, will return -EAGAIN if + * the current interval begin epoch is different.
+ */ + int get_inconsistent_objects(const PlacementGroup& pg, + const object_id_t &start_after, + unsigned max_return, + AioCompletion *c, + std::vector<inconsistent_obj_t>* objects, + uint32_t* interval); + /** + * List the inconsistent snapsets found in a given PG by last scrub + * + * @param pg the placement group returned by @c pg_list() + * @param start_after the first returned @c objects + * @param max_return the max number of the returned @c objects + * @param c what to do when the operation is complete and safe + * @param snapset [out] the objects where inconsistencies are found + * @param interval [in,out] an epoch indicating current interval + * @returns if a non-zero @c interval is specified, will return -EAGAIN if + * the current interval begin epoch is different. + */ + int get_inconsistent_snapsets(const PlacementGroup& pg, + const object_id_t &start_after, + unsigned max_return, + AioCompletion *c, + std::vector<inconsistent_snapset_t>* snapset, + uint32_t* interval); + + /// get/wait for the most recent osdmap + int wait_for_latest_osdmap(); + + int blocklist_add(const std::string& client_address, + uint32_t expire_seconds); + + std::string get_addrs() const; + + /* + * pool aio + * + * It is up to the caller to release the completion handler, even if the pool_create_async() + * and/or pool_delete_async() fails and does not send the async request + */ + static PoolAsyncCompletion *pool_async_create_completion(); + + // -- aio -- + static AioCompletion *aio_create_completion(); + static AioCompletion *aio_create_completion(void *cb_arg, callback_t cb_complete, + callback_t cb_safe) + __attribute__ ((deprecated)); + static AioCompletion *aio_create_completion(void *cb_arg, callback_t cb_complete); + + friend std::ostream& operator<<(std::ostream &oss, const Rados& r); + private: + friend class neorados::RADOS; + + // We don't allow assignment or copying + Rados(const Rados& rhs); + const Rados& operator=(const Rados& rhs); + RadosClient *client; + };
+ +} // namespace v14_2_0 +} // namespace librados + +#endif + diff --git a/src/include/rados/librados_fwd.hpp b/src/include/rados/librados_fwd.hpp new file mode 100644 index 000000000..396f3a838 --- /dev/null +++ b/src/include/rados/librados_fwd.hpp @@ -0,0 +1,34 @@ +#ifndef __LIBRADOS_FWD_HPP +#define __LIBRADOS_FWD_HPP + +struct blkin_trace_info; + +namespace libradosstriper { + +class RadosStriper; + +} // namespace libradosstriper + +namespace librados { +inline namespace v14_2_0 { + +class AioCompletion; +class IoCtx; +class ListObject; +class NObjectIterator; +class ObjectCursor; +class ObjectItem; +class ObjectOperation; +class ObjectOperationCompletion; +class ObjectReadOperation; +class ObjectWriteOperation; +class PlacementGroup; +class PoolAsyncCompletion; +class Rados; +class WatchCtx; +class WatchCtx2; + +} // inline namespace v14_2_0 +} // namespace librados + +#endif // __LIBRADOS_FWD_HPP diff --git a/src/include/rados/librgw.h b/src/include/rados/librgw.h new file mode 100644 index 000000000..c20e96bed --- /dev/null +++ b/src/include/rados/librgw.h @@ -0,0 +1,36 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2011 New Dream Network + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. 
+ * + */ +#ifndef CEPH_LIBRGW_H +#define CEPH_LIBRGW_H + +#ifdef __cplusplus +extern "C" { +#endif + +#define LIBRGW_VER_MAJOR 1 +#define LIBRGW_VER_MINOR 1 +#define LIBRGW_VER_EXTRA 0 + +#define LIBRGW_VERSION(maj, min, extra) ((maj << 16) + (min << 8) + extra) +#define LIBRGW_VERSION_CODE LIBRGW_VERSION(LIBRGW_VER_MAJOR, LIBRGW_VER_MINOR, LIBRGW_VER_EXTRA) + +typedef void* librgw_t; +int librgw_create(librgw_t *rgw, int argc, char **argv); +void librgw_shutdown(librgw_t rgw); + +#ifdef __cplusplus +} +#endif + +#endif /* CEPH_LIBRGW_H */ diff --git a/src/include/rados/objclass.h b/src/include/rados/objclass.h new file mode 100644 index 000000000..80ae69d25 --- /dev/null +++ b/src/include/rados/objclass.h @@ -0,0 +1,177 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#ifndef CEPH_OBJCLASS_OBJCLASS_PUBLIC_H +#define CEPH_OBJCLASS_OBJCLASS_PUBLIC_H + +#ifdef __cplusplus + +#include "buffer.h" + +extern "C" { +#endif + +#define CEPH_CLS_API [[gnu::visibility("default")]] + +#define CLS_VER(maj,min) \ +int __cls_ver__## maj ## _ ##min = 0; \ +int __cls_ver_maj = maj; \ +int __cls_ver_min = min; + +#define CLS_NAME(name) \ +int __cls_name__## name = 0; \ +const char *__cls_name = #name; + +#define CLS_INIT(name) \ +CEPH_CLS_API void __cls_init() + +#define CLS_METHOD_RD 0x1 /// method executes read operations +#define CLS_METHOD_WR 0x2 /// method executes write operations +#define CLS_METHOD_PROMOTE 0x8 /// method cannot be proxied to base tier + +#define CLS_LOG(level, fmt, ...) \ + cls_log(level, "<cls> %s:%d: " fmt, __FILE__, __LINE__, ##__VA_ARGS__) +#define CLS_ERR(fmt, ...) CLS_LOG(0, fmt, ##__VA_ARGS__) + +/** + * Initialize a class. + */ +void __cls_init(); + +/** + * @typedef cls_handle_t + * + * A handle for interacting with the object class. + */ +typedef void *cls_handle_t; + +/** + * @typedef cls_method_handle_t + * + * A handle for interacting with the method of the object class.
+ */ +typedef void *cls_method_handle_t; + +/** + * @typedef cls_method_context_t + * + * A context for the method of the object class. + */ +typedef void* cls_method_context_t; + +/*class utils*/ +extern int cls_log(int level, const char *format, ...) + __attribute__((__format__(printf, 2, 3))); + +/* class registration api */ +extern int cls_register(const char *name, cls_handle_t *handle); + +#ifdef __cplusplus +} + +/** + * @typedef cls_method_cxx_call_t + * + */ +typedef int (*cls_method_cxx_call_t)(cls_method_context_t ctx, + class ceph::buffer::list *inbl, class ceph::buffer::list *outbl); + +/** + * Register a method. + * + * @param hclass + * @param method + * @param flags + * @param class_call + * @param handle + */ +extern int cls_register_cxx_method(cls_handle_t hclass, const char *method, int flags, + cls_method_cxx_call_t class_call, cls_method_handle_t *handle); + +/** + * Create an object. + * + * @param hctx + * @param exclusive + */ +extern int cls_cxx_create(cls_method_context_t hctx, bool exclusive); + +/** + * Remove an object. + * + * @param hctx + */ +extern int cls_cxx_remove(cls_method_context_t hctx); + +/** + * Check on the status of an object. + * + * @param hctx + * @param size + * @param mtime + */ +extern int cls_cxx_stat(cls_method_context_t hctx, uint64_t *size, time_t *mtime); + +/** + * Read contents of an object. + * + * @param hctx + * @param ofs + * @param len + * @param bl + */ +extern int cls_cxx_read(cls_method_context_t hctx, int ofs, int len, ceph::bufferlist *bl); + +/** + * Write to the object. + * + * @param hctx + * @param ofs + * @param len + * @param bl + */ +extern int cls_cxx_write(cls_method_context_t hctx, int ofs, int len, ceph::bufferlist *bl); + +/** + * Get xattr of the object. + * + * @param hctx + * @param name + * @param outbl + */ +extern int cls_cxx_getxattr(cls_method_context_t hctx, const char *name, + ceph::bufferlist *outbl); + +/** + * Set xattr of the object. 
+ * + * @param hctx + * @param name + * @param inbl + */ +extern int cls_cxx_setxattr(cls_method_context_t hctx, const char *name, + ceph::bufferlist *inbl); + +/** + * Get value corresponding to a key from the map. + * + * @param hctx + * @param key + * @param outbl + */ +extern int cls_cxx_map_get_val(cls_method_context_t hctx, + const std::string &key, ceph::bufferlist *outbl); + +/** + * Set value corresponding to a key in the map. + * + * @param hctx + * @param key + * @param inbl + */ +extern int cls_cxx_map_set_val(cls_method_context_t hctx, + const std::string &key, ceph::bufferlist *inbl); + +#endif + +#endif diff --git a/src/include/rados/page.h b/src/include/rados/page.h new file mode 120000 index 000000000..cf983e838 --- /dev/null +++ b/src/include/rados/page.h @@ -0,0 +1 @@ +../page.h
\ No newline at end of file diff --git a/src/include/rados/rados_types.h b/src/include/rados/rados_types.h new file mode 100644 index 000000000..d308341ec --- /dev/null +++ b/src/include/rados/rados_types.h @@ -0,0 +1,41 @@ +#ifndef CEPH_RADOS_TYPES_H +#define CEPH_RADOS_TYPES_H + +#include <stdint.h> + +/** + * @struct obj_watch_t + * One item from list_watchers + */ +struct obj_watch_t { + /// Address of the Watcher + char addr[256]; + /// Watcher ID + int64_t watcher_id; + /// Cookie + uint64_t cookie; + /// Timeout in Seconds + uint32_t timeout_seconds; +}; + +struct notify_ack_t { + uint64_t notifier_id; + uint64_t cookie; + char *payload; + uint64_t payload_len; +}; + +struct notify_timeout_t { + uint64_t notifier_id; + uint64_t cookie; +}; + +/** + * + * Pass as nspace argument to rados_ioctx_set_namespace() + * before calling rados_nobjects_list_open() to return + * all objects in all namespaces. + */ +#define LIBRADOS_ALL_NSPACES "\001" + +#endif diff --git a/src/include/rados/rados_types.hpp b/src/include/rados/rados_types.hpp new file mode 100644 index 000000000..84023579b --- /dev/null +++ b/src/include/rados/rados_types.hpp @@ -0,0 +1,341 @@ +#ifndef CEPH_RADOS_TYPES_HPP +#define CEPH_RADOS_TYPES_HPP + +#include <map> +#include <utility> +#include <vector> +#include <stdint.h> +#include <string> + +#include "buffer.h" +#include "rados_types.h" + +namespace librados { + +typedef uint64_t snap_t; + +enum { + SNAP_HEAD = (uint64_t)(-2), + SNAP_DIR = (uint64_t)(-1) +}; + +struct clone_info_t { + snap_t cloneid; + std::vector<snap_t> snaps; // ascending + std::vector< std::pair<uint64_t,uint64_t> > overlap; // with next newest + uint64_t size; + clone_info_t() : cloneid(0), size(0) {} +}; + +struct snap_set_t { + std::vector<clone_info_t> clones; // ascending + snap_t seq; // newest snapid seen by the object + snap_set_t() : seq(0) {} +}; + +struct object_id_t { + std::string name; + std::string nspace; + std::string locator; + snap_t snap = 0; + 
object_id_t() = default; + object_id_t(const std::string& name, + const std::string& nspace, + const std::string& locator, + snap_t snap) + : name(name), + nspace(nspace), + locator(locator), + snap(snap) + {} +}; + +struct err_t { + enum : uint64_t { + SHARD_MISSING = 1 << 1, + SHARD_STAT_ERR = 1 << 2, + SHARD_READ_ERR = 1 << 3, + DATA_DIGEST_MISMATCH_OI = 1 << 9, // Old + DATA_DIGEST_MISMATCH_INFO = 1 << 9, + OMAP_DIGEST_MISMATCH_OI = 1 << 10, // Old + OMAP_DIGEST_MISMATCH_INFO = 1 << 10, + SIZE_MISMATCH_OI = 1 << 11, // Old + SIZE_MISMATCH_INFO = 1 << 11, + SHARD_EC_HASH_MISMATCH = 1 << 12, + SHARD_EC_SIZE_MISMATCH = 1 << 13, + OI_ATTR_MISSING = 1 << 14, // Old + INFO_MISSING = 1 << 14, + OI_ATTR_CORRUPTED = 1 << 15, // Old + INFO_CORRUPTED = 1 << 15, + SS_ATTR_MISSING = 1 << 16, // Old + SNAPSET_MISSING = 1 << 16, + SS_ATTR_CORRUPTED = 1 << 17, // Old + SNAPSET_CORRUPTED = 1 << 17, + OBJ_SIZE_OI_MISMATCH = 1 << 18, // Old + OBJ_SIZE_INFO_MISMATCH = 1 << 18, + HINFO_MISSING = 1 << 19, + HINFO_CORRUPTED = 1 << 20 + // When adding more here add to either SHALLOW_ERRORS or DEEP_ERRORS + }; + uint64_t errors = 0; + static constexpr uint64_t SHALLOW_ERRORS = SHARD_MISSING|SHARD_STAT_ERR|SIZE_MISMATCH_INFO|INFO_MISSING|INFO_CORRUPTED|SNAPSET_MISSING|SNAPSET_CORRUPTED|OBJ_SIZE_INFO_MISMATCH|HINFO_MISSING|HINFO_CORRUPTED; + static constexpr uint64_t DEEP_ERRORS = SHARD_READ_ERR|DATA_DIGEST_MISMATCH_INFO|OMAP_DIGEST_MISMATCH_INFO|SHARD_EC_HASH_MISMATCH|SHARD_EC_SIZE_MISMATCH; + bool has_shard_missing() const { + return errors & SHARD_MISSING; + } + bool has_stat_error() const { + return errors & SHARD_STAT_ERR; + } + bool has_read_error() const { + return errors & SHARD_READ_ERR; + } + bool has_data_digest_mismatch_oi() const { // Compatibility + return errors & DATA_DIGEST_MISMATCH_OI; + } + bool has_data_digest_mismatch_info() const { + return errors & DATA_DIGEST_MISMATCH_INFO; + } + bool has_omap_digest_mismatch_oi() const { // Compatibility + return errors & 
OMAP_DIGEST_MISMATCH_OI; + } + bool has_omap_digest_mismatch_info() const { + return errors & OMAP_DIGEST_MISMATCH_INFO; + } + bool has_size_mismatch_oi() const { // Compatibility + return errors & SIZE_MISMATCH_OI; + } + bool has_size_mismatch_info() const { + return errors & SIZE_MISMATCH_INFO; + } + bool has_ec_hash_error() const { + return errors & SHARD_EC_HASH_MISMATCH; + } + bool has_ec_size_error() const { + return errors & SHARD_EC_SIZE_MISMATCH; + } + bool has_oi_attr_missing() const { // Compatibility + return errors & OI_ATTR_MISSING; + } + bool has_info_missing() const { + return errors & INFO_MISSING; + } + bool has_oi_attr_corrupted() const { // Compatibility + return errors & OI_ATTR_CORRUPTED; + } + bool has_info_corrupted() const { + return errors & INFO_CORRUPTED; + } + bool has_ss_attr_missing() const { // Compatibility + return errors & SS_ATTR_MISSING; + } + bool has_snapset_missing() const { + return errors & SNAPSET_MISSING; + } + bool has_ss_attr_corrupted() const { // Compatibility + return errors & SS_ATTR_CORRUPTED; + } + bool has_snapset_corrupted() const { + return errors & SNAPSET_CORRUPTED; + } + bool has_shallow_errors() const { + return errors & SHALLOW_ERRORS; + } + bool has_deep_errors() const { + return errors & DEEP_ERRORS; + } + bool has_obj_size_oi_mismatch() const { // Compatibility + return errors & OBJ_SIZE_OI_MISMATCH; + } + bool has_obj_size_info_mismatch() const { + return errors & OBJ_SIZE_INFO_MISMATCH; + } + bool has_hinfo_missing() const { + return errors & HINFO_MISSING; + } + bool has_hinfo_corrupted() const { + return errors & HINFO_CORRUPTED; + } +}; + +struct shard_info_t : err_t { + std::map<std::string, ceph::bufferlist> attrs; + uint64_t size = -1; + bool omap_digest_present = false; + uint32_t omap_digest = 0; + bool data_digest_present = false; + uint32_t data_digest = 0; + bool selected_oi = false; + bool primary = false; +}; + +struct osd_shard_t { + int32_t osd; + int8_t shard; +}; + +inline bool 
operator<(const osd_shard_t &lhs, const osd_shard_t &rhs) { + if (lhs.osd < rhs.osd) + return true; + else if (lhs.osd > rhs.osd) + return false; + else + return lhs.shard < rhs.shard; +} + +struct obj_err_t { + enum : uint64_t { + OBJECT_INFO_INCONSISTENCY = 1 << 1, + // XXX: Can an older rados binary work if these bits stay the same? + DATA_DIGEST_MISMATCH = 1 << 4, + OMAP_DIGEST_MISMATCH = 1 << 5, + SIZE_MISMATCH = 1 << 6, + ATTR_VALUE_MISMATCH = 1 << 7, + ATTR_NAME_MISMATCH = 1 << 8, + SNAPSET_INCONSISTENCY = 1 << 9, + HINFO_INCONSISTENCY = 1 << 10, + SIZE_TOO_LARGE = 1 << 11, + // When adding more here add to either SHALLOW_ERRORS or DEEP_ERRORS + }; + uint64_t errors = 0; + static constexpr uint64_t SHALLOW_ERRORS = OBJECT_INFO_INCONSISTENCY|SIZE_MISMATCH|ATTR_VALUE_MISMATCH + |ATTR_NAME_MISMATCH|SNAPSET_INCONSISTENCY|HINFO_INCONSISTENCY|SIZE_TOO_LARGE; + static constexpr uint64_t DEEP_ERRORS = DATA_DIGEST_MISMATCH|OMAP_DIGEST_MISMATCH; + bool has_object_info_inconsistency() const { + return errors & OBJECT_INFO_INCONSISTENCY; + } + bool has_data_digest_mismatch() const { + return errors & DATA_DIGEST_MISMATCH; + } + bool has_omap_digest_mismatch() const { + return errors & OMAP_DIGEST_MISMATCH; + } + bool has_size_mismatch() const { + return errors & SIZE_MISMATCH; + } + bool has_attr_value_mismatch() const { + return errors & ATTR_VALUE_MISMATCH; + } + bool has_attr_name_mismatch() const { + return errors & ATTR_NAME_MISMATCH; + } + bool has_shallow_errors() const { + return errors & SHALLOW_ERRORS; + } + bool has_deep_errors() const { + return errors & DEEP_ERRORS; + } + bool has_snapset_inconsistency() const { + return errors & SNAPSET_INCONSISTENCY; + } + bool has_hinfo_inconsistency() const { + return errors & HINFO_INCONSISTENCY; + } + bool has_size_too_large() const { + return errors & SIZE_TOO_LARGE; + } +}; + +struct inconsistent_obj_t : obj_err_t { + inconsistent_obj_t() = default; + inconsistent_obj_t(const object_id_t& object) + : object{object}, 
version(0) + {} + object_id_t object; + uint64_t version; // XXX: Redundant with object info attr + std::map<osd_shard_t, shard_info_t> shards; + err_t union_shards; +}; + +struct inconsistent_snapset_t { + inconsistent_snapset_t() = default; + inconsistent_snapset_t(const object_id_t& head) + : object{head} + {} + enum { + SNAPSET_MISSING = 1 << 0, + SNAPSET_CORRUPTED = 1 << 1, + CLONE_MISSING = 1 << 2, + SNAP_ERROR = 1 << 3, + HEAD_MISMATCH = 1 << 4, // Unused + HEADLESS_CLONE = 1 << 5, + SIZE_MISMATCH = 1 << 6, + OI_MISSING = 1 << 7, // Old + INFO_MISSING = 1 << 7, + OI_CORRUPTED = 1 << 8, // Old + INFO_CORRUPTED = 1 << 8, + EXTRA_CLONES = 1 << 9, + }; + uint64_t errors = 0; + object_id_t object; + // Extra clones + std::vector<snap_t> clones; + std::vector<snap_t> missing; + ceph::bufferlist ss_bl; + + bool ss_attr_missing() const { // Compatibility + return errors & SNAPSET_MISSING; + } + bool snapset_missing() const { + return errors & SNAPSET_MISSING; + } + bool ss_attr_corrupted() const { // Compatibility + return errors & SNAPSET_CORRUPTED; + } + bool snapset_corrupted() const { + return errors & SNAPSET_CORRUPTED; + } + bool clone_missing() const { + return errors & CLONE_MISSING; + } + bool snapset_mismatch() const { // Compatibility + return errors & SNAP_ERROR; + } + bool snapset_error() const { + return errors & SNAP_ERROR; + } + bool head_mismatch() const { // Compatibility + return false; + } + bool headless() const { + return errors & HEADLESS_CLONE; + } + bool size_mismatch() const { + return errors & SIZE_MISMATCH; + } + bool oi_attr_missing() const { // Compatibility + return errors & OI_MISSING; + } + bool info_missing() const { + return errors & INFO_MISSING; + } + bool oi_attr_corrupted() const { // Compatibility + return errors & OI_CORRUPTED; + } + bool info_corrupted() const { + return errors & INFO_CORRUPTED; + } + bool extra_clones() const { + return errors & EXTRA_CLONES; + } +}; + +/** + * @var all_nspaces + * Pass as nspace argument 
to IoCtx::set_namespace() + * before calling nobjects_begin() to iterate + * through all objects in all namespaces. + */ +const std::string all_nspaces(LIBRADOS_ALL_NSPACES); + +struct notify_ack_t { + uint64_t notifier_id; + uint64_t cookie; + ceph::bufferlist payload_bl; +}; + +struct notify_timeout_t { + uint64_t notifier_id; + uint64_t cookie; +}; +} +#endif diff --git a/src/include/rados/rgw_file.h b/src/include/rados/rgw_file.h new file mode 100644 index 000000000..e1ea45593 --- /dev/null +++ b/src/include/rados/rgw_file.h @@ -0,0 +1,431 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * convert RGW commands to file commands + * + * Copyright (C) 2015 Red Hat, Inc. + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. + * + */ +#ifndef RADOS_RGW_FILE_H +#define RADOS_RGW_FILE_H + +#include <sys/stat.h> +#include <sys/types.h> +#include <stdint.h> +#include <stdbool.h> + +#include "librgw.h" + +#ifdef __cplusplus +extern "C" { +#endif + +#define LIBRGW_FILE_VER_MAJOR 1 +#define LIBRGW_FILE_VER_MINOR 2 +#define LIBRGW_FILE_VER_EXTRA 0 + +#define LIBRGW_FILE_VERSION(maj, min, extra) ((maj << 16) + (min << 8) + extra) +#define LIBRGW_FILE_VERSION_CODE LIBRGW_FILE_VERSION(LIBRGW_FILE_VER_MAJOR, LIBRGW_FILE_VER_MINOR, LIBRGW_FILE_VER_EXTRA) + +/* + * object types + */ +enum rgw_fh_type { + RGW_FS_TYPE_NIL = 0, + RGW_FS_TYPE_FILE, + RGW_FS_TYPE_DIRECTORY, + RGW_FS_TYPE_SYMBOLIC_LINK, +}; + +/* + * dynamic allocated handle to support nfs handle + */ + +/* content-addressable hash */ +struct rgw_fh_hk { + uint64_t bucket; + uint64_t object; +}; + +struct rgw_file_handle +{ + /* content-addressable hash */ + struct rgw_fh_hk fh_hk; + void *fh_private; /* librgw private data */ + /* object type */ + enum rgw_fh_type fh_type; +}; + +struct rgw_fs +{ + 
librgw_t rgw; + void *fs_private; + struct rgw_file_handle* root_fh; +}; + + +/* XXX mount info hypothetical--emulate Unix, support at least + * UUID-length fsid */ +struct rgw_statvfs { + uint64_t f_bsize; /* file system block size */ + uint64_t f_frsize; /* fragment size */ + uint64_t f_blocks; /* size of fs in f_frsize units */ + uint64_t f_bfree; /* # free blocks */ + uint64_t f_bavail; /* # free blocks for unprivileged users */ + uint64_t f_files; /* # inodes */ + uint64_t f_ffree; /* # free inodes */ + uint64_t f_favail; /* # free inodes for unprivileged users */ + uint64_t f_fsid[2]; /* file system ID */ + uint64_t f_flag; /* mount flags */ + uint64_t f_namemax; /* maximum filename length */ +}; + + +void rgwfile_version(int *major, int *minor, int *extra); + +/* + lookup object by name (POSIX style) +*/ +#define RGW_LOOKUP_FLAG_NONE 0x0000 +#define RGW_LOOKUP_FLAG_CREATE 0x0001 +#define RGW_LOOKUP_FLAG_RCB 0x0002 /* readdir callback hint */ +#define RGW_LOOKUP_FLAG_DIR 0x0004 +#define RGW_LOOKUP_FLAG_FILE 0x0008 + +#define RGW_LOOKUP_TYPE_FLAGS \ + (RGW_LOOKUP_FLAG_DIR|RGW_LOOKUP_FLAG_FILE) + +int rgw_lookup(struct rgw_fs *rgw_fs, + struct rgw_file_handle *parent_fh, const char *path, + struct rgw_file_handle **fh, + struct stat *st, uint32_t mask, uint32_t flags); + +/* + lookup object by handle (NFS style) +*/ +int rgw_lookup_handle(struct rgw_fs *rgw_fs, struct rgw_fh_hk *fh_hk, + struct rgw_file_handle **fh, uint32_t flags); + +/* + * release file handle + */ +#define RGW_FH_RELE_FLAG_NONE 0x0000 + +int rgw_fh_rele(struct rgw_fs *rgw_fs, struct rgw_file_handle *fh, + uint32_t flags); + +/* + attach rgw namespace +*/ +#define RGW_MOUNT_FLAG_NONE 0x0000 + +int rgw_mount(librgw_t rgw, const char *uid, const char *key, + const char *secret, struct rgw_fs **rgw_fs, + uint32_t flags); + +int rgw_mount2(librgw_t rgw, const char *uid, const char *key, + const char *secret, const char *root, struct rgw_fs **rgw_fs, + uint32_t flags); + +/* + register invalidate 
callbacks +*/ +#define RGW_REG_INVALIDATE_FLAG_NONE 0x0000 + +typedef void (*rgw_fh_callback_t)(void *handle, struct rgw_fh_hk fh_hk); + +int rgw_register_invalidate(struct rgw_fs *rgw_fs, rgw_fh_callback_t cb, + void *arg, uint32_t flags); + +/* + detach rgw namespace +*/ +#define RGW_UMOUNT_FLAG_NONE 0x0000 + +int rgw_umount(struct rgw_fs *rgw_fs, uint32_t flags); + + +/* + get filesystem attributes +*/ +#define RGW_STATFS_FLAG_NONE 0x0000 + +int rgw_statfs(struct rgw_fs *rgw_fs, + struct rgw_file_handle *parent_fh, + struct rgw_statvfs *vfs_st, + uint32_t flags); + + +/* XXX (get|set)attr mask bits */ +#define RGW_SETATTR_MODE 1 +#define RGW_SETATTR_UID 2 +#define RGW_SETATTR_GID 4 +#define RGW_SETATTR_MTIME 8 +#define RGW_SETATTR_ATIME 16 +#define RGW_SETATTR_SIZE 32 +#define RGW_SETATTR_CTIME 64 + +/* + create file +*/ +#define RGW_CREATE_FLAG_NONE 0x0000 + +int rgw_create(struct rgw_fs *rgw_fs, struct rgw_file_handle *parent_fh, + const char *name, struct stat *st, uint32_t mask, + struct rgw_file_handle **fh, uint32_t posix_flags, + uint32_t flags); + +/* + create a symbolic link + */ +#define RGW_CREATELINK_FLAG_NONE 0x0000 +int rgw_symlink(struct rgw_fs *rgw_fs, struct rgw_file_handle *parent_fh, + const char *name, const char *link_path, struct stat *st, + uint32_t mask, struct rgw_file_handle **fh, uint32_t posix_flags, + uint32_t flags); + +/* + create a new directory +*/ +#define RGW_MKDIR_FLAG_NONE 0x0000 + +int rgw_mkdir(struct rgw_fs *rgw_fs, + struct rgw_file_handle *parent_fh, + const char *name, struct stat *st, uint32_t mask, + struct rgw_file_handle **fh, uint32_t flags); + +/* + rename object +*/ +#define RGW_RENAME_FLAG_NONE 0x0000 + +int rgw_rename(struct rgw_fs *rgw_fs, + struct rgw_file_handle *olddir, const char* old_name, + struct rgw_file_handle *newdir, const char* new_name, + uint32_t flags); + +/* + remove file or directory +*/ +#define RGW_UNLINK_FLAG_NONE 0x0000 + +int rgw_unlink(struct rgw_fs *rgw_fs, + struct rgw_file_handle 
*parent_fh, const char* path, + uint32_t flags); + +/* + read directory content +*/ +typedef int (*rgw_readdir_cb)(const char *name, void *arg, uint64_t offset, + struct stat *st, uint32_t mask, + uint32_t flags); + +#define RGW_READDIR_FLAG_NONE 0x0000 +#define RGW_READDIR_FLAG_DOTDOT 0x0001 /* send dot names */ + +int rgw_readdir(struct rgw_fs *rgw_fs, + struct rgw_file_handle *parent_fh, uint64_t *offset, + rgw_readdir_cb rcb, void *cb_arg, bool *eof, + uint32_t flags); + +/* enumeration continuing from name */ +int rgw_readdir2(struct rgw_fs *rgw_fs, + struct rgw_file_handle *parent_fh, const char *name, + rgw_readdir_cb rcb, void *cb_arg, bool *eof, + uint32_t flags); + +/* project offset of dirent name */ +#define RGW_DIRENT_OFFSET_FLAG_NONE 0x0000 + +int rgw_dirent_offset(struct rgw_fs *rgw_fs, + struct rgw_file_handle *parent_fh, + const char *name, int64_t *offset, + uint32_t flags); + +/* + get unix attributes for object +*/ +#define RGW_GETATTR_FLAG_NONE 0x0000 + +int rgw_getattr(struct rgw_fs *rgw_fs, + struct rgw_file_handle *fh, struct stat *st, + uint32_t flags); + +/* + set unix attributes for object +*/ +#define RGW_SETATTR_FLAG_NONE 0x0000 + +int rgw_setattr(struct rgw_fs *rgw_fs, + struct rgw_file_handle *fh, struct stat *st, + uint32_t mask, uint32_t flags); + +/* + truncate file +*/ +#define RGW_TRUNCATE_FLAG_NONE 0x0000 + +int rgw_truncate(struct rgw_fs *rgw_fs, + struct rgw_file_handle *fh, uint64_t size, + uint32_t flags); + +/* + open file +*/ +#define RGW_OPEN_FLAG_NONE 0x0000 +#define RGW_OPEN_FLAG_CREATE 0x0001 +#define RGW_OPEN_FLAG_V3 0x0002 /* ops have v3 semantics */ +#define RGW_OPEN_FLAG_STATELESS 0x0002 /* alias it */ + +int rgw_open(struct rgw_fs *rgw_fs, struct rgw_file_handle *parent_fh, + uint32_t posix_flags, uint32_t flags); + +/* + close file +*/ + +#define RGW_CLOSE_FLAG_NONE 0x0000 +#define RGW_CLOSE_FLAG_RELE 0x0001 + +int rgw_close(struct rgw_fs *rgw_fs, struct rgw_file_handle *fh, + uint32_t flags); + +/* + read data 
from file +*/ +#define RGW_READ_FLAG_NONE 0x0000 + +int rgw_read(struct rgw_fs *rgw_fs, + struct rgw_file_handle *fh, uint64_t offset, + size_t length, size_t *bytes_read, void *buffer, + uint32_t flags); + +/* + read symbolic link +*/ +#define RGW_READLINK_FLAG_NONE 0x0000 + +int rgw_readlink(struct rgw_fs *rgw_fs, + struct rgw_file_handle *fh, uint64_t offset, + size_t length, size_t *bytes_read, void *buffer, + uint32_t flags); + +/* + write data to file +*/ +#define RGW_WRITE_FLAG_NONE 0x0000 + +int rgw_write(struct rgw_fs *rgw_fs, + struct rgw_file_handle *fh, uint64_t offset, + size_t length, size_t *bytes_written, void *buffer, + uint32_t flags); + +#define RGW_UIO_NONE 0x0000 +#define RGW_UIO_GIFT 0x0001 +#define RGW_UIO_FREE 0x0002 +#define RGW_UIO_BUFQ 0x0004 + +struct rgw_uio; +typedef void (*rgw_uio_release)(struct rgw_uio *, uint32_t); + +/* buffer vector descriptors */ +struct rgw_vio { + void *vio_p1; + void *vio_u1; + void *vio_base; + int32_t vio_len; +}; + +struct rgw_uio { + rgw_uio_release uio_rele; + void *uio_p1; + void *uio_u1; + uint64_t uio_offset; + uint64_t uio_resid; + uint32_t uio_cnt; + uint32_t uio_flags; + struct rgw_vio *uio_vio; /* appended vectors */ +}; + +typedef struct rgw_uio rgw_uio; + +int rgw_readv(struct rgw_fs *rgw_fs, + struct rgw_file_handle *fh, rgw_uio *uio, uint32_t flags); + +int rgw_writev(struct rgw_fs *rgw_fs, + struct rgw_file_handle *fh, rgw_uio *uio, uint32_t flags); + +/* + sync written data +*/ +#define RGW_FSYNC_FLAG_NONE 0x0000 + +int rgw_fsync(struct rgw_fs *rgw_fs, struct rgw_file_handle *fh, + uint32_t flags); + +/* + NFS commit operation +*/ + +#define RGW_COMMIT_FLAG_NONE 0x0000 + +int rgw_commit(struct rgw_fs *rgw_fs, struct rgw_file_handle *fh, + uint64_t offset, uint64_t length, uint32_t flags); + +/* + extended attributes + */ +typedef struct rgw_xattrstr +{ + char *val; + uint32_t len; +} rgw_xattrstr; + +typedef struct rgw_xattr +{ + rgw_xattrstr key; + rgw_xattrstr val; +} rgw_xattr; + +typedef 
struct rgw_xattrlist +{ + rgw_xattr *xattrs; + uint32_t xattr_cnt; +} rgw_xattrlist; + +#define RGW_GETXATTR_FLAG_NONE 0x0000 + +typedef int (*rgw_getxattr_cb)(rgw_xattrlist *attrs, void *arg, + uint32_t flags); + +int rgw_getxattrs(struct rgw_fs *rgw_fs, struct rgw_file_handle *fh, + rgw_xattrlist *attrs, rgw_getxattr_cb cb, void *cb_arg, + uint32_t flags); + +#define RGW_LSXATTR_FLAG_NONE 0x0000 +#define RGW_LSXATTR_FLAG_STOP 0x0001 + +int rgw_lsxattrs(struct rgw_fs *rgw_fs, struct rgw_file_handle *fh, + rgw_xattrstr *filter_prefix /* unimplemented for now */, + rgw_getxattr_cb cb, void *cb_arg, uint32_t flags); + +#define RGW_SETXATTR_FLAG_NONE 0x0000 + +int rgw_setxattrs(struct rgw_fs *rgw_fs, struct rgw_file_handle *fh, + rgw_xattrlist *attrs, uint32_t flags); + +#define RGW_RMXATTR_FLAG_NONE 0x0000 + +int rgw_rmxattrs(struct rgw_fs *rgw_fs, struct rgw_file_handle *fh, + rgw_xattrlist *attrs, uint32_t flags); + +#ifdef __cplusplus +} +#endif + +#endif /* RADOS_RGW_FILE_H */ diff --git a/src/include/radosstriper/libradosstriper.h b/src/include/radosstriper/libradosstriper.h new file mode 100644 index 000000000..a35345f7d --- /dev/null +++ b/src/include/radosstriper/libradosstriper.h @@ -0,0 +1,620 @@ +#ifndef CEPH_LIBRADOSSTRIPER_H +#define CEPH_LIBRADOSSTRIPER_H + +#ifdef __cplusplus +extern "C" { +#endif + +#include <string.h> + +#include "../rados/librados.h" + +#define LIBRADOSSTRIPER_VER_MAJOR 0 +#define LIBRADOSSTRIPER_VER_MINOR 0 +#define LIBRADOSSTRIPER_VER_EXTRA 0 + +#define LIBRADOSSTRIPER_VERSION(maj, min, extra) ((maj << 16) + (min << 8) + extra) + +#define LIBRADOSSTRIPER_VERSION_CODE LIBRADOSSTRIPER_VERSION(LIBRADOSSTRIPER_VER_MAJOR, LIBRADOSSTRIPER_VER_MINOR, LIBRADOSSTRIPER_VER_EXTRA) + +/** + * @typedef rados_striper_t + * + * A handle for interacting with striped objects in a RADOS cluster. 
+ */ +typedef void *rados_striper_t; + +/** + * @defgroup libradosstriper_h_init Setup and Teardown + * These are the first and last functions that should be called + * when using libradosstriper. + * + * @{ + */ + +/** + * Creates a rados striper using the given io context + * Striper has initially default object layout. + * See rados_striper_set_object_layout_*() to change this + * + * @param ioctx the rados context to use + * @param striper where to store the rados striper + * @returns 0 on success, negative error code on failure + */ + int rados_striper_create(rados_ioctx_t ioctx, + rados_striper_t *striper); + +/** + * Destroys a rados striper + * + * @param striper the striper to destroy + */ +void rados_striper_destroy(rados_striper_t striper); + +/** + * Sets the object layout's stripe unit of a rados striper for future objects. + * This layout will be used when new objects are created (by writing to them) + * Already existing objects will be opened with their own layout. + * + * @param striper the targeted striper + * @param stripe_unit the stripe_unit value of the new object layout + * @returns 0 on success, negative error code on failure + */ +int rados_striper_set_object_layout_stripe_unit(rados_striper_t striper, + unsigned int stripe_unit); + +/** + * Sets the object layout's stripe count of a rados striper for future objects. + * This layout will be used when new objects are created (by writing to them) + * Already existing objects will be opened with their own layout. + * + * @param striper the targeted striper + * @param stripe_count the stripe_count value of the new object layout + * @returns 0 on success, negative error code on failure + */ +int rados_striper_set_object_layout_stripe_count(rados_striper_t striper, + unsigned int stripe_count); + +/** + * Sets the object layout's object_size of a rados striper for future objects.
+ * This layout will be used when new objects are created (by writing to them) + * Already existing objects will be opened with their own layout. + * + * @param striper the targeted striper + * @param object_size the object_size value of the new object layout + * @returns 0 on success, negative error code on failure + */ +int rados_striper_set_object_layout_object_size(rados_striper_t striper, + unsigned int object_size); + +/** @} init */ + +/** + * @defgroup libradosstriper_h_synch_io Synchronous I/O + * Writes are striped to several rados objects which are then + * replicated to a number of OSDs based on the configuration + * of the pool they are in. These write functions block + * until data is in memory on all replicas of the object they're + * writing to - they are equivalent to doing the corresponding + * asynchronous write, and the calling + * rados_striper_ioctx_wait_for_complete(). + * + * @{ + */ + +/** + * Synchronously write data to a striped object at the specified offset + * + * @param striper the striper in which the write will occur + * @param soid the name of the striped object + * @param buf data to write + * @param len length of the data, in bytes + * @param off byte offset in the object to begin writing at + * @returns 0 on success, negative error code on failure + * failure + */ +int rados_striper_write(rados_striper_t striper, + const char *soid, + const char *buf, + size_t len, + uint64_t off); + +/** + * Synchronously write an entire striped object + * + * The striped object is filled with the provided data. If the striped object exists, + * it is truncated and then written. 
+ * + * @param striper the striper in which the write will occur + * @param soid the name of the striped object + * @param buf data to write + * @param len length of the data, in bytes + * @returns 0 on success, negative error code on failure + */ +int rados_striper_write_full(rados_striper_t striper, + const char *soid, + const char *buf, + size_t len); + +/** + * Append data to an object + * + * @param striper the striper in which the write will occur + * @param soid the name of the striped object + * @param buf the data to append + * @param len length of buf (in bytes) + * @returns 0 on success, negative error code on failure + * failure + */ +int rados_striper_append(rados_striper_t striper, + const char *soid, + const char *buf, + size_t len); + +/** + * Synchronously read data from a striped object at the specified offset + * + * @param striper the striper in which the read will occur + * @param soid the name of the striped object + * @param buf where to store the results + * @param len the number of bytes to read + * @param off the offset to start reading from in the object + * @returns number of bytes read on success, negative error code on + * failure + */ +int rados_striper_read(rados_striper_t striper, + const char *soid, + char *buf, + size_t len, + uint64_t off); + +/** + * Synchronously removes a striped object + * + * @note There is no atomicity of the deletion and the striped + * object may be left incomplete if an error is returned (metadata + * all present, but some stripes missing) + * However, there is a atomicity of the metadata deletion and + * the deletion can not happen if any I/O is ongoing (it + * will return EBUSY). 
Identically, no I/O will be able to start + * during deletion (same EBUSY return code) + * @param striper the striper in which the remove will occur + * @param soid the name of the striped object + * @returns 0 on success, negative error code on failure + */ +int rados_striper_remove(rados_striper_t striper, + const char* soid); + +/** + * Resize an object + * + * If this enlarges the object, the new area is logically filled with + * zeroes. If this shrinks the object, the excess data is removed. + * + * @note the truncation is not fully atomic. The metadata part is, + * so the behavior will be atomic from user point of view when + * the object size is reduced. However, in case of failure, old data + * may stay around, hidden. They may reappear if the object size is + * later grown, instead of the expected 0s. When growing the + * object and in case of failure, the new 0 data may not be + * fully created. This can lead to ENOENT errors when + * writing/reading the missing parts. + * @note the truncation can not happen if any I/O is ongoing (it + * will return EBUSY). Identically, no I/O will be able to start + * during truncation (same EBUSY return code) + * @param striper the striper in which the truncation will occur + * @param soid the name of the striped object + * @param size the new size of the object in bytes + * @returns 0 on success, negative error code on failure + */ +int rados_striper_trunc(rados_striper_t striper, const char *soid, uint64_t size); + +/** @} Synchronous I/O */ + +/** + * @defgroup libradosstriper_h_xattrs Xattrs + * Extended attributes are stored as extended attributes on the + * first rados regular object of the striped object. + * Thus, they have the same limitations as the underlying + * rados extended attributes. + * + * @{ + */ + +/** + * Get the value of an extended attribute on a striped object.
+ * + * @param striper the striper in which the getxattr will occur + * @param oid name of the striped object + * @param name which extended attribute to read + * @param buf where to store the result + * @param len size of buf in bytes + * @returns length of xattr value on success, negative error code on failure + */ +int rados_striper_getxattr(rados_striper_t striper, + const char *oid, + const char *name, + char *buf, + size_t len); + +/** + * Set an extended attribute on a striped object. + * + * @param striper the striper in which the setxattr will occur + * @param oid name of the object + * @param name which extended attribute to set + * @param buf what to store in the xattr + * @param len the number of bytes in buf + * @returns 0 on success, negative error code on failure + */ +int rados_striper_setxattr(rados_striper_t striper, + const char *oid, + const char *name, + const char *buf, + size_t len); + +/** + * Delete an extended attribute from a striped object. + * + * @param striper the striper in which the rmxattr will occur + * @param oid name of the object + * @param name which xattr to delete + * @returns 0 on success, negative error code on failure + */ +int rados_striper_rmxattr(rados_striper_t striper, + const char *oid, + const char *name); + +/** + * Start iterating over xattrs on a striped object. + * + * @post iter is a valid iterator + * + * @param striper the striper in which the getxattrs will occur + * @param oid name of the object + * @param iter where to store the iterator + * @returns 0 on success, negative error code on failure + */ +int rados_striper_getxattrs(rados_striper_t striper, + const char *oid, + rados_xattrs_iter_t *iter); + +/** + * Get the next xattr on the striped object + * + * @pre iter is a valid iterator + * + * @post name is the NULL-terminated name of the next xattr, and val + * contains the value of the xattr, which is of length len. If the end + * of the list has been reached, name and val are NULL, and len is 0. 
+ * + * @param iter iterator to advance + * @param name where to store the name of the next xattr + * @param val where to store the value of the next xattr + * @param len the number of bytes in val + * @returns 0 on success, negative error code on failure + */ +int rados_striper_getxattrs_next(rados_xattrs_iter_t iter, + const char **name, + const char **val, + size_t *len); + +/** + * Close the xattr iterator. + * + * iter should not be used after this is called. + * + * @param iter the iterator to close + */ +void rados_striper_getxattrs_end(rados_xattrs_iter_t iter); + +/** @} Xattrs */ + +/** + * Synchronously get object stats (size/mtime) + * + * @param striper the striper in which the stat will occur + * @param soid the id of the striped object + * @param psize where to store object size + * @param pmtime where to store modification time + * @returns 0 on success, negative error code on failure + */ +int rados_striper_stat(rados_striper_t striper, + const char* soid, + uint64_t *psize, + time_t *pmtime); + +int rados_striper_stat2(rados_striper_t striper, + const char* soid, + uint64_t *psize, + struct timespec *pmtime); + +/** + * @defgroup libradosstriper_h_asynch_io Asynchronous I/O + * Read and write to objects without blocking. + * + * @{ + */ + +/** + * @typedef rados_striper_multi_completion_t + * Represents the state of a set of asynchronous operations + * it contains the aggregated return value once the operations complete + * and can be used to block until all operations are complete and/or safe. + */ +typedef void *rados_striper_multi_completion_t; + +/** + * Constructs a multi completion to use with asynchronous operations + * + * The complete and safe callbacks correspond to operations being + * acked and committed, respectively. The callbacks are called in + * order of receipt, so the safe callback may be triggered before the + * complete callback, and vice versa. This is affected by journalling + * on the OSDs. 
+ * + * @note Read operations only get a complete callback. + * @note BUG: this should check for ENOMEM instead of throwing an exception + * + * @param cb_arg application-defined data passed to the callback functions + * @param cb_complete the function to be called when the operation is + * in memory on all replicas + * @param cb_safe the function to be called when the operation is on + * stable storage on all replicas + * @param pc where to store the completion + * @returns 0 + */ +int rados_striper_multi_aio_create_completion(void *cb_arg, + rados_callback_t cb_complete, + rados_callback_t cb_safe, + rados_striper_multi_completion_t *pc); + +/** + * Block until all operations complete + * + * This means data is in memory on all replicas. + * + * @param c operations to wait for + * @returns 0 + */ +void rados_striper_multi_aio_wait_for_complete(rados_striper_multi_completion_t c); + +/** + * Block until all operations are safe + * + * This means data is on stable storage on all replicas. + * + * @param c operations to wait for + * @returns 0 + */ +void rados_striper_multi_aio_wait_for_safe(rados_striper_multi_completion_t c); + +/** + * Has a multi asynchronous operation completed? + * + * @warning This does not imply that the complete callback has + * finished + * + * @param c async operations to inspect + * @returns whether c is complete + */ +int rados_striper_multi_aio_is_complete(rados_striper_multi_completion_t c); + +/** + * Is a multi asynchronous operation safe? + * + * @warning This does not imply that the safe callback has + * finished + * + * @param c async operations to inspect + * @returns whether c is safe + */ +int rados_striper_multi_aio_is_safe(rados_striper_multi_completion_t c); + +/** + * Block until all operations complete and callback completes + * + * This means data is in memory on all replicas and can be read.
+ * + * @param c operations to wait for + * @returns 0 + */ +void rados_striper_multi_aio_wait_for_complete_and_cb(rados_striper_multi_completion_t c); + +/** + * Block until all operations are safe and callback has completed + * + * This means data is on stable storage on all replicas. + * + * @param c operations to wait for + * @returns 0 + */ +void rados_striper_multi_aio_wait_for_safe_and_cb(rados_striper_multi_completion_t c); + +/** + * Has a multi asynchronous operation and callback completed + * + * @param c async operations to inspect + * @returns whether c is complete + */ +int rados_striper_multi_aio_is_complete_and_cb(rados_striper_multi_completion_t c); + +/** + * Is a multi asynchronous operation safe and has the callback completed + * + * @param c async operations to inspect + * @returns whether c is safe + */ +int rados_striper_multi_aio_is_safe_and_cb(rados_striper_multi_completion_t c); + +/** + * Get the return value of a multi asynchronous operation + * + * The return value is set when all operations are complete or safe, + * whichever comes first. + * + * @pre The operation is safe or complete + * + * @note BUG: complete callback may never be called when the safe + * message is received before the complete message + * + * @param c async operations to inspect + * @returns aggregated return value of the operations + */ +int rados_striper_multi_aio_get_return_value(rados_striper_multi_completion_t c); + +/** + * Release a multi asynchronous IO completion + * + * Call this when you no longer need the completion. It may not be + * freed immediately if the operation is not acked and committed. + * + * @param c multi completion to release + */ +void rados_striper_multi_aio_release(rados_striper_multi_completion_t c); + +/** + * Asynchronously write data to a striped object at the specified offset + * + * The return value of the completion will be 0 on success, negative + * error code on failure.
+ * + * @param striper the striper in which the write will occur + * @param soid the name of the striped object + * @param completion what to do when the write is safe and complete + * @param buf data to write + * @param len length of the data, in bytes + * @param off byte offset in the object to begin writing at + * @returns 0 on success, negative error code on + * failure + */ +int rados_striper_aio_write(rados_striper_t striper, + const char *soid, + rados_completion_t completion, + const char *buf, + size_t len, + uint64_t off); + +/** + * Asynchronously appends data to a striped object + * + * The return value of the completion will be 0 on success, negative + * error code on failure. + * + * @param striper the striper in which the write will occur + * @param soid the name of the striped object + * @param completion what to do when the write is safe and complete + * @param buf data to write + * @param len length of the data, in bytes + * @returns 0 on success, negative error code on + * failure + */ +int rados_striper_aio_append(rados_striper_t striper, + const char *soid, + rados_completion_t completion, + const char *buf, + size_t len); + +/** + * Asynchronously fills an object with the provided data. + * If the object exists, it is truncated and then written. + * + * The return value of the completion will be 0 on success, negative + * error code on failure.
+ * + * @param striper the striper in which the write will occur + * @param soid the name of the striped object + * @param completion what to do when the write is safe and complete + * @param buf data to write + * @param len length of the data, in bytes + * @returns 0 on success, negative error code on + * failure + */ +int rados_striper_aio_write_full(rados_striper_t striper, + const char *soid, + rados_completion_t completion, + const char *buf, + size_t len); + +/** + * Asynchronously read data from a striped object at the specified offset + * + * The return value of the completion will be the number of bytes read on + * success, negative error code on failure. + * + * @param striper the striper in which the read will occur + * @param soid the name of the striped object + * @param completion what to do when the read is safe and complete + * @param buf where to store the results + * @param len the number of bytes to read + * @param off the offset to start reading from in the object + * @returns 0 on success, negative error code on + * failure + */ +int rados_striper_aio_read(rados_striper_t striper, + const char *soid, + rados_completion_t completion, + char *buf, + const size_t len, + uint64_t off); + +/** + * Asynchronously removes a striped object + * + * @note There is no atomicity of the deletion and the striped + * object may be left incomplete if an error is returned (metadata + * all present, but some stripes missing) + * However, there is atomicity of the metadata deletion and + * the deletion cannot happen if any I/O is ongoing (it + * will return EBUSY).
Identically, no I/O will be able to start + * during deletion (same EBUSY return code) + * @param striper the striper in which the remove will occur + * @param soid the name of the striped object + * @param completion what to do when the remove is safe and complete + * @returns 0 on success, negative error code on failure + */ + +int rados_striper_aio_remove(rados_striper_t striper, + const char* soid, + rados_completion_t completion); + +/** + * Block until all pending writes in a striper are safe + * + * This is not equivalent to calling rados_striper_multi_aio_wait_for_safe() on all + * write completions, since this waits for the associated callbacks to + * complete as well. + * + * @param striper the striper in which the flush will occur + * @note no value is returned (the function is declared void) +*/ +void rados_striper_aio_flush(rados_striper_t striper); + +/** + * Asynchronously get object stats (size/mtime) + * + * @param striper the striper in which the stat will occur + * @param soid the id of the striped object + * @param psize where to store object size + * @param pmtime where to store modification time + * @param completion what to do when the stat is complete + * @returns 0 on success, negative error code on failure + */ +int rados_striper_aio_stat(rados_striper_t striper, + const char* soid, + rados_completion_t completion, + uint64_t *psize, + time_t *pmtime); + +int rados_striper_aio_stat2(rados_striper_t striper, + const char* soid, + rados_completion_t completion, + uint64_t *psize, + struct timespec *pmtime); +/** @} Asynchronous I/O */ + +#ifdef __cplusplus +} +#endif + +#endif diff --git a/src/include/radosstriper/libradosstriper.hpp b/src/include/radosstriper/libradosstriper.hpp new file mode 100644 index 000000000..fb790b0d7 --- /dev/null +++ b/src/include/radosstriper/libradosstriper.hpp @@ -0,0 +1,241 @@ +#ifndef __LIBRADOSSTRIPER_HPP +#define __LIBRADOSSTRIPER_HPP + +#include <string.h> +#include <string> +#include <map> +#include
"../rados/buffer.h" +#include "../rados/librados.hpp" + +#include "libradosstriper.h" + +namespace libradosstriper +{ + struct RadosStriperImpl; + struct MultiAioCompletionImpl; + + /* + * Completion object for multiple asynchronous IO + * It allows several "requests" to be handled internally + */ + struct MultiAioCompletion { + MultiAioCompletion(MultiAioCompletionImpl *pc_) : pc(pc_) {} + ~MultiAioCompletion(); + int set_complete_callback(void *cb_arg, librados::callback_t cb); + int set_safe_callback(void *cb_arg, librados::callback_t cb) __attribute__ ((deprecated)); + void wait_for_complete(); + void wait_for_safe() __attribute__ ((deprecated)); + void wait_for_complete_and_cb(); + void wait_for_safe_and_cb() __attribute__ ((deprecated)); + bool is_complete(); + bool is_safe() __attribute__ ((deprecated)); + bool is_complete_and_cb(); + bool is_safe_and_cb() __attribute__ ((deprecated)); + int get_return_value(); + void release(); + MultiAioCompletionImpl *pc; + }; + + /* RadosStriper : This class allows performing reads/writes on striped objects + * + * Typical use (error checking omitted): + * + * RadosStriper rs; + * RadosStriper::striper_create(ioctx, &rs); // ioctx: a librados::IoCtx + * bufferlist bl; + * ... put data in bl ... + * rs.write(object_name, bl, len, offset); + * bufferlist bl2; + * rs.read(object_name, &bl2, len, offset); + * ...
+ */ + class RadosStriper + { + public: + + /* + * constructor + */ + RadosStriper(); + + /* + * builds the C counterpart of a RadosStriper + */ + static void to_rados_striper_t(RadosStriper &striper, + rados_striper_t *s); + + /* + * copy constructor + */ + RadosStriper(const RadosStriper& rs); + + /* + * operator= + */ + RadosStriper& operator=(const RadosStriper& rs); + + /* + * destructor + * Internally calls close() if an object is currently opened + */ + ~RadosStriper(); + + /* + * create method + */ + static int striper_create(librados::IoCtx& ioctx, + RadosStriper *striper); + + /* + * set object layout's stripe unit + * This layout will be used when new objects are created (by writing to them) + * Already existing objects will be opened with their own layout. + */ + int set_object_layout_stripe_unit(unsigned int stripe_unit); + + /* + * set object layout's stripe count + * This layout will be used when new objects are created (by writing to them) + * Already existing objects will be opened with their own layout. + */ + int set_object_layout_stripe_count(unsigned int stripe_count); + + /* + * set object layout's object size + * This layout will be used when new objects are created (by writing to them) + * Already existing objects will be opened with their own layout. + */ + int set_object_layout_object_size(unsigned int object_size); + + /** + * Get the value of an extended attribute on a striped object + */ + int getxattr(const std::string& oid, const char *name, ceph::bufferlist& bl); + + /** + * Set the value of an extended attribute on a striped object + */ + int setxattr(const std::string& oid, const char *name, ceph::bufferlist& bl); + + /** + * Delete an extended attribute from a striped object + */ + int rmxattr(const std::string& oid, const char *name); + + /** + * Start iterating over xattrs on a striped object.
+ */ + int getxattrs(const std::string& oid, + std::map<std::string, ceph::bufferlist>& attrset); + + /** + * synchronously write to the striped object at the specified offset. + * NOTE: this call steals the contents of @p bl. + */ + int write(const std::string& soid, const ceph::bufferlist& bl, size_t len, uint64_t off); + + /** + * synchronously fill the striped object with the specified data + * NOTE: this call steals the contents of @p bl. + */ + int write_full(const std::string& soid, const ceph::bufferlist& bl); + + /** + * synchronously append data to the striped object + * NOTE: this call steals the contents of @p bl. + */ + int append(const std::string& soid, const ceph::bufferlist& bl, size_t len); + + /** + * asynchronously write to the striped object at the specified offset. + * NOTE: this call steals the contents of @p bl. + */ + int aio_write(const std::string& soid, librados::AioCompletion *c, const ceph::bufferlist& bl, size_t len, uint64_t off); + + /** + * asynchronously fill the striped object with the specified data + * NOTE: this call steals the contents of @p bl. + */ + int aio_write_full(const std::string& soid, librados::AioCompletion *c, const ceph::bufferlist& bl); + + /** + * asynchronously append data to the striped object + * NOTE: this call steals the contents of @p bl. + */ + int aio_append(const std::string& soid, librados::AioCompletion *c, const ceph::bufferlist& bl, size_t len); + + /** + * synchronously read from the striped object at the specified offset. + */ + int read(const std::string& soid, ceph::bufferlist* pbl, size_t len, uint64_t off); + + /** + * asynchronously read from the striped object at the specified offset.
+ */ + int aio_read(const std::string& soid, librados::AioCompletion *c, ceph::bufferlist *pbl, size_t len, uint64_t off); + + /** + * synchronously get striped object stats (size/mtime) + */ + int stat(const std::string& soid, uint64_t *psize, time_t *pmtime); + int stat2(const std::string& soid, uint64_t *psize, struct timespec *pts); + + /** + * asynchronously get striped object stats (size/mtime) + */ + int aio_stat(const std::string& soid, librados::AioCompletion *c, + uint64_t *psize, time_t *pmtime); + int aio_stat2(const std::string& soid, librados::AioCompletion *c, + uint64_t *psize, struct timespec *pts); + + /** + * deletes a striped object. + * There is no atomicity of the deletion and the striped + * object may be left incomplete if an error is returned (metadata + * all present, but some stripes missing) + * However, there is atomicity of the metadata deletion and + * the deletion cannot happen if any I/O is ongoing (it + * will return EBUSY). Identically, no I/O will be able to start + * during deletion (same EBUSY return code) + */ + int remove(const std::string& soid); + int remove(const std::string& soid, int flags); + + /** + * asynchronous remove of striped objects + * See synchronous version for comments on (lack of) atomicity + */ + int aio_remove(const std::string& soid, librados::AioCompletion *c); + int aio_remove(const std::string& soid, librados::AioCompletion *c, int flags); + + /** + * Resizes a striped object + * the truncation cannot happen if any I/O is ongoing (it + * will return EBUSY). Identically, no I/O will be able to start + * during truncation (same EBUSY return code) + */ + int trunc(const std::string& oid, uint64_t size); + + /** + * Wait for all currently pending aio writes to be safe.
+ * + * @returns 0 on success, negative error code on failure + */ + int aio_flush(); + + /** + * creation of multi aio completion objects + */ + static MultiAioCompletion *multi_aio_create_completion(); + static MultiAioCompletion *multi_aio_create_completion(void *cb_arg, + librados::callback_t cb_complete, + librados::callback_t cb_safe); + + private: + RadosStriperImpl *rados_striper_impl; + + }; + +} + +#endif |