summaryrefslogtreecommitdiffstats
path: root/include/linux/ceph
diff options
context:
space:
mode:
Diffstat (limited to 'include/linux/ceph')
-rw-r--r--include/linux/ceph/auth.h190
-rw-r--r--include/linux/ceph/buffer.h39
-rw-r--r--include/linux/ceph/ceph_debug.h39
-rw-r--r--include/linux/ceph/ceph_features.h224
-rw-r--r--include/linux/ceph/ceph_frag.h75
-rw-r--r--include/linux/ceph/ceph_fs.h895
-rw-r--r--include/linux/ceph/ceph_hash.h14
-rw-r--r--include/linux/ceph/cls_lock_client.h58
-rw-r--r--include/linux/ceph/debugfs.h14
-rw-r--r--include/linux/ceph/decode.h398
-rw-r--r--include/linux/ceph/libceph.h332
-rw-r--r--include/linux/ceph/mdsmap.h72
-rw-r--r--include/linux/ceph/messenger.h587
-rw-r--r--include/linux/ceph/mon_client.h152
-rw-r--r--include/linux/ceph/msgpool.h27
-rw-r--r--include/linux/ceph/msgr.h234
-rw-r--r--include/linux/ceph/osd_client.h562
-rw-r--r--include/linux/ceph/osdmap.h339
-rw-r--r--include/linux/ceph/pagelist.h72
-rw-r--r--include/linux/ceph/rados.h551
-rw-r--r--include/linux/ceph/string_table.h63
-rw-r--r--include/linux/ceph/striper.h71
-rw-r--r--include/linux/ceph/types.h31
23 files changed, 5039 insertions, 0 deletions
diff --git a/include/linux/ceph/auth.h b/include/linux/ceph/auth.h
new file mode 100644
index 000000000..6b138fa97
--- /dev/null
+++ b/include/linux/ceph/auth.h
@@ -0,0 +1,190 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _FS_CEPH_AUTH_H
+#define _FS_CEPH_AUTH_H
+
+#include <linux/ceph/types.h>
+#include <linux/ceph/buffer.h>
+
+/*
+ * Abstract interface for communicating with the authenticate module.
+ * There is some handshake that takes place between us and the monitor
+ * to acquire the necessary keys. These are used to generate an
+ * 'authorizer' that we use when connecting to a service (mds, osd).
+ */
+
+struct ceph_auth_client;
+struct ceph_msg;
+
+struct ceph_authorizer {
+ void (*destroy)(struct ceph_authorizer *);
+};
+
+struct ceph_auth_handshake {
+ struct ceph_authorizer *authorizer;
+ void *authorizer_buf;
+ size_t authorizer_buf_len;
+ void *authorizer_reply_buf;
+ size_t authorizer_reply_buf_len;
+ int (*sign_message)(struct ceph_auth_handshake *auth,
+ struct ceph_msg *msg);
+ int (*check_message_signature)(struct ceph_auth_handshake *auth,
+ struct ceph_msg *msg);
+};
+
+struct ceph_auth_client_ops {
+ /*
+ * true if we are authenticated and can connect to
+ * services.
+ */
+ int (*is_authenticated)(struct ceph_auth_client *ac);
+
+ /*
+ * true if we should (re)authenticate, e.g., when our tickets
+ * are getting old and crusty.
+ */
+ int (*should_authenticate)(struct ceph_auth_client *ac);
+
+ /*
+ * build requests and process replies during monitor
+ * handshake. if handle_reply returns -EAGAIN, we build
+ * another request.
+ */
+ int (*build_request)(struct ceph_auth_client *ac, void *buf, void *end);
+ int (*handle_reply)(struct ceph_auth_client *ac, u64 global_id,
+ void *buf, void *end, u8 *session_key,
+ int *session_key_len, u8 *con_secret,
+ int *con_secret_len);
+
+ /*
+ * Create authorizer for connecting to a service, and verify
+ * the response to authenticate the service.
+ */
+ int (*create_authorizer)(struct ceph_auth_client *ac, int peer_type,
+ struct ceph_auth_handshake *auth);
+ /* ensure that an existing authorizer is up to date */
+ int (*update_authorizer)(struct ceph_auth_client *ac, int peer_type,
+ struct ceph_auth_handshake *auth);
+ int (*add_authorizer_challenge)(struct ceph_auth_client *ac,
+ struct ceph_authorizer *a,
+ void *challenge_buf,
+ int challenge_buf_len);
+ int (*verify_authorizer_reply)(struct ceph_auth_client *ac,
+ struct ceph_authorizer *a,
+ void *reply, int reply_len,
+ u8 *session_key, int *session_key_len,
+ u8 *con_secret, int *con_secret_len);
+ void (*invalidate_authorizer)(struct ceph_auth_client *ac,
+ int peer_type);
+
+ /* reset when we (re)connect to a monitor */
+ void (*reset)(struct ceph_auth_client *ac);
+
+ void (*destroy)(struct ceph_auth_client *ac);
+
+ int (*sign_message)(struct ceph_auth_handshake *auth,
+ struct ceph_msg *msg);
+ int (*check_message_signature)(struct ceph_auth_handshake *auth,
+ struct ceph_msg *msg);
+};
+
+struct ceph_auth_client {
+ u32 protocol; /* CEPH_AUTH_* */
+ void *private; /* for use by protocol implementation */
+ const struct ceph_auth_client_ops *ops; /* null iff protocol==0 */
+
+ bool negotiating; /* true if negotiating protocol */
+ const char *name; /* entity name */
+ u64 global_id; /* our unique id in system */
+ const struct ceph_crypto_key *key; /* our secret key */
+ unsigned want_keys; /* which services we want */
+
+ int preferred_mode; /* CEPH_CON_MODE_* */
+ int fallback_mode; /* ditto */
+
+ struct mutex mutex;
+};
+
+void ceph_auth_set_global_id(struct ceph_auth_client *ac, u64 global_id);
+
+struct ceph_auth_client *ceph_auth_init(const char *name,
+ const struct ceph_crypto_key *key,
+ const int *con_modes);
+extern void ceph_auth_destroy(struct ceph_auth_client *ac);
+
+extern void ceph_auth_reset(struct ceph_auth_client *ac);
+
+extern int ceph_auth_build_hello(struct ceph_auth_client *ac,
+ void *buf, size_t len);
+extern int ceph_handle_auth_reply(struct ceph_auth_client *ac,
+ void *buf, size_t len,
+ void *reply_buf, size_t reply_len);
+int ceph_auth_entity_name_encode(const char *name, void **p, void *end);
+
+extern int ceph_build_auth(struct ceph_auth_client *ac,
+ void *msg_buf, size_t msg_len);
+extern int ceph_auth_is_authenticated(struct ceph_auth_client *ac);
+
+int __ceph_auth_get_authorizer(struct ceph_auth_client *ac,
+ struct ceph_auth_handshake *auth,
+ int peer_type, bool force_new,
+ int *proto, int *pref_mode, int *fallb_mode);
+void ceph_auth_destroy_authorizer(struct ceph_authorizer *a);
+int ceph_auth_add_authorizer_challenge(struct ceph_auth_client *ac,
+ struct ceph_authorizer *a,
+ void *challenge_buf,
+ int challenge_buf_len);
+int ceph_auth_verify_authorizer_reply(struct ceph_auth_client *ac,
+ struct ceph_authorizer *a,
+ void *reply, int reply_len,
+ u8 *session_key, int *session_key_len,
+ u8 *con_secret, int *con_secret_len);
+extern void ceph_auth_invalidate_authorizer(struct ceph_auth_client *ac,
+ int peer_type);
+
+static inline int ceph_auth_sign_message(struct ceph_auth_handshake *auth,
+ struct ceph_msg *msg)
+{
+ if (auth->sign_message)
+ return auth->sign_message(auth, msg);
+ return 0;
+}
+
+static inline
+int ceph_auth_check_message_signature(struct ceph_auth_handshake *auth,
+ struct ceph_msg *msg)
+{
+ if (auth->check_message_signature)
+ return auth->check_message_signature(auth, msg);
+ return 0;
+}
+
+int ceph_auth_get_request(struct ceph_auth_client *ac, void *buf, int buf_len);
+int ceph_auth_handle_reply_more(struct ceph_auth_client *ac, void *reply,
+ int reply_len, void *buf, int buf_len);
+int ceph_auth_handle_reply_done(struct ceph_auth_client *ac,
+ u64 global_id, void *reply, int reply_len,
+ u8 *session_key, int *session_key_len,
+ u8 *con_secret, int *con_secret_len);
+bool ceph_auth_handle_bad_method(struct ceph_auth_client *ac,
+ int used_proto, int result,
+ const int *allowed_protos, int proto_cnt,
+ const int *allowed_modes, int mode_cnt);
+
+int ceph_auth_get_authorizer(struct ceph_auth_client *ac,
+ struct ceph_auth_handshake *auth,
+ int peer_type, void *buf, int *buf_len);
+int ceph_auth_handle_svc_reply_more(struct ceph_auth_client *ac,
+ struct ceph_auth_handshake *auth,
+ void *reply, int reply_len,
+ void *buf, int *buf_len);
+int ceph_auth_handle_svc_reply_done(struct ceph_auth_client *ac,
+ struct ceph_auth_handshake *auth,
+ void *reply, int reply_len,
+ u8 *session_key, int *session_key_len,
+ u8 *con_secret, int *con_secret_len);
+bool ceph_auth_handle_bad_authorizer(struct ceph_auth_client *ac,
+ int peer_type, int used_proto, int result,
+ const int *allowed_protos, int proto_cnt,
+ const int *allowed_modes, int mode_cnt);
+
+#endif
diff --git a/include/linux/ceph/buffer.h b/include/linux/ceph/buffer.h
new file mode 100644
index 000000000..11cdc7c60
--- /dev/null
+++ b/include/linux/ceph/buffer.h
@@ -0,0 +1,39 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef __FS_CEPH_BUFFER_H
+#define __FS_CEPH_BUFFER_H
+
+#include <linux/kref.h>
+#include <linux/mm.h>
+#include <linux/vmalloc.h>
+#include <linux/types.h>
+#include <linux/uio.h>
+
+/*
+ * a simple reference counted buffer.
+ *
+ * use kmalloc for smaller sizes, vmalloc for larger sizes.
+ */
+struct ceph_buffer {
+ struct kref kref;
+ struct kvec vec;
+ size_t alloc_len;
+};
+
+extern struct ceph_buffer *ceph_buffer_new(size_t len, gfp_t gfp);
+extern void ceph_buffer_release(struct kref *kref);
+
+static inline struct ceph_buffer *ceph_buffer_get(struct ceph_buffer *b)
+{
+ kref_get(&b->kref);
+ return b;
+}
+
+static inline void ceph_buffer_put(struct ceph_buffer *b)
+{
+ if (b)
+ kref_put(&b->kref, ceph_buffer_release);
+}
+
+extern int ceph_decode_buffer(struct ceph_buffer **b, void **p, void *end);
+
+#endif
diff --git a/include/linux/ceph/ceph_debug.h b/include/linux/ceph/ceph_debug.h
new file mode 100644
index 000000000..d5a5da838
--- /dev/null
+++ b/include/linux/ceph/ceph_debug.h
@@ -0,0 +1,39 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _FS_CEPH_DEBUG_H
+#define _FS_CEPH_DEBUG_H
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/string.h>
+
+#ifdef CONFIG_CEPH_LIB_PRETTYDEBUG
+
+/*
+ * wrap pr_debug to include a filename:lineno prefix on each line.
+ * this incurs some overhead (kernel size and execution time) due to
+ * the extra function call at each call site.
+ */
+
+# if defined(DEBUG) || defined(CONFIG_DYNAMIC_DEBUG)
+# define dout(fmt, ...) \
+ pr_debug("%.*s %12.12s:%-4d : " fmt, \
+ 8 - (int)sizeof(KBUILD_MODNAME), " ", \
+ kbasename(__FILE__), __LINE__, ##__VA_ARGS__)
+# else
+/* faux printk call just to see any compiler warnings. */
+# define dout(fmt, ...) do { \
+ if (0) \
+ printk(KERN_DEBUG fmt, ##__VA_ARGS__); \
+ } while (0)
+# endif
+
+#else
+
+/*
+ * or, just wrap pr_debug
+ */
+# define dout(fmt, ...) pr_debug(" " fmt, ##__VA_ARGS__)
+
+#endif
+
+#endif
diff --git a/include/linux/ceph/ceph_features.h b/include/linux/ceph/ceph_features.h
new file mode 100644
index 000000000..3a47acd9c
--- /dev/null
+++ b/include/linux/ceph/ceph_features.h
@@ -0,0 +1,224 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef __CEPH_FEATURES
+#define __CEPH_FEATURES
+
+/*
+ * Each time we reclaim bits for reuse we need to specify another bit
+ * that, if present, indicates we have the new incarnation of that
+ * feature. Base case is 1 (first use).
+ */
+#define CEPH_FEATURE_INCARNATION_1 (0ull)
+#define CEPH_FEATURE_INCARNATION_2 (1ull<<57) // SERVER_JEWEL
+#define CEPH_FEATURE_INCARNATION_3 ((1ull<<57)|(1ull<<28)) // SERVER_MIMIC
+
+#define DEFINE_CEPH_FEATURE(bit, incarnation, name) \
+ static const uint64_t __maybe_unused CEPH_FEATURE_##name = (1ULL<<bit); \
+ static const uint64_t __maybe_unused CEPH_FEATUREMASK_##name = \
+ (1ULL<<bit | CEPH_FEATURE_INCARNATION_##incarnation);
+
+/* this bit is ignored but still advertised by release *when* */
+#define DEFINE_CEPH_FEATURE_DEPRECATED(bit, incarnation, name, when) \
+ static const uint64_t __maybe_unused DEPRECATED_CEPH_FEATURE_##name = (1ULL<<bit); \
+ static const uint64_t __maybe_unused DEPRECATED_CEPH_FEATUREMASK_##name = \
+ (1ULL<<bit | CEPH_FEATURE_INCARNATION_##incarnation);
+
+/*
+ * this bit is ignored by release *unused* and not advertised by
+ * release *unadvertised*
+ */
+#define DEFINE_CEPH_FEATURE_RETIRED(bit, inc, name, unused, unadvertised)
+
+
+/*
+ * test for a feature. this test is safer than a typical mask against
+ * the bit because it ensures that we have the bit AND the marker for the
+ * bit's incarnation. this must be used in any case where the features
+ * bits may include an old meaning of the bit.
+ */
+#define CEPH_HAVE_FEATURE(x, name) \
+ (((x) & (CEPH_FEATUREMASK_##name)) == (CEPH_FEATUREMASK_##name))
+
+
+/*
+ * Notes on deprecation:
+ *
+ * A *major* release is a release through which all upgrades must pass
+ * (e.g., jewel). For example, no pre-jewel server will ever talk to
+ * a post-jewel server (mon, osd, etc).
+ *
+ * For feature bits used *only* on the server-side:
+ *
+ * - In the first phase we indicate that a feature is DEPRECATED as of
+ * a particular release. This is the first major release X (say,
+ * jewel) that does not depend on its peers advertising the feature.
+ * That is, it safely assumes its peers all have the feature. We
+ * indicate this with the DEPRECATED macro. For example,
+ *
+ * DEFINE_CEPH_FEATURE_DEPRECATED( 2, 1, MONCLOCKCHECK, JEWEL)
+ *
+ * because 10.2.z (jewel) did not care if its peers advertised this
+ * feature bit.
+ *
+ * - In the second phase we stop advertising the bit and call it
+ * RETIRED. This can normally be done in the *next* major release
+ * following the one in which we marked the feature DEPRECATED. In
+ * the above example, for 12.0.z (luminous) we can say:
+ *
+ * DEFINE_CEPH_FEATURE_RETIRED( 2, 1, MONCLOCKCHECK, JEWEL, LUMINOUS)
+ *
+ * - The bit can be reused in the first post-luminous release, 13.0.z
+ * (m).
+ *
+ * This ensures that no two versions who have different meanings for
+ * the bit ever speak to each other.
+ */
+
+DEFINE_CEPH_FEATURE( 0, 1, UID)
+DEFINE_CEPH_FEATURE( 1, 1, NOSRCADDR)
+DEFINE_CEPH_FEATURE_RETIRED( 2, 1, MONCLOCKCHECK, JEWEL, LUMINOUS)
+DEFINE_CEPH_FEATURE( 2, 3, SERVER_NAUTILUS)
+DEFINE_CEPH_FEATURE( 3, 1, FLOCK)
+DEFINE_CEPH_FEATURE( 4, 1, SUBSCRIBE2)
+DEFINE_CEPH_FEATURE( 5, 1, MONNAMES)
+DEFINE_CEPH_FEATURE( 6, 1, RECONNECT_SEQ)
+DEFINE_CEPH_FEATURE( 7, 1, DIRLAYOUTHASH)
+DEFINE_CEPH_FEATURE( 8, 1, OBJECTLOCATOR)
+DEFINE_CEPH_FEATURE( 9, 1, PGID64)
+DEFINE_CEPH_FEATURE(10, 1, INCSUBOSDMAP)
+DEFINE_CEPH_FEATURE(11, 1, PGPOOL3)
+DEFINE_CEPH_FEATURE(12, 1, OSDREPLYMUX)
+DEFINE_CEPH_FEATURE(13, 1, OSDENC)
+DEFINE_CEPH_FEATURE_RETIRED(14, 1, OMAP, HAMMER, JEWEL)
+DEFINE_CEPH_FEATURE(14, 2, SERVER_KRAKEN)
+DEFINE_CEPH_FEATURE(15, 1, MONENC)
+DEFINE_CEPH_FEATURE_RETIRED(16, 1, QUERY_T, JEWEL, LUMINOUS)
+
+DEFINE_CEPH_FEATURE_RETIRED(17, 1, INDEP_PG_MAP, JEWEL, LUMINOUS)
+
+DEFINE_CEPH_FEATURE(18, 1, CRUSH_TUNABLES)
+DEFINE_CEPH_FEATURE_RETIRED(19, 1, CHUNKY_SCRUB, JEWEL, LUMINOUS)
+
+DEFINE_CEPH_FEATURE_RETIRED(20, 1, MON_NULLROUTE, JEWEL, LUMINOUS)
+
+DEFINE_CEPH_FEATURE_RETIRED(21, 1, MON_GV, HAMMER, JEWEL)
+DEFINE_CEPH_FEATURE(21, 2, SERVER_LUMINOUS)
+DEFINE_CEPH_FEATURE(21, 2, RESEND_ON_SPLIT) // overlap
+DEFINE_CEPH_FEATURE(21, 2, RADOS_BACKOFF) // overlap
+DEFINE_CEPH_FEATURE(21, 2, OSDMAP_PG_UPMAP) // overlap
+DEFINE_CEPH_FEATURE(21, 2, CRUSH_CHOOSE_ARGS) // overlap
+DEFINE_CEPH_FEATURE_RETIRED(22, 1, BACKFILL_RESERVATION, JEWEL, LUMINOUS)
+
+DEFINE_CEPH_FEATURE(23, 1, MSG_AUTH)
+DEFINE_CEPH_FEATURE_RETIRED(24, 1, RECOVERY_RESERVATION, JEWEL, LUNINOUS)
+
+DEFINE_CEPH_FEATURE(25, 1, CRUSH_TUNABLES2)
+DEFINE_CEPH_FEATURE(26, 1, CREATEPOOLID)
+DEFINE_CEPH_FEATURE(27, 1, REPLY_CREATE_INODE)
+DEFINE_CEPH_FEATURE_RETIRED(28, 1, OSD_HBMSGS, HAMMER, JEWEL)
+DEFINE_CEPH_FEATURE(28, 2, SERVER_MIMIC)
+DEFINE_CEPH_FEATURE(29, 1, MDSENC)
+DEFINE_CEPH_FEATURE(30, 1, OSDHASHPSPOOL)
+DEFINE_CEPH_FEATURE(31, 1, MON_SINGLE_PAXOS) // deprecate me
+DEFINE_CEPH_FEATURE_RETIRED(32, 1, OSD_SNAPMAPPER, JEWEL, LUMINOUS)
+
+DEFINE_CEPH_FEATURE_RETIRED(33, 1, MON_SCRUB, JEWEL, LUMINOUS)
+
+DEFINE_CEPH_FEATURE_RETIRED(34, 1, OSD_PACKED_RECOVERY, JEWEL, LUMINOUS)
+
+DEFINE_CEPH_FEATURE(35, 1, OSD_CACHEPOOL)
+DEFINE_CEPH_FEATURE(36, 1, CRUSH_V2)
+DEFINE_CEPH_FEATURE(37, 1, EXPORT_PEER)
+DEFINE_CEPH_FEATURE(38, 1, OSD_ERASURE_CODES)
+DEFINE_CEPH_FEATURE(38, 1, OSD_OSD_TMAP2OMAP) // overlap
+DEFINE_CEPH_FEATURE(39, 1, OSDMAP_ENC)
+DEFINE_CEPH_FEATURE(40, 1, MDS_INLINE_DATA)
+DEFINE_CEPH_FEATURE(41, 1, CRUSH_TUNABLES3)
+DEFINE_CEPH_FEATURE(41, 1, OSD_PRIMARY_AFFINITY) // overlap
+DEFINE_CEPH_FEATURE(42, 1, MSGR_KEEPALIVE2)
+DEFINE_CEPH_FEATURE(43, 1, OSD_POOLRESEND)
+DEFINE_CEPH_FEATURE(44, 1, ERASURE_CODE_PLUGINS_V2)
+DEFINE_CEPH_FEATURE_RETIRED(45, 1, OSD_SET_ALLOC_HINT, JEWEL, LUMINOUS)
+
+DEFINE_CEPH_FEATURE(46, 1, OSD_FADVISE_FLAGS)
+DEFINE_CEPH_FEATURE_RETIRED(46, 1, OSD_REPOP, JEWEL, LUMINOUS) // overlap
+DEFINE_CEPH_FEATURE_RETIRED(46, 1, OSD_OBJECT_DIGEST, JEWEL, LUMINOUS) // overlap
+DEFINE_CEPH_FEATURE_RETIRED(46, 1, OSD_TRANSACTION_MAY_LAYOUT, JEWEL, LUMINOUS) // overlap
+
+DEFINE_CEPH_FEATURE(47, 1, MDS_QUOTA)
+DEFINE_CEPH_FEATURE(48, 1, CRUSH_V4)
+DEFINE_CEPH_FEATURE_RETIRED(49, 1, OSD_MIN_SIZE_RECOVERY, JEWEL, LUMINOUS)
+DEFINE_CEPH_FEATURE_RETIRED(49, 1, OSD_PROXY_FEATURES, JEWEL, LUMINOUS) // overlap
+
+DEFINE_CEPH_FEATURE(50, 1, MON_METADATA)
+DEFINE_CEPH_FEATURE(51, 1, OSD_BITWISE_HOBJ_SORT)
+DEFINE_CEPH_FEATURE(52, 1, OSD_PROXY_WRITE_FEATURES)
+DEFINE_CEPH_FEATURE(53, 1, ERASURE_CODE_PLUGINS_V3)
+DEFINE_CEPH_FEATURE(54, 1, OSD_HITSET_GMT)
+DEFINE_CEPH_FEATURE(55, 1, HAMMER_0_94_4)
+DEFINE_CEPH_FEATURE(56, 1, NEW_OSDOP_ENCODING)
+DEFINE_CEPH_FEATURE(57, 1, MON_STATEFUL_SUB)
+DEFINE_CEPH_FEATURE(57, 1, MON_ROUTE_OSDMAP) // overlap
+DEFINE_CEPH_FEATURE(57, 1, OSDSUBOP_NO_SNAPCONTEXT) // overlap
+DEFINE_CEPH_FEATURE(57, 1, SERVER_JEWEL) // overlap
+DEFINE_CEPH_FEATURE(58, 1, CRUSH_TUNABLES5)
+DEFINE_CEPH_FEATURE(58, 1, NEW_OSDOPREPLY_ENCODING) // overlap
+DEFINE_CEPH_FEATURE(58, 1, FS_FILE_LAYOUT_V2) // overlap
+DEFINE_CEPH_FEATURE(59, 1, FS_BTIME)
+DEFINE_CEPH_FEATURE(59, 1, FS_CHANGE_ATTR) // overlap
+DEFINE_CEPH_FEATURE(59, 1, MSG_ADDR2) // overlap
+DEFINE_CEPH_FEATURE(60, 1, OSD_RECOVERY_DELETES) // *do not share this bit*
+DEFINE_CEPH_FEATURE(61, 1, CEPHX_V2) // *do not share this bit*
+
+DEFINE_CEPH_FEATURE(62, 1, RESERVED) // do not use; used as a sentinal
+DEFINE_CEPH_FEATURE_DEPRECATED(63, 1, RESERVED_BROKEN, LUMINOUS) // client-facing
+
+
+/*
+ * Features supported.
+ */
+#define CEPH_FEATURES_SUPPORTED_DEFAULT \
+ (CEPH_FEATURE_NOSRCADDR | \
+ CEPH_FEATURE_SERVER_NAUTILUS | \
+ CEPH_FEATURE_FLOCK | \
+ CEPH_FEATURE_SUBSCRIBE2 | \
+ CEPH_FEATURE_MONNAMES | \
+ CEPH_FEATURE_RECONNECT_SEQ | \
+ CEPH_FEATURE_DIRLAYOUTHASH | \
+ CEPH_FEATURE_PGID64 | \
+ CEPH_FEATURE_PGPOOL3 | \
+ CEPH_FEATURE_OSDENC | \
+ CEPH_FEATURE_MONENC | \
+ CEPH_FEATURE_CRUSH_TUNABLES | \
+ CEPH_FEATURE_SERVER_LUMINOUS | \
+ CEPH_FEATURE_RESEND_ON_SPLIT | \
+ CEPH_FEATURE_RADOS_BACKOFF | \
+ CEPH_FEATURE_OSDMAP_PG_UPMAP | \
+ CEPH_FEATURE_CRUSH_CHOOSE_ARGS | \
+ CEPH_FEATURE_MSG_AUTH | \
+ CEPH_FEATURE_CRUSH_TUNABLES2 | \
+ CEPH_FEATURE_REPLY_CREATE_INODE | \
+ CEPH_FEATURE_SERVER_MIMIC | \
+ CEPH_FEATURE_MDSENC | \
+ CEPH_FEATURE_OSDHASHPSPOOL | \
+ CEPH_FEATURE_OSD_CACHEPOOL | \
+ CEPH_FEATURE_CRUSH_V2 | \
+ CEPH_FEATURE_EXPORT_PEER | \
+ CEPH_FEATURE_OSDMAP_ENC | \
+ CEPH_FEATURE_MDS_INLINE_DATA | \
+ CEPH_FEATURE_CRUSH_TUNABLES3 | \
+ CEPH_FEATURE_OSD_PRIMARY_AFFINITY | \
+ CEPH_FEATURE_MSGR_KEEPALIVE2 | \
+ CEPH_FEATURE_OSD_POOLRESEND | \
+ CEPH_FEATURE_MDS_QUOTA | \
+ CEPH_FEATURE_CRUSH_V4 | \
+ CEPH_FEATURE_NEW_OSDOP_ENCODING | \
+ CEPH_FEATURE_SERVER_JEWEL | \
+ CEPH_FEATURE_MON_STATEFUL_SUB | \
+ CEPH_FEATURE_CRUSH_TUNABLES5 | \
+ CEPH_FEATURE_NEW_OSDOPREPLY_ENCODING | \
+ CEPH_FEATURE_MSG_ADDR2 | \
+ CEPH_FEATURE_CEPHX_V2)
+
+#define CEPH_FEATURES_REQUIRED_DEFAULT 0
+
+#endif
diff --git a/include/linux/ceph/ceph_frag.h b/include/linux/ceph/ceph_frag.h
new file mode 100644
index 000000000..97bab0adc
--- /dev/null
+++ b/include/linux/ceph/ceph_frag.h
@@ -0,0 +1,75 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef FS_CEPH_FRAG_H
+#define FS_CEPH_FRAG_H
+
+/*
+ * "Frags" are a way to describe a subset of a 32-bit number space,
+ * using a mask and a value to match against that mask. Any given frag
+ * (subset of the number space) can be partitioned into 2^n sub-frags.
+ *
+ * Frags are encoded into a 32-bit word:
+ * 8 upper bits = "bits"
+ * 24 lower bits = "value"
+ * (We could go to 5+27 bits, but who cares.)
+ *
+ * We use the _most_ significant bits of the 24 bit value. This makes
+ * values logically sort.
+ *
+ * Unfortunately, because the "bits" field is still in the high bits, we
+ * can't sort encoded frags numerically. However, it does allow you
+ * to feed encoded frags as values into frag_contains_value.
+ */
+static inline __u32 ceph_frag_make(__u32 b, __u32 v)
+{
+ return (b << 24) |
+ (v & (0xffffffu << (24-b)) & 0xffffffu);
+}
+static inline __u32 ceph_frag_bits(__u32 f)
+{
+ return f >> 24;
+}
+static inline __u32 ceph_frag_value(__u32 f)
+{
+ return f & 0xffffffu;
+}
+static inline __u32 ceph_frag_mask(__u32 f)
+{
+ return (0xffffffu << (24-ceph_frag_bits(f))) & 0xffffffu;
+}
+static inline __u32 ceph_frag_mask_shift(__u32 f)
+{
+ return 24 - ceph_frag_bits(f);
+}
+
+static inline bool ceph_frag_contains_value(__u32 f, __u32 v)
+{
+ return (v & ceph_frag_mask(f)) == ceph_frag_value(f);
+}
+
+static inline __u32 ceph_frag_make_child(__u32 f, int by, int i)
+{
+ int newbits = ceph_frag_bits(f) + by;
+ return ceph_frag_make(newbits,
+ ceph_frag_value(f) | (i << (24 - newbits)));
+}
+static inline bool ceph_frag_is_leftmost(__u32 f)
+{
+ return ceph_frag_value(f) == 0;
+}
+static inline bool ceph_frag_is_rightmost(__u32 f)
+{
+ return ceph_frag_value(f) == ceph_frag_mask(f);
+}
+static inline __u32 ceph_frag_next(__u32 f)
+{
+ return ceph_frag_make(ceph_frag_bits(f),
+ ceph_frag_value(f) + (0x1000000 >> ceph_frag_bits(f)));
+}
+
+/*
+ * comparator to sort frags logically, as when traversing the
+ * number space in ascending order...
+ */
+int ceph_frag_compare(__u32 a, __u32 b);
+
+#endif
diff --git a/include/linux/ceph/ceph_fs.h b/include/linux/ceph/ceph_fs.h
new file mode 100644
index 000000000..49586ff26
--- /dev/null
+++ b/include/linux/ceph/ceph_fs.h
@@ -0,0 +1,895 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * ceph_fs.h - Ceph constants and data types to share between kernel and
+ * user space.
+ *
+ * Most types in this file are defined as little-endian, and are
+ * primarily intended to describe data structures that pass over the
+ * wire or that are stored on disk.
+ *
+ * LGPL2
+ */
+
+#ifndef CEPH_FS_H
+#define CEPH_FS_H
+
+#include <linux/ceph/msgr.h>
+#include <linux/ceph/rados.h>
+
+/*
+ * subprotocol versions. when specific messages types or high-level
+ * protocols change, bump the affected components. we keep rev
+ * internal cluster protocols separately from the public,
+ * client-facing protocol.
+ */
+#define CEPH_OSDC_PROTOCOL 24 /* server/client */
+#define CEPH_MDSC_PROTOCOL 32 /* server/client */
+#define CEPH_MONC_PROTOCOL 15 /* server/client */
+
+
+#define CEPH_INO_ROOT 1
+#define CEPH_INO_CEPH 2 /* hidden .ceph dir */
+#define CEPH_INO_GLOBAL_SNAPREALM 3 /* global dummy snaprealm */
+
+/* arbitrary limit on max # of monitors (cluster of 3 is typical) */
+#define CEPH_MAX_MON 31
+
+/*
+ * legacy ceph_file_layoute
+ */
+struct ceph_file_layout_legacy {
+ /* file -> object mapping */
+ __le32 fl_stripe_unit; /* stripe unit, in bytes. must be multiple
+ of page size. */
+ __le32 fl_stripe_count; /* over this many objects */
+ __le32 fl_object_size; /* until objects are this big, then move to
+ new objects */
+ __le32 fl_cas_hash; /* UNUSED. 0 = none; 1 = sha256 */
+
+ /* pg -> disk layout */
+ __le32 fl_object_stripe_unit; /* UNUSED. for per-object parity, if any */
+
+ /* object -> pg layout */
+ __le32 fl_unused; /* unused; used to be preferred primary for pg (-1 for none) */
+ __le32 fl_pg_pool; /* namespace, crush ruleset, rep level */
+} __attribute__ ((packed));
+
+struct ceph_string;
+/*
+ * ceph_file_layout - describe data layout for a file/inode
+ */
+struct ceph_file_layout {
+ /* file -> object mapping */
+ u32 stripe_unit; /* stripe unit, in bytes */
+ u32 stripe_count; /* over this many objects */
+ u32 object_size; /* until objects are this big */
+ s64 pool_id; /* rados pool id */
+ struct ceph_string __rcu *pool_ns; /* rados pool namespace */
+};
+
+extern int ceph_file_layout_is_valid(const struct ceph_file_layout *layout);
+extern void ceph_file_layout_from_legacy(struct ceph_file_layout *fl,
+ struct ceph_file_layout_legacy *legacy);
+extern void ceph_file_layout_to_legacy(struct ceph_file_layout *fl,
+ struct ceph_file_layout_legacy *legacy);
+
+#define CEPH_MIN_STRIPE_UNIT 65536
+
+struct ceph_dir_layout {
+ __u8 dl_dir_hash; /* see ceph_hash.h for ids */
+ __u8 dl_unused1;
+ __u16 dl_unused2;
+ __u32 dl_unused3;
+} __attribute__ ((packed));
+
+/* crypto algorithms */
+#define CEPH_CRYPTO_NONE 0x0
+#define CEPH_CRYPTO_AES 0x1
+
+#define CEPH_AES_IV "cephsageyudagreg"
+
+/* security/authentication protocols */
+#define CEPH_AUTH_UNKNOWN 0x0
+#define CEPH_AUTH_NONE 0x1
+#define CEPH_AUTH_CEPHX 0x2
+
+#define CEPH_AUTH_MODE_NONE 0
+#define CEPH_AUTH_MODE_AUTHORIZER 1
+#define CEPH_AUTH_MODE_MON 10
+
+/* msgr2 protocol modes */
+#define CEPH_CON_MODE_UNKNOWN 0x0
+#define CEPH_CON_MODE_CRC 0x1
+#define CEPH_CON_MODE_SECURE 0x2
+
+#define CEPH_AUTH_UID_DEFAULT ((__u64) -1)
+
+const char *ceph_auth_proto_name(int proto);
+const char *ceph_con_mode_name(int mode);
+
+/*********************************************
+ * message layer
+ */
+
+/*
+ * message types
+ */
+
+/* misc */
+#define CEPH_MSG_SHUTDOWN 1
+#define CEPH_MSG_PING 2
+
+/* client <-> monitor */
+#define CEPH_MSG_MON_MAP 4
+#define CEPH_MSG_MON_GET_MAP 5
+#define CEPH_MSG_STATFS 13
+#define CEPH_MSG_STATFS_REPLY 14
+#define CEPH_MSG_MON_SUBSCRIBE 15
+#define CEPH_MSG_MON_SUBSCRIBE_ACK 16
+#define CEPH_MSG_AUTH 17
+#define CEPH_MSG_AUTH_REPLY 18
+#define CEPH_MSG_MON_GET_VERSION 19
+#define CEPH_MSG_MON_GET_VERSION_REPLY 20
+
+/* client <-> mds */
+#define CEPH_MSG_MDS_MAP 21
+#define CEPH_MSG_FS_MAP_USER 103
+
+#define CEPH_MSG_CLIENT_SESSION 22
+#define CEPH_MSG_CLIENT_RECONNECT 23
+
+#define CEPH_MSG_CLIENT_REQUEST 24
+#define CEPH_MSG_CLIENT_REQUEST_FORWARD 25
+#define CEPH_MSG_CLIENT_REPLY 26
+#define CEPH_MSG_CLIENT_METRICS 29
+#define CEPH_MSG_CLIENT_CAPS 0x310
+#define CEPH_MSG_CLIENT_LEASE 0x311
+#define CEPH_MSG_CLIENT_SNAP 0x312
+#define CEPH_MSG_CLIENT_CAPRELEASE 0x313
+#define CEPH_MSG_CLIENT_QUOTA 0x314
+
+/* pool ops */
+#define CEPH_MSG_POOLOP_REPLY 48
+#define CEPH_MSG_POOLOP 49
+
+/* mon commands */
+#define CEPH_MSG_MON_COMMAND 50
+#define CEPH_MSG_MON_COMMAND_ACK 51
+
+/* osd */
+#define CEPH_MSG_OSD_MAP 41
+#define CEPH_MSG_OSD_OP 42
+#define CEPH_MSG_OSD_OPREPLY 43
+#define CEPH_MSG_WATCH_NOTIFY 44
+#define CEPH_MSG_OSD_BACKOFF 61
+
+
+/* watch-notify operations */
+enum {
+ CEPH_WATCH_EVENT_NOTIFY = 1, /* notifying watcher */
+ CEPH_WATCH_EVENT_NOTIFY_COMPLETE = 2, /* notifier notified when done */
+ CEPH_WATCH_EVENT_DISCONNECT = 3, /* we were disconnected */
+};
+
+
+struct ceph_mon_request_header {
+ __le64 have_version;
+ __le16 session_mon;
+ __le64 session_mon_tid;
+} __attribute__ ((packed));
+
+struct ceph_mon_statfs {
+ struct ceph_mon_request_header monhdr;
+ struct ceph_fsid fsid;
+ __u8 contains_data_pool;
+ __le64 data_pool;
+} __attribute__ ((packed));
+
+struct ceph_statfs {
+ __le64 kb, kb_used, kb_avail;
+ __le64 num_objects;
+} __attribute__ ((packed));
+
+struct ceph_mon_statfs_reply {
+ struct ceph_fsid fsid;
+ __le64 version;
+ struct ceph_statfs st;
+} __attribute__ ((packed));
+
+struct ceph_mon_command {
+ struct ceph_mon_request_header monhdr;
+ struct ceph_fsid fsid;
+ __le32 num_strs; /* always 1 */
+ __le32 str_len;
+ char str[];
+} __attribute__ ((packed));
+
+struct ceph_osd_getmap {
+ struct ceph_mon_request_header monhdr;
+ struct ceph_fsid fsid;
+ __le32 start;
+} __attribute__ ((packed));
+
+struct ceph_mds_getmap {
+ struct ceph_mon_request_header monhdr;
+ struct ceph_fsid fsid;
+} __attribute__ ((packed));
+
+struct ceph_client_mount {
+ struct ceph_mon_request_header monhdr;
+} __attribute__ ((packed));
+
+#define CEPH_SUBSCRIBE_ONETIME 1 /* i want only 1 update after have */
+
+struct ceph_mon_subscribe_item {
+ __le64 start;
+ __u8 flags;
+} __attribute__ ((packed));
+
+struct ceph_mon_subscribe_ack {
+ __le32 duration; /* seconds */
+ struct ceph_fsid fsid;
+} __attribute__ ((packed));
+
+#define CEPH_FS_CLUSTER_ID_NONE -1
+
+/*
+ * mdsmap flags
+ */
+#define CEPH_MDSMAP_DOWN (1<<0) /* cluster deliberately down */
+
+/*
+ * mds states
+ * > 0 -> in
+ * <= 0 -> out
+ */
+#define CEPH_MDS_STATE_DNE 0 /* down, does not exist. */
+#define CEPH_MDS_STATE_STOPPED -1 /* down, once existed, but no subtrees.
+ empty log. */
+#define CEPH_MDS_STATE_BOOT -4 /* up, boot announcement. */
+#define CEPH_MDS_STATE_STANDBY -5 /* up, idle. waiting for assignment. */
+#define CEPH_MDS_STATE_CREATING -6 /* up, creating MDS instance. */
+#define CEPH_MDS_STATE_STARTING -7 /* up, starting previously stopped mds */
+#define CEPH_MDS_STATE_STANDBY_REPLAY -8 /* up, tailing active node's journal */
+#define CEPH_MDS_STATE_REPLAYONCE -9 /* up, replaying an active node's journal */
+
+#define CEPH_MDS_STATE_REPLAY 8 /* up, replaying journal. */
+#define CEPH_MDS_STATE_RESOLVE 9 /* up, disambiguating distributed
+ operations (import, rename, etc.) */
+#define CEPH_MDS_STATE_RECONNECT 10 /* up, reconnect to clients */
+#define CEPH_MDS_STATE_REJOIN 11 /* up, rejoining distributed cache */
+#define CEPH_MDS_STATE_CLIENTREPLAY 12 /* up, replaying client operations */
+#define CEPH_MDS_STATE_ACTIVE 13 /* up, active */
+#define CEPH_MDS_STATE_STOPPING 14 /* up, but exporting metadata */
+
+extern const char *ceph_mds_state_name(int s);
+
+
+/*
+ * metadata lock types.
+ * - these are bitmasks.. we can compose them
+ * - they also define the lock ordering by the MDS
+ * - a few of these are internal to the mds
+ */
+#define CEPH_LOCK_DVERSION 1
+#define CEPH_LOCK_DN 2
+#define CEPH_LOCK_ISNAP 16
+#define CEPH_LOCK_IVERSION 32 /* mds internal */
+#define CEPH_LOCK_IFILE 64
+#define CEPH_LOCK_IAUTH 128
+#define CEPH_LOCK_ILINK 256
+#define CEPH_LOCK_IDFT 512 /* dir frag tree */
+#define CEPH_LOCK_INEST 1024 /* mds internal */
+#define CEPH_LOCK_IXATTR 2048
+#define CEPH_LOCK_IFLOCK 4096 /* advisory file locks */
+#define CEPH_LOCK_INO 8192 /* immutable inode bits; not a lock */
+#define CEPH_LOCK_IPOLICY 16384 /* policy lock on dirs. MDS internal */
+
+/* client_session ops */
+enum {
+ CEPH_SESSION_REQUEST_OPEN,
+ CEPH_SESSION_OPEN,
+ CEPH_SESSION_REQUEST_CLOSE,
+ CEPH_SESSION_CLOSE,
+ CEPH_SESSION_REQUEST_RENEWCAPS,
+ CEPH_SESSION_RENEWCAPS,
+ CEPH_SESSION_STALE,
+ CEPH_SESSION_RECALL_STATE,
+ CEPH_SESSION_FLUSHMSG,
+ CEPH_SESSION_FLUSHMSG_ACK,
+ CEPH_SESSION_FORCE_RO,
+ CEPH_SESSION_REJECT,
+ CEPH_SESSION_REQUEST_FLUSH_MDLOG,
+};
+
+#define CEPH_SESSION_BLOCKLISTED (1 << 0) /* session blocklisted */
+
+extern const char *ceph_session_op_name(int op);
+
+struct ceph_mds_session_head {
+ __le32 op;
+ __le64 seq;
+ struct ceph_timespec stamp;
+ __le32 max_caps, max_leases;
+} __attribute__ ((packed));
+
+/* client_request */
+/*
+ * metadata ops.
+ * & 0x001000 -> write op
+ * & 0x010000 -> follow symlink (e.g. stat(), not lstat()).
+ & & 0x100000 -> use weird ino/path trace
+ */
+#define CEPH_MDS_OP_WRITE 0x001000
+enum {
+ CEPH_MDS_OP_LOOKUP = 0x00100,
+ CEPH_MDS_OP_GETATTR = 0x00101,
+ CEPH_MDS_OP_LOOKUPHASH = 0x00102,
+ CEPH_MDS_OP_LOOKUPPARENT = 0x00103,
+ CEPH_MDS_OP_LOOKUPINO = 0x00104,
+ CEPH_MDS_OP_LOOKUPNAME = 0x00105,
+ CEPH_MDS_OP_GETVXATTR = 0x00106,
+
+ CEPH_MDS_OP_SETXATTR = 0x01105,
+ CEPH_MDS_OP_RMXATTR = 0x01106,
+ CEPH_MDS_OP_SETLAYOUT = 0x01107,
+ CEPH_MDS_OP_SETATTR = 0x01108,
+ CEPH_MDS_OP_SETFILELOCK= 0x01109,
+ CEPH_MDS_OP_GETFILELOCK= 0x00110,
+ CEPH_MDS_OP_SETDIRLAYOUT=0x0110a,
+
+ CEPH_MDS_OP_MKNOD = 0x01201,
+ CEPH_MDS_OP_LINK = 0x01202,
+ CEPH_MDS_OP_UNLINK = 0x01203,
+ CEPH_MDS_OP_RENAME = 0x01204,
+ CEPH_MDS_OP_MKDIR = 0x01220,
+ CEPH_MDS_OP_RMDIR = 0x01221,
+ CEPH_MDS_OP_SYMLINK = 0x01222,
+
+ CEPH_MDS_OP_CREATE = 0x01301,
+ CEPH_MDS_OP_OPEN = 0x00302,
+ CEPH_MDS_OP_READDIR = 0x00305,
+
+ CEPH_MDS_OP_LOOKUPSNAP = 0x00400,
+ CEPH_MDS_OP_MKSNAP = 0x01400,
+ CEPH_MDS_OP_RMSNAP = 0x01401,
+ CEPH_MDS_OP_LSSNAP = 0x00402,
+ CEPH_MDS_OP_RENAMESNAP = 0x01403,
+};
+
+extern const char *ceph_mds_op_name(int op);
+
+
+#define CEPH_SETATTR_MODE 1
+#define CEPH_SETATTR_UID 2
+#define CEPH_SETATTR_GID 4
+#define CEPH_SETATTR_MTIME 8
+#define CEPH_SETATTR_ATIME 16
+#define CEPH_SETATTR_SIZE 32
+#define CEPH_SETATTR_CTIME 64
+
+/*
+ * Ceph setxattr request flags.
+ */
+#define CEPH_XATTR_CREATE (1 << 0)
+#define CEPH_XATTR_REPLACE (1 << 1)
+#define CEPH_XATTR_REMOVE (1 << 31)
+
+/*
+ * readdir request flags;
+ */
+#define CEPH_READDIR_REPLY_BITFLAGS (1<<0)
+
+/*
+ * readdir reply flags.
+ */
+#define CEPH_READDIR_FRAG_END (1<<0)
+#define CEPH_READDIR_FRAG_COMPLETE (1<<8)
+#define CEPH_READDIR_HASH_ORDER (1<<9)
+#define CEPH_READDIR_OFFSET_HASH (1<<10)
+
+/*
+ * open request flags
+ */
+#define CEPH_O_RDONLY 00000000
+#define CEPH_O_WRONLY 00000001
+#define CEPH_O_RDWR 00000002
+#define CEPH_O_CREAT 00000100
+#define CEPH_O_EXCL 00000200
+#define CEPH_O_TRUNC 00001000
+#define CEPH_O_DIRECTORY 00200000
+#define CEPH_O_NOFOLLOW 00400000
+
+union ceph_mds_request_args {
+ struct {
+ __le32 mask; /* CEPH_CAP_* */
+ } __attribute__ ((packed)) getattr;
+ struct {
+ __le32 mode;
+ __le32 uid;
+ __le32 gid;
+ struct ceph_timespec mtime;
+ struct ceph_timespec atime;
+ __le64 size, old_size; /* old_size needed by truncate */
+ __le32 mask; /* CEPH_SETATTR_* */
+ } __attribute__ ((packed)) setattr;
+ struct {
+ __le32 frag; /* which dir fragment */
+ __le32 max_entries; /* how many dentries to grab */
+ __le32 max_bytes;
+ __le16 flags;
+ __le32 offset_hash;
+ } __attribute__ ((packed)) readdir;
+ struct {
+ __le32 mode;
+ __le32 rdev;
+ } __attribute__ ((packed)) mknod;
+ struct {
+ __le32 mode;
+ } __attribute__ ((packed)) mkdir;
+ struct {
+ __le32 flags;
+ __le32 mode;
+ __le32 stripe_unit; /* layout for newly created file */
+ __le32 stripe_count; /* ... */
+ __le32 object_size;
+ __le32 pool;
+ __le32 mask; /* CEPH_CAP_* */
+ __le64 old_size;
+ } __attribute__ ((packed)) open;
+ struct {
+ __le32 flags;
+ __le32 osdmap_epoch; /* used for setting file/dir layouts */
+ } __attribute__ ((packed)) setxattr;
+ struct {
+ struct ceph_file_layout_legacy layout;
+ } __attribute__ ((packed)) setlayout;
+ struct {
+ __u8 rule; /* currently fcntl or flock */
+ __u8 type; /* shared, exclusive, remove*/
+ __le64 owner; /* owner of the lock */
+ __le64 pid; /* process id requesting the lock */
+ __le64 start; /* initial location to lock */
+ __le64 length; /* num bytes to lock from start */
+ __u8 wait; /* will caller wait for lock to become available? */
+ } __attribute__ ((packed)) filelock_change;
+ struct {
+ __le32 mask; /* CEPH_CAP_* */
+ __le64 snapid;
+ __le64 parent;
+ __le32 hash;
+ } __attribute__ ((packed)) lookupino;
+} __attribute__ ((packed));
+
+union ceph_mds_request_args_ext {
+ union ceph_mds_request_args old;
+ struct {
+ __le32 mode;
+ __le32 uid;
+ __le32 gid;
+ struct ceph_timespec mtime;
+ struct ceph_timespec atime;
+ __le64 size, old_size; /* old_size needed by truncate */
+ __le32 mask; /* CEPH_SETATTR_* */
+ struct ceph_timespec btime;
+ } __attribute__ ((packed)) setattr_ext;
+};
+
+#define CEPH_MDS_FLAG_REPLAY 1 /* this is a replayed op */
+#define CEPH_MDS_FLAG_WANT_DENTRY 2 /* want dentry in reply */
+#define CEPH_MDS_FLAG_ASYNC 4 /* request is asynchronous */
+
+struct ceph_mds_request_head_old {
+ __le64 oldest_client_tid;
+ __le32 mdsmap_epoch; /* on client */
+ __le32 flags; /* CEPH_MDS_FLAG_* */
+ __u8 num_retry, num_fwd; /* count retry, fwd attempts */
+ __le16 num_releases; /* # include cap/lease release records */
+ __le32 op; /* mds op code */
+ __le32 caller_uid, caller_gid;
+ __le64 ino; /* use this ino for openc, mkdir, mknod,
+ etc. (if replaying) */
+ union ceph_mds_request_args args;
+} __attribute__ ((packed));
+
+#define CEPH_MDS_REQUEST_HEAD_VERSION 1
+
+struct ceph_mds_request_head {
+ __le16 version; /* struct version */
+ __le64 oldest_client_tid;
+ __le32 mdsmap_epoch; /* on client */
+ __le32 flags; /* CEPH_MDS_FLAG_* */
+ __u8 num_retry, num_fwd; /* count retry, fwd attempts */
+ __le16 num_releases; /* # include cap/lease release records */
+ __le32 op; /* mds op code */
+ __le32 caller_uid, caller_gid;
+ __le64 ino; /* use this ino for openc, mkdir, mknod,
+ etc. (if replaying) */
+ union ceph_mds_request_args_ext args;
+} __attribute__ ((packed));
+
+/* cap/lease release record */
+struct ceph_mds_request_release {
+ __le64 ino, cap_id; /* ino and unique cap id */
+ __le32 caps, wanted; /* new issued, wanted */
+ __le32 seq, issue_seq, mseq;
+ __le32 dname_seq; /* if releasing a dentry lease, a */
+ __le32 dname_len; /* string follows. */
+} __attribute__ ((packed));
+
+/* client reply */
+struct ceph_mds_reply_head {
+ __le32 op;
+ __le32 result;
+ __le32 mdsmap_epoch;
+ __u8 safe; /* true if committed to disk */
+ __u8 is_dentry, is_target; /* true if dentry, target inode records
+ are included with reply */
+} __attribute__ ((packed));
+
+/* one for each node split */
+struct ceph_frag_tree_split {
+ __le32 frag; /* this frag splits... */
+ __le32 by; /* ...by this many bits */
+} __attribute__ ((packed));
+
+struct ceph_frag_tree_head {
+ __le32 nsplits; /* num ceph_frag_tree_split records */
+ struct ceph_frag_tree_split splits[];
+} __attribute__ ((packed));
+
+/* capability issue, for bundling with mds reply */
+struct ceph_mds_reply_cap {
+ __le32 caps, wanted; /* caps issued, wanted */
+ __le64 cap_id;
+ __le32 seq, mseq;
+ __le64 realm; /* snap realm */
+ __u8 flags; /* CEPH_CAP_FLAG_* */
+} __attribute__ ((packed));
+
+#define CEPH_CAP_FLAG_AUTH (1 << 0) /* cap is issued by auth mds */
+#define CEPH_CAP_FLAG_RELEASE (1 << 1) /* release the cap */
+
+/* inode record, for bundling with mds reply */
+struct ceph_mds_reply_inode {
+ __le64 ino;
+ __le64 snapid;
+ __le32 rdev;
+ __le64 version; /* inode version */
+ __le64 xattr_version; /* version for xattr blob */
+ struct ceph_mds_reply_cap cap; /* caps issued for this inode */
+ struct ceph_file_layout_legacy layout;
+ struct ceph_timespec ctime, mtime, atime;
+ __le32 time_warp_seq;
+ __le64 size, max_size, truncate_size;
+ __le32 truncate_seq;
+ __le32 mode, uid, gid;
+ __le32 nlink;
+ __le64 files, subdirs, rbytes, rfiles, rsubdirs; /* dir stats */
+ struct ceph_timespec rctime;
+ struct ceph_frag_tree_head fragtree; /* (must be at end of struct) */
+} __attribute__ ((packed));
+/* followed by frag array, symlink string, dir layout, xattr blob */
+
+/* reply_lease follows dname, and reply_inode */
+struct ceph_mds_reply_lease {
+ __le16 mask; /* lease type(s) */
+ __le32 duration_ms; /* lease duration */
+ __le32 seq;
+} __attribute__ ((packed));
+
+#define CEPH_LEASE_VALID (1 | 2) /* old and new bit values */
+#define CEPH_LEASE_PRIMARY_LINK 4 /* primary linkage */
+
+struct ceph_mds_reply_dirfrag {
+ __le32 frag; /* fragment */
+ __le32 auth; /* auth mds, if this is a delegation point */
+ __le32 ndist; /* number of mds' this is replicated on */
+ __le32 dist[];
+} __attribute__ ((packed));
+
+#define CEPH_LOCK_FCNTL 1
+#define CEPH_LOCK_FLOCK 2
+#define CEPH_LOCK_FCNTL_INTR 3
+#define CEPH_LOCK_FLOCK_INTR 4
+
+
+#define CEPH_LOCK_SHARED 1
+#define CEPH_LOCK_EXCL 2
+#define CEPH_LOCK_UNLOCK 4
+
+struct ceph_filelock {
+ __le64 start;/* file offset to start lock at */
+ __le64 length; /* num bytes to lock; 0 for all following start */
+ __le64 client; /* which client holds the lock */
+ __le64 owner; /* owner the lock */
+ __le64 pid; /* process id holding the lock on the client */
+ __u8 type; /* shared lock, exclusive lock, or unlock */
+} __attribute__ ((packed));
+
+
+/* file access modes */
+#define CEPH_FILE_MODE_PIN 0
+#define CEPH_FILE_MODE_RD 1
+#define CEPH_FILE_MODE_WR 2
+#define CEPH_FILE_MODE_RDWR 3 /* RD | WR */
+#define CEPH_FILE_MODE_LAZY 4 /* lazy io */
+#define CEPH_FILE_MODE_BITS 4
+#define CEPH_FILE_MODE_MASK ((1 << CEPH_FILE_MODE_BITS) - 1)
+
+int ceph_flags_to_mode(int flags);
+
+#define CEPH_INLINE_NONE ((__u64)-1)
+
+/* capability bits */
+#define CEPH_CAP_PIN 1 /* no specific capabilities beyond the pin */
+
+/* generic cap bits */
+#define CEPH_CAP_GSHARED 1 /* client can reads */
+#define CEPH_CAP_GEXCL 2 /* client can read and update */
+#define CEPH_CAP_GCACHE 4 /* (file) client can cache reads */
+#define CEPH_CAP_GRD 8 /* (file) client can read */
+#define CEPH_CAP_GWR 16 /* (file) client can write */
+#define CEPH_CAP_GBUFFER 32 /* (file) client can buffer writes */
+#define CEPH_CAP_GWREXTEND 64 /* (file) client can extend EOF */
+#define CEPH_CAP_GLAZYIO 128 /* (file) client can perform lazy io */
+
+#define CEPH_CAP_SIMPLE_BITS 2
+#define CEPH_CAP_FILE_BITS 8
+
+/* per-lock shift */
+#define CEPH_CAP_SAUTH 2
+#define CEPH_CAP_SLINK 4
+#define CEPH_CAP_SXATTR 6
+#define CEPH_CAP_SFILE 8
+#define CEPH_CAP_SFLOCK 20
+
+#define CEPH_CAP_BITS 22
+
+/* composed values */
+#define CEPH_CAP_AUTH_SHARED (CEPH_CAP_GSHARED << CEPH_CAP_SAUTH)
+#define CEPH_CAP_AUTH_EXCL (CEPH_CAP_GEXCL << CEPH_CAP_SAUTH)
+#define CEPH_CAP_LINK_SHARED (CEPH_CAP_GSHARED << CEPH_CAP_SLINK)
+#define CEPH_CAP_LINK_EXCL (CEPH_CAP_GEXCL << CEPH_CAP_SLINK)
+#define CEPH_CAP_XATTR_SHARED (CEPH_CAP_GSHARED << CEPH_CAP_SXATTR)
+#define CEPH_CAP_XATTR_EXCL (CEPH_CAP_GEXCL << CEPH_CAP_SXATTR)
+#define CEPH_CAP_FILE(x) (x << CEPH_CAP_SFILE)
+#define CEPH_CAP_FILE_SHARED (CEPH_CAP_GSHARED << CEPH_CAP_SFILE)
+#define CEPH_CAP_FILE_EXCL (CEPH_CAP_GEXCL << CEPH_CAP_SFILE)
+#define CEPH_CAP_FILE_CACHE (CEPH_CAP_GCACHE << CEPH_CAP_SFILE)
+#define CEPH_CAP_FILE_RD (CEPH_CAP_GRD << CEPH_CAP_SFILE)
+#define CEPH_CAP_FILE_WR (CEPH_CAP_GWR << CEPH_CAP_SFILE)
+#define CEPH_CAP_FILE_BUFFER (CEPH_CAP_GBUFFER << CEPH_CAP_SFILE)
+#define CEPH_CAP_FILE_WREXTEND (CEPH_CAP_GWREXTEND << CEPH_CAP_SFILE)
+#define CEPH_CAP_FILE_LAZYIO (CEPH_CAP_GLAZYIO << CEPH_CAP_SFILE)
+#define CEPH_CAP_FLOCK_SHARED (CEPH_CAP_GSHARED << CEPH_CAP_SFLOCK)
+#define CEPH_CAP_FLOCK_EXCL (CEPH_CAP_GEXCL << CEPH_CAP_SFLOCK)
+
+
+/* cap masks (for getattr) */
+#define CEPH_STAT_CAP_INODE CEPH_CAP_PIN
+#define CEPH_STAT_CAP_TYPE CEPH_CAP_PIN /* mode >> 12 */
+#define CEPH_STAT_CAP_SYMLINK CEPH_CAP_PIN
+#define CEPH_STAT_CAP_UID CEPH_CAP_AUTH_SHARED
+#define CEPH_STAT_CAP_GID CEPH_CAP_AUTH_SHARED
+#define CEPH_STAT_CAP_MODE CEPH_CAP_AUTH_SHARED
+#define CEPH_STAT_CAP_NLINK CEPH_CAP_LINK_SHARED
+#define CEPH_STAT_CAP_LAYOUT CEPH_CAP_FILE_SHARED
+#define CEPH_STAT_CAP_MTIME CEPH_CAP_FILE_SHARED
+#define CEPH_STAT_CAP_SIZE CEPH_CAP_FILE_SHARED
+#define CEPH_STAT_CAP_ATIME CEPH_CAP_FILE_SHARED /* fixme */
+#define CEPH_STAT_CAP_XATTR CEPH_CAP_XATTR_SHARED
+#define CEPH_STAT_CAP_INODE_ALL (CEPH_CAP_PIN | \
+ CEPH_CAP_AUTH_SHARED | \
+ CEPH_CAP_LINK_SHARED | \
+ CEPH_CAP_FILE_SHARED | \
+ CEPH_CAP_XATTR_SHARED)
+#define CEPH_STAT_CAP_INLINE_DATA (CEPH_CAP_FILE_SHARED | \
+ CEPH_CAP_FILE_RD)
+#define CEPH_STAT_RSTAT CEPH_CAP_FILE_WREXTEND
+
+#define CEPH_CAP_ANY_SHARED (CEPH_CAP_AUTH_SHARED | \
+ CEPH_CAP_LINK_SHARED | \
+ CEPH_CAP_XATTR_SHARED | \
+ CEPH_CAP_FILE_SHARED)
+#define CEPH_CAP_ANY_RD (CEPH_CAP_ANY_SHARED | CEPH_CAP_FILE_RD | \
+ CEPH_CAP_FILE_CACHE)
+
+#define CEPH_CAP_ANY_EXCL (CEPH_CAP_AUTH_EXCL | \
+ CEPH_CAP_LINK_EXCL | \
+ CEPH_CAP_XATTR_EXCL | \
+ CEPH_CAP_FILE_EXCL)
+#define CEPH_CAP_ANY_FILE_RD (CEPH_CAP_FILE_RD | CEPH_CAP_FILE_CACHE | \
+ CEPH_CAP_FILE_SHARED)
+#define CEPH_CAP_ANY_FILE_WR (CEPH_CAP_FILE_WR | CEPH_CAP_FILE_BUFFER | \
+ CEPH_CAP_FILE_EXCL)
+#define CEPH_CAP_ANY_WR (CEPH_CAP_ANY_EXCL | CEPH_CAP_ANY_FILE_WR)
+#define CEPH_CAP_ANY (CEPH_CAP_ANY_RD | CEPH_CAP_ANY_EXCL | \
+ CEPH_CAP_ANY_FILE_WR | CEPH_CAP_FILE_LAZYIO | \
+ CEPH_CAP_PIN)
+#define CEPH_CAP_ALL_FILE (CEPH_CAP_PIN | CEPH_CAP_ANY_SHARED | \
+ CEPH_CAP_AUTH_EXCL | CEPH_CAP_XATTR_EXCL | \
+ CEPH_CAP_ANY_FILE_RD | CEPH_CAP_ANY_FILE_WR)
+
+#define CEPH_CAP_LOCKS (CEPH_LOCK_IFILE | CEPH_LOCK_IAUTH | CEPH_LOCK_ILINK | \
+ CEPH_LOCK_IXATTR)
+
+/* cap masks async dir operations */
+#define CEPH_CAP_DIR_CREATE CEPH_CAP_FILE_CACHE
+#define CEPH_CAP_DIR_UNLINK CEPH_CAP_FILE_RD
+#define CEPH_CAP_ANY_DIR_OPS (CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_RD | \
+ CEPH_CAP_FILE_WREXTEND | CEPH_CAP_FILE_LAZYIO)
+
+int ceph_caps_for_mode(int mode);
+
+enum {
+ CEPH_CAP_OP_GRANT, /* mds->client grant */
+ CEPH_CAP_OP_REVOKE, /* mds->client revoke */
+ CEPH_CAP_OP_TRUNC, /* mds->client trunc notify */
+ CEPH_CAP_OP_EXPORT, /* mds has exported the cap */
+ CEPH_CAP_OP_IMPORT, /* mds has imported the cap */
+ CEPH_CAP_OP_UPDATE, /* client->mds update */
+ CEPH_CAP_OP_DROP, /* client->mds drop cap bits */
+ CEPH_CAP_OP_FLUSH, /* client->mds cap writeback */
+ CEPH_CAP_OP_FLUSH_ACK, /* mds->client flushed */
+ CEPH_CAP_OP_FLUSHSNAP, /* client->mds flush snapped metadata */
+ CEPH_CAP_OP_FLUSHSNAP_ACK, /* mds->client flushed snapped metadata */
+ CEPH_CAP_OP_RELEASE, /* client->mds release (clean) cap */
+ CEPH_CAP_OP_RENEW, /* client->mds renewal request */
+};
+
+extern const char *ceph_cap_op_name(int op);
+
+/* flags field in client cap messages (version >= 10) */
+#define CEPH_CLIENT_CAPS_SYNC (1<<0)
+#define CEPH_CLIENT_CAPS_NO_CAPSNAP (1<<1)
+#define CEPH_CLIENT_CAPS_PENDING_CAPSNAP (1<<2)
+
+/*
+ * caps message, used for capability callbacks, acks, requests, etc.
+ */
+struct ceph_mds_caps {
+ __le32 op; /* CEPH_CAP_OP_* */
+ __le64 ino, realm;
+ __le64 cap_id;
+ __le32 seq, issue_seq;
+ __le32 caps, wanted, dirty; /* latest issued/wanted/dirty */
+ __le32 migrate_seq;
+ __le64 snap_follows;
+ __le32 snap_trace_len;
+
+ /* authlock */
+ __le32 uid, gid, mode;
+
+ /* linklock */
+ __le32 nlink;
+
+ /* xattrlock */
+ __le32 xattr_len;
+ __le64 xattr_version;
+
+ /* a union of non-export and export bodies. */
+ __le64 size, max_size, truncate_size;
+ __le32 truncate_seq;
+ struct ceph_timespec mtime, atime, ctime;
+ struct ceph_file_layout_legacy layout;
+ __le32 time_warp_seq;
+} __attribute__ ((packed));
+
+struct ceph_mds_cap_peer {
+ __le64 cap_id;
+ __le32 seq;
+ __le32 mseq;
+ __le32 mds;
+ __u8 flags;
+} __attribute__ ((packed));
+
+/* cap release msg head */
+struct ceph_mds_cap_release {
+ __le32 num; /* number of cap_items that follow */
+} __attribute__ ((packed));
+
+struct ceph_mds_cap_item {
+ __le64 ino;
+ __le64 cap_id;
+ __le32 migrate_seq, seq;
+} __attribute__ ((packed));
+
+#define CEPH_MDS_LEASE_REVOKE 1 /* mds -> client */
+#define CEPH_MDS_LEASE_RELEASE 2 /* client -> mds */
+#define CEPH_MDS_LEASE_RENEW 3 /* client <-> mds */
+#define CEPH_MDS_LEASE_REVOKE_ACK 4 /* client -> mds */
+
+extern const char *ceph_lease_op_name(int o);
+
+/* lease msg header */
+struct ceph_mds_lease {
+ __u8 action; /* CEPH_MDS_LEASE_* */
+ __le16 mask; /* which lease */
+ __le64 ino;
+ __le64 first, last; /* snap range */
+ __le32 seq;
+ __le32 duration_ms; /* duration of renewal */
+} __attribute__ ((packed));
+/* followed by a __le32+string for dname */
+
+/* client reconnect */
+struct ceph_mds_cap_reconnect {
+ __le64 cap_id;
+ __le32 wanted;
+ __le32 issued;
+ __le64 snaprealm;
+ __le64 pathbase; /* base ino for our path to this ino */
+ __le32 flock_len; /* size of flock state blob, if any */
+} __attribute__ ((packed));
+/* followed by flock blob */
+
+struct ceph_mds_cap_reconnect_v1 {
+ __le64 cap_id;
+ __le32 wanted;
+ __le32 issued;
+ __le64 size;
+ struct ceph_timespec mtime, atime;
+ __le64 snaprealm;
+ __le64 pathbase; /* base ino for our path to this ino */
+} __attribute__ ((packed));
+
+struct ceph_mds_snaprealm_reconnect {
+ __le64 ino; /* snap realm base */
+ __le64 seq; /* snap seq for this snap realm */
+ __le64 parent; /* parent realm */
+} __attribute__ ((packed));
+
+/*
+ * snaps
+ */
+enum {
+ CEPH_SNAP_OP_UPDATE, /* CREATE or DESTROY */
+ CEPH_SNAP_OP_CREATE,
+ CEPH_SNAP_OP_DESTROY,
+ CEPH_SNAP_OP_SPLIT,
+};
+
+extern const char *ceph_snap_op_name(int o);
+
+/* snap msg header */
+struct ceph_mds_snap_head {
+ __le32 op; /* CEPH_SNAP_OP_* */
+ __le64 split; /* ino to split off, if any */
+ __le32 num_split_inos; /* # inos belonging to new child realm */
+ __le32 num_split_realms; /* # child realms udner new child realm */
+ __le32 trace_len; /* size of snap trace blob */
+} __attribute__ ((packed));
+/* followed by split ino list, then split realms, then the trace blob */
+
+/*
+ * encode info about a snaprealm, as viewed by a client
+ */
+struct ceph_mds_snap_realm {
+ __le64 ino; /* ino */
+ __le64 created; /* snap: when created */
+ __le64 parent; /* ino: parent realm */
+ __le64 parent_since; /* snap: same parent since */
+ __le64 seq; /* snap: version */
+ __le32 num_snaps;
+ __le32 num_prior_parent_snaps;
+} __attribute__ ((packed));
+/* followed by my snap list, then prior parent snap list */
+
+/*
+ * quotas
+ */
+struct ceph_mds_quota {
+ __le64 ino; /* ino */
+ struct ceph_timespec rctime;
+ __le64 rbytes; /* dir stats */
+ __le64 rfiles;
+ __le64 rsubdirs;
+ __u8 struct_v; /* compat */
+ __u8 struct_compat;
+ __le32 struct_len;
+ __le64 max_bytes; /* quota max. bytes */
+ __le64 max_files; /* quota max. files */
+} __attribute__ ((packed));
+
+#endif
diff --git a/include/linux/ceph/ceph_hash.h b/include/linux/ceph/ceph_hash.h
new file mode 100644
index 000000000..fda474c7a
--- /dev/null
+++ b/include/linux/ceph/ceph_hash.h
@@ -0,0 +1,14 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef FS_CEPH_HASH_H
+#define FS_CEPH_HASH_H
+
+#define CEPH_STR_HASH_LINUX 0x1 /* linux dcache hash */
+#define CEPH_STR_HASH_RJENKINS 0x2 /* robert jenkins' */
+
+extern unsigned ceph_str_hash_linux(const char *s, unsigned len);
+extern unsigned ceph_str_hash_rjenkins(const char *s, unsigned len);
+
+extern unsigned ceph_str_hash(int type, const char *s, unsigned len);
+extern const char *ceph_str_hash_name(int type);
+
+#endif
diff --git a/include/linux/ceph/cls_lock_client.h b/include/linux/ceph/cls_lock_client.h
new file mode 100644
index 000000000..17bc7584d
--- /dev/null
+++ b/include/linux/ceph/cls_lock_client.h
@@ -0,0 +1,58 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _LINUX_CEPH_CLS_LOCK_CLIENT_H
+#define _LINUX_CEPH_CLS_LOCK_CLIENT_H
+
+#include <linux/ceph/osd_client.h>
+
+enum ceph_cls_lock_type {
+ CEPH_CLS_LOCK_NONE = 0,
+ CEPH_CLS_LOCK_EXCLUSIVE = 1,
+ CEPH_CLS_LOCK_SHARED = 2,
+};
+
+struct ceph_locker_id {
+ struct ceph_entity_name name; /* locker's client name */
+ char *cookie; /* locker's cookie */
+};
+
+struct ceph_locker_info {
+ struct ceph_entity_addr addr; /* locker's address */
+};
+
+struct ceph_locker {
+ struct ceph_locker_id id;
+ struct ceph_locker_info info;
+};
+
+int ceph_cls_lock(struct ceph_osd_client *osdc,
+ struct ceph_object_id *oid,
+ struct ceph_object_locator *oloc,
+ char *lock_name, u8 type, char *cookie,
+ char *tag, char *desc, u8 flags);
+int ceph_cls_unlock(struct ceph_osd_client *osdc,
+ struct ceph_object_id *oid,
+ struct ceph_object_locator *oloc,
+ char *lock_name, char *cookie);
+int ceph_cls_break_lock(struct ceph_osd_client *osdc,
+ struct ceph_object_id *oid,
+ struct ceph_object_locator *oloc,
+ char *lock_name, char *cookie,
+ struct ceph_entity_name *locker);
+int ceph_cls_set_cookie(struct ceph_osd_client *osdc,
+ struct ceph_object_id *oid,
+ struct ceph_object_locator *oloc,
+ char *lock_name, u8 type, char *old_cookie,
+ char *tag, char *new_cookie);
+
+void ceph_free_lockers(struct ceph_locker *lockers, u32 num_lockers);
+
+int ceph_cls_lock_info(struct ceph_osd_client *osdc,
+ struct ceph_object_id *oid,
+ struct ceph_object_locator *oloc,
+ char *lock_name, u8 *type, char **tag,
+ struct ceph_locker **lockers, u32 *num_lockers);
+
+int ceph_cls_assert_locked(struct ceph_osd_request *req, int which,
+ char *lock_name, u8 type, char *cookie, char *tag);
+
+#endif
diff --git a/include/linux/ceph/debugfs.h b/include/linux/ceph/debugfs.h
new file mode 100644
index 000000000..8b3a1a7a9
--- /dev/null
+++ b/include/linux/ceph/debugfs.h
@@ -0,0 +1,14 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _FS_CEPH_DEBUGFS_H
+#define _FS_CEPH_DEBUGFS_H
+
+#include <linux/ceph/types.h>
+
+/* debugfs.c */
+extern void ceph_debugfs_init(void);
+extern void ceph_debugfs_cleanup(void);
+extern void ceph_debugfs_client_init(struct ceph_client *client);
+extern void ceph_debugfs_client_cleanup(struct ceph_client *client);
+
+#endif
+
diff --git a/include/linux/ceph/decode.h b/include/linux/ceph/decode.h
new file mode 100644
index 000000000..04f3ace57
--- /dev/null
+++ b/include/linux/ceph/decode.h
@@ -0,0 +1,398 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef __CEPH_DECODE_H
+#define __CEPH_DECODE_H
+
+#include <linux/err.h>
+#include <linux/bug.h>
+#include <linux/slab.h>
+#include <linux/time.h>
+#include <asm/unaligned.h>
+
+#include <linux/ceph/types.h>
+
+/*
+ * in all cases,
+ * void **p pointer to position pointer
+ * void *end pointer to end of buffer (last byte + 1)
+ */
+
+static inline u64 ceph_decode_64(void **p)
+{
+ u64 v = get_unaligned_le64(*p);
+ *p += sizeof(u64);
+ return v;
+}
+static inline u32 ceph_decode_32(void **p)
+{
+ u32 v = get_unaligned_le32(*p);
+ *p += sizeof(u32);
+ return v;
+}
+static inline u16 ceph_decode_16(void **p)
+{
+ u16 v = get_unaligned_le16(*p);
+ *p += sizeof(u16);
+ return v;
+}
+static inline u8 ceph_decode_8(void **p)
+{
+ u8 v = *(u8 *)*p;
+ (*p)++;
+ return v;
+}
+static inline void ceph_decode_copy(void **p, void *pv, size_t n)
+{
+ memcpy(pv, *p, n);
+ *p += n;
+}
+
+/*
+ * bounds check input.
+ */
+static inline bool ceph_has_room(void **p, void *end, size_t n)
+{
+ return end >= *p && n <= end - *p;
+}
+
+#define ceph_decode_need(p, end, n, bad) \
+ do { \
+ if (!likely(ceph_has_room(p, end, n))) \
+ goto bad; \
+ } while (0)
+
+#define ceph_decode_64_safe(p, end, v, bad) \
+ do { \
+ ceph_decode_need(p, end, sizeof(u64), bad); \
+ v = ceph_decode_64(p); \
+ } while (0)
+#define ceph_decode_32_safe(p, end, v, bad) \
+ do { \
+ ceph_decode_need(p, end, sizeof(u32), bad); \
+ v = ceph_decode_32(p); \
+ } while (0)
+#define ceph_decode_16_safe(p, end, v, bad) \
+ do { \
+ ceph_decode_need(p, end, sizeof(u16), bad); \
+ v = ceph_decode_16(p); \
+ } while (0)
+#define ceph_decode_8_safe(p, end, v, bad) \
+ do { \
+ ceph_decode_need(p, end, sizeof(u8), bad); \
+ v = ceph_decode_8(p); \
+ } while (0)
+
+#define ceph_decode_copy_safe(p, end, pv, n, bad) \
+ do { \
+ ceph_decode_need(p, end, n, bad); \
+ ceph_decode_copy(p, pv, n); \
+ } while (0)
+
+/*
+ * Allocate a buffer big enough to hold the wire-encoded string, and
+ * decode the string into it. The resulting string will always be
+ * terminated with '\0'. If successful, *p will be advanced
+ * past the decoded data. Also, if lenp is not a null pointer, the
+ * length (not including the terminating '\0') will be recorded in
+ * *lenp. Note that a zero-length string is a valid return value.
+ *
+ * Returns a pointer to the newly-allocated string buffer, or a
+ * pointer-coded errno if an error occurs. Neither *p nor *lenp
+ * will have been updated if an error is returned.
+ *
+ * There are two possible failures:
+ * - converting the string would require accessing memory at or
+ * beyond the "end" pointer provided (-ERANGE)
+ * - memory could not be allocated for the result (-ENOMEM)
+ */
+static inline char *ceph_extract_encoded_string(void **p, void *end,
+ size_t *lenp, gfp_t gfp)
+{
+ u32 len;
+ void *sp = *p;
+ char *buf;
+
+ ceph_decode_32_safe(&sp, end, len, bad);
+ if (!ceph_has_room(&sp, end, len))
+ goto bad;
+
+ buf = kmalloc(len + 1, gfp);
+ if (!buf)
+ return ERR_PTR(-ENOMEM);
+
+ if (len)
+ memcpy(buf, sp, len);
+ buf[len] = '\0';
+
+ *p = (char *) *p + sizeof (u32) + len;
+ if (lenp)
+ *lenp = (size_t) len;
+
+ return buf;
+
+bad:
+ return ERR_PTR(-ERANGE);
+}
+
+/*
+ * skip helpers
+ */
+#define ceph_decode_skip_n(p, end, n, bad) \
+ do { \
+ ceph_decode_need(p, end, n, bad); \
+ *p += n; \
+ } while (0)
+
+#define ceph_decode_skip_64(p, end, bad) \
+ceph_decode_skip_n(p, end, sizeof(u64), bad)
+
+#define ceph_decode_skip_32(p, end, bad) \
+ceph_decode_skip_n(p, end, sizeof(u32), bad)
+
+#define ceph_decode_skip_16(p, end, bad) \
+ceph_decode_skip_n(p, end, sizeof(u16), bad)
+
+#define ceph_decode_skip_8(p, end, bad) \
+ceph_decode_skip_n(p, end, sizeof(u8), bad)
+
+#define ceph_decode_skip_string(p, end, bad) \
+ do { \
+ u32 len; \
+ \
+ ceph_decode_32_safe(p, end, len, bad); \
+ ceph_decode_skip_n(p, end, len, bad); \
+ } while (0)
+
+#define ceph_decode_skip_set(p, end, type, bad) \
+ do { \
+ u32 len; \
+ \
+ ceph_decode_32_safe(p, end, len, bad); \
+ while (len--) \
+ ceph_decode_skip_##type(p, end, bad); \
+ } while (0)
+
+#define ceph_decode_skip_map(p, end, ktype, vtype, bad) \
+ do { \
+ u32 len; \
+ \
+ ceph_decode_32_safe(p, end, len, bad); \
+ while (len--) { \
+ ceph_decode_skip_##ktype(p, end, bad); \
+ ceph_decode_skip_##vtype(p, end, bad); \
+ } \
+ } while (0)
+
+#define ceph_decode_skip_map_of_map(p, end, ktype1, ktype2, vtype2, bad) \
+ do { \
+ u32 len; \
+ \
+ ceph_decode_32_safe(p, end, len, bad); \
+ while (len--) { \
+ ceph_decode_skip_##ktype1(p, end, bad); \
+ ceph_decode_skip_map(p, end, ktype2, vtype2, bad); \
+ } \
+ } while (0)
+
+/*
+ * struct ceph_timespec <-> struct timespec64
+ */
+static inline void ceph_decode_timespec64(struct timespec64 *ts,
+ const struct ceph_timespec *tv)
+{
+ /*
+ * This will still overflow in year 2106. We could extend
+ * the protocol to steal two more bits from tv_nsec to
+ * add three more 136 year epochs after that the way ext4
+ * does if necessary.
+ */
+ ts->tv_sec = (time64_t)le32_to_cpu(tv->tv_sec);
+ ts->tv_nsec = (long)le32_to_cpu(tv->tv_nsec);
+}
+static inline void ceph_encode_timespec64(struct ceph_timespec *tv,
+ const struct timespec64 *ts)
+{
+ tv->tv_sec = cpu_to_le32((u32)ts->tv_sec);
+ tv->tv_nsec = cpu_to_le32((u32)ts->tv_nsec);
+}
+
+/*
+ * sockaddr_storage <-> ceph_sockaddr
+ */
+#define CEPH_ENTITY_ADDR_TYPE_NONE 0
+#define CEPH_ENTITY_ADDR_TYPE_LEGACY __cpu_to_le32(1)
+#define CEPH_ENTITY_ADDR_TYPE_MSGR2 __cpu_to_le32(2)
+#define CEPH_ENTITY_ADDR_TYPE_ANY __cpu_to_le32(3)
+
+static inline void ceph_encode_banner_addr(struct ceph_entity_addr *a)
+{
+ __be16 ss_family = htons(a->in_addr.ss_family);
+ a->in_addr.ss_family = *(__u16 *)&ss_family;
+
+ /* Banner addresses require TYPE_NONE */
+ a->type = CEPH_ENTITY_ADDR_TYPE_NONE;
+}
+static inline void ceph_decode_banner_addr(struct ceph_entity_addr *a)
+{
+ __be16 ss_family = *(__be16 *)&a->in_addr.ss_family;
+ a->in_addr.ss_family = ntohs(ss_family);
+ WARN_ON(a->in_addr.ss_family == 512);
+ a->type = CEPH_ENTITY_ADDR_TYPE_LEGACY;
+}
+
+extern int ceph_decode_entity_addr(void **p, void *end,
+ struct ceph_entity_addr *addr);
+int ceph_decode_entity_addrvec(void **p, void *end, bool msgr2,
+ struct ceph_entity_addr *addr);
+
+int ceph_entity_addr_encoding_len(const struct ceph_entity_addr *addr);
+void ceph_encode_entity_addr(void **p, const struct ceph_entity_addr *addr);
+
+/*
+ * encoders
+ */
+static inline void ceph_encode_64(void **p, u64 v)
+{
+ put_unaligned_le64(v, (__le64 *)*p);
+ *p += sizeof(u64);
+}
+static inline void ceph_encode_32(void **p, u32 v)
+{
+ put_unaligned_le32(v, (__le32 *)*p);
+ *p += sizeof(u32);
+}
+static inline void ceph_encode_16(void **p, u16 v)
+{
+ put_unaligned_le16(v, (__le16 *)*p);
+ *p += sizeof(u16);
+}
+static inline void ceph_encode_8(void **p, u8 v)
+{
+ *(u8 *)*p = v;
+ (*p)++;
+}
+static inline void ceph_encode_copy(void **p, const void *s, int len)
+{
+ memcpy(*p, s, len);
+ *p += len;
+}
+
+/*
+ * filepath, string encoders
+ */
+static inline void ceph_encode_filepath(void **p, void *end,
+ u64 ino, const char *path)
+{
+ u32 len = path ? strlen(path) : 0;
+ BUG_ON(*p + 1 + sizeof(ino) + sizeof(len) + len > end);
+ ceph_encode_8(p, 1);
+ ceph_encode_64(p, ino);
+ ceph_encode_32(p, len);
+ if (len)
+ memcpy(*p, path, len);
+ *p += len;
+}
+
+static inline void ceph_encode_string(void **p, void *end,
+ const char *s, u32 len)
+{
+ BUG_ON(*p + sizeof(len) + len > end);
+ ceph_encode_32(p, len);
+ if (len)
+ memcpy(*p, s, len);
+ *p += len;
+}
+
+/*
+ * version and length starting block encoders/decoders
+ */
+
+/* current code version (u8) + compat code version (u8) + len of struct (u32) */
+#define CEPH_ENCODING_START_BLK_LEN 6
+
+/**
+ * ceph_start_encoding - start encoding block
+ * @struct_v: current (code) version of the encoding
+ * @struct_compat: oldest code version that can decode it
+ * @struct_len: length of struct encoding
+ */
+static inline void ceph_start_encoding(void **p, u8 struct_v, u8 struct_compat,
+ u32 struct_len)
+{
+ ceph_encode_8(p, struct_v);
+ ceph_encode_8(p, struct_compat);
+ ceph_encode_32(p, struct_len);
+}
+
+/**
+ * ceph_start_decoding - start decoding block
+ * @v: current version of the encoding that the code supports
+ * @name: name of the struct (free-form)
+ * @struct_v: out param for the encoding version
+ * @struct_len: out param for the length of struct encoding
+ *
+ * Validates the length of struct encoding, so unsafe ceph_decode_*
+ * variants can be used for decoding.
+ */
+static inline int ceph_start_decoding(void **p, void *end, u8 v,
+ const char *name, u8 *struct_v,
+ u32 *struct_len)
+{
+ u8 struct_compat;
+
+ ceph_decode_need(p, end, CEPH_ENCODING_START_BLK_LEN, bad);
+ *struct_v = ceph_decode_8(p);
+ struct_compat = ceph_decode_8(p);
+ if (v < struct_compat) {
+ pr_warn("got struct_v %d struct_compat %d > %d of %s\n",
+ *struct_v, struct_compat, v, name);
+ return -EINVAL;
+ }
+
+ *struct_len = ceph_decode_32(p);
+ ceph_decode_need(p, end, *struct_len, bad);
+ return 0;
+
+bad:
+ return -ERANGE;
+}
+
+#define ceph_encode_need(p, end, n, bad) \
+ do { \
+ if (!likely(ceph_has_room(p, end, n))) \
+ goto bad; \
+ } while (0)
+
+#define ceph_encode_64_safe(p, end, v, bad) \
+ do { \
+ ceph_encode_need(p, end, sizeof(u64), bad); \
+ ceph_encode_64(p, v); \
+ } while (0)
+#define ceph_encode_32_safe(p, end, v, bad) \
+ do { \
+ ceph_encode_need(p, end, sizeof(u32), bad); \
+ ceph_encode_32(p, v); \
+ } while (0)
+#define ceph_encode_16_safe(p, end, v, bad) \
+ do { \
+ ceph_encode_need(p, end, sizeof(u16), bad); \
+ ceph_encode_16(p, v); \
+ } while (0)
+#define ceph_encode_8_safe(p, end, v, bad) \
+ do { \
+ ceph_encode_need(p, end, sizeof(u8), bad); \
+ ceph_encode_8(p, v); \
+ } while (0)
+
+#define ceph_encode_copy_safe(p, end, pv, n, bad) \
+ do { \
+ ceph_encode_need(p, end, n, bad); \
+ ceph_encode_copy(p, pv, n); \
+ } while (0)
+#define ceph_encode_string_safe(p, end, s, n, bad) \
+ do { \
+ ceph_encode_need(p, end, n, bad); \
+ ceph_encode_string(p, end, s, n); \
+ } while (0)
+
+
+#endif
diff --git a/include/linux/ceph/libceph.h b/include/linux/ceph/libceph.h
new file mode 100644
index 000000000..4497d0a67
--- /dev/null
+++ b/include/linux/ceph/libceph.h
@@ -0,0 +1,332 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _FS_CEPH_LIBCEPH_H
+#define _FS_CEPH_LIBCEPH_H
+
+#include <linux/ceph/ceph_debug.h>
+
+#include <asm/unaligned.h>
+#include <linux/backing-dev.h>
+#include <linux/completion.h>
+#include <linux/exportfs.h>
+#include <linux/bug.h>
+#include <linux/fs.h>
+#include <linux/mempool.h>
+#include <linux/pagemap.h>
+#include <linux/wait.h>
+#include <linux/writeback.h>
+#include <linux/slab.h>
+#include <linux/refcount.h>
+
+#include <linux/ceph/types.h>
+#include <linux/ceph/messenger.h>
+#include <linux/ceph/msgpool.h>
+#include <linux/ceph/mon_client.h>
+#include <linux/ceph/osd_client.h>
+#include <linux/ceph/ceph_fs.h>
+#include <linux/ceph/string_table.h>
+
+/*
+ * mount options
+ */
+#define CEPH_OPT_FSID (1<<0)
+#define CEPH_OPT_NOSHARE (1<<1) /* don't share client with other sbs */
+#define CEPH_OPT_MYIP (1<<2) /* specified my ip */
+#define CEPH_OPT_NOCRC (1<<3) /* no data crc on writes (msgr1) */
+#define CEPH_OPT_TCP_NODELAY (1<<4) /* TCP_NODELAY on TCP sockets */
+#define CEPH_OPT_NOMSGSIGN (1<<5) /* don't sign msgs (msgr1) */
+#define CEPH_OPT_ABORT_ON_FULL (1<<6) /* abort w/ ENOSPC when full */
+#define CEPH_OPT_RXBOUNCE (1<<7) /* double-buffer read data */
+
+#define CEPH_OPT_DEFAULT (CEPH_OPT_TCP_NODELAY)
+
+#define ceph_set_opt(client, opt) \
+ (client)->options->flags |= CEPH_OPT_##opt;
+#define ceph_test_opt(client, opt) \
+ (!!((client)->options->flags & CEPH_OPT_##opt))
+
+struct ceph_options {
+ int flags;
+ struct ceph_fsid fsid;
+ struct ceph_entity_addr my_addr;
+ unsigned long mount_timeout; /* jiffies */
+ unsigned long osd_idle_ttl; /* jiffies */
+ unsigned long osd_keepalive_timeout; /* jiffies */
+ unsigned long osd_request_timeout; /* jiffies */
+ u32 read_from_replica; /* CEPH_OSD_FLAG_BALANCE/LOCALIZE_READS */
+ int con_modes[2]; /* CEPH_CON_MODE_* */
+
+ /*
+ * any type that can't be simply compared or doesn't need
+ * to be compared should go beyond this point,
+ * ceph_compare_options() should be updated accordingly
+ */
+
+ struct ceph_entity_addr *mon_addr; /* should be the first
+ pointer type of args */
+ int num_mon;
+ char *name;
+ struct ceph_crypto_key *key;
+ struct rb_root crush_locs;
+};
+
+/*
+ * defaults
+ */
+#define CEPH_MOUNT_TIMEOUT_DEFAULT msecs_to_jiffies(60 * 1000)
+#define CEPH_OSD_KEEPALIVE_DEFAULT msecs_to_jiffies(5 * 1000)
+#define CEPH_OSD_IDLE_TTL_DEFAULT msecs_to_jiffies(60 * 1000)
+#define CEPH_OSD_REQUEST_TIMEOUT_DEFAULT 0 /* no timeout */
+#define CEPH_READ_FROM_REPLICA_DEFAULT 0 /* read from primary */
+
+#define CEPH_MONC_HUNT_INTERVAL msecs_to_jiffies(3 * 1000)
+#define CEPH_MONC_PING_INTERVAL msecs_to_jiffies(10 * 1000)
+#define CEPH_MONC_PING_TIMEOUT msecs_to_jiffies(30 * 1000)
+#define CEPH_MONC_HUNT_BACKOFF 2
+#define CEPH_MONC_HUNT_MAX_MULT 10
+
+#define CEPH_MSG_MAX_CONTROL_LEN (16*1024*1024)
+#define CEPH_MSG_MAX_FRONT_LEN (16*1024*1024)
+#define CEPH_MSG_MAX_MIDDLE_LEN (16*1024*1024)
+
+/*
+ * The largest possible rbd data object is 32M.
+ * The largest possible rbd object map object is 64M.
+ *
+ * There is no limit on the size of cephfs objects, but it has to obey
+ * rsize and wsize mount options anyway.
+ */
+#define CEPH_MSG_MAX_DATA_LEN (64*1024*1024)
+
+#define CEPH_AUTH_NAME_DEFAULT "guest"
+
+static inline unsigned long ceph_timeout_jiffies(unsigned long timeout)
+{
+ return timeout ?: MAX_SCHEDULE_TIMEOUT;
+}
+
+struct ceph_mds_client;
+
+/*
+ * per client state
+ *
+ * possibly shared by multiple mount points, if they are
+ * mounting the same ceph filesystem/cluster.
+ */
+struct ceph_client {
+ struct ceph_fsid fsid;
+ bool have_fsid;
+
+ void *private;
+
+ struct ceph_options *options;
+
+ struct mutex mount_mutex; /* serialize mount attempts */
+ wait_queue_head_t auth_wq;
+ int auth_err;
+
+ int (*extra_mon_dispatch)(struct ceph_client *, struct ceph_msg *);
+
+ u64 supported_features;
+ u64 required_features;
+
+ struct ceph_messenger msgr; /* messenger instance */
+ struct ceph_mon_client monc;
+ struct ceph_osd_client osdc;
+
+#ifdef CONFIG_DEBUG_FS
+ struct dentry *debugfs_dir;
+ struct dentry *debugfs_monmap;
+ struct dentry *debugfs_osdmap;
+ struct dentry *debugfs_options;
+#endif
+};
+
+#define from_msgr(ms) container_of(ms, struct ceph_client, msgr)
+
+static inline bool ceph_msgr2(struct ceph_client *client)
+{
+ return client->options->con_modes[0] != CEPH_CON_MODE_UNKNOWN;
+}
+
+/*
+ * snapshots
+ */
+
+/*
+ * A "snap context" is the set of existing snapshots when we
+ * write data. It is used by the OSD to guide its COW behavior.
+ *
+ * The ceph_snap_context is refcounted, and attached to each dirty
+ * page, indicating which context the dirty data belonged when it was
+ * dirtied.
+ */
+struct ceph_snap_context {
+ refcount_t nref;
+ u64 seq;
+ u32 num_snaps;
+ u64 snaps[];
+};
+
+extern struct ceph_snap_context *ceph_create_snap_context(u32 snap_count,
+ gfp_t gfp_flags);
+extern struct ceph_snap_context *ceph_get_snap_context(
+ struct ceph_snap_context *sc);
+extern void ceph_put_snap_context(struct ceph_snap_context *sc);
+
+/*
+ * calculate the number of pages a given length and offset map onto,
+ * if we align the data.
+ */
+static inline int calc_pages_for(u64 off, u64 len)
+{
+ return ((off+len+PAGE_SIZE-1) >> PAGE_SHIFT) -
+ (off >> PAGE_SHIFT);
+}
+
+#define RB_BYVAL(a) (a)
+#define RB_BYPTR(a) (&(a))
+#define RB_CMP3WAY(a, b) ((a) < (b) ? -1 : (a) > (b))
+
+#define DEFINE_RB_INSDEL_FUNCS2(name, type, keyfld, cmpexp, keyexp, nodefld) \
+static bool __insert_##name(struct rb_root *root, type *t) \
+{ \
+ struct rb_node **n = &root->rb_node; \
+ struct rb_node *parent = NULL; \
+ \
+ BUG_ON(!RB_EMPTY_NODE(&t->nodefld)); \
+ \
+ while (*n) { \
+ type *cur = rb_entry(*n, type, nodefld); \
+ int cmp; \
+ \
+ parent = *n; \
+ cmp = cmpexp(keyexp(t->keyfld), keyexp(cur->keyfld)); \
+ if (cmp < 0) \
+ n = &(*n)->rb_left; \
+ else if (cmp > 0) \
+ n = &(*n)->rb_right; \
+ else \
+ return false; \
+ } \
+ \
+ rb_link_node(&t->nodefld, parent, n); \
+ rb_insert_color(&t->nodefld, root); \
+ return true; \
+} \
+static void __maybe_unused insert_##name(struct rb_root *root, type *t) \
+{ \
+ if (!__insert_##name(root, t)) \
+ BUG(); \
+} \
+static void erase_##name(struct rb_root *root, type *t) \
+{ \
+ BUG_ON(RB_EMPTY_NODE(&t->nodefld)); \
+ rb_erase(&t->nodefld, root); \
+ RB_CLEAR_NODE(&t->nodefld); \
+}
+
+/*
+ * @lookup_param_type is a parameter and not constructed from (@type,
+ * @keyfld) with typeof() because adding const is too unwieldy.
+ */
+#define DEFINE_RB_LOOKUP_FUNC2(name, type, keyfld, cmpexp, keyexp, \
+ lookup_param_type, nodefld) \
+static type *lookup_##name(struct rb_root *root, lookup_param_type key) \
+{ \
+ struct rb_node *n = root->rb_node; \
+ \
+ while (n) { \
+ type *cur = rb_entry(n, type, nodefld); \
+ int cmp; \
+ \
+ cmp = cmpexp(key, keyexp(cur->keyfld)); \
+ if (cmp < 0) \
+ n = n->rb_left; \
+ else if (cmp > 0) \
+ n = n->rb_right; \
+ else \
+ return cur; \
+ } \
+ \
+ return NULL; \
+}
+
+#define DEFINE_RB_FUNCS2(name, type, keyfld, cmpexp, keyexp, \
+ lookup_param_type, nodefld) \
+DEFINE_RB_INSDEL_FUNCS2(name, type, keyfld, cmpexp, keyexp, nodefld) \
+DEFINE_RB_LOOKUP_FUNC2(name, type, keyfld, cmpexp, keyexp, \
+ lookup_param_type, nodefld)
+
+/*
+ * Shorthands for integer keys.
+ */
+#define DEFINE_RB_INSDEL_FUNCS(name, type, keyfld, nodefld) \
+DEFINE_RB_INSDEL_FUNCS2(name, type, keyfld, RB_CMP3WAY, RB_BYVAL, nodefld)
+
+#define DEFINE_RB_LOOKUP_FUNC(name, type, keyfld, nodefld) \
+extern type __lookup_##name##_key; \
+DEFINE_RB_LOOKUP_FUNC2(name, type, keyfld, RB_CMP3WAY, RB_BYVAL, \
+ typeof(__lookup_##name##_key.keyfld), nodefld)
+
+#define DEFINE_RB_FUNCS(name, type, keyfld, nodefld) \
+DEFINE_RB_INSDEL_FUNCS(name, type, keyfld, nodefld) \
+DEFINE_RB_LOOKUP_FUNC(name, type, keyfld, nodefld)
+
+extern struct kmem_cache *ceph_inode_cachep;
+extern struct kmem_cache *ceph_cap_cachep;
+extern struct kmem_cache *ceph_cap_snap_cachep;
+extern struct kmem_cache *ceph_cap_flush_cachep;
+extern struct kmem_cache *ceph_dentry_cachep;
+extern struct kmem_cache *ceph_file_cachep;
+extern struct kmem_cache *ceph_dir_file_cachep;
+extern struct kmem_cache *ceph_mds_request_cachep;
+extern mempool_t *ceph_wb_pagevec_pool;
+
+/* ceph_common.c */
+extern bool libceph_compatible(void *data);
+
+extern const char *ceph_msg_type_name(int type);
+extern int ceph_check_fsid(struct ceph_client *client, struct ceph_fsid *fsid);
+extern int ceph_parse_fsid(const char *str, struct ceph_fsid *fsid);
+
+struct fs_parameter;
+struct fc_log;
+struct ceph_options *ceph_alloc_options(void);
+int ceph_parse_mon_ips(const char *buf, size_t len, struct ceph_options *opt,
+ struct fc_log *l, char delim);
+int ceph_parse_param(struct fs_parameter *param, struct ceph_options *opt,
+ struct fc_log *l);
+int ceph_print_client_options(struct seq_file *m, struct ceph_client *client,
+ bool show_all);
+extern void ceph_destroy_options(struct ceph_options *opt);
+extern int ceph_compare_options(struct ceph_options *new_opt,
+ struct ceph_client *client);
+struct ceph_client *ceph_create_client(struct ceph_options *opt, void *private);
+struct ceph_entity_addr *ceph_client_addr(struct ceph_client *client);
+u64 ceph_client_gid(struct ceph_client *client);
+extern void ceph_destroy_client(struct ceph_client *client);
+extern void ceph_reset_client_addr(struct ceph_client *client);
+extern int __ceph_open_session(struct ceph_client *client,
+ unsigned long started);
+extern int ceph_open_session(struct ceph_client *client);
+int ceph_wait_for_latest_osdmap(struct ceph_client *client,
+ unsigned long timeout);
+
+/* pagevec.c */
+extern void ceph_release_page_vector(struct page **pages, int num_pages);
+extern void ceph_put_page_vector(struct page **pages, int num_pages,
+ bool dirty);
+extern struct page **ceph_alloc_page_vector(int num_pages, gfp_t flags);
+extern int ceph_copy_user_to_page_vector(struct page **pages,
+ const void __user *data,
+ loff_t off, size_t len);
+extern void ceph_copy_to_page_vector(struct page **pages,
+ const void *data,
+ loff_t off, size_t len);
+extern void ceph_copy_from_page_vector(struct page **pages,
+ void *data,
+ loff_t off, size_t len);
+extern void ceph_zero_page_vector_range(int off, int len, struct page **pages);
+
+
+#endif /* _FS_CEPH_SUPER_H */
diff --git a/include/linux/ceph/mdsmap.h b/include/linux/ceph/mdsmap.h
new file mode 100644
index 000000000..4c3e0648d
--- /dev/null
+++ b/include/linux/ceph/mdsmap.h
@@ -0,0 +1,72 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _FS_CEPH_MDSMAP_H
+#define _FS_CEPH_MDSMAP_H
+
+#include <linux/bug.h>
+#include <linux/ceph/types.h>
+
+/*
+ * mds map - describe servers in the mds cluster.
+ *
+ * we limit fields to those the client actually xcares about
+ */
+struct ceph_mds_info {
+ u64 global_id;
+ struct ceph_entity_addr addr;
+ s32 state;
+ int num_export_targets;
+ bool laggy;
+ u32 *export_targets;
+};
+
+struct ceph_mdsmap {
+ u32 m_epoch, m_client_epoch, m_last_failure;
+ u32 m_root;
+ u32 m_session_timeout; /* seconds */
+ u32 m_session_autoclose; /* seconds */
+ u64 m_max_file_size;
+ u64 m_max_xattr_size; /* maximum size for xattrs blob */
+ u32 m_max_mds; /* expected up:active mds number */
+ u32 m_num_active_mds; /* actual up:active mds number */
+ u32 possible_max_rank; /* possible max rank index */
+ struct ceph_mds_info *m_info;
+
+ /* which object pools file data can be stored in */
+ int m_num_data_pg_pools;
+ u64 *m_data_pg_pools;
+ u64 m_cas_pg_pool;
+
+ bool m_enabled;
+ bool m_damaged;
+ int m_num_laggy;
+};
+
+static inline struct ceph_entity_addr *
+ceph_mdsmap_get_addr(struct ceph_mdsmap *m, int w)
+{
+ if (w >= m->possible_max_rank)
+ return NULL;
+ return &m->m_info[w].addr;
+}
+
+static inline int ceph_mdsmap_get_state(struct ceph_mdsmap *m, int w)
+{
+ BUG_ON(w < 0);
+ if (w >= m->possible_max_rank)
+ return CEPH_MDS_STATE_DNE;
+ return m->m_info[w].state;
+}
+
+static inline bool ceph_mdsmap_is_laggy(struct ceph_mdsmap *m, int w)
+{
+ if (w >= 0 && w < m->possible_max_rank)
+ return m->m_info[w].laggy;
+ return false;
+}
+
+extern int ceph_mdsmap_get_random_mds(struct ceph_mdsmap *m);
+struct ceph_mdsmap *ceph_mdsmap_decode(void **p, void *end, bool msgr2);
+extern void ceph_mdsmap_destroy(struct ceph_mdsmap *m);
+extern bool ceph_mdsmap_is_cluster_available(struct ceph_mdsmap *m);
+
+#endif
diff --git a/include/linux/ceph/messenger.h b/include/linux/ceph/messenger.h
new file mode 100644
index 000000000..99c1726be
--- /dev/null
+++ b/include/linux/ceph/messenger.h
@@ -0,0 +1,587 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef __FS_CEPH_MESSENGER_H
+#define __FS_CEPH_MESSENGER_H
+
+#include <linux/bvec.h>
+#include <linux/crypto.h>
+#include <linux/kref.h>
+#include <linux/mutex.h>
+#include <linux/net.h>
+#include <linux/radix-tree.h>
+#include <linux/uio.h>
+#include <linux/workqueue.h>
+#include <net/net_namespace.h>
+
+#include <linux/ceph/types.h>
+#include <linux/ceph/buffer.h>
+
+struct ceph_msg;
+struct ceph_connection;
+
+/*
+ * Ceph defines these callbacks for handling connection events.
+ */
+struct ceph_connection_operations {
+ struct ceph_connection *(*get)(struct ceph_connection *);
+ void (*put)(struct ceph_connection *);
+
+ /* handle an incoming message. */
+ void (*dispatch) (struct ceph_connection *con, struct ceph_msg *m);
+
+ /* authorize an outgoing connection */
+ struct ceph_auth_handshake *(*get_authorizer) (
+ struct ceph_connection *con,
+ int *proto, int force_new);
+ int (*add_authorizer_challenge)(struct ceph_connection *con,
+ void *challenge_buf,
+ int challenge_buf_len);
+ int (*verify_authorizer_reply) (struct ceph_connection *con);
+ int (*invalidate_authorizer)(struct ceph_connection *con);
+
+ /* there was some error on the socket (disconnect, whatever) */
+ void (*fault) (struct ceph_connection *con);
+
+ /* a remote host as terminated a message exchange session, and messages
+ * we sent (or they tried to send us) may be lost. */
+ void (*peer_reset) (struct ceph_connection *con);
+
+ struct ceph_msg * (*alloc_msg) (struct ceph_connection *con,
+ struct ceph_msg_header *hdr,
+ int *skip);
+
+ void (*reencode_message) (struct ceph_msg *msg);
+
+ int (*sign_message) (struct ceph_msg *msg);
+ int (*check_message_signature) (struct ceph_msg *msg);
+
+ /* msgr2 authentication exchange */
+ int (*get_auth_request)(struct ceph_connection *con,
+ void *buf, int *buf_len,
+ void **authorizer, int *authorizer_len);
+ int (*handle_auth_reply_more)(struct ceph_connection *con,
+ void *reply, int reply_len,
+ void *buf, int *buf_len,
+ void **authorizer, int *authorizer_len);
+ int (*handle_auth_done)(struct ceph_connection *con,
+ u64 global_id, void *reply, int reply_len,
+ u8 *session_key, int *session_key_len,
+ u8 *con_secret, int *con_secret_len);
+ int (*handle_auth_bad_method)(struct ceph_connection *con,
+ int used_proto, int result,
+ const int *allowed_protos, int proto_cnt,
+ const int *allowed_modes, int mode_cnt);
+};
+
+/* use format string %s%lld */
+#define ENTITY_NAME(n) ceph_entity_type_name((n).type), le64_to_cpu((n).num)
+
+struct ceph_messenger {
+ struct ceph_entity_inst inst; /* my name+address */
+ struct ceph_entity_addr my_enc_addr;
+
+ atomic_t stopping;
+ possible_net_t net;
+
+ /*
+ * the global_seq counts connections i (attempt to) initiate
+ * in order to disambiguate certain connect race conditions.
+ */
+ u32 global_seq;
+ spinlock_t global_seq_lock;
+};
+
+enum ceph_msg_data_type {
+ CEPH_MSG_DATA_NONE, /* message contains no data payload */
+ CEPH_MSG_DATA_PAGES, /* data source/destination is a page array */
+ CEPH_MSG_DATA_PAGELIST, /* data source/destination is a pagelist */
+#ifdef CONFIG_BLOCK
+ CEPH_MSG_DATA_BIO, /* data source/destination is a bio list */
+#endif /* CONFIG_BLOCK */
+ CEPH_MSG_DATA_BVECS, /* data source/destination is a bio_vec array */
+};
+
+#ifdef CONFIG_BLOCK
+
+struct ceph_bio_iter {
+ struct bio *bio;
+ struct bvec_iter iter;
+};
+
+#define __ceph_bio_iter_advance_step(it, n, STEP) do { \
+ unsigned int __n = (n), __cur_n; \
+ \
+ while (__n) { \
+ BUG_ON(!(it)->iter.bi_size); \
+ __cur_n = min((it)->iter.bi_size, __n); \
+ (void)(STEP); \
+ bio_advance_iter((it)->bio, &(it)->iter, __cur_n); \
+ if (!(it)->iter.bi_size && (it)->bio->bi_next) { \
+ dout("__ceph_bio_iter_advance_step next bio\n"); \
+ (it)->bio = (it)->bio->bi_next; \
+ (it)->iter = (it)->bio->bi_iter; \
+ } \
+ __n -= __cur_n; \
+ } \
+} while (0)
+
+/*
+ * Advance @it by @n bytes.
+ */
+#define ceph_bio_iter_advance(it, n) \
+ __ceph_bio_iter_advance_step(it, n, 0)
+
+/*
+ * Advance @it by @n bytes, executing BVEC_STEP for each bio_vec.
+ */
+#define ceph_bio_iter_advance_step(it, n, BVEC_STEP) \
+ __ceph_bio_iter_advance_step(it, n, ({ \
+ struct bio_vec bv; \
+ struct bvec_iter __cur_iter; \
+ \
+ __cur_iter = (it)->iter; \
+ __cur_iter.bi_size = __cur_n; \
+ __bio_for_each_segment(bv, (it)->bio, __cur_iter, __cur_iter) \
+ (void)(BVEC_STEP); \
+ }))
+
+#endif /* CONFIG_BLOCK */
+
+struct ceph_bvec_iter {
+ struct bio_vec *bvecs;
+ struct bvec_iter iter;
+};
+
+#define __ceph_bvec_iter_advance_step(it, n, STEP) do { \
+ BUG_ON((n) > (it)->iter.bi_size); \
+ (void)(STEP); \
+ bvec_iter_advance((it)->bvecs, &(it)->iter, (n)); \
+} while (0)
+
+/*
+ * Advance @it by @n bytes.
+ */
+#define ceph_bvec_iter_advance(it, n) \
+ __ceph_bvec_iter_advance_step(it, n, 0)
+
+/*
+ * Advance @it by @n bytes, executing BVEC_STEP for each bio_vec.
+ */
+#define ceph_bvec_iter_advance_step(it, n, BVEC_STEP) \
+ __ceph_bvec_iter_advance_step(it, n, ({ \
+ struct bio_vec bv; \
+ struct bvec_iter __cur_iter; \
+ \
+ __cur_iter = (it)->iter; \
+ __cur_iter.bi_size = (n); \
+ for_each_bvec(bv, (it)->bvecs, __cur_iter, __cur_iter) \
+ (void)(BVEC_STEP); \
+ }))
+
+#define ceph_bvec_iter_shorten(it, n) do { \
+ BUG_ON((n) > (it)->iter.bi_size); \
+ (it)->iter.bi_size = (n); \
+} while (0)
+
+struct ceph_msg_data {
+ enum ceph_msg_data_type type;
+ union {
+#ifdef CONFIG_BLOCK
+ struct {
+ struct ceph_bio_iter bio_pos;
+ u32 bio_length;
+ };
+#endif /* CONFIG_BLOCK */
+ struct ceph_bvec_iter bvec_pos;
+ struct {
+ struct page **pages;
+ size_t length; /* total # bytes */
+ unsigned int alignment; /* first page */
+ bool own_pages;
+ };
+ struct ceph_pagelist *pagelist;
+ };
+};
+
+struct ceph_msg_data_cursor {
+ size_t total_resid; /* across all data items */
+
+ struct ceph_msg_data *data; /* current data item */
+ size_t resid; /* bytes not yet consumed */
+ bool need_crc; /* crc update needed */
+ union {
+#ifdef CONFIG_BLOCK
+ struct ceph_bio_iter bio_iter;
+#endif /* CONFIG_BLOCK */
+ struct bvec_iter bvec_iter;
+ struct { /* pages */
+ unsigned int page_offset; /* offset in page */
+ unsigned short page_index; /* index in array */
+ unsigned short page_count; /* pages in array */
+ };
+ struct { /* pagelist */
+ struct page *page; /* page from list */
+ size_t offset; /* bytes from list */
+ };
+ };
+};
+
+/*
+ * a single message. it contains a header (src, dest, message type, etc.),
+ * footer (crc values, mainly), a "front" message body, and possibly a
+ * data payload (stored in some number of pages).
+ */
+struct ceph_msg {
+ struct ceph_msg_header hdr; /* header */
+ union {
+ struct ceph_msg_footer footer; /* footer */
+ struct ceph_msg_footer_old old_footer; /* old format footer */
+ };
+ struct kvec front; /* unaligned blobs of message */
+ struct ceph_buffer *middle;
+
+ size_t data_length;
+ struct ceph_msg_data *data;
+ int num_data_items;
+ int max_data_items;
+ struct ceph_msg_data_cursor cursor;
+
+ struct ceph_connection *con;
+ struct list_head list_head; /* links for connection lists */
+
+ struct kref kref;
+ bool more_to_follow;
+ bool needs_out_seq;
+ int front_alloc_len;
+
+ struct ceph_msgpool *pool;
+};
+
+/*
+ * connection states
+ */
+#define CEPH_CON_S_CLOSED 1
+#define CEPH_CON_S_PREOPEN 2
+#define CEPH_CON_S_V1_BANNER 3
+#define CEPH_CON_S_V1_CONNECT_MSG 4
+#define CEPH_CON_S_V2_BANNER_PREFIX 5
+#define CEPH_CON_S_V2_BANNER_PAYLOAD 6
+#define CEPH_CON_S_V2_HELLO 7
+#define CEPH_CON_S_V2_AUTH 8
+#define CEPH_CON_S_V2_AUTH_SIGNATURE 9
+#define CEPH_CON_S_V2_SESSION_CONNECT 10
+#define CEPH_CON_S_V2_SESSION_RECONNECT 11
+#define CEPH_CON_S_OPEN 12
+#define CEPH_CON_S_STANDBY 13
+
+/*
+ * ceph_connection flag bits
+ */
+#define CEPH_CON_F_LOSSYTX 0 /* we can close channel or drop
+ messages on errors */
+#define CEPH_CON_F_KEEPALIVE_PENDING 1 /* we need to send a keepalive */
+#define CEPH_CON_F_WRITE_PENDING 2 /* we have data ready to send */
+#define CEPH_CON_F_SOCK_CLOSED 3 /* socket state changed to closed */
+#define CEPH_CON_F_BACKOFF 4 /* need to retry queuing delayed
+ work */
+
+/* ceph connection fault delay defaults, for exponential backoff */
+#define BASE_DELAY_INTERVAL (HZ / 4)
+#define MAX_DELAY_INTERVAL (15 * HZ)
+
+struct ceph_connection_v1_info {
+ struct kvec out_kvec[8], /* sending header/footer data */
+ *out_kvec_cur;
+ int out_kvec_left; /* kvec's left in out_kvec */
+ int out_skip; /* skip this many bytes */
+ int out_kvec_bytes; /* total bytes left */
+ bool out_more; /* there is more data after the kvecs */
+ bool out_msg_done;
+
+ struct ceph_auth_handshake *auth;
+ int auth_retry; /* true if we need a newer authorizer */
+
+ /* connection negotiation temps */
+ u8 in_banner[CEPH_BANNER_MAX_LEN];
+ struct ceph_entity_addr actual_peer_addr;
+ struct ceph_entity_addr peer_addr_for_me;
+ struct ceph_msg_connect out_connect;
+ struct ceph_msg_connect_reply in_reply;
+
+ int in_base_pos; /* bytes read */
+
+ /* message in temps */
+ u8 in_tag; /* protocol control byte */
+ struct ceph_msg_header in_hdr;
+ __le64 in_temp_ack; /* for reading an ack */
+
+ /* message out temps */
+ struct ceph_msg_header out_hdr;
+ __le64 out_temp_ack; /* for writing an ack */
+ struct ceph_timespec out_temp_keepalive2; /* for writing keepalive2
+ stamp */
+
+ u32 connect_seq; /* identify the most recent connection
+ attempt for this session */
+ u32 peer_global_seq; /* peer's global seq for this connection */
+};
+
+#define CEPH_CRC_LEN 4
+#define CEPH_GCM_KEY_LEN 16
+#define CEPH_GCM_IV_LEN sizeof(struct ceph_gcm_nonce)
+#define CEPH_GCM_BLOCK_LEN 16
+#define CEPH_GCM_TAG_LEN 16
+
+#define CEPH_PREAMBLE_LEN 32
+#define CEPH_PREAMBLE_INLINE_LEN 48
+#define CEPH_PREAMBLE_PLAIN_LEN CEPH_PREAMBLE_LEN
+#define CEPH_PREAMBLE_SECURE_LEN (CEPH_PREAMBLE_LEN + \
+ CEPH_PREAMBLE_INLINE_LEN + \
+ CEPH_GCM_TAG_LEN)
+#define CEPH_EPILOGUE_PLAIN_LEN (1 + 3 * CEPH_CRC_LEN)
+#define CEPH_EPILOGUE_SECURE_LEN (CEPH_GCM_BLOCK_LEN + CEPH_GCM_TAG_LEN)
+
+#define CEPH_FRAME_MAX_SEGMENT_COUNT 4
+
+struct ceph_frame_desc {
+ int fd_tag; /* FRAME_TAG_* */
+ int fd_seg_cnt;
+ int fd_lens[CEPH_FRAME_MAX_SEGMENT_COUNT]; /* logical */
+ int fd_aligns[CEPH_FRAME_MAX_SEGMENT_COUNT];
+};
+
+struct ceph_gcm_nonce {
+ __le32 fixed;
+ __le64 counter __packed;
+};
+
+struct ceph_connection_v2_info {
+ struct iov_iter in_iter;
+ struct kvec in_kvecs[5]; /* recvmsg */
+ struct bio_vec in_bvec; /* recvmsg (in_cursor) */
+ int in_kvec_cnt;
+ int in_state; /* IN_S_* */
+
+ struct iov_iter out_iter;
+ struct kvec out_kvecs[8]; /* sendmsg */
+ struct bio_vec out_bvec; /* sendpage (out_cursor, out_zero),
+ sendmsg (out_enc_pages) */
+ int out_kvec_cnt;
+ int out_state; /* OUT_S_* */
+
+ int out_zero; /* # of zero bytes to send */
+ bool out_iter_sendpage; /* use sendpage if possible */
+
+ struct ceph_frame_desc in_desc;
+ struct ceph_msg_data_cursor in_cursor;
+ struct ceph_msg_data_cursor out_cursor;
+
+ struct crypto_shash *hmac_tfm; /* post-auth signature */
+ struct crypto_aead *gcm_tfm; /* on-wire encryption */
+ struct aead_request *gcm_req;
+ struct crypto_wait gcm_wait;
+ struct ceph_gcm_nonce in_gcm_nonce;
+ struct ceph_gcm_nonce out_gcm_nonce;
+
+ struct page **in_enc_pages;
+ int in_enc_page_cnt;
+ int in_enc_resid;
+ int in_enc_i;
+ struct page **out_enc_pages;
+ int out_enc_page_cnt;
+ int out_enc_resid;
+ int out_enc_i;
+
+ int con_mode; /* CEPH_CON_MODE_* */
+
+ void *conn_bufs[16];
+ int conn_buf_cnt;
+
+ struct kvec in_sign_kvecs[8];
+ struct kvec out_sign_kvecs[8];
+ int in_sign_kvec_cnt;
+ int out_sign_kvec_cnt;
+
+ u64 client_cookie;
+ u64 server_cookie;
+ u64 global_seq;
+ u64 connect_seq;
+ u64 peer_global_seq;
+
+ u8 in_buf[CEPH_PREAMBLE_SECURE_LEN];
+ u8 out_buf[CEPH_PREAMBLE_SECURE_LEN];
+ struct {
+ u8 late_status; /* FRAME_LATE_STATUS_* */
+ union {
+ struct {
+ u32 front_crc;
+ u32 middle_crc;
+ u32 data_crc;
+ } __packed;
+ u8 pad[CEPH_GCM_BLOCK_LEN - 1];
+ };
+ } out_epil;
+};
+
+/*
+ * A single connection with another host.
+ *
+ * We maintain a queue of outgoing messages, and some session state to
+ * ensure that we can preserve the lossless, ordered delivery of
+ * messages in the case of a TCP disconnect.
+ */
+struct ceph_connection {
+ void *private;
+
+ const struct ceph_connection_operations *ops;
+
+ struct ceph_messenger *msgr;
+
+ int state; /* CEPH_CON_S_* */
+ atomic_t sock_state;
+ struct socket *sock;
+
+ unsigned long flags; /* CEPH_CON_F_* */
+ const char *error_msg; /* error message, if any */
+
+ struct ceph_entity_name peer_name; /* peer name */
+ struct ceph_entity_addr peer_addr; /* peer address */
+ u64 peer_features;
+
+ struct mutex mutex;
+
+ /* out queue */
+ struct list_head out_queue;
+ struct list_head out_sent; /* sending or sent but unacked */
+ u64 out_seq; /* last message queued for send */
+
+ u64 in_seq, in_seq_acked; /* last message received, acked */
+
+ struct ceph_msg *in_msg;
+ struct ceph_msg *out_msg; /* sending message (== tail of
+ out_sent) */
+
+ struct page *bounce_page;
+ u32 in_front_crc, in_middle_crc, in_data_crc; /* calculated crc */
+
+ struct timespec64 last_keepalive_ack; /* keepalive2 ack stamp */
+
+ struct delayed_work work; /* send|recv work */
+ unsigned long delay; /* current delay interval */
+
+ union {
+ struct ceph_connection_v1_info v1;
+ struct ceph_connection_v2_info v2;
+ };
+};
+
+extern struct page *ceph_zero_page;
+
+void ceph_con_flag_clear(struct ceph_connection *con, unsigned long con_flag);
+void ceph_con_flag_set(struct ceph_connection *con, unsigned long con_flag);
+bool ceph_con_flag_test(struct ceph_connection *con, unsigned long con_flag);
+bool ceph_con_flag_test_and_clear(struct ceph_connection *con,
+ unsigned long con_flag);
+bool ceph_con_flag_test_and_set(struct ceph_connection *con,
+ unsigned long con_flag);
+
+void ceph_encode_my_addr(struct ceph_messenger *msgr);
+
+int ceph_tcp_connect(struct ceph_connection *con);
+int ceph_con_close_socket(struct ceph_connection *con);
+void ceph_con_reset_session(struct ceph_connection *con);
+
+u32 ceph_get_global_seq(struct ceph_messenger *msgr, u32 gt);
+void ceph_con_discard_sent(struct ceph_connection *con, u64 ack_seq);
+void ceph_con_discard_requeued(struct ceph_connection *con, u64 reconnect_seq);
+
+void ceph_msg_data_cursor_init(struct ceph_msg_data_cursor *cursor,
+ struct ceph_msg *msg, size_t length);
+struct page *ceph_msg_data_next(struct ceph_msg_data_cursor *cursor,
+ size_t *page_offset, size_t *length);
+void ceph_msg_data_advance(struct ceph_msg_data_cursor *cursor, size_t bytes);
+
+u32 ceph_crc32c_page(u32 crc, struct page *page, unsigned int page_offset,
+ unsigned int length);
+
+bool ceph_addr_is_blank(const struct ceph_entity_addr *addr);
+int ceph_addr_port(const struct ceph_entity_addr *addr);
+void ceph_addr_set_port(struct ceph_entity_addr *addr, int p);
+
+void ceph_con_process_message(struct ceph_connection *con);
+int ceph_con_in_msg_alloc(struct ceph_connection *con,
+ struct ceph_msg_header *hdr, int *skip);
+void ceph_con_get_out_msg(struct ceph_connection *con);
+
+/* messenger_v1.c */
+int ceph_con_v1_try_read(struct ceph_connection *con);
+int ceph_con_v1_try_write(struct ceph_connection *con);
+void ceph_con_v1_revoke(struct ceph_connection *con);
+void ceph_con_v1_revoke_incoming(struct ceph_connection *con);
+bool ceph_con_v1_opened(struct ceph_connection *con);
+void ceph_con_v1_reset_session(struct ceph_connection *con);
+void ceph_con_v1_reset_protocol(struct ceph_connection *con);
+
+/* messenger_v2.c */
+int ceph_con_v2_try_read(struct ceph_connection *con);
+int ceph_con_v2_try_write(struct ceph_connection *con);
+void ceph_con_v2_revoke(struct ceph_connection *con);
+void ceph_con_v2_revoke_incoming(struct ceph_connection *con);
+bool ceph_con_v2_opened(struct ceph_connection *con);
+void ceph_con_v2_reset_session(struct ceph_connection *con);
+void ceph_con_v2_reset_protocol(struct ceph_connection *con);
+
+
+extern const char *ceph_pr_addr(const struct ceph_entity_addr *addr);
+
+extern int ceph_parse_ips(const char *c, const char *end,
+ struct ceph_entity_addr *addr,
+ int max_count, int *count, char delim);
+
+extern int ceph_msgr_init(void);
+extern void ceph_msgr_exit(void);
+extern void ceph_msgr_flush(void);
+
+extern void ceph_messenger_init(struct ceph_messenger *msgr,
+ struct ceph_entity_addr *myaddr);
+extern void ceph_messenger_fini(struct ceph_messenger *msgr);
+extern void ceph_messenger_reset_nonce(struct ceph_messenger *msgr);
+
+extern void ceph_con_init(struct ceph_connection *con, void *private,
+ const struct ceph_connection_operations *ops,
+ struct ceph_messenger *msgr);
+extern void ceph_con_open(struct ceph_connection *con,
+ __u8 entity_type, __u64 entity_num,
+ struct ceph_entity_addr *addr);
+extern bool ceph_con_opened(struct ceph_connection *con);
+extern void ceph_con_close(struct ceph_connection *con);
+extern void ceph_con_send(struct ceph_connection *con, struct ceph_msg *msg);
+
+extern void ceph_msg_revoke(struct ceph_msg *msg);
+extern void ceph_msg_revoke_incoming(struct ceph_msg *msg);
+
+extern void ceph_con_keepalive(struct ceph_connection *con);
+extern bool ceph_con_keepalive_expired(struct ceph_connection *con,
+ unsigned long interval);
+
+void ceph_msg_data_add_pages(struct ceph_msg *msg, struct page **pages,
+ size_t length, size_t alignment, bool own_pages);
+extern void ceph_msg_data_add_pagelist(struct ceph_msg *msg,
+ struct ceph_pagelist *pagelist);
+#ifdef CONFIG_BLOCK
+void ceph_msg_data_add_bio(struct ceph_msg *msg, struct ceph_bio_iter *bio_pos,
+ u32 length);
+#endif /* CONFIG_BLOCK */
+void ceph_msg_data_add_bvecs(struct ceph_msg *msg,
+ struct ceph_bvec_iter *bvec_pos);
+
+struct ceph_msg *ceph_msg_new2(int type, int front_len, int max_data_items,
+ gfp_t flags, bool can_fail);
+extern struct ceph_msg *ceph_msg_new(int type, int front_len, gfp_t flags,
+ bool can_fail);
+
+extern struct ceph_msg *ceph_msg_get(struct ceph_msg *msg);
+extern void ceph_msg_put(struct ceph_msg *msg);
+
+extern void ceph_msg_dump(struct ceph_msg *msg);
+
+#endif
diff --git a/include/linux/ceph/mon_client.h b/include/linux/ceph/mon_client.h
new file mode 100644
index 000000000..b65896115
--- /dev/null
+++ b/include/linux/ceph/mon_client.h
@@ -0,0 +1,152 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _FS_CEPH_MON_CLIENT_H
+#define _FS_CEPH_MON_CLIENT_H
+
+#include <linux/completion.h>
+#include <linux/kref.h>
+#include <linux/rbtree.h>
+
+#include <linux/ceph/messenger.h>
+
+struct ceph_client;
+struct ceph_mount_args;
+struct ceph_auth_client;
+
+/*
+ * The monitor map enumerates the set of all monitors.
+ */
+struct ceph_monmap {
+ struct ceph_fsid fsid;
+ u32 epoch;
+ u32 num_mon;
+ struct ceph_entity_inst mon_inst[];
+};
+
+struct ceph_mon_client;
+struct ceph_mon_generic_request;
+
+
+/*
+ * Generic mechanism for resending monitor requests.
+ */
+typedef void (*ceph_monc_request_func_t)(struct ceph_mon_client *monc,
+ int newmon);
+
+/* a pending monitor request */
+struct ceph_mon_request {
+ struct ceph_mon_client *monc;
+ struct delayed_work delayed_work;
+ unsigned long delay;
+ ceph_monc_request_func_t do_request;
+};
+
+typedef void (*ceph_monc_callback_t)(struct ceph_mon_generic_request *);
+
+/*
+ * ceph_mon_generic_request is being used for the statfs and
+ * mon_get_version requests which are being done a bit differently
+ * because we need to get data back to the caller
+ */
+struct ceph_mon_generic_request {
+ struct ceph_mon_client *monc;
+ struct kref kref;
+ u64 tid;
+ struct rb_node node;
+ int result;
+
+ struct completion completion;
+ ceph_monc_callback_t complete_cb;
+ u64 private_data; /* r_tid/linger_id */
+
+ struct ceph_msg *request; /* original request */
+ struct ceph_msg *reply; /* and reply */
+
+ union {
+ struct ceph_statfs *st;
+ u64 newest;
+ } u;
+};
+
+struct ceph_mon_client {
+ struct ceph_client *client;
+ struct ceph_monmap *monmap;
+
+ struct mutex mutex;
+ struct delayed_work delayed_work;
+
+ struct ceph_auth_client *auth;
+ struct ceph_msg *m_auth, *m_auth_reply, *m_subscribe, *m_subscribe_ack;
+ int pending_auth;
+
+ bool hunting;
+ int cur_mon; /* last monitor i contacted */
+ unsigned long sub_renew_after;
+ unsigned long sub_renew_sent;
+ struct ceph_connection con;
+
+ bool had_a_connection;
+ int hunt_mult; /* [1..CEPH_MONC_HUNT_MAX_MULT] */
+
+ /* pending generic requests */
+ struct rb_root generic_request_tree;
+ u64 last_tid;
+
+ /* subs, indexed with CEPH_SUB_* */
+ struct {
+ struct ceph_mon_subscribe_item item;
+ bool want;
+ u32 have; /* epoch */
+ } subs[4];
+ int fs_cluster_id; /* "mdsmap.<id>" sub */
+
+#ifdef CONFIG_DEBUG_FS
+ struct dentry *debugfs_file;
+#endif
+};
+
+extern int ceph_monmap_contains(struct ceph_monmap *m,
+ struct ceph_entity_addr *addr);
+
+extern int ceph_monc_init(struct ceph_mon_client *monc, struct ceph_client *cl);
+extern void ceph_monc_stop(struct ceph_mon_client *monc);
+extern void ceph_monc_reopen_session(struct ceph_mon_client *monc);
+
+enum {
+ CEPH_SUB_MONMAP = 0,
+ CEPH_SUB_OSDMAP,
+ CEPH_SUB_FSMAP,
+ CEPH_SUB_MDSMAP,
+};
+
+extern const char *ceph_sub_str[];
+
+/*
+ * The model here is to indicate that we need a new map of at least
+ * epoch @epoch, and also call in when we receive a map. We will
+ * periodically rerequest the map from the monitor cluster until we
+ * get what we want.
+ */
+bool ceph_monc_want_map(struct ceph_mon_client *monc, int sub, u32 epoch,
+ bool continuous);
+void ceph_monc_got_map(struct ceph_mon_client *monc, int sub, u32 epoch);
+void ceph_monc_renew_subs(struct ceph_mon_client *monc);
+
+extern int ceph_monc_wait_osdmap(struct ceph_mon_client *monc, u32 epoch,
+ unsigned long timeout);
+
+int ceph_monc_do_statfs(struct ceph_mon_client *monc, u64 data_pool,
+ struct ceph_statfs *buf);
+
+int ceph_monc_get_version(struct ceph_mon_client *monc, const char *what,
+ u64 *newest);
+int ceph_monc_get_version_async(struct ceph_mon_client *monc, const char *what,
+ ceph_monc_callback_t cb, u64 private_data);
+
+int ceph_monc_blocklist_add(struct ceph_mon_client *monc,
+ struct ceph_entity_addr *client_addr);
+
+extern int ceph_monc_open_session(struct ceph_mon_client *monc);
+
+extern int ceph_monc_validate_auth(struct ceph_mon_client *monc);
+
+#endif
diff --git a/include/linux/ceph/msgpool.h b/include/linux/ceph/msgpool.h
new file mode 100644
index 000000000..729cdf700
--- /dev/null
+++ b/include/linux/ceph/msgpool.h
@@ -0,0 +1,27 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _FS_CEPH_MSGPOOL
+#define _FS_CEPH_MSGPOOL
+
+#include <linux/mempool.h>
+
+/*
+ * we use memory pools for preallocating messages we may receive, to
+ * avoid unexpected OOM conditions.
+ */
+struct ceph_msgpool {
+ const char *name;
+ mempool_t *pool;
+ int type; /* preallocated message type */
+ int front_len; /* preallocated payload size */
+ int max_data_items;
+};
+
+int ceph_msgpool_init(struct ceph_msgpool *pool, int type,
+ int front_len, int max_data_items, int size,
+ const char *name);
+extern void ceph_msgpool_destroy(struct ceph_msgpool *pool);
+struct ceph_msg *ceph_msgpool_get(struct ceph_msgpool *pool, int front_len,
+ int max_data_items);
+extern void ceph_msgpool_put(struct ceph_msgpool *, struct ceph_msg *);
+
+#endif
diff --git a/include/linux/ceph/msgr.h b/include/linux/ceph/msgr.h
new file mode 100644
index 000000000..3989dcb94
--- /dev/null
+++ b/include/linux/ceph/msgr.h
@@ -0,0 +1,234 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef CEPH_MSGR_H
+#define CEPH_MSGR_H
+
+/*
+ * Data types for message passing layer used by Ceph.
+ */
+
+#define CEPH_MON_PORT 6789 /* default monitor port */
+
+/*
+ * tcp connection banner. include a protocol version. and adjust
+ * whenever the wire protocol changes. try to keep this string length
+ * constant.
+ */
+#define CEPH_BANNER "ceph v027"
+#define CEPH_BANNER_LEN 9
+#define CEPH_BANNER_MAX_LEN 30
+
+
+/*
+ * messenger V2 connection banner prefix.
+ * The full banner string should have the form: "ceph v2\n<le16>"
+ * the 2 bytes are the length of the remaining banner.
+ */
+#define CEPH_BANNER_V2 "ceph v2\n"
+#define CEPH_BANNER_V2_LEN 8
+#define CEPH_BANNER_V2_PREFIX_LEN (CEPH_BANNER_V2_LEN + sizeof(__le16))
+
+/*
+ * messenger V2 features
+ */
+#define CEPH_MSGR2_INCARNATION_1 (0ull)
+
+#define DEFINE_MSGR2_FEATURE(bit, incarnation, name) \
+ static const uint64_t __maybe_unused CEPH_MSGR2_FEATURE_##name = (1ULL << bit); \
+ static const uint64_t __maybe_unused CEPH_MSGR2_FEATUREMASK_##name = \
+ (1ULL << bit | CEPH_MSGR2_INCARNATION_##incarnation);
+
+#define HAVE_MSGR2_FEATURE(x, name) \
+ (((x) & (CEPH_MSGR2_FEATUREMASK_##name)) == (CEPH_MSGR2_FEATUREMASK_##name))
+
+DEFINE_MSGR2_FEATURE( 0, 1, REVISION_1) // msgr2.1
+
+#define CEPH_MSGR2_SUPPORTED_FEATURES (CEPH_MSGR2_FEATURE_REVISION_1)
+
+#define CEPH_MSGR2_REQUIRED_FEATURES (CEPH_MSGR2_FEATURE_REVISION_1)
+
+
+/*
+ * Rollover-safe type and comparator for 32-bit sequence numbers.
+ * Comparator returns -1, 0, or 1.
+ */
+typedef __u32 ceph_seq_t;
+
+static inline __s32 ceph_seq_cmp(__u32 a, __u32 b)
+{
+ return (__s32)a - (__s32)b;
+}
+
+
+/*
+ * entity_name -- logical name for a process participating in the
+ * network, e.g. 'mds0' or 'osd3'.
+ */
+struct ceph_entity_name {
+ __u8 type; /* CEPH_ENTITY_TYPE_* */
+ __le64 num;
+} __attribute__ ((packed));
+
+#define CEPH_ENTITY_TYPE_MON 0x01
+#define CEPH_ENTITY_TYPE_MDS 0x02
+#define CEPH_ENTITY_TYPE_OSD 0x04
+#define CEPH_ENTITY_TYPE_CLIENT 0x08
+#define CEPH_ENTITY_TYPE_AUTH 0x20
+
+#define CEPH_ENTITY_TYPE_ANY 0xFF
+
+extern const char *ceph_entity_type_name(int type);
+
+/*
+ * entity_addr -- network address
+ */
+struct ceph_entity_addr {
+ __le32 type; /* CEPH_ENTITY_ADDR_TYPE_* */
+ __le32 nonce; /* unique id for process (e.g. pid) */
+ struct sockaddr_storage in_addr;
+} __attribute__ ((packed));
+
+static inline bool ceph_addr_equal_no_type(const struct ceph_entity_addr *lhs,
+ const struct ceph_entity_addr *rhs)
+{
+ return !memcmp(&lhs->in_addr, &rhs->in_addr, sizeof(lhs->in_addr)) &&
+ lhs->nonce == rhs->nonce;
+}
+
+struct ceph_entity_inst {
+ struct ceph_entity_name name;
+ struct ceph_entity_addr addr;
+} __attribute__ ((packed));
+
+
+/* used by message exchange protocol */
+#define CEPH_MSGR_TAG_READY 1 /* server->client: ready for messages */
+#define CEPH_MSGR_TAG_RESETSESSION 2 /* server->client: reset, try again */
+#define CEPH_MSGR_TAG_WAIT 3 /* server->client: wait for racing
+ incoming connection */
+#define CEPH_MSGR_TAG_RETRY_SESSION 4 /* server->client + cseq: try again
+ with higher cseq */
+#define CEPH_MSGR_TAG_RETRY_GLOBAL 5 /* server->client + gseq: try again
+ with higher gseq */
+#define CEPH_MSGR_TAG_CLOSE 6 /* closing pipe */
+#define CEPH_MSGR_TAG_MSG 7 /* message */
+#define CEPH_MSGR_TAG_ACK 8 /* message ack */
+#define CEPH_MSGR_TAG_KEEPALIVE 9 /* just a keepalive byte! */
+#define CEPH_MSGR_TAG_BADPROTOVER 10 /* bad protocol version */
+#define CEPH_MSGR_TAG_BADAUTHORIZER 11 /* bad authorizer */
+#define CEPH_MSGR_TAG_FEATURES 12 /* insufficient features */
+#define CEPH_MSGR_TAG_SEQ 13 /* 64-bit int follows with seen seq number */
+#define CEPH_MSGR_TAG_KEEPALIVE2 14 /* keepalive2 byte + ceph_timespec */
+#define CEPH_MSGR_TAG_KEEPALIVE2_ACK 15 /* keepalive2 reply */
+#define CEPH_MSGR_TAG_CHALLENGE_AUTHORIZER 16 /* cephx v2 doing server challenge */
+
+/*
+ * connection negotiation
+ */
+struct ceph_msg_connect {
+ __le64 features; /* supported feature bits */
+ __le32 host_type; /* CEPH_ENTITY_TYPE_* */
+ __le32 global_seq; /* count connections initiated by this host */
+ __le32 connect_seq; /* count connections initiated in this session */
+ __le32 protocol_version;
+ __le32 authorizer_protocol;
+ __le32 authorizer_len;
+ __u8 flags; /* CEPH_MSG_CONNECT_* */
+} __attribute__ ((packed));
+
+struct ceph_msg_connect_reply {
+ __u8 tag;
+ __le64 features; /* feature bits for this session */
+ __le32 global_seq;
+ __le32 connect_seq;
+ __le32 protocol_version;
+ __le32 authorizer_len;
+ __u8 flags;
+} __attribute__ ((packed));
+
+#define CEPH_MSG_CONNECT_LOSSY 1 /* messages i send may be safely dropped */
+
+
+/*
+ * message header
+ */
+struct ceph_msg_header_old {
+ __le64 seq; /* message seq# for this session */
+ __le64 tid; /* transaction id */
+ __le16 type; /* message type */
+ __le16 priority; /* priority. higher value == higher priority */
+ __le16 version; /* version of message encoding */
+
+ __le32 front_len; /* bytes in main payload */
+ __le32 middle_len;/* bytes in middle payload */
+ __le32 data_len; /* bytes of data payload */
+ __le16 data_off; /* sender: include full offset;
+ receiver: mask against ~PAGE_MASK */
+
+ struct ceph_entity_inst src, orig_src;
+ __le32 reserved;
+ __le32 crc; /* header crc32c */
+} __attribute__ ((packed));
+
+struct ceph_msg_header {
+ __le64 seq; /* message seq# for this session */
+ __le64 tid; /* transaction id */
+ __le16 type; /* message type */
+ __le16 priority; /* priority. higher value == higher priority */
+ __le16 version; /* version of message encoding */
+
+ __le32 front_len; /* bytes in main payload */
+ __le32 middle_len;/* bytes in middle payload */
+ __le32 data_len; /* bytes of data payload */
+ __le16 data_off; /* sender: include full offset;
+ receiver: mask against ~PAGE_MASK */
+
+ struct ceph_entity_name src;
+ __le16 compat_version;
+ __le16 reserved;
+ __le32 crc; /* header crc32c */
+} __attribute__ ((packed));
+
+struct ceph_msg_header2 {
+ __le64 seq; /* message seq# for this session */
+ __le64 tid; /* transaction id */
+ __le16 type; /* message type */
+ __le16 priority; /* priority. higher value == higher priority */
+ __le16 version; /* version of message encoding */
+
+ __le32 data_pre_padding_len;
+ __le16 data_off; /* sender: include full offset;
+ receiver: mask against ~PAGE_MASK */
+
+ __le64 ack_seq;
+ __u8 flags;
+ /* oldest code we think can decode this. unknown if zero. */
+ __le16 compat_version;
+ __le16 reserved;
+} __attribute__ ((packed));
+
+#define CEPH_MSG_PRIO_LOW 64
+#define CEPH_MSG_PRIO_DEFAULT 127
+#define CEPH_MSG_PRIO_HIGH 196
+#define CEPH_MSG_PRIO_HIGHEST 255
+
+/*
+ * follows data payload
+ */
+struct ceph_msg_footer_old {
+ __le32 front_crc, middle_crc, data_crc;
+ __u8 flags;
+} __attribute__ ((packed));
+
+struct ceph_msg_footer {
+ __le32 front_crc, middle_crc, data_crc;
+ // sig holds the 64 bits of the digital signature for the message PLR
+ __le64 sig;
+ __u8 flags;
+} __attribute__ ((packed));
+
+#define CEPH_MSG_FOOTER_COMPLETE (1<<0) /* msg wasn't aborted */
+#define CEPH_MSG_FOOTER_NOCRC (1<<1) /* no data crc */
+#define CEPH_MSG_FOOTER_SIGNED (1<<2) /* msg was signed */
+
+
+#endif
diff --git a/include/linux/ceph/osd_client.h b/include/linux/ceph/osd_client.h
new file mode 100644
index 000000000..fb6be7210
--- /dev/null
+++ b/include/linux/ceph/osd_client.h
@@ -0,0 +1,562 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _FS_CEPH_OSD_CLIENT_H
+#define _FS_CEPH_OSD_CLIENT_H
+
+#include <linux/bitrev.h>
+#include <linux/completion.h>
+#include <linux/kref.h>
+#include <linux/mempool.h>
+#include <linux/rbtree.h>
+#include <linux/refcount.h>
+#include <linux/ktime.h>
+
+#include <linux/ceph/types.h>
+#include <linux/ceph/osdmap.h>
+#include <linux/ceph/messenger.h>
+#include <linux/ceph/msgpool.h>
+#include <linux/ceph/auth.h>
+#include <linux/ceph/pagelist.h>
+
+struct ceph_msg;
+struct ceph_snap_context;
+struct ceph_osd_request;
+struct ceph_osd_client;
+
+/*
+ * completion callback for async writepages
+ */
+typedef void (*ceph_osdc_callback_t)(struct ceph_osd_request *);
+
+#define CEPH_HOMELESS_OSD -1
+
+/* a given osd we're communicating with */
+struct ceph_osd {
+ refcount_t o_ref;
+ struct ceph_osd_client *o_osdc;
+ int o_osd;
+ int o_incarnation;
+ struct rb_node o_node;
+ struct ceph_connection o_con;
+ struct rb_root o_requests;
+ struct rb_root o_linger_requests;
+ struct rb_root o_backoff_mappings;
+ struct rb_root o_backoffs_by_id;
+ struct list_head o_osd_lru;
+ struct ceph_auth_handshake o_auth;
+ unsigned long lru_ttl;
+ struct list_head o_keepalive_item;
+ struct mutex lock;
+};
+
+#define CEPH_OSD_SLAB_OPS 2
+#define CEPH_OSD_MAX_OPS 16
+
+enum ceph_osd_data_type {
+ CEPH_OSD_DATA_TYPE_NONE = 0,
+ CEPH_OSD_DATA_TYPE_PAGES,
+ CEPH_OSD_DATA_TYPE_PAGELIST,
+#ifdef CONFIG_BLOCK
+ CEPH_OSD_DATA_TYPE_BIO,
+#endif /* CONFIG_BLOCK */
+ CEPH_OSD_DATA_TYPE_BVECS,
+};
+
+struct ceph_osd_data {
+ enum ceph_osd_data_type type;
+ union {
+ struct {
+ struct page **pages;
+ u64 length;
+ u32 alignment;
+ bool pages_from_pool;
+ bool own_pages;
+ };
+ struct ceph_pagelist *pagelist;
+#ifdef CONFIG_BLOCK
+ struct {
+ struct ceph_bio_iter bio_pos;
+ u32 bio_length;
+ };
+#endif /* CONFIG_BLOCK */
+ struct {
+ struct ceph_bvec_iter bvec_pos;
+ u32 num_bvecs;
+ };
+ };
+};
+
+struct ceph_osd_req_op {
+ u16 op; /* CEPH_OSD_OP_* */
+ u32 flags; /* CEPH_OSD_OP_FLAG_* */
+ u32 indata_len; /* request */
+ u32 outdata_len; /* reply */
+ s32 rval;
+
+ union {
+ struct ceph_osd_data raw_data_in;
+ struct {
+ u64 offset, length;
+ u64 truncate_size;
+ u32 truncate_seq;
+ struct ceph_osd_data osd_data;
+ } extent;
+ struct {
+ u32 name_len;
+ u32 value_len;
+ __u8 cmp_op; /* CEPH_OSD_CMPXATTR_OP_* */
+ __u8 cmp_mode; /* CEPH_OSD_CMPXATTR_MODE_* */
+ struct ceph_osd_data osd_data;
+ } xattr;
+ struct {
+ const char *class_name;
+ const char *method_name;
+ struct ceph_osd_data request_info;
+ struct ceph_osd_data request_data;
+ struct ceph_osd_data response_data;
+ __u8 class_len;
+ __u8 method_len;
+ u32 indata_len;
+ } cls;
+ struct {
+ u64 cookie;
+ __u8 op; /* CEPH_OSD_WATCH_OP_ */
+ u32 gen;
+ } watch;
+ struct {
+ struct ceph_osd_data request_data;
+ } notify_ack;
+ struct {
+ u64 cookie;
+ struct ceph_osd_data request_data;
+ struct ceph_osd_data response_data;
+ } notify;
+ struct {
+ struct ceph_osd_data response_data;
+ } list_watchers;
+ struct {
+ u64 expected_object_size;
+ u64 expected_write_size;
+ u32 flags; /* CEPH_OSD_OP_ALLOC_HINT_FLAG_* */
+ } alloc_hint;
+ struct {
+ u64 snapid;
+ u64 src_version;
+ u8 flags;
+ u32 src_fadvise_flags;
+ struct ceph_osd_data osd_data;
+ } copy_from;
+ };
+};
+
+struct ceph_osd_request_target {
+ struct ceph_object_id base_oid;
+ struct ceph_object_locator base_oloc;
+ struct ceph_object_id target_oid;
+ struct ceph_object_locator target_oloc;
+
+ struct ceph_pg pgid; /* last raw pg we mapped to */
+ struct ceph_spg spgid; /* last actual spg we mapped to */
+ u32 pg_num;
+ u32 pg_num_mask;
+ struct ceph_osds acting;
+ struct ceph_osds up;
+ int size;
+ int min_size;
+ bool sort_bitwise;
+ bool recovery_deletes;
+
+ unsigned int flags; /* CEPH_OSD_FLAG_* */
+ bool used_replica;
+ bool paused;
+
+ u32 epoch;
+ u32 last_force_resend;
+
+ int osd;
+};
+
+/* an in-flight request */
+struct ceph_osd_request {
+ u64 r_tid; /* unique for this client */
+ struct rb_node r_node;
+ struct rb_node r_mc_node; /* map check */
+ struct work_struct r_complete_work;
+ struct ceph_osd *r_osd;
+
+ struct ceph_osd_request_target r_t;
+#define r_base_oid r_t.base_oid
+#define r_base_oloc r_t.base_oloc
+#define r_flags r_t.flags
+
+ struct ceph_msg *r_request, *r_reply;
+ u32 r_sent; /* >0 if r_request is sending/sent */
+
+ /* request osd ops array */
+ unsigned int r_num_ops;
+
+ int r_result;
+
+ struct ceph_osd_client *r_osdc;
+ struct kref r_kref;
+ bool r_mempool;
+ struct completion r_completion; /* private to osd_client.c */
+ ceph_osdc_callback_t r_callback;
+
+ struct inode *r_inode; /* for use by callbacks */
+ struct list_head r_private_item; /* ditto */
+ void *r_priv; /* ditto */
+
+ /* set by submitter */
+ u64 r_snapid; /* for reads, CEPH_NOSNAP o/w */
+ struct ceph_snap_context *r_snapc; /* for writes */
+ struct timespec64 r_mtime; /* ditto */
+ u64 r_data_offset; /* ditto */
+ bool r_linger; /* don't resend on failure */
+
+ /* internal */
+ unsigned long r_stamp; /* jiffies, send or check time */
+ unsigned long r_start_stamp; /* jiffies */
+ ktime_t r_start_latency; /* ktime_t */
+ ktime_t r_end_latency; /* ktime_t */
+ int r_attempts;
+ u32 r_map_dne_bound;
+
+ struct ceph_osd_req_op r_ops[];
+};
+
+struct ceph_request_redirect {
+ struct ceph_object_locator oloc;
+};
+
+/*
+ * osd request identifier
+ *
+ * caller name + incarnation# + tid to unique identify this request
+ */
+struct ceph_osd_reqid {
+ struct ceph_entity_name name;
+ __le64 tid;
+ __le32 inc;
+} __packed;
+
+struct ceph_blkin_trace_info {
+ __le64 trace_id;
+ __le64 span_id;
+ __le64 parent_span_id;
+} __packed;
+
+typedef void (*rados_watchcb2_t)(void *arg, u64 notify_id, u64 cookie,
+ u64 notifier_id, void *data, size_t data_len);
+typedef void (*rados_watcherrcb_t)(void *arg, u64 cookie, int err);
+
+struct ceph_osd_linger_request {
+ struct ceph_osd_client *osdc;
+ u64 linger_id;
+ bool committed;
+ bool is_watch; /* watch or notify */
+
+ struct ceph_osd *osd;
+ struct ceph_osd_request *reg_req;
+ struct ceph_osd_request *ping_req;
+ unsigned long ping_sent;
+ unsigned long watch_valid_thru;
+ struct list_head pending_lworks;
+
+ struct ceph_osd_request_target t;
+ u32 map_dne_bound;
+
+ struct timespec64 mtime;
+
+ struct kref kref;
+ struct mutex lock;
+ struct rb_node node; /* osd */
+ struct rb_node osdc_node; /* osdc */
+ struct rb_node mc_node; /* map check */
+ struct list_head scan_item;
+
+ struct completion reg_commit_wait;
+ struct completion notify_finish_wait;
+ int reg_commit_error;
+ int notify_finish_error;
+ int last_error;
+
+ u32 register_gen;
+ u64 notify_id;
+
+ rados_watchcb2_t wcb;
+ rados_watcherrcb_t errcb;
+ void *data;
+
+ struct ceph_pagelist *request_pl;
+ struct page **notify_id_pages;
+
+ struct page ***preply_pages;
+ size_t *preply_len;
+};
+
+struct ceph_watch_item {
+ struct ceph_entity_name name;
+ u64 cookie;
+ struct ceph_entity_addr addr;
+};
+
+struct ceph_spg_mapping {
+ struct rb_node node;
+ struct ceph_spg spgid;
+
+ struct rb_root backoffs;
+};
+
+struct ceph_hobject_id {
+ void *key;
+ size_t key_len;
+ void *oid;
+ size_t oid_len;
+ u64 snapid;
+ u32 hash;
+ u8 is_max;
+ void *nspace;
+ size_t nspace_len;
+ s64 pool;
+
+ /* cache */
+ u32 hash_reverse_bits;
+};
+
+static inline void ceph_hoid_build_hash_cache(struct ceph_hobject_id *hoid)
+{
+ hoid->hash_reverse_bits = bitrev32(hoid->hash);
+}
+
+/*
+ * PG-wide backoff: [begin, end)
+ * per-object backoff: begin == end
+ */
+struct ceph_osd_backoff {
+ struct rb_node spg_node;
+ struct rb_node id_node;
+
+ struct ceph_spg spgid;
+ u64 id;
+ struct ceph_hobject_id *begin;
+ struct ceph_hobject_id *end;
+};
+
+#define CEPH_LINGER_ID_START 0xffff000000000000ULL
+
+struct ceph_osd_client {
+ struct ceph_client *client;
+
+ struct ceph_osdmap *osdmap; /* current map */
+ struct rw_semaphore lock;
+
+ struct rb_root osds; /* osds */
+ struct list_head osd_lru; /* idle osds */
+ spinlock_t osd_lru_lock;
+ u32 epoch_barrier;
+ struct ceph_osd homeless_osd;
+ atomic64_t last_tid; /* tid of last request */
+ u64 last_linger_id;
+ struct rb_root linger_requests; /* lingering requests */
+ struct rb_root map_checks;
+ struct rb_root linger_map_checks;
+ atomic_t num_requests;
+ atomic_t num_homeless;
+ int abort_err;
+ struct delayed_work timeout_work;
+ struct delayed_work osds_timeout_work;
+#ifdef CONFIG_DEBUG_FS
+ struct dentry *debugfs_file;
+#endif
+
+ mempool_t *req_mempool;
+
+ struct ceph_msgpool msgpool_op;
+ struct ceph_msgpool msgpool_op_reply;
+
+ struct workqueue_struct *notify_wq;
+ struct workqueue_struct *completion_wq;
+};
+
+static inline bool ceph_osdmap_flag(struct ceph_osd_client *osdc, int flag)
+{
+ return osdc->osdmap->flags & flag;
+}
+
+extern int ceph_osdc_setup(void);
+extern void ceph_osdc_cleanup(void);
+
+extern int ceph_osdc_init(struct ceph_osd_client *osdc,
+ struct ceph_client *client);
+extern void ceph_osdc_stop(struct ceph_osd_client *osdc);
+extern void ceph_osdc_reopen_osds(struct ceph_osd_client *osdc);
+
+extern void ceph_osdc_handle_reply(struct ceph_osd_client *osdc,
+ struct ceph_msg *msg);
+extern void ceph_osdc_handle_map(struct ceph_osd_client *osdc,
+ struct ceph_msg *msg);
+void ceph_osdc_update_epoch_barrier(struct ceph_osd_client *osdc, u32 eb);
+void ceph_osdc_abort_requests(struct ceph_osd_client *osdc, int err);
+void ceph_osdc_clear_abort_err(struct ceph_osd_client *osdc);
+
+#define osd_req_op_data(oreq, whch, typ, fld) \
+({ \
+ struct ceph_osd_request *__oreq = (oreq); \
+ unsigned int __whch = (whch); \
+ BUG_ON(__whch >= __oreq->r_num_ops); \
+ &__oreq->r_ops[__whch].typ.fld; \
+})
+
+struct ceph_osd_req_op *osd_req_op_init(struct ceph_osd_request *osd_req,
+ unsigned int which, u16 opcode, u32 flags);
+
+extern void osd_req_op_raw_data_in_pages(struct ceph_osd_request *,
+ unsigned int which,
+ struct page **pages, u64 length,
+ u32 alignment, bool pages_from_pool,
+ bool own_pages);
+
+extern void osd_req_op_extent_init(struct ceph_osd_request *osd_req,
+ unsigned int which, u16 opcode,
+ u64 offset, u64 length,
+ u64 truncate_size, u32 truncate_seq);
+extern void osd_req_op_extent_update(struct ceph_osd_request *osd_req,
+ unsigned int which, u64 length);
+extern void osd_req_op_extent_dup_last(struct ceph_osd_request *osd_req,
+ unsigned int which, u64 offset_inc);
+
+extern struct ceph_osd_data *osd_req_op_extent_osd_data(
+ struct ceph_osd_request *osd_req,
+ unsigned int which);
+
+extern void osd_req_op_extent_osd_data_pages(struct ceph_osd_request *,
+ unsigned int which,
+ struct page **pages, u64 length,
+ u32 alignment, bool pages_from_pool,
+ bool own_pages);
+extern void osd_req_op_extent_osd_data_pagelist(struct ceph_osd_request *,
+ unsigned int which,
+ struct ceph_pagelist *pagelist);
+#ifdef CONFIG_BLOCK
+void osd_req_op_extent_osd_data_bio(struct ceph_osd_request *osd_req,
+ unsigned int which,
+ struct ceph_bio_iter *bio_pos,
+ u32 bio_length);
+#endif /* CONFIG_BLOCK */
+void osd_req_op_extent_osd_data_bvecs(struct ceph_osd_request *osd_req,
+ unsigned int which,
+ struct bio_vec *bvecs, u32 num_bvecs,
+ u32 bytes);
+void osd_req_op_extent_osd_data_bvec_pos(struct ceph_osd_request *osd_req,
+ unsigned int which,
+ struct ceph_bvec_iter *bvec_pos);
+
+extern void osd_req_op_cls_request_data_pagelist(struct ceph_osd_request *,
+ unsigned int which,
+ struct ceph_pagelist *pagelist);
+extern void osd_req_op_cls_request_data_pages(struct ceph_osd_request *,
+ unsigned int which,
+ struct page **pages, u64 length,
+ u32 alignment, bool pages_from_pool,
+ bool own_pages);
+void osd_req_op_cls_request_data_bvecs(struct ceph_osd_request *osd_req,
+ unsigned int which,
+ struct bio_vec *bvecs, u32 num_bvecs,
+ u32 bytes);
+extern void osd_req_op_cls_response_data_pages(struct ceph_osd_request *,
+ unsigned int which,
+ struct page **pages, u64 length,
+ u32 alignment, bool pages_from_pool,
+ bool own_pages);
+int osd_req_op_cls_init(struct ceph_osd_request *osd_req, unsigned int which,
+ const char *class, const char *method);
+extern int osd_req_op_xattr_init(struct ceph_osd_request *osd_req, unsigned int which,
+ u16 opcode, const char *name, const void *value,
+ size_t size, u8 cmp_op, u8 cmp_mode);
+extern void osd_req_op_alloc_hint_init(struct ceph_osd_request *osd_req,
+ unsigned int which,
+ u64 expected_object_size,
+ u64 expected_write_size,
+ u32 flags);
+extern int osd_req_op_copy_from_init(struct ceph_osd_request *req,
+ u64 src_snapid, u64 src_version,
+ struct ceph_object_id *src_oid,
+ struct ceph_object_locator *src_oloc,
+ u32 src_fadvise_flags,
+ u32 dst_fadvise_flags,
+ u32 truncate_seq, u64 truncate_size,
+ u8 copy_from_flags);
+
+extern struct ceph_osd_request *ceph_osdc_alloc_request(struct ceph_osd_client *osdc,
+ struct ceph_snap_context *snapc,
+ unsigned int num_ops,
+ bool use_mempool,
+ gfp_t gfp_flags);
+int ceph_osdc_alloc_messages(struct ceph_osd_request *req, gfp_t gfp);
+
+extern struct ceph_osd_request *ceph_osdc_new_request(struct ceph_osd_client *,
+ struct ceph_file_layout *layout,
+ struct ceph_vino vino,
+ u64 offset, u64 *len,
+ unsigned int which, int num_ops,
+ int opcode, int flags,
+ struct ceph_snap_context *snapc,
+ u32 truncate_seq, u64 truncate_size,
+ bool use_mempool);
+
+extern void ceph_osdc_get_request(struct ceph_osd_request *req);
+extern void ceph_osdc_put_request(struct ceph_osd_request *req);
+
+void ceph_osdc_start_request(struct ceph_osd_client *osdc,
+ struct ceph_osd_request *req);
+extern void ceph_osdc_cancel_request(struct ceph_osd_request *req);
+extern int ceph_osdc_wait_request(struct ceph_osd_client *osdc,
+ struct ceph_osd_request *req);
+extern void ceph_osdc_sync(struct ceph_osd_client *osdc);
+
+extern void ceph_osdc_flush_notifies(struct ceph_osd_client *osdc);
+void ceph_osdc_maybe_request_map(struct ceph_osd_client *osdc);
+
+int ceph_osdc_call(struct ceph_osd_client *osdc,
+ struct ceph_object_id *oid,
+ struct ceph_object_locator *oloc,
+ const char *class, const char *method,
+ unsigned int flags,
+ struct page *req_page, size_t req_len,
+ struct page **resp_pages, size_t *resp_len);
+
+/* watch/notify */
+struct ceph_osd_linger_request *
+ceph_osdc_watch(struct ceph_osd_client *osdc,
+ struct ceph_object_id *oid,
+ struct ceph_object_locator *oloc,
+ rados_watchcb2_t wcb,
+ rados_watcherrcb_t errcb,
+ void *data);
+int ceph_osdc_unwatch(struct ceph_osd_client *osdc,
+ struct ceph_osd_linger_request *lreq);
+
+int ceph_osdc_notify_ack(struct ceph_osd_client *osdc,
+ struct ceph_object_id *oid,
+ struct ceph_object_locator *oloc,
+ u64 notify_id,
+ u64 cookie,
+ void *payload,
+ u32 payload_len);
+int ceph_osdc_notify(struct ceph_osd_client *osdc,
+ struct ceph_object_id *oid,
+ struct ceph_object_locator *oloc,
+ void *payload,
+ u32 payload_len,
+ u32 timeout,
+ struct page ***preply_pages,
+ size_t *preply_len);
+int ceph_osdc_watch_check(struct ceph_osd_client *osdc,
+ struct ceph_osd_linger_request *lreq);
+int ceph_osdc_list_watchers(struct ceph_osd_client *osdc,
+ struct ceph_object_id *oid,
+ struct ceph_object_locator *oloc,
+ struct ceph_watch_item **watchers,
+ u32 *num_watchers);
+#endif
+
diff --git a/include/linux/ceph/osdmap.h b/include/linux/ceph/osdmap.h
new file mode 100644
index 000000000..5553019c3
--- /dev/null
+++ b/include/linux/ceph/osdmap.h
@@ -0,0 +1,339 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _FS_CEPH_OSDMAP_H
+#define _FS_CEPH_OSDMAP_H
+
+#include <linux/rbtree.h>
+#include <linux/ceph/types.h>
+#include <linux/ceph/decode.h>
+#include <linux/crush/crush.h>
+
+/*
+ * The osd map describes the current membership of the osd cluster and
+ * specifies the mapping of objects to placement groups and placement
+ * groups to (sets of) osds. That is, it completely specifies the
+ * (desired) distribution of all data objects in the system at some
+ * point in time.
+ *
+ * Each map version is identified by an epoch, which increases monotonically.
+ *
+ * The map can be updated either via an incremental map (diff) describing
+ * the change between two successive epochs, or as a fully encoded map.
+ */
+struct ceph_pg {
+ uint64_t pool;
+ uint32_t seed;
+};
+
+#define CEPH_SPG_NOSHARD -1
+
+struct ceph_spg {
+ struct ceph_pg pgid;
+ s8 shard;
+};
+
+int ceph_pg_compare(const struct ceph_pg *lhs, const struct ceph_pg *rhs);
+int ceph_spg_compare(const struct ceph_spg *lhs, const struct ceph_spg *rhs);
+
+#define CEPH_POOL_FLAG_HASHPSPOOL (1ULL << 0) /* hash pg seed and pool id
+ together */
+#define CEPH_POOL_FLAG_FULL (1ULL << 1) /* pool is full */
+#define CEPH_POOL_FLAG_FULL_QUOTA (1ULL << 10) /* pool ran out of quota,
+ will set FULL too */
+#define CEPH_POOL_FLAG_NEARFULL (1ULL << 11) /* pool is nearfull */
+
+struct ceph_pg_pool_info {
+ struct rb_node node;
+ s64 id;
+ u8 type; /* CEPH_POOL_TYPE_* */
+ u8 size;
+ u8 min_size;
+ u8 crush_ruleset;
+ u8 object_hash;
+ u32 last_force_request_resend;
+ u32 pg_num, pgp_num;
+ int pg_num_mask, pgp_num_mask;
+ s64 read_tier;
+ s64 write_tier; /* wins for read+write ops */
+ u64 flags; /* CEPH_POOL_FLAG_* */
+ char *name;
+
+ bool was_full; /* for handle_one_map() */
+};
+
+static inline bool ceph_can_shift_osds(struct ceph_pg_pool_info *pool)
+{
+ switch (pool->type) {
+ case CEPH_POOL_TYPE_REP:
+ return true;
+ case CEPH_POOL_TYPE_EC:
+ return false;
+ default:
+ BUG();
+ }
+}
+
+struct ceph_object_locator {
+ s64 pool;
+ struct ceph_string *pool_ns;
+};
+
+static inline void ceph_oloc_init(struct ceph_object_locator *oloc)
+{
+ oloc->pool = -1;
+ oloc->pool_ns = NULL;
+}
+
+static inline bool ceph_oloc_empty(const struct ceph_object_locator *oloc)
+{
+ return oloc->pool == -1;
+}
+
+void ceph_oloc_copy(struct ceph_object_locator *dest,
+ const struct ceph_object_locator *src);
+void ceph_oloc_destroy(struct ceph_object_locator *oloc);
+
+/*
+ * 51-char inline_name is long enough for all cephfs and all but one
+ * rbd requests: <imgname> in "<imgname>.rbd"/"rbd_id.<imgname>" can be
+ * arbitrarily long (~PAGE_SIZE). It's done once during rbd map; all
+ * other rbd requests fit into inline_name.
+ *
+ * Makes ceph_object_id 64 bytes on 64-bit.
+ */
+#define CEPH_OID_INLINE_LEN 52
+
+/*
+ * Both inline and external buffers have space for a NUL-terminator,
+ * which is carried around. It's not required though - RADOS object
+ * names don't have to be NUL-terminated and may contain NULs.
+ */
+struct ceph_object_id {
+ char *name;
+ char inline_name[CEPH_OID_INLINE_LEN];
+ int name_len;
+};
+
+#define __CEPH_OID_INITIALIZER(oid) { .name = (oid).inline_name }
+
+#define CEPH_DEFINE_OID_ONSTACK(oid) \
+ struct ceph_object_id oid = __CEPH_OID_INITIALIZER(oid)
+
+static inline void ceph_oid_init(struct ceph_object_id *oid)
+{
+ *oid = (struct ceph_object_id) __CEPH_OID_INITIALIZER(*oid);
+}
+
+static inline bool ceph_oid_empty(const struct ceph_object_id *oid)
+{
+ return oid->name == oid->inline_name && !oid->name_len;
+}
+
+void ceph_oid_copy(struct ceph_object_id *dest,
+ const struct ceph_object_id *src);
+__printf(2, 3)
+void ceph_oid_printf(struct ceph_object_id *oid, const char *fmt, ...);
+__printf(3, 4)
+int ceph_oid_aprintf(struct ceph_object_id *oid, gfp_t gfp,
+ const char *fmt, ...);
+void ceph_oid_destroy(struct ceph_object_id *oid);
+
+struct workspace_manager {
+ struct list_head idle_ws;
+ spinlock_t ws_lock;
+ /* Number of free workspaces */
+ int free_ws;
+ /* Total number of allocated workspaces */
+ atomic_t total_ws;
+ /* Waiters for a free workspace */
+ wait_queue_head_t ws_wait;
+};
+
+struct ceph_pg_mapping {
+ struct rb_node node;
+ struct ceph_pg pgid;
+
+ union {
+ struct {
+ int len;
+ int osds[];
+ } pg_temp, pg_upmap;
+ struct {
+ int osd;
+ } primary_temp;
+ struct {
+ int len;
+ int from_to[][2];
+ } pg_upmap_items;
+ };
+};
+
+struct ceph_osdmap {
+ struct ceph_fsid fsid;
+ u32 epoch;
+ struct ceph_timespec created, modified;
+
+ u32 flags; /* CEPH_OSDMAP_* */
+
+ u32 max_osd; /* size of osd_state, _offload, _addr arrays */
+ u32 *osd_state; /* CEPH_OSD_* */
+ u32 *osd_weight; /* 0 = failed, 0x10000 = 100% normal */
+ struct ceph_entity_addr *osd_addr;
+
+ struct rb_root pg_temp;
+ struct rb_root primary_temp;
+
+ /* remap (post-CRUSH, pre-up) */
+ struct rb_root pg_upmap; /* PG := raw set */
+ struct rb_root pg_upmap_items; /* from -> to within raw set */
+
+ u32 *osd_primary_affinity;
+
+ struct rb_root pg_pools;
+ u32 pool_max;
+
+ /* the CRUSH map specifies the mapping of placement groups to
+ * the list of osds that store+replicate them. */
+ struct crush_map *crush;
+
+ struct workspace_manager crush_wsm;
+};
+
+static inline bool ceph_osd_exists(struct ceph_osdmap *map, int osd)
+{
+ return osd >= 0 && osd < map->max_osd &&
+ (map->osd_state[osd] & CEPH_OSD_EXISTS);
+}
+
+static inline bool ceph_osd_is_up(struct ceph_osdmap *map, int osd)
+{
+ return ceph_osd_exists(map, osd) &&
+ (map->osd_state[osd] & CEPH_OSD_UP);
+}
+
+static inline bool ceph_osd_is_down(struct ceph_osdmap *map, int osd)
+{
+ return !ceph_osd_is_up(map, osd);
+}
+
+char *ceph_osdmap_state_str(char *str, int len, u32 state);
+extern u32 ceph_get_primary_affinity(struct ceph_osdmap *map, int osd);
+
+static inline struct ceph_entity_addr *ceph_osd_addr(struct ceph_osdmap *map,
+ int osd)
+{
+ if (osd >= map->max_osd)
+ return NULL;
+ return &map->osd_addr[osd];
+}
+
+#define CEPH_PGID_ENCODING_LEN (1 + 8 + 4 + 4)
+
+static inline int ceph_decode_pgid(void **p, void *end, struct ceph_pg *pgid)
+{
+ __u8 version;
+
+ if (!ceph_has_room(p, end, CEPH_PGID_ENCODING_LEN)) {
+ pr_warn("incomplete pg encoding\n");
+ return -EINVAL;
+ }
+ version = ceph_decode_8(p);
+ if (version > 1) {
+ pr_warn("do not understand pg encoding %d > 1\n",
+ (int)version);
+ return -EINVAL;
+ }
+
+ pgid->pool = ceph_decode_64(p);
+ pgid->seed = ceph_decode_32(p);
+ *p += 4; /* skip deprecated preferred value */
+
+ return 0;
+}
+
+struct ceph_osdmap *ceph_osdmap_alloc(void);
+struct ceph_osdmap *ceph_osdmap_decode(void **p, void *end, bool msgr2);
+struct ceph_osdmap *osdmap_apply_incremental(void **p, void *end, bool msgr2,
+ struct ceph_osdmap *map);
+extern void ceph_osdmap_destroy(struct ceph_osdmap *map);
+
+struct ceph_osds {
+ int osds[CEPH_PG_MAX_SIZE];
+ int size;
+ int primary; /* id, NOT index */
+};
+
+static inline void ceph_osds_init(struct ceph_osds *set)
+{
+ set->size = 0;
+ set->primary = -1;
+}
+
+void ceph_osds_copy(struct ceph_osds *dest, const struct ceph_osds *src);
+
+bool ceph_pg_is_split(const struct ceph_pg *pgid, u32 old_pg_num,
+ u32 new_pg_num);
+bool ceph_is_new_interval(const struct ceph_osds *old_acting,
+ const struct ceph_osds *new_acting,
+ const struct ceph_osds *old_up,
+ const struct ceph_osds *new_up,
+ int old_size,
+ int new_size,
+ int old_min_size,
+ int new_min_size,
+ u32 old_pg_num,
+ u32 new_pg_num,
+ bool old_sort_bitwise,
+ bool new_sort_bitwise,
+ bool old_recovery_deletes,
+ bool new_recovery_deletes,
+ const struct ceph_pg *pgid);
+bool ceph_osds_changed(const struct ceph_osds *old_acting,
+ const struct ceph_osds *new_acting,
+ bool any_change);
+
+void __ceph_object_locator_to_pg(struct ceph_pg_pool_info *pi,
+ const struct ceph_object_id *oid,
+ const struct ceph_object_locator *oloc,
+ struct ceph_pg *raw_pgid);
+int ceph_object_locator_to_pg(struct ceph_osdmap *osdmap,
+ const struct ceph_object_id *oid,
+ const struct ceph_object_locator *oloc,
+ struct ceph_pg *raw_pgid);
+
+void ceph_pg_to_up_acting_osds(struct ceph_osdmap *osdmap,
+ struct ceph_pg_pool_info *pi,
+ const struct ceph_pg *raw_pgid,
+ struct ceph_osds *up,
+ struct ceph_osds *acting);
+bool ceph_pg_to_primary_shard(struct ceph_osdmap *osdmap,
+ struct ceph_pg_pool_info *pi,
+ const struct ceph_pg *raw_pgid,
+ struct ceph_spg *spgid);
+int ceph_pg_to_acting_primary(struct ceph_osdmap *osdmap,
+ const struct ceph_pg *raw_pgid);
+
+struct crush_loc {
+ char *cl_type_name;
+ char *cl_name;
+};
+
+struct crush_loc_node {
+ struct rb_node cl_node;
+ struct crush_loc cl_loc; /* pointers into cl_data */
+ char cl_data[];
+};
+
+int ceph_parse_crush_location(char *crush_location, struct rb_root *locs);
+int ceph_compare_crush_locs(struct rb_root *locs1, struct rb_root *locs2);
+void ceph_clear_crush_locs(struct rb_root *locs);
+
+int ceph_get_crush_locality(struct ceph_osdmap *osdmap, int id,
+ struct rb_root *locs);
+
+extern struct ceph_pg_pool_info *ceph_pg_pool_by_id(struct ceph_osdmap *map,
+ u64 id);
+extern const char *ceph_pg_pool_name_by_id(struct ceph_osdmap *map, u64 id);
+extern int ceph_pg_poolid_by_name(struct ceph_osdmap *map, const char *name);
+u64 ceph_pg_pool_flags(struct ceph_osdmap *map, u64 id);
+
+#endif
diff --git a/include/linux/ceph/pagelist.h b/include/linux/ceph/pagelist.h
new file mode 100644
index 000000000..5dead8486
--- /dev/null
+++ b/include/linux/ceph/pagelist.h
@@ -0,0 +1,72 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef __FS_CEPH_PAGELIST_H
+#define __FS_CEPH_PAGELIST_H
+
+#include <asm/byteorder.h>
+#include <linux/refcount.h>
+#include <linux/list.h>
+#include <linux/types.h>
+
+struct ceph_pagelist {
+ struct list_head head;
+ void *mapped_tail;
+ size_t length;
+ size_t room;
+ struct list_head free_list;
+ size_t num_pages_free;
+ refcount_t refcnt;
+};
+
+struct ceph_pagelist_cursor {
+ struct ceph_pagelist *pl; /* pagelist, for error checking */
+ struct list_head *page_lru; /* page in list */
+ size_t room; /* room remaining to reset to */
+};
+
+struct ceph_pagelist *ceph_pagelist_alloc(gfp_t gfp_flags);
+
+extern void ceph_pagelist_release(struct ceph_pagelist *pl);
+
+extern int ceph_pagelist_append(struct ceph_pagelist *pl, const void *d, size_t l);
+
+extern int ceph_pagelist_reserve(struct ceph_pagelist *pl, size_t space);
+
+extern int ceph_pagelist_free_reserve(struct ceph_pagelist *pl);
+
+extern void ceph_pagelist_set_cursor(struct ceph_pagelist *pl,
+ struct ceph_pagelist_cursor *c);
+
+extern int ceph_pagelist_truncate(struct ceph_pagelist *pl,
+ struct ceph_pagelist_cursor *c);
+
+static inline int ceph_pagelist_encode_64(struct ceph_pagelist *pl, u64 v)
+{
+ __le64 ev = cpu_to_le64(v);
+ return ceph_pagelist_append(pl, &ev, sizeof(ev));
+}
+static inline int ceph_pagelist_encode_32(struct ceph_pagelist *pl, u32 v)
+{
+ __le32 ev = cpu_to_le32(v);
+ return ceph_pagelist_append(pl, &ev, sizeof(ev));
+}
+static inline int ceph_pagelist_encode_16(struct ceph_pagelist *pl, u16 v)
+{
+ __le16 ev = cpu_to_le16(v);
+ return ceph_pagelist_append(pl, &ev, sizeof(ev));
+}
+static inline int ceph_pagelist_encode_8(struct ceph_pagelist *pl, u8 v)
+{
+ return ceph_pagelist_append(pl, &v, 1);
+}
+static inline int ceph_pagelist_encode_string(struct ceph_pagelist *pl,
+ char *s, u32 len)
+{
+ int ret = ceph_pagelist_encode_32(pl, len);
+ if (ret)
+ return ret;
+ if (len)
+ return ceph_pagelist_append(pl, s, len);
+ return 0;
+}
+
+#endif
diff --git a/include/linux/ceph/rados.h b/include/linux/ceph/rados.h
new file mode 100644
index 000000000..43a7a1573
--- /dev/null
+++ b/include/linux/ceph/rados.h
@@ -0,0 +1,551 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef CEPH_RADOS_H
+#define CEPH_RADOS_H
+
+/*
+ * Data types for the Ceph distributed object storage layer RADOS
+ * (Reliable Autonomic Distributed Object Store).
+ */
+
+#include <linux/ceph/msgr.h>
+
+/*
+ * fs id
+ */
+struct ceph_fsid {
+ unsigned char fsid[16];
+};
+
+static inline int ceph_fsid_compare(const struct ceph_fsid *a,
+ const struct ceph_fsid *b)
+{
+ return memcmp(a, b, sizeof(*a));
+}
+
+/*
+ * ino, object, etc.
+ */
+typedef __le64 ceph_snapid_t;
+#define CEPH_SNAPDIR ((__u64)(-1)) /* reserved for hidden .snap dir */
+#define CEPH_NOSNAP ((__u64)(-2)) /* "head", "live" revision */
+#define CEPH_MAXSNAP ((__u64)(-3)) /* largest valid snapid */
+
+struct ceph_timespec {
+ __le32 tv_sec;
+ __le32 tv_nsec;
+} __attribute__ ((packed));
+
+
+/*
+ * object layout - how objects are mapped into PGs
+ */
+#define CEPH_OBJECT_LAYOUT_HASH 1
+#define CEPH_OBJECT_LAYOUT_LINEAR 2
+#define CEPH_OBJECT_LAYOUT_HASHINO 3
+
+/*
+ * pg layout -- how PGs are mapped onto (sets of) OSDs
+ */
+#define CEPH_PG_LAYOUT_CRUSH 0
+#define CEPH_PG_LAYOUT_HASH 1
+#define CEPH_PG_LAYOUT_LINEAR 2
+#define CEPH_PG_LAYOUT_HYBRID 3
+
+#define CEPH_PG_MAX_SIZE 32 /* max # osds in a single pg */
+
+/*
+ * placement group.
+ * we encode this into one __le64.
+ */
+struct ceph_pg_v1 {
+ __le16 preferred; /* preferred primary osd */
+ __le16 ps; /* placement seed */
+ __le32 pool; /* object pool */
+} __attribute__ ((packed));
+
+/*
+ * pg_pool is a set of pgs storing a pool of objects
+ *
+ * pg_num -- base number of pseudorandomly placed pgs
+ *
+ * pgp_num -- effective number when calculating pg placement. this
+ * is used for pg_num increases. new pgs result in data being "split"
+ * into new pgs. for this to proceed smoothly, new pgs are intiially
+ * colocated with their parents; that is, pgp_num doesn't increase
+ * until the new pgs have successfully split. only _then_ are the new
+ * pgs placed independently.
+ *
+ * lpg_num -- localized pg count (per device). replicas are randomly
+ * selected.
+ *
+ * lpgp_num -- as above.
+ */
+#define CEPH_NOPOOL ((__u64) (-1)) /* pool id not defined */
+
+#define CEPH_POOL_TYPE_REP 1
+#define CEPH_POOL_TYPE_RAID4 2 /* never implemented */
+#define CEPH_POOL_TYPE_EC 3
+
+/*
+ * stable_mod func is used to control number of placement groups.
+ * similar to straight-up modulo, but produces a stable mapping as b
+ * increases over time. b is the number of bins, and bmask is the
+ * containing power of 2 minus 1.
+ *
+ * b <= bmask and bmask=(2**n)-1
+ * e.g., b=12 -> bmask=15, b=123 -> bmask=127
+ */
+static inline int ceph_stable_mod(int x, int b, int bmask)
+{
+ if ((x & bmask) < b)
+ return x & bmask;
+ else
+ return x & (bmask >> 1);
+}
+
+/*
+ * object layout - how a given object should be stored.
+ */
+struct ceph_object_layout {
+ struct ceph_pg_v1 ol_pgid; /* raw pg, with _full_ ps precision. */
+ __le32 ol_stripe_unit; /* for per-object parity, if any */
+} __attribute__ ((packed));
+
+/*
+ * compound epoch+version, used by storage layer to serialize mutations
+ */
+struct ceph_eversion {
+ __le64 version;
+ __le32 epoch;
+} __attribute__ ((packed));
+
+/*
+ * osd map bits
+ */
+
+/* status bits */
+#define CEPH_OSD_EXISTS (1<<0)
+#define CEPH_OSD_UP (1<<1)
+#define CEPH_OSD_AUTOOUT (1<<2) /* osd was automatically marked out */
+#define CEPH_OSD_NEW (1<<3) /* osd is new, never marked in */
+
+extern const char *ceph_osd_state_name(int s);
+
+/* osd weights. fixed point value: 0x10000 == 1.0 ("in"), 0 == "out" */
+#define CEPH_OSD_IN 0x10000
+#define CEPH_OSD_OUT 0
+
+/* osd primary-affinity. fixed point value: 0x10000 == baseline */
+#define CEPH_OSD_MAX_PRIMARY_AFFINITY 0x10000
+#define CEPH_OSD_DEFAULT_PRIMARY_AFFINITY 0x10000
+
+
+/*
+ * osd map flag bits
+ */
+#define CEPH_OSDMAP_NEARFULL (1<<0) /* sync writes (near ENOSPC),
+ not set since ~luminous */
+#define CEPH_OSDMAP_FULL (1<<1) /* no data writes (ENOSPC),
+ not set since ~luminous */
+#define CEPH_OSDMAP_PAUSERD (1<<2) /* pause all reads */
+#define CEPH_OSDMAP_PAUSEWR (1<<3) /* pause all writes */
+#define CEPH_OSDMAP_PAUSEREC (1<<4) /* pause recovery */
+#define CEPH_OSDMAP_NOUP (1<<5) /* block osd boot */
+#define CEPH_OSDMAP_NODOWN (1<<6) /* block osd mark-down/failure */
+#define CEPH_OSDMAP_NOOUT (1<<7) /* block osd auto mark-out */
+#define CEPH_OSDMAP_NOIN (1<<8) /* block osd auto mark-in */
+#define CEPH_OSDMAP_NOBACKFILL (1<<9) /* block osd backfill */
+#define CEPH_OSDMAP_NORECOVER (1<<10) /* block osd recovery and backfill */
+#define CEPH_OSDMAP_NOSCRUB (1<<11) /* block periodic scrub */
+#define CEPH_OSDMAP_NODEEP_SCRUB (1<<12) /* block periodic deep-scrub */
+#define CEPH_OSDMAP_NOTIERAGENT (1<<13) /* disable tiering agent */
+#define CEPH_OSDMAP_NOREBALANCE (1<<14) /* block osd backfill unless pg is degraded */
+#define CEPH_OSDMAP_SORTBITWISE (1<<15) /* use bitwise hobject_t sort */
+#define CEPH_OSDMAP_REQUIRE_JEWEL (1<<16) /* require jewel for booting osds */
+#define CEPH_OSDMAP_REQUIRE_KRAKEN (1<<17) /* require kraken for booting osds */
+#define CEPH_OSDMAP_REQUIRE_LUMINOUS (1<<18) /* require l for booting osds */
+#define CEPH_OSDMAP_RECOVERY_DELETES (1<<19) /* deletes performed during recovery instead of peering */
+
+/*
+ * The error code to return when an OSD can't handle a write
+ * because it is too large.
+ */
+#define OSD_WRITETOOBIG EMSGSIZE
+
+/*
+ * osd ops
+ *
+ * WARNING: do not use these op codes directly. Use the helpers
+ * defined below instead. In certain cases, op code behavior was
+ * redefined, resulting in special-cases in the helpers.
+ */
+#define CEPH_OSD_OP_MODE 0xf000
+#define CEPH_OSD_OP_MODE_RD 0x1000
+#define CEPH_OSD_OP_MODE_WR 0x2000
+#define CEPH_OSD_OP_MODE_RMW 0x3000
+#define CEPH_OSD_OP_MODE_SUB 0x4000
+#define CEPH_OSD_OP_MODE_CACHE 0x8000
+
+#define CEPH_OSD_OP_TYPE 0x0f00
+#define CEPH_OSD_OP_TYPE_LOCK 0x0100
+#define CEPH_OSD_OP_TYPE_DATA 0x0200
+#define CEPH_OSD_OP_TYPE_ATTR 0x0300
+#define CEPH_OSD_OP_TYPE_EXEC 0x0400
+#define CEPH_OSD_OP_TYPE_PG 0x0500
+#define CEPH_OSD_OP_TYPE_MULTI 0x0600 /* multiobject */
+
+#define __CEPH_OSD_OP1(mode, nr) \
+ (CEPH_OSD_OP_MODE_##mode | (nr))
+
+#define __CEPH_OSD_OP(mode, type, nr) \
+ (CEPH_OSD_OP_MODE_##mode | CEPH_OSD_OP_TYPE_##type | (nr))
+
+#define __CEPH_FORALL_OSD_OPS(f) \
+ /** data **/ \
+ /* read */ \
+ f(READ, __CEPH_OSD_OP(RD, DATA, 1), "read") \
+ f(STAT, __CEPH_OSD_OP(RD, DATA, 2), "stat") \
+ f(MAPEXT, __CEPH_OSD_OP(RD, DATA, 3), "mapext") \
+ \
+ /* fancy read */ \
+ f(MASKTRUNC, __CEPH_OSD_OP(RD, DATA, 4), "masktrunc") \
+ f(SPARSE_READ, __CEPH_OSD_OP(RD, DATA, 5), "sparse-read") \
+ \
+ f(NOTIFY, __CEPH_OSD_OP(RD, DATA, 6), "notify") \
+ f(NOTIFY_ACK, __CEPH_OSD_OP(RD, DATA, 7), "notify-ack") \
+ \
+ /* versioning */ \
+ f(ASSERT_VER, __CEPH_OSD_OP(RD, DATA, 8), "assert-version") \
+ \
+ f(LIST_WATCHERS, __CEPH_OSD_OP(RD, DATA, 9), "list-watchers") \
+ \
+ f(LIST_SNAPS, __CEPH_OSD_OP(RD, DATA, 10), "list-snaps") \
+ \
+ /* sync */ \
+ f(SYNC_READ, __CEPH_OSD_OP(RD, DATA, 11), "sync_read") \
+ \
+ /* write */ \
+ f(WRITE, __CEPH_OSD_OP(WR, DATA, 1), "write") \
+ f(WRITEFULL, __CEPH_OSD_OP(WR, DATA, 2), "writefull") \
+ f(TRUNCATE, __CEPH_OSD_OP(WR, DATA, 3), "truncate") \
+ f(ZERO, __CEPH_OSD_OP(WR, DATA, 4), "zero") \
+ f(DELETE, __CEPH_OSD_OP(WR, DATA, 5), "delete") \
+ \
+ /* fancy write */ \
+ f(APPEND, __CEPH_OSD_OP(WR, DATA, 6), "append") \
+ f(SETTRUNC, __CEPH_OSD_OP(WR, DATA, 8), "settrunc") \
+ f(TRIMTRUNC, __CEPH_OSD_OP(WR, DATA, 9), "trimtrunc") \
+ \
+ f(TMAPUP, __CEPH_OSD_OP(RMW, DATA, 10), "tmapup") \
+ f(TMAPPUT, __CEPH_OSD_OP(WR, DATA, 11), "tmapput") \
+ f(TMAPGET, __CEPH_OSD_OP(RD, DATA, 12), "tmapget") \
+ \
+ f(CREATE, __CEPH_OSD_OP(WR, DATA, 13), "create") \
+ f(ROLLBACK, __CEPH_OSD_OP(WR, DATA, 14), "rollback") \
+ \
+ f(WATCH, __CEPH_OSD_OP(WR, DATA, 15), "watch") \
+ \
+ /* omap */ \
+ f(OMAPGETKEYS, __CEPH_OSD_OP(RD, DATA, 17), "omap-get-keys") \
+ f(OMAPGETVALS, __CEPH_OSD_OP(RD, DATA, 18), "omap-get-vals") \
+ f(OMAPGETHEADER, __CEPH_OSD_OP(RD, DATA, 19), "omap-get-header") \
+ f(OMAPGETVALSBYKEYS, __CEPH_OSD_OP(RD, DATA, 20), "omap-get-vals-by-keys") \
+ f(OMAPSETVALS, __CEPH_OSD_OP(WR, DATA, 21), "omap-set-vals") \
+ f(OMAPSETHEADER, __CEPH_OSD_OP(WR, DATA, 22), "omap-set-header") \
+ f(OMAPCLEAR, __CEPH_OSD_OP(WR, DATA, 23), "omap-clear") \
+ f(OMAPRMKEYS, __CEPH_OSD_OP(WR, DATA, 24), "omap-rm-keys") \
+ f(OMAP_CMP, __CEPH_OSD_OP(RD, DATA, 25), "omap-cmp") \
+ \
+ /* tiering */ \
+ f(COPY_FROM, __CEPH_OSD_OP(WR, DATA, 26), "copy-from") \
+ f(COPY_FROM2, __CEPH_OSD_OP(WR, DATA, 45), "copy-from2") \
+ f(COPY_GET_CLASSIC, __CEPH_OSD_OP(RD, DATA, 27), "copy-get-classic") \
+ f(UNDIRTY, __CEPH_OSD_OP(WR, DATA, 28), "undirty") \
+ f(ISDIRTY, __CEPH_OSD_OP(RD, DATA, 29), "isdirty") \
+ f(COPY_GET, __CEPH_OSD_OP(RD, DATA, 30), "copy-get") \
+ f(CACHE_FLUSH, __CEPH_OSD_OP(CACHE, DATA, 31), "cache-flush") \
+ f(CACHE_EVICT, __CEPH_OSD_OP(CACHE, DATA, 32), "cache-evict") \
+ f(CACHE_TRY_FLUSH, __CEPH_OSD_OP(CACHE, DATA, 33), "cache-try-flush") \
+ \
+ /* convert tmap to omap */ \
+ f(TMAP2OMAP, __CEPH_OSD_OP(RMW, DATA, 34), "tmap2omap") \
+ \
+ /* hints */ \
+ f(SETALLOCHINT, __CEPH_OSD_OP(WR, DATA, 35), "set-alloc-hint") \
+ \
+ /** multi **/ \
+ f(CLONERANGE, __CEPH_OSD_OP(WR, MULTI, 1), "clonerange") \
+ f(ASSERT_SRC_VERSION, __CEPH_OSD_OP(RD, MULTI, 2), "assert-src-version") \
+ f(SRC_CMPXATTR, __CEPH_OSD_OP(RD, MULTI, 3), "src-cmpxattr") \
+ \
+ /** attrs **/ \
+ /* read */ \
+ f(GETXATTR, __CEPH_OSD_OP(RD, ATTR, 1), "getxattr") \
+ f(GETXATTRS, __CEPH_OSD_OP(RD, ATTR, 2), "getxattrs") \
+ f(CMPXATTR, __CEPH_OSD_OP(RD, ATTR, 3), "cmpxattr") \
+ \
+ /* write */ \
+ f(SETXATTR, __CEPH_OSD_OP(WR, ATTR, 1), "setxattr") \
+ f(SETXATTRS, __CEPH_OSD_OP(WR, ATTR, 2), "setxattrs") \
+ f(RESETXATTRS, __CEPH_OSD_OP(WR, ATTR, 3), "resetxattrs") \
+ f(RMXATTR, __CEPH_OSD_OP(WR, ATTR, 4), "rmxattr") \
+ \
+ /** subop **/ \
+ f(PULL, __CEPH_OSD_OP1(SUB, 1), "pull") \
+ f(PUSH, __CEPH_OSD_OP1(SUB, 2), "push") \
+ f(BALANCEREADS, __CEPH_OSD_OP1(SUB, 3), "balance-reads") \
+ f(UNBALANCEREADS, __CEPH_OSD_OP1(SUB, 4), "unbalance-reads") \
+ f(SCRUB, __CEPH_OSD_OP1(SUB, 5), "scrub") \
+ f(SCRUB_RESERVE, __CEPH_OSD_OP1(SUB, 6), "scrub-reserve") \
+ f(SCRUB_UNRESERVE, __CEPH_OSD_OP1(SUB, 7), "scrub-unreserve") \
+ f(SCRUB_STOP, __CEPH_OSD_OP1(SUB, 8), "scrub-stop") \
+ f(SCRUB_MAP, __CEPH_OSD_OP1(SUB, 9), "scrub-map") \
+ \
+ /** lock **/ \
+ f(WRLOCK, __CEPH_OSD_OP(WR, LOCK, 1), "wrlock") \
+ f(WRUNLOCK, __CEPH_OSD_OP(WR, LOCK, 2), "wrunlock") \
+ f(RDLOCK, __CEPH_OSD_OP(WR, LOCK, 3), "rdlock") \
+ f(RDUNLOCK, __CEPH_OSD_OP(WR, LOCK, 4), "rdunlock") \
+ f(UPLOCK, __CEPH_OSD_OP(WR, LOCK, 5), "uplock") \
+ f(DNLOCK, __CEPH_OSD_OP(WR, LOCK, 6), "dnlock") \
+ \
+ /** exec **/ \
+ /* note: the RD bit here is wrong; see special-case below in helper */ \
+ f(CALL, __CEPH_OSD_OP(RD, EXEC, 1), "call") \
+ \
+ /** pg **/ \
+ f(PGLS, __CEPH_OSD_OP(RD, PG, 1), "pgls") \
+ f(PGLS_FILTER, __CEPH_OSD_OP(RD, PG, 2), "pgls-filter") \
+ f(PG_HITSET_LS, __CEPH_OSD_OP(RD, PG, 3), "pg-hitset-ls") \
+ f(PG_HITSET_GET, __CEPH_OSD_OP(RD, PG, 4), "pg-hitset-get")
+
+enum {
+#define GENERATE_ENUM_ENTRY(op, opcode, str) CEPH_OSD_OP_##op = (opcode),
+__CEPH_FORALL_OSD_OPS(GENERATE_ENUM_ENTRY)
+#undef GENERATE_ENUM_ENTRY
+};
+
+static inline int ceph_osd_op_type_lock(int op)
+{
+ return (op & CEPH_OSD_OP_TYPE) == CEPH_OSD_OP_TYPE_LOCK;
+}
+static inline int ceph_osd_op_type_data(int op)
+{
+ return (op & CEPH_OSD_OP_TYPE) == CEPH_OSD_OP_TYPE_DATA;
+}
+static inline int ceph_osd_op_type_attr(int op)
+{
+ return (op & CEPH_OSD_OP_TYPE) == CEPH_OSD_OP_TYPE_ATTR;
+}
+static inline int ceph_osd_op_type_exec(int op)
+{
+ return (op & CEPH_OSD_OP_TYPE) == CEPH_OSD_OP_TYPE_EXEC;
+}
+static inline int ceph_osd_op_type_pg(int op)
+{
+ return (op & CEPH_OSD_OP_TYPE) == CEPH_OSD_OP_TYPE_PG;
+}
+static inline int ceph_osd_op_type_multi(int op)
+{
+ return (op & CEPH_OSD_OP_TYPE) == CEPH_OSD_OP_TYPE_MULTI;
+}
+
+static inline int ceph_osd_op_mode_subop(int op)
+{
+ return (op & CEPH_OSD_OP_MODE) == CEPH_OSD_OP_MODE_SUB;
+}
+static inline int ceph_osd_op_mode_read(int op)
+{
+ return (op & CEPH_OSD_OP_MODE_RD) &&
+ op != CEPH_OSD_OP_CALL;
+}
+static inline int ceph_osd_op_mode_modify(int op)
+{
+ return op & CEPH_OSD_OP_MODE_WR;
+}
+
+/*
+ * note that the following tmap stuff is also defined in the ceph librados.h
+ * any modification here needs to be updated there
+ */
+#define CEPH_OSD_TMAP_HDR 'h'
+#define CEPH_OSD_TMAP_SET 's'
+#define CEPH_OSD_TMAP_CREATE 'c' /* create key */
+#define CEPH_OSD_TMAP_RM 'r'
+#define CEPH_OSD_TMAP_RMSLOPPY 'R'
+
+extern const char *ceph_osd_op_name(int op);
+
+/*
+ * osd op flags
+ *
+ * An op may be READ, WRITE, or READ|WRITE.
+ */
+enum {
+ CEPH_OSD_FLAG_ACK = 0x0001, /* want (or is) "ack" ack */
+ CEPH_OSD_FLAG_ONNVRAM = 0x0002, /* want (or is) "onnvram" ack */
+ CEPH_OSD_FLAG_ONDISK = 0x0004, /* want (or is) "ondisk" ack */
+ CEPH_OSD_FLAG_RETRY = 0x0008, /* resend attempt */
+ CEPH_OSD_FLAG_READ = 0x0010, /* op may read */
+ CEPH_OSD_FLAG_WRITE = 0x0020, /* op may write */
+ CEPH_OSD_FLAG_ORDERSNAP = 0x0040, /* EOLDSNAP if snapc is out of order */
+ CEPH_OSD_FLAG_PEERSTAT_OLD = 0x0080, /* DEPRECATED msg includes osd_peer_stat */
+ CEPH_OSD_FLAG_BALANCE_READS = 0x0100,
+ CEPH_OSD_FLAG_PARALLELEXEC = 0x0200, /* execute op in parallel */
+ CEPH_OSD_FLAG_PGOP = 0x0400, /* pg op, no object */
+ CEPH_OSD_FLAG_EXEC = 0x0800, /* op may exec */
+ CEPH_OSD_FLAG_EXEC_PUBLIC = 0x1000, /* DEPRECATED op may exec (public) */
+ CEPH_OSD_FLAG_LOCALIZE_READS = 0x2000, /* read from nearby replica, if any */
+ CEPH_OSD_FLAG_RWORDERED = 0x4000, /* order wrt concurrent reads */
+ CEPH_OSD_FLAG_IGNORE_CACHE = 0x8000, /* ignore cache logic */
+ CEPH_OSD_FLAG_SKIPRWLOCKS = 0x10000, /* skip rw locks */
+ CEPH_OSD_FLAG_IGNORE_OVERLAY = 0x20000, /* ignore pool overlay */
+ CEPH_OSD_FLAG_FLUSH = 0x40000, /* this is part of flush */
+ CEPH_OSD_FLAG_MAP_SNAP_CLONE = 0x80000, /* map snap direct to clone id */
+ CEPH_OSD_FLAG_ENFORCE_SNAPC = 0x100000, /* use snapc provided even if
+ pool uses pool snaps */
+ CEPH_OSD_FLAG_REDIRECTED = 0x200000, /* op has been redirected */
+ CEPH_OSD_FLAG_KNOWN_REDIR = 0x400000, /* redirect bit is authoritative */
+ CEPH_OSD_FLAG_FULL_TRY = 0x800000, /* try op despite full flag */
+ CEPH_OSD_FLAG_FULL_FORCE = 0x1000000, /* force op despite full flag */
+};
+
+enum {
+ CEPH_OSD_OP_FLAG_EXCL = 1, /* EXCL object create */
+ CEPH_OSD_OP_FLAG_FAILOK = 2, /* continue despite failure */
+ CEPH_OSD_OP_FLAG_FADVISE_RANDOM = 0x4, /* the op is random */
+ CEPH_OSD_OP_FLAG_FADVISE_SEQUENTIAL = 0x8, /* the op is sequential */
+ CEPH_OSD_OP_FLAG_FADVISE_WILLNEED = 0x10,/* data will be accessed in
+ the near future */
+ CEPH_OSD_OP_FLAG_FADVISE_DONTNEED = 0x20,/* data will not be accessed
+ in the near future */
+ CEPH_OSD_OP_FLAG_FADVISE_NOCACHE = 0x40,/* data will be accessed only
+ once by this client */
+};
+
+#define EOLDSNAPC ERESTART /* ORDERSNAP flag set; writer has old snapc*/
+#define EBLOCKLISTED ESHUTDOWN /* blocklisted */
+
+/* xattr comparison */
+enum {
+ CEPH_OSD_CMPXATTR_OP_NOP = 0,
+ CEPH_OSD_CMPXATTR_OP_EQ = 1,
+ CEPH_OSD_CMPXATTR_OP_NE = 2,
+ CEPH_OSD_CMPXATTR_OP_GT = 3,
+ CEPH_OSD_CMPXATTR_OP_GTE = 4,
+ CEPH_OSD_CMPXATTR_OP_LT = 5,
+ CEPH_OSD_CMPXATTR_OP_LTE = 6
+};
+
+enum {
+ CEPH_OSD_CMPXATTR_MODE_STRING = 1,
+ CEPH_OSD_CMPXATTR_MODE_U64 = 2
+};
+
+enum {
+ CEPH_OSD_COPY_FROM_FLAG_FLUSH = 1, /* part of a flush operation */
+ CEPH_OSD_COPY_FROM_FLAG_IGNORE_OVERLAY = 2, /* ignore pool overlay */
+ CEPH_OSD_COPY_FROM_FLAG_IGNORE_CACHE = 4, /* ignore osd cache logic */
+ CEPH_OSD_COPY_FROM_FLAG_MAP_SNAP_CLONE = 8, /* map snap direct to
+ * cloneid */
+ CEPH_OSD_COPY_FROM_FLAG_RWORDERED = 16, /* order with write */
+ CEPH_OSD_COPY_FROM_FLAG_TRUNCATE_SEQ = 32, /* send truncate_{seq,size} */
+};
+
+enum {
+ CEPH_OSD_WATCH_OP_UNWATCH = 0,
+ CEPH_OSD_WATCH_OP_LEGACY_WATCH = 1,
+ /* note: use only ODD ids to prevent pre-giant code from
+ interpreting the op as UNWATCH */
+ CEPH_OSD_WATCH_OP_WATCH = 3,
+ CEPH_OSD_WATCH_OP_RECONNECT = 5,
+ CEPH_OSD_WATCH_OP_PING = 7,
+};
+
+const char *ceph_osd_watch_op_name(int o);
+
+enum {
+ CEPH_OSD_ALLOC_HINT_FLAG_SEQUENTIAL_WRITE = 1,
+ CEPH_OSD_ALLOC_HINT_FLAG_RANDOM_WRITE = 2,
+ CEPH_OSD_ALLOC_HINT_FLAG_SEQUENTIAL_READ = 4,
+ CEPH_OSD_ALLOC_HINT_FLAG_RANDOM_READ = 8,
+ CEPH_OSD_ALLOC_HINT_FLAG_APPEND_ONLY = 16,
+ CEPH_OSD_ALLOC_HINT_FLAG_IMMUTABLE = 32,
+ CEPH_OSD_ALLOC_HINT_FLAG_SHORTLIVED = 64,
+ CEPH_OSD_ALLOC_HINT_FLAG_LONGLIVED = 128,
+ CEPH_OSD_ALLOC_HINT_FLAG_COMPRESSIBLE = 256,
+ CEPH_OSD_ALLOC_HINT_FLAG_INCOMPRESSIBLE = 512,
+};
+
+enum {
+ CEPH_OSD_BACKOFF_OP_BLOCK = 1,
+ CEPH_OSD_BACKOFF_OP_ACK_BLOCK = 2,
+ CEPH_OSD_BACKOFF_OP_UNBLOCK = 3,
+};
+
+/*
+ * an individual object operation. each may be accompanied by some data
+ * payload
+ */
+struct ceph_osd_op {
+ __le16 op; /* CEPH_OSD_OP_* */
+ __le32 flags; /* CEPH_OSD_OP_FLAG_* */
+ union {
+ struct {
+ __le64 offset, length;
+ __le64 truncate_size;
+ __le32 truncate_seq;
+ } __attribute__ ((packed)) extent;
+ struct {
+ __le32 name_len;
+ __le32 value_len;
+ __u8 cmp_op; /* CEPH_OSD_CMPXATTR_OP_* */
+ __u8 cmp_mode; /* CEPH_OSD_CMPXATTR_MODE_* */
+ } __attribute__ ((packed)) xattr;
+ struct {
+ __u8 class_len;
+ __u8 method_len;
+ __u8 argc;
+ __le32 indata_len;
+ } __attribute__ ((packed)) cls;
+ struct {
+ __le64 cookie, count;
+ } __attribute__ ((packed)) pgls;
+ struct {
+ __le64 snapid;
+ } __attribute__ ((packed)) snap;
+ struct {
+ __le64 cookie;
+ __le64 ver; /* no longer used */
+ __u8 op; /* CEPH_OSD_WATCH_OP_* */
+ __le32 gen; /* registration generation */
+ } __attribute__ ((packed)) watch;
+ struct {
+ __le64 cookie;
+ } __attribute__ ((packed)) notify;
+ struct {
+ __le64 offset, length;
+ __le64 src_offset;
+ } __attribute__ ((packed)) clonerange;
+ struct {
+ __le64 expected_object_size;
+ __le64 expected_write_size;
+ __le32 flags; /* CEPH_OSD_OP_ALLOC_HINT_FLAG_* */
+ } __attribute__ ((packed)) alloc_hint;
+ struct {
+ __le64 snapid;
+ __le64 src_version;
+ __u8 flags; /* CEPH_OSD_COPY_FROM_FLAG_* */
+ /*
+ * CEPH_OSD_OP_FLAG_FADVISE_*: fadvise flags
+ * for src object, flags for dest object are in
+ * ceph_osd_op::flags.
+ */
+ __le32 src_fadvise_flags;
+ } __attribute__ ((packed)) copy_from;
+ };
+ __le32 payload_len;
+} __attribute__ ((packed));
+
+
+#endif
diff --git a/include/linux/ceph/string_table.h b/include/linux/ceph/string_table.h
new file mode 100644
index 000000000..a4a9962d1
--- /dev/null
+++ b/include/linux/ceph/string_table.h
@@ -0,0 +1,63 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _FS_CEPH_STRING_TABLE_H
+#define _FS_CEPH_STRING_TABLE_H
+
+#include <linux/types.h>
+#include <linux/kref.h>
+#include <linux/rbtree.h>
+#include <linux/rcupdate.h>
+
+struct ceph_string {
+ struct kref kref;
+ union {
+ struct rb_node node;
+ struct rcu_head rcu;
+ };
+ size_t len;
+ char str[];
+};
+
+extern void ceph_release_string(struct kref *ref);
+extern struct ceph_string *ceph_find_or_create_string(const char *str,
+ size_t len);
+extern bool ceph_strings_empty(void);
+
+static inline struct ceph_string *ceph_get_string(struct ceph_string *str)
+{
+ kref_get(&str->kref);
+ return str;
+}
+
+static inline void ceph_put_string(struct ceph_string *str)
+{
+ if (!str)
+ return;
+ kref_put(&str->kref, ceph_release_string);
+}
+
+static inline int ceph_compare_string(struct ceph_string *cs,
+ const char* str, size_t len)
+{
+ size_t cs_len = cs ? cs->len : 0;
+ if (cs_len != len)
+ return cs_len - len;
+ if (len == 0)
+ return 0;
+ return strncmp(cs->str, str, len);
+}
+
+#define ceph_try_get_string(x) \
+({ \
+ struct ceph_string *___str; \
+ rcu_read_lock(); \
+ for (;;) { \
+ ___str = rcu_dereference(x); \
+ if (!___str || \
+ kref_get_unless_zero(&___str->kref)) \
+ break; \
+ } \
+ rcu_read_unlock(); \
+ (___str); \
+})
+
+#endif
diff --git a/include/linux/ceph/striper.h b/include/linux/ceph/striper.h
new file mode 100644
index 000000000..3486636c0
--- /dev/null
+++ b/include/linux/ceph/striper.h
@@ -0,0 +1,71 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _LINUX_CEPH_STRIPER_H
+#define _LINUX_CEPH_STRIPER_H
+
+#include <linux/list.h>
+#include <linux/types.h>
+
+struct ceph_file_layout;
+
+void ceph_calc_file_object_mapping(struct ceph_file_layout *l,
+ u64 off, u64 len,
+ u64 *objno, u64 *objoff, u32 *xlen);
+
+struct ceph_object_extent {
+ struct list_head oe_item;
+ u64 oe_objno;
+ u64 oe_off;
+ u64 oe_len;
+};
+
+static inline void ceph_object_extent_init(struct ceph_object_extent *ex)
+{
+ INIT_LIST_HEAD(&ex->oe_item);
+}
+
+/*
+ * Called for each mapped stripe unit.
+ *
+ * @bytes: number of bytes mapped, i.e. the minimum of the full length
+ * requested (file extent length) or the remainder of the stripe
+ * unit within an object
+ */
+typedef void (*ceph_object_extent_fn_t)(struct ceph_object_extent *ex,
+ u32 bytes, void *arg);
+
+int ceph_file_to_extents(struct ceph_file_layout *l, u64 off, u64 len,
+ struct list_head *object_extents,
+ struct ceph_object_extent *alloc_fn(void *arg),
+ void *alloc_arg,
+ ceph_object_extent_fn_t action_fn,
+ void *action_arg);
+int ceph_iterate_extents(struct ceph_file_layout *l, u64 off, u64 len,
+ struct list_head *object_extents,
+ ceph_object_extent_fn_t action_fn,
+ void *action_arg);
+
+struct ceph_file_extent {
+ u64 fe_off;
+ u64 fe_len;
+};
+
+static inline u64 ceph_file_extents_bytes(struct ceph_file_extent *file_extents,
+ u32 num_file_extents)
+{
+ u64 bytes = 0;
+ u32 i;
+
+ for (i = 0; i < num_file_extents; i++)
+ bytes += file_extents[i].fe_len;
+
+ return bytes;
+}
+
+int ceph_extent_to_file(struct ceph_file_layout *l,
+ u64 objno, u64 objoff, u64 objlen,
+ struct ceph_file_extent **file_extents,
+ u32 *num_file_extents);
+
+u64 ceph_get_num_objects(struct ceph_file_layout *l, u64 size);
+
+#endif
diff --git a/include/linux/ceph/types.h b/include/linux/ceph/types.h
new file mode 100644
index 000000000..bd3d53290
--- /dev/null
+++ b/include/linux/ceph/types.h
@@ -0,0 +1,31 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _FS_CEPH_TYPES_H
+#define _FS_CEPH_TYPES_H
+
+/* needed before including ceph_fs.h */
+#include <linux/in.h>
+#include <linux/types.h>
+#include <linux/fcntl.h>
+#include <linux/string.h>
+
+#include <linux/ceph/ceph_fs.h>
+#include <linux/ceph/ceph_frag.h>
+#include <linux/ceph/ceph_hash.h>
+
+/*
+ * Identify inodes by both their ino AND snapshot id (a u64).
+ */
+struct ceph_vino {
+ u64 ino;
+ u64 snap;
+};
+
+
+/* context for the caps reservation mechanism */
+struct ceph_cap_reservation {
+ int count;
+ int used;
+};
+
+
+#endif