summaryrefslogtreecommitdiffstats
path: root/epan/dissectors/packet-rpcrdma.c
diff options
context:
space:
mode:
authorDaniel Baumann <daniel.baumann@progress-linux.org>2024-04-10 20:34:10 +0000
committerDaniel Baumann <daniel.baumann@progress-linux.org>2024-04-10 20:34:10 +0000
commite4ba6dbc3f1e76890b22773807ea37fe8fa2b1bc (patch)
tree68cb5ef9081156392f1dd62a00c6ccc1451b93df /epan/dissectors/packet-rpcrdma.c
parentInitial commit. (diff)
downloadwireshark-e4ba6dbc3f1e76890b22773807ea37fe8fa2b1bc.tar.xz
wireshark-e4ba6dbc3f1e76890b22773807ea37fe8fa2b1bc.zip
Adding upstream version 4.2.2.upstream/4.2.2
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to 'epan/dissectors/packet-rpcrdma.c')
-rw-r--r--epan/dissectors/packet-rpcrdma.c2093
1 files changed, 2093 insertions, 0 deletions
diff --git a/epan/dissectors/packet-rpcrdma.c b/epan/dissectors/packet-rpcrdma.c
new file mode 100644
index 00000000..e7f650c9
--- /dev/null
+++ b/epan/dissectors/packet-rpcrdma.c
@@ -0,0 +1,2093 @@
+/* packet-rpcrdma.c
+ * Routines for RPC over RDMA dissection (RFC 5666)
+ * Copyright 2014-2015, Mellanox Technologies Ltd.
+ * Code by Yan Burman.
+ *
+ * Wireshark - Network traffic analyzer
+ * By Gerald Combs <gerald@wireshark.org>
+ * Copyright 1998 Gerald Combs
+ *
+ * SPDX-License-Identifier: GPL-2.0-or-later
+ */
+
+#include "config.h"
+
+#include <stdlib.h>
+
+#include <epan/packet.h>
+#include <epan/prefs.h>
+#include <epan/exceptions.h>
+#include <epan/proto_data.h>
+#include <epan/reassemble.h>
+#include <epan/conversation.h>
+#include <epan/addr_resolv.h>
+
+#include "packet-rpcrdma.h"
+#include "packet-frame.h"
+#include "packet-infiniband.h"
+#include "packet-iwarp-ddp-rdmap.h"
+
+#define MIN_RPCRDMA_HDR_SZ 16 /* presumably bytes (four 32-bit header words) -- TODO confirm at the use site */
+#define MIN_RPCRDMA_MSG_SZ (MIN_RPCRDMA_HDR_SZ + 12) /* header size plus 12 */
+#define MIN_RPCRDMA_MSGP_SZ (MIN_RPCRDMA_MSG_SZ + 8) /* message size plus 8 */
+
+#define SID_ULP_MASK 0x00000000FF000000 /* selects bits 24..31 */
+#define SID_PROTO_MASK 0x0000000000FF0000 /* selects bits 16..23 */
+#define SID_PORT_MASK 0x000000000000FFFF /* selects bits 0..15 */
+
+#define SID_ULP 0x01 /* value placed in the SID_ULP_MASK byte by SID_ULP_TCP */
+#define SID_PROTO_TCP 0x06 /* value placed in the SID_PROTO_MASK byte by SID_ULP_TCP */
+#define TCP_PORT_RPCRDMA_RANGE "20049,2050" /* comma-separated port list; consumer is outside this view */
+
+#define SID_MASK (SID_ULP_MASK | SID_PROTO_MASK) /* ULP and protocol bytes together (bits 16..31) */
+#define SID_ULP_TCP ((SID_ULP << 3 * 8) | (SID_PROTO_TCP << 2 * 8)) /* SID_ULP in byte 3, SID_PROTO_TCP in byte 2 */
+
+void proto_reg_handoff_rpcordma(void);
+void proto_register_rpcordma(void);
+
+static int proto_rpcordma = -1; /* also the key for conversation and per-packet proto data in this file */
+static dissector_handle_t rpcordma_handle;
+static dissector_handle_t rpc_handler;
+
+/* RPCoRDMA Header fields (hf indices, -1 until registered) */
+static int hf_rpcordma_xid = -1;
+static int hf_rpcordma_vers = -1;
+static int hf_rpcordma_flow_control = -1;
+static int hf_rpcordma_message_type = -1;
+
+/* chunks: list counters, read position and per-chunk segment count */
+static int hf_rpcordma_reads_count = -1;
+static int hf_rpcordma_writes_count = -1;
+static int hf_rpcordma_reply_count = -1;
+
+static int hf_rpcordma_position = -1;
+static int hf_rpcordma_segment_count = -1;
+
+/* rdma_segment: handle (32 bit), length (32 bit), offset (64 bit) */
+static int hf_rpcordma_rdma_handle = -1;
+static int hf_rpcordma_rdma_length = -1;
+static int hf_rpcordma_rdma_offset = -1;
+
+static int hf_rpcordma_rdma_align = -1;
+static int hf_rpcordma_rdma_thresh = -1;
+
+static int hf_rpcordma_errcode = -1;
+static int hf_rpcordma_vers_high = -1;
+static int hf_rpcordma_vers_low = -1;
+
+/* Subtree (ett) indices, all initialized to -1 */
+static gint ett_rpcordma = -1;
+static gint ett_rpcordma_chunk = -1;
+static gint ett_rpcordma_read_list = -1;
+static gint ett_rpcordma_read_chunk = -1;
+static gint ett_rpcordma_write_list = -1;
+static gint ett_rpcordma_write_chunk = -1;
+static gint ett_rpcordma_reply_chunk = -1;
+static gint ett_rpcordma_segment = -1;
+
+/* Fragmentation: fields and subtrees referenced by rpcordma_frag_items */
+static int hf_rpcordma_fragments = -1;
+static int hf_rpcordma_fragment = -1;
+static int hf_rpcordma_fragment_overlap = -1;
+static int hf_rpcordma_fragment_overlap_conflicts = -1;
+static int hf_rpcordma_fragment_multiple_tails = -1;
+static int hf_rpcordma_fragment_too_long_fragment = -1;
+static int hf_rpcordma_fragment_error = -1;
+static int hf_rpcordma_fragment_count = -1;
+static int hf_rpcordma_reassembled_in = -1;
+static int hf_rpcordma_reassembled_length = -1;
+static int hf_rpcordma_reassembled_data = -1;
+
+static gint ett_rpcordma_fragment = -1;
+static gint ett_rpcordma_fragments = -1;
+
+static const fragment_items rpcordma_frag_items = { /* handed to process_reassembled_data() in get_reassembled_data() */
+ /* Fragment subtrees */
+ &ett_rpcordma_fragment,
+ &ett_rpcordma_fragments,
+ /* Fragment fields */
+ &hf_rpcordma_fragments,
+ &hf_rpcordma_fragment,
+ &hf_rpcordma_fragment_overlap,
+ &hf_rpcordma_fragment_overlap_conflicts,
+ &hf_rpcordma_fragment_multiple_tails,
+ &hf_rpcordma_fragment_too_long_fragment,
+ &hf_rpcordma_fragment_error,
+ &hf_rpcordma_fragment_count,
+ /* Reassembled in field */
+ &hf_rpcordma_reassembled_in,
+ /* Reassembled length field */
+ &hf_rpcordma_reassembled_length,
+ /* Reassembled data field */
+ &hf_rpcordma_reassembled_data,
+ /* Tag */
+ "RPCoRDMA fragments"
+};
+
+/* Reassembly table used by every fragment add/end/lookup call in this file */
+static reassembly_table rpcordma_reassembly_table;
+
+enum MSG_TYPE { /* values are implicit: 0 through 4 in declaration order */
+ RDMA_MSG, /* 0 */
+ RDMA_NOMSG, /* 1 */
+ RDMA_MSGP, /* 2 */
+ RDMA_DONE, /* 3 */
+ RDMA_ERROR /* 4 */
+};
+
+static const value_string rpcordma_message_type[] = { /* MSG_TYPE value -> display name */
+ {RDMA_MSG, "RDMA_MSG"},
+ {RDMA_NOMSG, "RDMA_NOMSG"},
+ {RDMA_MSGP, "RDMA_MSGP"},
+ {RDMA_DONE, "RDMA_DONE"},
+ {RDMA_ERROR, "RDMA_ERROR"},
+ {0, NULL} /* list terminator */
+};
+
+#define ERR_VERS 1
+#define ERR_CHUNK 2
+
+static const value_string rpcordma_err[] = { /* ERR_* value -> display name */
+ {ERR_VERS, "ERR_VERS"},
+ {ERR_CHUNK, "ERR_CHUNK"},
+ {0, NULL} /* list terminator */
+};
+
+/* RDMA chunk type; stored in rdma_chunk_t.type and segment_info_t.type */
+typedef enum {
+ RDMA_READ_CHUNK,
+ RDMA_WRITE_CHUNK,
+ RDMA_REPLY_CHUNK
+} chunk_type_t;
+
+/* RDMA segment as parsed from a read/write/reply list entry */
+typedef struct {
+ guint32 xdrpos; /* Position in XDR stream -- RDMA read only (set to 0 for write segments) */
+ guint32 handle; /* Registered memory handle */
+ guint32 length; /* Length of segment in bytes */
+ guint64 offset; /* Segment virtual address or offset */
+} rdma_segment_t;
+
+/* RDMA chunk: a typed group of segments */
+typedef struct {
+ chunk_type_t type; /* Chunk type */
+ guint32 length; /* Length of chunk in bytes */
+ wmem_array_t *segments; /* Array of rdma_segment_t elements for chunk */
+} rdma_chunk_t;
+
+/* RPC-over-RDMA lists; the read list is created lazily as an array of rdma_chunk_t */
+typedef struct {
+ wmem_array_t *p_read_list; /* List of RDMA read chunks */
+ wmem_array_t *p_write_list; /* List of RDMA write chunks */
+ wmem_array_t *p_reply_list; /* List of RDMA reply chunks */
+} rdma_lists_t;
+
+/*
+ * Segment information for RDMA I/O
+ * All segments belonging to the same chunk list have the same message ID
+ * A segment could have multiple I/O requests
+ * Instances are looked up by handle through rdma_conv_info_t.segment_list.
+ */
+typedef struct {
+ guint32 handle; /* Handle or remote key of segment */
+ guint64 offset; /* Segment virtual address or offset */
+ guint32 msgid; /* ID for fragments belonging together */
+ guint32 msgno; /* Message number base so fragments are
+ consecutive within segment requests */
+ chunk_type_t type; /* Chunk type for segment */
+ guint32 xdrpos; /* Position in XDR stream -- RDMA read only */
+ guint32 length; /* Length of segment in bytes */
+ guint32 rbytes; /* Number of bytes added to reassembly table */
+} segment_info_t;
+
+/* InfiniBand request record, created by add_request_info() and keyed by PSN */
+typedef struct {
+ guint32 psn; /* First PSN for request */
+ guint32 length; /* Request length */
+ guint64 offset; /* Request offset */
+ segment_info_t *segment; /* Segment info for RDMA I/O */
+} ib_request_t;
+
+/* Send reassembly info structure */
+typedef struct {
+ guint32 msgid; /* ID for fragments belonging together */
+ guint32 msgno; /* Message number base */
+ guint32 rsize; /* Number of bytes added to reassembly table */
+} send_info_t;
+
+/* State structure per conversation, allocated on first use by get_rdma_conv_info() */
+typedef struct {
+ wmem_tree_t *segment_list; /* Binary tree of segments searched by handle */
+ wmem_tree_t *psn_list; /* Binary tree of IB requests searched by PSN */
+ wmem_tree_t *msgid_list; /* Binary tree of segments with same message id */
+ wmem_tree_t *request_list; /* Binary tree of iWarp read requests for mapping sink -> source */
+ wmem_tree_t *send_list; /* Binary tree for mapping PSN -> msgid (IB) */
+ wmem_tree_t *msn_list; /* Binary tree for mapping MSN -> msgid (iWarp) */
+ segment_info_t *segment_info; /* Current READ/WRITE/REPLY segment info */
+ guint32 iosize; /* Maximum size of data transferred in a
+ single packet; starts at 1 and only grows */
+} rdma_conv_info_t;
+
+/* Proto data keys (per-packet data stored under proto_rpcordma) */
+enum {
+ RPCRDMA_MSG_ID, /* guint32 msgid saved by add_fragment() */
+ RPCRDMA_FRAG_HEAD, /* fragment_head saved by set_fragment_head() */
+ RPCRDMA_WRITE_SIZE, /* not referenced in the visible part of the file */
+};
+
+/* Return the number of fragments of size 'b' in 'a', i.e. a/b rounded up.
+ * Both arguments are evaluated twice and 'b' must be non-zero. */
+#define NFRAGS(a,b) ((a)/(b) + ((a)%(b) ? 1: 0))
+
+/*
+ * Global variable set for every InfiniBand packet. This is used because
+ * the arguments in dissect_rpcrdma are fixed and cannot be changed to pass
+ * an extra argument to differentiate between InfiniBand and iWarp.
+ * Reassembly is only supported for InfiniBand packets.
+ * NOTE(review): this file also defines add_iwarp_fragment(), so the last
+ * sentence above looks outdated -- confirm.
+ */
+static struct infinibandinfo *gp_infiniband_info = NULL;
+
+/* Global variable set for every iWarp packet */
+static rdmap_info_t *gp_rdmap_info = NULL;
+
+/* Call process_reassembled_data just once per frame; cleared by
+ * get_reassembled_data() once it has produced the reassembled tvb */
+static gboolean g_needs_reassembly = FALSE;
+
+/* Array of offsets for reduced data in write chunks; NULL between frames
+ * (see reset_write_offsets()) */
+static wmem_array_t *gp_rdma_write_offsets = NULL;
+
+/* Signal upper layer(s) the current frame's data has been reduced by DDP */
+static gboolean g_rpcrdma_reduced = FALSE;
+
+/*
+ * Tell upper layer(s) whether the data of the current frame has been reduced
+ * by DDP (direct data placement), i.e. large data chunks were taken out of
+ * the XDR data so they could be moved with RDMA writes.
+ * The flag only matters for RDMA write chunks: read chunks carry an xdrpos
+ * that tells the server where each chunk belongs in the reduced message,
+ * whereas write chunks only carry a virtual address that is internal to the
+ * client and cannot be mapped to an offset within the XDR data.
+ *
+ * Returns the current value of g_rpcrdma_reduced.
+ */
+gboolean
+rpcrdma_is_reduced(void)
+{
+    const gboolean reduced = g_rpcrdma_reduced;
+
+    return reduced;
+}
+
+/*
+ * Insert offset in the reduced data write chunk array.
+ * Offset is relative to the reduced message from the end of the reported
+ * buffer because the upper layer is dealing with the reduced XDR message
+ * so it is easier to report this offset back and calculate the correct XDR
+ * position in this layer before reassembly starts for a reduced message.
+ *
+ * The array pointer is reset to NULL at the end of every frame, so a call
+ * made while no array exists is ignored instead of dereferencing NULL.
+ */
+void rpcrdma_insert_offset(gint offset)
+{
+    if (gp_rdma_write_offsets == NULL) {
+        /* No write-offset array for the current frame: nothing to record */
+        return;
+    }
+    wmem_array_append_one(gp_rdma_write_offsets, offset);
+}
+
+/*
+ * Drop the reference to the write-offset array once a frame is done.
+ * The array lives in packet-scoped memory, so nothing has to be freed here;
+ * clearing the global only guarantees that a later packet never sees a
+ * pointer into memory that is no longer allocated.
+ */
+static void reset_write_offsets(void)
+{
+    gp_rdma_write_offsets = (wmem_array_t *)NULL;
+}
+
+/*
+ * Return the RPC-over-RDMA state attached to the conversation of 'pinfo'.
+ * The conversation is looked up (or created), and the state is allocated in
+ * file scope and attached to it the first time it is requested.
+ */
+static rdma_conv_info_t *
+get_rdma_conv_info(packet_info *pinfo)
+{
+    conversation_t   *conv = find_or_create_conversation(pinfo);
+    rdma_conv_info_t *state;
+
+    state = (rdma_conv_info_t *)conversation_get_proto_data(conv, proto_rpcordma);
+    if (state != NULL) {
+        /* State already exists for this conversation */
+        return state;
+    }
+
+    /* First use: build an empty state and attach it to the conversation */
+    state = wmem_new(wmem_file_scope(), rdma_conv_info_t);
+    state->segment_list = wmem_tree_new(wmem_file_scope());
+    state->psn_list     = wmem_tree_new(wmem_file_scope());
+    state->msgid_list   = wmem_tree_new(wmem_file_scope());
+    state->send_list    = wmem_tree_new(wmem_file_scope());
+    state->msn_list     = wmem_tree_new(wmem_file_scope());
+    state->request_list = wmem_tree_new(wmem_file_scope());
+    state->segment_info = NULL;
+    state->iosize       = 1;
+    conversation_add_proto_data(conv, proto_rpcordma, state);
+
+    return state;
+}
+
+/* Raise the conversation's maximum RDMA I/O size to 'size' if it is larger */
+static void
+set_max_iosize(rdma_conv_info_t *p_rdma_conv_info, guint size)
+{
+    if (size > p_rdma_conv_info->iosize) {
+        p_rdma_conv_info->iosize = size;
+    }
+}
+
+/*
+ * Hand out the next message ID from a counter that persists across calls.
+ * Zero is never returned: when the counter wraps around it restarts at 1.
+ */
+static guint32
+get_msg_id(void)
+{
+    static guint32 last_id = 0;
+
+    last_id++;
+    if (last_id == 0) {
+        /* Wrapped around: skip the reserved value 0 */
+        last_id = 1;
+    }
+    return last_id;
+}
+
+/*
+ * Find segment info for the given handle and offset.
+ *
+ * The segment registered under 'handle' is returned only when 'offset' lies
+ * inside [segment offset, segment offset + segment length); otherwise NULL.
+ * The upper bound is tested as a distance from the segment start so that a
+ * segment ending near the top of the 64-bit address space cannot wrap the
+ * addition and be rejected by mistake.
+ */
+static segment_info_t *find_segment_info(rdma_conv_info_t *p_rdma_conv_info, guint32 handle, guint64 offset)
+{
+    segment_info_t *p_segment_info;
+
+    p_segment_info = (segment_info_t *)wmem_tree_lookup32(p_rdma_conv_info->segment_list, handle);
+    if (p_segment_info == NULL || offset < p_segment_info->offset) {
+        /* Unknown handle, or offset before the start of the segment */
+        return NULL;
+    }
+    if (offset - p_segment_info->offset >= p_segment_info->length) {
+        /* Offset at or past the end of the segment */
+        return NULL;
+    }
+    return p_segment_info;
+}
+
+/*
+ * Record the InfiniBand request carried by the current packet.
+ * On the first pass only, the segment matching the RETH remote key and
+ * address is looked up; when found, a file-scoped request record is stored
+ * in the conversation's PSN tree under the packet sequence number.
+ */
+static void
+add_request_info(rdma_conv_info_t *p_rdma_conv_info, packet_info *pinfo)
+{
+    segment_info_t *seg;
+    ib_request_t   *req;
+
+    if (pinfo->fd->visited) {
+        /* Requests are only collected on the first pass */
+        return;
+    }
+
+    seg = find_segment_info(p_rdma_conv_info,
+            gp_infiniband_info->reth_remote_key,
+            gp_infiniband_info->reth_remote_address);
+    if (seg == NULL) {
+        return;
+    }
+
+    /* Add request to list */
+    req = wmem_new(wmem_file_scope(), ib_request_t);
+    req->psn     = gp_infiniband_info->packet_seq_num;
+    req->offset  = gp_infiniband_info->reth_remote_address;
+    req->length  = gp_infiniband_info->reth_dma_length;
+    req->segment = seg;
+    wmem_tree_insert32(p_rdma_conv_info->psn_list, gp_infiniband_info->packet_seq_num, req);
+}
+
+/*
+ * Tell whether every segment registered for 'msgid' has had all of its
+ * bytes added to the reassembly table (more than the segment length may
+ * have been added because of padding bytes).
+ * Returns FALSE when no segment is registered for the message ID or when
+ * any segment is still short of data.
+ */
+static gboolean
+is_reassembly_done(rdma_conv_info_t *p_rdma_conv_info, guint32 msgid)
+{
+    guint32            wanted = 0;          /* Sum of segment lengths */
+    guint32            got = 0;             /* Sum of bytes received */
+    gboolean           have_segment = FALSE;
+    int                last_type = -1;      /* Type of the last segment seen */
+    wmem_list_t       *segments;
+    wmem_list_frame_t *frame;
+    segment_info_t    *seg;
+
+    /* Get all segments for the given msgid */
+    segments = wmem_tree_lookup32(p_rdma_conv_info->msgid_list, msgid);
+    if (segments == NULL) {
+        return FALSE;
+    }
+
+    frame = wmem_list_head(segments);
+    while (frame != NULL) {
+        seg = wmem_list_frame_data(frame);
+        if (seg->rbytes < seg->length) {
+            /* Not all bytes have been received for this request */
+            return FALSE;
+        }
+        have_segment = TRUE;
+        last_type = seg->type;
+        wanted += seg->length;
+        got += seg->rbytes;
+        frame = wmem_list_frame_next(frame);
+    }
+
+    if (!have_segment) {
+        /* Make sure there is at least one segment */
+        return FALSE;
+    }
+    if (last_type == RDMA_READ_CHUNK) {
+        /*
+         * Reassembly is triggered on the READ_RESPONSE_LAST, and a read
+         * request may follow the last read response of the previous
+         * request, which would give a false positive. Compare against the
+         * total size of all segments required for the message.
+         */
+        return (got >= wanted);
+    }
+    return TRUE;
+}
+
+/*
+ * Fetch the fragment head cached for this frame in the per-packet proto
+ * data (key RPCRDMA_FRAG_HEAD); NULL when nothing has been cached.
+ */
+static fragment_head *
+get_fragment_head(packet_info *pinfo)
+{
+    void *cached = p_get_proto_data(wmem_file_scope(), pinfo, proto_rpcordma, RPCRDMA_FRAG_HEAD);
+
+    return (fragment_head *)cached;
+}
+
+/*
+ * Cache 'fd_head' in the per-packet proto data of this frame.
+ * Nothing is stored for a NULL head or when the same head is already cached.
+ */
+static void
+set_fragment_head(fragment_head *fd_head, packet_info *pinfo)
+{
+    if (fd_head == NULL) {
+        return;
+    }
+    if (get_fragment_head(pinfo) == fd_head) {
+        /* Already cached for this frame */
+        return;
+    }
+    p_add_proto_data(wmem_file_scope(), pinfo, proto_rpcordma, RPCRDMA_FRAG_HEAD, fd_head);
+}
+
+/*
+ * Look up the reassembled fragment head through the message ID saved for
+ * the current frame. Returns NULL when no message ID was saved for it.
+ */
+static fragment_head *
+get_reassembled_id(packet_info *pinfo)
+{
+    const guint32 *saved_id;
+
+    saved_id = (guint32 *)p_get_proto_data(wmem_file_scope(), pinfo, proto_rpcordma, RPCRDMA_MSG_ID);
+    if (saved_id == NULL) {
+        return NULL;
+    }
+    return fragment_get_reassembled_id(&rpcordma_reassembly_table, pinfo, *saved_id);
+}
+
+/*
+ * Return the reassembled data for this frame, or NULL when reassembly is
+ * not pending for the frame or no fragment head is available yet.
+ * g_needs_reassembly is cleared after process_reassembled_data() has been
+ * called so it runs just once per frame.
+ */
+static tvbuff_t *
+get_reassembled_data(tvbuff_t *tvb, guint offset,
+        packet_info *pinfo, proto_tree *tree)
+{
+    fragment_head *head;
+    tvbuff_t      *reassembled;
+
+    if (!g_needs_reassembly) {
+        return NULL;
+    }
+
+    /* Fragment head of a fragment, used for the "Reassembled in" message */
+    head = get_reassembled_id(pinfo);
+    if (head == NULL) {
+        /* Fragment head on the frame where reassembly was completed */
+        head = get_fragment_head(pinfo);
+    }
+    if (head == NULL) {
+        return NULL;
+    }
+
+    reassembled = process_reassembled_data(tvb, offset, pinfo,
+            "Reassembled RPCoRDMA Message", head, &rpcordma_frag_items,
+            NULL, tree);
+    /* Call process_reassembled_data just once per frame */
+    g_needs_reassembly = FALSE;
+
+    return reassembled;
+}
+
+/*
+ * Finish the reassembly of message 'msgid'.
+ *   - With a NULL p_rdma_conv_info the reassembly is completed right away.
+ *   - Otherwise it is completed only once is_reassembly_done() confirms
+ *     that all data has been received.
+ * Returns the fragment head from fragment_end_seq_next() (also cached for
+ * the frame), or NULL when the reassembly was not completed.
+ */
+static fragment_head *
+end_reassembly(guint32 msgid,
+        rdma_conv_info_t *p_rdma_conv_info, packet_info *pinfo)
+{
+    fragment_head *head;
+
+    if (p_rdma_conv_info != NULL && !is_reassembly_done(p_rdma_conv_info, msgid)) {
+        /* Data is still missing for this message */
+        return NULL;
+    }
+
+    /* Complete the reassembly */
+    head = fragment_end_seq_next(&rpcordma_reassembly_table, pinfo, msgid, NULL);
+    set_fragment_head(head, pinfo);
+    return head;
+}
+
+/*
+ * Add a fragment to the reassembly table and return the reassembled data
+ * if all fragments have been added
+ *
+ * tvb/offset        data of the fragment; offset is used when adding the
+ *                   fragment, the final lookup always uses offset 0
+ * msgid             reassembly ID the fragment belongs to
+ * msg_num           sequence number of the fragment; a negative value means
+ *                   nothing is added
+ * more_frags        passed on to fragment_add_seq_check(); when FALSE and
+ *                   the frame has no captured data, the reassembly is ended
+ *                   through end_reassembly() instead (only if
+ *                   p_rdma_conv_info is non-NULL)
+ * p_rdma_conv_info  conversation state, may be NULL
+ *
+ * Nothing is added when a fragment head is already cached for this frame.
+ * Returns whatever get_reassembled_data() yields, which is NULL while
+ * fragments are still missing.
+ */
+static tvbuff_t *add_fragment(tvbuff_t *tvb, gint offset, guint32 msgid,
+ gint32 msg_num, gboolean more_frags, rdma_conv_info_t *p_rdma_conv_info,
+ packet_info *pinfo, proto_tree *tree)
+{
+ guint8 pad_count = 0;
+ guint32 nbytes, frag_size;
+ tvbuff_t *new_tvb = NULL;
+ fragment_head *fd_head = NULL;
+ guint32 *p_msgid;
+
+ if (gp_infiniband_info) {
+ pad_count = gp_infiniband_info->pad_count; /* stays 0 when the InfiniBand global is not set */
+ }
+
+ /* Get fragment head if reassembly has been completed */
+ fd_head = get_fragment_head(pinfo);
+ if (fd_head == NULL) {
+ /* Reassembly has not been completed yet */
+ if (msg_num >= 0) {
+ nbytes = tvb_captured_length_remaining(tvb, offset);
+ if (nbytes > 0 || more_frags) {
+ /* Add message fragment to reassembly table */
+ if (pad_count > 0 && p_rdma_conv_info && \
+ p_rdma_conv_info->segment_info != NULL && \
+ p_rdma_conv_info->segment_info->type == RDMA_READ_CHUNK && \
+ p_rdma_conv_info->segment_info->xdrpos == 0) {
+ /* Do not include any padding bytes inserted by Infiniband
+ * layer if this is a PZRC (Position-Zero Read Chunk) since
+ * payload stream already has any necessary padding bytes */
+ frag_size = tvb_reported_length_remaining(tvb, offset) - pad_count; /* unsigned: wraps if pad_count exceeds the remaining length */
+ if (frag_size < nbytes) {
+ nbytes = frag_size;
+ }
+ }
+ fd_head = fragment_add_seq_check(&rpcordma_reassembly_table,
+ tvb, offset, pinfo,
+ msgid, NULL, (guint32)msg_num,
+ nbytes, more_frags);
+ /* Save the msgid in the proto data cache */
+ p_msgid = wmem_new(wmem_file_scope(), guint32);
+ *p_msgid = msgid;
+ p_add_proto_data(wmem_file_scope(), pinfo, proto_rpcordma, RPCRDMA_MSG_ID, p_msgid);
+ } else if (p_rdma_conv_info) {
+ /* No data in this frame, so just complete the reassembly
+ * if reassembly is really done */
+ fd_head = end_reassembly(msgid, p_rdma_conv_info, pinfo);
+ }
+ /* Add the fragment head to the packet cache */
+ set_fragment_head(fd_head, pinfo);
+ }
+ }
+
+ /* Get reassembled data */
+ new_tvb = get_reassembled_data(tvb, 0, pinfo, tree);
+
+ return new_tvb;
+}
+
+/*
+ * Add an Infiniband fragment to the reassembly table and return the
+ * reassembled data if all fragments have been added
+ *
+ * Reads gp_infiniband_info, so it must be set by the caller.
+ *   - Frame already visited: only get_reassembled_data() is called.
+ *   - only_frag: the segment is looked up directly with the RETH remote
+ *     key and remote address of the packet.
+ *   - otherwise: the request is looked up in psn_list with
+ *     wmem_tree_lookup32_le() on the packet sequence number and accepted
+ *     only if the PSN distance to the request is below
+ *     NFRAGS(request length, iosize).
+ * When a segment is found its rbytes grows by the reported length of tvb
+ * and the fragment is added with message number
+ * segment msgno + 1 + fragment offset within the segment + PSN distance.
+ * Returns NULL when no segment matches.
+ *
+ * NOTE(review): va_offset has no initializer; it is assigned on every path
+ * that also sets p_segment_info, which is the only case where it is read.
+ */
+static tvbuff_t *add_ib_fragment(tvbuff_t *tvb,
+ rdma_conv_info_t *p_rdma_conv_info, gboolean only_frag,
+ packet_info *pinfo, proto_tree *tree)
+{
+ guint32 msgid, msg_num, msg_off;
+ guint32 nfrags, psndelta = 0;
+ tvbuff_t *new_tvb = NULL;
+ ib_request_t *p_ib_request;
+ segment_info_t *p_segment_info = NULL;
+ guint32 iosize = p_rdma_conv_info->iosize;
+ guint64 va_offset;
+
+ if (pinfo->fd->visited) {
+ return get_reassembled_data(tvb, 0, pinfo, tree);
+ } else if (only_frag) {
+ /* Write Only: no request so use segment info */
+ p_segment_info = find_segment_info(p_rdma_conv_info, gp_infiniband_info->reth_remote_key, gp_infiniband_info->reth_remote_address);
+ va_offset = gp_infiniband_info->reth_remote_address;
+ } else {
+ p_rdma_conv_info->segment_info = NULL;
+ /* Get correct request */
+ p_ib_request = (ib_request_t *)wmem_tree_lookup32_le(p_rdma_conv_info->psn_list, gp_infiniband_info->packet_seq_num);
+ if (p_ib_request) {
+ psndelta = gp_infiniband_info->packet_seq_num - p_ib_request->psn;
+ nfrags = NFRAGS((p_ib_request->length), iosize);
+ if (psndelta < nfrags) {
+ /* This is the correct request */
+ p_segment_info = p_ib_request->segment;
+ /* Make message number relative to request */
+ va_offset = p_ib_request->offset;
+ }
+ }
+ }
+ if (p_segment_info) {
+ p_rdma_conv_info->segment_info = p_segment_info;
+ p_segment_info->rbytes += tvb_reported_length(tvb);
+ /* Make message number relative to request or segment(write only) */
+ msg_off = (guint32)NFRAGS((va_offset - p_segment_info->offset), iosize) + psndelta;
+ msgid = p_segment_info->msgid;
+ msg_num = p_segment_info->msgno + 1 + msg_off; /* +1 skips message number 0 of the segment base */
+ new_tvb = add_fragment(tvb, 0, msgid, msg_num, TRUE, p_rdma_conv_info, pinfo, tree);
+ }
+ return new_tvb;
+}
+
+/*
+ * Add padding bytes as a separate fragment when last fragment's data is not
+ * on a four-byte boundary. The MPA layer removes the padding bytes from all
+ * iWarp Reads/Writes. The iWarp Send messages are padded correctly.
+ *
+ * tvb/offset  payload of the current iWarp Read/Write; only its reported
+ *             remaining length is used to compute the padding (0..3 bytes)
+ * msgid       reassembly ID the padding fragment is added to
+ * msgno       sequence number given to the padding fragment
+ *
+ * The zero-filled padding buffer comes from the packet pool and the tvb
+ * wrapping it is created as a child of 'tvb', so it is released together
+ * with the parent's tvb chain instead of being leaked.
+ */
+static void add_iwarp_padding(tvbuff_t *tvb, gint offset,
+        guint32 msgid, guint32 msgno, packet_info *pinfo)
+{
+    guint8   *pbuf;
+    tvbuff_t *pad_tvb;
+    /* Size of payload data for current iWarp Read/Write */
+    guint32 bsize = tvb_reported_length_remaining(tvb, offset);
+    /* Number of padding bytes needed */
+    guint32 padding = (4 - (bsize%4)) % 4;
+
+    if (padding == 0) {
+        /* Payload already ends on a four-byte boundary */
+        return;
+    }
+
+    /* Zero-filled buffer for the padding bytes that will be added */
+    pbuf = (guint8 *)wmem_alloc0(pinfo->pool, padding);
+    /* Create tvb buffer, owned by the chain of the payload tvb */
+    pad_tvb = tvb_new_child_real_data(tvb, pbuf, padding, padding);
+    /* Add padding fragment to the reassembly table */
+    fragment_add_seq_check(&rpcordma_reassembly_table, pad_tvb, 0,
+            pinfo, msgid, NULL, msgno, padding, TRUE);
+}
+
+/*
+ * Add an iWarp fragment to the reassembly table and return the
+ * reassembled data if all fragments have been added
+ *
+ * Reads gp_rdmap_info, so it must be set by the caller.
+ *   - Frame already visited: only get_reassembled_data() is called.
+ *   - RDMA_READ_RESPONSE: the sink STag/offset of the response is mapped
+ *     to the source STag/offset through request_list; NULL is returned
+ *     when no read request is recorded for the STag.
+ *   - anything else is treated as a write and uses the STag and tagged
+ *     offset of the packet as they are.
+ * The message number is the byte distance from the start of the segment
+ * plus the segment's msgno base plus one.
+ * On a fragment with the last flag set, the lengths and received bytes of
+ * all segments sharing the message ID are summed to decide whether the
+ * padding fragment is added and whether the reassembly is ended.
+ * Returns NULL when no segment matches the STag/offset.
+ */
+static tvbuff_t *add_iwarp_fragment(tvbuff_t *tvb,
+ rdma_conv_info_t *p_rdma_conv_info, packet_info *pinfo,
+ proto_tree *tree)
+{
+ guint32 sbytes = 0; /* Total bytes for all segments in current reassembly */
+ guint32 rbytes = 0; /* Total bytes received so far */
+ guint32 msgno; /* Message number for this fragment */
+ guint32 steering_tag;
+ guint64 tagged_offset;
+ gboolean more_frags = TRUE;
+ wmem_list_t *msgid_segments;
+ wmem_list_frame_t *item;
+ segment_info_t *p_seginfo;
+ segment_info_t *p_segment_info;
+ rdmap_request_t *p_read_request = NULL;
+ tvbuff_t *new_tvb = NULL;
+
+ if (pinfo->fd->visited) {
+ return get_reassembled_data(tvb, 0, pinfo, tree);
+ } else if (gp_rdmap_info->opcode == RDMA_READ_RESPONSE) {
+ /* Read fragment: map sink -> source using the request info */
+ p_read_request = wmem_tree_lookup32(p_rdma_conv_info->request_list, gp_rdmap_info->steering_tag);
+ if (p_read_request) {
+ /* Map Read Response STag to segment STag */
+ steering_tag = p_read_request->source_stag;
+ /* Map Read Response offset to segment offset */
+ tagged_offset = gp_rdmap_info->tagged_offset - p_read_request->sink_toffset + p_read_request->source_toffset;
+ } else {
+ return NULL;
+ }
+ } else {
+ /* Write fragment: no need for mapping, use steering tag and offset */
+ steering_tag = gp_rdmap_info->steering_tag;
+ tagged_offset = gp_rdmap_info->tagged_offset;
+ }
+
+ p_rdma_conv_info->segment_info = NULL;
+ p_segment_info = find_segment_info(p_rdma_conv_info, steering_tag, tagged_offset);
+ if (p_segment_info) {
+ /* Message number is relative with respect to chunk, adding
+ * one since msgno = 0 is reserved for the reduced message */
+ msgno = (guint32)(tagged_offset - p_segment_info->offset) + p_segment_info->msgno + 1;
+ p_rdma_conv_info->segment_info = p_segment_info;
+
+ /* Include this fragment's data */
+ p_segment_info->rbytes += tvb_captured_length_remaining(tvb, 0);
+
+ if (gp_rdmap_info->last_flag) {
+ /* This is a last fragment so go through all segments
+ * to calculate sbytes and rbytes */
+ msgid_segments = wmem_tree_lookup32(p_rdma_conv_info->msgid_list, p_segment_info->msgid);
+ if (msgid_segments) {
+ for (item = wmem_list_head(msgid_segments); item != NULL; item = wmem_list_frame_next(item)) {
+ p_seginfo = wmem_list_frame_data(item);
+ sbytes += p_seginfo->length;
+ rbytes += p_seginfo->rbytes;
+ }
+ }
+ if (p_read_request && rbytes == sbytes) {
+ /* Complete read chunk reassembly since all fragments
+ * have been received */
+ more_frags = FALSE;
+ }
+ }
+ new_tvb = add_fragment(tvb, 0, p_segment_info->msgid, msgno, TRUE, p_rdma_conv_info, pinfo, tree); /* always passes TRUE here; more_frags is only used by the checks below */
+ if ((!new_tvb && !more_frags) || (gp_rdmap_info->last_flag && !p_read_request && rbytes == sbytes)) {
+ /* This is the very last fragment, include any padding if needed */
+ add_iwarp_padding(tvb, 0, p_segment_info->msgid, msgno+1, pinfo);
+ }
+ if (!new_tvb && !more_frags) {
+ /* Complete reassembly */
+ end_reassembly(p_segment_info->msgid, p_rdma_conv_info, pinfo);
+ new_tvb = get_reassembled_data(tvb, 0, pinfo, tree);
+ }
+ }
+ return new_tvb;
+}
+
+/*
+ * Measure the read list that starts at 'offset': a sequence of entries made
+ * of a non-zero 4-byte marker followed by 20 bytes, closed by a zero marker.
+ * Returns the size in bytes including the closing marker, or 0 as soon as
+ * the list would extend past 'max_offset'.
+ */
+static guint
+get_read_list_size(tvbuff_t *tvb, guint max_offset, guint offset)
+{
+    const guint first = offset;
+    guint32     present;
+
+    for (;;) {
+        present = tvb_get_ntohl(tvb, offset);
+        offset += 4;
+        if (offset > max_offset) {
+            return 0;
+        }
+        if (present == 0) {
+            /* End-of-list marker consumed */
+            return offset - first;
+        }
+
+        /* Skip the 20-byte read segment that follows the marker */
+        offset += 20;
+        if (offset > max_offset) {
+            return 0;
+        }
+    }
+}
+
+/*
+ * Count the entries of the read list that starts at 'offset'. Each entry is
+ * a non-zero 4-byte marker followed by 20 bytes; a zero marker ends the
+ * list.
+ */
+static guint
+get_read_list_chunk_count(tvbuff_t *tvb, guint offset)
+{
+    guint count = 0;
+
+    while (tvb_get_ntohl(tvb, offset) != 0) {
+        count++;
+        /* Marker (4 bytes) plus read segment (20 bytes) */
+        offset += 4 + 20;
+    }
+    return count;
+}
+
+/*
+ * Size in bytes of the write chunk at 'offset': a 4-byte segment count
+ * followed by 16 bytes per segment. The count is capped at the number of
+ * 16-byte segments that fit in the reported data after the count field.
+ */
+static guint
+get_write_chunk_size(tvbuff_t *tvb, guint offset)
+{
+    const guint limit = (guint)tvb_reported_length_remaining(tvb, offset + 4) / 16;
+    const guint declared = tvb_get_ntohl(tvb, offset);
+    /* XXX We should throw an exception when the declared count is too big. */
+    const guint used = (declared > limit) ? limit : declared;
+
+    return 4 + (used * 16);
+}
+
+/*
+ * Measure the write list that starts at 'offset': a sequence of entries
+ * made of a non-zero 4-byte marker followed by a write chunk, closed by a
+ * zero marker. Returns the size in bytes including the closing marker, or
+ * 0 as soon as the list would extend past 'max_offset'.
+ */
+static guint
+get_write_list_size(tvbuff_t *tvb, guint max_offset, guint offset)
+{
+    const guint first = offset;
+    guint32     present;
+    guint       chunk_bytes;
+
+    for (;;) {
+        present = tvb_get_ntohl(tvb, offset);
+        offset += 4;
+        if (offset > max_offset) {
+            return 0;
+        }
+        if (present == 0) {
+            /* End-of-list marker consumed */
+            return offset - first;
+        }
+
+        chunk_bytes = get_write_chunk_size(tvb, offset);
+        if (max_offset - offset < chunk_bytes) {
+            /* Chunk does not fit in what is left before max_offset */
+            return 0;
+        }
+        offset += chunk_bytes;
+    }
+}
+
+/*
+ * Count the chunks of the write list that starts at 'offset'. Each entry is
+ * a non-zero 4-byte marker followed by a write chunk; a zero marker ends
+ * the list.
+ *
+ * get_write_chunk_size() always returns at least 4 (the segment count
+ * field), so the offset moves forward on every iteration and no separate
+ * zero-size check is needed.
+ */
+static guint get_write_list_chunk_count(tvbuff_t *tvb, guint offset)
+{
+    guint num_chunks = 0;
+
+    while (tvb_get_ntohl(tvb, offset) != 0) {
+        /* Step over the marker, then over the chunk it announces */
+        offset += 4;
+        num_chunks++;
+        offset += get_write_chunk_size(tvb, offset);
+    }
+
+    return num_chunks;
+}
+
+/*
+ * Measure the reply chunk at 'offset': a 4-byte marker, followed by one
+ * write chunk when the marker is non-zero. Returns the size in bytes, or 0
+ * when it would extend past 'max_offset'.
+ */
+static guint
+get_reply_chunk_size(tvbuff_t *tvb, guint max_offset, guint offset)
+{
+    const guint   first = offset;
+    const guint32 present = tvb_get_ntohl(tvb, offset);
+
+    offset += 4;
+    if (offset > max_offset) {
+        return 0;
+    }
+    if (present == 0) {
+        /* No reply chunk: only the marker is consumed */
+        return offset - first;
+    }
+
+    offset += get_write_chunk_size(tvb, offset);
+    if (offset > max_offset) {
+        return 0;
+    }
+    return offset - first;
+}
+
+/* Return 1 when the 4-byte marker at 'offset' is non-zero, otherwise 0 */
+static guint
+get_reply_chunk_count(tvbuff_t *tvb, guint offset)
+{
+    return (tvb_get_ntohl(tvb, offset) != 0) ? 1 : 0;
+}
+
+/*
+ * The protocol carries the RDMA read list as a flat list of read segments.
+ * To turn it into a list of chunks, segments sharing the same XDR position
+ * are grouped into one RDMA read chunk: the segment is appended to the
+ * first chunk whose first segment has the same xdrpos, and a new
+ * packet-scoped chunk is appended to the read list when there is none.
+ */
+static void
+add_rdma_read_segment(wmem_array_t *p_read_list,
+        rdma_segment_t *p_rdma_segment)
+{
+    const guint     nchunks = wmem_array_get_count(p_read_list);
+    wmem_array_t   *target = NULL;   /* Segment array receiving the segment */
+    rdma_chunk_t   *chunk;
+    rdma_segment_t *first;
+    guint           idx;
+
+    /* Look for the chunk holding segments with the same XDR position */
+    for (idx = 0; idx < nchunks && target == NULL; idx++) {
+        chunk = (rdma_chunk_t *)wmem_array_index(p_read_list, idx);
+        first = (rdma_segment_t *)wmem_array_index(chunk->segments, 0);
+        if (first->xdrpos == p_rdma_segment->xdrpos) {
+            target = chunk->segments;
+        }
+    }
+
+    if (target == NULL) {
+        /* No matching read chunk: start a new one and add it to the list */
+        chunk = wmem_new(wmem_packet_scope(), rdma_chunk_t);
+        chunk->type = RDMA_READ_CHUNK;
+        chunk->segments = wmem_array_new(wmem_packet_scope(), sizeof(rdma_segment_t));
+        wmem_array_append(p_read_list, chunk, 1);
+        target = chunk->segments;
+    }
+
+    /* Add segment to the read chunk */
+    wmem_array_append(target, p_rdma_segment, 1);
+}
+
+/*
+ * Dissect one 20-byte read list entry at 'offset' (position, handle, length
+ * and 64-bit offset) under 'read_list', and file the parsed segment into
+ * 'p_read_list'. Returns the offset just past the entry.
+ */
+static guint
+dissect_rpcrdma_read_chunk(proto_tree *read_list,
+        tvbuff_t *tvb, guint offset, wmem_array_t *p_read_list)
+{
+    rdma_segment_t *seg = wmem_new(wmem_packet_scope(), rdma_segment_t);
+    proto_tree     *chunk_tree;
+
+    seg->xdrpos = tvb_get_ntohl(tvb, offset);
+    chunk_tree = proto_tree_add_subtree_format(read_list, tvb,
+            offset, 20, ett_rpcordma_read_chunk, NULL,
+            "Read chunk: (position %u)", seg->xdrpos);
+
+    /* Fields sit back to back: position, handle, length, offset */
+    proto_tree_add_item(chunk_tree, hf_rpcordma_position, tvb,
+            offset, 4, ENC_BIG_ENDIAN);
+    proto_tree_add_item_ret_uint(chunk_tree, hf_rpcordma_rdma_handle, tvb,
+            offset + 4, 4, ENC_BIG_ENDIAN, &seg->handle);
+    proto_tree_add_item_ret_uint(chunk_tree, hf_rpcordma_rdma_length, tvb,
+            offset + 8, 4, ENC_BIG_ENDIAN, &seg->length);
+    proto_tree_add_item_ret_uint64(chunk_tree, hf_rpcordma_rdma_offset, tvb,
+            offset + 12, 8, ENC_BIG_ENDIAN, &seg->offset);
+
+    add_rdma_read_segment(p_read_list, seg);
+    return offset + 20;
+}
+
+/*
+ * Dissect the read list at 'offset' into a "Read list" subtree of 'tree'.
+ * Every entry announced by a non-zero 4-byte marker is dissected as a read
+ * chunk; rdma_lists->p_read_list is created on the first entry. Returns the
+ * offset just past the closing zero marker.
+ */
+static guint
+dissect_rpcrdma_read_list(tvbuff_t *tvb, guint offset,
+        proto_tree *tree, rdma_lists_t *rdma_lists)
+{
+    const guint  first = offset;
+    const guint  nchunks = get_read_list_chunk_count(tvb, offset);
+    proto_item  *list_item;
+    proto_tree  *list_tree;
+
+    list_item = proto_tree_add_uint_format(tree, hf_rpcordma_reads_count,
+            tvb, offset, 0, nchunks,
+            "Read list (count: %u)", nchunks);
+    list_tree = proto_item_add_subtree(list_item, ett_rpcordma_read_list);
+
+    while (tvb_get_ntohl(tvb, offset) != 0) {
+        /* Step over the "value follows" marker */
+        offset += 4;
+
+        if (rdma_lists->p_read_list == NULL) {
+            /* Initialize RDMA read list */
+            rdma_lists->p_read_list = wmem_array_new(wmem_packet_scope(), sizeof(rdma_chunk_t));
+        }
+        offset = dissect_rpcrdma_read_chunk(list_tree, tvb, offset, rdma_lists->p_read_list);
+    }
+    /* Step over the closing zero marker */
+    offset += 4;
+
+    proto_item_set_len(list_item, offset - first);
+    return offset;
+}
+
+/*
+ * Dissect one 16-byte RDMA segment at 'offset' (handle, length and 64-bit
+ * offset) under 'write_chunk', labelled with index 'i', and append the
+ * parsed segment to 'p_segments'. Returns the offset just past the segment.
+ */
+static guint
+dissect_rpcrdma_segment(proto_tree *write_chunk, tvbuff_t *tvb,
+        guint offset, guint32 i, wmem_array_t *p_segments)
+{
+    rdma_segment_t *seg = wmem_new(wmem_packet_scope(), rdma_segment_t);
+    proto_tree     *seg_tree;
+
+    /* XDR position is not used in write segments */
+    seg->xdrpos = 0;
+
+    seg_tree = proto_tree_add_subtree_format(write_chunk, tvb,
+            offset, 16, ett_rpcordma_segment, NULL,
+            "RDMA segment %u", i);
+
+    /* Fields sit back to back: handle, length, offset */
+    proto_tree_add_item_ret_uint(seg_tree, hf_rpcordma_rdma_handle, tvb,
+            offset, 4, ENC_BIG_ENDIAN, &seg->handle);
+    proto_tree_add_item_ret_uint(seg_tree, hf_rpcordma_rdma_length, tvb,
+            offset + 4, 4, ENC_BIG_ENDIAN, &seg->length);
+    proto_tree_add_item_ret_uint64(seg_tree, hf_rpcordma_rdma_offset, tvb,
+            offset + 8, 8, ENC_BIG_ENDIAN, &seg->offset);
+
+    /* Add segment to the write chunk */
+    wmem_array_append(p_segments, seg, 1);
+    return offset + 16;
+}
+
+/*
+ * Dissect one write (or reply) chunk: a 32-bit segment count followed by
+ * that many RDMA segments. A chunk record of the given type is appended to
+ * p_rdma_list and its segment array is filled while dissecting.
+ * Returns the offset past the last segment.
+ */
+static guint dissect_rpcrdma_write_chunk(proto_tree *write_list, tvbuff_t *tvb,
+        guint offset, chunk_type_t chunk_type, wmem_array_t *p_rdma_list)
+{
+    const guint chunk_bytes = get_write_chunk_size(tvb, offset);
+    const guint32 n_segments = tvb_get_ntohl(tvb, offset);
+    proto_tree *chunk_tree;
+    rdma_chunk_t *chunk;
+    guint32 seg_no = 0;
+
+    chunk_tree = proto_tree_add_subtree_format(write_list, tvb,
+                    offset, chunk_bytes,
+                    ett_rpcordma_write_chunk, NULL,
+                    "Write chunk (%u segment%s)", n_segments,
+                    n_segments == 1 ? "" : "s");
+    proto_tree_add_item(chunk_tree, hf_rpcordma_segment_count,
+                    tvb, offset, 4, ENC_BIG_ENDIAN);
+    offset += 4;
+
+    /* Set up the chunk record */
+    chunk = wmem_new(wmem_packet_scope(), rdma_chunk_t);
+    chunk->type = chunk_type;
+    chunk->segments = wmem_array_new(wmem_packet_scope(), sizeof(rdma_segment_t));
+
+    /*
+     * Register the chunk on the write/reply list before its segments are
+     * dissected; the appended copy refers to the same segments array.
+     */
+    wmem_array_append(p_rdma_list, chunk, 1);
+
+    while (seg_no < n_segments) {
+        offset = dissect_rpcrdma_segment(chunk_tree, tvb, offset, seg_no, chunk->segments);
+        seg_no++;
+    }
+
+    return offset;
+}
+
+/*
+ * Dissect the write list: a sequence of write chunks, each preceded by a
+ * non-zero 32-bit "value follows" word, terminated by a zero word.
+ * The packet-scoped write list in rdma_lists is created on the first chunk.
+ * Returns the offset just past the terminating word.
+ */
+static guint dissect_rpcrdma_write_list(tvbuff_t *tvb, guint offset,
+        proto_tree *tree, rdma_lists_t *rdma_lists)
+{
+    const guint list_start = offset;
+    guint n_chunks;
+    proto_item *list_item;
+    proto_tree *list_tree;
+
+    n_chunks = get_write_list_chunk_count(tvb, offset);
+    list_item = proto_tree_add_uint_format(tree, hf_rpcordma_writes_count,
+                        tvb, offset, 0, n_chunks,
+                        "Write list (count: %u)", n_chunks);
+    list_tree = proto_item_add_subtree(list_item, ett_rpcordma_write_list);
+
+    for (;;) {
+        const guint32 more = tvb_get_ntohl(tvb, offset);
+
+        offset += 4;
+        if (more == 0) {
+            /* End of list */
+            break;
+        }
+
+        if (!rdma_lists->p_write_list) {
+            /* First chunk seen: create the RDMA write list */
+            rdma_lists->p_write_list = wmem_array_new(wmem_packet_scope(), sizeof(rdma_chunk_t));
+        }
+        offset = dissect_rpcrdma_write_chunk(list_tree, tvb, offset, RDMA_WRITE_CHUNK, rdma_lists->p_write_list);
+    }
+
+    proto_item_set_len(list_item, offset - list_start);
+    return offset;
+}
+
+/*
+ * Dissect the optional reply chunk: a 32-bit "value follows" word which,
+ * when non-zero, is followed by a single chunk laid out like a write chunk.
+ * When a chunk is present a fresh packet-scoped reply list is stored in
+ * rdma_lists. Returns the offset past the reply chunk.
+ */
+static guint dissect_rpcrdma_reply_chunk(tvbuff_t *tvb, guint offset,
+        proto_tree *tree, rdma_lists_t *rdma_lists)
+{
+    const guint32 chunk_start = offset;
+    guint32 n_chunks;
+    proto_item *chunk_item;
+    proto_tree *chunk_tree;
+
+    n_chunks = get_reply_chunk_count(tvb, offset);
+    chunk_item = proto_tree_add_uint_format(tree, hf_rpcordma_reply_count,
+                        tvb, offset, 4, n_chunks,
+                        "Reply chunk (count: %u)", n_chunks);
+    chunk_tree = proto_item_add_subtree(chunk_item, ett_rpcordma_reply_chunk);
+
+    offset += 4;
+    if (tvb_get_ntohl(tvb, chunk_start) != 0) {
+        /* A reply chunk is present: create the RDMA reply list */
+        rdma_lists->p_reply_list = wmem_array_new(wmem_packet_scope(), sizeof(rdma_chunk_t));
+
+        offset = dissect_rpcrdma_write_chunk(chunk_tree, tvb, offset, RDMA_REPLY_CHUNK, rdma_lists->p_reply_list);
+        proto_item_set_len(chunk_item, offset - chunk_start);
+    }
+
+    return offset;
+}
+
+/*
+ * Dissect the three chunk lists of an RPC-over-RDMA header in wire order:
+ * read list, write list, reply chunk. Returns the offset that follows them.
+ */
+static guint parse_rdma_header(tvbuff_t *tvb, guint offset, proto_tree *tree,
+        rdma_lists_t *rdma_lists)
+{
+    guint pos;
+
+    pos = dissect_rpcrdma_read_list(tvb, offset, tree, rdma_lists);
+    pos = dissect_rpcrdma_write_list(tvb, pos, tree, rdma_lists);
+    pos = dissect_rpcrdma_reply_chunk(tvb, pos, tree, rdma_lists);
+
+    return pos;
+}
+
+/*
+ * Return the combined size in bytes of the read list, write list and reply
+ * chunk starting at offset, or 0 as soon as any of the three size helpers
+ * reports 0.
+ */
+static guint get_chunk_lists_size(tvbuff_t *tvb, guint max_offset, guint offset)
+{
+    guint total = 0;
+    guint part;
+
+    part = get_read_list_size(tvb, max_offset, offset);
+    if (part == 0)
+        return 0;
+    total += part;
+
+    part = get_write_list_size(tvb, max_offset, offset + total);
+    if (part == 0)
+        return 0;
+    total += part;
+
+    part = get_reply_chunk_size(tvb, max_offset, offset + total);
+    if (part == 0)
+        return 0;
+    total += part;
+
+    return total;
+}
+
+/*
+ * Return the total number of bytes for the given RDMA chunk list.
+ * Returns 0 when called on an RPC call message because that is where the
+ * segments are set up. On an RPC reply message the total number of bytes
+ * added to the reassembly table is returned. This is only valid for RDMA
+ * writes since there is no RPC-over-RDMA layer for RDMA reads on an RPC reply.
+ *
+ * A non-zero total is cached in the per-frame proto data (key
+ * RPCRDMA_WRITE_SIZE) and served from there on later calls for this frame.
+ */
+static guint
+get_rdma_list_size(wmem_array_t *p_list, packet_info *pinfo)
+{
+    guint32 *cached;
+    guint total = 0;
+    guint chunk_idx, seg_idx;
+    rdma_conv_info_t *conv;
+
+    if (p_list == NULL)
+        return 0;
+
+    /* Serve the size from the cache when available */
+    cached = (guint32 *)p_get_proto_data(wmem_file_scope(), pinfo, proto_rpcordma, RPCRDMA_WRITE_SIZE);
+    if (cached != NULL)
+        return *cached;
+
+    /* Add up the bytes received so far for every known segment */
+    conv = get_rdma_conv_info(pinfo);
+    for (chunk_idx = 0; chunk_idx < wmem_array_get_count(p_list); chunk_idx++) {
+        rdma_chunk_t *chunk = (rdma_chunk_t *)wmem_array_index(p_list, chunk_idx);
+
+        for (seg_idx = 0; seg_idx < wmem_array_get_count(chunk->segments); seg_idx++) {
+            rdma_segment_t *seg = (rdma_segment_t *)wmem_array_index(chunk->segments, seg_idx);
+            segment_info_t *info = find_segment_info(conv, seg->handle, seg->offset);
+
+            if (info != NULL)
+                total += info->rbytes;
+        }
+    }
+
+    if (total == 0)
+        return 0;
+
+    /* Remember the size in the proto data cache */
+    cached = wmem_new(wmem_file_scope(), guint32);
+    *cached = total;
+    p_add_proto_data(wmem_file_scope(), pinfo, proto_rpcordma, RPCRDMA_WRITE_SIZE, cached);
+
+    return total;
+}
+
+/*
+ * Process an RDMA chunk list (read, write or reply).
+ *
+ * For every segment of every chunk in p_list this either updates the length
+ * of a segment_info_t already known to the conversation, or creates a new
+ * one and registers it on the conversation's segment_list and on the list
+ * of segments for a newly allocated message id.
+ * For read chunks, and for write chunks with a matching entry in
+ * gp_rdma_write_offsets, the part of tvb that precedes each chunk's xdr
+ * position is added to the reassembly table as a separate fragment.
+ *
+ * Returns the tvb handed back by add_fragment() for a reply chunk or for the
+ * data after the last write chunk; NULL in every other case, including
+ * p_list == NULL.
+ */
+static tvbuff_t *
+process_rdma_list(tvbuff_t *tvb, guint offset, wmem_array_t *p_list,
+ packet_info *pinfo, proto_tree *tree)
+{
+ guint i, j, size;
+ guint32 msgid = 0;
+ guint32 xdrpos = 0;
+ guint32 xdrprev = 0;
+ guint32 lenprev = 0;
+ guint32 msg_num = 0;
+ guint32 msg_off = 0;
+ guint *p_offset = NULL;
+ tvbuff_t *tmp_tvb;
+ tvbuff_t *new_tvb = NULL;
+ fragment_head *fd_head;
+ rdma_segment_t *p_rdma_segment;
+ rdma_chunk_t *p_rdma_chunk = NULL;
+ segment_info_t *p_segment_info = NULL;
+ gboolean setup = FALSE;
+ wmem_list_t *msgid_segments = NULL;
+ rdma_conv_info_t *p_rdma_conv_info;
+
+ if (p_list) {
+ /* Get conversation state */
+ p_rdma_conv_info = get_rdma_conv_info(pinfo);
+
+ for (i=0; i<wmem_array_get_count(p_list); i++) {
+ p_rdma_chunk = (rdma_chunk_t *)wmem_array_index(p_list, i);
+ p_rdma_chunk->length = 0; /* rebuilt from the segment lengths below */
+ p_offset = NULL;
+
+ if (p_rdma_chunk->type == RDMA_WRITE_CHUNK) {
+ /* Process any write chunk offsets from reduced message.
+ * The offsets are only used when there is exactly one per chunk. */
+ if (gp_rdma_write_offsets && wmem_array_get_count(gp_rdma_write_offsets) == wmem_array_get_count(p_list)) {
+ p_offset = (guint *)wmem_array_index(gp_rdma_write_offsets, i);
+ /* Convert reduced offset to xdr position */
+ xdrpos = tvb_reported_length_remaining(tvb, offset) - *p_offset + msg_off;
+ }
+ }
+
+ for (j=0; j<wmem_array_get_count(p_rdma_chunk->segments); j++) {
+ p_rdma_segment = (rdma_segment_t *)wmem_array_index(p_rdma_chunk->segments, j);
+ if (p_rdma_chunk->type == RDMA_READ_CHUNK) {
+ xdrpos = p_rdma_segment->xdrpos;
+ }
+ p_segment_info = find_segment_info(p_rdma_conv_info, p_rdma_segment->handle, p_rdma_segment->offset);
+ if (p_segment_info) {
+ /* This must be the reply, change segment size */
+ p_segment_info->length = p_rdma_segment->length;
+ } else {
+ if (msgid == 0) {
+ /* Create new message ID (once per call, shared by all new segments) */
+ msgid = get_msg_id();
+ msgid_segments = wmem_list_new(wmem_file_scope());
+ wmem_tree_insert32(p_rdma_conv_info->msgid_list, msgid, msgid_segments);
+ }
+ /* Create new segment info */
+ p_segment_info = wmem_new(wmem_file_scope(), segment_info_t);
+ p_segment_info->handle = p_rdma_segment->handle;
+ p_segment_info->length = p_rdma_segment->length;
+ p_segment_info->offset = p_rdma_segment->offset;
+ p_segment_info->msgid = msgid;
+ p_segment_info->msgno = p_rdma_chunk->length; /* sum of the lengths of the preceding segments in this chunk */
+ p_segment_info->type = p_rdma_chunk->type;
+ p_segment_info->xdrpos = xdrpos;
+ p_segment_info->rbytes = 0;
+ /* Add segment to the list of segments */
+ wmem_tree_insert32(p_rdma_conv_info->segment_list, p_rdma_segment->handle, p_segment_info);
+ wmem_list_append(msgid_segments, p_segment_info);
+ setup = TRUE;
+ }
+ /* Calculate the number of bytes for the whole chunk */
+ p_rdma_chunk->length += p_rdma_segment->length;
+ }
+
+ /* Add chunk length to correctly calculate xdrpos */
+ msg_off += p_rdma_chunk->length;
+
+ /*
+ * Add reduced data before each chunk data for either the
+ * read chunk or write chunk (p_offset != NULL)
+ */
+ if (p_rdma_chunk->type == RDMA_READ_CHUNK || p_offset) {
+ /*
+ * Payload data in this frame (e.g., two chunks)
+ * where chunk data is sent separately using RDMA:
+ * +----------------+----------------+----------------+
+ * | xdrdata1 | xdrdata2 | xdrdata3 |
+ * +----------------+----------------+----------------+
+ * chunk data1 --^ chunk data2 --^
+ *
+ * Reassembled message should look like the following in which
+ * the xdrpos specifies where the chunk data must be inserted.
+ * The xdrpos is relative to the reassembled message and NOT
+ * relative to the reduced data (data in this frame):
+ * +----------+-------------+----------+-------------+----------+
+ * | xdrdata1 | chunk data1 | xdrdata2 | chunk data2 | xdrdata3 |
+ * +----------+-------------+----------+-------------+----------+
+ * xdrpos1 ---^ xdrpos2 --^
+ */
+
+ /* Add data before the xdr position */
+ size = xdrpos - xdrprev - lenprev; /* unsigned: wraps around if xdrpos < xdrprev + lenprev */
+ if (size > 0 && tvb_captured_length_remaining(tvb, offset) > 0 && p_segment_info) {
+ tmp_tvb = tvb_new_subset_length(tvb, offset, size);
+ add_fragment(tmp_tvb, 0, p_segment_info->msgid, msg_num, TRUE, p_rdma_conv_info, pinfo, tree);
+ /* Message number for fragment after read/write chunk */
+ msg_num += p_rdma_chunk->length;
+ /* Save rest of data for next fragment */
+ tvb = tvb_new_subset_remaining(tvb, offset+size);
+ offset = 0;
+ }
+
+ xdrprev = xdrpos;
+ lenprev = p_rdma_chunk->length;
+ }
+ }
+
+ fd_head = get_fragment_head(pinfo);
+ if (fd_head == NULL) {
+ if (p_segment_info == NULL) { /* no segment was seen in any chunk of the list */
+ return NULL;
+ } else if (p_rdma_chunk->type == RDMA_REPLY_CHUNK && !setup &&
+ !pinfo->fd->visited && p_rdma_chunk->length > 0) {
+ /* Only reassemble if reply chunk size is non-zero to avoid
+ * reassembly of a single fragment. The RPC-over-RDMA reply
+ * has no data when the reply chunk size is non-zero but it
+ * needs to reassemble all fragments (more_frags = FALSE)
+ * in this frame. On the other hand when the reply chunk
+ * size is zero, the whole message is given in this frame
+ * therefore there is no need to reassemble. */
+ new_tvb = add_fragment(tvb, offset, p_segment_info->msgid, 0, FALSE, p_rdma_conv_info, pinfo, tree);
+ } else if (p_rdma_chunk->type == RDMA_READ_CHUNK && tvb_captured_length_remaining(tvb, offset) > 0) {
+ /* Add data after the last read chunk (result is not returned to the caller) */
+ add_fragment(tvb, offset, p_segment_info->msgid, msg_num, TRUE, p_rdma_conv_info, pinfo, tree);
+ } else if (p_offset && tvb_reported_length_remaining(tvb, offset) > 0) {
+ /* Add data after the last write chunk */
+ new_tvb = add_fragment(tvb, offset, p_segment_info->msgid, msg_num, TRUE, p_rdma_conv_info, pinfo, tree);
+ }
+ }
+ }
+
+ return new_tvb;
+}
+
+/*
+ * Process all RDMA chunk lists (read, write and reply).
+ *
+ * Returns the cached reassembled message when one already exists for this
+ * frame. Otherwise the read list, the reply list and the write list are
+ * processed in that order and the tvb produced by either the reply list or
+ * the write list is returned; NULL when neither or both produced one.
+ */
+static tvbuff_t *
+process_rdma_lists(tvbuff_t *tvb, guint offset, rdma_lists_t *rdma_lists,
+        packet_info *pinfo, proto_tree *tree)
+{
+    tvbuff_t *cached_tvb;
+    tvbuff_t *reply_tvb;
+    tvbuff_t *write_tvb;
+
+    cached_tvb = get_reassembled_data(tvb, offset, pinfo, tree);
+    if (cached_tvb != NULL) {
+        /* The reassembled message is already in the cache */
+        return cached_tvb;
+    }
+
+    /*
+     * Read list: reassembly is not done here, the list is processed to set
+     * up the expected read chunks and their respective segments.
+     * Reassembly is done on the last read response.
+     * - Used for a large RPC call which has at least one large opaque,
+     *   e.g., NFS WRITE
+     * - The RPC call packet is used only to set up the RDMA read chunk list.
+     *   It also has the reduced message data which includes the first fragment
+     *   (XDR data up to and including the opaque length), but it could also
+     *   have fragments between each read chunk and the last fragment after
+     *   the last read chunk data. The reduced message is then broken down
+     *   into fragments and inserted into the reassembly table.
+     * - The opaque data is transferred via RDMA reads, once all fragments are
+     *   accounted for they are reassembled and the whole RPC call is dissected
+     *   in the last read response -- there is no RPCoRDMA layer
+     *
+     * - Packet sent order, the reduced RPC call is sent first, then the RDMA
+     *   reads, e.g., showing only for a single chunk:
+     *   +----------------+-------------+-----------+-----------+-----+-----------+
+     *   | WRITE call XDR | opaque size | GETATTR | RDMA read | ... | RDMA read |
+     *   +----------------+-------------+-----------+-----------+-----+-----------+
+     *   |<-------------- First frame ------------->|<-------- chunk data ------->|
+     *   Each RDMA read could be a single RDMA_READ_RESPONSE_ONLY or a series of
+     *   RDMA_READ_RESPONSE_FIRST, RDMA_READ_RESPONSE_MIDDLE, ...,
+     *   RDMA_READ_RESPONSE_LAST
+     *
+     * - NFS WRITE call, this is how it should be reassembled:
+     *   +----------------+-------------+-----------+-----+-----------+-----------+
+     *   | WRITE call XDR | opaque size | RDMA read | ... | RDMA read | GETATTR |
+     *   +----------------+-------------+-----------+-----+-----------+-----------+
+     *   |<--- opaque (chunk) data --->|
+     */
+    process_rdma_list(tvb, offset, rdma_lists->p_read_list, pinfo, tree);
+
+    /*
+     * Reply chunk: reassembly is done on the reply message (RDMA_NOMSG).
+     * The list is processed on the call message to set up the reply
+     * chunk and its respective segments expected by the reply.
+     * - Used for a large RPC reply which does not fit into a single SEND
+     *   operation and does not have a single large opaque, e.g., NFS READDIR
+     * - The RPC call packet is used only to set up the RDMA reply chunk list
+     * - The whole RPC reply is transferred via RDMA writes
+     * - The RPC reply packet has no data (RDMA_NOMSG) but fragments are
+     *   reassembled and the whole RPC reply is dissected
+     *
+     * - Packet sent order, this is the whole XDR data for the RPC reply:
+     *   +--------------------------+------------------+--------------------------+
+     *   | RDMA write | ... | RDMA write |
+     *   +--------------------------+------------------+--------------------------+
+     *   Each RDMA write could be a single RDMA_WRITE_ONLY or a series of
+     *   RDMA_WRITE_FIRST, RDMA_WRITE_MIDDLE, ..., RDMA_WRITE_LAST
+     */
+    reply_tvb = process_rdma_list(tvb, offset, rdma_lists->p_reply_list, pinfo, tree);
+
+    /*
+     * Write list: reassembly is done on the reply message (RDMA_MSG).
+     * The list is processed on the call message to set up the write
+     * chunks and their respective segments expected by the reply.
+     * - Used for a large RPC reply which has at least one large opaque,
+     *   e.g., NFS READ
+     * - The RPC call packet is used only to set up the RDMA write chunk list
+     * - The opaque data is transferred via RDMA writes
+     * - The RPC reply packet has the reduced message data which includes the
+     *   first fragment (XDR data up to and including the opaque length), but
+     *   it could also have fragments between each write chunk and the last
+     *   fragment after the last write chunk data. The reduced message is
+     *   then broken down into fragments and inserted into the reassembly table.
+     *   Fragments are then reassembled and the whole RPC reply is dissected
+     * - Packet sent order, the RDMA writes are sent first, then the reduced RPC
+     *   reply, e.g., showing only for a single chunk:
+     *   +------------+-----+------------+----------------+-------------+---------+
+     *   | RDMA write | ... | RDMA write | READ reply XDR | opaque size | GETATTR |
+     *   +------------+-----+------------+----------------+-------------+---------+
+     *   |<-------- write chunk -------->|<------------- Last frame ------------->|
+     *   Each RDMA write could be a single RDMA_WRITE_ONLY or a series of
+     *   RDMA_WRITE_FIRST, RDMA_WRITE_MIDDLE, ..., RDMA_WRITE_LAST
+     *
+     * - NFS READ reply, this is how it should be reassembled:
+     *   +----------------+-------------+------------+-----+------------+---------+
+     *   | READ reply XDR | opaque size | RDMA write | ... | RDMA write | GETATTR |
+     *   +----------------+-------------+------------+-----+------------+---------+
+     *   |<---- opaque (chunk) data ---->|
+     */
+    write_tvb = process_rdma_list(tvb, offset, rdma_lists->p_write_list, pinfo, tree);
+
+    /*
+     * Either the reply chunk or the write chunks should be reassembled here
+     * but not both
+     */
+    if (reply_tvb != NULL && write_tvb != NULL)
+        return NULL;
+
+    return (write_tvb != NULL) ? write_tvb : reply_tvb;
+}
+
+/*
+ * Add a fragment to the SEND reassembly table and return the reassembled data
+ * if all fragments have been added.
+ *
+ * The fragment kind (first/middle/last) is derived from the InfiniBand opcode
+ * or, for iWarp, from the RDMAP last flag and message offset.
+ *
+ * Returns:
+ *  - tvb itself when the SEND is not fragmented, or when no reassembled data
+ *    is available for this frame;
+ *  - the reassembled tvb when this frame completes the message.
+ * The result is never NULL when tvb is not NULL, on the first pass as well
+ * as on later passes, so callers may hand it straight to
+ * packet_is_rpcordma().
+ */
+static tvbuff_t *add_send_fragment(rdma_conv_info_t *p_rdma_conv_info,
+        tvbuff_t *tvb, packet_info *pinfo, proto_tree *tree)
+{
+    guint32 msgid = 0;
+    gint32 msgno = -1;
+    tvbuff_t *new_tvb = NULL;
+    gboolean first_frag = FALSE;
+    gboolean middle_frag = FALSE;
+    gboolean last_frag = FALSE;
+    send_info_t *p_send_info = NULL;
+
+    if (gp_infiniband_info) {
+        first_frag = gp_infiniband_info->opCode == RC_SEND_FIRST;
+        middle_frag = gp_infiniband_info->opCode == RC_SEND_MIDDLE;
+        last_frag = (gp_infiniband_info->opCode == RC_SEND_LAST ||
+                     gp_infiniband_info->opCode == RC_SEND_LAST_INVAL);
+    } else if (gp_rdmap_info) {
+        first_frag = !gp_rdmap_info->last_flag && gp_rdmap_info->message_offset == 0;
+        middle_frag = !gp_rdmap_info->last_flag && gp_rdmap_info->message_offset > 0;
+        last_frag = gp_rdmap_info->last_flag && gp_rdmap_info->message_offset > 0;
+    }
+
+    if (!first_frag && !middle_frag && !last_frag) {
+        /* Only one SEND fragment, no need to reassemble */
+        return tvb;
+    } else if (pinfo->fd->visited) {
+        /*
+         * Fragments were added on the first pass, just fetch the result.
+         * get_reassembled_data() may yield NULL (its other callers check
+         * for that); fall back to the frame's own tvb, exactly as the first
+         * pass does below, so that callers never receive a NULL tvb.
+         */
+        new_tvb = get_reassembled_data(tvb, 0, pinfo, tree);
+        return new_tvb ? new_tvb : tvb;
+    } else if (first_frag) {
+        /* Start of multi-SEND message */
+        p_send_info = wmem_new(wmem_file_scope(), send_info_t);
+        p_send_info->msgid = get_msg_id();
+        p_send_info->rsize = 0;
+
+        if (gp_infiniband_info) {
+            /* Message numbers are relative with respect to current PSN */
+            p_send_info->msgno = gp_infiniband_info->packet_seq_num;
+            wmem_tree_insert32(p_rdma_conv_info->send_list, gp_infiniband_info->packet_seq_num, p_send_info);
+        } else if (gp_rdmap_info) {
+            /* Message numbers are given by the RDMAP offset -- msgno is not used */
+            p_send_info->msgno = 0;
+            wmem_tree_insert32(p_rdma_conv_info->msn_list, gp_rdmap_info->message_seq_num, p_send_info);
+        }
+    } else {
+        /* SEND fragment, get the send reassembly info structure */
+        if (gp_infiniband_info) {
+            /* Closest entry at or below this PSN, i.e. the SEND_FIRST it follows */
+            p_send_info = wmem_tree_lookup32_le(p_rdma_conv_info->send_list, gp_infiniband_info->packet_seq_num);
+        } else if (gp_rdmap_info) {
+            p_send_info = wmem_tree_lookup32(p_rdma_conv_info->msn_list, gp_rdmap_info->message_seq_num);
+        }
+    }
+    if (p_send_info) {
+        /* Keep track of the number of bytes seen so far for this message */
+        p_send_info->rsize += tvb_reported_length(tvb);
+        msgid = p_send_info->msgid;
+        if (gp_infiniband_info) {
+            /* Message numbers are consecutive starting at zero */
+            msgno = gp_infiniband_info->packet_seq_num - p_send_info->msgno;
+        } else if (gp_rdmap_info) {
+            /* Message numbers are given by the RDMAP offset */
+            msgno = gp_rdmap_info->message_offset;
+        }
+    }
+    /* msgid > 0 implies p_send_info is set */
+    if (msgid > 0 && msgno >= 0) {
+        new_tvb = add_fragment(tvb, 0, msgid, msgno, !last_frag, p_rdma_conv_info, pinfo, tree);
+        if (last_frag && !new_tvb && gp_rdmap_info) {
+            /* Since message numbers are not consecutive for iWarp,
+             * verify there are no missing fragments */
+            if (p_send_info->rsize == msgno + tvb_reported_length(tvb)) {
+                end_reassembly(msgid, NULL, pinfo);
+                new_tvb = get_reassembled_data(tvb, 0, pinfo, tree);
+            }
+        }
+    }
+    if (new_tvb) {
+        /* This is the last fragment, data has been reassembled
+         * and ready to be dissected */
+        return new_tvb;
+    }
+    return tvb;
+}
+
+/*
+ * We need to differentiate between RPC messages inside RDMA and regular send messages.
+ * In order to do that (as well as extra validation) we want to verify that for RDMA_MSG
+ * and RDMA_MSGP types, RPC call or RPC reply header follows. We can do this by comparing
+ * XID in RPC and RPCoRDMA headers.
+ *
+ * Returns TRUE when tvb looks like an RPC-over-RDMA message, FALSE otherwise
+ * (including a NULL tvb or one shorter than the fixed 16-byte header).
+ * RDMA_NOMSG, RDMA_DONE and RDMA_ERROR are accepted on the message type alone.
+ */
+static gboolean
+packet_is_rpcordma(tvbuff_t *tvb)
+{
+    guint size, len, min_len, offset;
+    guint32 xid, xid_rpc, msg_type;
+
+    if (tvb == NULL)
+        return FALSE;
+
+    /*
+     * Validate the length before touching the header: tvb_get_ntohl() throws
+     * when the bytes are not present. The captured length is checked as well
+     * (as dissect_rpcrdma() does) so a truncated capture is rejected quietly.
+     */
+    len = tvb_reported_length(tvb);
+    if (len < MIN_RPCRDMA_HDR_SZ || tvb_captured_length(tvb) < MIN_RPCRDMA_HDR_SZ)
+        return FALSE;
+
+    xid = tvb_get_ntohl(tvb, 0);
+    msg_type = tvb_get_ntohl(tvb, 12);
+
+    switch (msg_type) {
+    case RDMA_MSG:
+    case RDMA_MSGP:
+        if (msg_type == RDMA_MSGP) {
+            /* The padded header carries two extra 32-bit words (align, thresh) */
+            min_len = MIN_RPCRDMA_MSGP_SZ;
+            offset = MIN_RPCRDMA_HDR_SZ + 8;
+        } else {
+            min_len = MIN_RPCRDMA_MSG_SZ;
+            offset = MIN_RPCRDMA_HDR_SZ;
+        }
+        if (len < min_len)
+            return FALSE;
+
+        /* Skip the read list, write list and reply chunk */
+        size = get_chunk_lists_size(tvb, len, offset);
+        if (!size)
+            return FALSE;
+        offset += size;
+
+        /* The RPC message follows: its XID must match the transport XID */
+        if (offset + 4 > len)
+            return FALSE;
+        xid_rpc = tvb_get_ntohl(tvb, offset);
+        if (xid != xid_rpc)
+            return FALSE;
+        break;
+
+    case RDMA_NOMSG:
+    case RDMA_DONE:
+    case RDMA_ERROR:
+        break;
+
+    default:
+        return FALSE;
+    }
+
+    return TRUE;
+}
+
+/*
+ * Dissect an RPC-over-RDMA transport header and hand any RPC payload
+ * (possibly reassembled from RDMA transfers) to the RPC dissector.
+ *
+ * Returns 0 when fewer than MIN_RPCRDMA_HDR_SZ bytes were captured, when the
+ * version field is not 1, or when an RDMA_MSG frame was already consumed as
+ * a read fragment. For RDMA_MSG and RDMA_MSGP the result of the RPC
+ * dissector call is returned; for an RDMA_ERROR with an unknown error code
+ * the result of the data dissector call. In all remaining cases the number
+ * of header bytes dissected is returned.
+ */
+static int
+dissect_rpcrdma(tvbuff_t *tvb, packet_info *pinfo, proto_tree *tree, void* data _U_)
+{
+    tvbuff_t *volatile next_tvb;
+    tvbuff_t *frag_tvb;
+    proto_item *ti;
+    proto_tree *rpcordma_tree;
+    guint offset;
+    guint32 msg_type;
+    guint32 xid;
+    guint32 val;
+    guint32 *p_msgid;
+    guint write_size;
+    int save_visited;
+    rdma_lists_t rdma_lists = { NULL, NULL, NULL };
+
+    /* tvb_get_ntohl() should not throw an exception while checking if
+       this is an rpcrdma packet */
+    if (tvb_captured_length(tvb) < MIN_RPCRDMA_HDR_SZ)
+        return 0;
+
+    if (tvb_get_ntohl(tvb, 4) != 1)  /* vers */
+        return 0;
+
+    xid = tvb_get_ntohl(tvb, 0);
+    msg_type = tvb_get_ntohl(tvb, 12);
+
+    col_set_str(pinfo->cinfo, COL_PROTOCOL, "RPCoRDMA");
+    /* msg_type is unsigned: format unknown values with %u so that large
+     * values are not shown as negative numbers */
+    col_add_fstr(pinfo->cinfo, COL_INFO, "%s XID 0x%x",
+        val_to_str(msg_type, rpcordma_message_type, "Unknown (%u)"), xid);
+
+    ti = proto_tree_add_item(tree, proto_rpcordma, tvb, 0, MIN_RPCRDMA_HDR_SZ, ENC_NA);
+
+    rpcordma_tree = proto_item_add_subtree(ti, ett_rpcordma);
+
+    /* Fixed header: XID, version, flow control, message type */
+    offset = 0;
+    proto_tree_add_item(rpcordma_tree, hf_rpcordma_xid, tvb,
+                offset, 4, ENC_BIG_ENDIAN);
+    offset += 4;
+    proto_tree_add_item(rpcordma_tree, hf_rpcordma_vers, tvb,
+                offset, 4, ENC_BIG_ENDIAN);
+    offset += 4;
+    proto_tree_add_item(rpcordma_tree, hf_rpcordma_flow_control, tvb,
+                offset, 4, ENC_BIG_ENDIAN);
+    offset += 4;
+    proto_tree_add_item(rpcordma_tree, hf_rpcordma_message_type, tvb,
+                offset, 4, ENC_BIG_ENDIAN);
+    offset += 4;
+
+    switch (msg_type) {
+    case RDMA_MSG:
+        /* Parse rpc_rdma_header */
+        offset = parse_rdma_header(tvb, offset, rpcordma_tree, &rdma_lists);
+
+        proto_item_set_len(ti, offset);
+
+        frag_tvb = get_reassembled_data(tvb, offset, pinfo, tree);
+        if (frag_tvb) {
+            /* Reassembled message has already been cached -- call upper dissector */
+            return call_dissector(rpc_handler, frag_tvb, pinfo, tree);
+        } else if (pinfo->fd->visited && !g_needs_reassembly && rdma_lists.p_read_list) {
+            /* This frame has already been added as a read fragment */
+            return 0;
+        } else {
+            next_tvb = tvb_new_subset_remaining(tvb, offset);
+
+            /*
+             * Get the total number of bytes for the write chunk list.
+             * It returns 0 if there is no write chunk list, or this is an
+             * RPC call (list has just been set up) or it is an RPC reply but
+             * there is an error so the reply message has not been reduced.
+             */
+            write_size = get_rdma_list_size(rdma_lists.p_write_list, pinfo);
+
+            if (write_size > 0 && !pinfo->fd->visited) {
+                /* Initialize array of write chunk offsets */
+                gp_rdma_write_offsets = wmem_array_new(wmem_packet_scope(), sizeof(gint));
+                register_frame_end_routine(pinfo, reset_write_offsets);
+                TRY {
+                    /*
+                     * Call the upper layer dissector to get a list of offsets
+                     * where message has been reduced.
+                     * This is done on the first pass (visited = 0)
+                     */
+                    g_rpcrdma_reduced = TRUE;
+                    call_dissector(rpc_handler, next_tvb, pinfo, tree);
+                }
+                FINALLY {
+                    /* Make sure to disable reduced data processing */
+                    g_rpcrdma_reduced = FALSE;
+                }
+                ENDTRY;
+            } else if (write_size > 0 && pinfo->fd->visited) {
+                /*
+                 * Reassembly is done on the second pass (visited = 1)
+                 * This is done because dissecting the upper layer(s) again
+                 * causes the upper layer(s) to be displayed twice if it is
+                 * done on the same pass.
+                 */
+                p_msgid = (guint32 *)p_get_proto_data(wmem_file_scope(), pinfo, proto_rpcordma, RPCRDMA_MSG_ID);
+                if (p_msgid) {
+                    /*
+                     * All fragments were added during the first pass,
+                     * reassembly just needs to be completed here.
+                     * The visited flag is cleared around the call and
+                     * restored afterwards.
+                     */
+                    save_visited = pinfo->fd->visited;
+                    pinfo->fd->visited = 0;
+                    end_reassembly(*p_msgid, NULL, pinfo);
+                    pinfo->fd->visited = save_visited;
+                }
+            }
+
+            /*
+             * If there is a write chunk list, process_rdma_lists will convert
+             * the offsets returned by the upper layer into xdr positions
+             * and break the current reduced message into separate fragments
+             * and insert them into the reassembly table in the first pass.
+             * On the second pass, the reassembly has just been done so
+             * process_rdma_lists should only call process_reassembled_data
+             * to get the reassembled data and call the dissector for the
+             * upper layer with the reassembled message.
+             */
+            frag_tvb = process_rdma_lists(next_tvb, 0, &rdma_lists, pinfo, tree);
+            gp_rdma_write_offsets = NULL;
+            if (rdma_lists.p_read_list) {
+                /*
+                 * If there is a read chunk list, do not dissect upper layer
+                 * just label rest of packet as "Data" since the reassembly
+                 * will be done on the last read response.
+                 */
+                call_data_dissector(next_tvb, pinfo, tree);
+                break;
+            } else if (frag_tvb) {
+                /* Replace current frame data with the reassembled data */
+                next_tvb = frag_tvb;
+            }
+        }
+        return call_dissector(rpc_handler, next_tvb, pinfo, tree);
+
+    case RDMA_NOMSG:
+        /* Parse rpc_rdma_header_nomsg */
+        offset = parse_rdma_header(tvb, offset, rpcordma_tree, &rdma_lists);
+        if (pinfo->fd->visited) {
+            /* Reassembly was done on the first pass, so just get the reassembled data */
+            next_tvb = get_reassembled_data(tvb, offset, pinfo, tree);
+        } else {
+            next_tvb = process_rdma_lists(tvb, offset, &rdma_lists, pinfo, tree);
+        }
+        if (next_tvb) {
+            /*
+             * Even though there is no data in this frame, reassembly for
+             * the reply chunk is done in this frame so dissect upper layer
+             */
+            call_dissector(rpc_handler, next_tvb, pinfo, tree);
+        }
+        break;
+
+    case RDMA_MSGP:
+        /* Parse rpc_rdma_header_padded */
+        proto_tree_add_item(rpcordma_tree, hf_rpcordma_rdma_align, tvb,
+                    offset, 4, ENC_BIG_ENDIAN);
+        offset += 4;
+
+        proto_tree_add_item(rpcordma_tree, hf_rpcordma_rdma_thresh, tvb,
+                    offset, 4, ENC_BIG_ENDIAN);
+        offset += 4;
+
+        offset = parse_rdma_header(tvb, offset, rpcordma_tree, &rdma_lists);
+
+        proto_item_set_len(ti, offset);
+        next_tvb = tvb_new_subset_remaining(tvb, offset);
+        return call_dissector(rpc_handler, next_tvb, pinfo, tree);
+
+    case RDMA_DONE:
+        break;
+
+    case RDMA_ERROR:
+        /* rpc_rdma_errcode */
+        val = tvb_get_ntohl(tvb, offset);
+        proto_tree_add_item(rpcordma_tree, hf_rpcordma_errcode, tvb,
+                    offset, 4, ENC_BIG_ENDIAN);
+        offset += 4;
+
+        switch (val) {
+        case ERR_VERS:
+            /* Lowest and highest supported versions follow */
+            proto_tree_add_item(rpcordma_tree, hf_rpcordma_vers_low, tvb,
+                        offset, 4, ENC_BIG_ENDIAN);
+            offset += 4;
+            proto_tree_add_item(rpcordma_tree, hf_rpcordma_vers_high, tvb,
+                        offset, 4, ENC_BIG_ENDIAN);
+            offset += 4;
+            break;
+
+        case ERR_CHUNK:
+            break;
+
+        default:
+            /* Unknown error code: show whatever follows as plain data */
+            proto_item_set_len(ti, offset);
+            next_tvb = tvb_new_subset_remaining(tvb, offset);
+            return call_data_dissector(next_tvb, pinfo, tree);
+        }
+        break;
+    }
+
+    proto_item_set_len(ti, offset);
+    return offset;
+}
+
+/*
+ * Heuristic entry point for InfiniBand payloads.
+ * "data" is the struct infinibandinfo of the current packet; FALSE is
+ * returned when it is missing. SEND payloads are (re)assembled and, when
+ * they look like RPC-over-RDMA, dissected. RDMA write payloads and read
+ * requests only update the reassembly state. Read responses are dissected
+ * as RPC once the message has been reassembled.
+ */
+static gboolean
+dissect_rpcrdma_ib_heur(tvbuff_t *tvb, packet_info *pinfo,
+        proto_tree *tree, void *data)
+{
+    rdma_conv_info_t *conv;
+    tvbuff_t *reassembled = NULL;
+    gboolean is_first;
+    gboolean expect_more;
+
+    /* Initialize global variables for InfiniBand reassembly */
+    g_rpcrdma_reduced = FALSE;
+    g_needs_reassembly = TRUE;
+    gp_rdmap_info = NULL;
+    gp_infiniband_info = (struct infinibandinfo *)data;
+
+    if (gp_infiniband_info == NULL)
+        return FALSE;
+
+    /* Get conversation state */
+    conv = get_rdma_conv_info(pinfo);
+
+    switch (gp_infiniband_info->opCode) {
+    case RC_SEND_ONLY:
+    case RC_SEND_ONLY_INVAL:
+        /* Complete message in a single SEND */
+        break;
+
+    case RC_SEND_FIRST:
+    case RC_SEND_MIDDLE:
+        /* Partial message: store the fragment and wait for the rest */
+        add_send_fragment(conv, tvb, pinfo, tree);
+        return FALSE;
+
+    case RC_SEND_LAST:
+    case RC_SEND_LAST_INVAL:
+        tvb = add_send_fragment(conv, tvb, pinfo, tree);
+        break;
+
+    case RC_RDMA_WRITE_ONLY:
+    case RC_RDMA_WRITE_ONLY_IMM:
+        set_max_iosize(conv, tvb_reported_length(tvb));
+        add_ib_fragment(tvb, conv, TRUE, pinfo, tree);
+        return FALSE;
+
+    case RC_RDMA_WRITE_FIRST:
+    case RC_RDMA_WRITE_MIDDLE:
+    case RC_RDMA_WRITE_LAST:
+    case RC_RDMA_WRITE_LAST_IMM:
+        if (gp_infiniband_info->opCode == RC_RDMA_WRITE_FIRST) {
+            set_max_iosize(conv, tvb_reported_length(tvb));
+            add_request_info(conv, pinfo);
+        }
+        /* Add fragment to the reassembly table */
+        add_ib_fragment(tvb, conv, FALSE, pinfo, tree);
+        /* Do not dissect here, dissection is done on RDMA_MSG or RDMA_NOMSG */
+        return FALSE;
+
+    case RC_RDMA_READ_REQUEST:
+        add_request_info(conv, pinfo);
+        return FALSE;
+
+    case RC_RDMA_READ_RESPONSE_FIRST:
+    case RC_RDMA_READ_RESPONSE_MIDDLE:
+    case RC_RDMA_READ_RESPONSE_LAST:
+    case RC_RDMA_READ_RESPONSE_ONLY:
+        is_first = gp_infiniband_info->opCode == RC_RDMA_READ_RESPONSE_FIRST;
+        expect_more = is_first ||
+                      gp_infiniband_info->opCode == RC_RDMA_READ_RESPONSE_MIDDLE;
+        if (is_first) {
+            set_max_iosize(conv, tvb_reported_length(tvb));
+        }
+
+        /* Add fragment to the reassembly table */
+        reassembled = add_ib_fragment(tvb, conv, FALSE, pinfo, tree);
+        if (reassembled == NULL && !expect_more && conv->segment_info) {
+            /*
+             * No reassembled data has been cached yet although this is a
+             * READ_RESPONSE_LAST or READ_RESPONSE_ONLY: do not add any more
+             * data, just complete the reassembly.
+             */
+            end_reassembly(conv->segment_info->msgid, conv, pinfo);
+            reassembled = get_reassembled_data(tvb, 0, pinfo, tree);
+        }
+        if (reassembled == NULL)
+            return FALSE;
+        /* This is the last fragment, data has been reassembled and ready to dissect */
+        return call_dissector(rpc_handler, reassembled, pinfo, tree);
+
+    default:
+        return FALSE;
+    }
+
+    /* Only SEND payloads get here */
+    if (packet_is_rpcordma(tvb)) {
+        dissect_rpcrdma(tvb, pinfo, tree, NULL);
+        return TRUE;
+    }
+    return FALSE;
+}
+
+/*
+ * Heuristic entry point for iWarp (RDMAP) payloads.
+ * "data" is the rdmap_info_t of the current packet; FALSE is returned when
+ * it is missing. SEND payloads are (re)assembled and, once the last
+ * fragment has been seen and the result looks like RPC-over-RDMA,
+ * dissected. RDMA writes and read requests only update the conversation
+ * state. Read responses are dissected as RPC once reassembled.
+ */
+static gboolean
+dissect_rpcrdma_iwarp_heur(tvbuff_t *tvb, packet_info *pinfo,
+        proto_tree *tree, void *data)
+{
+    rdma_conv_info_t *conv;
+    rdmap_request_t *request_copy;
+    tvbuff_t *reassembled;
+
+    /* Initialize global variables for iWarp reassembly */
+    g_rpcrdma_reduced = FALSE;
+    g_needs_reassembly = TRUE;
+    gp_infiniband_info = NULL;
+    gp_rdmap_info = (rdmap_info_t *)data;
+
+    if (gp_rdmap_info == NULL)
+        return FALSE;
+
+    /* Get conversation state */
+    conv = get_rdma_conv_info(pinfo);
+
+    switch (gp_rdmap_info->opcode) {
+    case RDMA_SEND:
+    case RDMA_SEND_INVALIDATE:
+        tvb = add_send_fragment(conv, tvb, pinfo, tree);
+        if (!gp_rdmap_info->last_flag) {
+            /* This is a SEND fragment, do not dissect yet */
+            return FALSE;
+        }
+        break;
+
+    case RDMA_WRITE:
+        add_iwarp_fragment(tvb, conv, pinfo, tree);
+        /* Do not dissect here, dissection is done on RDMA_MSG or RDMA_NOMSG */
+        return FALSE;
+
+    case RDMA_READ_REQUEST:
+        if (gp_rdmap_info->read_request && !pinfo->fd->visited) {
+            /* First pass: keep a file-scoped copy of the request, keyed by sink STag */
+            request_copy = wmem_new(wmem_file_scope(), rdmap_request_t);
+            memcpy(request_copy, gp_rdmap_info->read_request, sizeof(rdmap_request_t));
+            wmem_tree_insert32(conv->request_list, gp_rdmap_info->read_request->sink_stag, request_copy);
+        }
+        return FALSE;
+
+    case RDMA_READ_RESPONSE:
+        reassembled = add_iwarp_fragment(tvb, conv, pinfo, tree);
+        if (reassembled == NULL)
+            return FALSE;
+        /* This is the last fragment, data has been reassembled and ready to dissect */
+        return call_dissector(rpc_handler, reassembled, pinfo, tree);
+
+    default:
+        return FALSE;
+    }
+
+    /* Only complete SEND messages get here */
+    if (packet_is_rpcordma(tvb)) {
+        dissect_rpcrdma(tvb, pinfo, tree, NULL);
+        return TRUE;
+    }
+    return FALSE;
+}
+
+ /*
+  * Protocol registration: declares the "rpcordma" protocol together with
+  * its header fields, subtree indices, reassembly table and dissector
+  * handle, and marks the former preferences of this module as obsolete.
+  */
+ void
+ proto_register_rpcordma(void)
+ {
+     module_t *obsolete_module;
+     guint pref_idx;
+
+     static hf_register_info hf[] = {
+         /* Header and chunk list fields */
+         { &hf_rpcordma_xid,
+           { "XID", "rpcordma.xid", FT_UINT32, BASE_HEX,
+             NULL, 0x0, NULL, HFILL } },
+         { &hf_rpcordma_vers,
+           { "Version", "rpcordma.version", FT_UINT32, BASE_DEC,
+             NULL, 0x0, NULL, HFILL } },
+         { &hf_rpcordma_flow_control,
+           { "Flow Control", "rpcordma.flow_control", FT_UINT32, BASE_DEC,
+             NULL, 0x0, NULL, HFILL } },
+         { &hf_rpcordma_message_type,
+           { "Message Type", "rpcordma.msg_type", FT_UINT32, BASE_DEC,
+             VALS(rpcordma_message_type), 0x0, NULL, HFILL } },
+         { &hf_rpcordma_reads_count,
+           { "Read list", "rpcordma.reads_count", FT_UINT32, BASE_DEC,
+             NULL, 0x0, NULL, HFILL } },
+         { &hf_rpcordma_writes_count,
+           { "Write list", "rpcordma.writes_count", FT_UINT32, BASE_DEC,
+             NULL, 0x0, NULL, HFILL } },
+         { &hf_rpcordma_reply_count,
+           { "Reply list", "rpcordma.reply_count", FT_UINT32, BASE_DEC,
+             NULL, 0x0, NULL, HFILL } },
+         { &hf_rpcordma_rdma_handle,
+           { "RDMA handle", "rpcordma.rdma_handle", FT_UINT32, BASE_HEX,
+             NULL, 0x0, NULL, HFILL } },
+         { &hf_rpcordma_rdma_length,
+           { "RDMA length", "rpcordma.rdma_length", FT_UINT32, BASE_DEC,
+             NULL, 0x0, NULL, HFILL } },
+         { &hf_rpcordma_rdma_offset,
+           { "RDMA offset", "rpcordma.rdma_offset", FT_UINT64, BASE_HEX,
+             NULL, 0x0, NULL, HFILL } },
+         { &hf_rpcordma_position,
+           { "Position in XDR", "rpcordma.position", FT_UINT32, BASE_DEC,
+             NULL, 0x0, NULL, HFILL } },
+         { &hf_rpcordma_segment_count,
+           { "Write chunk segment count", "rpcordma.segment_count", FT_UINT32, BASE_DEC,
+             NULL, 0x0, NULL, HFILL } },
+         { &hf_rpcordma_rdma_align,
+           { "RDMA align", "rpcordma.rdma_align", FT_UINT32, BASE_DEC,
+             NULL, 0x0, NULL, HFILL } },
+         { &hf_rpcordma_rdma_thresh,
+           { "RDMA threshold", "rpcordma.rdma_thresh", FT_UINT32, BASE_DEC,
+             NULL, 0x0, NULL, HFILL } },
+         { &hf_rpcordma_errcode,
+           { "Error code", "rpcordma.errcode", FT_UINT32, BASE_DEC,
+             VALS(rpcordma_err), 0x0, NULL, HFILL } },
+         { &hf_rpcordma_vers_low,
+           { "Version low", "rpcordma.vers_low", FT_UINT32, BASE_DEC,
+             NULL, 0x0, NULL, HFILL } },
+         { &hf_rpcordma_vers_high,
+           { "Version high", "rpcordma.vers_high", FT_UINT32, BASE_DEC,
+             NULL, 0x0, NULL, HFILL } },
+
+         /* Fragment and reassembly fields */
+         { &hf_rpcordma_fragments,
+           { "Reassembled RPCoRDMA fragments", "rpcordma.fragments", FT_NONE, BASE_NONE,
+             NULL, 0x0, NULL, HFILL } },
+         { &hf_rpcordma_fragment,
+           { "RPCoRDMA fragment", "rpcordma.fragment", FT_FRAMENUM, BASE_NONE,
+             NULL, 0x0, NULL, HFILL } },
+         { &hf_rpcordma_fragment_overlap,
+           { "Fragment overlap", "rpcordma.fragment.overlap", FT_BOOLEAN, BASE_NONE,
+             NULL, 0x0, NULL, HFILL } },
+         { &hf_rpcordma_fragment_overlap_conflicts,
+           { "Fragment overlapping with conflicting data", "rpcordma.fragment.overlap.conflicts", FT_BOOLEAN, BASE_NONE,
+             NULL, 0x0, NULL, HFILL } },
+         { &hf_rpcordma_fragment_multiple_tails,
+           { "Multiple tail fragments found", "rpcordma.fragment.multiple_tails", FT_BOOLEAN, BASE_NONE,
+             NULL, 0x0, NULL, HFILL } },
+         { &hf_rpcordma_fragment_too_long_fragment,
+           { "Fragment too long", "rpcordma.fragment.too_long_fragment", FT_BOOLEAN, BASE_NONE,
+             NULL, 0x0, NULL, HFILL } },
+         { &hf_rpcordma_fragment_error,
+           { "Defragmentation error", "rpcordma.fragment.error", FT_FRAMENUM, BASE_NONE,
+             NULL, 0x0, NULL, HFILL } },
+         { &hf_rpcordma_fragment_count,
+           { "Fragment count", "rpcordma.fragment.count", FT_UINT32, BASE_DEC,
+             NULL, 0x0, NULL, HFILL } },
+         { &hf_rpcordma_reassembled_in,
+           { "Reassembled PDU in frame", "rpcordma.reassembled.in", FT_FRAMENUM, BASE_NONE,
+             NULL, 0x0, NULL, HFILL } },
+         { &hf_rpcordma_reassembled_length,
+           { "Reassembled RPCoRDMA length", "rpcordma.reassembled.length", FT_UINT32, BASE_DEC,
+             NULL, 0x0, NULL, HFILL } },
+         { &hf_rpcordma_reassembled_data,
+           { "Reassembled RPCoRDMA data", "rpcordma.reassembled.data", FT_BYTES, BASE_NONE,
+             NULL, 0x0, NULL, HFILL } },
+     };
+
+     /* Subtree indices */
+     static gint *ett[] = {
+         &ett_rpcordma,
+         &ett_rpcordma_chunk,
+         &ett_rpcordma_read_list,
+         &ett_rpcordma_read_chunk,
+         &ett_rpcordma_write_list,
+         &ett_rpcordma_write_chunk,
+         &ett_rpcordma_reply_chunk,
+         &ett_rpcordma_segment,
+         &ett_rpcordma_fragment,
+         &ett_rpcordma_fragments,
+     };
+
+     /* Names of preferences that are registered only as obsolete */
+     static const char *const obsolete_prefs[] = {
+         "manual_en",
+         "addr_a",
+         "addr_a_type",
+         "addr_a_id",
+         "addr_a_qp",
+         "addr_b",
+         "addr_b_type",
+         "addr_b_id",
+         "addr_b_qp",
+         "target_ports",
+     };
+
+     /* Arguments are: name, short name, abbrev */
+     proto_rpcordma = proto_register_protocol("RPC over RDMA", "RPCoRDMA", "rpcordma");
+
+     proto_register_field_array(proto_rpcordma, hf, array_length(hf));
+     proto_register_subtree_array(ett, array_length(ett));
+     reassembly_table_register(&rpcordma_reassembly_table,
+             &addresses_ports_reassembly_table_functions);
+
+     /* Register dissector handle */
+     rpcordma_handle = register_dissector("rpcordma", dissect_rpcrdma, proto_rpcordma);
+
+     /* Register the obsolete preferences, in the order listed above */
+     obsolete_module = prefs_register_protocol_obsolete(proto_rpcordma);
+     for (pref_idx = 0; pref_idx < array_length(obsolete_prefs); pref_idx++) {
+         prefs_register_obsolete_preference(obsolete_module, obsolete_prefs[pref_idx]);
+     }
+ }
+
+ /*
+  * Handoff registration: looks up the RPC dissector handle and attaches
+  * this dissector to the InfiniBand and iWARP payload tables.
+  */
+ void
+ proto_reg_handoff_rpcordma(void)
+ {
+     /* RPC dissector handle, recorded as a dependency of this protocol */
+     rpc_handler = find_dissector_add_dependency("rpc", proto_rpcordma);
+
+     /* InfiniBand: heuristic dissector plus a "Decode As" entry */
+     heur_dissector_add("infiniband.payload", dissect_rpcrdma_ib_heur,
+             "RPC-over-RDMA on Infiniband", "rpcrdma_infiniband",
+             proto_rpcordma, HEURISTIC_ENABLE);
+     dissector_add_for_decode_as("infiniband", rpcordma_handle);
+
+     /* iWARP: heuristic dissector only */
+     heur_dissector_add("iwarp_ddp_rdmap", dissect_rpcrdma_iwarp_heur,
+             "RPC-over-RDMA on iWARP", "rpcrdma_iwarp",
+             proto_rpcordma, HEURISTIC_ENABLE);
+ }
+
+/*
+ * Editor modelines - https://www.wireshark.org/tools/modelines.html
+ *
+ * Local variables:
+ * c-basic-offset: 4
+ * tab-width: 8
+ * indent-tabs-mode: nil
+ * End:
+ *
+ * vi: set shiftwidth=4 tabstop=8 expandtab:
+ * :indentSize=4:tabSize=8:noTabs=true:
+ */