From: Shamir Rabinovitch <[email protected]>

Introduce a new extension header type, RDS_EXTHDR_RDMA_BYTES, that lets
an RDMA initiator report RDMA byte counts to its target.
Currently, there is no way to account precisely for how many bytes a
peer has just transferred via RDMA, which limits per-connection
statistics and future policy (e.g., monitoring or rate/cgroup
accounting of RDMA traffic).

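This patch extends rds_message_add_extension() so that multiple
extensions can be added to a single RDS header; the explicit length
argument is dropped and the length is looked up from the extension
type instead. It also adds a new RDS header flag,
RDS_FLAG_EXTHDR_EXTENSION, and a new RDS header extension,
struct rds_ext_header_rdma_bytes.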

Signed-off-by: Shamir Rabinovitch <[email protected]>
Signed-off-by: Guangyu Sun <[email protected]>
Signed-off-by: Allison Henderson <[email protected]>
---
 net/rds/ib_send.c | 19 +++++++++++++-
 net/rds/message.c | 65 +++++++++++++++++++++++++++++++++++++----------
 net/rds/rds.h     | 24 +++++++++++++----
 net/rds/send.c    |  6 ++---
 4 files changed, 91 insertions(+), 23 deletions(-)

diff --git a/net/rds/ib_send.c b/net/rds/ib_send.c
index f9d28ddd168d8..8282ff61b0b37 100644
--- a/net/rds/ib_send.c
+++ b/net/rds/ib_send.c
@@ -578,10 +578,27 @@ int rds_ib_xmit(struct rds_connection *conn, struct 
rds_message *rm,
                 * used by the peer to release use-once RDMA MRs. */
                if (rm->rdma.op_active) {
                        struct rds_ext_header_rdma ext_hdr;
+                       struct rds_ext_header_rdma_bytes rdma_bytes_ext_hdr;
 
                        ext_hdr.h_rdma_rkey = cpu_to_be32(rm->rdma.op_rkey);
                        rds_message_add_extension(&rm->m_inc.i_hdr,
-                                       RDS_EXTHDR_RDMA, &ext_hdr, 
sizeof(ext_hdr));
+                                                 RDS_EXTHDR_RDMA, &ext_hdr);
+
+                       /* prepare the rdma bytes ext header */
+                       rdma_bytes_ext_hdr.h_rflags = rm->rdma.op_write ?
+                               RDS_FLAG_RDMA_WR_BYTES : RDS_FLAG_RDMA_RD_BYTES;
+                       rdma_bytes_ext_hdr.h_rdma_bytes =
+                               cpu_to_be32(rm->rdma.op_bytes);
+
+                       if (rds_message_add_extension(&rm->m_inc.i_hdr,
+                                                     RDS_EXTHDR_RDMA_BYTES,
+                                                     &rdma_bytes_ext_hdr)) {
+                               /* rdma bytes ext header was added successfully,
+                                * notify the remote side via flag in header
+                                */
+                               rm->m_inc.i_hdr.h_flags |=
+                                       RDS_FLAG_EXTHDR_EXTENSION;
+                       }
                }
                if (rm->m_rdma_cookie) {
                        rds_message_add_rdma_dest_extension(&rm->m_inc.i_hdr,
diff --git a/net/rds/message.c b/net/rds/message.c
index 199a899a43e9c..591a27c9c62f7 100644
--- a/net/rds/message.c
+++ b/net/rds/message.c
@@ -44,6 +44,7 @@ static unsigned int   rds_exthdr_size[__RDS_EXTHDR_MAX] = {
 [RDS_EXTHDR_VERSION]   = sizeof(struct rds_ext_header_version),
 [RDS_EXTHDR_RDMA]      = sizeof(struct rds_ext_header_rdma),
 [RDS_EXTHDR_RDMA_DEST] = sizeof(struct rds_ext_header_rdma_dest),
+[RDS_EXTHDR_RDMA_BYTES] = sizeof(struct rds_ext_header_rdma_bytes),
 [RDS_EXTHDR_NPATHS]    = sizeof(__be16),
 [RDS_EXTHDR_GEN_NUM]   = sizeof(__be32),
 };
@@ -191,31 +192,69 @@ void rds_message_populate_header(struct rds_header *hdr, 
__be16 sport,
        hdr->h_sport = sport;
        hdr->h_dport = dport;
        hdr->h_sequence = cpu_to_be64(seq);
-       hdr->h_exthdr[0] = RDS_EXTHDR_NONE;
+       /* see rds_find_next_ext_space for reason why we memset the
+        * ext header
+        */
+       memset(hdr->h_exthdr, RDS_EXTHDR_NONE, RDS_HEADER_EXT_SPACE);
 }
 EXPORT_SYMBOL_GPL(rds_message_populate_header);
 
-int rds_message_add_extension(struct rds_header *hdr, unsigned int type,
-                             const void *data, unsigned int len)
+/*
+ * Find the next place we can add an RDS header extension with
+ * specific length. Extension headers are pushed one after the
+ * other. In the following, the number after the colon is the number
+ * of bytes:
+ *
+ * [ type1:1 dta1:len1 [ type2:1 dta2:len2 ] ... ] RDS_EXTHDR_NONE
+ *
+ * If the extension headers fill the complete extension header space
+ * (16 bytes), the trailing RDS_EXTHDR_NONE is omitted.
+ */
+static int rds_find_next_ext_space(struct rds_header *hdr, unsigned int len,
+                                  u8 **ext_start)
 {
-       unsigned int ext_len = sizeof(u8) + len;
-       unsigned char *dst;
+       unsigned int ext_len;
+       unsigned int type;
+       int ind = 0;
+
+       while ((ind + 1 + len) <= RDS_HEADER_EXT_SPACE) {
+               if (hdr->h_exthdr[ind] == RDS_EXTHDR_NONE) {
+                       *ext_start = hdr->h_exthdr + ind;
+                       return 0;
+               }
 
-       /* For now, refuse to add more than one extension header */
-       if (hdr->h_exthdr[0] != RDS_EXTHDR_NONE)
-               return 0;
+               type = hdr->h_exthdr[ind];
+
+               ext_len = (type < __RDS_EXTHDR_MAX) ? rds_exthdr_size[type] : 0;
+               WARN_ONCE(!ext_len, "Unknown ext hdr type %d\n", type);
+               if (!ext_len)
+                       return -EINVAL;
+
+               /* ind points to a valid ext hdr with known length */
+               ind += 1 + ext_len;
+       }
+
+       /* no room for extension */
+       return -ENOSPC;
+}
+
+/* The ext hdr space is prefilled with zero from the kzalloc() */
+int rds_message_add_extension(struct rds_header *hdr,
+                             unsigned int type, const void *data)
+{
+       unsigned char *dst;
+       unsigned int len;
 
-       if (type >= __RDS_EXTHDR_MAX || len != rds_exthdr_size[type])
+       len = (type < __RDS_EXTHDR_MAX) ? rds_exthdr_size[type] : 0;
+       if (!len)
                return 0;
 
-       if (ext_len >= RDS_HEADER_EXT_SPACE)
+       if (rds_find_next_ext_space(hdr, len, &dst))
                return 0;
-       dst = hdr->h_exthdr;
 
        *dst++ = type;
        memcpy(dst, data, len);
 
-       dst[len] = RDS_EXTHDR_NONE;
        return 1;
 }
 EXPORT_SYMBOL_GPL(rds_message_add_extension);
@@ -272,7 +311,7 @@ int rds_message_add_rdma_dest_extension(struct rds_header 
*hdr, u32 r_key, u32 o
 
        ext_hdr.h_rdma_rkey = cpu_to_be32(r_key);
        ext_hdr.h_rdma_offset = cpu_to_be32(offset);
-       return rds_message_add_extension(hdr, RDS_EXTHDR_RDMA_DEST, &ext_hdr, 
sizeof(ext_hdr));
+       return rds_message_add_extension(hdr, RDS_EXTHDR_RDMA_DEST, &ext_hdr);
 }
 EXPORT_SYMBOL_GPL(rds_message_add_rdma_dest_extension);
 
diff --git a/net/rds/rds.h b/net/rds/rds.h
index 8a549fe687ac9..cadfd7ec0ba92 100644
--- a/net/rds/rds.h
+++ b/net/rds/rds.h
@@ -183,10 +183,11 @@ void rds_conn_net_set(struct rds_connection *conn, struct 
net *net)
        write_pnet(&conn->c_net, net);
 }
 
-#define RDS_FLAG_CONG_BITMAP   0x01
-#define RDS_FLAG_ACK_REQUIRED  0x02
-#define RDS_FLAG_RETRANSMITTED 0x04
-#define RDS_MAX_ADV_CREDIT     255
+#define RDS_FLAG_CONG_BITMAP           0x01
+#define RDS_FLAG_ACK_REQUIRED          0x02
+#define RDS_FLAG_RETRANSMITTED         0x04
+#define RDS_FLAG_EXTHDR_EXTENSION      0x20
+#define RDS_MAX_ADV_CREDIT             255
 
 /* RDS_FLAG_PROBE_PORT is the reserved sport used for sending a ping
  * probe to exchange control information before establishing a connection.
@@ -258,6 +259,19 @@ struct rds_ext_header_rdma_dest {
        __be32                  h_rdma_offset;
 };
 
+/*
+ * This extension header tells the peer about delivered RDMA byte count.
+ */
+#define RDS_EXTHDR_RDMA_BYTES  4
+
+struct rds_ext_header_rdma_bytes {
+       __be32          h_rdma_bytes;   /* byte count */
+       u8              h_rflags;       /* direction of RDMA, write or read */
+};
+
+#define RDS_FLAG_RDMA_WR_BYTES 0x01
+#define RDS_FLAG_RDMA_RD_BYTES 0x02
+
 /* Extension header announcing number of paths.
  * Implicit length = 2 bytes.
  */
@@ -871,7 +885,7 @@ struct rds_message *rds_message_map_pages(unsigned long 
*page_addrs, unsigned in
 void rds_message_populate_header(struct rds_header *hdr, __be16 sport,
                                 __be16 dport, u64 seq);
 int rds_message_add_extension(struct rds_header *hdr,
-                             unsigned int type, const void *data, unsigned int 
len);
+                             unsigned int type, const void *data);
 int rds_message_next_extension(struct rds_header *hdr,
                               unsigned int *pos, void *buf, unsigned int 
*buflen);
 int rds_message_add_rdma_dest_extension(struct rds_header *hdr, u32 r_key, u32 
offset);
diff --git a/net/rds/send.c b/net/rds/send.c
index 3e3d028bc21ee..306785fa7065e 100644
--- a/net/rds/send.c
+++ b/net/rds/send.c
@@ -1459,12 +1459,10 @@ rds_send_probe(struct rds_conn_path *cp, __be16 sport,
                __be32 my_gen_num = cpu_to_be32(cp->cp_conn->c_my_gen_num);
 
                rds_message_add_extension(&rm->m_inc.i_hdr,
-                                         RDS_EXTHDR_NPATHS, &npaths,
-                                         sizeof(npaths));
+                                         RDS_EXTHDR_NPATHS, &npaths);
                rds_message_add_extension(&rm->m_inc.i_hdr,
                                          RDS_EXTHDR_GEN_NUM,
-                                         &my_gen_num,
-                                         sizeof(u32));
+                                         &my_gen_num);
        }
        spin_unlock_irqrestore(&cp->cp_lock, flags);
 
-- 
2.43.0


Reply via email to