From: Matan Barak <[email protected]>

The RDMA stack allows for applications to create IB_QPT_RAW_PACKET
QPs, which receive plain Ethernet packets, specifically packets that
don't carry any QPN to be matched by the receiving side. Applications
using these QPs must be provided with a method to program some steering
rule with the HW so packets arriving at the local port can be routed to them.

In a similar manner, when the device supports flow steering, IB UD QPs
created by IPoIB allow user-space applications to steer specific TCP/IP
flows to their QPs.

This patch adds ibv_create_flow(), which allows providing a flow specification
for a QP.  When there's a match between the specification and a received packet,
the packet is forwarded to that QP, in the same way one uses
ibv_attach_mcast() for IB UD multicast handling.

Flow specifications are provided as instances of struct ibv_flow_spec_yyy,
which describes L2, L3 and L4 headers.  Currently specs for Ethernet, IPv4,
TCP and UDP are defined.  Flow specs are made of values and masks.

The input to ibv_create_flow() is a struct ibv_flow_attr, which contains
a few mandatory control elements and optional flow specs.

 struct ibv_flow_attr {
        uint32_t comp_mask;
        enum ibv_flow_attr_type type;
        uint16_t size;
        uint16_t priority;
        uint8_t  num_of_specs;
        uint8_t  port;
        uint32_t flags;
        /* Following are the optional layers according to user request
         * struct ibv_flow_spec_xxx [L2]
         * struct ibv_flow_spec_yyy [L3/L4]
         */
 };

These flow specs are defined and used in a way which allows adding new spec
types without kernel/user ABI change, just with a little API enhancement which
defines the newly added spec.

The flow spec structures are defined with TLV (Type-Length-Value) entries, which
allows calling ibv_create_flow() with a variable-length list of optional specs.

For the actual processing of ibv_flow_attr the kernel uses the number
of specs and the size mandatory fields along with the TLV nature of
the specs.

The returned value from ibv_create_flow() is a struct ibv_flow, which contains
a handle provided by the kernel to be used when calling ibv_destroy_flow().

The ibv_flow_attr_type enum supports usage of flow steering for promiscuous
and sniffer purposes:

    IBV_FLOW_ATTR_NORMAL - "regular" rule, steering according to rule
        specification

    IBV_FLOW_ATTR_ALL_DEFAULT - default unicast and multicast rule, receive
        all Ethernet traffic which isn't steered to any QP

    IBV_FLOW_ATTR_MC_DEFAULT - same as IBV_FLOW_ATTR_ALL_DEFAULT but only for
        multicast

ALL_DEFAULT and MC_DEFAULT rules options are valid only for Ethernet link type.

Signed-off-by: Hadar Hen Zion <[email protected]>
Signed-off-by: Or Gerlitz <[email protected]>
Signed-off-by: Matan Barak <[email protected]>
---
 include/infiniband/driver.h   |    4 +
 include/infiniband/kern-abi.h |   99 +++++++++++++++++++++++++++++++-
 include/infiniband/verbs.h    |  128 ++++++++++++++++++++++++++++++++++++++++-
 src/cmd.c                     |  105 +++++++++++++++++++++++++++++++++
 src/device.c                  |    4 +
 src/libibverbs.map            |    2 +
 6 files changed, 340 insertions(+), 2 deletions(-)

diff --git a/include/infiniband/driver.h b/include/infiniband/driver.h
index f69962e..5cc092b 100644
--- a/include/infiniband/driver.h
+++ b/include/infiniband/driver.h
@@ -194,6 +194,10 @@ int ibv_cmd_destroy_ah(struct ibv_ah *ah);
 int ibv_cmd_attach_mcast(struct ibv_qp *qp, const union ibv_gid *gid, uint16_t 
lid);
 int ibv_cmd_detach_mcast(struct ibv_qp *qp, const union ibv_gid *gid, uint16_t 
lid);
 
+struct ibv_flow *ibv_cmd_create_flow(struct ibv_qp *qp,
+                                    struct ibv_flow_attr *flow_attr);
+int ibv_cmd_destroy_flow(struct ibv_flow *flow_id);
+
 int ibv_dontfork_range(void *base, size_t size);
 int ibv_dofork_range(void *base, size_t size);
 
diff --git a/include/infiniband/kern-abi.h b/include/infiniband/kern-abi.h
index 0b9c79e..91b45d8 100644
--- a/include/infiniband/kern-abi.h
+++ b/include/infiniband/kern-abi.h
@@ -102,6 +102,13 @@ enum {
 #define IB_USER_VERBS_CMD_FLAG_EXTENDED                0x80ul
 
 
+enum {
+       IB_USER_VERBS_CMD_CREATE_FLOW = (IB_USER_VERBS_CMD_FLAG_EXTENDED <<
+                                        IB_USER_VERBS_CMD_FLAGS_SHIFT) +
+                                       IB_USER_VERBS_CMD_THRESHOLD,
+       IB_USER_VERBS_CMD_DESTROY_FLOW
+};
+
 /*
  * Make sure that all structs defined in this file remain laid out so
  * that they pack the same way on 32-bit and 64-bit architectures (to
@@ -676,6 +683,76 @@ struct ibv_kern_send_wr {
        } qp_type;
 };
 
+struct ibv_kern_eth_filter {
+       __u8  dst_mac[6];
+       __u8  src_mac[6];
+       __u16  ether_type;
+       __u16  vlan_tag;
+};
+
+struct ibv_kern_spec_eth {
+       __u32 type;
+       __u16  size;
+       __u16 reserved;
+       struct ibv_kern_eth_filter val;
+       struct ibv_kern_eth_filter mask;
+};
+
+struct ibv_kern_ipv4_filter {
+       __u32 src_ip;
+       __u32 dst_ip;
+};
+
+struct ibv_kern_spec_ipv4 {
+       __u32  type;
+       __u16  size;
+       __u16 reserved;
+       struct ibv_kern_ipv4_filter val;
+       struct ibv_kern_ipv4_filter mask;
+};
+
+struct ibv_kern_tcp_udp_filter {
+       __u16 dst_port;
+       __u16 src_port;
+};
+
+struct ibv_kern_spec_tcp_udp {
+       __u32  type;
+       __u16  size;
+       __u16 reserved;
+       struct ibv_kern_tcp_udp_filter val;
+       struct ibv_kern_tcp_udp_filter mask;
+};
+
+
+struct ibv_kern_spec {
+       union {
+               struct {
+                       __u32 type;
+                       __u16 size;
+                       __u16 reserved;
+               } hdr;
+               struct ibv_kern_spec_eth eth;
+               struct ibv_kern_spec_ipv4 ipv4;
+               struct ibv_kern_spec_tcp_udp tcp_udp;
+       };
+
+};
+
+struct ibv_kern_flow_attr {
+       __u32 type;
+       __u16 size;
+       __u16 priority;
+       __u8 num_of_specs;
+       __u8 reserved[2];
+       __u8 port;
+       __u32 flags;
+       /* Following are the optional layers according to user request
+        * struct ibv_kern_flow_spec_xxx
+        * struct ibv_kern_flow_spec_yyy
+        */
+};
+
 struct ibv_post_send {
        __u32 command;
        __u16 in_words;
@@ -763,6 +840,24 @@ struct ibv_attach_mcast {
        __u64 driver_data[0];
 };
 
+struct ibv_create_flow  {
+       struct ex_hdr hdr;
+       __u32 comp_mask;
+       __u32 qp_handle;
+       struct ibv_kern_flow_attr flow_attr;
+};
+
+struct ibv_create_flow_resp {
+       __u32 comp_mask;
+       __u32 flow_handle;
+};
+
+struct ibv_destroy_flow  {
+       struct ex_hdr hdr;
+       __u32 comp_mask;
+       __u32 flow_handle;
+};
+
 struct ibv_detach_mcast {
        __u32 command;
        __u16 in_words;
@@ -904,7 +999,9 @@ enum {
        IB_USER_VERBS_CMD_OPEN_XRCD_V2 = -1,
        IB_USER_VERBS_CMD_CLOSE_XRCD_V2 = -1,
        IB_USER_VERBS_CMD_CREATE_XSRQ_V2 = -1,
-       IB_USER_VERBS_CMD_OPEN_QP_V2 = -1
+       IB_USER_VERBS_CMD_OPEN_QP_V2 = -1,
+       IB_USER_VERBS_CMD_CREATE_FLOW_V2 = -1,
+       IB_USER_VERBS_CMD_DESTROY_FLOW_V2 = -1
 };
 
 struct ibv_modify_srq_v3 {
diff --git a/include/infiniband/verbs.h b/include/infiniband/verbs.h
index 4ea5745..f5c1946 100644
--- a/include/infiniband/verbs.h
+++ b/include/infiniband/verbs.h
@@ -111,7 +111,8 @@ enum ibv_device_cap_flags {
        IBV_DEVICE_RC_RNR_NAK_GEN       = 1 << 12,
        IBV_DEVICE_SRQ_RESIZE           = 1 << 13,
        IBV_DEVICE_N_NOTIFY_CQ          = 1 << 14,
-       IBV_DEVICE_XRC                  = 1 << 20
+       IBV_DEVICE_XRC                  = 1 << 20,
+       IBV_DEVICE_MANAGED_FLOW_STEERING = 1 << 29
 };
 
 enum ibv_atomic_cap {
@@ -954,8 +955,113 @@ enum verbs_context_mask {
        VERBS_CONTEXT_RESERVED  = 1 << 4
 };
 
+enum ibv_flow_flags {
+       IBV_FLOW_ATTR_FLAGS_ALLOW_LOOP_BACK = 1,
+};
+
+enum ibv_flow_attr_type {
+       /* steering according to rule specifications */
+       IBV_FLOW_ATTR_NORMAL            = 0x0,
+       /* default unicast and multicast rule -
+        * receive all Eth traffic which isn't steered to any QP
+        */
+       IBV_FLOW_ATTR_ALL_DEFAULT       = 0x1,
+       /* default multicast rule -
+        * receive all Eth multicast traffic which isn't steered to any QP
+        */
+       IBV_FLOW_ATTR_MC_DEFAULT        = 0x2,
+};
+
+enum ibv_flow_spec_type {
+       IBV_FLOW_SPEC_ETH       = 0x20,
+       IBV_FLOW_SPEC_IPV4      = 0x30,
+       IBV_FLOW_SPEC_TCP       = 0x40,
+       IBV_FLOW_SPEC_UDP       = 0x41,
+};
+
+struct ibv_flow_eth_filter {
+       uint8_t         dst_mac[6];
+       uint8_t         src_mac[6];
+       uint16_t        ether_type;
+       /*
+        * same layout as 802.1q: prio 3, cfi 1, vlan id 12
+        */
+       uint16_t        vlan_tag;
+};
+
+struct ibv_flow_spec_eth {
+       enum ibv_flow_spec_type  type;
+       uint16_t  size;
+       struct ibv_flow_eth_filter val;
+       struct ibv_flow_eth_filter mask;
+};
+
+struct ibv_flow_ipv4_filter {
+       uint32_t src_ip;
+       uint32_t dst_ip;
+};
+
+struct ibv_flow_spec_ipv4 {
+       enum ibv_flow_spec_type  type;
+       uint16_t  size;
+       struct ibv_flow_ipv4_filter val;
+       struct ibv_flow_ipv4_filter mask;
+};
+
+struct ibv_flow_tcp_udp_filter {
+       uint16_t dst_port;
+       uint16_t src_port;
+};
+
+struct ibv_flow_spec_tcp_udp {
+       enum ibv_flow_spec_type  type;
+       uint16_t  size;
+       struct ibv_flow_tcp_udp_filter val;
+       struct ibv_flow_tcp_udp_filter mask;
+};
+
+struct ibv_flow_spec {
+       union {
+               struct {
+                       enum ibv_flow_spec_type type;
+                       uint16_t                size;
+               } hdr;
+               struct ibv_flow_spec_eth eth;
+               struct ibv_flow_spec_ipv4 ipv4;
+               struct ibv_flow_spec_tcp_udp tcp_udp;
+       };
+};
+
+struct ibv_flow_attr {
+       uint32_t comp_mask;
+       enum ibv_flow_attr_type type;
+       uint16_t size;
+       uint16_t priority;
+       uint8_t num_of_specs;
+       uint8_t port;
+       uint32_t flags;
+       /* Following are the optional layers according to user request
+        * struct ibv_flow_spec_xxx [L2]
+        * struct ibv_flow_spec_yyy [L3/L4]
+        */
+};
+
+struct ibv_flow {
+       uint32_t           comp_mask;
+       struct ibv_context *context;
+       uint32_t           handle;
+};
+
 struct verbs_context {
        /*  "grows up" - new fields go here */
+       int (*drv_ibv_destroy_flow) (struct ibv_flow *flow);
+       int (*lib_ibv_destroy_flow) (struct ibv_flow *flow);
+       struct ibv_flow * (*drv_ibv_create_flow) (struct ibv_qp *qp,
+                                                 struct ibv_flow_attr
+                                                 *flow_attr);
+       struct ibv_flow * (*lib_ibv_create_flow) (struct ibv_qp *qp,
+                                                 struct ibv_flow_attr
+                                                 *flow_attr);
        int (*drv_query_port_ex)(struct ibv_context *context, uint8_t port_num,
                                 struct ibv_port_attr_ex *port_attr);
        int (*lib_query_port_ex)(struct ibv_context *context, uint8_t port_num,
@@ -1146,6 +1252,26 @@ struct ibv_pd *ibv_alloc_pd(struct ibv_context *context);
  */
 int ibv_dealloc_pd(struct ibv_pd *pd);
 
+static inline struct ibv_flow *ibv_create_flow(struct ibv_qp *qp,
+                                              struct ibv_flow_attr *flow)
+{
+       struct verbs_context *vctx = verbs_get_ctx_op(qp->context,
+                                                     lib_ibv_create_flow);
+       if (!vctx || !vctx->lib_ibv_create_flow)
+               return NULL;    /* no usable create_flow entry */
+       /* forward to the create_flow entry found in the extended context */
+       return vctx->lib_ibv_create_flow(qp, flow);
+}
+
+static inline int ibv_destroy_flow(struct ibv_flow *flow_id)
+{
+       struct verbs_context *vctx = verbs_get_ctx_op(flow_id->context,
+                                                     lib_ibv_destroy_flow);
+       if (!vctx || !vctx->lib_ibv_destroy_flow)
+               return -ENOSYS; /* no usable destroy_flow entry */
+       return vctx->lib_ibv_destroy_flow(flow_id);
+}
+
 /**
  * ibv_open_xrcd - Open an extended connection domain
  */
diff --git a/src/cmd.c b/src/cmd.c
index 57d3b03..e52a154 100644
--- a/src/cmd.c
+++ b/src/cmd.c
@@ -1268,3 +1268,108 @@ int ibv_cmd_detach_mcast(struct ibv_qp *qp, const union 
ibv_gid *gid, uint16_t l
 
        return 0;
 }
+
+static int ib_spec_to_kern_spec(struct ibv_flow_spec *ib_spec,
+                               struct ibv_kern_spec *kern_spec)
+{
+       kern_spec->hdr.type = ib_spec->hdr.type;
+       /* size is set to the kernel ABI struct size for each spec type */
+       switch (ib_spec->hdr.type) {
+       case IBV_FLOW_SPEC_ETH:
+               kern_spec->eth.size = sizeof(struct ibv_kern_spec_eth);
+               memcpy(&kern_spec->eth.val, &ib_spec->eth.val,
+                      sizeof(struct ibv_flow_eth_filter));
+               memcpy(&kern_spec->eth.mask, &ib_spec->eth.mask,
+                      sizeof(struct ibv_flow_eth_filter));
+               break;
+       case IBV_FLOW_SPEC_IPV4:
+               kern_spec->ipv4.size = sizeof(struct ibv_kern_spec_ipv4);
+               memcpy(&kern_spec->ipv4.val, &ib_spec->ipv4.val,
+                      sizeof(struct ibv_flow_ipv4_filter));
+               memcpy(&kern_spec->ipv4.mask, &ib_spec->ipv4.mask,
+                      sizeof(struct ibv_flow_ipv4_filter));
+               break;
+       case IBV_FLOW_SPEC_TCP:
+       case IBV_FLOW_SPEC_UDP:
+               kern_spec->tcp_udp.size = sizeof(struct ibv_kern_spec_tcp_udp);
+               memcpy(&kern_spec->tcp_udp.val, &ib_spec->tcp_udp.val,
+                      sizeof(struct ibv_flow_tcp_udp_filter));
+               memcpy(&kern_spec->tcp_udp.mask, &ib_spec->tcp_udp.mask,
+                      sizeof(struct ibv_flow_tcp_udp_filter));
+               break;
+       default:
+               return -EINVAL; /* spec type not known to this library */
+       }
+       return 0;
+}
+
+struct ibv_flow *ibv_cmd_create_flow(struct ibv_qp *qp,
+                                    struct ibv_flow_attr *flow_attr)
+{
+       struct ibv_create_flow *cmd;
+       struct ibv_create_flow_resp resp;
+       struct ibv_flow *flow_id;
+       size_t cmd_size;
+       size_t written_size;
+       int i, err;
+       void *kern_spec;
+       void *ib_spec;
+       /* worst case: every spec is as large as the ibv_kern_spec union */
+       cmd_size = sizeof(*cmd) + (flow_attr->num_of_specs *
+                                 sizeof(struct ibv_kern_spec));
+       cmd = alloca(cmd_size);
+       flow_id = calloc(1, sizeof(*flow_id)); /* comp_mask starts at 0 */
+       if (!flow_id)
+               return NULL;
+       memset(cmd, 0, cmd_size);
+
+       cmd->qp_handle = qp->handle;
+
+       cmd->flow_attr.type = flow_attr->type;
+       cmd->flow_attr.priority = flow_attr->priority;
+       cmd->flow_attr.num_of_specs = flow_attr->num_of_specs;
+       cmd->flow_attr.port = flow_attr->port;
+       cmd->flow_attr.flags = flow_attr->flags;
+       /* specs are packed back to back right after the fixed headers */
+       kern_spec = cmd + 1;
+       ib_spec = flow_attr + 1;
+       for (i = 0; i < flow_attr->num_of_specs; i++) {
+               err = ib_spec_to_kern_spec(ib_spec, kern_spec);
+               if (err)
+                       goto err;
+               cmd->flow_attr.size +=
+                       ((struct ibv_kern_spec *)kern_spec)->hdr.size;
+               kern_spec += ((struct ibv_kern_spec *)kern_spec)->hdr.size;
+               ib_spec += ((struct ibv_flow_spec *)ib_spec)->hdr.size;
+       }
+       /* only the bytes actually filled in are sent, not all of cmd_size */
+       written_size = sizeof(*cmd) + cmd->flow_attr.size;
+       IBV_INIT_CMD_RESP_EX_VCMD(cmd, written_size, written_size, CREATE_FLOW,
+                                 &resp, sizeof(resp));
+       if (write(qp->context->cmd_fd, cmd, written_size) != written_size)
+               goto err;
+
+       VALGRIND_MAKE_MEM_DEFINED(&resp, sizeof(resp));
+
+       flow_id->context = qp->context;
+       flow_id->handle = resp.flow_handle;
+       return flow_id;
+err:
+       free(flow_id);
+       return NULL;
+}
+
+int ibv_cmd_destroy_flow(struct ibv_flow *flow_id)
+{
+       struct ibv_destroy_flow cmd;
+       int ret = 0;
+
+       memset(&cmd, 0, sizeof(cmd));
+       IBV_INIT_CMD_EX(&cmd, sizeof(cmd), DESTROY_FLOW);
+       cmd.flow_handle = flow_id->handle;
+       /* on a failed/short write return errno; flow_id is freed either way */
+       if (write(flow_id->context->cmd_fd, &cmd, sizeof(cmd)) != sizeof(cmd))
+               ret = errno;
+       free(flow_id);
+       return ret;
+}
diff --git a/src/device.c b/src/device.c
index 29ad726..927233e 100644
--- a/src/device.c
+++ b/src/device.c
@@ -171,6 +171,10 @@ struct ibv_context *__ibv_open_device(struct ibv_device 
*device)
                 */
                 context_ex->lib_query_port_ex =
                         context_ex->drv_query_port_ex;
+                context_ex->lib_ibv_create_flow =
+                        context_ex->drv_ibv_create_flow;
+                context_ex->lib_ibv_destroy_flow =
+                        context_ex->drv_ibv_destroy_flow;
        }
 
        context->device = device;
diff --git a/src/libibverbs.map b/src/libibverbs.map
index bbc6de1..30212f3 100644
--- a/src/libibverbs.map
+++ b/src/libibverbs.map
@@ -64,6 +64,8 @@ IBVERBS_1.0 {
                ibv_cmd_destroy_ah;
                ibv_cmd_attach_mcast;
                ibv_cmd_detach_mcast;
+               ibv_cmd_create_flow;
+               ibv_cmd_destroy_flow;
                ibv_copy_qp_attr_from_kern;
                ibv_copy_path_rec_from_kern;
                ibv_copy_path_rec_to_kern;
-- 
1.7.1

--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to [email protected]
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Reply via email to