From: Shlomo Pongratz <[email protected]>

Depending on the mlx4 device capabilities, support the RSS IB device
capability, using Toeplitz or XOR hash functions. Support creating QP
groups where all RX and TX QPs have contiguous QP numbers.

Signed-off-by: Shlomo Pongratz <[email protected]>
---
 drivers/infiniband/hw/mlx4/main.c    |    5 +
 drivers/infiniband/hw/mlx4/mlx4_ib.h |   13 ++
 drivers/infiniband/hw/mlx4/qp.c      |  342 ++++++++++++++++++++++++++++++++-
 3 files changed, 349 insertions(+), 11 deletions(-)

diff --git a/drivers/infiniband/hw/mlx4/main.c 
b/drivers/infiniband/hw/mlx4/main.c
index 8aa06da..1034853 100644
--- a/drivers/infiniband/hw/mlx4/main.c
+++ b/drivers/infiniband/hw/mlx4/main.c
@@ -131,6 +131,11 @@ static int mlx4_ib_query_device(struct ib_device *ibdev,
        if (dev->dev->caps.flags & MLX4_DEV_CAP_FLAG_XRC)
                props->device_cap_flags |= IB_DEVICE_XRC;
 
+       props->device_cap_flags |= IB_DEVICE_QPG;
+       if (dev->dev->caps.flags2 & MLX4_DEV_CAP_FLAG2_RSS) {
+               props->device_cap_flags |= IB_DEVICE_UD_RSS;
+               props->max_rss_tbl_sz = dev->dev->caps.max_rss_tbl_sz;
+       }
        props->vendor_id           = be32_to_cpup((__be32 *) (out_mad->data + 
36)) &
                0xffffff;
        props->vendor_part_id      = be16_to_cpup((__be16 *) (out_mad->data + 
30));
diff --git a/drivers/infiniband/hw/mlx4/mlx4_ib.h 
b/drivers/infiniband/hw/mlx4/mlx4_ib.h
index 9060771..57a03ad 100644
--- a/drivers/infiniband/hw/mlx4/mlx4_ib.h
+++ b/drivers/infiniband/hw/mlx4/mlx4_ib.h
@@ -126,6 +126,17 @@ struct mlx4_ib_gid_entry {
        u8                      port;
 };
 
+struct mlx4_ib_qpg_data {
+       unsigned long *tss_bitmap;
+       unsigned long *rss_bitmap;
+       struct mlx4_ib_qp *qpg_parent;
+       int tss_qpn_base;
+       int rss_qpn_base;
+       u32 tss_child_count;
+       u32 rss_child_count;
+       u32 qpg_tss_mask_sz;
+};
+
 struct mlx4_ib_qp {
        struct ib_qp            ibqp;
        struct mlx4_qp          mqp;
@@ -154,6 +165,8 @@ struct mlx4_ib_qp {
        u8                      sq_no_prefetch;
        u8                      state;
        int                     mlx_type;
+       enum ib_qpg_type        qpg_type;
+       struct mlx4_ib_qpg_data *qpg_data;
        struct list_head        gid_list;
 };
 
diff --git a/drivers/infiniband/hw/mlx4/qp.c b/drivers/infiniband/hw/mlx4/qp.c
index a32482c..7fe3500 100644
--- a/drivers/infiniband/hw/mlx4/qp.c
+++ b/drivers/infiniband/hw/mlx4/qp.c
@@ -34,6 +34,8 @@
 #include <linux/log2.h>
 #include <linux/slab.h>
 #include <linux/netdevice.h>
+#include <linux/bitmap.h>
+#include <linux/bitops.h>
 
 #include <rdma/ib_cache.h>
 #include <rdma/ib_pack.h>
@@ -475,6 +477,241 @@ static int qp_has_rq(struct ib_qp_init_attr *attr)
        return !attr->srq;
 }
 
+static int init_qpg_parent(struct mlx4_ib_dev *dev, struct mlx4_ib_qp *pqp,
+                          struct ib_qp_init_attr *attr, int *qpn)
+{
+       struct mlx4_ib_qpg_data *qpg_data;
+       int tss_num, rss_num;
+       int tss_align_num, rss_align_num;
+       int tss_base, rss_base = 0;     /* stays 0 when no RSS range is reserved */
+       int err;
+
+       /* Parent is part of the TSS range (in SW TSS ARP is sent via parent) */
+       tss_num = 1 + attr->parent_attrib.tss_child_count;
+       tss_align_num = roundup_pow_of_two(tss_num);
+       rss_num = attr->parent_attrib.rss_child_count;
+       rss_align_num = roundup_pow_of_two(rss_num);
+
+       if (rss_num > 1) {
+               /* RSS is requested */
+               if (!(dev->dev->caps.flags2 & MLX4_DEV_CAP_FLAG2_RSS))
+                       return -ENOSYS;
+               if (rss_align_num > dev->dev->caps.max_rss_tbl_sz)
+                       return -EINVAL;
+               /* We must work with power of two */
+               attr->parent_attrib.rss_child_count = rss_align_num;
+       }
+
+       qpg_data = kzalloc(sizeof *qpg_data, GFP_KERNEL);
+       if (!qpg_data)
+               return -ENOMEM;
+
+       err = mlx4_qp_reserve_range(dev->dev, tss_align_num,
+                                   tss_align_num, &tss_base);
+       if (err)
+               goto err1;
+
+       if (tss_num > 1) {
+               u32 alloc = BITS_TO_LONGS(tss_align_num) * sizeof(long);
+               qpg_data->tss_bitmap = kzalloc(alloc, GFP_KERNEL);
+               if (qpg_data->tss_bitmap == NULL) {
+                       err = -ENOMEM;
+                       goto err2;
+               }
+               bitmap_fill(qpg_data->tss_bitmap, tss_num);
+               /* Note parent takes first index */
+               clear_bit(0, qpg_data->tss_bitmap);
+       }
+
+       if (rss_num > 1) {
+               u32 alloc = BITS_TO_LONGS(rss_align_num) * sizeof(long);
+               err = mlx4_qp_reserve_range(dev->dev, rss_align_num,
+                                           rss_align_num, &rss_base);
+               if (err)
+                       goto err3;
+               qpg_data->rss_bitmap = kzalloc(alloc, GFP_KERNEL);
+               if (qpg_data->rss_bitmap == NULL) {
+                       err = -ENOMEM;
+                       goto err4;
+               }
+               bitmap_fill(qpg_data->rss_bitmap, rss_align_num);
+       }
+
+       qpg_data->tss_child_count = attr->parent_attrib.tss_child_count;
+       qpg_data->rss_child_count = attr->parent_attrib.rss_child_count;
+       qpg_data->qpg_parent = pqp;
+       qpg_data->qpg_tss_mask_sz = ilog2(tss_align_num);
+       qpg_data->tss_qpn_base = tss_base;
+       qpg_data->rss_qpn_base = rss_base;
+
+       pqp->qpg_data = qpg_data;
+       *qpn = tss_base;
+
+       return 0;
+
+err4:
+       mlx4_qp_release_range(dev->dev, rss_base, rss_align_num);
+
+err3:
+       if (tss_num > 1)
+               kfree(qpg_data->tss_bitmap);
+
+err2:
+       mlx4_qp_release_range(dev->dev, tss_base, tss_align_num);
+
+err1:
+       kfree(qpg_data);
+       return err;
+}
+
+static void free_qpg_parent(struct mlx4_ib_dev *dev, struct mlx4_ib_qp *pqp)
+{
+       struct mlx4_ib_qpg_data *qpg_data = pqp->qpg_data;
+       int align_num;
+
+       if (qpg_data->tss_child_count > 1)
+               kfree(qpg_data->tss_bitmap);
+
+       align_num = roundup_pow_of_two(1 + qpg_data->tss_child_count);
+       mlx4_qp_release_range(dev->dev, qpg_data->tss_qpn_base, align_num);
+
+       if (qpg_data->rss_child_count > 1) {
+               kfree(qpg_data->rss_bitmap);
+               align_num = roundup_pow_of_two(qpg_data->rss_child_count);
+               mlx4_qp_release_range(dev->dev, qpg_data->rss_qpn_base,
+                                       align_num);
+       }
+
+       kfree(qpg_data);
+}
+
+static int alloc_qpg_qpn(struct ib_qp_init_attr *init_attr,
+                        struct mlx4_ib_qp *pqp, int *qpn)
+{
+       struct mlx4_ib_qp *mqp = to_mqp(init_attr->qpg_parent);
+       struct mlx4_ib_qpg_data *qpg_data = mqp->qpg_data;
+       u32 idx, old;
+
+       switch (init_attr->qpg_type) {
+       case IB_QPG_CHILD_TX:
+               if (qpg_data->tss_child_count == 0)
+                       return -EINVAL;
+               do {
+                       /* Parent took index 0 */
+                       idx = find_first_bit(qpg_data->tss_bitmap,
+                                            qpg_data->tss_child_count + 1);
+                       if (idx >= qpg_data->tss_child_count + 1)
+                               return -ENOMEM;
+                       old = test_and_clear_bit(idx, qpg_data->tss_bitmap);
+               } while (old == 0);
+               idx += qpg_data->tss_qpn_base;
+               break;
+       case IB_QPG_CHILD_RX:
+               if (qpg_data->rss_child_count == 0)
+                       return -EINVAL;
+               do {
+                       idx = find_first_bit(qpg_data->rss_bitmap,
+                                            qpg_data->rss_child_count);
+                       if (idx >= qpg_data->rss_child_count)
+                               return -ENOMEM;
+                       old = test_and_clear_bit(idx, qpg_data->rss_bitmap);
+               } while (old == 0);
+               idx += qpg_data->rss_qpn_base;
+               break;
+       default:
+               return -EINVAL;
+       }
+
+       pqp->qpg_data = qpg_data;
+       *qpn = idx;
+
+       return 0;
+}
+
+static void free_qpg_qpn(struct mlx4_ib_qp *mqp, int qpn)
+{
+       struct mlx4_ib_qpg_data *qpg_data = mqp->qpg_data;
+
+       switch (mqp->qpg_type) {
+       case IB_QPG_CHILD_TX:
+               /* NOTE(review): no range check is performed before set_bit */
+               qpn -= qpg_data->tss_qpn_base;
+               set_bit(qpn, qpg_data->tss_bitmap);
+               break;
+       case IB_QPG_CHILD_RX:
+               qpn -= qpg_data->rss_qpn_base;
+               set_bit(qpn, qpg_data->rss_bitmap);
+               break;
+       default:
+               /* error */
+               pr_warn("wrong qpg type (%d)\n", mqp->qpg_type);
+               break;
+       }
+}
+
+static int alloc_qpn_common(struct mlx4_ib_dev *dev, struct mlx4_ib_qp *qp,
+                           struct ib_qp_init_attr *attr, int *qpn)
+{
+       int err = 0;
+
+       switch (attr->qpg_type) {
+       case IB_QPG_NONE:
+               /* Raw packet QPNs must be aligned to 8 bits. If not, the WQE
+                * BlueFlame setup flow wrongly causes VLAN insertion. */
+               if (attr->qp_type == IB_QPT_RAW_PACKET)
+                       err = mlx4_qp_reserve_range(dev->dev, 1, 1 << 8, qpn);
+               else
+                       err = mlx4_qp_reserve_range(dev->dev, 1, 1, qpn);
+               break;
+       case IB_QPG_PARENT:
+               err = init_qpg_parent(dev, qp, attr, qpn);
+               break;
+       case IB_QPG_CHILD_TX:
+       case IB_QPG_CHILD_RX:
+               err = alloc_qpg_qpn(attr, qp, qpn);
+               break;
+       default:
+               qp->qpg_type = IB_QPG_NONE;
+               err = -EINVAL;
+               break;
+       }
+       if (err)
+               return err;
+       qp->qpg_type = attr->qpg_type;
+       return 0;
+}
+
+static void free_qpn_common(struct mlx4_ib_dev *dev, struct mlx4_ib_qp *qp,
+                       enum ib_qpg_type qpg_type, int qpn)
+{
+       switch (qpg_type) {
+       case IB_QPG_NONE:
+               mlx4_qp_release_range(dev->dev, qpn, 1);
+               break;
+       case IB_QPG_PARENT:
+               free_qpg_parent(dev, qp);
+               break;
+       case IB_QPG_CHILD_TX:
+       case IB_QPG_CHILD_RX:
+               free_qpg_qpn(qp, qpn);
+               break;
+       default:
+               break;
+       }
+}
+
+/* Revert allocation on create_qp_common */
+static void unalloc_qpn_common(struct mlx4_ib_dev *dev, struct mlx4_ib_qp *qp,
+                              struct ib_qp_init_attr *attr, int qpn)
+{
+       free_qpn_common(dev, qp, attr->qpg_type, qpn);
+}
+
+static void release_qpn_common(struct mlx4_ib_dev *dev, struct mlx4_ib_qp *qp)
+{
+       free_qpn_common(dev, qp, qp->qpg_type, qp->mqp.qpn);
+}
+
 static int create_qp_common(struct mlx4_ib_dev *dev, struct ib_pd *pd,
                            struct ib_qp_init_attr *init_attr,
                            struct ib_udata *udata, int sqpn, struct mlx4_ib_qp 
*qp)
@@ -578,12 +815,7 @@ static int create_qp_common(struct mlx4_ib_dev *dev, 
struct ib_pd *pd,
        if (sqpn) {
                qpn = sqpn;
        } else {
-               /* Raw packet QPNs must be aligned to 8 bits. If not, the WQE
-                * BlueFlame setup flow wrongly causes VLAN insertion. */
-               if (init_attr->qp_type == IB_QPT_RAW_PACKET)
-                       err = mlx4_qp_reserve_range(dev->dev, 1, 1 << 8, &qpn);
-               else
-                       err = mlx4_qp_reserve_range(dev->dev, 1, 1, &qpn);
+               err = alloc_qpn_common(dev, qp, init_attr, &qpn);
                if (err)
                        goto err_wrid;
        }
@@ -607,8 +839,7 @@ static int create_qp_common(struct mlx4_ib_dev *dev, struct 
ib_pd *pd,
        return 0;
 
 err_qpn:
-       if (!sqpn)
-               mlx4_qp_release_range(dev->dev, qpn, 1);
+       unalloc_qpn_common(dev, qp, init_attr, qpn);
 
 err_wrid:
        if (pd->uobject) {
@@ -746,7 +977,7 @@ static void destroy_qp_common(struct mlx4_ib_dev *dev, 
struct mlx4_ib_qp *qp,
        mlx4_qp_free(dev->dev, &qp->mqp);
 
        if (!is_sqp(dev, qp))
-               mlx4_qp_release_range(dev->dev, qp->mqp.qpn, 1);
+               release_qpn_common(dev, qp);
 
        mlx4_mtt_cleanup(dev->dev, &qp->mtt);
 
@@ -766,6 +997,52 @@ static void destroy_qp_common(struct mlx4_ib_dev *dev, 
struct mlx4_ib_qp *qp,
        del_gid_entries(qp);
 }
 
+static int check_qpg_attr(struct mlx4_ib_dev *dev,
+                         struct ib_qp_init_attr *attr)
+{
+       if (attr->qpg_type == IB_QPG_NONE)
+               return 0;
+
+       if (attr->qp_type != IB_QPT_UD)
+               return -EINVAL;
+
+       if (attr->qpg_type == IB_QPG_PARENT) {
+               if (attr->parent_attrib.tss_child_count == 1)
+                       return -EINVAL; /* Doesn't make sense */
+               if (attr->parent_attrib.rss_child_count == 1)
+                       return -EINVAL; /* Doesn't make sense */
+               if ((attr->parent_attrib.tss_child_count == 0) &&
+                       (attr->parent_attrib.rss_child_count == 0))
+                       /* Should be called with IB_QPG_NONE */
+                       return -EINVAL;
+               if (attr->parent_attrib.rss_child_count > 1) {
+                       int rss_align_num;
+                       if (!(dev->dev->caps.flags2 & MLX4_DEV_CAP_FLAG2_RSS))
+                               return -ENOSYS;
+                       rss_align_num = roundup_pow_of_two(
+                                       attr->parent_attrib.rss_child_count);
+                       if (rss_align_num > dev->dev->caps.max_rss_tbl_sz)
+                               return -EINVAL;
+               }
+       } else {
+               struct mlx4_ib_qpg_data *qpg_data;
+               if (attr->qpg_parent == NULL)
+                       return -EINVAL;
+               if (IS_ERR(attr->qpg_parent))
+                       return -EINVAL;
+               qpg_data = to_mqp(attr->qpg_parent)->qpg_data;
+               if (qpg_data == NULL)
+                       return -EINVAL;
+               if (attr->qpg_type == IB_QPG_CHILD_TX &&
+                   !qpg_data->tss_child_count)
+                       return -EINVAL;
+               if (attr->qpg_type == IB_QPG_CHILD_RX &&
+                   !qpg_data->rss_child_count)
+                       return -EINVAL;
+       }
+       return 0;
+}
+
 struct ib_qp *mlx4_ib_create_qp(struct ib_pd *pd,
                                struct ib_qp_init_attr *init_attr,
                                struct ib_udata *udata)
@@ -787,8 +1064,9 @@ struct ib_qp *mlx4_ib_create_qp(struct ib_pd *pd,
            (udata || init_attr->qp_type != IB_QPT_UD))
                return ERR_PTR(-EINVAL);
 
-       if (init_attr->qpg_type != IB_QPG_NONE)
-               return ERR_PTR(-ENOSYS);
+       err = check_qpg_attr(to_mdev(pd->device), init_attr);
+       if (err)
+               return ERR_PTR(err);
 
        switch (init_attr->qp_type) {
        case IB_QPT_XRC_TGT:
@@ -1235,6 +1513,43 @@ static int __mlx4_ib_modify_qp(struct ib_qp *ibqp,
        if (!ibqp->uobject && cur_state == IB_QPS_RESET && new_state == 
IB_QPS_INIT)
                context->rlkey |= (1 << 4);
 
+       if ((attr_mask & IB_QP_GROUP_RSS) &&
+               (qp->qpg_data->rss_child_count > 1)) {
+               struct mlx4_ib_qpg_data *qpg_data = qp->qpg_data;
+               void *rss_context_base = &context->pri_path;
+               struct mlx4_rss_context *rss_context =
+                       (struct mlx4_rss_context *) (rss_context_base
+                                       + MLX4_RSS_OFFSET_IN_QPC_PRI_PATH);
+
+               context->flags |= cpu_to_be32(1 << MLX4_RSS_QPC_FLAG_OFFSET);
+
+               /* This should be tbl_sz_base_qpn */
+               rss_context->base_qpn = cpu_to_be32(qpg_data->rss_qpn_base |
+                               (ilog2(qpg_data->rss_child_count) << 24));
+               rss_context->default_qpn = cpu_to_be32(qpg_data->rss_qpn_base);
+               /* This should be flags_hash_fn */
+               rss_context->flags = MLX4_RSS_TCP_IPV6 |
+                                    MLX4_RSS_TCP_IPV4;
+               if (dev->dev->caps.flags & MLX4_DEV_CAP_FLAG_UDP_RSS) {
+                       rss_context->base_qpn_udp = rss_context->default_qpn;
+                       rss_context->flags |= MLX4_RSS_IPV6 |
+                                       MLX4_RSS_IPV4     |
+                                       MLX4_RSS_UDP_IPV6 |
+                                       MLX4_RSS_UDP_IPV4;
+               }
+               if (dev->dev->caps.flags2 & MLX4_DEV_CAP_FLAG2_RSS_TOP) {
+                       static const u32 rsskey[10] = { 0xD181C62C, 0xF7F4DB5B,
+                               0x1983A2FC, 0x943E1ADB, 0xD9389E6B, 0xD1039C2C,
+                               0xA74499AD, 0x593D56D9, 0xF3253C06, 0x2ADC1FFC};
+                       rss_context->hash_fn = MLX4_RSS_HASH_TOP;
+                       memcpy(rss_context->rss_key, rsskey,
+                               sizeof(rss_context->rss_key));
+               } else {
+                       rss_context->hash_fn = MLX4_RSS_HASH_XOR;
+                       memset(rss_context->rss_key, 0,
+                               sizeof(rss_context->rss_key));
+               }
+       }
        /*
         * Before passing a kernel QP to the HW, make sure that the
         * ownership bits of the send queue are set and the SQ
@@ -2191,6 +2506,11 @@ done:
        if (qp->flags & MLX4_IB_QP_LSO)
                qp_init_attr->create_flags |= IB_QP_CREATE_IPOIB_UD_LSO;
 
+       qp_init_attr->qpg_type = ibqp->qpg_type;
+       if (ibqp->qpg_type == IB_QPG_PARENT)
+               qp_attr->cap.qpg_tss_mask_sz = qp->qpg_data->qpg_tss_mask_sz;
+       else
+               qp_attr->cap.qpg_tss_mask_sz = 0;
 out:
        mutex_unlock(&qp->mutex);
        return err;
-- 
1.7.1

--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to [email protected]
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Reply via email to