mlx4: Do not allocate an extra (unneeded) CQE when creating a CQ.

The extra CQE can cause a huge waste of memory if requesting
a power-of-2 number of CQEs.

The number of CQEs in the cq that is returned to the kernel caller
is now a power-of-2.  The value returned to userspace callers
is the same as before, in order to preserve the ABI.

Signed-off-by: Jack Morgenstein <[EMAIL PROTECTED]>

---

Roland,
The previous patch neglected to increase the number of CQEs returned
to the verbs-layer caller by 1. If the mlx4 layer was invoked with a 
power of 2, the returned value was <power-of-2> - 1, which is not in
conformance with the IB spec.

This patch fixes that oversight.

In order to preserve the ABI, the kernel still returns <power-of-2> - 1 cqes
to a userspace caller; adjustments are made in userspace by libmlx4.

- Jack
Index: infiniband/drivers/infiniband/hw/mlx4/cq.c
===================================================================
--- infiniband.orig/drivers/infiniband/hw/mlx4/cq.c     2007-10-28 
09:34:17.055937000 +0200
+++ infiniband/drivers/infiniband/hw/mlx4/cq.c  2007-10-28 09:36:22.457431000 
+0200
@@ -80,10 +80,10 @@ static void *get_cqe(struct mlx4_ib_cq *
 
 static void *get_sw_cqe(struct mlx4_ib_cq *cq, int n)
 {
-       struct mlx4_cqe *cqe = get_cqe(cq, n & cq->ibcq.cqe);
+       struct mlx4_cqe *cqe = get_cqe(cq, n & (cq->ibcq.cqe - 1));
 
        return (!!(cqe->owner_sr_opcode & MLX4_CQE_OWNER_MASK) ^
-               !!(n & (cq->ibcq.cqe + 1))) ? NULL : cqe;
+               !!(n & cq->ibcq.cqe)) ? NULL : cqe;
 }
 
 static struct mlx4_cqe *next_cqe_sw(struct mlx4_ib_cq *cq)
@@ -108,8 +108,16 @@ struct ib_cq *mlx4_ib_create_cq(struct i
        if (!cq)
                return ERR_PTR(-ENOMEM);
 
-       entries      = roundup_pow_of_two(entries + 1);
-       cq->ibcq.cqe = entries - 1;
+       /* eliminate using extra CQE (for kernel space).
+        * For userspace, do in libmlx4, so that don't break ABI.
+        */
+       if (context) {
+               entries      = roundup_pow_of_two(entries + 1);
+               cq->ibcq.cqe = entries - 1;
+       } else {
+               entries      = roundup_pow_of_two(entries);
+               cq->ibcq.cqe = entries;
+       }
        buf_size     = entries * sizeof (struct mlx4_cqe);
        spin_lock_init(&cq->lock);
 
@@ -222,7 +230,7 @@ int mlx4_ib_destroy_cq(struct ib_cq *cq)
                mlx4_ib_db_unmap_user(to_mucontext(cq->uobject->context), 
&mcq->db);
                ib_umem_release(mcq->umem);
        } else {
-               mlx4_buf_free(dev->dev, (cq->cqe + 1) * sizeof (struct 
mlx4_cqe),
+               mlx4_buf_free(dev->dev, (cq->cqe) * sizeof (struct mlx4_cqe),
                              &mcq->buf.buf);
                mlx4_ib_db_free(dev, &mcq->db);
        }
@@ -489,7 +497,7 @@ void __mlx4_ib_cq_clean(struct mlx4_ib_c
         * from our QP and therefore don't need to be checked.
         */
        for (prod_index = cq->mcq.cons_index; get_sw_cqe(cq, prod_index); 
++prod_index)
-               if (prod_index == cq->mcq.cons_index + cq->ibcq.cqe)
+               if (prod_index == cq->mcq.cons_index + cq->ibcq.cqe - 1)
                        break;
 
        /*
@@ -497,13 +505,13 @@ void __mlx4_ib_cq_clean(struct mlx4_ib_c
         * that match our QP by copying older entries on top of them.
         */
        while ((int) --prod_index - (int) cq->mcq.cons_index >= 0) {
-               cqe = get_cqe(cq, prod_index & cq->ibcq.cqe);
+               cqe = get_cqe(cq, prod_index & (cq->ibcq.cqe - 1));
                if ((be32_to_cpu(cqe->my_qpn) & 0xffffff) == qpn) {
                        if (srq && !(cqe->owner_sr_opcode & 
MLX4_CQE_IS_SEND_MASK))
                                mlx4_ib_free_srq_wqe(srq, 
be16_to_cpu(cqe->wqe_index));
                        ++nfreed;
                } else if (nfreed) {
-                       dest = get_cqe(cq, (prod_index + nfreed) & 
cq->ibcq.cqe);
+                       dest = get_cqe(cq, (prod_index + nfreed) & 
(cq->ibcq.cqe - 1));
                        owner_bit = dest->owner_sr_opcode & MLX4_CQE_OWNER_MASK;
                        memcpy(dest, cqe, sizeof *cqe);
                        dest->owner_sr_opcode = owner_bit |
Index: infiniband/drivers/net/mlx4/main.c
===================================================================
--- infiniband.orig/drivers/net/mlx4/main.c     2007-10-28 09:34:17.077932000 
+0200
+++ infiniband/drivers/net/mlx4/main.c  2007-10-28 09:36:22.465430000 +0200
@@ -141,12 +141,7 @@ static int mlx4_dev_cap(struct mlx4_dev 
        dev->caps.max_sq_desc_sz     = dev_cap->max_sq_desc_sz;
        dev->caps.max_rq_desc_sz     = dev_cap->max_rq_desc_sz;
        dev->caps.num_qp_per_mgm     = MLX4_QP_PER_MGM;
-       /*
-        * Subtract 1 from the limit because we need to allocate a
-        * spare CQE so the HCA HW can tell the difference between an
-        * empty CQ and a full CQ.
-        */
-       dev->caps.max_cqes           = dev_cap->max_cq_sz - 1;
+       dev->caps.max_cqes           = dev_cap->max_cq_sz;
        dev->caps.reserved_cqs       = dev_cap->reserved_cqs;
        dev->caps.reserved_eqs       = dev_cap->reserved_eqs;
        dev->caps.reserved_mtts      = DIV_ROUND_UP(dev_cap->reserved_mtts,
_______________________________________________
general mailing list
[email protected]
http://lists.openfabrics.org/cgi-bin/mailman/listinfo/general

To unsubscribe, please visit http://openib.org/mailman/listinfo/openib-general

Reply via email to