OK, I added the patch below to my tree.  I cleaned up Jack's patch a
little and it seems to work for me; I hope I didn't break anything.

commit 4a36e85ada9307b9f5d16df3856cdcfce1e9c5f0
Author: Jack Morgenstein <[EMAIL PROTECTED]>
Date:   Wed Sep 19 09:52:25 2007 -0700

    IB/mlx4: Fix data corruption triggered by wrong headroom marking order
    
    This is an addendum to commit 0e6e7416 ("IB/mlx4: Handle new FW
    requirement for send request prefetching").  We also need to handle
    prefetch marking properly for S/G segments, or else the HCA may end up
    processing S/G segments that are not fully written and end up sending
    the wrong data.
    
    We write S/G segments in reverse order into the WQE, in order to
    guarantee that the first dword of all cachelines containing S/G
    segments is written last (overwriting the headroom invalidation
    pattern).  The entire cacheline will thus contain valid data when the
    invalidation pattern is overwritten.
    
    Signed-off-by: Jack Morgenstein <[EMAIL PROTECTED]>
    Signed-off-by: Roland Dreier <[EMAIL PROTECTED]>
---
 drivers/infiniband/hw/mlx4/qp.c |   69 +++++++++++++++++++++++++++++++-------
 1 files changed, 56 insertions(+), 13 deletions(-)

diff --git a/drivers/infiniband/hw/mlx4/qp.c b/drivers/infiniband/hw/mlx4/qp.c
index 6c0ced2..f51c1fc 100644
--- a/drivers/infiniband/hw/mlx4/qp.c
+++ b/drivers/infiniband/hw/mlx4/qp.c
@@ -1211,20 +1211,58 @@ static void set_datagram_seg(struct mlx4_wqe_datagram_seg *dseg,
        dseg->qkey = cpu_to_be32(wr->wr.ud.remote_qkey);
 }
 
-static __always_inline void set_data_seg(struct mlx4_wqe_data_seg *dseg,
-                                        struct ib_sge *sg)
+static void set_mlx_icrc_seg(void *dseg)
+{
+       u32 *t = dseg;
+       struct mlx4_wqe_inline_seg *iseg = dseg;
+
+       t[1] = 0;
+
+       /*
+        * Need a barrier here before writing the byte_count field to
+        * make sure that all the data is visible before the
+        * byte_count field is set.  Otherwise, if the segment begins
+        * a new cacheline, the HCA prefetcher could grab the 64-byte
+        * chunk and get a valid (!= 0xffffffff) byte count but
+        * stale data, and end up sending the wrong data.
+        */
+       wmb();
+
+       iseg->byte_count = cpu_to_be32((1 << 31) | 4);
+}
+
+static void __set_data_seg(struct mlx4_wqe_data_seg *dseg, struct ib_sge *sg)
 {
        dseg->byte_count = cpu_to_be32(sg->length);
        dseg->lkey       = cpu_to_be32(sg->lkey);
        dseg->addr       = cpu_to_be64(sg->addr);
 }
 
+static void set_data_seg(struct mlx4_wqe_data_seg *dseg, struct ib_sge *sg)
+{
+       dseg->lkey       = cpu_to_be32(sg->lkey);
+       dseg->addr       = cpu_to_be64(sg->addr);
+
+       /*
+        * Need a barrier here before writing the byte_count field to
+        * make sure that all the data is visible before the
+        * byte_count field is set.  Otherwise, if the segment begins
+        * a new cacheline, the HCA prefetcher could grab the 64-byte
+        * chunk and get a valid (!= 0xffffffff) byte count but
+        * stale data, and end up sending the wrong data.
+        */
+       wmb();
+
+       dseg->byte_count = cpu_to_be32(sg->length);
+}
+
 int mlx4_ib_post_send(struct ib_qp *ibqp, struct ib_send_wr *wr,
                      struct ib_send_wr **bad_wr)
 {
        struct mlx4_ib_qp *qp = to_mqp(ibqp);
        void *wqe;
        struct mlx4_wqe_ctrl_seg *ctrl;
+       struct mlx4_wqe_data_seg *dseg;
        unsigned long flags;
        int nreq;
        int err = 0;
@@ -1324,22 +1362,27 @@ int mlx4_ib_post_send(struct ib_qp *ibqp, struct ib_send_wr *wr,
                        break;
                }
 
-               for (i = 0; i < wr->num_sge; ++i) {
-                       set_data_seg(wqe, wr->sg_list + i);
+               /*
+                * Write data segments in reverse order, so as to
+                * overwrite cacheline stamp last within each
+                * cacheline.  This avoids issues with WQE
+                * prefetching.
+                */
 
-                       wqe  += sizeof (struct mlx4_wqe_data_seg);
-                       size += sizeof (struct mlx4_wqe_data_seg) / 16;
-               }
+               dseg = wqe;
+               dseg += wr->num_sge - 1;
+               size += wr->num_sge * (sizeof (struct mlx4_wqe_data_seg) / 16);
 
                /* Add one more inline data segment for ICRC for MLX sends */
-               if (qp->ibqp.qp_type == IB_QPT_SMI || qp->ibqp.qp_type == IB_QPT_GSI) {
-                       ((struct mlx4_wqe_inline_seg *) wqe)->byte_count =
-                               cpu_to_be32((1 << 31) | 4);
-                       ((u32 *) wqe)[1] = 0;
-                       wqe  += sizeof (struct mlx4_wqe_data_seg);
+               if (unlikely(qp->ibqp.qp_type == IB_QPT_SMI ||
+                            qp->ibqp.qp_type == IB_QPT_GSI)) {
+                       set_mlx_icrc_seg(dseg + 1);
                        size += sizeof (struct mlx4_wqe_data_seg) / 16;
                }
 
+               for (i = wr->num_sge - 1; i >= 0; --i, --dseg)
+                       set_data_seg(dseg, wr->sg_list + i);
+
                ctrl->fence_size = (wr->send_flags & IB_SEND_FENCE ?
                                    MLX4_WQE_CTRL_FENCE : 0) | size;
 
@@ -1429,7 +1472,7 @@ int mlx4_ib_post_recv(struct ib_qp *ibqp, struct ib_recv_wr *wr,
                scat = get_recv_wqe(qp, ind);
 
                for (i = 0; i < wr->num_sge; ++i)
-                       set_data_seg(scat + i, wr->sg_list + i);
+                       __set_data_seg(scat + i, wr->sg_list + i);
 
                if (i < qp->rq.max_gs) {
                        scat[i].byte_count = 0;