Gitweb:     http://git.kernel.org/git/?p=linux/kernel/git/torvalds/linux-2.6.git;a=commit;h=6e694ea33e7a7fad908d188c46f441f04fb633d4
Commit:     6e694ea33e7a7fad908d188c46f441f04fb633d4
Parent:     40ffbfad6bb79a99cc7627bdaca0ee22dec526f6
Author:     Jack Morgenstein <[EMAIL PROTECTED]>
AuthorDate: Wed Sep 19 09:52:25 2007 -0700
Committer:  Roland Dreier <[EMAIL PROTECTED]>
CommitDate: Sun Sep 23 13:03:22 2007 -0700

    IB/mlx4: Fix data corruption triggered by wrong headroom marking order
    
    This is an addendum to commit 0e6e7416 ("IB/mlx4: Handle new FW
    requirement for send request prefetching").  We also need to handle
    prefetch marking properly for S/G segments, or else the HCA may end up
    processing S/G segments that are not fully written and end up sending
    the wrong data.  This can actually cause data corruption in practice,
    especially on systems with relatively slow CPUs (where the HCA is more
    likely to prefetch while the CPU is in the middle of writing a work
    request into memory).
    
    We write S/G segments in reverse order into the WQE, in order to
    guarantee that the first dword of all cachelines containing S/G
    segments is written last (overwriting the headroom invalidation
    pattern).  The entire cacheline will thus contain valid data when the
    invalidation pattern is overwritten.
    
    Signed-off-by: Jack Morgenstein <[EMAIL PROTECTED]>
    Signed-off-by: Roland Dreier <[EMAIL PROTECTED]>
---
 drivers/infiniband/hw/mlx4/qp.c |   62 ++++++++++++++++++++++++++++++--------
 1 files changed, 49 insertions(+), 13 deletions(-)

diff --git a/drivers/infiniband/hw/mlx4/qp.c b/drivers/infiniband/hw/mlx4/qp.c
index ba0428d..85c51bd 100644
--- a/drivers/infiniband/hw/mlx4/qp.c
+++ b/drivers/infiniband/hw/mlx4/qp.c
@@ -1211,12 +1211,42 @@ static void set_datagram_seg(struct mlx4_wqe_datagram_seg *dseg,
        dseg->qkey = cpu_to_be32(wr->wr.ud.remote_qkey);
 }
 
-static void set_data_seg(struct mlx4_wqe_data_seg *dseg,
-                        struct ib_sge *sg)
+static void set_mlx_icrc_seg(void *dseg)
+{
+       u32 *t = dseg;
+       struct mlx4_wqe_inline_seg *iseg = dseg;
+
+       t[1] = 0;
+
+       /*
+        * Need a barrier here before writing the byte_count field to
+        * make sure that all the data is visible before the
+        * byte_count field is set.  Otherwise, if the segment begins
+        * a new cacheline, the HCA prefetcher could grab the 64-byte
+        * chunk and get a valid (!= * 0xffffffff) byte count but
+        * stale data, and end up sending the wrong data.
+        */
+       wmb();
+
+       iseg->byte_count = cpu_to_be32((1 << 31) | 4);
+}
+
+static void set_data_seg(struct mlx4_wqe_data_seg *dseg, struct ib_sge *sg)
 {
-       dseg->byte_count = cpu_to_be32(sg->length);
        dseg->lkey       = cpu_to_be32(sg->lkey);
        dseg->addr       = cpu_to_be64(sg->addr);
+
+       /*
+        * Need a barrier here before writing the byte_count field to
+        * make sure that all the data is visible before the
+        * byte_count field is set.  Otherwise, if the segment begins
+        * a new cacheline, the HCA prefetcher could grab the 64-byte
+        * chunk and get a valid (!= * 0xffffffff) byte count but
+        * stale data, and end up sending the wrong data.
+        */
+       wmb();
+
+       dseg->byte_count = cpu_to_be32(sg->length);
 }
 
 int mlx4_ib_post_send(struct ib_qp *ibqp, struct ib_send_wr *wr,
@@ -1225,6 +1255,7 @@ int mlx4_ib_post_send(struct ib_qp *ibqp, struct ib_send_wr *wr,
        struct mlx4_ib_qp *qp = to_mqp(ibqp);
        void *wqe;
        struct mlx4_wqe_ctrl_seg *ctrl;
+       struct mlx4_wqe_data_seg *dseg;
        unsigned long flags;
        int nreq;
        int err = 0;
@@ -1324,22 +1355,27 @@ int mlx4_ib_post_send(struct ib_qp *ibqp, struct ib_send_wr *wr,
                        break;
                }
 
-               for (i = 0; i < wr->num_sge; ++i) {
-                       set_data_seg(wqe, wr->sg_list + i);
+               /*
+                * Write data segments in reverse order, so as to
+                * overwrite cacheline stamp last within each
+                * cacheline.  This avoids issues with WQE
+                * prefetching.
+                */
 
-                       wqe  += sizeof (struct mlx4_wqe_data_seg);
-                       size += sizeof (struct mlx4_wqe_data_seg) / 16;
-               }
+               dseg = wqe;
+               dseg += wr->num_sge - 1;
+               size += wr->num_sge * (sizeof (struct mlx4_wqe_data_seg) / 16);
 
                /* Add one more inline data segment for ICRC for MLX sends */
-               if (qp->ibqp.qp_type == IB_QPT_SMI || qp->ibqp.qp_type == IB_QPT_GSI) {
-                       ((struct mlx4_wqe_inline_seg *) wqe)->byte_count =
-                               cpu_to_be32((1 << 31) | 4);
-                       ((u32 *) wqe)[1] = 0;
-                       wqe  += sizeof (struct mlx4_wqe_data_seg);
+               if (unlikely(qp->ibqp.qp_type == IB_QPT_SMI ||
+                            qp->ibqp.qp_type == IB_QPT_GSI)) {
+                       set_mlx_icrc_seg(dseg + 1);
                        size += sizeof (struct mlx4_wqe_data_seg) / 16;
                }
 
+               for (i = wr->num_sge - 1; i >= 0; --i, --dseg)
+                       set_data_seg(dseg, wr->sg_list + i);
+
                ctrl->fence_size = (wr->send_flags & IB_SEND_FENCE ?
                                    MLX4_WQE_CTRL_FENCE : 0) | size;
 
-
To unsubscribe from this list: send the line "unsubscribe git-commits-head" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Reply via email to