Do not use memcpy() when copying to the BlueFlame buffer.
Some memcpy() implementations use string-move (byte-wise copy)
assembler instructions, which do not guarantee the order in which
data is written into the BlueFlame buffer. Use a tight for-loop
instead, copying in units of mlx4_wc_copy_t (uint64_t when
SIZEOF_LONG == 8, uint32_t otherwise).
As a side effect, this patch also slightly improves latency.
Signed-off-by: Jack Morgenstein <[EMAIL PROTECTED]>
---
diff --git a/src/doorbell.h b/src/doorbell.h
index 3171e76..c89ef0e 100644
--- a/src/doorbell.h
+++ b/src/doorbell.h
@@ -35,6 +35,8 @@
#if SIZEOF_LONG == 8
+typedef uint64_t mlx4_wc_copy_t;
+
#if __BYTE_ORDER == __LITTLE_ENDIAN
# define MLX4_PAIR_TO_64(val) ((uint64_t) val[1] << 32 | val[0])
#elif __BYTE_ORDER == __BIG_ENDIAN
@@ -50,6 +52,8 @@ static inline void mlx4_write64(uint32_t val[2], struct
mlx4_context *ctx, int o
#else
+typedef uint32_t mlx4_wc_copy_t;
+
static inline void mlx4_write64(uint32_t val[2], struct mlx4_context *ctx, int
offset)
{
pthread_spin_lock(&ctx->uar_lock);
diff --git a/src/qp.c b/src/qp.c
index bced740..8fc8450 100644
--- a/src/qp.c
+++ b/src/qp.c
@@ -391,7 +391,23 @@ out:
pthread_spin_lock(&ctx->bf_lock);
- memcpy(ctx->bf_page + ctx->bf_offset, ctrl, align(size * 16,
64));
+ /*
+ * Avoid using memcpy to copy to BlueFlame page, since recent
+ * memcpy implementations use move-string-buffer assembler
+ * instructions, which do not guarantee order of copying.
+ */
+
+ {
+ mlx4_wc_copy_t *target =
+ (mlx4_wc_copy_t *) (ctx->bf_page +
ctx->bf_offset);
+ mlx4_wc_copy_t *src = (mlx4_wc_copy_t *) ctrl;
+ int n = align(size * 16, 64) / (sizeof(mlx4_wc_copy_t)
* 2);
+ for (; n; --n) {
+ *target++ = *src++;
+ *target++ = *src++;
+ }
+ }
+
wc_wmb();
ctx->bf_offset ^= ctx->bf_buf_size;
_______________________________________________
general mailing list
[email protected]
http://lists.openfabrics.org/cgi-bin/mailman/listinfo/general
To unsubscribe, please visit http://openib.org/mailman/listinfo/openib-general