QP buffers are allocated with mlx4_alloc_buf(), which rounds the buffer
size up to the page size and then allocates page-aligned memory using
posix_memalign().

  However, this allocation is quite wasteful on architectures using 64K pages
(ia64, for example) because we then exceed glibc's MMAP_THRESHOLD malloc
parameter and chunks are allocated using mmap(). Thus we end up allocating:

(requested size rounded to the page size) + (page size) + (malloc overhead)

rounded internally to the page size.

  So, for example, if we request a buffer of page_size bytes, we end up
consuming 3 pages. In short, for each QP buffer we allocate, there is an
overhead of 2 pages. This is especially visible on large clusters, where
the number of QPs can reach several thousand.
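
  To make the arithmetic above concrete, here is a small standalone program
(illustrative only, not part of the patch) that works through the old
scheme's consumption for a one-page request. The 64K page size and the
16-byte malloc bookkeeping overhead are assumed example values, not
measurements:

#include <stdio.h>

int main(void)
{
	size_t page_size = 64 * 1024;	/* e.g. ia64 */
	size_t requested = page_size;	/* a QP buffer of one page */
	size_t overhead  = 16;		/* assumed malloc chunk header */

	/* Old path: posix_memalign() above the mmap threshold ends up
	 * mapping (requested size) + (page size for alignment) + overhead,
	 * itself rounded up to the page size. */
	size_t mapped = (requested + page_size + overhead + page_size - 1)
			/ page_size * page_size;

	printf("requested %zu bytes, consumed %zu bytes (%zu pages)\n",
	       requested, mapped, mapped / page_size);

	return 0;
}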

  This patch introduces a new function, mlx4_alloc_page(), that uses mmap()
instead of posix_memalign(), and switches mlx4_alloc_qp_buf() over to it.
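
  For reference, a minimal standalone sketch (again separate from the patch)
showing why mmap() is sufficient here: an anonymous private mapping is
already page aligned, so a page-sized request consumes exactly one page and
carries no malloc bookkeeping:

#include <stdio.h>
#include <stdint.h>
#include <unistd.h>
#include <sys/mman.h>

int main(void)
{
	size_t page_size = sysconf(_SC_PAGESIZE);
	void *buf = mmap(NULL, page_size, PROT_READ | PROT_WRITE,
			 MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

	if (buf == MAP_FAILED) {
		perror("mmap");
		return 1;
	}

	printf("buf = %p, page aligned: %s\n", buf,
	       ((uintptr_t) buf % page_size == 0) ? "yes" : "no");

	munmap(buf, page_size);
	return 0;
}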

Signed-off-by: Sebastien Dugue <sebastien.du...@bull.net>
---
 src/buf.c  |   34 ++++++++++++++++++++++++++++++++--
 src/mlx4.h |    7 +++++++
 src/qp.c   |    5 +++--
 3 files changed, 42 insertions(+), 4 deletions(-)

diff --git a/src/buf.c b/src/buf.c
index 0e5f9b6..73565e6 100644
--- a/src/buf.c
+++ b/src/buf.c
@@ -35,6 +35,8 @@
 #endif /* HAVE_CONFIG_H */
 
 #include <stdlib.h>
+#include <sys/mman.h>
+#include <errno.h>
 
 #include "mlx4.h"
 
@@ -69,14 +71,42 @@ int mlx4_alloc_buf(struct mlx4_buf *buf, size_t size, int page_size)
        if (ret)
                free(buf->buf);
 
-       if (!ret)
+       if (!ret) {
                buf->length = size;
+               buf->type = MLX4_MALIGN;
+       }
 
        return ret;
 }
 
+int mlx4_alloc_page(struct mlx4_buf *buf, size_t size, int page_size)
+{
+       int ret;
+
+       /* Use mmap directly to allocate an aligned buffer */
+       buf->buf = mmap(0, align(size, page_size), PROT_READ | PROT_WRITE,
+                       MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
+
+       if (buf->buf == MAP_FAILED)
+               return errno;
+
+       ret = ibv_dontfork_range(buf->buf, size);
+       if (ret)
+               munmap(buf->buf, align(size, page_size));
+       else {
+               buf->length = size;
+               buf->type = MLX4_MMAP;
+       }
+
+       return ret;
+}
+
 void mlx4_free_buf(struct mlx4_buf *buf)
 {
        ibv_dofork_range(buf->buf, buf->length);
-       free(buf->buf);
+
+       if (buf->type == MLX4_MMAP)
+               munmap(buf->buf, buf->length);
+       else
+               free(buf->buf);
 }
diff --git a/src/mlx4.h b/src/mlx4.h
index 827a201..83547f5 100644
--- a/src/mlx4.h
+++ b/src/mlx4.h
@@ -161,9 +161,15 @@ struct mlx4_context {
        pthread_mutex_t                 db_list_mutex;
 };
 
+enum mlx4_buf_type {
+       MLX4_MMAP,
+       MLX4_MALIGN
+};
+
 struct mlx4_buf {
        void                           *buf;
        size_t                          length;
+       enum mlx4_buf_type              type;
 };
 
 struct mlx4_pd {
@@ -288,6 +294,7 @@ static inline struct mlx4_ah *to_mah(struct ibv_ah *ibah)
 }
 
 int mlx4_alloc_buf(struct mlx4_buf *buf, size_t size, int page_size);
+int mlx4_alloc_page(struct mlx4_buf *buf, size_t size, int page_size);
 void mlx4_free_buf(struct mlx4_buf *buf);
 
 uint32_t *mlx4_alloc_db(struct mlx4_context *context, enum mlx4_db_type type);
diff --git a/src/qp.c b/src/qp.c
index d194ae3..557e255 100644
--- a/src/qp.c
+++ b/src/qp.c
@@ -604,8 +604,9 @@ int mlx4_alloc_qp_buf(struct ibv_pd *pd, struct ibv_qp_cap *cap,
                qp->sq.offset = 0;
        }
 
-       if (mlx4_alloc_buf(&qp->buf,
-                           align(qp->buf_size, to_mdev(pd->context->device)->page_size),
+       if (mlx4_alloc_page(&qp->buf,
+                           align(qp->buf_size,
+                                 to_mdev(pd->context->device)->page_size),
                            to_mdev(pd->context->device)->page_size)) {
                free(qp->sq.wrid);
                free(qp->rq.wrid);
-- 
1.6.3.1
