QP buffers are allocated with mlx4_alloc_buf(), which rounds the buffer size to the page size and then allocates page aligned memory using posix_memalign().
However, this allocation is quite wasteful on architectures using 64K pages (ia64 for example) because we then hit glibc's MMAP_THRESHOLD malloc parameter and chunks are allocated using mmap. Thus we end up allocating: (requested size rounded to the page size) + (page size) + (malloc overhead) rounded internally to the page size. So for example, if we request a buffer of page_size bytes, we end up consuming 3 pages. In short, for each QP buffer we allocate, there is an overhead of 2 pages. This is quite visible on large clusters especially where the number of QP can reach several thousand. This patch creates a new function mlx4_alloc_page() for use by mlx4_alloc_qp_buf() that does an mmap() instead of a posix_memalign(). Signed-off-by: Sebastien Dugue <sebastien.du...@bull.net> --- src/buf.c | 34 ++++++++++++++++++++++++++++++++-- src/mlx4.h | 7 +++++++ src/qp.c | 5 +++-- 3 files changed, 42 insertions(+), 4 deletions(-) diff --git a/src/buf.c b/src/buf.c index 0e5f9b6..73565e6 100644 --- a/src/buf.c +++ b/src/buf.c @@ -35,6 +35,8 @@ #endif /* HAVE_CONFIG_H */ #include <stdlib.h> +#include <sys/mman.h> +#include <errno.h> #include "mlx4.h" @@ -69,14 +71,42 @@ int mlx4_alloc_buf(struct mlx4_buf *buf, size_t size, int page_size) if (ret) free(buf->buf); - if (!ret) + if (!ret) { buf->length = size; + buf->type = MLX4_MALIGN; + } return ret; } +int mlx4_alloc_page(struct mlx4_buf *buf, size_t size, int page_size) +{ + int ret; + + /* Use mmap directly to allocate an aligned buffer */ + buf->buf = mmap(NULL, align(size, page_size), PROT_READ | PROT_WRITE, + MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); + + if (buf->buf == MAP_FAILED) + return errno; + + ret = ibv_dontfork_range(buf->buf, size); + if (ret) + munmap(buf->buf, align(size, page_size)); + else { + buf->length = size; + buf->type = MLX4_MMAP; + } + + return ret; +} + void mlx4_free_buf(struct mlx4_buf *buf) { ibv_dofork_range(buf->buf, buf->length); - free(buf->buf); + + if (buf->type == MLX4_MMAP) + 
munmap(buf->buf, buf->length); + else + free(buf->buf); } diff --git a/src/mlx4.h b/src/mlx4.h index 827a201..83547f5 100644 --- a/src/mlx4.h +++ b/src/mlx4.h @@ -161,9 +161,15 @@ struct mlx4_context { pthread_mutex_t db_list_mutex; }; +enum mlx4_buf_type { + MLX4_MMAP, + MLX4_MALIGN +}; + struct mlx4_buf { void *buf; size_t length; + enum mlx4_buf_type type; }; struct mlx4_pd { @@ -288,6 +294,7 @@ static inline struct mlx4_ah *to_mah(struct ibv_ah *ibah) } int mlx4_alloc_buf(struct mlx4_buf *buf, size_t size, int page_size); +int mlx4_alloc_page(struct mlx4_buf *buf, size_t size, int page_size); void mlx4_free_buf(struct mlx4_buf *buf); uint32_t *mlx4_alloc_db(struct mlx4_context *context, enum mlx4_db_type type); diff --git a/src/qp.c b/src/qp.c index d194ae3..557e255 100644 --- a/src/qp.c +++ b/src/qp.c @@ -604,8 +604,9 @@ int mlx4_alloc_qp_buf(struct ibv_pd *pd, struct ibv_qp_cap *cap, qp->sq.offset = 0; } - if (mlx4_alloc_buf(&qp->buf, - align(qp->buf_size, to_mdev(pd->context->device)->page_size), + if (mlx4_alloc_page(&qp->buf, + align(qp->buf_size, + to_mdev(pd->context->device)->page_size), to_mdev(pd->context->device)->page_size)) { free(qp->sq.wrid); free(qp->rq.wrid); -- 1.6.3.1 _______________________________________________ general mailing list general@lists.openfabrics.org http://lists.openfabrics.org/cgi-bin/mailman/listinfo/general To unsubscribe, please visit http://openib.org/mailman/listinfo/openib-general