QP buffers are allocated with mthca_alloc_buf(), which rounds the buffer size up to the page size and then allocates page-aligned memory using posix_memalign().
However, this allocation is quite wasteful on architectures using 64K pages (ia64, for example), because we then hit glibc's MMAP_THRESHOLD malloc parameter and chunks are allocated using mmap(). Thus we end up allocating:

  (requested size rounded to the page size) + (page size) + (malloc overhead)

rounded internally to the page size. So, for example, if we request a buffer of page_size bytes, we end up consuming 3 pages. In short, for each QP buffer we allocate, there is an overhead of 2 pages. This is especially visible on large clusters, where the number of QPs can reach several thousand.

This patch creates a new function, mthca_alloc_page(), for use by mthca_alloc_qp_buf(), which does an mmap() instead of a posix_memalign().

Signed-off-by: Sebastien Dugue <sebastien.du...@bull.net>
---
 src/buf.c   |   34 ++++++++++++++++++++++++++++++++--
 src/mthca.h |    7 +++++++
 src/qp.c    |    7 ++++---
 3 files changed, 43 insertions(+), 5 deletions(-)

diff --git a/src/buf.c b/src/buf.c
index 6c1be4f..499edeb 100644
--- a/src/buf.c
+++ b/src/buf.c
@@ -35,6 +35,8 @@
 #endif /* HAVE_CONFIG_H */
 
 #include <stdlib.h>
+#include <sys/mman.h>
+#include <errno.h>
 
 #include "mthca.h"
 
@@ -69,8 +71,32 @@ int mthca_alloc_buf(struct mthca_buf *buf, size_t size, int page_size)
 	if (ret)
 		free(buf->buf);
 
-	if (!ret)
+	if (!ret) {
 		buf->length = size;
+		buf->type = MTHCA_MALIGN;
+	}
+
+	return ret;
+}
+
+int mthca_alloc_page(struct mthca_buf *buf, size_t size, int page_size)
+{
+	int ret;
+
+	/* Use mmap directly to allocate an aligned buffer */
+	buf->buf = mmap(0, align(size, page_size), PROT_READ | PROT_WRITE,
+			MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
+
+	if (buf->buf == MAP_FAILED)
+		return errno;
+
+	ret = ibv_dontfork_range(buf->buf, size);
+	if (ret)
+		munmap(buf->buf, align(size, page_size));
+	else {
+		buf->length = size;
+		buf->type = MTHCA_MMAP;
+	}
 
 	return ret;
 }
@@ -78,5 +104,9 @@ int mthca_alloc_buf(struct mthca_buf *buf, size_t size, int page_size)
 void mthca_free_buf(struct mthca_buf *buf)
 {
 	ibv_dofork_range(buf->buf, buf->length);
-	free(buf->buf);
+
+	if (buf->type == MTHCA_MMAP)
+		munmap(buf->buf, buf->length);
+	else
+		free(buf->buf);
 }
diff --git a/src/mthca.h b/src/mthca.h
index 66751f3..7db15a7 100644
--- a/src/mthca.h
+++ b/src/mthca.h
@@ -138,9 +138,15 @@ struct mthca_context {
 	int			qp_table_mask;
 };
 
+enum mthca_buf_type {
+	MTHCA_MMAP,
+	MTHCA_MALIGN
+};
+
 struct mthca_buf {
 	void			*buf;
 	size_t			length;
+	enum mthca_buf_type	type;
 };
 
 struct mthca_pd {
@@ -291,6 +297,7 @@ static inline int mthca_is_memfree(struct ibv_context *ibctx)
 }
 
 int mthca_alloc_buf(struct mthca_buf *buf, size_t size, int page_size);
+int mthca_alloc_page(struct mthca_buf *buf, size_t size, int page_size);
 void mthca_free_buf(struct mthca_buf *buf);
 
 int mthca_alloc_db(struct mthca_db_table *db_tab, enum mthca_db_type type,
diff --git a/src/qp.c b/src/qp.c
index 84dd206..15f4805 100644
--- a/src/qp.c
+++ b/src/qp.c
@@ -848,9 +848,10 @@ int mthca_alloc_qp_buf(struct ibv_pd *pd, struct ibv_qp_cap *cap,
 	qp->buf_size = qp->send_wqe_offset + (qp->sq.max << qp->sq.wqe_shift);
 
-	if (mthca_alloc_buf(&qp->buf,
-			    align(qp->buf_size, to_mdev(pd->context->device)->page_size),
-			    to_mdev(pd->context->device)->page_size)) {
+	if (mthca_alloc_page(&qp->buf,
+			     align(qp->buf_size,
+				   to_mdev(pd->context->device)->page_size),
+			     to_mdev(pd->context->device)->page_size)) {
 		free(qp->wrid);
 		return -1;
 	}
 
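To spell out the overhead arithmetic from the description on a 64K-page machine (illustrative only; the exact accounting depends on the glibc version):

    requested size, rounded to the page size        64K  (1 page)
  + one extra page consumed for alignment           64K  (1 page)
  + malloc overhead, total rounded to page size     64K  (1 page)
                                                   ----
    total consumed for a one-page request          192K  (3 pages)

With the mmap() path, the same request maps exactly align(size, page_size), i.e. one 64K page.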
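For reference, below is a minimal standalone sketch of the same technique outside the driver. The names demo_alloc_page(), demo_free_page() and align_up() are made up for this illustration and are not part of the patch, and the ibv_dontfork_range()/ibv_dofork_range() calls are omitted since they require libibverbs. The point it demonstrates: an anonymous private mmap() is always page aligned and consumes only the rounded-up size, with no allocator overhead.

/* Minimal sketch (not part of the patch): page-aligned allocation via
 * anonymous mmap(), as mthca_alloc_page() does above.  Assumes
 * page_size is a power of two, as on Linux.
 */
#include <stdint.h>
#include <stdio.h>
#include <string.h>
#include <sys/mman.h>
#include <unistd.h>

static size_t align_up(size_t size, size_t page_size)
{
	return (size + page_size - 1) & ~(page_size - 1);
}

static void *demo_alloc_page(size_t size, size_t page_size)
{
	/* An anonymous private mapping is page aligned and zero-filled;
	 * it consumes only the rounded-up size. */
	void *buf = mmap(NULL, align_up(size, page_size),
			 PROT_READ | PROT_WRITE,
			 MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

	return buf == MAP_FAILED ? NULL : buf;
}

static void demo_free_page(void *buf, size_t size, size_t page_size)
{
	munmap(buf, align_up(size, page_size));
}

int main(void)
{
	size_t page_size = (size_t) sysconf(_SC_PAGESIZE);
	void *buf = demo_alloc_page(page_size, page_size);

	if (!buf)
		return 1;

	printf("page_size=%zu buf=%p aligned=%s\n", page_size, buf,
	       (uintptr_t) buf % page_size == 0 ? "yes" : "no");
	memset(buf, 0, page_size);	/* buffer is usable as usual */

	demo_free_page(buf, page_size, page_size);
	return 0;
}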