QP buffers are allocated with mthca_alloc_buf(), which rounds the buffer
size up to the page size and then allocates page-aligned memory using
posix_memalign().

  However, this allocation is quite wasteful on architectures using 64K pages
(ia64 for example), because we then hit glibc's MMAP_THRESHOLD malloc
parameter and chunks are allocated using mmap(). Thus we end up allocating:

(requested size rounded to the page size) + (page size) + (malloc overhead)

rounded internally to the page size.

  So, for example, if we request a buffer of page_size bytes, we end up
consuming 3 pages. In short, for each QP buffer we allocate, there is an
overhead of 2 pages. This is quite visible, especially on large clusters where
the number of QPs can reach several thousand.

  This patch creates a new function mthca_alloc_page() for use by
mthca_alloc_qp_buf() that does an mmap() instead of a posix_memalign().

Signed-off-by: Sebastien Dugue <sebastien.du...@bull.net>
---
 src/buf.c   |   34 ++++++++++++++++++++++++++++++++--
 src/mthca.h |    7 +++++++
 src/qp.c    |    7 ++++---
 3 files changed, 43 insertions(+), 5 deletions(-)

diff --git a/src/buf.c b/src/buf.c
index 6c1be4f..499edeb 100644
--- a/src/buf.c
+++ b/src/buf.c
@@ -35,6 +35,8 @@
 #endif /* HAVE_CONFIG_H */
 
 #include <stdlib.h>
+#include <sys/mman.h>
+#include <errno.h>
 
 #include "mthca.h"
 
@@ -69,8 +71,32 @@ int mthca_alloc_buf(struct mthca_buf *buf, size_t size, int page_size)
        if (ret)
                free(buf->buf);
 
-       if (!ret)
+       if (!ret) {
                buf->length = size;
+               buf->type = MTHCA_MALIGN;
+       }
+
+       return ret;
+}
+
+int mthca_alloc_page(struct mthca_buf *buf, size_t size, int page_size)
+{
+       int ret;
+
+       /* Use mmap directly to allocate an aligned buffer */
+       buf->buf = mmap(0 ,align(size, page_size) , PROT_READ | PROT_WRITE,
+                       MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
+
+       if (buf->buf == MAP_FAILED)
+               return errno;
+
+       ret = ibv_dontfork_range(buf->buf, size);
+       if (ret)
+               munmap(buf->buf, align(size, page_size));
+       else {
+               buf->length = size;
+               buf->type = MTHCA_MMAP;
+       }
 
        return ret;
 }
@@ -78,5 +104,9 @@ int mthca_alloc_buf(struct mthca_buf *buf, size_t size, int page_size)
 void mthca_free_buf(struct mthca_buf *buf)
 {
        ibv_dofork_range(buf->buf, buf->length);
-       free(buf->buf);
+
+       if ( buf->type == MTHCA_MMAP )
+               munmap(buf->buf, buf->length);
+       else
+               free(buf->buf);
 }
diff --git a/src/mthca.h b/src/mthca.h
index 66751f3..7db15a7 100644
--- a/src/mthca.h
+++ b/src/mthca.h
@@ -138,9 +138,15 @@ struct mthca_context {
        int                    qp_table_mask;
 };
 
+enum mthca_buf_type {
+       MTHCA_MMAP,
+       MTHCA_MALIGN
+};
+
 struct mthca_buf {
        void                   *buf;
        size_t                  length;
+       enum mthca_buf_type     type;
 };
 
 struct mthca_pd {
@@ -291,6 +297,7 @@ static inline int mthca_is_memfree(struct ibv_context *ibctx)
 }
 
 int mthca_alloc_buf(struct mthca_buf *buf, size_t size, int page_size);
+int mthca_alloc_page(struct mthca_buf *buf, size_t size, int page_size);
 void mthca_free_buf(struct mthca_buf *buf);
 
 int mthca_alloc_db(struct mthca_db_table *db_tab, enum mthca_db_type type,
diff --git a/src/qp.c b/src/qp.c
index 84dd206..15f4805 100644
--- a/src/qp.c
+++ b/src/qp.c
@@ -848,9 +848,10 @@ int mthca_alloc_qp_buf(struct ibv_pd *pd, struct ibv_qp_cap *cap,
 
        qp->buf_size = qp->send_wqe_offset + (qp->sq.max << qp->sq.wqe_shift);
 
-       if (mthca_alloc_buf(&qp->buf,
-                           align(qp->buf_size, to_mdev(pd->context->device)->page_size),
-                           to_mdev(pd->context->device)->page_size)) {
+       if (mthca_alloc_page(&qp->buf,
+                            align(qp->buf_size,
+                                  to_mdev(pd->context->device)->page_size),
+                            to_mdev(pd->context->device)->page_size)) {
                free(qp->wrid);
                return -1;
        }
-- 
1.6.3.1

_______________________________________________
general mailing list
general@lists.openfabrics.org
http://lists.openfabrics.org/cgi-bin/mailman/listinfo/general

To unsubscribe, please visit http://openib.org/mailman/listinfo/openib-general

Reply via email to