Introduce a new structure, 'cmem', which represents contiguously
allocated memory. It supports allocate, free, and 'map to virtual
address' operations.
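
For example, a provider's mmap handler could use this API roughly as
follows (a minimal sketch; 'example_mmap', the chosen block order, and
the error handling are illustrative, not part of this patch):

  static int example_mmap(struct ib_ucontext *context,
                          struct vm_area_struct *vma)
  {
          struct ib_cmem *cmem;
          int ret;

          /* one block = 4 contiguous pages (illustrative order) */
          cmem = ib_cmem_alloc_contiguous_pages(context,
                                                vma->vm_end - vma->vm_start,
                                                PAGE_SHIFT + 2,
                                                -1 /* default NUMA node */);
          if (IS_ERR(cmem))
                  return PTR_ERR(cmem);

          ret = ib_cmem_map_contiguous_pages_to_vma(cmem, vma);
          if (ret)
                  ib_cmem_release_contiguous_pages(cmem);
          /* on success, the VMA's close op drops the reference */
          return ret;
  }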

Signed-off-by: Yishai Hadas <[email protected]>
---
 drivers/infiniband/core/Makefile |   2 +-
 drivers/infiniband/core/cmem.c   | 250 +++++++++++++++++++++++++++++++++++++++
 include/rdma/ib_cmem.h           |  41 +++++++
 3 files changed, 292 insertions(+), 1 deletion(-)
 create mode 100644 drivers/infiniband/core/cmem.c
 create mode 100644 include/rdma/ib_cmem.h

diff --git a/drivers/infiniband/core/Makefile b/drivers/infiniband/core/Makefile
index d43a899..8549ea4 100644
--- a/drivers/infiniband/core/Makefile
+++ b/drivers/infiniband/core/Makefile
@@ -11,7 +11,7 @@ obj-$(CONFIG_INFINIBAND_USER_ACCESS) +=       ib_uverbs.o ib_ucm.o \
 ib_core-y :=                   packer.o ud_header.o verbs.o sysfs.o \
                                device.o fmr_pool.o cache.o netlink.o \
                                roce_gid_mgmt.o
-ib_core-$(CONFIG_INFINIBAND_USER_MEM) += umem.o
+ib_core-$(CONFIG_INFINIBAND_USER_MEM) += umem.o cmem.o
 ib_core-$(CONFIG_INFINIBAND_ON_DEMAND_PAGING) += umem_odp.o umem_rbtree.o
 
 ib_mad-y :=                    mad.o smi.o agent.o mad_rmpp.o
diff --git a/drivers/infiniband/core/cmem.c b/drivers/infiniband/core/cmem.c
new file mode 100644
index 0000000..21d8573
--- /dev/null
+++ b/drivers/infiniband/core/cmem.c
@@ -0,0 +1,250 @@
+#include <linux/mm.h>
+#include <linux/dma-mapping.h>
+#include <linux/sched.h>
+#include <linux/export.h>
+#include <linux/dma-attrs.h>
+#include <linux/slab.h>
+#include <rdma/ib_cmem.h>
+#include "uverbs.h"
+
+static void ib_cmem_release(struct kref *ref)
+{
+       struct ib_cmem *cmem;
+       struct ib_cmem_block *cmem_block, *tmp;
+       unsigned long ntotal_pages;
+
+       cmem = container_of(ref, struct ib_cmem, refcount);
+
+       list_for_each_entry_safe(cmem_block, tmp, &cmem->ib_cmem_block, list) {
+               __free_pages(cmem_block->page, cmem->block_order);
+               list_del(&cmem_block->list);
+               kfree(cmem_block);
+       }
+       /* No locking is needed:
+        * ib_cmem_release is called from vm_close, which is always called
+        * with mm->mmap_sem held for writing.
+        * The only exception is when the process is shutting down, but in
+        * that case the counter is no longer relevant.
+        */
+       if (current->mm) {
+               ntotal_pages = PAGE_ALIGN(cmem->length) >> PAGE_SHIFT;
+               current->mm->pinned_vm -= ntotal_pages;
+       }
+       kfree(cmem);
+}
+
+/**
+ * ib_cmem_release_contiguous_pages - release memory allocated by
+ *                                              ib_cmem_alloc_contiguous_pages.
+ * @cmem: cmem struct to release
+ */
+void ib_cmem_release_contiguous_pages(struct ib_cmem *cmem)
+{
+       kref_put(&cmem->refcount, ib_cmem_release);
+}
+EXPORT_SYMBOL(ib_cmem_release_contiguous_pages);
+
+static void cmem_vma_open(struct vm_area_struct *area)
+{
+       struct ib_cmem *ib_cmem;
+
+       ib_cmem = (struct ib_cmem *)(area->vm_private_data);
+
+       /* vm_open and vm_close are always called with mm->mmap_sem held for
+        * writing. The only exception is when the process is shutting down,
+        * at which point vm_close is called with no locks held; but since
+        * this happens after the VMAs have been detached, vm_open cannot be
+        * called. Therefore, there is no need to synchronize the kref_get
+        * and kref_put calls.
+        */
+       kref_get(&ib_cmem->refcount);
+}
+
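+/* Drop the reference taken at mmap/vm_open time; last put frees the pages. */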
+static void cmem_vma_close(struct vm_area_struct *area)
+{
+       struct ib_cmem *cmem;
+
+       cmem = (struct ib_cmem *)(area->vm_private_data);
+
+       ib_cmem_release_contiguous_pages(cmem);
+}
+
+static const struct vm_operations_struct cmem_contig_pages_vm_ops = {
+       .open = cmem_vma_open,
+       .close = cmem_vma_close
+};
+
+/**
+ * ib_cmem_map_contiguous_pages_to_vma - map contiguous pages into VMA
+ * @ib_cmem: cmem structure returned by ib_cmem_alloc_contiguous_pages
+ * @vma: VMA to insert the pages into.
+ */
+int ib_cmem_map_contiguous_pages_to_vma(struct ib_cmem *ib_cmem,
+                                       struct vm_area_struct *vma)
+{
+       int ret;
+       unsigned long page_entry;
+       unsigned long ntotal_pages;
+       unsigned long ncontig_pages;
+       unsigned long total_size;
+       struct page *page;
+       unsigned long vma_entry_number = 0;
+       struct ib_cmem_block *ib_cmem_block = NULL;
+
+       total_size = vma->vm_end - vma->vm_start;
+       if (ib_cmem->length != total_size)
+               return -EINVAL;
+
+       if (total_size != PAGE_ALIGN(total_size)) {
+               WARN(1,
+                    "ib_cmem_map: total size %lu not aligned to page size\n",
+                    total_size);
+               return -EINVAL;
+       }
+
+       ntotal_pages = total_size >> PAGE_SHIFT;
+       ncontig_pages = 1 << ib_cmem->block_order;
+
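+       /* Walk the allocated blocks and insert each PAGE_SIZE page of
+        * every compound block into the VMA, one PTE at a time.
+        */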
+       list_for_each_entry(ib_cmem_block, &ib_cmem->ib_cmem_block, list) {
+               page = ib_cmem_block->page;
+               for (page_entry = 0; page_entry < ncontig_pages; page_entry++) {
+                       /* We reached the end of the VMA - break out of both loops */
+                       if (vma_entry_number >= ntotal_pages)
+                               goto end;
+
+                       ret = vm_insert_page(vma, vma->vm_start +
+                               (vma_entry_number << PAGE_SHIFT), page);
+                       if (ret < 0)
+                               goto err_vm_insert;
+
+                       vma_entry_number++;
+                       page++;
+               }
+       }
+
+end:
+
+       /* We expect to have covered the whole VMA */
+       if (vma_entry_number >= ntotal_pages) {
+               vma->vm_ops = &cmem_contig_pages_vm_ops;
+               vma->vm_private_data = ib_cmem;
+               return 0;
+       }
+       /* Not expected, but if we reached here then not enough
+        * contiguous pages were allocated to cover the VMA.
+        */
+       ret = -EINVAL;
+
+err_vm_insert:
+
+       zap_vma_ptes(vma, vma->vm_start, total_size);
+       return ret;
+}
+EXPORT_SYMBOL(ib_cmem_map_contiguous_pages_to_vma);
+
+/**
+ * ib_cmem_alloc_contiguous_pages - allocate contiguous pages
+ * @context: userspace context to allocate memory for
+ * @total_size: total size required for the allocation.
+ * @page_size_order: log2 of the size, in bytes, of one contiguous block.
+ * @numa_node: NUMA node to allocate the memory on;
+ *             when numa_node < 0, the default node is used.
+ */
+struct ib_cmem *ib_cmem_alloc_contiguous_pages(struct ib_ucontext *context,
+                                              unsigned long total_size,
+                                              unsigned long page_size_order,
+                                              int numa_node)
+{
+       struct ib_cmem *cmem;
+       unsigned long ntotal_pages;
+       unsigned long ncontiguous_pages;
+       unsigned long ncontiguous_groups;
+       struct page *page;
+       int i;
+       int ncontiguous_pages_order;
+       struct ib_cmem_block *ib_cmem_block;
+       unsigned long locked;
+       unsigned long lock_limit;
+
+       if (page_size_order < PAGE_SHIFT || page_size_order > 31)
+               return ERR_PTR(-EINVAL);
+
+       cmem = kzalloc(sizeof(*cmem), GFP_KERNEL);
+       if (!cmem)
+               return ERR_PTR(-ENOMEM);
+
+       kref_init(&cmem->refcount);
+       cmem->context   = context;
+       INIT_LIST_HEAD(&cmem->ib_cmem_block);
+
+       /* Total size is expected to be page aligned already -
+        * align defensively anyway.
+        */
+       ntotal_pages = PAGE_ALIGN(total_size) >> PAGE_SHIFT;
+       /* ib_cmem_alloc_contiguous_pages is called as part of mmap
+        * with mm->mmap_sem held for writing, so no locking is
+        * needed when updating pinned_vm.
+        */
+       locked     = ntotal_pages + current->mm->pinned_vm;
+       lock_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
+
+       if ((locked > lock_limit) && !capable(CAP_IPC_LOCK))
+               goto err_alloc;
+
+       /* How many contiguous pages do we need in 1 block */
+       ncontiguous_pages = (1 << page_size_order) >> PAGE_SHIFT;
+       ncontiguous_pages_order = ilog2(ncontiguous_pages);
+       ncontiguous_groups = (ntotal_pages >> ncontiguous_pages_order)  +
+               (!!(ntotal_pages & (ncontiguous_pages - 1)));
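+       /* i.e., DIV_ROUND_UP(ntotal_pages, ncontiguous_pages) */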
+
+       /* Check against MAX_ORDER to prevent a WARN from alloc_pages below */
+       if (ncontiguous_pages_order >= MAX_ORDER)
+               goto err_alloc;
+       /* Set block_order before starting the allocation so that
+        * ib_cmem_release frees blocks correctly on a failure path.
+        * At this point cmem->length is still 0 from kzalloc, as expected.
+        */
+       cmem->block_order = ncontiguous_pages_order;
+       for (i = 0; i < ncontiguous_groups; i++) {
+               /* Allocating the managed entry */
+               ib_cmem_block = kmalloc(sizeof(*ib_cmem_block),
+                                       GFP_KERNEL);
+               if (!ib_cmem_block)
+                       goto err_alloc;
+
+               if (numa_node < 0)
+                       page =  alloc_pages(GFP_HIGHUSER | __GFP_ZERO |
+                                           __GFP_COMP | __GFP_NOWARN,
+                                           ncontiguous_pages_order);
+               else
+                       page =  alloc_pages_node(numa_node,
+                                                GFP_HIGHUSER | __GFP_ZERO |
+                                                __GFP_COMP | __GFP_NOWARN,
+                                                ncontiguous_pages_order);
+
+               if (!page) {
+                       kfree(ib_cmem_block);
+                       /* Deallocate any previously successful
+                        * allocations, if they exist.
+                        */
+                       goto err_alloc;
+               }
+
+               ib_cmem_block->page = page;
+               list_add_tail(&ib_cmem_block->list, &cmem->ib_cmem_block);
+       }
+
+       cmem->length = total_size;
+       current->mm->pinned_vm = locked;
+       return cmem;
+
+err_alloc:
+       ib_cmem_release_contiguous_pages(cmem);
+       return ERR_PTR(-ENOMEM);
+}
+EXPORT_SYMBOL(ib_cmem_alloc_contiguous_pages);
diff --git a/include/rdma/ib_cmem.h b/include/rdma/ib_cmem.h
new file mode 100644
index 0000000..5f26a49
--- /dev/null
+++ b/include/rdma/ib_cmem.h
@@ -0,0 +1,41 @@
+#ifndef IB_CMEM_H
+#define IB_CMEM_H
+
+#include <rdma/ib_umem.h>
+#include <rdma/ib_verbs.h>
+
+/* contiguous memory structure */
+struct ib_cmem {
+       struct ib_ucontext     *context;
+       size_t                  length;
+       /* Linked list of the contiguous blocks composing this cmem */
+       struct list_head ib_cmem_block;
+
+       /* Order of a cmem block; 2^block_order equals the number
+        * of physical pages per block.
+        */
+       unsigned long    block_order;
+       /* Reference counter for this memory area.
+        * When it drops to 0, the pages are returned to the kernel.
+        */
+       struct kref refcount;
+};
+
+struct ib_cmem_block {
+       struct list_head        list;
+       /* page points to the struct page of the head page
+        * of the compound page for this block.
+        * The block order is stored once in ib_cmem.
+        */
+       struct page            *page;
+};
+
+int ib_cmem_map_contiguous_pages_to_vma(struct ib_cmem *ib_cmem,
+                                       struct vm_area_struct *vma);
+struct ib_cmem *ib_cmem_alloc_contiguous_pages(struct ib_ucontext *context,
+                                              unsigned long total_size,
+                                              unsigned long page_size_order,
+                                              int numa_node);
+void ib_cmem_release_contiguous_pages(struct ib_cmem *cmem);
+
+#endif
-- 
1.8.3.1
