On Thu, 2010-07-29 at 09:25 -0700, Tom Tucker wrote:
> From: Tom Tucker <[email protected]>
>
> Add an ib_iomem_get service that converts a vma to an array of
> physical addresses. This makes it easier for each device driver to
> add support for the reg_io_mr provider method.
>
> Signed-off-by: Tom Tucker <[email protected]>
> ---
>
> drivers/infiniband/core/umem.c | 248
> ++++++++++++++++++++++++++++++++++++++--
> include/rdma/ib_umem.h | 14 ++
> 2 files changed, 251 insertions(+), 11 deletions(-)
>
> diff --git a/drivers/infiniband/core/umem.c b/drivers/infiniband/core/umem.c
> index 415e186..f103956 100644
> --- a/drivers/infiniband/core/umem.c
> +++ b/drivers/infiniband/core/umem.c
...
> @@ -292,3 +295,226 @@ int ib_umem_page_count(struct ib_umem *umem)
> return n;
> }
> EXPORT_SYMBOL(ib_umem_page_count);
> +/*
> + * Return the PFN for the specified address in the vma. This only
> + * works for a vma that is VM_PFNMAP.
> + */
> +static unsigned long follow_io_pfn(struct vm_area_struct *vma,
> + unsigned long address, int write)
> +{
> + pgd_t *pgd;
> + pud_t *pud;
> + pmd_t *pmd;
> + pte_t *ptep, pte;
> + spinlock_t *ptl;
> + unsigned long pfn;
> + struct mm_struct *mm = vma->vm_mm;
> +
> + BUG_ON(0 == (vma->vm_flags & VM_PFNMAP));
Why use BUG_ON?
WARN_ON is more appropriate but
if (!(vma->vm_flags & VM_PFNMAP))
return 0;
seems better.
In fact, move it outside the inner do loop in ib_get_io_pfn().
> + pgd = pgd_offset(mm, address);
> + if (pgd_none(*pgd) || unlikely(pgd_bad(*pgd)))
> + return 0;
> +
> + pud = pud_offset(pgd, address);
> + if (pud_none(*pud))
> + return 0;
> + if (unlikely(pud_bad(*pud)))
> + return 0;
> +
> + pmd = pmd_offset(pud, address);
> + if (pmd_none(*pmd))
> + return 0;
> + if (unlikely(pmd_bad(*pmd)))
> + return 0;
> +
> + ptep = pte_offset_map_lock(mm, pmd, address, &ptl);
> + pte = *ptep;
> + if (!pte_present(pte))
> + goto bad;
> + if (write && !pte_write(pte))
> + goto bad;
> +
> + pfn = pte_pfn(pte);
> + pte_unmap_unlock(ptep, ptl);
> + return pfn;
> + bad:
> + pte_unmap_unlock(ptep, ptl);
> + return 0;
> +}
> +
> +int ib_get_io_pfn(struct task_struct *tsk, struct mm_struct *mm,
> + unsigned long start, int len, int write, int force,
> + unsigned long *pfn_list, struct vm_area_struct **vmas)
> +{
> + unsigned long pfn;
> + int i;
> + if (len <= 0)
> + return 0;
> +
> + i = 0;
> + do {
> + struct vm_area_struct *vma;
> +
> + vma = find_vma(mm, start);
> + if (0 == (vma->vm_flags & VM_PFNMAP))
> + return -EINVAL;
Style nit: I would use ! instead of 0 ==
> + if (0 == (vma->vm_flags & VM_IO))
> + return -EFAULT;
> +
> + if (is_vm_hugetlb_page(vma))
> + return -EFAULT;
> +
> + do {
> + cond_resched();
> + pfn = follow_io_pfn(vma, start, write);
> + if (!pfn)
> + return -EFAULT;
> + if (pfn_list)
> + pfn_list[i] = pfn;
> + if (vmas)
> + vmas[i] = vma;
> + i++;
> + start += PAGE_SIZE;
> + len--;
> + } while (len && start < vma->vm_end);
> + } while (len);
> + return i;
> +}
> +
> +/**
> + * ib_iomem_get - DMA map a userspace map of IO memory.
> + * @context: userspace context to map memory for
> + * @addr: userspace virtual address to start at
> + * @size: length of region to map
> + * @access: IB_ACCESS_xxx flags for memory being mapped
> + * @dmasync: flush in-flight DMA when the memory region is written
> + */
> +struct ib_umem *ib_iomem_get(struct ib_ucontext *context, unsigned long addr,
> + size_t size, int access, int dmasync)
> +{
> + struct ib_umem *umem;
> + unsigned long *pfn_list;
> + struct ib_umem_chunk *chunk;
> + unsigned long locked;
> + unsigned long lock_limit;
> + unsigned long cur_base;
> + unsigned long npages;
> + int ret;
> + int off;
> + int i;
> + DEFINE_DMA_ATTRS(attrs);
> +
> + if (dmasync)
> + dma_set_attr(DMA_ATTR_WRITE_BARRIER, &attrs);
> +
> + if (!can_do_mlock())
> + return ERR_PTR(-EPERM);
> +
> + umem = kmalloc(sizeof *umem, GFP_KERNEL);
> + if (!umem)
> + return ERR_PTR(-ENOMEM);
> +
> + umem->type = IB_UMEM_IO_MAP;
> + umem->context = context;
> + umem->length = size;
> + umem->offset = addr & ~PAGE_MASK;
> + umem->page_size = PAGE_SIZE;
> + /*
> + * We ask for writable memory if any access flags other than
> + * "remote read" are set. "Local write" and "remote write"
> + * obviously require write access. "Remote atomic" can do
> + * things like fetch and add, which will modify memory, and
> + * "MW bind" can change permissions by binding a window.
> + */
> + umem->writable = !!(access & ~IB_ACCESS_REMOTE_READ);
> +
> + /* IO memory is not hugetlb memory */
> + umem->hugetlb = 0;
> +
> + INIT_LIST_HEAD(&umem->chunk_list);
> +
> + pfn_list = (unsigned long *) __get_free_page(GFP_KERNEL);
> + if (!pfn_list) {
> + kfree(umem);
> + return ERR_PTR(-ENOMEM);
> + }
> +
> + npages = PAGE_ALIGN(size + umem->offset) >> PAGE_SHIFT;
> +
> + down_write(¤t->mm->mmap_sem);
> +
> + locked = npages + current->mm->locked_vm;
> + lock_limit = current->signal->rlim[RLIMIT_MEMLOCK].rlim_cur
> + >> PAGE_SHIFT;
I think the current kernel code is supposed to look like:
lock_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
> + if ((locked > lock_limit) && !capable(CAP_IPC_LOCK)) {
> + ret = -ENOMEM;
> + goto out;
> + }
> +
> + cur_base = addr & PAGE_MASK;
> +
> + ret = 0;
> + while (npages) {
> + ret = ib_get_io_pfn(current, current->mm, cur_base,
> + min_t(unsigned long, npages,
> + PAGE_SIZE / sizeof(unsigned long *)),
> + umem->writable,
> + !umem->writable, pfn_list, NULL);
> +
> + if (ret < 0)
> + goto out;
> +
> + cur_base += ret * PAGE_SIZE;
> + npages -= ret;
> +
> + off = 0;
> +
> + while (ret) {
> + chunk = kmalloc(sizeof *chunk +
> + sizeof(struct scatterlist) *
> + min_t(int, ret, IB_UMEM_MAX_PAGE_CHUNK),
> + GFP_KERNEL);
> + if (!chunk) {
> + ret = -ENOMEM;
> + goto out;
> + }
> + chunk->nents = min_t(int, ret, IB_UMEM_MAX_PAGE_CHUNK);
> + sg_init_table(chunk->page_list, chunk->nents);
> + /* The pfn_list we built is a set of Page
> + * Frame Numbers (PFN) whose physical address
> + * is PFN << PAGE_SHIFT. The SG DMA mapping
> + * services expect page addresses, not PFN,
> + * therefore, we have to do the dma mapping
> + * ourselves here. */
> + for (i = 0; i < chunk->nents; ++i) {
> + sg_set_page(&chunk->page_list[i], 0,
> + PAGE_SIZE, 0);
> + chunk->page_list[i].dma_address =
> + (pfn_list[i] << PAGE_SHIFT);
> + chunk->page_list[i].dma_length = PAGE_SIZE;
dma_length isn't present if CONFIG_NEED_SG_DMA_LENGTH is undefined.
In fact, the kmalloc() should probably be kzalloc since the other
struct scatterlist entries are uninitialized.
> + }
> + chunk->nmap = chunk->nents;
> + ret -= chunk->nents;
> + off += chunk->nents;
> + list_add_tail(&chunk->list, &umem->chunk_list);
> + }
> +
> + ret = 0;
This is redundant since ret == 0 at the end of the loop.
> + }
> +
> +out:
> + if (ret < 0) {
> + __ib_umem_release(context->device, umem, 0);
> + kfree(umem);
> + } else
> + current->mm->locked_vm = locked;
> + up_write(¤t->mm->mmap_sem);
> + free_page((unsigned long) pfn_list);
> +
> + return ret < 0 ? ERR_PTR(ret) : umem;
> +}
> +EXPORT_SYMBOL(ib_iomem_get);
> +
> diff --git a/include/rdma/ib_umem.h b/include/rdma/ib_umem.h
> index 9ee0d2e..2c64d82 100644
> --- a/include/rdma/ib_umem.h
> +++ b/include/rdma/ib_umem.h
> @@ -39,8 +39,14 @@
>
> struct ib_ucontext;
>
> +enum ib_umem_type {
> + IB_UMEM_MEM_MAP = 0,
> + IB_UMEM_IO_MAP = 1
> +};
> +
> struct ib_umem {
> struct ib_ucontext *context;
> + enum ib_umem_type type;
> size_t length;
> int offset;
> int page_size;
> @@ -61,6 +67,8 @@ struct ib_umem_chunk {
>
> #ifdef CONFIG_INFINIBAND_USER_MEM
>
> +struct ib_umem *ib_iomem_get(struct ib_ucontext *context, unsigned long addr,
> + size_t size, int access, int dmasync);
> struct ib_umem *ib_umem_get(struct ib_ucontext *context, unsigned long addr,
> size_t size, int access, int dmasync);
> void ib_umem_release(struct ib_umem *umem);
> @@ -70,6 +78,12 @@ int ib_umem_page_count(struct ib_umem *umem);
>
> #include <linux/err.h>
>
> +static struct ib_umem *ib_iomem_get(struct ib_ucontext *context,
> + unsigned long addr, size_t size,
> + int access, int dmasync) {
> + return ERR_PTR(-EINVAL);
> +}
> +
> static inline struct ib_umem *ib_umem_get(struct ib_ucontext *context,
> unsigned long addr, size_t size,
> int access, int dmasync) {
>
> --
> To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
> the body of a message to [email protected]
> More majordomo info at http://vger.kernel.org/majordomo-info.html
>
--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to [email protected]
More majordomo info at http://vger.kernel.org/majordomo-info.html