Re: [PATCH v6 4/4] RDMA/mlx5: Support dma-buf based userspace memory region

2020-10-29 Thread Jason Gunthorpe
On Tue, Oct 27, 2020 at 08:33:52PM +0000, Xiong, Jianxin wrote:
> > > @@ -801,6 +816,52 @@ static int pagefault_implicit_mr(struct mlx5_ib_mr 
> > > *imr,
> > >   * Returns:
> > >   *  -EFAULT: The io_virt->bcnt is not within the MR, it covers pages 
> > > that are
> > >   *   not accessible, or the MR is no longer valid.
> > > + *  -EAGAIN: The operation should be retried
> > > + *
> > > + *  >0: Number of pages mapped
> > > + */
> > > +static int pagefault_dmabuf_mr(struct mlx5_ib_mr *mr, struct ib_umem 
> > > *umem,
> > > +u64 io_virt, size_t bcnt, u32 *bytes_mapped,
> > > +u32 flags)
> > > +{
> > > + struct ib_umem_dmabuf *umem_dmabuf = to_ib_umem_dmabuf(umem);
> > > + u64 user_va;
> > > + u64 end;
> > > + int npages;
> > > + int err;
> > > +
> > > + if (unlikely(io_virt < mr->mmkey.iova))
> > > + return -EFAULT;
> > > + if (check_add_overflow(io_virt - mr->mmkey.iova,
> > > > +(u64)umem->address, &user_va))
> > > + return -EFAULT;
> > > > + /* Overflow has already been checked at the umem creation time */
> > > + end = umem->address + umem->length;
> > > + if (unlikely(user_va >= end || end  - user_va < bcnt))
> > > + return -EFAULT;
> > 
> > Why duplicate this sequence? Caller does it
> 
> The sequence in the caller is for umem_odp only.

Nothing about umem_odp in this code though??

> > >   /* prefetch with write-access must be supported by the MR */
> > >   if (advice == IB_UVERBS_ADVISE_MR_ADVICE_PREFETCH_WRITE &&
> > > - !odp->umem.writable)
> > > + !mr->umem->writable)
> > 
> > ??

> There is no need to use umem_odp here, mr->umem is the same as &odp->umem. 
> This change makes the code works for both umem_odp and umem_dmabuf.

Ok

Can you please also think about how to test this? I very much prefer
to see new pyverbs tests for new APIs. 

Distros are running the rdma-core test suite, if you want this to work
widely we need a public test for it.

Thanks,
Jason
___
dri-devel mailing list
dri-devel@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/dri-devel


RE: [PATCH v6 4/4] RDMA/mlx5: Support dma-buf based userspace memory region

2020-10-28 Thread Xiong, Jianxin
> -----Original Message-----
> From: Jason Gunthorpe 
> Sent: Wednesday, October 28, 2020 9:36 AM
> To: Xiong, Jianxin 
> Cc: linux-r...@vger.kernel.org; dri-devel@lists.freedesktop.org; Doug Ledford 
> ; Leon Romanovsky
> ; Sumit Semwal ; Christian Koenig 
> ; Vetter, Daniel
> 
> Subject: Re: [PATCH v6 4/4] RDMA/mlx5: Support dma-buf based userspace memory 
> region
> 
> On Tue, Oct 27, 2020 at 08:33:52PM +0000, Xiong, Jianxin wrote:
> > > > @@ -801,6 +816,52 @@ static int pagefault_implicit_mr(struct mlx5_ib_mr 
> > > > *imr,
> > > >   * Returns:
> > > >   *  -EFAULT: The io_virt->bcnt is not within the MR, it covers pages 
> > > > that are
> > > >   *   not accessible, or the MR is no longer valid.
> > > > + *  -EAGAIN: The operation should be retried
> > > > + *
> > > > + *  >0: Number of pages mapped
> > > > + */
> > > > +static int pagefault_dmabuf_mr(struct mlx5_ib_mr *mr, struct ib_umem 
> > > > *umem,
> > > > +  u64 io_virt, size_t bcnt, u32 
> > > > *bytes_mapped,
> > > > +  u32 flags)
> > > > +{
> > > > +   struct ib_umem_dmabuf *umem_dmabuf = to_ib_umem_dmabuf(umem);
> > > > +   u64 user_va;
> > > > +   u64 end;
> > > > +   int npages;
> > > > +   int err;
> > > > +
> > > > +   if (unlikely(io_virt < mr->mmkey.iova))
> > > > +   return -EFAULT;
> > > > +   if (check_add_overflow(io_virt - mr->mmkey.iova,
> > > > > +  (u64)umem->address, &user_va))
> > > > +   return -EFAULT;
> > > > > +   /* Overflow has already been checked at the umem creation time 
> > > > */
> > > > +   end = umem->address + umem->length;
> > > > +   if (unlikely(user_va >= end || end  - user_va < bcnt))
> > > > +   return -EFAULT;
> > >
> > > Why duplicate this sequence? Caller does it
> >
> > The sequence in the caller is for umem_odp only.
> 
> Nothing about umem_odp in this code though??

The code in the caller uses ib_umem_end(odp) instead of the 'end' here, but we
can consolidate that with some minor changes.
  
> 
> > > > /* prefetch with write-access must be supported by the MR */
> > > > if (advice == IB_UVERBS_ADVISE_MR_ADVICE_PREFETCH_WRITE &&
> > > > -   !odp->umem.writable)
> > > > +   !mr->umem->writable)
> > >
> > > ??
> 
> > > There is no need to use umem_odp here, mr->umem is the same as &odp->umem.
> > This change makes the code works for both umem_odp and umem_dmabuf.
> 
> Ok
> 
> Can you please also think about how to test this? I very much prefer to see 
> new pyverbs tests for new APIs.
> 
> Distros are running the rdma-core test suite, if you want this to work widely 
> we need a public test for it.
> 

Will look into that.

> Thanks,
> Jason
___
dri-devel mailing list
dri-devel@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/dri-devel


Re: [PATCH v6 4/4] RDMA/mlx5: Support dma-buf based userspace memory region

2020-10-28 Thread Jason Gunthorpe
On Fri, Oct 23, 2020 at 09:40:01AM -0700, Jianxin Xiong wrote:

> diff --git a/drivers/infiniband/hw/mlx5/mr.c b/drivers/infiniband/hw/mlx5/mr.c
> index b261797..3bc412b 100644
> +++ b/drivers/infiniband/hw/mlx5/mr.c
> @@ -1,5 +1,6 @@
>  /*
>   * Copyright (c) 2013-2015, Mellanox Technologies. All rights reserved.
> + * Copyright (c) 2020, Intel Corporation. All rights reserved.
>   *
>   * This software is available to you under a choice of one of two
>   * licenses.  You may choose to be licensed under the terms of the GNU
> @@ -36,6 +37,8 @@
>  #include 
>  #include 
>  #include 
> +#include 
> +#include 
>  #include 
>  #include 
>  #include 
> @@ -1113,6 +1116,8 @@ int mlx5_ib_update_xlt(struct mlx5_ib_mr *mr, u64 idx, 
> int npages,
>   dma_sync_single_for_cpu(ddev, dma, size, DMA_TO_DEVICE);
>   if (mr->umem->is_odp) {
>   mlx5_odp_populate_xlt(xlt, idx, npages, mr, flags);
> + } else if (mr->umem->is_dmabuf && (flags & 
> MLX5_IB_UPD_XLT_ZAP)) {
> + memset(xlt, 0, size);
>   } else {
>   __mlx5_ib_populate_pas(dev, mr->umem, page_shift, idx,
>  npages, xlt,
> @@ -1462,6 +1467,111 @@ struct ib_mr *mlx5_ib_reg_user_mr(struct ib_pd *pd, 
> u64 start, u64 length,
>   return ERR_PTR(err);
>  }
>  
> +static void mlx5_ib_dmabuf_invalidate_cb(struct dma_buf_attachment *attach)
> +{
> + struct ib_umem_dmabuf *umem_dmabuf = attach->importer_priv;
> + struct mlx5_ib_mr *mr = umem_dmabuf->device_context;
> +
> + mlx5_ib_update_xlt(mr, 0, mr->npages, PAGE_SHIFT, MLX5_IB_UPD_XLT_ZAP);
> + ib_umem_dmabuf_unmap_pages(umem_dmabuf);
> +}
> +
> +static struct dma_buf_attach_ops mlx5_ib_dmabuf_attach_ops = {
> + .allow_peer2peer = 1,
> + .move_notify = mlx5_ib_dmabuf_invalidate_cb,
> +};
> +
> +struct ib_mr *mlx5_ib_reg_user_mr_dmabuf(struct ib_pd *pd, u64 offset,
> +  u64 length, u64 virt_addr,
> +  int fd, int access_flags,
> +  struct ib_udata *udata)
> +{
> + struct mlx5_ib_dev *dev = to_mdev(pd->device);
> + struct mlx5_ib_mr *mr = NULL;
> + struct ib_umem *umem;
> + struct ib_umem_dmabuf *umem_dmabuf;
> + int npages;
> + int order;
> + int err;
> +
> + if (!IS_ENABLED(CONFIG_INFINIBAND_USER_MEM))
> + return ERR_PTR(-EOPNOTSUPP);
> +
> + mlx5_ib_dbg(dev,
> + "offset 0x%llx, virt_addr 0x%llx, length 0x%llx, fd %d, 
> access_flags 0x%x\n",
> + offset, virt_addr, length, fd, access_flags);
> +
> + if (!mlx5_ib_can_load_pas_with_umr(dev, length))
> + return ERR_PTR(-EINVAL);
> +
> + umem = ib_umem_dmabuf_get(&dev->ib_dev, offset, length, fd, 
> access_flags,
> +   &mlx5_ib_dmabuf_attach_ops);
> + if (IS_ERR(umem)) {
> + mlx5_ib_dbg(dev, "umem get failed (%ld)\n", PTR_ERR(umem));
> + return ERR_PTR(PTR_ERR(umem));
> + }
> +
> + npages = ib_umem_num_pages(umem);
> + if (!npages) {

ib_umem_get should reject invalid umems like this

> + mlx5_ib_warn(dev, "avoid zero region\n");
> + ib_umem_release(umem);
> + return ERR_PTR(-EINVAL);
> + }
> +
> + order = ilog2(roundup_pow_of_two(npages));

Must always call ib_umem_find_best_pgsz(), specify PAGE_SIZE as the
argument for this scenario

> + mlx5_ib_dbg(dev, "npages %d, ncont %d, order %d, page_shift %d\n",
> + npages, npages, order, PAGE_SHIFT);
> +
> + mr = alloc_mr_from_cache(pd, umem, virt_addr, length, npages,
> +  PAGE_SHIFT, order, access_flags);
> + if (IS_ERR(mr))
> + mr = NULL;
> +
> + if (!mr) {
> + mutex_lock(&dev->slow_path_mutex);
> + mr = reg_create(NULL, pd, virt_addr, length, umem, npages,
> + PAGE_SHIFT, access_flags, false);
> + mutex_unlock(&dev->slow_path_mutex);
> + }

Rebase on the mlx5 series just posted and use their version of this
code sequence, this is just not quite right


> + err = mlx5_ib_update_xlt(mr, 0, mr->npages, PAGE_SHIFT,
> +  MLX5_IB_UPD_XLT_ENABLE | MLX5_IB_UPD_XLT_ZAP);
> +
> + if (err) {
> + dereg_mr(dev, mr);
> + return ERR_PTR(err);
> + }

The current mlx5 code preloads the buffer with the right data, zapping
is fairly expensive, mapping and loading is the same cost

> @@ -1536,7 +1646,7 @@ int mlx5_ib_rereg_user_mr(struct ib_mr *ib_mr, int 
> flags, u64 start,
>   if (!mr->umem)
>   return -EINVAL;
>  
> - if (is_odp_mr(mr))
> + if (is_odp_mr(mr) || is_dmabuf_mr(mr))
>   return -EOPNOTSUPP;
>  
>   if (flags & IB_MR_REREG_TRANS) {
> @@ -1695,7 +1805,7 @@ static void dereg_mr(struct mlx5_ib_dev *dev, struct 
> 

RE: [PATCH v6 4/4] RDMA/mlx5: Support dma-buf based userspace memory region

2020-10-27 Thread Xiong, Jianxin
> -----Original Message-----
> From: Jason Gunthorpe 
> Sent: Tuesday, October 27, 2020 1:08 PM
> To: Xiong, Jianxin 
> Cc: linux-r...@vger.kernel.org; dri-devel@lists.freedesktop.org; Doug Ledford 
> ; Leon Romanovsky
> ; Sumit Semwal ; Christian Koenig 
> ; Vetter, Daniel
> 
> Subject: Re: [PATCH v6 4/4] RDMA/mlx5: Support dma-buf based userspace memory 
> region
> 
> On Fri, Oct 23, 2020 at 09:40:01AM -0700, Jianxin Xiong wrote:
> 
> > diff --git a/drivers/infiniband/hw/mlx5/mr.c
> > b/drivers/infiniband/hw/mlx5/mr.c index b261797..3bc412b 100644
> > +++ b/drivers/infiniband/hw/mlx5/mr.c
> > @@ -1,5 +1,6 @@
> >  /*
> >   * Copyright (c) 2013-2015, Mellanox Technologies. All rights reserved.
> > + * Copyright (c) 2020, Intel Corporation. All rights reserved.
> >   *
> >   * This software is available to you under a choice of one of two
> >   * licenses.  You may choose to be licensed under the terms of the
> > GNU @@ -36,6 +37,8 @@  #include   #include
> >   #include 
> > +#include 
> > +#include 
> >  #include 
> >  #include 
> >  #include 
> > @@ -1113,6 +1116,8 @@ int mlx5_ib_update_xlt(struct mlx5_ib_mr *mr, u64 
> > idx, int npages,
> > dma_sync_single_for_cpu(ddev, dma, size, DMA_TO_DEVICE);
> > if (mr->umem->is_odp) {
> > mlx5_odp_populate_xlt(xlt, idx, npages, mr, flags);
> > +   } else if (mr->umem->is_dmabuf && (flags & 
> > MLX5_IB_UPD_XLT_ZAP)) {
> > +   memset(xlt, 0, size);
> > } else {
> > __mlx5_ib_populate_pas(dev, mr->umem, page_shift, idx,
> >npages, xlt,
> > @@ -1462,6 +1467,111 @@ struct ib_mr *mlx5_ib_reg_user_mr(struct ib_pd *pd, 
> > u64 start, u64 length,
> > return ERR_PTR(err);
> >  }
> >
> > +static void mlx5_ib_dmabuf_invalidate_cb(struct dma_buf_attachment
> > +*attach) {
> > +   struct ib_umem_dmabuf *umem_dmabuf = attach->importer_priv;
> > +   struct mlx5_ib_mr *mr = umem_dmabuf->device_context;
> > +
> > +   mlx5_ib_update_xlt(mr, 0, mr->npages, PAGE_SHIFT, MLX5_IB_UPD_XLT_ZAP);
> > +   ib_umem_dmabuf_unmap_pages(umem_dmabuf);
> > +}
> > +
> > +static struct dma_buf_attach_ops mlx5_ib_dmabuf_attach_ops = {
> > +   .allow_peer2peer = 1,
> > +   .move_notify = mlx5_ib_dmabuf_invalidate_cb, };
> > +
> > +struct ib_mr *mlx5_ib_reg_user_mr_dmabuf(struct ib_pd *pd, u64 offset,
> > +u64 length, u64 virt_addr,
> > +int fd, int access_flags,
> > +struct ib_udata *udata)
> > +{
> > +   struct mlx5_ib_dev *dev = to_mdev(pd->device);
> > +   struct mlx5_ib_mr *mr = NULL;
> > +   struct ib_umem *umem;
> > +   struct ib_umem_dmabuf *umem_dmabuf;
> > +   int npages;
> > +   int order;
> > +   int err;
> > +
> > +   if (!IS_ENABLED(CONFIG_INFINIBAND_USER_MEM))
> > +   return ERR_PTR(-EOPNOTSUPP);
> > +
> > +   mlx5_ib_dbg(dev,
> > +   "offset 0x%llx, virt_addr 0x%llx, length 0x%llx, fd %d, 
> > access_flags 0x%x\n",
> > +   offset, virt_addr, length, fd, access_flags);
> > +
> > +   if (!mlx5_ib_can_load_pas_with_umr(dev, length))
> > +   return ERR_PTR(-EINVAL);
> > +
> > +   umem = ib_umem_dmabuf_get(&dev->ib_dev, offset, length, fd, 
> > access_flags,
> > + &mlx5_ib_dmabuf_attach_ops);
> > +   if (IS_ERR(umem)) {
> > +   mlx5_ib_dbg(dev, "umem get failed (%ld)\n", PTR_ERR(umem));
> > +   return ERR_PTR(PTR_ERR(umem));
> > +   }
> > +
> > +   npages = ib_umem_num_pages(umem);
> > +   if (!npages) {
> 
> ib_umem_get should reject invalid umems like this

Will move the check to ib_umem_dmabuf_get().

> 
> > +   mlx5_ib_warn(dev, "avoid zero region\n");
> > +   ib_umem_release(umem);
> > +   return ERR_PTR(-EINVAL);
> > +   }
> > +
> > +   order = ilog2(roundup_pow_of_two(npages));
> 
> Must always call ib_umem_find_best_pgsz(), specify PAGE_SIZE as the argument 
> for this scenario

Will fix.

> 
> > +   mlx5_ib_dbg(dev, "npages %d, ncont %d, order %d, page_shift %d\n",
> > +   npages, npages, order, PAGE_SHIFT);
> > +
> > +   mr = alloc_mr_from_cache(pd, umem, virt_addr, length, npages,
> > +PAGE_SHIFT, order, access_f