On 2020/11/21 15:58, xuxiaoyang (C) wrote:
> vfio_pin_pages() accepts an array of unrelated iova pfns and processes
> each to return the physical pfn.  When dealing with large arrays of
> contiguous iovas, vfio_iommu_type1_pin_pages is very inefficient because
> it is processed page by page.In this case, we can divide the iova pfn
> array into multiple continuous ranges and optimize them.  For example,
> when the iova pfn array is {1,5,6,7,9}, it will be divided into three
> groups {1}, {5,6,7}, {9} for processing.  When processing {5,6,7}, the
> number of calls to pin_user_pages_remote is reduced from 3 times to once.
> For single page or large array of discontinuous iovas, we still use
> vfio_pin_page_external to deal with it to reduce the performance loss
> caused by refactoring.
> 
> Signed-off-by: Xiaoyang Xu <[email protected]>
> ---
> v1 -> v2:
>  * make vfio_iommu_type1_pin_contiguous_pages use vfio_pin_page_external
>  to pin single page when npage=1
>  * make vfio_pin_contiguous_pages_external use set npage to mark
>  consecutive pages as dirty. simplify the processing logic of unwind
>  * remove unnecessary checks in vfio_get_contiguous_pages_length, put
>  the least costly judgment logic at the top, and replace
>  vfio_iova_get_vfio_pfn with vfio_find_vpfn
> 
>  drivers/vfio/vfio_iommu_type1.c | 231 ++++++++++++++++++++++++++++----
>  1 file changed, 204 insertions(+), 27 deletions(-)
> 
> diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c
> index 67e827638995..080727b531c6 100644
> --- a/drivers/vfio/vfio_iommu_type1.c
> +++ b/drivers/vfio/vfio_iommu_type1.c
> @@ -628,6 +628,196 @@ static int vfio_unpin_page_external(struct vfio_dma 
> *dma, dma_addr_t iova,
>       return unlocked;
>  }
> 
> +static int contiguous_vaddr_get_pfn(struct mm_struct *mm, unsigned long 
> vaddr,
> +                                 int prot, long npage, unsigned long 
> *phys_pfn)
> +{
> +     struct page **pages = NULL;
> +     unsigned int flags = 0;
> +     int i, ret;
> +
> +     pages = kvmalloc_array(npage, sizeof(struct page *), GFP_KERNEL);
> +     if (!pages)
> +             return -ENOMEM;
> +
> +     if (prot & IOMMU_WRITE)
> +             flags |= FOLL_WRITE;
> +
> +     mmap_read_lock(mm);
> +     ret = pin_user_pages_remote(mm, vaddr, npage, flags | FOLL_LONGTERM,
> +                                 pages, NULL, NULL);
> +     mmap_read_unlock(mm);
> +
> +     for (i = 0; i < ret; i++)
> +             *(phys_pfn + i) = page_to_pfn(pages[i]);
> +
> +     kvfree(pages);
> +
> +     return ret;
> +}
> +
> +static int vfio_pin_contiguous_pages_external(struct vfio_iommu *iommu,
> +                                 struct vfio_dma *dma,
> +                                 unsigned long *user_pfn,
> +                                 int npage, unsigned long *phys_pfn,
> +                                 bool do_accounting)
> +{
> +     int ret, i, j, lock_acct = 0;
> +     unsigned long remote_vaddr;
> +     dma_addr_t iova;
> +     struct mm_struct *mm;
> +     struct vfio_pfn *vpfn;
> +
> +     mm = get_task_mm(dma->task);
> +     if (!mm)
> +             return -ENODEV;
> +
> +     iova = user_pfn[0] << PAGE_SHIFT;
> +     remote_vaddr = dma->vaddr + iova - dma->iova;
> +     ret = contiguous_vaddr_get_pfn(mm, remote_vaddr, dma->prot,
> +                                         npage, phys_pfn);
> +     mmput(mm);
> +     if (ret <= 0)
> +             return ret;
> +
> +     npage = ret;
> +     for (i = 0; i < npage; i++) {
> +             iova = user_pfn[i] << PAGE_SHIFT;
> +             ret = vfio_add_to_pfn_list(dma, iova, phys_pfn[i]);
> +             if (ret)
> +                     goto unwind;
> +
> +             if (!is_invalid_reserved_pfn(phys_pfn[i]))
> +                     lock_acct++;
> +     }
> +
> +     if (do_accounting) {
> +             ret = vfio_lock_acct(dma, lock_acct, true);
> +             if (ret) {
> +                     if (ret == -ENOMEM)
> +                             pr_warn("%s: Task %s (%d) RLIMIT_MEMLOCK (%ld) 
> exceeded\n",
> +                                     __func__, dma->task->comm, 
> task_pid_nr(dma->task),
> +                                     task_rlimit(dma->task, RLIMIT_MEMLOCK));
> +                     goto unwind;
> +             }
> +     }
> +
> +     if (iommu->dirty_page_tracking) {
> +             unsigned long pgshift = __ffs(iommu->pgsize_bitmap);
> +
> +             /*
> +              * Bitmap populated with the smallest supported page
> +              * size
> +              */
> +             bitmap_set(dma->bitmap,
> +                        ((user_pfn[0] << PAGE_SHIFT) - dma->iova) >> 
> pgshift, npage);
> +     }
> +
> +     return i;
> +unwind:
> +     for (j = 0; j < npage; j++) {
> +             if (j < i) {
> +                     iova = user_pfn[j] << PAGE_SHIFT;
> +                     vpfn = vfio_find_vpfn(dma, iova);
> +                     vfio_iova_put_vfio_pfn(dma, vpfn);
> +             } else {
> +                     put_pfn(phys_pfn[j], dma->prot);
> +             }
> +
> +             phys_pfn[j] = 0;
> +     }
> +
> +     return ret;
> +}
> +
> +static int vfio_iommu_type1_pin_contiguous_pages(struct vfio_iommu *iommu,
> +                                         struct vfio_dma *dma,
> +                                         unsigned long *user_pfn,
> +                                         int npage, unsigned long *phys_pfn,
> +                                         bool do_accounting)
> +{
> +     int ret = 0, i, j;
> +     unsigned long remote_vaddr;
> +     dma_addr_t iova;
> +
> +     if (npage == 1)
> +             goto pin_single_page;
> +
> +     ret = vfio_pin_contiguous_pages_external(iommu, dma, user_pfn, npage,
> +                             phys_pfn, do_accounting);
> +     if (ret == npage)
> +             return ret;
> +
> +     if (ret < 0)
> +             ret = 0;
> +
> +pin_single_page:
> +     for (i = ret; i < npage; i++) {
> +             iova = user_pfn[i] << PAGE_SHIFT;
> +             remote_vaddr = dma->vaddr + iova - dma->iova;
> +
> +             ret = vfio_pin_page_external(dma, remote_vaddr, &phys_pfn[i],
> +                         do_accounting);
> +             if (ret)
> +                     goto pin_unwind;
> +
> +             ret = vfio_add_to_pfn_list(dma, iova, phys_pfn[i]);
> +             if (ret) {
> +                     if (put_pfn(phys_pfn[i], dma->prot) && do_accounting)
> +                             vfio_lock_acct(dma, -1, true);
> +                     goto pin_unwind;
> +             }
> +
> +             if (iommu->dirty_page_tracking) {
> +                     unsigned long pgshift = __ffs(iommu->pgsize_bitmap);
> +
> +                     /*
> +                      * Bitmap populated with the smallest supported page
> +                      * size
> +                      */
> +                     bitmap_set(dma->bitmap,
> +                                        (iova - dma->iova) >> pgshift, 1);
> +             }
> +     }
> +
> +     return i;
> +
> +pin_unwind:
> +     phys_pfn[i] = 0;
> +     for (j = 0; j < i; j++) {
> +             iova = user_pfn[j] << PAGE_SHIFT;
> +             vfio_unpin_page_external(dma, iova, do_accounting);
> +             phys_pfn[j] = 0;
> +     }
> +
> +     return ret;
> +}
> +
> +static int vfio_get_contiguous_pages_length(struct vfio_dma *dma,
> +                                 unsigned long *user_pfn, int npage)
> +{
> +     int i;
> +     dma_addr_t iova = user_pfn[0] << PAGE_SHIFT;
> +     struct vfio_pfn *vpfn;
> +
> +     if (npage <= 1)
> +             return npage;
> +
> +     for (i = 1; i < npage; i++) {
> +             if (user_pfn[i] != user_pfn[0] + i)
> +                     break;
> +
> +             iova = user_pfn[i] << PAGE_SHIFT;
> +             if (iova >= dma->iova + dma->size ||
> +                             iova + PAGE_SIZE <= dma->iova)
> +                     break;
> +
> +             vpfn = vfio_find_vpfn(dma, iova);
> +             if (vpfn)
> +                     break;
> +     }
> +     return i;
> +}
> +
>  static int vfio_iommu_type1_pin_pages(void *iommu_data,
>                                     struct iommu_group *iommu_group,
>                                     unsigned long *user_pfn,
> @@ -637,9 +827,9 @@ static int vfio_iommu_type1_pin_pages(void *iommu_data,
>       struct vfio_iommu *iommu = iommu_data;
>       struct vfio_group *group;
>       int i, j, ret;
> -     unsigned long remote_vaddr;
>       struct vfio_dma *dma;
>       bool do_accounting;
> +     int contiguous_npage;
> 
>       if (!iommu || !user_pfn || !phys_pfn)
>               return -EINVAL;
> @@ -663,7 +853,7 @@ static int vfio_iommu_type1_pin_pages(void *iommu_data,
>        */
>       do_accounting = !IS_IOMMU_CAP_DOMAIN_IN_CONTAINER(iommu);
> 
> -     for (i = 0; i < npage; i++) {
> +     for (i = 0; i < npage; i += contiguous_npage) {
>               dma_addr_t iova;
>               struct vfio_pfn *vpfn;
> 
> @@ -682,31 +872,18 @@ static int vfio_iommu_type1_pin_pages(void *iommu_data,
>               vpfn = vfio_iova_get_vfio_pfn(dma, iova);
>               if (vpfn) {
>                       phys_pfn[i] = vpfn->pfn;
> -                     continue;
> -             }
> -
> -             remote_vaddr = dma->vaddr + (iova - dma->iova);
> -             ret = vfio_pin_page_external(dma, remote_vaddr, &phys_pfn[i],
> -                                          do_accounting);
> -             if (ret)
> -                     goto pin_unwind;
> -
> -             ret = vfio_add_to_pfn_list(dma, iova, phys_pfn[i]);
> -             if (ret) {
> -                     if (put_pfn(phys_pfn[i], dma->prot) && do_accounting)
> -                             vfio_lock_acct(dma, -1, true);
> -                     goto pin_unwind;
> -             }
> -
> -             if (iommu->dirty_page_tracking) {
> -                     unsigned long pgshift = __ffs(iommu->pgsize_bitmap);
> -
> -                     /*
> -                      * Bitmap populated with the smallest supported page
> -                      * size
> -                      */
> -                     bitmap_set(dma->bitmap,
> -                                (iova - dma->iova) >> pgshift, 1);
> +                     contiguous_npage = 1;
> +             } else {
> +                     ret = vfio_get_contiguous_pages_length(dma,
> +                                     &user_pfn[i], npage - i);
> +                     if (ret < 0)
> +                             goto pin_unwind;
> +
> +                     ret = vfio_iommu_type1_pin_contiguous_pages(iommu,
> +                                     dma, &user_pfn[i], ret, &phys_pfn[i], 
> do_accounting);
> +                     if (ret < 0)
> +                             goto pin_unwind;
> +                     contiguous_npage = ret;
>               }
>       }
>       ret = i;
> --
> 2.19.1
> .
> 

hi Cornelia Huck, Eric Farman, Zhenyu Wang, Zhi Wang

vfio_pin_pages() accepts an array of unrelated iova pfns and processes
each to return the physical pfn.  When dealing with large arrays of
contiguous iovas, vfio_iommu_type1_pin_pages is very inefficient because
it is processed page by page.  In this case, we can divide the iova pfn
array into multiple continuous ranges and optimize them.  I have a set
of performance test data for reference.

The patch was not applied
                    1 page           512 pages
no huge pages:     1638ns           223651ns
THP:               1668ns           222330ns
HugeTLB:           1526ns           208151ns

The patch was applied
                    1 page           512 pages
no huge pages       1735ns           167286ns
THP:               1934ns           126900ns
HugeTLB:           1713ns           102188ns

As Alex Williamson said, this patch lacks proof that it works in the
real world. I think you will have some valuable opinions.

Regards,
Xu

Reply via email to