Re: [PATCH v2 5/7] mm/hmm: make full use of walk_page_range()

2019-07-29 Thread Jason Gunthorpe
On Thu, Jul 25, 2019 at 05:56:48PM -0700, Ralph Campbell wrote:
> hmm_range_fault() calls find_vma() and walk_page_range() in a loop.
> This is unnecessary duplication since walk_page_range() calls find_vma()
> in a loop already.
> Simplify hmm_range_fault() by defining a walk_test() callback function
> to filter unhandled vmas.
> 
> Signed-off-by: Ralph Campbell 
> Cc: "Jérôme Glisse" 
> Cc: Jason Gunthorpe 
> Cc: Christoph Hellwig 
> ---
>  mm/hmm.c | 130 ---
>  1 file changed, 57 insertions(+), 73 deletions(-)
> 
> diff --git a/mm/hmm.c b/mm/hmm.c
> index 1bc014cddd78..838cd1d50497 100644
> --- a/mm/hmm.c
> +++ b/mm/hmm.c
> @@ -840,13 +840,44 @@ static int hmm_vma_walk_hugetlb_entry(pte_t *pte, unsigned long hmask,
>  #endif
>  }
>  
> -static void hmm_pfns_clear(struct hmm_range *range,
> -uint64_t *pfns,
> -unsigned long addr,
> -unsigned long end)
> +static int hmm_vma_walk_test(unsigned long start,
> +  unsigned long end,
> +  struct mm_walk *walk)
>  {
> - for (; addr < end; addr += PAGE_SIZE, pfns++)
> - *pfns = range->values[HMM_PFN_NONE];
> + struct hmm_vma_walk *hmm_vma_walk = walk->private;
> + struct hmm_range *range = hmm_vma_walk->range;
> + struct vm_area_struct *vma = walk->vma;
> +
> + /* If range is no longer valid, force retry. */
> + if (!range->valid)
> + return -EBUSY;
> +
> + /*
> +  * Skip vma ranges that don't have struct page backing them or
> +  * map I/O devices directly.
> +  * TODO: handle peer-to-peer device mappings.
> +  */
> + if (vma->vm_flags & (VM_IO | VM_PFNMAP | VM_MIXEDMAP))
> + return -EFAULT;
> +
> + if (is_vm_hugetlb_page(vma)) {
> + if (huge_page_shift(hstate_vma(vma)) != range->page_shift &&
> + range->page_shift != PAGE_SHIFT)
> + return -EINVAL;
> + } else {
> + if (range->page_shift != PAGE_SHIFT)
> + return -EINVAL;
> + }
> +
> + /*
> +  * If vma does not allow read access, then assume that it does not
> +  * allow write access, either. HMM does not support architectures
> +  * that allow write without read.
> +  */
> + if (!(vma->vm_flags & VM_READ))
> + return -EPERM;
> +
> + return 0;
>  }
>  
>  /*
> @@ -965,82 +996,35 @@ EXPORT_SYMBOL(hmm_range_unregister);
>   */
>  long hmm_range_fault(struct hmm_range *range, unsigned int flags)
>  {
> - const unsigned long device_vma = VM_IO | VM_PFNMAP | VM_MIXEDMAP;
> - unsigned long start = range->start, end;
> - struct hmm_vma_walk hmm_vma_walk;
> + unsigned long start = range->start;
> + struct hmm_vma_walk hmm_vma_walk = {};
>   struct hmm *hmm = range->hmm;
> - struct vm_area_struct *vma;
> - struct mm_walk mm_walk;
> + struct mm_walk mm_walk = {};
>   int ret;
>  
>   lockdep_assert_held(&hmm->mm->mmap_sem);
>  
> - do {
> - /* If range is no longer valid force retry. */
> - if (!range->valid)
> - return -EBUSY;
> + hmm_vma_walk.range = range;
> + hmm_vma_walk.last = start;
> + hmm_vma_walk.flags = flags;
> + mm_walk.private = &hmm_vma_walk;
>  
> - vma = find_vma(hmm->mm, start);
> - if (vma == NULL || (vma->vm_flags & device_vma))
> - return -EFAULT;

It is hard to tell what is confused/wrong and what is deliberate in
this code...

Currently hmm_range_fault() invokes walk_page_range() on a VMA by VMA
basis, and the above prevents some cases of walk->vma becoming NULL,
but not all - for instance it doesn't check for start < vma->vm_start.

However, checking whether it can actually tolerate walk->vma == NULL,
it looks like it cannot:

 walk_page_range
  find_vma == NULL || start < vm_start -> walk->vma == NULL
  __walk_page_range
walk_pgd_range
  pte_hole / hmm_vma_walk_hole
hmm_vma_walk_hole_
 hmm_vma_do_fault
handle_mm_fault(walk->vma, addr, flags)
  vma->vm_mm <-- OOPS

Which kind of suggests the find_vma above was about preventing
walk->vma == NULL? Does something else tricky prevent this?
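
For example (just a sketch of mine, not something in this patch or in
mm/hmm.c today), the hole handler would need a guard along these lines
before the per-VMA find_vma() filtering can be dropped:

	/*
	 * Hypothetical check at the top of hmm_vma_walk_hole(): if the
	 * walk reached a span with no backing VMA, fail it here so that
	 * hmm_vma_do_fault() never hands a NULL vma to handle_mm_fault().
	 */
	if (!walk->vma)
		return -EFAULT;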

This patch also changes behavior so that missing VMAs don't always
trigger EFAULT (which is a good thing, but it needs to be mentioned in
the commit message).

I strongly believe this is the correct direction to go in, and the fact
that this function returns EFAULT if there is no VMA (or an incompatible
VMA) is actually a semantic bug we need to fix before this is a usable
API.

I.e. consider the user does something like
  ptr = mmap(0, PAGE_SIZE ..)
  mr = ib_reg_mr(ptr - PAGE_SIZE, ptr + 3*PAGE_SIZE, IBV_ACCESS_ON_DEMAND)

Then in the kernel I want to do hmm_range_fault(HMM_FAULT_SNAPSHOT)
across the MR VA and get a pfns array that says PAGE 0 is FAULT, PAGE
1 is R/W, PAGE 2 is FAULT.
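
To spell out what I mean by that pfns result, here is a caller-side
sketch (range->flags/range->values and the HMM_PFN_* indexes are the
existing hmm_range API; the loop itself is only my illustration, not
code from this series):

	unsigned long i, npages = (range->end - range->start) >> PAGE_SHIFT;

	for (i = 0; i < npages; i++) {
		uint64_t entry = range->pfns[i];

		if (entry == range->values[HMM_PFN_ERROR]) {
			/* no usable VMA behind this page: FAULT */
		} else if (entry & range->flags[HMM_PFN_WRITE]) {
			/* page is mapped R/W */
		} else if (entry & range->flags[HMM_PFN_VALID]) {
			/* page is mapped read-only */
		}
	}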

Instead the 

[PATCH v2 5/7] mm/hmm: make full use of walk_page_range()

2019-07-25 Thread Ralph Campbell
hmm_range_fault() calls find_vma() and walk_page_range() in a loop.
This is unnecessary duplication since walk_page_range() calls find_vma()
in a loop already.
Simplify hmm_range_fault() by defining a walk_test() callback function
to filter unhandled vmas.

Signed-off-by: Ralph Campbell 
Cc: "Jérôme Glisse" 
Cc: Jason Gunthorpe 
Cc: Christoph Hellwig 
---
 mm/hmm.c | 130 ---
 1 file changed, 57 insertions(+), 73 deletions(-)

diff --git a/mm/hmm.c b/mm/hmm.c
index 1bc014cddd78..838cd1d50497 100644
--- a/mm/hmm.c
+++ b/mm/hmm.c
@@ -840,13 +840,44 @@ static int hmm_vma_walk_hugetlb_entry(pte_t *pte, unsigned long hmask,
 #endif
 }
 
-static void hmm_pfns_clear(struct hmm_range *range,
-  uint64_t *pfns,
-  unsigned long addr,
-  unsigned long end)
+static int hmm_vma_walk_test(unsigned long start,
+unsigned long end,
+struct mm_walk *walk)
 {
-   for (; addr < end; addr += PAGE_SIZE, pfns++)
-   *pfns = range->values[HMM_PFN_NONE];
+   struct hmm_vma_walk *hmm_vma_walk = walk->private;
+   struct hmm_range *range = hmm_vma_walk->range;
+   struct vm_area_struct *vma = walk->vma;
+
+   /* If range is no longer valid, force retry. */
+   if (!range->valid)
+   return -EBUSY;
+
+   /*
+* Skip vma ranges that don't have struct page backing them or
+* map I/O devices directly.
+* TODO: handle peer-to-peer device mappings.
+*/
+   if (vma->vm_flags & (VM_IO | VM_PFNMAP | VM_MIXEDMAP))
+   return -EFAULT;
+
+   if (is_vm_hugetlb_page(vma)) {
+   if (huge_page_shift(hstate_vma(vma)) != range->page_shift &&
+   range->page_shift != PAGE_SHIFT)
+   return -EINVAL;
+   } else {
+   if (range->page_shift != PAGE_SHIFT)
+   return -EINVAL;
+   }
+
+   /*
+* If vma does not allow read access, then assume that it does not
+* allow write access, either. HMM does not support architectures
+* that allow write without read.
+*/
+   if (!(vma->vm_flags & VM_READ))
+   return -EPERM;
+
+   return 0;
 }
 
 /*
@@ -965,82 +996,35 @@ EXPORT_SYMBOL(hmm_range_unregister);
  */
 long hmm_range_fault(struct hmm_range *range, unsigned int flags)
 {
-   const unsigned long device_vma = VM_IO | VM_PFNMAP | VM_MIXEDMAP;
-   unsigned long start = range->start, end;
-   struct hmm_vma_walk hmm_vma_walk;
+   unsigned long start = range->start;
+   struct hmm_vma_walk hmm_vma_walk = {};
struct hmm *hmm = range->hmm;
-   struct vm_area_struct *vma;
-   struct mm_walk mm_walk;
+   struct mm_walk mm_walk = {};
int ret;
 
lockdep_assert_held(&hmm->mm->mmap_sem);
 
-   do {
-   /* If range is no longer valid force retry. */
-   if (!range->valid)
-   return -EBUSY;
+   hmm_vma_walk.range = range;
+   hmm_vma_walk.last = start;
+   hmm_vma_walk.flags = flags;
+   mm_walk.private = &hmm_vma_walk;
 
-   vma = find_vma(hmm->mm, start);
-   if (vma == NULL || (vma->vm_flags & device_vma))
-   return -EFAULT;
-
-   if (is_vm_hugetlb_page(vma)) {
-   if (huge_page_shift(hstate_vma(vma)) !=
-   range->page_shift &&
-   range->page_shift != PAGE_SHIFT)
-   return -EINVAL;
-   } else {
-   if (range->page_shift != PAGE_SHIFT)
-   return -EINVAL;
-   }
+   mm_walk.mm = hmm->mm;
+   mm_walk.pud_entry = hmm_vma_walk_pud;
+   mm_walk.pmd_entry = hmm_vma_walk_pmd;
+   mm_walk.pte_hole = hmm_vma_walk_hole;
+   mm_walk.hugetlb_entry = hmm_vma_walk_hugetlb_entry;
+   mm_walk.test_walk = hmm_vma_walk_test;
 
-   if (!(vma->vm_flags & VM_READ)) {
-   /*
-* If vma do not allow read access, then assume that it
-* does not allow write access, either. HMM does not
-* support architecture that allow write without read.
-*/
-   hmm_pfns_clear(range, range->pfns,
-   range->start, range->end);
-   return -EPERM;
-   }
+   do {
+   ret = walk_page_range(start, range->end, &mm_walk);
+   start = hmm_vma_walk.last;
 
-   range->vma = vma;
-   hmm_vma_walk.pgmap = NULL;
-   hmm_vma_walk.last = start;
-   hmm_vma_walk.flags = flags;
-   hmm_vma_walk.range = range;
-   mm_walk.private = _vma_walk;
-   mm_walk.private = &hmm_vma_walk;