Re: [PATCH v3 2/2] mm: fix double page fault on arm64 if PTE_AF is cleared

2019-09-16 Thread Kirill A. Shutemov
On Mon, Sep 16, 2019 at 09:35:21AM +, Justin He (Arm Technology China) 
wrote:
> 
> Hi Kirill
> > -Original Message-
> > From: Kirill A. Shutemov 
> > Sent: September 16, 2019 17:16
> > To: Justin He (Arm Technology China) 
> > Cc: Catalin Marinas ; Will Deacon
> > ; Mark Rutland ; James Morse
> > ; Marc Zyngier ; Matthew
> > Wilcox ; Kirill A. Shutemov
> > ; linux-arm-ker...@lists.infradead.org;
> > linux-kernel@vger.kernel.org; linux...@kvack.org; Punit Agrawal
> > ; Anshuman Khandual
> > ; Jun Yao ;
> > Alex Van Brunt ; Robin Murphy
> > ; Thomas Gleixner ;
> > Andrew Morton ; Jérôme Glisse
> > ; Ralph Campbell ;
> > hejia...@gmail.com
> > Subject: Re: [PATCH v3 2/2] mm: fix double page fault on arm64 if PTE_AF
> > is cleared
> >
> > On Sat, Sep 14, 2019 at 12:32:39AM +0800, Jia He wrote:
> > > When we tested the pmdk unit test [1] vmmalloc_fork TEST1 in an arm64
> > > guest, there was a double page fault in __copy_from_user_inatomic of
> > > cow_user_page.
> > >
> > > The call trace below is from arm64 do_page_fault, for debugging purposes:
> > > [  110.016195] Call trace:
> > > [  110.016826]  do_page_fault+0x5a4/0x690
> > > [  110.017812]  do_mem_abort+0x50/0xb0
> > > [  110.018726]  el1_da+0x20/0xc4
> > > [  110.019492]  __arch_copy_from_user+0x180/0x280
> > > [  110.020646]  do_wp_page+0xb0/0x860
> > > [  110.021517]  __handle_mm_fault+0x994/0x1338
> > > [  110.022606]  handle_mm_fault+0xe8/0x180
> > > [  110.023584]  do_page_fault+0x240/0x690
> > > [  110.024535]  do_mem_abort+0x50/0xb0
> > > [  110.025423]  el0_da+0x20/0x24
> > >
> > > The pte info before __copy_from_user_inatomic is (PTE_AF is cleared):
> > > [9b007000] pgd=00023d4f8003, pud=00023da9b003,
> > > pmd=00023d4b3003, pte=36298607bd3
> > >
> > > As told by Catalin: "On arm64 without hardware Access Flag, copying
> > > from user will fail because the pte is old and cannot be marked young.
> > > So we always end up with a zeroed page after fork() + CoW for pfn
> > > mappings. We don't always have a hardware-managed access flag on
> > > arm64."
> > >
> > > This patch fixes it by calling pte_mkyoung(). Also, the parameters are
> > > changed because vmf should be passed to cow_user_page().
> > >
> > > [1] https://github.com/pmem/pmdk/tree/master/src/test/vmmalloc_fork
> > >
> > > Reported-by: Yibo Cai 
> > > Signed-off-by: Jia He 
> > > ---
> > >  mm/memory.c | 30 +++++++++++++++++++++++++-----
> > >  1 file changed, 25 insertions(+), 5 deletions(-)
> > >
> > > diff --git a/mm/memory.c b/mm/memory.c
> > > index e2bb51b6242e..a64af6495f71 100644
> > > --- a/mm/memory.c
> > > +++ b/mm/memory.c
> > > @@ -118,6 +118,13 @@ int randomize_va_space __read_mostly =
> > > 2;
> > >  #endif
> > >
> > > +#ifndef arch_faults_on_old_pte
> > > +static inline bool arch_faults_on_old_pte(void)
> > > +{
> > > +   return false;
> > > +}
> > > +#endif
> > > +
> > >  static int __init disable_randmaps(char *s)
> > >  {
> > > randomize_va_space = 0;
> > > @@ -2140,7 +2147,8 @@ static inline int pte_unmap_same(struct
> > > mm_struct *mm, pmd_t *pmd,
> > > return same;
> > >  }
> > >
> > > -static inline void cow_user_page(struct page *dst, struct page *src,
> > > unsigned long va, struct vm_area_struct *vma)
> > > +static inline void cow_user_page(struct page *dst, struct page *src,
> > > +   struct vm_fault *vmf)
> > >  {
> > > debug_dma_assert_idle(src);
> > >
> > > @@ -2152,20 +2160,32 @@ static inline void cow_user_page(struct page
> > > *dst, struct page *src, unsigned lo
> > >  */
> > > if (unlikely(!src)) {
> > > void *kaddr = kmap_atomic(dst);
> > > -   void __user *uaddr = (void __user *)(va & PAGE_MASK);
> > > +   void __user *uaddr = (void __user *)(vmf->address &
> > > PAGE_MASK);
> > > +   pte_t entry;
> > >
> > > /*
> > >  * This really shouldn't fail, because the page is there
> > >  * in the page tables. But it might just be unreadable,
> > >  * in which case we just give up and fill the result with
> > > -* zeroes.
> > > +* zeroes. If PTE_AF is cleared on arm64, it might
> > > +* cause a double page fault. So make the pte young here
> > >  */
> > > +   if (arch_faults_on_old_pte() && !pte_young(vmf->orig_pte)) {
> > > +   spin_lock(vmf->ptl);
> > > +   entry = pte_mkyoung(vmf->orig_pte);
> >
> > Shouldn't you re-validate that orig_pte after re-taking ptl? It can be
> > stale by now.
> Thanks, do you mean flush_cache_page(vma, vmf->address,
> pte_pfn(vmf->orig_pte)) before pte_mkyoung?

No. You need to check pte_same(*vmf->pte, vmf->orig_pte) before modifying
anything and bail out if *vmf->pte has changed under you.
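
Something like this, completely untested (making cow_user_page() return
bool so that the caller can retry the fault is my assumption, not part of
your v3):

	if (arch_faults_on_old_pte() && !pte_young(vmf->orig_pte)) {
		pte_t entry;

		spin_lock(vmf->ptl);
		if (!pte_same(*vmf->pte, vmf->orig_pte)) {
			/*
			 * The pte changed under us (zapped, remapped or
			 * written to): don't touch it, let the caller
			 * re-fault and retry instead.
			 */
			spin_unlock(vmf->ptl);
			return false;
		}
		entry = pte_mkyoung(vmf->orig_pte);
		if (ptep_set_access_flags(vmf->vma, vmf->address,
					  vmf->pte, entry, 0))
			update_mmu_cache(vmf->vma, vmf->address, vmf->pte);
		spin_unlock(vmf->ptl);
	}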

-- 
 Kirill A. Shutemov


RE: [PATCH v3 2/2] mm: fix double page fault on arm64 if PTE_AF is cleared

2019-09-16 Thread Justin He (Arm Technology China)

Hi Kirill
> -Original Message-
> From: Kirill A. Shutemov 
> Sent: September 16, 2019 17:16
> To: Justin He (Arm Technology China) 
> Cc: Catalin Marinas ; Will Deacon
> ; Mark Rutland ; James Morse
> ; Marc Zyngier ; Matthew
> Wilcox ; Kirill A. Shutemov
> ; linux-arm-ker...@lists.infradead.org;
> linux-kernel@vger.kernel.org; linux...@kvack.org; Punit Agrawal
> ; Anshuman Khandual
> ; Jun Yao ;
> Alex Van Brunt ; Robin Murphy
> ; Thomas Gleixner ;
> Andrew Morton ; Jérôme Glisse
> ; Ralph Campbell ;
> hejia...@gmail.com
> Subject: Re: [PATCH v3 2/2] mm: fix double page fault on arm64 if PTE_AF
> is cleared
>
> On Sat, Sep 14, 2019 at 12:32:39AM +0800, Jia He wrote:
> > When we tested the pmdk unit test [1] vmmalloc_fork TEST1 in an arm64
> > guest, there was a double page fault in __copy_from_user_inatomic of
> > cow_user_page.
> >
> > The call trace below is from arm64 do_page_fault, for debugging purposes:
> > [  110.016195] Call trace:
> > [  110.016826]  do_page_fault+0x5a4/0x690
> > [  110.017812]  do_mem_abort+0x50/0xb0
> > [  110.018726]  el1_da+0x20/0xc4
> > [  110.019492]  __arch_copy_from_user+0x180/0x280
> > [  110.020646]  do_wp_page+0xb0/0x860
> > [  110.021517]  __handle_mm_fault+0x994/0x1338
> > [  110.022606]  handle_mm_fault+0xe8/0x180
> > [  110.023584]  do_page_fault+0x240/0x690
> > [  110.024535]  do_mem_abort+0x50/0xb0
> > [  110.025423]  el0_da+0x20/0x24
> >
> > The pte info before __copy_from_user_inatomic is (PTE_AF is cleared):
> > [9b007000] pgd=00023d4f8003, pud=00023da9b003,
> > pmd=00023d4b3003, pte=36298607bd3
> >
> > As told by Catalin: "On arm64 without hardware Access Flag, copying
> > from user will fail because the pte is old and cannot be marked young.
> > So we always end up with a zeroed page after fork() + CoW for pfn
> > mappings. We don't always have a hardware-managed access flag on
> > arm64."
> >
> > This patch fixes it by calling pte_mkyoung(). Also, the parameters are
> > changed because vmf should be passed to cow_user_page().
> >
> > [1] https://github.com/pmem/pmdk/tree/master/src/test/vmmalloc_fork
> >
> > Reported-by: Yibo Cai 
> > Signed-off-by: Jia He 
> > ---
> >  mm/memory.c | 30 +++++++++++++++++++++++++-----
> >  1 file changed, 25 insertions(+), 5 deletions(-)
> >
> > diff --git a/mm/memory.c b/mm/memory.c
> > index e2bb51b6242e..a64af6495f71 100644
> > --- a/mm/memory.c
> > +++ b/mm/memory.c
> > @@ -118,6 +118,13 @@ int randomize_va_space __read_mostly =
> > 2;
> >  #endif
> >
> > +#ifndef arch_faults_on_old_pte
> > +static inline bool arch_faults_on_old_pte(void)
> > +{
> > +   return false;
> > +}
> > +#endif
> > +
> >  static int __init disable_randmaps(char *s)
> >  {
> > randomize_va_space = 0;
> > @@ -2140,7 +2147,8 @@ static inline int pte_unmap_same(struct
> > mm_struct *mm, pmd_t *pmd,
> > return same;
> >  }
> >
> > -static inline void cow_user_page(struct page *dst, struct page *src,
> > unsigned long va, struct vm_area_struct *vma)
> > +static inline void cow_user_page(struct page *dst, struct page *src,
> > +   struct vm_fault *vmf)
> >  {
> > debug_dma_assert_idle(src);
> >
> > @@ -2152,20 +2160,32 @@ static inline void cow_user_page(struct page
> > *dst, struct page *src, unsigned lo
> >  */
> > if (unlikely(!src)) {
> > void *kaddr = kmap_atomic(dst);
> > -   void __user *uaddr = (void __user *)(va & PAGE_MASK);
> > +   void __user *uaddr = (void __user *)(vmf->address &
> > PAGE_MASK);
> > +   pte_t entry;
> >
> > /*
> >  * This really shouldn't fail, because the page is there
> >  * in the page tables. But it might just be unreadable,
> >  * in which case we just give up and fill the result with
> > -* zeroes.
> > +* zeroes. If PTE_AF is cleared on arm64, it might
> > +* cause a double page fault. So make the pte young here
> >  */
> > +   if (arch_faults_on_old_pte() && !pte_young(vmf->orig_pte)) {
> > +   spin_lock(vmf->ptl);
> > +   entry = pte_mkyoung(vmf->orig_pte);
>
> Shouldn't you re-validate that orig_pte after re-taking ptl? It can be
> stale by now.
Thanks, do you mean flush_cache_page(vma, vmf->address, pte_pfn(vmf->orig_pte))
before pte_mkyoung?

--
Cheers,
Justin (Jia He)


Re: [PATCH v3 2/2] mm: fix double page fault on arm64 if PTE_AF is cleared

2019-09-16 Thread Kirill A. Shutemov
On Sat, Sep 14, 2019 at 12:32:39AM +0800, Jia He wrote:
> When we tested the pmdk unit test [1] vmmalloc_fork TEST1 in an arm64
> guest, there was a double page fault in __copy_from_user_inatomic of
> cow_user_page.
> 
> The call trace below is from arm64 do_page_fault, for debugging purposes:
> [  110.016195] Call trace:
> [  110.016826]  do_page_fault+0x5a4/0x690
> [  110.017812]  do_mem_abort+0x50/0xb0
> [  110.018726]  el1_da+0x20/0xc4
> [  110.019492]  __arch_copy_from_user+0x180/0x280
> [  110.020646]  do_wp_page+0xb0/0x860
> [  110.021517]  __handle_mm_fault+0x994/0x1338
> [  110.022606]  handle_mm_fault+0xe8/0x180
> [  110.023584]  do_page_fault+0x240/0x690
> [  110.024535]  do_mem_abort+0x50/0xb0
> [  110.025423]  el0_da+0x20/0x24
> 
> The pte info before __copy_from_user_inatomic is (PTE_AF is cleared):
> [9b007000] pgd=00023d4f8003, pud=00023da9b003, 
> pmd=00023d4b3003, pte=36298607bd3
> 
> As told by Catalin: "On arm64 without hardware Access Flag, copying from
> user will fail because the pte is old and cannot be marked young. So we
> always end up with a zeroed page after fork() + CoW for pfn mappings. We
> don't always have a hardware-managed access flag on arm64."
> 
> This patch fixes it by calling pte_mkyoung(). Also, the parameters are
> changed because vmf should be passed to cow_user_page().
> 
> [1] https://github.com/pmem/pmdk/tree/master/src/test/vmmalloc_fork
> 
> Reported-by: Yibo Cai 
> Signed-off-by: Jia He 
> ---
>  mm/memory.c | 30 +++++++++++++++++++++++++-----
>  1 file changed, 25 insertions(+), 5 deletions(-)
> 
> diff --git a/mm/memory.c b/mm/memory.c
> index e2bb51b6242e..a64af6495f71 100644
> --- a/mm/memory.c
> +++ b/mm/memory.c
> @@ -118,6 +118,13 @@ int randomize_va_space __read_mostly =
>   2;
>  #endif
>  
> +#ifndef arch_faults_on_old_pte
> +static inline bool arch_faults_on_old_pte(void)
> +{
> + return false;
> +}
> +#endif
> +
>  static int __init disable_randmaps(char *s)
>  {
>   randomize_va_space = 0;
> @@ -2140,7 +2147,8 @@ static inline int pte_unmap_same(struct mm_struct *mm, 
> pmd_t *pmd,
>   return same;
>  }
>  
> -static inline void cow_user_page(struct page *dst, struct page *src, 
> unsigned long va, struct vm_area_struct *vma)
> +static inline void cow_user_page(struct page *dst, struct page *src,
> + struct vm_fault *vmf)
>  {
>   debug_dma_assert_idle(src);
>  
> @@ -2152,20 +2160,32 @@ static inline void cow_user_page(struct page *dst, 
> struct page *src, unsigned lo
>*/
>   if (unlikely(!src)) {
>   void *kaddr = kmap_atomic(dst);
> - void __user *uaddr = (void __user *)(va & PAGE_MASK);
> + void __user *uaddr = (void __user *)(vmf->address & PAGE_MASK);
> + pte_t entry;
>  
>   /*
>* This really shouldn't fail, because the page is there
>* in the page tables. But it might just be unreadable,
>* in which case we just give up and fill the result with
> -  * zeroes.
> +  * zeroes. If PTE_AF is cleared on arm64, it might
> +  * cause a double page fault. So make the pte young here
>*/
> + if (arch_faults_on_old_pte() && !pte_young(vmf->orig_pte)) {
> + spin_lock(vmf->ptl);
> + entry = pte_mkyoung(vmf->orig_pte);

Shouldn't you re-validate that orig_pte after re-taking ptl? It can be
stale by now.

> + if (ptep_set_access_flags(vmf->vma, vmf->address,
> +   vmf->pte, entry, 0))
> + update_mmu_cache(vmf->vma, vmf->address,
> +  vmf->pte);
> + spin_unlock(vmf->ptl);
> + }
> +
>   if (__copy_from_user_inatomic(kaddr, uaddr, PAGE_SIZE))
>   clear_page(kaddr);
>   kunmap_atomic(kaddr);
>   flush_dcache_page(dst);
>   } else
> - copy_user_highpage(dst, src, va, vma);
> + copy_user_highpage(dst, src, vmf->address, vmf->vma);
>  }
>  
>  static gfp_t __get_fault_gfp_mask(struct vm_area_struct *vma)
> @@ -2318,7 +2338,7 @@ static vm_fault_t wp_page_copy(struct vm_fault *vmf)
>   vmf->address);
>   if (!new_page)
>   goto oom;
> - cow_user_page(new_page, old_page, vmf->address, vma);
> + cow_user_page(new_page, old_page, vmf);
>   }
>  
> if (mem_cgroup_try_charge_delay(new_page, mm, GFP_KERNEL, &memcg,
> false))
> -- 
> 2.17.1
> 
> 

-- 
 Kirill A. Shutemov


[PATCH v3 2/2] mm: fix double page fault on arm64 if PTE_AF is cleared

2019-09-13 Thread Jia He
When we tested the pmdk unit test [1] vmmalloc_fork TEST1 in an arm64 guest,
there was a double page fault in __copy_from_user_inatomic of cow_user_page.

The call trace below is from arm64 do_page_fault, for debugging purposes:
[  110.016195] Call trace:
[  110.016826]  do_page_fault+0x5a4/0x690
[  110.017812]  do_mem_abort+0x50/0xb0
[  110.018726]  el1_da+0x20/0xc4
[  110.019492]  __arch_copy_from_user+0x180/0x280
[  110.020646]  do_wp_page+0xb0/0x860
[  110.021517]  __handle_mm_fault+0x994/0x1338
[  110.022606]  handle_mm_fault+0xe8/0x180
[  110.023584]  do_page_fault+0x240/0x690
[  110.024535]  do_mem_abort+0x50/0xb0
[  110.025423]  el0_da+0x20/0x24

The pte info before __copy_from_user_inatomic is (PTE_AF is cleared):
[9b007000] pgd=00023d4f8003, pud=00023da9b003, pmd=00023d4b3003, pte=36298607bd3

As told by Catalin: "On arm64 without hardware Access Flag, copying from
user will fail because the pte is old and cannot be marked young. So we
always end up with a zeroed page after fork() + CoW for pfn mappings. We
don't always have a hardware-managed access flag on arm64."

This patch fixes it by calling pte_mkyoung(). Also, the parameters are
changed because vmf should be passed to cow_user_page().

[1] https://github.com/pmem/pmdk/tree/master/src/test/vmmalloc_fork
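
To spell out why the copy silently yields a zeroed page, here is the
pre-patch path annotated with my reading of the trace above (the
fault-handling details are inferred, not stated elsewhere in this thread):

	/* pfn mapping: no struct page behind the pte, so src == NULL */
	if (unlikely(!src)) {
		void *kaddr = kmap_atomic(dst);	/* also disables page faults */
		void __user *uaddr = (void __user *)(va & PAGE_MASK);

		/*
		 * With PTE_AF clear and no hardware Access Flag, this
		 * read of uaddr faults again (el1_da). Page faults are
		 * disabled, so the handler can only apply the exception
		 * fixup and the copy reports failure...
		 */
		if (__copy_from_user_inatomic(kaddr, uaddr, PAGE_SIZE))
			clear_page(kaddr);  /* ...and the child gets zeroes */
		kunmap_atomic(kaddr);
		flush_dcache_page(dst);
	}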

Reported-by: Yibo Cai 
Signed-off-by: Jia He 
---
 mm/memory.c | 30 +++++++++++++++++++++++++-----
 1 file changed, 25 insertions(+), 5 deletions(-)

diff --git a/mm/memory.c b/mm/memory.c
index e2bb51b6242e..a64af6495f71 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -118,6 +118,13 @@ int randomize_va_space __read_mostly =
2;
 #endif
 
+#ifndef arch_faults_on_old_pte
+static inline bool arch_faults_on_old_pte(void)
+{
+   return false;
+}
+#endif
+
 static int __init disable_randmaps(char *s)
 {
randomize_va_space = 0;
@@ -2140,7 +2147,8 @@ static inline int pte_unmap_same(struct mm_struct *mm, pmd_t *pmd,
return same;
 }
 
-static inline void cow_user_page(struct page *dst, struct page *src, unsigned long va, struct vm_area_struct *vma)
+static inline void cow_user_page(struct page *dst, struct page *src,
+   struct vm_fault *vmf)
 {
debug_dma_assert_idle(src);
 
@@ -2152,20 +2160,32 @@ static inline void cow_user_page(struct page *dst, struct page *src, unsigned lo
 */
if (unlikely(!src)) {
void *kaddr = kmap_atomic(dst);
-   void __user *uaddr = (void __user *)(va & PAGE_MASK);
+   void __user *uaddr = (void __user *)(vmf->address & PAGE_MASK);
+   pte_t entry;
 
/*
 * This really shouldn't fail, because the page is there
 * in the page tables. But it might just be unreadable,
 * in which case we just give up and fill the result with
-* zeroes.
+* zeroes. If PTE_AF is cleared on arm64, it might
+* cause a double page fault. So make the pte young here
 */
+   if (arch_faults_on_old_pte() && !pte_young(vmf->orig_pte)) {
+   spin_lock(vmf->ptl);
+   entry = pte_mkyoung(vmf->orig_pte);
+   if (ptep_set_access_flags(vmf->vma, vmf->address,
+ vmf->pte, entry, 0))
+   update_mmu_cache(vmf->vma, vmf->address,
+vmf->pte);
+   spin_unlock(vmf->ptl);
+   }
+
if (__copy_from_user_inatomic(kaddr, uaddr, PAGE_SIZE))
clear_page(kaddr);
kunmap_atomic(kaddr);
flush_dcache_page(dst);
} else
-   copy_user_highpage(dst, src, va, vma);
+   copy_user_highpage(dst, src, vmf->address, vmf->vma);
 }
 
 static gfp_t __get_fault_gfp_mask(struct vm_area_struct *vma)
@@ -2318,7 +2338,7 @@ static vm_fault_t wp_page_copy(struct vm_fault *vmf)
vmf->address);
if (!new_page)
goto oom;
-   cow_user_page(new_page, old_page, vmf->address, vma);
+   cow_user_page(new_page, old_page, vmf);
}
 
	if (mem_cgroup_try_charge_delay(new_page, mm, GFP_KERNEL, &memcg, false))
-- 
2.17.1
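
For context, patch 1/2 of this series (not included in this archive)
supplies the arm64 override of arch_faults_on_old_pte(). A minimal sketch
of the idea, assuming a cpu_has_hw_af() helper that reports hardware
Access Flag support (the helper name is my assumption, not quoted from
the series):

	/* arch/arm64/include/asm/pgtable.h */
	#define arch_faults_on_old_pte arch_faults_on_old_pte
	static inline bool arch_faults_on_old_pte(void)
	{
		/*
		 * Without hardware Access Flag management, an access
		 * through an old pte traps, so generic mm code must mark
		 * the pte young before copying through the user address.
		 */
		return !cpu_has_hw_af();
	}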