Re: [PATCH v8 3/3] mm: fix double page fault on arm64 if PTE_AF is cleared

2019-09-24 Thread Catalin Marinas
On Tue, Sep 24, 2019 at 11:29:07PM +0800, Jia He wrote:
> On 2019/9/24 18:33, Catalin Marinas wrote:
> > On Tue, Sep 24, 2019 at 06:43:06AM +, Justin He (Arm Technology China) 
> > wrote:
> > > Catalin Marinas wrote:
> > > > On Sat, Sep 21, 2019 at 09:50:54PM +0800, Jia He wrote:
> > > > >   /*
> > > > >* This really shouldn't fail, because the page is there
> > > > >* in the page tables. But it might just be unreadable,
> > > > >* in which case we just give up and fill the result 
> > > > > with
> > > > >* zeroes.
> > > > >*/
> > > > > - if (__copy_from_user_inatomic(kaddr, uaddr, PAGE_SIZE))
> > > > > + if (__copy_from_user_inatomic(kaddr, uaddr, PAGE_SIZE)) 
> > > > > {
> > > > > + /* Give a warn in case there can be some obscure
> > > > > +  * use-case
> > > > > +  */
> > > > > + WARN_ON_ONCE(1);
> > > > That's more of a question for the mm guys: at this point we do the
> > > > copying with the ptl released; is there anything else that could have
> > > > made the pte old in the meantime? I think unuse_pte() is only called on
> > > > anonymous vmas, so it shouldn't be the case here.
> >
> > If we need to hold the ptl here, you could as well have an enclosing
> > kmap/kunmap_atomic (option 2) with some goto instead of "return false".
> 
> I am not 100% sure that I understand your suggestion well, so I
> drafted the patch

Well, however you think the code is cleaner really.

The copy/paste didn't work well, tabs disappeared (or rather the
Exchange server corrupting outgoing emails) but I'll try to comment
below:

> -static inline void cow_user_page(struct page *dst, struct page *src,
>   unsigned long va, struct vm_area_struct *vma)
> +static inline bool cow_user_page(struct page *dst, struct page *src,
> +                 struct vm_fault *vmf)
>  {
> +    struct vm_area_struct *vma = vmf->vma;
> +    struct mm_struct *mm = vma->vm_mm;
> +    unsigned long addr = vmf->address;
> +    bool ret;
> +    pte_t entry;
> +    void *kaddr;
> +    void __user *uaddr;
> +
>  debug_dma_assert_idle(src);
> 
> +    if (likely(src)) {
> +        copy_user_highpage(dst, src, addr, vma);
> +        return true;
> +    }
> +
>  /*
>   * If the source page was a PFN mapping, we don't have
>   * a "struct page" for it. We do a best-effort copy by
>   * just copying from the original user address. If that
>   * fails, we just zero-fill it. Live with it.
>   */
> -    if (unlikely(!src)) {
> -        void *kaddr = kmap_atomic(dst);
> -        void __user *uaddr = (void __user *)(va & PAGE_MASK);
> +    kaddr = kmap_atomic(dst);
> +    uaddr = (void __user *)(addr & PAGE_MASK);
> +
> +    /*
> +     * On architectures with software "accessed" bits, we would
> +     * take a double page fault, so mark it accessed here.
> +     */
> +    vmf->pte = pte_offset_map_lock(mm, vmf->pmd, addr, >ptl);
> +    if (arch_faults_on_old_pte() && !pte_young(vmf->orig_pte)) {

I'd move the pte_offset_map_lock() inside the 'if' block as we don't
want to affect architectures that handle old ptes automatically.

> +        if (!likely(pte_same(*vmf->pte, vmf->orig_pte))) {
> +            /*
> +             * Other thread has already handled the fault
> +             * and we don't need to do anything. If it's
> +             * not the case, the fault will be triggered
> +             * again on the same address.
> +             */
> +            ret = false;
> +            goto pte_unlock;
> +        }
> +
> +        entry = pte_mkyoung(vmf->orig_pte);
> +        if (ptep_set_access_flags(vma, addr, vmf->pte, entry, 0))
> +            update_mmu_cache(vma, addr, vmf->pte);
> +    }
> 
> +    /*
> +     * This really shouldn't fail, because the page is there
> +     * in the page tables. But it might just be unreadable,
> +     * in which case we just give up and fill the result with
> +     * zeroes.
> +     */
> +    if (__copy_from_user_inatomic(kaddr, uaddr, PAGE_SIZE)) {
>      /*
> -         * This really shouldn't fail, because the page is there
> -         * in the page tables. But it might just be unreadable,
> -         * in which case we just give up and fill the result with
> -         * zeroes.
> +         * Give a warn in case there can be some obscure
> +         * use-case
>       */
> -        if (__copy_from_user_inatomic(kaddr, uaddr, PAGE_SIZE))
> -            clear_page(kaddr);
> -        kunmap_atomic(kaddr);
> -        flush_dcache_page(dst);
> -    } else
> -        copy_user_highpage(dst, src, va, vma);
> +        WARN_ON_ONCE(1);
> +        clear_page(kaddr);
> +    }
> +
> +    ret = true;
> +
> +pte_unlock:
> +    pte_unmap_unlock(vmf->pte, vmf->ptl);

Since the locking would be moved in the 'if' block above, we need
another check here before unlocking:

if 

Re: [PATCH v8 3/3] mm: fix double page fault on arm64 if PTE_AF is cleared

2019-09-24 Thread Jia He

Hi Catalin

On 2019/9/24 18:33, Catalin Marinas wrote:

On Tue, Sep 24, 2019 at 06:43:06AM +, Justin He (Arm Technology China) 
wrote:

Catalin Marinas wrote:

On Sat, Sep 21, 2019 at 09:50:54PM +0800, Jia He wrote:

@@ -2151,21 +2163,53 @@ static inline void cow_user_page(struct page *dst, 
struct page *src, unsigned lo
 * fails, we just zero-fill it. Live with it.
 */
if (unlikely(!src)) {
-   void *kaddr = kmap_atomic(dst);
-   void __user *uaddr = (void __user *)(va & PAGE_MASK);
+   void *kaddr;
+   pte_t entry;
+   void __user *uaddr = (void __user *)(addr & PAGE_MASK);

+   /* On architectures with software "accessed" bits, we would
+* take a double page fault, so mark it accessed here.
+*/

[...]

+   if (arch_faults_on_old_pte() && !pte_young(vmf->orig_pte)) {
+   vmf->pte = pte_offset_map_lock(mm, vmf->pmd, addr,
+  >ptl);
+   if (likely(pte_same(*vmf->pte, vmf->orig_pte))) {
+   entry = pte_mkyoung(vmf->orig_pte);
+   if (ptep_set_access_flags(vma, addr,
+ vmf->pte, entry, 0))
+   update_mmu_cache(vma, addr, vmf->pte);
+   } else {
+   /* Other thread has already handled the fault
+* and we don't need to do anything. If it's
+* not the case, the fault will be triggered
+* again on the same address.
+*/
+   pte_unmap_unlock(vmf->pte, vmf->ptl);
+   return false;
+   }
+   pte_unmap_unlock(vmf->pte, vmf->ptl);
+   }

[...]

+
+   kaddr = kmap_atomic(dst);

Since you moved the kmap_atomic() here, could the above
arch_faults_on_old_pte() run in a preemptible context? I suggested to
add a WARN_ON in patch 2 to be sure.

Should I move kmap_atomic back to the original line? Thus, we can make sure
that arch_faults_on_old_pte() is in the context of preempt_disabled?
Otherwise, arch_faults_on_old_pte() may cause plenty of warning if I add
a WARN_ON in arch_faults_on_old_pte.  I tested it when I enable the PREEMPT=y
on a ThunderX2 qemu guest.

So we have two options here:

1. Change arch_faults_on_old_pte() scope to the whole system rather than
just the current CPU. You'd have to wire up a new arm64 capability
for the access flag but this way we don't care whether it's
preemptible or not.

2. Keep the arch_faults_on_old_pte() per-CPU but make sure we are not
preempted here. The kmap_atomic() move would do but you'd have to
kunmap_atomic() before the return.

I think the answer to my question below also has some implication on
which option to pick:


/*
 * This really shouldn't fail, because the page is there
 * in the page tables. But it might just be unreadable,
 * in which case we just give up and fill the result with
 * zeroes.
 */
-   if (__copy_from_user_inatomic(kaddr, uaddr, PAGE_SIZE))
+   if (__copy_from_user_inatomic(kaddr, uaddr, PAGE_SIZE)) {
+   /* Give a warn in case there can be some obscure
+* use-case
+*/
+   WARN_ON_ONCE(1);

That's more of a question for the mm guys: at this point we do the
copying with the ptl released; is there anything else that could have
made the pte old in the meantime? I think unuse_pte() is only called on
anonymous vmas, so it shouldn't be the case here.

If we need to hold the ptl here, you could as well have an enclosing
kmap/kunmap_atomic (option 2) with some goto instead of "return false".


I am not 100% sure that I understand your suggestion well, so I drafted the 
patch

here:

Changes: optimize the indentions

 hold the ptl longer


-static inline void cow_user_page(struct page *dst, struct page *src, unsigned 
long va, struct vm_area_struct *vma)

+static inline bool cow_user_page(struct page *dst, struct page *src,
+                 struct vm_fault *vmf)
 {
+    struct vm_area_struct *vma = vmf->vma;
+    struct mm_struct *mm = vma->vm_mm;
+    unsigned long addr = vmf->address;
+    bool ret;
+    pte_t entry;
+    void *kaddr;
+    void __user *uaddr;
+
 debug_dma_assert_idle(src);

+    if (likely(src)) {
+        copy_user_highpage(dst, src, addr, vma);
+        return true;
+    }
+
 /*
  * If the source page was a PFN mapping, we don't have
  * a "struct page" for it. We do a best-effort copy by
  * just copying from the original 

Re: [PATCH v8 3/3] mm: fix double page fault on arm64 if PTE_AF is cleared

2019-09-24 Thread Kirill A. Shutemov
On Tue, Sep 24, 2019 at 11:33:25AM +0100, Catalin Marinas wrote:
> On Tue, Sep 24, 2019 at 06:43:06AM +, Justin He (Arm Technology China) 
> wrote:
> > Catalin Marinas wrote:
> > > On Sat, Sep 21, 2019 at 09:50:54PM +0800, Jia He wrote:
> > > > @@ -2151,21 +2163,53 @@ static inline void cow_user_page(struct page 
> > > > *dst, struct page *src, unsigned lo
> > > >  * fails, we just zero-fill it. Live with it.
> > > >  */
> > > > if (unlikely(!src)) {
> > > > -   void *kaddr = kmap_atomic(dst);
> > > > -   void __user *uaddr = (void __user *)(va & PAGE_MASK);
> > > > +   void *kaddr;
> > > > +   pte_t entry;
> > > > +   void __user *uaddr = (void __user *)(addr & PAGE_MASK);
> > > >
> > > > +   /* On architectures with software "accessed" bits, we 
> > > > would
> > > > +* take a double page fault, so mark it accessed here.
> > > > +*/
> [...]
> > > > +   if (arch_faults_on_old_pte() && 
> > > > !pte_young(vmf->orig_pte)) {
> > > > +   vmf->pte = pte_offset_map_lock(mm, vmf->pmd, 
> > > > addr,
> > > > +  >ptl);
> > > > +   if (likely(pte_same(*vmf->pte, vmf->orig_pte))) 
> > > > {
> > > > +   entry = pte_mkyoung(vmf->orig_pte);
> > > > +   if (ptep_set_access_flags(vma, addr,
> > > > + vmf->pte, 
> > > > entry, 0))
> > > > +   update_mmu_cache(vma, addr, 
> > > > vmf->pte);
> > > > +   } else {
> > > > +   /* Other thread has already handled the 
> > > > fault
> > > > +* and we don't need to do anything. If 
> > > > it's
> > > > +* not the case, the fault will be 
> > > > triggered
> > > > +* again on the same address.
> > > > +*/
> > > > +   pte_unmap_unlock(vmf->pte, vmf->ptl);
> > > > +   return false;
> > > > +   }
> > > > +   pte_unmap_unlock(vmf->pte, vmf->ptl);
> > > > +   }
> [...]
> > > > +
> > > > +   kaddr = kmap_atomic(dst);
> > > 
> > > Since you moved the kmap_atomic() here, could the above
> > > arch_faults_on_old_pte() run in a preemptible context? I suggested to
> > > add a WARN_ON in patch 2 to be sure.
> > 
> > Should I move kmap_atomic back to the original line? Thus, we can make sure
> > that arch_faults_on_old_pte() is in the context of preempt_disabled?
> > Otherwise, arch_faults_on_old_pte() may cause plenty of warning if I add
> > a WARN_ON in arch_faults_on_old_pte.  I tested it when I enable the 
> > PREEMPT=y
> > on a ThunderX2 qemu guest.
> 
> So we have two options here:
> 
> 1. Change arch_faults_on_old_pte() scope to the whole system rather than
>just the current CPU. You'd have to wire up a new arm64 capability
>for the access flag but this way we don't care whether it's
>preemptible or not.
> 
> 2. Keep the arch_faults_on_old_pte() per-CPU but make sure we are not
>preempted here. The kmap_atomic() move would do but you'd have to
>kunmap_atomic() before the return.
> 
> I think the answer to my question below also has some implication on
> which option to pick:
> 
> > > > /*
> > > >  * This really shouldn't fail, because the page is there
> > > >  * in the page tables. But it might just be unreadable,
> > > >  * in which case we just give up and fill the result 
> > > > with
> > > >  * zeroes.
> > > >  */
> > > > -   if (__copy_from_user_inatomic(kaddr, uaddr, PAGE_SIZE))
> > > > +   if (__copy_from_user_inatomic(kaddr, uaddr, PAGE_SIZE)) 
> > > > {
> > > > +   /* Give a warn in case there can be some obscure
> > > > +* use-case
> > > > +*/
> > > > +   WARN_ON_ONCE(1);
> > > 
> > > That's more of a question for the mm guys: at this point we do the
> > > copying with the ptl released; is there anything else that could have
> > > made the pte old in the meantime? I think unuse_pte() is only called on
> > > anonymous vmas, so it shouldn't be the case here.
> 
> If we need to hold the ptl here, you could as well have an enclosing
> kmap/kunmap_atomic (option 2) with some goto instead of "return false".

Yeah, look like we need to hold ptl for longer. There is nothing I see
that would prevent clearing young bit under us otherwise.

-- 
 Kirill A. Shutemov


Re: [PATCH v8 3/3] mm: fix double page fault on arm64 if PTE_AF is cleared

2019-09-24 Thread Catalin Marinas
On Tue, Sep 24, 2019 at 06:43:06AM +, Justin He (Arm Technology China) 
wrote:
> Catalin Marinas wrote:
> > On Sat, Sep 21, 2019 at 09:50:54PM +0800, Jia He wrote:
> > > @@ -2151,21 +2163,53 @@ static inline void cow_user_page(struct page 
> > > *dst, struct page *src, unsigned lo
> > >* fails, we just zero-fill it. Live with it.
> > >*/
> > >   if (unlikely(!src)) {
> > > - void *kaddr = kmap_atomic(dst);
> > > - void __user *uaddr = (void __user *)(va & PAGE_MASK);
> > > + void *kaddr;
> > > + pte_t entry;
> > > + void __user *uaddr = (void __user *)(addr & PAGE_MASK);
> > >
> > > + /* On architectures with software "accessed" bits, we would
> > > +  * take a double page fault, so mark it accessed here.
> > > +  */
[...]
> > > + if (arch_faults_on_old_pte() && !pte_young(vmf->orig_pte)) {
> > > + vmf->pte = pte_offset_map_lock(mm, vmf->pmd, addr,
> > > +>ptl);
> > > + if (likely(pte_same(*vmf->pte, vmf->orig_pte))) {
> > > + entry = pte_mkyoung(vmf->orig_pte);
> > > + if (ptep_set_access_flags(vma, addr,
> > > +   vmf->pte, entry, 0))
> > > + update_mmu_cache(vma, addr, vmf->pte);
> > > + } else {
> > > + /* Other thread has already handled the fault
> > > +  * and we don't need to do anything. If it's
> > > +  * not the case, the fault will be triggered
> > > +  * again on the same address.
> > > +  */
> > > + pte_unmap_unlock(vmf->pte, vmf->ptl);
> > > + return false;
> > > + }
> > > + pte_unmap_unlock(vmf->pte, vmf->ptl);
> > > + }
[...]
> > > +
> > > + kaddr = kmap_atomic(dst);
> > 
> > Since you moved the kmap_atomic() here, could the above
> > arch_faults_on_old_pte() run in a preemptible context? I suggested to
> > add a WARN_ON in patch 2 to be sure.
> 
> Should I move kmap_atomic back to the original line? Thus, we can make sure
> that arch_faults_on_old_pte() is in the context of preempt_disabled?
> Otherwise, arch_faults_on_old_pte() may cause plenty of warning if I add
> a WARN_ON in arch_faults_on_old_pte.  I tested it when I enable the PREEMPT=y
> on a ThunderX2 qemu guest.

So we have two options here:

1. Change arch_faults_on_old_pte() scope to the whole system rather than
   just the current CPU. You'd have to wire up a new arm64 capability
   for the access flag but this way we don't care whether it's
   preemptible or not.

2. Keep the arch_faults_on_old_pte() per-CPU but make sure we are not
   preempted here. The kmap_atomic() move would do but you'd have to
   kunmap_atomic() before the return.

I think the answer to my question below also has some implication on
which option to pick:

> > >   /*
> > >* This really shouldn't fail, because the page is there
> > >* in the page tables. But it might just be unreadable,
> > >* in which case we just give up and fill the result with
> > >* zeroes.
> > >*/
> > > - if (__copy_from_user_inatomic(kaddr, uaddr, PAGE_SIZE))
> > > + if (__copy_from_user_inatomic(kaddr, uaddr, PAGE_SIZE)) {
> > > + /* Give a warn in case there can be some obscure
> > > +  * use-case
> > > +  */
> > > + WARN_ON_ONCE(1);
> > 
> > That's more of a question for the mm guys: at this point we do the
> > copying with the ptl released; is there anything else that could have
> > made the pte old in the meantime? I think unuse_pte() is only called on
> > anonymous vmas, so it shouldn't be the case here.

If we need to hold the ptl here, you could as well have an enclosing
kmap/kunmap_atomic (option 2) with some goto instead of "return false".

-- 
Catalin


RE: [PATCH v8 3/3] mm: fix double page fault on arm64 if PTE_AF is cleared

2019-09-24 Thread Justin He (Arm Technology China)
Hi Catalin
Please see an important comment inline, thanks

> -Original Message-
> From: Catalin Marinas 
> Sent: 2019年9月24日 1:05
> To: Justin He (Arm Technology China) 
> Cc: Will Deacon ; Mark Rutland
> ; James Morse ; Marc
> Zyngier ; Matthew Wilcox ; Kirill A.
> Shutemov ; linux-arm-
> ker...@lists.infradead.org; linux-kernel@vger.kernel.org; linux-
> m...@kvack.org; Suzuki Poulose ; Punit
> Agrawal ; Anshuman Khandual
> ; Alex Van Brunt
> ; Robin Murphy ;
> Thomas Gleixner ; Andrew Morton  foundation.org>; Jérôme Glisse ; Ralph Campbell
> ; hejia...@gmail.com; Kaly Xin (Arm Technology
> China) ; nd 
> Subject: Re: [PATCH v8 3/3] mm: fix double page fault on arm64 if PTE_AF
> is cleared
> 
> On Sat, Sep 21, 2019 at 09:50:54PM +0800, Jia He wrote:
> > @@ -2151,21 +2163,53 @@ static inline void cow_user_page(struct page
> *dst, struct page *src, unsigned lo
> >  * fails, we just zero-fill it. Live with it.
> >  */
> > if (unlikely(!src)) {
> > -   void *kaddr = kmap_atomic(dst);
> > -   void __user *uaddr = (void __user *)(va & PAGE_MASK);
> > +   void *kaddr;
> > +   pte_t entry;
> > +   void __user *uaddr = (void __user *)(addr & PAGE_MASK);
> >
> > +   /* On architectures with software "accessed" bits, we would
> > +* take a double page fault, so mark it accessed here.
> > +*/
> 
> Nitpick: please follow the kernel coding style for multi-line comments
> (above and the for the rest of the patch):
> 
>   /*
>* Your multi-line comment.
>*/
> 
> > +   if (arch_faults_on_old_pte() && !pte_young(vmf->orig_pte))
> {
> > +   vmf->pte = pte_offset_map_lock(mm, vmf->pmd,
> addr,
> > +  >ptl);
> > +   if (likely(pte_same(*vmf->pte, vmf->orig_pte))) {
> > +   entry = pte_mkyoung(vmf->orig_pte);
> > +   if (ptep_set_access_flags(vma, addr,
> > + vmf->pte, entry, 0))
> > +   update_mmu_cache(vma, addr, vmf-
> >pte);
> > +   } else {
> > +   /* Other thread has already handled the
> fault
> > +* and we don't need to do anything. If it's
> > +* not the case, the fault will be triggered
> > +* again on the same address.
> > +*/
> > +   pte_unmap_unlock(vmf->pte, vmf->ptl);
> > +   return false;
> > +   }
> > +   pte_unmap_unlock(vmf->pte, vmf->ptl);
> > +   }
> 
> Another nit, you could rewrite this block slightly to avoid too much
> indentation. Something like (untested):
> 
>   if (arch_faults_on_old_pte() && !pte_young(vmf->orig_pte))
> {
>   vmf->pte = pte_offset_map_lock(mm, vmf->pmd,
> addr,
>  >ptl);
>   if (unlikely(!pte_same(*vmf->pte, vmf->orig_pte))) {
>   /*
>* Other thread has already handled the fault
>* and we don't need to do anything. If it's
>* not the case, the fault will be triggered
>* again on the same address.
>*/
>   pte_unmap_unlock(vmf->pte, vmf->ptl);
>   return false;
>   }
>   entry = pte_mkyoung(vmf->orig_pte);
>   if (ptep_set_access_flags(vma, addr,
> vmf->pte, entry, 0))
>   update_mmu_cache(vma, addr, vmf->pte);
>   pte_unmap_unlock(vmf->pte, vmf->ptl);
>   }
> 
> > +
> > +   kaddr = kmap_atomic(dst);
> 
> Since you moved the kmap_atomic() here, could the above
> arch_faults_on_old_pte() run in a preemptible context? I suggested to
> add a WARN_ON in patch 2 to be sure.

Should I move kmap_atomic back to the original line? Thus, we can make sure
that arch_faults_on_old_pte() is in the context of preempt_disabled?
Otherwise, arch_faults_on_old_pte() may cause plenty of warning if I add
a WARN_ON in arch_faults_on_old_pte.  I tested it when I enable the PREEMPT=y
on a ThunderX2 qemu guest.


--
Cheers,
Justin (Jia He)


> 
> > /*
> >  * This really shouldn't fail, because the page is there
> >  * in the page tables. But it might just be unreadable,
> >  * in which case we just give up and fill the result with
> >  * zeroes.
> >  */
> > -   if (__copy_from_user_inatomic(kaddr, uaddr, PAGE_SIZE))
> > +   

Re: [PATCH v8 3/3] mm: fix double page fault on arm64 if PTE_AF is cleared

2019-09-23 Thread Catalin Marinas
On Sat, Sep 21, 2019 at 09:50:54PM +0800, Jia He wrote:
> @@ -2151,21 +2163,53 @@ static inline void cow_user_page(struct page *dst, 
> struct page *src, unsigned lo
>* fails, we just zero-fill it. Live with it.
>*/
>   if (unlikely(!src)) {
> - void *kaddr = kmap_atomic(dst);
> - void __user *uaddr = (void __user *)(va & PAGE_MASK);
> + void *kaddr;
> + pte_t entry;
> + void __user *uaddr = (void __user *)(addr & PAGE_MASK);
>  
> + /* On architectures with software "accessed" bits, we would
> +  * take a double page fault, so mark it accessed here.
> +  */

Nitpick: please follow the kernel coding style for multi-line comments
(above and the for the rest of the patch):

/*
 * Your multi-line comment.
 */

> + if (arch_faults_on_old_pte() && !pte_young(vmf->orig_pte)) {
> + vmf->pte = pte_offset_map_lock(mm, vmf->pmd, addr,
> +>ptl);
> + if (likely(pte_same(*vmf->pte, vmf->orig_pte))) {
> + entry = pte_mkyoung(vmf->orig_pte);
> + if (ptep_set_access_flags(vma, addr,
> +   vmf->pte, entry, 0))
> + update_mmu_cache(vma, addr, vmf->pte);
> + } else {
> + /* Other thread has already handled the fault
> +  * and we don't need to do anything. If it's
> +  * not the case, the fault will be triggered
> +  * again on the same address.
> +  */
> + pte_unmap_unlock(vmf->pte, vmf->ptl);
> + return false;
> + }
> + pte_unmap_unlock(vmf->pte, vmf->ptl);
> + }

Another nit, you could rewrite this block slightly to avoid too much
indentation. Something like (untested):

if (arch_faults_on_old_pte() && !pte_young(vmf->orig_pte)) {
vmf->pte = pte_offset_map_lock(mm, vmf->pmd, addr,
   >ptl);
if (unlikely(!pte_same(*vmf->pte, vmf->orig_pte))) {
/*
 * Other thread has already handled the fault
 * and we don't need to do anything. If it's
 * not the case, the fault will be triggered
 * again on the same address.
 */
pte_unmap_unlock(vmf->pte, vmf->ptl);
return false;
}
entry = pte_mkyoung(vmf->orig_pte);
if (ptep_set_access_flags(vma, addr,
  vmf->pte, entry, 0))
update_mmu_cache(vma, addr, vmf->pte);
pte_unmap_unlock(vmf->pte, vmf->ptl);
}

> +
> + kaddr = kmap_atomic(dst);

Since you moved the kmap_atomic() here, could the above
arch_faults_on_old_pte() run in a preemptible context? I suggested to
add a WARN_ON in patch 2 to be sure.

>   /*
>* This really shouldn't fail, because the page is there
>* in the page tables. But it might just be unreadable,
>* in which case we just give up and fill the result with
>* zeroes.
>*/
> - if (__copy_from_user_inatomic(kaddr, uaddr, PAGE_SIZE))
> + if (__copy_from_user_inatomic(kaddr, uaddr, PAGE_SIZE)) {
> + /* Give a warn in case there can be some obscure
> +  * use-case
> +  */
> + WARN_ON_ONCE(1);

That's more of a question for the mm guys: at this point we do the
copying with the ptl released; is there anything else that could have
made the pte old in the meantime? I think unuse_pte() is only called on
anonymous vmas, so it shouldn't be the case here.

>   clear_page(kaddr);
> + }
>   kunmap_atomic(kaddr);
>   flush_dcache_page(dst);
>   } else
> - copy_user_highpage(dst, src, va, vma);
> + copy_user_highpage(dst, src, addr, vma);
> +
> + return true;
>  }

-- 
Catalin


Re: [PATCH v8 3/3] mm: fix double page fault on arm64 if PTE_AF is cleared

2019-09-23 Thread Kirill A. Shutemov
On Sat, Sep 21, 2019 at 09:50:54PM +0800, Jia He wrote:
> When we tested pmdk unit test [1] vmmalloc_fork TEST1 in arm64 guest, there
> will be a double page fault in __copy_from_user_inatomic of cow_user_page.
> 
> Below call trace is from arm64 do_page_fault for debugging purpose
> [  110.016195] Call trace:
> [  110.016826]  do_page_fault+0x5a4/0x690
> [  110.017812]  do_mem_abort+0x50/0xb0
> [  110.018726]  el1_da+0x20/0xc4
> [  110.019492]  __arch_copy_from_user+0x180/0x280
> [  110.020646]  do_wp_page+0xb0/0x860
> [  110.021517]  __handle_mm_fault+0x994/0x1338
> [  110.022606]  handle_mm_fault+0xe8/0x180
> [  110.023584]  do_page_fault+0x240/0x690
> [  110.024535]  do_mem_abort+0x50/0xb0
> [  110.025423]  el0_da+0x20/0x24
> 
> The pte info before __copy_from_user_inatomic is (PTE_AF is cleared):
> [9b007000] pgd=00023d4f8003, pud=00023da9b003, 
> pmd=00023d4b3003, pte=36298607bd3
> 
> As told by Catalin: "On arm64 without hardware Access Flag, copying from
> user will fail because the pte is old and cannot be marked young. So we
> always end up with zeroed page after fork() + CoW for pfn mappings. we
> don't always have a hardware-managed access flag on arm64."
> 
> This patch fix it by calling pte_mkyoung. Also, the parameter is
> changed because vmf should be passed to cow_user_page()
> 
> Add a WARN_ON_ONCE when __copy_from_user_inatomic() returns error
> in case there can be some obscure use-case.(by Kirill)
> 
> [1] https://github.com/pmem/pmdk/tree/master/src/test/vmmalloc_fork
> 
> Reported-by: Yibo Cai 
> Signed-off-by: Jia He 

Acked-by: Kirill A. Shutemov 

-- 
 Kirill A. Shutemov


Re: [PATCH v8 3/3] mm: fix double page fault on arm64 if PTE_AF is cleared

2019-09-21 Thread Matthew Wilcox
On Sat, Sep 21, 2019 at 09:50:54PM +0800, Jia He wrote:
> When we tested pmdk unit test [1] vmmalloc_fork TEST1 in arm64 guest, there
> will be a double page fault in __copy_from_user_inatomic of cow_user_page.
> 
> Below call trace is from arm64 do_page_fault for debugging purpose
> [  110.016195] Call trace:
> [  110.016826]  do_page_fault+0x5a4/0x690
> [  110.017812]  do_mem_abort+0x50/0xb0
> [  110.018726]  el1_da+0x20/0xc4
> [  110.019492]  __arch_copy_from_user+0x180/0x280
> [  110.020646]  do_wp_page+0xb0/0x860
> [  110.021517]  __handle_mm_fault+0x994/0x1338
> [  110.022606]  handle_mm_fault+0xe8/0x180
> [  110.023584]  do_page_fault+0x240/0x690
> [  110.024535]  do_mem_abort+0x50/0xb0
> [  110.025423]  el0_da+0x20/0x24
> 
> The pte info before __copy_from_user_inatomic is (PTE_AF is cleared):
> [9b007000] pgd=00023d4f8003, pud=00023da9b003, 
> pmd=00023d4b3003, pte=36298607bd3
> 
> As told by Catalin: "On arm64 without hardware Access Flag, copying from
> user will fail because the pte is old and cannot be marked young. So we
> always end up with zeroed page after fork() + CoW for pfn mappings. we
> don't always have a hardware-managed access flag on arm64."
> 
> This patch fix it by calling pte_mkyoung. Also, the parameter is
> changed because vmf should be passed to cow_user_page()
> 
> Add a WARN_ON_ONCE when __copy_from_user_inatomic() returns error
> in case there can be some obscure use-case.(by Kirill)
> 
> [1] https://github.com/pmem/pmdk/tree/master/src/test/vmmalloc_fork
> 
> Reported-by: Yibo Cai 
> Signed-off-by: Jia He 

Reviewed-by: Matthew Wilcox (Oracle) 


[PATCH v8 3/3] mm: fix double page fault on arm64 if PTE_AF is cleared

2019-09-21 Thread Jia He
When we tested pmdk unit test [1] vmmalloc_fork TEST1 in arm64 guest, there
will be a double page fault in __copy_from_user_inatomic of cow_user_page.

Below call trace is from arm64 do_page_fault for debugging purpose
[  110.016195] Call trace:
[  110.016826]  do_page_fault+0x5a4/0x690
[  110.017812]  do_mem_abort+0x50/0xb0
[  110.018726]  el1_da+0x20/0xc4
[  110.019492]  __arch_copy_from_user+0x180/0x280
[  110.020646]  do_wp_page+0xb0/0x860
[  110.021517]  __handle_mm_fault+0x994/0x1338
[  110.022606]  handle_mm_fault+0xe8/0x180
[  110.023584]  do_page_fault+0x240/0x690
[  110.024535]  do_mem_abort+0x50/0xb0
[  110.025423]  el0_da+0x20/0x24

The pte info before __copy_from_user_inatomic is (PTE_AF is cleared):
[9b007000] pgd=00023d4f8003, pud=00023da9b003, 
pmd=00023d4b3003, pte=36298607bd3

As told by Catalin: "On arm64 without hardware Access Flag, copying from
user will fail because the pte is old and cannot be marked young. So we
always end up with zeroed page after fork() + CoW for pfn mappings. we
don't always have a hardware-managed access flag on arm64."

This patch fix it by calling pte_mkyoung. Also, the parameter is
changed because vmf should be passed to cow_user_page()

Add a WARN_ON_ONCE when __copy_from_user_inatomic() returns error
in case there can be some obscure use-case.(by Kirill)

[1] https://github.com/pmem/pmdk/tree/master/src/test/vmmalloc_fork

Reported-by: Yibo Cai 
Signed-off-by: Jia He 
---
 mm/memory.c | 67 -
 1 file changed, 61 insertions(+), 6 deletions(-)

diff --git a/mm/memory.c b/mm/memory.c
index e2bb51b6242e..ae09b070b04d 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -118,6 +118,13 @@ int randomize_va_space __read_mostly =
2;
 #endif
 
+#ifndef arch_faults_on_old_pte
+static inline bool arch_faults_on_old_pte(void)
+{
+   return false;
+}
+#endif
+
 static int __init disable_randmaps(char *s)
 {
randomize_va_space = 0;
@@ -2140,8 +2147,13 @@ static inline int pte_unmap_same(struct mm_struct *mm, 
pmd_t *pmd,
return same;
 }
 
-static inline void cow_user_page(struct page *dst, struct page *src, unsigned 
long va, struct vm_area_struct *vma)
+static inline bool cow_user_page(struct page *dst, struct page *src,
+struct vm_fault *vmf)
 {
+   struct vm_area_struct *vma = vmf->vma;
+   struct mm_struct *mm = vma->vm_mm;
+   unsigned long addr = vmf->address;
+
debug_dma_assert_idle(src);
 
/*
@@ -2151,21 +2163,53 @@ static inline void cow_user_page(struct page *dst, 
struct page *src, unsigned lo
 * fails, we just zero-fill it. Live with it.
 */
if (unlikely(!src)) {
-   void *kaddr = kmap_atomic(dst);
-   void __user *uaddr = (void __user *)(va & PAGE_MASK);
+   void *kaddr;
+   pte_t entry;
+   void __user *uaddr = (void __user *)(addr & PAGE_MASK);
 
+   /* On architectures with software "accessed" bits, we would
+* take a double page fault, so mark it accessed here.
+*/
+   if (arch_faults_on_old_pte() && !pte_young(vmf->orig_pte)) {
+   vmf->pte = pte_offset_map_lock(mm, vmf->pmd, addr,
+  >ptl);
+   if (likely(pte_same(*vmf->pte, vmf->orig_pte))) {
+   entry = pte_mkyoung(vmf->orig_pte);
+   if (ptep_set_access_flags(vma, addr,
+ vmf->pte, entry, 0))
+   update_mmu_cache(vma, addr, vmf->pte);
+   } else {
+   /* Other thread has already handled the fault
+* and we don't need to do anything. If it's
+* not the case, the fault will be triggered
+* again on the same address.
+*/
+   pte_unmap_unlock(vmf->pte, vmf->ptl);
+   return false;
+   }
+   pte_unmap_unlock(vmf->pte, vmf->ptl);
+   }
+
+   kaddr = kmap_atomic(dst);
/*
 * This really shouldn't fail, because the page is there
 * in the page tables. But it might just be unreadable,
 * in which case we just give up and fill the result with
 * zeroes.
 */
-   if (__copy_from_user_inatomic(kaddr, uaddr, PAGE_SIZE))
+   if (__copy_from_user_inatomic(kaddr, uaddr, PAGE_SIZE)) {
+   /* Give a warn in case there can be some obscure
+* use-case
+*/
+