On 25/07/2018 11:04, zhong jiang wrote:
> On 2018/7/25 0:10, Laurent Dufour wrote:
>>
>> On 24/07/2018 16:26, zhong jiang wrote:
>>> On 2018/5/17 19:06, Laurent Dufour wrote:
>>>> From: Peter Zijlstra <pet...@infradead.org>
>>>>
>>>> Provide infrastructure to do a speculative fault (not holding
>>>> mmap_sem).
>>>>
>>>> The not holding of mmap_sem means we can race against VMA
>>>> change/removal and page-table destruction. We use the SRCU VMA freeing
>>>> to keep the VMA around. We use the VMA seqcount to detect change
>>>> (including umapping / page-table deletion) and we use gup_fast() style
>>>> page-table walking to deal with page-table races.
>>>>
>>>> Once we've obtained the page and are ready to update the PTE, we
>>>> validate if the state we started the fault with is still valid, if
>>>> not, we'll fail the fault with VM_FAULT_RETRY, otherwise we update the
>>>> PTE and we're done.
>>>>
>>>> Signed-off-by: Peter Zijlstra (Intel) <pet...@infradead.org>
>>>>
>>>> [Manage the newly introduced pte_spinlock() for speculative page
>>>>  fault to fail if the VMA is touched in our back]
>>>> [Rename vma_is_dead() to vma_has_changed() and declare it here]
>>>> [Fetch p4d and pud]
>>>> [Set vmd.sequence in __handle_mm_fault()]
>>>> [Abort speculative path when handle_userfault() has to be called]
>>>> [Add additional VMA's flags checks in handle_speculative_fault()]
>>>> [Clear FAULT_FLAG_ALLOW_RETRY in handle_speculative_fault()]
>>>> [Don't set vmf->pte and vmf->ptl if pte_map_lock() failed]
>>>> [Remove warning comment about waiting for !seq&1 since we don't want
>>>>  to wait]
>>>> [Remove warning about no huge page support, mention it explictly]
>>>> [Don't call do_fault() in the speculative path as __do_fault() calls
>>>>  vma->vm_ops->fault() which may want to release mmap_sem]
>>>> [Only vm_fault pointer argument for vma_has_changed()]
>>>> [Fix check against huge page, calling pmd_trans_huge()]
>>>> [Use READ_ONCE() when reading VMA's fields in the speculative path]
>>>> [Explicitly check for __HAVE_ARCH_PTE_SPECIAL as we can't support for
>>>>  processing done in vm_normal_page()]
>>>> [Check that vma->anon_vma is already set when starting the speculative
>>>>  path]
>>>> [Check for memory policy as we can't support MPOL_INTERLEAVE case due to
>>>>  the processing done in mpol_misplaced()]
>>>> [Don't support VMA growing up or down]
>>>> [Move check on vm_sequence just before calling handle_pte_fault()]
>>>> [Don't build SPF services if !CONFIG_SPECULATIVE_PAGE_FAULT]
>>>> [Add mem cgroup oom check]
>>>> [Use READ_ONCE to access p*d entries]
>>>> [Replace deprecated ACCESS_ONCE() by READ_ONCE() in vma_has_changed()]
>>>> [Don't fetch pte again in handle_pte_fault() when running the speculative
>>>>  path]
>>>> [Check PMD against concurrent collapsing operation]
>>>> [Try spin lock the pte during the speculative path to avoid deadlock with
>>>>  other CPU's invalidating the TLB and requiring this CPU to catch the
>>>>  inter processor's interrupt]
>>>> [Move define of FAULT_FLAG_SPECULATIVE here]
>>>> [Introduce __handle_speculative_fault() and add a check against
>>>>  mm->mm_users in handle_speculative_fault() defined in mm.h]
>>>> Signed-off-by: Laurent Dufour <lduf...@linux.vnet.ibm.com>
>>>> ---
>>>>  include/linux/hugetlb_inline.h |   2 +-
>>>>  include/linux/mm.h             |  30 ++++
>>>>  include/linux/pagemap.h        |   4 +-
>>>>  mm/internal.h                  |  16 +-
>>>>  mm/memory.c                    | 340 
>>>> ++++++++++++++++++++++++++++++++++++++++-
>>>>  5 files changed, 385 insertions(+), 7 deletions(-)
>>>>
>>>> diff --git a/include/linux/hugetlb_inline.h 
>>>> b/include/linux/hugetlb_inline.h
>>>> index 0660a03d37d9..9e25283d6fc9 100644
>>>> --- a/include/linux/hugetlb_inline.h
>>>> +++ b/include/linux/hugetlb_inline.h
>>>> @@ -8,7 +8,7 @@
>>>>  
>>>>  static inline bool is_vm_hugetlb_page(struct vm_area_struct *vma)
>>>>  {
>>>> -  return !!(vma->vm_flags & VM_HUGETLB);
>>>> +  return !!(READ_ONCE(vma->vm_flags) & VM_HUGETLB);
>>>>  }
>>>>  
>>>>  #else
>>>> diff --git a/include/linux/mm.h b/include/linux/mm.h
>>>> index 05cbba70104b..31acf98a7d92 100644
>>>> --- a/include/linux/mm.h
>>>> +++ b/include/linux/mm.h
>>>> @@ -315,6 +315,7 @@ extern pgprot_t protection_map[16];
>>>>  #define FAULT_FLAG_USER           0x40    /* The fault originated in 
>>>> userspace */
>>>>  #define FAULT_FLAG_REMOTE 0x80    /* faulting for non current tsk/mm */
>>>>  #define FAULT_FLAG_INSTRUCTION  0x100     /* The fault was during an 
>>>> instruction fetch */
>>>> +#define FAULT_FLAG_SPECULATIVE    0x200   /* Speculative fault, not 
>>>> holding mmap_sem */
>>>>  
>>>>  #define FAULT_FLAG_TRACE \
>>>>    { FAULT_FLAG_WRITE,             "WRITE" }, \
>>>> @@ -343,6 +344,10 @@ struct vm_fault {
>>>>    gfp_t gfp_mask;                 /* gfp mask to be used for allocations 
>>>> */
>>>>    pgoff_t pgoff;                  /* Logical page offset based on vma */
>>>>    unsigned long address;          /* Faulting virtual address */
>>>> +#ifdef CONFIG_SPECULATIVE_PAGE_FAULT
>>>> +  unsigned int sequence;
>>>> +  pmd_t orig_pmd;                 /* value of PMD at the time of fault */
>>>> +#endif
>>>>    pmd_t *pmd;                     /* Pointer to pmd entry matching
>>>>                                     * the 'address' */
>>>>    pud_t *pud;                     /* Pointer to pud entry matching
>>>> @@ -1415,6 +1420,31 @@ int invalidate_inode_page(struct page *page);
>>>>  #ifdef CONFIG_MMU
>>>>  extern int handle_mm_fault(struct vm_area_struct *vma, unsigned long 
>>>> address,
>>>>            unsigned int flags);
>>>> +
>>>> +#ifdef CONFIG_SPECULATIVE_PAGE_FAULT
>>>> +extern int __handle_speculative_fault(struct mm_struct *mm,
>>>> +                                unsigned long address,
>>>> +                                unsigned int flags);
>>>> +static inline int handle_speculative_fault(struct mm_struct *mm,
>>>> +                                     unsigned long address,
>>>> +                                     unsigned int flags)
>>>> +{
>>>> +  /*
>>>> +   * Try speculative page fault for multithreaded user space task only.
>>>> +   */
>>>> +  if (!(flags & FAULT_FLAG_USER) || atomic_read(&mm->mm_users) == 1)
>>>> +          return VM_FAULT_RETRY;
>>>> +  return __handle_speculative_fault(mm, address, flags);
>>>> +}
>>>> +#else
>>>> +static inline int handle_speculative_fault(struct mm_struct *mm,
>>>> +                                     unsigned long address,
>>>> +                                     unsigned int flags)
>>>> +{
>>>> +  return VM_FAULT_RETRY;
>>>> +}
>>>> +#endif /* CONFIG_SPECULATIVE_PAGE_FAULT */
>>>> +
>>>>  extern int fixup_user_fault(struct task_struct *tsk, struct mm_struct *mm,
>>>>                        unsigned long address, unsigned int fault_flags,
>>>>                        bool *unlocked);
>>>> diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h
>>>> index b1bd2186e6d2..6e2aa4e79af7 100644
>>>> --- a/include/linux/pagemap.h
>>>> +++ b/include/linux/pagemap.h
>>>> @@ -456,8 +456,8 @@ static inline pgoff_t linear_page_index(struct 
>>>> vm_area_struct *vma,
>>>>    pgoff_t pgoff;
>>>>    if (unlikely(is_vm_hugetlb_page(vma)))
>>>>            return linear_hugepage_index(vma, address);
>>>> -  pgoff = (address - vma->vm_start) >> PAGE_SHIFT;
>>>> -  pgoff += vma->vm_pgoff;
>>>> +  pgoff = (address - READ_ONCE(vma->vm_start)) >> PAGE_SHIFT;
>>>> +  pgoff += READ_ONCE(vma->vm_pgoff);
>>>>    return pgoff;
>>>>  }
>>>>  
>>>> diff --git a/mm/internal.h b/mm/internal.h
>>>> index fb2667b20f0a..10b188c87fa4 100644
>>>> --- a/mm/internal.h
>>>> +++ b/mm/internal.h
>>>> @@ -44,7 +44,21 @@ int do_swap_page(struct vm_fault *vmf);
>>>>  extern struct vm_area_struct *get_vma(struct mm_struct *mm,
>>>>                                  unsigned long addr);
>>>>  extern void put_vma(struct vm_area_struct *vma);
>>>> -#endif
>>>> +
>>>> +static inline bool vma_has_changed(struct vm_fault *vmf)
>>>> +{
>>>> +  int ret = RB_EMPTY_NODE(&vmf->vma->vm_rb);
>>>> +  unsigned int seq = READ_ONCE(vmf->vma->vm_sequence.sequence);
>>>> +
>>>> +  /*
>>>> +   * Matches both the wmb in write_seqlock_{begin,end}() and
>>>> +   * the wmb in vma_rb_erase().
>>>> +   */
>>>> +  smp_rmb();
>>>> +
>>>> +  return ret || seq != vmf->sequence;
>>>> +}
>>>> +#endif /* CONFIG_SPECULATIVE_PAGE_FAULT */
>>>>  
>>>>  void free_pgtables(struct mmu_gather *tlb, struct vm_area_struct 
>>>> *start_vma,
>>>>            unsigned long floor, unsigned long ceiling);
>>>> diff --git a/mm/memory.c b/mm/memory.c
>>>> index ab32b0b4bd69..7bbbb8c7b9cd 100644
>>>> --- a/mm/memory.c
>>>> +++ b/mm/memory.c
>>>> @@ -769,7 +769,8 @@ static void print_bad_pte(struct vm_area_struct *vma, 
>>>> unsigned long addr,
>>>>    if (page)
>>>>            dump_page(page, "bad pte");
>>>>    pr_alert("addr:%p vm_flags:%08lx anon_vma:%p mapping:%p index:%lx\n",
>>>> -           (void *)addr, vma->vm_flags, vma->anon_vma, mapping, index);
>>>> +           (void *)addr, READ_ONCE(vma->vm_flags), vma->anon_vma,
>>>> +           mapping, index);
>>>>    pr_alert("file:%pD fault:%pf mmap:%pf readpage:%pf\n",
>>>>             vma->vm_file,
>>>>             vma->vm_ops ? vma->vm_ops->fault : NULL,
>>>> @@ -2306,6 +2307,118 @@ int apply_to_page_range(struct mm_struct *mm, 
>>>> unsigned long addr,
>>>>  }
>>>>  EXPORT_SYMBOL_GPL(apply_to_page_range);
>>>>  
>>>> +#ifdef CONFIG_SPECULATIVE_PAGE_FAULT
>>>> +static bool pte_spinlock(struct vm_fault *vmf)
>>>> +{
>>>> +  bool ret = false;
>>>> +#ifdef CONFIG_TRANSPARENT_HUGEPAGE
>>>> +  pmd_t pmdval;
>>>> +#endif
>>>> +
>>>> +  /* Check if vma is still valid */
>>>> +  if (!(vmf->flags & FAULT_FLAG_SPECULATIVE)) {
>>>> +          vmf->ptl = pte_lockptr(vmf->vma->vm_mm, vmf->pmd);
>>>> +          spin_lock(vmf->ptl);
>>>> +          return true;
>>>> +  }
>>>> +
>>>> +again:
>>>> +  local_irq_disable();
>>>> +  if (vma_has_changed(vmf))
>>>> +          goto out;
>>>> +
>>>> +#ifdef CONFIG_TRANSPARENT_HUGEPAGE
>>>> +  /*
>>>> +   * We check if the pmd value is still the same to ensure that there
>>>> +   * is not a huge collapse operation in progress in our back.
>>>> +   */
>>>> +  pmdval = READ_ONCE(*vmf->pmd);
>>>> +  if (!pmd_same(pmdval, vmf->orig_pmd))
>>>> +          goto out;
>>>> +#endif
>>>> +
>>>> +  vmf->ptl = pte_lockptr(vmf->vma->vm_mm, vmf->pmd);
>>>> +  if (unlikely(!spin_trylock(vmf->ptl))) {
>>>> +          local_irq_enable();
>>>> +          goto again;
>>>> +  }
>>>> +
>>>> +  if (vma_has_changed(vmf)) {
>>>> +          spin_unlock(vmf->ptl);
>>>> +          goto out;
>>>> +  }
>>>> +
>>>> +  ret = true;
>>>> +out:
>>>> +  local_irq_enable();
>>>> +  return ret;
>>>> +}
>>>> +
>>>> +static bool pte_map_lock(struct vm_fault *vmf)
>>>> +{
>>>> +  bool ret = false;
>>>> +  pte_t *pte;
>>>> +  spinlock_t *ptl;
>>>> +#ifdef CONFIG_TRANSPARENT_HUGEPAGE
>>>> +  pmd_t pmdval;
>>>> +#endif
>>>> +
>>>> +  if (!(vmf->flags & FAULT_FLAG_SPECULATIVE)) {
>>>> +          vmf->pte = pte_offset_map_lock(vmf->vma->vm_mm, vmf->pmd,
>>>> +                                         vmf->address, &vmf->ptl);
>>>> +          return true;
>>>> +  }
>>>> +
>>>> +  /*
>>>> +   * The first vma_has_changed() guarantees the page-tables are still
>>>> +   * valid, having IRQs disabled ensures they stay around, hence the
>>>> +   * second vma_has_changed() to make sure they are still valid once
>>>> +   * we've got the lock. After that a concurrent zap_pte_range() will
>>>> +   * block on the PTL and thus we're safe.
>>>> +   */
>>>> +again:
>>>> +  local_irq_disable();
>>>> +  if (vma_has_changed(vmf))
>>>> +          goto out;
>>>> +
>>>> +#ifdef CONFIG_TRANSPARENT_HUGEPAGE
>>>> +  /*
>>>> +   * We check if the pmd value is still the same to ensure that there
>>>> +   * is not a huge collapse operation in progress in our back.
>>>> +   */
>>>> +  pmdval = READ_ONCE(*vmf->pmd);
>>>> +  if (!pmd_same(pmdval, vmf->orig_pmd))
>>>> +          goto out;
>>>> +#endif
>>>> +
>>>> +  /*
>>>> +   * Same as pte_offset_map_lock() except that we call
>>>> +   * spin_trylock() in place of spin_lock() to avoid race with
>>>> +   * unmap path which may have the lock and wait for this CPU
>>>> +   * to invalidate TLB but this CPU has irq disabled.
>>>> +   * Since we are in a speculative patch, accept it could fail
>>>> +   */
>>>> +  ptl = pte_lockptr(vmf->vma->vm_mm, vmf->pmd);
>>>> +  pte = pte_offset_map(vmf->pmd, vmf->address);
>>>> +  if (unlikely(!spin_trylock(ptl))) {
>>>> +          pte_unmap(pte);
>>>> +          local_irq_enable();
>>>> +          goto again;
>>>> +  }
>>>> +
>>>> +  if (vma_has_changed(vmf)) {
>>>> +          pte_unmap_unlock(pte, ptl);
>>>> +          goto out;
>>>> +  }
>>>> +
>>>> +  vmf->pte = pte;
>>>> +  vmf->ptl = ptl;
>>>> +  ret = true;
>>>> +out:
>>>> +  local_irq_enable();
>>>> +  return ret;
>>>> +}
>>>> +#else
>>>>  static inline bool pte_spinlock(struct vm_fault *vmf)
>>>>  {
>>>>    vmf->ptl = pte_lockptr(vmf->vma->vm_mm, vmf->pmd);
>>>> @@ -2319,6 +2432,7 @@ static inline bool pte_map_lock(struct vm_fault *vmf)
>>>>                                   vmf->address, &vmf->ptl);
>>>>    return true;
>>>>  }
>>>> +#endif /* CONFIG_SPECULATIVE_PAGE_FAULT */
>>>>  
>>>>  /*
>>>>   * handle_pte_fault chooses page fault handler according to an entry 
>>>> which was
>>>> @@ -3208,6 +3322,14 @@ static int do_anonymous_page(struct vm_fault *vmf)
>>>>            ret = check_stable_address_space(vma->vm_mm);
>>>>            if (ret)
>>>>                    goto unlock;
>>>> +          /*
>>>> +           * Don't call the userfaultfd during the speculative path.
>>>> +           * We already checked for the VMA to not be managed through
>>>> +           * userfaultfd, but it may be set in our back once we have lock
>>>> +           * the pte. In such a case we can ignore it this time.
>>>> +           */
>>>> +          if (vmf->flags & FAULT_FLAG_SPECULATIVE)
>>>> +                  goto setpte;
>>>>            /* Deliver the page fault to userland, check inside PT lock */
>>>>            if (userfaultfd_missing(vma)) {
>>>>                    pte_unmap_unlock(vmf->pte, vmf->ptl);
>>>> @@ -3249,7 +3371,7 @@ static int do_anonymous_page(struct vm_fault *vmf)
>>>>            goto unlock_and_release;
>>>>  
>>>>    /* Deliver the page fault to userland, check inside PT lock */
>>>> -  if (userfaultfd_missing(vma)) {
>>>> +  if (!(vmf->flags & FAULT_FLAG_SPECULATIVE) && userfaultfd_missing(vma)) 
>>>> {
>>>>            pte_unmap_unlock(vmf->pte, vmf->ptl);
>>>>            mem_cgroup_cancel_charge(page, memcg, false);
>>>>            put_page(page);
>>>> @@ -3994,13 +4116,22 @@ static int handle_pte_fault(struct vm_fault *vmf)
>>>>  
>>>>    if (unlikely(pmd_none(*vmf->pmd))) {
>>>>            /*
>>>> +           * In the case of the speculative page fault handler we abort
>>>> +           * the speculative path immediately as the pmd is probably
>>>> +           * in the way to be converted in a huge one. We will try
>>>> +           * again holding the mmap_sem (which implies that the collapse
>>>> +           * operation is done).
>>>> +           */
>>>> +          if (vmf->flags & FAULT_FLAG_SPECULATIVE)
>>>> +                  return VM_FAULT_RETRY;
>>>> +          /*
>>>>             * Leave __pte_alloc() until later: because vm_ops->fault may
>>>>             * want to allocate huge page, and if we expose page table
>>>>             * for an instant, it will be difficult to retract from
>>>>             * concurrent faults and from rmap lookups.
>>>>             */
>>>>            vmf->pte = NULL;
>>>> -  } else {
>>>> +  } else if (!(vmf->flags & FAULT_FLAG_SPECULATIVE)) {
>>>>            /* See comment in pte_alloc_one_map() */
>>>>            if (pmd_devmap_trans_unstable(vmf->pmd))
>>>>                    return 0;
>>>> @@ -4009,6 +4140,9 @@ static int handle_pte_fault(struct vm_fault *vmf)
>>>>             * pmd from under us anymore at this point because we hold the
>>>>             * mmap_sem read mode and khugepaged takes it in write mode.
>>>>             * So now it's safe to run pte_offset_map().
>>>> +           * This is not applicable to the speculative page fault handler
>>>> +           * but in that case, the pte is fetched earlier in
>>>> +           * handle_speculative_fault().
>>>>             */
>>>>            vmf->pte = pte_offset_map(vmf->pmd, vmf->address);
>>>>            vmf->orig_pte = *vmf->pte;
>>>> @@ -4031,6 +4165,8 @@ static int handle_pte_fault(struct vm_fault *vmf)
>>>>    if (!vmf->pte) {
>>>>            if (vma_is_anonymous(vmf->vma))
>>>>                    return do_anonymous_page(vmf);
>>>> +          else if (vmf->flags & FAULT_FLAG_SPECULATIVE)
>>>> +                  return VM_FAULT_RETRY;
>>>>            else
>>>>                    return do_fault(vmf);
>>>>    }
>>>> @@ -4128,6 +4264,9 @@ static int __handle_mm_fault(struct vm_area_struct 
>>>> *vma, unsigned long address,
>>>>    vmf.pmd = pmd_alloc(mm, vmf.pud, address);
>>>>    if (!vmf.pmd)
>>>>            return VM_FAULT_OOM;
>>>> +#ifdef CONFIG_SPECULATIVE_PAGE_FAULT
>>>> +  vmf.sequence = raw_read_seqcount(&vma->vm_sequence);
>>>> +#endif
>>>>    if (pmd_none(*vmf.pmd) && transparent_hugepage_enabled(vma)) {
>>>>            ret = create_huge_pmd(&vmf);
>>>>            if (!(ret & VM_FAULT_FALLBACK))
>>>> @@ -4161,6 +4300,201 @@ static int __handle_mm_fault(struct vm_area_struct 
>>>> *vma, unsigned long address,
>>>>    return handle_pte_fault(&vmf);
>>>>  }
>>>>  
>>>> +#ifdef CONFIG_SPECULATIVE_PAGE_FAULT
>>>> +/*
>>>> + * Tries to handle the page fault in a speculative way, without grabbing 
>>>> the
>>>> + * mmap_sem.
>>>> + */
>>>> +int __handle_speculative_fault(struct mm_struct *mm, unsigned long 
>>>> address,
>>>> +                         unsigned int flags)
>>>> +{
>>>> +  struct vm_fault vmf = {
>>>> +          .address = address,
>>>> +  };
>>>> +  pgd_t *pgd, pgdval;
>>>> +  p4d_t *p4d, p4dval;
>>>> +  pud_t pudval;
>>>> +  int seq, ret = VM_FAULT_RETRY;
>>>> +  struct vm_area_struct *vma;
>>>> +#ifdef CONFIG_NUMA
>>>> +  struct mempolicy *pol;
>>>> +#endif
>>>> +
>>>> +  /* Clear flags that may lead to release the mmap_sem to retry */
>>>> +  flags &= ~(FAULT_FLAG_ALLOW_RETRY|FAULT_FLAG_KILLABLE);
>>>> +  flags |= FAULT_FLAG_SPECULATIVE;
>>>> +
>>>> +  vma = get_vma(mm, address);
>>>> +  if (!vma)
>>>> +          return ret;
>>>> +
>>>> +  seq = raw_read_seqcount(&vma->vm_sequence); /* rmb <-> 
>>>> seqlock,vma_rb_erase() */
>>>> +  if (seq & 1)
>>>> +          goto out_put;
>>>> +
>>>> +  /*
>>>> +   * Can't call vm_ops service has we don't know what they would do
>>>> +   * with the VMA.
>>>> +   * This include huge page from hugetlbfs.
>>>> +   */
>>>> +  if (vma->vm_ops)
>>>> +          goto out_put;
>>>> +
>>>   Hi   Laurent
>>>    
>>>    I think that most of pagefault will leave here.   Is there any case  
>>> need to skip ?
>>>   I have tested the following  patch, it work well.
>> Hi Zhong,
>>
>> Well this will allow file mapping to be handle in a speculative way, but 
>> that's
>> a bit dangerous today as there is no guaranty that the vm_ops.vm_fault()
>> operation will be fair.
>>
>> In the case of the anonymous file mapping that's often not a problem, 
>> depending
>> on the underlying file system, but there are so many cases to check and this 
>> is
>> hard to say this can be done in a speculative way as is.
>  This patch say that spf just handle anonyous page. but I find that 
> do_swap_page
>  also maybe release the mmap_sem without FAULT_FLAG_RETRY_NOWAIT. why is it 
> safe
>  to handle the case. I think that the case is similar to file page. Maybe I 
> miss
>  something else.

do_swap_page() may released the mmap_sem through the call to
__lock_page_or_retry(), but this can only happen if FAULT_FLAG_ALLOW_RETRY or
FAULT_FLAG_KILLABLE are set and they are unset in __handle_speculative_fault().

> 
>  I test the patches and find just only 18% of the pagefault will enter into 
> the
>  speculative page fault during a process startup. As I had said. most of 
> pagefault
>  will be handled by ops->fault. I do not know the data you had posted is how 
> to get.

I do agree that handling file mapping will be required, but this will add more
complexity to this series, since we need a way for drivers to tell they are
compatible with the speculative path.

May be I should give it a try on the next send.

For my information, what was the performance improvement you seen when handling
file page faulting this way ?

Thanks,
Laurent.

> 
> 
>  Thanks
>  zhong jiang
>> The huge work to do is to double check that all the code called by
>> vm_ops.fault() is not dealing with the mmap_sem, which could be handled using
>> FAULT_FLAG_RETRY_NOWAIT, and care is also needed about the resources that 
>> code
>> is managing as it may assume that it is under the protection of the mmap_sem 
>> in
>> read mode, and that can be done implicitly.
>>
>> Cheers,
>> Laurent.
>>
>>> diff --git a/mm/memory.c b/mm/memory.c
>>> index 936128b..9bc1545 100644
>>>  @@ -3893,8 +3898,6 @@ static int handle_pte_fault(struct fault_env *fe)
>>>         if (!fe->pte) {
>>>                 if (vma_is_anonymous(fe->vma))
>>>                         return do_anonymous_page(fe);
>>> -               else if (fe->flags & FAULT_FLAG_SPECULATIVE)
>>> -                       return VM_FAULT_RETRY;
>>>                 else
>>>                         return do_fault(fe);
>>>         }
>>> @@ -4026,20 +4029,11 @@ int __handle_speculative_fault(struct mm_struct 
>>> *mm, unsigned long address,
>>>                 goto out_put;
>>>         }
>>>         /*
>>> -        * Can't call vm_ops service has we don't know what they would do
>>> -        * with the VMA.
>>> -        * This include huge page from hugetlbfs.
>>> -        */
>>> -       if (vma->vm_ops) {
>>> -               trace_spf_vma_notsup(_RET_IP_, vma, address);
>>> -               goto out_put;
>>> -       }
>>>
>>>
>>> Thanks
>>> zhong jiang
>>>> +  /*
>>>> +   * __anon_vma_prepare() requires the mmap_sem to be held
>>>> +   * because vm_next and vm_prev must be safe. This can't be guaranteed
>>>> +   * in the speculative path.
>>>> +   */
>>>> +  if (unlikely(!vma->anon_vma))
>>>> +          goto out_put;
>>>> +
>>>> +  vmf.vma_flags = READ_ONCE(vma->vm_flags);
>>>> +  vmf.vma_page_prot = READ_ONCE(vma->vm_page_prot);
>>>> +
>>>> +  /* Can't call userland page fault handler in the speculative path */
>>>> +  if (unlikely(vmf.vma_flags & VM_UFFD_MISSING))
>>>> +          goto out_put;
>>>> +
>>>> +  if (vmf.vma_flags & VM_GROWSDOWN || vmf.vma_flags & VM_GROWSUP)
>>>> +          /*
>>>> +           * This could be detected by the check address against VMA's
>>>> +           * boundaries but we want to trace it as not supported instead
>>>> +           * of changed.
>>>> +           */
>>>> +          goto out_put;
>>>> +
>>>> +  if (address < READ_ONCE(vma->vm_start)
>>>> +      || READ_ONCE(vma->vm_end) <= address)
>>>> +          goto out_put;
>>>> +
>>>> +  if (!arch_vma_access_permitted(vma, flags & FAULT_FLAG_WRITE,
>>>> +                                 flags & FAULT_FLAG_INSTRUCTION,
>>>> +                                 flags & FAULT_FLAG_REMOTE)) {
>>>> +          ret = VM_FAULT_SIGSEGV;
>>>> +          goto out_put;
>>>> +  }
>>>> +
>>>> +  /* This is one is required to check that the VMA has write access set */
>>>> +  if (flags & FAULT_FLAG_WRITE) {
>>>> +          if (unlikely(!(vmf.vma_flags & VM_WRITE))) {
>>>> +                  ret = VM_FAULT_SIGSEGV;
>>>> +                  goto out_put;
>>>> +          }
>>>> +  } else if (unlikely(!(vmf.vma_flags & (VM_READ|VM_EXEC|VM_WRITE)))) {
>>>> +          ret = VM_FAULT_SIGSEGV;
>>>> +          goto out_put;
>>>> +  }
>>>> +
>>>> +#ifdef CONFIG_NUMA
>>>> +  /*
>>>> +   * MPOL_INTERLEAVE implies additional checks in
>>>> +   * mpol_misplaced() which are not compatible with the
>>>> +   *speculative page fault processing.
>>>> +   */
>>>> +  pol = __get_vma_policy(vma, address);
>>>> +  if (!pol)
>>>> +          pol = get_task_policy(current);
>>>> +  if (pol && pol->mode == MPOL_INTERLEAVE)
>>>> +          goto out_put;
>>>> +#endif
>>>> +
>>>> +  /*
>>>> +   * Do a speculative lookup of the PTE entry.
>>>> +   */
>>>> +  local_irq_disable();
>>>> +  pgd = pgd_offset(mm, address);
>>>> +  pgdval = READ_ONCE(*pgd);
>>>> +  if (pgd_none(pgdval) || unlikely(pgd_bad(pgdval)))
>>>> +          goto out_walk;
>>>> +
>>>> +  p4d = p4d_offset(pgd, address);
>>>> +  p4dval = READ_ONCE(*p4d);
>>>> +  if (p4d_none(p4dval) || unlikely(p4d_bad(p4dval)))
>>>> +          goto out_walk;
>>>> +
>>>> +  vmf.pud = pud_offset(p4d, address);
>>>> +  pudval = READ_ONCE(*vmf.pud);
>>>> +  if (pud_none(pudval) || unlikely(pud_bad(pudval)))
>>>> +          goto out_walk;
>>>> +
>>>> +  /* Huge pages at PUD level are not supported. */
>>>> +  if (unlikely(pud_trans_huge(pudval)))
>>>> +          goto out_walk;
>>>> +
>>>> +  vmf.pmd = pmd_offset(vmf.pud, address);
>>>> +  vmf.orig_pmd = READ_ONCE(*vmf.pmd);
>>>> +  /*
>>>> +   * pmd_none could mean that a hugepage collapse is in progress
>>>> +   * in our back as collapse_huge_page() mark it before
>>>> +   * invalidating the pte (which is done once the IPI is catched
>>>> +   * by all CPU and we have interrupt disabled).
>>>> +   * For this reason we cannot handle THP in a speculative way since we
>>>> +   * can't safely indentify an in progress collapse operation done in our
>>>> +   * back on that PMD.
>>>> +   * Regarding the order of the following checks, see comment in
>>>> +   * pmd_devmap_trans_unstable()
>>>> +   */
>>>> +  if (unlikely(pmd_devmap(vmf.orig_pmd) ||
>>>> +               pmd_none(vmf.orig_pmd) || pmd_trans_huge(vmf.orig_pmd) ||
>>>> +               is_swap_pmd(vmf.orig_pmd)))
>>>> +          goto out_walk;
>>>> +
>>>> +  /*
>>>> +   * The above does not allocate/instantiate page-tables because doing so
>>>> +   * would lead to the possibility of instantiating page-tables after
>>>> +   * free_pgtables() -- and consequently leaking them.
>>>> +   *
>>>> +   * The result is that we take at least one !speculative fault per PMD
>>>> +   * in order to instantiate it.
>>>> +   */
>>>> +
>>>> +  vmf.pte = pte_offset_map(vmf.pmd, address);
>>>> +  vmf.orig_pte = READ_ONCE(*vmf.pte);
>>>> +  barrier(); /* See comment in handle_pte_fault() */
>>>> +  if (pte_none(vmf.orig_pte)) {
>>>> +          pte_unmap(vmf.pte);
>>>> +          vmf.pte = NULL;
>>>> +  }
>>>> +
>>>> +  vmf.vma = vma;
>>>> +  vmf.pgoff = linear_page_index(vma, address);
>>>> +  vmf.gfp_mask = __get_fault_gfp_mask(vma);
>>>> +  vmf.sequence = seq;
>>>> +  vmf.flags = flags;
>>>> +
>>>> +  local_irq_enable();
>>>> +
>>>> +  /*
>>>> +   * We need to re-validate the VMA after checking the bounds, otherwise
>>>> +   * we might have a false positive on the bounds.
>>>> +   */
>>>> +  if (read_seqcount_retry(&vma->vm_sequence, seq))
>>>> +          goto out_put;
>>>> +
>>>> +  mem_cgroup_oom_enable();
>>>> +  ret = handle_pte_fault(&vmf);
>>>> +  mem_cgroup_oom_disable();
>>>> +
>>>> +  put_vma(vma);
>>>> +
>>>> +  /*
>>>> +   * The task may have entered a memcg OOM situation but
>>>> +   * if the allocation error was handled gracefully (no
>>>> +   * VM_FAULT_OOM), there is no need to kill anything.
>>>> +   * Just clean up the OOM state peacefully.
>>>> +   */
>>>> +  if (task_in_memcg_oom(current) && !(ret & VM_FAULT_OOM))
>>>> +          mem_cgroup_oom_synchronize(false);
>>>> +  return ret;
>>>> +
>>>> +out_walk:
>>>> +  local_irq_enable();
>>>> +out_put:
>>>> +  put_vma(vma);
>>>> +  return ret;
>>>> +}
>>>> +#endif /* CONFIG_SPECULATIVE_PAGE_FAULT */
>>>> +
>>>>  /*
>>>>   * By the time we get here, we already hold the mm semaphore
>>>>   *
>>>
>>
>> .
>>
> 
> 

Reply via email to