On 26/05/26 8:20 pm, Yin Tirui wrote:
> Classify the source PMD via pmd_present() and vm_normal_folio_pmd(),
> matching the way the PTE path uses pte_present() and vm_normal_page().
> This moves the present-PMD decision from VMA identity checks to the
> actual PMD/folio state.
> 
> Drop the defensive "if (!pmd_trans_huge(pmd)) goto out_unlock" branch:
> with mmap_write_lock held during fork, it should not occur.

Split this from this patch? This is a functional change so will be better
to review it separately.

> 
> Extract the present-PMD side of copy_huge_pmd() into
> copy_present_huge_pmd(). The helper owns the child pgtable passed by the
> caller: it either deposits the pgtable when installing a copied PMD, or
> frees it on paths that do not install one.
> 
> The child pgtable is now allocated once up front and freed on every skip
> path. This makes file/shmem and PFNMAP/special skip paths take the PMD
> locks and free the preallocated pgtable before returning. These are not
> expected to be hot paths, and the PFNMAP case is reused by the follow-up
> PMD PFNMAP copy support.
> 
> Signed-off-by: Yin Tirui <[email protected]>
> ---

Since the series is still marked RFC, I think it would be better to send
the refactor patches separately?

>  mm/huge_memory.c | 175 +++++++++++++++++++++++++----------------------
>  1 file changed, 95 insertions(+), 80 deletions(-)
> 
> diff --git a/mm/huge_memory.c b/mm/huge_memory.c
> index 9832ee910d5e..3964258ff91d 100644
> --- a/mm/huge_memory.c
> +++ b/mm/huge_memory.c
> @@ -1879,6 +1879,82 @@ bool touch_pmd(struct vm_area_struct *vma, unsigned 
> long addr,
>       return false;
>  }
>  
> +static int copy_present_huge_pmd(
> +             struct mm_struct *dst_mm, struct mm_struct *src_mm,
> +             pmd_t *dst_pmd, pmd_t *src_pmd, unsigned long addr,
> +             struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma,
> +             pmd_t pmd, pgtable_t pgtable, bool *need_split)
> +{
> +     struct folio *src_folio;
> +     bool wrprotect = true;
> +
> +     src_folio = vm_normal_folio_pmd(src_vma, addr, pmd);
> +     if (!src_folio) {
> +             /*
> +              * When page table lock is held, the huge zero pmd should not be
> +              * under splitting since we don't split the page itself, only 
> pmd to
> +              * a page table.
> +              */
> +             if (is_huge_zero_pmd(pmd)) {
> +                     /*
> +                      * mm_get_huge_zero_folio() will never allocate a new
> +                      * folio here, since we already have a zero page to
> +                      * copy. It just takes a reference.
> +                      */
> +                     mm_get_huge_zero_folio(dst_mm);
> +                     goto set_pmd;
> +             }
> +
> +             /*
> +              * Making sure it's not a CoW VMA with writable
> +              * mapping, otherwise it means either the anon page wrongly
> +              * applied special bit, or we made the PRIVATE mapping be
> +              * able to wrongly write to the backend MMIO.
> +              */
> +             VM_WARN_ON_ONCE(is_cow_mapping(src_vma->vm_flags) && 
> pmd_write(pmd));
> +             pte_free(dst_mm, pgtable);
> +             pgtable = NULL;
> +             wrprotect = false;
> +             goto set_pmd;
> +     }
> +
> +     /* File THPs are copied lazily by refaulting. */
> +     if (!folio_test_anon(src_folio)) {
> +             pte_free(dst_mm, pgtable);
> +             return 0;
> +     }


You removed !vma_is_anonymous to condense it into !folio_test_anon.
For private-file mappings that is not true, but okay since PMD mapping
is not supported.

> +
> +     folio_get(src_folio);
> +     if (unlikely(folio_try_dup_anon_rmap_pmd(src_folio,
> +                                                     &src_folio->page,
> +                                                     dst_vma, src_vma))) {
> +             /* Page maybe pinned: split and retry the fault on PTEs. */
> +             folio_put(src_folio);
> +             pte_free(dst_mm, pgtable);
> +             *need_split = true;
> +             return -EAGAIN;
> +     }
> +     add_mm_counter(dst_mm, MM_ANONPAGES, HPAGE_PMD_NR);
> +
> +set_pmd:
> +     if (pgtable) {
> +             mm_inc_nr_ptes(dst_mm);
> +             pgtable_trans_huge_deposit(dst_mm, dst_pmd, pgtable);
> +     }
> +
> +     if (wrprotect) {
> +             pmdp_set_wrprotect(src_mm, addr, src_pmd);
> +             if (!userfaultfd_wp(dst_vma))
> +                     pmd = pmd_clear_uffd_wp(pmd);
> +             pmd = pmd_wrprotect(pmd);
> +     }
> +
> +     pmd = pmd_mkold(pmd);
> +     set_pmd_at(dst_mm, addr, dst_pmd, pmd);
> +
> +     return 0;
> +}
> +
>  static void copy_huge_non_present_pmd(
>               struct mm_struct *dst_mm, struct mm_struct *src_mm,
>               pmd_t *dst_pmd, pmd_t *src_pmd, unsigned long addr,
> @@ -1940,104 +2016,43 @@ int copy_huge_pmd(struct mm_struct *dst_mm, struct 
> mm_struct *src_mm,
>                 struct vm_area_struct *dst_vma, struct vm_area_struct 
> *src_vma)
>  {
>       spinlock_t *dst_ptl, *src_ptl;
> -     struct page *src_page;
> -     struct folio *src_folio;
> -     pmd_t pmd;
>       pgtable_t pgtable = NULL;
> -     int ret = -ENOMEM;
> -
> -     pmd = pmdp_get_lockless(src_pmd);
> -     if (unlikely(pmd_present(pmd) && pmd_special(pmd) &&
> -                  !is_huge_zero_pmd(pmd))) {
> -             dst_ptl = pmd_lock(dst_mm, dst_pmd);
> -             src_ptl = pmd_lockptr(src_mm, src_pmd);
> -             spin_lock_nested(src_ptl, SINGLE_DEPTH_NESTING);
> -             /*
> -              * No need to recheck the pmd, it can't change with write
> -              * mmap lock held here.
> -              *
> -              * Meanwhile, making sure it's not a CoW VMA with writable
> -              * mapping, otherwise it means either the anon page wrongly
> -              * applied special bit, or we made the PRIVATE mapping be
> -              * able to wrongly write to the backend MMIO.
> -              */
> -             VM_WARN_ON_ONCE(is_cow_mapping(src_vma->vm_flags) && 
> pmd_write(pmd));
> -             goto set_pmd;
> -     }
> -
> -     /* Skip if can be re-fill on fault */
> -     if (!vma_is_anonymous(dst_vma))
> -             return 0;
> +     bool need_split = false;
> +     int ret = 0;
> +     pmd_t pmd;
>  
>       pgtable = pte_alloc_one(dst_mm);
>       if (unlikely(!pgtable))
> -             goto out;
> +             return -ENOMEM;
>  
>       dst_ptl = pmd_lock(dst_mm, dst_pmd);
>       src_ptl = pmd_lockptr(src_mm, src_pmd);
>       spin_lock_nested(src_ptl, SINGLE_DEPTH_NESTING);
>  
> -     ret = -EAGAIN;
>       pmd = *src_pmd;
>  
> -     if (unlikely(thp_migration_supported() &&
> -                  pmd_is_valid_softleaf(pmd))) {
> -             copy_huge_non_present_pmd(dst_mm, src_mm, dst_pmd, src_pmd, 
> addr,
> +     if (likely(pmd_present(pmd))) {
> +             ret = copy_present_huge_pmd(dst_mm, src_mm, dst_pmd, src_pmd, 
> addr,
> +                                       dst_vma, src_vma, pmd, pgtable, 
> &need_split);
> +     } else if (unlikely(thp_migration_supported() && 
> pmd_is_valid_softleaf(pmd))) {
> +             if (unlikely(!vma_is_anonymous(dst_vma)))
> +                     pte_free(dst_mm, pgtable);
> +             else
> +                     copy_huge_non_present_pmd(dst_mm, src_mm, dst_pmd, 
> src_pmd, addr,
>                                         dst_vma, src_vma, pmd, pgtable);
> -             ret = 0;
> -             goto out_unlock;
> -     }
> -
> -     if (unlikely(!pmd_trans_huge(pmd))) {
> +     } else {
> +             VM_WARN_ONCE(1, "unexpected non-present PMD %llx\n",
> +                             (unsigned long long)pmd_val(pmd));
>               pte_free(dst_mm, pgtable);
> -             goto out_unlock;
> -     }
> -     /*
> -      * When page table lock is held, the huge zero pmd should not be
> -      * under splitting since we don't split the page itself, only pmd to
> -      * a page table.
> -      */
> -     if (is_huge_zero_pmd(pmd)) {
> -             /*
> -              * mm_get_huge_zero_folio() will never allocate a new
> -              * folio here, since we already have a zero page to
> -              * copy. It just takes a reference.
> -              */
> -             mm_get_huge_zero_folio(dst_mm);
> -             goto out_zero_page;
> +             ret = -EAGAIN;
>       }
>  
> -     src_page = pmd_page(pmd);
> -     VM_BUG_ON_PAGE(!PageHead(src_page), src_page);
> -     src_folio = page_folio(src_page);
> +     spin_unlock(src_ptl);
> +     spin_unlock(dst_ptl);
>  
> -     folio_get(src_folio);
> -     if (unlikely(folio_try_dup_anon_rmap_pmd(src_folio, src_page, dst_vma, 
> src_vma))) {
> -             /* Page maybe pinned: split and retry the fault on PTEs. */
> -             folio_put(src_folio);
> -             pte_free(dst_mm, pgtable);
> -             spin_unlock(src_ptl);
> -             spin_unlock(dst_ptl);
> +     if (unlikely(need_split))
>               __split_huge_pmd(src_vma, src_pmd, addr, false);
> -             return -EAGAIN;
> -     }
> -     add_mm_counter(dst_mm, MM_ANONPAGES, HPAGE_PMD_NR);
> -out_zero_page:
> -     mm_inc_nr_ptes(dst_mm);
> -     pgtable_trans_huge_deposit(dst_mm, dst_pmd, pgtable);
> -     pmdp_set_wrprotect(src_mm, addr, src_pmd);
> -     if (!userfaultfd_wp(dst_vma))
> -             pmd = pmd_clear_uffd_wp(pmd);
> -     pmd = pmd_wrprotect(pmd);
> -set_pmd:
> -     pmd = pmd_mkold(pmd);
> -     set_pmd_at(dst_mm, addr, dst_pmd, pmd);
>  
> -     ret = 0;
> -out_unlock:
> -     spin_unlock(src_ptl);
> -     spin_unlock(dst_ptl);
> -out:
>       return ret;
>  }
>  


Reply via email to