On 26/05/26 8:20 pm, Yin Tirui wrote:
> Classify the source PMD via pmd_present() and vm_normal_folio_pmd(),
> matching the way the PTE path uses pte_present() and vm_normal_page().
> This moves the present-PMD decision from VMA identity checks to the
> actual PMD/folio state.
>
> Drop the defensive "if (!pmd_trans_huge(pmd)) goto out_unlock" branch:
> with mmap_write_lock held during fork, it should not occur.
Split this from this patch? This is a functional change so will be better
to review it separately.
>
> Extract the present-PMD side of copy_huge_pmd() into
> copy_present_huge_pmd(). The helper owns the child pgtable passed by the
> caller: it either deposits the pgtable when installing a copied PMD, or
> frees it on paths that do not install one.
>
> The child pgtable is now allocated once up front and freed on every skip
> path. This makes file/shmem and PFNMAP/special skip paths take the PMD
> locks and free the preallocated pgtable before returning. These are not
> expected to be hot paths, and the PFNMAP case is reused by the follow-up
> PMD PFNMAP copy support.
>
> Signed-off-by: Yin Tirui <[email protected]>
> ---
Since the series is still marked RFC, I think it would be better to send
the refactor patches separately?
> mm/huge_memory.c | 175 +++++++++++++++++++++++++----------------------
> 1 file changed, 95 insertions(+), 80 deletions(-)
>
> diff --git a/mm/huge_memory.c b/mm/huge_memory.c
> index 9832ee910d5e..3964258ff91d 100644
> --- a/mm/huge_memory.c
> +++ b/mm/huge_memory.c
> @@ -1879,6 +1879,82 @@ bool touch_pmd(struct vm_area_struct *vma, unsigned
> long addr,
> return false;
> }
>
> +static int copy_present_huge_pmd(
> + struct mm_struct *dst_mm, struct mm_struct *src_mm,
> + pmd_t *dst_pmd, pmd_t *src_pmd, unsigned long addr,
> + struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma,
> + pmd_t pmd, pgtable_t pgtable, bool *need_split)
> +{
> + struct folio *src_folio;
> + bool wrprotect = true;
> +
> + src_folio = vm_normal_folio_pmd(src_vma, addr, pmd);
> + if (!src_folio) {
> + /*
> + * When page table lock is held, the huge zero pmd should not be
> + * under splitting since we don't split the page itself, only
> pmd to
> + * a page table.
> + */
> + if (is_huge_zero_pmd(pmd)) {
> + /*
> + * mm_get_huge_zero_folio() will never allocate a new
> + * folio here, since we already have a zero page to
> + * copy. It just takes a reference.
> + */
> + mm_get_huge_zero_folio(dst_mm);
> + goto set_pmd;
> + }
> +
> + /*
> + * Making sure it's not a CoW VMA with writable
> + * mapping, otherwise it means either the anon page wrongly
> + * applied special bit, or we made the PRIVATE mapping be
> + * able to wrongly write to the backend MMIO.
> + */
> + VM_WARN_ON_ONCE(is_cow_mapping(src_vma->vm_flags) &&
> pmd_write(pmd));
> + pte_free(dst_mm, pgtable);
> + pgtable = NULL;
> + wrprotect = false;
> + goto set_pmd;
> + }
> +
> + /* File THPs are copied lazily by refaulting. */
> + if (!folio_test_anon(src_folio)) {
> + pte_free(dst_mm, pgtable);
> + return 0;
> + }
You removed !vma_is_anonymous to condense it into !folio_test_anon.
For private-file mappings that is not true, but okay since PMD mapping
is not supported.
> +
> + folio_get(src_folio);
> + if (unlikely(folio_try_dup_anon_rmap_pmd(src_folio,
> + &src_folio->page,
> + dst_vma, src_vma))) {
> + /* Page maybe pinned: split and retry the fault on PTEs. */
> + folio_put(src_folio);
> + pte_free(dst_mm, pgtable);
> + *need_split = true;
> + return -EAGAIN;
> + }
> + add_mm_counter(dst_mm, MM_ANONPAGES, HPAGE_PMD_NR);
> +
> +set_pmd:
> + if (pgtable) {
> + mm_inc_nr_ptes(dst_mm);
> + pgtable_trans_huge_deposit(dst_mm, dst_pmd, pgtable);
> + }
> +
> + if (wrprotect) {
> + pmdp_set_wrprotect(src_mm, addr, src_pmd);
> + if (!userfaultfd_wp(dst_vma))
> + pmd = pmd_clear_uffd_wp(pmd);
> + pmd = pmd_wrprotect(pmd);
> + }
> +
> + pmd = pmd_mkold(pmd);
> + set_pmd_at(dst_mm, addr, dst_pmd, pmd);
> +
> + return 0;
> +}
> +
> static void copy_huge_non_present_pmd(
> struct mm_struct *dst_mm, struct mm_struct *src_mm,
> pmd_t *dst_pmd, pmd_t *src_pmd, unsigned long addr,
> @@ -1940,104 +2016,43 @@ int copy_huge_pmd(struct mm_struct *dst_mm, struct
> mm_struct *src_mm,
> struct vm_area_struct *dst_vma, struct vm_area_struct
> *src_vma)
> {
> spinlock_t *dst_ptl, *src_ptl;
> - struct page *src_page;
> - struct folio *src_folio;
> - pmd_t pmd;
> pgtable_t pgtable = NULL;
> - int ret = -ENOMEM;
> -
> - pmd = pmdp_get_lockless(src_pmd);
> - if (unlikely(pmd_present(pmd) && pmd_special(pmd) &&
> - !is_huge_zero_pmd(pmd))) {
> - dst_ptl = pmd_lock(dst_mm, dst_pmd);
> - src_ptl = pmd_lockptr(src_mm, src_pmd);
> - spin_lock_nested(src_ptl, SINGLE_DEPTH_NESTING);
> - /*
> - * No need to recheck the pmd, it can't change with write
> - * mmap lock held here.
> - *
> - * Meanwhile, making sure it's not a CoW VMA with writable
> - * mapping, otherwise it means either the anon page wrongly
> - * applied special bit, or we made the PRIVATE mapping be
> - * able to wrongly write to the backend MMIO.
> - */
> - VM_WARN_ON_ONCE(is_cow_mapping(src_vma->vm_flags) &&
> pmd_write(pmd));
> - goto set_pmd;
> - }
> -
> - /* Skip if can be re-fill on fault */
> - if (!vma_is_anonymous(dst_vma))
> - return 0;
> + bool need_split = false;
> + int ret = 0;
> + pmd_t pmd;
>
> pgtable = pte_alloc_one(dst_mm);
> if (unlikely(!pgtable))
> - goto out;
> + return -ENOMEM;
>
> dst_ptl = pmd_lock(dst_mm, dst_pmd);
> src_ptl = pmd_lockptr(src_mm, src_pmd);
> spin_lock_nested(src_ptl, SINGLE_DEPTH_NESTING);
>
> - ret = -EAGAIN;
> pmd = *src_pmd;
>
> - if (unlikely(thp_migration_supported() &&
> - pmd_is_valid_softleaf(pmd))) {
> - copy_huge_non_present_pmd(dst_mm, src_mm, dst_pmd, src_pmd,
> addr,
> + if (likely(pmd_present(pmd))) {
> + ret = copy_present_huge_pmd(dst_mm, src_mm, dst_pmd, src_pmd,
> addr,
> + dst_vma, src_vma, pmd, pgtable,
> &need_split);
> + } else if (unlikely(thp_migration_supported() &&
> pmd_is_valid_softleaf(pmd))) {
> + if (unlikely(!vma_is_anonymous(dst_vma)))
> + pte_free(dst_mm, pgtable);
> + else
> + copy_huge_non_present_pmd(dst_mm, src_mm, dst_pmd,
> src_pmd, addr,
> dst_vma, src_vma, pmd, pgtable);
> - ret = 0;
> - goto out_unlock;
> - }
> -
> - if (unlikely(!pmd_trans_huge(pmd))) {
> + } else {
> + VM_WARN_ONCE(1, "unexpected non-present PMD %llx\n",
> + (unsigned long long)pmd_val(pmd));
> pte_free(dst_mm, pgtable);
> - goto out_unlock;
> - }
> - /*
> - * When page table lock is held, the huge zero pmd should not be
> - * under splitting since we don't split the page itself, only pmd to
> - * a page table.
> - */
> - if (is_huge_zero_pmd(pmd)) {
> - /*
> - * mm_get_huge_zero_folio() will never allocate a new
> - * folio here, since we already have a zero page to
> - * copy. It just takes a reference.
> - */
> - mm_get_huge_zero_folio(dst_mm);
> - goto out_zero_page;
> + ret = -EAGAIN;
> }
>
> - src_page = pmd_page(pmd);
> - VM_BUG_ON_PAGE(!PageHead(src_page), src_page);
> - src_folio = page_folio(src_page);
> + spin_unlock(src_ptl);
> + spin_unlock(dst_ptl);
>
> - folio_get(src_folio);
> - if (unlikely(folio_try_dup_anon_rmap_pmd(src_folio, src_page, dst_vma,
> src_vma))) {
> - /* Page maybe pinned: split and retry the fault on PTEs. */
> - folio_put(src_folio);
> - pte_free(dst_mm, pgtable);
> - spin_unlock(src_ptl);
> - spin_unlock(dst_ptl);
> + if (unlikely(need_split))
> __split_huge_pmd(src_vma, src_pmd, addr, false);
> - return -EAGAIN;
> - }
> - add_mm_counter(dst_mm, MM_ANONPAGES, HPAGE_PMD_NR);
> -out_zero_page:
> - mm_inc_nr_ptes(dst_mm);
> - pgtable_trans_huge_deposit(dst_mm, dst_pmd, pgtable);
> - pmdp_set_wrprotect(src_mm, addr, src_pmd);
> - if (!userfaultfd_wp(dst_vma))
> - pmd = pmd_clear_uffd_wp(pmd);
> - pmd = pmd_wrprotect(pmd);
> -set_pmd:
> - pmd = pmd_mkold(pmd);
> - set_pmd_at(dst_mm, addr, dst_pmd, pmd);
>
> - ret = 0;
> -out_unlock:
> - spin_unlock(src_ptl);
> - spin_unlock(dst_ptl);
> -out:
> return ret;
> }
>