Rework __split_huge_pmd_locked() to classify huge PMDs by the PMD entry itself instead of starting from vma_is_anonymous().
Present PMDs are classified with vm_normal_folio_pmd(): file/shmem THPs are dropped and refaulted later, anonymous THPs are split into PTEs, and PMDs without a normal folio are handled as huge zero or special PMDs. Non-present PMDs are classified with pmd_to_softleaf_folio(): file/shmem migration entries are dropped, while anonymous migration/device-private entries are split into PTEs. This also makes the anonymous decision folio-based: a private file mapping that has CoW'ed to an anonymous THP now follows the anonymous path even though the VMA is file-backed. No other behavioural change is intended. Signed-off-by: Yin Tirui <[email protected]> --- mm/huge_memory.c | 197 +++++++++++++++++++++++++++-------------------- 1 file changed, 114 insertions(+), 83 deletions(-) diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 3964258ff91d..8cd77389d52f 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -3136,25 +3136,38 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd, count_vm_event(THP_SPLIT_PMD); - if (!vma_is_anonymous(vma)) { - old_pmd = pmdp_huge_clear_flush(vma, haddr, pmd); - /* - * We are going to unmap this huge page. So - * just go ahead and zap it - */ - if (arch_needs_pgtable_deposit()) - zap_deposited_table(mm, pmd); - if (vma_is_special_huge(vma)) - return; - if (unlikely(pmd_is_migration_entry(old_pmd))) { - const softleaf_t old_entry = softleaf_from_pmd(old_pmd); + if (pmd_present(*pmd)) { + folio = vm_normal_folio_pmd(vma, haddr, *pmd); + + if (unlikely(!folio)) { + if (is_huge_zero_pmd(*pmd)) { + /* + * FIXME: Do we want to invalidate secondary mmu by calling + * mmu_notifier_arch_invalidate_secondary_tlbs() see comments below + * inside __split_huge_pmd() ? + * + * We are going from a zero huge page write protected to zero + * small page also write protected so it does not seems useful + * to invalidate secondary mmu at this time. 
+ */ + return __split_huge_zero_page_pmd(vma, haddr, pmd); + } - folio = softleaf_to_folio(old_entry); - } else if (is_huge_zero_pmd(old_pmd)) { + /* Present but not a normal folio: drop the PMD. */ + old_pmd = pmdp_huge_clear_flush(vma, haddr, pmd); + if (arch_needs_pgtable_deposit()) + zap_deposited_table(mm, pmd); return; - } else { + } + + if (unlikely(!folio_test_anon(folio))) { + old_pmd = pmdp_huge_clear_flush(vma, haddr, pmd); + if (arch_needs_pgtable_deposit()) + zap_deposited_table(mm, pmd); + if (vma_is_special_huge(vma)) + return; + page = pmd_page(old_pmd); - folio = page_folio(page); if (!folio_test_dirty(folio) && pmd_dirty(old_pmd)) folio_mark_dirty(folio); if (!folio_test_referenced(folio) && pmd_young(old_pmd)) @@ -3164,72 +3177,7 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd, folio_put(folio); return; } - add_mm_counter(mm, mm_counter_file(folio), -HPAGE_PMD_NR); - return; - } - - if (is_huge_zero_pmd(*pmd)) { - /* - * FIXME: Do we want to invalidate secondary mmu by calling - * mmu_notifier_arch_invalidate_secondary_tlbs() see comments below - * inside __split_huge_pmd() ? - * - * We are going from a zero huge page write protected to zero - * small page also write protected so it does not seems useful - * to invalidate secondary mmu at this time. 
- */ - return __split_huge_zero_page_pmd(vma, haddr, pmd); - } - - if (pmd_is_migration_entry(*pmd)) { - softleaf_t entry; - - old_pmd = *pmd; - entry = softleaf_from_pmd(old_pmd); - page = softleaf_to_page(entry); - folio = page_folio(page); - - soft_dirty = pmd_swp_soft_dirty(old_pmd); - uffd_wp = pmd_swp_uffd_wp(old_pmd); - - write = softleaf_is_migration_write(entry); - if (PageAnon(page)) - anon_exclusive = softleaf_is_migration_read_exclusive(entry); - young = softleaf_is_migration_young(entry); - dirty = softleaf_is_migration_dirty(entry); - } else if (pmd_is_device_private_entry(*pmd)) { - softleaf_t entry; - - old_pmd = *pmd; - entry = softleaf_from_pmd(old_pmd); - page = softleaf_to_page(entry); - folio = page_folio(page); - - soft_dirty = pmd_swp_soft_dirty(old_pmd); - uffd_wp = pmd_swp_uffd_wp(old_pmd); - - write = softleaf_is_device_private_write(entry); - anon_exclusive = PageAnonExclusive(page); - - /* - * Device private THP should be treated the same as regular - * folios w.r.t anon exclusive handling. See the comments for - * folio handling and anon_exclusive below. 
- */ - if (freeze && anon_exclusive && - folio_try_share_anon_rmap_pmd(folio, page)) - freeze = false; - if (!freeze) { - rmap_t rmap_flags = RMAP_NONE; - - folio_ref_add(folio, HPAGE_PMD_NR - 1); - if (anon_exclusive) - rmap_flags |= RMAP_EXCLUSIVE; - folio_add_anon_rmap_ptes(folio, page, HPAGE_PMD_NR, - vma, haddr, rmap_flags); - } - } else { /* * Up to this point the pmd is present and huge and userland has * the whole access to the hugepage during the split (which @@ -3255,7 +3203,6 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd, */ old_pmd = pmdp_invalidate(vma, haddr, pmd); page = pmd_page(old_pmd); - folio = page_folio(page); if (pmd_dirty(old_pmd)) { dirty = true; folio_set_dirty(folio); @@ -3266,7 +3213,6 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd, uffd_wp = pmd_uffd_wp(old_pmd); VM_WARN_ON_FOLIO(!folio_ref_count(folio), folio); - VM_WARN_ON_FOLIO(!folio_test_anon(folio), folio); /* * Without "freeze", we'll simply split the PMD, propagating the @@ -3296,6 +3242,85 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd, folio_add_anon_rmap_ptes(folio, page, HPAGE_PMD_NR, vma, haddr, rmap_flags); } + } else { + /* + * Non-present PMD: a softleaf-encoded migration or + * device-private entry. pmd_to_softleaf_folio() warns and + * returns NULL for any other encoding. + */ + folio = pmd_to_softleaf_folio(*pmd); + if (unlikely(!folio)) + return; + + if (unlikely(!folio_test_anon(folio))) { + /* + * File/shmem migration entry: drop the PMD without + * splitting. Unlike the present case the entry holds + * neither a folio reference nor an rmap to release, + * so just adjust the RSS counter. 
+ */ + pmdp_huge_clear_flush(vma, haddr, pmd); + if (arch_needs_pgtable_deposit()) + zap_deposited_table(mm, pmd); + if (unlikely(vma_is_special_huge(vma))) { + VM_WARN_ONCE(1, + "unexpected special huge PMD migration entry\n"); + return; + } + add_mm_counter(mm, mm_counter_file(folio), -HPAGE_PMD_NR); + return; + } + + if (pmd_is_migration_entry(*pmd)) { + softleaf_t entry; + + old_pmd = *pmd; + entry = softleaf_from_pmd(old_pmd); + page = softleaf_to_page(entry); + + soft_dirty = pmd_swp_soft_dirty(old_pmd); + uffd_wp = pmd_swp_uffd_wp(old_pmd); + + write = softleaf_is_migration_write(entry); + if (PageAnon(page)) + anon_exclusive = softleaf_is_migration_read_exclusive(entry); + young = softleaf_is_migration_young(entry); + dirty = softleaf_is_migration_dirty(entry); + } else if (pmd_is_device_private_entry(*pmd)) { + softleaf_t entry; + + old_pmd = *pmd; + entry = softleaf_from_pmd(old_pmd); + page = softleaf_to_page(entry); + + soft_dirty = pmd_swp_soft_dirty(old_pmd); + uffd_wp = pmd_swp_uffd_wp(old_pmd); + + write = softleaf_is_device_private_write(entry); + anon_exclusive = PageAnonExclusive(page); + + /* + * Device-private THP should be treated the same as + * regular folios w.r.t. anon-exclusive handling. See + * the matching code for present anon folios above. + */ + if (freeze && anon_exclusive && + folio_try_share_anon_rmap_pmd(folio, page)) + freeze = false; + if (!freeze) { + rmap_t rmap_flags = RMAP_NONE; + + folio_ref_add(folio, HPAGE_PMD_NR - 1); + if (anon_exclusive) + rmap_flags |= RMAP_EXCLUSIVE; + + folio_add_anon_rmap_ptes(folio, page, HPAGE_PMD_NR, + vma, haddr, rmap_flags); + } + } else { + VM_WARN_ON_ONCE(1); + return; + } } /* -- 2.43.0
