On 16 Sep 2025, at 8:21, Balbir Singh wrote:

> Add support for splitting device-private THP folios, enabling fallback
> to smaller page sizes when large page allocation or migration fails.
>
> Key changes:
> - split_huge_pmd(): Handle device-private PMD entries during splitting
> - Preserve RMAP_EXCLUSIVE semantics for anonymous exclusive folios
> - Skip RMP_USE_SHARED_ZEROPAGE for device-private entries as they
>   don't support shared zero page semantics
>
> Signed-off-by: Balbir Singh <balb...@nvidia.com>
> Cc: David Hildenbrand <da...@redhat.com>
> Cc: Zi Yan <z...@nvidia.com>
> Cc: Joshua Hahn <joshua.hah...@gmail.com>
> Cc: Rakie Kim <rakie....@sk.com>
> Cc: Byungchul Park <byungc...@sk.com>
> Cc: Gregory Price <gou...@gourry.net>
> Cc: Ying Huang <ying.hu...@linux.alibaba.com>
> Cc: Alistair Popple <apop...@nvidia.com>
> Cc: Oscar Salvador <osalva...@suse.de>
> Cc: Lorenzo Stoakes <lorenzo.stoa...@oracle.com>
> Cc: Baolin Wang <baolin.w...@linux.alibaba.com>
> Cc: "Liam R. Howlett" <liam.howl...@oracle.com>
> Cc: Nico Pache <npa...@redhat.com>
> Cc: Ryan Roberts <ryan.robe...@arm.com>
> Cc: Dev Jain <dev.j...@arm.com>
> Cc: Barry Song <bao...@kernel.org>
> Cc: Lyude Paul <ly...@redhat.com>
> Cc: Danilo Krummrich <d...@kernel.org>
> Cc: David Airlie <airl...@gmail.com>
> Cc: Simona Vetter <sim...@ffwll.ch>
> Cc: Ralph Campbell <rcampb...@nvidia.com>
> Cc: Mika Penttilä <mpent...@redhat.com>
> Cc: Matthew Brost <matthew.br...@intel.com>
> Cc: Francois Dugast <francois.dug...@intel.com>
> ---
>  mm/huge_memory.c | 138 +++++++++++++++++++++++++++++++++--------------
>  1 file changed, 98 insertions(+), 40 deletions(-)
>
> diff --git a/mm/huge_memory.c b/mm/huge_memory.c
> index 78166db72f4d..5291ee155a02 100644
> --- a/mm/huge_memory.c
> +++ b/mm/huge_memory.c
> @@ -2872,16 +2872,18 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd,
>  	struct page *page;
>  	pgtable_t pgtable;
>  	pmd_t old_pmd, _pmd;
> -	bool young, write, soft_dirty, pmd_migration = false, uffd_wp = false;
> -	bool anon_exclusive = false, dirty = false;
> +	bool soft_dirty, uffd_wp = false, young = false, write = false;
> +	bool anon_exclusive = false, dirty = false, present = false;
>  	unsigned long addr;
>  	pte_t *pte;
>  	int i;
> +	swp_entry_t swp_entry;
>
>  	VM_BUG_ON(haddr & ~HPAGE_PMD_MASK);
>  	VM_BUG_ON_VMA(vma->vm_start > haddr, vma);
>  	VM_BUG_ON_VMA(vma->vm_end < haddr + HPAGE_PMD_SIZE, vma);
> -	VM_BUG_ON(!is_pmd_migration_entry(*pmd) && !pmd_trans_huge(*pmd));
> +
> +	VM_WARN_ON(!is_pmd_non_present_folio_entry(*pmd) && !pmd_trans_huge(*pmd));
>
>  	count_vm_event(THP_SPLIT_PMD);
>
> @@ -2929,20 +2931,47 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd,
>  		return __split_huge_zero_page_pmd(vma, haddr, pmd);
>  	}
>
> -	pmd_migration = is_pmd_migration_entry(*pmd);
> -	if (unlikely(pmd_migration)) {
> -		swp_entry_t entry;
>
> +	present = pmd_present(*pmd);
> +	if (is_pmd_migration_entry(*pmd)) {
>  		old_pmd = *pmd;
> -		entry = pmd_to_swp_entry(old_pmd);
> -		page = pfn_swap_entry_to_page(entry);
> -		write = is_writable_migration_entry(entry);
> +		swp_entry = pmd_to_swp_entry(old_pmd);
> +		page = pfn_swap_entry_to_page(swp_entry);
> +		folio = page_folio(page);
> +
> +		soft_dirty = pmd_swp_soft_dirty(old_pmd);
> +		uffd_wp = pmd_swp_uffd_wp(old_pmd);
> +
> +		write = is_writable_migration_entry(swp_entry);
>  		if (PageAnon(page))
> -			anon_exclusive = is_readable_exclusive_migration_entry(entry);
> -		young = is_migration_entry_young(entry);
> -		dirty = is_migration_entry_dirty(entry);
> +			anon_exclusive = is_readable_exclusive_migration_entry(swp_entry);
> +		young = is_migration_entry_young(swp_entry);
> +		dirty = is_migration_entry_dirty(swp_entry);
> +	} else if (is_pmd_device_private_entry(*pmd)) {
> +		old_pmd = *pmd;
> +		swp_entry = pmd_to_swp_entry(old_pmd);
> +		page = pfn_swap_entry_to_page(swp_entry);
> +		folio = page_folio(page);
> +
>  		soft_dirty = pmd_swp_soft_dirty(old_pmd);
>  		uffd_wp = pmd_swp_uffd_wp(old_pmd);
> +
> +		write = is_writable_device_private_entry(swp_entry);
> +		anon_exclusive = PageAnonExclusive(page);
> +
> +		if (freeze && anon_exclusive &&
> +		    folio_try_share_anon_rmap_pmd(folio, page))
> +			freeze = false;

Why is it OK to change the freeze request? OK, it is replicating the
code for present PMD folios. Either add a comment pointing to the
explanation in the comment below, or move the
"if (is_pmd_device_private_entry(*pmd))" branch into the else below to
deduplicate this code.
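For the first option, something along these lines at the device-private
check would be enough; the comment wording below is only a sketch of the
intent, not the exact text:

	/*
	 * Same reasoning as in the pmd_present() branch below: if
	 * PageAnonExclusive() cannot be cleared, split the PMD without
	 * freezing and let try_to_migrate_one() fail later.
	 */
	if (freeze && anon_exclusive &&
	    folio_try_share_anon_rmap_pmd(folio, page))
		freeze = false;

That way the next reader does not have to rediscover why dropping the
freeze request is safe here.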
> +		if (!freeze) {
> +			rmap_t rmap_flags = RMAP_NONE;
> +
> +			folio_ref_add(folio, HPAGE_PMD_NR - 1);
> +			if (anon_exclusive)
> +				rmap_flags |= RMAP_EXCLUSIVE;
> +
> +			folio_add_anon_rmap_ptes(folio, page, HPAGE_PMD_NR,
> +						 vma, haddr, rmap_flags);
> +		}
>  	} else {
>  		/*
>  		 * Up to this point the pmd is present and huge and userland has
> @@ -3026,32 +3055,57 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd,
>  	 * Note that NUMA hinting access restrictions are not transferred to
>  	 * avoid any possibility of altering permissions across VMAs.
>  	 */
> -	if (freeze || pmd_migration) {
> -		for (i = 0, addr = haddr; i < HPAGE_PMD_NR; i++, addr += PAGE_SIZE) {
> -			pte_t entry;
> -			swp_entry_t swp_entry;
> -
> -			if (write)
> -				swp_entry = make_writable_migration_entry(
> -							page_to_pfn(page + i));
> -			else if (anon_exclusive)
> -				swp_entry = make_readable_exclusive_migration_entry(
> -							page_to_pfn(page + i));
> -			else
> -				swp_entry = make_readable_migration_entry(
> -							page_to_pfn(page + i));
> -			if (young)
> -				swp_entry = make_migration_entry_young(swp_entry);
> -			if (dirty)
> -				swp_entry = make_migration_entry_dirty(swp_entry);
> -			entry = swp_entry_to_pte(swp_entry);
> -			if (soft_dirty)
> -				entry = pte_swp_mksoft_dirty(entry);
> -			if (uffd_wp)
> -				entry = pte_swp_mkuffd_wp(entry);
> +	if (freeze || !present) {
> +		pte_t entry;
> -			VM_WARN_ON(!pte_none(ptep_get(pte + i)));
> -			set_pte_at(mm, addr, pte + i, entry);
> +		if (freeze || is_migration_entry(swp_entry)) {
<snip>
> +		} else {
<snip>
>  		}
>  	} else {
>  		pte_t entry;

David already pointed this out in v5. It can be done such as:

	if (freeze || pmd_migration) {
		...
	} else if (is_pmd_device_private_entry(old_pmd)) {
		...
	} else {
		/* for present, non freeze case */
	}

> @@ -3076,7 +3130,7 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd,
>  	}
>  	pte_unmap(pte);
> -	if (!pmd_migration)
> +	if (!is_pmd_migration_entry(*pmd))
>  		folio_remove_rmap_pmd(folio, page, vma);
>  	if (freeze)
>  		put_page(page);
> @@ -3089,7 +3143,7 @@ void split_huge_pmd_locked(struct vm_area_struct *vma, unsigned long address,
>  			   pmd_t *pmd, bool freeze)
>  {
>  	VM_WARN_ON_ONCE(!IS_ALIGNED(address, HPAGE_PMD_SIZE));
> -	if (pmd_trans_huge(*pmd) || is_pmd_migration_entry(*pmd))
> +	if (pmd_trans_huge(*pmd) || is_pmd_non_present_folio_entry(*pmd))
>  		__split_huge_pmd_locked(vma, pmd, address, freeze);
>  }
>
> @@ -3268,6 +3322,9 @@ static void lru_add_split_folio(struct folio *folio, struct folio *new_folio,
>  	VM_BUG_ON_FOLIO(folio_test_lru(new_folio), folio);
>  	lockdep_assert_held(&lruvec->lru_lock);
> +	if (folio_is_device_private(folio))
> +		return;
> +
>  	if (list) {
>  		/* page reclaim is reclaiming a huge page */
>  		VM_WARN_ON(folio_test_lru(folio));
> @@ -3885,8 +3942,9 @@ static int __folio_split(struct folio *folio, unsigned int new_order,
>  	if (nr_shmem_dropped)
>  		shmem_uncharge(mapping->host, nr_shmem_dropped);
> -	if (!ret && is_anon)
> +	if (!ret && is_anon && !folio_is_device_private(folio))
>  		remap_flags = RMP_USE_SHARED_ZEROPAGE;
> +

You should remove this and add

	if (folio_is_device_private(folio))
		return false;

in try_to_map_unused_to_zeropage().
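Something along these lines is what I have in mind; this is only a
sketch, assuming try_to_map_unused_to_zeropage() still lives in
mm/migrate.c, and the context lines here are approximate:

--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ __folio_split @@
-	if (!ret && is_anon && !folio_is_device_private(folio))
+	if (!ret && is_anon)
 		remap_flags = RMP_USE_SHARED_ZEROPAGE;

--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ try_to_map_unused_to_zeropage @@
+	/* device-private folios cannot be remapped to the shared zeropage */
+	if (folio_is_device_private(folio))
+		return false;
+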
Otherwise, no one would know device private folios need to be excluded
from mapping unused to zero page.

>  	remap_page(folio, 1 << order, remap_flags);
>  	/*
> --
> 2.50.1

Best Regards,
Yan, Zi