On 16 Sep 2025, at 8:21, Balbir Singh wrote:

> Extend core huge page management functions to handle device-private THP
> entries. This enables proper handling of large device-private folios in
> fundamental MM operations.
>
> The following functions have been updated:
>
> - copy_huge_pmd(): Handle device-private entries during fork/clone
> - zap_huge_pmd(): Properly free device-private THP during munmap
> - change_huge_pmd(): Support protection changes on device-private THP
> - __pte_offset_map(): Add device-private entry awareness
>
> Signed-off-by: Matthew Brost <matthew.br...@intel.com>
> Signed-off-by: Balbir Singh <balb...@nvidia.com>
> Cc: David Hildenbrand <da...@redhat.com>
> Cc: Zi Yan <z...@nvidia.com>
> Cc: Joshua Hahn <joshua.hah...@gmail.com>
> Cc: Rakie Kim <rakie....@sk.com>
> Cc: Byungchul Park <byungc...@sk.com>
> Cc: Gregory Price <gou...@gourry.net>
> Cc: Ying Huang <ying.hu...@linux.alibaba.com>
> Cc: Alistair Popple <apop...@nvidia.com>
> Cc: Oscar Salvador <osalva...@suse.de>
> Cc: Lorenzo Stoakes <lorenzo.stoa...@oracle.com>
> Cc: Baolin Wang <baolin.w...@linux.alibaba.com>
> Cc: "Liam R. Howlett" <liam.howl...@oracle.com>
> Cc: Nico Pache <npa...@redhat.com>
> Cc: Ryan Roberts <ryan.robe...@arm.com>
> Cc: Dev Jain <dev.j...@arm.com>
> Cc: Barry Song <bao...@kernel.org>
> Cc: Lyude Paul <ly...@redhat.com>
> Cc: Danilo Krummrich <d...@kernel.org>
> Cc: David Airlie <airl...@gmail.com>
> Cc: Simona Vetter <sim...@ffwll.ch>
> Cc: Ralph Campbell <rcampb...@nvidia.com>
> Cc: Mika Penttilä <mpent...@redhat.com>
> Cc: Matthew Brost <matthew.br...@intel.com>
> Cc: Francois Dugast <francois.dug...@intel.com>
> ---
>  include/linux/swapops.h | 32 +++++++++++++++++++++++
>  mm/huge_memory.c        | 56 ++++++++++++++++++++++++++++++++++-------
>  mm/pgtable-generic.c    |  2 +-
>  3 files changed, 80 insertions(+), 10 deletions(-)
>
> diff --git a/include/linux/swapops.h b/include/linux/swapops.h
> index 64ea151a7ae3..2687928a8146 100644
> --- a/include/linux/swapops.h
> +++ b/include/linux/swapops.h
> @@ -594,10 +594,42 @@ static inline int is_pmd_migration_entry(pmd_t pmd)
>  }
>  #endif /* CONFIG_ARCH_ENABLE_THP_MIGRATION */
>
> +#if defined(CONFIG_ZONE_DEVICE) && defined(CONFIG_ARCH_ENABLE_THP_MIGRATION)
> +
> +/**
> + * is_pmd_device_private_entry() - Check if PMD contains a device private swap entry
> + * @pmd: The PMD to check
> + *
> + * Returns true if the PMD contains a swap entry that represents a device private
> + * page mapping. This is used for zone device private pages that have been
> + * swapped out but still need special handling during various memory management
> + * operations.
> + *
> + * Return: 1 if PMD contains device private entry, 0 otherwise
> + */
> +static inline int is_pmd_device_private_entry(pmd_t pmd)
> +{
> +	return is_swap_pmd(pmd) && is_device_private_entry(pmd_to_swp_entry(pmd));
> +}
> +
> +#else /* CONFIG_ZONE_DEVICE && CONFIG_ARCH_ENABLE_THP_MIGRATION */
> +
> +static inline int is_pmd_device_private_entry(pmd_t pmd)
> +{
> +	return 0;
> +}
> +
> +#endif /* CONFIG_ZONE_DEVICE && CONFIG_ARCH_ENABLE_THP_MIGRATION */
> +
>  static inline int non_swap_entry(swp_entry_t entry)
>  {
>  	return swp_type(entry) >= MAX_SWAPFILES;
>  }
>
> +static inline int is_pmd_non_present_folio_entry(pmd_t pmd)
> +{
> +	return is_pmd_migration_entry(pmd) || is_pmd_device_private_entry(pmd);
> +}
> +

non_present seems too vague. Maybe just open-code it.
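E.g., open-coded at the copy_huge_pmd() call site the check would read roughly
(just a sketch, not compile-tested):

	VM_WARN_ON(!is_pmd_migration_entry(pmd) &&
		   !is_pmd_device_private_entry(pmd));

and similarly with orig_pmd / *pmd at the other callers.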
>  #endif /* CONFIG_MMU */
>  #endif /* _LINUX_SWAPOPS_H */
> diff --git a/mm/huge_memory.c b/mm/huge_memory.c
> index 5acca24bbabb..a5e4c2aef191 100644
> --- a/mm/huge_memory.c
> +++ b/mm/huge_memory.c
> @@ -1703,17 +1703,45 @@ int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm,
>  	if (unlikely(is_swap_pmd(pmd))) {
>  		swp_entry_t entry = pmd_to_swp_entry(pmd);
>
> -		VM_BUG_ON(!is_pmd_migration_entry(pmd));
> -		if (!is_readable_migration_entry(entry)) {
> -			entry = make_readable_migration_entry(
> -							swp_offset(entry));
> +		VM_WARN_ON(!is_pmd_non_present_folio_entry(pmd));
> +
> +		if (is_writable_migration_entry(entry) ||
> +		    is_readable_exclusive_migration_entry(entry)) {
> +			entry = make_readable_migration_entry(swp_offset(entry));
>  			pmd = swp_entry_to_pmd(entry);
>  			if (pmd_swp_soft_dirty(*src_pmd))
>  				pmd = pmd_swp_mksoft_dirty(pmd);
>  			if (pmd_swp_uffd_wp(*src_pmd))
>  				pmd = pmd_swp_mkuffd_wp(pmd);
>  			set_pmd_at(src_mm, addr, src_pmd, pmd);
> +		} else if (is_device_private_entry(entry)) {
> +			/*
> +			 * For device private entries, since there are no
> +			 * read exclusive entries, writable = !readable
> +			 */
> +			if (is_writable_device_private_entry(entry)) {
> +				entry = make_readable_device_private_entry(swp_offset(entry));
> +				pmd = swp_entry_to_pmd(entry);
> +
> +				if (pmd_swp_soft_dirty(*src_pmd))
> +					pmd = pmd_swp_mksoft_dirty(pmd);
> +				if (pmd_swp_uffd_wp(*src_pmd))
> +					pmd = pmd_swp_mkuffd_wp(pmd);
> +				set_pmd_at(src_mm, addr, src_pmd, pmd);
> +			}
> +
> +			src_folio = pfn_swap_entry_folio(entry);
> +			VM_WARN_ON(!folio_test_large(src_folio));
> +
> +			folio_get(src_folio);
> +			/*
> +			 * folio_try_dup_anon_rmap_pmd does not fail for
> +			 * device private entries.
> +			 */
> +			folio_try_dup_anon_rmap_pmd(src_folio, &src_folio->page,
> +						    dst_vma, src_vma);

folio_get() and folio_try_dup_anon_rmap_pmd() are needed because, contrary to
the migration entry case, this folio exists as a device private one.

>  		}
> +
>  		add_mm_counter(dst_mm, MM_ANONPAGES, HPAGE_PMD_NR);
>  		mm_inc_nr_ptes(dst_mm);
>  		pgtable_trans_huge_deposit(dst_mm, dst_pmd, pgtable);
> @@ -2211,15 +2239,16 @@ int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
>  		folio_remove_rmap_pmd(folio, page, vma);
>  		WARN_ON_ONCE(folio_mapcount(folio) < 0);
>  		VM_BUG_ON_PAGE(!PageHead(page), page);
> -	} else if (thp_migration_supported()) {
> +	} else if (is_pmd_non_present_folio_entry(orig_pmd)) {
>  		swp_entry_t entry;
>
> -		VM_BUG_ON(!is_pmd_migration_entry(orig_pmd));

The new branch condition implies thp_migration_supported() is true here. We
could have VM_WARN_ON_ONCE(!thp_migration_supported()), but that might be too
much.
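i.e. something like this at the top of the branch (untested sketch):

	} else if (is_pmd_non_present_folio_entry(orig_pmd)) {
		swp_entry_t entry;

		/* both sub-checks are 0 without CONFIG_ARCH_ENABLE_THP_MIGRATION */
		VM_WARN_ON_ONCE(!thp_migration_supported());
		...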
>  		entry = pmd_to_swp_entry(orig_pmd);
>  		folio = pfn_swap_entry_folio(entry);
>  		flush_needed = 0;
> -	} else
> -		WARN_ONCE(1, "Non present huge pmd without pmd migration enabled!");
> +
> +		if (!thp_migration_supported())
> +			WARN_ONCE(1, "Non present huge pmd without pmd migration enabled!");
> +	}
>
>  	if (folio_test_anon(folio)) {
>  		zap_deposited_table(tlb->mm, pmd);
> @@ -2239,6 +2268,12 @@ int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
>  		folio_mark_accessed(folio);
>  	}
>
> +	if (folio_is_device_private(folio)) {
> +		folio_remove_rmap_pmd(folio, &folio->page, vma);
> +		WARN_ON_ONCE(folio_mapcount(folio) < 0);
> +		folio_put(folio);
> +	}
> +
>  	spin_unlock(ptl);
>  	if (flush_needed)
>  		tlb_remove_page_size(tlb, &folio->page, HPAGE_PMD_SIZE);
> @@ -2367,7 +2402,7 @@ int change_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
>  		struct folio *folio = pfn_swap_entry_folio(entry);
>  		pmd_t newpmd;
>
> -		VM_BUG_ON(!is_pmd_migration_entry(*pmd));
> +		VM_WARN_ON(!is_pmd_non_present_folio_entry(*pmd));
>  		if (is_writable_migration_entry(entry)) {
>  			/*
>  			 * A protection check is difficult so
> @@ -2380,6 +2415,9 @@ int change_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
>  			newpmd = swp_entry_to_pmd(entry);
>  			if (pmd_swp_soft_dirty(*pmd))
>  				newpmd = pmd_swp_mksoft_dirty(newpmd);
> +		} else if (is_writable_device_private_entry(entry)) {
> +			entry = make_readable_device_private_entry(swp_offset(entry));
> +			newpmd = swp_entry_to_pmd(entry);
>  		} else {
>  			newpmd = *pmd;
>  		}
> diff --git a/mm/pgtable-generic.c b/mm/pgtable-generic.c
> index 567e2d084071..0c847cdf4fd3 100644
> --- a/mm/pgtable-generic.c
> +++ b/mm/pgtable-generic.c
> @@ -290,7 +290,7 @@ pte_t *___pte_offset_map(pmd_t *pmd, unsigned long addr, pmd_t *pmdvalp)
>
>  	if (pmdvalp)
>  		*pmdvalp = pmdval;
> -	if (unlikely(pmd_none(pmdval) || is_pmd_migration_entry(pmdval)))
> +	if (unlikely(pmd_none(pmdval) || !pmd_present(pmdval)))
>  		goto nomap;
>  	if (unlikely(pmd_trans_huge(pmdval)))
>  		goto nomap;
> --
> 2.50.1

Otherwise, LGTM.

Acked-by: Zi Yan <z...@nvidia.com>

Best Regards,
Yan, Zi