Extend the core huge page management functions to handle device-private THP entries, so that large device-private folios are dealt with correctly in fundamental MM operations.
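As an illustrative sketch only (not part of the diff below), a caller classifying a non-present huge PMD is expected to distinguish the two cases roughly as follows; the wrapper function name is a hypothetical placeholder, while is_pmd_migration_entry() is existing code and is_pmd_device_private_entry() is the helper added by this patch:

	/* Hypothetical caller, for illustration only. */
	static bool pmd_is_non_present_huge(pmd_t pmd)
	{
		if (is_pmd_migration_entry(pmd))
			return true;	/* existing case: THP migration entry */
		if (is_pmd_device_private_entry(pmd))
			return true;	/* new case: device-private THP (helper added below) */
		return false;
	}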
The following functions have been updated:

- copy_huge_pmd(): Handle device-private entries during fork/clone
- zap_huge_pmd(): Properly free device-private THP during munmap
- change_huge_pmd(): Support protection changes on device-private THP
- __pte_offset_map(): Add device-private entry awareness

Cc: Andrew Morton <a...@linux-foundation.org>
Cc: David Hildenbrand <da...@redhat.com>
Cc: Zi Yan <z...@nvidia.com>
Cc: Joshua Hahn <joshua.hah...@gmail.com>
Cc: Rakie Kim <rakie....@sk.com>
Cc: Byungchul Park <byungc...@sk.com>
Cc: Gregory Price <gou...@gourry.net>
Cc: Ying Huang <ying.hu...@linux.alibaba.com>
Cc: Alistair Popple <apop...@nvidia.com>
Cc: Oscar Salvador <osalva...@suse.de>
Cc: Lorenzo Stoakes <lorenzo.stoa...@oracle.com>
Cc: Baolin Wang <baolin.w...@linux.alibaba.com>
Cc: "Liam R. Howlett" <liam.howl...@oracle.com>
Cc: Nico Pache <npa...@redhat.com>
Cc: Ryan Roberts <ryan.robe...@arm.com>
Cc: Dev Jain <dev.j...@arm.com>
Cc: Barry Song <bao...@kernel.org>
Cc: Lyude Paul <ly...@redhat.com>
Cc: Danilo Krummrich <d...@kernel.org>
Cc: David Airlie <airl...@gmail.com>
Cc: Simona Vetter <sim...@ffwll.ch>
Cc: Ralph Campbell <rcampb...@nvidia.com>
Cc: Mika Penttilä <mpent...@redhat.com>
Cc: Matthew Brost <matthew.br...@intel.com>
Cc: Francois Dugast <francois.dug...@intel.com>
Signed-off-by: Matthew Brost <matthew.br...@intel.com>
Signed-off-by: Balbir Singh <balb...@nvidia.com>
---
 include/linux/swapops.h | 27 +++++++++++++++++++
 mm/huge_memory.c        | 60 ++++++++++++++++++++++++++++++++++++-----
 mm/pgtable-generic.c    |  6 +++++
 3 files changed, 86 insertions(+), 7 deletions(-)

diff --git a/include/linux/swapops.h b/include/linux/swapops.h
index 64ea151a7ae3..59c5889a4d54 100644
--- a/include/linux/swapops.h
+++ b/include/linux/swapops.h
@@ -594,6 +594,33 @@ static inline int is_pmd_migration_entry(pmd_t pmd)
 }
 #endif /* CONFIG_ARCH_ENABLE_THP_MIGRATION */
 
+#if defined(CONFIG_ZONE_DEVICE) && defined(CONFIG_ARCH_ENABLE_THP_MIGRATION)
+
+/**
+ * is_pmd_device_private_entry() - Check if PMD contains a device private swap entry
+ * @pmd: The PMD to check
+ *
+ * Returns true if the PMD contains a swap entry that represents a device private
+ * page mapping. This is used for zone device private pages that have been
+ * swapped out but still need special handling during various memory management
+ * operations.
+ *
+ * Return: 1 if PMD contains device private entry, 0 otherwise
+ */
+static inline int is_pmd_device_private_entry(pmd_t pmd)
+{
+	return is_swap_pmd(pmd) && is_device_private_entry(pmd_to_swp_entry(pmd));
+}
+
+#else /* CONFIG_ZONE_DEVICE && CONFIG_ARCH_ENABLE_THP_MIGRATION */
+
+static inline int is_pmd_device_private_entry(pmd_t pmd)
+{
+	return 0;
+}
+
+#endif /* CONFIG_ZONE_DEVICE && CONFIG_ARCH_ENABLE_THP_MIGRATION */
+
 static inline int non_swap_entry(swp_entry_t entry)
 {
 	return swp_type(entry) >= MAX_SWAPFILES;
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 26cedfcd7418..2af74e09b279 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -1703,8 +1703,11 @@ int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm,
 	if (unlikely(is_swap_pmd(pmd))) {
 		swp_entry_t entry = pmd_to_swp_entry(pmd);
 
-		VM_BUG_ON(!is_pmd_migration_entry(pmd));
-		if (!is_readable_migration_entry(entry)) {
+		VM_WARN_ON(!is_pmd_migration_entry(pmd) &&
+			   !is_pmd_device_private_entry(pmd));
+
+		if (is_migration_entry(entry) &&
+		    !is_readable_migration_entry(entry)) {
 			entry = make_readable_migration_entry(
 							swp_offset(entry));
 			pmd = swp_entry_to_pmd(entry);
@@ -1713,7 +1716,37 @@ int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm,
 			if (pmd_swp_uffd_wp(*src_pmd))
 				pmd = pmd_swp_mkuffd_wp(pmd);
 			set_pmd_at(src_mm, addr, src_pmd, pmd);
+		} else if (is_device_private_entry(entry)) {
+			/*
+			 * For device private entries, since there are no
+			 * read exclusive entries, writable = !readable
+			 */
+			if (is_writable_device_private_entry(entry)) {
+				entry = make_readable_device_private_entry(
+							swp_offset(entry));
+				pmd = swp_entry_to_pmd(entry);
+
+				if (pmd_swp_soft_dirty(*src_pmd))
+					pmd = pmd_swp_mksoft_dirty(pmd);
+				if (pmd_swp_uffd_wp(*src_pmd))
+					pmd = pmd_swp_mkuffd_wp(pmd);
+				set_pmd_at(src_mm, addr, src_pmd, pmd);
+			}
+
+			src_folio = pfn_swap_entry_folio(entry);
+			VM_WARN_ON(!folio_test_large(src_folio));
+
+			folio_get(src_folio);
+			/*
+			 * folio_try_dup_anon_rmap_pmd does not fail for
+			 * device private entries.
+			 */
+			ret = folio_try_dup_anon_rmap_pmd(src_folio,
+						&src_folio->page,
+						dst_vma, src_vma);
+			VM_WARN_ON(ret);
 		}
+
 		add_mm_counter(dst_mm, MM_ANONPAGES, HPAGE_PMD_NR);
 		mm_inc_nr_ptes(dst_mm);
 		pgtable_trans_huge_deposit(dst_mm, dst_pmd, pgtable);
@@ -2211,15 +2244,17 @@ int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
 			folio_remove_rmap_pmd(folio, page, vma);
 			WARN_ON_ONCE(folio_mapcount(folio) < 0);
 			VM_BUG_ON_PAGE(!PageHead(page), page);
-		} else if (thp_migration_supported()) {
+		} else if (is_pmd_migration_entry(orig_pmd) ||
+			   is_pmd_device_private_entry(orig_pmd)) {
 			swp_entry_t entry;
 
-			VM_BUG_ON(!is_pmd_migration_entry(orig_pmd));
 			entry = pmd_to_swp_entry(orig_pmd);
 			folio = pfn_swap_entry_folio(entry);
 			flush_needed = 0;
-		} else
-			WARN_ONCE(1, "Non present huge pmd without pmd migration enabled!");
+
+			if (!thp_migration_supported())
+				WARN_ONCE(1, "Non present huge pmd without pmd migration enabled!");
+		}
 
 		if (folio_test_anon(folio)) {
 			zap_deposited_table(tlb->mm, pmd);
@@ -2239,6 +2274,12 @@ int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
 			folio_mark_accessed(folio);
 		}
 
+		if (folio_is_device_private(folio)) {
+			folio_remove_rmap_pmd(folio, &folio->page, vma);
+			WARN_ON_ONCE(folio_mapcount(folio) < 0);
+			folio_put(folio);
+		}
+
 		spin_unlock(ptl);
 		if (flush_needed)
 			tlb_remove_page_size(tlb, &folio->page, HPAGE_PMD_SIZE);
@@ -2367,7 +2408,8 @@ int change_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
 		struct folio *folio = pfn_swap_entry_folio(entry);
 		pmd_t newpmd;
 
-		VM_BUG_ON(!is_pmd_migration_entry(*pmd));
+		VM_WARN_ON(!is_pmd_migration_entry(*pmd) &&
+			   !folio_is_device_private(folio));
 		if (is_writable_migration_entry(entry)) {
 			/*
 			 * A protection check is difficult so
@@ -2380,6 +2422,10 @@ int change_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
 			newpmd = swp_entry_to_pmd(entry);
 			if (pmd_swp_soft_dirty(*pmd))
 				newpmd = pmd_swp_mksoft_dirty(newpmd);
+		} else if (is_writable_device_private_entry(entry)) {
+			entry = make_readable_device_private_entry(
+							swp_offset(entry));
+			newpmd = swp_entry_to_pmd(entry);
 		} else {
 			newpmd = *pmd;
 		}
diff --git a/mm/pgtable-generic.c b/mm/pgtable-generic.c
index 567e2d084071..604e8206a2ec 100644
--- a/mm/pgtable-generic.c
+++ b/mm/pgtable-generic.c
@@ -292,6 +292,12 @@ pte_t *___pte_offset_map(pmd_t *pmd, unsigned long addr, pmd_t *pmdvalp)
 		*pmdvalp = pmdval;
 	if (unlikely(pmd_none(pmdval) || is_pmd_migration_entry(pmdval)))
 		goto nomap;
+	if (is_swap_pmd(pmdval)) {
+		swp_entry_t entry = pmd_to_swp_entry(pmdval);
+
+		if (is_device_private_entry(entry))
+			goto nomap;
+	}
 	if (unlikely(pmd_trans_huge(pmdval)))
 		goto nomap;
 	if (unlikely(pmd_bad(pmdval))) {
--
2.50.1
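Editorial note (a minimal sketch, not part of the patch): with the ___pte_offset_map() change above, a device-private huge PMD takes the same bail-out path as a PMD migration entry, so PTE-level walkers that already cope with a NULL return keep working unchanged. Assuming the usual mm/pmd/addr/ptl locals of such a walker:

	pte_t *pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
	if (!pte)
		return 0;	/* non-present huge PMD, including device-private THP */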