On Tue, Aug 12, 2025 at 12:40:27PM +1000, Balbir Singh wrote: > Make THP handling code in the mm subsystem for THP pages aware of zone > device pages. Although the code is designed to be generic when it comes > to handling splitting of pages, the code is designed to work for THP > page sizes corresponding to HPAGE_PMD_NR. > > Modify page_vma_mapped_walk() to return true when a zone device huge > entry is present, enabling try_to_migrate() and other code migration > paths to appropriately process the entry. page_vma_mapped_walk() will > return true for zone device private large folios only when > PVMW_THP_DEVICE_PRIVATE is passed. This is to prevent locations that are > not zone device private pages from having to add awareness. The key > callback that needs this flag is try_to_migrate_one(). The other > callbacks page idle, damon use it for setting young/dirty bits, which is > not significant when it comes to pmd level bit harvesting. > > pmd_pfn() does not work well with zone device entries, use > pfn_pmd_entry_to_swap() for checking and comparison as for zone device > entries. > > Support partial unmapping of zone device private entries, which happens > via munmap(). munmap() causes the device private entry pmd to be split, > but the corresponding folio is not split. Deferred split does not work for > zone device private folios due to the need to split during fault > handling. Get migrate_vma_collect_pmd() to handle this case by splitting > partially unmapped device private folios. > > Cc: Andrew Morton <a...@linux-foundation.org> > Cc: David Hildenbrand <da...@redhat.com> > Cc: Zi Yan <z...@nvidia.com> > Cc: Joshua Hahn <joshua.hah...@gmail.com> > Cc: Rakie Kim <rakie....@sk.com> > Cc: Byungchul Park <byungc...@sk.com> > Cc: Gregory Price <gou...@gourry.net> > Cc: Ying Huang <ying.hu...@linux.alibaba.com> > Cc: Alistair Popple <apop...@nvidia.com> > Cc: Oscar Salvador <osalva...@suse.de> > Cc: Lorenzo Stoakes <lorenzo.stoa...@oracle.com> > Cc: Baolin Wang <baolin.w...@linux.alibaba.com> > Cc: "Liam R. 
Howlett" <liam.howl...@oracle.com> > Cc: Nico Pache <npa...@redhat.com> > Cc: Ryan Roberts <ryan.robe...@arm.com> > Cc: Dev Jain <dev.j...@arm.com> > Cc: Barry Song <bao...@kernel.org> > Cc: Lyude Paul <ly...@redhat.com> > Cc: Danilo Krummrich <d...@kernel.org> > Cc: David Airlie <airl...@gmail.com> > Cc: Simona Vetter <sim...@ffwll.ch> > Cc: Ralph Campbell <rcampb...@nvidia.com> > Cc: Mika Penttilä <mpent...@redhat.com> > Cc: Matthew Brost <matthew.br...@intel.com> > Cc: Francois Dugast <francois.dug...@intel.com> > > Signed-off-by: Matthew Brost <matthew.br...@intel.com> > Signed-off-by: Balbir Singh <balb...@nvidia.com> > --- > include/linux/rmap.h | 2 + > include/linux/swapops.h | 17 ++++ > lib/test_hmm.c | 2 +- > mm/huge_memory.c | 214 +++++++++++++++++++++++++++++++--------- > mm/migrate_device.c | 47 +++++++++ > mm/page_vma_mapped.c | 13 ++- > mm/pgtable-generic.c | 6 ++ > mm/rmap.c | 24 ++++- > 8 files changed, 272 insertions(+), 53 deletions(-) > > diff --git a/include/linux/rmap.h b/include/linux/rmap.h > index 6cd020eea37a..dfb7aae3d77b 100644 > --- a/include/linux/rmap.h > +++ b/include/linux/rmap.h > @@ -927,6 +927,8 @@ struct page *make_device_exclusive(struct mm_struct *mm, > unsigned long addr, > #define PVMW_SYNC (1 << 0) > /* Look for migration entries rather than present PTEs */ > #define PVMW_MIGRATION (1 << 1) > +/* Look for device private THP entries */ > +#define PVMW_THP_DEVICE_PRIVATE (1 << 2) > > struct page_vma_mapped_walk { > unsigned long pfn; > diff --git a/include/linux/swapops.h b/include/linux/swapops.h > index 64ea151a7ae3..2641c01bd5d2 100644 > --- a/include/linux/swapops.h > +++ b/include/linux/swapops.h > @@ -563,6 +563,7 @@ static inline int is_pmd_migration_entry(pmd_t pmd) > { > return is_swap_pmd(pmd) && is_migration_entry(pmd_to_swp_entry(pmd)); > } > + > #else /* CONFIG_ARCH_ENABLE_THP_MIGRATION */ > static inline int set_pmd_migration_entry(struct page_vma_mapped_walk *pvmw, > struct page *page) > @@ -594,6 +595,22 @@ static inline int is_pmd_migration_entry(pmd_t pmd) > } > #endif /* CONFIG_ARCH_ENABLE_THP_MIGRATION */ > > +#if defined(CONFIG_ZONE_DEVICE) && defined(CONFIG_ARCH_ENABLE_THP_MIGRATION) > + > +static inline int is_pmd_device_private_entry(pmd_t pmd) > +{ > + return is_swap_pmd(pmd) && > is_device_private_entry(pmd_to_swp_entry(pmd)); > +} > + > +#else /* CONFIG_ZONE_DEVICE && CONFIG_ARCH_ENABLE_THP_MIGRATION */ > + > +static inline int is_pmd_device_private_entry(pmd_t pmd) > +{ > + return 0; > +} > + > +#endif /* CONFIG_ZONE_DEVICE && CONFIG_ARCH_ENABLE_THP_MIGRATION */ > + > static inline int non_swap_entry(swp_entry_t entry) > { > return swp_type(entry) >= MAX_SWAPFILES; > diff --git a/lib/test_hmm.c b/lib/test_hmm.c > index 761725bc713c..297f1e034045 100644 > --- a/lib/test_hmm.c > +++ b/lib/test_hmm.c > @@ -1408,7 +1408,7 @@ static vm_fault_t dmirror_devmem_fault(struct vm_fault > *vmf) > * the mirror but here we use it to hold the page for the simulated > * device memory and that page holds the pointer to the mirror. 
> */ > - rpage = vmf->page->zone_device_data; > + rpage = folio_page(page_folio(vmf->page), 0)->zone_device_data; > dmirror = rpage->zone_device_data; > > /* FIXME demonstrate how we can adjust migrate range */ > diff --git a/mm/huge_memory.c b/mm/huge_memory.c > index 9c38a95e9f09..2495e3fdbfae 100644 > --- a/mm/huge_memory.c > +++ b/mm/huge_memory.c > @@ -1711,8 +1711,11 @@ int copy_huge_pmd(struct mm_struct *dst_mm, struct > mm_struct *src_mm, > if (unlikely(is_swap_pmd(pmd))) { > swp_entry_t entry = pmd_to_swp_entry(pmd); > > - VM_BUG_ON(!is_pmd_migration_entry(pmd)); > - if (!is_readable_migration_entry(entry)) { > + VM_WARN_ON(!is_pmd_migration_entry(pmd) && > + !is_pmd_device_private_entry(pmd)); > + > + if (is_migration_entry(entry) && > + is_writable_migration_entry(entry)) { > entry = make_readable_migration_entry( > swp_offset(entry)); > pmd = swp_entry_to_pmd(entry); > @@ -1722,6 +1725,32 @@ int copy_huge_pmd(struct mm_struct *dst_mm, struct > mm_struct *src_mm, > pmd = pmd_swp_mkuffd_wp(pmd); > set_pmd_at(src_mm, addr, src_pmd, pmd); > } > + > + if (is_device_private_entry(entry)) { > + if (is_writable_device_private_entry(entry)) { > + entry = make_readable_device_private_entry( > + swp_offset(entry)); > + pmd = swp_entry_to_pmd(entry); > + > + if (pmd_swp_soft_dirty(*src_pmd)) > + pmd = pmd_swp_mksoft_dirty(pmd); > + if (pmd_swp_uffd_wp(*src_pmd)) > + pmd = pmd_swp_mkuffd_wp(pmd); > + set_pmd_at(src_mm, addr, src_pmd, pmd); > + } > + > + src_folio = pfn_swap_entry_folio(entry); > + VM_WARN_ON(!folio_test_large(src_folio)); > + > + folio_get(src_folio); > + /* > + * folio_try_dup_anon_rmap_pmd does not fail for > + * device private entries. > + */ > + VM_WARN_ON(folio_try_dup_anon_rmap_pmd(src_folio, > + &src_folio->page, dst_vma, src_vma));
VM_WARN_ON() compiles out in non-debug builds [1], so the
folio_try_dup_anon_rmap_pmd() call above is never made when CONFIG_DEBUG_VM
is disabled. I hit this running the fork selftest I shared, with a non-debug
build. A sketch of the kind of change I mean is at the bottom of this mail,
below the quoted patch.

Matt

[1] https://elixir.bootlin.com/linux/v6.16.3/source/include/linux/mmdebug.h#L112

> + } > + > add_mm_counter(dst_mm, MM_ANONPAGES, HPAGE_PMD_NR); > mm_inc_nr_ptes(dst_mm); > pgtable_trans_huge_deposit(dst_mm, dst_pmd, pgtable); > @@ -2219,15 +2248,22 @@ int zap_huge_pmd(struct mmu_gather *tlb, struct > vm_area_struct *vma, > folio_remove_rmap_pmd(folio, page, vma); > WARN_ON_ONCE(folio_mapcount(folio) < 0); > VM_BUG_ON_PAGE(!PageHead(page), page); > - } else if (thp_migration_supported()) { > + } else if (is_pmd_migration_entry(orig_pmd) || > + is_pmd_device_private_entry(orig_pmd)) { > swp_entry_t entry; > > - VM_BUG_ON(!is_pmd_migration_entry(orig_pmd)); > entry = pmd_to_swp_entry(orig_pmd); > folio = pfn_swap_entry_folio(entry); > flush_needed = 0; > - } else > - WARN_ONCE(1, "Non present huge pmd without pmd > migration enabled!"); > + > + if (!thp_migration_supported()) > + WARN_ONCE(1, "Non present huge pmd without pmd > migration enabled!"); > + > + if (is_pmd_device_private_entry(orig_pmd)) { > + folio_remove_rmap_pmd(folio, &folio->page, vma); > + WARN_ON_ONCE(folio_mapcount(folio) < 0); > + } > + } > > if (folio_test_anon(folio)) { > zap_deposited_table(tlb->mm, pmd); > @@ -2247,6 +2283,15 @@ int zap_huge_pmd(struct mmu_gather *tlb, struct > vm_area_struct *vma, > folio_mark_accessed(folio); > } > > + /* > + * Do a folio put on zone device private pages after > + * changes to mm_counter, because the folio_put() will > + * clean folio->mapping and the folio_test_anon() check > + * will not be usable. > + */ > + if (folio_is_device_private(folio)) > + folio_put(folio); > + > spin_unlock(ptl); > if (flush_needed) > tlb_remove_page_size(tlb, &folio->page, HPAGE_PMD_SIZE); > @@ -2375,7 +2420,8 @@ int change_huge_pmd(struct mmu_gather *tlb, struct > vm_area_struct *vma, > struct folio *folio = pfn_swap_entry_folio(entry); > pmd_t newpmd; > > - VM_BUG_ON(!is_pmd_migration_entry(*pmd)); > + VM_WARN_ON(!is_pmd_migration_entry(*pmd) && > + !folio_is_device_private(folio)); > if (is_writable_migration_entry(entry)) { > /* > * A protection check is difficult so > @@ -2388,6 +2434,10 @@ int change_huge_pmd(struct mmu_gather *tlb, struct > vm_area_struct *vma, > newpmd = swp_entry_to_pmd(entry); > if (pmd_swp_soft_dirty(*pmd)) > newpmd = pmd_swp_mksoft_dirty(newpmd); > + } else if (is_writable_device_private_entry(entry)) { > + entry = make_readable_device_private_entry( > + swp_offset(entry)); > + newpmd = swp_entry_to_pmd(entry); > } else { > newpmd = *pmd; > } > @@ -2842,16 +2892,19 @@ static void __split_huge_pmd_locked(struct > vm_area_struct *vma, pmd_t *pmd, > struct page *page; > pgtable_t pgtable; > pmd_t old_pmd, _pmd; > - bool young, write, soft_dirty, pmd_migration = false, uffd_wp = false; > - bool anon_exclusive = false, dirty = false; > + bool young, write, soft_dirty, uffd_wp = false; > + bool anon_exclusive = false, dirty = false, present = false; > unsigned long addr; > pte_t *pte; > int i; > + swp_entry_t swp_entry; > > VM_BUG_ON(haddr & ~HPAGE_PMD_MASK); > VM_BUG_ON_VMA(vma->vm_start > haddr, vma); > VM_BUG_ON_VMA(vma->vm_end < haddr + HPAGE_PMD_SIZE, vma); > - VM_BUG_ON(!is_pmd_migration_entry(*pmd) && !pmd_trans_huge(*pmd)); > + > + VM_WARN_ON(!is_pmd_migration_entry(*pmd) && !pmd_trans_huge(*pmd) > + && !(is_pmd_device_private_entry(*pmd))); > > count_vm_event(THP_SPLIT_PMD); > > @@ -2899,18 +2952,45 @@ static void __split_huge_pmd_locked(struct >
vm_area_struct *vma, pmd_t *pmd, > return __split_huge_zero_page_pmd(vma, haddr, pmd); > } > > - pmd_migration = is_pmd_migration_entry(*pmd); > - if (unlikely(pmd_migration)) { > - swp_entry_t entry; > > + present = pmd_present(*pmd); > + if (unlikely(!present)) { > + swp_entry = pmd_to_swp_entry(*pmd); > old_pmd = *pmd; > - entry = pmd_to_swp_entry(old_pmd); > - page = pfn_swap_entry_to_page(entry); > - write = is_writable_migration_entry(entry); > - if (PageAnon(page)) > - anon_exclusive = > is_readable_exclusive_migration_entry(entry); > - young = is_migration_entry_young(entry); > - dirty = is_migration_entry_dirty(entry); > + > + folio = pfn_swap_entry_folio(swp_entry); > + VM_WARN_ON(!is_migration_entry(swp_entry) && > + !is_device_private_entry(swp_entry)); > + page = pfn_swap_entry_to_page(swp_entry); > + > + if (is_pmd_migration_entry(old_pmd)) { > + write = is_writable_migration_entry(swp_entry); > + if (PageAnon(page)) > + anon_exclusive = > + is_readable_exclusive_migration_entry( > + swp_entry); > + young = is_migration_entry_young(swp_entry); > + dirty = is_migration_entry_dirty(swp_entry); > + } else if (is_pmd_device_private_entry(old_pmd)) { > + write = is_writable_device_private_entry(swp_entry); > + anon_exclusive = PageAnonExclusive(page); > + if (freeze && anon_exclusive && > + folio_try_share_anon_rmap_pmd(folio, page)) > + freeze = false; > + if (!freeze) { > + rmap_t rmap_flags = RMAP_NONE; > + > + if (anon_exclusive) > + rmap_flags |= RMAP_EXCLUSIVE; > + > + folio_ref_add(folio, HPAGE_PMD_NR - 1); > + if (anon_exclusive) > + rmap_flags |= RMAP_EXCLUSIVE; > + folio_add_anon_rmap_ptes(folio, page, > HPAGE_PMD_NR, > + vma, haddr, rmap_flags); > + } > + } > + > soft_dirty = pmd_swp_soft_dirty(old_pmd); > uffd_wp = pmd_swp_uffd_wp(old_pmd); > } else { > @@ -2996,30 +3076,49 @@ static void __split_huge_pmd_locked(struct > vm_area_struct *vma, pmd_t *pmd, > * Note that NUMA hinting access restrictions are not transferred to > * avoid any possibility of altering permissions across VMAs. 
> */ > - if (freeze || pmd_migration) { > + if (freeze || !present) { > for (i = 0, addr = haddr; i < HPAGE_PMD_NR; i++, addr += > PAGE_SIZE) { > pte_t entry; > - swp_entry_t swp_entry; > - > - if (write) > - swp_entry = make_writable_migration_entry( > - page_to_pfn(page + i)); > - else if (anon_exclusive) > - swp_entry = > make_readable_exclusive_migration_entry( > - page_to_pfn(page + i)); > - else > - swp_entry = make_readable_migration_entry( > - page_to_pfn(page + i)); > - if (young) > - swp_entry = > make_migration_entry_young(swp_entry); > - if (dirty) > - swp_entry = > make_migration_entry_dirty(swp_entry); > - entry = swp_entry_to_pte(swp_entry); > - if (soft_dirty) > - entry = pte_swp_mksoft_dirty(entry); > - if (uffd_wp) > - entry = pte_swp_mkuffd_wp(entry); > - > + if (freeze || is_migration_entry(swp_entry)) { > + if (write) > + swp_entry = > make_writable_migration_entry( > + > page_to_pfn(page + i)); > + else if (anon_exclusive) > + swp_entry = > make_readable_exclusive_migration_entry( > + > page_to_pfn(page + i)); > + else > + swp_entry = > make_readable_migration_entry( > + > page_to_pfn(page + i)); > + if (young) > + swp_entry = > make_migration_entry_young(swp_entry); > + if (dirty) > + swp_entry = > make_migration_entry_dirty(swp_entry); > + entry = swp_entry_to_pte(swp_entry); > + if (soft_dirty) > + entry = pte_swp_mksoft_dirty(entry); > + if (uffd_wp) > + entry = pte_swp_mkuffd_wp(entry); > + } else { > + /* > + * anon_exclusive was already propagated to the > relevant > + * pages corresponding to the pte entries when > freeze > + * is false. > + */ > + if (write) > + swp_entry = > make_writable_device_private_entry( > + > page_to_pfn(page + i)); > + else > + swp_entry = > make_readable_device_private_entry( > + > page_to_pfn(page + i)); > + /* > + * Young and dirty bits are not progated via > swp_entry > + */ > + entry = swp_entry_to_pte(swp_entry); > + if (soft_dirty) > + entry = pte_swp_mksoft_dirty(entry); > + if (uffd_wp) > + entry = pte_swp_mkuffd_wp(entry); > + } > VM_WARN_ON(!pte_none(ptep_get(pte + i))); > set_pte_at(mm, addr, pte + i, entry); > } > @@ -3046,7 +3145,7 @@ static void __split_huge_pmd_locked(struct > vm_area_struct *vma, pmd_t *pmd, > } > pte_unmap(pte); > > - if (!pmd_migration) > + if (present) > folio_remove_rmap_pmd(folio, page, vma); > if (freeze) > put_page(page); > @@ -3058,8 +3157,10 @@ static void __split_huge_pmd_locked(struct > vm_area_struct *vma, pmd_t *pmd, > void split_huge_pmd_locked(struct vm_area_struct *vma, unsigned long address, > pmd_t *pmd, bool freeze) > { > + > VM_WARN_ON_ONCE(!IS_ALIGNED(address, HPAGE_PMD_SIZE)); > - if (pmd_trans_huge(*pmd) || is_pmd_migration_entry(*pmd)) > + if (pmd_trans_huge(*pmd) || is_pmd_migration_entry(*pmd) || > + (is_pmd_device_private_entry(*pmd))) > __split_huge_pmd_locked(vma, pmd, address, freeze); > } > > @@ -3238,6 +3339,9 @@ static void lru_add_split_folio(struct folio *folio, > struct folio *new_folio, > VM_BUG_ON_FOLIO(folio_test_lru(new_folio), folio); > lockdep_assert_held(&lruvec->lru_lock); > > + if (folio_is_device_private(folio)) > + return; > + > if (list) { > /* page reclaim is reclaiming a huge page */ > VM_WARN_ON(folio_test_lru(folio)); > @@ -3252,6 +3356,7 @@ static void lru_add_split_folio(struct folio *folio, > struct folio *new_folio, > list_add_tail(&new_folio->lru, &folio->lru); > folio_set_lru(new_folio); > } > + > } > > /* Racy check whether the huge page can be split */ > @@ -3727,7 +3832,7 @@ static int __folio_split(struct folio *folio, unsigned > int 
new_order, > > /* Prevent deferred_split_scan() touching ->_refcount */ > spin_lock(&ds_queue->split_queue_lock); > - if (folio_ref_freeze(folio, 1 + extra_pins)) { > + if (folio_ref_freeze(folio, 1 + folio_expected_ref_count(folio))) { > struct address_space *swap_cache = NULL; > struct lruvec *lruvec; > int expected_refs; > @@ -3858,8 +3963,9 @@ static int __folio_split(struct folio *folio, unsigned > int new_order, > if (nr_shmem_dropped) > shmem_uncharge(mapping->host, nr_shmem_dropped); > > - if (!ret && is_anon) > + if (!ret && is_anon && !folio_is_device_private(folio)) > remap_flags = RMP_USE_SHARED_ZEROPAGE; > + > remap_page(folio, 1 << order, remap_flags); > > /* > @@ -4603,7 +4709,10 @@ int set_pmd_migration_entry(struct > page_vma_mapped_walk *pvmw, > return 0; > > flush_cache_range(vma, address, address + HPAGE_PMD_SIZE); > - pmdval = pmdp_invalidate(vma, address, pvmw->pmd); > + if (unlikely(is_pmd_device_private_entry(*pvmw->pmd))) > + pmdval = pmdp_huge_clear_flush(vma, address, pvmw->pmd); > + else > + pmdval = pmdp_invalidate(vma, address, pvmw->pmd); > > /* See folio_try_share_anon_rmap_pmd(): invalidate PMD first. */ > anon_exclusive = folio_test_anon(folio) && PageAnonExclusive(page); > @@ -4653,6 +4762,17 @@ void remove_migration_pmd(struct page_vma_mapped_walk > *pvmw, struct page *new) > entry = pmd_to_swp_entry(*pvmw->pmd); > folio_get(folio); > pmde = folio_mk_pmd(folio, READ_ONCE(vma->vm_page_prot)); > + > + if (folio_is_device_private(folio)) { > + if (pmd_write(pmde)) > + entry = make_writable_device_private_entry( > + page_to_pfn(new)); > + else > + entry = make_readable_device_private_entry( > + page_to_pfn(new)); > + pmde = swp_entry_to_pmd(entry); > + } > + > if (pmd_swp_soft_dirty(*pvmw->pmd)) > pmde = pmd_mksoft_dirty(pmde); > if (is_writable_migration_entry(entry)) > diff --git a/mm/migrate_device.c b/mm/migrate_device.c > index e05e14d6eacd..0ed337f94fcd 100644 > --- a/mm/migrate_device.c > +++ b/mm/migrate_device.c > @@ -136,6 +136,8 @@ static int migrate_vma_collect_pmd(pmd_t *pmdp, > * page table entry. Other special swap entries are not > * migratable, and we ignore regular swapped page. > */ > + struct folio *folio; > + > entry = pte_to_swp_entry(pte); > if (!is_device_private_entry(entry)) > goto next; > @@ -147,6 +149,51 @@ static int migrate_vma_collect_pmd(pmd_t *pmdp, > pgmap->owner != migrate->pgmap_owner) > goto next; > > + folio = page_folio(page); > + if (folio_test_large(folio)) { > + struct folio *new_folio; > + struct folio *new_fault_folio = NULL; > + > + /* > + * The reason for finding pmd present with a > + * device private pte and a large folio for the > + * pte is partial unmaps. 
Split the folio now > + * for the migration to be handled correctly > + */ > + pte_unmap_unlock(ptep, ptl); > + > + folio_get(folio); > + if (folio != fault_folio) > + folio_lock(folio); > + if (split_folio(folio)) { > + if (folio != fault_folio) > + folio_unlock(folio); > + ptep = pte_offset_map_lock(mm, pmdp, > addr, &ptl); > + goto next; > + } > + > + new_folio = page_folio(page); > + if (fault_folio) > + new_fault_folio = > page_folio(migrate->fault_page); > + > + /* > + * Ensure the lock is held on the correct > + * folio after the split > + */ > + if (!new_fault_folio) { > + folio_unlock(folio); > + folio_put(folio); > + } else if (folio != new_fault_folio) { > + folio_get(new_fault_folio); > + folio_lock(new_fault_folio); > + folio_unlock(folio); > + folio_put(folio); > + } > + > + addr = start; > + goto again; > + } > + > mpfn = migrate_pfn(page_to_pfn(page)) | > MIGRATE_PFN_MIGRATE; > if (is_writable_device_private_entry(entry)) > diff --git a/mm/page_vma_mapped.c b/mm/page_vma_mapped.c > index e981a1a292d2..246e6c211f34 100644 > --- a/mm/page_vma_mapped.c > +++ b/mm/page_vma_mapped.c > @@ -250,12 +250,11 @@ bool page_vma_mapped_walk(struct page_vma_mapped_walk > *pvmw) > pvmw->ptl = pmd_lock(mm, pvmw->pmd); > pmde = *pvmw->pmd; > if (!pmd_present(pmde)) { > - swp_entry_t entry; > + swp_entry_t entry = pmd_to_swp_entry(pmde); > > if (!thp_migration_supported() || > !(pvmw->flags & PVMW_MIGRATION)) > return not_found(pvmw); > - entry = pmd_to_swp_entry(pmde); > if (!is_migration_entry(entry) || > !check_pmd(swp_offset_pfn(entry), pvmw)) > return not_found(pvmw); > @@ -277,6 +276,16 @@ bool page_vma_mapped_walk(struct page_vma_mapped_walk > *pvmw) > * cannot return prematurely, while zap_huge_pmd() has > * cleared *pmd but not decremented compound_mapcount(). 
> */ > + swp_entry_t entry; > + > + entry = pmd_to_swp_entry(pmde); > + > + if (is_device_private_entry(entry) && > + (pvmw->flags & PVMW_THP_DEVICE_PRIVATE)) { > + pvmw->ptl = pmd_lock(mm, pvmw->pmd); > + return true; > + } > + > if ((pvmw->flags & PVMW_SYNC) && > thp_vma_suitable_order(vma, pvmw->address, > PMD_ORDER) && > diff --git a/mm/pgtable-generic.c b/mm/pgtable-generic.c > index 567e2d084071..604e8206a2ec 100644 > --- a/mm/pgtable-generic.c > +++ b/mm/pgtable-generic.c > @@ -292,6 +292,12 @@ pte_t *___pte_offset_map(pmd_t *pmd, unsigned long addr, > pmd_t *pmdvalp) > *pmdvalp = pmdval; > if (unlikely(pmd_none(pmdval) || is_pmd_migration_entry(pmdval))) > goto nomap; > + if (is_swap_pmd(pmdval)) { > + swp_entry_t entry = pmd_to_swp_entry(pmdval); > + > + if (is_device_private_entry(entry)) > + goto nomap; > + } > if (unlikely(pmd_trans_huge(pmdval))) > goto nomap; > if (unlikely(pmd_bad(pmdval))) { > diff --git a/mm/rmap.c b/mm/rmap.c > index b5837075b6e0..f40e45564295 100644 > --- a/mm/rmap.c > +++ b/mm/rmap.c > @@ -2285,7 +2285,8 @@ static bool try_to_migrate_one(struct folio *folio, > struct vm_area_struct *vma, > unsigned long address, void *arg) > { > struct mm_struct *mm = vma->vm_mm; > - DEFINE_FOLIO_VMA_WALK(pvmw, folio, vma, address, 0); > + DEFINE_FOLIO_VMA_WALK(pvmw, folio, vma, address, > + PVMW_THP_DEVICE_PRIVATE); > bool anon_exclusive, writable, ret = true; > pte_t pteval; > struct page *subpage; > @@ -2330,6 +2331,10 @@ static bool try_to_migrate_one(struct folio *folio, > struct vm_area_struct *vma, > while (page_vma_mapped_walk(&pvmw)) { > /* PMD-mapped THP migration entry */ > if (!pvmw.pte) { > +#ifdef CONFIG_ARCH_ENABLE_THP_MIGRATION > + unsigned long pfn; > +#endif > + > if (flags & TTU_SPLIT_HUGE_PMD) { > split_huge_pmd_locked(vma, pvmw.address, > pvmw.pmd, true); > @@ -2338,8 +2343,21 @@ static bool try_to_migrate_one(struct folio *folio, > struct vm_area_struct *vma, > break; > } > #ifdef CONFIG_ARCH_ENABLE_THP_MIGRATION > - subpage = folio_page(folio, > - pmd_pfn(*pvmw.pmd) - folio_pfn(folio)); > + /* > + * Zone device private folios do not work well with > + * pmd_pfn() on some architectures due to pte > + * inversion. > + */ > + if (is_pmd_device_private_entry(*pvmw.pmd)) { > + swp_entry_t entry = pmd_to_swp_entry(*pvmw.pmd); > + > + pfn = swp_offset_pfn(entry); > + } else { > + pfn = pmd_pfn(*pvmw.pmd); > + } > + > + subpage = folio_page(folio, pfn - folio_pfn(folio)); > + > VM_BUG_ON_FOLIO(folio_test_hugetlb(folio) || > !folio_test_pmd_mappable(folio), folio); > > -- > 2.50.1 >
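As promised above, here is what I am referring to for the VM_WARN_ON() in
copy_huge_pmd(). With CONFIG_DEBUG_VM=n the macro does not evaluate its
argument at all:

    /* include/linux/mmdebug.h, !CONFIG_DEBUG_VM case */
    #define VM_WARN_ON(cond) BUILD_BUG_ON_INVALID(cond)

    /* include/linux/build_bug.h: sizeof() never evaluates its operand */
    #define BUILD_BUG_ON_INVALID(e) ((void)(sizeof((__force long)(e))))

so on a non-debug build the rmap duplication for the device private PMD is
skipped entirely. An untested sketch of the kind of change I would expect is
below: make the call unconditionally and only warn on the (unexpected)
failure, for example via WARN_ON_ONCE(), which always evaluates its argument:

    folio_get(src_folio);
    /*
     * folio_try_dup_anon_rmap_pmd() is not expected to fail for
     * device private entries, but the call itself must still run on
     * CONFIG_DEBUG_VM=n builds, so don't hide it inside VM_WARN_ON().
     */
    WARN_ON_ONCE(folio_try_dup_anon_rmap_pmd(src_folio, &src_folio->page,
                                             dst_vma, src_vma));

Any variant works, as long as the call itself sits outside the assertion
macro.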