On Tue, Aug 12, 2025 at 12:40:27PM +1000, Balbir Singh wrote:
> Make the THP handling code in the mm subsystem aware of zone device
> pages. Although the code is designed to be generic when it comes to
> handling the splitting of pages, it is designed to work only for THP
> page sizes corresponding to HPAGE_PMD_NR.
> 
> Modify page_vma_mapped_walk() to return true when a zone device huge
> entry is present, enabling try_to_migrate() and other migration code
> paths to process the entry appropriately. page_vma_mapped_walk() will
> return true for zone device private large folios only when
> PVMW_THP_DEVICE_PRIVATE is passed. This prevents call sites that do not
> deal with zone device private pages from having to add awareness. The
> key callback that needs this flag is try_to_migrate_one(). The other
> callbacks (page idle and DAMON) use the walk for setting young/dirty
> bits, which is not significant for pmd-level bit harvesting.
> 
> pmd_pfn() does not work well with zone device entries; use
> pfn_pmd_entry_to_swap() instead for checking and comparing zone device
> entries.
> 
> Support partial unmapping of zone device private entries, which happens
> via munmap(). munmap() causes the device private entry pmd to be split,
> but the corresponding folio is not split. Deferred split does not work for
> zone device private folios due to the need to split during fault
> handling. Get migrate_vma_collect_pmd() to handle this case by splitting
> partially unmapped device private folios.
> 
> Cc: Andrew Morton <a...@linux-foundation.org>
> Cc: David Hildenbrand <da...@redhat.com>
> Cc: Zi Yan <z...@nvidia.com>
> Cc: Joshua Hahn <joshua.hah...@gmail.com>
> Cc: Rakie Kim <rakie....@sk.com>
> Cc: Byungchul Park <byungc...@sk.com>
> Cc: Gregory Price <gou...@gourry.net>
> Cc: Ying Huang <ying.hu...@linux.alibaba.com>
> Cc: Alistair Popple <apop...@nvidia.com>
> Cc: Oscar Salvador <osalva...@suse.de>
> Cc: Lorenzo Stoakes <lorenzo.stoa...@oracle.com>
> Cc: Baolin Wang <baolin.w...@linux.alibaba.com>
> Cc: "Liam R. Howlett" <liam.howl...@oracle.com>
> Cc: Nico Pache <npa...@redhat.com>
> Cc: Ryan Roberts <ryan.robe...@arm.com>
> Cc: Dev Jain <dev.j...@arm.com>
> Cc: Barry Song <bao...@kernel.org>
> Cc: Lyude Paul <ly...@redhat.com>
> Cc: Danilo Krummrich <d...@kernel.org>
> Cc: David Airlie <airl...@gmail.com>
> Cc: Simona Vetter <sim...@ffwll.ch>
> Cc: Ralph Campbell <rcampb...@nvidia.com>
> Cc: Mika Penttilä <mpent...@redhat.com>
> Cc: Matthew Brost <matthew.br...@intel.com>
> Cc: Francois Dugast <francois.dug...@intel.com>
> 
> Signed-off-by: Matthew Brost <matthew.br...@intel.com>
> Signed-off-by: Balbir Singh <balb...@nvidia.com>
> ---
>  include/linux/rmap.h    |   2 +
>  include/linux/swapops.h |  17 ++++
>  lib/test_hmm.c          |   2 +-
>  mm/huge_memory.c        | 214 +++++++++++++++++++++++++++++++---------
>  mm/migrate_device.c     |  47 +++++++++
>  mm/page_vma_mapped.c    |  13 ++-
>  mm/pgtable-generic.c    |   6 ++
>  mm/rmap.c               |  24 ++++-
>  8 files changed, 272 insertions(+), 53 deletions(-)
> 
> diff --git a/include/linux/rmap.h b/include/linux/rmap.h
> index 6cd020eea37a..dfb7aae3d77b 100644
> --- a/include/linux/rmap.h
> +++ b/include/linux/rmap.h
> @@ -927,6 +927,8 @@ struct page *make_device_exclusive(struct mm_struct *mm, unsigned long addr,
>  #define PVMW_SYNC            (1 << 0)
>  /* Look for migration entries rather than present PTEs */
>  #define PVMW_MIGRATION               (1 << 1)
> +/* Look for device private THP entries */
> +#define PVMW_THP_DEVICE_PRIVATE      (1 << 2)
>  
>  struct page_vma_mapped_walk {
>       unsigned long pfn;
> diff --git a/include/linux/swapops.h b/include/linux/swapops.h
> index 64ea151a7ae3..2641c01bd5d2 100644
> --- a/include/linux/swapops.h
> +++ b/include/linux/swapops.h
> @@ -563,6 +563,7 @@ static inline int is_pmd_migration_entry(pmd_t pmd)
>  {
>       return is_swap_pmd(pmd) && is_migration_entry(pmd_to_swp_entry(pmd));
>  }
> +
>  #else  /* CONFIG_ARCH_ENABLE_THP_MIGRATION */
>  static inline int set_pmd_migration_entry(struct page_vma_mapped_walk *pvmw,
>               struct page *page)
> @@ -594,6 +595,22 @@ static inline int is_pmd_migration_entry(pmd_t pmd)
>  }
>  #endif  /* CONFIG_ARCH_ENABLE_THP_MIGRATION */
>  
> +#if defined(CONFIG_ZONE_DEVICE) && defined(CONFIG_ARCH_ENABLE_THP_MIGRATION)
> +
> +static inline int is_pmd_device_private_entry(pmd_t pmd)
> +{
> +     return is_swap_pmd(pmd) && is_device_private_entry(pmd_to_swp_entry(pmd));
> +}
> +
> +#else /* CONFIG_ZONE_DEVICE && CONFIG_ARCH_ENABLE_THP_MIGRATION */
> +
> +static inline int is_pmd_device_private_entry(pmd_t pmd)
> +{
> +     return 0;
> +}
> +
> +#endif /* CONFIG_ZONE_DEVICE && CONFIG_ARCH_ENABLE_THP_MIGRATION */
> +
>  static inline int non_swap_entry(swp_entry_t entry)
>  {
>       return swp_type(entry) >= MAX_SWAPFILES;
> diff --git a/lib/test_hmm.c b/lib/test_hmm.c
> index 761725bc713c..297f1e034045 100644
> --- a/lib/test_hmm.c
> +++ b/lib/test_hmm.c
> @@ -1408,7 +1408,7 @@ static vm_fault_t dmirror_devmem_fault(struct vm_fault *vmf)
>        * the mirror but here we use it to hold the page for the simulated
>        * device memory and that page holds the pointer to the mirror.
>        */
> -     rpage = vmf->page->zone_device_data;
> +     rpage = folio_page(page_folio(vmf->page), 0)->zone_device_data;
>       dmirror = rpage->zone_device_data;
>  
>       /* FIXME demonstrate how we can adjust migrate range */
> diff --git a/mm/huge_memory.c b/mm/huge_memory.c
> index 9c38a95e9f09..2495e3fdbfae 100644
> --- a/mm/huge_memory.c
> +++ b/mm/huge_memory.c
> @@ -1711,8 +1711,11 @@ int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm,
>       if (unlikely(is_swap_pmd(pmd))) {
>               swp_entry_t entry = pmd_to_swp_entry(pmd);
>  
> -             VM_BUG_ON(!is_pmd_migration_entry(pmd));
> -             if (!is_readable_migration_entry(entry)) {
> +             VM_WARN_ON(!is_pmd_migration_entry(pmd) &&
> +                             !is_pmd_device_private_entry(pmd));
> +
> +             if (is_migration_entry(entry) &&
> +                     is_writable_migration_entry(entry)) {
>                       entry = make_readable_migration_entry(
>                                                       swp_offset(entry));
>                       pmd = swp_entry_to_pmd(entry);
> @@ -1722,6 +1725,32 @@ int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm,
>                               pmd = pmd_swp_mkuffd_wp(pmd);
>                       set_pmd_at(src_mm, addr, src_pmd, pmd);
>               }
> +
> +             if (is_device_private_entry(entry)) {
> +                     if (is_writable_device_private_entry(entry)) {
> +                             entry = make_readable_device_private_entry(
> +                                     swp_offset(entry));
> +                             pmd = swp_entry_to_pmd(entry);
> +
> +                             if (pmd_swp_soft_dirty(*src_pmd))
> +                                     pmd = pmd_swp_mksoft_dirty(pmd);
> +                             if (pmd_swp_uffd_wp(*src_pmd))
> +                                     pmd = pmd_swp_mkuffd_wp(pmd);
> +                             set_pmd_at(src_mm, addr, src_pmd, pmd);
> +                     }
> +
> +                     src_folio = pfn_swap_entry_folio(entry);
> +                     VM_WARN_ON(!folio_test_large(src_folio));
> +
> +                     folio_get(src_folio);
> +                     /*
> +                      * folio_try_dup_anon_rmap_pmd does not fail for
> +                      * device private entries.
> +                      */
> +                     VM_WARN_ON(folio_try_dup_anon_rmap_pmd(src_folio,
> +                                       &src_folio->page, dst_vma, src_vma));

VM_WARN_ON() compiles out in non-debug builds (without CONFIG_DEBUG_VM the
condition is never evaluated [1]), so the folio_try_dup_anon_rmap_pmd() call
above is silently skipped. I hit this running the fork selftest I shared
earlier with a non-debug build.
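
Roughly, the problem looks like this (paraphrasing the mmdebug.h
definitions from [1]; the 'ret' local below is only illustrative, not a
concrete suggestion for the final patch):

  /*
   * Approximate shape of the macro in include/linux/mmdebug.h [1]:
   * with CONFIG_DEBUG_VM the condition is evaluated and warned on,
   * without it the argument is only type-checked and never executed.
   */
  #ifdef CONFIG_DEBUG_VM
  #define VM_WARN_ON(cond)	(void)WARN_ON(cond)
  #else
  #define VM_WARN_ON(cond)	BUILD_BUG_ON_INVALID(cond) /* not evaluated */
  #endif

  /*
   * So the dup has to run unconditionally and only its result should be
   * fed to the assertion, e.g. something along these lines:
   */
  int ret = folio_try_dup_anon_rmap_pmd(src_folio, &src_folio->page,
					dst_vma, src_vma);
  /* Expected to never fail for device private entries. */
  VM_WARN_ON(ret);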

Matt 

[1] https://elixir.bootlin.com/linux/v6.16.3/source/include/linux/mmdebug.h#L112

> +             }
> +
>               add_mm_counter(dst_mm, MM_ANONPAGES, HPAGE_PMD_NR);
>               mm_inc_nr_ptes(dst_mm);
>               pgtable_trans_huge_deposit(dst_mm, dst_pmd, pgtable);
> @@ -2219,15 +2248,22 @@ int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
>                       folio_remove_rmap_pmd(folio, page, vma);
>                       WARN_ON_ONCE(folio_mapcount(folio) < 0);
>                       VM_BUG_ON_PAGE(!PageHead(page), page);
> -             } else if (thp_migration_supported()) {
> +             } else if (is_pmd_migration_entry(orig_pmd) ||
> +                             is_pmd_device_private_entry(orig_pmd)) {
>                       swp_entry_t entry;
>  
> -                     VM_BUG_ON(!is_pmd_migration_entry(orig_pmd));
>                       entry = pmd_to_swp_entry(orig_pmd);
>                       folio = pfn_swap_entry_folio(entry);
>                       flush_needed = 0;
> -             } else
> -                     WARN_ONCE(1, "Non present huge pmd without pmd migration enabled!");
> +
> +                     if (!thp_migration_supported())
> +                             WARN_ONCE(1, "Non present huge pmd without pmd migration enabled!");
> +
> +                     if (is_pmd_device_private_entry(orig_pmd)) {
> +                             folio_remove_rmap_pmd(folio, &folio->page, vma);
> +                             WARN_ON_ONCE(folio_mapcount(folio) < 0);
> +                     }
> +             }
>  
>               if (folio_test_anon(folio)) {
>                       zap_deposited_table(tlb->mm, pmd);
> @@ -2247,6 +2283,15 @@ int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
>                               folio_mark_accessed(folio);
>               }
>  
> +             /*
> +              * Do a folio put on zone device private pages after
> +              * changes to mm_counter, because the folio_put() will
> +              * clean folio->mapping and the folio_test_anon() check
> +              * will not be usable.
> +              */
> +             if (folio_is_device_private(folio))
> +                     folio_put(folio);
> +
>               spin_unlock(ptl);
>               if (flush_needed)
>                       tlb_remove_page_size(tlb, &folio->page, HPAGE_PMD_SIZE);
> @@ -2375,7 +2420,8 @@ int change_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
>               struct folio *folio = pfn_swap_entry_folio(entry);
>               pmd_t newpmd;
>  
> -             VM_BUG_ON(!is_pmd_migration_entry(*pmd));
> +             VM_WARN_ON(!is_pmd_migration_entry(*pmd) &&
> +                        !folio_is_device_private(folio));
>               if (is_writable_migration_entry(entry)) {
>                       /*
>                        * A protection check is difficult so
> @@ -2388,6 +2434,10 @@ int change_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
>                       newpmd = swp_entry_to_pmd(entry);
>                       if (pmd_swp_soft_dirty(*pmd))
>                               newpmd = pmd_swp_mksoft_dirty(newpmd);
> +             } else if (is_writable_device_private_entry(entry)) {
> +                     entry = make_readable_device_private_entry(
> +                                                     swp_offset(entry));
> +                     newpmd = swp_entry_to_pmd(entry);
>               } else {
>                       newpmd = *pmd;
>               }
> @@ -2842,16 +2892,19 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd,
>       struct page *page;
>       pgtable_t pgtable;
>       pmd_t old_pmd, _pmd;
> -     bool young, write, soft_dirty, pmd_migration = false, uffd_wp = false;
> -     bool anon_exclusive = false, dirty = false;
> +     bool young, write, soft_dirty, uffd_wp = false;
> +     bool anon_exclusive = false, dirty = false, present = false;
>       unsigned long addr;
>       pte_t *pte;
>       int i;
> +     swp_entry_t swp_entry;
>  
>       VM_BUG_ON(haddr & ~HPAGE_PMD_MASK);
>       VM_BUG_ON_VMA(vma->vm_start > haddr, vma);
>       VM_BUG_ON_VMA(vma->vm_end < haddr + HPAGE_PMD_SIZE, vma);
> -     VM_BUG_ON(!is_pmd_migration_entry(*pmd) && !pmd_trans_huge(*pmd));
> +
> +     VM_WARN_ON(!is_pmd_migration_entry(*pmd) && !pmd_trans_huge(*pmd)
> +                     && !(is_pmd_device_private_entry(*pmd)));
>  
>       count_vm_event(THP_SPLIT_PMD);
>  
> @@ -2899,18 +2952,45 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd,
>               return __split_huge_zero_page_pmd(vma, haddr, pmd);
>       }
>  
> -     pmd_migration = is_pmd_migration_entry(*pmd);
> -     if (unlikely(pmd_migration)) {
> -             swp_entry_t entry;
>  
> +     present = pmd_present(*pmd);
> +     if (unlikely(!present)) {
> +             swp_entry = pmd_to_swp_entry(*pmd);
>               old_pmd = *pmd;
> -             entry = pmd_to_swp_entry(old_pmd);
> -             page = pfn_swap_entry_to_page(entry);
> -             write = is_writable_migration_entry(entry);
> -             if (PageAnon(page))
> -                     anon_exclusive = 
> is_readable_exclusive_migration_entry(entry);
> -             young = is_migration_entry_young(entry);
> -             dirty = is_migration_entry_dirty(entry);
> +
> +             folio = pfn_swap_entry_folio(swp_entry);
> +             VM_WARN_ON(!is_migration_entry(swp_entry) &&
> +                             !is_device_private_entry(swp_entry));
> +             page = pfn_swap_entry_to_page(swp_entry);
> +
> +             if (is_pmd_migration_entry(old_pmd)) {
> +                     write = is_writable_migration_entry(swp_entry);
> +                     if (PageAnon(page))
> +                             anon_exclusive =
> +                                     is_readable_exclusive_migration_entry(
> +                                                             swp_entry);
> +                     young = is_migration_entry_young(swp_entry);
> +                     dirty = is_migration_entry_dirty(swp_entry);
> +             } else if (is_pmd_device_private_entry(old_pmd)) {
> +                     write = is_writable_device_private_entry(swp_entry);
> +                     anon_exclusive = PageAnonExclusive(page);
> +                     if (freeze && anon_exclusive &&
> +                         folio_try_share_anon_rmap_pmd(folio, page))
> +                             freeze = false;
> +                     if (!freeze) {
> +                             rmap_t rmap_flags = RMAP_NONE;
> +
> +                             if (anon_exclusive)
> +                                     rmap_flags |= RMAP_EXCLUSIVE;
> +
> +                             folio_ref_add(folio, HPAGE_PMD_NR - 1);
> +                             if (anon_exclusive)
> +                                     rmap_flags |= RMAP_EXCLUSIVE;
> +                             folio_add_anon_rmap_ptes(folio, page, HPAGE_PMD_NR,
> +                                              vma, haddr, rmap_flags);
> +                     }
> +             }
> +
>               soft_dirty = pmd_swp_soft_dirty(old_pmd);
>               uffd_wp = pmd_swp_uffd_wp(old_pmd);
>       } else {
> @@ -2996,30 +3076,49 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd,
>        * Note that NUMA hinting access restrictions are not transferred to
>        * avoid any possibility of altering permissions across VMAs.
>        */
> -     if (freeze || pmd_migration) {
> +     if (freeze || !present) {
>               for (i = 0, addr = haddr; i < HPAGE_PMD_NR; i++, addr += PAGE_SIZE) {
>                       pte_t entry;
> -                     swp_entry_t swp_entry;
> -
> -                     if (write)
> -                             swp_entry = make_writable_migration_entry(
> -                                                     page_to_pfn(page + i));
> -                     else if (anon_exclusive)
> -                             swp_entry = make_readable_exclusive_migration_entry(
> -                                                     page_to_pfn(page + i));
> -                     else
> -                             swp_entry = make_readable_migration_entry(
> -                                                     page_to_pfn(page + i));
> -                     if (young)
> -                             swp_entry = make_migration_entry_young(swp_entry);
> -                     if (dirty)
> -                             swp_entry = make_migration_entry_dirty(swp_entry);
> -                     entry = swp_entry_to_pte(swp_entry);
> -                     if (soft_dirty)
> -                             entry = pte_swp_mksoft_dirty(entry);
> -                     if (uffd_wp)
> -                             entry = pte_swp_mkuffd_wp(entry);
> -
> +                     if (freeze || is_migration_entry(swp_entry)) {
> +                             if (write)
> +                                     swp_entry = make_writable_migration_entry(
> +                                                             page_to_pfn(page + i));
> +                             else if (anon_exclusive)
> +                                     swp_entry = make_readable_exclusive_migration_entry(
> +                                                             page_to_pfn(page + i));
> +                             else
> +                                     swp_entry = make_readable_migration_entry(
> +                                                             page_to_pfn(page + i));
> +                             if (young)
> +                                     swp_entry = make_migration_entry_young(swp_entry);
> +                             if (dirty)
> +                                     swp_entry = make_migration_entry_dirty(swp_entry);
> +                             entry = swp_entry_to_pte(swp_entry);
> +                             if (soft_dirty)
> +                                     entry = pte_swp_mksoft_dirty(entry);
> +                             if (uffd_wp)
> +                                     entry = pte_swp_mkuffd_wp(entry);
> +                     } else {
> +                             /*
> +                              * anon_exclusive was already propagated to the relevant
> +                              * pages corresponding to the pte entries when freeze
> +                              * is false.
> +                              */
> +                             if (write)
> +                                     swp_entry = make_writable_device_private_entry(
> +                                                             page_to_pfn(page + i));
> +                             else
> +                                     swp_entry = make_readable_device_private_entry(
> +                                                             page_to_pfn(page + i));
> +                             /*
> +                              * Young and dirty bits are not progated via swp_entry
> +                              */
> +                             entry = swp_entry_to_pte(swp_entry);
> +                             if (soft_dirty)
> +                                     entry = pte_swp_mksoft_dirty(entry);
> +                             if (uffd_wp)
> +                                     entry = pte_swp_mkuffd_wp(entry);
> +                     }
>                       VM_WARN_ON(!pte_none(ptep_get(pte + i)));
>                       set_pte_at(mm, addr, pte + i, entry);
>               }
> @@ -3046,7 +3145,7 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd,
>       }
>       pte_unmap(pte);
>  
> -     if (!pmd_migration)
> +     if (present)
>               folio_remove_rmap_pmd(folio, page, vma);
>       if (freeze)
>               put_page(page);
> @@ -3058,8 +3157,10 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd,
>  void split_huge_pmd_locked(struct vm_area_struct *vma, unsigned long address,
>                          pmd_t *pmd, bool freeze)
>  {
> +
>       VM_WARN_ON_ONCE(!IS_ALIGNED(address, HPAGE_PMD_SIZE));
> -     if (pmd_trans_huge(*pmd) || is_pmd_migration_entry(*pmd))
> +     if (pmd_trans_huge(*pmd) || is_pmd_migration_entry(*pmd) ||
> +                     (is_pmd_device_private_entry(*pmd)))
>               __split_huge_pmd_locked(vma, pmd, address, freeze);
>  }
>  
> @@ -3238,6 +3339,9 @@ static void lru_add_split_folio(struct folio *folio, struct folio *new_folio,
>       VM_BUG_ON_FOLIO(folio_test_lru(new_folio), folio);
>       lockdep_assert_held(&lruvec->lru_lock);
>  
> +     if (folio_is_device_private(folio))
> +             return;
> +
>       if (list) {
>               /* page reclaim is reclaiming a huge page */
>               VM_WARN_ON(folio_test_lru(folio));
> @@ -3252,6 +3356,7 @@ static void lru_add_split_folio(struct folio *folio, struct folio *new_folio,
>                       list_add_tail(&new_folio->lru, &folio->lru);
>               folio_set_lru(new_folio);
>       }
> +
>  }
>  
>  /* Racy check whether the huge page can be split */
> @@ -3727,7 +3832,7 @@ static int __folio_split(struct folio *folio, unsigned int new_order,
>  
>       /* Prevent deferred_split_scan() touching ->_refcount */
>       spin_lock(&ds_queue->split_queue_lock);
> -     if (folio_ref_freeze(folio, 1 + extra_pins)) {
> +     if (folio_ref_freeze(folio, 1 + folio_expected_ref_count(folio))) {
>               struct address_space *swap_cache = NULL;
>               struct lruvec *lruvec;
>               int expected_refs;
> @@ -3858,8 +3963,9 @@ static int __folio_split(struct folio *folio, unsigned int new_order,
>       if (nr_shmem_dropped)
>               shmem_uncharge(mapping->host, nr_shmem_dropped);
>  
> -     if (!ret && is_anon)
> +     if (!ret && is_anon && !folio_is_device_private(folio))
>               remap_flags = RMP_USE_SHARED_ZEROPAGE;
> +
>       remap_page(folio, 1 << order, remap_flags);
>  
>       /*
> @@ -4603,7 +4709,10 @@ int set_pmd_migration_entry(struct page_vma_mapped_walk *pvmw,
>               return 0;
>  
>       flush_cache_range(vma, address, address + HPAGE_PMD_SIZE);
> -     pmdval = pmdp_invalidate(vma, address, pvmw->pmd);
> +     if (unlikely(is_pmd_device_private_entry(*pvmw->pmd)))
> +             pmdval = pmdp_huge_clear_flush(vma, address, pvmw->pmd);
> +     else
> +             pmdval = pmdp_invalidate(vma, address, pvmw->pmd);
>  
>       /* See folio_try_share_anon_rmap_pmd(): invalidate PMD first. */
>       anon_exclusive = folio_test_anon(folio) && PageAnonExclusive(page);
> @@ -4653,6 +4762,17 @@ void remove_migration_pmd(struct page_vma_mapped_walk *pvmw, struct page *new)
>       entry = pmd_to_swp_entry(*pvmw->pmd);
>       folio_get(folio);
>       pmde = folio_mk_pmd(folio, READ_ONCE(vma->vm_page_prot));
> +
> +     if (folio_is_device_private(folio)) {
> +             if (pmd_write(pmde))
> +                     entry = make_writable_device_private_entry(
> +                                                     page_to_pfn(new));
> +             else
> +                     entry = make_readable_device_private_entry(
> +                                                     page_to_pfn(new));
> +             pmde = swp_entry_to_pmd(entry);
> +     }
> +
>       if (pmd_swp_soft_dirty(*pvmw->pmd))
>               pmde = pmd_mksoft_dirty(pmde);
>       if (is_writable_migration_entry(entry))
> diff --git a/mm/migrate_device.c b/mm/migrate_device.c
> index e05e14d6eacd..0ed337f94fcd 100644
> --- a/mm/migrate_device.c
> +++ b/mm/migrate_device.c
> @@ -136,6 +136,8 @@ static int migrate_vma_collect_pmd(pmd_t *pmdp,
>                        * page table entry. Other special swap entries are not
>                        * migratable, and we ignore regular swapped page.
>                        */
> +                     struct folio *folio;
> +
>                       entry = pte_to_swp_entry(pte);
>                       if (!is_device_private_entry(entry))
>                               goto next;
> @@ -147,6 +149,51 @@ static int migrate_vma_collect_pmd(pmd_t *pmdp,
>                           pgmap->owner != migrate->pgmap_owner)
>                               goto next;
>  
> +                     folio = page_folio(page);
> +                     if (folio_test_large(folio)) {
> +                             struct folio *new_folio;
> +                             struct folio *new_fault_folio = NULL;
> +
> +                             /*
> +                              * The reason for finding pmd present with a
> +                              * device private pte and a large folio for the
> +                              * pte is partial unmaps. Split the folio now
> +                              * for the migration to be handled correctly
> +                              */
> +                             pte_unmap_unlock(ptep, ptl);
> +
> +                             folio_get(folio);
> +                             if (folio != fault_folio)
> +                                     folio_lock(folio);
> +                             if (split_folio(folio)) {
> +                                     if (folio != fault_folio)
> +                                             folio_unlock(folio);
> +                                     ptep = pte_offset_map_lock(mm, pmdp, addr, &ptl);
> +                                     goto next;
> +                             }
> +
> +                             new_folio = page_folio(page);
> +                             if (fault_folio)
> +                                     new_fault_folio = page_folio(migrate->fault_page);
> +
> +                             /*
> +                              * Ensure the lock is held on the correct
> +                              * folio after the split
> +                              */
> +                             if (!new_fault_folio) {
> +                                     folio_unlock(folio);
> +                                     folio_put(folio);
> +                             } else if (folio != new_fault_folio) {
> +                                     folio_get(new_fault_folio);
> +                                     folio_lock(new_fault_folio);
> +                                     folio_unlock(folio);
> +                                     folio_put(folio);
> +                             }
> +
> +                             addr = start;
> +                             goto again;
> +                     }
> +
>                       mpfn = migrate_pfn(page_to_pfn(page)) |
>                                       MIGRATE_PFN_MIGRATE;
>                       if (is_writable_device_private_entry(entry))
> diff --git a/mm/page_vma_mapped.c b/mm/page_vma_mapped.c
> index e981a1a292d2..246e6c211f34 100644
> --- a/mm/page_vma_mapped.c
> +++ b/mm/page_vma_mapped.c
> @@ -250,12 +250,11 @@ bool page_vma_mapped_walk(struct page_vma_mapped_walk *pvmw)
>                       pvmw->ptl = pmd_lock(mm, pvmw->pmd);
>                       pmde = *pvmw->pmd;
>                       if (!pmd_present(pmde)) {
> -                             swp_entry_t entry;
> +                             swp_entry_t entry = pmd_to_swp_entry(pmde);
>  
>                               if (!thp_migration_supported() ||
>                                   !(pvmw->flags & PVMW_MIGRATION))
>                                       return not_found(pvmw);
> -                             entry = pmd_to_swp_entry(pmde);
>                               if (!is_migration_entry(entry) ||
>                                   !check_pmd(swp_offset_pfn(entry), pvmw))
>                                       return not_found(pvmw);
> @@ -277,6 +276,16 @@ bool page_vma_mapped_walk(struct page_vma_mapped_walk *pvmw)
>                        * cannot return prematurely, while zap_huge_pmd() has
>                        * cleared *pmd but not decremented compound_mapcount().
>                        */
> +                     swp_entry_t entry;
> +
> +                     entry = pmd_to_swp_entry(pmde);
> +
> +                     if (is_device_private_entry(entry) &&
> +                             (pvmw->flags & PVMW_THP_DEVICE_PRIVATE)) {
> +                             pvmw->ptl = pmd_lock(mm, pvmw->pmd);
> +                             return true;
> +                     }
> +
>                       if ((pvmw->flags & PVMW_SYNC) &&
>                           thp_vma_suitable_order(vma, pvmw->address,
>                                                  PMD_ORDER) &&
> diff --git a/mm/pgtable-generic.c b/mm/pgtable-generic.c
> index 567e2d084071..604e8206a2ec 100644
> --- a/mm/pgtable-generic.c
> +++ b/mm/pgtable-generic.c
> @@ -292,6 +292,12 @@ pte_t *___pte_offset_map(pmd_t *pmd, unsigned long addr, pmd_t *pmdvalp)
>               *pmdvalp = pmdval;
>       if (unlikely(pmd_none(pmdval) || is_pmd_migration_entry(pmdval)))
>               goto nomap;
> +     if (is_swap_pmd(pmdval)) {
> +             swp_entry_t entry = pmd_to_swp_entry(pmdval);
> +
> +             if (is_device_private_entry(entry))
> +                     goto nomap;
> +     }
>       if (unlikely(pmd_trans_huge(pmdval)))
>               goto nomap;
>       if (unlikely(pmd_bad(pmdval))) {
> diff --git a/mm/rmap.c b/mm/rmap.c
> index b5837075b6e0..f40e45564295 100644
> --- a/mm/rmap.c
> +++ b/mm/rmap.c
> @@ -2285,7 +2285,8 @@ static bool try_to_migrate_one(struct folio *folio, struct vm_area_struct *vma,
>                    unsigned long address, void *arg)
>  {
>       struct mm_struct *mm = vma->vm_mm;
> -     DEFINE_FOLIO_VMA_WALK(pvmw, folio, vma, address, 0);
> +     DEFINE_FOLIO_VMA_WALK(pvmw, folio, vma, address,
> +                             PVMW_THP_DEVICE_PRIVATE);
>       bool anon_exclusive, writable, ret = true;
>       pte_t pteval;
>       struct page *subpage;
> @@ -2330,6 +2331,10 @@ static bool try_to_migrate_one(struct folio *folio, struct vm_area_struct *vma,
>       while (page_vma_mapped_walk(&pvmw)) {
>               /* PMD-mapped THP migration entry */
>               if (!pvmw.pte) {
> +#ifdef CONFIG_ARCH_ENABLE_THP_MIGRATION
> +                     unsigned long pfn;
> +#endif
> +
>                       if (flags & TTU_SPLIT_HUGE_PMD) {
>                               split_huge_pmd_locked(vma, pvmw.address,
>                                                     pvmw.pmd, true);
> @@ -2338,8 +2343,21 @@ static bool try_to_migrate_one(struct folio *folio, struct vm_area_struct *vma,
>                               break;
>                       }
>  #ifdef CONFIG_ARCH_ENABLE_THP_MIGRATION
> -                     subpage = folio_page(folio,
> -                             pmd_pfn(*pvmw.pmd) - folio_pfn(folio));
> +                     /*
> +                      * Zone device private folios do not work well with
> +                      * pmd_pfn() on some architectures due to pte
> +                      * inversion.
> +                      */
> +                     if (is_pmd_device_private_entry(*pvmw.pmd)) {
> +                             swp_entry_t entry = pmd_to_swp_entry(*pvmw.pmd);
> +
> +                             pfn = swp_offset_pfn(entry);
> +                     } else {
> +                             pfn = pmd_pfn(*pvmw.pmd);
> +                     }
> +
> +                     subpage = folio_page(folio, pfn - folio_pfn(folio));
> +
>                       VM_BUG_ON_FOLIO(folio_test_hugetlb(folio) ||
>                                       !folio_test_pmd_mappable(folio), folio);
>  
> -- 
> 2.50.1
> 
