On 28.08.25 22:05, Matthew Brost wrote:
On Tue, Aug 12, 2025 at 12:40:27PM +1000, Balbir Singh wrote:
Make the THP handling code in the mm subsystem aware of zone device
pages. Although the splitting code is written to be generic, it is
designed to work only for THP page sizes corresponding to HPAGE_PMD_NR.

Modify page_vma_mapped_walk() to return true when a zone device huge
entry is present, so that try_to_migrate() and other migration paths
can process the entry appropriately. page_vma_mapped_walk() returns
true for zone device private large folios only when
PVMW_THP_DEVICE_PRIVATE is passed; this keeps callers that do not deal
with zone device private pages from having to grow awareness of them.
The key callback that needs this flag is try_to_migrate_one(). The
other callbacks (page idle, DAMON) use the walk to set young/dirty
bits, which is not significant for pmd-level bit harvesting.
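As an illustration, the opt-in from the try_to_migrate_one() side could
look roughly like the sketch below (not this patch's exact hunk; the flag
combination is an assumption based on the existing walker API):

        /*
         * Sketch only: try_to_migrate_one() asks the walker to report
         * device-private PMD entries; other walkers keep their usual
         * flags and never see them.
         */
        DEFINE_FOLIO_VMA_WALK(pvmw, folio, vma, address,
                        PVMW_SYNC | PVMW_MIGRATION |
                        (folio_is_device_private(folio) ?
                         PVMW_THP_DEVICE_PRIVATE : 0));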

pmd_pfn() does not work well with zone device entries; use
pfn_pmd_entry_to_swap() instead when checking and comparing zone device
entries.
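The idea is roughly the following (a sketch built on the existing
swp_offset_pfn() helper; pfn_pmd_entry_to_swap() in this series is
assumed to wrap the same conversion):

        unsigned long pfn;

        /* A device private PMD is a swap entry, so pmd_pfn() must not be used. */
        if (is_pmd_device_private_entry(pmdval))
                pfn = swp_offset_pfn(pmd_to_swp_entry(pmdval));
        else
                pfn = pmd_pfn(pmdval);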

Support partial unmapping of zone device private entries, which can
happen via munmap(): munmap() splits the device private PMD entry, but
the corresponding folio is not split. Deferred split does not work for
zone device private folios because they need to be split during fault
handling. Teach migrate_vma_collect_pmd() to handle this case by
splitting partially unmapped device private folios.
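Something along these lines (a simplified sketch; the locking, error
handling and the "next" skip label are assumptions, not the patch's
exact code):

        struct folio *folio = pfn_swap_entry_folio(pte_to_swp_entry(pte));

        if (folio_is_device_private(folio) && folio_test_large(folio)) {
                /* Partially unmapped device private THP: split it first. */
                folio_get(folio);
                if (!folio_trylock(folio)) {
                        folio_put(folio);
                        goto next;              /* assumed skip label */
                }
                ret = split_folio(folio);
                folio_unlock(folio);
                folio_put(folio);
                if (ret)
                        goto next;
        }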

Cc: Andrew Morton <a...@linux-foundation.org>
Cc: David Hildenbrand <da...@redhat.com>
Cc: Zi Yan <z...@nvidia.com>
Cc: Joshua Hahn <joshua.hah...@gmail.com>
Cc: Rakie Kim <rakie....@sk.com>
Cc: Byungchul Park <byungc...@sk.com>
Cc: Gregory Price <gou...@gourry.net>
Cc: Ying Huang <ying.hu...@linux.alibaba.com>
Cc: Alistair Popple <apop...@nvidia.com>
Cc: Oscar Salvador <osalva...@suse.de>
Cc: Lorenzo Stoakes <lorenzo.stoa...@oracle.com>
Cc: Baolin Wang <baolin.w...@linux.alibaba.com>
Cc: "Liam R. Howlett" <liam.howl...@oracle.com>
Cc: Nico Pache <npa...@redhat.com>
Cc: Ryan Roberts <ryan.robe...@arm.com>
Cc: Dev Jain <dev.j...@arm.com>
Cc: Barry Song <bao...@kernel.org>
Cc: Lyude Paul <ly...@redhat.com>
Cc: Danilo Krummrich <d...@kernel.org>
Cc: David Airlie <airl...@gmail.com>
Cc: Simona Vetter <sim...@ffwll.ch>
Cc: Ralph Campbell <rcampb...@nvidia.com>
Cc: Mika Penttilä <mpent...@redhat.com>
Cc: Matthew Brost <matthew.br...@intel.com>
Cc: Francois Dugast <francois.dug...@intel.com>

Signed-off-by: Matthew Brost <matthew.br...@intel.com>
Signed-off-by: Balbir Singh <balb...@nvidia.com>
---
  include/linux/rmap.h    |   2 +
  include/linux/swapops.h |  17 ++++
  lib/test_hmm.c          |   2 +-
  mm/huge_memory.c        | 214 +++++++++++++++++++++++++++++++---------
  mm/migrate_device.c     |  47 +++++++++
  mm/page_vma_mapped.c    |  13 ++-
  mm/pgtable-generic.c    |   6 ++
  mm/rmap.c               |  24 ++++-
  8 files changed, 272 insertions(+), 53 deletions(-)

diff --git a/include/linux/rmap.h b/include/linux/rmap.h
index 6cd020eea37a..dfb7aae3d77b 100644
--- a/include/linux/rmap.h
+++ b/include/linux/rmap.h
@@ -927,6 +927,8 @@ struct page *make_device_exclusive(struct mm_struct *mm, unsigned long addr,
  #define PVMW_SYNC             (1 << 0)
  /* Look for migration entries rather than present PTEs */
  #define PVMW_MIGRATION                (1 << 1)
+/* Look for device private THP entries */
+#define PVMW_THP_DEVICE_PRIVATE        (1 << 2)

  struct page_vma_mapped_walk {
        unsigned long pfn;
diff --git a/include/linux/swapops.h b/include/linux/swapops.h
index 64ea151a7ae3..2641c01bd5d2 100644
--- a/include/linux/swapops.h
+++ b/include/linux/swapops.h
@@ -563,6 +563,7 @@ static inline int is_pmd_migration_entry(pmd_t pmd)
  {
        return is_swap_pmd(pmd) && is_migration_entry(pmd_to_swp_entry(pmd));
  }
+
  #else  /* CONFIG_ARCH_ENABLE_THP_MIGRATION */
  static inline int set_pmd_migration_entry(struct page_vma_mapped_walk *pvmw,
                struct page *page)
@@ -594,6 +595,22 @@ static inline int is_pmd_migration_entry(pmd_t pmd)
  }
  #endif  /* CONFIG_ARCH_ENABLE_THP_MIGRATION */
+#if defined(CONFIG_ZONE_DEVICE) && defined(CONFIG_ARCH_ENABLE_THP_MIGRATION)
+
+static inline int is_pmd_device_private_entry(pmd_t pmd)
+{
+       return is_swap_pmd(pmd) &&
+               is_device_private_entry(pmd_to_swp_entry(pmd));
+}
+
+#else /* CONFIG_ZONE_DEVICE && CONFIG_ARCH_ENABLE_THP_MIGRATION */
+
+static inline int is_pmd_device_private_entry(pmd_t pmd)
+{
+       return 0;
+}
+
+#endif /* CONFIG_ZONE_DEVICE && CONFIG_ARCH_ENABLE_THP_MIGRATION */
+
  static inline int non_swap_entry(swp_entry_t entry)
  {
        return swp_type(entry) >= MAX_SWAPFILES;
diff --git a/lib/test_hmm.c b/lib/test_hmm.c
index 761725bc713c..297f1e034045 100644
--- a/lib/test_hmm.c
+++ b/lib/test_hmm.c
@@ -1408,7 +1408,7 @@ static vm_fault_t dmirror_devmem_fault(struct vm_fault *vmf)
         * the mirror but here we use it to hold the page for the simulated
         * device memory and that page holds the pointer to the mirror.
         */
-       rpage = vmf->page->zone_device_data;
+       rpage = folio_page(page_folio(vmf->page), 0)->zone_device_data;
        dmirror = rpage->zone_device_data;

         /* FIXME demonstrate how we can adjust migrate range */
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 9c38a95e9f09..2495e3fdbfae 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -1711,8 +1711,11 @@ int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm,
        if (unlikely(is_swap_pmd(pmd))) {
                swp_entry_t entry = pmd_to_swp_entry(pmd);

-               VM_BUG_ON(!is_pmd_migration_entry(pmd));
-               if (!is_readable_migration_entry(entry)) {
+               VM_WARN_ON(!is_pmd_migration_entry(pmd) &&
+                               !is_pmd_device_private_entry(pmd));
+
+               if (is_migration_entry(entry) &&
+                       is_writable_migration_entry(entry)) {
                        entry = make_readable_migration_entry(
                                                        swp_offset(entry));
                        pmd = swp_entry_to_pmd(entry);
@@ -1722,6 +1725,32 @@ int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm,
                                pmd = pmd_swp_mkuffd_wp(pmd);
                        set_pmd_at(src_mm, addr, src_pmd, pmd);
                }
+
+               if (is_device_private_entry(entry)) {
+                       if (is_writable_device_private_entry(entry)) {
+                               entry = make_readable_device_private_entry(
+                                       swp_offset(entry));
+                               pmd = swp_entry_to_pmd(entry);
+
+                               if (pmd_swp_soft_dirty(*src_pmd))
+                                       pmd = pmd_swp_mksoft_dirty(pmd);
+                               if (pmd_swp_uffd_wp(*src_pmd))
+                                       pmd = pmd_swp_mkuffd_wp(pmd);
+                               set_pmd_at(src_mm, addr, src_pmd, pmd);
+                       }
+
+                       src_folio = pfn_swap_entry_folio(entry);
+                       VM_WARN_ON(!folio_test_large(src_folio));
+
+                       folio_get(src_folio);
+                       /*
+                        * folio_try_dup_anon_rmap_pmd does not fail for
+                        * device private entries.
+                        */
+                       VM_WARN_ON(folio_try_dup_anon_rmap_pmd(src_folio,
+                                         &src_folio->page, dst_vma, src_vma));

VM_WARN_ON compiles out in non-debug builds. I hit this running the
fork selftest I shared with a non-debug build.
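
To illustrate: with CONFIG_DEBUG_VM=n, VM_WARN_ON() does not evaluate its
argument, so the rmap dup call above disappears from the build. Keeping
the call outside the assertion avoids that (a sketch, not a tested fix):

        int ret;

        ret = folio_try_dup_anon_rmap_pmd(src_folio, &src_folio->page,
                                          dst_vma, src_vma);
        /* Not expected to fail for device private folios. */
        VM_WARN_ON(ret);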


folio_try_dup_anon_rmap_pmd() will never fail for a folio where folio_is_device_private(folio) holds -- unless something is so deeply messed up that we wouldn't identify the folio as being device-private.

Can you elaborate, what were you able to trigger, and in what kind of environment?

--
Cheers

David / dhildenb
