On 9/8/25 14:14, Mika Penttilä wrote:
> Hi,
>
> On 9/8/25 03:04, Balbir Singh wrote:
>
>> Extend migrate_vma_collect_pmd() to handle partially mapped large
>> folios that require splitting before migration can proceed.
>>
>> During PTE walk in the collection phase, if a large folio is only
>> partially mapped in the migration range, it must be split to ensure
>> the folio is correctly migrated.
>>
>> Cc: Andrew Morton <a...@linux-foundation.org>
>> Cc: David Hildenbrand <da...@redhat.com>
>> Cc: Zi Yan <z...@nvidia.com>
>> Cc: Joshua Hahn <joshua.hah...@gmail.com>
>> Cc: Rakie Kim <rakie....@sk.com>
>> Cc: Byungchul Park <byungc...@sk.com>
>> Cc: Gregory Price <gou...@gourry.net>
>> Cc: Ying Huang <ying.hu...@linux.alibaba.com>
>> Cc: Alistair Popple <apop...@nvidia.com>
>> Cc: Oscar Salvador <osalva...@suse.de>
>> Cc: Lorenzo Stoakes <lorenzo.stoa...@oracle.com>
>> Cc: Baolin Wang <baolin.w...@linux.alibaba.com>
>> Cc: "Liam R. Howlett" <liam.howl...@oracle.com>
>> Cc: Nico Pache <npa...@redhat.com>
>> Cc: Ryan Roberts <ryan.robe...@arm.com>
>> Cc: Dev Jain <dev.j...@arm.com>
>> Cc: Barry Song <bao...@kernel.org>
>> Cc: Lyude Paul <ly...@redhat.com>
>> Cc: Danilo Krummrich <d...@kernel.org>
>> Cc: David Airlie <airl...@gmail.com>
>> Cc: Simona Vetter <sim...@ffwll.ch>
>> Cc: Ralph Campbell <rcampb...@nvidia.com>
>> Cc: Mika Penttilä <mpent...@redhat.com>
>> Cc: Matthew Brost <matthew.br...@intel.com>
>> Cc: Francois Dugast <francois.dug...@intel.com>
>>
>> Signed-off-by: Balbir Singh <balb...@nvidia.com>
>> ---
>>  mm/migrate_device.c | 94 +++++++++++++++++++++++++++++++++++++++++++++
>>  1 file changed, 94 insertions(+)
>>
>> diff --git a/mm/migrate_device.c b/mm/migrate_device.c
>> index abd9f6850db6..f45ef182287d 100644
>> --- a/mm/migrate_device.c
>> +++ b/mm/migrate_device.c
>> @@ -54,6 +54,53 @@ static int migrate_vma_collect_hole(unsigned long start,
>>  	return 0;
>>  }
>>
>> +/**
>> + * migrate_vma_split_folio() - Helper function to split a THP folio
>> + * @folio: the folio to split
>> + * @fault_page: struct page associated with the fault if any
>> + *
>> + * Returns 0 on success
>> + */
>> +static int migrate_vma_split_folio(struct folio *folio,
>> +				   struct page *fault_page)
>> +{
>> +	int ret;
>> +	struct folio *fault_folio = fault_page ? page_folio(fault_page) : NULL;
>> +	struct folio *new_fault_folio = NULL;
>> +
>> +	if (folio != fault_folio) {
>> +		folio_get(folio);
>> +		folio_lock(folio);
>> +	}
>> +
>> +	ret = split_folio(folio);
>> +	if (ret) {
>> +		if (folio != fault_folio) {
>> +			folio_unlock(folio);
>> +			folio_put(folio);
>> +		}
>> +		return ret;
>> +	}
>> +
>> +	new_fault_folio = fault_page ? page_folio(fault_page) : NULL;
>> +
>> +	/*
>> +	 * Ensure the lock is held on the correct
>> +	 * folio after the split
>> +	 */
>> +	if (!new_fault_folio) {
>> +		folio_unlock(folio);
>> +		folio_put(folio);
>> +	} else if (folio != new_fault_folio) {
>> +		folio_get(new_fault_folio);
>> +		folio_lock(new_fault_folio);
>> +		folio_unlock(folio);
>> +		folio_put(folio);
>> +	}
>> +
>> +	return 0;
>> +}
>> +
>>  static int migrate_vma_collect_pmd(pmd_t *pmdp,
>>  				   unsigned long start,
>>  				   unsigned long end,
>> @@ -136,6 +183,8 @@ static int migrate_vma_collect_pmd(pmd_t *pmdp,
>>  			 * page table entry. Other special swap entries are not
>>  			 * migratable, and we ignore regular swapped page.
>>  			 */
>> +			struct folio *folio;
>> +
>>  			entry = pte_to_swp_entry(pte);
>>  			if (!is_device_private_entry(entry))
>>  				goto next;
>> @@ -147,6 +196,29 @@ static int migrate_vma_collect_pmd(pmd_t *pmdp,
>>  			    pgmap->owner != migrate->pgmap_owner)
>>  				goto next;
>>
>> +			folio = page_folio(page);
>> +			if (folio_test_large(folio)) {
>> +				int ret;
>> +
>> +				/*
>> +				 * The reason for finding pmd present with a
>> +				 * large folio for the pte is partial unmaps.
>> +				 * Split the folio now for the migration to be
>> +				 * handled correctly
>> +				 */
>> +				pte_unmap_unlock(ptep, ptl);
>> +				ret = migrate_vma_split_folio(folio,
>> +							migrate->fault_page);
>> +
>> +				if (ret) {
>> +					ptep = pte_offset_map_lock(mm, pmdp, addr, &ptl);
>> +					goto next;
>> +				}
>> +
>> +				addr = start;
>> +				goto again;
>> +			}
>> +
>>  			mpfn = migrate_pfn(page_to_pfn(page)) |
>>  				MIGRATE_PFN_MIGRATE;
>>  			if (is_writable_device_private_entry(entry))
>> @@ -171,6 +243,28 @@ static int migrate_vma_collect_pmd(pmd_t *pmdp,
>>  			    pgmap->owner != migrate->pgmap_owner)
>>  				goto next;
>>  		}
>> +			folio = page_folio(page);
>> +			if (folio_test_large(folio)) {
>> +				int ret;
>> +
>> +				/*
>> +				 * The reason for finding pmd present with a
>> +				 * large folio for the pte is partial unmaps.
>> +				 * Split the folio now for the migration to be
>> +				 * handled correctly
>> +				 */
>
> This comment is still not changed; there are other reasons for pte-mapped
> large pages.
> Also, now all the mTHPs are split, which is a change of behavior for
> order < PMD_ORDER (currently they are ignored).
Oh! Sorry, I missed it. I am attaching the version with the comments removed.

On the behaviour change, I agree it is a change, but the split is required for
the migration to occur; a short user-space sketch of how such a partially
mapped folio can arise follows.
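For illustration only, a minimal and untested user-space sketch (not part of
the patch): a partial munmap of a THP-backed range splits the PMD mapping but
leaves the surviving half pte-mapped and still backed by the same large folio,
which is what the new folio_test_large() path in the collection walk sees. The
2MB size and the alignment trick assume PMD-sized THPs; whether a THP is
actually faulted in depends on the system THP settings.

#define _GNU_SOURCE
#include <stdint.h>
#include <string.h>
#include <sys/mman.h>
#include <unistd.h>

#define SZ_2M	(2UL << 20)

int main(void)
{
	/* Over-allocate so a 2MB-aligned, 2MB-sized chunk can be carved out. */
	char *raw = mmap(NULL, 2 * SZ_2M, PROT_READ | PROT_WRITE,
			 MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
	char *buf;

	if (raw == MAP_FAILED)
		return 1;
	buf = (char *)(((uintptr_t)raw + SZ_2M - 1) & ~(SZ_2M - 1));

	madvise(buf, SZ_2M, MADV_HUGEPAGE);
	memset(buf, 0xaa, SZ_2M);	/* fault in, ideally as one PMD-mapped THP */

	/*
	 * Unmap the first half: the PMD mapping is split, the second half
	 * stays pte-mapped, but the underlying folio may still be large.
	 * A driver migrating the remaining range to device memory then
	 * reaches the folio_test_large() case the patch handles.
	 */
	munmap(buf, SZ_2M / 2);

	pause();	/* keep the mapping alive while a driver migrates it */
	return 0;
}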
Updated patch below:

mm/migrate_device: handle partially mapped folios during collection

Extend migrate_vma_collect_pmd() to handle partially mapped large
folios that require splitting before migration can proceed.

During PTE walk in the collection phase, if a large folio is only
partially mapped in the migration range, it must be split to ensure
the folio is correctly migrated.

Cc: Andrew Morton <a...@linux-foundation.org>
Cc: David Hildenbrand <da...@redhat.com>
Cc: Zi Yan <z...@nvidia.com>
Cc: Joshua Hahn <joshua.hah...@gmail.com>
Cc: Rakie Kim <rakie....@sk.com>
Cc: Byungchul Park <byungc...@sk.com>
Cc: Gregory Price <gou...@gourry.net>
Cc: Ying Huang <ying.hu...@linux.alibaba.com>
Cc: Alistair Popple <apop...@nvidia.com>
Cc: Oscar Salvador <osalva...@suse.de>
Cc: Lorenzo Stoakes <lorenzo.stoa...@oracle.com>
Cc: Baolin Wang <baolin.w...@linux.alibaba.com>
Cc: "Liam R. Howlett" <liam.howl...@oracle.com>
Cc: Nico Pache <npa...@redhat.com>
Cc: Ryan Roberts <ryan.robe...@arm.com>
Cc: Dev Jain <dev.j...@arm.com>
Cc: Barry Song <bao...@kernel.org>
Cc: Lyude Paul <ly...@redhat.com>
Cc: Danilo Krummrich <d...@kernel.org>
Cc: David Airlie <airl...@gmail.com>
Cc: Simona Vetter <sim...@ffwll.ch>
Cc: Ralph Campbell <rcampb...@nvidia.com>
Cc: Mika Penttilä <mpent...@redhat.com>
Cc: Matthew Brost <matthew.br...@intel.com>
Cc: Francois Dugast <francois.dug...@intel.com>

Signed-off-by: Balbir Singh <balb...@nvidia.com>
---
 mm/migrate_device.c | 82 +++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 82 insertions(+)

diff --git a/mm/migrate_device.c b/mm/migrate_device.c
index abd9f6850db6..0afdc8b67c60 100644
--- a/mm/migrate_device.c
+++ b/mm/migrate_device.c
@@ -54,6 +54,53 @@ static int migrate_vma_collect_hole(unsigned long start,
 	return 0;
 }

+/**
+ * migrate_vma_split_folio() - Helper function to split a THP folio
+ * @folio: the folio to split
+ * @fault_page: struct page associated with the fault if any
+ *
+ * Returns 0 on success
+ */
+static int migrate_vma_split_folio(struct folio *folio,
+				   struct page *fault_page)
+{
+	int ret;
+	struct folio *fault_folio = fault_page ? page_folio(fault_page) : NULL;
+	struct folio *new_fault_folio = NULL;
+
+	if (folio != fault_folio) {
+		folio_get(folio);
+		folio_lock(folio);
+	}
+
+	ret = split_folio(folio);
+	if (ret) {
+		if (folio != fault_folio) {
+			folio_unlock(folio);
+			folio_put(folio);
+		}
+		return ret;
+	}
+
+	new_fault_folio = fault_page ? page_folio(fault_page) : NULL;
+
+	/*
+	 * Ensure the lock is held on the correct
+	 * folio after the split
+	 */
+	if (!new_fault_folio) {
+		folio_unlock(folio);
+		folio_put(folio);
+	} else if (folio != new_fault_folio) {
+		folio_get(new_fault_folio);
+		folio_lock(new_fault_folio);
+		folio_unlock(folio);
+		folio_put(folio);
+	}
+
+	return 0;
+}
+
 static int migrate_vma_collect_pmd(pmd_t *pmdp,
 				   unsigned long start,
 				   unsigned long end,
@@ -136,6 +183,8 @@ static int migrate_vma_collect_pmd(pmd_t *pmdp,
 			 * page table entry. Other special swap entries are not
 			 * migratable, and we ignore regular swapped page.
 			 */
+			struct folio *folio;
+
 			entry = pte_to_swp_entry(pte);
 			if (!is_device_private_entry(entry))
 				goto next;
@@ -147,6 +196,23 @@ static int migrate_vma_collect_pmd(pmd_t *pmdp,
 			    pgmap->owner != migrate->pgmap_owner)
 				goto next;

+			folio = page_folio(page);
+			if (folio_test_large(folio)) {
+				int ret;
+
+				pte_unmap_unlock(ptep, ptl);
+				ret = migrate_vma_split_folio(folio,
+							migrate->fault_page);
+
+				if (ret) {
+					ptep = pte_offset_map_lock(mm, pmdp, addr, &ptl);
+					goto next;
+				}
+
+				addr = start;
+				goto again;
+			}
+
 			mpfn = migrate_pfn(page_to_pfn(page)) |
 				MIGRATE_PFN_MIGRATE;
 			if (is_writable_device_private_entry(entry))
@@ -171,6 +237,22 @@ static int migrate_vma_collect_pmd(pmd_t *pmdp,
 			    pgmap->owner != migrate->pgmap_owner)
 				goto next;
 		}
+			folio = page_folio(page);
+			if (folio_test_large(folio)) {
+				int ret;
+
+				pte_unmap_unlock(ptep, ptl);
+				ret = migrate_vma_split_folio(folio,
+							migrate->fault_page);
+
+				if (ret) {
+					ptep = pte_offset_map_lock(mm, pmdp, addr, &ptl);
+					goto next;
+				}
+
+				addr = start;
+				goto again;
+			}
 			mpfn = migrate_pfn(pfn) | MIGRATE_PFN_MIGRATE;
 			mpfn |= pte_write(pte) ? MIGRATE_PFN_WRITE : 0;
 		}
-- 
2.50.1


Balbir Singh