On 2/5/26 22:10, Thomas Hellström wrote: > If hmm_range_fault() fails a folio_trylock() in do_swap_page(), > trying to acquire the lock of a device-private folio for migration > to ram, the function will spin until it succeeds grabbing the lock. > > However, if the process holding the lock is depending on a work > item to be completed, which is scheduled on the same CPU as the > spinning hmm_range_fault(), that work item might be starved and > we end up in a livelock / starvation situation which is never > resolved. > > This can happen, for example, if the process holding the > device-private folio lock is stuck in > migrate_device_unmap()->lru_add_drain_all(). > The lru_add_drain_all() function requires a short work-item > to be run on all online cpus to complete. > > The prerequisites for this to happen are: > a) Both zone device and system memory folios are considered in > migrate_device_unmap(), so that there is a reason to call > lru_add_drain_all() for a system memory folio while a > folio lock is held on a zone device folio. > b) The zone device folio has an initial mapcount > 1 which causes > at least one migration PTE entry insertion to be deferred to > try_to_migrate(), which can happen after the call to > lru_add_drain_all(). > c) No preemption, or voluntary-only preemption. > > This all seems pretty unlikely to happen, but indeed is hit by > the "xe_exec_system_allocator" igt test. >
Do you have a stack trace from the test? I am trying to visualize the livelock/starvation, but I can't work it out from the description. > Resolve this by waiting for the folio to be unlocked if the > folio_trylock() fails in the do_swap_page() function. > > Rename the migration_entry_wait_on_locked() function to > softleaf_entry_wait_on_locked() and update its documentation to > indicate the new use-case. > > Future code improvements might consider moving > the lru_add_drain_all() call in migrate_device_unmap() to be > called *after* all pages have migration entries inserted. > That would also eliminate b) above. > > v2: > - Instead of a cond_resched() in the hmm_range_fault() function, > eliminate the problem by waiting for the folio to be unlocked > in do_swap_page() (Alistair Popple, Andrew Morton) > v3: > - Add a stub migration_entry_wait_on_locked() for the > !CONFIG_MIGRATION case. (Kernel Test Robot) > v4: > - Rename migration_entry_wait_on_locked() to > softleaf_entry_wait_on_locked() and update docs (Alistair Popple) > > Suggested-by: Alistair Popple <[email protected]> > Fixes: 1afaeb8293c9 ("mm/migrate: Trylock device page in do_swap_page") > Cc: Ralph Campbell <[email protected]> > Cc: Christoph Hellwig <[email protected]> > Cc: Jason Gunthorpe <[email protected]> > Cc: Jason Gunthorpe <[email protected]> > Cc: Leon Romanovsky <[email protected]> > Cc: Andrew Morton <[email protected]> > Cc: Matthew Brost <[email protected]> > Cc: John Hubbard <[email protected]> > Cc: Alistair Popple <[email protected]> > Cc: [email protected] > Cc: <[email protected]> > Signed-off-by: Thomas Hellström <[email protected]> > Cc: <[email protected]> # v6.15+ > Reviewed-by: John Hubbard <[email protected]> #v3 > --- > include/linux/migrate.h | 8 +++++++- > mm/filemap.c | 15 ++++++++++----- > mm/memory.c | 3 ++- > mm/migrate.c | 8 ++++---- > mm/migrate_device.c | 2 +- > 5 files changed, 24 insertions(+), 12 deletions(-) > > diff --git a/include/linux/migrate.h b/include/linux/migrate.h > 
index 26ca00c325d9..3cc387f1957d 100644 > --- a/include/linux/migrate.h > +++ b/include/linux/migrate.h > @@ -65,7 +65,7 @@ bool isolate_folio_to_list(struct folio *folio, struct > list_head *list); > > int migrate_huge_page_move_mapping(struct address_space *mapping, > struct folio *dst, struct folio *src); > -void migration_entry_wait_on_locked(softleaf_t entry, spinlock_t *ptl) > +void softleaf_entry_wait_on_locked(softleaf_t entry, spinlock_t *ptl) > __releases(ptl); > void folio_migrate_flags(struct folio *newfolio, struct folio *folio); > int folio_migrate_mapping(struct address_space *mapping, > @@ -97,6 +97,12 @@ static inline int set_movable_ops(const struct > movable_operations *ops, enum pag > return -ENOSYS; > } > > +static inline void softleaf_entry_wait_on_locked(softleaf_t entry, > spinlock_t *ptl) > + __releases(ptl) > +{ > + spin_unlock(ptl); > +} > + > #endif /* CONFIG_MIGRATION */ > > #ifdef CONFIG_NUMA_BALANCING > diff --git a/mm/filemap.c b/mm/filemap.c > index ebd75684cb0a..d98e4883f13d 100644 > --- a/mm/filemap.c > +++ b/mm/filemap.c > @@ -1379,14 +1379,16 @@ static inline int folio_wait_bit_common(struct folio > *folio, int bit_nr, > > #ifdef CONFIG_MIGRATION > /** > - * migration_entry_wait_on_locked - Wait for a migration entry to be removed > - * @entry: migration swap entry. > + * softleaf_entry_wait_on_locked - Wait for a migration entry or > + * device_private entry to be removed. > + * @entry: migration or device_private swap entry. > * @ptl: already locked ptl. This function will drop the lock. > * > - * Wait for a migration entry referencing the given page to be removed. This > is > + * Wait for a migration entry referencing the given page, or a device_private > + * entry referencing a device_private page, to be unlocked. This is > * equivalent to folio_put_wait_locked(folio, TASK_UNINTERRUPTIBLE) except > * this can be called without taking a reference on the page. 
Instead this > - * should be called while holding the ptl for the migration entry referencing > + * should be called while holding the ptl for @entry referencing > * the page. > * > * Returns after unlocking the ptl. > @@ -1394,7 +1396,7 @@ static inline int folio_wait_bit_common(struct folio > *folio, int bit_nr, > * This follows the same logic as folio_wait_bit_common() so see the comments > * there. > */ > -void migration_entry_wait_on_locked(softleaf_t entry, spinlock_t *ptl) > +void softleaf_entry_wait_on_locked(softleaf_t entry, spinlock_t *ptl) > __releases(ptl) > { > struct wait_page_queue wait_page; > @@ -1428,6 +1430,9 @@ void migration_entry_wait_on_locked(softleaf_t entry, > spinlock_t *ptl) > * If a migration entry exists for the page the migration path must hold > * a valid reference to the page, and it must take the ptl to remove the > * migration entry. So the page is valid until the ptl is dropped. > + * Similarly any path attempting to drop the last reference to a > + * device-private page needs to grab the ptl to remove the > device-private > + * entry. 
> */ > spin_unlock(ptl); > > diff --git a/mm/memory.c b/mm/memory.c > index da360a6eb8a4..20172476a57f 100644 > --- a/mm/memory.c > +++ b/mm/memory.c > @@ -4684,7 +4684,8 @@ vm_fault_t do_swap_page(struct vm_fault *vmf) > unlock_page(vmf->page); > put_page(vmf->page); > } else { > - pte_unmap_unlock(vmf->pte, vmf->ptl); > + pte_unmap(vmf->pte); > + softleaf_entry_wait_on_locked(entry, vmf->ptl); > } > } else if (softleaf_is_hwpoison(entry)) { > ret = VM_FAULT_HWPOISON; > diff --git a/mm/migrate.c b/mm/migrate.c > index 4688b9e38cd2..cf6449b4202e 100644 > --- a/mm/migrate.c > +++ b/mm/migrate.c > @@ -499,7 +499,7 @@ void migration_entry_wait(struct mm_struct *mm, pmd_t > *pmd, > if (!softleaf_is_migration(entry)) > goto out; > > - migration_entry_wait_on_locked(entry, ptl); > + softleaf_entry_wait_on_locked(entry, ptl); > return; > out: > spin_unlock(ptl); > @@ -531,10 +531,10 @@ void migration_entry_wait_huge(struct vm_area_struct > *vma, unsigned long addr, p > * If migration entry existed, safe to release vma lock > * here because the pgtable page won't be freed without the > * pgtable lock released. See comment right above pgtable > - * lock release in migration_entry_wait_on_locked(). > + * lock release in softleaf_entry_wait_on_locked(). 
> */ > hugetlb_vma_unlock_read(vma); > - migration_entry_wait_on_locked(entry, ptl); > + softleaf_entry_wait_on_locked(entry, ptl); > return; > } > > @@ -552,7 +552,7 @@ void pmd_migration_entry_wait(struct mm_struct *mm, pmd_t > *pmd) > ptl = pmd_lock(mm, pmd); > if (!pmd_is_migration_entry(*pmd)) > goto unlock; > - migration_entry_wait_on_locked(softleaf_from_pmd(*pmd), ptl); > + softleaf_entry_wait_on_locked(softleaf_from_pmd(*pmd), ptl); > return; > unlock: > spin_unlock(ptl); > diff --git a/mm/migrate_device.c b/mm/migrate_device.c > index 23379663b1e1..deab89fd4541 100644 > --- a/mm/migrate_device.c > +++ b/mm/migrate_device.c > @@ -176,7 +176,7 @@ static int migrate_vma_collect_huge_pmd(pmd_t *pmdp, > unsigned long start, > } > > if (softleaf_is_migration(entry)) { > - migration_entry_wait_on_locked(entry, ptl); > + softleaf_entry_wait_on_locked(entry, ptl); > spin_unlock(ptl); > return -EAGAIN; > } Balbir
