Re: [PATCH v2 03/29] mm/migrate: Trylock device page in do_swap_page

2024-12-13 Thread Matthew Brost
On Fri, Dec 13, 2024 at 02:16:51PM -0800, Matthew Brost wrote:
> On Fri, Nov 29, 2024 at 10:31:32AM +1100, Alistair Popple wrote:
> > 
> > Matthew Brost  writes:
> > 
> > > Avoid multiple CPU page faults to the same device page racing by trying
> > > to lock the page in do_swap_page before taking an extra reference to the
> > > page. This prevents scenarios where multiple CPU page faults each take
> > > an extra reference to a device page, which could abort migration in
> > > folio_migrate_mapping. With the device page being locked in
> > > do_swap_page, the migrate_vma_* functions need to be updated to avoid
> > > locking the fault_page argument.
> > >
> > > Prior to this change, a livelock scenario could occur in Xe's (Intel GPU
> > > DRM driver) SVM implementation if enough threads faulted the same device
> > > page.
> > >
> > > Cc: Philip Yang 
> > > Cc: Felix Kuehling 
> > > Cc: Christian König 
> > > Cc: Andrew Morton 
> > > Suggessted-by: Simona Vetter 
> > > Signed-off-by: Matthew Brost 
> > > ---
> > >  mm/memory.c | 13 ++---
> > >  mm/migrate_device.c | 69 ++---
> > >  2 files changed, 56 insertions(+), 26 deletions(-)
> > >
> > > diff --git a/mm/memory.c b/mm/memory.c
> > > index 2366578015ad..b72bde782611 100644
> > > --- a/mm/memory.c
> > > +++ b/mm/memory.c
> > > @@ -4252,10 +4252,15 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
> > >* Get a page reference while we know the page can't be
> > >* freed.
> > >*/
> > > - get_page(vmf->page);
> > > - pte_unmap_unlock(vmf->pte, vmf->ptl);
> > > - ret = vmf->page->pgmap->ops->migrate_to_ram(vmf);
> > > - put_page(vmf->page);
> > > + if (trylock_page(vmf->page)) {
> > > + get_page(vmf->page);
> > > + pte_unmap_unlock(vmf->pte, vmf->ptl);
> > > + ret = 
> > > vmf->page->pgmap->ops->migrate_to_ram(vmf);
> > > + put_page(vmf->page);
> > > + unlock_page(vmf->page);
> > 
> > Isn't the order wrong here? In the common case put_page() will have just
> > dropped the last reference on the page and freed it so the unlock_page()
> > needs to happen first.
> > 
> 
> Yes, this appears wrong. I haven't seen this show up as a problem but
> certainly a put should be done after dropping the lock. 
> 
> > > + } else {
> > > + pte_unmap_unlock(vmf->pte, vmf->ptl);
> > > + }
> > >   } else if (is_hwpoison_entry(entry)) {
> > >   ret = VM_FAULT_HWPOISON;
> > >   } else if (is_pte_marker_entry(entry)) {
> > > diff --git a/mm/migrate_device.c b/mm/migrate_device.c
> > > index f163c2131022..2477d39f57be 100644
> > > --- a/mm/migrate_device.c
> > > +++ b/mm/migrate_device.c
> > > @@ -60,6 +60,8 @@ static int migrate_vma_collect_pmd(pmd_t *pmdp,
> > >  struct mm_walk *walk)
> > >  {
> > >   struct migrate_vma *migrate = walk->private;
> > > + struct folio *fault_folio = migrate->fault_page ?
> > > + page_folio(migrate->fault_page) : NULL;
> > >   struct vm_area_struct *vma = walk->vma;
> > >   struct mm_struct *mm = vma->vm_mm;
> > >   unsigned long addr = start, unmapped = 0;
> > > @@ -88,11 +90,13 @@ static int migrate_vma_collect_pmd(pmd_t *pmdp,
> > >  
> > >   folio_get(folio);
> > >   spin_unlock(ptl);
> > > - if (unlikely(!folio_trylock(folio)))
> > > + if (unlikely(fault_folio != folio &&
> > 
> > We don't currently support large ZONE_DEVICE pages so we should never
> > get here. I think a WARN_ON_ONCE(fault_folio == folio) and bail would be
> > better.
> > 
> 
> Sure will fix.
> 
> > > +  !folio_trylock(folio)))
> > >   return migrate_vma_collect_skip(start, end,
> > >   walk);
> > >   ret = split_folio(folio);
> > > - folio_unlock(folio);
> > > + if (fault_folio != folio)
> > > + folio_unlock(folio);
> > >   folio_put(folio);
> > >   if (ret)
> > >   return migrate_vma_collect_skip(start, end,
> > > @@ -192,7 +196,7 @@ static int migrate_vma_collect_pmd(pmd_t *pmdp,
> > >* optimisation to avoid walking the rmap later with
> > >* try_to_migrate().
> > >*/
> > > - if (folio_trylock(folio)) {
> > > + if (fault_folio == folio || folio_trylock(folio)) {
> > >   bool anon_exclusive;
> > >   pte_t swp_pte;
> > >  
> > > @@ -204,7 +208,8 @@ static int migrate_vma_collect_pmd(pmd_t *pmdp,
> > >  
> > >   if (folio_try_share_anon_rmap_pte(folio, 

Re: [PATCH v2 03/29] mm/migrate: Trylock device page in do_swap_page

2024-12-13 Thread Matthew Brost
On Fri, Nov 29, 2024 at 10:31:32AM +1100, Alistair Popple wrote:
> 
> Matthew Brost  writes:
> 
> > Avoid multiple CPU page faults to the same device page racing by trying
> > to lock the page in do_swap_page before taking an extra reference to the
> > page. This prevents scenarios where multiple CPU page faults each take
> > an extra reference to a device page, which could abort migration in
> > folio_migrate_mapping. With the device page being locked in
> > do_swap_page, the migrate_vma_* functions need to be updated to avoid
> > locking the fault_page argument.
> >
> > Prior to this change, a livelock scenario could occur in Xe's (Intel GPU
> > DRM driver) SVM implementation if enough threads faulted the same device
> > page.
> >
> > Cc: Philip Yang 
> > Cc: Felix Kuehling 
> > Cc: Christian König 
> > Cc: Andrew Morton 
> > Suggessted-by: Simona Vetter 
> > Signed-off-by: Matthew Brost 
> > ---
> >  mm/memory.c | 13 ++---
> >  mm/migrate_device.c | 69 ++---
> >  2 files changed, 56 insertions(+), 26 deletions(-)
> >
> > diff --git a/mm/memory.c b/mm/memory.c
> > index 2366578015ad..b72bde782611 100644
> > --- a/mm/memory.c
> > +++ b/mm/memory.c
> > @@ -4252,10 +4252,15 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
> >  * Get a page reference while we know the page can't be
> >  * freed.
> >  */
> > -   get_page(vmf->page);
> > -   pte_unmap_unlock(vmf->pte, vmf->ptl);
> > -   ret = vmf->page->pgmap->ops->migrate_to_ram(vmf);
> > -   put_page(vmf->page);
> > +   if (trylock_page(vmf->page)) {
> > +   get_page(vmf->page);
> > +   pte_unmap_unlock(vmf->pte, vmf->ptl);
> > +   ret = 
> > vmf->page->pgmap->ops->migrate_to_ram(vmf);
> > +   put_page(vmf->page);
> > +   unlock_page(vmf->page);
> 
> Isn't the order wrong here? In the common case put_page() will have just
> dropped the last reference on the page and freed it so the unlock_page()
> needs to happen first.
> 

Yes, this appears wrong. I haven't seen this show up as a problem but
certainly a put should be done after dropping the lock. 

> > +   } else {
> > +   pte_unmap_unlock(vmf->pte, vmf->ptl);
> > +   }
> > } else if (is_hwpoison_entry(entry)) {
> > ret = VM_FAULT_HWPOISON;
> > } else if (is_pte_marker_entry(entry)) {
> > diff --git a/mm/migrate_device.c b/mm/migrate_device.c
> > index f163c2131022..2477d39f57be 100644
> > --- a/mm/migrate_device.c
> > +++ b/mm/migrate_device.c
> > @@ -60,6 +60,8 @@ static int migrate_vma_collect_pmd(pmd_t *pmdp,
> >struct mm_walk *walk)
> >  {
> > struct migrate_vma *migrate = walk->private;
> > +   struct folio *fault_folio = migrate->fault_page ?
> > +   page_folio(migrate->fault_page) : NULL;
> > struct vm_area_struct *vma = walk->vma;
> > struct mm_struct *mm = vma->vm_mm;
> > unsigned long addr = start, unmapped = 0;
> > @@ -88,11 +90,13 @@ static int migrate_vma_collect_pmd(pmd_t *pmdp,
> >  
> > folio_get(folio);
> > spin_unlock(ptl);
> > -   if (unlikely(!folio_trylock(folio)))
> > +   if (unlikely(fault_folio != folio &&
> 
> We don't currently support large ZONE_DEVICE pages so we should never
> get here. I think a WARN_ON_ONCE(fault_folio == folio) and bail would be
> better.
> 

Sure will fix.

> > +!folio_trylock(folio)))
> > return migrate_vma_collect_skip(start, end,
> > walk);
> > ret = split_folio(folio);
> > -   folio_unlock(folio);
> > +   if (fault_folio != folio)
> > +   folio_unlock(folio);
> > folio_put(folio);
> > if (ret)
> > return migrate_vma_collect_skip(start, end,
> > @@ -192,7 +196,7 @@ static int migrate_vma_collect_pmd(pmd_t *pmdp,
> >  * optimisation to avoid walking the rmap later with
> >  * try_to_migrate().
> >  */
> > -   if (folio_trylock(folio)) {
> > +   if (fault_folio == folio || folio_trylock(folio)) {
> > bool anon_exclusive;
> > pte_t swp_pte;
> >  
> > @@ -204,7 +208,8 @@ static int migrate_vma_collect_pmd(pmd_t *pmdp,
> >  
> > if (folio_try_share_anon_rmap_pte(folio, page)) 
> > {
> > set_pte_at(mm, addr, ptep, pte);
> > -   folio_unlock(folio);
> > +  

Re: [PATCH v2 03/29] mm/migrate: Trylock device page in do_swap_page

2024-11-28 Thread Alistair Popple


Matthew Brost  writes:

> Avoid multiple CPU page faults to the same device page racing by trying
> to lock the page in do_swap_page before taking an extra reference to the
> page. This prevents scenarios where multiple CPU page faults each take
> an extra reference to a device page, which could abort migration in
> folio_migrate_mapping. With the device page being locked in
> do_swap_page, the migrate_vma_* functions need to be updated to avoid
> locking the fault_page argument.
>
> Prior to this change, a livelock scenario could occur in Xe's (Intel GPU
> DRM driver) SVM implementation if enough threads faulted the same device
> page.
>
> Cc: Philip Yang 
> Cc: Felix Kuehling 
> Cc: Christian König 
> Cc: Andrew Morton 
> Suggessted-by: Simona Vetter 
> Signed-off-by: Matthew Brost 
> ---
>  mm/memory.c | 13 ++---
>  mm/migrate_device.c | 69 ++---
>  2 files changed, 56 insertions(+), 26 deletions(-)
>
> diff --git a/mm/memory.c b/mm/memory.c
> index 2366578015ad..b72bde782611 100644
> --- a/mm/memory.c
> +++ b/mm/memory.c
> @@ -4252,10 +4252,15 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
>* Get a page reference while we know the page can't be
>* freed.
>*/
> - get_page(vmf->page);
> - pte_unmap_unlock(vmf->pte, vmf->ptl);
> - ret = vmf->page->pgmap->ops->migrate_to_ram(vmf);
> - put_page(vmf->page);
> + if (trylock_page(vmf->page)) {
> + get_page(vmf->page);
> + pte_unmap_unlock(vmf->pte, vmf->ptl);
> + ret = 
> vmf->page->pgmap->ops->migrate_to_ram(vmf);
> + put_page(vmf->page);
> + unlock_page(vmf->page);

Isn't the order wrong here? In the common case put_page() will have just
dropped the last reference on the page and freed it so the unlock_page()
needs to happen first.

> + } else {
> + pte_unmap_unlock(vmf->pte, vmf->ptl);
> + }
>   } else if (is_hwpoison_entry(entry)) {
>   ret = VM_FAULT_HWPOISON;
>   } else if (is_pte_marker_entry(entry)) {
> diff --git a/mm/migrate_device.c b/mm/migrate_device.c
> index f163c2131022..2477d39f57be 100644
> --- a/mm/migrate_device.c
> +++ b/mm/migrate_device.c
> @@ -60,6 +60,8 @@ static int migrate_vma_collect_pmd(pmd_t *pmdp,
>  struct mm_walk *walk)
>  {
>   struct migrate_vma *migrate = walk->private;
> + struct folio *fault_folio = migrate->fault_page ?
> + page_folio(migrate->fault_page) : NULL;
>   struct vm_area_struct *vma = walk->vma;
>   struct mm_struct *mm = vma->vm_mm;
>   unsigned long addr = start, unmapped = 0;
> @@ -88,11 +90,13 @@ static int migrate_vma_collect_pmd(pmd_t *pmdp,
>  
>   folio_get(folio);
>   spin_unlock(ptl);
> - if (unlikely(!folio_trylock(folio)))
> + if (unlikely(fault_folio != folio &&

We don't currently support large ZONE_DEVICE pages so we should never
get here. I think a WARN_ON_ONCE(fault_folio == folio) and bail would be
better.

> +  !folio_trylock(folio)))
>   return migrate_vma_collect_skip(start, end,
>   walk);
>   ret = split_folio(folio);
> - folio_unlock(folio);
> + if (fault_folio != folio)
> + folio_unlock(folio);
>   folio_put(folio);
>   if (ret)
>   return migrate_vma_collect_skip(start, end,
> @@ -192,7 +196,7 @@ static int migrate_vma_collect_pmd(pmd_t *pmdp,
>* optimisation to avoid walking the rmap later with
>* try_to_migrate().
>*/
> - if (folio_trylock(folio)) {
> + if (fault_folio == folio || folio_trylock(folio)) {
>   bool anon_exclusive;
>   pte_t swp_pte;
>  
> @@ -204,7 +208,8 @@ static int migrate_vma_collect_pmd(pmd_t *pmdp,
>  
>   if (folio_try_share_anon_rmap_pte(folio, page)) 
> {
>   set_pte_at(mm, addr, ptep, pte);
> - folio_unlock(folio);
> + if (fault_folio != folio)
> + folio_unlock(folio);
>   folio_put(folio);
>   mpfn = 0;
>   goto next;
> @@ -363,6 +368,8 @@ static unsigned long migrate_device_unmap(u

Re: [PATCH v2 03/29] mm/migrate: Trylock device page in do_swap_page

2024-11-01 Thread Matthew Brost
On Tue, Oct 29, 2024 at 05:37:45PM +1100, Alistair Popple wrote:
> 
> Matthew Brost  writes:
> 
> > On Thu, Oct 17, 2024 at 12:51:08PM +1100, Alistair Popple wrote:
> >> 
> >> Matthew Brost  writes:
> >> 
> >> > On Wed, Oct 16, 2024 at 03:00:08PM +1100, Alistair Popple wrote:
> >> >> 
> >> >> Matthew Brost  writes:
> >> >> 
> >> >> > Avoid multiple CPU page faults to the same device page racing by 
> >> >> > trying
> >> >> > to lock the page in do_swap_page before taking an extra reference to 
> >> >> > the
> >> >> > page. This prevents scenarios where multiple CPU page faults each take
> >> >> > an extra reference to a device page, which could abort migration in
> >> >> > folio_migrate_mapping. With the device page being locked in
> >> >> > do_swap_page, the migrate_vma_* functions need to be updated to avoid
> >> >> > locking the fault_page argument.
> >> >> >
> >> >> > Prior to this change, a livelock scenario could occur in Xe's (Intel 
> >> >> > GPU
> >> >> > DRM driver) SVM implementation if enough threads faulted the same 
> >> >> > device
> >> >> > page.
> >> >> >
> >> >> > Cc: Philip Yang 
> >> >> > Cc: Felix Kuehling 
> >> >> > Cc: Christian König 
> >> >> > Cc: Andrew Morton 
> >> >> > Suggessted-by: Simona Vetter 
> >> >> > Signed-off-by: Matthew Brost 
> >> >> > ---
> >> >> >  mm/memory.c | 13 ++---
> >> >> >  mm/migrate_device.c | 69 
> >> >> > ++---
> >> >> >  2 files changed, 56 insertions(+), 26 deletions(-)
> >> >> >
> >> >> > diff --git a/mm/memory.c b/mm/memory.c
> >> >> > index 2366578015ad..b72bde782611 100644
> >> >> > --- a/mm/memory.c
> >> >> > +++ b/mm/memory.c
> >> >> > @@ -4252,10 +4252,15 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
> >> >> >* Get a page reference while we know the page 
> >> >> > can't be
> >> >> >* freed.
> >> >> >*/
> >> >> > - get_page(vmf->page);
> >> >> > - pte_unmap_unlock(vmf->pte, vmf->ptl);
> >> >> > - ret = 
> >> >> > vmf->page->pgmap->ops->migrate_to_ram(vmf);
> >> >> > - put_page(vmf->page);
> >> >> > + if (trylock_page(vmf->page)) {
> >> >> > + get_page(vmf->page);
> >> >> > + pte_unmap_unlock(vmf->pte, vmf->ptl);
> >> >> > + ret = 
> >> >> > vmf->page->pgmap->ops->migrate_to_ram(vmf);
> >> >> > + put_page(vmf->page);
> >> >> > + unlock_page(vmf->page);
> >> >> 
> >> >> I don't think my previous review of this change has really been
> >> >> addressed. Why don't we just install the migration entry here? Seems
> >> >> like it would be a much simpler way of solving this.
> >> >> 
> >> >
> >> > I should have mentioned this in the cover-letter, I haven't got around
> >> > to trying that out yet. Included this existing version for correctness
> >> > but I also think this is not strickly required to merge this series as
> >> > our locking in migrate_to_ram only relies on the core MM locks so
> >> > some thread would eventually win the race and make forward progress.
> >> >
> >> > So I guess just ignore this patch and will send an updated version
> >> > individually with installing a migration entry in do_swap_page. If for
> >> > some reason that doesn't work, I'll respond here explaining why.
> >> 
> >> That would be great. I have a fairly strong preference for doing that
> >> instead of adding more special cases for the fault page in the migration
> >> code. And if we can't do that it would be good to understand
> >> why. Thanks.
> >> 
> >
> > I've looked into this and actually prefer the approach in this patch.
> 
> Thanks for looking into this.
> 
> > Consider the scenario where we install a migration entry, but
> > migrate_to_ram fails. How do we handle this?
> >
> > We don't know where migrate_to_ram failed. Was migrate_device_finalize
> > called, removing the migration PTE? Do we need to special-case failures
> > in migrate_to_ram to prevent migrate_device_finalize from removing the
> > faulting page's migration entry? Should we check for a migration entry
> > after migrate_to_ram and remove it if it exists?
> 
> The driver should always call migrate_device_finalize(). On failure it
> will remove the migration entry and remap the original device private
> page. That obviously doesn't handle the fault but the process is about
> to die anyway with a SIGBUS because migrate_to_ram() can't fail.
> 

What if migrate_to_ram fails before calling migrate_vma_setup - e.g. a
kmalloc of the arguments fails? Very awkward situation.

> > Now, if migrate_to_ram succeeds, it seems the migration entry should be
> > removed in migrate_device_finalize since the new page is only available
> > there. We could return the new page in migrate_to_ram, but that feels
> > messy.
> 
> Agreed - I would expect migrat

Re: [PATCH v2 03/29] mm/migrate: Trylock device page in do_swap_page

2024-10-28 Thread Alistair Popple


Matthew Brost  writes:

> On Thu, Oct 17, 2024 at 12:51:08PM +1100, Alistair Popple wrote:
>> 
>> Matthew Brost  writes:
>> 
>> > On Wed, Oct 16, 2024 at 03:00:08PM +1100, Alistair Popple wrote:
>> >> 
>> >> Matthew Brost  writes:
>> >> 
>> >> > Avoid multiple CPU page faults to the same device page racing by trying
>> >> > to lock the page in do_swap_page before taking an extra reference to the
>> >> > page. This prevents scenarios where multiple CPU page faults each take
>> >> > an extra reference to a device page, which could abort migration in
>> >> > folio_migrate_mapping. With the device page being locked in
>> >> > do_swap_page, the migrate_vma_* functions need to be updated to avoid
>> >> > locking the fault_page argument.
>> >> >
>> >> > Prior to this change, a livelock scenario could occur in Xe's (Intel GPU
>> >> > DRM driver) SVM implementation if enough threads faulted the same device
>> >> > page.
>> >> >
>> >> > Cc: Philip Yang 
>> >> > Cc: Felix Kuehling 
>> >> > Cc: Christian König 
>> >> > Cc: Andrew Morton 
>> >> > Suggessted-by: Simona Vetter 
>> >> > Signed-off-by: Matthew Brost 
>> >> > ---
>> >> >  mm/memory.c | 13 ++---
>> >> >  mm/migrate_device.c | 69 ++---
>> >> >  2 files changed, 56 insertions(+), 26 deletions(-)
>> >> >
>> >> > diff --git a/mm/memory.c b/mm/memory.c
>> >> > index 2366578015ad..b72bde782611 100644
>> >> > --- a/mm/memory.c
>> >> > +++ b/mm/memory.c
>> >> > @@ -4252,10 +4252,15 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
>> >> >  * Get a page reference while we know the page 
>> >> > can't be
>> >> >  * freed.
>> >> >  */
>> >> > -   get_page(vmf->page);
>> >> > -   pte_unmap_unlock(vmf->pte, vmf->ptl);
>> >> > -   ret = 
>> >> > vmf->page->pgmap->ops->migrate_to_ram(vmf);
>> >> > -   put_page(vmf->page);
>> >> > +   if (trylock_page(vmf->page)) {
>> >> > +   get_page(vmf->page);
>> >> > +   pte_unmap_unlock(vmf->pte, vmf->ptl);
>> >> > +   ret = 
>> >> > vmf->page->pgmap->ops->migrate_to_ram(vmf);
>> >> > +   put_page(vmf->page);
>> >> > +   unlock_page(vmf->page);
>> >> 
>> >> I don't think my previous review of this change has really been
>> >> addressed. Why don't we just install the migration entry here? Seems
>> >> like it would be a much simpler way of solving this.
>> >> 
>> >
>> > I should have mentioned this in the cover-letter, I haven't got around
>> > to trying that out yet. Included this existing version for correctness
>> > but I also think this is not strickly required to merge this series as
>> > our locking in migrate_to_ram only relies on the core MM locks so
>> > some thread would eventually win the race and make forward progress.
>> >
>> > So I guess just ignore this patch and will send an updated version
>> > individually with installing a migration entry in do_swap_page. If for
>> > some reason that doesn't work, I'll respond here explaining why.
>> 
>> That would be great. I have a fairly strong preference for doing that
>> instead of adding more special cases for the fault page in the migration
>> code. And if we can't do that it would be good to understand
>> why. Thanks.
>> 
>
> I've looked into this and actually prefer the approach in this patch.

Thanks for looking into this.

> Consider the scenario where we install a migration entry, but
> migrate_to_ram fails. How do we handle this?
>
> We don't know where migrate_to_ram failed. Was migrate_device_finalize
> called, removing the migration PTE? Do we need to special-case failures
> in migrate_to_ram to prevent migrate_device_finalize from removing the
> faulting page's migration entry? Should we check for a migration entry
> after migrate_to_ram and remove it if it exists?

The driver should always call migrate_device_finalize(). On failure it
will remove the migration entry and remap the original device private
page. That obviously doesn't handle the fault but the process is about
to die anyway with a SIGBUS because migrate_to_ram() can't fail.

> Now, if migrate_to_ram succeeds, it seems the migration entry should be
> removed in migrate_device_finalize since the new page is only available
> there. We could return the new page in migrate_to_ram, but that feels
> messy.

Agreed - I would expect migrate_device_finalize() to always be called
and remove the migration entry.

> Additionally, the page lock needs to be held across migrate_to_ram, as
> this patch does, so we'll require some special handling in
> migrate_device_finalize to avoid unlocking the faulting page.

Or just unlock it in migrate_device_finalize(). I agree locking it one
place and unlocking it in another is a bit ugly though.

> Finally, installing a 

Re: [PATCH v2 03/29] mm/migrate: Trylock device page in do_swap_page

2024-10-24 Thread Matthew Brost
On Thu, Oct 17, 2024 at 12:51:08PM +1100, Alistair Popple wrote:
> 
> Matthew Brost  writes:
> 
> > On Wed, Oct 16, 2024 at 03:00:08PM +1100, Alistair Popple wrote:
> >> 
> >> Matthew Brost  writes:
> >> 
> >> > Avoid multiple CPU page faults to the same device page racing by trying
> >> > to lock the page in do_swap_page before taking an extra reference to the
> >> > page. This prevents scenarios where multiple CPU page faults each take
> >> > an extra reference to a device page, which could abort migration in
> >> > folio_migrate_mapping. With the device page being locked in
> >> > do_swap_page, the migrate_vma_* functions need to be updated to avoid
> >> > locking the fault_page argument.
> >> >
> >> > Prior to this change, a livelock scenario could occur in Xe's (Intel GPU
> >> > DRM driver) SVM implementation if enough threads faulted the same device
> >> > page.
> >> >
> >> > Cc: Philip Yang 
> >> > Cc: Felix Kuehling 
> >> > Cc: Christian König 
> >> > Cc: Andrew Morton 
> >> > Suggessted-by: Simona Vetter 
> >> > Signed-off-by: Matthew Brost 
> >> > ---
> >> >  mm/memory.c | 13 ++---
> >> >  mm/migrate_device.c | 69 ++---
> >> >  2 files changed, 56 insertions(+), 26 deletions(-)
> >> >
> >> > diff --git a/mm/memory.c b/mm/memory.c
> >> > index 2366578015ad..b72bde782611 100644
> >> > --- a/mm/memory.c
> >> > +++ b/mm/memory.c
> >> > @@ -4252,10 +4252,15 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
> >> >   * Get a page reference while we know the page 
> >> > can't be
> >> >   * freed.
> >> >   */
> >> > -get_page(vmf->page);
> >> > -pte_unmap_unlock(vmf->pte, vmf->ptl);
> >> > -ret = 
> >> > vmf->page->pgmap->ops->migrate_to_ram(vmf);
> >> > -put_page(vmf->page);
> >> > +if (trylock_page(vmf->page)) {
> >> > +get_page(vmf->page);
> >> > +pte_unmap_unlock(vmf->pte, vmf->ptl);
> >> > +ret = 
> >> > vmf->page->pgmap->ops->migrate_to_ram(vmf);
> >> > +put_page(vmf->page);
> >> > +unlock_page(vmf->page);
> >> 
> >> I don't think my previous review of this change has really been
> >> addressed. Why don't we just install the migration entry here? Seems
> >> like it would be a much simpler way of solving this.
> >> 
> >
> > I should have mentioned this in the cover-letter, I haven't got around
> > to trying that out yet. Included this existing version for correctness
> > but I also think this is not strickly required to merge this series as
> > our locking in migrate_to_ram only relies on the core MM locks so
> > some thread would eventually win the race and make forward progress.
> >
> > So I guess just ignore this patch and will send an updated version
> > individually with installing a migration entry in do_swap_page. If for
> > some reason that doesn't work, I'll respond here explaining why.
> 
> That would be great. I have a fairly strong preference for doing that
> instead of adding more special cases for the fault page in the migration
> code. And if we can't do that it would be good to understand
> why. Thanks.
> 

I've looked into this and actually prefer the approach in this patch.

Consider the scenario where we install a migration entry, but
migrate_to_ram fails. How do we handle this?

We don't know where migrate_to_ram failed. Was migrate_device_finalize
called, removing the migration PTE? Do we need to special-case failures
in migrate_to_ram to prevent migrate_device_finalize from removing the
faulting page's migration entry? Should we check for a migration entry
after migrate_to_ram and remove it if it exists?

Now, if migrate_to_ram succeeds, it seems the migration entry should be
removed in migrate_device_finalize since the new page is only available
there. We could return the new page in migrate_to_ram, but that feels
messy.

Additionally, the page lock needs to be held across migrate_to_ram, as
this patch does, so we'll require some special handling in
migrate_device_finalize to avoid unlocking the faulting page.

Finally, installing a migration entry is non-trivial, while taking a
page reference under a lock is straightforward.

Given all this, I prefer to keep this patch as it is.

Matt

>  - Alistair
> 
> > Matt
> >
> >> > +} else {
> >> > +pte_unmap_unlock(vmf->pte, vmf->ptl);
> >> > +}
> >> >  } else if (is_hwpoison_entry(entry)) {
> >> >  ret = VM_FAULT_HWPOISON;
> >> >  } else if (is_pte_marker_entry(entry)) {
> >> > diff --git a/mm/migrate_device.c b/mm/migrate_device.c
> >> > index f163c2131022..2477d39f57be 100644
> >> > --- a/mm/migrate_dev

Re: [PATCH v2 03/29] mm/migrate: Trylock device page in do_swap_page

2024-10-16 Thread Alistair Popple


Matthew Brost  writes:

> On Wed, Oct 16, 2024 at 03:00:08PM +1100, Alistair Popple wrote:
>> 
>> Matthew Brost  writes:
>> 
>> > Avoid multiple CPU page faults to the same device page racing by trying
>> > to lock the page in do_swap_page before taking an extra reference to the
>> > page. This prevents scenarios where multiple CPU page faults each take
>> > an extra reference to a device page, which could abort migration in
>> > folio_migrate_mapping. With the device page being locked in
>> > do_swap_page, the migrate_vma_* functions need to be updated to avoid
>> > locking the fault_page argument.
>> >
>> > Prior to this change, a livelock scenario could occur in Xe's (Intel GPU
>> > DRM driver) SVM implementation if enough threads faulted the same device
>> > page.
>> >
>> > Cc: Philip Yang 
>> > Cc: Felix Kuehling 
>> > Cc: Christian König 
>> > Cc: Andrew Morton 
>> > Suggessted-by: Simona Vetter 
>> > Signed-off-by: Matthew Brost 
>> > ---
>> >  mm/memory.c | 13 ++---
>> >  mm/migrate_device.c | 69 ++---
>> >  2 files changed, 56 insertions(+), 26 deletions(-)
>> >
>> > diff --git a/mm/memory.c b/mm/memory.c
>> > index 2366578015ad..b72bde782611 100644
>> > --- a/mm/memory.c
>> > +++ b/mm/memory.c
>> > @@ -4252,10 +4252,15 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
>> > * Get a page reference while we know the page can't be
>> > * freed.
>> > */
>> > -  get_page(vmf->page);
>> > -  pte_unmap_unlock(vmf->pte, vmf->ptl);
>> > -  ret = vmf->page->pgmap->ops->migrate_to_ram(vmf);
>> > -  put_page(vmf->page);
>> > +  if (trylock_page(vmf->page)) {
>> > +  get_page(vmf->page);
>> > +  pte_unmap_unlock(vmf->pte, vmf->ptl);
>> > +  ret = 
>> > vmf->page->pgmap->ops->migrate_to_ram(vmf);
>> > +  put_page(vmf->page);
>> > +  unlock_page(vmf->page);
>> 
>> I don't think my previous review of this change has really been
>> addressed. Why don't we just install the migration entry here? Seems
>> like it would be a much simpler way of solving this.
>> 
>
> I should have mentioned this in the cover-letter, I haven't got around
> to trying that out yet. Included this existing version for correctness
> but I also think this is not strickly required to merge this series as
> our locking in migrate_to_ram only relies on the core MM locks so
> some thread would eventually win the race and make forward progress.
>
> So I guess just ignore this patch and will send an updated version
> individually with installing a migration entry in do_swap_page. If for
> some reason that doesn't work, I'll respond here explaining why.

That would be great. I have a fairly strong preference for doing that
instead of adding more special cases for the fault page in the migration
code. And if we can't do that it would be good to understand
why. Thanks.

 - Alistair

> Matt
>
>> > +  } else {
>> > +  pte_unmap_unlock(vmf->pte, vmf->ptl);
>> > +  }
>> >} else if (is_hwpoison_entry(entry)) {
>> >ret = VM_FAULT_HWPOISON;
>> >} else if (is_pte_marker_entry(entry)) {
>> > diff --git a/mm/migrate_device.c b/mm/migrate_device.c
>> > index f163c2131022..2477d39f57be 100644
>> > --- a/mm/migrate_device.c
>> > +++ b/mm/migrate_device.c
>> > @@ -60,6 +60,8 @@ static int migrate_vma_collect_pmd(pmd_t *pmdp,
>> >   struct mm_walk *walk)
>> >  {
>> >struct migrate_vma *migrate = walk->private;
>> > +  struct folio *fault_folio = migrate->fault_page ?
>> > +  page_folio(migrate->fault_page) : NULL;
>> >struct vm_area_struct *vma = walk->vma;
>> >struct mm_struct *mm = vma->vm_mm;
>> >unsigned long addr = start, unmapped = 0;
>> > @@ -88,11 +90,13 @@ static int migrate_vma_collect_pmd(pmd_t *pmdp,
>> >  
>> >folio_get(folio);
>> >spin_unlock(ptl);
>> > -  if (unlikely(!folio_trylock(folio)))
>> > +  if (unlikely(fault_folio != folio &&
>> > +   !folio_trylock(folio)))
>> >return migrate_vma_collect_skip(start, end,
>> >walk);
>> >ret = split_folio(folio);
>> > -  folio_unlock(folio);
>> > +  if (fault_folio != folio)
>> > +  folio_unlock(folio);
>> >folio_put(folio);
>> >if (ret)
>> >return migrate_vma_collect_skip(start, end,
>> > @@ -192,7 +196,7 @@ static int migrate_vma_collect_pmd(pmd_t *pmdp,
>> > * optimisation to avoid walking the rmap later with
>> 

Re: [PATCH v2 03/29] mm/migrate: Trylock device page in do_swap_page

2024-10-15 Thread Matthew Brost
On Wed, Oct 16, 2024 at 03:00:08PM +1100, Alistair Popple wrote:
> 
> Matthew Brost  writes:
> 
> > Avoid multiple CPU page faults to the same device page racing by trying
> > to lock the page in do_swap_page before taking an extra reference to the
> > page. This prevents scenarios where multiple CPU page faults each take
> > an extra reference to a device page, which could abort migration in
> > folio_migrate_mapping. With the device page being locked in
> > do_swap_page, the migrate_vma_* functions need to be updated to avoid
> > locking the fault_page argument.
> >
> > Prior to this change, a livelock scenario could occur in Xe's (Intel GPU
> > DRM driver) SVM implementation if enough threads faulted the same device
> > page.
> >
> > Cc: Philip Yang 
> > Cc: Felix Kuehling 
> > Cc: Christian König 
> > Cc: Andrew Morton 
> > Suggessted-by: Simona Vetter 
> > Signed-off-by: Matthew Brost 
> > ---
> >  mm/memory.c | 13 ++---
> >  mm/migrate_device.c | 69 ++---
> >  2 files changed, 56 insertions(+), 26 deletions(-)
> >
> > diff --git a/mm/memory.c b/mm/memory.c
> > index 2366578015ad..b72bde782611 100644
> > --- a/mm/memory.c
> > +++ b/mm/memory.c
> > @@ -4252,10 +4252,15 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
> >  * Get a page reference while we know the page can't be
> >  * freed.
> >  */
> > -   get_page(vmf->page);
> > -   pte_unmap_unlock(vmf->pte, vmf->ptl);
> > -   ret = vmf->page->pgmap->ops->migrate_to_ram(vmf);
> > -   put_page(vmf->page);
> > +   if (trylock_page(vmf->page)) {
> > +   get_page(vmf->page);
> > +   pte_unmap_unlock(vmf->pte, vmf->ptl);
> > +   ret = 
> > vmf->page->pgmap->ops->migrate_to_ram(vmf);
> > +   put_page(vmf->page);
> > +   unlock_page(vmf->page);
> 
> I don't think my previous review of this change has really been
> addressed. Why don't we just install the migration entry here? Seems
> like it would be a much simpler way of solving this.
> 

I should have mentioned this in the cover letter: I haven't gotten around
to trying that out yet. I included this existing version for correctness,
but I also think it is not strictly required to merge this series, as
our locking in migrate_to_ram only relies on the core MM locks, so
some thread would eventually win the race and make forward progress.

So I guess just ignore this patch and will send an updated version
individually with installing a migration entry in do_swap_page. If for
some reason that doesn't work, I'll respond here explaining why.

Matt

> > +   } else {
> > +   pte_unmap_unlock(vmf->pte, vmf->ptl);
> > +   }
> > } else if (is_hwpoison_entry(entry)) {
> > ret = VM_FAULT_HWPOISON;
> > } else if (is_pte_marker_entry(entry)) {
> > diff --git a/mm/migrate_device.c b/mm/migrate_device.c
> > index f163c2131022..2477d39f57be 100644
> > --- a/mm/migrate_device.c
> > +++ b/mm/migrate_device.c
> > @@ -60,6 +60,8 @@ static int migrate_vma_collect_pmd(pmd_t *pmdp,
> >struct mm_walk *walk)
> >  {
> > struct migrate_vma *migrate = walk->private;
> > +   struct folio *fault_folio = migrate->fault_page ?
> > +   page_folio(migrate->fault_page) : NULL;
> > struct vm_area_struct *vma = walk->vma;
> > struct mm_struct *mm = vma->vm_mm;
> > unsigned long addr = start, unmapped = 0;
> > @@ -88,11 +90,13 @@ static int migrate_vma_collect_pmd(pmd_t *pmdp,
> >  
> > folio_get(folio);
> > spin_unlock(ptl);
> > -   if (unlikely(!folio_trylock(folio)))
> > +   if (unlikely(fault_folio != folio &&
> > +!folio_trylock(folio)))
> > return migrate_vma_collect_skip(start, end,
> > walk);
> > ret = split_folio(folio);
> > -   folio_unlock(folio);
> > +   if (fault_folio != folio)
> > +   folio_unlock(folio);
> > folio_put(folio);
> > if (ret)
> > return migrate_vma_collect_skip(start, end,
> > @@ -192,7 +196,7 @@ static int migrate_vma_collect_pmd(pmd_t *pmdp,
> >  * optimisation to avoid walking the rmap later with
> >  * try_to_migrate().
> >  */
> > -   if (folio_trylock(folio)) {
> > +   if (fault_folio == folio || folio_trylock(folio)) {
> > bool anon_exclusive;
> > pte_t swp_pte;
> >  
> > @@ -204,7 +208,8 @@ static int migrate_vma_collect_pmd(pmd_t *pmdp

Re: [PATCH v2 03/29] mm/migrate: Trylock device page in do_swap_page

2024-10-15 Thread Alistair Popple


Matthew Brost  writes:

> Avoid multiple CPU page faults to the same device page racing by trying
> to lock the page in do_swap_page before taking an extra reference to the
> page. This prevents scenarios where multiple CPU page faults each take
> an extra reference to a device page, which could abort migration in
> folio_migrate_mapping. With the device page being locked in
> do_swap_page, the migrate_vma_* functions need to be updated to avoid
> locking the fault_page argument.
>
> Prior to this change, a livelock scenario could occur in Xe's (Intel GPU
> DRM driver) SVM implementation if enough threads faulted the same device
> page.
>
> Cc: Philip Yang 
> Cc: Felix Kuehling 
> Cc: Christian König 
> Cc: Andrew Morton 
> Suggested-by: Simona Vetter 
> Signed-off-by: Matthew Brost 
> ---
>  mm/memory.c | 13 ++---
>  mm/migrate_device.c | 69 ++---
>  2 files changed, 56 insertions(+), 26 deletions(-)
>
> diff --git a/mm/memory.c b/mm/memory.c
> index 2366578015ad..b72bde782611 100644
> --- a/mm/memory.c
> +++ b/mm/memory.c
> @@ -4252,10 +4252,15 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
>* Get a page reference while we know the page can't be
>* freed.
>*/
> - get_page(vmf->page);
> - pte_unmap_unlock(vmf->pte, vmf->ptl);
> - ret = vmf->page->pgmap->ops->migrate_to_ram(vmf);
> - put_page(vmf->page);
> + if (trylock_page(vmf->page)) {
> + get_page(vmf->page);
> + pte_unmap_unlock(vmf->pte, vmf->ptl);
> + ret = 
> vmf->page->pgmap->ops->migrate_to_ram(vmf);
> + put_page(vmf->page);
> + unlock_page(vmf->page);

I don't think my previous review of this change has really been
addressed. Why don't we just install the migration entry here? Seems
like it would be a much simpler way of solving this.

> + } else {
> + pte_unmap_unlock(vmf->pte, vmf->ptl);
> + }
>   } else if (is_hwpoison_entry(entry)) {
>   ret = VM_FAULT_HWPOISON;
>   } else if (is_pte_marker_entry(entry)) {
> diff --git a/mm/migrate_device.c b/mm/migrate_device.c
> index f163c2131022..2477d39f57be 100644
> --- a/mm/migrate_device.c
> +++ b/mm/migrate_device.c
> @@ -60,6 +60,8 @@ static int migrate_vma_collect_pmd(pmd_t *pmdp,
>  struct mm_walk *walk)
>  {
>   struct migrate_vma *migrate = walk->private;
> + struct folio *fault_folio = migrate->fault_page ?
> + page_folio(migrate->fault_page) : NULL;
>   struct vm_area_struct *vma = walk->vma;
>   struct mm_struct *mm = vma->vm_mm;
>   unsigned long addr = start, unmapped = 0;
> @@ -88,11 +90,13 @@ static int migrate_vma_collect_pmd(pmd_t *pmdp,
>  
>   folio_get(folio);
>   spin_unlock(ptl);
> - if (unlikely(!folio_trylock(folio)))
> + if (unlikely(fault_folio != folio &&
> +  !folio_trylock(folio)))
>   return migrate_vma_collect_skip(start, end,
>   walk);
>   ret = split_folio(folio);
> - folio_unlock(folio);
> + if (fault_folio != folio)
> + folio_unlock(folio);
>   folio_put(folio);
>   if (ret)
>   return migrate_vma_collect_skip(start, end,
> @@ -192,7 +196,7 @@ static int migrate_vma_collect_pmd(pmd_t *pmdp,
>* optimisation to avoid walking the rmap later with
>* try_to_migrate().
>*/
> - if (folio_trylock(folio)) {
> + if (fault_folio == folio || folio_trylock(folio)) {
>   bool anon_exclusive;
>   pte_t swp_pte;
>  
> @@ -204,7 +208,8 @@ static int migrate_vma_collect_pmd(pmd_t *pmdp,
>  
>   if (folio_try_share_anon_rmap_pte(folio, page)) 
> {
>   set_pte_at(mm, addr, ptep, pte);
> - folio_unlock(folio);
> + if (fault_folio != folio)
> + folio_unlock(folio);
>   folio_put(folio);
>   mpfn = 0;
>   goto next;
> @@ -363,6 +368,8 @@ static unsigned long migrate_device_unmap(unsigned long 
> *src_pfns,
> unsigned long npages,
> str