Re: [PATCH v3 4/4] drm/xe/userptr: Defer waiting for TLB invalidation to the second pass if possible

2026-03-03 Thread Matthew Brost
On Tue, Mar 03, 2026 at 02:34:09PM +0100, Thomas Hellström wrote:
> Now that the two-pass notifier flow uses xe_vma_userptr_do_inval() for
> the fence-wait + TLB-invalidate work, extend it to support a further
> deferred TLB wait:
> 
> - xe_vma_userptr_do_inval(): when the embedded finish handle is free,
>   submit the TLB invalidation asynchronously (xe_vm_invalidate_vma_submit)
>   and return &userptr->finish so the mmu_notifier core schedules a third
>   pass.  When the handle is occupied by a concurrent invalidation, fall
>   back to the synchronous xe_vm_invalidate_vma() path.
> 
> - xe_vma_userptr_complete_tlb_inval(): new helper called from
>   invalidate_finish when tlb_inval_submitted is set.  Waits for the
>   previously submitted batch and unmaps the gpusvm pages.
> 
> xe_vma_userptr_invalidate_finish() dispatches between the two helpers
> via tlb_inval_submitted, making the three possible flows explicit:
> 
>   pass1 (fences pending)  -> invalidate_finish -> do_inval (sync TLB)
>   pass1 (fences done) -> do_inval -> invalidate_finish
>   -> complete_tlb_inval (deferred TLB)
>   pass1 (finish occupied) -> do_inval (sync TLB, inline)
> 
> In multi-GPU scenarios this allows TLB flushes to be submitted on all
> GPUs in one pass before any of them are waited on.
> 
> Also adds xe_vm_invalidate_vma_submit() which submits the TLB range
> invalidation without blocking, populating a xe_tlb_inval_batch that
> the caller waits on separately.
> 
> v3:
> - Add locking asserts and notifier state asserts (Matt Brost)
> - Update the locking documentation of the notifier
>   state members (Matt Brost)
> - Remove unrelated code formatting changes (Matt Brost)
> 
> Assisted-by: GitHub Copilot:claude-sonnet-4.6
> Signed-off-by: Thomas Hellström 

Reviewed-by: Matthew Brost 

> ---
>  drivers/gpu/drm/xe/xe_userptr.c | 63 -
>  drivers/gpu/drm/xe/xe_userptr.h | 17 +
>  drivers/gpu/drm/xe/xe_vm.c  | 38 +++-
>  drivers/gpu/drm/xe/xe_vm.h  |  2 ++
>  4 files changed, 104 insertions(+), 16 deletions(-)
> 
> diff --git a/drivers/gpu/drm/xe/xe_userptr.c b/drivers/gpu/drm/xe/xe_userptr.c
> index 37032b8125a6..6761005c0b90 100644
> --- a/drivers/gpu/drm/xe/xe_userptr.c
> +++ b/drivers/gpu/drm/xe/xe_userptr.c
> @@ -8,6 +8,7 @@
>  
>  #include 
>  
> +#include "xe_tlb_inval.h"
>  #include "xe_trace_bo.h"
>  
>  static void xe_userptr_assert_in_notifier(struct xe_vm *vm)
> @@ -81,8 +82,8 @@ int xe_vma_userptr_pin_pages(struct xe_userptr_vma *uvma)
>   &ctx);
>  }
>  
> -static void xe_vma_userptr_do_inval(struct xe_vm *vm, struct xe_userptr_vma *uvma,
> - bool is_deferred)
> +static struct mmu_interval_notifier_finish *
> +xe_vma_userptr_do_inval(struct xe_vm *vm, struct xe_userptr_vma *uvma, bool is_deferred)
>  {
>   struct xe_userptr *userptr = &uvma->userptr;
>   struct xe_vma *vma = &uvma->vma;
> @@ -93,6 +94,8 @@ static void xe_vma_userptr_do_inval(struct xe_vm *vm, struct xe_userptr_vma *uvm
>   long err;
>  
>   xe_userptr_assert_in_notifier(vm);
> + if (is_deferred)
> + xe_assert(vm->xe, userptr->finish_inuse && !userptr->tlb_inval_submitted);
>  
>   err = dma_resv_wait_timeout(xe_vm_resv(vm),
>   DMA_RESV_USAGE_BOOKKEEP,
> @@ -100,6 +103,19 @@ static void xe_vma_userptr_do_inval(struct xe_vm *vm, struct xe_userptr_vma *uvm
>   XE_WARN_ON(err <= 0);
>  
>   if (xe_vm_in_fault_mode(vm) && userptr->initial_bind) {
> + if (!userptr->finish_inuse) {
> + /*
> +  * Defer the TLB wait to an extra pass so the caller
> +  * can pipeline TLB flushes across GPUs before waiting
> +  * on any of them.
> +  */
> + xe_assert(vm->xe, !userptr->tlb_inval_submitted);
> + userptr->finish_inuse = true;
> + userptr->tlb_inval_submitted = true;
> + err = xe_vm_invalidate_vma_submit(vma, &userptr->inval_batch);
> + XE_WARN_ON(err);
> + return &userptr->finish;
> + }
>   err = xe_vm_invalidate_vma(vma);
>   XE_WARN_ON(err);
>   }
> @@ -108,6 +124,28 @@ static void xe_vma_userptr_do_inval(struct xe_vm *vm, struct xe_userptr_vma *uvm
>   userptr->finish_inuse = false;
>   drm_gpusvm_unmap_pages(&vm->svm.gpusvm, &uvma->userptr.pages,
>  xe_vma_size(vma) >> PAGE_SHIFT, &ctx);
> + return NULL;
> +}
> +
> +static void
> +xe_vma_userptr_complete_tlb_inval(struct xe_vm *vm, struct xe_userptr_vma *uvma)
> +{
> + struct xe_userptr *userptr = &uvma->userptr;
> + struct xe_vma *vma = &uvma->vma;
> + struct drm_gpusvm_ctx ctx = {
> + .in_notifier = true,
> +

[PATCH v3 4/4] drm/xe/userptr: Defer waiting for TLB invalidation to the second pass if possible

2026-03-03 Thread Thomas Hellström
Now that the two-pass notifier flow uses xe_vma_userptr_do_inval() for
the fence-wait + TLB-invalidate work, extend it to support a further
deferred TLB wait:

- xe_vma_userptr_do_inval(): when the embedded finish handle is free,
  submit the TLB invalidation asynchronously (xe_vm_invalidate_vma_submit)
  and return &userptr->finish so the mmu_notifier core schedules a third
  pass.  When the handle is occupied by a concurrent invalidation, fall
  back to the synchronous xe_vm_invalidate_vma() path.

- xe_vma_userptr_complete_tlb_inval(): new helper called from
  invalidate_finish when tlb_inval_submitted is set.  Waits for the
  previously submitted batch and unmaps the gpusvm pages.

xe_vma_userptr_invalidate_finish() dispatches between the two helpers
via tlb_inval_submitted, making the three possible flows explicit:

  pass1 (fences pending)  -> invalidate_finish -> do_inval (sync TLB)
  pass1 (fences done) -> do_inval -> invalidate_finish
  -> complete_tlb_inval (deferred TLB)
  pass1 (finish occupied) -> do_inval (sync TLB, inline)

In multi-GPU scenarios this allows TLB flushes to be submitted on all
GPUs in one pass before any of them are waited on.

Also adds xe_vm_invalidate_vma_submit() which submits the TLB range
invalidation without blocking, populating a xe_tlb_inval_batch that
the caller waits on separately.

v3:
- Add locking asserts and notifier state asserts (Matt Brost)
- Update the locking documentation of the notifier
  state members (Matt Brost)
- Remove unrelated code formatting changes (Matt Brost)

Assisted-by: GitHub Copilot:claude-sonnet-4.6
Signed-off-by: Thomas Hellström 
---
 drivers/gpu/drm/xe/xe_userptr.c | 63 -
 drivers/gpu/drm/xe/xe_userptr.h | 17 +
 drivers/gpu/drm/xe/xe_vm.c  | 38 +++-
 drivers/gpu/drm/xe/xe_vm.h  |  2 ++
 4 files changed, 104 insertions(+), 16 deletions(-)

diff --git a/drivers/gpu/drm/xe/xe_userptr.c b/drivers/gpu/drm/xe/xe_userptr.c
index 37032b8125a6..6761005c0b90 100644
--- a/drivers/gpu/drm/xe/xe_userptr.c
+++ b/drivers/gpu/drm/xe/xe_userptr.c
@@ -8,6 +8,7 @@
 
 #include 
 
+#include "xe_tlb_inval.h"
 #include "xe_trace_bo.h"
 
 static void xe_userptr_assert_in_notifier(struct xe_vm *vm)
@@ -81,8 +82,8 @@ int xe_vma_userptr_pin_pages(struct xe_userptr_vma *uvma)
&ctx);
 }
 
-static void xe_vma_userptr_do_inval(struct xe_vm *vm, struct xe_userptr_vma *uvma,
-   bool is_deferred)
+static struct mmu_interval_notifier_finish *
+xe_vma_userptr_do_inval(struct xe_vm *vm, struct xe_userptr_vma *uvma, bool is_deferred)
 {
struct xe_userptr *userptr = &uvma->userptr;
struct xe_vma *vma = &uvma->vma;
@@ -93,6 +94,8 @@ static void xe_vma_userptr_do_inval(struct xe_vm *vm, struct xe_userptr_vma *uvm
long err;
 
xe_userptr_assert_in_notifier(vm);
+   if (is_deferred)
+   xe_assert(vm->xe, userptr->finish_inuse && !userptr->tlb_inval_submitted);
 
err = dma_resv_wait_timeout(xe_vm_resv(vm),
DMA_RESV_USAGE_BOOKKEEP,
@@ -100,6 +103,19 @@ static void xe_vma_userptr_do_inval(struct xe_vm *vm, struct xe_userptr_vma *uvm
XE_WARN_ON(err <= 0);
 
if (xe_vm_in_fault_mode(vm) && userptr->initial_bind) {
+   if (!userptr->finish_inuse) {
+   /*
+* Defer the TLB wait to an extra pass so the caller
+* can pipeline TLB flushes across GPUs before waiting
+* on any of them.
+*/
+   xe_assert(vm->xe, !userptr->tlb_inval_submitted);
+   userptr->finish_inuse = true;
+   userptr->tlb_inval_submitted = true;
+   err = xe_vm_invalidate_vma_submit(vma, &userptr->inval_batch);
+   XE_WARN_ON(err);
+   return &userptr->finish;
+   }
err = xe_vm_invalidate_vma(vma);
XE_WARN_ON(err);
}
@@ -108,6 +124,28 @@ static void xe_vma_userptr_do_inval(struct xe_vm *vm, struct xe_userptr_vma *uvm
userptr->finish_inuse = false;
drm_gpusvm_unmap_pages(&vm->svm.gpusvm, &uvma->userptr.pages,
   xe_vma_size(vma) >> PAGE_SHIFT, &ctx);
+   return NULL;
+}
+
+static void
+xe_vma_userptr_complete_tlb_inval(struct xe_vm *vm, struct xe_userptr_vma *uvma)
+{
+   struct xe_userptr *userptr = &uvma->userptr;
+   struct xe_vma *vma = &uvma->vma;
+   struct drm_gpusvm_ctx ctx = {
+   .in_notifier = true,
+   .read_only = xe_vma_read_only(vma),
+   };
+
+   xe_userptr_assert_in_notifier(vm);
+   xe_assert(vm->xe, userptr->finish_inuse);
+   xe_assert(vm->xe, userptr->tlb_inval_submitted);
+
+   xe_tlb_inval_batch_wait(&userptr->inval_batch)