userptr: Defer Waiting for TLB invalidation to the second pass if possible

Matthew Brost Mon, 02 Mar 2026 11:15:44 -0800

On Mon, Mar 02, 2026 at 05:32:48PM +0100, Thomas Hellström wrote:
> Now that the two-pass notifier flow uses xe_vma_userptr_do_inval() for
> the fence-wait + TLB-invalidate work, extend it to support a further
> deferred TLB wait:
> 
> - xe_vma_userptr_do_inval(): when the embedded finish handle is free,
>   submit the TLB invalidation asynchronously (xe_vm_invalidate_vma_submit)
>   and return &userptr->finish so the mmu_notifier core schedules a third
>   pass.  When the handle is occupied by a concurrent invalidation, fall
>   back to the synchronous xe_vm_invalidate_vma() path.
> 
> - xe_vma_userptr_complete_tlb_inval(): new helper called from
>   invalidate_finish when tlb_inval_submitted is set.  Waits for the
>   previously submitted batch and unmaps the gpusvm pages.
> 
> xe_vma_userptr_invalidate_finish() dispatches between the two helpers
> via tlb_inval_submitted, making the three possible flows explicit:
> 
>   pass1 (fences pending)  -> invalidate_finish -> do_inval (sync TLB)
>   pass1 (fences done)     -> do_inval -> invalidate_finish
>                           -> complete_tlb_inval (deferred TLB)
>   pass1 (finish occupied) -> do_inval (sync TLB, inline)
> 
> In multi-GPU scenarios this allows TLB flushes to be submitted on all
> GPUs in one pass before any of them are waited on.
> 
> Also adds xe_vm_invalidate_vma_submit() which submits the TLB range
> invalidation without blocking, populating a xe_tlb_inval_batch that
> the caller waits on separately.
>


As suggested in patch #2, maybe squash this into patch #2 as some of
patch #2 is immediately tweaked / rewritten here. 

A couple nits.

> Assisted-by: GitHub Copilot:claude-sonnet-4.6
> Signed-off-by: Thomas Hellström <[email protected]>
> ---
>  drivers/gpu/drm/xe/xe_userptr.c | 60 +++++++++++++++++++++++++++------
>  drivers/gpu/drm/xe/xe_userptr.h | 18 ++++++++++
>  drivers/gpu/drm/xe/xe_vm.c      | 38 ++++++++++++++++-----
>  drivers/gpu/drm/xe/xe_vm.h      |  2 ++
>  4 files changed, 99 insertions(+), 19 deletions(-)
> 
> diff --git a/drivers/gpu/drm/xe/xe_userptr.c b/drivers/gpu/drm/xe/xe_userptr.c
> index 440b0a79d16f..a62b796afb93 100644
> --- a/drivers/gpu/drm/xe/xe_userptr.c
> +++ b/drivers/gpu/drm/xe/xe_userptr.c
> @@ -8,6 +8,7 @@
>  
>  #include <linux/mm.h>
>  
> +#include "xe_tlb_inval.h"
>  #include "xe_trace_bo.h"
>  
>  /**
> @@ -73,8 +74,8 @@ int xe_vma_userptr_pin_pages(struct xe_userptr_vma *uvma)
>                                   &ctx);
>  }
>  
> -static void xe_vma_userptr_do_inval(struct xe_vm *vm, struct xe_userptr_vma 
> *uvma,
> -                                 bool is_deferred)
> +static struct mmu_interval_notifier_finish *
> +xe_vma_userptr_do_inval(struct xe_vm *vm, struct xe_userptr_vma *uvma, bool 
> is_deferred)
>  {
>       struct xe_userptr *userptr = &uvma->userptr;
>       struct xe_vma *vma = &uvma->vma;
> @@ -84,12 +85,23 @@ static void xe_vma_userptr_do_inval(struct xe_vm *vm, 
> struct xe_userptr_vma *uvm
>       };
>       long err;
>  
> -     err = dma_resv_wait_timeout(xe_vm_resv(vm),
> -                                 DMA_RESV_USAGE_BOOKKEEP,
> +     err = dma_resv_wait_timeout(xe_vm_resv(vm), DMA_RESV_USAGE_BOOKKEEP,
>                                   false, MAX_SCHEDULE_TIMEOUT);

Unrelated.

>       XE_WARN_ON(err <= 0);
>  
>       if (xe_vm_in_fault_mode(vm) && userptr->initial_bind) {
> +             if (!userptr->finish_inuse) {

Since this is state machiney - should we have asserts on state? That
typically the approach I take when I write stae machiney code. Self
documenting plus immediatelt catches misuse.

So here an example would be:

xe_assert(.., !userptr->tlb_inval_submitted);
 
> +                     /*
> +                      * Defer the TLB wait to an extra pass so the caller
> +                      * can pipeline TLB flushes across GPUs before waiting
> +                      * on any of them.
> +                      */
> +                     userptr->finish_inuse = true;
> +                     userptr->tlb_inval_submitted = true;
> +                     err = xe_vm_invalidate_vma_submit(vma, 
> &userptr->inval_batch);
> +                     XE_WARN_ON(err);
> +                     return &userptr->finish;
> +             }
>               err = xe_vm_invalidate_vma(vma);
>               XE_WARN_ON(err);
>       }
> @@ -98,6 +110,24 @@ static void xe_vma_userptr_do_inval(struct xe_vm *vm, 
> struct xe_userptr_vma *uvm
>               userptr->finish_inuse = false;
>       drm_gpusvm_unmap_pages(&vm->svm.gpusvm, &uvma->userptr.pages,
>                              xe_vma_size(vma) >> PAGE_SHIFT, &ctx);
> +     return NULL;
> +}
> +
> +static void
> +xe_vma_userptr_complete_tlb_inval(struct xe_vm *vm, struct xe_userptr_vma 
> *uvma)
> +{
> +     struct xe_userptr *userptr = &uvma->userptr;
> +     struct xe_vma *vma = &uvma->vma;
> +     struct drm_gpusvm_ctx ctx = {
> +             .in_notifier = true,
> +             .read_only = xe_vma_read_only(vma),
> +     };
> +

xe_svm_assert_in_notifier();

State machine asserts could be:

xe_assert(..., userptr->tlb_inval_submitted);
xe_assert(..., userptr->finish_inuse);

> +     xe_tlb_inval_batch_wait(&userptr->inval_batch);
> +     userptr->tlb_inval_submitted = false;
> +     userptr->finish_inuse = false;
> +     drm_gpusvm_unmap_pages(&vm->svm.gpusvm, &uvma->userptr.pages,
> +                            xe_vma_size(vma) >> PAGE_SHIFT, &ctx);
>  }
>  
>  static struct mmu_interval_notifier_finish *
> @@ -141,13 +171,11 @@ xe_vma_userptr_invalidate_pass1(struct xe_vm *vm, 
> struct xe_userptr_vma *uvma)
>        * If it's already in use, or all fences are already signaled,
>        * proceed directly to invalidation without deferring.
>        */
> -     if (signaled || userptr->finish_inuse) {
> -             xe_vma_userptr_do_inval(vm, uvma, false);
> -             return NULL;
> -     }
> +     if (signaled || userptr->finish_inuse)
> +             return xe_vma_userptr_do_inval(vm, uvma, false);
>  
> +     /* Defer: the notifier core will call invalidate_finish once done. */
>       userptr->finish_inuse = true;
> -

Unrelated.

>       return &userptr->finish;
>  }
>  
> @@ -193,7 +221,15 @@ static void xe_vma_userptr_invalidate_finish(struct 
> mmu_interval_notifier_finish
>               xe_vma_start(vma), xe_vma_size(vma));
>  
>       down_write(&vm->svm.gpusvm.notifier_lock);
> -     xe_vma_userptr_do_inval(vm, uvma, true);
> +     /*
> +      * If a TLB invalidation was previously submitted (deferred from the
> +      * synchronous pass1 fallback), wait for it and unmap pages.
> +      * Otherwise, fences have now completed: invalidate the TLB and unmap.
> +      */
> +     if (uvma->userptr.tlb_inval_submitted)
> +             xe_vma_userptr_complete_tlb_inval(vm, uvma);
> +     else
> +             xe_vma_userptr_do_inval(vm, uvma, true);
>       up_write(&vm->svm.gpusvm.notifier_lock);
>       trace_xe_vma_userptr_invalidate_complete(vma);
>  }
> @@ -231,7 +267,9 @@ void xe_vma_userptr_force_invalidate(struct 
> xe_userptr_vma *uvma)
>  
>       finish = xe_vma_userptr_invalidate_pass1(vm, uvma);
>       if (finish)
> -             xe_vma_userptr_do_inval(vm, uvma, true);
> +             finish = xe_vma_userptr_do_inval(vm, uvma, true);
> +     if (finish)
> +             xe_vma_userptr_complete_tlb_inval(vm, uvma);
>  }
>  #endif
>  
> diff --git a/drivers/gpu/drm/xe/xe_userptr.h b/drivers/gpu/drm/xe/xe_userptr.h
> index 4f42db61fd62..7477009651c2 100644
> --- a/drivers/gpu/drm/xe/xe_userptr.h
> +++ b/drivers/gpu/drm/xe/xe_userptr.h
> @@ -14,6 +14,8 @@
>  
>  #include <drm/drm_gpusvm.h>
>  
> +#include "xe_tlb_inval_types.h"
> +
>  struct xe_vm;
>  struct xe_vma;
>  struct xe_userptr_vma;
> @@ -63,6 +65,15 @@ struct xe_userptr {
>        * Protected by @vm::svm.gpusvm.notifier_lock.
>        */
>       struct mmu_interval_notifier_finish finish;
> +
> +     /**
> +      * @inval_batch: TLB invalidation batch for deferred completion.
> +      * Stores an in-flight TLB invalidation submitted during a two-pass
> +      * notifier so the wait can be deferred to a subsequent pass, allowing
> +      * multiple GPUs to be signalled before any of them are waited on.
> +      * Protected by @vm::svm.gpusvm.notifier_lock.

In write mode?

> +      */
> +     struct xe_tlb_inval_batch inval_batch;
>       /**
>        * @finish_inuse: Whether @finish is currently in use by an in-progress
>        * two-pass invalidation.
> @@ -70,6 +81,13 @@ struct xe_userptr {
>        */
>       bool finish_inuse;
>  
> +     /**
> +      * @tlb_inval_submitted: Whether a TLB invalidation has been submitted
> +      * via @inval_batch and is pending completion.  When set, the next pass
> +      * must call xe_tlb_inval_batch_wait() before reusing @inval_batch.
> +      * Protected by @vm::svm.gpusvm.notifier_lock.

In write mode?

Matt

> +      */
> +     bool tlb_inval_submitted;
>       /**
>        * @initial_bind: user pointer has been bound at least once.
>        * write: vm->svm.gpusvm.notifier_lock in read mode and vm->resv held.
> diff --git a/drivers/gpu/drm/xe/xe_vm.c b/drivers/gpu/drm/xe/xe_vm.c
> index 7f29d2b2972d..fdad9329dfb4 100644
> --- a/drivers/gpu/drm/xe/xe_vm.c
> +++ b/drivers/gpu/drm/xe/xe_vm.c
> @@ -3967,20 +3967,23 @@ void xe_vm_unlock(struct xe_vm *vm)
>  }
>  
>  /**
> - * xe_vm_invalidate_vma - invalidate GPU mappings for VMA without a lock
> + * xe_vm_invalidate_vma_submit - Submit a job to invalidate GPU mappings for
> + * VMA.
>   * @vma: VMA to invalidate
> + * @batch: TLB invalidation batch to populate; caller must later call
> + *         xe_tlb_inval_batch_wait() on it to wait for completion
>   *
>   * Walks a list of page tables leaves which it memset the entries owned by 
> this
> - * VMA to zero, invalidates the TLBs, and block until TLBs invalidation is
> - * complete.
> + * VMA to zero, invalidates the TLBs, but doesn't block waiting for TLB flush
> + * to complete, but instead populates @batch which can be waited on using
> + * xe_tlb_inval_batch_wait().
>   *
>   * Returns 0 for success, negative error code otherwise.
>   */
> -int xe_vm_invalidate_vma(struct xe_vma *vma)
> +int xe_vm_invalidate_vma_submit(struct xe_vma *vma, struct 
> xe_tlb_inval_batch *batch)
>  {
>       struct xe_device *xe = xe_vma_vm(vma)->xe;
>       struct xe_vm *vm = xe_vma_vm(vma);
> -     struct xe_tlb_inval_batch _batch;
>       struct xe_tile *tile;
>       u8 tile_mask = 0;
>       int ret = 0;
> @@ -4023,14 +4026,33 @@ int xe_vm_invalidate_vma(struct xe_vma *vma)
>  
>       ret = xe_tlb_inval_range_tilemask_submit(xe, xe_vma_vm(vma)->usm.asid,
>                                                xe_vma_start(vma), 
> xe_vma_end(vma),
> -                                              tile_mask, &_batch);
> +                                              tile_mask, batch);
>  
>       /* WRITE_ONCE pairs with READ_ONCE in xe_vm_has_valid_gpu_mapping() */
>       WRITE_ONCE(vma->tile_invalidated, vma->tile_mask);
> +     return ret;
> +}
> +
> +/**
> + * xe_vm_invalidate_vma - invalidate GPU mappings for VMA without a lock
> + * @vma: VMA to invalidate
> + *
> + * Walks a list of page tables leaves which it memset the entries owned by 
> this
> + * VMA to zero, invalidates the TLBs, and block until TLBs invalidation is
> + * complete.
> + *
> + * Returns 0 for success, negative error code otherwise.
> + */
> +int xe_vm_invalidate_vma(struct xe_vma *vma)
> +{
> +     struct xe_tlb_inval_batch batch;
> +     int ret;
>  
> -     if (!ret)
> -             xe_tlb_inval_batch_wait(&_batch);
> +     ret = xe_vm_invalidate_vma_submit(vma, &batch);
> +     if (ret)
> +             return ret;
>  
> +     xe_tlb_inval_batch_wait(&batch);
>       return ret;
>  }
>  
> diff --git a/drivers/gpu/drm/xe/xe_vm.h b/drivers/gpu/drm/xe/xe_vm.h
> index 62f4b6fec0bc..0bc7ed23eeae 100644
> --- a/drivers/gpu/drm/xe/xe_vm.h
> +++ b/drivers/gpu/drm/xe/xe_vm.h
> @@ -242,6 +242,8 @@ struct dma_fence *xe_vm_range_unbind(struct xe_vm *vm,
>  
>  int xe_vm_invalidate_vma(struct xe_vma *vma);
>  
> +int xe_vm_invalidate_vma_submit(struct xe_vma *vma, struct 
> xe_tlb_inval_batch *batch);
> +
>  int xe_vm_validate_protected(struct xe_vm *vm);
>  
>  static inline void xe_vm_queue_rebind_worker(struct xe_vm *vm)
> -- 
> 2.53.0
>

Re: [PATCH v2 4/4] drm/xe/userptr: Defer Waiting for TLB invalidation to the second pass if possible

Reply via email to