Re: [PATCH v3 4/4] drm/xe/userptr: Defer Waiting for TLB invalidation to the second pass if possible
On Tue, Mar 03, 2026 at 02:34:09PM +0100, Thomas Hellström wrote:
> Now that the two-pass notifier flow uses xe_vma_userptr_do_inval() for
> the fence-wait + TLB-invalidate work, extend it to support a further
> deferred TLB wait:
>
> - xe_vma_userptr_do_inval(): when the embedded finish handle is free,
> submit the TLB invalidation asynchronously (xe_vm_invalidate_vma_submit)
> and return &userptr->finish so the mmu_notifier core schedules a third
> pass. When the handle is occupied by a concurrent invalidation, fall
> back to the synchronous xe_vm_invalidate_vma() path.
>
> - xe_vma_userptr_complete_tlb_inval(): new helper called from
> invalidate_finish when tlb_inval_submitted is set. Waits for the
> previously submitted batch and unmaps the gpusvm pages.
>
> xe_vma_userptr_invalidate_finish() dispatches between the two helpers
> via tlb_inval_submitted, making the three possible flows explicit:
>
> pass1 (fences pending) -> invalidate_finish -> do_inval (sync TLB)
> pass1 (fences done) -> do_inval -> invalidate_finish
> -> complete_tlb_inval (deferred TLB)
> pass1 (finish occupied) -> do_inval (sync TLB, inline)
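
The dispatch that makes these flows explicit boils down to roughly the
following (my paraphrase for readers skimming the series, not the
literal hunk; I'm assuming vm/uvma are recovered from the embedded
finish handle, e.g. via container_of()):

	if (uvma->userptr.tlb_inval_submitted)
		/* Pass 1 already submitted the TLB inval: wait + unmap only. */
		xe_vma_userptr_complete_tlb_inval(vm, uvma);
	else
		/* Fences were still pending in pass 1: full synchronous path. */
		xe_vma_userptr_do_inval(vm, uvma, true);
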
>
> In multi-GPU scenarios this allows TLB flushes to be submitted on all
> GPUs in one pass before any of them are waited on.
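
As pseudocode over the notifier passes, the multi-GPU benefit is
(hypothetical for_each_gpu(), vma_on() and batch_of() helpers, purely
for illustration):

	/* earlier pass: submit on every GPU, block on none */
	for_each_gpu(gpu)
		xe_vm_invalidate_vma_submit(vma_on(gpu), batch_of(gpu));

	/* deferred finish pass: the waits now overlap across GPUs */
	for_each_gpu(gpu)
		xe_tlb_inval_batch_wait(batch_of(gpu));
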
>
> Also adds xe_vm_invalidate_vma_submit() which submits the TLB range
> invalidation without blocking, populating a xe_tlb_inval_batch that
> the caller waits on separately.
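
So a caller pairs the non-blocking submit with a later wait, along
these lines (sketch; assumes the batch memory stays valid until the
wait, which the embedded userptr->inval_batch provides here):

	err = xe_vm_invalidate_vma_submit(vma, &userptr->inval_batch);
	if (!err) {
		/* ...submit further invalidations, do other work... */
		xe_tlb_inval_batch_wait(&userptr->inval_batch);
	}
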
>
> v3:
> - Add locking asserts and notifier state asserts (Matt Brost)
> - Update the locking documentation of the notifier
> state members (Matt Brost)
> - Remove unrelated code formatting changes (Matt Brost)
>
> Assisted-by: GitHub Copilot:claude-sonnet-4.6
> Signed-off-by: Thomas Hellström
Reviewed-by: Matthew Brost
> ---
>  drivers/gpu/drm/xe/xe_userptr.c | 63 -
>  drivers/gpu/drm/xe/xe_userptr.h | 17 +
>  drivers/gpu/drm/xe/xe_vm.c      | 38 +++-
>  drivers/gpu/drm/xe/xe_vm.h      |  2 ++
>  4 files changed, 104 insertions(+), 16 deletions(-)
>
> diff --git a/drivers/gpu/drm/xe/xe_userptr.c b/drivers/gpu/drm/xe/xe_userptr.c
> index 37032b8125a6..6761005c0b90 100644
> --- a/drivers/gpu/drm/xe/xe_userptr.c
> +++ b/drivers/gpu/drm/xe/xe_userptr.c
> @@ -8,6 +8,7 @@
>
> #include
>
> +#include "xe_tlb_inval.h"
> #include "xe_trace_bo.h"
>
> static void xe_userptr_assert_in_notifier(struct xe_vm *vm)
> @@ -81,8 +82,8 @@ int xe_vma_userptr_pin_pages(struct xe_userptr_vma *uvma)
>  				    &ctx);
>  }
>  
> -static void xe_vma_userptr_do_inval(struct xe_vm *vm, struct xe_userptr_vma *uvma,
> -				    bool is_deferred)
> +static struct mmu_interval_notifier_finish *
> +xe_vma_userptr_do_inval(struct xe_vm *vm, struct xe_userptr_vma *uvma, bool is_deferred)
>  {
>  	struct xe_userptr *userptr = &uvma->userptr;
>  	struct xe_vma *vma = &uvma->vma;
> @@ -93,6 +94,8 @@ static void xe_vma_userptr_do_inval(struct xe_vm *vm, struct xe_userptr_vma *uvm
>  	long err;
>  
>  	xe_userptr_assert_in_notifier(vm);
> +	if (is_deferred)
> +		xe_assert(vm->xe, userptr->finish_inuse && !userptr->tlb_inval_submitted);
>  
>  	err = dma_resv_wait_timeout(xe_vm_resv(vm),
>  				    DMA_RESV_USAGE_BOOKKEEP,
> @@ -100,6 +103,19 @@ static void xe_vma_userptr_do_inval(struct xe_vm *vm, struct xe_userptr_vma *uvm
>  	XE_WARN_ON(err <= 0);
>  
>  	if (xe_vm_in_fault_mode(vm) && userptr->initial_bind) {
> +		if (!userptr->finish_inuse) {
> +			/*
> +			 * Defer the TLB wait to an extra pass so the caller
> +			 * can pipeline TLB flushes across GPUs before waiting
> +			 * on any of them.
> +			 */
> +			xe_assert(vm->xe, !userptr->tlb_inval_submitted);
> +			userptr->finish_inuse = true;
> +			userptr->tlb_inval_submitted = true;
> +			err = xe_vm_invalidate_vma_submit(vma, &userptr->inval_batch);
> +			XE_WARN_ON(err);
> +			return &userptr->finish;
> +		}
>  		err = xe_vm_invalidate_vma(vma);
>  		XE_WARN_ON(err);
>  	}
> @@ -108,6 +124,28 @@ static void xe_vma_userptr_do_inval(struct xe_vm *vm, struct xe_userptr_vma *uvm
>  	userptr->finish_inuse = false;
>  	drm_gpusvm_unmap_pages(&vm->svm.gpusvm, &uvma->userptr.pages,
>  			       xe_vma_size(vma) >> PAGE_SHIFT, &ctx);
> +	return NULL;
> +}
> +
> +static void
> +xe_vma_userptr_complete_tlb_inval(struct xe_vm *vm, struct xe_userptr_vma *uvma)
> +{
> +	struct xe_userptr *userptr = &uvma->userptr;
> +	struct xe_vma *vma = &uvma->vma;
> +	struct drm_gpusvm_ctx ctx = {
> +		.in_notifier = true,
> +		.read_only = xe_vma_read_only(vma),
> +	};
> +
> +	xe_userptr_assert_in_notifier(vm);
> +	xe_assert(vm->xe, userptr->finish_inuse);
> +	xe_assert(vm->xe, userptr->tlb_inval_submitted);
> +
> +	xe_tlb_inval_batch_wait(&userptr->inval_batch);