On 5/27/26 18:29, Natalie Vock wrote:
> The "moved" VM state is a bit unfortunately named, because BOs can end
> up in this state without being physically moved. While we need to
> invalidate every mapping when BOs are physically moved, in some other
> cases like PRT binds/unbinds there is no need to refresh mappings except
> those affected by the bind.
>
> Full invalidation of all BO mappings manifested as severe regressions in
> PRT bind performance, which this patch fixes. The offending patch is
> 53f0235c0284 ("drm/amdgpu: restructure VM state machine v4") in the
> amd-staging-drm-next tree, although it has not yet propagated anywhere
> else.
Thanks a lot for nailing this down, but that was actually one of the bugs I was
trying to fix with this.
>
> Signed-off-by: Natalie Vock <[email protected]>
> ---
> drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c | 23 +++++++++++++----------
> 1 file changed, 13 insertions(+), 10 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
> index beaf0aef6f474..969716b3e67e4 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
> @@ -214,11 +214,13 @@ static void amdgpu_vm_bo_evicted(struct
> amdgpu_vm_bo_base *vm_bo)
> * amdgpu_vm_bo_moved - vm_bo is moved
> *
> * @vm_bo: vm_bo which is moved
> + * @moved: true if the BO physically changed locations, i.e. all previous
> + * mappings are invalid
> *
> * State for vm_bo objects meaning the underlying BO was moved but the new
> * location not yet reflected in the page tables.
> */
> -static void amdgpu_vm_bo_moved(struct amdgpu_vm_bo_base *vm_bo)
> +static void amdgpu_vm_bo_moved(struct amdgpu_vm_bo_base *vm_bo, bool moved)
Just move setting the moved flag out of this function.
> {
> struct amdgpu_vm_bo_status *lists;
> struct amdgpu_bo *bo = vm_bo->bo;
> @@ -232,7 +234,8 @@ static void amdgpu_vm_bo_moved(struct amdgpu_vm_bo_base
> *vm_bo)
> vm_bo->moved = false;
> list_move(&vm_bo->vm_status, &lists->idle);
> } else {
> - vm_bo->moved = true;amdgpu_vm_validate
> + if (moved)
> + vm_bo->moved = true;
> list_move(&vm_bo->vm_status, &lists->moved);
> }
> amdgpu_vm_bo_unlock_lists(vm_bo);
> @@ -425,7 +428,7 @@ void amdgpu_vm_bo_base_init(struct amdgpu_vm_bo_base
> *base,
> */
> if (bo->preferred_domains &
> amdgpu_mem_type_to_domain(bo->tbo.resource->mem_type))
> - amdgpu_vm_bo_moved(base);
> + amdgpu_vm_bo_moved(base, true);
> else
> amdgpu_vm_bo_evicted(base);
> }
> @@ -597,7 +600,7 @@ int amdgpu_vm_validate(struct amdgpu_device *adev, struct
> amdgpu_vm *vm,
> return r;
>
> vm->update_funcs->map_table(to_amdgpu_bo_vm(bo_base->bo));
> - amdgpu_vm_bo_moved(bo_base);
> + amdgpu_vm_bo_moved(bo_base, false);
That one and all other in amdgpu_vm_validate() look questionable to me.
When the buffer was validated (e.g. physically moved) the flag should already
be set, but setting it again should be harmless in most cases.
> }
>
> /*
> @@ -614,7 +617,7 @@ int amdgpu_vm_validate(struct amdgpu_device *adev, struct
> amdgpu_vm *vm,
> if (r)
> return r;
>
> - amdgpu_vm_bo_moved(bo_base);
> + amdgpu_vm_bo_moved(bo_base, false);
> }
>
> if (!ticket)
> @@ -634,7 +637,7 @@ int amdgpu_vm_validate(struct amdgpu_device *adev, struct
> amdgpu_vm *vm,
> if (r)
> return r;
>
> - amdgpu_vm_bo_moved(bo_base);
> + amdgpu_vm_bo_moved(bo_base, false);
>
> /* It's a bit inefficient to always jump back to the start, but
> * we would need to re-structure the KFD for properly fixing
> @@ -1782,7 +1785,7 @@ static void amdgpu_vm_bo_insert_map(struct
> amdgpu_device *adev,
> amdgpu_vm_prt_get(adev);
>
> if (amdgpu_vm_is_bo_always_valid(vm, bo) && !bo_va->base.moved)
> - amdgpu_vm_bo_moved(&bo_va->base);
> + amdgpu_vm_bo_moved(&bo_va->base, false);
This is probably the one which really kills you.
Thanks,
Christian.
>
> trace_amdgpu_vm_bo_map(bo_va, mapping);
> }
> @@ -2091,7 +2094,7 @@ int amdgpu_vm_bo_clear_mappings(struct amdgpu_device
> *adev,
>
> if (amdgpu_vm_is_bo_always_valid(vm, bo) &&
> !before->bo_va->base.moved)
> - amdgpu_vm_bo_moved(&before->bo_va->base);
> + amdgpu_vm_bo_moved(&before->bo_va->base, false);
> } else {
> kfree(before);
> }
> @@ -2106,7 +2109,7 @@ int amdgpu_vm_bo_clear_mappings(struct amdgpu_device
> *adev,
>
> if (amdgpu_vm_is_bo_always_valid(vm, bo) &&
> !after->bo_va->base.moved)
> - amdgpu_vm_bo_moved(&after->bo_va->base);
> + amdgpu_vm_bo_moved(&after->bo_va->base, false);
> } else {
> kfree(after);
> }
> @@ -2280,7 +2283,7 @@ void amdgpu_vm_bo_invalidate(struct amdgpu_bo *bo, bool
> evicted)
>
> if (bo_base->moved)
> continue;
> - amdgpu_vm_bo_moved(bo_base);
> + amdgpu_vm_bo_moved(bo_base, true);
> }
> }
>