On 5/27/26 18:29, Natalie Vock wrote:
> The "moved" VM state is a bit unfortunately named, because BOs can end
> up in this state without being physically moved. While we need to
> invalidate every mapping when BOs are physically moved, in some other
> cases like PRT binds/unbinds there is no need to refresh mappings except
> those affected by the bind.
> 
> Full invalidation of all BO mappings manifested as severe regressions in
> PRT bind performance, which this patch fixes. The offending patch is
> 53f0235c0284 ("drm/amdgpu: restructure VM state machine v4") in the
> amd-staging-drm-next tree, although it has not yet propagated anywhere
> else.

Thanks a lot for nailing this down, but that was actually one of the bugs I was 
trying to fix with this.

> 
> Signed-off-by: Natalie Vock <[email protected]>
> ---
>  drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c | 23 +++++++++++++----------
>  1 file changed, 13 insertions(+), 10 deletions(-)
> 
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c 
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
> index beaf0aef6f474..969716b3e67e4 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
> @@ -214,11 +214,13 @@ static void amdgpu_vm_bo_evicted(struct 
> amdgpu_vm_bo_base *vm_bo)
>   * amdgpu_vm_bo_moved - vm_bo is moved
>   *
>   * @vm_bo: vm_bo which is moved
> + * @moved: true if the BO physically changed locations, i.e. all previous
> + *         mappings are invalid
>   *
>   * State for vm_bo objects meaning the underlying BO was moved but the new
>   * location not yet reflected in the page tables.
>   */
> -static void amdgpu_vm_bo_moved(struct amdgpu_vm_bo_base *vm_bo)
> +static void amdgpu_vm_bo_moved(struct amdgpu_vm_bo_base *vm_bo, bool moved)

Just move setting the moved flag out of this function.

>  {
>       struct amdgpu_vm_bo_status *lists;
>       struct amdgpu_bo *bo = vm_bo->bo;
> @@ -232,7 +234,8 @@ static void amdgpu_vm_bo_moved(struct amdgpu_vm_bo_base 
> *vm_bo)
>               vm_bo->moved = false;
>               list_move(&vm_bo->vm_status, &lists->idle);
>       } else {
> -             vm_bo->moved = true;amdgpu_vm_validate
> +             if (moved)
> +                     vm_bo->moved = true;
>               list_move(&vm_bo->vm_status, &lists->moved);
>       }
>       amdgpu_vm_bo_unlock_lists(vm_bo);
> @@ -425,7 +428,7 @@ void amdgpu_vm_bo_base_init(struct amdgpu_vm_bo_base 
> *base,
>        */
>       if (bo->preferred_domains &
>           amdgpu_mem_type_to_domain(bo->tbo.resource->mem_type))
> -             amdgpu_vm_bo_moved(base);
> +             amdgpu_vm_bo_moved(base, true);
>       else
>               amdgpu_vm_bo_evicted(base);
>  }
> @@ -597,7 +600,7 @@ int amdgpu_vm_validate(struct amdgpu_device *adev, struct 
> amdgpu_vm *vm,
>                       return r;
>  
>               vm->update_funcs->map_table(to_amdgpu_bo_vm(bo_base->bo));
> -             amdgpu_vm_bo_moved(bo_base);
> +             amdgpu_vm_bo_moved(bo_base, false);

That one and all other in amdgpu_vm_validate() look questionable to me.

When the buffer was validated (e.g. physically moved) the flag should already 
be set, but setting it again should be harmless in most cases.

>       }
>  
>       /*
> @@ -614,7 +617,7 @@ int amdgpu_vm_validate(struct amdgpu_device *adev, struct 
> amdgpu_vm *vm,
>               if (r)
>                       return r;
>  
> -             amdgpu_vm_bo_moved(bo_base);
> +             amdgpu_vm_bo_moved(bo_base, false);
>       }
>  
>       if (!ticket)
> @@ -634,7 +637,7 @@ int amdgpu_vm_validate(struct amdgpu_device *adev, struct 
> amdgpu_vm *vm,
>               if (r)
>                       return r;
>  
> -             amdgpu_vm_bo_moved(bo_base);
> +             amdgpu_vm_bo_moved(bo_base, false);
>  
>               /* It's a bit inefficient to always jump back to the start, but
>                * we would need to re-structure the KFD for properly fixing
> @@ -1782,7 +1785,7 @@ static void amdgpu_vm_bo_insert_map(struct 
> amdgpu_device *adev,
>               amdgpu_vm_prt_get(adev);
>  
>       if (amdgpu_vm_is_bo_always_valid(vm, bo) && !bo_va->base.moved)
> -             amdgpu_vm_bo_moved(&bo_va->base);
> +             amdgpu_vm_bo_moved(&bo_va->base, false);

This is probably the one which really kills you.

Thanks,
Christian.

>  
>       trace_amdgpu_vm_bo_map(bo_va, mapping);
>  }
> @@ -2091,7 +2094,7 @@ int amdgpu_vm_bo_clear_mappings(struct amdgpu_device 
> *adev,
>  
>               if (amdgpu_vm_is_bo_always_valid(vm, bo) &&
>                   !before->bo_va->base.moved)
> -                     amdgpu_vm_bo_moved(&before->bo_va->base);
> +                     amdgpu_vm_bo_moved(&before->bo_va->base, false);
>       } else {
>               kfree(before);
>       }
> @@ -2106,7 +2109,7 @@ int amdgpu_vm_bo_clear_mappings(struct amdgpu_device 
> *adev,
>  
>               if (amdgpu_vm_is_bo_always_valid(vm, bo) &&
>                   !after->bo_va->base.moved)
> -                     amdgpu_vm_bo_moved(&after->bo_va->base);
> +                     amdgpu_vm_bo_moved(&after->bo_va->base, false);
>       } else {
>               kfree(after);
>       }
> @@ -2280,7 +2283,7 @@ void amdgpu_vm_bo_invalidate(struct amdgpu_bo *bo, bool 
> evicted)
>  
>               if (bo_base->moved)
>                       continue;
> -             amdgpu_vm_bo_moved(bo_base);
> +             amdgpu_vm_bo_moved(bo_base, true);
>       }
>  }
>  

Reply via email to