Re: [PATCH 3/3] amdgpu/drm: Use vram manager for virtualization page retirement

2024-01-26 Thread Christian König

Am 25.01.24 um 00:21 schrieb Zhigang Luo:

From: Victor Skvortsov 

In runtime, use vram manager for virtualization page retirement.

Signed-off-by: Victor Skvortsov 
Change-Id: Ia8fe6c7d4e4acae9d3a953b3ba4567e8fc6de0fa
---
  drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c | 30 
  1 file changed, 20 insertions(+), 10 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c
index f5c66e0038b5..6ff7d3fb2008 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c
@@ -250,11 +250,11 @@ static int amdgpu_virt_init_ras_err_handler_data(struct 
amdgpu_device *adev)
if (!*data)
goto data_failure;
  
-	bps = kmalloc_array(align_space, sizeof((*data)->bps), GFP_KERNEL);

+   bps = kmalloc_array(align_space, sizeof(*(*data)->bps), GFP_KERNEL);
if (!bps)
goto bps_failure;
  
-	bps_bo = kmalloc_array(align_space, sizeof((*data)->bps_bo), GFP_KERNEL);

+   bps_bo = kmalloc_array(align_space, sizeof(*(*data)->bps_bo), 
GFP_KERNEL);


That looks like an rather important bug fix which should be in a 
separate patch.



if (!bps_bo)
goto bps_bo_failure;
  
@@ -287,8 +287,10 @@ static void amdgpu_virt_ras_release_bp(struct amdgpu_device *adev)
  
  	for (i = data->last_reserved - 1; i >= 0; i--) {

bo = data->bps_bo[i];
-   amdgpu_bo_free_kernel(&bo, NULL, NULL);
-   data->bps_bo[i] = bo;
+   if (bo) {
+   amdgpu_bo_free_kernel(&bo, NULL, NULL);
+   data->bps_bo[i] = bo;
+   }
data->last_reserved = i;
}
  }
@@ -328,6 +330,8 @@ static void amdgpu_virt_ras_reserve_bps(struct 
amdgpu_device *adev)
  {
struct amdgpu_virt *virt = &adev->virt;
struct amdgpu_virt_ras_err_handler_data *data = virt->virt_eh_data;
+   struct amdgpu_vram_mgr *mgr = &adev->mman.vram_mgr;
+   struct ttm_resource_manager *man = &mgr->manager;
struct amdgpu_bo *bo = NULL;
uint64_t bp;
int i;
@@ -343,12 +347,18 @@ static void amdgpu_virt_ras_reserve_bps(struct 
amdgpu_device *adev)
 * 2) a ras bad page has been reserved (duplicate error 
injection
 *for one page);
 */
-   if (amdgpu_bo_create_kernel_at(adev, bp << 
AMDGPU_GPU_PAGE_SHIFT,
-  AMDGPU_GPU_PAGE_SIZE,
-  &bo, NULL))
-   DRM_DEBUG("RAS WARN: reserve vram for retired page %llx 
fail\n", bp);
-
-   data->bps_bo[i] = bo;
+   if  (ttm_resource_manager_used(man)) {
+   amdgpu_vram_mgr_reserve_range(&adev->mman.vram_mgr,
+   bp << AMDGPU_GPU_PAGE_SHIFT,
+   AMDGPU_GPU_PAGE_SIZE);
+   data->bps_bo[i] = NULL;
+   } else {
+   if (amdgpu_bo_create_kernel_at(adev, bp << 
AMDGPU_GPU_PAGE_SHIFT,
+   AMDGPU_GPU_PAGE_SIZE,
+   &bo, NULL))
+   DRM_DEBUG("RAS WARN: reserve vram for retired page 
%llx fail\n", bp);
+   data->bps_bo[i] = bo;
+   }


That code makes no sense. If the VRAM mgr is not enabled then 
amdgpu_bo_create_kernel_at() won't work either.


I suggest to completely remove the amdgpu_bo_create_kernel_at() code path.

Regards,
Christian.


data->last_reserved = i + 1;
bo = NULL;
}




[PATCH 3/3] amdgpu/drm: Use vram manager for virtualization page retirement

2024-01-24 Thread Zhigang Luo
From: Victor Skvortsov 

In runtime, use vram manager for virtualization page retirement.

Signed-off-by: Victor Skvortsov 
Change-Id: Ia8fe6c7d4e4acae9d3a953b3ba4567e8fc6de0fa
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c | 30 
 1 file changed, 20 insertions(+), 10 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c
index f5c66e0038b5..6ff7d3fb2008 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c
@@ -250,11 +250,11 @@ static int amdgpu_virt_init_ras_err_handler_data(struct 
amdgpu_device *adev)
if (!*data)
goto data_failure;
 
-   bps = kmalloc_array(align_space, sizeof((*data)->bps), GFP_KERNEL);
+   bps = kmalloc_array(align_space, sizeof(*(*data)->bps), GFP_KERNEL);
if (!bps)
goto bps_failure;
 
-   bps_bo = kmalloc_array(align_space, sizeof((*data)->bps_bo), 
GFP_KERNEL);
+   bps_bo = kmalloc_array(align_space, sizeof(*(*data)->bps_bo), 
GFP_KERNEL);
if (!bps_bo)
goto bps_bo_failure;
 
@@ -287,8 +287,10 @@ static void amdgpu_virt_ras_release_bp(struct 
amdgpu_device *adev)
 
for (i = data->last_reserved - 1; i >= 0; i--) {
bo = data->bps_bo[i];
-   amdgpu_bo_free_kernel(&bo, NULL, NULL);
-   data->bps_bo[i] = bo;
+   if (bo) {
+   amdgpu_bo_free_kernel(&bo, NULL, NULL);
+   data->bps_bo[i] = bo;
+   }
data->last_reserved = i;
}
 }
@@ -328,6 +330,8 @@ static void amdgpu_virt_ras_reserve_bps(struct 
amdgpu_device *adev)
 {
struct amdgpu_virt *virt = &adev->virt;
struct amdgpu_virt_ras_err_handler_data *data = virt->virt_eh_data;
+   struct amdgpu_vram_mgr *mgr = &adev->mman.vram_mgr;
+   struct ttm_resource_manager *man = &mgr->manager;
struct amdgpu_bo *bo = NULL;
uint64_t bp;
int i;
@@ -343,12 +347,18 @@ static void amdgpu_virt_ras_reserve_bps(struct 
amdgpu_device *adev)
 * 2) a ras bad page has been reserved (duplicate error 
injection
 *for one page);
 */
-   if (amdgpu_bo_create_kernel_at(adev, bp << 
AMDGPU_GPU_PAGE_SHIFT,
-  AMDGPU_GPU_PAGE_SIZE,
-  &bo, NULL))
-   DRM_DEBUG("RAS WARN: reserve vram for retired page %llx 
fail\n", bp);
-
-   data->bps_bo[i] = bo;
+   if  (ttm_resource_manager_used(man)) {
+   amdgpu_vram_mgr_reserve_range(&adev->mman.vram_mgr,
+   bp << AMDGPU_GPU_PAGE_SHIFT,
+   AMDGPU_GPU_PAGE_SIZE);
+   data->bps_bo[i] = NULL;
+   } else {
+   if (amdgpu_bo_create_kernel_at(adev, bp << 
AMDGPU_GPU_PAGE_SHIFT,
+   AMDGPU_GPU_PAGE_SIZE,
+   &bo, NULL))
+   DRM_DEBUG("RAS WARN: reserve vram for retired 
page %llx fail\n", bp);
+   data->bps_bo[i] = bo;
+   }
data->last_reserved = i + 1;
bo = NULL;
}
-- 
2.25.1