AMD General

Hi team,

Could anyone help review this security bug fix patch for 
https://ontrack-internal.amd.com/browse/SWDEV-589525.

Thanks,
Chenglei

-----Original Message-----
From: Xie, Chenglei <[email protected]>
Sent: Thursday, May 7, 2026 10:45 AM
To: [email protected]
Cc: Chan, Hing Pong <[email protected]>; Luo, Zhigang <[email protected]>; 
Deucher, Alexander <[email protected]>; Xie, Chenglei 
<[email protected]>
Subject: [PATCH] drm/amdgpu: bound bad-page list writes to allocated capacity

PF2VF bad-page data can provide more entries than the VF-side bad-page buffer 
can hold. amdgpu_virt_ras_add_bps() copied entries without checking remaining 
capacity, which can overrun the 512-entry allocation.

Add an explicit max bad-page constant, validate pages against remaining space 
before memcpy(), and make the helper return bool so the caller stops processing 
when the buffer is full. This preserves normal behavior while preventing 
overwrite from malformed or malicious input.

Signed-off-by: Chenglei Xie <[email protected]>
Change-Id: I924d2b5e369bef007eb1577f287a2274ec83d5a9
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c | 53 +++++++++++++++---------
 1 file changed, 33 insertions(+), 20 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c
index 6974b1c5b56c2..8ac808d70356c 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c
@@ -283,14 +283,12 @@ unsigned int amd_sriov_msg_checksum(void *obj,
        return ret;
 }

+#define AMDGPU_VIRT_RAS_MAX_BAD_PAGES 512
+
 static int amdgpu_virt_init_ras_err_handler_data(struct amdgpu_device *adev)
 {
        struct amdgpu_virt *virt = &adev->virt;
        struct amdgpu_virt_ras_err_handler_data **data = &virt->virt_eh_data;
-       /* GPU will be marked bad on host if bp count more then 10,
-        * so alloc 512 is enough.
-        */
-       unsigned int align_space = 512;
        void *bps = NULL;
        struct amdgpu_bo **bps_bo = NULL;

@@ -298,11 +296,14 @@ static int amdgpu_virt_init_ras_err_handler_data(struct amdgpu_device *adev)
        if (!*data)
                goto data_failure;

-       bps = kmalloc_array(align_space, sizeof(*(*data)->bps), GFP_KERNEL);
+       /* GPU will be marked bad on host if bp count more than 10,
+        * so alloc 512 is enough.
+        */
+       bps = kmalloc_array(AMDGPU_VIRT_RAS_MAX_BAD_PAGES, sizeof(*(*data)->bps), GFP_KERNEL);
        if (!bps)
                goto bps_failure;

-       bps_bo = kmalloc_array(align_space, sizeof(*(*data)->bps_bo), GFP_KERNEL);
+       bps_bo = kmalloc_array(AMDGPU_VIRT_RAS_MAX_BAD_PAGES, sizeof(*(*data)->bps_bo), GFP_KERNEL);
        if (!bps_bo)
                goto bps_bo_failure;

@@ -361,17 +362,27 @@ void amdgpu_virt_release_ras_err_handler_data(struct amdgpu_device *adev)
        virt->virt_eh_data = NULL;
 }

-static void amdgpu_virt_ras_add_bps(struct amdgpu_device *adev,
-               struct eeprom_table_record *bps, int pages)
+static bool amdgpu_virt_ras_add_bps(struct amdgpu_device *adev,
+               const struct eeprom_table_record *bps, int pages)
 {
        struct amdgpu_virt *virt = &adev->virt;
        struct amdgpu_virt_ras_err_handler_data *data = virt->virt_eh_data;

        if (!data)
-               return;
+               return false;
+
+       if (pages > AMDGPU_VIRT_RAS_MAX_BAD_PAGES - data->count) {
+               dev_warn_ratelimited(adev->dev,
+                                    "RAS WARN: bad page buffer full, count=%d pages=%d max=%d\n",
+                                    data->count, pages,
+                                    AMDGPU_VIRT_RAS_MAX_BAD_PAGES);
+               return false;
+       }

        memcpy(&data->bps[data->count], bps, pages * sizeof(*data->bps));
        data->count += pages;
+
+       return true;
 }

 static void amdgpu_virt_ras_reserve_bps(struct amdgpu_device *adev)
@@ -443,20 +454,22 @@ static void amdgpu_virt_add_bad_page(struct amdgpu_device *adev,

        memset(&bp, 0, sizeof(bp));

-       if (bp_block_size) {
-               bp_cnt = bp_block_size / sizeof(uint64_t);
-               for (bp_idx = 0; bp_idx < bp_cnt; bp_idx++) {
-                       retired_page = *(uint64_t *)(vram_usage_va +
-                                       bp_block_offset + bp_idx * sizeof(uint64_t));
-                       bp.retired_page = retired_page;
+       if (!bp_block_size)
+               return;

-                       if (amdgpu_virt_ras_check_bad_page(adev, retired_page))
-                               continue;
+       bp_cnt = bp_block_size / sizeof(uint64_t);
+       for (bp_idx = 0; bp_idx < bp_cnt; bp_idx++) {
+               retired_page = *(uint64_t *)(vram_usage_va +
+                               bp_block_offset + bp_idx * sizeof(uint64_t));
+               bp.retired_page = retired_page;

-                       amdgpu_virt_ras_add_bps(adev, &bp, 1);
+               if (amdgpu_virt_ras_check_bad_page(adev, retired_page))
+                       continue;

-                       amdgpu_virt_ras_reserve_bps(adev);
-               }
+               if (!amdgpu_virt_ras_add_bps(adev, &bp, 1))
+                       break;
+
+               amdgpu_virt_ras_reserve_bps(adev);
        }
 }

--
2.34.1

Reply via email to