The VF RAS error handler stored bad pages in fixed-size arrays. If the
PF2VF block described more entries than fit, amdgpu_virt_ras_add_bps()
could memcpy past the end of those arrays.

Track table length in a capacity field, allocate a small initial table,
and grow bps / bps_bo together when count + pages would exceed capacity.
Keep amdgpu_virt_ras_add_bps() as the single append path: validate the
addition, grow if needed, then memcpy and bump count.

On allocation failure the existing tables are left unchanged and the
caller stops ingesting more bad pages from the message.

Signed-off-by: Chenglei Xie <[email protected]>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c | 109 ++++++++++++++++++-----
 drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h |   2 +
 2 files changed, 90 insertions(+), 21 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c
index 6974b1c5b56c2..beb2693e4c704 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c
@@ -283,14 +283,63 @@ unsigned int amd_sriov_msg_checksum(void *obj,
        return ret;
 }
 
+#define AMDGPU_VIRT_RAS_BAD_PAGE_TABLE_INIT_CAPACITY   512
+
+/**
+ * amdgpu_virt_ras_realloc_eh_data_space - alloc/realloc VF bad-page @data->bps and @data->bps_bo
+ * @adev: amdgpu device
+ * @data: VF RAS error-handler data
+ * @pages: minimum number of new slots to add beyond @data->capacity
+ *
+ * Return: 0 on success, %-ENOMEM on failure.
+ */
+static int amdgpu_virt_ras_realloc_eh_data_space(struct amdgpu_device *adev,
+               struct amdgpu_virt_ras_err_handler_data *data,
+               int pages)
+{
+       struct eeprom_table_record *new_bps;
+       struct amdgpu_bo **new_bo;
+       unsigned int old_space;
+       unsigned int new_space;
+       unsigned int align_space;
+
+       old_space = (unsigned int)data->capacity;
+       new_space = old_space + max_t(unsigned int, (unsigned int)pages,
+                                     (unsigned int)AMDGPU_VIRT_RAS_BAD_PAGE_TABLE_INIT_CAPACITY);
+       if (new_space < old_space || new_space > INT_MAX)
+               return -ENOMEM;
+
+       align_space = ALIGN(new_space, AMDGPU_VIRT_RAS_BAD_PAGE_TABLE_INIT_CAPACITY);
+       if (align_space > INT_MAX)
+               return -ENOMEM;
+
+       new_bps = kmalloc_array(align_space, sizeof(*data->bps), GFP_KERNEL);
+       new_bo = kcalloc(align_space, sizeof(*data->bps_bo), GFP_KERNEL);
+       if (!new_bps || !new_bo) {
+               kfree(new_bps);
+               kfree(new_bo);
+               dev_warn_ratelimited(adev->dev,
+                                    "RAS WARN: failed to grow bad page table to %u slots\n",
+                                    align_space);
+               return -ENOMEM;
+       }
+
+       memcpy(new_bps, data->bps, data->count * sizeof(*data->bps));
+       memcpy(new_bo, data->bps_bo, data->count * sizeof(*data->bps_bo));
+
+       kfree(data->bps);
+       kfree(data->bps_bo);
+       data->bps = new_bps;
+       data->bps_bo = new_bo;
+       data->capacity = (int)align_space;
+
+       return 0;
+}
+
 static int amdgpu_virt_init_ras_err_handler_data(struct amdgpu_device *adev)
 {
        struct amdgpu_virt *virt = &adev->virt;
        struct amdgpu_virt_ras_err_handler_data **data = &virt->virt_eh_data;
-       /* GPU will be marked bad on host if bp count more then 10,
-        * so alloc 512 is enough.
-        */
-       unsigned int align_space = 512;
        void *bps = NULL;
        struct amdgpu_bo **bps_bo = NULL;
 
@@ -298,16 +347,17 @@ static int amdgpu_virt_init_ras_err_handler_data(struct amdgpu_device *adev)
        if (!*data)
                goto data_failure;
 
-       bps = kmalloc_array(align_space, sizeof(*(*data)->bps), GFP_KERNEL);
+       bps = kmalloc_array(AMDGPU_VIRT_RAS_BAD_PAGE_TABLE_INIT_CAPACITY, sizeof(*(*data)->bps), GFP_KERNEL);
        if (!bps)
                goto bps_failure;
 
-       bps_bo = kmalloc_array(align_space, sizeof(*(*data)->bps_bo), GFP_KERNEL);
+       bps_bo = kcalloc(AMDGPU_VIRT_RAS_BAD_PAGE_TABLE_INIT_CAPACITY, sizeof(*(*data)->bps_bo), GFP_KERNEL);
        if (!bps_bo)
                goto bps_bo_failure;
 
        (*data)->bps = bps;
        (*data)->bps_bo = bps_bo;
+       (*data)->capacity = AMDGPU_VIRT_RAS_BAD_PAGE_TABLE_INIT_CAPACITY;
        (*data)->count = 0;
        (*data)->last_reserved = 0;
 
@@ -361,17 +411,32 @@ void amdgpu_virt_release_ras_err_handler_data(struct amdgpu_device *adev)
        virt->virt_eh_data = NULL;
 }
 
-static void amdgpu_virt_ras_add_bps(struct amdgpu_device *adev,
-               struct eeprom_table_record *bps, int pages)
+static bool amdgpu_virt_ras_add_bps(struct amdgpu_device *adev,
+               const struct eeprom_table_record *bps, int pages)
 {
        struct amdgpu_virt *virt = &adev->virt;
        struct amdgpu_virt_ras_err_handler_data *data = virt->virt_eh_data;
+       int need;
 
-       if (!data)
-               return;
+       if (!data || pages <= 0)
+               return false;
+
+       if (pages > INT_MAX - data->count) {
+               dev_warn_ratelimited(adev->dev,
+                                    "RAS WARN: bad page table size overflow (count=%d pages=%d)\n",
+                                    data->count, pages);
+               return false;
+       }
+
+       need = data->count + pages;
+       if (need > data->capacity &&
+           amdgpu_virt_ras_realloc_eh_data_space(adev, data, need - data->capacity))
+               return false;
 
        memcpy(&data->bps[data->count], bps, pages * sizeof(*data->bps));
        data->count += pages;
+
+       return true;
 }
 
 static void amdgpu_virt_ras_reserve_bps(struct amdgpu_device *adev)
@@ -443,20 +508,22 @@ static void amdgpu_virt_add_bad_page(struct amdgpu_device *adev,
 
        memset(&bp, 0, sizeof(bp));
 
-       if (bp_block_size) {
-               bp_cnt = bp_block_size / sizeof(uint64_t);
-               for (bp_idx = 0; bp_idx < bp_cnt; bp_idx++) {
-                       retired_page = *(uint64_t *)(vram_usage_va +
-                                       bp_block_offset + bp_idx * sizeof(uint64_t));
-                       bp.retired_page = retired_page;
+       if (!bp_block_size)
+               return;
 
-                       if (amdgpu_virt_ras_check_bad_page(adev, retired_page))
-                               continue;
+       bp_cnt = bp_block_size / sizeof(uint64_t);
+       for (bp_idx = 0; bp_idx < bp_cnt; bp_idx++) {
+               retired_page = *(uint64_t *)(vram_usage_va +
+                               bp_block_offset + bp_idx * sizeof(uint64_t));
+               bp.retired_page = retired_page;
 
-                       amdgpu_virt_ras_add_bps(adev, &bp, 1);
+               if (amdgpu_virt_ras_check_bad_page(adev, retired_page))
+                       continue;
 
-                       amdgpu_virt_ras_reserve_bps(adev);
-               }
+               if (!amdgpu_virt_ras_add_bps(adev, &bp, 1))
+                       break;
+
+               amdgpu_virt_ras_reserve_bps(adev);
        }
 }
 
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h
index 9da0c6e9b8695..af2acf8eee6e2 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h
@@ -263,6 +263,8 @@ struct amdgpu_virt_ras_err_handler_data {
        struct eeprom_table_record *bps;
        /* point to reserved bo array */
        struct amdgpu_bo **bps_bo;
+       /* number of slots in bps[] / bps_bo[] (always >= count) */
+       int capacity;
        /* the count of entries */
        int count;
        /* last reserved entry's index + 1 */
-- 
2.34.1

Reply via email to