drm/amdgpu: grow the SR-IOV RAS bad-page table on demand

Track the table length in a capacity field, allocate a small initial table, and grow bps[] / bps_bo[] together whenever count + pages would exceed the capacity. Keep amdgpu_virt_ras_add_bps() as the single append path: validate the addition, grow the table if needed, then memcpy the new records and bump count.
On allocation failure the existing tables are left unchanged and the caller stops ingesting more bad pages from the message. Signed-off-by: Chenglei Xie <[email protected]> Change-Id: I924d2b5e369bef007eb1577f287a2274ec83d5a9 --- drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c | 114 ++++++++++++++++++----- drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h | 2 + 2 files changed, 95 insertions(+), 21 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c index 6974b1c5b56c2..8d4f30e3be212 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c @@ -283,14 +283,68 @@ unsigned int amd_sriov_msg_checksum(void *obj, return ret; } +#define AMDGPU_VIRT_RAS_BAD_PAGE_TABLE_INIT_CAPACITY 16 + +/** + * amdgpu_virt_ras_grow_bad_page_table - grow bad-page arrays if needed + * @adev: amdgpu device + * @eh: RAS error handler data for the VF + * @min_capacity: minimum number of bad-page slots required (bps[] / bps_bo[] length) + * + * Returns 0 if @eh already has room for @min_capacity slots or growth succeeded. + * Returns -ENOMEM if allocation failed (existing tables are left unchanged). + */ +static int amdgpu_virt_ras_grow_bad_page_table(struct amdgpu_device *adev, + struct amdgpu_virt_ras_err_handler_data *eh, + int min_capacity) +{ + struct eeprom_table_record *new_bps; + struct amdgpu_bo **new_bo; + int new_capacity; + unsigned long doubled; + + if (min_capacity <= eh->capacity) + return 0; + + new_capacity = eh->capacity ? 
eh->capacity : AMDGPU_VIRT_RAS_BAD_PAGE_TABLE_INIT_CAPACITY; + while (new_capacity < min_capacity) { + doubled = (unsigned long)new_capacity * 2UL; + if (doubled < (unsigned long)new_capacity || doubled > INT_MAX) { + new_capacity = min_capacity; + break; + } + new_capacity = (int)doubled; + } + if (new_capacity < min_capacity) + new_capacity = min_capacity; + + new_bps = kmalloc_array(new_capacity, sizeof(*eh->bps), GFP_KERNEL); + new_bo = kcalloc(new_capacity, sizeof(*eh->bps_bo), GFP_KERNEL); + if (!new_bps || !new_bo) { + kfree(new_bps); + kfree(new_bo); + dev_warn_ratelimited(adev->dev, + "RAS WARN: failed to grow bad page table to %d slots\n", + new_capacity); + return -ENOMEM; + } + + memcpy(new_bps, eh->bps, eh->count * sizeof(*eh->bps)); + memcpy(new_bo, eh->bps_bo, eh->count * sizeof(*eh->bps_bo)); + + kfree(eh->bps); + kfree(eh->bps_bo); + eh->bps = new_bps; + eh->bps_bo = new_bo; + eh->capacity = new_capacity; + + return 0; +} + static int amdgpu_virt_init_ras_err_handler_data(struct amdgpu_device *adev) { struct amdgpu_virt *virt = &adev->virt; struct amdgpu_virt_ras_err_handler_data **data = &virt->virt_eh_data; - /* GPU will be marked bad on host if bp count more then 10, - * so alloc 512 is enough. 
- */ - unsigned int align_space = 512; void *bps = NULL; struct amdgpu_bo **bps_bo = NULL; @@ -298,16 +352,17 @@ static int amdgpu_virt_init_ras_err_handler_data(struct amdgpu_device *adev) if (!*data) goto data_failure; - bps = kmalloc_array(align_space, sizeof(*(*data)->bps), GFP_KERNEL); + bps = kmalloc_array(AMDGPU_VIRT_RAS_BAD_PAGE_TABLE_INIT_CAPACITY, sizeof(*(*data)->bps), GFP_KERNEL); if (!bps) goto bps_failure; - bps_bo = kmalloc_array(align_space, sizeof(*(*data)->bps_bo), GFP_KERNEL); + bps_bo = kcalloc(AMDGPU_VIRT_RAS_BAD_PAGE_TABLE_INIT_CAPACITY, sizeof(*(*data)->bps_bo), GFP_KERNEL); if (!bps_bo) goto bps_bo_failure; (*data)->bps = bps; (*data)->bps_bo = bps_bo; + (*data)->capacity = AMDGPU_VIRT_RAS_BAD_PAGE_TABLE_INIT_CAPACITY; (*data)->count = 0; (*data)->last_reserved = 0; @@ -361,17 +416,32 @@ void amdgpu_virt_release_ras_err_handler_data(struct amdgpu_device *adev) virt->virt_eh_data = NULL; } -static void amdgpu_virt_ras_add_bps(struct amdgpu_device *adev, - struct eeprom_table_record *bps, int pages) +static bool amdgpu_virt_ras_add_bps(struct amdgpu_device *adev, + const struct eeprom_table_record *bps, int pages) { struct amdgpu_virt *virt = &adev->virt; struct amdgpu_virt_ras_err_handler_data *data = virt->virt_eh_data; + int need; - if (!data) - return; + if (!data || pages <= 0) + return false; + + if (pages > INT_MAX - data->count) { + dev_warn_ratelimited(adev->dev, + "RAS WARN: bad page table size overflow (count=%d pages=%d)\n", + data->count, pages); + return false; + } + + need = data->count + pages; + if (need > data->capacity && + amdgpu_virt_ras_grow_bad_page_table(adev, data, need)) + return false; memcpy(&data->bps[data->count], bps, pages * sizeof(*data->bps)); data->count += pages; + + return true; } static void amdgpu_virt_ras_reserve_bps(struct amdgpu_device *adev) @@ -443,20 +513,22 @@ static void amdgpu_virt_add_bad_page(struct amdgpu_device *adev, memset(&bp, 0, sizeof(bp)); - if (bp_block_size) { - bp_cnt = bp_block_size 
/ sizeof(uint64_t); - for (bp_idx = 0; bp_idx < bp_cnt; bp_idx++) { - retired_page = *(uint64_t *)(vram_usage_va + - bp_block_offset + bp_idx * sizeof(uint64_t)); - bp.retired_page = retired_page; + if (!bp_block_size) + return; - if (amdgpu_virt_ras_check_bad_page(adev, retired_page)) - continue; + bp_cnt = bp_block_size / sizeof(uint64_t); + for (bp_idx = 0; bp_idx < bp_cnt; bp_idx++) { + retired_page = *(uint64_t *)(vram_usage_va + + bp_block_offset + bp_idx * sizeof(uint64_t)); + bp.retired_page = retired_page; - amdgpu_virt_ras_add_bps(adev, &bp, 1); + if (amdgpu_virt_ras_check_bad_page(adev, retired_page)) + continue; - amdgpu_virt_ras_reserve_bps(adev); - } + if (!amdgpu_virt_ras_add_bps(adev, &bp, 1)) + break; + + amdgpu_virt_ras_reserve_bps(adev); } } diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h index 9da0c6e9b8695..af2acf8eee6e2 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h @@ -263,6 +263,8 @@ struct amdgpu_virt_ras_err_handler_data { struct eeprom_table_record *bps; /* point to reserved bo array */ struct amdgpu_bo **bps_bo; + /* number of slots in bps[] / bps_bo[] (always >= count) */ + int capacity; /* the count of entries */ int count; /* last reserved entry's index + 1 */ -- 2.34.1
