AMD General Sure, will update.
Best Regards, Thomas -----Original Message----- From: Zhou1, Tao <[email protected]> Sent: Tuesday, May 19, 2026 11:13 AM To: Chai, Thomas <[email protected]>; [email protected] Cc: Zhang, Hawking <[email protected]>; Yang, Stanley <[email protected]> Subject: RE: [PATCH 7/7] drm/amdgpu: check and drop invalid bad page records AMD General > -----Original Message----- > From: Chai, Thomas <[email protected]> > Sent: Monday, May 18, 2026 3:22 PM > To: [email protected] > Cc: Chai, Thomas <[email protected]>; Zhang, Hawking > <[email protected]>; Zhou1, Tao <[email protected]>; Yang, Stanley > <[email protected]>; Chai, Thomas <[email protected]> > Subject: [PATCH 7/7] drm/amdgpu: check and drop invalid bad page > records > > Check and drop invalid bad page records. > > Signed-off-by: YiPeng Chai <[email protected]> > --- > drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 27 > +++++++++++++++++++++++++ > 1 file changed, 27 insertions(+) > > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c > b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c > index 57f13ad5605a..b0ef0800b380 100644 > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c > @@ -3094,6 +3094,20 @@ static int amdgpu_ras_mca2pa(struct > amdgpu_device *adev, > return -EINVAL; > } > > +static bool __check_record_in_range(struct amdgpu_device *adev, > + struct eeprom_table_record *bps, int count) { > + int i; > + > + for (i = 0; i < count; i++) { > + if (bps[i].retired_page >= > + (adev->gmc.real_vram_size >> > AMDGPU_GPU_PAGE_SHIFT)) > + return false; > + } > + > + return true; > +} > + > static int __amdgpu_ras_restore_bad_pages(struct amdgpu_device *adev, > struct eeprom_table_record *bps, > int > count) { @@ -3101,6 +3115,14 @@ static int > __amdgpu_ras_restore_bad_pages(struct amdgpu_device *adev, > struct amdgpu_ras *con = amdgpu_ras_get_context(adev); > struct ras_err_handler_data *data = con->eh_data; > > + if (!__check_record_in_range(adev, bps, count)) { > + 
dev_warn(adev->dev, > + "Recorded address out of range: 0x%llx, 0x%llx, > + 0x%x, > 0x%x\n", > + bps[0].address, bps[0].retired_page, > + bps[0].mem_channel, bps[0].mcumc_id); [Tao] Can we move the log into __check_record_in_range(adev, bps, count)? Then we could print the info of the out-of-range bps[i] instead of bps[0]. > + return 0; > + } > + > for (j = 0; j < count; j++) { > if (!data->space_left && > amdgpu_ras_realloc_eh_data_space(adev, data, 256)) { > @@ -5642,6 +5664,11 @@ int amdgpu_ras_reserve_page(struct > amdgpu_device *adev, uint64_t pfn) > uint64_t start = pfn << AMDGPU_GPU_PAGE_SHIFT; > int ret = 0; > > + if (pfn >= (adev->gmc.real_vram_size >> > AMDGPU_GPU_PAGE_SHIFT)) { > + dev_warn(adev->dev, "Ignoring out-of-range bad page > 0x%llx", start); > + return 0; > + } > + > if (amdgpu_ras_check_critical_address(adev, start)) > return 0; > > -- > 2.43.0
