[AMD Official Use Only - AMD Internal Distribution Only] > -----Original Message----- > From: Chai, Thomas <yipeng.c...@amd.com> > Sent: Wednesday, July 16, 2025 2:48 PM > To: amd-gfx@lists.freedesktop.org > Cc: Chai, Thomas <yipeng.c...@amd.com>; Zhang, Hawking > <hawking.zh...@amd.com>; Zhou1, Tao <tao.zh...@amd.com>; Chai, Thomas > <yipeng.c...@amd.com> > Subject: [PATCH] drm/amdgpu: add command to check address validity > > Add command to check address validity and remove unused command codes. > > Signed-off-by: YiPeng Chai <yipeng.c...@amd.com> > --- > drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 58 +++++++++++-------------- > drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h | 3 ++ > 2 files changed, 29 insertions(+), 32 deletions(-) > > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c > b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c > index 15bde4904996..68feec0956f0 100644 > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c > @@ -163,47 +163,38 @@ static bool amdgpu_ras_get_error_query_ready(struct > amdgpu_device *adev) > return false; > } > > -static int amdgpu_reserve_page_direct(struct amdgpu_device *adev, uint64_t > address) > +static int amdgpu_check_address_validity(struct amdgpu_device *adev, > +uint64_t address) > { > - struct ras_err_data err_data; > - struct eeprom_table_record err_rec; > - int ret; > + struct amdgpu_ras *con = amdgpu_ras_get_context(adev); > + struct amdgpu_vram_block_info blk_info; > + uint64_t page_pfns[32] = {0}; > + int i, ret, count; > + > + if (amdgpu_ip_version(adev, UMC_HWIP, 0) < IP_VERSION(12, 0, 0)) > + return 0; > > if ((address >= adev->gmc.mc_vram_size) || > (address >= RAS_UMC_INJECT_ADDR_LIMIT)) { > dev_warn(adev->dev, > - "RAS WARN: input address 0x%llx is invalid.\n", > - address); > + "RAS WARN: input address 0x%llx is invalid.\n", > + address); > return -EINVAL; > } > > - if (amdgpu_ras_check_bad_page(adev, address)) { > - dev_warn(adev->dev, > - "RAS WARN: 0x%llx has already 
been marked as bad > page!\n", > - address); > - return 0; > - } > - > - ret = amdgpu_ras_error_data_init(&err_data); > - if (ret) > - return ret; > + count = amdgpu_umc_lookup_bad_pages_in_a_row(adev, > + address, page_pfns, ARRAY_SIZE(page_pfns)); > + if (count <= 0) > + return -EPERM; > > - memset(&err_rec, 0x0, sizeof(struct eeprom_table_record)); > - err_data.err_addr = &err_rec; > - amdgpu_umc_fill_error_record(&err_data, address, address, 0, 0); > - > - if (amdgpu_bad_page_threshold != 0) { > - amdgpu_ras_add_bad_pages(adev, err_data.err_addr, > - err_data.err_addr_cnt, false); > - amdgpu_ras_save_bad_pages(adev, NULL); > + for (i = 0; i < count; i++) { > + memset(&blk_info, 0, sizeof(blk_info)); > + ret = amdgpu_vram_mgr_query_address_block_info(&adev- > >mman.vram_mgr, > + page_pfns[i] << > AMDGPU_GPU_PAGE_SHIFT, &blk_info); > + if (!ret && (blk_info.task.pid == con->init_task_pid) && > + !strncmp(blk_info.task.comm, con->init_task_comm, > TASK_COMM_LEN)) > + return -EACCES; > } > > - amdgpu_ras_error_data_fini(&err_data); > - > - dev_warn(adev->dev, "WARNING: THIS IS ONLY FOR TEST PURPOSES > AND WILL CORRUPT RAS EEPROM\n"); > - dev_warn(adev->dev, "Clear EEPROM:\n"); > - dev_warn(adev->dev, " echo 1 > > /sys/kernel/debug/dri/0/ras/ras_eeprom_reset\n"); > - > return 0; > } > > @@ -295,7 +286,7 @@ static int amdgpu_ras_debugfs_ctrl_parse_data(struct file > *f, > op = 1; > else if (sscanf(str, "inject %32s %8s", block_name, err) == 2) > op = 2; > - else if (strstr(str, "retire_page") != NULL) > + else if (strstr(str, "check_address") != NULL)
[Tao] The added check is fine with me, but I would prefer to keep the retire_page command; we can assign check_address to op 4. > op = 3; > else if (str[0] && str[1] && str[2] && str[3]) > /* ascii string, but commands are not matched. */ @@ -495,7 > +486,7 > @@ static ssize_t amdgpu_ras_debugfs_ctrl_write(struct file *f, > return ret; > > if (data.op == 3) { > - ret = amdgpu_reserve_page_direct(adev, data.inject.address); > + ret = amdgpu_check_address_validity(adev, data.inject.address); > if (!ret) > return size; > else > @@ -4103,6 +4094,9 @@ int amdgpu_ras_init(struct amdgpu_device *adev) > goto release_con; > } > > + con->init_task_pid = task_pid_nr(current); > + get_task_comm(con->init_task_comm, current); > + > dev_info(adev->dev, "RAS INFO: ras initialized successfully, " > "hardware ability[%x] ras_mask[%x]\n", > adev->ras_hw_enabled, adev->ras_enabled); diff --git > a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h > b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h > index 927d6bff734a..7f10a7402160 100644 > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h > @@ -570,6 +570,9 @@ struct amdgpu_ras { > struct ras_event_manager *event_mgr; > > uint64_t reserved_pages_in_bytes; > + > + pid_t init_task_pid; > + char init_task_comm[TASK_COMM_LEN]; > }; > > struct ras_fs_data { > -- > 2.34.1