[AMD Official Use Only - AMD Internal Distribution Only] The series looks good to me, but it still needs @Skvortsov, Victor or @Gande, Shravan kumar to review it.
Regards, Stanley > -----Original Message----- > From: Liu, Xiang(Dean) <xiang....@amd.com> > Sent: Tuesday, August 19, 2025 1:26 PM > To: amd-gfx@lists.freedesktop.org > Cc: Zhang, Hawking <hawking.zh...@amd.com>; Zhou1, Tao > <tao.zh...@amd.com>; Yang, Stanley <stanley.y...@amd.com>; Chai, > Thomas <yipeng.c...@amd.com>; Liu, Xiang(Dean) <xiang....@amd.com> > Subject: [PATCH 1/2] drm/amdgpu: Introduce VF critical region check for RAS > poison injection > > The SRIOV guest sends a request to the host to check whether the poison injection > address is in the VF critical region or not via the mailbox. > > Signed-off-by: Xiang Liu <xiang....@amd.com> > --- > drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c | 55 > +++++++++++++++++++++ > drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h | 3 ++ > drivers/gpu/drm/amd/amdgpu/amdgv_sriovmsg.h | 5 ++ > drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c | 14 ++++++ > drivers/gpu/drm/amd/amdgpu/mxgpu_nv.h | 2 + > 5 files changed, 79 insertions(+) > > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c > b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c > index 13f0cdeb59c4..3328ab63376b 100644 > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c > @@ -828,11 +828,14 @@ static void amdgpu_virt_init_ras(struct > amdgpu_device *adev) { > ratelimit_state_init(&adev->virt.ras.ras_error_cnt_rs, 5 * HZ, 1); > ratelimit_state_init(&adev->virt.ras.ras_cper_dump_rs, 5 * HZ, 1); > + ratelimit_state_init(&adev->virt.ras.ras_chk_criti_rs, 5 * HZ, 1); > > ratelimit_set_flags(&adev->virt.ras.ras_error_cnt_rs, > RATELIMIT_MSG_ON_RELEASE); > ratelimit_set_flags(&adev->virt.ras.ras_cper_dump_rs, > RATELIMIT_MSG_ON_RELEASE); > + ratelimit_set_flags(&adev->virt.ras.ras_chk_criti_rs, > + RATELIMIT_MSG_ON_RELEASE); > > mutex_init(&adev->virt.ras.ras_telemetry_mutex); > > @@ -1501,3 +1504,55 @@ void amdgpu_virt_request_bad_pages(struct > amdgpu_device *adev) > if (virt->ops && virt->ops->req_bad_pages) > virt->ops->req_bad_pages(adev); > } > + > +static 
int amdgpu_virt_cache_chk_criti_hit(struct amdgpu_device *adev, > + struct amdsriov_ras_telemetry > *host_telemetry, > + bool *hit) > +{ > + struct amd_sriov_ras_chk_criti *tmp = NULL; > + uint32_t checksum, used_size; > + > + checksum = host_telemetry->header.checksum; > + used_size = host_telemetry->header.used_size; > + > + if (used_size > (AMD_SRIOV_RAS_TELEMETRY_SIZE_KB << 10)) > + return 0; > + > + tmp = kmemdup(&host_telemetry->body.chk_criti, used_size, > GFP_KERNEL); > + if (!tmp) > + return -ENOMEM; > + > + if (checksum != amd_sriov_msg_checksum(tmp, used_size, 0, 0)) > + goto out; > + > + if (hit) > + *hit = tmp->hit ? true : false; > + > +out: > + kfree(tmp); > + > + return 0; > +} > + > +int amdgpu_virt_check_vf_critical_region(struct amdgpu_device *adev, > +u64 addr, bool *hit) { > + struct amdgpu_virt *virt = &adev->virt; > + int r = -EPERM; > + > + if (!virt->ops || !virt->ops->req_ras_chk_criti) > + return -EOPNOTSUPP; > + > + /* Host allows 15 ras telemetry requests per 60 seconds. Afterwhich, > the Host > + * will ignore incoming guest messages. Ratelimit the guest messages > to > + * prevent guest self DOS. 
> + */ > + if (__ratelimit(&virt->ras.ras_chk_criti_rs)) { > + mutex_lock(&virt->ras.ras_telemetry_mutex); > + if (!virt->ops->req_ras_chk_criti(adev, addr)) > + r = amdgpu_virt_cache_chk_criti_hit( > + adev, virt->fw_reserve.ras_telemetry, hit); > + mutex_unlock(&virt->ras.ras_telemetry_mutex); > + } > + > + return r; > +} > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h > b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h > index 3da3ebb1d9a1..6ca83abd7a4f 100644 > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h > @@ -98,6 +98,7 @@ struct amdgpu_virt_ops { > int (*req_ras_err_count)(struct amdgpu_device *adev); > int (*req_ras_cper_dump)(struct amdgpu_device *adev, u64 vf_rptr); > int (*req_bad_pages)(struct amdgpu_device *adev); > + int (*req_ras_chk_criti)(struct amdgpu_device *adev, u64 addr); > }; > > /* > @@ -252,6 +253,7 @@ struct amdgpu_virt_ras_err_handler_data { struct > amdgpu_virt_ras { > struct ratelimit_state ras_error_cnt_rs; > struct ratelimit_state ras_cper_dump_rs; > + struct ratelimit_state ras_chk_criti_rs; > struct mutex ras_telemetry_mutex; > uint64_t cper_rptr; > }; > @@ -447,4 +449,5 @@ int amdgpu_virt_ras_telemetry_post_reset(struct > amdgpu_device *adev); bool amdgpu_virt_ras_telemetry_block_en(struct > amdgpu_device *adev, > enum amdgpu_ras_block block); > void amdgpu_virt_request_bad_pages(struct amdgpu_device *adev); > +int amdgpu_virt_check_vf_critical_region(struct amdgpu_device *adev, > +u64 addr, bool *hit); > #endif > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgv_sriovmsg.h > b/drivers/gpu/drm/amd/amdgpu/amdgv_sriovmsg.h > index 33edad1f9dcd..3a79ed7d8031 100644 > --- a/drivers/gpu/drm/amd/amdgpu/amdgv_sriovmsg.h > +++ b/drivers/gpu/drm/amd/amdgpu/amdgv_sriovmsg.h > @@ -405,12 +405,17 @@ struct amd_sriov_ras_cper_dump { > uint32_t buf[]; > }; > > +struct amd_sriov_ras_chk_criti { > + uint32_t hit; > +}; > + > struct amdsriov_ras_telemetry { > struct amd_sriov_ras_telemetry_header 
header; > > union { > struct amd_sriov_ras_telemetry_error_count error_count; > struct amd_sriov_ras_cper_dump cper_dump; > + struct amd_sriov_ras_chk_criti chk_criti; > } body; > }; > > diff --git a/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c > b/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c > index f6d8597452ed..00c5db336c0e 100644 > --- a/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c > +++ b/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c > @@ -205,6 +205,9 @@ static int > xgpu_nv_send_access_requests_with_param(struct amdgpu_device *adev, > case IDH_REQ_RAS_BAD_PAGES: > event = IDH_RAS_BAD_PAGES_READY; > break; > + case IDH_REQ_RAS_CHK_CRITI: > + event = IDH_REQ_RAS_CHK_CRITI_READY; > + break; > default: > break; > } > @@ -535,6 +538,16 @@ static int xgpu_nv_req_ras_bad_pages(struct > amdgpu_device *adev) > return xgpu_nv_send_access_requests(adev, > IDH_REQ_RAS_BAD_PAGES); } > > +static int xgpu_nv_check_vf_critical_region(struct amdgpu_device *adev, > +u64 addr) { > + uint32_t addr_hi, addr_lo; > + > + addr_hi = (uint32_t)(addr >> 32); > + addr_lo = (uint32_t)(addr & 0xFFFFFFFF); > + return xgpu_nv_send_access_requests_with_param( > + adev, IDH_REQ_RAS_CHK_CRITI, addr_hi, addr_lo, 0); } > + > const struct amdgpu_virt_ops xgpu_nv_virt_ops = { > .req_full_gpu = xgpu_nv_request_full_gpu_access, > .rel_full_gpu = xgpu_nv_release_full_gpu_access, > @@ -548,4 +561,5 @@ const struct amdgpu_virt_ops xgpu_nv_virt_ops = { > .req_ras_err_count = xgpu_nv_req_ras_err_count, > .req_ras_cper_dump = xgpu_nv_req_ras_cper_dump, > .req_bad_pages = xgpu_nv_req_ras_bad_pages, > + .req_ras_chk_criti = xgpu_nv_check_vf_critical_region > }; > diff --git a/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.h > b/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.h > index 5808689562cc..c1083e5e41e0 100644 > --- a/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.h > +++ b/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.h > @@ -43,6 +43,7 @@ enum idh_request { > IDH_REQ_RAS_ERROR_COUNT = 203, > IDH_REQ_RAS_CPER_DUMP = 204, > IDH_REQ_RAS_BAD_PAGES = 205, > + 
IDH_REQ_RAS_CHK_CRITI = 206 > }; > > enum idh_event { > @@ -62,6 +63,7 @@ enum idh_event { > IDH_RAS_BAD_PAGES_READY = 15, > IDH_RAS_BAD_PAGES_NOTIFICATION = 16, > IDH_UNRECOV_ERR_NOTIFICATION = 17, > + IDH_REQ_RAS_CHK_CRITI_READY = 18, > > IDH_TEXT_MESSAGE = 255, > }; > -- > 2.34.1