[AMD Official Use Only - AMD Internal Distribution Only]
> -----Original Message-----
> From: Xie, Patrick <[email protected]>
> Sent: Wednesday, October 29, 2025 7:07 PM
> To: [email protected]
> Cc: Zhou1, Tao <[email protected]>
> Subject: RE: : [PATCH 8/8] drm/amdgpu: add RAS bad page threshold handling
>
> [AMD Official Use Only - AMD Internal Distribution Only]
>
> -----Original Message-----
> From: amd-gfx <[email protected]> On Behalf Of amd-gfx-
> [email protected]
> Sent: Wednesday, October 29, 2025 6:54 PM
> To: [email protected]
> Subject: amd-gfx Digest, Vol 113, Issue 529
>
> Send amd-gfx mailing list submissions to
> [email protected]
>
> To subscribe or unsubscribe via the World Wide Web, visit
> https://lists.freedesktop.org/mailman/listinfo/amd-gfx
> or, via email, send a message with subject or body 'help' to
> [email protected]
>
> You can reach the person managing the list at
> [email protected]
>
> When replying, please edit your Subject line so it is more specific than "Re:
> Contents
> of amd-gfx digest..."
>
>
> Today's Topics:
>
> 1. [PATCH 8/8] drm/amdgpu: add RAS bad page threshold handling
> for PMFW manages eeprom (Tao Zhou)
> 2. [PATCH 6/8] drm/amdgpu: get RAS bad page address from MCA
> address (Tao Zhou)
> 3. Re: [PATCH 05/14] drm/amdgpu/vce: Clear VCPU BO before
> copying firmware to it (Timur Krist?f)
> 4. Re: [PATCH 07/14] drm/amdgpu/si,cik,vi: Verify IP block when
> querying video codecs (Timur Krist?f)
>
>
> ----------------------------------------------------------------------
>
> Message: 1
> Date: Wed, 29 Oct 2025 18:38:02 +0800
> From: Tao Zhou <[email protected]>
> To: <[email protected]>
> Cc: Tao Zhou <[email protected]>
> Subject: [PATCH 8/8] drm/amdgpu: add RAS bad page threshold handling
> for PMFW manages eeprom
> Message-ID: <[email protected]>
> Content-Type: text/plain
>
> Check if bad page threshold is reached and take actions accordingly.
>
> Signed-off-by: Tao Zhou <[email protected]>
> ---
> .../gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c | 37 ++++++++++++++++---
> 1 file changed, 32 insertions(+), 5 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c
> index de7b268a9862..0acf45d5fc54 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c
> @@ -896,6 +896,36 @@ int amdgpu_ras_eeprom_update_record_num(struct
> amdgpu_ras_eeprom_control *contro
> return ret;
> }
>
> +static int amdgpu_ras_smu_eeprom_append(struct
> +amdgpu_ras_eeprom_control *control) {
> + struct amdgpu_device *adev = to_amdgpu_device(control);
> + struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
> +
> + if (!amdgpu_ras_smu_eeprom_supported(adev))
> + return 0;
> +
> + control->ras_num_bad_pages = con->bad_page_num;
> +
> + if (amdgpu_bad_page_threshold != 0 &&
> + control->ras_num_bad_pages > con->bad_page_cnt_threshold) {
> + dev_warn(adev->dev,
> + "Saved bad pages %d reaches threshold value %d\n",
> + control->ras_num_bad_pages,
> + con->bad_page_cnt_threshold);
> +
> + if (adev->cper.enabled &&
> amdgpu_cper_generate_bp_threshold_record(adev))
> + dev_warn(adev->dev, "fail to generate bad page
> +threshold cper records\n");
> +
> + if ((amdgpu_bad_page_threshold != -1) &&
> + (amdgpu_bad_page_threshold != -2))
> + con->is_rma = true;
> +
> + /* ignore the -ENOTSUPP return value */
> + amdgpu_dpm_send_rma_reason(adev);
> Patrick:
> In pmfw managed eeprom, rma reason is not needed, so these two lines
> should
> be removed.
>
> Best Regards,
[Tao] thanks for the reminder, will update it.
> + }
> +
> + return 0;
> +}
> +
> /**
> * amdgpu_ras_eeprom_append -- append records to the EEPROM RAS table
> * @control: pointer to control structure @@ -914,17 +944,14 @@ int
> amdgpu_ras_eeprom_append(struct amdgpu_ras_eeprom_control *control,
> const u32 num) {
> struct amdgpu_device *adev = to_amdgpu_device(control);
> - struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
> int res, i;
> uint64_t nps = AMDGPU_NPS1_PARTITION_MODE;
>
> if (!__is_ras_eeprom_supported(adev))
> return 0;
>
> - if (amdgpu_ras_smu_eeprom_supported(adev)) {
> - control->ras_num_bad_pages = con->bad_page_num;
> - return 0;
> - }
> + if (amdgpu_ras_smu_eeprom_supported(adev))
> + return amdgpu_ras_smu_eeprom_append(control);
>
> if (num == 0) {
> dev_err(adev->dev, "will not append 0 records\n");
> --
> 2.34.1
>
>
>
> ------------------------------
>
> Message: 2
> Date: Wed, 29 Oct 2025 18:38:00 +0800
> From: Tao Zhou <[email protected]>
> To: <[email protected]>
> Cc: Tao Zhou <[email protected]>
> Subject: [PATCH 6/8] drm/amdgpu: get RAS bad page address from MCA
> address
> Message-ID: <[email protected]>
> Content-Type: text/plain
>
> Instead of from physical address.
>
> v2: add comment to make the code more readable
>
> Signed-off-by: Tao Zhou <[email protected]>
> ---
> drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 15 ++++++++++++---
> drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c | 4 ++--
> 2 files changed, 14 insertions(+), 5 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> index 23d421b8ba54..ad197486d9e7 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> @@ -3010,8 +3010,13 @@ static int amdgpu_ras_mca2pa_by_idx(struct
> amdgpu_device *adev,
> addr_in.ma.err_addr = bps->address;
> addr_in.ma.socket_id = socket;
> addr_in.ma.ch_inst = bps->mem_channel;
> - /* tell RAS TA the node instance is not used */
> - addr_in.ma.node_inst = TA_RAS_INV_NODE;
> + if (!amdgpu_ras_smu_eeprom_supported(adev)) {
> + /* tell RAS TA the node instance is not used */
> + addr_in.ma.node_inst = TA_RAS_INV_NODE;
> + } else {
> + addr_in.ma.umc_inst = bps->mcumc_id;
> + addr_in.ma.node_inst = bps->cu;
> + }
>
> if (adev->umc.ras && adev->umc.ras->convert_ras_err_addr)
> ret = adev->umc.ras->convert_ras_err_addr(adev, err_data, @@
> -3158,7
> +3163,11 @@ static int __amdgpu_ras_convert_rec_from_rom(struct
> amdgpu_device *adev,
> save_nps = (bps->retired_page >> UMC_NPS_SHIFT) &
> UMC_NPS_MASK;
> bps->retired_page &= ~(UMC_NPS_MASK << UMC_NPS_SHIFT);
> } else {
> - save_nps = nps;
> + /* if pmfw manages eeprom, save_nps is not stored on eeprom,
> + * we should always convert mca address into physical address,
> + * make save_nps different from nps
> + */
> + save_nps = nps + 1;
> }
>
> if (save_nps == nps) {
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c
> index 3bf633158fa2..511c5882b37e 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c
> @@ -1012,10 +1012,10 @@ int amdgpu_ras_eeprom_read_idx(struct
> amdgpu_ras_eeprom_control *control,
> record[i - rec_idx].retired_page = 0x1ULL;
> record[i - rec_idx].ts = ts;
> record[i - rec_idx].err_type =
> AMDGPU_RAS_EEPROM_ERR_NON_RECOVERABLE;
> - record[i - rec_idx].cu = 0;
>
> if (adev->umc.ras->mca_ipid_parse)
> - adev->umc.ras->mca_ipid_parse(adev, ipid, NULL,
> + adev->umc.ras->mca_ipid_parse(adev, ipid,
> + (uint32_t *)&(record[i - rec_idx].cu),
> (uint32_t *)&(record[i -
> rec_idx].mem_channel),
> (uint32_t *)&(record[i - rec_idx].mcumc_id),
> NULL);
> else
> --
> 2.34.1
>
>
>
> ------------------------------
>
> Message: 3
> Date: Wed, 29 Oct 2025 11:48:41 +0100
> From: Timur Krist?f <[email protected]>
> To: Christian K?nig <[email protected]>,
> [email protected], Alex Deucher
> <[email protected]>, Alexandre Demers
> <[email protected]>, Rodrigo Siqueira <[email protected]>
> Subject: Re: [PATCH 05/14] drm/amdgpu/vce: Clear VCPU BO before
> copying firmware to it
> Message-ID: <[email protected]>
> Content-Type: text/plain; charset="UTF-8"
>
> On Wed, 2025-10-29 at 11:19 +0100, Christian K?nig wrote:
> > On 10/28/25 23:06, Timur Krist?f wrote:
> > > The VCPU BO doesn't only contain the VCE firmware but also other
> > > ranges that the VCE uses for its stack and data. Let's initialize
> > > this to zero to avoid having garbage in the VCPU BO.
> >
> > Absolutely clear NAK.
> >
> > This is intentionally not initialized on resume to avoid breaking
> > encode sessions which existed before suspend.
>
> How can there be encode sessions from before suspend?
> I think that there can't be.
>
> As far as I see, before suspend we wait for the VCE to go idle, meaning that
> we wait
> for all pending work to finish.
> amdgpu_vce_suspend has a comment which says:
> suspending running encoding sessions isn't supported
>
> > Why exactly is that an issue?
>
> We need to clear at least some of the BO for the VCE1 firmware validation
> mechanism. This is done in a memset in vce_v1_0_load_fw in the old radeon
> driver.
>
> Also I think it's a good idea to avoid having garbage in the VCPU BO.
>
> > The VCE FW BO should be cleared to zero after initial allocation?
>
> To clarify, are you suggesting that I move the memset to after the BO
> creation, and
> then never clear it again? Or are you saying that amdgpu_bo_create_reserved
> already clears the BO?
>
> >
> > Regards,
> > Christian.
> >
> > >
> > > Fixes: d38ceaf99ed0 ("drm/amdgpu: add core driver (v4)")
> > > Signed-off-by: Timur Krist?f <[email protected]>
> > > ---
> > > ?drivers/gpu/drm/amd/amdgpu/amdgpu_vce.c | 1 +
> > > ?1 file changed, 1 insertion(+)
> > >
> > > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vce.c
> > > b/drivers/gpu/drm/amd/amdgpu/amdgpu_vce.c
> > > index b9060bcd4806..eaa06dbef5c4 100644
> > > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vce.c
> > > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vce.c
> > > @@ -310,6 +310,7 @@ int amdgpu_vce_resume(struct amdgpu_device
> > > *adev)
> > > ? offset = le32_to_cpu(hdr->ucode_array_offset_bytes);
> > > ?
> > > ? if (drm_dev_enter(adev_to_drm(adev), &idx)) {
> > > + memset32(cpu_addr, 0, amdgpu_bo_size(adev-
> > > >vce.vcpu_bo) / 4);
> > > ? memcpy_toio(cpu_addr, adev->vce.fw->data + offset,
> > > ? ??? adev->vce.fw->size - offset);
> > > ? drm_dev_exit(idx);
>
>
> ------------------------------
>
> Message: 4
> Date: Wed, 29 Oct 2025 11:54:14 +0100
> From: Timur Krist?f <[email protected]>
> To: Christian K?nig <[email protected]>,
> [email protected], Alex Deucher
> <[email protected]>, Alexandre Demers
> <[email protected]>, Rodrigo Siqueira <[email protected]>
> Subject: Re: [PATCH 07/14] drm/amdgpu/si,cik,vi: Verify IP block when
> querying video codecs
> Message-ID: <[email protected]>
> Content-Type: text/plain; charset="UTF-8"
>
> On Wed, 2025-10-29 at 11:35 +0100, Christian K?nig wrote:
> >
> >
> > On 10/28/25 23:06, Timur Krist?f wrote:
> > > Some harvested chips may not have any IP blocks, or we may not have
> > > the firmware for the IP blocks.
> > > In these cases, the query should return that no video codec is
> > > supported.
> > >
> > > Signed-off-by: Timur Krist?f <[email protected]>
> > > ---
> > > ?drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c | 3 ++-
> > > ?drivers/gpu/drm/amd/amdgpu/cik.c??????? | 6 ++++++
> > > ?drivers/gpu/drm/amd/amdgpu/si.c???????? | 6 ++++++
> > > ?drivers/gpu/drm/amd/amdgpu/vi.c???????? | 6 ++++++
> > > ?4 files changed, 20 insertions(+), 1 deletion(-)
> > >
> > > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c
> > > b/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c
> > > index b3e6b3fcdf2c..42b5da59d00f 100644
> > > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c
> > > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c
> > > @@ -1263,7 +1263,8 @@ int amdgpu_info_ioctl(struct drm_device *dev,
> > > void *data, struct drm_file *filp)
> > > ? -EFAULT : 0;
> > > ? }
> > > ? case AMDGPU_INFO_VIDEO_CAPS: {
> > > - const struct amdgpu_video_codecs *codecs;
> > > + static const struct amdgpu_video_codecs no_codecs
> > > = {0};
> >
> > No zero init for static variables please, that will raise you a
> > constant checker warning.
> >
> > > + const struct amdgpu_video_codecs *codecs =
> > > &no_codecs;
> > > ? struct drm_amdgpu_info_video_caps *caps;
> > > ? int r;
> > > ?
> > > diff --git a/drivers/gpu/drm/amd/amdgpu/cik.c
> > > b/drivers/gpu/drm/amd/amdgpu/cik.c
> > > index 9cd63b4177bf..b755238c2c3d 100644
> > > --- a/drivers/gpu/drm/amd/amdgpu/cik.c
> > > +++ b/drivers/gpu/drm/amd/amdgpu/cik.c
> > > @@ -130,6 +130,12 @@ static const struct amdgpu_video_codecs
> > > cik_video_codecs_decode = ?static int cik_query_video_codecs(struct
> > > amdgpu_device *adev, bool encode,
> > > ? ? const struct amdgpu_video_codecs
> > > **codecs)
> > > ?{
> > > + const enum amd_ip_block_type ip =
> > > + encode ? AMD_IP_BLOCK_TYPE_VCE :
> > > AMD_IP_BLOCK_TYPE_UVD;
> > > +
> > > + if (!amdgpu_device_ip_is_valid(adev, ip))
> > > + return 0;
> >
> > I'm wondering if returning EOPNOTSUPP is not more appropriate here
> > than returning an empty cappability list.
>
> I don't think so.
>
> Returning EOPNOTSUPP would indicate that the operation of querying the codec
> support is not supported, and not that the list of supported codecs is empty.
>
> >
> > Anyway setting the codecs list to empty in the caller is rather bad
> > coding style.
>
> Sure, I'll come up with a better way to do this.
>
> >
> > Regards,
> > Christian.
> >
> > > +
> > > ? switch (adev->asic_type) {
> > > ? case CHIP_BONAIRE:
> > > ? case CHIP_HAWAII:
> > > diff --git a/drivers/gpu/drm/amd/amdgpu/si.c
> > > b/drivers/gpu/drm/amd/amdgpu/si.c index e0f139de7991..9468c03bdb1b
> > > 100644
> > > --- a/drivers/gpu/drm/amd/amdgpu/si.c
> > > +++ b/drivers/gpu/drm/amd/amdgpu/si.c
> > > @@ -1003,6 +1003,12 @@ static const struct amdgpu_video_codecs
> > > hainan_video_codecs_decode = ?static int
> > > si_query_video_codecs(struct amdgpu_device *adev, bool encode,
> > > ? const struct amdgpu_video_codecs
> > > **codecs)
> > > ?{
> > > + const enum amd_ip_block_type ip =
> > > + encode ? AMD_IP_BLOCK_TYPE_VCE :
> > > AMD_IP_BLOCK_TYPE_UVD;
> > > +
> > > + if (!amdgpu_device_ip_is_valid(adev, ip))
> > > + return 0;
> > > +
> > > ? switch (adev->asic_type) {
> > > ? case CHIP_VERDE:
> > > ? case CHIP_TAHITI:
> > > diff --git a/drivers/gpu/drm/amd/amdgpu/vi.c
> > > b/drivers/gpu/drm/amd/amdgpu/vi.c index a611a7345125..f0e4193cf722
> > > 100644
> > > --- a/drivers/gpu/drm/amd/amdgpu/vi.c
> > > +++ b/drivers/gpu/drm/amd/amdgpu/vi.c
> > > @@ -256,6 +256,12 @@ static const struct amdgpu_video_codecs
> > > cz_video_codecs_decode = ?static int vi_query_video_codecs(struct
> > > amdgpu_device *adev, bool encode,
> > > ? const struct amdgpu_video_codecs
> > > **codecs)
> > > ?{
> > > + const enum amd_ip_block_type ip =
> > > + encode ? AMD_IP_BLOCK_TYPE_VCE :
> > > AMD_IP_BLOCK_TYPE_UVD;
> > > +
> > > + if (!amdgpu_device_ip_is_valid(adev, ip))
> > > + return 0;
> > > +
> > > ? switch (adev->asic_type) {
> > > ? case CHIP_TOPAZ:
> > > ? if (encode)
>
>
> ------------------------------
>
> Subject: Digest Footer
>
> _______________________________________________
> amd-gfx mailing list
> [email protected]
> https://lists.freedesktop.org/mailman/listinfo/amd-gfx
>
>
> ------------------------------
>
> End of amd-gfx Digest, Vol 113, Issue 529
> *****************************************