[AMD Official Use Only - AMD Internal Distribution Only]
-----Original Message-----
From: amd-gfx <[email protected]> On Behalf Of
[email protected]
Sent: Wednesday, October 29, 2025 6:54 PM
To: [email protected]
Subject: amd-gfx Digest, Vol 113, Issue 529
Send amd-gfx mailing list submissions to
[email protected]
To subscribe or unsubscribe via the World Wide Web, visit
https://lists.freedesktop.org/mailman/listinfo/amd-gfx
or, via email, send a message with subject or body 'help' to
[email protected]
You can reach the person managing the list at
[email protected]
When replying, please edit your Subject line so it is more specific than "Re:
Contents of amd-gfx digest..."
Today's Topics:
1. [PATCH 8/8] drm/amdgpu: add RAS bad page threshold handling
for PMFW manages eeprom (Tao Zhou)
2. [PATCH 6/8] drm/amdgpu: get RAS bad page address from MCA
address (Tao Zhou)
3. Re: [PATCH 05/14] drm/amdgpu/vce: Clear VCPU BO before
copying firmware to it (Timur Krist?f)
4. Re: [PATCH 07/14] drm/amdgpu/si,cik,vi: Verify IP block when
querying video codecs (Timur Krist?f)
----------------------------------------------------------------------
Message: 1
Date: Wed, 29 Oct 2025 18:38:02 +0800
From: Tao Zhou <[email protected]>
To: <[email protected]>
Cc: Tao Zhou <[email protected]>
Subject: [PATCH 8/8] drm/amdgpu: add RAS bad page threshold handling
for PMFW manages eeprom
Message-ID: <[email protected]>
Content-Type: text/plain
Check if bad page threshold is reached and take actions accordingly.
Signed-off-by: Tao Zhou <[email protected]>
---
.../gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c | 37 ++++++++++++++++---
1 file changed, 32 insertions(+), 5 deletions(-)
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c
b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c
index de7b268a9862..0acf45d5fc54 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c
@@ -896,6 +896,36 @@ int amdgpu_ras_eeprom_update_record_num(struct
amdgpu_ras_eeprom_control *contro
return ret;
}
+static int amdgpu_ras_smu_eeprom_append(struct
+amdgpu_ras_eeprom_control *control) {
+ struct amdgpu_device *adev = to_amdgpu_device(control);
+ struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
+
+ if (!amdgpu_ras_smu_eeprom_supported(adev))
+ return 0;
+
+ control->ras_num_bad_pages = con->bad_page_num;
+
+ if (amdgpu_bad_page_threshold != 0 &&
+ control->ras_num_bad_pages > con->bad_page_cnt_threshold) {
+ dev_warn(adev->dev,
+ "Saved bad pages %d reaches threshold value %d\n",
+ control->ras_num_bad_pages,
con->bad_page_cnt_threshold);
+
+ if (adev->cper.enabled &&
amdgpu_cper_generate_bp_threshold_record(adev))
+ dev_warn(adev->dev, "fail to generate bad page
threshold cper
+records\n");
+
+ if ((amdgpu_bad_page_threshold != -1) &&
+ (amdgpu_bad_page_threshold != -2))
+ con->is_rma = true;
+
+ /* ignore the -ENOTSUPP return value */
+ amdgpu_dpm_send_rma_reason(adev);
Patrick:
In pmfw managed eeprom, rma reason is not needed, so these two lines
should be removed.
Best Regards,
+ }
+
+ return 0;
+}
+
/**
* amdgpu_ras_eeprom_append -- append records to the EEPROM RAS table
* @control: pointer to control structure @@ -914,17 +944,14 @@ int
amdgpu_ras_eeprom_append(struct amdgpu_ras_eeprom_control *control,
const u32 num)
{
struct amdgpu_device *adev = to_amdgpu_device(control);
- struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
int res, i;
uint64_t nps = AMDGPU_NPS1_PARTITION_MODE;
if (!__is_ras_eeprom_supported(adev))
return 0;
- if (amdgpu_ras_smu_eeprom_supported(adev)) {
- control->ras_num_bad_pages = con->bad_page_num;
- return 0;
- }
+ if (amdgpu_ras_smu_eeprom_supported(adev))
+ return amdgpu_ras_smu_eeprom_append(control);
if (num == 0) {
dev_err(adev->dev, "will not append 0 records\n");
--
2.34.1
------------------------------
Message: 2
Date: Wed, 29 Oct 2025 18:38:00 +0800
From: Tao Zhou <[email protected]>
To: <[email protected]>
Cc: Tao Zhou <[email protected]>
Subject: [PATCH 6/8] drm/amdgpu: get RAS bad page address from MCA
address
Message-ID: <[email protected]>
Content-Type: text/plain
Instead of from physical address.
v2: add comment to make the code more readable
Signed-off-by: Tao Zhou <[email protected]>
---
drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 15 ++++++++++++---
drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c | 4 ++--
2 files changed, 14 insertions(+), 5 deletions(-)
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
index 23d421b8ba54..ad197486d9e7 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
@@ -3010,8 +3010,13 @@ static int amdgpu_ras_mca2pa_by_idx(struct amdgpu_device
*adev,
addr_in.ma.err_addr = bps->address;
addr_in.ma.socket_id = socket;
addr_in.ma.ch_inst = bps->mem_channel;
- /* tell RAS TA the node instance is not used */
- addr_in.ma.node_inst = TA_RAS_INV_NODE;
+ if (!amdgpu_ras_smu_eeprom_supported(adev)) {
+ /* tell RAS TA the node instance is not used */
+ addr_in.ma.node_inst = TA_RAS_INV_NODE;
+ } else {
+ addr_in.ma.umc_inst = bps->mcumc_id;
+ addr_in.ma.node_inst = bps->cu;
+ }
if (adev->umc.ras && adev->umc.ras->convert_ras_err_addr)
ret = adev->umc.ras->convert_ras_err_addr(adev, err_data,
@@ -3158,7 +3163,11 @@ static int __amdgpu_ras_convert_rec_from_rom(struct
amdgpu_device *adev,
save_nps = (bps->retired_page >> UMC_NPS_SHIFT) & UMC_NPS_MASK;
bps->retired_page &= ~(UMC_NPS_MASK << UMC_NPS_SHIFT);
} else {
- save_nps = nps;
+ /* if pmfw manages eeprom, save_nps is not stored on eeprom,
+ * we should always convert mca address into physical address,
+ * make save_nps different from nps
+ */
+ save_nps = nps + 1;
}
if (save_nps == nps) {
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c
b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c
index 3bf633158fa2..511c5882b37e 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c
@@ -1012,10 +1012,10 @@ int amdgpu_ras_eeprom_read_idx(struct
amdgpu_ras_eeprom_control *control,
record[i - rec_idx].retired_page = 0x1ULL;
record[i - rec_idx].ts = ts;
record[i - rec_idx].err_type =
AMDGPU_RAS_EEPROM_ERR_NON_RECOVERABLE;
- record[i - rec_idx].cu = 0;
if (adev->umc.ras->mca_ipid_parse)
- adev->umc.ras->mca_ipid_parse(adev, ipid, NULL,
+ adev->umc.ras->mca_ipid_parse(adev, ipid,
+ (uint32_t *)&(record[i - rec_idx].cu),
(uint32_t *)&(record[i - rec_idx].mem_channel),
(uint32_t *)&(record[i - rec_idx].mcumc_id),
NULL);
else
--
2.34.1
------------------------------
Message: 3
Date: Wed, 29 Oct 2025 11:48:41 +0100
From: Timur Krist?f <[email protected]>
To: Christian K?nig <[email protected]>,
[email protected], Alex Deucher
<[email protected]>, Alexandre Demers
<[email protected]>, Rodrigo Siqueira <[email protected]>
Subject: Re: [PATCH 05/14] drm/amdgpu/vce: Clear VCPU BO before
copying firmware to it
Message-ID: <[email protected]>
Content-Type: text/plain; charset="UTF-8"
On Wed, 2025-10-29 at 11:19 +0100, Christian K?nig wrote:
> On 10/28/25 23:06, Timur Krist?f wrote:
> > The VCPU BO doesn't only contain the VCE firmware but also other
> > ranges that the VCE uses for its stack and data. Let's initialize
> > this to zero to avoid having garbage in the VCPU BO.
>
> Absolutely clear NAK.
>
> This is intentionally not initialized on resume to avoid breaking
> encode sessions which existed before suspend.
How can there be encode sessions from before suspend?
I think that there can't be.
As far as I see, before suspend we wait for the VCE to go idle, meaning
that we wait for all pending work to finish.
amdgpu_vce_suspend has a comment which says:
suspending running encoding sessions isn't supported
> Why exactly is that an issue?
We need to clear at least some of the BO for the VCE1 firmware
validation mechanism. This is done in a memset in vce_v1_0_load_fw in
the old radeon driver.
Also I think it's a good idea to avoid having garbage in the VCPU BO.
> The VCE FW BO should be cleared to zero after initial allocation?
To clarify, are you suggesting that I move the memset to after the BO
creation, and then never clear it again? Or are you saying that
amdgpu_bo_create_reserved already clears the BO?
>
> Regards,
> Christian.
>
> >
> > Fixes: d38ceaf99ed0 ("drm/amdgpu: add core driver (v4)")
> > Signed-off-by: Timur Krist?f <[email protected]>
> > ---
> > ?drivers/gpu/drm/amd/amdgpu/amdgpu_vce.c | 1 +
> > ?1 file changed, 1 insertion(+)
> >
> > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vce.c
> > b/drivers/gpu/drm/amd/amdgpu/amdgpu_vce.c
> > index b9060bcd4806..eaa06dbef5c4 100644
> > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vce.c
> > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vce.c
> > @@ -310,6 +310,7 @@ int amdgpu_vce_resume(struct amdgpu_device
> > *adev)
> > ? offset = le32_to_cpu(hdr->ucode_array_offset_bytes);
> > ?
> > ? if (drm_dev_enter(adev_to_drm(adev), &idx)) {
> > + memset32(cpu_addr, 0, amdgpu_bo_size(adev-
> > >vce.vcpu_bo) / 4);
> > ? memcpy_toio(cpu_addr, adev->vce.fw->data + offset,
> > ? ??? adev->vce.fw->size - offset);
> > ? drm_dev_exit(idx);
------------------------------
Message: 4
Date: Wed, 29 Oct 2025 11:54:14 +0100
From: Timur Krist?f <[email protected]>
To: Christian K?nig <[email protected]>,
[email protected], Alex Deucher
<[email protected]>, Alexandre Demers
<[email protected]>, Rodrigo Siqueira <[email protected]>
Subject: Re: [PATCH 07/14] drm/amdgpu/si,cik,vi: Verify IP block when
querying video codecs
Message-ID: <[email protected]>
Content-Type: text/plain; charset="UTF-8"
On Wed, 2025-10-29 at 11:35 +0100, Christian K?nig wrote:
>
>
> On 10/28/25 23:06, Timur Krist?f wrote:
> > Some harvested chips may not have any IP blocks,
> > or we may not have the firmware for the IP blocks.
> > In these cases, the query should return that no video
> > codec is supported.
> >
> > Signed-off-by: Timur Krist?f <[email protected]>
> > ---
> > ?drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c | 3 ++-
> > ?drivers/gpu/drm/amd/amdgpu/cik.c??????? | 6 ++++++
> > ?drivers/gpu/drm/amd/amdgpu/si.c???????? | 6 ++++++
> > ?drivers/gpu/drm/amd/amdgpu/vi.c???????? | 6 ++++++
> > ?4 files changed, 20 insertions(+), 1 deletion(-)
> >
> > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c
> > b/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c
> > index b3e6b3fcdf2c..42b5da59d00f 100644
> > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c
> > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c
> > @@ -1263,7 +1263,8 @@ int amdgpu_info_ioctl(struct drm_device *dev,
> > void *data, struct drm_file *filp)
> > ? -EFAULT : 0;
> > ? }
> > ? case AMDGPU_INFO_VIDEO_CAPS: {
> > - const struct amdgpu_video_codecs *codecs;
> > + static const struct amdgpu_video_codecs no_codecs
> > = {0};
>
> No zero init for static variables please, that will raise you a
> constant checker warning.
>
> > + const struct amdgpu_video_codecs *codecs =
> > &no_codecs;
> > ? struct drm_amdgpu_info_video_caps *caps;
> > ? int r;
> > ?
> > diff --git a/drivers/gpu/drm/amd/amdgpu/cik.c
> > b/drivers/gpu/drm/amd/amdgpu/cik.c
> > index 9cd63b4177bf..b755238c2c3d 100644
> > --- a/drivers/gpu/drm/amd/amdgpu/cik.c
> > +++ b/drivers/gpu/drm/amd/amdgpu/cik.c
> > @@ -130,6 +130,12 @@ static const struct amdgpu_video_codecs
> > cik_video_codecs_decode =
> > ?static int cik_query_video_codecs(struct amdgpu_device *adev, bool
> > encode,
> > ? ? const struct amdgpu_video_codecs
> > **codecs)
> > ?{
> > + const enum amd_ip_block_type ip =
> > + encode ? AMD_IP_BLOCK_TYPE_VCE :
> > AMD_IP_BLOCK_TYPE_UVD;
> > +
> > + if (!amdgpu_device_ip_is_valid(adev, ip))
> > + return 0;
>
> I'm wondering if returning EOPNOTSUPP is not more appropriate here
> than returning an empty cappability list.
I don't think so.
Returning EOPNOTSUPP would indicate that the operation of querying the
codec support is not supported, and not that the list of supported
codecs is empty.
>
> Anyway setting the codecs list to empty in the caller is rather bad
> coding style.
Sure, I'll come up with a better way to do this.
>
> Regards,
> Christian.
>
> > +
> > ? switch (adev->asic_type) {
> > ? case CHIP_BONAIRE:
> > ? case CHIP_HAWAII:
> > diff --git a/drivers/gpu/drm/amd/amdgpu/si.c
> > b/drivers/gpu/drm/amd/amdgpu/si.c
> > index e0f139de7991..9468c03bdb1b 100644
> > --- a/drivers/gpu/drm/amd/amdgpu/si.c
> > +++ b/drivers/gpu/drm/amd/amdgpu/si.c
> > @@ -1003,6 +1003,12 @@ static const struct amdgpu_video_codecs
> > hainan_video_codecs_decode =
> > ?static int si_query_video_codecs(struct amdgpu_device *adev, bool
> > encode,
> > ? const struct amdgpu_video_codecs
> > **codecs)
> > ?{
> > + const enum amd_ip_block_type ip =
> > + encode ? AMD_IP_BLOCK_TYPE_VCE :
> > AMD_IP_BLOCK_TYPE_UVD;
> > +
> > + if (!amdgpu_device_ip_is_valid(adev, ip))
> > + return 0;
> > +
> > ? switch (adev->asic_type) {
> > ? case CHIP_VERDE:
> > ? case CHIP_TAHITI:
> > diff --git a/drivers/gpu/drm/amd/amdgpu/vi.c
> > b/drivers/gpu/drm/amd/amdgpu/vi.c
> > index a611a7345125..f0e4193cf722 100644
> > --- a/drivers/gpu/drm/amd/amdgpu/vi.c
> > +++ b/drivers/gpu/drm/amd/amdgpu/vi.c
> > @@ -256,6 +256,12 @@ static const struct amdgpu_video_codecs
> > cz_video_codecs_decode =
> > ?static int vi_query_video_codecs(struct amdgpu_device *adev, bool
> > encode,
> > ? const struct amdgpu_video_codecs
> > **codecs)
> > ?{
> > + const enum amd_ip_block_type ip =
> > + encode ? AMD_IP_BLOCK_TYPE_VCE :
> > AMD_IP_BLOCK_TYPE_UVD;
> > +
> > + if (!amdgpu_device_ip_is_valid(adev, ip))
> > + return 0;
> > +
> > ? switch (adev->asic_type) {
> > ? case CHIP_TOPAZ:
> > ? if (encode)
------------------------------
Subject: Digest Footer
_______________________________________________
amd-gfx mailing list
[email protected]
https://lists.freedesktop.org/mailman/listinfo/amd-gfx
------------------------------
End of amd-gfx Digest, Vol 113, Issue 529
*****************************************