On 8/6/2025 8:50 PM, Deucher, Alexander wrote:
> [AMD Official Use Only - AMD Internal Distribution Only]
>
>> -----Original Message-----
>> From: Lazar, Lijo <lijo.la...@amd.com>
>> Sent: Wednesday, August 6, 2025 11:17 AM
>> To: Deucher, Alexander <alexander.deuc...@amd.com>; amd-
>> g...@lists.freedesktop.org
>> Cc: sta...@vger.kernel.org
>> Subject: Re: [PATCH] drm/amdgpu/discovery: fix fw based ip discovery
>>
>>
>>
>> On 7/30/2025 9:29 PM, Alex Deucher wrote:
>>> We only need the fw based discovery table for sysfs. No need to parse
>>> it. Additionally parsing some of the board specific tables may result
>>> in incorrect data on some boards.
>>> just load the binary and don't parse it on those boards.
>>>
>>> Closes: https://gitlab.freedesktop.org/drm/amd/-/issues/4441
>>> Fixes: 80a0e8282933 ("drm/amdgpu/discovery: optionally use fw based ip
>>> discovery")
>>> Cc: sta...@vger.kernel.org
>>> Signed-off-by: Alex Deucher <alexander.deuc...@amd.com>
>>
>> One generic question - if discovery content is completely ignored by driver,
>> how
>> external tool using sysfs could consume the data? Wouldn't there be a
>> mismatch in
>> the config?
>
> The IP register offsets are what tools use sysfs for. It's possible some of
> the other data is invalid (e.g., the harvest tables) because they are not
> coming from IFWI in this case.
>
Thanks for the info. Probably, keeping the same comment in the commit
description or code helps - discovery binary file is loaded to provide
sysfs interface to get valid IP register offsets and data from other
tables are not reliable.
Thanks,
Lijo
> Alex
>
>
>>
>> Thanks,
>> Lijo
>>
>>> ---
>>> drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 5 +-
>>> drivers/gpu/drm/amd/amdgpu/amdgpu_discovery.c | 72
>>> ++++++++++---------
>>> 2 files changed, 41 insertions(+), 36 deletions(-)
>>>
>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>>> index efe98ffb679a4..b2538cff222ce 100644
>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>>> @@ -2570,9 +2570,6 @@ static int
>>> amdgpu_device_parse_gpu_info_fw(struct amdgpu_device *adev)
>>>
>>> adev->firmware.gpu_info_fw = NULL;
>>>
>>> - if (adev->mman.discovery_bin)
>>> - return 0;
>>> -
>>> switch (adev->asic_type) {
>>> default:
>>> return 0;
>>> @@ -2594,6 +2591,8 @@ static int amdgpu_device_parse_gpu_info_fw(struct
>> amdgpu_device *adev)
>>> chip_name = "arcturus";
>>> break;
>>> case CHIP_NAVI12:
>>> + if (adev->mman.discovery_bin)
>>> + return 0;
>>> chip_name = "navi12";
>>> break;
>>> }
>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_discovery.c
>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_discovery.c
>>> index 81b3443c8d7f4..27bd7659961e8 100644
>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_discovery.c
>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_discovery.c
>>> @@ -2555,40 +2555,11 @@ int amdgpu_discovery_set_ip_blocks(struct
>>> amdgpu_device *adev)
>>>
>>> switch (adev->asic_type) {
>>> case CHIP_VEGA10:
>>> - case CHIP_VEGA12:
>>> - case CHIP_RAVEN:
>>> - case CHIP_VEGA20:
>>> - case CHIP_ARCTURUS:
>>> - case CHIP_ALDEBARAN:
>>> - /* this is not fatal. We have a fallback below
>>> - * if the new firmwares are not present. some of
>>> - * this will be overridden below to keep things
>>> - * consistent with the current behavior.
>>> + /* This is not fatal. We only need the discovery
>>> + * binary for sysfs. We don't need it for a
>>> + * functional system.
>>> */
>>> - r = amdgpu_discovery_reg_base_init(adev);
>>> - if (!r) {
>>> - amdgpu_discovery_harvest_ip(adev);
>>> - amdgpu_discovery_get_gfx_info(adev);
>>> - amdgpu_discovery_get_mall_info(adev);
>>> - amdgpu_discovery_get_vcn_info(adev);
>>> - }
>>> - break;
>>> - default:
>>> - r = amdgpu_discovery_reg_base_init(adev);
>>> - if (r) {
>>> - drm_err(&adev->ddev, "discovery failed: %d\n", r);
>>> - return r;
>>> - }
>>> -
>>> - amdgpu_discovery_harvest_ip(adev);
>>> - amdgpu_discovery_get_gfx_info(adev);
>>> - amdgpu_discovery_get_mall_info(adev);
>>> - amdgpu_discovery_get_vcn_info(adev);
>>> - break;
>>> - }
>>> -
>>> - switch (adev->asic_type) {
>>> - case CHIP_VEGA10:
>>> + amdgpu_discovery_init(adev);
>>> vega10_reg_base_init(adev);
>>> adev->sdma.num_instances = 2;
>>> adev->gmc.num_umc = 4;
>>> @@ -2611,6 +2582,11 @@ int amdgpu_discovery_set_ip_blocks(struct
>> amdgpu_device *adev)
>>> adev->ip_versions[DCI_HWIP][0] = IP_VERSION(12, 0, 0);
>>> break;
>>> case CHIP_VEGA12:
>>> + /* This is not fatal. We only need the discovery
>>> + * binary for sysfs. We don't need it for a
>>> + * functional system.
>>> + */
>>> + amdgpu_discovery_init(adev);
>>> vega10_reg_base_init(adev);
>>> adev->sdma.num_instances = 2;
>>> adev->gmc.num_umc = 4;
>>> @@ -2633,6 +2609,11 @@ int amdgpu_discovery_set_ip_blocks(struct
>> amdgpu_device *adev)
>>> adev->ip_versions[DCI_HWIP][0] = IP_VERSION(12, 0, 1);
>>> break;
>>> case CHIP_RAVEN:
>>> + /* This is not fatal. We only need the discovery
>>> + * binary for sysfs. We don't need it for a
>>> + * functional system.
>>> + */
>>> + amdgpu_discovery_init(adev);
>>> vega10_reg_base_init(adev);
>>> adev->sdma.num_instances = 1;
>>> adev->vcn.num_vcn_inst = 1;
>>> @@ -2674,6 +2655,11 @@ int amdgpu_discovery_set_ip_blocks(struct
>> amdgpu_device *adev)
>>> }
>>> break;
>>> case CHIP_VEGA20:
>>> + /* This is not fatal. We only need the discovery
>>> + * binary for sysfs. We don't need it for a
>>> + * functional system.
>>> + */
>>> + amdgpu_discovery_init(adev);
>>> vega20_reg_base_init(adev);
>>> adev->sdma.num_instances = 2;
>>> adev->gmc.num_umc = 8;
>>> @@ -2697,6 +2683,11 @@ int amdgpu_discovery_set_ip_blocks(struct
>> amdgpu_device *adev)
>>> adev->ip_versions[DCI_HWIP][0] = IP_VERSION(12, 1, 0);
>>> break;
>>> case CHIP_ARCTURUS:
>>> + /* This is not fatal. We only need the discovery
>>> + * binary for sysfs. We don't need it for a
>>> + * functional system.
>>> + */
>>> + amdgpu_discovery_init(adev);
>>> arct_reg_base_init(adev);
>>> adev->sdma.num_instances = 8;
>>> adev->vcn.num_vcn_inst = 2;
>>> @@ -2725,6 +2716,11 @@ int amdgpu_discovery_set_ip_blocks(struct
>> amdgpu_device *adev)
>>> adev->ip_versions[UVD_HWIP][1] = IP_VERSION(2, 5, 0);
>>> break;
>>> case CHIP_ALDEBARAN:
>>> + /* This is not fatal. We only need the discovery
>>> + * binary for sysfs. We don't need it for a
>>> + * functional system.
>>> + */
>>> + amdgpu_discovery_init(adev);
>>> aldebaran_reg_base_init(adev);
>>> adev->sdma.num_instances = 5;
>>> adev->vcn.num_vcn_inst = 2;
>>> @@ -2751,6 +2747,16 @@ int amdgpu_discovery_set_ip_blocks(struct
>> amdgpu_device *adev)
>>> adev->ip_versions[XGMI_HWIP][0] = IP_VERSION(6, 1, 0);
>>> break;
>>> default:
>>> + r = amdgpu_discovery_reg_base_init(adev);
>>> + if (r) {
>>> + drm_err(&adev->ddev, "discovery failed: %d\n", r);
>>> + return r;
>>> + }
>>> +
>>> + amdgpu_discovery_harvest_ip(adev);
>>> + amdgpu_discovery_get_gfx_info(adev);
>>> + amdgpu_discovery_get_mall_info(adev);
>>> + amdgpu_discovery_get_vcn_info(adev);
>>> break;
>>> }
>>>
>