Re: [PATCH v2] drm/amgpu: Check return value of amdgpu_device_baco_enter/exit

2024-02-29 Thread Lazar, Lijo



On 2/29/2024 4:40 PM, Ma, Jun wrote:
> Hi Lijo,
> 
> On 2/29/2024 3:33 PM, Lazar, Lijo wrote:
>>
>>
>> On 2/29/2024 11:49 AM, Ma Jun wrote:
>>> Check return value of amdgpu_device_baco_enter/exit and print
>>> warning message because these errors may cause runtime resume failure
>>>
>>> Signed-off-by: Ma Jun 
>>> ---
>>>  drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 29 --
>>>  1 file changed, 22 insertions(+), 7 deletions(-)
>>>
>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c 
>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>>> index e68bd6f8a6a4..4928b588cd12 100644
>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>>> @@ -6000,15 +6000,24 @@ int amdgpu_device_baco_enter(struct drm_device *dev)
>>>  {
>>> struct amdgpu_device *adev = drm_to_adev(dev);
>>> struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);
>>> +   int ret = 0;
>>>  
>>> -   if (!amdgpu_device_supports_baco(dev))
>>> -   return -ENOTSUPP;
>>> +   if (!amdgpu_device_supports_baco(dev)) {
>>> +   ret = -ENOTSUPP;
>>> +   goto baco_error;
>>> +   }
>>>  
>>> if (ras && adev->ras_enabled &&
>>> adev->nbio.funcs->enable_doorbell_interrupt)
>>> adev->nbio.funcs->enable_doorbell_interrupt(adev, false);
>>>  
>>> -   return amdgpu_dpm_baco_enter(adev);
>>> +   ret = amdgpu_dpm_baco_enter(adev);
>>> +
>>> +baco_error:
>>> +   if (ret)
>>> +   dev_warn(adev->dev, "warning: device fails to enter baco. 
>>> ret=%d\n", ret);
>>> +
>>
>> This doesn't look like a real case, moreover the warning message is
> 
> In fact this is a case that actually happened.
> 
> When amdgpu_device_supports_baco returns with error because of some reasons,
> device will enter runtime suspend without calling the  
> amdgpu_device_baco_enter()
> and without any warning message being printed. Then, device is usually fails
> to resume.
> So, I add this message as a warning to help us find the real reason.
> 
>> misleading. If the device doesn't support baco, driver is not supposed
>> to call it for runpm purpose -
>> https://elixir.bootlin.com/linux/latest/source/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c#L2664
>>
> I changed this code in another patch.
> https://lists.freedesktop.org/archives/amd-gfx/2024-February/104929.html
> 

Baco support checks are all based on cached software variables and not
based on hardware interactions. So this is very unlikely. It might also
be that driver really called baco entry, but even before baco entry
happened a device usage is detected for which resume is called.

One other way to really check this is to check if baco exit is called
when the software state is not really SMU_BACO_STATE_ENTER (applicable
only for swsmu). Since baco entry is driver triggered, the imbalance
shouldn't happen.

Thanks,
Lijo

> Regards,
> Ma Jun
> 
>> Thanks,
>> Lijo
>>
>>> +   return ret;
>>>  }
>>>  
>>>  int amdgpu_device_baco_exit(struct drm_device *dev)
>>> @@ -6017,12 +6026,14 @@ int amdgpu_device_baco_exit(struct drm_device *dev)
>>> struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);
>>> int ret = 0;
>>>  
>>> -   if (!amdgpu_device_supports_baco(dev))
>>> -   return -ENOTSUPP;
>>> +   if (!amdgpu_device_supports_baco(dev)) {
>>> +   ret = -ENOTSUPP;
>>> +   goto baco_error;
>>> +   }
>>>  
>>> ret = amdgpu_dpm_baco_exit(adev);
>>> if (ret)
>>> -   return ret;
>>> +   goto baco_error;
>>>  
>>> if (ras && adev->ras_enabled &&
>>> adev->nbio.funcs->enable_doorbell_interrupt)
>>> @@ -6032,7 +6043,11 @@ int amdgpu_device_baco_exit(struct drm_device *dev)
>>> adev->nbio.funcs->clear_doorbell_interrupt)
>>> adev->nbio.funcs->clear_doorbell_interrupt(adev);
>>>  
>>> -   return 0;
>>> +baco_error:
>>> +   if (ret)
>>> +   dev_warn(adev->dev, "warning: device fails to exit from baco. 
>>> ret=%d\n", ret);
>>> +
>>> +   return ret;
>>>  }
>>>  
>>>  /**


Re: [PATCH v2] drm/amgpu: Check return value of amdgpu_device_baco_enter/exit

2024-02-29 Thread Ma, Jun
Hi Lijo,

On 2/29/2024 3:33 PM, Lazar, Lijo wrote:
> 
> 
> On 2/29/2024 11:49 AM, Ma Jun wrote:
>> Check return value of amdgpu_device_baco_enter/exit and print
>> warning message because these errors may cause runtime resume failure
>>
>> Signed-off-by: Ma Jun 
>> ---
>>  drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 29 --
>>  1 file changed, 22 insertions(+), 7 deletions(-)
>>
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c 
>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>> index e68bd6f8a6a4..4928b588cd12 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>> @@ -6000,15 +6000,24 @@ int amdgpu_device_baco_enter(struct drm_device *dev)
>>  {
>>  struct amdgpu_device *adev = drm_to_adev(dev);
>>  struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);
>> +int ret = 0;
>>  
>> -if (!amdgpu_device_supports_baco(dev))
>> -return -ENOTSUPP;
>> +if (!amdgpu_device_supports_baco(dev)) {
>> +ret = -ENOTSUPP;
>> +goto baco_error;
>> +}
>>  
>>  if (ras && adev->ras_enabled &&
>>  adev->nbio.funcs->enable_doorbell_interrupt)
>>  adev->nbio.funcs->enable_doorbell_interrupt(adev, false);
>>  
>> -return amdgpu_dpm_baco_enter(adev);
>> +ret = amdgpu_dpm_baco_enter(adev);
>> +
>> +baco_error:
>> +if (ret)
>> +dev_warn(adev->dev, "warning: device fails to enter baco. 
>> ret=%d\n", ret);
>> +
> 
> This doesn't look like a real case, moreover the warning message is

In fact this is a case that actually happened.

When amdgpu_device_supports_baco returns with error because of some reasons,
device will enter runtime suspend without calling the  
amdgpu_device_baco_enter()
and without any warning message being printed. Then, device is usually fails
to resume.
So, I add this message as a warning to help us find the real reason.

> misleading. If the device doesn't support baco, driver is not supposed
> to call it for runpm purpose -
> https://elixir.bootlin.com/linux/latest/source/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c#L2664
> 
I changed this code in another patch.
https://lists.freedesktop.org/archives/amd-gfx/2024-February/104929.html

Regards,
Ma Jun

> Thanks,
> Lijo
> 
>> +return ret;
>>  }
>>  
>>  int amdgpu_device_baco_exit(struct drm_device *dev)
>> @@ -6017,12 +6026,14 @@ int amdgpu_device_baco_exit(struct drm_device *dev)
>>  struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);
>>  int ret = 0;
>>  
>> -if (!amdgpu_device_supports_baco(dev))
>> -return -ENOTSUPP;
>> +if (!amdgpu_device_supports_baco(dev)) {
>> +ret = -ENOTSUPP;
>> +goto baco_error;
>> +}
>>  
>>  ret = amdgpu_dpm_baco_exit(adev);
>>  if (ret)
>> -return ret;
>> +goto baco_error;
>>  
>>  if (ras && adev->ras_enabled &&
>>  adev->nbio.funcs->enable_doorbell_interrupt)
>> @@ -6032,7 +6043,11 @@ int amdgpu_device_baco_exit(struct drm_device *dev)
>>  adev->nbio.funcs->clear_doorbell_interrupt)
>>  adev->nbio.funcs->clear_doorbell_interrupt(adev);
>>  
>> -return 0;
>> +baco_error:
>> +if (ret)
>> +dev_warn(adev->dev, "warning: device fails to exit from baco. 
>> ret=%d\n", ret);
>> +
>> +return ret;
>>  }
>>  
>>  /**


Re: [PATCH v2] drm/amgpu: Check return value of amdgpu_device_baco_enter/exit

2024-02-28 Thread Lazar, Lijo



On 2/29/2024 11:49 AM, Ma Jun wrote:
> Check return value of amdgpu_device_baco_enter/exit and print
> warning message because these errors may cause runtime resume failure
> 
> Signed-off-by: Ma Jun 
> ---
>  drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 29 --
>  1 file changed, 22 insertions(+), 7 deletions(-)
> 
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c 
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> index e68bd6f8a6a4..4928b588cd12 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> @@ -6000,15 +6000,24 @@ int amdgpu_device_baco_enter(struct drm_device *dev)
>  {
>   struct amdgpu_device *adev = drm_to_adev(dev);
>   struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);
> + int ret = 0;
>  
> - if (!amdgpu_device_supports_baco(dev))
> - return -ENOTSUPP;
> + if (!amdgpu_device_supports_baco(dev)) {
> + ret = -ENOTSUPP;
> + goto baco_error;
> + }
>  
>   if (ras && adev->ras_enabled &&
>   adev->nbio.funcs->enable_doorbell_interrupt)
>   adev->nbio.funcs->enable_doorbell_interrupt(adev, false);
>  
> - return amdgpu_dpm_baco_enter(adev);
> + ret = amdgpu_dpm_baco_enter(adev);
> +
> +baco_error:
> + if (ret)
> + dev_warn(adev->dev, "warning: device fails to enter baco. 
> ret=%d\n", ret);
> +

This doesn't look like a real case, moreover the warning message is
misleading. If the device doesn't support baco, driver is not supposed
to call it for runpm purpose -
https://elixir.bootlin.com/linux/latest/source/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c#L2664

Thanks,
Lijo

> + return ret;
>  }
>  
>  int amdgpu_device_baco_exit(struct drm_device *dev)
> @@ -6017,12 +6026,14 @@ int amdgpu_device_baco_exit(struct drm_device *dev)
>   struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);
>   int ret = 0;
>  
> - if (!amdgpu_device_supports_baco(dev))
> - return -ENOTSUPP;
> + if (!amdgpu_device_supports_baco(dev)) {
> + ret = -ENOTSUPP;
> + goto baco_error;
> + }
>  
>   ret = amdgpu_dpm_baco_exit(adev);
>   if (ret)
> - return ret;
> + goto baco_error;
>  
>   if (ras && adev->ras_enabled &&
>   adev->nbio.funcs->enable_doorbell_interrupt)
> @@ -6032,7 +6043,11 @@ int amdgpu_device_baco_exit(struct drm_device *dev)
>   adev->nbio.funcs->clear_doorbell_interrupt)
>   adev->nbio.funcs->clear_doorbell_interrupt(adev);
>  
> - return 0;
> +baco_error:
> + if (ret)
> + dev_warn(adev->dev, "warning: device fails to exit from baco. 
> ret=%d\n", ret);
> +
> + return ret;
>  }
>  
>  /**


[PATCH v2] drm/amgpu: Check return value of amdgpu_device_baco_enter/exit

2024-02-28 Thread Ma Jun
Check return value of amdgpu_device_baco_enter/exit and print
warning message because these errors may cause runtime resume failure

Signed-off-by: Ma Jun 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 29 --
 1 file changed, 22 insertions(+), 7 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
index e68bd6f8a6a4..4928b588cd12 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -6000,15 +6000,24 @@ int amdgpu_device_baco_enter(struct drm_device *dev)
 {
struct amdgpu_device *adev = drm_to_adev(dev);
struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);
+   int ret = 0;
 
-   if (!amdgpu_device_supports_baco(dev))
-   return -ENOTSUPP;
+   if (!amdgpu_device_supports_baco(dev)) {
+   ret = -ENOTSUPP;
+   goto baco_error;
+   }
 
if (ras && adev->ras_enabled &&
adev->nbio.funcs->enable_doorbell_interrupt)
adev->nbio.funcs->enable_doorbell_interrupt(adev, false);
 
-   return amdgpu_dpm_baco_enter(adev);
+   ret = amdgpu_dpm_baco_enter(adev);
+
+baco_error:
+   if (ret)
+   dev_warn(adev->dev, "warning: device fails to enter baco. 
ret=%d\n", ret);
+
+   return ret;
 }
 
 int amdgpu_device_baco_exit(struct drm_device *dev)
@@ -6017,12 +6026,14 @@ int amdgpu_device_baco_exit(struct drm_device *dev)
struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);
int ret = 0;
 
-   if (!amdgpu_device_supports_baco(dev))
-   return -ENOTSUPP;
+   if (!amdgpu_device_supports_baco(dev)) {
+   ret = -ENOTSUPP;
+   goto baco_error;
+   }
 
ret = amdgpu_dpm_baco_exit(adev);
if (ret)
-   return ret;
+   goto baco_error;
 
if (ras && adev->ras_enabled &&
adev->nbio.funcs->enable_doorbell_interrupt)
@@ -6032,7 +6043,11 @@ int amdgpu_device_baco_exit(struct drm_device *dev)
adev->nbio.funcs->clear_doorbell_interrupt)
adev->nbio.funcs->clear_doorbell_interrupt(adev);
 
-   return 0;
+baco_error:
+   if (ret)
+   dev_warn(adev->dev, "warning: device fails to exit from baco. 
ret=%d\n", ret);
+
+   return ret;
 }
 
 /**
-- 
2.34.1