On 01/07/2019 11:16 AM, Liu, Shaoyun wrote:
> I think it's reasonable to use the hive  specific lock for hive  specific 
> functions.
> The changes is acked-by  Shaoyun.liu < [email protected]>
>
> -----Original Message-----
> From: amd-gfx <[email protected]> On Behalf Of StDenis, 
> Tom
> Sent: Monday, January 7, 2019 10:16 AM
> To: [email protected]
> Cc: StDenis, Tom <[email protected]>
> Subject: [PATCH] add missing mutex lock to amdgpu_get_xgmi_hive() (v2)
>
> v2: Move locks around in other functions so that this function can stand on 
> its own.  Also only hold the hive specific lock for add/remove device instead 
> of the driver global lock so you can't add/remove devices in parallel from 
> one hive.
>
> Signed-off-by: Tom St Denis <[email protected]>
> ---
>   drivers/gpu/drm/amd/amdgpu/amdgpu_device.c |  2 +-
>   drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c   | 36 ++++++++++++++--------
>   drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.h   |  2 +-
>   3 files changed, 25 insertions(+), 15 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c 
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> index 39d5d058b2c7..13d8e2ad2f7a 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> @@ -3525,7 +3525,7 @@ int amdgpu_device_gpu_recover(struct amdgpu_device 
> *adev,
>        * by different nodes. No point also since the one node already 
> executing
>        * reset will also reset all the other nodes in the hive.
>        */
> -     hive = amdgpu_get_xgmi_hive(adev);
> +     hive = amdgpu_get_xgmi_hive(adev, 0);
>       if (hive && adev->gmc.xgmi.num_physical_nodes > 1 &&
>           !mutex_trylock(&hive->hive_lock))
>               return 0;

Let's say i have device 0 in hive A and it just got a gpu reset and at 
the same time device 1 is being added to same have though 
amdgpu_xgmi_add_device, hive->hive_lock is acquired by this new device 
being added and so gpu reset for device 0 bails out on 
'!mutex_trylock(&hive->hive_lock))' without completing the reset.
Also in general i feel a bit uncomfortable about the confusing 
acquisition scheme in the function and  the fact that you take the 
hive->hive_lock inside amdgpu_get_xgmi_hive but release is still outside 
of the function.

Andrey

> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c 
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c
> index 8a8bc60cb6b4..ebf50809485f 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c
> @@ -40,26 +40,39 @@ void *amdgpu_xgmi_hive_try_lock(struct amdgpu_hive_info 
> *hive)
>       return &hive->device_list;
>   }
>   
> -struct amdgpu_hive_info *amdgpu_get_xgmi_hive(struct amdgpu_device *adev)
> +struct amdgpu_hive_info *amdgpu_get_xgmi_hive(struct amdgpu_device
> +*adev, int lock)
>   {
>       int i;
>       struct amdgpu_hive_info *tmp;
>   
>       if (!adev->gmc.xgmi.hive_id)
>               return NULL;
> +
> +     mutex_lock(&xgmi_mutex);
> +
>       for (i = 0 ; i < hive_count; ++i) {
>               tmp = &xgmi_hives[i];
> -             if (tmp->hive_id == adev->gmc.xgmi.hive_id)
> +             if (tmp->hive_id == adev->gmc.xgmi.hive_id) {
> +                     if (lock)
> +                             mutex_lock(&tmp->hive_lock);
> +                     mutex_unlock(&xgmi_mutex);
>                       return tmp;
> +             }
>       }
> -     if (i >= AMDGPU_MAX_XGMI_HIVE)
> +     if (i >= AMDGPU_MAX_XGMI_HIVE) {
> +             mutex_unlock(&xgmi_mutex);
>               return NULL;
> +     }
>   
>       /* initialize new hive if not exist */
>       tmp = &xgmi_hives[hive_count++];
>       tmp->hive_id = adev->gmc.xgmi.hive_id;
>       INIT_LIST_HEAD(&tmp->device_list);
>       mutex_init(&tmp->hive_lock);
> +     if (lock)
> +             mutex_lock(&tmp->hive_lock);
> +
> +     mutex_unlock(&xgmi_mutex);
>   
>       return tmp;
>   }
> @@ -111,8 +124,8 @@ int amdgpu_xgmi_add_device(struct amdgpu_device *adev)
>               return ret;
>       }
>   
> -     mutex_lock(&xgmi_mutex);
> -     hive = amdgpu_get_xgmi_hive(adev);
> +     /* find hive and take lock */
> +     hive = amdgpu_get_xgmi_hive(adev, 1);
>       if (!hive)
>               goto exit;
>   
> @@ -142,8 +155,8 @@ int amdgpu_xgmi_add_device(struct amdgpu_device *adev)
>                       break;
>       }
>   
> +     mutex_unlock(&hive->hive_lock);
>   exit:
> -     mutex_unlock(&xgmi_mutex);
>       return ret;
>   }
>   
> @@ -154,15 +167,12 @@ void amdgpu_xgmi_remove_device(struct amdgpu_device 
> *adev)
>       if (!adev->gmc.xgmi.supported)
>               return;
>   
> -     mutex_lock(&xgmi_mutex);
> -
> -     hive = amdgpu_get_xgmi_hive(adev);
> +     hive = amdgpu_get_xgmi_hive(adev, 1);
>       if (!hive)
> -             goto exit;
> +             return;
>   
>       if (!(hive->number_devices--))
>               mutex_destroy(&hive->hive_lock);
> -
> -exit:
> -     mutex_unlock(&xgmi_mutex);
> +     else
> +             mutex_unlock(&hive->hive_lock);
>   }
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.h 
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.h
> index 6151eb9c8ad3..8d7984844174 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.h
> @@ -32,7 +32,7 @@ struct amdgpu_hive_info {
>       struct mutex hive_lock;
>   };
>   
> -struct amdgpu_hive_info *amdgpu_get_xgmi_hive(struct amdgpu_device *adev);
> +struct amdgpu_hive_info *amdgpu_get_xgmi_hive(struct amdgpu_device
> +*adev, int lock);
>   int amdgpu_xgmi_update_topology(struct amdgpu_hive_info *hive, struct 
> amdgpu_device *adev);  int amdgpu_xgmi_add_device(struct amdgpu_device 
> *adev);  void amdgpu_xgmi_remove_device(struct amdgpu_device *adev);
> --
> 2.17.2
>
> _______________________________________________
> amd-gfx mailing list
> [email protected]
> https://lists.freedesktop.org/mailman/listinfo/amd-gfx
> _______________________________________________
> amd-gfx mailing list
> [email protected]
> https://lists.freedesktop.org/mailman/listinfo/amd-gfx

_______________________________________________
amd-gfx mailing list
[email protected]
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

Reply via email to