Re: [PATCH] add missing mutex lock to amdgpu_get_xgmi_hive() (v3)

2019-01-07 Thread Grodzovsky, Andrey
Reviewed-by: Andrey Grodzovsky 

Andrey


On 01/07/2019 12:31 PM, StDenis, Tom wrote:
> v2: Move locks around in other functions so that this
> function can stand on its own.  Also only hold the hive
> specific lock for add/remove device instead of the driver
> global lock so you can't add/remove devices in parallel from
> one hive.
>
> v3: add reset_lock
>
> Signed-off-by: Tom St Denis 
> ---
>   drivers/gpu/drm/amd/amdgpu/amdgpu_device.c |  6 ++--
>   drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c   | 41 ++
>   drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.h   |  5 +--
>   3 files changed, 33 insertions(+), 19 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> index 39d5d058b2c7..1a558dc41ba6 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> @@ -3525,9 +3525,9 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
>    * by different nodes. No point also since the one node already executing
>    * reset will also reset all the other nodes in the hive.
>    */
> - hive = amdgpu_get_xgmi_hive(adev);
> + hive = amdgpu_get_xgmi_hive(adev, 0);
>   if (hive && adev->gmc.xgmi.num_physical_nodes > 1 &&
> - !mutex_trylock(&hive->hive_lock))
> + !mutex_trylock(&hive->reset_lock))
>   return 0;
>   
>   /* Start with adev pre asic reset first for soft reset check.*/
> @@ -3606,7 +3606,7 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
>   }
>   
>   if (hive && adev->gmc.xgmi.num_physical_nodes > 1)
> - mutex_unlock(&hive->hive_lock);
> + mutex_unlock(&hive->reset_lock);
>   
>   if (r)
>   dev_info(adev->dev, "GPU reset end with ret = %d\n", r);
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c
> index 8a8bc60cb6b4..9e98ab8b1525 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c
> @@ -40,26 +40,40 @@ void *amdgpu_xgmi_hive_try_lock(struct amdgpu_hive_info *hive)
>   return &hive->device_list;
>   }
>   
> -struct amdgpu_hive_info *amdgpu_get_xgmi_hive(struct amdgpu_device *adev)
> +struct amdgpu_hive_info *amdgpu_get_xgmi_hive(struct amdgpu_device *adev, int lock)
>   {
>   int i;
>   struct amdgpu_hive_info *tmp;
>   
>   if (!adev->gmc.xgmi.hive_id)
>   return NULL;
> +
> + mutex_lock(&xgmi_mutex);
> +
>   for (i = 0 ; i < hive_count; ++i) {
>   tmp = &xgmi_hives[i];
> - if (tmp->hive_id == adev->gmc.xgmi.hive_id)
> + if (tmp->hive_id == adev->gmc.xgmi.hive_id) {
> + if (lock)
> + mutex_lock(&tmp->hive_lock);
> + mutex_unlock(&xgmi_mutex);
>   return tmp;
> + }
>   }
> - if (i >= AMDGPU_MAX_XGMI_HIVE)
> + if (i >= AMDGPU_MAX_XGMI_HIVE) {
> + mutex_unlock(&xgmi_mutex);
>   return NULL;
> + }
>   
>   /* initialize new hive if not exist */
>   tmp = &xgmi_hives[hive_count++];
>   tmp->hive_id = adev->gmc.xgmi.hive_id;
>   INIT_LIST_HEAD(&tmp->device_list);
>   mutex_init(&tmp->hive_lock);
> + mutex_init(&tmp->reset_lock);
> + if (lock)
> + mutex_lock(&tmp->hive_lock);
> +
> + mutex_unlock(&xgmi_mutex);
>   
>   return tmp;
>   }
> @@ -111,8 +125,8 @@ int amdgpu_xgmi_add_device(struct amdgpu_device *adev)
>   return ret;
>   }
>   
> - mutex_lock(&xgmi_mutex);
> - hive = amdgpu_get_xgmi_hive(adev);
> + /* find hive and take lock */
> + hive = amdgpu_get_xgmi_hive(adev, 1);
>   if (!hive)
>   goto exit;
>   
> @@ -142,8 +156,8 @@ int amdgpu_xgmi_add_device(struct amdgpu_device *adev)
>   break;
>   }
>   
> + mutex_unlock(&hive->hive_lock);
>   exit:
> - mutex_unlock(&xgmi_mutex);
>   return ret;
>   }
>   
> @@ -154,15 +168,14 @@ void amdgpu_xgmi_remove_device(struct amdgpu_device *adev)
>   if (!adev->gmc.xgmi.supported)
>   return;
>   
> - mutex_lock(&xgmi_mutex);
> -
> - hive = amdgpu_get_xgmi_hive(adev);
> + hive = amdgpu_get_xgmi_hive(adev, 1);
>   if (!hive)
> - goto exit;
> + return;
>   
> - if (!(hive->number_devices--))
> + if (!(hive->number_devices--)) {
>   mutex_destroy(&hive->hive_lock);
> -
> -exit:
> - mutex_unlock(&xgmi_mutex);
> + mutex_destroy(&hive->reset_lock);
> + } else {
> + mutex_unlock(&hive->hive_lock);
> + }
>   }
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.h
> index 6151eb9c8ad3..14bc60664159 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.h
> @@ -29,10 +29,11 @@ struct amdgpu_hive_info {
>   struct list_head        device_list;
>   struct psp_xgmi_topology_info   topology_info;
>   int number_devices;
> - struct mutex hive_lock;
> + struct mutex hive_lock,
> +              reset_lock;
>   };
>   
> -struct amdgpu_hive_info *amdgpu_get_xgmi_hive(struct amdgpu_device *adev);
> +struct amdgpu_hive_info *amdgpu_get_xgmi_hive(struct amdgpu_device *adev, int lock);
>   int amdgpu_xgmi_update_topology(struct amdgpu_hive_info *hive, struct amdgpu_device *adev);
>   int amdgpu_xgmi_add_device(struct amdgpu_device *adev);
>   void amdgpu_xgmi_remove_device(struct amdgpu_device *adev);

[PATCH] add missing mutex lock to amdgpu_get_xgmi_hive() (v3)

2019-01-07 Thread StDenis, Tom
v2: Move locks around in other functions so that this
function can stand on its own.  Also only hold the hive
specific lock for add/remove device instead of the driver
global lock so you can't add/remove devices in parallel from
one hive.

v3: add reset_lock

Signed-off-by: Tom St Denis 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_device.c |  6 ++--
 drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c   | 41 ++
 drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.h   |  5 +--
 3 files changed, 33 insertions(+), 19 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
index 39d5d058b2c7..1a558dc41ba6 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -3525,9 +3525,9 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
 * by different nodes. No point also since the one node already executing
 * reset will also reset all the other nodes in the hive.
 */
-   hive = amdgpu_get_xgmi_hive(adev);
+   hive = amdgpu_get_xgmi_hive(adev, 0);
if (hive && adev->gmc.xgmi.num_physical_nodes > 1 &&
-   !mutex_trylock(&hive->hive_lock))
+   !mutex_trylock(&hive->reset_lock))
return 0;
 
/* Start with adev pre asic reset first for soft reset check.*/
@@ -3606,7 +3606,7 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
}
 
if (hive && adev->gmc.xgmi.num_physical_nodes > 1)
-   mutex_unlock(&hive->hive_lock);
+   mutex_unlock(&hive->reset_lock);
 
if (r)
dev_info(adev->dev, "GPU reset end with ret = %d\n", r);
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c
index 8a8bc60cb6b4..9e98ab8b1525 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c
@@ -40,26 +40,40 @@ void *amdgpu_xgmi_hive_try_lock(struct amdgpu_hive_info *hive)
return &hive->device_list;
 }
 
-struct amdgpu_hive_info *amdgpu_get_xgmi_hive(struct amdgpu_device *adev)
+struct amdgpu_hive_info *amdgpu_get_xgmi_hive(struct amdgpu_device *adev, int lock)
 {
int i;
struct amdgpu_hive_info *tmp;
 
if (!adev->gmc.xgmi.hive_id)
return NULL;
+
+   mutex_lock(&xgmi_mutex);
+
for (i = 0 ; i < hive_count; ++i) {
tmp = &xgmi_hives[i];
-   if (tmp->hive_id == adev->gmc.xgmi.hive_id)
+   if (tmp->hive_id == adev->gmc.xgmi.hive_id) {
+   if (lock)
+   mutex_lock(&tmp->hive_lock);
+   mutex_unlock(&xgmi_mutex);
return tmp;
+   }
}
-   if (i >= AMDGPU_MAX_XGMI_HIVE)
+   if (i >= AMDGPU_MAX_XGMI_HIVE) {
+   mutex_unlock(&xgmi_mutex);
return NULL;
+   }
 
/* initialize new hive if not exist */
tmp = &xgmi_hives[hive_count++];
tmp->hive_id = adev->gmc.xgmi.hive_id;
INIT_LIST_HEAD(&tmp->device_list);
mutex_init(&tmp->hive_lock);
+   mutex_init(&tmp->reset_lock);
+   if (lock)
+   mutex_lock(&tmp->hive_lock);
+
+   mutex_unlock(&xgmi_mutex);
 
return tmp;
 }
@@ -111,8 +125,8 @@ int amdgpu_xgmi_add_device(struct amdgpu_device *adev)
return ret;
}
 
-   mutex_lock(&xgmi_mutex);
-   hive = amdgpu_get_xgmi_hive(adev);
+   /* find hive and take lock */
+   hive = amdgpu_get_xgmi_hive(adev, 1);
if (!hive)
goto exit;
 
@@ -142,8 +156,8 @@ int amdgpu_xgmi_add_device(struct amdgpu_device *adev)
break;
}
 
+   mutex_unlock(&hive->hive_lock);
 exit:
-   mutex_unlock(&xgmi_mutex);
return ret;
 }
 
@@ -154,15 +168,14 @@ void amdgpu_xgmi_remove_device(struct amdgpu_device *adev)
if (!adev->gmc.xgmi.supported)
return;
 
-   mutex_lock(&xgmi_mutex);
-
-   hive = amdgpu_get_xgmi_hive(adev);
+   hive = amdgpu_get_xgmi_hive(adev, 1);
if (!hive)
-   goto exit;
+   return;
 
-   if (!(hive->number_devices--))
+   if (!(hive->number_devices--)) {
mutex_destroy(&hive->hive_lock);
-
-exit:
-   mutex_unlock(&xgmi_mutex);
+   mutex_destroy(&hive->reset_lock);
+   } else {
+   mutex_unlock(&hive->hive_lock);
+   }
 }
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.h
index 6151eb9c8ad3..14bc60664159 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.h
@@ -29,10 +29,11 @@ struct amdgpu_hive_info {
struct list_head        device_list;
struct psp_xgmi_topology_info   topology_info;
int number_devices;
-   struct mutex hive_lock;
+   struct mutex hive_lock,
+reset_lock;
 };
 
-struct amdgpu_hive_info *amdgpu_get_xgmi_hive(struct amdgpu_device *adev);
+struct amdgpu_hive_info *amdgpu_get_xgmi_hive(struct amdgpu_device *adev, int lock);
 int amdgpu_xgmi_update_topology(struct amdgpu_hive_info *hive, struct amdgpu_device *adev);
 int amdgpu_xgmi_add_device(struct amdgpu_device *adev);
 void amdgpu_xgmi_remove_device(struct amdgpu_device *adev);
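
(A minimal caller-side sketch of the locking protocol this patch establishes -
illustrative only, not part of the patch; amdgpu_example_caller() is a
hypothetical name:)

	/* amdgpu_get_xgmi_hive(adev, 1) returns with hive->hive_lock held,
	 * so every such caller must release it on all paths out. */
	static int amdgpu_example_caller(struct amdgpu_device *adev)
	{
		struct amdgpu_hive_info *hive;

		hive = amdgpu_get_xgmi_hive(adev, 1);	/* returns locked */
		if (!hive)
			return -ENODEV;

		/* ... manipulate hive->device_list safely here ... */

		mutex_unlock(&hive->hive_lock);		/* caller releases */
		return 0;
	}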

Re: [PATCH] add missing mutex lock to amdgpu_get_xgmi_hive() (v2)

2019-01-07 Thread Grodzovsky, Andrey


On 01/07/2019 12:03 PM, StDenis, Tom wrote:
> On 2019-01-07 12:00 p.m., Grodzovsky, Andrey wrote:
>>
>> On 01/07/2019 11:53 AM, StDenis, Tom wrote:
>>> On 2019-01-07 11:51 a.m., Grodzovsky, Andrey wrote:
>>>> On 01/07/2019 11:36 AM, StDenis, Tom wrote:
>>>>> On 2019-01-07 11:33 a.m., Grodzovsky, Andrey wrote:
>>>>>> On 01/07/2019 11:16 AM, Liu, Shaoyun wrote:
>>>>>>> I think it's reasonable to use the hive specific lock for hive
>>>>>>> specific functions.
>>>>>>> The changes are acked-by Shaoyun.liu < shaoyun@amd.com>
>>>>>>>
>>>>>>> -Original Message-
>>>>>>> From: amd-gfx  On Behalf Of 
>>>>>>> StDenis, Tom
>>>>>>> Sent: Monday, January 7, 2019 10:16 AM
>>>>>>> To: amd-gfx@lists.freedesktop.org
>>>>>>> Cc: StDenis, Tom 
>>>>>>> Subject: [PATCH] add missing mutex lock to amdgpu_get_xgmi_hive() (v2)
>>>>>>>
>>>>>>> v2: Move locks around in other functions so that this function can 
>>>>>>> stand on its own.  Also only hold the hive specific lock for add/remove 
>>>>>>> device instead of the driver global lock so you can't add/remove 
>>>>>>> devices in parallel from one hive.
>>>>>>>
>>>>>>> Signed-off-by: Tom St Denis 
>>>>>>> ---
>>>>>>> drivers/gpu/drm/amd/amdgpu/amdgpu_device.c |  2 +-
>>>>>>> drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c   | 36 
>>>>>>> ++
>>>>>>> drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.h   |  2 +-
>>>>>>> 3 files changed, 25 insertions(+), 15 deletions(-)
>>>>>>>
>>>>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c 
>>>>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>>>>>>> index 39d5d058b2c7..13d8e2ad2f7a 100644
>>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>>>>>>> @@ -3525,7 +3525,7 @@ int amdgpu_device_gpu_recover(struct 
>>>>>>> amdgpu_device *adev,
>>>>>>>  * by different nodes. No point also since the one node 
>>>>>>> already executing
>>>>>>>  * reset will also reset all the other nodes in the 
>>>>>>> hive.
>>>>>>>  */
>>>>>>> -   hive = amdgpu_get_xgmi_hive(adev);
>>>>>>> +   hive = amdgpu_get_xgmi_hive(adev, 0);
>>>>>>> if (hive && adev->gmc.xgmi.num_physical_nodes > 1 &&
>>>>>>> !mutex_trylock(&hive->hive_lock))
>>>>>>> return 0;
>>>>>> Let's say I have device 0 in hive A and it just got a gpu reset, and at
>>>>>> the same time device 1 is being added to the same hive through
>>>>>> amdgpu_xgmi_add_device; hive->hive_lock is acquired by this new device
>>>>>> being added, and so gpu reset for device 0 bails out on
>>>>>> '!mutex_trylock(&hive->hive_lock))' without completing the reset.
>>>>>> Also in general I feel a bit uncomfortable about the confusing
>>>>>> acquisition scheme in the function and the fact that you take the
>>>>>> hive->hive_lock inside amdgpu_get_xgmi_hive but release is still outside
>>>>>> of the function.
>>>>> Is adding a device while resetting a device even a valid operation
>>>>> anyways?
>>>> In theory it's valid if you have hot pluggable devices
>>>>> I think this means more so that the reset logic is broken.  Instead
>>>>> there should be a per-hive reset lock that is taken and that is tested
>>>>> instead.
>>>>>
>>>>> Tom
>>>> The hive->hive_lock was added exactly for this purpose and used only for
>>>> that purpose. Maybe the naming I gave it wasn't reflective of its
>>>> purpose :)
>>> But the add/remove should use per-hive locks not the global lock... :-)
>>>
>>> (I'm honestly not trying to bike shed I just thought the get_hive
>>> function looked wrong :-)).
>>>
>>> Tom
>> Totally agree with you. If Shaoyun (who originally added the global
>> xgmi_mutex) is fine with switching to a per-hive mutex then me too; I just
>> pointed out the problem with gpu reset, and as you said we then need to
>> rename the existing hive_lock into reset_lock and add another per-hive
>> lock to do what you propose. Also - is there a way to not take the
>> hive lock inside amdgpu_get_xgmi_hive but release it outside? AFAIK
>> it's an opening for problems where people use it but forget to call
>> release.
> I wanted to take the per-hive lock inside get_hive because it also takes
> the global lock so that add/remove couldn't happen in parallel.
>
> For instance, deleting the last node while adding a new node means the
> per-hive mutex could be in limbo (because remove will delete the lock).
>
> Adding a per-hive reset lock would fix the remaining issues no?
>
> Tom

Looks like it.

Andrey


> ___
> amd-gfx mailing list
> amd-gfx@lists.freedesktop.org
> https://lists.freedesktop.org/mailman/listinfo/amd-gfx

___
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx


Re: [PATCH] add missing mutex lock to amdgpu_get_xgmi_hive() (v2)

2019-01-07 Thread StDenis, Tom
On 2019-01-07 12:00 p.m., Grodzovsky, Andrey wrote:
> 
> 
> On 01/07/2019 11:53 AM, StDenis, Tom wrote:
>> On 2019-01-07 11:51 a.m., Grodzovsky, Andrey wrote:
>>>
>>> On 01/07/2019 11:36 AM, StDenis, Tom wrote:
>>>> On 2019-01-07 11:33 a.m., Grodzovsky, Andrey wrote:
>>>>> On 01/07/2019 11:16 AM, Liu, Shaoyun wrote:
>>>>>> I think it's reasonable to use the hive specific lock for hive
>>>>>> specific functions.
>>>>>> The changes are acked-by Shaoyun.liu < shaoyun@amd.com>
>>>>>>
>>>>>> -Original Message-
>>>>>> From: amd-gfx  On Behalf Of 
>>>>>> StDenis, Tom
>>>>>> Sent: Monday, January 7, 2019 10:16 AM
>>>>>> To: amd-gfx@lists.freedesktop.org
>>>>>> Cc: StDenis, Tom 
>>>>>> Subject: [PATCH] add missing mutex lock to amdgpu_get_xgmi_hive() (v2)
>>>>>>
>>>>>> v2: Move locks around in other functions so that this function can stand 
>>>>>> on its own.  Also only hold the hive specific lock for add/remove device 
>>>>>> instead of the driver global lock so you can't add/remove devices in 
>>>>>> parallel from one hive.
>>>>>>
>>>>>> Signed-off-by: Tom St Denis 
>>>>>> ---
>>>>>>drivers/gpu/drm/amd/amdgpu/amdgpu_device.c |  2 +-
>>>>>>drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c   | 36 
>>>>>> ++
>>>>>>drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.h   |  2 +-
>>>>>>3 files changed, 25 insertions(+), 15 deletions(-)
>>>>>>
>>>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c 
>>>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>>>>>> index 39d5d058b2c7..13d8e2ad2f7a 100644
>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>>>>>> @@ -3525,7 +3525,7 @@ int amdgpu_device_gpu_recover(struct amdgpu_device 
>>>>>> *adev,
>>>>>>   * by different nodes. No point also since the one node already 
>>>>>> executing
>>>>>>   * reset will also reset all the other nodes in the hive.
>>>>>>   */
>>>>>> -hive = amdgpu_get_xgmi_hive(adev);
>>>>>> +hive = amdgpu_get_xgmi_hive(adev, 0);
>>>>>>  if (hive && adev->gmc.xgmi.num_physical_nodes > 1 &&
>>>>>>  !mutex_trylock(&hive->hive_lock))
>>>>>>  return 0;
>>>>> Let's say I have device 0 in hive A and it just got a gpu reset, and at
>>>>> the same time device 1 is being added to the same hive through
>>>>> amdgpu_xgmi_add_device; hive->hive_lock is acquired by this new device
>>>>> being added, and so gpu reset for device 0 bails out on
>>>>> '!mutex_trylock(&hive->hive_lock))' without completing the reset.
>>>>> Also in general I feel a bit uncomfortable about the confusing
>>>>> acquisition scheme in the function and the fact that you take the
>>>>> hive->hive_lock inside amdgpu_get_xgmi_hive but release is still outside
>>>>> of the function.
>>>> Is adding a device while resetting a device even a valid operation
>>>> anyways?
>>> In theory it's valid if you have hot pluggable devices
>>>> I think this means more so that the reset logic is broken.  Instead
>>>> there should be a per-hive reset lock that is taken and that is tested
>>>> instead.
>>>>
>>>> Tom
>>> The hive->hive_lock was added exactly for this purpose and used only for
>>> that purpose. Maybe the naming I gave it wasn't reflective of its
>>> purpose :)
>>
>> But the add/remove should use per-hive locks not the global lock... :-)
>>
>> (I'm honestly not trying to bike shed I just thought the get_hive
>> function looked wrong :-)).
>>
>> Tom
> 
> Totally agree with you. If Shaoyun (who originally added the global
> xgmi_mutex) is fine with switching to a per-hive mutex then me too; I just
> pointed out the problem with gpu reset, and as you said we then need to
> rename the existing hive_lock into reset_lock and add another per-hive
> lock to do what you propose. Also - is there a way to not take the
> hive lock inside amdgpu_get_xgmi_hive but release it outside? AFAIK
> it's an opening for problems where people use it but forget to call
> release.

I wanted to take the per-hive lock inside get_hive because it also takes 
the global lock so that add/remove couldn't happen in parallel.

For instance, deleting the last node while adding a new node means the 
per-hive mutex could be in limbo (because remove will delete the lock).

Adding a per-hive reset lock would fix the remaining issues no?
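
(A hypothetical interleaving of that "limbo" case, for illustration only -
this is what taking the per-hive lock outside the global xgmi_mutex would
allow:)

	/* CPU 0: remove last device       CPU 1: add a device
	 *
	 * hive = amdgpu_get_xgmi_hive()   hive = amdgpu_get_xgmi_hive()
	 * number_devices-- reaches 0
	 * mutex_destroy(&hive->hive_lock)
	 *                                 mutex_lock(&hive->hive_lock)
	 *                                 ^ locks a destroyed mutex
	 *
	 * Taking hive_lock inside get_hive, under xgmi_mutex, serializes
	 * the two lookups and closes this window.
	 */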

Tom
___
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx


Re: [PATCH] add missing mutex lock to amdgpu_get_xgmi_hive() (v2)

2019-01-07 Thread Grodzovsky, Andrey


On 01/07/2019 11:53 AM, StDenis, Tom wrote:
> On 2019-01-07 11:51 a.m., Grodzovsky, Andrey wrote:
>>
>> On 01/07/2019 11:36 AM, StDenis, Tom wrote:
>>> On 2019-01-07 11:33 a.m., Grodzovsky, Andrey wrote:
>>>> On 01/07/2019 11:16 AM, Liu, Shaoyun wrote:
>>>>> I think it's reasonable to use the hive specific lock for hive specific
>>>>> functions.
>>>>> The changes are acked-by Shaoyun.liu < shaoyun@amd.com>
>>>>>
>>>>> -Original Message-
>>>>> From: amd-gfx  On Behalf Of 
>>>>> StDenis, Tom
>>>>> Sent: Monday, January 7, 2019 10:16 AM
>>>>> To: amd-gfx@lists.freedesktop.org
>>>>> Cc: StDenis, Tom 
>>>>> Subject: [PATCH] add missing mutex lock to amdgpu_get_xgmi_hive() (v2)
>>>>>
>>>>> v2: Move locks around in other functions so that this function can stand 
>>>>> on its own.  Also only hold the hive specific lock for add/remove device 
>>>>> instead of the driver global lock so you can't add/remove devices in 
>>>>> parallel from one hive.
>>>>>
>>>>> Signed-off-by: Tom St Denis 
>>>>> ---
>>>>>   drivers/gpu/drm/amd/amdgpu/amdgpu_device.c |  2 +-
>>>>>   drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c   | 36 
>>>>> ++
>>>>>   drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.h   |  2 +-
>>>>>   3 files changed, 25 insertions(+), 15 deletions(-)
>>>>>
>>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c 
>>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>>>>> index 39d5d058b2c7..13d8e2ad2f7a 100644
>>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>>>>> @@ -3525,7 +3525,7 @@ int amdgpu_device_gpu_recover(struct amdgpu_device 
>>>>> *adev,
>>>>>* by different nodes. No point also since the one node already 
>>>>> executing
>>>>>* reset will also reset all the other nodes in the hive.
>>>>>*/
>>>>> - hive = amdgpu_get_xgmi_hive(adev);
>>>>> + hive = amdgpu_get_xgmi_hive(adev, 0);
>>>>>   if (hive && adev->gmc.xgmi.num_physical_nodes > 1 &&
>>>>>  !mutex_trylock(&hive->hive_lock))
>>>>>   return 0;
>>>> Let's say I have device 0 in hive A and it just got a gpu reset, and at
>>>> the same time device 1 is being added to the same hive through
>>>> amdgpu_xgmi_add_device; hive->hive_lock is acquired by this new device
>>>> being added, and so gpu reset for device 0 bails out on
>>>> '!mutex_trylock(&hive->hive_lock))' without completing the reset.
>>>> Also in general I feel a bit uncomfortable about the confusing
>>>> acquisition scheme in the function and the fact that you take the
>>>> hive->hive_lock inside amdgpu_get_xgmi_hive but release is still outside
>>>> of the function.
>>> Is adding a device while resetting a device even a valid operation
>>> anyways?
>> In theory it's valid if you have hot pluggable devices
>>> I think this means more so that the reset logic is broken.  Instead
>>> there should be a per-hive reset lock that is taken and that is tested
>>> instead.
>>>
>>> Tom
>> The hive->hive_lock was added exactly for this purpose and used only for
>> that purpose. Maybe the naming I gave it wasn't reflective of its
>> purpose :)
>
> But the add/remove should use per-hive locks not the global lock... :-)
>
> (I'm honestly not trying to bike shed I just thought the get_hive
> function looked wrong :-)).
>
> Tom

Totally agree with you. If Shaoyun (who originally added the global
xgmi_mutex) is fine with switching to a per-hive mutex then me too; I just
pointed out the problem with gpu reset, and as you said we then need to
rename the existing hive_lock into reset_lock and add another per-hive
lock to do what you propose. Also - is there a way to not take the
hive lock inside amdgpu_get_xgmi_hive but release it outside? AFAIK
it's an opening for problems where people use it but forget to call
release.

Andrey

> ___
> amd-gfx mailing list
> amd-gfx@lists.freedesktop.org
> https://lists.freedesktop.org/mailman/listinfo/amd-gfx

___
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx


Re: [PATCH] add missing mutex lock to amdgpu_get_xgmi_hive() (v2)

2019-01-07 Thread StDenis, Tom
On 2019-01-07 11:51 a.m., Grodzovsky, Andrey wrote:
> 
> 
> On 01/07/2019 11:36 AM, StDenis, Tom wrote:
>> On 2019-01-07 11:33 a.m., Grodzovsky, Andrey wrote:
>>>
>>> On 01/07/2019 11:16 AM, Liu, Shaoyun wrote:
>>>> I think it's reasonable to use the hive specific lock for hive specific
>>>> functions.
>>>> The changes are acked-by Shaoyun.liu < shaoyun@amd.com>
>>>>
>>>> -Original Message-
>>>> From: amd-gfx  On Behalf Of 
>>>> StDenis, Tom
>>>> Sent: Monday, January 7, 2019 10:16 AM
>>>> To: amd-gfx@lists.freedesktop.org
>>>> Cc: StDenis, Tom 
>>>> Subject: [PATCH] add missing mutex lock to amdgpu_get_xgmi_hive() (v2)
>>>>
>>>> v2: Move locks around in other functions so that this function can stand 
>>>> on its own.  Also only hold the hive specific lock for add/remove device 
>>>> instead of the driver global lock so you can't add/remove devices in 
>>>> parallel from one hive.
>>>>
>>>> Signed-off-by: Tom St Denis 
>>>> ---
>>>>  drivers/gpu/drm/amd/amdgpu/amdgpu_device.c |  2 +-
>>>>  drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c   | 36 ++
>>>>  drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.h   |  2 +-
>>>>  3 files changed, 25 insertions(+), 15 deletions(-)
>>>>
>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c 
>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>>>> index 39d5d058b2c7..13d8e2ad2f7a 100644
>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>>>> @@ -3525,7 +3525,7 @@ int amdgpu_device_gpu_recover(struct amdgpu_device 
>>>> *adev,
>>>> * by different nodes. No point also since the one node already 
>>>> executing
>>>> * reset will also reset all the other nodes in the hive.
>>>> */
>>>> -  hive = amdgpu_get_xgmi_hive(adev);
>>>> +  hive = amdgpu_get_xgmi_hive(adev, 0);
>>>>if (hive && adev->gmc.xgmi.num_physical_nodes > 1 &&
>>>>!mutex_trylock(&hive->hive_lock))
>>>>return 0;
>>> Let's say I have device 0 in hive A and it just got a gpu reset, and at
>>> the same time device 1 is being added to the same hive through
>>> amdgpu_xgmi_add_device; hive->hive_lock is acquired by this new device
>>> being added, and so gpu reset for device 0 bails out on
>>> '!mutex_trylock(&hive->hive_lock))' without completing the reset.
>>> Also in general I feel a bit uncomfortable about the confusing
>>> acquisition scheme in the function and the fact that you take the
>>> hive->hive_lock inside amdgpu_get_xgmi_hive but release is still outside
>>> of the function.
>> Is adding a device while resetting a device even a valid operation
>> anyways?
> 
> In theory it's valid if you have hot pluggable devices
>>
>> I think this means more so that the reset logic is broken.  Instead
>> there should be a per-hive reset lock that is taken and that is tested
>> instead.
>>
>> Tom
> 
> The hive->hive_lock was added exactly for this purpose and used only for
> that purpose. Maybe the naming I gave it wasn't reflective of its
> purpose :)


But the add/remove should use per-hive locks not the global lock... :-)

(I'm honestly not trying to bike shed I just thought the get_hive 
function looked wrong :-)).

Tom
___
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx


Re: [PATCH] add missing mutex lock to amdgpu_get_xgmi_hive() (v2)

2019-01-07 Thread Grodzovsky, Andrey


On 01/07/2019 11:36 AM, StDenis, Tom wrote:
> On 2019-01-07 11:33 a.m., Grodzovsky, Andrey wrote:
>>
>> On 01/07/2019 11:16 AM, Liu, Shaoyun wrote:
>>> I think it's reasonable to use the hive specific lock for hive specific
>>> functions.
>>> The changes are acked-by Shaoyun.liu < shaoyun@amd.com>
>>>
>>> -Original Message-
>>> From: amd-gfx  On Behalf Of StDenis, 
>>> Tom
>>> Sent: Monday, January 7, 2019 10:16 AM
>>> To: amd-gfx@lists.freedesktop.org
>>> Cc: StDenis, Tom 
>>> Subject: [PATCH] add missing mutex lock to amdgpu_get_xgmi_hive() (v2)
>>>
>>> v2: Move locks around in other functions so that this function can stand on 
>>> its own.  Also only hold the hive specific lock for add/remove device 
>>> instead of the driver global lock so you can't add/remove devices in 
>>> parallel from one hive.
>>>
>>> Signed-off-by: Tom St Denis 
>>> ---
>>> drivers/gpu/drm/amd/amdgpu/amdgpu_device.c |  2 +-
>>> drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c   | 36 ++
>>> drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.h   |  2 +-
>>> 3 files changed, 25 insertions(+), 15 deletions(-)
>>>
>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c 
>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>>> index 39d5d058b2c7..13d8e2ad2f7a 100644
>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>>> @@ -3525,7 +3525,7 @@ int amdgpu_device_gpu_recover(struct amdgpu_device 
>>> *adev,
>>>  * by different nodes. No point also since the one node already 
>>> executing
>>>  * reset will also reset all the other nodes in the hive.
>>>  */
>>> -   hive = amdgpu_get_xgmi_hive(adev);
>>> +   hive = amdgpu_get_xgmi_hive(adev, 0);
>>> if (hive && adev->gmc.xgmi.num_physical_nodes > 1 &&
>>> !mutex_trylock(&hive->hive_lock))
>>> return 0;
>> Let's say I have device 0 in hive A and it just got a gpu reset, and at
>> the same time device 1 is being added to the same hive through
>> amdgpu_xgmi_add_device; hive->hive_lock is acquired by this new device
>> being added, and so gpu reset for device 0 bails out on
>> '!mutex_trylock(&hive->hive_lock))' without completing the reset.
>> Also in general I feel a bit uncomfortable about the confusing
>> acquisition scheme in the function and the fact that you take the
>> hive->hive_lock inside amdgpu_get_xgmi_hive but release is still outside
>> of the function.
> Is adding a device while resetting a device even a valid operation
> anyways?

In theory it's valid if you have hot pluggable devices
>
> I think this means more so that the reset logic is broken.  Instead
> there should be a per-hive reset lock that is taken and that is tested
> instead.
>
> Tom

The hive->hive_lock was added exactly for this purpose and used only for 
that purpose. Maybe the naming I gave it wasn't reflective of its
purpose :)

Andrey

>
>
>> Andrey
>>
>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c
>>> index 8a8bc60cb6b4..ebf50809485f 100644
>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c
>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c
>>> @@ -40,26 +40,39 @@ void *amdgpu_xgmi_hive_try_lock(struct amdgpu_hive_info *hive)
>>> return &hive->device_list;
>>> }
>>> 
>>> -struct amdgpu_hive_info *amdgpu_get_xgmi_hive(struct amdgpu_device *adev)
>>> +struct amdgpu_hive_info *amdgpu_get_xgmi_hive(struct amdgpu_device *adev, int lock)
>>> {
>>> int i;
>>> struct amdgpu_hive_info *tmp;
>>> 
>>> if (!adev->gmc.xgmi.hive_id)
>>> return NULL;
>>> +
>>> +   mutex_lock(&xgmi_mutex);
>>> +
>>> for (i = 0 ; i < hive_count; ++i) {
>>> tmp = &xgmi_hives[i];
>>> -   if (tmp->hive_id == adev->gmc.xgmi.hive_id)
>>> +   if (tmp->hive_id == adev->gmc.xgmi.hive_id) {
>>> +   if (lock)
>>> +   mutex_lock(&tmp->hive_lock);
>>> +   mutex_unlock(&xgmi_mutex);
>>> return tmp;
>>> +   }
>>>   

Re: [PATCH] add missing mutex lock to amdgpu_get_xgmi_hive() (v2)

2019-01-07 Thread StDenis, Tom
On 2019-01-07 11:33 a.m., Grodzovsky, Andrey wrote:
> 
> 
> On 01/07/2019 11:16 AM, Liu, Shaoyun wrote:
>> I think it's reasonable to use the hive specific lock for hive specific
>> functions.
>> The changes are acked-by Shaoyun.liu < shaoyun@amd.com>
>>
>> -Original Message-
>> From: amd-gfx  On Behalf Of StDenis, 
>> Tom
>> Sent: Monday, January 7, 2019 10:16 AM
>> To: amd-gfx@lists.freedesktop.org
>> Cc: StDenis, Tom 
>> Subject: [PATCH] add missing mutex lock to amdgpu_get_xgmi_hive() (v2)
>>
>> v2: Move locks around in other functions so that this function can stand on 
>> its own.  Also only hold the hive specific lock for add/remove device 
>> instead of the driver global lock so you can't add/remove devices in 
>> parallel from one hive.
>>
>> Signed-off-by: Tom St Denis 
>> ---
>>drivers/gpu/drm/amd/amdgpu/amdgpu_device.c |  2 +-
>>drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c   | 36 ++
>>drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.h   |  2 +-
>>3 files changed, 25 insertions(+), 15 deletions(-)
>>
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c 
>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>> index 39d5d058b2c7..13d8e2ad2f7a 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>> @@ -3525,7 +3525,7 @@ int amdgpu_device_gpu_recover(struct amdgpu_device 
>> *adev,
>>   * by different nodes. No point also since the one node already 
>> executing
>>   * reset will also reset all the other nodes in the hive.
>>   */
>> -hive = amdgpu_get_xgmi_hive(adev);
>> +hive = amdgpu_get_xgmi_hive(adev, 0);
>>  if (hive && adev->gmc.xgmi.num_physical_nodes > 1 &&
>>  !mutex_trylock(&hive->hive_lock))
>>  return 0;
> 
> Let's say I have device 0 in hive A and it just got a gpu reset, and at
> the same time device 1 is being added to the same hive through
> amdgpu_xgmi_add_device; hive->hive_lock is acquired by this new device
> being added, and so gpu reset for device 0 bails out on
> '!mutex_trylock(&hive->hive_lock))' without completing the reset.
> Also in general I feel a bit uncomfortable about the confusing
> acquisition scheme in the function and the fact that you take the
> hive->hive_lock inside amdgpu_get_xgmi_hive but release is still outside
> of the function.

Is adding a device while resetting a device even a valid operation 
anyways?

I think this means more so that the reset logic is broken.  Instead 
there should be a per-hive reset lock that is taken and that is tested 
instead.
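
(A sketch of that proposal - roughly the shape v3 later takes, illustrative
only:)

	/* reset path: contend on a dedicated per-hive reset_lock, so a
	 * concurrent add/remove holding hive_lock can no longer abort
	 * a pending reset */
	hive = amdgpu_get_xgmi_hive(adev, 0);
	if (hive && adev->gmc.xgmi.num_physical_nodes > 1 &&
	    !mutex_trylock(&hive->reset_lock))
		return 0;	/* another node in the hive already resets */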

Tom


> 
> Andrey
> 
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c
>> index 8a8bc60cb6b4..ebf50809485f 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c
>> @@ -40,26 +40,39 @@ void *amdgpu_xgmi_hive_try_lock(struct amdgpu_hive_info *hive)
>>  return &hive->device_list;
>>}
>>
>> -struct amdgpu_hive_info *amdgpu_get_xgmi_hive(struct amdgpu_device *adev)
>> +struct amdgpu_hive_info *amdgpu_get_xgmi_hive(struct amdgpu_device *adev, int lock)
>>{
>>  int i;
>>  struct amdgpu_hive_info *tmp;
>>
>>  if (!adev->gmc.xgmi.hive_id)
>>  return NULL;
>> +
>> +mutex_lock(&xgmi_mutex);
>> +
>>  for (i = 0 ; i < hive_count; ++i) {
>>  tmp = &xgmi_hives[i];
>> -if (tmp->hive_id == adev->gmc.xgmi.hive_id)
>> +if (tmp->hive_id == adev->gmc.xgmi.hive_id) {
>> +if (lock)
>> +mutex_lock(&tmp->hive_lock);
>> +mutex_unlock(&xgmi_mutex);
>>  return tmp;
>> +}
>>  }
>> -if (i >= AMDGPU_MAX_XGMI_HIVE)
>> +if (i >= AMDGPU_MAX_XGMI_HIVE) {
>> +mutex_unlock(&xgmi_mutex);
>>  return NULL;
>> +}
>>
>>  /* initialize new hive if not exist */
>>  tmp = &xgmi_hives[hive_count++];
>>  tmp->hive_id = adev->gmc.xgmi.hive_id;
>>  INIT_LIST_HEAD(&tmp->device_list);
>>  mutex_init(&tmp->hive_lock);
>> +if (lock)
>> +mutex_lock(&tmp->hive_lock);
>> +
>> +mutex_unlock(&xgmi_mutex);
>>
>>  return tmp;
>>}
>> @@ -111,8 +124,8 @@ int amdgpu_xgmi_add_device(struct amdgpu_device *adev)
>>  

Re: [PATCH] add missing mutex lock to amdgpu_get_xgmi_hive() (v2)

2019-01-07 Thread Grodzovsky, Andrey


On 01/07/2019 11:16 AM, Liu, Shaoyun wrote:
> I think it's reasonable to use the hive specific lock for hive specific
> functions.
> The changes are acked-by Shaoyun.liu < shaoyun@amd.com>
>
> -Original Message-
> From: amd-gfx  On Behalf Of StDenis, 
> Tom
> Sent: Monday, January 7, 2019 10:16 AM
> To: amd-gfx@lists.freedesktop.org
> Cc: StDenis, Tom 
> Subject: [PATCH] add missing mutex lock to amdgpu_get_xgmi_hive() (v2)
>
> v2: Move locks around in other functions so that this function can stand on 
> its own.  Also only hold the hive specific lock for add/remove device instead 
> of the driver global lock so you can't add/remove devices in parallel from 
> one hive.
>
> Signed-off-by: Tom St Denis 
> ---
>   drivers/gpu/drm/amd/amdgpu/amdgpu_device.c |  2 +-
>   drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c   | 36 ++
>   drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.h   |  2 +-
>   3 files changed, 25 insertions(+), 15 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c 
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> index 39d5d058b2c7..13d8e2ad2f7a 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> @@ -3525,7 +3525,7 @@ int amdgpu_device_gpu_recover(struct amdgpu_device 
> *adev,
>* by different nodes. No point also since the one node already 
> executing
>* reset will also reset all the other nodes in the hive.
>*/
> - hive = amdgpu_get_xgmi_hive(adev);
> + hive = amdgpu_get_xgmi_hive(adev, 0);
>   if (hive && adev->gmc.xgmi.num_physical_nodes > 1 &&
>   !mutex_trylock(&hive->hive_lock))
>   return 0;

Let's say I have device 0 in hive A and it just got a gpu reset, and at
the same time device 1 is being added to the same hive through
amdgpu_xgmi_add_device; hive->hive_lock is acquired by this new device
being added, and so gpu reset for device 0 bails out on
'!mutex_trylock(&hive->hive_lock))' without completing the reset.
Also in general I feel a bit uncomfortable about the confusing
acquisition scheme in the function and the fact that you take the
hive->hive_lock inside amdgpu_get_xgmi_hive but release is still outside
of the function.
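
(A hypothetical timeline of that race, for illustration only:)

	/* device 1 (being added)           device 0 (gpu reset)
	 *
	 * amdgpu_xgmi_add_device()
	 *   mutex_lock(&hive->hive_lock)
	 *                                   amdgpu_device_gpu_recover()
	 *                                     mutex_trylock(&hive->hive_lock)
	 *                                     -> fails, returns 0, and the
	 *                                        reset is silently skipped
	 */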

Andrey

> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c
> index 8a8bc60cb6b4..ebf50809485f 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c
> @@ -40,26 +40,39 @@ void *amdgpu_xgmi_hive_try_lock(struct amdgpu_hive_info *hive)
>   return &hive->device_list;
>   }
>   
> -struct amdgpu_hive_info *amdgpu_get_xgmi_hive(struct amdgpu_device *adev)
> +struct amdgpu_hive_info *amdgpu_get_xgmi_hive(struct amdgpu_device *adev, int lock)
>   {
>   int i;
>   struct amdgpu_hive_info *tmp;
>   
>   if (!adev->gmc.xgmi.hive_id)
>   return NULL;
> +
> + mutex_lock(&xgmi_mutex);
> +
>   for (i = 0 ; i < hive_count; ++i) {
>   tmp = &xgmi_hives[i];
> - if (tmp->hive_id == adev->gmc.xgmi.hive_id)
> + if (tmp->hive_id == adev->gmc.xgmi.hive_id) {
> + if (lock)
> + mutex_lock(&tmp->hive_lock);
> + mutex_unlock(&xgmi_mutex);
>   return tmp;
> + }
>   }
> - if (i >= AMDGPU_MAX_XGMI_HIVE)
> + if (i >= AMDGPU_MAX_XGMI_HIVE) {
> + mutex_unlock(&xgmi_mutex);
>   return NULL;
> + }
>   
>   /* initialize new hive if not exist */
>   tmp = &xgmi_hives[hive_count++];
>   tmp->hive_id = adev->gmc.xgmi.hive_id;
>   INIT_LIST_HEAD(&tmp->device_list);
>   mutex_init(&tmp->hive_lock);
> + if (lock)
> + mutex_lock(&tmp->hive_lock);
> +
> + mutex_unlock(&xgmi_mutex);
>   
>   return tmp;
>   }
> @@ -111,8 +124,8 @@ int amdgpu_xgmi_add_device(struct amdgpu_device *adev)
>   return ret;
>   }
>   
> - mutex_lock(&xgmi_mutex);
> - hive = amdgpu_get_xgmi_hive(adev);
> + /* find hive and take lock */
> + hive = amdgpu_get_xgmi_hive(adev, 1);
>   if (!hive)
>   goto exit;
>   
> @@ -142,8 +155,8 @@ int amdgpu_xgmi_add_device(struct amdgpu_device *adev)
>   break;
>   }
>   
> + mutex_unlock(&hive->hive_lock);
>   exit:
> - mutex_unlock(&xgmi_mutex);
>   return ret;
>   }
>   
> @@ -154,15 +167,12 @@ void amdgpu_xgmi_remove_device(struct amdgpu_device *adev)
>   if (!adev->gmc.xgmi.supported)
>   return;
>   
> - mutex_lock(&xgmi_mutex);

RE: [PATCH] add missing mutex lock to amdgpu_get_xgmi_hive() (v2)

2019-01-07 Thread Liu, Shaoyun
I think it's reasonable to use the hive specific lock for hive specific
functions.
The changes are acked-by Shaoyun.liu < shaoyun@amd.com>

-Original Message-
From: amd-gfx  On Behalf Of StDenis, Tom
Sent: Monday, January 7, 2019 10:16 AM
To: amd-gfx@lists.freedesktop.org
Cc: StDenis, Tom 
Subject: [PATCH] add missing mutex lock to amdgpu_get_xgmi_hive() (v2)

v2: Move locks around in other functions so that this function can stand on its 
own.  Also only hold the hive specific lock for add/remove device instead of 
the driver global lock so you can't add/remove devices in parallel from one 
hive.

Signed-off-by: Tom St Denis 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_device.c |  2 +-
 drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c   | 36 ++
 drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.h   |  2 +-
 3 files changed, 25 insertions(+), 15 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
index 39d5d058b2c7..13d8e2ad2f7a 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -3525,7 +3525,7 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
 * by different nodes. No point also since the one node already executing
 * reset will also reset all the other nodes in the hive.
 */
-   hive = amdgpu_get_xgmi_hive(adev);
+   hive = amdgpu_get_xgmi_hive(adev, 0);
if (hive && adev->gmc.xgmi.num_physical_nodes > 1 &&
!mutex_trylock(&hive->hive_lock))
return 0;
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c
index 8a8bc60cb6b4..ebf50809485f 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c
@@ -40,26 +40,39 @@ void *amdgpu_xgmi_hive_try_lock(struct amdgpu_hive_info *hive)
return &hive->device_list;
 }
 
-struct amdgpu_hive_info *amdgpu_get_xgmi_hive(struct amdgpu_device *adev)
+struct amdgpu_hive_info *amdgpu_get_xgmi_hive(struct amdgpu_device *adev, int lock)
 {
int i;
struct amdgpu_hive_info *tmp;
 
if (!adev->gmc.xgmi.hive_id)
return NULL;
+
+   mutex_lock(&xgmi_mutex);
+
for (i = 0 ; i < hive_count; ++i) {
tmp = &xgmi_hives[i];
-   if (tmp->hive_id == adev->gmc.xgmi.hive_id)
+   if (tmp->hive_id == adev->gmc.xgmi.hive_id) {
+   if (lock)
+   mutex_lock(&tmp->hive_lock);
+   mutex_unlock(&xgmi_mutex);
return tmp;
+   }
}
-   if (i >= AMDGPU_MAX_XGMI_HIVE)
+   if (i >= AMDGPU_MAX_XGMI_HIVE) {
+   mutex_unlock(&xgmi_mutex);
return NULL;
+   }
 
/* initialize new hive if not exist */
tmp = &xgmi_hives[hive_count++];
tmp->hive_id = adev->gmc.xgmi.hive_id;
INIT_LIST_HEAD(&tmp->device_list);
mutex_init(&tmp->hive_lock);
+   if (lock)
+   mutex_lock(&tmp->hive_lock);
+
+   mutex_unlock(&xgmi_mutex);
 
return tmp;
 }
@@ -111,8 +124,8 @@ int amdgpu_xgmi_add_device(struct amdgpu_device *adev)
return ret;
}
 
-   mutex_lock(&xgmi_mutex);
-   hive = amdgpu_get_xgmi_hive(adev);
+   /* find hive and take lock */
+   hive = amdgpu_get_xgmi_hive(adev, 1);
if (!hive)
goto exit;
 
@@ -142,8 +155,8 @@ int amdgpu_xgmi_add_device(struct amdgpu_device *adev)
break;
}
 
+   mutex_unlock(&hive->hive_lock);
 exit:
-   mutex_unlock(&xgmi_mutex);
return ret;
 }
 
@@ -154,15 +167,12 @@ void amdgpu_xgmi_remove_device(struct amdgpu_device *adev)
if (!adev->gmc.xgmi.supported)
return;
 
-   mutex_lock(&xgmi_mutex);
-
-   hive = amdgpu_get_xgmi_hive(adev);
+   hive = amdgpu_get_xgmi_hive(adev, 1);
if (!hive)
-   goto exit;
+   return;
 
if (!(hive->number_devices--))
mutex_destroy(&hive->hive_lock);
-
-exit:
-   mutex_unlock(&xgmi_mutex);
+   else
+   mutex_unlock(&hive->hive_lock);
 }
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.h
index 6151eb9c8ad3..8d7984844174 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.h
@@ -32,7 +32,7 @@ struct amdgpu_hive_info {
struct mutex hive_lock;
 };
 
-struct amdgpu_hive_info *amdgpu_get_xgmi_hive(struct amdgpu_device *adev);
+struct amdgpu_hive_info *amdgpu_get_xgmi_hive(struct amdgpu_device *adev, int lock);
 int amdgpu_xgmi_update_topology(struct amdgpu_hive_info *hive, struct amdgpu_device *adev);
 int amdgpu_xgmi_add_device(struct amdgpu_device *adev);
 void amdgpu_xgmi_remove_device(struct amdgpu_device *adev);
--
2.17.2


[PATCH] add missing mutex lock to amdgpu_get_xgmi_hive() (v2)

2019-01-07 Thread StDenis, Tom
v2: Move locks around in other functions so that this
function can stand on its own.  Also only hold the hive
specific lock for add/remove device instead of the driver
global lock so you can't add/remove devices in parallel from
one hive.

Signed-off-by: Tom St Denis 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_device.c |  2 +-
 drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c   | 36 ++
 drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.h   |  2 +-
 3 files changed, 25 insertions(+), 15 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
index 39d5d058b2c7..13d8e2ad2f7a 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -3525,7 +3525,7 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
 * by different nodes. No point also since the one node already executing
 * reset will also reset all the other nodes in the hive.
 */
-   hive = amdgpu_get_xgmi_hive(adev);
+   hive = amdgpu_get_xgmi_hive(adev, 0);
if (hive && adev->gmc.xgmi.num_physical_nodes > 1 &&
!mutex_trylock(&hive->hive_lock))
return 0;
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c
index 8a8bc60cb6b4..ebf50809485f 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c
@@ -40,26 +40,39 @@ void *amdgpu_xgmi_hive_try_lock(struct amdgpu_hive_info *hive)
return &hive->device_list;
 }
 
-struct amdgpu_hive_info *amdgpu_get_xgmi_hive(struct amdgpu_device *adev)
+struct amdgpu_hive_info *amdgpu_get_xgmi_hive(struct amdgpu_device *adev, int lock)
 {
int i;
struct amdgpu_hive_info *tmp;
 
if (!adev->gmc.xgmi.hive_id)
return NULL;
+
+   mutex_lock(&xgmi_mutex);
+
for (i = 0 ; i < hive_count; ++i) {
tmp = &xgmi_hives[i];
-   if (tmp->hive_id == adev->gmc.xgmi.hive_id)
+   if (tmp->hive_id == adev->gmc.xgmi.hive_id) {
+   if (lock)
+   mutex_lock(&tmp->hive_lock);
+   mutex_unlock(&xgmi_mutex);
return tmp;
+   }
}
-   if (i >= AMDGPU_MAX_XGMI_HIVE)
+   if (i >= AMDGPU_MAX_XGMI_HIVE) {
+   mutex_unlock(&xgmi_mutex);
return NULL;
+   }
 
/* initialize new hive if not exist */
tmp = &xgmi_hives[hive_count++];
tmp->hive_id = adev->gmc.xgmi.hive_id;
INIT_LIST_HEAD(&tmp->device_list);
mutex_init(&tmp->hive_lock);
+   if (lock)
+   mutex_lock(&tmp->hive_lock);
+
+   mutex_unlock(&xgmi_mutex);
 
return tmp;
 }
@@ -111,8 +124,8 @@ int amdgpu_xgmi_add_device(struct amdgpu_device *adev)
return ret;
}
 
-   mutex_lock(&xgmi_mutex);
-   hive = amdgpu_get_xgmi_hive(adev);
+   /* find hive and take lock */
+   hive = amdgpu_get_xgmi_hive(adev, 1);
if (!hive)
goto exit;
 
@@ -142,8 +155,8 @@ int amdgpu_xgmi_add_device(struct amdgpu_device *adev)
break;
}
 
+   mutex_unlock(&hive->hive_lock);
 exit:
-   mutex_unlock(&xgmi_mutex);
return ret;
 }
 
@@ -154,15 +167,12 @@ void amdgpu_xgmi_remove_device(struct amdgpu_device *adev)
if (!adev->gmc.xgmi.supported)
return;
 
-   mutex_lock(&xgmi_mutex);
-
-   hive = amdgpu_get_xgmi_hive(adev);
+   hive = amdgpu_get_xgmi_hive(adev, 1);
if (!hive)
-   goto exit;
+   return;
 
if (!(hive->number_devices--))
mutex_destroy(&hive->hive_lock);
-
-exit:
-   mutex_unlock(&xgmi_mutex);
+   else
+   mutex_unlock(&hive->hive_lock);
 }
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.h
index 6151eb9c8ad3..8d7984844174 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.h
@@ -32,7 +32,7 @@ struct amdgpu_hive_info {
struct mutex hive_lock;
 };
 
-struct amdgpu_hive_info *amdgpu_get_xgmi_hive(struct amdgpu_device *adev);
+struct amdgpu_hive_info *amdgpu_get_xgmi_hive(struct amdgpu_device *adev, int lock);
 int amdgpu_xgmi_update_topology(struct amdgpu_hive_info *hive, struct amdgpu_device *adev);
 int amdgpu_xgmi_add_device(struct amdgpu_device *adev);
 void amdgpu_xgmi_remove_device(struct amdgpu_device *adev);
-- 
2.17.2

___
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx


Re: [PATCH] add missing mutex lock to amdgpu_get_xgmi_hive()

2019-01-07 Thread StDenis, Tom
Self-NAK this ... the calling functions take the same lock.

We should remove the locks from the callers so this function is
thread-safe on its own...
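
(Illustration only: kernel mutexes are not recursive, so with this v1 a
caller already holding xgmi_mutex would deadlock on itself:)

	mutex_lock(&xgmi_mutex);            /* e.g. in amdgpu_xgmi_add_device() */
	hive = amdgpu_get_xgmi_hive(adev);  /* v1 locks xgmi_mutex again ...    */
	                                    /* ... and blocks forever           */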

Tom


On 2019-01-07 10:00 a.m., StDenis, Tom wrote:
> Signed-off-by: Tom St Denis 
> ---
>   drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c | 13 +++--
>   1 file changed, 11 insertions(+), 2 deletions(-)
> 
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c
> index 8a8bc60cb6b4..587a5f73ae8c 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c
> @@ -47,13 +47,20 @@ struct amdgpu_hive_info *amdgpu_get_xgmi_hive(struct amdgpu_device *adev)
>   
>   if (!adev->gmc.xgmi.hive_id)
>   return NULL;
> +
> + mutex_lock(&xgmi_mutex);
> +
>   for (i = 0 ; i < hive_count; ++i) {
>   tmp = &xgmi_hives[i];
> - if (tmp->hive_id == adev->gmc.xgmi.hive_id)
> + if (tmp->hive_id == adev->gmc.xgmi.hive_id) {
> + mutex_unlock(&xgmi_mutex);
>   return tmp;
> + }
>   }
> - if (i >= AMDGPU_MAX_XGMI_HIVE)
> + if (i >= AMDGPU_MAX_XGMI_HIVE) {
> + mutex_unlock(&xgmi_mutex);
>   return NULL;
> + }
>   
>   /* initialize new hive if not exist */
>   tmp = &xgmi_hives[hive_count++];
> @@ -61,6 +68,8 @@ struct amdgpu_hive_info *amdgpu_get_xgmi_hive(struct amdgpu_device *adev)
>   INIT_LIST_HEAD(&tmp->device_list);
>   mutex_init(&tmp->hive_lock);
>   
> + mutex_unlock(&xgmi_mutex);
> +
>   return tmp;
>   }
>   
> 

___
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx


[PATCH] add missing mutex lock to amdgpu_get_xgmi_hive()

2019-01-07 Thread StDenis, Tom
Signed-off-by: Tom St Denis 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c | 13 +++--
 1 file changed, 11 insertions(+), 2 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c
index 8a8bc60cb6b4..587a5f73ae8c 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c
@@ -47,13 +47,20 @@ struct amdgpu_hive_info *amdgpu_get_xgmi_hive(struct amdgpu_device *adev)
 
if (!adev->gmc.xgmi.hive_id)
return NULL;
+
+   mutex_lock(&xgmi_mutex);
+
for (i = 0 ; i < hive_count; ++i) {
tmp = &xgmi_hives[i];
-   if (tmp->hive_id == adev->gmc.xgmi.hive_id)
+   if (tmp->hive_id == adev->gmc.xgmi.hive_id) {
+   mutex_unlock(&xgmi_mutex);
return tmp;
+   }
}
-   if (i >= AMDGPU_MAX_XGMI_HIVE)
+   if (i >= AMDGPU_MAX_XGMI_HIVE) {
+   mutex_unlock(&xgmi_mutex);
return NULL;
+   }
 
/* initialize new hive if not exist */
tmp = &xgmi_hives[hive_count++];
@@ -61,6 +68,8 @@ struct amdgpu_hive_info *amdgpu_get_xgmi_hive(struct amdgpu_device *adev)
INIT_LIST_HEAD(&tmp->device_list);
mutex_init(&tmp->hive_lock);
 
+   mutex_unlock(&xgmi_mutex);
+
return tmp;
 }
 
-- 
2.17.2

___
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx