Re: [PATCH] add missing mutex lock to amdgpu_get_xgmi_hive() (v3)
Reviewed-by: Andrey Grodzovsky Andrey On 01/07/2019 12:31 PM, StDenis, Tom wrote: > v2: Move locks around in other functions so that this > function can stand on its own. Also only hold the hive > specific lock for add/remove device instead of the driver > global lock so you can't add/remove devices in parallel from > one hive. > > v3: add reset_lock > > Signed-off-by: Tom St Denis > --- > drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 6 ++-- > drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c | 41 ++ > drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.h | 5 +-- > 3 files changed, 33 insertions(+), 19 deletions(-) > > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c > b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c > index 39d5d058b2c7..1a558dc41ba6 100644 > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c > @@ -3525,9 +3525,9 @@ int amdgpu_device_gpu_recover(struct amdgpu_device > *adev, >* by different nodes. No point also since the one node already > executing >* reset will also reset all the other nodes in the hive. 
>*/ > - hive = amdgpu_get_xgmi_hive(adev); > + hive = amdgpu_get_xgmi_hive(adev, 0); > if (hive && adev->gmc.xgmi.num_physical_nodes > 1 && > - !mutex_trylock(>hive_lock)) > + !mutex_trylock(>reset_lock)) > return 0; > > /* Start with adev pre asic reset first for soft reset check.*/ > @@ -3606,7 +3606,7 @@ int amdgpu_device_gpu_recover(struct amdgpu_device > *adev, > } > > if (hive && adev->gmc.xgmi.num_physical_nodes > 1) > - mutex_unlock(>hive_lock); > + mutex_unlock(>reset_lock); > > if (r) > dev_info(adev->dev, "GPU reset end with ret = %d\n", r); > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c > b/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c > index 8a8bc60cb6b4..9e98ab8b1525 100644 > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c > @@ -40,26 +40,40 @@ void *amdgpu_xgmi_hive_try_lock(struct amdgpu_hive_info > *hive) > return >device_list; > } > > -struct amdgpu_hive_info *amdgpu_get_xgmi_hive(struct amdgpu_device *adev) > +struct amdgpu_hive_info *amdgpu_get_xgmi_hive(struct amdgpu_device *adev, > int lock) > { > int i; > struct amdgpu_hive_info *tmp; > > if (!adev->gmc.xgmi.hive_id) > return NULL; > + > + mutex_lock(_mutex); > + > for (i = 0 ; i < hive_count; ++i) { > tmp = _hives[i]; > - if (tmp->hive_id == adev->gmc.xgmi.hive_id) > + if (tmp->hive_id == adev->gmc.xgmi.hive_id) { > + if (lock) > + mutex_lock(>hive_lock); > + mutex_unlock(_mutex); > return tmp; > + } > } > - if (i >= AMDGPU_MAX_XGMI_HIVE) > + if (i >= AMDGPU_MAX_XGMI_HIVE) { > + mutex_unlock(_mutex); > return NULL; > + } > > /* initialize new hive if not exist */ > tmp = _hives[hive_count++]; > tmp->hive_id = adev->gmc.xgmi.hive_id; > INIT_LIST_HEAD(>device_list); > mutex_init(>hive_lock); > + mutex_init(>reset_lock); > + if (lock) > + mutex_lock(>hive_lock); > + > + mutex_unlock(_mutex); > > return tmp; > } > @@ -111,8 +125,8 @@ int amdgpu_xgmi_add_device(struct amdgpu_device *adev) > return ret; > } > > - mutex_lock(_mutex); > - hive = 
amdgpu_get_xgmi_hive(adev); > + /* find hive and take lock */ > + hive = amdgpu_get_xgmi_hive(adev, 1); > if (!hive) > goto exit; > > @@ -142,8 +156,8 @@ int amdgpu_xgmi_add_device(struct amdgpu_device *adev) > break; > } > > + mutex_unlock(>hive_lock); > exit: > - mutex_unlock(_mutex); > return ret; > } > > @@ -154,15 +168,14 @@ void amdgpu_xgmi_remove_device(struct amdgpu_device > *adev) > if (!adev->gmc.xgmi.supported) > return; > > - mutex_lock(_mutex); > - > - hive = amdgpu_get_xgmi_hive(adev); > + hive = amdgpu_get_xgmi_hive(adev, 1); > if (!hive) > - goto exit; > + return; > > - if (!(hive->number_devices--)) > + if (!(hive->number_devices--)) { > mutex_destroy(>hive_lock); > - > -exit: > - mutex_unlock(_mutex); > + mutex_destroy(>reset_lock); > + } else { > + mutex_unlock(>hive_lock); > + } > } > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.h > b/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.h > index 6151eb9c8ad3..14bc60664159 100644 > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.h > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.h > @@ -29,10 +29,11 @@ struct amdgpu_hive_info { > struct list_headdevice_list; > struct psp_xgmi_topology_info
[PATCH] add missing mutex lock to amdgpu_get_xgmi_hive() (v3)
v2: Move locks around in other functions so that this function can stand on its own. Also only hold the hive specific lock for add/remove device instead of the driver global lock so you can't add/remove devices in parallel from one hive. v3: add reset_lock Signed-off-by: Tom St Denis --- drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 6 ++-- drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c | 41 ++ drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.h | 5 +-- 3 files changed, 33 insertions(+), 19 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c index 39d5d058b2c7..1a558dc41ba6 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c @@ -3525,9 +3525,9 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev, * by different nodes. No point also since the one node already executing * reset will also reset all the other nodes in the hive. */ - hive = amdgpu_get_xgmi_hive(adev); + hive = amdgpu_get_xgmi_hive(adev, 0); if (hive && adev->gmc.xgmi.num_physical_nodes > 1 && - !mutex_trylock(>hive_lock)) + !mutex_trylock(>reset_lock)) return 0; /* Start with adev pre asic reset first for soft reset check.*/ @@ -3606,7 +3606,7 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev, } if (hive && adev->gmc.xgmi.num_physical_nodes > 1) - mutex_unlock(>hive_lock); + mutex_unlock(>reset_lock); if (r) dev_info(adev->dev, "GPU reset end with ret = %d\n", r); diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c index 8a8bc60cb6b4..9e98ab8b1525 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c @@ -40,26 +40,40 @@ void *amdgpu_xgmi_hive_try_lock(struct amdgpu_hive_info *hive) return >device_list; } -struct amdgpu_hive_info *amdgpu_get_xgmi_hive(struct amdgpu_device *adev) +struct amdgpu_hive_info *amdgpu_get_xgmi_hive(struct amdgpu_device *adev, int lock) { int i; struct amdgpu_hive_info *tmp; if 
(!adev->gmc.xgmi.hive_id) return NULL; + + mutex_lock(_mutex); + for (i = 0 ; i < hive_count; ++i) { tmp = _hives[i]; - if (tmp->hive_id == adev->gmc.xgmi.hive_id) + if (tmp->hive_id == adev->gmc.xgmi.hive_id) { + if (lock) + mutex_lock(>hive_lock); + mutex_unlock(_mutex); return tmp; + } } - if (i >= AMDGPU_MAX_XGMI_HIVE) + if (i >= AMDGPU_MAX_XGMI_HIVE) { + mutex_unlock(_mutex); return NULL; + } /* initialize new hive if not exist */ tmp = _hives[hive_count++]; tmp->hive_id = adev->gmc.xgmi.hive_id; INIT_LIST_HEAD(>device_list); mutex_init(>hive_lock); + mutex_init(>reset_lock); + if (lock) + mutex_lock(>hive_lock); + + mutex_unlock(_mutex); return tmp; } @@ -111,8 +125,8 @@ int amdgpu_xgmi_add_device(struct amdgpu_device *adev) return ret; } - mutex_lock(_mutex); - hive = amdgpu_get_xgmi_hive(adev); + /* find hive and take lock */ + hive = amdgpu_get_xgmi_hive(adev, 1); if (!hive) goto exit; @@ -142,8 +156,8 @@ int amdgpu_xgmi_add_device(struct amdgpu_device *adev) break; } + mutex_unlock(>hive_lock); exit: - mutex_unlock(_mutex); return ret; } @@ -154,15 +168,14 @@ void amdgpu_xgmi_remove_device(struct amdgpu_device *adev) if (!adev->gmc.xgmi.supported) return; - mutex_lock(_mutex); - - hive = amdgpu_get_xgmi_hive(adev); + hive = amdgpu_get_xgmi_hive(adev, 1); if (!hive) - goto exit; + return; - if (!(hive->number_devices--)) + if (!(hive->number_devices--)) { mutex_destroy(>hive_lock); - -exit: - mutex_unlock(_mutex); + mutex_destroy(>reset_lock); + } else { + mutex_unlock(>hive_lock); + } } diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.h index 6151eb9c8ad3..14bc60664159 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.h @@ -29,10 +29,11 @@ struct amdgpu_hive_info { struct list_headdevice_list; struct psp_xgmi_topology_info topology_info; int number_devices; - struct mutex hive_lock; + struct mutex hive_lock, +reset_lock; }; -struct amdgpu_hive_info 
*amdgpu_get_xgmi_hive(struct amdgpu_device *adev); +struct amdgpu_hive_info *amdgpu_get_xgmi_hive(struct amdgpu_device *adev, int lock);
Re: [PATCH] add missing mutex lock to amdgpu_get_xgmi_hive() (v2)
On 01/07/2019 12:03 PM, StDenis, Tom wrote: > On 2019-01-07 12:00 p.m., Grodzovsky, Andrey wrote: >> >> On 01/07/2019 11:53 AM, StDenis, Tom wrote: >>> On 2019-01-07 11:51 a.m., Grodzovsky, Andrey wrote: >>>> On 01/07/2019 11:36 AM, StDenis, Tom wrote: >>>>> On 2019-01-07 11:33 a.m., Grodzovsky, Andrey wrote: >>>>>> On 01/07/2019 11:16 AM, Liu, Shaoyun wrote: >>>>>>> I think it's reasonable to use the hive specific lock for hive >>>>>>> specific functions. >>>>>>> The changes is acked-by Shaoyun.liu < shaoyun@amd.com> >>>>>>> >>>>>>> -Original Message- >>>>>>> From: amd-gfx On Behalf Of >>>>>>> StDenis, Tom >>>>>>> Sent: Monday, January 7, 2019 10:16 AM >>>>>>> To: amd-gfx@lists.freedesktop.org >>>>>>> Cc: StDenis, Tom >>>>>>> Subject: [PATCH] add missing mutex lock to amdgpu_get_xgmi_hive() (v2) >>>>>>> >>>>>>> v2: Move locks around in other functions so that this function can >>>>>>> stand on its own. Also only hold the hive specific lock for add/remove >>>>>>> device instead of the driver global lock so you can't add/remove >>>>>>> devices in parallel from one hive. >>>>>>> >>>>>>> Signed-off-by: Tom St Denis >>>>>>> --- >>>>>>> drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 2 +- >>>>>>> drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c | 36 >>>>>>> ++ >>>>>>> drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.h | 2 +- >>>>>>> 3 files changed, 25 insertions(+), 15 deletions(-) >>>>>>> >>>>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c >>>>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c >>>>>>> index 39d5d058b2c7..13d8e2ad2f7a 100644 >>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c >>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c >>>>>>> @@ -3525,7 +3525,7 @@ int amdgpu_device_gpu_recover(struct >>>>>>> amdgpu_device *adev, >>>>>>> * by different nodes. No point also since the one node >>>>>>> already executing >>>>>>> * reset will also reset all the other nodes in the >>>>>>> hive. 
>>>>>>> */ >>>>>>> - hive = amdgpu_get_xgmi_hive(adev); >>>>>>> + hive = amdgpu_get_xgmi_hive(adev, 0); >>>>>>> if (hive && adev->gmc.xgmi.num_physical_nodes > 1 && >>>>>>> !mutex_trylock(>hive_lock)) >>>>>>> return 0; >>>>>> Let's say i have device 0 in hive A and it just got a gpu reset and at >>>>>> the same time device 1 is being added to same have though >>>>>> amdgpu_xgmi_add_device, hive->hive_lock is acquired by this new device >>>>>> being added and so gpu reset for device 0 bails out on >>>>>> '!mutex_trylock(>hive_lock))' without completing the reset. >>>>>> Also in general i feel a bit uncomfortable about the confusing >>>>>> acquisition scheme in the function and the fact that you take the >>>>>> hive->hive_lock inside amdgpu_get_xgmi_hive but release is still outside >>>>>> of the function. >>>>> Is adding a device while resetting a device even a valid operation >>>>> anyways? >>>> In theory it's valid if you have hot pluggable devices >>>>> I think this means more so that the reset logic is broken. Instead >>>>> there should be a per-hive reset lock that is taken and that is tested >>>>> instead. >>>>> >>>>> Tom >>>> The hive->hive_lock was added exactly for this purpose and used only for >>>> that purpose. Maybe the naming i gave it wasn't reflective of it's >>>> purpose :) >>> But the add/remove should use per-hive locks not the global lock... :-) >>> >>> (I'm honestly not trying to bike shed I just thought the get_hive >>> function looked wrong :-)). >>> >>> Tom >> Totally agree with you, if Shayun (who origianlly added the global >> xgmi_mutex) is fine with switching to per hive mutex then me too, I just >> point out the problem with gpu reset and as you said we then need to >> rename the existing hive_lock into reset_lock and then and another per >> hive lock to do what you propose. Also - is there a way to not take the >> hive lock inside amdgpu_get_xgmi_hive but release it outside ? 
AFAIK >> it's an opening for problems where people use it but forget to call >> release. > I wanted to take the per-hive lock inside get_hive because it also takes > the global lock so that add/remove couldn't happen in parallel. > > For instance, deleting the last node while adding a new node means the > per-hive mutex could be in limbo (because remove will delete the lock). > > Adding a per-hive reset lock would fix the remaining issues no? > > Tom Looks like it. Andrey > ___ > amd-gfx mailing list > amd-gfx@lists.freedesktop.org > https://lists.freedesktop.org/mailman/listinfo/amd-gfx ___ amd-gfx mailing list amd-gfx@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/amd-gfx
Re: [PATCH] add missing mutex lock to amdgpu_get_xgmi_hive() (v2)
On 2019-01-07 12:00 p.m., Grodzovsky, Andrey wrote: > > > On 01/07/2019 11:53 AM, StDenis, Tom wrote: >> On 2019-01-07 11:51 a.m., Grodzovsky, Andrey wrote: >>> >>> On 01/07/2019 11:36 AM, StDenis, Tom wrote: >>>> On 2019-01-07 11:33 a.m., Grodzovsky, Andrey wrote: >>>>> On 01/07/2019 11:16 AM, Liu, Shaoyun wrote: >>>>>> I think it's reasonable to use the hive specific lock for hive >>>>>> specific functions. >>>>>> The changes is acked-by Shaoyun.liu < shaoyun@amd.com> >>>>>> >>>>>> -Original Message- >>>>>> From: amd-gfx On Behalf Of >>>>>> StDenis, Tom >>>>>> Sent: Monday, January 7, 2019 10:16 AM >>>>>> To: amd-gfx@lists.freedesktop.org >>>>>> Cc: StDenis, Tom >>>>>> Subject: [PATCH] add missing mutex lock to amdgpu_get_xgmi_hive() (v2) >>>>>> >>>>>> v2: Move locks around in other functions so that this function can stand >>>>>> on its own. Also only hold the hive specific lock for add/remove device >>>>>> instead of the driver global lock so you can't add/remove devices in >>>>>> parallel from one hive. >>>>>> >>>>>> Signed-off-by: Tom St Denis >>>>>> --- >>>>>>drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 2 +- >>>>>>drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c | 36 >>>>>> ++ >>>>>>drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.h | 2 +- >>>>>>3 files changed, 25 insertions(+), 15 deletions(-) >>>>>> >>>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c >>>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c >>>>>> index 39d5d058b2c7..13d8e2ad2f7a 100644 >>>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c >>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c >>>>>> @@ -3525,7 +3525,7 @@ int amdgpu_device_gpu_recover(struct amdgpu_device >>>>>> *adev, >>>>>> * by different nodes. No point also since the one node already >>>>>> executing >>>>>> * reset will also reset all the other nodes in the hive. 
>>>>>> */ >>>>>> -hive = amdgpu_get_xgmi_hive(adev); >>>>>> +hive = amdgpu_get_xgmi_hive(adev, 0); >>>>>> if (hive && adev->gmc.xgmi.num_physical_nodes > 1 && >>>>>> !mutex_trylock(>hive_lock)) >>>>>> return 0; >>>>> Let's say i have device 0 in hive A and it just got a gpu reset and at >>>>> the same time device 1 is being added to same have though >>>>> amdgpu_xgmi_add_device, hive->hive_lock is acquired by this new device >>>>> being added and so gpu reset for device 0 bails out on >>>>> '!mutex_trylock(>hive_lock))' without completing the reset. >>>>> Also in general i feel a bit uncomfortable about the confusing >>>>> acquisition scheme in the function and the fact that you take the >>>>> hive->hive_lock inside amdgpu_get_xgmi_hive but release is still outside >>>>> of the function. >>>> Is adding a device while resetting a device even a valid operation >>>> anyways? >>> In theory it's valid if you have hot pluggable devices >>>> I think this means more so that the reset logic is broken. Instead >>>> there should be a per-hive reset lock that is taken and that is tested >>>> instead. >>>> >>>> Tom >>> The hive->hive_lock was added exactly for this purpose and used only for >>> that purpose. Maybe the naming i gave it wasn't reflective of it's >>> purpose :) >> >> But the add/remove should use per-hive locks not the global lock... :-) >> >> (I'm honestly not trying to bike shed I just thought the get_hive >> function looked wrong :-)). >> >> Tom > > Totally agree with you, if Shayun (who origianlly added the global > xgmi_mutex) is fine with switching to per hive mutex then me too, I just > point out the problem with gpu reset and as you said we then need to > rename the existing hive_lock into reset_lock and then and another per > hive lock to do what you propose. Also - is there a way to not take the > hive lock inside amdgpu_get_xgmi_hive but release it outside ? AFAIK > it's an opening for problems where people use it but forget to call > release. 
I wanted to take the per-hive lock inside get_hive because it also takes the global lock so that add/remove couldn't happen in parallel. For instance, deleting the last node while adding a new node means the per-hive mutex could be in limbo (because remove will delete the lock). Adding a per-hive reset lock would fix the remaining issues no? Tom ___ amd-gfx mailing list amd-gfx@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/amd-gfx
Re: [PATCH] add missing mutex lock to amdgpu_get_xgmi_hive() (v2)
On 01/07/2019 11:53 AM, StDenis, Tom wrote: > On 2019-01-07 11:51 a.m., Grodzovsky, Andrey wrote: >> >> On 01/07/2019 11:36 AM, StDenis, Tom wrote: >>> On 2019-01-07 11:33 a.m., Grodzovsky, Andrey wrote: >>>> On 01/07/2019 11:16 AM, Liu, Shaoyun wrote: >>>>> I think it's reasonable to use the hive specific lock for hive specific >>>>> functions. >>>>> The changes is acked-by Shaoyun.liu < shaoyun@amd.com> >>>>> >>>>> -Original Message- >>>>> From: amd-gfx On Behalf Of >>>>> StDenis, Tom >>>>> Sent: Monday, January 7, 2019 10:16 AM >>>>> To: amd-gfx@lists.freedesktop.org >>>>> Cc: StDenis, Tom >>>>> Subject: [PATCH] add missing mutex lock to amdgpu_get_xgmi_hive() (v2) >>>>> >>>>> v2: Move locks around in other functions so that this function can stand >>>>> on its own. Also only hold the hive specific lock for add/remove device >>>>> instead of the driver global lock so you can't add/remove devices in >>>>> parallel from one hive. >>>>> >>>>> Signed-off-by: Tom St Denis >>>>> --- >>>>> drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 2 +- >>>>> drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c | 36 >>>>> ++ >>>>> drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.h | 2 +- >>>>> 3 files changed, 25 insertions(+), 15 deletions(-) >>>>> >>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c >>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c >>>>> index 39d5d058b2c7..13d8e2ad2f7a 100644 >>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c >>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c >>>>> @@ -3525,7 +3525,7 @@ int amdgpu_device_gpu_recover(struct amdgpu_device >>>>> *adev, >>>>>* by different nodes. No point also since the one node already >>>>> executing >>>>>* reset will also reset all the other nodes in the hive. 
>>>>>*/ >>>>> - hive = amdgpu_get_xgmi_hive(adev); >>>>> + hive = amdgpu_get_xgmi_hive(adev, 0); >>>>> if (hive && adev->gmc.xgmi.num_physical_nodes > 1 && >>>>> !mutex_trylock(>hive_lock)) >>>>> return 0; >>>> Let's say i have device 0 in hive A and it just got a gpu reset and at >>>> the same time device 1 is being added to same have though >>>> amdgpu_xgmi_add_device, hive->hive_lock is acquired by this new device >>>> being added and so gpu reset for device 0 bails out on >>>> '!mutex_trylock(>hive_lock))' without completing the reset. >>>> Also in general i feel a bit uncomfortable about the confusing >>>> acquisition scheme in the function and the fact that you take the >>>> hive->hive_lock inside amdgpu_get_xgmi_hive but release is still outside >>>> of the function. >>> Is adding a device while resetting a device even a valid operation >>> anyways? >> In theory it's valid if you have hot pluggable devices >>> I think this means more so that the reset logic is broken. Instead >>> there should be a per-hive reset lock that is taken and that is tested >>> instead. >>> >>> Tom >> The hive->hive_lock was added exactly for this purpose and used only for >> that purpose. Maybe the naming i gave it wasn't reflective of it's >> purpose :) > > But the add/remove should use per-hive locks not the global lock... :-) > > (I'm honestly not trying to bike shed I just thought the get_hive > function looked wrong :-)). > > Tom Totally agree with you, if Shayun (who origianlly added the global xgmi_mutex) is fine with switching to per hive mutex then me too, I just point out the problem with gpu reset and as you said we then need to rename the existing hive_lock into reset_lock and then and another per hive lock to do what you propose. Also - is there a way to not take the hive lock inside amdgpu_get_xgmi_hive but release it outside ? AFAIK it's an opening for problems where people use it but forget to call release. 
Andrey > ___ > amd-gfx mailing list > amd-gfx@lists.freedesktop.org > https://lists.freedesktop.org/mailman/listinfo/amd-gfx ___ amd-gfx mailing list amd-gfx@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/amd-gfx
Re: [PATCH] add missing mutex lock to amdgpu_get_xgmi_hive() (v2)
On 2019-01-07 11:51 a.m., Grodzovsky, Andrey wrote: > > > On 01/07/2019 11:36 AM, StDenis, Tom wrote: >> On 2019-01-07 11:33 a.m., Grodzovsky, Andrey wrote: >>> >>> On 01/07/2019 11:16 AM, Liu, Shaoyun wrote: >>>> I think it's reasonable to use the hive specific lock for hive specific >>>> functions. >>>> The changes is acked-by Shaoyun.liu < shaoyun@amd.com> >>>> >>>> -Original Message- >>>> From: amd-gfx On Behalf Of >>>> StDenis, Tom >>>> Sent: Monday, January 7, 2019 10:16 AM >>>> To: amd-gfx@lists.freedesktop.org >>>> Cc: StDenis, Tom >>>> Subject: [PATCH] add missing mutex lock to amdgpu_get_xgmi_hive() (v2) >>>> >>>> v2: Move locks around in other functions so that this function can stand >>>> on its own. Also only hold the hive specific lock for add/remove device >>>> instead of the driver global lock so you can't add/remove devices in >>>> parallel from one hive. >>>> >>>> Signed-off-by: Tom St Denis >>>> --- >>>> drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 2 +- >>>> drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c | 36 ++ >>>> drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.h | 2 +- >>>> 3 files changed, 25 insertions(+), 15 deletions(-) >>>> >>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c >>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c >>>> index 39d5d058b2c7..13d8e2ad2f7a 100644 >>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c >>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c >>>> @@ -3525,7 +3525,7 @@ int amdgpu_device_gpu_recover(struct amdgpu_device >>>> *adev, >>>> * by different nodes. No point also since the one node already >>>> executing >>>> * reset will also reset all the other nodes in the hive. 
>>>> */ >>>> - hive = amdgpu_get_xgmi_hive(adev); >>>> + hive = amdgpu_get_xgmi_hive(adev, 0); >>>>if (hive && adev->gmc.xgmi.num_physical_nodes > 1 && >>>>!mutex_trylock(>hive_lock)) >>>>return 0; >>> Let's say i have device 0 in hive A and it just got a gpu reset and at >>> the same time device 1 is being added to same have though >>> amdgpu_xgmi_add_device, hive->hive_lock is acquired by this new device >>> being added and so gpu reset for device 0 bails out on >>> '!mutex_trylock(>hive_lock))' without completing the reset. >>> Also in general i feel a bit uncomfortable about the confusing >>> acquisition scheme in the function and the fact that you take the >>> hive->hive_lock inside amdgpu_get_xgmi_hive but release is still outside >>> of the function. >> Is adding a device while resetting a device even a valid operation >> anyways? > > In theory it's valid if you have hot pluggable devices >> >> I think this means more so that the reset logic is broken. Instead >> there should be a per-hive reset lock that is taken and that is tested >> instead. >> >> Tom > > The hive->hive_lock was added exactly for this purpose and used only for > that purpose. Maybe the naming i gave it wasn't reflective of it's > purpose :) But the add/remove should use per-hive locks not the global lock... :-) (I'm honestly not trying to bike shed I just thought the get_hive function looked wrong :-)). Tom ___ amd-gfx mailing list amd-gfx@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/amd-gfx
Re: [PATCH] add missing mutex lock to amdgpu_get_xgmi_hive() (v2)
On 01/07/2019 11:36 AM, StDenis, Tom wrote: > On 2019-01-07 11:33 a.m., Grodzovsky, Andrey wrote: >> >> On 01/07/2019 11:16 AM, Liu, Shaoyun wrote: >>> I think it's reasonable to use the hive specific lock for hive specific >>> functions. >>> The changes is acked-by Shaoyun.liu < shaoyun@amd.com> >>> >>> -Original Message- >>> From: amd-gfx On Behalf Of StDenis, >>> Tom >>> Sent: Monday, January 7, 2019 10:16 AM >>> To: amd-gfx@lists.freedesktop.org >>> Cc: StDenis, Tom >>> Subject: [PATCH] add missing mutex lock to amdgpu_get_xgmi_hive() (v2) >>> >>> v2: Move locks around in other functions so that this function can stand on >>> its own. Also only hold the hive specific lock for add/remove device >>> instead of the driver global lock so you can't add/remove devices in >>> parallel from one hive. >>> >>> Signed-off-by: Tom St Denis >>> --- >>> drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 2 +- >>> drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c | 36 ++ >>> drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.h | 2 +- >>> 3 files changed, 25 insertions(+), 15 deletions(-) >>> >>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c >>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c >>> index 39d5d058b2c7..13d8e2ad2f7a 100644 >>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c >>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c >>> @@ -3525,7 +3525,7 @@ int amdgpu_device_gpu_recover(struct amdgpu_device >>> *adev, >>> * by different nodes. No point also since the one node already >>> executing >>> * reset will also reset all the other nodes in the hive. 
>>> */ >>> - hive = amdgpu_get_xgmi_hive(adev); >>> + hive = amdgpu_get_xgmi_hive(adev, 0); >>> if (hive && adev->gmc.xgmi.num_physical_nodes > 1 && >>> !mutex_trylock(>hive_lock)) >>> return 0; >> Let's say i have device 0 in hive A and it just got a gpu reset and at >> the same time device 1 is being added to same have though >> amdgpu_xgmi_add_device, hive->hive_lock is acquired by this new device >> being added and so gpu reset for device 0 bails out on >> '!mutex_trylock(>hive_lock))' without completing the reset. >> Also in general i feel a bit uncomfortable about the confusing >> acquisition scheme in the function and the fact that you take the >> hive->hive_lock inside amdgpu_get_xgmi_hive but release is still outside >> of the function. > Is adding a device while resetting a device even a valid operation > anyways? In theory it's valid if you have hot pluggable devices > > I think this means more so that the reset logic is broken. Instead > there should be a per-hive reset lock that is taken and that is tested > instead. > > Tom The hive->hive_lock was added exactly for this purpose and used only for that purpose. 
Maybe the naming i gave it wasn't reflective of it's purpose :) Andrey > > >> Andrey >> >>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c >>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c >>> index 8a8bc60cb6b4..ebf50809485f 100644 >>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c >>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c >>> @@ -40,26 +40,39 @@ void *amdgpu_xgmi_hive_try_lock(struct amdgpu_hive_info >>> *hive) >>> return >device_list; >>> } >>> >>> -struct amdgpu_hive_info *amdgpu_get_xgmi_hive(struct amdgpu_device *adev) >>> +struct amdgpu_hive_info *amdgpu_get_xgmi_hive(struct amdgpu_device >>> +*adev, int lock) >>> { >>> int i; >>> struct amdgpu_hive_info *tmp; >>> >>> if (!adev->gmc.xgmi.hive_id) >>> return NULL; >>> + >>> + mutex_lock(_mutex); >>> + >>> for (i = 0 ; i < hive_count; ++i) { >>> tmp = _hives[i]; >>> - if (tmp->hive_id == adev->gmc.xgmi.hive_id) >>> + if (tmp->hive_id == adev->gmc.xgmi.hive_id) { >>> + if (lock) >>> + mutex_lock(>hive_lock); >>> + mutex_unlock(_mutex); >>> return tmp; >>> + } >>>
Re: [PATCH] add missing mutex lock to amdgpu_get_xgmi_hive() (v2)
On 2019-01-07 11:33 a.m., Grodzovsky, Andrey wrote: > > > On 01/07/2019 11:16 AM, Liu, Shaoyun wrote: >> I think it's reasonable to use the hive specific lock for hive specific >> functions. >> The changes is acked-by Shaoyun.liu < shaoyun@amd.com> >> >> -Original Message- >> From: amd-gfx On Behalf Of StDenis, >> Tom >> Sent: Monday, January 7, 2019 10:16 AM >> To: amd-gfx@lists.freedesktop.org >> Cc: StDenis, Tom >> Subject: [PATCH] add missing mutex lock to amdgpu_get_xgmi_hive() (v2) >> >> v2: Move locks around in other functions so that this function can stand on >> its own. Also only hold the hive specific lock for add/remove device >> instead of the driver global lock so you can't add/remove devices in >> parallel from one hive. >> >> Signed-off-by: Tom St Denis >> --- >>drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 2 +- >>drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c | 36 ++ >>drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.h | 2 +- >>3 files changed, 25 insertions(+), 15 deletions(-) >> >> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c >> b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c >> index 39d5d058b2c7..13d8e2ad2f7a 100644 >> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c >> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c >> @@ -3525,7 +3525,7 @@ int amdgpu_device_gpu_recover(struct amdgpu_device >> *adev, >> * by different nodes. No point also since the one node already >> executing >> * reset will also reset all the other nodes in the hive. >> */ >> -hive = amdgpu_get_xgmi_hive(adev); >> +hive = amdgpu_get_xgmi_hive(adev, 0); >> if (hive && adev->gmc.xgmi.num_physical_nodes > 1 && >> !mutex_trylock(>hive_lock)) >> return 0; > > Let's say i have device 0 in hive A and it just got a gpu reset and at > the same time device 1 is being added to same have though > amdgpu_xgmi_add_device, hive->hive_lock is acquired by this new device > being added and so gpu reset for device 0 bails out on > '!mutex_trylock(>hive_lock))' without completing the reset. 
> Also in general i feel a bit uncomfortable about the confusing > acquisition scheme in the function and the fact that you take the > hive->hive_lock inside amdgpu_get_xgmi_hive but release is still outside > of the function. Is adding a device while resetting a device even a valid operation anyways? I think this means more so that the reset logic is broken. Instead there should be a per-hive reset lock that is taken and that is tested instead. Tom > > Andrey > >> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c >> b/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c >> index 8a8bc60cb6b4..ebf50809485f 100644 >> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c >> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c >> @@ -40,26 +40,39 @@ void *amdgpu_xgmi_hive_try_lock(struct amdgpu_hive_info >> *hive) >> return >device_list; >>} >> >> -struct amdgpu_hive_info *amdgpu_get_xgmi_hive(struct amdgpu_device *adev) >> +struct amdgpu_hive_info *amdgpu_get_xgmi_hive(struct amdgpu_device >> +*adev, int lock) >>{ >> int i; >> struct amdgpu_hive_info *tmp; >> >> if (!adev->gmc.xgmi.hive_id) >> return NULL; >> + >> +mutex_lock(_mutex); >> + >> for (i = 0 ; i < hive_count; ++i) { >> tmp = _hives[i]; >> -if (tmp->hive_id == adev->gmc.xgmi.hive_id) >> +if (tmp->hive_id == adev->gmc.xgmi.hive_id) { >> +if (lock) >> +mutex_lock(>hive_lock); >> +mutex_unlock(_mutex); >> return tmp; >> +} >> } >> -if (i >= AMDGPU_MAX_XGMI_HIVE) >> +if (i >= AMDGPU_MAX_XGMI_HIVE) { >> +mutex_unlock(_mutex); >> return NULL; >> +} >> >> /* initialize new hive if not exist */ >> tmp = _hives[hive_count++]; >> tmp->hive_id = adev->gmc.xgmi.hive_id; >> INIT_LIST_HEAD(>device_list); >> mutex_init(>hive_lock); >> +if (lock) >> +mutex_lock(>hive_lock); >> + >> +mutex_unlock(_mutex); >> >> return tmp; >>} >> @@ -111,8 +124,8 @@ int amdgpu_xgmi_add_device(struct amdgpu_device *adev) >>
Re: [PATCH] add missing mutex lock to amdgpu_get_xgmi_hive() (v2)
On 01/07/2019 11:16 AM, Liu, Shaoyun wrote: > I think it's reasonable to use the hive specific lock for hive specific > functions. > The changes is acked-by Shaoyun.liu < shaoyun@amd.com> > > -Original Message- > From: amd-gfx On Behalf Of StDenis, > Tom > Sent: Monday, January 7, 2019 10:16 AM > To: amd-gfx@lists.freedesktop.org > Cc: StDenis, Tom > Subject: [PATCH] add missing mutex lock to amdgpu_get_xgmi_hive() (v2) > > v2: Move locks around in other functions so that this function can stand on > its own. Also only hold the hive specific lock for add/remove device instead > of the driver global lock so you can't add/remove devices in parallel from > one hive. > > Signed-off-by: Tom St Denis > --- > drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 2 +- > drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c | 36 ++ > drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.h | 2 +- > 3 files changed, 25 insertions(+), 15 deletions(-) > > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c > b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c > index 39d5d058b2c7..13d8e2ad2f7a 100644 > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c > @@ -3525,7 +3525,7 @@ int amdgpu_device_gpu_recover(struct amdgpu_device > *adev, >* by different nodes. No point also since the one node already > executing >* reset will also reset all the other nodes in the hive. >*/ > - hive = amdgpu_get_xgmi_hive(adev); > + hive = amdgpu_get_xgmi_hive(adev, 0); > if (hive && adev->gmc.xgmi.num_physical_nodes > 1 && > !mutex_trylock(>hive_lock)) > return 0; Let's say i have device 0 in hive A and it just got a gpu reset and at the same time device 1 is being added to same have though amdgpu_xgmi_add_device, hive->hive_lock is acquired by this new device being added and so gpu reset for device 0 bails out on '!mutex_trylock(>hive_lock))' without completing the reset. 
Also in general i feel a bit uncomfortable about the confusing acquisition scheme in the function and the fact that you take the hive->hive_lock inside amdgpu_get_xgmi_hive but release is still outside of the function. Andrey > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c > b/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c > index 8a8bc60cb6b4..ebf50809485f 100644 > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c > @@ -40,26 +40,39 @@ void *amdgpu_xgmi_hive_try_lock(struct amdgpu_hive_info > *hive) > return >device_list; > } > > -struct amdgpu_hive_info *amdgpu_get_xgmi_hive(struct amdgpu_device *adev) > +struct amdgpu_hive_info *amdgpu_get_xgmi_hive(struct amdgpu_device > +*adev, int lock) > { > int i; > struct amdgpu_hive_info *tmp; > > if (!adev->gmc.xgmi.hive_id) > return NULL; > + > + mutex_lock(_mutex); > + > for (i = 0 ; i < hive_count; ++i) { > tmp = _hives[i]; > - if (tmp->hive_id == adev->gmc.xgmi.hive_id) > + if (tmp->hive_id == adev->gmc.xgmi.hive_id) { > + if (lock) > + mutex_lock(>hive_lock); > + mutex_unlock(_mutex); > return tmp; > + } > } > - if (i >= AMDGPU_MAX_XGMI_HIVE) > + if (i >= AMDGPU_MAX_XGMI_HIVE) { > + mutex_unlock(_mutex); > return NULL; > + } > > /* initialize new hive if not exist */ > tmp = _hives[hive_count++]; > tmp->hive_id = adev->gmc.xgmi.hive_id; > INIT_LIST_HEAD(>device_list); > mutex_init(>hive_lock); > + if (lock) > + mutex_lock(>hive_lock); > + > + mutex_unlock(_mutex); > > return tmp; > } > @@ -111,8 +124,8 @@ int amdgpu_xgmi_add_device(struct amdgpu_device *adev) > return ret; > } > > - mutex_lock(_mutex); > - hive = amdgpu_get_xgmi_hive(adev); > + /* find hive and take lock */ > + hive = amdgpu_get_xgmi_hive(adev, 1); > if (!hive) > goto exit; > > @@ -142,8 +155,8 @@ int amdgpu_xgmi_add_device(struct amdgpu_device *adev) > break; > } > > + mutex_unlock(>hive_lock); > exit: > - mutex_unlock(_mutex); > return ret; > } > > @@ -154,15 +167,12 @@ void 
amdgpu_xgmi_remove_device(struct amdgpu_device > *adev) > if (!adev->gmc.xgmi.supported) > return; > > - mutex_lo
RE: [PATCH] add missing mutex lock to amdgpu_get_xgmi_hive() (v2)
I think it's reasonable to use the hive specific lock for hive specific functions. The changes is acked-by Shaoyun.liu < shaoyun@amd.com> -Original Message- From: amd-gfx On Behalf Of StDenis, Tom Sent: Monday, January 7, 2019 10:16 AM To: amd-gfx@lists.freedesktop.org Cc: StDenis, Tom Subject: [PATCH] add missing mutex lock to amdgpu_get_xgmi_hive() (v2) v2: Move locks around in other functions so that this function can stand on its own. Also only hold the hive specific lock for add/remove device instead of the driver global lock so you can't add/remove devices in parallel from one hive. Signed-off-by: Tom St Denis --- drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 2 +- drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c | 36 ++ drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.h | 2 +- 3 files changed, 25 insertions(+), 15 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c index 39d5d058b2c7..13d8e2ad2f7a 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c @@ -3525,7 +3525,7 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev, * by different nodes. No point also since the one node already executing * reset will also reset all the other nodes in the hive. 
*/ - hive = amdgpu_get_xgmi_hive(adev); + hive = amdgpu_get_xgmi_hive(adev, 0); if (hive && adev->gmc.xgmi.num_physical_nodes > 1 && !mutex_trylock(>hive_lock)) return 0; diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c index 8a8bc60cb6b4..ebf50809485f 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c @@ -40,26 +40,39 @@ void *amdgpu_xgmi_hive_try_lock(struct amdgpu_hive_info *hive) return >device_list; } -struct amdgpu_hive_info *amdgpu_get_xgmi_hive(struct amdgpu_device *adev) +struct amdgpu_hive_info *amdgpu_get_xgmi_hive(struct amdgpu_device +*adev, int lock) { int i; struct amdgpu_hive_info *tmp; if (!adev->gmc.xgmi.hive_id) return NULL; + + mutex_lock(_mutex); + for (i = 0 ; i < hive_count; ++i) { tmp = _hives[i]; - if (tmp->hive_id == adev->gmc.xgmi.hive_id) + if (tmp->hive_id == adev->gmc.xgmi.hive_id) { + if (lock) + mutex_lock(>hive_lock); + mutex_unlock(_mutex); return tmp; + } } - if (i >= AMDGPU_MAX_XGMI_HIVE) + if (i >= AMDGPU_MAX_XGMI_HIVE) { + mutex_unlock(_mutex); return NULL; + } /* initialize new hive if not exist */ tmp = _hives[hive_count++]; tmp->hive_id = adev->gmc.xgmi.hive_id; INIT_LIST_HEAD(>device_list); mutex_init(>hive_lock); + if (lock) + mutex_lock(>hive_lock); + + mutex_unlock(_mutex); return tmp; } @@ -111,8 +124,8 @@ int amdgpu_xgmi_add_device(struct amdgpu_device *adev) return ret; } - mutex_lock(_mutex); - hive = amdgpu_get_xgmi_hive(adev); + /* find hive and take lock */ + hive = amdgpu_get_xgmi_hive(adev, 1); if (!hive) goto exit; @@ -142,8 +155,8 @@ int amdgpu_xgmi_add_device(struct amdgpu_device *adev) break; } + mutex_unlock(>hive_lock); exit: - mutex_unlock(_mutex); return ret; } @@ -154,15 +167,12 @@ void amdgpu_xgmi_remove_device(struct amdgpu_device *adev) if (!adev->gmc.xgmi.supported) return; - mutex_lock(_mutex); - - hive = amdgpu_get_xgmi_hive(adev); + hive = amdgpu_get_xgmi_hive(adev, 1); if (!hive) - goto exit; + 
return; if (!(hive->number_devices--)) mutex_destroy(>hive_lock); - -exit: - mutex_unlock(_mutex); + else + mutex_unlock(>hive_lock); } diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.h index 6151eb9c8ad3..8d7984844174 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.h @@ -32,7 +32,7 @@ struct amdgpu_hive_info { struct mutex hive_lock; }; -struct amdgpu_hive_info *amdgpu_get_xgmi_hive(struct amdgpu_device *adev); +struct amdgpu_hive_info *amdgpu_get_xgmi_hive(struct amdgpu_device +*adev, int lock); int amdgpu_xgmi_update_topology(struct amdgpu_hive_info *hive, struct amdgpu_device *adev); int amdgpu_xgmi_add_device(struct amdgpu_device *adev); void amdgpu_xgmi_remove_device(struct amdgpu_device *adev); -- 2.17.2 _
[PATCH] add missing mutex lock to amdgpu_get_xgmi_hive() (v2)
v2: Move locks around in other functions so that this function can stand on its own. Also only hold the hive specific lock for add/remove device instead of the driver global lock so you can't add/remove devices in parallel from one hive. Signed-off-by: Tom St Denis --- drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 2 +- drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c | 36 ++ drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.h | 2 +- 3 files changed, 25 insertions(+), 15 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c index 39d5d058b2c7..13d8e2ad2f7a 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c @@ -3525,7 +3525,7 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev, * by different nodes. No point also since the one node already executing * reset will also reset all the other nodes in the hive. */ - hive = amdgpu_get_xgmi_hive(adev); + hive = amdgpu_get_xgmi_hive(adev, 0); if (hive && adev->gmc.xgmi.num_physical_nodes > 1 && !mutex_trylock(>hive_lock)) return 0; diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c index 8a8bc60cb6b4..ebf50809485f 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c @@ -40,26 +40,39 @@ void *amdgpu_xgmi_hive_try_lock(struct amdgpu_hive_info *hive) return >device_list; } -struct amdgpu_hive_info *amdgpu_get_xgmi_hive(struct amdgpu_device *adev) +struct amdgpu_hive_info *amdgpu_get_xgmi_hive(struct amdgpu_device *adev, int lock) { int i; struct amdgpu_hive_info *tmp; if (!adev->gmc.xgmi.hive_id) return NULL; + + mutex_lock(_mutex); + for (i = 0 ; i < hive_count; ++i) { tmp = _hives[i]; - if (tmp->hive_id == adev->gmc.xgmi.hive_id) + if (tmp->hive_id == adev->gmc.xgmi.hive_id) { + if (lock) + mutex_lock(>hive_lock); + mutex_unlock(_mutex); return tmp; + } } - if (i >= AMDGPU_MAX_XGMI_HIVE) + if (i >= AMDGPU_MAX_XGMI_HIVE) { + 
mutex_unlock(_mutex); return NULL; + } /* initialize new hive if not exist */ tmp = _hives[hive_count++]; tmp->hive_id = adev->gmc.xgmi.hive_id; INIT_LIST_HEAD(>device_list); mutex_init(>hive_lock); + if (lock) + mutex_lock(>hive_lock); + + mutex_unlock(_mutex); return tmp; } @@ -111,8 +124,8 @@ int amdgpu_xgmi_add_device(struct amdgpu_device *adev) return ret; } - mutex_lock(_mutex); - hive = amdgpu_get_xgmi_hive(adev); + /* find hive and take lock */ + hive = amdgpu_get_xgmi_hive(adev, 1); if (!hive) goto exit; @@ -142,8 +155,8 @@ int amdgpu_xgmi_add_device(struct amdgpu_device *adev) break; } + mutex_unlock(>hive_lock); exit: - mutex_unlock(_mutex); return ret; } @@ -154,15 +167,12 @@ void amdgpu_xgmi_remove_device(struct amdgpu_device *adev) if (!adev->gmc.xgmi.supported) return; - mutex_lock(_mutex); - - hive = amdgpu_get_xgmi_hive(adev); + hive = amdgpu_get_xgmi_hive(adev, 1); if (!hive) - goto exit; + return; if (!(hive->number_devices--)) mutex_destroy(>hive_lock); - -exit: - mutex_unlock(_mutex); + else + mutex_unlock(>hive_lock); } diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.h index 6151eb9c8ad3..8d7984844174 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.h @@ -32,7 +32,7 @@ struct amdgpu_hive_info { struct mutex hive_lock; }; -struct amdgpu_hive_info *amdgpu_get_xgmi_hive(struct amdgpu_device *adev); +struct amdgpu_hive_info *amdgpu_get_xgmi_hive(struct amdgpu_device *adev, int lock); int amdgpu_xgmi_update_topology(struct amdgpu_hive_info *hive, struct amdgpu_device *adev); int amdgpu_xgmi_add_device(struct amdgpu_device *adev); void amdgpu_xgmi_remove_device(struct amdgpu_device *adev); -- 2.17.2 ___ amd-gfx mailing list amd-gfx@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/amd-gfx
Re: [PATCH] add missing mutex lock to amdgpu_get_xgmi_hive()
Self NAK this ... calling functions take the same lock We should remove the locks from the callers so this function is thread safe on its own... Tom On 2019-01-07 10:00 a.m., StDenis, Tom wrote: > Signed-off-by: Tom St Denis > --- > drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c | 13 +++-- > 1 file changed, 11 insertions(+), 2 deletions(-) > > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c > b/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c > index 8a8bc60cb6b4..587a5f73ae8c 100644 > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c > @@ -47,13 +47,20 @@ struct amdgpu_hive_info *amdgpu_get_xgmi_hive(struct > amdgpu_device *adev) > > if (!adev->gmc.xgmi.hive_id) > return NULL; > + > + mutex_lock(_mutex); > + > for (i = 0 ; i < hive_count; ++i) { > tmp = _hives[i]; > - if (tmp->hive_id == adev->gmc.xgmi.hive_id) > + if (tmp->hive_id == adev->gmc.xgmi.hive_id) { > + mutex_unlock(_mutex); > return tmp; > + } > } > - if (i >= AMDGPU_MAX_XGMI_HIVE) > + if (i >= AMDGPU_MAX_XGMI_HIVE) { > + mutex_unlock(_mutex); > return NULL; > + } > > /* initialize new hive if not exist */ > tmp = _hives[hive_count++]; > @@ -61,6 +68,8 @@ struct amdgpu_hive_info *amdgpu_get_xgmi_hive(struct > amdgpu_device *adev) > INIT_LIST_HEAD(>device_list); > mutex_init(>hive_lock); > > + mutex_unlock(_mutex); > + > return tmp; > } > > ___ amd-gfx mailing list amd-gfx@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/amd-gfx
[PATCH] add missing mutex lock to amdgpu_get_xgmi_hive()
Signed-off-by: Tom St Denis --- drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c | 13 +++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c index 8a8bc60cb6b4..587a5f73ae8c 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c @@ -47,13 +47,20 @@ struct amdgpu_hive_info *amdgpu_get_xgmi_hive(struct amdgpu_device *adev) if (!adev->gmc.xgmi.hive_id) return NULL; + + mutex_lock(_mutex); + for (i = 0 ; i < hive_count; ++i) { tmp = _hives[i]; - if (tmp->hive_id == adev->gmc.xgmi.hive_id) + if (tmp->hive_id == adev->gmc.xgmi.hive_id) { + mutex_unlock(_mutex); return tmp; + } } - if (i >= AMDGPU_MAX_XGMI_HIVE) + if (i >= AMDGPU_MAX_XGMI_HIVE) { + mutex_unlock(_mutex); return NULL; + } /* initialize new hive if not exist */ tmp = _hives[hive_count++]; @@ -61,6 +68,8 @@ struct amdgpu_hive_info *amdgpu_get_xgmi_hive(struct amdgpu_device *adev) INIT_LIST_HEAD(>device_list); mutex_init(>hive_lock); + mutex_unlock(_mutex); + return tmp; } -- 2.17.2 ___ amd-gfx mailing list amd-gfx@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/amd-gfx