Re: [PATCH] drm/amdkfd: Cleanup IO links during KFD device removal

2022-04-11 Thread Felix Kuehling

Am 2022-04-11 um 21:14 schrieb Joshi, Mukul:

[AMD Official Use Only]




-----Original Message-----
From: Kuehling, Felix 
Sent: Monday, April 11, 2022 8:16 PM
To: Joshi, Mukul ; amd-gfx@lists.freedesktop.org
Cc: Shuotao Xu 
Subject: Re: [PATCH] drm/amdkfd: Cleanup IO links during KFD device
removal

Am 2022-04-07 um 12:15 schrieb Mukul Joshi:

Currently, the IO-links to the device being removed from topology, are
not cleared. As a result, there would be dangling links left in the
KFD topology. This patch aims to fix the following:
1. Cleanup all IO links to the device being removed.
2. Ensure that node numbering in sysfs and nodes proximity domain
 values are consistent after the device is removed:
 a. Adding a device and removing a GPU device are made mutually
exclusive.
 b. The global proximity domain counter is no longer required to be
an atomic counter. A normal 32-bit counter can be used instead.
3. Update generation_count to let user-mode know that topology has
 changed due to device removal.

CC: Shuotao Xu 
Signed-off-by: Mukul Joshi 

Looks good to me. I have two nit-picks inline.



---
   drivers/gpu/drm/amd/amdkfd/kfd_crat.c |  4 +-
   drivers/gpu/drm/amd/amdkfd/kfd_priv.h |  2 +
   drivers/gpu/drm/amd/amdkfd/kfd_topology.c | 79

---

   3 files changed, 74 insertions(+), 11 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_crat.c
b/drivers/gpu/drm/amd/amdkfd/kfd_crat.c
index 1eaabd2cb41b..afc8a7fcdad8 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_crat.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_crat.c
@@ -1056,7 +1056,7 @@ static int kfd_parse_subtype_iolink(struct

crat_subtype_iolink *iolink,

 * table, add corresponded reversed direction link now.
 */
if (props && (iolink->flags & CRAT_IOLINK_FLAGS_BI_DIRECTIONAL))

{

-   to_dev =

kfd_topology_device_by_proximity_domain(id_to);

+   to_dev =

kfd_topology_device_by_proximity_domain_no_lock(id_to);

if (!to_dev)
return -ENODEV;
/* same everything but the other direction */
@@ -2225,7 +2225,7 @@ static int kfd_create_vcrat_image_gpu(void *pcrat_image,
 */
if (kdev->hive_id) {
for (nid = 0; nid < proximity_domain; ++nid) {
-   peer_dev =

kfd_topology_device_by_proximity_domain(nid);

+   peer_dev =

kfd_topology_device_by_proximity_domain_no_lock(nid);

if (!peer_dev->gpu)
continue;
if (peer_dev->gpu->hive_id != kdev->hive_id)
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
index e1b7e6afa920..8a43def1f638 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
@@ -1016,6 +1016,8 @@ int kfd_topology_add_device(struct kfd_dev

*gpu);

   int kfd_topology_remove_device(struct kfd_dev *gpu);
   struct kfd_topology_device

*kfd_topology_device_by_proximity_domain(

uint32_t proximity_domain);
+struct kfd_topology_device

*kfd_topology_device_by_proximity_domain_no_lock(

+   uint32_t proximity_domain);
   struct kfd_topology_device *kfd_topology_device_by_id(uint32_t

gpu_id);

   struct kfd_dev *kfd_device_by_id(uint32_t gpu_id);
   struct kfd_dev *kfd_device_by_pci_dev(const struct pci_dev *pdev);
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_topology.c
b/drivers/gpu/drm/amd/amdkfd/kfd_topology.c
index 3bdcae239bc0..874a273b81f7 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_topology.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_topology.c
@@ -46,27 +46,38 @@ static struct list_head topology_device_list;
   static struct kfd_system_properties sys_props;

   static DECLARE_RWSEM(topology_lock);
-static atomic_t topology_crat_proximity_domain;
+static uint32_t topology_crat_proximity_domain;

-struct kfd_topology_device

*kfd_topology_device_by_proximity_domain(

+struct kfd_topology_device
+*kfd_topology_device_by_proximity_domain_no_lock(
uint32_t proximity_domain)

I remember we discussed this and I suggested splitting a no_lock version out
of this function. But now I don't see it being used anywhere. Was that lost
somewhere in refactoring or porting to the upstream branch?
Maybe the no_lock version isn't needed any more.


It's used in the changes in kfd_crat.c (in kfd_create_vcrat_image_gpu() and
kfd_parse_subtype_iolink()) and below in
kfd_topology_device_by_proximity_domain().


You're right, I missed the changes in kfd_crat.c. And they are needed 
because the whole CRAT table parsing is now under the topology lock. 
Thanks for the reminder.


Regards,
  Felix





   {
struct kfd_topology_device *top_dev;
struct kfd_topology_device *device

RE: [PATCH] drm/amdkfd: Cleanup IO links during KFD device removal

2022-04-11 Thread Joshi, Mukul
[AMD Official Use Only]



> -----Original Message-----
> From: Kuehling, Felix 
> Sent: Monday, April 11, 2022 8:16 PM
> To: Joshi, Mukul ; amd-gfx@lists.freedesktop.org
> Cc: Shuotao Xu 
> Subject: Re: [PATCH] drm/amdkfd: Cleanup IO links during KFD device
> removal
> 
> Am 2022-04-07 um 12:15 schrieb Mukul Joshi:
> > Currently, the IO-links to the device being removed from topology, are
> > not cleared. As a result, there would be dangling links left in the
> > KFD topology. This patch aims to fix the following:
> > 1. Cleanup all IO links to the device being removed.
> > 2. Ensure that node numbering in sysfs and nodes proximity domain
> > values are consistent after the device is removed:
> > a. Adding a device and removing a GPU device are made mutually
> >exclusive.
> > b. The global proximity domain counter is no longer required to be
> >an atomic counter. A normal 32-bit counter can be used instead.
> > 3. Update generation_count to let user-mode know that topology has
> > changed due to device removal.
> >
> > CC: Shuotao Xu 
> > Signed-off-by: Mukul Joshi 
> 
> Looks good to me. I have two nit-picks inline.
> 
> 
> > ---
> >   drivers/gpu/drm/amd/amdkfd/kfd_crat.c |  4 +-
> >   drivers/gpu/drm/amd/amdkfd/kfd_priv.h |  2 +
> >   drivers/gpu/drm/amd/amdkfd/kfd_topology.c | 79
> ---
> >   3 files changed, 74 insertions(+), 11 deletions(-)
> >
> > diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_crat.c
> > b/drivers/gpu/drm/amd/amdkfd/kfd_crat.c
> > index 1eaabd2cb41b..afc8a7fcdad8 100644
> > --- a/drivers/gpu/drm/amd/amdkfd/kfd_crat.c
> > +++ b/drivers/gpu/drm/amd/amdkfd/kfd_crat.c
> > @@ -1056,7 +1056,7 @@ static int kfd_parse_subtype_iolink(struct
> crat_subtype_iolink *iolink,
> >  * table, add corresponded reversed direction link now.
> >  */
> > if (props && (iolink->flags & CRAT_IOLINK_FLAGS_BI_DIRECTIONAL))
> {
> > -   to_dev =
> kfd_topology_device_by_proximity_domain(id_to);
> > +   to_dev =
> kfd_topology_device_by_proximity_domain_no_lock(id_to);
> > if (!to_dev)
> > return -ENODEV;
> > /* same everything but the other direction */ @@ -2225,7
> +2225,7
> > @@ static int kfd_create_vcrat_image_gpu(void *pcrat_image,
> >  */
> > if (kdev->hive_id) {
> > for (nid = 0; nid < proximity_domain; ++nid) {
> > -   peer_dev =
> kfd_topology_device_by_proximity_domain(nid);
> > +   peer_dev =
> kfd_topology_device_by_proximity_domain_no_lock(nid);
> > if (!peer_dev->gpu)
> > continue;
> > if (peer_dev->gpu->hive_id != kdev->hive_id) diff --
> git
> > a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
> > b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
> > index e1b7e6afa920..8a43def1f638 100644
> > --- a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
> > +++ b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
> > @@ -1016,6 +1016,8 @@ int kfd_topology_add_device(struct kfd_dev
> *gpu);
> >   int kfd_topology_remove_device(struct kfd_dev *gpu);
> >   struct kfd_topology_device
> *kfd_topology_device_by_proximity_domain(
> > uint32_t proximity_domain);
> > +struct kfd_topology_device
> *kfd_topology_device_by_proximity_domain_no_lock(
> > +   uint32_t proximity_domain);
> >   struct kfd_topology_device *kfd_topology_device_by_id(uint32_t
> gpu_id);
> >   struct kfd_dev *kfd_device_by_id(uint32_t gpu_id);
> >   struct kfd_dev *kfd_device_by_pci_dev(const struct pci_dev *pdev);
> > diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_topology.c
> > b/drivers/gpu/drm/amd/amdkfd/kfd_topology.c
> > index 3bdcae239bc0..874a273b81f7 100644
> > --- a/drivers/gpu/drm/amd/amdkfd/kfd_topology.c
> > +++ b/drivers/gpu/drm/amd/amdkfd/kfd_topology.c
> > @@ -46,27 +46,38 @@ static struct list_head topology_device_list;
> >   static struct kfd_system_properties sys_props;
> >
> >   static DECLARE_RWSEM(topology_lock); -static atomic_t
> > topology_crat_proximity_domain;
> > +static uint32_t topology_crat_proximity_domain;
> >
> > -struct kfd_topology_device
> *kfd_topology_device_by_proximity_domain(
> > +struct kfd_topology_device
> > +*kfd_topology_device_by_proximity_domain_no_lock(
> > uint32_t proximity_domain)
> 
> I re

Re: [PATCH] drm/amdkfd: Cleanup IO links during KFD device removal

2022-04-11 Thread Felix Kuehling

Am 2022-04-07 um 12:15 schrieb Mukul Joshi:

Currently, the IO-links to the device being removed from topology,
are not cleared. As a result, there would be dangling links left in
the KFD topology. This patch aims to fix the following:
1. Cleanup all IO links to the device being removed.
2. Ensure that node numbering in sysfs and nodes proximity domain
values are consistent after the device is removed:
a. Adding a device and removing a GPU device are made mutually
   exclusive.
b. The global proximity domain counter is no longer required to be
   an atomic counter. A normal 32-bit counter can be used instead.
3. Update generation_count to let user-mode know that topology has
changed due to device removal.

CC: Shuotao Xu 
Signed-off-by: Mukul Joshi 


Looks good to me. I have two nit-picks inline.



---
  drivers/gpu/drm/amd/amdkfd/kfd_crat.c |  4 +-
  drivers/gpu/drm/amd/amdkfd/kfd_priv.h |  2 +
  drivers/gpu/drm/amd/amdkfd/kfd_topology.c | 79 ---
  3 files changed, 74 insertions(+), 11 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_crat.c 
b/drivers/gpu/drm/amd/amdkfd/kfd_crat.c
index 1eaabd2cb41b..afc8a7fcdad8 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_crat.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_crat.c
@@ -1056,7 +1056,7 @@ static int kfd_parse_subtype_iolink(struct 
crat_subtype_iolink *iolink,
 * table, add corresponded reversed direction link now.
 */
if (props && (iolink->flags & CRAT_IOLINK_FLAGS_BI_DIRECTIONAL)) {
-   to_dev = kfd_topology_device_by_proximity_domain(id_to);
+   to_dev = kfd_topology_device_by_proximity_domain_no_lock(id_to);
if (!to_dev)
return -ENODEV;
/* same everything but the other direction */
@@ -2225,7 +2225,7 @@ static int kfd_create_vcrat_image_gpu(void *pcrat_image,
 */
if (kdev->hive_id) {
for (nid = 0; nid < proximity_domain; ++nid) {
-   peer_dev = kfd_topology_device_by_proximity_domain(nid);
+   peer_dev = 
kfd_topology_device_by_proximity_domain_no_lock(nid);
if (!peer_dev->gpu)
continue;
if (peer_dev->gpu->hive_id != kdev->hive_id)
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h 
b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
index e1b7e6afa920..8a43def1f638 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
@@ -1016,6 +1016,8 @@ int kfd_topology_add_device(struct kfd_dev *gpu);
  int kfd_topology_remove_device(struct kfd_dev *gpu);
  struct kfd_topology_device *kfd_topology_device_by_proximity_domain(
uint32_t proximity_domain);
+struct kfd_topology_device *kfd_topology_device_by_proximity_domain_no_lock(
+   uint32_t proximity_domain);
  struct kfd_topology_device *kfd_topology_device_by_id(uint32_t gpu_id);
  struct kfd_dev *kfd_device_by_id(uint32_t gpu_id);
  struct kfd_dev *kfd_device_by_pci_dev(const struct pci_dev *pdev);
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_topology.c 
b/drivers/gpu/drm/amd/amdkfd/kfd_topology.c
index 3bdcae239bc0..874a273b81f7 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_topology.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_topology.c
@@ -46,27 +46,38 @@ static struct list_head topology_device_list;
  static struct kfd_system_properties sys_props;
  
  static DECLARE_RWSEM(topology_lock);

-static atomic_t topology_crat_proximity_domain;
+static uint32_t topology_crat_proximity_domain;
  
-struct kfd_topology_device *kfd_topology_device_by_proximity_domain(

+struct kfd_topology_device *kfd_topology_device_by_proximity_domain_no_lock(
uint32_t proximity_domain)


I remember we discussed this and I suggested splitting a no_lock version 
out of this function. But now I don't see it being used anywhere. Was 
that lost somewhere in refactoring or porting to the upstream branch? 
Maybe the no_lock version isn't needed any more.




  {
struct kfd_topology_device *top_dev;
struct kfd_topology_device *device = NULL;
  
-	down_read(&topology_lock);

-
list_for_each_entry(top_dev, &topology_device_list, list)
if (top_dev->proximity_domain == proximity_domain) {
device = top_dev;
break;
}
  
+	return device;

+}
+
+struct kfd_topology_device *kfd_topology_device_by_proximity_domain(
+   uint32_t proximity_domain)
+{
+   struct kfd_topology_device *device = NULL;
+
+   down_read(&topology_lock);
+
+   device = kfd_topology_device_by_proximity_domain_no_lock(
+   proximity_domain);
up_read(&topology_lock);
  
  	return device;

  }
  
+

  struct kfd_topology_device 

Re: [PATCH] drm/amdkfd: Cleanup IO links during KFD device removal

2022-04-07 Thread Andrey Grodzovsky

I suggest adding another patch to handle unbalanced decrement of
kfd_lock in kgd2kfd_suspend. This patch alone is not enough to fix
all removal issues.

Andrey

On 2022-04-07 12:15, Mukul Joshi wrote:

Currently, the IO-links to the device being removed from topology,
are not cleared. As a result, there would be dangling links left in
the KFD topology. This patch aims to fix the following:
1. Cleanup all IO links to the device being removed.
2. Ensure that node numbering in sysfs and nodes proximity domain
values are consistent after the device is removed:
a. Adding a device and removing a GPU device are made mutually
   exclusive.
b. The global proximity domain counter is no longer required to be
   an atomic counter. A normal 32-bit counter can be used instead.
3. Update generation_count to let user-mode know that topology has
changed due to device removal.

CC: Shuotao Xu 
Signed-off-by: Mukul Joshi 
---
  drivers/gpu/drm/amd/amdkfd/kfd_crat.c |  4 +-
  drivers/gpu/drm/amd/amdkfd/kfd_priv.h |  2 +
  drivers/gpu/drm/amd/amdkfd/kfd_topology.c | 79 ---
  3 files changed, 74 insertions(+), 11 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_crat.c 
b/drivers/gpu/drm/amd/amdkfd/kfd_crat.c
index 1eaabd2cb41b..afc8a7fcdad8 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_crat.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_crat.c
@@ -1056,7 +1056,7 @@ static int kfd_parse_subtype_iolink(struct 
crat_subtype_iolink *iolink,
 * table, add corresponded reversed direction link now.
 */
if (props && (iolink->flags & CRAT_IOLINK_FLAGS_BI_DIRECTIONAL)) {
-   to_dev = kfd_topology_device_by_proximity_domain(id_to);
+   to_dev = kfd_topology_device_by_proximity_domain_no_lock(id_to);
if (!to_dev)
return -ENODEV;
/* same everything but the other direction */
@@ -2225,7 +2225,7 @@ static int kfd_create_vcrat_image_gpu(void *pcrat_image,
 */
if (kdev->hive_id) {
for (nid = 0; nid < proximity_domain; ++nid) {
-   peer_dev = kfd_topology_device_by_proximity_domain(nid);
+   peer_dev = 
kfd_topology_device_by_proximity_domain_no_lock(nid);
if (!peer_dev->gpu)
continue;
if (peer_dev->gpu->hive_id != kdev->hive_id)
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h 
b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
index e1b7e6afa920..8a43def1f638 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
@@ -1016,6 +1016,8 @@ int kfd_topology_add_device(struct kfd_dev *gpu);
  int kfd_topology_remove_device(struct kfd_dev *gpu);
  struct kfd_topology_device *kfd_topology_device_by_proximity_domain(
uint32_t proximity_domain);
+struct kfd_topology_device *kfd_topology_device_by_proximity_domain_no_lock(
+   uint32_t proximity_domain);
  struct kfd_topology_device *kfd_topology_device_by_id(uint32_t gpu_id);
  struct kfd_dev *kfd_device_by_id(uint32_t gpu_id);
  struct kfd_dev *kfd_device_by_pci_dev(const struct pci_dev *pdev);
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_topology.c 
b/drivers/gpu/drm/amd/amdkfd/kfd_topology.c
index 3bdcae239bc0..874a273b81f7 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_topology.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_topology.c
@@ -46,27 +46,38 @@ static struct list_head topology_device_list;
  static struct kfd_system_properties sys_props;
  
  static DECLARE_RWSEM(topology_lock);

-static atomic_t topology_crat_proximity_domain;
+static uint32_t topology_crat_proximity_domain;
  
-struct kfd_topology_device *kfd_topology_device_by_proximity_domain(

+struct kfd_topology_device *kfd_topology_device_by_proximity_domain_no_lock(
uint32_t proximity_domain)
  {
struct kfd_topology_device *top_dev;
struct kfd_topology_device *device = NULL;
  
-	down_read(&topology_lock);

-
list_for_each_entry(top_dev, &topology_device_list, list)
if (top_dev->proximity_domain == proximity_domain) {
device = top_dev;
break;
}
  
+	return device;

+}
+
+struct kfd_topology_device *kfd_topology_device_by_proximity_domain(
+   uint32_t proximity_domain)
+{
+   struct kfd_topology_device *device = NULL;
+
+   down_read(&topology_lock);
+
+   device = kfd_topology_device_by_proximity_domain_no_lock(
+   proximity_domain);
up_read(&topology_lock);
  
  	return device;

  }
  
+

  struct kfd_topology_device *kfd_topology_device_by_id(uint32_t gpu_id)
  {
struct kfd_topology_device *top_dev = NULL;
@@ -1060,7 +1071,7 @@ int kfd_topology_init(void)

[PATCH] drm/amdkfd: Cleanup IO links during KFD device removal

2022-04-07 Thread Mukul Joshi
Currently, the IO-links to the device being removed from topology,
are not cleared. As a result, there would be dangling links left in
the KFD topology. This patch aims to fix the following:
1. Cleanup all IO links to the device being removed.
2. Ensure that node numbering in sysfs and nodes proximity domain
   values are consistent after the device is removed:
   a. Adding a device and removing a GPU device are made mutually
  exclusive.
   b. The global proximity domain counter is no longer required to be
  an atomic counter. A normal 32-bit counter can be used instead.
3. Update generation_count to let user-mode know that topology has
   changed due to device removal.

CC: Shuotao Xu 
Signed-off-by: Mukul Joshi 
---
 drivers/gpu/drm/amd/amdkfd/kfd_crat.c |  4 +-
 drivers/gpu/drm/amd/amdkfd/kfd_priv.h |  2 +
 drivers/gpu/drm/amd/amdkfd/kfd_topology.c | 79 ---
 3 files changed, 74 insertions(+), 11 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_crat.c 
b/drivers/gpu/drm/amd/amdkfd/kfd_crat.c
index 1eaabd2cb41b..afc8a7fcdad8 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_crat.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_crat.c
@@ -1056,7 +1056,7 @@ static int kfd_parse_subtype_iolink(struct 
crat_subtype_iolink *iolink,
 * table, add corresponded reversed direction link now.
 */
if (props && (iolink->flags & CRAT_IOLINK_FLAGS_BI_DIRECTIONAL)) {
-   to_dev = kfd_topology_device_by_proximity_domain(id_to);
+   to_dev = kfd_topology_device_by_proximity_domain_no_lock(id_to);
if (!to_dev)
return -ENODEV;
/* same everything but the other direction */
@@ -2225,7 +2225,7 @@ static int kfd_create_vcrat_image_gpu(void *pcrat_image,
 */
if (kdev->hive_id) {
for (nid = 0; nid < proximity_domain; ++nid) {
-   peer_dev = kfd_topology_device_by_proximity_domain(nid);
+   peer_dev = 
kfd_topology_device_by_proximity_domain_no_lock(nid);
if (!peer_dev->gpu)
continue;
if (peer_dev->gpu->hive_id != kdev->hive_id)
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h 
b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
index e1b7e6afa920..8a43def1f638 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
@@ -1016,6 +1016,8 @@ int kfd_topology_add_device(struct kfd_dev *gpu);
 int kfd_topology_remove_device(struct kfd_dev *gpu);
 struct kfd_topology_device *kfd_topology_device_by_proximity_domain(
uint32_t proximity_domain);
+struct kfd_topology_device *kfd_topology_device_by_proximity_domain_no_lock(
+   uint32_t proximity_domain);
 struct kfd_topology_device *kfd_topology_device_by_id(uint32_t gpu_id);
 struct kfd_dev *kfd_device_by_id(uint32_t gpu_id);
 struct kfd_dev *kfd_device_by_pci_dev(const struct pci_dev *pdev);
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_topology.c 
b/drivers/gpu/drm/amd/amdkfd/kfd_topology.c
index 3bdcae239bc0..874a273b81f7 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_topology.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_topology.c
@@ -46,27 +46,38 @@ static struct list_head topology_device_list;
 static struct kfd_system_properties sys_props;
 
 static DECLARE_RWSEM(topology_lock);
-static atomic_t topology_crat_proximity_domain;
+static uint32_t topology_crat_proximity_domain;
 
-struct kfd_topology_device *kfd_topology_device_by_proximity_domain(
+struct kfd_topology_device *kfd_topology_device_by_proximity_domain_no_lock(
uint32_t proximity_domain)
 {
struct kfd_topology_device *top_dev;
struct kfd_topology_device *device = NULL;
 
-   down_read(&topology_lock);
-
list_for_each_entry(top_dev, &topology_device_list, list)
if (top_dev->proximity_domain == proximity_domain) {
device = top_dev;
break;
}
 
+   return device;
+}
+
+struct kfd_topology_device *kfd_topology_device_by_proximity_domain(
+   uint32_t proximity_domain)
+{
+   struct kfd_topology_device *device = NULL;
+
+   down_read(&topology_lock);
+
+   device = kfd_topology_device_by_proximity_domain_no_lock(
+   proximity_domain);
up_read(&topology_lock);
 
return device;
 }
 
+
 struct kfd_topology_device *kfd_topology_device_by_id(uint32_t gpu_id)
 {
struct kfd_topology_device *top_dev = NULL;
@@ -1060,7 +1071,7 @@ int kfd_topology_init(void)
down_write(&topology_lock);
kfd_topology_update_device_list(&temp_topology_device_list,
&topology_device_list);
-   atomic_set(&topology_crat_proximity_domain, sys_props.num_devices-1);
+