Re: [PATCHv2] drm/amdkfd: Cleanup IO links during KFD device removal

2022-04-11 Thread Felix Kuehling

Am 2022-04-11 um 22:15 schrieb Mukul Joshi:

Currently, the IO-links to the device being removed from topology,
are not cleared. As a result, there would be dangling links left in
the KFD topology. This patch aims to fix the following:
1. Cleanup all IO links to the device being removed.
2. Ensure that node numbering in sysfs and nodes proximity domain
values are consistent after the device is removed:
a. Adding a device and removing a GPU device are made mutually
   exclusive.
b. The global proximity domain counter is no longer required to be
   an atomic counter. A normal 32-bit counter can be used instead.
3. Update generation_count to let user-mode know that topology has
changed due to device removal.

CC: Shuotao Xu 
Signed-off-by: Mukul Joshi 
Reviewed-by: Shuotao Xu 


Reviewed-by: Felix Kuehling 



---
v1->v2:
- Remove comments from inside kfd_topology_update_io_links()
   and add them as kernel-doc comments.

  drivers/gpu/drm/amd/amdkfd/kfd_crat.c |  4 +-
  drivers/gpu/drm/amd/amdkfd/kfd_priv.h |  2 +
  drivers/gpu/drm/amd/amdkfd/kfd_topology.c | 83 ---
  3 files changed, 78 insertions(+), 11 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_crat.c 
b/drivers/gpu/drm/amd/amdkfd/kfd_crat.c
index 1eaabd2cb41b..afc8a7fcdad8 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_crat.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_crat.c
@@ -1056,7 +1056,7 @@ static int kfd_parse_subtype_iolink(struct 
crat_subtype_iolink *iolink,
 * table, add corresponded reversed direction link now.
 */
if (props && (iolink->flags & CRAT_IOLINK_FLAGS_BI_DIRECTIONAL)) {
-   to_dev = kfd_topology_device_by_proximity_domain(id_to);
+   to_dev = kfd_topology_device_by_proximity_domain_no_lock(id_to);
if (!to_dev)
return -ENODEV;
/* same everything but the other direction */
@@ -2225,7 +2225,7 @@ static int kfd_create_vcrat_image_gpu(void *pcrat_image,
 */
if (kdev->hive_id) {
for (nid = 0; nid < proximity_domain; ++nid) {
-   peer_dev = kfd_topology_device_by_proximity_domain(nid);
+   peer_dev = 
kfd_topology_device_by_proximity_domain_no_lock(nid);
if (!peer_dev->gpu)
continue;
if (peer_dev->gpu->hive_id != kdev->hive_id)
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h 
b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
index e1b7e6afa920..8a43def1f638 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
@@ -1016,6 +1016,8 @@ int kfd_topology_add_device(struct kfd_dev *gpu);
  int kfd_topology_remove_device(struct kfd_dev *gpu);
  struct kfd_topology_device *kfd_topology_device_by_proximity_domain(
uint32_t proximity_domain);
+struct kfd_topology_device *kfd_topology_device_by_proximity_domain_no_lock(
+   uint32_t proximity_domain);
  struct kfd_topology_device *kfd_topology_device_by_id(uint32_t gpu_id);
  struct kfd_dev *kfd_device_by_id(uint32_t gpu_id);
  struct kfd_dev *kfd_device_by_pci_dev(const struct pci_dev *pdev);
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_topology.c 
b/drivers/gpu/drm/amd/amdkfd/kfd_topology.c
index 3bdcae239bc0..98a51847cd8c 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_topology.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_topology.c
@@ -46,22 +46,32 @@ static struct list_head topology_device_list;
  static struct kfd_system_properties sys_props;
  
  static DECLARE_RWSEM(topology_lock);

-static atomic_t topology_crat_proximity_domain;
+static uint32_t topology_crat_proximity_domain;
  
-struct kfd_topology_device *kfd_topology_device_by_proximity_domain(

+struct kfd_topology_device *kfd_topology_device_by_proximity_domain_no_lock(
uint32_t proximity_domain)
  {
struct kfd_topology_device *top_dev;
struct kfd_topology_device *device = NULL;
  
-	down_read(_lock);

-
list_for_each_entry(top_dev, _device_list, list)
if (top_dev->proximity_domain == proximity_domain) {
device = top_dev;
break;
}
  
+	return device;

+}
+
+struct kfd_topology_device *kfd_topology_device_by_proximity_domain(
+   uint32_t proximity_domain)
+{
+   struct kfd_topology_device *device = NULL;
+
+   down_read(_lock);
+
+   device = kfd_topology_device_by_proximity_domain_no_lock(
+   proximity_domain);
up_read(_lock);
  
  	return device;

@@ -1060,7 +1070,7 @@ int kfd_topology_init(void)
down_write(_lock);
kfd_topology_update_device_list(_topology_device_list,
_device_list);
-   

[PATCHv2] drm/amdkfd: Cleanup IO links during KFD device removal

2022-04-11 Thread Mukul Joshi
Currently, the IO-links to the device being removed from topology,
are not cleared. As a result, there would be dangling links left in
the KFD topology. This patch aims to fix the following:
1. Cleanup all IO links to the device being removed.
2. Ensure that node numbering in sysfs and nodes proximity domain
   values are consistent after the device is removed:
   a. Adding a device and removing a GPU device are made mutually
  exclusive.
   b. The global proximity domain counter is no longer required to be
  an atomic counter. A normal 32-bit counter can be used instead.
3. Update generation_count to let user-mode know that topology has
   changed due to device removal.

CC: Shuotao Xu 
Signed-off-by: Mukul Joshi 
Reviewed-by: Shuotao Xu 
---
v1->v2:
- Remove comments from inside kfd_topology_update_io_links()
  and add them as kernel-doc comments.

 drivers/gpu/drm/amd/amdkfd/kfd_crat.c |  4 +-
 drivers/gpu/drm/amd/amdkfd/kfd_priv.h |  2 +
 drivers/gpu/drm/amd/amdkfd/kfd_topology.c | 83 ---
 3 files changed, 78 insertions(+), 11 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_crat.c 
b/drivers/gpu/drm/amd/amdkfd/kfd_crat.c
index 1eaabd2cb41b..afc8a7fcdad8 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_crat.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_crat.c
@@ -1056,7 +1056,7 @@ static int kfd_parse_subtype_iolink(struct 
crat_subtype_iolink *iolink,
 * table, add corresponded reversed direction link now.
 */
if (props && (iolink->flags & CRAT_IOLINK_FLAGS_BI_DIRECTIONAL)) {
-   to_dev = kfd_topology_device_by_proximity_domain(id_to);
+   to_dev = kfd_topology_device_by_proximity_domain_no_lock(id_to);
if (!to_dev)
return -ENODEV;
/* same everything but the other direction */
@@ -2225,7 +2225,7 @@ static int kfd_create_vcrat_image_gpu(void *pcrat_image,
 */
if (kdev->hive_id) {
for (nid = 0; nid < proximity_domain; ++nid) {
-   peer_dev = kfd_topology_device_by_proximity_domain(nid);
+   peer_dev = 
kfd_topology_device_by_proximity_domain_no_lock(nid);
if (!peer_dev->gpu)
continue;
if (peer_dev->gpu->hive_id != kdev->hive_id)
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h 
b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
index e1b7e6afa920..8a43def1f638 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
@@ -1016,6 +1016,8 @@ int kfd_topology_add_device(struct kfd_dev *gpu);
 int kfd_topology_remove_device(struct kfd_dev *gpu);
 struct kfd_topology_device *kfd_topology_device_by_proximity_domain(
uint32_t proximity_domain);
+struct kfd_topology_device *kfd_topology_device_by_proximity_domain_no_lock(
+   uint32_t proximity_domain);
 struct kfd_topology_device *kfd_topology_device_by_id(uint32_t gpu_id);
 struct kfd_dev *kfd_device_by_id(uint32_t gpu_id);
 struct kfd_dev *kfd_device_by_pci_dev(const struct pci_dev *pdev);
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_topology.c 
b/drivers/gpu/drm/amd/amdkfd/kfd_topology.c
index 3bdcae239bc0..98a51847cd8c 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_topology.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_topology.c
@@ -46,22 +46,32 @@ static struct list_head topology_device_list;
 static struct kfd_system_properties sys_props;
 
 static DECLARE_RWSEM(topology_lock);
-static atomic_t topology_crat_proximity_domain;
+static uint32_t topology_crat_proximity_domain;
 
-struct kfd_topology_device *kfd_topology_device_by_proximity_domain(
+struct kfd_topology_device *kfd_topology_device_by_proximity_domain_no_lock(
uint32_t proximity_domain)
 {
struct kfd_topology_device *top_dev;
struct kfd_topology_device *device = NULL;
 
-   down_read(_lock);
-
list_for_each_entry(top_dev, _device_list, list)
if (top_dev->proximity_domain == proximity_domain) {
device = top_dev;
break;
}
 
+   return device;
+}
+
+struct kfd_topology_device *kfd_topology_device_by_proximity_domain(
+   uint32_t proximity_domain)
+{
+   struct kfd_topology_device *device = NULL;
+
+   down_read(_lock);
+
+   device = kfd_topology_device_by_proximity_domain_no_lock(
+   proximity_domain);
up_read(_lock);
 
return device;
@@ -1060,7 +1070,7 @@ int kfd_topology_init(void)
down_write(_lock);
kfd_topology_update_device_list(_topology_device_list,
_device_list);
-   atomic_set(_crat_proximity_domain, sys_props.num_devices-1);
+