> On Apr 12, 2022, at 8:07 AM, Felix Kuehling <[email protected]> wrote:
> 
> Am 2022-04-08 um 04:45 schrieb Shuotao Xu:
>> Currently, the IO-links to the device being removed from topology,
>> are not cleared. As a result, there would be dangling links left in
>> the KFD topology. This patch aims to fix the following:
>> 1. Cleanup all IO links to the device being removed.
>> 2. Ensure that node numbering in sysfs and nodes proximity domain
>>    values are consistent after the device is removed:
>>    a. Adding a device and removing a GPU device are made mutually
>>       exclusive.
>>    b. The global proximity domain counter is no longer required to be
>>       an atomic counter. A normal 32-bit counter can be used instead.
>> 3. Update generation_count to let user-mode know that topology has
>>    changed due to device removal.
>> 
>> Reviewed-by: Shuotao Xu <[email protected]>
>> CC: Shuotao Xu <[email protected]>
>> Signed-off-by: Mukul Joshi <[email protected]>
> 
> This looks like Mukul's patch, but with you as the author (otherwise I would 
> have expected a "From: Mukul ..." line at the start of the email). Did you 
> make any changes to it?
> 
> Regards,
>   Felix

Yes it was Mukul’s patch.Was trying to from a patch series, but I could did it 
wrong.

Regards,
Shuotao
> 
> 
>> ---
>>  drivers/gpu/drm/amd/amdkfd/kfd_crat.c     |  4 +-
>>  drivers/gpu/drm/amd/amdkfd/kfd_priv.h     |  2 +
>>  drivers/gpu/drm/amd/amdkfd/kfd_topology.c | 79 ++++++++++++++++++++---
>>  3 files changed, 74 insertions(+), 11 deletions(-)
>> 
>> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_crat.c 
>> b/drivers/gpu/drm/amd/amdkfd/kfd_crat.c
>> index 1eaabd2cb41b..afc8a7fcdad8 100644
>> --- a/drivers/gpu/drm/amd/amdkfd/kfd_crat.c
>> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_crat.c
>> @@ -1056,7 +1056,7 @@ static int kfd_parse_subtype_iolink(struct 
>> crat_subtype_iolink *iolink,
>>       * table, add corresponded reversed direction link now.
>>       */
>>      if (props && (iolink->flags & CRAT_IOLINK_FLAGS_BI_DIRECTIONAL)) {
>> -            to_dev = kfd_topology_device_by_proximity_domain(id_to);
>> +            to_dev = kfd_topology_device_by_proximity_domain_no_lock(id_to);
>>              if (!to_dev)
>>                      return -ENODEV;
>>              /* same everything but the other direction */
>> @@ -2225,7 +2225,7 @@ static int kfd_create_vcrat_image_gpu(void 
>> *pcrat_image,
>>       */
>>      if (kdev->hive_id) {
>>              for (nid = 0; nid < proximity_domain; ++nid) {
>> -                    peer_dev = kfd_topology_device_by_proximity_domain(nid);
>> +                    peer_dev = 
>> kfd_topology_device_by_proximity_domain_no_lock(nid);
>>                      if (!peer_dev->gpu)
>>                              continue;
>>                      if (peer_dev->gpu->hive_id != kdev->hive_id)
>> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h 
>> b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
>> index e1b7e6afa920..8a43def1f638 100644
>> --- a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
>> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
>> @@ -1016,6 +1016,8 @@ int kfd_topology_add_device(struct kfd_dev *gpu);
>>  int kfd_topology_remove_device(struct kfd_dev *gpu);
>>  struct kfd_topology_device *kfd_topology_device_by_proximity_domain(
>>                                              uint32_t proximity_domain);
>> +struct kfd_topology_device *kfd_topology_device_by_proximity_domain_no_lock(
>> +                                            uint32_t proximity_domain);
>>  struct kfd_topology_device *kfd_topology_device_by_id(uint32_t gpu_id);
>>  struct kfd_dev *kfd_device_by_id(uint32_t gpu_id);
>>  struct kfd_dev *kfd_device_by_pci_dev(const struct pci_dev *pdev);
>> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_topology.c 
>> b/drivers/gpu/drm/amd/amdkfd/kfd_topology.c
>> index 3bdcae239bc0..874a273b81f7 100644
>> --- a/drivers/gpu/drm/amd/amdkfd/kfd_topology.c
>> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_topology.c
>> @@ -46,27 +46,38 @@ static struct list_head topology_device_list;
>>  static struct kfd_system_properties sys_props;
>>    static DECLARE_RWSEM(topology_lock);
>> -static atomic_t topology_crat_proximity_domain;
>> +static uint32_t topology_crat_proximity_domain;
>>  -struct kfd_topology_device *kfd_topology_device_by_proximity_domain(
>> +struct kfd_topology_device *kfd_topology_device_by_proximity_domain_no_lock(
>>                                              uint32_t proximity_domain)
>>  {
>>      struct kfd_topology_device *top_dev;
>>      struct kfd_topology_device *device = NULL;
>>  -   down_read(&topology_lock);
>> -
>>      list_for_each_entry(top_dev, &topology_device_list, list)
>>              if (top_dev->proximity_domain == proximity_domain) {
>>                      device = top_dev;
>>                      break;
>>              }
>>  +   return device;
>> +}
>> +
>> +struct kfd_topology_device *kfd_topology_device_by_proximity_domain(
>> +                                            uint32_t proximity_domain)
>> +{
>> +    struct kfd_topology_device *device = NULL;
>> +
>> +    down_read(&topology_lock);
>> +
>> +    device = kfd_topology_device_by_proximity_domain_no_lock(
>> +                                                    proximity_domain);
>>      up_read(&topology_lock);
>>      return device;
>>  }
>>  +
>>  struct kfd_topology_device *kfd_topology_device_by_id(uint32_t gpu_id)
>>  {
>>      struct kfd_topology_device *top_dev = NULL;
>> @@ -1060,7 +1071,7 @@ int kfd_topology_init(void)
>>      down_write(&topology_lock);
>>      kfd_topology_update_device_list(&temp_topology_device_list,
>>                                      &topology_device_list);
>> -    atomic_set(&topology_crat_proximity_domain, sys_props.num_devices-1);
>> +    topology_crat_proximity_domain = sys_props.num_devices-1;
>>      ret = kfd_topology_update_sysfs();
>>      up_write(&topology_lock);
>>  @@ -1295,8 +1306,6 @@ int kfd_topology_add_device(struct kfd_dev *gpu)
>>      pr_debug("Adding new GPU (ID: 0x%x) to topology\n", gpu_id);
>>  -   proximity_domain = atomic_inc_return(&topology_crat_proximity_domain);
>> -
>>      /* Include the CPU in xGMI hive if xGMI connected by assigning it the 
>> hive ID. */
>>      if (gpu->hive_id && gpu->adev->gmc.xgmi.connected_to_cpu) {
>>              struct kfd_topology_device *top_dev;
>> @@ -1321,12 +1330,16 @@ int kfd_topology_add_device(struct kfd_dev *gpu)
>>       */
>>      dev = kfd_assign_gpu(gpu);
>>      if (!dev) {
>> +            down_write(&topology_lock);
>> +            proximity_domain = ++topology_crat_proximity_domain;
>> +
>>              res = kfd_create_crat_image_virtual(&crat_image, &image_size,
>>                                                  COMPUTE_UNIT_GPU, gpu,
>>                                                  proximity_domain);
>>              if (res) {
>>                      pr_err("Error creating VCRAT for GPU (ID: 0x%x)\n",
>>                             gpu_id);
>> +                    topology_crat_proximity_domain--;
>>                      return res;
>>              }
>>              res = kfd_parse_crat_table(crat_image,
>> @@ -1335,10 +1348,10 @@ int kfd_topology_add_device(struct kfd_dev *gpu)
>>              if (res) {
>>                      pr_err("Error parsing VCRAT for GPU (ID: 0x%x)\n",
>>                             gpu_id);
>> +                    topology_crat_proximity_domain--;
>>                      goto err;
>>              }
>>  -           down_write(&topology_lock);
>>              kfd_topology_update_device_list(&temp_topology_device_list,
>>                      &topology_device_list);
>>  @@ -1485,25 +1498,73 @@ int kfd_topology_add_device(struct kfd_dev *gpu)
>>      return res;
>>  }
>>  +static void kfd_topology_update_io_links(int proximity_domain)
>> +{
>> +    struct kfd_topology_device *dev;
>> +    struct kfd_iolink_properties *iolink, *p2plink, *tmp;
>> +    /*
>> +     * The topology list currently is arranged in increasing
>> +     * order of proximity domain.
>> +     *
>> +     * Two things need to be done when a device is removed:
>> +     * 1. All the IO links to this device need to be
>> +     *    removed.
>> +     * 2. All nodes after the current device node need to move
>> +     *    up once this device node is removed from the topology
>> +     *    list. As a result, the proximity domain values for
>> +     *    all nodes after the node being deleted reduce by 1.
>> +     *    This would also cause the proximity domain values for
>> +     *    io links to be updated based on new proximity
>> +     *    domain values.
>> +     */
>> +    list_for_each_entry(dev, &topology_device_list, list) {
>> +            if (dev->proximity_domain > proximity_domain)
>> +                    dev->proximity_domain--;
>> +
>> +            list_for_each_entry_safe(iolink, tmp, &dev->io_link_props, 
>> list) {
>> +                    /*
>> +                     * If there is an io link to the dev being deleted
>> +                     * then remove that IO link also.
>> +                     */
>> +                    if (iolink->node_to == proximity_domain) {
>> +                            list_del(&iolink->list);
>> +                            dev->io_link_count--;
>> +                            dev->node_props.io_links_count--;
>> +                    } else if (iolink->node_from > proximity_domain) {
>> +                            iolink->node_from--;
>> +                    } else if (iolink->node_to > proximity_domain) {
>> +                            iolink->node_to--;
>> +                    }
>> +            }
>> +
>> +    }
>> +}
>> +
>>  int kfd_topology_remove_device(struct kfd_dev *gpu)
>>  {
>>      struct kfd_topology_device *dev, *tmp;
>>      uint32_t gpu_id;
>>      int res = -ENODEV;
>> +    int i = 0;
>>      down_write(&topology_lock);
>>  -   list_for_each_entry_safe(dev, tmp, &topology_device_list, list)
>> +    list_for_each_entry_safe(dev, tmp, &topology_device_list, list) {
>>              if (dev->gpu == gpu) {
>>                      gpu_id = dev->gpu_id;
>>                      kfd_remove_sysfs_node_entry(dev);
>>                      kfd_release_topology_device(dev);
>>                      sys_props.num_devices--;
>> +                    kfd_topology_update_io_links(i);
>> +                    topology_crat_proximity_domain = 
>> sys_props.num_devices-1;
>> +                    sys_props.generation_count++;
>>                      res = 0;
>>                      if (kfd_topology_update_sysfs() < 0)
>>                              kfd_topology_release_sysfs();
>>                      break;
>>              }
>> +            i++;
>> +    }
>>      up_write(&topology_lock);
>>  

Reply via email to