On 6/1/26 07:05, Geoffrey McRae wrote:
> Fix a sysfs duplication error when reinitializing the device:
>
> sysfs: cannot create duplicate filename '.../ip_discovery'
> kobject_add_internal failed for ip_discovery with -EEXIST
> ...
> Failed to create device file mem_info_preempt_used (-17)
>
> The failure is caused by stale sysfs entries not being removed during
> device teardown, leading to -EEXIST when the driver is reprobed. In
> particular:
>
> - amdgpu_discovery sysfs kobjects were not fully torn down early enough,
> and ip_top remained non-NULL after cleanup
> - the preempt manager sysfs attribute was removed only conditionally
> and not during the common hw fini path
>
> Fix this by:
> - calling amdgpu_discovery_sysfs_fini() and
> amdgpu_preempt_mgr_sysfs_fini() from amdgpu_device_fini_hw()
> - making amdgpu_discovery_sysfs_fini() externally visible and clearing
> adev->discovery.ip_top to prevent reuse
> - centralizing preempt sysfs removal into a helper and reusing it from
> both fini paths
>
> This ensures sysfs state is fully cleaned up before reprobe and avoids
> duplicate kobject/file creation.
>
> Change-Id: Ib91bf9eac4a1901c05bdb17b20de3e4122323b34
> Cc: Christian König <[email protected]>
> Cc: Alex Deucher <[email protected]>
> Signed-off-by: Geoffrey McRae <[email protected]>
> ---
> drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 4 ++++
> drivers/gpu/drm/amd/amdgpu/amdgpu_discovery.c | 5 ++---
> drivers/gpu/drm/amd/amdgpu/amdgpu_discovery.h | 1 +
> drivers/gpu/drm/amd/amdgpu/amdgpu_preempt_mgr.c | 14 ++++++++++++--
> drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.h | 1 +
> 5 files changed, 20 insertions(+), 5 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> index 5ff224163bab..ef5cc4997656 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> @@ -4200,6 +4200,10 @@ void amdgpu_device_fini_hw(struct amdgpu_device *adev)
>
> if (adev->mman.initialized)
> drain_workqueue(adev->mman.bdev.wq);
> +
> + amdgpu_discovery_sysfs_fini(adev);
> + amdgpu_preempt_mgr_sysfs_fini(adev);
> +
That should probably rather be in amdgpu_device_sys_interface_fini()
> adev->shutdown = true;
>
> unregister_pm_notifier(&adev->pm_nb);
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_discovery.c
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_discovery.c
> index 7c2212985273..a2ae26bb11ff 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_discovery.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_discovery.c
> @@ -704,8 +704,6 @@ static int amdgpu_discovery_init(struct amdgpu_device
> *adev)
> return r;
> }
>
> -static void amdgpu_discovery_sysfs_fini(struct amdgpu_device *adev);
> -
> void amdgpu_discovery_fini(struct amdgpu_device *adev)
> {
> amdgpu_discovery_sysfs_fini(adev);
> @@ -1391,7 +1389,7 @@ static void amdgpu_discovery_sysfs_die_free(struct
> ip_die_entry *ip_die_entry)
> kobject_put(&ip_die_entry->ip_kset.kobj);
> }
>
> -static void amdgpu_discovery_sysfs_fini(struct amdgpu_device *adev)
> +void amdgpu_discovery_sysfs_fini(struct amdgpu_device *adev)
> {
> struct ip_discovery_top *ip_top = adev->discovery.ip_top;
> struct list_head *el, *tmp;
> @@ -1400,6 +1398,7 @@ static void amdgpu_discovery_sysfs_fini(struct
> amdgpu_device *adev)
> if (!ip_top)
> return;
>
> + adev->discovery.ip_top = NULL;
> die_kset = &ip_top->die_kset;
> spin_lock(&die_kset->list_lock);
> list_for_each_prev_safe(el, tmp, &die_kset->list) {
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_discovery.h
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_discovery.h
> index e0010f6a3eda..cff33ab2cb25 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_discovery.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_discovery.h
> @@ -41,6 +41,7 @@ struct amdgpu_discovery_info {
> bool reserve_tmr;
> };
>
> +void amdgpu_discovery_sysfs_fini(struct amdgpu_device *adev);
> void amdgpu_discovery_fini(struct amdgpu_device *adev);
> int amdgpu_discovery_set_ip_blocks(struct amdgpu_device *adev);
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_preempt_mgr.c
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_preempt_mgr.c
> index 34b5e22b44e5..eab81206c050 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_preempt_mgr.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_preempt_mgr.c
> @@ -46,6 +46,17 @@ static ssize_t mem_info_preempt_used_show(struct device
> *dev,
>
> static DEVICE_ATTR_RO(mem_info_preempt_used);
>
> +/**
> + * amdgpu_preempt_mgr_sysfs_fini - remove PREEMPT manager sysfs attributes
> + *
> + * @adev: amdgpu_device pointer
> + */
> +void amdgpu_preempt_mgr_sysfs_fini(struct amdgpu_device *adev)
> +{
> + if (adev->dev->kobj.sd)
> + device_remove_file(adev->dev, &dev_attr_mem_info_preempt_used);
> +}
> +
> /**
> * amdgpu_preempt_mgr_new - allocate a new node
> *
> @@ -137,8 +148,7 @@ void amdgpu_preempt_mgr_fini(struct amdgpu_device *adev)
> if (ret)
> return;
>
> - if (adev->dev->kobj.sd)
> - device_remove_file(adev->dev, &dev_attr_mem_info_preempt_used);
> + amdgpu_preempt_mgr_sysfs_fini(adev);
That looks superflous now.
Regards,
Christian.
>
> ttm_resource_manager_cleanup(man);
> ttm_set_driver_manager(&adev->mman.bdev, AMDGPU_PL_PREEMPT, NULL);
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.h
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.h
> index 2d72fa217274..00acec7226f5 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.h
> @@ -140,6 +140,7 @@ int amdgpu_gtt_mgr_init(struct amdgpu_device *adev,
> uint64_t gtt_size);
> void amdgpu_gtt_mgr_fini(struct amdgpu_device *adev);
> int amdgpu_preempt_mgr_init(struct amdgpu_device *adev);
> void amdgpu_preempt_mgr_fini(struct amdgpu_device *adev);
> +void amdgpu_preempt_mgr_sysfs_fini(struct amdgpu_device *adev);
> int amdgpu_vram_mgr_init(struct amdgpu_device *adev);
> void amdgpu_vram_mgr_fini(struct amdgpu_device *adev);
>