When the driver probe fails (missing firmware, unsupported hardware, etc.), the entire device is torn down, including the ip_discovery sysfs folder, which prevents users from identifying what hardware is present.
Export ip_discovery sysfs even when probe fails by creating it early in the probe flow and tying its lifetime to the PCI device rather than the driver. The sysfs folder persists across probe failures and module reloads, but is cleaned up on driver unbind. Signed-off-by: Mario Limonciello <[email protected]> --- drivers/gpu/drm/amd/amdgpu/amdgpu_discovery.c | 248 ++++++++++++++++-- drivers/gpu/drm/amd/amdgpu/amdgpu_discovery.h | 5 + drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c | 2 + 3 files changed, 234 insertions(+), 21 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_discovery.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_discovery.c index af3d2fd61cf3f..f88cffa56c6ab 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_discovery.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_discovery.c @@ -22,6 +22,7 @@ */ #include <linux/firmware.h> +#include <linux/kernfs.h> #include "amdgpu.h" #include "amdgpu_discovery.h" @@ -139,6 +140,26 @@ MODULE_FIRMWARE("amdgpu/aldebaran_ip_discovery.bin"); #define mmMM_INDEX_HI 0x6 #define mmMM_DATA 0x1 +struct ip_discovery_top { + struct kobject kobj; + struct kset die_kset; + struct pci_dev *pdev; + struct amdgpu_device *adev; + uint8_t *discovery_bin; + uint32_t bin_size; + bool standalone_mode; +}; + +/* List to track early-initialized ip_discovery_top entries */ +struct early_ip_discovery { + struct list_head list; + struct pci_dev *pdev; + struct ip_discovery_top *ip_top; +}; + +static LIST_HEAD(early_ip_discovery_list); +static DEFINE_MUTEX(early_ip_discovery_mutex); + static const char *hw_id_names[HW_ID_MAX] = { [MP1_HWID] = "MP1", [MP2_HWID] = "MP2", @@ -676,7 +697,9 @@ static void amdgpu_discovery_sysfs_fini(struct amdgpu_device *adev); void amdgpu_discovery_fini(struct amdgpu_device *adev) { - amdgpu_discovery_sysfs_fini(adev); + if (adev->discovery.ip_top && !adev->discovery.ip_top->standalone_mode) + amdgpu_discovery_sysfs_fini(adev); + kfree(adev->discovery.bin); adev->discovery.bin = NULL; } @@ -685,15 +708,17 @@ static int 
amdgpu_discovery_validate_ip(struct amdgpu_device *adev, uint8_t instance, uint16_t hw_id) { if (instance >= HWIP_MAX_INSTANCE) { - dev_err(adev->dev, - "Unexpected instance_number (%d) from ip discovery blob\n", - instance); + if (adev) + dev_err(adev->dev, + "Unexpected instance_number (%d) from ip discovery blob\n", + instance); return -EINVAL; } if (hw_id >= HW_ID_MAX) { - dev_err(adev->dev, - "Unexpected hw_id (%d) from ip discovery blob\n", - hw_id); + if (adev) + dev_err(adev->dev, + "Unexpected hw_id (%d) from ip discovery blob\n", + hw_id); return -EINVAL; } @@ -1056,12 +1081,6 @@ static const struct kobj_type ip_discovery_ktype = { .sysfs_ops = &kobj_sysfs_ops, }; -struct ip_discovery_top { - struct kobject kobj; /* ip_discovery/ */ - struct kset die_kset; /* ip_discovery/die/, contains ip_die_entry */ - struct amdgpu_device *adev; -}; - static void die_kobj_release(struct kobject *kobj) { struct ip_discovery_top *ip_top = container_of(to_kset(kobj), @@ -1077,8 +1096,14 @@ static void ip_disc_release(struct kobject *kobj) kobj); struct amdgpu_device *adev = ip_top->adev; + /* In standalone mode, discovery_bin is managed by devm and will be + * freed automatically when the PCI device is removed. Do not manually + * free it here to avoid double-free. 
+ */ + kfree(ip_top); - adev->discovery.ip_top = NULL; + if (adev) + adev->discovery.ip_top = NULL; } static uint8_t amdgpu_discovery_get_harvest_info(struct amdgpu_device *adev, @@ -1086,6 +1111,10 @@ static uint8_t amdgpu_discovery_get_harvest_info(struct amdgpu_device *adev, { uint8_t harvest = 0; + /* In early init mode (adev == NULL), harvest info is not available */ + if (!adev) + return 0; + /* Until a uniform way is figured, get mask based on hwid */ switch (hw_id) { case VCN_HWID: @@ -1114,11 +1143,14 @@ static uint8_t amdgpu_discovery_get_harvest_info(struct amdgpu_device *adev, } static int amdgpu_discovery_sysfs_ips(struct amdgpu_device *adev, + struct ip_discovery_top *ip_top, struct ip_die_entry *ip_die_entry, const size_t _ip_offset, const int num_ips, bool reg_base_64) { - uint8_t *discovery_bin = adev->discovery.bin; + uint8_t *discovery_bin = ip_top->standalone_mode ? + ip_top->discovery_bin : + adev->discovery.bin; int ii, jj, kk, res; uint16_t hw_id; uint8_t inst; @@ -1220,10 +1252,12 @@ static int amdgpu_discovery_sysfs_ips(struct amdgpu_device *adev, return 0; } -static int amdgpu_discovery_sysfs_recurse(struct amdgpu_device *adev) +static int amdgpu_discovery_sysfs_recurse(struct amdgpu_device *adev, + struct ip_discovery_top *ip_top) { - struct ip_discovery_top *ip_top = adev->discovery.ip_top; - uint8_t *discovery_bin = adev->discovery.bin; + uint8_t *discovery_bin = ip_top->standalone_mode ? 
+ ip_top->discovery_bin : + adev->discovery.bin; struct binary_header *bhdr; struct ip_discovery_header *ihdr; struct die_header *dhdr; @@ -1270,7 +1304,8 @@ static int amdgpu_discovery_sysfs_recurse(struct amdgpu_device *adev) return res; } - amdgpu_discovery_sysfs_ips(adev, ip_die_entry, ip_offset, num_ips, !!ihdr->base_addr_64_bit); + amdgpu_discovery_sysfs_ips(adev, ip_top, ip_die_entry, ip_offset, + num_ips, !!ihdr->base_addr_64_bit); } return 0; @@ -1286,12 +1321,30 @@ static int amdgpu_discovery_sysfs_init(struct amdgpu_device *adev) if (!discovery_bin) return -EINVAL; + /* If early init already created sysfs in standalone mode, skip normal init */ + if (adev->discovery.ip_top && adev->discovery.ip_top->standalone_mode) + return 0; + ip_top = kzalloc_obj(*ip_top); if (!ip_top) return -ENOMEM; ip_top->adev = adev; - adev->discovery.ip_top = ip_top; + + /* Check if ip_discovery already exists before creating. + * This shouldn't normally happen but handle it gracefully. + */ + if (adev->dev->kobj.sd) { + struct kernfs_node *existing; + + existing = kernfs_find_and_get(adev->dev->kobj.sd, "ip_discovery"); + if (existing) { + kernfs_put(existing); + kfree(ip_top); + return 0; + } + } + res = kobject_init_and_add(&ip_top->kobj, &ip_discovery_ktype, &adev->dev->kobj, "ip_discovery"); if (res) { @@ -1299,6 +1352,8 @@ static int amdgpu_discovery_sysfs_init(struct amdgpu_device *adev) goto Err; } + adev->discovery.ip_top = ip_top; + die_kset = &ip_top->die_kset; kobject_set_name(&die_kset->kobj, "%s", "die"); die_kset->kobj.parent = &ip_top->kobj; @@ -1313,7 +1368,7 @@ static int amdgpu_discovery_sysfs_init(struct amdgpu_device *adev) ip_hw_instance_attrs[ii] = &ip_hw_attr[ii].attr; ip_hw_instance_attrs[ii] = NULL; - res = amdgpu_discovery_sysfs_recurse(adev); + res = amdgpu_discovery_sysfs_recurse(adev, ip_top); return res; Err: @@ -1366,6 +1421,9 @@ static void amdgpu_discovery_sysfs_fini(struct amdgpu_device *adev) struct list_head *el, *tmp; struct kset *die_kset; 
+ if (!ip_top) + return; + die_kset = &ip_top->die_kset; spin_lock(&die_kset->list_lock); list_for_each_prev_safe(el, tmp, &die_kset->list) { @@ -1379,6 +1437,151 @@ static void amdgpu_discovery_sysfs_fini(struct amdgpu_device *adev) kobject_put(&ip_top->kobj); } +int amdgpu_discovery_sysfs_early_init(struct amdgpu_device *adev, struct pci_dev *pdev) +{ + struct ip_discovery_top *ip_top; + struct early_ip_discovery *early_entry, *tmp; + struct kset *die_kset; + uint8_t *discovery_bin; + int res, ii; + + if (!adev || !adev->discovery.bin) + return -EINVAL; + + if (adev->discovery.ip_top) + return 0; + + mutex_lock(&early_ip_discovery_mutex); + list_for_each_entry_safe(early_entry, tmp, &early_ip_discovery_list, list) { + if (early_entry->pdev == pdev) { + adev->discovery.ip_top = early_entry->ip_top; + early_entry->ip_top->adev = adev; + mutex_unlock(&early_ip_discovery_mutex); + return 0; + } + } + mutex_unlock(&early_ip_discovery_mutex); + + discovery_bin = adev->discovery.bin; + + early_entry = kzalloc_obj(*early_entry); + if (!early_entry) + return -ENOMEM; + + ip_top = kzalloc_obj(*ip_top); + if (!ip_top) { + kfree(early_entry); + return -ENOMEM; + } + + ip_top->discovery_bin = devm_kmemdup(&pdev->dev, discovery_bin, + DISCOVERY_TMR_SIZE, GFP_KERNEL); + if (!ip_top->discovery_bin) { + kfree(ip_top); + kfree(early_entry); + return -ENOMEM; + } + + ip_top->bin_size = DISCOVERY_TMR_SIZE; + ip_top->pdev = pdev; + ip_top->adev = adev; + ip_top->standalone_mode = true; + + /* Check if ip_discovery already exists (from previous probe attempt). + * This can happen if the module was unloaded and reloaded but the + * sysfs persisted (tied to PCI device lifetime). 
+ */ + if (pdev->dev.kobj.sd) { + struct kernfs_node *existing; + + existing = kernfs_find_and_get(pdev->dev.kobj.sd, "ip_discovery"); + if (existing) { + kernfs_put(existing); + kfree(ip_top); + kfree(early_entry); + return 0; + } + } + + res = kobject_init_and_add(&ip_top->kobj, &ip_discovery_ktype, + &pdev->dev.kobj, "ip_discovery"); + if (res) + goto err_put_kobj; + + adev->discovery.ip_top = ip_top; + + die_kset = &ip_top->die_kset; + kobject_set_name(&die_kset->kobj, "%s", "die"); + die_kset->kobj.parent = &ip_top->kobj; + die_kset->kobj.ktype = &die_kobj_ktype; + res = kset_register(&ip_top->die_kset); + if (res) + goto err_put_die_kset; + + for (ii = 0; ii < ARRAY_SIZE(ip_hw_attr); ii++) + ip_hw_instance_attrs[ii] = &ip_hw_attr[ii].attr; + ip_hw_instance_attrs[ii] = NULL; + + res = amdgpu_discovery_sysfs_recurse(NULL, ip_top); + if (res) + goto err_put_die_kset; + + early_entry->pdev = pdev; + early_entry->ip_top = ip_top; + mutex_lock(&early_ip_discovery_mutex); + list_add(&early_entry->list, &early_ip_discovery_list); + mutex_unlock(&early_ip_discovery_mutex); + + return 0; + +err_put_die_kset: + kobject_put(&ip_top->die_kset.kobj); +err_put_kobj: + kobject_put(&ip_top->kobj); + kfree(early_entry); + adev->discovery.ip_top = NULL; + return res; +} + +void amdgpu_discovery_sysfs_early_fini(struct pci_dev *pdev) +{ + struct early_ip_discovery *entry, *tmp_entry; + struct ip_discovery_top *ip_top = NULL; + struct list_head *el, *tmp; + struct kset *die_kset; + + /* Find the entry in our tracking list */ + mutex_lock(&early_ip_discovery_mutex); + list_for_each_entry_safe(entry, tmp_entry, &early_ip_discovery_list, list) { + if (entry->pdev == pdev) { + ip_top = entry->ip_top; + list_del(&entry->list); + kfree(entry); + break; + } + } + mutex_unlock(&early_ip_discovery_mutex); + + if (!ip_top) + return; + + /* Clean up sysfs hierarchy */ + die_kset = &ip_top->die_kset; + + spin_lock(&die_kset->list_lock); + list_for_each_prev_safe(el, tmp, &die_kset->list) { + 
list_del_init(el); + spin_unlock(&die_kset->list_lock); + amdgpu_discovery_sysfs_die_free(to_ip_die_entry(list_to_kobj(el))); + spin_lock(&die_kset->list_lock); + } + spin_unlock(&die_kset->list_lock); + + kobject_put(&ip_top->die_kset.kobj); + kobject_put(&ip_top->kobj); + /* ip_top itself will be freed by kobject_put via ip_disc_release */ +} + /* ================================================== */ static int amdgpu_discovery_reg_base_init(struct amdgpu_device *adev) @@ -1403,6 +1606,9 @@ static int amdgpu_discovery_reg_base_init(struct amdgpu_device *adev) r = amdgpu_discovery_init(adev); if (r) return r; + + amdgpu_discovery_sysfs_early_init(adev, adev->pdev); + discovery_bin = adev->discovery.bin; wafl_ver = 0; adev->gfx.xcc_mask = 0; diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_discovery.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_discovery.h index 4ce04486cc319..05a19cfe83988 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_discovery.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_discovery.h @@ -47,4 +47,9 @@ int amdgpu_discovery_get_nps_info(struct amdgpu_device *adev, struct amdgpu_gmc_memrange **ranges, int *range_cnt, bool refresh); +/* Early sysfs functions for persistent ip_discovery export */ +int amdgpu_discovery_sysfs_early_init(struct amdgpu_device *adev, + struct pci_dev *pdev); +void amdgpu_discovery_sysfs_early_fini(struct pci_dev *pdev); + #endif /* __AMDGPU_DISCOVERY__ */ diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c index 95d26f086d545..acfebb33b0e6c 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c @@ -2562,6 +2562,8 @@ amdgpu_pci_remove(struct pci_dev *pdev) amdgpu_driver_unload_kms(dev); + amdgpu_discovery_sysfs_early_fini(pdev); + /* * Flush any in flight DMA operations from device. * Clear the Bus Master Enable bit and then wait on the PCIe Device -- 2.53.0
