When driver probe fails (e.g. because of missing firmware or unsupported
hardware), the entire device is torn down, including the ip_discovery
sysfs directory. This prevents users from identifying what hardware is
present.

Keep the ip_discovery sysfs directory exported even when probe fails by
creating it early in the probe flow and tying its lifetime to the PCI
device rather than to the driver. The directory persists across probe
failures and module reloads, and is cleaned up from amdgpu_pci_remove()
on driver unbind.

Signed-off-by: Mario Limonciello <[email protected]>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_discovery.c | 248 ++++++++++++++++--
 drivers/gpu/drm/amd/amdgpu/amdgpu_discovery.h |   5 +
 drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c       |   2 +
 3 files changed, 234 insertions(+), 21 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_discovery.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_discovery.c
index af3d2fd61cf3f..f88cffa56c6ab 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_discovery.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_discovery.c
@@ -22,6 +22,7 @@
  */
 
 #include <linux/firmware.h>
+#include <linux/kernfs.h>
 
 #include "amdgpu.h"
 #include "amdgpu_discovery.h"
@@ -139,6 +140,26 @@ MODULE_FIRMWARE("amdgpu/aldebaran_ip_discovery.bin");
 #define mmMM_INDEX_HI          0x6
 #define mmMM_DATA              0x1
 
+struct ip_discovery_top {
+       struct kobject kobj;
+       struct kset die_kset;
+       struct pci_dev *pdev;
+       struct amdgpu_device *adev;
+       uint8_t *discovery_bin;
+       uint32_t bin_size;
+       bool standalone_mode;
+};
+
+/* List to track early-initialized ip_discovery_top entries */
+struct early_ip_discovery {
+       struct list_head list;
+       struct pci_dev *pdev;
+       struct ip_discovery_top *ip_top;
+};
+
+static LIST_HEAD(early_ip_discovery_list);
+static DEFINE_MUTEX(early_ip_discovery_mutex);
+
 static const char *hw_id_names[HW_ID_MAX] = {
        [MP1_HWID]              = "MP1",
        [MP2_HWID]              = "MP2",
@@ -676,7 +697,9 @@ static void amdgpu_discovery_sysfs_fini(struct 
amdgpu_device *adev);
 
 void amdgpu_discovery_fini(struct amdgpu_device *adev)
 {
-       amdgpu_discovery_sysfs_fini(adev);
+       if (adev->discovery.ip_top && !adev->discovery.ip_top->standalone_mode)
+               amdgpu_discovery_sysfs_fini(adev);
+
        kfree(adev->discovery.bin);
        adev->discovery.bin = NULL;
 }
@@ -685,15 +708,17 @@ static int amdgpu_discovery_validate_ip(struct 
amdgpu_device *adev,
                                        uint8_t instance, uint16_t hw_id)
 {
        if (instance >= HWIP_MAX_INSTANCE) {
-               dev_err(adev->dev,
-                       "Unexpected instance_number (%d) from ip discovery 
blob\n",
-                       instance);
+               if (adev)
+                       dev_err(adev->dev,
+                               "Unexpected instance_number (%d) from ip 
discovery blob\n",
+                               instance);
                return -EINVAL;
        }
        if (hw_id >= HW_ID_MAX) {
-               dev_err(adev->dev,
-                       "Unexpected hw_id (%d) from ip discovery blob\n",
-                       hw_id);
+               if (adev)
+                       dev_err(adev->dev,
+                               "Unexpected hw_id (%d) from ip discovery 
blob\n",
+                               hw_id);
                return -EINVAL;
        }
 
@@ -1056,12 +1081,6 @@ static const struct kobj_type ip_discovery_ktype = {
        .sysfs_ops = &kobj_sysfs_ops,
 };
 
-struct ip_discovery_top {
-       struct kobject kobj;    /* ip_discovery/ */
-       struct kset die_kset;   /* ip_discovery/die/, contains ip_die_entry */
-       struct amdgpu_device *adev;
-};
-
 static void die_kobj_release(struct kobject *kobj)
 {
        struct ip_discovery_top *ip_top = container_of(to_kset(kobj),
@@ -1077,8 +1096,14 @@ static void ip_disc_release(struct kobject *kobj)
                                                       kobj);
        struct amdgpu_device *adev = ip_top->adev;
 
+       /* In standalone mode, discovery_bin is managed by devm and will be
+        * freed automatically when the PCI device is removed. Do not manually
+        * free it here to avoid double-free.
+        */
+
        kfree(ip_top);
-       adev->discovery.ip_top = NULL;
+       if (adev)
+               adev->discovery.ip_top = NULL;
 }
 
 static uint8_t amdgpu_discovery_get_harvest_info(struct amdgpu_device *adev,
@@ -1086,6 +1111,10 @@ static uint8_t amdgpu_discovery_get_harvest_info(struct 
amdgpu_device *adev,
 {
        uint8_t harvest = 0;
 
+       /* In early init mode (adev == NULL), harvest info is not available */
+       if (!adev)
+               return 0;
+
        /* Until a uniform way is figured, get mask based on hwid */
        switch (hw_id) {
        case VCN_HWID:
@@ -1114,11 +1143,14 @@ static uint8_t amdgpu_discovery_get_harvest_info(struct 
amdgpu_device *adev,
 }
 
 static int amdgpu_discovery_sysfs_ips(struct amdgpu_device *adev,
+                                     struct ip_discovery_top *ip_top,
                                      struct ip_die_entry *ip_die_entry,
                                      const size_t _ip_offset, const int 
num_ips,
                                      bool reg_base_64)
 {
-       uint8_t *discovery_bin = adev->discovery.bin;
+       uint8_t *discovery_bin = ip_top->standalone_mode ?
+                                ip_top->discovery_bin :
+                                adev->discovery.bin;
        int ii, jj, kk, res;
        uint16_t hw_id;
        uint8_t inst;
@@ -1220,10 +1252,12 @@ static int amdgpu_discovery_sysfs_ips(struct 
amdgpu_device *adev,
        return 0;
 }
 
-static int amdgpu_discovery_sysfs_recurse(struct amdgpu_device *adev)
+static int amdgpu_discovery_sysfs_recurse(struct amdgpu_device *adev,
+                                         struct ip_discovery_top *ip_top)
 {
-       struct ip_discovery_top *ip_top = adev->discovery.ip_top;
-       uint8_t *discovery_bin = adev->discovery.bin;
+       uint8_t *discovery_bin = ip_top->standalone_mode ?
+                                ip_top->discovery_bin :
+                                adev->discovery.bin;
        struct binary_header *bhdr;
        struct ip_discovery_header *ihdr;
        struct die_header *dhdr;
@@ -1270,7 +1304,8 @@ static int amdgpu_discovery_sysfs_recurse(struct 
amdgpu_device *adev)
                        return res;
                }
 
-               amdgpu_discovery_sysfs_ips(adev, ip_die_entry, ip_offset, 
num_ips, !!ihdr->base_addr_64_bit);
+               amdgpu_discovery_sysfs_ips(adev, ip_top, ip_die_entry, 
ip_offset,
+                                          num_ips, !!ihdr->base_addr_64_bit);
        }
 
        return 0;
@@ -1286,12 +1321,30 @@ static int amdgpu_discovery_sysfs_init(struct 
amdgpu_device *adev)
        if (!discovery_bin)
                return -EINVAL;
 
+       /* If early init already created sysfs in standalone mode, skip normal 
init */
+       if (adev->discovery.ip_top && adev->discovery.ip_top->standalone_mode)
+               return 0;
+
        ip_top = kzalloc_obj(*ip_top);
        if (!ip_top)
                return -ENOMEM;
 
        ip_top->adev = adev;
-       adev->discovery.ip_top = ip_top;
+
+       /* Check if ip_discovery already exists before creating.
+        * This shouldn't normally happen but handle it gracefully.
+        */
+       if (adev->dev->kobj.sd) {
+               struct kernfs_node *existing;
+
+               existing = kernfs_find_and_get(adev->dev->kobj.sd, 
"ip_discovery");
+               if (existing) {
+                       kernfs_put(existing);
+                       kfree(ip_top);
+                       return 0;
+               }
+       }
+
        res = kobject_init_and_add(&ip_top->kobj, &ip_discovery_ktype,
                                   &adev->dev->kobj, "ip_discovery");
        if (res) {
@@ -1299,6 +1352,8 @@ static int amdgpu_discovery_sysfs_init(struct 
amdgpu_device *adev)
                goto Err;
        }
 
+       adev->discovery.ip_top = ip_top;
+
        die_kset = &ip_top->die_kset;
        kobject_set_name(&die_kset->kobj, "%s", "die");
        die_kset->kobj.parent = &ip_top->kobj;
@@ -1313,7 +1368,7 @@ static int amdgpu_discovery_sysfs_init(struct 
amdgpu_device *adev)
                ip_hw_instance_attrs[ii] = &ip_hw_attr[ii].attr;
        ip_hw_instance_attrs[ii] = NULL;
 
-       res = amdgpu_discovery_sysfs_recurse(adev);
+       res = amdgpu_discovery_sysfs_recurse(adev, ip_top);
 
        return res;
 Err:
@@ -1366,6 +1421,9 @@ static void amdgpu_discovery_sysfs_fini(struct 
amdgpu_device *adev)
        struct list_head *el, *tmp;
        struct kset *die_kset;
 
+       if (!ip_top)
+               return;
+
        die_kset = &ip_top->die_kset;
        spin_lock(&die_kset->list_lock);
        list_for_each_prev_safe(el, tmp, &die_kset->list) {
@@ -1379,6 +1437,151 @@ static void amdgpu_discovery_sysfs_fini(struct 
amdgpu_device *adev)
        kobject_put(&ip_top->kobj);
 }
 
+int amdgpu_discovery_sysfs_early_init(struct amdgpu_device *adev, struct 
pci_dev *pdev)
+{
+       struct ip_discovery_top *ip_top;
+       struct early_ip_discovery *early_entry, *tmp;
+       struct kset *die_kset;
+       uint8_t *discovery_bin;
+       int res, ii;
+
+       if (!adev || !adev->discovery.bin)
+               return -EINVAL;
+
+       if (adev->discovery.ip_top)
+               return 0;
+
+       mutex_lock(&early_ip_discovery_mutex);
+       list_for_each_entry_safe(early_entry, tmp, &early_ip_discovery_list, 
list) {
+               if (early_entry->pdev == pdev) {
+                       adev->discovery.ip_top = early_entry->ip_top;
+                       early_entry->ip_top->adev = adev;
+                       mutex_unlock(&early_ip_discovery_mutex);
+                       return 0;
+               }
+       }
+       mutex_unlock(&early_ip_discovery_mutex);
+
+       discovery_bin = adev->discovery.bin;
+
+       early_entry = kzalloc_obj(*early_entry);
+       if (!early_entry)
+               return -ENOMEM;
+
+       ip_top = kzalloc_obj(*ip_top);
+       if (!ip_top) {
+               kfree(early_entry);
+               return -ENOMEM;
+       }
+
+       ip_top->discovery_bin = devm_kmemdup(&pdev->dev, discovery_bin,
+                                            DISCOVERY_TMR_SIZE, GFP_KERNEL);
+       if (!ip_top->discovery_bin) {
+               kfree(ip_top);
+               kfree(early_entry);
+               return -ENOMEM;
+       }
+
+       ip_top->bin_size = DISCOVERY_TMR_SIZE;
+       ip_top->pdev = pdev;
+       ip_top->adev = adev;
+       ip_top->standalone_mode = true;
+
+       /* Check if ip_discovery already exists (from previous probe attempt).
+        * This can happen if the module was unloaded and reloaded but the
+        * sysfs persisted (tied to PCI device lifetime).
+        */
+       if (pdev->dev.kobj.sd) {
+               struct kernfs_node *existing;
+
+               existing = kernfs_find_and_get(pdev->dev.kobj.sd, 
"ip_discovery");
+               if (existing) {
+                       kernfs_put(existing);
+                       kfree(ip_top);
+                       kfree(early_entry);
+                       return 0;
+               }
+       }
+
+       res = kobject_init_and_add(&ip_top->kobj, &ip_discovery_ktype,
+                                  &pdev->dev.kobj, "ip_discovery");
+       if (res)
+               goto err_put_kobj;
+
+       adev->discovery.ip_top = ip_top;
+
+       die_kset = &ip_top->die_kset;
+       kobject_set_name(&die_kset->kobj, "%s", "die");
+       die_kset->kobj.parent = &ip_top->kobj;
+       die_kset->kobj.ktype = &die_kobj_ktype;
+       res = kset_register(&ip_top->die_kset);
+       if (res)
+               goto err_put_die_kset;
+
+       for (ii = 0; ii < ARRAY_SIZE(ip_hw_attr); ii++)
+               ip_hw_instance_attrs[ii] = &ip_hw_attr[ii].attr;
+       ip_hw_instance_attrs[ii] = NULL;
+
+       res = amdgpu_discovery_sysfs_recurse(NULL, ip_top);
+       if (res)
+               goto err_put_die_kset;
+
+       early_entry->pdev = pdev;
+       early_entry->ip_top = ip_top;
+       mutex_lock(&early_ip_discovery_mutex);
+       list_add(&early_entry->list, &early_ip_discovery_list);
+       mutex_unlock(&early_ip_discovery_mutex);
+
+       return 0;
+
+err_put_die_kset:
+       kobject_put(&ip_top->die_kset.kobj);
+err_put_kobj:
+       kobject_put(&ip_top->kobj);
+       kfree(early_entry);
+       adev->discovery.ip_top = NULL;
+       return res;
+}
+
+void amdgpu_discovery_sysfs_early_fini(struct pci_dev *pdev)
+{
+       struct early_ip_discovery *entry, *tmp_entry;
+       struct ip_discovery_top *ip_top = NULL;
+       struct list_head *el, *tmp;
+       struct kset *die_kset;
+
+       /* Find the entry in our tracking list */
+       mutex_lock(&early_ip_discovery_mutex);
+       list_for_each_entry_safe(entry, tmp_entry, &early_ip_discovery_list, 
list) {
+               if (entry->pdev == pdev) {
+                       ip_top = entry->ip_top;
+                       list_del(&entry->list);
+                       kfree(entry);
+                       break;
+               }
+       }
+       mutex_unlock(&early_ip_discovery_mutex);
+
+       if (!ip_top)
+               return;
+
+       /* Clean up sysfs hierarchy */
+       die_kset = &ip_top->die_kset;
+
+       spin_lock(&die_kset->list_lock);
+       list_for_each_prev_safe(el, tmp, &die_kset->list) {
+               list_del_init(el);
+               spin_unlock(&die_kset->list_lock);
+               
amdgpu_discovery_sysfs_die_free(to_ip_die_entry(list_to_kobj(el)));
+               spin_lock(&die_kset->list_lock);
+       }
+       spin_unlock(&die_kset->list_lock);
+
+       kobject_put(&ip_top->die_kset.kobj);
+       kobject_put(&ip_top->kobj);
+       /* ip_top itself will be freed by kobject_put via ip_disc_release */
+}
+
 /* ================================================== */
 
 static int amdgpu_discovery_reg_base_init(struct amdgpu_device *adev)
@@ -1403,6 +1606,9 @@ static int amdgpu_discovery_reg_base_init(struct 
amdgpu_device *adev)
        r = amdgpu_discovery_init(adev);
        if (r)
                return r;
+
+       amdgpu_discovery_sysfs_early_init(adev, adev->pdev);
+
        discovery_bin = adev->discovery.bin;
        wafl_ver = 0;
        adev->gfx.xcc_mask = 0;
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_discovery.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_discovery.h
index 4ce04486cc319..05a19cfe83988 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_discovery.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_discovery.h
@@ -47,4 +47,9 @@ int amdgpu_discovery_get_nps_info(struct amdgpu_device *adev,
                                  struct amdgpu_gmc_memrange **ranges,
                                  int *range_cnt, bool refresh);
 
+/* Early sysfs functions for persistent ip_discovery export */
+int amdgpu_discovery_sysfs_early_init(struct amdgpu_device *adev,
+                                      struct pci_dev *pdev);
+void amdgpu_discovery_sysfs_early_fini(struct pci_dev *pdev);
+
 #endif /* __AMDGPU_DISCOVERY__ */
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
index 95d26f086d545..acfebb33b0e6c 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
@@ -2562,6 +2562,8 @@ amdgpu_pci_remove(struct pci_dev *pdev)
 
        amdgpu_driver_unload_kms(dev);
 
+       amdgpu_discovery_sysfs_early_fini(pdev);
+
        /*
         * Flush any in flight DMA operations from device.
         * Clear the Bus Master Enable bit and then wait on the PCIe Device
-- 
2.53.0

Reply via email to