[AMD Official Use Only - AMD Internal Distribution Only] Tested-by: Ce Sun <cesun...@amd.com>
Thanks Ce Sun ________________________________ From: Lazar, Lijo <lijo.la...@amd.com> Sent: Thursday, August 7, 2025 3:33 PM To: amd-gfx@lists.freedesktop.org <amd-gfx@lists.freedesktop.org> Cc: Zhang, Hawking <hawking.zh...@amd.com>; Deucher, Alexander <alexander.deuc...@amd.com>; Sun, Ce(Overlord) <ce....@amd.com> Subject: Re: [PATCH v3] drm/amdgpu: Save and restore switch state <Ping> On 8/1/2025 6:10 PM, Lijo Lazar wrote: > During a DPC error kernel waits for the link to be active before > notifying downstream devices. On certain platforms with Broadcom switch > in synthetiic mode, switch responds with values even though the link is > not fully ready. The config space restoration done by pcie port driver > for SWUS/DS of dGPU is thus not effective as the switch is still doing > internal enumeration. > > As a workaround, save state of SWUS/DS device in driver. Add additional > check to see if link is active and restore the values during DPC error > callbacks. > > Signed-off-by: Lijo Lazar <lijo.la...@amd.com> > --- > v2: Use usleep_range as sleep is short. Remove dev_info logs. > v3: remove redundant increment of 'i' in loop (Ce Sun). > > drivers/gpu/drm/amd/amdgpu/amdgpu.h | 3 + > drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 72 +++++++++++++++++++++- > 2 files changed, 73 insertions(+), 2 deletions(-) > > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h > b/drivers/gpu/drm/amd/amdgpu/amdgpu.h > index 3550c2fac184..96d772aadb04 100644 > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h > @@ -904,6 +904,9 @@ struct amdgpu_pcie_reset_ctx { > bool in_link_reset; > bool occurs_dpc; > bool audio_suspended; > + struct pci_dev *swus; > + struct pci_saved_state *swus_pcistate; > + struct pci_saved_state *swds_pcistate; > }; > > /* > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c > b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c > index cfd72faec16e..e58f42531974 100644 > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c > @@ -178,6 +178,8 @@ struct amdgpu_init_level amdgpu_init_minimal_xgmi = { > BIT(AMD_IP_BLOCK_TYPE_PSP) > }; > > +static void amdgpu_device_load_switch_state(struct amdgpu_device *adev); > + > static inline bool amdgpu_ip_member_of_hwini(struct amdgpu_device *adev, > enum amd_ip_block_type block) > { > @@ -5006,7 +5008,8 @@ void amdgpu_device_fini_sw(struct amdgpu_device *adev) > adev->reset_domain = NULL; > > kfree(adev->pci_state); > - > + kfree(adev->pcie_reset_ctx.swds_pcistate); > + kfree(adev->pcie_reset_ctx.swus_pcistate); > } > > /** > @@ -6963,16 +6966,27 @@ pci_ers_result_t amdgpu_pci_slot_reset(struct pci_dev > *pdev) > struct amdgpu_device *tmp_adev; > struct amdgpu_hive_info *hive; > struct list_head device_list; > + struct pci_dev *link_dev; > int r = 0, i; > u32 memsize; > + u16 status; > > dev_info(adev->dev, "PCI error: slot reset callback!!\n"); > > memset(&reset_context, 0, sizeof(reset_context)); > > + if (adev->pcie_reset_ctx.swus) > + link_dev = adev->pcie_reset_ctx.swus; > + else > + link_dev = adev->pdev; > /* wait for asic to come out of reset */ > - msleep(700); > + do { > + usleep_range(10000, 10500); > + r = pci_read_config_word(link_dev, PCI_VENDOR_ID, &status); > + } while ((status != PCI_VENDOR_ID_ATI) && > + (status != PCI_VENDOR_ID_AMD)); > > + amdgpu_device_load_switch_state(adev); > /* Restore PCI confspace */ > amdgpu_device_load_pci_state(pdev); > > @@ -7074,6 +7088,58 @@ void amdgpu_pci_resume(struct pci_dev *pdev) > } > } > > +static void amdgpu_device_cache_switch_state(struct amdgpu_device *adev) > +{ > + struct pci_dev *parent = pci_upstream_bridge(adev->pdev); > + int r; > + > + if (parent->vendor != PCI_VENDOR_ID_ATI) > + return; > + > + /* If already saved, return */ > + if (adev->pcie_reset_ctx.swus) > + return; > + /* Upstream bridge is ATI, assume it's SWUS/DS architecture */ > + r = pci_save_state(parent); > + if (r) > + return; > + adev->pcie_reset_ctx.swds_pcistate = pci_store_saved_state(parent); > + > + parent = pci_upstream_bridge(parent); > + r = pci_save_state(parent); > + if (r) > + return; > + adev->pcie_reset_ctx.swus_pcistate = pci_store_saved_state(parent); > + > + adev->pcie_reset_ctx.swus = parent; > +} > + > +static void amdgpu_device_load_switch_state(struct amdgpu_device *adev) > +{ > + struct pci_dev *pdev; > + int r; > + > + if (!adev->pcie_reset_ctx.swds_pcistate || > + !adev->pcie_reset_ctx.swus_pcistate) > + return; > + > + pdev = adev->pcie_reset_ctx.swus; > + r = pci_load_saved_state(pdev, adev->pcie_reset_ctx.swus_pcistate); > + if (!r) { > + pci_restore_state(pdev); > + } else { > + dev_warn(adev->dev, "Failed to load SWUS state, err:%d\n", r); > + return; > + } > + > + pdev = pci_upstream_bridge(adev->pdev); > + r = pci_load_saved_state(pdev, adev->pcie_reset_ctx.swds_pcistate); > + if (!r) > + pci_restore_state(pdev); > + else > + dev_warn(adev->dev, "Failed to load SWDS state, err:%d\n", r); > +} > + > bool amdgpu_device_cache_pci_state(struct pci_dev *pdev) > { > struct drm_device *dev = pci_get_drvdata(pdev); > @@ -7098,6 +7164,8 @@ bool amdgpu_device_cache_pci_state(struct pci_dev *pdev) > return false; > } > > + amdgpu_device_cache_switch_state(adev); > + > return true; > } >