PUNIT errors can only be recovered by a power cycle, so the Xe KMD sends a uevent to notify userspace to trigger one. On some platforms, the link drop caused by powering the device off and back on is reported by hardware as a Surprise Link Down (SLD), which AER then escalates as an Uncorrectable Fatal Error. That error fires before the device finishes coming back up and defeats the very recovery we are attempting.
To keep the expected, recovery-induced link drop from being raised as a fatal AER event, mask the Surprise Link Down bit (PCI_ERR_UNC_SURPDN) in the upstream port's AER Uncorrectable Error Mask register before punit_error_handler() requests the cold reset. After the reset finishes, clear the Surprise Down Error Status bit in the Uncorrectable Error Status register. Signed-off-by: Mallesh Koujalagi <[email protected]> --- v6: - Expand commit message to explain why SUR_DN is masked. (Raag/Riana) - Check Slot Implemented bit before reading Slot Capabilities, per PCIe spec. (Riana) - Add debug log. v7: - Handle surprise link down event properly. (Aravind/Riana) - Update commit message. (Riana) - Correct log message. v8: - Use find_usp_dev() in punit_error_handler() function. v9: - Removed #ifdef CONFIG_PCIEAER. (Riana) - Used pci_find_ext_capability() instead of usp->aer_cap. - Clear the PCI_ERR_UNC_SURPDN status bit (W1C) after reset complete. (Lukas Wunner) - Use pci_clear_and_set_config_dword() helper. --- drivers/gpu/drm/xe/xe_ras.c | 60 +++++++++++++++++++++++++++++++------ 1 file changed, 51 insertions(+), 9 deletions(-) diff --git a/drivers/gpu/drm/xe/xe_ras.c b/drivers/gpu/drm/xe/xe_ras.c index 1eb2bbaccd9b..5a2fee5a1308 100644 --- a/drivers/gpu/drm/xe/xe_ras.c +++ b/drivers/gpu/drm/xe/xe_ras.c @@ -256,8 +256,37 @@ static struct pci_dev *find_usp_dev(struct pci_dev *pdev) return pci_upstream_bridge(vsp); } +static void pcie_suppress_surprise_link_down(struct pci_dev *usp) +{ + u16 aer_cap; + + /* + * Cold reset power-cycles the slot, dropping the PCIe link. + * This triggers a spurious Surprise Link Down AER event on the + * Upstream Switch Port (USP). Mask this error to avoid false + * error reporting during recovery. 
+ */ + aer_cap = pci_find_ext_capability(usp, PCI_EXT_CAP_ID_ERR); + if (!aer_cap) { + dev_dbg(&usp->dev, + "AER capability not present\n"); + return; + } + + pci_clear_and_set_config_dword(usp, aer_cap + PCI_ERR_UNCOR_MASK, 0, PCI_ERR_UNC_SURPDN); + dev_dbg(&usp->dev, "Surprise Link Down masked for cold reset\n"); +} + static void punit_error_handler(struct xe_device *xe) { + struct pci_dev *pdev = to_pci_dev(xe->drm.dev); + struct pci_dev *usp; + + usp = find_usp_dev(pdev); + + if (usp) + pcie_suppress_surprise_link_down(usp); + xe_device_set_wedged_method(xe, DRM_WEDGE_RECOVERY_COLD_RESET); xe_device_declare_wedged(xe); } @@ -573,7 +602,7 @@ int xe_ras_clear_counter(struct xe_device *xe, u8 severity, u8 component) static void aer_unmask_and_downgrade_internal_error(struct xe_device *xe) { struct pci_dev *pdev = to_pci_dev(xe->drm.dev); - u32 aer_uncorr_mask, aer_uncorr_sev, aer_uncorr_status; + u32 aer_uncorr_status; struct pci_dev *usp; u16 aer_cap; @@ -595,6 +624,17 @@ static void aer_unmask_and_downgrade_internal_error(struct xe_device *xe) if (aer_uncorr_status & PCI_ERR_UNC_INTN) pci_write_config_dword(usp, aer_cap + PCI_ERR_UNCOR_STATUS, PCI_ERR_UNC_INTN); + /* + * Even though we masked the Surprise Link Down bit before the reset, the + * USP hardware still records the event in its status register. Since the + * USP itself is never power-cycled, that status bit survives the reset and + * stays set. If we unmask the error below without clearing the status first, + * it will immediately fire as a new AER error and undo the recovery we just + * completed. Write 1 to clear it (W1C) before unmasking. + */ + if (aer_uncorr_status & PCI_ERR_UNC_SURPDN) + pci_write_config_dword(usp, aer_cap + PCI_ERR_UNCOR_STATUS, PCI_ERR_UNC_SURPDN); + /* * All errors are steered to USP which is a PCIe AER Compliant device. 
* Downgrade all the errors to non-fatal to prevent PCIe bus driver @@ -604,20 +644,22 @@ static void aer_unmask_and_downgrade_internal_error(struct xe_device *xe) * The Uncorrectable Error Severity Register has the 'Uncorrectable * Internal Error Severity' set to fatal by default. Set this to * non-fatal and unmask the error. + * + * Also restore the Surprise Link Down mask that was set in + * pcie_suppress_surprise_link_down() before the cold reset. The USP is + * never power-cycled, so the mask bit persists and must be cleared here + * to ensure link-down events are reported normally going forward. */ /* Initialize Uncorrectable Error Severity Register */ - pci_read_config_dword(usp, aer_cap + PCI_ERR_UNCOR_SEVER, &aer_uncorr_sev); - aer_uncorr_sev &= ~PCI_ERR_UNC_INTN; - pci_write_config_dword(usp, aer_cap + PCI_ERR_UNCOR_SEVER, aer_uncorr_sev); + pci_clear_and_set_config_dword(usp, aer_cap + PCI_ERR_UNCOR_SEVER, PCI_ERR_UNC_INTN, 0); - /* Initialize Uncorrectable Error Mask Register */ - pci_read_config_dword(usp, aer_cap + PCI_ERR_UNCOR_MASK, &aer_uncorr_mask); - aer_uncorr_mask &= ~PCI_ERR_UNC_INTN; - pci_write_config_dword(usp, aer_cap + PCI_ERR_UNCOR_MASK, aer_uncorr_mask); + /* Unmask Uncorrectable Internal Error and restore Surprise Link Down to unmasked */ + pci_clear_and_set_config_dword(usp, aer_cap + PCI_ERR_UNCOR_MASK, + PCI_ERR_UNC_INTN | PCI_ERR_UNC_SURPDN, 0); pci_save_state(usp); - dev_dbg(&usp->dev, "Uncorrectable Internal Errors downgraded and unmasked\n"); + dev_dbg(&usp->dev, "AER: INTN downgraded to non-fatal, INTN and SLD unmasked\n"); } /** -- 2.34.1
