PUNIT errors can only be recovered from with a power cycle. The Xe KMD
sends a uevent to notify userspace to trigger that power cycle.
On some platforms, the link drop caused by powering the device off and
back on is reported by hardware as a Surprise Link Down (SLD), which
AER then escalates as an Uncorrectable Fatal Error. That error fires
before the device finishes coming back up and defeats the very
recovery we are attempting.

To keep the expected, recovery-induced link drop from being raised as
a fatal AER event, mask the Surprise Link Down bit
(PCI_ERR_UNC_SURPDN) in the upstream port's AER Uncorrectable Error
Mask register before punit_error_handler() requests the cold reset.
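After the reset finishes, clear the Surprise Down Error Status bit
in the Uncorrectable Error Status register and then unmask Surprise
Link Down again, so that genuine link-down events are reported
normally going forward.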

Signed-off-by: Mallesh Koujalagi <[email protected]>
---
v6:
- Expand commit message to explain why SUR_DN is masked. (Raag/Riana)
- Check Slot Implemented bit before reading Slot Capabilities, per
  PCIe spec. (Riana)
- Add debug log.

v7:
- Handle surprise link down event properly. (Aravind/Riana)
- Update commit message. (Riana)
- Correct log message.

v8:
- Use find_usp_dev() in punit_error_handler() function.

v9:
- Remove #ifdef CONFIG_PCIEAER. (Riana)
- Use pci_find_ext_capability() instead of usp->aer_cap.
- Clear the PCI_ERR_UNC_SURPDN status bit (W1C) after the
  reset completes. (Lukas Wunner)
- Use pci_clear_and_set_config_dword() helper.
---
 drivers/gpu/drm/xe/xe_ras.c | 60 +++++++++++++++++++++++++++++++------
 1 file changed, 51 insertions(+), 9 deletions(-)

diff --git a/drivers/gpu/drm/xe/xe_ras.c b/drivers/gpu/drm/xe/xe_ras.c
index 1eb2bbaccd9b..5a2fee5a1308 100644
--- a/drivers/gpu/drm/xe/xe_ras.c
+++ b/drivers/gpu/drm/xe/xe_ras.c
@@ -256,8 +256,37 @@ static struct pci_dev *find_usp_dev(struct pci_dev *pdev)
        return pci_upstream_bridge(vsp);
 }
 
+static void pcie_suppress_surprise_link_down(struct pci_dev *usp)
+{
+       u16 aer_cap;
+
+       /*
+        * Cold reset power-cycles the slot, dropping the PCIe link.
+        * This triggers a spurious Surprise Link Down AER event on the
+        * Upstream Switch Port (USP). Mask this error to avoid false
+        * error reporting during recovery.
+        */
+       aer_cap = pci_find_ext_capability(usp, PCI_EXT_CAP_ID_ERR);
+       if (!aer_cap) {
+               dev_dbg(&usp->dev,
+                       "AER capability not present\n");
+               return;
+       }
+
+       pci_clear_and_set_config_dword(usp, aer_cap + PCI_ERR_UNCOR_MASK, 0, 
PCI_ERR_UNC_SURPDN);
+       dev_dbg(&usp->dev, "Surprise Link Down masked for cold reset\n");
+}
+
 static void punit_error_handler(struct xe_device *xe)
 {
+       struct pci_dev *pdev = to_pci_dev(xe->drm.dev);
+       struct pci_dev *usp;
+
+       usp = find_usp_dev(pdev);
+
+       if (usp)
+               pcie_suppress_surprise_link_down(usp);
+
        xe_device_set_wedged_method(xe, DRM_WEDGE_RECOVERY_COLD_RESET);
        xe_device_declare_wedged(xe);
 }
@@ -573,7 +602,7 @@ int xe_ras_clear_counter(struct xe_device *xe, u8 severity, 
u8 component)
 static void aer_unmask_and_downgrade_internal_error(struct xe_device *xe)
 {
        struct pci_dev *pdev = to_pci_dev(xe->drm.dev);
-       u32 aer_uncorr_mask, aer_uncorr_sev, aer_uncorr_status;
+       u32 aer_uncorr_status;
        struct pci_dev *usp;
        u16 aer_cap;
 
@@ -595,6 +624,17 @@ static void aer_unmask_and_downgrade_internal_error(struct 
xe_device *xe)
        if (aer_uncorr_status & PCI_ERR_UNC_INTN)
                pci_write_config_dword(usp, aer_cap + PCI_ERR_UNCOR_STATUS, 
PCI_ERR_UNC_INTN);
 
+       /*
+        * Even though we masked the Surprise Link Down bit before the reset, 
the
+        * USP hardware still records the event in its status register. Since 
the
+        * USP itself is never power-cycled, that status bit survives the reset 
and
+        * stays set. If we unmask the error below without clearing the status 
first,
+        * it will immediately fire as a new AER error and undo the recovery we 
just
+        * completed. Write 1 to clear it (W1C) before unmasking.
+        */
+       if (aer_uncorr_status & PCI_ERR_UNC_SURPDN)
+               pci_write_config_dword(usp, aer_cap + PCI_ERR_UNCOR_STATUS, 
PCI_ERR_UNC_SURPDN);
+
        /*
         * All errors are steered to USP which is a PCIe AER Compliant device.
         * Downgrade all the errors to non-fatal to prevent PCIe bus driver
@@ -604,20 +644,22 @@ static void 
aer_unmask_and_downgrade_internal_error(struct xe_device *xe)
         * The Uncorrectable Error Severity Register has the 'Uncorrectable
         * Internal Error Severity' set to fatal by default. Set this to
         * non-fatal and unmask the error.
+        *
+        * Also restore the Surprise Link Down mask that was set in
+        * pcie_suppress_surprise_link_down() before the cold reset. The USP is
+        * never power-cycled, so the mask bit persists and must be cleared here
+        * to ensure link-down events are reported normally going forward.
         */
 
        /* Initialize Uncorrectable Error Severity Register */
-       pci_read_config_dword(usp, aer_cap + PCI_ERR_UNCOR_SEVER, 
&aer_uncorr_sev);
-       aer_uncorr_sev &= ~PCI_ERR_UNC_INTN;
-       pci_write_config_dword(usp, aer_cap + PCI_ERR_UNCOR_SEVER, 
aer_uncorr_sev);
+       pci_clear_and_set_config_dword(usp, aer_cap + PCI_ERR_UNCOR_SEVER, 
PCI_ERR_UNC_INTN, 0);
 
-       /* Initialize Uncorrectable Error Mask Register */
-       pci_read_config_dword(usp, aer_cap + PCI_ERR_UNCOR_MASK, 
&aer_uncorr_mask);
-       aer_uncorr_mask &= ~PCI_ERR_UNC_INTN;
-       pci_write_config_dword(usp, aer_cap + PCI_ERR_UNCOR_MASK, 
aer_uncorr_mask);
+       /* Unmask Uncorrectable Internal Error and restore Surprise Link Down 
to unmasked */
+       pci_clear_and_set_config_dword(usp, aer_cap + PCI_ERR_UNCOR_MASK,
+                                      PCI_ERR_UNC_INTN | PCI_ERR_UNC_SURPDN, 
0);
 
        pci_save_state(usp);
-       dev_dbg(&usp->dev, "Uncorrectable Internal Errors downgraded and 
unmasked\n");
+       dev_dbg(&usp->dev, "AER: INTN downgraded to non-fatal, INTN and SLD 
unmasked\n");
 }
 
 /**
-- 
2.34.1

Reply via email to