EEH (Enhanced Error Handling) allows a driver to recover from the
temporary failure of an attached PCI card. Enable basic CXL support
for EEH.
Signed-off-by: Daniel Axtens d...@axtens.net
---
drivers/misc/cxl/cxl.h | 1 +
drivers/misc/cxl/pci.c | 253
drivers/misc/cxl/vphb.c | 8 ++
3 files changed, 262 insertions(+)
diff --git a/drivers/misc/cxl/cxl.h b/drivers/misc/cxl/cxl.h
index cda02412b01e..6f5386653dae 100644
--- a/drivers/misc/cxl/cxl.h
+++ b/drivers/misc/cxl/cxl.h
@@ -726,6 +726,7 @@ int cxl_psl_purge(struct cxl_afu *afu);
void cxl_stop_trace(struct cxl *cxl);
int cxl_pci_vphb_add(struct cxl_afu *afu);
+void cxl_pci_vphb_reconfigure(struct cxl_afu *afu);
void cxl_pci_vphb_remove(struct cxl_afu *afu);
extern struct pci_driver cxl_pci_driver;
diff --git a/drivers/misc/cxl/pci.c b/drivers/misc/cxl/pci.c
index 965524a6ae7c..5c2dc82da92f 100644
--- a/drivers/misc/cxl/pci.c
+++ b/drivers/misc/cxl/pci.c
@@ -24,6 +24,7 @@
#include <asm/io.h>
#include "cxl.h"
+#include <misc/cxl.h>
#define CXL_PCI_VSEC_ID	0x1280
@@ -1275,10 +1276,262 @@ static void cxl_remove(struct pci_dev *dev)
cxl_remove_adapter(adapter);
}
+static pci_ers_result_t cxl_vphb_error_detected(struct cxl_afu *afu,
+						pci_channel_state_t state)
+{
+	struct pci_dev *afu_dev;
+	pci_ers_result_t result = PCI_ERS_RESULT_NEED_RESET;
+	pci_ers_result_t afu_result = PCI_ERS_RESULT_NEED_RESET;
+
+	/* Notify each bound driver on the AFU's PHB bus and merge verdicts.
+	 * There should only be one entry, but go through the list anyway.
+	 */
+	list_for_each_entry(afu_dev, &afu->phb->bus->devices, bus_list) {
+		if (!afu_dev->driver)
+			continue;
+
+		afu_dev->error_state = state;
+
+		if (afu_dev->driver->err_handler)
+			afu_result = afu_dev->driver->err_handler->error_detected(afu_dev,
+										  state);
+		/* Disconnect trumps all, NONE trumps NEED_RESET */
+		if (afu_result == PCI_ERS_RESULT_DISCONNECT)
+			result = PCI_ERS_RESULT_DISCONNECT;
+		else if ((afu_result == PCI_ERS_RESULT_NONE) &&
+			 (result == PCI_ERS_RESULT_NEED_RESET))
+			result = PCI_ERS_RESULT_NONE;
+	}
+	return result;
+}
+
+static pci_ers_result_t cxl_pci_error_detected(struct pci_dev *pdev,
+ pci_channel_state_t state)
+{
+ struct cxl *adapter = pci_get_drvdata(pdev);
+ struct cxl_afu *afu;
+ pci_ers_result_t result = PCI_ERS_RESULT_NEED_RESET;
+ int i;
+
+ /* At this point, we could still have an interrupt pending.
+* Let's try to get them out of the way before they do
+* anything we don't like.
+*/
+ schedule();
+
+ /* If we're permanently dead, give up. */
+ if (state == pci_channel_io_perm_failure) {
+ /* Tell the AFU drivers; but we don't care what they
+* say, we're going away.
+*/
+ for (i = 0; i < adapter->slices; i++) {
+ afu = adapter->afu[i];
+ cxl_vphb_error_detected(afu, state);
+ }
+ return PCI_ERS_RESULT_DISCONNECT;
+ }
+
+ /* Are we reflashing?
+*
+* If we reflash, we could come back as something entirely
+* different, including a non-CAPI card. As such, by default
+* we don't participate in the process. We'll be unbound and
+* the slot re-probed. (TODO: check EEH doesn't blindly rebind
+* us!)
+*
+* However, this isn't the entire story: for reliability
+* reasons, we usually want to reflash the FPGA on PERST in
+* order to get back to a more reliable known-good state.
+*
+* This causes us a bit of a problem: if we reflash we can't
+* trust that we'll come back the same - we could have a new
+* image and been PERSTed in order to load that
+* image. However, most of the time we actually *will* come
+* back the same - for example a regular EEH event.
+*
+* Therefore, we allow the user to assert that the image is
+* indeed the same and that we should continue on into EEH
+* anyway.
+*/
+ if (adapter->perst_loads_image && !adapter->perst_same_image) {
+ /* TODO take the PHB out of CXL mode */
+ dev_info(&pdev->dev, "reflashing, so opting out of EEH!\n");
+ return PCI_ERS_RESULT_NONE;
+ }
+
+ /*
+* At this point, we want to try to recover. We'll always
+* need a complete slot reset: we don't trust any other reset.
+*
+* Now, we go through each AFU:
+* - We send the driver, if bound, an error_detected