Re: [PATCH v4 10/11] cxl: EEH support

2015-08-14 Thread Ian Munsie
Acked-by: Ian Munsie <imun...@au1.ibm.com>

___
Linuxppc-dev mailing list
Linuxppc-dev@lists.ozlabs.org
https://lists.ozlabs.org/listinfo/linuxppc-dev

[PATCH v4 10/11] cxl: EEH support

2015-08-12 Thread Daniel Axtens
EEH (Enhanced Error Handling) allows a driver to recover from the
temporary failure of an attached PCI card. Enable basic CXL support
for EEH.

Signed-off-by: Daniel Axtens <d...@axtens.net>
---
 drivers/misc/cxl/cxl.h  |   1 +
 drivers/misc/cxl/pci.c  | 253 
 drivers/misc/cxl/vphb.c |   8 ++
 3 files changed, 262 insertions(+)

diff --git a/drivers/misc/cxl/cxl.h b/drivers/misc/cxl/cxl.h
index cda02412b01e..6f5386653dae 100644
--- a/drivers/misc/cxl/cxl.h
+++ b/drivers/misc/cxl/cxl.h
@@ -726,6 +726,7 @@ int cxl_psl_purge(struct cxl_afu *afu);
 
 void cxl_stop_trace(struct cxl *cxl);
 int cxl_pci_vphb_add(struct cxl_afu *afu);
+void cxl_pci_vphb_reconfigure(struct cxl_afu *afu);
 void cxl_pci_vphb_remove(struct cxl_afu *afu);
 
 extern struct pci_driver cxl_pci_driver;
diff --git a/drivers/misc/cxl/pci.c b/drivers/misc/cxl/pci.c
index 965524a6ae7c..5c2dc82da92f 100644
--- a/drivers/misc/cxl/pci.c
+++ b/drivers/misc/cxl/pci.c
@@ -24,6 +24,7 @@
 #include <asm/io.h>
 
 #include "cxl.h"
+#include <misc/cxl.h>
 
 
 #define CXL_PCI_VSEC_ID	0x1280
@@ -1275,10 +1276,262 @@ static void cxl_remove(struct pci_dev *dev)
cxl_remove_adapter(adapter);
 }
 
+static pci_ers_result_t cxl_vphb_error_detected(struct cxl_afu *afu,
+						pci_channel_state_t state)
+{
+	struct pci_dev *afu_dev;
+	pci_ers_result_t result = PCI_ERS_RESULT_NEED_RESET;
+	pci_ers_result_t afu_result = PCI_ERS_RESULT_NEED_RESET;
+
+	/* There should only be one entry, but go through the list
+	 * anyway
+	 */
+	list_for_each_entry(afu_dev, &afu->phb->bus->devices, bus_list) {
+		if (!afu_dev->driver)
+			continue;
+
+		afu_dev->error_state = state;
+
+		if (afu_dev->driver->err_handler)
+			afu_result = afu_dev->driver->err_handler->error_detected(afu_dev,
+										  state);
+		/* Disconnect trumps all, NONE trumps NEED_RESET */
+		if (afu_result == PCI_ERS_RESULT_DISCONNECT)
+			result = PCI_ERS_RESULT_DISCONNECT;
+		else if ((afu_result == PCI_ERS_RESULT_NONE) &&
+			 (result == PCI_ERS_RESULT_NEED_RESET))
+			result = PCI_ERS_RESULT_NONE;
+	}
+	return result;
+}
+
+static pci_ers_result_t cxl_pci_error_detected(struct pci_dev *pdev,
+  pci_channel_state_t state)
+{
+   struct cxl *adapter = pci_get_drvdata(pdev);
+   struct cxl_afu *afu;
+   pci_ers_result_t result = PCI_ERS_RESULT_NEED_RESET;
+   int i;
+
+   /* At this point, we could still have an interrupt pending.
+* Let's try to get them out of the way before they do
+* anything we don't like.
+*/
+   schedule();
+
+   /* If we're permanently dead, give up. */
+   if (state == pci_channel_io_perm_failure) {
+   /* Tell the AFU drivers; but we don't care what they
+* say, we're going away.
+*/
+   for (i = 0; i < adapter->slices; i++) {
+   afu = adapter->afu[i];
+   cxl_vphb_error_detected(afu, state);
+   }
+   return PCI_ERS_RESULT_DISCONNECT;
+   }
+
+   /* Are we reflashing?
+*
+* If we reflash, we could come back as something entirely
+* different, including a non-CAPI card. As such, by default
+* we don't participate in the process. We'll be unbound and
+* the slot re-probed. (TODO: check EEH doesn't blindly rebind
+* us!)
+*
+* However, this isn't the entire story: for reliability
+* reasons, we usually want to reflash the FPGA on PERST in
+* order to get back to a more reliable known-good state.
+*
+* This causes us a bit of a problem: if we reflash we can't
+* trust that we'll come back the same - we could have a new
+* image and been PERSTed in order to load that
+* image. However, most of the time we actually *will* come
+* back the same - for example a regular EEH event.
+*
+* Therefore, we allow the user to assert that the image is
+* indeed the same and that we should continue on into EEH
+* anyway.
+*/
+   if (adapter->perst_loads_image && !adapter->perst_same_image) {
+   /* TODO take the PHB out of CXL mode */
+   dev_info(&pdev->dev, "reflashing, so opting out of EEH!\n");
+   return PCI_ERS_RESULT_NONE;
+   }
+
+   /*
+* At this point, we want to try to recover.  We'll always
+* need a complete slot reset: we don't trust any other reset.
+*
+* Now, we go through each AFU:
+*  - We send the driver, if bound, an error_detected