Re: [PATCH v3 10/11] cxl: EEH support

2015-08-12 Thread Cyril Bur
On Wed, 12 Aug 2015 10:48:19 +1000
Daniel Axtens d...@axtens.net wrote:

 EEH (Enhanced Error Handling) allows a driver to recover from the
 temporary failure of an attached PCI card. Enable basic CXL support
 for EEH.
 

Looks like the only change since the previous version was the removal of the
#ifdef. If that is correct:

Reviewed-by: Cyril Bur <cyril...@gmail.com>

 Signed-off-by: Daniel Axtens <d...@axtens.net>
 ---
  drivers/misc/cxl/cxl.h  |   1 +
  drivers/misc/cxl/pci.c  | 252 
 
  drivers/misc/cxl/vphb.c |   8 ++
  3 files changed, 261 insertions(+)
 
 diff --git a/drivers/misc/cxl/cxl.h b/drivers/misc/cxl/cxl.h
 index cda02412b01e..6f5386653dae 100644
 --- a/drivers/misc/cxl/cxl.h
 +++ b/drivers/misc/cxl/cxl.h
 @@ -726,6 +726,7 @@ int cxl_psl_purge(struct cxl_afu *afu);
  
  void cxl_stop_trace(struct cxl *cxl);
  int cxl_pci_vphb_add(struct cxl_afu *afu);
 +void cxl_pci_vphb_reconfigure(struct cxl_afu *afu);
  void cxl_pci_vphb_remove(struct cxl_afu *afu);
  
  extern struct pci_driver cxl_pci_driver;
 diff --git a/drivers/misc/cxl/pci.c b/drivers/misc/cxl/pci.c
 index b4a68a896a33..1eb26a357ce0 100644
 --- a/drivers/misc/cxl/pci.c
 +++ b/drivers/misc/cxl/pci.c
 @@ -24,6 +24,7 @@
  #include <asm/io.h>
  
  #include "cxl.h"
 +#include <misc/cxl.h>
  
  
  #define CXL_PCI_VSEC_ID  0x1280
 @@ -1246,10 +1247,261 @@ static void cxl_remove(struct pci_dev *dev)
   cxl_remove_adapter(adapter);
  }
  
 +static pci_ers_result_t cxl_vphb_error_detected(struct cxl_afu *afu,
 + pci_channel_state_t state)
 +{
 + struct pci_dev *afu_dev;
 + pci_ers_result_t result = PCI_ERS_RESULT_NEED_RESET;
 + pci_ers_result_t afu_result = PCI_ERS_RESULT_NEED_RESET;
 +
 + /* There should only be one entry, but go through the list
 +  * anyway
 +  */
 + list_for_each_entry(afu_dev, &afu->phb->bus->devices, bus_list) {
 + if (!afu_dev->driver)
 + continue;
 +
 + afu_dev->error_state = state;
 +
 + if (afu_dev->driver->err_handler)
 + afu_result = afu_dev->driver->err_handler->error_detected(afu_dev,
 +   state);
 + /* Disconnect trumps all, NONE trumps NEED_RESET */
 + if (afu_result == PCI_ERS_RESULT_DISCONNECT)
 + result = PCI_ERS_RESULT_DISCONNECT;
 + else if ((afu_result == PCI_ERS_RESULT_NONE) &&
 +  (result == PCI_ERS_RESULT_NEED_RESET))
 + result = PCI_ERS_RESULT_NONE;
 + }
 + return result;
 +}
 +
 +static pci_ers_result_t cxl_pci_error_detected(struct pci_dev *pdev,
 +pci_channel_state_t state)
 +{
 + struct cxl *adapter = pci_get_drvdata(pdev);
 + struct cxl_afu *afu;
 + pci_ers_result_t result = PCI_ERS_RESULT_NEED_RESET;
 + int i;
 +
 + /* At this point, we could still have an interrupt pending.
 +  * Let's try to get them out of the way before they do
 +  * anything we don't like.
 +  */
 + schedule();
 +
 + /* If we're permanently dead, give up. */
 + if (state == pci_channel_io_perm_failure) {
 + /* Tell the AFU drivers; but we don't care what they
 +  * say, we're going away.
 +  */
 + for (i = 0; i < adapter->slices; i++) {
 + afu = adapter->afu[i];
 + cxl_vphb_error_detected(afu, state);
 + }
 + return PCI_ERS_RESULT_DISCONNECT;
 + }
 +
 + /* Are we reflashing?
 +  *
 +  * If we reflash, we could come back as something entirely
 +  * different, including a non-CAPI card. As such, by default
 +  * we don't participate in the process. We'll be unbound and
 +  * the slot re-probed. (TODO: check EEH doesn't blindly rebind
 +  * us!)
 +  *
 +  * However, this isn't the entire story: for reliability
 +  * reasons, we usually want to reflash the FPGA on PERST in
 +  * order to get back to a more reliable known-good state.
 +  *
 +  * This causes us a bit of a problem: if we reflash we can't
 +  * trust that we'll come back the same - we could have a new
 +  * image and been PERSTed in order to load that
 +  * image. However, most of the time we actually *will* come
 +  * back the same - for example a regular EEH event.
 +  *
 +  * Therefore, we allow the user to assert that the image is
 +  * indeed the same and that we should continue on into EEH
 +  * anyway.
 +  */
 + if (adapter->perst_loads_image && !adapter->perst_same_image) {
 + /* TODO take the PHB out of CXL mode */
 + dev_info(&pdev->dev, "reflashing, so opting out of EEH!\n");
 + return PCI_ERS_RESULT_NONE;
 + }
 +
 + /*
 +  * At this point, we want to try to recover.  We'll always
 + 

[PATCH v3 10/11] cxl: EEH support

2015-08-11 Thread Daniel Axtens
EEH (Enhanced Error Handling) allows a driver to recover from the
temporary failure of an attached PCI card. Enable basic CXL support
for EEH.

Signed-off-by: Daniel Axtens <d...@axtens.net>
---
 drivers/misc/cxl/cxl.h  |   1 +
 drivers/misc/cxl/pci.c  | 252 
 drivers/misc/cxl/vphb.c |   8 ++
 3 files changed, 261 insertions(+)

diff --git a/drivers/misc/cxl/cxl.h b/drivers/misc/cxl/cxl.h
index cda02412b01e..6f5386653dae 100644
--- a/drivers/misc/cxl/cxl.h
+++ b/drivers/misc/cxl/cxl.h
@@ -726,6 +726,7 @@ int cxl_psl_purge(struct cxl_afu *afu);
 
 void cxl_stop_trace(struct cxl *cxl);
 int cxl_pci_vphb_add(struct cxl_afu *afu);
+void cxl_pci_vphb_reconfigure(struct cxl_afu *afu);
 void cxl_pci_vphb_remove(struct cxl_afu *afu);
 
 extern struct pci_driver cxl_pci_driver;
diff --git a/drivers/misc/cxl/pci.c b/drivers/misc/cxl/pci.c
index b4a68a896a33..1eb26a357ce0 100644
--- a/drivers/misc/cxl/pci.c
+++ b/drivers/misc/cxl/pci.c
@@ -24,6 +24,7 @@
 #include <asm/io.h>
 
 #include "cxl.h"
+#include <misc/cxl.h>
 
 
 #define CXL_PCI_VSEC_ID	0x1280
@@ -1246,10 +1247,261 @@ static void cxl_remove(struct pci_dev *dev)
cxl_remove_adapter(adapter);
 }
 
+static pci_ers_result_t cxl_vphb_error_detected(struct cxl_afu *afu,
+   pci_channel_state_t state)
+{
+   struct pci_dev *afu_dev;
+   pci_ers_result_t result = PCI_ERS_RESULT_NEED_RESET;
+   pci_ers_result_t afu_result = PCI_ERS_RESULT_NEED_RESET;
+
+   /* There should only be one entry, but go through the list
+* anyway
+*/
+   list_for_each_entry(afu_dev, &afu->phb->bus->devices, bus_list) {
+   if (!afu_dev->driver)
+   continue;
+
+   afu_dev->error_state = state;
+
+   if (afu_dev->driver->err_handler)
+   afu_result = afu_dev->driver->err_handler->error_detected(afu_dev,
+     state);
+   /* Disconnect trumps all, NONE trumps NEED_RESET */
+   if (afu_result == PCI_ERS_RESULT_DISCONNECT)
+   result = PCI_ERS_RESULT_DISCONNECT;
+   else if ((afu_result == PCI_ERS_RESULT_NONE) &&
+    (result == PCI_ERS_RESULT_NEED_RESET))
+   result = PCI_ERS_RESULT_NONE;
+   }
+   return result;
+}
+
+static pci_ers_result_t cxl_pci_error_detected(struct pci_dev *pdev,
+  pci_channel_state_t state)
+{
+   struct cxl *adapter = pci_get_drvdata(pdev);
+   struct cxl_afu *afu;
+   pci_ers_result_t result = PCI_ERS_RESULT_NEED_RESET;
+   int i;
+
+   /* At this point, we could still have an interrupt pending.
+* Let's try to get them out of the way before they do
+* anything we don't like.
+*/
+   schedule();
+
+   /* If we're permanently dead, give up. */
+   if (state == pci_channel_io_perm_failure) {
+   /* Tell the AFU drivers; but we don't care what they
+* say, we're going away.
+*/
+   for (i = 0; i < adapter->slices; i++) {
+   afu = adapter->afu[i];
+   cxl_vphb_error_detected(afu, state);
+   }
+   return PCI_ERS_RESULT_DISCONNECT;
+   }
+
+   /* Are we reflashing?
+*
+* If we reflash, we could come back as something entirely
+* different, including a non-CAPI card. As such, by default
+* we don't participate in the process. We'll be unbound and
+* the slot re-probed. (TODO: check EEH doesn't blindly rebind
+* us!)
+*
+* However, this isn't the entire story: for reliability
+* reasons, we usually want to reflash the FPGA on PERST in
+* order to get back to a more reliable known-good state.
+*
+* This causes us a bit of a problem: if we reflash we can't
+* trust that we'll come back the same - we could have a new
+* image and been PERSTed in order to load that
+* image. However, most of the time we actually *will* come
+* back the same - for example a regular EEH event.
+*
+* Therefore, we allow the user to assert that the image is
+* indeed the same and that we should continue on into EEH
+* anyway.
+*/
+   if (adapter->perst_loads_image && !adapter->perst_same_image) {
+   /* TODO take the PHB out of CXL mode */
+   dev_info(&pdev->dev, "reflashing, so opting out of EEH!\n");
+   return PCI_ERS_RESULT_NONE;
+   }
+
+   /*
+* At this point, we want to try to recover.  We'll always
+* need a complete slot reset: we don't trust any other reset.
+*
+* Now, we go through each AFU:
+*  - We send the driver, if bound, an error_detected