[PATCH] PCI/AER: Clear uncorrectable error status for device

2018-09-18 Thread Oza Pawandeep
PCI-based device drivers handle ERR_NONFATAL by registering
pci_error_handlers. Some of the drivers clear the AER uncorrectable status
in slot_reset, while others do so in resume.

Drivers should not be responsible for clearing the AER status; instead, it
should be done by the error recovery framework defined in err.c.

Clear the status while resuming, after reset_link was successful.

Signed-off-by: Oza Pawandeep 

diff --git a/drivers/crypto/qat/qat_common/adf_aer.c 
b/drivers/crypto/qat/qat_common/adf_aer.c
index da8a2d3..61ded36 100644
--- a/drivers/crypto/qat/qat_common/adf_aer.c
+++ b/drivers/crypto/qat/qat_common/adf_aer.c
@@ -198,7 +198,6 @@ static pci_ers_result_t adf_slot_reset(struct pci_dev *pdev)
pr_err("QAT: Can't find acceleration device\n");
return PCI_ERS_RESULT_DISCONNECT;
}
-   pci_cleanup_aer_uncorrect_error_status(pdev);
if (adf_dev_aer_schedule_reset(accel_dev, ADF_DEV_RESET_SYNC))
return PCI_ERS_RESULT_DISCONNECT;
 
diff --git a/drivers/dma/ioat/init.c b/drivers/dma/ioat/init.c
index 4fa4c06..80c475f 100644
--- a/drivers/dma/ioat/init.c
+++ b/drivers/dma/ioat/init.c
@@ -1267,12 +1267,6 @@ static pci_ers_result_t 
ioat_pcie_error_slot_reset(struct pci_dev *pdev)
pci_wake_from_d3(pdev, false);
}
 
-   err = pci_cleanup_aer_uncorrect_error_status(pdev);
-   if (err) {
-   dev_err(>dev,
-   "AER uncorrect error status clear failed: %#x\n", err);
-   }
-
return result;
 }
 
diff --git a/drivers/infiniband/hw/hfi1/pcie.c 
b/drivers/infiniband/hw/hfi1/pcie.c
index baf7c32..38bc804 100644
--- a/drivers/infiniband/hw/hfi1/pcie.c
+++ b/drivers/infiniband/hw/hfi1/pcie.c
@@ -655,7 +655,6 @@ pci_resume(struct pci_dev *pdev)
struct hfi1_devdata *dd = pci_get_drvdata(pdev);
 
dd_dev_info(dd, "HFI1 resume function called\n");
-   pci_cleanup_aer_uncorrect_error_status(pdev);
/*
 * Running jobs will fail, since it's asynchronous
 * unlike sysfs-requested reset.   Better than
diff --git a/drivers/infiniband/hw/qib/qib_pcie.c 
b/drivers/infiniband/hw/qib/qib_pcie.c
index 5ac7b31..30595b3 100644
--- a/drivers/infiniband/hw/qib/qib_pcie.c
+++ b/drivers/infiniband/hw/qib/qib_pcie.c
@@ -597,7 +597,6 @@ qib_pci_resume(struct pci_dev *pdev)
struct qib_devdata *dd = pci_get_drvdata(pdev);
 
qib_devinfo(pdev, "QIB resume function called\n");
-   pci_cleanup_aer_uncorrect_error_status(pdev);
/*
 * Running jobs will fail, since it's asynchronous
 * unlike sysfs-requested reset.   Better than
diff --git a/drivers/net/ethernet/atheros/alx/main.c 
b/drivers/net/ethernet/atheros/alx/main.c
index 567ee54..0d0b6a4 100644
--- a/drivers/net/ethernet/atheros/alx/main.c
+++ b/drivers/net/ethernet/atheros/alx/main.c
@@ -1960,8 +1960,6 @@ static pci_ers_result_t alx_pci_error_slot_reset(struct 
pci_dev *pdev)
if (!alx_reset_mac(hw))
rc = PCI_ERS_RESULT_RECOVERED;
 out:
-   pci_cleanup_aer_uncorrect_error_status(pdev);
-
rtnl_unlock();
 
return rc;
diff --git a/drivers/net/ethernet/broadcom/bnx2.c 
b/drivers/net/ethernet/broadcom/bnx2.c
index 122fdb8..bbb2471 100644
--- a/drivers/net/ethernet/broadcom/bnx2.c
+++ b/drivers/net/ethernet/broadcom/bnx2.c
@@ -8793,13 +8793,6 @@ static pci_ers_result_t bnx2_io_slot_reset(struct 
pci_dev *pdev)
if (!(bp->flags & BNX2_FLAG_AER_ENABLED))
return result;
 
-   err = pci_cleanup_aer_uncorrect_error_status(pdev);
-   if (err) {
-   dev_err(>dev,
-   "pci_cleanup_aer_uncorrect_error_status failed 0x%0x\n",
-err); /* non-fatal, continue */
-   }
-
return result;
 }
 
diff --git a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_main.c 
b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_main.c
index 5b1ed24..cfb6c89 100644
--- a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_main.c
+++ b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_main.c
@@ -14379,14 +14379,6 @@ static pci_ers_result_t bnx2x_io_slot_reset(struct 
pci_dev *pdev)
 
rtnl_unlock();
 
-   /* If AER, perform cleanup of the PCIe registers */
-   if (bp->flags & AER_ENABLED) {
-   if (pci_cleanup_aer_uncorrect_error_status(pdev))
-   BNX2X_ERR("pci_cleanup_aer_uncorrect_error_status 
failed\n");
-   else
-   DP(NETIF_MSG_HW, 
"pci_cleanup_aer_uncorrect_error_status succeeded\n");
-   }
-
return PCI_ERS_RESULT_RECOVERED;
 }
 
diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt.c 
b/drivers/net/ethernet/broadcom/bnxt/bnxt.c
index 176fc9f..b4d1db9 100644
--- a/drivers/net/ethernet/broadcom/bnxt/bnxt.c
+++ b/drivers/net/ethernet/broadcom/bnxt/bnxt.c
@@ -9076,13 +9076,6 @@ static pci_ers

[PATCH] PCI/AER: Clear uncorrectable error status for device

2018-09-18 Thread Oza Pawandeep
PCI-based device drivers handle ERR_NONFATAL by registering
pci_error_handlers. Some of the drivers clear the AER uncorrectable status
in slot_reset, while others do so in resume.

Drivers should not be responsible for clearing the AER status; instead, it
should be done by the error recovery framework defined in err.c.

Clear the status while resuming, after reset_link was successful.

Signed-off-by: Oza Pawandeep 

diff --git a/drivers/crypto/qat/qat_common/adf_aer.c 
b/drivers/crypto/qat/qat_common/adf_aer.c
index da8a2d3..61ded36 100644
--- a/drivers/crypto/qat/qat_common/adf_aer.c
+++ b/drivers/crypto/qat/qat_common/adf_aer.c
@@ -198,7 +198,6 @@ static pci_ers_result_t adf_slot_reset(struct pci_dev *pdev)
pr_err("QAT: Can't find acceleration device\n");
return PCI_ERS_RESULT_DISCONNECT;
}
-   pci_cleanup_aer_uncorrect_error_status(pdev);
if (adf_dev_aer_schedule_reset(accel_dev, ADF_DEV_RESET_SYNC))
return PCI_ERS_RESULT_DISCONNECT;
 
diff --git a/drivers/dma/ioat/init.c b/drivers/dma/ioat/init.c
index 4fa4c06..80c475f 100644
--- a/drivers/dma/ioat/init.c
+++ b/drivers/dma/ioat/init.c
@@ -1267,12 +1267,6 @@ static pci_ers_result_t 
ioat_pcie_error_slot_reset(struct pci_dev *pdev)
pci_wake_from_d3(pdev, false);
}
 
-   err = pci_cleanup_aer_uncorrect_error_status(pdev);
-   if (err) {
-   dev_err(>dev,
-   "AER uncorrect error status clear failed: %#x\n", err);
-   }
-
return result;
 }
 
diff --git a/drivers/infiniband/hw/hfi1/pcie.c 
b/drivers/infiniband/hw/hfi1/pcie.c
index baf7c32..38bc804 100644
--- a/drivers/infiniband/hw/hfi1/pcie.c
+++ b/drivers/infiniband/hw/hfi1/pcie.c
@@ -655,7 +655,6 @@ pci_resume(struct pci_dev *pdev)
struct hfi1_devdata *dd = pci_get_drvdata(pdev);
 
dd_dev_info(dd, "HFI1 resume function called\n");
-   pci_cleanup_aer_uncorrect_error_status(pdev);
/*
 * Running jobs will fail, since it's asynchronous
 * unlike sysfs-requested reset.   Better than
diff --git a/drivers/infiniband/hw/qib/qib_pcie.c 
b/drivers/infiniband/hw/qib/qib_pcie.c
index 5ac7b31..30595b3 100644
--- a/drivers/infiniband/hw/qib/qib_pcie.c
+++ b/drivers/infiniband/hw/qib/qib_pcie.c
@@ -597,7 +597,6 @@ qib_pci_resume(struct pci_dev *pdev)
struct qib_devdata *dd = pci_get_drvdata(pdev);
 
qib_devinfo(pdev, "QIB resume function called\n");
-   pci_cleanup_aer_uncorrect_error_status(pdev);
/*
 * Running jobs will fail, since it's asynchronous
 * unlike sysfs-requested reset.   Better than
diff --git a/drivers/net/ethernet/atheros/alx/main.c 
b/drivers/net/ethernet/atheros/alx/main.c
index 567ee54..0d0b6a4 100644
--- a/drivers/net/ethernet/atheros/alx/main.c
+++ b/drivers/net/ethernet/atheros/alx/main.c
@@ -1960,8 +1960,6 @@ static pci_ers_result_t alx_pci_error_slot_reset(struct 
pci_dev *pdev)
if (!alx_reset_mac(hw))
rc = PCI_ERS_RESULT_RECOVERED;
 out:
-   pci_cleanup_aer_uncorrect_error_status(pdev);
-
rtnl_unlock();
 
return rc;
diff --git a/drivers/net/ethernet/broadcom/bnx2.c 
b/drivers/net/ethernet/broadcom/bnx2.c
index 122fdb8..bbb2471 100644
--- a/drivers/net/ethernet/broadcom/bnx2.c
+++ b/drivers/net/ethernet/broadcom/bnx2.c
@@ -8793,13 +8793,6 @@ static pci_ers_result_t bnx2_io_slot_reset(struct 
pci_dev *pdev)
if (!(bp->flags & BNX2_FLAG_AER_ENABLED))
return result;
 
-   err = pci_cleanup_aer_uncorrect_error_status(pdev);
-   if (err) {
-   dev_err(>dev,
-   "pci_cleanup_aer_uncorrect_error_status failed 0x%0x\n",
-err); /* non-fatal, continue */
-   }
-
return result;
 }
 
diff --git a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_main.c 
b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_main.c
index 5b1ed24..cfb6c89 100644
--- a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_main.c
+++ b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_main.c
@@ -14379,14 +14379,6 @@ static pci_ers_result_t bnx2x_io_slot_reset(struct 
pci_dev *pdev)
 
rtnl_unlock();
 
-   /* If AER, perform cleanup of the PCIe registers */
-   if (bp->flags & AER_ENABLED) {
-   if (pci_cleanup_aer_uncorrect_error_status(pdev))
-   BNX2X_ERR("pci_cleanup_aer_uncorrect_error_status 
failed\n");
-   else
-   DP(NETIF_MSG_HW, 
"pci_cleanup_aer_uncorrect_error_status succeeded\n");
-   }
-
return PCI_ERS_RESULT_RECOVERED;
 }
 
diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt.c 
b/drivers/net/ethernet/broadcom/bnxt/bnxt.c
index 176fc9f..b4d1db9 100644
--- a/drivers/net/ethernet/broadcom/bnxt/bnxt.c
+++ b/drivers/net/ethernet/broadcom/bnxt/bnxt.c
@@ -9076,13 +9076,6 @@ static pci_ers

[PATCH v2 0/6] Fix issues and cleanup for ERR_FATAL and ERR_NONFATAL

2018-06-22 Thread Oza Pawandeep
These are follow-up patches for the series which modifies ERR_FATAL handling.
They also fix a couple of problems that existed before that series.

patch-1:
Fixes the problem where ERR_FATAL and ERR_NONFATAL status should be cleared
taking severity mask into account.

patch-2:
Takes care of clearing error fatal status

patch-3:
Follow-up cleanup: the ERR_FATAL case no longer needs to be
handled.

patch-4:
Fixes clearing device status in case of uncorrectable errors.
(e.g. ERR_FATAL and ERR_NONFATAL)

patch-5:
Fixes clearing device status in case of correctable errors.

patch-6:
Follow-up cleanup: pci_channel_io_frozen no longer needs to be handled
in pcie_portdrv_slot_reset().

Oza Pawandeep (6):
  PCI/AER: Take severity mask into account while clearing error bits
  PCI/AER: Clear uncorrectable fatal error status bits
  PCI/ERR: Cleanup ERR_FATAL of error broadcast
  PCI/AER: Clear device error status bits during ERR_FATAL and
ERR_NONFATAL
  PCI/AER: Fix correctable status bits clearing in device register
  PCI/PORTDRV: Remove ERR_FATAL handling from pcie_portdrv_slot_reset()

 drivers/pci/pcie/aer.c | 35 +++
 drivers/pci/pcie/err.c | 15 +++
 drivers/pci/pcie/portdrv_pci.c | 18 --
 include/linux/aer.h|  5 +
 4 files changed, 35 insertions(+), 38 deletions(-)

-- 
2.7.4



[PATCH v2 0/6] Fix issues and cleanup for ERR_FATAL and ERR_NONFATAL

2018-06-22 Thread Oza Pawandeep
These are follow-up patches for the series which modifies ERR_FATAL handling.
They also fix a couple of problems that existed before that series.

patch-1:
Fixes the problem where ERR_FATAL and ERR_NONFATAL status should be cleared
taking severity mask into account.

patch-2:
Takes care of clearing error fatal status

patch-3:
Follow-up cleanup: the ERR_FATAL case no longer needs to be
handled.

patch-4:
Fixes clearing device status in case of uncorrectable errors.
(e.g. ERR_FATAL and ERR_NONFATAL)

patch-5:
Fixes clearing device status in case of correctable errors.

patch-6:
Follow-up cleanup: pci_channel_io_frozen no longer needs to be handled
in pcie_portdrv_slot_reset().

Oza Pawandeep (6):
  PCI/AER: Take severity mask into account while clearing error bits
  PCI/AER: Clear uncorrectable fatal error status bits
  PCI/ERR: Cleanup ERR_FATAL of error broadcast
  PCI/AER: Clear device error status bits during ERR_FATAL and
ERR_NONFATAL
  PCI/AER: Fix correctable status bits clearing in device register
  PCI/PORTDRV: Remove ERR_FATAL handling from pcie_portdrv_slot_reset()

 drivers/pci/pcie/aer.c | 35 +++
 drivers/pci/pcie/err.c | 15 +++
 drivers/pci/pcie/portdrv_pci.c | 18 --
 include/linux/aer.h|  5 +
 4 files changed, 35 insertions(+), 38 deletions(-)

-- 
2.7.4



[PATCH v2 5/6] PCI/AER: Fix correctable status bits clearing in device register

2018-06-22 Thread Oza Pawandeep
When a correctable error occurs, the Correctable Error Detected bit is set
in the Device Status Register; it should be cleared after handling
the error.

Signed-off-by: Oza Pawandeep 

diff --git a/drivers/pci/pcie/aer.c b/drivers/pci/pcie/aer.c
index d2d6868..a42b071 100644
--- a/drivers/pci/pcie/aer.c
+++ b/drivers/pci/pcie/aer.c
@@ -818,6 +818,7 @@ static void handle_error_source(struct pci_dev *dev, struct 
aer_err_info *info)
if (pos)
pci_write_config_dword(dev, pos + PCI_ERR_COR_STATUS,
info->status);
+   pci_cleanup_aer_error_device_status(dev);
} else if (info->severity == AER_NONFATAL)
pcie_do_nonfatal_recovery(dev);
else if (info->severity == AER_FATAL)
-- 
2.7.4



[PATCH v2 5/6] PCI/AER: Fix correctable status bits clearing in device register

2018-06-22 Thread Oza Pawandeep
When a correctable error occurs, the Correctable Error Detected bit is set
in the Device Status Register; it should be cleared after handling
the error.

Signed-off-by: Oza Pawandeep 

diff --git a/drivers/pci/pcie/aer.c b/drivers/pci/pcie/aer.c
index d2d6868..a42b071 100644
--- a/drivers/pci/pcie/aer.c
+++ b/drivers/pci/pcie/aer.c
@@ -818,6 +818,7 @@ static void handle_error_source(struct pci_dev *dev, struct 
aer_err_info *info)
if (pos)
pci_write_config_dword(dev, pos + PCI_ERR_COR_STATUS,
info->status);
+   pci_cleanup_aer_error_device_status(dev);
} else if (info->severity == AER_NONFATAL)
pcie_do_nonfatal_recovery(dev);
else if (info->severity == AER_FATAL)
-- 
2.7.4



[PATCH v2 3/6] PCI/ERR: Cleanup ERR_FATAL of error broadcast

2018-06-22 Thread Oza Pawandeep
ERR_FATAL is handled by resetting the Link in software, skipping the
driver pci_error_handlers callbacks, removing the devices from the PCI
subsystem, and re-enumerating, so ERR_FATAL handling is no longer
required inside broadcast_error_message().

Signed-off-by: Oza Pawandeep 

diff --git a/drivers/pci/pcie/err.c b/drivers/pci/pcie/err.c
index 00d2875..404bb69 100644
--- a/drivers/pci/pcie/err.c
+++ b/drivers/pci/pcie/err.c
@@ -259,15 +259,10 @@ static pci_ers_result_t broadcast_error_message(struct 
pci_dev *dev,
/*
 * If the error is reported by an end point, we think this
 * error is related to the upstream link of the end point.
+* the error is non fatal so the bus is ok, just invoke
+* the callback for the function that logged the error.
 */
-   if (state == pci_channel_io_normal)
-   /*
-* the error is non fatal so the bus is ok, just invoke
-* the callback for the function that logged the error.
-*/
-   cb(dev, _data);
-   else
-   pci_walk_bus(dev->bus, cb, _data);
+   cb(dev, _data);
}
 
return result_data.result;
-- 
2.7.4



[PATCH v2 3/6] PCI/ERR: Cleanup ERR_FATAL of error broadcast

2018-06-22 Thread Oza Pawandeep
ERR_FATAL is handled by resetting the Link in software, skipping the
driver pci_error_handlers callbacks, removing the devices from the PCI
subsystem, and re-enumerating, so ERR_FATAL handling is no longer
required inside broadcast_error_message().

Signed-off-by: Oza Pawandeep 

diff --git a/drivers/pci/pcie/err.c b/drivers/pci/pcie/err.c
index 00d2875..404bb69 100644
--- a/drivers/pci/pcie/err.c
+++ b/drivers/pci/pcie/err.c
@@ -259,15 +259,10 @@ static pci_ers_result_t broadcast_error_message(struct 
pci_dev *dev,
/*
 * If the error is reported by an end point, we think this
 * error is related to the upstream link of the end point.
+* the error is non fatal so the bus is ok, just invoke
+* the callback for the function that logged the error.
 */
-   if (state == pci_channel_io_normal)
-   /*
-* the error is non fatal so the bus is ok, just invoke
-* the callback for the function that logged the error.
-*/
-   cb(dev, _data);
-   else
-   pci_walk_bus(dev->bus, cb, _data);
+   cb(dev, _data);
}
 
return result_data.result;
-- 
2.7.4



[PATCH v2 6/6] PCI/PORTDRV: Remove ERR_FATAL handling from pcie_portdrv_slot_reset()

2018-06-22 Thread Oza Pawandeep
We are handling ERR_FATAL by resetting the Link in software, skipping the
driver pci_error_handlers callbacks, removing the devices from the PCI
subsystem, and re-enumerating. Because of this, there is no need to handle
the pci_channel_io_frozen case anymore.

Besides, the walk on the bus happens on subordinates, inside
broadcast_error_message(), which means that pcie_portdrv_slot_reset()
is never called for the RP, and since all the devices under this
downstream link are now removed, we can safely get rid of the ERR_FATAL
handling code in pcie_portdrv_slot_reset().

Signed-off-by: Oza Pawandeep 

diff --git a/drivers/pci/pcie/portdrv_pci.c b/drivers/pci/pcie/portdrv_pci.c
index 973f1b8..b970a6d 100644
--- a/drivers/pci/pcie/portdrv_pci.c
+++ b/drivers/pci/pcie/portdrv_pci.c
@@ -42,17 +42,6 @@ __setup("pcie_ports=", pcie_port_setup);
 
 /* global data */
 
-static int pcie_portdrv_restore_config(struct pci_dev *dev)
-{
-   int retval;
-
-   retval = pci_enable_device(dev);
-   if (retval)
-   return retval;
-   pci_set_master(dev);
-   return 0;
-}
-
 #ifdef CONFIG_PM
 static int pcie_port_runtime_suspend(struct device *dev)
 {
@@ -163,13 +152,6 @@ static pci_ers_result_t pcie_portdrv_mmio_enabled(struct 
pci_dev *dev)
 static pci_ers_result_t pcie_portdrv_slot_reset(struct pci_dev *dev)
 {
/* If fatal, restore cfg space for possible link reset at upstream */
-   if (dev->error_state == pci_channel_io_frozen) {
-   dev->state_saved = true;
-   pci_restore_state(dev);
-   pcie_portdrv_restore_config(dev);
-   pci_enable_pcie_error_reporting(dev);
-   }
-
return PCI_ERS_RESULT_RECOVERED;
 }
 
-- 
2.7.4



[PATCH v2 1/6] PCI/AER: Take severity mask into account while clearing error bits

2018-06-22 Thread Oza Pawandeep
pci_cleanup_aer_uncorrect_error_status() is called by different slot_reset
callbacks in case of ERR_NONFATAL.

AER uncorrectable error status clearing should take severity into account,
so that the ERR_NONFATAL path does not clear the bits which are marked
with severity fatal.

Signed-off-by: Oza Pawandeep 

diff --git a/drivers/pci/pcie/aer.c b/drivers/pci/pcie/aer.c
index a2e8838..d6cb1f0 100644
--- a/drivers/pci/pcie/aer.c
+++ b/drivers/pci/pcie/aer.c
@@ -360,13 +360,16 @@ EXPORT_SYMBOL_GPL(pci_disable_pcie_error_reporting);
 int pci_cleanup_aer_uncorrect_error_status(struct pci_dev *dev)
 {
int pos;
-   u32 status;
+   u32 status, mask;
 
pos = dev->aer_cap;
if (!pos)
return -EIO;
 
+   /* Clean AER Root Error Status */
pci_read_config_dword(dev, pos + PCI_ERR_UNCOR_STATUS, );
+   pci_read_config_dword(dev, pos + PCI_ERR_UNCOR_SEVER, );
+   status &= ~mask; /* Clear corresponding nonfatal bits */
if (status)
pci_write_config_dword(dev, pos + PCI_ERR_UNCOR_STATUS, status);
 
@@ -1336,8 +1339,6 @@ static pci_ers_result_t aer_root_reset(struct pci_dev 
*dev)
  */
 static void aer_error_resume(struct pci_dev *dev)
 {
-   int pos;
-   u32 status, mask;
u16 reg16;
 
/* Clean up Root device status */
@@ -1345,11 +1346,7 @@ static void aer_error_resume(struct pci_dev *dev)
pcie_capability_write_word(dev, PCI_EXP_DEVSTA, reg16);
 
/* Clean AER Root Error Status */
-   pos = dev->aer_cap;
-   pci_read_config_dword(dev, pos + PCI_ERR_UNCOR_STATUS, );
-   pci_read_config_dword(dev, pos + PCI_ERR_UNCOR_SEVER, );
-   status &= ~mask; /* Clear corresponding nonfatal bits */
-   pci_write_config_dword(dev, pos + PCI_ERR_UNCOR_STATUS, status);
+   pci_cleanup_aer_uncorrect_error_status(dev);
 }
 
 static struct pcie_port_service_driver aerdriver = {
-- 
2.7.4



[PATCH v2 1/6] PCI/AER: Take severity mask into account while clearing error bits

2018-06-22 Thread Oza Pawandeep
pci_cleanup_aer_uncorrect_error_status() is called by different slot_reset
callbacks in case of ERR_NONFATAL.

AER uncorrectable error status clearing should take severity into account,
so that the ERR_NONFATAL path does not clear the bits which are marked
with severity fatal.

Signed-off-by: Oza Pawandeep 

diff --git a/drivers/pci/pcie/aer.c b/drivers/pci/pcie/aer.c
index a2e8838..d6cb1f0 100644
--- a/drivers/pci/pcie/aer.c
+++ b/drivers/pci/pcie/aer.c
@@ -360,13 +360,16 @@ EXPORT_SYMBOL_GPL(pci_disable_pcie_error_reporting);
 int pci_cleanup_aer_uncorrect_error_status(struct pci_dev *dev)
 {
int pos;
-   u32 status;
+   u32 status, mask;
 
pos = dev->aer_cap;
if (!pos)
return -EIO;
 
+   /* Clean AER Root Error Status */
pci_read_config_dword(dev, pos + PCI_ERR_UNCOR_STATUS, );
+   pci_read_config_dword(dev, pos + PCI_ERR_UNCOR_SEVER, );
+   status &= ~mask; /* Clear corresponding nonfatal bits */
if (status)
pci_write_config_dword(dev, pos + PCI_ERR_UNCOR_STATUS, status);
 
@@ -1336,8 +1339,6 @@ static pci_ers_result_t aer_root_reset(struct pci_dev 
*dev)
  */
 static void aer_error_resume(struct pci_dev *dev)
 {
-   int pos;
-   u32 status, mask;
u16 reg16;
 
/* Clean up Root device status */
@@ -1345,11 +1346,7 @@ static void aer_error_resume(struct pci_dev *dev)
pcie_capability_write_word(dev, PCI_EXP_DEVSTA, reg16);
 
/* Clean AER Root Error Status */
-   pos = dev->aer_cap;
-   pci_read_config_dword(dev, pos + PCI_ERR_UNCOR_STATUS, );
-   pci_read_config_dword(dev, pos + PCI_ERR_UNCOR_SEVER, );
-   status &= ~mask; /* Clear corresponding nonfatal bits */
-   pci_write_config_dword(dev, pos + PCI_ERR_UNCOR_STATUS, status);
+   pci_cleanup_aer_uncorrect_error_status(dev);
 }
 
 static struct pcie_port_service_driver aerdriver = {
-- 
2.7.4



[PATCH v2 6/6] PCI/PORTDRV: Remove ERR_FATAL handling from pcie_portdrv_slot_reset()

2018-06-22 Thread Oza Pawandeep
We are handling ERR_FATAL by resetting the Link in software, skipping the
driver pci_error_handlers callbacks, removing the devices from the PCI
subsystem, and re-enumerating. Because of this, there is no need to handle
the pci_channel_io_frozen case anymore.

Besides, the walk on the bus happens on subordinates, inside
broadcast_error_message(), which means that pcie_portdrv_slot_reset()
is never called for the RP, and since all the devices under this
downstream link are now removed, we can safely get rid of the ERR_FATAL
handling code in pcie_portdrv_slot_reset().

Signed-off-by: Oza Pawandeep 

diff --git a/drivers/pci/pcie/portdrv_pci.c b/drivers/pci/pcie/portdrv_pci.c
index 973f1b8..b970a6d 100644
--- a/drivers/pci/pcie/portdrv_pci.c
+++ b/drivers/pci/pcie/portdrv_pci.c
@@ -42,17 +42,6 @@ __setup("pcie_ports=", pcie_port_setup);
 
 /* global data */
 
-static int pcie_portdrv_restore_config(struct pci_dev *dev)
-{
-   int retval;
-
-   retval = pci_enable_device(dev);
-   if (retval)
-   return retval;
-   pci_set_master(dev);
-   return 0;
-}
-
 #ifdef CONFIG_PM
 static int pcie_port_runtime_suspend(struct device *dev)
 {
@@ -163,13 +152,6 @@ static pci_ers_result_t pcie_portdrv_mmio_enabled(struct 
pci_dev *dev)
 static pci_ers_result_t pcie_portdrv_slot_reset(struct pci_dev *dev)
 {
/* If fatal, restore cfg space for possible link reset at upstream */
-   if (dev->error_state == pci_channel_io_frozen) {
-   dev->state_saved = true;
-   pci_restore_state(dev);
-   pcie_portdrv_restore_config(dev);
-   pci_enable_pcie_error_reporting(dev);
-   }
-
return PCI_ERS_RESULT_RECOVERED;
 }
 
-- 
2.7.4



[PATCH v2 2/6] PCI/AER: Clear uncorrectable fatal error status bits

2018-06-22 Thread Oza Pawandeep
During ERR_FATAL handling, AER calls pci_cleanup_aer_uncorrect_error_status
which should handle pci_channel_io_frozen case in order to determine if it
has to clear fatal bits.

Signed-off-by: Oza Pawandeep 

diff --git a/drivers/pci/pcie/aer.c b/drivers/pci/pcie/aer.c
index d6cb1f0..e9c115d 100644
--- a/drivers/pci/pcie/aer.c
+++ b/drivers/pci/pcie/aer.c
@@ -369,7 +369,12 @@ int pci_cleanup_aer_uncorrect_error_status(struct pci_dev 
*dev)
/* Clean AER Root Error Status */
pci_read_config_dword(dev, pos + PCI_ERR_UNCOR_STATUS, );
pci_read_config_dword(dev, pos + PCI_ERR_UNCOR_SEVER, );
-   status &= ~mask; /* Clear corresponding nonfatal bits */
+
+   if (dev->error_state == pci_channel_io_normal)
+   status &= ~mask; /* Clear corresponding nonfatal bits */
+   else
+   status &= mask; /* Clear corresponding fatal bits */
+
if (status)
pci_write_config_dword(dev, pos + PCI_ERR_UNCOR_STATUS, status);
 
diff --git a/drivers/pci/pcie/err.c b/drivers/pci/pcie/err.c
index f7ce0cb..00d2875 100644
--- a/drivers/pci/pcie/err.c
+++ b/drivers/pci/pcie/err.c
@@ -288,6 +288,7 @@ void pcie_do_fatal_recovery(struct pci_dev *dev, u32 
service)
struct pci_dev *pdev, *temp;
pci_ers_result_t result;
 
+   dev->error_state = pci_channel_io_frozen;
if (dev->hdr_type == PCI_HEADER_TYPE_BRIDGE)
udev = dev;
else
@@ -323,6 +324,7 @@ void pcie_do_fatal_recovery(struct pci_dev *dev, u32 
service)
if (pcie_wait_for_link(udev, true))
pci_rescan_bus(udev->bus);
pci_info(dev, "Device recovery from fatal error successful\n");
+   dev->error_state = pci_channel_io_normal;
} else {
pci_uevent_ers(dev, PCI_ERS_RESULT_DISCONNECT);
pci_info(dev, "Device recovery from fatal error failed\n");
-- 
2.7.4



[PATCH v2 2/6] PCI/AER: Clear uncorrectable fatal error status bits

2018-06-22 Thread Oza Pawandeep
During ERR_FATAL handling, AER calls pci_cleanup_aer_uncorrect_error_status
which should handle pci_channel_io_frozen case in order to determine if it
has to clear fatal bits.

Signed-off-by: Oza Pawandeep 

diff --git a/drivers/pci/pcie/aer.c b/drivers/pci/pcie/aer.c
index d6cb1f0..e9c115d 100644
--- a/drivers/pci/pcie/aer.c
+++ b/drivers/pci/pcie/aer.c
@@ -369,7 +369,12 @@ int pci_cleanup_aer_uncorrect_error_status(struct pci_dev 
*dev)
/* Clean AER Root Error Status */
pci_read_config_dword(dev, pos + PCI_ERR_UNCOR_STATUS, );
pci_read_config_dword(dev, pos + PCI_ERR_UNCOR_SEVER, );
-   status &= ~mask; /* Clear corresponding nonfatal bits */
+
+   if (dev->error_state == pci_channel_io_normal)
+   status &= ~mask; /* Clear corresponding nonfatal bits */
+   else
+   status &= mask; /* Clear corresponding fatal bits */
+
if (status)
pci_write_config_dword(dev, pos + PCI_ERR_UNCOR_STATUS, status);
 
diff --git a/drivers/pci/pcie/err.c b/drivers/pci/pcie/err.c
index f7ce0cb..00d2875 100644
--- a/drivers/pci/pcie/err.c
+++ b/drivers/pci/pcie/err.c
@@ -288,6 +288,7 @@ void pcie_do_fatal_recovery(struct pci_dev *dev, u32 
service)
struct pci_dev *pdev, *temp;
pci_ers_result_t result;
 
+   dev->error_state = pci_channel_io_frozen;
if (dev->hdr_type == PCI_HEADER_TYPE_BRIDGE)
udev = dev;
else
@@ -323,6 +324,7 @@ void pcie_do_fatal_recovery(struct pci_dev *dev, u32 
service)
if (pcie_wait_for_link(udev, true))
pci_rescan_bus(udev->bus);
pci_info(dev, "Device recovery from fatal error successful\n");
+   dev->error_state = pci_channel_io_normal;
} else {
pci_uevent_ers(dev, PCI_ERS_RESULT_DISCONNECT);
pci_info(dev, "Device recovery from fatal error failed\n");
-- 
2.7.4



[PATCH v2 4/6] PCI/AER: Clear device error status bits during ERR_FATAL and ERR_NONFATAL

2018-06-22 Thread Oza Pawandeep
In both the ERR_FATAL and ERR_NONFATAL cases, the device error status
bits need to be cleared.

Signed-off-by: Oza Pawandeep 

diff --git a/drivers/pci/pcie/aer.c b/drivers/pci/pcie/aer.c
index e9c115d..d2d6868 100644
--- a/drivers/pci/pcie/aer.c
+++ b/drivers/pci/pcie/aer.c
@@ -357,6 +357,17 @@ int pci_disable_pcie_error_reporting(struct pci_dev *dev)
 }
 EXPORT_SYMBOL_GPL(pci_disable_pcie_error_reporting);
 
+int pci_cleanup_aer_error_device_status(struct pci_dev *dev)
+{
+   u16 reg16;
+
+   /* Clean up Root device status */
+   pcie_capability_read_word(dev, PCI_EXP_DEVSTA, );
+   pcie_capability_write_word(dev, PCI_EXP_DEVSTA, reg16);
+
+   return 0;
+}
+
 int pci_cleanup_aer_uncorrect_error_status(struct pci_dev *dev)
 {
int pos;
@@ -1344,11 +1355,8 @@ static pci_ers_result_t aer_root_reset(struct pci_dev 
*dev)
  */
 static void aer_error_resume(struct pci_dev *dev)
 {
-   u16 reg16;
-
/* Clean up Root device status */
-   pcie_capability_read_word(dev, PCI_EXP_DEVSTA, );
-   pcie_capability_write_word(dev, PCI_EXP_DEVSTA, reg16);
+   pci_cleanup_aer_error_device_status(dev);
 
/* Clean AER Root Error Status */
pci_cleanup_aer_uncorrect_error_status(dev);
diff --git a/drivers/pci/pcie/err.c b/drivers/pci/pcie/err.c
index 404bb69..410c35c 100644
--- a/drivers/pci/pcie/err.c
+++ b/drivers/pci/pcie/err.c
@@ -252,6 +252,7 @@ static pci_ers_result_t broadcast_error_message(struct 
pci_dev *dev,
dev->error_state = state;
pci_walk_bus(dev->subordinate, cb, _data);
if (cb == report_resume) {
+   pci_cleanup_aer_error_device_status(dev);
pci_cleanup_aer_uncorrect_error_status(dev);
dev->error_state = pci_channel_io_normal;
}
@@ -312,6 +313,7 @@ void pcie_do_fatal_recovery(struct pci_dev *dev, u32 
service)
 * do error recovery on all subordinates of the bridge instead
 * of the bridge and clear the error status of the bridge.
 */
+   pci_cleanup_aer_error_device_status(dev);
pci_cleanup_aer_uncorrect_error_status(dev);
}
 
diff --git a/include/linux/aer.h b/include/linux/aer.h
index 514bffa..165a147 100644
--- a/include/linux/aer.h
+++ b/include/linux/aer.h
@@ -44,6 +44,7 @@ struct aer_capability_regs {
 /* PCIe port driver needs this function to enable AER */
 int pci_enable_pcie_error_reporting(struct pci_dev *dev);
 int pci_disable_pcie_error_reporting(struct pci_dev *dev);
+int pci_cleanup_aer_error_device_status(struct pci_dev *dev);
 int pci_cleanup_aer_uncorrect_error_status(struct pci_dev *dev);
 int pci_cleanup_aer_error_status_regs(struct pci_dev *dev);
 #else
@@ -55,6 +56,10 @@ static inline int pci_disable_pcie_error_reporting(struct 
pci_dev *dev)
 {
return -EINVAL;
 }
+static inline int pci_cleanup_aer_error_device_status(struct pci_dev *dev)
+{
+   return -EINVAL;
+}
 static inline int pci_cleanup_aer_uncorrect_error_status(struct pci_dev *dev)
 {
return -EINVAL;
-- 
2.7.4



[PATCH v2 4/6] PCI/AER: Clear device error status bits during ERR_FATAL and ERR_NONFATAL

2018-06-22 Thread Oza Pawandeep
In both the ERR_FATAL and ERR_NONFATAL cases, the device error status
bits need to be cleared.

Signed-off-by: Oza Pawandeep 

diff --git a/drivers/pci/pcie/aer.c b/drivers/pci/pcie/aer.c
index e9c115d..d2d6868 100644
--- a/drivers/pci/pcie/aer.c
+++ b/drivers/pci/pcie/aer.c
@@ -357,6 +357,17 @@ int pci_disable_pcie_error_reporting(struct pci_dev *dev)
 }
 EXPORT_SYMBOL_GPL(pci_disable_pcie_error_reporting);
 
+int pci_cleanup_aer_error_device_status(struct pci_dev *dev)
+{
+   u16 reg16;
+
+   /* Clean up Root device status */
+   pcie_capability_read_word(dev, PCI_EXP_DEVSTA, );
+   pcie_capability_write_word(dev, PCI_EXP_DEVSTA, reg16);
+
+   return 0;
+}
+
 int pci_cleanup_aer_uncorrect_error_status(struct pci_dev *dev)
 {
int pos;
@@ -1344,11 +1355,8 @@ static pci_ers_result_t aer_root_reset(struct pci_dev 
*dev)
  */
 static void aer_error_resume(struct pci_dev *dev)
 {
-   u16 reg16;
-
/* Clean up Root device status */
-   pcie_capability_read_word(dev, PCI_EXP_DEVSTA, );
-   pcie_capability_write_word(dev, PCI_EXP_DEVSTA, reg16);
+   pci_cleanup_aer_error_device_status(dev);
 
/* Clean AER Root Error Status */
pci_cleanup_aer_uncorrect_error_status(dev);
diff --git a/drivers/pci/pcie/err.c b/drivers/pci/pcie/err.c
index 404bb69..410c35c 100644
--- a/drivers/pci/pcie/err.c
+++ b/drivers/pci/pcie/err.c
@@ -252,6 +252,7 @@ static pci_ers_result_t broadcast_error_message(struct 
pci_dev *dev,
dev->error_state = state;
pci_walk_bus(dev->subordinate, cb, _data);
if (cb == report_resume) {
+   pci_cleanup_aer_error_device_status(dev);
pci_cleanup_aer_uncorrect_error_status(dev);
dev->error_state = pci_channel_io_normal;
}
@@ -312,6 +313,7 @@ void pcie_do_fatal_recovery(struct pci_dev *dev, u32 
service)
 * do error recovery on all subordinates of the bridge instead
 * of the bridge and clear the error status of the bridge.
 */
+   pci_cleanup_aer_error_device_status(dev);
pci_cleanup_aer_uncorrect_error_status(dev);
}
 
diff --git a/include/linux/aer.h b/include/linux/aer.h
index 514bffa..165a147 100644
--- a/include/linux/aer.h
+++ b/include/linux/aer.h
@@ -44,6 +44,7 @@ struct aer_capability_regs {
 /* PCIe port driver needs this function to enable AER */
 int pci_enable_pcie_error_reporting(struct pci_dev *dev);
 int pci_disable_pcie_error_reporting(struct pci_dev *dev);
+int pci_cleanup_aer_error_device_status(struct pci_dev *dev);
 int pci_cleanup_aer_uncorrect_error_status(struct pci_dev *dev);
 int pci_cleanup_aer_error_status_regs(struct pci_dev *dev);
 #else
@@ -55,6 +56,10 @@ static inline int pci_disable_pcie_error_reporting(struct 
pci_dev *dev)
 {
return -EINVAL;
 }
+static inline int pci_cleanup_aer_error_device_status(struct pci_dev *dev)
+{
+   return -EINVAL;
+}
 static inline int pci_cleanup_aer_uncorrect_error_status(struct pci_dev *dev)
 {
return -EINVAL;
-- 
2.7.4



[PATCH NEXT 5/6] PCI/AER: Clear correctable status bits in device register

2018-06-07 Thread Oza Pawandeep
When a correctable error occurs, the Device Status register sets the
Correctable Error Detected bit, which should be cleared after the error
has been handled.

Signed-off-by: Oza Pawandeep 

diff --git a/drivers/pci/pcie/aer/aerdrv_core.c 
b/drivers/pci/pcie/aer/aerdrv_core.c
index 95e9828..0e4e99a 100644
--- a/drivers/pci/pcie/aer/aerdrv_core.c
+++ b/drivers/pci/pcie/aer/aerdrv_core.c
@@ -271,6 +271,7 @@ static void handle_error_source(struct pcie_device *aerdev,
if (pos)
pci_write_config_dword(dev, pos + PCI_ERR_COR_STATUS,
info->status);
+   pci_cleanup_aer_error_device_status(dev);
} else if (info->severity == AER_NONFATAL)
pcie_do_nonfatal_recovery(dev);
else if (info->severity == AER_FATAL)
-- 
2.7.4



[PATCH NEXT 2/6] PCI/AER: Clear uncorrectable fatal error status bits

2018-06-07 Thread Oza Pawandeep
During ERR_FATAL handling, AER calls pci_cleanup_aer_uncorrect_error_status(),
which should handle the pci_channel_io_frozen case in order to determine
whether it has to clear the fatal bits or the nonfatal bits.

Signed-off-by: Oza Pawandeep 

diff --git a/drivers/pci/pcie/aer/aerdrv_core.c 
b/drivers/pci/pcie/aer/aerdrv_core.c
index 309f3f5..6745e37 100644
--- a/drivers/pci/pcie/aer/aerdrv_core.c
+++ b/drivers/pci/pcie/aer/aerdrv_core.c
@@ -60,7 +60,12 @@ int pci_cleanup_aer_uncorrect_error_status(struct pci_dev 
*dev)
pos = dev->aer_cap;
pci_read_config_dword(dev, pos + PCI_ERR_UNCOR_STATUS, &status);
pci_read_config_dword(dev, pos + PCI_ERR_UNCOR_SEVER, &mask);
-   status &= ~mask; /* Clear corresponding nonfatal bits */
+
+   if (dev->error_state == pci_channel_io_normal)
+   status &= ~mask; /* Clear corresponding nonfatal bits */
+   else
+   status &= mask; /* Clear corresponding fatal bits */
+
if (status)
pci_write_config_dword(dev, pos + PCI_ERR_UNCOR_STATUS, status);
 
diff --git a/drivers/pci/pcie/err.c b/drivers/pci/pcie/err.c
index f7ce0cb..00d2875 100644
--- a/drivers/pci/pcie/err.c
+++ b/drivers/pci/pcie/err.c
@@ -288,6 +288,7 @@ void pcie_do_fatal_recovery(struct pci_dev *dev, u32 
service)
struct pci_dev *pdev, *temp;
pci_ers_result_t result;
 
+   dev->error_state = pci_channel_io_frozen;
if (dev->hdr_type == PCI_HEADER_TYPE_BRIDGE)
udev = dev;
else
@@ -323,6 +324,7 @@ void pcie_do_fatal_recovery(struct pci_dev *dev, u32 
service)
if (pcie_wait_for_link(udev, true))
pci_rescan_bus(udev->bus);
pci_info(dev, "Device recovery from fatal error successful\n");
+   dev->error_state = pci_channel_io_normal;
} else {
pci_uevent_ers(dev, PCI_ERS_RESULT_DISCONNECT);
pci_info(dev, "Device recovery from fatal error failed\n");
-- 
2.7.4



[PATCH NEXT 5/6] PCI/AER: Clear correctable status bits in device register

2018-06-07 Thread Oza Pawandeep
When a correctable error occurs, the Device Status register sets the
Correctable Error Detected bit, which should be cleared after the error
has been handled.

Signed-off-by: Oza Pawandeep 

diff --git a/drivers/pci/pcie/aer/aerdrv_core.c 
b/drivers/pci/pcie/aer/aerdrv_core.c
index 95e9828..0e4e99a 100644
--- a/drivers/pci/pcie/aer/aerdrv_core.c
+++ b/drivers/pci/pcie/aer/aerdrv_core.c
@@ -271,6 +271,7 @@ static void handle_error_source(struct pcie_device *aerdev,
if (pos)
pci_write_config_dword(dev, pos + PCI_ERR_COR_STATUS,
info->status);
+   pci_cleanup_aer_error_device_status(dev);
} else if (info->severity == AER_NONFATAL)
pcie_do_nonfatal_recovery(dev);
else if (info->severity == AER_FATAL)
-- 
2.7.4



[PATCH NEXT 2/6] PCI/AER: Clear uncorrectable fatal error status bits

2018-06-07 Thread Oza Pawandeep
During ERR_FATAL handling, AER calls pci_cleanup_aer_uncorrect_error_status(),
which should handle the pci_channel_io_frozen case in order to determine
whether it has to clear the fatal bits or the nonfatal bits.

Signed-off-by: Oza Pawandeep 

diff --git a/drivers/pci/pcie/aer/aerdrv_core.c 
b/drivers/pci/pcie/aer/aerdrv_core.c
index 309f3f5..6745e37 100644
--- a/drivers/pci/pcie/aer/aerdrv_core.c
+++ b/drivers/pci/pcie/aer/aerdrv_core.c
@@ -60,7 +60,12 @@ int pci_cleanup_aer_uncorrect_error_status(struct pci_dev 
*dev)
pos = dev->aer_cap;
pci_read_config_dword(dev, pos + PCI_ERR_UNCOR_STATUS, &status);
pci_read_config_dword(dev, pos + PCI_ERR_UNCOR_SEVER, &mask);
-   status &= ~mask; /* Clear corresponding nonfatal bits */
+
+   if (dev->error_state == pci_channel_io_normal)
+   status &= ~mask; /* Clear corresponding nonfatal bits */
+   else
+   status &= mask; /* Clear corresponding fatal bits */
+
if (status)
pci_write_config_dword(dev, pos + PCI_ERR_UNCOR_STATUS, status);
 
diff --git a/drivers/pci/pcie/err.c b/drivers/pci/pcie/err.c
index f7ce0cb..00d2875 100644
--- a/drivers/pci/pcie/err.c
+++ b/drivers/pci/pcie/err.c
@@ -288,6 +288,7 @@ void pcie_do_fatal_recovery(struct pci_dev *dev, u32 
service)
struct pci_dev *pdev, *temp;
pci_ers_result_t result;
 
+   dev->error_state = pci_channel_io_frozen;
if (dev->hdr_type == PCI_HEADER_TYPE_BRIDGE)
udev = dev;
else
@@ -323,6 +324,7 @@ void pcie_do_fatal_recovery(struct pci_dev *dev, u32 
service)
if (pcie_wait_for_link(udev, true))
pci_rescan_bus(udev->bus);
pci_info(dev, "Device recovery from fatal error successful\n");
+   dev->error_state = pci_channel_io_normal;
} else {
pci_uevent_ers(dev, PCI_ERS_RESULT_DISCONNECT);
pci_info(dev, "Device recovery from fatal error failed\n");
-- 
2.7.4



[PATCH NEXT 6/6] PCI/PORTDRV: Remove ERR_FATAL handling from pcie_portdrv_slot_reset()

2018-06-07 Thread Oza Pawandeep
ERR_FATAL is now handled by resetting the Link in software, skipping the
driver pci_error_handlers callbacks, removing the devices from the PCI
subsystem, and re-enumerating. As a result, pcie_portdrv_slot_reset() is
no longer called in the ERR_FATAL case.

Signed-off-by: Oza Pawandeep 

diff --git a/drivers/pci/pcie/portdrv_pci.c b/drivers/pci/pcie/portdrv_pci.c
index 973f1b8..92f5d330 100644
--- a/drivers/pci/pcie/portdrv_pci.c
+++ b/drivers/pci/pcie/portdrv_pci.c
@@ -42,17 +42,6 @@ __setup("pcie_ports=", pcie_port_setup);
 
 /* global data */
 
-static int pcie_portdrv_restore_config(struct pci_dev *dev)
-{
-   int retval;
-
-   retval = pci_enable_device(dev);
-   if (retval)
-   return retval;
-   pci_set_master(dev);
-   return 0;
-}
-
 #ifdef CONFIG_PM
 static int pcie_port_runtime_suspend(struct device *dev)
 {
@@ -162,14 +151,6 @@ static pci_ers_result_t pcie_portdrv_mmio_enabled(struct 
pci_dev *dev)
 
 static pci_ers_result_t pcie_portdrv_slot_reset(struct pci_dev *dev)
 {
-   /* If fatal, restore cfg space for possible link reset at upstream */
-   if (dev->error_state == pci_channel_io_frozen) {
-   dev->state_saved = true;
-   pci_restore_state(dev);
-   pcie_portdrv_restore_config(dev);
-   pci_enable_pcie_error_reporting(dev);
-   }
-
return PCI_ERS_RESULT_RECOVERED;
 }
 
-- 
2.7.4



[PATCH NEXT 4/6] PCI/AER: Clear device status error bits during ERR_FATAL and ERR_NONFATAL

2018-06-07 Thread Oza Pawandeep
ERR_FATAL is now handled by resetting the Link in software, skipping the
driver pci_error_handlers callbacks, removing the devices from the PCI
subsystem, and re-enumerating. The device status therefore has to be
cleared, which fixes a bug that existed before.

Signed-off-by: Oza Pawandeep 

diff --git a/drivers/pci/pcie/aer/aerdrv.c b/drivers/pci/pcie/aer/aerdrv.c
index 8cbc62b..0d9eaba 100644
--- a/drivers/pci/pcie/aer/aerdrv.c
+++ b/drivers/pci/pcie/aer/aerdrv.c
@@ -341,12 +341,8 @@ static pci_ers_result_t aer_root_reset(struct pci_dev *dev)
  */
 static void aer_error_resume(struct pci_dev *dev)
 {
-   u16 reg16;
-
/* Clean up Root device status */
-   pcie_capability_read_word(dev, PCI_EXP_DEVSTA, &reg16);
-   pcie_capability_write_word(dev, PCI_EXP_DEVSTA, reg16);
-
+   pci_cleanup_aer_error_device_status(dev);
/* Clean AER Root Error Status */
pci_cleanup_aer_uncorrect_error_status(dev);
 }
diff --git a/drivers/pci/pcie/aer/aerdrv_core.c 
b/drivers/pci/pcie/aer/aerdrv_core.c
index 6745e37..95e9828 100644
--- a/drivers/pci/pcie/aer/aerdrv_core.c
+++ b/drivers/pci/pcie/aer/aerdrv_core.c
@@ -47,6 +47,17 @@ int pci_disable_pcie_error_reporting(struct pci_dev *dev)
 }
 EXPORT_SYMBOL_GPL(pci_disable_pcie_error_reporting);
 
+int pci_cleanup_aer_error_device_status(struct pci_dev *dev)
+{
+   u16 reg16;
+
+   /* Clean up Root device status */
+   pcie_capability_read_word(dev, PCI_EXP_DEVSTA, &reg16);
+   pcie_capability_write_word(dev, PCI_EXP_DEVSTA, reg16);
+
+   return 0;
+}
+
 int pci_cleanup_aer_uncorrect_error_status(struct pci_dev *dev)
 {
int pos;
diff --git a/drivers/pci/pcie/err.c b/drivers/pci/pcie/err.c
index 3998ed7..e1e642c 100644
--- a/drivers/pci/pcie/err.c
+++ b/drivers/pci/pcie/err.c
@@ -252,6 +252,7 @@ static pci_ers_result_t broadcast_error_message(struct 
pci_dev *dev,
dev->error_state = state;
pci_walk_bus(dev->subordinate, cb, &result_data);
if (cb == report_resume) {
+   pci_cleanup_aer_error_device_status(dev);
pci_cleanup_aer_uncorrect_error_status(dev);
dev->error_state = pci_channel_io_normal;
}
@@ -312,6 +313,7 @@ void pcie_do_fatal_recovery(struct pci_dev *dev, u32 
service)
 * do error recovery on all subordinates of the bridge instead
 * of the bridge and clear the error status of the bridge.
 */
+   pci_cleanup_aer_error_device_status(dev);
pci_cleanup_aer_uncorrect_error_status(dev);
}
 
diff --git a/include/linux/aer.h b/include/linux/aer.h
index 514bffa..165a147 100644
--- a/include/linux/aer.h
+++ b/include/linux/aer.h
@@ -44,6 +44,7 @@ struct aer_capability_regs {
 /* PCIe port driver needs this function to enable AER */
 int pci_enable_pcie_error_reporting(struct pci_dev *dev);
 int pci_disable_pcie_error_reporting(struct pci_dev *dev);
+int pci_cleanup_aer_error_device_status(struct pci_dev *dev);
 int pci_cleanup_aer_uncorrect_error_status(struct pci_dev *dev);
 int pci_cleanup_aer_error_status_regs(struct pci_dev *dev);
 #else
@@ -55,6 +56,10 @@ static inline int pci_disable_pcie_error_reporting(struct 
pci_dev *dev)
 {
return -EINVAL;
 }
+static inline int pci_cleanup_aer_error_device_status(struct pci_dev *dev)
+{
+   return -EINVAL;
+}
 static inline int pci_cleanup_aer_uncorrect_error_status(struct pci_dev *dev)
 {
return -EINVAL;
-- 
2.7.4



[PATCH NEXT 3/6] PCI/ERR: Cleanup ERR_FATAL of error broadcast

2018-06-07 Thread Oza Pawandeep
ERR_FATAL is handled by resetting the Link in software, skipping the
driver pci_error_handlers callbacks, removing the devices from the PCI
subsystem, and re-enumerating, so ERR_FATAL handling is no longer
required inside broadcast_error_message().

Signed-off-by: Oza Pawandeep 

diff --git a/drivers/pci/pcie/err.c b/drivers/pci/pcie/err.c
index 00d2875..3998ed7 100644
--- a/drivers/pci/pcie/err.c
+++ b/drivers/pci/pcie/err.c
@@ -259,15 +259,10 @@ static pci_ers_result_t broadcast_error_message(struct 
pci_dev *dev,
/*
 * If the error is reported by an end point, we think this
 * error is related to the upstream link of the end point.
+* The error is non fatal so the bus is ok, just invoke
+* the callback for the function that logged the error.
 */
-   if (state == pci_channel_io_normal)
-   /*
-* the error is non fatal so the bus is ok, just invoke
-* the callback for the function that logged the error.
-*/
-   cb(dev, &result_data);
-   else
-   pci_walk_bus(dev->bus, cb, &result_data);
+   cb(dev, &result_data);
}
 
return result_data.result;
-- 
2.7.4



[PATCH NEXT 6/6] PCI/PORTDRV: Remove ERR_FATAL handling from pcie_portdrv_slot_reset()

2018-06-07 Thread Oza Pawandeep
ERR_FATAL is now handled by resetting the Link in software, skipping the
driver pci_error_handlers callbacks, removing the devices from the PCI
subsystem, and re-enumerating. As a result, pcie_portdrv_slot_reset() is
no longer called in the ERR_FATAL case.

Signed-off-by: Oza Pawandeep 

diff --git a/drivers/pci/pcie/portdrv_pci.c b/drivers/pci/pcie/portdrv_pci.c
index 973f1b8..92f5d330 100644
--- a/drivers/pci/pcie/portdrv_pci.c
+++ b/drivers/pci/pcie/portdrv_pci.c
@@ -42,17 +42,6 @@ __setup("pcie_ports=", pcie_port_setup);
 
 /* global data */
 
-static int pcie_portdrv_restore_config(struct pci_dev *dev)
-{
-   int retval;
-
-   retval = pci_enable_device(dev);
-   if (retval)
-   return retval;
-   pci_set_master(dev);
-   return 0;
-}
-
 #ifdef CONFIG_PM
 static int pcie_port_runtime_suspend(struct device *dev)
 {
@@ -162,14 +151,6 @@ static pci_ers_result_t pcie_portdrv_mmio_enabled(struct 
pci_dev *dev)
 
 static pci_ers_result_t pcie_portdrv_slot_reset(struct pci_dev *dev)
 {
-   /* If fatal, restore cfg space for possible link reset at upstream */
-   if (dev->error_state == pci_channel_io_frozen) {
-   dev->state_saved = true;
-   pci_restore_state(dev);
-   pcie_portdrv_restore_config(dev);
-   pci_enable_pcie_error_reporting(dev);
-   }
-
return PCI_ERS_RESULT_RECOVERED;
 }
 
-- 
2.7.4



[PATCH NEXT 4/6] PCI/AER: Clear device status error bits during ERR_FATAL and ERR_NONFATAL

2018-06-07 Thread Oza Pawandeep
ERR_FATAL is now handled by resetting the Link in software, skipping the
driver pci_error_handlers callbacks, removing the devices from the PCI
subsystem, and re-enumerating. The device status therefore has to be
cleared, which fixes a bug that existed before.

Signed-off-by: Oza Pawandeep 

diff --git a/drivers/pci/pcie/aer/aerdrv.c b/drivers/pci/pcie/aer/aerdrv.c
index 8cbc62b..0d9eaba 100644
--- a/drivers/pci/pcie/aer/aerdrv.c
+++ b/drivers/pci/pcie/aer/aerdrv.c
@@ -341,12 +341,8 @@ static pci_ers_result_t aer_root_reset(struct pci_dev *dev)
  */
 static void aer_error_resume(struct pci_dev *dev)
 {
-   u16 reg16;
-
/* Clean up Root device status */
-   pcie_capability_read_word(dev, PCI_EXP_DEVSTA, &reg16);
-   pcie_capability_write_word(dev, PCI_EXP_DEVSTA, reg16);
-
+   pci_cleanup_aer_error_device_status(dev);
/* Clean AER Root Error Status */
pci_cleanup_aer_uncorrect_error_status(dev);
 }
diff --git a/drivers/pci/pcie/aer/aerdrv_core.c 
b/drivers/pci/pcie/aer/aerdrv_core.c
index 6745e37..95e9828 100644
--- a/drivers/pci/pcie/aer/aerdrv_core.c
+++ b/drivers/pci/pcie/aer/aerdrv_core.c
@@ -47,6 +47,17 @@ int pci_disable_pcie_error_reporting(struct pci_dev *dev)
 }
 EXPORT_SYMBOL_GPL(pci_disable_pcie_error_reporting);
 
+int pci_cleanup_aer_error_device_status(struct pci_dev *dev)
+{
+   u16 reg16;
+
+   /* Clean up Root device status */
+   pcie_capability_read_word(dev, PCI_EXP_DEVSTA, &reg16);
+   pcie_capability_write_word(dev, PCI_EXP_DEVSTA, reg16);
+
+   return 0;
+}
+
 int pci_cleanup_aer_uncorrect_error_status(struct pci_dev *dev)
 {
int pos;
diff --git a/drivers/pci/pcie/err.c b/drivers/pci/pcie/err.c
index 3998ed7..e1e642c 100644
--- a/drivers/pci/pcie/err.c
+++ b/drivers/pci/pcie/err.c
@@ -252,6 +252,7 @@ static pci_ers_result_t broadcast_error_message(struct 
pci_dev *dev,
dev->error_state = state;
pci_walk_bus(dev->subordinate, cb, &result_data);
if (cb == report_resume) {
+   pci_cleanup_aer_error_device_status(dev);
pci_cleanup_aer_uncorrect_error_status(dev);
dev->error_state = pci_channel_io_normal;
}
@@ -312,6 +313,7 @@ void pcie_do_fatal_recovery(struct pci_dev *dev, u32 
service)
 * do error recovery on all subordinates of the bridge instead
 * of the bridge and clear the error status of the bridge.
 */
+   pci_cleanup_aer_error_device_status(dev);
pci_cleanup_aer_uncorrect_error_status(dev);
}
 
diff --git a/include/linux/aer.h b/include/linux/aer.h
index 514bffa..165a147 100644
--- a/include/linux/aer.h
+++ b/include/linux/aer.h
@@ -44,6 +44,7 @@ struct aer_capability_regs {
 /* PCIe port driver needs this function to enable AER */
 int pci_enable_pcie_error_reporting(struct pci_dev *dev);
 int pci_disable_pcie_error_reporting(struct pci_dev *dev);
+int pci_cleanup_aer_error_device_status(struct pci_dev *dev);
 int pci_cleanup_aer_uncorrect_error_status(struct pci_dev *dev);
 int pci_cleanup_aer_error_status_regs(struct pci_dev *dev);
 #else
@@ -55,6 +56,10 @@ static inline int pci_disable_pcie_error_reporting(struct 
pci_dev *dev)
 {
return -EINVAL;
 }
+static inline int pci_cleanup_aer_error_device_status(struct pci_dev *dev)
+{
+   return -EINVAL;
+}
 static inline int pci_cleanup_aer_uncorrect_error_status(struct pci_dev *dev)
 {
return -EINVAL;
-- 
2.7.4



[PATCH NEXT 3/6] PCI/ERR: Cleanup ERR_FATAL of error broadcast

2018-06-07 Thread Oza Pawandeep
ERR_FATAL is handled by resetting the Link in software, skipping the
driver pci_error_handlers callbacks, removing the devices from the PCI
subsystem, and re-enumerating, so ERR_FATAL handling is no longer
required inside broadcast_error_message().

Signed-off-by: Oza Pawandeep 

diff --git a/drivers/pci/pcie/err.c b/drivers/pci/pcie/err.c
index 00d2875..3998ed7 100644
--- a/drivers/pci/pcie/err.c
+++ b/drivers/pci/pcie/err.c
@@ -259,15 +259,10 @@ static pci_ers_result_t broadcast_error_message(struct 
pci_dev *dev,
/*
 * If the error is reported by an end point, we think this
 * error is related to the upstream link of the end point.
+* The error is non fatal so the bus is ok, just invoke
+* the callback for the function that logged the error.
 */
-   if (state == pci_channel_io_normal)
-   /*
-* the error is non fatal so the bus is ok, just invoke
-* the callback for the function that logged the error.
-*/
-   cb(dev, &result_data);
-   else
-   pci_walk_bus(dev->bus, cb, &result_data);
+   cb(dev, &result_data);
}
 
return result_data.result;
-- 
2.7.4



[PATCH NEXT 1/6] PCI/AER: Take mask into account while clearing error bits

2018-06-07 Thread Oza Pawandeep
PCIe ERR_NONFATAL and ERR_FATAL are uncorrectable errors, and clearing
uncorrectable error bits should take the error mask into account.

Signed-off-by: Oza Pawandeep 

diff --git a/drivers/pci/pcie/aer/aerdrv.c b/drivers/pci/pcie/aer/aerdrv.c
index 377e576..8cbc62b 100644
--- a/drivers/pci/pcie/aer/aerdrv.c
+++ b/drivers/pci/pcie/aer/aerdrv.c
@@ -341,8 +341,6 @@ static pci_ers_result_t aer_root_reset(struct pci_dev *dev)
  */
 static void aer_error_resume(struct pci_dev *dev)
 {
-   int pos;
-   u32 status, mask;
u16 reg16;
 
/* Clean up Root device status */
@@ -350,11 +348,7 @@ static void aer_error_resume(struct pci_dev *dev)
pcie_capability_write_word(dev, PCI_EXP_DEVSTA, reg16);
 
/* Clean AER Root Error Status */
-   pos = dev->aer_cap;
-   pci_read_config_dword(dev, pos + PCI_ERR_UNCOR_STATUS, &status);
-   pci_read_config_dword(dev, pos + PCI_ERR_UNCOR_SEVER, &mask);
-   status &= ~mask; /* Clear corresponding nonfatal bits */
-   pci_write_config_dword(dev, pos + PCI_ERR_UNCOR_STATUS, status);
+   pci_cleanup_aer_uncorrect_error_status(dev);
 }
 
 /**
diff --git a/drivers/pci/pcie/aer/aerdrv_core.c 
b/drivers/pci/pcie/aer/aerdrv_core.c
index 946f3f6..309f3f5 100644
--- a/drivers/pci/pcie/aer/aerdrv_core.c
+++ b/drivers/pci/pcie/aer/aerdrv_core.c
@@ -50,13 +50,17 @@ EXPORT_SYMBOL_GPL(pci_disable_pcie_error_reporting);
 int pci_cleanup_aer_uncorrect_error_status(struct pci_dev *dev)
 {
int pos;
-   u32 status;
+   u32 status, mask;
 
pos = dev->aer_cap;
if (!pos)
return -EIO;
 
+   /* Clean AER Root Error Status */
+   pos = dev->aer_cap;
pci_read_config_dword(dev, pos + PCI_ERR_UNCOR_STATUS, &status);
+   pci_read_config_dword(dev, pos + PCI_ERR_UNCOR_SEVER, &mask);
+   status &= ~mask; /* Clear corresponding nonfatal bits */
if (status)
pci_write_config_dword(dev, pos + PCI_ERR_UNCOR_STATUS, status);
 
-- 
2.7.4



[PATCH NEXT 1/6] PCI/AER: Take mask into account while clearing error bits

2018-06-07 Thread Oza Pawandeep
PCIe ERR_NONFATAL and ERR_FATAL are uncorrectable errors, and clearing
uncorrectable error bits should take the error mask into account.

Signed-off-by: Oza Pawandeep 

diff --git a/drivers/pci/pcie/aer/aerdrv.c b/drivers/pci/pcie/aer/aerdrv.c
index 377e576..8cbc62b 100644
--- a/drivers/pci/pcie/aer/aerdrv.c
+++ b/drivers/pci/pcie/aer/aerdrv.c
@@ -341,8 +341,6 @@ static pci_ers_result_t aer_root_reset(struct pci_dev *dev)
  */
 static void aer_error_resume(struct pci_dev *dev)
 {
-   int pos;
-   u32 status, mask;
u16 reg16;
 
/* Clean up Root device status */
@@ -350,11 +348,7 @@ static void aer_error_resume(struct pci_dev *dev)
pcie_capability_write_word(dev, PCI_EXP_DEVSTA, reg16);
 
/* Clean AER Root Error Status */
-   pos = dev->aer_cap;
-   pci_read_config_dword(dev, pos + PCI_ERR_UNCOR_STATUS, &status);
-   pci_read_config_dword(dev, pos + PCI_ERR_UNCOR_SEVER, &mask);
-   status &= ~mask; /* Clear corresponding nonfatal bits */
-   pci_write_config_dword(dev, pos + PCI_ERR_UNCOR_STATUS, status);
+   pci_cleanup_aer_uncorrect_error_status(dev);
 }
 
 /**
diff --git a/drivers/pci/pcie/aer/aerdrv_core.c 
b/drivers/pci/pcie/aer/aerdrv_core.c
index 946f3f6..309f3f5 100644
--- a/drivers/pci/pcie/aer/aerdrv_core.c
+++ b/drivers/pci/pcie/aer/aerdrv_core.c
@@ -50,13 +50,17 @@ EXPORT_SYMBOL_GPL(pci_disable_pcie_error_reporting);
 int pci_cleanup_aer_uncorrect_error_status(struct pci_dev *dev)
 {
int pos;
-   u32 status;
+   u32 status, mask;
 
pos = dev->aer_cap;
if (!pos)
return -EIO;
 
+   /* Clean AER Root Error Status */
+   pos = dev->aer_cap;
pci_read_config_dword(dev, pos + PCI_ERR_UNCOR_STATUS, &status);
+   pci_read_config_dword(dev, pos + PCI_ERR_UNCOR_SEVER, &mask);
+   status &= ~mask; /* Clear corresponding nonfatal bits */
if (status)
pci_write_config_dword(dev, pos + PCI_ERR_UNCOR_STATUS, status);
 
-- 
2.7.4



[PATCH v17 4/9] PCI/AER: Factor out error reporting to drivers/pci/pcie/err.c

2018-05-17 Thread Oza Pawandeep
Move the error reporting callbacks from aerdrv_core.c to err.c, where they
can be used by DPC in addition to AER.

As part of aerdrv_core.c, these callbacks were built under CONFIG_PCIEAER.
Moving them to the new err.c means they will now be built under
CONFIG_PCIEPORTBUS, so adjust the definition of pci_uevent_ers() to match.

Signed-off-by: Oza Pawandeep <p...@codeaurora.org>
[bhelgaas: in reset_link(), initialize "driver" even if CONFIG_PCIEAER is
unset, update pci_uevent_ers() #ifdef wrapper]
Signed-off-by: Bjorn Helgaas <helg...@kernel.org>

diff --git a/drivers/pci/pci-driver.c b/drivers/pci/pci-driver.c
index 6ace470..ffb9564 100644
--- a/drivers/pci/pci-driver.c
+++ b/drivers/pci/pci-driver.c
@@ -1535,7 +1535,7 @@ static int pci_uevent(struct device *dev, struct 
kobj_uevent_env *env)
return 0;
 }
 
-#if defined(CONFIG_PCIEAER) || defined(CONFIG_EEH)
+#if defined(CONFIG_PCIEPORTBUS) || defined(CONFIG_EEH)
 /**
  * pci_uevent_ers - emit a uevent during recovery path of PCI device
  * @pdev: PCI device undergoing error recovery
diff --git a/drivers/pci/pci.h b/drivers/pci/pci.h
index cec9d8c..5e8857a 100644
--- a/drivers/pci/pci.h
+++ b/drivers/pci/pci.h
@@ -353,6 +353,10 @@ static inline resource_size_t 
pci_resource_alignment(struct pci_dev *dev,
 
 void pci_enable_acs(struct pci_dev *dev);
 
+/* PCI error reporting and recovery */
+void pcie_do_fatal_recovery(struct pci_dev *dev);
+void pcie_do_nonfatal_recovery(struct pci_dev *dev);
+
 bool pcie_wait_for_link(struct pci_dev *pdev, bool active);
 #ifdef CONFIG_PCIEASPM
 void pcie_aspm_init_link_state(struct pci_dev *pdev);
diff --git a/drivers/pci/pcie/Makefile b/drivers/pci/pcie/Makefile
index 800e1d4..03f4e0b 100644
--- a/drivers/pci/pcie/Makefile
+++ b/drivers/pci/pcie/Makefile
@@ -2,7 +2,7 @@
 #
 # Makefile for PCI Express features and port driver
 
-pcieportdrv-y  := portdrv_core.o portdrv_pci.o
+pcieportdrv-y  := portdrv_core.o portdrv_pci.o err.o
 
 obj-$(CONFIG_PCIEPORTBUS)  += pcieportdrv.o
 
diff --git a/drivers/pci/pcie/aer/aerdrv.h b/drivers/pci/pcie/aer/aerdrv.h
index 08b4584..b4c9506 100644
--- a/drivers/pci/pcie/aer/aerdrv.h
+++ b/drivers/pci/pcie/aer/aerdrv.h
@@ -76,36 +76,6 @@ struct aer_rpc {
 */
 };
 
-struct aer_broadcast_data {
-   enum pci_channel_state state;
-   enum pci_ers_result result;
-};
-
-static inline pci_ers_result_t merge_result(enum pci_ers_result orig,
-   enum pci_ers_result new)
-{
-   if (new == PCI_ERS_RESULT_NO_AER_DRIVER)
-   return PCI_ERS_RESULT_NO_AER_DRIVER;
-
-   if (new == PCI_ERS_RESULT_NONE)
-   return orig;
-
-   switch (orig) {
-   case PCI_ERS_RESULT_CAN_RECOVER:
-   case PCI_ERS_RESULT_RECOVERED:
-   orig = new;
-   break;
-   case PCI_ERS_RESULT_DISCONNECT:
-   if (new == PCI_ERS_RESULT_NEED_RESET)
-   orig = PCI_ERS_RESULT_NEED_RESET;
-   break;
-   default:
-   break;
-   }
-
-   return orig;
-}
-
 extern struct bus_type pcie_port_bus_type;
 void aer_isr(struct work_struct *work);
 void aer_print_error(struct pci_dev *dev, struct aer_err_info *info);
diff --git a/drivers/pci/pcie/aer/aerdrv_core.c 
b/drivers/pci/pcie/aer/aerdrv_core.c
index a2d7cc7..4fa1ee4 100644
--- a/drivers/pci/pcie/aer/aerdrv_core.c
+++ b/drivers/pci/pcie/aer/aerdrv_core.c
@@ -228,191 +228,6 @@ static bool find_source_device(struct pci_dev *parent,
return true;
 }
 
-static int report_error_detected(struct pci_dev *dev, void *data)
-{
-   pci_ers_result_t vote;
-   const struct pci_error_handlers *err_handler;
-   struct aer_broadcast_data *result_data;
-   result_data = (struct aer_broadcast_data *) data;
-
-   device_lock(&dev->dev);
-   dev->error_state = result_data->state;
-
-   if (!dev->driver ||
-   !dev->driver->err_handler ||
-   !dev->driver->err_handler->error_detected) {
-   if (result_data->state == pci_channel_io_frozen &&
-   dev->hdr_type != PCI_HEADER_TYPE_BRIDGE) {
-   /*
-* In case of fatal recovery, if one of down-
-* stream device has no driver. We might be
-* unable to recover because a later insmod
-* of a driver for this device is unaware of
-* its hw state.
-*/
-   pci_printk(KERN_DEBUG, dev, "device has %s\n",
-  dev->driver ?
-  "no AER-aware driver" : "no driver");
-   }
-
-   /*
-* If there's any device in the subtree that does not
-* have an err

[PATCH v17 1/9] PCI: Add generic pcie_wait_for_link() interface

2018-05-17 Thread Oza Pawandeep
Clients such as hotplug and Downstream Port Containment (DPC) both need to
wait until a link becomes active or inactive.

Add a generic pcie_wait_for_link() interface and use it instead of
duplicating the code.

Signed-off-by: Oza Pawandeep <p...@codeaurora.org>
Signed-off-by: Bjorn Helgaas <helg...@kernel.org>
Reviewed-by: Keith Busch <keith.bu...@intel.com>

diff --git a/drivers/pci/hotplug/pciehp_hpc.c b/drivers/pci/hotplug/pciehp_hpc.c
index 18a42f8..e0c2b8e 100644
--- a/drivers/pci/hotplug/pciehp_hpc.c
+++ b/drivers/pci/hotplug/pciehp_hpc.c
@@ -231,25 +231,11 @@ bool pciehp_check_link_active(struct controller *ctrl)
return ret;
 }
 
-static void __pcie_wait_link_active(struct controller *ctrl, bool active)
-{
-   int timeout = 1000;
-
-   if (pciehp_check_link_active(ctrl) == active)
-   return;
-   while (timeout > 0) {
-   msleep(10);
-   timeout -= 10;
-   if (pciehp_check_link_active(ctrl) == active)
-   return;
-   }
-   ctrl_dbg(ctrl, "Data Link Layer Link Active not %s in 1000 msec\n",
-   active ? "set" : "cleared");
-}
-
 static void pcie_wait_link_active(struct controller *ctrl)
 {
-   __pcie_wait_link_active(ctrl, true);
+   struct pci_dev *pdev = ctrl_dev(ctrl);
+
+   pcie_wait_for_link(pdev, true);
 }
 
 static bool pci_bus_check_dev(struct pci_bus *bus, int devfn)
diff --git a/drivers/pci/pci.c b/drivers/pci/pci.c
index e597655..764bf64 100644
--- a/drivers/pci/pci.c
+++ b/drivers/pci/pci.c
@@ -4138,6 +4138,35 @@ static int pci_pm_reset(struct pci_dev *dev, int probe)
 
return pci_dev_wait(dev, "PM D3->D0", PCIE_RESET_READY_POLL_MS);
 }
+/**
+ * pcie_wait_for_link - Wait until link is active or inactive
+ * @pdev: Bridge device
+ * @active: waiting for active or inactive?
+ *
+ * Use this to wait till link becomes active or inactive.
+ */
+bool pcie_wait_for_link(struct pci_dev *pdev, bool active)
+{
+   int timeout = 1000;
+   bool ret;
+   u16 lnk_status;
+
+   for (;;) {
+   pcie_capability_read_word(pdev, PCI_EXP_LNKSTA, &lnk_status);
+   ret = !!(lnk_status & PCI_EXP_LNKSTA_DLLLA);
+   if (ret == active)
+   return true;
+   if (timeout <= 0)
+   break;
+   msleep(10);
+   timeout -= 10;
+   }
+
+   pci_info(pdev, "Data Link Layer Link Active not %s in 1000 msec\n",
+active ? "set" : "cleared");
+
+   return false;
+}
 
 void pci_reset_secondary_bus(struct pci_dev *dev)
 {
diff --git a/drivers/pci/pci.h b/drivers/pci/pci.h
index 023f7cf..cec9d8c 100644
--- a/drivers/pci/pci.h
+++ b/drivers/pci/pci.h
@@ -353,6 +353,7 @@ static inline resource_size_t pci_resource_alignment(struct 
pci_dev *dev,
 
 void pci_enable_acs(struct pci_dev *dev);
 
+bool pcie_wait_for_link(struct pci_dev *pdev, bool active);
 #ifdef CONFIG_PCIEASPM
 void pcie_aspm_init_link_state(struct pci_dev *pdev);
 void pcie_aspm_exit_link_state(struct pci_dev *pdev);
diff --git a/drivers/pci/pcie/dpc.c b/drivers/pci/pcie/dpc.c
index 8c57d60..80ec384 100644
--- a/drivers/pci/pcie/dpc.c
+++ b/drivers/pci/pcie/dpc.c
@@ -68,19 +68,9 @@ static int dpc_wait_rp_inactive(struct dpc_dev *dpc)
 
 static void dpc_wait_link_inactive(struct dpc_dev *dpc)
 {
-   unsigned long timeout = jiffies + HZ;
struct pci_dev *pdev = dpc->dev->port;
-   struct device *dev = &dpc->dev->device;
-   u16 lnk_status;
 
-   pcie_capability_read_word(pdev, PCI_EXP_LNKSTA, &lnk_status);
-   while (lnk_status & PCI_EXP_LNKSTA_DLLLA &&
-   !time_after(jiffies, timeout)) {
-   msleep(10);
-   pcie_capability_read_word(pdev, PCI_EXP_LNKSTA, &lnk_status);
-   }
-   if (lnk_status & PCI_EXP_LNKSTA_DLLLA)
-   dev_warn(dev, "Link state not disabled for DPC event\n");
+   pcie_wait_for_link(pdev, false);
 }
 
 static void dpc_work(struct work_struct *work)
-- 
2.7.4



[PATCH v17 4/9] PCI/AER: Factor out error reporting to drivers/pci/pcie/err.c

2018-05-17 Thread Oza Pawandeep
Move the error reporting callbacks from aerdrv_core.c to err.c, where they
can be used by DPC in addition to AER.

As part of aerdrv_core.c, these callbacks were built under CONFIG_PCIEAER.
Moving them to the new err.c means they will now be built under
CONFIG_PCIEPORTBUS, so adjust the definition of pci_uevent_ers() to match.

Signed-off-by: Oza Pawandeep 
[bhelgaas: in reset_link(), initialize "driver" even if CONFIG_PCIEAER is
unset, update pci_uevent_ers() #ifdef wrapper]
Signed-off-by: Bjorn Helgaas 

diff --git a/drivers/pci/pci-driver.c b/drivers/pci/pci-driver.c
index 6ace470..ffb9564 100644
--- a/drivers/pci/pci-driver.c
+++ b/drivers/pci/pci-driver.c
@@ -1535,7 +1535,7 @@ static int pci_uevent(struct device *dev, struct 
kobj_uevent_env *env)
return 0;
 }
 
-#if defined(CONFIG_PCIEAER) || defined(CONFIG_EEH)
+#if defined(CONFIG_PCIEPORTBUS) || defined(CONFIG_EEH)
 /**
  * pci_uevent_ers - emit a uevent during recovery path of PCI device
  * @pdev: PCI device undergoing error recovery
diff --git a/drivers/pci/pci.h b/drivers/pci/pci.h
index cec9d8c..5e8857a 100644
--- a/drivers/pci/pci.h
+++ b/drivers/pci/pci.h
@@ -353,6 +353,10 @@ static inline resource_size_t 
pci_resource_alignment(struct pci_dev *dev,
 
 void pci_enable_acs(struct pci_dev *dev);
 
+/* PCI error reporting and recovery */
+void pcie_do_fatal_recovery(struct pci_dev *dev);
+void pcie_do_nonfatal_recovery(struct pci_dev *dev);
+
 bool pcie_wait_for_link(struct pci_dev *pdev, bool active);
 #ifdef CONFIG_PCIEASPM
 void pcie_aspm_init_link_state(struct pci_dev *pdev);
diff --git a/drivers/pci/pcie/Makefile b/drivers/pci/pcie/Makefile
index 800e1d4..03f4e0b 100644
--- a/drivers/pci/pcie/Makefile
+++ b/drivers/pci/pcie/Makefile
@@ -2,7 +2,7 @@
 #
 # Makefile for PCI Express features and port driver
 
-pcieportdrv-y  := portdrv_core.o portdrv_pci.o
+pcieportdrv-y  := portdrv_core.o portdrv_pci.o err.o
 
 obj-$(CONFIG_PCIEPORTBUS)  += pcieportdrv.o
 
diff --git a/drivers/pci/pcie/aer/aerdrv.h b/drivers/pci/pcie/aer/aerdrv.h
index 08b4584..b4c9506 100644
--- a/drivers/pci/pcie/aer/aerdrv.h
+++ b/drivers/pci/pcie/aer/aerdrv.h
@@ -76,36 +76,6 @@ struct aer_rpc {
 */
 };
 
-struct aer_broadcast_data {
-   enum pci_channel_state state;
-   enum pci_ers_result result;
-};
-
-static inline pci_ers_result_t merge_result(enum pci_ers_result orig,
-   enum pci_ers_result new)
-{
-   if (new == PCI_ERS_RESULT_NO_AER_DRIVER)
-   return PCI_ERS_RESULT_NO_AER_DRIVER;
-
-   if (new == PCI_ERS_RESULT_NONE)
-   return orig;
-
-   switch (orig) {
-   case PCI_ERS_RESULT_CAN_RECOVER:
-   case PCI_ERS_RESULT_RECOVERED:
-   orig = new;
-   break;
-   case PCI_ERS_RESULT_DISCONNECT:
-   if (new == PCI_ERS_RESULT_NEED_RESET)
-   orig = PCI_ERS_RESULT_NEED_RESET;
-   break;
-   default:
-   break;
-   }
-
-   return orig;
-}
-
 extern struct bus_type pcie_port_bus_type;
 void aer_isr(struct work_struct *work);
 void aer_print_error(struct pci_dev *dev, struct aer_err_info *info);
diff --git a/drivers/pci/pcie/aer/aerdrv_core.c 
b/drivers/pci/pcie/aer/aerdrv_core.c
index a2d7cc7..4fa1ee4 100644
--- a/drivers/pci/pcie/aer/aerdrv_core.c
+++ b/drivers/pci/pcie/aer/aerdrv_core.c
@@ -228,191 +228,6 @@ static bool find_source_device(struct pci_dev *parent,
return true;
 }
 
-static int report_error_detected(struct pci_dev *dev, void *data)
-{
-   pci_ers_result_t vote;
-   const struct pci_error_handlers *err_handler;
-   struct aer_broadcast_data *result_data;
-   result_data = (struct aer_broadcast_data *) data;
-
-   device_lock(&dev->dev);
-   dev->error_state = result_data->state;
-
-   if (!dev->driver ||
-   !dev->driver->err_handler ||
-   !dev->driver->err_handler->error_detected) {
-   if (result_data->state == pci_channel_io_frozen &&
-   dev->hdr_type != PCI_HEADER_TYPE_BRIDGE) {
-   /*
-* In case of fatal recovery, if one of down-
-* stream device has no driver. We might be
-* unable to recover because a later insmod
-* of a driver for this device is unaware of
-* its hw state.
-*/
-   pci_printk(KERN_DEBUG, dev, "device has %s\n",
-  dev->driver ?
-  "no AER-aware driver" : "no driver");
-   }
-
-   /*
-* If there's any device in the subtree that does not
-* have an error_detected callback, returning
-* PCI_ERS_RESULT_NO_AER_

[PATCH v17 1/9] PCI: Add generic pcie_wait_for_link() interface

2018-05-17 Thread Oza Pawandeep
Clients such as hotplug and Downstream Port Containment (DPC) both need to
wait until a link becomes active or inactive.

Add a generic pcie_wait_for_link() interface and use it instead of
duplicating the code.

Signed-off-by: Oza Pawandeep 
Signed-off-by: Bjorn Helgaas 
Reviewed-by: Keith Busch 

diff --git a/drivers/pci/hotplug/pciehp_hpc.c b/drivers/pci/hotplug/pciehp_hpc.c
index 18a42f8..e0c2b8e 100644
--- a/drivers/pci/hotplug/pciehp_hpc.c
+++ b/drivers/pci/hotplug/pciehp_hpc.c
@@ -231,25 +231,11 @@ bool pciehp_check_link_active(struct controller *ctrl)
return ret;
 }
 
-static void __pcie_wait_link_active(struct controller *ctrl, bool active)
-{
-   int timeout = 1000;
-
-   if (pciehp_check_link_active(ctrl) == active)
-   return;
-   while (timeout > 0) {
-   msleep(10);
-   timeout -= 10;
-   if (pciehp_check_link_active(ctrl) == active)
-   return;
-   }
-   ctrl_dbg(ctrl, "Data Link Layer Link Active not %s in 1000 msec\n",
-   active ? "set" : "cleared");
-}
-
 static void pcie_wait_link_active(struct controller *ctrl)
 {
-   __pcie_wait_link_active(ctrl, true);
+   struct pci_dev *pdev = ctrl_dev(ctrl);
+
+   pcie_wait_for_link(pdev, true);
 }
 
 static bool pci_bus_check_dev(struct pci_bus *bus, int devfn)
diff --git a/drivers/pci/pci.c b/drivers/pci/pci.c
index e597655..764bf64 100644
--- a/drivers/pci/pci.c
+++ b/drivers/pci/pci.c
@@ -4138,6 +4138,35 @@ static int pci_pm_reset(struct pci_dev *dev, int probe)
 
return pci_dev_wait(dev, "PM D3->D0", PCIE_RESET_READY_POLL_MS);
 }
+/**
+ * pcie_wait_for_link - Wait until link is active or inactive
+ * @pdev: Bridge device
+ * @active: waiting for active or inactive?
+ *
+ * Use this to wait till link becomes active or inactive.
+ */
+bool pcie_wait_for_link(struct pci_dev *pdev, bool active)
+{
+   int timeout = 1000;
+   bool ret;
+   u16 lnk_status;
+
+   for (;;) {
+   pcie_capability_read_word(pdev, PCI_EXP_LNKSTA, &lnk_status);
+   ret = !!(lnk_status & PCI_EXP_LNKSTA_DLLLA);
+   if (ret == active)
+   return true;
+   if (timeout <= 0)
+   break;
+   msleep(10);
+   timeout -= 10;
+   }
+
+   pci_info(pdev, "Data Link Layer Link Active not %s in 1000 msec\n",
+active ? "set" : "cleared");
+
+   return false;
+}
 
 void pci_reset_secondary_bus(struct pci_dev *dev)
 {
diff --git a/drivers/pci/pci.h b/drivers/pci/pci.h
index 023f7cf..cec9d8c 100644
--- a/drivers/pci/pci.h
+++ b/drivers/pci/pci.h
@@ -353,6 +353,7 @@ static inline resource_size_t pci_resource_alignment(struct 
pci_dev *dev,
 
 void pci_enable_acs(struct pci_dev *dev);
 
+bool pcie_wait_for_link(struct pci_dev *pdev, bool active);
 #ifdef CONFIG_PCIEASPM
 void pcie_aspm_init_link_state(struct pci_dev *pdev);
 void pcie_aspm_exit_link_state(struct pci_dev *pdev);
diff --git a/drivers/pci/pcie/dpc.c b/drivers/pci/pcie/dpc.c
index 8c57d60..80ec384 100644
--- a/drivers/pci/pcie/dpc.c
+++ b/drivers/pci/pcie/dpc.c
@@ -68,19 +68,9 @@ static int dpc_wait_rp_inactive(struct dpc_dev *dpc)
 
 static void dpc_wait_link_inactive(struct dpc_dev *dpc)
 {
-   unsigned long timeout = jiffies + HZ;
struct pci_dev *pdev = dpc->dev->port;
-   struct device *dev = &dpc->dev->device;
-   u16 lnk_status;
 
-   pcie_capability_read_word(pdev, PCI_EXP_LNKSTA, &lnk_status);
-   while (lnk_status & PCI_EXP_LNKSTA_DLLLA &&
-   !time_after(jiffies, timeout)) {
-   msleep(10);
-   pcie_capability_read_word(pdev, PCI_EXP_LNKSTA, &lnk_status);
-   }
-   if (lnk_status & PCI_EXP_LNKSTA_DLLLA)
-   dev_warn(dev, "Link state not disabled for DPC event\n");
+   pcie_wait_for_link(pdev, false);
 }
 
 static void dpc_work(struct work_struct *work)
-- 
2.7.4



[PATCH v17 3/9] PCI/AER: Rename error recovery interfaces to generic PCI naming

2018-05-17 Thread Oza Pawandeep
Rename error recovery interfaces with "pcie_" prefix so they can be made
non-static.

Signed-off-by: Oza Pawandeep <p...@codeaurora.org>
[bhelgaas: move declaration to later patch, leave functions static]
Signed-off-by: Bjorn Helgaas <helg...@kernel.org>
Reviewed-by: Keith Busch <keith.bu...@intel.com>

diff --git a/drivers/pci/pcie/aer/aerdrv_core.c 
b/drivers/pci/pcie/aer/aerdrv_core.c
index b56f9c1..a2d7cc7 100644
--- a/drivers/pci/pcie/aer/aerdrv_core.c
+++ b/drivers/pci/pcie/aer/aerdrv_core.c
@@ -476,14 +476,14 @@ static pci_ers_result_t reset_link(struct pci_dev *dev)
 }
 
 /**
- * do_fatal_recovery - handle fatal error recovery process
+ * pcie_do_fatal_recovery - handle fatal error recovery process
  * @dev: pointer to a pci_dev data structure of agent detecting an error
  *
  * Invoked when an error is fatal. Once being invoked, removes the devices
  * beneath this AER agent, followed by reset link e.g. secondary bus reset
  * followed by re-enumeration of devices.
  */
-static void do_fatal_recovery(struct pci_dev *dev)
+static void pcie_do_fatal_recovery(struct pci_dev *dev)
 {
struct pci_dev *udev;
struct pci_bus *parent;
@@ -533,14 +533,14 @@ static void do_fatal_recovery(struct pci_dev *dev)
 }
 
 /**
- * do_nonfatal_recovery - handle nonfatal error recovery process
+ * pcie_do_nonfatal_recovery - handle nonfatal error recovery process
  * @dev: pointer to a pci_dev data structure of agent detecting an error
  *
  * Invoked when an error is nonfatal. Once being invoked, broadcast
  * error detected message to all downstream drivers within a hierarchy in
  * question and return the returned code.
  */
-static void do_nonfatal_recovery(struct pci_dev *dev)
+static void pcie_do_nonfatal_recovery(struct pci_dev *dev)
 {
pci_ers_result_t status;
enum pci_channel_state state;
@@ -611,9 +611,9 @@ static void handle_error_source(struct pcie_device *aerdev,
pci_write_config_dword(dev, pos + PCI_ERR_COR_STATUS,
info->status);
} else if (info->severity == AER_NONFATAL)
-   do_nonfatal_recovery(dev);
+   pcie_do_nonfatal_recovery(dev);
else if (info->severity == AER_FATAL)
-   do_fatal_recovery(dev);
+   pcie_do_fatal_recovery(dev);
 }
 
 #ifdef CONFIG_ACPI_APEI_PCIEAER
@@ -678,9 +678,9 @@ static void aer_recover_work_func(struct work_struct *work)
}
cper_print_aer(pdev, entry.severity, entry.regs);
if (entry.severity == AER_NONFATAL)
-   do_nonfatal_recovery(pdev);
+   pcie_do_nonfatal_recovery(pdev);
else if (entry.severity == AER_FATAL)
-   do_fatal_recovery(pdev);
+   pcie_do_fatal_recovery(pdev);
pci_dev_put(pdev);
}
 }
-- 
2.7.4



[PATCH v17 3/9] PCI/AER: Rename error recovery interfaces to generic PCI naming

2018-05-17 Thread Oza Pawandeep
Rename error recovery interfaces with "pcie_" prefix so they can be made
non-static.

Signed-off-by: Oza Pawandeep 
[bhelgaas: move declaration to later patch, leave functions static]
Signed-off-by: Bjorn Helgaas 
Reviewed-by: Keith Busch 

diff --git a/drivers/pci/pcie/aer/aerdrv_core.c 
b/drivers/pci/pcie/aer/aerdrv_core.c
index b56f9c1..a2d7cc7 100644
--- a/drivers/pci/pcie/aer/aerdrv_core.c
+++ b/drivers/pci/pcie/aer/aerdrv_core.c
@@ -476,14 +476,14 @@ static pci_ers_result_t reset_link(struct pci_dev *dev)
 }
 
 /**
- * do_fatal_recovery - handle fatal error recovery process
+ * pcie_do_fatal_recovery - handle fatal error recovery process
  * @dev: pointer to a pci_dev data structure of agent detecting an error
  *
  * Invoked when an error is fatal. Once being invoked, removes the devices
  * beneath this AER agent, followed by reset link e.g. secondary bus reset
  * followed by re-enumeration of devices.
  */
-static void do_fatal_recovery(struct pci_dev *dev)
+static void pcie_do_fatal_recovery(struct pci_dev *dev)
 {
struct pci_dev *udev;
struct pci_bus *parent;
@@ -533,14 +533,14 @@ static void do_fatal_recovery(struct pci_dev *dev)
 }
 
 /**
- * do_nonfatal_recovery - handle nonfatal error recovery process
+ * pcie_do_nonfatal_recovery - handle nonfatal error recovery process
  * @dev: pointer to a pci_dev data structure of agent detecting an error
  *
  * Invoked when an error is nonfatal. Once being invoked, broadcast
  * error detected message to all downstream drivers within a hierarchy in
  * question and return the returned code.
  */
-static void do_nonfatal_recovery(struct pci_dev *dev)
+static void pcie_do_nonfatal_recovery(struct pci_dev *dev)
 {
pci_ers_result_t status;
enum pci_channel_state state;
@@ -611,9 +611,9 @@ static void handle_error_source(struct pcie_device *aerdev,
pci_write_config_dword(dev, pos + PCI_ERR_COR_STATUS,
info->status);
} else if (info->severity == AER_NONFATAL)
-   do_nonfatal_recovery(dev);
+   pcie_do_nonfatal_recovery(dev);
else if (info->severity == AER_FATAL)
-   do_fatal_recovery(dev);
+   pcie_do_fatal_recovery(dev);
 }
 
 #ifdef CONFIG_ACPI_APEI_PCIEAER
@@ -678,9 +678,9 @@ static void aer_recover_work_func(struct work_struct *work)
}
cper_print_aer(pdev, entry.severity, entry.regs);
if (entry.severity == AER_NONFATAL)
-   do_nonfatal_recovery(pdev);
+   pcie_do_nonfatal_recovery(pdev);
else if (entry.severity == AER_FATAL)
-   do_fatal_recovery(pdev);
+   pcie_do_fatal_recovery(pdev);
pci_dev_put(pdev);
}
 }
-- 
2.7.4



[PATCH v17 0/9] Address error and recovery for AER and DPC

2018-05-17 Thread Oza Pawandeep
This patch set brings in error handling support for DPC

The current implementation of AER and error message broadcasting to the
EP driver is tightly coupled and limited to AER service driver.
It is important to factor out broadcasting and other link handling
callbacks, so that they are handled appropriately not only when AER is
triggered, but also when DPC is triggered (e.g. for ERR_FATAL).

The goal of the patch-set is:
DPC should handle errors and recovery the same way AER does, because
both ultimately attempt recovery in one way or another; for that, the
error handling and recovery framework has to be loosely coupled from
the individual error agents.

It achieves uniformity and transparency to the error handling agents such
as AER, DPC, with respect to recovery and error handling.

So, this patch-set tries to unify lot of things between error agents and
make them behave in a well defined way. (be it error (FATAL, NON_FATAL)
handling or recovery).

The FATAL error handling is handled with remove/reset_link/re-enumerate
sequence while the NON_FATAL follows the default path.
Documentation/PCI/pci-error-recovery.txt talks more on that.

Changes since v16:
Bjorn's comments addressed
> remove call pci_walk_bus(dev->subordinate, report_resume, &result_data)
> pci_cleanup_aer_uncorrect_error_status(dev); happens only if service is 
AER
> aer_error_resume does not handle ERR_FATAL clearing anymore
Changes since v15:
Bjorn's comments addressed
> minor comments fixed
> made FATAL sequence aligned to existing one, as far as clearing status 
are concerned.
> pcie_do_fatal_recovery and pcie_do_nonfatal_recovery functions made to 
modularize
> pcie_do_fatal_recovery now takes service as an argument
Changes since v14:
Bjorn's comments addressed
> simplified the patch set, and moved AER_FATAL handling in the beginning.
> rebase the code to 4.17-rc1.
Changes since v13:
Bjorn's comments addressed
> handle FATAL errors with remove devices followed by re-enumeration.
> changes in AER and DPC along with required Documentation.
Changes since v12:
Bjorn's and Keith's Comments addressed.
> Made DPC and AER error handling identical 
> handled cases for hotplug enabled system differently.
Changes since v11:
Bjorn's comments addressed.
> rename pcie-err.c to err.c
> removed EXPORT_SYMBOL
> made generic find_service function in port driver.
> removed mutex patch as no need to have mutex in pcie_do_recovery
> brought in DPC_FATAL in aer.h
> so now all the error codes (AER and DPC) are unified in aer.h
Changes since v10:
Christoph Hellwig's, David Laight's and Randy Dunlap's
comments addressed.
> renamed pci_do_recovery to pcie_do_recovery
> removed inner braces in conditional statements.
> restructured the code in pci_wait_for_link
> EXPORT_SYMBOL_GPL
Changes since v9:
Sinan's comments addressed.
> bool active = true; unnecessary variable removed.
Changes since v8:
Fixed Kbuild errors.
Changes since v7:
Rebased the code on pci master
> https://kernel.googlesource.com/pub/scm/linux/kernel/git/helgaas/pci
Changes since v6:
Sinan's and Stefan's comments implemented.
> reordered patch 6 and 7
> cleaned up
Changes since v5:
Sinan's and Keith's comments incorporated.
> made separate patch for mutex
> unified error reporting codes into driver/pci/pci.h
> got rid of wait link active/inactive and
  made generic function in driver/pci/pci.c
Changes since v4:
Bjorn's comments incorporated.
> Renamed only do_recovery.
> moved the things more locally to drivers/pci/pci.h
Changes since v3:
Bjorn's comments incorporated.
> Made separate patch renaming generic pci_err.c
> Introduce pci_err.h to contain all the error types and recovery
> removed all the dependencies on pci.h
Changes since v2:
Based on feedback from Keith:
"
When DPC is triggered due to receipt of an uncorrectable error Message,
the Requester ID from the Message is recorded in the DPC Error
Source ID register and that Message is discarded and not forwarded Upstream.
"
Removed the patch where AER checks if DPC service is active
Changes since v1:
Kbuild errors fixed:
> pci_find_dpc_dev made static
> ras_event.h updated
> pci_find_aer_service call with CONFIG check
> pci_find_dpc_service call with CONFIG check

Oza Pawandeep (9):
  PCI: Unify wait for link active into generic PCI
  pci-error-recovery: Add AER_FATAL handling
  PCI/AER: Handle ERR_FATAL with removal and re-enumeration of devices
  PCI/AER: Rename error recovery to generic PCI naming
  PCI/AER: Factor out error reporting from AER
  PCI/PORTDRV: Implement generic find servi

[PATCH v17 0/9] Address error and recovery for AER and DPC

2018-05-17 Thread Oza Pawandeep
This patch set brings in error handling support for DPC

The current implementation of AER and error message broadcasting to the
EP driver is tightly coupled and limited to AER service driver.
It is important to factor out broadcasting and other link handling
callbacks, so that they are handled appropriately not only when AER is
triggered, but also when DPC is triggered (e.g. for ERR_FATAL).

The goal of the patch-set is:
DPC should handle errors and recovery the same way AER does, because
both ultimately attempt recovery in one way or another; for that, the
error handling and recovery framework has to be loosely coupled from
the individual error agents.

It achieves uniformity and transparency to the error handling agents such
as AER, DPC, with respect to recovery and error handling.

So, this patch-set tries to unify lot of things between error agents and
make them behave in a well defined way. (be it error (FATAL, NON_FATAL)
handling or recovery).

The FATAL error handling is handled with remove/reset_link/re-enumerate
sequence while the NON_FATAL follows the default path.
Documentation/PCI/pci-error-recovery.txt talks more on that.

Changes since v16:
Bjorn's comments addressed
> remove call pci_walk_bus(dev->subordinate, report_resume, &result_data)
> pci_cleanup_aer_uncorrect_error_status(dev); happens only if service is 
AER
> aer_error_resume does not handle ERR_FATAL clearing anymore
Changes since v15:
Bjorn's comments addressed
> minor comments fixed
> made FATAL sequence aligned to existing one, as far as clearing status 
are concerned.
> pcie_do_fatal_recovery and pcie_do_nonfatal_recovery functions made to 
modularize
> pcie_do_fatal_recovery now takes service as an argument
Changes since v14:
Bjorn's comments addressed
> simplified the patch set, and moved AER_FATAL handling in the beginning.
> rebase the code to 4.17-rc1.
Changes since v13:
Bjorn's comments addressed
> handle FATAL errors with remove devices followed by re-enumeration.
> changes in AER and DPC along with required Documentation.
Changes since v12:
Bjorn's and Keith's Comments addressed.
> Made DPC and AER error handling identical 
> handled cases for hotplug enabled system differently.
Changes since v11:
Bjorn's comments addressed.
> rename pcie-err.c to err.c
> removed EXPORT_SYMBOL
> made generic find_service function in port driver.
> removed mutex patch as no need to have mutex in pcie_do_recovery
> brought in DPC_FATAL in aer.h
> so now all the error codes (AER and DPC) are unified in aer.h
Changes since v10:
Christoph Hellwig's, David Laight's and Randy Dunlap's
comments addressed.
> renamed pci_do_recovery to pcie_do_recovery
> removed inner braces in conditional statements.
> restructured the code in pci_wait_for_link
> EXPORT_SYMBOL_GPL
Changes since v9:
Sinan's comments addressed.
> bool active = true; unnecessary variable removed.
Changes since v8:
Fixed Kbuild errors.
Changes since v7:
Rebased the code on pci master
> https://kernel.googlesource.com/pub/scm/linux/kernel/git/helgaas/pci
Changes since v6:
Sinan's and Stefan's comments implemented.
> reordered patch 6 and 7
> cleaned up
Changes since v5:
Sinan's and Keith's comments incorporated.
> made separate patch for mutex
> unified error reporting codes into driver/pci/pci.h
> got rid of wait link active/inactive and
  made generic function in driver/pci/pci.c
Changes since v4:
Bjorn's comments incorporated.
> Renamed only do_recovery.
> moved the things more locally to drivers/pci/pci.h
Changes since v3:
Bjorn's comments incorporated.
> Made separate patch renaming generic pci_err.c
> Introduce pci_err.h to contain all the error types and recovery
> removed all the dependencies on pci.h
Changes since v2:
Based on feedback from Keith:
"
When DPC is triggered due to receipt of an uncorrectable error Message,
the Requester ID from the Message is recorded in the DPC Error
Source ID register and that Message is discarded and not forwarded Upstream.
"
Removed the patch where AER checks if DPC service is active
Changes since v1:
Kbuild errors fixed:
> pci_find_dpc_dev made static
> ras_event.h updated
> pci_find_aer_service call with CONFIG check
> pci_find_dpc_service call with CONFIG check

Oza Pawandeep (9):
  PCI: Unify wait for link active into generic PCI
  pci-error-recovery: Add AER_FATAL handling
  PCI/AER: Handle ERR_FATAL with removal and re-enumeration of devices
  PCI/AER: Rename error recovery to generic PCI naming
  PCI/AER: Factor out error reporting from AER
  PCI/PORTDRV: Implement generic find servi

[PATCH v17 2/9] PCI/AER: Handle ERR_FATAL with removal and re-enumeration of devices

2018-05-17 Thread Oza Pawandeep
PCIe ERR_FATAL errors mean the Link is unreliable.  Components on the Link
may need to be reset to return to reliable operation (PCIe r4.0, sec
6.2.2).  We previously handled these errors much differently depending on
whether the platform supports Downstream Port Containment (DPC) (PCIe r4.0,
sec 6.2.10) or not.

The AER driver has historically logged the error details, called
driver-supplied pci_error_handlers callbacks, and reset the Link.  This
reset downstream devices, but did not remove them from the PCI subsystem,
re-enumerate them, or call their driver .remove() or .probe() methods.

DPC is different because the hardware automatically disables the Link when
it detects ERR_FATAL, which resets downstream devices.  There's no
opportunity for pci_error_handlers callbacks before resetting the Link.
The DPC driver removes affected devices (which calls their .remove()
methods), brings the Link back up, and re-enumerates (which calls driver
.probe() methods).

Align AER ERR_FATAL handling with DPC by resetting the Link in software,
skipping the driver pci_error_handlers callbacks, removing the devices from
the PCI subsystem, and re-enumerating.  The idea is that drivers and
devices should see the same behavior for ERR_FATAL events, regardless of
whether they're handled by AER or DPC.

Here are the basic ERR_FATAL recovery steps, showing the previous AER
behavior, the AER behavior after this patch, and the DPC behavior:

                          AER       AER       DPC
                          previous  new       behavior
                          --------  --------  -------------
  Log error               yes       yes       yes (minimal)
  drv.error_detected()    yes       no        no
  Reset Link              yes       yes       yes
  drv.mmio_enabled()      yes       no        no
  drv.slot_reset()        yes       no        no
  drv.resume()            yes       no        no
  Remove PCI devices      no        yes       yes
    (calls drv.remove())
  Re-enumerate            no        yes       yes
    (calls drv.probe())

N.B. With DPC, the Link reset happens before the driver .remove() calls,
while with AER, the reset happens *after* the .remove() calls.

Signed-off-by: Oza Pawandeep <p...@codeaurora.org>
[bhelgaas: changelog, squash doc patch into this]
Signed-off-by: Bjorn Helgaas <helg...@kernel.org>
Reviewed-by: Keith Busch <keith.bu...@intel.com>

diff --git a/Documentation/PCI/pci-error-recovery.txt 
b/Documentation/PCI/pci-error-recovery.txt
index 0b6bb3e..688b691 100644
--- a/Documentation/PCI/pci-error-recovery.txt
+++ b/Documentation/PCI/pci-error-recovery.txt
@@ -110,7 +110,7 @@ The actual steps taken by a platform to recover from a PCI 
error
 event will be platform-dependent, but will follow the general
 sequence described below.
 
-STEP 0: Error Event
+STEP 0: Error Event: ERR_NONFATAL
 ---
 A PCI bus error is detected by the PCI hardware.  On powerpc, the slot
 is isolated, in that all I/O is blocked: all reads return 0xffffffff,
@@ -228,13 +228,7 @@ proceeds to either STEP3 (Link Reset) or to STEP 5 (Resume 
Operations).
 If any driver returned PCI_ERS_RESULT_NEED_RESET, then the platform
 proceeds to STEP 4 (Slot Reset)
 
-STEP 3: Link Reset
---
-The platform resets the link.  This is a PCI-Express specific step
-and is done whenever a fatal error has been detected that can be
-"solved" by resetting the link.
-
-STEP 4: Slot Reset
+STEP 3: Slot Reset
 --
 
 In response to a return value of PCI_ERS_RESULT_NEED_RESET, the
@@ -320,7 +314,7 @@ Failure).
 >>> However, it probably should.
 
 
-STEP 5: Resume Operations
+STEP 4: Resume Operations
 -
 The platform will call the resume() callback on all affected device
 drivers if all drivers on the segment have returned
@@ -332,7 +326,7 @@ a result code.
 At this point, if a new error happens, the platform will restart
 a new error recovery sequence.
 
-STEP 6: Permanent Failure
+STEP 5: Permanent Failure
 -
 A "permanent failure" has occurred, and the platform cannot recover
 the device.  The platform will call error_detected() with a
@@ -355,6 +349,27 @@ errors. See the discussion in 
powerpc/eeh-pci-error-recovery.txt
 for additional detail on real-life experience of the causes of
 software errors.
 
+STEP 0: Error Event: ERR_FATAL
+---
+PCI bus error is detected by the PCI hardware. On powerpc, the slot is
+isolated, in that all I/O is blocked: all reads return 0xffffffff, all
+writes are ignored.
+
+STEP 1: Remove devices
+
+Platform removes the devices depending on the error agent, it could be
+this port for all subordinates or upstream component (likely downstream
+port)
+
+STEP 2: Reset link
+
+The platform resets the link.  This is a PCI-Express specific step and is
+done whenever a fatal error has been detected that can be "sol

[PATCH v17 2/9] PCI/AER: Handle ERR_FATAL with removal and re-enumeration of devices

2018-05-17 Thread Oza Pawandeep
PCIe ERR_FATAL errors mean the Link is unreliable.  Components on the Link
may need to be reset to return to reliable operation (PCIe r4.0, sec
6.2.2).  We previously handled these errors much differently depending on
whether the platform supports Downstream Port Containment (DPC) (PCIe r4.0,
sec 6.2.10) or not.

The AER driver has historically logged the error details, called
driver-supplied pci_error_handlers callbacks, and reset the Link.  This
reset downstream devices, but did not remove them from the PCI subsystem,
re-enumerate them, or call their driver .remove() or .probe() methods.

DPC is different because the hardware automatically disables the Link when
it detects ERR_FATAL, which resets downstream devices.  There's no
opportunity for pci_error_handlers callbacks before resetting the Link.
The DPC driver removes affected devices (which calls their .remove()
methods), brings the Link back up, and re-enumerates (which calls driver
.probe() methods).

Align AER ERR_FATAL handling with DPC by resetting the Link in software,
skipping the driver pci_error_handlers callbacks, removing the devices from
the PCI subsystem, and re-enumerating.  The idea is that drivers and
devices should see the same behavior for ERR_FATAL events, regardless of
whether they're handled by AER or DPC.

Here are the basic ERR_FATAL recovery steps, showing the previous AER
behavior, the AER behavior after this patch, and the DPC behavior:

                          AER       AER       DPC
                          previous  new       behavior
                          --------  --------  -------------
  Log error               yes       yes       yes (minimal)
  drv.error_detected()    yes       no        no
  Reset Link              yes       yes       yes
  drv.mmio_enabled()      yes       no        no
  drv.slot_reset()        yes       no        no
  drv.resume()            yes       no        no
  Remove PCI devices      no        yes       yes
    (calls drv.remove())
  Re-enumerate            no        yes       yes
    (calls drv.probe())

N.B. With DPC, the Link reset happens before the driver .remove() calls,
while with AER, the reset happens *after* the .remove() calls.

Signed-off-by: Oza Pawandeep 
[bhelgaas: changelog, squash doc patch into this]
Signed-off-by: Bjorn Helgaas 
Reviewed-by: Keith Busch 

diff --git a/Documentation/PCI/pci-error-recovery.txt 
b/Documentation/PCI/pci-error-recovery.txt
index 0b6bb3e..688b691 100644
--- a/Documentation/PCI/pci-error-recovery.txt
+++ b/Documentation/PCI/pci-error-recovery.txt
@@ -110,7 +110,7 @@ The actual steps taken by a platform to recover from a PCI 
error
 event will be platform-dependent, but will follow the general
 sequence described below.
 
-STEP 0: Error Event
+STEP 0: Error Event: ERR_NONFATAL
 ---
 A PCI bus error is detected by the PCI hardware.  On powerpc, the slot
 is isolated, in that all I/O is blocked: all reads return 0xffffffff,
@@ -228,13 +228,7 @@ proceeds to either STEP3 (Link Reset) or to STEP 5 (Resume 
Operations).
 If any driver returned PCI_ERS_RESULT_NEED_RESET, then the platform
 proceeds to STEP 4 (Slot Reset)
 
-STEP 3: Link Reset
---
-The platform resets the link.  This is a PCI-Express specific step
-and is done whenever a fatal error has been detected that can be
-"solved" by resetting the link.
-
-STEP 4: Slot Reset
+STEP 3: Slot Reset
 --
 
 In response to a return value of PCI_ERS_RESULT_NEED_RESET, the
@@ -320,7 +314,7 @@ Failure).
 >>> However, it probably should.
 
 
-STEP 5: Resume Operations
+STEP 4: Resume Operations
 -
 The platform will call the resume() callback on all affected device
 drivers if all drivers on the segment have returned
@@ -332,7 +326,7 @@ a result code.
 At this point, if a new error happens, the platform will restart
 a new error recovery sequence.
 
-STEP 6: Permanent Failure
+STEP 5: Permanent Failure
 -
 A "permanent failure" has occurred, and the platform cannot recover
 the device.  The platform will call error_detected() with a
@@ -355,6 +349,27 @@ errors. See the discussion in 
powerpc/eeh-pci-error-recovery.txt
 for additional detail on real-life experience of the causes of
 software errors.
 
+STEP 0: Error Event: ERR_FATAL
+---
+PCI bus error is detected by the PCI hardware. On powerpc, the slot is
+isolated, in that all I/O is blocked: all reads return 0xffffffff, all
+writes are ignored.
+
+STEP 1: Remove devices
+
+Platform removes the devices depending on the error agent, it could be
+this port for all subordinates or upstream component (likely downstream
+port)
+
+STEP 2: Reset link
+
+The platform resets the link.  This is a PCI-Express specific step and is
+done whenever a fatal error has been detected that can be "solved" by
+resetting the link.
+
+STEP 3: Re-enumerate the devices
+---

[PATCH v17 8/9] PCI/AER: Pass service type to pcie_do_fatal_recovery()

2018-05-17 Thread Oza Pawandeep
Pass the service type to pcie_do_fatal_recovery() instead of assuming AER.
We will make DPC also use pcie_do_fatal_recovery(), and it needs to do
things a little differently for AER and DPC.

Signed-off-by: Oza Pawandeep <p...@codeaurora.org>
[bhelgaas: split to separate patch]
Signed-off-by: Bjorn Helgaas <helg...@kernel.org>

diff --git a/drivers/pci/pci.h b/drivers/pci/pci.h
index 5e8857a..6af7595 100644
--- a/drivers/pci/pci.h
+++ b/drivers/pci/pci.h
@@ -354,7 +354,7 @@ static inline resource_size_t pci_resource_alignment(struct 
pci_dev *dev,
 void pci_enable_acs(struct pci_dev *dev);
 
 /* PCI error reporting and recovery */
-void pcie_do_fatal_recovery(struct pci_dev *dev);
+void pcie_do_fatal_recovery(struct pci_dev *dev, u32 service);
 void pcie_do_nonfatal_recovery(struct pci_dev *dev);
 
 bool pcie_wait_for_link(struct pci_dev *pdev, bool active);
diff --git a/drivers/pci/pcie/aer/aerdrv_core.c 
b/drivers/pci/pcie/aer/aerdrv_core.c
index fdfc474..36e622d 100644
--- a/drivers/pci/pcie/aer/aerdrv_core.c
+++ b/drivers/pci/pcie/aer/aerdrv_core.c
@@ -254,7 +254,7 @@ static void handle_error_source(struct pcie_device *aerdev,
} else if (info->severity == AER_NONFATAL)
pcie_do_nonfatal_recovery(dev);
else if (info->severity == AER_FATAL)
-   pcie_do_fatal_recovery(dev);
+   pcie_do_fatal_recovery(dev, PCIE_PORT_SERVICE_AER);
 }
 
 #ifdef CONFIG_ACPI_APEI_PCIEAER
@@ -321,7 +321,7 @@ static void aer_recover_work_func(struct work_struct *work)
if (entry.severity == AER_NONFATAL)
pcie_do_nonfatal_recovery(pdev);
else if (entry.severity == AER_FATAL)
-   pcie_do_fatal_recovery(pdev);
+   pcie_do_fatal_recovery(pdev, PCIE_PORT_SERVICE_AER);
pci_dev_put(pdev);
}
 }
diff --git a/drivers/pci/pcie/err.c b/drivers/pci/pcie/err.c
index a1668e9..8d68cd7 100644
--- a/drivers/pci/pcie/err.c
+++ b/drivers/pci/pcie/err.c
@@ -180,7 +180,7 @@ static pci_ers_result_t default_reset_link(struct pci_dev 
*dev)
return PCI_ERS_RESULT_RECOVERED;
 }
 
-static pci_ers_result_t reset_link(struct pci_dev *dev)
+static pci_ers_result_t reset_link(struct pci_dev *dev, u32 service)
 {
struct pci_dev *udev;
pci_ers_result_t status;
@@ -195,7 +195,7 @@ static pci_ers_result_t reset_link(struct pci_dev *dev)
}
 
/* Use the aer driver of the component firstly */
-   driver = pcie_port_find_service(udev, PCIE_PORT_SERVICE_AER);
+   driver = pcie_port_find_service(udev, service);
 
if (driver && driver->reset_link) {
status = driver->reset_link(udev);
@@ -281,7 +281,7 @@ static pci_ers_result_t broadcast_error_message(struct 
pci_dev *dev,
  * beneath this AER agent, followed by reset link e.g. secondary bus reset
  * followed by re-enumeration of devices.
  */
-void pcie_do_fatal_recovery(struct pci_dev *dev)
+void pcie_do_fatal_recovery(struct pci_dev *dev, u32 service)
 {
struct pci_dev *udev;
struct pci_bus *parent;
@@ -307,9 +307,10 @@ void pcie_do_fatal_recovery(struct pci_dev *dev)
pci_dev_put(pdev);
}
 
-   result = reset_link(udev);
+   result = reset_link(udev, service);
 
-   if (dev->hdr_type == PCI_HEADER_TYPE_BRIDGE) {
+   if ((service == PCIE_PORT_SERVICE_AER) &&
+   (dev->hdr_type == PCI_HEADER_TYPE_BRIDGE)) {
/*
 * If the error is reported by a bridge, we think this error
 * is related to the downstream link of the bridge, so we
-- 
2.7.4



[PATCH v17 8/9] PCI/AER: Pass service type to pcie_do_fatal_recovery()

2018-05-17 Thread Oza Pawandeep
Pass the service type to pcie_do_fatal_recovery() instead of assuming AER.
We will make DPC also use pcie_do_fatal_recovery(), and it needs to do
things a little differently for AER and DPC.

Signed-off-by: Oza Pawandeep 
[bhelgaas: split to separate patch]
Signed-off-by: Bjorn Helgaas 

diff --git a/drivers/pci/pci.h b/drivers/pci/pci.h
index 5e8857a..6af7595 100644
--- a/drivers/pci/pci.h
+++ b/drivers/pci/pci.h
@@ -354,7 +354,7 @@ static inline resource_size_t pci_resource_alignment(struct 
pci_dev *dev,
 void pci_enable_acs(struct pci_dev *dev);
 
 /* PCI error reporting and recovery */
-void pcie_do_fatal_recovery(struct pci_dev *dev);
+void pcie_do_fatal_recovery(struct pci_dev *dev, u32 service);
 void pcie_do_nonfatal_recovery(struct pci_dev *dev);
 
 bool pcie_wait_for_link(struct pci_dev *pdev, bool active);
diff --git a/drivers/pci/pcie/aer/aerdrv_core.c 
b/drivers/pci/pcie/aer/aerdrv_core.c
index fdfc474..36e622d 100644
--- a/drivers/pci/pcie/aer/aerdrv_core.c
+++ b/drivers/pci/pcie/aer/aerdrv_core.c
@@ -254,7 +254,7 @@ static void handle_error_source(struct pcie_device *aerdev,
} else if (info->severity == AER_NONFATAL)
pcie_do_nonfatal_recovery(dev);
else if (info->severity == AER_FATAL)
-   pcie_do_fatal_recovery(dev);
+   pcie_do_fatal_recovery(dev, PCIE_PORT_SERVICE_AER);
 }
 
 #ifdef CONFIG_ACPI_APEI_PCIEAER
@@ -321,7 +321,7 @@ static void aer_recover_work_func(struct work_struct *work)
if (entry.severity == AER_NONFATAL)
pcie_do_nonfatal_recovery(pdev);
else if (entry.severity == AER_FATAL)
-   pcie_do_fatal_recovery(pdev);
+   pcie_do_fatal_recovery(pdev, PCIE_PORT_SERVICE_AER);
pci_dev_put(pdev);
}
 }
diff --git a/drivers/pci/pcie/err.c b/drivers/pci/pcie/err.c
index a1668e9..8d68cd7 100644
--- a/drivers/pci/pcie/err.c
+++ b/drivers/pci/pcie/err.c
@@ -180,7 +180,7 @@ static pci_ers_result_t default_reset_link(struct pci_dev 
*dev)
return PCI_ERS_RESULT_RECOVERED;
 }
 
-static pci_ers_result_t reset_link(struct pci_dev *dev)
+static pci_ers_result_t reset_link(struct pci_dev *dev, u32 service)
 {
struct pci_dev *udev;
pci_ers_result_t status;
@@ -195,7 +195,7 @@ static pci_ers_result_t reset_link(struct pci_dev *dev)
}
 
/* Use the aer driver of the component firstly */
-   driver = pcie_port_find_service(udev, PCIE_PORT_SERVICE_AER);
+   driver = pcie_port_find_service(udev, service);
 
if (driver && driver->reset_link) {
status = driver->reset_link(udev);
@@ -281,7 +281,7 @@ static pci_ers_result_t broadcast_error_message(struct 
pci_dev *dev,
  * beneath this AER agent, followed by reset link e.g. secondary bus reset
  * followed by re-enumeration of devices.
  */
-void pcie_do_fatal_recovery(struct pci_dev *dev)
+void pcie_do_fatal_recovery(struct pci_dev *dev, u32 service)
 {
struct pci_dev *udev;
struct pci_bus *parent;
@@ -307,9 +307,10 @@ void pcie_do_fatal_recovery(struct pci_dev *dev)
pci_dev_put(pdev);
}
 
-   result = reset_link(udev);
+   result = reset_link(udev, service);
 
-   if (dev->hdr_type == PCI_HEADER_TYPE_BRIDGE) {
+   if ((service == PCIE_PORT_SERVICE_AER) &&
+   (dev->hdr_type == PCI_HEADER_TYPE_BRIDGE)) {
/*
 * If the error is reported by a bridge, we think this error
 * is related to the downstream link of the bridge, so we
-- 
2.7.4



[PATCH v17 5/9] PCI/portdrv: Add generic pcie_port_find_service()

2018-05-17 Thread Oza Pawandeep
Add generic pcie_port_find_service() routine.

Signed-off-by: Oza Pawandeep <p...@codeaurora.org>
Signed-off-by: Bjorn Helgaas <helg...@kernel.org>
Reviewed-by: Keith Busch <keith.bu...@intel.com>

diff --git a/drivers/pci/pcie/aer/aerdrv_core.c 
b/drivers/pci/pcie/aer/aerdrv_core.c
index 4fa1ee4..fdfc474 100644
--- a/drivers/pci/pcie/aer/aerdrv_core.c
+++ b/drivers/pci/pcie/aer/aerdrv_core.c
@@ -228,32 +228,6 @@ static bool find_source_device(struct pci_dev *parent,
return true;
 }
 
-static int find_aer_service_iter(struct device *device, void *data)
-{
-   struct pcie_port_service_driver *service_driver, **drv;
-
-   drv = (struct pcie_port_service_driver **) data;
-
-   if (device->bus == &pcie_port_bus_type && device->driver) {
-   service_driver = to_service_driver(device->driver);
-   if (service_driver->service == PCIE_PORT_SERVICE_AER) {
-   *drv = service_driver;
-   return 1;
-   }
-   }
-
-   return 0;
-}
-
-struct pcie_port_service_driver *find_aer_service(struct pci_dev *dev)
-{
-   struct pcie_port_service_driver *drv = NULL;
-
-   device_for_each_child(&dev->dev, &drv, find_aer_service_iter);
-
-   return drv;
-}
-
 /**
  * handle_error_source - handle logging error into an event log
  * @aerdev: pointer to pcie_device data structure of the root port
diff --git a/drivers/pci/pcie/err.c b/drivers/pci/pcie/err.c
index 307120b..a1668e9 100644
--- a/drivers/pci/pcie/err.c
+++ b/drivers/pci/pcie/err.c
@@ -194,10 +194,8 @@ static pci_ers_result_t reset_link(struct pci_dev *dev)
udev = dev->bus->self;
}
 
-#if IS_ENABLED(CONFIG_PCIEAER)
/* Use the aer driver of the component firstly */
-   driver = find_aer_service(udev);
-#endif
+   driver = pcie_port_find_service(udev, PCIE_PORT_SERVICE_AER);
 
if (driver && driver->reset_link) {
status = driver->reset_link(udev);
diff --git a/drivers/pci/pcie/portdrv.h b/drivers/pci/pcie/portdrv.h
index 47c9824..ba6c963 100644
--- a/drivers/pci/pcie/portdrv.h
+++ b/drivers/pci/pcie/portdrv.h
@@ -112,5 +112,6 @@ static inline bool pcie_pme_no_msi(void) { return false; }
 static inline void pcie_pme_interrupt_enable(struct pci_dev *dev, bool en) {}
 #endif /* !CONFIG_PCIE_PME */
 
-struct pcie_port_service_driver *find_aer_service(struct pci_dev *dev);
+struct pcie_port_service_driver *pcie_port_find_service(struct pci_dev *dev,
+   u32 service);
 #endif /* _PORTDRV_H_ */
diff --git a/drivers/pci/pcie/portdrv_core.c b/drivers/pci/pcie/portdrv_core.c
index c9c0663..e5bbf08 100644
--- a/drivers/pci/pcie/portdrv_core.c
+++ b/drivers/pci/pcie/portdrv_core.c
@@ -18,6 +18,10 @@
 
 #include "../pci.h"
 #include "portdrv.h"
+struct portdrv_service_data {
+   struct pcie_port_service_driver *drv;
+   u32 service;
+};
 
 /**
  * release_pcie_device - free PCI Express port service device structure
@@ -398,6 +402,46 @@ static int remove_iter(struct device *dev, void *data)
return 0;
 }
 
+static int find_service_iter(struct device *device, void *data)
+{
+   struct pcie_port_service_driver *service_driver;
+   struct portdrv_service_data *pdrvs;
+   u32 service;
+
+   pdrvs = (struct portdrv_service_data *) data;
+   service = pdrvs->service;
+
+   if (device->bus == &pcie_port_bus_type && device->driver) {
+   service_driver = to_service_driver(device->driver);
+   if (service_driver->service == service) {
+   pdrvs->drv = service_driver;
+   return 1;
+   }
+   }
+
+   return 0;
+}
+/**
+ * pcie_port_find_service - find the service driver
+ * @dev: PCI Express port the service is associated with
+ * @service: Service to find
+ *
+ * Find PCI Express port service driver associated with given service
+ */
+struct pcie_port_service_driver *pcie_port_find_service(struct pci_dev *dev,
+   u32 service)
+{
+   struct pcie_port_service_driver *drv;
+   struct portdrv_service_data pdrvs;
+
+   pdrvs.drv = NULL;
+   pdrvs.service = service;
+   device_for_each_child(&dev->dev, &pdrvs, find_service_iter);
+
+   drv = pdrvs.drv;
+   return drv;
+}
+
 /**
  * pcie_port_device_remove - unregister PCI Express port service devices
  * @dev: PCI Express port the service devices to unregister are associated with
-- 
2.7.4



[PATCH v17 5/9] PCI/portdrv: Add generic pcie_port_find_service()

2018-05-17 Thread Oza Pawandeep
Add generic pcie_port_find_service() routine.

Signed-off-by: Oza Pawandeep 
Signed-off-by: Bjorn Helgaas 
Reviewed-by: Keith Busch 

diff --git a/drivers/pci/pcie/aer/aerdrv_core.c 
b/drivers/pci/pcie/aer/aerdrv_core.c
index 4fa1ee4..fdfc474 100644
--- a/drivers/pci/pcie/aer/aerdrv_core.c
+++ b/drivers/pci/pcie/aer/aerdrv_core.c
@@ -228,32 +228,6 @@ static bool find_source_device(struct pci_dev *parent,
return true;
 }
 
-static int find_aer_service_iter(struct device *device, void *data)
-{
-   struct pcie_port_service_driver *service_driver, **drv;
-
-   drv = (struct pcie_port_service_driver **) data;
-
-   if (device->bus == &pcie_port_bus_type && device->driver) {
-   service_driver = to_service_driver(device->driver);
-   if (service_driver->service == PCIE_PORT_SERVICE_AER) {
-   *drv = service_driver;
-   return 1;
-   }
-   }
-
-   return 0;
-}
-
-struct pcie_port_service_driver *find_aer_service(struct pci_dev *dev)
-{
-   struct pcie_port_service_driver *drv = NULL;
-
-   device_for_each_child(&dev->dev, &drv, find_aer_service_iter);
-
-   return drv;
-}
-
 /**
  * handle_error_source - handle logging error into an event log
  * @aerdev: pointer to pcie_device data structure of the root port
diff --git a/drivers/pci/pcie/err.c b/drivers/pci/pcie/err.c
index 307120b..a1668e9 100644
--- a/drivers/pci/pcie/err.c
+++ b/drivers/pci/pcie/err.c
@@ -194,10 +194,8 @@ static pci_ers_result_t reset_link(struct pci_dev *dev)
udev = dev->bus->self;
}
 
-#if IS_ENABLED(CONFIG_PCIEAER)
/* Use the aer driver of the component firstly */
-   driver = find_aer_service(udev);
-#endif
+   driver = pcie_port_find_service(udev, PCIE_PORT_SERVICE_AER);
 
if (driver && driver->reset_link) {
status = driver->reset_link(udev);
diff --git a/drivers/pci/pcie/portdrv.h b/drivers/pci/pcie/portdrv.h
index 47c9824..ba6c963 100644
--- a/drivers/pci/pcie/portdrv.h
+++ b/drivers/pci/pcie/portdrv.h
@@ -112,5 +112,6 @@ static inline bool pcie_pme_no_msi(void) { return false; }
 static inline void pcie_pme_interrupt_enable(struct pci_dev *dev, bool en) {}
 #endif /* !CONFIG_PCIE_PME */
 
-struct pcie_port_service_driver *find_aer_service(struct pci_dev *dev);
+struct pcie_port_service_driver *pcie_port_find_service(struct pci_dev *dev,
+   u32 service);
 #endif /* _PORTDRV_H_ */
diff --git a/drivers/pci/pcie/portdrv_core.c b/drivers/pci/pcie/portdrv_core.c
index c9c0663..e5bbf08 100644
--- a/drivers/pci/pcie/portdrv_core.c
+++ b/drivers/pci/pcie/portdrv_core.c
@@ -18,6 +18,10 @@
 
 #include "../pci.h"
 #include "portdrv.h"
+struct portdrv_service_data {
+   struct pcie_port_service_driver *drv;
+   u32 service;
+};
 
 /**
  * release_pcie_device - free PCI Express port service device structure
@@ -398,6 +402,46 @@ static int remove_iter(struct device *dev, void *data)
return 0;
 }
 
+static int find_service_iter(struct device *device, void *data)
+{
+   struct pcie_port_service_driver *service_driver;
+   struct portdrv_service_data *pdrvs;
+   u32 service;
+
+   pdrvs = (struct portdrv_service_data *) data;
+   service = pdrvs->service;
+
+   if (device->bus == &pcie_port_bus_type && device->driver) {
+   service_driver = to_service_driver(device->driver);
+   if (service_driver->service == service) {
+   pdrvs->drv = service_driver;
+   return 1;
+   }
+   }
+
+   return 0;
+}
+/**
+ * pcie_port_find_service - find the service driver
+ * @dev: PCI Express port the service is associated with
+ * @service: Service to find
+ *
+ * Find PCI Express port service driver associated with given service
+ */
+struct pcie_port_service_driver *pcie_port_find_service(struct pci_dev *dev,
+   u32 service)
+{
+   struct pcie_port_service_driver *drv;
+   struct portdrv_service_data pdrvs;
+
+   pdrvs.drv = NULL;
+   pdrvs.service = service;
+   device_for_each_child(&dev->dev, &pdrvs, find_service_iter);
+
+   drv = pdrvs.drv;
+   return drv;
+}
+
 /**
  * pcie_port_device_remove - unregister PCI Express port service devices
  * @dev: PCI Express port the service devices to unregister are associated with
-- 
2.7.4



[PATCH v17 6/9] PCI/portdrv: Add generic pcie_port_find_device()

2018-05-17 Thread Oza Pawandeep
Add generic pcie_port_find_device() routine.

Signed-off-by: Oza Pawandeep <p...@codeaurora.org>
Signed-off-by: Bjorn Helgaas <helg...@kernel.org>
Reviewed-by: Keith Busch <keith.bu...@intel.com>

diff --git a/drivers/pci/pcie/portdrv.h b/drivers/pci/pcie/portdrv.h
index ba6c963..896608a 100644
--- a/drivers/pci/pcie/portdrv.h
+++ b/drivers/pci/pcie/portdrv.h
@@ -114,4 +114,6 @@ static inline void pcie_pme_interrupt_enable(struct pci_dev 
*dev, bool en) {}
 
 struct pcie_port_service_driver *pcie_port_find_service(struct pci_dev *dev,
u32 service);
+struct device *pcie_port_find_device(struct pci_dev *dev,
+u32 service);
 #endif /* _PORTDRV_H_ */
diff --git a/drivers/pci/pcie/portdrv_core.c b/drivers/pci/pcie/portdrv_core.c
index e5bbf08..a5b3b3a 100644
--- a/drivers/pci/pcie/portdrv_core.c
+++ b/drivers/pci/pcie/portdrv_core.c
@@ -20,6 +20,7 @@
 #include "portdrv.h"
 struct portdrv_service_data {
struct pcie_port_service_driver *drv;
+   struct device *dev;
u32 service;
 };
 
@@ -415,6 +416,7 @@ static int find_service_iter(struct device *device, void 
*data)
service_driver = to_service_driver(device->driver);
if (service_driver->service == service) {
pdrvs->drv = service_driver;
+   pdrvs->dev = device;
return 1;
}
}
@@ -443,6 +445,27 @@ struct pcie_port_service_driver 
*pcie_port_find_service(struct pci_dev *dev,
 }
 
 /**
+ * pcie_port_find_device - find the struct device
+ * @dev: PCI Express port the service is associated with
+ * @service: For the service to find
+ *
+ * Find the struct device associated with given service on a pci_dev
+ */
+struct device *pcie_port_find_device(struct pci_dev *dev,
+ u32 service)
+{
+   struct device *device;
+   struct portdrv_service_data pdrvs;
+
+   pdrvs.dev = NULL;
+   pdrvs.service = service;
+   device_for_each_child(&dev->dev, &pdrvs, find_service_iter);
+
+   device = pdrvs.dev;
+   return device;
+}
+
+/**
  * pcie_port_device_remove - unregister PCI Express port service devices
  * @dev: PCI Express port the service devices to unregister are associated with
  *
-- 
2.7.4



[PATCH v17 6/9] PCI/portdrv: Add generic pcie_port_find_device()

2018-05-17 Thread Oza Pawandeep
Add generic pcie_port_find_device() routine.

Signed-off-by: Oza Pawandeep 
Signed-off-by: Bjorn Helgaas 
Reviewed-by: Keith Busch 

diff --git a/drivers/pci/pcie/portdrv.h b/drivers/pci/pcie/portdrv.h
index ba6c963..896608a 100644
--- a/drivers/pci/pcie/portdrv.h
+++ b/drivers/pci/pcie/portdrv.h
@@ -114,4 +114,6 @@ static inline void pcie_pme_interrupt_enable(struct pci_dev 
*dev, bool en) {}
 
 struct pcie_port_service_driver *pcie_port_find_service(struct pci_dev *dev,
u32 service);
+struct device *pcie_port_find_device(struct pci_dev *dev,
+u32 service);
 #endif /* _PORTDRV_H_ */
diff --git a/drivers/pci/pcie/portdrv_core.c b/drivers/pci/pcie/portdrv_core.c
index e5bbf08..a5b3b3a 100644
--- a/drivers/pci/pcie/portdrv_core.c
+++ b/drivers/pci/pcie/portdrv_core.c
@@ -20,6 +20,7 @@
 #include "portdrv.h"
 struct portdrv_service_data {
struct pcie_port_service_driver *drv;
+   struct device *dev;
u32 service;
 };
 
@@ -415,6 +416,7 @@ static int find_service_iter(struct device *device, void 
*data)
service_driver = to_service_driver(device->driver);
if (service_driver->service == service) {
pdrvs->drv = service_driver;
+   pdrvs->dev = device;
return 1;
}
}
@@ -443,6 +445,27 @@ struct pcie_port_service_driver 
*pcie_port_find_service(struct pci_dev *dev,
 }
 
 /**
+ * pcie_port_find_device - find the struct device
+ * @dev: PCI Express port the service is associated with
+ * @service: For the service to find
+ *
+ * Find the struct device associated with given service on a pci_dev
+ */
+struct device *pcie_port_find_device(struct pci_dev *dev,
+ u32 service)
+{
+   struct device *device;
+   struct portdrv_service_data pdrvs;
+
+   pdrvs.dev = NULL;
+   pdrvs.service = service;
+   device_for_each_child(&dev->dev, &pdrvs, find_service_iter);
+
+   device = pdrvs.dev;
+   return device;
+}
+
+/**
  * pcie_port_device_remove - unregister PCI Express port service devices
  * @dev: PCI Express port the service devices to unregister are associated with
  *
-- 
2.7.4



[PATCH v17 9/9] PCI/DPC: Use the generic pcie_do_fatal_recovery() path

2018-05-17 Thread Oza Pawandeep
Our goal is to handle ERR_FATAL errors similarly, whether they are reported
via AER or via DPC.  A previous commit changed AER so it handles ERR_FATAL
by calling driver .remove() methods and resetting the Link.  DPC already
does that (although the Link reset is done automatically by hardware and
happens before we call the driver .remove() methods).

Restructure the DPC code so it calls the same pcie_do_fatal_recovery()
interface used by AER.  This makes it clearer that we want to use the same
path.

Implement the .reset_link() method used by pcie_do_fatal_recovery().  For
DPC, the actual reset is done automatically by hardware, so we really only
have to wait for the Link to be inactive, then release the Port from DPC.

Signed-off-by: Oza Pawandeep <p...@codeaurora.org>
[bhelgaas: changelog, DPC_FATAL is not a bitfield, can be sequential]
Signed-off-by: Bjorn Helgaas <helg...@kernel.org>

diff --git a/drivers/pci/pcie/dpc.c b/drivers/pci/pcie/dpc.c
index 361903f..6064041 100644
--- a/drivers/pci/pcie/dpc.c
+++ b/drivers/pci/pcie/dpc.c
@@ -73,29 +73,30 @@ static void dpc_wait_link_inactive(struct dpc_dev *dpc)
pcie_wait_for_link(pdev, false);
 }
 
-static void dpc_work(struct work_struct *work)
+static pci_ers_result_t dpc_reset_link(struct pci_dev *pdev)
 {
-   struct dpc_dev *dpc = container_of(work, struct dpc_dev, work);
-   struct pci_dev *dev, *temp, *pdev = dpc->dev->port;
-   struct pci_bus *parent = pdev->subordinate;
-   u16 cap = dpc->cap_pos, ctl;
-
-   pci_lock_rescan_remove();
-   list_for_each_entry_safe_reverse(dev, temp, &parent->devices,
-bus_list) {
-   pci_dev_get(dev);
-   pci_dev_set_disconnected(dev, NULL);
-   if (pci_has_subordinate(dev))
-   pci_walk_bus(dev->subordinate,
-pci_dev_set_disconnected, NULL);
-   pci_stop_and_remove_bus_device(dev);
-   pci_dev_put(dev);
-   }
-   pci_unlock_rescan_remove();
-
+   struct dpc_dev *dpc;
+   struct pcie_device *pciedev;
+   struct device *devdpc;
+   u16 cap, ctl;
+
+   /*
+* DPC disables the Link automatically in hardware, so it has
+* already been reset by the time we get here.
+*/
+   devdpc = pcie_port_find_device(pdev, PCIE_PORT_SERVICE_DPC);
+   pciedev = to_pcie_device(devdpc);
+   dpc = get_service_data(pciedev);
+   cap = dpc->cap_pos;
+
+   /*
+* Wait until the Link is inactive, then clear DPC Trigger Status
+* to allow the Port to leave DPC.
+*/
dpc_wait_link_inactive(dpc);
+
if (dpc->rp_extensions && dpc_wait_rp_inactive(dpc))
-   return;
+   return PCI_ERS_RESULT_DISCONNECT;
if (dpc->rp_extensions && dpc->rp_pio_status) {
pci_write_config_dword(pdev, cap + PCI_EXP_DPC_RP_PIO_STATUS,
   dpc->rp_pio_status);
@@ -108,6 +109,17 @@ static void dpc_work(struct work_struct *work)
pci_read_config_word(pdev, cap + PCI_EXP_DPC_CTL, &ctl);
pci_write_config_word(pdev, cap + PCI_EXP_DPC_CTL,
  ctl | PCI_EXP_DPC_CTL_INT_EN);
+
+   return PCI_ERS_RESULT_RECOVERED;
+}
+
+static void dpc_work(struct work_struct *work)
+{
+   struct dpc_dev *dpc = container_of(work, struct dpc_dev, work);
+   struct pci_dev *pdev = dpc->dev->port;
+
+   /* We configure DPC so it only triggers on ERR_FATAL */
+   pcie_do_fatal_recovery(pdev, PCIE_PORT_SERVICE_DPC);
 }
 
 static void dpc_process_rp_pio_error(struct dpc_dev *dpc)
@@ -288,6 +300,7 @@ static struct pcie_port_service_driver dpcdriver = {
.service= PCIE_PORT_SERVICE_DPC,
.probe  = dpc_probe,
.remove = dpc_remove,
+   .reset_link = dpc_reset_link,
 };
 
 static int __init dpc_service_init(void)
diff --git a/include/linux/aer.h b/include/linux/aer.h
index 8f87bbe..514bffa 100644
--- a/include/linux/aer.h
+++ b/include/linux/aer.h
@@ -14,6 +14,7 @@
 #define AER_NONFATAL   0
 #define AER_FATAL  1
 #define AER_CORRECTABLE 2
+#define DPC_FATAL  3
 
 struct pci_dev;
 
-- 
2.7.4



[PATCH v17 9/9] PCI/DPC: Use the generic pcie_do_fatal_recovery() path

2018-05-17 Thread Oza Pawandeep
Our goal is to handle ERR_FATAL errors similarly, whether they are reported
via AER or via DPC.  A previous commit changed AER so it handles ERR_FATAL
by calling driver .remove() methods and resetting the Link.  DPC already
does that (although the Link reset is done automatically by hardware and
happens before we call the driver .remove() methods).

Restructure the DPC code so it calls the same pcie_do_fatal_recovery()
interface used by AER.  This makes it clearer that we want to use the same
path.

Implement the .reset_link() method used by pcie_do_fatal_recovery().  For
DPC, the actual reset is done automatically by hardware, so we really only
have to wait for the Link to be inactive, then release the Port from DPC.

Signed-off-by: Oza Pawandeep 
[bhelgaas: changelog, DPC_FATAL is not a bitfield, can be sequential]
Signed-off-by: Bjorn Helgaas 

diff --git a/drivers/pci/pcie/dpc.c b/drivers/pci/pcie/dpc.c
index 361903f..6064041 100644
--- a/drivers/pci/pcie/dpc.c
+++ b/drivers/pci/pcie/dpc.c
@@ -73,29 +73,30 @@ static void dpc_wait_link_inactive(struct dpc_dev *dpc)
pcie_wait_for_link(pdev, false);
 }
 
-static void dpc_work(struct work_struct *work)
+static pci_ers_result_t dpc_reset_link(struct pci_dev *pdev)
 {
-   struct dpc_dev *dpc = container_of(work, struct dpc_dev, work);
-   struct pci_dev *dev, *temp, *pdev = dpc->dev->port;
-   struct pci_bus *parent = pdev->subordinate;
-   u16 cap = dpc->cap_pos, ctl;
-
-   pci_lock_rescan_remove();
-   list_for_each_entry_safe_reverse(dev, temp, &parent->devices,
-bus_list) {
-   pci_dev_get(dev);
-   pci_dev_set_disconnected(dev, NULL);
-   if (pci_has_subordinate(dev))
-   pci_walk_bus(dev->subordinate,
-pci_dev_set_disconnected, NULL);
-   pci_stop_and_remove_bus_device(dev);
-   pci_dev_put(dev);
-   }
-   pci_unlock_rescan_remove();
-
+   struct dpc_dev *dpc;
+   struct pcie_device *pciedev;
+   struct device *devdpc;
+   u16 cap, ctl;
+
+   /*
+* DPC disables the Link automatically in hardware, so it has
+* already been reset by the time we get here.
+*/
+   devdpc = pcie_port_find_device(pdev, PCIE_PORT_SERVICE_DPC);
+   pciedev = to_pcie_device(devdpc);
+   dpc = get_service_data(pciedev);
+   cap = dpc->cap_pos;
+
+   /*
+* Wait until the Link is inactive, then clear DPC Trigger Status
+* to allow the Port to leave DPC.
+*/
dpc_wait_link_inactive(dpc);
+
if (dpc->rp_extensions && dpc_wait_rp_inactive(dpc))
-   return;
+   return PCI_ERS_RESULT_DISCONNECT;
if (dpc->rp_extensions && dpc->rp_pio_status) {
pci_write_config_dword(pdev, cap + PCI_EXP_DPC_RP_PIO_STATUS,
   dpc->rp_pio_status);
@@ -108,6 +109,17 @@ static void dpc_work(struct work_struct *work)
pci_read_config_word(pdev, cap + PCI_EXP_DPC_CTL, &ctl);
pci_write_config_word(pdev, cap + PCI_EXP_DPC_CTL,
  ctl | PCI_EXP_DPC_CTL_INT_EN);
+
+   return PCI_ERS_RESULT_RECOVERED;
+}
+
+static void dpc_work(struct work_struct *work)
+{
+   struct dpc_dev *dpc = container_of(work, struct dpc_dev, work);
+   struct pci_dev *pdev = dpc->dev->port;
+
+   /* We configure DPC so it only triggers on ERR_FATAL */
+   pcie_do_fatal_recovery(pdev, PCIE_PORT_SERVICE_DPC);
 }
 
 static void dpc_process_rp_pio_error(struct dpc_dev *dpc)
@@ -288,6 +300,7 @@ static struct pcie_port_service_driver dpcdriver = {
.service= PCIE_PORT_SERVICE_DPC,
.probe  = dpc_probe,
.remove = dpc_remove,
+   .reset_link = dpc_reset_link,
 };
 
 static int __init dpc_service_init(void)
diff --git a/include/linux/aer.h b/include/linux/aer.h
index 8f87bbe..514bffa 100644
--- a/include/linux/aer.h
+++ b/include/linux/aer.h
@@ -14,6 +14,7 @@
 #define AER_NONFATAL   0
 #define AER_FATAL  1
 #define AER_CORRECTABLE 2
+#define DPC_FATAL  3
 
 struct pci_dev;
 
-- 
2.7.4



[PATCH v17 7/9] PCI/DPC: Disable ERR_NONFATAL handling by DPC

2018-05-17 Thread Oza Pawandeep
PCIe ERR_NONFATAL errors mean a particular transaction is unreliable but
the Link is otherwise fully functional (PCIe r4.0, sec 6.2.2).

The AER driver handles these by logging the error details and calling
driver-supplied pci_error_handlers callbacks.  It does not reset downstream
devices, does not remove them from the PCI subsystem, does not re-enumerate
them, and does not call their driver .remove() or .probe() methods.

But DPC driver previously enabled DPC on ERR_NONFATAL, so if the hardware
supports DPC, these errors caused a Link reset (performed automatically by
the hardware), followed by the DPC driver removing affected devices (which
calls their .remove() methods), bringing the Link back up, and
re-enumerating (which calls driver .probe() methods).

Disable ERR_NONFATAL DPC triggering so these errors will only be handled by
AER.  This means drivers won't have to deal with different usage of their
pci_error_handlers callbacks and .probe() and .remove() methods based on
whether the platform has DPC support.

Signed-off-by: Oza Pawandeep <p...@codeaurora.org>
[bhelgaas: changelog]
Signed-off-by: Bjorn Helgaas <helg...@kernel.org>

diff --git a/drivers/pci/pcie/dpc.c b/drivers/pci/pcie/dpc.c
index 80ec384..361903f 100644
--- a/drivers/pci/pcie/dpc.c
+++ b/drivers/pci/pcie/dpc.c
@@ -260,7 +260,7 @@ static int dpc_probe(struct pcie_device *dev)
}
}
 
-   ctl = (ctl & 0xfff4) | PCI_EXP_DPC_CTL_EN_NONFATAL | 
PCI_EXP_DPC_CTL_INT_EN;
+   ctl = (ctl & 0xfff4) | PCI_EXP_DPC_CTL_EN_FATAL | 
PCI_EXP_DPC_CTL_INT_EN;
pci_write_config_word(pdev, dpc->cap_pos + PCI_EXP_DPC_CTL, ctl);
 
dev_info(device, "DPC error containment capabilities: Int Msg #%d, 
RPExt%c PoisonedTLP%c SwTrigger%c RP PIO Log %d, DL_ActiveErr%c\n",
@@ -278,7 +278,7 @@ static void dpc_remove(struct pcie_device *dev)
u16 ctl;
 
pci_read_config_word(pdev, dpc->cap_pos + PCI_EXP_DPC_CTL, &ctl);
-   ctl &= ~(PCI_EXP_DPC_CTL_EN_NONFATAL | PCI_EXP_DPC_CTL_INT_EN);
+   ctl &= ~(PCI_EXP_DPC_CTL_EN_FATAL | PCI_EXP_DPC_CTL_INT_EN);
pci_write_config_word(pdev, dpc->cap_pos + PCI_EXP_DPC_CTL, ctl);
 }
 
diff --git a/include/uapi/linux/pci_regs.h b/include/uapi/linux/pci_regs.h
index 103ba79..5182e0d 100644
--- a/include/uapi/linux/pci_regs.h
+++ b/include/uapi/linux/pci_regs.h
@@ -981,6 +981,7 @@
 #define  PCI_EXP_DPC_CAP_DL_ACTIVE 0x1000  /* ERR_COR signal on DL_Active 
supported */
 
 #define PCI_EXP_DPC_CTL 6   /* DPC control */
+#define  PCI_EXP_DPC_CTL_EN_FATAL  0x0001  /* Enable trigger on ERR_FATAL 
message */
 #define  PCI_EXP_DPC_CTL_EN_NONFATAL   0x0002  /* Enable trigger on 
ERR_NONFATAL message */
 #define  PCI_EXP_DPC_CTL_INT_EN 0x0008  /* DPC Interrupt Enable */
 
-- 
2.7.4



[PATCH v17 7/9] PCI/DPC: Disable ERR_NONFATAL handling by DPC

2018-05-17 Thread Oza Pawandeep
PCIe ERR_NONFATAL errors mean a particular transaction is unreliable but
the Link is otherwise fully functional (PCIe r4.0, sec 6.2.2).

The AER driver handles these by logging the error details and calling
driver-supplied pci_error_handlers callbacks.  It does not reset downstream
devices, does not remove them from the PCI subsystem, does not re-enumerate
them, and does not call their driver .remove() or .probe() methods.

But DPC driver previously enabled DPC on ERR_NONFATAL, so if the hardware
supports DPC, these errors caused a Link reset (performed automatically by
the hardware), followed by the DPC driver removing affected devices (which
calls their .remove() methods), bringing the Link back up, and
re-enumerating (which calls driver .probe() methods).

Disable ERR_NONFATAL DPC triggering so these errors will only be handled by
AER.  This means drivers won't have to deal with different usage of their
pci_error_handlers callbacks and .probe() and .remove() methods based on
whether the platform has DPC support.

Signed-off-by: Oza Pawandeep 
[bhelgaas: changelog]
Signed-off-by: Bjorn Helgaas 

diff --git a/drivers/pci/pcie/dpc.c b/drivers/pci/pcie/dpc.c
index 80ec384..361903f 100644
--- a/drivers/pci/pcie/dpc.c
+++ b/drivers/pci/pcie/dpc.c
@@ -260,7 +260,7 @@ static int dpc_probe(struct pcie_device *dev)
}
}
 
-   ctl = (ctl & 0xfff4) | PCI_EXP_DPC_CTL_EN_NONFATAL | 
PCI_EXP_DPC_CTL_INT_EN;
+   ctl = (ctl & 0xfff4) | PCI_EXP_DPC_CTL_EN_FATAL | 
PCI_EXP_DPC_CTL_INT_EN;
pci_write_config_word(pdev, dpc->cap_pos + PCI_EXP_DPC_CTL, ctl);
 
dev_info(device, "DPC error containment capabilities: Int Msg #%d, 
RPExt%c PoisonedTLP%c SwTrigger%c RP PIO Log %d, DL_ActiveErr%c\n",
@@ -278,7 +278,7 @@ static void dpc_remove(struct pcie_device *dev)
u16 ctl;
 
pci_read_config_word(pdev, dpc->cap_pos + PCI_EXP_DPC_CTL, &ctl);
-   ctl &= ~(PCI_EXP_DPC_CTL_EN_NONFATAL | PCI_EXP_DPC_CTL_INT_EN);
+   ctl &= ~(PCI_EXP_DPC_CTL_EN_FATAL | PCI_EXP_DPC_CTL_INT_EN);
pci_write_config_word(pdev, dpc->cap_pos + PCI_EXP_DPC_CTL, ctl);
 }
 
diff --git a/include/uapi/linux/pci_regs.h b/include/uapi/linux/pci_regs.h
index 103ba79..5182e0d 100644
--- a/include/uapi/linux/pci_regs.h
+++ b/include/uapi/linux/pci_regs.h
@@ -981,6 +981,7 @@
 #define  PCI_EXP_DPC_CAP_DL_ACTIVE 0x1000  /* ERR_COR signal on DL_Active 
supported */
 
 #define PCI_EXP_DPC_CTL 6   /* DPC control */
+#define  PCI_EXP_DPC_CTL_EN_FATAL  0x0001  /* Enable trigger on ERR_FATAL 
message */
 #define  PCI_EXP_DPC_CTL_EN_NONFATAL   0x0002  /* Enable trigger on 
ERR_NONFATAL message */
 #define  PCI_EXP_DPC_CTL_INT_EN 0x0008  /* DPC Interrupt Enable */
 
-- 
2.7.4



[PATCH v16 8/9] PCI/DPC: Unify and plumb error handling into DPC

2018-05-11 Thread Oza Pawandeep
The DPC driver implements the reset_link callback and calls
pcie_do_fatal_recovery(), which follows the standard ERR_FATAL
recovery path.

Signed-off-by: Oza Pawandeep <p...@codeaurora.org>

diff --git a/drivers/pci/pci.h b/drivers/pci/pci.h
index 5e8857a..6af7595 100644
--- a/drivers/pci/pci.h
+++ b/drivers/pci/pci.h
@@ -354,7 +354,7 @@ static inline resource_size_t pci_resource_alignment(struct 
pci_dev *dev,
 void pci_enable_acs(struct pci_dev *dev);
 
 /* PCI error reporting and recovery */
-void pcie_do_fatal_recovery(struct pci_dev *dev);
+void pcie_do_fatal_recovery(struct pci_dev *dev, u32 service);
 void pcie_do_nonfatal_recovery(struct pci_dev *dev);
 
 bool pcie_wait_for_link(struct pci_dev *pdev, bool active);
diff --git a/drivers/pci/pcie/aer/aerdrv_core.c 
b/drivers/pci/pcie/aer/aerdrv_core.c
index fdfc474..36e622d 100644
--- a/drivers/pci/pcie/aer/aerdrv_core.c
+++ b/drivers/pci/pcie/aer/aerdrv_core.c
@@ -254,7 +254,7 @@ static void handle_error_source(struct pcie_device *aerdev,
} else if (info->severity == AER_NONFATAL)
pcie_do_nonfatal_recovery(dev);
else if (info->severity == AER_FATAL)
-   pcie_do_fatal_recovery(dev);
+   pcie_do_fatal_recovery(dev, PCIE_PORT_SERVICE_AER);
 }
 
 #ifdef CONFIG_ACPI_APEI_PCIEAER
@@ -321,7 +321,7 @@ static void aer_recover_work_func(struct work_struct *work)
if (entry.severity == AER_NONFATAL)
pcie_do_nonfatal_recovery(pdev);
else if (entry.severity == AER_FATAL)
-   pcie_do_fatal_recovery(pdev);
+   pcie_do_fatal_recovery(pdev, PCIE_PORT_SERVICE_AER);
pci_dev_put(pdev);
}
 }
diff --git a/drivers/pci/pcie/dpc.c b/drivers/pci/pcie/dpc.c
index 80ec384..5680c13 100644
--- a/drivers/pci/pcie/dpc.c
+++ b/drivers/pci/pcie/dpc.c
@@ -73,29 +73,31 @@ static void dpc_wait_link_inactive(struct dpc_dev *dpc)
pcie_wait_for_link(pdev, false);
 }
 
-static void dpc_work(struct work_struct *work)
+static pci_ers_result_t dpc_reset_link(struct pci_dev *pdev)
 {
-   struct dpc_dev *dpc = container_of(work, struct dpc_dev, work);
-   struct pci_dev *dev, *temp, *pdev = dpc->dev->port;
-   struct pci_bus *parent = pdev->subordinate;
-   u16 cap = dpc->cap_pos, ctl;
-
-   pci_lock_rescan_remove();
-   list_for_each_entry_safe_reverse(dev, temp, >devices,
-bus_list) {
-   pci_dev_get(dev);
-   pci_dev_set_disconnected(dev, NULL);
-   if (pci_has_subordinate(dev))
-   pci_walk_bus(dev->subordinate,
-pci_dev_set_disconnected, NULL);
-   pci_stop_and_remove_bus_device(dev);
-   pci_dev_put(dev);
-   }
-   pci_unlock_rescan_remove();
-
+   struct dpc_dev *dpc;
+   struct pcie_device *pciedev;
+   struct device *devdpc;
+   u16 cap, ctl;
+
+   /*
+* DPC disables the Link automatically in hardware, so it has
+* already been reset by the time we get here.
+*/
+
+   devdpc = pcie_port_find_device(pdev, PCIE_PORT_SERVICE_DPC);
+   pciedev = to_pcie_device(devdpc);
+   dpc = get_service_data(pciedev);
+   cap = dpc->cap_pos;
+
+   /*
+* Waiting until the link is inactive, then clearing DPC
+* trigger status to allow the port to leave DPC.
+*/
dpc_wait_link_inactive(dpc);
+
if (dpc->rp_extensions && dpc_wait_rp_inactive(dpc))
-   return;
+   return PCI_ERS_RESULT_DISCONNECT;
if (dpc->rp_extensions && dpc->rp_pio_status) {
pci_write_config_dword(pdev, cap + PCI_EXP_DPC_RP_PIO_STATUS,
   dpc->rp_pio_status);
@@ -108,6 +110,17 @@ static void dpc_work(struct work_struct *work)
pci_read_config_word(pdev, cap + PCI_EXP_DPC_CTL, );
pci_write_config_word(pdev, cap + PCI_EXP_DPC_CTL,
  ctl | PCI_EXP_DPC_CTL_INT_EN);
+
+   return PCI_ERS_RESULT_RECOVERED;
+}
+
+static void dpc_work(struct work_struct *work)
+{
+   struct dpc_dev *dpc = container_of(work, struct dpc_dev, work);
+   struct pci_dev *pdev = dpc->dev->port;
+
+   /* From DPC point of view error is always FATAL. */
+   pcie_do_fatal_recovery(pdev, PCIE_PORT_SERVICE_DPC);
 }
 
 static void dpc_process_rp_pio_error(struct dpc_dev *dpc)
@@ -288,6 +301,7 @@ static struct pcie_port_service_driver dpcdriver = {
.service= PCIE_PORT_SERVICE_DPC,
.probe  = dpc_probe,
.remove = dpc_remove,
+   .reset_link = dpc_reset_link,
 };
 
 static int __init dpc_service_init(void)
diff --git a/drivers/pci/pcie/err.c b/drivers/pci/pcie/err.c
index 33a16b1..29ff148 100644
--- a/drivers/pci/pcie/err.c
+++ b/drivers/pci/pcie/err.c
@@

[PATCH v16 6/9] PCI/PORTDRV: Implement generic find service

2018-05-11 Thread Oza Pawandeep
This patch implements a generic pcie_port_find_service() routine, which
replaces the AER-specific find_aer_service().

Signed-off-by: Oza Pawandeep <p...@codeaurora.org>
Reviewed-by: Keith Busch <keith.bu...@intel.com>

diff --git a/drivers/pci/pcie/aer/aerdrv_core.c 
b/drivers/pci/pcie/aer/aerdrv_core.c
index 4fa1ee4..fdfc474 100644
--- a/drivers/pci/pcie/aer/aerdrv_core.c
+++ b/drivers/pci/pcie/aer/aerdrv_core.c
@@ -228,32 +228,6 @@ static bool find_source_device(struct pci_dev *parent,
return true;
 }
 
-static int find_aer_service_iter(struct device *device, void *data)
-{
-   struct pcie_port_service_driver *service_driver, **drv;
-
-   drv = (struct pcie_port_service_driver **) data;
-
-   if (device->bus == _port_bus_type && device->driver) {
-   service_driver = to_service_driver(device->driver);
-   if (service_driver->service == PCIE_PORT_SERVICE_AER) {
-   *drv = service_driver;
-   return 1;
-   }
-   }
-
-   return 0;
-}
-
-struct pcie_port_service_driver *find_aer_service(struct pci_dev *dev)
-{
-   struct pcie_port_service_driver *drv = NULL;
-
-   device_for_each_child(>dev, , find_aer_service_iter);
-
-   return drv;
-}
-
 /**
  * handle_error_source - handle logging error into an event log
  * @aerdev: pointer to pcie_device data structure of the root port
diff --git a/drivers/pci/pcie/err.c b/drivers/pci/pcie/err.c
index c4ded88..33a16b1 100644
--- a/drivers/pci/pcie/err.c
+++ b/drivers/pci/pcie/err.c
@@ -199,10 +199,8 @@ static pci_ers_result_t reset_link(struct pci_dev *dev)
udev = dev->bus->self;
}
 
-#if IS_ENABLED(CONFIG_PCIEAER)
/* Use the aer driver of the component firstly */
-   driver = find_aer_service(udev);
-#endif
+   driver = pcie_port_find_service(udev, PCIE_PORT_SERVICE_AER);
 
if (driver && driver->reset_link) {
status = driver->reset_link(udev);
diff --git a/drivers/pci/pcie/portdrv.h b/drivers/pci/pcie/portdrv.h
index 47c9824..ba6c963 100644
--- a/drivers/pci/pcie/portdrv.h
+++ b/drivers/pci/pcie/portdrv.h
@@ -112,5 +112,6 @@ static inline bool pcie_pme_no_msi(void) { return false; }
 static inline void pcie_pme_interrupt_enable(struct pci_dev *dev, bool en) {}
 #endif /* !CONFIG_PCIE_PME */
 
-struct pcie_port_service_driver *find_aer_service(struct pci_dev *dev);
+struct pcie_port_service_driver *pcie_port_find_service(struct pci_dev *dev,
+   u32 service);
 #endif /* _PORTDRV_H_ */
diff --git a/drivers/pci/pcie/portdrv_core.c b/drivers/pci/pcie/portdrv_core.c
index c9c0663..d843055 100644
--- a/drivers/pci/pcie/portdrv_core.c
+++ b/drivers/pci/pcie/portdrv_core.c
@@ -18,6 +18,10 @@
 
 #include "../pci.h"
 #include "portdrv.h"
+struct portdrv_service_data {
+   struct pcie_port_service_driver *drv;
+   u32 service;
+};
 
 /**
  * release_pcie_device - free PCI Express port service device structure
@@ -398,6 +402,46 @@ static int remove_iter(struct device *dev, void *data)
return 0;
 }
 
+static int find_service_iter(struct device *device, void *data)
+{
+   struct pcie_port_service_driver *service_driver;
+   struct portdrv_service_data *pdrvs;
+   u32 service;
+
+   pdrvs = (struct portdrv_service_data *) data;
+   service = pdrvs->service;
+
+   if (device->bus == _port_bus_type && device->driver) {
+   service_driver = to_service_driver(device->driver);
+   if (service_driver->service == service) {
+   pdrvs->drv = service_driver;
+   return 1;
+   }
+   }
+
+   return 0;
+}
+/**
+ * pcie_port_find_service - find the service driver
+ * @dev: PCI Express port the service devices associated with
+ * @service: Service to find
+ *
+ * Find PCI Express port service driver associated with given service
+ */
+struct pcie_port_service_driver *pcie_port_find_service(struct pci_dev *dev,
+   u32 service)
+{
+   struct pcie_port_service_driver *drv;
+   struct portdrv_service_data pdrvs;
+
+   pdrvs.drv = NULL;
+   pdrvs.service = service;
+   device_for_each_child(>dev, , find_service_iter);
+
+   drv = pdrvs.drv;
+   return drv;
+}
+
 /**
  * pcie_port_device_remove - unregister PCI Express port service devices
  * @dev: PCI Express port the service devices to unregister are associated with
-- 
2.7.4



[PATCH v16 8/9] PCI/DPC: Unify and plumb error handling into DPC

2018-05-11 Thread Oza Pawandeep
The DPC driver implements the reset_link callback and calls
pcie_do_fatal_recovery(), which follows the standard ERR_FATAL
recovery path.

Signed-off-by: Oza Pawandeep 

diff --git a/drivers/pci/pci.h b/drivers/pci/pci.h
index 5e8857a..6af7595 100644
--- a/drivers/pci/pci.h
+++ b/drivers/pci/pci.h
@@ -354,7 +354,7 @@ static inline resource_size_t pci_resource_alignment(struct 
pci_dev *dev,
 void pci_enable_acs(struct pci_dev *dev);
 
 /* PCI error reporting and recovery */
-void pcie_do_fatal_recovery(struct pci_dev *dev);
+void pcie_do_fatal_recovery(struct pci_dev *dev, u32 service);
 void pcie_do_nonfatal_recovery(struct pci_dev *dev);
 
 bool pcie_wait_for_link(struct pci_dev *pdev, bool active);
diff --git a/drivers/pci/pcie/aer/aerdrv_core.c 
b/drivers/pci/pcie/aer/aerdrv_core.c
index fdfc474..36e622d 100644
--- a/drivers/pci/pcie/aer/aerdrv_core.c
+++ b/drivers/pci/pcie/aer/aerdrv_core.c
@@ -254,7 +254,7 @@ static void handle_error_source(struct pcie_device *aerdev,
} else if (info->severity == AER_NONFATAL)
pcie_do_nonfatal_recovery(dev);
else if (info->severity == AER_FATAL)
-   pcie_do_fatal_recovery(dev);
+   pcie_do_fatal_recovery(dev, PCIE_PORT_SERVICE_AER);
 }
 
 #ifdef CONFIG_ACPI_APEI_PCIEAER
@@ -321,7 +321,7 @@ static void aer_recover_work_func(struct work_struct *work)
if (entry.severity == AER_NONFATAL)
pcie_do_nonfatal_recovery(pdev);
else if (entry.severity == AER_FATAL)
-   pcie_do_fatal_recovery(pdev);
+   pcie_do_fatal_recovery(pdev, PCIE_PORT_SERVICE_AER);
pci_dev_put(pdev);
}
 }
diff --git a/drivers/pci/pcie/dpc.c b/drivers/pci/pcie/dpc.c
index 80ec384..5680c13 100644
--- a/drivers/pci/pcie/dpc.c
+++ b/drivers/pci/pcie/dpc.c
@@ -73,29 +73,31 @@ static void dpc_wait_link_inactive(struct dpc_dev *dpc)
pcie_wait_for_link(pdev, false);
 }
 
-static void dpc_work(struct work_struct *work)
+static pci_ers_result_t dpc_reset_link(struct pci_dev *pdev)
 {
-   struct dpc_dev *dpc = container_of(work, struct dpc_dev, work);
-   struct pci_dev *dev, *temp, *pdev = dpc->dev->port;
-   struct pci_bus *parent = pdev->subordinate;
-   u16 cap = dpc->cap_pos, ctl;
-
-   pci_lock_rescan_remove();
-   list_for_each_entry_safe_reverse(dev, temp, >devices,
-bus_list) {
-   pci_dev_get(dev);
-   pci_dev_set_disconnected(dev, NULL);
-   if (pci_has_subordinate(dev))
-   pci_walk_bus(dev->subordinate,
-pci_dev_set_disconnected, NULL);
-   pci_stop_and_remove_bus_device(dev);
-   pci_dev_put(dev);
-   }
-   pci_unlock_rescan_remove();
-
+   struct dpc_dev *dpc;
+   struct pcie_device *pciedev;
+   struct device *devdpc;
+   u16 cap, ctl;
+
+   /*
+* DPC disables the Link automatically in hardware, so it has
+* already been reset by the time we get here.
+*/
+
+   devdpc = pcie_port_find_device(pdev, PCIE_PORT_SERVICE_DPC);
+   pciedev = to_pcie_device(devdpc);
+   dpc = get_service_data(pciedev);
+   cap = dpc->cap_pos;
+
+   /*
+* Waiting until the link is inactive, then clearing DPC
+* trigger status to allow the port to leave DPC.
+*/
dpc_wait_link_inactive(dpc);
+
if (dpc->rp_extensions && dpc_wait_rp_inactive(dpc))
-   return;
+   return PCI_ERS_RESULT_DISCONNECT;
if (dpc->rp_extensions && dpc->rp_pio_status) {
pci_write_config_dword(pdev, cap + PCI_EXP_DPC_RP_PIO_STATUS,
   dpc->rp_pio_status);
@@ -108,6 +110,17 @@ static void dpc_work(struct work_struct *work)
pci_read_config_word(pdev, cap + PCI_EXP_DPC_CTL, );
pci_write_config_word(pdev, cap + PCI_EXP_DPC_CTL,
  ctl | PCI_EXP_DPC_CTL_INT_EN);
+
+   return PCI_ERS_RESULT_RECOVERED;
+}
+
+static void dpc_work(struct work_struct *work)
+{
+   struct dpc_dev *dpc = container_of(work, struct dpc_dev, work);
+   struct pci_dev *pdev = dpc->dev->port;
+
+   /* From DPC point of view error is always FATAL. */
+   pcie_do_fatal_recovery(pdev, PCIE_PORT_SERVICE_DPC);
 }
 
 static void dpc_process_rp_pio_error(struct dpc_dev *dpc)
@@ -288,6 +301,7 @@ static struct pcie_port_service_driver dpcdriver = {
.service= PCIE_PORT_SERVICE_DPC,
.probe  = dpc_probe,
.remove = dpc_remove,
+   .reset_link = dpc_reset_link,
 };
 
 static int __init dpc_service_init(void)
diff --git a/drivers/pci/pcie/err.c b/drivers/pci/pcie/err.c
index 33a16b1..29ff148 100644
--- a/drivers/pci/pcie/err.c
+++ b/drivers/pci/pcie/err.c
@@ -185,7 +185,7 @@ st

[PATCH v16 6/9] PCI/PORTDRV: Implement generic find service

2018-05-11 Thread Oza Pawandeep
This patch implements a generic pcie_port_find_service() routine, which
replaces the AER-specific find_aer_service().

Signed-off-by: Oza Pawandeep 
Reviewed-by: Keith Busch 

diff --git a/drivers/pci/pcie/aer/aerdrv_core.c 
b/drivers/pci/pcie/aer/aerdrv_core.c
index 4fa1ee4..fdfc474 100644
--- a/drivers/pci/pcie/aer/aerdrv_core.c
+++ b/drivers/pci/pcie/aer/aerdrv_core.c
@@ -228,32 +228,6 @@ static bool find_source_device(struct pci_dev *parent,
return true;
 }
 
-static int find_aer_service_iter(struct device *device, void *data)
-{
-   struct pcie_port_service_driver *service_driver, **drv;
-
-   drv = (struct pcie_port_service_driver **) data;
-
-   if (device->bus == _port_bus_type && device->driver) {
-   service_driver = to_service_driver(device->driver);
-   if (service_driver->service == PCIE_PORT_SERVICE_AER) {
-   *drv = service_driver;
-   return 1;
-   }
-   }
-
-   return 0;
-}
-
-struct pcie_port_service_driver *find_aer_service(struct pci_dev *dev)
-{
-   struct pcie_port_service_driver *drv = NULL;
-
-   device_for_each_child(>dev, , find_aer_service_iter);
-
-   return drv;
-}
-
 /**
  * handle_error_source - handle logging error into an event log
  * @aerdev: pointer to pcie_device data structure of the root port
diff --git a/drivers/pci/pcie/err.c b/drivers/pci/pcie/err.c
index c4ded88..33a16b1 100644
--- a/drivers/pci/pcie/err.c
+++ b/drivers/pci/pcie/err.c
@@ -199,10 +199,8 @@ static pci_ers_result_t reset_link(struct pci_dev *dev)
udev = dev->bus->self;
}
 
-#if IS_ENABLED(CONFIG_PCIEAER)
/* Use the aer driver of the component firstly */
-   driver = find_aer_service(udev);
-#endif
+   driver = pcie_port_find_service(udev, PCIE_PORT_SERVICE_AER);
 
if (driver && driver->reset_link) {
status = driver->reset_link(udev);
diff --git a/drivers/pci/pcie/portdrv.h b/drivers/pci/pcie/portdrv.h
index 47c9824..ba6c963 100644
--- a/drivers/pci/pcie/portdrv.h
+++ b/drivers/pci/pcie/portdrv.h
@@ -112,5 +112,6 @@ static inline bool pcie_pme_no_msi(void) { return false; }
 static inline void pcie_pme_interrupt_enable(struct pci_dev *dev, bool en) {}
 #endif /* !CONFIG_PCIE_PME */
 
-struct pcie_port_service_driver *find_aer_service(struct pci_dev *dev);
+struct pcie_port_service_driver *pcie_port_find_service(struct pci_dev *dev,
+   u32 service);
 #endif /* _PORTDRV_H_ */
diff --git a/drivers/pci/pcie/portdrv_core.c b/drivers/pci/pcie/portdrv_core.c
index c9c0663..d843055 100644
--- a/drivers/pci/pcie/portdrv_core.c
+++ b/drivers/pci/pcie/portdrv_core.c
@@ -18,6 +18,10 @@
 
 #include "../pci.h"
 #include "portdrv.h"
+struct portdrv_service_data {
+   struct pcie_port_service_driver *drv;
+   u32 service;
+};
 
 /**
  * release_pcie_device - free PCI Express port service device structure
@@ -398,6 +402,46 @@ static int remove_iter(struct device *dev, void *data)
return 0;
 }
 
+static int find_service_iter(struct device *device, void *data)
+{
+   struct pcie_port_service_driver *service_driver;
+   struct portdrv_service_data *pdrvs;
+   u32 service;
+
+   pdrvs = (struct portdrv_service_data *) data;
+   service = pdrvs->service;
+
+   if (device->bus == _port_bus_type && device->driver) {
+   service_driver = to_service_driver(device->driver);
+   if (service_driver->service == service) {
+   pdrvs->drv = service_driver;
+   return 1;
+   }
+   }
+
+   return 0;
+}
+/**
+ * pcie_port_find_service - find the service driver
+ * @dev: PCI Express port the service devices associated with
+ * @service: Service to find
+ *
+ * Find PCI Express port service driver associated with given service
+ */
+struct pcie_port_service_driver *pcie_port_find_service(struct pci_dev *dev,
+   u32 service)
+{
+   struct pcie_port_service_driver *drv;
+   struct portdrv_service_data pdrvs;
+
+   pdrvs.drv = NULL;
+   pdrvs.service = service;
+   device_for_each_child(>dev, , find_service_iter);
+
+   drv = pdrvs.drv;
+   return drv;
+}
+
 /**
  * pcie_port_device_remove - unregister PCI Express port service devices
  * @dev: PCI Express port the service devices to unregister are associated with
-- 
2.7.4



[PATCH v16 4/9] PCI/AER: Rename error recovery to generic PCI naming

2018-05-11 Thread Oza Pawandeep
This patch renames the error recovery functions do_fatal_recovery() and
do_nonfatal_recovery() to generic names with a pcie_ prefix.

Signed-off-by: Oza Pawandeep <p...@codeaurora.org>
Reviewed-by: Keith Busch <keith.bu...@intel.com>

diff --git a/drivers/pci/pci.h b/drivers/pci/pci.h
index cec9d8c..5e8857a 100644
--- a/drivers/pci/pci.h
+++ b/drivers/pci/pci.h
@@ -353,6 +353,10 @@ static inline resource_size_t 
pci_resource_alignment(struct pci_dev *dev,
 
 void pci_enable_acs(struct pci_dev *dev);
 
+/* PCI error reporting and recovery */
+void pcie_do_fatal_recovery(struct pci_dev *dev);
+void pcie_do_nonfatal_recovery(struct pci_dev *dev);
+
 bool pcie_wait_for_link(struct pci_dev *pdev, bool active);
 #ifdef CONFIG_PCIEASPM
 void pcie_aspm_init_link_state(struct pci_dev *pdev);
diff --git a/drivers/pci/pcie/aer/aerdrv_core.c 
b/drivers/pci/pcie/aer/aerdrv_core.c
index 649dd1f..aa4 100644
--- a/drivers/pci/pcie/aer/aerdrv_core.c
+++ b/drivers/pci/pcie/aer/aerdrv_core.c
@@ -476,7 +476,7 @@ static pci_ers_result_t reset_link(struct pci_dev *dev)
 }
 
 /**
- * do_fatal_recovery - handle fatal error recovery process
+ * pcie_do_fatal_recovery - handle fatal error recovery process
  * @dev: pointer to a pci_dev data structure of agent detecting an error
  *
  * Invoked when an error is fatal. Once being invoked, removes the devices
@@ -484,7 +484,7 @@ static pci_ers_result_t reset_link(struct pci_dev *dev)
  * followed by re-enumeration of devices.
  */
 
-static void do_fatal_recovery(struct pci_dev *dev)
+void pcie_do_fatal_recovery(struct pci_dev *dev)
 {
struct pci_dev *udev;
struct pci_bus *parent;
@@ -535,14 +535,14 @@ static void do_fatal_recovery(struct pci_dev *dev)
 }
 
 /**
- * do_nonfatal_recovery - handle nonfatal error recovery process
+ * pcie_do_nonfatal_recovery - handle nonfatal error recovery process
  * @dev: pointer to a pci_dev data structure of agent detecting an error
  *
  * Invoked when an error is nonfatal/fatal. Once being invoked, broadcast
  * error detected message to all downstream drivers within a hierarchy in
  * question and return the returned code.
  */
-static void do_nonfatal_recovery(struct pci_dev *dev)
+void pcie_do_nonfatal_recovery(struct pci_dev *dev)
 {
pci_ers_result_t status;
enum pci_channel_state state;
@@ -613,9 +613,9 @@ static void handle_error_source(struct pcie_device *aerdev,
pci_write_config_dword(dev, pos + PCI_ERR_COR_STATUS,
info->status);
} else if (info->severity == AER_NONFATAL)
-   do_nonfatal_recovery(dev);
+   pcie_do_nonfatal_recovery(dev);
else if (info->severity == AER_FATAL)
-   do_fatal_recovery(dev);
+   pcie_do_fatal_recovery(dev);
 }
 
 #ifdef CONFIG_ACPI_APEI_PCIEAER
@@ -680,9 +680,9 @@ static void aer_recover_work_func(struct work_struct *work)
}
cper_print_aer(pdev, entry.severity, entry.regs);
if (entry.severity == AER_NONFATAL)
-   do_nonfatal_recovery(pdev);
+   pcie_do_nonfatal_recovery(pdev);
else if (entry.severity == AER_FATAL)
-   do_fatal_recovery(pdev);
+   pcie_do_fatal_recovery(pdev);
pci_dev_put(pdev);
}
 }
-- 
2.7.4



[PATCH v16 4/9] PCI/AER: Rename error recovery to generic PCI naming

2018-05-11 Thread Oza Pawandeep
This patch renames the error recovery functions do_fatal_recovery() and
do_nonfatal_recovery() to generic names with a pcie_ prefix.

Signed-off-by: Oza Pawandeep 
Reviewed-by: Keith Busch 

diff --git a/drivers/pci/pci.h b/drivers/pci/pci.h
index cec9d8c..5e8857a 100644
--- a/drivers/pci/pci.h
+++ b/drivers/pci/pci.h
@@ -353,6 +353,10 @@ static inline resource_size_t 
pci_resource_alignment(struct pci_dev *dev,
 
 void pci_enable_acs(struct pci_dev *dev);
 
+/* PCI error reporting and recovery */
+void pcie_do_fatal_recovery(struct pci_dev *dev);
+void pcie_do_nonfatal_recovery(struct pci_dev *dev);
+
 bool pcie_wait_for_link(struct pci_dev *pdev, bool active);
 #ifdef CONFIG_PCIEASPM
 void pcie_aspm_init_link_state(struct pci_dev *pdev);
diff --git a/drivers/pci/pcie/aer/aerdrv_core.c 
b/drivers/pci/pcie/aer/aerdrv_core.c
index 649dd1f..aa4 100644
--- a/drivers/pci/pcie/aer/aerdrv_core.c
+++ b/drivers/pci/pcie/aer/aerdrv_core.c
@@ -476,7 +476,7 @@ static pci_ers_result_t reset_link(struct pci_dev *dev)
 }
 
 /**
- * do_fatal_recovery - handle fatal error recovery process
+ * pcie_do_fatal_recovery - handle fatal error recovery process
  * @dev: pointer to a pci_dev data structure of agent detecting an error
  *
  * Invoked when an error is fatal. Once being invoked, removes the devices
@@ -484,7 +484,7 @@ static pci_ers_result_t reset_link(struct pci_dev *dev)
  * followed by re-enumeration of devices.
  */
 
-static void do_fatal_recovery(struct pci_dev *dev)
+void pcie_do_fatal_recovery(struct pci_dev *dev)
 {
struct pci_dev *udev;
struct pci_bus *parent;
@@ -535,14 +535,14 @@ static void do_fatal_recovery(struct pci_dev *dev)
 }
 
 /**
- * do_nonfatal_recovery - handle nonfatal error recovery process
+ * pcie_do_nonfatal_recovery - handle nonfatal error recovery process
  * @dev: pointer to a pci_dev data structure of agent detecting an error
  *
  * Invoked when an error is nonfatal/fatal. Once being invoked, broadcast
  * error detected message to all downstream drivers within a hierarchy in
  * question and return the returned code.
  */
-static void do_nonfatal_recovery(struct pci_dev *dev)
+void pcie_do_nonfatal_recovery(struct pci_dev *dev)
 {
pci_ers_result_t status;
enum pci_channel_state state;
@@ -613,9 +613,9 @@ static void handle_error_source(struct pcie_device *aerdev,
pci_write_config_dword(dev, pos + PCI_ERR_COR_STATUS,
info->status);
} else if (info->severity == AER_NONFATAL)
-   do_nonfatal_recovery(dev);
+   pcie_do_nonfatal_recovery(dev);
else if (info->severity == AER_FATAL)
-   do_fatal_recovery(dev);
+   pcie_do_fatal_recovery(dev);
 }
 
 #ifdef CONFIG_ACPI_APEI_PCIEAER
@@ -680,9 +680,9 @@ static void aer_recover_work_func(struct work_struct *work)
}
cper_print_aer(pdev, entry.severity, entry.regs);
if (entry.severity == AER_NONFATAL)
-   do_nonfatal_recovery(pdev);
+   pcie_do_nonfatal_recovery(pdev);
else if (entry.severity == AER_FATAL)
-   do_fatal_recovery(pdev);
+   pcie_do_fatal_recovery(pdev);
pci_dev_put(pdev);
}
 }
-- 
2.7.4



[PATCH v16 3/9] PCI/AER: Handle ERR_FATAL with removal and re-enumeration of devices

2018-05-11 Thread Oza Pawandeep
This patch alters the handling of ERR_FATAL: the devices are removed
first, then the link is reset, and finally the devices are
re-enumerated.

So the errors are handled in a different way as follows:
ERR_NONFATAL => call driver recovery entry points
ERR_FATAL    => remove and re-enumerate

Please refer to Documentation/PCI/pci-error-recovery.txt for more details.

Signed-off-by: Oza Pawandeep <p...@codeaurora.org>
Reviewed-by: Keith Busch <keith.bu...@intel.com>

diff --git a/drivers/pci/pcie/aer/aerdrv_core.c 
b/drivers/pci/pcie/aer/aerdrv_core.c
index 0ea5acc..649dd1f 100644
--- a/drivers/pci/pcie/aer/aerdrv_core.c
+++ b/drivers/pci/pcie/aer/aerdrv_core.c
@@ -20,6 +20,7 @@
 #include 
 #include 
 #include "aerdrv.h"
+#include "../../pci.h"
 
 #definePCI_EXP_AER_FLAGS   (PCI_EXP_DEVCTL_CERE | 
PCI_EXP_DEVCTL_NFERE | \
 PCI_EXP_DEVCTL_FERE | PCI_EXP_DEVCTL_URRE)
@@ -475,35 +476,84 @@ static pci_ers_result_t reset_link(struct pci_dev *dev)
 }
 
 /**
- * do_recovery - handle nonfatal/fatal error recovery process
+ * do_fatal_recovery - handle fatal error recovery process
+ * @dev: pointer to a pci_dev data structure of agent detecting an error
+ *
+ * Invoked when an error is fatal. Once being invoked, removes the devices
+ * beneath this AER agent, followed by reset link e.g. secondary bus reset
+ * followed by re-enumeration of devices.
+ */
+
+static void do_fatal_recovery(struct pci_dev *dev)
+{
+   struct pci_dev *udev;
+   struct pci_bus *parent;
+   struct pci_dev *pdev, *temp;
+   pci_ers_result_t result = PCI_ERS_RESULT_RECOVERED;
+   struct aer_broadcast_data result_data;
+
+   if (dev->hdr_type == PCI_HEADER_TYPE_BRIDGE)
+   udev = dev;
+   else
+   udev = dev->bus->self;
+
+   parent = udev->subordinate;
+   pci_lock_rescan_remove();
+   list_for_each_entry_safe_reverse(pdev, temp, >devices,
+bus_list) {
+   pci_dev_get(pdev);
+   pci_dev_set_disconnected(pdev, NULL);
+   if (pci_has_subordinate(pdev))
+   pci_walk_bus(pdev->subordinate,
+pci_dev_set_disconnected, NULL);
+   pci_stop_and_remove_bus_device(pdev);
+   pci_dev_put(pdev);
+   }
+
+   result = reset_link(udev);
+
+   if (dev->hdr_type == PCI_HEADER_TYPE_BRIDGE) {
+   /*
+* If the error is reported by a bridge, we think this error
+* is related to the downstream link of the bridge, so we
+* do error recovery on all subordinates of the bridge instead
+* of the bridge and clear the error status of the bridge.
+*/
+   pci_walk_bus(dev->subordinate, report_resume, _data);
+   pci_cleanup_aer_uncorrect_error_status(dev);
+   }
+
+   if (result == PCI_ERS_RESULT_RECOVERED) {
+   if (pcie_wait_for_link(udev, true))
+   pci_rescan_bus(udev->bus);
+   } else {
+   pci_uevent_ers(dev, PCI_ERS_RESULT_DISCONNECT);
+   pci_info(dev, "AER: Device recovery failed\n");
+   }
+
+   pci_unlock_rescan_remove();
+}
+
+/**
+ * do_nonfatal_recovery - handle nonfatal error recovery process
  * @dev: pointer to a pci_dev data structure of agent detecting an error
- * @severity: error severity type
  *
  * Invoked when an error is nonfatal/fatal. Once being invoked, broadcast
  * error detected message to all downstream drivers within a hierarchy in
  * question and return the returned code.
  */
-static void do_recovery(struct pci_dev *dev, int severity)
+static void do_nonfatal_recovery(struct pci_dev *dev)
 {
-   pci_ers_result_t status, result = PCI_ERS_RESULT_RECOVERED;
+   pci_ers_result_t status;
enum pci_channel_state state;
 
-   if (severity == AER_FATAL)
-   state = pci_channel_io_frozen;
-   else
-   state = pci_channel_io_normal;
+   state = pci_channel_io_normal;
 
status = broadcast_error_message(dev,
state,
"error_detected",
report_error_detected);
 
-   if (severity == AER_FATAL) {
-   result = reset_link(dev);
-   if (result != PCI_ERS_RESULT_RECOVERED)
-   goto failed;
-   }
-
if (status == PCI_ERS_RESULT_CAN_RECOVER)
status = broadcast_error_message(dev,
state,
@@ -562,8 +612,10 @@ static void handle_error_source(struct pcie_device *aerdev,
if (pos)
pci_write_config_dword(dev, pos + PCI_ERR_COR_STATUS,
info->status);
-   } else
-   do_recovery(dev, info->severit

[PATCH v16 5/9] PCI/AER: Factor out error reporting from AER

2018-05-11 Thread Oza Pawandeep
This patch factors out error reporting callbacks, which are currently
tightly coupled with AER.

DPC should be able to register callbacks and attempt recovery when a DPC
trigger event occurs.

Signed-off-by: Oza Pawandeep <p...@codeaurora.org>

diff --git a/drivers/pci/pcie/Makefile b/drivers/pci/pcie/Makefile
index 800e1d4..03f4e0b 100644
--- a/drivers/pci/pcie/Makefile
+++ b/drivers/pci/pcie/Makefile
@@ -2,7 +2,7 @@
 #
 # Makefile for PCI Express features and port driver
 
-pcieportdrv-y  := portdrv_core.o portdrv_pci.o
+pcieportdrv-y  := portdrv_core.o portdrv_pci.o err.o
 
 obj-$(CONFIG_PCIEPORTBUS)  += pcieportdrv.o
 
diff --git a/drivers/pci/pcie/aer/aerdrv.h b/drivers/pci/pcie/aer/aerdrv.h
index 08b4584..b4c9506 100644
--- a/drivers/pci/pcie/aer/aerdrv.h
+++ b/drivers/pci/pcie/aer/aerdrv.h
@@ -76,36 +76,6 @@ struct aer_rpc {
 */
 };
 
-struct aer_broadcast_data {
-   enum pci_channel_state state;
-   enum pci_ers_result result;
-};
-
-static inline pci_ers_result_t merge_result(enum pci_ers_result orig,
-   enum pci_ers_result new)
-{
-   if (new == PCI_ERS_RESULT_NO_AER_DRIVER)
-   return PCI_ERS_RESULT_NO_AER_DRIVER;
-
-   if (new == PCI_ERS_RESULT_NONE)
-   return orig;
-
-   switch (orig) {
-   case PCI_ERS_RESULT_CAN_RECOVER:
-   case PCI_ERS_RESULT_RECOVERED:
-   orig = new;
-   break;
-   case PCI_ERS_RESULT_DISCONNECT:
-   if (new == PCI_ERS_RESULT_NEED_RESET)
-   orig = PCI_ERS_RESULT_NEED_RESET;
-   break;
-   default:
-   break;
-   }
-
-   return orig;
-}
-
 extern struct bus_type pcie_port_bus_type;
 void aer_isr(struct work_struct *work);
 void aer_print_error(struct pci_dev *dev, struct aer_err_info *info);
diff --git a/drivers/pci/pcie/aer/aerdrv_core.c 
b/drivers/pci/pcie/aer/aerdrv_core.c
index aa4..4fa1ee4 100644
--- a/drivers/pci/pcie/aer/aerdrv_core.c
+++ b/drivers/pci/pcie/aer/aerdrv_core.c
@@ -228,191 +228,6 @@ static bool find_source_device(struct pci_dev *parent,
return true;
 }
 
-static int report_error_detected(struct pci_dev *dev, void *data)
-{
-   pci_ers_result_t vote;
-   const struct pci_error_handlers *err_handler;
-   struct aer_broadcast_data *result_data;
-   result_data = (struct aer_broadcast_data *) data;
-
-   device_lock(>dev);
-   dev->error_state = result_data->state;
-
-   if (!dev->driver ||
-   !dev->driver->err_handler ||
-   !dev->driver->err_handler->error_detected) {
-   if (result_data->state == pci_channel_io_frozen &&
-   dev->hdr_type != PCI_HEADER_TYPE_BRIDGE) {
-   /*
-* In case of fatal recovery, if one of down-
-* stream device has no driver. We might be
-* unable to recover because a later insmod
-* of a driver for this device is unaware of
-* its hw state.
-*/
-   pci_printk(KERN_DEBUG, dev, "device has %s\n",
-  dev->driver ?
-  "no AER-aware driver" : "no driver");
-   }
-
-   /*
-* If there's any device in the subtree that does not
-* have an error_detected callback, returning
-* PCI_ERS_RESULT_NO_AER_DRIVER prevents calling of
-* the subsequent mmio_enabled/slot_reset/resume
-* callbacks of "any" device in the subtree. All the
-* devices in the subtree are left in the error state
-* without recovery.
-*/
-
-   if (dev->hdr_type != PCI_HEADER_TYPE_BRIDGE)
-   vote = PCI_ERS_RESULT_NO_AER_DRIVER;
-   else
-   vote = PCI_ERS_RESULT_NONE;
-   } else {
-   err_handler = dev->driver->err_handler;
-   vote = err_handler->error_detected(dev, result_data->state);
-   pci_uevent_ers(dev, PCI_ERS_RESULT_NONE);
-   }
-
-   result_data->result = merge_result(result_data->result, vote);
-   device_unlock(>dev);
-   return 0;
-}
-
-static int report_mmio_enabled(struct pci_dev *dev, void *data)
-{
-   pci_ers_result_t vote;
-   const struct pci_error_handlers *err_handler;
-   struct aer_broadcast_data *result_data;
-   result_data = (struct aer_broadcast_data *) data;
-
-   device_lock(>dev);
-   if (!dev->driver ||
-   !dev->driver->err_handler ||
-   !dev->driver->err_handler->mmio_enabled)
-   goto out;
-
-   err_ha

[PATCH v16 3/9] PCI/AER: Handle ERR_FATAL with removal and re-enumeration of devices

2018-05-11 Thread Oza Pawandeep
This patch alters the handling of ERR_FATAL: the devices are removed
first, then the link is reset, and finally the devices are
re-enumerated.

So the errors are handled in a different way as follows:
ERR_NONFATAL => call driver recovery entry points
ERR_FATAL    => remove and re-enumerate

Please refer to Documentation/PCI/pci-error-recovery.txt for more details.

Signed-off-by: Oza Pawandeep 
Reviewed-by: Keith Busch 

diff --git a/drivers/pci/pcie/aer/aerdrv_core.c 
b/drivers/pci/pcie/aer/aerdrv_core.c
index 0ea5acc..649dd1f 100644
--- a/drivers/pci/pcie/aer/aerdrv_core.c
+++ b/drivers/pci/pcie/aer/aerdrv_core.c
@@ -20,6 +20,7 @@
 #include 
 #include 
 #include "aerdrv.h"
+#include "../../pci.h"
 
 #definePCI_EXP_AER_FLAGS   (PCI_EXP_DEVCTL_CERE | 
PCI_EXP_DEVCTL_NFERE | \
 PCI_EXP_DEVCTL_FERE | PCI_EXP_DEVCTL_URRE)
@@ -475,35 +476,84 @@ static pci_ers_result_t reset_link(struct pci_dev *dev)
 }
 
 /**
- * do_recovery - handle nonfatal/fatal error recovery process
+ * do_fatal_recovery - handle fatal error recovery process
+ * @dev: pointer to a pci_dev data structure of agent detecting an error
+ *
+ * Invoked when an error is fatal. Once being invoked, removes the devices
+ * beneath this AER agent, followed by reset link e.g. secondary bus reset
+ * followed by re-enumeration of devices.
+ */
+
+static void do_fatal_recovery(struct pci_dev *dev)
+{
+   struct pci_dev *udev;
+   struct pci_bus *parent;
+   struct pci_dev *pdev, *temp;
+   pci_ers_result_t result = PCI_ERS_RESULT_RECOVERED;
+   struct aer_broadcast_data result_data;
+
+   if (dev->hdr_type == PCI_HEADER_TYPE_BRIDGE)
+   udev = dev;
+   else
+   udev = dev->bus->self;
+
+   parent = udev->subordinate;
+   pci_lock_rescan_remove();
+   list_for_each_entry_safe_reverse(pdev, temp, >devices,
+bus_list) {
+   pci_dev_get(pdev);
+   pci_dev_set_disconnected(pdev, NULL);
+   if (pci_has_subordinate(pdev))
+   pci_walk_bus(pdev->subordinate,
+pci_dev_set_disconnected, NULL);
+   pci_stop_and_remove_bus_device(pdev);
+   pci_dev_put(pdev);
+   }
+
+   result = reset_link(udev);
+
+   if (dev->hdr_type == PCI_HEADER_TYPE_BRIDGE) {
+   /*
+* If the error is reported by a bridge, we think this error
+* is related to the downstream link of the bridge, so we
+* do error recovery on all subordinates of the bridge instead
+* of the bridge and clear the error status of the bridge.
+*/
+   pci_walk_bus(dev->subordinate, report_resume, _data);
+   pci_cleanup_aer_uncorrect_error_status(dev);
+   }
+
+   if (result == PCI_ERS_RESULT_RECOVERED) {
+   if (pcie_wait_for_link(udev, true))
+   pci_rescan_bus(udev->bus);
+   } else {
+   pci_uevent_ers(dev, PCI_ERS_RESULT_DISCONNECT);
+   pci_info(dev, "AER: Device recovery failed\n");
+   }
+
+   pci_unlock_rescan_remove();
+}
+
+/**
+ * do_nonfatal_recovery - handle nonfatal error recovery process
  * @dev: pointer to a pci_dev data structure of agent detecting an error
- * @severity: error severity type
  *
  * Invoked when an error is nonfatal/fatal. Once being invoked, broadcast
  * error detected message to all downstream drivers within a hierarchy in
  * question and return the returned code.
  */
-static void do_recovery(struct pci_dev *dev, int severity)
+static void do_nonfatal_recovery(struct pci_dev *dev)
 {
-   pci_ers_result_t status, result = PCI_ERS_RESULT_RECOVERED;
+   pci_ers_result_t status;
enum pci_channel_state state;
 
-   if (severity == AER_FATAL)
-   state = pci_channel_io_frozen;
-   else
-   state = pci_channel_io_normal;
+   state = pci_channel_io_normal;
 
status = broadcast_error_message(dev,
state,
"error_detected",
report_error_detected);
 
-   if (severity == AER_FATAL) {
-   result = reset_link(dev);
-   if (result != PCI_ERS_RESULT_RECOVERED)
-   goto failed;
-   }
-
if (status == PCI_ERS_RESULT_CAN_RECOVER)
status = broadcast_error_message(dev,
state,
@@ -562,8 +612,10 @@ static void handle_error_source(struct pcie_device *aerdev,
if (pos)
pci_write_config_dword(dev, pos + PCI_ERR_COR_STATUS,
info->status);
-   } else
-   do_recovery(dev, info->severity);
+   } else if (info->severity == AER_NO

[PATCH v16 5/9] PCI/AER: Factor out error reporting from AER

2018-05-11 Thread Oza Pawandeep
This patch factors out error reporting callbacks, which are currently
tightly coupled with AER.

DPC should be able to register callbacks and attempt recovery when DPC
trigger event occurs.

Signed-off-by: Oza Pawandeep 

diff --git a/drivers/pci/pcie/Makefile b/drivers/pci/pcie/Makefile
index 800e1d4..03f4e0b 100644
--- a/drivers/pci/pcie/Makefile
+++ b/drivers/pci/pcie/Makefile
@@ -2,7 +2,7 @@
 #
 # Makefile for PCI Express features and port driver
 
-pcieportdrv-y  := portdrv_core.o portdrv_pci.o
+pcieportdrv-y  := portdrv_core.o portdrv_pci.o err.o
 
 obj-$(CONFIG_PCIEPORTBUS)  += pcieportdrv.o
 
diff --git a/drivers/pci/pcie/aer/aerdrv.h b/drivers/pci/pcie/aer/aerdrv.h
index 08b4584..b4c9506 100644
--- a/drivers/pci/pcie/aer/aerdrv.h
+++ b/drivers/pci/pcie/aer/aerdrv.h
@@ -76,36 +76,6 @@ struct aer_rpc {
 */
 };
 
-struct aer_broadcast_data {
-   enum pci_channel_state state;
-   enum pci_ers_result result;
-};
-
-static inline pci_ers_result_t merge_result(enum pci_ers_result orig,
-   enum pci_ers_result new)
-{
-   if (new == PCI_ERS_RESULT_NO_AER_DRIVER)
-   return PCI_ERS_RESULT_NO_AER_DRIVER;
-
-   if (new == PCI_ERS_RESULT_NONE)
-   return orig;
-
-   switch (orig) {
-   case PCI_ERS_RESULT_CAN_RECOVER:
-   case PCI_ERS_RESULT_RECOVERED:
-   orig = new;
-   break;
-   case PCI_ERS_RESULT_DISCONNECT:
-   if (new == PCI_ERS_RESULT_NEED_RESET)
-   orig = PCI_ERS_RESULT_NEED_RESET;
-   break;
-   default:
-   break;
-   }
-
-   return orig;
-}
-
 extern struct bus_type pcie_port_bus_type;
 void aer_isr(struct work_struct *work);
 void aer_print_error(struct pci_dev *dev, struct aer_err_info *info);
diff --git a/drivers/pci/pcie/aer/aerdrv_core.c 
b/drivers/pci/pcie/aer/aerdrv_core.c
index aa4..4fa1ee4 100644
--- a/drivers/pci/pcie/aer/aerdrv_core.c
+++ b/drivers/pci/pcie/aer/aerdrv_core.c
@@ -228,191 +228,6 @@ static bool find_source_device(struct pci_dev *parent,
return true;
 }
 
-static int report_error_detected(struct pci_dev *dev, void *data)
-{
-   pci_ers_result_t vote;
-   const struct pci_error_handlers *err_handler;
-   struct aer_broadcast_data *result_data;
-   result_data = (struct aer_broadcast_data *) data;
-
-   device_lock(&dev->dev);
-   dev->error_state = result_data->state;
-
-   if (!dev->driver ||
-   !dev->driver->err_handler ||
-   !dev->driver->err_handler->error_detected) {
-   if (result_data->state == pci_channel_io_frozen &&
-   dev->hdr_type != PCI_HEADER_TYPE_BRIDGE) {
-   /*
-* In case of fatal recovery, if one of down-
-* stream device has no driver. We might be
-* unable to recover because a later insmod
-* of a driver for this device is unaware of
-* its hw state.
-*/
-   pci_printk(KERN_DEBUG, dev, "device has %s\n",
-  dev->driver ?
-  "no AER-aware driver" : "no driver");
-   }
-
-   /*
-* If there's any device in the subtree that does not
-* have an error_detected callback, returning
-* PCI_ERS_RESULT_NO_AER_DRIVER prevents calling of
-* the subsequent mmio_enabled/slot_reset/resume
-* callbacks of "any" device in the subtree. All the
-* devices in the subtree are left in the error state
-* without recovery.
-*/
-
-   if (dev->hdr_type != PCI_HEADER_TYPE_BRIDGE)
-   vote = PCI_ERS_RESULT_NO_AER_DRIVER;
-   else
-   vote = PCI_ERS_RESULT_NONE;
-   } else {
-   err_handler = dev->driver->err_handler;
-   vote = err_handler->error_detected(dev, result_data->state);
-   pci_uevent_ers(dev, PCI_ERS_RESULT_NONE);
-   }
-
-   result_data->result = merge_result(result_data->result, vote);
-   device_unlock(&dev->dev);
-   return 0;
-}
-
-static int report_mmio_enabled(struct pci_dev *dev, void *data)
-{
-   pci_ers_result_t vote;
-   const struct pci_error_handlers *err_handler;
-   struct aer_broadcast_data *result_data;
-   result_data = (struct aer_broadcast_data *) data;
-
-   device_lock(&dev->dev);
-   if (!dev->driver ||
-   !dev->driver->err_handler ||
-   !dev->driver->err_handler->mmio_enabled)
-   goto out;
-
-   err_handler = dev->driver->

[PATCH v16 2/9] pci-error-recovery: Add AER_FATAL handling

2018-05-11 Thread Oza Pawandeep
It adds description on AER_FATAL error handling.

Signed-off-by: Oza Pawandeep <p...@codeaurora.org>

diff --git a/Documentation/PCI/pci-error-recovery.txt 
b/Documentation/PCI/pci-error-recovery.txt
index 0b6bb3e..688b691 100644
--- a/Documentation/PCI/pci-error-recovery.txt
+++ b/Documentation/PCI/pci-error-recovery.txt
@@ -110,7 +110,7 @@ The actual steps taken by a platform to recover from a PCI 
error
 event will be platform-dependent, but will follow the general
 sequence described below.
 
-STEP 0: Error Event
+STEP 0: Error Event: ERR_NONFATAL
 ---
 A PCI bus error is detected by the PCI hardware.  On powerpc, the slot
 is isolated, in that all I/O is blocked: all reads return 0xffffffff,
@@ -228,13 +228,7 @@ proceeds to either STEP3 (Link Reset) or to STEP 5 (Resume 
Operations).
 If any driver returned PCI_ERS_RESULT_NEED_RESET, then the platform
 proceeds to STEP 4 (Slot Reset)
 
-STEP 3: Link Reset
---
-The platform resets the link.  This is a PCI-Express specific step
-and is done whenever a fatal error has been detected that can be
-"solved" by resetting the link.
-
-STEP 4: Slot Reset
+STEP 3: Slot Reset
 --
 
 In response to a return value of PCI_ERS_RESULT_NEED_RESET, the
@@ -320,7 +314,7 @@ Failure).
 >>> However, it probably should.
 
 
-STEP 5: Resume Operations
+STEP 4: Resume Operations
 -
 The platform will call the resume() callback on all affected device
 drivers if all drivers on the segment have returned
@@ -332,7 +326,7 @@ a result code.
 At this point, if a new error happens, the platform will restart
 a new error recovery sequence.
 
-STEP 6: Permanent Failure
+STEP 5: Permanent Failure
 -
 A "permanent failure" has occurred, and the platform cannot recover
 the device.  The platform will call error_detected() with a
@@ -355,6 +349,27 @@ errors. See the discussion in 
powerpc/eeh-pci-error-recovery.txt
 for additional detail on real-life experience of the causes of
 software errors.
 
+STEP 0: Error Event: ERR_FATAL
+---
+PCI bus error is detected by the PCI hardware. On powerpc, the slot is
+isolated, in that all I/O is blocked: all reads return 0xffffffff, all
+writes are ignored.
+
+STEP 1: Remove devices
+
+Platform removes the devices depending on the error agent, it could be
+this port for all subordinates or upstream component (likely downstream
+port)
+
+STEP 2: Reset link
+
+The platform resets the link.  This is a PCI-Express specific step and is
+done whenever a fatal error has been detected that can be "solved" by
+resetting the link.
+
+STEP 3: Re-enumerate the devices
+
+Initiates the re-enumeration.
 
 Conclusion; General Remarks
 ---
-- 
2.7.4



[PATCH v16 2/9] pci-error-recovery: Add AER_FATAL handling

2018-05-11 Thread Oza Pawandeep
It adds description on AER_FATAL error handling.

Signed-off-by: Oza Pawandeep 

diff --git a/Documentation/PCI/pci-error-recovery.txt 
b/Documentation/PCI/pci-error-recovery.txt
index 0b6bb3e..688b691 100644
--- a/Documentation/PCI/pci-error-recovery.txt
+++ b/Documentation/PCI/pci-error-recovery.txt
@@ -110,7 +110,7 @@ The actual steps taken by a platform to recover from a PCI 
error
 event will be platform-dependent, but will follow the general
 sequence described below.
 
-STEP 0: Error Event
+STEP 0: Error Event: ERR_NONFATAL
 ---
 A PCI bus error is detected by the PCI hardware.  On powerpc, the slot
 is isolated, in that all I/O is blocked: all reads return 0xffffffff,
@@ -228,13 +228,7 @@ proceeds to either STEP3 (Link Reset) or to STEP 5 (Resume 
Operations).
 If any driver returned PCI_ERS_RESULT_NEED_RESET, then the platform
 proceeds to STEP 4 (Slot Reset)
 
-STEP 3: Link Reset
---
-The platform resets the link.  This is a PCI-Express specific step
-and is done whenever a fatal error has been detected that can be
-"solved" by resetting the link.
-
-STEP 4: Slot Reset
+STEP 3: Slot Reset
 --
 
 In response to a return value of PCI_ERS_RESULT_NEED_RESET, the
@@ -320,7 +314,7 @@ Failure).
 >>> However, it probably should.
 
 
-STEP 5: Resume Operations
+STEP 4: Resume Operations
 -
 The platform will call the resume() callback on all affected device
 drivers if all drivers on the segment have returned
@@ -332,7 +326,7 @@ a result code.
 At this point, if a new error happens, the platform will restart
 a new error recovery sequence.
 
-STEP 6: Permanent Failure
+STEP 5: Permanent Failure
 -
 A "permanent failure" has occurred, and the platform cannot recover
 the device.  The platform will call error_detected() with a
@@ -355,6 +349,27 @@ errors. See the discussion in 
powerpc/eeh-pci-error-recovery.txt
 for additional detail on real-life experience of the causes of
 software errors.
 
+STEP 0: Error Event: ERR_FATAL
+---
+PCI bus error is detected by the PCI hardware. On powerpc, the slot is
+isolated, in that all I/O is blocked: all reads return 0xffffffff, all
+writes are ignored.
+
+STEP 1: Remove devices
+
+Platform removes the devices depending on the error agent, it could be
+this port for all subordinates or upstream component (likely downstream
+port)
+
+STEP 2: Reset link
+
+The platform resets the link.  This is a PCI-Express specific step and is
+done whenever a fatal error has been detected that can be "solved" by
+resetting the link.
+
+STEP 3: Re-enumerate the devices
+
+Initiates the re-enumeration.
 
 Conclusion; General Remarks
 ---
-- 
2.7.4



[PATCH v16 0/9] Address error and recovery for AER and DPC

2018-05-11 Thread Oza Pawandeep
This patch set brings in error handling support for DPC

The current implementation of AER and error message broadcasting to the
EP driver is tightly coupled and limited to AER service driver.
It is important to factor out broadcasting and other link handling
callbacks. So that not only when AER gets triggered, but also when DPC get
triggered (for e.g. ERR_FATAL), callbacks are handled appropriately.

The goal of the patch-set is:
DPC should handle the error handling and recovery similar to AER, because 
finally both are attempting recovery in some or the other way,
and for that error handling and recovery framework has to be loosely
coupled.

It achieves uniformity and transparency to the error handling agents such
as AER, DPC, with respect to recovery and error handling.

So, this patch-set tries to unify lot of things between error agents and
make them behave in a well defined way. (be it error (FATAL, NON_FATAL)
handling or recovery).

The FATAL error handling is handled with remove/reset_link/re-enumerate
sequence while the NON_FATAL follows the default path.
Documentation/PCI/pci-error-recovery.txt talks more on that.

Changes since v15:
Bjorn's comments addressed
> minor comments fixed
> made FATAL sequence aligned to existing one, as far as clearing status 
are concerned.
> pcie_do_fatal_recovery and pcie_do_nonfatal_recovery functions made to 
modularize
> pcie_do_fatal_recovery now takes service as an argument
Changes since v14:
Bjorn's comments addressed
> simplified the patch set, and moved AER_FATAL handling in the beginning.
> rebase the code to 4.17-rc1.
Changes since v13:
Bjorn's comments addressed
> handle FATAL errors with remove devices followed by re-enumeration.
> changes in AER and DPC along with required Documentation.
Changes since v12:
Bjorn's and Keith's Comments addressed.
> Made DPC and AER error handling identical 
> handled cases for hotplug enabled system differently.
Changes since v11:
Bjorn's comments addressed.
> rename pcie-err.c to err.c
> removed EXPORT_SYMBOL
> made generic find_service function in port driver.
> removed mutex patch as no need to have mutex in pcie_do_recovery
> brought in DPC_FATAL in aer.h
> so now all the error codes (AER and DPC) are unified in aer.h
Changes since v10:
Christoph Hellwig's, David Laight's and Randy Dunlap's
comments addressed.
> renamed pci_do_recovery to pcie_do_recovery
> removed inner braces in conditional statements.
> restructured the code in pci_wait_for_link
> EXPORT_SYMBOL_GPL
Changes since v9:
Sinan's comments addressed.
> bool active = true; unnecessary variable removed.
Changes since v8:
Fixed Kbuild errors.
Changes since v7:
Rebased the code on pci master
> https://kernel.googlesource.com/pub/scm/linux/kernel/git/helgaas/pci
Changes since v6:
Sinan's and Stefan's comments implemented.
> reordered patch 6 and 7
> cleaned up
Changes since v5:
Sinan's and Keith's comments incorporated.
> made separate patch for mutex
> unified error reporting codes into driver/pci/pci.h
> got rid of wait link active/inactive and
  made generic function in driver/pci/pci.c
Changes since v4:
Bjorn's comments incorporated.
> Renamed only do_recovery.
> moved the things more locally to drivers/pci/pci.h
Changes since v3:
Bjorn's comments incorporated.
> Made separate patch renaming generic pci_err.c
> Introduce pci_err.h to contain all the error types and recovery
> removed all the dependencies on pci.h
Changes since v2:
Based on feedback from Keith:
"
When DPC is triggered due to receipt of an uncorrectable error Message,
the Requester ID from the Message is recorded in the DPC Error
Source ID register and that Message is discarded and not forwarded Upstream.
"
Removed the patch where AER checks if DPC service is active
Changes since v1:
Kbuild errors fixed:
> pci_find_dpc_dev made static
> ras_event.h updated
    > pci_find_aer_service call with CONFIG check
> pci_find_dpc_service call with CONFIG check

Oza Pawandeep (9):
  PCI: Unify wait for link active into generic PCI
  pci-error-recovery: Add AER_FATAL handling
  PCI/AER: Handle ERR_FATAL with removal and re-enumeration of devices
  PCI/AER: Rename error recovery to generic PCI naming
  PCI/AER: Factor out error reporting from AER
  PCI/PORTDRV: Implement generic find service
  PCI/PORTDRV: Implement generic find device
  PCI/DPC: Unify and plumb error handling into DPC
  PCI/DPC: Disable ERR_NONFATAL and enable ERR_FATAL for DPC

 Documentation/PCI/pci-error-recovery.txt |  35 ++-
 drivers/pci/hotplug/pciehp_hpc.c |  20 +-
 drivers/p

[PATCH v16 0/9] Address error and recovery for AER and DPC

2018-05-11 Thread Oza Pawandeep
This patch set brings in error handling support for DPC

The current implementation of AER and error message broadcasting to the
EP driver is tightly coupled and limited to AER service driver.
It is important to factor out broadcasting and other link handling
callbacks. So that not only when AER gets triggered, but also when DPC get
triggered (for e.g. ERR_FATAL), callbacks are handled appropriately.

The goal of the patch-set is:
DPC should handle the error handling and recovery similar to AER, because 
finally both are attempting recovery in some or the other way,
and for that error handling and recovery framework has to be loosely
coupled.

It achieves uniformity and transparency to the error handling agents such
as AER, DPC, with respect to recovery and error handling.

So, this patch-set tries to unify lot of things between error agents and
make them behave in a well defined way. (be it error (FATAL, NON_FATAL)
handling or recovery).

The FATAL error handling is handled with remove/reset_link/re-enumerate
sequence while the NON_FATAL follows the default path.
Documentation/PCI/pci-error-recovery.txt talks more on that.

Changes since v15:
Bjorn's comments addressed
> minor comments fixed
> made FATAL sequence aligned to existing one, as far as clearing status 
are concerned.
> pcie_do_fatal_recovery and pcie_do_nonfatal_recovery functions made to 
modularize
> pcie_do_fatal_recovery now takes service as an argument
Changes since v14:
Bjorn's comments addressed
> simplified the patch set, and moved AER_FATAL handling in the beginning.
> rebase the code to 4.17-rc1.
Changes since v13:
Bjorn's comments addressed
> handle FATAL errors with remove devices followed by re-enumeration.
> changes in AER and DPC along with required Documentation.
Changes since v12:
Bjorn's and Keith's Comments addressed.
> Made DPC and AER error handling identical 
> handled cases for hotplug enabled system differently.
Changes since v11:
Bjorn's comments addressed.
> rename pcie-err.c to err.c
> removed EXPORT_SYMBOL
> made generic find_service function in port driver.
> removed mutex patch as no need to have mutex in pcie_do_recovery
> brought in DPC_FATAL in aer.h
> so now all the error codes (AER and DPC) are unified in aer.h
Changes since v10:
Christoph Hellwig's, David Laight's and Randy Dunlap's
comments addressed.
> renamed pci_do_recovery to pcie_do_recovery
> removed inner braces in conditional statements.
> restructured the code in pci_wait_for_link
> EXPORT_SYMBOL_GPL
Changes since v9:
Sinan's comments addressed.
> bool active = true; unnecessary variable removed.
Changes since v8:
Fixed Kbuild errors.
Changes since v7:
Rebased the code on pci master
> https://kernel.googlesource.com/pub/scm/linux/kernel/git/helgaas/pci
Changes since v6:
Sinan's and Stefan's comments implemented.
> reordered patch 6 and 7
> cleaned up
Changes since v5:
Sinan's and Keith's comments incorporated.
> made separate patch for mutex
> unified error reporting codes into driver/pci/pci.h
> got rid of wait link active/inactive and
  made generic function in driver/pci/pci.c
Changes since v4:
Bjorn's comments incorporated.
> Renamed only do_recovery.
> moved the things more locally to drivers/pci/pci.h
Changes since v3:
Bjorn's comments incorporated.
> Made separate patch renaming generic pci_err.c
> Introduce pci_err.h to contain all the error types and recovery
> removed all the dependencies on pci.h
Changes since v2:
Based on feedback from Keith:
"
When DPC is triggered due to receipt of an uncorrectable error Message,
the Requester ID from the Message is recorded in the DPC Error
Source ID register and that Message is discarded and not forwarded Upstream.
"
Removed the patch where AER checks if DPC service is active
Changes since v1:
Kbuild errors fixed:
> pci_find_dpc_dev made static
> ras_event.h updated
    > pci_find_aer_service call with CONFIG check
> pci_find_dpc_service call with CONFIG check

Oza Pawandeep (9):
  PCI: Unify wait for link active into generic PCI
  pci-error-recovery: Add AER_FATAL handling
  PCI/AER: Handle ERR_FATAL with removal and re-enumeration of devices
  PCI/AER: Rename error recovery to generic PCI naming
  PCI/AER: Factor out error reporting from AER
  PCI/PORTDRV: Implement generic find service
  PCI/PORTDRV: Implement generic find device
  PCI/DPC: Unify and plumb error handling into DPC
  PCI/DPC: Disable ERR_NONFATAL and enable ERR_FATAL for DPC

 Documentation/PCI/pci-error-recovery.txt |  35 ++-
 drivers/pci/hotplug/pciehp_hpc.c |  20 +-
 drivers/p

[PATCH v16 1/9] PCI: Unify wait for link active into generic PCI

2018-05-11 Thread Oza Pawandeep
Clients such as HP, DPC are using pcie_wait_link_active(), which waits
till the link becomes active or inactive.

Made generic function and moved it to drivers/pci/pci.c

Signed-off-by: Oza Pawandeep <p...@codeaurora.org>
Reviewed-by: Keith Busch <keith.bu...@intel.com>

diff --git a/drivers/pci/hotplug/pciehp_hpc.c b/drivers/pci/hotplug/pciehp_hpc.c
index 18a42f8..e0c2b8e 100644
--- a/drivers/pci/hotplug/pciehp_hpc.c
+++ b/drivers/pci/hotplug/pciehp_hpc.c
@@ -231,25 +231,11 @@ bool pciehp_check_link_active(struct controller *ctrl)
return ret;
 }
 
-static void __pcie_wait_link_active(struct controller *ctrl, bool active)
-{
-   int timeout = 1000;
-
-   if (pciehp_check_link_active(ctrl) == active)
-   return;
-   while (timeout > 0) {
-   msleep(10);
-   timeout -= 10;
-   if (pciehp_check_link_active(ctrl) == active)
-   return;
-   }
-   ctrl_dbg(ctrl, "Data Link Layer Link Active not %s in 1000 msec\n",
-   active ? "set" : "cleared");
-}
-
 static void pcie_wait_link_active(struct controller *ctrl)
 {
-   __pcie_wait_link_active(ctrl, true);
+   struct pci_dev *pdev = ctrl_dev(ctrl);
+
+   pcie_wait_for_link(pdev, true);
 }
 
 static bool pci_bus_check_dev(struct pci_bus *bus, int devfn)
diff --git a/drivers/pci/pci.c b/drivers/pci/pci.c
index e597655..adfc553 100644
--- a/drivers/pci/pci.c
+++ b/drivers/pci/pci.c
@@ -4138,6 +4138,35 @@ static int pci_pm_reset(struct pci_dev *dev, int probe)
 
return pci_dev_wait(dev, "PM D3->D0", PCIE_RESET_READY_POLL_MS);
 }
+/**
+ * pcie_wait_for_link - Wait for link till it's active?/inactive?
+ * @pdev: Bridge device
+ * @active: waiting for active or inactive ?
+ *
+ * Use this to wait till link becomes active or inactive.
+ */
+bool pcie_wait_for_link(struct pci_dev *pdev, bool active)
+{
+   int timeout = 1000;
+   bool ret;
+   u16 lnk_status;
+
+   for (;;) {
+   pcie_capability_read_word(pdev, PCI_EXP_LNKSTA, &lnk_status);
+   ret = !!(lnk_status & PCI_EXP_LNKSTA_DLLLA);
+   if (ret == active)
+   return true;
+   if (timeout <= 0)
+   break;
+   msleep(10);
+   timeout -= 10;
+   }
+
+   pci_info(pdev, "Data Link Layer Link Active not %s in 1000 msec\n",
+active ? "set" : "cleared");
+
+   return false;
+}
 
 void pci_reset_secondary_bus(struct pci_dev *dev)
 {
diff --git a/drivers/pci/pci.h b/drivers/pci/pci.h
index 023f7cf..cec9d8c 100644
--- a/drivers/pci/pci.h
+++ b/drivers/pci/pci.h
@@ -353,6 +353,7 @@ static inline resource_size_t pci_resource_alignment(struct 
pci_dev *dev,
 
 void pci_enable_acs(struct pci_dev *dev);
 
+bool pcie_wait_for_link(struct pci_dev *pdev, bool active);
 #ifdef CONFIG_PCIEASPM
 void pcie_aspm_init_link_state(struct pci_dev *pdev);
 void pcie_aspm_exit_link_state(struct pci_dev *pdev);
diff --git a/drivers/pci/pcie/dpc.c b/drivers/pci/pcie/dpc.c
index 8c57d60..80ec384 100644
--- a/drivers/pci/pcie/dpc.c
+++ b/drivers/pci/pcie/dpc.c
@@ -68,19 +68,9 @@ static int dpc_wait_rp_inactive(struct dpc_dev *dpc)
 
 static void dpc_wait_link_inactive(struct dpc_dev *dpc)
 {
-   unsigned long timeout = jiffies + HZ;
struct pci_dev *pdev = dpc->dev->port;
-   struct device *dev = &dpc->dev->device;
-   u16 lnk_status;
 
-   pcie_capability_read_word(pdev, PCI_EXP_LNKSTA, &lnk_status);
-   while (lnk_status & PCI_EXP_LNKSTA_DLLLA &&
-   !time_after(jiffies, timeout)) {
-   msleep(10);
-   pcie_capability_read_word(pdev, PCI_EXP_LNKSTA, &lnk_status);
-   }
-   if (lnk_status & PCI_EXP_LNKSTA_DLLLA)
-   dev_warn(dev, "Link state not disabled for DPC event\n");
+   pcie_wait_for_link(pdev, false);
 }
 
 static void dpc_work(struct work_struct *work)
-- 
2.7.4



[PATCH v16 9/9] PCI/DPC: Disable ERR_NONFATAL and enable ERR_FATAL for DPC

2018-05-11 Thread Oza Pawandeep
This patch disables ERR_NONFATAL trigger for DPC, so now DPC
handles only ERR_FATAL.

Signed-off-by: Oza Pawandeep <p...@codeaurora.org>

diff --git a/drivers/pci/pcie/dpc.c b/drivers/pci/pcie/dpc.c
index 5680c13..358b4324 100644
--- a/drivers/pci/pcie/dpc.c
+++ b/drivers/pci/pcie/dpc.c
@@ -273,7 +273,7 @@ static int dpc_probe(struct pcie_device *dev)
}
}
 
-   ctl = (ctl & 0xfff4) | PCI_EXP_DPC_CTL_EN_NONFATAL | 
PCI_EXP_DPC_CTL_INT_EN;
+   ctl = (ctl & 0xfff4) | PCI_EXP_DPC_CTL_EN_FATAL | 
PCI_EXP_DPC_CTL_INT_EN;
pci_write_config_word(pdev, dpc->cap_pos + PCI_EXP_DPC_CTL, ctl);
 
dev_info(device, "DPC error containment capabilities: Int Msg #%d, 
RPExt%c PoisonedTLP%c SwTrigger%c RP PIO Log %d, DL_ActiveErr%c\n",
@@ -291,7 +291,7 @@ static void dpc_remove(struct pcie_device *dev)
u16 ctl;
 
pci_read_config_word(pdev, dpc->cap_pos + PCI_EXP_DPC_CTL, &ctl);
-   ctl &= ~(PCI_EXP_DPC_CTL_EN_NONFATAL | PCI_EXP_DPC_CTL_INT_EN);
+   ctl &= ~(PCI_EXP_DPC_CTL_EN_FATAL | PCI_EXP_DPC_CTL_INT_EN);
pci_write_config_word(pdev, dpc->cap_pos + PCI_EXP_DPC_CTL, ctl);
 }
 
diff --git a/include/uapi/linux/pci_regs.h b/include/uapi/linux/pci_regs.h
index 103ba79..5182e0d 100644
--- a/include/uapi/linux/pci_regs.h
+++ b/include/uapi/linux/pci_regs.h
@@ -981,6 +981,7 @@
 #define  PCI_EXP_DPC_CAP_DL_ACTIVE 0x1000  /* ERR_COR signal on DL_Active 
supported */
 
 #define PCI_EXP_DPC_CTL6   /* DPC control */
+#define  PCI_EXP_DPC_CTL_EN_FATAL  0x0001  /* Enable trigger on ERR_FATAL 
message */
 #define  PCI_EXP_DPC_CTL_EN_NONFATAL   0x0002  /* Enable trigger on 
ERR_NONFATAL message */
 #define  PCI_EXP_DPC_CTL_INT_EN0x0008  /* DPC Interrupt Enable */
 
-- 
2.7.4



[PATCH v16 9/9] PCI/DPC: Disable ERR_NONFATAL and enable ERR_FATAL for DPC

2018-05-11 Thread Oza Pawandeep
This patch disables ERR_NONFATAL trigger for DPC, so now DPC
handles only ERR_FATAL.

Signed-off-by: Oza Pawandeep 

diff --git a/drivers/pci/pcie/dpc.c b/drivers/pci/pcie/dpc.c
index 5680c13..358b4324 100644
--- a/drivers/pci/pcie/dpc.c
+++ b/drivers/pci/pcie/dpc.c
@@ -273,7 +273,7 @@ static int dpc_probe(struct pcie_device *dev)
}
}
 
-   ctl = (ctl & 0xfff4) | PCI_EXP_DPC_CTL_EN_NONFATAL | 
PCI_EXP_DPC_CTL_INT_EN;
+   ctl = (ctl & 0xfff4) | PCI_EXP_DPC_CTL_EN_FATAL | 
PCI_EXP_DPC_CTL_INT_EN;
pci_write_config_word(pdev, dpc->cap_pos + PCI_EXP_DPC_CTL, ctl);
 
dev_info(device, "DPC error containment capabilities: Int Msg #%d, 
RPExt%c PoisonedTLP%c SwTrigger%c RP PIO Log %d, DL_ActiveErr%c\n",
@@ -291,7 +291,7 @@ static void dpc_remove(struct pcie_device *dev)
u16 ctl;
 
pci_read_config_word(pdev, dpc->cap_pos + PCI_EXP_DPC_CTL, &ctl);
-   ctl &= ~(PCI_EXP_DPC_CTL_EN_NONFATAL | PCI_EXP_DPC_CTL_INT_EN);
+   ctl &= ~(PCI_EXP_DPC_CTL_EN_FATAL | PCI_EXP_DPC_CTL_INT_EN);
pci_write_config_word(pdev, dpc->cap_pos + PCI_EXP_DPC_CTL, ctl);
 }
 
diff --git a/include/uapi/linux/pci_regs.h b/include/uapi/linux/pci_regs.h
index 103ba79..5182e0d 100644
--- a/include/uapi/linux/pci_regs.h
+++ b/include/uapi/linux/pci_regs.h
@@ -981,6 +981,7 @@
 #define  PCI_EXP_DPC_CAP_DL_ACTIVE 0x1000  /* ERR_COR signal on DL_Active 
supported */
 
 #define PCI_EXP_DPC_CTL6   /* DPC control */
+#define  PCI_EXP_DPC_CTL_EN_FATAL  0x0001  /* Enable trigger on ERR_FATAL 
message */
 #define  PCI_EXP_DPC_CTL_EN_NONFATAL   0x0002  /* Enable trigger on 
ERR_NONFATAL message */
 #define  PCI_EXP_DPC_CTL_INT_EN0x0008  /* DPC Interrupt Enable */
 
-- 
2.7.4



[PATCH v16 1/9] PCI: Unify wait for link active into generic PCI

2018-05-11 Thread Oza Pawandeep
Clients such as HP, DPC are using pcie_wait_link_active(), which waits
till the link becomes active or inactive.

Made generic function and moved it to drivers/pci/pci.c

Signed-off-by: Oza Pawandeep 
Reviewed-by: Keith Busch 

diff --git a/drivers/pci/hotplug/pciehp_hpc.c b/drivers/pci/hotplug/pciehp_hpc.c
index 18a42f8..e0c2b8e 100644
--- a/drivers/pci/hotplug/pciehp_hpc.c
+++ b/drivers/pci/hotplug/pciehp_hpc.c
@@ -231,25 +231,11 @@ bool pciehp_check_link_active(struct controller *ctrl)
return ret;
 }
 
-static void __pcie_wait_link_active(struct controller *ctrl, bool active)
-{
-   int timeout = 1000;
-
-   if (pciehp_check_link_active(ctrl) == active)
-   return;
-   while (timeout > 0) {
-   msleep(10);
-   timeout -= 10;
-   if (pciehp_check_link_active(ctrl) == active)
-   return;
-   }
-   ctrl_dbg(ctrl, "Data Link Layer Link Active not %s in 1000 msec\n",
-   active ? "set" : "cleared");
-}
-
 static void pcie_wait_link_active(struct controller *ctrl)
 {
-   __pcie_wait_link_active(ctrl, true);
+   struct pci_dev *pdev = ctrl_dev(ctrl);
+
+   pcie_wait_for_link(pdev, true);
 }
 
 static bool pci_bus_check_dev(struct pci_bus *bus, int devfn)
diff --git a/drivers/pci/pci.c b/drivers/pci/pci.c
index e597655..adfc553 100644
--- a/drivers/pci/pci.c
+++ b/drivers/pci/pci.c
@@ -4138,6 +4138,35 @@ static int pci_pm_reset(struct pci_dev *dev, int probe)
 
return pci_dev_wait(dev, "PM D3->D0", PCIE_RESET_READY_POLL_MS);
 }
+/**
+ * pcie_wait_for_link - Wait for link till it's active?/inactive?
+ * @pdev: Bridge device
+ * @active: waiting for active or inactive ?
+ *
+ * Use this to wait till link becomes active or inactive.
+ */
+bool pcie_wait_for_link(struct pci_dev *pdev, bool active)
+{
+   int timeout = 1000;
+   bool ret;
+   u16 lnk_status;
+
+   for (;;) {
+   pcie_capability_read_word(pdev, PCI_EXP_LNKSTA, &lnk_status);
+   ret = !!(lnk_status & PCI_EXP_LNKSTA_DLLLA);
+   if (ret == active)
+   return true;
+   if (timeout <= 0)
+   break;
+   msleep(10);
+   timeout -= 10;
+   }
+
+   pci_info(pdev, "Data Link Layer Link Active not %s in 1000 msec\n",
+active ? "set" : "cleared");
+
+   return false;
+}
 
 void pci_reset_secondary_bus(struct pci_dev *dev)
 {
diff --git a/drivers/pci/pci.h b/drivers/pci/pci.h
index 023f7cf..cec9d8c 100644
--- a/drivers/pci/pci.h
+++ b/drivers/pci/pci.h
@@ -353,6 +353,7 @@ static inline resource_size_t pci_resource_alignment(struct 
pci_dev *dev,
 
 void pci_enable_acs(struct pci_dev *dev);
 
+bool pcie_wait_for_link(struct pci_dev *pdev, bool active);
 #ifdef CONFIG_PCIEASPM
 void pcie_aspm_init_link_state(struct pci_dev *pdev);
 void pcie_aspm_exit_link_state(struct pci_dev *pdev);
diff --git a/drivers/pci/pcie/dpc.c b/drivers/pci/pcie/dpc.c
index 8c57d60..80ec384 100644
--- a/drivers/pci/pcie/dpc.c
+++ b/drivers/pci/pcie/dpc.c
@@ -68,19 +68,9 @@ static int dpc_wait_rp_inactive(struct dpc_dev *dpc)
 
 static void dpc_wait_link_inactive(struct dpc_dev *dpc)
 {
-   unsigned long timeout = jiffies + HZ;
struct pci_dev *pdev = dpc->dev->port;
-   struct device *dev = &dpc->dev->device;
-   u16 lnk_status;
 
-   pcie_capability_read_word(pdev, PCI_EXP_LNKSTA, &lnk_status);
-   while (lnk_status & PCI_EXP_LNKSTA_DLLLA &&
-   !time_after(jiffies, timeout)) {
-   msleep(10);
-   pcie_capability_read_word(pdev, PCI_EXP_LNKSTA, &lnk_status);
-   }
-   if (lnk_status & PCI_EXP_LNKSTA_DLLLA)
-   dev_warn(dev, "Link state not disabled for DPC event\n");
+   pcie_wait_for_link(pdev, false);
 }
 
 static void dpc_work(struct work_struct *work)
-- 
2.7.4



[PATCH v16 7/9] PCI/PORTDRV: Implement generic find device

2018-05-11 Thread Oza Pawandeep
This patch implements generic pcie_port_find_device() routine.

Signed-off-by: Oza Pawandeep <p...@codeaurora.org>
Reviewed-by: Keith Busch <keith.bu...@intel.com>

diff --git a/drivers/pci/pcie/portdrv.h b/drivers/pci/pcie/portdrv.h
index ba6c963..896608a 100644
--- a/drivers/pci/pcie/portdrv.h
+++ b/drivers/pci/pcie/portdrv.h
@@ -114,4 +114,6 @@ static inline void pcie_pme_interrupt_enable(struct pci_dev 
*dev, bool en) {}
 
 struct pcie_port_service_driver *pcie_port_find_service(struct pci_dev *dev,
u32 service);
+struct device *pcie_port_find_device(struct pci_dev *dev,
+u32 service);
 #endif /* _PORTDRV_H_ */
diff --git a/drivers/pci/pcie/portdrv_core.c b/drivers/pci/pcie/portdrv_core.c
index d843055..bc2c337 100644
--- a/drivers/pci/pcie/portdrv_core.c
+++ b/drivers/pci/pcie/portdrv_core.c
@@ -20,6 +20,7 @@
 #include "portdrv.h"
 struct portdrv_service_data {
struct pcie_port_service_driver *drv;
+   struct device *dev;
u32 service;
 };
 
@@ -415,6 +416,7 @@ static int find_service_iter(struct device *device, void 
*data)
service_driver = to_service_driver(device->driver);
if (service_driver->service == service) {
pdrvs->drv = service_driver;
+   pdrvs->dev = device;
return 1;
}
}
@@ -443,6 +445,27 @@ struct pcie_port_service_driver 
*pcie_port_find_service(struct pci_dev *dev,
 }
 
 /**
+ * pcie_port_find_device - find the struct device
+ * @dev: PCI Express port the service devices associated with
+ * @service: For the service to find
+ *
+ * Find the struct device of the PCI Express port service matching @service
+ */
+struct device *pcie_port_find_device(struct pci_dev *dev,
+ u32 service)
+{
+   struct device *device;
+   struct portdrv_service_data pdrvs;
+
+   pdrvs.dev = NULL;
+   pdrvs.service = service;
+   device_for_each_child(&dev->dev, &pdrvs, find_service_iter);
+
+   device = pdrvs.dev;
+   return device;
+}
+
+/**
  * pcie_port_device_remove - unregister PCI Express port service devices
  * @dev: PCI Express port the service devices to unregister are associated with
  *
-- 
2.7.4



[PATCH v16 7/9] PCI/PORTDRV: Implement generic find device

2018-05-11 Thread Oza Pawandeep
This patch implements generic pcie_port_find_device() routine.

Signed-off-by: Oza Pawandeep 
Reviewed-by: Keith Busch 

diff --git a/drivers/pci/pcie/portdrv.h b/drivers/pci/pcie/portdrv.h
index ba6c963..896608a 100644
--- a/drivers/pci/pcie/portdrv.h
+++ b/drivers/pci/pcie/portdrv.h
@@ -114,4 +114,6 @@ static inline void pcie_pme_interrupt_enable(struct pci_dev 
*dev, bool en) {}
 
 struct pcie_port_service_driver *pcie_port_find_service(struct pci_dev *dev,
u32 service);
+struct device *pcie_port_find_device(struct pci_dev *dev,
+u32 service);
 #endif /* _PORTDRV_H_ */
diff --git a/drivers/pci/pcie/portdrv_core.c b/drivers/pci/pcie/portdrv_core.c
index d843055..bc2c337 100644
--- a/drivers/pci/pcie/portdrv_core.c
+++ b/drivers/pci/pcie/portdrv_core.c
@@ -20,6 +20,7 @@
 #include "portdrv.h"
 struct portdrv_service_data {
struct pcie_port_service_driver *drv;
+   struct device *dev;
u32 service;
 };
 
@@ -415,6 +416,7 @@ static int find_service_iter(struct device *device, void 
*data)
service_driver = to_service_driver(device->driver);
if (service_driver->service == service) {
pdrvs->drv = service_driver;
+   pdrvs->dev = device;
return 1;
}
}
@@ -443,6 +445,27 @@ struct pcie_port_service_driver 
*pcie_port_find_service(struct pci_dev *dev,
 }
 
 /**
+ * pcie_port_find_device - find the struct device
+ * @dev: PCI Express port the service devices associated with
+ * @service: For the service to find
+ *
+ * Find the struct device of the PCI Express port service matching @service
+ */
+struct device *pcie_port_find_device(struct pci_dev *dev,
+ u32 service)
+{
+   struct device *device;
+   struct portdrv_service_data pdrvs;
+
+   pdrvs.dev = NULL;
+   pdrvs.service = service;
+   device_for_each_child(&dev->dev, &pdrvs, find_service_iter);
+
+   device = pdrvs.dev;
+   return device;
+}
+
+/**
  * pcie_port_device_remove - unregister PCI Express port service devices
  * @dev: PCI Express port the service devices to unregister are associated with
  *
-- 
2.7.4



[PATCH v15 5/9] PCI/AER: Factor out error reporting from AER

2018-05-02 Thread Oza Pawandeep
This patch factors out error reporting callbacks, which are currently
tightly coupled with AER.

DPC should be able to register callbacks and attempt recovery when DPC
trigger event occurs.

Signed-off-by: Oza Pawandeep <p...@codeaurora.org>

diff --git a/drivers/pci/pcie/Makefile b/drivers/pci/pcie/Makefile
index 800e1d4..03f4e0b 100644
--- a/drivers/pci/pcie/Makefile
+++ b/drivers/pci/pcie/Makefile
@@ -2,7 +2,7 @@
 #
 # Makefile for PCI Express features and port driver
 
-pcieportdrv-y  := portdrv_core.o portdrv_pci.o
+pcieportdrv-y  := portdrv_core.o portdrv_pci.o err.o
 
 obj-$(CONFIG_PCIEPORTBUS)  += pcieportdrv.o
 
diff --git a/drivers/pci/pcie/aer/aerdrv.h b/drivers/pci/pcie/aer/aerdrv.h
index 08b4584..b4c9506 100644
--- a/drivers/pci/pcie/aer/aerdrv.h
+++ b/drivers/pci/pcie/aer/aerdrv.h
@@ -76,36 +76,6 @@ struct aer_rpc {
 */
 };
 
-struct aer_broadcast_data {
-   enum pci_channel_state state;
-   enum pci_ers_result result;
-};
-
-static inline pci_ers_result_t merge_result(enum pci_ers_result orig,
-   enum pci_ers_result new)
-{
-   if (new == PCI_ERS_RESULT_NO_AER_DRIVER)
-   return PCI_ERS_RESULT_NO_AER_DRIVER;
-
-   if (new == PCI_ERS_RESULT_NONE)
-   return orig;
-
-   switch (orig) {
-   case PCI_ERS_RESULT_CAN_RECOVER:
-   case PCI_ERS_RESULT_RECOVERED:
-   orig = new;
-   break;
-   case PCI_ERS_RESULT_DISCONNECT:
-   if (new == PCI_ERS_RESULT_NEED_RESET)
-   orig = PCI_ERS_RESULT_NEED_RESET;
-   break;
-   default:
-   break;
-   }
-
-   return orig;
-}
-
 extern struct bus_type pcie_port_bus_type;
 void aer_isr(struct work_struct *work);
 void aer_print_error(struct pci_dev *dev, struct aer_err_info *info);
diff --git a/drivers/pci/pcie/aer/aerdrv_core.c 
b/drivers/pci/pcie/aer/aerdrv_core.c
index be4ee3b..51515d1 100644
--- a/drivers/pci/pcie/aer/aerdrv_core.c
+++ b/drivers/pci/pcie/aer/aerdrv_core.c
@@ -228,191 +228,6 @@ static bool find_source_device(struct pci_dev *parent,
return true;
 }
 
-static int report_error_detected(struct pci_dev *dev, void *data)
-{
-   pci_ers_result_t vote;
-   const struct pci_error_handlers *err_handler;
-   struct aer_broadcast_data *result_data;
-   result_data = (struct aer_broadcast_data *) data;
-
-   device_lock(&dev->dev);
-   dev->error_state = result_data->state;
-
-   if (!dev->driver ||
-   !dev->driver->err_handler ||
-   !dev->driver->err_handler->error_detected) {
-   if (result_data->state == pci_channel_io_frozen &&
-   dev->hdr_type != PCI_HEADER_TYPE_BRIDGE) {
-   /*
-* In case of fatal recovery, if one of down-
-* stream device has no driver. We might be
-* unable to recover because a later insmod
-* of a driver for this device is unaware of
-* its hw state.
-*/
-   pci_printk(KERN_DEBUG, dev, "device has %s\n",
-  dev->driver ?
-  "no AER-aware driver" : "no driver");
-   }
-
-   /*
-* If there's any device in the subtree that does not
-* have an error_detected callback, returning
-* PCI_ERS_RESULT_NO_AER_DRIVER prevents calling of
-* the subsequent mmio_enabled/slot_reset/resume
-* callbacks of "any" device in the subtree. All the
-* devices in the subtree are left in the error state
-* without recovery.
-*/
-
-   if (dev->hdr_type != PCI_HEADER_TYPE_BRIDGE)
-   vote = PCI_ERS_RESULT_NO_AER_DRIVER;
-   else
-   vote = PCI_ERS_RESULT_NONE;
-   } else {
-   err_handler = dev->driver->err_handler;
-   vote = err_handler->error_detected(dev, result_data->state);
-   pci_uevent_ers(dev, PCI_ERS_RESULT_NONE);
-   }
-
-   result_data->result = merge_result(result_data->result, vote);
-   device_unlock(&dev->dev);
-   return 0;
-}
-
-static int report_mmio_enabled(struct pci_dev *dev, void *data)
-{
-   pci_ers_result_t vote;
-   const struct pci_error_handlers *err_handler;
-   struct aer_broadcast_data *result_data;
-   result_data = (struct aer_broadcast_data *) data;
-
-   device_lock(&dev->dev);
-   if (!dev->driver ||
-   !dev->driver->err_handler ||
-   !dev->driver->err_handler->mmio_enabled)
-   goto out;
-
-   err_ha

[PATCH v15 5/9] PCI/AER: Factor out error reporting from AER

2018-05-02 Thread Oza Pawandeep
This patch factors out error reporting callbacks, which are currently
tightly coupled with AER.

DPC should be able to register callbacks and attempt recovery when DPC
trigger event occurs.

Signed-off-by: Oza Pawandeep 

diff --git a/drivers/pci/pcie/Makefile b/drivers/pci/pcie/Makefile
index 800e1d4..03f4e0b 100644
--- a/drivers/pci/pcie/Makefile
+++ b/drivers/pci/pcie/Makefile
@@ -2,7 +2,7 @@
 #
 # Makefile for PCI Express features and port driver
 
-pcieportdrv-y  := portdrv_core.o portdrv_pci.o
+pcieportdrv-y  := portdrv_core.o portdrv_pci.o err.o
 
 obj-$(CONFIG_PCIEPORTBUS)  += pcieportdrv.o
 
diff --git a/drivers/pci/pcie/aer/aerdrv.h b/drivers/pci/pcie/aer/aerdrv.h
index 08b4584..b4c9506 100644
--- a/drivers/pci/pcie/aer/aerdrv.h
+++ b/drivers/pci/pcie/aer/aerdrv.h
@@ -76,36 +76,6 @@ struct aer_rpc {
 */
 };
 
-struct aer_broadcast_data {
-   enum pci_channel_state state;
-   enum pci_ers_result result;
-};
-
-static inline pci_ers_result_t merge_result(enum pci_ers_result orig,
-   enum pci_ers_result new)
-{
-   if (new == PCI_ERS_RESULT_NO_AER_DRIVER)
-   return PCI_ERS_RESULT_NO_AER_DRIVER;
-
-   if (new == PCI_ERS_RESULT_NONE)
-   return orig;
-
-   switch (orig) {
-   case PCI_ERS_RESULT_CAN_RECOVER:
-   case PCI_ERS_RESULT_RECOVERED:
-   orig = new;
-   break;
-   case PCI_ERS_RESULT_DISCONNECT:
-   if (new == PCI_ERS_RESULT_NEED_RESET)
-   orig = PCI_ERS_RESULT_NEED_RESET;
-   break;
-   default:
-   break;
-   }
-
-   return orig;
-}
-
 extern struct bus_type pcie_port_bus_type;
 void aer_isr(struct work_struct *work);
 void aer_print_error(struct pci_dev *dev, struct aer_err_info *info);
diff --git a/drivers/pci/pcie/aer/aerdrv_core.c 
b/drivers/pci/pcie/aer/aerdrv_core.c
index be4ee3b..51515d1 100644
--- a/drivers/pci/pcie/aer/aerdrv_core.c
+++ b/drivers/pci/pcie/aer/aerdrv_core.c
@@ -228,191 +228,6 @@ static bool find_source_device(struct pci_dev *parent,
return true;
 }
 
-static int report_error_detected(struct pci_dev *dev, void *data)
-{
-   pci_ers_result_t vote;
-   const struct pci_error_handlers *err_handler;
-   struct aer_broadcast_data *result_data;
-   result_data = (struct aer_broadcast_data *) data;
-
-   device_lock(&dev->dev);
-   dev->error_state = result_data->state;
-
-   if (!dev->driver ||
-   !dev->driver->err_handler ||
-   !dev->driver->err_handler->error_detected) {
-   if (result_data->state == pci_channel_io_frozen &&
-   dev->hdr_type != PCI_HEADER_TYPE_BRIDGE) {
-   /*
-* In case of fatal recovery, if one of down-
-* stream device has no driver. We might be
-* unable to recover because a later insmod
-* of a driver for this device is unaware of
-* its hw state.
-*/
-   pci_printk(KERN_DEBUG, dev, "device has %s\n",
-  dev->driver ?
-  "no AER-aware driver" : "no driver");
-   }
-
-   /*
-* If there's any device in the subtree that does not
-* have an error_detected callback, returning
-* PCI_ERS_RESULT_NO_AER_DRIVER prevents calling of
-* the subsequent mmio_enabled/slot_reset/resume
-* callbacks of "any" device in the subtree. All the
-* devices in the subtree are left in the error state
-* without recovery.
-*/
-
-   if (dev->hdr_type != PCI_HEADER_TYPE_BRIDGE)
-   vote = PCI_ERS_RESULT_NO_AER_DRIVER;
-   else
-   vote = PCI_ERS_RESULT_NONE;
-   } else {
-   err_handler = dev->driver->err_handler;
-   vote = err_handler->error_detected(dev, result_data->state);
-   pci_uevent_ers(dev, PCI_ERS_RESULT_NONE);
-   }
-
-   result_data->result = merge_result(result_data->result, vote);
-   device_unlock(&dev->dev);
-   return 0;
-}
-
-static int report_mmio_enabled(struct pci_dev *dev, void *data)
-{
-   pci_ers_result_t vote;
-   const struct pci_error_handlers *err_handler;
-   struct aer_broadcast_data *result_data;
-   result_data = (struct aer_broadcast_data *) data;
-
-   device_lock(&dev->dev);
-   if (!dev->driver ||
-   !dev->driver->err_handler ||
-   !dev->driver->err_handler->mmio_enabled)
-   goto out;
-
-   err_handler = dev->driver->

[PATCH v15 6/9] PCI/PORTDRV: Implement generic find service

2018-05-02 Thread Oza Pawandeep
This patch implements generic pcie_port_find_service() routine.

Signed-off-by: Oza Pawandeep <p...@codeaurora.org>
Reviewed-by: Keith Busch <keith.bu...@intel.com>

diff --git a/drivers/pci/pcie/aer/aerdrv_core.c 
b/drivers/pci/pcie/aer/aerdrv_core.c
index 51515d1..a525296 100644
--- a/drivers/pci/pcie/aer/aerdrv_core.c
+++ b/drivers/pci/pcie/aer/aerdrv_core.c
@@ -228,32 +228,6 @@ static bool find_source_device(struct pci_dev *parent,
return true;
 }
 
-static int find_aer_service_iter(struct device *device, void *data)
-{
-   struct pcie_port_service_driver *service_driver, **drv;
-
-   drv = (struct pcie_port_service_driver **) data;
-
-   if (device->bus == &pcie_port_bus_type && device->driver) {
-   service_driver = to_service_driver(device->driver);
-   if (service_driver->service == PCIE_PORT_SERVICE_AER) {
-   *drv = service_driver;
-   return 1;
-   }
-   }
-
-   return 0;
-}
-
-struct pcie_port_service_driver *find_aer_service(struct pci_dev *dev)
-{
-   struct pcie_port_service_driver *drv = NULL;
-
-   device_for_each_child(&dev->dev, &drv, find_aer_service_iter);
-
-   return drv;
-}
-
 /**
  * handle_error_source - handle logging error into an event log
  * @aerdev: pointer to pcie_device data structure of the root port
diff --git a/drivers/pci/pcie/err.c b/drivers/pci/pcie/err.c
index 55df974..877785d 100644
--- a/drivers/pci/pcie/err.c
+++ b/drivers/pci/pcie/err.c
@@ -195,10 +195,8 @@ static pci_ers_result_t reset_link(struct pci_dev *dev)
udev = dev->bus->self;
}
 
-#if IS_ENABLED(CONFIG_PCIEAER)
/* Use the aer driver of the component firstly */
-   driver = find_aer_service(udev);
-#endif
+   driver = pcie_port_find_service(udev, PCIE_PORT_SERVICE_AER);
 
if (driver && driver->reset_link) {
status = driver->reset_link(udev);
diff --git a/drivers/pci/pcie/portdrv.h b/drivers/pci/pcie/portdrv.h
index 47c9824..ba6c963 100644
--- a/drivers/pci/pcie/portdrv.h
+++ b/drivers/pci/pcie/portdrv.h
@@ -112,5 +112,6 @@ static inline bool pcie_pme_no_msi(void) { return false; }
 static inline void pcie_pme_interrupt_enable(struct pci_dev *dev, bool en) {}
 #endif /* !CONFIG_PCIE_PME */
 
-struct pcie_port_service_driver *find_aer_service(struct pci_dev *dev);
+struct pcie_port_service_driver *pcie_port_find_service(struct pci_dev *dev,
+   u32 service);
 #endif /* _PORTDRV_H_ */
diff --git a/drivers/pci/pcie/portdrv_core.c b/drivers/pci/pcie/portdrv_core.c
index c9c0663..d843055 100644
--- a/drivers/pci/pcie/portdrv_core.c
+++ b/drivers/pci/pcie/portdrv_core.c
@@ -18,6 +18,10 @@
 
 #include "../pci.h"
 #include "portdrv.h"
+struct portdrv_service_data {
+   struct pcie_port_service_driver *drv;
+   u32 service;
+};
 
 /**
  * release_pcie_device - free PCI Express port service device structure
@@ -398,6 +402,46 @@ static int remove_iter(struct device *dev, void *data)
return 0;
 }
 
+static int find_service_iter(struct device *device, void *data)
+{
+   struct pcie_port_service_driver *service_driver;
+   struct portdrv_service_data *pdrvs;
+   u32 service;
+
+   pdrvs = (struct portdrv_service_data *) data;
+   service = pdrvs->service;
+
+   if (device->bus == &pcie_port_bus_type && device->driver) {
+   service_driver = to_service_driver(device->driver);
+   if (service_driver->service == service) {
+   pdrvs->drv = service_driver;
+   return 1;
+   }
+   }
+
+   return 0;
+}
+/**
+ * pcie_port_find_service - find the service driver
+ * @dev: PCI Express port the service devices associated with
+ * @service: Service to find
+ *
+ * Find PCI Express port service driver associated with given service
+ */
+struct pcie_port_service_driver *pcie_port_find_service(struct pci_dev *dev,
+   u32 service)
+{
+   struct pcie_port_service_driver *drv;
+   struct portdrv_service_data pdrvs;
+
+   pdrvs.drv = NULL;
+   pdrvs.service = service;
+   device_for_each_child(&dev->dev, &pdrvs, find_service_iter);
+
+   drv = pdrvs.drv;
+   return drv;
+}
+
 /**
  * pcie_port_device_remove - unregister PCI Express port service devices
  * @dev: PCI Express port the service devices to unregister are associated with
-- 
2.7.4



[PATCH v15 6/9] PCI/PORTDRV: Implement generic find service

2018-05-02 Thread Oza Pawandeep
This patch implements generic pcie_port_find_service() routine.

Signed-off-by: Oza Pawandeep 
Reviewed-by: Keith Busch 

diff --git a/drivers/pci/pcie/aer/aerdrv_core.c 
b/drivers/pci/pcie/aer/aerdrv_core.c
index 51515d1..a525296 100644
--- a/drivers/pci/pcie/aer/aerdrv_core.c
+++ b/drivers/pci/pcie/aer/aerdrv_core.c
@@ -228,32 +228,6 @@ static bool find_source_device(struct pci_dev *parent,
return true;
 }
 
-static int find_aer_service_iter(struct device *device, void *data)
-{
-   struct pcie_port_service_driver *service_driver, **drv;
-
-   drv = (struct pcie_port_service_driver **) data;
-
-   if (device->bus == &pcie_port_bus_type && device->driver) {
-   service_driver = to_service_driver(device->driver);
-   if (service_driver->service == PCIE_PORT_SERVICE_AER) {
-   *drv = service_driver;
-   return 1;
-   }
-   }
-
-   return 0;
-}
-
-struct pcie_port_service_driver *find_aer_service(struct pci_dev *dev)
-{
-   struct pcie_port_service_driver *drv = NULL;
-
-   device_for_each_child(&dev->dev, &drv, find_aer_service_iter);
-
-   return drv;
-}
-
 /**
  * handle_error_source - handle logging error into an event log
  * @aerdev: pointer to pcie_device data structure of the root port
diff --git a/drivers/pci/pcie/err.c b/drivers/pci/pcie/err.c
index 55df974..877785d 100644
--- a/drivers/pci/pcie/err.c
+++ b/drivers/pci/pcie/err.c
@@ -195,10 +195,8 @@ static pci_ers_result_t reset_link(struct pci_dev *dev)
udev = dev->bus->self;
}
 
-#if IS_ENABLED(CONFIG_PCIEAER)
/* Use the aer driver of the component firstly */
-   driver = find_aer_service(udev);
-#endif
+   driver = pcie_port_find_service(udev, PCIE_PORT_SERVICE_AER);
 
if (driver && driver->reset_link) {
status = driver->reset_link(udev);
diff --git a/drivers/pci/pcie/portdrv.h b/drivers/pci/pcie/portdrv.h
index 47c9824..ba6c963 100644
--- a/drivers/pci/pcie/portdrv.h
+++ b/drivers/pci/pcie/portdrv.h
@@ -112,5 +112,6 @@ static inline bool pcie_pme_no_msi(void) { return false; }
 static inline void pcie_pme_interrupt_enable(struct pci_dev *dev, bool en) {}
 #endif /* !CONFIG_PCIE_PME */
 
-struct pcie_port_service_driver *find_aer_service(struct pci_dev *dev);
+struct pcie_port_service_driver *pcie_port_find_service(struct pci_dev *dev,
+   u32 service);
 #endif /* _PORTDRV_H_ */
diff --git a/drivers/pci/pcie/portdrv_core.c b/drivers/pci/pcie/portdrv_core.c
index c9c0663..d843055 100644
--- a/drivers/pci/pcie/portdrv_core.c
+++ b/drivers/pci/pcie/portdrv_core.c
@@ -18,6 +18,10 @@
 
 #include "../pci.h"
 #include "portdrv.h"
+struct portdrv_service_data {
+   struct pcie_port_service_driver *drv;
+   u32 service;
+};
 
 /**
  * release_pcie_device - free PCI Express port service device structure
@@ -398,6 +402,46 @@ static int remove_iter(struct device *dev, void *data)
return 0;
 }
 
+static int find_service_iter(struct device *device, void *data)
+{
+   struct pcie_port_service_driver *service_driver;
+   struct portdrv_service_data *pdrvs;
+   u32 service;
+
+   pdrvs = (struct portdrv_service_data *) data;
+   service = pdrvs->service;
+
+   if (device->bus == &pcie_port_bus_type && device->driver) {
+   service_driver = to_service_driver(device->driver);
+   if (service_driver->service == service) {
+   pdrvs->drv = service_driver;
+   return 1;
+   }
+   }
+
+   return 0;
+}
+/**
+ * pcie_port_find_service - find the service driver
+ * @dev: PCI Express port the service devices associated with
+ * @service: Service to find
+ *
+ * Find PCI Express port service driver associated with given service
+ */
+struct pcie_port_service_driver *pcie_port_find_service(struct pci_dev *dev,
+   u32 service)
+{
+   struct pcie_port_service_driver *drv;
+   struct portdrv_service_data pdrvs;
+
+   pdrvs.drv = NULL;
+   pdrvs.service = service;
+   device_for_each_child(&dev->dev, &pdrvs, find_service_iter);
+
+   drv = pdrvs.drv;
+   return drv;
+}
+
 /**
  * pcie_port_device_remove - unregister PCI Express port service devices
  * @dev: PCI Express port the service devices to unregister are associated with
-- 
2.7.4



[PATCH v15 7/9] PCI/PORTDRV: Implement generic find device

2018-05-02 Thread Oza Pawandeep
This patch implements generic pcie_port_find_device() routine.

Signed-off-by: Oza Pawandeep <p...@codeaurora.org>
Reviewed-by: Keith Busch <keith.bu...@intel.com>

diff --git a/drivers/pci/pcie/portdrv.h b/drivers/pci/pcie/portdrv.h
index ba6c963..896608a 100644
--- a/drivers/pci/pcie/portdrv.h
+++ b/drivers/pci/pcie/portdrv.h
@@ -114,4 +114,6 @@ static inline void pcie_pme_interrupt_enable(struct pci_dev 
*dev, bool en) {}
 
 struct pcie_port_service_driver *pcie_port_find_service(struct pci_dev *dev,
u32 service);
+struct device *pcie_port_find_device(struct pci_dev *dev,
+u32 service);
 #endif /* _PORTDRV_H_ */
diff --git a/drivers/pci/pcie/portdrv_core.c b/drivers/pci/pcie/portdrv_core.c
index d843055..c6147c4 100644
--- a/drivers/pci/pcie/portdrv_core.c
+++ b/drivers/pci/pcie/portdrv_core.c
@@ -20,6 +20,7 @@
 #include "portdrv.h"
 struct portdrv_service_data {
struct pcie_port_service_driver *drv;
+   struct device *dev;
u32 service;
 };
 
@@ -415,6 +416,7 @@ static int find_service_iter(struct device *device, void 
*data)
service_driver = to_service_driver(device->driver);
if (service_driver->service == service) {
pdrvs->drv = service_driver;
+   pdrvs->dev = device;
return 1;
}
}
@@ -443,6 +445,27 @@ struct pcie_port_service_driver 
*pcie_port_find_service(struct pci_dev *dev,
 }
 
 /**
+ * pcie_port_find_device - find the struct device
+ * @dev: PCI Express port the service devices associated with
+ * @service: For the service to find
+ *
+ * Find the struct device of the PCI Express port service matching @service
+ */
+struct  device *pcie_port_find_device(struct pci_dev *dev,
+ u32 service)
+{
+   struct device *device;
+   struct portdrv_service_data pdrvs;
+
+   pdrvs.dev = NULL;
+   pdrvs.service = service;
+   device_for_each_child(&dev->dev, &pdrvs, find_service_iter);
+
+   device = pdrvs.dev;
+   return device;
+}
+
+/**
  * pcie_port_device_remove - unregister PCI Express port service devices
  * @dev: PCI Express port the service devices to unregister are associated with
  *
-- 
2.7.4



[PATCH v15 7/9] PCI/PORTDRV: Implement generic find device

2018-05-02 Thread Oza Pawandeep
This patch implements generic pcie_port_find_device() routine.

Signed-off-by: Oza Pawandeep 
Reviewed-by: Keith Busch 

diff --git a/drivers/pci/pcie/portdrv.h b/drivers/pci/pcie/portdrv.h
index ba6c963..896608a 100644
--- a/drivers/pci/pcie/portdrv.h
+++ b/drivers/pci/pcie/portdrv.h
@@ -114,4 +114,6 @@ static inline void pcie_pme_interrupt_enable(struct pci_dev 
*dev, bool en) {}
 
 struct pcie_port_service_driver *pcie_port_find_service(struct pci_dev *dev,
u32 service);
+struct device *pcie_port_find_device(struct pci_dev *dev,
+u32 service);
 #endif /* _PORTDRV_H_ */
diff --git a/drivers/pci/pcie/portdrv_core.c b/drivers/pci/pcie/portdrv_core.c
index d843055..c6147c4 100644
--- a/drivers/pci/pcie/portdrv_core.c
+++ b/drivers/pci/pcie/portdrv_core.c
@@ -20,6 +20,7 @@
 #include "portdrv.h"
 struct portdrv_service_data {
struct pcie_port_service_driver *drv;
+   struct device *dev;
u32 service;
 };
 
@@ -415,6 +416,7 @@ static int find_service_iter(struct device *device, void 
*data)
service_driver = to_service_driver(device->driver);
if (service_driver->service == service) {
pdrvs->drv = service_driver;
+   pdrvs->dev = device;
return 1;
}
}
@@ -443,6 +445,27 @@ struct pcie_port_service_driver 
*pcie_port_find_service(struct pci_dev *dev,
 }
 
 /**
+ * pcie_port_find_device - find the struct device
+ * @dev: PCI Express port the service devices associated with
+ * @service: For the service to find
+ *
+ * Find the struct device of the PCI Express port service matching @service
+ */
+struct  device *pcie_port_find_device(struct pci_dev *dev,
+ u32 service)
+{
+   struct device *device;
+   struct portdrv_service_data pdrvs;
+
+   pdrvs.dev = NULL;
+   pdrvs.service = service;
+   device_for_each_child(&dev->dev, &pdrvs, find_service_iter);
+
+   device = pdrvs.dev;
+   return device;
+}
+
+/**
  * pcie_port_device_remove - unregister PCI Express port service devices
  * @dev: PCI Express port the service devices to unregister are associated with
  *
-- 
2.7.4



[PATCH v15 8/9] PCI/DPC: Unify and plumb error handling into DPC

2018-05-02 Thread Oza Pawandeep
Current DPC driver does not do recovery, e.g. calling end-point's driver's
callbacks, which sanitize the sw.

DPC driver implements the reset_link callback, and calls pcie_do_recovery().

Signed-off-by: Oza Pawandeep <p...@codeaurora.org>

diff --git a/drivers/pci/pcie/dpc.c b/drivers/pci/pcie/dpc.c
index 80ec384..aed7c9f 100644
--- a/drivers/pci/pcie/dpc.c
+++ b/drivers/pci/pcie/dpc.c
@@ -73,29 +73,21 @@ static void dpc_wait_link_inactive(struct dpc_dev *dpc)
pcie_wait_for_link(pdev, false);
 }
 
-static void dpc_work(struct work_struct *work)
+static pci_ers_result_t dpc_reset_link(struct pci_dev *pdev)
 {
-   struct dpc_dev *dpc = container_of(work, struct dpc_dev, work);
-   struct pci_dev *dev, *temp, *pdev = dpc->dev->port;
-   struct pci_bus *parent = pdev->subordinate;
-   u16 cap = dpc->cap_pos, ctl;
-
-   pci_lock_rescan_remove();
-   list_for_each_entry_safe_reverse(dev, temp, &parent->devices,
-bus_list) {
-   pci_dev_get(dev);
-   pci_dev_set_disconnected(dev, NULL);
-   if (pci_has_subordinate(dev))
-   pci_walk_bus(dev->subordinate,
-pci_dev_set_disconnected, NULL);
-   pci_stop_and_remove_bus_device(dev);
-   pci_dev_put(dev);
-   }
-   pci_unlock_rescan_remove();
+   struct dpc_dev *dpc;
+   struct pcie_device *pciedev;
+   struct device *devdpc;
+   u16 cap, ctl;
+
+   devdpc = pcie_port_find_device(pdev, PCIE_PORT_SERVICE_DPC);
+   pciedev = to_pcie_device(devdpc);
+   dpc = get_service_data(pciedev);
+   cap = dpc->cap_pos;
 
dpc_wait_link_inactive(dpc);
if (dpc->rp_extensions && dpc_wait_rp_inactive(dpc))
-   return;
+   return PCI_ERS_RESULT_DISCONNECT;
if (dpc->rp_extensions && dpc->rp_pio_status) {
pci_write_config_dword(pdev, cap + PCI_EXP_DPC_RP_PIO_STATUS,
   dpc->rp_pio_status);
@@ -108,6 +100,17 @@ static void dpc_work(struct work_struct *work)
pci_read_config_word(pdev, cap + PCI_EXP_DPC_CTL, &ctl);
pci_write_config_word(pdev, cap + PCI_EXP_DPC_CTL,
  ctl | PCI_EXP_DPC_CTL_INT_EN);
+
+   return PCI_ERS_RESULT_RECOVERED;
+}
+
+static void dpc_work(struct work_struct *work)
+{
+   struct dpc_dev *dpc = container_of(work, struct dpc_dev, work);
+   struct pci_dev *pdev = dpc->dev->port;
+
+   /* From DPC point of view error is always FATAL. */
+   pcie_do_recovery(pdev, DPC_FATAL);
 }
 
 static void dpc_process_rp_pio_error(struct dpc_dev *dpc)
@@ -288,6 +291,7 @@ static struct pcie_port_service_driver dpcdriver = {
.service= PCIE_PORT_SERVICE_DPC,
.probe  = dpc_probe,
.remove = dpc_remove,
+   .reset_link = dpc_reset_link,
 };
 
 static int __init dpc_service_init(void)
diff --git a/drivers/pci/pcie/err.c b/drivers/pci/pcie/err.c
index 877785d..526aba8 100644
--- a/drivers/pci/pcie/err.c
+++ b/drivers/pci/pcie/err.c
@@ -181,11 +181,12 @@ static pci_ers_result_t default_reset_link(struct pci_dev 
*dev)
return PCI_ERS_RESULT_RECOVERED;
 }
 
-static pci_ers_result_t reset_link(struct pci_dev *dev)
+static pci_ers_result_t reset_link(struct pci_dev *dev, int severity)
 {
struct pci_dev *udev;
pci_ers_result_t status;
struct pcie_port_service_driver *driver;
+   u32 service;
 
if (dev->hdr_type == PCI_HEADER_TYPE_BRIDGE) {
/* Reset this port for all subordinates */
@@ -196,7 +197,12 @@ static pci_ers_result_t reset_link(struct pci_dev *dev)
}
 
/* Use the aer driver of the component firstly */
-   driver = pcie_port_find_service(udev, PCIE_PORT_SERVICE_AER);
+   if (severity == DPC_FATAL)
+   service = PCIE_PORT_SERVICE_DPC;
+   else
+   service = PCIE_PORT_SERVICE_AER;
+
+   driver = pcie_port_find_service(udev, service);
 
if (driver && driver->reset_link) {
status = driver->reset_link(udev);
@@ -302,7 +308,7 @@ static pci_ers_result_t do_fatal_recovery(struct pci_dev 
*dev, int severity)
pci_dev_put(pdev);
}
 
-   result = reset_link(udev);
+   result = reset_link(udev, severity);
if (result == PCI_ERS_RESULT_RECOVERED)
if (pcie_wait_for_link(udev, true))
pci_rescan_bus(udev->bus);
@@ -326,7 +332,8 @@ void pcie_do_recovery(struct pci_dev *dev, int severity)
pci_ers_result_t status;
enum pci_channel_state state;
 
-   if (severity == AER_FATAL) {
+   if ((severity == AER_FATAL) ||
+  (severity == DPC_FATAL)) {
status = do_fatal_recovery(dev, severity);
if (status != PCI_ERS_RESULT_RECOVERED)

[PATCH v15 8/9] PCI/DPC: Unify and plumb error handling into DPC

2018-05-02 Thread Oza Pawandeep
Current DPC driver does not do recovery, e.g. calling end-point's driver's
callbacks, which sanitize the sw.

DPC driver implements the reset_link callback, and calls pcie_do_recovery().

Signed-off-by: Oza Pawandeep 

diff --git a/drivers/pci/pcie/dpc.c b/drivers/pci/pcie/dpc.c
index 80ec384..aed7c9f 100644
--- a/drivers/pci/pcie/dpc.c
+++ b/drivers/pci/pcie/dpc.c
@@ -73,29 +73,21 @@ static void dpc_wait_link_inactive(struct dpc_dev *dpc)
pcie_wait_for_link(pdev, false);
 }
 
-static void dpc_work(struct work_struct *work)
+static pci_ers_result_t dpc_reset_link(struct pci_dev *pdev)
 {
-   struct dpc_dev *dpc = container_of(work, struct dpc_dev, work);
-   struct pci_dev *dev, *temp, *pdev = dpc->dev->port;
-   struct pci_bus *parent = pdev->subordinate;
-   u16 cap = dpc->cap_pos, ctl;
-
-   pci_lock_rescan_remove();
-   list_for_each_entry_safe_reverse(dev, temp, &parent->devices,
-bus_list) {
-   pci_dev_get(dev);
-   pci_dev_set_disconnected(dev, NULL);
-   if (pci_has_subordinate(dev))
-   pci_walk_bus(dev->subordinate,
-pci_dev_set_disconnected, NULL);
-   pci_stop_and_remove_bus_device(dev);
-   pci_dev_put(dev);
-   }
-   pci_unlock_rescan_remove();
+   struct dpc_dev *dpc;
+   struct pcie_device *pciedev;
+   struct device *devdpc;
+   u16 cap, ctl;
+
+   devdpc = pcie_port_find_device(pdev, PCIE_PORT_SERVICE_DPC);
+   pciedev = to_pcie_device(devdpc);
+   dpc = get_service_data(pciedev);
+   cap = dpc->cap_pos;
 
dpc_wait_link_inactive(dpc);
if (dpc->rp_extensions && dpc_wait_rp_inactive(dpc))
-   return;
+   return PCI_ERS_RESULT_DISCONNECT;
if (dpc->rp_extensions && dpc->rp_pio_status) {
pci_write_config_dword(pdev, cap + PCI_EXP_DPC_RP_PIO_STATUS,
   dpc->rp_pio_status);
@@ -108,6 +100,17 @@ static void dpc_work(struct work_struct *work)
pci_read_config_word(pdev, cap + PCI_EXP_DPC_CTL, &ctl);
pci_write_config_word(pdev, cap + PCI_EXP_DPC_CTL,
  ctl | PCI_EXP_DPC_CTL_INT_EN);
+
+   return PCI_ERS_RESULT_RECOVERED;
+}
+
+static void dpc_work(struct work_struct *work)
+{
+   struct dpc_dev *dpc = container_of(work, struct dpc_dev, work);
+   struct pci_dev *pdev = dpc->dev->port;
+
+   /* From DPC point of view error is always FATAL. */
+   pcie_do_recovery(pdev, DPC_FATAL);
 }
 
 static void dpc_process_rp_pio_error(struct dpc_dev *dpc)
@@ -288,6 +291,7 @@ static struct pcie_port_service_driver dpcdriver = {
.service= PCIE_PORT_SERVICE_DPC,
.probe  = dpc_probe,
.remove = dpc_remove,
+   .reset_link = dpc_reset_link,
 };
 
 static int __init dpc_service_init(void)
diff --git a/drivers/pci/pcie/err.c b/drivers/pci/pcie/err.c
index 877785d..526aba8 100644
--- a/drivers/pci/pcie/err.c
+++ b/drivers/pci/pcie/err.c
@@ -181,11 +181,12 @@ static pci_ers_result_t default_reset_link(struct pci_dev 
*dev)
return PCI_ERS_RESULT_RECOVERED;
 }
 
-static pci_ers_result_t reset_link(struct pci_dev *dev)
+static pci_ers_result_t reset_link(struct pci_dev *dev, int severity)
 {
struct pci_dev *udev;
pci_ers_result_t status;
struct pcie_port_service_driver *driver;
+   u32 service;
 
if (dev->hdr_type == PCI_HEADER_TYPE_BRIDGE) {
/* Reset this port for all subordinates */
@@ -196,7 +197,12 @@ static pci_ers_result_t reset_link(struct pci_dev *dev)
}
 
/* Use the aer driver of the component firstly */
-   driver = pcie_port_find_service(udev, PCIE_PORT_SERVICE_AER);
+   if (severity == DPC_FATAL)
+   service = PCIE_PORT_SERVICE_DPC;
+   else
+   service = PCIE_PORT_SERVICE_AER;
+
+   driver = pcie_port_find_service(udev, service);
 
if (driver && driver->reset_link) {
status = driver->reset_link(udev);
@@ -302,7 +308,7 @@ static pci_ers_result_t do_fatal_recovery(struct pci_dev 
*dev, int severity)
pci_dev_put(pdev);
}
 
-   result = reset_link(udev);
+   result = reset_link(udev, severity);
if (result == PCI_ERS_RESULT_RECOVERED)
if (pcie_wait_for_link(udev, true))
pci_rescan_bus(udev->bus);
@@ -326,7 +332,8 @@ void pcie_do_recovery(struct pci_dev *dev, int severity)
pci_ers_result_t status;
enum pci_channel_state state;
 
-   if (severity == AER_FATAL) {
+   if ((severity == AER_FATAL) ||
+  (severity == DPC_FATAL)) {
status = do_fatal_recovery(dev, severity);
if (status != PCI_ERS_RESULT_RECOVERED)
goto f

[PATCH v15 2/9] pci-error-recovery: Add AER_FATAL handling

2018-05-02 Thread Oza Pawandeep
Add a description of AER_FATAL error handling.

Signed-off-by: Oza Pawandeep <p...@codeaurora.org>

diff --git a/Documentation/PCI/pci-error-recovery.txt 
b/Documentation/PCI/pci-error-recovery.txt
index 0b6bb3e..688b691 100644
--- a/Documentation/PCI/pci-error-recovery.txt
+++ b/Documentation/PCI/pci-error-recovery.txt
@@ -110,7 +110,7 @@ The actual steps taken by a platform to recover from a PCI 
error
 event will be platform-dependent, but will follow the general
 sequence described below.
 
-STEP 0: Error Event
+STEP 0: Error Event: ERR_NONFATAL
 ---
 A PCI bus error is detected by the PCI hardware.  On powerpc, the slot
 is isolated, in that all I/O is blocked: all reads return 0xffffffff,
@@ -228,13 +228,7 @@ proceeds to either STEP3 (Link Reset) or to STEP 5 (Resume 
Operations).
 If any driver returned PCI_ERS_RESULT_NEED_RESET, then the platform
 proceeds to STEP 4 (Slot Reset)
 
-STEP 3: Link Reset
---
-The platform resets the link.  This is a PCI-Express specific step
-and is done whenever a fatal error has been detected that can be
-"solved" by resetting the link.
-
-STEP 4: Slot Reset
+STEP 3: Slot Reset
 --
 
 In response to a return value of PCI_ERS_RESULT_NEED_RESET, the
@@ -320,7 +314,7 @@ Failure).
 >>> However, it probably should.
 
 
-STEP 5: Resume Operations
+STEP 4: Resume Operations
 -
 The platform will call the resume() callback on all affected device
 drivers if all drivers on the segment have returned
@@ -332,7 +326,7 @@ a result code.
 At this point, if a new error happens, the platform will restart
 a new error recovery sequence.
 
-STEP 6: Permanent Failure
+STEP 5: Permanent Failure
 -
 A "permanent failure" has occurred, and the platform cannot recover
 the device.  The platform will call error_detected() with a
@@ -355,6 +349,27 @@ errors. See the discussion in 
powerpc/eeh-pci-error-recovery.txt
 for additional detail on real-life experience of the causes of
 software errors.
 
+STEP 0: Error Event: ERR_FATAL
+---
+PCI bus error is detected by the PCI hardware. On powerpc, the slot is
+isolated, in that all I/O is blocked: all reads return 0xffffffff, all
+writes are ignored.
+
+STEP 1: Remove devices
+
+Platform removes the devices depending on the error agent, it could be
+this port for all subordinates or upstream component (likely downstream
+port)
+
+STEP 2: Reset link
+
+The platform resets the link.  This is a PCI-Express specific step and is
+done whenever a fatal error has been detected that can be "solved" by
+resetting the link.
+
+STEP 3: Re-enumerate the devices
+
+Initiates the re-enumeration.
 
 Conclusion; General Remarks
 ---
-- 
2.7.4



[PATCH v15 9/9] PCI/DPC: Disable ERR_NONFATAL and enable ERR_FATAL for DPC

2018-05-02 Thread Oza Pawandeep
This patch disables ERR_NONFATAL trigger for DPC, so now DPC
handles only ERR_FATAL.

Signed-off-by: Oza Pawandeep <p...@codeaurora.org>

diff --git a/drivers/pci/pcie/dpc.c b/drivers/pci/pcie/dpc.c
index aed7c9f..6966e00 100644
--- a/drivers/pci/pcie/dpc.c
+++ b/drivers/pci/pcie/dpc.c
@@ -263,7 +263,7 @@ static int dpc_probe(struct pcie_device *dev)
}
}
 
-   ctl = (ctl & 0xfff4) | PCI_EXP_DPC_CTL_EN_NONFATAL | 
PCI_EXP_DPC_CTL_INT_EN;
+   ctl = (ctl & 0xfff4) | PCI_EXP_DPC_CTL_EN_FATAL | 
PCI_EXP_DPC_CTL_INT_EN;
pci_write_config_word(pdev, dpc->cap_pos + PCI_EXP_DPC_CTL, ctl);
 
dev_info(device, "DPC error containment capabilities: Int Msg #%d, 
RPExt%c PoisonedTLP%c SwTrigger%c RP PIO Log %d, DL_ActiveErr%c\n",
@@ -281,7 +281,7 @@ static void dpc_remove(struct pcie_device *dev)
u16 ctl;
 
pci_read_config_word(pdev, dpc->cap_pos + PCI_EXP_DPC_CTL, &ctl);
-   ctl &= ~(PCI_EXP_DPC_CTL_EN_NONFATAL | PCI_EXP_DPC_CTL_INT_EN);
+   ctl &= ~(PCI_EXP_DPC_CTL_EN_FATAL | PCI_EXP_DPC_CTL_INT_EN);
pci_write_config_word(pdev, dpc->cap_pos + PCI_EXP_DPC_CTL, ctl);
 }
 
diff --git a/include/uapi/linux/pci_regs.h b/include/uapi/linux/pci_regs.h
index 103ba79..86f1cc2 100644
--- a/include/uapi/linux/pci_regs.h
+++ b/include/uapi/linux/pci_regs.h
@@ -981,6 +981,7 @@
 #define  PCI_EXP_DPC_CAP_DL_ACTIVE 0x1000  /* ERR_COR signal on DL_Active 
supported */
 
 #define PCI_EXP_DPC_CTL6   /* DPC control */
+#define PCI_EXP_DPC_CTL_EN_FATAL   0x0001  /* Enable trigger on 
ERR_FATAL message */
 #define  PCI_EXP_DPC_CTL_EN_NONFATAL   0x0002  /* Enable trigger on 
ERR_NONFATAL message */
 #define  PCI_EXP_DPC_CTL_INT_EN0x0008  /* DPC Interrupt Enable */
 
-- 
2.7.4



[PATCH v15 2/9] pci-error-recovery: Add AER_FATAL handling

2018-05-02 Thread Oza Pawandeep
Add a description of AER_FATAL error handling.

Signed-off-by: Oza Pawandeep 

diff --git a/Documentation/PCI/pci-error-recovery.txt 
b/Documentation/PCI/pci-error-recovery.txt
index 0b6bb3e..688b691 100644
--- a/Documentation/PCI/pci-error-recovery.txt
+++ b/Documentation/PCI/pci-error-recovery.txt
@@ -110,7 +110,7 @@ The actual steps taken by a platform to recover from a PCI 
error
 event will be platform-dependent, but will follow the general
 sequence described below.
 
-STEP 0: Error Event
+STEP 0: Error Event: ERR_NONFATAL
 ---
 A PCI bus error is detected by the PCI hardware.  On powerpc, the slot
 is isolated, in that all I/O is blocked: all reads return 0xffffffff,
@@ -228,13 +228,7 @@ proceeds to either STEP3 (Link Reset) or to STEP 5 (Resume 
Operations).
 If any driver returned PCI_ERS_RESULT_NEED_RESET, then the platform
 proceeds to STEP 4 (Slot Reset)
 
-STEP 3: Link Reset
---
-The platform resets the link.  This is a PCI-Express specific step
-and is done whenever a fatal error has been detected that can be
-"solved" by resetting the link.
-
-STEP 4: Slot Reset
+STEP 3: Slot Reset
 --
 
 In response to a return value of PCI_ERS_RESULT_NEED_RESET, the
@@ -320,7 +314,7 @@ Failure).
 >>> However, it probably should.
 
 
-STEP 5: Resume Operations
+STEP 4: Resume Operations
 -
 The platform will call the resume() callback on all affected device
 drivers if all drivers on the segment have returned
@@ -332,7 +326,7 @@ a result code.
 At this point, if a new error happens, the platform will restart
 a new error recovery sequence.
 
-STEP 6: Permanent Failure
+STEP 5: Permanent Failure
 -
 A "permanent failure" has occurred, and the platform cannot recover
 the device.  The platform will call error_detected() with a
@@ -355,6 +349,27 @@ errors. See the discussion in 
powerpc/eeh-pci-error-recovery.txt
 for additional detail on real-life experience of the causes of
 software errors.
 
+STEP 0: Error Event: ERR_FATAL
+---
+PCI bus error is detected by the PCI hardware. On powerpc, the slot is
+isolated, in that all I/O is blocked: all reads return 0xffffffff, all
+writes are ignored.
+
+STEP 1: Remove devices
+
+Platform removes the devices depending on the error agent, it could be
+this port for all subordinates or upstream component (likely downstream
+port)
+
+STEP 2: Reset link
+
+The platform resets the link.  This is a PCI-Express specific step and is
+done whenever a fatal error has been detected that can be "solved" by
+resetting the link.
+
+STEP 3: Re-enumerate the devices
+
+Initiates the re-enumeration.
 
 Conclusion; General Remarks
 ---
-- 
2.7.4



[PATCH v15 9/9] PCI/DPC: Disable ERR_NONFATAL and enable ERR_FATAL for DPC

2018-05-02 Thread Oza Pawandeep
This patch disables ERR_NONFATAL trigger for DPC, so now DPC
handles only ERR_FATAL.

Signed-off-by: Oza Pawandeep 

diff --git a/drivers/pci/pcie/dpc.c b/drivers/pci/pcie/dpc.c
index aed7c9f..6966e00 100644
--- a/drivers/pci/pcie/dpc.c
+++ b/drivers/pci/pcie/dpc.c
@@ -263,7 +263,7 @@ static int dpc_probe(struct pcie_device *dev)
}
}
 
-   ctl = (ctl & 0xfff4) | PCI_EXP_DPC_CTL_EN_NONFATAL | 
PCI_EXP_DPC_CTL_INT_EN;
+   ctl = (ctl & 0xfff4) | PCI_EXP_DPC_CTL_EN_FATAL | 
PCI_EXP_DPC_CTL_INT_EN;
pci_write_config_word(pdev, dpc->cap_pos + PCI_EXP_DPC_CTL, ctl);
 
dev_info(device, "DPC error containment capabilities: Int Msg #%d, 
RPExt%c PoisonedTLP%c SwTrigger%c RP PIO Log %d, DL_ActiveErr%c\n",
@@ -281,7 +281,7 @@ static void dpc_remove(struct pcie_device *dev)
u16 ctl;
 
pci_read_config_word(pdev, dpc->cap_pos + PCI_EXP_DPC_CTL, &ctl);
-   ctl &= ~(PCI_EXP_DPC_CTL_EN_NONFATAL | PCI_EXP_DPC_CTL_INT_EN);
+   ctl &= ~(PCI_EXP_DPC_CTL_EN_FATAL | PCI_EXP_DPC_CTL_INT_EN);
pci_write_config_word(pdev, dpc->cap_pos + PCI_EXP_DPC_CTL, ctl);
 }
 
diff --git a/include/uapi/linux/pci_regs.h b/include/uapi/linux/pci_regs.h
index 103ba79..86f1cc2 100644
--- a/include/uapi/linux/pci_regs.h
+++ b/include/uapi/linux/pci_regs.h
@@ -981,6 +981,7 @@
 #define  PCI_EXP_DPC_CAP_DL_ACTIVE 0x1000  /* ERR_COR signal on DL_Active 
supported */
 
 #define PCI_EXP_DPC_CTL6   /* DPC control */
+#define PCI_EXP_DPC_CTL_EN_FATAL   0x0001  /* Enable trigger on 
ERR_FATAL message */
 #define  PCI_EXP_DPC_CTL_EN_NONFATAL   0x0002  /* Enable trigger on 
ERR_NONFATAL message */
 #define  PCI_EXP_DPC_CTL_INT_EN0x0008  /* DPC Interrupt Enable */
 
-- 
2.7.4



[PATCH v15 1/9] PCI: Unify wait for link active into generic PCI

2018-05-02 Thread Oza Pawandeep
Clients such as HP, DPC are using pcie_wait_link_active(), which waits
till the link becomes active or inactive.

Made generic function and moved it to drivers/pci/pci.c

Signed-off-by: Oza Pawandeep <p...@codeaurora.org>

diff --git a/drivers/pci/hotplug/pciehp_hpc.c b/drivers/pci/hotplug/pciehp_hpc.c
index 18a42f8..e0c2b8e 100644
--- a/drivers/pci/hotplug/pciehp_hpc.c
+++ b/drivers/pci/hotplug/pciehp_hpc.c
@@ -231,25 +231,11 @@ bool pciehp_check_link_active(struct controller *ctrl)
return ret;
 }
 
-static void __pcie_wait_link_active(struct controller *ctrl, bool active)
-{
-   int timeout = 1000;
-
-   if (pciehp_check_link_active(ctrl) == active)
-   return;
-   while (timeout > 0) {
-   msleep(10);
-   timeout -= 10;
-   if (pciehp_check_link_active(ctrl) == active)
-   return;
-   }
-   ctrl_dbg(ctrl, "Data Link Layer Link Active not %s in 1000 msec\n",
-   active ? "set" : "cleared");
-}
-
 static void pcie_wait_link_active(struct controller *ctrl)
 {
-   __pcie_wait_link_active(ctrl, true);
+   struct pci_dev *pdev = ctrl_dev(ctrl);
+
+   pcie_wait_for_link(pdev, true);
 }
 
 static bool pci_bus_check_dev(struct pci_bus *bus, int devfn)
diff --git a/drivers/pci/pci.c b/drivers/pci/pci.c
index e597655..2e4d1e4 100644
--- a/drivers/pci/pci.c
+++ b/drivers/pci/pci.c
@@ -4138,6 +4138,35 @@ static int pci_pm_reset(struct pci_dev *dev, int probe)
 
return pci_dev_wait(dev, "PM D3->D0", PCIE_RESET_READY_POLL_MS);
 }
+/**
+ * pcie_wait_for_link - Wait for link till it's active/inactive
+ * @pdev: Bridge device
+ * @active: waiting for active or inactive ?
+ *
+ * Use this to wait till link becomes active or inactive.
+ */
+bool pcie_wait_for_link(struct pci_dev *pdev, bool active)
+{
+   int timeout = 1000;
+   bool ret;
+   u16 lnk_status;
+
+   for (;;) {
+   pcie_capability_read_word(pdev, PCI_EXP_LNKSTA, &lnk_status);
+   ret = !!(lnk_status & PCI_EXP_LNKSTA_DLLLA);
+   if (ret == active)
+   return true;
+   if (timeout <= 0)
+   break;
+   msleep(10);
+   timeout -= 10;
+   }
+
+   pci_info(pdev, "Data Link Layer Link Active not %s in 1000 msec\n",
+active ? "set" : "cleared");
+
+   return false;
+}
 
 void pci_reset_secondary_bus(struct pci_dev *dev)
 {
diff --git a/drivers/pci/pci.h b/drivers/pci/pci.h
index 023f7cf..cec9d8c 100644
--- a/drivers/pci/pci.h
+++ b/drivers/pci/pci.h
@@ -353,6 +353,7 @@ static inline resource_size_t pci_resource_alignment(struct 
pci_dev *dev,
 
 void pci_enable_acs(struct pci_dev *dev);
 
+bool pcie_wait_for_link(struct pci_dev *pdev, bool active);
 #ifdef CONFIG_PCIEASPM
 void pcie_aspm_init_link_state(struct pci_dev *pdev);
 void pcie_aspm_exit_link_state(struct pci_dev *pdev);
diff --git a/drivers/pci/pcie/dpc.c b/drivers/pci/pcie/dpc.c
index 8c57d60..80ec384 100644
--- a/drivers/pci/pcie/dpc.c
+++ b/drivers/pci/pcie/dpc.c
@@ -68,19 +68,9 @@ static int dpc_wait_rp_inactive(struct dpc_dev *dpc)
 
 static void dpc_wait_link_inactive(struct dpc_dev *dpc)
 {
-   unsigned long timeout = jiffies + HZ;
struct pci_dev *pdev = dpc->dev->port;
-   struct device *dev = &dpc->dev->device;
-   u16 lnk_status;
 
-   pcie_capability_read_word(pdev, PCI_EXP_LNKSTA, &lnk_status);
-   while (lnk_status & PCI_EXP_LNKSTA_DLLLA &&
-   !time_after(jiffies, timeout)) {
-   msleep(10);
-   pcie_capability_read_word(pdev, PCI_EXP_LNKSTA, &lnk_status);
-   }
-   if (lnk_status & PCI_EXP_LNKSTA_DLLLA)
-   dev_warn(dev, "Link state not disabled for DPC event\n");
+   pcie_wait_for_link(pdev, false);
 }
 
 static void dpc_work(struct work_struct *work)
-- 
2.7.4



[PATCH v15 1/9] PCI: Unify wait for link active into generic PCI

2018-05-02 Thread Oza Pawandeep
Clients such as HP, DPC are using pcie_wait_link_active(), which waits
till the link becomes active or inactive.

Made generic function and moved it to drivers/pci/pci.c

Signed-off-by: Oza Pawandeep 

diff --git a/drivers/pci/hotplug/pciehp_hpc.c b/drivers/pci/hotplug/pciehp_hpc.c
index 18a42f8..e0c2b8e 100644
--- a/drivers/pci/hotplug/pciehp_hpc.c
+++ b/drivers/pci/hotplug/pciehp_hpc.c
@@ -231,25 +231,11 @@ bool pciehp_check_link_active(struct controller *ctrl)
return ret;
 }
 
-static void __pcie_wait_link_active(struct controller *ctrl, bool active)
-{
-   int timeout = 1000;
-
-   if (pciehp_check_link_active(ctrl) == active)
-   return;
-   while (timeout > 0) {
-   msleep(10);
-   timeout -= 10;
-   if (pciehp_check_link_active(ctrl) == active)
-   return;
-   }
-   ctrl_dbg(ctrl, "Data Link Layer Link Active not %s in 1000 msec\n",
-   active ? "set" : "cleared");
-}
-
 static void pcie_wait_link_active(struct controller *ctrl)
 {
-   __pcie_wait_link_active(ctrl, true);
+   struct pci_dev *pdev = ctrl_dev(ctrl);
+
+   pcie_wait_for_link(pdev, true);
 }
 
 static bool pci_bus_check_dev(struct pci_bus *bus, int devfn)
diff --git a/drivers/pci/pci.c b/drivers/pci/pci.c
index e597655..2e4d1e4 100644
--- a/drivers/pci/pci.c
+++ b/drivers/pci/pci.c
@@ -4138,6 +4138,35 @@ static int pci_pm_reset(struct pci_dev *dev, int probe)
 
return pci_dev_wait(dev, "PM D3->D0", PCIE_RESET_READY_POLL_MS);
 }
+/**
+ * pcie_wait_for_link - Wait for link till it's active/inactive
+ * @pdev: Bridge device
+ * @active: waiting for active or inactive ?
+ *
+ * Use this to wait till link becomes active or inactive.
+ */
+bool pcie_wait_for_link(struct pci_dev *pdev, bool active)
+{
+   int timeout = 1000;
+   bool ret;
+   u16 lnk_status;
+
+   for (;;) {
+   pcie_capability_read_word(pdev, PCI_EXP_LNKSTA, &lnk_status);
+   ret = !!(lnk_status & PCI_EXP_LNKSTA_DLLLA);
+   if (ret == active)
+   return true;
+   if (timeout <= 0)
+   break;
+   msleep(10);
+   timeout -= 10;
+   }
+
+   pci_info(pdev, "Data Link Layer Link Active not %s in 1000 msec\n",
+active ? "set" : "cleared");
+
+   return false;
+}
 
 void pci_reset_secondary_bus(struct pci_dev *dev)
 {
diff --git a/drivers/pci/pci.h b/drivers/pci/pci.h
index 023f7cf..cec9d8c 100644
--- a/drivers/pci/pci.h
+++ b/drivers/pci/pci.h
@@ -353,6 +353,7 @@ static inline resource_size_t pci_resource_alignment(struct 
pci_dev *dev,
 
 void pci_enable_acs(struct pci_dev *dev);
 
+bool pcie_wait_for_link(struct pci_dev *pdev, bool active);
 #ifdef CONFIG_PCIEASPM
 void pcie_aspm_init_link_state(struct pci_dev *pdev);
 void pcie_aspm_exit_link_state(struct pci_dev *pdev);
diff --git a/drivers/pci/pcie/dpc.c b/drivers/pci/pcie/dpc.c
index 8c57d60..80ec384 100644
--- a/drivers/pci/pcie/dpc.c
+++ b/drivers/pci/pcie/dpc.c
@@ -68,19 +68,9 @@ static int dpc_wait_rp_inactive(struct dpc_dev *dpc)
 
 static void dpc_wait_link_inactive(struct dpc_dev *dpc)
 {
-   unsigned long timeout = jiffies + HZ;
struct pci_dev *pdev = dpc->dev->port;
-   struct device *dev = &dpc->dev->device;
-   u16 lnk_status;
 
-   pcie_capability_read_word(pdev, PCI_EXP_LNKSTA, &lnk_status);
-   while (lnk_status & PCI_EXP_LNKSTA_DLLLA &&
-   !time_after(jiffies, timeout)) {
-   msleep(10);
-   pcie_capability_read_word(pdev, PCI_EXP_LNKSTA, &lnk_status);
-   }
-   if (lnk_status & PCI_EXP_LNKSTA_DLLLA)
-   dev_warn(dev, "Link state not disabled for DPC event\n");
+   pcie_wait_for_link(pdev, false);
 }
 
 static void dpc_work(struct work_struct *work)
-- 
2.7.4



[PATCH v15 3/9] PCI/AER: Handle ERR_FATAL with removal and re-enumeration of devices

2018-05-02 Thread Oza Pawandeep
This patch alters the behavior of handling of ERR_FATAL, where removal
of devices is initiated, followed by reset link, followed by
re-enumeration.

So the errors are handled in a different way as follows:
ERR_NONFATAL => call driver recovery entry points
ERR_FATAL    => remove and re-enumerate

please refer to Documentation/PCI/pci-error-recovery.txt for more details.

Signed-off-by: Oza Pawandeep <p...@codeaurora.org>

diff --git a/drivers/pci/pcie/aer/aerdrv.c b/drivers/pci/pcie/aer/aerdrv.c
index 779b387..206f590 100644
--- a/drivers/pci/pcie/aer/aerdrv.c
+++ b/drivers/pci/pcie/aer/aerdrv.c
@@ -330,6 +330,13 @@ static pci_ers_result_t aer_root_reset(struct pci_dev *dev)
reg32 |= ROOT_PORT_INTR_ON_MESG_MASK;
pci_write_config_dword(dev, pos + PCI_ERR_ROOT_COMMAND, reg32);
 
+   /*
+* This function is called only on ERR_FATAL now, and since
+* the pci_report_resume is called only in ERR_NONFATAL case,
+* the clearing part has to be taken care here.
+*/
+   aer_error_resume(dev);
+
return PCI_ERS_RESULT_RECOVERED;
 }
 
diff --git a/drivers/pci/pcie/aer/aerdrv_core.c 
b/drivers/pci/pcie/aer/aerdrv_core.c
index 0ea5acc..655d4e8 100644
--- a/drivers/pci/pcie/aer/aerdrv_core.c
+++ b/drivers/pci/pcie/aer/aerdrv_core.c
@@ -20,6 +20,7 @@
 #include 
 #include 
 #include "aerdrv.h"
+#include "../../pci.h"
 
 #definePCI_EXP_AER_FLAGS   (PCI_EXP_DEVCTL_CERE | 
PCI_EXP_DEVCTL_NFERE | \
 PCI_EXP_DEVCTL_FERE | PCI_EXP_DEVCTL_URRE)
@@ -474,6 +475,44 @@ static pci_ers_result_t reset_link(struct pci_dev *dev)
return status;
 }
 
+static pci_ers_result_t do_fatal_recovery(struct pci_dev *dev, int severity)
+{
+   struct pci_dev *udev;
+   struct pci_bus *parent;
+   struct pci_dev *pdev, *temp;
+   pci_ers_result_t result = PCI_ERS_RESULT_RECOVERED;
+
+   if (severity == AER_FATAL)
+   pci_cleanup_aer_uncorrect_error_status(dev);
+
+   if (dev->hdr_type == PCI_HEADER_TYPE_BRIDGE)
+   udev = dev;
+   else
+   udev = dev->bus->self;
+
+   parent = udev->subordinate;
+   pci_lock_rescan_remove();
+   list_for_each_entry_safe_reverse(pdev, temp, &parent->devices,
+bus_list) {
+   pci_dev_get(pdev);
+   pci_dev_set_disconnected(pdev, NULL);
+   if (pci_has_subordinate(pdev))
+   pci_walk_bus(pdev->subordinate,
+pci_dev_set_disconnected, NULL);
+   pci_stop_and_remove_bus_device(pdev);
+   pci_dev_put(pdev);
+   }
+
+   result = reset_link(udev);
+   if (result == PCI_ERS_RESULT_RECOVERED)
+   if (pcie_wait_for_link(udev, true))
+   pci_rescan_bus(udev->bus);
+
+   pci_unlock_rescan_remove();
+
+   return result;
+}
+
 /**
  * do_recovery - handle nonfatal/fatal error recovery process
  * @dev: pointer to a pci_dev data structure of agent detecting an error
@@ -485,11 +524,15 @@ static pci_ers_result_t reset_link(struct pci_dev *dev)
  */
 static void do_recovery(struct pci_dev *dev, int severity)
 {
-   pci_ers_result_t status, result = PCI_ERS_RESULT_RECOVERED;
+   pci_ers_result_t status;
enum pci_channel_state state;
 
-   if (severity == AER_FATAL)
-   state = pci_channel_io_frozen;
+   if (severity == AER_FATAL) {
+   status = do_fatal_recovery(dev, severity);
+   if (status != PCI_ERS_RESULT_RECOVERED)
+   goto failed;
+   return;
+   }
else
state = pci_channel_io_normal;
 
@@ -498,12 +541,6 @@ static void do_recovery(struct pci_dev *dev, int severity)
"error_detected",
report_error_detected);
 
-   if (severity == AER_FATAL) {
-   result = reset_link(dev);
-   if (result != PCI_ERS_RESULT_RECOVERED)
-   goto failed;
-   }
-
if (status == PCI_ERS_RESULT_CAN_RECOVER)
status = broadcast_error_message(dev,
state,
-- 
2.7.4



[PATCH v15 0/9] Address error and recovery for AER and DPC

2018-05-02 Thread Oza Pawandeep
This patch set brings in error handling support for DPC

The current implementation of AER and error message broadcasting to the
EP driver is tightly coupled and limited to AER service driver.
It is important to factor out broadcasting and other link handling
callbacks, so that they are handled appropriately not only when AER is
triggered but also when DPC is triggered (e.g. on ERR_FATAL).

The goal of the patch-set is:
DPC should handle the error handling and recovery similar to AER, because 
finally both are attempting recovery in some or the other way,
and for that error handling and recovery framework has to be loosely
coupled.

It achieves uniformity and transparency to the error handling agents such
as AER, DPC, with respect to recovery and error handling.

So, this patch-set tries to unify lot of things between error agents and
make them behave in a well defined way. (be it error (FATAL, NON_FATAL)
handling or recovery).

The FATAL error handling is handled with remove/reset_link/re-enumerate
sequence while the NON_FATAL follows the default path.
Documentation/PCI/pci-error-recovery.txt talks more on that.

Changes since v14:
Bjorn's comments addressed
> simplified the patch set, and moved AER_FATAL handling in the beginning.
> rebase the code to 4.17-rc1.
Changes since v13:
Bjorn's comments addressed
> handle FATAL errors by removing devices, followed by re-enumeration.
> changes in AER and DPC along with required Documentation.
Changes since v12:
Bjorn's and Keith's Comments addressed.
> Made DPC and AER error handling identical 
> handled cases for hotplug-enabled systems differently.
Changes since v11:
Bjorn's comments addressed.
> rename pcie-err.c to err.c
> removed EXPORT_SYMBOL
> made generic find_service function in port driver.
> removed mutex patch, as there is no need for a mutex in pcie_do_recovery
> brought in DPC_FATAL in aer.h
> so now all the error codes (AER and DPC) are unified in aer.h
Changes since v10:
Christoph Hellwig's, David Laight's and Randy Dunlap's
comments addressed.
> renamed pci_do_recovery to pcie_do_recovery
> removed inner braces in conditional statements.
> restructured the code in pci_wait_for_link
> EXPORT_SYMBOL_GPL
Changes since v9:
Sinan's comments addressed.
> bool active = true; unnecessary variable removed.
Changes since v8:
Fixed Kbuild errors.
Changes since v7:
Rebased the code on pci master
> https://kernel.googlesource.com/pub/scm/linux/kernel/git/helgaas/pci
Changes since v6:
Sinan's and Stefan's comments implemented.
> reordered patch 6 and 7
> cleaned up
Changes since v5:
Sinan's and Keith's comments incorporated.
> made separate patch for mutex
> unified error reporting codes into driver/pci/pci.h
> got rid of wait link active/inactive and
  made generic function in driver/pci/pci.c
Changes since v4:
Bjorn's comments incorporated.
> Renamed only do_recovery.
> moved the things more locally to drivers/pci/pci.h
Changes since v3:
Bjorn's comments incorporated.
> Made separate patch renaming generic pci_err.c
> Introduce pci_err.h to contain all the error types and recovery
> removed all the dependencies on pci.h
Changes since v2:
Based on feedback from Keith:
"
When DPC is triggered due to receipt of an uncorrectable error Message,
the Requester ID from the Message is recorded in the DPC Error
Source ID register and that Message is discarded and not forwarded Upstream.
"
Removed the patch where AER checks if DPC service is active
Changes since v1:
Kbuild errors fixed:
> pci_find_dpc_dev made static
> ras_event.h updated
> pci_find_aer_service call with CONFIG check
> pci_find_dpc_service call with CONFIG check

Oza Pawandeep (9):
  PCI: Unify wait for link active into generic PCI
  pci-error-recovery: Add AER_FATAL handling
  PCI/AER: Handle ERR_FATAL with removal and re-enumeration of devices
  PCI/AER: Rename error recovery to generic PCI naming
  PCI/AER: Factor out error reporting from AER
  PCI/PORTDRV: Implement generic find service
  PCI/PORTDRV: Implement generic find device
  PCI/DPC: Unify and plumb error handling into DPC
  PCI/DPC: Disable ERR_NONFATAL and enable ERR_FATAL for DPC

 Documentation/PCI/pci-error-recovery.txt |  35 ++-
 drivers/pci/hotplug/pciehp_hpc.c |  20 +-
 drivers/pci/pci.c|  29 +++
 drivers/pci/pci.h|   4 +
 drivers/pci/pcie/Makefile|   2 +-
 drivers/pci/pcie/aer/aerdrv.c|   2 +
 drivers/pci/pcie/aer/aerdrv.h|  30 ---
 drivers/pci/pcie/aer/aerdrv_core.c   | 317 +-

[PATCH v15 3/9] PCI/AER: Handle ERR_FATAL with removal and re-enumeration of devices

2018-05-02 Thread Oza Pawandeep
This patch alters the behavior of handling of ERR_FATAL, where removal
of devices is initiated, followed by reset link, followed by
re-enumeration.

So the errors are handled in a different way as follows:
ERR_NONFATAL => call driver recovery entry points
ERR_FATAL    => remove and re-enumerate

please refer to Documentation/PCI/pci-error-recovery.txt for more details.

Signed-off-by: Oza Pawandeep 

diff --git a/drivers/pci/pcie/aer/aerdrv.c b/drivers/pci/pcie/aer/aerdrv.c
index 779b387..206f590 100644
--- a/drivers/pci/pcie/aer/aerdrv.c
+++ b/drivers/pci/pcie/aer/aerdrv.c
@@ -330,6 +330,13 @@ static pci_ers_result_t aer_root_reset(struct pci_dev *dev)
reg32 |= ROOT_PORT_INTR_ON_MESG_MASK;
pci_write_config_dword(dev, pos + PCI_ERR_ROOT_COMMAND, reg32);
 
+   /*
+* This function is called only on ERR_FATAL now, and since
+* the pci_report_resume is called only in ERR_NONFATAL case,
+* the clearing part has to be taken care here.
+*/
+   aer_error_resume(dev);
+
return PCI_ERS_RESULT_RECOVERED;
 }
 
diff --git a/drivers/pci/pcie/aer/aerdrv_core.c 
b/drivers/pci/pcie/aer/aerdrv_core.c
index 0ea5acc..655d4e8 100644
--- a/drivers/pci/pcie/aer/aerdrv_core.c
+++ b/drivers/pci/pcie/aer/aerdrv_core.c
@@ -20,6 +20,7 @@
 #include 
 #include 
 #include "aerdrv.h"
+#include "../../pci.h"
 
 #definePCI_EXP_AER_FLAGS   (PCI_EXP_DEVCTL_CERE | 
PCI_EXP_DEVCTL_NFERE | \
 PCI_EXP_DEVCTL_FERE | PCI_EXP_DEVCTL_URRE)
@@ -474,6 +475,44 @@ static pci_ers_result_t reset_link(struct pci_dev *dev)
return status;
 }
 
+static pci_ers_result_t do_fatal_recovery(struct pci_dev *dev, int severity)
+{
+   struct pci_dev *udev;
+   struct pci_bus *parent;
+   struct pci_dev *pdev, *temp;
+   pci_ers_result_t result = PCI_ERS_RESULT_RECOVERED;
+
+   if (severity == AER_FATAL)
+   pci_cleanup_aer_uncorrect_error_status(dev);
+
+   if (dev->hdr_type == PCI_HEADER_TYPE_BRIDGE)
+   udev = dev;
+   else
+   udev = dev->bus->self;
+
+   parent = udev->subordinate;
+   pci_lock_rescan_remove();
+   list_for_each_entry_safe_reverse(pdev, temp, &parent->devices,
+bus_list) {
+   pci_dev_get(pdev);
+   pci_dev_set_disconnected(pdev, NULL);
+   if (pci_has_subordinate(pdev))
+   pci_walk_bus(pdev->subordinate,
+pci_dev_set_disconnected, NULL);
+   pci_stop_and_remove_bus_device(pdev);
+   pci_dev_put(pdev);
+   }
+
+   result = reset_link(udev);
+   if (result == PCI_ERS_RESULT_RECOVERED)
+   if (pcie_wait_for_link(udev, true))
+   pci_rescan_bus(udev->bus);
+
+   pci_unlock_rescan_remove();
+
+   return result;
+}
+
 /**
  * do_recovery - handle nonfatal/fatal error recovery process
  * @dev: pointer to a pci_dev data structure of agent detecting an error
@@ -485,11 +524,15 @@ static pci_ers_result_t reset_link(struct pci_dev *dev)
  */
 static void do_recovery(struct pci_dev *dev, int severity)
 {
-   pci_ers_result_t status, result = PCI_ERS_RESULT_RECOVERED;
+   pci_ers_result_t status;
enum pci_channel_state state;
 
-   if (severity == AER_FATAL)
-   state = pci_channel_io_frozen;
+   if (severity == AER_FATAL) {
+   status = do_fatal_recovery(dev, severity);
+   if (status != PCI_ERS_RESULT_RECOVERED)
+   goto failed;
+   return;
+   }
else
state = pci_channel_io_normal;
 
@@ -498,12 +541,6 @@ static void do_recovery(struct pci_dev *dev, int severity)
"error_detected",
report_error_detected);
 
-   if (severity == AER_FATAL) {
-   result = reset_link(dev);
-   if (result != PCI_ERS_RESULT_RECOVERED)
-   goto failed;
-   }
-
if (status == PCI_ERS_RESULT_CAN_RECOVER)
status = broadcast_error_message(dev,
state,
-- 
2.7.4



[PATCH v15 0/9] Address error and recovery for AER and DPC

2018-05-02 Thread Oza Pawandeep
This patch set brings in error handling support for DPC

The current implementation of AER and error message broadcasting to the
EP driver is tightly coupled and limited to AER service driver.
It is important to factor out broadcasting and other link handling
callbacks, so that they are handled appropriately not only when AER is
triggered but also when DPC is triggered (e.g. on ERR_FATAL).

The goal of the patch-set is:
DPC should handle the error handling and recovery similar to AER, because 
finally both are attempting recovery in some or the other way,
and for that error handling and recovery framework has to be loosely
coupled.

It achieves uniformity and transparency to the error handling agents such
as AER, DPC, with respect to recovery and error handling.

So, this patch-set tries to unify lot of things between error agents and
make them behave in a well defined way. (be it error (FATAL, NON_FATAL)
handling or recovery).

The FATAL error handling is handled with remove/reset_link/re-enumerate
sequence while the NON_FATAL follows the default path.
Documentation/PCI/pci-error-recovery.txt talks more on that.

Changes since v14:
Bjorn's comments addressed
> simplified the patch set, and moved AER_FATAL handling in the beginning.
> rebase the code to 4.17-rc1.
Changes since v13:
Bjorn's comments addressed
> handle FATAL errors by removing devices, followed by re-enumeration.
> changes in AER and DPC along with required Documentation.
Changes since v12:
Bjorn's and Keith's Comments addressed.
> Made DPC and AER error handling identical 
> handled cases for hotplug-enabled systems differently.
Changes since v11:
Bjorn's comments addressed.
> rename pcie-err.c to err.c
> removed EXPORT_SYMBOL
> made generic find_service function in port driver.
> removed mutex patch, as there is no need for a mutex in pcie_do_recovery
> brought in DPC_FATAL in aer.h
> so now all the error codes (AER and DPC) are unified in aer.h
Changes since v10:
Christoph Hellwig's, David Laight's and Randy Dunlap's
comments addressed.
> renamed pci_do_recovery to pcie_do_recovery
> removed inner braces in conditional statements.
> restructured the code in pci_wait_for_link
> EXPORT_SYMBOL_GPL
Changes since v9:
Sinan's comments addressed.
> bool active = true; unnecessary variable removed.
Changes since v8:
Fixed Kbuild errors.
Changes since v7:
Rebased the code on pci master
> https://kernel.googlesource.com/pub/scm/linux/kernel/git/helgaas/pci
Changes since v6:
Sinan's and Stefan's comments implemented.
> reordered patch 6 and 7
> cleaned up
Changes since v5:
Sinan's and Keith's comments incorporated.
> made separate patch for mutex
> unified error reporting codes into driver/pci/pci.h
> got rid of wait link active/inactive and
  made generic function in driver/pci/pci.c
Changes since v4:
Bjorn's comments incorporated.
> Renamed only do_recovery.
> moved the things more locally to drivers/pci/pci.h
Changes since v3:
Bjorn's comments incorporated.
> Made separate patch renaming generic pci_err.c
> Introduce pci_err.h to contain all the error types and recovery
> removed all the dependencies on pci.h
Changes since v2:
Based on feedback from Keith:
"
When DPC is triggered due to receipt of an uncorrectable error Message,
the Requester ID from the Message is recorded in the DPC Error
Source ID register and that Message is discarded and not forwarded Upstream.
"
Removed the patch where AER checks if DPC service is active
Changes since v1:
Kbuild errors fixed:
> pci_find_dpc_dev made static
> ras_event.h updated
> pci_find_aer_service call with CONFIG check
> pci_find_dpc_service call with CONFIG check

Oza Pawandeep (9):
  PCI: Unify wait for link active into generic PCI
  pci-error-recovery: Add AER_FATAL handling
  PCI/AER: Handle ERR_FATAL with removal and re-enumeration of devices
  PCI/AER: Rename error recovery to generic PCI naming
  PCI/AER: Factor out error reporting from AER
  PCI/PORTDRV: Implement generic find service
  PCI/PORTDRV: Implement generic find device
  PCI/DPC: Unify and plumb error handling into DPC
  PCI/DPC: Disable ERR_NONFATAL and enable ERR_FATAL for DPC

 Documentation/PCI/pci-error-recovery.txt |  35 ++-
 drivers/pci/hotplug/pciehp_hpc.c |  20 +-
 drivers/pci/pci.c|  29 +++
 drivers/pci/pci.h|   4 +
 drivers/pci/pcie/Makefile|   2 +-
 drivers/pci/pcie/aer/aerdrv.c|   2 +
 drivers/pci/pcie/aer/aerdrv.h|  30 ---
 drivers/pci/pcie/aer/aerdrv_core.c   | 317 +-

[PATCH v15 4/9] PCI/AER: Rename error recovery to generic PCI naming

2018-05-02 Thread Oza Pawandeep
This patch renames error recovery to generic name with pcie prefix

Signed-off-by: Oza Pawandeep <p...@codeaurora.org>
Reviewed-by: Keith Busch <keith.bu...@intel.com>

diff --git a/drivers/pci/pci.h b/drivers/pci/pci.h
index cec9d8c..22a9589 100644
--- a/drivers/pci/pci.h
+++ b/drivers/pci/pci.h
@@ -353,6 +353,9 @@ static inline resource_size_t pci_resource_alignment(struct 
pci_dev *dev,
 
 void pci_enable_acs(struct pci_dev *dev);
 
+/* PCI error reporting and recovery */
+void pcie_do_recovery(struct pci_dev *dev, int severity);
+
 bool pcie_wait_for_link(struct pci_dev *pdev, bool active);
 #ifdef CONFIG_PCIEASPM
 void pcie_aspm_init_link_state(struct pci_dev *pdev);
diff --git a/drivers/pci/pcie/aer/aerdrv_core.c 
b/drivers/pci/pcie/aer/aerdrv_core.c
index 655d4e8..be4ee3b 100644
--- a/drivers/pci/pcie/aer/aerdrv_core.c
+++ b/drivers/pci/pcie/aer/aerdrv_core.c
@@ -475,7 +475,7 @@ static pci_ers_result_t reset_link(struct pci_dev *dev)
return status;
 }
 
-static pci_ers_result_t do_fatal_recovery(struct pci_dev *dev, int severity)
+static pci_ers_result_t pcie_do_fatal_recovery(struct pci_dev *dev, int 
severity)
 {
struct pci_dev *udev;
struct pci_bus *parent;
@@ -514,7 +514,7 @@ static pci_ers_result_t do_fatal_recovery(struct pci_dev 
*dev, int severity)
 }
 
 /**
- * do_recovery - handle nonfatal/fatal error recovery process
+ * pcie_do_recovery - handle nonfatal/fatal error recovery process
  * @dev: pointer to a pci_dev data structure of agent detecting an error
  * @severity: error severity type
  *
@@ -522,13 +522,13 @@ static pci_ers_result_t do_fatal_recovery(struct pci_dev 
*dev, int severity)
  * error detected message to all downstream drivers within a hierarchy in
  * question and return the returned code.
  */
-static void do_recovery(struct pci_dev *dev, int severity)
+void pcie_do_recovery(struct pci_dev *dev, int severity)
 {
pci_ers_result_t status;
enum pci_channel_state state;
 
if (severity == AER_FATAL) {
-   status = do_fatal_recovery(dev, severity);
+   status = pcie_do_fatal_recovery(dev, severity);
if (status != PCI_ERS_RESULT_RECOVERED)
goto failed;
return;
@@ -600,7 +600,7 @@ static void handle_error_source(struct pcie_device *aerdev,
pci_write_config_dword(dev, pos + PCI_ERR_COR_STATUS,
info->status);
} else
-   do_recovery(dev, info->severity);
+   pcie_do_recovery(dev, info->severity);
 }
 
 #ifdef CONFIG_ACPI_APEI_PCIEAER
@@ -665,7 +665,7 @@ static void aer_recover_work_func(struct work_struct *work)
}
cper_print_aer(pdev, entry.severity, entry.regs);
if (entry.severity != AER_CORRECTABLE)
-   do_recovery(pdev, entry.severity);
+   pcie_do_recovery(pdev, entry.severity);
pci_dev_put(pdev);
}
 }
-- 
2.7.4



[PATCH v15 4/9] PCI/AER: Rename error recovery to generic PCI naming

2018-05-02 Thread Oza Pawandeep
This patch renames the error recovery functions to generic names with a
pcie_ prefix.

Signed-off-by: Oza Pawandeep 
Reviewed-by: Keith Busch 

diff --git a/drivers/pci/pci.h b/drivers/pci/pci.h
index cec9d8c..22a9589 100644
--- a/drivers/pci/pci.h
+++ b/drivers/pci/pci.h
@@ -353,6 +353,9 @@ static inline resource_size_t pci_resource_alignment(struct 
pci_dev *dev,
 
 void pci_enable_acs(struct pci_dev *dev);
 
+/* PCI error reporting and recovery */
+void pcie_do_recovery(struct pci_dev *dev, int severity);
+
 bool pcie_wait_for_link(struct pci_dev *pdev, bool active);
 #ifdef CONFIG_PCIEASPM
 void pcie_aspm_init_link_state(struct pci_dev *pdev);
diff --git a/drivers/pci/pcie/aer/aerdrv_core.c 
b/drivers/pci/pcie/aer/aerdrv_core.c
index 655d4e8..be4ee3b 100644
--- a/drivers/pci/pcie/aer/aerdrv_core.c
+++ b/drivers/pci/pcie/aer/aerdrv_core.c
@@ -475,7 +475,7 @@ static pci_ers_result_t reset_link(struct pci_dev *dev)
return status;
 }
 
-static pci_ers_result_t do_fatal_recovery(struct pci_dev *dev, int severity)
+static pci_ers_result_t pcie_do_fatal_recovery(struct pci_dev *dev, int 
severity)
 {
struct pci_dev *udev;
struct pci_bus *parent;
@@ -514,7 +514,7 @@ static pci_ers_result_t do_fatal_recovery(struct pci_dev 
*dev, int severity)
 }
 
 /**
- * do_recovery - handle nonfatal/fatal error recovery process
+ * pcie_do_recovery - handle nonfatal/fatal error recovery process
  * @dev: pointer to a pci_dev data structure of agent detecting an error
  * @severity: error severity type
  *
@@ -522,13 +522,13 @@ static pci_ers_result_t do_fatal_recovery(struct pci_dev 
*dev, int severity)
  * error detected message to all downstream drivers within a hierarchy in
  * question and return the returned code.
  */
-static void do_recovery(struct pci_dev *dev, int severity)
+void pcie_do_recovery(struct pci_dev *dev, int severity)
 {
pci_ers_result_t status;
enum pci_channel_state state;
 
if (severity == AER_FATAL) {
-   status = do_fatal_recovery(dev, severity);
+   status = pcie_do_fatal_recovery(dev, severity);
if (status != PCI_ERS_RESULT_RECOVERED)
goto failed;
return;
@@ -600,7 +600,7 @@ static void handle_error_source(struct pcie_device *aerdev,
pci_write_config_dword(dev, pos + PCI_ERR_COR_STATUS,
info->status);
} else
-   do_recovery(dev, info->severity);
+   pcie_do_recovery(dev, info->severity);
 }
 
 #ifdef CONFIG_ACPI_APEI_PCIEAER
@@ -665,7 +665,7 @@ static void aer_recover_work_func(struct work_struct *work)
}
cper_print_aer(pdev, entry.severity, entry.regs);
if (entry.severity != AER_CORRECTABLE)
-   do_recovery(pdev, entry.severity);
+   pcie_do_recovery(pdev, entry.severity);
pci_dev_put(pdev);
}
 }
-- 
2.7.4



[PATCH v14 4/9] PCI/PORTDRV: Implement generic find device

2018-04-23 Thread Oza Pawandeep
This patch implements generic pcie_port_find_device() routine.

Signed-off-by: Oza Pawandeep <p...@codeaurora.org>

diff --git a/drivers/pci/pcie/portdrv.h b/drivers/pci/pcie/portdrv.h
index 419bdf3..06f4e11d 100644
--- a/drivers/pci/pcie/portdrv.h
+++ b/drivers/pci/pcie/portdrv.h
@@ -81,4 +81,6 @@ static inline void pcie_port_platform_notify(struct pci_dev 
*port, int *mask){}
 
 struct pcie_port_service_driver *pcie_port_find_service(struct pci_dev *dev,
u32 service);
+struct device *pcie_port_find_device(struct pci_dev *dev,
+u32 service);
 #endif /* _PORTDRV_H_ */
diff --git a/drivers/pci/pcie/portdrv_core.c b/drivers/pci/pcie/portdrv_core.c
index 94de1fa..dd13cc8 100644
--- a/drivers/pci/pcie/portdrv_core.c
+++ b/drivers/pci/pcie/portdrv_core.c
@@ -25,6 +25,7 @@ bool pciehp_msi_disabled;
 
 struct portdrv_service_data {
struct pcie_port_service_driver *drv;
+   struct device *dev;
u32 service;
 };
 
@@ -432,12 +433,14 @@ static int find_service_iter(struct device *device, void 
*data)
service_driver = to_service_driver(device->driver);
if (service_driver->service == service) {
pdrvs->drv = service_driver;
+   pdrvs->dev = device;
return 1;
}
}
 
return 0;
 }
+
 /**
  * pcie_port_find_service - find the service driver
  * @dev: PCI Express port the service devices associated with
@@ -460,6 +463,27 @@ struct pcie_port_service_driver 
*pcie_port_find_service(struct pci_dev *dev,
 }
 
 /**
+ * pcie_port_find_device - find the struct device
+ * @dev: PCI Express port the service devices associated with
+ * @service: For the service to find
+ *
+ * Find PCI Express port service driver associated with given service
+ */
+struct  device *pcie_port_find_device(struct pci_dev *dev,
+ u32 service)
+{
+   struct device *device;
+   struct portdrv_service_data pdrvs;
+
+   pdrvs.dev = NULL;
+   pdrvs.service = service;
+   device_for_each_child(>dev, , find_service_iter);
+
+   device = pdrvs.dev;
+   return device;
+}
+
+/**
  * pcie_port_device_remove - unregister PCI Express port service devices
  * @dev: PCI Express port the service devices to unregister are associated with
  *
-- 
2.7.4



[PATCH v14 4/9] PCI/PORTDRV: Implement generic find device

2018-04-23 Thread Oza Pawandeep
This patch implements generic pcie_port_find_device() routine.

Signed-off-by: Oza Pawandeep 

diff --git a/drivers/pci/pcie/portdrv.h b/drivers/pci/pcie/portdrv.h
index 419bdf3..06f4e11d 100644
--- a/drivers/pci/pcie/portdrv.h
+++ b/drivers/pci/pcie/portdrv.h
@@ -81,4 +81,6 @@ static inline void pcie_port_platform_notify(struct pci_dev 
*port, int *mask){}
 
 struct pcie_port_service_driver *pcie_port_find_service(struct pci_dev *dev,
u32 service);
+struct device *pcie_port_find_device(struct pci_dev *dev,
+u32 service);
 #endif /* _PORTDRV_H_ */
diff --git a/drivers/pci/pcie/portdrv_core.c b/drivers/pci/pcie/portdrv_core.c
index 94de1fa..dd13cc8 100644
--- a/drivers/pci/pcie/portdrv_core.c
+++ b/drivers/pci/pcie/portdrv_core.c
@@ -25,6 +25,7 @@ bool pciehp_msi_disabled;
 
 struct portdrv_service_data {
struct pcie_port_service_driver *drv;
+   struct device *dev;
u32 service;
 };
 
@@ -432,12 +433,14 @@ static int find_service_iter(struct device *device, void 
*data)
service_driver = to_service_driver(device->driver);
if (service_driver->service == service) {
pdrvs->drv = service_driver;
+   pdrvs->dev = device;
return 1;
}
}
 
return 0;
 }
+
 /**
  * pcie_port_find_service - find the service driver
  * @dev: PCI Express port the service devices associated with
@@ -460,6 +463,27 @@ struct pcie_port_service_driver 
*pcie_port_find_service(struct pci_dev *dev,
 }
 
 /**
+ * pcie_port_find_device - find the struct device
+ * @dev: PCI Express port the service devices associated with
+ * @service: For the service to find
+ *
+ * Find PCI Express port service driver associated with given service
+ */
+struct  device *pcie_port_find_device(struct pci_dev *dev,
+ u32 service)
+{
+   struct device *device;
+   struct portdrv_service_data pdrvs;
+
+   pdrvs.dev = NULL;
+   pdrvs.service = service;
+   device_for_each_child(>dev, , find_service_iter);
+
+   device = pdrvs.dev;
+   return device;
+}
+
+/**
  * pcie_port_device_remove - unregister PCI Express port service devices
  * @dev: PCI Express port the service devices to unregister are associated with
  *
-- 
2.7.4



[PATCH v14 2/9] PCI/AER: Factor out error reporting from AER

2018-04-23 Thread Oza Pawandeep
This patch factors out error reporting callbacks, which are currently
tightly coupled with AER.

DPC should be able to register callbacks and attempt recovery when a DPC
trigger event occurs.

Signed-off-by: Oza Pawandeep <p...@codeaurora.org>
Reviewed-by: Keith Busch <keith.bu...@intel.com>

diff --git a/drivers/pci/pcie/Makefile b/drivers/pci/pcie/Makefile
index 223e4c3..f0b1a78 100644
--- a/drivers/pci/pcie/Makefile
+++ b/drivers/pci/pcie/Makefile
@@ -6,7 +6,7 @@
 # Build PCI Express ASPM if needed
 obj-$(CONFIG_PCIEASPM) += aspm.o
 
-pcieportdrv-y  := portdrv_core.o portdrv_pci.o portdrv_bus.o
+pcieportdrv-y  := portdrv_core.o portdrv_pci.o portdrv_bus.o 
err.o
 pcieportdrv-$(CONFIG_ACPI) += portdrv_acpi.o
 
 obj-$(CONFIG_PCIEPORTBUS)  += pcieportdrv.o
diff --git a/drivers/pci/pcie/aer/aerdrv.h b/drivers/pci/pcie/aer/aerdrv.h
index 5449e5c..bc9db53 100644
--- a/drivers/pci/pcie/aer/aerdrv.h
+++ b/drivers/pci/pcie/aer/aerdrv.h
@@ -76,36 +76,6 @@ struct aer_rpc {
 */
 };
 
-struct aer_broadcast_data {
-   enum pci_channel_state state;
-   enum pci_ers_result result;
-};
-
-static inline pci_ers_result_t merge_result(enum pci_ers_result orig,
-   enum pci_ers_result new)
-{
-   if (new == PCI_ERS_RESULT_NO_AER_DRIVER)
-   return PCI_ERS_RESULT_NO_AER_DRIVER;
-
-   if (new == PCI_ERS_RESULT_NONE)
-   return orig;
-
-   switch (orig) {
-   case PCI_ERS_RESULT_CAN_RECOVER:
-   case PCI_ERS_RESULT_RECOVERED:
-   orig = new;
-   break;
-   case PCI_ERS_RESULT_DISCONNECT:
-   if (new == PCI_ERS_RESULT_NEED_RESET)
-   orig = PCI_ERS_RESULT_NEED_RESET;
-   break;
-   default:
-   break;
-   }
-
-   return orig;
-}
-
 extern struct bus_type pcie_port_bus_type;
 void aer_isr(struct work_struct *work);
 void aer_print_error(struct pci_dev *dev, struct aer_err_info *info);
diff --git a/drivers/pci/pcie/aer/aerdrv_core.c 
b/drivers/pci/pcie/aer/aerdrv_core.c
index aeb83a0..4acec3b 100644
--- a/drivers/pci/pcie/aer/aerdrv_core.c
+++ b/drivers/pci/pcie/aer/aerdrv_core.c
@@ -23,6 +23,7 @@
 #include 
 #include 
 #include "aerdrv.h"
+#include "../../pci.h"
 
 #definePCI_EXP_AER_FLAGS   (PCI_EXP_DEVCTL_CERE | 
PCI_EXP_DEVCTL_NFERE | \
 PCI_EXP_DEVCTL_FERE | PCI_EXP_DEVCTL_URRE)
@@ -230,191 +231,6 @@ static bool find_source_device(struct pci_dev *parent,
return true;
 }
 
-static int report_error_detected(struct pci_dev *dev, void *data)
-{
-   pci_ers_result_t vote;
-   const struct pci_error_handlers *err_handler;
-   struct aer_broadcast_data *result_data;
-   result_data = (struct aer_broadcast_data *) data;
-
-   device_lock(>dev);
-   dev->error_state = result_data->state;
-
-   if (!dev->driver ||
-   !dev->driver->err_handler ||
-   !dev->driver->err_handler->error_detected) {
-   if (result_data->state == pci_channel_io_frozen &&
-   dev->hdr_type != PCI_HEADER_TYPE_BRIDGE) {
-   /*
-* In case of fatal recovery, if one of down-
-* stream device has no driver. We might be
-* unable to recover because a later insmod
-* of a driver for this device is unaware of
-* its hw state.
-*/
-   pci_printk(KERN_DEBUG, dev, "device has %s\n",
-  dev->driver ?
-  "no AER-aware driver" : "no driver");
-   }
-
-   /*
-* If there's any device in the subtree that does not
-* have an error_detected callback, returning
-* PCI_ERS_RESULT_NO_AER_DRIVER prevents calling of
-* the subsequent mmio_enabled/slot_reset/resume
-* callbacks of "any" device in the subtree. All the
-* devices in the subtree are left in the error state
-* without recovery.
-*/
-
-   if (dev->hdr_type != PCI_HEADER_TYPE_BRIDGE)
-   vote = PCI_ERS_RESULT_NO_AER_DRIVER;
-   else
-   vote = PCI_ERS_RESULT_NONE;
-   } else {
-   err_handler = dev->driver->err_handler;
-   vote = err_handler->error_detected(dev, result_data->state);
-   pci_uevent_ers(dev, PCI_ERS_RESULT_NONE);
-   }
-
-   result_data->result = merge_result(result_data->result, vote);
-   device_unlock(>dev);
-   return 0;
-}
-
-static int report_mmio_enabled(struct pci_dev

[PATCH v14 2/9] PCI/AER: Factor out error reporting from AER

2018-04-23 Thread Oza Pawandeep
This patch factors out error reporting callbacks, which are currently
tightly coupled with AER.

DPC should be able to register callbacks and attempt recovery when a DPC
trigger event occurs.

Signed-off-by: Oza Pawandeep 
Reviewed-by: Keith Busch 

diff --git a/drivers/pci/pcie/Makefile b/drivers/pci/pcie/Makefile
index 223e4c3..f0b1a78 100644
--- a/drivers/pci/pcie/Makefile
+++ b/drivers/pci/pcie/Makefile
@@ -6,7 +6,7 @@
 # Build PCI Express ASPM if needed
 obj-$(CONFIG_PCIEASPM) += aspm.o
 
-pcieportdrv-y  := portdrv_core.o portdrv_pci.o portdrv_bus.o
+pcieportdrv-y  := portdrv_core.o portdrv_pci.o portdrv_bus.o 
err.o
 pcieportdrv-$(CONFIG_ACPI) += portdrv_acpi.o
 
 obj-$(CONFIG_PCIEPORTBUS)  += pcieportdrv.o
diff --git a/drivers/pci/pcie/aer/aerdrv.h b/drivers/pci/pcie/aer/aerdrv.h
index 5449e5c..bc9db53 100644
--- a/drivers/pci/pcie/aer/aerdrv.h
+++ b/drivers/pci/pcie/aer/aerdrv.h
@@ -76,36 +76,6 @@ struct aer_rpc {
 */
 };
 
-struct aer_broadcast_data {
-   enum pci_channel_state state;
-   enum pci_ers_result result;
-};
-
-static inline pci_ers_result_t merge_result(enum pci_ers_result orig,
-   enum pci_ers_result new)
-{
-   if (new == PCI_ERS_RESULT_NO_AER_DRIVER)
-   return PCI_ERS_RESULT_NO_AER_DRIVER;
-
-   if (new == PCI_ERS_RESULT_NONE)
-   return orig;
-
-   switch (orig) {
-   case PCI_ERS_RESULT_CAN_RECOVER:
-   case PCI_ERS_RESULT_RECOVERED:
-   orig = new;
-   break;
-   case PCI_ERS_RESULT_DISCONNECT:
-   if (new == PCI_ERS_RESULT_NEED_RESET)
-   orig = PCI_ERS_RESULT_NEED_RESET;
-   break;
-   default:
-   break;
-   }
-
-   return orig;
-}
-
 extern struct bus_type pcie_port_bus_type;
 void aer_isr(struct work_struct *work);
 void aer_print_error(struct pci_dev *dev, struct aer_err_info *info);
diff --git a/drivers/pci/pcie/aer/aerdrv_core.c 
b/drivers/pci/pcie/aer/aerdrv_core.c
index aeb83a0..4acec3b 100644
--- a/drivers/pci/pcie/aer/aerdrv_core.c
+++ b/drivers/pci/pcie/aer/aerdrv_core.c
@@ -23,6 +23,7 @@
 #include 
 #include 
 #include "aerdrv.h"
+#include "../../pci.h"
 
 #definePCI_EXP_AER_FLAGS   (PCI_EXP_DEVCTL_CERE | 
PCI_EXP_DEVCTL_NFERE | \
 PCI_EXP_DEVCTL_FERE | PCI_EXP_DEVCTL_URRE)
@@ -230,191 +231,6 @@ static bool find_source_device(struct pci_dev *parent,
return true;
 }
 
-static int report_error_detected(struct pci_dev *dev, void *data)
-{
-   pci_ers_result_t vote;
-   const struct pci_error_handlers *err_handler;
-   struct aer_broadcast_data *result_data;
-   result_data = (struct aer_broadcast_data *) data;
-
-   device_lock(>dev);
-   dev->error_state = result_data->state;
-
-   if (!dev->driver ||
-   !dev->driver->err_handler ||
-   !dev->driver->err_handler->error_detected) {
-   if (result_data->state == pci_channel_io_frozen &&
-   dev->hdr_type != PCI_HEADER_TYPE_BRIDGE) {
-   /*
-* In case of fatal recovery, if one of down-
-* stream device has no driver. We might be
-* unable to recover because a later insmod
-* of a driver for this device is unaware of
-* its hw state.
-*/
-   pci_printk(KERN_DEBUG, dev, "device has %s\n",
-  dev->driver ?
-  "no AER-aware driver" : "no driver");
-   }
-
-   /*
-* If there's any device in the subtree that does not
-* have an error_detected callback, returning
-* PCI_ERS_RESULT_NO_AER_DRIVER prevents calling of
-* the subsequent mmio_enabled/slot_reset/resume
-* callbacks of "any" device in the subtree. All the
-* devices in the subtree are left in the error state
-* without recovery.
-*/
-
-   if (dev->hdr_type != PCI_HEADER_TYPE_BRIDGE)
-   vote = PCI_ERS_RESULT_NO_AER_DRIVER;
-   else
-   vote = PCI_ERS_RESULT_NONE;
-   } else {
-   err_handler = dev->driver->err_handler;
-   vote = err_handler->error_detected(dev, result_data->state);
-   pci_uevent_ers(dev, PCI_ERS_RESULT_NONE);
-   }
-
-   result_data->result = merge_result(result_data->result, vote);
-   device_unlock(>dev);
-   return 0;
-}
-
-static int report_mmio_enabled(struct pci_dev *dev, void *data)
-{
-   pci_ers_result_t vote;
-  

[PATCH v14 6/9] PCI: Unify wait for link active into generic PCI

2018-04-23 Thread Oza Pawandeep
Clients such as hotplug (HP) and DPC use pcie_wait_link_active(), which
waits until the link becomes active or inactive.

Make this a generic function, pcie_wait_for_link(), and move it to
drivers/pci/pci.c.

Signed-off-by: Oza Pawandeep <p...@codeaurora.org>

diff --git a/drivers/pci/hotplug/pciehp_hpc.c b/drivers/pci/hotplug/pciehp_hpc.c
index 18a42f8..e0c2b8e 100644
--- a/drivers/pci/hotplug/pciehp_hpc.c
+++ b/drivers/pci/hotplug/pciehp_hpc.c
@@ -231,25 +231,11 @@ bool pciehp_check_link_active(struct controller *ctrl)
return ret;
 }
 
-static void __pcie_wait_link_active(struct controller *ctrl, bool active)
-{
-   int timeout = 1000;
-
-   if (pciehp_check_link_active(ctrl) == active)
-   return;
-   while (timeout > 0) {
-   msleep(10);
-   timeout -= 10;
-   if (pciehp_check_link_active(ctrl) == active)
-   return;
-   }
-   ctrl_dbg(ctrl, "Data Link Layer Link Active not %s in 1000 msec\n",
-   active ? "set" : "cleared");
-}
-
 static void pcie_wait_link_active(struct controller *ctrl)
 {
-   __pcie_wait_link_active(ctrl, true);
+   struct pci_dev *pdev = ctrl_dev(ctrl);
+
+   pcie_wait_for_link(pdev, true);
 }
 
 static bool pci_bus_check_dev(struct pci_bus *bus, int devfn)
diff --git a/drivers/pci/pci.c b/drivers/pci/pci.c
index f6a4dd1..2bcf977 100644
--- a/drivers/pci/pci.c
+++ b/drivers/pci/pci.c
@@ -4176,6 +4176,36 @@ static int pci_pm_reset(struct pci_dev *dev, int probe)
return 0;
 }
 
+/**
+ * pcie_wait_for_link - Wait for link till it's active/inactive
+ * @pdev: Bridge device
+ * @active: waiting for active or inactive ?
+ *
+ * Use this to wait till link becomes active or inactive.
+ */
+bool pcie_wait_for_link(struct pci_dev *pdev, bool active)
+{
+   int timeout = 1000;
+   bool ret;
+   u16 lnk_status;
+
+   for (;;) {
+   pcie_capability_read_word(pdev, PCI_EXP_LNKSTA, _status);
+   ret = !!(lnk_status & PCI_EXP_LNKSTA_DLLLA);
+   if (ret == active)
+   return true;
+   if (timeout <= 0)
+   break;
+   msleep(10);
+   timeout -= 10;
+   }
+
+   pci_info(pdev, "Data Link Layer Link Active not %s in 1000 msec\n",
+active ? "set" : "cleared");
+
+   return false;
+}
+
 void pci_reset_secondary_bus(struct pci_dev *dev)
 {
u16 ctrl;
diff --git a/drivers/pci/pci.h b/drivers/pci/pci.h
index abc514e..5c44fbc 100644
--- a/drivers/pci/pci.h
+++ b/drivers/pci/pci.h
@@ -345,6 +345,8 @@ void pci_enable_acs(struct pci_dev *dev);
 /* PCI error reporting and recovery */
 void pcie_do_recovery(struct pci_dev *dev, int severity);
 
+bool pcie_wait_for_link(struct pci_dev *pdev, bool active);
+
 #ifdef CONFIG_PCIEASPM
 void pcie_aspm_init_link_state(struct pci_dev *pdev);
 void pcie_aspm_exit_link_state(struct pci_dev *pdev);
diff --git a/drivers/pci/pcie/pcie-dpc.c b/drivers/pci/pcie/pcie-dpc.c
index ad02298..6baed85 100644
--- a/drivers/pci/pcie/pcie-dpc.c
+++ b/drivers/pci/pcie/pcie-dpc.c
@@ -68,19 +68,9 @@ static int dpc_wait_rp_inactive(struct dpc_dev *dpc)
 
 static void dpc_wait_link_inactive(struct dpc_dev *dpc)
 {
-   unsigned long timeout = jiffies + HZ;
struct pci_dev *pdev = dpc->dev->port;
-   struct device *dev = >dev->device;
-   u16 lnk_status;
 
-   pcie_capability_read_word(pdev, PCI_EXP_LNKSTA, _status);
-   while (lnk_status & PCI_EXP_LNKSTA_DLLLA &&
-   !time_after(jiffies, timeout)) {
-   msleep(10);
-   pcie_capability_read_word(pdev, PCI_EXP_LNKSTA, _status);
-   }
-   if (lnk_status & PCI_EXP_LNKSTA_DLLLA)
-   dev_warn(dev, "Link state not disabled for DPC event\n");
+   pcie_wait_for_link(pdev, false);
 }
 
 /**
-- 
2.7.4



[PATCH v14 6/9] PCI: Unify wait for link active into generic PCI

2018-04-23 Thread Oza Pawandeep
Clients such as hotplug (HP) and DPC use pcie_wait_link_active(), which
waits until the link becomes active or inactive.

Make this a generic function, pcie_wait_for_link(), and move it to
drivers/pci/pci.c.

Signed-off-by: Oza Pawandeep 

diff --git a/drivers/pci/hotplug/pciehp_hpc.c b/drivers/pci/hotplug/pciehp_hpc.c
index 18a42f8..e0c2b8e 100644
--- a/drivers/pci/hotplug/pciehp_hpc.c
+++ b/drivers/pci/hotplug/pciehp_hpc.c
@@ -231,25 +231,11 @@ bool pciehp_check_link_active(struct controller *ctrl)
return ret;
 }
 
-static void __pcie_wait_link_active(struct controller *ctrl, bool active)
-{
-   int timeout = 1000;
-
-   if (pciehp_check_link_active(ctrl) == active)
-   return;
-   while (timeout > 0) {
-   msleep(10);
-   timeout -= 10;
-   if (pciehp_check_link_active(ctrl) == active)
-   return;
-   }
-   ctrl_dbg(ctrl, "Data Link Layer Link Active not %s in 1000 msec\n",
-   active ? "set" : "cleared");
-}
-
 static void pcie_wait_link_active(struct controller *ctrl)
 {
-   __pcie_wait_link_active(ctrl, true);
+   struct pci_dev *pdev = ctrl_dev(ctrl);
+
+   pcie_wait_for_link(pdev, true);
 }
 
 static bool pci_bus_check_dev(struct pci_bus *bus, int devfn)
diff --git a/drivers/pci/pci.c b/drivers/pci/pci.c
index f6a4dd1..2bcf977 100644
--- a/drivers/pci/pci.c
+++ b/drivers/pci/pci.c
@@ -4176,6 +4176,36 @@ static int pci_pm_reset(struct pci_dev *dev, int probe)
return 0;
 }
 
+/**
+ * pcie_wait_for_link - Wait for link till it's active/inactive
+ * @pdev: Bridge device
+ * @active: waiting for active or inactive ?
+ *
+ * Use this to wait till link becomes active or inactive.
+ */
+bool pcie_wait_for_link(struct pci_dev *pdev, bool active)
+{
+   int timeout = 1000;
+   bool ret;
+   u16 lnk_status;
+
+   for (;;) {
+   pcie_capability_read_word(pdev, PCI_EXP_LNKSTA, _status);
+   ret = !!(lnk_status & PCI_EXP_LNKSTA_DLLLA);
+   if (ret == active)
+   return true;
+   if (timeout <= 0)
+   break;
+   msleep(10);
+   timeout -= 10;
+   }
+
+   pci_info(pdev, "Data Link Layer Link Active not %s in 1000 msec\n",
+active ? "set" : "cleared");
+
+   return false;
+}
+
 void pci_reset_secondary_bus(struct pci_dev *dev)
 {
u16 ctrl;
diff --git a/drivers/pci/pci.h b/drivers/pci/pci.h
index abc514e..5c44fbc 100644
--- a/drivers/pci/pci.h
+++ b/drivers/pci/pci.h
@@ -345,6 +345,8 @@ void pci_enable_acs(struct pci_dev *dev);
 /* PCI error reporting and recovery */
 void pcie_do_recovery(struct pci_dev *dev, int severity);
 
+bool pcie_wait_for_link(struct pci_dev *pdev, bool active);
+
 #ifdef CONFIG_PCIEASPM
 void pcie_aspm_init_link_state(struct pci_dev *pdev);
 void pcie_aspm_exit_link_state(struct pci_dev *pdev);
diff --git a/drivers/pci/pcie/pcie-dpc.c b/drivers/pci/pcie/pcie-dpc.c
index ad02298..6baed85 100644
--- a/drivers/pci/pcie/pcie-dpc.c
+++ b/drivers/pci/pcie/pcie-dpc.c
@@ -68,19 +68,9 @@ static int dpc_wait_rp_inactive(struct dpc_dev *dpc)
 
 static void dpc_wait_link_inactive(struct dpc_dev *dpc)
 {
-   unsigned long timeout = jiffies + HZ;
struct pci_dev *pdev = dpc->dev->port;
-   struct device *dev = >dev->device;
-   u16 lnk_status;
 
-   pcie_capability_read_word(pdev, PCI_EXP_LNKSTA, _status);
-   while (lnk_status & PCI_EXP_LNKSTA_DLLLA &&
-   !time_after(jiffies, timeout)) {
-   msleep(10);
-   pcie_capability_read_word(pdev, PCI_EXP_LNKSTA, _status);
-   }
-   if (lnk_status & PCI_EXP_LNKSTA_DLLLA)
-   dev_warn(dev, "Link state not disabled for DPC event\n");
+   pcie_wait_for_link(pdev, false);
 }
 
 /**
-- 
2.7.4



[PATCH v14 8/9] PCI/AER/DPC: Align FATAL error handling for AER and DPC

2018-04-23 Thread Oza Pawandeep
If there is DPC support in the switch, then ERR_FATAL and ERR_NONFATAL
should be handled in the same way with respect to DPC.

This patch alters the handling of ERR_FATAL: removal of devices is
initiated, followed by a link reset, followed by re-enumeration. This
applies to both AER and DPC, so that we have unified error handling from
the error agents' (SW) point of view.

Signed-off-by: Oza Pawandeep <p...@codeaurora.org>

diff --git a/drivers/pci/pcie/aer/aerdrv.c b/drivers/pci/pcie/aer/aerdrv.c
index da8331f..b2eaa3f 100644
--- a/drivers/pci/pcie/aer/aerdrv.c
+++ b/drivers/pci/pcie/aer/aerdrv.c
@@ -334,6 +334,8 @@ static pci_ers_result_t aer_root_reset(struct pci_dev *dev)
reg32 |= ROOT_PORT_INTR_ON_MESG_MASK;
pci_write_config_dword(dev, pos + PCI_ERR_ROOT_COMMAND, reg32);
 
+   aer_error_resume(dev);
+
return PCI_ERS_RESULT_RECOVERED;
 }
 
diff --git a/drivers/pci/pcie/err.c b/drivers/pci/pcie/err.c
index d02e029..99d52a0 100644
--- a/drivers/pci/pcie/err.c
+++ b/drivers/pci/pcie/err.c
@@ -273,6 +273,44 @@ static pci_ers_result_t broadcast_error_message(struct 
pci_dev *dev,
return result_data.result;
 }
 
+pci_ers_result_t pcie_do_fatal_recovery(struct pci_dev *dev, int severity)
+{
+   struct pci_dev *udev;
+   struct pci_bus *parent;
+   struct pci_dev *pdev, *temp;
+   pci_ers_result_t result = PCI_ERS_RESULT_RECOVERED;
+
+   if (dev->hdr_type == PCI_HEADER_TYPE_BRIDGE)
+   udev = dev;
+   else
+   udev = dev->bus->self;
+
+   if (severity == AER_FATAL)
+   pci_cleanup_aer_uncorrect_error_status(dev);
+
+   parent = udev->subordinate;
+   pci_lock_rescan_remove();
+   list_for_each_entry_safe_reverse(pdev, temp, >devices,
+bus_list) {
+   pci_dev_get(pdev);
+   pci_dev_set_disconnected(pdev, NULL);
+   if (pci_has_subordinate(pdev))
+   pci_walk_bus(pdev->subordinate,
+pci_dev_set_disconnected, NULL);
+   pci_stop_and_remove_bus_device(pdev);
+   pci_dev_put(pdev);
+   }
+
+   result = reset_link(udev, severity);
+
+   if (pcie_wait_for_link(udev, true))
+   pci_rescan_bus(udev->bus);
+
+   pci_unlock_rescan_remove();
+
+   return result;
+}
+
 /**
  * pcie_do_recovery - handle nonfatal/fatal error recovery process
  * @dev: pointer to a pci_dev data structure of agent detecting an error
@@ -284,12 +322,16 @@ static pci_ers_result_t broadcast_error_message(struct 
pci_dev *dev,
  */
 void pcie_do_recovery(struct pci_dev *dev, int severity)
 {
-   pci_ers_result_t status, result = PCI_ERS_RESULT_RECOVERED;
+   pci_ers_result_t status;
enum pci_channel_state state;
 
if ((severity == AER_FATAL) ||
-   (severity == DPC_FATAL))
-   state = pci_channel_io_frozen;
+   (severity == DPC_FATAL)) {
+   status = pcie_do_fatal_recovery(dev, severity);
+   if (status != PCI_ERS_RESULT_RECOVERED)
+   goto failed;
+   return;
+   }
else
state = pci_channel_io_normal;
 
@@ -298,13 +340,6 @@ void pcie_do_recovery(struct pci_dev *dev, int severity)
"error_detected",
report_error_detected);
 
-   if ((severity == AER_FATAL) ||
-   (severity == DPC_FATAL)) {
-   result = reset_link(dev, severity);
-   if (result != PCI_ERS_RESULT_RECOVERED)
-   goto failed;
-   }
-
if (status == PCI_ERS_RESULT_CAN_RECOVER)
status = broadcast_error_message(dev,
state,
diff --git a/drivers/pci/pcie/pcie-dpc.c b/drivers/pci/pcie/pcie-dpc.c
index cd15862..a3e9b25 100644
--- a/drivers/pci/pcie/pcie-dpc.c
+++ b/drivers/pci/pcie/pcie-dpc.c
@@ -81,8 +81,6 @@ static void dpc_wait_link_inactive(struct dpc_dev *dpc)
  */
 static pci_ers_result_t dpc_reset_link(struct pci_dev *pdev)
 {
-   struct pci_bus *parent = pdev->subordinate;
-   struct pci_dev *dev, *temp;
struct dpc_dev *dpc;
struct pcie_device *pciedev;
struct device *devdpc;
@@ -93,19 +91,6 @@ static pci_ers_result_t dpc_reset_link(struct pci_dev *pdev)
dpc = get_service_data(pciedev);
cap = dpc->cap_pos;
 
-   pci_lock_rescan_remove();
-   list_for_each_entry_safe_reverse(dev, temp, >devices,
-bus_list) {
-   pci_dev_get(dev);
-   pci_dev_set_disconnected(dev, NULL);
-   if (pci_has_subordinate(dev))
-   pci_walk_bus(dev->subordinate,
-pci_dev_set_disconnected, NULL);
-   pci_stop_and_remove_bus_device(dev);
-   pci_dev_put

[PATCH v14 8/9] PCI/AER/DPC: Align FATAL error handling for AER and DPC

2018-04-23 Thread Oza Pawandeep
If there is DPC support in the switch, then ERR_FATAL and ERR_NONFATAL
should be handled in the same way with respect to DPC.

This patch alters the handling of ERR_FATAL: removal of devices is
initiated, followed by a link reset, followed by re-enumeration. This
applies to both AER and DPC, so that we have unified error handling from
the error agents' (SW) point of view.

Signed-off-by: Oza Pawandeep 

diff --git a/drivers/pci/pcie/aer/aerdrv.c b/drivers/pci/pcie/aer/aerdrv.c
index da8331f..b2eaa3f 100644
--- a/drivers/pci/pcie/aer/aerdrv.c
+++ b/drivers/pci/pcie/aer/aerdrv.c
@@ -334,6 +334,8 @@ static pci_ers_result_t aer_root_reset(struct pci_dev *dev)
reg32 |= ROOT_PORT_INTR_ON_MESG_MASK;
pci_write_config_dword(dev, pos + PCI_ERR_ROOT_COMMAND, reg32);
 
+   aer_error_resume(dev);
+
return PCI_ERS_RESULT_RECOVERED;
 }
 
diff --git a/drivers/pci/pcie/err.c b/drivers/pci/pcie/err.c
index d02e029..99d52a0 100644
--- a/drivers/pci/pcie/err.c
+++ b/drivers/pci/pcie/err.c
@@ -273,6 +273,44 @@ static pci_ers_result_t broadcast_error_message(struct 
pci_dev *dev,
return result_data.result;
 }
 
+pci_ers_result_t pcie_do_fatal_recovery(struct pci_dev *dev, int severity)
+{
+   struct pci_dev *udev;
+   struct pci_bus *parent;
+   struct pci_dev *pdev, *temp;
+   pci_ers_result_t result = PCI_ERS_RESULT_RECOVERED;
+
+   if (dev->hdr_type == PCI_HEADER_TYPE_BRIDGE)
+   udev = dev;
+   else
+   udev = dev->bus->self;
+
+   if (severity == AER_FATAL)
+   pci_cleanup_aer_uncorrect_error_status(dev);
+
+   parent = udev->subordinate;
+   pci_lock_rescan_remove();
+   list_for_each_entry_safe_reverse(pdev, temp, >devices,
+bus_list) {
+   pci_dev_get(pdev);
+   pci_dev_set_disconnected(pdev, NULL);
+   if (pci_has_subordinate(pdev))
+   pci_walk_bus(pdev->subordinate,
+pci_dev_set_disconnected, NULL);
+   pci_stop_and_remove_bus_device(pdev);
+   pci_dev_put(pdev);
+   }
+
+   result = reset_link(udev, severity);
+
+   if (pcie_wait_for_link(udev, true))
+   pci_rescan_bus(udev->bus);
+
+   pci_unlock_rescan_remove();
+
+   return result;
+}
+
 /**
  * pcie_do_recovery - handle nonfatal/fatal error recovery process
  * @dev: pointer to a pci_dev data structure of agent detecting an error
@@ -284,12 +322,16 @@ static pci_ers_result_t broadcast_error_message(struct 
pci_dev *dev,
  */
 void pcie_do_recovery(struct pci_dev *dev, int severity)
 {
-   pci_ers_result_t status, result = PCI_ERS_RESULT_RECOVERED;
+   pci_ers_result_t status;
enum pci_channel_state state;
 
if ((severity == AER_FATAL) ||
-   (severity == DPC_FATAL))
-   state = pci_channel_io_frozen;
+   (severity == DPC_FATAL)) {
+   status = pcie_do_fatal_recovery(dev, severity);
+   if (status != PCI_ERS_RESULT_RECOVERED)
+   goto failed;
+   return;
+   }
else
state = pci_channel_io_normal;
 
@@ -298,13 +340,6 @@ void pcie_do_recovery(struct pci_dev *dev, int severity)
"error_detected",
report_error_detected);
 
-   if ((severity == AER_FATAL) ||
-   (severity == DPC_FATAL)) {
-   result = reset_link(dev, severity);
-   if (result != PCI_ERS_RESULT_RECOVERED)
-   goto failed;
-   }
-
if (status == PCI_ERS_RESULT_CAN_RECOVER)
status = broadcast_error_message(dev,
state,
diff --git a/drivers/pci/pcie/pcie-dpc.c b/drivers/pci/pcie/pcie-dpc.c
index cd15862..a3e9b25 100644
--- a/drivers/pci/pcie/pcie-dpc.c
+++ b/drivers/pci/pcie/pcie-dpc.c
@@ -81,8 +81,6 @@ static void dpc_wait_link_inactive(struct dpc_dev *dpc)
  */
 static pci_ers_result_t dpc_reset_link(struct pci_dev *pdev)
 {
-   struct pci_bus *parent = pdev->subordinate;
-   struct pci_dev *dev, *temp;
struct dpc_dev *dpc;
struct pcie_device *pciedev;
struct device *devdpc;
@@ -93,19 +91,6 @@ static pci_ers_result_t dpc_reset_link(struct pci_dev *pdev)
dpc = get_service_data(pciedev);
cap = dpc->cap_pos;
 
-   pci_lock_rescan_remove();
-   list_for_each_entry_safe_reverse(dev, temp, >devices,
-bus_list) {
-   pci_dev_get(dev);
-   pci_dev_set_disconnected(dev, NULL);
-   if (pci_has_subordinate(dev))
-   pci_walk_bus(dev->subordinate,
-pci_dev_set_disconnected, NULL);
-   pci_stop_and_remove_bus_device(dev);
-   pci_dev_put(dev);

[PATCH v14 7/9] PCI/DPC: Disable ERR_NONFATAL and enable ERR_FATAL for DPC

2018-04-23 Thread Oza Pawandeep
This patch disables the ERR_NONFATAL trigger for DPC and enables the
ERR_FATAL trigger instead, so DPC now handles only ERR_FATAL.

Signed-off-by: Oza Pawandeep <p...@codeaurora.org>

diff --git a/drivers/pci/pcie/pcie-dpc.c b/drivers/pci/pcie/pcie-dpc.c
index 6baed85..cd15862 100644
--- a/drivers/pci/pcie/pcie-dpc.c
+++ b/drivers/pci/pcie/pcie-dpc.c
@@ -283,7 +283,8 @@ static int dpc_probe(struct pcie_device *dev)
}
}
 
-   ctl = (ctl & 0xfff4) | PCI_EXP_DPC_CTL_EN_NONFATAL | 
PCI_EXP_DPC_CTL_INT_EN;
+   ctl = (ctl & 0xfff4) | PCI_EXP_DPC_CTL_EN_FATAL |
+ PCI_EXP_DPC_CTL_INT_EN;
pci_write_config_word(pdev, dpc->cap_pos + PCI_EXP_DPC_CTL, ctl);
 
dev_info(device, "DPC error containment capabilities: Int Msg #%d, 
RPExt%c PoisonedTLP%c SwTrigger%c RP PIO Log %d, DL_ActiveErr%c\n",
@@ -301,7 +302,7 @@ static void dpc_remove(struct pcie_device *dev)
u16 ctl;
 
pci_read_config_word(pdev, dpc->cap_pos + PCI_EXP_DPC_CTL, );
-   ctl &= ~(PCI_EXP_DPC_CTL_EN_NONFATAL | PCI_EXP_DPC_CTL_INT_EN);
+   ctl &= ~(PCI_EXP_DPC_CTL_EN_FATAL | PCI_EXP_DPC_CTL_INT_EN);
pci_write_config_word(pdev, dpc->cap_pos + PCI_EXP_DPC_CTL, ctl);
 }
 
diff --git a/include/uapi/linux/pci_regs.h b/include/uapi/linux/pci_regs.h
index 0c79eac..dcc3957 100644
--- a/include/uapi/linux/pci_regs.h
+++ b/include/uapi/linux/pci_regs.h
@@ -978,7 +978,8 @@
 #define  PCI_EXP_DPC_CAP_DL_ACTIVE 0x1000  /* ERR_COR signal on DL_Active 
supported */
 
 #define PCI_EXP_DPC_CTL6   /* DPC control */
-#define  PCI_EXP_DPC_CTL_EN_NONFATAL   0x0002  /* Enable trigger on 
ERR_NONFATAL message */
+#define  PCI_EXP_DPC_CTL_EN_FATAL  0x0001  /* Enable trigger on ERR_FATAL 
message */
+#define  PCI_EXP_DPC_CTL_EN_NONFATAL   0x0002  /* Enable trigger on 
ERR_NONFATAL message */
 #define  PCI_EXP_DPC_CTL_INT_EN0x0008  /* DPC Interrupt Enable */
 
 #define PCI_EXP_DPC_STATUS 8   /* DPC Status */
-- 
2.7.4



[PATCH v14 7/9] PCI/DPC: Disable ERR_NONFATAL and enable ERR_FATAL for DPC

2018-04-23 Thread Oza Pawandeep
This patch disables the ERR_NONFATAL trigger for DPC and enables the
ERR_FATAL trigger instead, so DPC now handles only ERR_FATAL.

Signed-off-by: Oza Pawandeep 

diff --git a/drivers/pci/pcie/pcie-dpc.c b/drivers/pci/pcie/pcie-dpc.c
index 6baed85..cd15862 100644
--- a/drivers/pci/pcie/pcie-dpc.c
+++ b/drivers/pci/pcie/pcie-dpc.c
@@ -283,7 +283,8 @@ static int dpc_probe(struct pcie_device *dev)
}
}
 
-   ctl = (ctl & 0xfff4) | PCI_EXP_DPC_CTL_EN_NONFATAL | 
PCI_EXP_DPC_CTL_INT_EN;
+   ctl = (ctl & 0xfff4) | PCI_EXP_DPC_CTL_EN_FATAL |
+ PCI_EXP_DPC_CTL_INT_EN;
pci_write_config_word(pdev, dpc->cap_pos + PCI_EXP_DPC_CTL, ctl);
 
dev_info(device, "DPC error containment capabilities: Int Msg #%d, 
RPExt%c PoisonedTLP%c SwTrigger%c RP PIO Log %d, DL_ActiveErr%c\n",
@@ -301,7 +302,7 @@ static void dpc_remove(struct pcie_device *dev)
u16 ctl;
 
pci_read_config_word(pdev, dpc->cap_pos + PCI_EXP_DPC_CTL, );
-   ctl &= ~(PCI_EXP_DPC_CTL_EN_NONFATAL | PCI_EXP_DPC_CTL_INT_EN);
+   ctl &= ~(PCI_EXP_DPC_CTL_EN_FATAL | PCI_EXP_DPC_CTL_INT_EN);
pci_write_config_word(pdev, dpc->cap_pos + PCI_EXP_DPC_CTL, ctl);
 }
 
diff --git a/include/uapi/linux/pci_regs.h b/include/uapi/linux/pci_regs.h
index 0c79eac..dcc3957 100644
--- a/include/uapi/linux/pci_regs.h
+++ b/include/uapi/linux/pci_regs.h
@@ -978,7 +978,8 @@
 #define  PCI_EXP_DPC_CAP_DL_ACTIVE 0x1000  /* ERR_COR signal on DL_Active 
supported */
 
 #define PCI_EXP_DPC_CTL6   /* DPC control */
-#define  PCI_EXP_DPC_CTL_EN_NONFATAL   0x0002  /* Enable trigger on 
ERR_NONFATAL message */
+#define  PCI_EXP_DPC_CTL_EN_FATAL  0x0001  /* Enable trigger on ERR_FATAL 
message */
+#define  PCI_EXP_DPC_CTL_EN_NONFATAL   0x0002  /* Enable trigger on 
ERR_NONFATAL message */
 #define  PCI_EXP_DPC_CTL_INT_EN0x0008  /* DPC Interrupt Enable */
 
 #define PCI_EXP_DPC_STATUS 8   /* DPC Status */
-- 
2.7.4



[PATCH v14 9/9] pci-error-recovery: Add AER_FATAL handling

2018-04-23 Thread Oza Pawandeep
Add a description of AER_FATAL error handling to pci-error-recovery.txt.

Signed-off-by: Oza Pawandeep <p...@codeaurora.org>

diff --git a/Documentation/PCI/pci-error-recovery.txt 
b/Documentation/PCI/pci-error-recovery.txt
index 0b6bb3e..688b691 100644
--- a/Documentation/PCI/pci-error-recovery.txt
+++ b/Documentation/PCI/pci-error-recovery.txt
@@ -110,7 +110,7 @@ The actual steps taken by a platform to recover from a PCI 
error
 event will be platform-dependent, but will follow the general
 sequence described below.
 
-STEP 0: Error Event
+STEP 0: Error Event: ERR_NONFATAL
 ---
 A PCI bus error is detected by the PCI hardware.  On powerpc, the slot
 is isolated, in that all I/O is blocked: all reads return 0xffffffff,
@@ -228,13 +228,7 @@ proceeds to either STEP3 (Link Reset) or to STEP 5 (Resume 
Operations).
 If any driver returned PCI_ERS_RESULT_NEED_RESET, then the platform
 proceeds to STEP 4 (Slot Reset)
 
-STEP 3: Link Reset
---
-The platform resets the link.  This is a PCI-Express specific step
-and is done whenever a fatal error has been detected that can be
-"solved" by resetting the link.
-
-STEP 4: Slot Reset
+STEP 3: Slot Reset
 --
 
 In response to a return value of PCI_ERS_RESULT_NEED_RESET, the
@@ -320,7 +314,7 @@ Failure).
 >>> However, it probably should.
 
 
-STEP 5: Resume Operations
+STEP 4: Resume Operations
 -
 The platform will call the resume() callback on all affected device
 drivers if all drivers on the segment have returned
@@ -332,7 +326,7 @@ a result code.
 At this point, if a new error happens, the platform will restart
 a new error recovery sequence.
 
-STEP 6: Permanent Failure
+STEP 5: Permanent Failure
 -
 A "permanent failure" has occurred, and the platform cannot recover
 the device.  The platform will call error_detected() with a
@@ -355,6 +349,27 @@ errors. See the discussion in 
powerpc/eeh-pci-error-recovery.txt
 for additional detail on real-life experience of the causes of
 software errors.
 
+STEP 0: Error Event: ERR_FATAL
+---
+PCI bus error is detected by the PCI hardware. On powerpc, the slot is
+isolated, in that all I/O is blocked: all reads return 0xffffffff, all
+writes are ignored.
+
+STEP 1: Remove devices
+
+The platform removes the devices affected by the error. Depending on the
+error agent, these are all subordinates of the reporting port itself or
+of its upstream component (likely a downstream port).
+
+STEP 2: Reset link
+
+The platform resets the link.  This is a PCI-Express specific step and is
+done whenever a fatal error has been detected that can be "solved" by
+resetting the link.
+
+STEP 3: Re-enumerate the devices
+
+Initiates the re-enumeration.
 
 Conclusion; General Remarks
 ---
-- 
2.7.4



[PATCH v14 9/9] pci-error-recovery: Add AER_FATAL handling

2018-04-23 Thread Oza Pawandeep
Add a description of AER_FATAL error handling to pci-error-recovery.txt.

Signed-off-by: Oza Pawandeep 

diff --git a/Documentation/PCI/pci-error-recovery.txt 
b/Documentation/PCI/pci-error-recovery.txt
index 0b6bb3e..688b691 100644
--- a/Documentation/PCI/pci-error-recovery.txt
+++ b/Documentation/PCI/pci-error-recovery.txt
@@ -110,7 +110,7 @@ The actual steps taken by a platform to recover from a PCI 
error
 event will be platform-dependent, but will follow the general
 sequence described below.
 
-STEP 0: Error Event
+STEP 0: Error Event: ERR_NONFATAL
 ---
 A PCI bus error is detected by the PCI hardware.  On powerpc, the slot
 is isolated, in that all I/O is blocked: all reads return 0xffffffff,
@@ -228,13 +228,7 @@ proceeds to either STEP3 (Link Reset) or to STEP 5 (Resume 
Operations).
 If any driver returned PCI_ERS_RESULT_NEED_RESET, then the platform
 proceeds to STEP 4 (Slot Reset)
 
-STEP 3: Link Reset
---
-The platform resets the link.  This is a PCI-Express specific step
-and is done whenever a fatal error has been detected that can be
-"solved" by resetting the link.
-
-STEP 4: Slot Reset
+STEP 3: Slot Reset
 --
 
 In response to a return value of PCI_ERS_RESULT_NEED_RESET, the
@@ -320,7 +314,7 @@ Failure).
 >>> However, it probably should.
 
 
-STEP 5: Resume Operations
+STEP 4: Resume Operations
 -
 The platform will call the resume() callback on all affected device
 drivers if all drivers on the segment have returned
@@ -332,7 +326,7 @@ a result code.
 At this point, if a new error happens, the platform will restart
 a new error recovery sequence.
 
-STEP 6: Permanent Failure
+STEP 5: Permanent Failure
 -
 A "permanent failure" has occurred, and the platform cannot recover
 the device.  The platform will call error_detected() with a
@@ -355,6 +349,27 @@ errors. See the discussion in 
powerpc/eeh-pci-error-recovery.txt
 for additional detail on real-life experience of the causes of
 software errors.
 
+STEP 0: Error Event: ERR_FATAL
+---
+PCI bus error is detected by the PCI hardware. On powerpc, the slot is
+isolated, in that all I/O is blocked: all reads return 0xffffffff, all
+writes are ignored.
+
+STEP 1: Remove devices
+
+The platform removes the devices affected by the error. Depending on the
+error agent, these are all subordinates of the reporting port itself or
+of its upstream component (likely a downstream port).
+
+STEP 2: Reset link
+
+The platform resets the link.  This is a PCI-Express specific step and is
+done whenever a fatal error has been detected that can be "solved" by
+resetting the link.
+
+STEP 3: Re-enumerate the devices
+
+Initiates the re-enumeration.
 
 Conclusion; General Remarks
 ---
-- 
2.7.4



  1   2   3   4   5   >