On Tue, 2012-09-18 at 11:34 +0800, Gavin Shan wrote:
> The EEH core is talking with the PCI device driver to determine the
> action (purely reset, or PCI device removal). During the period, the
> driver might be unloaded and in turn causes kernel crash as follows:

Stable folks: I'm about to push out a powerpc-next with equivalent of
those two patches in it. The patches are fairly different between 3.6
and 3.5- however since Gavin rewrote a lot of the EEH infrastructure in
3.6, hence these separate backport patches.

Cheers,
Ben.

> EEH: Detected PCI bus error on PHB#4-PE#10000
> EEH: This PCI device has failed 3 times in the last hour
> lpfc 0004:01:00.0: 0:2710 PCI channel disable preparing for reset
> Unable to handle kernel paging request for data at address 0x00000490
> Faulting instruction address: 0xd00000000e682c90
> cpu 0x1: Vector: 300 (Data Access) at [c000000fc75ffa20]
>     pc: d00000000e682c90: .lpfc_io_error_detected+0x30/0x240 [lpfc]
>     lr: d00000000e682c8c: .lpfc_io_error_detected+0x2c/0x240 [lpfc]
>     sp: c000000fc75ffca0
>    msr: 8000000000009032
>    dar: 490
>  dsisr: 40000000
>   current = 0xc000000fc79b88b0
>   paca    = 0xc00000000edb0380   softe: 0        irq_happened: 0x00
>     pid   = 3386, comm = eehd
> enter ? for help
> [c000000fc75ffca0] c000000fc75ffd30 (unreliable)
> [c000000fc75ffd30] c00000000004fd3c .eeh_report_error+0x7c/0xf0
> [c000000fc75ffdc0] c00000000004ee00 .eeh_pe_dev_traverse+0xa0/0x180
> [c000000fc75ffe70] c00000000004ffd8 .eeh_handle_event+0x68/0x300
> [c000000fc75fff00] c0000000000503a0 .eeh_event_handler+0x130/0x1a0
> [c000000fc75fff90] c000000000020138 .kernel_thread+0x54/0x70
> 1:mon>
> 
> The patch increases the reference of the corresponding driver modules
> while EEH core does the negotiation with PCI device driver so that the
> corresponding driver modules can't be unloaded during the period and
> we're safe to refer the callbacks.
> 
> Cc: [email protected]
> Reported-by: Alexey Kardashevskiy <[email protected]>
> Signed-off-by: Gavin Shan <[email protected]>
> ---
>  arch/powerpc/platforms/pseries/eeh_driver.c |   93 
> +++++++++++++++++++++------
>  1 files changed, 73 insertions(+), 20 deletions(-)
> 
> diff --git a/arch/powerpc/platforms/pseries/eeh_driver.c 
> b/arch/powerpc/platforms/pseries/eeh_driver.c
> index baf92cd..5bde648 100644
> --- a/arch/powerpc/platforms/pseries/eeh_driver.c
> +++ b/arch/powerpc/platforms/pseries/eeh_driver.c
> @@ -25,6 +25,7 @@
>  #include <linux/delay.h>
>  #include <linux/interrupt.h>
>  #include <linux/irq.h>
> +#include <linux/module.h>
>  #include <linux/pci.h>
>  #include <asm/eeh.h>
>  #include <asm/eeh_event.h>
> @@ -47,6 +48,41 @@ static inline const char *eeh_pcid_name(struct pci_dev 
> *pdev)
>       return "";
>  }
>  
> +/**
> + * eeh_pcid_get - Get the PCI device driver
> + * @pdev: PCI device
> + *
> + * The function is used to retrieve the PCI device driver for
> + * the indicated PCI device. Besides, we will increase the reference
> + * of the PCI device driver to prevent that being unloaded on
> + * the fly. Otherwise, kernel crash would be seen.
> + */
> +static inline struct pci_driver *eeh_pcid_get(struct pci_dev *pdev)
> +{
> +     if (!pdev || !pdev->driver)
> +             return NULL;
> +
> +     if (!try_module_get(pdev->driver->driver.owner))
> +             return NULL;
> +
> +     return pdev->driver;
> +}
> +
> +/**
> + * eeh_pcid_put - Dereference on the PCI device driver
> + * @pdev: PCI device
> + *
> + * The function is called to do dereference on the PCI device
> + * driver of the indicated PCI device.
> + */
> +static inline void eeh_pcid_put(struct pci_dev *pdev)
> +{
> +     if (!pdev || !pdev->driver)
> +             return;
> +
> +     module_put(pdev->driver->driver.owner);
> +}
> +
>  #if 0
>  static void print_device_node_tree(struct pci_dn *pdn, int dent)
>  {
> @@ -126,18 +162,20 @@ static void eeh_enable_irq(struct pci_dev *dev)
>  static int eeh_report_error(struct pci_dev *dev, void *userdata)
>  {
>       enum pci_ers_result rc, *res = userdata;
> -     struct pci_driver *driver = dev->driver;
> +     struct pci_driver *driver;
>  
>       dev->error_state = pci_channel_io_frozen;
>  
> -     if (!driver)
> -             return 0;
> +     driver = eeh_pcid_get(dev);
> +     if (!driver) return 0;
>  
>       eeh_disable_irq(dev);
>  
>       if (!driver->err_handler ||
> -         !driver->err_handler->error_detected)
> +         !driver->err_handler->error_detected) {
> +             eeh_pcid_put(dev);
>               return 0;
> +     }
>  
>       rc = driver->err_handler->error_detected(dev, pci_channel_io_frozen);
>  
> @@ -145,6 +183,7 @@ static int eeh_report_error(struct pci_dev *dev, void 
> *userdata)
>       if (rc == PCI_ERS_RESULT_NEED_RESET) *res = rc;
>       if (*res == PCI_ERS_RESULT_NONE) *res = rc;
>  
> +     eeh_pcid_put(dev);
>       return 0;
>  }
>  
> @@ -160,12 +199,16 @@ static int eeh_report_error(struct pci_dev *dev, void 
> *userdata)
>  static int eeh_report_mmio_enabled(struct pci_dev *dev, void *userdata)
>  {
>       enum pci_ers_result rc, *res = userdata;
> -     struct pci_driver *driver = dev->driver;
> +     struct pci_driver *driver;
>  
> -     if (!driver ||
> -         !driver->err_handler ||
> -         !driver->err_handler->mmio_enabled)
> +     driver = eeh_pcid_get(dev);
> +     if (!driver) return 0;
> +
> +     if (!driver->err_handler ||
> +         !driver->err_handler->mmio_enabled) {
> +             eeh_pcid_put(dev);
>               return 0;
> +     }
>  
>       rc = driver->err_handler->mmio_enabled(dev);
>  
> @@ -173,6 +216,7 @@ static int eeh_report_mmio_enabled(struct pci_dev *dev, 
> void *userdata)
>       if (rc == PCI_ERS_RESULT_NEED_RESET) *res = rc;
>       if (*res == PCI_ERS_RESULT_NONE) *res = rc;
>  
> +     eeh_pcid_put(dev);
>       return 0;
>  }
>  
> @@ -189,18 +233,20 @@ static int eeh_report_mmio_enabled(struct pci_dev *dev, 
> void *userdata)
>  static int eeh_report_reset(struct pci_dev *dev, void *userdata)
>  {
>       enum pci_ers_result rc, *res = userdata;
> -     struct pci_driver *driver = dev->driver;
> +     struct pci_driver *driver;
>  
> -     if (!driver)
> -             return 0;
> +     driver = eeh_pcid_get(dev);
> +     if (!driver) return 0;
>  
>       dev->error_state = pci_channel_io_normal;
>  
>       eeh_enable_irq(dev);
>  
>       if (!driver->err_handler ||
> -         !driver->err_handler->slot_reset)
> +         !driver->err_handler->slot_reset) {
> +             eeh_pcid_put(dev);
>               return 0;
> +     }
>  
>       rc = driver->err_handler->slot_reset(dev);
>       if ((*res == PCI_ERS_RESULT_NONE) ||
> @@ -208,6 +254,7 @@ static int eeh_report_reset(struct pci_dev *dev, void 
> *userdata)
>       if (*res == PCI_ERS_RESULT_DISCONNECT &&
>            rc == PCI_ERS_RESULT_NEED_RESET) *res = rc;
>  
> +     eeh_pcid_put(dev);
>       return 0;
>  }
>  
> @@ -222,21 +269,24 @@ static int eeh_report_reset(struct pci_dev *dev, void 
> *userdata)
>   */
>  static int eeh_report_resume(struct pci_dev *dev, void *userdata)
>  {
> -     struct pci_driver *driver = dev->driver;
> +     struct pci_driver *driver;
>  
>       dev->error_state = pci_channel_io_normal;
>  
> -     if (!driver)
> -             return 0;
> +     driver = eeh_pcid_get(dev);
> +     if (!driver) return 0;
>  
>       eeh_enable_irq(dev);
>  
>       if (!driver->err_handler ||
> -         !driver->err_handler->resume)
> +         !driver->err_handler->resume) {
> +             eeh_pcid_put(dev);
>               return 0;
> +     }
>  
>       driver->err_handler->resume(dev);
>  
> +     eeh_pcid_put(dev);
>       return 0;
>  }
>  
> @@ -250,21 +300,24 @@ static int eeh_report_resume(struct pci_dev *dev, void 
> *userdata)
>   */
>  static int eeh_report_failure(struct pci_dev *dev, void *userdata)
>  {
> -     struct pci_driver *driver = dev->driver;
> +     struct pci_driver *driver;
>  
>       dev->error_state = pci_channel_io_perm_failure;
>  
> -     if (!driver)
> -             return 0;
> +     driver = eeh_pcid_get(dev);
> +     if (!driver) return 0;
>  
>       eeh_disable_irq(dev);
>  
>       if (!driver->err_handler ||
> -         !driver->err_handler->error_detected)
> +         !driver->err_handler->error_detected) {
> +             eeh_pcid_put(dev);
>               return 0;
> +     }
>  
>       driver->err_handler->error_detected(dev, pci_channel_io_perm_failure);
>  
> +     eeh_pcid_put(dev);
>       return 0;
>  }
>  


--
To unsubscribe from this list: send the line "unsubscribe stable" in
the body of a message to [email protected]
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Reply via email to