On Tue, 2012-09-18 at 11:34 +0800, Gavin Shan wrote: > The EEH core is talking with the PCI device driver to determine the > action (purely reset, or PCI device removal). During the period, the > driver might be unloaded and in turn causes kernel crash as follows:
Stable folks: I'm about to push out a powerpc-next with equivalent of those two patches in it. The patches are fairly different between 3.6 and 3.5- however since Gavin rewrote a lot of the EEH infrastructure in 3.6, hence these separate backport patches. Cheers, Ben. > EEH: Detected PCI bus error on PHB#4-PE#10000 > EEH: This PCI device has failed 3 times in the last hour > lpfc 0004:01:00.0: 0:2710 PCI channel disable preparing for reset > Unable to handle kernel paging request for data at address 0x00000490 > Faulting instruction address: 0xd00000000e682c90 > cpu 0x1: Vector: 300 (Data Access) at [c000000fc75ffa20] > pc: d00000000e682c90: .lpfc_io_error_detected+0x30/0x240 [lpfc] > lr: d00000000e682c8c: .lpfc_io_error_detected+0x2c/0x240 [lpfc] > sp: c000000fc75ffca0 > msr: 8000000000009032 > dar: 490 > dsisr: 40000000 > current = 0xc000000fc79b88b0 > paca = 0xc00000000edb0380 softe: 0 irq_happened: 0x00 > pid = 3386, comm = eehd > enter ? for help > [c000000fc75ffca0] c000000fc75ffd30 (unreliable) > [c000000fc75ffd30] c00000000004fd3c .eeh_report_error+0x7c/0xf0 > [c000000fc75ffdc0] c00000000004ee00 .eeh_pe_dev_traverse+0xa0/0x180 > [c000000fc75ffe70] c00000000004ffd8 .eeh_handle_event+0x68/0x300 > [c000000fc75fff00] c0000000000503a0 .eeh_event_handler+0x130/0x1a0 > [c000000fc75fff90] c000000000020138 .kernel_thread+0x54/0x70 > 1:mon> > > The patch increases the reference of the corresponding driver modules > while EEH core does the negotiation with PCI device driver so that the > corresponding driver modules can't be unloaded during the period and > we're safe to refer the callbacks. > > Cc: [email protected] > Reported-by: Alexey Kardashevskiy <[email protected]> > Signed-off-by: Gavin Shan <[email protected]> > --- > arch/powerpc/platforms/pseries/eeh_driver.c | 93 > +++++++++++++++++++++------ > 1 files changed, 73 insertions(+), 20 deletions(-) > > diff --git a/arch/powerpc/platforms/pseries/eeh_driver.c > b/arch/powerpc/platforms/pseries/eeh_driver.c > index baf92cd..5bde648 100644 > --- a/arch/powerpc/platforms/pseries/eeh_driver.c > +++ b/arch/powerpc/platforms/pseries/eeh_driver.c > @@ -25,6 +25,7 @@ > #include <linux/delay.h> > #include <linux/interrupt.h> > #include <linux/irq.h> > +#include <linux/module.h> > #include <linux/pci.h> > #include <asm/eeh.h> > #include <asm/eeh_event.h> > @@ -47,6 +48,41 @@ static inline const char *eeh_pcid_name(struct pci_dev > *pdev) > return ""; > } > > +/** > + * eeh_pcid_get - Get the PCI device driver > + * @pdev: PCI device > + * > + * The function is used to retrieve the PCI device driver for > + * the indicated PCI device. Besides, we will increase the reference > + * of the PCI device driver to prevent that being unloaded on > + * the fly. Otherwise, kernel crash would be seen. > + */ > +static inline struct pci_driver *eeh_pcid_get(struct pci_dev *pdev) > +{ > + if (!pdev || !pdev->driver) > + return NULL; > + > + if (!try_module_get(pdev->driver->driver.owner)) > + return NULL; > + > + return pdev->driver; > +} > + > +/** > + * eeh_pcid_put - Dereference on the PCI device driver > + * @pdev: PCI device > + * > + * The function is called to do dereference on the PCI device > + * driver of the indicated PCI device. > + */ > +static inline void eeh_pcid_put(struct pci_dev *pdev) > +{ > + if (!pdev || !pdev->driver) > + return; > + > + module_put(pdev->driver->driver.owner); > +} > + > #if 0 > static void print_device_node_tree(struct pci_dn *pdn, int dent) > { > @@ -126,18 +162,20 @@ static void eeh_enable_irq(struct pci_dev *dev) > static int eeh_report_error(struct pci_dev *dev, void *userdata) > { > enum pci_ers_result rc, *res = userdata; > - struct pci_driver *driver = dev->driver; > + struct pci_driver *driver; > > dev->error_state = pci_channel_io_frozen; > > - if (!driver) > - return 0; > + driver = eeh_pcid_get(dev); > + if (!driver) return 0; > > eeh_disable_irq(dev); > > if (!driver->err_handler || > - !driver->err_handler->error_detected) > + !driver->err_handler->error_detected) { > + eeh_pcid_put(dev); > return 0; > + } > > rc = driver->err_handler->error_detected(dev, pci_channel_io_frozen); > > @@ -145,6 +183,7 @@ static int eeh_report_error(struct pci_dev *dev, void > *userdata) > if (rc == PCI_ERS_RESULT_NEED_RESET) *res = rc; > if (*res == PCI_ERS_RESULT_NONE) *res = rc; > > + eeh_pcid_put(dev); > return 0; > } > > @@ -160,12 +199,16 @@ static int eeh_report_error(struct pci_dev *dev, void > *userdata) > static int eeh_report_mmio_enabled(struct pci_dev *dev, void *userdata) > { > enum pci_ers_result rc, *res = userdata; > - struct pci_driver *driver = dev->driver; > + struct pci_driver *driver; > > - if (!driver || > - !driver->err_handler || > - !driver->err_handler->mmio_enabled) > + driver = eeh_pcid_get(dev); > + if (!driver) return 0; > + > + if (!driver->err_handler || > + !driver->err_handler->mmio_enabled) { > + eeh_pcid_put(dev); > return 0; > + } > > rc = driver->err_handler->mmio_enabled(dev); > > @@ -173,6 +216,7 @@ static int eeh_report_mmio_enabled(struct pci_dev *dev, > void *userdata) > if (rc == PCI_ERS_RESULT_NEED_RESET) *res = rc; > if (*res == PCI_ERS_RESULT_NONE) *res = rc; > > + eeh_pcid_put(dev); > return 0; > } > > @@ -189,18 +233,20 @@ static int eeh_report_mmio_enabled(struct pci_dev *dev, > void *userdata) > static int eeh_report_reset(struct pci_dev *dev, void *userdata) > { > enum pci_ers_result rc, *res = userdata; > - struct pci_driver *driver = dev->driver; > + struct pci_driver *driver; > > - if (!driver) > - return 0; > + driver = eeh_pcid_get(dev); > + if (!driver) return 0; > > dev->error_state = pci_channel_io_normal; > > eeh_enable_irq(dev); > > if (!driver->err_handler || > - !driver->err_handler->slot_reset) > + !driver->err_handler->slot_reset) { > + eeh_pcid_put(dev); > return 0; > + } > > rc = driver->err_handler->slot_reset(dev); > if ((*res == PCI_ERS_RESULT_NONE) || > @@ -208,6 +254,7 @@ static int eeh_report_reset(struct pci_dev *dev, void > *userdata) > if (*res == PCI_ERS_RESULT_DISCONNECT && > rc == PCI_ERS_RESULT_NEED_RESET) *res = rc; > > + eeh_pcid_put(dev); > return 0; > } > > @@ -222,21 +269,24 @@ static int eeh_report_reset(struct pci_dev *dev, void > *userdata) > */ > static int eeh_report_resume(struct pci_dev *dev, void *userdata) > { > - struct pci_driver *driver = dev->driver; > + struct pci_driver *driver; > > dev->error_state = pci_channel_io_normal; > > - if (!driver) > - return 0; > + driver = eeh_pcid_get(dev); > + if (!driver) return 0; > > eeh_enable_irq(dev); > > if (!driver->err_handler || > - !driver->err_handler->resume) > + !driver->err_handler->resume) { > + eeh_pcid_put(dev); > return 0; > + } > > driver->err_handler->resume(dev); > > + eeh_pcid_put(dev); > return 0; > } > > @@ -250,21 +300,24 @@ static int eeh_report_resume(struct pci_dev *dev, void > *userdata) > */ > static int eeh_report_failure(struct pci_dev *dev, void *userdata) > { > - struct pci_driver *driver = dev->driver; > + struct pci_driver *driver; > > dev->error_state = pci_channel_io_perm_failure; > > - if (!driver) > - return 0; > + driver = eeh_pcid_get(dev); > + if (!driver) return 0; > > eeh_disable_irq(dev); > > if (!driver->err_handler || > - !driver->err_handler->error_detected) > + !driver->err_handler->error_detected) { > + eeh_pcid_put(dev); > return 0; > + } > > driver->err_handler->error_detected(dev, pci_channel_io_perm_failure); > > + eeh_pcid_put(dev); > return 0; > } > -- To unsubscribe from this list: send the line "unsubscribe stable" in the body of a message to [email protected] More majordomo info at http://vger.kernel.org/majordomo-info.html
