On Fri, 12 Sep 2014, Alan Stern wrote:
> On Fri, 12 Sep 2014, Joe Lawrence wrote:
>
> > On Fri, 12 Sep 2014 11:31:46 -0400
> > Alan Stern <[email protected]> wrote:
> >
> > > On Thu, 11 Sep 2014, Joe Lawrence wrote:
> > >
> > > > Hi Alan,
> > > >
> > > > I've got another USB bug to report that manifests during automated
> > > > device removal testing on RHEL7. This one hits the BUG() inside
> > > > qh_destroy:
> > >
> > > How reliably can you trigger this bug?
> >
> > I have collected a few crashes within a few days, so somewhat
> > frequently.
> >
> > > > 67 static void qh_destroy(struct ehci_hcd *ehci, struct ehci_qh *qh)
> > > > 68 {
> > > > 69 /* clean qtds first, and know this is not linked */
> > > > 70 if (!list_empty (&qh->qtd_list) || qh->qh_next.ptr) {
> > > > 71 ehci_dbg (ehci, "unused qh not empty!\n");
> > > > 72 BUG ();
> > > > 73 }
> > >
> > > > and finally a dump of the ehci_qh in question:
> > > >
> > > > crash> struct ehci_qh ffff88084b84dc80
> > > > struct ehci_qh {
> > > > hw = 0xffff880078d1a000,
> > >
> > > It would be good to see the contents of the ehci_qh_hw structure. That
> > > would tell us what device and endpoint this QH was for.
> >
> > crash> struct ehci_qh_hw 0xffff880078d1a000
> > struct ehci_qh_hw {
> > hw_next = 0x78d1a062,
> > hw_info1 = 0x8000,
>
> No maxpacket value, device address, or endpoint number, but the QH_HEAD
> bit is set. That happens only with the head of the async ring. And
> indeed, the QH address agrees with ehci->async = 0xffff88084b84dc80 in
> your earlier email.
>
> > hw_info2 = 0x0,
> > hw_current = 0x0,
> > hw_qtd_next = 0x1,
> > hw_alt_next = 0x78d22000,
> > hw_token = 0x40,
> > hw_buf = {0x0, 0x0, 0x0, 0x0, 0x0},
> > hw_buf_hi = {0x0, 0x0, 0x0, 0x0, 0x0}
> > }
> >
> > > > qh_dma = 0x78d1a000,
> > > > qh_next = {
> > > > qh = 0xffff88084efe6730,
> > > > itd = 0xffff88084efe6730,
> > > > sitd = 0xffff88084efe6730,
> > > > fstn = 0xffff88084efe6730,
> > > > hw_next = 0xffff88084efe6730,
> > > > ptr = 0xffff88084efe6730 << !NULL
> > > > },
>
> So there's a leftover qh_next pointer, presumably to a QH that used to
> be on the async list but no longer exists.
>
> This means the list pointers got corrupted somehow. No way at this
> point to know just how. You can add some debugging code to check the
> links at the end of qh_link_async (which adds a new QH to the async
> list) and single_unlink_async (which removes a QH from the list).
Hi Alan,
I've collected 16 crashes since kicking off automated tests a little
over 24 hrs ago.
Each crash hit the BUG in qh_destroy() and the only unique debugging
printk from ehci_stop() was: "ehci_stop: ehci->num_async = 0".
I can include (or upload) a full (or filtered) vmcore-dmesg.txt if that
would be more helpful.
The debug code I added as you suggested is provided below...
Thanks,
-- Joe
diff -Nupr before/drivers/usb/host/ehci.h after/drivers/usb/host/ehci.h
--- before/drivers/usb/host/ehci.h 2014-07-16 14:25:31.000000000 -0400
+++ after/drivers/usb/host/ehci.h 2014-09-12 15:47:18.943478397 -0400
@@ -231,6 +231,8 @@ struct ehci_hcd { /* one per controlle
/* platform-specific data -- must come last */
unsigned long priv[0] __aligned(sizeof(s64));
+
+ int num_async;
};
/* convert between an HCD pointer and the corresponding EHCI_HCD */
diff -Nupr before/drivers/usb/host/ehci-hcd.c after/drivers/usb/host/ehci-hcd.c
--- before/drivers/usb/host/ehci-hcd.c 2014-07-16 14:25:31.000000000 -0400
+++ after/drivers/usb/host/ehci-hcd.c 2014-09-12 15:53:16.863163627 -0400
@@ -440,6 +440,8 @@ static void ehci_stop (struct usb_hcd *h
if (ehci->amd_pll_fix == 1)
usb_amd_dev_put();
+ pr_err("%s: ehci->num_async = %d\n", __func__, ehci->num_async);
+
dbg_status (ehci, "ehci_stop completed",
ehci_readl(ehci, &ehci->regs->status));
}
diff -Nupr before/drivers/usb/host/ehci-q.c after/drivers/usb/host/ehci-q.c
--- before/drivers/usb/host/ehci-q.c 2014-07-16 14:25:31.000000000 -0400
+++ after/drivers/usb/host/ehci-q.c 2014-09-12 15:52:08.023292291 -0400
@@ -959,6 +959,24 @@ static void disable_async(struct ehci_hc
ehci_poll_ASS(ehci);
}
+static void check_async_ring(struct ehci_hcd *ehci, int add)
+{
+ struct ehci_qh *qh;
+ int n;
+
+ qh = ehci->async->qh_next.qh;
+ n = ehci->num_async += add;
+ while (qh && n > 0) {
+ qh = qh->qh_next.qh;
+ --n;
+ }
+ if (qh || n != 0) {
+ ehci_err(ehci, "EHCI async list corrupted: num %d n %d qh %p\n",
+ ehci->num_async, n, qh);
+ BUG();
+ }
+}
+
/* move qh (and its qtds) onto async queue; maybe enable queue. */
static void qh_link_async (struct ehci_hcd *ehci, struct ehci_qh *qh)
@@ -990,6 +1008,8 @@ static void qh_link_async (struct ehci_h
/* qtd completions reported later by interrupt */
enable_async(ehci);
+
+ check_async_ring(ehci, 1);
}
/*-------------------------------------------------------------------------*/
@@ -1258,6 +1278,8 @@ static void single_unlink_async(struct e
prev->qh_next = qh->qh_next;
if (ehci->qh_scan_next == qh)
ehci->qh_scan_next = qh->qh_next.qh;
+
+ check_async_ring(ehci, -1);
}
static void start_iaa_cycle(struct ehci_hcd *ehci)
--
To unsubscribe from this list: send the line "unsubscribe linux-usb" in
the body of a message to [email protected]
More majordomo info at http://vger.kernel.org/majordomo-info.html