On Tue, 16 Sep 2014 11:56:14 -0400
Alan Stern <[email protected]> wrote:
> On Tue, 16 Sep 2014, Joe Lawrence wrote:
>
> > > Anyway, the log above means that a QH was linked before the HC died,
> > > but then it was never unlinked. Please add a line at the start of
> > > ehci_endpoint_disable, printing out the value of ep->hcpriv for
> > > comparison with the values listed above.
> >
> > Will do.
> >
> > > Also, let's see the contents of *qh and *(qh->hw) for the left-over QH
> > > at *(ehci->async->qh_next.qh). You can get those right away from your
> > > crash dump data, right?
> >
> > Yep, interestingly enough, *(ehci->async->qh_next.qh) appears to be
> > mostly full of zeros:
>
> > crash> struct ehci_qh 0xffff88083d6b4558 <<
> > *(ehci->async->qh_next.qh)
> > struct ehci_qh {
> > hw = 0x0,
> > qh_dma = 0x0,
> > qh_next = {
> > qh = 0x0,
> > itd = 0x0,
> > sitd = 0x0,
> > fstn = 0x0,
> > hw_next = 0x0,
> > ptr = 0x0
> > },
> > qtd_list = {
> > next = 0x0,
> > prev = 0x0
> > },
> > intr_node = {
> > next = 0x0,
> > prev = 0x0
> > },
> > dummy = 0x0,
> > unlink_node = {
> > next = 0x0,
> > prev = 0x0
> > },
> > unlink_cycle = 0x6,
> > qh_state = 0x0,
> > xacterrs = 0x0,
> > usecs = 0x0,
> > gap_uf = 0x0,
> > c_usecs = 0x0,
> > tt_usecs = 0x0,
> > period = 0x0,
> > start = 0x0,
> > dev = 0x6,
> > is_out = 0x0,
> > clearing_tt = 0x0,
> > dequeue_during_giveback = 0x0,
> > exception = 0x0
> > }
>
> So this QH was destroyed properly, but somehow the link pointer in
> ehci->async continued to point at it. And it got destroyed without
> passing through single_unlink_async.
>
> Maybe you should also add a check at the end of ehci_endpoint_disable,
> just before the done: label. For example,
>
> BUG_ON(ehci->async->qh_next.qh == qh);
>
> Then if you really want to be thorough, in single_unlink_async, just
> after the "while" loop, print the values of prev, prev->qh_next.qh, qh,
> and qh->qh_next.qh. And in qh_link_async, just after the wmb(), print
> the values of head->qh_next.qh and qh. (This could end up generating
> quite a lot of output...)
>
> Alan Stern
>
Quick turn around on this one :)
crash> log | grep 0000:2c:00.0
pci 0000:2c:00.0: [8086:1d26] type 00 class 0x0c0320
pci 0000:2c:00.0: reg 0x10: [mem 0x90000000-0x900003ff]
pci 0000:2c:00.0: PME# supported from D0 D3hot D3cold
ehci-pci 0000:2c:00.0: EHCI Host Controller
ehci-pci 0000:2c:00.0: new USB bus registered, assigned bus number 1
ehci-pci 0000:2c:00.0: debug port 2
ehci-pci 0000:2c:00.0: cache line size of 64 is not supported
ehci-pci 0000:2c:00.0: irq 10, io mem 0x90000000
ehci-pci 0000:2c:00.0: USB 2.0 started, EHCI 1.00
usb usb1: SerialNumber: 0000:2c:00.0
ehci-pci 0000:2c:00.0: qh_link_async:1003 ehci = ffff88084ff1b088,
head->qh_next.qh = (null), qh = ffff88083eacae50
ehci-pci 0000:2c:00.0: HC died; cleaning up
ehci-pci 0000:2c:00.0: ehci_endpoint_disable:944 ep->hcpriv = ffff88083eacae50,
ehci(ffff88084ff1b088)->num_async = 1, ehci->async->qh_next.qh =
ffff88083eacae50
------------[ cut here ]------------
kernel BUG at drivers/usb/host/ehci-hcd.c:1002!
invalid opcode: 0000 [#1] SMP
...
CPU: 0 PID: 207 Comm: khubd Tainted: PF O--------------
3.10.0-123.6.3.el7.bigphysarea_expedient1_usb4.x86_64 #1
Hardware name: Stratus ftServer 6400/G7LAZ, BIOS BIOS Version 6.3:57 12/25/2013
task: ffff880853ff8cb0 ti: ffff880853434000 task.ti: ffff880853434000
RIP: 0010:[<ffffffff814157c6>] [<ffffffff814157c6>]
ehci_endpoint_disable+0x1f6/0x200
RSP: 0018:ffff880853435ca0 EFLAGS: 00010046
RAX: ffff8800372cc720 RBX: ffff88083eacae50 RCX: dead000000200200
RDX: ffff88085f800710 RSI: 0000000000000047 RDI: 0000000000000047
RBP: ffff880853435ce0 R08: ffff88085f800710 R09: 0000000100230022
R10: ffff88085f807600 R11: ffffffff81412158 R12: ffff88084ff1ae68
R13: ffff88084ff1b174 R14: ffff880851914a88 R15: 0000000000000246
FS: 0000000000000000(0000) GS:ffff88085fc00000(0000) knlGS:0000000000000000
CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033
CR2: 00007ffc501a2000 CR3: 0000000850402000 CR4: 00000000000407f0
DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
DR3: 0000000000000000 DR6: 00000000ffff0ff0 DR7: 0000000000000400
Stack:
ffff880800000001 ffff88083eacae50 ffff88084ff1b088 ffff880851914a40
ffff880851914a88 0000000000000001 ffff880851914a40 00000000ffffff00
ffff880853435d00 ffffffff813fabf3 ffff880851914a40 ffff880851914a88
Call Trace:
[<ffffffff813fabf3>] usb_hcd_disable_endpoint+0x33/0x40
[<ffffffff813fd8b9>] usb_disable_endpoint+0x69/0x90
[<ffffffff813effbc>] usb_ep0_reinit+0x1c/0x40
[<ffffffff813f4fba>] hub_port_connect_change+0x33a/0xb50
[<ffffffff813f5b49>] hub_events+0x379/0xa30
[<ffffffff813f6245>] hub_thread+0x45/0x1a0
[<ffffffff81086ac0>] ? wake_up_bit+0x30/0x30
[<ffffffff813f6200>] ? hub_events+0xa30/0xa30
[<ffffffff81085aff>] kthread+0xcf/0xe0
[<ffffffff81085a30>] ? kthread_create_on_node+0x140/0x140
[<ffffffff815f31ec>] ret_from_fork+0x7c/0xb0
[<ffffffff81085a30>] ? kthread_create_on_node+0x140/0x140
Just to review, I pasted the USB related patches to this kernel below. The
drivers/usb/core/hub.c changes include:
d8521af "usb: assign default peer ports for root hubs" (maxchild parts)
c605f3c "usb: hub: take hub->hdev reference when processing from eventlist"
the rest are debugging for the qh_destroy BUG.
-- Joe
diff -Nupr before/drivers/usb/core/hub.c after/drivers/usb/core/hub.c
--- before/drivers/usb/core/hub.c 2014-07-16 14:25:31.000000000 -0400
+++ after/drivers/usb/core/hub.c 2014-09-16 12:35:26.816249313 -0400
@@ -1304,6 +1304,7 @@ static int hub_configure(struct usb_hub
char *message = "out of memory";
unsigned unit_load;
unsigned full_load;
+ unsigned maxchild;
hub->buffer = kmalloc(sizeof(*hub->buffer), GFP_KERNEL);
if (!hub->buffer) {
@@ -1342,12 +1343,11 @@ static int hub_configure(struct usb_hub
goto fail;
}
- hdev->maxchild = hub->descriptor->bNbrPorts;
- dev_info (hub_dev, "%d port%s detected\n", hdev->maxchild,
- (hdev->maxchild == 1) ? "" : "s");
+ maxchild = hub->descriptor->bNbrPorts;
+ dev_info (hub_dev, "%d port%s detected\n", maxchild,
+ (maxchild == 1) ? "" : "s");
- hub->ports = kzalloc(hdev->maxchild * sizeof(struct usb_port *),
- GFP_KERNEL);
+ hub->ports = kzalloc(maxchild * sizeof(struct usb_port *), GFP_KERNEL);
if (!hub->ports) {
ret = -ENOMEM;
goto fail;
@@ -1368,11 +1368,11 @@ static int hub_configure(struct usb_hub
int i;
char portstr [USB_MAXCHILDREN + 1];
- for (i = 0; i < hdev->maxchild; i++)
+ for (i = 0; i < maxchild; i++)
portstr[i] = hub->descriptor->u.hs.DeviceRemovable
[((i + 1) / 8)] & (1 << ((i + 1) % 8))
? 'F' : 'R';
- portstr[hdev->maxchild] = 0;
+ portstr[maxchild] = 0;
dev_dbg(hub_dev, "compound device; port removable status:
%s\n", portstr);
} else
dev_dbg(hub_dev, "standalone hub\n");
@@ -1484,7 +1484,7 @@ static int hub_configure(struct usb_hub
if (hcd->power_budget > 0)
hdev->bus_mA = hcd->power_budget;
else
- hdev->bus_mA = full_load * hdev->maxchild;
+ hdev->bus_mA = full_load * maxchild;
if (hdev->bus_mA >= full_load)
hub->mA_per_port = full_load;
else {
@@ -1499,7 +1499,7 @@ static int hub_configure(struct usb_hub
hub->descriptor->bHubContrCurrent);
hub->limited_power = 1;
- if (remaining < hdev->maxchild * unit_load)
+ if (remaining < maxchild * unit_load)
dev_warn(hub_dev,
"insufficient power available "
"to use all downstream ports\n");
@@ -1567,15 +1567,17 @@ static int hub_configure(struct usb_hub
if (hub->has_indicators && blinkenlights)
hub->indicator [0] = INDICATOR_CYCLE;
- for (i = 0; i < hdev->maxchild; i++) {
+ for (i = 0; i < maxchild; i++) {
ret = usb_hub_create_port_device(hub, i + 1);
if (ret < 0) {
dev_err(hub->intfdev,
"couldn't create port%d device.\n", i + 1);
- hdev->maxchild = i;
- goto fail_keep_maxchild;
+ break;
}
}
+ hdev->maxchild = i;
+ if (ret < 0)
+ goto fail;
usb_hub_adjust_deviceremovable(hdev, hub->descriptor);
@@ -1583,8 +1585,6 @@ static int hub_configure(struct usb_hub
return 0;
fail:
- hdev->maxchild = 0;
-fail_keep_maxchild:
dev_err (hub_dev, "config failed, %s (err %d)\n",
message, ret);
/* hub_disconnect() frees urb and descriptor */
@@ -4650,9 +4650,10 @@ static void hub_events(void)
hub = list_entry(tmp, struct usb_hub, event_list);
kref_get(&hub->kref);
+ hdev = hub->hdev;
+ usb_get_dev(hdev);
spin_unlock_irq(&hub_event_lock);
- hdev = hub->hdev;
hub_dev = hub->intfdev;
intf = to_usb_interface(hub_dev);
dev_dbg(hub_dev, "state %d ports %d chg %04x evt %04x\n",
@@ -4864,6 +4865,7 @@ static void hub_events(void)
usb_autopm_put_interface(intf);
loop_disconnected:
usb_unlock_device(hdev);
+ usb_put_dev(hdev);
kref_put(&hub->kref, hub_release);
} /* end while (1) */
diff -Nupr before/drivers/usb/host/ehci.h after/drivers/usb/host/ehci.h
--- before/drivers/usb/host/ehci.h 2014-07-16 14:25:31.000000000 -0400
+++ after/drivers/usb/host/ehci.h 2014-09-16 12:35:26.836249471 -0400
@@ -229,6 +229,8 @@ struct ehci_hcd { /* one per controlle
struct dentry *debug_dir;
#endif
+ int num_async;
+
/* platform-specific data -- must come last */
unsigned long priv[0] __aligned(sizeof(s64));
};
diff -Nupr before/drivers/usb/host/ehci-hcd.c after/drivers/usb/host/ehci-hcd.c
--- before/drivers/usb/host/ehci-hcd.c 2014-07-16 14:25:31.000000000 -0400
+++ after/drivers/usb/host/ehci-hcd.c 2014-09-16 12:42:24.603336165 -0400
@@ -417,6 +417,7 @@ static void ehci_stop (struct usb_hcd *h
ehci_dbg (ehci, "stop\n");
+ ehci_info(ehci, "%s:%d ehci(%p)->num_async = %d,
ehci->async->qh_next.qh = %p\n", __func__, __LINE__, ehci, ehci->num_async,
ehci->async->qh_next.qh);
/* no more interrupts ... */
spin_lock_irq(&ehci->lock);
@@ -940,6 +941,8 @@ ehci_endpoint_disable (struct usb_hcd *h
/* ASSERT: any requests/urbs are being unlinked */
/* ASSERT: nobody can be submitting urbs for this any more */
+ ehci_info(ehci, "%s:%d ep->hcpriv = %p, ehci(%p)->num_async = %d,
ehci->async->qh_next.qh = %p\n", __func__, __LINE__, ep->hcpriv, ehci,
ehci->num_async, ehci->async->qh_next.qh);
+
rescan:
spin_lock_irqsave (&ehci->lock, flags);
qh = ep->hcpriv;
@@ -995,6 +998,9 @@ idle_timeout:
list_empty (&qh->qtd_list) ? "" : "(has tds)");
break;
}
+
+ BUG_ON(ehci->async->qh_next.qh == qh);
+
done:
ep->hcpriv = NULL;
spin_unlock_irqrestore (&ehci->lock, flags);
diff -Nupr before/drivers/usb/host/ehci-q.c after/drivers/usb/host/ehci-q.c
--- before/drivers/usb/host/ehci-q.c 2014-07-16 14:25:31.000000000 -0400
+++ after/drivers/usb/host/ehci-q.c 2014-09-16 12:42:21.553284025 -0400
@@ -959,6 +959,25 @@ static void disable_async(struct ehci_hc
ehci_poll_ASS(ehci);
}
+static void check_async_ring(struct ehci_hcd *ehci, int add)
+{
+ struct ehci_qh *qh;
+ int n;
+
+ qh = ehci->async->qh_next.qh;
+ n = ehci->num_async += add;
+
+ while (qh && n > 0) {
+ qh = qh->qh_next.qh;
+ --n;
+ }
+ if (qh || n != 0) {
+ ehci_err(ehci, "EHCI async list corrupted: num %d n %d qh %p\n",
+ ehci->num_async, n, qh);
+ BUG();
+ }
+}
+
/* move qh (and its qtds) onto async queue; maybe enable queue. */
static void qh_link_async (struct ehci_hcd *ehci, struct ehci_qh *qh)
@@ -981,6 +1000,8 @@ static void qh_link_async (struct ehci_h
qh->hw->hw_next = head->hw->hw_next;
wmb ();
+ ehci_info(ehci, "%s:%d ehci = %p, head->qh_next.qh = %p, qh = %p\n",
__func__, __LINE__, ehci, head->qh_next.qh, qh);
+
head->qh_next.qh = qh;
head->hw->hw_next = dma;
@@ -990,6 +1011,8 @@ static void qh_link_async (struct ehci_h
/* qtd completions reported later by interrupt */
enable_async(ehci);
+
+ check_async_ring(ehci, 1);
}
/*-------------------------------------------------------------------------*/
@@ -1254,10 +1277,14 @@ static void single_unlink_async(struct e
while (prev->qh_next.qh != qh)
prev = prev->qh_next.qh;
+ ehci_info(ehci, "%s:%d ehci = %p prev = %p, prev->qh_next.qh = %p, qh =
%p, qh->qh_next.qh = %p\n", __func__, __LINE__, ehci, prev, prev->qh_next.qh,
qh, qh->qh_next.qh);
+
prev->hw->hw_next = qh->hw->hw_next;
prev->qh_next = qh->qh_next;
if (ehci->qh_scan_next == qh)
ehci->qh_scan_next = qh->qh_next.qh;
+
+ check_async_ring(ehci, -1);
}
static void start_iaa_cycle(struct ehci_hcd *ehci)
--
To unsubscribe from this list: send the line "unsubscribe linux-usb" in
the body of a message to [email protected]
More majordomo info at http://vger.kernel.org/majordomo-info.html