Hi Alan,
I've got another USB bug to report that manifests during automated
device removal testing on RHEL7. This one hits the BUG() inside
qh_destroy:
PID: 139 TASK: ffff881054101960 CPU: 22 COMMAND: "kworker/u66:0"
#0 [ffff881054113540] machine_kexec at ffffffff810411b1
#1 [ffff881054113598] crash_kexec at ffffffff810cf272
#2 [ffff881054113668] oops_end at ffffffff815eb548
#3 [ffff881054113690] die at ffffffff8101632b
#4 [ffff8810541136c0] do_trap at ffffffff815eac20
#5 [ffff881054113710] do_invalid_op at ffffffff81013134
#6 [ffff8810541137c0] invalid_op at ffffffff815f485e
[exception RIP: qh_destroy+0x66]
RIP: ffffffff81412066 RSP: ffff881054113870 RFLAGS: 00010246
RAX: ffff88084b84dc98 RBX: ffff88084b84dc80 RCX: ffff88107fdf0874
RDX: 0000000000000056 RSI: ffff88084b84dc80 RDI: ffff880078f12740
RBP: ffff881054113880 R8: ffffffff819e4ba0 R9: 084f8036100c0000
R10: f79281ddd0018403 R11: 0000000000000400 R12: ffff880078f12740
R13: ffff880078f1282c R14: ffff8810405745b8 R15: 0000000000000048
ORIG_RAX: ffffffffffffffff CS: 0010 SS: 0018
#7 [ffff881054113888] ehci_mem_cleanup at ffffffff814132b8
#8 [ffff8810541138c8] ehci_stop at ffffffff81417a92
#9 [ffff881054113950] usb_remove_hcd at ffffffff813f8d38
#10 [ffff881054113980] usb_hcd_pci_remove at ffffffff8140a15f
#11 [ffff8810541139a8] pci_device_remove at ffffffff812ec5bb
#12 [ffff8810541139d0] __device_release_driver at ffffffff813b6b9f
#13 [ffff8810541139f0] device_release_driver at ffffffff813b6c33
#14 [ffff881054113a10] bus_remove_device at ffffffff813b63a8
#15 [ffff881054113a38] device_del at ffffffff813b2955
#16 [ffff881054113a68] pci_stop_bus_device at ffffffff812e54b4
#17 [ffff881054113a90] pci_stop_bus_device at ffffffff812e545b
#18 [ffff881054113ab8] pci_stop_bus_device at ffffffff812e545b
#19 [ffff881054113ae0] pci_stop_and_remove_bus_device at ffffffff812e55a2
#20 [ffff881054113af8] ioboard_bringdown at ffffffffa0400917 [ftmod]
#21 [ffff881054113b20] ftmod_ioboard_event at ffffffffa0400c07 [ftmod]
#22 [ffff881054113ba8] io_state_change at ffffffffa03fe7b2 [ftmod]
#23 [ffff881054113bf8] OsIoStateChange at ffffffffa06d25d4 [ccmod]
#24 [ffff881054113c10] IoStateChange at ffffffffa06d3008 [ccmod]
#25 [ffff881054113d40] CcIoPoll at ffffffffa06d6cb4 [ccmod]
#26 [ffff881054113de0] CcIoPoll at ffffffffa03f9b76 [ftmod]
#27 [ffff881054113df8] io_poll at ffffffffa03fe6d5 [ftmod]
#28 [ffff881054113e20] process_one_work at ffffffff8107e03b
#29 [ffff881054113e68] worker_thread at ffffffff8107ee0b
#30 [ffff881054113ec8] kthread at ffffffff81085aff
#31 [ffff881054113f50] ret_from_fork at ffffffff815f306c
drivers/usb/host/ehci-mem.c: 68
<qh_destroy+0xa>: mov %rsp,%rbp
<qh_destroy+0xd>: push %r12
<qh_destroy+0xf>: mov %rdi,%r12
<qh_destroy+0x12>: push %rbx
...
drivers/usb/host/ehci-mem.c: 68
<qh_destroy+0x17>: mov %rsi,%rbx
...
drivers/usb/host/ehci-mem.c: 72
<qh_destroy+0x66>: ud2
67 static void qh_destroy(struct ehci_hcd *ehci, struct ehci_qh *qh)
68 {
69 /* clean qtds first, and know this is not linked */
70 if (!list_empty (&qh->qtd_list) || qh->qh_next.ptr) {
71 ehci_dbg (ehci, "unused qh not empty!\n");
72 BUG ();
73 }
74 if (qh->dummy)
75 ehci_qtd_free (ehci, qh->dummy);
76 dma_pool_free(ehci->qh_pool, qh->hw, qh->qh_dma);
77 kfree(qh);
78 }
According to disassembly, R12 has a copy of the initial RDI value and
and RBX has a copy of the initial RSI, so:
R12: struct ehci_hcd ffff880078f12740
RBX: struct ehci_qh ffff88084b84dc80
crash> struct ehci_hcd ffff880078f12740
struct ehci_hcd {
next_hrtimer_event = 99,
enabled_hrtimer_events = 0x0,
hr_timeouts = {{
tv64 = 0x0
}, {
tv64 = 0x0
}, {
tv64 = 0x0
}, {
tv64 = 0x0
}, {
tv64 = 0x0
}, {
tv64 = 0x0
}, {
tv64 = 0x0
}, {
tv64 = 0x0
}, {
tv64 = 0x0
}, {
tv64 = 0x0
}, {
tv64 = 0x0
}},
hrtimer = {
node = {
node = {
__rb_parent_color = 0xffff880078f127a0,
rb_right = 0x0,
rb_left = 0x0
},
expires = {
tv64 = 0x0
}
},
_softexpires = {
tv64 = 0x0
},
function = 0xffffffff814103c0 <ehci_hrtimer_func>,
base = 0xffff88085fc0e780,
state = 0x0,
start_pid = 0xffffffff,
start_site = 0x0,
start_comm = "\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000"
},
PSS_poll_count = 0x0,
ASS_poll_count = 0x0,
died_poll_count = 0x0,
caps = 0xffffc9001680e000,
regs = 0xffffc9001680e020,
debug = 0x0,
hcs_params = 0x204008,
lock = {
{
rlock = {
raw_lock = {
{
head_tail = 0x580058,
tickets = {
head = 0x58,
tail = 0x58
}
}
}
}
}
},
rh_state = EHCI_RH_HALTED,
scanning = 0x0,
need_rescan = 0x0,
intr_unlinking = 0x0,
iaa_in_progress = 0x0,
async_unlinking = 0x0,
shutdown = 0x1,
qh_scan_next = 0x0,
async = 0xffff88084b84dc80,
dummy = 0x0,
async_unlink = {
next = 0xffff880078f12850,
prev = 0xffff880078f12850
},
async_idle = {
next = 0xffff880078f12860,
prev = 0xffff880078f12860
},
async_unlink_cycle = 0x0,
async_count = 0x1,
periodic_size = 0x400,
periodic = 0xffff880078d58000,
periodic_dma = 0x78d58000,
intr_qh_list = {
next = 0xffff880078f12890,
prev = 0xffff880078f12890
},
i_thresh = 0x0,
pshadow = 0xffff88084b7d4290,
intr_unlink_wait = {
next = 0xffff880078f128b0,
prev = 0xffff880078f128b0
},
intr_unlink = {
next = 0xffff880078f128c0,
prev = 0xffff880078f128c0
},
intr_unlink_wait_cycle = 0x0,
intr_unlink_cycle = 0x0,
now_frame = 0x0,
last_iso_frame = 0x0,
intr_count = 0x0,
isoc_count = 0x0,
periodic_count = 0x0,
uframe_periodic_max = 0x64,
cached_itd_list = {
next = 0xffff880078f128f0,
prev = 0xffff880078f128f0
},
last_itd_to_free = 0x0,
cached_sitd_list = {
next = 0xffff880078f12908,
prev = 0xffff880078f12908
},
last_sitd_to_free = 0x0,
reset_done = {0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0,
0x0, 0x0, 0x0},
bus_suspended = 0x0,
companion_ports = 0x0,
owned_ports = 0x0,
port_c_suspend = 0x0,
suspended_ports = 0x0,
resuming_ports = 0x0,
qh_pool = 0xffff88084b84c1c8,
qtd_pool = 0xffff88084b84f900,
itd_pool = 0xffff88084b84fac8,
sitd_pool = 0xffff88084b84c000,
random_frame = 0x0,
next_statechange = 0x10023df2f,
last_periodic_enable = {
tv64 = 0x1392c8d7a295b667
},
command = 0x10000,
no_selective_suspend = 0x0,
has_fsl_port_bug = 0x0,
big_endian_mmio = 0x0,
big_endian_desc = 0x0,
big_endian_capbase = 0x0,
has_amcc_usb23 = 0x0,
need_io_watchdog = 0x0,
amd_pll_fix = 0x0,
use_dummy_qh = 0x0,
has_synopsys_hc_bug = 0x0,
frame_index_bug = 0x0,
need_oc_pp_cycle = 0x0,
ohci_hcctrl_reg = 0x0,
has_hostpc = 0x0,
has_ppcd = 0x0,
sbrn = 0x20,
stats = {
normal = 0x0,
error = 0x0,
iaa = 0x0,
lost_iaa = 0x0,
complete = 0x1,
unlink = 0x0
},
debug_dir = 0xffff88084f900208,
priv = 0xffff880078f12a50
}
from the ehci_hcd, we can also figure out the containing usb_hcd:
crash> struct -o usb_hcd | grep hcd_priv
[0x220] unsigned long hcd_priv[];
crash> p 0xffff880078f12740-0x220
$2 struct = 0xffff880078f12520
struct usb_hcd 0xffff880078f12520
crash> struct usb_hcd 0xffff880078f12520
struct usb_hcd {
self = {
controller = 0xffff8810405745b8,
busnum = 0x3,
bus_name = 0xffff88103fc11428 "0000:66:00.0",
uses_dma = 0x1,
uses_pio_for_control = 0x0,
otg_port = 0x0,
is_b_host = 0x0,
b_hnp_enable = 0x0,
no_stop_on_short = 0x0,
no_sg_constraint = 0x0,
sg_tablesize = 0xffffffff,
devnum_next = 0x6,
devmap = {
devicemap = {0x0, 0x0}
},
root_hub = 0xffff880078f16f60,
hs_companion = 0x0,
bus_list = {
next = 0xffff88084b7b6fa8,
prev = 0xffffffff81994030 <usb_bus_list>
},
bandwidth_allocated = 0x0,
bandwidth_int_reqs = 0x0,
bandwidth_isoc_reqs = 0x0,
resuming_ports = 0x0,
mon_bus = 0xffff88084b84f1e0,
monitored = 0x0
},
kref = {
refcount = {
counter = 0x2
}
},
product_desc = 0xffffffff8183b3d8 "EHCI Host Controller",
speed = 0x20,
irq_descr = "ehci_hcd:usb3\000\000\000\000\000\000\000\000\000\000",
rh_timer = {
entry = {
next = 0x0,
prev = 0x0
},
expires = 0x0,
base = 0xffffffff81c9b4c0 <boot_tvec_bases>,
function = 0xffffffff813f8600 <rh_timer_func>,
data = 0xffff880078f12520,
slack = 0xffffffff,
start_pid = 0xffffffff,
start_site = 0x0,
start_comm = "\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000"
},
status_urb = 0x0,
wakeup_work = {
data = {
counter = 0xfffffffe0
},
entry = {
next = 0xffff880078f12628,
prev = 0xffff880078f12628
},
func = 0xffffffff813f7cf0 <hcd_resume_work>
},
driver = 0xffffffff819ec2a0 <ehci_pci_hc_driver>,
phy = 0x0,
flags = 0x41,
rh_registered = 0x0,
rh_pollable = 0x0,
msix_enabled = 0x0,
uses_new_polling = 0x1,
wireless = 0x0,
authorized_default = 0x1,
has_tt = 0x0,
irq = 0xc,
regs = 0xffffc9001680e000,
rsrc_start = 0xa0000000,
rsrc_len = 0x400,
power_budget = 0x0,
high_prio_bh = {
running = 0x0,
lock = {
{
rlock = {
raw_lock = {
{
head_tail = 0x60006,
tickets = {
head = 0x6,
tail = 0x6
}
}
}
}
}
},
head = {
next = 0xffff880078f12688,
prev = 0xffff880078f12688
},
bh = {
next = 0x0,
state = 0x0,
count = {
counter = 0x0
},
func = 0xffffffff813f9050 <usb_giveback_urb_bh>,
data = 0xffff880078f12680
}
},
low_prio_bh = {
running = 0x0,
lock = {
{
rlock = {
raw_lock = {
{
head_tail = 0xc000c0,
tickets = {
head = 0xc0,
tail = 0xc0
}
}
}
}
}
},
head = {
next = 0xffff880078f126c8,
prev = 0xffff880078f126c8
},
bh = {
next = 0x0,
state = 0x0,
count = {
counter = 0x0
},
func = 0xffffffff813f9050 <usb_giveback_urb_bh>,
data = 0xffff880078f126c0
}
},
bandwidth_mutex = 0xffff88084c231a08,
shared_hcd = 0x0,
primary_hcd = 0x0,
pool = {0xffff88084b84ec88, 0xffff88084b84f738, 0xffff88084b84cc78,
0xffff88084b84c390},
state = 0x85,
hcd_priv = 0xffff880078f12740
}
and finally a dump of the ehci_qh in question:
crash> struct ehci_qh ffff88084b84dc80
struct ehci_qh {
hw = 0xffff880078d1a000,
qh_dma = 0x78d1a000,
qh_next = {
qh = 0xffff88084efe6730,
itd = 0xffff88084efe6730,
sitd = 0xffff88084efe6730,
fstn = 0xffff88084efe6730,
hw_next = 0xffff88084efe6730,
ptr = 0xffff88084efe6730 << !NULL
},
qtd_list = { << list_empty
next = 0xffff88084b84dc98,
prev = 0xffff88084b84dc98
},
intr_node = {
next = 0x0,
prev = 0x0
},
dummy = 0xffff880078d22000,
unlink_node = {
next = 0xffff88084b84dcc0,
prev = 0xffff88084b84dcc0
},
unlink_cycle = 0x0,
qh_state = 0x1, << QH_STATE_LINKED
xacterrs = 0x0,
usecs = 0x0,
gap_uf = 0x0,
c_usecs = 0x0,
tt_usecs = 0x0,
period = 0x0,
start = 0x0,
dev = 0x0,
is_out = 0x0,
clearing_tt = 0x0,
dequeue_during_giveback = 0x0,
exception = 0x0
}
The qtd_list is empty, contains only one entry, itself.
crash> struct -o ehci_qh | grep td_list
[0x18] struct list_head qtd_list;
crash> p/x 0xffff88084b84dc80 + 0x18
$1 = 0xffff88084b84dc98
but qh->qh_next.ptr is !NULL, so we hit the BUG. However, it seems that
the memory at qh->qh_next.ptr has been freed:
crash> struct ehci_qh 0xffff88084efe6730
struct ehci_qh {
hw = 0x6b6b6b6b6b6b6b6b,
qh_dma = 0x6b6b6b6b6b6b6b6b,
qh_next = {
qh = 0x6b6b6b6b6b6b6b6b,
itd = 0x6b6b6b6b6b6b6b6b,
sitd = 0x6b6b6b6b6b6b6b6b,
fstn = 0x6b6b6b6b6b6b6b6b,
hw_next = 0x6b6b6b6b6b6b6b6b,
ptr = 0x6b6b6b6b6b6b6b6b
},
qtd_list = {
next = 0x6b6b6b6b6b6b6b6b,
prev = 0x6b6b6b6b6b6b6b6b
},
intr_node = {
next = 0x6b6b6b6b6b6b6b6b,
prev = 0x6b6b6b6b6b6b6b6b
},
dummy = 0x6b6b6b6b6b6b6b6b,
unlink_node = {
next = 0x6b6b6b6b6b6b6b6b,
prev = 0x6b6b6b6b6b6b6b6b
},
unlink_cycle = 0x6b6b6b6b,
qh_state = 0x6b,
xacterrs = 0x6b,
usecs = 0x6b,
gap_uf = 0x6b,
c_usecs = 0x6b,
tt_usecs = 0x6b6b,
period = 0x6b6b,
start = 0x6b6b,
dev = 0x6b6b6b6b6b6b6b6b,
is_out = 0x1,
clearing_tt = 0x1,
dequeue_during_giveback = 0x0,
exception = 0x1
}
crash> kmem 0xffff88084efe6730
CACHE NAME OBJSIZE ALLOCATED TOTAL SLABS SSIZE
ffff88085f807600 kmalloc-128 128 2761 4305 123 16k
SLAB MEMORY NODE TOTAL ALLOCATED FREE
ffffea00213bf900 ffff88084efe4000 0 35 29 6
FREE / [ALLOCATED]
ffff88084efe6730
PAGE PHYSICAL MAPPING INDEX CNT FLAGS
ffffea00213bf980 84efe6000 0 0 0 2fffff00008000 tail
I'm not too familiar with the USB code stack, so any suggestions on
instrumentation that I can add to aid in debugging would be helpful.
Maybe some tracing in qh_link_async / single_unlink_async /
end_unlink_async /qh_link_periodic can reveal the sequence that is
leaving this dangling qh_next.ptr?
Note: This does bear some resemblance to a bug that Stratus hit a few
years ago [1] [2], however enough of the code has changed that I'm not
sure the fix for that one would apply to a modern kernel.
Regards,
-- Joe
[1] https://bugzilla.redhat.com/show_bug.cgi?id=680987
[2] http://thread.gmane.org/gmane.linux.usb.general/44788
--
To unsubscribe from this list: send the line "unsubscribe linux-usb" in
the body of a message to [email protected]
More majordomo info at http://vger.kernel.org/majordomo-info.html