On Sun, Aug 03, 2008 at 09:32:40AM +0200, Pawel Jakub Dawidek wrote: > Hi. > > Linker can easly deadlock when we try to load the same kernel module > from two processes at the same time. This is because we drop kld_sx in > linker_load_file() and reacquire it, which leads to LOR, because we > already held vnode lock at this point. Interesing backtraces below. > > First process: > > db> tr 3066 > Tracing pid 3066 tid 100090 td 0x8514b240 > sched_switch(8514b240,0,104,177,bb6bbb2e,...) at sched_switch+0x40e > mi_switch(104,0,80681605,1ca,0,...) at mi_switch+0x200 > sleepq_switch(8514b240,0,80681605,237,80a281ec,...) at sleepq_switch+0x14d > sleepq_wait(80a281ec,0,8067a18b,3,0,...) at sleepq_wait+0x63 > _sx_xlock_hard(80a281ec,8514b240,0,8067a1cf,1a0,...) at _sx_xlock_hard+0x2c6 > _sx_xlock(80a281ec,0,8067a1cf,1a0,0,...) at _sx_xlock+0x99 > linker_load_module(853a1264,0,83ba8940,83ba893c,83ba8938,...) at > linker_load_module+0xa4a > linker_load_dependencies(84fb8500,bb74,8539f000,2adc,156000,...) at > linker_load_dependencies+0x194 > link_elf_load_file(806b74e0,8557e4c0,83ba8c24,17c,0,...) at > link_elf_load_file+0x4f0 > linker_load_module(0,83ba8c4c,8067a1cf,3cd,280cb730,...) at > linker_load_module+0x8db > kern_kldload(8514b240,8592d400,83ba8c70,0,b395eb11,...) at kern_kldload+0xc8 > [...] > db> show lock 0x80a281ec > class: sx > name: kernel linker > state: XLOCK: 0x8514bd80 (tid 100117, pid 3065, "zpool") > waiters: exclusive > > Second process: > > db> tr 3065 > Tracing pid 3065 tid 100117 td 0x8514bd80 > sched_switch(8514bd80,0,104,177,bb7e358b,...) at sched_switch+0x40e > mi_switch(104,0,80681605,1ca,50,...) at mi_switch+0x200 > sleepq_switch(8514bd80,0,80681605,237,8523d9c0,...) at sleepq_switch+0x14d > sleepq_wait(8523d9c0,50,806906bb,4,0,...) at sleepq_wait+0x63 > __lockmgr_args(8523d9c0,80100,8523da28,0,0,...) at __lockmgr_args+0x9a5 > vop_stdlock(83bd2660,8508aa80,2,80100,8523d968,...) at vop_stdlock+0x65 > VOP_LOCK1_APV(806c3560,83bd2660,806d2ac0,8523d968,80100,...) at > VOP_LOCK1_APV+0xa5 > _vn_lock(8523d968,80100,8068815b,802,804c9cb4,...) at _vn_lock+0x5e > vget(8523d968,80100,8514bd80,1b7,8065d00f,...) at vget+0xc9 > cache_lookup(85090158,83bd2a00,83bd2a14,0,84f3b400,...) at cache_lookup+0x4c2 > nfs_lookup(83bd2838,80688e43,806d2720,80000,85090158,...) at nfs_lookup+0x101 > VOP_LOOKUP_APV(806c3560,83bd2838,8068783d,1bd,83bd2a00,...) at > VOP_LOOKUP_APV+0xe5 > lookup(83bd29e8,8068783d,e0,c0,8506e52c,...) at lookup+0x52e > namei(83bd29e8,81159a38,80a352b4,4,8067be1f,...) at namei+0x48b > vn_open_cred(83bd29e8,83bd2a4c,0,84f3b400,0,...) at vn_open_cred+0x2ba > vn_open(83bd29e8,83bd2a4c,0,0,806b2a00,...) at vn_open+0x33 > linker_lookup_file(3,0,3,8514bd80,0,...) at linker_lookup_file+0x163 > linker_load_module(0,83bd2c4c,8067a1cf,3cd,280cb730,...) at > linker_load_module+0x7bd > kern_kldload(8514bd80,85a7e400,83bd2c70,0,b395eb11,...) at kern_kldload+0xc8 > [...] > db> show vnode 0x8523d968 > vnode 0x8523d968: tag nfs, type VREG > usecount 1, writecount 0, refcount 189 mountedhere 0 > flags () > v_object 0x852489b0 ref 0 pages 372 > lock type nfs: EXCL by thread 0x8514b240 (pid 3066) > with exclusive waiters pending > #0 0x804c2e5d at __lockmgr_args+0xa6d > #1 0x80546c85 at vop_stdlock+0x65 > #2 0x8065dcd5 at VOP_LOCK1_APV+0xa5 > #3 0x805627ee at _vn_lock+0x5e > #4 0x80557419 at vget+0xc9 > #5 0x805444b2 at cache_lookup+0x4c2 > #6 0x805c3b51 at nfs_lookup+0x101 > #7 0x8065ee65 at VOP_LOOKUP_APV+0xe5 > #8 0x8054a9be at lookup+0x52e > #9 0x8054b5eb at namei+0x48b > #10 0x805621da at vn_open_cred+0x2ba > #11 0x80562463 at vn_open+0x33 > #12 0x804f45e8 at link_elf_load_file+0x68 > #13 0x804c0f9b at linker_load_module+0x8db > #14 0x804c1568 at kern_kldload+0xc8 > #15 0x804c1624 at kldload+0x74 > #16 0x80650513 at syscall+0x283 > #17 0x80634e40 at Xint0x80_syscall+0x20 > [...]
Source line backtraces would be nicer, since gcc inliner forces me to make
a guess. It seems that linker_load_module() calls linker_load_file()
that drops and reaquires the linker lock.
Then, it seems that dropping the module' vnode lock around the call to
linker_load_dependencies() should help.
diff --git a/sys/kern/link_elf.c b/sys/kern/link_elf.c
index 2664ba9..52b3f8f 100644
--- a/sys/kern/link_elf.c
+++ b/sys/kern/link_elf.c
@@ -802,7 +802,9 @@ link_elf_load_file(linker_class_t cls, const char* filename,
goto out;
link_elf_reloc_local(lf);
+ VOP_UNLOCK(nd.ni_vp, 0);
error = linker_load_dependencies(lf);
+ vn_lock(nd.ni_vp, LK_EXCLUSIVE | LK_RETRY);
if (error)
goto out;
#if 0 /* this will be more trouble than it's worth for now */
diff --git a/sys/kern/link_elf_obj.c b/sys/kern/link_elf_obj.c
index d8e9219..657dd0e 100644
--- a/sys/kern/link_elf_obj.c
+++ b/sys/kern/link_elf_obj.c
@@ -798,7 +798,9 @@ link_elf_load_file(linker_class_t cls, const char *filename,
link_elf_reloc_local(lf);
/* Pull in dependencies */
+ VOP_UNLOCK(nd.ni_vp);
error = linker_load_dependencies(lf);
+ vn_lock(nd.ni_vp, LK_EXCLUSIVE | LK_RETRY);
if (error)
goto out;
pgpGwo2RUgLt3.pgp
Description: PGP signature

