Kernel is 5.3  +  Sandvine modifications

                I have uncovered a race condition during reboot: as the system 
is going down, the kernel panics.
                This problem is reproducible on my system; it occurs in 
approximately 1 out of every 20 reboots.

                Stack Trace and variables of interest.
                #0  doadump () at pcpu.h:159
                #1  0xa05924a2 in boot (howto=260) at 
/usr/src/sys/kern/kern_shutdown.c:421
                #2  0xa05928a0 in panic (fmt=0xa076ff74 "%s") at 
/usr/src/sys/kern/kern_shutdown.c:584
                #3  0xa073853f in trap_fatal (frame=0xcdb54c1c, eva=0) at 
/usr/src/sys/i386/i386/trap.c:829
                #4  0xa0738215 in trap_pfault (frame=0xcdb54c1c, usermode=0, 
eva=8) at /usr/src/sys/i386/i386/trap.c:746
                #5  0xa0737d9e in trap (frame=
                      {tf_fs = 24, tf_es = 16, tf_ds = -1520304112, tf_edi = 
15, tf_esi = -1511925548, tf_ebp = -843756432, tf_isp = -843756472, tf_ebx = 
-1516062592
                0, tf_ecx = -1516062592, tf_eax = 0, tf_trapno = 12, tf_err = 
0, tf_eip = -1604789009, tf_cs = 8, tf_eflags = 66118, tf_esp = -1516062592, 
tf_ss = 0}
                    at /usr/src/sys/i386/i386/trap.c:436
                #6  0xa0723d8a in calltrap () at 
/usr/src/sys/i386/i386/exception.s:202
                #7  0x00000018 in ?? ()
                #8  0x00000010 in ?? ()
                #9  0xa5620010 in ?? ()
                #10 0x0000000f in ?? ()
                #11 0xa5e1d8d4 in ?? ()
                #12 0xcdb54c70 in ?? ()
                #13 0xcdb54c48 in ?? ()
                #14 0xa5a2b880 in ?? ()
                #15 0x00000000 in ?? ()
                #16 0xa5a2b880 in ?? ()
                #17 0x00000000 in ?? ()
                #18 0x0000000c in ?? ()
                #19 0x00000000 in ?? ()
                #20 0xa058dcef in cr_cansignal (cred=0xa5a2b880, 
proc=0xa5e1d8d4, signum=15) at /usr/src/sys/kern/kern_prot.c:1495
                #21 0xa058dd87 in p_cansignal (td=0xa5789c80, p=0xa5e1d8d4, 
signum=15) at /usr/src/sys/kern/kern_prot.c:1535
                #22 0xa0595192 in killpg1 (td=0xa5789c80, sig=15, 
pgid=-1511925548, all=1) at /usr/src/sys/kern/kern_sig.c:1321
                #23 0xa059553e in kill (td=0xa5789c80, uap=0xcdb54d14) at 
/usr/src/sys/kern/kern_sig.c:1398
                #24 0xa07388db in syscall (frame=
                      {tf_fs = 47, tf_es = 47, tf_ds = 47, tf_edi = 0, tf_esi = 
1, tf_ebp = -1614811884, tf_isp = -843756172, tf_ebx = 1746232072, tf_edx = 2, 
tf_ecx
                x = 37, tf_trapno = 12, tf_err = 2, tf_eip = 1745696235, tf_cs 
= 31, tf_eflags = 642, tf_esp = -1614811972, tf_ss = 47})
                    at /usr/src/sys/i386/i386/trap.c:1021
                #25 0xa0723ddf in Xint0x80_syscall () at 
/usr/src/sys/i386/i386/exception.s:263



                The code where the crash occurred in cr_cansignal

                        if (cred->cr_ruid != proc->p_ucred->cr_ruid &&
                            cred->cr_ruid != proc->p_ucred->cr_svuid &&
                            cred->cr_uid != proc->p_ucred->cr_ruid &&
                            cred->cr_uid != proc->p_ucred->cr_svuid) {
                                /* Not permitted without privilege. */
                                error = suser_cred(cred, SUSER_ALLOWJAIL);
                                if (error)
                                        return (error);
                        }


                (kgdb) p *cred
                $2 = {cr_ref = 2614, cr_uid = 0, cr_ruid = 0, cr_svuid = 0, 
cr_ngroups = 3, cr_groups = {0, 0, 5, 0 <repeats 13 times>}, cr_rgid = 0, 
cr_svgid = 0,
                  cr_uidinfo = 0xa5620740, cr_ruidinfo = 0xa5620740, cr_prison 
= 0x0, cr_label = 0x0, cr_mtxp = 0xa560946c}
                (kgdb) p *proc
                $3 = {p_list = {le_next = 0xa5b0154c, le_prev = 0xa07edc64}, 
p_ksegrps = {tqh_first = 0xa56fa620, tqh_last = 0xa56fa624}, p_threads = {
                    tqh_first = 0xa5b51e10, tqh_last = 0xa5b51e18}, p_suspended 
= {tqh_first = 0x0, tqh_last = 0xa5e1d8ec}, p_ucred = 0x0, p_fd = 0x0, p_fdtol 
= 0x0,
                  p_stats = 0xd0019000, p_limit = 0x0, p_upages_obj = 
0xa5b1cc60, p_sigacts = 0x0, p_flag = 24576, p_sflag = 1, p_state = PRS_NEW, 
p_pid = 1465, p_ha
                    le_next = 0x0, le_prev = 0xa561b6e4}, p_pglist = {le_next = 
0xa5b4cc5c, le_prev = 0xa5b4b054}, p_pptr = 0xa5b4b000, p_sibling = {le_next = 
0x0,
                    le_prev = 0xa5b4b068}, p_children = {lh_first = 0x0}, p_mtx 
= {mtx_object = {lo_class = 0xa07c19dc, lo_name = 0xa0789fa5 "process lock",
                      lo_type = 0xa0789fa5 "process lock", lo_flags = 4390912, 
lo_list = {tqe_next = 0x0, tqe_prev = 0x0}, lo_witness = 0x0}, mtx_lock = 
2776145026,
                    mtx_recurse = 0}, p_oppid = 0, p_vmspace = 0x0, p_swtime = 
9, p_realtimer = {it_interval = {tv_sec = 0, tv_usec = 0}, it_value = {tv_sec = 
0,
                      tv_usec = 0}}, p_runtime = {sec = 0, frac = 
29866236321160512}, p_uu = 0, p_su = 1590, p_iu = 0, p_uticks = 0, p_sticks = 
0, p_iticks = 0,
                  p_profthreads = 0, p_maxthrwaits = 0, p_traceflag = 0, 
p_tracevp = 0x0, p_tracecred = 0x0, p_textvp = 0x0, p_siglist = {__bits = {0, 
0, 0, 0}},
                  p_lock = 0 '\0', p_sigiolst = {slh_first = 0x0}, p_sigparent 
= 20, p_sig = 0, p_code = 0, p_stops = 0, p_stype = 0, p_step = 0 '\0', 
p_pfsflags = 0
                  p_nlminfo = 0x0, p_aioinfo = 0x0, p_singlethread = 0x0, 
p_suspcount = 0, p_xthread = 0xa5b51e10, p_boundary_count = 0, p_magic = 
3203398350,
                  p_comm = "sleep\000r", '\0' <repeats 12 times>, p_pgrp = 0x0, 
p_sysent = 0xa07dae20, p_args = 0x0, p_cpulimit = 9223372036854775807, p_nice = 
0 '\0
                  p_xstat = 0, p_klist = {kl_lock = 0xa5e1d940, kl_list = 
{slh_first = 0x0}}, p_numthreads = 1, p_numksegrps = 1, p_md = {md_ldt = 0x0}, 
p_itcallout
                    c_links = {sle = {sle_next = 0x0}, tqe = {tqe_next = 0x0, 
tqe_prev = 0x0}}, c_time = 0, c_arg = 0x0, c_func = 0, c_flags = 8}, p_uarea = 
0xd00190
                  p_acflag = 0, p_ru = 0x0, p_peers = 0x0, p_leader = 
0xa5e1d8d4, p_emuldata = 0x0, p_label = 0x0, p_sched = 0xa5e1da98}
                (kgdb)


                Notice the value of proc->p_ucred
                (kgdb) p proc->p_ucred
                $4 = (struct ucred *) 0x0
                (kgdb)

                Thus the crash.
                Somehow p_ucred has been set to NULL during this routine.
                At the time of the crash the following variables had these 
values:
                cred->jail=0 
                see_other_uids=1
                see_other_gids=1
                thus proc->p_ucred is not used before the crash.

                Uncertain where the race condition could reside.


                I noticed that in function kern_wait() in kern_exit.c
                the assignment p->p_ucred = NULL was not protected by a lock.
                I added PROC_LOCK(p) and PROC_UNLOCK(p) around the assignment, 
                but the panic still occurred.

                This is a critical issue for us and I am willing to assist in 
any way that I can.


Richard Legault
Senior Engineer
519-880-2400 ext 2722
www.sandvine.com


_______________________________________________
freebsd-stable@freebsd.org mailing list
http://lists.freebsd.org/mailman/listinfo/freebsd-stable
To unsubscribe, send any mail to "[EMAIL PROTECTED]"

Reply via email to