On Thu, Dec 31, 2015 at 3:24 PM, Emmanuel Dreyfus <[email protected]> wrote:
> On Thu, Dec 31, 2015 at 02:51:41PM +0530, Raghavendra Talur wrote: > > To our surprise though, the hung test started proceeding. > > You mean a process gets stuck in a system call for hours and then > is able to escape? > > Some hints: > > 1) ps -axl shows the waiting channel (WCHAN column) for a process stuck > in kernel. What is it? > # ps -axl | grep 23268 0 23268 1 0 85 0 56436 12544 select Isl ? 0:02.06 glusterfs --attribute-timeout=0 --entry-timeout=0 -s nbsla # ps -axl | grep 26515 0 26515 1 0 85 0 4508 1440 kqueue S+ pts/0 0:00.04 perfused: perfused /mnt/glusterfs/0 (perfused) > > 2) crash is a kernel debugger that can be used while running multiuser. > Of course since the system is running, the output is obsolete most > of the time, but for a stuck process we can extract valuable information. > > Run crash from the shell, then inside crash, run the ps command. Find the > relevant process and note the address in the STRUCT LWP * column. For an > example, let us say it is c63452a0. > relevant lines: 23268 8 3 0 80 c4c9f000 glusterfsd parked 23268 7 3 0 80 c5223a80 glusterfsd netio 23268 6 3 0 80 c542e560 glusterfsd nanoslp 23268 5 3 1 80 c5229a80 glusterfsd parked 23268 4 3 0 80 c5418d40 glusterfsd parked 23268 3 3 0 80 c5346540 glusterfsd sigwait 23268 2 3 0 80 c4ce22c0 glusterfsd nanoslp 23268 1 3 1 80 c5418020 glusterfsd select 26515 1 3 1 80 c53692c0 perfused kqueue > > bt/a c63452a0 will produce a kernel backtrace for the process. This can > be extremely valuable to understand what is going on. If we are waiting > for a lock, we can track what process is holding it. 
> bt/a c4c9f000 trace: pid 23268 lid 8 at 0xdc171e9c sleepq_block(0,1,c047f728,c049a1bc,6,5000a018,8,dc171f54,c4c9f000,6) at sleepq_block+0x9b lwp_park(0,1,0,bb1ac150,1de,dc171f54,6,dc171f40,c4c9f000,c0494528) at lwp_park+0x115 sys____lwp_park60(c4c9f000,dc171f54,dc171f7c,dc171fa0,c02eabb7,dc171f54,1de,103,0,1) at sys____lwp_park60+0x50 syscall() at syscall+0x89 --- syscall (number 478) --- bb3bb4b7: bt/a c5223a80 trace: pid 23268 lid 7 at 0xdb781d0c sleepq_block(0,1,c0482887,c0495450,c5223a80,6473,c2d42d40,c2d41dc0,c2d29f82,0) at sleepq_block+0x9b cv_timedwait_sig(c4dabf2c,c386c3c0,0,c040eb9f,c50a1001,c4eb2f47,65686ee0,c4dabe44,c4dabe44,0) at cv_timedwait_sig+0xaa sbwait(c4dabf00,0,db781dbc,c01215cc,0,c50a1080,db781dcc,140eb9f,0,c049a760) at sbwait+0x57 soreceive(c4dabe44,0,db781ec8,0,0,0,db781e5c,c040eb9f,2,2) at soreceive+0xc59 soo_read(c4c932c0,c4c932c0,db781ec8,c4c79000,1,c2d29f80,db781e8c,c02552a5,0,0) at soo_read+0x3c do_filereadv(a,b88fff8c,2,c4c932c0,1,db781f7c,3,c5223a80,c5223a80,c5223a80) at do_filereadv+0x1f0 sys_readv(c5223a80,db781f54,db781f7c,db781fa0,c02eabb7,db781f54,78,db781f7c,a,b88fff8c) at sys_readv+0x38 syscall() at syscall+0x18b --- syscall (number 120) --- bb351877: bt/a c542e560 trace: pid 23268 lid 6 at 0xdc5edddc sleepq_block(1f5,1,c047363e,c0495dc8,0,3ffff,0,c2d41440,c049cb00,1f5) at sleepq_block+0xea kpause(c047363e,1,1f5,0,dc5edea4,c4e5df80,ffffffff,ffffffff,c4da9c00,c3458bb0) at kpause+0xe8 nanosleep1(c542e560,3,0,dc5edefc,dc5edf08,9,c38b7360,0,c542e560,c0492efc) at nanosleep1+0xe5 sys___nanosleep50(c542e560,dc5edf54,dc5edf7c,c08eb880,c048ce80,dc5edf54,1ae,b9a0e7c0,b8fff730,b8fff73c) at sys___nanosleep50+0x5f syscall() at syscall+0x89 --- syscall (number 430) --- bb351957: bt/a c5229a80 trace: pid 23268 lid 5 at 0xdba17e9c sleepq_block(ea60,1,c047f728,c049a1bc,0,64,dba17efc,c0371aeb,0,c5418d40) at sleepq_block+0xea lwp_park(0,1,dba17f18,ba40d1a4,0,257,0,3acd705f,c5229a80,c0494528) at lwp_park+0x115 
sys____lwp_park60(c5229a80,dba17f54,dba17f7c,dba17fa0,c02eab83,dba17f54,1de,103,0,1) at sys____lwp_park60+0x50 syscall() at syscall+0x89 --- syscall (number 478) --- bb3bb4b7: bt/a c5418d40 trace: pid 23268 lid 4 at 0xdb691e9c sleepq_block(ea60,1,c047f728,c049a1bc,0,db691fa8,db691eec,c0251425,db691ed4,6) at sleepq_block+0xea lwp_park(0,1,db691f18,ba40d1a4,0,257,0,3accfb61,c5418d40,c0494528) at lwp_park+0x115 sys____lwp_park60(c5418d40,db691f54,db691f7c,db691fa0,c02eab83,db691f54,1de,103,0,1) at sys____lwp_park60+0x50 syscall() at syscall+0x89 --- syscall (number 478) --- bb3bb4b7: bt/a c5346540 trace: pid 23268 lid 3 at 0xdb727e1c sleepq_block(0,1,c04716ba,c0495450,c040bc16,0,c2d42d40,c2d41400,c5346540,0) at sleepq_block+0x9b cv_timedwait_sig(c53466b4,c5004b80,0,c53466a4,3,db727e90,c53466a4,c41eb528,db727eac,7ff0) at cv_timedwait_sig+0xaa sigtimedwait1(c5346540,db727f54,db727f7c,c01026f0,c01026a0,c01026f0,c01026a0,c06d9000,c02ea954,c3469b10) at sigtimedwait1+0x242 sys_____sigtimedwait50(c5346540,db727f54,db727f7c,db727fa0,c02eab83,db727f54,1af,103,ba1fefbc,0) at sys_____sigtimedwait50+0x3f syscall() at syscall+0x89 --- syscall (number 431) --- bb39f8c7: bt/a c4ce22c0 trace: pid 23268 lid 2 at 0xdcccdddc sleepq_block(65,1,c047363e,c0495dc8,0,c4eb2f47,e1f297c1,c2d40540,c049c7dc,65) at sleepq_block+0xea kpause(c047363e,1,65,0,dcccdea4,dcccdec4,dcccdeac,c0251276,c048cce0,1) at kpause+0xe8 nanosleep1(c4ce22c0,3,0,dcccdefc,0,c2d1f548,fffffffd,a5b55001,1b,dcccdfa8) at nanosleep1+0xe5 sys___nanosleep50(c4ce22c0,dcccdf54,dcccdf7c,dcccdfa0,c02eab83,dcccdf54,1ae,103,ba3fff98,0) at sys___nanosleep50+0x5f syscall() at syscall+0x89 --- syscall (number 430) --- bb351957: bt/a c5418020 trace: pid 23268 lid 1 at 0xdb721d0c sleepq_block(2,1,c047fc22,c049a1f8,c56aae94,1,ffffffff,c040eb25,c048ce80,0) at sleepq_block+0xea sel_do_scan(30,db721f18,0,db721f7c,c2d42bc2,c4eb2f47,d5d7bdcf,c02369f7,3,3) at sel_do_scan+0x46e 
pollcommon(db721f7c,ba4143c0,6,db721f18,0,db721fa8,db721f2c,db721f40,c040eb9f,0) at pollcommon+0xe7 sys_poll(c5418020,db721f54,db721f7c,db721fa0,c02eabb7,db721f54,d1,103,ba4143c0,6) at sys_poll+0x6a syscall() at syscall+0x89 --- syscall (number 209) --- bb351917: bt/a c53692c0 trace: pid 26515 lid 1 at 0xdbce3d2c sleepq_block(0,1,c0471305,c0495450,c5066f80,1,c2d42c00,c2d42d80,1,0) at sleepq_block+0x9b cv_timedwait_sig(c38eff3c,c38eff10,0,c012f509,c048ce80,0,dbce3dbc,c040eb9f,c31a3560,c048ce80) at cv_timedwait_sig+0xaa kevent1(dbce3f7c,d,bb51f080,0,bb51f080,4,0,c044fbb0,c0492ef0,dbce3fa8) at kevent1+0x45a sys___kevent50(c53692c0,dbce3f54,dbce3f7c,dbce3fa0,c02eabb7,dbce3f54,1b3,103,d,bb51f080) at sys___kevent50+0x45 syscall() at syscall+0x89 --- syscall (number 435) --- bb679a77: Thanks for the help! > > -- > Emmanuel Dreyfus > [email protected] >
_______________________________________________ Gluster-devel mailing list [email protected] http://www.gluster.org/mailman/listinfo/gluster-devel
