Hi,
one our user has found ksh scripts are failing when system is under heavy
load.
We've tried to investigate this issue and found this:
When starting new sub-process this fails or even produce segfault in ksh or
glibc. It's not happening regularly, only when there are 200 copies running,
it reproduce itself after a few tens minutes as one of these problems:
1) cannot execute [Invalid argument] - strace only, no core dump
...
5844 22:55:39.141510 write(2, "+ cut -c 3-\n", 12) = 12
5844 22:55:39.141628 lseek(2, 0, SEEK_CUR) = 479
5844 22:55:39.141882 rt_sigprocmask(SIG_BLOCK, [HUP INT QUIT PIPE], [], 8) =
0
5844 22:55:39.142046 vfork( <unfinished ...>
5859 22:55:39.158619 <... fstat resumed> {st_mode=S_IFSOCK|0200, st_size=0,
...}) = 0
5859 22:55:39.158740 fstat(2, {st_mode=S_IFREG|0644, st_size=479, ...}) = 0
5859 22:55:39.158856 lseek(2, 0, SEEK_CUR) = 479
5859 22:55:39.159028 fstat(1, {st_mode=S_IFSOCK|0200, st_size=0, ...}) = 0
5859 22:55:39.159576 exit_group(0) = ?
5844 22:55:39.161879 <... vfork resumed> ) = ? ERESTARTNOINTR (To be
restarted)
5844 22:55:39.206674 --- SIGCHLD (Child exited) @ 0 (0) ---
5844 22:55:39.207144 rt_sigreturn(0x11) = 56
5844 22:55:39.207306 clone(child_stack=0x16d4, flags=CLONE_FS|CLONE_FILES|
CLONE_THREAD|CLONE_SETTLS|CLONE_DETACHED|SIGIO, tls=0x80) = -1 EINVAL (Invalid
argument)
5844 22:55:39.241907 rt_sigprocmask(SIG_SETMASK, [], NULL, 8) = 0
5844 22:55:39.243627 open("/usr/share/locale/locale.alias", O_RDONLY) = 4
5844 22:55:39.324129 fstat(4, {st_mode=S_IFREG|0644, st_size=2528, ...}) = 0
5844 22:55:39.405627 mmap(NULL, 4096, PROT_READ|PROT_WRITE, MAP_PRIVATE|
MAP_ANONYMOUS, -1, 0) = 0x2ab40711b000
5844 22:55:39.501774 read(4, "# ..."..., 4096) = 2528
5844 22:55:39.503823 brk(0x19ddc000) = 0x19ddc000
5844 22:55:39.540158 read(4, "", 4096) = 0
5844 22:55:39.540370 close(4) = 0
5844 22:55:39.540530 munmap(0x2ab40711b000, 4096) = 0
5844 22:55:39.540835
open("/usr/share/locale/en_US.UTF-8/LC_MESSAGES/libc.mo", O_RDONLY) = -1
ENOENT (No such file or directory)
<other 'locale open' lines>
5844 22:55:39.541884 open("/usr/share/locale/en/LC_MESSAGES/libc.mo",
O_RDONLY) = -1 ENOENT (No such file or directory)
5844 22:55:39.542288 write(2, "/home/testuser/test01[182]: cut: /bin/cut:
cannot execute [Invalid argument]\n", 82) = 82
...
2) vfork segfault
Thread 2 (process 8393):
#0 0x00002b3bfdc95c85 in waitpid () from /lib64/libc.so.6
#1 0x0000000000417032 in job_reap (sig=17) at
/usr/src/debug/ksh-20091211/src/cmd/ksh93/sh/jobs.c:277
#2 0x000000000044389e in sh_exec (t=0x1bfa9590, flags=4) at
/usr/src/debug/ksh-20091211/src/cmd/ksh93/sh/xec.c:1243
#3 0x0000000000440646 in sh_exec (t=0x1bfa9650, flags=5) at
/usr/src/debug/ksh-20091211/src/cmd/ksh93/sh/xec.c:1413
#4 0x000000000044044f in sh_exec (t=0xffffffffffffffff, flags=5) at
/usr/src/debug/ksh-20091211/src/cmd/ksh93/sh/xec.c:1539
#5 0x000000000043a8e2 in sh_subshell (t=0x1bfa9670, flags=5, comsub=1) at
/usr/src/debug/ksh-20091211/src/cmd/ksh93/sh/subshell.c:573
#6 0x000000000041f5cc in comsubst (mp=0x1bfa0080, t=0x1bfa9670, type=1) at
/usr/src/debug/ksh-20091211/src/cmd/ksh93/sh/macro.c:1999
#7 0x0000000000421088 in copyto (mp=0x1bfa0080, endch=0, newquote=<value
optimized out>) at /usr/src/debug/ksh-20091211/src/cmd/ksh93/sh/macro.c:586
#8 0x000000000041f355 in sh_mactrim (shp=<value optimized out>,
str=0x1bfa9411 "S_ESPEXE=`echo \"${S_CODEXP}\" | cut -c 2 `", mode=-1)
at /usr/src/debug/ksh-20091211/src/cmd/ksh93/sh/macro.c:173
#9 0x000000000042affa in nv_setlist (arg=0x1bfa9400, flags=8832, typ=0x0) at
/usr/src/debug/ksh-20091211/src/cmd/ksh93/sh/name.c:325
#10 0x0000000000441901 in sh_exec (t=0x1bfa93c0, flags=36) at
/usr/src/debug/ksh-20091211/src/cmd/ksh93/sh/xec.c:830
#11 0x0000000000407be5 in exfile ()
#12 0x0000000000407093 in sh_main ()
#13 0x00002b3bfdc19994 in __libc_start_main () from /lib64/libc.so.6
#14 0x00000000004062d9 in _start ()
Thread 1 (process 8937):
#0 0x00002b3bfdc96374 in vfork () from /lib64/libc.so.6
Cannot access memory at address 0x20c1
3) _dl_fixup segfault
Thread 2 (process 20493):
#0 0x00002b5359ef75c0 in sigaction () from /lib64/libc.so.6
#1 0x000000000047b7af in signal (sig=17, fun=0x419280 <job_waitsafe>) at
/usr/src/debug/ksh-20091211/src/lib/libast/misc/signal.c:125
#2 0x00000000004172c6 in job_reap (sig=17) at
/usr/src/debug/ksh-20091211/src/cmd/ksh93/sh/jobs.c:442
#3 0x000000000044389e in sh_exec (t=0x19b1cd80, flags=0) at
/usr/src/debug/ksh-20091211/src/cmd/ksh93/sh/xec.c:1243
#4 0x0000000000440646 in sh_exec (t=0x19b1ce50, flags=0) at
/usr/src/debug/ksh-20091211/src/cmd/ksh93/sh/xec.c:1413
#5 0x000000000044044f in sh_exec (t=0x11, flags=0) at
/usr/src/debug/ksh-20091211/src/cmd/ksh93/sh/xec.c:1539
#6 0x000000000043f4e6 in sh_exec (t=0x19b1ccb0, flags=0) at
/usr/src/debug/ksh-20091211/src/cmd/ksh93/sh/xec.c:1893
#7 0x00000000004404cf in sh_exec (t=0x19b1cc70, flags=0) at
/usr/src/debug/ksh-20091211/src/cmd/ksh93/sh/xec.c:1823
#8 0x00000000004402e9 in sh_exec (t=0x19b1d3f0, flags=0) at
/usr/src/debug/ksh-20091211/src/cmd/ksh93/sh/xec.c:1560
#9 0x0000000000444fe5 in sh_eval (iop=0x19b1b5c0, mode=0) at
/usr/src/debug/ksh-20091211/src/cmd/ksh93/sh/xec.c:510
#10 0x0000000000450491 in b_dot_cmd (n=<value optimized out>, argv=<value
optimized out>, extra=<value optimized out>) at
/usr/src/debug/ksh-20091211/src/cmd/ksh93/bltins/misc.c:294
#11 0x0000000000441dab in sh_exec (t=0x19b0c600, flags=4) at
/usr/src/debug/ksh-20091211/src/cmd/ksh93/sh/xec.c:1008
#12 0x000000000044030f in sh_exec (t=0x11, flags=1246418752) at
/usr/src/debug/ksh-20091211/src/cmd/ksh93/sh/xec.c:1824
#13 0x000000000043fe41 in sh_exec (t=0x19b0c3c0, flags=<value optimized out>)
at /usr/src/debug/ksh-20091211/src/cmd/ksh93/sh/xec.c:1680
#14 0x0000000000407be5 in exfile ()
#15 0x00000000004064b3 in sh_source ()
#16 0x0000000000406932 in sh_main ()
#17 0x00002b5359ee4994 in __libc_start_main () from /lib64/libc.so.6
#18 0x00000000004062d9 in _start ()
Thread 1 (process 21727):
#0 0x00002b535982fcd5 in _dl_fixup () from /lib64/ld-linux-x86-64.so.2
#1 0x00002b5359835892 in _dl_runtime_resolve () from /lib64/ld-linux-
x86-64.so.2
#2 0x0000000000490e1c in spawnveg ()
#3 0x0000000000433dda in _spawnveg (path=0x19b1d779 "/bin/grep",
argv=0x19b1ce30, envp=0x19b1d418, pgid=21727) at
/usr/src/debug/ksh-20091211/src/cmd/ksh93/sh/path.c:141
#4 0x0000000000434f35 in path_spawn (opath=0x19b1d779 "/bin/grep",
argv=0x19b1ce30, envp=0x19b1d418, libpath=0x7, spawn=1) at
/usr/src/debug/ksh-20091211/src/cmd/ksh93/sh/path.c:1126
#5 0x0000000000000000 in ?? ()
We've tested several versions and found there is 2009-04-15 which was running
fine for a few days:
2009-12-11 cut: cannot execute [Invalid argument]
2009-12-06 cut: cannot execute [Invalid argument]
2009-10-21 cut: cannot execute [Invalid argument]
2009-09-17 cut: cannot execute [Invalid argument]
2009-09-08 cut: cannot execute [Invalid argument]
2009-06-30 vfork segfault OR _dl_fixup segfault
2009-05-05 vfork segfault OR _dl_fixup segfault
2009-04-28 _dl_fixup segfault
2009-04-15 running fine for 2.5 days
It seems all of them are caused by memory operations between vfork and execve.
Vfork is sensitive to memory operations and any malloc / free can corrupt
memory.
It seems that under heavy load, there is increasing chance SIGCHLD handler is
executed between vfork->execve and it's memory operation is probably root of
all troubles.
An example of vfork->SIGCHLD->execve segfault:
12441 04:58:42.580130 vfork( <unfinished ...>
12586 04:58:42.580163 <... stat resumed> 0x7fff7759d0c0) = -1 ENOENT (No such
file or directory)
12586 04:58:42.580370 write(1, "/usr/local/bin:/bin:/usr/bin\n", 29) = 29
12586 04:58:42.580490 fstat(1, {st_mode=S_IFSOCK|0200, st_size=0, ...}) = 0
12586 04:58:42.580576 fstat(2, {st_mode=S_IFREG|0644, st_size=0, ...}) = 0
12586 04:58:42.580649 lseek(2, 0, SEEK_CUR) = 0
12586 04:58:42.580802 exit_group(0) = ?
12441 04:58:43.202251 <... vfork resumed> ) = ? ERESTARTNOINTR (To be
restarted)
12441 04:58:43.230899 --- SIGCHLD (Child exited) @ 0 (0) ---
12441 04:58:43.235305 rt_sigreturn(0x11) = 56
12441 04:58:43.242206 clone(child_stack=0, flags=CLONE_VM|CLONE_FILES|
CLONE_SIGHAND|CLONE_THREAD|CLONE_SETTLS|CLONE_DETACHED|141, tls=0x80) = 12772
12441 04:58:43.245141 rt_sigprocmask(SIG_SETMASK, [], <unfinished ...>
12772 04:58:43.245830 rt_sigprocmask(SIG_SETMASK, [], <unfinished ...>
12441 04:58:43.245883 <... rt_sigprocmask resumed> NULL, 8) = 0
12772 04:58:43.246377 <... rt_sigprocmask resumed> NULL, 8) = 0
12441 04:58:43.246456 execve("/bin/grep", ["/bin/grep", "-q",
"/usr/kerberos/bin"], [/* 17 vars */] <unfinished ...>
12772 04:58:43.246950 --- SIGSEGV (Segmentation fault) @ 0 (0) ---
But this is only a guess.
I can test any patch or repo snapshot.
Regards,
Michal Hlavinka
_______________________________________________
ast-developers mailing list
[email protected]
https://mailman.research.att.com/mailman/listinfo/ast-developers