Hi,

one our user has found ksh scripts are failing when system is under heavy 
load.

We've tried to investigate this issue and found this:

When starting new sub-process this fails or even produce segfault in ksh or 
glibc. It's not happening regularly, only when there are 200 copies running, 
it reproduce itself after a few tens minutes as one of these problems:

1) cannot execute [Invalid argument] - strace only, no core dump
...
5844  22:55:39.141510 write(2, "+ cut -c 3-\n", 12) = 12
5844  22:55:39.141628 lseek(2, 0, SEEK_CUR) = 479
5844  22:55:39.141882 rt_sigprocmask(SIG_BLOCK, [HUP INT QUIT PIPE], [], 8) = 
0
5844  22:55:39.142046 vfork( <unfinished ...>
5859  22:55:39.158619 <... fstat resumed> {st_mode=S_IFSOCK|0200, st_size=0, 
...}) = 0
5859  22:55:39.158740 fstat(2, {st_mode=S_IFREG|0644, st_size=479, ...}) = 0
5859  22:55:39.158856 lseek(2, 0, SEEK_CUR) = 479
5859  22:55:39.159028 fstat(1, {st_mode=S_IFSOCK|0200, st_size=0, ...}) = 0
5859  22:55:39.159576 exit_group(0)     = ?
5844  22:55:39.161879 <... vfork resumed> ) = ? ERESTARTNOINTR (To be 
restarted)
5844  22:55:39.206674 --- SIGCHLD (Child exited) @ 0 (0) ---
5844  22:55:39.207144 rt_sigreturn(0x11) = 56
5844  22:55:39.207306 clone(child_stack=0x16d4, flags=CLONE_FS|CLONE_FILES|
CLONE_THREAD|CLONE_SETTLS|CLONE_DETACHED|SIGIO, tls=0x80) = -1 EINVAL (Invalid 
argument)
5844  22:55:39.241907 rt_sigprocmask(SIG_SETMASK, [], NULL, 8) = 0
5844  22:55:39.243627 open("/usr/share/locale/locale.alias", O_RDONLY) = 4
5844  22:55:39.324129 fstat(4, {st_mode=S_IFREG|0644, st_size=2528, ...}) = 0
5844  22:55:39.405627 mmap(NULL, 4096, PROT_READ|PROT_WRITE, MAP_PRIVATE|
MAP_ANONYMOUS, -1, 0) = 0x2ab40711b000
5844  22:55:39.501774 read(4, "# ..."..., 4096) = 2528
5844  22:55:39.503823 brk(0x19ddc000)   = 0x19ddc000
5844  22:55:39.540158 read(4, "", 4096) = 0
5844  22:55:39.540370 close(4)          = 0
5844  22:55:39.540530 munmap(0x2ab40711b000, 4096) = 0
5844  22:55:39.540835 
open("/usr/share/locale/en_US.UTF-8/LC_MESSAGES/libc.mo", O_RDONLY) = -1 
ENOENT (No such file or directory)
<other 'locale open' lines>
5844  22:55:39.541884 open("/usr/share/locale/en/LC_MESSAGES/libc.mo", 
O_RDONLY) = -1 ENOENT (No such file or directory)
5844  22:55:39.542288 write(2, "/home/testuser/test01[182]: cut: /bin/cut: 
cannot execute [Invalid argument]\n", 82) = 82
...

2) vfork segfault
Thread 2 (process 8393):
#0  0x00002b3bfdc95c85 in waitpid () from /lib64/libc.so.6
#1  0x0000000000417032 in job_reap (sig=17) at 
/usr/src/debug/ksh-20091211/src/cmd/ksh93/sh/jobs.c:277
#2  0x000000000044389e in sh_exec (t=0x1bfa9590, flags=4) at 
/usr/src/debug/ksh-20091211/src/cmd/ksh93/sh/xec.c:1243
#3  0x0000000000440646 in sh_exec (t=0x1bfa9650, flags=5) at 
/usr/src/debug/ksh-20091211/src/cmd/ksh93/sh/xec.c:1413
#4  0x000000000044044f in sh_exec (t=0xffffffffffffffff, flags=5) at 
/usr/src/debug/ksh-20091211/src/cmd/ksh93/sh/xec.c:1539
#5  0x000000000043a8e2 in sh_subshell (t=0x1bfa9670, flags=5, comsub=1) at 
/usr/src/debug/ksh-20091211/src/cmd/ksh93/sh/subshell.c:573
#6  0x000000000041f5cc in comsubst (mp=0x1bfa0080, t=0x1bfa9670, type=1) at 
/usr/src/debug/ksh-20091211/src/cmd/ksh93/sh/macro.c:1999
#7  0x0000000000421088 in copyto (mp=0x1bfa0080, endch=0, newquote=<value 
optimized out>) at /usr/src/debug/ksh-20091211/src/cmd/ksh93/sh/macro.c:586
#8  0x000000000041f355 in sh_mactrim (shp=<value optimized out>, 
str=0x1bfa9411 "S_ESPEXE=`echo \"${S_CODEXP}\" | cut -c 2 `", mode=-1)
    at /usr/src/debug/ksh-20091211/src/cmd/ksh93/sh/macro.c:173
#9  0x000000000042affa in nv_setlist (arg=0x1bfa9400, flags=8832, typ=0x0) at 
/usr/src/debug/ksh-20091211/src/cmd/ksh93/sh/name.c:325
#10 0x0000000000441901 in sh_exec (t=0x1bfa93c0, flags=36) at 
/usr/src/debug/ksh-20091211/src/cmd/ksh93/sh/xec.c:830
#11 0x0000000000407be5 in exfile ()
#12 0x0000000000407093 in sh_main ()
#13 0x00002b3bfdc19994 in __libc_start_main () from /lib64/libc.so.6
#14 0x00000000004062d9 in _start ()

Thread 1 (process 8937):
#0  0x00002b3bfdc96374 in vfork () from /lib64/libc.so.6
Cannot access memory at address 0x20c1


3) _dl_fixup segfault
Thread 2 (process 20493):
#0  0x00002b5359ef75c0 in sigaction () from /lib64/libc.so.6
#1  0x000000000047b7af in signal (sig=17, fun=0x419280 <job_waitsafe>) at 
/usr/src/debug/ksh-20091211/src/lib/libast/misc/signal.c:125
#2  0x00000000004172c6 in job_reap (sig=17) at 
/usr/src/debug/ksh-20091211/src/cmd/ksh93/sh/jobs.c:442
#3  0x000000000044389e in sh_exec (t=0x19b1cd80, flags=0) at 
/usr/src/debug/ksh-20091211/src/cmd/ksh93/sh/xec.c:1243
#4  0x0000000000440646 in sh_exec (t=0x19b1ce50, flags=0) at 
/usr/src/debug/ksh-20091211/src/cmd/ksh93/sh/xec.c:1413
#5  0x000000000044044f in sh_exec (t=0x11, flags=0) at 
/usr/src/debug/ksh-20091211/src/cmd/ksh93/sh/xec.c:1539
#6  0x000000000043f4e6 in sh_exec (t=0x19b1ccb0, flags=0) at 
/usr/src/debug/ksh-20091211/src/cmd/ksh93/sh/xec.c:1893
#7  0x00000000004404cf in sh_exec (t=0x19b1cc70, flags=0) at 
/usr/src/debug/ksh-20091211/src/cmd/ksh93/sh/xec.c:1823
#8  0x00000000004402e9 in sh_exec (t=0x19b1d3f0, flags=0) at 
/usr/src/debug/ksh-20091211/src/cmd/ksh93/sh/xec.c:1560
#9  0x0000000000444fe5 in sh_eval (iop=0x19b1b5c0, mode=0) at 
/usr/src/debug/ksh-20091211/src/cmd/ksh93/sh/xec.c:510
#10 0x0000000000450491 in b_dot_cmd (n=<value optimized out>, argv=<value 
optimized out>, extra=<value optimized out>) at 
/usr/src/debug/ksh-20091211/src/cmd/ksh93/bltins/misc.c:294
#11 0x0000000000441dab in sh_exec (t=0x19b0c600, flags=4) at 
/usr/src/debug/ksh-20091211/src/cmd/ksh93/sh/xec.c:1008
#12 0x000000000044030f in sh_exec (t=0x11, flags=1246418752) at 
/usr/src/debug/ksh-20091211/src/cmd/ksh93/sh/xec.c:1824
#13 0x000000000043fe41 in sh_exec (t=0x19b0c3c0, flags=<value optimized out>) 
at /usr/src/debug/ksh-20091211/src/cmd/ksh93/sh/xec.c:1680
#14 0x0000000000407be5 in exfile ()
#15 0x00000000004064b3 in sh_source ()
#16 0x0000000000406932 in sh_main ()
#17 0x00002b5359ee4994 in __libc_start_main () from /lib64/libc.so.6
#18 0x00000000004062d9 in _start ()

Thread 1 (process 21727):
#0  0x00002b535982fcd5 in _dl_fixup () from /lib64/ld-linux-x86-64.so.2
#1  0x00002b5359835892 in _dl_runtime_resolve () from /lib64/ld-linux-
x86-64.so.2
#2  0x0000000000490e1c in spawnveg ()
#3  0x0000000000433dda in _spawnveg (path=0x19b1d779 "/bin/grep", 
argv=0x19b1ce30, envp=0x19b1d418, pgid=21727) at 
/usr/src/debug/ksh-20091211/src/cmd/ksh93/sh/path.c:141
#4  0x0000000000434f35 in path_spawn (opath=0x19b1d779 "/bin/grep", 
argv=0x19b1ce30, envp=0x19b1d418, libpath=0x7, spawn=1) at 
/usr/src/debug/ksh-20091211/src/cmd/ksh93/sh/path.c:1126
#5  0x0000000000000000 in ?? ()


We've tested several versions and found there is 2009-04-15 which was running 
fine for a few days:

2009-12-11 cut: cannot execute [Invalid argument] 
2009-12-06 cut: cannot execute [Invalid argument]
2009-10-21 cut: cannot execute [Invalid argument]
2009-09-17 cut: cannot execute [Invalid argument]
2009-09-08 cut: cannot execute [Invalid argument]
2009-06-30 vfork segfault OR  _dl_fixup segfault
2009-05-05 vfork segfault OR  _dl_fixup segfault
2009-04-28 _dl_fixup segfault
2009-04-15 running fine for 2.5 days

It seems all of them are caused by memory operations between vfork and execve. 
Vfork is sensitive to memory operations and any malloc / free can corrupt 
memory. 

It seems that under heavy load, there is increasing chance SIGCHLD handler is 
executed between vfork->execve and it's memory operation is probably root of 
all troubles.

An example of vfork->SIGCHLD->execve segfault:

12441 04:58:42.580130 vfork( <unfinished ...>
12586 04:58:42.580163 <... stat resumed> 0x7fff7759d0c0) = -1 ENOENT (No such 
file or directory)
12586 04:58:42.580370 write(1, "/usr/local/bin:/bin:/usr/bin\n", 29) = 29
12586 04:58:42.580490 fstat(1, {st_mode=S_IFSOCK|0200, st_size=0, ...}) = 0
12586 04:58:42.580576 fstat(2, {st_mode=S_IFREG|0644, st_size=0, ...}) = 0
12586 04:58:42.580649 lseek(2, 0, SEEK_CUR) = 0
12586 04:58:42.580802 exit_group(0)     = ?
12441 04:58:43.202251 <... vfork resumed> ) = ? ERESTARTNOINTR (To be 
restarted)
12441 04:58:43.230899 --- SIGCHLD (Child exited) @ 0 (0) ---
12441 04:58:43.235305 rt_sigreturn(0x11) = 56
12441 04:58:43.242206 clone(child_stack=0, flags=CLONE_VM|CLONE_FILES|
CLONE_SIGHAND|CLONE_THREAD|CLONE_SETTLS|CLONE_DETACHED|141, tls=0x80) = 12772
12441 04:58:43.245141 rt_sigprocmask(SIG_SETMASK, [],  <unfinished ...>
12772 04:58:43.245830 rt_sigprocmask(SIG_SETMASK, [],  <unfinished ...>
12441 04:58:43.245883 <... rt_sigprocmask resumed> NULL, 8) = 0
12772 04:58:43.246377 <... rt_sigprocmask resumed> NULL, 8) = 0
12441 04:58:43.246456 execve("/bin/grep", ["/bin/grep", "-q", 
"/usr/kerberos/bin"], [/* 17 vars */] <unfinished ...>
12772 04:58:43.246950 --- SIGSEGV (Segmentation fault) @ 0 (0) ---

But this is only a guess. 

I can test any patch or repo snapshot.

Regards,
Michal Hlavinka

_______________________________________________
ast-developers mailing list
[email protected]
https://mailman.research.att.com/mailman/listinfo/ast-developers

Reply via email to