On 3 June 2013 15:22, Roland Mainz <[email protected]> wrote: > On Sat, Jun 1, 2013 at 4:17 AM, Cedric Blancher > <[email protected]> wrote: >> I've tried to use the realtime signals in ast-ksh 20130524 to see if >> they are reliable how but ran into instabilities again. >> My example I used for testing is this one: >> ------------cut-------------- > [snip] >> ------------cut-------------- >> >> The example should print nothing if realtime signals are working as >> expected. ast-ksh.20130524 however gives me this output: >> got {#rtar[10][*]}=1,expected 10 >> got {#rtar[11][*]}=4,expected 10 >> got {#rtar[12][*]}=4,expected 10 >> got {#rtar[13][*]}=4,expected 10 >> got {#rtar[14][*]}=5,expected 10 >> got {#rtar[15][*]}=5,expected 10 >> ( >> [10]=( >> ( >> msg=10 >> typeset -l -i pid=55918 >> ) >> ) >> [11]=( >> ( >> typeset -l -i pid=55916 >> ) >> ( >> typeset -l -i pid=55918 >> ) >> ( >> msg=11 >> typeset -l -i pid=55922 >> ) >> ( >> msg=11 >> typeset -l -i pid=55925 >> ) >> ) >> [12]=( >> ( >> msg=12 >> typeset -l -i pid=55918 >> ) >> ( >> msg=12 >> typeset -l -i pid=55920 >> ) >> ( >> typeset -l -i pid=55922 >> ) >> ( >> msg=12 >> typeset -l -i pid=55925 >> ) >> ) >> [13]=( >> ( >> msg=13 >> typeset -l -i pid=55917 >> ) >> ( >> msg=13 >> typeset -l -i pid=55918 >> ) >> ( >> msg=13 >> typeset -l -i pid=55924 >> ) >> ( >> msg=13 >> typeset -l -i pid=55925 >> ) >> ) >> [14]=( >> ( >> typeset -l -i pid=55916 >> ) >> ( >> msg=14 >> typeset -l -i pid=55917 >> ) >> ( >> msg=14 >> typeset -l -i pid=55918 >> ) >> ( >> msg=14 >> typeset -l -i pid=55924 >> ) >> ( >> msg=14 >> typeset -l -i pid=55925 >> ) >> ) >> [15]=( >> ( >> msg=15 >> typeset -l -i pid=55917 >> ) >> ( >> msg=15 >> typeset -l -i pid=55920 >> ) >> ( >> msg=15 >> typeset -l -i pid=55923 >> ) >> ( >> msg=15 >> typeset -l -i pid=55924 >> ) >> ( >> msg=15 >> typeset -l -i pid=55925 >> ) >> ) >> ) >> >> More than half of the signals are lost (which is a violation of the >> POSIX realtime spec which mandates that realtime signals must be >> reliable) and some of the array entries aren't even filled out (like >> rtar[14][0].msg is missing). > > Grumpf... I spend half the weekend digging and debugging the signal > handling code. Basically the issue is that signal+signal trap handling > in ast-ksh.2013-05-24 is IMO utterly *broken* - while testing I found > that SIGCHLD and SIGRTMIN signals are delivered _reliably_ to the > shell but the implementation then looses between 8%-50% (measured on > SuSE 12.3/Linux/AMD64 and Solaris 11) during signal trap processing. > > Example: > I applied the following test patch to ast-ksh.2013-05-24: > -- snip -- > diff -r -u src/cmd/ksh93/sh/fault.c src/cmd/ksh93/sh/fault.c > --- src/cmd/ksh93/sh/fault.c 2013-05-22 19:33:13.000000000 +0200 > +++ src/cmd/ksh93/sh/fault.c 2013-06-03 12:41:13.575689611 +0200 > @@ -72,6 +72,9 @@ > return(action); > } > > +volatile int numsigrt_received=0; > +volatile int numsigrt_processed=0; > + > /* > * Most signals caught or ignored by the shell come here > */ > @@ -86,6 +89,12 @@ > register char *trap; > register struct checkpt *pp = (struct checkpt*)shp->jmplist; > int action=0; > + > + if (sig==SIGRTMIN) > + { > + numsigrt_received++; > + } > + > /* reset handler */ > if(!(sig&SH_TRAP)) > signal(sig, sh_fault); > @@ -469,6 +478,11 @@ > > sh_setsiginfo((siginfo_t*)shp->siginfo[sig]); > #endif > cursig = sig; > + > +if (sig==SIGRTMIN) > +{ > + numsigrt_processed++; > +} > sh_trap(shp,trap,0); > #ifdef _lib_sigaction > if(shp->siginfo[sig] && sig!=SIGCHLD) > diff -r -u src/cmd/ksh93/sh/main.c src/cmd/ksh93/sh/main.c > --- src/cmd/ksh93/sh/main.c 2013-05-18 18:09:01.000000000 +0200 > +++ src/cmd/ksh93/sh/main.c 2013-06-03 12:42:35.020704526 +0200 > @@ -369,6 +369,12 @@ > sh_onstate(shp,SH_INTERACTIVE); > nv_putval(IFSNOD,(char*)e_sptbnl,NV_RDONLY); > exfile(shp,iop,fdin); > + > +extern volatile int numsigrt_received; > +extern volatile int numsigrt_processed; > +sfprintf(sfstderr, "#>>>>>>>>>>>>>>>>>>>> sum: numsigrt_received=%d, > numsigrt_processed=%d\n", numsigrt_received, numsigrt_processed); > + > + > sh_done(shp,0); > /* NOTREACHED */ > return(0); > -- snip -- > > ... and then I modified Cedric's testcase and added a workaround to > avoid that SIGCHLD handling can interfere with filling the 2D indexed > array: > -- snip -- > compound -a rtar > > function rttrap > { > integer v=${.sh.sig.value} > integer s=${#rtar[v][@]} > > rtar[$v][$s]=( > integer pid=${.sh.sig.pid} > typeset msg=${v} > ) > } > > trap 'rttrap' RTMIN > > typeset m # used in child processes > integer pid p i > (( pid=$$ )) > > for (( p=0 ; p < 5 ; p++ )) ; do > { > # sleep for a moment to make sure the > # other child processes can become ready > # (sort of "poor man's" "barrier") > sleep 1 > > for m in 'a' 'b' 'c' 'd' 'e' 'f' ; do > kill -q $((16#$m)) -s RTMIN $pid > done > > sleep 5 # make sure SIGCHLD doesn't interfere > exit 0 > } & > done > > # wait for children to terminate > # we need to loop here since wait(1) may be aborted > # by signals > while ! wait ; do > true > done > > bool fail=false > if ! (( ${#rtar[@]} == 6 )) ; then > printf "got {#rtar[@]}=%d, expected 6\n" ${#rtar[@]} > (( fail=true )) > fi > > for (( i=0xa ; i <= 0xf; i++ )) ; do > if ! (( ${#rtar[$i][*]} == 5 )) ; then > printf "got {#rtar[%d][*]}=%d," \ > $i ${#rtar[$i][*]} > printf "expected 5\n" > (( fail=true )) > fi > done > > ((fail)) && print -v rtar > -- snip -- > > > Building the modified ksh93 version and running the script returns the > following debug output: > -- snip -- > $ arch/linux.i386-64/bin/ksh ~/tmp/shrttest1.sh >/dev/null > #>>>>>>>>>>>>>>>>>>>> sum: numsigrt_received=30, numsigrt_processed=17 > -- snip -- > > The output is more or less self-describing: 30 SIGRTMIN signals have > been received but only 17 have been processed... which should not > happen. IMO the trap for RTMIN should be called 30 times - one time > for each SIGRTMIN signal received. > > The issue is not limited to RTMIN-RTMAX signal trap handling (for > example SIGCHLD handling has similar problems... which are basically > only avoided because each received SIGCHLD signal causes the shell to > probe all registered child processes) ... the whole shell signal trap > handling is IMO doomed (e.g. the same general issues apply to "bash" > and "dash") as long as... > 1. ... it happens on the signal stack itself > 2. ... C signal handlers execute shell code. The result is usually > silent data corruption within the shell. I call it "silent" since the > symptoms are usually not visible from the shell level unless a lot of > signals arrive and the shell scripts run long enough so the damage can > accumulate > 3. ... C signal handlers can poke around in every aspect of the > |Shell_t| structure. > 4. ... the code assumes that C statements like |sig &= ~SH_TRAP| are > an atomic operation (OK... wrong statement used as example) and can't > be interrupted. > On most RISC platforms this statement results in 3-5 instructions... > where another signal may interrupt between any of those 3-5 > instructions. This kind of issue can currently strike _everywhere_ in > the code and leads to corruption issues when complex variables types > like indexed arrays, associative arrays or compound variables are > processed (Cedric's demo is a good example because multiple signal > handlers work on the same 2D indexed array). > 5. ... interleaving signal handlers can overwrite data in the .sh.sig > compound variable while other signal traps are running and reading > those data > 6. ... long as ksh93's C code uses stuff like "critical sections" > where signals are silently dropped > > I'm currently drafting a prototype patch[1] (which means: David... > please wait for my prototype patch before trying to address this > yourself) to fix these problems... but any feedback on the findings > above would be nice. > > [1]=IMO the only practical solution to fix all these problems is to > "offload" the execution of signal trap shell code into the main shell > (stack), e.g. the C signal handlers only "record" all (including > SIGCHLD) the signals (including siginfo data) in a queue and the shell > processes this queue in-order. IMO this will prevent all the issues > listed above, avoid data corruption, get rid of the critical sections > and the usual stack exhaustion crashes "bash" and "dash" are famous > for when too many signals arrive.
I'm forwarding an email from Chet Ramey <[email protected]>, the bash4 author, who is exactly trying to implement the same solution on bash4 now - evict all shell script execution from the signal handler and do it on the normal stack. ---------- Forwarded message ---------- From: Chet Ramey <[email protected]> Date: 6 June 2013 17:28 Subject: Re: Executing shell code on the signal stack, or why is this is a bad idea To: Lionel Cons <[email protected]> Cc: [email protected], [email protected] On 6/6/13 5:31 AM, Lionel Cons wrote: > On 6 June 2013 11:29, Lionel Cons <[email protected]> wrote: >> Forwarding an interesting posting from Roland Mainz who did an >> investigation why signal trap processing in ksh93, bash and dash is >> currently not reliable. >> >> Lionel >> > > PS: malloc() from AT&T libast, used by ksh93, is async signal safe. > Unless bash's sh_malloc() has a similar functionality you'll need a > different solution, such as using signalfd() The internal bash malloc (lib/malloc/malloc.c) blocks signals, but it's easy to compile bash without it, and many malloc implementations (e.g., the Linux libc malloc) do not do signal manipulation. I did a lot of work between bash-4.2 and bash-4.3, currently in alpha testing, to move any execution of non-signal-safe functions and code out of signal handlers. Chet -- ``The lyf so short, the craft so long to lerne.'' - Chaucer ``Ars longa, vita brevis'' - Hippocrates Chet Ramey, ITS, CWRU [email protected] http://cnswww.cns.cwru.edu/~chet/ _______________________________________________ ast-developers mailing list [email protected] http://lists.research.att.com/mailman/listinfo/ast-developers
