Hi!

----

While playing around with realtime signals I hacked together a
"simple" testcase which shows some problems in ksh93 (ast-ksh.2013-03-18):
-- snip --
# Testcase: many background "kill" jobs each send one RTMIN signal
# (carrying a distinct integer value) to this shell; the RTMIN trap
# records every delivery so the number of recorded deliveries can be
# compared with the number of signals sent.
builtin wc

# config
# read-only count of background "kill" jobs (i.e. signals sent)
integer -r num_attackers=50

# array of compound variables; the trap appends one element per call
compound -a ar

# record the value and the sender pid that came with each RTMIN signal
trap 'ar+=( integer value=${.sh.sig.value} pid=${.sh.sig.pid} )' RTMIN

# pid of this shell, used as the target of all signals
integer thispid=$$
integer i

# each background job sends RTMIN to this shell with value $i (0..num_attackers-1)
for (( i=0 ; i < num_attackers ; i++ )) ; do
        kill -q $i -RTMIN $thispid &
done

# wait for all child processes
# (repeat for as long as "wait" returns a non-zero status)
while ! wait ; do
        true
done

# list jobs (this list should be empty after the
# "wait"-loop above)
jobdata=${ jobs 2>&1 ; }
printf '%s\n' "$jobdata"

# dump the recorded signal data
print -v ar
# summary: recorded trap calls vs. signals sent, and leftover job lines
printf '# number of array elements in ar=%d, expected %d.\n' \
        ${#ar[@]} num_attackers
printf '# job data, expected 0 lines, got %d.\n' \
        $(wc -l <<<"$jobdata")

print '# done.'
-- snip --

Running this gives some weird (and variable) output:
-- snip --
$ ~/bin/ksh sigrtstorm1.sh
[50] +  Running                 <command unknown>
[49] -  Running                 <command unknown>
[48]    Running                 <command unknown>
[47]   Lowest priority realtime signal <command unknown>
[46]    Running                 <command unknown>
[45]    Running                 <command unknown>
[44]    Running                 <command unknown>
[43]    Running                 <command unknown>
[42]    Running                 <command unknown>
[41]   Lowest priority realtime signal <command unknown>
[40]    Running                 <command unknown>
[39]    Running                 <command unknown>
[38]    Running                 <command unknown>
[37]    Running                 <command unknown>
[36]    Running                 <command unknown>
[35]   Lowest priority realtime signal <command unknown>
[34]    Running                 <command unknown>
[33]    Running                 <command unknown>
[32]   Lowest priority realtime signal <command unknown>
[31]    Running                 <command unknown>
[30]    Running                 <command unknown>
[29]   Lowest priority realtime signal <command unknown>
[28]    Running                 <command unknown>
[27]    Running                 <command unknown>
[26]   Lowest priority realtime signal <command unknown>
[25]    Running                 <command unknown>
[24]   Lowest priority realtime signal <command unknown>
[23]    Running                 <command unknown>
[22]    Running                 <command unknown>
[21]    Running                 <command unknown>
[20]    Running                 <command unknown>
[19]   Lowest priority realtime signal <command unknown>
[18]    Running                 <command unknown>
[17]    Running                 <command unknown>
[16]   Lowest priority realtime signal <command unknown>
[15]    Running                 <command unknown>
[14]    Running                 <command unknown>
[13]    Running                 <command unknown>
[12]    Running                 <command unknown>
[11]   Lowest priority realtime signal <command unknown>
[10]    Running                 <command unknown>
[9]   Lowest priority realtime signal <command unknown>
[8]    Running                 <command unknown>
[7]    Running                 <command unknown>
[6]    Running                 <command unknown>
[5]    Running                 <command unknown>
[4]    Running                 <command unknown>
[3]   Lowest priority realtime signal <command unknown>
[2]    Running                 <command unknown>
[1]    Running                 <command unknown>
(
        (
                typeset -l -i pid=3254
                typeset -l -i value=0
        )
        (
                typeset -l -i pid=3254
                typeset -l -i value=0
        )
        (
                typeset -l -i pid=3254
                typeset -l -i value=0
        )
        (
                typeset -l -i pid=3261
                typeset -l -i value=4
        )
        (
                typeset -l -i pid=3261
                typeset -l -i value=4
        )
        (
                typeset -l -i pid=3254
                typeset -l -i value=0
        )
        (
                typeset -l -i pid=3254
                typeset -l -i value=0
        )
        (
                typeset -l -i pid=3254
                typeset -l -i value=0
        )
        (
                typeset -l -i pid=3254
                typeset -l -i value=0
        )
        (
                typeset -l -i pid=3254
                typeset -l -i value=0
        )
        (
                typeset -l -i pid=3254
                typeset -l -i value=0
        )
        (
                typeset -l -i pid=3254
                typeset -l -i value=0
        )
        (
                typeset -l -i pid=3284
                typeset -l -i value=19
        )
        (
                typeset -l -i pid=3254
                typeset -l -i value=0
        )
        (
                typeset -l -i pid=3254
                typeset -l -i value=0
        )
        (
                typeset -l -i pid=3254
                typeset -l -i value=0
        )
        (
                typeset -l -i pid=3254
                typeset -l -i value=0
        )
        (
                typeset -l -i pid=3254
                typeset -l -i value=0
        )
        (
                typeset -l -i pid=3254
                typeset -l -i value=0
        )
        (
                typeset -l -i pid=3254
                typeset -l -i value=0
        )
        (
                typeset -l -i pid=3254
                typeset -l -i value=0
        )
        (
                typeset -l -i pid=3254
                typeset -l -i value=0
        )
        (
                typeset -l -i pid=3316
                typeset -l -i value=39
        )
        (
                typeset -l -i pid=3254
                typeset -l -i value=0
        )
        (
                typeset -l -i pid=3254
                typeset -l -i value=0
        )
        (
                typeset -l -i pid=3323
                typeset -l -i value=44
        )
        (
                typeset -l -i pid=3319
                typeset -l -i value=41
        )
        (
                typeset -l -i pid=3319
                typeset -l -i value=41
        )
        (
                typeset -l -i pid=3322
                typeset -l -i value=43
        )
        (
                typeset -l -i pid=3329
                typeset -l -i value=49
        )
        (
                typeset -l -i pid=3312
                typeset -l -i value=36
        )
        (
                typeset -l -i pid=3327
                typeset -l -i value=47
        )
)
# number of array elements in ar=32, expected 50.
# job data, expected 0 lines, got 50.
# done.
-- snip --

AFAIK four things are wrong:
1. The shell receives 50 SIGRTMIN signals... but the SIGRTMIN trap is
only called 32 times (the number is variable)
2. It seems even after a loop of $ while ! wait ; do true ; done # the
child processes were not reaped... why does that happen ?
3. The output of $ jobs -l # contains messages like "[47]   Lowest
priority realtime signal <command unknown>" ... which at least sounds
wrong...
4. The realtime value (yes, POSIX realtime signals can pass _values_
via signals) is often 0 (see output "value=0") but this value should
occur only once

Digging around in the code I found that at least part of the problem
is that signals arrive faster than they can be processed by the shell
trap... therefore I hacked-up a patch (attached as
"ksh93_sigrt_siginfo_queue001.diff.txt") which implements a simple
queue system which saves the siginfo data in a singly-linked list and
uses that list when the matching shell trap is called (e.g. the shell
trap is called once for each |siginfo_chain_t| entry).

* The good news is: Under valgrind control the shell (which previously
only called the shell trap for SIGRTMIN 3-5 times for the example code
above) now calls the shell trap exactly 50 times
* The bad news is: Without valgrind the number of trap calls is not
exactly 50... but the number correlates exactly with the number of $
jobs -l #-lines complaining about "[47]   Lowest priority realtime
signal <command unknown>" (see [3] above), e.g. if 8 lines of "[47]
Lowest priority realtime signal <command unknown>" occur then array
'ar' has exactly 42 entries...

... erm: David/Phong: Any idea what may go wrong ? What do you think
about the patch ([1]) ?

[1]=Note the patch is not exactly what I wish for... there are two issues:
1. I'd like to have the shell traps called exactly in-order in which
they arrive, e.g. instead of having lists per signal number to queue
the siginfo data there should only be one global list (the typical
issue Irek brought up was that if a process sends a RTMIN signal and
then terminates, currently the SIGCLD trap for that child process is
executed before the RTMIN signal is processed)
2. The list management is not fully async-signal-safe, e.g. this code:
-- snip --
+                                       si =
(siginfo_chain_t*)shp->siginfo[sig];
+                                       shp->siginfo[sig]=NULL;
-- snip --
... which is used to grab the current list of queued siginfo data for
processing may suffer from a race condition when a signal handler is
called exactly for these instructions (technically async-signals can
interrupt any instruction).
A mutex is not possible (for obvious reasons) ... and the "official"
way to disable signals (which would mean _all_ signals for which shell
traps are registered if we implement a single list for all kinds of
siginfo data) during that time is IMO far too heavyweight... any ideas
what can be used (yes... I saw the discussion about ASO CAS... can
that be used ?) ?

----

Bye,
Roland

-- 
  __ .  . __
 (o.\ \/ /.o) [email protected]
  \__\/\/__/  MPEG specialist, C&&JAVA&&Sun&&Unix programmer
  /O /==\ O\  TEL +49 641 3992797
 (;O/ \/ \O;)
diff -r -u src/cmd/ksh93/sh/fault.c src/cmd/ksh93/sh/fault.c
--- src/cmd/ksh93/sh/fault.c    2013-03-11 22:12:27.000000000 +0100
+++ src/cmd/ksh93/sh/fault.c    2013-03-26 11:03:57.952498746 +0100
@@ -72,6 +72,13 @@
        return(action);
 }
 
+typedef struct _siginfo_chain_t siginfo_chain_t;
+struct _siginfo_chain_t
+{
+       siginfo_chain_t *next;
+       siginfo_t       si;
+};
+
 /*
  * Most signals caught or ignored by the shell come here
 */
@@ -121,12 +128,24 @@
        {
                if(trap && *trap)
                {
+                       siginfo_chain_t *si;
+
                        shp->trapnote |= SH_SIGTRAP;
                        shp->sigflag[sig] |= SH_SIGTRAP;
                        if(!shp->siginfo)
                                shp->siginfo = 
(void**)calloc(sizeof(void*),shp->gd->sigmax);
-                       shp->siginfo[sig] = malloc(sizeof(siginfo_t));
-                       memcpy(shp->siginfo[sig],info,sizeof(siginfo_t));
+                       si = calloc(sizeof(siginfo_chain_t), 1);
+                       si->next=NULL;
+                       memcpy(&si->si,info,sizeof(siginfo_t));
+                       if (shp->siginfo[sig])
+                       {
+                               siginfo_chain_t *chain = (siginfo_chain_t 
*)shp->siginfo[sig];
+                               while (chain->next != NULL)
+                                       chain=chain->next;
+                               chain->next = si;
+                       }
+                       else
+                               shp->siginfo[sig] = si;
                }
                return;
        }
@@ -203,14 +222,26 @@
                shp->lastsig = sig;
        if(trap)
        {
+               siginfo_chain_t *si;
+
                /*
                 * propogate signal to foreground group
                 */
 #ifdef _lib_sigaction
                if(!shp->siginfo)
                        shp->siginfo = 
(void**)calloc(sizeof(void*),shp->gd->sigmax);
-               shp->siginfo[sig] = malloc(sizeof(siginfo_t));
-               memcpy(shp->siginfo[sig],info,sizeof(siginfo_t));
+               si = calloc(sizeof(siginfo_chain_t), 1);
+               si->next=NULL;
+               memcpy(&si->si,info,sizeof(siginfo_t));
+               if (shp->siginfo[sig])
+               {
+                       siginfo_chain_t *chain = (siginfo_chain_t 
*)shp->siginfo[sig];
+                       while (chain->next != NULL)
+                               chain=chain->next;
+                       chain->next = si;
+               }
+               else
+                       shp->siginfo[sig] = si;
 #endif
                if(sig==SIGHUP && job.curpgid)
                        killpg(job.curpgid,SIGHUP);
@@ -460,17 +491,41 @@
                        shp->sigflag[sig] &= ~SH_SIGTRAP;
                        if(trap=shp->st.trapcom[sig])
                        {
-#ifdef _lib_sigaction
                                if(shp->siginfo && shp->siginfo[sig])
-                                       
sh_setsiginfo((siginfo_t*)shp->siginfo[sig]);
-#endif
-                               cursig = sig;
-                               sh_trap(shp,trap,0);
-#ifdef _lib_sigaction
-                               if(shp->siginfo[sig])
-                                       free(shp->siginfo[sig]);
-                               shp->siginfo[sig] = 0;
+                               {
+                                       siginfo_chain_t *si, *chain;
+
+                                       si = 
(siginfo_chain_t*)shp->siginfo[sig];
+                                       shp->siginfo[sig]=NULL;
+
+#if 1
+                                       chain = si;
+                                       long chained=0L;
+                                       while (chain->next != NULL)
+                                       {
+                                               chain=chain->next;
+                                               chained++;
+                                       }
+                                       sfprintf(sfstderr, "## chained sig=%d, 
num=%ld\n", (int)sig, chained);
 #endif
+
+                                       while (si != NULL)
+                                       {
+                                               sh_setsiginfo(&si->si);
+
+                                               cursig = sig;
+                                               sh_trap(shp,trap,0);
+                                               
+                                               chain=si->next;
+                                               free(si);
+                                               si=chain;
+                                       }
+                               }
+                               else
+                               {
+                                       cursig = sig;
+                                       sh_trap(shp,trap,0);
+                               }
                                cursig = -1;
                        }
                }
_______________________________________________
ast-developers mailing list
[email protected]
http://lists.research.att.com/mailman/listinfo/ast-developers

Reply via email to