Hi!
----
While playing around with realtime signals I hacked together a
"simple" testcase which shows several problems in ksh93 (ast-ksh.2013-03-18):
-- snip --
builtin wc
# config
integer -r num_attackers=50
compound -a ar
trap 'ar+=( integer value=${.sh.sig.value} pid=${.sh.sig.pid} )' RTMIN
integer thispid=$$
integer i
for (( i=0 ; i < num_attackers ; i++ )) ; do
kill -q $i -RTMIN $thispid &
done
# wait for all child processes
while ! wait ; do
true
done
# list jobs (this list should be empty after the
# "wait"-loop above)
jobdata=${ jobs 2>&1 ; }
printf '%s\n' "$jobdata"
print -v ar
printf '# number of array elements in ar=%d, expected %d.\n' \
${#ar[@]} num_attackers
printf '# job data, expected 0 lines, got %d.\n' \
$(wc -l <<<"$jobdata")
print '# done.'
-- snip --
Running this gives some weird (and variable) output:
-- snip --
$ ~/bin/ksh sigrtstorm1.sh
[50] + Running <command unknown>
[49] - Running <command unknown>
[48] Running <command unknown>
[47] Lowest priority realtime signal <command unknown>
[46] Running <command unknown>
[45] Running <command unknown>
[44] Running <command unknown>
[43] Running <command unknown>
[42] Running <command unknown>
[41] Lowest priority realtime signal <command unknown>
[40] Running <command unknown>
[39] Running <command unknown>
[38] Running <command unknown>
[37] Running <command unknown>
[36] Running <command unknown>
[35] Lowest priority realtime signal <command unknown>
[34] Running <command unknown>
[33] Running <command unknown>
[32] Lowest priority realtime signal <command unknown>
[31] Running <command unknown>
[30] Running <command unknown>
[29] Lowest priority realtime signal <command unknown>
[28] Running <command unknown>
[27] Running <command unknown>
[26] Lowest priority realtime signal <command unknown>
[25] Running <command unknown>
[24] Lowest priority realtime signal <command unknown>
[23] Running <command unknown>
[22] Running <command unknown>
[21] Running <command unknown>
[20] Running <command unknown>
[19] Lowest priority realtime signal <command unknown>
[18] Running <command unknown>
[17] Running <command unknown>
[16] Lowest priority realtime signal <command unknown>
[15] Running <command unknown>
[14] Running <command unknown>
[13] Running <command unknown>
[12] Running <command unknown>
[11] Lowest priority realtime signal <command unknown>
[10] Running <command unknown>
[9] Lowest priority realtime signal <command unknown>
[8] Running <command unknown>
[7] Running <command unknown>
[6] Running <command unknown>
[5] Running <command unknown>
[4] Running <command unknown>
[3] Lowest priority realtime signal <command unknown>
[2] Running <command unknown>
[1] Running <command unknown>
(
(
typeset -l -i pid=3254
typeset -l -i value=0
)
(
typeset -l -i pid=3254
typeset -l -i value=0
)
(
typeset -l -i pid=3254
typeset -l -i value=0
)
(
typeset -l -i pid=3261
typeset -l -i value=4
)
(
typeset -l -i pid=3261
typeset -l -i value=4
)
(
typeset -l -i pid=3254
typeset -l -i value=0
)
(
typeset -l -i pid=3254
typeset -l -i value=0
)
(
typeset -l -i pid=3254
typeset -l -i value=0
)
(
typeset -l -i pid=3254
typeset -l -i value=0
)
(
typeset -l -i pid=3254
typeset -l -i value=0
)
(
typeset -l -i pid=3254
typeset -l -i value=0
)
(
typeset -l -i pid=3254
typeset -l -i value=0
)
(
typeset -l -i pid=3284
typeset -l -i value=19
)
(
typeset -l -i pid=3254
typeset -l -i value=0
)
(
typeset -l -i pid=3254
typeset -l -i value=0
)
(
typeset -l -i pid=3254
typeset -l -i value=0
)
(
typeset -l -i pid=3254
typeset -l -i value=0
)
(
typeset -l -i pid=3254
typeset -l -i value=0
)
(
typeset -l -i pid=3254
typeset -l -i value=0
)
(
typeset -l -i pid=3254
typeset -l -i value=0
)
(
typeset -l -i pid=3254
typeset -l -i value=0
)
(
typeset -l -i pid=3254
typeset -l -i value=0
)
(
typeset -l -i pid=3316
typeset -l -i value=39
)
(
typeset -l -i pid=3254
typeset -l -i value=0
)
(
typeset -l -i pid=3254
typeset -l -i value=0
)
(
typeset -l -i pid=3323
typeset -l -i value=44
)
(
typeset -l -i pid=3319
typeset -l -i value=41
)
(
typeset -l -i pid=3319
typeset -l -i value=41
)
(
typeset -l -i pid=3322
typeset -l -i value=43
)
(
typeset -l -i pid=3329
typeset -l -i value=49
)
(
typeset -l -i pid=3312
typeset -l -i value=36
)
(
typeset -l -i pid=3327
typeset -l -i value=47
)
)
# number of array elements in ar=32, expected 50.
# job data, expected 0 lines, got 50.
# done.
-- snip --
AFAIK four things are wrong:
1. The shell receives 50 SIGRTMIN signals... but the SIGRTMIN trap is
only called 32 times (the number is variable)
2. It seems even after a loop of $ while ! wait ; do true ; done # the
child processes were not reaped... why does that happen ?
3. The output of $ jobs -l # contains messages like "[47] Lowest
priority realtime signal <command unknown>" ... which at least sounds
wrong...
4. The realtime value (yes, POSIX realtime signals can pass _values_
via signals) is often 0 (see output "value=0") but this value should
occur only once.
Digging around in the code I found that at least part of the problem
is that signals arrive faster than they can be processed by the shell
trap... therefore I hacked up a patch (attached as
"ksh93_sigrt_siginfo_queue001.diff.txt") which implements a simple
queue system which saves the siginfo data in a singly-linked list and
uses that list when the matching shell trap is called (i.e. the shell
trap is called once for each |siginfo_chain_t| entry).
* The good news is: Under valgrind control (where the shell trap for
SIGRTMIN was previously only called 3-5 times for the example code
above) the shell trap is now called exactly 50 times
* The bad news is: Without valgrind the number of trap calls is not
exactly 50... but the number correlates exactly with the number of $
jobs -l #-lines complaining about "[47] Lowest priority realtime
signal <command unknown>" (see [3] above), e.g. if 8 lines of "[47]
Lowest priority realtime signal <command unknown>" occur then array
"ar" has exactly 42 entries...
... erm: David/Phong: Any idea what may go wrong ? What do you think
about the patch ([1]) ?
[1]=Note the patch is not exactly what I wish for... there are two issues:
1. I'd like to have the shell traps called exactly in the order in which
the signals arrive, e.g. instead of having lists per signal number to queue
the siginfo data there should only be one global list (the typical
issue Irek brought up was that if a process sends a RTMIN signal and
then terminates, currently the SIGCLD trap for that child process is
executed before the RTMIN signal is processed)
2. The list management is not fully async-signal-safe, e.g. this code:
-- snip --
+ si =
(siginfo_chain_t*)shp->siginfo[sig];
+ shp->siginfo[sig]=NULL;
-- snip --
... which is used to grab the current list of queued siginfo data for
processing may suffer from a race condition when a signal handler is
called exactly between these instructions (technically async signals can
interrupt at any instruction).
A mutex is not possible (for obvious reasons) ... and the "official"
way to disable signals (which would mean _all_ signals for which shell
traps are registered if we implement a single list for all kinds of
siginfo data) during that time is IMO far too heavyweight... any ideas
what can be used (yes... I saw the discussion about ASO CAS... can
that be used ?) ?
----
Bye,
Roland
--
__ . . __
(o.\ \/ /.o) [email protected]
\__\/\/__/ MPEG specialist, C&&JAVA&&Sun&&Unix programmer
/O /==\ O\ TEL +49 641 3992797
(;O/ \/ \O;)
diff -r -u src/cmd/ksh93/sh/fault.c src/cmd/ksh93/sh/fault.c
--- src/cmd/ksh93/sh/fault.c 2013-03-11 22:12:27.000000000 +0100
+++ src/cmd/ksh93/sh/fault.c 2013-03-26 11:03:57.952498746 +0100
@@ -72,6 +72,13 @@
return(action);
}
+typedef struct _siginfo_chain_t siginfo_chain_t;
+struct _siginfo_chain_t
+{
+ siginfo_chain_t *next;
+ siginfo_t si;
+};
+
/*
* Most signals caught or ignored by the shell come here
*/
@@ -121,12 +128,24 @@
{
if(trap && *trap)
{
+ siginfo_chain_t *si;
+
shp->trapnote |= SH_SIGTRAP;
shp->sigflag[sig] |= SH_SIGTRAP;
if(!shp->siginfo)
shp->siginfo =
(void**)calloc(sizeof(void*),shp->gd->sigmax);
- shp->siginfo[sig] = malloc(sizeof(siginfo_t));
- memcpy(shp->siginfo[sig],info,sizeof(siginfo_t));
+ si = calloc(sizeof(siginfo_chain_t), 1);
+ si->next=NULL;
+ memcpy(&si->si,info,sizeof(siginfo_t));
+ if (shp->siginfo[sig])
+ {
+ siginfo_chain_t *chain = (siginfo_chain_t
*)shp->siginfo[sig];
+ while (chain->next != NULL)
+ chain=chain->next;
+ chain->next = si;
+ }
+ else
+ shp->siginfo[sig] = si;
}
return;
}
@@ -203,14 +222,26 @@
shp->lastsig = sig;
if(trap)
{
+ siginfo_chain_t *si;
+
/*
* propogate signal to foreground group
*/
#ifdef _lib_sigaction
if(!shp->siginfo)
shp->siginfo =
(void**)calloc(sizeof(void*),shp->gd->sigmax);
- shp->siginfo[sig] = malloc(sizeof(siginfo_t));
- memcpy(shp->siginfo[sig],info,sizeof(siginfo_t));
+ si = calloc(sizeof(siginfo_chain_t), 1);
+ si->next=NULL;
+ memcpy(&si->si,info,sizeof(siginfo_t));
+ if (shp->siginfo[sig])
+ {
+ siginfo_chain_t *chain = (siginfo_chain_t
*)shp->siginfo[sig];
+ while (chain->next != NULL)
+ chain=chain->next;
+ chain->next = si;
+ }
+ else
+ shp->siginfo[sig] = si;
#endif
if(sig==SIGHUP && job.curpgid)
killpg(job.curpgid,SIGHUP);
@@ -460,17 +491,41 @@
shp->sigflag[sig] &= ~SH_SIGTRAP;
if(trap=shp->st.trapcom[sig])
{
-#ifdef _lib_sigaction
if(shp->siginfo && shp->siginfo[sig])
-
sh_setsiginfo((siginfo_t*)shp->siginfo[sig]);
-#endif
- cursig = sig;
- sh_trap(shp,trap,0);
-#ifdef _lib_sigaction
- if(shp->siginfo[sig])
- free(shp->siginfo[sig]);
- shp->siginfo[sig] = 0;
+ {
+ siginfo_chain_t *si, *chain;
+
+ si =
(siginfo_chain_t*)shp->siginfo[sig];
+ shp->siginfo[sig]=NULL;
+
+#if 1
+ chain = si;
+ long chained=0L;
+ while (chain->next != NULL)
+ {
+ chain=chain->next;
+ chained++;
+ }
+ sfprintf(sfstderr, "## chained sig=%d,
num=%ld\n", (int)sig, chained);
#endif
+
+ while (si != NULL)
+ {
+ sh_setsiginfo(&si->si);
+
+ cursig = sig;
+ sh_trap(shp,trap,0);
+
+ chain=si->next;
+ free(si);
+ si=chain;
+ }
+ }
+ else
+ {
+ cursig = sig;
+ sh_trap(shp,trap,0);
+ }
cursig = -1;
}
}
_______________________________________________
ast-developers mailing list
[email protected]
http://lists.research.att.com/mailman/listinfo/ast-developers