Craig A. Berry writes:
> At 10:04 AM 4/22/2002 -0400, Charles Lane wrote:
> >The ones that I've seen are hanging in the "my_waitpid" code. Here's
> >what seems to be going on:
> >
>> --> pipe to/from subprocess, it does its thing and exits
>> --> pipe code picks up exit via termination ast, deletes pipe structs
>> --> my_waitpid called from Perl:
>> doesn't find match to pipe (it was deleted)...
>> does a getjpi and finds a termination mbx
>> tries to read termination mbx, hangs forever
>
....
> >
> >So if we are successful in opening a channel to the termination mbx and
> >grabbing the termination message, we'll mess up whatever code was
> >waiting for that message.
> Really? What prevents two readers from reading the same thing?
Well, whoever reads the message first causes the message to be removed
from the mailbox. That's just the way "plain vanilla" mailbox i/o works.
I have a vague recollection that there may be some sort of special QIO
call that will peek at the mailbox contents without removing the
message, BUT:
  1. If the intended recipient grabs the message first, you can just bet
     that it *won't* be doing a "peek at the message" QIO. So sometimes
     we get the message, sometimes we don't. Ugh.
  2. Any "peek at the message" option is not available for older
     versions of VMS. It doesn't seem to be on my V6.2 system, for
     example.
(one could also do a "read message, write it back" to emulate a "peek";
I have no idea how well it would play with lib$spawn/DCL, but it would have
the same timing race problems as a true "peek" function)
> What about putting a timeout on the $qiow that is reading from the
> termination mailbox? If it completes with a timeout, we can requeue it, but
> in the meanwhile whatever else was pending should have a chance to fire.
We seem to have triggered a real race condition, because of the timing
of termination messages and process shutdown/device deallocation. I don't
think a timeout/retry will get around it, but a "timeout/I give up" would
at least break us out of the infinite loop.
Unless there's some other mechanism out there that I'm missing, I think
we're just going to have to implement waitpid with polling (and forget
about getting termination status) for non-child processes.
> We really need to get the pipe torture tests or some subset of them into the
> test suite so we catch these problems when they first arise.
Agreed. What stalled me was sorting out the logic of the "okay results",
particularly when you have two groups of messages intermixed (each in a
fixed order, but interspersed randomly).
Here's a patch that implements my suggested changes to waitpid; give it
a shot. It seems to get us through the pipe torture tests.
diff -uBb vms/vms.c-orig vms/vms.c
--- vms/vms.c-orig Mon Apr 22 13:33:55 2002
+++ vms/vms.c Mon Apr 22 13:55:00 2002
@@ -1341,6 +1341,18 @@
unsigned long int exit_status;
};
+typedef struct _closed_pipes Xpipe;
+typedef struct _closed_pipes* pXpipe;
+
+struct _closed_pipes {
+ int pid; /* PID of subprocess */
+ unsigned long completion; /* termination status of subprocess */
+};
+#define NKEEPCLOSED 50
+static Xpipe closed_list[NKEEPCLOSED];
+static int closed_index = 0;
+static int closed_num = 0;
+
#define RETRY_DELAY "0 ::0.20"
#define MAX_RETRY 50
@@ -1476,6 +1488,15 @@
{
pInfo i = open_pipes;
int iss;
+ pXpipe x;
+
+ info->completion &= 0x0FFFFFFF; /* strip off "control" field */
+ closed_list[closed_index].pid = info->pid;
+ closed_list[closed_index].completion = info->completion;
+ closed_index++;
+ if (closed_index == NKEEPCLOSED)
+ closed_index = 0;
+ closed_num++;
while (i) {
if (i == info) break;
@@ -1483,7 +1504,6 @@
}
if (!i) return; /* unlinked, probably freed too */
- info->completion &= 0x0FFFFFFF; /* strip off "control" field */
info->done = TRUE;
/*
@@ -2639,6 +2659,7 @@
pInfo info;
int done;
int sts;
+ int j;
if (statusp) *statusp = 0;
@@ -2656,9 +2677,18 @@
if (statusp) *statusp = info->completion;
return pid;
+ }
+
+ /* child that already terminated */
+ for (j = 0; j < NKEEPCLOSED && j < closed_num; j++) {
+ if (closed_list[j].pid == pid) {
+ if (statusp) *statusp = closed_list[j].completion;
+ return pid;
}
- else { /* this child is not one of our own pipe children */
+ }
+
+ { /* this child is not one of our own pipe children */
#if defined(__CRTL_VER) && __CRTL_VER >= 70100322
@@ -2685,18 +2715,11 @@
unsigned long int ownercode = JPI$_OWNER, ownerpid;
unsigned long int pidcode = JPI$_PID, mypid;
unsigned long int interval[2];
- int termination_mbu = 0;
- unsigned short qio_iosb[4];
unsigned int jpi_iosb[2];
- struct itmlst_3 jpilist[3] = {
+ struct itmlst_3 jpilist[2] = {
{sizeof(ownerpid), JPI$_OWNER, &ownerpid, 0},
- {sizeof(termination_mbu), JPI$_TMBU, &termination_mbu, 0},
{ 0, 0, 0, 0}
};
- char trmmbx[NAM$C_DVI+1];
- $DESCRIPTOR(trmmbxdsc,trmmbx);
- struct accdef trmmsg;
- unsigned short int mbxchan;
if (pid <= 0) {
/* Sorry folks, we don't presently implement rooting around for
@@ -2737,52 +2760,16 @@
pid,mypid);
}
- /* It's possible to have a mailbox unit number but no actual mailbox; we
- * check for this by assigning a channel to it, which we need anyway.
- */
- if (termination_mbu != 0) {
- sprintf(trmmbx, "MBA%d:", termination_mbu);
- trmmbxdsc.dsc$w_length = strlen(trmmbx);
- sts = sys$assign(&trmmbxdsc, &mbxchan, 0, 0);
- if (sts == SS$_NOSUCHDEV) {
- termination_mbu = 0; /* set up to take "no mailbox" case */
- sts = SS$_NORMAL;
- }
- _ckvmssts(sts);
- }
- /* If the process doesn't have a termination mailbox, then simply check
+ /* simply check
* on it once a second until it's not there anymore.
*/
- if (termination_mbu == 0) {
+
_ckvmssts(sys$bintim(&intdsc,interval));
while ((sts=lib$getjpi(&ownercode,&pid,0,&ownerpid,0,0)) & 1) {
_ckvmssts(sys$schdwk(0,0,interval,0));
_ckvmssts(sys$hiber());
}
if (sts == SS$_NONEXPR) sts = SS$_NORMAL;
- }
- else {
- /* If we do have a termination mailbox, post reads to it until we get a
- * termination message, discarding messages of the wrong type or for other
- * processes. If there is a place to put the final status, then do so.
- */
- sts = SS$_NORMAL;
- while (sts & 1) {
- memset((void *) &trmmsg, 0, sizeof(trmmsg));
- sts = sys$qiow(0,mbxchan,IO$_READVBLK,&qio_iosb,0,0,
- &trmmsg,ACC$K_TERMLEN,0,0,0,0);
- if (sts & 1) sts = qio_iosb[0];
-
- if ( sts & 1
- && trmmsg.acc$w_msgtyp == MSG$_DELPROC
- && trmmsg.acc$l_pid == pid ) {
-
- if (statusp) *statusp = trmmsg.acc$l_finalsts;
- sts = sys$dassgn(mbxchan);
- break;
- }
- }
- } /* termination_mbu ? */
_ckvmssts(sts);
return pid;
--
Drexel University \V --Chuck Lane
======]---------->--------*------------<-------[===========
(215) 895-1545 _/ \ Particle Physics
FAX: (215) 895-5934 /\ /~~~~~~~~~~~ [EMAIL PROTECTED]