r26868 seems to have some issues. It works well as long as all processes are started on the same node (i.e., there is a single daemon), but it breaks with the error message shown below if there are more than two daemons.
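For context, ./runme here is nothing special; the failure happens during the daemon wire-up, so any trivial MPI program appears to be enough to reproduce it. A minimal sketch of the kind of code involved (hypothetical stand-in, not the actual test program):

/* hello.c -- hypothetical stand-in for ./runme; any trivial MPI program will do */
#include <stdio.h>
#include <mpi.h>

int main(int argc, char *argv[])
{
    int rank, size;

    MPI_Init(&argc, &argv);                 /* start the MPI runtime */
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);   /* this process' rank in the job */
    MPI_Comm_size(MPI_COMM_WORLD, &size);   /* total number of ranks */
    printf("rank %d of %d\n", rank, size);
    MPI_Finalize();
    return 0;
}

With r26868 applied, launching it across nodes fails as follows: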
$ mpirun -np 2 --bynode ./runme
[node01:07767] [[21341,0],1] ORTE_ERROR_LOG: A message is attempting to be sent to a process whose contact information is unknown in file ../../../../../ompi/orte/mca/rml/oob/rml_oob_send.c at line 362
[node01:07767] [[21341,0],1] attempted to send to [[21341,0],2]: tag 15
[node01:07767] [[21341,0],1] ORTE_ERROR_LOG: A message is attempting to be sent to a process whose contact information is unknown in file ../../../../ompi/orte/mca/grpcomm/base/grpcomm_base_xcast.c at line 157

I confirm that applying the reverted commit brings the trunk to a normal state.

Please - a tad more care in what gets committed??

  george.

On Jul 25, 2012, at 23:46 , svn-commit-mai...@open-mpi.org wrote:

> Author: rhc (Ralph Castain)
> Date: 2012-07-25 17:46:45 EDT (Wed, 25 Jul 2012)
> New Revision: 26868
> URL: https://svn.open-mpi.org/trac/ompi/changeset/26868
>
> Log:
> Reconnect the rsh/ssh error reporting code for remote spawns to report
> failure to launch. Ensure the HNP correctly reports non-zero exit status
> when ssh encounters a problem.
>
> Thanks to Terry for spotting it!
>
> Text files modified:
>    trunk/orte/mca/plm/base/plm_base_launch_support.c |   44 ++++++++++++++++++++++++++++++++++++++++
>    trunk/orte/mca/plm/base/plm_base_receive.c        |    6 +++++
>    trunk/orte/mca/plm/base/plm_private.h             |    4 +++
>    trunk/orte/mca/plm/rsh/plm_rsh_module.c           |   18 +++++++---------
>    4 files changed, 62 insertions(+), 10 deletions(-)
>
> Modified: trunk/orte/mca/plm/base/plm_base_launch_support.c
> ==============================================================================
> --- trunk/orte/mca/plm/base/plm_base_launch_support.c   Wed Jul 25 12:32:51 2012   (r26867)
> +++ trunk/orte/mca/plm/base/plm_base_launch_support.c   2012-07-25 17:46:45 EDT (Wed, 25 Jul 2012)   (r26868)
> @@ -741,6 +741,50 @@
>  }
>
> +void orte_plm_base_daemon_failed(int st, orte_process_name_t* sender,
> +                                 opal_buffer_t *buffer,
> +                                 orte_rml_tag_t tag, void *cbdata)
> +{
> +    int status, rc;
> +    int32_t n;
> +    orte_vpid_t vpid;
> +    orte_proc_t *daemon;
> +
> +    /* get the daemon job, if necessary */
> +    if (NULL == jdatorted) {
> +        jdatorted = orte_get_job_data_object(ORTE_PROC_MY_NAME->jobid);
> +    }
> +
> +    /* unpack the daemon that failed */
> +    n=1;
> +    if (OPAL_SUCCESS != (rc = opal_dss.unpack(buffer, &vpid, &n, ORTE_VPID))) {
> +        ORTE_ERROR_LOG(rc);
> +        ORTE_UPDATE_EXIT_STATUS(ORTE_ERROR_DEFAULT_EXIT_CODE);
> +        goto finish;
> +    }
> +
> +    /* unpack the exit status */
> +    n=1;
> +    if (OPAL_SUCCESS != (rc = opal_dss.unpack(buffer, &status, &n, OPAL_INT))) {
> +        ORTE_ERROR_LOG(rc);
> +        status = ORTE_ERROR_DEFAULT_EXIT_CODE;
> +        ORTE_UPDATE_EXIT_STATUS(ORTE_ERROR_DEFAULT_EXIT_CODE);
> +    } else {
> +        ORTE_UPDATE_EXIT_STATUS(WEXITSTATUS(status));
> +    }
> +
> +    /* find the daemon and update its state/status */
> +    if (NULL == (daemon = (orte_proc_t*)opal_pointer_array_get_item(jdatorted->procs, vpid))) {
> +        ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND);
> +        goto finish;
> +    }
> +    daemon->state = ORTE_PROC_STATE_FAILED_TO_START;
> +    daemon->exit_code = status;
> +
> + finish:
> +    ORTE_ACTIVATE_PROC_STATE(&daemon->name, ORTE_PROC_STATE_FAILED_TO_START);
> +}
> +
>  int orte_plm_base_setup_orted_cmd(int *argc, char ***argv)
>  {
>      int i, loc;
>
> Modified: trunk/orte/mca/plm/base/plm_base_receive.c
> ==============================================================================
> --- trunk/orte/mca/plm/base/plm_base_receive.c   Wed Jul 25 12:32:51 2012   (r26867)
> +++ trunk/orte/mca/plm/base/plm_base_receive.c   2012-07-25 17:46:45 EDT (Wed, 25 Jul 2012)   (r26868)
> @@ -87,6 +87,12 @@
>                                                            orte_plm_base_daemon_callback, NULL))) {
>              ORTE_ERROR_LOG(rc);
>          }
> +        if (ORTE_SUCCESS != (rc = orte_rml.recv_buffer_nb(ORTE_NAME_WILDCARD,
> +                                                          ORTE_RML_TAG_REPORT_REMOTE_LAUNCH,
> +                                                          ORTE_RML_PERSISTENT,
> +                                                          orte_plm_base_daemon_failed, NULL))) {
> +            ORTE_ERROR_LOG(rc);
> +        }
>      }
>      recv_issued = true;
>
> Modified: trunk/orte/mca/plm/base/plm_private.h
> ==============================================================================
> --- trunk/orte/mca/plm/base/plm_private.h   Wed Jul 25 12:32:51 2012   (r26867)
> +++ trunk/orte/mca/plm/base/plm_private.h   2012-07-25 17:46:45 EDT (Wed, 25 Jul 2012)   (r26868)
> @@ -78,6 +78,10 @@
>  ORTE_DECLSPEC void orte_plm_base_daemon_callback(int status, orte_process_name_t* sender,
>                                                   opal_buffer_t *buffer,
>                                                   orte_rml_tag_t tag, void *cbdata);
> +ORTE_DECLSPEC void orte_plm_base_daemon_failed(int status, orte_process_name_t* sender,
> +                                               opal_buffer_t *buffer,
> +                                               orte_rml_tag_t tag, void *cbdata);
> +
>  ORTE_DECLSPEC int orte_plm_base_create_jobid(orte_job_t *jdata);
>  ORTE_DECLSPEC int orte_plm_base_set_hnp_name(void);
>  ORTE_DECLSPEC void orte_plm_base_reset_job(orte_job_t *jdata);
>
> Modified: trunk/orte/mca/plm/rsh/plm_rsh_module.c
> ==============================================================================
> --- trunk/orte/mca/plm/rsh/plm_rsh_module.c   Wed Jul 25 12:32:51 2012   (r26867)
> +++ trunk/orte/mca/plm/rsh/plm_rsh_module.c   2012-07-25 17:46:45 EDT (Wed, 25 Jul 2012)   (r26868)
> @@ -258,8 +258,6 @@
>   */
>  static void rsh_wait_daemon(pid_t pid, int status, void* cbdata)
>  {
> -    orte_std_cntr_t cnt=1;
> -    uint8_t flag;
>      orte_job_t *jdata;
>      orte_plm_rsh_caddy_t *caddy=(orte_plm_rsh_caddy_t*)cbdata;
>      orte_proc_t *daemon=caddy->daemon;
> @@ -283,10 +281,8 @@
>                               ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
>                               (int)daemon->name.vpid, WEXITSTATUS(status)));
>          buf = OBJ_NEW(opal_buffer_t);
> -        opal_dss.pack(buf, &cnt, 1, ORTE_STD_CNTR);
> -        flag = 1;
> -        opal_dss.pack(buf, &flag, 1, OPAL_UINT8);
>          opal_dss.pack(buf, &(daemon->name.vpid), 1, ORTE_VPID);
> +        opal_dss.pack(buf, &status, 1, OPAL_INT);
>          orte_rml.send_buffer_nb(ORTE_PROC_MY_HNP, buf,
>                                  ORTE_RML_TAG_REPORT_REMOTE_LAUNCH, 0,
>                                  orte_rml_send_callback, NULL);
> @@ -297,6 +293,8 @@
>                               "%s daemon %d failed with status %d",
>                               ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
>                               (int)daemon->name.vpid, WEXITSTATUS(status)));
> +        /* set the exit status */
> +        ORTE_UPDATE_EXIT_STATUS(WEXITSTATUS(status));
>          /* note that this daemon failed */
>          daemon->state = ORTE_PROC_STATE_FAILED_TO_START;
>          /* increment the #daemons terminated so we will exit properly */
> @@ -735,7 +733,7 @@
>      char **argv = NULL;
>      char *prefix, *hostname, *var;
>      int argc;
> -    int rc;
> +    int rc=ORTE_SUCCESS;
>      bool failed_launch = true;
>      orte_std_cntr_t n;
>      opal_byte_object_t *bo;
> @@ -748,6 +746,9 @@
>                           "%s plm:rsh: remote spawn called",
>                           ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
>
> +    /* if we hit any errors, tell the HNP it was us */
> +    target.vpid = ORTE_PROC_MY_NAME->vpid;
> +
>      /* extract the prefix from the launch buffer */
>      n = 1;
>      if (ORTE_SUCCESS != (rc = opal_dss.unpack(launch, &prefix, &n, OPAL_STRING))) {
> @@ -867,12 +868,9 @@
>      if (failed_launch) {
>          /* report cannot launch this daemon to HNP */
>          opal_buffer_t *buf;
> -        orte_std_cntr_t cnt=1;
> -        uint8_t flag=1;
>          buf = OBJ_NEW(opal_buffer_t);
> -        opal_dss.pack(buf, &cnt, 1, ORTE_STD_CNTR);
> -        opal_dss.pack(buf, &flag, 1, OPAL_UINT8);
>          opal_dss.pack(buf, &target.vpid, 1, ORTE_VPID);
> +        opal_dss.pack(buf, &rc, 1, OPAL_INT);
>          orte_rml.send_buffer_nb(ORTE_PROC_MY_HNP, buf,
>                                  ORTE_RML_TAG_REPORT_REMOTE_LAUNCH, 0,
>                                  orte_rml_send_callback, NULL);
>
> _______________________________________________
> svn mailing list
> s...@open-mpi.org
> http://www.open-mpi.org/mailman/listinfo.cgi/svn