Hmmm... it was working for me, but I'll recheck. Thanks!

On Jul 26, 2012, at 8:04 AM, George Bosilca wrote:
> r26868 seems to have some issues. It works well as long as all processes are
> started on the same node (aka. there is a single daemon), but it breaks with
> the error message attached below if there are more than two daemons.
>
> $ mpirun -np 2 --bynode ./runme
> [node01:07767] [[21341,0],1] ORTE_ERROR_LOG: A message is attempting to be
> sent to a process whose contact information is unknown in file
> ../../../../../ompi/orte/mca/rml/oob/rml_oob_send.c at line 362
> [node01:07767] [[21341,0],1] attempted to send to [[21341,0],2]: tag 15
> [node01:07767] [[21341,0],1] ORTE_ERROR_LOG: A message is attempting to be
> sent to a process whose contact information is unknown in file
> ../../../../ompi/orte/mca/grpcomm/base/grpcomm_base_xcast.c at line 157
>
> I confirm that reverting the commit brings the trunk back to a normal state.
>
> Please - a tad more care in what gets committed??
>
>   george.
>
>
> On Jul 25, 2012, at 23:46 , svn-commit-mai...@open-mpi.org wrote:
>
>> Author: rhc (Ralph Castain)
>> Date: 2012-07-25 17:46:45 EDT (Wed, 25 Jul 2012)
>> New Revision: 26868
>> URL: https://svn.open-mpi.org/trac/ompi/changeset/26868
>>
>> Log:
>> Reconnect the rsh/ssh error reporting code for remote spawns to report
>> failure to launch. Ensure the HNP correctly reports non-zero exit status
>> when ssh encounters a problem.
>>
>> Thanks to Terry for spotting it!
>>
>> Text files modified:
>>    trunk/orte/mca/plm/base/plm_base_launch_support.c |  44 ++++++++++++++++++++++++++++++++++++++++
>>    trunk/orte/mca/plm/base/plm_base_receive.c        |   6 +++++
>>    trunk/orte/mca/plm/base/plm_private.h             |   4 +++
>>    trunk/orte/mca/plm/rsh/plm_rsh_module.c           |  18 +++++++---------
>>    4 files changed, 62 insertions(+), 10 deletions(-)
>>
>> Modified: trunk/orte/mca/plm/base/plm_base_launch_support.c
>> ==============================================================================
>> --- trunk/orte/mca/plm/base/plm_base_launch_support.c	Wed Jul 25 12:32:51 2012	(r26867)
>> +++ trunk/orte/mca/plm/base/plm_base_launch_support.c	2012-07-25 17:46:45 EDT (Wed, 25 Jul 2012)	(r26868)
>> @@ -741,6 +741,50 @@
>>  
>>  }
>>  
>> +void orte_plm_base_daemon_failed(int st, orte_process_name_t* sender,
>> +                                 opal_buffer_t *buffer,
>> +                                 orte_rml_tag_t tag, void *cbdata)
>> +{
>> +    int status, rc;
>> +    int32_t n;
>> +    orte_vpid_t vpid;
>> +    orte_proc_t *daemon;
>> +
>> +    /* get the daemon job, if necessary */
>> +    if (NULL == jdatorted) {
>> +        jdatorted = orte_get_job_data_object(ORTE_PROC_MY_NAME->jobid);
>> +    }
>> +
>> +    /* unpack the daemon that failed */
>> +    n=1;
>> +    if (OPAL_SUCCESS != (rc = opal_dss.unpack(buffer, &vpid, &n, ORTE_VPID))) {
>> +        ORTE_ERROR_LOG(rc);
>> +        ORTE_UPDATE_EXIT_STATUS(ORTE_ERROR_DEFAULT_EXIT_CODE);
>> +        goto finish;
>> +    }
>> +
>> +    /* unpack the exit status */
>> +    n=1;
>> +    if (OPAL_SUCCESS != (rc = opal_dss.unpack(buffer, &status, &n, OPAL_INT))) {
>> +        ORTE_ERROR_LOG(rc);
>> +        status = ORTE_ERROR_DEFAULT_EXIT_CODE;
>> +        ORTE_UPDATE_EXIT_STATUS(ORTE_ERROR_DEFAULT_EXIT_CODE);
>> +    } else {
>> +        ORTE_UPDATE_EXIT_STATUS(WEXITSTATUS(status));
>> +    }
>> +
>> +    /* find the daemon and update its state/status */
>> +    if (NULL == (daemon = (orte_proc_t*)opal_pointer_array_get_item(jdatorted->procs, vpid))) {
>> +        ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND);
>> +        goto finish;
>> +    }
>> +    daemon->state = ORTE_PROC_STATE_FAILED_TO_START;
>> +    daemon->exit_code = status;
>> +
>> + finish:
>> +    ORTE_ACTIVATE_PROC_STATE(&daemon->name, ORTE_PROC_STATE_FAILED_TO_START);
>> +}
>> +
>>  int orte_plm_base_setup_orted_cmd(int *argc, char ***argv)
>>  {
>>      int i, loc;
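One detail worth checking in the function quoted above: daemon is declared
without an initializer, both error paths goto finish before it is ever
assigned, and the code at the finish: label then dereferences daemon->name.
Below is a minimal standalone sketch of the same control flow with the
pointer initialized and guarded. The proc types and the unpack_int() helper
are hypothetical stand-ins for illustration only, not the ORTE API.

    #include <stdio.h>
    #include <stdlib.h>

    typedef struct { int vpid; } proc_name_t;                      /* hypothetical */
    typedef struct { proc_name_t name; int state; int exit_code; } proc_t;

    /* hypothetical stand-in for opal_dss.unpack(): 0 on success */
    static int unpack_int(const int *src, int n, int *dst)
    {
        if (src == NULL || n < 1) return -1;    /* simulate a bad buffer */
        *dst = *src;
        return 0;
    }

    static void daemon_failed(const int *buffer, proc_t *procs, int nprocs)
    {
        int vpid, status;
        proc_t *daemon = NULL;       /* initialized: "finish" may run first */

        if (0 != unpack_int(buffer, 1, &vpid)) {
            goto finish;             /* daemon is still NULL on this path */
        }
        if (0 != unpack_int(buffer + 1, 1, &status)) {
            status = 1;              /* fall back to a default exit code */
        }
        if (vpid < 0 || vpid >= nprocs) {
            goto finish;             /* unknown daemon: also still NULL */
        }
        daemon = &procs[vpid];
        daemon->state = -1;          /* stands in for FAILED_TO_START */
        daemon->exit_code = status;

     finish:
        if (NULL != daemon) {        /* guard before touching daemon->name */
            printf("daemon %d failed to start\n", daemon->name.vpid);
        } else {
            printf("failure report could not be decoded\n");
        }
    }

    int main(void)
    {
        proc_t procs[2] = { { {0}, 0, 0 }, { {1}, 0, 0 } };
        int good[2] = { 1, 42 };
        daemon_failed(good, procs, 2);   /* normal path */
        daemon_failed(NULL, procs, 2);   /* failed unpack: hits the guard */
        return 0;
    }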
>> Modified: trunk/orte/mca/plm/base/plm_base_receive.c
>> ==============================================================================
>> --- trunk/orte/mca/plm/base/plm_base_receive.c	Wed Jul 25 12:32:51 2012	(r26867)
>> +++ trunk/orte/mca/plm/base/plm_base_receive.c	2012-07-25 17:46:45 EDT (Wed, 25 Jul 2012)	(r26868)
>> @@ -87,6 +87,12 @@
>>                                                        orte_plm_base_daemon_callback, NULL))) {
>>              ORTE_ERROR_LOG(rc);
>>          }
>> +        if (ORTE_SUCCESS != (rc = orte_rml.recv_buffer_nb(ORTE_NAME_WILDCARD,
>> +                                                          ORTE_RML_TAG_REPORT_REMOTE_LAUNCH,
>> +                                                          ORTE_RML_PERSISTENT,
>> +                                                          orte_plm_base_daemon_failed, NULL))) {
>> +            ORTE_ERROR_LOG(rc);
>> +        }
>>      }
>>      recv_issued = true;
>>
>> Modified: trunk/orte/mca/plm/base/plm_private.h
>> ==============================================================================
>> --- trunk/orte/mca/plm/base/plm_private.h	Wed Jul 25 12:32:51 2012	(r26867)
>> +++ trunk/orte/mca/plm/base/plm_private.h	2012-07-25 17:46:45 EDT (Wed, 25 Jul 2012)	(r26868)
>> @@ -78,6 +78,10 @@
>>  ORTE_DECLSPEC void orte_plm_base_daemon_callback(int status, orte_process_name_t* sender,
>>                                                   opal_buffer_t *buffer,
>>                                                   orte_rml_tag_t tag, void *cbdata);
>> +ORTE_DECLSPEC void orte_plm_base_daemon_failed(int status, orte_process_name_t* sender,
>> +                                               opal_buffer_t *buffer,
>> +                                               orte_rml_tag_t tag, void *cbdata);
>> +
>>  ORTE_DECLSPEC int orte_plm_base_create_jobid(orte_job_t *jdata);
>>  ORTE_DECLSPEC int orte_plm_base_set_hnp_name(void);
>>  ORTE_DECLSPEC void orte_plm_base_reset_job(orte_job_t *jdata);
>>
>> Modified: trunk/orte/mca/plm/rsh/plm_rsh_module.c
>> ==============================================================================
>> --- trunk/orte/mca/plm/rsh/plm_rsh_module.c	Wed Jul 25 12:32:51 2012	(r26867)
>> +++ trunk/orte/mca/plm/rsh/plm_rsh_module.c	2012-07-25 17:46:45 EDT (Wed, 25 Jul 2012)	(r26868)
>> @@ -258,8 +258,6 @@
>>   */
>>  static void rsh_wait_daemon(pid_t pid, int status, void* cbdata)
>>  {
>> -    orte_std_cntr_t cnt=1;
>> -    uint8_t flag;
>>      orte_job_t *jdata;
>>      orte_plm_rsh_caddy_t *caddy=(orte_plm_rsh_caddy_t*)cbdata;
>>      orte_proc_t *daemon=caddy->daemon;
>> @@ -283,10 +281,8 @@
>>                            ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
>>                            (int)daemon->name.vpid, WEXITSTATUS(status)));
>>          buf = OBJ_NEW(opal_buffer_t);
>> -        opal_dss.pack(buf, &cnt, 1, ORTE_STD_CNTR);
>> -        flag = 1;
>> -        opal_dss.pack(buf, &flag, 1, OPAL_UINT8);
>>          opal_dss.pack(buf, &(daemon->name.vpid), 1, ORTE_VPID);
>> +        opal_dss.pack(buf, &status, 1, OPAL_INT);
>>          orte_rml.send_buffer_nb(ORTE_PROC_MY_HNP, buf,
>>                                  ORTE_RML_TAG_REPORT_REMOTE_LAUNCH, 0,
>>                                  orte_rml_send_callback, NULL);
>> @@ -297,6 +293,8 @@
>>                            "%s daemon %d failed with status %d",
>>                            ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
>>                            (int)daemon->name.vpid, WEXITSTATUS(status)));
>> +        /* set the exit status */
>> +        ORTE_UPDATE_EXIT_STATUS(WEXITSTATUS(status));
>>          /* note that this daemon failed */
>>          daemon->state = ORTE_PROC_STATE_FAILED_TO_START;
>>          /* increment the #daemons terminated so we will exit properly */
>> @@ -735,7 +733,7 @@
>>      char **argv = NULL;
>>      char *prefix, *hostname, *var;
>>      int argc;
>> -    int rc;
>> +    int rc=ORTE_SUCCESS;
>>      bool failed_launch = true;
>>      orte_std_cntr_t n;
>>      opal_byte_object_t *bo;
>> @@ -748,6 +746,9 @@
>>                           "%s plm:rsh: remote spawn called",
>>                           ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
>>  
>> +    /* if we hit any errors, tell the HNP it was us */
>> +    target.vpid = ORTE_PROC_MY_NAME->vpid;
>> +
>>      /* extract the prefix from the launch buffer */
>>      n = 1;
>>      if (ORTE_SUCCESS != (rc = opal_dss.unpack(launch, &prefix, &n, OPAL_STRING))) {
>> @@ -867,12 +868,9 @@
>>      if (failed_launch) {
>>          /* report cannot launch this daemon to HNP */
>>          opal_buffer_t *buf;
>> -        orte_std_cntr_t cnt=1;
>> -        uint8_t flag=1;
>>          buf = OBJ_NEW(opal_buffer_t);
>> -        opal_dss.pack(buf, &cnt, 1, ORTE_STD_CNTR);
>> -        opal_dss.pack(buf, &flag, 1, OPAL_UINT8);
>>          opal_dss.pack(buf, &target.vpid, 1, ORTE_VPID);
>> +        opal_dss.pack(buf, &rc, 1, OPAL_INT);
>>          orte_rml.send_buffer_nb(ORTE_PROC_MY_HNP, buf,
>>                                  ORTE_RML_TAG_REPORT_REMOTE_LAUNCH, 0,
>>                                  orte_rml_send_callback, NULL);
>>
>> _______________________________________________
>> svn mailing list
>> s...@open-mpi.org
>> http://www.open-mpi.org/mailman/listinfo.cgi/svn
>
>
> _______________________________________________
> devel mailing list
> de...@open-mpi.org
> http://www.open-mpi.org/mailman/listinfo.cgi/devel
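For readers reconstructing the protocol change in the diff above: r26868
replaces the old (cnt, flag, vpid) framing of the REPORT_REMOTE_LAUNCH
message with (vpid, status) on the rsh sender side, and adds an HNP-side
callback that unpacks in exactly that order, which is why the sender and
receiver changes have to land in the same commit. Below is a minimal
standalone sketch of that framing rule; the tagged buffer and the
pack_i32/unpack_i32 helpers are hypothetical simplifications (opal_dss does
the real typed packing in ORTE), used only to show that both ends must agree
field-for-field on order and type.

    #include <stdio.h>
    #include <string.h>

    enum { T_VPID = 1, T_INT = 2 };          /* hypothetical type tags */

    typedef struct { unsigned char data[64]; size_t len, pos; } buf_t;

    /* append one tagged 32-bit field to the buffer */
    static void pack_i32(buf_t *b, int tag, int v)
    {
        b->data[b->len++] = (unsigned char)tag;
        memcpy(&b->data[b->len], &v, sizeof v);
        b->len += sizeof v;
    }

    /* returns 0 on success, -1 if the next field's type tag disagrees */
    static int unpack_i32(buf_t *b, int tag, int *v)
    {
        if (b->pos >= b->len || b->data[b->pos] != (unsigned char)tag)
            return -1;
        b->pos++;
        memcpy(v, &b->data[b->pos], sizeof *v);
        b->pos += sizeof *v;
        return 0;
    }

    int main(void)
    {
        buf_t b = { {0}, 0, 0 };
        int vpid, status;

        /* sender side, as in rsh_wait_daemon() after r26868:
         * pack the failed daemon's vpid, then its exit status */
        pack_i32(&b, T_VPID, 3);
        pack_i32(&b, T_INT, 255);

        /* receiver side, as in orte_plm_base_daemon_failed():
         * unpack in the same order with the same types */
        if (0 == unpack_i32(&b, T_VPID, &vpid) &&
            0 == unpack_i32(&b, T_INT, &status))
            printf("daemon %d exited with status %d\n", vpid, status);

        /* a receiver still expecting the old leading fields fails on
         * the first unpack instead of silently misreading the data */
        b.pos = 0;
        if (0 != unpack_i32(&b, T_INT, &status))
            printf("old-format receiver: unpack rejected the message\n");
        return 0;
    }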