r26868 seems to have some issues. It works well as long as all processes are 
started on the same node (i.e., there is a single daemon), but it breaks with 
the error message shown below if there are more than two daemons.

$ mpirun -np 2 --bynode ./runme
[node01:07767] [[21341,0],1] ORTE_ERROR_LOG: A message is attempting to be sent 
to a process whose contact information is unknown in file 
../../../../../ompi/orte/mca/rml/oob/rml_oob_send.c at line 362
[node01:07767] [[21341,0],1] attempted to send to [[21341,0],2]: tag 15
[node01:07767] [[21341,0],1] ORTE_ERROR_LOG: A message is attempting to be sent 
to a process whose contact information is unknown in file 
../../../../ompi/orte/mca/grpcomm/base/grpcomm_base_xcast.c at line 157

I confirm that reverting this commit brings the trunk back to a normal state.

Please - a tad more care in what gets committed??

  george.


On Jul 25, 2012, at 23:46 , svn-commit-mai...@open-mpi.org wrote:

> Author: rhc (Ralph Castain)
> Date: 2012-07-25 17:46:45 EDT (Wed, 25 Jul 2012)
> New Revision: 26868
> URL: https://svn.open-mpi.org/trac/ompi/changeset/26868
> 
> Log:
> Reconnect the rsh/ssh error reporting code for remote spawns to report 
> failure to launch. Ensure the HNP correctly reports non-zero exit status when 
> ssh encounters a problem.
> 
> Thanks to Terry for spotting it!
> 
> Text files modified: 
>   trunk/orte/mca/plm/base/plm_base_launch_support.c |    44 
> ++++++++++++++++++++++++++++++++++++++++
>   trunk/orte/mca/plm/base/plm_base_receive.c        |     6 +++++             
>                       
>   trunk/orte/mca/plm/base/plm_private.h             |     4 +++               
>                       
>   trunk/orte/mca/plm/rsh/plm_rsh_module.c           |    18 +++++++---------  
>                       
>   4 files changed, 62 insertions(+), 10 deletions(-)
> 
> Modified: trunk/orte/mca/plm/base/plm_base_launch_support.c
> ==============================================================================
> --- trunk/orte/mca/plm/base/plm_base_launch_support.c Wed Jul 25 12:32:51 
> 2012        (r26867)
> +++ trunk/orte/mca/plm/base/plm_base_launch_support.c 2012-07-25 17:46:45 EDT 
> (Wed, 25 Jul 2012)      (r26868)
> @@ -741,6 +741,50 @@
> 
> }
> 
> +void orte_plm_base_daemon_failed(int st, orte_process_name_t* sender,
> +                                 opal_buffer_t *buffer,
> +                                 orte_rml_tag_t tag, void *cbdata)
> +{
> +    int status, rc;
> +    int32_t n;
> +    orte_vpid_t vpid;
> +    orte_proc_t *daemon;
> +
> +    /* get the daemon job, if necessary */
> +    if (NULL == jdatorted) {
> +        jdatorted = orte_get_job_data_object(ORTE_PROC_MY_NAME->jobid);
> +    }
> +
> +    /* unpack the daemon that failed */
> +    n=1;
> +    if (OPAL_SUCCESS != (rc = opal_dss.unpack(buffer, &vpid, &n, 
> ORTE_VPID))) {
> +        ORTE_ERROR_LOG(rc);
> +        ORTE_UPDATE_EXIT_STATUS(ORTE_ERROR_DEFAULT_EXIT_CODE);
> +        goto finish;
> +    }
> +
> +    /* unpack the exit status */
> +    n=1;
> +    if (OPAL_SUCCESS != (rc = opal_dss.unpack(buffer, &status, &n, 
> OPAL_INT))) {
> +        ORTE_ERROR_LOG(rc);
> +        status = ORTE_ERROR_DEFAULT_EXIT_CODE;
> +        ORTE_UPDATE_EXIT_STATUS(ORTE_ERROR_DEFAULT_EXIT_CODE);
> +    } else {
> +        ORTE_UPDATE_EXIT_STATUS(WEXITSTATUS(status));
> +    }
> +
> +    /* find the daemon and update its state/status */
> +    if (NULL == (daemon = 
> (orte_proc_t*)opal_pointer_array_get_item(jdatorted->procs, vpid))) {
> +        ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND);
> +        goto finish;
> +    }
> +    daemon->state = ORTE_PROC_STATE_FAILED_TO_START;
> +    daemon->exit_code = status;
> +
> + finish:
> +    ORTE_ACTIVATE_PROC_STATE(&daemon->name, ORTE_PROC_STATE_FAILED_TO_START);
> +}
> +
> int orte_plm_base_setup_orted_cmd(int *argc, char ***argv)
> {
>     int i, loc;
> 
> Modified: trunk/orte/mca/plm/base/plm_base_receive.c
> ==============================================================================
> --- trunk/orte/mca/plm/base/plm_base_receive.c        Wed Jul 25 12:32:51 
> 2012        (r26867)
> +++ trunk/orte/mca/plm/base/plm_base_receive.c        2012-07-25 17:46:45 EDT 
> (Wed, 25 Jul 2012)      (r26868)
> @@ -87,6 +87,12 @@
>                                                           
> orte_plm_base_daemon_callback, NULL))) {
>             ORTE_ERROR_LOG(rc);
>         }
> +        if (ORTE_SUCCESS != (rc = orte_rml.recv_buffer_nb(ORTE_NAME_WILDCARD,
> +                                                          
> ORTE_RML_TAG_REPORT_REMOTE_LAUNCH,
> +                                                          
> ORTE_RML_PERSISTENT,
> +                                                          
> orte_plm_base_daemon_failed, NULL))) {
> +            ORTE_ERROR_LOG(rc);
> +        }
>     }
>     recv_issued = true;
> 
> 
> Modified: trunk/orte/mca/plm/base/plm_private.h
> ==============================================================================
> --- trunk/orte/mca/plm/base/plm_private.h     Wed Jul 25 12:32:51 2012        
> (r26867)
> +++ trunk/orte/mca/plm/base/plm_private.h     2012-07-25 17:46:45 EDT (Wed, 
> 25 Jul 2012)      (r26868)
> @@ -78,6 +78,10 @@
> ORTE_DECLSPEC void orte_plm_base_daemon_callback(int status, 
> orte_process_name_t* sender,
>                                                  opal_buffer_t *buffer,
>                                                  orte_rml_tag_t tag, void 
> *cbdata);
> +ORTE_DECLSPEC void orte_plm_base_daemon_failed(int status, 
> orte_process_name_t* sender,
> +                                               opal_buffer_t *buffer,
> +                                               orte_rml_tag_t tag, void 
> *cbdata);
> +
> ORTE_DECLSPEC int orte_plm_base_create_jobid(orte_job_t *jdata);
> ORTE_DECLSPEC int orte_plm_base_set_hnp_name(void);
> ORTE_DECLSPEC void orte_plm_base_reset_job(orte_job_t *jdata);
> 
> Modified: trunk/orte/mca/plm/rsh/plm_rsh_module.c
> ==============================================================================
> --- trunk/orte/mca/plm/rsh/plm_rsh_module.c   Wed Jul 25 12:32:51 2012        
> (r26867)
> +++ trunk/orte/mca/plm/rsh/plm_rsh_module.c   2012-07-25 17:46:45 EDT (Wed, 
> 25 Jul 2012)      (r26868)
> @@ -258,8 +258,6 @@
>  */
> static void rsh_wait_daemon(pid_t pid, int status, void* cbdata)
> {
> -    orte_std_cntr_t cnt=1;
> -    uint8_t flag;
>     orte_job_t *jdata;
>     orte_plm_rsh_caddy_t *caddy=(orte_plm_rsh_caddy_t*)cbdata;
>     orte_proc_t *daemon=caddy->daemon;
> @@ -283,10 +281,8 @@
>                                  ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
>                                  (int)daemon->name.vpid, 
> WEXITSTATUS(status)));
>             buf = OBJ_NEW(opal_buffer_t);
> -            opal_dss.pack(buf, &cnt, 1, ORTE_STD_CNTR);
> -            flag = 1;
> -            opal_dss.pack(buf, &flag, 1, OPAL_UINT8);
>             opal_dss.pack(buf, &(daemon->name.vpid), 1, ORTE_VPID);
> +            opal_dss.pack(buf, &status, 1, OPAL_INT);
>             orte_rml.send_buffer_nb(ORTE_PROC_MY_HNP, buf,
>                                     ORTE_RML_TAG_REPORT_REMOTE_LAUNCH, 0,
>                                     orte_rml_send_callback, NULL);
> @@ -297,6 +293,8 @@
>                                  "%s daemon %d failed with status %d",
>                                  ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
>                                  (int)daemon->name.vpid, 
> WEXITSTATUS(status)));
> +            /* set the exit status */
> +            ORTE_UPDATE_EXIT_STATUS(WEXITSTATUS(status));
>             /* note that this daemon failed */
>             daemon->state = ORTE_PROC_STATE_FAILED_TO_START;
>             /* increment the #daemons terminated so we will exit properly */
> @@ -735,7 +733,7 @@
>     char **argv = NULL;
>     char *prefix, *hostname, *var;
>     int argc;
> -    int rc;
> +    int rc=ORTE_SUCCESS;
>     bool failed_launch = true;
>     orte_std_cntr_t n;
>     opal_byte_object_t *bo;
> @@ -748,6 +746,9 @@
>                          "%s plm:rsh: remote spawn called",
>                          ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
> 
> +    /* if we hit any errors, tell the HNP it was us */
> +    target.vpid = ORTE_PROC_MY_NAME->vpid;
> +
>     /* extract the prefix from the launch buffer */
>     n = 1;
>     if (ORTE_SUCCESS != (rc = opal_dss.unpack(launch, &prefix, &n, 
> OPAL_STRING))) {
> @@ -867,12 +868,9 @@
>     if (failed_launch) {
>         /* report cannot launch this daemon to HNP */
>         opal_buffer_t *buf;
> -        orte_std_cntr_t cnt=1;
> -        uint8_t flag=1;
>         buf = OBJ_NEW(opal_buffer_t);
> -        opal_dss.pack(buf, &cnt, 1, ORTE_STD_CNTR);
> -        opal_dss.pack(buf, &flag, 1, OPAL_UINT8);
>         opal_dss.pack(buf, &target.vpid, 1, ORTE_VPID);
> +        opal_dss.pack(buf, &rc, 1, OPAL_INT);
>         orte_rml.send_buffer_nb(ORTE_PROC_MY_HNP, buf,
>                                 ORTE_RML_TAG_REPORT_REMOTE_LAUNCH, 0,
>                                 orte_rml_send_callback, NULL);
> _______________________________________________
> svn mailing list
> s...@open-mpi.org
> http://www.open-mpi.org/mailman/listinfo.cgi/svn


Reply via email to