Yo George

This commit is going to break non-rsh launchers. While it is true that the
rsh launcher can handle multi-word options by putting them in quotes, we
specifically avoided doing that here, in the base launch support code,
because the added quotes break SLURM, Torque, and other launchers.

This is why we deliberately handle multi-word options in the rsh plm
module, and not here. Would you please move that handling back there?

Thanks
Ralph


On Wed, Jun 24, 2009 at 1:51 PM, <bosi...@osl.iu.edu> wrote:

> Author: bosilca
> Date: 2009-06-24 15:51:52 EDT (Wed, 24 Jun 2009)
> New Revision: 21513
> URL: https://svn.open-mpi.org/trac/ompi/changeset/21513
>
> Log:
> When we get a report from an orted about its state, don't use the sender of
> the message to update the structures, but instead use the information from
> the URI. The reason is that even the launch report messages can get routed.
>
> Deal with the orted_cmd_line in a single location.
>
> Text files modified:
>   trunk/orte/mca/plm/base/plm_base_launch_support.c |    69
> +++++++++++++++++++++++----------------
>   1 files changed, 41 insertions(+), 28 deletions(-)
>
> Modified: trunk/orte/mca/plm/base/plm_base_launch_support.c
>
> ==============================================================================
> --- trunk/orte/mca/plm/base/plm_base_launch_support.c   (original)
> +++ trunk/orte/mca/plm/base/plm_base_launch_support.c   2009-06-24 15:51:52
> EDT (Wed, 24 Jun 2009)
> @@ -433,7 +433,8 @@
>  {
>     orte_message_event_t *mev = (orte_message_event_t*)data;
>     opal_buffer_t *buffer = mev->buffer;
> -    char *rml_uri;
> +    orte_process_name_t peer;
> +    char *rml_uri = NULL;
>     int rc, idx;
>     int32_t arch;
>     orte_node_t **nodes;
> @@ -442,19 +443,11 @@
>     int64_t setupsec, setupusec;
>     int64_t startsec, startusec;
>
> -    OPAL_OUTPUT_VERBOSE((5, orte_plm_globals.output,
> -                         "%s plm:base:orted_report_launch from daemon %s",
> -                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
> -                         ORTE_NAME_PRINT(&mev->sender)));
> -
>     /* see if we need to timestamp this receipt */
>     if (orte_timing) {
>         gettimeofday(&recvtime, NULL);
>     }
>
> -    /* update state */
> -    pdatorted[mev->sender.vpid]->state = ORTE_PROC_STATE_RUNNING;
> -
>     /* unpack its contact info */
>     idx = 1;
>     if (ORTE_SUCCESS != (rc = opal_dss.unpack(buffer, &rml_uri, &idx,
> OPAL_STRING))) {
> @@ -466,13 +459,26 @@
>     /* set the contact info into the hash table */
>     if (ORTE_SUCCESS != (rc = orte_rml.set_contact_info(rml_uri))) {
>         ORTE_ERROR_LOG(rc);
> -        free(rml_uri);
>         orted_failed_launch = true;
>         goto CLEANUP;
>     }
> -    /* lookup and record this daemon's contact info */
> -    pdatorted[mev->sender.vpid]->rml_uri = strdup(rml_uri);
> -    free(rml_uri);
> +
> +    rc = orte_rml_base_parse_uris(rml_uri, &peer, NULL );
> +    if( ORTE_SUCCESS != rc ) {
> +        ORTE_ERROR_LOG(rc);
> +        orted_failed_launch = true;
> +        goto CLEANUP;
> +    }
> +
> +    OPAL_OUTPUT_VERBOSE((5, orte_plm_globals.output,
> +                         "%s plm:base:orted_report_launch from daemon %s
> via %s",
> +                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
> +                         ORTE_NAME_PRINT(&peer),
> +                         ORTE_NAME_PRINT(&mev->sender)));
> +
> +    /* update state and record for this daemon contact info */
> +    pdatorted[peer.vpid]->state = ORTE_PROC_STATE_RUNNING;
> +    pdatorted[peer.vpid]->rml_uri = rml_uri;
>
>     /* get the remote arch */
>     idx = 1;
> @@ -555,31 +561,33 @@
>
>     /* lookup the node */
>     nodes = (orte_node_t**)orte_node_pool->addr;
> -    if (NULL == nodes[mev->sender.vpid]) {
> +    if (NULL == nodes[peer.vpid]) {
>         ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND);
>         orted_failed_launch = true;
>         goto CLEANUP;
>     }
>     /* store the arch */
> -    nodes[mev->sender.vpid]->arch = arch;
> +    nodes[peer.vpid]->arch = arch;
>
>     /* if a tree-launch is underway, send the cmd back */
>     if (NULL != orte_tree_launch_cmd) {
> -        orte_rml.send_buffer(&mev->sender, orte_tree_launch_cmd,
> ORTE_RML_TAG_DAEMON, 0);
> +        orte_rml.send_buffer(&peer, orte_tree_launch_cmd,
> ORTE_RML_TAG_DAEMON, 0);
>     }
>
>  CLEANUP:
>
>     OPAL_OUTPUT_VERBOSE((5, orte_plm_globals.output,
> -                         "%s plm:base:orted_report_launch %s for daemon %s
> at contact %s",
> +                         "%s plm:base:orted_report_launch %s for daemon %s
> (via %s) at contact %s",
>                          ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
>                          orted_failed_launch ? "failed" : "completed",
> -                         ORTE_NAME_PRINT(&mev->sender),
> pdatorted[mev->sender.vpid]->rml_uri));
> +                         ORTE_NAME_PRINT(&peer),
> +                         ORTE_NAME_PRINT(&mev->sender),
> pdatorted[peer.vpid]->rml_uri));
>
>     /* release the message */
>     OBJ_RELEASE(mev);
>
>     if (orted_failed_launch) {
> +        if( NULL != rml_uri ) free(rml_uri);
>         orte_errmgr.incomplete_start(ORTE_PROC_MY_NAME->jobid,
> ORTE_ERROR_DEFAULT_EXIT_CODE);
>     } else {
>         orted_num_callback++;
> @@ -1133,18 +1141,23 @@
>      * being sure to "purge" any that would cause problems
>      * on backend nodes
>      */
> -    if (ORTE_PROC_IS_HNP) {
> +    if (ORTE_PROC_IS_HNP || ORTE_PROC_IS_DAEMON) {
>         cnt = opal_argv_count(orted_cmd_line);
>         for (i=0; i < cnt; i+=3) {
> -            /* if the specified option is more than one word, we don't
> -             * have a generic way of passing it as some environments
> ignore
> -             * any quotes we add, while others don't - so we ignore any
> -             * such options. In most cases, this won't be a problem as
> -             * they typically only apply to things of interest to the HNP.
> -             * Individual environments can add these back into the cmd
> line
> -             * as they know if it can be supported
> -             */
> -            if (NULL != strchr(orted_cmd_line[i+2], ' ')) {
> +             /* in the rsh environment, we can append multi-word arguments
> +              * by enclosing them in quotes. Check for any multi-word
> +              * mca params passed to mpirun and include them
> +              */
> +             if (NULL != strchr(orted_cmd_line[i+2], ' ')) {
> +                char* param;
> +
> +                /* must add quotes around it */
> +                asprintf(&param, "\"%s\"", orted_cmd_line[i+2]);
> +                /* now pass it along */
> +                opal_argv_append(argc, argv, orted_cmd_line[i]);
> +                opal_argv_append(argc, argv, orted_cmd_line[i+1]);
> +                opal_argv_append(argc, argv, param);
> +                free(param);
>                 continue;
>             }
>             /* The daemon will attempt to open the PLM on the remote
> _______________________________________________
> svn mailing list
> s...@open-mpi.org
> http://www.open-mpi.org/mailman/listinfo.cgi/svn
>

Reply via email to