Yo George, this commit is going to break non-rsh launchers. While it is true that the rsh launcher may handle multi-word options by putting them in quotes, we specifically avoided it here because it breaks SLURM, Torque, and others.
This is why we specifically put the inclusion of multi-word options in the rsh plm module, and not here. Would you please move it back there? Thanks Ralph On Wed, Jun 24, 2009 at 1:51 PM, <bosi...@osl.iu.edu> wrote: > Author: bosilca > Date: 2009-06-24 15:51:52 EDT (Wed, 24 Jun 2009) > New Revision: 21513 > URL: https://svn.open-mpi.org/trac/ompi/changeset/21513 > > Log: > When we get a report from an orted about its state, don't use the sender of > the message to update the structures, but instead use the information from > the URI. The reason is that even the launch report messages can get routed. > > Deal with the orted_cmd_line in a single location. > > Text files modified: > trunk/orte/mca/plm/base/plm_base_launch_support.c | 69 > +++++++++++++++++++++++---------------- > 1 files changed, 41 insertions(+), 28 deletions(-) > > Modified: trunk/orte/mca/plm/base/plm_base_launch_support.c > > ============================================================================== > --- trunk/orte/mca/plm/base/plm_base_launch_support.c (original) > +++ trunk/orte/mca/plm/base/plm_base_launch_support.c 2009-06-24 15:51:52 > EDT (Wed, 24 Jun 2009) > @@ -433,7 +433,8 @@ > { > orte_message_event_t *mev = (orte_message_event_t*)data; > opal_buffer_t *buffer = mev->buffer; > - char *rml_uri; > + orte_process_name_t peer; > + char *rml_uri = NULL; > int rc, idx; > int32_t arch; > orte_node_t **nodes; > @@ -442,19 +443,11 @@ > int64_t setupsec, setupusec; > int64_t startsec, startusec; > > - OPAL_OUTPUT_VERBOSE((5, orte_plm_globals.output, > - "%s plm:base:orted_report_launch from daemon %s", > - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), > - ORTE_NAME_PRINT(&mev->sender))); > - > /* see if we need to timestamp this receipt */ > if (orte_timing) { > gettimeofday(&recvtime, NULL); > } > > - /* update state */ > - pdatorted[mev->sender.vpid]->state = ORTE_PROC_STATE_RUNNING; > - > /* unpack its contact info */ > idx = 1; > if (ORTE_SUCCESS != (rc = opal_dss.unpack(buffer, &rml_uri, &idx, > 
OPAL_STRING))) { > @@ -466,13 +459,26 @@ > /* set the contact info into the hash table */ > if (ORTE_SUCCESS != (rc = orte_rml.set_contact_info(rml_uri))) { > ORTE_ERROR_LOG(rc); > - free(rml_uri); > orted_failed_launch = true; > goto CLEANUP; > } > - /* lookup and record this daemon's contact info */ > - pdatorted[mev->sender.vpid]->rml_uri = strdup(rml_uri); > - free(rml_uri); > + > + rc = orte_rml_base_parse_uris(rml_uri, &peer, NULL ); > + if( ORTE_SUCCESS != rc ) { > + ORTE_ERROR_LOG(rc); > + orted_failed_launch = true; > + goto CLEANUP; > + } > + > + OPAL_OUTPUT_VERBOSE((5, orte_plm_globals.output, > + "%s plm:base:orted_report_launch from daemon %s > via %s", > + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), > + ORTE_NAME_PRINT(&peer), > + ORTE_NAME_PRINT(&mev->sender))); > + > + /* update state and record for this daemon contact info */ > + pdatorted[peer.vpid]->state = ORTE_PROC_STATE_RUNNING; > + pdatorted[peer.vpid]->rml_uri = rml_uri; > > /* get the remote arch */ > idx = 1; > @@ -555,31 +561,33 @@ > > /* lookup the node */ > nodes = (orte_node_t**)orte_node_pool->addr; > - if (NULL == nodes[mev->sender.vpid]) { > + if (NULL == nodes[peer.vpid]) { > ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND); > orted_failed_launch = true; > goto CLEANUP; > } > /* store the arch */ > - nodes[mev->sender.vpid]->arch = arch; > + nodes[peer.vpid]->arch = arch; > > /* if a tree-launch is underway, send the cmd back */ > if (NULL != orte_tree_launch_cmd) { > - orte_rml.send_buffer(&mev->sender, orte_tree_launch_cmd, > ORTE_RML_TAG_DAEMON, 0); > + orte_rml.send_buffer(&peer, orte_tree_launch_cmd, > ORTE_RML_TAG_DAEMON, 0); > } > > CLEANUP: > > OPAL_OUTPUT_VERBOSE((5, orte_plm_globals.output, > - "%s plm:base:orted_report_launch %s for daemon %s > at contact %s", > + "%s plm:base:orted_report_launch %s for daemon %s > (via %s) at contact %s", > ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), > orted_failed_launch ? 
"failed" : "completed", > - ORTE_NAME_PRINT(&mev->sender), > pdatorted[mev->sender.vpid]->rml_uri)); > + ORTE_NAME_PRINT(&peer), > + ORTE_NAME_PRINT(&mev->sender), > pdatorted[peer.vpid]->rml_uri)); > > /* release the message */ > OBJ_RELEASE(mev); > > if (orted_failed_launch) { > + if( NULL != rml_uri ) free(rml_uri); > orte_errmgr.incomplete_start(ORTE_PROC_MY_NAME->jobid, > ORTE_ERROR_DEFAULT_EXIT_CODE); > } else { > orted_num_callback++; > @@ -1133,18 +1141,23 @@ > * being sure to "purge" any that would cause problems > * on backend nodes > */ > - if (ORTE_PROC_IS_HNP) { > + if (ORTE_PROC_IS_HNP || ORTE_PROC_IS_DAEMON) { > cnt = opal_argv_count(orted_cmd_line); > for (i=0; i < cnt; i+=3) { > - /* if the specified option is more than one word, we don't > - * have a generic way of passing it as some environments > ignore > - * any quotes we add, while others don't - so we ignore any > - * such options. In most cases, this won't be a problem as > - * they typically only apply to things of interest to the HNP. > - * Individual environments can add these back into the cmd > line > - * as they know if it can be supported > - */ > - if (NULL != strchr(orted_cmd_line[i+2], ' ')) { > + /* in the rsh environment, we can append multi-word arguments > + * by enclosing them in quotes. Check for any multi-word > + * mca params passed to mpirun and include them > + */ > + if (NULL != strchr(orted_cmd_line[i+2], ' ')) { > + char* param; > + > + /* must add quotes around it */ > + asprintf(&param, "\"%s\"", orted_cmd_line[i+2]); > + /* now pass it along */ > + opal_argv_append(argc, argv, orted_cmd_line[i]); > + opal_argv_append(argc, argv, orted_cmd_line[i+1]); > + opal_argv_append(argc, argv, param); > + free(param); > continue; > } > /* The daemon will attempt to open the PLM on the remote > _______________________________________________ > svn mailing list > s...@open-mpi.org > http://www.open-mpi.org/mailman/listinfo.cgi/svn >