Author: bosilca
Date: 2009-06-23 18:10:25 EDT (Tue, 23 Jun 2009)
New Revision: 21504
URL: https://svn.open-mpi.org/trac/ompi/changeset/21504
Log:
Repair the tree spawn. The problem seems to come from the fact that
the HNP now sends its messages using the routed component. In the case
of tree spawn, when an intermediary node spawns a child it doesn't know
how to forward a message to it, so when the node-map message arrives
from the HNP (as there is nothing yet in the contact/routing table) the
message is sent back the way it came. As a result, the node-map message
keeps bouncing between the HNP and the first-level orteds.
The solution is to pass a new option, orte_parent_uri, to the children;
it is only set when the orted is _not_ directly spawned by the HNP.
When this option is present in the argument list, the orted will add
the parent to its routing table and force the parent to update its
routes (by sending the URI). With this approach, the routing tree is
built at the same time as the processes are spawned, and all messages
from the HNP can be routed to the leaves.
However, this is far from an optimal solution. Right now, this
so-called tree spawn only spawns the children in a tree, without doing
anything about the "connect back to the HNP" step. The HNP is flooded
with reports from all the orteds. The total number of messages is
higher than in the non-tree startup scheme, so we do not expect this
approach to be scalable in its current incarnation. A complete overhaul
of the tree startup is required in order to improve the scalability.
Stay tuned!
Text files modified:
trunk/orte/mca/ess/base/ess_base_std_orted.c | 41 +++++++++
++++++++----------------------
trunk/orte/mca/plm/base/plm_base_launch_support.c | 9 +++++++
trunk/orte/orted/orted_main.c | 38 +++++++++
++++++++++++++++++++++++++++
3 files changed, 64 insertions(+), 24 deletions(-)
Modified: trunk/orte/mca/ess/base/ess_base_std_orted.c
==============================================================================
--- trunk/orte/mca/ess/base/ess_base_std_orted.c (original)
+++ trunk/orte/mca/ess/base/ess_base_std_orted.c 2009-06-23
18:10:25 EDT (Tue, 23 Jun 2009)
@@ -2,13 +2,15 @@
* Copyright (c) 2004-2005 The Trustees of Indiana University and
Indiana
* University Research and Technology
* Corporation. All rights reserved.
- * Copyright (c) 2004-2005 The University of Tennessee and The
University
+ * Copyright (c) 2004-2009 The University of Tennessee and The
University
* of Tennessee Research Foundation. All
rights
* reserved.
* Copyright (c) 2004-2005 High Performance Computing Center
Stuttgart,
* University of Stuttgart. All rights
reserved.
* Copyright (c) 2004-2005 The Regents of the University of
California.
* All rights reserved.
+ * Copyright (c) 2009 Institut National de Recherche en
Informatique
+ * et Automatique. All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
@@ -172,6 +174,12 @@
goto error;
}
+ /* initialize the nidmaps */
+ if (ORTE_SUCCESS != (ret = orte_util_nidmap_init(NULL))) {
+ ORTE_ERROR_LOG(ret);
+ error = "orte_util_nidmap_init";
+ goto error;
+ }
/* if we are using static ports, then we need to setup
* the daemon info so the RML can function properly
* without requiring a wireup stage. This must be done
@@ -179,12 +187,6 @@
* own port, which we need in order to construct the nidmap
*/
if (orte_static_ports) {
- /* construct the nidmap arrays */
- if (ORTE_SUCCESS != (ret = orte_util_nidmap_init(NULL))) {
- ORTE_ERROR_LOG(ret);
- error = "orte_util_nidmap_init";
- goto error;
- }
if (NULL != orted_launch_cmd) {
/* the launch cmd was given via regexp on the cmd line -
parse
* it to get the contact info
@@ -209,23 +211,16 @@
goto error;
}
}
- /* be sure to update the routing tree so the initial
"phone home"
- * to mpirun goes through the tree!
- */
- if (ORTE_SUCCESS != (ret =
orte_routed.update_routing_tree())) {
- ORTE_ERROR_LOG(ret);
- error = "failed to update routing tree";
- goto error;
- }
- } else {
- /* initialize the nidmaps */
- if (ORTE_SUCCESS != (ret = orte_util_nidmap_init(NULL))) {
- ORTE_ERROR_LOG(ret);
- error = "orte_util_nidmap_init";
- goto error;
- }
}
-
+ /* be sure to update the routing tree so the initial "phone
home"
+ * to mpirun goes through the tree!
+ */
+ if (ORTE_SUCCESS != (ret = orte_routed.update_routing_tree())) {
+ ORTE_ERROR_LOG(ret);
+ error = "failed to update routing tree";
+ goto error;
+ }
+
/* Now provide a chance for the PLM
* to perform any module-specific init functions. This
* needs to occur AFTER the communications are setup
Modified: trunk/orte/mca/plm/base/plm_base_launch_support.c
==============================================================================
--- trunk/orte/mca/plm/base/plm_base_launch_support.c (original)
+++ trunk/orte/mca/plm/base/plm_base_launch_support.c 2009-06-23
18:10:25 EDT (Tue, 23 Jun 2009)
@@ -10,6 +10,8 @@
* Copyright (c) 2004-2005 The Regents of the University of
California.
* All rights reserved.
* Copyright (c) 2007-2008 Cisco Systems, Inc. All rights reserved.
+ * Copyright (c) 2009 Institut National de Recherche en
Informatique
+ * et Automatique. All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
@@ -1108,13 +1110,18 @@
if (ORTE_PROC_IS_HNP) {
rml_uri = orte_rml.get_contact_info();
} else {
+ asprintf(&param, "\"%s\"", orte_rml.get_contact_info() );
+ opal_argv_append(argc, argv, "--parent-uri");
+ opal_argv_append(argc, argv, param);
+ free(param);
+
rml_uri = orte_process_info.my_hnp_uri;
}
asprintf(&param, "\"%s\"", rml_uri);
opal_argv_append(argc, argv, "--hnp-uri");
opal_argv_append(argc, argv, param);
free(param);
-
+
/* if given, pass the node list */
if (NULL != nodes) {
opal_argv_append(argc, argv, "-mca");
Modified: trunk/orte/orted/orted_main.c
==============================================================================
--- trunk/orte/orted/orted_main.c (original)
+++ trunk/orte/orted/orted_main.c 2009-06-23 18:10:25 EDT (Tue, 23
Jun 2009)
@@ -12,6 +12,8 @@
* Copyright (c) 2007 Cisco Systems, Inc. All rights reserved.
* Copyright (c) 2007 Los Alamos National Security, LLC. All
rights
* reserved.
+ * Copyright (c) 2009 Institut National de Recherche en
Informatique
+ * et Automatique. All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
@@ -63,6 +65,7 @@
#include "orte/util/session_dir.h"
#include "orte/util/name_fns.h"
#include "orte/runtime/orte_locks.h"
+#include "orte/mca/rml/base/rml_contact.h"
#include "orte/mca/errmgr/errmgr.h"
#include "orte/mca/grpcomm/grpcomm.h"
@@ -169,6 +172,10 @@
NULL, OPAL_CMD_LINE_TYPE_STRING,
"URI for the HNP"},
+ { "orte", "parent", "uri", '\0', NULL, "parent-uri", 1,
+ NULL, OPAL_CMD_LINE_TYPE_STRING,
+ "URI for the parent if tree launch is enabled."},
+
{ NULL, NULL, NULL, '\0', NULL, "set-sid", 0,
&orted_globals.set_sid, OPAL_CMD_LINE_TYPE_BOOL,
"Direct the orted to separate from the current session"},
@@ -677,6 +684,37 @@
OBJ_RELEASE(buffer);
goto DONE;
}
+
+ mca_base_param_reg_string_name("orte", "parent_uri",
+ "URI for the parent if tree
launch is enabled.",
+ true, false, NULL,
&rml_uri);
+ if (NULL != rml_uri) {
+ orte_process_name_t parent;
+
+ /* set the contact info into the hash table */
+ if (ORTE_SUCCESS != (ret =
orte_rml.set_contact_info(rml_uri))) {
+ ORTE_ERROR_LOG(ret);
+ free(rml_uri);
+ OBJ_RELEASE(buffer);
+ goto DONE;
+ }
+ ret = orte_rml_base_parse_uris(rml_uri, &parent, NULL );
+ if( ORTE_SUCCESS != ret ) {
+ ORTE_ERROR_LOG(ret);
+ free(rml_uri);
+ OBJ_RELEASE(buffer);
+ goto DONE;
+ }
+ free(rml_uri);
+
+ if( 0 > (ret = orte_rml.send_buffer(&parent, buffer,
+
ORTE_RML_TAG_ORTED_CALLBACK, 0)) ) {
+ ORTE_ERROR_LOG(ret);
+ OBJ_RELEASE(buffer);
+ goto DONE;
+ }
+ }
+
OBJ_RELEASE(buffer); /* done with this */
}
_______________________________________________
svn mailing list
s...@open-mpi.org
http://www.open-mpi.org/mailman/listinfo.cgi/svn