Hmm. I don't think that we want to put knowledge of XRC in the OOB CPC (and vice versa). That seems like an abstraction violation.

I didn't like that XRC knowledge was put in the connect base either, but I was too busy to argue with it. :-)

Isn't there a better way somehow? Perhaps we should have "select" call *all* the functions and accept back a priority. The one with the highest priority then wins. This is quite similar to much of the other selection logic in OMPI.

Sidenote: Keep in mind that there are some changes coming to select CPCs on a per-endpoint basis (I can't look up the trac ticket right now...). This makes things a little complicated -- do we need btl_openib_cpc_include and btl_openib_cpc_exclude MCA params to include/exclude CPCs (because you might need more than one CPC in a single job)? That wouldn't be hard to do.

But then what to do about if someone sets to use some XRC QPs and selects to use OOB or RDMA CM? How do we catch this and print an error? It doesn't seem right to put the "if num_xrc_qps>0" check in every CPC. What happens if you try to make an XRC QP when not using xoob? Where is the error detected and what kind of error message do we print?

Also, I'm not sure why the #if/#else is there for xoob (i.e., having empty/printf functions there when XRC support is compiled out) -- if xoob was disabled during compilation, then it simply should not be compiled and therefore not be there at all at run-time. If a user selects the xoob CPC, then we should print a message from the base that that CPC doesn't exist in the installation. Correspondingly, we can make an info MCA param in the btl openib that shows which CPCs are available (we already have this information -- it's easy enough to put this in an info MCA param).


On Dec 11, 2007, at 6:59 PM, Jon Mason wrote:

Currently, alternate CMs cannot be called because
ompi_btl_openib_connect_base_open forces a choice of either oob or xoob
(and goes into an erroneous error path if you pick something else).
This patch reorganizes ompi_btl_openib_connect_base_open so that new
functions can easily be added.  New Open functions were added to oob
and xoob for the error handling.

I tested calling oob, xoob, and rdma_cm. oob happily allows connections
to be established and throws no errors.  xoob fails because ompi does
not have it compiled in (and I have no connectx cards).  rdma_cm calls
the empty hooks and exits without connecting (thus throwing
non-connection errors).  All expected behavior.

Since this patch fixes the existing behavior, and is not necessarily
tied to my implementing of rdma_cm, I think it is acceptable to go in
now.

Thanks,
Jon

Index: ompi/mca/btl/openib/connect/btl_openib_connect_base.c
===================================================================
--- ompi/mca/btl/openib/connect/btl_openib_connect_base.c (revision 16937) +++ ompi/mca/btl/openib/connect/btl_openib_connect_base.c (working copy)
@@ -50,8 +50,8 @@
 */
int ompi_btl_openib_connect_base_open(void)
{
-    int i;
-    char **temp, *a, *b;
+    char **temp, *a, *b, *defval;
+    int i, ret = OMPI_ERROR;

    /* Make an MCA parameter to select which connect module to use */
    temp = NULL;
@@ -66,40 +66,23 @@

    /* For XRC qps we must to use XOOB connection manager */
    if (mca_btl_openib_component.num_xrc_qps > 0) {
- mca_base_param_reg_string(&mca_btl_openib_component.super.btl_version,
-                "connect",
-                b, false, false,
-                "xoob", &param);
-        if (0 != strcmp("xoob", param)) {
-            opal_show_help("help-mpi-btl-openib.txt",
-                    "XRC with wrong OOB", true,
-                    orte_system_info.nodename,
-                    mca_btl_openib_component.num_xrc_qps);
-            return OMPI_ERROR;
-        }
+       defval = "xoob";
    } else { /* For all others we should use OOB */
- mca_base_param_reg_string(&mca_btl_openib_component.super.btl_version,
-                "connect",
-                b, false, false,
-                "oob", &param);
-        if (0 != strcmp("oob", param)) {
-            opal_show_help("help-mpi-btl-openib.txt",
-                    "SRQ or PP with wrong OOB", true,
-                    orte_system_info.nodename,
-                    mca_btl_openib_component.num_srq_qps,
-                    mca_btl_openib_component.num_pp_qps);
-            return OMPI_ERROR;
-        }
+       defval = "oob";
    }

+ mca_base_param_reg_string(&mca_btl_openib_component.super.btl_version,
+                             "connect", b, false, false, defval, &param);
+
    /* Call the open function on all the connect modules */
    for (i = 0; NULL != all[i]; ++i) {
-        if (NULL != all[i]->bcf_open) {
-            all[i]->bcf_open();
+        if (0 == strcmp(all[i]->bcf_name, param)) {
+            ret = all[i]->bcf_open();
+           break;
        }
    }

-    return OMPI_SUCCESS;
+    return ret;
}


Index: ompi/mca/btl/openib/connect/btl_openib_connect_ibcm.c
===================================================================
--- ompi/mca/btl/openib/connect/btl_openib_connect_ibcm.c (revision 16937) +++ ompi/mca/btl/openib/connect/btl_openib_connect_ibcm.c (working copy)
@@ -28,11 +28,7 @@

static int ibcm_open(void)
{
- mca_base_param_reg_int(&mca_btl_openib_component.super.btl_version,
-                           "btl_openib_connect_ibcm_foo",
-                           "A dummy help message", false, false,
-                           17, NULL);
-
+    printf("ibcm open\n");
    return OMPI_SUCCESS;
}

Index: ompi/mca/btl/openib/connect/btl_openib_connect_oob.c
===================================================================
--- ompi/mca/btl/openib/connect/btl_openib_connect_oob.c (revision 16937) +++ ompi/mca/btl/openib/connect/btl_openib_connect_oob.c (working copy)
@@ -22,6 +22,8 @@

#include "ompi_config.h"

+#include "opal/util/show_help.h"
+
#include "orte/mca/ns/base/base.h"
#include "orte/mca/oob/base/base.h"
#include "orte/mca/rml/rml.h"
@@ -39,6 +41,7 @@
    ENDPOINT_CONNECT_ACK
} connect_message_type_t;

+static int oob_open(void);
static int oob_init(void);
static int oob_start_connect(mca_btl_base_endpoint_t *e);
static int oob_finalize(void);
@@ -67,8 +70,8 @@
 */
ompi_btl_openib_connect_base_funcs_t ompi_btl_openib_connect_oob = {
    "oob",
-    /* No need for "open */
-    NULL,
+    /* Open */
+    oob_open,
    /* Init */
    oob_init,
    /* Connect */
@@ -78,6 +81,23 @@
};

/*
+ * Open function.
+ */
+static int oob_open(void)
+{
+    if (mca_btl_openib_component.num_xrc_qps > 0) {
+            opal_show_help("help-mpi-btl-openib.txt",
+                    "SRQ or PP with wrong OOB", true,
+                    orte_system_info.nodename,
+                    mca_btl_openib_component.num_srq_qps,
+                    mca_btl_openib_component.num_pp_qps);
+            return OMPI_ERROR;
+    }
+
+    return OMPI_SUCCESS;
+}
+
+/*
 * Init function.  Post non-blocking RML receive to accept incoming
 * connection requests.
 */
Index: ompi/mca/btl/openib/connect/btl_openib_connect_rdma_cm.c
===================================================================
--- ompi/mca/btl/openib/connect/btl_openib_connect_rdma_cm.c (revision 16937) +++ ompi/mca/btl/openib/connect/btl_openib_connect_rdma_cm.c (working copy)
@@ -28,11 +28,7 @@

static int rdma_cm_open(void)
{
- mca_base_param_reg_int(&mca_btl_openib_component.super.btl_version,
-                           "btl_openib_connect_rdma_cm_foo",
-                           "A dummy help message", false, false,
-                           17, NULL);
-
+    printf("rdma cm open\n");
    return OMPI_SUCCESS;
}

Index: ompi/mca/btl/openib/connect/btl_openib_connect_xoob.c
===================================================================
--- ompi/mca/btl/openib/connect/btl_openib_connect_xoob.c (revision 16937) +++ ompi/mca/btl/openib/connect/btl_openib_connect_xoob.c (working copy)
@@ -10,6 +10,8 @@

#include "ompi_config.h"

+#include "opal/util/show_help.h"
+
#include "orte/mca/ns/base/base.h"
#include "orte/mca/oob/base/base.h"
#include "orte/mca/rml/rml.h"
@@ -22,6 +24,7 @@
#include "btl_openib_xrc.h"
#include "connect/connect.h"

+static int xoob_open(void);
static int xoob_init(void);
static int xoob_start_connect(mca_btl_base_endpoint_t *e);
static int xoob_finalize(void);
@@ -32,8 +35,8 @@
 */
ompi_btl_openib_connect_base_funcs_t ompi_btl_openib_connect_xoob = {
    "xoob",
-    /* No need for "open */
-    NULL,
+    /* Open */
+    xoob_open,
    /* Init */
    xoob_init,
    /* Connect */
@@ -99,7 +102,24 @@

static int init_rem_info(mca_btl_openib_rem_info_t *rem_info);
static void free_rem_info(mca_btl_openib_rem_info_t *rem_info);
+
/*
+ * Open function.
+ */
+static int xoob_open(void)
+{
+    if (mca_btl_openib_component.num_xrc_qps <= 0) {
+            opal_show_help("help-mpi-btl-openib.txt",
+                    "XRC with wrong OOB", true,
+                    orte_system_info.nodename,
+                    mca_btl_openib_component.num_xrc_qps);
+            return OMPI_ERROR;
+    }
+
+    return OMPI_SUCCESS;
+}
+
+/*
 * Init function.  Post non-blocking RML receive to accept incoming
 * connection requests.
 */
@@ -834,6 +854,12 @@

#else
/* In case if the XRC was disabled during compilation we will print message and return error */
+static int xoob_open(void)
+{
+    printf("xoob open\n");
+    return OMPI_ERR_NOT_IMPLEMENTED;
+}
+
static int xoob_init(void)
{
    printf("xoob init\n");
_______________________________________________
devel mailing list
de...@open-mpi.org
http://www.open-mpi.org/mailman/listinfo.cgi/devel


--
Jeff Squyres
Cisco Systems

Reply via email to