Mike -

Can you verify that this commit is ok?

Sent from my phone. No type good.

Begin forwarded message:

From: <svn-commit-mai...@open-mpi.org<mailto:svn-commit-mai...@open-mpi.org>>
List-Post: mtt-devel@lists.open-mpi.org
Date: March 1, 2013, 4:50:56 PM EST
To: <svn-f...@open-mpi.org<mailto:svn-f...@open-mpi.org>>
Subject: [OMPI svn-full] svn:open-mpi r28142 - trunk/ompi/mca/common/cuda
Reply-To: <de...@open-mpi.org<mailto:de...@open-mpi.org>>

Author: rolfv (Rolf Vandevaart)
List-Post: mtt-devel@lists.open-mpi.org
Date: 2013-03-01 16:50:56 EST (Fri, 01 Mar 2013)
New Revision: 28142
URL: https://svn.open-mpi.org/trac/ompi/changeset/28142

Log:
Add a search path.  Refactor code.

Text files modified:
  trunk/ompi/mca/common/cuda/common_cuda.c |   116 +++++++++++++++++----------------------
  1 files changed, 52 insertions(+), 64 deletions(-)

Modified: trunk/ompi/mca/common/cuda/common_cuda.c
==============================================================================
--- trunk/ompi/mca/common/cuda/common_cuda.c    Fri Mar  1 14:13:06 2013    (r28141)
+++ trunk/ompi/mca/common/cuda/common_cuda.c    2013-03-01 16:50:56 EST (Fri, 01 Mar 2013)    (r28142)
@@ -463,11 +463,15 @@
 * This function will open and load the symbols needed from the CUDA driver
 * library.  Any failure will result in a message and we will return 1.
 */
+#define NUMLIBS 2
static int mca_common_cuda_load_libcuda(void)
{
    opal_lt_dladvise advise;
-    int retval;
+    int retval, i;
    int advise_support = 1;
+    bool loaded = true;
+    char *errs[NUMLIBS] = {NULL, NULL};
+    char *cudalibs[NUMLIBS] = {"libcuda.so", "libcuda.so.1"};

    if (0 != (retval = opal_lt_dlinit())) {
        if (OPAL_ERR_NOT_SUPPORTED == retval) {
@@ -491,6 +495,14 @@
        }
    }

+    /* Make sure we check in lib64 also in the case where there are both
+     * 32 and 64 bit libraries installed.  Otherwise, we may fail trying to
+     * load the 32 bit library. */
+    opal_lt_dladdsearchdir("/usr/lib64");
+
+    /* Now walk through all the potential names libcuda and find one
+     * that works.  If it does, all is good.  If not, print out all
+     * the messages about why things failed. */
    if (advise_support) {
        if (0 != (retval = opal_lt_dladvise_global(&advise))) {
            opal_show_help("help-mpi-common-cuda.txt", "unknown ltdl error", true,
@@ -498,84 +510,60 @@
            opal_lt_dladvise_destroy(&advise);
            return 1;
        }
-
-        /*
-         * Try and open libcuda.so and libcuda.so.1.  Note that we are not using
-         * opal_lt_dladvise_ext() as we do not need ltdl to add any suffixes to
-         * the library names being handed in.
-         */
-        libcuda_handle = opal_lt_dlopenadvise("libcuda.so", advise);
-
-        /* If the first open fails, save the error message so that it can be printed
-         * out of the second open fails as well.  If the second open succeeds, then
-         * we do not caer that the first open failed. */
-        if (NULL == libcuda_handle) {
-            char *err1;
-            const char *str1 = opal_lt_dlerror();
-            if (NULL != str1) {
-                err1 = strdup(str1);
-            } else {
-                err1 = strdup("lt_dlerror() returned NULL.");
-            }
-            libcuda_handle = opal_lt_dlopenadvise("libcuda.so.1", advise);
+        for (i = 0; i < NUMLIBS; i++) {
+            const char *str;
+            libcuda_handle = opal_lt_dlopenadvise(cudalibs[i], advise);
            if (NULL == libcuda_handle) {
-                char *err2;
-                const char *str2 = opal_lt_dlerror();
-                if (NULL != str2) {
-                    err2 = strdup(str2);
+                str = opal_lt_dlerror();
+                if (NULL != str) {
+                    errs[i] = strdup(str);
                } else {
-                    err2 = strdup("lt_dlerror() returned NULL.");
+                    errs[i] = strdup("lt_dlerror() returned NULL.");
                }
-                opal_show_help("help-mpi-common-cuda.txt", "dlopen failed", true,
-                               "libcuda.so", err1, "libcuda.so.1", err2);
-                free(err1);
-                free(err2);
-                opal_lt_dladvise_destroy(&advise);
-                return 1;
+                opal_output_verbose(10, mca_common_cuda_output,
+                                    "CUDA: Library open error: %s",
+                                    errs[i]);
+            } else {
+                loaded = true;
+                break;
            }
-            free(err1);
        }
-
        opal_lt_dladvise_destroy(&advise);
    } else {
        /* No lt_dladvise support.  This should rarely happen. */
-        /*
-         * Try and open libcuda.so and libcuda.so.1.  Note that we are not using
-         * opal_lt_dladvise_ext() as we do not need ltdl to add any suffixes to
-         * the library names being handed in.
-         */
-        libcuda_handle = opal_lt_dlopen("libcuda.so");
-
-        /* If the first open fails, save the error message so that it can be printed
-         * out of the second open fails as well.  If the second open succeeds, then
-         * we do not caer that the first open failed. */
-        if (NULL == libcuda_handle) {
-            char *err1;
-            const char *str1 = opal_lt_dlerror();
-            if (NULL != str1) {
-                err1 = strdup(str1);
-            } else {
-                err1 = strdup("lt_dlerror() returned NULL.");
-            }
-            libcuda_handle = opal_lt_dlopen("libcuda.so.1");
+        for (i = 0; i < NUMLIBS; i++) {
+            const char *str;
+            libcuda_handle = opal_lt_dlopen(cudalibs[i]);
            if (NULL == libcuda_handle) {
-                char *err2;
-                const char *str2 = opal_lt_dlerror();
-                if (NULL != str2) {
-                    err2 = strdup(str2);
+                str = opal_lt_dlerror();
+                if (NULL != str) {
+                    errs[i] = strdup(str);
                } else {
-                    err2 = strdup("lt_dlerror() returned NULL.");
+                    errs[i] = strdup("lt_dlerror() returned NULL.");
                }
-                opal_show_help("help-mpi-common-cuda.txt", "dlopen failed", true,
-                               "libcuda.so", err1, "libcuda.so.1", err2);
-                free(err1);
-                free(err2);
-                return 1;
+            } else {
+                loaded = true;
+                break;
            }
-            free(err1);
        }
    }

+    if (loaded != true) {
+        opal_show_help("help-mpi-common-cuda.txt", "dlopen failed", true,
+                       cudalibs[0], errs[0], cudalibs[1], errs[1]);
+    }
+
+    /* Cleanup error messages.  Need to do this after printing them. */
+    for (i = 0; i < NUMLIBS; i++) {
+        if (NULL != errs[i]) {
+            free(errs[i]);
+        }
+    }
+
+    if (loaded != true) {
+        return 1;
+    }
+
    /* Map in the functions that we need */
    OMPI_CUDA_DLSYM(libcuda_handle, cuStreamCreate);
    OMPI_CUDA_DLSYM(libcuda_handle, cuCtxGetCurrent);
_______________________________________________
svn-full mailing list
svn-f...@open-mpi.org<mailto:svn-f...@open-mpi.org>
http://www.open-mpi.org/mailman/listinfo.cgi/svn-full

Reply via email to