Hi,

When the asynchronous event device handler (btl_openib_async_deviceh())
receives an async event and XRC is enabled, the XRC bit is cleared in order
to process the event_type value, but orte_show_help is still called with the
original event_type value (i.e. with the XRC bit still set). This leads to
messages like the following:

  ----------------------------------------------------------
  The OpenFabrics stack has reported a network error event. Open MPI
  will try to continue. but your job may end up failing.

   Local host:       XXXX
   MPI process PID:  31818
   Error number:     -2147483645 (UNKNOWN)
  This error may indicate connectivity problems within the fabric;
  please contact your system administrator
  -----------------------------------------------------------

whereas the expected error number is:
  Error number:      3 (IBV_EVENT_QP_ACCESS_ERR)

I propose the attached small patch to fix this issue.

Regards,
Nadia

-- 
nadia.derbey <nadia.der...@bull.net>
Wrong event_type value passed to show_help when handling XRC async events

diff -r e4bab4451664 ompi/mca/btl/openib/btl_openib_async.c
--- a/ompi/mca/btl/openib/btl_openib_async.c	Tue May 25 01:30:35 2010 +0200
+++ b/ompi/mca/btl/openib/btl_openib_async.c	Mon Jul 12 14:47:07 2010 +0200
@@ -291,7 +291,7 @@ static int btl_openib_async_deviceh(stru
     mca_btl_openib_device_t *device = NULL;
     struct ibv_async_event event;
     bool xrc_event = false;
-    int event_type;
+    enum ibv_event_type event_type;

     /* We need to find correct device and process this event */
     for (j=0; j < mca_btl_openib_component.ib_num_btls; j++) {
@@ -350,13 +350,13 @@ static int btl_openib_async_deviceh(stru
             case IBV_EVENT_SRQ_ERR:
                 orte_show_help("help-mpi-btl-openib.txt", "of error event",
                     true,orte_process_info.nodename, orte_process_info.pid,
-                    event.event_type, openib_event_to_str(event.event_type),
+                    event_type, openib_event_to_str(event_type),
                     xrc_event ? "true" : "false");
                 break;
             case IBV_EVENT_PORT_ERR:
                 orte_show_help("help-mpi-btl-openib.txt", "of error event",
                     true,orte_process_info.nodename, orte_process_info.pid,
-                    event.event_type, openib_event_to_str(event.event_type),
+                    event_type, openib_event_to_str(event_type),
                     xrc_event ? "true" : "false");
                 /* Set the flag to indicate port error */
                 device->got_port_event = true;
@@ -385,7 +385,7 @@ static int btl_openib_async_deviceh(stru
             default:
                 orte_show_help("help-mpi-btl-openib.txt", "of unknown event",
                         true,orte_process_info.nodename, orte_process_info.pid,
-                        event.event_type, xrc_event ? "true" : "false");
+                        event_type, xrc_event ? "true" : "false");
         }
         ibv_ack_async_event(&event);
     } else {

Reply via email to