Hi,
When the asynchronous event device handler (btl_openib_async_deviceh())
gets an async event and XRC is enabled, the XRC bit is cleared from a local
copy of event_type in order to process it, but orte_show_help is still called
with the original event.event_type value (i.e. with the XRC bit still set).
This leads to the following kind of message:
----------------------------------------------------------
The OpenFabrics stack has reported a network error event. Open MPI
will try to continue. but your job may end up failing.
Local host: XXXX
MPI process PID: 31818
Error number: -2147483645 (UNKNOWN)
This error may indicate connectivity problems within the fabric;
please contact your system administrator
-----------------------------------------------------------
whereas the expected error number line is:
Error number: 3 (IBV_EVENT_QP_ACCESS_ERR)
I propose the attached small patch to fix this issue.
Regards,
Nadia
--
nadia.derbey <[email protected]>
Wrong event_type value passed in to show_help when getting xrc async events
diff -r e4bab4451664 ompi/mca/btl/openib/btl_openib_async.c
--- a/ompi/mca/btl/openib/btl_openib_async.c Tue May 25 01:30:35 2010 +0200
+++ b/ompi/mca/btl/openib/btl_openib_async.c Mon Jul 12 14:47:07 2010 +0200
@@ -291,7 +291,7 @@ static int btl_openib_async_deviceh(stru
mca_btl_openib_device_t *device = NULL;
struct ibv_async_event event;
bool xrc_event = false;
- int event_type;
+ enum ibv_event_type event_type;
/* We need to find correct device and process this event */
for (j=0; j < mca_btl_openib_component.ib_num_btls; j++) {
@@ -350,13 +350,13 @@ static int btl_openib_async_deviceh(stru
case IBV_EVENT_SRQ_ERR:
orte_show_help("help-mpi-btl-openib.txt", "of error event",
true,orte_process_info.nodename, orte_process_info.pid,
- event.event_type, openib_event_to_str(event.event_type),
+ event_type, openib_event_to_str(event_type),
xrc_event ? "true" : "false");
break;
case IBV_EVENT_PORT_ERR:
orte_show_help("help-mpi-btl-openib.txt", "of error event",
true,orte_process_info.nodename, orte_process_info.pid,
- event.event_type, openib_event_to_str(event.event_type),
+ event_type, openib_event_to_str(event_type),
xrc_event ? "true" : "false");
/* Set the flag to indicate port error */
device->got_port_event = true;
@@ -385,7 +385,7 @@ static int btl_openib_async_deviceh(stru
default:
orte_show_help("help-mpi-btl-openib.txt", "of unknown event",
true,orte_process_info.nodename, orte_process_info.pid,
- event.event_type, xrc_event ? "true" : "false");
+ event_type, xrc_event ? "true" : "false");
}
ibv_ack_async_event(&event);
} else {