Hello,

The attached patch improves the OB1 scheduling algorithm across multiple links. The current algorithm performs very poorly when interconnects with very different bandwidth values are used: for big message sizes it always divides traffic equally between all available interconnects. The attached patch changes this. For each message it calculates how much data should be sent via each link according to the relative weight of that link. This is done for the RDMAed part of the message as well as for the part that is sent by send/recv in the case of the pipeline protocol. As a side effect, the send_schedule/recv_schedule functions are greatly simplified.
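To illustrate the idea outside of the PML (a minimal standalone sketch with made-up link weights, not the patch code itself): each link with weight w out of total weight W is assigned roughly size * w / W bytes, links are visited heaviest-first, and the rounding remainder is folded into the heaviest link. In the patch the same computation lives in mca_pml_ob1_calc_weighted_length(), with one extra rule: once the data left to assign fits under a link's eager limit, that link simply takes the remainder.

/* Standalone sketch of the weighted split (illustration only). */
#include <stdio.h>
#include <stdlib.h>

struct link {
    double weight;  /* relative bandwidth of the link */
    size_t length;  /* bytes assigned to the link */
};

/* sort heaviest link first, like the patch does for BTLs */
static int weight_compare(const void *a, const void *b)
{
    const struct link *l1 = a, *l2 = b;
    if(l1->weight < l2->weight) return 1;
    if(l1->weight > l2->weight) return -1;
    return 0;
}

static void weighted_split(struct link *links, int n, size_t size)
{
    double weight_total = 0;
    size_t length_left = size;
    int i;

    for(i = 0; i < n; i++)
        weight_total += links[i].weight;

    qsort(links, n, sizeof(struct link), weight_compare);

    for(i = 0; i < n; i++) {
        /* each link gets its bandwidth-proportional share */
        size_t length = (size_t)(size * (links[i].weight / weight_total));
        if(length > length_left)
            length = length_left;
        links[i].length = length;
        length_left -= length;
    }
    /* rounding remainder goes to the heaviest link */
    links[0].length += length_left;
}

int main(void)
{
    /* one link three times faster than the other */
    struct link links[2] = { { 1.0, 0 }, { 3.0, 0 } };
    int i;

    weighted_split(links, 2, 4000000);
    for(i = 0; i < 2; i++)
        printf("weight %.1f -> %zu bytes\n", links[i].weight, links[i].length);
    /* weight 3.0 -> 3000000 bytes, weight 1.0 -> 1000000 bytes */
    return 0;
}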
Surprisingly (at least for me), this patch also greatly improves some benchmark results when multiple links with the same bandwidth are in use. The attached PostScript file shows some benchmark results with and without the patch. For this benchmark I used two computers connected with 4 DDR HCAs; each HCA is capable of ~1600MB/s on its own.

--
        Gleb.
diff --git a/ompi/mca/bml/r2/bml_r2.c b/ompi/mca/bml/r2/bml_r2.c
index 61f5db8..97f5c8a 100644
--- a/ompi/mca/bml/r2/bml_r2.c
+++ b/ompi/mca/bml/r2/bml_r2.c
@@ -157,6 +157,14 @@ static int mca_bml_r2_add_btls( void )
     return OMPI_SUCCESS;
 }
 
+static int btl_bandwidth_compare(const void *v1, const void *v2)
+{
+    mca_bml_base_btl_t *b1 = (mca_bml_base_btl_t*)v1,
+                       *b2 = (mca_bml_base_btl_t*)v2;
+
+    return b2->btl->btl_bandwidth - b1->btl->btl_bandwidth;
+}
+
 /*
  * For each proc setup a datastructure that indicates the PTLs
  * that can be used to reach the destination.
@@ -380,6 +388,11 @@ int mca_bml_r2_add_procs(
          * start using the weight to compute the correct amount. */
         n_size = mca_bml_base_btl_array_get_size(&bml_endpoint->btl_send);
+
+        /* sort BTLs in descending order according to bandwidth value */
+        qsort(bml_endpoint->btl_send.bml_btls, n_size,
+              sizeof(mca_bml_base_btl_t), btl_bandwidth_compare);
+
         bml_endpoint->bml_max_send_length = 0;
         bml_endpoint->bml_max_rdma_length = 0;
         bml_endpoint->btl_rdma_index = 0;
diff --git a/ompi/mca/pml/ob1/pml_ob1.c b/ompi/mca/pml/ob1/pml_ob1.c
index fd528fa..8b06c99 100644
--- a/ompi/mca/pml/ob1/pml_ob1.c
+++ b/ompi/mca/pml/ob1/pml_ob1.c
@@ -536,3 +536,15 @@ int mca_pml_ob1_ft_event( int state )
 
     return OMPI_SUCCESS;
 }
+
+int mca_pml_ob1_com_btl_comp(const void *v1, const void *v2)
+{
+    const mca_pml_ob1_com_btl_t *b1 = v1, *b2 = v2;
+
+    if(b1->bml_btl->btl_weight < b2->bml_btl->btl_weight)
+        return 1;
+    if(b1->bml_btl->btl_weight > b2->bml_btl->btl_weight)
+        return -1;
+
+    return 0;
+}
diff --git a/ompi/mca/pml/ob1/pml_ob1.h b/ompi/mca/pml/ob1/pml_ob1.h
index 50c8f31..e9c5403 100644
--- a/ompi/mca/pml/ob1/pml_ob1.h
+++ b/ompi/mca/pml/ob1/pml_ob1.h
@@ -55,6 +55,8 @@ struct mca_pml_ob1_t {
     size_t send_pipeline_depth;
     size_t recv_pipeline_depth;
     size_t rdma_put_retries_limit;
+    int max_rdma_per_request;
+    int max_send_per_range;
     bool leave_pinned;
     int leave_pinned_pipeline;
@@ -241,8 +243,6 @@ struct mca_pml_ob1_rdma_reg_t {
 };
 typedef struct mca_pml_ob1_rdma_reg_t mca_pml_ob1_rdma_reg_t;
 
-#define MCA_PML_OB1_MAX_REGISTRATIONS 4
-
 struct mca_pml_ob1_pckt_pending_t {
     ompi_free_list_item_t super;
     ompi_proc_t* proc;
@@ -326,4 +326,53 @@ do {                                            \
     length -= hdrlen;                           \
 } while(0)
 
+/* represent BTL chosen for sending request */
+struct mca_pml_ob1_com_btl_t {
+    mca_bml_base_btl_t *bml_btl;
+    struct mca_mpool_base_registration_t* btl_reg;
+    size_t length;
+};
+typedef struct mca_pml_ob1_com_btl_t mca_pml_ob1_com_btl_t;
+
+int mca_pml_ob1_com_btl_comp(const void *v1, const void *v2);
+
+/* Calculate what percentage of a message to send through each BTL according to
+ * relative weight */
+static inline void mca_pml_ob1_calc_weighted_length(
+        mca_pml_ob1_com_btl_t *btls, int num_btls, size_t size,
+        double weight_total)
+{
+    int i;
+    size_t length_left = size;
+
+    /* shortcut for the common case of only one BTL */
+    if(num_btls == 1) {
+        btls[0].length = size;
+        return;
+    }
+
+    /* sort BTLs according to their weights so BTLs with smaller weight will
+     * not hijack all of the traffic */
+    qsort(btls, num_btls, sizeof(mca_pml_ob1_com_btl_t),
+          mca_pml_ob1_com_btl_comp);
+
+    for(i = 0; i < num_btls; i++) {
+        mca_bml_base_btl_t* bml_btl = btls[i].bml_btl;
+        size_t length = 0;
+        if(length_left != 0) {
+            length = (length_left > bml_btl->btl_eager_limit)?
+                ((size_t)(size * (bml_btl->btl_weight / weight_total))) :
+                length_left;
+
+            if(length > length_left)
+                length = length_left;
+            length_left -= length;
+        }
+        btls[i].length = length;
+    }
+
+    /* account for rounding errors */
+    btls[0].length += length_left;
+}
+
 #endif
diff --git a/ompi/mca/pml/ob1/pml_ob1_component.c b/ompi/mca/pml/ob1/pml_ob1_component.c
index d3237e5..b35f4ee 100644
--- a/ompi/mca/pml/ob1/pml_ob1_component.c
+++ b/ompi/mca/pml/ob1/pml_ob1_component.c
@@ -113,6 +113,10 @@ int mca_pml_ob1_component_open(void)
         mca_pml_ob1_param_register_int("recv_pipeline_depth", 4);
     mca_pml_ob1.rdma_put_retries_limit =
         mca_pml_ob1_param_register_int("rdma_put_retries_limit", 5);
+    mca_pml_ob1.max_rdma_per_request =
+        mca_pml_ob1_param_register_int("max_rdma_per_request", 4);
+    mca_pml_ob1.max_send_per_range =
+        mca_pml_ob1_param_register_int("max_send_per_range", 4);
     mca_pml_ob1.unexpected_limit =
         mca_pml_ob1_param_register_int("unexpected_limit", 128);
@@ -146,7 +150,8 @@ int mca_pml_ob1_component_open(void)
     OBJ_CONSTRUCT(&mca_pml_ob1.send_requests, ompi_free_list_t);
     ompi_free_list_init( &mca_pml_ob1.send_requests,
-                         sizeof(mca_pml_ob1_send_request_t),
+                         sizeof(mca_pml_ob1_send_request_t) +
+                         (mca_pml_ob1.max_rdma_per_request - 1) * sizeof(mca_pml_ob1_com_btl_t),
                          OBJ_CLASS(mca_pml_ob1_send_request_t),
                          mca_pml_ob1.free_list_num,
                          mca_pml_ob1.free_list_max,
@@ -156,7 +161,8 @@ int mca_pml_ob1_component_open(void)
     OBJ_CONSTRUCT(&mca_pml_ob1.recv_requests, ompi_free_list_t);
     ompi_free_list_init( &mca_pml_ob1.recv_requests,
-                         sizeof(mca_pml_ob1_recv_request_t),
+                         sizeof(mca_pml_ob1_recv_request_t) +
+                         (mca_pml_ob1.max_rdma_per_request - 1) * sizeof(mca_pml_ob1_com_btl_t),
                          OBJ_CLASS(mca_pml_ob1_recv_request_t),
                          mca_pml_ob1.free_list_num,
                          mca_pml_ob1.free_list_max,
@@ -185,8 +191,6 @@ int mca_pml_ob1_component_open(void)
                          mca_pml_ob1.free_list_inc,
                          NULL);
 
-
-
     OBJ_CONSTRUCT(&mca_pml_ob1.pending_pckts, ompi_free_list_t);
     ompi_free_list_init( &mca_pml_ob1.pending_pckts,
@@ -202,7 +206,8 @@ int mca_pml_ob1_component_open(void)
     OBJ_CONSTRUCT(&mca_pml_ob1.send_ranges, ompi_free_list_t);
     ompi_free_list_init( &mca_pml_ob1.send_ranges,
-                         sizeof(mca_pml_ob1_send_range_t),
+                         sizeof(mca_pml_ob1_send_range_t) +
+                         (mca_pml_ob1.max_send_per_range - 1) * sizeof(mca_pml_ob1_com_btl_t),
                          OBJ_CLASS(mca_pml_ob1_send_range_t),
                          mca_pml_ob1.free_list_num,
                          mca_pml_ob1.free_list_max,
diff --git a/ompi/mca/pml/ob1/pml_ob1_rdma.c b/ompi/mca/pml/ob1/pml_ob1_rdma.c
index c8eec33..4e64a87 100644
--- a/ompi/mca/pml/ob1/pml_ob1_rdma.c
+++ b/ompi/mca/pml/ob1/pml_ob1_rdma.c
@@ -30,6 +30,11 @@
 #include "pml_ob1.h"
 #include "pml_ob1_rdma.h"
 
+/* Use this registration instead of NULL if no registration is needed for a
+ * BTL. This will help other code to distinguish the case when memory is not
+ * registered from the case when registration is not needed */
+static mca_mpool_base_registration_t pml_ob1_dummy_reg;
+
 /*
  * Check to see if memory is registered or can be registered. Build a
  * set of registrations on the request.
@@ -39,11 +44,11 @@
 size_t mca_pml_ob1_rdma_btls(
     mca_bml_base_endpoint_t* bml_endpoint,
     unsigned char* base,
     size_t size,
-    mca_pml_ob1_rdma_btl_t* rdma_btls)
+    mca_pml_ob1_com_btl_t* rdma_btls)
 {
-    size_t num_btls = mca_bml_base_btl_array_get_size(&bml_endpoint->btl_rdma);
-    size_t num_btls_used = 0;
-    size_t n;
+    int num_btls = mca_bml_base_btl_array_get_size(&bml_endpoint->btl_rdma);
+    double weight_total = 0;
+    int num_btls_used = 0, n;
 
     /* shortcut when there are no rdma capable btls */
     if(num_btls == 0) {
@@ -51,7 +56,7 @@ size_t mca_pml_ob1_rdma_btls(
     }
 
     /* check to see if memory is registered */
-    for(n = 0; n < num_btls && num_btls_used < MCA_PML_OB1_MAX_RDMA_PER_REQUEST;
+    for(n = 0; n < num_btls && num_btls_used < mca_pml_ob1.max_rdma_per_request;
             n++) {
         mca_bml_base_btl_t* bml_btl =
             mca_bml_base_btl_array_get_index(&bml_endpoint->btl_rdma, n);
@@ -59,10 +64,7 @@ size_t mca_pml_ob1_rdma_btls(
         mca_mpool_base_registration_t* reg = NULL;
         mca_mpool_base_module_t *btl_mpool = bml_btl->btl_mpool;
 
-        /* btl is rdma capable and registration is not required */
-        if(NULL == btl_mpool) {
-            reg = NULL;
-        } else {
+        if(NULL != btl_mpool) {
             if(!mca_pml_ob1.leave_pinned) {
                 /* look through existing registrations */
                 btl_mpool->mpool_find(btl_mpool, base, size, &reg);
@@ -73,14 +75,52 @@ size_t mca_pml_ob1_rdma_btls(
             if(NULL == reg)
                 bml_btl = NULL; /* skip it */
+        } else {
+            /* if registration is not required use dummy registration */
+            reg = &pml_ob1_dummy_reg;
         }
 
         if(bml_btl != NULL) {
             rdma_btls[num_btls_used].bml_btl = bml_btl;
             rdma_btls[num_btls_used].btl_reg = reg;
+            weight_total += bml_btl->btl_weight;
             num_btls_used++;
         }
     }
+
+    /* if we don't use leave_pinned and all BTLs that already have this memory
+     * registered amount to less than half of available bandwidth - fall back
+     * to pipeline protocol */
+    if(0 == num_btls_used || (!mca_pml_ob1.leave_pinned && weight_total < 0.5))
+        return 0;
+
+    mca_pml_ob1_calc_weighted_length(rdma_btls, num_btls_used, size,
+                                     weight_total);
+
     bml_endpoint->btl_rdma_index = (bml_endpoint->btl_rdma_index + 1) % num_btls;
 
     return num_btls_used;
 }
+
+size_t mca_pml_ob1_rdma_pipeline_btls(
+    mca_bml_base_endpoint_t* bml_endpoint,
+    size_t size,
+    mca_pml_ob1_com_btl_t* rdma_btls)
+{
+    int i, num_btls = mca_bml_base_btl_array_get_size(&bml_endpoint->btl_rdma);
+    double weight_total = 0;
+
+    for(i = 0; i < num_btls && i < mca_pml_ob1.max_rdma_per_request; i++) {
+        rdma_btls[i].bml_btl =
+            mca_bml_base_btl_array_get_next(&bml_endpoint->btl_rdma);
+        if(rdma_btls[i].bml_btl->btl_mpool != NULL)
+            rdma_btls[i].btl_reg = NULL;
+        else
+            rdma_btls[i].btl_reg = &pml_ob1_dummy_reg;
+
+        weight_total += rdma_btls[i].bml_btl->btl_weight;
+    }
+
+    mca_pml_ob1_calc_weighted_length(rdma_btls, i, size, weight_total);
+
+    return i;
+}
diff --git a/ompi/mca/pml/ob1/pml_ob1_rdma.h b/ompi/mca/pml/ob1/pml_ob1_rdma.h
index d2c983c..3ed0655 100644
--- a/ompi/mca/pml/ob1/pml_ob1_rdma.h
+++ b/ompi/mca/pml/ob1/pml_ob1_rdma.h
@@ -24,27 +24,18 @@
 
 struct mca_bml_base_endpoint_t;
 
-/**
- * structure to associate RDMA capable BTL(s) with corresponding registration
- */
-
-struct mca_pml_ob1_rdma_btl_t {
-    struct mca_bml_base_btl_t* bml_btl;
-    struct mca_mpool_base_registration_t* btl_reg;
-};
-typedef struct mca_pml_ob1_rdma_btl_t mca_pml_ob1_rdma_btl_t;
-
-
-#define MCA_PML_OB1_MAX_RDMA_PER_REQUEST 4
-
-
 /*
  * Of the set of available btls that support RDMA,
  * find those that already have registrations - or
  * register if required (for leave_pinned option)
  */
 size_t mca_pml_ob1_rdma_btls(struct mca_bml_base_endpoint_t* endpoint,
-    unsigned char* base, size_t size, struct mca_pml_ob1_rdma_btl_t* btls);
+    unsigned char* base, size_t size, struct mca_pml_ob1_com_btl_t* btls);
 
+/* Choose RDMA BTLs to use for sending a request by the pipeline protocol.
+ * Calculate the number of bytes to send through each BTL according to
+ * available bandwidth */
+size_t mca_pml_ob1_rdma_pipeline_btls(struct mca_bml_base_endpoint_t* endpoint,
+    size_t size, mca_pml_ob1_com_btl_t* rdma_btls);
 
 #endif
diff --git a/ompi/mca/pml/ob1/pml_ob1_recvreq.c b/ompi/mca/pml/ob1/pml_ob1_recvreq.c
index 33c7c3e..c47003c 100644
--- a/ompi/mca/pml/ob1/pml_ob1_recvreq.c
+++ b/ompi/mca/pml/ob1/pml_ob1_recvreq.c
@@ -240,13 +240,15 @@ static int mca_pml_ob1_recv_request_ack(
     /* by default copy everything */
     recvreq->req_send_offset = bytes_received;
     if(hdr->hdr_msg_length > bytes_received) {
+        size_t rdma_num = mca_bml_base_btl_array_get_size(&bml_endpoint->btl_rdma);
         /*
          * lookup request buffer to determine if memory is already
         * registered.
         */
        if(ompi_convertor_need_buffers(&recvreq->req_recv.req_convertor) == 0 &&
-           hdr->hdr_match.hdr_common.hdr_flags & MCA_PML_OB1_HDR_FLAGS_CONTIG) {
+           hdr->hdr_match.hdr_common.hdr_flags & MCA_PML_OB1_HDR_FLAGS_CONTIG &&
+           rdma_num != 0) {
             unsigned char *base;
             ompi_convertor_get_current_pointer( &recvreq->req_recv.req_convertor, (void**)&(base) );
@@ -261,19 +263,23 @@ static int mca_pml_ob1_recv_request_ack(
             recvreq->req_send_offset = hdr->hdr_msg_length;
         /* are rdma devices available for long rdma protocol */
-        } else if (bml_endpoint->btl_send_limit < hdr->hdr_msg_length &&
-                   bml_endpoint->btl_pipeline_send_length <
-                   hdr->hdr_msg_length &&
-                   mca_bml_base_btl_array_get_size(&bml_endpoint->btl_rdma)) {
-
+        } else if(bml_endpoint->btl_send_limit < hdr->hdr_msg_length) {
             /* use convertor to figure out the rdma offset for this request */
             recvreq->req_send_offset = hdr->hdr_msg_length -
                 bml_endpoint->btl_pipeline_send_length;
-            if(recvreq->req_send_offset < bytes_received) {
+
+            if(recvreq->req_send_offset < bytes_received)
                 recvreq->req_send_offset = bytes_received;
-            }
-            ompi_convertor_set_position( &recvreq->req_recv.req_convertor,
-                                         &recvreq->req_send_offset );
+
+            /* use convertor to figure out the rdma offset for this
+             * request */
+            ompi_convertor_set_position(&recvreq->req_recv.req_convertor,
+                                        &recvreq->req_send_offset);
+
+            recvreq->req_rdma_cnt =
+                mca_pml_ob1_rdma_pipeline_btls(bml_endpoint,
+                        recvreq->req_send_offset - bytes_received,
+                        recvreq->req_rdma);
         }
     }
     /* nothing to send by copy in/out - no need to ack */
@@ -592,14 +598,8 @@ void mca_pml_ob1_recv_request_matched_probe(
 int mca_pml_ob1_recv_request_schedule_exclusive( mca_pml_ob1_recv_request_t* recvreq )
 {
     ompi_proc_t* proc = recvreq->req_recv.req_base.req_proc;
-    mca_bml_base_endpoint_t* bml_endpoint = (mca_bml_base_endpoint_t*) proc->proc_bml;
     mca_bml_base_btl_t* bml_btl;
-    int num_btl_avail =
-        mca_bml_base_btl_array_get_size(&bml_endpoint->btl_rdma);
-    int num_tries = num_btl_avail;
-
-    if(recvreq->req_rdma_cnt)
-        num_tries = recvreq->req_rdma_cnt;
+    int num_tries = recvreq->req_rdma_cnt;
 
     do {
         size_t bytes_remaining = recvreq->req_send_offset -
@@ -615,8 +615,8 @@ int mca_pml_ob1_recv_request_schedule_exclusive( mca_pml_ob1_recv_request_t* rec
             mca_btl_base_descriptor_t* dst;
             mca_btl_base_descriptor_t* ctl;
             mca_mpool_base_registration_t * reg = NULL;
-            int rc;
-
+            int rc, rdma_idx;
+
             if(prev_bytes_remaining == bytes_remaining) {
                 if( ++num_fail == num_tries ) {
                     OPAL_THREAD_LOCK(&mca_pml_ob1.lock);
@@ -636,47 +636,18 @@ int mca_pml_ob1_recv_request_schedule_exclusive( mca_pml_ob1_recv_request_t* rec
             ompi_convertor_set_position(&recvreq->req_recv.req_convertor,
                                         &recvreq->req_rdma_offset);
 
-            if(recvreq->req_rdma_cnt) {
-                /*
-                 * Select the next btl out of the list w/ preregistered
-                 * memory.
-                 */
-                bml_btl = recvreq->req_rdma[recvreq->req_rdma_idx].bml_btl;
-                num_btl_avail = recvreq->req_rdma_cnt - recvreq->req_rdma_idx;
-                reg = recvreq->req_rdma[recvreq->req_rdma_idx].btl_reg;
-
-                if(++recvreq->req_rdma_idx >= recvreq->req_rdma_cnt)
+            do {
+                rdma_idx = recvreq->req_rdma_idx;
+                bml_btl = recvreq->req_rdma[rdma_idx].bml_btl;
+                reg = recvreq->req_rdma[rdma_idx].btl_reg;
+                size = recvreq->req_rdma[rdma_idx].length;
+                if(++recvreq->req_rdma_idx >= recvreq->req_rdma_cnt)
                     recvreq->req_rdma_idx = 0;
-            } else {
-                /*
-                 * Otherwise, schedule round-robin across the
-                 * available RDMA nics dynamically registering/deregister
-                 * as required.
-                 */
-                bml_btl =
-                    mca_bml_base_btl_array_get_next(&bml_endpoint->btl_rdma);
-            }
+            } while(!size);
 
-            /*
-             * If more than one NIC is available - try to use both for
-             * anything larger than the eager limit
-             */
-            if( num_btl_avail == 1 ||
-                bytes_remaining < bml_btl->btl_eager_limit ) {
-                size = bytes_remaining;
-            } else {
-                /*
-                 * otherwise attempt to give the BTL a percentage of
-                 * the message based on a weighting factor. for
-                 * simplicity calculate this as a percentage of the
-                 * overall message length (regardless of amount
-                 * previously assigned)
-                 */
-                size = (size_t)(bml_btl->btl_weight * bytes_remaining);
-            }
             /* makes sure that we don't exceed BTL max rdma size
              * if memory is not pinned already */
-            if(0 == recvreq->req_rdma_cnt &&
+            if(NULL == reg &&
                bml_btl->btl_rdma_pipeline_frag_size != 0 &&
                size > bml_btl->btl_rdma_pipeline_frag_size) {
                 size = bml_btl->btl_rdma_pipeline_frag_size;
@@ -684,8 +655,8 @@ int mca_pml_ob1_recv_request_schedule_exclusive( mca_pml_ob1_recv_request_t* rec
 
             /* prepare a descriptor for RDMA */
             mca_bml_base_prepare_dst(bml_btl, reg,
-                    &recvreq->req_recv.req_convertor, MCA_BTL_NO_ORDER,
-                    0, &size, &dst);
+                    &recvreq->req_recv.req_convertor,
+                    MCA_BTL_NO_ORDER, 0, &size, &dst);
             if(dst == NULL) {
                 continue;
             }
@@ -751,6 +722,7 @@ int mca_pml_ob1_recv_request_schedule_exclusive( mca_pml_ob1_recv_request_t* rec
                 /* update request state */
                 recvreq->req_rdma_offset += size;
                 OPAL_THREAD_ADD_SIZE_T(&recvreq->req_pipeline_depth,1);
+                recvreq->req_rdma[rdma_idx].length -= size;
                 bytes_remaining -= size;
             } else {
                 mca_bml_base_free(bml_btl,ctl);
diff --git a/ompi/mca/pml/ob1/pml_ob1_recvreq.h b/ompi/mca/pml/ob1/pml_ob1_recvreq.h
index 0928e02..1c67d1d 100644
--- a/ompi/mca/pml/ob1/pml_ob1_recvreq.h
+++ b/ompi/mca/pml/ob1/pml_ob1_recvreq.h
@@ -48,11 +48,11 @@ struct mca_pml_ob1_recv_request_t {
     size_t req_bytes_delivered;
     size_t req_rdma_offset;
     size_t req_send_offset;
-    mca_pml_ob1_rdma_btl_t req_rdma[MCA_PML_OB1_MAX_RDMA_PER_REQUEST];
     uint32_t req_rdma_cnt;
     uint32_t req_rdma_idx;
     bool req_pending;
     bool req_ack_sent; /**< whether ack was sent to the sender */
+    mca_pml_ob1_com_btl_t req_rdma[1];
 };
 typedef struct mca_pml_ob1_recv_request_t mca_pml_ob1_recv_request_t;
 
@@ -135,7 +135,7 @@ do {                                                    \
     for( r = 0; r < recvreq->req_rdma_cnt; r++ ) {                      \
         mca_mpool_base_registration_t* btl_reg = recvreq->req_rdma[r].btl_reg; \
-        if( NULL != btl_reg ) {                                         \
+        if( NULL != btl_reg && btl_reg->mpool != NULL) {                \
             btl_reg->mpool->mpool_deregister( btl_reg->mpool, btl_reg ); \
         }                                                               \
     }                                                                   \
diff --git a/ompi/mca/pml/ob1/pml_ob1_sendreq.c b/ompi/mca/pml/ob1/pml_ob1_sendreq.c
index 31549bc..94923f1 100644
--- a/ompi/mca/pml/ob1/pml_ob1_sendreq.c
+++ b/ompi/mca/pml/ob1/pml_ob1_sendreq.c
@@ -883,7 +883,10 @@ void mca_pml_ob1_send_requst_copy_in_out(mca_pml_ob1_send_request_t *sendreq,
 {
     mca_pml_ob1_send_range_t *sr;
     ompi_free_list_item_t *i;
-    int rc = OMPI_SUCCESS;
+    mca_bml_base_endpoint_t* bml_endpoint = sendreq->req_endpoint;
+    int num_btls = mca_bml_base_btl_array_get_size(&bml_endpoint->btl_send);
+    int rc = OMPI_SUCCESS, n;
+    double weight_total = 0;
 
     if(0 == send_length)
         return;
@@ -894,6 +897,18 @@ void mca_pml_ob1_send_requst_copy_in_out(mca_pml_ob1_send_request_t *sendreq,
 
     sr->range_send_offset = send_offset;
     sr->range_send_length = send_length;
+    sr->range_btl_idx = 0;
+
+    for(n = 0; n < num_btls && n < mca_pml_ob1.max_send_per_range; n++) {
+        sr->range_btls[n].bml_btl =
+            mca_bml_base_btl_array_get_next(&bml_endpoint->btl_send);
+        weight_total += sr->range_btls[n].bml_btl->btl_weight;
+    }
+
+    sr->range_btl_cnt = n;
+    mca_pml_ob1_calc_weighted_length(sr->range_btls, n, send_length,
+                                     weight_total);
+
     OPAL_THREAD_LOCK(&sendreq->req_send_range_lock);
     opal_list_append(&sendreq->req_send_ranges, (opal_list_item_t*)sr);
     OPAL_THREAD_UNLOCK(&sendreq->req_send_range_lock);
@@ -910,22 +925,19 @@ void mca_pml_ob1_send_requst_copy_in_out(mca_pml_ob1_send_request_t *sendreq,
 int mca_pml_ob1_send_request_schedule_exclusive(
     mca_pml_ob1_send_request_t* sendreq)
 {
-    mca_bml_base_endpoint_t* bml_endpoint = sendreq->req_endpoint;
-    size_t num_btl_avail =
-        mca_bml_base_btl_array_get_size(&bml_endpoint->btl_send);
     do {
-        size_t prev_bytes_remaining = 0, num_fail = 0;
+        size_t prev_bytes_remaining = 0;
         mca_pml_ob1_send_range_t *range = NULL;
+        int num_fail = 0;
 
         while(true) {
             mca_pml_ob1_frag_hdr_t* hdr;
             mca_btl_base_descriptor_t* des;
-            int rc;
+            int rc, btl_idx;
             size_t size, offset;
             opal_list_item_t *item;
-            mca_bml_base_btl_t* bml_btl =
-                mca_bml_base_btl_array_get_next(&bml_endpoint->btl_send);
+            mca_bml_base_btl_t* bml_btl;
 
             if(NULL == range || 0 == range->range_send_length) {
                 OPAL_THREAD_LOCK(&sendreq->req_send_range_lock);
@@ -957,7 +969,7 @@ int mca_pml_ob1_send_request_schedule_exclusive(
 
             prev_bytes_remaining = range->range_send_length;
 
-            if (num_fail == num_btl_avail) {
+            if (num_fail == range->range_btl_cnt) {
                 assert(sendreq->req_pending == MCA_PML_OB1_SEND_PENDING_NONE);
                 OPAL_THREAD_LOCK(&mca_pml_ob1.lock);
                 sendreq->req_pending = MCA_PML_OB1_SEND_PENDING_SCHEDULE;
@@ -967,17 +979,13 @@ int mca_pml_ob1_send_request_schedule_exclusive(
                 return OMPI_ERR_OUT_OF_RESOURCE;
             }
 
-            if(num_btl_avail == 1 ||
-               range->range_send_length < bml_btl->btl_min_send_size) {
-                size = range->range_send_length;
-            } else {
-                /* otherwise attempt to give the BTL a percentage of the message
-                 * based on a weighting factor. for simplicity calculate this as
-                 * a percentage of the overall message length (regardless of
-                 * amount previously assigned)
-                 */
-                size = (size_t)(bml_btl->btl_weight * range->range_send_length);
-            }
+            do {
+                btl_idx = range->range_btl_idx;
+                bml_btl = range->range_btls[btl_idx].bml_btl;
+                size = range->range_btls[btl_idx].length;
+                if(++range->range_btl_idx == range->range_btl_cnt)
+                    range->range_btl_idx = 0;
+            } while(!size);
 
             /* makes sure that we don't exceed BTL max send size */
             if (bml_btl->btl_max_send_size != 0 &&
@@ -1035,8 +1043,9 @@ int mca_pml_ob1_send_request_schedule_exclusive(
 
             rc = mca_bml_base_send(bml_btl, des, MCA_BTL_TAG_PML);
             if(rc == OMPI_SUCCESS) {
-                range->range_send_length -= size;
                 /* update state */
+                range->range_btls[btl_idx].length -= size;
+                range->range_send_length -= size;
                 range->range_send_offset += size;
                 OPAL_THREAD_ADD_SIZE_T(&sendreq->req_pipeline_depth, 1);
             } else {
diff --git a/ompi/mca/pml/ob1/pml_ob1_sendreq.h b/ompi/mca/pml/ob1/pml_ob1_sendreq.h
index 30a7783..f1394c0 100644
--- a/ompi/mca/pml/ob1/pml_ob1_sendreq.h
+++ b/ompi/mca/pml/ob1/pml_ob1_sendreq.h
@@ -54,11 +54,11 @@ struct mca_pml_ob1_send_request_t {
     bool req_throttle_sends;
     size_t req_pipeline_depth;
     size_t req_bytes_delivered;
-    mca_pml_ob1_rdma_btl_t req_rdma[MCA_PML_OB1_MAX_RDMA_PER_REQUEST];
     uint32_t req_rdma_cnt;
     mca_pml_ob1_send_pending_t req_pending;
     opal_mutex_t req_send_range_lock;
     opal_list_t req_send_ranges;
+    mca_pml_ob1_com_btl_t req_rdma[1];
 };
 typedef struct mca_pml_ob1_send_request_t mca_pml_ob1_send_request_t;
 
@@ -68,6 +68,9 @@ struct mca_pml_ob1_send_range_t {
     ompi_free_list_item_t base;
     uint64_t range_send_offset;
     uint64_t range_send_length;
+    int range_btl_idx;
+    int range_btl_cnt;
+    mca_pml_ob1_com_btl_t range_btls[1];
 };
 typedef struct mca_pml_ob1_send_range_t mca_pml_ob1_send_range_t;
 OBJ_CLASS_DECLARATION(mca_pml_ob1_send_range_t);
@@ -122,7 +125,7 @@ static inline void mca_pml_ob1_free_rdma_resources(mca_pml_ob1_send_request_t* s
     /* return mpool resources */
     for(r = 0; r < sendreq->req_rdma_cnt; r++) {
         mca_mpool_base_registration_t* reg = sendreq->req_rdma[r].btl_reg;
-        if( NULL != reg ) {
+        if( NULL != reg && reg->mpool != NULL ) {
             reg->mpool->mpool_deregister(reg->mpool, reg);
         }
     }
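P.S. The per-request and per-range BTL limits are registered as run-time parameters, so (assuming the usual pml_ob1_ prefix that mca_pml_ob1_param_register_int() produces) they should be tunable without rebuilding, e.g.:

  mpirun -mca pml_ob1_max_rdma_per_request 8 -mca pml_ob1_max_send_per_range 8 ...

And since the split now follows btl_weight, which the BML derives from each BTL's btl_bandwidth, overriding a BTL's bandwidth parameter is the knob to turn if a link's reported bandwidth looks wrong.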
Attachment: openib_mulihca_bw.ps (PostScript document)