On Tue, Jun 26, 2007 at 05:42:05PM -0400, George Bosilca wrote:
> Gleb,
> 
> Simplifying the code and getting better performance is always a good
> approach (at least from my perspective). However, your patch still
> dispatches the messages over the BTLs in round-robin fashion, which
> doesn't look to me like the best approach. How about merging your patch
> and mine? We would get better data distribution and better scheduling
> (on-demand, based on the network load).
Attached patch adds this on top of my previous patch. The performance on
my setup is a little bit worse with this patch applied.
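
To spell out what the attached patch does: when a put completes on some
BTL, the next fragment is scheduled on that same BTL first, and only then
does scheduling fall back to the usual round-robin walk over the remaining
BTLs. A minimal self-contained sketch of just that selection step
(hypothetical names, not the real ob1 structures):

    #include <stddef.h>

    /* Sketch: choose the BTL to start the next scheduling pass from.
     * rdma[] holds one entry per BTL taking part in the transfer and
     * idx is the round-robin cursor kept in the request. */
    struct rdma_ent { void *btl; size_t length; };

    static void pick_start(struct rdma_ent *rdma, size_t cnt,
                           size_t *idx, void *completed_btl)
    {
        size_t i;
        if (completed_btl == NULL)
            return;                  /* no hint: keep round-robin order */
        for (i = 0; i < cnt; i++) {
            if (rdma[i].btl != completed_btl)
                continue;
            if (rdma[i].length > 0)  /* bytes still left on this BTL? */
                *idx = i;            /* start scheduling here */
            break;
        }
    }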

> 
> Btw, did you compare my patch with yours on your multi-NIC system?
> With my patch, on our system with 3 networks (2 * 1 Gb/s and one
> 100 Mb/s), I'm close to 99% of the total bandwidth. I'll try to see
> what I get with yours.
Your patch SEGVs on my setup, so I can't check and compare. I see this in
your patch:
+     reg = recvreq->req_rdma[bml_btl->btl_index].btl_reg;
But bml_btl->btl_index is not an index into the req_rdma array, and we
actually never initialize bml_btl->btl_index at all, so maybe it would be
a good idea to remove this field altogether. TCP never uses reg, so there
is no problem there, but for IB it has to be valid.
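
In other words, the registration has to be found by scanning req_rdma for
the entry that references this bml_btl. Roughly (a fragment reusing the
identifiers from your hunk; I'm assuming btl_reg is the mpool registration
pointer stored in req_rdma[]):

    /* wrong:  reg = recvreq->req_rdma[bml_btl->btl_index].btl_reg;
     * right:  look the entry up by its bml_btl pointer */
    reg = NULL;
    for(i = 0; i < recvreq->req_rdma_cnt; i++) {
        if(recvreq->req_rdma[i].bml_btl == bml_btl) {
            reg = recvreq->req_rdma[i].btl_reg;
            break;
        }
    }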

--
                        Gleb.
diff --git a/ompi/mca/pml/ob1/pml_ob1_recvreq.c b/ompi/mca/pml/ob1/pml_ob1_recvreq.c
index c47003c..94f897a 100644
--- a/ompi/mca/pml/ob1/pml_ob1_recvreq.c
+++ b/ompi/mca/pml/ob1/pml_ob1_recvreq.c
@@ -47,7 +47,7 @@ void mca_pml_ob1_recv_request_process_pending(void)
         if(NULL == recvreq)
             break;
         recvreq->req_pending = false;
-        if(mca_pml_ob1_recv_request_schedule_exclusive(recvreq) == 
+        if(mca_pml_ob1_recv_request_schedule_exclusive(recvreq, NULL) == 
                 OMPI_ERR_OUT_OF_RESOURCE)
             break;
     }
@@ -170,7 +170,7 @@ static void mca_pml_ob1_put_completion( mca_btl_base_module_t* btl,
         MCA_PML_OB1_RECV_REQUEST_PML_COMPLETE( recvreq );
     } else if (recvreq->req_rdma_offset < recvreq->req_send_offset) {
         /* schedule additional rdma operations */
-        mca_pml_ob1_recv_request_schedule(recvreq);
+        mca_pml_ob1_recv_request_schedule(recvreq, bml_btl);
     }
     MCA_PML_OB1_PROGRESS_PENDING(bml_btl);
 }
@@ -548,7 +548,7 @@ void mca_pml_ob1_recv_request_progress(
         MCA_PML_OB1_RECV_REQUEST_PML_COMPLETE( recvreq );
     } else if (recvreq->req_rdma_offset < recvreq->req_send_offset) {
         /* schedule additional rdma operations */
-        mca_pml_ob1_recv_request_schedule(recvreq);
+        mca_pml_ob1_recv_request_schedule(recvreq, NULL);
     }
 }

@@ -595,22 +595,42 @@ void mca_pml_ob1_recv_request_matched_probe(
  *
 */

-int mca_pml_ob1_recv_request_schedule_exclusive( mca_pml_ob1_recv_request_t* recvreq )
+int mca_pml_ob1_recv_request_schedule_exclusive(
+        mca_pml_ob1_recv_request_t* recvreq,
+        mca_bml_base_btl_t *start_bml_btl)
 {
     ompi_proc_t* proc = recvreq->req_recv.req_base.req_proc;
     mca_bml_base_btl_t* bml_btl; 
     int num_tries = recvreq->req_rdma_cnt;
+    size_t i;
+    size_t bytes_remaining = recvreq->req_send_offset -
+        recvreq->req_rdma_offset;
+
+    if(bytes_remaining == 0) {
+        OPAL_THREAD_ADD32(&recvreq->req_lock, -recvreq->req_lock);
+        return OMPI_SUCCESS;
+    }
+
+    /* if starting bml_btl is provided schedule next fragment on it first */
+    if(start_bml_btl != NULL) {
+        for(i = 0; i < recvreq->req_rdma_cnt; i++) {
+            if(recvreq->req_rdma[i].bml_btl != start_bml_btl)
+                continue;
+            /* something left to be sent? */
+            if(recvreq->req_rdma[i].length)
+                recvreq->req_rdma_idx = i;
+            break;
+        }
+    }

     do {
-        size_t bytes_remaining = recvreq->req_send_offset -
-            recvreq->req_rdma_offset;
         size_t prev_bytes_remaining = 0;
         int num_fail = 0;

         while( bytes_remaining > 0 &&
                recvreq->req_pipeline_depth < mca_pml_ob1.recv_pipeline_depth ) {
             size_t hdr_size;
-            size_t size, i;
+            size_t size;
             mca_pml_ob1_rdma_hdr_t* hdr;
             mca_btl_base_descriptor_t* dst;
             mca_btl_base_descriptor_t* ctl;
@@ -733,6 +753,7 @@ int mca_pml_ob1_recv_request_schedule_exclusive( mca_pml_ob1_recv_request_t* rec
             /* run progress as the prepare (pinning) can take some time */
             mca_bml.bml_progress();
         }
+        bytes_remaining = recvreq->req_send_offset - recvreq->req_rdma_offset;
     } while(OPAL_THREAD_ADD32(&recvreq->req_lock,-1) > 0);

     return OMPI_SUCCESS;
diff --git a/ompi/mca/pml/ob1/pml_ob1_recvreq.h b/ompi/mca/pml/ob1/pml_ob1_recvreq.h
index 1c67d1d..247f944 100644
--- a/ompi/mca/pml/ob1/pml_ob1_recvreq.h
+++ b/ompi/mca/pml/ob1/pml_ob1_recvreq.h
@@ -333,13 +333,14 @@ void mca_pml_ob1_recv_request_matched_probe(
  */

 int mca_pml_ob1_recv_request_schedule_exclusive(
-    mca_pml_ob1_recv_request_t* req);
+    mca_pml_ob1_recv_request_t* req, mca_bml_base_btl_t* start_bml_btl);

 static inline void mca_pml_ob1_recv_request_schedule(
-        mca_pml_ob1_recv_request_t* req)
+        mca_pml_ob1_recv_request_t* req,
+        mca_bml_base_btl_t* start_bml_btl)
 {
     if(OPAL_THREAD_ADD32(&req->req_lock,1) == 1)
-        mca_pml_ob1_recv_request_schedule_exclusive(req);
+        mca_pml_ob1_recv_request_schedule_exclusive(req, start_bml_btl);
 }

 #define MCA_PML_OB1_ADD_ACK_TO_PENDING(P, S, D, O)                      \
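
(For clarity, the calling convention after this patch: a completion
callback that knows which BTL just finished passes it as start_bml_btl,
as mca_pml_ob1_put_completion does above; every other caller passes NULL
and keeps the plain round-robin behavior.)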
