On Tue, Jun 26, 2007 at 05:42:05PM -0400, George Bosilca wrote: > Gleb, > > Simplifying the code and getting better performance is always a good > approach (at least from my perspective). However, your patch still > dispatch the messages over the BTLs in a round robin fashion, which > doesn't look to me as the best approach. How about merging your patch > and mine ? We will get a better data distribution and a better > scheduling (on-demand based on the network load). Attached patch adds this on top of my previous patch. The performance on my setup is a little bit worse with this patch applied.
> > Btw, did you compare my patch with yours on your multi-NIC system ? > With my patch on our system with 3 networks (2*1Gbs and one 100 Mbs) > I'm close to 99% of the total bandwidth. I'll try to see what I get > with yours. Your patch SEGVs on my setup, so I can't check and compare. I see this in your patch: + reg = recvreq->req_rdma[bml_btl->btl_index].btl_reg; But bml_btl->btl_index is not an index into the req_rdma array and actually we never initialize bml_btl->btl_index at all, so maybe it would be a good idea to remove this field altogether. TCP never uses reg so no problem there, but for IB it should be valid. -- Gleb.
diff --git a/ompi/mca/pml/ob1/pml_ob1_recvreq.c b/ompi/mca/pml/ob1/pml_ob1_recvreq.c index c47003c..94f897a 100644 --- a/ompi/mca/pml/ob1/pml_ob1_recvreq.c +++ b/ompi/mca/pml/ob1/pml_ob1_recvreq.c @@ -47,7 +47,7 @@ void mca_pml_ob1_recv_request_process_pending(void) if(NULL == recvreq) break; recvreq->req_pending = false; - if(mca_pml_ob1_recv_request_schedule_exclusive(recvreq) == + if(mca_pml_ob1_recv_request_schedule_exclusive(recvreq, NULL) == OMPI_ERR_OUT_OF_RESOURCE) break; } @@ -170,7 +170,7 @@ static void mca_pml_ob1_put_completion( mca_btl_base_module_t* btl, MCA_PML_OB1_RECV_REQUEST_PML_COMPLETE( recvreq ); } else if (recvreq->req_rdma_offset < recvreq->req_send_offset) { /* schedule additional rdma operations */ - mca_pml_ob1_recv_request_schedule(recvreq); + mca_pml_ob1_recv_request_schedule(recvreq, bml_btl); } MCA_PML_OB1_PROGRESS_PENDING(bml_btl); } @@ -548,7 +548,7 @@ void mca_pml_ob1_recv_request_progress( MCA_PML_OB1_RECV_REQUEST_PML_COMPLETE( recvreq ); } else if (recvreq->req_rdma_offset < recvreq->req_send_offset) { /* schedule additional rdma operations */ - mca_pml_ob1_recv_request_schedule(recvreq, NULL); + mca_pml_ob1_recv_request_schedule(recvreq, NULL); } } @@ -595,22 +595,42 @@ void mca_pml_ob1_recv_request_matched_probe( * */ -int mca_pml_ob1_recv_request_schedule_exclusive( mca_pml_ob1_recv_request_t* recvreq ) +int mca_pml_ob1_recv_request_schedule_exclusive( + mca_pml_ob1_recv_request_t* recvreq, + mca_bml_base_btl_t *start_bml_btl) { ompi_proc_t* proc = recvreq->req_recv.req_base.req_proc; mca_bml_base_btl_t* bml_btl; int num_tries = recvreq->req_rdma_cnt; + size_t i; + size_t bytes_remaining = recvreq->req_send_offset - + recvreq->req_rdma_offset; + + if(bytes_remaining == 0) { + OPAL_THREAD_ADD32(&recvreq->req_lock, -recvreq->req_lock); + return OMPI_SUCCESS; + } + + /* if starting bml_btl is provided schedule next fragment on it first */ + if(start_bml_btl != NULL) { + for(i = 0; i < recvreq->req_rdma_cnt; i++) { + if(recvreq->req_rdma[i].bml_btl 
!= start_bml_btl) + continue; + /* something left to be sent? */ + if(recvreq->req_rdma[i].length) + recvreq->req_rdma_idx = i; + break; + } + } do { - size_t bytes_remaining = recvreq->req_send_offset - - recvreq->req_rdma_offset; size_t prev_bytes_remaining = 0; int num_fail = 0; while( bytes_remaining > 0 && recvreq->req_pipeline_depth < mca_pml_ob1.recv_pipeline_depth ) { size_t hdr_size; - size_t size, i; + size_t size; mca_pml_ob1_rdma_hdr_t* hdr; mca_btl_base_descriptor_t* dst; mca_btl_base_descriptor_t* ctl; @@ -733,6 +753,7 @@ int mca_pml_ob1_recv_request_schedule_exclusive( mca_pml_ob1_recv_request_t* rec /* run progress as the prepare (pinning) can take some time */ mca_bml.bml_progress(); } + bytes_remaining = recvreq->req_send_offset - recvreq->req_rdma_offset; } while(OPAL_THREAD_ADD32(&recvreq->req_lock,-1) > 0); return OMPI_SUCCESS; diff --git a/ompi/mca/pml/ob1/pml_ob1_recvreq.h b/ompi/mca/pml/ob1/pml_ob1_recvreq.h index 1c67d1d..247f944 100644 --- a/ompi/mca/pml/ob1/pml_ob1_recvreq.h +++ b/ompi/mca/pml/ob1/pml_ob1_recvreq.h @@ -333,13 +333,14 @@ void mca_pml_ob1_recv_request_matched_probe( */ int mca_pml_ob1_recv_request_schedule_exclusive( - mca_pml_ob1_recv_request_t* req); + mca_pml_ob1_recv_request_t* req, mca_bml_base_btl_t* start_bml_btl); static inline void mca_pml_ob1_recv_request_schedule( - mca_pml_ob1_recv_request_t* req) + mca_pml_ob1_recv_request_t* req, + mca_bml_base_btl_t* start_bml_btl) { if(OPAL_THREAD_ADD32(&req->req_lock,1) == 1) - mca_pml_ob1_recv_request_schedule_exclusive(req); + mca_pml_ob1_recv_request_schedule_exclusive(req, start_bml_btl); } #define MCA_PML_OB1_ADD_ACK_TO_PENDING(P, S, D, O) \