On Tue, Jun 26, 2007 at 05:42:05PM -0400, George Bosilca wrote:
> Gleb,
>
> Simplifying the code and getting better performance is always a good
> approach (at least from my perspective). However, your patch still
> dispatches the messages over the BTLs in a round-robin fashion, which
> doesn't look to me like the best approach. How about merging your patch
> and mine ? We will get a better data distribution and a better
> scheduling (on-demand based on the network load).
The attached patch adds this on top of my previous patch. The performance
on my setup is a little bit worse with this patch applied.
>
> Btw, did you compare my patch with yours on your multi-NIC system ?
> With my patch on our system with 3 networks (two 1 Gb/s and one 100 Mb/s)
> I'm close to 99% of the total bandwidth. I'll try to see what I get
> with yours.
Your patch segfaults (SEGV) on my setup, so I can't check and compare. I see
this in your patch:
+ reg = recvreq->req_rdma[bml_btl->btl_index].btl_reg;
But bml_btl->btl_index is not an index into the req_rdma array, and actually
we never initialize bml_btl->btl_index at all, so maybe it would be a good
idea to remove this field altogether. TCP never uses reg, so there is no
problem there, but for IB it should be valid.
--
Gleb.
diff --git a/ompi/mca/pml/ob1/pml_ob1_recvreq.c b/ompi/mca/pml/ob1/pml_ob1_recvreq.c
index c47003c..94f897a 100644
--- a/ompi/mca/pml/ob1/pml_ob1_recvreq.c
+++ b/ompi/mca/pml/ob1/pml_ob1_recvreq.c
@@ -47,7 +47,7 @@ void mca_pml_ob1_recv_request_process_pending(void)
if(NULL == recvreq)
break;
recvreq->req_pending = false;
- if(mca_pml_ob1_recv_request_schedule_exclusive(recvreq) ==
+ if(mca_pml_ob1_recv_request_schedule_exclusive(recvreq, NULL) ==
OMPI_ERR_OUT_OF_RESOURCE)
break;
}
@@ -170,7 +170,7 @@ static void mca_pml_ob1_put_completion( mca_btl_base_module_t* btl,
MCA_PML_OB1_RECV_REQUEST_PML_COMPLETE( recvreq );
} else if (recvreq->req_rdma_offset < recvreq->req_send_offset) {
/* schedule additional rdma operations */
- mca_pml_ob1_recv_request_schedule(recvreq);
+ mca_pml_ob1_recv_request_schedule(recvreq, bml_btl);
}
MCA_PML_OB1_PROGRESS_PENDING(bml_btl);
}
@@ -548,7 +548,7 @@ void mca_pml_ob1_recv_request_progress(
MCA_PML_OB1_RECV_REQUEST_PML_COMPLETE( recvreq );
} else if (recvreq->req_rdma_offset < recvreq->req_send_offset) {
/* schedule additional rdma operations */
- mca_pml_ob1_recv_request_schedule(recvreq);
+ mca_pml_ob1_recv_request_schedule(recvreq, NULL);
}
}
@@ -595,22 +595,42 @@ void mca_pml_ob1_recv_request_matched_probe(
*
*/
-int mca_pml_ob1_recv_request_schedule_exclusive( mca_pml_ob1_recv_request_t* recvreq )
+int mca_pml_ob1_recv_request_schedule_exclusive(
+ mca_pml_ob1_recv_request_t* recvreq,
+ mca_bml_base_btl_t *start_bml_btl)
{
ompi_proc_t* proc = recvreq->req_recv.req_base.req_proc;
mca_bml_base_btl_t* bml_btl;
int num_tries = recvreq->req_rdma_cnt;
+ size_t i;
+ size_t bytes_remaining = recvreq->req_send_offset -
+ recvreq->req_rdma_offset;
+
+ if(bytes_remaining == 0) {
+ OPAL_THREAD_ADD32(&recvreq->req_lock, -recvreq->req_lock);
+ return;
+ }
+
+ /* if starting bml_btl is provided schedule next fragment on it first */
+ if(start_bml_btl != NULL) {
+ for(i = 0; i < recvreq->req_rdma_cnt; i++) {
+ if(recvreq->req_rdma[i].bml_btl != start_bml_btl)
+ continue;
+ /* something left to be send? */
+ if(recvreq->req_rdma[i].length)
+ recvreq->req_rdma_idx = i;
+ break;
+ }
+ }
do {
- size_t bytes_remaining = recvreq->req_send_offset -
- recvreq->req_rdma_offset;
size_t prev_bytes_remaining = 0;
int num_fail = 0;
while( bytes_remaining > 0 &&
recvreq->req_pipeline_depth < mca_pml_ob1.recv_pipeline_depth ) {
size_t hdr_size;
- size_t size, i;
+ size_t size;
mca_pml_ob1_rdma_hdr_t* hdr;
mca_btl_base_descriptor_t* dst;
mca_btl_base_descriptor_t* ctl;
@@ -733,6 +753,7 @@ int mca_pml_ob1_recv_request_schedule_exclusive( mca_pml_ob1_recv_request_t* rec
/* run progress as the prepare (pinning) can take some time */
mca_bml.bml_progress();
}
+ bytes_remaining = recvreq->req_send_offset - recvreq->req_rdma_offset;
} while(OPAL_THREAD_ADD32(&recvreq->req_lock,-1) > 0);
return OMPI_SUCCESS;
diff --git a/ompi/mca/pml/ob1/pml_ob1_recvreq.h b/ompi/mca/pml/ob1/pml_ob1_recvreq.h
index 1c67d1d..247f944 100644
--- a/ompi/mca/pml/ob1/pml_ob1_recvreq.h
+++ b/ompi/mca/pml/ob1/pml_ob1_recvreq.h
@@ -333,13 +333,14 @@ void mca_pml_ob1_recv_request_matched_probe(
*/
int mca_pml_ob1_recv_request_schedule_exclusive(
- mca_pml_ob1_recv_request_t* req);
+ mca_pml_ob1_recv_request_t* req, mca_bml_base_btl_t* start_bml_btl);
static inline void mca_pml_ob1_recv_request_schedule(
- mca_pml_ob1_recv_request_t* req)
+ mca_pml_ob1_recv_request_t* req,
+ mca_bml_base_btl_t* start_bml_btl)
{
if(OPAL_THREAD_ADD32(&req->req_lock,1) == 1)
- mca_pml_ob1_recv_request_schedule_exclusive(req);
+ mca_pml_ob1_recv_request_schedule_exclusive(req, start_bml_btl);
}
#define MCA_PML_OB1_ADD_ACK_TO_PENDING(P, S, D, O) \