Ok.  :-)

On Apr 24, 2012, at 4:47 PM, Nathan Hjelm wrote:

> This was RFC'd last month. No one objected :)
> 
> -Nathan
> 
> On Tue, 24 Apr 2012, Jeffrey Squyres wrote:
> 
>> There's some pretty extensive ob1 changes in here.
>> 
>> Can we get these reviewed?  Brian / George?
>> 
>> 
>> On Apr 24, 2012, at 4:18 PM, hje...@osl.iu.edu wrote:
>> 
>>> Author: hjelmn
>>> Date: 2012-04-24 16:18:56 EDT (Tue, 24 Apr 2012)
>>> New Revision: 26329
>>> URL: https://svn.open-mpi.org/trac/ompi/changeset/26329
>>> 
>>> Log:
>>> ob1: add support for get fallback on put/send
>>> 
>>> Text files modified:
>>>    trunk/ompi/mca/btl/ugni/btl_ugni_get.c     |    17 ----
>>>    trunk/ompi/mca/btl/ugni/btl_ugni_put.c     |    48 --------------
>>>    trunk/ompi/mca/btl/ugni/btl_ugni_rdma.h    |     7 --
>>>    trunk/ompi/mca/btl/ugni/btl_ugni_smsg.c    |     5 -
>>>    trunk/ompi/mca/btl/ugni/btl_ugni_smsg.h    |     1
>>>    trunk/ompi/mca/pml/ob1/pml_ob1.c           |     5 +
>>>    trunk/ompi/mca/pml/ob1/pml_ob1.h           |     2
>>>    trunk/ompi/mca/pml/ob1/pml_ob1_component.c |     4
>>>    trunk/ompi/mca/pml/ob1/pml_ob1_recvfrag.c  |    15 +++-
>>>    trunk/ompi/mca/pml/ob1/pml_ob1_recvreq.c   |    94 ++++++++++++++++++++++--
>>>    trunk/ompi/mca/pml/ob1/pml_ob1_sendreq.c   |   131 ++++++++++++++++++++++-----------------
>>>    trunk/ompi/mca/pml/ob1/pml_ob1_sendreq.h   |     2
>>>    12 files changed, 182 insertions(+), 149 deletions(-)
>>> 
>>> Modified: trunk/ompi/mca/btl/ugni/btl_ugni_get.c
>>> ==============================================================================
>>> --- trunk/ompi/mca/btl/ugni/btl_ugni_get.c	(original)
>>> +++ trunk/ompi/mca/btl/ugni/btl_ugni_get.c	2012-04-24 16:18:56 EDT (Tue, 24 Apr 2012)
>>> @@ -13,19 +13,6 @@
>>>  #include "btl_ugni_rdma.h"
>>>  #include "btl_ugni_smsg.h"
>>> 
>>> -static int mca_btl_ugni_init_put (struct mca_btl_base_module_t *btl,
>>> -                                  mca_btl_ugni_base_frag_t *frag) {
>>> -    /* off alignment/off size. switch to put */
>>> -    frag->hdr.rdma.src_seg = frag->base.des_src[0];
>>> -    frag->hdr.rdma.dst_seg = frag->base.des_dst[0];
>>> -    frag->hdr.rdma.ctx = (void *) frag;
>>> -
>>> -    /* send the fragment header using smsg. ignore local completion */
>>> -    return ompi_mca_btl_ugni_smsg_send (frag, true, &frag->hdr.rdma,
>>> -                                        sizeof (frag->hdr.rdma), NULL, 0,
>>> -                                        MCA_BTL_UGNI_TAG_PUT_INIT);
>>> -}
>>> -
>>>  /**
>>>   * Initiate a get operation.
>>>   *
>>> @@ -54,7 +41,7 @@
>>> 
>>>      if (OPAL_UNLIKELY(check || size > mca_btl_ugni_component.ugni_get_limit)) {
>>>          /* switch to put */
>>> -        return mca_btl_ugni_init_put (btl, frag);
>>> +        return OMPI_ERR_NOT_AVAILABLE;
>>>      }
>>> 
>>>      if (NULL != frag->base.des_cbfunc) {
>>> @@ -68,7 +55,7 @@
>>>      return mca_btl_ugni_post_bte (frag, GNI_POST_RDMA_GET, des->des_dst, des->des_src);
>>>  }
>>> 
>>> -void mca_btl_ugni_callback_rdma_complete (mca_btl_ugni_base_frag_t *frag, int rc)
>>> +static void mca_btl_ugni_callback_rdma_complete (mca_btl_ugni_base_frag_t *frag, int rc)
>>>  {
>>>      BTL_VERBOSE(("rdma operation for rem_ctx %p complete", frag->hdr.rdma.ctx));
>>> 
>>> 
>>> Modified: trunk/ompi/mca/btl/ugni/btl_ugni_put.c
>>> ==============================================================================
>>> --- trunk/ompi/mca/btl/ugni/btl_ugni_put.c	(original)
>>> +++ trunk/ompi/mca/btl/ugni/btl_ugni_put.c	2012-04-24 16:18:56 EDT (Tue, 24 Apr 2012)
>>> @@ -46,51 +46,3 @@
>>> 
>>>      return mca_btl_ugni_post_bte (frag, GNI_POST_RDMA_PUT, des->des_src, des->des_dst);
>>>  }
>>> -
>>> -/* reversed get */
>>> -static void mca_btl_ugni_callback_put_retry (mca_btl_ugni_base_frag_t *frag, int rc)
>>> -{
>>> -    (void) mca_btl_ugni_start_put(frag->endpoint, frag->hdr.rdma, frag);
>>> -}
>>> -
>>> -int mca_btl_ugni_start_put (mca_btl_base_endpoint_t *ep,
>>> -                            mca_btl_ugni_rdma_frag_hdr_t hdr,
>>> -                            mca_btl_ugni_base_frag_t *frag)
>>> -{
>>> -    int rc;
>>> -
>>> -    BTL_VERBOSE(("starting reverse get (put) for remote ctx: %p", hdr.ctx));
>>> -
>>> -    if (NULL == frag) {
>>> -        rc = MCA_BTL_UGNI_FRAG_ALLOC_RDMA_INT(ep, frag);
>>> -        if (OPAL_UNLIKELY(NULL == frag)) {
>>> -            BTL_ERROR(("error allocating rdma frag for reverse get. rc = %d. fl_num_allocated = %d", rc,
>>> -                       ep->btl->rdma_int_frags.fl_num_allocated));
>>> -            return rc;
>>> -        }
>>> -    }
>>> -
>>> -    frag->hdr.rdma = hdr;
>>> -
>>> -    frag->base.des_cbfunc = NULL;
>>> -    frag->base.des_flags = MCA_BTL_DES_FLAGS_BTL_OWNERSHIP;
>>> -
>>> -    frag->segments[0] = hdr.src_seg;
>>> -    frag->base.des_src = frag->segments;
>>> -    frag->base.des_src_cnt = 1;
>>> -
>>> -    frag->segments[1] = hdr.dst_seg;
>>> -    frag->base.des_dst = frag->segments + 1;
>>> -    frag->base.des_dst_cnt = 1;
>>> -
>>> -    rc = mca_btl_ugni_put (&ep->btl->super, ep, &frag->base);
>>> -    if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) {
>>> -        frag->cbfunc = mca_btl_ugni_callback_put_retry;
>>> -        opal_list_append (&ep->btl->failed_frags, (opal_list_item_t *) frag);
>>> -        return rc;
>>> -    }
>>> -
>>> -    frag->cbfunc = mca_btl_ugni_callback_rdma_complete;
>>> -
>>> -    return OMPI_SUCCESS;
>>> -}
>>> 
>>> Modified: trunk/ompi/mca/btl/ugni/btl_ugni_rdma.h
>>> ==============================================================================
>>> --- trunk/ompi/mca/btl/ugni/btl_ugni_rdma.h	(original)
>>> +++ trunk/ompi/mca/btl/ugni/btl_ugni_rdma.h	2012-04-24 16:18:56 EDT (Tue, 24 Apr 2012)
>>> @@ -16,17 +16,10 @@
>>>  #include "btl_ugni.h"
>>>  #include "btl_ugni_frag.h"
>>> 
>>> -/* mca_btl_ugni_start_put: get operation could not be completed. start put instead */
>>> -int mca_btl_ugni_start_put (mca_btl_base_endpoint_t *ep,
>>> -                            mca_btl_ugni_rdma_frag_hdr_t hdr,
>>> -                            mca_btl_ugni_base_frag_t *frag);
>>> -
>>>  int mca_btl_ugni_start_eager_get (mca_btl_base_endpoint_t *ep,
>>>                                    mca_btl_ugni_eager_ex_frag_hdr_t hdr,
>>>                                    mca_btl_ugni_base_frag_t *frag);
>>> 
>>> -void mca_btl_ugni_callback_rdma_complete (mca_btl_ugni_base_frag_t *frag, int rc);
>>> -
>>>  static inline int init_gni_post_desc(mca_btl_ugni_base_frag_t *frag,
>>>                                       gni_post_type_t op_type,
>>>                                       uint64_t lcl_addr,
>>> 
>>> Modified: trunk/ompi/mca/btl/ugni/btl_ugni_smsg.c
>>> ==============================================================================
>>> --- trunk/ompi/mca/btl/ugni/btl_ugni_smsg.c	(original)
>>> +++ trunk/ompi/mca/btl/ugni/btl_ugni_smsg.c	2012-04-24 16:18:56 EDT (Tue, 24 Apr 2012)
>>> @@ -78,11 +78,6 @@
>>>          reg->cbfunc(&ep->btl->super, tag, &(frag.base), reg->cbdata);
>>> 
>>>          break;
>>> -    case MCA_BTL_UGNI_TAG_PUT_INIT:
>>> -        frag.hdr.rdma = ((mca_btl_ugni_rdma_frag_hdr_t *) data_ptr)[0];
>>> -
>>> -        mca_btl_ugni_start_put (ep, frag.hdr.rdma, NULL);
>>> -        break;
>>>      case MCA_BTL_UGNI_TAG_GET_INIT:
>>>          frag.hdr.eager_ex = ((mca_btl_ugni_eager_ex_frag_hdr_t *) data_ptr)[0];
>>> 
>>> 
>>> Modified: trunk/ompi/mca/btl/ugni/btl_ugni_smsg.h
>>> ==============================================================================
>>> --- trunk/ompi/mca/btl/ugni/btl_ugni_smsg.h	(original)
>>> +++ trunk/ompi/mca/btl/ugni/btl_ugni_smsg.h	2012-04-24 16:18:56 EDT (Tue, 24 Apr 2012)
>>> @@ -21,7 +21,6 @@
>>>  typedef enum {
>>>      MCA_BTL_UGNI_TAG_SEND,
>>>      MCA_BTL_UGNI_TAG_DISCONNECT,
>>> -    MCA_BTL_UGNI_TAG_PUT_INIT,
>>>      MCA_BTL_UGNI_TAG_GET_INIT,
>>>      MCA_BTL_UGNI_TAG_RDMA_COMPLETE
>>>  } mca_btl_ugni_smsg_tag_t;
>>> 
>>> Modified: trunk/ompi/mca/pml/ob1/pml_ob1.c
>>> ==============================================================================
>>> --- trunk/ompi/mca/pml/ob1/pml_ob1.c	(original)
>>> +++ trunk/ompi/mca/pml/ob1/pml_ob1.c	2012-04-24 16:18:56 EDT (Tue, 24 Apr 2012)
>>> @@ -147,6 +147,7 @@
>>>      OBJ_CONSTRUCT(&mca_pml_ob1.recv_pending, opal_list_t);
>>>      OBJ_CONSTRUCT(&mca_pml_ob1.pckt_pending, opal_list_t);
>>>      OBJ_CONSTRUCT(&mca_pml_ob1.rdma_pending, opal_list_t);
>>> +
>>>      /* missing communicator pending list */
>>>      OBJ_CONSTRUCT(&mca_pml_ob1.non_existing_communicator_pending, opal_list_t);
>>> 
>>> @@ -599,8 +600,10 @@
>>>          OPAL_THREAD_UNLOCK(&mca_pml_ob1.lock);
>>>          if(NULL == frag)
>>>              break;
>>> +
>>> +        frag->retries++;
>>> +
>>>          if(frag->rdma_state == MCA_PML_OB1_RDMA_PUT) {
>>> -            frag->retries++;
>>>              rc = mca_pml_ob1_send_request_put_frag(frag);
>>>          } else {
>>>              rc = mca_pml_ob1_recv_request_get_frag(frag);
>>> 
>>> Modified: trunk/ompi/mca/pml/ob1/pml_ob1.h
>>> ==============================================================================
>>> --- trunk/ompi/mca/pml/ob1/pml_ob1.h	(original)
>>> +++ trunk/ompi/mca/pml/ob1/pml_ob1.h	2012-04-24 16:18:56 EDT (Tue, 24 Apr 2012)
>>> @@ -52,7 +52,7 @@
>>>      int free_list_inc;      /* number of elements to grow free list */
>>>      size_t send_pipeline_depth;
>>>      size_t recv_pipeline_depth;
>>> -    size_t rdma_put_retries_limit;
>>> +    size_t rdma_retries_limit;
>>>      int max_rdma_per_request;
>>>      int max_send_per_range;
>>>      bool leave_pinned;
>>> 
>>> Modified: trunk/ompi/mca/pml/ob1/pml_ob1_component.c
>>> ==============================================================================
>>> --- trunk/ompi/mca/pml/ob1/pml_ob1_component.c	(original)
>>> +++ trunk/ompi/mca/pml/ob1/pml_ob1_component.c	2012-04-24 16:18:56 EDT (Tue, 24 Apr 2012)
>>> @@ -112,8 +112,8 @@
>>>          mca_pml_ob1_param_register_int("send_pipeline_depth", 3);
>>>      mca_pml_ob1.recv_pipeline_depth =
>>>          mca_pml_ob1_param_register_int("recv_pipeline_depth", 4);
>>> -    mca_pml_ob1.rdma_put_retries_limit =
>>> -        mca_pml_ob1_param_register_int("rdma_put_retries_limit", 5);
>>> +    mca_pml_ob1.rdma_retries_limit =
>>> +        mca_pml_ob1_param_register_int("rdma_retries_limit", 5);
>>>      mca_pml_ob1.max_rdma_per_request =
>>>          mca_pml_ob1_param_register_int("max_rdma_per_request", 4);
>>>      mca_pml_ob1.max_send_per_range =
>>> 
>>> Modified: trunk/ompi/mca/pml/ob1/pml_ob1_recvfrag.c
>>> ==============================================================================
>>> --- trunk/ompi/mca/pml/ob1/pml_ob1_recvfrag.c	(original)
>>> +++ trunk/ompi/mca/pml/ob1/pml_ob1_recvfrag.c	2012-04-24 16:18:56 EDT (Tue, 24 Apr 2012)
>>> @@ -294,15 +294,22 @@
>>>      if( OPAL_UNLIKELY(segments->seg_len < sizeof(mca_pml_ob1_common_hdr_t)) ) {
>>>          return;
>>>      }
>>> -
>>> +
>>>      ob1_hdr_ntoh(hdr, MCA_PML_OB1_HDR_TYPE_ACK);
>>>      sendreq = (mca_pml_ob1_send_request_t*)hdr->hdr_ack.hdr_src_req.pval;
>>>      sendreq->req_recv = hdr->hdr_ack.hdr_dst_req;
>>> -
>>> +
>>>      /* if the request should be delivered entirely by copy in/out
>>>       * then throttle sends */
>>> -    if(hdr->hdr_common.hdr_flags & MCA_PML_OB1_HDR_FLAGS_NORDMA)
>>> +    if(hdr->hdr_common.hdr_flags & MCA_PML_OB1_HDR_FLAGS_NORDMA) {
>>> +        if (NULL != sendreq->src_des) {
>>> +            /* release registered memory */
>>> +            mca_bml_base_free (sendreq->req_rdma[0].bml_btl, sendreq->src_des);
>>> +            sendreq->src_des = NULL;
>>> +        }
>>> +
>>>          sendreq->req_throttle_sends = true;
>>> +    }
>>> 
>>>      mca_pml_ob1_send_request_copy_in_out(sendreq,
>>>                                           hdr->hdr_ack.hdr_send_offset,
>>> @@ -324,7 +331,7 @@
>>> 
>>>      if(send_request_pml_complete_check(sendreq) == false)
>>>          mca_pml_ob1_send_request_schedule(sendreq);
>>> -
>>> +
>>>      return;
>>>  }
>>> 
>>> 
>>> Modified: trunk/ompi/mca/pml/ob1/pml_ob1_recvreq.c
>>> ==============================================================================
>>> --- trunk/ompi/mca/pml/ob1/pml_ob1_recvreq.c	(original)
>>> +++ trunk/ompi/mca/pml/ob1/pml_ob1_recvreq.c	2012-04-24 16:18:56 EDT (Tue, 24 Apr 2012)
>>> @@ -352,6 +352,66 @@
>>>  }
>>> 
>>> 
>>> +static int mca_pml_ob1_init_get_fallback (mca_pml_ob1_rdma_frag_t *frag,
>>> +                                          mca_btl_base_descriptor_t *dst) {
>>> +    mca_pml_ob1_recv_request_t *recvreq = (mca_pml_ob1_recv_request_t *) frag->rdma_req;
>>> +    mca_bml_base_btl_t *bml_btl = frag->rdma_bml;
>>> +    mca_btl_base_descriptor_t *ctl;
>>> +    mca_pml_ob1_rdma_hdr_t *hdr;
>>> +    size_t hdr_size;
>>> +    unsigned int i;
>>> +    int rc;
>>> +
>>> +    /* prepare a descriptor for rdma control message */
>>> +    hdr_size = sizeof (mca_pml_ob1_rdma_hdr_t);
>>> +    if (dst->des_dst_cnt > 1) {
>>> +        hdr_size += (sizeof (mca_btl_base_segment_t) *
>>> +                     (dst->des_dst_cnt-1));
>>> +    }
>>> +
>>> +    mca_bml_base_alloc (bml_btl, &ctl, MCA_BTL_NO_ORDER, hdr_size,
>>> +                        MCA_BTL_DES_FLAGS_PRIORITY | MCA_BTL_DES_FLAGS_BTL_OWNERSHIP |
>>> +                        MCA_BTL_DES_SEND_ALWAYS_CALLBACK);
>>> +    if (OPAL_UNLIKELY(NULL == ctl)) {
>>> +        return OMPI_ERR_OUT_OF_RESOURCE;
>>> +    }
>>> +    ctl->des_cbfunc = mca_pml_ob1_recv_ctl_completion;
>>> +
>>> +    /* fill in rdma header */
>>> +    hdr = (mca_pml_ob1_rdma_hdr_t *) ctl->des_src->seg_addr.pval;
>>> +    hdr->hdr_common.hdr_type = MCA_PML_OB1_HDR_TYPE_PUT;
>>> +    hdr->hdr_common.hdr_flags =
>>> +        (!recvreq->req_ack_sent) ? MCA_PML_OB1_HDR_TYPE_ACK : 0;
>>> +
>>> +    hdr->hdr_req = frag->rdma_hdr.hdr_rget.hdr_rndv.hdr_src_req;
>>> +    hdr->hdr_rdma_offset = recvreq->req_rdma_offset;
>>> +    hdr->hdr_des.pval = dst;
>>> +
>>> +    hdr->hdr_seg_cnt = dst->des_dst_cnt;
>>> +
>>> +    for (i = 0 ; i < dst->des_dst_cnt ; ++i) {
>>> +        hdr->hdr_segs[i].seg_addr.lval = ompi_ptr_ptol(dst->des_dst[i].seg_addr.pval);
>>> +        hdr->hdr_segs[i].seg_len = dst->des_dst[i].seg_len;
>>> +        hdr->hdr_segs[i].seg_key.key64[0] = dst->des_dst[i].seg_key.key64[0];
>>> +        hdr->hdr_segs[i].seg_key.key64[1] = dst->des_dst[i].seg_key.key64[1];
>>> +    }
>>> +
>>> +    dst->des_cbfunc = mca_pml_ob1_put_completion;
>>> +    dst->des_cbdata = recvreq;
>>> +
>>> +    if (!recvreq->req_ack_sent)
>>> +        recvreq->req_ack_sent = true;
>>> +
>>> +    /* send rdma request to peer */
>>> +    rc = mca_bml_base_send (bml_btl, ctl, MCA_PML_OB1_HDR_TYPE_PUT);
>>> +    if (OPAL_UNLIKELY(rc < 0)) {
>>> +        mca_bml_base_free (bml_btl, ctl);
>>> +        return rc;
>>> +    }
>>> +
>>> +    return OMPI_SUCCESS;
>>> +}
>>> +
>>>  /*
>>>   *
>>>   */
>>> @@ -371,14 +431,25 @@
>>>                                0,
>>>                                &frag->rdma_length,
>>>                                MCA_BTL_DES_FLAGS_BTL_OWNERSHIP |
>>>                                MCA_BTL_DES_SEND_ALWAYS_CALLBACK |
>>> -                              MCA_BTL_DES_FLAGS_GET,
>>> +                              MCA_BTL_DES_FLAGS_GET,
>>>                                &descriptor );
>>>      if( OPAL_UNLIKELY(NULL == descriptor) ) {
>>> -        frag->rdma_length = save_size;
>>> -        OPAL_THREAD_LOCK(&mca_pml_ob1.lock);
>>> -        opal_list_append(&mca_pml_ob1.rdma_pending, (opal_list_item_t*)frag);
>>> -        OPAL_THREAD_UNLOCK(&mca_pml_ob1.lock);
>>> -        return OMPI_ERR_OUT_OF_RESOURCE;
>>> +        if (frag->retries < mca_pml_ob1.rdma_retries_limit) {
>>> +            frag->rdma_length = save_size;
>>> +            OPAL_THREAD_LOCK(&mca_pml_ob1.lock);
>>> +            opal_list_append(&mca_pml_ob1.rdma_pending, (opal_list_item_t*)frag);
>>> +            OPAL_THREAD_UNLOCK(&mca_pml_ob1.lock);
>>> +            return OMPI_ERR_OUT_OF_RESOURCE;
>>> +        } else {
>>> +            ompi_proc_t *proc = (ompi_proc_t *) recvreq->req_recv.req_base.req_proc;
>>> +
>>> +            /* tell peer to fall back on send */
>>> +            recvreq->req_send_offset = 0;
>>> +            rc = mca_pml_ob1_recv_request_ack_send(proc, frag->rdma_hdr.hdr_rget.hdr_rndv.hdr_src_req.lval,
>>> +                                                   recvreq, recvreq->req_send_offset, true);
>>> +            MCA_PML_OB1_RDMA_FRAG_RETURN(frag);
>>> +            return rc;
>>> +        }
>>>      }
>>> 
>>>      descriptor->des_src = frag->rdma_segs;
>>> @@ -393,6 +464,11 @@
>>>      /* queue up get request */
>>>      rc = mca_bml_base_get(bml_btl,descriptor);
>>>      if( OPAL_UNLIKELY(OMPI_SUCCESS != rc) ) {
>>> +        if (OPAL_UNLIKELY(OMPI_ERR_NOT_AVAILABLE == rc)) {
>>> +            /* get isn't supported for this transfer. tell peer to fallback on put */
>>> +            rc = mca_pml_ob1_init_get_fallback (frag, descriptor);
>>> +        }
>>> +
>>>          if(OMPI_ERR_OUT_OF_RESOURCE == rc) {
>>>              mca_bml_base_free(bml_btl, descriptor);
>>>              OPAL_THREAD_LOCK(&mca_pml_ob1.lock);
>>> @@ -400,7 +476,7 @@
>>>                               (opal_list_item_t*)frag);
>>>              OPAL_THREAD_UNLOCK(&mca_pml_ob1.lock);
>>>              return OMPI_ERR_OUT_OF_RESOURCE;
>>> -        } else {
>>> +        } else if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) {
>>>              ORTE_ERROR_LOG(rc);
>>>              orte_errmgr.abort(-1, NULL);
>>>          }
>>> @@ -551,7 +627,9 @@
>>>          orte_errmgr.abort(-1, NULL);
>>>      }
>>>  #endif /* OMPI_CUDA_SUPPORT */
>>> +
>>>      frag->rdma_hdr.hdr_rget = *hdr;
>>> +    frag->retries = 0;
>>>      frag->rdma_req = recvreq;
>>>      frag->rdma_ep = bml_endpoint;
>>>      frag->rdma_length = size;
>>> @@ -792,7 +870,7 @@
>>>              mca_bml_base_prepare_dst(bml_btl, reg,
>>>                                       &recvreq->req_recv.req_base.req_convertor,
>>>                                       MCA_BTL_NO_ORDER, 0, &size,
>>>                                       MCA_BTL_DES_FLAGS_BTL_OWNERSHIP |
>>> -                                     MCA_BTL_DES_FLAGS_PUT, &dst);
>>> +                                     MCA_BTL_DES_FLAGS_PUT, &dst);
>>>              OPAL_THREAD_UNLOCK(&recvreq->lock);
>>> 
>>>              if(OPAL_UNLIKELY(dst == NULL)) {
>>> 
>>> Modified: trunk/ompi/mca/pml/ob1/pml_ob1_sendreq.c
>>> ==============================================================================
>>> --- trunk/ompi/mca/pml/ob1/pml_ob1_sendreq.c	(original)
>>> +++ trunk/ompi/mca/pml/ob1/pml_ob1_sendreq.c	2012-04-24 16:18:56 EDT (Tue, 24 Apr 2012)
>>> @@ -264,6 +264,7 @@
>>>      MCA_PML_OB1_COMPUTE_SEGMENT_LENGTH( des->des_src, des->des_src_cnt,
>>>                                          0, req_bytes_delivered );
>>>      OPAL_THREAD_ADD_SIZE_T(&sendreq->req_bytes_delivered, req_bytes_delivered);
>>> +    sendreq->src_des = NULL;
>>> 
>>>      send_request_pml_complete_check(sendreq);
>>>      /* free the descriptor */
>>> @@ -639,6 +640,8 @@
>>>      bool need_local_cb = false;
>>>      int rc;
>>> 
>>> +    sendreq->src_des = NULL;
>>> +
>>>      bml_btl = sendreq->req_rdma[0].bml_btl;
>>>      if((sendreq->req_rdma_cnt == 1) && (bml_btl->btl_flags & (MCA_BTL_FLAGS_GET | MCA_BTL_FLAGS_CUDA_GET))) {
>>>          mca_mpool_base_registration_t* reg = sendreq->req_rdma[0].btl_reg;
>>> @@ -657,10 +660,8 @@
>>>          mca_bml_base_prepare_src( bml_btl,
>>>                                    reg,
>>>                                    &sendreq->req_send.req_base.req_convertor,
>>> -                                  MCA_BTL_NO_ORDER,
>>> -                                  0,
>>> -                                  &size,
>>> -                                  MCA_BTL_DES_FLAGS_GET,
>>> +                                  MCA_BTL_NO_ORDER, 0, &size,
>>> +                                  MCA_BTL_DES_FLAGS_GET | MCA_BTL_DES_FLAGS_BTL_OWNERSHIP,
>>>                                    &src );
>>>          MEMCHECKER(
>>>                     memchecker_call(&opal_memchecker_base_mem_noaccess,
>>> @@ -676,6 +677,8 @@
>>>          src->des_cbfunc = mca_pml_ob1_rget_completion;
>>>          src->des_cbdata = sendreq;
>>> 
>>> +        sendreq->src_des = src;
>>> +
>>>          /* allocate space for get hdr + segment list */
>>>          mca_bml_base_alloc(bml_btl, &des, MCA_BTL_NO_ORDER,
>>>                             sizeof(mca_pml_ob1_rget_hdr_t) +
>>> @@ -782,8 +785,9 @@
>>>              return OMPI_SUCCESS;
>>>          }
>>>          mca_bml_base_free(bml_btl, des);
>>> -        if (NULL != src) {
>>> -            mca_bml_base_free (bml_btl, src);
>>> +        if (sendreq->src_des) {
>>> +            mca_bml_base_free (bml_btl, sendreq->src_des);
>>> +            sendreq->src_des = NULL;
>>>          }
>>> 
>>>          return rc;
>>> @@ -1133,63 +1137,71 @@
>>>          MCA_PML_OB1_PROGRESS_PENDING(bml_btl);
>>>  }
>>> 
>>> -int mca_pml_ob1_send_request_put_frag( mca_pml_ob1_rdma_frag_t* frag )
>>> -{
>>> -    mca_mpool_base_registration_t* reg = NULL;
>>> -    mca_bml_base_btl_t* bml_btl = frag->rdma_bml;
>>> -    mca_btl_base_descriptor_t* des;
>>> +int mca_pml_ob1_send_request_put_frag( mca_pml_ob1_rdma_frag_t *frag )
>>> +{
>>> +    mca_pml_ob1_send_request_t* sendreq = (mca_pml_ob1_send_request_t*)frag->rdma_req;
>>> +    mca_mpool_base_registration_t *reg = NULL;
>>> +    mca_bml_base_btl_t *bml_btl = frag->rdma_bml;
>>> +    mca_btl_base_descriptor_t *des;
>>>      size_t save_size = frag->rdma_length;
>>>      int rc;
>>> 
>>> -    /* setup descriptor */
>>> -    mca_bml_base_prepare_src( bml_btl,
>>> -                              reg,
>>> -                              &frag->convertor,
>>> -                              MCA_BTL_NO_ORDER,
>>> -                              0,
>>> -                              &frag->rdma_length,
>>> -                              MCA_BTL_DES_FLAGS_BTL_OWNERSHIP |
>>> -                              MCA_BTL_DES_FLAGS_PUT,
>>> -                              &des );
>>> +    if (OPAL_LIKELY(NULL == sendreq->src_des)) {
>>> +        /* setup descriptor */
>>> +        mca_bml_base_prepare_src( bml_btl,
>>> +                                  reg,
>>> +                                  &frag->convertor,
>>> +                                  MCA_BTL_NO_ORDER,
>>> +                                  0,
>>> +                                  &frag->rdma_length,
>>> +                                  MCA_BTL_DES_FLAGS_BTL_OWNERSHIP |
>>> +                                  MCA_BTL_DES_FLAGS_PUT,
>>> +                                  &des );
>>> 
>>> -    if( OPAL_UNLIKELY(NULL == des) ) {
>>> -        if(frag->retries < mca_pml_ob1.rdma_put_retries_limit) {
>>> -            size_t offset = (size_t)frag->rdma_hdr.hdr_rdma.hdr_rdma_offset;
>>> -            frag->rdma_length = save_size;
>>> -            opal_convertor_set_position(&frag->convertor, &offset);
>>> -            OPAL_THREAD_LOCK(&mca_pml_ob1.lock);
>>> -            opal_list_append(&mca_pml_ob1.rdma_pending, (opal_list_item_t*)frag);
>>> -            OPAL_THREAD_UNLOCK(&mca_pml_ob1.lock);
>>> -        } else {
>>> -            mca_pml_ob1_send_request_t *sendreq =
>>> -                (mca_pml_ob1_send_request_t*)frag->rdma_req;
>>> +        if( OPAL_UNLIKELY(NULL == des) ) {
>>> +            if(frag->retries < mca_pml_ob1.rdma_retries_limit) {
>>> +                size_t offset = (size_t)frag->rdma_hdr.hdr_rdma.hdr_rdma_offset;
>>> +                frag->rdma_length = save_size;
>>> +                opal_convertor_set_position(&frag->convertor, &offset);
>>> +                OPAL_THREAD_LOCK(&mca_pml_ob1.lock);
>>> +                opal_list_append(&mca_pml_ob1.rdma_pending, (opal_list_item_t*)frag);
>>> +                OPAL_THREAD_UNLOCK(&mca_pml_ob1.lock);
>>> +            } else {
>>> +                mca_pml_ob1_send_request_t *sendreq =
>>> +                    (mca_pml_ob1_send_request_t*)frag->rdma_req;
>>> +
>>> +                /* tell receiver to unregister memory */
>>> +                mca_pml_ob1_send_fin(sendreq->req_send.req_base.req_proc,
>>> +                                     bml_btl, frag->rdma_hdr.hdr_rdma.hdr_des,
>>> +                                     MCA_BTL_NO_ORDER, 1);
>>> +
>>> +                /* send fragment by copy in/out */
>>> +                mca_pml_ob1_send_request_copy_in_out(sendreq,
>>> +                                                     frag->rdma_hdr.hdr_rdma.hdr_rdma_offset, frag->rdma_length);
>>> +                /* if a pointer to a receive request is not set it means that
>>> +                 * ACK was not yet received. Don't schedule sends before ACK */
>>> +                if(NULL != sendreq->req_recv.pval)
>>> +                    mca_pml_ob1_send_request_schedule(sendreq);
>>> +            }
>>> 
>>> -            /* tell receiver to unregister memory */
>>> -            mca_pml_ob1_send_fin(sendreq->req_send.req_base.req_proc,
>>> -                                 bml_btl, frag->rdma_hdr.hdr_rdma.hdr_des,
>>> -                                 MCA_BTL_NO_ORDER, 1);
>>> -
>>> -            /* send fragment by copy in/out */
>>> -            mca_pml_ob1_send_request_copy_in_out(sendreq,
>>> -                    frag->rdma_hdr.hdr_rdma.hdr_rdma_offset, frag->rdma_length);
>>> -            /* if a pointer to a receive request is not set it means that
>>> -             * ACK was not yet received. Don't schedule sends before ACK */
>>> -            if(NULL != sendreq->req_recv.pval)
>>> -                mca_pml_ob1_send_request_schedule(sendreq);
>>> +            return OMPI_ERR_OUT_OF_RESOURCE;
>>>          }
>>> -        return OMPI_ERR_OUT_OF_RESOURCE;
>>> +    } else {
>>> +        /* already have a source descriptor */
>>> +        des = sendreq->src_des;
>>> +        sendreq->src_des = NULL;
>>>      }
>>> -
>>> -    des->des_dst = frag->rdma_segs;
>>> +
>>> +    des->des_dst     = frag->rdma_segs;
>>>      des->des_dst_cnt = frag->rdma_hdr.hdr_rdma.hdr_seg_cnt;
>>> -    des->des_cbfunc = mca_pml_ob1_put_completion;
>>> -    des->des_cbdata = frag;
>>> +    des->des_cbfunc  = mca_pml_ob1_put_completion;
>>> +    des->des_cbdata  = frag;
>>> 
>>>      PERUSE_TRACE_COMM_OMPI_EVENT( PERUSE_COMM_REQ_XFER_CONTINUE,
>>>                                    &(((mca_pml_ob1_send_request_t*)frag->rdma_req)->req_send.req_base),
>>>                                    save_size, PERUSE_SEND );
>>> 
>>>      rc = mca_bml_base_put(bml_btl, des);
>>> -    if( OPAL_UNLIKELY(OMPI_SUCCESS != rc) ) {
>>> +    if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) {
>>>          mca_bml_base_free(bml_btl, des);
>>>          frag->rdma_length = save_size;
>>>          if(OMPI_ERR_OUT_OF_RESOURCE == rc) {
>>> @@ -1203,6 +1215,7 @@
>>>              orte_errmgr.abort(-1, NULL);
>>>          }
>>>      }
>>> +
>>>      return OMPI_SUCCESS;
>>>  }
>>> 
>>> @@ -1261,21 +1274,25 @@
>>>      frag->reg = NULL;
>>>      frag->retries = 0;
>>> 
>>> +    if (OPAL_UNLIKELY(NULL != sendreq->src_des)) {
>>> +        /* get fallback path */
>>> +        sendreq->req_state = 0;
>>> +    }
>>> +
>>>      /* lookup the corresponding registration */
>>>      for(i=0; i<sendreq->req_rdma_cnt; i++) {
>>> -       if(sendreq->req_rdma[i].bml_btl == frag->rdma_bml) {
>>> -           frag->reg = sendreq->req_rdma[i].btl_reg;
>>> -           break;
>>> -       }
>>> -    }
>>> +        if(sendreq->req_rdma[i].bml_btl == frag->rdma_bml) {
>>> +            frag->reg = sendreq->req_rdma[i].btl_reg;
>>> +            break;
>>> +        }
>>> +    }
>>> 
>>>      /* RDMA writes may proceed in parallel to send and to each other, so
>>>       * create clone of the convertor for each RDMA fragment
>>>       */
>>>      size = hdr->hdr_rdma_offset;
>>>      opal_convertor_clone_with_position(&sendreq->req_send.req_base.req_convertor,
>>> -                                      &frag->convertor, 0, &size);
>>> +                                       &frag->convertor, 0, &size);
>>> 
>>>      mca_pml_ob1_send_request_put_frag(frag);
>>>  }
>>> -
>>> 
>>> Modified: trunk/ompi/mca/pml/ob1/pml_ob1_sendreq.h
>>> ==============================================================================
>>> --- trunk/ompi/mca/pml/ob1/pml_ob1_sendreq.h	(original)
>>> +++ trunk/ompi/mca/pml/ob1/pml_ob1_sendreq.h	2012-04-24 16:18:56 EDT (Tue, 24 Apr 2012)
>>> @@ -54,6 +54,7 @@
>>>      mca_pml_ob1_send_pending_t req_pending;
>>>      opal_mutex_t req_send_range_lock;
>>>      opal_list_t req_send_ranges;
>>> +    mca_btl_base_descriptor_t *src_des;
>>>      mca_pml_ob1_com_btl_t req_rdma[1];
>>>  };
>>>  typedef struct mca_pml_ob1_send_request_t mca_pml_ob1_send_request_t;
>>> 
>>> @@ -129,6 +130,7 @@
>>>          OMPI_FREE_LIST_WAIT(&mca_pml_base_send_requests, item, rc); \
>>>          sendreq = (mca_pml_ob1_send_request_t*)item;                \
>>>          sendreq->req_send.req_base.req_proc = proc;                 \
>>> +        sendreq->src_des = NULL;                                    \
>>>      }                                                               \
>>>  }
>>> 
>>> _______________________________________________
>>> svn-full mailing list
>>> svn-f...@open-mpi.org
>>> http://www.open-mpi.org/mailman/listinfo.cgi/svn-full
>> 
>> 
>> -- 
>> Jeff Squyres
>> jsquy...@cisco.com
>> For corporate legal information go to:
>> http://www.cisco.com/web/about/doing_business/legal/cri/
>> 
>> 
>> _______________________________________________
>> devel mailing list
>> de...@open-mpi.org
>> http://www.open-mpi.org/mailman/listinfo.cgi/devel
>> 
> _______________________________________________
> devel mailing list
> de...@open-mpi.org
> http://www.open-mpi.org/mailman/listinfo.cgi/devel


-- 
Jeff Squyres
jsquy...@cisco.com
For corporate legal information go to:
http://www.cisco.com/web/about/doing_business/legal/cri/
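
P.S. For anyone skimming the thread rather than the diff, the shape of the change: mca_btl_ugni_get() now returns OMPI_ERR_NOT_AVAILABLE for transfers the uGNI BTL cannot do as a get (the alignment/size check), and ob1 reacts by turning the transfer around, sending a PUT control header back to the sender via the new mca_pml_ob1_init_get_fallback(); if RDMA resources stay exhausted past rdma_retries_limit, it falls back again to a plain copy-in/out send. Below is a minimal, self-contained C sketch of that fallback ladder. Every name in it (xfer_t, try_rdma_get, try_rdma_put, copy_in_out_send, progress_transfer) is a made-up stand-in, not a real PML/BTL entry point; only the control flow follows the commit.

/* Fallback ladder from r26329, modeled with stub transports:
 *   1. try an RDMA get;
 *   2. if the BTL says get is unavailable, reverse direction and put;
 *   3. on transient resource exhaustion, requeue up to a retry limit;
 *   4. past the limit, fall back to a plain copy-in/out send.
 */
#include <stdio.h>

#define ERR_NOT_AVAILABLE   (-1)  /* models OMPI_ERR_NOT_AVAILABLE   */
#define ERR_OUT_OF_RESOURCE (-2)  /* models OMPI_ERR_OUT_OF_RESOURCE */
#define SUCCESS               0   /* models OMPI_SUCCESS             */

#define RDMA_RETRIES_LIMIT 5      /* models the rdma_retries_limit MCA param */

typedef struct { int retries; } xfer_t;

/* Stub transports: this pretend BTL cannot do gets at all, and its first
 * put attempt hits a transient resource shortage. */
static int try_rdma_get (xfer_t *x) { (void) x; return ERR_NOT_AVAILABLE; }
static int try_rdma_put (xfer_t *x) { return (x->retries < 1) ? ERR_OUT_OF_RESOURCE : SUCCESS; }
static int copy_in_out_send (xfer_t *x) { (void) x; return SUCCESS; }

static int progress_transfer (xfer_t *x)
{
    int rc = try_rdma_get (x);

    if (ERR_NOT_AVAILABLE == rc) {
        /* get is unsupported for this transfer: turn it around as a put */
        rc = try_rdma_put (x);
    }

    if (ERR_OUT_OF_RESOURCE == rc) {
        if (x->retries++ < RDMA_RETRIES_LIMIT) {
            /* caller requeues the fragment (rdma_pending) and retries later */
            return rc;
        }
        /* retries exhausted: final fallback to copy in/out */
        rc = copy_in_out_send (x);
    }

    return rc;
}

int main (void)
{
    xfer_t x = { .retries = 0 };
    int rc;

    /* a real PML drains its pending list on progress; here we just loop */
    while (ERR_OUT_OF_RESOURCE == (rc = progress_transfer (&x)))
        ;

    printf ("transfer finished with rc=%d after %d retries\n", rc, x.retries);
    return 0;
}

One design point the sketch preserves from the commit: the retry counter is bumped once per requeued fragment regardless of direction (pml_ob1.c now does frag->retries++ before the put/get branch, and rdma_put_retries_limit became rdma_retries_limit), so a transfer that bounces between get and put cannot retry unboundedly.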