Ok.  :-)

On Apr 24, 2012, at 4:47 PM, Nathan Hjelm wrote:

> This was RFC'd last month. No one objected :)
> 
> -Nathan
> 
> On Tue, 24 Apr 2012, Jeffrey Squyres wrote:
> 
>> There are some pretty extensive ob1 changes in here.
>> 
>> Can we get these reviewed?  Brian / George?
>> 
>> 
>> On Apr 24, 2012, at 4:18 PM, hje...@osl.iu.edu wrote:
>> 
>>> Author: hjelmn
>>> Date: 2012-04-24 16:18:56 EDT (Tue, 24 Apr 2012)
>>> New Revision: 26329
>>> URL: https://svn.open-mpi.org/trac/ompi/changeset/26329
>>> 
>>> Log:
>>> ob1: add support for get fallback on put/send
>>> Text files modified:
>>>  trunk/ompi/mca/btl/ugni/btl_ugni_get.c     |    17 ----
>>>  trunk/ompi/mca/btl/ugni/btl_ugni_put.c     |    48 --------------
>>>  trunk/ompi/mca/btl/ugni/btl_ugni_rdma.h    |     7 --
>>>  trunk/ompi/mca/btl/ugni/btl_ugni_smsg.c    |     5 -
>>>  trunk/ompi/mca/btl/ugni/btl_ugni_smsg.h    |     1
>>>  trunk/ompi/mca/pml/ob1/pml_ob1.c           |     5 +
>>>  trunk/ompi/mca/pml/ob1/pml_ob1.h           |     2
>>>  trunk/ompi/mca/pml/ob1/pml_ob1_component.c |     4
>>>  trunk/ompi/mca/pml/ob1/pml_ob1_recvfrag.c  |    15 +++-
>>>  trunk/ompi/mca/pml/ob1/pml_ob1_recvreq.c   |    94 ++++++++++++++++++++++++++--
>>>  trunk/ompi/mca/pml/ob1/pml_ob1_sendreq.c   |   131 ++++++++++++++++++++++-----------------
>>>  trunk/ompi/mca/pml/ob1/pml_ob1_sendreq.h   |     2
>>>  12 files changed, 182 insertions(+), 149 deletions(-)
>>> 
>>> Modified: trunk/ompi/mca/btl/ugni/btl_ugni_get.c
>>> ==============================================================================
>>> --- trunk/ompi/mca/btl/ugni/btl_ugni_get.c  (original)
>>> +++ trunk/ompi/mca/btl/ugni/btl_ugni_get.c  2012-04-24 16:18:56 EDT (Tue, 
>>> 24 Apr 2012)
>>> @@ -13,19 +13,6 @@
>>> #include "btl_ugni_rdma.h"
>>> #include "btl_ugni_smsg.h"
>>> 
>>> -static int mca_btl_ugni_init_put (struct mca_btl_base_module_t *btl,
>>> -                                  mca_btl_ugni_base_frag_t *frag) {
>>> -    /* off alignment/off size. switch to put */
>>> -    frag->hdr.rdma.src_seg = frag->base.des_src[0];
>>> -    frag->hdr.rdma.dst_seg = frag->base.des_dst[0];
>>> -    frag->hdr.rdma.ctx     = (void *) frag;
>>> -
>>> -    /* send the fragment header using smsg. ignore local completion */
>>> -    return ompi_mca_btl_ugni_smsg_send (frag, true, &frag->hdr.rdma,
>>> -                                        sizeof (frag->hdr.rdma), NULL, 0,
>>> -                                        MCA_BTL_UGNI_TAG_PUT_INIT);
>>> -}
>>> -
>>> /**
>>> * Initiate a get operation.
>>> *
>>> @@ -54,7 +41,7 @@
>>> 
>>>    if (OPAL_UNLIKELY(check || size > 
>>> mca_btl_ugni_component.ugni_get_limit)) {
>>>        /* switch to put */
>>> -        return mca_btl_ugni_init_put (btl, frag);
>>> +        return OMPI_ERR_NOT_AVAILABLE;
>>>    }
>>> 
>>>    if (NULL != frag->base.des_cbfunc) {
>>> @@ -68,7 +55,7 @@
>>>    return mca_btl_ugni_post_bte (frag, GNI_POST_RDMA_GET, des->des_dst, 
>>> des->des_src);
>>> }
>>> 
>>> -void mca_btl_ugni_callback_rdma_complete (mca_btl_ugni_base_frag_t *frag, 
>>> int rc)
>>> +static void mca_btl_ugni_callback_rdma_complete (mca_btl_ugni_base_frag_t 
>>> *frag, int rc)
>>> {
>>>    BTL_VERBOSE(("rdma operation for rem_ctx %p complete", 
>>> frag->hdr.rdma.ctx));
>>> 
>>> 
>>> Modified: trunk/ompi/mca/btl/ugni/btl_ugni_put.c
>>> ==============================================================================
>>> --- trunk/ompi/mca/btl/ugni/btl_ugni_put.c  (original)
>>> +++ trunk/ompi/mca/btl/ugni/btl_ugni_put.c  2012-04-24 16:18:56 EDT (Tue, 
>>> 24 Apr 2012)
>>> @@ -46,51 +46,3 @@
>>> 
>>>    return mca_btl_ugni_post_bte (frag, GNI_POST_RDMA_PUT, des->des_src, 
>>> des->des_dst);
>>> }
>>> -
>>> -/* reversed get */
>>> -static void mca_btl_ugni_callback_put_retry (mca_btl_ugni_base_frag_t 
>>> *frag, int rc)
>>> -{
>>> -    (void) mca_btl_ugni_start_put(frag->endpoint, frag->hdr.rdma, frag);
>>> -}
>>> -
>>> -int mca_btl_ugni_start_put (mca_btl_base_endpoint_t *ep,
>>> -                            mca_btl_ugni_rdma_frag_hdr_t hdr,
>>> -                            mca_btl_ugni_base_frag_t *frag)
>>> -{
>>> -    int rc;
>>> -
>>> -    BTL_VERBOSE(("starting reverse get (put) for remote ctx: %p", 
>>> hdr.ctx));
>>> -
>>> -    if (NULL == frag) {
>>> -        rc = MCA_BTL_UGNI_FRAG_ALLOC_RDMA_INT(ep, frag);
>>> -        if (OPAL_UNLIKELY(NULL == frag)) {
>>> -            BTL_ERROR(("error allocating rdma frag for reverse get. rc = 
>>> %d. fl_num_allocated = %d", rc,
>>> -                       ep->btl->rdma_int_frags.fl_num_allocated));
>>> -            return rc;
>>> -        }
>>> -    }
>>> -
>>> -    frag->hdr.rdma = hdr;
>>> -
>>> -    frag->base.des_cbfunc = NULL;
>>> -    frag->base.des_flags  = MCA_BTL_DES_FLAGS_BTL_OWNERSHIP;
>>> -
>>> -    frag->segments[0] = hdr.src_seg;
>>> -    frag->base.des_src = frag->segments;
>>> -    frag->base.des_src_cnt = 1;
>>> -
>>> -    frag->segments[1] = hdr.dst_seg;
>>> -    frag->base.des_dst = frag->segments + 1;
>>> -    frag->base.des_dst_cnt = 1;
>>> -
>>> -    rc = mca_btl_ugni_put (&ep->btl->super, ep, &frag->base);
>>> -    if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) {
>>> -        frag->cbfunc = mca_btl_ugni_callback_put_retry;
>>> -        opal_list_append (&ep->btl->failed_frags, (opal_list_item_t *) 
>>> frag);
>>> -        return rc;
>>> -    }
>>> -
>>> -    frag->cbfunc = mca_btl_ugni_callback_rdma_complete;
>>> -
>>> -    return OMPI_SUCCESS;
>>> -}
>>> 
>>> Modified: trunk/ompi/mca/btl/ugni/btl_ugni_rdma.h
>>> ==============================================================================
>>> --- trunk/ompi/mca/btl/ugni/btl_ugni_rdma.h (original)
>>> +++ trunk/ompi/mca/btl/ugni/btl_ugni_rdma.h 2012-04-24 16:18:56 EDT (Tue, 
>>> 24 Apr 2012)
>>> @@ -16,17 +16,10 @@
>>> #include "btl_ugni.h"
>>> #include "btl_ugni_frag.h"
>>> 
>>> -/* mca_btl_ugni_start_put: get operation could not be completed. start put 
>>> instead */
>>> -int mca_btl_ugni_start_put (mca_btl_base_endpoint_t *ep,
>>> -                            mca_btl_ugni_rdma_frag_hdr_t hdr,
>>> -                            mca_btl_ugni_base_frag_t *frag);
>>> -
>>> int mca_btl_ugni_start_eager_get (mca_btl_base_endpoint_t *ep,
>>>                                  mca_btl_ugni_eager_ex_frag_hdr_t hdr,
>>>                                  mca_btl_ugni_base_frag_t *frag);
>>> 
>>> -void mca_btl_ugni_callback_rdma_complete (mca_btl_ugni_base_frag_t *frag, 
>>> int rc);
>>> -
>>> static inline int init_gni_post_desc(mca_btl_ugni_base_frag_t *frag,
>>>                                     gni_post_type_t op_type,
>>>                                     uint64_t lcl_addr,
>>> 
>>> Modified: trunk/ompi/mca/btl/ugni/btl_ugni_smsg.c
>>> ==============================================================================
>>> --- trunk/ompi/mca/btl/ugni/btl_ugni_smsg.c (original)
>>> +++ trunk/ompi/mca/btl/ugni/btl_ugni_smsg.c 2012-04-24 16:18:56 EDT (Tue, 
>>> 24 Apr 2012)
>>> @@ -78,11 +78,6 @@
>>>            reg->cbfunc(&ep->btl->super, tag, &(frag.base), reg->cbdata);
>>> 
>>>            break;
>>> -        case MCA_BTL_UGNI_TAG_PUT_INIT:
>>> -            frag.hdr.rdma = ((mca_btl_ugni_rdma_frag_hdr_t *) data_ptr)[0];
>>> -
>>> -            mca_btl_ugni_start_put (ep, frag.hdr.rdma, NULL);
>>> -            break;
>>>        case MCA_BTL_UGNI_TAG_GET_INIT:
>>>            frag.hdr.eager_ex = ((mca_btl_ugni_eager_ex_frag_hdr_t *) 
>>> data_ptr)[0];
>>> 
>>> 
>>> Modified: trunk/ompi/mca/btl/ugni/btl_ugni_smsg.h
>>> ==============================================================================
>>> --- trunk/ompi/mca/btl/ugni/btl_ugni_smsg.h (original)
>>> +++ trunk/ompi/mca/btl/ugni/btl_ugni_smsg.h 2012-04-24 16:18:56 EDT (Tue, 
>>> 24 Apr 2012)
>>> @@ -21,7 +21,6 @@
>>> typedef enum {
>>>    MCA_BTL_UGNI_TAG_SEND,
>>>    MCA_BTL_UGNI_TAG_DISCONNECT,
>>> -    MCA_BTL_UGNI_TAG_PUT_INIT,
>>>    MCA_BTL_UGNI_TAG_GET_INIT,
>>>    MCA_BTL_UGNI_TAG_RDMA_COMPLETE
>>> } mca_btl_ugni_smsg_tag_t;
>>> 
>>> Modified: trunk/ompi/mca/pml/ob1/pml_ob1.c
>>> ==============================================================================
>>> --- trunk/ompi/mca/pml/ob1/pml_ob1.c        (original)
>>> +++ trunk/ompi/mca/pml/ob1/pml_ob1.c        2012-04-24 16:18:56 EDT (Tue, 
>>> 24 Apr 2012)
>>> @@ -147,6 +147,7 @@
>>>    OBJ_CONSTRUCT(&mca_pml_ob1.recv_pending, opal_list_t);
>>>    OBJ_CONSTRUCT(&mca_pml_ob1.pckt_pending, opal_list_t);
>>>    OBJ_CONSTRUCT(&mca_pml_ob1.rdma_pending, opal_list_t);
>>> +
>>>    /* missing communicator pending list */
>>>    OBJ_CONSTRUCT(&mca_pml_ob1.non_existing_communicator_pending, 
>>> opal_list_t);
>>> 
>>> @@ -599,8 +600,10 @@
>>>        OPAL_THREAD_UNLOCK(&mca_pml_ob1.lock);
>>>        if(NULL == frag)
>>>            break;
>>> +
>>> +        frag->retries++;
>>> +
>>>        if(frag->rdma_state == MCA_PML_OB1_RDMA_PUT) {
>>> -            frag->retries++;
>>>            rc = mca_pml_ob1_send_request_put_frag(frag);
>>>        } else {
>>>            rc = mca_pml_ob1_recv_request_get_frag(frag);
>>> 
>>> Modified: trunk/ompi/mca/pml/ob1/pml_ob1.h
>>> ==============================================================================
>>> --- trunk/ompi/mca/pml/ob1/pml_ob1.h        (original)
>>> +++ trunk/ompi/mca/pml/ob1/pml_ob1.h        2012-04-24 16:18:56 EDT (Tue, 
>>> 24 Apr 2012)
>>> @@ -52,7 +52,7 @@
>>>    int free_list_inc;      /* number of elements to grow free list */
>>>    size_t send_pipeline_depth;
>>>    size_t recv_pipeline_depth;
>>> -    size_t rdma_put_retries_limit;
>>> +    size_t rdma_retries_limit;
>>>    int max_rdma_per_request;
>>>    int max_send_per_range;
>>>    bool leave_pinned;
>>> 
>>> Modified: trunk/ompi/mca/pml/ob1/pml_ob1_component.c
>>> ==============================================================================
>>> --- trunk/ompi/mca/pml/ob1/pml_ob1_component.c      (original)
>>> +++ trunk/ompi/mca/pml/ob1/pml_ob1_component.c      2012-04-24 16:18:56 EDT 
>>> (Tue, 24 Apr 2012)
>>> @@ -112,8 +112,8 @@
>>>        mca_pml_ob1_param_register_int("send_pipeline_depth", 3);
>>>    mca_pml_ob1.recv_pipeline_depth =
>>>        mca_pml_ob1_param_register_int("recv_pipeline_depth", 4);
>>> -    mca_pml_ob1.rdma_put_retries_limit =
>>> -        mca_pml_ob1_param_register_int("rdma_put_retries_limit", 5);
>>> +    mca_pml_ob1.rdma_retries_limit =
>>> +        mca_pml_ob1_param_register_int("rdma_retries_limit", 5);
>>>    mca_pml_ob1.max_rdma_per_request =
>>>        mca_pml_ob1_param_register_int("max_rdma_per_request", 4);
>>>    mca_pml_ob1.max_send_per_range =
>>> 
>>> Modified: trunk/ompi/mca/pml/ob1/pml_ob1_recvfrag.c
>>> ==============================================================================
>>> --- trunk/ompi/mca/pml/ob1/pml_ob1_recvfrag.c       (original)
>>> +++ trunk/ompi/mca/pml/ob1/pml_ob1_recvfrag.c       2012-04-24 16:18:56 EDT 
>>> (Tue, 24 Apr 2012)
>>> @@ -294,15 +294,22 @@
>>>    if( OPAL_UNLIKELY(segments->seg_len < sizeof(mca_pml_ob1_common_hdr_t)) 
>>> ) {
>>>         return;
>>>    }
>>> -
>>> +
>>>    ob1_hdr_ntoh(hdr, MCA_PML_OB1_HDR_TYPE_ACK);
>>>    sendreq = (mca_pml_ob1_send_request_t*)hdr->hdr_ack.hdr_src_req.pval;
>>>    sendreq->req_recv = hdr->hdr_ack.hdr_dst_req;
>>> -
>>> +
>>>    /* if the request should be delivered entirely by copy in/out
>>>     * then throttle sends */
>>> -    if(hdr->hdr_common.hdr_flags & MCA_PML_OB1_HDR_FLAGS_NORDMA)
>>> +    if(hdr->hdr_common.hdr_flags & MCA_PML_OB1_HDR_FLAGS_NORDMA) {
>>> +        if (NULL != sendreq->src_des) {
>>> +            /* release registered memory */
>>> +            mca_bml_base_free (sendreq->req_rdma[0].bml_btl, 
>>> sendreq->src_des);
>>> +            sendreq->src_des = NULL;
>>> +        }
>>> +
>>>        sendreq->req_throttle_sends = true;
>>> +    }
>>> 
>>>    mca_pml_ob1_send_request_copy_in_out(sendreq,
>>>                                         hdr->hdr_ack.hdr_send_offset,
>>> @@ -324,7 +331,7 @@
>>> 
>>>    if(send_request_pml_complete_check(sendreq) == false)
>>>        mca_pml_ob1_send_request_schedule(sendreq);
>>> -
>>> +
>>>    return;
>>> }
>>> 
>>> 
>>> Modified: trunk/ompi/mca/pml/ob1/pml_ob1_recvreq.c
>>> ==============================================================================
>>> --- trunk/ompi/mca/pml/ob1/pml_ob1_recvreq.c        (original)
>>> +++ trunk/ompi/mca/pml/ob1/pml_ob1_recvreq.c        2012-04-24 16:18:56 EDT 
>>> (Tue, 24 Apr 2012)
>>> @@ -352,6 +352,66 @@
>>> }
>>> 
>>> 
>>> +static int mca_pml_ob1_init_get_fallback (mca_pml_ob1_rdma_frag_t *frag,
>>> +                                          mca_btl_base_descriptor_t *dst) {
>>> +    mca_pml_ob1_recv_request_t *recvreq = (mca_pml_ob1_recv_request_t *) 
>>> frag->rdma_req;
>>> +    mca_bml_base_btl_t *bml_btl = frag->rdma_bml;
>>> +    mca_btl_base_descriptor_t *ctl;
>>> +    mca_pml_ob1_rdma_hdr_t *hdr;
>>> +    size_t hdr_size;
>>> +    unsigned int i;
>>> +    int rc;
>>> +
>>> +    /* prepare a descriptor for rdma control message */
>>> +    hdr_size = sizeof (mca_pml_ob1_rdma_hdr_t);
>>> +    if (dst->des_dst_cnt > 1) {
>>> +        hdr_size += (sizeof (mca_btl_base_segment_t) *
>>> +                     (dst->des_dst_cnt-1));
>>> +    }
>>> +
>>> +    mca_bml_base_alloc (bml_btl, &ctl, MCA_BTL_NO_ORDER, hdr_size,
>>> +                        MCA_BTL_DES_FLAGS_PRIORITY | 
>>> MCA_BTL_DES_FLAGS_BTL_OWNERSHIP |
>>> +                        MCA_BTL_DES_SEND_ALWAYS_CALLBACK);
>>> +    if (OPAL_UNLIKELY(NULL == ctl)) {
>>> +        return OMPI_ERR_OUT_OF_RESOURCE;
>>> +    }
>>> +    ctl->des_cbfunc = mca_pml_ob1_recv_ctl_completion;
>>> +
>>> +    /* fill in rdma header */
>>> +    hdr = (mca_pml_ob1_rdma_hdr_t *) ctl->des_src->seg_addr.pval;
>>> +    hdr->hdr_common.hdr_type = MCA_PML_OB1_HDR_TYPE_PUT;
>>> +    hdr->hdr_common.hdr_flags =
>>> +        (!recvreq->req_ack_sent) ? MCA_PML_OB1_HDR_TYPE_ACK : 0;
>>> +
>>> +    hdr->hdr_req = frag->rdma_hdr.hdr_rget.hdr_rndv.hdr_src_req;
>>> +    hdr->hdr_rdma_offset = recvreq->req_rdma_offset;
>>> +    hdr->hdr_des.pval = dst;
>>> +
>>> +    hdr->hdr_seg_cnt = dst->des_dst_cnt;
>>> +
>>> +    for (i = 0 ; i < dst->des_dst_cnt ; ++i) {
>>> +        hdr->hdr_segs[i].seg_addr.lval = 
>>> ompi_ptr_ptol(dst->des_dst[i].seg_addr.pval);
>>> +        hdr->hdr_segs[i].seg_len       = dst->des_dst[i].seg_len;
>>> +        hdr->hdr_segs[i].seg_key.key64[0] = 
>>> dst->des_dst[i].seg_key.key64[0];
>>> +        hdr->hdr_segs[i].seg_key.key64[1] = 
>>> dst->des_dst[i].seg_key.key64[1];
>>> +    }
>>> +
>>> +    dst->des_cbfunc = mca_pml_ob1_put_completion;
>>> +    dst->des_cbdata = recvreq;
>>> +
>>> +    if (!recvreq->req_ack_sent)
>>> +        recvreq->req_ack_sent = true;
>>> +
>>> +    /* send rdma request to peer */
>>> +    rc = mca_bml_base_send (bml_btl, ctl, MCA_PML_OB1_HDR_TYPE_PUT);
>>> +    if (OPAL_UNLIKELY(rc < 0)) {
>>> +        mca_bml_base_free (bml_btl, ctl);
>>> +        return rc;
>>> +    }
>>> +
>>> +    return OMPI_SUCCESS;
>>> +}
>>> +
>>> /*
>>> *
>>> */
>>> @@ -371,14 +431,25 @@
>>>                              0,
>>>                              &frag->rdma_length,
>>>                              MCA_BTL_DES_FLAGS_BTL_OWNERSHIP | 
>>> MCA_BTL_DES_SEND_ALWAYS_CALLBACK |
>>> -                         MCA_BTL_DES_FLAGS_GET,
>>> +                              MCA_BTL_DES_FLAGS_GET,
>>>                              &descriptor );
>>>    if( OPAL_UNLIKELY(NULL == descriptor) ) {
>>> -        frag->rdma_length = save_size;
>>> -        OPAL_THREAD_LOCK(&mca_pml_ob1.lock);
>>> -        opal_list_append(&mca_pml_ob1.rdma_pending, 
>>> (opal_list_item_t*)frag);
>>> -        OPAL_THREAD_UNLOCK(&mca_pml_ob1.lock);
>>> -        return OMPI_ERR_OUT_OF_RESOURCE;
>>> +        if (frag->retries < mca_pml_ob1.rdma_retries_limit) {
>>> +            frag->rdma_length = save_size;
>>> +            OPAL_THREAD_LOCK(&mca_pml_ob1.lock);
>>> +            opal_list_append(&mca_pml_ob1.rdma_pending, 
>>> (opal_list_item_t*)frag);
>>> +            OPAL_THREAD_UNLOCK(&mca_pml_ob1.lock);
>>> +            return OMPI_ERR_OUT_OF_RESOURCE;
>>> +        } else {
>>> +            ompi_proc_t *proc = (ompi_proc_t *) 
>>> recvreq->req_recv.req_base.req_proc;
>>> +
>>> +            /* tell peer to fall back on send */
>>> +            recvreq->req_send_offset = 0;
>>> +            rc = mca_pml_ob1_recv_request_ack_send(proc, 
>>> frag->rdma_hdr.hdr_rget.hdr_rndv.hdr_src_req.lval,
>>> +                                                   recvreq, 
>>> recvreq->req_send_offset, true);
>>> +            MCA_PML_OB1_RDMA_FRAG_RETURN(frag);
>>> +            return rc;
>>> +        }
>>>    }
>>> 
>>>    descriptor->des_src = frag->rdma_segs;
>>> @@ -393,6 +464,11 @@
>>>    /* queue up get request */
>>>    rc = mca_bml_base_get(bml_btl,descriptor);
>>>    if( OPAL_UNLIKELY(OMPI_SUCCESS != rc) ) {
>>> +        if (OPAL_UNLIKELY(OMPI_ERR_NOT_AVAILABLE == rc)) {
>>> +            /* get isn't supported for this transfer. tell peer to 
>>> fallback on put */
>>> +            rc = mca_pml_ob1_init_get_fallback (frag, descriptor);
>>> +         }
>>> +
>>>        if(OMPI_ERR_OUT_OF_RESOURCE == rc) {
>>>            mca_bml_base_free(bml_btl, descriptor);
>>>            OPAL_THREAD_LOCK(&mca_pml_ob1.lock);
>>> @@ -400,7 +476,7 @@
>>>                    (opal_list_item_t*)frag);
>>>            OPAL_THREAD_UNLOCK(&mca_pml_ob1.lock);
>>>            return OMPI_ERR_OUT_OF_RESOURCE;
>>> -        } else {
>>> +        } else if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) {
>>>            ORTE_ERROR_LOG(rc);
>>>            orte_errmgr.abort(-1, NULL);
>>>        }
>>> @@ -551,7 +627,9 @@
>>>        orte_errmgr.abort(-1, NULL);
>>>    }
>>> #endif /* OMPI_CUDA_SUPPORT */
>>> +
>>>    frag->rdma_hdr.hdr_rget = *hdr;
>>> +    frag->retries = 0;
>>>    frag->rdma_req = recvreq;
>>>    frag->rdma_ep = bml_endpoint;
>>>    frag->rdma_length = size;
>>> @@ -792,7 +870,7 @@
>>>        mca_bml_base_prepare_dst(bml_btl, reg,
>>>                                 &recvreq->req_recv.req_base.req_convertor,
>>>                                 MCA_BTL_NO_ORDER, 0, &size, 
>>> MCA_BTL_DES_FLAGS_BTL_OWNERSHIP |
>>> -                            MCA_BTL_DES_FLAGS_PUT, &dst);
>>> +                                 MCA_BTL_DES_FLAGS_PUT, &dst);
>>>        OPAL_THREAD_UNLOCK(&recvreq->lock);
>>> 
>>>        if(OPAL_UNLIKELY(dst == NULL)) {
>>> 
>>> Modified: trunk/ompi/mca/pml/ob1/pml_ob1_sendreq.c
>>> ==============================================================================
>>> --- trunk/ompi/mca/pml/ob1/pml_ob1_sendreq.c        (original)
>>> +++ trunk/ompi/mca/pml/ob1/pml_ob1_sendreq.c        2012-04-24 16:18:56 EDT 
>>> (Tue, 24 Apr 2012)
>>> @@ -264,6 +264,7 @@
>>>    MCA_PML_OB1_COMPUTE_SEGMENT_LENGTH( des->des_src, des->des_src_cnt,
>>>                                        0, req_bytes_delivered );
>>>    OPAL_THREAD_ADD_SIZE_T(&sendreq->req_bytes_delivered, 
>>> req_bytes_delivered);
>>> +    sendreq->src_des = NULL;
>>> 
>>>    send_request_pml_complete_check(sendreq);
>>>    /* free the descriptor */
>>> @@ -639,6 +640,8 @@
>>>    bool need_local_cb = false;
>>>    int rc;
>>> 
>>> +    sendreq->src_des = NULL;
>>> +
>>>    bml_btl = sendreq->req_rdma[0].bml_btl;
>>>    if((sendreq->req_rdma_cnt == 1) && (bml_btl->btl_flags & 
>>> (MCA_BTL_FLAGS_GET | MCA_BTL_FLAGS_CUDA_GET))) {
>>>        mca_mpool_base_registration_t* reg = sendreq->req_rdma[0].btl_reg;
>>> @@ -657,10 +660,8 @@
>>>        mca_bml_base_prepare_src( bml_btl,
>>>                                  reg,
>>>                                  &sendreq->req_send.req_base.req_convertor,
>>> -                                  MCA_BTL_NO_ORDER,
>>> -                                  0,
>>> -                                  &size,
>>> -                                  MCA_BTL_DES_FLAGS_GET,
>>> +                                  MCA_BTL_NO_ORDER, 0, &size,
>>> +                                  MCA_BTL_DES_FLAGS_GET | 
>>> MCA_BTL_DES_FLAGS_BTL_OWNERSHIP,
>>>                                  &src );
>>>        MEMCHECKER(
>>>            memchecker_call(&opal_memchecker_base_mem_noaccess,
>>> @@ -676,6 +677,8 @@
>>>        src->des_cbfunc = mca_pml_ob1_rget_completion;
>>>        src->des_cbdata = sendreq;
>>> 
>>> +        sendreq->src_des = src;
>>> +
>>>        /* allocate space for get hdr + segment list */
>>>        mca_bml_base_alloc(bml_btl, &des, MCA_BTL_NO_ORDER,
>>>                           sizeof(mca_pml_ob1_rget_hdr_t) +
>>> @@ -782,8 +785,9 @@
>>>        return OMPI_SUCCESS;
>>>    }
>>>    mca_bml_base_free(bml_btl, des);
>>> -    if (NULL != src) {
>>> -        mca_bml_base_free (bml_btl, src);
>>> +    if (sendreq->src_des) {
>>> +        mca_bml_base_free (bml_btl, sendreq->src_des);
>>> +        sendreq->src_des = NULL;
>>>    }
>>> 
>>>    return rc;
>>> @@ -1133,63 +1137,71 @@
>>>    MCA_PML_OB1_PROGRESS_PENDING(bml_btl);
>>> }
>>> 
>>> -int mca_pml_ob1_send_request_put_frag( mca_pml_ob1_rdma_frag_t* frag )
>>> -{
>>> -    mca_mpool_base_registration_t* reg = NULL;
>>> -    mca_bml_base_btl_t* bml_btl = frag->rdma_bml;
>>> -    mca_btl_base_descriptor_t* des;
>>> +int mca_pml_ob1_send_request_put_frag( mca_pml_ob1_rdma_frag_t *frag )
>>> +{
>>> +    mca_pml_ob1_send_request_t* sendreq = 
>>> (mca_pml_ob1_send_request_t*)frag->rdma_req;
>>> +    mca_mpool_base_registration_t *reg = NULL;
>>> +    mca_bml_base_btl_t *bml_btl = frag->rdma_bml;
>>> +    mca_btl_base_descriptor_t *des;
>>>    size_t save_size = frag->rdma_length;
>>>    int rc;
>>> 
>>> -    /* setup descriptor */
>>> -    mca_bml_base_prepare_src( bml_btl,
>>> -                              reg,
>>> -                              &frag->convertor,
>>> -                              MCA_BTL_NO_ORDER,
>>> -                              0,
>>> -                              &frag->rdma_length,
>>> -                              MCA_BTL_DES_FLAGS_BTL_OWNERSHIP |
>>> -                              MCA_BTL_DES_FLAGS_PUT,
>>> -                              &des );
>>> +    if (OPAL_LIKELY(NULL == sendreq->src_des)) {
>>> +        /* setup descriptor */
>>> +        mca_bml_base_prepare_src( bml_btl,
>>> +                                  reg,
>>> +                                  &frag->convertor,
>>> +                                  MCA_BTL_NO_ORDER,
>>> +                                  0,
>>> +                                  &frag->rdma_length,
>>> +                                  MCA_BTL_DES_FLAGS_BTL_OWNERSHIP |
>>> +                                  MCA_BTL_DES_FLAGS_PUT,
>>> +                                  &des );
>>> 
>>> -    if( OPAL_UNLIKELY(NULL == des) ) {
>>> -        if(frag->retries < mca_pml_ob1.rdma_put_retries_limit) {
>>> -            size_t offset = 
>>> (size_t)frag->rdma_hdr.hdr_rdma.hdr_rdma_offset;
>>> -            frag->rdma_length = save_size;
>>> -            opal_convertor_set_position(&frag->convertor, &offset);
>>> -            OPAL_THREAD_LOCK(&mca_pml_ob1.lock);
>>> -            opal_list_append(&mca_pml_ob1.rdma_pending, 
>>> (opal_list_item_t*)frag);
>>> -            OPAL_THREAD_UNLOCK(&mca_pml_ob1.lock);
>>> -        } else {
>>> -            mca_pml_ob1_send_request_t *sendreq =
>>> -                (mca_pml_ob1_send_request_t*)frag->rdma_req;
>>> +        if( OPAL_UNLIKELY(NULL == des) ) {
>>> +            if(frag->retries < mca_pml_ob1.rdma_retries_limit) {
>>> +                size_t offset = 
>>> (size_t)frag->rdma_hdr.hdr_rdma.hdr_rdma_offset;
>>> +                frag->rdma_length = save_size;
>>> +                opal_convertor_set_position(&frag->convertor, &offset);
>>> +                OPAL_THREAD_LOCK(&mca_pml_ob1.lock);
>>> +                opal_list_append(&mca_pml_ob1.rdma_pending, 
>>> (opal_list_item_t*)frag);
>>> +                OPAL_THREAD_UNLOCK(&mca_pml_ob1.lock);
>>> +            } else {
>>> +                mca_pml_ob1_send_request_t *sendreq =
>>> +                    (mca_pml_ob1_send_request_t*)frag->rdma_req;
>>> +
>>> +                /* tell receiver to unregister memory */
>>> +                mca_pml_ob1_send_fin(sendreq->req_send.req_base.req_proc,
>>> +                                     bml_btl, 
>>> frag->rdma_hdr.hdr_rdma.hdr_des,
>>> +                                     MCA_BTL_NO_ORDER, 1);
>>> +
>>> +                /* send fragment by copy in/out */
>>> +                mca_pml_ob1_send_request_copy_in_out(sendreq,
>>> +                                                     
>>> frag->rdma_hdr.hdr_rdma.hdr_rdma_offset, frag->rdma_length);
>>> +                /* if a pointer to a receive request is not set it means 
>>> that
>>> +                 * ACK was not yet received. Don't schedule sends before 
>>> ACK */
>>> +                if(NULL != sendreq->req_recv.pval)
>>> +                    mca_pml_ob1_send_request_schedule(sendreq);
>>> +            }
>>> 
>>> -            /* tell receiver to unregister memory */
>>> -            mca_pml_ob1_send_fin(sendreq->req_send.req_base.req_proc,
>>> -                    bml_btl, frag->rdma_hdr.hdr_rdma.hdr_des,
>>> -                    MCA_BTL_NO_ORDER, 1);
>>> -
>>> -            /* send fragment by copy in/out */
>>> -            mca_pml_ob1_send_request_copy_in_out(sendreq,
>>> -                    frag->rdma_hdr.hdr_rdma.hdr_rdma_offset, 
>>> frag->rdma_length);
>>> -            /* if a pointer to a receive request is not set it means that
>>> -             * ACK was not yet received. Don't schedule sends before ACK */
>>> -            if(NULL != sendreq->req_recv.pval)
>>> -                mca_pml_ob1_send_request_schedule(sendreq);
>>> +            return OMPI_ERR_OUT_OF_RESOURCE;
>>>        }
>>> -        return OMPI_ERR_OUT_OF_RESOURCE;
>>> +    } else {
>>> +        /* already have a source descriptor */
>>> +        des = sendreq->src_des;
>>> +        sendreq->src_des = NULL;
>>>    }
>>> -
>>> -    des->des_dst = frag->rdma_segs;
>>> +
>>> +    des->des_dst     = frag->rdma_segs;
>>>    des->des_dst_cnt = frag->rdma_hdr.hdr_rdma.hdr_seg_cnt;
>>> -    des->des_cbfunc = mca_pml_ob1_put_completion;
>>> -    des->des_cbdata = frag;
>>> +    des->des_cbfunc  = mca_pml_ob1_put_completion;
>>> +    des->des_cbdata  = frag;
>>> 
>>>    PERUSE_TRACE_COMM_OMPI_EVENT( PERUSE_COMM_REQ_XFER_CONTINUE,
>>>                                  
>>> &(((mca_pml_ob1_send_request_t*)frag->rdma_req)->req_send.req_base), 
>>> save_size, PERUSE_SEND );
>>> 
>>>    rc = mca_bml_base_put(bml_btl, des);
>>> -    if( OPAL_UNLIKELY(OMPI_SUCCESS != rc) ) {
>>> +    if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) {
>>>        mca_bml_base_free(bml_btl, des);
>>>        frag->rdma_length = save_size;
>>>        if(OMPI_ERR_OUT_OF_RESOURCE == rc) {
>>> @@ -1203,6 +1215,7 @@
>>>            orte_errmgr.abort(-1, NULL);
>>>        }
>>>    }
>>> +
>>>    return OMPI_SUCCESS;
>>> }
>>> 
>>> @@ -1261,21 +1274,25 @@
>>>    frag->reg = NULL;
>>>    frag->retries = 0;
>>> 
>>> +    if (OPAL_UNLIKELY(NULL != sendreq->src_des)) {
>>> +        /* get fallback path */
>>> +        sendreq->req_state = 0;
>>> +    }
>>> +
>>>    /* lookup the corresponding registration */
>>>    for(i=0; i<sendreq->req_rdma_cnt; i++) {
>>> -       if(sendreq->req_rdma[i].bml_btl == frag->rdma_bml) {
>>> -           frag->reg = sendreq->req_rdma[i].btl_reg;
>>> -           break;
>>> -       }
>>> -    }
>>> +        if(sendreq->req_rdma[i].bml_btl == frag->rdma_bml) {
>>> +            frag->reg = sendreq->req_rdma[i].btl_reg;
>>> +            break;
>>> +        }
>>> +    }
>>> 
>>>    /*  RDMA writes may proceed in parallel to send and to each other, so
>>>     *  create clone of the convertor for each RDMA fragment
>>>     */
>>>    size = hdr->hdr_rdma_offset;
>>>    
>>> opal_convertor_clone_with_position(&sendreq->req_send.req_base.req_convertor,
>>> -            &frag->convertor, 0, &size);
>>> +                                       &frag->convertor, 0, &size);
>>> 
>>>    mca_pml_ob1_send_request_put_frag(frag);
>>> }
>>> -
>>> 
>>> Modified: trunk/ompi/mca/pml/ob1/pml_ob1_sendreq.h
>>> ==============================================================================
>>> --- trunk/ompi/mca/pml/ob1/pml_ob1_sendreq.h        (original)
>>> +++ trunk/ompi/mca/pml/ob1/pml_ob1_sendreq.h        2012-04-24 16:18:56 EDT 
>>> (Tue, 24 Apr 2012)
>>> @@ -54,6 +54,7 @@
>>>    mca_pml_ob1_send_pending_t req_pending;
>>>    opal_mutex_t req_send_range_lock;
>>>    opal_list_t req_send_ranges;
>>> +    mca_btl_base_descriptor_t *src_des;
>>>    mca_pml_ob1_com_btl_t req_rdma[1];
>>> };
>>> typedef struct mca_pml_ob1_send_request_t mca_pml_ob1_send_request_t;
>>> @@ -129,6 +130,7 @@
>>>            OMPI_FREE_LIST_WAIT(&mca_pml_base_send_requests, item, rc); \
>>>            sendreq = (mca_pml_ob1_send_request_t*)item;                \
>>>            sendreq->req_send.req_base.req_proc = proc;                 \
>>> +            sendreq->src_des = NULL;                                    \
>>>        }                                                               \
>>>    }
>>> 
>>> _______________________________________________
>>> svn-full mailing list
>>> svn-f...@open-mpi.org
>>> http://www.open-mpi.org/mailman/listinfo.cgi/svn-full
>> 
>> 
>> -- 
>> Jeff Squyres
>> jsquy...@cisco.com
>> For corporate legal information go to: 
>> http://www.cisco.com/web/about/doing_business/legal/cri/
>> 
>> 
>> _______________________________________________
>> devel mailing list
>> de...@open-mpi.org
>> http://www.open-mpi.org/mailman/listinfo.cgi/devel
>> 
> _______________________________________________
> devel mailing list
> de...@open-mpi.org
> http://www.open-mpi.org/mailman/listinfo.cgi/devel


-- 
Jeff Squyres
jsquy...@cisco.com
For corporate legal information go to: 
http://www.cisco.com/web/about/doing_business/legal/cri/


Reply via email to