This was RFC'd last month. No one objected :)

-Nathan

On Tue, 24 Apr 2012, Jeffrey Squyres wrote:

There are some pretty extensive ob1 changes in here.

Can we get these reviewed?  Brian / George?


On Apr 24, 2012, at 4:18 PM, hje...@osl.iu.edu wrote:

Author: hjelmn
Date: 2012-04-24 16:18:56 EDT (Tue, 24 Apr 2012)
New Revision: 26329
URL: https://svn.open-mpi.org/trac/ompi/changeset/26329

Log:
ob1: add support for get fallback on put/send
Text files modified:
  trunk/ompi/mca/btl/ugni/btl_ugni_get.c     |    17 ----
  trunk/ompi/mca/btl/ugni/btl_ugni_put.c     |    48 --------------
  trunk/ompi/mca/btl/ugni/btl_ugni_rdma.h    |     7 --
  trunk/ompi/mca/btl/ugni/btl_ugni_smsg.c    |     5 -
  trunk/ompi/mca/btl/ugni/btl_ugni_smsg.h    |     1
  trunk/ompi/mca/pml/ob1/pml_ob1.c           |     5 +
  trunk/ompi/mca/pml/ob1/pml_ob1.h           |     2
  trunk/ompi/mca/pml/ob1/pml_ob1_component.c |     4
  trunk/ompi/mca/pml/ob1/pml_ob1_recvfrag.c  |    15 +++-
  trunk/ompi/mca/pml/ob1/pml_ob1_recvreq.c   |    94 
++++++++++++++++++++++++++--
  trunk/ompi/mca/pml/ob1/pml_ob1_sendreq.c   |   131 
++++++++++++++++++++++-----------------
  trunk/ompi/mca/pml/ob1/pml_ob1_sendreq.h   |     2
  12 files changed, 182 insertions(+), 149 deletions(-)

Modified: trunk/ompi/mca/btl/ugni/btl_ugni_get.c
==============================================================================
--- trunk/ompi/mca/btl/ugni/btl_ugni_get.c      (original)
+++ trunk/ompi/mca/btl/ugni/btl_ugni_get.c      2012-04-24 16:18:56 EDT (Tue, 
24 Apr 2012)
@@ -13,19 +13,6 @@
#include "btl_ugni_rdma.h"
#include "btl_ugni_smsg.h"

-static int mca_btl_ugni_init_put (struct mca_btl_base_module_t *btl,
-                                  mca_btl_ugni_base_frag_t *frag) {
-    /* off alignment/off size. switch to put */
-    frag->hdr.rdma.src_seg = frag->base.des_src[0];
-    frag->hdr.rdma.dst_seg = frag->base.des_dst[0];
-    frag->hdr.rdma.ctx     = (void *) frag;
-
-    /* send the fragment header using smsg. ignore local completion */
-    return ompi_mca_btl_ugni_smsg_send (frag, true, &frag->hdr.rdma,
-                                        sizeof (frag->hdr.rdma), NULL, 0,
-                                        MCA_BTL_UGNI_TAG_PUT_INIT);
-}
-
/**
 * Initiate a get operation.
 *
@@ -54,7 +41,7 @@

    if (OPAL_UNLIKELY(check || size > mca_btl_ugni_component.ugni_get_limit)) {
        /* switch to put */
-        return mca_btl_ugni_init_put (btl, frag);
+        return OMPI_ERR_NOT_AVAILABLE;
    }

    if (NULL != frag->base.des_cbfunc) {
@@ -68,7 +55,7 @@
    return mca_btl_ugni_post_bte (frag, GNI_POST_RDMA_GET, des->des_dst, 
des->des_src);
}

-void mca_btl_ugni_callback_rdma_complete (mca_btl_ugni_base_frag_t *frag, int 
rc)
+static void mca_btl_ugni_callback_rdma_complete (mca_btl_ugni_base_frag_t 
*frag, int rc)
{
    BTL_VERBOSE(("rdma operation for rem_ctx %p complete", frag->hdr.rdma.ctx));


Modified: trunk/ompi/mca/btl/ugni/btl_ugni_put.c
==============================================================================
--- trunk/ompi/mca/btl/ugni/btl_ugni_put.c      (original)
+++ trunk/ompi/mca/btl/ugni/btl_ugni_put.c      2012-04-24 16:18:56 EDT (Tue, 
24 Apr 2012)
@@ -46,51 +46,3 @@

    return mca_btl_ugni_post_bte (frag, GNI_POST_RDMA_PUT, des->des_src, 
des->des_dst);
}
-
-/* reversed get */
-static void mca_btl_ugni_callback_put_retry (mca_btl_ugni_base_frag_t *frag, 
int rc)
-{
-    (void) mca_btl_ugni_start_put(frag->endpoint, frag->hdr.rdma, frag);
-}
-
-int mca_btl_ugni_start_put (mca_btl_base_endpoint_t *ep,
-                            mca_btl_ugni_rdma_frag_hdr_t hdr,
-                            mca_btl_ugni_base_frag_t *frag)
-{
-    int rc;
-
-    BTL_VERBOSE(("starting reverse get (put) for remote ctx: %p", hdr.ctx));
-
-    if (NULL == frag) {
-        rc = MCA_BTL_UGNI_FRAG_ALLOC_RDMA_INT(ep, frag);
-        if (OPAL_UNLIKELY(NULL == frag)) {
-            BTL_ERROR(("error allocating rdma frag for reverse get. rc = %d. 
fl_num_allocated = %d", rc,
-                       ep->btl->rdma_int_frags.fl_num_allocated));
-            return rc;
-        }
-    }
-
-    frag->hdr.rdma = hdr;
-
-    frag->base.des_cbfunc = NULL;
-    frag->base.des_flags  = MCA_BTL_DES_FLAGS_BTL_OWNERSHIP;
-
-    frag->segments[0] = hdr.src_seg;
-    frag->base.des_src = frag->segments;
-    frag->base.des_src_cnt = 1;
-
-    frag->segments[1] = hdr.dst_seg;
-    frag->base.des_dst = frag->segments + 1;
-    frag->base.des_dst_cnt = 1;
-
-    rc = mca_btl_ugni_put (&ep->btl->super, ep, &frag->base);
-    if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) {
-        frag->cbfunc = mca_btl_ugni_callback_put_retry;
-        opal_list_append (&ep->btl->failed_frags, (opal_list_item_t *) frag);
-        return rc;
-    }
-
-    frag->cbfunc = mca_btl_ugni_callback_rdma_complete;
-
-    return OMPI_SUCCESS;
-}

Modified: trunk/ompi/mca/btl/ugni/btl_ugni_rdma.h
==============================================================================
--- trunk/ompi/mca/btl/ugni/btl_ugni_rdma.h     (original)
+++ trunk/ompi/mca/btl/ugni/btl_ugni_rdma.h     2012-04-24 16:18:56 EDT (Tue, 
24 Apr 2012)
@@ -16,17 +16,10 @@
#include "btl_ugni.h"
#include "btl_ugni_frag.h"

-/* mca_btl_ugni_start_put: get operation could not be completed. start put 
instead */
-int mca_btl_ugni_start_put (mca_btl_base_endpoint_t *ep,
-                            mca_btl_ugni_rdma_frag_hdr_t hdr,
-                            mca_btl_ugni_base_frag_t *frag);
-
int mca_btl_ugni_start_eager_get (mca_btl_base_endpoint_t *ep,
                                  mca_btl_ugni_eager_ex_frag_hdr_t hdr,
                                  mca_btl_ugni_base_frag_t *frag);

-void mca_btl_ugni_callback_rdma_complete (mca_btl_ugni_base_frag_t *frag, int 
rc);
-
static inline int init_gni_post_desc(mca_btl_ugni_base_frag_t *frag,
                                     gni_post_type_t op_type,
                                     uint64_t lcl_addr,

Modified: trunk/ompi/mca/btl/ugni/btl_ugni_smsg.c
==============================================================================
--- trunk/ompi/mca/btl/ugni/btl_ugni_smsg.c     (original)
+++ trunk/ompi/mca/btl/ugni/btl_ugni_smsg.c     2012-04-24 16:18:56 EDT (Tue, 
24 Apr 2012)
@@ -78,11 +78,6 @@
            reg->cbfunc(&ep->btl->super, tag, &(frag.base), reg->cbdata);

            break;
-        case MCA_BTL_UGNI_TAG_PUT_INIT:
-            frag.hdr.rdma = ((mca_btl_ugni_rdma_frag_hdr_t *) data_ptr)[0];
-
-            mca_btl_ugni_start_put (ep, frag.hdr.rdma, NULL);
-            break;
        case MCA_BTL_UGNI_TAG_GET_INIT:
            frag.hdr.eager_ex = ((mca_btl_ugni_eager_ex_frag_hdr_t *) 
data_ptr)[0];


Modified: trunk/ompi/mca/btl/ugni/btl_ugni_smsg.h
==============================================================================
--- trunk/ompi/mca/btl/ugni/btl_ugni_smsg.h     (original)
+++ trunk/ompi/mca/btl/ugni/btl_ugni_smsg.h     2012-04-24 16:18:56 EDT (Tue, 
24 Apr 2012)
@@ -21,7 +21,6 @@
typedef enum {
    MCA_BTL_UGNI_TAG_SEND,
    MCA_BTL_UGNI_TAG_DISCONNECT,
-    MCA_BTL_UGNI_TAG_PUT_INIT,
    MCA_BTL_UGNI_TAG_GET_INIT,
    MCA_BTL_UGNI_TAG_RDMA_COMPLETE
} mca_btl_ugni_smsg_tag_t;

Modified: trunk/ompi/mca/pml/ob1/pml_ob1.c
==============================================================================
--- trunk/ompi/mca/pml/ob1/pml_ob1.c    (original)
+++ trunk/ompi/mca/pml/ob1/pml_ob1.c    2012-04-24 16:18:56 EDT (Tue, 24 Apr 
2012)
@@ -147,6 +147,7 @@
    OBJ_CONSTRUCT(&mca_pml_ob1.recv_pending, opal_list_t);
    OBJ_CONSTRUCT(&mca_pml_ob1.pckt_pending, opal_list_t);
    OBJ_CONSTRUCT(&mca_pml_ob1.rdma_pending, opal_list_t);
+
    /* missing communicator pending list */
    OBJ_CONSTRUCT(&mca_pml_ob1.non_existing_communicator_pending, opal_list_t);

@@ -599,8 +600,10 @@
        OPAL_THREAD_UNLOCK(&mca_pml_ob1.lock);
        if(NULL == frag)
            break;
+
+        frag->retries++;
+
        if(frag->rdma_state == MCA_PML_OB1_RDMA_PUT) {
-            frag->retries++;
            rc = mca_pml_ob1_send_request_put_frag(frag);
        } else {
            rc = mca_pml_ob1_recv_request_get_frag(frag);

Modified: trunk/ompi/mca/pml/ob1/pml_ob1.h
==============================================================================
--- trunk/ompi/mca/pml/ob1/pml_ob1.h    (original)
+++ trunk/ompi/mca/pml/ob1/pml_ob1.h    2012-04-24 16:18:56 EDT (Tue, 24 Apr 
2012)
@@ -52,7 +52,7 @@
    int free_list_inc;      /* number of elements to grow free list */
    size_t send_pipeline_depth;
    size_t recv_pipeline_depth;
-    size_t rdma_put_retries_limit;
+    size_t rdma_retries_limit;
    int max_rdma_per_request;
    int max_send_per_range;
    bool leave_pinned;

Modified: trunk/ompi/mca/pml/ob1/pml_ob1_component.c
==============================================================================
--- trunk/ompi/mca/pml/ob1/pml_ob1_component.c  (original)
+++ trunk/ompi/mca/pml/ob1/pml_ob1_component.c  2012-04-24 16:18:56 EDT (Tue, 
24 Apr 2012)
@@ -112,8 +112,8 @@
        mca_pml_ob1_param_register_int("send_pipeline_depth", 3);
    mca_pml_ob1.recv_pipeline_depth =
        mca_pml_ob1_param_register_int("recv_pipeline_depth", 4);
-    mca_pml_ob1.rdma_put_retries_limit =
-        mca_pml_ob1_param_register_int("rdma_put_retries_limit", 5);
+    mca_pml_ob1.rdma_retries_limit =
+        mca_pml_ob1_param_register_int("rdma_retries_limit", 5);
    mca_pml_ob1.max_rdma_per_request =
        mca_pml_ob1_param_register_int("max_rdma_per_request", 4);
    mca_pml_ob1.max_send_per_range =

Modified: trunk/ompi/mca/pml/ob1/pml_ob1_recvfrag.c
==============================================================================
--- trunk/ompi/mca/pml/ob1/pml_ob1_recvfrag.c   (original)
+++ trunk/ompi/mca/pml/ob1/pml_ob1_recvfrag.c   2012-04-24 16:18:56 EDT (Tue, 
24 Apr 2012)
@@ -294,15 +294,22 @@
    if( OPAL_UNLIKELY(segments->seg_len < sizeof(mca_pml_ob1_common_hdr_t)) ) {
         return;
    }
-
+
    ob1_hdr_ntoh(hdr, MCA_PML_OB1_HDR_TYPE_ACK);
    sendreq = (mca_pml_ob1_send_request_t*)hdr->hdr_ack.hdr_src_req.pval;
    sendreq->req_recv = hdr->hdr_ack.hdr_dst_req;
-
+
    /* if the request should be delivered entirely by copy in/out
     * then throttle sends */
-    if(hdr->hdr_common.hdr_flags & MCA_PML_OB1_HDR_FLAGS_NORDMA)
+    if(hdr->hdr_common.hdr_flags & MCA_PML_OB1_HDR_FLAGS_NORDMA) {
+        if (NULL != sendreq->src_des) {
+            /* release registered memory */
+            mca_bml_base_free (sendreq->req_rdma[0].bml_btl, sendreq->src_des);
+            sendreq->src_des = NULL;
+        }
+
        sendreq->req_throttle_sends = true;
+    }

    mca_pml_ob1_send_request_copy_in_out(sendreq,
                                         hdr->hdr_ack.hdr_send_offset,
@@ -324,7 +331,7 @@

    if(send_request_pml_complete_check(sendreq) == false)
        mca_pml_ob1_send_request_schedule(sendreq);
-
+
    return;
}


Modified: trunk/ompi/mca/pml/ob1/pml_ob1_recvreq.c
==============================================================================
--- trunk/ompi/mca/pml/ob1/pml_ob1_recvreq.c    (original)
+++ trunk/ompi/mca/pml/ob1/pml_ob1_recvreq.c    2012-04-24 16:18:56 EDT (Tue, 
24 Apr 2012)
@@ -352,6 +352,66 @@
}


+static int mca_pml_ob1_init_get_fallback (mca_pml_ob1_rdma_frag_t *frag,
+                                          mca_btl_base_descriptor_t *dst) {
+    mca_pml_ob1_recv_request_t *recvreq = (mca_pml_ob1_recv_request_t *) 
frag->rdma_req;
+    mca_bml_base_btl_t *bml_btl = frag->rdma_bml;
+    mca_btl_base_descriptor_t *ctl;
+    mca_pml_ob1_rdma_hdr_t *hdr;
+    size_t hdr_size;
+    unsigned int i;
+    int rc;
+
+    /* prepare a descriptor for rdma control message */
+    hdr_size = sizeof (mca_pml_ob1_rdma_hdr_t);
+    if (dst->des_dst_cnt > 1) {
+        hdr_size += (sizeof (mca_btl_base_segment_t) *
+                     (dst->des_dst_cnt-1));
+    }
+
+    mca_bml_base_alloc (bml_btl, &ctl, MCA_BTL_NO_ORDER, hdr_size,
+                        MCA_BTL_DES_FLAGS_PRIORITY | 
MCA_BTL_DES_FLAGS_BTL_OWNERSHIP |
+                        MCA_BTL_DES_SEND_ALWAYS_CALLBACK);
+    if (OPAL_UNLIKELY(NULL == ctl)) {
+        return OMPI_ERR_OUT_OF_RESOURCE;
+    }
+    ctl->des_cbfunc = mca_pml_ob1_recv_ctl_completion;
+
+    /* fill in rdma header */
+    hdr = (mca_pml_ob1_rdma_hdr_t *) ctl->des_src->seg_addr.pval;
+    hdr->hdr_common.hdr_type = MCA_PML_OB1_HDR_TYPE_PUT;
+    hdr->hdr_common.hdr_flags =
+        (!recvreq->req_ack_sent) ? MCA_PML_OB1_HDR_TYPE_ACK : 0;
+
+    hdr->hdr_req = frag->rdma_hdr.hdr_rget.hdr_rndv.hdr_src_req;
+    hdr->hdr_rdma_offset = recvreq->req_rdma_offset;
+    hdr->hdr_des.pval = dst;
+
+    hdr->hdr_seg_cnt = dst->des_dst_cnt;
+
+    for (i = 0 ; i < dst->des_dst_cnt ; ++i) {
+        hdr->hdr_segs[i].seg_addr.lval = 
ompi_ptr_ptol(dst->des_dst[i].seg_addr.pval);
+        hdr->hdr_segs[i].seg_len       = dst->des_dst[i].seg_len;
+        hdr->hdr_segs[i].seg_key.key64[0] = dst->des_dst[i].seg_key.key64[0];
+        hdr->hdr_segs[i].seg_key.key64[1] = dst->des_dst[i].seg_key.key64[1];
+    }
+
+    dst->des_cbfunc = mca_pml_ob1_put_completion;
+    dst->des_cbdata = recvreq;
+
+    if (!recvreq->req_ack_sent)
+        recvreq->req_ack_sent = true;
+
+    /* send rdma request to peer */
+    rc = mca_bml_base_send (bml_btl, ctl, MCA_PML_OB1_HDR_TYPE_PUT);
+    if (OPAL_UNLIKELY(rc < 0)) {
+        mca_bml_base_free (bml_btl, ctl);
+        return rc;
+    }
+
+    return OMPI_SUCCESS;
+}
+
/*
 *
 */
@@ -371,14 +431,25 @@
                              0,
                              &frag->rdma_length,
                              MCA_BTL_DES_FLAGS_BTL_OWNERSHIP | 
MCA_BTL_DES_SEND_ALWAYS_CALLBACK |
-                             MCA_BTL_DES_FLAGS_GET,
+                              MCA_BTL_DES_FLAGS_GET,
                              &descriptor );
    if( OPAL_UNLIKELY(NULL == descriptor) ) {
-        frag->rdma_length = save_size;
-        OPAL_THREAD_LOCK(&mca_pml_ob1.lock);
-        opal_list_append(&mca_pml_ob1.rdma_pending, (opal_list_item_t*)frag);
-        OPAL_THREAD_UNLOCK(&mca_pml_ob1.lock);
-        return OMPI_ERR_OUT_OF_RESOURCE;
+        if (frag->retries < mca_pml_ob1.rdma_retries_limit) {
+            frag->rdma_length = save_size;
+            OPAL_THREAD_LOCK(&mca_pml_ob1.lock);
+            opal_list_append(&mca_pml_ob1.rdma_pending, 
(opal_list_item_t*)frag);
+            OPAL_THREAD_UNLOCK(&mca_pml_ob1.lock);
+            return OMPI_ERR_OUT_OF_RESOURCE;
+        } else {
+            ompi_proc_t *proc = (ompi_proc_t *) 
recvreq->req_recv.req_base.req_proc;
+
+            /* tell peer to fall back on send */
+            recvreq->req_send_offset = 0;
+            rc = mca_pml_ob1_recv_request_ack_send(proc, 
frag->rdma_hdr.hdr_rget.hdr_rndv.hdr_src_req.lval,
+                                                   recvreq, 
recvreq->req_send_offset, true);
+            MCA_PML_OB1_RDMA_FRAG_RETURN(frag);
+            return rc;
+        }
    }

    descriptor->des_src = frag->rdma_segs;
@@ -393,6 +464,11 @@
    /* queue up get request */
    rc = mca_bml_base_get(bml_btl,descriptor);
    if( OPAL_UNLIKELY(OMPI_SUCCESS != rc) ) {
+        if (OPAL_UNLIKELY(OMPI_ERR_NOT_AVAILABLE == rc)) {
+            /* get isn't supported for this transfer. tell peer to fallback on 
put */
+            rc = mca_pml_ob1_init_get_fallback (frag, descriptor);
+         }
+
        if(OMPI_ERR_OUT_OF_RESOURCE == rc) {
            mca_bml_base_free(bml_btl, descriptor);
            OPAL_THREAD_LOCK(&mca_pml_ob1.lock);
@@ -400,7 +476,7 @@
                    (opal_list_item_t*)frag);
            OPAL_THREAD_UNLOCK(&mca_pml_ob1.lock);
            return OMPI_ERR_OUT_OF_RESOURCE;
-        } else {
+        } else if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) {
            ORTE_ERROR_LOG(rc);
            orte_errmgr.abort(-1, NULL);
        }
@@ -551,7 +627,9 @@
        orte_errmgr.abort(-1, NULL);
    }
#endif /* OMPI_CUDA_SUPPORT */
+
    frag->rdma_hdr.hdr_rget = *hdr;
+    frag->retries = 0;
    frag->rdma_req = recvreq;
    frag->rdma_ep = bml_endpoint;
    frag->rdma_length = size;
@@ -792,7 +870,7 @@
        mca_bml_base_prepare_dst(bml_btl, reg,
                                 &recvreq->req_recv.req_base.req_convertor,
                                 MCA_BTL_NO_ORDER, 0, &size, 
MCA_BTL_DES_FLAGS_BTL_OWNERSHIP |
-                                MCA_BTL_DES_FLAGS_PUT, &dst);
+                                 MCA_BTL_DES_FLAGS_PUT, &dst);
        OPAL_THREAD_UNLOCK(&recvreq->lock);

        if(OPAL_UNLIKELY(dst == NULL)) {

Modified: trunk/ompi/mca/pml/ob1/pml_ob1_sendreq.c
==============================================================================
--- trunk/ompi/mca/pml/ob1/pml_ob1_sendreq.c    (original)
+++ trunk/ompi/mca/pml/ob1/pml_ob1_sendreq.c    2012-04-24 16:18:56 EDT (Tue, 
24 Apr 2012)
@@ -264,6 +264,7 @@
    MCA_PML_OB1_COMPUTE_SEGMENT_LENGTH( des->des_src, des->des_src_cnt,
                                        0, req_bytes_delivered );
    OPAL_THREAD_ADD_SIZE_T(&sendreq->req_bytes_delivered, req_bytes_delivered);
+    sendreq->src_des = NULL;

    send_request_pml_complete_check(sendreq);
    /* free the descriptor */
@@ -639,6 +640,8 @@
    bool need_local_cb = false;
    int rc;

+    sendreq->src_des = NULL;
+
    bml_btl = sendreq->req_rdma[0].bml_btl;
    if((sendreq->req_rdma_cnt == 1) && (bml_btl->btl_flags & (MCA_BTL_FLAGS_GET 
| MCA_BTL_FLAGS_CUDA_GET))) {
        mca_mpool_base_registration_t* reg = sendreq->req_rdma[0].btl_reg;
@@ -657,10 +660,8 @@
        mca_bml_base_prepare_src( bml_btl,
                                  reg,
                                  &sendreq->req_send.req_base.req_convertor,
-                                  MCA_BTL_NO_ORDER,
-                                  0,
-                                  &size,
-                                  MCA_BTL_DES_FLAGS_GET,
+                                  MCA_BTL_NO_ORDER, 0, &size,
+                                  MCA_BTL_DES_FLAGS_GET | 
MCA_BTL_DES_FLAGS_BTL_OWNERSHIP,
                                  &src );
        MEMCHECKER(
            memchecker_call(&opal_memchecker_base_mem_noaccess,
@@ -676,6 +677,8 @@
        src->des_cbfunc = mca_pml_ob1_rget_completion;
        src->des_cbdata = sendreq;

+        sendreq->src_des = src;
+
        /* allocate space for get hdr + segment list */
        mca_bml_base_alloc(bml_btl, &des, MCA_BTL_NO_ORDER,
                           sizeof(mca_pml_ob1_rget_hdr_t) +
@@ -782,8 +785,9 @@
        return OMPI_SUCCESS;
    }
    mca_bml_base_free(bml_btl, des);
-    if (NULL != src) {
-        mca_bml_base_free (bml_btl, src);
+    if (sendreq->src_des) {
+        mca_bml_base_free (bml_btl, sendreq->src_des);
+        sendreq->src_des = NULL;
    }

    return rc;
@@ -1133,63 +1137,71 @@
    MCA_PML_OB1_PROGRESS_PENDING(bml_btl);
}

-int mca_pml_ob1_send_request_put_frag( mca_pml_ob1_rdma_frag_t* frag )
-{
-    mca_mpool_base_registration_t* reg = NULL;
-    mca_bml_base_btl_t* bml_btl = frag->rdma_bml;
-    mca_btl_base_descriptor_t* des;
+int mca_pml_ob1_send_request_put_frag( mca_pml_ob1_rdma_frag_t *frag )
+{
+    mca_pml_ob1_send_request_t* sendreq = 
(mca_pml_ob1_send_request_t*)frag->rdma_req;
+    mca_mpool_base_registration_t *reg = NULL;
+    mca_bml_base_btl_t *bml_btl = frag->rdma_bml;
+    mca_btl_base_descriptor_t *des;
    size_t save_size = frag->rdma_length;
    int rc;

-    /* setup descriptor */
-    mca_bml_base_prepare_src( bml_btl,
-                              reg,
-                              &frag->convertor,
-                              MCA_BTL_NO_ORDER,
-                              0,
-                              &frag->rdma_length,
-                              MCA_BTL_DES_FLAGS_BTL_OWNERSHIP |
-                              MCA_BTL_DES_FLAGS_PUT,
-                              &des );
+    if (OPAL_LIKELY(NULL == sendreq->src_des)) {
+        /* setup descriptor */
+        mca_bml_base_prepare_src( bml_btl,
+                                  reg,
+                                  &frag->convertor,
+                                  MCA_BTL_NO_ORDER,
+                                  0,
+                                  &frag->rdma_length,
+                                  MCA_BTL_DES_FLAGS_BTL_OWNERSHIP |
+                                  MCA_BTL_DES_FLAGS_PUT,
+                                  &des );

-    if( OPAL_UNLIKELY(NULL == des) ) {
-        if(frag->retries < mca_pml_ob1.rdma_put_retries_limit) {
-            size_t offset = (size_t)frag->rdma_hdr.hdr_rdma.hdr_rdma_offset;
-            frag->rdma_length = save_size;
-            opal_convertor_set_position(&frag->convertor, &offset);
-            OPAL_THREAD_LOCK(&mca_pml_ob1.lock);
-            opal_list_append(&mca_pml_ob1.rdma_pending, 
(opal_list_item_t*)frag);
-            OPAL_THREAD_UNLOCK(&mca_pml_ob1.lock);
-        } else {
-            mca_pml_ob1_send_request_t *sendreq =
-                (mca_pml_ob1_send_request_t*)frag->rdma_req;
+        if( OPAL_UNLIKELY(NULL == des) ) {
+            if(frag->retries < mca_pml_ob1.rdma_retries_limit) {
+                size_t offset = 
(size_t)frag->rdma_hdr.hdr_rdma.hdr_rdma_offset;
+                frag->rdma_length = save_size;
+                opal_convertor_set_position(&frag->convertor, &offset);
+                OPAL_THREAD_LOCK(&mca_pml_ob1.lock);
+                opal_list_append(&mca_pml_ob1.rdma_pending, 
(opal_list_item_t*)frag);
+                OPAL_THREAD_UNLOCK(&mca_pml_ob1.lock);
+            } else {
+                mca_pml_ob1_send_request_t *sendreq =
+                    (mca_pml_ob1_send_request_t*)frag->rdma_req;
+
+                /* tell receiver to unregister memory */
+                mca_pml_ob1_send_fin(sendreq->req_send.req_base.req_proc,
+                                     bml_btl, frag->rdma_hdr.hdr_rdma.hdr_des,
+                                     MCA_BTL_NO_ORDER, 1);
+
+                /* send fragment by copy in/out */
+                mca_pml_ob1_send_request_copy_in_out(sendreq,
+                                                     
frag->rdma_hdr.hdr_rdma.hdr_rdma_offset, frag->rdma_length);
+                /* if a pointer to a receive request is not set it means that
+                 * ACK was not yet received. Don't schedule sends before ACK */
+                if(NULL != sendreq->req_recv.pval)
+                    mca_pml_ob1_send_request_schedule(sendreq);
+            }

-            /* tell receiver to unregister memory */
-            mca_pml_ob1_send_fin(sendreq->req_send.req_base.req_proc,
-                    bml_btl, frag->rdma_hdr.hdr_rdma.hdr_des,
-                    MCA_BTL_NO_ORDER, 1);
-
-            /* send fragment by copy in/out */
-            mca_pml_ob1_send_request_copy_in_out(sendreq,
-                    frag->rdma_hdr.hdr_rdma.hdr_rdma_offset, 
frag->rdma_length);
-            /* if a pointer to a receive request is not set it means that
-             * ACK was not yet received. Don't schedule sends before ACK */
-            if(NULL != sendreq->req_recv.pval)
-                mca_pml_ob1_send_request_schedule(sendreq);
+            return OMPI_ERR_OUT_OF_RESOURCE;
        }
-        return OMPI_ERR_OUT_OF_RESOURCE;
+    } else {
+        /* already have a source descriptor */
+        des = sendreq->src_des;
+        sendreq->src_des = NULL;
    }
-
-    des->des_dst = frag->rdma_segs;
+
+    des->des_dst     = frag->rdma_segs;
    des->des_dst_cnt = frag->rdma_hdr.hdr_rdma.hdr_seg_cnt;
-    des->des_cbfunc = mca_pml_ob1_put_completion;
-    des->des_cbdata = frag;
+    des->des_cbfunc  = mca_pml_ob1_put_completion;
+    des->des_cbdata  = frag;

    PERUSE_TRACE_COMM_OMPI_EVENT( PERUSE_COMM_REQ_XFER_CONTINUE,
                                  
&(((mca_pml_ob1_send_request_t*)frag->rdma_req)->req_send.req_base), save_size, 
PERUSE_SEND );

    rc = mca_bml_base_put(bml_btl, des);
-    if( OPAL_UNLIKELY(OMPI_SUCCESS != rc) ) {
+    if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) {
        mca_bml_base_free(bml_btl, des);
        frag->rdma_length = save_size;
        if(OMPI_ERR_OUT_OF_RESOURCE == rc) {
@@ -1203,6 +1215,7 @@
            orte_errmgr.abort(-1, NULL);
        }
    }
+
    return OMPI_SUCCESS;
}

@@ -1261,21 +1274,25 @@
    frag->reg = NULL;
    frag->retries = 0;

+    if (OPAL_UNLIKELY(NULL != sendreq->src_des)) {
+        /* get fallback path */
+        sendreq->req_state = 0;
+    }
+
    /* lookup the corresponding registration */
    for(i=0; i<sendreq->req_rdma_cnt; i++) {
-       if(sendreq->req_rdma[i].bml_btl == frag->rdma_bml) {
-           frag->reg = sendreq->req_rdma[i].btl_reg;
-           break;
-       }
-    }
+        if(sendreq->req_rdma[i].bml_btl == frag->rdma_bml) {
+            frag->reg = sendreq->req_rdma[i].btl_reg;
+            break;
+        }
+    }

    /*  RDMA writes may proceed in parallel to send and to each other, so
     *  create clone of the convertor for each RDMA fragment
     */
    size = hdr->hdr_rdma_offset;
    
opal_convertor_clone_with_position(&sendreq->req_send.req_base.req_convertor,
-            &frag->convertor, 0, &size);
+                                       &frag->convertor, 0, &size);

    mca_pml_ob1_send_request_put_frag(frag);
}
-

Modified: trunk/ompi/mca/pml/ob1/pml_ob1_sendreq.h
==============================================================================
--- trunk/ompi/mca/pml/ob1/pml_ob1_sendreq.h    (original)
+++ trunk/ompi/mca/pml/ob1/pml_ob1_sendreq.h    2012-04-24 16:18:56 EDT (Tue, 
24 Apr 2012)
@@ -54,6 +54,7 @@
    mca_pml_ob1_send_pending_t req_pending;
    opal_mutex_t req_send_range_lock;
    opal_list_t req_send_ranges;
+    mca_btl_base_descriptor_t *src_des;
    mca_pml_ob1_com_btl_t req_rdma[1];
};
typedef struct mca_pml_ob1_send_request_t mca_pml_ob1_send_request_t;
@@ -129,6 +130,7 @@
            OMPI_FREE_LIST_WAIT(&mca_pml_base_send_requests, item, rc); \
            sendreq = (mca_pml_ob1_send_request_t*)item;                \
            sendreq->req_send.req_base.req_proc = proc;                 \
+            sendreq->src_des = NULL;                                    \
        }                                                               \
    }

_______________________________________________
svn-full mailing list
svn-f...@open-mpi.org
http://www.open-mpi.org/mailman/listinfo.cgi/svn-full


--
Jeff Squyres
jsquy...@cisco.com
For corporate legal information go to: 
http://www.cisco.com/web/about/doing_business/legal/cri/


_______________________________________________
devel mailing list
de...@open-mpi.org
http://www.open-mpi.org/mailman/listinfo.cgi/devel

Reply via email to