What: Push some ob1 optimizations to the trunk and 1.7.5.

What: This patch contains two optimizations:

  - Introduce a fast send path for blocking send calls. This path uses
    the btl sendi function to put the data on the wire without the need
    for setting up a send request. In the case of btl/vader this can
    also avoid allocating/initializing a new fragment. With btl/vader
    this optimization improves small message latency by 50-200ns in
    ping-pong type benchmarks. Larger messages may take a small hit in
    the range of 10-20ns.

  - Use a stack-allocated receive request for blocking receives. This
    optimization saves the extra instructions associated with accessing
    the receive request free list. I was able to get another 50-200ns
    improvement in the small-message ping-pong with this optimization. I
    see no hit for larger messages.

When: These changes touch the critical path in ob1 and are targeted for
1.7.5. As such I will set a moderately long timeout. Timeout set for
next Friday (Jan 17).

Some results from osu_latency on Haswell:

[hjelmn@cn143 pt2pt]$ mpirun -n 2 --bind-to core -mca btl vader,self ./osu_latency
# OSU MPI Latency Test v4.0.1
# Size          Latency (us)
0                       0.11
1                       0.14
2                       0.14
4                       0.14
8                       0.14
16                      0.14
32                      0.15
64                      0.18
128                     0.36
256                     0.37
512                     0.46
1024                    0.56
2048                    0.80
4096                    1.12
8192                    1.68
16384                   2.98
32768                   5.10
65536                   8.12
131072                 14.07
262144                 25.30
524288                 47.40
1048576                91.71
2097152               195.56
4194304               487.05


Patch Attached.

-Nathan
diff --git a/ompi/mca/pml/ob1/pml_ob1_irecv.c b/ompi/mca/pml/ob1/pml_ob1_irecv.c
index c85d8f6..ee4812d 100644
--- a/ompi/mca/pml/ob1/pml_ob1_irecv.c
+++ b/ompi/mca/pml/ob1/pml_ob1_irecv.c
@@ -1,3 +1,4 @@
+/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
 /*
  * Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
  *                         University Research and Technology
@@ -9,7 +10,7 @@
  *                         University of Stuttgart.  All rights reserved.
  * Copyright (c) 2004-2005 The Regents of the University of California.
  *                         All rights reserved.
- * Copyright (c) 2007      Los Alamos National Security, LLC.  All rights
+ * Copyright (c) 2007-2013 Los Alamos National Security, LLC.  All rights
  *                         reserved. 
  * Copyright (c) 2010-2012 Oracle and/or its affiliates.  All rights reserved.
  * Copyright (c) 2011      Sandia National Laboratories. All rights reserved.
@@ -87,28 +88,29 @@ int mca_pml_ob1_recv(void *addr,
                      struct ompi_communicator_t *comm,
                      ompi_status_public_t * status)
 {
+    mca_pml_ob1_recv_request_t recvreq;
     int rc;
-    mca_pml_ob1_recv_request_t *recvreq;
-    MCA_PML_OB1_RECV_REQUEST_ALLOC(recvreq);
-    if (NULL == recvreq)
-        return OMPI_ERR_TEMP_OUT_OF_RESOURCE;
 
-    MCA_PML_OB1_RECV_REQUEST_INIT(recvreq,
-                                   addr,
-                                   count, datatype, src, tag, comm, false);
+    OBJ_CONSTRUCT(&recvreq, mca_pml_ob1_recv_request_t);
+
+    MCA_PML_OB1_RECV_REQUEST_INIT(&recvreq, addr, count, datatype,
+                                  src, tag, comm, false);
 
     PERUSE_TRACE_COMM_EVENT (PERUSE_COMM_REQ_ACTIVATE,
-                             &((recvreq)->req_recv.req_base),
+                             &(recvreq.req_recv.req_base),
                              PERUSE_RECV);
 
-    MCA_PML_OB1_RECV_REQUEST_START(recvreq);
-    ompi_request_wait_completion(&recvreq->req_recv.req_base.req_ompi);
+    MCA_PML_OB1_RECV_REQUEST_START(&recvreq);
+    ompi_request_wait_completion(&recvreq.req_recv.req_base.req_ompi);
 
     if (NULL != status) {  /* return status */
-        *status = recvreq->req_recv.req_base.req_ompi.req_status;
+        *status = recvreq.req_recv.req_base.req_ompi.req_status;
     }
-    rc = recvreq->req_recv.req_base.req_ompi.req_status.MPI_ERROR;
-    ompi_request_free( (ompi_request_t**)&recvreq );
+
+    rc = recvreq.req_recv.req_base.req_ompi.req_status.MPI_ERROR;
+
+    OBJ_DESTRUCT(&recvreq);
+
     return rc;
 }
 
diff --git a/ompi/mca/pml/ob1/pml_ob1_isend.c b/ompi/mca/pml/ob1/pml_ob1_isend.c
index c689981..bbff946 100644
--- a/ompi/mca/pml/ob1/pml_ob1_isend.c
+++ b/ompi/mca/pml/ob1/pml_ob1_isend.c
@@ -1,3 +1,4 @@
+/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
 /*
  * Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
  *                         University Research and Technology
@@ -9,7 +10,7 @@
  *                         University of Stuttgart.  All rights reserved.
  * Copyright (c) 2004-2005 The Regents of the University of California.
  *                         All rights reserved.
- * Copyright (c) 2007      Los Alamos National Security, LLC.  All rights
+ * Copyright (c) 2007-2013 Los Alamos National Security, LLC.  All rights
  *                         reserved. 
  * $COPYRIGHT$
  * 
@@ -54,6 +55,66 @@ int mca_pml_ob1_isend_init(void *buf,
     return OMPI_SUCCESS;
 }
 
+/* try to get a small message out on to the wire quickly */
+static inline int mca_pml_ob1_send_inline (void *buf, size_t count,
+                                           ompi_datatype_t * datatype,
+                                           int dst, int tag,
+                                           ompi_communicator_t * comm)
+{
+    ompi_proc_t *dst_proc = ompi_comm_peer_lookup (comm, dst);
+    mca_bml_base_endpoint_t* endpoint = (mca_bml_base_endpoint_t*)
+                                        
dst_proc->proc_endpoints[OMPI_PROC_ENDPOINT_TAG_BML];
+    mca_pml_ob1_comm_t* ob1_comm = comm->c_pml_comm;
+    mca_pml_ob1_match_hdr_t match;
+    mca_bml_base_btl_t* bml_btl;
+    OPAL_PTRDIFF_TYPE lb, extent;
+    opal_convertor_t convertor;
+    size_t size = 0;
+    int rc;
+
+    bml_btl = mca_bml_base_btl_array_get_next(&endpoint->btl_eager);
+
+    ompi_datatype_get_extent (datatype, &lb, &extent);
+
+    if (OPAL_UNLIKELY((extent * count) > 256 || !bml_btl->btl->btl_sendi)) {
+        return OMPI_ERR_NOT_AVAILABLE;
+    }
+
+    if (count > 0) {
+        /* initialize just enough of the convertor to avoid a SEGV in 
opal_convertor_cleanup */
+        convertor.stack_size = 0;
+
+        /* We will create a convertor specialized for the        */
+        /* remote architecture and prepared with the datatype.   */
+        opal_convertor_copy_and_prepare_for_send (dst_proc->proc_convertor,
+                                                  (const struct 
opal_datatype_t *) datatype,
+                                                 count, buf, 0, &convertor);
+
+        /* find out the packed size of the data */
+        opal_convertor_get_packed_size (&convertor, &size);
+    }
+
+    match.hdr_common.hdr_type = MCA_PML_OB1_HDR_TYPE_MATCH;
+    match.hdr_ctx = comm->c_contextid;
+    match.hdr_src = comm->c_my_rank;
+    match.hdr_tag = tag;
+    match.hdr_seq = (uint16_t) 
OPAL_THREAD_ADD32(&ob1_comm->procs[dst].send_sequence, 1);
+
+    /* try to send immediately */
+    rc = mca_bml_base_sendi (bml_btl, &convertor, &match, 
OMPI_PML_OB1_MATCH_HDR_LEN,
+                             size, MCA_BTL_NO_ORDER, 
MCA_BTL_DES_FLAGS_PRIORITY | MCA_BTL_DES_FLAGS_BTL_OWNERSHIP,
+                             MCA_PML_OB1_HDR_TYPE_MATCH, NULL);
+    if (count > 0) {
+        opal_convertor_cleanup (&convertor);
+    }
+
+    if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) {
+        OPAL_THREAD_ADD32(&ob1_comm->procs[dst].send_sequence, -1);
+       return rc;
+    }
+
+    return (int) size;
+}
 
 int mca_pml_ob1_isend(void *buf,
                       size_t count,
@@ -66,11 +127,11 @@ int mca_pml_ob1_isend(void *buf,
 {
     int rc;
     mca_pml_ob1_send_request_t *sendreq = NULL;
-    
+
     MCA_PML_OB1_SEND_REQUEST_ALLOC(comm, dst, sendreq);
     if (NULL == sendreq)
         return OMPI_ERR_OUT_OF_RESOURCE;
-    
+
     MCA_PML_OB1_SEND_REQUEST_INIT(sendreq,
                                   buf,
                                   count,
@@ -87,7 +148,6 @@ int mca_pml_ob1_isend(void *buf,
     return rc;
 }
 
-
 int mca_pml_ob1_send(void *buf,
                      size_t count,
                      ompi_datatype_t * datatype,
@@ -99,6 +159,13 @@ int mca_pml_ob1_send(void *buf,
     int rc;
     mca_pml_ob1_send_request_t *sendreq = NULL;
 
+    if (MCA_PML_BASE_SEND_SYNCHRONOUS != sendmode) {
+        rc = mca_pml_ob1_send_inline (buf, count, datatype, dst, tag, comm);
+        if (OPAL_LIKELY(0 <= rc)) {
+            return OMPI_SUCCESS;
+        }
+    }
+
     MCA_PML_OB1_SEND_REQUEST_ALLOC(comm, dst, sendreq);
     if (NULL == sendreq)
         return OMPI_ERR_OUT_OF_RESOURCE;
diff --git a/ompi/mca/pml/ob1/pml_ob1_recvreq.c 
b/ompi/mca/pml/ob1/pml_ob1_recvreq.c
index de6fc31..834cff7 100644
--- a/ompi/mca/pml/ob1/pml_ob1_recvreq.c
+++ b/ompi/mca/pml/ob1/pml_ob1_recvreq.c
@@ -145,7 +145,7 @@ static int mca_pml_ob1_recv_request_cancel(struct 
ompi_request_t* ompi_request,
 
 static void mca_pml_ob1_recv_request_construct(mca_pml_ob1_recv_request_t* 
request)
 {
-    request->req_recv.req_base.req_type = MCA_PML_REQUEST_RECV;
+    /* the request type is set by the superclass */
     request->req_recv.req_base.req_ompi.req_free = 
mca_pml_ob1_recv_request_free;
     request->req_recv.req_base.req_ompi.req_cancel = 
mca_pml_ob1_recv_request_cancel;
     request->req_rdma_cnt = 0;
diff --git a/ompi/mca/pml/ob1/pml_ob1_sendreq.h 
b/ompi/mca/pml/ob1/pml_ob1_sendreq.h
index 54cdf67..a870c71 100644
--- a/ompi/mca/pml/ob1/pml_ob1_sendreq.h
+++ b/ompi/mca/pml/ob1/pml_ob1_sendreq.h
@@ -143,7 +143,7 @@ get_request_from_send_pending(mca_pml_ob1_send_pending_t 
*type)
                                        sendmode,                        \
                                        persistent)                      \
     {                                                                   \
-        MCA_PML_BASE_SEND_REQUEST_INIT(&sendreq->req_send,              \
+        MCA_PML_BASE_SEND_REQUEST_INIT(&(sendreq)->req_send,            \
                                        buf,                             \
                                        count,                           \
                                        datatype,                        \
@@ -157,9 +157,9 @@ get_request_from_send_pending(mca_pml_ob1_send_pending_t 
*type)
     }
 
 #define MCA_PML_OB1_SEND_REQUEST_RESET(sendreq)                         \
-    if (sendreq->req_send.req_bytes_packed > 0) {                       \
+    if ((sendreq)->req_send.req_bytes_packed > 0) {                     \
         size_t _position = 0;                                           \
-        opal_convertor_set_position(&sendreq->req_send.req_base.req_convertor, 
\
+        
opal_convertor_set_position(&(sendreq)->req_send.req_base.req_convertor, \
                                     &_position);                        \
         assert( 0 == _position );                                       \
     }

Attachment: pgpFFwguLVGr5.pgp
Description: PGP signature

Reply via email to