Hello,

I'm trying to find a set of MCA parameters that will (among other things)
force all but the largest messages to be transmitted eagerly during an
MPI_Isend call rather than during the accompanying MPI_Wait.  I thought
increasing btl_tcp_eager_limit and the related buffer sizes would accomplish
this, but that didn't work on this system.

What's the correct way of doing this?  

Thanks much, 

Barry Rountree
University of Georgia



Open MPI 2.8, Linux + gcc, self+tcp BTLs.
The relevant bit of the hostfile looks like:
        opt00 slots=1 max_slots=1
        opt01 slots=1 max_slots=1
The program is being launched from a remote node.

This is the default case:

mpirun -np 2 -hostfile /osr/users/rountree/hostfile \
-mca btl self,tcp \
./harness -v -h --test_ping

I'm assuming it's using these default values, as reported by
ompi_info --param all all:

btl_tcp_sndbuf=131072
btl_tcp_rcvbuf=131072
btl_tcp_endpoint_cache=30720
btl_tcp_exclusivity=0
btl_tcp_eager_limit=65536
btl_tcp_min_send_size=65536
btl_tcp_max_send_size=131072
btl_tcp_min_rdma_size=131072
btl_tcp_max_rdma_size=2147483647

btl_self_eager_limit=131072
btl_self_min_send_size=262144
btl_self_max_send_size=262144
btl_self_min_rdma_size=2147483647
btl_self_max_rdma_size=2147483647
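
If it's easier to check, I believe the same parameters can also be listed
per component with the narrower queries

        ompi_info --param btl tcp
        ompi_info --param btl self

instead of grepping through the full --param all all output.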


This gives me a tracefile that looks like:

 Rank            File Line             Function      Comp      Comm   MsgSz
...
    0       harness.c   97            MPI_Irecv  0.103897  0.000003   16384
    0       harness.c  100          MPI_Waitall  0.000006  0.000088      -1
    0       harness.c  105          MPI_Barrier  0.000002  0.000021      -1
    0       harness.c   97            MPI_Irecv  0.103877  0.000003   32768
    0       harness.c  100          MPI_Waitall  0.000005  0.000155      -1
    0       harness.c  105          MPI_Barrier  0.000002  0.000019      -1
    0       harness.c   97            MPI_Irecv  0.103807  0.000003   65536
    0       harness.c  100          MPI_Waitall  0.000005  0.001516      -1
    0       harness.c  105          MPI_Barrier  0.000002  0.000020      -1
    0       harness.c   97            MPI_Irecv  0.102436  0.000003  131072
    0       harness.c  100          MPI_Waitall  0.000007  0.001975      -1
    0       harness.c  105          MPI_Barrier  0.000002  0.000020      -1
...
    1       harness.c  102            MPI_Isend  0.000002  0.000143   16384
    1       harness.c  103             MPI_Wait  0.000001  0.000002      -1
    1       harness.c  105          MPI_Barrier  0.000001  0.103813      -1
    1       harness.c  102            MPI_Isend  0.000002  0.000423   32768
    1       harness.c  103             MPI_Wait  0.000002  0.000002      -1
    1       harness.c  105          MPI_Barrier  0.000002  0.103627      -1
    1       harness.c  102            MPI_Isend  0.000001  0.000027   65536
    1       harness.c  103             MPI_Wait  0.000002  0.104615      -1
    1       harness.c  105          MPI_Barrier  0.000002  0.000709      -1
    1       harness.c  102            MPI_Isend  0.000002  0.000017  131072
    1       harness.c  103             MPI_Wait  0.000002  0.103822      -1
    1       harness.c  105          MPI_Barrier  0.000002  0.000602      -1
...

So there's the expected transition between the 32 KiB and 64 KiB messages:
the MPI_Isend time drops from 0.000423 seconds to 0.000027 seconds, and the
MPI_Wait time on the sender's side jumps from 0.000002 seconds to 0.104615
seconds, presumably because a 64 KiB payload (plus header) no longer fits
under btl_tcp_eager_limit=65536 and the send falls back to rendezvous.
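
In case it helps anyone reproduce this without my harness, the same pattern
shows up with the standalone toy below.  This is just a sketch, not the
actual harness code: the sender times MPI_Isend and MPI_Wait separately,
and the receiver deliberately posts its receive about 0.1 seconds late, so
an eager send finishes inside MPI_Isend while a rendezvous send stalls in
MPI_Wait.

#include <mpi.h>
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>

int main(int argc, char **argv){
        int rank;
        int size = 64*1024;     /* message size in bytes */
        char *buf;
        double t0, t1, t2;
        MPI_Request req;

        MPI_Init(&argc, &argv);
        MPI_Comm_rank(MPI_COMM_WORLD, &rank);
        buf = malloc(size);

        if(rank==1){
                /* Sender: time the Isend and the Wait separately. */
                t0 = MPI_Wtime();
                MPI_Isend(buf, size, MPI_CHAR, 0, 0, MPI_COMM_WORLD, &req);
                t1 = MPI_Wtime();
                MPI_Wait(&req, MPI_STATUS_IGNORE);
                t2 = MPI_Wtime();
                printf("Isend %f s   Wait %f s\n", t1-t0, t2-t1);
        }else if(rank==0){
                /* Receiver: post the receive late, so an eager send can
                 * complete before the receive exists but a rendezvous
                 * send has to stall until it does. */
                usleep(100000);
                MPI_Recv(buf, size, MPI_CHAR, 1, 0, MPI_COMM_WORLD,
                        MPI_STATUS_IGNORE);
        }

        free(buf);
        MPI_Finalize();
        return 0;
}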

OK, so let's increase the eager limit and the related buffer sizes and see
if we can make that transition happen at a larger message size.  I tried:

mpirun -np 2 -hostfile /osr/users/rountree/hostfile \
-mca btl self,tcp \
-mca btl btl_tcp_sndbuf=2097152 \
-mca btl btl_tcp_rcvbuf=2097152 \
-mca btl btl_tcp_endpoint_cache=491520 \
-mca btl btl_tcp_eager_limit=1048576 \
-mca btl btl_tcp_min_send_size=1048576 \
-mca btl btl_tcp_max_send_size=2097152 \
-mca btl btl_tcp_min_rdma_size=2097152 \
-mca btl btl_tcp_max_rdma_size=2147483647 \
-mca btl btl_self_eager_limit=2097152 \
-mca btl btl_self_min_send_size=2097152 \
-mca btl btl_self_max_send_size=2097152 \
-mca btl btl_self_min_rdma_size=2147483647 \
-mca btl btl_self_max_rdma_size=2147483647 \
./harness -v -h --test_ping


and.... nothing changed.


 Rank            File Line             Function      Comp      Comm   MsgSz
...
    0       harness.c   97            MPI_Irecv  0.103861  0.000003   32768
    0       harness.c  100          MPI_Waitall  0.000005  0.000161      -1
    0       harness.c  105          MPI_Barrier  0.000002  0.000020      -1
    0       harness.c   97            MPI_Irecv  0.103800  0.000003   65536
    0       harness.c  100          MPI_Waitall  0.000005  0.001483      -1
    0       harness.c  105          MPI_Barrier  0.000001  0.000020      -1
...
    1       harness.c  102            MPI_Isend  0.000002  0.000426   32768
    1       harness.c  103             MPI_Wait  0.000001  0.000002      -1
    1       harness.c  105          MPI_Barrier  0.000002  0.103619      -1
    1       harness.c  102            MPI_Isend  0.000001  0.000027   65536
    1       harness.c  103             MPI_Wait  0.000001  0.104673      -1
    1       harness.c  105          MPI_Barrier  0.000001  0.000602      -1



The code looks like:

static int
test_ping(){
#define PP_BUFSZ (1024*1024*2)
        int s, r, reps=1, i;
        char *buf = malloc(PP_BUFSZ);
        MPI_Request *req = malloc( sizeof(MPI_Request) * g_size );
        MPI_Status  *sta = malloc( sizeof(MPI_Status)  * g_size );
        assert(buf);
        assert(req);
        assert(sta);
        for(r=0; r<reps; r++){
                /* Double the message size each iteration: 1 byte up to 1 MiB. */
                for(s=1; s<PP_BUFSZ; s=s*2){
                        if(g_rank==0){
                                /* Receiver: delay posting the receives, then
                                 * post one per sender and wait for them all. */
                                usleep(100000);
                                for(i=1; i<g_size; i++){
                                        MPI_Irecv( buf, s, MPI_CHAR, i,
                                                0xFF+r, MPI_COMM_WORLD,
                                                &req[i] );
                                }
                                MPI_Waitall( g_size-1, &req[1], &sta[1] );
                        }else{
                                /* Senders: the time spent here vs. in MPI_Wait
                                 * is what the trace above records. */
                                MPI_Isend( buf, s, MPI_CHAR, 0, 0xFF+r,
                                        MPI_COMM_WORLD, &req[g_rank] );
                                MPI_Wait( &req[g_rank], &sta[g_rank] );
                        }
                        MPI_Barrier(MPI_COMM_WORLD);
                }
        }
        return 0;
#undef PP_BUFSZ
}
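
For completeness, g_rank and g_size are just the rank and size of
MPI_COMM_WORLD cached in globals; the setup in main() isn't shown above,
but it amounts to something like:

static int g_rank;      /* rank of this process in MPI_COMM_WORLD */
static int g_size;      /* number of processes in MPI_COMM_WORLD  */

        /* ...early in main(), right after MPI_Init()... */
        MPI_Comm_rank(MPI_COMM_WORLD, &g_rank);
        MPI_Comm_size(MPI_COMM_WORLD, &g_size);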

The same behavior is observed when I put the usleep on the send side, or if I 
remove it altogether.

Just doubling the default values (instead of going up by 16x) also didn't 
work.
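
That attempt was the same command line as above with each value doubled
from its default, e.g.

        -mca btl btl_tcp_eager_limit=131072
        -mca btl btl_tcp_sndbuf=262144
        -mca btl btl_tcp_rcvbuf=262144

and so on for the rest of the parameters.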

Barry
