Hi!

While benchmarking how fast Boost.MPI sends a std::vector<double>, I noticed that the transfer becomes several times faster if you replace

std::vector<double> data(n);
comm.send(0,0,data);

by e.g.

std::vector<double> data(n);
comm.send(0,0, boost::mpi::skeleton(data));
comm.send(0,0, boost::mpi::get_content(data));

The code to benchmark, the measured data as well as a plot thereof are 
attached. Further parameters were:

MPI implementation: Open MPI 1.6.5
C++ compiler: gcc 4.8.2
Compiler flags: -O3 -std=c++11
mpirun parameter: --bind-to-core
CPU model: AMD Opteron(tm) Processor 6174

Why does the default way of sending a std::vector<double> perform so badly, 
or what am I doing wrong?

Best regards,
Simon Etter
#include <iostream>
#include <chrono>
#include <vector>
#include <numeric>
#include <cstring>

#define BOOST_MPI_HOMOGENEOUS

#include <boost/mpi.hpp>

// Element type of every buffer that is transferred in the benchmarks.
using scalar_type = double;
// Floating-point duration measured in seconds (the default period is std::ratio<1>).
using seconds = std::chrono::duration<double>;

// Serialization traits for std::vector<scalar_type>: never track objects and
// use the plain object_serializable implementation level.
// NOTE(review): presumably chosen to minimise serialization overhead — confirm.
BOOST_CLASS_TRACKING(std::vector<scalar_type>, boost::serialization::track_never)
BOOST_CLASS_IMPLEMENTATION(std::vector<scalar_type>, boost::serialization::object_serializable)

// Number of repetitions that are timed for each message size.
#define N_REP 100
// Largest message size, in elements; sizes are doubled from 2 up to this value.
#define MAX_SIZE (1<<20)

int main(int argc, char** argv) {
    boost::mpi::environment env(argc, argv);
    boost::mpi::communicator comm;
    
    if (comm.rank() == 0) {
        std::cout << "------------" << std::endl;
        std::cout << "Plain array:" << std::endl;
        std::cout << "------------" << std::endl;
        std::cout << "N\tTime [sec]\t# bytes per second" << std::endl;
    }
    for (unsigned long n = 2; n <= MAX_SIZE; n <<= 1) {
        double elapsed = 0;
        for (int i = 0; i < N_REP; i++) {
            if (comm.rank() == 1) {
                scalar_type* data = new scalar_type[n];
                
                // Touch the memory before benchmarking the send
                std::memset(data, 0, n*sizeof(scalar_type));
                
                comm.barrier();
                comm.send(0,0, data, n);
            }
            else if (comm.rank() == 0) {
                scalar_type* data = new scalar_type[n];
                
                // Touch the memory before benchmarking the send
                std::memset(data, 0, n*sizeof(scalar_type));
                
                comm.barrier();
                auto start = std::chrono::high_resolution_clock::now();
                comm.recv(1,0, data, n);
                auto stop = std::chrono::high_resolution_clock::now();
                elapsed += std::chrono::duration_cast<seconds>(stop - start).count();
            }
        }
        if (comm.rank() == 0) {
            std::cout << n << "\t" << elapsed/N_REP << "\t" << N_REP*n*sizeof(scalar_type)/elapsed << std::endl;
        }
    }
    
    if (comm.rank() == 0) {
        std::cout << std::endl;
        std::cout << "------------" << std::endl;
        std::cout << "std::vector:" << std::endl;
        std::cout << "------------" << std::endl;
        std::cout << "N\tTime [sec]\t# bytes per second" << std::endl;
    }
    for (unsigned long n = 2; n <= MAX_SIZE; n <<= 1) {
        double elapsed = 0;
        for (int i = 0; i < N_REP; i++) {
            if (comm.rank() == 1) {
                std::vector<scalar_type> data(n);
                
                comm.barrier();
                comm.send(0,0,data);
            }
            else if (comm.rank() == 0) {
                std::vector<scalar_type> data;
                
                comm.barrier();
                auto start = std::chrono::high_resolution_clock::now();
                comm.recv(1,0, data);
                auto stop = std::chrono::high_resolution_clock::now();
                elapsed += std::chrono::duration_cast<seconds>(stop - start).count();
            }
        }
        if (comm.rank() == 0) {
            std::cout << n << "\t" << elapsed/N_REP << "\t" << N_REP*n*sizeof(scalar_type)/elapsed << std::endl;
        }
    }
    
    if (comm.rank() == 0) {
        std::cout << std::endl;
        std::cout << "----------------------------------------------------" << std::endl;
        std::cout << "std::vector with split sending of skeleton and data:" << std::endl;
        std::cout << "----------------------------------------------------" << std::endl;
        std::cout << "N\tTime [sec]\t# bytes per second" << std::endl;
    }
    for (unsigned long n = 2; n <= MAX_SIZE; n <<= 1) {
        double elapsed = 0;
        for (int i = 0; i < N_REP; i++) {
            if (comm.rank() == 1) {
                std::vector<scalar_type> data(n);
                
                comm.barrier();
                comm.send(0,0, boost::mpi::skeleton(data));
                comm.send(0,0, boost::mpi::get_content(data));
            }
            else if (comm.rank() == 0) {
                std::vector<scalar_type> data;
                
                comm.barrier();
                auto start = std::chrono::high_resolution_clock::now();
                comm.recv(1,0, boost::mpi::skeleton(data));
                comm.recv(1,0, boost::mpi::get_content(data));
                auto stop = std::chrono::high_resolution_clock::now();
                elapsed += std::chrono::duration_cast<seconds>(stop - start).count();
            }
        }
        if (comm.rank() == 0) {
            std::cout << n << "\t" << elapsed/N_REP << "\t" << N_REP*n*sizeof(scalar_type)/elapsed << std::endl;
        }
    }
    
    if (comm.rank() == 0) {
        std::cout << std::endl;
        std::cout << "--------------------------------" << std::endl;
        std::cout << "std::vector sent as plain array:" << std::endl;
        std::cout << "--------------------------------" << std::endl;
        std::cout << "N\tTime [sec]\t# bytes per second" << std::endl;
    }
    for (unsigned long n = 2; n <= MAX_SIZE; n <<= 1) {
        double elapsed = 0;
        for (int i = 0; i < N_REP; i++) {
            if (comm.rank() == 1) {
                std::vector<scalar_type> data(n);
                
                comm.barrier();
                std::size_t s = data.size();
                comm.send(0,0, s);
                comm.send(0,0, data.data(), s);
            }
            else if (comm.rank() == 0) {
                std::vector<scalar_type> data;
                
                comm.barrier();
                auto start = std::chrono::high_resolution_clock::now();
                std::size_t s;
                comm.recv(1,0,s);
                data.resize(s);
                comm.recv(1,0, data.data(), data.size());
                auto stop = std::chrono::high_resolution_clock::now();
                elapsed += std::chrono::duration_cast<seconds>(stop - start).count();
            }
        }
        if (comm.rank() == 0) {
            std::cout << n << "\t" << elapsed/N_REP << "\t" << N_REP*n*sizeof(scalar_type)/elapsed << std::endl;
        }
    }
    
    return 0;
}

Attachment: vector_send_timings.pdf
Description: Adobe PDF document

------------
Plain array:
------------
N       Time [sec]      # bytes per second
2       1.14699e-06     1.39496e+07
4       1.16787e-06     2.74003e+07
8       1.12932e-06     5.66713e+07
16      1.11687e-06     1.14606e+08
32      1.31076e-06     1.95307e+08
64      1.87018e-06     2.7377e+08
128     2.55917e-06     4.0013e+08
256     4.11111e-06     4.98162e+08
512     6.54926e-06     6.25414e+08
1024    9.82945e-06     8.33414e+08
2048    1.68555e-05     9.72025e+08
4096    3.06214e-05     1.0701e+09
8192    5.4937e-05      1.19293e+09
16384   0.0001001       1.30942e+09
32768   0.000183717     1.42689e+09
65536   0.000347867     1.50715e+09
131072  0.000707096     1.48293e+09
262144  0.00148345      1.4137e+09
524288  0.0030194       1.38912e+09
1048576 0.00607652      1.3805e+09

------------
std::vector:
------------
N       Time [sec]      # bytes per second
2       3.67094e-06     4.35856e+06
4       2.78088e-06     1.15071e+07
8       2.85336e-06     2.24297e+07
16      3.09183e-06     4.13994e+07
32      3.50106e-06     7.31207e+07
64      4.43758e-06     1.15378e+08
128     6.57839e-06     1.55661e+08
256     1.23385e-05     1.65985e+08
512     1.66955e-05     2.45335e+08
1024    3.12738e-05     2.61945e+08
2048    6.07867e-05     2.69533e+08
4096    0.000121077     2.70637e+08
8192    0.000237852     2.75533e+08
16384   0.000495919     2.64301e+08
32768   0.000992363     2.64161e+08
65536   0.00201489      2.60207e+08
131072  0.0047326       2.21564e+08
262144  0.0115056       1.82272e+08
524288  0.0311629       1.34593e+08
1048576 0.060575        1.38483e+08

----------------------------------------------------
std::vector with split sending of skeleton and data:
----------------------------------------------------
N       Time [sec]      # bytes per second
2       5.76795e-06     2.77395e+06
4       4.99399e-06     6.4077e+06
8       5.11506e-06     1.25121e+07
16      5.35042e-06     2.39234e+07
32      5.50621e-06     4.6493e+07
64      5.74085e-06     8.91854e+07
128     6.13791e-06     1.66832e+08
256     7.47633e-06     2.73931e+08
512     9.82626e-06     4.16842e+08
1024    1.31645e-05     6.22279e+08
2048    2.03273e-05     8.06011e+08
4096    3.40979e-05     9.60998e+08
8192    6.15744e-05     1.06434e+09
16384   0.00011404      1.14935e+09
32768   0.000215421     1.21689e+09
65536   0.000410144     1.2783e+09
131072  0.000843321     1.24339e+09
262144  0.00186808      1.12262e+09
524288  0.00422265      9.93287e+08
1048576 0.00850844      9.85916e+08

--------------------------------
std::vector sent as plain array:
--------------------------------
N       Time [sec]      # bytes per second
2       1.6751e-06      9.55167e+06
4       1.75567e-06     1.82267e+07
8       1.84443e-06     3.46991e+07
16      1.91517e-06     6.68348e+07
32      1.80817e-06     1.4158e+08
64      2.08817e-06     2.45191e+08
128     2.82128e-06     3.62956e+08
256     4.40952e-06     4.6445e+08
512     6.68324e-06     6.12876e+08
1024    1.00152e-05     8.1796e+08
2048    1.71754e-05     9.53923e+08
4096    3.04354e-05     1.07664e+09
8192    5.78611e-05     1.13264e+09
16384   0.000110022     1.19133e+09
32768   0.000209929     1.24873e+09
65536   0.000401084     1.30718e+09
131072  0.000829423     1.26422e+09
262144  0.00184946      1.13393e+09
524288  0.00418481      1.00227e+09
1048576 0.00844186      9.93692e+08

_______________________________________________
Boost-mpi mailing list
[email protected]
http://lists.boost.org/mailman/listinfo.cgi/boost-mpi

Reply via email to