Hi!

When benchmarking the performance of sending a std::vector<double> with Boost.MPI, I noticed that you can gain several factors of speedup if you replace

    std::vector<double> data(n);
    comm.send(0, 0, data);

by, e.g.,

    std::vector<double> data(n);
    comm.send(0, 0, boost::mpi::skeleton(data));
    comm.send(0, 0, boost::mpi::get_content(data));

The benchmark code, the measured data and a plot thereof are attached. Further parameters were:

MPI implementation: Open MPI 1.6.5
C++ compiler:       gcc 4.8.2
Compiler flags:     -O3 -std=c++11
mpirun parameter:   --bind-to-core
CPU model:          AMD Opteron(tm) Processor 6174

Why is it that the default sending of a std::vector<double> performs so badly, or what am I doing wrong?

Best regards,
Simon Etter
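As noted above, the skeleton/content mechanism is primarily intended for transfers where the same structure is reused, so that the skeleton (size and layout) is sent once and afterwards only the raw data crosses the wire. A minimal sketch of that reuse pattern (the vector name "values" and the repeat count are chosen purely for illustration):

    #include <vector>
    #include <boost/mpi.hpp>

    int main(int argc, char** argv) {
        boost::mpi::environment env(argc, argv);
        boost::mpi::communicator comm;
        std::vector<double> values;
        if (comm.rank() == 1) {
            values.assign(1 << 20, 0.0);
            // Send the structure (here: just the size) once ...
            comm.send(0, 0, boost::mpi::skeleton(values));
            boost::mpi::content c = boost::mpi::get_content(values);
            // ... then send only the raw doubles, as often as needed.
            for (int step = 0; step < 10; ++step)
                comm.send(0, 0, c);
        }
        else if (comm.rank() == 0) {
            // Receiving the skeleton resizes 'values' appropriately.
            comm.recv(1, 0, boost::mpi::skeleton(values));
            boost::mpi::content c = boost::mpi::get_content(values);
            for (int step = 0; step < 10; ++step)
                comm.recv(1, 0, c);
        }
        return 0;
    }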
#include <iostream>
#include <chrono>
#include <vector>
#include <numeric>
#include <cstring>
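// Declare the environment homogeneous so Boost.MPI can use its faster,
// non-portable binary serialization instead of the portable archive format.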
#define BOOST_MPI_HOMOGENEOUS
#include <boost/mpi.hpp>
typedef double scalar_type;
typedef std::chrono::duration<double, std::ratio<1>> seconds;
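// Disable object tracking and class/version metadata for std::vector<scalar_type>
// so that its serialization carries as little overhead as possible.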
BOOST_CLASS_TRACKING(std::vector<scalar_type>, boost::serialization::track_never)
BOOST_CLASS_IMPLEMENTATION(std::vector<scalar_type>, boost::serialization::object_serializable)
#define N_REP 100
#define MAX_SIZE (1<<20)
int main(int argc, char** argv) {
    boost::mpi::environment env(argc, argv);
    boost::mpi::communicator comm;
    if (comm.rank() == 0) {
        std::cout << "------------" << std::endl;
        std::cout << "Plain array:" << std::endl;
        std::cout << "------------" << std::endl;
        std::cout << "N\tTime [sec]\t# bytes per second" << std::endl;
    }
    for (unsigned long n = 2; n <= MAX_SIZE; n <<= 1) {
        double elapsed = 0;
        for (int i = 0; i < N_REP; i++) {
            if (comm.rank() == 1) {
                scalar_type* data = new scalar_type[n];
                // Touch the memory before benchmarking the send
                std::memset(data, 0, n*sizeof(scalar_type));
                comm.barrier();
                comm.send(0,0, data, n);
                delete[] data;
            }
            else if (comm.rank() == 0) {
                scalar_type* data = new scalar_type[n];
                // Touch the memory before benchmarking the send
                std::memset(data, 0, n*sizeof(scalar_type));
                comm.barrier();
                auto start = std::chrono::high_resolution_clock::now();
                comm.recv(1,0, data, n);
                auto stop = std::chrono::high_resolution_clock::now();
                elapsed += std::chrono::duration_cast<seconds>(stop - start).count();
                delete[] data;
            }
        }
        if (comm.rank() == 0) {
            std::cout << n << "\t" << elapsed/N_REP << "\t" << N_REP*n*sizeof(scalar_type)/elapsed << std::endl;
        }
    }
    if (comm.rank() == 0) {
        std::cout << std::endl;
        std::cout << "------------" << std::endl;
        std::cout << "std::vector:" << std::endl;
        std::cout << "------------" << std::endl;
        std::cout << "N\tTime [sec]\t# bytes per second" << std::endl;
    }
    for (unsigned long n = 2; n <= MAX_SIZE; n <<= 1) {
        double elapsed = 0;
        for (int i = 0; i < N_REP; i++) {
            if (comm.rank() == 1) {
                std::vector<scalar_type> data(n);
                comm.barrier();
                comm.send(0,0,data);
            }
            else if (comm.rank() == 0) {
                std::vector<scalar_type> data;
                comm.barrier();
                auto start = std::chrono::high_resolution_clock::now();
                comm.recv(1,0, data);
                auto stop = std::chrono::high_resolution_clock::now();
                elapsed += std::chrono::duration_cast<seconds>(stop - start).count();
            }
        }
        if (comm.rank() == 0) {
            std::cout << n << "\t" << elapsed/N_REP << "\t" << N_REP*n*sizeof(scalar_type)/elapsed << std::endl;
        }
    }
    if (comm.rank() == 0) {
        std::cout << std::endl;
        std::cout << "----------------------------------------------------" << std::endl;
        std::cout << "std::vector with split sending of skeleton and data:" << std::endl;
        std::cout << "----------------------------------------------------" << std::endl;
        std::cout << "N\tTime [sec]\t# bytes per second" << std::endl;
    }
    for (unsigned long n = 2; n <= MAX_SIZE; n <<= 1) {
        double elapsed = 0;
        for (int i = 0; i < N_REP; i++) {
            if (comm.rank() == 1) {
                std::vector<scalar_type> data(n);
                comm.barrier();
                comm.send(0,0, boost::mpi::skeleton(data));
                comm.send(0,0, boost::mpi::get_content(data));
            }
            else if (comm.rank() == 0) {
                std::vector<scalar_type> data;
                comm.barrier();
                auto start = std::chrono::high_resolution_clock::now();
                comm.recv(1,0, boost::mpi::skeleton(data));
                comm.recv(1,0, boost::mpi::get_content(data));
                auto stop = std::chrono::high_resolution_clock::now();
                elapsed += std::chrono::duration_cast<seconds>(stop - start).count();
            }
        }
        if (comm.rank() == 0) {
            std::cout << n << "\t" << elapsed/N_REP << "\t" << N_REP*n*sizeof(scalar_type)/elapsed << std::endl;
        }
    }
    if (comm.rank() == 0) {
        std::cout << std::endl;
        std::cout << "--------------------------------" << std::endl;
        std::cout << "std::vector sent as plain array:" << std::endl;
        std::cout << "--------------------------------" << std::endl;
        std::cout << "N\tTime [sec]\t# bytes per second" << std::endl;
    }
    for (unsigned long n = 2; n <= MAX_SIZE; n <<= 1) {
        double elapsed = 0;
        for (int i = 0; i < N_REP; i++) {
            if (comm.rank() == 1) {
                std::vector<scalar_type> data(n);
                comm.barrier();
                std::size_t s = data.size();
                comm.send(0,0, s);
                comm.send(0,0, data.data(), s);
            }
            else if (comm.rank() == 0) {
                std::vector<scalar_type> data;
                comm.barrier();
                auto start = std::chrono::high_resolution_clock::now();
                std::size_t s;
                comm.recv(1,0, s);
                data.resize(s);
                comm.recv(1,0, data.data(), data.size());
                auto stop = std::chrono::high_resolution_clock::now();
                elapsed += std::chrono::duration_cast<seconds>(stop - start).count();
            }
        }
        if (comm.rank() == 0) {
            std::cout << n << "\t" << elapsed/N_REP << "\t" << N_REP*n*sizeof(scalar_type)/elapsed << std::endl;
        }
    }
    return 0;
}
[Attachment: vector_send_timings.pdf (Adobe PDF document): plot of the measured timings]
------------
Plain array:
------------
N        Time [sec]     # bytes per second
2        1.14699e-06    1.39496e+07
4        1.16787e-06    2.74003e+07
8        1.12932e-06    5.66713e+07
16       1.11687e-06    1.14606e+08
32       1.31076e-06    1.95307e+08
64       1.87018e-06    2.7377e+08
128      2.55917e-06    4.0013e+08
256      4.11111e-06    4.98162e+08
512      6.54926e-06    6.25414e+08
1024     9.82945e-06    8.33414e+08
2048     1.68555e-05    9.72025e+08
4096     3.06214e-05    1.0701e+09
8192     5.4937e-05     1.19293e+09
16384    0.0001001      1.30942e+09
32768    0.000183717    1.42689e+09
65536    0.000347867    1.50715e+09
131072   0.000707096    1.48293e+09
262144   0.00148345     1.4137e+09
524288   0.0030194      1.38912e+09
1048576  0.00607652     1.3805e+09

------------
std::vector:
------------
N        Time [sec]     # bytes per second
2        3.67094e-06    4.35856e+06
4        2.78088e-06    1.15071e+07
8        2.85336e-06    2.24297e+07
16       3.09183e-06    4.13994e+07
32       3.50106e-06    7.31207e+07
64       4.43758e-06    1.15378e+08
128      6.57839e-06    1.55661e+08
256      1.23385e-05    1.65985e+08
512      1.66955e-05    2.45335e+08
1024     3.12738e-05    2.61945e+08
2048     6.07867e-05    2.69533e+08
4096     0.000121077    2.70637e+08
8192     0.000237852    2.75533e+08
16384    0.000495919    2.64301e+08
32768    0.000992363    2.64161e+08
65536    0.00201489     2.60207e+08
131072   0.0047326      2.21564e+08
262144   0.0115056      1.82272e+08
524288   0.0311629      1.34593e+08
1048576  0.060575       1.38483e+08

----------------------------------------------------
std::vector with split sending of skeleton and data:
----------------------------------------------------
N        Time [sec]     # bytes per second
2        5.76795e-06    2.77395e+06
4        4.99399e-06    6.4077e+06
8        5.11506e-06    1.25121e+07
16       5.35042e-06    2.39234e+07
32       5.50621e-06    4.6493e+07
64       5.74085e-06    8.91854e+07
128      6.13791e-06    1.66832e+08
256      7.47633e-06    2.73931e+08
512      9.82626e-06    4.16842e+08
1024     1.31645e-05    6.22279e+08
2048     2.03273e-05    8.06011e+08
4096     3.40979e-05    9.60998e+08
8192     6.15744e-05    1.06434e+09
16384    0.00011404     1.14935e+09
32768    0.000215421    1.21689e+09
65536    0.000410144    1.2783e+09
131072   0.000843321    1.24339e+09
262144   0.00186808     1.12262e+09
524288   0.00422265     9.93287e+08
1048576  0.00850844     9.85916e+08

--------------------------------
std::vector sent as plain array:
--------------------------------
N        Time [sec]     # bytes per second
2        1.6751e-06     9.55167e+06
4        1.75567e-06    1.82267e+07
8        1.84443e-06    3.46991e+07
16       1.91517e-06    6.68348e+07
32       1.80817e-06    1.4158e+08
64       2.08817e-06    2.45191e+08
128      2.82128e-06    3.62956e+08
256      4.40952e-06    4.6445e+08
512      6.68324e-06    6.12876e+08
1024     1.00152e-05    8.1796e+08
2048     1.71754e-05    9.53923e+08
4096     3.04354e-05    1.07664e+09
8192     5.78611e-05    1.13264e+09
16384    0.000110022    1.19133e+09
32768    0.000209929    1.24873e+09
65536    0.000401084    1.30718e+09
131072   0.000829423    1.26422e+09
262144   0.00184946     1.13393e+09
524288   0.00418481     1.00227e+09
1048576  0.00844186     9.93692e+08
