Hi,

I have a problem receiving a vector of an MPI datatype constructed via MPI_Type_create_struct.

It looks like MPI_Send or MPI_Recv does not work as expected: some parts of a nested struct in the received buffer are not filled at all!

I tested the code under mpich 3.0.3 and it worked perfectly!

So I simplified everything (but still have ~400 lines of code) and put it all in a self-contained example attached to this mail.
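
To reproduce (assuming the attachment is saved as sbugnt.cpp; sbugnt is the executable name used in the runs below):

  mpicxx sbugnt.cpp -o sbugnt
  mpirun -np 2 ./sbugnt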

Briefly, we construct different MPI datatypes and nest them into a final type which is:
{MPI_LONG, {{MPI_DOUBLE, MPI_LONG, MPI_CHAR} * 2}}
(which represents a std::pair<long int, 2DVerticeInfo>)
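
In case it helps, a small sanity check like the following could be added right after the types are committed (this is only a sketch using the class names from the attached example; it is not part of the reproducer). It compares the extent and size of the committed pair type with sizeof(PAPairLongV2D):

  // Sketch: compare the committed datatype layout with the C++ struct.
  MPI_Aint lLB = 0, lExtent = 0;
  int lSize = 0;
  MPI_Type_get_extent(PAPairLongV2D::reqMPIDatatype(), &lLB, &lExtent);
  MPI_Type_size(PAPairLongV2D::reqMPIDatatype(), &lSize);
  std::cout << "extent = " << lExtent << ", size = " << lSize
            << ", sizeof(PAPairLongV2D) = " << sizeof(PAPairLongV2D) << std::endl;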

The output for different openmpi versions is surprising:

With openmpi 1.6.3 and 1.6.4:

  Rank 0 send this:
  i: 0 => {{0},{{0.5,3,%},{0.25,7,5}}}
  i: 1 => {{1},{{0.5,3,%},{0.25,7,5}}}
  i: 2 => {{2},{{0.5,3,%},{0.25,7,5}}}
  i: 3 => {{3},{{0.5,3,%},{0.25,7,5}}}
  i: 4 => {{4},{{0.5,3,%},{0.25,7,5}}}
  i: 5 => {{5},{{0.5,3,%},{0.25,7,5}}}
  Rank 1 received this:
  i: 0 => {{0},{{0.5,3,%},{-888.8,-999,$}}} *** ERROR ****
  i: 1 => {{1},{{0.5,3,%},{-888.8,-999,$}}} *** ERROR ****
  i: 2 => {{2},{{0.5,3,%},{-888.8,-999,$}}} *** ERROR ****
  i: 3 => {{3},{{0.5,3,%},{-888.8,-999,$}}} *** ERROR ****
  i: 4 => {{4},{{0.5,3,%},{-888.8,-999,$}}} *** ERROR ****
  i: 5 => {{5},{{0.5,3,%},{-888.8,-999,$}}} *** ERROR ****

With openmpi 1.7.0:
  Rank 0 send this:
  i: 0 => {{0},{{0.5,3,%},{0.25,7,5}}}
  i: 1 => {{1},{{0.5,3,%},{0.25,7,5}}}
  i: 2 => {{2},{{0.5,3,%},{0.25,7,5}}}
  i: 3 => {{3},{{0.5,3,%},{0.25,7,5}}}
  i: 4 => {{4},{{0.5,3,%},{0.25,7,5}}}
  i: 5 => {{5},{{0.5,3,%},{0.25,7,5}}}
  Rank 1 received this:
  i: 0 => {{0},{{0.5,3,%},{-888.8,-999,$}}} *** ERROR ****
  i: 1 => {{1},{{0.5,3,%},{-888.8,-999,$}}} *** ERROR ****
  i: 2 => {{2},{{0.5,3,%},{-888.8,-999,$}}} *** ERROR ****
  i: 3 => {{3},{{0.5,3,%},{-888.8,-999,$}}} *** ERROR ****
  i: 4 => {{4},{{0.5,3,%},{-888.8,-999,$}}} *** ERROR ****
  i: 5 => {{5},{{0.5,3,%},{-888.8,-999,$}}} *** ERROR ****

With mpich-3.0.3:
  Rank 0 send this:
  i: 0 => {{0},{{0.5,3,%},{0.25,7,5}}}
  i: 1 => {{1},{{0.5,3,%},{0.25,7,5}}}
  i: 2 => {{2},{{0.5,3,%},{0.25,7,5}}}
  i: 3 => {{3},{{0.5,3,%},{0.25,7,5}}}
  i: 4 => {{4},{{0.5,3,%},{0.25,7,5}}}
  i: 5 => {{5},{{0.5,3,%},{0.25,7,5}}}
  Rank 1 received this:
  i: 0 => {{0},{{0.5,3,%},{0.25,7,5}}} OK
  i: 1 => {{1},{{0.5,3,%},{0.25,7,5}}} OK
  i: 2 => {{2},{{0.5,3,%},{0.25,7,5}}} OK
  i: 3 => {{3},{{0.5,3,%},{0.25,7,5}}} OK
  i: 4 => {{4},{{0.5,3,%},{0.25,7,5}}} OK
  i: 5 => {{5},{{0.5,3,%},{0.25,7,5}}} OK

I also "valgrinded" the code under mpich:
  mpirun -n 2 valgrind ./sbugnt
==25148== Memcheck, a memory error detector
==25148== Copyright (C) 2002-2012, and GNU GPL'd, by Julian Seward et al.
==25148== Using Valgrind-3.8.1 and LibVEX; rerun with -h for copyright info
==25148== Command: ./sbugnt
==25148==
==25147== Memcheck, a memory error detector
==25147== Copyright (C) 2002-2012, and GNU GPL'd, by Julian Seward et al.
==25147== Using Valgrind-3.8.1 and LibVEX; rerun with -h for copyright info
==25147== Command: ./sbugnt
==25147==
  Rank 0 send this:
  i: 0 => {{0},{{0.5,3,%},{0.25,7,5}}}
  i: 1 => {{1},{{0.5,3,%},{0.25,7,5}}}
  i: 2 => {{2},{{0.5,3,%},{0.25,7,5}}}
  i: 3 => {{3},{{0.5,3,%},{0.25,7,5}}}
  i: 4 => {{4},{{0.5,3,%},{0.25,7,5}}}
  i: 5 => {{5},{{0.5,3,%},{0.25,7,5}}}
  Rank 1 received this:
  i: 0 => {{0},{{0.5,3,%},{0.25,7,5}}} OK
  i: 1 => {{1},{{0.5,3,%},{0.25,7,5}}} OK
  i: 2 => {{2},{{0.5,3,%},{0.25,7,5}}} OK
  i: 3 => {{3},{{0.5,3,%},{0.25,7,5}}} OK
  i: 4 => {{4},{{0.5,3,%},{0.25,7,5}}} OK
  i: 5 => {{5},{{0.5,3,%},{0.25,7,5}}} OK
==25147==
==25147== HEAP SUMMARY:
==25147==     in use at exit: 0 bytes in 0 blocks
==25147==   total heap usage: 215 allocs, 215 frees, 26,067 bytes allocated
==25147==
==25147== All heap blocks were freed -- no leaks are possible
==25147==
==25147== For counts of detected and suppressed errors, rerun with: -v
==25147== ERROR SUMMARY: 0 errors from 0 contexts (suppressed: 0 from 0)
==25148==
==25148== HEAP SUMMARY:
==25148==     in use at exit: 0 bytes in 0 blocks
==25148==   total heap usage: 213 allocs, 213 frees, 26,019 bytes allocated
==25148==
==25148== All heap blocks were freed -- no leaks are possible
==25148==
==25148== For counts of detected and suppressed errors, rerun with: -v
==25148== ERROR SUMMARY: 0 errors from 0 contexts (suppressed: 0 from 0)


Did we misuse something?

Thanks for your help!

Eric


#include "mpi.h"
#include <cstdio>
#include <cstdlib>
#include <cmath>
#include <iostream>

//**************************************************************************
//
// This example shows a problem with multiple nested types!
// It works perfectly with mpich-3.0.3 but seems to transmit incorrectly
// with openmpi 1.6.3, 1.6.4, 1.7.0 and 1.7.1
//
// The basic problem seems to arise with a vector of PAPairLongV2D, which is an
// MPI nested type constructed like this:
//--------------------------------------
// Struct          | is composed of
//--------------------------------------
// PAPairLongV2D   |  {PALong,PAV2D}
// PAV2D           |  {2xPADouble}
// PADouble        |  {double, long, char}
// PALong          |  {long}
//--------------------------------------
//
// If you comment out the #define THE_BUG, PALong is changed to:
//
// PALong          |  {long, long, char}
//
// and everything works perfectly!
//
//**************************************************************************

using namespace std;

// Comment out the #define and everything works fine! (it changes the struct PALong)
#define THE_BUG

//Forward declarations (full definitions after the "main"):
void abortOnError(int ierr);
class PALong;
class PAV2D;
class PADouble;
class PAPairLongV2D;

// a constant:
#define FIRST_CHAR 32

//****************************
//
// Bunch of class definitions:
//
//****************************


//! Function to abort on any MPI error:
void abortOnError(int ierr) {
  if (ierr != MPI_SUCCESS) {
    std::cerr << "ERROR Returned by MPI: " << ierr << std::endl;
    char lCharPtr[MPI_MAX_ERROR_STRING];
    int lLongueur = 0;
    MPI_Error_string(ierr,lCharPtr, &lLongueur);
    std::cerr << "ERROR_string Returned by MPI: " << lCharPtr << std::endl;
    MPI_Abort( MPI_COMM_WORLD, 1 );
  }
}

// Here go the class definitions:

#ifdef THE_BUG

// Defining some classes and the MPI_datatype:
class PALong
{
public:
  long int  aLong; //!< a long int

  //! Notice the default value!
  PALong() : aLong(-444) {}

  //! A method to "print" the class:
  void print(std::ostream& pOS) {pOS << "{" << aLong << "}";}

  static MPI_Datatype  asMPIDatatype; //!< attribute storing the MPI_datatype
  static MPI_Datatype& reqMPIDatatype() { return asMPIDatatype;}

  //! A static method to create the MPI_datatype:
  static void          createMPIDatatype()
  {
    asMPIDatatype = MPI_LONG;
  }
};
#else
class PALong
{
public:

  //! Notice the default values!
  PALong() : aLong(-555), aLVal(-666), aLB(FIRST_CHAR+3) {}

  long int  aLong;
  long int  aLVal;
  char   aLB;

  void print(std::ostream& pOS) {pOS << "{" << aLong << "," << aLVal << "," << aLB << "}";}

  static MPI_Datatype  asMPIDatatype;

  static MPI_Datatype& reqMPIDatatype() { return asMPIDatatype;}

  static void createMPIDatatype() {

    PALong lPAType;

    MPI_Datatype lTypes[3];

    lTypes[0] = MPI_LONG;
    lTypes[1] = MPI_LONG;
    lTypes[2] = MPI_CHAR;

    MPI_Aint lDeplacements[3];

    MPI_Aint lPtrBase = 0;
    MPI_Get_address(&lPAType,            &lPtrBase);
    MPI_Get_address(&lPAType.aLong,      &lDeplacements[0]);
    MPI_Get_address(&lPAType.aLVal,      &lDeplacements[1]);
    MPI_Get_address(&lPAType.aLB,        &lDeplacements[2]);

    //Compute the "displacement" from lPtrBase
    for (int i = 0; i < 3; ++i) {
      lDeplacements[i] -= lPtrBase;
    }

    int lNbParBloc[3] = {1,1,1};

    abortOnError(MPI_Type_create_struct(3, lNbParBloc, lDeplacements, lTypes, &asMPIDatatype));
    abortOnError(MPI_Type_commit(&asMPIDatatype));

  }
};

#endif

// The static attribute declaration:
MPI_Datatype PALong::asMPIDatatype = MPI_DATATYPE_NULL;

//----------------------------------------------------------
//! Here is another class storing a double, a long and a char:
//----------------------------------------------------------
class PADouble
{
public:
  //! Notice the default values!
  PADouble() :aDouble(-888.8), aDVal(-999), aDB(FIRST_CHAR+4) {}

  double    aDouble;
  long int  aDVal;
  char      aDB;

  static MPI_Datatype  asMPIDatatype;
  static MPI_Datatype& reqMPIDatatype() { return asMPIDatatype;}

  void print(std::ostream& pOS) {pOS << "{" << aDouble << "," << aDVal << "," << aDB << "}";}

  static void createMPIDatatype() {

    PADouble lPAType;

    MPI_Datatype lTypes[3];

    lTypes[0] = MPI_DOUBLE;
    lTypes[1] = MPI_LONG;
    lTypes[2] = MPI_CHAR;

    MPI_Aint lDeplacements[3];

    MPI_Aint lPtrBase = 0;
    MPI_Get_address(&lPAType,           &lPtrBase);
    MPI_Get_address(&lPAType.aDouble,   &lDeplacements[0]);
    MPI_Get_address(&lPAType.aDVal,     &lDeplacements[1]);
    MPI_Get_address(&lPAType.aDB,       &lDeplacements[2]);

    //Compute the "displacement" from lPtrBase
    for (int i = 0; i < 3; ++i) {
      lDeplacements[i] -= lPtrBase;
    }

    int lNbParBloc[3] = {1,1,1};

    abortOnError(MPI_Type_create_struct(3, lNbParBloc, lDeplacements, lTypes, &asMPIDatatype));
    abortOnError(MPI_Type_commit(&asMPIDatatype));

  }
};
MPI_Datatype PADouble::asMPIDatatype = MPI_DATATYPE_NULL;

// Another class: a kind of "2D vector"
class PAV2D
{

public:
  PAV2D() {}

  PADouble aXYZ[2];

  static MPI_Datatype  asMPIDatatype;
  static MPI_Datatype& reqMPIDatatype() { return asMPIDatatype;}
  void print(std::ostream& pOS) {
    pOS << "{" ;
    aXYZ[0].print(pOS);
    pOS << "," ;
    aXYZ[1].print(pOS);
    pOS << "}" ;
  }

  static void createMPIDatatype()
  {
    PAV2D lPAType;

    MPI_Datatype lTypes;

    lTypes = PADouble::reqMPIDatatype();

    MPI_Aint lDeplacements;

    MPI_Aint lPtrBase = 0;
    MPI_Get_address(&lPAType,                             &lPtrBase);
    MPI_Get_address(&lPAType.aXYZ[0],                     &lDeplacements);

    //Compute the "displacement" from lPtrBase
    lDeplacements -= lPtrBase;

    int lNbParBloc;

    lNbParBloc = 2;

    abortOnError(MPI_Type_create_struct(1, &lNbParBloc, &lDeplacements, &lTypes, &asMPIDatatype));
    abortOnError(MPI_Type_commit(&asMPIDatatype));

  }

};
MPI_Datatype PAV2D::asMPIDatatype = MPI_DATATYPE_NULL;


// And the last class: a kind of std::pair<>
class PAPairLongV2D
{
public:
  PAPairLongV2D()  {}

  PALong aFirst;
  PAV2D  aSecond;

  static MPI_Datatype  asMPIDatatype;
  static MPI_Datatype& reqMPIDatatype() { return asMPIDatatype;}
  void print(std::ostream& pOS) {
    pOS << "{" ;
    aFirst.print(pOS);
    pOS << "," ;
    aSecond.print(pOS);
    pOS << "}" ;
  }
  static void createMPIDatatype()
  {
    PAPairLongV2D lPAType;

    MPI_Datatype lTypes[2];

    lTypes[0] = PALong::reqMPIDatatype();
    lTypes[1] = PAV2D ::reqMPIDatatype();

    MPI_Aint lDeplacements[2];

    MPI_Aint lPtrBase = 0;
    MPI_Get_address(&lPAType,                             &lPtrBase);
    MPI_Get_address(&lPAType.aFirst,                      &lDeplacements[0]);
    MPI_Get_address(&lPAType.aSecond,                     &lDeplacements[1]);

    //Compute the "displacement" from lPtrBase
    for (int i = 0; i < 2; ++i) {
      lDeplacements[i] -= lPtrBase;
    }

    int lNbParBloc[2] = {1,1};

    abortOnError(MPI_Type_create_struct(2, lNbParBloc, lDeplacements, lTypes, &asMPIDatatype));
    abortOnError(MPI_Type_commit(&asMPIDatatype));

  }

};

MPI_Datatype PAPairLongV2D::asMPIDatatype = MPI_DATATYPE_NULL;



//****************************
//
// Here is now the main...
//
//****************************



int main(int argc, char *argv[])
{

  MPI_Init(&argc, &argv);

  //Call the type creations once:
  PALong       ::createMPIDatatype();
  PADouble     ::createMPIDatatype();
  PAV2D        ::createMPIDatatype();
  PAPairLongV2D::createMPIDatatype();

  int rank, size;
  MPI_Comm_rank(MPI_COMM_WORLD, &rank);
  MPI_Comm_size(MPI_COMM_WORLD, &size);

  if (size != 2)
  {
    std::cout << "Please run with 2 processes.\n";
    MPI_Finalize();
    return 1;
  }

  // Here is the array we will try to transmit:
  PAPairLongV2D* lBuf = 0;

  const int lBufSize = 6;
  const int lTag = 123;
  lBuf = new PAPairLongV2D[lBufSize];

  if (rank == 0)
  {
    // Fill in some values (to override the defaults from the constructors):
    for (int i=0; i<lBufSize; i++) {
      lBuf[i].aFirst.aLong             = i;
      lBuf[i].aSecond.aXYZ[0].aDouble  = (1/2.);
      lBuf[i].aSecond.aXYZ[0].aDVal    = 3*1;
      lBuf[i].aSecond.aXYZ[0].aDB      = FIRST_CHAR+5*1;
      lBuf[i].aSecond.aXYZ[1].aDouble =  1/4.;
      lBuf[i].aSecond.aXYZ[1].aDVal    = 7*1;
      lBuf[i].aSecond.aXYZ[1].aDB      = FIRST_CHAR+1+20;
    }
    // Print what we will send:
    std::cout << " Rank 0 send this: " << std::endl;
    for (int i=0; i<lBufSize; i++) {
      std::cout << " i: " << i << " => ";
      lBuf[i].print(std::cout);
      std::cout << std::endl;
    }
    //Now send this vector!
    abortOnError(MPI_Send(lBuf, lBufSize, PAPairLongV2D::reqMPIDatatype(), 1, lTag, MPI_COMM_WORLD));
  }

  if (rank == 1)
  {
    MPI_Status status;
    status.MPI_SOURCE = -1;
    status.MPI_TAG = -1;
    status.MPI_ERROR = -1;
    abortOnError(MPI_Recv(lBuf, lBufSize, PAPairLongV2D::reqMPIDatatype(), 0, lTag, MPI_COMM_WORLD, &status));

    // For verifying the status:
    int lCount = -1;
    abortOnError(MPI_Get_count(&status, PAPairLongV2D::reqMPIDatatype(), &lCount));
    const bool lAllOK = 0           == status.MPI_SOURCE &&
                        lTag        == status.MPI_TAG &&
                        MPI_SUCCESS == status.MPI_ERROR &&
                        lCount      == lBufSize;
    if (lAllOK)
    {
      std::cout << "MPI_Recv returned success and everything in MPI_Status is 
correct after receive." << std::endl;
    }
    else {
      std::cout << "MPI_Status is not correct!" << std::endl;
    }

    std::cout << " Rank 1 received this: " << std::endl;
    for (int i=0; i<lBufSize; i++) {
      std::cout << " i: " << i << " => ";
      lBuf[i].print(std::cout);
      // Verify what we should have received:
      bool lOK = true;
      lOK &= lBuf[i].aFirst.aLong             == i;
      lOK &= lBuf[i].aSecond.aXYZ[0].aDouble  == (1/2.);
      lOK &= lBuf[i].aSecond.aXYZ[0].aDVal    == 3*1;
      lOK &= lBuf[i].aSecond.aXYZ[0].aDB      == FIRST_CHAR+5*1;
      lOK &= lBuf[i].aSecond.aXYZ[1].aDouble  ==  1/4.;
      lOK &= lBuf[i].aSecond.aXYZ[1].aDVal    == 7*1;
      lOK &= lBuf[i].aSecond.aXYZ[1].aDB      == FIRST_CHAR+1+20;

      //If it is not what we expect, print an error:
      std::cout << (lOK ? " OK " : " *** ERROR ****") << std::endl;

    }

  }
   MPI_Barrier(MPI_COMM_WORLD);
   delete [] lBuf;

#ifndef THE_BUG
   // Careful: we can't call MPI_Type_free on native types...
   abortOnError(MPI_Type_free(&PALong       ::reqMPIDatatype()));
#endif
   abortOnError(MPI_Type_free(&PADouble     ::reqMPIDatatype()));
   abortOnError(MPI_Type_free(&PAV2D        ::reqMPIDatatype()));
   abortOnError(MPI_Type_free(&PAPairLongV2D::reqMPIDatatype()));

  abortOnError(MPI_Finalize());

  return 0;
}
