Hi Jeff,

Thanks for your answer!

You planted a doubt in my mind... and gave me hope... :-)

So I made some modifications to the code to help everyone:

1- It's now in "C"... :-)
2- Concerning your remark about the arbitrary address: I am now using the "offsetof" macro defined in "stddef.h" to compute the offsets (displacements) needed to create the datatypes; see the short excerpt just below.
3- I have simplified and reduced (again) the number of lines needed to reproduce the error...
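
For example, here is how the offsets of the {long,int} pair are now computed in the attached file:

    MPI_Aint lOffset[2];
    lOffset[0] = offsetof(struct PAPairLI, aLong);
    lOffset[1] = offsetof(struct PAPairLI, aInt);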

see "nested_bug.c" attached to this mail...

Output with Open MPI 1.6.3:
----------------------------------------
 Rank 0 sends this:
{{1},{{2,3},{4,5}}}
 Rank 1 received this: {{1},{{2,3},{4199789,15773951}}} *** ERROR ****

Expected output (still OK with MPICH 3.0.3 and Intel MPI 4):
----------------------------------------
 Rank 0 sends this:
{{1},{{2,3},{4,5}}}
 Rank 1 received this: {{1},{{2,3},{4,5}}} OK

Thanks!

Eric


On 2013-04-23 18:03, Jeff Squyres (jsquyres) wrote:
Sorry for the delay.

My C++ is a bit rusty, but this does not seem correct to me.

You're making the datatypes relative to an arbitrary address (&lPtrBase) in a 
static method on each class.  You really need the datatypes to be relative to each 
instance's *this* pointer.

Doing so allows MPI to read/write the data relative to the specific instance of 
the objects that you're trying to send/receive.

Make sense?
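
For example (a rough sketch with a hypothetical C pair type standing in for your classes; dest and tag are placeholders), you can either describe the layout with offsets relative to the start of the struct and pass each instance itself as the buffer:

    #include <stddef.h>
    struct Pair { long a; int b; };   /* hypothetical stand-in */

    int          blk[2]   = {1, 1};
    MPI_Aint     disp[2]  = { offsetof(struct Pair, a),
                              offsetof(struct Pair, b) };
    MPI_Datatype types[2] = {MPI_LONG, MPI_INT};
    MPI_Datatype pairType;
    MPI_Type_create_struct(2, blk, disp, types, &pairType);
    MPI_Type_commit(&pairType);

    struct Pair p = {42, 7};
    MPI_Send(&p, 1, pairType, dest, tag, MPI_COMM_WORLD);

or take the absolute addresses of the fields of one *specific* instance with MPI_Get_address() and then send that same instance with MPI_BOTTOM as the buffer:

    MPI_Get_address(&p.a, &disp[0]);
    MPI_Get_address(&p.b, &disp[1]);
    MPI_Type_create_struct(2, blk, disp, types, &pairType);
    MPI_Type_commit(&pairType);
    MPI_Send(MPI_BOTTOM, 1, pairType, dest, tag, MPI_COMM_WORLD);

What you can't do is compute the displacements from the address of some unrelated local variable and then use the resulting datatype on a different instance.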


On Apr 23, 2013, at 5:01 PM, Eric Chamberland 
<eric.chamberl...@giref.ulaval.ca> wrote:

One more piece of information: I just tested the example with Intel MPI 4.0.1.007, and it
works correctly...

So the problem seems to occur only with Open MPI... which is the default
distribution we use... :-/

Is my example code too long?

Eric

On 2013-04-23 09:55, Eric Chamberland wrote:
Sorry,

here is the attachment...

Eric

On 04/23/2013 09:54 AM, Eric Chamberland wrote:
Hi,

I sent a previous message showing something that I think is a bug
(or maybe a misuse, but...).

I have worked on that example to simplify it: it is now almost half
the number of lines of code and the structures are simpler... but it
still shows the wrong behaviour.

Briefly, we construct different MPI_Datatypes and nest them into a final
type, which is a:
{MPI_LONG,{{MPI_LONG,MPI_CHAR}*2}}
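
Roughly, the construction of that nested type looks like this (a sketch with hypothetical struct names; the attached example contains the actual code):

    /* hypothetical C structs mirroring the layout: */
    struct PairLC       { long aLong; char aChar; };
    struct Long2PairsLC { long aFirst; struct PairLC a2Pairs[2]; };

    /* inner pair type {MPI_LONG, MPI_CHAR}: */
    MPI_Datatype lPairTypes[2] = {MPI_LONG, MPI_CHAR};
    int          lPairBlk[2]   = {1, 1};
    MPI_Aint     lPairDisp[2]  = { offsetof(struct PairLC, aLong),
                                   offsetof(struct PairLC, aChar) };
    MPI_Datatype lPair;
    MPI_Type_create_struct(2, lPairBlk, lPairDisp, lPairTypes, &lPair);
    MPI_Type_commit(&lPair);

    /* outer type {MPI_LONG, {lPair, lPair}}: */
    MPI_Datatype lOuterTypes[2] = {MPI_LONG, lPair};
    int          lOuterBlk[2]   = {1, 2};
    MPI_Aint     lOuterDisp[2]  = { offsetof(struct Long2PairsLC, aFirst),
                                    offsetof(struct Long2PairsLC, a2Pairs) };
    MPI_Datatype lOuter;
    MPI_Type_create_struct(2, lOuterBlk, lOuterDisp, lOuterTypes, &lOuter);
    MPI_Type_commit(&lOuter);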

Here is the output from Open MPI 1.6.3:

  Rank 0 sends this:
  i: 0 => {{0},{{3,%},{7,5}}}
  i: 1 => {{1},{{3,%},{7,5}}}
  i: 2 => {{2},{{3,%},{7,5}}}
  i: 3 => {{3},{{3,%},{7,5}}}
  i: 4 => {{4},{{3,%},{7,5}}}
  i: 5 => {{5},{{3,%},{7,5}}}
MPI_Recv returned success and everything in MPI_Status is correct after
receive.
  Rank 1 received this:
  i: 0 => {{0},{{3,%},{-999,$}}} *** ERROR ****
  i: 1 => {{1},{{3,%},{-999,$}}} *** ERROR ****
  i: 2 => {{2},{{3,%},{-999,$}}} *** ERROR ****
  i: 3 => {{3},{{3,%},{-999,$}}} *** ERROR ****
  i: 4 => {{4},{{3,%},{-999,$}}} *** ERROR ****
  i: 5 => {{5},{{3,%},{-999,$}}} *** ERROR ****

Here is the expected output, obtained with mpich-3.0.3:

  Rank 0 sends this:
  i: 0 => {{0},{{3,%},{7,5}}}
  i: 1 => {{1},{{3,%},{7,5}}}
  i: 2 => {{2},{{3,%},{7,5}}}
  i: 3 => {{3},{{3,%},{7,5}}}
  i: 4 => {{4},{{3,%},{7,5}}}
  i: 5 => {{5},{{3,%},{7,5}}}
MPI_Recv returned success and everything in MPI_Status is correct after
receive.
  Rank 1 received this:
  i: 0 => {{0},{{3,%},{7,5}}} OK
  i: 1 => {{1},{{3,%},{7,5}}} OK
  i: 2 => {{2},{{3,%},{7,5}}} OK
  i: 3 => {{3},{{3,%},{7,5}}} OK
  i: 4 => {{4},{{3,%},{7,5}}} OK
  i: 5 => {{5},{{3,%},{7,5}}} OK

Is it related to the bug reported here:
http://www.open-mpi.org/community/lists/devel/2013/04/12267.php ?

Thanks,

Eric




#include "mpi.h"
#include <malloc.h>
#include <stdio.h>
#include <stddef.h>

/**************************************************************************
//
// This example shows a problem with nested types!
// It works perfectly with mpich-3.0.3 but seems to transmit wrong data
// with openmpi 1.6.3, 1.6.4, 1.7.0 and 1.7.1
//
// The basic problem seems to arise with a PALong_2Pairs, which is a
// nested MPI type constructed like this:
//--------------------------------------
// Struct          | is composed of
//--------------------------------------
// PAPairLI        |  {long, int}
// PALong_2Pairs   |  {long,{PAPairLI,PAPairLI}}
//--------------------------------------
//
*/


/*! Function to abort on any MPI error: */
void abortOnError(int ierr) {
  if (ierr != MPI_SUCCESS) {
    char lErrorString[MPI_MAX_ERROR_STRING];
    int  lLongueur = 0;
    printf("ERROR returned by MPI: %d\n", ierr);
    MPI_Error_string(ierr, lErrorString, &lLongueur);
    printf("ERROR string returned by MPI: %s\n", lErrorString);
    MPI_Abort(MPI_COMM_WORLD, 1);
  }
}


/*****************************************************
//
// PAPairLI is a pair: {long, int}
//
*/

struct PAPairLI
{
  long   aLong;
  int    aInt;
};
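
/* Note: on a typical LP64 platform, aLong is at offset 0, aInt is at
   offset 8 (because of the 8-byte alignment of long), and
   sizeof(struct PAPairLI) is 16, i.e. there are 4 bytes of trailing
   padding. This is why the offsets below are computed with offsetof
   rather than hard-coded. */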

/* Global variables: */
MPI_Datatype gPAPairLI_datatype      = MPI_DATATYPE_NULL;
MPI_Datatype gPALong_2Pairs_datatype = MPI_DATATYPE_NULL;

void createPAPaireLI_datatype() 
{
    MPI_Datatype lTypes[2] = {MPI_LONG, MPI_INT};

    /*Compute the offset:*/
    MPI_Aint lOffset[2];
    lOffset[0] = offsetof(struct PAPairLI, aLong);
    lOffset[1] = offsetof(struct PAPairLI, aInt);

    int lBlocLen[2] = {1,1};

    abortOnError(MPI_Type_create_struct(2, lBlocLen, lOffset, lTypes,
                                        &gPAPairLI_datatype));
    abortOnError(MPI_Type_commit(&gPAPairLI_datatype));
}
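
/* Aside (a defensive sketch, not active code, since it is not needed for
   this reproducer): to make the extent of gPAPairLI_datatype explicitly
   equal to sizeof(struct PAPairLI), instead of relying on the alignment
   padding the MPI implementation adds to struct types, one could do:

     MPI_Datatype lUnresized = gPAPairLI_datatype;
     abortOnError(MPI_Type_create_resized(lUnresized, 0,
                      sizeof(struct PAPairLI), &gPAPairLI_datatype));
     abortOnError(MPI_Type_commit(&gPAPairLI_datatype));
     abortOnError(MPI_Type_free(&lUnresized));

   The extent matters when the type is used with a block length > 1, as
   in PALong_2Pairs below. */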

/*****************************************************
//
// PALong_2Pairs is a struct of: {long, PAPairLI[2]}
//
*/
struct PALong_2Pairs
{
  long      aFirst;
  struct PAPairLI a2Pairs[2];
};

void printPALong_2Pairs(struct PALong_2Pairs* pObj)
{
  printf("{{%ld},{{%ld,%d},{%ld,%d}}}",
         pObj->aFirst,
         pObj->a2Pairs[0].aLong, pObj->a2Pairs[0].aInt,
         pObj->a2Pairs[1].aLong, pObj->a2Pairs[1].aInt);
}

void createPALong_2Pairs_datatype()
{
    MPI_Datatype lTypes[2] = {MPI_LONG, gPAPairLI_datatype};

    /*Compute the offset:*/
    MPI_Aint lOffset[2];
    lOffset[0] = offsetof(struct PALong_2Pairs, aFirst);
    lOffset[1] = offsetof(struct PALong_2Pairs, a2Pairs);

    int lBlocLen[2] = {1,2};

    abortOnError(MPI_Type_create_struct(2, lBlocLen, lOffset, lTypes,
                                        &gPALong_2Pairs_datatype));
    abortOnError(MPI_Type_commit(&gPALong_2Pairs_datatype));
}



/****************************
//
// Here is now the main...
//
*/
int main(int argc, char *argv[])
{
  int rank, size;

  MPI_Init(&argc, &argv);
  MPI_Comm_rank(MPI_COMM_WORLD, &rank);
  MPI_Comm_size(MPI_COMM_WORLD, &size);

  if (size != 2)
  {
    printf("Please run with 2 processes.\n");
    MPI_Finalize();
    return 1;
  }

  /* Create the datatypes once: */
  createPAPaireLI_datatype(); 
  createPALong_2Pairs_datatype();

  /* Here is the object we will try to transmit: */
  struct PALong_2Pairs lBuf;
  const int lTag = 123;

  if (rank == 0)
  {
    /* Fill in some values: */
    lBuf.aFirst              = 1;
    lBuf.a2Pairs[0].aLong    = 2;
    lBuf.a2Pairs[0].aInt     = 3;
    lBuf.a2Pairs[1].aLong    = 4;
    lBuf.a2Pairs[1].aInt     = 5;

    /* Print what we will send:*/
    printf(" Rank 0 send this:\n");
    printPALong_2Pairs(&lBuf);
    printf("\n");

    /*Now send this object!*/
    abortOnError(MPI_Send(&lBuf, 1, gPALong_2Pairs_datatype, 1, lTag,
                          MPI_COMM_WORLD));
  }

  if (rank == 1)
  {
    MPI_Status status;
    status.MPI_SOURCE = -1;
    status.MPI_TAG = -1;
    status.MPI_ERROR = MPI_SUCCESS;
    abortOnError(MPI_Recv(&lBuf, 1, gPALong_2Pairs_datatype, 0, lTag,
                          MPI_COMM_WORLD, &status));

    /* Verify the status: */
    int lCount = -1;
    abortOnError(MPI_Get_count(&status, gPALong_2Pairs_datatype,&lCount));
    const int lAllOK = 0           == status.MPI_SOURCE &&
                       lTag        == status.MPI_TAG &&
                       MPI_SUCCESS == status.MPI_ERROR &&
                       lCount      == 1;
    if (!lAllOK) {
      printf("MPI_Status is not correct!");
      MPI_Abort(MPI_COMM_WORLD,1);
    }

    printf(" Rank 1 received this: ");
    printPALong_2Pairs(&lBuf);
    /* Verify what we should have received: */
    int lOK = 1;
    lOK &= lBuf.aFirst           == 1;
    lOK &= lBuf.a2Pairs[0].aLong == 2;
    lOK &= lBuf.a2Pairs[0].aInt  == 3;
    lOK &= lBuf.a2Pairs[1].aLong == 4;
    lOK &= lBuf.a2Pairs[1].aInt  == 5;

    /* If it is not what we expected, print an error: */
    const char* lOkOrNot = (lOK ? " OK " : " *** ERROR ****");
    printf("%s\n",lOkOrNot);
  }

  abortOnError(MPI_Barrier(MPI_COMM_WORLD));

  abortOnError(MPI_Type_free(&gPALong_2Pairs_datatype));
  abortOnError(MPI_Type_free(&gPAPairLI_datatype));

  abortOnError(MPI_Finalize());

  return 0;
}
