Hi,
I found some unexpected behavior in the data distribution when I set the
MPI hint striping_unit to any value in MPI programs. This can be
reproduced with an arbitrary MPI-program creating the file with the create 
mode. 
For example if I run the MPI-IO.c program (which is attached) using a 
configuration of 5 data servers with the parameter
./MPI-IO -f pvfs2://pvfs2/testfile level0
it will write 100 MByte, each node creates a datafile which has the size of 20 
MBytes which is as expected.
However, if I set the striping_unit hint to a value during creation, which can
be done by 
./MPI-IO -f pvfs2://pvfs2/testfile level0 -H striping_unit=1000
Only node1 gets all 100 MBytes, while the other servers do not even create a datafile.
pvfs2-viewdist shows the correct distribution values which indicates that the 
parameter is used correctly during the file creation, however not used during 
the I/O request:

pvfs2-viewdist -f /pvfs2/testfile 
dist_name = simple_stripe
strip_size = 1000
Number of datafiles/servers = 5
Server 0 - tcp://node3:6666, handle: 2147483650 (80000002.bstream)
Server 1 - tcp://node4:6666, handle: 2863311532 (aaaaaaac.bstream)
Server 2 - tcp://node5:6666, handle: 3579139414 (d5555556.bstream)
Server 3 - tcp://node1:6666, handle: 1431655767 (55555557.bstream)
Server 4 - tcp://node2:6666, handle: 1431655768 (55555558.bstream)

Is there something I do not understand about the striping_unit hint, or is this
the correct behavior? I even get all 100 MBytes on the first server if I set the
striping_unit to 65536!

I use mpich2 1.0.5p3 and the current CVS version (however, this result also
appears in a version that is one month old).

Thanks,
Julian
#include <mpi.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>

#define MPI_Sleep(c, n) MPI_Barrier(c); sleep(n); MPI_Barrier(c);

/* Default test parameters; each is overridable via a command-line option. */
#define DEFAULT_FILESIZE (128 * 1024 * 1024)
#define DEFAULT_FILENAME "foobar"
#define DEFAULT_ITERATIONS 1

/* A test case: maps a command-line keyword to the function that runs it. */
typedef struct {
	char* name;
	void (*function) ();
} test;

/* Stuff needed by getopt() */
extern char *optarg;
extern int optind, opterr, optopt;

/* Set once in main() from MPI_COMM_WORLD; read by all test functions. */
int rank;
int size;

/* Options (filled in by get_args()) */
unsigned int filesize = DEFAULT_FILESIZE;
char* filename = DEFAULT_FILENAME;
unsigned int iterations = DEFAULT_ITERATIONS;

/* Shared I/O state used by every test function. */
int mode = MPI_MODE_RDWR | MPI_MODE_CREATE;	/* open mode; -d adds DELETE_ON_CLOSE */
char* buffer = NULL;				/* per-rank data buffer, filesize bytes used */
MPI_Info info;					/* hints collected from -H options */
MPI_Datatype datatype;				/* per-rank cyclic file-view type (built in main) */

void Test_level0 ();
void Test_level1 ();
void Test_level2 ();
void Test_level3 ();

/* Table of runnable tests; main() looks these up by name from argv. */
test tests[] = {
	{ "level0", Test_level0 },
	{ "level1", Test_level1 },
	{ "level2", Test_level2 },
	{ "level3", Test_level3 }
};

/* Tests must return void and take no arguments. */
/* Level 0: non-collective, contiguous */
/* Level 0: non-collective, contiguous */
void Test_level0 ()
{
	MPI_File fh;
	MPI_Status status;
	int iter;

	/* Open collectively and install the per-rank cyclic view. */
	MPI_File_open(MPI_COMM_WORLD, filename, mode, info, &fh);
	MPI_File_set_view(fh, 0, MPI_BYTE, datatype, "native", info);

	/* Write phase: independent (non-collective) writes through the view. */
	for (iter = 0; iter < iterations; iter++)
	{
		MPI_File_write(fh, buffer, filesize, MPI_BYTE, &status);
	}

	/* Let all ranks settle, then rewind for the read-back phase. */
	MPI_Sleep(MPI_COMM_WORLD, 1);
	MPI_File_seek(fh, 0, MPI_SEEK_SET);

	/* Read phase: independent reads of the same data. */
	for (iter = 0; iter < iterations; iter++)
	{
		MPI_File_read(fh, buffer, filesize, MPI_BYTE, &status);
	}

	MPI_File_close(&fh);
}

/* Level 1: collective, contiguous */
/* Level 1: collective, contiguous */
void Test_level1 ()
{
	MPI_File fh;
	MPI_Status status;
	int iter;

	/* Open collectively and install the per-rank cyclic view. */
	MPI_File_open(MPI_COMM_WORLD, filename, mode, info, &fh);
	MPI_File_set_view(fh, 0, MPI_BYTE, datatype, "native", info);

	/* Write phase: collective writes (all ranks participate together). */
	for (iter = 0; iter < iterations; iter++)
	{
		MPI_File_write_all(fh, buffer, filesize, MPI_BYTE, &status);
	}

	/* Let all ranks settle, then rewind for the read-back phase. */
	MPI_Sleep(MPI_COMM_WORLD, 1);
	MPI_File_seek(fh, 0, MPI_SEEK_SET);

	/* Read phase: collective reads of the same data. */
	for (iter = 0; iter < iterations; iter++)
	{
		MPI_File_read_all(fh, buffer, filesize, MPI_BYTE, &status);
	}

	MPI_File_close(&fh);
}

/* Level 2: non-collective, non-contiguous */
/* Level 2: non-collective, non-contiguous */
void Test_level2 ()
{
	MPI_File fh;
	MPI_Status status;
	int iter;

	/* Open collectively, install the cyclic view, and start at offset 0. */
	MPI_File_open(MPI_COMM_WORLD, filename, mode, info, &fh);
	MPI_File_set_view(fh, 0, MPI_BYTE, datatype, "native", info);
	MPI_File_seek(fh, 0, MPI_SEEK_SET);

	/* Write phase: independent writes through the non-contiguous view. */
	for (iter = 0; iter < iterations; iter++)
	{
		MPI_File_write(fh, buffer, filesize, MPI_BYTE, &status);
	}

	/* Let all ranks settle, then rewind for the read-back phase. */
	MPI_Sleep(MPI_COMM_WORLD, 1);
	MPI_File_seek(fh, 0, MPI_SEEK_SET);

	/* Read phase: independent reads of the same data. */
	for (iter = 0; iter < iterations; iter++)
	{
		MPI_File_read(fh, buffer, filesize, MPI_BYTE, &status);
	}

	MPI_File_close(&fh);
}

/* Level 3: collective, non-contiguous */
/* Level 3: collective, non-contiguous */
void Test_level3 ()
{
	MPI_File fh;
	MPI_Status status;
	int iter;

	/* Open collectively, install the cyclic view, and start at offset 0. */
	MPI_File_open(MPI_COMM_WORLD, filename, mode, info, &fh);
	MPI_File_set_view(fh, 0, MPI_BYTE, datatype, "native", info);
	MPI_File_seek(fh, 0, MPI_SEEK_SET);

	/* Write phase: collective writes through the non-contiguous view. */
	for (iter = 0; iter < iterations; iter++)
	{
		MPI_File_write_all(fh, buffer, filesize, MPI_BYTE, &status);
	}

	/* Let all ranks settle, then rewind for the read-back phase. */
	MPI_Sleep(MPI_COMM_WORLD, 1);
	MPI_File_seek(fh, 0, MPI_SEEK_SET);

	/* Read phase: collective reads of the same data. */
	for (iter = 0; iter < iterations; iter++)
	{
		MPI_File_read_all(fh, buffer, filesize, MPI_BYTE, &status);
	}

	MPI_File_close(&fh);
}

/* Print the command-line synopsis and option summary to stdout. */
void usage (char** argv)
{
	printf("Usage: %s [-d] [-f filename] [-h] [-H hints] [-i iterations] [-s filesize]\n", argv[0]);
	printf("	-d		Delete file on close. (Default: false)\n");
	printf("	-f		The name of the file used for the tests. (Default: %s)\n", DEFAULT_FILENAME);
	printf("	-h		Display this help.\n");
	printf("	-H		Hints, in the form key=value.\n");
	printf("	-i		Number of iterations. (Default: %d)\n", DEFAULT_ITERATIONS);
	printf("	-s		The size of the file used for the tests. (Default: %d)\n", DEFAULT_FILESIZE);
}

/*
 * Parse command-line options into the global configuration variables.
 * Recognized options:
 *   -d            add MPI_MODE_DELETE_ON_CLOSE to the open mode
 *   -f filename   file used for the tests
 *   -h            print usage and exit cleanly
 *   -H key=value  set an MPI_Info hint (e.g. striping_unit=65536)
 *   -i n          number of write/read iterations
 *   -s size       file size; an optional trailing K/M/G multiplies by 1024^n
 * Aborts all ranks if a -H argument contains no '='.
 */
void get_args (int argc, char** argv)
{
	int opt;
	char* key;
	char* value;
	char multiplier;

	while ((opt = getopt(argc, argv, "df:hH:i:s:")) != -1)
	{
		switch (opt)
		{
			case 'd':
				mode |= MPI_MODE_DELETE_ON_CLOSE;
				break;
			case 'f':
				filename = strdup(optarg);
				break;
			case 'h':
				usage(argv);
				MPI_Finalize();
				exit(0);
			case 'H':
				/* Hint format must be key=value. */
				key = optarg;

				if ((value = strchr(optarg, '=')) == NULL)
				{
					if (rank == 0)
					{
						printf("Error: Invalid hint.\n");
					}

					MPI_Abort(MPI_COMM_WORLD, 1);
				}

				/* Separate key and value (splits optarg in place). */
				*value = '\0';
				++value;

				if (rank == 0)
				{
					printf("Hint: %s=%s\n", key, value);
				}

				MPI_Info_set(info, key, value);

				/* Restore the string. */
				--value;
				*value = '=';
				break;
			case 'i':
				iterations = atoi(optarg);
				break;
			case 's':
				/* atoi() stops at the first non-digit, so a trailing
				 * K/M/G suffix is ignored by it and handled below. */
				filesize = atoi(optarg);
				multiplier = *(optarg + strlen(optarg) - 1);

				/* Cascading multipliers: G = 1024*M, M = 1024*K,
				 * K = 1024 bytes. Fallthrough is intentional. */
				switch (multiplier)
				{
					case 'G':
						filesize *= 1024;
						/* fallthrough */
					case 'M':
						filesize *= 1024;
						/* fallthrough */
					case 'K':
						filesize *= 1024;
						break;
				}

				break;
		}
	}
}

/*
 * Entry point: initialize MPI, parse options, build the per-rank cyclic
 * file-view datatype, allocate and fill the I/O buffer, then run every
 * test named on the command line (in the given order).
 */
int main (int argc, char** argv)
{
	int i;
	size_t j;	/* size_t: compared against sizeof() below */

	MPI_Init(&argc, &argv);

	MPI_Comm_rank(MPI_COMM_WORLD, &rank);
	MPI_Comm_size(MPI_COMM_WORLD, &size);

	printf("%3d/%3d: Hello world!\n", rank + 1, size);

	MPI_Barrier(MPI_COMM_WORLD);

	MPI_Info_create(&info);

	get_args(argc, argv);

	/* 1-D cyclic view: rank r owns bytes r, r+size, r+2*size, ... */
	int array_sizes[] = { size };
	int array_subsizes[] = { 1 };
	int array_starts[] = { rank };
	MPI_Type_create_subarray(1, array_sizes, array_subsizes, array_starts, MPI_ORDER_C, MPI_BYTE, &datatype);
	MPI_Type_commit(&datatype);

	/* Fixed: the original requested filesize * sizeof(MPI_BYTE) bytes, but
	 * MPI_BYTE is an MPI_Datatype *handle*, not a C type, so sizeof(MPI_BYTE)
	 * is the handle's size (typically 4 or 8) and the buffer was
	 * over-allocated by that factor. One MPI_BYTE element is exactly one
	 * byte, so filesize is the correct size. */
	if ((buffer = malloc(filesize)) == NULL)
	{
		printf("%3d/%3d: Error: Can not allocate memory.\n", rank + 1, size);

		MPI_Abort(MPI_COMM_WORLD, 1);
	}

	/* Fill with the rank number so misplaced data is detectable on disk. */
	memset(buffer, rank, filesize);
	MPI_Barrier(MPI_COMM_WORLD);

	/* Run all specified tests in the given order. */
	for (i = optind; i < argc; ++i)
	{
		for (j = 0; j < sizeof(tests) / sizeof(test); ++j)
		{
			if (strcmp(argv[i], tests[j].name) == 0)
			{
				printf("%3d/%3d: Running %s...\n", rank + 1, size, tests[j].name);
				tests[j].function();
			}
		}
	}

	free(buffer);	/* fixed: buffer was never released */

	MPI_Type_free(&datatype);
	MPI_Info_free(&info);

	MPI_Finalize();

	return 0;
}
_______________________________________________
Pvfs2-developers mailing list
[email protected]
http://www.beowulf-underground.org/mailman/listinfo/pvfs2-developers

Reply via email to