Hi,
I found some unexpected distribution behavior when I set the MPI hint
striping_unit to any value in an MPI program. This can be reproduced with an
arbitrary MPI program that creates the file with the create mode.
For example if I run the MPI-IO.c program (which is attached) using a
configuration of 5 data servers with the parameter
./MPI-IO -f pvfs2://pvfs2/testfile level0
it will write 100 MByte, each node creates a datafile which has the size of 20
MBytes which is as expected.
However, if I set the striping_unit hint to a value during creation, which can
be done by
./MPI-IO -f pvfs2://pvfs2/testfile level0 -H striping_unit=1000
Only node1 gets all 100 MBytes, while the other servers do not even create a datafile.
pvfs2-viewdist shows the correct distribution values which indicates that the
parameter is used correctly during the file creation, however not used during
the I/O request:
pvfs2-viewdist -f /pvfs2/testfile
dist_name = simple_stripe
strip_size = 1000
Number of datafiles/servers = 5
Server 0 - tcp://node3:6666, handle: 2147483650 (80000002.bstream)
Server 1 - tcp://node4:6666, handle: 2863311532 (aaaaaaac.bstream)
Server 2 - tcp://node5:6666, handle: 3579139414 (d5555556.bstream)
Server 3 - tcp://node1:6666, handle: 1431655767 (55555557.bstream)
Server 4 - tcp://node2:6666, handle: 1431655768 (55555558.bstream)
Is there something I do not understand about the striping_unit hint, or is
this the correct behavior? I even get all 100 MBytes on the first server if I
set the striping_unit to 65536!
I use mpich2.1.0.5p3 and the current CVS version (however, this result also
appears in a version that is one month old).
Thanks,
Julian
#include <mpi.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>
/* Barrier, sleep n seconds, barrier: loosely synchronizes all ranks of
 * communicator c so outstanding I/O can settle before the next phase.
 * Fix: wrapped in do { ... } while (0) so the macro expands to a single
 * statement and is safe inside un-braced if/else bodies; the original
 * expanded to three separate statements. */
#define MPI_Sleep(c, n) do { MPI_Barrier(c); sleep(n); MPI_Barrier(c); } while (0)
/* Default number of bytes each rank writes per iteration (128 MiB). */
#define DEFAULT_FILESIZE (128 * 1024 * 1024)
/* Default test file name (overridden with -f). */
#define DEFAULT_FILENAME "foobar"
/* Default number of write/read iterations (overridden with -i). */
#define DEFAULT_ITERATIONS 1
/* A named test case: maps a command-line level name (e.g. "level0") to
 * the function implementing it.  Tests take no arguments and return void. */
typedef struct {
char* name;
void (*function) ();
} test;
/* Stuff needed by getopt() */
extern char *optarg;
extern int optind, opterr, optopt;
/* This process's rank and the communicator size; set once in main(). */
int rank;
int size;
/* Options */
unsigned int filesize = DEFAULT_FILESIZE;   /* bytes written per rank per iteration (-s) */
char* filename = DEFAULT_FILENAME;          /* test file path (-f) */
unsigned int iterations = DEFAULT_ITERATIONS; /* write/read repetitions (-i) */
int mode = MPI_MODE_RDWR | MPI_MODE_CREATE; /* open mode; -d adds DELETE_ON_CLOSE */
char* buffer = NULL;                        /* filesize-byte I/O buffer, filled with the rank id */
MPI_Info info;                              /* hints collected from -H key=value options */
MPI_Datatype datatype;                      /* 1-byte-per-rank cyclic subarray filetype */
void Test_level0 ();
void Test_level1 ();
void Test_level2 ();
void Test_level3 ();
/* Dispatch table mapping command-line names to test functions; main()
 * runs every entry whose name appears on the command line, in order. */
test tests[] = {
{ "level0", Test_level0 },
{ "level1", Test_level1 },
{ "level2", Test_level2 },
{ "level3", Test_level3 }
};
/* Tests must return void and take no arguments. */
/* Level 0: non-collective, contiguous.
 * Each rank independently writes 'filesize' bytes 'iterations' times,
 * loosely synchronizes, rewinds, and reads the data back.
 * NOTE(review): the view is set with the global subarray 'datatype' as
 * the filetype, which interleaves each rank's bytes cyclically across
 * the file -- a non-contiguous file access pattern, effectively the same
 * as level 2.  Confirm this is intended; a truly contiguous level 0
 * would keep an MPI_BYTE filetype and seek each rank to its own offset. */
void Test_level0 ()
{
MPI_File fh;
MPI_Status status;
int i;
/* Collective open; every rank passes the same filename/mode/hints. */
MPI_File_open(MPI_COMM_WORLD, filename, mode, info, &fh);
MPI_File_set_view(fh, 0, MPI_BYTE, datatype, "native", info);
for (i = 0; i < iterations; ++i)
{
/* Independent (non-collective) write through the file view. */
MPI_File_write(fh, buffer, filesize, MPI_BYTE, &status);
}
/* Barrier/sleep/barrier so writes settle before the read phase. */
MPI_Sleep(MPI_COMM_WORLD, 1);
MPI_File_seek(fh, 0, MPI_SEEK_SET);
for (i = 0; i < iterations; ++i)
{
MPI_File_read(fh, buffer, filesize, MPI_BYTE, &status);
}
MPI_File_close(&fh);
}
/* Level 1: collective, contiguous.
 * Same as level 0 but uses the collective MPI_File_write_all /
 * MPI_File_read_all calls, letting ROMIO coordinate the ranks.
 * NOTE(review): as in level 0, the subarray filetype in the view makes
 * the file access pattern non-contiguous -- confirm this is intended. */
void Test_level1 ()
{
MPI_File fh;
MPI_Status status;
int i;
MPI_File_open(MPI_COMM_WORLD, filename, mode, info, &fh);
MPI_File_set_view(fh, 0, MPI_BYTE, datatype, "native", info);
for (i = 0; i < iterations; ++i)
{
/* Collective write; all ranks must participate in every call. */
MPI_File_write_all(fh, buffer, filesize, MPI_BYTE, &status);
}
/* Barrier/sleep/barrier so writes settle before the read phase. */
MPI_Sleep(MPI_COMM_WORLD, 1);
MPI_File_seek(fh, 0, MPI_SEEK_SET);
for (i = 0; i < iterations; ++i)
{
MPI_File_read_all(fh, buffer, filesize, MPI_BYTE, &status);
}
MPI_File_close(&fh);
}
/* Level 2: non-collective, non-contiguous.
 * Independent writes and reads through the cyclic byte-interleaved
 * subarray file view, with a barrier/sleep between the two phases. */
void Test_level2 ()
{
    MPI_File file;
    MPI_Status st;
    int pass;

    /* Open collectively and install the interleaving view. */
    MPI_File_open(MPI_COMM_WORLD, filename, mode, info, &file);
    MPI_File_set_view(file, 0, MPI_BYTE, datatype, "native", info);
    MPI_File_seek(file, 0, MPI_SEEK_SET);

    /* Write phase: each rank independently writes its buffer. */
    pass = 0;
    while (pass < iterations)
    {
        MPI_File_write(file, buffer, filesize, MPI_BYTE, &st);
        ++pass;
    }

    /* Let the writes settle on the servers before reading back. */
    MPI_Sleep(MPI_COMM_WORLD, 1);

    /* Read phase: rewind and read everything back independently. */
    MPI_File_seek(file, 0, MPI_SEEK_SET);
    pass = 0;
    while (pass < iterations)
    {
        MPI_File_read(file, buffer, filesize, MPI_BYTE, &st);
        ++pass;
    }

    MPI_File_close(&file);
}
/* Level 3: collective, non-contiguous.
 * Collective writes and reads through the cyclic byte-interleaved
 * subarray file view; all ranks participate in every I/O call. */
void Test_level3 ()
{
    MPI_File file;
    MPI_Status st;
    int round;

    /* Open collectively, install the view, and position at the start. */
    MPI_File_open(MPI_COMM_WORLD, filename, mode, info, &file);
    MPI_File_set_view(file, 0, MPI_BYTE, datatype, "native", info);
    MPI_File_seek(file, 0, MPI_SEEK_SET);

    /* Write phase: collective writes, one per iteration. */
    for (round = 0; round < iterations; round++)
    {
        MPI_File_write_all(file, buffer, filesize, MPI_BYTE, &st);
    }

    /* Let the writes settle on the servers before reading back. */
    MPI_Sleep(MPI_COMM_WORLD, 1);

    /* Read phase: rewind, then collective reads. */
    MPI_File_seek(file, 0, MPI_SEEK_SET);
    for (round = 0; round < iterations; round++)
    {
        MPI_File_read_all(file, buffer, filesize, MPI_BYTE, &st);
    }

    MPI_File_close(&file);
}
/* Print command-line help to stdout.  argv[0] supplies the program name;
 * defaults come from the DEFAULT_* macros. */
void usage (char** argv)
{
    printf("Usage: %s [-d] [-f filename] [-h] [-H hints] [-i iterations] [-s filesize]\n", argv[0]);
    printf(" -d Delete file on close. (Default: false)\n");
    printf(" -f The name of the file used for the tests. (Default: %s)\n", DEFAULT_FILENAME);
    printf(" -h Display this help.\n");
    printf(" -H Hints, in the form key=value.\n");
    printf(" -i Number of iterations. (Default: %d)\n", DEFAULT_ITERATIONS);
    printf(" -s The size of the file used for the tests. (Default: %d)\n", DEFAULT_FILESIZE);
}
/* Parse command-line options into the module-level option globals
 * (mode, filename, iterations, filesize) and the MPI info object.
 * Must be called after MPI_Init and MPI_Info_create(&info).
 * NOTE(review): atoi() silently yields 0 on invalid input; strtoul()
 * with error checking would reject bad -i/-s arguments explicitly. */
void get_args (int argc, char** argv)
{
int opt;
char* key;
char* value;
char multiplier;
while ((opt = getopt(argc, argv, "df:hH:i:s:")) != -1)
{
switch (opt)
{
case 'd':
mode |= MPI_MODE_DELETE_ON_CLOSE;
break;
case 'f':
filename = strdup(optarg);
break;
case 'h':
/* Help exits cleanly; finalize MPI first. */
usage(argv);
MPI_Finalize();
exit(0);
case 'H':
/* Hint format must be key=value. */
key = optarg;
if ((value = strchr(optarg, '=')) == NULL)
{
if (rank == 0)
{
printf("Error: Invalid hint.\n");
}
MPI_Abort(MPI_COMM_WORLD, 1);
}
/* Separate key and value. */
/* Temporarily split optarg in place: overwrite '=' with NUL so
 * 'key' and 'value' are two C strings within the same buffer. */
*value = '\0';
++value;
if (rank == 0)
{
printf("Hint: %s=%s\n", key, value);
}
MPI_Info_set(info, key, value);
/* Restore the string. */
--value;
*value = '=';
break;
case 'i':
iterations = atoi(optarg);
break;
case 's':
/* atoi() stops at the first non-digit, so a trailing K/M/G
 * suffix is ignored by the conversion and handled below. */
filesize = atoi(optarg);
multiplier = *(optarg + strlen(optarg) - 1);
/* Intentional cascade: G falls through M falls through K, so
 * each suffix multiplies by the right power of 1024. */
switch (multiplier)
{
case 'G':
filesize *= 1024;
/* fallthrough */
case 'M':
filesize *= 1024;
/* fallthrough */
case 'K':
filesize *= 1024;
break;
}
break;
}
}
}
/* Program entry point: initializes MPI, parses options, builds the
 * one-byte-per-rank cyclic subarray filetype, allocates and fills the
 * I/O buffer, then runs every test named on the command line in order. */
int main (int argc, char** argv)
{
    MPI_Init(&argc, &argv);
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
    MPI_Comm_size(MPI_COMM_WORLD, &size);
    printf("%3d/%3d: Hello world!\n", rank + 1, size);
    MPI_Barrier(MPI_COMM_WORLD);
    /* The info object must exist before get_args() stores -H hints in it. */
    MPI_Info_create(&info);
    get_args(argc, argv);
    /* 1-D subarray: the file is viewed as rounds of 'size' bytes and this
     * rank owns the single byte at offset 'rank' within each round. */
    int array_sizes[] = { size };
    int array_subsizes[] = { 1 };
    int array_starts[] = { rank };
    MPI_Type_create_subarray(1, array_sizes, array_subsizes, array_starts, MPI_ORDER_C, MPI_BYTE, &datatype);
    MPI_Type_commit(&datatype);
    /* Fix: the original allocated filesize * sizeof(MPI_BYTE), but
     * MPI_BYTE is an MPI_Datatype *handle*, so sizeof(MPI_BYTE) is the
     * size of the handle type (typically 4 or 8), not 1 -- inflating the
     * allocation by that factor.  The buffer holds exactly 'filesize'
     * raw bytes. */
    if ((buffer = malloc(filesize)) == NULL)
    {
        printf("%3d/%3d: Error: Can not allocate memory.\n", rank + 1, size);
        MPI_Abort(MPI_COMM_WORLD, 1);
    }
    /* Tag every byte with the owning rank so read-back is verifiable. */
    memset(buffer, rank, filesize);
    MPI_Barrier(MPI_COMM_WORLD);
    /* Run all specified tests in the given order. */
    for (int i = optind; i < argc; ++i)
    {
        /* size_t avoids the signed/unsigned comparison against sizeof. */
        for (size_t j = 0; j < sizeof(tests) / sizeof(test); ++j)
        {
            if (strcmp(argv[i], tests[j].name) == 0)
            {
                printf("%3d/%3d: Running %s...\n", rank + 1, size, tests[j].name);
                tests[j].function();
            }
        }
    }
    /* Fix: release the I/O buffer (the original leaked it). */
    free(buffer);
    MPI_Type_free(&datatype);
    MPI_Info_free(&info);
    MPI_Finalize();
    return 0;
}
_______________________________________________
Pvfs2-developers mailing list
[email protected]
http://www.beowulf-underground.org/mailman/listinfo/pvfs2-developers