Hello, dear HDF experts,
I am using the parallel HDF5 library to access a three-dimensional dataset
in an HDF5 file on Linux. The dataset has dimensions dim[0] = 1501,
dim[1] = 1536, and dim[2] = 2048. I want to read the dataset along the
first (READ_Z), second (READ_Y), and third (READ_X) dimensions separately.
However, the performance differs dramatically between the three cases.
(1) Reading along the first dimension (#define READ_Z) is very fast. It
takes about 0.8 seconds to read 24 * 1536 * 2048 unsigned integers using
one node.
(2) Reading along the second dimension (#define READ_Y) is much slower. It
takes about 15 seconds to read 1501 * 25 * 2048 unsigned integers using
one node.
(3) Reading along the third dimension (#define READ_X) is the slowest. It
takes about 30 seconds to read 1501 * 1536 * 64 unsigned integers using
one node.
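Assuming 2 bytes per element (the attached code reads the data with the
H5T_STD_U16LE memory type), these figures correspond to roughly:
(1) 24 * 1536 * 2048 * 2 bytes ~ 151 MB in 0.8 s, about 190 MB/s;
(2) 1501 * 25 * 2048 * 2 bytes ~ 154 MB in 15 s, about 10 MB/s;
(3) 1501 * 1536 * 64 * 2 bytes ~ 295 MB in 30 s, about 10 MB/s.
So cases (2) and (3) read a similar amount of data as case (1) but are
more than an order of magnitude slower.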
The first case (1) seems reasonable to me, but the other two do not, and I
do not know why. I see that the HDF5 library uses MPI-IO (MPI_Type_vector
and MPI_File_set_view) for parallel access, but I do not understand why the
performance differs so much. Does it come from the parallel library, or
from my code (some parameters)? Please check the attached code for
details. Thanks.
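One parameter I have not tried tuning yet is the set of MPI-IO hints
passed to the file access property list. A rough sketch of what I mean is
below; the hint names romio_cb_read and cb_buffer_size are ROMIO-specific,
and the values are untested placeholders:

  MPI_Info info;
  MPI_Info_create(&info);
  MPI_Info_set(info, "romio_cb_read", "enable");     /* force collective buffering */
  MPI_Info_set(info, "cb_buffer_size", "16777216");  /* 16 MB aggregation buffer */
  plist_id = H5Pcreate(H5P_FILE_ACCESS);
  H5Pset_fapl_mpio(plist_id, MPI_COMM_WORLD, info);
  MPI_Info_free(&info);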
I am using MPICH2 and parallel HDF5 1.8.5 on Linux. The compiler is mpicc
from MPICH2.
I would greatly appreciate your comments and suggestions.
Best regards,
Yongsheng Pan
Postdoctoral Appointee
Advanced Photon Science
Argonne National Laboratory

/*********************************************************************/
/* PHDF5 Test Application */
/* */
/* 2011-08-31 */
/* */
/* Author(s): Nicholas Schwarz */
/* [email protected] */
/* */
/* Copyright 2011 UCHICAGO ARGONNE, LLC */
/* AS OPERATOR OF ARGONNE NATIONAL LABORATORY */
/*********************************************************************/
#include <mpi.h>
#include <hdf5.h>
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include <sys/time.h>
#define VERBOSE 0
#define READ_Z
// #define READ_Y
// #define READ_X
//#define FILE_NAME "/data/nschwarz/file_256x512x512.h5"
#define FILE_NAME "/data/nschwarz/raw_stu.hdf5"
//#define FILE_NAME "/data/nschwarz/file_512x1024x1024.h5"
//#define FILE_NAME "/data/nschwarz/file_1024x1024x1024.h5"
//#define FILE_NAME "/data/nschwarz/file_2048x1024x1024.h5"
//#define FILE_NAME "/clhome/NSCHWARZ/tmp/raw_stu.hdf5"
int timeval_add(struct timeval *result,
struct timeval *t2,
struct timeval *t1)
{
long int sum =
(t2 -> tv_usec + 1000000 * t2 -> tv_sec) +
(t1 -> tv_usec + 1000000 * t1 -> tv_sec);
result -> tv_sec = sum / 1000000;
result -> tv_usec = sum % 1000000;
return (sum < 0);
}
int timeval_subtract(struct timeval *result,
struct timeval *t2,
struct timeval *t1)
{
long int diff =
(t2 -> tv_usec + 1000000 * t2 -> tv_sec) -
(t1 -> tv_usec + 1000000 * t1 -> tv_sec);
result -> tv_sec = diff / 1000000;
result -> tv_usec = diff % 1000000;
return (diff < 0);
}
int main (int argc, char** argv)
{
// Check command line
if (argc != 2) {
fprintf(stderr, "phdf_test: Must specify number of bunches.\n");
exit(1);
}
// Number of bunches each node reads
int numBunches = atoi(argv[1]);
// HDF5
hid_t file_id;
hid_t dset_id;
hid_t plist_id;
herr_t status;
// Time
struct timeval tvBegin, tvEnd, tvDiff, tvTotal;
tvTotal.tv_sec = 0;
tvTotal.tv_usec = 0;
// MPI variables
int mpi_size;
int mpi_rank;
MPI_Comm comm = MPI_COMM_WORLD;
MPI_Info info = MPI_INFO_NULL;
// Create MPI world
MPI_Init(&argc, &argv);
MPI_Comm_size(comm, &mpi_size);
MPI_Comm_rank(comm, &mpi_rank);
// Print welcome
if (mpi_rank == 0) {
fprintf(stdout, "------------------------------\n");
fprintf(stdout, "PHDF5 Test Application \n");
fprintf(stdout, "------------------------------\n");
fprintf(stdout, "\n");
}
// Sync up
MPI_Barrier(MPI_COMM_WORLD);
// Print status
if (VERBOSE) {
fprintf(stdout, "Rank %d reporting for duty...\n", mpi_rank);
}
// Sync up
MPI_Barrier(MPI_COMM_WORLD);
// Set up file access property list with parallel I/O access
plist_id = H5Pcreate(H5P_FILE_ACCESS);
H5Pset_fapl_mpio(plist_id, MPI_COMM_WORLD, MPI_INFO_NULL);
// Open the file
// file_id = H5Fopen(FILE_NAME, H5F_ACC_RDONLY, H5P_DEFAULT/*plist_id*/);
file_id = H5Fopen(FILE_NAME, H5F_ACC_RDONLY, plist_id);
H5Pclose(plist_id);
// Open the dataset
dset_id = H5Dopen2(file_id, "/entry/exchange/data", H5P_DEFAULT);
//dset_id = H5Dopen2(file_id, "/exchange/data", H5P_DEFAULT);
// Create property list for collective dataset write.
plist_id = H5Pcreate(H5P_DATASET_XFER);
// Setup IO
H5Pset_dxpl_mpio(plist_id, H5FD_MPIO_COLLECTIVE);
//H5Pset_dxpl_mpio(plist_id, H5FD_MPIO_INDEPENDENT);
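// Collective mode lets the MPI-IO layer aggregate the per-rank requests
// (two-phase I/O); the independent mode above is kept for comparison.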
// Get dataspace
hid_t dataspace = H5Dget_space(dset_id);
// Get dimensions of full dataset
hsize_t dims_dataset[3];
H5Sget_simple_extent_dims(dataspace, dims_dataset, NULL);
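// Note: HDF5 stores data in C (row-major) order, so for a contiguous
// (non-chunked) layout the last dimension varies fastest on disk and
// hyperslabs taken along dimension 0 are the most contiguous in the file.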
// Sync up
MPI_Barrier(MPI_COMM_WORLD);
// Print status
if (mpi_rank == 0) {
fprintf(stdout, "\nDataset dimensions: %d %d %d\n",
dims_dataset[0], dims_dataset[1], dims_dataset[2]);
fprintf(stdout, "Number of bunches: %d\n\n", numBunches);
}
// Sync up
MPI_Barrier(MPI_COMM_WORLD);
// Number of slices per node
#ifdef READ_Z
int deltaZ = dims_dataset[0] / mpi_size;
#endif
#ifdef READ_Y
int deltaZ = dims_dataset[1] / mpi_size;
#endif
#ifdef READ_X
int deltaZ = dims_dataset[2] / mpi_size;
#endif
// Starting slice for this node
int startZ = mpi_rank * deltaZ;
// Number of slices based on number of bunches
deltaZ = deltaZ / numBunches;
// Sync up
MPI_Barrier(MPI_COMM_WORLD);
// Number of bytes to allocate
//int numBytes = sizeof(int) * deltaZ * dims_dataset[1] * dims_dataset[2];
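// 2 bytes per element: the data is read below with a 16-bit memory type
// (H5T_STD_U16LE), so numBytes = 2 * number of selected elements.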
#ifdef READ_Z
hsize_t numBytes = 2 * deltaZ * dims_dataset[1] * dims_dataset[2];
// unsigned int data[ 1536 ][ 2048 ];
#endif
#ifdef READ_Y
hsize_t numBytes = 2 * deltaZ * dims_dataset[0] * dims_dataset[2];
// unsigned int data[ 1501 ][ 2048 ];
#endif
#ifdef READ_X
hsize_t numBytes = 2 * deltaZ * dims_dataset[0] * dims_dataset[1];
// unsigned int data[ 1501 ][ 1536 ];
#endif
// Print status
if (VERBOSE) {
fprintf(stdout, "Rank %d allocating %ld bytes...\n", mpi_rank, numBytes);
}
// Allocate memory
unsigned int* data = (unsigned int*) malloc(numBytes);
if (data == NULL) {
fprintf(stderr, "Memory allocation error!\n");
MPI_Abort(MPI_COMM_WORLD, 1);
}
// Sync up
MPI_Barrier(MPI_COMM_WORLD);
// Sync up
MPI_Barrier(MPI_COMM_WORLD);
// Print blank line
if (VERBOSE) {
if (mpi_rank == 0) {
fprintf(stdout, "\n");
fflush(stdout);
}
}
// Sync up
MPI_Barrier(MPI_COMM_WORLD);
// Get each bunch
for (int b = 0 ; b < numBunches ; b++) {
// Define hyperslab in the dataset
hsize_t count[3];
hsize_t offset[3];
#ifdef READ_Z
offset[0] = startZ + (b * deltaZ);
offset[1] = 0;
offset[2] = 0;
count[0] = deltaZ;
count[1] = dims_dataset[1];
count[2] = dims_dataset[2];
#endif
#ifdef READ_Y
offset[0] = 0;
offset[1] = startZ + (b * deltaZ);
offset[2] = 0;
count[0] = dims_dataset[0];
count[1] = deltaZ;
count[2] = dims_dataset[2];
#endif
#ifdef READ_X
offset[0] = 0;
offset[1] = 0;
offset[2] = startZ + (b * deltaZ);
count[0] = dims_dataset[0];
count[1] = dims_dataset[1];
count[2] = deltaZ;
#endif
status = H5Sselect_hyperslab(dataspace, H5S_SELECT_SET,
offset, NULL, count, NULL);
// define the memory dataspace
hsize_t dimsm[3];
#ifdef READ_Z
dimsm[0] = deltaZ;
dimsm[1] = dims_dataset[1];
dimsm[2] = dims_dataset[2];
#endif
#ifdef READ_Y
dimsm[0] = dims_dataset[0];
dimsm[1] = deltaZ;
dimsm[2] = dims_dataset[2];
#endif
#ifdef READ_X
dimsm[0] = dims_dataset[0];
dimsm[1] = dims_dataset[1];
dimsm[2] = deltaZ;
#endif
hid_t memspace = H5Screate_simple(3, dimsm, NULL);
hsize_t count_out[3];
hsize_t offset_out[3];
offset_out[0] = 0;
offset_out[1] = 0;
offset_out[2] = 0;
#ifdef READ_Z
count_out[0] = deltaZ;
count_out[1] = dims_dataset[1];
count_out[2] = dims_dataset[2];
#endif
#ifdef READ_Y
count_out[0] = dims_dataset[0];
count_out[1] = deltaZ;
count_out[2] = dims_dataset[2];
#endif
#ifdef READ_X
count_out[0] = dims_dataset[0];
count_out[1] = dims_dataset[1];
count_out[2] = deltaZ;
#endif
status = H5Sselect_hyperslab(memspace, H5S_SELECT_SET,
offset_out, NULL, count_out, NULL);
// Print status message
if (VERBOSE) {
fprintf(stdout, "Rank %d reading data...\n", mpi_rank);
}
// Sync up
MPI_Barrier(MPI_COMM_WORLD);
gettimeofday(&tvBegin, NULL);
//status =
// H5Dread(dset_id, H5T_STD_U32LE, memspace, dataspace, plist_id, data);
status = H5Dread(dset_id, H5T_STD_U16LE, memspace, dataspace, plist_id, data);
gettimeofday(&tvEnd, NULL);
timeval_subtract(&tvDiff, &tvEnd, &tvBegin);
printf("reading %d ~ %d slices takes %f seconds\n", startZ + b * deltaZ, startZ + (b+1) * deltaZ, tvDiff.tv_sec + 1e-6*tvDiff.tv_usec );
timeval_add(&tvTotal, &tvTotal, &tvDiff);
// Sync up
MPI_Barrier(MPI_COMM_WORLD);
// Sync up
MPI_Barrier(MPI_COMM_WORLD);
// Print blank line
if (VERBOSE) {
if (mpi_rank == 0) {
fprintf(stdout, "\n");
fflush(stdout);
}
}
// Sync up
MPI_Barrier(MPI_COMM_WORLD);
// Close HDF5 resources
H5Sclose(memspace);
} // End getting bunches
// Sync up
MPI_Barrier(MPI_COMM_WORLD);
// Print status message
fprintf(stderr, "Rank %d took %ld.%06ld\n",
mpi_rank, tvTotal.tv_sec, tvTotal.tv_usec);
// Sync up
MPI_Barrier(MPI_COMM_WORLD);
// Close HDF5 resources
H5Pclose(plist_id);
H5Dclose(dset_id);
H5Sclose(dataspace);
H5Fclose(file_id);
// Free memory
free(data);
// Clean up MPI
MPI_Finalize();
// Done
return 0;
}
_______________________________________________
Hdf-forum is for HDF software users discussion.
[email protected]
http://mail.hdfgroup.org/mailman/listinfo/hdf-forum_hdfgroup.org