Hello, dear HDF experts, 

I am using the parallel HDF5 library to access a three-dimensional dataset in
an HDF5 file on Linux. The dataset has dimensions dim[0] = 1501, dim[1] = 1536,
and dim[2] = 2048. I want to read the dataset along the first (READ_Z), second
(READ_Y), and third (READ_X) dimensions separately. 

However, the performance of the three cases differs greatly. 

   (1) Reading along the first dimension (#define READ_Z) is very fast. It
       takes about 0.8 seconds to read 24 * 1536 * 2048 unsigned integers
       using one node. 

   (2) Reading along the second dimension (#define READ_Y) is much slower. It
       takes about 15 seconds to read 1501 * 25 * 2048 unsigned integers
       using one node. 

   (3) Reading along the third dimension (#define READ_X) is the slowest. It
       takes about 30 seconds to read 1501 * 1536 * 64 unsigned integers
       using one node. 
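
For reference, assuming the 2-byte elements that the attached code actually
reads (H5T_STD_U16LE), the three cases move comparable amounts of data:
about 144 MiB for (1), 147 MiB for (2), and 281 MiB for (3). That works out
to roughly 180 MiB/s, 10 MiB/s, and 9 MiB/s, so the difference is not
explained by data volume alone. 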

The timing in case (1) seems reasonable to me; the other two do not, and I do
not know why. I see that the HDF5 library uses MPI I/O (MPI_Type_vector and
MPI_File_set_view) for parallel access, but I do not understand why the
performance differs so much. Does this come from the parallel library, or from
my code (some parameter I should be setting)? Please check the attached code
for details. Thanks. 
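
If the answer turns out to be "tune the MPI-IO parameters", the fragment below
is a minimal sketch (not part of the attached program) of how I would guess
hints could be passed down to the MPI-IO layer, in place of the
H5Pset_fapl_mpio(plist_id, MPI_COMM_WORLD, MPI_INFO_NULL) call in the attached
code. The hint names (ROMIO collective buffering) are only an assumption on my
part about what mpich2 uses underneath:

    /* Sketch only: pass MPI-IO hints through the file access property list.
     * The hint names and values are assumptions, not tested settings.      */
    MPI_Info hints;
    MPI_Info_create(&hints);
    MPI_Info_set(hints, "cb_buffer_size", "16777216");  /* 16 MB collective buffer   */
    MPI_Info_set(hints, "cb_nodes", "4");                /* number of I/O aggregators */

    plist_id = H5Pcreate(H5P_FILE_ACCESS);
    H5Pset_fapl_mpio(plist_id, MPI_COMM_WORLD, hints);
    file_id = H5Fopen(FILE_NAME, H5F_ACC_RDONLY, plist_id);
    H5Pclose(plist_id);
    MPI_Info_free(&hints);

If such hints, or switching between H5FD_MPIO_COLLECTIVE and
H5FD_MPIO_INDEPENDENT transfers, are known to matter for strided reads like
cases (2) and (3), I would like to know. 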

I am using MPICH2 and parallel HDF5 1.8.5 on Linux. The compiler is mpicc from
MPICH2. 

I would greatly appreciate your comments and suggestions. 

Best regards,

Yongsheng Pan
Postdoctoral Appointee
Advanced Photon Source
Argonne National Laboratory
/*********************************************************************/
/* PHDF5 Test Application                                            */
/*                                                                   */
/* 2011-08-31                                                        */
/*                                                                   */
/* Author(s): Nicholas Schwarz                                       */
/*            [email protected]                                   */
/*                                                                   */
/* Copyright 2011 UCHICAGO ARGONNE, LLC                              */
/*                AS OPERATOR OF ARGONNE NATIONAL LABORATORY         */
/*********************************************************************/

#include <mpi.h>
#include <hdf5.h>
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include <sys/time.h>


#define VERBOSE 0

#define READ_Z
// #define READ_Y
// #define READ_X


//#define FILE_NAME "/data/nschwarz/file_256x512x512.h5"
#define FILE_NAME "/data/nschwarz/raw_stu.hdf5"
//#define FILE_NAME "/data/nschwarz/file_512x1024x1024.h5"
//#define FILE_NAME "/data/nschwarz/file_1024x1024x1024.h5"
//#define FILE_NAME "/data/nschwarz/file_2048x1024x1024.h5"
//#define FILE_NAME "/clhome/NSCHWARZ/tmp/raw_stu.hdf5"


int timeval_add(struct timeval *result, 
		struct timeval *t2, 
		struct timeval *t1)
{

    long int sum = 
      (t2 -> tv_usec + 1000000 * t2 -> tv_sec) +
      (t1 -> tv_usec + 1000000 * t1 -> tv_sec);
    result -> tv_sec = sum / 1000000;
    result -> tv_usec = sum % 1000000;

    return (sum < 0);

}


int timeval_subtract(struct timeval *result, 
		     struct timeval *t2, 
		     struct timeval *t1)
{

    long int diff = 
      (t2 -> tv_usec + 1000000 * t2 -> tv_sec) - 
      (t1 -> tv_usec + 1000000 * t1 -> tv_sec);
    result -> tv_sec = diff / 1000000;
    result -> tv_usec = diff % 1000000;

    return (diff < 0);

}


int main (int argc, char** argv)
{

  // Check command line
  if (argc != 2) {
    fprintf(stderr, "Usage: phdf_test <number of bunches>\n");
    exit(1);
  }


  // Number of bunches each node reads
  int numBunches = atoi(argv[1]);
  if (numBunches < 1) {
    fprintf(stderr, "phdf_test: Number of bunches must be at least 1.\n");
    exit(1);
  }


  // HDF5
  hid_t file_id;
  hid_t dset_id;
  hid_t	plist_id;
  herr_t status;

  // Time
  struct timeval tvBegin, tvEnd, tvDiff, tvTotal;
  tvTotal.tv_sec = 0;
  tvTotal.tv_usec = 0;


  // MPI variables
  int mpi_size;
  int mpi_rank;
  MPI_Comm comm = MPI_COMM_WORLD;
  MPI_Info info = MPI_INFO_NULL;



  // Create MPI world
  MPI_Init(&argc, &argv);
  MPI_Comm_size(comm, &mpi_size);
  MPI_Comm_rank(comm, &mpi_rank);  



  // Print welcome
  if (mpi_rank == 0) {
    fprintf(stdout, "------------------------------\n");
    fprintf(stdout, "PHDF5 Test Application        \n");
    fprintf(stdout, "------------------------------\n");
    fprintf(stdout, "\n");
  }

  // Sync up
  MPI_Barrier(MPI_COMM_WORLD);

  // Print status
  if (VERBOSE) {
    fprintf(stdout, "Rank %d reporting for duty...\n", mpi_rank);
  }

  // Sync up
  MPI_Barrier(MPI_COMM_WORLD);



  // Set up file access property list with parallel I/O access
  plist_id = H5Pcreate(H5P_FILE_ACCESS);
  H5Pset_fapl_mpio(plist_id, MPI_COMM_WORLD, MPI_INFO_NULL);
  
  // Open the file
  // file_id = H5Fopen(FILE_NAME, H5F_ACC_RDONLY, H5P_DEFAULT/*plist_id*/);
  file_id = H5Fopen(FILE_NAME, H5F_ACC_RDONLY, plist_id);
  H5Pclose(plist_id);

  // Open the dataset
  dset_id = H5Dopen2(file_id, "/entry/exchange/data", H5P_DEFAULT);
  //dset_id = H5Dopen2(file_id, "/exchange/data", H5P_DEFAULT);

  // Create property list for collective dataset read.
  plist_id = H5Pcreate(H5P_DATASET_XFER);

  // Setup IO
  H5Pset_dxpl_mpio(plist_id, H5FD_MPIO_COLLECTIVE);
  //H5Pset_dxpl_mpio(plist_id, H5FD_MPIO_INDEPENDENT);

  // Get dataspace
  hid_t dataspace = H5Dget_space(dset_id);

  // Get dimensions of full dataset
  hsize_t dims_dataset[3];
  H5Sget_simple_extent_dims(dataspace, dims_dataset, NULL);



  // Sync up
  MPI_Barrier(MPI_COMM_WORLD);

  // Print status
  if (mpi_rank == 0) {
    fprintf(stdout, "\nDataset dimensions: %d %d %d\n", 
	    dims_dataset[0], dims_dataset[1], dims_dataset[2]);
    fprintf(stdout, "Number of bunches: %d\n\n", numBunches);
  }

  // Sync up
  MPI_Barrier(MPI_COMM_WORLD);



  // Number of slices per node along the dimension being read
#ifdef READ_Z
  int deltaZ = dims_dataset[0] / mpi_size;
#endif

#ifdef READ_Y
  int deltaZ = dims_dataset[1] / mpi_size;
#endif

#ifdef READ_X
  int deltaZ = dims_dataset[2] / mpi_size;
#endif


  // Starting slice for this node
  int startZ = mpi_rank * deltaZ;

  // Number of slices based on number of bunches
  deltaZ = deltaZ / numBunches;


  // Sync up
  MPI_Barrier(MPI_COMM_WORLD);

  // Number of bytes to allocate (dataset elements are 2-byte unsigned integers)

#ifdef READ_Z
  hsize_t numBytes = 2 * deltaZ * dims_dataset[1] * dims_dataset[2];
  // unsigned int data[ 1536 ][ 2048 ];
#endif

#ifdef READ_Y
  hsize_t numBytes = 2 * deltaZ * dims_dataset[0] * dims_dataset[2];
  // unsigned int data[ 1501 ][ 2048 ];
#endif

#ifdef READ_X
  hsize_t numBytes = 2 * deltaZ * dims_dataset[0] * dims_dataset[1];
  // unsigned int data[ 1501 ][ 1536 ];
#endif

  // Print status
  if (VERBOSE) {
    fprintf(stdout, "Rank %d allocating %ld bytes...\n", mpi_rank, numBytes);
  }

  // Allocate memory (the data are read as 16-bit unsigned integers)
  unsigned short* data = (unsigned short*) malloc(numBytes);
  if (data == NULL) {
    fprintf(stderr, "Memory allocation error!\n");
    MPI_Abort(MPI_COMM_WORLD, 1);
  }

  // Sync up
  MPI_Barrier(MPI_COMM_WORLD);


  // Sync up
  MPI_Barrier(MPI_COMM_WORLD);

  // Print blank line
  if (VERBOSE) {
    if (mpi_rank == 0) {
      fprintf(stdout, "\n");
      fflush(stdout);
    }
  }

  // Sync up
  MPI_Barrier(MPI_COMM_WORLD);



  // Get each bunch
  for (int b = 0 ; b < numBunches ; b++) {

    // Define hyperslab in the dataset
    hsize_t count[3];
    hsize_t offset[3];

#ifdef READ_Z
    offset[0] = startZ + (b * deltaZ);
    offset[1] = 0;
    offset[2] = 0;

    count[0] = deltaZ;
    count[1] = dims_dataset[1];
    count[2] = dims_dataset[2];
#endif

#ifdef READ_Y
    offset[0] = 0;
    offset[1] = startZ + (b * deltaZ);
    offset[2] = 0;

    count[0] = dims_dataset[0];
    count[1] = deltaZ;
    count[2] = dims_dataset[2];

#endif

#ifdef READ_X
    offset[0] = 0;
    offset[1] = 0;
    offset[2] = startZ + (b * deltaZ);

    count[0] = dims_dataset[0];
    count[1] = dims_dataset[1];
    count[2] = deltaZ;

#endif


    status = H5Sselect_hyperslab(dataspace, H5S_SELECT_SET, 
				 offset, NULL, count, NULL);


    // define the memory dataspace
    hsize_t dimsm[3];

#ifdef READ_Z
    dimsm[0] = deltaZ;
    dimsm[1] = dims_dataset[1];
    dimsm[2] = dims_dataset[2];
#endif

#ifdef READ_Y
    dimsm[0] = dims_dataset[0];
    dimsm[1] = deltaZ;
    dimsm[2] = dims_dataset[2];
#endif

#ifdef READ_X
    dimsm[0] = dims_dataset[0];
    dimsm[1] = dims_dataset[1];
    dimsm[2] = deltaZ;
#endif


    hid_t memspace = H5Screate_simple(3, dimsm, NULL);
    
    hsize_t count_out[3];
    hsize_t offset_out[3];

    offset_out[0] = 0;
    offset_out[1] = 0;
    offset_out[2] = 0;

#ifdef READ_Z
    count_out[0] = deltaZ;
    count_out[1] = dims_dataset[1];
    count_out[2] = dims_dataset[2];
#endif

#ifdef READ_Y
    count_out[0] = dims_dataset[0];
    count_out[1] = deltaZ;
    count_out[2] = dims_dataset[2];
#endif

#ifdef READ_X
    count_out[0] = dims_dataset[0];
    count_out[1] = dims_dataset[1];
    count_out[2] = deltaZ;
#endif
 
    status = H5Sselect_hyperslab(memspace, H5S_SELECT_SET, 
				 offset_out, NULL, count_out, NULL);


    // Print status message
    if (VERBOSE) {
      fprintf(stdout, "Rank %d reading data...\n", mpi_rank);
    }


    // Sync up
    MPI_Barrier(MPI_COMM_WORLD);

    gettimeofday(&tvBegin, NULL);
    //status = 
    //  H5Dread(dset_id, H5T_STD_U32LE, memspace, dataspace, plist_id, data);
    status = H5Dread(dset_id, H5T_STD_U16LE, memspace, dataspace, plist_id, data);
    gettimeofday(&tvEnd, NULL);

    timeval_subtract(&tvDiff, &tvEnd, &tvBegin);

    printf("reading %d ~ %d slices takes %f seconds\n", startZ + b * deltaZ, startZ + (b+1) * deltaZ, tvDiff.tv_sec + 1e-6*tvDiff.tv_usec );

    timeval_add(&tvTotal, &tvTotal, &tvDiff);

    // Sync up
    MPI_Barrier(MPI_COMM_WORLD);



    // Sync up
    MPI_Barrier(MPI_COMM_WORLD);

    // Print blank line
    if (VERBOSE) {
      if (mpi_rank == 0) {
	fprintf(stdout, "\n");
	fflush(stdout);
      }
    }

    // Sync up
    MPI_Barrier(MPI_COMM_WORLD);


    // Close HDF5 resources
    H5Sclose(memspace);
  
  } // End getting bunches



  // Sync up
  MPI_Barrier(MPI_COMM_WORLD);

  // Print status message
  fprintf(stderr, "Rank %d took %ld.%06ld\n", 
	  mpi_rank, tvTotal.tv_sec, tvTotal.tv_usec);

  // Sync up
  MPI_Barrier(MPI_COMM_WORLD);



  // Close HDF5 resources
  H5Pclose(plist_id);
  H5Dclose(dset_id);
  H5Sclose(dataspace);
  H5Fclose(file_id); 

  // Free memory
  free(data);

  // Clean up MPI
  MPI_Finalize();



  // Done
  return 0;

}     

