Apparently if I reverse my offsets (i.e. the root, process 1, gets an offset
of 4 and process 2 gets an offset of 0) then everything is written to the
file (it doesn't matter so much that the data ends up in reverse order, as
long as it stays consistent).  I've generalized this to an arbitrary number
of processors and run it with 12 on our pvfs2 cluster, and as long as I
reverse the block_offsets everything appears to work fine...
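
To be concrete, here is roughly the offset calculation I mean (a sketch, not
the exact code I ran; it assumes nbl = 2*irank as in the test program below,
and uses the same irank/nproc/block_offset variables):

   ! "forward" offsets, which I believe should be correct: each rank
   ! starts where the blocks of the lower ranks end
   block_offset = 0
   do i = 1,irank-1
      block_offset = block_offset + 2*i   ! nbl of rank i
   end do

   ! "reversed" offsets, which are what actually give a complete file on
   ! the clusters: each rank starts after the blocks of the higher ranks
   block_offset = 0
   do i = irank+1,nproc
      block_offset = block_offset + 2*i   ! nbl of rank i
   end do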

Can someone explain this to me?  Is this some C/Fortran funny business?
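
In case it is a kind-mismatch issue between the default Fortran integer and
the HDF5 integer kinds, a quick sanity check I could add to the program is
something like this (sketch; storage_size is Fortran 2008):

   ! print the bit sizes of the default integer and the HDF5 kinds, to
   ! rule out a 32/64-bit mismatch between the Fortran and C sides
   write(*,*) 'default integer bits: ', storage_size(0)
   write(*,*) 'HSIZE_T bits:         ', storage_size(0_HSIZE_T)
   write(*,*) 'HID_T bits:           ', storage_size(0_HID_T)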

On Thu, Oct 7, 2010 at 3:25 PM, Peter Brady <[email protected]> wrote:

> Hello everyone,
>
> I'm having some trouble getting HDF5 to work on our clusters.  I've
> compiled hdf5 with the following options:
>
> export CC=mpicc
> export CXX=mpicxx
> export FC=mpif90
>
> --enable-fortran --enable-parallel --disable-production --enable-debug=all
>
> I wrote a simple fortran test program to be run with two processors below:
>
>> program test
>>   use hdf5
>>   implicit none
>>   include 'mpif.h'
>>
>>   integer, parameter :: str_medium = 40
>>   ! group name
>>   character(len=str_medium) :: hdf_block_group = '/Block'
>>   ! dataset names
>>   character(len=str_medium) :: hdf_nnodes_dset = 'nnodes_gl'
>>   ! h5 file/group/dataset/dataspace descriptors
>>   integer(HSIZE_T), dimension(1) :: dset_dims,dimsf,chunk_dims,offset
>>   integer(HID_T) :: plist_id
>>   integer(HID_T) :: file_id,filespace,memspace
>>   integer(HID_T) :: group_id,dset_id,dspace_id
>>   character(len=str_medium) :: filename
>>   integer :: rank,ierr,i
>>   ! data to write
>>   integer :: nbl,nbl_gl,block_offset
>>   integer, dimension(:),pointer :: nnodes_bl
>>   ! mpi stuff
>>   integer :: info
>>   integer, dimension(MPI_STATUS_SIZE) :: status
>>   integer :: irank,iroot,nproc
>>
>>
>>   call MPI_INIT(ierr)
>>   call MPI_COMM_RANK(MPI_COMM_WORLD,irank,ierr)
>>   call MPI_COMM_SIZE(MPI_COMM_WORLD,nproc,ierr)
>>   irank = irank+1
>>   iroot = 1
>>
>>   call h5open_f(ierr)
>>
>>   info = MPI_INFO_NULL
>>
>>   ! set up block structure
>>   nbl = irank*2
>>   filename = 'lit_restart.h5'
>>
>>   allocate(nnodes_bl(nbl))
>>   nnodes_bl = (/(irank,i=1,nbl)/)
>>   nbl_gl = 6
>>   if(irank == iroot) then
>>      block_offset = 0
>>   else
>>      block_offset = 2
>>   end if
>>
>>   ! Setup file access property list with parallel I/O access.
>>   call h5pcreate_f(H5P_FILE_ACCESS_F, plist_id, ierr)
>>   call h5pset_fapl_mpio_f(plist_id, MPI_COMM_WORLD, info, ierr)
>>
>>   ! Create the file collectively.
>>   call h5fcreate_f(filename, H5F_ACC_TRUNC_F, file_id, ierr, access_prp = plist_id)
>>   call h5pclose_f(plist_id, ierr)
>>
>>
>>   !**** block data *****
>>   call h5gcreate_f(file_id,trim(hdf_block_group),group_id,ierr)
>>   rank = 1
>>   dset_dims = (/nbl_gl/)
>>   chunk_dims = (/nbl/)
>>   offset = (/block_offset/)
>>   ! create dataspace for hyperslab dataset
>>   call h5screate_simple_f(rank,dset_dims,filespace,ierr)
>>   call h5dcreate_f(group_id,trim(hdf_nnodes_dset),H5T_NATIVE_INTEGER,filespace,&
>>        dset_id,ierr)
>>   call h5sclose_f(filespace, ierr)
>>   call h5screate_simple_f(rank,chunk_dims, memspace, ierr)
>>   ! select hyperslab in file
>>   call h5dget_space_f(dset_id,filespace,ierr)
>>   call h5sselect_hyperslab_f(filespace,H5S_SELECT_SET_F,offset,chunk_dims,ierr)
>>   ! new mpi list
>>   call h5pcreate_f(H5P_DATASET_XFER_F,plist_id,ierr)
>>   call h5pset_dxpl_mpio_f(plist_id,H5FD_MPIO_COLLECTIVE_F,ierr)
>>   ! write actual data
>>   call h5dwrite_f(dset_id,H5T_NATIVE_INTEGER,nnodes_bl,dset_dims,ierr,&
>>        file_space_id=filespace,mem_space_id=memspace,xfer_prp=plist_id)
>>   call h5sclose_f(filespace,ierr)
>>   call h5sclose_f(memspace,ierr)
>>   call h5pclose_f(plist_id,ierr)
>>   call h5dclose_f(dset_id,ierr)
>>   call h5gclose_f(group_id,ierr)
>>   write(*,'(2(a,i5))') 'hid_t: ',hid_t, ' hsize_t: ',hsize_t
>>   write(*,'(5(a,i3))') 'rank: ',irank,' nbl: ',nbl, ' dset_dims: ',dset_dims, &
>>        ' chunk_dims: ',chunk_dims, ' offset: ',offset
>>
>>   ! close file
>>   call h5fclose_f(file_id,ierr)
>>
>>   call h5close_f(ierr)
>>
>>   call MPI_FINALIZE(ierr)
>>
>> end program test
>>
>
>
> On my workstation this program behaves as expected and the dataset is:
> h5dump -d /Block/nnodes_gl lit_restart.h5
> HDF5 "lit_restart.h5" {
> DATASET "/Block/nnodes_gl" {
>    DATATYPE  H5T_STD_I32LE
>    DATASPACE  SIMPLE { ( 6 ) / ( 6 ) }
>    DATA {
>    (0): 1, 1, 2, 2, 2, 2
>    }
> }
> }
>
>
> However, on both the pvfs2 and Lustre clusters the output is:
> h5dump -d /Block/nnodes_gl lit_restart.h5
> HDF5 "lit_restart.h5" {
> DATASET "/Block/nnodes_gl" {
>    DATATYPE  H5T_STD_I32LE
>    DATASPACE  SIMPLE { ( 6 ) / ( 6 ) }
>    DATA {
>    (0): 2, 2, 2, 2, 0, 0
>    }
> }
> }
>
> I ran 'make check' on our pvfs2 cluster and got quite a few errors in the
> parallel tests.  I've attached the relevant output in out.debug.  Have I
> configured hdf5 incorrectly?
>
> Thanks,
> Peter.
>
>
>