Hello, at RRZE we tried to write more than 2 GB of data (per process) at once to a file with MPI_File_write_at(_all). The call returns with error code 35.
Attached you will find the compressed output of "ompi_info --all" and a test program (large_mpi_test.F90) with which the problem can be reproduced. The Open MPI version used was 1.3.3. The underlying file system is Lustre, which is able to deal with files > 2 GB. Furthermore, if we retrieve the file size with MPI_File_get_size for a file >= 2 GB, the returned size is negative. The attached test program get_file_size.f90 demonstrates the behavior:
$ dd if=/dev/zero of=delme.dat bs=1K count=1M
1048576+0 records in
1048576+0 records out
1073741824 bytes (1.1 GB) copied, 9.1401 seconds, 117 MB/s
$ mpirun -n 1 get_file_size
file size of delme.dat:
[0] file size = 1073741824 bytes, 0.102E+04 MB
$ dd if=/dev/zero of=delme.dat bs=1K count=2M
2097152+0 records in
2097152+0 records out
2147483648 bytes (2.1 GB) copied, 18.1567 seconds, 118 MB/s
$ mpirun -n 1 get_file_size
file size of delme.dat:
[0] file size = -2147483648 bytes, -.205E+04 MB
Regards
Markus Wittmann
ompi_info.log.tar.gz
Description: GNU Zip compressed data
!> Reproducer for large (> 2 GB per process) MPI transfers.
!!
!! Rank 0 interactively reads the number of double precision elements and
!! the test mode, broadcasts both, and then every rank
!!   * mode 1/3: takes part in a single send (rank 0) / recv (rank 1) of
!!               the whole buffer,
!!   * mode 2/3: writes its whole buffer with one mpi_file_write_at call
!!               into "test.sta" at offset rank * num_elements * 8 bytes.
!! Failures of the MPI-IO calls are reported with their numeric error code.
program large_mpi_test
  implicit none
  include "mpif.h"

  ! Kind of the Fortran type that corresponds to MPI_DOUBLE_PRECISION.
  integer, parameter :: dp = kind(1.0d0)
  ! Size of one element in the file; double precision is assumed to be 8 bytes.
  integer, parameter :: bytes_per_element = 8
  ! Message tag used by the send/recv test.
  integer, parameter :: msg_tag = 99
  ! File used by the MPI-IO test.
  character(len=*), parameter :: fn = "test.sta"

  integer :: num_elements, mode
  integer :: mp_ierr, mp_rk, mp_size
  integer :: fh, amode, status(MPI_STATUS_SIZE)
  integer :: io_stat, alloc_stat
  integer(kind=MPI_OFFSET_KIND) :: my_global_off
  real(kind=dp), allocatable :: fp_data(:)

  call mpi_init(mp_ierr)
  call mpi_comm_rank(mpi_comm_world, mp_rk, mp_ierr)
  call mpi_comm_size(mpi_comm_world, mp_size, mp_ierr)

  if (mp_rk == 0) then
    write(*,*) "mp_size =", mp_size
    write(*,*) "enter number of data elements must be <2147483647"
    write(*,*) "but max be larger than 268435456"
    write(*,*) "the available memory should be larger than 8x this number"
    read(*,*,iostat=io_stat) num_elements
    ! Unreadable input is mapped to an invalid count so that all ranks can
    ! leave together after the broadcast instead of hanging in mpi_bcast.
    if (io_stat /= 0) num_elements = 0
    write(*,*) "size of data", &
         real(num_elements, kind=dp) * bytes_per_element / 1024.0_dp / 1024.0_dp, " MB"
    write(*,*) "tests to use"
    write(*,*) " 1 : send/recv only"
    write(*,*) " 2 : MPI-IO only"
    write(*,*) " 3 : both"
    read(*,*,iostat=io_stat) mode
    ! Unreadable input selects no test at all.
    if (io_stat /= 0) mode = 0
  end if

  call mpi_bcast(num_elements, 1, MPI_INTEGER, 0, mpi_comm_world, mp_ierr)
  call mpi_bcast(mode, 1, MPI_INTEGER, 0, mpi_comm_world, mp_ierr)

  ! A non-positive count cannot be allocated or transferred meaningfully;
  ! every rank sees the same value here, so all of them shut down cleanly.
  if (num_elements < 1) then
    if (mp_rk == 0) write(*,*) "invalid number of data elements:", num_elements
    call mpi_finalize(mp_ierr)
    stop 1
  end if

  if (mp_rk == 0) then
    write(*,*) "allocating memory ..."
  end if
  allocate(fp_data(num_elements), stat=alloc_stat)
  if (alloc_stat /= 0) then
    write(*,*) "allocation failed - rank", mp_rk, "; stat=", alloc_stat
    call mpi_abort(mpi_comm_world, 1, mp_ierr)
    stop 1
  end if
  fp_data = 0.0_dp

  ! ---- point-to-point test ------------------------------------------------
  if (mode == 1 .or. mode == 3) then
    if (mp_size >= 2) then
      if (mp_rk == 0) then
        write(*,*) "starting send/recv."
        call mpi_send(fp_data, num_elements, MPI_DOUBLE_PRECISION, 1, msg_tag, &
                      mpi_comm_world, mp_ierr)
      end if
      if (mp_rk == 1) then
        call mpi_recv(fp_data, num_elements, MPI_DOUBLE_PRECISION, 0, msg_tag, &
                      mpi_comm_world, status, mp_ierr)
        write(*,*) "send/recv done."
      end if
    else
      ! With a single process there is no rank 1 to send to.
      if (mp_rk == 0) write(*,*) "send/recv test skipped: needs at least 2 processes"
    end if
    call mpi_barrier(mpi_comm_world, mp_ierr)
  end if

  ! ---- MPI-IO test --------------------------------------------------------
  if (mode == 2 .or. mode == 3) then
    if (mp_rk == 0) then
      write(*,*) "starting MPI-IO test"
      ! Delete an old status file. This should not produce a visible error
      ! if the file does not exist, and mp_ierr is deliberately not checked.
      call mpi_file_delete(fn, MPI_INFO_NULL, mp_ierr)
    end if

    ! MPI_MODE_CREATE is required because the file was just deleted and has
    ! to be created by the open call.
    amode = ior(MPI_MODE_WRONLY, MPI_MODE_CREATE)
    call mpi_barrier(mpi_comm_world, mp_ierr)
    call mpi_file_open(MPI_COMM_WORLD, fn, amode, MPI_INFO_NULL, fh, mp_ierr)
    if (mp_ierr /= MPI_SUCCESS) then
      ! Without a valid file handle the write below would be meaningless.
      write(*,*) "MPI_File_open() failed - rank", mp_rk, "; error=", mp_ierr
      call mpi_abort(mpi_comm_world, 1, mp_ierr)
      stop 1
    end if

    ! Compute the byte offset entirely in MPI_OFFSET_KIND arithmetic so the
    ! product cannot overflow a default integer.
    my_global_off = int(mp_rk, kind=MPI_OFFSET_KIND) &
                  * int(num_elements, kind=MPI_OFFSET_KIND) &
                  * int(bytes_per_element, kind=MPI_OFFSET_KIND)
    write(*,*) "rk=", mp_rk, "writing at", my_global_off

    call mpi_file_write_at(fh, my_global_off, fp_data, num_elements, &
                           MPI_DOUBLE_PRECISION, status, mp_ierr)
    if (mp_ierr /= MPI_SUCCESS) then
      write(*,*) "MPI_File_write_at() failed - rank", mp_rk, "; error=", mp_ierr
    end if

    call mpi_file_close(fh, mp_ierr)
    if (mp_ierr /= MPI_SUCCESS) then
      write(*,*) "writing restart data: MPI_File_close() failed - rank", mp_rk
    end if

    if (mp_rk == 0) then
      write(*,*) "DONE"
    end if
  end if

  deallocate(fp_data)
  call mpi_finalize(mp_ierr)
end program large_mpi_test

!> Open 'delme.dat' read-only with MPI-IO and print, on every rank, the file
!! size returned by mpi_file_get_size in bytes and in MB (bytes / 2**20).
program get_file_size
  implicit none
  include 'mpif.h'

  integer, parameter :: dp = kind(1.0d0)
  character(len=*), parameter :: file_name = 'delme.dat'

  integer :: error
  integer :: rank
  integer :: fh
  integer(kind=MPI_OFFSET_KIND) :: file_size

  ! The error reporter prints the rank; give it a defined value in case a
  ! failure is reported before mpi_comm_rank has run.
  rank = -1

  call mpi_init(error)
  call mpi_assert_success(error)
  call mpi_comm_rank(MPI_COMM_WORLD, rank, error)
  call mpi_assert_success(error)

  if (rank == 0) write(*,'(a,a,a)') 'file size of ', file_name, ':'

  call mpi_file_open(MPI_COMM_WORLD, file_name, MPI_MODE_RDONLY, &
                     MPI_INFO_NULL, fh, error)
  call mpi_assert_success(error)
  call mpi_file_get_size(fh, file_size, error)
  call mpi_assert_success(error)

  ! Convert with an explicit kind: the offset is a wide integer and the
  ! legacy float() intrinsic only yields default real.
  write(*,'(a,i0,a,i0,a,e9.3,a)') '[', rank, '] file size = ', file_size, &
       ' bytes, ', real(file_size, kind=dp) / 2.0_dp**20, ' MB'

  call mpi_file_close(fh, error)
  call mpi_assert_success(error)

  call mpi_finalize(error)
  ! After mpi_finalize no further MPI routines are called, so a failure is
  ! reported here directly rather than through mpi_assert_success.
  if (error /= MPI_SUCCESS) then
    write(*,'(a,i0,a,i0)') '# [', rank, &
         '] ERROR: mpi_finalize failed with error code: ', error
    stop 1
  end if

contains

  !> Abort the run if `error` is not MPI_SUCCESS.
  !!
  !! Prints the calling rank (host variable `rank`), the numeric error code
  !! and the text from mpi_error_string, then calls mpi_abort on
  !! MPI_COMM_WORLD. Returns silently when `error` equals MPI_SUCCESS.
  subroutine mpi_assert_success(error)
    integer, intent(in) :: error
    character(len=MPI_MAX_ERROR_STRING) :: error_string
    integer :: string_length, ierr

    if (error == MPI_SUCCESS) return

    call mpi_error_string(error, error_string, string_length, ierr)
    ! If the lookup itself failed, the buffer contents are not usable.
    if (ierr /= MPI_SUCCESS) error_string = '(no error description available)'

    write(*,'(a,i0,a,i0,1x,a)') '# [', rank, &
         '] ERROR: mpi_assert_success failed with error code: ', &
         error, trim(error_string)

    call mpi_abort(MPI_COMM_WORLD, 1, ierr)
    ! Fallback in case mpi_abort returns.
    stop 1
  end subroutine mpi_assert_success

end program get_file_size
