Hello, at RRZE we tried to write > 2 GB data (per process) at once to a file with MPI_File_write_at(_all). Thereby the function returns with error code 35.
Attached you will find the compressed output of "ompi_info --all" and a test program (large_mpi_test.F90) with that the problem can be reproduced. The Open MPI Vesion used was 1.3.3. The underlaying file system used is Lustre which is able to deal with files > 2GB. Further if we retrieve the file size with MPI_File_get_size for a file >= 2 GB the returned size is negative. The attached test program get_file_size.f90 will demonstrate the behavior: $ dd if=/dev/zero of=delme.dat bs=1K count=1M 1048576+0 records in 1048576+0 records out 1073741824 bytes (1.1 GB) copied, 9.1401 seconds, 117 MB/s $ mpirun -n 1 get_file_size file size of delme.dat: [0] file size = 1073741824 bytes, 0.102E+04 MB $ dd if=/dev/zero of=delme.dat bs=1K count=2M 2097152+0 records in 2097152+0 records out 2147483648 bytes (2.1 GB) copied, 18.1567 seconds, 118 MB/s $ mpirun -n 1 get_file_size file size of delme.dat: [0] file size = -2147483648 bytes, -.205E+04 MB Regards Markus Wittmann
ompi_info.log.tar.gz
Description: GNU Zip compressed data
program large_mpi_test implicit none include "mpif.h" integer :: num_elements,mode real*8, allocatable :: fp_data(:) integer :: mp_ierr,mp_rk,mp_size character (len=255) :: fn = "test.sta" ! used for mpi-io integer :: fh,amode,status(MPI_STATUS_SIZE) integer(KIND=MPI_OFFSET_KIND) :: my_global_off call mpi_init(mp_ierr) call mpi_comm_rank(mpi_comm_world,mp_rk ,mp_ierr) call mpi_comm_size(mpi_comm_world,mp_size,mp_ierr) if (mp_rk.eq.0) then write(*,*) "mp_size =", mp_size write(*,*) "enter number of data elements must be <2147483647" write(*,*) "but max be larger than 268435456" write(*,*) "the available memory should be larger than 8x this number" read(*,*) num_elements write(*,*) "size of data", num_elements*8.d0/1024.d0/1024.d0, " MB" write(*,*) "tests to use" write(*,*) " 1 : send/recv only" write(*,*) " 2 : MPI-IO only" write(*,*) " 3 : both" read(*,*) mode end if call mpi_bcast(num_elements,1,MPI_INTEGER,0,mpi_comm_world,mp_ierr) call mpi_bcast(mode,1,MPI_INTEGER,0,mpi_comm_world,mp_ierr) if (mp_rk.eq.0) then write(*,*) "allocating memory ..." end if allocate( fp_data(num_elements) ) ! there's hopefully enough memory fp_data(:) = 0 if ( mode.eq.1 .or. mode.eq.3 ) then if (mp_rk.eq.0) then write(*,*) "starting send/recv." call mpi_send(fp_data,num_elements,MPI_DOUBLE_PRECISION,1,99,mpi_comm_world,mp_ierr) end if if (mp_rk.eq.1) then call mpi_recv(fp_data,num_elements,MPI_DOUBLE_PRECISION,0,99,mpi_comm_world,status,mp_ierr) write(*,*) "send/recv done." end if call mpi_barrier(mpi_comm_world,mp_ierr) end if if ( mode.eq.2 .or. mode.eq.3 ) then if (mp_rk.eq.0) then write(*,*) "starting MPI-IO test" ! delete old status file (should not procude a visable error if ! the file does not exit - and we do not check mp_ierr here) call mpi_file_delete(fn,MPI_INFO_NULL,mp_ierr) end if ! MPI_MODE_CREATE is definitely required as the file has to exist ! before! (according to my tests - no idea about the standard) amode = IOR(MPI_MODE_WRONLY, MPI_MODE_CREATE) call mpi_barrier(mpi_comm_world,mp_ierr) call mpi_file_open(MPI_COMM_WORLD,fn,amode,MPI_INFO_NULL,fh,mp_ierr) my_global_off = mp_rk ! make type conversin :-) my_global_off = (my_global_off*num_elements)*8 ! double_precision=8 assumed write(*,*) "rk=", mp_rk, "writing at", my_global_off call mpi_file_write_at(fh,my_global_off,fp_data,num_elements, & MPI_DOUBLE_PRECISION,status,mp_ierr) if ( mp_ierr .ne. MPI_SUCCESS ) then write(*,*) "MPI_File_write_at_all() failed - rank", mp_rk, "; error=", mp_ierr end if call mpi_file_close(fh,mp_ierr) if ( mp_ierr .ne. MPI_SUCCESS ) then write(*,*) "writing restart data: MPI_File_close() failed - rank", mp_rk end if if (mp_rk.eq.0) then write(*,*) "DONE" end if end if call mpi_finalize(mp_ierr) end program large_mpi_test
program get_file_size implicit none include 'mpif.h' integer :: error integer :: rank integer :: fh integer (kind=MPI_OFFSET_KIND) :: file_size character (len=*), parameter :: file_name = 'delme.dat' call mpi_init(error) call mpi_assert_success(error) call mpi_comm_rank(MPI_COMM_WORLD, rank, error) call mpi_assert_success(error) if (rank.eq.0) write(*,'(a,a,a)') 'file size of ', file_name, ':' call mpi_file_open(MPI_COMM_WORLD, file_name, MPI_MODE_RDONLY, & MPI_INFO_NULL, fh, error) call mpi_assert_success(error) call mpi_file_get_size(fh, file_size, error) call mpi_assert_success(error) write(*,'(a,i0,a,i0,a,e9.3,a)') '[', rank, '] file size = ', file_size, & ' bytes, ', float(file_size) / 2**20, ' MB' call mpi_file_close(fh, error) call mpi_assert_success(error) call mpi_finalize(error) call mpi_assert_success(error) contains subroutine mpi_assert_success(error) integer :: error character (len=MPI_MAX_ERROR_STRING) :: error_string integer :: string_length, error_code if (error.ne.MPI_SUCCESS) then call mpi_error_string(error, error_string, string_length, error_code) write(*,'(a,i0,a,i0,x,a)') '# [', rank, & '] ERROR: mpi_assert_success failed with error code: ', & error, error_string stop end if end subroutine mpi_assert_success end program get_file_size