Dear all!
I have attached a file to this message (seclr4.F) that is a necessary replacement if you want to use ELPA versions from 2017 onwards. Since ELPA changed its interface, the WIEN2k implementation needed to be adapted as well. Just copy the file to $WIENROOT/SRC_lapw1 and recompile lapw1 - you do not have to change anything else. The old ELPA implementation in WIEN2k (for older versions) still works with the new file: you will have to add the switch " -DELPA15 " to your ELPA options (ELPA_OPT) in the Makefile in $WIENROOT/SRC_lapw1. This switch is an ADDITION to the already present switch "-DELPA" - both are needed to use the old interface with the new file. Kind regards, Thomas Ruh
! This file uses LAPACK routines ! SUBROUTINE SECLR4 ! #ifdef Parallel use matrices, only : H, S, HSROWS, Z, EIGVAL #else use matrices, only : HS, HSDIAG, HSROWS, Z, EIGVAL #endif use parallel, only : MYID, DESCHS, DESCZ, NPROW, NPCOL, BARRIER, & BLOCKSIZE, ICTXTALL, LDHS, LCOLZ, abort_parallel, & NPE, myrowhs, mycolhs,ictxt,ictxtall, NBELW_PARA, & #ifdef ELPA lcolhs, elpa_switch, scala_switch, elpa_mem_default #else elpa_switch, scala_switch #endif use lapw_timer, only : READ_CPU_TIME, READ_WALL_TIME, & START_TIMER, STOP_TIMER, & time_cholesky, time_transform, time_sep, & #ifdef ELPA time_backtransform, time_matrixcomplete #else time_backtransform #endif use lolog, only : nlo, loor, lapw, ilo #ifdef ELPA use comi, only : NVAA, NE, NBELW, NAT, NT, LNSMAX, NEMAX_IT #else use comi, only : NVAA, NE, NBELW, NAT, NT, LNSMAX #endif use coml, only : SPRSWF, PRNTWF, WFTAPE, REL, ITER, NOHNS use comr, only : RKM, EL, EU, WEIGHT, divide #ifdef ELPA #ifdef ELPA15 !!! ELPA versions before 2017 use elpa1 use elpa2 use elpa_utilities use elpa2_utilities use mod_blacs_infrastructure use test_util IMPLICIT NONE INCLUDE 'mpif.h' INCLUDE 'elpa.h' INCLUDE 'param.inc' #else !!! ELPA versions elpa2017 or newer use elpa IMPLICIT NONE INCLUDE 'mpif.h' INCLUDE 'param.inc' #endif #else IMPLICIT NONE INCLUDE 'param.inc' #endif ! .................................................................. ! ! interface between matrix set up in LAPW1 and LAPACK (BLAS) matrix ! subroutines; DOUBLE PRECISION version ! ! Solve the secular equations H*Z = EIGVAL*S*Z ! H and S are stored in the array HS, ! H is stored below the diagonal, whereas S is stored above ! the diagonal. The diagonal of H is stored in the ! one dimensional array HSDIAG. ! NVAA is the dimension of the secular matrices ! there is a limit on NE because of the dimensions used ! NS is the total space available for the eigenvectors Z ! NE is returned as the number of Eigenvalues between EL1 and EU1 ! except NE is limited to (NS/NVAA) ! 
EL1 and EU1 are input as EIGVAL(1) and EIGVAL(2). ! IF EIGVAL(2) .LT. EIGVAL(1) THEN no bounds are applied ! ! D.D.Koelling April,1974 ! ! .................................................................. ! ! ! ! Locals ! INTEGER IER1, IER2, NE1, NMAT1, NSPC INTEGER I, BLOCK1, BLOCK2 DOUBLE PRECISION EL1, EU1, Q DOUBLE PRECISION Flop DOUBLE PRECISION WTIME(4) INTEGER STDERR PARAMETER (STDERR = 99) ! ! External Functions ! ! INTEGER NEBLW1 ! EXTERNAL NEBLW1 ! ! Local Parameters ! INTEGER NBMIN, NBVAL, NXVAL PARAMETER (NBMIN = 1, NBVAL = 2, NXVAL = 1) ! ! Locals ! CHARACTER*67 ERRMSG CHARACTER JOBZ, RANG, UPLO INTEGER IL, INFO, IU, J INTEGER, allocatable :: IFAIL(:), IWORK(:) INTEGER LWORK, LIWORK, LRWORK, LWORKQ, LIWORKQ, LRWORKQ INTEGER NEV INTEGER, allocatable :: ICLUSTR(:) DOUBLE PRECISION, allocatable :: GAP(:) DOUBLE PRECISION SCAL INTEGER NN, NP0, MQ0, NQ0, CLUSTERSIZE INTEGER IERR DOUBLE PRECISION ABSTOL, ORFAC double precision work24 !, test, RUH integer itest, rtest !test RUH integer, allocatable :: ISUPPZ(:) !test RUH !_REAL DOUBLE PRECISION, allocatable :: WORK(:) !_COMPLEX DOUBLE COMPLEX, allocatable :: WORK(:) !_COMPLEX DOUBLE PRECISION, allocatable :: RWORK(:) DOUBLE PRECISION CP(5) logical out,out1 data out/.false./ data out1/.false./ #ifdef ELPA double precision transformation_begin, transformation_end !_REAL double precision, allocatable :: sendbuf(:),receivebuf(:),h_temp(:,:) !_COMPLEX double complex, allocatable :: sendbuf(:),receivebuf(:),h_temp(:,:) integer number_of_eigenvalues,mpi_comm_rows,mpi_comm_cols,mpierr,dimmatrix,shapeh(2) integer i_global,j_global,i_local,j_local,trans_i_local,trans_j_local integer row_coordinate,column_coordinate,trans_row_coordinate,trans_column_coordinate integer row_step,column_step,trans_row_step,trans_column_step,element_counter integer column_block_coordinate,row_block_coordinate,trans_row_block_coordinate,trans_column_block_coordinate integer sourceid,targetid,mirrortag,status(MPI_STATUS_SIZE) #ifdef ELPA15 
!!! ELPA versions before 2017 logical success #else !!! ELPA versions elpa2017 or newer integer success class(elpa_t), pointer :: elpa_pt #endif #endif ! ! External Subroutines ! !_REAL EXTERNAL OUTERR, DPOTRF, DSCGST, DSYEVX2 !_COMPLEX EXTERNAL OUTERR, ZPOTRF, ZHCGST, ZHEEVX2 #ifdef Parallel INTEGER NUMROC, ICEIL EXTERNAL NUMROC, ICEIL real*8 PDLAMCH EXTERNAL PDLAMCH #endif ! ! Intrinsic Functions ! INTRINSIC MIN ! ! read(5,'(L1)',err=111,end=111) out1 if(out1) out=out1 if(out1) print*, 'out set' 111 continue ! out=.true. ! out=.false. CALL BARRIER #ifdef Parallel allocate( Z(LDHS, LCOLZ)) Z=0.0D0 NN = MAX(HSROWS, BLOCKSIZE, 2) NP0 = NUMROC(NN,BLOCKSIZE, 0, 0, NPROW) ! MQ0 = NUMROC(MAX(NUME, BLOCKSIZE, 2), BLOCKSIZE, 0, 0, NPCOL) MQ0 = NUMROC(MAX(LCOLZ, BLOCKSIZE, 2), BLOCKSIZE, 0, 0, NPCOL) NQ0 = NUMROC( NN, BLOCKSIZE, 0, 0, NPCOL ) CLUSTERSIZE = 600 CLUSTERSIZE=max(64,(NVAA+NLO)/(NPROW*NPCOL)/4) ! LRWORK from Documentation ! LRWORK = 4*HSROWS + MAX( 5*NN, NP0*MQ0) + & ! ICEIL(lcolz,NPROW*NPCOL)*NN + (CLUSTERSIZE-1) * HSROWS ! LRWORK from PZHEEVX !_COMPLEX LRWORK = 4*HSROWS + & !_COMPLEX MAX( ICEIL( lcolz, NPROW*NPCOL )*NN + MAX( 5*NN, NP0*MQ0 ), & !_COMPLEX MAX( ( BLOCKSIZE*( BLOCKSIZE-1 ) ) / 2, & !_COMPLEX ( MQ0+NP0 )*BLOCKSIZE ) + BLOCKSIZE*BLOCKSIZE) + & !_COMPLEX (CLUSTERSIZE-1) * HSROWS !_COMPLEX LWORK = HSROWS + MAX(BLOCKSIZE*(NP0+1),3) + CLUSTERSIZE * HSROWS !_COMPLEX LWORK = HSROWS + (NP0 + NQ0 + BLOCKSIZE) * BLOCKSIZE + & !_COMPLEX CLUSTERSIZE * HSROWS !_REAL LWORK = 5*HSROWS + MAX(5*NN, NP0*MQ0 + 2*BLOCKSIZE*BLOCKSIZE) + & !_REAL ICEIL(lcolz, NPROW*NPCOL)*NN + CLUSTERSIZE * HSROWS LIWORK = 6*MAX(HSROWS, NPROW*NPCOL+1,4) allocate( ICLUSTR(2*NPROW*NPCOL),GAP(NPROW*NPCOL) ) ! *********** ADDED *********** ! 
Find the correct sizes LWORKQ=-1 LIWORKQ=-1 LRWORKQ=-1 !_REAL allocate( IFAIL(HSROWS), IWORK(LIWORK), WORK(LWORK), STAT=IERR ) !_COMPLEX allocate( IFAIL(HSROWS), IWORK(LIWORK), WORK(LWORK), RWORK(LRWORK),STAT=IERR ) if(IERR.ne.0) THEN write(6,*) 'not enough memory: ', IERR call Abort_Parallel stop 'not enough memory' end if JOBZ='V' RANG='V' UPLO='U' IF (EIGVAL(1) .GE. EIGVAL(2)) THEN EIGVAL(1) = -1.0D+70 EIGVAL(2) = 1.0D+70 ENDIF EL1 = EIGVAL(1) EU1 = EIGVAL(2) IL = 0 IU = 0 ABSTOL = 0.0D+0 abstol=2.d0*PDLAMCH(ICTXTALL,'S') ! ORFAC = 1.0D-3 !abstol and orfac checked by PB 8.5.2014: Au-atom problem, !pdlamch and orfac=10-3 gives correct eigval + fastest time !orfac 10-2 gives false eigenvectors, ! if problems occur, switch to pdsyevx16 ORFAC = 1.0D-3 !!_REAL CALL PDSYEVX17(JOBZ,RANG,UPLO,NVAA+NLO,H,1,1, & #ifdef old_scalapack !_REAL CALL PDSYEVX(JOBZ,RANG,UPLO,NVAA+NLO,H,1,1, & !_REAL DESCHS,EL1,EU1,IL,IU, & !_REAL ABSTOL,NE,NEV,EIGVAL,ORFAC,Z,1,1,DESCZ, & !_REAL WORK,LWORKQ,IWORK,LIWORKQ,IFAIL,ICLUSTR,GAP,INFO) !_COMPLEX CALL PZHEEVX(JOBZ,RANG,UPLO,NVAA+NLO,H,1,1, & !_COMPLEX DESCHS,EL1,EU1,IL,IU, & !_COMPLEX ABSTOL,NE,NEV,EIGVAL,ORFAC,Z,1,1,DESCZ, & !_COMPLEX WORK,LWORKQ,RWORK,LRWORKQ,IWORK,LIWORKQ,IFAIL, & !_COMPLEX ICLUSTR,GAP,INFO) #else !!_REAL CALL PDSYEVX17(JOBZ,RANG,UPLO,NVAA+NLO,H,1,1, & !!_REAL DESCHS,EL1,EU1,IL,IU, & !!_REAL ABSTOL,NE,NEV,EIGVAL,ORFAC,Z,1,1,DESCZ, & !!_REAL WORK,LWORKQ,IWORK,LIWORKQ,IFAIL,ICLUSTR,GAP,INFO) !_REAL CALL PDSYEVR(JOBZ,RANG,UPLO,NVAA+NLO,H,1,1, & !_REAL DESCHS,EL1,EU1,IL,IU, & !_REAL NE,NEV,EIGVAL,Z,1,1,DESCZ, & !_REAL WORK,LWORKQ,IWORK,LIWORKQ,INFO) !_COMPLEX CALL PZHEEVX16(JOBZ,RANG,UPLO,NVAA+NLO,H,1,1, & !_COMPLEX DESCHS,EL1,EU1,IL,IU, & !_COMPLEX ABSTOL,NE,NEV,EIGVAL,ORFAC,Z,1,1,DESCZ, & !_COMPLEX WORK,LWORKQ,RWORK,LRWORKQ,IWORK,LIWORKQ,IFAIL, & !_COMPLEX ICLUSTR,GAP,INFO) #endif ! One can probably get away with these, but... 
!!_REAL LWORK=max(nint(WORK(1))/min(NPROW,NPCOL),LWORK) !!_REAL LIWORK=IWORK(1) !!_COMPLEX LRWORK=max(nint(RWORK(1))/min(NPROW,NPCOL),LRWORK) ! This may be a little faster, but the effect is really small ! Use dble here to handle both real & complex cases #ifdef old_scalapack LWORK=nint(dble(WORK(1)))+(clustersize-1)*HSROWS #else !_REAL LWORK=nint(dble(WORK(1))) !_COMPLEX LWORK=nint(dble(WORK(1)))+(clustersize-1)*HSROWS #endif LIWORK=IWORK(1) !_COMPLEX LRWORK=nint(RWORK(1))+(clustersize-1)*HSROWS deallocate (IWORK, WORK, IFAIL, STAT=ierr) !_COMPLEX deallocate (RWORK, STAT=ierr) 601 format('Scalapack Workspace size ',F8.2,a,F8.2,a) !_REAL write(6,601)(LWORK*8.)/dble(1024*1024),' Mb' !_COMPLEX write(6,601)(LWORK*8.)/dble(1024*1024), ' and ',(LRWORK*8.)/dble(1024*1024),' Mb' ! *********** END ADDED ******* #else BLOCK2 = 1 !_REAL BLOCK1 = BLOCKSIZE !_REAL BLOCK2 = 32 !_REAL IF(HSROWS.LT.3500) BLOCK2=16 !_REAL! IF(HSROWS.LT.1500) BLOCK2=6 !_REAL! IF(HSROWS.LT.1200) BLOCK2=4 !_REAL IF(HSROWS.LT.2000) BLOCK2=1 !_REAL INQUIRE(FILE="OLD_DIAG", EXIST=out1) !_REAL if(out1) BLOCK2 = 1 !_REAL! BLOCK2 = 1 if(BLOCK2.eq.1) then LWORK = HSROWS*(BLOCKSIZE+3) else LWORK = HSROWS*(4*max(BLOCKSIZE,BLOCK2)+8+2*BLOCK2) endif LIWORK = 5*HSROWS LRWORK = 7*HSROWS #endif !_COMPLEX allocate( RWORK(LRWORK) ) allocate( IFAIL(HSROWS), IWORK(LIWORK), WORK(LWORK), STAT=IERR ) if(IERR.ne.0) THEN write(6,*) 'not enough memory: ', IERR call Abort_Parallel stop 'not enough memory' end if IF (EIGVAL(1) .GE. EIGVAL(2)) THEN EIGVAL(1) = -1.0D+70 EIGVAL(2) = 1.0D+70 ENDIF EL1 = EIGVAL(1) EU1 = EIGVAL(2) ! ! conjugate matrix ! #ifndef Parallel !_COMPLEX DO J = 1, NVAA+NLO !_COMPLEX DO I = 1, NVAA+NLO !_COMPLEX HS(I,J) = DCONJG(HS(I,J)) !_COMPLEX END DO !_COMPLEX HSDIAG(J) = DCONJG(HSDIAG(J)) !_COMPLEX END DO #endif ! ! form a Cholesky factorization of SP (packed overlap-matrix S). ! ! ! swap Diagonal of Overlap matrix into matrix ! 
!print #ifndef Parallel if(out) then write(87,*) HSrows,nvaa,nlo write(88,1234) HS write(87,1234) HSdiag 1234 format(4e23.15) endif #endif #ifndef Parallel !_REAL CALL DSWAP(HSROWS,HS,HSROWS+1,HSDIAG,1) !_COMPLEX CALL ZSWAP(HSROWS,HS,HSROWS+1,HSDIAG,1) #endif ! ! write (6,*)'matrix size',NVAA+NLO CALL START_TIMER(time_cholesky) #ifdef Parallel if(deschs(2).gt.0) then !_REAL CALL PDPOTRF('U',NVAA+NLO,S,1,1,DESCHS,INFO) !_COMPLEX CALL PZPOTRF('U',NVAA+NLO,S,1,1,DESCHS,INFO) end if #else !_REAL CALL DPOTRF('U',NVAA+NLO,HS,HSROWS,INFO) !_COMPLEX CALL ZPOTRF('U',NVAA+NLO,HS,HSROWS,INFO) #endif CALL STOP_TIMER(time_cholesky) IF(INFO .NE. 0) THEN write(6,*) 'Cholesky INFO = ', INFO write(stderr,*) 'Cholesky INFO = ', INFO CALL OUTERR('SECLR4','POTRF (Scalapack/LAPACK) failed.') Call Abort_Parallel STOP 'SECLR4 - Error' ENDIF ! ! swap Diagonal of Hamilton matrix into matrix ! #ifndef Parallel !_REAL CALL DSWAP(HSROWS,HS,HSROWS+1,HSDIAG,1) !_COMPLEX CALL ZSWAP(HSROWS,HS,HSROWS+1,HSDIAG,1) #endif CALL BARRIER ! ! transform the problem to a standard eigenvalue problem. ! CALL START_TIMER(time_transform) #ifdef Parallel if(deschs(2).gt.0) then !_REAL CALL PDSYGST(1,'U',NVAA+NLO,H,1,1,DESCHS,S,1,1,DESCHS,SCAL,INFO) !_COMPLEX CALL PZHEGST(1,'U',NVAA+NLO,H,1,1,DESCHS,S,1,1,DESCHS,SCAL,INFO) end if #else !_REAL CALL DSCGST(NVAA+NLO,HS,HSROWS,HSDIAG,WORK,INFO) !_COMPLEX CALL ZHCGST(NVAA+NLO,HS,HSROWS,HSDIAG,WORK,INFO) #endif CALL STOP_TIMER(time_transform) IF(INFO .NE. 
0) THEN write(6,*) 'GST INFO = ', INFO write(stderr,*) 'GST INFO = ', INFO CALL OUTERR('SECLR4','SCGST (Scalapack/LAPACK) failed.') Call Abort_Parallel STOP 'SECLR4 - Error' ENDIF CALL BARRIER #ifdef Parallel #ifdef ELPA if(elpa_switch) then if(myid.eq.0) call cpu_time(transformation_begin) call START_TIMER(time_matrixcomplete) dimmatrix = nvaa + nlo shapeh=shape(h) if(.not.elpa_mem_default) then allocate(sendbuf(blocksize*blocksize)) allocate(receivebuf(blocksize*blocksize)) do j_global=1,dimmatrix,blocksize column_coordinate=mod((j_global-1)/blocksize,npcol) trans_row_coordinate=mod((j_global-1)/blocksize,nprow) do i_global=1,j_global,blocksize row_coordinate=mod((i_global-1)/blocksize,nprow) trans_column_coordinate=mod((i_global-1)/blocksize,npcol) sourceid=row_coordinate*npcol+column_coordinate targetid=trans_row_coordinate*npcol+trans_column_coordinate if((myid.eq.sourceid).or.(myid.eq.targetid)) then row_block_coordinate=(i_global-1)/(nprow*blocksize) column_block_coordinate=(j_global-1)/(npcol*blocksize) i_local=mod(i_global-1,blocksize)+1+row_block_coordinate*blocksize j_local=mod(j_global-1,blocksize)+1+column_block_coordinate*blocksize trans_column_block_coordinate=(i_global-1)/(npcol*blocksize) trans_row_block_coordinate=(j_global-1)/(nprow*blocksize) trans_i_local=mod(i_global-1,blocksize)+1+trans_row_block_coordinate*blocksize trans_j_local=mod(j_global-1,blocksize)+1+trans_column_block_coordinate*blocksize if(sourceid.eq.targetid) then do trans_column_step=0,min(blocksize-1,shapeh(2)-blocksize*trans_column_block_coordinate-1) row_step=trans_column_step if(row_step.gt.shapeh(1)-blocksize*row_block_coordinate-1) cycle do trans_row_step=0,min(blocksize-1,shapeh(1)- blocksize*trans_row_block_coordinate-1) column_step=trans_row_step if(column_step.gt.shapeh(2)-blocksize*column_block_coordinate-1) cycle if(i_global+row_step.eq.j_global+column_step) cycle !_REAL H(trans_i_local+trans_row_step,trans_j_local+trans_column_step)= & !_REAL 
H(i_local+row_step,j_local+column_step) !_COMPLEX H(trans_i_local+trans_row_step,trans_j_local+trans_column_step)= & !_COMPLEX DCONJG(H(i_local+row_step,j_local+column_step)) enddo enddo else if(myid.eq.sourceid) then mirrortag=j_global element_counter=1 do column_step=0,min(blocksize-1,shapeh(2)-blocksize*column_block_coordinate-1) do row_step=0,min(blocksize-1,shapeh(1)-blocksize*row_block_coordinate-1) !_REAL sendbuf(element_counter)=H(i_local+row_step,j_local+column_step) !_COMPLEX sendbuf(element_counter)=DCONJG(H(i_local+row_step,j_local+column_step)) element_counter=element_counter+1 enddo enddo !_REAL call MPI_SEND(sendbuf,blocksize*blocksize,MPI_DOUBLE_PRECISION,targetid, & !_REAL mirrortag,MPI_COMM_WORLD,mpierr) !_COMPLEX call MPI_SEND(sendbuf,blocksize*blocksize,MPI_DOUBLE_COMPLEX,targetid, & !_COMPLEX mirrortag,MPI_COMM_WORLD,mpierr) elseif(myid.eq.targetid) then mirrortag=j_global !_REAL call MPI_RECV(receivebuf,blocksize*blocksize,MPI_DOUBLE_PRECISION,sourceid, & !_REAL mirrortag,MPI_COMM_WORLD,status,mpierr) !_COMPLEX call MPI_RECV(receivebuf,blocksize*blocksize,MPI_DOUBLE_COMPLEX,sourceid, & !_COMPLEX mirrortag,MPI_COMM_WORLD,status,mpierr) element_counter=1 do row_step=0,min(blocksize-1,shapeh(1)-blocksize*trans_row_block_coordinate-1) do column_step=0,min(blocksize-1,shapeh(2)-blocksize*trans_column_block_coordinate-1) H(trans_i_local+row_step,trans_j_local+column_step)=receivebuf(element_counter) element_counter=element_counter+1 enddo enddo endif endif endif enddo enddo else allocate(h_temp(shapeh(1),shapeh(2))) h_temp=0.d0 do i_global=1,dimmatrix row_coordinate=mod((i_global-1)/blocksize,nprow) do j_global=1,i_global if(abs(i_global-j_global).gt.(blocksize-1)) cycle column_coordinate=mod((j_global-1)/blocksize,npcol) sourceid=row_coordinate*npcol+column_coordinate if(sourceid.eq.myid) then row_block_coordinate=(i_global-1)/(nprow*blocksize) column_block_coordinate=(j_global-1)/(npcol*blocksize) 
i_local=mod(i_global-1,blocksize)+1+row_block_coordinate*blocksize j_local=mod(j_global-1,blocksize)+1+column_block_coordinate*blocksize if(i_global.gt.j_global) then H(i_local,j_local)=0.d0 elseif(i_global.eq.j_global) then H(i_local,j_local)=H(i_local,j_local)/2.d0 endif endif enddo enddo !_REAL call pdtran(dimmatrix,dimmatrix,1.d0,H,1,1,deschs,0.d0,h_temp,1,1,deschs) !_COMPLEX call pztranc(dimmatrix,dimmatrix,(1.d0,0.d0),H,1,1,deschs,(0.d0,0.d0),h_temp,1,1,deschs) !_REAL call pdgeadd('n',dimmatrix,dimmatrix,1.d0,h_temp,1,1,deschs,1.d0,H,1,1,deschs) !_COMPLEX call pzgeadd('n',dimmatrix,dimmatrix,1.d0,h_temp,1,1,deschs,1.d0,H,1,1,deschs) deallocate(h_temp) endif call STOP_TIMER(time_matrixcomplete) ! if(myid.eq.0) call cpu_time(transformation_end) !!!DEBUG ! if(myid.eq.0) print'(a,f7.3)','Time for creation of complete matrix: ',transformation_end-transformation_begin !!!DEBUG endif #endif #endif ! compute the eigenvalues within energy-range EL1, EU1 ! CALL START_TIMER(time_sep) IL = 0 IU = 0 ABSTOL = 0.0D+0 ! ORFAC = 1.0D-3 !abstol and orfac changed back PB 8.5.2014: Au-atom +lapwso_mpi problem ORFAC = 1.0D-3 #ifdef Parallel #ifdef ELPA if(elpa_switch) then number_of_eigenvalues=NEMAX_IT NE=number_of_eigenvalues #ifdef ELPA15 !!! ELPA versions before 2017 mpierr=get_elpa_row_col_comms(mpi_comm_world, myrowhs, mycolhs, & mpi_comm_rows, mpi_comm_cols) !_REAL success = solve_evp_real_2stage(dimmatrix, number_of_eigenvalues, H, ldhs, eigval, z, ldhs, & !_REAL blocksize, lcolz, mpi_comm_rows, mpi_comm_cols, mpi_comm_world) !_COMPLEX success = solve_evp_complex_2stage(dimmatrix, number_of_eigenvalues, H, ldhs, eigval, z, ldhs, & !_COMPLEX blocksize, lcolz, mpi_comm_rows, mpi_comm_cols, mpi_comm_world) #else !!! ELPA versions elpa2017 or newer if (elpa_init(20180501) /= elpa_ok) then print*, "ELPA API version not supported" stop endif elpa_pt => elpa_allocate () ! 
set parameters call elpa_pt%set("na", dimmatrix, success) call elpa_pt%set("nev", number_of_eigenvalues, success) call elpa_pt%set("local_nrows", ldhs, success) call elpa_pt%set("local_ncols", lcolhs, success) call elpa_pt%set("nblk", blocksize, success) call elpa_pt%set("mpi_comm_parent", mpi_comm_world, success) call elpa_pt%set("process_row", myrowhs, success) call elpa_pt%set("process_col", mycolhs, success) success = elpa_pt%setup() call elpa_pt%set("solver", elpa_solver_2stage, success) ! eigenvectors call elpa_pt%eigenvectors(H,eigval,z,success) ! cleanup call elpa_deallocate(elpa_pt) call elpa_uninit() #endif endif if(scala_switch) then #endif abstol=2.d0*PDLAMCH(ICTXTALL,'S') JOBZ='V' RANG='V' UPLO='U' if(deschs(2).gt.0) then EL1=-1.0D+70 !!! RANG='V' !!! if(EU.ge.9.d9) RANG='A' !!! #ifdef old_scalapack !_REAL CALL PDSYEVX(JOBZ,RANG,UPLO,NVAA+NLO,H,1,1, & !_REAL DESCHS,EL1,EU1,IL,IU, & !_REAL ABSTOL,NE,NEV,EIGVAL,ORFAC,Z,1,1,DESCZ, & !_REAL WORK,LWORK,IWORK,LIWORK,IFAIL,ICLUSTR,GAP,INFO) !_COMPLEX CALL PZHEEVX(JOBZ,RANG,UPLO,NVAA+NLO,H,1,1, & !_COMPLEX DESCHS,EL1,EU1,IL,IU, & !_COMPLEX ABSTOL,NE,NEV,EIGVAL,ORFAC,Z,1,1,DESCZ, & !_COMPLEX WORK,LWORK,RWORK,LRWORK,IWORK,LIWORK,IFAIL, & !_COMPLEX ICLUSTR,GAP,INFO) #else !!_REAL CALL PDSYEVX17(JOBZ,RANG,UPLO,NVAA+NLO,H,1,1, & !!_REAL DESCHS,EL1,EU1,IL,IU, & !!_REAL ABSTOL,NE,NEV,EIGVAL,ORFAC,Z,1,1,DESCZ, & !!_REAL WORK,LWORK,IWORK,LIWORK,IFAIL,ICLUSTR,GAP,INFO) !_REAL CALL PDSYEVR(JOBZ,RANG,UPLO,NVAA+NLO,H,1,1, & !_REAL DESCHS,EL1,EU1,IL,IU, & !_REAL NE,NEV,EIGVAL,Z,1,1,DESCZ, & !_REAL WORK,LWORK,IWORK,LIWORK,INFO) !_COMPLEX CALL PZHEEVX16(JOBZ,RANG,UPLO,NVAA+NLO,H,1,1, & !_COMPLEX DESCHS,EL1,EU1,IL,IU, & !_COMPLEX ABSTOL,NE,NEV,EIGVAL,ORFAC,Z,1,1,DESCZ, & !_COMPLEX WORK,LWORK,RWORK,LRWORK,IWORK,LIWORK,IFAIL, & !_COMPLEX ICLUSTR,GAP,INFO) #endif #ifdef ELPA end if #endif endif IF (INFO .NE. 
0) THEN write(6,*) 'SEP INFO = ', INFO if(info.eq.2) then 99 format(':WARN : WARNING: Not all eigenvectors are orthogonal') ! WRITE(21,99) WRITE(6,99) ! write(6,*) 'Numbers of eigenvectors that failed to converge =', IFAIL write(6,*) 'Eigenvalue clusters in eigensolver =', ICLUSTR write(6,*) 'Gap in eigensolver =', Gap write(6,*) ' Increase CLUSTERSIZE in SECLR4' else write(stderr,*) 'SEP INFO = ', INFO CALL OUTERR('SECLR4','SYEVX (Scalapack/LAPACK) failed.') IF(INFO.EQ.4) THEN write(6,*) 'NEV, NE = ', NEV, NE write(6,*) 'ICLUSTR = ', ICLUSTR write(6,*) 'WORK(1) = ', WORK(1:2), LWORK !_COMPLEX write(6,*) 'RWORK(1) = ', RWORK(1:2), LRWORK write(6,*) 'IWORK(1) = ', IWORK(1:2), LIWORK END IF Call Abort_Parallel STOP 'SECLR4 - Error' end if ENDIF #else if(EU.ge.9d9) then !test RUH if(divide) then !#################################################### !############### divide and conquer ################# !#################################################### NE=HSROWS !_REAL CALL DSYEVD('V','L',NVAA+NLO,HS,HSROWS, & !_REAL EIGVAL,WORK,-1,IWORK,LIWORK, & !_REAL INFO) !_COMPLEX CALL ZHEEVD('V','L',NVAA+NLO,HS,HSROWS,& !_COMPLEX EIGVAL,WORK,-1,RWORK,-1,IWORK,-1, & !_COMPLEX INFO) LWORK = WORK(1) deallocate(WORK) allocate(WORK(LWORK)) !_REAL CALL DSYEVD('V','L',NVAA+NLO,HS,HSROWS, & !_REAL EIGVAL,WORK,LWORK,IWORK,-1, & !_REAL INFO) !_REAL LIWORK = WORK(1) !_REAL deallocate(IWORK) !_REAL allocate(IWORK(LIWORK)) !_COMPLEX CALL ZHEEVD('V','L',NVAA+NLO,HS,HSROWS,& !_COMPLEX EIGVAL,WORK,-1,RWORK,-1,IWORK,-1, & !_COMPLEX INFO) !_COMPLEX LRWORK=RWORK(1) !_COMPLEX LIWORK=IWORK(1) !_COMPLEX deallocate(RWORK) !_COMPLEX allocate(RWORK(LRWORK)) !_COMPLEX deallocate(IWORK) !_COMPLEX allocate(IWORK(LIWORK)) !_REAL CALL DSYEVD('V','L',NVAA+NLO,HS,HSROWS, & !_REAL EIGVAL,WORK,LWORK,IWORK,LIWORK, & !_REAL INFO) !_COMPLEX CALL ZHEEVD('V','L',NVAA+NLO,HS,HSROWS,& !_COMPLEX EIGVAL,WORK,LWORK,RWORK,LRWORK,IWORK,LIWORK, & !_COMPLEX INFO) else !#################################################### 
!######### relatively robust representation ######### !#################################################### allocate(ISUPPZ(2*HSROWS)) !_REAL CALL DSYEVR('V','A','L',NVAA+NLO,HS,HSROWS,EL1,EU1,IL,IU, & !_REAL ABSTOL,NE,EIGVAL,Z,HSROWS,ISUPPZ,WORK,-1,IWORK,LIWORK,IFAIL, & !_REAL INFO) !_REAL LWORK = WORK(1) !_REAL deallocate(WORK) !_REAL allocate(WORK(LWORK)) !_REAL CALL DSYEVR('V','A','L',NVAA+NLO,HS,HSROWS,EL1,EU1,IL,IU, & !_REAL ABSTOL,NE,EIGVAL,Z,HSROWS,ISUPPZ,WORK,LWORK,IWORK,-1,IFAIL, & !_REAL INFO) !_REAL LIWORK = WORK(1) !_REAL deallocate(IWORK) !_REAL allocate(IWORK(LIWORK)) !_REAL CALL DSYEVR('V','A','L',NVAA+NLO,HS,HSROWS,EL1,EU1,IL,IU, & !_REAL ABSTOL,NE,EIGVAL,Z,HSROWS,ISUPPZ,WORK,LWORK,IWORK,LIWORK,IFAIL, & !_REAL INFO) !_COMPLEX CALL ZHEEVR('V','A','L',NVAA+NLO,HS,HSROWS,EL1,EU1,IL,IU,ABSTOL, & !_COMPLEX NE,EIGVAL,Z,HSROWS,ISUPPZ,WORK,-1,RWORK,-1, & !_COMPLEX IWORK,-1,INFO) !_COMPLEX LWORK=WORK(1) !_COMPLEX LRWORK=RWORK(1) !_COMPLEX LIWORK=IWORK(1) !_COMPLEX deallocate(WORK) !_COMPLEX allocate(WORK(LWORK)) !_COMPLEX deallocate(RWORK) !_COMPLEX allocate(RWORK(LRWORK)) !_COMPLEX deallocate(IWORK) !_COMPLEX allocate(IWORK(LIWORK)) !_COMPLEX CALL ZHEEVR('V','A','L',NVAA+NLO,HS,HSROWS,EL1,EU1,IL,IU,ABSTOL, & !_COMPLEX NE,EIGVAL,Z,HSROWS,ISUPPZ,WORK,LWORK,RWORK,LRWORK, & !_COMPLEX IWORK,LIWORK,INFO) deallocate(ISUPPZ) endif else !#################################################### !########### standard algorithm (few EVs) ########### !#################################################### !_REAL CALL DSYXEV4('V','V','L',NVAA+NLO,HS,HSROWS,EL1,EU1,IL,IU, & !_REAL ABSTOL,NE,EIGVAL,Z,HSROWS,WORK,LWORK,IWORK,IFAIL, & !_REAL BLOCK1,BLOCK2,0.d0,LCOLZ,NBELW,INFO) !_REAL! CALL DSYEVX2('V','V','L',NVAA+NLO,HS,HSROWS,EL1,EU1,IL,IU, & !_REAL! ABSTOL,NE,EIGVAL,Z,HSROWS,WORK,LWORK,IWORK,IFAIL, & !_REAL! 
BLOCK1,LCOLZ,NBELW,INFO) !_COMPLEX CALL ZHEEVX2('V','V','L',NVAA+NLO,HS,HSROWS,EL1,EU1,IL,IU,ABSTOL, & !_COMPLEX NE,EIGVAL,Z,HSROWS,WORK,LWORK,RWORK,IWORK,IFAIL, & !_COMPLEX BLOCKSIZE, LCOLZ, NBELW, INFO) ! compare: Complex routines need additional parameter RWORK (after WORK) endif IF (INFO .NE. 0) THEN write(6,*) 'SEP INFO = ', INFO write(stderr,*) 'SEP INFO = ', INFO CALL OUTERR('SECLR4','SYEVX (Scalapack/LAPACK) failed.') STOP 'SECLR4 - Error' ENDIF write(6,*) NE, ' Eigenvalues computed ' IF (NE .GT. LCOLZ) THEN CALL OUTERR('SECLR4','More than NUME eigenvalues.') WRITE (ERRMSG,9000) LCOLZ,NE 9000 FORMAT ('Increase NUME from',i4,' to',i4,' OR') CALL OUTERR('SECLR4',ERRMSG) CALL OUTERR('SECLR4','reduce E2 in *.klist.(see *.output1)') WRITE(6,9001) (eigval(j),j=1,ne) 9001 FORMAT (5f15.6) STOP 'SECLR4 - Error' ENDIF #endif CALL STOP_TIMER(time_sep) CALL BARRIER ! ! swap Diagonal of Overlap matrix into matrix ! #ifndef Parallel !_REAL CALL DSWAP(HSROWS,HS,HSROWS+1,HSDIAG,1) !_COMPLEX CALL ZSWAP(HSROWS,HS,HSROWS+1,HSDIAG,1) ! ! ! conjugate overlap matrix ! !_COMPLEX DO J = 1, NVAA+NLO !_COMPLEX DO I = 1, J !_COMPLEX HS(I,J) = DCONJG(HS(I,J)) !_COMPLEX END DO !_COMPLEX END DO #endif ! ! backtransform eigenvectors to the original problem: ! for H*x=(lambda)*S*x; ! backtransform eigenvectors: x = inv(L)*y or inv(U)*y ! CALL START_TIMER(time_backtransform) #ifdef Parallel if(deschs(2).gt.0) then !_REAL CALL PDTRSM('L', 'U', 'N', 'N', NVAA+NLO, NE, 1.0D0, & !_REAL S, 1,1,DESCHS, Z, 1,1,DESCZ) !_COMPLEX CALL PZTRSM('L', 'U', 'N', 'N', NVAA+NLO, NE, (1.0D0,0.0D0), & !_COMPLEX S, 1,1,DESCHS, Z, 1,1,DESCZ) end if #else !_REAL CALL DTRSM('L', 'U', 'N', 'N', NVAA+NLO, NE, 1.0D0, & !_REAL HS, HSROWS, Z, HSROWS) !_COMPLEX CALL ZTRSM('L', 'U', 'N', 'N', NVAA+NLO, NE, (1.0D0,0.0D0), & !_COMPLEX HS, HSROWS, Z, HSROWS) #endif CALL STOP_TIMER(time_backtransform) CALL BARRIER #ifdef Parallel NBELW_PARA=0 !!! do i=1,NE !!! if(EIGVAL(i).lt. EL) then !!! NBELW_PARA=NBELW_PARA+1 !!! 
else !!! exit !!! endif !!! enddo !!! #endif #ifdef Parallel deallocate( ICLUSTR,GAP ) #endif deallocate( IFAIL, IWORK ) deallocate( WORK ) !_COMPLEX deallocate( RWORK ) CALL BARRIER !print #ifndef Parallel if(out) then do i=1,ne write(86,*) eigval(i) write(89,1234) (Z(j,i),j=1,hsrows) enddo endif #endif ! Flop = DBLE(NVAA+NLO)*(NVAA+NLO)*(NVAA+NLO)/3.0D0 !_COMPLEX Flop = Flop*4.0D0 if(READ_CPU_TIME(time_cholesky).eq.0.d0) then WRITE (6,1001) 'Cholesky complete (CPU)' , & READ_CPU_TIME(time_cholesky) else WRITE (6,1001) 'Cholesky complete (CPU)' , & READ_CPU_TIME(time_cholesky) , & 1E-6* Flop/READ_CPU_TIME(time_cholesky) endif #ifdef Parallel if(READ_WALL_TIME(time_cholesky).eq.0.d0) then WRITE (6,1001) 'Cholesky complete (WALL)' , & READ_WALL_TIME(time_cholesky) else WRITE (6,1001) 'Cholesky complete (WALL)' , & READ_WALL_TIME(time_cholesky) , & 1E-6* Flop/READ_WALL_TIME(time_cholesky) endif #endif Flop = 3.0D0*DBLE(NVAA+NLO)*(NVAA+NLO)*(NVAA+NLO)/3.0D0 !_COMPLEX Flop = Flop*4.0D0 if(READ_CPU_TIME(time_transform).eq.0.d0) then WRITE (6,1001) 'Transform to eig.problem (CPU)' , & READ_CPU_TIME(time_transform) else WRITE (6,1001) 'Transform to eig.problem (CPU)' , & READ_CPU_TIME(time_transform) , & 1E-6* Flop/READ_CPU_TIME(time_transform) endif #ifdef Parallel if(READ_WALL_TIME(time_transform).eq.0.d0) then WRITE (6,1001) 'Transform to eig.problem (WALL)' , & READ_WALL_TIME(time_transform) else WRITE (6,1001) 'Transform to eig.problem (WALL)' , & READ_WALL_TIME(time_transform) , & 1E-6* Flop/READ_WALL_TIME(time_transform) endif #endif Flop = 4.0D0*DBLE(NVAA+NLO)*(NVAA+NLO)*(NVAA+NLO)/3.0D0 !_COMPLEX Flop = Flop*4.0D0 #if defined(ELPA) && defined(Parallel) if(elpa_switch) then WRITE (6,1001) 'Complete Matrix H (CPU)' , & READ_CPU_TIME(time_matrixcomplete) WRITE (6,1001) 'Complete Matrix H (WALL)' , & READ_WALL_TIME(time_matrixcomplete) endif #endif #ifdef ELPA if(elpa_switch) then if(READ_CPU_TIME(time_sep).eq.0.d0) then WRITE (6,1001) 'Compute EVs using ELPA (CPU)' 
, & READ_CPU_TIME(time_sep) else WRITE (6,1001) 'Compute EVs using ELPA (CPU)' , & READ_CPU_TIME(time_sep), & 1E-6* Flop/READ_CPU_TIME(time_sep) endif else #else if(READ_CPU_TIME(time_sep).eq.0.d0) then WRITE (6,1001) 'Compute eigenvalues (CPU)' , & READ_CPU_TIME(time_sep) else WRITE (6,1001) 'Compute eigenvalues (CPU)' , & READ_CPU_TIME(time_sep), & 1E-6* Flop/READ_CPU_TIME(time_sep) endif #endif #ifdef ELPA endif #endif #ifdef Parallel #ifdef ELPA if (elpa_switch) then if(READ_WALL_TIME(time_sep).eq.0.d0) then WRITE (6,1001) 'Compute EVs using ELPA (WALL)' , & READ_WALL_TIME(time_sep) else WRITE (6,1001) 'Compute EVs using ELPA (WALL)' , & READ_WALL_TIME(time_sep) , & 1E-6* Flop/READ_WALL_TIME(time_sep) endif else #endif if(READ_WALL_TIME(time_sep).eq.0.d0) then WRITE (6,1001) 'Compute eigenvalues (WALL)' , & READ_WALL_TIME(time_sep) else WRITE (6,1001) 'Compute eigenvalues (WALL)' , & READ_WALL_TIME(time_sep) , & 1E-6* Flop/READ_WALL_TIME(time_sep) endif #ifdef ELPA endif #endif #endif Flop = DBLE(NVAA+NLO)*(NVAA+NLO)*NE/4.0D0 !_COMPLEX Flop = Flop*4.0D0 if(READ_CPU_TIME(time_backtransform).eq.0.d0) then WRITE (6,1001) 'Backtransform (CPU)' , & READ_CPU_TIME(time_backtransform) else WRITE (6,1001) 'Backtransform (CPU)' , & READ_CPU_TIME(time_backtransform) , & 1E-6* Flop/READ_CPU_TIME(time_backtransform) endif #ifdef Parallel if(READ_WALL_TIME(time_backtransform).eq.0.d0) then WRITE (6,1001) 'Backtransform (WALL)' , & READ_WALL_TIME(time_backtransform) else WRITE (6,1001) 'Backtransform (WALL)' , & READ_WALL_TIME(time_backtransform) , & 1E-6* Flop/READ_WALL_TIME(time_backtransform) endif #endif 1001 FORMAT (1X,'Seclr4(',A,') :',t45,f10.3:t58,f10.2,' Mflops') ! RETURN ! 6000 FORMAT (/,/,/,10X,'ERROR IN SCLRE2, IER1=',I5,'#E FOUND=',I5, & ' IER2=',I5) ! ! End of 'SECLR4' ! END
_______________________________________________ Wien mailing list Wien@zeus.theochem.tuwien.ac.at http://zeus.theochem.tuwien.ac.at/mailman/listinfo/wien SEARCH the MAILING-LIST at: http://www.mail-archive.com/wien@zeus.theochem.tuwien.ac.at/index.html