Dear all!
I have attached a file (seclr4.F) to this message; it is a necessary replacement if
you want to use ELPA versions from 2017 onwards. Since ELPA changed its
interface, the WIEN2k implementation needed to be adapted as well.
Just copy the file to $WIENROOT/SRC_lapw1 and recompile lapw1 - you do not have
to change anything else.
The new file also still works with the old ELPA interface (for ELPA versions
before 2017): in that case you have to add the switch " -DELPA15 " to your ELPA
options (ELPA_OPT) in the Makefile in $WIENROOT/SRC_lapw1. This switch is an
ADDITION to the already present switch "-DELPA" - both are needed to use the
old interface with the new file.
Kind regards
Thomas Ruh
! This file uses LAPACK routines
!
SUBROUTINE SECLR4
!
! NOTE(review): this file is preprocessed twice before compilation:
!   1) by cpp  (#ifdef Parallel / ELPA / ELPA15 / old_scalapack), and
!   2) by the WIEN2k build scripts, which activate lines beginning with
!      "!_REAL" or "!_COMPLEX" depending on whether the real or the
!      complex version of lapw1 is built.
! Lines beginning with "!!_REAL"/"!!_COMPLEX" are disabled in both builds.
! Do NOT treat the sentinel lines as ordinary comments - they are code.
!
#ifdef Parallel
use matrices, only : H, S, HSROWS, Z, EIGVAL
#else
use matrices, only : HS, HSDIAG, HSROWS, Z, EIGVAL
#endif
use parallel, only : MYID, DESCHS, DESCZ, NPROW, NPCOL, BARRIER, &
                       BLOCKSIZE, ICTXTALL, LDHS, LCOLZ, abort_parallel, &
                       NPE, myrowhs, mycolhs,ictxt,ictxtall, NBELW_PARA, &
#ifdef ELPA
                       lcolhs, elpa_switch, scala_switch, elpa_mem_default
#else
                       elpa_switch, scala_switch
#endif
use lapw_timer, only : READ_CPU_TIME, READ_WALL_TIME, &
                          START_TIMER, STOP_TIMER, &
                          time_cholesky, time_transform, time_sep, &
#ifdef ELPA
                          time_backtransform, time_matrixcomplete
#else
                          time_backtransform
#endif
use lolog, only : nlo, loor, lapw, ilo
#ifdef ELPA
use comi, only : NVAA, NE, NBELW, NAT, NT, LNSMAX, NEMAX_IT
#else
use comi, only : NVAA, NE, NBELW, NAT, NT, LNSMAX
#endif
use coml, only : SPRSWF, PRNTWF, WFTAPE, REL, ITER, NOHNS
use comr, only : RKM, EL, EU, WEIGHT, divide
#ifdef ELPA
#ifdef ELPA15
!!! ELPA versions before 2017
use elpa1
use elpa2
use elpa_utilities
use elpa2_utilities
use mod_blacs_infrastructure
use test_util
IMPLICIT NONE
INCLUDE 'mpif.h'
INCLUDE 'elpa.h'
INCLUDE 'param.inc'
#else
!!! ELPA versions elpa2017 or newer
use elpa
IMPLICIT NONE
INCLUDE 'mpif.h'
INCLUDE 'param.inc'
#endif
#else
IMPLICIT NONE
INCLUDE 'param.inc'
#endif
! ..................................................................
!
! interface between matrix set up in LAPW1 and LAPACK (BLAS) matrix
! subroutines; DOUBLE PRECISION version
!
! Solve the secular equations H*Z = EIGVAL*S*Z
! H and S are stored in the array HS,
! H is stored below the diagonal, whereas S is stored above
! the diagonal. The diagonal of H is stored in the
! one dimensional array HSDIAG.
! NVAA is the dimension of the secular matrices
! there is a limit on NE because of the dimensions used
! NS is the total space available for the eigenvectors Z
! NE is returned as the number of Eigenvalues between EL1 and EU1
! except NE is limited to (NS/NVAA)
! EL1 and EU1 are input as EIGVAL(1) and EIGVAL(2).
! IF EIGVAL(2) .LT. EIGVAL(1) THEN no bounds are applied
!
! D.D.Koelling April,1974
!
! NOTE(review): the description above refers to the serial (packed HS)
! storage.  In the Parallel build, H and S are separate block-cyclic
! ScaLAPACK matrices (descriptors DESCHS/DESCZ), and the diagonalization
! is done either with ScaLAPACK (scala_switch) or ELPA (elpa_switch).
! On return: EIGVAL holds the computed eigenvalues, Z the eigenvectors
! (already back-transformed to the generalized problem), NE the number
! of eigenvalues found.
!
! ..................................................................
!
!
!
! Locals
!
INTEGER IER1, IER2, NE1, NMAT1, NSPC
INTEGER I, BLOCK1, BLOCK2
DOUBLE PRECISION EL1, EU1, Q
DOUBLE PRECISION Flop
DOUBLE PRECISION WTIME(4)
INTEGER STDERR
PARAMETER (STDERR = 99)
!
! External Functions
!
! INTEGER NEBLW1
! EXTERNAL NEBLW1
!
! Local Parameters
!
INTEGER NBMIN, NBVAL, NXVAL
PARAMETER (NBMIN = 1, NBVAL = 2, NXVAL = 1)
!
! Locals
!
CHARACTER*67 ERRMSG
CHARACTER JOBZ, RANG, UPLO
INTEGER IL, INFO, IU, J
INTEGER, allocatable :: IFAIL(:), IWORK(:)
INTEGER LWORK, LIWORK, LRWORK, LWORKQ, LIWORKQ, LRWORKQ
INTEGER NEV
INTEGER, allocatable :: ICLUSTR(:)
DOUBLE PRECISION, allocatable :: GAP(:)
DOUBLE PRECISION SCAL
INTEGER NN, NP0, MQ0, NQ0, CLUSTERSIZE
INTEGER IERR
DOUBLE PRECISION ABSTOL, ORFAC
double precision work24 !, test, RUH
integer itest, rtest !test RUH
integer, allocatable :: ISUPPZ(:) !test RUH
!_REAL DOUBLE PRECISION, allocatable :: WORK(:)
!_COMPLEX DOUBLE COMPLEX, allocatable :: WORK(:)
!_COMPLEX DOUBLE PRECISION, allocatable :: RWORK(:)
DOUBLE PRECISION CP(5)
logical out,out1
data out/.false./
data out1/.false./
#ifdef ELPA
! Locals for the ELPA path: buffers and block-cyclic index bookkeeping
! used to complete the (upper-triangular) matrix H to a full matrix.
double precision transformation_begin, transformation_end
!_REAL double precision, allocatable :: sendbuf(:),receivebuf(:),h_temp(:,:)
!_COMPLEX double complex, allocatable :: sendbuf(:),receivebuf(:),h_temp(:,:)
integer number_of_eigenvalues,mpi_comm_rows,mpi_comm_cols,mpierr,dimmatrix,shapeh(2)
integer i_global,j_global,i_local,j_local,trans_i_local,trans_j_local
integer row_coordinate,column_coordinate,trans_row_coordinate,trans_column_coordinate
integer row_step,column_step,trans_row_step,trans_column_step,element_counter
integer column_block_coordinate,row_block_coordinate,trans_row_block_coordinate,trans_column_block_coordinate
integer sourceid,targetid,mirrortag,status(MPI_STATUS_SIZE)
#ifdef ELPA15
!!! ELPA versions before 2017
! old API: solver routines return a logical success flag
logical success
#else
!!! ELPA versions elpa2017 or newer
! new API: error codes are integers; solver accessed via elpa_t object
integer success
class(elpa_t), pointer :: elpa_pt
#endif
#endif
!
! External Subroutines
!
!_REAL EXTERNAL OUTERR, DPOTRF, DSCGST, DSYEVX2
!_COMPLEX EXTERNAL OUTERR, ZPOTRF, ZHCGST, ZHEEVX2
#ifdef Parallel
INTEGER NUMROC, ICEIL
EXTERNAL NUMROC, ICEIL
real*8 PDLAMCH
EXTERNAL PDLAMCH
#endif
!
! Intrinsic Functions
!
INTRINSIC MIN
!
!
! Optional debug flag read from unit 5; on read error/EOF "out" stays
! .false. (controls the matrix/eigenvector dumps to units 86-89 below).
read(5,'(L1)',err=111,end=111) out1
if(out1) out=out1
if(out1) print*, 'out set'
111 continue
! out=.true.
! out=.false.
CALL BARRIER
#ifdef Parallel
! Allocate the distributed eigenvector matrix and size the ScaLAPACK
! workspaces.  CLUSTERSIZE controls how much extra workspace is reserved
! for reorthogonalization of clustered eigenvalues in P?HEEVX.
allocate( Z(LDHS, LCOLZ))
Z=0.0D0
NN = MAX(HSROWS, BLOCKSIZE, 2)
NP0 = NUMROC(NN,BLOCKSIZE, 0, 0, NPROW)
! MQ0 = NUMROC(MAX(NUME, BLOCKSIZE, 2), BLOCKSIZE, 0, 0, NPCOL)
MQ0 = NUMROC(MAX(LCOLZ, BLOCKSIZE, 2), BLOCKSIZE, 0, 0, NPCOL)
NQ0 = NUMROC( NN, BLOCKSIZE, 0, 0, NPCOL )
CLUSTERSIZE = 600
CLUSTERSIZE=max(64,(NVAA+NLO)/(NPROW*NPCOL)/4)
! LRWORK from Documentation
! LRWORK = 4*HSROWS + MAX( 5*NN, NP0*MQ0) + &
! ICEIL(lcolz,NPROW*NPCOL)*NN + (CLUSTERSIZE-1) * HSROWS
! LRWORK from PZHEEVX
!_COMPLEX LRWORK = 4*HSROWS + &
!_COMPLEX MAX( ICEIL( lcolz, NPROW*NPCOL )*NN + MAX( 5*NN, NP0*MQ0 ), &
!_COMPLEX MAX( ( BLOCKSIZE*( BLOCKSIZE-1 ) ) / 2, &
!_COMPLEX ( MQ0+NP0 )*BLOCKSIZE ) + BLOCKSIZE*BLOCKSIZE) + &
!_COMPLEX (CLUSTERSIZE-1) * HSROWS
!_COMPLEX LWORK = HSROWS + MAX(BLOCKSIZE*(NP0+1),3) + CLUSTERSIZE * HSROWS
!_COMPLEX LWORK = HSROWS + (NP0 + NQ0 + BLOCKSIZE) * BLOCKSIZE + &
!_COMPLEX CLUSTERSIZE * HSROWS
!_REAL LWORK = 5*HSROWS + MAX(5*NN, NP0*MQ0 + 2*BLOCKSIZE*BLOCKSIZE) + &
!_REAL ICEIL(lcolz, NPROW*NPCOL)*NN + CLUSTERSIZE * HSROWS
LIWORK = 6*MAX(HSROWS, NPROW*NPCOL+1,4)
allocate( ICLUSTR(2*NPROW*NPCOL),GAP(NPROW*NPCOL) )
! *********** ADDED ***********
! Find the correct sizes
! Workspace query: call the eigensolver with LWORKQ=-1 etc. so it only
! returns the optimal workspace sizes in WORK(1)/RWORK(1)/IWORK(1).
LWORKQ=-1
LIWORKQ=-1
LRWORKQ=-1
!_REAL allocate( IFAIL(HSROWS), IWORK(LIWORK), WORK(LWORK), STAT=IERR )
!_COMPLEX allocate( IFAIL(HSROWS), IWORK(LIWORK), WORK(LWORK), RWORK(LRWORK),STAT=IERR )
if(IERR.ne.0) THEN
write(6,*) 'not enough memory: ', IERR
call Abort_Parallel
stop 'not enough memory'
end if
JOBZ='V'
RANG='V'
UPLO='U'
! If no valid energy window was supplied (EIGVAL(1) >= EIGVAL(2)),
! disable the bounds by using a huge interval.
IF (EIGVAL(1) .GE. EIGVAL(2)) THEN
EIGVAL(1) = -1.0D+70
EIGVAL(2) = 1.0D+70
ENDIF
EL1 = EIGVAL(1)
EU1 = EIGVAL(2)
IL = 0
IU = 0
ABSTOL = 0.0D+0
abstol=2.d0*PDLAMCH(ICTXTALL,'S')
! ORFAC = 1.0D-3
!abstol and orfac checked by PB 8.5.2014: Au-atom problem,
!pdlamch and orfac=10-3 gives correct eigval + fastest time
!orfac 10-2 gives false eigenvectors,
! if problems occur, switch to pdsyevx16
ORFAC = 1.0D-3
!!_REAL CALL PDSYEVX17(JOBZ,RANG,UPLO,NVAA+NLO,H,1,1, &
#ifdef old_scalapack
!_REAL CALL PDSYEVX(JOBZ,RANG,UPLO,NVAA+NLO,H,1,1, &
!_REAL DESCHS,EL1,EU1,IL,IU, &
!_REAL ABSTOL,NE,NEV,EIGVAL,ORFAC,Z,1,1,DESCZ, &
!_REAL WORK,LWORKQ,IWORK,LIWORKQ,IFAIL,ICLUSTR,GAP,INFO)
!_COMPLEX CALL PZHEEVX(JOBZ,RANG,UPLO,NVAA+NLO,H,1,1, &
!_COMPLEX DESCHS,EL1,EU1,IL,IU, &
!_COMPLEX ABSTOL,NE,NEV,EIGVAL,ORFAC,Z,1,1,DESCZ, &
!_COMPLEX WORK,LWORKQ,RWORK,LRWORKQ,IWORK,LIWORKQ,IFAIL, &
!_COMPLEX ICLUSTR,GAP,INFO)
#else
!!_REAL CALL PDSYEVX17(JOBZ,RANG,UPLO,NVAA+NLO,H,1,1, &
!!_REAL DESCHS,EL1,EU1,IL,IU, &
!!_REAL ABSTOL,NE,NEV,EIGVAL,ORFAC,Z,1,1,DESCZ, &
!!_REAL WORK,LWORKQ,IWORK,LIWORKQ,IFAIL,ICLUSTR,GAP,INFO)
!_REAL CALL PDSYEVR(JOBZ,RANG,UPLO,NVAA+NLO,H,1,1, &
!_REAL DESCHS,EL1,EU1,IL,IU, &
!_REAL NE,NEV,EIGVAL,Z,1,1,DESCZ, &
!_REAL WORK,LWORKQ,IWORK,LIWORKQ,INFO)
!_COMPLEX CALL PZHEEVX16(JOBZ,RANG,UPLO,NVAA+NLO,H,1,1, &
!_COMPLEX DESCHS,EL1,EU1,IL,IU, &
!_COMPLEX ABSTOL,NE,NEV,EIGVAL,ORFAC,Z,1,1,DESCZ, &
!_COMPLEX WORK,LWORKQ,RWORK,LRWORKQ,IWORK,LIWORKQ,IFAIL, &
!_COMPLEX ICLUSTR,GAP,INFO)
#endif
! One can probably get away with these, but...
!!_REAL LWORK=max(nint(WORK(1))/min(NPROW,NPCOL),LWORK)
!!_REAL LIWORK=IWORK(1)
!!_COMPLEX LRWORK=max(nint(RWORK(1))/min(NPROW,NPCOL),LRWORK)
! This may be a little faster, but the effect is really small
! Use dble here to handle both real & complex cases
! Take the queried optimal sizes; for the expert drivers add the extra
! (clustersize-1)*HSROWS reorthogonalization space on top.
#ifdef old_scalapack
LWORK=nint(dble(WORK(1)))+(clustersize-1)*HSROWS
#else
!_REAL LWORK=nint(dble(WORK(1)))
!_COMPLEX LWORK=nint(dble(WORK(1)))+(clustersize-1)*HSROWS
#endif
LIWORK=IWORK(1)
!_COMPLEX LRWORK=nint(RWORK(1))+(clustersize-1)*HSROWS
deallocate (IWORK, WORK, IFAIL, STAT=ierr)
!_COMPLEX deallocate (RWORK, STAT=ierr)
601 format('Scalapack Workspace size ',F8.2,a,F8.2,a)
!_REAL write(6,601)(LWORK*8.)/dble(1024*1024),' Mb'
!_COMPLEX write(6,601)(LWORK*8.)/dble(1024*1024), ' and ',(LRWORK*8.)/dble(1024*1024),' Mb'
! *********** END ADDED *******
#else
! Serial build: choose blocking factors for the custom eigensolver
! DSYXEV4 depending on matrix size (BLOCK2=1 falls back to the
! unblocked/old algorithm, also forced by the OLD_DIAG file).
BLOCK2 = 1
!_REAL BLOCK1 = BLOCKSIZE
!_REAL BLOCK2 = 32
!_REAL IF(HSROWS.LT.3500) BLOCK2=16
!_REAL! IF(HSROWS.LT.1500) BLOCK2=6
!_REAL! IF(HSROWS.LT.1200) BLOCK2=4
!_REAL IF(HSROWS.LT.2000) BLOCK2=1
!_REAL INQUIRE(FILE="OLD_DIAG", EXIST=out1)
!_REAL if(out1) BLOCK2 = 1
!_REAL! BLOCK2 = 1
if(BLOCK2.eq.1) then
LWORK = HSROWS*(BLOCKSIZE+3)
else
LWORK = HSROWS*(4*max(BLOCKSIZE,BLOCK2)+8+2*BLOCK2)
endif
LIWORK = 5*HSROWS
LRWORK = 7*HSROWS
#endif
! (Re)allocate the actual workspaces with the final sizes.
!_COMPLEX allocate( RWORK(LRWORK) )
allocate( IFAIL(HSROWS), IWORK(LIWORK), WORK(LWORK), STAT=IERR )
if(IERR.ne.0) THEN
write(6,*) 'not enough memory: ', IERR
call Abort_Parallel
stop 'not enough memory'
end if
IF (EIGVAL(1) .GE. EIGVAL(2)) THEN
EIGVAL(1) = -1.0D+70
EIGVAL(2) = 1.0D+70
ENDIF
EL1 = EIGVAL(1)
EU1 = EIGVAL(2)
!
! conjugate matrix
!
#ifndef Parallel
!_COMPLEX DO J = 1, NVAA+NLO
!_COMPLEX DO I = 1, NVAA+NLO
!_COMPLEX HS(I,J) = DCONJG(HS(I,J))
!_COMPLEX END DO
!_COMPLEX HSDIAG(J) = DCONJG(HSDIAG(J))
!_COMPLEX END DO
#endif
!
! form a Cholesky factorization of SP (packed overlap-matrix S).
!
!
! swap Diagonal of Overlap matrix into matrix
!
!print
#ifndef Parallel
if(out) then
write(87,*) HSrows,nvaa,nlo
write(88,1234) HS
write(87,1234) HSdiag
1234 format(4e23.15)
endif
#endif
! Serial storage trick: HS holds both H and S; DSWAP/ZSWAP exchanges the
! matrix diagonal with HSDIAG so that the part needed by the next LAPACK
! call (first S, later H) sits on the diagonal of HS.
#ifndef Parallel
!_REAL CALL DSWAP(HSROWS,HS,HSROWS+1,HSDIAG,1)
!_COMPLEX CALL ZSWAP(HSROWS,HS,HSROWS+1,HSDIAG,1)
#endif
!
! write (6,*)'matrix size',NVAA+NLO
CALL START_TIMER(time_cholesky)
#ifdef Parallel
! deschs(2)>0: this process owns part of the distributed matrix.
if(deschs(2).gt.0) then
!_REAL CALL PDPOTRF('U',NVAA+NLO,S,1,1,DESCHS,INFO)
!_COMPLEX CALL PZPOTRF('U',NVAA+NLO,S,1,1,DESCHS,INFO)
end if
#else
!_REAL CALL DPOTRF('U',NVAA+NLO,HS,HSROWS,INFO)
!_COMPLEX CALL ZPOTRF('U',NVAA+NLO,HS,HSROWS,INFO)
#endif
CALL STOP_TIMER(time_cholesky)
IF(INFO .NE. 0) THEN
write(6,*) 'Cholesky INFO = ', INFO
write(stderr,*) 'Cholesky INFO = ', INFO
CALL OUTERR('SECLR4','POTRF (Scalapack/LAPACK) failed.')
Call Abort_Parallel
STOP 'SECLR4 - Error'
ENDIF
!
! swap Diagonal of Hamilton matrix into matrix
!
#ifndef Parallel
!_REAL CALL DSWAP(HSROWS,HS,HSROWS+1,HSDIAG,1)
!_COMPLEX CALL ZSWAP(HSROWS,HS,HSROWS+1,HSDIAG,1)
#endif
CALL BARRIER
!
! transform the problem to a standard eigenvalue problem.
!
CALL START_TIMER(time_transform)
#ifdef Parallel
if(deschs(2).gt.0) then
!_REAL CALL PDSYGST(1,'U',NVAA+NLO,H,1,1,DESCHS,S,1,1,DESCHS,SCAL,INFO)
!_COMPLEX CALL PZHEGST(1,'U',NVAA+NLO,H,1,1,DESCHS,S,1,1,DESCHS,SCAL,INFO)
end if
#else
!_REAL CALL DSCGST(NVAA+NLO,HS,HSROWS,HSDIAG,WORK,INFO)
!_COMPLEX CALL ZHCGST(NVAA+NLO,HS,HSROWS,HSDIAG,WORK,INFO)
#endif
CALL STOP_TIMER(time_transform)
IF(INFO .NE. 0) THEN
write(6,*) 'GST INFO = ', INFO
write(stderr,*) 'GST INFO = ', INFO
CALL OUTERR('SECLR4','SCGST (Scalapack/LAPACK) failed.')
Call Abort_Parallel
STOP 'SECLR4 - Error'
ENDIF
CALL BARRIER
#ifdef Parallel
#ifdef ELPA
! ELPA path: only the upper triangle of H is valid after P?HEGST, but
! the elements below the diagonal must be filled in as well (the loops
! below copy/conjugate each block across the diagonal) - presumably
! because ELPA expects the full matrix; confirm against ELPA docs.
if(elpa_switch) then
if(myid.eq.0) call cpu_time(transformation_begin)
call START_TIMER(time_matrixcomplete)
dimmatrix = nvaa + nlo
shapeh=shape(h)
if(.not.elpa_mem_default) then
! Memory-saving variant: mirror block (i,j) of the upper triangle into
! block (j,i) by explicit point-to-point MPI transfers between the
! owner of the source block and the owner of the transposed block.
! Diagonal elements are skipped ("cycle") - they are already in place.
allocate(sendbuf(blocksize*blocksize))
allocate(receivebuf(blocksize*blocksize))
do j_global=1,dimmatrix,blocksize
column_coordinate=mod((j_global-1)/blocksize,npcol)
trans_row_coordinate=mod((j_global-1)/blocksize,nprow)
do i_global=1,j_global,blocksize
row_coordinate=mod((i_global-1)/blocksize,nprow)
trans_column_coordinate=mod((i_global-1)/blocksize,npcol)
sourceid=row_coordinate*npcol+column_coordinate
targetid=trans_row_coordinate*npcol+trans_column_coordinate
if((myid.eq.sourceid).or.(myid.eq.targetid)) then
! Local (per-process) indices of the block and of its transpose
! in the 2D block-cyclic layout.
row_block_coordinate=(i_global-1)/(nprow*blocksize)
column_block_coordinate=(j_global-1)/(npcol*blocksize)
i_local=mod(i_global-1,blocksize)+1+row_block_coordinate*blocksize
j_local=mod(j_global-1,blocksize)+1+column_block_coordinate*blocksize
trans_column_block_coordinate=(i_global-1)/(npcol*blocksize)
trans_row_block_coordinate=(j_global-1)/(nprow*blocksize)
trans_i_local=mod(i_global-1,blocksize)+1+trans_row_block_coordinate*blocksize
trans_j_local=mod(j_global-1,blocksize)+1+trans_column_block_coordinate*blocksize
if(sourceid.eq.targetid) then
! Source and mirror block live on the same process: copy locally.
do trans_column_step=0,min(blocksize-1,shapeh(2)-blocksize*trans_column_block_coordinate-1)
row_step=trans_column_step
if(row_step.gt.shapeh(1)-blocksize*row_block_coordinate-1) cycle
do trans_row_step=0,min(blocksize-1,shapeh(1)- blocksize*trans_row_block_coordinate-1)
column_step=trans_row_step
if(column_step.gt.shapeh(2)-blocksize*column_block_coordinate-1) cycle
if(i_global+row_step.eq.j_global+column_step) cycle
!_REAL H(trans_i_local+trans_row_step,trans_j_local+trans_column_step)= &
!_REAL H(i_local+row_step,j_local+column_step)
!_COMPLEX H(trans_i_local+trans_row_step,trans_j_local+trans_column_step)= &
!_COMPLEX DCONJG(H(i_local+row_step,j_local+column_step))
enddo
enddo
else
! Different owners: pack the block, send it to the owner of the
! transposed position (tag = j_global), unpack there transposed.
if(myid.eq.sourceid) then
mirrortag=j_global
element_counter=1
do column_step=0,min(blocksize-1,shapeh(2)-blocksize*column_block_coordinate-1)
do row_step=0,min(blocksize-1,shapeh(1)-blocksize*row_block_coordinate-1)
!_REAL sendbuf(element_counter)=H(i_local+row_step,j_local+column_step)
!_COMPLEX sendbuf(element_counter)=DCONJG(H(i_local+row_step,j_local+column_step))
element_counter=element_counter+1
enddo
enddo
!_REAL call MPI_SEND(sendbuf,blocksize*blocksize,MPI_DOUBLE_PRECISION,targetid, &
!_REAL mirrortag,MPI_COMM_WORLD,mpierr)
!_COMPLEX call MPI_SEND(sendbuf,blocksize*blocksize,MPI_DOUBLE_COMPLEX,targetid, &
!_COMPLEX mirrortag,MPI_COMM_WORLD,mpierr)
elseif(myid.eq.targetid) then
mirrortag=j_global
!_REAL call MPI_RECV(receivebuf,blocksize*blocksize,MPI_DOUBLE_PRECISION,sourceid, &
!_REAL mirrortag,MPI_COMM_WORLD,status,mpierr)
!_COMPLEX call MPI_RECV(receivebuf,blocksize*blocksize,MPI_DOUBLE_COMPLEX,sourceid, &
!_COMPLEX mirrortag,MPI_COMM_WORLD,status,mpierr)
element_counter=1
! Note the swapped loop order relative to the packing above:
! this writes the received column-major block in transposed order.
do row_step=0,min(blocksize-1,shapeh(1)-blocksize*trans_row_block_coordinate-1)
do column_step=0,min(blocksize-1,shapeh(2)-blocksize*trans_column_block_coordinate-1)
H(trans_i_local+row_step,trans_j_local+column_step)=receivebuf(element_counter)
element_counter=element_counter+1
enddo
enddo
endif
endif
endif
enddo
enddo
else
! Default variant (elpa_mem_default): use a full temporary copy and
! the PBLAS transpose.  First zero the strictly-lower part near the
! diagonal and halve the diagonal, so that H + H^H (pdtran/pztranc
! followed by p?geadd) reconstructs the full Hermitian matrix.
allocate(h_temp(shapeh(1),shapeh(2)))
h_temp=0.d0
do i_global=1,dimmatrix
row_coordinate=mod((i_global-1)/blocksize,nprow)
do j_global=1,i_global
if(abs(i_global-j_global).gt.(blocksize-1)) cycle
column_coordinate=mod((j_global-1)/blocksize,npcol)
sourceid=row_coordinate*npcol+column_coordinate
if(sourceid.eq.myid) then
row_block_coordinate=(i_global-1)/(nprow*blocksize)
column_block_coordinate=(j_global-1)/(npcol*blocksize)
i_local=mod(i_global-1,blocksize)+1+row_block_coordinate*blocksize
j_local=mod(j_global-1,blocksize)+1+column_block_coordinate*blocksize
if(i_global.gt.j_global) then
H(i_local,j_local)=0.d0
elseif(i_global.eq.j_global) then
H(i_local,j_local)=H(i_local,j_local)/2.d0
endif
endif
enddo
enddo
!_REAL call pdtran(dimmatrix,dimmatrix,1.d0,H,1,1,deschs,0.d0,h_temp,1,1,deschs)
!_COMPLEX call pztranc(dimmatrix,dimmatrix,(1.d0,0.d0),H,1,1,deschs,(0.d0,0.d0),h_temp,1,1,deschs)
!_REAL call pdgeadd('n',dimmatrix,dimmatrix,1.d0,h_temp,1,1,deschs,1.d0,H,1,1,deschs)
!_COMPLEX call pzgeadd('n',dimmatrix,dimmatrix,1.d0,h_temp,1,1,deschs,1.d0,H,1,1,deschs)
deallocate(h_temp)
endif
call STOP_TIMER(time_matrixcomplete)
! if(myid.eq.0) call cpu_time(transformation_end) !!!DEBUG
! if(myid.eq.0) print'(a,f7.3)','Time for creation of complete matrix: ',transformation_end-transformation_begin !!!DEBUG
endif
#endif
#endif
! compute the eigenvalues within energy-range EL1, EU1
!
CALL START_TIMER(time_sep)
IL = 0
IU = 0
ABSTOL = 0.0D+0
! ORFAC = 1.0D-3
!abstol and orfac changed back PB 8.5.2014: Au-atom +lapwso_mpi problem
ORFAC = 1.0D-3
#ifdef Parallel
#ifdef ELPA
! ELPA solver: computes the lowest NEMAX_IT eigenpairs of the (already
! transformed) standard problem.
if(elpa_switch) then
number_of_eigenvalues=NEMAX_IT
NE=number_of_eigenvalues
#ifdef ELPA15
!!! ELPA versions before 2017
! Old API: build row/column communicators explicitly, then call the
! 2-stage solver directly.
mpierr=get_elpa_row_col_comms(mpi_comm_world, myrowhs, mycolhs, &
mpi_comm_rows, mpi_comm_cols)
!_REAL success = solve_evp_real_2stage(dimmatrix, number_of_eigenvalues, H, ldhs, eigval, z, ldhs, &
!_REAL blocksize, lcolz, mpi_comm_rows, mpi_comm_cols, mpi_comm_world)
!_COMPLEX success = solve_evp_complex_2stage(dimmatrix, number_of_eigenvalues, H, ldhs, eigval, z, ldhs, &
!_COMPLEX blocksize, lcolz, mpi_comm_rows, mpi_comm_cols, mpi_comm_world)
#else
!!! ELPA versions elpa2017 or newer
! New object-based API (elpa_init/elpa_allocate/set/setup/eigenvectors).
! NOTE(review): the per-call "success" codes are not checked against
! ELPA_OK here - errors in set()/setup()/eigenvectors() pass silently.
if (elpa_init(20180501) /= elpa_ok) then
print*, "ELPA API version not supported"
stop
endif
elpa_pt => elpa_allocate ()
! set parameters
call elpa_pt%set("na", dimmatrix, success)
call elpa_pt%set("nev", number_of_eigenvalues, success)
call elpa_pt%set("local_nrows", ldhs, success)
call elpa_pt%set("local_ncols", lcolhs, success)
call elpa_pt%set("nblk", blocksize, success)
call elpa_pt%set("mpi_comm_parent", mpi_comm_world, success)
call elpa_pt%set("process_row", myrowhs, success)
call elpa_pt%set("process_col", mycolhs, success)
success = elpa_pt%setup()
call elpa_pt%set("solver", elpa_solver_2stage, success)
! eigenvectors
call elpa_pt%eigenvectors(H,eigval,z,success)
! cleanup
call elpa_deallocate(elpa_pt)
call elpa_uninit()
#endif
endif
if(scala_switch) then
#endif
! ScaLAPACK solver path (also used when ELPA support is compiled out).
abstol=2.d0*PDLAMCH(ICTXTALL,'S')
JOBZ='V'
RANG='V'
UPLO='U'
if(deschs(2).gt.0) then
EL1=-1.0D+70 !!!
RANG='V'
!!! if(EU.ge.9.d9) RANG='A' !!!
#ifdef old_scalapack
!_REAL CALL PDSYEVX(JOBZ,RANG,UPLO,NVAA+NLO,H,1,1, &
!_REAL DESCHS,EL1,EU1,IL,IU, &
!_REAL ABSTOL,NE,NEV,EIGVAL,ORFAC,Z,1,1,DESCZ, &
!_REAL WORK,LWORK,IWORK,LIWORK,IFAIL,ICLUSTR,GAP,INFO)
!_COMPLEX CALL PZHEEVX(JOBZ,RANG,UPLO,NVAA+NLO,H,1,1, &
!_COMPLEX DESCHS,EL1,EU1,IL,IU, &
!_COMPLEX ABSTOL,NE,NEV,EIGVAL,ORFAC,Z,1,1,DESCZ, &
!_COMPLEX WORK,LWORK,RWORK,LRWORK,IWORK,LIWORK,IFAIL, &
!_COMPLEX ICLUSTR,GAP,INFO)
#else
!!_REAL CALL PDSYEVX17(JOBZ,RANG,UPLO,NVAA+NLO,H,1,1, &
!!_REAL DESCHS,EL1,EU1,IL,IU, &
!!_REAL ABSTOL,NE,NEV,EIGVAL,ORFAC,Z,1,1,DESCZ, &
!!_REAL WORK,LWORK,IWORK,LIWORK,IFAIL,ICLUSTR,GAP,INFO)
!_REAL CALL PDSYEVR(JOBZ,RANG,UPLO,NVAA+NLO,H,1,1, &
!_REAL DESCHS,EL1,EU1,IL,IU, &
!_REAL NE,NEV,EIGVAL,Z,1,1,DESCZ, &
!_REAL WORK,LWORK,IWORK,LIWORK,INFO)
!_COMPLEX CALL PZHEEVX16(JOBZ,RANG,UPLO,NVAA+NLO,H,1,1, &
!_COMPLEX DESCHS,EL1,EU1,IL,IU, &
!_COMPLEX ABSTOL,NE,NEV,EIGVAL,ORFAC,Z,1,1,DESCZ, &
!_COMPLEX WORK,LWORK,RWORK,LRWORK,IWORK,LIWORK,IFAIL, &
!_COMPLEX ICLUSTR,GAP,INFO)
#endif
#ifdef ELPA
end if
#endif
endif
IF (INFO .NE. 0) THEN
write(6,*) 'SEP INFO = ', INFO
if(info.eq.2) then
! INFO=2 from P?HEEVX: eigenvectors failed to reorthogonalize
! (eigenvalue cluster larger than the reserved workspace) - warn only.
99 format(':WARN : WARNING: Not all eigenvectors are orthogonal')
! WRITE(21,99)
WRITE(6,99)
! write(6,*) 'Numbers of eigenvectors that failed to converge =', IFAIL
write(6,*) 'Eigenvalue clusters in eigensolver =', ICLUSTR
write(6,*) 'Gap in eigensolver =', Gap
write(6,*) ' Increase CLUSTERSIZE in SECLR4'
else
write(stderr,*) 'SEP INFO = ', INFO
CALL OUTERR('SECLR4','SYEVX (Scalapack/LAPACK) failed.')
IF(INFO.EQ.4) THEN
write(6,*) 'NEV, NE = ', NEV, NE
write(6,*) 'ICLUSTR = ', ICLUSTR
write(6,*) 'WORK(1) = ', WORK(1:2), LWORK
!_COMPLEX write(6,*) 'RWORK(1) = ', RWORK(1:2), LRWORK
write(6,*) 'IWORK(1) = ', IWORK(1:2), LIWORK
END IF
Call Abort_Parallel
STOP 'SECLR4 - Error'
end if
ENDIF
#else
! Serial build: choose among three LAPACK-based algorithms.
! EU >= 9d9 means "no upper energy bound", i.e. all eigenvalues wanted.
if(EU.ge.9d9) then !test RUH
if(divide) then
!####################################################
!############### divide and conquer #################
!####################################################
! Workspace-query calls (lwork=-1) followed by the real ?SYEVD/?HEEVD.
NE=HSROWS
!_REAL CALL DSYEVD('V','L',NVAA+NLO,HS,HSROWS, &
!_REAL EIGVAL,WORK,-1,IWORK,LIWORK, &
!_REAL INFO)
!_COMPLEX CALL ZHEEVD('V','L',NVAA+NLO,HS,HSROWS,&
!_COMPLEX EIGVAL,WORK,-1,RWORK,-1,IWORK,-1, &
!_COMPLEX INFO)
LWORK = WORK(1)
deallocate(WORK)
allocate(WORK(LWORK))
!_REAL CALL DSYEVD('V','L',NVAA+NLO,HS,HSROWS, &
!_REAL EIGVAL,WORK,LWORK,IWORK,-1, &
!_REAL INFO)
!_REAL LIWORK = WORK(1)
!_REAL deallocate(IWORK)
!_REAL allocate(IWORK(LIWORK))
!_COMPLEX CALL ZHEEVD('V','L',NVAA+NLO,HS,HSROWS,&
!_COMPLEX EIGVAL,WORK,-1,RWORK,-1,IWORK,-1, &
!_COMPLEX INFO)
!_COMPLEX LRWORK=RWORK(1)
!_COMPLEX LIWORK=IWORK(1)
!_COMPLEX deallocate(RWORK)
!_COMPLEX allocate(RWORK(LRWORK))
!_COMPLEX deallocate(IWORK)
!_COMPLEX allocate(IWORK(LIWORK))
!_REAL CALL DSYEVD('V','L',NVAA+NLO,HS,HSROWS, &
!_REAL EIGVAL,WORK,LWORK,IWORK,LIWORK, &
!_REAL INFO)
!_COMPLEX CALL ZHEEVD('V','L',NVAA+NLO,HS,HSROWS,&
!_COMPLEX EIGVAL,WORK,LWORK,RWORK,LRWORK,IWORK,LIWORK, &
!_COMPLEX INFO)
else
!####################################################
!######### relatively robust representation #########
!####################################################
! ?SYEVR/?HEEVR (MRRR algorithm), again with workspace queries first.
allocate(ISUPPZ(2*HSROWS))
!_REAL CALL DSYEVR('V','A','L',NVAA+NLO,HS,HSROWS,EL1,EU1,IL,IU, &
!_REAL ABSTOL,NE,EIGVAL,Z,HSROWS,ISUPPZ,WORK,-1,IWORK,LIWORK,IFAIL, &
!_REAL INFO)
!_REAL LWORK = WORK(1)
!_REAL deallocate(WORK)
!_REAL allocate(WORK(LWORK))
!_REAL CALL DSYEVR('V','A','L',NVAA+NLO,HS,HSROWS,EL1,EU1,IL,IU, &
!_REAL ABSTOL,NE,EIGVAL,Z,HSROWS,ISUPPZ,WORK,LWORK,IWORK,-1,IFAIL, &
!_REAL INFO)
!_REAL LIWORK = WORK(1)
!_REAL deallocate(IWORK)
!_REAL allocate(IWORK(LIWORK))
!_REAL CALL DSYEVR('V','A','L',NVAA+NLO,HS,HSROWS,EL1,EU1,IL,IU, &
!_REAL ABSTOL,NE,EIGVAL,Z,HSROWS,ISUPPZ,WORK,LWORK,IWORK,LIWORK,IFAIL, &
!_REAL INFO)
!_COMPLEX CALL ZHEEVR('V','A','L',NVAA+NLO,HS,HSROWS,EL1,EU1,IL,IU,ABSTOL, &
!_COMPLEX NE,EIGVAL,Z,HSROWS,ISUPPZ,WORK,-1,RWORK,-1, &
!_COMPLEX IWORK,-1,INFO)
!_COMPLEX LWORK=WORK(1)
!_COMPLEX LRWORK=RWORK(1)
!_COMPLEX LIWORK=IWORK(1)
!_COMPLEX deallocate(WORK)
!_COMPLEX allocate(WORK(LWORK))
!_COMPLEX deallocate(RWORK)
!_COMPLEX allocate(RWORK(LRWORK))
!_COMPLEX deallocate(IWORK)
!_COMPLEX allocate(IWORK(LIWORK))
!_COMPLEX CALL ZHEEVR('V','A','L',NVAA+NLO,HS,HSROWS,EL1,EU1,IL,IU,ABSTOL, &
!_COMPLEX NE,EIGVAL,Z,HSROWS,ISUPPZ,WORK,LWORK,RWORK,LRWORK, &
!_COMPLEX IWORK,LIWORK,INFO)
deallocate(ISUPPZ)
endif
else
!####################################################
!########### standard algorithm (few EVs) ###########
!####################################################
! WIEN2k's own blocked expert solver for a limited energy window.
!_REAL CALL DSYXEV4('V','V','L',NVAA+NLO,HS,HSROWS,EL1,EU1,IL,IU, &
!_REAL ABSTOL,NE,EIGVAL,Z,HSROWS,WORK,LWORK,IWORK,IFAIL, &
!_REAL BLOCK1,BLOCK2,0.d0,LCOLZ,NBELW,INFO)
!_REAL! CALL DSYEVX2('V','V','L',NVAA+NLO,HS,HSROWS,EL1,EU1,IL,IU, &
!_REAL! ABSTOL,NE,EIGVAL,Z,HSROWS,WORK,LWORK,IWORK,IFAIL, &
!_REAL! BLOCK1,LCOLZ,NBELW,INFO)
!_COMPLEX CALL ZHEEVX2('V','V','L',NVAA+NLO,HS,HSROWS,EL1,EU1,IL,IU,ABSTOL, &
!_COMPLEX NE,EIGVAL,Z,HSROWS,WORK,LWORK,RWORK,IWORK,IFAIL, &
!_COMPLEX BLOCKSIZE, LCOLZ, NBELW, INFO)
! compare: Complex routines need additional parameter RWORK (after WORK)
endif
IF (INFO .NE. 0) THEN
write(6,*) 'SEP INFO = ', INFO
write(stderr,*) 'SEP INFO = ', INFO
CALL OUTERR('SECLR4','SYEVX (Scalapack/LAPACK) failed.')
STOP 'SECLR4 - Error'
ENDIF
write(6,*) NE, ' Eigenvalues computed '
IF (NE .GT. LCOLZ) THEN
CALL OUTERR('SECLR4','More than NUME eigenvalues.')
WRITE (ERRMSG,9000) LCOLZ,NE
9000 FORMAT ('Increase NUME from',i4,' to',i4,' OR')
CALL OUTERR('SECLR4',ERRMSG)
CALL OUTERR('SECLR4','reduce E2 in *.klist.(see *.output1)')
WRITE(6,9001) (eigval(j),j=1,ne)
9001 FORMAT (5f15.6)
STOP 'SECLR4 - Error'
ENDIF
#endif
CALL STOP_TIMER(time_sep)
CALL BARRIER
!
! swap Diagonal of Overlap matrix into matrix
!
#ifndef Parallel
!_REAL CALL DSWAP(HSROWS,HS,HSROWS+1,HSDIAG,1)
!_COMPLEX CALL ZSWAP(HSROWS,HS,HSROWS+1,HSDIAG,1)
!
!
! conjugate overlap matrix
!
!_COMPLEX DO J = 1, NVAA+NLO
!_COMPLEX DO I = 1, J
!_COMPLEX HS(I,J) = DCONJG(HS(I,J))
!_COMPLEX END DO
!_COMPLEX END DO
#endif
!
! backtransform eigenvectors to the original problem:
! for H*x=(lambda)*S*x;
! backtransform eigenvectors: x = inv(L)*y or inv(U)*y
!
CALL START_TIMER(time_backtransform)
#ifdef Parallel
if(deschs(2).gt.0) then
!_REAL CALL PDTRSM('L', 'U', 'N', 'N', NVAA+NLO, NE, 1.0D0, &
!_REAL S, 1,1,DESCHS, Z, 1,1,DESCZ)
!_COMPLEX CALL PZTRSM('L', 'U', 'N', 'N', NVAA+NLO, NE, (1.0D0,0.0D0), &
!_COMPLEX S, 1,1,DESCHS, Z, 1,1,DESCZ)
end if
#else
!_REAL CALL DTRSM('L', 'U', 'N', 'N', NVAA+NLO, NE, 1.0D0, &
!_REAL HS, HSROWS, Z, HSROWS)
!_COMPLEX CALL ZTRSM('L', 'U', 'N', 'N', NVAA+NLO, NE, (1.0D0,0.0D0), &
!_COMPLEX HS, HSROWS, Z, HSROWS)
#endif
CALL STOP_TIMER(time_backtransform)
CALL BARRIER
#ifdef Parallel
! Count eigenvalues below the lower window EL; relies on EIGVAL being
! sorted ascending, hence the early "exit".
NBELW_PARA=0 !!!
do i=1,NE !!!
if(EIGVAL(i).lt. EL) then !!!
NBELW_PARA=NBELW_PARA+1 !!!
else !!!
exit !!!
endif !!!
enddo !!!
#endif
#ifdef Parallel
deallocate( ICLUSTR,GAP )
#endif
deallocate( IFAIL, IWORK )
deallocate( WORK )
!_COMPLEX deallocate( RWORK )
CALL BARRIER
!print
#ifndef Parallel
if(out) then
do i=1,ne
write(86,*) eigval(i)
write(89,1234) (Z(j,i),j=1,hsrows)
enddo
endif
#endif
!
! Timing/Mflops report.  The Flop estimates are the usual dense-algebra
! operation counts (e.g. n^3/3 for Cholesky); the complex build costs
! four times the real one.
Flop = DBLE(NVAA+NLO)*(NVAA+NLO)*(NVAA+NLO)/3.0D0
!_COMPLEX Flop = Flop*4.0D0
if(READ_CPU_TIME(time_cholesky).eq.0.d0) then
WRITE (6,1001) 'Cholesky complete (CPU)' , &
READ_CPU_TIME(time_cholesky)
else
WRITE (6,1001) 'Cholesky complete (CPU)' , &
READ_CPU_TIME(time_cholesky) , &
1E-6* Flop/READ_CPU_TIME(time_cholesky)
endif
#ifdef Parallel
if(READ_WALL_TIME(time_cholesky).eq.0.d0) then
WRITE (6,1001) 'Cholesky complete (WALL)' , &
READ_WALL_TIME(time_cholesky)
else
WRITE (6,1001) 'Cholesky complete (WALL)' , &
READ_WALL_TIME(time_cholesky) , &
1E-6* Flop/READ_WALL_TIME(time_cholesky)
endif
#endif
Flop = 3.0D0*DBLE(NVAA+NLO)*(NVAA+NLO)*(NVAA+NLO)/3.0D0
!_COMPLEX Flop = Flop*4.0D0
if(READ_CPU_TIME(time_transform).eq.0.d0) then
WRITE (6,1001) 'Transform to eig.problem (CPU)' , &
READ_CPU_TIME(time_transform)
else
WRITE (6,1001) 'Transform to eig.problem (CPU)' , &
READ_CPU_TIME(time_transform) , &
1E-6* Flop/READ_CPU_TIME(time_transform)
endif
#ifdef Parallel
if(READ_WALL_TIME(time_transform).eq.0.d0) then
WRITE (6,1001) 'Transform to eig.problem (WALL)' , &
READ_WALL_TIME(time_transform)
else
WRITE (6,1001) 'Transform to eig.problem (WALL)' , &
READ_WALL_TIME(time_transform) , &
1E-6* Flop/READ_WALL_TIME(time_transform)
endif
#endif
Flop = 4.0D0*DBLE(NVAA+NLO)*(NVAA+NLO)*(NVAA+NLO)/3.0D0
!_COMPLEX Flop = Flop*4.0D0
#if defined(ELPA) && defined(Parallel)
if(elpa_switch) then
WRITE (6,1001) 'Complete Matrix H (CPU)' , &
READ_CPU_TIME(time_matrixcomplete)
WRITE (6,1001) 'Complete Matrix H (WALL)' , &
READ_WALL_TIME(time_matrixcomplete)
endif
#endif
#ifdef ELPA
if(elpa_switch) then
if(READ_CPU_TIME(time_sep).eq.0.d0) then
WRITE (6,1001) 'Compute EVs using ELPA (CPU)' , &
READ_CPU_TIME(time_sep)
else
WRITE (6,1001) 'Compute EVs using ELPA (CPU)' , &
READ_CPU_TIME(time_sep), &
1E-6* Flop/READ_CPU_TIME(time_sep)
endif
else
#else
if(READ_CPU_TIME(time_sep).eq.0.d0) then
WRITE (6,1001) 'Compute eigenvalues (CPU)' , &
READ_CPU_TIME(time_sep)
else
WRITE (6,1001) 'Compute eigenvalues (CPU)' , &
READ_CPU_TIME(time_sep), &
1E-6* Flop/READ_CPU_TIME(time_sep)
endif
#endif
#ifdef ELPA
endif
#endif
#ifdef Parallel
#ifdef ELPA
if (elpa_switch) then
if(READ_WALL_TIME(time_sep).eq.0.d0) then
WRITE (6,1001) 'Compute EVs using ELPA (WALL)' , &
READ_WALL_TIME(time_sep)
else
WRITE (6,1001) 'Compute EVs using ELPA (WALL)' , &
READ_WALL_TIME(time_sep) , &
1E-6* Flop/READ_WALL_TIME(time_sep)
endif
else
#endif
if(READ_WALL_TIME(time_sep).eq.0.d0) then
WRITE (6,1001) 'Compute eigenvalues (WALL)' , &
READ_WALL_TIME(time_sep)
else
WRITE (6,1001) 'Compute eigenvalues (WALL)' , &
READ_WALL_TIME(time_sep) , &
1E-6* Flop/READ_WALL_TIME(time_sep)
endif
#ifdef ELPA
endif
#endif
#endif
Flop = DBLE(NVAA+NLO)*(NVAA+NLO)*NE/4.0D0
!_COMPLEX Flop = Flop*4.0D0
if(READ_CPU_TIME(time_backtransform).eq.0.d0) then
WRITE (6,1001) 'Backtransform (CPU)' , &
READ_CPU_TIME(time_backtransform)
else
WRITE (6,1001) 'Backtransform (CPU)' , &
READ_CPU_TIME(time_backtransform) , &
1E-6* Flop/READ_CPU_TIME(time_backtransform)
endif
#ifdef Parallel
if(READ_WALL_TIME(time_backtransform).eq.0.d0) then
WRITE (6,1001) 'Backtransform (WALL)' , &
READ_WALL_TIME(time_backtransform)
else
WRITE (6,1001) 'Backtransform (WALL)' , &
READ_WALL_TIME(time_backtransform) , &
1E-6* Flop/READ_WALL_TIME(time_backtransform)
endif
#endif
1001 FORMAT (1X,'Seclr4(',A,') :',t45,f10.3:t58,f10.2,' Mflops')
!
RETURN
!
6000 FORMAT (/,/,/,10X,'ERROR IN SCLRE2, IER1=',I5,'#E FOUND=',I5, &
' IER2=',I5)
!
! End of 'SECLR4'
!
END
_______________________________________________
Wien mailing list
[email protected]
http://zeus.theochem.tuwien.ac.at/mailman/listinfo/wien
SEARCH the MAILING-LIST at:
http://www.mail-archive.com/[email protected]/index.html