The attached testcase yields (on a core2 duo, gcc trunk):

gfortran -O3 -ftree-vectorize -ffast-math -march=native test.f90
time ./a.out
real    0m3.414s

ifort -xT -O3  test.f90
time ./a.out
real    0m1.556s

The assembly contains:

        ifort   gfortran
mulpd     140          0
mulsd       0        280

so the reason seems to be that ifort vectorizes the following code (full testcase attached):

SUBROUTINE collocate_core_6(res,coef_xyz,pol_x,pol_y,pol_z,cmax,kg,jg)

 IMPLICIT NONE
 INTEGER, PARAMETER :: wp = SELECTED_REAL_KIND ( 14, 200 )
 integer, PARAMETER :: lp=6
    real(wp), INTENT(OUT)    :: res
    integer, INTENT(IN)     :: cmax,kg,jg
    real(wp), INTENT(IN)    :: pol_x(0:lp,-cmax:cmax)
    real(wp), INTENT(IN)    :: pol_y(1:2,0:lp,-cmax:0)
    real(wp), INTENT(IN)    :: pol_z(1:2,0:lp,-cmax:0)
    real(wp), INTENT(IN)    :: coef_xyz(((lp+1)*(lp+2)*(lp+3))/6)
    real(wp) ::  coef_xy(2,(lp+1)*(lp+2)/2)
    real(wp) ::  coef_x(4,0:lp)

[...]
    coef_x(1:2,4)=coef_x(1:2,4)+coef_xy(1:2,12)*pol_y(1,1,jg)
    coef_x(3:4,4)=coef_x(3:4,4)+coef_xy(1:2,12)*pol_y(2,1,jg)
    coef_x(1:2,5)=coef_x(1:2,5)+coef_xy(1:2,13)*pol_y(1,1,jg)
    coef_x(3:4,5)=coef_x(3:4,5)+coef_xy(1:2,13)*pol_y(2,1,jg)
    coef_x(1:2,0)=coef_x(1:2,0)+coef_xy(1:2,14)*pol_y(1,2,jg)
    coef_x(3:4,0)=coef_x(3:4,0)+coef_xy(1:2,14)*pol_y(2,2,jg)
    coef_x(1:2,1)=coef_x(1:2,1)+coef_xy(1:2,15)*pol_y(1,2,jg)
    coef_x(3:4,1)=coef_x(3:4,1)+coef_xy(1:2,15)*pol_y(2,2,jg)
    coef_x(1:2,2)=coef_x(1:2,2)+coef_xy(1:2,16)*pol_y(1,2,jg)
    coef_x(3:4,2)=coef_x(3:4,2)+coef_xy(1:2,16)*pol_y(2,2,jg)
    coef_x(1:2,3)=coef_x(1:2,3)+coef_xy(1:2,17)*pol_y(1,2,jg)
    coef_x(3:4,3)=coef_x(3:4,3)+coef_xy(1:2,17)*pol_y(2,2,jg)
    coef_x(1:2,4)=coef_x(1:2,4)+coef_xy(1:2,18)*pol_y(1,2,jg)
    coef_x(3:4,4)=coef_x(3:4,4)+coef_xy(1:2,18)*pol_y(2,2,jg)
    coef_x(1:2,0)=coef_x(1:2,0)+coef_xy(1:2,19)*pol_y(1,3,jg)
    coef_x(3:4,0)=coef_x(3:4,0)+coef_xy(1:2,19)*pol_y(2,3,jg)
[...]

Either it is able to interpret the short vectors as such, or it realizes that these very short implicit loops are nevertheless favourable for vectorization.

Is there a trick to get gcc to vectorize these loops, or is there some technology missing for this?

Should I file a PR for this (this is somewhat similar to PR31079 and PR31021)?

Thanks in advance,

Joost
SUBROUTINE collocate_core_6(res,coef_xyz,pol_x,pol_y,pol_z,cmax,kg,jg)
 ! Fully unrolled two-stage contraction for lp = 6, written entirely as
 ! length-2 array-section statements.
 !
 ! Stage 1: the 84 entries of coef_xyz are each scaled by a length-2 slice
 !          pol_z(:,m,kg), m = 0..6, and accumulated into the 28 columns
 !          of coef_xy.
 ! Stage 2: the 28 columns of coef_xy are each scaled by the two scalars
 !          pol_y(1,m,jg) and pol_y(2,m,jg), m = 0..6, and accumulated into
 !          rows 1:2 and rows 3:4 of the 7 columns of coef_x.
 ! Result : res is the sum of all 28 elements of coef_x.
 !
 ! Arguments:
 !   res      (out) sum over coef_x after both stages
 !   coef_xyz (in)  84 input coefficients ((lp+1)*(lp+2)*(lp+3)/6 with lp = 6)
 !   pol_x    (in)  declared but never read in this routine
 !   pol_y    (in)  only pol_y(1:2,0:lp,jg) is read
 !   pol_z    (in)  only pol_z(1:2,0:lp,kg) is read
 !   cmax     (in)  used only in the array bounds of pol_x, pol_y and pol_z
 !   kg, jg   (in)  last-dimension indices into pol_z and pol_y respectively
 !
 ! NOTE(review): the names suggest a polynomial collocation kernel that
 ! contracts the z and then the y index; that meaning is not verifiable from
 ! this file alone -- confirm against the originating code.
 ! NOTE(review): the statement form (many independent 2-element updates) is
 ! the subject of the vectorization report this testcase belongs to, so the
 ! statements are intentionally left unrolled rather than folded into loops.

 IMPLICIT NONE
 ! Real kind with at least 14 decimal digits and decimal exponent range 200.
 INTEGER, PARAMETER :: wp = SELECTED_REAL_KIND ( 14, 200 )
 ! Fixed order; all unrolled index tables below are specific to lp = 6.
 integer, PARAMETER :: lp=6
    real(wp), INTENT(OUT)    :: res
    integer, INTENT(IN)     :: cmax,kg,jg
    real(wp), INTENT(IN)    :: pol_x(0:lp,-cmax:cmax)
    real(wp), INTENT(IN)    :: pol_y(1:2,0:lp,-cmax:0)
    real(wp), INTENT(IN)    :: pol_z(1:2,0:lp,-cmax:0)
    real(wp), INTENT(IN)    :: coef_xyz(((lp+1)*(lp+2)*(lp+3))/6)
    ! Stage-1 accumulator: 2 rows by 28 columns ((lp+1)*(lp+2)/2 with lp = 6).
    real(wp) ::  coef_xy(2,(lp+1)*(lp+2)/2)
    ! Stage-2 accumulator: 4 rows by 7 columns (0:lp).
    real(wp) ::  coef_x(4,0:lp)

    ! ---- Stage 1: coef_xyz * pol_z(:,m,kg) -> coef_xy -------------------
    ! coef_xyz is consumed strictly in order 1..84; the number of coef_xy
    ! columns updated per m shrinks as 28, 21, 15, 10, 6, 3, 1.
    coef_xy=0.0_wp
    ! m = 0: coef_xyz(1:28) -> all 28 columns
    coef_xy(:,1)=coef_xy(:,1)+coef_xyz(1)*pol_z(:,0,kg)
    coef_xy(:,2)=coef_xy(:,2)+coef_xyz(2)*pol_z(:,0,kg)
    coef_xy(:,3)=coef_xy(:,3)+coef_xyz(3)*pol_z(:,0,kg)
    coef_xy(:,4)=coef_xy(:,4)+coef_xyz(4)*pol_z(:,0,kg)
    coef_xy(:,5)=coef_xy(:,5)+coef_xyz(5)*pol_z(:,0,kg)
    coef_xy(:,6)=coef_xy(:,6)+coef_xyz(6)*pol_z(:,0,kg)
    coef_xy(:,7)=coef_xy(:,7)+coef_xyz(7)*pol_z(:,0,kg)
    coef_xy(:,8)=coef_xy(:,8)+coef_xyz(8)*pol_z(:,0,kg)
    coef_xy(:,9)=coef_xy(:,9)+coef_xyz(9)*pol_z(:,0,kg)
    coef_xy(:,10)=coef_xy(:,10)+coef_xyz(10)*pol_z(:,0,kg)
    coef_xy(:,11)=coef_xy(:,11)+coef_xyz(11)*pol_z(:,0,kg)
    coef_xy(:,12)=coef_xy(:,12)+coef_xyz(12)*pol_z(:,0,kg)
    coef_xy(:,13)=coef_xy(:,13)+coef_xyz(13)*pol_z(:,0,kg)
    coef_xy(:,14)=coef_xy(:,14)+coef_xyz(14)*pol_z(:,0,kg)
    coef_xy(:,15)=coef_xy(:,15)+coef_xyz(15)*pol_z(:,0,kg)
    coef_xy(:,16)=coef_xy(:,16)+coef_xyz(16)*pol_z(:,0,kg)
    coef_xy(:,17)=coef_xy(:,17)+coef_xyz(17)*pol_z(:,0,kg)
    coef_xy(:,18)=coef_xy(:,18)+coef_xyz(18)*pol_z(:,0,kg)
    coef_xy(:,19)=coef_xy(:,19)+coef_xyz(19)*pol_z(:,0,kg)
    coef_xy(:,20)=coef_xy(:,20)+coef_xyz(20)*pol_z(:,0,kg)
    coef_xy(:,21)=coef_xy(:,21)+coef_xyz(21)*pol_z(:,0,kg)
    coef_xy(:,22)=coef_xy(:,22)+coef_xyz(22)*pol_z(:,0,kg)
    coef_xy(:,23)=coef_xy(:,23)+coef_xyz(23)*pol_z(:,0,kg)
    coef_xy(:,24)=coef_xy(:,24)+coef_xyz(24)*pol_z(:,0,kg)
    coef_xy(:,25)=coef_xy(:,25)+coef_xyz(25)*pol_z(:,0,kg)
    coef_xy(:,26)=coef_xy(:,26)+coef_xyz(26)*pol_z(:,0,kg)
    coef_xy(:,27)=coef_xy(:,27)+coef_xyz(27)*pol_z(:,0,kg)
    coef_xy(:,28)=coef_xy(:,28)+coef_xyz(28)*pol_z(:,0,kg)
    ! m = 1: coef_xyz(29:49) -> columns 1-6, 8-12, 14-17, 19-21, 23-24, 26
    coef_xy(:,1)=coef_xy(:,1)+coef_xyz(29)*pol_z(:,1,kg)
    coef_xy(:,2)=coef_xy(:,2)+coef_xyz(30)*pol_z(:,1,kg)
    coef_xy(:,3)=coef_xy(:,3)+coef_xyz(31)*pol_z(:,1,kg)
    coef_xy(:,4)=coef_xy(:,4)+coef_xyz(32)*pol_z(:,1,kg)
    coef_xy(:,5)=coef_xy(:,5)+coef_xyz(33)*pol_z(:,1,kg)
    coef_xy(:,6)=coef_xy(:,6)+coef_xyz(34)*pol_z(:,1,kg)
    coef_xy(:,8)=coef_xy(:,8)+coef_xyz(35)*pol_z(:,1,kg)
    coef_xy(:,9)=coef_xy(:,9)+coef_xyz(36)*pol_z(:,1,kg)
    coef_xy(:,10)=coef_xy(:,10)+coef_xyz(37)*pol_z(:,1,kg)
    coef_xy(:,11)=coef_xy(:,11)+coef_xyz(38)*pol_z(:,1,kg)
    coef_xy(:,12)=coef_xy(:,12)+coef_xyz(39)*pol_z(:,1,kg)
    coef_xy(:,14)=coef_xy(:,14)+coef_xyz(40)*pol_z(:,1,kg)
    coef_xy(:,15)=coef_xy(:,15)+coef_xyz(41)*pol_z(:,1,kg)
    coef_xy(:,16)=coef_xy(:,16)+coef_xyz(42)*pol_z(:,1,kg)
    coef_xy(:,17)=coef_xy(:,17)+coef_xyz(43)*pol_z(:,1,kg)
    coef_xy(:,19)=coef_xy(:,19)+coef_xyz(44)*pol_z(:,1,kg)
    coef_xy(:,20)=coef_xy(:,20)+coef_xyz(45)*pol_z(:,1,kg)
    coef_xy(:,21)=coef_xy(:,21)+coef_xyz(46)*pol_z(:,1,kg)
    coef_xy(:,23)=coef_xy(:,23)+coef_xyz(47)*pol_z(:,1,kg)
    coef_xy(:,24)=coef_xy(:,24)+coef_xyz(48)*pol_z(:,1,kg)
    coef_xy(:,26)=coef_xy(:,26)+coef_xyz(49)*pol_z(:,1,kg)
    ! m = 2: coef_xyz(50:64) -> columns 1-5, 8-11, 14-16, 19-20, 23
    coef_xy(:,1)=coef_xy(:,1)+coef_xyz(50)*pol_z(:,2,kg)
    coef_xy(:,2)=coef_xy(:,2)+coef_xyz(51)*pol_z(:,2,kg)
    coef_xy(:,3)=coef_xy(:,3)+coef_xyz(52)*pol_z(:,2,kg)
    coef_xy(:,4)=coef_xy(:,4)+coef_xyz(53)*pol_z(:,2,kg)
    coef_xy(:,5)=coef_xy(:,5)+coef_xyz(54)*pol_z(:,2,kg)
    coef_xy(:,8)=coef_xy(:,8)+coef_xyz(55)*pol_z(:,2,kg)
    coef_xy(:,9)=coef_xy(:,9)+coef_xyz(56)*pol_z(:,2,kg)
    coef_xy(:,10)=coef_xy(:,10)+coef_xyz(57)*pol_z(:,2,kg)
    coef_xy(:,11)=coef_xy(:,11)+coef_xyz(58)*pol_z(:,2,kg)
    coef_xy(:,14)=coef_xy(:,14)+coef_xyz(59)*pol_z(:,2,kg)
    coef_xy(:,15)=coef_xy(:,15)+coef_xyz(60)*pol_z(:,2,kg)
    coef_xy(:,16)=coef_xy(:,16)+coef_xyz(61)*pol_z(:,2,kg)
    coef_xy(:,19)=coef_xy(:,19)+coef_xyz(62)*pol_z(:,2,kg)
    coef_xy(:,20)=coef_xy(:,20)+coef_xyz(63)*pol_z(:,2,kg)
    coef_xy(:,23)=coef_xy(:,23)+coef_xyz(64)*pol_z(:,2,kg)
    ! m = 3: coef_xyz(65:74) -> columns 1-4, 8-10, 14-15, 19
    coef_xy(:,1)=coef_xy(:,1)+coef_xyz(65)*pol_z(:,3,kg)
    coef_xy(:,2)=coef_xy(:,2)+coef_xyz(66)*pol_z(:,3,kg)
    coef_xy(:,3)=coef_xy(:,3)+coef_xyz(67)*pol_z(:,3,kg)
    coef_xy(:,4)=coef_xy(:,4)+coef_xyz(68)*pol_z(:,3,kg)
    coef_xy(:,8)=coef_xy(:,8)+coef_xyz(69)*pol_z(:,3,kg)
    coef_xy(:,9)=coef_xy(:,9)+coef_xyz(70)*pol_z(:,3,kg)
    coef_xy(:,10)=coef_xy(:,10)+coef_xyz(71)*pol_z(:,3,kg)
    coef_xy(:,14)=coef_xy(:,14)+coef_xyz(72)*pol_z(:,3,kg)
    coef_xy(:,15)=coef_xy(:,15)+coef_xyz(73)*pol_z(:,3,kg)
    coef_xy(:,19)=coef_xy(:,19)+coef_xyz(74)*pol_z(:,3,kg)
    ! m = 4: coef_xyz(75:80) -> columns 1-3, 8-9, 14
    coef_xy(:,1)=coef_xy(:,1)+coef_xyz(75)*pol_z(:,4,kg)
    coef_xy(:,2)=coef_xy(:,2)+coef_xyz(76)*pol_z(:,4,kg)
    coef_xy(:,3)=coef_xy(:,3)+coef_xyz(77)*pol_z(:,4,kg)
    coef_xy(:,8)=coef_xy(:,8)+coef_xyz(78)*pol_z(:,4,kg)
    coef_xy(:,9)=coef_xy(:,9)+coef_xyz(79)*pol_z(:,4,kg)
    coef_xy(:,14)=coef_xy(:,14)+coef_xyz(80)*pol_z(:,4,kg)
    ! m = 5: coef_xyz(81:83) -> columns 1-2, 8
    coef_xy(:,1)=coef_xy(:,1)+coef_xyz(81)*pol_z(:,5,kg)
    coef_xy(:,2)=coef_xy(:,2)+coef_xyz(82)*pol_z(:,5,kg)
    coef_xy(:,8)=coef_xy(:,8)+coef_xyz(83)*pol_z(:,5,kg)
    ! m = 6: coef_xyz(84) -> column 1
    coef_xy(:,1)=coef_xy(:,1)+coef_xyz(84)*pol_z(:,6,kg)
    ! ---- Stage 2: coef_xy * pol_y(1:2,m,jg) -> coef_x -------------------
    ! Each coef_xy column is used twice: times pol_y(1,m,jg) into rows 1:2
    ! and times pol_y(2,m,jg) into rows 3:4 of the same coef_x column.
    ! coef_xy columns are consumed in order 1..28; the number of coef_x
    ! columns updated per m shrinks as 7, 6, 5, 4, 3, 2, 1.
    coef_x=0.0_wp
    ! m = 0: coef_xy columns 1-7 -> coef_x columns 0-6
    coef_x(1:2,0)=coef_x(1:2,0)+coef_xy(1:2,1)*pol_y(1,0,jg)
    coef_x(3:4,0)=coef_x(3:4,0)+coef_xy(1:2,1)*pol_y(2,0,jg)
    coef_x(1:2,1)=coef_x(1:2,1)+coef_xy(1:2,2)*pol_y(1,0,jg)
    coef_x(3:4,1)=coef_x(3:4,1)+coef_xy(1:2,2)*pol_y(2,0,jg)
    coef_x(1:2,2)=coef_x(1:2,2)+coef_xy(1:2,3)*pol_y(1,0,jg)
    coef_x(3:4,2)=coef_x(3:4,2)+coef_xy(1:2,3)*pol_y(2,0,jg)
    coef_x(1:2,3)=coef_x(1:2,3)+coef_xy(1:2,4)*pol_y(1,0,jg)
    coef_x(3:4,3)=coef_x(3:4,3)+coef_xy(1:2,4)*pol_y(2,0,jg)
    coef_x(1:2,4)=coef_x(1:2,4)+coef_xy(1:2,5)*pol_y(1,0,jg)
    coef_x(3:4,4)=coef_x(3:4,4)+coef_xy(1:2,5)*pol_y(2,0,jg)
    coef_x(1:2,5)=coef_x(1:2,5)+coef_xy(1:2,6)*pol_y(1,0,jg)
    coef_x(3:4,5)=coef_x(3:4,5)+coef_xy(1:2,6)*pol_y(2,0,jg)
    coef_x(1:2,6)=coef_x(1:2,6)+coef_xy(1:2,7)*pol_y(1,0,jg)
    coef_x(3:4,6)=coef_x(3:4,6)+coef_xy(1:2,7)*pol_y(2,0,jg)
    ! m = 1: coef_xy columns 8-13 -> coef_x columns 0-5
    coef_x(1:2,0)=coef_x(1:2,0)+coef_xy(1:2,8)*pol_y(1,1,jg)
    coef_x(3:4,0)=coef_x(3:4,0)+coef_xy(1:2,8)*pol_y(2,1,jg)
    coef_x(1:2,1)=coef_x(1:2,1)+coef_xy(1:2,9)*pol_y(1,1,jg)
    coef_x(3:4,1)=coef_x(3:4,1)+coef_xy(1:2,9)*pol_y(2,1,jg)
    coef_x(1:2,2)=coef_x(1:2,2)+coef_xy(1:2,10)*pol_y(1,1,jg)
    coef_x(3:4,2)=coef_x(3:4,2)+coef_xy(1:2,10)*pol_y(2,1,jg)
    coef_x(1:2,3)=coef_x(1:2,3)+coef_xy(1:2,11)*pol_y(1,1,jg)
    coef_x(3:4,3)=coef_x(3:4,3)+coef_xy(1:2,11)*pol_y(2,1,jg)
    coef_x(1:2,4)=coef_x(1:2,4)+coef_xy(1:2,12)*pol_y(1,1,jg)
    coef_x(3:4,4)=coef_x(3:4,4)+coef_xy(1:2,12)*pol_y(2,1,jg)
    coef_x(1:2,5)=coef_x(1:2,5)+coef_xy(1:2,13)*pol_y(1,1,jg)
    coef_x(3:4,5)=coef_x(3:4,5)+coef_xy(1:2,13)*pol_y(2,1,jg)
    ! m = 2: coef_xy columns 14-18 -> coef_x columns 0-4
    coef_x(1:2,0)=coef_x(1:2,0)+coef_xy(1:2,14)*pol_y(1,2,jg)
    coef_x(3:4,0)=coef_x(3:4,0)+coef_xy(1:2,14)*pol_y(2,2,jg)
    coef_x(1:2,1)=coef_x(1:2,1)+coef_xy(1:2,15)*pol_y(1,2,jg)
    coef_x(3:4,1)=coef_x(3:4,1)+coef_xy(1:2,15)*pol_y(2,2,jg)
    coef_x(1:2,2)=coef_x(1:2,2)+coef_xy(1:2,16)*pol_y(1,2,jg)
    coef_x(3:4,2)=coef_x(3:4,2)+coef_xy(1:2,16)*pol_y(2,2,jg)
    coef_x(1:2,3)=coef_x(1:2,3)+coef_xy(1:2,17)*pol_y(1,2,jg)
    coef_x(3:4,3)=coef_x(3:4,3)+coef_xy(1:2,17)*pol_y(2,2,jg)
    coef_x(1:2,4)=coef_x(1:2,4)+coef_xy(1:2,18)*pol_y(1,2,jg)
    coef_x(3:4,4)=coef_x(3:4,4)+coef_xy(1:2,18)*pol_y(2,2,jg)
    ! m = 3: coef_xy columns 19-22 -> coef_x columns 0-3
    coef_x(1:2,0)=coef_x(1:2,0)+coef_xy(1:2,19)*pol_y(1,3,jg)
    coef_x(3:4,0)=coef_x(3:4,0)+coef_xy(1:2,19)*pol_y(2,3,jg)
    coef_x(1:2,1)=coef_x(1:2,1)+coef_xy(1:2,20)*pol_y(1,3,jg)
    coef_x(3:4,1)=coef_x(3:4,1)+coef_xy(1:2,20)*pol_y(2,3,jg)
    coef_x(1:2,2)=coef_x(1:2,2)+coef_xy(1:2,21)*pol_y(1,3,jg)
    coef_x(3:4,2)=coef_x(3:4,2)+coef_xy(1:2,21)*pol_y(2,3,jg)
    coef_x(1:2,3)=coef_x(1:2,3)+coef_xy(1:2,22)*pol_y(1,3,jg)
    coef_x(3:4,3)=coef_x(3:4,3)+coef_xy(1:2,22)*pol_y(2,3,jg)
    ! m = 4: coef_xy columns 23-25 -> coef_x columns 0-2
    coef_x(1:2,0)=coef_x(1:2,0)+coef_xy(1:2,23)*pol_y(1,4,jg)
    coef_x(3:4,0)=coef_x(3:4,0)+coef_xy(1:2,23)*pol_y(2,4,jg)
    coef_x(1:2,1)=coef_x(1:2,1)+coef_xy(1:2,24)*pol_y(1,4,jg)
    coef_x(3:4,1)=coef_x(3:4,1)+coef_xy(1:2,24)*pol_y(2,4,jg)
    coef_x(1:2,2)=coef_x(1:2,2)+coef_xy(1:2,25)*pol_y(1,4,jg)
    coef_x(3:4,2)=coef_x(3:4,2)+coef_xy(1:2,25)*pol_y(2,4,jg)
    ! m = 5: coef_xy columns 26-27 -> coef_x columns 0-1
    coef_x(1:2,0)=coef_x(1:2,0)+coef_xy(1:2,26)*pol_y(1,5,jg)
    coef_x(3:4,0)=coef_x(3:4,0)+coef_xy(1:2,26)*pol_y(2,5,jg)
    coef_x(1:2,1)=coef_x(1:2,1)+coef_xy(1:2,27)*pol_y(1,5,jg)
    coef_x(3:4,1)=coef_x(3:4,1)+coef_xy(1:2,27)*pol_y(2,5,jg)
    ! m = 6: coef_xy column 28 -> coef_x column 0
    coef_x(1:2,0)=coef_x(1:2,0)+coef_xy(1:2,28)*pol_y(1,6,jg)
    coef_x(3:4,0)=coef_x(3:4,0)+coef_xy(1:2,28)*pol_y(2,6,jg)
    ! Reduce the 4 x 7 accumulator to the single scalar result.
    res=SUM(coef_x)
  END SUBROUTINE collocate_core_6

 ! Timing driver for collocate_core_6: zero-fills every input array and then
 ! calls the kernel 10,000,000 times with cmax = kg = jg = 0.
 ! With cmax = 0 the last dimension of pol_x, pol_y and pol_z has extent 1.
 ! NOTE(review): res is never read after the loop; if the kernel is ever made
 ! visible to the optimiser (e.g. via inlining) the calls could be removed as
 ! dead code -- confirm the timing still measures the kernel in that case.
 PROGRAM collocate_driver
 IMPLICIT NONE
 ! Same kind selection as inside collocate_core_6, so the actual arguments
 ! match the dummy arguments of the implicitly-interfaced external routine.
 INTEGER, PARAMETER :: wp = SELECTED_REAL_KIND ( 14, 200 )
 INTEGER, PARAMETER :: lp=6,cmax=0,kg=0,jg=0
 ! Number of kernel invocations used for the timing.
 INTEGER, PARAMETER :: ncalls=10000000
 REAL(wp)   :: res
 REAL(wp)   :: pol_x(0:lp,-cmax:cmax)
 REAL(wp)   :: pol_y(1:2,0:lp,-cmax:0)
 REAL(wp)   :: pol_z(1:2,0:lp,-cmax:0)
 REAL(wp)   :: coef_xyz(((lp+1)*(lp+2)*(lp+3))/6)
 ! Loop counter; previously implicitly typed.
 INTEGER    :: i

 coef_xyz=0.0_wp
 pol_x=0.0_wp
 pol_y=0.0_wp
 pol_z=0.0_wp
 DO i=1,ncalls
    CALL collocate_core_6(res,coef_xyz,pol_x,pol_y,pol_z,cmax,kg,jg)
 ENDDO

 END PROGRAM collocate_driver

Reply via email to