| Issue |
168682
|
| Summary |
[flang][openmp] inefficient code generation for openmp parallel loops?
|
| Labels |
flang
|
| Assignees |
|
| Reporter |
shivaramaarao
|
Consider the following OpenMP program (reduced from 362.fma3d in the SPEC OMP2012 suite):
```
! Module shared with SUB: declares three derived types and the module-level
! allocatable arrays and integers that the parallel loop in SUB operates on.
module submod
! NT: one double-precision component (Mi), used as the multiplier in SUB.
TYPE :: NT
REAL(KIND(0D0)) Mi
END TYPE
! MT: three double-precision components (Ax, Ay, Az), the values SUB writes.
TYPE :: MT
REAL(KIND(0D0)) Ax
REAL(KIND(0D0)) Ay
REAL(KIND(0D0)) Az
END TYPE
! FT: six double-precision components; SUB computes Xe-Xi, Ye-Yi and Ze-Zi.
TYPE :: FT
REAL(KIND(0D0)) Xi
REAL(KIND(0D0)) Yi
REAL(KIND(0D0)) Zi
REAL(KIND(0D0)) Xe
REAL(KIND(0D0)) Ye
REAL(KIND(0D0)) Ze
END TYPE
! Rank-1 allocatable arrays of the types above; nothing in this reduced
! reproducer allocates them.
TYPE (FT), DIMENSION(:), ALLOCATABLE :: FF
TYPE (MT), DIMENSION(:), ALLOCATABLE :: MM
TYPE (NT), DIMENSION(:), ALLOCATABLE :: NN
! N is the loop index used by SUB; NR is the loop's upper bound.
INTEGER :: N, NR
end module
! For each N in 1..NR, sets MM(N)%Ax/Ay/Az to NN(N)%Mi times the matching
! (Xe-Xi), (Ye-Yi), (Ze-Zi) difference from FF(N), then prints MM.
! All arrays, N and NR come from module submod; the routine takes no arguments.
SUBROUTINE SUB()
USE submod
! The loop is an OpenMP parallel do: everything is shared by default except
! the loop index N (a module variable), which is listed as PRIVATE.
!$OMP PARALLEL DO DEFAULT(SHARED) PRIVATE(N)
DO N = 1,NR
MM(N)%Ax = NN(N)%Mi * (FF(N)%Xe-FF(N)%Xi)
MM(N)%Ay = NN(N)%Mi * (FF(N)%Ye-FF(N)%Yi)
MM(N)%Az = NN(N)%Mi * (FF(N)%Ze-FF(N)%Zi)
ENDDO
!$OMP END PARALLEL DO
! List-directed output of the whole MM array.
PRINT *, MM
END SUBROUTINE
```
The x86 code generated for the OpenMP loop (shown below) is suboptimal.
$ flang -O3 -fopenmp omp_p.f90
```
.LBB1_2:
movslq %r10d, %r11
movq %r11, %r14
movq %r11, %r15
leal 1(%r11), %r10d
movq %r11, %r12
subq %rdx, %r12
movsd (%rcx,%r12,8), %xmm0
subq %rdi, %r14
leaq (%r14,%r14,2), %r14
shlq $4, %r14
subq %r9, %r15
leaq (%r15,%r15,2), %r15
movupd (%rsi,%r14), %xmm1
movupd 24(%rsi,%r14), %xmm2
subpd %xmm1, %xmm2
movapd %xmm0, %xmm1
unpcklpd %xmm0, %xmm1
mulpd %xmm2, %xmm1
movupd %xmm1, (%r8,%r15,8)
movsd 40(%rsi,%r14), %xmm1
subsd 16(%rsi,%r14), %xmm1
addl %eax, %r11d
incl %r11d
mulsd %xmm0, %xmm1
movsd %xmm1, 16(%r8,%r15,8)
cmpl $2, %r11d
jne .LBB1_2
```
The generated code contains many index-calculation instructions. When -fopenmp is not used, the generated code does not contain nearly as many index calculations.
By comparison, the code generated by classic flang is better; it is shown below.
```
.LBB2_5: # %vector.body
# =>This Inner Loop Header: Depth=1
.loc 1 53 1 is_stmt 1 # t3.f90:53:1
movupd 80(%r15,%rbp,2), %xmm2
movupd 64(%r15,%rbp,2), %xmm3
movupd (%r15,%rbp,2), %xmm4
movupd 16(%r15,%rbp,2), %xmm0
movupd 32(%r15,%rbp,2), %xmm1
movupd 48(%r15,%rbp,2), %xmm5
movapd %xmm4, %xmm6
unpcklpd %xmm5, %xmm6 # xmm6 = xmm6[0],xmm5[0]
unpckhpd %xmm5, %xmm4 # xmm4 = xmm4[1],xmm5[1]
movapd %xmm0, %xmm5
unpcklpd %xmm3, %xmm5 # xmm5 = xmm5[0],xmm3[0]
unpckhpd %xmm3, %xmm0 # xmm0 = xmm0[1],xmm3[1]
subpd %xmm6, %xmm0
movapd %xmm1, %xmm3
unpcklpd %xmm2, %xmm3 # xmm3 = xmm3[0],xmm2[0]
.loc 1 54 1 # t3.f90:54:1
subpd %xmm4, %xmm3
.loc 1 53 1 # t3.f90:53:1
unpckhpd %xmm2, %xmm1 # xmm1 = xmm1[1],xmm2[1]
.loc 1 55 1 # t3.f90:55:1
subpd %xmm5, %xmm1
.loc 1 53 1 # t3.f90:53:1
movupd (%r12,%rcx,8), %xmm2
mulpd %xmm2, %xmm0
.loc 1 54 1 # t3.f90:54:1
mulpd %xmm2, %xmm3
.loc 1 55 1 # t3.f90:55:1
mulpd %xmm2, %xmm1
movapd %xmm0, %xmm2
unpcklpd %xmm3, %xmm2 # xmm2 = xmm2[0],xmm3[0]
unpckhpd %xmm1, %xmm3 # xmm3 = xmm3[1],xmm1[1]
shufpd $2, %xmm0, %xmm1 # xmm1 = xmm1[0],xmm0[1]
movupd %xmm1, 16(%r13,%rbp)
movupd %xmm3, 32(%r13,%rbp)
movupd %xmm2, (%r13,%rbp)
addq $2, %rcx
addq $48, %rbp
cmpq %rcx, %r14
jne .LBB2_5
```
As we can see, the code generated by classic flang has fewer index calculations and more compute instructions. Any idea why the code generated by LLVM flang contains so many index calculations?
_______________________________________________
llvm-bugs mailing list
[email protected]
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-bugs