[llvm-bugs] [Bug 168682] [flang][openmp] inefficient code generation for openmp parallel loops?

LLVM Bugs via llvm-bugs Wed, 19 Nov 2025 13:45:42 -0800

Issue	168682
Summary	[flang][openmp] inefficient code generation for openmp parallel loops?
Labels	flang
Assignees
Reporter	shivaramaarao

    Consider the following OpenMP program (reduced from 362.fma3d in the OMP2012 suite):
```
! Derived types and module-level data shared with the reproducer subroutine.
module submod

  ! Single double-precision factor per element.
  type :: NT
    real(kind(0d0)) :: Mi
  end type NT

  ! Three double-precision result components per element.
  type :: MT
    real(kind(0d0)) :: Ax, Ay, Az
  end type MT

  ! Six double-precision components per element; the declaration order
  ! (Xi, Yi, Zi, Xe, Ye, Ze) is kept exactly as in the original listing.
  type :: FT
    real(kind(0d0)) :: Xi, Yi, Zi
    real(kind(0d0)) :: Xe, Ye, Ze
  end type FT

  ! Allocatable rank-1 arrays of the types above.
  type(FT), dimension(:), allocatable :: FF
  type(MT), dimension(:), allocatable :: MM
  type(NT), dimension(:), allocatable :: NN

  ! N is used as the loop index and NR as the loop upper bound by SUB.
  integer :: N, NR

end module submod

! For each index 1..NR, stores NN(N)%Mi times the Xe-Xi, Ye-Yi and Ze-Zi
! differences of FF(N) into MM(N)%Ax, %Ay and %Az inside an OpenMP
! parallel loop, then prints MM.
subroutine SUB()
  use submod

  ! N comes from the module, so it is listed as PRIVATE on the directive.
  !$OMP PARALLEL DO DEFAULT(SHARED) PRIVATE(N)
  do N = 1, NR
    MM(N)%Ax = NN(N)%Mi * (FF(N)%Xe-FF(N)%Xi)
    MM(N)%Ay = NN(N)%Mi * (FF(N)%Ye-FF(N)%Yi)
    MM(N)%Az = NN(N)%Mi * (FF(N)%Ze-FF(N)%Zi)
  end do
  !$OMP END PARALLEL DO

  print *, MM

end subroutine SUB
```

The x86 code generated for the OpenMP loop (shown below) is suboptimal.
$ flang -O3 -fopenmp omp_p.f90

```
.LBB1_2:
         movslq  %r10d, %r11
         movq %r11, %r14
         movq    %r11, %r15
         leal    1(%r11), %r10d
 movq    %r11, %r12
         subq    %rdx, %r12
         movsd (%rcx,%r12,8), %xmm0
         subq    %rdi, %r14
         leaq (%r14,%r14,2), %r14
         shlq    $4, %r14
         subq    %r9, %r15
 leaq    (%r15,%r15,2), %r15
         movupd  (%rsi,%r14), %xmm1
 movupd  24(%rsi,%r14), %xmm2
         subpd   %xmm1, %xmm2
 movapd  %xmm0, %xmm1
         unpcklpd        %xmm0, %xmm1
         mulpd %xmm2, %xmm1
         movupd  %xmm1, (%r8,%r15,8)
         movsd 40(%rsi,%r14), %xmm1
         subsd   16(%rsi,%r14), %xmm1
         addl %eax, %r11d
         incl    %r11d
         mulsd   %xmm0, %xmm1
 movsd   %xmm1, 16(%r8,%r15,8)
         cmpl    $2, %r11d
         jne .LBB1_2
```
There are many index calculation expressions in the generated code. When -fopenmp is not used, the generated code doesn't have as many index calculations.

Compared to the above, the code generated by classic flang is better; it is shown below.
```
.LBB2_5:                                # %vector.body
                                        # =>This Inner Loop Header: Depth=1
        .loc    1 53 1 is_stmt 1                # t3.f90:53:1
        movupd  80(%r15,%rbp,2), %xmm2
        movupd 64(%r15,%rbp,2), %xmm3
        movupd  (%r15,%rbp,2), %xmm4
        movupd 16(%r15,%rbp,2), %xmm0
        movupd  32(%r15,%rbp,2), %xmm1
 movupd  48(%r15,%rbp,2), %xmm5
        movapd  %xmm4, %xmm6
 unpcklpd        %xmm5, %xmm6                    # xmm6 = xmm6[0],xmm5[0]
 unpckhpd        %xmm5, %xmm4                    # xmm4 = xmm4[1],xmm5[1]
        movapd  %xmm0, %xmm5
        unpcklpd        %xmm3, %xmm5                    # xmm5 = xmm5[0],xmm3[0]
        unpckhpd %xmm3, %xmm0                    # xmm0 = xmm0[1],xmm3[1]
        subpd %xmm6, %xmm0
        movapd  %xmm1, %xmm3
        unpcklpd        %xmm2, %xmm3                    # xmm3 = xmm3[0],xmm2[0]
        .loc    1 54 1 # t3.f90:54:1
        subpd   %xmm4, %xmm3
 .loc    1 53 1                          # t3.f90:53:1
        unpckhpd %xmm2, %xmm1                    # xmm1 = xmm1[1],xmm2[1]
        .loc    1 55 1                          # t3.f90:55:1
        subpd   %xmm5, %xmm1
 .loc    1 53 1                          # t3.f90:53:1
        movupd (%r12,%rcx,8), %xmm2
        mulpd   %xmm2, %xmm0
        .loc    1 54 1 # t3.f90:54:1
        mulpd   %xmm2, %xmm3
 .loc    1 55 1                          # t3.f90:55:1
        mulpd   %xmm2, %xmm1
        movapd  %xmm0, %xmm2
        unpcklpd        %xmm3, %xmm2 # xmm2 = xmm2[0],xmm3[0]
        unpckhpd        %xmm1, %xmm3 # xmm3 = xmm3[1],xmm1[1]
        shufpd  $2, %xmm0, %xmm1 # xmm1 = xmm1[0],xmm0[1]
        movupd  %xmm1, 16(%r13,%rbp)
        movupd  %xmm3, 32(%r13,%rbp)
        movupd  %xmm2, (%r13,%rbp)
        addq    $2, %rcx
        addq    $48, %rbp
 cmpq    %rcx, %r14
        jne     .LBB2_5
```

As we can see, there are fewer index calculations and more compute instructions in the code generated by classic flang. Any idea why the code generated by LLVM flang contains so many index calculations?

_______________________________________________
llvm-bugs mailing list
[email protected]
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-bugs

[llvm-bugs] [Bug 168682] [flang][openmp] inefficient code generation for openmp parallel loops?

Reply via email to