[Bug tree-optimization/92283] [10 Regression] 454.calculix miscomparison since r276645 with -O2 -march=znver2

rguenth at gcc dot gnu.org Wed, 06 Nov 2019 06:53:43 -0800

https://gcc.gnu.org/bugzilla/show_bug.cgi?id=92283


--- Comment #9 from Richard Biener <rguenth at gcc dot gnu.org> ---
So unrolling the inner loop of

846                  if(iperturb.ge.2) then
                        do m3=1,3
                           do m4=1,3
                              fn(m2,konl(m1))=fn(m2,konl(m1))+
     &                             xsj*skl(m4,m3)*weight*
     &                             (vkl(m2,m4)*shp(m3,m1)+
     &                             vkl(m2,m3)*shp(m4,m1))/2.d0
                           enddo
                        enddo
                     endif

is enough to trigger the failure for example but I don't see anything wrong
happening here.  -fdbgcnt=gimple_unroll:0:24 succeeds, 0:25 fails but
25:26 succeeds (only above loop unrolled).  There are subtle association
differences but entirely within the scope of the Fortran standard
(parentheses honored, only the adds are associated).
Given other loops play into this as well here,
the issue is probably inconsistencies leading to failed cancellation or
simply too much jitter....

Thus we're unlucky(?)

Building results.f with -fno-associative-math doesn't help, but I'm not
sure how well the frontend copes with having its choice of default
option values for those overridden.

Not unrolled loop:

.L169:
# results.f:852:      &                             vkl(m2,m3)*shp(m4,m1))/2.d0
        vmovsd  -40(%rdi,%r8,8), %xmm8  # MEM[base: _4159, index:
ivtmp.1147_4167, step: 8, offset: -40B], _1031
        movq    %r8, %rax       # ivtmp.1147, tmp2656
        salq    $5, %rax        #, tmp2656
        vmovsd  -32(%rcx,%rax), %xmm6   # MEM[base: _4156, index: _4155,
offset: -32B], _1036
        xorl    %eax, %eax      # ivtmp.1141
.L167:
        vmulsd  %xmm0, %xmm5, %xmm0     # pretmp_5253, xsj.389_997, tmp2657
        vmulsd  %xmm4, %xmm0, %xmm1     # weight, tmp2657, _1023
        vmulsd  -32(%rdi,%rax), %xmm6, %xmm0    # MEM[base: _4159, index:
ivtmp.1141_4178, offset: -32B], _1036, tmp2658
        vfmadd231sd     (%rcx,%rax,4), %xmm8, %xmm0     # MEM[base: _4156,
index: ivtmp.1141_4178, step: 4, offset: 0B], _1031, _1041
# results.f:848:                            do m4=1,3
        addq    $8, %rax        #, ivtmp.1141
# results.f:852:      &                             vkl(m2,m3)*shp(m4,m1))/2.d0
        vmulsd  %xmm1, %xmm0, %xmm0     # _1023, _1041, tmp2659
        vfmadd231sd     %xmm7, %xmm0, %xmm3     # tmp3594, tmp2659,
fn__I_lsm.737
# results.f:848:                            do m4=1,3
        cmpq    $24, %rax       #, ivtmp.1141
        je      .L166   #,
# results.f:852:      &                             vkl(m2,m3)*shp(m4,m1))/2.d0
        vmovsd  -24(%r9,%rax), %xmm0    # MEM[base: _4168, index:
ivtmp.1141_4177, offset: -24B], pretmp_5253
        jmp     .L167   #
        .p2align 4
        .p2align 3
.L166:
# results.f:847:                         do m3=1,3
        incq    %r8     # ivtmp.1147
        cmpq    $4, %r8 #, ivtmp.1147
        je      .L168   #,
# results.f:852:      &                             vkl(m2,m3)*shp(m4,m1))/2.d0
        vmovsd  (%r9), %xmm0    # MEM[base: _4154, offset: 0B], pretmp_5253
        addq    $24, %r9        #, ivtmp.1164
        jmp     .L169   #

unrolled one:

.L167:
        vmovsd  (%rdi), %xmm15  # MEM[base: _4141, offset: 0B], _1036
        vmovsd  (%r8), %xmm0    # MEM[base: _4142, offset: 0B], _1031
# results.f:847:                         do m3=1,3
        addq    $24, %rdx       #, ivtmp.1140
        addq    $8, %r8 #, ivtmp.1132
        addq    $32, %rdi       #, ivtmp.1136
# results.f:852:      &                             vkl(m2,m3)*shp(m4,m1))/2.d0
        vmulsd  %xmm12, %xmm6, %xmm12   # pretmp_5248, xsj.389_997, tmp2676
        vmulsd  %xmm4, %xmm12, %xmm13   # weight, tmp2676, _4551
        vmulsd  %xmm10, %xmm15, %xmm12  # _4543, _1036, tmp2677
        vfmadd231sd     %xmm11, %xmm0, %xmm12   # _4547, _1031, _4541
        vmulsd  %xmm13, %xmm12, %xmm12  # _4551, _4541, tmp2678
        vmulsd  -16(%rdx), %xmm6, %xmm13        # MEM[base: _4140, offset: 8B],
xsj.389_997, tmp2680
        vfmadd132sd     %xmm7, %xmm1, %xmm12    # tmp3622, fn__I_lsm.737, _4537
        vmulsd  %xmm8, %xmm15, %xmm1    # _4515, _1036, tmp2681
        vmulsd  %xmm3, %xmm15, %xmm15   # _1039, _1036, tmp2685
        vfmadd231sd     %xmm9, %xmm0, %xmm1     # _4519, _1031, _4513
        vfmadd132sd     %xmm5, %xmm15, %xmm0    # _1027, tmp2685, _1041
        vmulsd  %xmm4, %xmm13, %xmm13   # weight, tmp2680, _4523
        vmulsd  %xmm13, %xmm1, %xmm1    # _4523, _4513, tmp2682
        vfmadd132sd     %xmm7, %xmm12, %xmm1    # tmp3622, _4537, _4509
        vmulsd  -8(%rdx), %xmm6, %xmm12 # MEM[base: _4140, offset: 16B],
xsj.389_997, tmp2684
        vmulsd  %xmm4, %xmm12, %xmm12   # weight, tmp2684, _1023
        vmulsd  %xmm12, %xmm0, %xmm0    # _1023, _1041, tmp2686
        vfmadd231sd     %xmm7, %xmm0, %xmm1     # tmp3622, tmp2686,
fn__I_lsm.737
# results.f:847:                         do m3=1,3
        cmpq    %rdx, %r12      # ivtmp.1140, _4136
        je      .L166   #,
# results.f:852:      &                             vkl(m2,m3)*shp(m4,m1))/2.d0
        vmovsd  (%rdx), %xmm12  # MEM[base: _4138, offset: 0B], pretmp_5248
        jmp     .L167   #

but as said, not unrolling others while still unrolling this one doesn't
help.

Disabling FMA with -mno-fma helps.

-fdbgcnt=gimple_unroll:25:27 unrolling the preceding loop at 839 as well
(there's CSE and thus more association opportunities then) doesn't trigger
the failure on its own either.

I'm going to test & commit the dbgcnt patch.

[Bug tree-optimization/92283] [10 Regression] 454.calculix miscomparison since r276645 with -O2 -march=znver2

Reply via email to