https://gcc.gnu.org/bugzilla/show_bug.cgi?id=99394
Bug ID: 99394
Summary: s254 benchmark of TSVC is vectorized by clang and not
by gcc
Product: gcc
Version: 11.0
Status: UNCONFIRMED
Severity: normal
Priority: P3
Component: middle-end
Assignee: unassigned at gcc dot gnu.org
Reporter: hubicka at gcc dot gnu.org
Target Milestone: ---
Clang vectorizes the s254 loop with -march=native on znver2, leading to about a
758% speedup. The loop is:
// s254 benchmark kernel: for each i, stores into a[i] the average of b[i]
// and the element of b that precedes it, wrapping around to b[LEN_1D-1]
// for i == 0. The preceding element is held in the scalar x, which is
// carried from one inner-loop iteration to the next.
//
// func_args: receives the start (t1) and end (t2) timestamps of the timed
//            region via gettimeofday().
// Returns the value produced by calc_checksum() for this function's name.
real_t s254(struct args_t * func_args)
{
// scalar and array expansion
// carry around variable
// Both helpers below are keyed by this function's name (__func__).
initialise_arrays(__func__);
gettimeofday(&func_args->t1, NULL);
// x holds the value of b read on the previous inner-loop iteration.
real_t x;
for (int nl = 0; nl < 4*iterations; nl++) {
// Seed the carried value with the last element so that a[0] pairs b[0]
// with b[LEN_1D-1].
x = b[LEN_1D-1];
for (int i = 0; i < LEN_1D; i++) {
// Average of the current element and the carried (previous) element.
a[i] = (b[i] + x) * (real_t).5;
// Carry the current element forward for the next iteration; this
// loop-carried scalar is what a vectorizer has to handle.
x = b[i];
}
// NOTE(review): dummy() is defined elsewhere; presumably an opaque call
// that keeps the compiler from collapsing the repeated outer iterations
// -- confirm against its definition.
dummy(a, b, c, d, e, aa, bb, cc, 0.);
}
gettimeofday(&func_args->t2, NULL);
return calc_checksum(__func__);
}
and clang produces:
0000000000407d30 <s254>:
407d30: 41 56 push %r14
407d32: 53 push %rbx
407d33: 48 83 ec 28 sub $0x28,%rsp
407d37: 49 89 fe mov %rdi,%r14
407d3a: bf 6b e2 42 00 mov $0x42e26b,%edi
407d3f: e8 cc f8 00 00 call 417610 <initialise_arrays>
407d44: 31 db xor %ebx,%ebx
407d46: 4c 89 f7 mov %r14,%rdi
407d49: 31 f6 xor %esi,%esi
407d4b: e8 10 93 ff ff call 401060 <gettimeofday@plt>
407d50: c4 62 7d 18 05 af 62 vbroadcastss 0x262af(%rip),%ymm8
# 42e008 <_IO_stdin_used+0x8>
407d57: 02 00
407d59: c5 7c 11 04 24 vmovups %ymm8,(%rsp)
407d5e: 66 90 xchg %ax,%ax
407d60: 48 c7 c0 00 0c fe ff mov $0xfffffffffffe0c00,%rax
407d67: c4 e2 7d 18 05 8c a7 vbroadcastss 0x4a78c(%rip),%ymm0
# 4524fc <b+0x1f3fc>
407d6e: 04 00
407d70: c5 fc 28 88 00 25 45 vmovaps 0x452500(%rax),%ymm1
407d77: 00
407d78: c5 fc 28 90 20 25 45 vmovaps 0x452520(%rax),%ymm2
407d7f: 00
407d80: c5 fc 28 98 40 25 45 vmovaps 0x452540(%rax),%ymm3
407d87: 00
407d88: c4 e3 7d 06 c1 21 vperm2f128 $0x21,%ymm1,%ymm0,%ymm0
407d8e: c5 fc 28 a0 60 25 45 vmovaps 0x452560(%rax),%ymm4
407d95: 00
407d96: c5 fc c6 c1 03 vshufps $0x3,%ymm1,%ymm0,%ymm0
407d9b: c5 fc c6 c1 98 vshufps $0x98,%ymm1,%ymm0,%ymm0
407da0: c4 e3 75 06 ea 21 vperm2f128 $0x21,%ymm2,%ymm1,%ymm5
407da6: c5 d4 c6 ea 03 vshufps $0x3,%ymm2,%ymm5,%ymm5
407dab: c5 d4 c6 ea 98 vshufps $0x98,%ymm2,%ymm5,%ymm5
407db0: c4 e3 6d 06 f3 21 vperm2f128 $0x21,%ymm3,%ymm2,%ymm6
407db6: c5 cc c6 f3 03 vshufps $0x3,%ymm3,%ymm6,%ymm6
407dbb: c5 cc c6 f3 98 vshufps $0x98,%ymm3,%ymm6,%ymm6
407dc0: c4 e3 65 06 fc 21 vperm2f128 $0x21,%ymm4,%ymm3,%ymm7
407dc6: c5 c4 c6 fc 03 vshufps $0x3,%ymm4,%ymm7,%ymm7
407dcb: c5 c4 c6 fc 98 vshufps $0x98,%ymm4,%ymm7,%ymm7
407dd0: c5 f4 58 c0 vaddps %ymm0,%ymm1,%ymm0
407dd4: c5 ec 58 cd vaddps %ymm5,%ymm2,%ymm1
407dd8: c5 e4 58 d6 vaddps %ymm6,%ymm3,%ymm2
407ddc: c5 dc 58 df vaddps %ymm7,%ymm4,%ymm3
407de0: c5 bc 59 c0 vmulps %ymm0,%ymm8,%ymm0
407de4: c5 bc 59 c9 vmulps %ymm1,%ymm8,%ymm1
407de8: c5 bc 59 d2 vmulps %ymm2,%ymm8,%ymm2
407dec: c5 bc 59 db vmulps %ymm3,%ymm8,%ymm3
407df0: c5 fc 29 80 00 19 47 vmovaps %ymm0,0x471900(%rax)
407df7: 00
407df8: c5 fc 29 88 20 19 47 vmovaps %ymm1,0x471920(%rax)
407dff: 00
407e00: c5 fc 29 90 40 19 47 vmovaps %ymm2,0x471940(%rax)
407e07: 00
407e08: c5 fc 29 98 60 19 47 vmovaps %ymm3,0x471960(%rax)
407e0f: 00
407e10: c5 fc 28 c4 vmovaps %ymm4,%ymm0
407e14: 48 83 e8 80 sub $0xffffffffffffff80,%rax
407e18: 0f 85 52 ff ff ff jne 407d70 <s254+0x40>
407e1e: bf 00 25 45 00 mov $0x452500,%edi
407e23: be 00 31 43 00 mov $0x433100,%esi
407e28: ba 00 19 47 00 mov $0x471900,%edx
407e2d: b9 00 0d 49 00 mov $0x490d00,%ecx
407e32: 41 b8 00 01 4b 00 mov $0x4b0100,%r8d
407e38: 41 b9 00 f5 4c 00 mov $0x4cf500,%r9d
407e3e: c5 f8 57 c0 vxorps %xmm0,%xmm0,%xmm0
407e42: 68 00 f5 54 00 push $0x54f500
407e47: 68 00 f5 50 00 push $0x50f500
407e4c: c5 f8 77 vzeroupper
407e4f: e8 6c db 00 00 call 4159c0 <dummy>
407e54: c5 7c 10 44 24 10 vmovups 0x10(%rsp),%ymm8
407e5a: 48 83 c4 10 add $0x10,%rsp
407e5e: 83 c3 01 add $0x1,%ebx
407e61: 81 fb 80 1a 06 00 cmp $0x61a80,%ebx
407e67: 0f 85 f3 fe ff ff jne 407d60 <s254+0x30>
407e6d: 49 83 c6 10 add $0x10,%r14
407e71: 4c 89 f7 mov %r14,%rdi
407e74: 31 f6 xor %esi,%esi
407e76: c5 f8 77 vzeroupper
407e79: e8 e2 91 ff ff call 401060 <gettimeofday@plt>
407e7e: bf 6b e2 42 00 mov $0x42e26b,%edi
407e83: 48 83 c4 28 add $0x28,%rsp
407e87: 5b pop %rbx
407e88: 41 5e pop %r14
407e8a: e9 71 f1 01 00 jmp 427000 <calc_checksum>
407e8f: 90 nop