https://gcc.gnu.org/bugzilla/show_bug.cgi?id=108410

--- Comment #4 from Richard Biener <rguenth at gcc dot gnu.org> ---
Adding data for fully masked AVX512 (the 512f column below) and AVX512 with a
masked epilog (512e):

size   scalar     128     256     512    512e    512f
    1    9.42   11.32    9.35   11.17   15.13   16.89
    2    5.72    6.53    6.66    6.66    7.62    8.56
    3    4.49    5.10    5.10    5.74    5.08    5.73
    4    4.10    4.33    4.29    5.21    3.79    4.25
    6    3.78    3.85    3.86    4.76    2.54    2.85
    8    3.64    1.89    3.76    4.50    1.92    2.16
   12    3.56    2.21    3.75    4.26    1.26    1.42
   16    3.36    0.83    1.06    4.16    0.95    1.07
   20    3.39    1.42    1.33    4.07    0.75    0.85
   24    3.23    0.66    1.72    4.22    0.62    0.70
   28    3.18    1.09    2.04    4.20    0.54    0.61
   32    3.16    0.47    0.41    0.41    0.47    0.53
   34    3.16    0.67    0.61    0.56    0.44    0.50
   38    3.19    0.95    0.95    0.82    0.40    0.45
   42    3.09    0.58    1.21    1.13    0.36    0.40

text sizes (in the same column order as above) are not much different:

         1389    1837    2125    1629    1721    1689

The AVX2 size is large because we completely peel the scalar epilogue;
the same holds for the SSE case.  The scalar epilogue of the 512 loop
iterates 32 times (too many for peeling), and the masked loop/epilogue
variants are quite large due to the EVEX-encoded instructions, so the
savings from the omitted scalar/vector epilogues do not show up.

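For reference, the kernel being vectorized is an unsigned-char rounding
average; that is what the vpavgb below computes.  A minimal sketch, with the
function name, signature and argument order being assumptions:

  /* Sketch of the presumed kernel: vpavgb implements the rounding
     average (a + b + 1) >> 1 on unsigned bytes.  */
  void
  avg (unsigned char *restrict dst, const unsigned char *restrict src1,
       const unsigned char *restrict src2, int n)
  {
    for (int i = 0; i < n; i++)
      dst[i] = (src1[i] + src2[i] + 1) >> 1;
  }
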
The AVX512 masked epilogue case now looks like:

        .p2align 3
.L5:
        vmovdqu8        (%r8,%rax), %zmm0
        vpavgb  (%rsi,%rax), %zmm0, %zmm0
        vmovdqu8        %zmm0, (%rdi,%rax)
        addq    $64, %rax
        cmpq    %rcx, %rax
        jne     .L5
        movl    %edx, %ecx
        andl    $-64, %ecx
        testb   $63, %dl
        je      .L19
.L4:
        movl    %ecx, %eax
        subl    %ecx, %edx
        movl    $255, %ecx
        cmpl    %ecx, %edx
        cmova   %ecx, %edx
        vpbroadcastb    %edx, %zmm0
        vpcmpub $6, .LC0(%rip), %zmm0, %k1
        vmovdqu8        (%rsi,%rax), %zmm0{%k1}{z}
        vmovdqu8        (%r8,%rax), %zmm1{%k1}{z}
        vpavgb  %zmm1, %zmm0, %zmm0
        vmovdqu8        %zmm0, (%rdi,%rax){%k1}
.L19:
        vzeroupper
        ret

where there's a missed optimization around the saturation to 255.

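The epilogue mask is built by clamping the number of remaining scalar
iterations to the element range, broadcasting that, and comparing it
element-wise (vpcmpub predicate 6, unsigned greater-than) against an index
vector; .LC0 presumably holds {0, 1, ..., 63}.  In C-like terms, roughly (a
sketch, names hypothetical):

  /* Sketch of the epilogue mask computation (assumes .LC0 = {0, 1, ..., 63}).  */
  unsigned long long
  epilogue_mask (unsigned int n)
  {
    unsigned int remaining = n - (n & -64);       /* scalar iterations left (< 64)     */
    unsigned char v = remaining > 255 ? 255 : remaining;  /* clamp, cf. the cmova      */
    unsigned long long k1 = 0;
    for (unsigned int i = 0; i < 64; i++)
      if (v > i)                                  /* vpcmpub $6: unsigned >            */
        k1 |= 1ull << i;                          /* lane i active when i < remaining  */
    return k1;
  }

Since remaining is already less than 64 at this point, the clamp to 255 can
never trigger, which is presumably the missed optimization mentioned above.
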
The fully masked AVX512 loop is

        vmovdqa64       .LC0(%rip), %zmm3
        movl    $255, %eax
        cmpl    %eax, %ecx 
        cmovbe  %ecx, %eax
        vpbroadcastb    %eax, %zmm0
        vpcmpub $6, %zmm3, %zmm0, %k1
        .p2align 4
        .p2align 3
.L4:
        vmovdqu8        (%rsi,%rax), %zmm1{%k1}
        vmovdqu8        (%r8,%rax), %zmm2{%k1}
        movl    %r10d, %edx
        movl    $255, %ecx
        subl    %eax, %edx
        cmpl    %ecx, %edx
        cmova   %ecx, %edx
        vpavgb  %zmm2, %zmm1, %zmm0
        vmovdqu8        %zmm0, (%rdi,%rax){%k1}
        vpbroadcastb    %edx, %zmm0
        addq    $64, %rax
        movl    %r9d, %edx
        subl    %eax, %edx
        vpcmpub $6, %zmm3, %zmm0, %k1
        cmpl    $64, %edx
        ja      .L4
        vzeroupper
        ret

which is a much larger loop body due to the mask creation.  At least
that interleaves nicely (dependence-wise) with the loop control and the
vectorized stmts.  What needs to be optimized somehow is what IVOPTs
makes of combining the decreasing remaining-scalar-iterations IV with
the IV required for the memory accesses.  Without IVOPTs the body looks
like

.L4:
        vmovdqu8        (%rsi), %zmm1{%k1}
        vmovdqu8        (%rdx), %zmm2{%k1}
        movl    $255, %eax
        movl    %ecx, %r8d
        subl    $64, %ecx
        addq    $64, %rsi
        addq    $64, %rdx
        vpavgb  %zmm2, %zmm1, %zmm0
        vmovdqu8        %zmm0, (%rdi){%k1}
        addq    $64, %rdi
        cmpl    %eax, %ecx
        cmovbe  %ecx, %eax
        vpbroadcastb    %eax, %zmm0
        vpcmpub $6, %zmm3, %zmm0, %k1
        cmpl    $64, %r8d
        ja      .L4

and the key thing to optimize is

  ivtmp_78 = ivtmp_77 + 4294967232; // -64
  _79 = MIN_EXPR <ivtmp_78, 255>;
  _80 = (unsigned char) _79;
  _81 = {_80, _80, _80, _80, _80, _80, _80, _80, _80, _80, _80, _80, _80, _80,
_80, _80, _80, _80, _80, _80, _80, _80, _80, _80, _80, _80, _80, _80, _80, _80,
_80, _80, _80, _80, _80, _80, _80, _80, _80, _80, _80, _80, _80, _80, _80, _80,
_80, _80, _80, _80, _80, _80, _80, _80, _80, _80, _80, _80, _80, _80, _80, _80,
_80, _80};

that is, we want to broadcast a value saturated to the vector element
precision.

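In C-like terms the sequence above amounts to roughly this (a sketch; the
helper name and signature are hypothetical):

  /* Sketch: what the GIMPLE above computes each iteration.  */
  void
  update_mask_iv (unsigned int *remaining, unsigned char vec[64])
  {
    *remaining -= 64;                             /* ivtmp_78 = ivtmp_77 - 64        */
    unsigned int r = *remaining;
    unsigned char m = r > 255 ? 255 : r;          /* MIN_EXPR <ivtmp_78, 255>, then
                                                     narrowed to unsigned char       */
    for (int i = 0; i < 64; i++)
      vec[i] = m;                                 /* _81: 64-lane broadcast of the
                                                     saturated value                 */
  }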