[Bug target/92686] New: Inefficient mask operation for 128/256-bit vector VCOND_EXPR under avx512f

crazylht at gmail dot com Tue, 26 Nov 2019 19:17:36 -0800

https://gcc.gnu.org/bugzilla/show_bug.cgi?id=92686


            Bug ID: 92686
           Summary: Inefficient mask operation for 128/256-bit vector
                    VCOND_EXPR under avx512f
           Product: gcc
           Version: 10.0
            Status: UNCONFIRMED
          Severity: normal
          Priority: P3
         Component: target
          Assignee: unassigned at gcc dot gnu.org
          Reporter: crazylht at gmail dot com
                CC: hjl.tools at gmail dot com, wwwhhhyyy333 at gmail dot com
  Target Milestone: ---
            Target: i386, x86-64

cat test.c

/* Reproducer: for each of the 16 elements, overwrite dst[x] with src1[x]
   when src1[x] > src2[x]; otherwise dst[x] is stored back unchanged.
   All three pointers are __restrict-qualified.  */
void mc_weight( unsigned int *__restrict dst, unsigned int *__restrict src1,
int *__restrict src2)
{
        for( int x = 0; x < 16; x++ )
            /* src1[x] is unsigned int and src2[x] is int, so the usual
               arithmetic conversions make this an unsigned comparison.  */
            dst[x] = src1[x] > src2[x] ? src1[x] : dst[x];
}

With option -Ofast -march=skylake-avx512

GCC uses a vector (ymm) register as the mask and vpblendvb for the conditional vector move:

        vmovdqu32       (%rsi), %ymm0
        vpminud (%rdx), %ymm0, %ymm1
        vpcmpeqd        %ymm1, %ymm0, %ymm1
        vpblendvb       %ymm1, (%rdi), %ymm0, %ymm0
        vmovdqu32       %ymm0, (%rdi)
        vmovdqu32       32(%rsi), %ymm0
        vpminud 32(%rdx), %ymm0, %ymm1
        vpcmpeqd        %ymm1, %ymm0, %ymm1
        vpblendvb       %ymm1, 32(%rdi), %ymm0, %ymm0
        vmovdqu32       %ymm0, 32(%rdi)
        vzeroupper


But AVX512F provides mask registers, so the code could be better as:

        vmovdqu   (%rsi), %ymm0                                 #5.25
        vmovdqu   32(%rsi), %ymm1                               #5.25
        vpcmpud   $6, (%rdx), %ymm0, %k1                        #5.25
        vpcmpud   $6, 32(%rdx), %ymm1, %k2                      #5.25
        vmovdqu32 %ymm0, (%rdi){%k1}                            #5.6
        vmovdqu32 %ymm1, 32(%rdi){%k2}                          #5.6
        vzeroupper                                              #6.1
        ret                                                     #6.1

That's because GCC currently only handles 512-bit vectors:
------------------------
 3437  /* In AVX512F the result of comparison is an integer mask.  */           
 3438  bool maskcmp = false;                                                    
 3439  rtx x;                                                                   
 3440                                                                           
 3441  if (GET_MODE_SIZE (cmp_ops_mode) == 64)                                  
 3442    {                                                                      
 3443      unsigned int nbits = GET_MODE_NUNITS (cmp_ops_mode);                 
 3444      cmp_mode = int_mode_for_size (nbits, 0).require ();                  
 3445      maskcmp = true;                                                      
 3446    }                                                                      
 3447  else          
------------------------

With the additional option -mprefer-vector-width=512, GCC generates:

        vmovdqu32       (%rsi), %zmm0
        vpminud (%rdx), %zmm0, %zmm1
        vpcmpeqd        %zmm1, %zmm0, %k1
        vmovdqu32       (%rdi), %zmm0{%k1}
        vmovdqu32       %zmm0, (%rdi)
        vzeroupper
        ret

Since mask registers are tied to the ISA rather than to the vector size, under
AVX512F we can also use masked conditional moves for 128/256-bit vectors.

[Bug target/92686] New: Inefficient mask operation for 128/256-bit vector VCOND_EXPR under avx512f

Reply via email to