> Am 27.10.2023 um 07:50 schrieb liuhongt <hongtao....@intel.com>:
> 
> When the two vectors are equal, the kmask is all ones and kortest sets CF;
> otherwise CF is cleared.
> 
> So CF bit can be used to check for the result of the comparison.
> 
> Bootstrapped and regtested on x86_64-pc-linux-gnu{-m32,}.
> Ok for trunk?

Is that also profitable for 256-bit vectors, aka AVX10?
Is there a jump-on-carry for the case where the result feeds control flow
rather than a value, and would using ktest be better in that case (does
combine figure this out)?

> Before:
>        vmovdqu (%rsi), %ymm0
>        vpxorq  (%rdi), %ymm0, %ymm0
>        vptest  %ymm0, %ymm0
>        jne     .L2
>        vmovdqu 32(%rsi), %ymm0
>        vpxorq  32(%rdi), %ymm0, %ymm0
>        vptest  %ymm0, %ymm0
>        je      .L5
> .L2:
>        movl    $1, %eax
>        xorl    $1, %eax
>        vzeroupper
>        ret
> 
> After:
>        vmovdqu64       (%rsi), %zmm0
>        xorl    %eax, %eax
>        vpcmpeqd        (%rdi), %zmm0, %k0
>        kortestw        %k0, %k0
>        setc    %al
>        vzeroupper
>        ret
> 
> gcc/ChangeLog:
> 
>    PR target/104610
>    * config/i386/i386-expand.cc (ix86_expand_branch): Handle
>    512-bit vector with vpcmpeq + kortest.
>    * config/i386/i386.md (cbranchxi4): New expander.
>    * config/i386/sse.md (cbranch<mode>4): Extend to V16SImode
>    and V8DImode.
> 
> gcc/testsuite/ChangeLog:
> 
>    * gcc.target/i386/pr104610-2.c: New test.
> ---
> gcc/config/i386/i386-expand.cc             | 55 +++++++++++++++-------
> gcc/config/i386/i386.md                    | 16 +++++++
> gcc/config/i386/sse.md                     | 36 +++++++++++---
> gcc/testsuite/gcc.target/i386/pr104610-2.c | 14 ++++++
> 4 files changed, 99 insertions(+), 22 deletions(-)
> create mode 100644 gcc/testsuite/gcc.target/i386/pr104610-2.c
> 
> diff --git a/gcc/config/i386/i386-expand.cc b/gcc/config/i386/i386-expand.cc
> index 1eae9d7c78c..c664cb61e80 100644
> --- a/gcc/config/i386/i386-expand.cc
> +++ b/gcc/config/i386/i386-expand.cc
> @@ -2411,30 +2411,53 @@ ix86_expand_branch (enum rtx_code code, rtx op0, rtx op1, rtx label)
>   rtx tmp;
> 
>   /* Handle special case - vector comparsion with boolean result, transform
> -     it using ptest instruction.  */
> +     it using ptest instruction or vpcmpeq + kortest.  */
>   if (GET_MODE_CLASS (mode) == MODE_VECTOR_INT
>       || (mode == TImode && !TARGET_64BIT)
> -      || mode == OImode)
> +      || mode == OImode
> +      || GET_MODE_SIZE (mode) == 64)
>     {
> -      rtx flag = gen_rtx_REG (CCZmode, FLAGS_REG);
> -      machine_mode p_mode = GET_MODE_SIZE (mode) == 32 ? V4DImode : V2DImode;
> +      unsigned msize = GET_MODE_SIZE (mode);
> +      machine_mode p_mode
> +    = msize == 64 ? V16SImode : msize == 32 ? V4DImode : V2DImode;
> +      /* kortest sets CF when the result is 0xFFFF (op0 == op1).  */
> +      rtx flag = gen_rtx_REG (msize == 64 ? CCCmode : CCZmode, FLAGS_REG);
> 
>       gcc_assert (code == EQ || code == NE);
> 
> -      if (GET_MODE_CLASS (mode) != MODE_VECTOR_INT)
> +      /* Using vpcmpeq zmm zmm k + kortest for 512-bit vectors.  */
> +      if (msize == 64)
>    {
> -      op0 = lowpart_subreg (p_mode, force_reg (mode, op0), mode);
> -      op1 = lowpart_subreg (p_mode, force_reg (mode, op1), mode);
> -      mode = p_mode;
> +      if (mode != V16SImode)
> +        {
> +          op0 = lowpart_subreg (p_mode, force_reg (mode, op0), mode);
> +          op1 = lowpart_subreg (p_mode, force_reg (mode, op1), mode);
> +        }
> +
> +      tmp = gen_reg_rtx (HImode);
> +      emit_insn (gen_avx512f_cmpv16si3 (tmp, op0, op1, GEN_INT (0)));
> +      emit_insn (gen_kortesthi_ccc (tmp, tmp));
> +    }
> +      /* Using ptest for 128/256-bit vectors.  */
> +      else
> +    {
> +      if (GET_MODE_CLASS (mode) != MODE_VECTOR_INT)
> +        {
> +          op0 = lowpart_subreg (p_mode, force_reg (mode, op0), mode);
> +          op1 = lowpart_subreg (p_mode, force_reg (mode, op1), mode);
> +          mode = p_mode;
> +        }
> +
> +      /* Generate XOR since we can't check that one operand is zero
> +         vector.  */
> +      tmp = gen_reg_rtx (mode);
> +      emit_insn (gen_rtx_SET (tmp, gen_rtx_XOR (mode, op0, op1)));
> +      tmp = gen_lowpart (p_mode, tmp);
> +      emit_insn (gen_rtx_SET (gen_rtx_REG (CCZmode, FLAGS_REG),
> +                  gen_rtx_UNSPEC (CCZmode,
> +                          gen_rtvec (2, tmp, tmp),
> +                          UNSPEC_PTEST)));
>    }
> -      /* Generate XOR since we can't check that one operand is zero vector.  */
> -      tmp = gen_reg_rtx (mode);
> -      emit_insn (gen_rtx_SET (tmp, gen_rtx_XOR (mode, op0, op1)));
> -      tmp = gen_lowpart (p_mode, tmp);
> -      emit_insn (gen_rtx_SET (gen_rtx_REG (CCZmode, FLAGS_REG),
> -                  gen_rtx_UNSPEC (CCZmode,
> -                          gen_rtvec (2, tmp, tmp),
> -                          UNSPEC_PTEST)));
>       tmp = gen_rtx_fmt_ee (code, VOIDmode, flag, const0_rtx);
>       tmp = gen_rtx_IF_THEN_ELSE (VOIDmode, tmp,
>                  gen_rtx_LABEL_REF (VOIDmode, label),
> diff --git a/gcc/config/i386/i386.md b/gcc/config/i386/i386.md
> index abaf2f311e8..51d8d0c3b97 100644
> --- a/gcc/config/i386/i386.md
> +++ b/gcc/config/i386/i386.md
> @@ -1442,6 +1442,22 @@ (define_expand "cbranchoi4"
>   DONE;
> })
> 
> +(define_expand "cbranchxi4"
> +  [(set (reg:CC FLAGS_REG)
> +    (compare:CC (match_operand:XI 1 "nonimmediate_operand")
> +            (match_operand:XI 2 "nonimmediate_operand")))
> +   (set (pc) (if_then_else
> +           (match_operator 0 "bt_comparison_operator"
> +        [(reg:CC FLAGS_REG) (const_int 0)])
> +           (label_ref (match_operand 3))
> +           (pc)))]
> +  "TARGET_AVX512F && TARGET_EVEX512 && !TARGET_PREFER_AVX256"
> +{
> +  ix86_expand_branch (GET_CODE (operands[0]),
> +              operands[1], operands[2], operands[3]);
> +  DONE;
> +})
> +
> (define_expand "cstore<mode>4"
>   [(set (reg:CC FLAGS_REG)
>    (compare:CC (match_operand:SDWIM 2 "nonimmediate_operand")
> diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md
> index c988935d4df..88fb1154699 100644
> --- a/gcc/config/i386/sse.md
> +++ b/gcc/config/i386/sse.md
> @@ -2175,9 +2175,9 @@ (define_insn "ktest<mode>"
>    (set_attr "type" "msklog")
>    (set_attr "prefix" "vex")])
> 
> -(define_insn "kortest<mode>"
> -  [(set (reg:CC FLAGS_REG)
> -    (unspec:CC
> +(define_insn "*kortest<mode>"
> +  [(set (reg FLAGS_REG)
> +    (unspec
>      [(match_operand:SWI1248_AVX512BWDQ 0 "register_operand" "k")
>       (match_operand:SWI1248_AVX512BWDQ 1 "register_operand" "k")]
>      UNSPEC_KORTEST))]
> @@ -2187,6 +2187,30 @@ (define_insn "kortest<mode>"
>    (set_attr "type" "msklog")
>    (set_attr "prefix" "vex")])
> 
> +(define_insn "kortest<mode>_ccc"
> +  [(set (reg:CCC FLAGS_REG)
> +    (unspec:CCC
> +      [(match_operand:SWI1248_AVX512BWDQ 0 "register_operand")
> +       (match_operand:SWI1248_AVX512BWDQ 1 "register_operand")]
> +      UNSPEC_KORTEST))]
> +  "TARGET_AVX512F")
> +
> +(define_insn "kortest<mode>_ccz"
> +  [(set (reg:CCZ FLAGS_REG)
> +    (unspec:CCZ
> +      [(match_operand:SWI1248_AVX512BWDQ 0 "register_operand")
> +       (match_operand:SWI1248_AVX512BWDQ 1 "register_operand")]
> +      UNSPEC_KORTEST))]
> +  "TARGET_AVX512F")
> +
> +(define_expand "kortest<mode>"
> +  [(set (reg:CC FLAGS_REG)
> +    (unspec:CC
> +      [(match_operand:SWI1248_AVX512BWDQ 0 "register_operand")
> +       (match_operand:SWI1248_AVX512BWDQ 1 "register_operand")]
> +      UNSPEC_KORTEST))]
> +  "TARGET_AVX512F")
> +
> (define_insn "kunpckhi"
>   [(set (match_operand:HI 0 "register_operand" "=k")
>    (ior:HI
> @@ -27840,14 +27864,14 @@ (define_insn "<avx512>_store<mode>_mask"
> 
> (define_expand "cbranch<mode>4"
>   [(set (reg:CC FLAGS_REG)
> -    (compare:CC (match_operand:VI48_AVX 1 "register_operand")
> -            (match_operand:VI48_AVX 2 "nonimmediate_operand")))
> +    (compare:CC (match_operand:VI48_AVX_AVX512F 1 "register_operand")
> +            (match_operand:VI48_AVX_AVX512F 2 "nonimmediate_operand")))
>    (set (pc) (if_then_else
>           (match_operator 0 "bt_comparison_operator"
>        [(reg:CC FLAGS_REG) (const_int 0)])
>           (label_ref (match_operand 3))
>           (pc)))]
> -  "TARGET_SSE4_1"
> +  "TARGET_SSE4_1 && (<MODE_SIZE> != 64 || !TARGET_PREFER_AVX256)"
> {
>   ix86_expand_branch (GET_CODE (operands[0]),
>              operands[1], operands[2], operands[3]);
> diff --git a/gcc/testsuite/gcc.target/i386/pr104610-2.c b/gcc/testsuite/gcc.target/i386/pr104610-2.c
> new file mode 100644
> index 00000000000..999ef926a18
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/i386/pr104610-2.c
> @@ -0,0 +1,14 @@
> +/* { dg-do compile } */
> +/* { dg-options "-mavx512f -O2 -mtune=generic" } */
> +/* { dg-final { scan-assembler-times {(?n)vpcmpeq.*zmm} 2 } } */
> +/* { dg-final { scan-assembler-times {(?n)kortest.*k[0-7]} 2 } } */
> +
> +int compare (const char* s1, const char* s2)
> +{
> +  return __builtin_memcmp (s1, s2, 64) == 0;
> +}
> +
> +int compare1 (const char* s1, const char* s2)
> +{
> +  return __builtin_memcmp (s1, s2, 64) != 0;
> +}
> -- 
> 2.31.1
> 

Reply via email to