> Am 27.10.2023 um 07:50 schrieb liuhongt <hongtao....@intel.com>:
>
> When 2 vectors are equal, kmask is allones and kortest will set CF,
> else CF will be cleared.
>
> So CF bit can be used to check for the result of the comparison.
>
> Bootstrapped and regtested on x86_64-pc-linux-gnu{-m32,}.
> Ok for trunk?
Is that also profitable for 256-bit vectors, i.e. AVX10?
Is there a jump-on-carry for the case where the result feeds control flow
rather than a value, and would ktest be the better choice in that case
(does combine figure this out)?
> Before:
> vmovdqu (%rsi), %ymm0
> vpxorq (%rdi), %ymm0, %ymm0
> vptest %ymm0, %ymm0
> jne .L2
> vmovdqu 32(%rsi), %ymm0
> vpxorq 32(%rdi), %ymm0, %ymm0
> vptest %ymm0, %ymm0
> je .L5
> .L2:
> movl $1, %eax
> xorl $1, %eax
> vzeroupper
> ret
>
> After:
> vmovdqu64 (%rsi), %zmm0
> xorl %eax, %eax
> vpcmpeqd (%rdi), %zmm0, %k0
> kortestw %k0, %k0
> setc %al
> vzeroupper
> ret
>
> gcc/ChangeLog:
>
> PR target/104610
> * config/i386/i386-expand.cc (ix86_expand_branch): Handle
> 512-bit vector with vpcmpeq + kortest.
> * config/i386/i386.md (cbranchxi4): New expander.
> * config/i386/sse.md (cbranch<mode>4): Extend to V16SImode
> and V8DImode.
>
> gcc/testsuite/ChangeLog:
>
> * gcc.target/i386/pr104610-2.c: New test.
> ---
> gcc/config/i386/i386-expand.cc | 55 +++++++++++++++-------
> gcc/config/i386/i386.md | 16 +++++++
> gcc/config/i386/sse.md | 36 +++++++++++---
> gcc/testsuite/gcc.target/i386/pr104610-2.c | 14 ++++++
> 4 files changed, 99 insertions(+), 22 deletions(-)
> create mode 100644 gcc/testsuite/gcc.target/i386/pr104610-2.c
>
> diff --git a/gcc/config/i386/i386-expand.cc b/gcc/config/i386/i386-expand.cc
> index 1eae9d7c78c..c664cb61e80 100644
> --- a/gcc/config/i386/i386-expand.cc
> +++ b/gcc/config/i386/i386-expand.cc
> @@ -2411,30 +2411,53 @@ ix86_expand_branch (enum rtx_code code, rtx op0, rtx op1, rtx label)
> rtx tmp;
>
> /* Handle special case - vector comparsion with boolean result, transform
> - it using ptest instruction. */
> + it using ptest instruction or vpcmpeq + kortest. */
> if (GET_MODE_CLASS (mode) == MODE_VECTOR_INT
> || (mode == TImode && !TARGET_64BIT)
> - || mode == OImode)
> + || mode == OImode
> + || GET_MODE_SIZE (mode) == 64)
> {
> - rtx flag = gen_rtx_REG (CCZmode, FLAGS_REG);
> - machine_mode p_mode = GET_MODE_SIZE (mode) == 32 ? V4DImode : V2DImode;
> + unsigned msize = GET_MODE_SIZE (mode);
> + machine_mode p_mode
> + = msize == 64 ? V16SImode : msize == 32 ? V4DImode : V2DImode;
> + /* kortest set CF when result is 0xFFFF (op0 == op1). */
> + rtx flag = gen_rtx_REG (msize == 64 ? CCCmode : CCZmode, FLAGS_REG);
>
> gcc_assert (code == EQ || code == NE);
>
> - if (GET_MODE_CLASS (mode) != MODE_VECTOR_INT)
> + /* Using vpcmpeq zmm zmm k + kortest for 512-bit vectors. */
> + if (msize == 64)
> {
> - op0 = lowpart_subreg (p_mode, force_reg (mode, op0), mode);
> - op1 = lowpart_subreg (p_mode, force_reg (mode, op1), mode);
> - mode = p_mode;
> + if (mode != V16SImode)
> + {
> + op0 = lowpart_subreg (p_mode, force_reg (mode, op0), mode);
> + op1 = lowpart_subreg (p_mode, force_reg (mode, op1), mode);
> + }
> +
> + tmp = gen_reg_rtx (HImode);
> + emit_insn (gen_avx512f_cmpv16si3 (tmp, op0, op1, GEN_INT (0)));
> + emit_insn (gen_kortesthi_ccc (tmp, tmp));
> + }
> + /* Using ptest for 128/256-bit vectors. */
> + else
> + {
> + if (GET_MODE_CLASS (mode) != MODE_VECTOR_INT)
> + {
> + op0 = lowpart_subreg (p_mode, force_reg (mode, op0), mode);
> + op1 = lowpart_subreg (p_mode, force_reg (mode, op1), mode);
> + mode = p_mode;
> + }
> +
> + /* Generate XOR since we can't check that one operand is zero
> + vector. */
> + tmp = gen_reg_rtx (mode);
> + emit_insn (gen_rtx_SET (tmp, gen_rtx_XOR (mode, op0, op1)));
> + tmp = gen_lowpart (p_mode, tmp);
> + emit_insn (gen_rtx_SET (gen_rtx_REG (CCZmode, FLAGS_REG),
> + gen_rtx_UNSPEC (CCZmode,
> + gen_rtvec (2, tmp, tmp),
> + UNSPEC_PTEST)));
> }
> - /* Generate XOR since we can't check that one operand is zero vector. */
> - tmp = gen_reg_rtx (mode);
> - emit_insn (gen_rtx_SET (tmp, gen_rtx_XOR (mode, op0, op1)));
> - tmp = gen_lowpart (p_mode, tmp);
> - emit_insn (gen_rtx_SET (gen_rtx_REG (CCZmode, FLAGS_REG),
> - gen_rtx_UNSPEC (CCZmode,
> - gen_rtvec (2, tmp, tmp),
> - UNSPEC_PTEST)));
> tmp = gen_rtx_fmt_ee (code, VOIDmode, flag, const0_rtx);
> tmp = gen_rtx_IF_THEN_ELSE (VOIDmode, tmp,
> gen_rtx_LABEL_REF (VOIDmode, label),
> diff --git a/gcc/config/i386/i386.md b/gcc/config/i386/i386.md
> index abaf2f311e8..51d8d0c3b97 100644
> --- a/gcc/config/i386/i386.md
> +++ b/gcc/config/i386/i386.md
> @@ -1442,6 +1442,22 @@ (define_expand "cbranchoi4"
> DONE;
> })
>
> +(define_expand "cbranchxi4"
> + [(set (reg:CC FLAGS_REG)
> + (compare:CC (match_operand:XI 1 "nonimmediate_operand")
> + (match_operand:XI 2 "nonimmediate_operand")))
> + (set (pc) (if_then_else
> + (match_operator 0 "bt_comparison_operator"
> + [(reg:CC FLAGS_REG) (const_int 0)])
> + (label_ref (match_operand 3))
> + (pc)))]
> + "TARGET_AVX512F && TARGET_EVEX512 && !TARGET_PREFER_AVX256"
> +{
> + ix86_expand_branch (GET_CODE (operands[0]),
> + operands[1], operands[2], operands[3]);
> + DONE;
> +})
> +
> (define_expand "cstore<mode>4"
> [(set (reg:CC FLAGS_REG)
> (compare:CC (match_operand:SDWIM 2 "nonimmediate_operand")
> diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md
> index c988935d4df..88fb1154699 100644
> --- a/gcc/config/i386/sse.md
> +++ b/gcc/config/i386/sse.md
> @@ -2175,9 +2175,9 @@ (define_insn "ktest<mode>"
> (set_attr "type" "msklog")
> (set_attr "prefix" "vex")])
>
> -(define_insn "kortest<mode>"
> - [(set (reg:CC FLAGS_REG)
> - (unspec:CC
> +(define_insn "*kortest<mode>"
> + [(set (reg FLAGS_REG)
> + (unspec
> [(match_operand:SWI1248_AVX512BWDQ 0 "register_operand" "k")
> (match_operand:SWI1248_AVX512BWDQ 1 "register_operand" "k")]
> UNSPEC_KORTEST))]
> @@ -2187,6 +2187,30 @@ (define_insn "kortest<mode>"
> (set_attr "type" "msklog")
> (set_attr "prefix" "vex")])
>
> +(define_insn "kortest<mode>_ccc"
> + [(set (reg:CCC FLAGS_REG)
> + (unspec:CCC
> + [(match_operand:SWI1248_AVX512BWDQ 0 "register_operand")
> + (match_operand:SWI1248_AVX512BWDQ 1 "register_operand")]
> + UNSPEC_KORTEST))]
> + "TARGET_AVX512F")
> +
> +(define_insn "kortest<mode>_ccz"
> + [(set (reg:CCZ FLAGS_REG)
> + (unspec:CCZ
> + [(match_operand:SWI1248_AVX512BWDQ 0 "register_operand")
> + (match_operand:SWI1248_AVX512BWDQ 1 "register_operand")]
> + UNSPEC_KORTEST))]
> + "TARGET_AVX512F")
> +
> +(define_expand "kortest<mode>"
> + [(set (reg:CC FLAGS_REG)
> + (unspec:CC
> + [(match_operand:SWI1248_AVX512BWDQ 0 "register_operand")
> + (match_operand:SWI1248_AVX512BWDQ 1 "register_operand")]
> + UNSPEC_KORTEST))]
> + "TARGET_AVX512F")
> +
> (define_insn "kunpckhi"
> [(set (match_operand:HI 0 "register_operand" "=k")
> (ior:HI
> @@ -27840,14 +27864,14 @@ (define_insn "<avx512>_store<mode>_mask"
>
> (define_expand "cbranch<mode>4"
> [(set (reg:CC FLAGS_REG)
> - (compare:CC (match_operand:VI48_AVX 1 "register_operand")
> - (match_operand:VI48_AVX 2 "nonimmediate_operand")))
> + (compare:CC (match_operand:VI48_AVX_AVX512F 1 "register_operand")
> + (match_operand:VI48_AVX_AVX512F 2 "nonimmediate_operand")))
> (set (pc) (if_then_else
> (match_operator 0 "bt_comparison_operator"
> [(reg:CC FLAGS_REG) (const_int 0)])
> (label_ref (match_operand 3))
> (pc)))]
> - "TARGET_SSE4_1"
> + "TARGET_SSE4_1 && (<MODE_SIZE> != 64 || !TARGET_PREFER_AVX256)"
> {
> ix86_expand_branch (GET_CODE (operands[0]),
> operands[1], operands[2], operands[3]);
> diff --git a/gcc/testsuite/gcc.target/i386/pr104610-2.c b/gcc/testsuite/gcc.target/i386/pr104610-2.c
> new file mode 100644
> index 00000000000..999ef926a18
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/i386/pr104610-2.c
> @@ -0,0 +1,14 @@
> +/* { dg-do compile } */
> +/* { dg-options "-mavx512f -O2 -mtune=generic" } */
> +/* { dg-final { scan-assembler-times {(?n)vpcmpeq.*zmm} 2 } } */
> +/* { dg-final { scan-assembler-times {(?n)kortest.*k[0-7]} 2 } } */
> +
> +int compare (const char* s1, const char* s2)
> +{
> + return __builtin_memcmp (s1, s2, 64) == 0;
> +}
> +
> +int compare1 (const char* s1, const char* s2)
> +{
> + return __builtin_memcmp (s1, s2, 64) != 0;
> +}
> --
> 2.31.1
>