> -----Original Message-----
> From: Jakub Jelinek <[email protected]>
> Sent: Saturday, May 16, 2026 4:45 PM
> To: Uros Bizjak <[email protected]>; Liu, Hongtao <[email protected]>
> Cc: [email protected]
> Subject: [PATCH] i386: Implement bitreverse<mode>2 optab for GFNI
> [PR50481]
> 
> Hi!
> 
> The following patch implements the bitreverse<mode>2 optab for
> -mgfni -msse2 (SSE2 because apparently -mgfni doesn't imply -msse nor -msse2).
> This is done by using gf2p8affineqb insn with a special constant which
> reverses bits in each byte, and for modes wider than QImode also by doing a
> byteswap afterwards.
> With -m64 it emits
> .LC0:
>         .byte   1, 2, 4, 8, 16, 32, 64, -128
>         .byte   1, 2, 4, 8, 16, 32, 64, -128
> and
>         movd    %edi, %xmm0
>         gf2p8affineqb   $0, .LC0(%rip), %xmm0
>         movd    %xmm0, %eax
> for __builtin_bitreverse8,
>         movd    %edi, %xmm0
>         gf2p8affineqb   $0, .LC0(%rip), %xmm0
>         movd    %xmm0, %eax
>         rolw    $8, %ax
> for __builtin_bitreverse16,
>         movd    %edi, %xmm0
>         gf2p8affineqb   $0, .LC0(%rip), %xmm0
>         movd    %xmm0, %eax
>         bswap   %eax
> for __builtin_bitreverse32,
>         movq    %rdi, %xmm0
>         gf2p8affineqb   $0, .LC0(%rip), %xmm0
>         movq    %xmm0, %rax
>         bswap   %rax
> for __builtin_bitreverse64, and
>         movq    %rdi, %xmm0
>         pinsrq  $1, %rsi, %xmm0
>         gf2p8affineqb   $0, .LC0(%rip), %xmm0
>         movq    %xmm0, %rax
>         pextrq  $1, %xmm0, %rdx
>         bswap   %rax
>         bswap   %rdx
>         xchgq   %rdx, %rax
> for __builtin_bitreverse128 (only the xchgq is unnecessary and surprising,
> some RA issue).
> 
> Bootstrapped/regtested on x86_64-linux and i686-linux, ok for trunk?
> 
> 2026-05-16  Jakub Jelinek  <[email protected]>
> 
>       PR target/50481
>       * config/i386/i386-protos.h (ix86_expand_gfni_bitreverse): Declare.
>       * config/i386/i386-expand.cc (ix86_expand_gfni_bitreverse): New
>       function.
>       * config/i386/i386.md (bitreverse<mode>2): New expander.
> 
>       * gcc.target/i386/gfni-builtin-bitreverse-1.c: New test.
> 
> --- gcc/config/i386/i386-protos.h.jj  2026-03-27 10:17:13.850335374
> +0100
> +++ gcc/config/i386/i386-protos.h     2026-05-15 14:25:47.587708641
> +0200
> @@ -263,6 +263,7 @@ extern rtx ix86_expand_ternlog (machine_
>                               int idx, rtx target);
>  extern void ix86_expand_vector_sf2bf_with_vec_perm (rtx, rtx);  extern void
> ix86_expand_vector_bf2sf_with_vec_perm (rtx, rtx);
> +extern void ix86_expand_gfni_bitreverse (rtx, rtx);
> 
> 
>  #ifdef TREE_CODE
> --- gcc/config/i386/i386-expand.cc.jj 2026-05-01 14:52:30.166900240
> +0200
> +++ gcc/config/i386/i386-expand.cc    2026-05-15 16:03:32.784427891
> +0200
> @@ -28069,5 +28069,122 @@ ix86_expand_vector_bf2sf_with_vec_perm (
>    emit_move_insn (dest, lowpart_subreg (GET_MODE (dest), target,
> vperm_mode));  }
> 
> +/* Implement bitreverse<mode>2 using gf2p8affineqb.  */
> +
> +void
> +ix86_expand_gfni_bitreverse (rtx dest, rtx src) {
> +  machine_mode mode = GET_MODE (dest);
> +  rtx temp;
> +  if (GET_MODE_SIZE (mode) > UNITS_PER_WORD)
> +    {
> +      rtx temp1 = gen_reg_rtx (mode == TImode ? V2DImode : V4SImode);
> +      rtx temp2 = gen_reg_rtx (mode == TImode ? V2DImode : V4SImode);
> +      if (mode == TImode)
> +     {
> +       temp = lowpart_subreg (DImode, src, TImode);
> +       emit_insn (gen_rtx_SET (temp1, gen_rtx_VEC_CONCAT (V2DImode,
> temp,
> +                                                          const0_rtx)));
> +       temp = gen_rtx_SUBREG (DImode, src, 8);

src could be in memory, so wouldn't it be better to use gen_highpart, which
calls adjust_address_nv?

> +       emit_insn (gen_rtx_SET (temp2, gen_rtx_VEC_CONCAT (V2DImode,
> temp,
> +                                                          const0_rtx)));
> +     }
> +      else
> +     {
> +       temp = lowpart_subreg (SImode, src, DImode);
> +       emit_insn (gen_vec_setv4si_0 (temp1, CONST0_RTX (V4SImode),
> temp));
> +       temp = gen_rtx_SUBREG (SImode, src, 4);

Ditto: src could be in memory here as well, so gen_highpart would be better.
The rest LGTM.

> +       emit_insn (gen_vec_setv4si_0 (temp2, CONST0_RTX (V4SImode),
> temp));
> +       temp1 = lowpart_subreg (V2DImode, temp1, V4SImode);
> +       temp2 = lowpart_subreg (V2DImode, temp2, V4SImode);
> +     }
> +      temp = gen_reg_rtx (V2DImode);
> +      emit_insn (gen_vec_interleave_lowv2di (temp, temp1, temp2));
> +    }
> +  else if (mode != DImode)
> +    {
> +      if (mode != SImode)
> +     {
> +       src = force_reg (mode, src);
> +       src = lowpart_subreg (SImode, src, mode);
> +     }
> +      temp = gen_reg_rtx (V4SImode);
> +      emit_insn (gen_vec_setv4si_0 (temp, CONST0_RTX (V4SImode), src));
> +    }
> +  else
> +    {
> +      temp = gen_reg_rtx (V2DImode);
> +      emit_insn (gen_rtx_SET (temp, gen_rtx_VEC_CONCAT (V2DImode, src,
> +                                                     const0_rtx)));
> +    }
> +  src = temp;
> +  temp = gen_reg_rtx (V16QImode);
> +  rtx src2 = gen_rtx_CONST_VECTOR (V16QImode,
> +                                gen_rtvec (16, GEN_INT (1), GEN_INT (2),
> +                                           GEN_INT (4), GEN_INT (8),
> +                                           GEN_INT (16), GEN_INT (32),
> +                                           GEN_INT (64), GEN_INT (-128),
> +                                           GEN_INT (1), GEN_INT (2),
> +                                           GEN_INT (4), GEN_INT (8),
> +                                           GEN_INT (16), GEN_INT (32),
> +                                           GEN_INT (64), GEN_INT (-128)));
> +  src2 = validize_mem (force_const_mem (V16QImode, src2));
> +  src = lowpart_subreg (V16QImode, src, GET_MODE (src));
> +  emit_insn (gen_vgf2p8affineqb_v16qi (temp, src, src2, const0_rtx));
> +  if (mode == QImode)
> +    {
> +      rtx temp1 = gen_reg_rtx (SImode);
> +      rtx temp2 = lowpart_subreg (V4SImode, temp, V16QImode);
> +      rtx temp3 = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, const0_rtx));
> +      emit_insn (gen_rtx_SET (temp1,
> +                           gen_rtx_VEC_SELECT (SImode, temp2, temp3)));
> +      emit_move_insn (dest, lowpart_subreg (QImode, temp1, SImode));
> +      return;
> +    }
> +  rtx target = gen_reg_rtx ((GET_MODE_SIZE (mode) < 4 || !TARGET_64BIT)
> +                         ? SImode : mode == TImode ? DImode : mode);
> +  emit_move_insn (target, lowpart_subreg (GET_MODE (target), temp,
> +V16QImode));
> +  if (GET_MODE_SIZE (mode) > UNITS_PER_WORD)
> +    {
> +      rtx temp1 = gen_reg_rtx (GET_MODE (target));
> +      if (mode == TImode || TARGET_SSE4_1)
> +     {
> +       rtx temp2 = lowpart_subreg (mode == TImode ? V2DImode :
> V4SImode,
> +                                   temp, V16QImode);
> +       rtx temp3 = gen_rtx_PARALLEL (VOIDmode,
> +                                     gen_rtvec (1, GEN_INT (mode ==
> TImode
> +                                                            ? 1 : 2)));
> +       emit_insn (gen_rtx_SET (temp1,
> +                               gen_rtx_VEC_SELECT (GET_MODE (target),
> temp2,
> +                                                   temp3)));
> +     }
> +      else
> +     {
> +       rtx temp2 = gen_reg_rtx (V4SImode);
> +       rtx temp3 = lowpart_subreg (V4SImode, temp, V16QImode);
> +       emit_insn (gen_sse2_pshufd (temp2, temp3, GEN_INT (0xaa)));
> +       emit_move_insn (temp1, lowpart_subreg (GET_MODE (target),
> temp2,
> +                                              V4SImode));
> +     }
> +      rtx temp4 = gen_reg_rtx (GET_MODE (target));
> +      rtx temp5 = gen_reg_rtx (GET_MODE (target));
> +      rtx (*gen_bswap) (rtx, rtx)
> +     = mode == TImode ? gen_bswapdi2 : gen_bswapsi2;
> +      emit_insn (gen_bswap (temp4, target));
> +      emit_insn (gen_bswap (temp5, temp1));
> +      temp4 = gen_rtx_ZERO_EXTEND (mode, temp4);
> +      temp5 = gen_rtx_ZERO_EXTEND (mode, temp5);
> +      rtx shift = GEN_INT (GET_MODE_PRECISION (GET_MODE (target)));
> +      temp4 = gen_rtx_ASHIFT (mode, temp4, shift);
> +      emit_insn (gen_rtx_SET (dest, gen_rtx_IOR (mode, temp4, temp5)));
> +      return;
> +    }
> +  if (mode == HImode)
> +    target = lowpart_subreg (mode, target, SImode);
> +  if (mode == SImode)
> +    emit_insn (gen_bswapsi2 (dest, target));
> +  else
> +    emit_insn (gen_rtx_SET (dest, gen_rtx_BSWAP (mode, target))); }
> 
>  #include "gt-i386-expand.h"
> --- gcc/config/i386/i386.md.jj        2026-05-15 08:31:22.766559483 +0200
> +++ gcc/config/i386/i386.md   2026-05-15 15:24:38.109933975 +0200
> @@ -23157,6 +23157,15 @@ (define_peephole2
>    operands[3] = gen_lowpart (HImode, operands[2]);
>  })
> 
> +(define_expand "bitreverse<mode>2"
> +  [(set (match_operand:SWIDWI 0 "register_operand")
> +     (bitreverse:SWIDWI (match_operand:SWIDWI 1
> "nonimmediate_operand")))]
> +  "TARGET_GFNI && TARGET_SSE2"
> +{
> +  ix86_expand_gfni_bitreverse (operands[0], operands[1]);
> +  DONE;
> +})
> +
>  (define_expand "paritydi2"
>    [(set (match_operand:DI 0 "register_operand")
>       (parity:DI (match_operand:DI 1 "register_operand")))]
> --- gcc/testsuite/gcc.target/i386/gfni-builtin-bitreverse-1.c.jj      
> 2026-05-15
> 16:10:20.505533901 +0200
> +++ gcc/testsuite/gcc.target/i386/gfni-builtin-bitreverse-1.c 2026-05-15
> 16:10:13.093659229 +0200
> @@ -0,0 +1,13 @@
> +/* { dg-do run { target gfni } } */
> +/* { dg-options "-mgfni -O2" } */
> +
> +#define main do_test
> +#include "../../gcc.dg/builtin-bitreverse-1.c"
> +#undef main
> +
> +int
> +main ()
> +{
> +  if (__builtin_cpu_supports ("gfni"))
> +    do_test ();
> +}
> 
>       Jakub

Reply via email to