Re: [x86 PATCH] Add STV support for DImode and SImode rotations by constant.

2023-06-30 Thread Uros Bizjak via Gcc-patches
On Fri, Jun 30, 2023 at 9:29 AM Roger Sayle  wrote:
>
>
> This patch implements scalar-to-vector (STV) support for DImode and SImode
> rotations by constant bit counts.  Scalar rotations are almost always
> optimal on x86, requiring only one or two instructions, but it is also
> possible to implement these efficiently with SSE2, requiring only one
> or two instructions for SImode rotations and at most 3 instructions for
> DImode rotations.  This allows GCC to STV rotations with little or no
> penalty if there are other (net) benefits to converting a chain.  An
> example of the benefit, based upon the BLAKE2 cryptographic hash
> function, is shown below:
>
> unsigned long long a,b,c,d;
>
> unsigned long rot(unsigned long long x, int y)
> {
>   return (x<<y) | (x>>(64-y));
> }
>
> void foo()
> {
>   d = rot(d ^ a,32);
>   c = c + d;
>   b = rot(b ^ c,24);
>   a = a + b;
>   d = rot(d ^ a,16);
>   c = c + d;
>   b = rot(b ^ c,63);
> }
>
> where with -m32 -O2 -msse2
>
> Before (59 insns, 247 bytes):
> foo:    pushl   %edi
> xorl    %edx, %edx
> pushl   %esi
> pushl   %ebx
> subl    $16, %esp
> movq    a, %xmm1
> movq    d, %xmm0
> movq    b, %xmm2
> pxor    %xmm1, %xmm0
> psrlq   $32, %xmm0
> movd    %xmm0, %eax
> movd    %edx, %xmm0
> movd    %eax, %xmm3
> punpckldq   %xmm0, %xmm3
> movq    c, %xmm0
> paddq   %xmm3, %xmm0
> pxor    %xmm0, %xmm2
> movd    %xmm2, %ecx
> psrlq   $32, %xmm2
> movd    %xmm2, %ebx
> movl    %ecx, %eax
> shldl   $24, %ebx, %ecx
> shldl   $24, %eax, %ebx
> movd    %ebx, %xmm4
> movd    %ecx, %xmm2
> punpckldq   %xmm4, %xmm2
> movdqa  .LC0, %xmm4
> pand    %xmm4, %xmm2
> paddq   %xmm2, %xmm1
> movq    %xmm1, a
> pxor    %xmm3, %xmm1
> movd    %xmm1, %esi
> psrlq   $32, %xmm1
> movd    %xmm1, %edi
> movl    %esi, %eax
> shldl   $16, %edi, %esi
> shldl   $16, %eax, %edi
> movd    %esi, %xmm1
> movd    %edi, %xmm3
> punpckldq   %xmm3, %xmm1
> pand    %xmm4, %xmm1
> movq    %xmm1, d
> paddq   %xmm1, %xmm0
> movq    %xmm0, c
> pxor    %xmm2, %xmm0
> movd    %xmm0, 8(%esp)
> psrlq   $32, %xmm0
> movl    8(%esp), %eax
> movd    %xmm0, 12(%esp)
> movl    12(%esp), %edx
> shrdl   $1, %edx, %eax
> xorl    %edx, %edx
> movl    %eax, b
> movl    %edx, b+4
> addl    $16, %esp
> popl    %ebx
> popl    %esi
> popl    %edi
> ret
>
> After (32 insns, 165 bytes):
> movq    a, %xmm1
> xorl    %edx, %edx
> movq    d, %xmm0
> movq    b, %xmm2
> movdqa  .LC0, %xmm4
> pxor    %xmm1, %xmm0
> psrlq   $32, %xmm0
> movd    %xmm0, %eax
> movd    %edx, %xmm0
> movd    %eax, %xmm3
> punpckldq   %xmm0, %xmm3
> movq    c, %xmm0
> paddq   %xmm3, %xmm0
> pxor    %xmm0, %xmm2
> pshufd  $68, %xmm2, %xmm2
> psrldq  $5, %xmm2
> pand    %xmm4, %xmm2
> paddq   %xmm2, %xmm1
> movq    %xmm1, a
> pxor    %xmm3, %xmm1
> pshuflw $147, %xmm1, %xmm1
> pand    %xmm4, %xmm1
> movq    %xmm1, d
> paddq   %xmm1, %xmm0
> movq    %xmm0, c
> pxor    %xmm2, %xmm0
> pshufd  $20, %xmm0, %xmm0
> psrlq   $1, %xmm0
> pshufd  $136, %xmm0, %xmm0
> pand    %xmm4, %xmm0
> movq    %xmm0, b
> ret
>
>
> This patch has been tested on x86_64-pc-linux-gnu with make bootstrap
> and make -k check, both with and without --target_board=unix{-m32}
> with no new failures.  Ok for mainline?
>
>
> 2023-06-30  Roger Sayle  
>
> gcc/ChangeLog
> * config/i386/i386-features.cc (compute_convert_gain): Provide
> gains/costs for ROTATE and ROTATERT (by an integer constant).
> (general_scalar_chain::convert_rotate): New helper function to
> convert a DImode or SImode rotation by an integer constant into
> SSE vector form.
> (general_scalar_chain::convert_insn): Call the new convert_rotate
> for ROTATE and ROTATERT.
> (general_scalar_to_vector_candidate_p): Consider ROTATE and
> ROTATERT to be candidates if the second operand is an integer
> constant, valid for a rotation (or shift) in the given mode.
> * config/i386/i386-features.h (general_scalar_chain): Add new
> helper method convert_rotate.
>
> gcc/testsuite/ChangeLog
> * gcc.target/i386/rotate-6.c: New test case.
> * gcc.target/i386/sse2-stv-1.c: Likewise.

LGTM.

Please note that AVX512VL provides VPROLD/VPROLQ and VPRORD/VPRORQ
native rotate instructions that can come in handy here.
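
As a minimal illustration of that suggestion (a hand-written intrinsics
sketch, not part of this patch; the function names are made up), each
constant rotate becomes a single instruction once AVX512F/AVX512VL are
enabled:

#include <immintrin.h>

/* Illustrative only: one instruction per constant rotate.  */
static inline __m128i rotl64_by24 (__m128i x)
{
  return _mm_rol_epi64 (x, 24);   /* vprolq $24 */
}

static inline __m128i rotr32_by7 (__m128i x)
{
  return _mm_ror_epi32 (x, 7);    /* vprord $7 */
}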

[x86 PATCH] Add STV support for DImode and SImode rotations by constant.

2023-06-30 Thread Roger Sayle

This patch implements scalar-to-vector (STV) support for DImode and SImode
rotations by constant bit counts.  Scalar rotations are almost always
optimal on x86, requiring only one or two instructions, but it is also
possible to implement these efficiently with SSE2, requiring only one
or two instructions for SImode rotations and at most 3 instructions for
DImode rotations.  This allows GCC to STV rotations with little or no
penalty if there are other (net) benefits to converting a chain.  An
example of the benefit, based upon the BLAKE2 cryptographic hash
function, is shown below:

unsigned long long a,b,c,d;

unsigned long rot(unsigned long long x, int y)
{
  return (x<<y) | (x>>(64-y));
}

void foo()
{
  d = rot(d ^ a,32);
  c = c + d;
  b = rot(b ^ c,24);
  a = a + b;
  d = rot(d ^ a,16);
  c = c + d;
  b = rot(b ^ c,63);
}

where with -m32 -O2 -msse2

Before (59 insns, 247 bytes):
foo:    pushl   %edi
xorl    %edx, %edx
pushl   %esi
pushl   %ebx
subl    $16, %esp
movq    a, %xmm1
movq    d, %xmm0
movq    b, %xmm2
pxor    %xmm1, %xmm0
psrlq   $32, %xmm0
movd    %xmm0, %eax
movd    %edx, %xmm0
movd    %eax, %xmm3
punpckldq   %xmm0, %xmm3
movq    c, %xmm0
paddq   %xmm3, %xmm0
pxor    %xmm0, %xmm2
movd    %xmm2, %ecx
psrlq   $32, %xmm2
movd    %xmm2, %ebx
movl    %ecx, %eax
shldl   $24, %ebx, %ecx
shldl   $24, %eax, %ebx
movd    %ebx, %xmm4
movd    %ecx, %xmm2
punpckldq   %xmm4, %xmm2
movdqa  .LC0, %xmm4
pand    %xmm4, %xmm2
paddq   %xmm2, %xmm1
movq    %xmm1, a
pxor    %xmm3, %xmm1
movd    %xmm1, %esi
psrlq   $32, %xmm1
movd    %xmm1, %edi
movl    %esi, %eax
shldl   $16, %edi, %esi
shldl   $16, %eax, %edi
movd    %esi, %xmm1
movd    %edi, %xmm3
punpckldq   %xmm3, %xmm1
pand    %xmm4, %xmm1
movq    %xmm1, d
paddq   %xmm1, %xmm0
movq    %xmm0, c
pxor    %xmm2, %xmm0
movd    %xmm0, 8(%esp)
psrlq   $32, %xmm0
movl    8(%esp), %eax
movd    %xmm0, 12(%esp)
movl    12(%esp), %edx
shrdl   $1, %edx, %eax
xorl    %edx, %edx
movl    %eax, b
movl    %edx, b+4
addl    $16, %esp
popl    %ebx
popl    %esi
popl    %edi
ret

After (32 insns, 165 bytes):
movq    a, %xmm1
xorl    %edx, %edx
movq    d, %xmm0
movq    b, %xmm2
movdqa  .LC0, %xmm4
pxor    %xmm1, %xmm0
psrlq   $32, %xmm0
movd    %xmm0, %eax
movd    %edx, %xmm0
movd    %eax, %xmm3
punpckldq   %xmm0, %xmm3
movq    c, %xmm0
paddq   %xmm3, %xmm0
pxor    %xmm0, %xmm2
pshufd  $68, %xmm2, %xmm2
psrldq  $5, %xmm2
pand    %xmm4, %xmm2
paddq   %xmm2, %xmm1
movq    %xmm1, a
pxor    %xmm3, %xmm1
pshuflw $147, %xmm1, %xmm1
pand    %xmm4, %xmm1
movq    %xmm1, d
paddq   %xmm1, %xmm0
movq    %xmm0, c
pxor    %xmm2, %xmm0
pshufd  $20, %xmm0, %xmm0
psrlq   $1, %xmm0
pshufd  $136, %xmm0, %xmm0
pand    %xmm4, %xmm0
movq    %xmm0, b
ret
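
As a rough sketch of why these rotations are cheap in the vector domain
(hand-written intrinsics for illustration only, not the code the patch
emits; the function names are made up), a rotation by a multiple of 16
or 32 is a single SSE2 shuffle, and any other constant rotation needs at
most a shift/shift/or triple:

#include <emmintrin.h>

/* Rotate the low 64-bit element left by 32: swap its two dwords.  */
static inline __m128i rotl64_by32 (__m128i x)
{
  return _mm_shuffle_epi32 (x, _MM_SHUFFLE (2, 3, 0, 1));   /* pshufd */
}

/* Rotate the low 64-bit element left by 16: rotate its four words.  */
static inline __m128i rotl64_by16 (__m128i x)
{
  return _mm_shufflelo_epi16 (x, _MM_SHUFFLE (2, 1, 0, 3)); /* pshuflw $147 */
}

/* Any other constant rotation: shift, shift, or -- three instructions.  */
static inline __m128i rotl64_const (__m128i x, const int n)
{
  return _mm_or_si128 (_mm_slli_epi64 (x, n), _mm_srli_epi64 (x, 64 - n));
}

Rotations by other multiples of 8 also fit in two instructions, which is
what the pshufd $68/psrldq $5 pair above does for the rotation by 24:
duplicate the low qword into both halves, then byte-shift the whole
register.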


This patch has been tested on x86_64-pc-linux-gnu with make bootstrap
and make -k check, both with and without --target_board=unix{-m32}
with no new failures.  Ok for mainline?


2023-06-30  Roger Sayle  

gcc/ChangeLog
* config/i386/i386-features.cc (compute_convert_gain): Provide
gains/costs for ROTATE and ROTATERT (by an integer constant).
(general_scalar_chain::convert_rotate): New helper function to
convert a DImode or SImode rotation by an integer constant into
SSE vector form.
(general_scalar_chain::convert_insn): Call the new convert_rotate
for ROTATE and ROTATERT.
(general_scalar_to_vector_candidate_p): Consider ROTATE and
ROTATERT to be candidates if the second operand is an integer
constant, valid for a rotation (or shift) in the given mode.
* config/i386/i386-features.h (general_scalar_chain): Add new
helper method convert_rotate.

gcc/testsuite/ChangeLog
* gcc.target/i386/rotate-6.c: New test case.
* gcc.target/i386/sse2-stv-1.c: Likewise.


Thanks in advance,
Roger
--

diff --git a/gcc/config/i386/i386-features.cc b/gcc/config/i386/i386-features.cc
index 4a3b07a..b98baba 100644
--- a/gcc/config/i386/i386-features.cc
+++ b/gcc/config/i386/i386-features.cc
@@ -582,6 +582,25 @@ general_scalar_chain::compute_convert_gain ()
  igain -= vector_const_cost (XEXP (src, 0));
break;
 
+ case ROTATE:
+ case ROTATERT:
+   igain += m * ix86_cost->shift_const;
+