From d048b87a626112c1629d075d23f74cade7b2fb2e Mon Sep 17 00:00:00 2001
From: liuhongt <hongtao.liu@intel.com>
Date: Mon, 15 Jun 2020 13:48:45 +0800
Subject: [PATCH] Optimize V16QI/V32QI/V64QI shift by constant.

gcc/ChangeLog:
	PR target/95524
	* gcc/config/i386/i386-expand.c
	(ix86_expand_vec_shift_qihi_constant): New function.
	* gcc/config/i386/i386-protos.h: Declare.
	* gcc/config/i386/sse.md: Optimize shift V*QImode by constant.

gcc/testsuite/ChangeLog:
	* gcc.target/i386/avx2-shiftqihi-constant-1.c: New test.
	* gcc.target/i386/avx2-shiftqihi-constant-2.c: Ditto.
	* gcc.target/i386/avx512bw-shiftqihi-constant-1.c: Ditto.
	* gcc.target/i386/avx512bw-shiftqihi-constant-2.c: Ditto.
	* gcc.target/i386/sse2-shiftqihi-constant-1.c: Ditto.
	* gcc.target/i386/sse2-shiftqihi-constant-2.c: Ditto.
---
 gcc/config/i386/i386-expand.c                 | 102 ++++++++++++++++++
 gcc/config/i386/i386-protos.h                 |   1 +
 gcc/config/i386/sse.md                        |   3 +
 .../i386/avx2-shiftqihi-constant-1.c          |  31 ++++++
 .../i386/avx2-shiftqihi-constant-2.c          |  62 +++++++++++
 .../i386/avx512bw-shiftqihi-constant-1.c      |  31 ++++++
 .../i386/avx512bw-shiftqihi-constant-2.c      |  62 +++++++++++
 .../i386/sse2-shiftqihi-constant-1.c          |  31 ++++++
 .../i386/sse2-shiftqihi-constant-2.c          |  62 +++++++++++
 9 files changed, 385 insertions(+)
 create mode 100644 gcc/testsuite/gcc.target/i386/avx2-shiftqihi-constant-1.c
 create mode 100644 gcc/testsuite/gcc.target/i386/avx2-shiftqihi-constant-2.c
 create mode 100644 gcc/testsuite/gcc.target/i386/avx512bw-shiftqihi-constant-1.c
 create mode 100644 gcc/testsuite/gcc.target/i386/avx512bw-shiftqihi-constant-2.c
 create mode 100644 gcc/testsuite/gcc.target/i386/sse2-shiftqihi-constant-1.c
 create mode 100644 gcc/testsuite/gcc.target/i386/sse2-shiftqihi-constant-2.c

diff --git a/gcc/config/i386/i386-expand.c b/gcc/config/i386/i386-expand.c
index 3a414f69b3b..24e47058da0 100644
--- a/gcc/config/i386/i386-expand.c
+++ b/gcc/config/i386/i386-expand.c
@@ -19532,6 +19532,108 @@ ix86_expand_vecmul_qihi (rtx dest, rtx op1, rtx op2)
   return true;
 }
 
+/* Expand a vector operation shift by constant for a V*QImode in terms of the
+   same operation on V*HImode. Return true if success. */
+bool
+ix86_expand_vec_shift_qihi_constant (enum rtx_code code, rtx dest, rtx op1, rtx op2)
+{
+  machine_mode qimode, himode;
+  unsigned int shift_constant, and_constant, xor_constant;
+  rtx vec_const_and, vec_const_xor;
+  rtx tmp, op1_subreg;
+  rtx (*gen_shift) (rtx, rtx, rtx);
+  rtx (*gen_and) (rtx, rtx, rtx);
+  rtx (*gen_xor) (rtx, rtx, rtx);
+  rtx (*gen_sub) (rtx, rtx, rtx);
+
+  /* Only optimize shift by constant.  */
+  if (!CONST_INT_P (op2))
+    return false;
+
+  qimode = GET_MODE (dest);
+  shift_constant = INTVAL (op2);
+  /* Shift constant greater equal 8 result into 0.  */
+  if (shift_constant > 7)
+    {
+      if (code == ASHIFT || code == LSHIFTRT)
+	{
+	  emit_move_insn (dest, CONST0_RTX (qimode));
+	  return true;
+	}
+      /* Sign bit not known.  */
+      else if (code == ASHIFTRT)
+	return false;
+      else
+	gcc_unreachable ();
+    }
+
+  gcc_assert (code == ASHIFT || code == ASHIFTRT || code == LSHIFTRT);
+  /* Record sign bit.  */
+  xor_constant = 1 << (8 - shift_constant - 1);
+
+  /* Zero upper/lower bits shift from left/right element.  */
+  and_constant = code == ASHIFT ? 256 - (1 << shift_constant) :
+    (1 << (8 - shift_constant)) - 1;
+
+  switch (qimode)
+    {
+    case V16QImode:
+      himode = V8HImode;
+      gen_shift = (code == ASHIFT) ? gen_ashlv8hi3 :
+	(code == ASHIFTRT) ? gen_ashrv8hi3 : gen_lshrv8hi3;
+      gen_and = gen_andv16qi3;
+      gen_xor = gen_xorv16qi3;
+      gen_sub = gen_subv16qi3;
+      break;
+    case V32QImode:
+      himode = V16HImode;
+      gen_shift = (code == ASHIFT) ? gen_ashlv16hi3 :
+	(code == ASHIFTRT) ? gen_ashrv16hi3 : gen_lshrv16hi3;
+      gen_and = gen_andv32qi3;
+      gen_xor = gen_xorv32qi3;
+      gen_sub = gen_subv32qi3;
+      break;
+    case V64QImode:
+      himode = V32HImode;
+      gen_shift = (code == ASHIFT) ? gen_ashlv32hi3 :
+	(code == ASHIFTRT) ? gen_ashrv32hi3 : gen_lshrv32hi3;
+      gen_and = gen_andv64qi3;
+      gen_xor = gen_xorv64qi3;
+      gen_sub = gen_subv64qi3;
+      break;
+    default:
+      gcc_unreachable ();
+    }
+
+  tmp = gen_reg_rtx (himode);
+  vec_const_and = gen_reg_rtx (qimode);
+  op1_subreg = simplify_gen_subreg (himode, op1, qimode, 0);
+
+  /* For ASHIFT and LSHIFTRT, perform operation like
+     vpsllw/vpsrlw $shift_constant, %op1, %dest.
+     vpand %vec_const_and, %dest.  */
+  emit_insn (gen_shift (tmp, op1_subreg, op2));
+  emit_move_insn (dest, simplify_gen_subreg (qimode, tmp, himode, 0));
+  emit_move_insn (vec_const_and,
+		  ix86_build_const_vector (qimode, true,
+					   GEN_INT (and_constant)));
+  emit_insn (gen_and (dest, dest, vec_const_and));
+
+  /* For ASHIFTRT, perform extra operation like
+     vpxor %vec_const_xor, %dest, %dest
+     vpsubb %vec_const_xor, %dest, %dest  */
+  if (code == ASHIFTRT)
+    {
+      vec_const_xor = gen_reg_rtx (qimode);
+      emit_move_insn (vec_const_xor,
+		      ix86_build_const_vector (qimode, true,
+					       GEN_INT (xor_constant)));
+      emit_insn (gen_xor (dest, dest, vec_const_xor));
+      emit_insn (gen_sub (dest, dest, vec_const_xor));
+    }
+  return true;
+}
+
 /* Expand a vector operation CODE for a V*QImode in terms of the
    same operation on V*HImode.  */
 
diff --git a/gcc/config/i386/i386-protos.h b/gcc/config/i386/i386-protos.h
index f5320494fa1..7c2ce618f3f 100644
--- a/gcc/config/i386/i386-protos.h
+++ b/gcc/config/i386/i386-protos.h
@@ -206,6 +206,7 @@ extern void ix86_expand_round_sse4 (rtx, rtx);
 
 extern bool ix86_expand_vecmul_qihi (rtx, rtx, rtx);
 extern void ix86_expand_vecop_qihi (enum rtx_code, rtx, rtx, rtx);
+extern bool ix86_expand_vec_shift_qihi_constant (enum rtx_code, rtx, rtx, rtx);
 
 extern rtx ix86_split_stack_guard (void);
 
diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md
index aa9fdc87c68..b466950af40 100644
--- a/gcc/config/i386/sse.md
+++ b/gcc/config/i386/sse.md
@@ -19863,6 +19863,9 @@
       gen = (<CODE> == LSHIFTRT ? gen_xop_shlv16qi3 : gen_xop_shav16qi3);
       emit_insn (gen (operands[0], operands[1], tmp));
     }
+  else if (ix86_expand_vec_shift_qihi_constant (<CODE>, operands[0],
+						operands[1], operands[2]))
+    DONE;
   else
     ix86_expand_vecop_qihi (<CODE>, operands[0], operands[1], operands[2]);
   DONE;
diff --git a/gcc/testsuite/gcc.target/i386/avx2-shiftqihi-constant-1.c b/gcc/testsuite/gcc.target/i386/avx2-shiftqihi-constant-1.c
new file mode 100644
index 00000000000..72065039581
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/avx2-shiftqihi-constant-1.c
@@ -0,0 +1,31 @@
+/* PR target/95524 */
+/* { dg-do compile } */
+/* { dg-options "-O2 -mavx2" } */
+/* { dg-final { scan-assembler-times "vpand\[^\n\]*%ymm" 3 } }  */
+typedef char v32qi  __attribute__ ((vector_size (32)));
+typedef unsigned char v32uqi  __attribute__ ((vector_size (32)));
+
+__attribute__((noipa)) v32qi
+foo_ashiftrt_256 (v32qi a)
+{
+  return a >> 2;
+}
+/* { dg-final { scan-assembler-times "vpsraw\[^\n\]*%ymm" 1 } } */
+/* { dg-final { scan-assembler-times "vpxor\[^\n\]*%ymm" 1 } } */
+/* { dg-final { scan-assembler-times "vpsubb\[^\n\]*%ymm" 1 } } */
+
+__attribute__((noipa)) v32qi
+foo_ashift_256 (v32qi a)
+{
+  return a << 7;
+}
+
+/* { dg-final { scan-assembler-times "vpsllw\[^\n\]*%ymm" 1 } }  */
+
+__attribute__((noipa)) v32uqi
+foo_lshiftrt_256 (v32uqi a)
+{
+  return a >> 2;
+}
+
+/* { dg-final { scan-assembler-times "vpsrlw\[^\n\]*%ymm" 1 } }  */
diff --git a/gcc/testsuite/gcc.target/i386/avx2-shiftqihi-constant-2.c b/gcc/testsuite/gcc.target/i386/avx2-shiftqihi-constant-2.c
new file mode 100644
index 00000000000..509d5a8d762
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/avx2-shiftqihi-constant-2.c
@@ -0,0 +1,62 @@
+/* PR target/95524 */
+/* { dg-do run } */
+/* { dg-options "-O2 -mavx512bw -Wno-shift-count-overflow" } */
+
+#ifndef CHECK
+#define CHECK "avx512bw-check.h"
+#endif
+
+#include CHECK
+
+#ifndef TEST
+#define TEST avx512bw_test
+#endif
+
+typedef char v64qi  __attribute__ ((vector_size (64)));
+typedef unsigned char v64uqi  __attribute__ ((vector_size (64)));
+
+#define TEST_SHIFT(N)					\
+  do							\
+    {							\
+      int i;						\
+      for (i = 0; i < 64; i++)				\
+	exp1.a[i] = op1.a[i] << N;			\
+      res1.x = (__m512i) (((v64qi) op1.x) << N);	\
+      if (check_union512i_b (res1, exp1.a))		\
+	abort ();					\
+							\
+      for (i = 0; i < 64; i++)				\
+	exp1.a[i] = op1.a[i] >> N;			\
+      res1.x = (__m512i) (((v64qi) op1.x) >> N);	\
+      if (check_union512i_b (res1, exp1.a))		\
+	abort ();					\
+							\
+      for (i = 0; i < 64; i++)				\
+	exp2.a[i] = op2.a[i] >> N;			\
+      res2.x = (__m512i) (((v64uqi) op2.x >> N));	\
+      if (check_union512i_ub (res2, exp2.a))		\
+	abort ();					\
+    }							\
+  while (0)
+
+static void
+TEST (void)
+{
+  union512i_b op1, exp1, res1;
+  union512i_ub op2, exp2, res2;
+  for (int i = 0; i != 64; i++)
+    {
+      op2.a[i] = i * i;
+      op1.a[i] = i * i + 200 * i;
+    }
+  TEST_SHIFT (0);
+  TEST_SHIFT (1);
+  TEST_SHIFT (2);
+  TEST_SHIFT (3);
+  TEST_SHIFT (4);
+  TEST_SHIFT (5);
+  TEST_SHIFT (6);
+  TEST_SHIFT (7);
+  TEST_SHIFT (8);
+}
+
diff --git a/gcc/testsuite/gcc.target/i386/avx512bw-shiftqihi-constant-1.c b/gcc/testsuite/gcc.target/i386/avx512bw-shiftqihi-constant-1.c
new file mode 100644
index 00000000000..78bf5d33689
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/avx512bw-shiftqihi-constant-1.c
@@ -0,0 +1,31 @@
+/* PR target/95524 */
+/* { dg-do compile } */
+/* { dg-options "-O2 -mavx512bw" } */
+/* { dg-final { scan-assembler-times "vpand\[^\n\]*%zmm" 3 } }  */
+typedef char v64qi  __attribute__ ((vector_size (64)));
+typedef unsigned char v64uqi  __attribute__ ((vector_size (64)));
+
+__attribute__((noipa)) v64qi
+foo_ashiftrt_512 (v64qi a)
+{
+  return a >> 2;
+}
+/* { dg-final { scan-assembler-times "vpsraw\[^\n\]*%zmm" 1 } } */
+/* { dg-final { scan-assembler-times "vpxor\[^\n\]*%zmm" 1 } } */
+/* { dg-final { scan-assembler-times "vpsubb\[^\n\]*%zmm" 1 } } */
+
+__attribute__((noipa)) v64qi
+foo_ashift_512 (v64qi a)
+{
+  return a << 7;
+}
+
+/* { dg-final { scan-assembler-times "vpsllw\[^\n\]*%zmm" 1 } }  */
+
+__attribute__((noipa)) v64uqi
+foo_lshiftrt_512 (v64uqi a)
+{
+  return a >> 2;
+}
+
+/* { dg-final { scan-assembler-times "vpsrlw\[^\n\]*%zmm" 1 } }  */
diff --git a/gcc/testsuite/gcc.target/i386/avx512bw-shiftqihi-constant-2.c b/gcc/testsuite/gcc.target/i386/avx512bw-shiftqihi-constant-2.c
new file mode 100644
index 00000000000..d6f7934f3b4
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/avx512bw-shiftqihi-constant-2.c
@@ -0,0 +1,62 @@
+/* PR target/95524 */
+/* { dg-do run } */
+/* { dg-options "-O2 -mavx2 -Wno-shift-count-overflow" } */
+
+#ifndef CHECK
+#define CHECK "avx2-check.h"
+#endif
+
+#include CHECK
+
+#ifndef TEST
+#define TEST avx2_test
+#endif
+
+typedef char v32qi  __attribute__ ((vector_size (32)));
+typedef unsigned char v32uqi  __attribute__ ((vector_size (32)));
+
+#define TEST_SHIFT(N)					\
+  do							\
+    {							\
+      int i;						\
+      for (i = 0; i < 32; i++)				\
+	exp1.a[i] = op1.a[i] << N;			\
+      res1.x = (__m256i) (((v32qi) op1.x) << N);	\
+      if (check_union256i_b (res1, exp1.a))		\
+	abort ();					\
+							\
+      for (i = 0; i < 32; i++)				\
+	exp1.a[i] = op1.a[i] >> N;			\
+      res1.x = (__m256i) (((v32qi) op1.x) >> N);	\
+      if (check_union256i_b (res1, exp1.a))		\
+	abort ();					\
+							\
+      for (i = 0; i < 32; i++)				\
+	exp2.a[i] = op2.a[i] >> N;			\
+      res2.x = (__m256i) (((v32uqi) op2.x >> N));	\
+      if (check_union256i_ub (res2, exp2.a))		\
+	abort ();					\
+    }							\
+  while (0)
+
+static void
+TEST (void)
+{
+  union256i_b op1, exp1, res1;
+  union256i_ub op2, exp2, res2;
+  for (int i = 0; i != 32; i++)
+    {
+      op2.a[i] = i * i;
+      op1.a[i] = i * i + 200 * i;
+    }
+  TEST_SHIFT (0);
+  TEST_SHIFT (1);
+  TEST_SHIFT (2);
+  TEST_SHIFT (3);
+  TEST_SHIFT (4);
+  TEST_SHIFT (5);
+  TEST_SHIFT (6);
+  TEST_SHIFT (7);
+  TEST_SHIFT (8);
+}
+
diff --git a/gcc/testsuite/gcc.target/i386/sse2-shiftqihi-constant-1.c b/gcc/testsuite/gcc.target/i386/sse2-shiftqihi-constant-1.c
new file mode 100644
index 00000000000..f1c68cb2972
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/sse2-shiftqihi-constant-1.c
@@ -0,0 +1,31 @@
+/* PR target/95524 */
+/* { dg-do compile } */
+/* { dg-options "-O2 -msse2" } */
+/* { dg-final { scan-assembler-times "pand\[^\n\]*%xmm" 3 { xfail *-*-* } } } */
+typedef char v16qi  __attribute__ ((vector_size (16)));
+typedef unsigned char v16uqi  __attribute__ ((vector_size (16)));
+
+__attribute__((noipa)) v16qi
+foo_ashiftrt_128 (v16qi a)
+{
+  return a >> 2;
+}
+/* { dg-final { scan-assembler-times "psraw\[^\n\]*%xmm" 1 } } */
+/* { dg-final { scan-assembler-times "pxor\[^\n\]*%xmm" 1 } } */
+/* { dg-final { scan-assembler-times "psubb\[^\n\]*%xmm" 1 } } */
+
+__attribute__((noipa)) v16qi
+foo_ashift_128 (v16qi a)
+{
+  return a << 7;
+}
+
+/* { dg-final { scan-assembler-times "psllw\[^\n\]*%xmm" 1 { xfail *-*-* } } } */
+
+__attribute__((noipa)) v16uqi
+foo_lshiftrt_128 (v16uqi a)
+{
+  return a >> 2;
+}
+
+/* { dg-final { scan-assembler-times "psrlw\[^\n\]*%xmm" 1 } }  */
diff --git a/gcc/testsuite/gcc.target/i386/sse2-shiftqihi-constant-2.c b/gcc/testsuite/gcc.target/i386/sse2-shiftqihi-constant-2.c
new file mode 100644
index 00000000000..d95171f7a47
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/sse2-shiftqihi-constant-2.c
@@ -0,0 +1,62 @@
+/* PR target/95524 */
+/* { dg-do run } */
+/* { dg-options "-O2 -msse2 -Wno-shift-count-overflow" } */
+
+#ifndef CHECK
+#define CHECK "sse2-check.h"
+#endif
+
+#include CHECK
+
+#ifndef TEST
+#define TEST sse2_test
+#endif
+
+typedef char v16qi  __attribute__ ((vector_size (16)));
+typedef unsigned char v16uqi  __attribute__ ((vector_size (16)));
+
+#define TEST_SHIFT(N)					\
+  do							\
+    {							\
+      int i;						\
+      for (i = 0; i < 16; i++)				\
+	exp1.a[i] = op1.a[i] << N;			\
+      res1.x = (__m128i) (((v16qi) op1.x) << N);	\
+      if (check_union128i_b (res1, exp1.a))		\
+	abort ();					\
+							\
+      for (i = 0; i < 16; i++)				\
+	exp1.a[i] = op1.a[i] >> N;			\
+      res1.x = (__m128i) (((v16qi) op1.x) >> N);	\
+      if (check_union128i_b (res1, exp1.a))		\
+	abort ();					\
+							\
+      for (i = 0; i < 16; i++)				\
+	exp2.a[i] = op2.a[i] >> N;			\
+      res2.x = (__m128i) (((v16uqi) op2.x >> N));	\
+      if (check_union128i_ub (res2, exp2.a))		\
+	abort ();					\
+    }							\
+  while (0)
+
+static void
+TEST (void)
+{
+  union128i_b op1, exp1, res1;
+  union128i_ub op2, exp2, res2;
+  for (int i = 0; i != 16; i++)
+    {
+      op2.a[i] = i * i;
+      op1.a[i] = i * i + 200 * i;
+    }
+  TEST_SHIFT (0);
+  TEST_SHIFT (1);
+  TEST_SHIFT (2);
+  TEST_SHIFT (3);
+  TEST_SHIFT (4);
+  TEST_SHIFT (5);
+  TEST_SHIFT (6);
+  TEST_SHIFT (7);
+  TEST_SHIFT (8);
+}
+
-- 
2.18.1

