[gcc r16-271] x86: Add a pass to remove redundant all 0s/1s vector load

H.J. Lu via Gcc-cvs Tue, 29 Apr 2025 00:10:30 -0700

https://gcc.gnu.org/g:d1cada7481420a23fbec525548ef5bdf64839a34


commit r16-271-gd1cada7481420a23fbec525548ef5bdf64839a34
Author: H.J. Lu <hjl.to...@gmail.com>
Date:   Fri Nov 29 18:22:14 2024 +0800

    x86: Add a pass to remove redundant all 0s/1s vector load
    
    For all different modes of all 0s/1s vectors, we can use the single widest
    all 0s/1s vector register for all 0s/1s vector uses in the whole function.
    Add a pass to generate a single widest all 0s/1s vector set instruction at
    entry of the nearest common dominator for basic blocks with all 0s/1s
    vector uses.  On Linux/x86-64, in cc1plus, this patch reduces the number
    of vector xor instructions from 4803 to 4714 and pcmpeq instructions from
    144 to 142.
    
    NB: PR target/92080 and PR target/117839 aren't the same.  PR target/117839
    is for vectors of all 0s and all 1s with different sizes and different
    components.  PR target/92080 is for broadcast of the same component to
    different vector sizes.  This patch covers only all 0s and all 1s cases
    of PR target/92080.
    
    gcc/
    
            PR target/92080
            PR target/117839
            * config/i386/i386-features.cc (ix86_place_single_vector_set):
            New function.
            (remove_partial_avx_dependency): Use it.
            (ix86_get_vector_load_mode): New function.
            (replace_vector_const): Likewise.
            (remove_redundant_vector_load): Likewise.
            (pass_data_remove_redundant_vector_load): Likewise.
            (pass_remove_redundant_vector_load): Likewise.
            (make_pass_remove_redundant_vector_load): Likewise.
            * config/i386/i386-passes.def: Add
            pass_remove_redundant_vector_load after
            pass_remove_partial_avx_dependency.
            * config/i386/i386-protos.h
            (make_pass_remove_redundant_vector_load): New.
            * config/i386/i386.cc (ix86_modes_tieable_p): Return true for
            narrower non-scalar-integer modes in SSE registers.
    
    gcc/testsuite/
    
            PR target/92080
            PR target/117839
            * gcc.target/i386/pr117839-1a.c: New test.
            * gcc.target/i386/pr117839-1b.c: Likewise.
            * gcc.target/i386/pr117839-2.c: Likewise.
            * gcc.target/i386/pr92080-1.c: Likewise.
            * gcc.target/i386/pr92080-2.c: Likewise.
            * gcc.target/i386/pr92080-3.c: Likewise.
    
    Signed-off-by: H.J. Lu <hjl.to...@gmail.com>

Diff:
---
 gcc/config/i386/i386-features.cc            | 303 +++++++++++++++++++++++++---
 gcc/config/i386/i386-passes.def             |   1 +
 gcc/config/i386/i386-protos.h               |   2 +
 gcc/config/i386/i386.cc                     |  25 +--
 gcc/testsuite/gcc.target/i386/pr117839-1a.c |  35 ++++
 gcc/testsuite/gcc.target/i386/pr117839-1b.c |   5 +
 gcc/testsuite/gcc.target/i386/pr117839-2.c  |  40 ++++
 gcc/testsuite/gcc.target/i386/pr92080-1.c   |  72 +++++++
 gcc/testsuite/gcc.target/i386/pr92080-2.c   |  59 ++++++
 gcc/testsuite/gcc.target/i386/pr92080-3.c   |  48 +++++
 10 files changed, 549 insertions(+), 41 deletions(-)

diff --git a/gcc/config/i386/i386-features.cc b/gcc/config/i386/i386-features.cc
index c35ac24fd8ae..31f3ee2ef171 100644
--- a/gcc/config/i386/i386-features.cc
+++ b/gcc/config/i386/i386-features.cc
@@ -3034,6 +3034,42 @@ ix86_rpad_gate ()
          && optimize_function_for_speed_p (cfun));
 }
 
+/* Emit a single insn, DEST = SRC, ahead of the first non-debug insn in
+   the nearest common dominator of the blocks in BBS, first walking out
+   of enclosing loops to the fake loop that contains the whole function,
+   so the set is not placed in a loop.  Rescan the new insn for DF.  */
+
+static void
+ix86_place_single_vector_set (rtx dest, rtx src, bitmap bbs)
+{
+  basic_block bb = nearest_common_dominator_for_set (CDI_DOMINATORS, bbs);
+  while (bb->loop_father->latch
+        != EXIT_BLOCK_PTR_FOR_FN (cfun))
+    bb = get_immediate_dominator (CDI_DOMINATORS,
+                                 bb->loop_father->header);
+
+  rtx set = gen_rtx_SET (dest, src);
+
+  rtx_insn *insn = BB_HEAD (bb);
+  while (insn && !NONDEBUG_INSN_P (insn))
+    {
+      if (insn == BB_END (bb))
+       {
+         insn = NULL;
+         break;
+       }
+      insn = NEXT_INSN (insn);
+    }
+
+  rtx_insn *set_insn;
+  if (insn == BB_HEAD (bb))
+    set_insn = emit_insn_before (set, insn);
+  else
+    set_insn = emit_insn_after (set,
+                               insn ? PREV_INSN (insn) : BB_END (bb));
+  df_insn_rescan (set_insn);
+}
+
 /* At entry of the nearest common dominator for basic blocks with
    conversions/rcp/sqrt/rsqrt/round, generate a single
        vxorps %xmmN, %xmmN, %xmmN
@@ -3188,35 +3224,10 @@ remove_partial_avx_dependency (void)
       calculate_dominance_info (CDI_DOMINATORS);
       loop_optimizer_init (AVOID_CFG_MODIFICATIONS);
 
-      /* Generate a vxorps at entry of the nearest dominator for basic
-        blocks with conversions, which is in the fake loop that
-        contains the whole function, so that there is only a single
-        vxorps in the whole function.   */
-      bb = nearest_common_dominator_for_set (CDI_DOMINATORS,
-                                            convert_bbs);
-      while (bb->loop_father->latch
-            != EXIT_BLOCK_PTR_FOR_FN (cfun))
-       bb = get_immediate_dominator (CDI_DOMINATORS,
-                                     bb->loop_father->header);
-
-      set = gen_rtx_SET (v4sf_const0, CONST0_RTX (V4SFmode));
-
-      insn = BB_HEAD (bb);
-      while (insn && !NONDEBUG_INSN_P (insn))
-       {
-         if (insn == BB_END (bb))
-           {
-             insn = NULL;
-             break;
-           }
-         insn = NEXT_INSN (insn);
-       }
-      if (insn == BB_HEAD (bb))
-       set_insn = emit_insn_before (set, insn);
-      else
-       set_insn = emit_insn_after (set,
-                                   insn ? PREV_INSN (insn) : BB_END (bb));
-      df_insn_rescan (set_insn);
+      ix86_place_single_vector_set (v4sf_const0,
+                                   CONST0_RTX (V4SFmode),
+                                   convert_bbs);
+
       loop_optimizer_finalize ();
 
       if (!control_flow_insns.is_empty ())
@@ -3288,6 +3299,240 @@ make_pass_remove_partial_avx_dependency (gcc::context *ctxt)
   return new pass_remove_partial_avx_dependency (ctxt);
 }
 
+/* Return V64QImode/V32QImode for SIZE 64/32 and V16QImode for any other SIZE.  */
+
+static machine_mode
+ix86_get_vector_load_mode (unsigned int size)
+{
+  machine_mode mode;
+  if (size == 64)
+    mode = V64QImode;
+  else if (size == 32)
+    mode = V32QImode;
+  else
+    mode = V16QImode;
+  return mode;
+}
+
+/* Make each insn whose UID is set in VECTOR_INSNS read VECTOR_CONST, a
+   register of mode VECTOR_MODE, instead of its current SET_SRC.  */
+
+static void
+replace_vector_const (machine_mode vector_mode, rtx vector_const,
+                     auto_bitmap &vector_insns)
+{
+  bitmap_iterator bi;
+  unsigned int id;
+
+  EXECUTE_IF_SET_IN_BITMAP (vector_insns, 0, id, bi)
+    {
+      rtx_insn *insn = DF_INSN_UID_GET (id)->insn;
+
+      /* Get the single SET and the source operand being replaced.  */
+      rtx set = single_set (insn);
+      rtx src = SET_SRC (set);
+      machine_mode mode = GET_MODE (src);
+
+      rtx replace;
+      /* Use VECTOR_CONST as is, or a SUBREG of it in the insn's mode.  */
+      if (SUBREG_P (src) || mode == vector_mode)
+       replace = vector_const;
+      else
+       replace = gen_rtx_SUBREG (mode, vector_const, 0);
+
+      /* NB: Don't run recog_memoized here since vector SUBREG may not
+        be valid.  Let LRA handle vector SUBREG.  */
+      SET_SRC (set) = replace;
+      /* Drop possible dead definitions.  */
+      PATTERN (insn) = set;
+      df_insn_rescan (insn);
+    }
+}
+
+/* Scan for single-set insns that load a vector register with CONST0_RTX
+   or, in integer vector modes, CONSTM1_RTX.  For each kind seen more than
+   once, make those insns read one pseudo as wide as the widest load found
+   and set that pseudo once at the blocks' nearest common dominator.
+
+   NB: We want to generate only a single widest vector set to cover the
+   whole function.  The LCM algorithm isn't appropriate here since it
+   may place a vector set inside the loop.  */
+
+static unsigned int
+remove_redundant_vector_load (void)
+{
+  timevar_push (TV_MACH_DEP);
+
+  auto_bitmap zero_bbs;
+  auto_bitmap m1_bbs;
+  auto_bitmap zero_insns;
+  auto_bitmap m1_insns;
+
+  basic_block bb;
+  rtx_insn *insn;
+  unsigned HOST_WIDE_INT zero_count = 0;
+  unsigned HOST_WIDE_INT m1_count = 0;
+  unsigned int zero_size = 0;
+  unsigned int m1_size = 0;
+
+  df_set_flags (DF_DEFER_INSN_RESCAN);
+
+  FOR_EACH_BB_FN (bb, cfun)
+    {
+      FOR_BB_INSNS (bb, insn)
+       {
+         if (!NONDEBUG_INSN_P (insn))
+           continue;
+
+         rtx set = single_set (insn);
+         if (!set)
+           continue;
+
+         /* Record single set vector instruction with CONST0_RTX and
+            CONSTM1_RTX source.  Record basic blocks with CONST0_RTX and
+            CONSTM1_RTX.  Count CONST0_RTX and CONSTM1_RTX.  Record the
+            maximum size of CONST0_RTX and CONSTM1_RTX.  */
+
+         rtx dest = SET_DEST (set);
+         machine_mode mode = GET_MODE (dest);
+         /* Skip non-vector instruction.  */
+         if (!VECTOR_MODE_P (mode))
+           continue;
+
+         rtx src = SET_SRC (set);
+         /* Skip sets whose destination isn't a REG or SUBREG.  */
+         if (!REG_P (dest) && !SUBREG_P (dest))
+           continue;
+
+         if (src == CONST0_RTX (mode))
+           {
+             /* Record vector instruction with CONST0_RTX.  */
+             bitmap_set_bit (zero_insns, INSN_UID (insn));
+
+             /* Record the maximum vector size.  */
+             if (zero_size < GET_MODE_SIZE (mode))
+               zero_size = GET_MODE_SIZE (mode);
+
+             /* Record the basic block with CONST0_RTX.  */
+             bitmap_set_bit (zero_bbs, bb->index);
+             zero_count++;
+           }
+         else if (GET_MODE_CLASS (mode) == MODE_VECTOR_INT
+                  && src == CONSTM1_RTX (mode))
+           {
+             /* Record vector instruction with CONSTM1_RTX.  */
+             bitmap_set_bit (m1_insns, INSN_UID (insn));
+
+             /* Record the maximum vector size.  */
+             if (m1_size < GET_MODE_SIZE (mode))
+               m1_size = GET_MODE_SIZE (mode);
+
+             /* Record the basic block with CONSTM1_RTX.  */
+             bitmap_set_bit (m1_bbs, bb->index);
+             m1_count++;
+           }
+       }
+    }
+
+  if (zero_count > 1 || m1_count > 1)
+    {
+      machine_mode zero_mode, m1_mode;
+      rtx vector_const0, vector_constm1;
+
+      if (zero_count > 1)
+       {
+         zero_mode = ix86_get_vector_load_mode (zero_size);
+         vector_const0 = gen_reg_rtx (zero_mode);
+         replace_vector_const (zero_mode, vector_const0, zero_insns);
+       }
+      else
+       {
+         zero_mode = VOIDmode;
+         vector_const0 = nullptr;
+       }
+
+      if (m1_count > 1)
+       {
+         m1_mode = ix86_get_vector_load_mode (m1_size);
+         vector_constm1 = gen_reg_rtx (m1_mode);
+         replace_vector_const (m1_mode, vector_constm1, m1_insns);
+       }
+      else
+       {
+         m1_mode = VOIDmode;
+         vector_constm1 = nullptr;
+       }
+
+      /* (Re-)discover loops so that bb->loop_father can be used in the
+        analysis below.  */
+      calculate_dominance_info (CDI_DOMINATORS);
+      loop_optimizer_init (AVOID_CFG_MODIFICATIONS);
+
+      if (vector_const0)
+       ix86_place_single_vector_set (vector_const0,
+                                     CONST0_RTX (zero_mode),
+                                     zero_bbs);
+
+      if (vector_constm1)
+       ix86_place_single_vector_set (vector_constm1,
+                                     CONSTM1_RTX (m1_mode),
+                                     m1_bbs);
+
+      loop_optimizer_finalize ();
+
+      df_process_deferred_rescans ();
+    }
+
+  df_clear_flags (DF_DEFER_INSN_RESCAN);
+
+  timevar_pop (TV_MACH_DEP);
+  return 0;
+}
+
+namespace {
+
+const pass_data pass_data_remove_redundant_vector_load =
+{
+  RTL_PASS, /* type */
+  "rrvl", /* name */
+  OPTGROUP_NONE, /* optinfo_flags */
+  TV_MACH_DEP, /* tv_id */
+  0, /* properties_required */
+  0, /* properties_provided */
+  0, /* properties_destroyed */
+  0, /* todo_flags_start */
+  0, /* todo_flags_finish */
+};
+
+class pass_remove_redundant_vector_load : public rtl_opt_pass
+{
+public:
+  pass_remove_redundant_vector_load (gcc::context *ctxt)
+    : rtl_opt_pass (pass_data_remove_redundant_vector_load, ctxt)
+  {}
+
+  /* opt_pass methods; gate: SSE2, optimize set, function optimized for speed.  */
+  bool gate (function *fun) final override
+    {
+      return (TARGET_SSE2
+             && optimize
+             && optimize_function_for_speed_p (fun));
+    }
+
+  unsigned int execute (function *) final override
+    {
+      return remove_redundant_vector_load ();
+    }
+}; // class pass_remove_redundant_vector_load
+
+} // anon namespace
+
+rtl_opt_pass *
+make_pass_remove_redundant_vector_load (gcc::context *ctxt)
+{
+  return new pass_remove_redundant_vector_load (ctxt);
+}
+
 /* Convert legacy instructions that clobbers EFLAGS to APX_NF
    instructions when there are no flag set between a flag
    producer and user.  */
diff --git a/gcc/config/i386/i386-passes.def b/gcc/config/i386/i386-passes.def
index 39f8bc65ddc1..06f0288b0671 100644
--- a/gcc/config/i386/i386-passes.def
+++ b/gcc/config/i386/i386-passes.def
@@ -35,5 +35,6 @@ along with GCC; see the file COPYING3.  If not see
      PR116174.  */
   INSERT_PASS_BEFORE (pass_shorten_branches, 1, pass_align_tight_loops);
 
+  INSERT_PASS_AFTER (pass_late_combine, 1, pass_remove_redundant_vector_load);
   INSERT_PASS_AFTER (pass_late_combine, 1, pass_remove_partial_avx_dependency);
   INSERT_PASS_AFTER (pass_rtl_ifcvt, 1, pass_apx_nf_convert);
diff --git a/gcc/config/i386/i386-protos.h b/gcc/config/i386/i386-protos.h
index bea3fd4b2e2a..c59b5a67e3a5 100644
--- a/gcc/config/i386/i386-protos.h
+++ b/gcc/config/i386/i386-protos.h
@@ -427,6 +427,8 @@ extern rtl_opt_pass *make_pass_insert_endbr_and_patchable_area
   (gcc::context *);
 extern rtl_opt_pass *make_pass_remove_partial_avx_dependency
   (gcc::context *);
+extern rtl_opt_pass *make_pass_remove_redundant_vector_load
+  (gcc::context *);
 extern rtl_opt_pass *make_pass_apx_nf_convert (gcc::context *);
 extern rtl_opt_pass *make_pass_align_tight_loops (gcc::context *);
 
diff --git a/gcc/config/i386/i386.cc b/gcc/config/i386/i386.cc
index dd0762421777..ae2386785afa 100644
--- a/gcc/config/i386/i386.cc
+++ b/gcc/config/i386/i386.cc
@@ -21493,19 +21493,20 @@ ix86_modes_tieable_p (machine_mode mode1, machine_mode mode2)
     return mode1 == SFmode;
 
   /* If MODE2 is only appropriate for an SSE register, then tie with
-     any other mode acceptable to SSE registers.  */
-  if (GET_MODE_SIZE (mode2) == 64
-      && ix86_hard_regno_mode_ok (FIRST_SSE_REG, mode2))
-    return (GET_MODE_SIZE (mode1) == 64
-           && ix86_hard_regno_mode_ok (FIRST_SSE_REG, mode1));
-  if (GET_MODE_SIZE (mode2) == 32
-      && ix86_hard_regno_mode_ok (FIRST_SSE_REG, mode2))
-    return (GET_MODE_SIZE (mode1) == 32
-           && ix86_hard_regno_mode_ok (FIRST_SSE_REG, mode1));
-  if (GET_MODE_SIZE (mode2) == 16
+     any vector modes or scalar floating point modes acceptable to SSE
+     registers, excluding scalar integer modes with SUBREG:
+       (subreg:QI (reg:TI 99) 0)
+       (subreg:HI (reg:TI 99) 0)
+       (subreg:SI (reg:TI 99) 0)
+       (subreg:DI (reg:TI 99) 0)
+     to avoid unnecessary move from SSE register to integer register.
+   */
+  if (GET_MODE_SIZE (mode2) >= 16
+      && (GET_MODE_SIZE (mode1) == GET_MODE_SIZE (mode2)
+         || ((VECTOR_MODE_P (mode1) || SCALAR_FLOAT_MODE_P (mode1))
+             && GET_MODE_SIZE (mode1) <= GET_MODE_SIZE (mode2)))
       && ix86_hard_regno_mode_ok (FIRST_SSE_REG, mode2))
-    return (GET_MODE_SIZE (mode1) == 16
-           && ix86_hard_regno_mode_ok (FIRST_SSE_REG, mode1));
+    return ix86_hard_regno_mode_ok (FIRST_SSE_REG, mode1);
 
   /* If MODE2 is appropriate for an MMX register, then tie
      with any other mode acceptable to MMX registers.  */
diff --git a/gcc/testsuite/gcc.target/i386/pr117839-1a.c b/gcc/testsuite/gcc.target/i386/pr117839-1a.c
new file mode 100644
index 000000000000..4501cfbcad41
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr117839-1a.c
@@ -0,0 +1,35 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -mno-avx -msse2 -mtune=generic" } */
+/* { dg-final { scan-assembler-times "xor\[a-z\]*\[\t \]*%xmm\[0-9\]\+,\[^,\]*" 1 } } */
+
+#include <stddef.h>
+
+void
+clear_memory (void *mem1, size_t nclears1, void *mem2, size_t nclears2)
+{
+  size_t *d1 = (size_t *) mem1;
+
+  *(d1 + 0) = 0;
+  *(d1 + 1) = 0;
+  *(d1 + 2) = 0;
+  if (nclears1 > 3)
+    {
+      *(d1 + nclears1 - 4) = 0;
+      *(d1 + nclears1 - 4 + 1) = 0;
+      *(d1 + nclears1 - 4 + 2) = 0;
+      *(d1 + nclears1 - 4 + 3) = 0;
+    }
+
+  double *d2 = (double *) mem2;
+
+  *(d2 + 0) = 0;
+  *(d2 + 1) = 0;
+  *(d2 + 2) = 0;
+  if (nclears2 > 3)
+    {
+      *(d2 + nclears2 - 4) = 0;
+      *(d2 + nclears2 - 4 + 1) = 0;
+      *(d2 + nclears2 - 4 + 2) = 0;
+      *(d2 + nclears2 - 4 + 3) = 0;
+    }
+}
diff --git a/gcc/testsuite/gcc.target/i386/pr117839-1b.c b/gcc/testsuite/gcc.target/i386/pr117839-1b.c
new file mode 100644
index 000000000000..e71b991a2073
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr117839-1b.c
@@ -0,0 +1,5 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -march=x86-64-v3" } */
+/* { dg-final { scan-assembler-times "xor\[a-z\]*\[\t \]*%xmm\[0-9\]\+,\[^,\]*" 1 } } */
+
+#include "pr117839-1a.c"
diff --git a/gcc/testsuite/gcc.target/i386/pr117839-2.c b/gcc/testsuite/gcc.target/i386/pr117839-2.c
new file mode 100644
index 000000000000..c76744cf98b8
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr117839-2.c
@@ -0,0 +1,40 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -march=x86-64-v3" } */
+/* { dg-final { scan-assembler-times "xor\[a-z\]*\[\t \]*%xmm\[0-9\]\+,\[^,\]*" 1 } } */
+
+#include <stddef.h>
+
+float
+clear_memory (void *mem, size_t clearsize)
+{
+  size_t *d = (size_t *) mem;
+  size_t nclears = clearsize / sizeof (size_t);
+
+  *(d + 0) = 0;
+  *(d + 1) = 0;
+  *(d + 2) = 0;
+  if (nclears > 9)
+    {
+      *(d + 5) = 0;
+      *(d + 5 + 1) = 0;
+      *(d + 5 + 2) = 0;
+      *(d + 5 + 3) = 0;
+      *(d + nclears - 8) = 0;
+      *(d + nclears - 8 + 1) = 0;
+      *(d + nclears - 8 + 2) = 0;
+      *(d + nclears - 8 + 3) = 0;
+    }
+  else
+    {
+      *(d + 1) = 0;
+      *(d + 2) = 0;
+      *(d + 3) = 0;
+      *(d + 4) = 0;
+      *(d + nclears - 4) = 0;
+      *(d + nclears - 4 + 1) = 0;
+      *(d + nclears - 4 + 2) = 0;
+      *(d + nclears - 4 + 3) = 0;
+    }
+
+  return nclears;
+}
diff --git a/gcc/testsuite/gcc.target/i386/pr92080-1.c b/gcc/testsuite/gcc.target/i386/pr92080-1.c
new file mode 100644
index 000000000000..82d1ffd4e1a4
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr92080-1.c
@@ -0,0 +1,72 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -march=x86-64-v3" } */
+/* { dg-final { scan-assembler-times "vpxor" 2 } } */
+/* { dg-final { scan-assembler-times "vpcmpeq" 2 } } */
+
+typedef long long v2di __attribute__((vector_size(16)));
+typedef long long v4di __attribute__((vector_size(32)));
+typedef int v4si __attribute__((vector_size(16)));
+typedef int v8si __attribute__((vector_size(32)));
+typedef short v8hi __attribute__((vector_size(16)));
+typedef short v16hi __attribute__((vector_size(32)));
+typedef char v16qi __attribute__((vector_size(16)));
+typedef char v32qi __attribute__((vector_size(32)));
+typedef float v4sf __attribute__((vector_size(16)));
+typedef float v8sf __attribute__((vector_size(32)));
+typedef double v2df __attribute__((vector_size(16)));
+typedef double v4df __attribute__((vector_size(32)));
+
+v16qi b1;
+v8hi h1;
+v4si s1;
+v2di l1;
+v4sf f1;
+v2df d1;
+v32qi b2;
+v16hi h2;
+v8si s2;
+v4di l2;
+v8sf f2;
+v4df d2;
+
+void
+foo ()
+{
+  d1 = __extension__(v2df){0, 0};
+  f1 = __extension__(v4sf){0, 0, 0};
+  l1 = __extension__(v2di){0, 0};
+  s1 = __extension__(v4si){0, 0, 0, 0};
+  h1 = __extension__(v8hi){0, 0, 0, 0, 0, 0, 0, 0};
+  b1 = __extension__(v16qi){0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
+  h2 = __extension__(v16hi){0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
+}
+
+void
+foo1 ()
+{
+  s1 = __extension__(v4si){-1, -1, -1, -1};
+  h1 = __extension__(v8hi){-1, -1, -1, -1, -1, -1, -1, -1};
+  b1 = __extension__(v16qi){-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1};
+}
+
+
+void
+foo2 ()
+{
+  d2 = __extension__(v4df){0, 0, 0, 0};
+  f2 = __extension__(v8sf){0, 0, 0, 0, 0, 0, 0, 0};
+  l2 = __extension__(v4di){0, 0, 0, 0};
+  s2 = __extension__(v8si){0, 0, 0, 0, 0, 0, 0, 0};
+  h1 = __extension__(v8hi){0, 0, 0, 0, 0, 0, 0, 0};
+  b1 = __extension__(v16qi){0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
+  b2 = __extension__(v32qi){0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+      0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
+}
+
+void
+foo3 ()
+{
+  s2 = __extension__(v8si){-1, -1, -1, -1, -1, -1, -1, -1};
+  h1 = __extension__(v8hi){-1, -1, -1, -1, -1, -1, -1, -1};
+  b1 = __extension__(v16qi){-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1};
+}
diff --git a/gcc/testsuite/gcc.target/i386/pr92080-2.c b/gcc/testsuite/gcc.target/i386/pr92080-2.c
new file mode 100644
index 000000000000..d160d90de538
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr92080-2.c
@@ -0,0 +1,59 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -march=x86-64-v3" } */
+/* { dg-final { scan-assembler-times "vpxor" 1 } } */
+/* { dg-final { scan-assembler-times "vpcmpeq" 1 } } */
+
+typedef int v4si __attribute__((vector_size(16)));
+typedef int v8si __attribute__((vector_size(32)));
+typedef short v8hi __attribute__((vector_size(16)));
+typedef short v16hi __attribute__((vector_size(32)));
+typedef char v16qi __attribute__((vector_size(16)));
+typedef char v32qi __attribute__((vector_size(32)));
+
+v16qi b1;
+v8hi h1;
+v4si s1;
+v32qi b2;
+v16hi h2;
+v8si s2;
+
+void
+foo (int i, int j)
+{
+  switch (i)
+    {
+    case 1:
+      h1 = __extension__(v8hi){-1, -1, -1, -1, -1, -1, -1, -1};
+      s1 = __extension__(v4si){0, 0, 0, 0};
+      s2 = __extension__(v8si){0, 0, 0, 0, 0, 0, 0, 0};
+      break;
+    case 2:
+      h1 = __extension__(v8hi){0, 0, 0, 0, 0, 0, 0, 0};
+      b1 = __extension__(v16qi){-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1};
+      break;
+    case 3:
+      h1 = __extension__(v8hi){0, 0, 0, 0, 0, 0, 0, 0};
+      b1 = __extension__(v16qi){0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
+      break;
+    default:
+      break;
+    }
+
+  switch (i)
+    {
+    case 1:
+      s1 = __extension__(v4si){-1, -1, -1, -1};
+      b2 = __extension__(v32qi){0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+      0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
+      h2 = __extension__(v16hi){0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
+      break;
+    case 2:
+      b1 = __extension__(v16qi){0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
+      h1 = __extension__(v8hi){-1, -1, -1, -1, -1, -1, -1, -1};
+      break;
+    case 3:
+      b1 = __extension__(v16qi){-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1};
+      s2 = __extension__(v8si){-1, -1, -1, -1, -1, -1, -1, -1};
+      break;
+    }
+}
diff --git a/gcc/testsuite/gcc.target/i386/pr92080-3.c b/gcc/testsuite/gcc.target/i386/pr92080-3.c
new file mode 100644
index 000000000000..2174def4e6d4
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr92080-3.c
@@ -0,0 +1,48 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -march=x86-64" } */
+/* { dg-final { scan-assembler-times "pxor" 1 } } */
+/* { dg-final { scan-assembler-times "pcmpeq" 1 } } */
+
+typedef int v4si __attribute__((vector_size(16)));
+typedef short v8hi __attribute__((vector_size(16)));
+typedef char v16qi __attribute__((vector_size(16)));
+
+v16qi b1;
+v8hi h1;
+v4si s1;
+
+void
+foo (int i, int j)
+{
+  switch (i)
+    {
+    case 1:
+      h1 = __extension__(v8hi){-1, -1, -1, -1, -1, -1, -1, -1};
+      s1 = __extension__(v4si){0, 0, 0, 0};
+      break;
+    case 2:
+      h1 = __extension__(v8hi){0, 0, 0, 0, 0, 0, 0, 0};
+      b1 = __extension__(v16qi){-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1};
+      break;
+    case 3:
+      h1 = __extension__(v8hi){0, 0, 0, 0, 0, 0, 0, 0};
+      b1 = __extension__(v16qi){0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
+      break;
+    default:
+      break;
+    }
+
+  switch (i)
+    {
+    case 1:
+      s1 = __extension__(v4si){-1, -1, -1, -1};
+      break;
+    case 2:
+      b1 = __extension__(v16qi){0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
+      h1 = __extension__(v8hi){-1, -1, -1, -1, -1, -1, -1, -1};
+      break;
+    case 3:
+      b1 = __extension__(v16qi){-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1};
+      break;
+    }
+}

[gcc r16-271] x86: Add a pass to remove redundant all 0s/1s vector load

Reply via email to