https://gcc.gnu.org/g:6901d56fea2132389e9a7ae60b4d9731e6e3c65a

commit r16-190-g6901d56fea2132389e9a7ae60b4d9731e6e3c65a
Author: Richard Biener <rguent...@suse.de>
Date:   Tue Mar 4 10:34:39 2025 +0100

    tree-optimization/119103 - missed overwidening detection for shift
    
    When vectorizing a shift of u16 data by an amount that's known to
    be less than 16 we currently fail to emit a vector u16 shift.  The
    first reason is that the promotion of the shift amount is hoisted
    only by PRE and that cannot preserve range info, the second reason
    is that pattern detection doesn't use range info when computing
    the precision required for an operation.
    
    The following addresses the first issue by making LIM hoist all
    expressions for the pass that runs right before PRE and the
    second issue by querying ranges for the shift amount.
    
            PR tree-optimization/119103
            * tree-ssa-loop-im.cc (in_loop_pipeline): Globalize.
            (compute_invariantness): Override costing when we run
            right before PRE and PRE is enabled.
            (pass_lim::execute): Adjust.
            * tree-vect-patterns.cc (vect_determine_precisions_from_users):
            For variable shift amounts use range information.
    
            * gcc.target/i386/pr119103.c: New testcase.

Diff:
---
 gcc/testsuite/gcc.target/i386/pr119103.c | 13 +++++++++++++
 gcc/tree-ssa-loop-im.cc                  | 10 ++++++++--
 gcc/tree-vect-patterns.cc                | 26 +++++++++++++++++++-------
 3 files changed, 40 insertions(+), 9 deletions(-)

diff --git a/gcc/testsuite/gcc.target/i386/pr119103.c b/gcc/testsuite/gcc.target/i386/pr119103.c
new file mode 100644
index 000000000000..57210dc3bbe8
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr119103.c
@@ -0,0 +1,13 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -mavx2" } */
+
+void lshift(unsigned short *x, unsigned char amount)
+{
+  if (amount > 15)
+    __builtin_unreachable();
+
+  for (int i = 0; i < 16; i++)
+    x[i] <<= amount;
+}
+
+/* { dg-final { scan-assembler "vpsllw" } } */
diff --git a/gcc/tree-ssa-loop-im.cc b/gcc/tree-ssa-loop-im.cc
index 225964c62156..a3ca5af3e3e4 100644
--- a/gcc/tree-ssa-loop-im.cc
+++ b/gcc/tree-ssa-loop-im.cc
@@ -143,6 +143,8 @@ public:
                                   different modes.  */
 };
 
+static bool in_loop_pipeline;
+
 /* We use six bits per loop in the ref->dep_loop bitmap to record
    the dep_kind x dep_state combinations.  */
 
@@ -1239,7 +1241,11 @@ compute_invariantness (basic_block bb)
                   lim_data->cost);
        }
 
-      if (lim_data->cost >= LIM_EXPENSIVE)
+      if (lim_data->cost >= LIM_EXPENSIVE
+         /* When we run before PRE and PRE is active hoist all expressions
+            since PRE would do so anyway and we can preserve range info
+            but PRE cannot.  */
+         || (flag_tree_pre && !in_loop_pipeline))
        set_profitable_level (stmt);
     }
 }
@@ -3759,7 +3765,7 @@ public:
 unsigned int
 pass_lim::execute (function *fun)
 {
-  bool in_loop_pipeline = scev_initialized_p ();
+  in_loop_pipeline = scev_initialized_p ();
   if (!in_loop_pipeline)
     loop_optimizer_init (LOOPS_NORMAL | LOOPS_HAVE_RECORDED_EXITS);
 
diff --git a/gcc/tree-vect-patterns.cc b/gcc/tree-vect-patterns.cc
index 4f0a7ea162b9..ca19add83c05 100644
--- a/gcc/tree-vect-patterns.cc
+++ b/gcc/tree-vect-patterns.cc
@@ -6544,10 +6544,22 @@ vect_determine_precisions_from_users (stmt_vec_info stmt_info, gassign *stmt)
     case RSHIFT_EXPR:
       {
        tree shift = gimple_assign_rhs2 (stmt);
-       if (TREE_CODE (shift) != INTEGER_CST
-           || !wi::ltu_p (wi::to_widest (shift), precision))
+       unsigned int min_const_shift, max_const_shift;
+       wide_int min_shift, max_shift;
+       if (TREE_CODE (shift) == SSA_NAME
+           && vect_get_range_info (shift, &min_shift, &max_shift)
+           && wi::ge_p (min_shift, 0, TYPE_SIGN (TREE_TYPE (shift)))
+           && wi::lt_p (max_shift, TYPE_PRECISION (type),
+                        TYPE_SIGN (TREE_TYPE (shift))))
+         {
+           min_const_shift = min_shift.to_uhwi ();
+           max_const_shift = max_shift.to_uhwi ();
+         }
+       else if (TREE_CODE (shift) == INTEGER_CST
+                && wi::ltu_p (wi::to_widest (shift), precision))
+         min_const_shift = max_const_shift = TREE_INT_CST_LOW (shift);
+       else
          return;
-       unsigned int const_shift = TREE_INT_CST_LOW (shift);
        if (code == LSHIFT_EXPR)
          {
            /* Avoid creating an undefined shift.
@@ -6559,16 +6571,16 @@ vect_determine_precisions_from_users (stmt_vec_info stmt_info, gassign *stmt)
               of vectorization.  This sort of thing should really be
               handled before vectorization.  */
            operation_precision = MAX (stmt_info->min_output_precision,
-                                      const_shift + 1);
+                                      max_const_shift + 1);
            /* We need CONST_SHIFT fewer bits of the input.  */
-           min_input_precision = (MAX (operation_precision, const_shift)
-                                  - const_shift);
+           min_input_precision = (MAX (operation_precision, max_const_shift)
+                                  - min_const_shift);
          }
        else
          {
            /* We need CONST_SHIFT extra bits to do the operation.  */
            operation_precision = (stmt_info->min_output_precision
-                                  + const_shift);
+                                  + max_const_shift);
            min_input_precision = operation_precision;
          }
        break;

Reply via email to