Hi!

As mentioned in the PR, the a r<< (bitsize-b) to a r>> b (and similar)
match.pd optimization introduced in GCC 15 can introduce UB which wasn't
there before; in particular, if b is equal to bitsize at runtime, then
a r<< 0 is turned into a r>> bitsize.
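To make that concrete, here is a minimal C sketch of the affected idiom
(it mirrors the lrotate32/f1 functions in the new testcase below; the
function names are just for illustration):

/* Recognized by GCC as x r<< t at the GIMPLE level.  */
static inline unsigned
lrot32 (unsigned x, int t)
{
  return (x << t) | (x >> (-t & 31));
}

unsigned
foo (unsigned x, int b)
{
  /* GIMPLE sees x r<< (32 - b).  For b == 32 that is a rotate by 0,
     which is well defined, but the GCC 15 rule turned it into
     x r>> b, i.e. x r>> 32, which is undefined.  */
  return lrot32 (x, 32 - b);
}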

The following patch fixes it by doing the optimization early only if VRP
tells us the count isn't equal to the bitsize, and late by turning it into
a r>> (b & (bitsize - 1)) if bitsize is a power of two and the subtraction
has a single use; on various targets the masking then goes away because
their rotate instructions do the masking already.  The latter can't be
done too early though, because then the expr_not_equal_to case is
basically useless: we'd always introduce the masking and couldn't find
out anymore that there was originally no masking.  Even a
cfun->after_inlining check would be too early, as there is a forwprop
pass before vrp, so the patch introduces a new property set when vrp2
finishes, after which there is another forwprop.  Or do you have ideas
about what other condition could be used for late matching only?
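For concreteness, a hedged C-level sketch of what the late form amounts
to for a 32-bit rotate (illustrative only; late_form is a made-up name):

/* x r<< (32 - b) becomes x r>> (b & 31) rather than x r>> b, so a
   count of 32 can no longer reach the rotate.  On targets whose
   rotate instructions mask the count anyway, the & 31 disappears
   during expansion.  */
unsigned
late_form (unsigned x, int b)
{
  int c = b & 31;
  return (x >> c) | (x << (-c & 31));
}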

Bootstrapped/regtested on x86_64-linux and i686-linux, ok for trunk?

2025-01-08  Jakub Jelinek  <ja...@redhat.com>
            Andrew Pinski  <quic_apin...@quicinc.com>

        PR tree-optimization/117927
        * tree-pass.h (PROP_vrp): Define.
        * tree-vrp.cc (pass_vrp::execute): Set PROP_vrp in curr_properties
        at the end if final_p.
        * match.pd (a rrotate (32-b) -> a lrotate b): Only optimize it
        either if @2 is known not to be equal to prec, or, after vrp2,
        if the subtraction has a single use and prec is a power of two;
        in the latter case transform it into orotate by the masked count.

        * gcc.dg/tree-ssa/pr117927.c: New test.

--- gcc/tree-pass.h.jj  2025-01-07 20:10:42.617965365 +0100
+++ gcc/tree-pass.h     2025-01-07 21:34:13.859794503 +0100
@@ -230,6 +230,7 @@ protected:
 #define PROP_assumptions_done  (1 << 19)       /* Assume function kept
                                                   around.  */
 #define PROP_gimple_lbitint    (1 << 20)       /* lowered large _BitInt */
+#define PROP_vrp               (1 << 21)       /* vrp2 pass finished.  */
 
 #define PROP_gimple \
   (PROP_gimple_any | PROP_gimple_lcf | PROP_gimple_leh | PROP_gimple_lomp)
--- gcc/tree-vrp.cc.jj  2025-01-07 20:10:42.617965365 +0100
+++ gcc/tree-vrp.cc     2025-01-07 21:35:24.308814807 +0100
@@ -1081,8 +1081,8 @@ private:
   gimple *m_last_bb_stmt;
 };
 
-/* Main entry point for a VRP pass using just ranger. This can be called
-  from anywhere to perform a VRP pass, including from EVRP.  */
+/* Main entry point for a VRP pass using just ranger.  This can be called
+   from anywhere to perform a VRP pass, including from EVRP.  */
 
 unsigned int
 execute_ranger_vrp (struct function *fun, bool final_p)
@@ -1334,6 +1334,7 @@ public:
     {
       // Check for fast vrp.
       bool use_fvrp = (&data == &pass_data_fast_vrp);
+      unsigned int todo;
       if (!use_fvrp && last_basic_block_for_fn (fun) > param_vrp_block_limit)
        {
          use_fvrp = true;
@@ -1344,8 +1345,12 @@ public:
                   param_vrp_block_limit);
        }
       if (use_fvrp)
-       return execute_fast_vrp (fun, final_p);
-      return execute_ranger_vrp (fun, final_p);
+       todo = execute_fast_vrp (fun, final_p);
+      else
+       todo = execute_ranger_vrp (fun, final_p);
+      if (final_p)
+       fun->curr_properties |= PROP_vrp;
+      return todo;
     }
 
  private:
--- gcc/match.pd.jj     2025-01-07 20:10:42.615965392 +0100
+++ gcc/match.pd        2025-01-07 21:36:27.991929199 +0100
@@ -4967,14 +4967,31 @@ (define_operator_list SYNC_FETCH_AND_AND
                            build_int_cst (TREE_TYPE (@1),
                                           element_precision (type)), @1); }))
 
-/* a rrotate (32-b) -> a lrotate b */
-/* a lrotate (32-b) -> a rrotate b */
+/* a rrotate (bitsize-b) -> a lrotate b */
+/* a lrotate (bitsize-b) -> a rrotate b */
+/* Only do the above when it is known that b != bitsize.
+   Otherwise canonicalize to a orotate (b & mask) when the subtraction
+   has single use and prec is a power of two.  */
 (for rotate (lrotate rrotate)
      orotate (rrotate lrotate)
  (simplify
-  (rotate @0 (minus INTEGER_CST@1 @2))
-   (if (element_precision (TREE_TYPE (@0)) == wi::to_wide (@1))
-     (orotate @0 @2))))
+  (rotate @0 (minus@3 INTEGER_CST@1 @2))
+  (with { auto prec = element_precision (TREE_TYPE (@0)); }
+   (if (prec == wi::to_wide (@1))
+    (switch
+     (if (expr_not_equal_to (@2, wi::uhwi (prec, TYPE_PRECISION (TREE_TYPE (@2)))))
+      (orotate @0 @2))
+     (if (single_use (@3)
+         && pow2p_hwi (prec)
+         /* Defer this case until VRP2 could be run and expr_not_equal_to
+            had a chance to match.  Otherwise we'd do pretty much always
+            just the second case.  */
+         && cfun
+         && ((cfun->curr_properties & PROP_vrp) != 0
+             || !flag_tree_vrp
+             || optimize_debug))
+      (orotate @0
+       (bit_and @2 { build_int_cst (TREE_TYPE (@2), prec - 1); }))))))))
 
 /* Turn (a OP c1) OP c2 into a OP (c1+c2).  */
 (for op (lrotate rrotate rshift lshift)
--- gcc/testsuite/gcc.dg/tree-ssa/pr117927.c.jj 2025-01-07 20:48:59.827825909 +0100
+++ gcc/testsuite/gcc.dg/tree-ssa/pr117927.c    2025-01-07 20:48:59.827825909 +0100
@@ -0,0 +1,97 @@
+/* PR tree-optimization/117927 */
+/* { dg-do compile { target { int32 && longlong64 } } } */
+/* { dg-options "-O2 -fdump-tree-optimized" } */
+/* { dg-final { scan-tree-dump-times " r<< " 4 "optimized" } } */
+/* { dg-final { scan-tree-dump-times " r>> " 4 "optimized" } } */
+/* { dg-final { scan-tree-dump-times " & 31;" 2 "optimized" } } */
+/* { dg-final { scan-tree-dump-times " & 63;" 2 "optimized" } } */
+/* { dg-final { scan-tree-dump-not "32 - " "optimized" } } */
+/* { dg-final { scan-tree-dump-not "64 - " "optimized" } } */
+
+static inline
+unsigned lrotate32 (unsigned x, int t)
+{
+  unsigned tl = x << t;
+  unsigned th = x >> (-t & 31);
+  return tl | th;
+}
+
+static inline
+unsigned rrotate32 (unsigned x, int t)
+{
+  unsigned tl = x >> t;
+  unsigned th = x << (-t & 31);
+  return tl | th;
+}
+
+static inline
+unsigned long long lrotate64 (unsigned long long x, int t)
+{
+  unsigned long long tl = x << t;
+  unsigned long long th = x >> (-t & 63);
+  return tl | th;
+}
+
+static inline
+unsigned long long rrotate64 (unsigned long long x, int t)
+{
+  unsigned long long tl = x >> t;
+  unsigned long long th = x << (-t & 63);
+  return tl | th;
+}
+
+unsigned
+f1 (unsigned x, int t)
+{
+  return lrotate32 (x, 32 - t);
+}
+
+unsigned long long
+f2 (unsigned long long x, int t)
+{
+  return lrotate64 (x, 64 - t);
+}
+
+unsigned
+f3 (unsigned x, int t)
+{
+  if (t == 32)
+    __builtin_unreachable ();
+  return lrotate32 (x, 32 - t);
+}
+
+unsigned long long
+f4 (unsigned long long x, int t)
+{
+  if (t == 64)
+    __builtin_unreachable ();
+  return lrotate64 (x, 64 - t);
+}
+
+unsigned
+f5 (unsigned x, int t)
+{
+  return rrotate32 (x, 32 - t);
+}
+
+unsigned long long
+f6 (unsigned long long x, int t)
+{
+  return rrotate64 (x, 64 - t);
+}
+
+unsigned
+f7 (unsigned x, int t)
+{
+  if (t == 32)
+    __builtin_unreachable ();
+  return rrotate32 (x, 32 - t);
+}
+
+unsigned long long
+f8 (unsigned long long x, int t)
+{
+  if (t == 64)
+    __builtin_unreachable ();
+  return rrotate64 (x, 64 - t);
+}

        Jakub
