[PATCH v2] Consider frequency in cost estimation when converting scalar to vector.

liuhongt Mon, 28 Apr 2025 22:22:22 -0700

> I am generally trying to get rid of remaing uses of REG_FREQ since the
> 10000 based fixed point arithmetics iot always working that well.
>
> You can do the sums in profile_count type (doing something reasonable
> when count is uninitialized) and then convert it to sreal for the final
> heuristics.


Changed, when count_max is initialized and function is not optimized to size,
use
        bb_count.initialized_p
        ? bb_count.probability_in (count_max).to_sreal ()
        : 0.1;

Otherwise, just use 1.


n some benchmark, I notice stv failed due to cost unprofitable, but the igain
is inside the loop, but sse<->integer conversion is outside the loop, current 
cost
model doesn't consider the frequency of those gain/cost.
The patch weights those cost with frequency.

The patch regressed gcc.target/i386/minmax-6.c under -m32 Since the
place of integer<->sse is before the branch, and the conversion to
min/max is in the branch, with static profile, the cost model is not
profitable anymore which is exactly the patch try to do.
Considering the original testcase is to guard RA issue, so restrict
the testcase under ! ia32 should still be ok.

Bootstrapped and regtested on x86_64-pc-linux-gnu{-m32,}.
Ok for trunk?

gcc/ChangeLog:

        * config/i386/i386-features.cc (get_bb_probability): New function.
        (scalar_chain::mark_dual_mode_def): Weight
        n_integer_to_sse/n_sse_to_integer with bb frequency.
        (general_scalar_chain::compute_convert_gain): Ditto, and
        adjust function prototype to return true/false when cost model
        is profitable or not.
        (timode_scalar_chain::compute_convert_gain): Ditto.
        (convert_scalars_to_vector): Adjust after the upper two
        function prototype are changed.
        * config/i386/i386-features.h (class scalar_chain): Change
        n_integer_to_sse/n_sse_to_integer from int to sreal.
        (class general_scalar_chain): Adjust prototype to return bool
        intead of int.
        (class timode_scalar_chain): Ditto.

gcc/testsuite/ChangeLog:
        * gcc.target/i386/minmax-6.c: Adjust testcase.
---
 gcc/config/i386/i386-features.cc         | 83 +++++++++++++++++-------
 gcc/config/i386/i386-features.h          | 10 +--
 gcc/testsuite/gcc.target/i386/minmax-6.c |  2 +-
 3 files changed, 65 insertions(+), 30 deletions(-)

diff --git a/gcc/config/i386/i386-features.cc b/gcc/config/i386/i386-features.cc
index c35ac24fd8a..57dcd94c211 100644
--- a/gcc/config/i386/i386-features.cc
+++ b/gcc/config/i386/i386-features.cc
@@ -326,6 +326,30 @@ scalar_chain::add_to_queue (unsigned insn_uid)
             insn_uid, chain_id);
 }
 
+/* Return BB probablity in cfun->cfg->count_max,
+   if count_max is uninitialized or optimized_for_size, return 1.
+   else if bb count is uninitialized, return 0.1 as a rough estimation.
+   else return bb_count.probablity_in (count_max).  */
+static sreal
+get_bb_probability (basic_block bb)
+{
+  profile_count bb_count = bb->count;
+  profile_count count_max = cfun->cfg->count_max;
+  sreal sreal_count = 1;
+  if (count_max.initialized_p ()
+      && !optimize_function_for_size_p (cfun))
+    {
+      if (bb_count.initialized_p ())
+       sreal_count = bb_count.probability_in (count_max).to_sreal ();
+      else
+       /* For unintialized BB, weight it with 0.1,
+          ??? just like REG_FREQ_MAX / BB_FREQ_MAX.   */
+       sreal_count = (sreal) 1 / (sreal) 10;
+    }
+
+  return sreal_count;
+}
+
 /* For DImode conversion, mark register defined by DEF as requiring
    conversion.  */
 
@@ -337,18 +361,20 @@ scalar_chain::mark_dual_mode_def (df_ref def)
   /* Record the def/insn pair so we can later efficiently iterate over
      the defs to convert on insns not in the chain.  */
   bool reg_new = bitmap_set_bit (defs_conv, DF_REF_REGNO (def));
+  sreal sreal_count = get_bb_probability (BLOCK_FOR_INSN (DF_REF_INSN (def)));
+
   if (!bitmap_bit_p (insns, DF_REF_INSN_UID (def)))
     {
       if (!bitmap_set_bit (insns_conv, DF_REF_INSN_UID (def))
          && !reg_new)
        return;
-      n_integer_to_sse++;
+      n_integer_to_sse += sreal_count;
     }
   else
     {
       if (!reg_new)
        return;
-      n_sse_to_integer++;
+      n_sse_to_integer += sreal_count;
     }
 
   if (dump_file)
@@ -529,15 +555,15 @@ general_scalar_chain::vector_const_cost (rtx exp)
   return ix86_cost->sse_load[smode == DImode ? 1 : 0];
 }
 
-/* Compute a gain for chain conversion.  */
+/* Return true if it's cost profitable for chain conversion.  */
 
-int
+bool
 general_scalar_chain::compute_convert_gain ()
 {
   bitmap_iterator bi;
   unsigned insn_uid;
-  int gain = 0;
-  int cost = 0;
+  sreal gain = 0;
+  sreal cost = 0;
 
   if (dump_file)
     fprintf (dump_file, "Computing gain for chain #%d...\n", chain_id);
@@ -556,6 +582,7 @@ general_scalar_chain::compute_convert_gain ()
       rtx src = SET_SRC (def_set);
       rtx dst = SET_DEST (def_set);
       int igain = 0;
+      sreal sreal_count = get_bb_probability (BLOCK_FOR_INSN (insn));
 
       if (REG_P (src) && REG_P (dst))
        igain += 2 * m - ix86_cost->xmm_move;
@@ -755,16 +782,19 @@ general_scalar_chain::compute_convert_gain ()
            }
        }
 
+      sreal real_gain = sreal_count * igain;
       if (igain != 0 && dump_file)
        {
-         fprintf (dump_file, "  Instruction gain %d for ", igain);
+         fprintf (dump_file, "  Instruction gain %.2f for ",
+                  real_gain.to_double ());
          dump_insn_slim (dump_file, insn);
        }
-      gain += igain;
+      gain += real_gain;
     }
 
   if (dump_file)
-    fprintf (dump_file, "  Instruction conversion gain: %d\n", gain);
+    fprintf (dump_file, "  Instruction conversion gain: %.2f\n",
+            gain.to_double ());
 
   /* Cost the integer to sse and sse to integer moves.  */
   if (!optimize_function_for_size_p (cfun))
@@ -796,14 +826,14 @@ general_scalar_chain::compute_convert_gain ()
     }
 
   if (dump_file)
-    fprintf (dump_file, "  Registers conversion cost: %d\n", cost);
-
-  gain -= cost;
+    fprintf (dump_file, "  Registers conversion cost: %.2f\n",
+            cost.to_double ());
 
   if (dump_file)
-    fprintf (dump_file, "  Total gain: %d\n", gain);
+    fprintf (dump_file, "  Total gain: %.2f\n, total cost: %.2f",
+            gain.to_double (), cost.to_double ());
 
-  return gain;
+  return gain > cost ;
 }
 
 /* Insert generated conversion instruction sequence INSNS
@@ -1520,21 +1550,22 @@ timode_immed_const_gain (rtx cst)
   return 0;
 }
 
-/* Compute a gain for chain conversion.  */
+/* Return true it's cost profitable for for chain conversion.  */
 
-int
+bool
 timode_scalar_chain::compute_convert_gain ()
 {
   /* Assume that if we have to move TImode values between units,
      then transforming this chain isn't worth it.  */
-  if (n_sse_to_integer || n_integer_to_sse)
-    return -1;
+  if (n_sse_to_integer.to_int ()
+      || n_integer_to_sse.to_int ())
+    return false;
 
   bitmap_iterator bi;
   unsigned insn_uid;
 
   /* Split ties to prefer V1TImode when not optimizing for size.  */
-  int gain = optimize_size ? 0 : 1;
+  sreal gain = optimize_size ? 0 : 1;
 
   if (dump_file)
     fprintf (dump_file, "Computing gain for chain #%d...\n", chain_id);
@@ -1548,6 +1579,7 @@ timode_scalar_chain::compute_convert_gain ()
       HOST_WIDE_INT op1val;
       int scost, vcost;
       int igain = 0;
+      sreal sreal_count = get_bb_probability (BLOCK_FOR_INSN (insn));
 
       switch (GET_CODE (src))
        {
@@ -1754,18 +1786,21 @@ timode_scalar_chain::compute_convert_gain ()
          break;
        }
 
+      sreal real_gain = sreal_count * igain;
       if (igain != 0 && dump_file)
        {
-         fprintf (dump_file, "  Instruction gain %d for ", igain);
+         
+         fprintf (dump_file, "  Instruction gain %.2f for ", 
real_gain.to_double ());
          dump_insn_slim (dump_file, insn);
        }
-      gain += igain;
+      
+      gain += real_gain;
     }
 
   if (dump_file)
-    fprintf (dump_file, "  Total gain: %d\n", gain);
+    fprintf (dump_file, "  Total gain: %.2f\n", gain.to_double ());
 
-  return gain;
+  return gain > (sreal) 0;
 }
 
 /* Fix uses of converted REG in debug insns.  */
@@ -2561,7 +2596,7 @@ convert_scalars_to_vector (bool timode_p)
             conversions.  */
          if (chain->build (&candidates[i], uid, disallowed))
            {
-             if (chain->compute_convert_gain () > 0)
+             if (chain->compute_convert_gain ())
                converted_insns += chain->convert ();
              else if (dump_file)
                fprintf (dump_file, "Chain #%d conversion is not profitable\n",
diff --git a/gcc/config/i386/i386-features.h b/gcc/config/i386/i386-features.h
index 24b0c4ed0cd..cc40c5c9841 100644
--- a/gcc/config/i386/i386-features.h
+++ b/gcc/config/i386/i386-features.h
@@ -153,12 +153,12 @@ class scalar_chain
 
   bitmap insns_conv;
   hash_map<rtx, rtx> defs_map;
-  unsigned n_sse_to_integer;
-  unsigned n_integer_to_sse;
+  sreal n_sse_to_integer;
+  sreal n_integer_to_sse;
   auto_vec<rtx_insn *> control_flow_insns;
 
   bool build (bitmap candidates, unsigned insn_uid, bitmap disallowed);
-  virtual int compute_convert_gain () = 0;
+  virtual bool compute_convert_gain () = 0;
   int convert ();
 
  protected:
@@ -184,7 +184,7 @@ class general_scalar_chain : public scalar_chain
  public:
   general_scalar_chain (enum machine_mode smode_, enum machine_mode vmode_)
     : scalar_chain (smode_, vmode_) {}
-  int compute_convert_gain () final override;
+  bool compute_convert_gain () final override;
 
  private:
   void convert_insn (rtx_insn *insn) final override;
@@ -196,7 +196,7 @@ class timode_scalar_chain : public scalar_chain
 {
  public:
   timode_scalar_chain () : scalar_chain (TImode, V1TImode) {}
-  int compute_convert_gain () final override;
+  bool compute_convert_gain () final override;
 
  private:
   void fix_debug_reg_uses (rtx reg);
diff --git a/gcc/testsuite/gcc.target/i386/minmax-6.c 
b/gcc/testsuite/gcc.target/i386/minmax-6.c
index 615f919ba0a..b5b4363f931 100644
--- a/gcc/testsuite/gcc.target/i386/minmax-6.c
+++ b/gcc/testsuite/gcc.target/i386/minmax-6.c
@@ -15,4 +15,4 @@ UMVLine16Y_11 (short unsigned int * Pic, int y, int width)
 /* We do not want the RA to spill %esi for it's dual-use but using
    pmaxsd is OK.  */
 /* { dg-final { scan-assembler-not "rsp" { target { ! { ia32 } } } } } */
-/* { dg-final { scan-assembler "pmaxsd" } } */
+/* { dg-final { scan-assembler "pmaxsd" { target { ! ia32 } } } } */
-- 
2.34.1

[PATCH v2] Consider frequency in cost estimation when converting scalar to vector.

Reply via email to