Update in V3 > > > + cost_sse_integer = 0; > > > + weighted_cost_sse_integer = 0 ; > Extra space here. Changed.
> > > + : ix86_size_cost.sse_to_integer; > > Please be sure to not revert the changes from my patch adding > COSTS_N_INSNS (...) / 2 > here and some other places. Yes, keep the original cost, and weights it with bb_freq. > > I think with your change the size metrics will get less accurate. Old > code had hard-wired sizes which depends on ISA we use: > > else if (TARGET_64BIT || smode == SImode) > { > cost += n_sse_to_integer * COSTS_N_BYTES (4); > cost += n_integer_to_sse * COSTS_N_BYTES (4); > } > else if (TARGET_SSE4_1) > { > /* vmovd (4 bytes) + vpextrd (6 bytes). */ > cost += n_sse_to_integer * COSTS_N_BYTES (10); > /* vmovd (4 bytes) + vpinsrd (6 bytes). */ > cost += n_integer_to_sse * COSTS_N_BYTES (10); > } > else > { > /* movd (4 bytes) + psrlq (5 bytes) + movd (4 bytes). */ > cost += n_sse_to_integer * COSTS_N_BYTES (13); > /* movd (4 bytes) + movd (4 bytes) + unpckldq (4 bytes). */ > cost += n_integer_to_sse * COSTS_N_BYTES (12); > } > > While you now use common size_costs->sse_to_integer which will not be > able to distinguish these. > > Perhaps we want to have a function returning correct sizes depending on > ISA and use it here? (In future it may be useful to get correct size > costs for vectorizer too.) Also keep the hard-wired sizes. > > > } > > > > > > + if (speed_p) > > > + weighted_cost_sse_integer += bb_freq * cost; > > You use bb_freq just once, so it may be easier to just call the > function. Changed > > > - if (optimize_insn_for_size_p ()) > > > + if (!speed_p) > > If the size tables are right, we should be able to eliminate some of > these tests since same computation as for speed_p should work for size. > However sizes are somewhat approximate, so this needs to be done > carefully, so it is probably better done incremeentally. Yes, still use ix86_cost, and just refactor optimize_insn_for_size_p () with !speed_p. > Patch is OK with these changes. Thansk for review. Bootstrapped and regtested on x86_64-pc-linux-gnu{-m32,}. Ok for trunk? 
In some benchmark, I notice stv failed due to cost unprofitable, but the igain is inside the loop, but sse<->integer conversion is outside the loop, current cost model doesn't consider the frequency of those gain/cost. The patch weights those cost with frequency. gcc/ChangeLog: * config/i386/i386-features.cc (scalar_chain::mark_dual_mode_def): Weight cost of integer<->sse move with bb frequency when it's optimized_for_speed_p. (general_scalar_chain::compute_convert_gain): Ditto, and adjust function prototype to return true/false when cost model is profitable or not. (timode_scalar_chain::compute_convert_gain): Ditto. (convert_scalars_to_vector): Adjust after the upper two function prototype are changed. * config/i386/i386-features.h (class scalar_chain): Change n_integer_to_sse/n_sse_to_integer to cost_sse_integer, and add weighted_cost_sse_integer. (class general_scalar_chain): Adjust prototype to return bool instead of int. (class timode_scalar_chain): Ditto. --- gcc/config/i386/i386-features.cc | 177 +++++++++++++++++-------- gcc/config/i386/i386-features.h | 11 +- 2 files changed, 105 insertions(+), 83 deletions(-) diff --git a/gcc/config/i386/i386-features.cc b/gcc/config/i386/i386-features.cc index cc8313bd292..6491c6b063f 100644 --- a/gcc/config/i386/i386-features.cc +++ b/gcc/config/i386/i386-features.cc @@ -296,9 +296,8 @@ scalar_chain::scalar_chain (enum machine_mode smode_, enum machine_mode vmode_) insns_conv = BITMAP_ALLOC (NULL); queue = NULL; - n_sse_to_integer = 0; - n_integer_to_sse = 0; - + cost_sse_integer = 0; + weighted_cost_sse_integer = 0; max_visits = x86_stv_max_visits; } @@ -337,20 +336,52 @@ scalar_chain::mark_dual_mode_def (df_ref def) /* Record the def/insn pair so we can later efficiently iterate over the defs to convert on insns not in the chain.
*/ bool reg_new = bitmap_set_bit (defs_conv, DF_REF_REGNO (def)); + basic_block bb = BLOCK_FOR_INSN (DF_REF_INSN (def)); + profile_count entry_count = ENTRY_BLOCK_PTR_FOR_FN (cfun)->count; + bool speed_p = optimize_bb_for_speed_p (bb); + int cost = 0; + if (!bitmap_bit_p (insns, DF_REF_INSN_UID (def))) { if (!bitmap_set_bit (insns_conv, DF_REF_INSN_UID (def)) && !reg_new) return; - n_integer_to_sse++; + + /* Cost integer to sse moves. */ + if (speed_p) + cost = COSTS_N_INSNS (ix86_cost->integer_to_sse) / 2; + else if (TARGET_64BIT || smode == SImode) + cost = COSTS_N_BYTES (4); + /* vmovd (4 bytes) + vpinsrd (6 bytes). */ + else if (TARGET_SSE4_1) + cost = COSTS_N_BYTES (10); + /* movd (4 bytes) + movd (4 bytes) + unpckldq (4 bytes). */ + else + cost = COSTS_N_BYTES (12); } else { if (!reg_new) return; - n_sse_to_integer++; + + /* Cost sse to integer moves. */ + if (speed_p) + cost = COSTS_N_INSNS (ix86_cost->sse_to_integer) / 2; + else if (TARGET_64BIT || smode == SImode) + cost = COSTS_N_BYTES (4); + /* vmovd (4 bytes) + vpextrd (6 bytes). */ + else if (TARGET_SSE4_1) + cost = COSTS_N_BYTES (10); + /* movd (4 bytes) + psrlq (5 bytes) + movd (4 bytes). */ + else + cost = COSTS_N_BYTES (13); } + if (speed_p) + weighted_cost_sse_integer += bb->count.to_sreal_scale (entry_count) * cost; + + cost_sse_integer += cost; + if (dump_file) fprintf (dump_file, " Mark r%d def in insn %d as requiring both modes in chain #%d\n", @@ -531,15 +562,15 @@ general_scalar_chain::vector_const_cost (rtx exp, basic_block bb) return COSTS_N_INSNS (ix86_cost->sse_load[smode == DImode ? 1 : 0]) / 2; } -/* Compute a gain for chain conversion. */ +/* Return true if it's cost profitable for chain conversion. 
*/ -int +bool general_scalar_chain::compute_convert_gain () { bitmap_iterator bi; unsigned insn_uid; int gain = 0; - int cost = 0; + sreal weighted_gain = 0; if (dump_file) fprintf (dump_file, "Computing gain for chain #%d...\n", chain_id); @@ -559,10 +590,13 @@ general_scalar_chain::compute_convert_gain () rtx dst = SET_DEST (def_set); basic_block bb = BLOCK_FOR_INSN (insn); int igain = 0; + profile_count entry_count = ENTRY_BLOCK_PTR_FOR_FN (cfun)->count; + bool speed_p = optimize_bb_for_speed_p (bb); + sreal bb_freq = bb->count.to_sreal_scale (entry_count); if (REG_P (src) && REG_P (dst)) { - if (optimize_bb_for_size_p (bb)) + if (!speed_p) /* reg-reg move is 2 bytes, while SSE 3. */ igain += COSTS_N_BYTES (2 * m - 3); else @@ -571,7 +605,7 @@ general_scalar_chain::compute_convert_gain () } else if (REG_P (src) && MEM_P (dst)) { - if (optimize_bb_for_size_p (bb)) + if (!speed_p) /* Integer load/store is 3+ bytes and SSE 4+. */ igain += COSTS_N_BYTES (3 * m - 4); else @@ -581,7 +615,7 @@ general_scalar_chain::compute_convert_gain () } else if (MEM_P (src) && REG_P (dst)) { - if (optimize_bb_for_size_p (bb)) + if (!speed_p) igain += COSTS_N_BYTES (3 * m - 4); else igain += COSTS_N_INSNS (m * ix86_cost->int_load[2] @@ -593,7 +627,7 @@ general_scalar_chain::compute_convert_gain () of explicit load and store instructions. */ if (MEM_P (dst)) { - if (optimize_bb_for_size_p (bb)) + if (!speed_p) /* ??? This probably should account size difference of SSE and integer load rather than full SSE load. */ igain -= COSTS_N_BYTES (8); @@ -667,7 +701,7 @@ general_scalar_chain::compute_convert_gain () igain -= vector_const_cost (XEXP (src, 1), bb); if (MEM_P (XEXP (src, 1))) { - if (optimize_bb_for_size_p (bb)) + if (!speed_p) igain -= COSTS_N_BYTES (m == 2 ? 3 : 5); else igain += COSTS_N_INSNS @@ -730,7 +764,7 @@ general_scalar_chain::compute_convert_gain () case CONST_INT: if (REG_P (dst)) { - if (optimize_bb_for_size_p (bb)) + if (!speed_p) { /* xor (2 bytes) vs. 
xorps (3 bytes). */ if (src == const0_rtx) igain += COSTS_N_BYTES (1); @@ -769,14 +803,14 @@ general_scalar_chain::compute_convert_gain () if (XVECEXP (XEXP (src, 1), 0, 0) == const0_rtx) { // movd (4 bytes) replaced with movdqa (4 bytes). - if (!optimize_bb_for_size_p (bb)) + if (speed_p) igain += COSTS_N_INSNS (ix86_cost->sse_to_integer - ix86_cost->xmm_move) / 2; } else { // pshufd; movd replaced with pshufd. - if (optimize_bb_for_size_p (bb)) + if (!speed_p) igain += COSTS_N_BYTES (4); else igain += ix86_cost->sse_to_integer; @@ -788,55 +822,34 @@ } } + if (speed_p) + weighted_gain += bb_freq * igain; + gain += igain; + if (igain != 0 && dump_file) { - fprintf (dump_file, " Instruction gain %d for ", igain); + fprintf (dump_file, " Instruction gain %d with bb_freq %.2f for ", + igain, bb_freq.to_double ()); dump_insn_slim (dump_file, insn); } - gain += igain; } if (dump_file) - fprintf (dump_file, " Instruction conversion gain: %d\n", gain); - - /* Cost the integer to sse and sse to integer moves. */ - if (!optimize_function_for_size_p (cfun)) { - cost += n_sse_to_integer * COSTS_N_INSNS (ix86_cost->sse_to_integer) / 2; - /* ??? integer_to_sse but we only have that in the RA cost table. - Assume sse_to_integer/integer_to_sse are the same which they - are at the moment. */ - cost += n_integer_to_sse * COSTS_N_INSNS (ix86_cost->integer_to_sse) / 2; + fprintf (dump_file, " Instruction conversion gain: %d\n", + gain); + fprintf (dump_file, " Registers conversion cost: %d\n", + cost_sse_integer); + fprintf (dump_file, " Weighted instruction conversion gain: %.2f\n", + weighted_gain.to_double ()); + fprintf (dump_file, " Weighted registers conversion cost: %.2f\n", + weighted_cost_sse_integer.to_double ()); } - else if (TARGET_64BIT || smode == SImode) - { - cost += n_sse_to_integer * COSTS_N_BYTES (4); - cost += n_integer_to_sse * COSTS_N_BYTES (4); - } - else if (TARGET_SSE4_1) - { - /* vmovd (4 bytes) + vpextrd (6 bytes).
*/ - cost += n_sse_to_integer * COSTS_N_BYTES (10); - /* vmovd (4 bytes) + vpinsrd (6 bytes). */ - cost += n_integer_to_sse * COSTS_N_BYTES (10); - } - else - { - /* movd (4 bytes) + psrlq (5 bytes) + movd (4 bytes). */ - cost += n_sse_to_integer * COSTS_N_BYTES (13); - /* movd (4 bytes) + movd (4 bytes) + unpckldq (4 bytes). */ - cost += n_integer_to_sse * COSTS_N_BYTES (12); - } - - if (dump_file) - fprintf (dump_file, " Registers conversion cost: %d\n", cost); - - gain -= cost; - if (dump_file) - fprintf (dump_file, " Total gain: %d\n", gain); - - return gain; + if (weighted_gain != weighted_cost_sse_integer) + return weighted_gain > weighted_cost_sse_integer; + else + return gain > cost_sse_integer; } /* Insert generated conversion instruction sequence INSNS @@ -1553,21 +1566,22 @@ timode_immed_const_gain (rtx cst, basic_block bb) return 0; } -/* Compute a gain for chain conversion. */ +/* Return true if it's cost profitable for chain conversion. */ -int +bool timode_scalar_chain::compute_convert_gain () { /* Assume that if we have to move TImode values between units, then transforming this chain isn't worth it. */ - if (n_sse_to_integer || n_integer_to_sse) - return -1; + if (cost_sse_integer) + return false; bitmap_iterator bi; unsigned insn_uid; /* Split ties to prefer V1TImode when not optimizing for size. */ int gain = optimize_size ? 0 : 1; + sreal weighted_gain = 0; if (dump_file) fprintf (dump_file, "Computing gain for chain #%d...\n", chain_id); @@ -1582,32 +1596,33 @@ timode_scalar_chain::compute_convert_gain () basic_block bb = BLOCK_FOR_INSN (insn); int scost, vcost; int igain = 0; + profile_count entry_count = ENTRY_BLOCK_PTR_FOR_FN (cfun)->count; + bool speed_p = optimize_bb_for_speed_p (bb); + sreal bb_freq = bb->count.to_sreal_scale (entry_count); switch (GET_CODE (src)) { case REG: - if (optimize_bb_for_size_p (bb)) + if (!speed_p) igain = MEM_P (dst) ?
COSTS_N_BYTES (6) : COSTS_N_BYTES (3); else igain = COSTS_N_INSNS (1); break; case MEM: - igain = optimize_bb_for_size_p (bb) ? COSTS_N_BYTES (7) - : COSTS_N_INSNS (1); + igain = !speed_p ? COSTS_N_BYTES (7) : COSTS_N_INSNS (1); break; case CONST_INT: if (MEM_P (dst) && standard_sse_constant_p (src, V1TImode)) - igain = optimize_bb_for_size_p (bb) ? COSTS_N_BYTES (11) : 1; + igain = !speed_p ? COSTS_N_BYTES (11) : 1; break; case CONST_WIDE_INT: /* 2 x mov vs. vmovdqa. */ if (MEM_P (dst)) - igain = optimize_bb_for_size_p (bb) ? COSTS_N_BYTES (3) - : COSTS_N_INSNS (1); + igain = !speed_p ? COSTS_N_BYTES (3) : COSTS_N_INSNS (1); break; case NOT: @@ -1628,7 +1643,7 @@ timode_scalar_chain::compute_convert_gain () case LSHIFTRT: /* See ix86_expand_v1ti_shift. */ op1val = INTVAL (XEXP (src, 1)); - if (optimize_bb_for_size_p (bb)) + if (!speed_p) { if (op1val == 64 || op1val == 65) scost = COSTS_N_BYTES (5); @@ -1662,7 +1677,7 @@ timode_scalar_chain::compute_convert_gain () case ASHIFTRT: /* See ix86_expand_v1ti_ashiftrt. */ op1val = INTVAL (XEXP (src, 1)); - if (optimize_bb_for_size_p (bb)) + if (!speed_p) { if (op1val == 64 || op1val == 127) scost = COSTS_N_BYTES (7); @@ -1740,7 +1755,7 @@ timode_scalar_chain::compute_convert_gain () case ROTATERT: /* See ix86_expand_v1ti_rotate. */ op1val = INTVAL (XEXP (src, 1)); - if (optimize_bb_for_size_p (bb)) + if (!speed_p) { scost = COSTS_N_BYTES (13); if ((op1val & 31) == 0) @@ -1772,34 +1787,40 @@ timode_scalar_chain::compute_convert_gain () { if (GET_CODE (XEXP (src, 0)) == AND) /* and;and;or (9 bytes) vs. ptest (5 bytes). */ - igain = optimize_bb_for_size_p (bb) ? COSTS_N_BYTES (4) - : COSTS_N_INSNS (2); + igain = !speed_p ? COSTS_N_BYTES (4) : COSTS_N_INSNS (2); /* or (3 bytes) vs. ptest (5 bytes). */ - else if (optimize_bb_for_size_p (bb)) + else if (!speed_p) igain = -COSTS_N_BYTES (2); } else if (XEXP (src, 1) == const1_rtx) /* and;cmp -1 (7 bytes) vs. pcmpeqd;pxor;ptest (13 bytes). 
*/ - igain = optimize_bb_for_size_p (bb) ? -COSTS_N_BYTES (6) - : -COSTS_N_INSNS (1); + igain = !speed_p ? -COSTS_N_BYTES (6) : -COSTS_N_INSNS (1); break; default: break; } + gain += igain; + if (speed_p) + weighted_gain += bb_freq * igain; + if (igain != 0 && dump_file) { - fprintf (dump_file, " Instruction gain %d for ", igain); + fprintf (dump_file, " Instruction gain %d with bb_freq %.2f for ", + igain, bb_freq.to_double ()); dump_insn_slim (dump_file, insn); } - gain += igain; } if (dump_file) - fprintf (dump_file, " Total gain: %d\n", gain); + fprintf (dump_file, " Total gain: %d, weighted gain %.2f\n", + gain, weighted_gain.to_double ()); - return gain; + if (weighted_gain > (sreal) 0) + return true; + else + return gain > 0; } /* Fix uses of converted REG in debug insns. */ @@ -2595,7 +2616,7 @@ convert_scalars_to_vector (bool timode_p) conversions. */ if (chain->build (&candidates[i], uid, disallowed)) { - if (chain->compute_convert_gain () > 0) + if (chain->compute_convert_gain ()) converted_insns += chain->convert (); else if (dump_file) fprintf (dump_file, "Chain #%d conversion is not profitable\n", diff --git a/gcc/config/i386/i386-features.h b/gcc/config/i386/i386-features.h index 7f7c0f78c96..e3719b31529 100644 --- a/gcc/config/i386/i386-features.h +++ b/gcc/config/i386/i386-features.h @@ -153,12 +153,13 @@ class scalar_chain bitmap insns_conv; hash_map<rtx, rtx> defs_map; - unsigned n_sse_to_integer; - unsigned n_integer_to_sse; + /* Cost of inserted conversion between integer and sse.
*/ + int cost_sse_integer; + sreal weighted_cost_sse_integer; auto_vec<rtx_insn *> control_flow_insns; bool build (bitmap candidates, unsigned insn_uid, bitmap disallowed); - virtual int compute_convert_gain () = 0; + virtual bool compute_convert_gain () = 0; int convert (); protected: @@ -184,7 +185,7 @@ class general_scalar_chain : public scalar_chain public: general_scalar_chain (enum machine_mode smode_, enum machine_mode vmode_) : scalar_chain (smode_, vmode_) {} - int compute_convert_gain () final override; + bool compute_convert_gain () final override; private: void convert_insn (rtx_insn *insn) final override; @@ -196,7 +197,7 @@ class timode_scalar_chain : public scalar_chain { public: timode_scalar_chain () : scalar_chain (TImode, V1TImode) {} - int compute_convert_gain () final override; + bool compute_convert_gain () final override; private: void fix_debug_reg_uses (rtx reg); -- 2.34.1