This patch improves the cost/gain calculation used during the i386 backend's
SImode/DImode scalar-to-vector (STV) conversion pass.  The current code
handles loads and stores, but doesn't consider that converting other
scalar operations with a memory destination requires an explicit load
before and an explicit store after the vector equivalent.

To ease the review, the significant change looks like:

         /* For operations on memory operands, include the overhead
            of explicit load and store instructions.  */
         if (MEM_P (dst))
           igain += !optimize_insn_for_size_p ()
                    ? (m * (ix86_cost->int_load[2]
                            + ix86_cost->int_store[2])
                       - (ix86_cost->sse_load[sse_cost_idx] +
                          ix86_cost->sse_store[sse_cost_idx]))
                    : -COSTS_N_BYTES (8);

however the patch itself is complicated by a change in indentation
which leads to a number of lines with only whitespace changes.
For architectures where integer load/store costs are the same as
vector load/store costs, there should be no change without -Os/-Oz.

This patch has been tested on x86_64-pc-linux-gnu with make bootstrap
and make -k check, both with and without --target_board=unix{-m32}
with no new failures.  Ok for mainline?


2024-01-06  Roger Sayle  <ro...@nextmovesoftware.com>

gcc/ChangeLog
        PR target/113231
        * config/i386/i386-features.cc (compute_convert_gain): Include
        the overhead of explicit load and store (movd) instructions when
        converting non-store scalar operations with memory destinations.

gcc/testsuite/ChangeLog
        PR target/113231
        * gcc.target/i386/pr113231.c: New test case.


Thanks again,
Roger
--

diff --git a/gcc/config/i386/i386-features.cc b/gcc/config/i386/i386-features.cc
index 4ae3e75..3677aef 100644
--- a/gcc/config/i386/i386-features.cc
+++ b/gcc/config/i386/i386-features.cc
@@ -563,183 +563,195 @@ general_scalar_chain::compute_convert_gain ()
       else if (MEM_P (src) && REG_P (dst))
        igain += m * ix86_cost->int_load[2] - ix86_cost->sse_load[sse_cost_idx];
       else
-       switch (GET_CODE (src))
-         {
-         case ASHIFT:
-         case ASHIFTRT:
-         case LSHIFTRT:
-           if (m == 2)
-             {
-               if (INTVAL (XEXP (src, 1)) >= 32)
-                 igain += ix86_cost->add;
-               /* Gain for extend highpart case.  */
-               else if (GET_CODE (XEXP (src, 0)) == ASHIFT)
-                 igain += ix86_cost->shift_const - ix86_cost->sse_op;
-               else
-                 igain += ix86_cost->shift_const;
-             }
-
-           igain += ix86_cost->shift_const - ix86_cost->sse_op;
+       {
+         /* For operations on memory operands, include the overhead
+            of explicit load and store instructions.  */
+         if (MEM_P (dst))
+           igain += !optimize_insn_for_size_p ()
+                    ? (m * (ix86_cost->int_load[2]
+                            + ix86_cost->int_store[2])
+                       - (ix86_cost->sse_load[sse_cost_idx] +
+                          ix86_cost->sse_store[sse_cost_idx]))
+                    : -COSTS_N_BYTES (8);
 
-           if (CONST_INT_P (XEXP (src, 0)))
-             igain -= vector_const_cost (XEXP (src, 0));
-           break;
+         switch (GET_CODE (src))
+           {
+           case ASHIFT:
+           case ASHIFTRT:
+           case LSHIFTRT:
+             if (m == 2)
+               {
+                 if (INTVAL (XEXP (src, 1)) >= 32)
+                   igain += ix86_cost->add;
+                 /* Gain for extend highpart case.  */
+                 else if (GET_CODE (XEXP (src, 0)) == ASHIFT)
+                   igain += ix86_cost->shift_const - ix86_cost->sse_op;
+                 else
+                   igain += ix86_cost->shift_const;
+               }
 
-         case ROTATE:
-         case ROTATERT:
-           igain += m * ix86_cost->shift_const;
-           if (TARGET_AVX512VL)
-             igain -= ix86_cost->sse_op;
-           else if (smode == DImode)
-             {
-               int bits = INTVAL (XEXP (src, 1));
-               if ((bits & 0x0f) == 0)
-                 igain -= ix86_cost->sse_op;
-               else if ((bits & 0x07) == 0)
-                 igain -= 2 * ix86_cost->sse_op;
-               else
-                 igain -= 3 * ix86_cost->sse_op;
-             }
-           else if (INTVAL (XEXP (src, 1)) == 16)
-             igain -= ix86_cost->sse_op;
-           else
-             igain -= 2 * ix86_cost->sse_op;
-           break;
+             igain += ix86_cost->shift_const - ix86_cost->sse_op;
 
-         case AND:
-         case IOR:
-         case XOR:
-         case PLUS:
-         case MINUS:
-           igain += m * ix86_cost->add - ix86_cost->sse_op;
-           /* Additional gain for andnot for targets without BMI.  */
-           if (GET_CODE (XEXP (src, 0)) == NOT
-               && !TARGET_BMI)
-             igain += m * ix86_cost->add;
-
-           if (CONST_INT_P (XEXP (src, 0)))
-             igain -= vector_const_cost (XEXP (src, 0));
-           if (CONST_INT_P (XEXP (src, 1)))
-             igain -= vector_const_cost (XEXP (src, 1));
-           if (MEM_P (XEXP (src, 1)))
-             {
-               if (optimize_insn_for_size_p ())
-                 igain -= COSTS_N_BYTES (m == 2 ? 3 : 5);
-               else
-                 igain += m * ix86_cost->int_load[2]
-                          - ix86_cost->sse_load[sse_cost_idx];
-             }
-           break;
+             if (CONST_INT_P (XEXP (src, 0)))
+               igain -= vector_const_cost (XEXP (src, 0));
+             break;
 
-         case NEG:
-         case NOT:
-           igain -= ix86_cost->sse_op + COSTS_N_INSNS (1);
+           case ROTATE:
+           case ROTATERT:
+             igain += m * ix86_cost->shift_const;
+             if (TARGET_AVX512VL)
+               igain -= ix86_cost->sse_op;
+             else if (smode == DImode)
+               {
+                 int bits = INTVAL (XEXP (src, 1));
+                 if ((bits & 0x0f) == 0)
+                   igain -= ix86_cost->sse_op;
+                 else if ((bits & 0x07) == 0)
+                   igain -= 2 * ix86_cost->sse_op;
+                 else
+                   igain -= 3 * ix86_cost->sse_op;
+               }
+             else if (INTVAL (XEXP (src, 1)) == 16)
+               igain -= ix86_cost->sse_op;
+             else
+               igain -= 2 * ix86_cost->sse_op;
+             break;
 
-           if (GET_CODE (XEXP (src, 0)) != ABS)
-             {
+           case AND:
+           case IOR:
+           case XOR:
+           case PLUS:
+           case MINUS:
+             igain += m * ix86_cost->add - ix86_cost->sse_op;
+             /* Additional gain for andnot for targets without BMI.  */
+             if (GET_CODE (XEXP (src, 0)) == NOT
+                 && !TARGET_BMI)
                igain += m * ix86_cost->add;
-               break;
-             }
-           /* FALLTHRU */
-
-         case ABS:
-         case SMAX:
-         case SMIN:
-         case UMAX:
-         case UMIN:
-           /* We do not have any conditional move cost, estimate it as a
-              reg-reg move.  Comparisons are costed as adds.  */
-           igain += m * (COSTS_N_INSNS (2) + ix86_cost->add);
-           /* Integer SSE ops are all costed the same.  */
-           igain -= ix86_cost->sse_op;
-           break;
 
-         case COMPARE:
-           if (XEXP (src, 1) != const0_rtx)
-             {
-               /* cmp vs. pxor;pshufd;ptest.  */
-               igain += COSTS_N_INSNS (m - 3);
-             }
-           else if (GET_CODE (XEXP (src, 0)) != AND)
-             {
-               /* test vs. pshufd;ptest.  */
-               igain += COSTS_N_INSNS (m - 2);
-             }
-           else if (GET_CODE (XEXP (XEXP (src, 0), 0)) != NOT)
-             {
-               /* and;test vs. pshufd;ptest.  */
-               igain += COSTS_N_INSNS (2 * m - 2);
-             }
-           else if (TARGET_BMI)
-             {
-               /* andn;test vs. pandn;pshufd;ptest.  */
-               igain += COSTS_N_INSNS (2 * m - 3);
-             }
-           else
-             {
-               /* not;and;test vs. pandn;pshufd;ptest.  */
-               igain += COSTS_N_INSNS (3 * m - 3);
-             }
-           break;
+             if (CONST_INT_P (XEXP (src, 0)))
+               igain -= vector_const_cost (XEXP (src, 0));
+             if (CONST_INT_P (XEXP (src, 1)))
+               igain -= vector_const_cost (XEXP (src, 1));
+             if (MEM_P (XEXP (src, 1)))
+               {
+                 if (optimize_insn_for_size_p ())
+                   igain -= COSTS_N_BYTES (m == 2 ? 3 : 5);
+                 else
+                   igain += m * ix86_cost->int_load[2]
+                            - ix86_cost->sse_load[sse_cost_idx];
+               }
+             break;
 
-         case CONST_INT:
-           if (REG_P (dst))
-             {
-               if (optimize_insn_for_size_p ())
-                 {
-                   /* xor (2 bytes) vs. xorps (3 bytes).  */
-                   if (src == const0_rtx)
-                     igain -= COSTS_N_BYTES (1);
-                   /* movdi_internal vs. movv2di_internal.  */
-                   /* => mov (5 bytes) vs. movaps (7 bytes).  */
-                   else if (x86_64_immediate_operand (src, SImode))
-                     igain -= COSTS_N_BYTES (2);
-                   else
-                     /* ??? Larger immediate constants are placed in the
-                        constant pool, where the size benefit/impact of
-                        STV conversion is affected by whether and how
-                        often each constant pool entry is shared/reused.
-                        The value below is empirically derived from the
-                        CSiBE benchmark (and the optimal value may drift
-                        over time).  */
-                     igain += COSTS_N_BYTES (0);
-                 }
-               else
-                 {
-                   /* DImode can be immediate for TARGET_64BIT
-                      and SImode always.  */
-                   igain += m * COSTS_N_INSNS (1);
-                   igain -= vector_const_cost (src);
-                 }
-             }
-           else if (MEM_P (dst))
-             {
-               igain += (m * ix86_cost->int_store[2]
-                         - ix86_cost->sse_store[sse_cost_idx]);
-               igain -= vector_const_cost (src);
-             }
-           break;
+           case NEG:
+           case NOT:
+             igain -= ix86_cost->sse_op + COSTS_N_INSNS (1);
 
-         case VEC_SELECT:
-           if (XVECEXP (XEXP (src, 1), 0, 0) == const0_rtx)
-             {
-               // movd (4 bytes) replaced with movdqa (4 bytes).
-               if (!optimize_insn_for_size_p ())
-                 igain += ix86_cost->sse_to_integer - ix86_cost->xmm_move;
-             }
-           else
-             {
-               // pshufd; movd replaced with pshufd.
-               if (optimize_insn_for_size_p ())
-                 igain += COSTS_N_BYTES (4);
-               else
-                 igain += ix86_cost->sse_to_integer;
-             }
-           break;
+             if (GET_CODE (XEXP (src, 0)) != ABS)
+               {
+                 igain += m * ix86_cost->add;
+                 break;
+               }
+             /* FALLTHRU */
+
+           case ABS:
+           case SMAX:
+           case SMIN:
+           case UMAX:
+           case UMIN:
+             /* We do not have any conditional move cost, estimate it as a
+                reg-reg move.  Comparisons are costed as adds.  */
+             igain += m * (COSTS_N_INSNS (2) + ix86_cost->add);
+             /* Integer SSE ops are all costed the same.  */
+             igain -= ix86_cost->sse_op;
+             break;
 
-         default:
-           gcc_unreachable ();
-         }
+           case COMPARE:
+             if (XEXP (src, 1) != const0_rtx)
+               {
+                 /* cmp vs. pxor;pshufd;ptest.  */
+                 igain += COSTS_N_INSNS (m - 3);
+               }
+             else if (GET_CODE (XEXP (src, 0)) != AND)
+               {
+                 /* test vs. pshufd;ptest.  */
+                 igain += COSTS_N_INSNS (m - 2);
+               }
+             else if (GET_CODE (XEXP (XEXP (src, 0), 0)) != NOT)
+               {
+                 /* and;test vs. pshufd;ptest.  */
+                 igain += COSTS_N_INSNS (2 * m - 2);
+               }
+             else if (TARGET_BMI)
+               {
+                 /* andn;test vs. pandn;pshufd;ptest.  */
+                 igain += COSTS_N_INSNS (2 * m - 3);
+               }
+             else
+               {
+                 /* not;and;test vs. pandn;pshufd;ptest.  */
+                 igain += COSTS_N_INSNS (3 * m - 3);
+               }
+             break;
+
+           case CONST_INT:
+             if (REG_P (dst))
+               {
+                 if (optimize_insn_for_size_p ())
+                   {
+                     /* xor (2 bytes) vs. xorps (3 bytes).  */
+                     if (src == const0_rtx)
+                       igain -= COSTS_N_BYTES (1);
+                     /* movdi_internal vs. movv2di_internal.  */
+                     /* => mov (5 bytes) vs. movaps (7 bytes).  */
+                     else if (x86_64_immediate_operand (src, SImode))
+                       igain -= COSTS_N_BYTES (2);
+                     else
+                       /* ??? Larger immediate constants are placed in the
+                          constant pool, where the size benefit/impact of
+                          STV conversion is affected by whether and how
+                          often each constant pool entry is shared/reused.
+                          The value below is empirically derived from the
+                          CSiBE benchmark (and the optimal value may drift
+                          over time).  */
+                       igain += COSTS_N_BYTES (0);
+                   }
+                 else
+                   {
+                     /* DImode can be immediate for TARGET_64BIT
+                        and SImode always.  */
+                     igain += m * COSTS_N_INSNS (1);
+                     igain -= vector_const_cost (src);
+                   }
+               }
+             else if (MEM_P (dst))
+               {
+                 igain += (m * ix86_cost->int_store[2]
+                           - ix86_cost->sse_store[sse_cost_idx]);
+                 igain -= vector_const_cost (src);
+               }
+             break;
+
+           case VEC_SELECT:
+             if (XVECEXP (XEXP (src, 1), 0, 0) == const0_rtx)
+               {
+                 // movd (4 bytes) replaced with movdqa (4 bytes).
+                 if (!optimize_insn_for_size_p ())
+                   igain += ix86_cost->sse_to_integer - ix86_cost->xmm_move;
+               }
+             else
+               {
+                 // pshufd; movd replaced with pshufd.
+                 if (optimize_insn_for_size_p ())
+                   igain += COSTS_N_BYTES (4);
+                 else
+                   igain += ix86_cost->sse_to_integer;
+               }
+             break;
+
+           default:
+             gcc_unreachable ();
+           }
+       }
 
       if (igain != 0 && dump_file)
        {
diff --git a/gcc/testsuite/gcc.target/i386/pr113231.c b/gcc/testsuite/gcc.target/i386/pr113231.c
new file mode 100644
index 0000000..f9dcd9a
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr113231.c
@@ -0,0 +1,8 @@
+/* { dg-do compile } */
+/* { dg-options "-Os" } */
+
+void foo(int *i) { *i *= 2; }
+void bar(int *i) { *i <<= 2; }
+void baz(int *i) { *i >>= 2; }
+
+/* { dg-final { scan-assembler-not "movd" } } */

Reply via email to