https://gcc.gnu.org/g:0650ea627399a0ef23db434d4fce6b52b9faf557

commit r16-89-g0650ea627399a0ef23db434d4fce6b52b9faf557
Author: Jan Hubicka <hubi...@ucw.cz>
Date:   Tue Apr 22 23:47:14 2025 +0200

    Fix vectorizer costs of COND_EXPR, MIN_EXPR, MAX_EXPR, ABS_EXPR, ABSU_EXPR
    
    This patch adds special cases for vectorizer costs of COND_EXPR, MIN_EXPR,
    MAX_EXPR, ABS_EXPR and ABSU_EXPR.  We previously costed ABS_EXPR and
    ABSU_EXPR, but that was only correct for the FP variant (where it
    corresponds to andss clearing the sign bit).  Integer abs/absu is open
    coded as a conditional move for SSE2; SSE3 introduced a dedicated
    instruction.
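
    For illustration only (not part of the commit), the integer open coding
    being costed here roughly corresponds to the intrinsics sketch below; the
    32-bit lane width and the helper names are assumptions of the example.

        #include <immintrin.h>

        /* SSE2: abs open coded as psrad + pxor + psubd (3 ops), matching
           the 3x multiplier the patch applies on pre-SSSE3 targets.  */
        static inline __m128i abs_epi32_sse2 (__m128i x)
        {
          __m128i sign = _mm_srai_epi32 (x, 31);   /* 0 or -1 in each lane.  */
          return _mm_sub_epi32 (_mm_xor_si128 (x, sign), sign);
        }

        #ifdef __SSSE3__
        /* Newer ISAs provide a single instruction (pabsd).  */
        static inline __m128i abs_epi32_single (__m128i x)
        {
          return _mm_abs_epi32 (x);
        }
        #endif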
    
    MIN_EXPR/MAX_EXPR compile to minss/maxss for FP, and according to Agner
    Fog's tables they cost the same as sse_op on all targets.  The integer
    variants translate to a single instruction since SSE3.
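
    A hedged sketch (not part of the commit) of the integer lowering whose
    cost this models; signed 32-bit elements and the helper names are
    assumptions of the example.

        #include <immintrin.h>

        /* Pre-SSE4.1: signed 32-bit min open coded as compare + blend
           (pcmpgtd + pand + pandn + por, 4 ops), matching the 4x multiplier
           the patch applies on pre-SSSE3 targets.  */
        static inline __m128i min_epi32_sse2 (__m128i a, __m128i b)
        {
          __m128i gt = _mm_cmpgt_epi32 (a, b);            /* lanes where a > b.  */
          return _mm_or_si128 (_mm_and_si128 (gt, b),     /* b where a > b,      */
                               _mm_andnot_si128 (gt, a)); /* a elsewhere.        */
        }

        #ifdef __SSE4_1__
        /* Single-instruction form (pminsd).  */
        static inline __m128i min_epi32_single (__m128i a, __m128i b)
        {
          return _mm_min_epi32 (a, b);
        }
        #endif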
    
    COND_EXPR translates to an open-coded conditional move for SSE2; SSE4.1
    simplified the sequence and AVX512 introduced mask registers.
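
    A minimal sketch (not part of the commit) of the three sequences the new
    COND_EXPR costs are meant to track, assuming 32-bit integer lanes; the
    function names are illustrative.

        #include <immintrin.h>

        /* SSE2: 4 ops, pcmpgtd + pand + pandn + por.  */
        static inline __m128i select_sse2 (__m128i a, __m128i b,
                                           __m128i x, __m128i y)
        {
          __m128i m = _mm_cmpgt_epi32 (a, b);
          return _mm_or_si128 (_mm_and_si128 (m, x), _mm_andnot_si128 (m, y));
        }

        #ifdef __SSE4_1__
        /* SSE4.1: 2 ops, compare + variable blend.  */
        static inline __m128i select_sse41 (__m128i a, __m128i b,
                                            __m128i x, __m128i y)
        {
          return _mm_blendv_epi8 (y, x, _mm_cmpgt_epi32 (a, b));
        }
        #endif

        #ifdef __AVX512F__
        /* AVX512: compare into a mask register, then a masked blend.  */
        static inline __m512i select_avx512 (__m512i a, __m512i b,
                                             __m512i x, __m512i y)
        {
          __mmask16 k = _mm512_cmpgt_epi32_mask (a, b);
          return _mm512_mask_blend_epi32 (k, y, x);
        }
        #endif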
    
    gcc/ChangeLog:
    
            * config/i386/i386.cc (ix86_vector_costs::add_stmt_cost): Add
            special cases for COND_EXPR; make MIN_EXPR, MAX_EXPR, ABS_EXPR
            and ABSU_EXPR more realistic.
    
    gcc/testsuite/ChangeLog:
    
            * gcc.target/i386/pr89618-2.c: XFAIL.

Diff:
---
 gcc/config/i386/i386.cc                   | 95 ++++++++++++++++++++++++++++---
 gcc/testsuite/gcc.target/i386/pr89618-2.c |  8 ++-
 2 files changed, 92 insertions(+), 11 deletions(-)

diff --git a/gcc/config/i386/i386.cc b/gcc/config/i386/i386.cc
index d15f91ddd2cb..aef41454d9d5 100644
--- a/gcc/config/i386/i386.cc
+++ b/gcc/config/i386/i386.cc
@@ -25300,7 +25300,7 @@ ix86_vector_costs::add_stmt_cost (int count, vect_cost_for_stmt kind,
              else if (X87_FLOAT_MODE_P (mode))
                stmt_cost = ix86_cost->fadd;
              else
-               stmt_cost = ix86_cost->add;
+               stmt_cost = ix86_cost->add;
            }
          else
            stmt_cost = ix86_vec_cost (mode, fp ? ix86_cost->addss
@@ -25355,7 +25355,7 @@ ix86_vector_costs::add_stmt_cost (int count, vect_cost_for_stmt kind,
                            (subcode == RSHIFT_EXPR
                             && !TYPE_UNSIGNED (TREE_TYPE (op1)))
                            ? ASHIFTRT : LSHIFTRT, mode,
-                           TREE_CODE (op2) == INTEGER_CST,
+                           TREE_CODE (op2) == INTEGER_CST,
                            cst_and_fits_in_hwi (op2)
                            ? int_cst_value (op2) : -1,
                            false, false, NULL, NULL);
@@ -25364,7 +25364,7 @@ ix86_vector_costs::add_stmt_cost (int count, vect_cost_for_stmt kind,
        case NOP_EXPR:
          /* Only sign-conversions are free.  */
          if (tree_nop_conversion_p
-               (TREE_TYPE (gimple_assign_lhs (stmt_info->stmt)),
+               (TREE_TYPE (gimple_assign_lhs (stmt_info->stmt)),
                 TREE_TYPE (gimple_assign_rhs1 (stmt_info->stmt))))
            stmt_cost = 0;
          else if (fp)
@@ -25372,17 +25372,94 @@ ix86_vector_costs::add_stmt_cost (int count, vect_cost_for_stmt kind,
                          (ix86_tune_cost, GET_MODE_BITSIZE (mode));
          break;
 
-       case BIT_IOR_EXPR:
-       case ABS_EXPR:
-       case ABSU_EXPR:
+       case COND_EXPR:
+         {
+           /* SSE2 conditional move sequence is:
+                pcmpgtd %xmm5, %xmm0
+                pand    %xmm0, %xmm2
+                pandn   %xmm1, %xmm0
+                por     %xmm2, %xmm0
+              while SSE4 uses cmp + blend
+              and AVX512 masked moves.  */
+
+           int ninsns = TARGET_SSE4_1 ? 2 : 4;
+
+           if (SSE_FLOAT_MODE_SSEMATH_OR_HFBF_P (mode))
+             stmt_cost = ninsns * ix86_cost->sse_op;
+           else if (X87_FLOAT_MODE_P (mode))
+             /* x87 requires conditional branch.  We don't have cost for
+                that.  */
+             ;
+           else if (VECTOR_MODE_P (mode))
+             stmt_cost = ix86_vec_cost (mode, ninsns * ix86_cost->sse_op);
+           else
+             /* compare + cmov.  */
+             stmt_cost = ix86_cost->add * 2;
+         }
+         break;
+
        case MIN_EXPR:
        case MAX_EXPR:
+         if (fp)
+           {
+             if (X87_FLOAT_MODE_P (mode))
+               /* x87 requires conditional branch.  We don't have cost for
+                  that.  */
+               ;
+             else
+               /* minss  */
+               stmt_cost = ix86_vec_cost (mode, ix86_cost->sse_op);
+           }
+         else
+           {
+             if (VECTOR_MODE_P (mode))
+               {
+                 stmt_cost = ix86_vec_cost (mode, ix86_cost->sse_op);
+                 /* vpmin was introduced in SSE3.
+                    SSE2 needs pcmpgtd + pand + pandn + pxor.  */
+                 if (!TARGET_SSSE3)
+                   stmt_cost *= 4;
+               }
+             else
+               /* cmp + cmov.  */
+               stmt_cost = ix86_cost->add * 2;
+           }
+         break;
+
+       case ABS_EXPR:
+       case ABSU_EXPR:
+         if (fp)
+           {
+             if (X87_FLOAT_MODE_P (mode))
+               /* fabs.  */
+               stmt_cost = ix86_cost->fabs;
+             else
+               /* andss of sign bit.  */
+               stmt_cost = ix86_vec_cost (mode, ix86_cost->sse_op);
+           }
+         else
+           {
+             if (VECTOR_MODE_P (mode))
+               {
+                 stmt_cost = ix86_vec_cost (mode, ix86_cost->sse_op);
+                 /* vabs was introduced in SSE3.
+                    SSE3 uses psrad + pxor + psub.  */
+                 if (!TARGET_SSSE3)
+                   stmt_cost *= 3;
+               }
+             else
+               /* neg + cmov.  */
+               stmt_cost = ix86_cost->add * 2;
+           }
+         break;
+
+       case BIT_IOR_EXPR:
        case BIT_XOR_EXPR:
        case BIT_AND_EXPR:
        case BIT_NOT_EXPR:
-         if (SSE_FLOAT_MODE_SSEMATH_OR_HFBF_P (mode))
-           stmt_cost = ix86_cost->sse_op;
-         else if (VECTOR_MODE_P (mode))
+         gcc_assert (!SSE_FLOAT_MODE_SSEMATH_OR_HFBF_P (mode)
+                     && !X87_FLOAT_MODE_P (mode));
+         if (VECTOR_MODE_P (mode))
            stmt_cost = ix86_vec_cost (mode, ix86_cost->sse_op);
          else
            stmt_cost = ix86_cost->add;
diff --git a/gcc/testsuite/gcc.target/i386/pr89618-2.c b/gcc/testsuite/gcc.target/i386/pr89618-2.c
index c414053b8eca..11d658f52a46 100644
--- a/gcc/testsuite/gcc.target/i386/pr89618-2.c
+++ b/gcc/testsuite/gcc.target/i386/pr89618-2.c
@@ -19,5 +19,9 @@ void foo (int n, int *off, double *a)
 }
 
 /* Make sure the cost model selects SSE vectors rather than AVX to avoid
-   too many scalar ops for the address computes in the loop.  */
-/* { dg-final { scan-tree-dump "loop vectorized using 16 byte vectors" "vect" { target { ! ia32 } } } } */
+   too many scalar ops for the address computes in the loop. 
+  
+   Since open-coded scatters are costed wrong, we no longer vectorize after fixing
+   COND_EXPR costs.  See PR119902.  */
+/* { dg-final { scan-tree-dump "loop vectorized using 16 byte vectors" "vect" { target { ! ia32 } xfail *-*-*  } } } */
+/* { dg-final { scan-tree-dump-not "loop vectorized using 32 byte vectors" "vect" { target { ! ia32 } } } } */
