https://gcc.gnu.org/g:dbd2fd4ac086d62dbd733cc591a1126bd5e12e1a

commit dbd2fd4ac086d62dbd733cc591a1126bd5e12e1a
Author: Tamar Christina <[email protected]>
Date:   Mon Jun 9 07:03:27 2025 +0100

    middle-end: Add new parameter to scale scalar loop costing in vectorizer
    
    This patch adds a new param vect-scalar-cost-multiplier to scale the scalar
    costing during vectorization.  If the multiplier is set high enough when
    using the dynamic cost model, it effectively disables the costing vs scalar
    and assumes all vectorization to be profitable.
    
    This is similar to using the unlimited cost model, but unlike unlimited it
    does not fully disable the vector cost model.  That means that we still
    perform comparisons between vector modes.  And it means it also still does
    costing for alias analysis.
    
    As an example, the following:
    
    void
    foo (char *restrict a, int *restrict b, int *restrict c,
         int *restrict d, int stride)
    {
        if (stride <= 1)
            return;
    
        for (int i = 0; i < 3; i++)
            {
                int res = c[i];
                int t = b[i * stride];
                if (a[i] != 0)
                    res = t * d[i];
                c[i] = res;
            }
    }
    
    compiled with -O3 -march=armv8-a+sve -fvect-cost-model=dynamic fails to
    vectorize as it assumes scalar would be faster, and with
    -fvect-cost-model=unlimited it picks a vector type that's so big that the
    large sequence generated is working on mostly inactive lanes:
    
            ...
            and     p3.b, p3/z, p4.b, p4.b
            whilelo p0.s, wzr, w7
            ld1w    z23.s, p3/z, [x3, #3, mul vl]
            ld1w    z28.s, p0/z, [x5, z31.s, sxtw 2]
            add     x0, x5, x0
            punpklo p6.h, p6.b
            ld1w    z27.s, p4/z, [x0, z31.s, sxtw 2]
            and     p6.b, p6/z, p0.b, p0.b
            punpklo p4.h, p7.b
            ld1w    z24.s, p6/z, [x3, #2, mul vl]
            and     p4.b, p4/z, p2.b, p2.b
            uqdecw  w6
            ld1w    z26.s, p4/z, [x3]
            whilelo p1.s, wzr, w6
            mul     z27.s, p5/m, z27.s, z23.s
            ld1w    z29.s, p1/z, [x4, z31.s, sxtw 2]
            punpkhi p7.h, p7.b
            mul     z24.s, p5/m, z24.s, z28.s
            and     p7.b, p7/z, p1.b, p1.b
            mul     z26.s, p5/m, z26.s, z30.s
            ld1w    z25.s, p7/z, [x3, #1, mul vl]
            st1w    z27.s, p3, [x2, #3, mul vl]
            mul     z25.s, p5/m, z25.s, z29.s
            st1w    z24.s, p6, [x2, #2, mul vl]
            st1w    z25.s, p7, [x2, #1, mul vl]
            st1w    z26.s, p4, [x2]
            ...
    
    With -fvect-cost-model=dynamic --param vect-scalar-cost-multiplier=200
    you get more reasonable code:
    
    foo:
            cmp     w4, 1
            ble     .L1
            ptrue   p7.s, vl3
            index   z0.s, #0, w4
            ld1b    z29.s, p7/z, [x0]
            ld1w    z30.s, p7/z, [x1, z0.s, sxtw 2]
            ptrue   p6.b, all
            cmpne   p7.b, p7/z, z29.b, #0
            ld1w    z31.s, p7/z, [x3]
            mul     z31.s, p6/m, z31.s, z30.s
            st1w    z31.s, p7, [x2]
    .L1:
            ret
    
    This model has been useful internally for performance exploration and
    cost-model validation.  It allows us to force realistic vectorization,
    overriding the cost model, to be able to tell whether it's correct wrt
    profitability.
    
    gcc/ChangeLog:
    
            * params.opt (vect-scalar-cost-multiplier): New.
            * tree-vect-loop.cc (vect_estimate_min_profitable_iters): Use it.
            * doc/invoke.texi (vect-scalar-cost-multiplier): Document it.
    
    gcc/testsuite/ChangeLog:
    
            * gcc.target/aarch64/sve/cost_model_16.c: New test.
    
    (cherry picked from commit 4238e3470d3fa9b9697f9cf6ad26d4ef76fdf248)

Diff:
---
 gcc/doc/invoke.texi                                 |  5 +++++
 gcc/params.opt                                      |  4 ++++
 .../gcc.target/aarch64/sve/cost_model_16.c          | 21 +++++++++++++++++++++
 gcc/tree-vect-loop.cc                               |  3 ++-
 4 files changed, 32 insertions(+), 1 deletion(-)

diff --git a/gcc/doc/invoke.texi b/gcc/doc/invoke.texi
index bee70fdd8389..3a54aecc093a 100644
--- a/gcc/doc/invoke.texi
+++ b/gcc/doc/invoke.texi
@@ -17241,9 +17241,14 @@ this parameter.  The default value of this parameter is 50.
 @item vect-induction-float
 Enable loop vectorization of floating point inductions.
 
+
 @item vect-force-slp
 Force the use of SLP when vectorizing, fail if not possible.
 
+@item vect-scalar-cost-multiplier
+Apply the given multiplier % to scalar loop costing during vectorization.
+Increasing the cost multiplier will make vector loops more profitable.
+
 @item vrp-block-limit
 Maximum number of basic blocks before VRP switches to a lower memory algorithm.
 
diff --git a/gcc/params.opt b/gcc/params.opt
index 64e453d29b74..5e61374f07ff 100644
--- a/gcc/params.opt
+++ b/gcc/params.opt
@@ -1265,6 +1265,10 @@ Enable loop vectorization of floating point inductions.
 Common Joined UInteger Var(param_vect_force_slp) Init(1) IntegerRange(0, 1) Param Optimization
 Force the use of SLP when vectorizing, fail if not possible.
 
+-param=vect-scalar-cost-multiplier=
+Common Joined UInteger Var(param_vect_scalar_cost_multiplier) Init(100) IntegerRange(0, 10000) Param Optimization
+The scaling multiplier as a percentage to apply to all scalar loop costing when performing vectorization profitability analysis.  The default value is 100.
+
 -param=vrp-block-limit=
 Common Joined UInteger Var(param_vrp_block_limit) Init(150000) Optimization Param
 Maximum number of basic blocks before VRP switches to a fast model with less memory requirements.
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/cost_model_16.c b/gcc/testsuite/gcc.target/aarch64/sve/cost_model_16.c
new file mode 100644
index 000000000000..bfe49ef15f3a
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/sve/cost_model_16.c
@@ -0,0 +1,21 @@
+/* { dg-do compile } */
+/* { dg-options "-Ofast -march=armv8-a+sve --param vect-scalar-cost-multiplier=1000 -fdump-tree-vect-details" } */
+
+void
+foo (char *restrict a, int *restrict b, int *restrict c,
+     int *restrict d, int stride)
+{
+    if (stride <= 1)
+        return;
+
+    for (int i = 0; i < 3; i++)
+        {
+            int res = c[i];
+            int t = b[i * stride];
+            if (a[i] != 0)
+                res = t * d[i];
+            c[i] = res;
+        }
+}
+
+/* { dg-final { scan-tree-dump "vectorized 1 loops in function" "vect" } } */
diff --git a/gcc/tree-vect-loop.cc b/gcc/tree-vect-loop.cc
index d489d9fde8f7..5698a6defee2 100644
--- a/gcc/tree-vect-loop.cc
+++ b/gcc/tree-vect-loop.cc
@@ -4713,7 +4713,8 @@ vect_estimate_min_profitable_iters (loop_vec_info loop_vinfo,
      TODO: Consider assigning different costs to different scalar
      statements.  */
 
-  scalar_single_iter_cost = loop_vinfo->scalar_costs->total_cost ();
+  scalar_single_iter_cost = (loop_vinfo->scalar_costs->total_cost ()
+                            * param_vect_scalar_cost_multiplier) / 100;
 
   /* Add additional cost for the peeled instructions in prologue and epilogue
      loop.  (For fully-masked loops there will be no peeling.)

Reply via email to