The penalty is only applied when a loop is fully unrolled (NITERS < VF),
where a vector_stmt with a larger LMUL has higher execution overhead.
PR target/122558
gcc/ChangeLog:
* config/riscv/riscv-vector-costs.cc (get_lmul_cost_penalty): New
function to calculate LMUL-based cost penalty.
(costs::adjust_stmt_cost): Apply LMUL penalty to
segment/unit-stride load/store operations
and vector arithmetic operations.
gcc/testsuite/ChangeLog:
* gcc.target/riscv/rvv/autovec/pr122558.c: New test.
Signed-off-by: Zhongyao Chen <[email protected]>
---
gcc/config/riscv/riscv-vector-costs.cc | 86 +++++++++++++++++++
.../gcc.target/riscv/rvv/autovec/pr122558.c | 37 ++++++++
2 files changed, 123 insertions(+)
create mode 100644 gcc/testsuite/gcc.target/riscv/rvv/autovec/pr122558.c
diff --git a/gcc/config/riscv/riscv-vector-costs.cc b/gcc/config/riscv/riscv-vector-costs.cc
index 27ced61e815..d3002f1799f 100644
--- a/gcc/config/riscv/riscv-vector-costs.cc
+++ b/gcc/config/riscv/riscv-vector-costs.cc
@@ -1099,6 +1099,64 @@ segment_loadstore_group_size (enum vect_cost_for_stmt kind,
return 0;
}
+/* Calculate LMUL-based cost adjustment factor.
+ Larger LMUL values increase execution overhead.
+
+ This penalty is only applied when the loop is completely unrolled.
+ Returns additional cost to be added based on LMUL. */
+static unsigned
+get_lmul_cost_penalty (machine_mode mode, loop_vec_info loop_vinfo)
+{
+ if (!riscv_v_ext_vector_mode_p (mode)) /* Only RVV vector modes carry an LMUL.  */
+ return 0;
+
+ /* Only apply LMUL penalty when loop is completely unrolled.
+ For non-unrolled loops, larger LMUL reduces iteration count,
+ which may provide overall benefit despite slower instructions. */
+ if (!loop_vinfo) /* Not vectorizing a loop (e.g. BB SLP): no penalty.  */
+ return 0;
+
+ /* Check if loop will be completely unrolled:
+ - NITERS must be known at compile time
+ - NITERS must be less than VF (single iteration) */
+ if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
+ return 0;
+
+ poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
+ unsigned HOST_WIDE_INT niters = LOOP_VINFO_INT_NITERS (loop_vinfo);
+
+ /* If NITERS >= VF, loop will have multiple iterations.
+ In this case, larger LMUL reduces loop count, don't penalize. */
+ if (maybe_ge (poly_uint64 (niters), vf)) /* I.e. penalize only if known_lt (niters, vf).  */
+ return 0;
+
+ /* Loop is completely unrolled (single iteration).
+ Apply LMUL penalty since larger LMUL increases latency. */
+ enum vlmul_type vlmul = get_vlmul (mode);
+
+ /* Cost penalty increases with LMUL:
+ - m1 (LMUL_1): 0 penalty (baseline)
+ - m2 (LMUL_2): +1
+ - m4 (LMUL_4): +2
+ - m8 (LMUL_8): +3
+ - mf2/mf4/mf8: 0 (already efficient) */
+ switch (vlmul)
+ {
+ case LMUL_2:
+ return 1;
+ case LMUL_4:
+ return 2;
+ case LMUL_8:
+ return 3;
+ case LMUL_1: /* Fractional LMULs and m1 are the efficient baseline.  */
+ case LMUL_F2:
+ case LMUL_F4:
+ case LMUL_F8:
+ default:
+ return 0;
+ }
+}
+
/* Adjust vectorization cost after calling riscv_builtin_vectorization_cost.
For some statement, we would like to further fine-grain tweak the cost on
top of riscv_builtin_vectorization_cost handling which doesn't have any
@@ -1181,6 +1239,15 @@ costs::adjust_stmt_cost (enum vect_cost_for_stmt kind, loop_vec_info loop,
default:
break;
}
+
+ /* Adjust cost for all segment load/store operations based on
+ actual vectype LMUL. Only penalize when loop is completely
+ unrolled. */
+ if (vectype)
+ {
+ machine_mode actual_mode = TYPE_MODE (vectype);
+ stmt_cost += get_lmul_cost_penalty (actual_mode, loop);
+ }
}
else
{
@@ -1236,10 +1303,29 @@ costs::adjust_stmt_cost (enum vect_cost_for_stmt kind, loop_vec_info loop,
}
}
}
+
+ /* Apply LMUL penalty for unit-stride operations.
+ This ensures consistent cost modeling across all
+ vector load/store types when loop is unrolled. */
+ if (vectype)
+ {
+ machine_mode actual_mode = TYPE_MODE (vectype);
+ stmt_cost += get_lmul_cost_penalty (actual_mode, loop);
+ }
}
break;
}
+ case vector_stmt:
+ /* Adjust cost for all vector arithmetic operations based on LMUL.
+ Only penalize when loop is completely unrolled. */
+ if (vectype)
+ {
+ machine_mode actual_mode = TYPE_MODE (vectype);
+ stmt_cost += get_lmul_cost_penalty (actual_mode, loop);
+ }
+ break;
+
default:
break;
}
diff --git a/gcc/testsuite/gcc.target/riscv/rvv/autovec/pr122558.c b/gcc/testsuite/gcc.target/riscv/rvv/autovec/pr122558.c
new file mode 100644
index 00000000000..c9dbba64961
--- /dev/null
+++ b/gcc/testsuite/gcc.target/riscv/rvv/autovec/pr122558.c
@@ -0,0 +1,37 @@
+/* { dg-do compile } */
+/* { dg-options "-O3 -ftree-vectorize -mabi=lp64d -march=rv64gcv -mrvv-max-lmul=dynamic -fdump-tree-vect-all" } */
+/* { dg-skip-if "" { *-*-* } { "-O0" "-O1" "-O2" "-Os" "-Og" "-Oz" } } */
+
+#include <stdint-gcc.h>
+
+void dct( int16_t d[16], int16_t dct[16] ) /* 4x4 forward DCT: row butterflies into tmp (transposed), then column pass.  */
+{
+ int16_t tmp[16];
+ for( int i = 0; i < 4; i++ ) /* Pass 1: rows of d -> columns of tmp (writes tmp[k*4+i]).  */
+ {
+ int s03 = d[i*4+0] + d[i*4+3];
+ int s12 = d[i*4+1] + d[i*4+2];
+ int d03 = d[i*4+0] - d[i*4+3];
+ int d12 = d[i*4+1] - d[i*4+2];
+ tmp[0*4+i] = s03 + s12;
+ tmp[1*4+i] = 2*d03 + d12;
+ tmp[2*4+i] = s03 - s12;
+ tmp[3*4+i] = d03 - 2*d12;
+ }
+ for( int i = 0; i < 4; i++ ) /* Pass 2: same butterfly over rows of tmp -> rows of dct.  */
+ {
+ int s03 = tmp[i*4+0] + tmp[i*4+3];
+ int s12 = tmp[i*4+1] + tmp[i*4+2];
+ int d03 = tmp[i*4+0] - tmp[i*4+3];
+ int d12 = tmp[i*4+1] - tmp[i*4+2];
+
+ dct[i*4+0] = s03 + s12;
+ dct[i*4+1] = 2*d03 + d12;
+ dct[i*4+2] = s03 - s12;
+ dct[i*4+3] = d03 - 2*d12;
+ }
+}
+
+/* { dg-final { scan-tree-dump "LOOP VECTORIZED" "vect" } } */
+/* { dg-final { scan-tree-dump "Choosing vector mode RVVMF2QI" "vect" } } */
+
--
2.43.0