From: Pan Li <pan2...@intel.com>

This patch would like to combine the vec_duplicate + vsub.vv to the
vrsub.vx.  For example, see the code below.  The related pattern will
depend on the cost of vec_duplicate from GR2VR.  Then the late-combine
will take action if the cost of GR2VR is zero, and reject the combination
if the GR2VR cost is greater than zero.

Assume we have example code like below, where the GR2VR cost is 0.

  #define DEF_VX_BINARY_REVERSE_CASE_0(T, OP, NAME)                   \
  void                                                                \
  test_vx_binary_reverse_##NAME##_##T##_case_0 (T * restrict out,     \
                                                T * restrict in, T x, \
                                                unsigned n)           \
  {                                                                   \
    for (unsigned i = 0; i < n; i++)                                  \
      out[i] = x OP in[i];                                            \
  }

  DEF_VX_BINARY_REVERSE_CASE_0(int32_t, -, rsub)

Before this patch:
  54   │ test_vx_binary_reverse_rsub_int32_t_case_0:
  55   │     beq a3,zero,.L27
  56   │     vsetvli a5,zero,e32,m1,ta,ma
  57   │     vmv.v.x v2,a2
  58   │     slli    a3,a3,32
  59   │     srli    a3,a3,32
  60   │ .L22:
  61   │     vsetvli a5,a3,e32,m1,ta,ma
  62   │     vle32.v v1,0(a1)
  63   │     slli    a4,a5,2
  64   │     sub a3,a3,a5
  65   │     add a1,a1,a4
  66   │     vsub.vv v1,v2,v1
  67   │     vse32.v v1,0(a0)
  68   │     add a0,a0,a4
  69   │     bne a3,zero,.L22

After this patch:
  50   │ test_vx_binary_reverse_rsub_int32_t_case_0:
  51   │     beq a3,zero,.L27
  52   │     slli    a3,a3,32
  53   │     srli    a3,a3,32
  54   │ .L22:
  55   │     vsetvli a5,a3,e32,m1,ta,ma
  56   │     vle32.v v1,0(a1)
  57   │     slli    a4,a5,2
  58   │     sub a3,a3,a5
  59   │     add a1,a1,a4
  60   │     vrsub.vx    v1,v1,a2
  61   │     vse32.v v1,0(a0)
  62   │     add a0,a0,a4
  63   │     bne a3,zero,.L22

The below test suites are passed for this patch.
* The rv64gcv fully regression test.

gcc/ChangeLog:

        * config/riscv/autovec-opt.md: Leverage the newly added func to
        expand the vx insn.
        * config/riscv/riscv-protos.h (expand_vx_binary_vec_dup_vec): Add
        new func decl to expand format v = vop(vec_dup(x), v).
        (expand_vx_binary_vec_vec_dup): Ditto but for format
        v = vop(v, vec_dup(x)).
        * config/riscv/riscv-v.cc (expand_vx_binary_vec_dup_vec): Add new
        func impl to expand vx for v = vop(vec_dup(x), v).
        (expand_vx_binary_vec_vec_dup): Ditto but for another format
        v = vop(v, vec_dup(x)).

Signed-off-by: Pan Li <pan2...@intel.com>
---
 gcc/config/riscv/autovec-opt.md | 16 +++++------
 gcc/config/riscv/riscv-protos.h |  2 ++
 gcc/config/riscv/riscv-v.cc     | 49 +++++++++++++++++++++++++++++++++
 3 files changed, 59 insertions(+), 8 deletions(-)

diff --git a/gcc/config/riscv/autovec-opt.md b/gcc/config/riscv/autovec-opt.md
index 9c6bf06c3a9..a972eda8de4 100644
--- a/gcc/config/riscv/autovec-opt.md
+++ b/gcc/config/riscv/autovec-opt.md
@@ -1691,25 +1691,25 @@ (define_insn_and_split "*<optab>_vx_<mode>"
   "&& 1"
   [(const_int 0)]
   {
-    rtx ops[] = {operands[0], operands[2], operands[1]};
-    riscv_vector::emit_vlmax_insn (code_for_pred_scalar (<CODE>, <MODE>mode),
-                                  riscv_vector::BINARY_OP, ops);
+    riscv_vector::expand_vx_binary_vec_dup_vec (operands[0], operands[2],
+                                               operands[1], <CODE>,
+                                               <MODE>mode);
   }
   [(set_attr "type" "vialu")])
 
 (define_insn_and_split "*<optab>_vx_<mode>"
  [(set (match_operand:V_VLSI    0 "register_operand")
        (any_int_binop_no_shift_vx:V_VLSI
-        (match_operand:V_VLSI  2 "<binop_rhs2_predicate>")
+        (match_operand:V_VLSI  1 "<binop_rhs2_predicate>")
         (vec_duplicate:V_VLSI
-          (match_operand:<VEL> 1 "register_operand"))))]
+          (match_operand:<VEL> 2 "register_operand"))))]
   "TARGET_VECTOR && can_create_pseudo_p ()"
   "#"
   "&& 1"
   [(const_int 0)]
   {
-    rtx ops[] = {operands[0], operands[2], operands[1]};
-    riscv_vector::emit_vlmax_insn (code_for_pred_scalar (<CODE>, <MODE>mode),
-                                  riscv_vector::BINARY_OP, ops);
+    riscv_vector::expand_vx_binary_vec_vec_dup (operands[0], operands[1],
+                                               operands[2], <CODE>,
+                                               <MODE>mode);
   }
   [(set_attr "type" "vialu")])
diff --git a/gcc/config/riscv/riscv-protos.h b/gcc/config/riscv/riscv-protos.h
index 271a9a3228d..b39b858acac 100644
--- a/gcc/config/riscv/riscv-protos.h
+++ b/gcc/config/riscv/riscv-protos.h
@@ -667,6 +667,8 @@ void expand_vec_oct_ustrunc (rtx, rtx, machine_mode, machine_mode,
                             machine_mode);
 void expand_vec_oct_sstrunc (rtx, rtx, machine_mode, machine_mode,
                             machine_mode);
+void expand_vx_binary_vec_dup_vec (rtx, rtx, rtx, rtx_code, machine_mode);
+void expand_vx_binary_vec_vec_dup (rtx, rtx, rtx, rtx_code, machine_mode);
 #endif
 bool sew64_scalar_helper (rtx *, rtx *, rtx, machine_mode,
                          bool, void (*)(rtx *, rtx), enum avl_type);
diff --git a/gcc/config/riscv/riscv-v.cc b/gcc/config/riscv/riscv-v.cc
index 66c8b2921e2..1b5ef51886e 100644
--- a/gcc/config/riscv/riscv-v.cc
+++ b/gcc/config/riscv/riscv-v.cc
@@ -5498,6 +5498,55 @@ expand_vec_oct_sstrunc (rtx op_0, rtx op_1, machine_mode vec_mode,
   expand_vec_double_sstrunc (op_0, quad_rtx, quad_mode);
 }
 
+/* Expand the binary vx combine with the format like v2 = vop(vec_dup(x), v1).
+   I.e. the first op comes from the vec_duplicate, and the second op is
+   the vector reg.  */
+
+void
+expand_vx_binary_vec_dup_vec (rtx op_0, rtx op_1, rtx op_2,
+                             rtx_code code, machine_mode mode)
+{
+  enum insn_code icode;
+
+  switch (code)
+    {
+    case PLUS:
+      icode = code_for_pred_scalar (code, mode);
+      break;
+    case MINUS:
+      icode = code_for_pred_sub_reverse_scalar (mode);
+      break;
+    default:
+      gcc_unreachable ();
+    }
+
+  rtx ops[] = {op_0, op_1, op_2};
+  emit_vlmax_insn (icode, riscv_vector::BINARY_OP, ops);
+}
+
+/* Expand the binary vx combine with the format like v2 = vop(v1, vec_dup(x)).
+   I.e. the second op comes from the vec_duplicate, and the first op is
+   the vector reg.  */
+
+void
+expand_vx_binary_vec_vec_dup (rtx op_0, rtx op_1, rtx op_2,
+                             rtx_code code, machine_mode mode)
+{
+  enum insn_code icode;
+
+  switch (code)
+    {
+    case MINUS:
+      icode = code_for_pred_scalar (code, mode);
+      break;
+    default:
+      gcc_unreachable ();
+    }
+
+  rtx ops[] = {op_0, op_1, op_2};
+  emit_vlmax_insn (icode, riscv_vector::BINARY_OP, ops);
+}
+
 /* Vectorize popcount by the Wilkes-Wheeler-Gill algorithm that libgcc uses as
    well.  */
 void
-- 
2.43.0

Reply via email to