From: Pan Li <pan2...@intel.com>

This patch would like to combine the vec_duplicate + vsub.vv into the
vrsub.vx.  Consider the example code below.  The related pattern will
depend on the cost of the vec_duplicate from GR2VR (general register to
vector register).  The late-combine pass will then take action if the
GR2VR cost is zero, and reject the combination if the GR2VR cost is
greater than zero.
Assume we have the example code below, with a GR2VR cost of 0.

#define DEF_VX_BINARY_REVERSE_CASE_0(T, OP, NAME)                  \
void                                                               \
test_vx_binary_reverse_##NAME##_##T##_case_0 (T * restrict out,    \
                                              T * restrict in,     \
                                              T x, unsigned n)     \
{                                                                  \
  for (unsigned i = 0; i < n; i++)                                 \
    out[i] = x OP in[i];                                           \
}

DEF_VX_BINARY_REVERSE_CASE_0(int32_t, -, rsub)

Before this patch:
  54   │ test_vx_binary_reverse_rsub_int32_t_case_0:
  55   │     beq      a3,zero,.L27
  56   │     vsetvli  a5,zero,e32,m1,ta,ma
  57   │     vmv.v.x  v2,a2
  58   │     slli     a3,a3,32
  59   │     srli     a3,a3,32
  60   │ .L22:
  61   │     vsetvli  a5,a3,e32,m1,ta,ma
  62   │     vle32.v  v1,0(a1)
  63   │     slli     a4,a5,2
  64   │     sub      a3,a3,a5
  65   │     add      a1,a1,a4
  66   │     vsub.vv  v1,v2,v1
  67   │     vse32.v  v1,0(a0)
  68   │     add      a0,a0,a4
  69   │     bne      a3,zero,.L22

After this patch:
  50   │ test_vx_binary_reverse_rsub_int32_t_case_0:
  51   │     beq      a3,zero,.L27
  52   │     slli     a3,a3,32
  53   │     srli     a3,a3,32
  54   │ .L22:
  55   │     vsetvli  a5,a3,e32,m1,ta,ma
  56   │     vle32.v  v1,0(a1)
  57   │     slli     a4,a5,2
  58   │     sub      a3,a3,a5
  59   │     add      a1,a1,a4
  60   │     vrsub.vx v1,v1,a2
  61   │     vse32.v  v1,0(a0)
  62   │     add      a0,a0,a4
  63   │     bne      a3,zero,.L22

The below test suites are passed for this patch.
* The rv64gcv full regression test.

gcc/ChangeLog:

	* config/riscv/autovec-opt.md: Leverage the newly added funcs to
	expand the vx insns.
	* config/riscv/riscv-protos.h (expand_vx_binary_vec_dup_vec): Add
	new func decl to expand the format v = vop(vec_dup(x), v).
	(expand_vx_binary_vec_vec_dup): Ditto but for the format
	v = vop(v, vec_dup(x)).
	* config/riscv/riscv-v.cc (expand_vx_binary_vec_dup_vec): Add new
	func impl to expand vx for v = vop(vec_dup(x), v).
	(expand_vx_binary_vec_vec_dup): Ditto but for the format
	v = vop(v, vec_dup(x)).

Signed-off-by: Pan Li <pan2...@intel.com>
---
 gcc/config/riscv/autovec-opt.md | 16 +++++------
 gcc/config/riscv/riscv-protos.h |  2 ++
 gcc/config/riscv/riscv-v.cc     | 49 +++++++++++++++++++++++++++++++++
 3 files changed, 59 insertions(+), 8 deletions(-)

diff --git a/gcc/config/riscv/autovec-opt.md b/gcc/config/riscv/autovec-opt.md
index 9c6bf06c3a9..a972eda8de4 100644
--- a/gcc/config/riscv/autovec-opt.md
+++ b/gcc/config/riscv/autovec-opt.md
@@ -1691,25 +1691,25 @@ (define_insn_and_split "*<optab>_vx_<mode>"
   "&& 1"
   [(const_int 0)]
   {
-    rtx ops[] = {operands[0], operands[2], operands[1]};
-    riscv_vector::emit_vlmax_insn (code_for_pred_scalar (<CODE>, <MODE>mode),
-				   riscv_vector::BINARY_OP, ops);
+    riscv_vector::expand_vx_binary_vec_dup_vec (operands[0], operands[2],
+						operands[1], <CODE>,
+						<MODE>mode);
   }
   [(set_attr "type" "vialu")])
 
 (define_insn_and_split "*<optab>_vx_<mode>"
   [(set (match_operand:V_VLSI 0 "register_operand")
     (any_int_binop_no_shift_vx:V_VLSI
-      (match_operand:V_VLSI 2 "<binop_rhs2_predicate>")
+      (match_operand:V_VLSI 1 "<binop_rhs2_predicate>")
       (vec_duplicate:V_VLSI
-        (match_operand:<VEL> 1 "register_operand"))))]
+        (match_operand:<VEL> 2 "register_operand"))))]
   "TARGET_VECTOR && can_create_pseudo_p ()"
   "#"
   "&& 1"
   [(const_int 0)]
   {
-    rtx ops[] = {operands[0], operands[2], operands[1]};
-    riscv_vector::emit_vlmax_insn (code_for_pred_scalar (<CODE>, <MODE>mode),
-				   riscv_vector::BINARY_OP, ops);
+    riscv_vector::expand_vx_binary_vec_vec_dup (operands[0], operands[1],
+						operands[2], <CODE>,
+						<MODE>mode);
   }
   [(set_attr "type" "vialu")])
diff --git a/gcc/config/riscv/riscv-protos.h b/gcc/config/riscv/riscv-protos.h
index 271a9a3228d..b39b858acac 100644
--- a/gcc/config/riscv/riscv-protos.h
+++ b/gcc/config/riscv/riscv-protos.h
@@ -667,6 +667,8 @@ void expand_vec_oct_ustrunc (rtx, rtx, machine_mode, machine_mode,
 			     machine_mode);
 void expand_vec_oct_sstrunc (rtx, rtx, machine_mode, machine_mode,
 			     machine_mode);
+void expand_vx_binary_vec_dup_vec (rtx, rtx, rtx, rtx_code, machine_mode);
+void expand_vx_binary_vec_vec_dup (rtx, rtx, rtx, rtx_code, machine_mode);
 #endif
 bool sew64_scalar_helper (rtx *, rtx *, rtx, machine_mode, bool,
 			  void (*)(rtx *, rtx), enum avl_type);
diff --git a/gcc/config/riscv/riscv-v.cc b/gcc/config/riscv/riscv-v.cc
index 66c8b2921e2..1b5ef51886e 100644
--- a/gcc/config/riscv/riscv-v.cc
+++ b/gcc/config/riscv/riscv-v.cc
@@ -5498,6 +5498,55 @@ expand_vec_oct_sstrunc (rtx op_0, rtx op_1, machine_mode vec_mode,
   expand_vec_double_sstrunc (op_0, quad_rtx, quad_mode);
 }
 
+/* Expand the binary vx combine with the format like v2 = vop(vec_dup(x), v1).
+   Aka the first op comes from the vec_duplicate, and the second op is
+   the vector reg.  */
+
+void
+expand_vx_binary_vec_dup_vec (rtx op_0, rtx op_1, rtx op_2,
+			      rtx_code code, machine_mode mode)
+{
+  enum insn_code icode;
+
+  switch (code)
+    {
+    case PLUS:
+      icode = code_for_pred_scalar (code, mode);
+      break;
+    case MINUS:
+      icode = code_for_pred_sub_reverse_scalar (mode);
+      break;
+    default:
+      gcc_unreachable ();
+    }
+
+  rtx ops[] = {op_0, op_1, op_2};
+  emit_vlmax_insn (icode, riscv_vector::BINARY_OP, ops);
+}
+
+/* Expand the binary vx combine with the format like v2 = vop(v1, vec_dup(x)).
+   Aka the second op comes from the vec_duplicate, and the first op is
+   the vector reg.  */
+
+void
+expand_vx_binary_vec_vec_dup (rtx op_0, rtx op_1, rtx op_2,
+			      rtx_code code, machine_mode mode)
+{
+  enum insn_code icode;
+
+  switch (code)
+    {
+    case MINUS:
+      icode = code_for_pred_scalar (code, mode);
+      break;
+    default:
+      gcc_unreachable ();
+    }
+
+  rtx ops[] = {op_0, op_1, op_2};
+  emit_vlmax_insn (icode, riscv_vector::BINARY_OP, ops);
+}
+
 /* Vectorize popcount by the Wilkes-Wheeler-Gill algorithm that libgcc
    uses as well.  */
 void
-- 
2.43.0
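
P.S. For reviewers who want to try the case above without the test macro,
the snippet below is simply the manual expansion of
DEF_VX_BINARY_REVERSE_CASE_0(int32_t, -, rsub).  It is an illustrative
sketch only, assuming an rv64gcv toolchain and a tuning where the GR2VR
cost is zero.

  /* Illustrative sketch: manual expansion of the macro above for
     int32_t and the "-" operator.  Compile at e.g. -O3 -march=rv64gcv;
     with a zero GR2VR cost the vectorized loop body should use
     vrsub.vx instead of vmv.v.x + vsub.vv.  */
  #include <stdint.h>

  void
  test_vx_binary_reverse_rsub_int32_t_case_0 (int32_t * restrict out,
                                              int32_t * restrict in,
                                              int32_t x, unsigned n)
  {
    for (unsigned i = 0; i < n; i++)
      out[i] = x - in[i];
  }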