vec_insert accepts 3 arguments: arg0 is the value to be inserted, arg1 is the input vector, and arg2 is the position in arg1 at which arg0 is inserted. The current expander generates stxv+stwx+lxv if arg2 is a variable instead of a constant, which causes a serious store-hit-load performance issue on Power. This patch tries to: 1) Build a VIEW_CONVERT_EXPR for vec_insert (i, v, n), like v[n&3] = i, to unify the gimple code, so that the expander can use vec_set_optab to expand it. 2) Expand the IFN VEC_SET to fast instructions: lvsl+xxperm+xxsel. In this way, "vec_insert (i, v, n)" and "v[n&3] = i" won't be expanded too early in the gimple stage if arg2 is variable, which avoids generating store-hit-load instructions.
For Power9 V4SI: addi 9,1,-16 rldic 6,6,2,60 stxv 34,-16(1) stwx 5,9,6 lxv 34,-16(1) => addis 9,2,.LC0@toc@ha addi 9,9,.LC0@toc@l mtvsrwz 33,5 lxv 32,0(9) sradi 9,6,2 addze 9,9 sldi 9,9,2 subf 9,9,6 subfic 9,9,3 sldi 9,9,2 subfic 9,9,20 lvsl 13,0,9 xxperm 33,33,45 xxperm 32,32,45 xxsel 34,34,33,32 Though the instruction count increases from 5 to 15, performance improves by 60% in typical cases. Tested with V2DI, V2DF, V4SI, V4SF, V8HI and V16QI on Power9-LE and Power8-BE; bootstrap passed. gcc/ChangeLog: 2020-09-18 Xionghu Luo <luo...@linux.ibm.com> * config/rs6000/altivec.md (altivec_lvsl_reg): Rename to (altivec_lvsl_reg_<mode>2) and extend to SDI mode. * config/rs6000/rs6000-c.c (altivec_resolve_overloaded_builtin): Adjust variable index vec_insert to VIEW_CONVERT_EXPR. * config/rs6000/rs6000-protos.h (rs6000_expand_vector_set_var): New declaration. * config/rs6000/rs6000.c (rs6000_expand_vector_set_var): New function. * config/rs6000/rs6000.md (FQHS): New mode iterator. (FD): New mode iterator. (p8_mtvsrwz_v16qi<mode>2): New define_insn. (p8_mtvsrd_v16qi<mode>2): New define_insn. * config/rs6000/vector.md (vec_set<mode>): Add register operand2 match for vec_set index. * config/rs6000/vsx.md (xl_len_r): Call gen_altivec_lvsl_reg_di2. gcc/testsuite/ChangeLog: 2020-09-18 Xionghu Luo <luo...@linux.ibm.com> * gcc.target/powerpc/pr79251.c: New test. * gcc.target/powerpc/pr79251-run.c: New test. * gcc.target/powerpc/pr79251.h: New header. 
--- gcc/config/rs6000/altivec.md | 4 +- gcc/config/rs6000/rs6000-c.c | 22 ++- gcc/config/rs6000/rs6000-protos.h | 1 + gcc/config/rs6000/rs6000.c | 146 ++++++++++++++++++ gcc/config/rs6000/rs6000.md | 19 +++ gcc/config/rs6000/vector.md | 19 ++- gcc/config/rs6000/vsx.md | 2 +- .../gcc.target/powerpc/pr79251-run.c | 29 ++++ gcc/testsuite/gcc.target/powerpc/pr79251.c | 15 ++ gcc/testsuite/gcc.target/powerpc/pr79251.h | 19 +++ 10 files changed, 257 insertions(+), 19 deletions(-) create mode 100644 gcc/testsuite/gcc.target/powerpc/pr79251-run.c create mode 100644 gcc/testsuite/gcc.target/powerpc/pr79251.c create mode 100644 gcc/testsuite/gcc.target/powerpc/pr79251.h diff --git a/gcc/config/rs6000/altivec.md b/gcc/config/rs6000/altivec.md index 0a2e634d6b0..66b636059a6 100644 --- a/gcc/config/rs6000/altivec.md +++ b/gcc/config/rs6000/altivec.md @@ -2772,10 +2772,10 @@ (define_expand "altivec_lvsl" DONE; }) -(define_insn "altivec_lvsl_reg" +(define_insn "altivec_lvsl_reg_<mode>2" [(set (match_operand:V16QI 0 "altivec_register_operand" "=v") (unspec:V16QI - [(match_operand:DI 1 "gpc_reg_operand" "b")] + [(match_operand:SDI 1 "gpc_reg_operand" "b")] UNSPEC_LVSL_REG))] "TARGET_ALTIVEC" "lvsl %0,0,%1" diff --git a/gcc/config/rs6000/rs6000-c.c b/gcc/config/rs6000/rs6000-c.c index 2fad3d94706..78abe49c833 100644 --- a/gcc/config/rs6000/rs6000-c.c +++ b/gcc/config/rs6000/rs6000-c.c @@ -1509,9 +1509,7 @@ altivec_resolve_overloaded_builtin (location_t loc, tree fndecl, tree arg1; tree arg2; tree arg1_type; - tree arg1_inner_type; tree decl, stmt; - tree innerptrtype; machine_mode mode; /* No second or third arguments. */ @@ -1563,8 +1561,13 @@ altivec_resolve_overloaded_builtin (location_t loc, tree fndecl, return build_call_expr (call, 3, arg1, arg0, arg2); } - /* Build *(((arg1_inner_type*)&(vector type){arg1})+arg2) = arg0. */ - arg1_inner_type = TREE_TYPE (arg1_type); + /* Build *(((arg1_inner_type*)&(vector type){arg1})+arg2) = arg0 with + VIEW_CONVERT_EXPR. 
i.e.: + D.3192 = v1; + _1 = n & 3; + VIEW_CONVERT_EXPR<int[4]>(D.3192)[_1] = i; + v1 = D.3192; + D.3194 = v1; */ if (TYPE_VECTOR_SUBPARTS (arg1_type) == 1) arg2 = build_int_cst (TREE_TYPE (arg2), 0); else @@ -1593,15 +1596,8 @@ altivec_resolve_overloaded_builtin (location_t loc, tree fndecl, SET_EXPR_LOCATION (stmt, loc); stmt = build1 (COMPOUND_LITERAL_EXPR, arg1_type, stmt); } - - innerptrtype = build_pointer_type (arg1_inner_type); - - stmt = build_unary_op (loc, ADDR_EXPR, stmt, 0); - stmt = convert (innerptrtype, stmt); - stmt = build_binary_op (loc, PLUS_EXPR, stmt, arg2, 1); - stmt = build_indirect_ref (loc, stmt, RO_NULL); - stmt = build2 (MODIFY_EXPR, TREE_TYPE (stmt), stmt, - convert (TREE_TYPE (stmt), arg0)); + stmt = build_array_ref (loc, stmt, arg2); + stmt = fold_build2 (MODIFY_EXPR, TREE_TYPE (arg0), stmt, arg0); stmt = build2 (COMPOUND_EXPR, arg1_type, stmt, decl); return stmt; } diff --git a/gcc/config/rs6000/rs6000-protos.h b/gcc/config/rs6000/rs6000-protos.h index 28e859f4381..f6f8bd65c2f 100644 --- a/gcc/config/rs6000/rs6000-protos.h +++ b/gcc/config/rs6000/rs6000-protos.h @@ -58,6 +58,7 @@ extern bool rs6000_split_128bit_ok_p (rtx []); extern void rs6000_expand_float128_convert (rtx, rtx, bool); extern void rs6000_expand_vector_init (rtx, rtx); extern void rs6000_expand_vector_set (rtx, rtx, int); +extern void rs6000_expand_vector_set_var (rtx, rtx, rtx, rtx); extern void rs6000_expand_vector_extract (rtx, rtx, rtx); extern void rs6000_split_vec_extract_var (rtx, rtx, rtx, rtx, rtx); extern rtx rs6000_adjust_vec_address (rtx, rtx, rtx, rtx, machine_mode); diff --git a/gcc/config/rs6000/rs6000.c b/gcc/config/rs6000/rs6000.c index fe93cf6ff2b..d22d7999a61 100644 --- a/gcc/config/rs6000/rs6000.c +++ b/gcc/config/rs6000/rs6000.c @@ -6788,6 +6788,152 @@ rs6000_expand_vector_set (rtx target, rtx val, int elt) emit_insn (gen_rtx_SET (target, x)); } +/* Insert value from VEC into idx of TARGET. 
*/ + +void +rs6000_expand_vector_set_var (rtx target, rtx vec, rtx val, rtx idx) +{ + machine_mode mode = GET_MODE (vec); + + if (VECTOR_MEM_VSX_P (mode) && CONST_INT_P (idx)) + gcc_unreachable (); + else if (VECTOR_MEM_VSX_P (mode) && !CONST_INT_P (idx) + && TARGET_DIRECT_MOVE_64BIT) + { + gcc_assert (GET_MODE (idx) == E_SImode); + machine_mode inner_mode = GET_MODE (val); + HOST_WIDE_INT mode_mask = GET_MODE_MASK (inner_mode); + + rtx tmp = gen_reg_rtx (GET_MODE (idx)); + if (GET_MODE_SIZE (inner_mode) == 8) + { + if (!BYTES_BIG_ENDIAN) + { + /* idx = 1 - idx. */ + emit_insn (gen_subsi3 (tmp, GEN_INT (1), idx)); + /* idx = idx * 8. */ + emit_insn (gen_ashlsi3 (tmp, tmp, GEN_INT (3))); + /* idx = 16 - idx. */ + emit_insn (gen_subsi3 (tmp, GEN_INT (16), tmp)); + } + else + { + emit_insn (gen_ashlsi3 (tmp, idx, GEN_INT (3))); + emit_insn (gen_subsi3 (tmp, GEN_INT (16), tmp)); + } + } + else if (GET_MODE_SIZE (inner_mode) == 4) + { + if (!BYTES_BIG_ENDIAN) + { + /* idx = 3 - idx. */ + emit_insn (gen_subsi3 (tmp, GEN_INT (3), idx)); + /* idx = idx * 4. */ + emit_insn (gen_ashlsi3 (tmp, tmp, GEN_INT (2))); + /* idx = 20 - idx. */ + emit_insn (gen_subsi3 (tmp, GEN_INT (20), tmp)); + } + else + { + emit_insn (gen_ashlsi3 (tmp, idx, GEN_INT (2))); + emit_insn (gen_subsi3 (tmp, GEN_INT (20), tmp)); + } + } + else if (GET_MODE_SIZE (inner_mode) == 2) + { + if (!BYTES_BIG_ENDIAN) + { + /* idx = 7 - idx. */ + emit_insn (gen_subsi3 (tmp, GEN_INT (7), idx)); + /* idx = idx * 2. */ + emit_insn (gen_ashlsi3 (tmp, tmp, GEN_INT (1))); + /* idx = 22 - idx. */ + emit_insn (gen_subsi3 (tmp, GEN_INT (22), tmp)); + } + else + { + emit_insn (gen_ashlsi3 (tmp, idx, GEN_INT (1))); + emit_insn (gen_subsi3 (tmp, GEN_INT (22), tmp)); + } + } + else if (GET_MODE_SIZE (inner_mode) == 1) + if (!BYTES_BIG_ENDIAN) + emit_insn (gen_addsi3 (tmp, idx, GEN_INT (8))); + else + emit_insn (gen_subsi3 (tmp, GEN_INT (23), idx)); + else + gcc_unreachable (); + + /* lxv vs32, mask. 
+ DImode: 0xffffffffffffffff0000000000000000 + SImode: 0x00000000ffffffff0000000000000000 + HImode: 0x000000000000ffff0000000000000000. + QImode: 0x00000000000000ff0000000000000000. */ + rtx mask = gen_reg_rtx (V16QImode); + rtx mask_v2di = gen_reg_rtx (V2DImode); + rtvec v = rtvec_alloc (2); + if (!BYTES_BIG_ENDIAN) + { + RTVEC_ELT (v, 0) = gen_rtx_CONST_INT (DImode, 0); + RTVEC_ELT (v, 1) = gen_rtx_CONST_INT (DImode, mode_mask); + } + else + { + RTVEC_ELT (v, 0) = gen_rtx_CONST_INT (DImode, mode_mask); + RTVEC_ELT (v, 1) = gen_rtx_CONST_INT (DImode, 0); + } + emit_insn ( + gen_vec_initv2didi (mask_v2di, gen_rtx_PARALLEL (V2DImode, v))); + rtx sub_mask = simplify_gen_subreg (V16QImode, mask_v2di, V2DImode, 0); + emit_insn (gen_rtx_SET (mask, sub_mask)); + + /* mtvsrd[wz] f0,val. */ + rtx val_v16qi = gen_reg_rtx (V16QImode); + switch (inner_mode) + { + default: + gcc_unreachable (); + break; + case E_QImode: + emit_insn (gen_p8_mtvsrwz_v16qiqi2 (val_v16qi, val)); + break; + case E_HImode: + emit_insn (gen_p8_mtvsrwz_v16qihi2 (val_v16qi, val)); + break; + case E_SImode: + emit_insn (gen_p8_mtvsrwz_v16qisi2 (val_v16qi, val)); + break; + case E_SFmode: + emit_insn (gen_p8_mtvsrwz_v16qisf2 (val_v16qi, val)); + break; + case E_DImode: + emit_insn (gen_p8_mtvsrd_v16qidi2 (val_v16qi, val)); + break; + case E_DFmode: + emit_insn (gen_p8_mtvsrd_v16qidf2 (val_v16qi, val)); + break; + } + + /* lvsl v1,0,idx. */ + rtx pcv = gen_reg_rtx (V16QImode); + emit_insn (gen_altivec_lvsl_reg_si2 (pcv, tmp)); + + /* xxperm vs0,vs0,vs33. */ + /* xxperm vs32,vs32,vs33. */ + rtx val_perm = gen_reg_rtx (V16QImode); + rtx mask_perm = gen_reg_rtx (V16QImode); + emit_insn ( + gen_altivec_vperm_v8hiv16qi (val_perm, val_v16qi, val_v16qi, pcv)); + emit_insn (gen_altivec_vperm_v8hiv16qi (mask_perm, mask, mask, pcv)); + + rtx sub_target = simplify_gen_subreg (V16QImode, vec, mode, 0); + emit_insn (gen_rtx_SET (target, sub_target)); + + /* xxsel vs34,vs34,vs0,vs32. 
*/ + emit_insn (gen_vector_select_v16qi (target, target, val_perm, mask_perm)); + } +} + /* Extract field ELT from VEC into TARGET. */ void diff --git a/gcc/config/rs6000/rs6000.md b/gcc/config/rs6000/rs6000.md index 43b620ae1c0..b02fda836d4 100644 --- a/gcc/config/rs6000/rs6000.md +++ b/gcc/config/rs6000/rs6000.md @@ -8713,6 +8713,25 @@ (define_insn "p8_mtvsrwz" "mtvsrwz %x0,%1" [(set_attr "type" "mftgpr")]) +(define_mode_iterator FQHS [SF QI HI SI]) +(define_mode_iterator FD [DF DI]) + +(define_insn "p8_mtvsrwz_v16qi<mode>2" + [(set (match_operand:V16QI 0 "register_operand" "=wa") + (unspec:V16QI [(match_operand:FQHS 1 "register_operand" "r")] + UNSPEC_P8V_MTVSRWZ))] + "TARGET_POWERPC64 && TARGET_DIRECT_MOVE" + "mtvsrwz %x0,%1" + [(set_attr "type" "mftgpr")]) + +(define_insn "p8_mtvsrd_v16qi<mode>2" + [(set (match_operand:V16QI 0 "register_operand" "=wa") + (unspec:V16QI [(match_operand:FD 1 "register_operand" "r")] + UNSPEC_P8V_MTVSRD))] + "TARGET_POWERPC64 && TARGET_DIRECT_MOVE" + "mtvsrd %x0,%1" + [(set_attr "type" "mftgpr")]) + (define_insn_and_split "reload_fpr_from_gpr<mode>" [(set (match_operand:FMOVE64X 0 "register_operand" "=d") (unspec:FMOVE64X [(match_operand:FMOVE64X 1 "register_operand" "r")] diff --git a/gcc/config/rs6000/vector.md b/gcc/config/rs6000/vector.md index 796345c80d3..28e59c1c995 100644 --- a/gcc/config/rs6000/vector.md +++ b/gcc/config/rs6000/vector.md @@ -1227,11 +1227,24 @@ (define_expand "vec_init<mode><VEC_base_l>" (define_expand "vec_set<mode>" [(match_operand:VEC_E 0 "vlogical_operand") (match_operand:<VEC_base> 1 "register_operand") - (match_operand 2 "const_int_operand")] + (match_operand 2 "reg_or_cint_operand")] "VECTOR_MEM_ALTIVEC_OR_VSX_P (<MODE>mode)" { - rs6000_expand_vector_set (operands[0], operands[1], INTVAL (operands[2])); - DONE; + if (CONST_INT_P (operands[2])) + { + rs6000_expand_vector_set (operands[0], operands[1], INTVAL (operands[2])); + DONE; + } + else + { + rtx target = gen_reg_rtx (V16QImode); + 
rs6000_expand_vector_set_var (target, operands[0], operands[1], + operands[2]); + rtx sub_target + = simplify_gen_subreg (GET_MODE (operands[0]), target, V16QImode, 0); + emit_insn (gen_rtx_SET (operands[0], sub_target)); + DONE; + } }) (define_expand "vec_extract<mode><VEC_base_l>" diff --git a/gcc/config/rs6000/vsx.md b/gcc/config/rs6000/vsx.md index dd750210758..7e82690d12d 100644 --- a/gcc/config/rs6000/vsx.md +++ b/gcc/config/rs6000/vsx.md @@ -5349,7 +5349,7 @@ (define_expand "xl_len_r" rtx rtx_vtmp = gen_reg_rtx (V16QImode); rtx tmp = gen_reg_rtx (DImode); - emit_insn (gen_altivec_lvsl_reg (shift_mask, operands[2])); + emit_insn (gen_altivec_lvsl_reg_di2 (shift_mask, operands[2])); emit_insn (gen_ashldi3 (tmp, operands[2], GEN_INT (56))); emit_insn (gen_lxvll (rtx_vtmp, operands[1], tmp)); emit_insn (gen_altivec_vperm_v8hiv16qi (operands[0], rtx_vtmp, rtx_vtmp, diff --git a/gcc/testsuite/gcc.target/powerpc/pr79251-run.c b/gcc/testsuite/gcc.target/powerpc/pr79251-run.c new file mode 100644 index 00000000000..840f6712ad2 --- /dev/null +++ b/gcc/testsuite/gcc.target/powerpc/pr79251-run.c @@ -0,0 +1,29 @@ +/* { dg-do run { target { lp64 && p9vector_hw } } } */ +/* { dg-options "-O2 -mdejagnu-cpu=power9 -maltivec" } */ + +#include <stddef.h> +#include <altivec.h> +#include "pr79251.h" + +TEST_VEC_INSERT_ALL (test) + +#define run_test(TYPE, num) \ + { \ + vector TYPE v; \ + vector TYPE u = {0x0}; \ + for (long k = 0; k < 16 / sizeof (TYPE); k++) \ + v[k] = 0xaa; \ + for (long k = 0; k < 16 / sizeof (TYPE); k++) \ + { \ + u = test##num (v, 254, k); \ + if (u[k] != (TYPE) 254) \ + __builtin_abort (); \ + } \ + } + +int +main (void) +{ + TEST_VEC_INSERT_ALL (run_test) + return 0; +} diff --git a/gcc/testsuite/gcc.target/powerpc/pr79251.c b/gcc/testsuite/gcc.target/powerpc/pr79251.c new file mode 100644 index 00000000000..8124f503df9 --- /dev/null +++ b/gcc/testsuite/gcc.target/powerpc/pr79251.c @@ -0,0 +1,15 @@ +/* { dg-do compile } */ +/* { 
dg-require-effective-target powerpc_p9vector_ok } */ +/* { dg-require-effective-target lp64 } */ +/* { dg-options "-O2 -mdejagnu-cpu=power9 -maltivec" } */ + +#include <stddef.h> +#include <altivec.h> +#include "pr79251.h" + +TEST_VEC_INSERT_ALL (test) + +/* { dg-final { scan-assembler-not {\mstxw\M} } } */ +/* { dg-final { scan-assembler-times {\mlvsl\M} 10 } } */ +/* { dg-final { scan-assembler-times {\mxxperm\M} 20 } } */ +/* { dg-final { scan-assembler-times {\mxxsel\M} 10 } } */ diff --git a/gcc/testsuite/gcc.target/powerpc/pr79251.h b/gcc/testsuite/gcc.target/powerpc/pr79251.h new file mode 100644 index 00000000000..609371c96cd --- /dev/null +++ b/gcc/testsuite/gcc.target/powerpc/pr79251.h @@ -0,0 +1,19 @@ + +#define test(TYPE, num) \ + __attribute__ ((noinline, noclone)) \ + vector TYPE test##num (vector TYPE v, TYPE i, unsigned int n) \ + { \ + return vec_insert (i, v, n); \ + } + +#define TEST_VEC_INSERT_ALL(T) \ + T (char, 0) \ + T (unsigned char, 1) \ + T (short, 2) \ + T (unsigned short, 3) \ + T (int, 4) \ + T (unsigned int, 5) \ + T (long long, 6) \ + T (unsigned long long, 7) \ + T (float, 8) \ + T (double, 9) -- 2.27.0.90.geebb51ba8c