https://gcc.gnu.org/g:50679b8dc0805a7c628962bef50b40b9ace33d06
commit 50679b8dc0805a7c628962bef50b40b9ace33d06 Author: Peter Bergner <berg...@linux.ibm.com> Date: Thu May 1 17:49:03 2025 -0500 MMA+: Add initial support for some MMA+ built-ins Add support for MMA+ built-ins __builtin_mma_dmmr, __builtin_mma_dmxor, __builtin_mma_build_dmr, __builtin_mma_dmxvi8gerx4, __builtin_mma_dmxvi8gerx4pp, __builtin_mma_pmdmxvi8gerx4, __builtin_mma_pmdmxvi8gerx4pp. Diff: --- gcc/config/rs6000/mma.md | 190 +++++++++++++++++++++++++------ gcc/config/rs6000/predicates.md | 13 ++- gcc/config/rs6000/rs6000-builtin.cc | 112 +++++++++++++++--- gcc/config/rs6000/rs6000-builtins.def | 55 +++++++++ gcc/config/rs6000/rs6000-gen-builtins.cc | 46 +++++++- gcc/config/rs6000/rs6000.md | 16 ++- 6 files changed, 371 insertions(+), 61 deletions(-) diff --git a/gcc/config/rs6000/mma.md b/gcc/config/rs6000/mma.md index 6103dc3c5237..fd3a0e592d88 100644 --- a/gcc/config/rs6000/mma.md +++ b/gcc/config/rs6000/mma.md @@ -24,7 +24,7 @@ ;; __vector_pair types that the MMA built-in functions reference. We ;; use OPAQUE_MODE to prevent anything from trying to open them up. 
-(define_constants [(MAX_MMA_OPERANDS 7)]) +(define_constants [(MAX_MMA_OPERANDS 9)]) ;; Constants for creating unspecs @@ -93,9 +93,15 @@ UNSPEC_MMA_DMSETDMRZ UNSPEC_DM_INSERT512_UPPER UNSPEC_DM_INSERT512_LOWER + UNSPEC_DM_INSERT1024 UNSPEC_DM_EXTRACT512 UNSPEC_DMR_RELOAD_FROM_MEMORY UNSPEC_DMR_RELOAD_TO_MEMORY + UNSPEC_DMF_DMXOR + UNSPEC_DMF_DMXVI8GERX4 + UNSPEC_DMF_DMXVI8GERX4PP + UNSPEC_DMF_PMDMXVI8GERX4 + UNSPEC_DMF_PMDMXVI8GERX4PP ]) (define_c_enum "unspecv" @@ -138,12 +144,18 @@ ;; MMA instructions with 1 vector pair and 1 vector arguments (define_int_iterator MMA_PV [UNSPEC_MMA_XVF64GER]) +;; DMF instructions with 1 vector pair and 1 vector arguments +(define_int_iterator DMF_PV [UNSPEC_DMF_DMXVI8GERX4]) + ;; MMA instructions with 1 accumulator, 1 vector pair and 1 vector arguments (define_int_iterator MMA_APV [UNSPEC_MMA_XVF64GERPP UNSPEC_MMA_XVF64GERPN UNSPEC_MMA_XVF64GERNP UNSPEC_MMA_XVF64GERNN]) +;; DMF instructions with 1 dmr, 1 vector pair and 1 vector arguments +(define_int_iterator DMF_DPV [UNSPEC_DMF_DMXVI8GERX4PP]) + ;; MMA instructions with 2 vector, 2 4-bit and 1 8-bit arguments (define_int_iterator MMA_VVI4I4I8 [UNSPEC_MMA_PMXVI4GER8]) @@ -193,6 +205,14 @@ (define_int_iterator MMA_AVVI4I4I4 [UNSPEC_MMA_PMXVI8GER4PP UNSPEC_MMA_PMXVI8GER4SPP]) +;; DMF instructions with 1 vector pair, 1 vector and 1 8-bit and 2 4-bit +;; arguments +(define_int_iterator DMF_PVI8I4I4 [UNSPEC_DMF_PMDMXVI8GERX4]) + +;; DMF instructions with 1dmr, 1 vector pair, 1 vector and 1 8-bit and +;; 2 4-bit arguments +(define_int_iterator DMF_DPVI8I4I4 [UNSPEC_DMF_PMDMXVI8GERX4PP]) + (define_int_attr acc [(UNSPEC_MMA_XXMFACC "xxmfacc") (UNSPEC_MMA_XXMTACC "xxmtacc")]) @@ -222,12 +242,14 @@ (UNSPEC_MMA_XVF32GERNP "xvf32gernp") (UNSPEC_MMA_XVF32GERNN "xvf32gernn")]) -(define_int_attr pv [(UNSPEC_MMA_XVF64GER "xvf64ger")]) +(define_int_attr pv [(UNSPEC_MMA_XVF64GER "xvf64ger") + (UNSPEC_DMF_DMXVI8GERX4 "dmxvi8gerx4")]) (define_int_attr apv [(UNSPEC_MMA_XVF64GERPP "xvf64gerpp") 
(UNSPEC_MMA_XVF64GERPN "xvf64gerpn") (UNSPEC_MMA_XVF64GERNP "xvf64gernp") - (UNSPEC_MMA_XVF64GERNN "xvf64gernn")]) + (UNSPEC_MMA_XVF64GERNN "xvf64gernn") + (UNSPEC_DMF_DMXVI8GERX4PP "dmxvi8gerx4pp")]) ;; The "pm" prefix is not in these expansions, so that we can generate ;; pmdmxvi4ger8 on systems with dense math registers and xvi4ger8 on systems @@ -271,6 +293,9 @@ (define_int_attr avvi4i4i4 [(UNSPEC_MMA_PMXVI8GER4PP "xvi8ger4pp") (UNSPEC_MMA_PMXVI8GER4SPP "xvi8ger4spp")]) +(define_int_attr pvi8i4i4 [(UNSPEC_DMF_PMDMXVI8GERX4 "pmdmxvi8gerx4")]) + +(define_int_attr dpvi8i4i4 [(UNSPEC_DMF_PMDMXVI8GERX4PP "pmdmxvi8gerx4pp")]) ;; Vector pair support. OOmode can only live in VSRs. (define_expand "movoo" @@ -430,14 +455,25 @@ ;; instructions. (define_insn "dm_insert512" [(set (match_operand:XO 0 "dmr_operand" "=wD") - (unspec:XO [(match_operand:OO 1 "vsx_register_operand" "wa") - (match_operand:OO 2 "vsx_register_operand" "wa") - (match_operand 3 "const_0_to_1_operand")] - UNSPEC_DM_INSERT512_UPPER))] + (unspec:XO [(match_operand:OO 1 "vsx_register_operand" "wa") + (match_operand:OO 2 "vsx_register_operand" "wa") + (match_operand 3 "const_0_to_1_operand")] + UNSPEC_DM_INSERT512_UPPER))] "TARGET_DENSE_MATH" "dmxxinstdmr512 %0,%x1,%x2,%3" [(set_attr "type" "mma")]) +(define_insn "dm_insert1024" + [(set (match_operand:TDO 0 "dmr_operand" "=wD") + (unspec:TDO [(match_operand:OO 1 "vsx_register_operand" "wa") + (match_operand:OO 2 "vsx_register_operand" "wa") + (match_operand:OO 3 "vsx_register_operand" "wa") + (match_operand:OO 4 "vsx_register_operand" "wa")] + UNSPEC_DM_INSERT1024))] + "TARGET_DENSE_MATH" + "dmxxinstdmr512 %0,%x1,%x2,0\n\tdmxxinstdmr512 %0,%x3,%x4,1" + [(set_attr "type" "mma")]) + (define_expand "mma_assemble_acc" [(match_operand:XO 0 "accumulator_operand") (match_operand:V16QI 1 "mma_assemble_input_operand") @@ -502,6 +538,30 @@ DONE; }) +(define_expand "mma_build_dmr" + [(match_operand:TDO 0 "dmr_operand") + (match_operand:V16QI 1 
"mma_assemble_input_operand") + (match_operand:V16QI 2 "mma_assemble_input_operand") + (match_operand:V16QI 3 "mma_assemble_input_operand") + (match_operand:V16QI 4 "mma_assemble_input_operand") + (match_operand:V16QI 5 "mma_assemble_input_operand") + (match_operand:V16QI 6 "mma_assemble_input_operand") + (match_operand:V16QI 7 "mma_assemble_input_operand") + (match_operand:V16QI 8 "mma_assemble_input_operand")] + "TARGET_DENSE_MATH" +{ + rtx vp0 = gen_reg_rtx (OOmode); + rtx vp1 = gen_reg_rtx (OOmode); + rtx vp2 = gen_reg_rtx (OOmode); + rtx vp3 = gen_reg_rtx (OOmode); + emit_insn (gen_vsx_assemble_pair (vp0, operands[1], operands[2])); + emit_insn (gen_vsx_assemble_pair (vp1, operands[3], operands[4])); + emit_insn (gen_vsx_assemble_pair (vp2, operands[5], operands[6])); + emit_insn (gen_vsx_assemble_pair (vp3, operands[7], operands[8])); + emit_insn (gen_dm_insert1024 (operands[0], vp0, vp1, vp2, vp3)); + DONE; +}) + ;; MMA instructions that do not use their accumulators as an input, still must ;; not allow their vector operands to overlap the registers used by the ;; accumulator. We enforce this by marking the output as early clobber. 
The @@ -531,7 +591,7 @@ [(set_attr "type" "mma")]) (define_insn "mma_dmsetdmrz" - [(set (match_operand:XO 0 "accumulator_operand" "=wD") + [(set (match_operand:TDO 0 "accumulator_operand" "=wD") (unspec [(const_int 0)] UNSPEC_MMA_DMSETDMRZ))] "TARGET_DENSE_MATH" @@ -792,10 +852,11 @@ if (DMR_REGNO_P (regno0) && VSX_REGNO_P (regno1)) { - rtx op1_upper = gen_rtx_REG (XOmode, regno1); - rtx op1_lower = gen_rtx_REG (XOmode, regno1 + 4); - emit_insn (gen_movtdo_insert512_upper (op0, op1_upper)); - emit_insn (gen_movtdo_insert512_lower (op0, op0, op1_lower)); + rtx pair0 = gen_rtx_REG (OOmode, regno1); + rtx pair1 = gen_rtx_REG (OOmode, regno1 + 2); + rtx pair2 = gen_rtx_REG (OOmode, regno1 + 4); + rtx pair3 = gen_rtx_REG (OOmode, regno1 + 6); + emit_insn (gen_dm_insert1024 (op0, pair0, pair1, pair2, pair3)); DONE; } @@ -819,23 +880,13 @@ (set_attr "length" "*,*,32,8,*,8") (set_attr "max_prefixed_insns" "4,4,*,*,*,*")]) -;; Move from VSX registers to DMR registers via two insert 512 bit -;; instructions. 
-(define_insn "movtdo_insert512_upper" - [(set (match_operand:TDO 0 "dmr_operand" "=wD") - (unspec:TDO [(match_operand:XO 1 "vsx_register_operand" "wa")] - UNSPEC_DM_INSERT512_UPPER))] - "TARGET_DENSE_MATH" - "dmxxinstdmr512 %0,%1,%Y1,0" - [(set_attr "type" "mma")]) - -(define_insn "movtdo_insert512_lower" +(define_insn "dmf_dmxor" [(set (match_operand:TDO 0 "dmr_operand" "=wD") (unspec:TDO [(match_operand:TDO 1 "dmr_operand" "0") - (match_operand:XO 2 "vsx_register_operand" "wa")] - UNSPEC_DM_INSERT512_LOWER))] + (match_operand:TDO 2 "dmr_operand" "wD")] + UNSPEC_DMF_DMXOR))] "TARGET_DENSE_MATH" - "dmxxinstdmr512 %0,%2,%Y2,1" + "dmxor %0,%1,%2" [(set_attr "type" "mma")]) ;; Move from DMR registers to VSX registers via two extract 512 bit @@ -854,7 +905,10 @@ [(set (match_operand:TDO 0 "dmr_operand" "=wD") (unspec:TDO [(match_operand:TDO 1 "memory_operand" "m")] UNSPEC_DMR_RELOAD_FROM_MEMORY)) - (clobber (match_operand:XO 2 "vsx_register_operand" "=wa"))] + (clobber (match_operand:OO 2 "vsx_register_operand" "=wa")) + (clobber (match_operand:OO 3 "vsx_register_operand" "=wa")) + (clobber (match_operand:OO 4 "vsx_register_operand" "=wa")) + (clobber (match_operand:OO 5 "vsx_register_operand" "=wa"))] "TARGET_DENSE_MATH" "#" "&& reload_completed" @@ -862,18 +916,30 @@ { rtx dest = operands[0]; rtx src = operands[1]; - rtx tmp = operands[2]; - rtx mem_upper = adjust_address (src, XOmode, BYTES_BIG_ENDIAN ? 0 : 64); - rtx mem_lower = adjust_address (src, XOmode, BYTES_BIG_ENDIAN ? 
64 : 0); + rtx pair0 = operands[2]; + rtx pair1 = operands[3]; + rtx pair2 = operands[4]; + rtx pair3 = operands[5]; - emit_move_insn (tmp, mem_upper); - emit_insn (gen_movtdo_insert512_upper (dest, tmp)); + if (BYTES_BIG_ENDIAN) + { + emit_move_insn (pair0, adjust_address (src, OOmode, 0)); + emit_move_insn (pair1, adjust_address (src, OOmode, 32)); + emit_move_insn (pair2, adjust_address (src, OOmode, 64)); + emit_move_insn (pair3, adjust_address (src, OOmode, 96)); + } + else + { + emit_move_insn (pair3, adjust_address (src, OOmode, 0)); + emit_move_insn (pair2, adjust_address (src, OOmode, 32)); + emit_move_insn (pair1, adjust_address (src, OOmode, 64)); + emit_move_insn (pair0, adjust_address (src, OOmode, 96)); + } - emit_move_insn (tmp, mem_lower); - emit_insn (gen_movtdo_insert512_lower (dest, dest, tmp)); + emit_insn (gen_dm_insert1024 (dest, pair0, pair1, pair2, pair3)); DONE; } - [(set_attr "length" "16") + [(set_attr "length" "20") (set_attr "max_prefixed_insns" "2") (set_attr "type" "vecload")]) @@ -903,3 +969,57 @@ } [(set_attr "length" "16") (set_attr "max_prefixed_insns" "2")]) + +(define_insn "dmf_<pv>" + [(set (match_operand:TDO 0 "accumulator_operand" "=wD") + (unspec:TDO [(match_operand:OO 1 "vsx_register_operand" "wa") + (match_operand:V16QI 2 "vsx_register_operand" "wa")] + DMF_PV))] + "TARGET_DENSE_MATH" +{ + return "<pv> %0,%x1,%x2"; +} + [(set_attr "type" "dmf")]) + +(define_insn "dmf_<apv>" + [(set (match_operand:TDO 0 "accumulator_operand" "=wD") + (unspec:TDO [(match_operand:TDO 1 "accumulator_operand" "0") + (match_operand:OO 2 "vsx_register_operand" "wa") + (match_operand:V16QI 3 "vsx_register_operand" "wa")] + DMF_DPV))] + "TARGET_DENSE_MATH" +{ + return "<apv> %0,%x2,%x3"; +} + [(set_attr "type" "dmf")]) + +(define_insn "dmf_<pvi8i4i4>" + [(set (match_operand:TDO 0 "accumulator_operand" "=wD") + (unspec:TDO [(match_operand:OO 1 "vsx_register_operand" "wa") + (match_operand:V16QI 2 "vsx_register_operand" "wa") + (match_operand:SI 3 
"u8bit_cint_operand" "n") + (match_operand:SI 4 "const_0_to_15_operand" "n") + (match_operand:SI 5 "const_0_to_15_operand" "n")] + DMF_PVI8I4I4))] + "TARGET_DENSE_MATH" +{ + return "<pvi8i4i4> %0,%x1,%x2,%3,%4,%5"; +} + [(set_attr "type" "dmf") + (set_attr "prefixed" "yes")]) + +(define_insn "dmf_<dpvi8i4i4>" + [(set (match_operand:TDO 0 "accumulator_operand" "=wD") + (unspec:TDO [(match_operand:TDO 1 "accumulator_operand" "0") + (match_operand:OO 2 "vsx_register_operand" "wa") + (match_operand:V16QI 3 "vsx_register_operand" "wa") + (match_operand:SI 4 "u8bit_cint_operand" "n") + (match_operand:SI 5 "const_0_to_15_operand" "n") + (match_operand:SI 6 "const_0_to_15_operand" "n")] + DMF_DPVI8I4I4))] + "TARGET_DENSE_MATH" +{ + return "<dpvi8i4i4> %0,%x2,%x3,%4,%5,%6"; +} + [(set_attr "type" "dmf") + (set_attr "prefixed" "yes")]) diff --git a/gcc/config/rs6000/predicates.md b/gcc/config/rs6000/predicates.md index 05899ff14d33..55f01e5f14ed 100644 --- a/gcc/config/rs6000/predicates.md +++ b/gcc/config/rs6000/predicates.md @@ -1405,7 +1405,18 @@ && (indexed_or_indirect_address (XEXP (op, 0), mode) || quad_address_p (XEXP (op, 0), mode, false)))))")) -;; Return 1 if this operand is valid for an MMA disassemble insn. +;; Return 1 if this input operand is valid for an MMA disassemble insn. +(define_predicate "mma_disassemble_input_operand" + (match_code "reg") +{ + if (TARGET_DENSE_MATH) + return vsx_register_operand (op, mode); + else if (TARGET_MMA) + return fpr_reg_operand (op, mode); + return 0; +}) + +;; Return 1 if this output operand is valid for an MMA disassemble insn. 
(define_predicate "mma_disassemble_output_operand" (match_code "reg,subreg,mem") { diff --git a/gcc/config/rs6000/rs6000-builtin.cc b/gcc/config/rs6000/rs6000-builtin.cc index 1885b1f636f3..00c1a6687101 100644 --- a/gcc/config/rs6000/rs6000-builtin.cc +++ b/gcc/config/rs6000/rs6000-builtin.cc @@ -1101,7 +1101,8 @@ rs6000_gimple_fold_mma_builtin (gimple_stmt_iterator *gsi, gimple *stmt = gsi_stmt (*gsi); size_t fncode = (size_t) fn_code; - if (!bif_is_mma (rs6000_builtin_info[fncode])) + if (!bif_is_mma (rs6000_builtin_info[fncode]) + && !bif_is_dm (rs6000_builtin_info[fncode])) return false; /* Each call that can be gimple-expanded has an associated built-in @@ -1109,11 +1110,11 @@ rs6000_gimple_fold_mma_builtin (gimple_stmt_iterator *gsi, already expanded it! Exceptions: lxvp and stxvp. */ if (rs6000_builtin_info[fncode].assoc_bif == RS6000_BIF_NONE && fncode != RS6000_BIF_LXVP - && fncode != RS6000_BIF_STXVP) + && fncode != RS6000_BIF_STXVP + && fncode != RS6000_BIF_DMMR) return false; bifdata *bd = &rs6000_builtin_info[fncode]; - unsigned nopnds = bd->nargs; gimple_seq new_seq = NULL; gimple *new_call; tree new_decl; @@ -1228,23 +1229,51 @@ rs6000_gimple_fold_mma_builtin (gimple_stmt_iterator *gsi, return true; } +#if 0 + if (fncode == RS6000_BIF_DMMR) + { + push_gimplify_context (true); + tree dst_ptr = gimple_call_arg (stmt, 0); + tree src_ptr = gimple_call_arg (stmt, 1); + tree dst = build_simple_mem_ref (dst_ptr); + tree src_type = build_pointer_type (dmr_type_node); + tree src = create_tmp_reg_or_ssa_name (TREE_TYPE (src_type)); + tree src_mem = build_simple_mem_ref (build1 (NOP_EXPR, + src_type, src_ptr)); + gimplify_assign (src, src_mem, &new_seq); + gimplify_assign (dst, src, &new_seq); + pop_gimplify_context (NULL); + gsi_replace_with_seq (gsi, new_seq, true); + return true; + } +#endif + /* Convert this built-in into an internal version that uses pass-by-value arguments. The internal built-in is found in the assoc_bif field. 
*/ - new_decl = rs6000_builtin_decls[rs6000_builtin_info[fncode].assoc_bif]; + size_t new_fncode = rs6000_builtin_info[fncode].assoc_bif; + new_decl = rs6000_builtin_decls[new_fncode]; tree lhs, op[MAX_MMA_OPERANDS]; + tree lhs_type = NULL_TREE; tree acc = gimple_call_arg (stmt, 0); push_gimplify_context (true); - if (bif_is_quad (*bd)) + switch (insn_data[rs6000_builtin_info[new_fncode].icode].operand[0].mode) { - /* This built-in has a pass-by-reference accumulator input, so load it - into a temporary accumulator for use as a pass-by-value input. */ - op[0] = create_tmp_reg_or_ssa_name (vector_quad_type_node); - for (unsigned i = 1; i < nopnds; i++) - op[i] = gimple_call_arg (stmt, i); - gimplify_assign (op[0], build_simple_mem_ref (acc), &new_seq); + case TDOmode: + lhs_type = dmr_type_node; + break; + case XOmode: + lhs_type = vector_quad_type_node; + break; + case OOmode: + lhs_type = vector_pair_type_node; + break; + default: + gcc_unreachable (); } - else + +#if 0 + if (!bif_is_dmr (*bd) && !bif_is_quad (*bd)) { /* This built-in does not use its pass-by-reference accumulator argument as an input argument, so remove it from the input list. */ @@ -1253,6 +1282,39 @@ rs6000_gimple_fold_mma_builtin (gimple_stmt_iterator *gsi, op[i] = gimple_call_arg (stmt, i + 1); } + if (bif_is_dmr (*bd) || bif_is_quad (*bd)) + { + /* This built-in has a pass-by-reference accumulator input, so load it + into a temporary accumulator for use as a pass-by-value input. */ + op[0] = create_tmp_reg_or_ssa_name (lhs_type); + gimplify_assign (op[0], build_simple_mem_ref (acc), &new_seq); + } + else +#endif + + unsigned nopnds = 0; + for (unsigned i = 0; i < bd->nargs; i++) + { + tree arg = gimple_call_arg (stmt, i); + if (i == 0 && !bif_is_dmr (*bd) && !bif_is_quad (*bd)) + continue; + /* If this is another DMR operand, it is passed in by reference. + The internal built-ins use pass-by-value, so load this operand + into a variable and pass that in as our operand. 
*/ + if (POINTER_TYPE_P (TREE_TYPE (arg)) + && TREE_TYPE (TREE_TYPE (arg)) == lhs_type) + { + tree op_mem = build_simple_mem_ref (build1 (NOP_EXPR, + TREE_TYPE (arg), + arg)); + op[nopnds] = create_tmp_reg_or_ssa_name (lhs_type); + gimplify_assign (op[nopnds], op_mem, &new_seq); + } + else + op[nopnds] = arg; + nopnds++; + } + switch (nopnds) { case 0: @@ -1282,14 +1344,19 @@ rs6000_gimple_fold_mma_builtin (gimple_stmt_iterator *gsi, new_call = gimple_build_call (new_decl, 7, op[0], op[1], op[2], op[3], op[4], op[5], op[6]); break; + case 8: + new_call = gimple_build_call (new_decl, 8, op[0], op[1], op[2], op[3], + op[4], op[5], op[6], op[7]); + break; + case 9: + new_call = gimple_build_call (new_decl, 9, op[0], op[1], op[2], op[3], + op[4], op[5], op[6], op[7], op[8]); + break; default: gcc_unreachable (); } - if (fncode == RS6000_BIF_BUILD_PAIR || fncode == RS6000_BIF_ASSEMBLE_PAIR_V) - lhs = create_tmp_reg_or_ssa_name (vector_pair_type_node); - else - lhs = create_tmp_reg_or_ssa_name (vector_quad_type_node); + lhs = create_tmp_reg_or_ssa_name (lhs_type); gimple_call_set_lhs (new_call, lhs); gimple_seq_add_stmt (&new_seq, new_call); gimplify_assign (build_simple_mem_ref (acc), lhs, &new_seq); @@ -3006,6 +3073,14 @@ mma_expand_builtin (tree exp, rtx target, insn_code icode, case 7: pat = GEN_FCN (icode) (op[0], op[1], op[2], op[3], op[4], op[5], op[6]); break; + case 8: + pat = GEN_FCN (icode) (op[0], op[1], op[2], op[3], op[4], op[5], op[6], + op[7]); + break; + case 9: + pat = GEN_FCN (icode) (op[0], op[1], op[2], op[3], op[4], op[5], op[6], + op[7], op[8]); + break; default: gcc_unreachable (); } @@ -3442,7 +3517,7 @@ rs6000_expand_builtin (tree exp, rtx target, rtx /* subtarget */, /* Position of first argument (0 for void-returning functions, else 1). */ int k; /* Modes for the return value, if any, and arguments. 
*/ - const int MAX_BUILTIN_ARGS = 6; + const int MAX_BUILTIN_ARGS = 8; machine_mode mode[MAX_BUILTIN_ARGS + 1]; if (void_func) @@ -3577,7 +3652,8 @@ rs6000_expand_builtin (tree exp, rtx target, rtx /* subtarget */, if (bif_is_lxvrze (*bifaddr)) return lxvrze_expand_builtin (target, icode, op, mode[0], mode[1]); - if (bif_is_mma (*bifaddr)) + if (bif_is_mma (*bifaddr) + || bif_is_dm (*bifaddr)) return mma_expand_builtin (exp, target, icode, fcode); if (TREE_TYPE (TREE_TYPE (fndecl)) == void_type_node) diff --git a/gcc/config/rs6000/rs6000-builtins.def b/gcc/config/rs6000/rs6000-builtins.def index eef5f41f7615..3630c6a6136c 100644 --- a/gcc/config/rs6000/rs6000-builtins.def +++ b/gcc/config/rs6000/rs6000-builtins.def @@ -139,6 +139,7 @@ ; ibm128 Restrict usage to the case where __ibm128 is supported or if ibmld ; future Restrict usage to future instructions ; dm Restrict usage to dense math +; dmr MMA instruction using a dmr register as an input operand ; ; Each attribute corresponds to extra processing required when ; the built-in is expanded. 
All such special processing should @@ -3927,6 +3928,60 @@ void __builtin_vsx_stxvp (v256, unsigned long, const v256 *); STXVP nothing {mma,pair} +[dm] + void __builtin_mma_dmsetdmrz (dm1024 *); + DMSETDMRZ nothing {dm,dmint} + + dm1024 __builtin_mma_dmsetdmrz_internal (); + DMSETDMRZ_INTERNAL mma_dmsetdmrz {dm} + + void __builtin_mma_dmmr (dm1024 *, dm1024 *); + DMMR nothing {dm,dmint} + + dm1024 __builtin_mma_dmmr_internal (dm1024); + DMMR_INTERNAL movtdo {dm} + + void __builtin_mma_dmxor (dm1024 *, dm1024 *); + DMXOR nothing {dm,dmint,dmr} + + dm1024 __builtin_mma_dmxor_internal (dm1024, dm1024); + DMXOR_INTERNAL dmf_dmxor {dm} + + void __builtin_mma_build_dmr (dm1024 *, vuc, vuc, vuc, vuc, vuc, vuc, vuc, vuc); + BUILD_DMR nothing {dm,dmint} + + dm1024 __builtin_mma_build_dmr_internal (vuc, vuc, vuc, vuc, vuc, vuc, vuc, vuc); + BUILD_DMR_INTERNAL mma_build_dmr {dm} + + void __builtin_mma_dmxvi8gerx4 (dm1024 *, v256, vuc); + DMXVI8GERX4 nothing {dm,dmint} + + dm1024 __builtin_mma_dmxvi8gerx4_internal (v256, vuc); + DMXVI8GERX4_INTERNAL dmf_dmxvi8gerx4 {dm} + + void __builtin_mma_dmxvi8gerx4pp (dm1024 *, v256, vuc); + DMXVI8GERX4PP nothing {dm,dmint,dmr} + + dm1024 __builtin_mma_dmxvi8gerx4pp_internal (dm1024, v256, vuc); + DMXVI8GERX4PP_INTERNAL dmf_dmxvi8gerx4pp {dm} + + void __builtin_mma_pmdmxvi8gerx4 (dm1024 *, v256, vuc, const int<8>, \ + const int<4>, const int<4>); + PMDMXVI8GERX4 nothing {dm,pair,dmint} + + dm1024 __builtin_mma_pmdmxvi8gerx4_internal (v256, vuc, const int<8>, \ + const int<4>, const int<4>); + PMDMXVI8GERX4_INTERNAL dmf_pmdmxvi8gerx4 {dm,pair} + + void __builtin_mma_pmdmxvi8gerx4pp (dm1024 *, v256, vuc, const int<8>, \ + const int<4>, const int<4>); + PMDMXVI8GERX4PP nothing {dm,pair,dmint,dmr} + + dm1024 __builtin_mma_pmdmxvi8gerx4pp_internal (dm1024, v256, vuc, \ + const int<8>, const int<4>, \ + const int<4>); + PMDMXVI8GERX4PP_INTERNAL dmf_pmdmxvi8gerx4pp {dm,pair} + [future] const signed int __builtin_saturate_subtract32 (signed int, 
signed int); SAT_SUBSI sat_subsi3 {} diff --git a/gcc/config/rs6000/rs6000-gen-builtins.cc b/gcc/config/rs6000/rs6000-gen-builtins.cc index d9983fe03feb..45f4362feaad 100644 --- a/gcc/config/rs6000/rs6000-gen-builtins.cc +++ b/gcc/config/rs6000/rs6000-gen-builtins.cc @@ -94,6 +94,9 @@ along with GCC; see the file COPYING3. If not see ibmld Restrict usage to the case when TFmode is IBM-128 ibm128 Restrict usage to the case where __ibm128 is supported or if ibmld + dm Needs special handling for DMF/MMA+ instructions + dmint DMF/MMA+ instruction expanding to internal call at GIMPLE time + dmr MMA+ instruction using a dmr register as an input operand An example stanza might look like this: @@ -333,7 +336,8 @@ enum basetype BT_DECIMAL128, BT_IBM128, BT_VPAIR, - BT_VQUAD + BT_VQUAD, + BT_DMR }; /* Ways in which a const int value can be restricted. RES_BITS indicates @@ -403,6 +407,8 @@ struct attrinfo bool isibm128; bool isfuture; bool isdm; + bool isdmint; + bool isdmr; }; /* Fields associated with a function prototype (bif or overload). */ @@ -554,6 +560,7 @@ static typemap type_map[] = { "pv16qi", "ptr_V16QI" }, { "pv1poi", "ptr_vector_pair" }, { "pv1pxi", "ptr_vector_quad" }, + { "pv1tdoi", "ptr_dmr" }, { "pv1ti", "ptr_V1TI" }, { "pv2df", "ptr_V2DF" }, { "pv2di", "ptr_V2DI" }, @@ -584,6 +591,7 @@ static typemap type_map[] = { "v16qi", "V16QI" }, { "v1poi", "vector_pair" }, { "v1pxi", "vector_quad" }, + { "v1tdoi", "dmr" }, { "v1ti", "V1TI" }, { "v2df", "V2DF" }, { "v2di", "V2DI" }, @@ -1069,6 +1077,7 @@ match_type (typeinfo *typedata, int voidok) vd vector double v256 __vector_pair v512 __vector_quad + dm1024 __dmr For simplicity, We don't support "short int" and "long long int". We don't currently support a <basetype> of "_Float16". 
"signed" @@ -1250,6 +1259,13 @@ match_type (typeinfo *typedata, int voidok) handle_pointer (typedata); return 1; } + else if (!strcmp (token, "dm1024")) + { + typedata->isvector = 1; + typedata->base = BT_DMR; + handle_pointer (typedata); + return 1; + } else if (!strcmp (token, "signed")) typedata->issigned = 1; else if (!strcmp (token, "unsigned")) @@ -1448,6 +1464,12 @@ parse_bif_attrs (attrinfo *attrptr) attrptr->isibmld = 1; else if (!strcmp (attrname, "ibm128")) attrptr->isibm128 = 1; + else if (!strcmp (attrname, "dm")) + attrptr->isdm = 1; + else if (!strcmp (attrname, "dmint")) + attrptr->isdmint = 1; + else if (!strcmp (attrname, "dmr")) + attrptr->isdmr = 1; else { diag (oldpos, "unknown attribute.\n"); @@ -1481,7 +1503,7 @@ parse_bif_attrs (attrinfo *attrptr) "pred = %d, htm = %d, htmspr = %d, htmcr = %d, mma = %d, " "quad = %d, pair = %d, mmaint = %d, no32bit = %d, 32bit = %d, " "cpu = %d, ldstmask = %d, lxvrse = %d, lxvrze = %d, endian = %d, " - "ibmdld = %d, ibm128 = %d, future = %d, dm = %d.\n", + "ibmdld = %d, ibm128 = %d, future = %d, dm = %d, dmint = %d, dmr = %d.\n", attrptr->isextract, attrptr->isnosoft,attrptr->isldvec, attrptr->isstvec, attrptr->isreve, attrptr->ispred, attrptr->ishtm, attrptr->ishtmspr, attrptr->ishtmcr, attrptr->ismma, @@ -1489,7 +1511,7 @@ parse_bif_attrs (attrinfo *attrptr) attrptr->isno32bit, attrptr->is32bit, attrptr->iscpu, attrptr->isldstmask, attrptr->islxvrse, attrptr->islxvrze, attrptr->isendian, attrptr->isibmld, attrptr->isibm128, - attrptr->isfuture, attrptr->isdm); + attrptr->isfuture, attrptr->isdm, attrptr->isdmint, attrptr->isdmr); #endif return PC_OK; @@ -1550,6 +1572,10 @@ complete_vector_type (typeinfo *typeptr, char *buf, int *bufi) memcpy (&buf[*bufi], "1pxi", 4); *bufi += 4; break; + case BT_DMR: + memcpy (&buf[*bufi], "1tdoi", 5); + *bufi += 5; + break; default: diag (pos, "unhandled basetype %d.\n", typeptr->base); exit (1); @@ -2308,6 +2334,8 @@ write_decls (void) fprintf (header_file, "#define 
bif_ibm128_bit\t\t(0x00800000)\n"); fprintf (header_file, "#define bif_future_bit\t\t(0x01000000)\n"); fprintf (header_file, "#define bif_dm_bit\t\t(0x02000000)\n"); + fprintf (header_file, "#define bif_dmint_bit\t\t(0x04000000)\n"); + fprintf (header_file, "#define bif_dmr_bit\t\t(0x08000000)\n"); fprintf (header_file, "\n"); fprintf (header_file, "#define bif_is_extract(x)\t((x).bifattrs & bif_extract_bit)\n"); @@ -2357,6 +2385,10 @@ write_decls (void) "#define bif_is_future(x)\t((x).bifattrs & bif_future_bit)\n"); fprintf (header_file, "#define bif_is_dm(x)\t((x).bifattrs & bif_dm_bit)\n"); + fprintf (header_file, + "#define bif_is_dmint(x)\t((x).bifattrs & bif_dmint_bit)\n"); + fprintf (header_file, + "#define bif_is_dmr(x)\t((x).bifattrs & bif_dmr_bit)\n"); fprintf (header_file, "\n"); fprintf (header_file, @@ -2560,6 +2592,10 @@ write_bif_static_init (void) fprintf (init_file, " | bif_future_bit"); if (bifp->attrs.isdm) fprintf (init_file, " | bif_dm_bit"); + if (bifp->attrs.isdmint) + fprintf (init_file, " | bif_dmint_bit"); + if (bifp->attrs.isdmr) + fprintf (init_file, " | bif_dmr_bit"); fprintf (init_file, ",\n"); fprintf (init_file, " /* restr_opnd */\t{%d, %d, %d},\n", bifp->proto.restr_opnd[0], bifp->proto.restr_opnd[1], @@ -2593,8 +2629,8 @@ write_bif_static_init (void) : (bifp->kind == FNK_FPMATH ? "= fp, const" : "")))); fprintf (init_file, " /* assoc_bif */\tRS6000_BIF_%s%s\n", - bifp->attrs.ismmaint ? bifp->idname : "NONE", - bifp->attrs.ismmaint ? "_INTERNAL" : ""); + (bifp->attrs.ismmaint || bifp->attrs.isdmint) ? bifp->idname : "NONE", + (bifp->attrs.ismmaint || bifp->attrs.isdmint) ? 
"_INTERNAL" : ""); fprintf (init_file, " },\n"); } fprintf (init_file, " };\n\n"); diff --git a/gcc/config/rs6000/rs6000.md b/gcc/config/rs6000/rs6000.md index eda68cb8cf43..acc3754ebc70 100644 --- a/gcc/config/rs6000/rs6000.md +++ b/gcc/config/rs6000/rs6000.md @@ -223,7 +223,7 @@ vecsimple,veccomplex,vecdiv,veccmp,veccmpsimple,vecperm, vecfloat,vecfdiv,vecdouble,mtvsr,mfvsr,crypto, veclogical,veccmpfx,vecexts,vecmove, - htm,htmsimple,dfp,mma, + htm,htmsimple,dfp,mma,dmf, fused_arith_logical, fused_cmp_isel, fused_carry, @@ -371,7 +371,7 @@ (const (symbol_ref "(enum attr_cpu) rs6000_tune"))) ;; The ISA we implement. -(define_attr "isa" "any,p5,p6,p7,p7v,p8,p8v,p9,p9v,p9kf,p9tf,p10" +(define_attr "isa" "any,p5,p6,p7,p7v,p8,p8v,p9,p9v,p9kf,p9tf,p10,ftr,mma,dmf" (const_string "any")) ;; Is this alternative enabled for the current CPU/ISA/etc.? @@ -423,6 +423,18 @@ (and (eq_attr "isa" "p10") (match_test "TARGET_POWER10")) (const_int 1) + + (and (eq_attr "isa" "ftr") + (match_test "TARGET_FUTURE")) + (const_int 1) + + (and (eq_attr "isa" "mma") + (match_test "TARGET_MMA")) + (const_int 1) + + (and (eq_attr "isa" "dmf") + (match_test "TARGET_DENSE_MATH")) + (const_int 1) ] (const_int 0))) ;; If this instruction is microcoded on the CELL processor