https://gcc.gnu.org/g:d095370c890d2d51479de908bd6e5d342214612f
commit d095370c890d2d51479de908bd6e5d342214612f
Author: Michael Meissner <[email protected]>
Date:   Mon Nov 3 19:54:47 2025 -0500

    Add changes from future patch submission.

    2025-11-03  Michael Meissner  <[email protected]>

    gcc/

        * config/rs6000/float16.cc (bfloat16_operation_as_v4sf): Move to
        be after fp16_vectorization.
        * config/rs6000/rs6000-protos.h (bfloat16_operation_as_v4sf):
        Likewise.

Diff:
---
 gcc/config/rs6000/float16.cc      | 285 +++++++++++++++++++-------------------
 gcc/config/rs6000/rs6000-protos.h |   4 +-
 2 files changed, 144 insertions(+), 145 deletions(-)

diff --git a/gcc/config/rs6000/float16.cc b/gcc/config/rs6000/float16.cc
index b2a389270136..2c7b6278a16a 100644
--- a/gcc/config/rs6000/float16.cc
+++ b/gcc/config/rs6000/float16.cc
@@ -42,49 +42,43 @@
 #include "common/common-target.h"
 #include "rs6000-internal.h"
 
-/* Expand a bfloat16 scalar floating point operation:
+/* Expand a 16-bit vector operation:
 
    ICODE:   Operation to perform.
    RESULT:  Result of the operation.
    OP1:     Input operand1.
    OP2:     Input operand2.
    OP3:     Input operand3 or NULL_RTX.
-   SUBTYPE: Describe the operation.
-
-   The operation is done as a V4SFmode vector operation.  This is because
-   converting BFmode from a scalar BFmode to SFmode to do the operation and
-   back again takes quite a bit of time.  GCC will only generate the native
-   operation if -Ofast is used.  The float16.md code that calls this function
-   adds various combine operations to do the operation in V4SFmode instead of
-   SFmode.  */
+   SUBTYPE: Describe the operation.  */
 
 void
-bfloat16_operation_as_v4sf (enum rtx_code icode,
-                            rtx result,
-                            rtx op1,
-                            rtx op2,
-                            rtx op3,
-                            enum fp16_operation subtype)
+fp16_vectorization (enum rtx_code icode,
+                    rtx result,
+                    rtx op1,
+                    rtx op2,
+                    rtx op3,
+                    enum fp16_operation subtype)
 {
   gcc_assert (can_create_pseudo_p ());
 
-  rtx result_v4sf = gen_reg_rtx (V4SFmode);
-  rtx ops_orig[3] = { op1, op2, op3 };
-  rtx ops_v4sf[3];
+  machine_mode result_mode = GET_MODE (result);
+  rtx op_orig[3] = { op1, op2, op3 };
+  rtx op_hi[3];
+  rtx op_lo[3];
+  rtx result_hi;
+  rtx result_lo;
   size_t n_opts;
 
   switch (subtype)
     {
     case FP16_BINARY:
       n_opts = 2;
-      gcc_assert (op3 == NULL_RTX);
       break;
 
     case FP16_FMA:
     case FP16_FMS:
     case FP16_NFMA:
     case FP16_NFMS:
-      gcc_assert (icode == FMA);
       n_opts = 3;
       break;
 
@@ -92,65 +86,52 @@ bfloat16_operation_as_v4sf (enum rtx_code icode,
       gcc_unreachable ();
     }
 
+  /* Allocate 2 temporaries for the results and the input operands.  */
+  result_hi = gen_reg_rtx (V4SFmode);
+  result_lo = gen_reg_rtx (V4SFmode);
+
   for (size_t i = 0; i < n_opts; i++)
     {
-      rtx op = ops_orig[i];
-      rtx tmp = ops_v4sf[i] = gen_reg_rtx (V4SFmode);
-
-      gcc_assert (op != NULL_RTX);
-
-      /* Remove truncation/extend added.  */
-      if (GET_CODE (op) == FLOAT_EXTEND || GET_CODE (op) == FLOAT_TRUNCATE)
-        op = XEXP (op, 0);
+      gcc_assert (op_orig[i] != NULL_RTX);
+      op_hi[i] = gen_reg_rtx (V4SFmode);        /* high register.  */
+      op_lo[i] = gen_reg_rtx (V4SFmode);        /* low register.  */
 
-      /* Convert operands to V4SFmode format.  We use SPLAT for registers to
-         get the value into the upper 32-bits.  We can use XXSPLTW to splat
-         words instead of VSPLTIH since the XVCVBF16SPN instruction ignores the
-         odd half-words, and XXSPLTW can operate on all VSX registers instead
-         of just the Altivec registers.  Using SPLAT instead of a shift also
-         insure that other bits are not a signalling NaN.  If we are using
-         XXSPLTIW or XXSPLTIB to load the constant the other bits are
-         duplicated.  */
+      rtx interleave_hi = gen_reg_rtx (result_mode);
+      rtx interleave_lo = gen_reg_rtx (result_mode);
+      rtx orig = op_orig[i];
 
-      if (op == CONST0_RTX (SFmode) || op == CONST0_RTX (BFmode))
-        emit_move_insn (tmp, CONST0_RTX (V4SFmode));
+      rs6000_expand_interleave (interleave_hi, orig, orig, !BYTES_BIG_ENDIAN);
+      rs6000_expand_interleave (interleave_lo, orig, orig, BYTES_BIG_ENDIAN);
 
-      else if (GET_MODE (op) == BFmode)
+      if (result_mode == V8HFmode)
         {
-          emit_insn (gen_xxspltw_bf (tmp, force_reg (BFmode, op)));
-          emit_insn (gen_xvcvbf16spn_bf (tmp, tmp));
+          emit_insn (gen_xvcvhpsp_v8hf (op_hi[i], interleave_hi));
+          emit_insn (gen_xvcvhpsp_v8hf (op_lo[i], interleave_lo));
         }
 
-      else if (GET_MODE (op) == SFmode)
+      else if (result_mode == V8BFmode)
         {
-          if (GET_CODE (op) == CONST_DOUBLE)
-            {
-              rtvec v = rtvec_alloc (4);
-
-              for (size_t i = 0; i < 4; i++)
-                RTVEC_ELT (v, i) = op;
-
-              emit_insn (gen_rtx_SET (tmp,
-                                      gen_rtx_CONST_VECTOR (V4SFmode, v)));
-            }
-
-          else
-            emit_insn (gen_vsx_splat_v4sf (tmp,
-                                           force_reg (SFmode, op)));
+          emit_insn (gen_xvcvbf16spn_v8bf (op_hi[i], interleave_hi));
+          emit_insn (gen_xvcvbf16spn_v8bf (op_lo[i], interleave_lo));
         }
 
       else
        gcc_unreachable ();
     }
 
-  /* Do the operation in V4SFmode.  */
+  /* Do 2 sets of V4SFmode operations.  */
   switch (subtype)
    {
    case FP16_BINARY:
-      emit_insn (gen_rtx_SET (result_v4sf,
+      emit_insn (gen_rtx_SET (result_hi,
                              gen_rtx_fmt_ee (icode, V4SFmode,
-                                             ops_v4sf[0],
-                                             ops_v4sf[1])));
+                                             op_hi[0],
+                                             op_hi[1])));
+
+      emit_insn (gen_rtx_SET (result_lo,
+                              gen_rtx_fmt_ee (icode, V4SFmode,
+                                              op_lo[0],
+                                              op_lo[1])));
       break;
 
     case FP16_FMA:
@@ -158,19 +139,31 @@ bfloat16_operation_as_v4sf (enum rtx_code icode,
     case FP16_NFMA:
     case FP16_NFMS:
       {
-        rtx op1 = ops_v4sf[0];
-        rtx op2 = ops_v4sf[1];
-        rtx op3 = ops_v4sf[2];
+        rtx op1_hi = op_hi[0];
+        rtx op2_hi = op_hi[1];
+        rtx op3_hi = op_hi[2];
+
+        rtx op1_lo = op_lo[0];
+        rtx op2_lo = op_lo[1];
+        rtx op3_lo = op_lo[2];
 
         if (subtype == FP16_FMS || subtype == FP16_NFMS)
-          op3 = gen_rtx_NEG (V4SFmode, op3);
+          {
+            op3_hi = gen_rtx_NEG (V4SFmode, op3_hi);
+            op3_lo = gen_rtx_NEG (V4SFmode, op3_lo);
+          }
 
-        rtx op_fma = gen_rtx_FMA (V4SFmode, op1, op2, op3);
+        rtx op_fma_hi = gen_rtx_FMA (V4SFmode, op1_hi, op2_hi, op3_hi);
+        rtx op_fma_lo = gen_rtx_FMA (V4SFmode, op1_lo, op2_lo, op3_lo);
 
         if (subtype == FP16_NFMA || subtype == FP16_NFMS)
-          op_fma = gen_rtx_NEG (V4SFmode, op_fma);
+          {
+            op_fma_hi = gen_rtx_NEG (V4SFmode, op_fma_hi);
+            op_fma_lo = gen_rtx_NEG (V4SFmode, op_fma_lo);
+          }
 
-        emit_insn (gen_rtx_SET (result_v4sf, op_fma));
+        emit_insn (gen_rtx_SET (result_hi, op_fma_hi));
+        emit_insn (gen_rtx_SET (result_lo, op_fma_lo));
       }
       break;
 
@@ -178,58 +171,62 @@
       gcc_unreachable ();
     }
 
-  /* Convert V4SF result back to scalar mode.  */
-  if (GET_MODE (result) == BFmode)
-    emit_insn (gen_xvcvspbf16_bf (result, result_v4sf));
+  /* Combine the 2 V4SFmode operations into one V8HFmode/V8BFmode vector.  */
+  if (result_mode == V8HFmode)
+    emit_insn (gen_vec_pack_trunc_v4sf_v8hf (result, result_hi, result_lo));
 
-  else if (GET_MODE (result) == SFmode)
-    {
-      rtx element = GEN_INT (WORDS_BIG_ENDIAN ? 2 : 3);
-      emit_insn (gen_vsx_extract_v4sf (result, result_v4sf, element));
-    }
+  else if (result_mode == V8BFmode)
+    emit_insn (gen_vec_pack_trunc_v4sf_v8bf (result, result_hi, result_lo));
 
   else
     gcc_unreachable ();
-}
+
+  return;
+}
 
-/* Expand a 16-bit vector operation:
+/* Expand a bfloat16 scalar floating point operation:
 
    ICODE:   Operation to perform.
    RESULT:  Result of the operation.
    OP1:     Input operand1.
    OP2:     Input operand2.
    OP3:     Input operand3 or NULL_RTX.
-   SUBTYPE: Describe the operation.  */
+   SUBTYPE: Describe the operation.
+
+   The operation is done as a V4SFmode vector operation.  This is because
+   converting BFmode from a scalar BFmode to SFmode to do the operation and
+   back again takes quite a bit of time.  GCC will only generate the native
+   operation if -Ofast is used.  The float16.md code that calls this function
+   adds various combine operations to do the operation in V4SFmode instead of
+   SFmode.  */
 
 void
-fp16_vectorization (enum rtx_code icode,
-                    rtx result,
-                    rtx op1,
-                    rtx op2,
-                    rtx op3,
-                    enum fp16_operation subtype)
+bfloat16_operation_as_v4sf (enum rtx_code icode,
+                            rtx result,
+                            rtx op1,
+                            rtx op2,
+                            rtx op3,
+                            enum fp16_operation subtype)
 {
   gcc_assert (can_create_pseudo_p ());
 
-  machine_mode result_mode = GET_MODE (result);
-  rtx op_orig[3] = { op1, op2, op3 };
-  rtx op_hi[3];
-  rtx op_lo[3];
-  rtx result_hi;
-  rtx result_lo;
+  rtx result_v4sf = gen_reg_rtx (V4SFmode);
+  rtx ops_orig[3] = { op1, op2, op3 };
+  rtx ops_v4sf[3];
   size_t n_opts;
 
   switch (subtype)
     {
     case FP16_BINARY:
       n_opts = 2;
+      gcc_assert (op3 == NULL_RTX);
       break;
 
     case FP16_FMA:
     case FP16_FMS:
     case FP16_NFMA:
     case FP16_NFMS:
+      gcc_assert (icode == FMA);
       n_opts = 3;
       break;
 
@@ -237,52 +234,65 @@ fp16_vectorization (enum rtx_code icode,
       gcc_unreachable ();
     }
 
-  /* Allocate 2 temporaries for the results and the input operands.  */
-  result_hi = gen_reg_rtx (V4SFmode);
-  result_lo = gen_reg_rtx (V4SFmode);
-
   for (size_t i = 0; i < n_opts; i++)
     {
-      gcc_assert (op_orig[i] != NULL_RTX);
-      op_hi[i] = gen_reg_rtx (V4SFmode);        /* high register.  */
-      op_lo[i] = gen_reg_rtx (V4SFmode);        /* low register.  */
+      rtx op = ops_orig[i];
+      rtx tmp = ops_v4sf[i] = gen_reg_rtx (V4SFmode);
 
-      rtx interleave_hi = gen_reg_rtx (result_mode);
-      rtx interleave_lo = gen_reg_rtx (result_mode);
-      rtx orig = op_orig[i];
+      gcc_assert (op != NULL_RTX);
 
-      rs6000_expand_interleave (interleave_hi, orig, orig, !BYTES_BIG_ENDIAN);
-      rs6000_expand_interleave (interleave_lo, orig, orig, BYTES_BIG_ENDIAN);
+      /* Remove truncation/extend added.  */
+      if (GET_CODE (op) == FLOAT_EXTEND || GET_CODE (op) == FLOAT_TRUNCATE)
+        op = XEXP (op, 0);
 
-      if (result_mode == V8HFmode)
+      /* Convert operands to V4SFmode format.  We use SPLAT for registers to
+         get the value into the upper 32-bits.  We can use XXSPLTW to splat
+         words instead of VSPLTIH since the XVCVBF16SPN instruction ignores the
+         odd half-words, and XXSPLTW can operate on all VSX registers instead
+         of just the Altivec registers.  Using SPLAT instead of a shift also
+         insure that other bits are not a signalling NaN.  If we are using
+         XXSPLTIW or XXSPLTIB to load the constant the other bits are
+         duplicated.  */
+
+      if (op == CONST0_RTX (SFmode) || op == CONST0_RTX (BFmode))
+        emit_move_insn (tmp, CONST0_RTX (V4SFmode));
+
+      else if (GET_MODE (op) == BFmode)
         {
-          emit_insn (gen_xvcvhpsp_v8hf (op_hi[i], interleave_hi));
-          emit_insn (gen_xvcvhpsp_v8hf (op_lo[i], interleave_lo));
+          emit_insn (gen_xxspltw_bf (tmp, force_reg (BFmode, op)));
+          emit_insn (gen_xvcvbf16spn_bf (tmp, tmp));
         }
 
-      else if (result_mode == V8BFmode)
+      else if (GET_MODE (op) == SFmode)
        {
-          emit_insn (gen_xvcvbf16spn_v8bf (op_hi[i], interleave_hi));
-          emit_insn (gen_xvcvbf16spn_v8bf (op_lo[i], interleave_lo));
+          if (GET_CODE (op) == CONST_DOUBLE)
+            {
+              rtvec v = rtvec_alloc (4);
+
+              for (size_t i = 0; i < 4; i++)
+                RTVEC_ELT (v, i) = op;
+
+              emit_insn (gen_rtx_SET (tmp,
+                                      gen_rtx_CONST_VECTOR (V4SFmode, v)));
+            }
+
+          else
+            emit_insn (gen_vsx_splat_v4sf (tmp,
+                                           force_reg (SFmode, op)));
        }
 
       else
        gcc_unreachable ();
    }
 
-  /* Do 2 sets of V4SFmode operations.  */
+  /* Do the operation in V4SFmode.  */
   switch (subtype)
    {
    case FP16_BINARY:
-      emit_insn (gen_rtx_SET (result_hi,
-                              gen_rtx_fmt_ee (icode, V4SFmode,
-                                              op_hi[0],
-                                              op_hi[1])));
-
-      emit_insn (gen_rtx_SET (result_lo,
+      emit_insn (gen_rtx_SET (result_v4sf,
                              gen_rtx_fmt_ee (icode, V4SFmode,
-                                             op_lo[0],
-                                             op_lo[1])));
+                                             ops_v4sf[0],
+                                             ops_v4sf[1])));
       break;
 
     case FP16_FMA:
@@ -290,31 +300,19 @@ fp16_vectorization (enum rtx_code icode,
     case FP16_NFMA:
     case FP16_NFMS:
       {
-        rtx op1_hi = op_hi[0];
-        rtx op2_hi = op_hi[1];
-        rtx op3_hi = op_hi[2];
-
-        rtx op1_lo = op_lo[0];
-        rtx op2_lo = op_lo[1];
-        rtx op3_lo = op_lo[2];
+        rtx op1 = ops_v4sf[0];
+        rtx op2 = ops_v4sf[1];
+        rtx op3 = ops_v4sf[2];
 
         if (subtype == FP16_FMS || subtype == FP16_NFMS)
-          {
-            op3_hi = gen_rtx_NEG (V4SFmode, op3_hi);
-            op3_lo = gen_rtx_NEG (V4SFmode, op3_lo);
-          }
+          op3 = gen_rtx_NEG (V4SFmode, op3);
 
-        rtx op_fma_hi = gen_rtx_FMA (V4SFmode, op1_hi, op2_hi, op3_hi);
-        rtx op_fma_lo = gen_rtx_FMA (V4SFmode, op1_lo, op2_lo, op3_lo);
+        rtx op_fma = gen_rtx_FMA (V4SFmode, op1, op2, op3);
 
         if (subtype == FP16_NFMA || subtype == FP16_NFMS)
-          {
-            op_fma_hi = gen_rtx_NEG (V4SFmode, op_fma_hi);
-            op_fma_lo = gen_rtx_NEG (V4SFmode, op_fma_lo);
-          }
+          op_fma = gen_rtx_NEG (V4SFmode, op_fma);
 
-        emit_insn (gen_rtx_SET (result_hi, op_fma_hi));
-        emit_insn (gen_rtx_SET (result_lo, op_fma_lo));
+        emit_insn (gen_rtx_SET (result_v4sf, op_fma));
       }
       break;
 
@@ -322,15 +320,16 @@ fp16_vectorization (enum rtx_code icode,
       gcc_unreachable ();
     }
 
-  /* Combine the 2 V4SFmode operations into one V8HFmode/V8BFmode vector.  */
-  if (result_mode == V8HFmode)
-    emit_insn (gen_vec_pack_trunc_v4sf_v8hf (result, result_hi, result_lo));
+  /* Convert V4SF result back to scalar mode.  */
+  if (GET_MODE (result) == BFmode)
+    emit_insn (gen_xvcvspbf16_bf (result, result_v4sf));
 
-  else if (result_mode == V8BFmode)
-    emit_insn (gen_vec_pack_trunc_v4sf_v8bf (result, result_hi, result_lo));
+  else if (GET_MODE (result) == SFmode)
+    {
+      rtx element = GEN_INT (WORDS_BIG_ENDIAN ? 2 : 3);
+      emit_insn (gen_vsx_extract_v4sf (result, result_v4sf, element));
+    }
 
   else
     gcc_unreachable ();
-
-  return;
 }
diff --git a/gcc/config/rs6000/rs6000-protos.h b/gcc/config/rs6000/rs6000-protos.h
index 3c4d21299e1a..001dc1fc7f4b 100644
--- a/gcc/config/rs6000/rs6000-protos.h
+++ b/gcc/config/rs6000/rs6000-protos.h
@@ -269,10 +269,10 @@ enum fp16_operation {
   FP16_NFMS                     /* - ((a * b) - c).  */
 };
 
-extern void bfloat16_operation_as_v4sf (enum rtx_code, rtx, rtx, rtx, rtx,
-                                        enum fp16_operation);
 extern void fp16_vectorization (enum rtx_code, rtx, rtx, rtx, rtx,
                                 enum fp16_operation);
+extern void bfloat16_operation_as_v4sf (enum rtx_code, rtx, rtx, rtx, rtx,
+                                        enum fp16_operation);
 
 #endif /* RTX_CODE */
 
 #ifdef TREE_CODE
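For reference, a minimal C model (not part of the patch; helper names are illustrative) of the expansion strategy fp16_vectorization implements for a V8BFmode addition: split the eight 16-bit lanes into high and low halves, widen each half to four floats, do two V4SFmode operations, and pack both results back into one 16-bit vector.  The narrowing step here simply truncates, where the actual XVCVSPBF16 conversion rounds.

#include <stdint.h>
#include <string.h>

/* Illustrative bfloat16 -> float widening: a bfloat16 value is the high
   16 bits of the corresponding IEEE single-precision value.  */
static float
bf16_to_float (uint16_t h)
{
  uint32_t bits = (uint32_t) h << 16;
  float f;
  memcpy (&f, &bits, sizeof (f));
  return f;
}

/* Illustrative float -> bfloat16 narrowing by truncation (the hardware
   conversion rounds; truncation keeps the sketch short).  */
static uint16_t
float_to_bf16 (float f)
{
  uint32_t bits;
  memcpy (&bits, &f, sizeof (bits));
  return (uint16_t) (bits >> 16);
}

/* Model of a V8BFmode add: two groups of four single-precision adds
   (the two V4SFmode operations), then one pack-truncate step.  */
void
v8bf_add (uint16_t result[8], const uint16_t a[8], const uint16_t b[8])
{
  float hi[4], lo[4];

  for (int i = 0; i < 4; i++)
    {
      hi[i] = bf16_to_float (a[i]) + bf16_to_float (b[i]);
      lo[i] = bf16_to_float (a[i + 4]) + bf16_to_float (b[i + 4]);
    }

  for (int i = 0; i < 4; i++)
    {
      result[i] = float_to_bf16 (hi[i]);
      result[i + 4] = float_to_bf16 (lo[i]);
    }
}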

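The fp16_operation FMA subtypes shared by both expanders follow the usual fused multiply-add variants.  A scalar sketch of their semantics, using fmaf from <math.h> (the function names are illustrative; the expanders build the same shapes in V4SFmode RTL by wrapping the third operand and/or the FMA result in a NEG, as the diff above shows):

#include <math.h>

float fp16_fma_model  (float a, float b, float c) { return fmaf (a, b, c); }   /* FP16_FMA:  (a * b) + c.  */
float fp16_fms_model  (float a, float b, float c) { return fmaf (a, b, -c); }  /* FP16_FMS:  (a * b) - c.  */
float fp16_nfma_model (float a, float b, float c) { return -fmaf (a, b, c); }  /* FP16_NFMA: - ((a * b) + c).  */
float fp16_nfms_model (float a, float b, float c) { return -fmaf (a, b, -c); } /* FP16_NFMS: - ((a * b) - c).  */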