https://gcc.gnu.org/g:bdfca635571a640fca76c26ad55417e3169a82ce
commit bdfca635571a640fca76c26ad55417e3169a82ce Author: Michael Meissner <[email protected]> Date: Thu Nov 13 11:21:23 2025 -0500 Optimize __bfloat16 scalar code. Optimize __bfloat16 binary operations. Unlike _Float16 where we have instructions to convert between HFmode and SFmode as scalar values, with BFmode, we only have vector conversions. Thus to do: __bfloat16 a, b, c; a = b + c; the GCC compiler generates the following code: lxsihzx 0,4,2 // load __bfloat16 value b lxsihzx 12,5,2 // load __bfloat16 value c xxsldwi 0,0,0,1 // shift b into bits 16..31 xxsldwi 12,12,12,1 // shift c into bits 16..31 xvcvbf16spn 0,0 // vector convert b into V4SFmode xvcvbf16spn 12,12 // vector convert c into V4SFmode xscvspdpn 0,0 // convert b into SFmode scalar xscvspdpn 12,12 // convert c into SFmode scalar fadds 0,0,12 // add b+c xscvdpspn 0,0 // convert b+c into SFmode memory format xvcvspbf16 0,0 // convert b+c into BFmode memory format stxsihx 0,3,2 // store b+c Using the following combiner patterns that are defined in this patch, the code generated would be: lxsihzx 12,4,2 // load __bfloat16 value b lxsihzx 0,5,2 // load __bfloat16 value c xxspltw 12,12,1 // shift b into bits 16..31 xxspltw 0,0,1 // shift c into bits 16..31 xvcvbf16spn 12,12 // vector convert b into V4SFmode xvcvbf16spn 0,0 // vector convert c into V4SFmode xvaddsp 0,0,12 // vector b+c in V4SFmode xvcvspbf16 0,0 // convert b+c into BFmode memory format stxsihx 0,3,2 // store b+c We cannot just define insns like 'addbf3' to keep the operation as BFmode because GCC will not generate these patterns unless the user uses -Ofast. Without -Ofast, it will always convert BFmode into SFmode. 2025-11-13 Michael Meissner <[email protected]> gcc/ * config/rs6000/float16.cc (bfloat16_operation_as_v4sf): New function to optimize __bfloat16 scalar operations. * config/rs6000/float16.md (bfloat16_binary_op_internal1): New __bfloat16 scalar combiner insns. (bfloat16_binary_op_internal2): Likewise. 
(bfloat16_fma_internal1): Likewise. (bfloat16_fma_internal2): Likewise. (bfloat16_fms_internal1): Likewise. (bfloat16_fms_internal2): Likewise. (bfloat16_nfma_internal1): Likewise. (bfloat16_nfma_internal2): Likewise. (bfloat16_nfms_internal3): Likewise. * config/rs6000/predicates.md (fp16_reg_or_constant_operand): New predicate. (bfloat16_v4sf_operand): Likewise. (bfloat16_bf_operand): Likewise. * config/rs6000/rs6000-protos.h (bfloat16_operation_as_v4sf): New declaration. Diff: --- gcc/config/rs6000/float16.cc | 150 ++++++++++++++++++++ gcc/config/rs6000/float16.md | 282 ++++++++++++++++++++++++++++++++++++++ gcc/config/rs6000/predicates.md | 76 ++++++++++ gcc/config/rs6000/rs6000-protos.h | 2 + 4 files changed, 510 insertions(+) diff --git a/gcc/config/rs6000/float16.cc b/gcc/config/rs6000/float16.cc index 5274a0df962f..2c7b6278a16a 100644 --- a/gcc/config/rs6000/float16.cc +++ b/gcc/config/rs6000/float16.cc @@ -183,3 +183,153 @@ fp16_vectorization (enum rtx_code icode, return; } + +/* Expand a bfloat16 scalar floating point operation: + + ICODE: Operation to perform. + RESULT: Result of the operation. + OP1: Input operand1. + OP2: Input operand2. + OP3: Input operand3 or NULL_RTX. + SUBTYPE: Describe the operation. + + The operation is done as a V4SFmode vector operation. This is because + converting BFmode from a scalar BFmode to SFmode to do the operation and + back again takes quite a bit of time. GCC will only generate the native + operation if -Ofast is used. The float16.md code that calls this function + adds various combine operations to do the operation in V4SFmode instead of + SFmode. 
*/ + +void +bfloat16_operation_as_v4sf (enum rtx_code icode, + rtx result, + rtx op1, + rtx op2, + rtx op3, + enum fp16_operation subtype) +{ + gcc_assert (can_create_pseudo_p ()); + + rtx result_v4sf = gen_reg_rtx (V4SFmode); + rtx ops_orig[3] = { op1, op2, op3 }; + rtx ops_v4sf[3]; + size_t n_opts; + + switch (subtype) + { + case FP16_BINARY: + n_opts = 2; + gcc_assert (op3 == NULL_RTX); + break; + + case FP16_FMA: + case FP16_FMS: + case FP16_NFMA: + case FP16_NFMS: + gcc_assert (icode == FMA); + n_opts = 3; + break; + + default: + gcc_unreachable (); + } + + for (size_t i = 0; i < n_opts; i++) + { + rtx op = ops_orig[i]; + rtx tmp = ops_v4sf[i] = gen_reg_rtx (V4SFmode); + + gcc_assert (op != NULL_RTX); + + /* Remove truncation/extend added. */ + if (GET_CODE (op) == FLOAT_EXTEND || GET_CODE (op) == FLOAT_TRUNCATE) + op = XEXP (op, 0); + + /* Convert operands to V4SFmode format. We use SPLAT for registers to + get the value into the upper 32-bits. We can use XXSPLTW to splat + words instead of VSPLTIH since the XVCVBF16SPN instruction ignores the + odd half-words, and XXSPLTW can operate on all VSX registers instead + of just the Altivec registers. Using SPLAT instead of a shift also + insures that other bits are not a signalling NaN. If we are using + XXSPLTIW or XXSPLTIB to load the constant the other bits are + duplicated. 
*/ + + if (op == CONST0_RTX (SFmode) || op == CONST0_RTX (BFmode)) + emit_move_insn (tmp, CONST0_RTX (V4SFmode)); + + else if (GET_MODE (op) == BFmode) + { + emit_insn (gen_xxspltw_bf (tmp, force_reg (BFmode, op))); + emit_insn (gen_xvcvbf16spn_bf (tmp, tmp)); + } + + else if (GET_MODE (op) == SFmode) + { + if (GET_CODE (op) == CONST_DOUBLE) + { + rtvec v = rtvec_alloc (4); + + for (size_t i = 0; i < 4; i++) + RTVEC_ELT (v, i) = op; + + emit_insn (gen_rtx_SET (tmp, + gen_rtx_CONST_VECTOR (V4SFmode, v))); + } + + else + emit_insn (gen_vsx_splat_v4sf (tmp, + force_reg (SFmode, op))); + } + + else + gcc_unreachable (); + } + + /* Do the operation in V4SFmode. */ + switch (subtype) + { + case FP16_BINARY: + emit_insn (gen_rtx_SET (result_v4sf, + gen_rtx_fmt_ee (icode, V4SFmode, + ops_v4sf[0], + ops_v4sf[1]))); + break; + + case FP16_FMA: + case FP16_FMS: + case FP16_NFMA: + case FP16_NFMS: + { + rtx op1 = ops_v4sf[0]; + rtx op2 = ops_v4sf[1]; + rtx op3 = ops_v4sf[2]; + + if (subtype == FP16_FMS || subtype == FP16_NFMS) + op3 = gen_rtx_NEG (V4SFmode, op3); + + rtx op_fma = gen_rtx_FMA (V4SFmode, op1, op2, op3); + + if (subtype == FP16_NFMA || subtype == FP16_NFMS) + op_fma = gen_rtx_NEG (V4SFmode, op_fma); + + emit_insn (gen_rtx_SET (result_v4sf, op_fma)); + } + break; + + default: + gcc_unreachable (); + } + + /* Convert V4SF result back to scalar mode. */ + if (GET_MODE (result) == BFmode) + emit_insn (gen_xvcvspbf16_bf (result, result_v4sf)); + + else if (GET_MODE (result) == SFmode) + { + rtx element = GEN_INT (WORDS_BIG_ENDIAN ? 2 : 3); + emit_insn (gen_vsx_extract_v4sf (result, result_v4sf, element)); + } + + else + gcc_unreachable (); +} diff --git a/gcc/config/rs6000/float16.md b/gcc/config/rs6000/float16.md index 690b8c2d6610..fe5422dc2892 100644 --- a/gcc/config/rs6000/float16.md +++ b/gcc/config/rs6000/float16.md @@ -963,3 +963,285 @@ "TARGET_BFLOAT16_HW" "xvcvbf16spn %x0,%x1" [(set_attr "type" "vecperm")]) + +;; Optimize __bfloat16 binary operations. 
Unlike _Float16 where we +;; have instructions to convert between HFmode and SFmode as scalar +;; values, with BFmode, we only have vector conversions. Thus to do: +;; +;; __bfloat16 a, b, c; +;; a = b + c; +;; +;; the GCC compiler would normally generate: +;; +;; lxsihzx 0,4,2 // load __bfloat16 value b +;; lxsihzx 12,5,2 // load __bfloat16 value c +;; xxsldwi 0,0,0,1 // shift b into bits 16..31 +;; xxsldwi 12,12,12,1 // shift c into bits 16..31 +;; xvcvbf16spn 0,0 // vector convert b into V4SFmode +;; xvcvbf16spn 12,12 // vector convert c into V4SFmode +;; xscvspdpn 0,0 // convert b into SFmode scalar +;; xscvspdpn 12,12 // convert c into SFmode scalar +;; fadds 0,0,12 // add b+c +;; xscvdpspn 0,0 // convert b+c into SFmode memory format +;; xvcvspbf16 0,0 // convert b+c into BFmode memory format +;; stxsihx 0,3,2 // store b+c +;; +;; Using the following combiner patterns, the code generated would now +;; be: +;; +;; lxsihzx 12,4,2 // load __bfloat16 value b +;; lxsihzx 0,5,2 // load __bfloat16 value c +;; xxspltw 12,12,1 // shift b into bits 16..31 +;; xxspltw 0,0,1 // shift c into bits 16..31 +;; xvcvbf16spn 12,12 // vector convert b into V4SFmode +;; xvcvbf16spn 0,0 // vector convert c into V4SFmode +;; xvaddsp 0,0,12 // vector b+c in V4SFmode +;; xvcvspbf16 0,0 // convert b+c into BFmode memory format +;; stxsihx 0,3,2 // store b+c +;; +;; We cannot just define insns like 'addbf3' to keep the operation as +;; BFmode because GCC will not generate these patterns unless the user +;; uses -Ofast. Without -Ofast, it will always convert BFmode into +;; SFmode. 
+ +(define_insn_and_split "*bfloat16_binary_op_internal1" + [(set (match_operand:SF 0 "vsx_register_operand") + (match_operator:SF 1 "fp16_binary_operator" + [(match_operand:SF 2 "bfloat16_v4sf_operand") + (match_operand:SF 3 "bfloat16_v4sf_operand")]))] + "TARGET_BFLOAT16_HW && can_create_pseudo_p () + && (bfloat16_bf_operand (operands[2], SFmode) + || bfloat16_bf_operand (operands[3], SFmode))" + "#" + "&& 1" + [(pc)] +{ + bfloat16_operation_as_v4sf (GET_CODE (operands[1]), operands[0], operands[2], + operands[3], NULL_RTX, FP16_BINARY); + DONE; +}) + +(define_insn_and_split "*bfloat16_binary_op_internal2" + [(set (match_operand:BF 0 "vsx_register_operand") + (float_truncate:BF + (match_operator:SF 1 "fp16_binary_operator" + [(match_operand:SF 2 "bfloat16_v4sf_operand") + (match_operand:SF 3 "bfloat16_v4sf_operand")])))] + "TARGET_BFLOAT16_HW && can_create_pseudo_p () + && (bfloat16_bf_operand (operands[2], SFmode) + || bfloat16_bf_operand (operands[3], SFmode))" + "#" + "&& 1" + [(pc)] +{ + bfloat16_operation_as_v4sf (GET_CODE (operands[1]), operands[0], operands[2], + operands[3], NULL_RTX, FP16_BINARY); + DONE; +}) + +(define_insn_and_split "*bfloat16_fma_internal1" + [(set (match_operand:SF 0 "vsx_register_operand") + (fma:SF + (match_operand:SF 1 "bfloat16_v4sf_operand") + (match_operand:SF 2 "bfloat16_v4sf_operand") + (match_operand:SF 3 "bfloat16_v4sf_operand")))] + "TARGET_BFLOAT16_HW && can_create_pseudo_p () + && (bfloat16_bf_operand (operands[1], SFmode) + + bfloat16_bf_operand (operands[2], SFmode) + + bfloat16_bf_operand (operands[3], SFmode) >= 2)" + "#" + "&& 1" + [(pc)] +{ + bfloat16_operation_as_v4sf (FMA, operands[0], operands[1], operands[2], + operands[3], FP16_FMA); + DONE; +}) + +(define_insn_and_split "*bfloat16_fma_internal2" + [(set (match_operand:BF 0 "vsx_register_operand" "=wa") + (float_truncate:BF + (fma:SF + (match_operand:SF 1 "bfloat16_v4sf_operand") + (match_operand:SF 2 "bfloat16_v4sf_operand") + (match_operand:SF 3 
"bfloat16_v4sf_operand"))))] + "TARGET_BFLOAT16_HW && can_create_pseudo_p () + && (bfloat16_bf_operand (operands[1], SFmode) + + bfloat16_bf_operand (operands[2], SFmode) + + bfloat16_bf_operand (operands[3], SFmode) >= 2)" + "#" + "&& 1" + [(pc)] +{ + bfloat16_operation_as_v4sf (FMA, operands[0], operands[1], operands[2], + operands[3], FP16_FMA); + DONE; +}) + +(define_insn_and_split "*bfloat16_fms_internal1" + [(set (match_operand:SF 0 "vsx_register_operand") + (fma:SF + (match_operand:SF 1 "bfloat16_v4sf_operand") + (match_operand:SF 2 "bfloat16_v4sf_operand") + (neg:SF + (match_operand:SF 3 "bfloat16_v4sf_operand"))))] + "TARGET_BFLOAT16_HW && can_create_pseudo_p () + && (bfloat16_bf_operand (operands[1], SFmode) + + bfloat16_bf_operand (operands[2], SFmode) + + bfloat16_bf_operand (operands[3], SFmode) >= 2)" + "#" + "&& 1" + [(pc)] +{ + bfloat16_operation_as_v4sf (FMA, operands[0], operands[1], operands[2], + operands[3], FP16_FMS); + DONE; +}) + +(define_insn_and_split "*bfloat16_fms_internal2" + [(set (match_operand:BF 0 "vsx_register_operand") + (float_truncate:BF + (fma:SF + (match_operand:SF 1 "bfloat16_v4sf_operand") + (match_operand:SF 2 "bfloat16_v4sf_operand") + (neg:SF + (match_operand:SF 3 "bfloat16_v4sf_operand")))))] + "TARGET_BFLOAT16_HW && can_create_pseudo_p () + && (bfloat16_bf_operand (operands[1], SFmode) + + bfloat16_bf_operand (operands[2], SFmode) + + bfloat16_bf_operand (operands[3], SFmode) >= 2)" + "#" + "&& 1" + [(pc)] +{ + bfloat16_operation_as_v4sf (FMA, operands[0], operands[1], operands[2], + operands[3], FP16_FMS); + DONE; +}) + +(define_insn_and_split "*bfloat16_nfma_internal1" + [(set (match_operand:SF 0 "vsx_register_operand") + (neg:SF + (fma:SF + (match_operand:SF 1 "bfloat16_v4sf_operand") + (match_operand:SF 2 "bfloat16_v4sf_operand") + (match_operand:SF 3 "bfloat16_v4sf_operand"))))] + "TARGET_BFLOAT16_HW && can_create_pseudo_p () + && (bfloat16_bf_operand (operands[1], SFmode) + + bfloat16_bf_operand (operands[2], 
SFmode) + + bfloat16_bf_operand (operands[3], SFmode) >= 2)" + "#" + "&& 1" + [(pc)] +{ + bfloat16_operation_as_v4sf (FMA, operands[0], operands[1], operands[2], + operands[3], FP16_NFMA); + DONE; +}) + +(define_insn_and_split "*bfloat16_nfma_internal2" + [(set (match_operand:BF 0 "vsx_register_operand" "=wa") + (float_truncate:BF + (neg:SF + (fma:SF + (match_operand:SF 1 "bfloat16_v4sf_operand") + (match_operand:SF 2 "bfloat16_v4sf_operand") + (match_operand:SF 3 "bfloat16_v4sf_operand")))))] + "TARGET_BFLOAT16_HW && can_create_pseudo_p () + && (bfloat16_bf_operand (operands[1], SFmode) + + bfloat16_bf_operand (operands[2], SFmode) + + bfloat16_bf_operand (operands[3], SFmode) >= 2)" + "#" + "&& 1" + [(pc)] +{ + bfloat16_operation_as_v4sf (FMA, operands[0], operands[1], operands[2], + operands[3], FP16_NFMA); + DONE; +}) + +(define_insn_and_split "*bfloat16_nfma_internal3" + [(set (match_operand:BF 0 "vsx_register_operand" "=wa") + (neg:BF + (float_truncate:BF + (fma:SF + (match_operand:SF 1 "bfloat16_v4sf_operand") + (match_operand:SF 2 "bfloat16_v4sf_operand") + (match_operand:SF 3 "bfloat16_v4sf_operand")))))] + "TARGET_BFLOAT16_HW && can_create_pseudo_p () + && (bfloat16_bf_operand (operands[1], SFmode) + + bfloat16_bf_operand (operands[2], SFmode) + + bfloat16_bf_operand (operands[3], SFmode) >= 2)" + "#" + "&& 1" + [(pc)] +{ + bfloat16_operation_as_v4sf (FMA, operands[0], operands[1], operands[2], + operands[3], FP16_NFMA); + DONE; +}) + +(define_insn_and_split "*bfloat16_nfms_internal1" + [(set (match_operand:SF 0 "vsx_register_operand") + (neg:SF + (fma:SF + (match_operand:SF 1 "bfloat16_v4sf_operand") + (match_operand:SF 2 "bfloat16_v4sf_operand") + (neg:SF + (match_operand:SF 3 "bfloat16_v4sf_operand")))))] + "TARGET_BFLOAT16_HW && can_create_pseudo_p () + && (bfloat16_bf_operand (operands[1], SFmode) + + bfloat16_bf_operand (operands[2], SFmode) + + bfloat16_bf_operand (operands[3], SFmode) >= 2)" + "#" + "&& 1" + [(pc)] +{ + bfloat16_operation_as_v4sf 
(FMA, operands[0], operands[1], operands[2], + operands[3], FP16_NFMS); + DONE; +}) + +(define_insn_and_split "*bfloat16_nfms_internal2" + [(set (match_operand:BF 0 "vsx_register_operand") + (float_truncate:BF + (neg:SF + (fma:SF + (match_operand:SF 1 "bfloat16_v4sf_operand") + (match_operand:SF 2 "bfloat16_v4sf_operand") + (neg:SF + (match_operand:SF 3 "bfloat16_v4sf_operand"))))))] + "TARGET_BFLOAT16_HW && can_create_pseudo_p () + && (bfloat16_bf_operand (operands[1], SFmode) + + bfloat16_bf_operand (operands[2], SFmode) + + bfloat16_bf_operand (operands[3], SFmode) >= 2)" + "#" + "&& 1" + [(pc)] +{ + bfloat16_operation_as_v4sf (FMA, operands[0], operands[1], operands[2], + operands[3], FP16_NFMS); + DONE; +}) + +(define_insn_and_split "*bfloat16_nfms_internal3" + [(set (match_operand:BF 0 "vsx_register_operand") + (neg:BF + (float_truncate:BF + (fma:SF + (match_operand:SF 1 "bfloat16_v4sf_operand") + (match_operand:SF 2 "bfloat16_v4sf_operand") + (neg:SF + (match_operand:SF 3 "bfloat16_v4sf_operand"))))))] + "TARGET_BFLOAT16_HW && can_create_pseudo_p () + && (bfloat16_bf_operand (operands[1], SFmode) + + bfloat16_bf_operand (operands[2], SFmode) + + bfloat16_bf_operand (operands[3], SFmode) >= 2)" + "#" + "&& 1" + [(pc)] +{ + bfloat16_operation_as_v4sf (FMA, operands[0], operands[1], operands[2], + operands[3], FP16_NFMS); + DONE; +}) diff --git a/gcc/config/rs6000/predicates.md b/gcc/config/rs6000/predicates.md index 785d09b94234..172991de3662 100644 --- a/gcc/config/rs6000/predicates.md +++ b/gcc/config/rs6000/predicates.md @@ -2276,3 +2276,79 @@ return constant_generates_xxspltiw (&vsx_const); }) + +;; Return 1 if this is a 16-bit floating point operand that can be used +;; in an add, subtract, or multiply operation that uses the vector +;; conversion function. 
+(define_predicate "fp16_reg_or_constant_operand" + (match_code "reg,subreg,const_double") +{ + if (REG_P (op) || SUBREG_P (op)) + return vsx_register_operand (op, mode); + + if (CONST_DOUBLE_P (op)) + return fp16_xxspltiw_constant (op, mode); + + return false; +}) + +;; Match binary operators where we convert a BFmode operand into a +;; SFmode operand so that we can optimize the BFmode operation to do +;; the operation in vector mode rather than converting the BFmode to a +;; V8BFmode vector, converting that V8BFmode vector to V4SFmode, and +;; then converting the V4SFmode element to SFmode scalar. +(define_predicate "fp16_binary_operator" + (match_code "plus,minus,mult,smax,smin")) + +;; Match bfloat16/float operands that can be optimized to do the +;; operation in V4SFmode. +(define_predicate "bfloat16_v4sf_operand" + (match_code "reg,subreg,const_double,float_extend,float_truncate") +{ + if (mode != BFmode && mode != SFmode) + return false; + + if (REG_P (op) || SUBREG_P (op)) + return register_operand (op, mode); + + if (CONST_DOUBLE_P (op)) + return true; + + if (GET_CODE (op) == FLOAT_EXTEND) + { + rtx op_arg = XEXP (op, 0); + return (mode == SFmode + && GET_MODE (op_arg) == BFmode + && (REG_P (op_arg) || SUBREG_P (op_arg))); + } + + if (GET_CODE (op) == FLOAT_TRUNCATE) + { + rtx op_arg = XEXP (op, 0); + return (mode == BFmode + && GET_MODE (op_arg) == SFmode + && (REG_P (op_arg) || SUBREG_P (op_arg))); + } + + return false; +}) + +;; Match an operand that originally was a BFmode value to prevent +;; operations involving only SFmode values from being converted to +;; BFmode. 
+(define_predicate "bfloat16_bf_operand" + (match_code "reg,subreg,const_double,float_extend") +{ + if (mode == BFmode || GET_MODE (op) == BFmode) + return true; + + if (mode != SFmode) + return false; + + if (GET_MODE (op) == SFmode + && GET_CODE (op) == FLOAT_EXTEND + && GET_MODE (XEXP (op, 0)) == BFmode) + return true; + + return false; +}) diff --git a/gcc/config/rs6000/rs6000-protos.h b/gcc/config/rs6000/rs6000-protos.h index dd5fcd69e836..3665a405cfd2 100644 --- a/gcc/config/rs6000/rs6000-protos.h +++ b/gcc/config/rs6000/rs6000-protos.h @@ -271,6 +271,8 @@ enum fp16_operation { extern void fp16_vectorization (enum rtx_code, rtx, rtx, rtx, rtx, enum fp16_operation); +extern void bfloat16_operation_as_v4sf (enum rtx_code, rtx, rtx, rtx, rtx, + enum fp16_operation); #endif /* RTX_CODE */ #ifdef TREE_CODE
