https://gcc.gnu.org/g:418c0d25b91732dd1452841deda8f7b9d08ef973
commit 418c0d25b91732dd1452841deda8f7b9d08ef973 Author: Michael Meissner <[email protected]> Date: Mon Nov 10 18:20:39 2025 -0500 Add conversions between __bfloat16 and float/double. This patch provides conversions between __bfloat16 and float/double scalars on power10 and power11 systems. Unlike the support for _Float16, there is not a single instruction to convert between a __bfloat16 and float/double scalar value on the power10. Instead we have to use the vector conversion instructions. To convert a __bfloat16 scalar to a float/double scalar, GCC will generate: lxsihzx 0,0,4 Load value into vector register xxsldwi 0,0,0,1 Get the value into the upper 32 bits xvcvbf16spn 0,0 Convert vector __bfloat16 to vector float xscvspdpn 0,0 Convert memory float format to scalar To convert a scalar float/double to __bfloat16, GCC will generate: xscvdpspn 0,0 Convert float scalar to float memory format xvcvspbf16 0,0 Convert vector float to vector __bfloat16 2025-11-10 Michael Meissner <[email protected]> gcc/ * config/rs6000/float16.md (FP16_HW): Add BFmode. (VFP16_HW): New mode iterator. (cvt_fp16_to_v4sf_insn): New mode attribute. (FP16_VECTOR4): Likewise. (UNSPEC_FP16_SHIFT_LEFT_32BIT): New unspec constant. (UNSPEC_CVT_FP16_TO_V4SF): Likewise. (UNSPEC_XXSPLTW_FP16): Likewise. (UNSPEC_XVCVSPBF16_BF): Likewise. (extendbf<mode>2): New insns to convert between BFmode and SFmode/DFmode. (xscvdpspn_sf): Likewise. (xscvspdpn_sf): Likewise. (<fp16_vector8>_shift_left_32bit): Likewise. (trunc<mode>bf2): Likewise. (vsx_xscvdpspn_sf): Likewise. (cvt_fp16_to_v4sf_<mode>): Likewise. (cvt_fp16_to_v4sf_<mode>_le): Likewise. (cvt_fp16_to_v4sf_<mode>_be): Likewise. (dup_<mode>_to_v4sf): Likewise. (xxspltw_<mode>): Likewise. (xvcvbf16spn_bf): Likewise. (xvcvspbf16_bf): Likewise. * config/rs6000/rs6000-c.cc (rs6000_target_modify_macros): Define __BFLOAT16_HW__ if we have hardware support for __bfloat16. 
Diff: --- gcc/config/rs6000/float16.md | 246 +++++++++++++++++++++++++++++++++++++++++- gcc/config/rs6000/rs6000-c.cc | 3 + 2 files changed, 248 insertions(+), 1 deletion(-) diff --git a/gcc/config/rs6000/float16.md b/gcc/config/rs6000/float16.md index 13f3b32e86c1..fd310a8e63a6 100644 --- a/gcc/config/rs6000/float16.md +++ b/gcc/config/rs6000/float16.md @@ -27,7 +27,18 @@ ;; Mode iterator for 16-bit floating point modes on machines with ;; hardware support both as a scalar and as a vector. -(define_mode_iterator FP16_HW [(HF "TARGET_FLOAT16_HW")]) +(define_mode_iterator FP16_HW [(BF "TARGET_BFLOAT16_HW") + (HF "TARGET_FLOAT16_HW")]) + +(define_mode_iterator VFP16_HW [(V8BF "TARGET_BFLOAT16_HW") + (V8HF "TARGET_FLOAT16_HW")]) + +;; Mode attribute giving the instruction to convert the even +;; V8HFmode or V8BFmode elements to V4SFmode +(define_mode_attr cvt_fp16_to_v4sf_insn [(BF "xvcvbf16spn") + (HF "xvcvhpsp") + (V8BF "xvcvbf16spn") + (V8HF "xvcvhpsp")]) ;; Mode attribute giving the vector mode for a 16-bit floating point ;; scalar in both upper and lower case. @@ -36,6 +47,20 @@ (define_mode_attr fp16_vector8 [(BF "v8bf") (HF "v8hf")]) + +;; Mode attribute giving the vector mode with 4 16-bit floating point +;; elements given a scalar or 8 element vector. +(define_mode_attr FP16_VECTOR4 [(BF "V4BF") + (HF "V4HF") + (V8BF "V4BF") + (V8HF "V4HF")]) + +;; UNSPEC constants +(define_c_enum "unspec" + [UNSPEC_FP16_SHIFT_LEFT_32BIT + UNSPEC_CVT_FP16_TO_V4SF + UNSPEC_XXSPLTW_FP16 + UNSPEC_XVCVSPBF16_BF]) ;; _Float16 and __bfloat16 moves (define_expand "mov<mode>" @@ -144,3 +169,222 @@ "TARGET_FLOAT16_HW" "xscvdphp %x0,%x1" [(set_attr "type" "fpsimple")]) + +;; Convert BFmode to SFmode/DFmode. +;; 3 instructions are generated: +;; XXSLDWI -- shift the BFmode element into the upper 32 bits +;; XVCVBF16SPN -- convert even BFmode elements to SFmode +;; XSCVSPDPN -- convert memory format of SFmode to DFmode. 
+(define_insn_and_split "extendbf<mode>2" + [(set (match_operand:SFDF 0 "vsx_register_operand" "=wa") + (float_extend:SFDF + (match_operand:BF 1 "vsx_register_operand" "v"))) + (clobber (match_scratch:V8BF 2 "=v"))] + "TARGET_BFLOAT16_HW" + "#" + "&& 1" + [(pc)] +{ + rtx op0 = operands[0]; + rtx op1 = operands[1]; + rtx op2_v8bf = operands[2]; + + if (GET_CODE (op2_v8bf) == SCRATCH) + op2_v8bf = gen_reg_rtx (V8BFmode); + + rtx op2_v4sf = gen_lowpart (V4SFmode, op2_v8bf); + + /* XXSLDWI -- shift BFmode element into the upper 32 bits. */ + emit_insn (gen_v8bf_shift_left_32bit (op2_v8bf, op1)); + + /* XVCVBF16SPN -- convert even V8BFmode elements to V4SFmode. */ + emit_insn (gen_cvt_fp16_to_v4sf_v8bf (op2_v4sf, op2_v8bf)); + + /* XSCVSPDPN -- convert single V4SFmode element to DFmode. */ + emit_insn (GET_MODE (op0) == SFmode + ? gen_xscvspdpn_sf (op0, op2_v4sf) + : gen_vsx_xscvspdpn (op0, op2_v4sf)); + + DONE; +} + [(set_attr "type" "fpsimple") + (set_attr "length" "12")]) + +;; Convert a SFmode scalar represented as DFmode to elements 0 and 1 of +;; V4SFmode. +(define_insn "xscvdpspn_sf" + [(set (match_operand:V4SF 0 "vsx_register_operand" "=wa") + (unspec:V4SF [(match_operand:SF 1 "vsx_register_operand" "wa")] + UNSPEC_VSX_CVSPDP))] + "VECTOR_UNIT_VSX_P (SFmode)" + "xscvdpspn %x0,%x1" + [(set_attr "type" "fp")]) + +;; Convert element 0 of a V4SFmode to scalar SFmode (which on the +;; PowerPC uses the DFmode encoding). +(define_insn "xscvspdpn_sf" + [(set (match_operand:SF 0 "vsx_register_operand" "=wa") + (unspec:SF [(match_operand:V4SF 1 "vsx_register_operand" "wa")] + UNSPEC_VSX_CVSPDPN))] + "TARGET_XSCVSPDPN" + "xscvspdpn %x0,%x1" + [(set_attr "type" "fp")]) + +;; Vector shift left by 32 bits to get the 16-bit floating point value +;; into the upper 32 bits for the conversion. 
+(define_insn "<fp16_vector8>_shift_left_32bit" + [(set (match_operand:<FP16_VECTOR8> 0 "vsx_register_operand" "=wa") + (unspec:<FP16_VECTOR8> + [(match_operand:FP16_HW 1 "vsx_register_operand" "wa")] + UNSPEC_FP16_SHIFT_LEFT_32BIT))] + "" + "xxsldwi %x0,%x1,%x1,1" + [(set_attr "type" "vecperm")]) + +;; Convert SFmode/DFmode to BFmode. +;; 2 instructions are generated: +;; XSCVDPSPN -- convert SFmode/DFmode scalar to V4SFmode +;; XVCVSPBF16 -- convert V4SFmode to even V8BFmode + +(define_insn_and_split "trunc<mode>bf2" + [(set (match_operand:BF 0 "vsx_register_operand" "=wa") + (float_truncate:BF + (match_operand:SFDF 1 "vsx_register_operand" "wa"))) + (clobber (match_scratch:V4SF 2 "=wa"))] + "TARGET_BFLOAT16_HW" + "#" + "&& 1" + [(pc)] +{ + rtx op0 = operands[0]; + rtx op1 = operands[1]; + rtx op2 = operands[2]; + + if (GET_CODE (op2) == SCRATCH) + op2 = gen_reg_rtx (V4SFmode); + + emit_insn (GET_MODE (op1) == SFmode + ? gen_xscvdpspn_sf (op2, op1) + : gen_vsx_xscvdpspn (op2, op1)); + + emit_insn (gen_xvcvspbf16_bf (op0, op2)); + DONE; +} + [(set_attr "type" "fpsimple")]) + +(define_insn "vsx_xscvdpspn_sf" + [(set (match_operand:V4SF 0 "vsx_register_operand" "=wa") + (unspec:V4SF [(match_operand:SF 1 "vsx_register_operand" "wa")] + UNSPEC_VSX_CVDPSPN))] + "TARGET_XSCVDPSPN" + "xscvdpspn %x0,%x1" + [(set_attr "type" "fp")]) + +;; Convert the even elements of a vector 16-bit floating point to +;; V4SFmode. Deal with little endian vs. big endian element ordering +;; in identifying which elements are converted. + +(define_expand "cvt_fp16_to_v4sf_<mode>" + [(set (match_operand:V4SF 0 "vsx_register_operand") + (float_extend:V4SF + (vec_select:<FP16_VECTOR4> + (match_operand:VFP16_HW 1 "vsx_register_operand") + (parallel [(match_dup 2) + (match_dup 3) + (match_dup 4) + (match_dup 5)]))))] + "" +{ + int endian_adjust = WORDS_BIG_ENDIAN ? 
0 : 1; + operands[2] = GEN_INT (0 + endian_adjust); + operands[3] = GEN_INT (2 + endian_adjust); + operands[4] = GEN_INT (4 + endian_adjust); + operands[5] = GEN_INT (6 + endian_adjust); +}) + +(define_insn "*cvt_fp16_to_v4sf_<mode>_le" + [(set (match_operand:V4SF 0 "vsx_register_operand" "=wa") + (float_extend:V4SF + (vec_select:<FP16_VECTOR4> + (match_operand:VFP16_HW 1 "vsx_register_operand" "wa") + (parallel [(const_int 1) + (const_int 3) + (const_int 5) + (const_int 7)]))))] + "!WORDS_BIG_ENDIAN" + "<cvt_fp16_to_v4sf_insn> %x0,%x1" + [(set_attr "type" "vecfloat")]) + +(define_insn "*cvt_fp16_to_v4sf_<mode>_be" + [(set (match_operand:V4SF 0 "vsx_register_operand" "=wa") + (float_extend:V4SF + (vec_select:<FP16_VECTOR4> + (match_operand:VFP16_HW 1 "vsx_register_operand" "wa") + (parallel [(const_int 0) + (const_int 2) + (const_int 4) + (const_int 6)]))))] + "WORDS_BIG_ENDIAN" + "<cvt_fp16_to_v4sf_insn> %x0,%x1" + [(set_attr "type" "vecfloat")]) + +;; Duplicate and convert a 16-bit floating point scalar to V4SFmode. + +(define_insn_and_split "*dup_<mode>_to_v4sf" + [(set (match_operand:V4SF 0 "vsx_register_operand" "=wa") + (vec_duplicate:V4SF + (float_extend:SF + (match_operand:FP16_HW 1 "vsx_register_operand" "wa"))))] + "" + "#" + "&& 1" + [(pc)] +{ + rtx op0 = operands[0]; + rtx op1 = operands[1]; + rtx op0_vfp16 = gen_lowpart (<FP16_VECTOR8>mode, op0); + + emit_insn (gen_xxspltw_<mode> (op0, op1)); + emit_insn (gen_cvt_fp16_to_v4sf_<fp16_vector8> (op0, op0_vfp16)); + DONE; +} + [(set_attr "length" "8") + (set_attr "type" "vecperm")]) + +;; Duplicate a HF/BF value so it can be used for xvcvhpsp/xvcvbf16spn. +;; Because xvcvhpsp/xvcvbf16spn only uses the even elements, we can +;; use xxspltw instead of vsplth. This has the advantage that the +;; register allocator can use any of the 64 VSX registers instead of +;; being limited to the 32 Altivec registers that VSPLTH would require. 
+ +(define_insn "xxspltw_<mode>" + [(set (match_operand:V4SF 0 "vsx_register_operand" "=wa") + (unspec:V4SF [(match_operand:FP16_HW 1 "vsx_register_operand" "wa")] + UNSPEC_XXSPLTW_FP16))] + "" + "xxspltw %x0,%x1,1" + [(set_attr "type" "vecperm")]) + +;; Convert a bfloat16 floating point scalar that has been splatted to +;; V4SFmode. + +(define_insn "xvcvbf16spn_bf" + [(set (match_operand:V4SF 0 "vsx_register_operand" "=wa") + (unspec:V4SF [(match_operand:V4SF 1 "vsx_register_operand" "wa")] + UNSPEC_CVT_FP16_TO_V4SF))] + "TARGET_BFLOAT16_HW" + "xvcvbf16spn %x0,%x1" + [(set_attr "type" "vecfloat")]) + +;; Convert a V4SFmode vector to a 16-bit floating point scalar. We +;; only care about the 2nd V4SFmode element, which is the element we +;; converted the 16-bit scalar (4th element) to V4SFmode to do the +;; operation, and converted it back. + +(define_insn "xvcvspbf16_bf" + [(set (match_operand:BF 0 "vsx_register_operand" "=wa") + (unspec:BF [(match_operand:V4SF 1 "vsx_register_operand" "wa")] + UNSPEC_XVCVSPBF16_BF))] + "TARGET_BFLOAT16_HW" + "xvcvspbf16 %x0,%x1" + [(set_attr "type" "vecfloat")]) diff --git a/gcc/config/rs6000/rs6000-c.cc b/gcc/config/rs6000/rs6000-c.cc index e7091dd434b5..598d7b211e33 100644 --- a/gcc/config/rs6000/rs6000-c.cc +++ b/gcc/config/rs6000/rs6000-c.cc @@ -597,6 +597,9 @@ rs6000_target_modify_macros (bool define_p, if ((flags & OPTION_MASK_P9_VECTOR) != 0) rs6000_define_or_undefine_macro (define_p, "__FLOAT16_HW__"); + + if ((flags & OPTION_MASK_POWER10) != 0) + rs6000_define_or_undefine_macro (define_p, "__BFLOAT16_HW__"); } /* Tell the user if we are targeting CELL. */ if (rs6000_cpu == PROCESSOR_CELL)
