This patch provides conversions between __bfloat16 and float/double scalars on
power10 and power11 systems.
Unlike the support for _Float16, there is not a single instruction to convert
between a __bfloat16 and float/double scalar value on the power10.
Instead we have to use the vector conversion instructions.
To convert a __bfloat16 scalar to a float/double scalar, GCC will generate:
lxsihzx 0,0,4 Load value into vector register
xxsldwi 0,0,0,1 Get the value into the upper 32-bits
xvcvbf16spn 0,0 Convert vector __bfloat16 to vector float
xscvspdpn 0,0 Convert memory float format to scalar
To convert a scalar float/double to __bfloat16, GCC will generate:
xscvdpspn 0,0 Convert float scalar to float memory format
xvcvspbf16 0,0 Convert vector float to vector __bfloat16
All 11 patches have been tested on little endian and big endian PowerPC
servers with no regressions. Can I check in these patches?
2025-11-14 Michael Meissner <[email protected]>
gcc/
* config/rs6000/float16.md (FP16_HW): Add BFmode.
(VFP16_HW): New mode iterator.
(cvt_fp16_to_v4sf_insn): New mode attribute.
(FP16_VECTOR4): Likewise.
(UNSPEC_FP16_SHIFT_LEFT_32BIT): New unspec constant.
(UNSPEC_CVT_FP16_TO_V4SF): Likewise.
(UNSPEC_XXSPLTW_FP16): Likewise.
(UNSPEC_XVCVSPBF16_BF): Likewise.
(extendbf<mode>2): New insns to convert between BFmode and
SFmode/DFmode.
(xscvdpspn_sf): Likewise.
(xscvspdpn_sf): Likewise.
(<fp16_vector8>_shift_left_32bit): Likewise.
(trunc<mode>bf2): Likewise.
(vsx_xscvdpspn_sf): Likewise.
(cvt_fp16_to_v4sf_<mode>): Likewise.
(cvt_fp16_to_v4sf_<mode>_le): Likewise.
(cvt_fp16_to_v4sf_<mode>_be): Likewise.
(dup_<mode>_to_v4sf): Likewise.
(xxspltw_<mode>): Likewise.
(xvcvbf16spn_bf): Likewise.
(xvcvspbf16_bf): Likewise.
* config/rs6000/rs6000-c.cc (rs6000_target_modify_macros): Define
__BFLOAT16_HW__ if we have hardware support for __bfloat16.
* config/rs6000/rs6000.cc (rs6000_init_hard_regno_mode_ok): Mark that we
use VSX arithmetic support for V8BFmode if we are a power10 or later.
---
gcc/config/rs6000/float16.md | 246 +++++++++++++++++++++++++++++++++-
gcc/config/rs6000/rs6000-c.cc | 3 +
gcc/config/rs6000/rs6000.cc | 3 +
3 files changed, 251 insertions(+), 1 deletion(-)
diff --git a/gcc/config/rs6000/float16.md b/gcc/config/rs6000/float16.md
index 579703517d5..e27428d9486 100644
--- a/gcc/config/rs6000/float16.md
+++ b/gcc/config/rs6000/float16.md
@@ -28,7 +28,18 @@ (define_mode_iterator VFP16 [V8BF V8HF])
;; Mode iterator for 16-bit floating point modes on machines with
;; hardware support both as a scalar and as a vector.
-(define_mode_iterator FP16_HW [(HF "TARGET_FLOAT16_HW")])
+(define_mode_iterator FP16_HW [(BF "TARGET_BFLOAT16_HW")
+ (HF "TARGET_FLOAT16_HW")])
+
+(define_mode_iterator VFP16_HW [(V8BF "TARGET_BFLOAT16_HW")
+ (V8HF "TARGET_FLOAT16_HW")])
+
+;; Mode attribute giving the instruction to convert the even
+;; V8HFmode or V8BFmode elements to V4SFmode
+(define_mode_attr cvt_fp16_to_v4sf_insn [(BF "xvcvbf16spn")
+ (HF "xvcvhpsp")
+ (V8BF "xvcvbf16spn")
+ (V8HF "xvcvhpsp")])
;; Mode attribute giving the vector mode for a 16-bit floating point
;; scalar in both upper and lower case.
@@ -37,6 +48,20 @@ (define_mode_attr FP16_VECTOR8 [(BF "V8BF")
(define_mode_attr fp16_vector8 [(BF "v8bf")
(HF "v8hf")])
+
+;; Mode attribute giving the vector mode with 4 16-bit floating point
+;; elements given a scalar or 8 element vector.
+(define_mode_attr FP16_VECTOR4 [(BF "V4BF")
+ (HF "V4HF")
+ (V8BF "V4BF")
+ (V8HF "V4HF")])
+
+;; UNSPEC constants
+(define_c_enum "unspec"
+ [UNSPEC_FP16_SHIFT_LEFT_32BIT
+ UNSPEC_CVT_FP16_TO_V4SF
+ UNSPEC_XXSPLTW_FP16
+ UNSPEC_XVCVSPBF16_BF])
;; _Float16 and __bfloat16 moves
(define_expand "mov<mode>"
@@ -179,3 +204,222 @@ (define_insn "trunc<mode>hf2"
"TARGET_FLOAT16_HW"
"xscvdphp %x0,%x1"
[(set_attr "type" "fpsimple")])
+
+;; Convert BFmode to SFmode/DFmode.
+;; 3 instructions are generated:
+;; XXSLDWI -- shift the BFmode element into the upper 32 bits
+;; XVCVBF16SPN -- convert even BFmode elements to SFmode
+;; XSCVSPDPN -- convert memory format of SFmode to DFmode.
+(define_insn_and_split "extendbf<mode>2"
+ [(set (match_operand:SFDF 0 "vsx_register_operand" "=wa")
+ (float_extend:SFDF
+ (match_operand:BF 1 "vsx_register_operand" "v")))
+ (clobber (match_scratch:V8BF 2 "=v"))]
+ "TARGET_BFLOAT16_HW"
+ "#"
+ "&& 1"
+ [(pc)]
+{
+ rtx op0 = operands[0];
+ rtx op1 = operands[1];
+ rtx op2_v8bf = operands[2];
+
+ if (GET_CODE (op2_v8bf) == SCRATCH)
+ op2_v8bf = gen_reg_rtx (V8BFmode);
+
+ rtx op2_v4sf = gen_lowpart (V4SFmode, op2_v8bf);
+
+ /* XXSLDWI -- shift BFmode element into the upper 32 bits. */
+ emit_insn (gen_v8bf_shift_left_32bit (op2_v8bf, op1));
+
+ /* XVCVBF16SPN -- convert even V8BFmode elements to V4SFmode. */
+ emit_insn (gen_cvt_fp16_to_v4sf_v8bf (op2_v4sf, op2_v8bf));
+
+ /* XSCVSPDPN -- convert single V4SFmode element to scalar SFmode/DFmode. */
+ emit_insn (GET_MODE (op0) == SFmode
+ ? gen_xscvspdpn_sf (op0, op2_v4sf)
+ : gen_vsx_xscvspdpn (op0, op2_v4sf));
+
+ DONE;
+}
+ [(set_attr "type" "fpsimple")
+ (set_attr "length" "12")])
+
+;; Convert a SFmode scalar represented as DFmode to elements 0 and 1 of
+;; V4SFmode.
+(define_insn "xscvdpspn_sf"
+ [(set (match_operand:V4SF 0 "vsx_register_operand" "=wa")
+ (unspec:V4SF [(match_operand:SF 1 "vsx_register_operand" "wa")]
+ UNSPEC_VSX_CVSPDP))]
+ "VECTOR_UNIT_VSX_P (SFmode)"
+ "xscvdpspn %x0,%x1"
+ [(set_attr "type" "fp")])
+
+;; Convert element 0 of a V4SFmode to scalar SFmode (which on the
+;; PowerPC uses the DFmode encoding).
+(define_insn "xscvspdpn_sf"
+ [(set (match_operand:SF 0 "vsx_register_operand" "=wa")
+ (unspec:SF [(match_operand:V4SF 1 "vsx_register_operand" "wa")]
+ UNSPEC_VSX_CVSPDPN))]
+ "TARGET_XSCVSPDPN"
+ "xscvspdpn %x0,%x1"
+ [(set_attr "type" "fp")])
+
+;; Vector shift left by 32 bits to get the 16-bit floating point value
+;; into the upper 32 bits for the conversion.
+(define_insn "<fp16_vector8>_shift_left_32bit"
+ [(set (match_operand:<FP16_VECTOR8> 0 "vsx_register_operand" "=wa")
+ (unspec:<FP16_VECTOR8>
+ [(match_operand:FP16_HW 1 "vsx_register_operand" "wa")]
+ UNSPEC_FP16_SHIFT_LEFT_32BIT))]
+ ""
+ "xxsldwi %x0,%x1,%x1,1"
+ [(set_attr "type" "vecperm")])
+
+;; Convert SFmode/DFmode to BFmode.
+;; 2 instructions are generated:
+;; XSCVDPSPN -- convert SFmode/DFmode scalar to V4SFmode
+;; XVCVSPBF16 -- convert V4SFmode to even V8BFmode
+
+(define_insn_and_split "trunc<mode>bf2"
+ [(set (match_operand:BF 0 "vsx_register_operand" "=wa")
+ (float_truncate:BF
+ (match_operand:SFDF 1 "vsx_register_operand" "wa")))
+ (clobber (match_scratch:V4SF 2 "=wa"))]
+ "TARGET_BFLOAT16_HW"
+ "#"
+ "&& 1"
+ [(pc)]
+{
+ rtx op0 = operands[0];
+ rtx op1 = operands[1];
+ rtx op2 = operands[2];
+
+ if (GET_CODE (op2) == SCRATCH)
+ op2 = gen_reg_rtx (V4SFmode);
+
+ emit_insn (GET_MODE (op1) == SFmode
+ ? gen_xscvdpspn_sf (op2, op1)
+ : gen_vsx_xscvdpspn (op2, op1));
+
+ emit_insn (gen_xvcvspbf16_bf (op0, op2));
+ DONE;
+}
+ [(set_attr "type" "fpsimple")])
+
+(define_insn "vsx_xscvdpspn_sf"
+ [(set (match_operand:V4SF 0 "vsx_register_operand" "=wa")
+ (unspec:V4SF [(match_operand:SF 1 "vsx_register_operand" "wa")]
+ UNSPEC_VSX_CVDPSPN))]
+ "TARGET_XSCVDPSPN"
+ "xscvdpspn %x0,%x1"
+ [(set_attr "type" "fp")])
+
+;; Convert the even elements of a vector 16-bit floating point to
+;; V4SFmode. Deal with little endian vs. big endian element ordering
+;; in identifying which elements are converted.
+
+(define_expand "cvt_fp16_to_v4sf_<mode>"
+ [(set (match_operand:V4SF 0 "vsx_register_operand")
+ (float_extend:V4SF
+ (vec_select:<FP16_VECTOR4>
+ (match_operand:VFP16_HW 1 "vsx_register_operand")
+ (parallel [(match_dup 2)
+ (match_dup 3)
+ (match_dup 4)
+ (match_dup 5)]))))]
+ ""
+{
+ int endian_adjust = WORDS_BIG_ENDIAN ? 0 : 1;
+ operands[2] = GEN_INT (0 + endian_adjust);
+ operands[3] = GEN_INT (2 + endian_adjust);
+ operands[4] = GEN_INT (4 + endian_adjust);
+ operands[5] = GEN_INT (6 + endian_adjust);
+})
+
+(define_insn "*cvt_fp16_to_v4sf_<mode>_le"
+ [(set (match_operand:V4SF 0 "vsx_register_operand")
+ (float_extend:V4SF
+ (vec_select:<FP16_VECTOR4>
+ (match_operand:VFP16_HW 1 "vsx_register_operand")
+ (parallel [(const_int 1)
+ (const_int 3)
+ (const_int 5)
+ (const_int 7)]))))]
+ "!WORDS_BIG_ENDIAN"
+ "<cvt_fp16_to_v4sf_insn> %x0,%x1"
+ [(set_attr "type" "vecfloat")])
+
+(define_insn "*cvt_fp16_to_v4sf_<mode>_be"
+ [(set (match_operand:V4SF 0 "vsx_register_operand")
+ (float_extend:V4SF
+ (vec_select:<FP16_VECTOR4>
+ (match_operand:VFP16_HW 1 "vsx_register_operand")
+ (parallel [(const_int 0)
+ (const_int 2)
+ (const_int 4)
+ (const_int 6)]))))]
+ "WORDS_BIG_ENDIAN"
+ "<cvt_fp16_to_v4sf_insn> %x0,%x1"
+ [(set_attr "type" "vecfloat")])
+
+;; Duplicate and convert a 16-bit floating point scalar to V4SFmode.
+
+(define_insn_and_split "*dup_<mode>_to_v4sf"
+ [(set (match_operand:V4SF 0 "vsx_register_operand" "=wa")
+ (vec_duplicate:V4SF
+ (float_extend:SF
+ (match_operand:FP16_HW 1 "vsx_register_operand" "wa"))))]
+ ""
+ "#"
+ "&& 1"
+ [(pc)]
+{
+ rtx op0 = operands[0];
+ rtx op1 = operands[1];
+ rtx op0_vfp16 = gen_lowpart (<FP16_VECTOR8>mode, op0);
+
+ emit_insn (gen_xxspltw_<mode> (op0, op1));
+ emit_insn (gen_cvt_fp16_to_v4sf_<fp16_vector8> (op0, op0_vfp16));
+ DONE;
+}
+ [(set_attr "length" "8")
+ (set_attr "type" "vecperm")])
+
+;; Duplicate an HF/BF value so it can be used for xvcvhpsp/xvcvbf16spn.
+;; Because xvcvhpsp/xvcvbf16spn only use the even elements, we can
+;; use xxspltw instead of vsplth. This has the advantage that the
+;; register allocator can use any of the 64 VSX registers instead of
+;; being limited to the 32 Altivec registers that VSPLTH would require.
+
+(define_insn "xxspltw_<mode>"
+ [(set (match_operand:V4SF 0 "vsx_register_operand" "=wa")
+ (unspec:V4SF [(match_operand:FP16_HW 1 "vsx_register_operand" "wa")]
+ UNSPEC_XXSPLTW_FP16))]
+ ""
+ "xxspltw %x0,%x1,1"
+ [(set_attr "type" "vecperm")])
+
+;; Convert a bfloat16 floating point scalar that has been splatted to
+;; V4SFmode.
+
+(define_insn "xvcvbf16spn_bf"
+ [(set (match_operand:V4SF 0 "vsx_register_operand" "=wa")
+ (unspec:V4SF [(match_operand:V4SF 1 "vsx_register_operand" "wa")]
+ UNSPEC_CVT_FP16_TO_V4SF))]
+ "TARGET_BFLOAT16_HW"
+ "xvcvbf16spn %x0,%x1"
+ [(set_attr "type" "vecperm")])
+
+;; Convert a V4SFmode vector to a 16-bit floating point scalar. We
+;; only care about the 2nd V4SFmode element: that is the element the
+;; 16-bit scalar (4th 16-bit element) was converted to in order to do
+;; the operation, and it is the one converted back here.
+
+(define_insn "xvcvspbf16_bf"
+ [(set (match_operand:BF 0 "vsx_register_operand" "=wa")
+ (unspec:BF [(match_operand:V4SF 1 "vsx_register_operand" "wa")]
+ UNSPEC_XVCVSPBF16_BF))]
+ "TARGET_BFLOAT16_HW"
+ "xvcvspbf16 %x0,%x1"
+ [(set_attr "type" "vecfloat")])
diff --git a/gcc/config/rs6000/rs6000-c.cc b/gcc/config/rs6000/rs6000-c.cc
index 31a166a1c0f..7b63c0b64a9 100644
--- a/gcc/config/rs6000/rs6000-c.cc
+++ b/gcc/config/rs6000/rs6000-c.cc
@@ -591,6 +591,9 @@ rs6000_target_modify_macros (bool define_p, HOST_WIDE_INT
flags)
if ((flags & OPTION_MASK_P9_VECTOR) != 0)
rs6000_define_or_undefine_macro (define_p, "__FLOAT16_HW__");
+
+ if ((flags & OPTION_MASK_POWER10) != 0)
+ rs6000_define_or_undefine_macro (define_p, "__BFLOAT16_HW__");
}
/* Tell the user if we are targeting CELL. */
if (rs6000_cpu == PROCESSOR_CELL)
diff --git a/gcc/config/rs6000/rs6000.cc b/gcc/config/rs6000/rs6000.cc
index a913f91b6e5..74b84d1770d 100644
--- a/gcc/config/rs6000/rs6000.cc
+++ b/gcc/config/rs6000/rs6000.cc
@@ -3012,6 +3012,9 @@ rs6000_init_hard_regno_mode_ok (bool global_init_p)
if (TARGET_P9_VECTOR)
rs6000_vector_unit[V8HFmode] = VECTOR_VSX;
+
+ if (TARGET_POWER10)
+ rs6000_vector_unit[V8BFmode] = VECTOR_VSX;
}
/* DFmode, see if we want to use the VSX unit. Memory is handled
--
2.51.1
--
Michael Meissner, IBM
PO Box 98, Ayer, Massachusetts, USA, 01432
email: [email protected]