double.

Michael Meissner Fri, 14 Nov 2025 13:53:14 -0800

This patch provides conversions between __bfloat16 and float/double scalars on
power10 and power11 systems.


Unlike the support for _Float16, there is not a single instruction to convert
between a __bfloat16 and float/double scalar value on the power10.

Instead we have to use the vector conversion instructions.

To convert a __bfloat16 scalar to a float/double scalar, GCC will generate:

        lxsihzx     0,0,4       Load value into vector register
        xxsldwi     0,0,0,1     Get the value into the upper 32-bits
        xvcvbf16spn 0,0         Convert vector __bfloat16 to vector float
        xscvspdpn   0,0         Convert memory float format to scalar

To convert a scalar float/double to __bfloat16, GCC will generate:

        xscvdpspn  0,0          Convert float scalar to float memory format
        xvcvspbf16 0,0          Convert vector float to vector __bfloat16

All 11 patches have been tested on little endian and big endian PowerPC
servers with no regressions.  Can I check in these patches?

2025-11-14  Michael Meissner  <[email protected]>

gcc/

        * config/rs6000/float16.md (FP16_HW): Add BFmode.
        (VFP16_HW): New mode iterator.
        (cvt_fp16_to_v4sf_insn): New mode attribute.
        (FP16_VECTOR4): Likewise.
        (UNSPEC_FP16_SHIFT_LEFT_32BIT): New unspec constant.
        (UNSPEC_CVT_FP16_TO_V4SF): Likewise.
        (UNSPEC_XXSPLTW_FP16): Likewise.
        (UNSPEC_XVCVSPBF16_BF): Likewise.
        (extendbf<mode>2): New insns to convert between BFmode and
        SFmode/DFmode.
        (xscvdpspn_sf): Likewise.
        (xscvspdpn_sf): Likewise.
        (<fp16_vector8>_shift_left_32bit): Likewise.
        (trunc<mode>bf2): Likewise.
        (vsx_xscvdpspn_sf): Likewise.
        (cvt_fp16_to_v4sf_<mode>): Likewise.
        (cvt_fp16_to_v4sf_<mode>_le): Likewise.
        (cvt_fp16_to_v4sf_<mode>_be): Likewise.
        (dup_<mode>_to_v4sf): Likewise.
        (xxspltw_<mode>): Likewise.
        (xvcvbf16spn_bf): Likewise.
        (xvcvspbf16_bf): Likewise.
        * config/rs6000/rs6000-c.cc (rs6000_target_modify_macros): Define
        __BFLOAT16_HW__ if we have hardware support for __bfloat16.
        * config/rs6000/rs6000.cc (rs6000_init_hard_regno_mode_ok): Mark that we
        use VSX arithmetic support for V8BFmode if we are a power10 or later.
---
 gcc/config/rs6000/float16.md  | 246 +++++++++++++++++++++++++++++++++-
 gcc/config/rs6000/rs6000-c.cc |   3 +
 gcc/config/rs6000/rs6000.cc   |   3 +
 3 files changed, 251 insertions(+), 1 deletion(-)

diff --git a/gcc/config/rs6000/float16.md b/gcc/config/rs6000/float16.md
index 579703517d5..e27428d9486 100644
--- a/gcc/config/rs6000/float16.md
+++ b/gcc/config/rs6000/float16.md
@@ -28,7 +28,18 @@ (define_mode_iterator VFP16  [V8BF V8HF])
 
 ;; Mode iterator for 16-bit floating point modes on machines with
 ;; hardware support both as a scalar and as a vector.
-(define_mode_iterator FP16_HW [(HF "TARGET_FLOAT16_HW")])
+(define_mode_iterator FP16_HW [(BF "TARGET_BFLOAT16_HW")
+                              (HF "TARGET_FLOAT16_HW")])
+
+(define_mode_iterator VFP16_HW [(V8BF "TARGET_BFLOAT16_HW")
+                               (V8HF "TARGET_FLOAT16_HW")])
+
+;; Mode attribute giving the instruction to convert the even
+;; V8HFmode or V8BFmode elements to V4SFmode
+(define_mode_attr cvt_fp16_to_v4sf_insn [(BF   "xvcvbf16spn")
+                                        (HF   "xvcvhpsp")
+                                        (V8BF "xvcvbf16spn")
+                                        (V8HF "xvcvhpsp")])
 
 ;; Mode attribute giving the vector mode for a 16-bit floating point
 ;; scalar in both upper and lower case.
@@ -37,6 +48,20 @@ (define_mode_attr FP16_VECTOR8 [(BF "V8BF")
 
 (define_mode_attr fp16_vector8 [(BF "v8bf")
                                (HF "v8hf")])
+
+;; Mode attribute giving the vector mode with 4 16-bit floating point
+;; elements given a scalar or 8 element vector.
+(define_mode_attr FP16_VECTOR4 [(BF   "V4BF")
+                               (HF   "V4HF")
+                               (V8BF "V4BF")
+                               (V8HF "V4HF")])
+
+;; UNSPEC constants
+(define_c_enum "unspec"
+  [UNSPEC_FP16_SHIFT_LEFT_32BIT
+   UNSPEC_CVT_FP16_TO_V4SF
+   UNSPEC_XXSPLTW_FP16
+   UNSPEC_XVCVSPBF16_BF])
 
 ;; _Float16 and __bfloat16 moves
 (define_expand "mov<mode>"
@@ -179,3 +204,222 @@ (define_insn "trunc<mode>hf2"
   "TARGET_FLOAT16_HW"
   "xscvdphp %x0,%x1"
   [(set_attr "type" "fpsimple")])
+
+;; Convert BFmode to SFmode/DFmode.
+;; 3 instructions are generated:
+;;     XXSLDWI         -- shift BFmode element into the upper 32 bits
+;;     XVCVBF16SPN     -- convert even BFmode elements to SFmode
+;;     XSCVSPDPN       -- convert memory format of SFmode to DFmode.
+(define_insn_and_split "extendbf<mode>2"
+  [(set (match_operand:SFDF 0 "vsx_register_operand" "=wa")
+       (float_extend:SFDF
+        (match_operand:BF 1 "vsx_register_operand" "v")))
+   (clobber (match_scratch:V8BF 2 "=v"))]
+  "TARGET_BFLOAT16_HW"
+  "#"
+  "&& 1"
+  [(pc)]
+{
+  rtx op0 = operands[0];
+  rtx op1 = operands[1];
+  rtx op2_v8bf = operands[2];
+
+  if (GET_CODE (op2_v8bf) == SCRATCH)
+    op2_v8bf = gen_reg_rtx (V8BFmode);
+
+  rtx op2_v4sf = gen_lowpart (V4SFmode, op2_v8bf);
+
+  /* XXSLDWI -- shift BFmode element into the upper 32 bits.  */
+  emit_insn (gen_v8bf_shift_left_32bit (op2_v8bf, op1));
+
+  /* XVCVBF16SPN -- convert even V8BFmode elements to V4SFmode.  */
+  emit_insn (gen_cvt_fp16_to_v4sf_v8bf (op2_v4sf, op2_v8bf));
+
+  /* XSCVSPDPN -- convert single V4SFmode element to SFmode/DFmode.  */
+  emit_insn (GET_MODE (op0) == SFmode
+            ? gen_xscvspdpn_sf (op0, op2_v4sf)
+            : gen_vsx_xscvspdpn (op0, op2_v4sf));
+
+  DONE;
+}
+  [(set_attr "type" "fpsimple")
+   (set_attr "length" "12")])
+
+;; Convert a SFmode scalar represented as DFmode to elements 0 and 1 of
+;; V4SFmode.
+(define_insn "xscvdpspn_sf"
+  [(set (match_operand:V4SF 0 "vsx_register_operand" "=wa")
+       (unspec:V4SF [(match_operand:SF 1 "vsx_register_operand" "wa")]
+                    UNSPEC_VSX_CVSPDP))]
+  "VECTOR_UNIT_VSX_P (SFmode)"
+  "xscvdpspn %x0,%x1"
+  [(set_attr "type" "fp")])
+
+;; Convert element 0 of a V4SFmode to scalar SFmode (which on the
+;; PowerPC uses the DFmode encoding).
+(define_insn "xscvspdpn_sf"
+  [(set (match_operand:SF 0 "vsx_register_operand" "=wa")
+       (unspec:SF [(match_operand:V4SF 1 "vsx_register_operand" "wa")]
+                  UNSPEC_VSX_CVSPDPN))]
+  "TARGET_XSCVSPDPN"
+  "xscvspdpn %x0,%x1"
+  [(set_attr "type" "fp")])
+
+;; Vector shift left by 32 bits to get the 16-bit floating point value
+;; into the upper 32 bits for the conversion.
+(define_insn "<fp16_vector8>_shift_left_32bit"
+  [(set (match_operand:<FP16_VECTOR8> 0 "vsx_register_operand" "=wa")
+        (unspec:<FP16_VECTOR8>
+        [(match_operand:FP16_HW 1 "vsx_register_operand" "wa")]
+        UNSPEC_FP16_SHIFT_LEFT_32BIT))]
+  ""
+  "xxsldwi %x0,%x1,%x1,1"
+  [(set_attr "type" "vecperm")])
+
+;; Convert SFmode/DFmode to BFmode.
+;; 2 instructions are generated (hence the length of 8):
+;;     XSCVDPSPN       -- convert SFmode/DFmode scalar to V4SFmode
+;;     XVCVSPBF16      -- convert V4SFmode to even V8BFmode
+(define_insn_and_split "trunc<mode>bf2"
+  [(set (match_operand:BF 0 "vsx_register_operand" "=wa")
+       (float_truncate:BF
+        (match_operand:SFDF 1 "vsx_register_operand" "wa")))
+   (clobber (match_scratch:V4SF 2 "=wa"))]
+  "TARGET_BFLOAT16_HW"
+  "#"
+  "&& 1"
+  [(pc)]
+{
+  rtx op0 = operands[0];
+  rtx op1 = operands[1];
+  rtx op2 = operands[2];
+
+  if (GET_CODE (op2) == SCRATCH)
+    op2 = gen_reg_rtx (V4SFmode);
+
+  emit_insn (GET_MODE (op1) == SFmode
+            ? gen_xscvdpspn_sf (op2, op1)
+            : gen_vsx_xscvdpspn (op2, op1));
+
+  emit_insn (gen_xvcvspbf16_bf (op0, op2));
+  DONE;
+}
+  [(set_attr "type" "fpsimple")
+   (set_attr "length" "8")])
+
+(define_insn "vsx_xscvdpspn_sf"
+  [(set (match_operand:V4SF 0 "vsx_register_operand" "=wa")
+       (unspec:V4SF [(match_operand:SF 1 "vsx_register_operand" "wa")]
+                    UNSPEC_VSX_CVDPSPN))]
+  "TARGET_XSCVDPSPN"
+  "xscvdpspn %x0,%x1"
+  [(set_attr "type" "fp")])
+
+;; Convert the even elements of a vector 16-bit floating point to
+;; V4SFmode.  Deal with little endian vs. big endian element ordering
+;; in identifying which elements are converted.
+
+(define_expand "cvt_fp16_to_v4sf_<mode>"
+  [(set (match_operand:V4SF 0 "vsx_register_operand")
+       (float_extend:V4SF
+        (vec_select:<FP16_VECTOR4>
+         (match_operand:VFP16_HW 1 "vsx_register_operand")
+         (parallel [(match_dup 2)
+                    (match_dup 3)
+                    (match_dup 4)
+                    (match_dup 5)]))))]
+  ""
+{
+  int endian_adjust = WORDS_BIG_ENDIAN ? 0 : 1;
+  operands[2] = GEN_INT (0 + endian_adjust);
+  operands[3] = GEN_INT (2 + endian_adjust);
+  operands[4] = GEN_INT (4 + endian_adjust);
+  operands[5] = GEN_INT (6 + endian_adjust);
+})
+
+(define_insn "*cvt_fp16_to_v4sf_<mode>_le"
+  [(set (match_operand:V4SF 0 "vsx_register_operand" "=wa")
+       (float_extend:V4SF
+        (vec_select:<FP16_VECTOR4>
+         (match_operand:VFP16_HW 1 "vsx_register_operand" "wa")
+         (parallel [(const_int 1)
+                    (const_int 3)
+                    (const_int 5)
+                    (const_int 7)]))))]
+  "!WORDS_BIG_ENDIAN"  ;; little endian: selected elements are 1, 3, 5, 7
+  "<cvt_fp16_to_v4sf_insn> %x0,%x1"
+  [(set_attr "type" "vecfloat")])
+
+(define_insn "*cvt_fp16_to_v4sf_<mode>_be"
+  [(set (match_operand:V4SF 0 "vsx_register_operand" "=wa")
+       (float_extend:V4SF
+        (vec_select:<FP16_VECTOR4>
+         (match_operand:VFP16_HW 1 "vsx_register_operand" "wa")
+         (parallel [(const_int 0)
+                    (const_int 2)
+                    (const_int 4)
+                    (const_int 6)]))))]
+  "WORDS_BIG_ENDIAN"  ;; big endian: selected elements are 0, 2, 4, 6
+  "<cvt_fp16_to_v4sf_insn> %x0,%x1"
+  [(set_attr "type" "vecfloat")])
+
+;; Duplicate and convert a 16-bit floating point scalar to V4SFmode.
+
+(define_insn_and_split "*dup_<mode>_to_v4sf"
+  [(set (match_operand:V4SF 0 "vsx_register_operand" "=wa")
+       (vec_duplicate:V4SF
+        (float_extend:SF
+         (match_operand:FP16_HW 1 "vsx_register_operand" "wa"))))]
+  ""
+  "#"
+  "&& 1"
+  [(pc)]
+{
+  rtx op0 = operands[0];
+  rtx op1 = operands[1];
+  rtx op0_vfp16 = gen_lowpart (<FP16_VECTOR8>mode, op0);
+
+  emit_insn (gen_xxspltw_<mode> (op0, op1));
+  emit_insn (gen_cvt_fp16_to_v4sf_<fp16_vector8> (op0, op0_vfp16));
+  DONE;
+}
+  [(set_attr "length" "8")
+   (set_attr "type" "vecperm")])
+
+;; Duplicate a HF/BF value so it can be used for xvcvhpsp/xvcvbf16spn.
+;; Because xvcvhpsp/xvcvbf16spn only use the even elements, we can
+;; use xxspltw instead of vsplth.  This has the advantage that the
+;; register allocator can use any of the 64 VSX registers instead of
+;; being limited to the 32 Altivec registers that VSPLTH would require.
+
+(define_insn "xxspltw_<mode>"
+  [(set (match_operand:V4SF 0 "vsx_register_operand" "=wa")
+       (unspec:V4SF [(match_operand:FP16_HW 1 "vsx_register_operand" "wa")]
+                    UNSPEC_XXSPLTW_FP16))]
+  ""
+  "xxspltw %x0,%x1,1"
+  [(set_attr "type" "vecperm")])
+
+;; Convert a bfloat16 floating point scalar that has been splatted to
+;; V4SFmode.
+
+(define_insn "xvcvbf16spn_bf"
+  [(set (match_operand:V4SF 0 "vsx_register_operand" "=wa")
+       (unspec:V4SF [(match_operand:V4SF 1 "vsx_register_operand" "wa")]
+                    UNSPEC_CVT_FP16_TO_V4SF))]
+  "TARGET_BFLOAT16_HW"
+  "xvcvbf16spn %x0,%x1"
+  [(set_attr "type" "vecperm")])
+
+;; Convert a V4SFmode vector to a 16-bit floating point scalar.  We
+;; only care about the 2nd V4SFmode element, which is the element that
+;; held the 16-bit scalar (4th 16-bit element) once it was converted to
+;; V4SFmode for the operation; that element is now converted back.
+
+(define_insn "xvcvspbf16_bf"
+  [(set (match_operand:BF 0 "vsx_register_operand" "=wa")
+       (unspec:BF [(match_operand:V4SF 1 "vsx_register_operand" "wa")]
+                  UNSPEC_XVCVSPBF16_BF))]
+  "TARGET_BFLOAT16_HW"
+  "xvcvspbf16 %x0,%x1"
+  [(set_attr "type" "vecfloat")])
diff --git a/gcc/config/rs6000/rs6000-c.cc b/gcc/config/rs6000/rs6000-c.cc
index 31a166a1c0f..7b63c0b64a9 100644
--- a/gcc/config/rs6000/rs6000-c.cc
+++ b/gcc/config/rs6000/rs6000-c.cc
@@ -591,6 +591,9 @@ rs6000_target_modify_macros (bool define_p, HOST_WIDE_INT flags)
 
       if ((flags & OPTION_MASK_P9_VECTOR) != 0)
        rs6000_define_or_undefine_macro (define_p, "__FLOAT16_HW__");
+
+      if ((flags & OPTION_MASK_POWER10) != 0)
+       rs6000_define_or_undefine_macro (define_p, "__BFLOAT16_HW__");
     }
   /* Tell the user if we are targeting CELL.  */
   if (rs6000_cpu == PROCESSOR_CELL)
diff --git a/gcc/config/rs6000/rs6000.cc b/gcc/config/rs6000/rs6000.cc
index a913f91b6e5..74b84d1770d 100644
--- a/gcc/config/rs6000/rs6000.cc
+++ b/gcc/config/rs6000/rs6000.cc
@@ -3012,6 +3012,9 @@ rs6000_init_hard_regno_mode_ok (bool global_init_p)
 
       if (TARGET_P9_VECTOR)
        rs6000_vector_unit[V8HFmode] = VECTOR_VSX;
+
+      if (TARGET_POWER10)
+       rs6000_vector_unit[V8BFmode] = VECTOR_VSX;
     }
 
   /* DFmode, see if we want to use the VSX unit.  Memory is handled
-- 
2.51.1


-- 
Michael Meissner, IBM
PO Box 98, Ayer, Massachusetts, USA, 01432
email: [email protected]

[PATCH, 5/11] Add conversions between __bfloat16 and float/double.

Reply via email to