https://gcc.gnu.org/g:bdfca635571a640fca76c26ad55417e3169a82ce
commit bdfca635571a640fca76c26ad55417e3169a82ce Author: Michael Meissner <[email protected]> Date: Thu Nov 13 11:21:23 2025 -0500 Optimize __bfloat16 scalar code. Optimize __bfloat16 binary operations. Unlike _Float16 where we have instructions to convert between HFmode and SFmode as scalar values, with BFmode, we only have vector conversions. Thus to do: __bfloat16 a, b, c; a = b + c; the GCC compiler generates the following code: lxsihzx 0,4,2 // load __bfloat16 value b lxsihzx 12,5,2 // load __bfloat16 value c xxsldwi 0,0,0,1 // shift b into bits 16..31 xxsldwi 12,12,12,1 // shift c into bits 16..31 xvcvbf16spn 0,0 // vector convert b into V4SFmode xvcvbf16spn 12,12 // vector convert c into V4SFmode xscvspdpn 0,0 // convert b into SFmode scalar xscvspdpn 12,12 // convert c into SFmode scalar fadds 0,0,12 // add b+c xscvdpspn 0,0 // convert b+c into SFmode memory format xvcvspbf16 0,0 // convert b+c into BFmode memory format stxsihx 0,3,2 // store b+c Using the following combiner patterns that are defined in this patch, the code generated would be: lxsihzx 12,4,2 // load __bfloat16 value b lxsihzx 0,5,2 // load __bfloat16 value c xxspltw 12,12,1 // shift b into bits 16..31 xxspltw 0,0,1 // shift c into bits 16..31 xvcvbf16spn 12,12 // vector convert b into V4SFmode xvcvbf16spn 0,0 // vector convert c into V4SFmode xvaddsp 0,0,12 // vector b+c in V4SFmode xvcvspbf16 0,0 // convert b+c into BFmode memory format stxsihx 0,3,2 // store b+c We cannot just define insns like 'addbf3' to keep the operation as BFmode because GCC will not generate these patterns unless the user uses -Ofast. Without -Ofast, it will always convert BFmode into SFmode. 2025-11-13 Michael Meissner <[email protected]> gcc/ * config/rs6000/float16.cc (bfloat16_operation_as_v4sf): New function to optimize __bfloat16 scalar operations. * config/rs6000/float16.md (bfloat16_binary_op_internal1): New __bfloat16 scalar combiner insns. (bfloat16_binary_op_internal2): Likewise. 
(bfloat16_fma_internal1): Likewise. (bfloat16_fma_internal2): Likewise. (bfloat16_fms_internal1): Likewise. (bfloat16_fms_internal2): Likewise. (bfloat16_nfma_internal1): Likewise. (bfloat16_nfma_internal2): Likewise. (bfloat16_nfms_internal3): Likewise. * config/rs6000/predicates.md (fp16_reg_or_constant_operand): New predicate. (bfloat16_v4sf_operand): Likewise. (bfloat16_bf_operand): Likewise. * config/rs6000/rs6000-protos.h (bfloat16_operation_as_v4sf): New declaration. Diff: --- gcc/config/rs6000/float16.cc | 150 ++++++++++++++++++++ gcc/config/rs6000/float16.md | 282 ++++++++++++++++++++++++++++++++++++++ gcc/config/rs6000/predicates.md | 76 ++++++++++ gcc/config/rs6000/rs6000-protos.h | 2 + 4 files changed, 510 insertions(+) diff --git a/gcc/config/rs6000/float16.cc b/gcc/config/rs6000/float16.cc index 5274a0df962f..2c7b6278a16a 100644 --- a/gcc/config/rs6000/float16.cc +++ b/gcc/config/rs6000/float16.cc @@ -183,3 +183,153 @@ fp16_vectorization (enum rtx_code icode, return; } + +/* Expand a bfloat16 scalar floating point operation: + + ICODE: Operation to perform. + RESULT: Result of the operation. + OP1: Input operand1. + OP2: Input operand2. + OP3: Input operand3 or NULL_RTX. + SUBTYPE: Describe the operation. + + The operation is done as a V4SFmode vector operation. This is because + converting BFmode from a scalar BFmode to SFmode to do the operation and + back again takes quite a bit of time. GCC will only generate the native + operation if -Ofast is used. The float16.md code that calls this function + adds various combine operations to do the operation in V4SFmode instead of + SFmode. 
*/ + +void +bfloat16_operation_as_v4sf (enum rtx_code icode, + rtx result, + rtx op1, + rtx op2, + rtx op3, + enum fp16_operation subtype) +{ + gcc_assert (can_create_pseudo_p ()); + + rtx result_v4sf = gen_reg_rtx (V4SFmode); + rtx ops_orig[3] = { op1, op2, op3 }; + rtx ops_v4sf[3]; + size_t n_opts; + + switch (subtype) + { + case FP16_BINARY: + n_opts = 2; + gcc_assert (op3 == NULL_RTX); + break; + + case FP16_FMA: + case FP16_FMS: + case FP16_NFMA: + case FP16_NFMS: + gcc_assert (icode == FMA); + n_opts = 3; + break; + + default: + gcc_unreachable (); + } + + for (size_t i = 0; i < n_opts; i++) + { + rtx op = ops_orig[i]; + rtx tmp = ops_v4sf[i] = gen_reg_rtx (V4SFmode); + + gcc_assert (op != NULL_RTX); + + /* Remove truncation/extend added. */ + if (GET_CODE (op) == FLOAT_EXTEND || GET_CODE (op) == FLOAT_TRUNCATE) + op = XEXP (op, 0); + + /* Convert operands to V4SFmode format. We use SPLAT for registers to + get the value into the upper 32-bits. We can use XXSPLTW to splat + words instead of VSPLTIH since the XVCVBF16SPN instruction ignores the + odd half-words, and XXSPLTW can operate on all VSX registers instead + of just the Altivec registers. Using SPLAT instead of a shift also + insures that other bits are not a signalling NaN. If we are using + XXSPLTIW or XXSPLTIB to load the constant the other bits are + duplicated. 
*/ + + if (op == CONST0_RTX (SFmode) || op == CONST0_RTX (BFmode)) + emit_move_insn (tmp, CONST0_RTX (V4SFmode)); + + else if (GET_MODE (op) == BFmode) + { + emit_insn (gen_xxspltw_bf (tmp, force_reg (BFmode, op))); + emit_insn (gen_xvcvbf16spn_bf (tmp, tmp)); + } + + else if (GET_MODE (op) == SFmode) + { + if (GET_CODE (op) == CONST_DOUBLE) + { + rtvec v = rtvec_alloc (4); + + for (size_t i = 0; i < 4; i++) + RTVEC_ELT (v, i) = op; + + emit_insn (gen_rtx_SET (tmp, + gen_rtx_CONST_VECTOR (V4SFmode, v))); + } + + else + emit_insn (gen_vsx_splat_v4sf (tmp, + force_reg (SFmode, op))); + } + + else + gcc_unreachable (); + } + + /* Do the operation in V4SFmode. */ + switch (subtype) + { + case FP16_BINARY: + emit_insn (gen_rtx_SET (result_v4sf, + gen_rtx_fmt_ee (icode, V4SFmode, + ops_v4sf[0], + ops_v4sf[1]))); + break; + + case FP16_FMA: + case FP16_FMS: + case FP16_NFMA: + case FP16_NFMS: + { + rtx op1 = ops_v4sf[0]; + rtx op2 = ops_v4sf[1]; + rtx op3 = ops_v4sf[2]; + + if (subtype == FP16_FMS || subtype == FP16_NFMS) + op3 = gen_rtx_NEG (V4SFmode, op3); + + rtx op_fma = gen_rtx_FMA (V4SFmode, op1, op2, op3); + + if (subtype == FP16_NFMA || subtype == FP16_NFMS) + op_fma = gen_rtx_NEG (V4SFmode, op_fma); + + emit_insn (gen_rtx_SET (result_v4sf, op_fma)); + } + break; + + default: + gcc_unreachable (); + } + + /* Convert V4SF result back to scalar mode. */ + if (GET_MODE (result) == BFmode) + emit_insn (gen_xvcvspbf16_bf (result, result_v4sf)); + + else if (GET_MODE (result) == SFmode) + { + rtx element = GEN_INT (WORDS_BIG_ENDIAN ? 2 : 3); + emit_insn (gen_vsx_extract_v4sf (result, result_v4sf, element)); + } + + else + gcc_unreachable (); +} diff --git a/gcc/config/rs6000/float16.md b/gcc/config/rs6000/float16.md index 690b8c2d6610..fe5422dc2892 100644 --- a/gcc/config/rs6000/float16.md +++ b/gcc/config/rs6000/float16.md @@ -963,3 +963,285 @@ "TARGET_BFLOAT16_HW" "xvcvbf16spn %x0,%x1" [(set_attr "type" "vecperm")]) + +;; Optimize __bfloat16 binary operations. 
Unlike _Float16 where we +;; have instructions to convert between HFmode and SFmode as scalar +;; values, with BFmode, we only have vector conversions. Thus to do: +;; +;; __bfloat16 a, b, c; +;; a = b + c; +;; +;; the GCC compiler would normally generate: +;; +;; lxsihzx 0,4,2 // load __bfloat16 value b +;; lxsihzx 12,5,2 // load __bfloat16 value c +;; xxsldwi 0,0,0,1 // shift b into bits 16..31 +;; xxsldwi 12,12,12,1 // shift c into bits 16..31 +;; xvcvbf16spn 0,0 // vector convert b into V4SFmode +;; xvcvbf16spn 12,12 // vector convert c into V4SFmode +;; xscvspdpn 0,0 // convert b into SFmode scalar +;; xscvspdpn 12,12 // convert c into SFmode scalar +;; fadds 0,0,12 // add b+c +;; xscvdpspn 0,0 // convert b+c into SFmode memory format +;; xvcvspbf16 0,0 // convert b+c into BFmode memory format +;; stxsihx 0,3,2 // store b+c +;; +;; Using the following combiner patterns, the code generated would now +;; be: +;; +;; lxsihzx 12,4,2 // load __bfloat16 value b +;; lxsihzx 0,5,2 // load __bfloat16 value c +;; xxspltw 12,12,1 // shift b into bits 16..31 +;; xxspltw 0,0,1 // shift c into bits 16..31 +;; xvcvbf16spn 12,12 // vector convert b into V4SFmode +;; xvcvbf16spn 0,0 // vector convert c into V4SFmode +;; xvaddsp 0,0,12 // vector b+c in V4SFmode +;; xvcvspbf16 0,0 // convert b+c into BFmode memory format +;; stxsihx 0,3,2 // store b+c +;; +;; We cannot just define insns like 'addbf3' to keep the operation as +;; BFmode because GCC will not generate these patterns unless the user +;; uses -Ofast. Without -Ofast, it will always convert BFmode into +;; SFmode. 
+ +(define_insn_and_split "*bfloat16_binary_op_internal1" + [(set (match_operand:SF 0 "vsx_register_operand") + (match_operator:SF 1 "fp16_binary_operator" + [(match_operand:SF 2 "bfloat16_v4sf_operand") + (match_operand:SF 3 "bfloat16_v4sf_operand")]))] + "TARGET_BFLOAT16_HW && can_create_pseudo_p () + && (bfloat16_bf_operand (operands[2], SFmode) + || bfloat16_bf_operand (operands[3], SFmode))" + "#" + "&& 1" + [(pc)] +{ + bfloat16_operation_as_v4sf (GET_CODE (operands[1]), operands[0], operands[2], + operands[3], NULL_RTX, FP16_BINARY); + DONE; +}) + +(define_insn_and_split "*bfloat16_binary_op_internal2" + [(set (match_operand:BF 0 "vsx_register_operand") + (float_truncate:BF + (match_operator:SF 1 "fp16_binary_operator" + [(match_operand:SF 2 "bfloat16_v4sf_operand") + (match_operand:SF 3 "bfloat16_v4sf_operand")])))] + "TARGET_BFLOAT16_HW && can_create_pseudo_p () + && (bfloat16_bf_operand (operands[2], SFmode) + || bfloat16_bf_operand (operands[3], SFmode))" + "#" + "&& 1" + [(pc)] +{ + bfloat16_operation_as_v4sf (GET_CODE (operands[1]), operands[0], operands[2], + operands[3], NULL_RTX, FP16_BINARY); + DONE; +}) + +(define_insn_and_split "*bfloat16_fma_internal1" + [(set (match_operand:SF 0 "vsx_register_operand") + (fma:SF + (match_operand:SF 1 "bfloat16_v4sf_operand") + (match_operand:SF 2 "bfloat16_v4sf_operand") + (match_operand:SF 3 "bfloat16_v4sf_operand")))] + "TARGET_BFLOAT16_HW && can_create_pseudo_p () + && (bfloat16_bf_operand (operands[1], SFmode) + + bfloat16_bf_operand (operands[2], SFmode) + + bfloat16_bf_operand (operands[3], SFmode) >= 2)" + "#" + "&& 1" + [(pc)] +{ + bfloat16_operation_as_v4sf (FMA, operands[0], operands[1], operands[2], + operands[3], FP16_FMA); + DONE; +}) + +(define_insn_and_split "*bfloat16_fma_internal2" + [(set (match_operand:BF 0 "vsx_register_operand" "=wa") + (float_truncate:BF + (fma:SF + (match_operand:SF 1 "bfloat16_v4sf_operand") + (match_operand:SF 2 "bfloat16_v4sf_operand") + (match_operand:SF 3 
"bfloat16_v4sf_operand"))))] + "TARGET_BFLOAT16_HW && can_create_pseudo_p () + && (bfloat16_bf_operand (operands[1], SFmode) + + bfloat16_bf_operand (operands[2], SFmode) + + bfloat16_bf_operand (operands[3], SFmode) >= 2)" + "#" + "&& 1" + [(pc)] +{ + bfloat16_operation_as_v4sf (FMA, operands[0], operands[1], operands[2], + operands[3], FP16_FMA); + DONE; +}) + +(define_insn_and_split "*bfloat16_fms_internal1" + [(set (match_operand:SF 0 "vsx_register_operand") + (fma:SF + (match_operand:SF 1 "bfloat16_v4sf_operand") + (match_operand:SF 2 "bfloat16_v4sf_operand") + (neg:SF + (match_operand:SF 3 "bfloat16_v4sf_operand"))))] + "TARGET_BFLOAT16_HW && can_create_pseudo_p () + && (bfloat16_bf_operand (operands[1], SFmode) + + bfloat16_bf_operand (operands[2], SFmode) + + bfloat16_bf_operand (operands[3], SFmode) >= 2)" + "#" + "&& 1" + [(pc)] +{ + bfloat16_operation_as_v4sf (FMA, operands[0], operands[1], operands[2], + operands[3], FP16_FMS); + DONE; +}) + +(define_insn_and_split "*bfloat16_fms_internal2" + [(set (match_operand:BF 0 "vsx_register_operand") + (float_truncate:BF + (fma:SF + (match_operand:SF 1 "bfloat16_v4sf_operand") + (match_operand:SF 2 "bfloat16_v4sf_operand") + (neg:SF + (match_operand:SF 3 "bfloat16_v4sf_operand")))))] + "TARGET_BFLOAT16_HW && can_create_pseudo_p () + && (bfloat16_bf_operand (operands[1], SFmode) + + bfloat16_bf_operand (operands[2], SFmode) + + bfloat16_bf_operand (operands[3], SFmode) >= 2)" + "#" + "&& 1" + [(pc)] +{ + bfloat16_operation_as_v4sf (FMA, operands[0], operands[1], operands[2], + operands[3], FP16_FMS); + DONE; +}) + +(define_insn_and_split "*bfloat16_nfma_internal1" + [(set (match_operand:SF 0 "vsx_register_operand") + (neg:SF + (fma:SF + (match_operand:SF 1 "bfloat16_v4sf_operand") + (match_operand:SF 2 "bfloat16_v4sf_operand") + (match_operand:SF 3 "bfloat16_v4sf_operand"))))] + "TARGET_BFLOAT16_HW && can_create_pseudo_p () + && (bfloat16_bf_operand (operands[1], SFmode) + + bfloat16_bf_operand (operands[2], 
SFmode) + + bfloat16_bf_operand (operands[3], SFmode) >= 2)" + "#" + "&& 1" + [(pc)] +{ + bfloat16_operation_as_v4sf (FMA, operands[0], operands[1], operands[2], + operands[3], FP16_NFMA); + DONE; +}) + +(define_insn_and_split "*bfloat16_nfma_internal2" + [(set (match_operand:BF 0 "vsx_register_operand" "=wa") + (float_truncate:BF + (neg:SF + (fma:SF + (match_operand:SF 1 "bfloat16_v4sf_operand") + (match_operand:SF 2 "bfloat16_v4sf_operand") + (match_operand:SF 3 "bfloat16_v4sf_operand")))))] + "TARGET_BFLOAT16_HW && can_create_pseudo_p () + && (bfloat16_bf_operand (operands[1], SFmode) + + bfloat16_bf_operand (operands[2], SFmode) + + bfloat16_bf_operand (operands[3], SFmode) >= 2)" + "#" + "&& 1" + [(pc)] +{ + bfloat16_operation_as_v4sf (FMA, operands[0], operands[1], operands[2], + operands[3], FP16_NFMA); + DONE; +}) + +(define_insn_and_split "*bfloat16_nfma_internal3" + [(set (match_operand:BF 0 "vsx_register_operand" "=wa") + (neg:BF + (float_truncate:BF + (fma:SF + (match_operand:SF 1 "bfloat16_v4sf_operand") + (match_operand:SF 2 "bfloat16_v4sf_operand") + (match_operand:SF 3 "bfloat16_v4sf_operand")))))] + "TARGET_BFLOAT16_HW && can_create_pseudo_p () + && (bfloat16_bf_operand (operands[1], SFmode) + + bfloat16_bf_operand (operands[2], SFmode) + + bfloat16_bf_operand (operands[3], SFmode) >= 2)" + "#" + "&& 1" + [(pc)] +{ + bfloat16_operation_as_v4sf (FMA, operands[0], operands[1], operands[2], + operands[3], FP16_NFMA); + DONE; +}) + +(define_insn_and_split "*bfloat16_nfms_internal1" + [(set (match_operand:SF 0 "vsx_register_operand") + (neg:SF + (fma:SF + (match_operand:SF 1 "bfloat16_v4sf_operand") + (match_operand:SF 2 "bfloat16_v4sf_operand") + (neg:SF + (match_operand:SF 3 "bfloat16_v4sf_operand")))))] + "TARGET_BFLOAT16_HW && can_create_pseudo_p () + && (bfloat16_bf_operand (operands[1], SFmode) + + bfloat16_bf_operand (operands[2], SFmode) + + bfloat16_bf_operand (operands[3], SFmode) >= 2)" + "#" + "&& 1" + [(pc)] +{ + bfloat16_operation_as_v4sf 
(FMA, operands[0], operands[1], operands[2], + operands[3], FP16_NFMS); + DONE; +}) + +(define_insn_and_split "*bfloat16_nfms_internal2" + [(set (match_operand:BF 0 "vsx_register_operand") + (float_truncate:BF + (neg:SF + (fma:SF + (match_operand:SF 1 "bfloat16_v4sf_operand") + (match_operand:SF 2 "bfloat16_v4sf_operand") + (neg:SF + (match_operand:SF 3 "bfloat16_v4sf_operand"))))))] + "TARGET_BFLOAT16_HW && can_create_pseudo_p () + && (bfloat16_bf_operand (operands[1], SFmode) + + bfloat16_bf_operand (operands[2], SFmode) + + bfloat16_bf_operand (operands[3], SFmode) >= 2)" + "#" + "&& 1" + [(pc)] +{ + bfloat16_operation_as_v4sf (FMA, operands[0], operands[1], operands[2], + operands[3], FP16_NFMS); + DONE; +}) + +(define_insn_and_split "*bfloat16_nfms_internal3" + [(set (match_operand:BF 0 "vsx_register_operand") + (neg:BF + (float_truncate:BF + (fma:SF + (match_operand:SF 1 "bfloat16_v4sf_operand") + (match_operand:SF 2 "bfloat16_v4sf_operand") + (neg:SF + (match_operand:SF 3 "bfloat16_v4sf_operand"))))))] + "TARGET_BFLOAT16_HW && can_create_pseudo_p () + && (bfloat16_bf_operand (operands[1], SFmode) + + bfloat16_bf_operand (operands[2], SFmode) + + bfloat16_bf_operand (operands[3], SFmode) >= 2)" + "#" + "&& 1" + [(pc)] +{ + bfloat16_operation_as_v4sf (FMA, operands[0], operands[1], operands[2], + operands[3], FP16_NFMS); + DONE; +}) diff --git a/gcc/config/rs6000/predicates.md b/gcc/config/rs6000/predicates.md index 785d09b94234..172991de3662 100644 --- a/gcc/config/rs6000/predicates.md +++ b/gcc/config/rs6000/predicates.md @@ -2276,3 +2276,79 @@ return constant_generates_xxspltiw (&vsx_const); }) + +;; Return 1 if this is a 16-bit floating point operand that can be used +;; in an add, subtract, or multiply operation that uses the vector +;; conversion function. 
+(define_predicate "fp16_reg_or_constant_operand" + (match_code "reg,subreg,const_double") +{ + if (REG_P (op) || SUBREG_P (op)) + return vsx_register_operand (op, mode); + + if (CONST_DOUBLE_P (op)) + return fp16_xxspltiw_constant (op, mode); + + return false; +}) + +;; Match binary operators where we convert a BFmode operand into a +;; SFmode operand so that we can optimize the BFmode operation to do +;; the operation in vector mode rather than converting the BFmode to a +;; V8BFmode vector, converting that V8BFmode vector to V4SFmode, and +;; then converting the V4SFmode element to SFmode scalar. +(define_predicate "fp16_binary_operator" + (match_code "plus,minus,mult,smax,smin")) + +;; Match bfloat16/float operands that can be optimized to do the +;; operation in V4SFmode. +(define_predicate "bfloat16_v4sf_operand" + (match_code "reg,subreg,const_double,float_extend,float_truncate") +{ + if (mode != BFmode && mode != SFmode) + return false; + + if (REG_P (op) || SUBREG_P (op)) + return register_operand (op, mode); + + if (CONST_DOUBLE_P (op)) + return true; + + if (GET_CODE (op) == FLOAT_EXTEND) + { + rtx op_arg = XEXP (op, 0); + return (mode == SFmode + && GET_MODE (op_arg) == BFmode + && (REG_P (op_arg) || SUBREG_P (op_arg))); + } + + if (GET_CODE (op) == FLOAT_TRUNCATE) + { + rtx op_arg = XEXP (op, 0); + return (mode == BFmode + && GET_MODE (op_arg) == SFmode + && (REG_P (op_arg) || SUBREG_P (op_arg))); + } + + return false; +}) + +;; Match an operand that originally was a BFmode value to prevent +;; operations involving only SFmode values from being converted to +;; BFmode. 
+(define_predicate "bfloat16_bf_operand" + (match_code "reg,subreg,const_double,float_extend") +{ + if (mode == BFmode || GET_MODE (op) == BFmode) + return true; + + if (mode != SFmode) + return false; + + if (GET_MODE (op) == SFmode + && GET_CODE (op) == FLOAT_EXTEND + && GET_MODE (XEXP (op, 0)) == BFmode) + return true; + + return false; +}) diff --git a/gcc/config/rs6000/rs6000-protos.h b/gcc/config/rs6000/rs6000-protos.h index dd5fcd69e836..3665a405cfd2 100644 --- a/gcc/config/rs6000/rs6000-protos.h +++ b/gcc/config/rs6000/rs6000-protos.h @@ -271,6 +271,8 @@ enum fp16_operation { extern void fp16_vectorization (enum rtx_code, rtx, rtx, rtx, rtx, enum fp16_operation); +extern void bfloat16_operation_as_v4sf (enum rtx_code, rtx, rtx, rtx, rtx, + enum fp16_operation); #endif /* RTX_CODE */ #ifdef TREE_CODE
