Optimize __bfloat16 binary and fused multiply-add operations.

Unlike _Float16, where we have instructions to convert between HFmode
and SFmode as scalar values, with BFmode we only have vector
conversions.  Thus, to do:

    __bfloat16 a, b, c;
    a = b + c;

the GCC compiler currently generates the following code:

lxsihzx 0,4,2 // load __bfloat16 value b
lxsihzx 12,5,2 // load __bfloat16 value c
xxsldwi 0,0,0,1 // shift b into bits 16..31
xxsldwi 12,12,12,1 // shift c into bits 16..31
xvcvbf16spn 0,0 // vector convert b into V4SFmode
xvcvbf16spn 12,12 // vector convert c into V4SFmode
xscvspdpn 0,0 // convert b into SFmode scalar
xscvspdpn 12,12 // convert c into SFmode scalar
fadds 0,0,12 // add b+c
xscvdpspn 0,0 // convert b+c into SFmode memory format
xvcvspbf16 0,0 // convert b+c into BFmode memory format
stxsihx 0,3,2 // store b+c

Using the combiner patterns defined in this patch, the generated code
becomes:

lxsihzx 12,4,2 // load __bfloat16 value b
lxsihzx 0,5,2 // load __bfloat16 value c
xxspltw 12,12,1      // splat b into bits 16..31
xxspltw 0,0,1        // splat c into bits 16..31
xvcvbf16spn 12,12 // vector convert b into V4SFmode
xvcvbf16spn 0,0 // vector convert c into V4SFmode
xvaddsp 0,0,12 // vector b+c in V4SFmode
xvcvspbf16 0,0 // convert b+c into BFmode memory format
stxsihx 0,3,2 // store b+c

We cannot simply define insns like 'addbf3' to keep the operation in
BFmode, because GCC will not generate those patterns unless the user
compiles with -Ofast.  Without -Ofast, GCC always converts BFmode to
SFmode before doing the operation.
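
As a hypothetical illustration (the function names below are made up
for this note, not taken from the patch), these are the kinds of
source-level cases the new combiner patterns should match, assuming a
target with the bfloat16 hardware support and that FMA contraction
(-ffp-contract=fast, the default) is in effect:

    /* Sketch only; names are illustrative.  */
    __bfloat16
    madd (__bfloat16 a, __bfloat16 b, __bfloat16 c)
    {
      return a * b + c;   /* fma form, truncated back to BFmode.  */
    }

    float
    add_sf (__bfloat16 b, __bfloat16 c)
    {
      return b + c;       /* binary op with an SFmode result.  */
    }

In both cases the arithmetic should stay in V4SFmode (xvmaddasp and
xvaddsp respectively) instead of bouncing through scalar SFmode.
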
All 11 patches have been tested on little-endian and big-endian PowerPC
servers with no regressions.  Can I check in these patches?

2025-11-14  Michael Meissner  <[email protected]>

gcc/
* config/rs6000/float16.cc (bfloat16_operation_as_v4sf): New function to
optimize __bfloat16 scalar operations.
* config/rs6000/float16.md (bfloat16_binary_op_internal1): New
__bfloat16 scalar combiner insns.
(bfloat16_binary_op_internal2): Likewise.
(bfloat16_fma_internal1): Likewise.
(bfloat16_fma_internal2): Likewise.
(bfloat16_fms_internal1): Likewise.
(bfloat16_fms_internal2): Likewise.
(bfloat16_nfma_internal1): Likewise.
(bfloat16_nfma_internal2): Likewise.
(bfloat16_nfma_internal3): Likewise.
(bfloat16_nfms_internal1): Likewise.
(bfloat16_nfms_internal2): Likewise.
(bfloat16_nfms_internal3): Likewise.
* config/rs6000/predicates.md (fp16_reg_or_constant_operand): New
predicate.
(fp16_binary_operator): Likewise.
(bfloat16_v4sf_operand): Likewise.
(bfloat16_bf_operand): Likewise.
* config/rs6000/rs6000-protos.h (bfloat16_operation_as_v4sf): New
declaration.
---
gcc/config/rs6000/float16.cc | 150 ++++++++++++++++
gcc/config/rs6000/float16.md | 282 ++++++++++++++++++++++++++++++
gcc/config/rs6000/predicates.md | 76 ++++++++
gcc/config/rs6000/rs6000-protos.h | 2 +
4 files changed, 510 insertions(+)
diff --git a/gcc/config/rs6000/float16.cc b/gcc/config/rs6000/float16.cc
index 5274a0df962..2c7b6278a16 100644
--- a/gcc/config/rs6000/float16.cc
+++ b/gcc/config/rs6000/float16.cc
@@ -183,3 +183,153 @@ fp16_vectorization (enum rtx_code icode,
return;
}
+
+/* Expand a bfloat16 scalar floating point operation:
+
+ ICODE: Operation to perform.
+ RESULT: Result of the operation.
+ OP1: First input operand.
+ OP2: Second input operand.
+ OP3: Third input operand, or NULL_RTX.
+ SUBTYPE: Kind of operation being performed.
+
+ The operation is done as a V4SFmode vector operation.  This is because
+ converting a scalar BFmode value to SFmode to do the operation and
+ converting the result back again takes quite a bit of time.  GCC will
+ only generate the native operation if -Ofast is used.  The float16.md
+ patterns that call this function let combine do the operation in
+ V4SFmode instead of SFmode.  */
+
+void
+bfloat16_operation_as_v4sf (enum rtx_code icode,
+ rtx result,
+ rtx op1,
+ rtx op2,
+ rtx op3,
+ enum fp16_operation subtype)
+{
+ gcc_assert (can_create_pseudo_p ());
+
+ rtx result_v4sf = gen_reg_rtx (V4SFmode);
+ rtx ops_orig[3] = { op1, op2, op3 };
+ rtx ops_v4sf[3];
+ size_t n_opts;
+
+ switch (subtype)
+ {
+ case FP16_BINARY:
+ n_opts = 2;
+ gcc_assert (op3 == NULL_RTX);
+ break;
+
+ case FP16_FMA:
+ case FP16_FMS:
+ case FP16_NFMA:
+ case FP16_NFMS:
+ gcc_assert (icode == FMA);
+ n_opts = 3;
+ break;
+
+ default:
+ gcc_unreachable ();
+ }
+
+ for (size_t i = 0; i < n_opts; i++)
+ {
+ rtx op = ops_orig[i];
+ rtx tmp = ops_v4sf[i] = gen_reg_rtx (V4SFmode);
+
+ gcc_assert (op != NULL_RTX);
+
+ /* Remove any FLOAT_EXTEND or FLOAT_TRUNCATE wrapper that was added.  */
+ if (GET_CODE (op) == FLOAT_EXTEND || GET_CODE (op) == FLOAT_TRUNCATE)
+ op = XEXP (op, 0);
+
+ /* Convert operands to V4SFmode format.  We use a SPLAT for registers
+ to get the value into the upper 32 bits.  We can use XXSPLTW to splat
+ words instead of VSPLTIH since the XVCVBF16SPN instruction ignores the
+ odd half-words, and XXSPLTW can operate on all VSX registers instead
+ of just the Altivec registers.  Using a SPLAT instead of a shift also
+ ensures that the other bits are not a signaling NaN.  If we are using
+ XXSPLTIW or XXSPLTIB to load the constant, the other bits are
+ duplicated.  */
+
+ if (op == CONST0_RTX (SFmode) || op == CONST0_RTX (BFmode))
+ emit_move_insn (tmp, CONST0_RTX (V4SFmode));
+
+ else if (GET_MODE (op) == BFmode)
+ {
+ emit_insn (gen_xxspltw_bf (tmp, force_reg (BFmode, op)));
+ emit_insn (gen_xvcvbf16spn_bf (tmp, tmp));
+ }
+
+ else if (GET_MODE (op) == SFmode)
+ {
+ if (GET_CODE (op) == CONST_DOUBLE)
+ {
+ rtvec v = rtvec_alloc (4);
+
+ for (size_t j = 0; j < 4; j++)
+ RTVEC_ELT (v, j) = op;
+
+ emit_insn (gen_rtx_SET (tmp,
+ gen_rtx_CONST_VECTOR (V4SFmode, v)));
+ }
+
+ else
+ emit_insn (gen_vsx_splat_v4sf (tmp,
+ force_reg (SFmode, op)));
+ }
+
+ else
+ gcc_unreachable ();
+ }
+
+ /* Do the operation in V4SFmode. */
+ switch (subtype)
+ {
+ case FP16_BINARY:
+ emit_insn (gen_rtx_SET (result_v4sf,
+ gen_rtx_fmt_ee (icode, V4SFmode,
+ ops_v4sf[0],
+ ops_v4sf[1])));
+ break;
+
+ case FP16_FMA:
+ case FP16_FMS:
+ case FP16_NFMA:
+ case FP16_NFMS:
+ {
+ rtx op1 = ops_v4sf[0];
+ rtx op2 = ops_v4sf[1];
+ rtx op3 = ops_v4sf[2];
+
+ if (subtype == FP16_FMS || subtype == FP16_NFMS)
+ op3 = gen_rtx_NEG (V4SFmode, op3);
+
+ rtx op_fma = gen_rtx_FMA (V4SFmode, op1, op2, op3);
+
+ if (subtype == FP16_NFMA || subtype == FP16_NFMS)
+ op_fma = gen_rtx_NEG (V4SFmode, op_fma);
+
+ emit_insn (gen_rtx_SET (result_v4sf, op_fma));
+ }
+ break;
+
+ default:
+ gcc_unreachable ();
+ }
+
+ /* Convert V4SF result back to scalar mode. */
+ if (GET_MODE (result) == BFmode)
+ emit_insn (gen_xvcvspbf16_bf (result, result_v4sf));
+
+ else if (GET_MODE (result) == SFmode)
+ {
+ rtx element = GEN_INT (WORDS_BIG_ENDIAN ? 2 : 3);
+ emit_insn (gen_vsx_extract_v4sf (result, result_v4sf, element));
+ }
+
+ else
+ gcc_unreachable ();
+}
diff --git a/gcc/config/rs6000/float16.md b/gcc/config/rs6000/float16.md
index 690b8c2d661..fe5422dc289 100644
--- a/gcc/config/rs6000/float16.md
+++ b/gcc/config/rs6000/float16.md
@@ -963,3 +963,285 @@ (define_insn "xvcvbf16spn_v8bf"
"TARGET_BFLOAT16_HW"
"xvcvbf16spn %x0,%x1"
[(set_attr "type" "vecperm")])
+
+;; Optimize __bfloat16 binary and fma operations.  Unlike _Float16,
+;; where we have instructions to convert between HFmode and SFmode as
+;; scalar values, with BFmode we only have vector conversions.  Thus to do:
+;;
+;; __bfloat16 a, b, c;
+;; a = b + c;
+;;
+;; the GCC compiler would normally generate:
+;;
+;; lxsihzx 0,4,2 // load __bfloat16 value b
+;; lxsihzx 12,5,2 // load __bfloat16 value c
+;; xxsldwi 0,0,0,1 // shift b into bits 16..31
+;; xxsldwi 12,12,12,1 // shift c into bits 16..31
+;; xvcvbf16spn 0,0 // vector convert b into V4SFmode
+;; xvcvbf16spn 12,12 // vector convert c into V4SFmode
+;; xscvspdpn 0,0 // convert b into SFmode scalar
+;; xscvspdpn 12,12 // convert c into SFmode scalar
+;; fadds 0,0,12 // add b+c
+;; xscvdpspn 0,0 // convert b+c into SFmode memory format
+;; xvcvspbf16 0,0 // convert b+c into BFmode memory format
+;; stxsihx 0,3,2 // store b+c
+;;
+;; Using the following combiner patterns, the code generated would now
+;; be:
+;;
+;; lxsihzx 12,4,2 // load __bfloat16 value b
+;; lxsihzx 0,5,2 // load __bfloat16 value c
+;; xxspltw 12,12,1 // splat b into bits 16..31
+;; xxspltw 0,0,1 // splat c into bits 16..31
+;; xvcvbf16spn 12,12 // vector convert b into V4SFmode
+;; xvcvbf16spn 0,0 // vector convert c into V4SFmode
+;; xvaddsp 0,0,12 // vector b+c in V4SFmode
+;; xvcvspbf16 0,0 // convert b+c into BFmode memory format
+;; stxsihx 0,3,2 // store b+c
+;;
+;; We cannot simply define insns like 'addbf3' to keep the operation in
+;; BFmode, because GCC will not generate those patterns unless the user
+;; compiles with -Ofast.  Without -Ofast, GCC always converts BFmode to
+;; SFmode before doing the operation.
+
+(define_insn_and_split "*bfloat16_binary_op_internal1"
+ [(set (match_operand:SF 0 "vsx_register_operand")
+ (match_operator:SF 1 "fp16_binary_operator"
+ [(match_operand:SF 2 "bfloat16_v4sf_operand")
+ (match_operand:SF 3 "bfloat16_v4sf_operand")]))]
+ "TARGET_BFLOAT16_HW && can_create_pseudo_p ()
+ && (bfloat16_bf_operand (operands[2], SFmode)
+ || bfloat16_bf_operand (operands[3], SFmode))"
+ "#"
+ "&& 1"
+ [(pc)]
+{
+ bfloat16_operation_as_v4sf (GET_CODE (operands[1]), operands[0], operands[2],
+ operands[3], NULL_RTX, FP16_BINARY);
+ DONE;
+})
+
+(define_insn_and_split "*bfloat16_binary_op_internal2"
+ [(set (match_operand:BF 0 "vsx_register_operand")
+ (float_truncate:BF
+ (match_operator:SF 1 "fp16_binary_operator"
+ [(match_operand:SF 2 "bfloat16_v4sf_operand")
+ (match_operand:SF 3 "bfloat16_v4sf_operand")])))]
+ "TARGET_BFLOAT16_HW && can_create_pseudo_p ()
+ && (bfloat16_bf_operand (operands[2], SFmode)
+ || bfloat16_bf_operand (operands[3], SFmode))"
+ "#"
+ "&& 1"
+ [(pc)]
+{
+ bfloat16_operation_as_v4sf (GET_CODE (operands[1]), operands[0], operands[2],
+ operands[3], NULL_RTX, FP16_BINARY);
+ DONE;
+})
+
+(define_insn_and_split "*bfloat16_fma_internal1"
+ [(set (match_operand:SF 0 "vsx_register_operand")
+ (fma:SF
+ (match_operand:SF 1 "bfloat16_v4sf_operand")
+ (match_operand:SF 2 "bfloat16_v4sf_operand")
+ (match_operand:SF 3 "bfloat16_v4sf_operand")))]
+ "TARGET_BFLOAT16_HW && can_create_pseudo_p ()
+ && (bfloat16_bf_operand (operands[1], SFmode)
+ + bfloat16_bf_operand (operands[2], SFmode)
+ + bfloat16_bf_operand (operands[3], SFmode) >= 2)"
+ "#"
+ "&& 1"
+ [(pc)]
+{
+ bfloat16_operation_as_v4sf (FMA, operands[0], operands[1], operands[2],
+ operands[3], FP16_FMA);
+ DONE;
+})
+
+(define_insn_and_split "*bfloat16_fma_internal2"
+ [(set (match_operand:BF 0 "vsx_register_operand" "=wa")
+ (float_truncate:BF
+ (fma:SF
+ (match_operand:SF 1 "bfloat16_v4sf_operand")
+ (match_operand:SF 2 "bfloat16_v4sf_operand")
+ (match_operand:SF 3 "bfloat16_v4sf_operand"))))]
+ "TARGET_BFLOAT16_HW && can_create_pseudo_p ()
+ && (bfloat16_bf_operand (operands[1], SFmode)
+ + bfloat16_bf_operand (operands[2], SFmode)
+ + bfloat16_bf_operand (operands[3], SFmode) >= 2)"
+ "#"
+ "&& 1"
+ [(pc)]
+{
+ bfloat16_operation_as_v4sf (FMA, operands[0], operands[1], operands[2],
+ operands[3], FP16_FMA);
+ DONE;
+})
+
+(define_insn_and_split "*bfloat16_fms_internal1"
+ [(set (match_operand:SF 0 "vsx_register_operand")
+ (fma:SF
+ (match_operand:SF 1 "bfloat16_v4sf_operand")
+ (match_operand:SF 2 "bfloat16_v4sf_operand")
+ (neg:SF
+ (match_operand:SF 3 "bfloat16_v4sf_operand"))))]
+ "TARGET_BFLOAT16_HW && can_create_pseudo_p ()
+ && (bfloat16_bf_operand (operands[1], SFmode)
+ + bfloat16_bf_operand (operands[2], SFmode)
+ + bfloat16_bf_operand (operands[3], SFmode) >= 2)"
+ "#"
+ "&& 1"
+ [(pc)]
+{
+ bfloat16_operation_as_v4sf (FMA, operands[0], operands[1], operands[2],
+ operands[3], FP16_FMS);
+ DONE;
+})
+
+(define_insn_and_split "*bfloat16_fms_internal2"
+ [(set (match_operand:BF 0 "vsx_register_operand")
+ (float_truncate:BF
+ (fma:SF
+ (match_operand:SF 1 "bfloat16_v4sf_operand")
+ (match_operand:SF 2 "bfloat16_v4sf_operand")
+ (neg:SF
+ (match_operand:SF 3 "bfloat16_v4sf_operand")))))]
+ "TARGET_BFLOAT16_HW && can_create_pseudo_p ()
+ && (bfloat16_bf_operand (operands[1], SFmode)
+ + bfloat16_bf_operand (operands[2], SFmode)
+ + bfloat16_bf_operand (operands[3], SFmode) >= 2)"
+ "#"
+ "&& 1"
+ [(pc)]
+{
+ bfloat16_operation_as_v4sf (FMA, operands[0], operands[1], operands[2],
+ operands[3], FP16_FMS);
+ DONE;
+})
+
+(define_insn_and_split "*bfloat16_nfma_internal1"
+ [(set (match_operand:SF 0 "vsx_register_operand")
+ (neg:SF
+ (fma:SF
+ (match_operand:SF 1 "bfloat16_v4sf_operand")
+ (match_operand:SF 2 "bfloat16_v4sf_operand")
+ (match_operand:SF 3 "bfloat16_v4sf_operand"))))]
+ "TARGET_BFLOAT16_HW && can_create_pseudo_p ()
+ && (bfloat16_bf_operand (operands[1], SFmode)
+ + bfloat16_bf_operand (operands[2], SFmode)
+ + bfloat16_bf_operand (operands[3], SFmode) >= 2)"
+ "#"
+ "&& 1"
+ [(pc)]
+{
+ bfloat16_operation_as_v4sf (FMA, operands[0], operands[1], operands[2],
+ operands[3], FP16_NFMA);
+ DONE;
+})
+
+(define_insn_and_split "*bfloat16_nfma_internal2"
+ [(set (match_operand:BF 0 "vsx_register_operand" "=wa")
+ (float_truncate:BF
+ (neg:SF
+ (fma:SF
+ (match_operand:SF 1 "bfloat16_v4sf_operand")
+ (match_operand:SF 2 "bfloat16_v4sf_operand")
+ (match_operand:SF 3 "bfloat16_v4sf_operand")))))]
+ "TARGET_BFLOAT16_HW && can_create_pseudo_p ()
+ && (bfloat16_bf_operand (operands[1], SFmode)
+ + bfloat16_bf_operand (operands[2], SFmode)
+ + bfloat16_bf_operand (operands[3], SFmode) >= 2)"
+ "#"
+ "&& 1"
+ [(pc)]
+{
+ bfloat16_operation_as_v4sf (FMA, operands[0], operands[1], operands[2],
+ operands[3], FP16_NFMA);
+ DONE;
+})
+
+(define_insn_and_split "*bfloat16_nfma_internal3"
+ [(set (match_operand:BF 0 "vsx_register_operand" "=wa")
+ (neg:BF
+ (float_truncate:BF
+ (fma:SF
+ (match_operand:SF 1 "bfloat16_v4sf_operand")
+ (match_operand:SF 2 "bfloat16_v4sf_operand")
+ (match_operand:SF 3 "bfloat16_v4sf_operand")))))]
+ "TARGET_BFLOAT16_HW && can_create_pseudo_p ()
+ && (bfloat16_bf_operand (operands[1], SFmode)
+ + bfloat16_bf_operand (operands[2], SFmode)
+ + bfloat16_bf_operand (operands[3], SFmode) >= 2)"
+ "#"
+ "&& 1"
+ [(pc)]
+{
+ bfloat16_operation_as_v4sf (FMA, operands[0], operands[1], operands[2],
+ operands[3], FP16_NFMA);
+ DONE;
+})
+
+(define_insn_and_split "*bfloat16_nfms_internal1"
+ [(set (match_operand:SF 0 "vsx_register_operand")
+ (neg:SF
+ (fma:SF
+ (match_operand:SF 1 "bfloat16_v4sf_operand")
+ (match_operand:SF 2 "bfloat16_v4sf_operand")
+ (neg:SF
+ (match_operand:SF 3 "bfloat16_v4sf_operand")))))]
+ "TARGET_BFLOAT16_HW && can_create_pseudo_p ()
+ && (bfloat16_bf_operand (operands[1], SFmode)
+ + bfloat16_bf_operand (operands[2], SFmode)
+ + bfloat16_bf_operand (operands[3], SFmode) >= 2)"
+ "#"
+ "&& 1"
+ [(pc)]
+{
+ bfloat16_operation_as_v4sf (FMA, operands[0], operands[1], operands[2],
+ operands[3], FP16_NFMS);
+ DONE;
+})
+
+(define_insn_and_split "*bfloat16_nfms_internal2"
+ [(set (match_operand:BF 0 "vsx_register_operand")
+ (float_truncate:BF
+ (neg:SF
+ (fma:SF
+ (match_operand:SF 1 "bfloat16_v4sf_operand")
+ (match_operand:SF 2 "bfloat16_v4sf_operand")
+ (neg:SF
+ (match_operand:SF 3 "bfloat16_v4sf_operand"))))))]
+ "TARGET_BFLOAT16_HW && can_create_pseudo_p ()
+ && (bfloat16_bf_operand (operands[1], SFmode)
+ + bfloat16_bf_operand (operands[2], SFmode)
+ + bfloat16_bf_operand (operands[3], SFmode) >= 2)"
+ "#"
+ "&& 1"
+ [(pc)]
+{
+ bfloat16_operation_as_v4sf (FMA, operands[0], operands[1], operands[2],
+ operands[3], FP16_NFMS);
+ DONE;
+})
+
+(define_insn_and_split "*bfloat16_nfms_internal3"
+ [(set (match_operand:BF 0 "vsx_register_operand")
+ (neg:BF
+ (float_truncate:BF
+ (fma:SF
+ (match_operand:SF 1 "bfloat16_v4sf_operand")
+ (match_operand:SF 2 "bfloat16_v4sf_operand")
+ (neg:SF
+ (match_operand:SF 3 "bfloat16_v4sf_operand"))))))]
+ "TARGET_BFLOAT16_HW && can_create_pseudo_p ()
+ && (bfloat16_bf_operand (operands[1], SFmode)
+ + bfloat16_bf_operand (operands[2], SFmode)
+ + bfloat16_bf_operand (operands[3], SFmode) >= 2)"
+ "#"
+ "&& 1"
+ [(pc)]
+{
+ bfloat16_operation_as_v4sf (FMA, operands[0], operands[1], operands[2],
+ operands[3], FP16_NFMS);
+ DONE;
+})
diff --git a/gcc/config/rs6000/predicates.md b/gcc/config/rs6000/predicates.md
index 785d09b9423..172991de366 100644
--- a/gcc/config/rs6000/predicates.md
+++ b/gcc/config/rs6000/predicates.md
@@ -2276,3 +2276,79 @@ (define_predicate "fp16_xxspltiw_constant"
return constant_generates_xxspltiw (&vsx_const);
})
+
+;; Return 1 if this is a 16-bit floating point operand that can be used
+;; in an add, subtract, or multiply operation that uses the vector
+;; conversion function.
+(define_predicate "fp16_reg_or_constant_operand"
+ (match_code "reg,subreg,const_double")
+{
+ if (REG_P (op) || SUBREG_P (op))
+ return vsx_register_operand (op, mode);
+
+ if (CONST_DOUBLE_P (op))
+ return fp16_xxspltiw_constant (op, mode);
+
+ return false;
+})
+
+;; Match binary operators where we convert a BFmode operand into an
+;; SFmode operand, so that we can optimize the BFmode operation to do
+;; the operation in vector mode rather than converting the BFmode to a
+;; V8BFmode vector, converting that V8BFmode vector to V4SFmode, and
+;; then converting the V4SFmode element to SFmode scalar.
+(define_predicate "fp16_binary_operator"
+ (match_code "plus,minus,mult,smax,smin"))
+
+;; Match bfloat16/float operands that can be optimized to do the
+;; operation in V4SFmode.
+(define_predicate "bfloat16_v4sf_operand"
+ (match_code "reg,subreg,const_double,float_extend,float_truncate")
+{
+ if (mode != BFmode && mode != SFmode)
+ return false;
+
+ if (REG_P (op) || SUBREG_P (op))
+ return register_operand (op, mode);
+
+ if (CONST_DOUBLE_P (op))
+ return true;
+
+ if (GET_CODE (op) == FLOAT_EXTEND)
+ {
+ rtx op_arg = XEXP (op, 0);
+ return (mode == SFmode
+ && GET_MODE (op_arg) == BFmode
+ && (REG_P (op_arg) || SUBREG_P (op_arg)));
+ }
+
+ if (GET_CODE (op) == FLOAT_TRUNCATE)
+ {
+ rtx op_arg = XEXP (op, 0);
+ return (mode == BFmode
+ && GET_MODE (op_arg) == SFmode
+ && (REG_P (op_arg) || SUBREG_P (op_arg)));
+ }
+
+ return false;
+})
+
+;; Match an operand that was originally a BFmode value, to prevent
+;; operations involving only SFmode values from being rewritten as
+;; vector operations.
+(define_predicate "bfloat16_bf_operand"
+ (match_code "reg,subreg,const_double,float_extend")
+{
+ if (mode == BFmode || GET_MODE (op) == BFmode)
+ return true;
+
+ if (mode != SFmode)
+ return false;
+
+ if (GET_MODE (op) == SFmode
+ && GET_CODE (op) == FLOAT_EXTEND
+ && GET_MODE (XEXP (op, 0)) == BFmode)
+ return true;
+
+ return false;
+})
diff --git a/gcc/config/rs6000/rs6000-protos.h b/gcc/config/rs6000/rs6000-protos.h
index dd5fcd69e83..3665a405cfd 100644
--- a/gcc/config/rs6000/rs6000-protos.h
+++ b/gcc/config/rs6000/rs6000-protos.h
@@ -271,6 +271,8 @@ enum fp16_operation {
extern void fp16_vectorization (enum rtx_code, rtx, rtx, rtx, rtx,
enum fp16_operation);
+extern void bfloat16_operation_as_v4sf (enum rtx_code, rtx, rtx, rtx, rtx,
+ enum fp16_operation);
#endif /* RTX_CODE */
#ifdef TREE_CODE
--
2.51.1
--
Michael Meissner, IBM
PO Box 98, Ayer, Massachusetts, USA, 01432
email: [email protected]