https://gcc.gnu.org/g:5b6529709f13057c05a80d818b7e5f6321949568

commit 5b6529709f13057c05a80d818b7e5f6321949568
Author: Michael Meissner <[email protected]>
Date:   Thu Nov 6 00:27:31 2025 -0500

    Revert changes

Diff:
---
 gcc/ChangeLog.test                |   30 +-
 gcc/config.gcc                    |   19 +
 gcc/config/rs6000/float16.md      | 1139 +++++++++++++++++++++++++++++++++++++
 gcc/config/rs6000/predicates.md   |   76 +++
 gcc/config/rs6000/rs6000-c.cc     |    4 +
 gcc/config/rs6000/rs6000-call.cc  |   34 +-
 gcc/config/rs6000/rs6000-cpus.def |   13 +-
 gcc/config/rs6000/rs6000-protos.h |   15 +
 gcc/config/rs6000/t-rs6000        |    4 +
 9 files changed, 1325 insertions(+), 9 deletions(-)

diff --git a/gcc/ChangeLog.test b/gcc/ChangeLog.test
index f47c02f47d9e..79b0fb7849d3 100644
--- a/gcc/ChangeLog.test
+++ b/gcc/ChangeLog.test
@@ -44,7 +44,35 @@ gcc/
        Define __FLOAT16_HW__ if we have hardware support for _Float16
        conversions.
 
-==================== Branch work223-test, patch #430 patch #460 was reverted ====================
+==================== Branch work223-test, patch #430 patch #460 ====================
+
+Add __bfloat16 optimizations.
+
+2025-11-05  Michael Meissner  <[email protected]>
+
+gcc/
+
+       * config/rs6000/float16.cc (bfloat16_operation_as_v4sf): New function to
+       add __bfloat16 optimizations.
+       * config/rs6000/float16.md (VFP16): New mode iterator.
+       (bfloat16_binary_op_internal1): Add optimizations for __bfloat16.
+       (bfloat16_binary_op_internal2): Likewise.
+       (bfloat16_fma_internal1): Likewise.
+       (bfloat16_fma_internal2): Likewise.
+       (bfloat16_fms_internal1): Likewise.
+       (bfloat16_fms_internal2): Likewise.
+       (bfloat16_nfma_internal1): Likewise.
+       (bfloat16_nfma_internal2): Likewise.
+       (bfloat16_nfms_internal1): Likewise.
+       (bfloat16_nfms_internal2): Likewise.
+       (bfloat16_nfms_internal3): Likewise.
+       (__bfloat16 peephole): New peephole.
+       * config/rs6000/predicates.md (fp16_reg_or_constant_operand): New
+       predicate.
+       (bfloat16_v4sf_operand): Likewise.
+       (bfloat16_bf_operand): Likewise.
+       * config/rs6000/rs6000-protos.h (bfloat16_operation_as_v4sf): New
+       declaration.
 
 ==================== Branch work223-test, patch #430 patch #459 ====================
 
diff --git a/gcc/config.gcc b/gcc/config.gcc
index a753c018ae1c..ad8f0bd4e26c 100644
--- a/gcc/config.gcc
+++ b/gcc/config.gcc
@@ -533,6 +533,7 @@ powerpc*-*-*)
        extra_objs="rs6000-string.o rs6000-p8swap.o rs6000-logue.o"
        extra_objs="${extra_objs} rs6000-call.o rs6000-pcrel-opt.o"
        extra_objs="${extra_objs} rs6000-builtins.o rs6000-builtin.o"
+       extra_objs="${extra_objs} float16.o"
        extra_headers="ppc-asm.h altivec.h htmintrin.h htmxlintrin.h"
        extra_headers="${extra_headers} bmi2intrin.h bmiintrin.h"
        extra_headers="${extra_headers} xmmintrin.h mm_malloc.h emmintrin.h"
@@ -5798,6 +5799,24 @@ case "${target}" in
                elif test x$with_long_double_format = xibm; then
                    tm_defines="${tm_defines} TARGET_IEEEQUAD_DEFAULT=0"
                fi
+
+               # Test if we should enable 16-bit floating point on the platforms
+               # where we can support __bfloat16 and _Float16.
+               if test x$with_powerpc_float16 = xyes; then
+                   tm_defines="${tm_defines} POWERPC_FLOAT16_DEFAULT=1"
+
+               elif test x$with_powerpc_16bit_floating_point = xyes; then
+                   tm_defines="${tm_defines} POWERPC_FLOAT16_DEFAULT=0"
+               fi
+
+               # Test if we should disable the warning about passing
+               # and returning 16-bit floating point values.
+               if test x$with_powerpc_float16_disable_warning = xyes; then
+                   tm_defines="${tm_defines} POWERPC_FLOAT16_DISABLE_WARNING=1"
+
+               elif test x$with_powerpc_float16_disable_warning = xno; then
+                   tm_defines="${tm_defines} POWERPC_FLOAT16_DISABLE_WARNING=0"
+               fi
                ;;
 
        s390*-*-*)
diff --git a/gcc/config/rs6000/float16.md b/gcc/config/rs6000/float16.md
index fec4bb87fd09..6060746cc4ca 100644
--- a/gcc/config/rs6000/float16.md
+++ b/gcc/config/rs6000/float16.md
@@ -25,6 +25,28 @@
 (define_mode_iterator FP16 [(BF "TARGET_FLOAT16")
                            (HF "TARGET_FLOAT16")])
 
+(define_mode_iterator VFP16 [(V8BF "TARGET_BFLOAT16")
+                            (V8HF "TARGET_FLOAT16")])
+
+;; Mode iterator for 16-bit floating point modes on machines with
+;; hardware support both as a scalar and as a vector.
+(define_mode_iterator FP16_HW [(BF "TARGET_BFLOAT16_HW")
+                              (HF "TARGET_FLOAT16_HW")])
+
+(define_mode_iterator VFP16_HW [(V8BF "TARGET_BFLOAT16_HW")
+                               (V8HF "TARGET_FLOAT16_HW")])
+
+;; Mode iterator for floating point modes other than SF/DFmode that we
+;; convert to/from _Float16 (HFmode) via DFmode.
+(define_mode_iterator fp16_float_convert [TF KF IF SD DD TD])
+
+;; Mode attribute giving the instruction to convert the even
+;; V8HFmode or V8BFmode elements to V4SFmode
+(define_mode_attr cvt_fp16_to_v4sf_insn [(BF   "xvcvbf16spn")
+                                        (HF   "xvcvhpsp")
+                                        (V8BF "xvcvbf16spn")
+                                        (V8HF "xvcvhpsp")])
+
 ;; Mode attribute giving the vector mode for a 16-bit floating point
 ;; scalar in both upper and lower case.
 (define_mode_attr FP16_VECTOR8 [(BF "V8BF")
@@ -32,6 +54,35 @@
 
 (define_mode_attr fp16_vector8 [(BF "v8bf")
                                (HF "v8hf")])
+
+;; Mode attribute giving the vector mode with 4 16-bit floating point
+;; elements given a scalar or 8 element vector.
+(define_mode_attr FP16_VECTOR4 [(BF   "V4BF")
+                               (HF   "V4HF")
+                               (V8BF "V4BF")
+                               (V8HF "V4HF")])
+
+;; Binary operators for bfloat16/float16 vectorization.
+(define_code_iterator FP16_BINARY_OP [plus minus mult smax smin])
+
+;; Standard names for the unary/binary/ternary operators
+(define_code_attr fp16_names [(abs   "abs")
+                             (fma   "fma")
+                             (plus  "add")
+                             (minus "sub")
+                             (mult  "mul")
+                             (neg   "neg")
+                             (smax  "smax")
+                             (smin  "smin")])
+
+;; UNSPEC constants
+(define_c_enum "unspec"
+  [UNSPEC_FP16_SHIFT_LEFT_32BIT
+   UNSPEC_CVT_FP16_TO_V4SF
+   UNSPEC_XXSPLTW_FP16
+   UNSPEC_XVCVSPBF16_BF
+   UNSPEC_XVCVSPHP_V8HF
+   UNSPEC_XVCVSPBF16_V8BF])
 
 ;; _Float16 and __bfloat16 moves
 (define_expand "mov<mode>"
@@ -122,3 +173,1091 @@
 }
   [(set_attr "type" "veclogical,vecperm")
    (set_attr "prefixed" "*,yes")])
+
+;; Convert IEEE 16-bit floating point to/from other floating point modes.
+
+(define_insn "extendhf<mode>2"
+  [(set (match_operand:SFDF 0 "vsx_register_operand" "=wa")
+       (float_extend:SFDF
+        (match_operand:HF 1 "vsx_register_operand" "wa")))]
+  "TARGET_FLOAT16_HW"
+  "xscvhpdp %x0,%x1"
+  [(set_attr "type" "fpsimple")])
+
+(define_insn "trunc<mode>hf2"
+  [(set (match_operand:HF 0 "vsx_register_operand" "=wa")
+       (float_truncate:HF
+        (match_operand:SFDF 1 "vsx_register_operand" "wa")))]
+  "TARGET_FLOAT16_HW"
+  "xscvdphp %x0,%x1"
+  [(set_attr "type" "fpsimple")])
+
+;; Convert BFmode to SFmode/DFmode.
+;; 3 instructions are generated:
+;;     VSPLTH          -- duplicate BFmode into all elements
+;;     XVCVBF16SPN     -- convert even BFmode elements to SFmode
+;;     XSCVSPNDP       -- convert memory format of SFmode to DFmode.
+(define_insn_and_split "extendbf<mode>2"
+  [(set (match_operand:SFDF 0 "vsx_register_operand" "=wa")
+       (float_extend:SFDF
+        (match_operand:BF 1 "vsx_register_operand" "v")))
+   (clobber (match_scratch:V8BF 2 "=v"))]
+  "TARGET_BFLOAT16_HW"
+  "#"
+  "&& 1"
+  [(pc)]
+{
+  rtx op0 = operands[0];
+  rtx op1 = operands[1];
+  rtx op2_v8bf = operands[2];
+
+  if (GET_CODE (op2_v8bf) == SCRATCH)
+    op2_v8bf = gen_reg_rtx (V8BFmode);
+
+  rtx op2_v4sf = gen_lowpart (V4SFmode, op2_v8bf);
+
+  /* XXSLDWI -- shift BFmode element into the upper 32 bits.  */
+  emit_insn (gen_v8bf_shift_left_32bit (op2_v8bf, op1));
+
+  /* XVCVBF16SPN -- convert even V8BFmode elements to V4SFmode.  */
+  emit_insn (gen_cvt_fp16_to_v4sf_v8bf (op2_v4sf, op2_v8bf));
+
+  /* XSCVSPNDP -- convert single V4SFmode element to DFmode.  */
+  emit_insn (GET_MODE (op0) == SFmode
+            ? gen_xscvspdpn_sf (op0, op2_v4sf)
+            : gen_vsx_xscvspdpn (op0, op2_v4sf));
+
+  DONE;
+}
+  [(set_attr "type" "fpsimple")
+   (set_attr "length" "12")])
+
+;; Convert a SFmode scalar represented as DFmode to elements 0 and 1 of
+;; V4SFmode.
+(define_insn "xscvdpspn_sf"
+  [(set (match_operand:V4SF 0 "vsx_register_operand" "=wa")
+       (unspec:V4SF [(match_operand:SF 1 "vsx_register_operand" "wa")]
+                    UNSPEC_VSX_CVSPDP))]
+  "VECTOR_UNIT_VSX_P (SFmode)"
+  "xscvdpspn %x0,%x1"
+  [(set_attr "type" "fp")])
+
+;; Convert element 0 of a V4SFmode to scalar SFmode (which on the
+;; PowerPC uses the DFmode encoding).
+(define_insn "xscvspdpn_sf"
+  [(set (match_operand:SF 0 "vsx_register_operand" "=wa")
+       (unspec:SF [(match_operand:V4SF 1 "vsx_register_operand" "wa")]
+                  UNSPEC_VSX_CVSPDPN))]
+  "TARGET_XSCVSPDPN"
+  "xscvspdpn %x0,%x1"
+  [(set_attr "type" "fp")])
+
+;; Vector shift left by 32 bits to get the 16-bit floating point value
+;; into the upper 32 bits for the conversion.
+(define_insn "<fp16_vector8>_shift_left_32bit"
+  [(set (match_operand:<FP16_VECTOR8> 0 "vsx_register_operand" "=wa")
+        (unspec:<FP16_VECTOR8>
+        [(match_operand:FP16_HW 1 "vsx_register_operand" "wa")]
+        UNSPEC_FP16_SHIFT_LEFT_32BIT))]
+  ""
+  "xxsldwi %x0,%x1,%x1,1"
+  [(set_attr "type" "vecperm")])
+
+;; Convert SFmode/DFmode to BFmode.
+;; 2 instructions are generated:
+;;     XSCVDPSPN       -- convert SFmode/DFmode scalar to V4SFmode
+;;     XVCVSPBF16      -- convert V4SFmode to even V8BFmode
+
+(define_insn_and_split "trunc<mode>bf2"
+  [(set (match_operand:BF 0 "vsx_register_operand" "=wa")
+       (float_truncate:BF
+        (match_operand:SFDF 1 "vsx_register_operand" "wa")))
+   (clobber (match_scratch:V4SF 2 "=wa"))]
+  "TARGET_BFLOAT16_HW"
+  "#"
+  "&& 1"
+  [(pc)]
+{
+  rtx op0 = operands[0];
+  rtx op1 = operands[1];
+  rtx op2 = operands[2];
+
+  if (GET_CODE (op2) == SCRATCH)
+    op2 = gen_reg_rtx (V4SFmode);
+
+  emit_insn (GET_MODE (op1) == SFmode
+            ? gen_xscvdpspn_sf (op2, op1)
+            : gen_vsx_xscvdpspn (op2, op1));
+
+  emit_insn (gen_xvcvspbf16_bf (op0, op2));
+  DONE;
+}
+  [(set_attr "type" "fpsimple")])
+
+(define_insn "vsx_xscvdpspn_sf"
+  [(set (match_operand:V4SF 0 "vsx_register_operand" "=wa")
+       (unspec:V4SF [(match_operand:SF 1 "vsx_register_operand" "wa")]
+                    UNSPEC_VSX_CVDPSPN))]
+  "TARGET_XSCVDPSPN"
+  "xscvdpspn %x0,%x1"
+  [(set_attr "type" "fp")])
+
+;; Convert the even elements of a vector 16-bit floating point to
+;; V4SFmode.  Deal with little endian vs. big endian element ordering
+;; in identifying which elements are converted.
+
+(define_expand "cvt_fp16_to_v4sf_<mode>"
+  [(set (match_operand:V4SF 0 "vsx_register_operand")
+       (float_extend:V4SF
+        (vec_select:<FP16_VECTOR4>
+         (match_operand:VFP16_HW 1 "vsx_register_operand")
+         (parallel [(match_dup 2)
+                    (match_dup 3)
+                    (match_dup 4)
+                    (match_dup 5)]))))]
+  ""
+{
+  int endian_adjust = WORDS_BIG_ENDIAN ? 0 : 1;
+  operands[2] = GEN_INT (0 + endian_adjust);
+  operands[3] = GEN_INT (2 + endian_adjust);
+  operands[4] = GEN_INT (4 + endian_adjust);
+  operands[5] = GEN_INT (6 + endian_adjust);
+})
+
+(define_insn "*cvt_fp16_to_v4sf_<mode>_le"
+  [(set (match_operand:V4SF 0 "vsx_register_operand")
+       (float_extend:V4SF
+        (vec_select:<FP16_VECTOR4>
+         (match_operand:VFP16_HW 1 "vsx_register_operand")
+         (parallel [(const_int 1)
+                    (const_int 3)
+                    (const_int 5)
+                    (const_int 7)]))))]
+  "!WORDS_BIG_ENDIAN"
+  "<cvt_fp16_to_v4sf_insn> %x0,%x1"
+  [(set_attr "type" "vecfloat")])
+
+(define_insn "*cvt_fp16_to_v4sf_<mode>_be"
+  [(set (match_operand:V4SF 0 "vsx_register_operand")
+       (float_extend:V4SF
+        (vec_select:<FP16_VECTOR4>
+         (match_operand:VFP16_HW 1 "vsx_register_operand")
+         (parallel [(const_int 0)
+                    (const_int 2)
+                    (const_int 4)
+                    (const_int 6)]))))]
+  "WORDS_BIG_ENDIAN"
+  "<cvt_fp16_to_v4sf_insn> %x0,%x1"
+  [(set_attr "type" "vecfloat")])
+
+;; Duplicate and convert a 16-bit floating point scalar to V4SFmode.
+
+(define_insn_and_split "*dup_<mode>_to_v4sf"
+  [(set (match_operand:V4SF 0 "vsx_register_operand" "=wa")
+       (vec_duplicate:V4SF
+        (float_extend:SF
+         (match_operand:FP16_HW 1 "vsx_register_operand" "wa"))))]
+  ""
+  "#"
+  "&& 1"
+  [(pc)]
+{
+  rtx op0 = operands[0];
+  rtx op1 = operands[1];
+  rtx op0_vfp16 = gen_lowpart (<FP16_VECTOR8>mode, op0);
+
+  emit_insn (gen_xxspltw_<mode> (op0, op1));
+  emit_insn (gen_cvt_fp16_to_v4sf_<fp16_vector8> (op0, op0_vfp16));
+  DONE;
+}
+  [(set_attr "length" "8")
+   (set_attr "type" "vecperm")])
+
+;; Duplicate a HF/BF value so it can be used for xvcvhpspn/xvcvbf16spn.
+;; Because xvcvhpspn/xvcvbf16spn only uses the even elements, we can
+;; use xxspltw instead of vspltw.  This has the advantage that the
+;; register allocator can use any of the 64 VSX registers instead of
+;; being limited to the 32 Altivec registers that VSPLTH would require.
+
+(define_insn "xxspltw_<mode>"
+  [(set (match_operand:V4SF 0 "vsx_register_operand" "=wa")
+       (unspec:V4SF [(match_operand:FP16_HW 1 "vsx_register_operand" "wa")]
+                    UNSPEC_XXSPLTW_FP16))]
+  ""
+  "xxspltw %x0,%x1,1"
+  [(set_attr "type" "vecperm")])
+
+;; Convert a bfloat16 floating point scalar that has been splatted to
+;; V4SFmode.
+
+(define_insn "xvcvbf16spn_bf"
+  [(set (match_operand:V4SF 0 "vsx_register_operand" "=wa")
+       (unspec:V4SF [(match_operand:V4SF 1 "vsx_register_operand" "wa")]
+                    UNSPEC_CVT_FP16_TO_V4SF))]
+  "TARGET_BFLOAT16_HW"
+  "xvcvbf16spn %x0,%x1"
+  [(set_attr "type" "vecperm")])
+
+;; Convert a V4SFmode vector to a 16-bit floating point scalar.  We
+;; only care about the 2nd V4SFmode element, which is the element we
+;; converted the 16-bit scalar (4th element) to V4SFmode to do the
+;; operation, and converted it back.
+
+(define_insn "xvcvspbf16_bf"
+  [(set (match_operand:BF 0 "vsx_register_operand" "=wa")
+       (unspec:BF [(match_operand:V4SF 1 "vsx_register_operand" "wa")]
+                  UNSPEC_XVCVSPBF16_BF))]
+  "TARGET_BFLOAT16_HW"
+  "xvcvspbf16 %x0,%x1"
+  [(set_attr "type" "vecfloat")])
+
+;; Convert between HFmode/BFmode and 128-bit binary floating point and
+;; decimal floating point types.  We use convert_move since some of the
+;; types might not have valid RTX expanders.  We use DFmode as the
+;; intermediate conversion destination.
+
+(define_expand "extend<FP16_HW:mode><fp16_float_convert:mode>2"
+  [(set (match_operand:fp16_float_convert 0 "vsx_register_operand")
+       (float_extend:fp16_float_convert
+        (match_operand:FP16_HW 1 "vsx_register_operand")))]
+  ""
+{
+  rtx df_tmp = gen_reg_rtx (DFmode);
+  emit_insn (gen_extend<FP16_HW:mode>df2 (df_tmp, operands[1]));
+  convert_move (operands[0], df_tmp, 0);
+  DONE;
+})
+
+(define_expand "trunc<fp16_float_convert:mode><FP16_HW:mode>2"
+  [(set (match_operand:FP16_HW 0 "vsx_register_operand")
+       (float_truncate:FP16_HW
+        (match_operand:fp16_float_convert 1 "vsx_register_operand")))]
+  ""
+{
+  rtx df_tmp = gen_reg_rtx (DFmode);
+
+  convert_move (df_tmp, operands[1], 0);
+  emit_insn (gen_truncdf<FP16_HW:mode>2 (operands[0], df_tmp));
+  DONE;
+})
+
+;; Convert integers to 16-bit floating point modes.
+(define_expand "float<GPR:mode><FP16_HW:mode>2"
+  [(set (match_operand:FP16_HW 0 "vsx_register_operand")
+       (float:FP16_HW
+        (match_operand:GPR 1 "nonimmediate_operand")))]
+  ""
+{
+  rtx df_tmp = gen_reg_rtx (DFmode);
+  emit_insn (gen_float<GPR:mode>df2 (df_tmp, operands[1]));
+  emit_insn (gen_truncdf<FP16_HW:mode>2 (operands[0], df_tmp));
+  DONE;
+})
+
+(define_expand "floatuns<GPR:mode><FP16_HW:mode>2"
+  [(set (match_operand:FP16_HW 0 "vsx_register_operand")
+       (unsigned_float:FP16_HW
+        (match_operand:GPR 1 "nonimmediate_operand")))]
+  ""
+{
+  rtx df_tmp = gen_reg_rtx (DFmode);
+  emit_insn (gen_floatuns<GPR:mode>df2 (df_tmp, operands[1]));
+  emit_insn (gen_truncdf<FP16_HW:mode>2 (operands[0], df_tmp));
+  DONE;
+})
+
+;; Convert 16-bit floating point modes to integers
+(define_expand "fix_trunc<FP16_HW:mode><GPR:mode>2"
+  [(set (match_operand:GPR 0 "vsx_register_operand")
+       (fix:GPR
+        (match_operand:FP16_HW 1 "vsx_register_operand")))]
+  ""
+{
+  rtx df_tmp = gen_reg_rtx (DFmode);
+  emit_insn (gen_extend<FP16_HW:mode>df2 (df_tmp, operands[1]));
+  emit_insn (gen_fix_truncdf<GPR:mode>2 (operands[0], df_tmp));
+  DONE;
+})
+
+(define_expand "fixuns_trunc<FP16_HW:mode><GPR:mode>2"
+  [(set (match_operand:GPR 0 "vsx_register_operand")
+       (unsigned_fix:GPR
+        (match_operand:FP16_HW 1 "vsx_register_operand")))]
+  ""
+{
+  rtx df_tmp = gen_reg_rtx (DFmode);
+  emit_insn (gen_extend<FP16_HW:mode>df2 (df_tmp, operands[1]));
+  emit_insn (gen_fixuns_truncdf<GPR:mode>2 (operands[0], df_tmp));
+  DONE;
+})
+
+;; Negate 16-bit floating point by XOR with -0.0.
+
+(define_insn_and_split "neg<mode>2"
+  [(set (match_operand:FP16 0 "gpc_reg_operand" "=wa,?wr")
+       (neg:FP16 (match_operand:FP16 1 "gpc_reg_operand" "wa,wr")))
+   (clobber (match_scratch:FP16 2 "=&wa,&r"))]
+  ""
+  "#"
+  "&& 1"
+  [(set (match_dup 2)
+       (match_dup 3))
+   (set (match_dup 0)
+       (xor:FP16 (match_dup 1)
+                 (match_dup 2)))]
+{
+  if (GET_CODE (operands[2]) == SCRATCH)
+    operands[2] = gen_reg_rtx (<MODE>mode);
+
+  REAL_VALUE_TYPE dconst;
+
+  gcc_assert (real_from_string (&dconst, "-0.0") == 0);
+
+  rtx rc = const_double_from_real_value (dconst, <MODE>mode);
+  if (!TARGET_PREFIXED)
+    rc = force_const_mem (<MODE>mode, rc);
+
+  operands[3] = rc;
+}
+  [(set_attr "type" "veclogical,integer")
+   (set_attr "length" "16")])
+
+;; 16-bit floating point absolute value
+
+(define_insn_and_split "abs<mode>2"
+  [(set (match_operand:FP16 0 "gpc_reg_operand" "=wa,?wr")
+       (abs:FP16
+        (match_operand:FP16 1 "gpc_reg_operand" "wa,wr")))
+   (clobber (match_scratch:FP16 2 "=&wa,&r"))]
+  ""
+  "#"
+  "&& 1"
+  [(set (match_dup 2)
+       (match_dup 3))
+   (set (match_dup 0)
+       (and:FP16 (match_dup 1)
+                 (not:FP16 (match_dup 2))))]
+{
+  if (GET_CODE (operands[2]) == SCRATCH)
+    operands[2] = gen_reg_rtx (<MODE>mode);
+
+  REAL_VALUE_TYPE dconst;
+
+  gcc_assert (real_from_string (&dconst, "-0.0") == 0);
+
+  rtx rc = const_double_from_real_value (dconst, <MODE>mode);
+
+  if (!TARGET_PREFIXED)
+    rc = force_const_mem (<MODE>mode, rc);
+
+  operands[3] = rc;
+}
+  [(set_attr "type" "veclogical,integer")
+   (set_attr "length" "16")])
+
+;; 16-bit negative floating point absolute value
+
+(define_insn_and_split "*nabs<mode>2"
+  [(set (match_operand:FP16 0 "gpc_reg_operand" "=wa,?wr")
+       (neg:FP16
+        (abs:FP16
+         (match_operand:FP16 1 "gpc_reg_operand" "wa,wr"))))
+   (clobber (match_scratch:FP16 2 "=&wa,&r"))]
+  ""
+  "#"
+  "&& 1"
+  [(set (match_dup 2)
+       (match_dup 3))
+   (set (match_dup 0)
+       (ior:FP16 (match_dup 1)
+                 (match_dup 2)))]
+{
+  if (GET_CODE (operands[2]) == SCRATCH)
+    operands[2] = gen_reg_rtx (<MODE>mode);
+
+  REAL_VALUE_TYPE dconst;
+
+  gcc_assert (real_from_string (&dconst, "-0.0") == 0);
+  rtx rc = const_double_from_real_value (dconst, <MODE>mode);
+
+  if (!TARGET_PREFIXED)
+    rc = force_const_mem (<MODE>mode, rc);
+
+  operands[3] = rc;
+}
+  [(set_attr "type" "veclogical,integer")
+   (set_attr "length" "16")])
+
+;; Add logical operations for 16-bit floating point types that are used
+;; for things like negate, abs, and negative abs.  Possibly in the
+;; future we might need logical operators for extracting exponents and
+;; mantissas.
+(define_expand "and<mode>3"
+  [(set (match_operand:FP16 0 "gpc_reg_operand")
+       (and:FP16 (match_operand:FP16 1 "gpc_reg_operand")
+                 (match_operand:FP16 2 "gpc_reg_operand")))]
+  ""
+  "")
+
+(define_expand "ior<mode>3"
+  [(set (match_operand:FP16 0 "gpc_reg_operand")
+        (ior:FP16 (match_operand:FP16 1 "gpc_reg_operand")
+                 (match_operand:FP16 2 "gpc_reg_operand")))]
+  ""
+  "")
+
+(define_expand "xor<mode>3"
+  [(set (match_operand:FP16 0 "gpc_reg_operand")
+        (xor:FP16 (match_operand:FP16 1 "gpc_reg_operand")
+                 (match_operand:FP16 2 "gpc_reg_operand")))]
+  ""
+  "")
+
+(define_expand "nor<mode>3"
+  [(set (match_operand:FP16 0 "gpc_reg_operand")
+       (and:FP16
+        (not:FP16 (match_operand:FP16 1 "gpc_reg_operand"))
+        (not:FP16 (match_operand:FP16 2 "gpc_reg_operand"))))]
+  ""
+  "")
+
+(define_expand "andn<mode>3"
+  [(set (match_operand:FP16 0 "gpc_reg_operand")
+        (and:FP16
+        (not:FP16 (match_operand:FP16 2 "gpc_reg_operand"))
+        (match_operand:FP16 1 "gpc_reg_operand")))]
+  ""
+  "")
+
+(define_expand "eqv<mode>3"
+  [(set (match_operand:FP16 0 "gpc_reg_operand")
+       (not:FP16
+        (xor:FP16 (match_operand:FP16 1 "gpc_reg_operand")
+                  (match_operand:FP16 2 "gpc_reg_operand"))))]
+  ""
+  "")
+
+;; Rewrite nand into canonical form
+(define_expand "nand<mode>3"
+  [(set (match_operand:FP16 0 "gpc_reg_operand")
+       (ior:FP16
+        (not:FP16 (match_operand:FP16 1 "gpc_reg_operand"))
+        (not:FP16 (match_operand:FP16 2 "gpc_reg_operand"))))]
+  ""
+  "")
+
+;; The canonical form is to have the negated element first, so we need to
+;; reverse arguments.
+(define_expand "iorn<mode>3"
+  [(set (match_operand:FP16 0 "gpc_reg_operand")
+       (ior:FP16
+        (not:FP16 (match_operand:FP16 2 "gpc_reg_operand"))
+        (match_operand:FP16 1 "gpc_reg_operand")))]
+  ""
+  "")
+
+;; AND, IOR, and XOR insns.  Unlike HImode operations prefer using
+;; floating point/vector registers over GPRs.
+(define_insn "*bool<mode>3"
+  [(set (match_operand:FP16 0 "gpc_reg_operand" "=wa,r")
+       (match_operator:FP16 3 "boolean_operator"
+        [(match_operand:FP16 1 "gpc_reg_operand" "wa,r")
+         (match_operand:FP16 2 "gpc_reg_operand" "wa,r")]))]
+  ""
+  "@
+   xxl%q3 %x0,%x1,%x2
+   %q3 %0,%1,%2"
+  [(set_attr "type" "veclogical,logical")])
+
+;; ANDC, IORC, and EQV insns.
+(define_insn "*boolc<mode>3"
+  [(set (match_operand:FP16 0 "gpc_reg_operand" "=wa,r")
+       (match_operator:FP16 3 "boolean_operator"
+        [(not:FP16 (match_operand:FP16 2 "gpc_reg_operand" "wa,r"))
+         (match_operand:FP16 1 "gpc_reg_operand" "wa,r")]))]
+  ""
+  "@
+   xxl%q3 %x0,%x1,%x2
+   %q3 %0,%1,%2"
+  [(set_attr "type" "veclogical,logical")])
+
+(define_insn "*boolc<mode>3"
+  [(set (match_operand:FP16 0 "gpc_reg_operand" "=wa,r")
+       (match_operator:FP16 3 "boolean_operator"
+        [(match_operand:FP16 1 "gpc_reg_operand" "wa,r")
+         (not:FP16 (match_operand:FP16 2 "gpc_reg_operand" "wa,r"))]))]
+  ""
+  "@
+   xxl%q3 %x0,%x1,%x2
+   %q3 %0,%1,%2"
+  [(set_attr "type" "veclogical,logical")])
+
+;; NOR and NAND insns.
+(define_insn "*boolcc<mode>3"
+  [(set (match_operand:FP16 0 "gpc_reg_operand" "=wa,r")
+       (match_operator:FP16 3 "boolean_operator"
+        [(not:FP16 (match_operand:FP16 1 "gpc_reg_operand" "wa,r"))
+         (not:FP16 (match_operand:FP16 2 "gpc_reg_operand" "wa,r"))]))]
+  ""
+  "@
+   xxl%q3 %x0,%x1,%x2
+   %q3 %0,%1,%2"
+  [(set_attr "type" "veclogical,logical")])
+
+;; Optimize __bfloat16 binary operations.  Unlike _Float16 where we
+;; have instructions to convert between HFmode and SFmode as scalar
+;; values, with BFmode, we only have vector conversions.  Thus to do:
+;;
+;;      __bfloat16 a, b, c;
+;;      a = b + c;
+;;
+;; the GCC compiler would normally generate:
+;;
+;;      lxsihzx 0,4,2           // load __bfloat16 value b
+;;      lxsihzx 12,5,2          // load __bfloat16 value c
+;;      xxsldwi 0,0,0,1         // shift b into bits 16..31
+;;      xxsldwi 12,12,12,1      // shift c into bits 16..31
+;;      xvcvbf16spn 0,0         // vector convert b into V4SFmode
+;;      xvcvbf16spn 12,12       // vector convert c into V4SFmode
+;;      xscvspdpn 0,0           // convert b into SFmode scalar
+;;      xscvspdpn 12,12         // convert c into SFmode scalar
+;;      fadds 0,0,12            // add b+c
+;;      xscvdpspn 0,0           // convert b+c into SFmode memory format
+;;      xvcvspbf16 0,0          // convert b+c into BFmode memory format
+;;      stxsihx 0,3,2           // store b+c
+;;
+;; Using the following combiner patterns, the code generated would now
+;; be:
+;;
+;;      lxsihzx 12,4,2          // load __bfloat16 value b
+;;      lxsihzx 0,5,2           // load __bfloat16 value c
+;;      xxspltw 12,12,1         // shift b into bits 16..31
+;;      xxspltw 0,0,1           // shift c into bits 16..31
+;;      xvcvbf16spn 12,12       // vector convert b into V4SFmode
+;;      xvcvbf16spn 0,0         // vector convert c into V4SFmode
+;;      xvaddsp 0,0,12          // vector b+c in V4SFmode
+;;      xvcvspbf16 0,0          // convert b+c into BFmode memory format
+;;      stxsihx 0,3,2           // store b+c
+;;
+;; We cannot just define insns like 'addbf3' to keep the operation as
+;; BFmode because GCC will not generate these patterns unless the user
+;; uses -Ofast.  Without -Ofast, it will always convert BFmode into
+;; SFmode.
+
+(define_insn_and_split "*bfloat16_binary_op_internal1"
+  [(set (match_operand:SF 0 "vsx_register_operand")
+       (match_operator:SF 1 "fp16_binary_operator"
+                          [(match_operand:SF 2 "bfloat16_v4sf_operand")
+                           (match_operand:SF 3 "bfloat16_v4sf_operand")]))]
+  "TARGET_BFLOAT16_HW && can_create_pseudo_p ()
+   && (bfloat16_bf_operand (operands[2], SFmode)
+       || bfloat16_bf_operand (operands[3], SFmode))"
+  "#"
+  "&& 1"
+  [(pc)]
+{
+  bfloat16_operation_as_v4sf (GET_CODE (operands[1]), operands[0], operands[2],
+                             operands[3], NULL_RTX, FP16_BINARY);
+  DONE;
+})
+
+(define_insn_and_split "*bfloat16_binary_op_internal2"
+  [(set (match_operand:BF 0 "vsx_register_operand")
+       (float_truncate:BF
+        (match_operator:SF 1 "fp16_binary_operator"
+                           [(match_operand:SF 2 "bfloat16_v4sf_operand")
+                            (match_operand:SF 3 "bfloat16_v4sf_operand")])))]
+  "TARGET_BFLOAT16_HW && can_create_pseudo_p ()
+   && (bfloat16_bf_operand (operands[2], SFmode)
+       || bfloat16_bf_operand (operands[3], SFmode))"
+  "#"
+  "&& 1"
+  [(pc)]
+{
+  bfloat16_operation_as_v4sf (GET_CODE (operands[1]), operands[0], operands[2],
+                             operands[3], NULL_RTX, FP16_BINARY);
+  DONE;
+})
+
+(define_insn_and_split "*bfloat16_fma_internal1"
+  [(set (match_operand:SF 0 "vsx_register_operand")
+       (fma:SF
+        (match_operand:SF 1 "bfloat16_v4sf_operand")
+        (match_operand:SF 2 "bfloat16_v4sf_operand")
+        (match_operand:SF 3 "bfloat16_v4sf_operand")))]
+  "TARGET_BFLOAT16_HW && can_create_pseudo_p ()
+   && (bfloat16_bf_operand (operands[1], SFmode)
+       + bfloat16_bf_operand (operands[2], SFmode)
+       + bfloat16_bf_operand (operands[3], SFmode) >= 2)"
+  "#"
+  "&& 1"
+  [(pc)]
+{
+  bfloat16_operation_as_v4sf (FMA, operands[0], operands[1], operands[2],
+                             operands[3], FP16_FMA);
+  DONE;
+})
+
+(define_insn_and_split "*bfloat16_fma_internal2"
+  [(set (match_operand:BF 0 "vsx_register_operand" "=wa")
+       (float_truncate:BF
+        (fma:SF
+         (match_operand:SF 1 "bfloat16_v4sf_operand")
+         (match_operand:SF 2 "bfloat16_v4sf_operand")
+         (match_operand:SF 3 "bfloat16_v4sf_operand"))))]
+  "TARGET_BFLOAT16_HW && can_create_pseudo_p ()
+   && (bfloat16_bf_operand (operands[1], SFmode)
+       + bfloat16_bf_operand (operands[2], SFmode)
+       + bfloat16_bf_operand (operands[3], SFmode) >= 2)"
+  "#"
+  "&& 1"
+  [(pc)]
+{
+  bfloat16_operation_as_v4sf (FMA, operands[0], operands[1], operands[2],
+                             operands[3], FP16_FMA);
+  DONE;
+})
+
+(define_insn_and_split "*bfloat16_fms_internal1"
+  [(set (match_operand:SF 0 "vsx_register_operand")
+       (fma:SF
+        (match_operand:SF 1 "bfloat16_v4sf_operand")
+        (match_operand:SF 2 "bfloat16_v4sf_operand")
+        (neg:SF
+         (match_operand:SF 3 "bfloat16_v4sf_operand"))))]
+  "TARGET_BFLOAT16_HW && can_create_pseudo_p ()
+   && (bfloat16_bf_operand (operands[1], SFmode)
+       + bfloat16_bf_operand (operands[2], SFmode)
+       + bfloat16_bf_operand (operands[3], SFmode) >= 2)"
+  "#"
+  "&& 1"
+  [(pc)]
+{
+  bfloat16_operation_as_v4sf (FMA, operands[0], operands[1], operands[2],
+                             operands[3], FP16_FMS);
+  DONE;
+})
+
+(define_insn_and_split "*bfloat16_fms_internal2"
+  [(set (match_operand:BF 0 "vsx_register_operand")
+       (float_truncate:BF
+        (fma:SF
+         (match_operand:SF 1 "bfloat16_v4sf_operand")
+         (match_operand:SF 2 "bfloat16_v4sf_operand")
+         (neg:SF
+          (match_operand:SF 3 "bfloat16_v4sf_operand")))))]
+  "TARGET_BFLOAT16_HW && can_create_pseudo_p ()
+   && (bfloat16_bf_operand (operands[1], SFmode)
+       + bfloat16_bf_operand (operands[2], SFmode)
+       + bfloat16_bf_operand (operands[3], SFmode) >= 2)"
+  "#"
+  "&& 1"
+  [(pc)]
+{
+  bfloat16_operation_as_v4sf (FMA, operands[0], operands[1], operands[2],
+                             operands[3], FP16_FMS);
+  DONE;
+})
+
+(define_insn_and_split "*bfloat16_nfma_internal1"
+  [(set (match_operand:SF 0 "vsx_register_operand")
+       (neg:SF
+        (fma:SF
+         (match_operand:SF 1 "bfloat16_v4sf_operand")
+         (match_operand:SF 2 "bfloat16_v4sf_operand")
+         (match_operand:SF 3 "bfloat16_v4sf_operand"))))]
+  "TARGET_BFLOAT16_HW && can_create_pseudo_p ()
+   && (bfloat16_bf_operand (operands[1], SFmode)
+       + bfloat16_bf_operand (operands[2], SFmode)
+       + bfloat16_bf_operand (operands[3], SFmode) >= 2)"
+  "#"
+  "&& 1"
+  [(pc)]
+{
+  bfloat16_operation_as_v4sf (FMA, operands[0], operands[1], operands[2],
+                             operands[3], FP16_NFMA);
+  DONE;
+})
+
+(define_insn_and_split "*bfloat16_nfma_internal2"
+  [(set (match_operand:BF 0 "vsx_register_operand" "=wa")
+       (float_truncate:BF
+        (neg:SF
+         (fma:SF
+          (match_operand:SF 1 "bfloat16_v4sf_operand")
+          (match_operand:SF 2 "bfloat16_v4sf_operand")
+          (match_operand:SF 3 "bfloat16_v4sf_operand")))))]
+  "TARGET_BFLOAT16_HW && can_create_pseudo_p ()
+   && (bfloat16_bf_operand (operands[1], SFmode)
+       + bfloat16_bf_operand (operands[2], SFmode)
+       + bfloat16_bf_operand (operands[3], SFmode) >= 2)"
+  "#"
+  "&& 1"
+  [(pc)]
+{
+  bfloat16_operation_as_v4sf (FMA, operands[0], operands[1], operands[2],
+                             operands[3], FP16_NFMA);
+  DONE;
+})
+
+(define_insn_and_split "*bfloat16_nfma_internal3"
+  [(set (match_operand:BF 0 "vsx_register_operand" "=wa")
+       (neg:BF
+        (float_truncate:BF
+         (fma:SF
+          (match_operand:SF 1 "bfloat16_v4sf_operand")
+          (match_operand:SF 2 "bfloat16_v4sf_operand")
+          (match_operand:SF 3 "bfloat16_v4sf_operand")))))]
+  "TARGET_BFLOAT16_HW && can_create_pseudo_p ()
+   && (bfloat16_bf_operand (operands[1], SFmode)
+       + bfloat16_bf_operand (operands[2], SFmode)
+       + bfloat16_bf_operand (operands[3], SFmode) >= 2)"
+  "#"
+  "&& 1"
+  [(pc)]
+{
+  bfloat16_operation_as_v4sf (FMA, operands[0], operands[1], operands[2],
+                             operands[3], FP16_NFMA);
+  DONE;
+})
+
+(define_insn_and_split "*bfloat16_nfms_internal1"
+  [(set (match_operand:SF 0 "vsx_register_operand")
+       (neg:SF
+        (fma:SF
+         (match_operand:SF 1 "bfloat16_v4sf_operand")
+         (match_operand:SF 2 "bfloat16_v4sf_operand")
+         (neg:SF
+          (match_operand:SF 3 "bfloat16_v4sf_operand")))))]
+  "TARGET_BFLOAT16_HW && can_create_pseudo_p ()
+   && (bfloat16_bf_operand (operands[1], SFmode)
+       + bfloat16_bf_operand (operands[2], SFmode)
+       + bfloat16_bf_operand (operands[3], SFmode) >= 2)"
+  "#"
+  "&& 1"
+  [(pc)]
+{
+  bfloat16_operation_as_v4sf (FMA, operands[0], operands[1], operands[2],
+                             operands[3], FP16_NFMS);
+  DONE;
+})
+
+(define_insn_and_split "*bfloat16_nfms_internal2"
+  [(set (match_operand:BF 0 "vsx_register_operand")
+       (float_truncate:BF
+        (neg:SF
+         (fma:SF
+          (match_operand:SF 1 "bfloat16_v4sf_operand")
+          (match_operand:SF 2 "bfloat16_v4sf_operand")
+          (neg:SF
+           (match_operand:SF 3 "bfloat16_v4sf_operand"))))))]
+  "TARGET_BFLOAT16_HW && can_create_pseudo_p ()
+   && (bfloat16_bf_operand (operands[1], SFmode)
+       + bfloat16_bf_operand (operands[2], SFmode)
+       + bfloat16_bf_operand (operands[3], SFmode) >= 2)"
+  "#"
+  "&& 1"
+  [(pc)]
+{
+  bfloat16_operation_as_v4sf (FMA, operands[0], operands[1], operands[2],
+                             operands[3], FP16_NFMS);
+  DONE;
+})
+
+(define_insn_and_split "*bfloat16_nfms_internal3"
+  [(set (match_operand:BF 0 "vsx_register_operand")
+       (neg:BF
+        (float_truncate:BF
+         (fma:SF
+          (match_operand:SF 1 "bfloat16_v4sf_operand")
+          (match_operand:SF 2 "bfloat16_v4sf_operand")
+          (neg:SF
+           (match_operand:SF 3 "bfloat16_v4sf_operand"))))))]
+  "TARGET_BFLOAT16_HW && can_create_pseudo_p ()
+   && (bfloat16_bf_operand (operands[1], SFmode)
+       + bfloat16_bf_operand (operands[2], SFmode)
+       + bfloat16_bf_operand (operands[3], SFmode) >= 2)"
+  "#"
+  "&& 1"
+  [(pc)]
+{
+  bfloat16_operation_as_v4sf (FMA, operands[0], operands[1], operands[2],
+                             operands[3], FP16_NFMS);
+  DONE;
+})
+
+;; Add vectorization support for 16-bit floating point.
+
+;; Binary operators being vectorized.
+(define_insn_and_split "<fp16_names><mode>3"
+  [(set (match_operand:VFP16_HW 0 "vsx_register_operand")
+       (FP16_BINARY_OP:VFP16_HW
+        (match_operand:VFP16_HW 1 "vsx_register_operand")
+        (match_operand:VFP16_HW 2 "vsx_register_operand")))]
+  "can_create_pseudo_p ()"
+  "#"
+  "&& 1"
+  [(pc)]
+{
+  fp16_vectorization (<CODE>, operands[0], operands[1], operands[2], NULL_RTX,
+                     FP16_BINARY);
+  DONE;
+})
+
+;; FMA operations being vectorized.
+(define_insn_and_split "fma<mode>4"
+  [(set (match_operand:VFP16_HW 0 "vsx_register_operand")
+       (fma:VFP16_HW
+        (match_operand:VFP16_HW 1 "vsx_register_operand")
+        (match_operand:VFP16_HW 2 "vsx_register_operand")
+        (match_operand:VFP16_HW 3 "vsx_register_operand")))]
+  "can_create_pseudo_p ()"
+  "#"
+  "&& 1"
+  [(pc)]
+{
+  fp16_vectorization (FMA, operands[0], operands[1], operands[2],
+                     operands[3], FP16_FMA);
+  DONE;
+})
+
+(define_insn_and_split "*fms<mode>4"
+  [(set (match_operand:VFP16_HW 0 "vsx_register_operand")
+       (fma:VFP16_HW
+        (match_operand:VFP16_HW 1 "vsx_register_operand")
+        (match_operand:VFP16_HW 2 "vsx_register_operand")
+        (neg:VFP16_HW
+         (match_operand:VFP16_HW 3 "vsx_register_operand"))))]
+  "can_create_pseudo_p ()"
+  "#"
+  "&& 1"
+  [(pc)]
+{
+  fp16_vectorization (FMA, operands[0], operands[1], operands[2],
+                     operands[3], FP16_FMS);
+  DONE;
+})
+
+(define_insn_and_split "*nfma<mode>4"
+  [(set (match_operand:VFP16_HW 0 "vsx_register_operand")
+       (neg:VFP16_HW
+        (fma:VFP16_HW
+         (match_operand:VFP16_HW 1 "vsx_register_operand")
+         (match_operand:VFP16_HW 2 "vsx_register_operand")
+         (match_operand:VFP16_HW 3 "vsx_register_operand"))))]
+  "can_create_pseudo_p ()"
+  "#"
+  "&& 1"
+  [(pc)]
+{
+  fp16_vectorization (FMA, operands[0], operands[1], operands[2],
+                     operands[3], FP16_NFMA);
+  DONE;
+})
+
+(define_insn_and_split "*nfms<mode>4"
+  [(set (match_operand:VFP16_HW 0 "vsx_register_operand")
+       (neg:VFP16_HW
+        (fma:VFP16_HW
+         (match_operand:VFP16_HW 1 "vsx_register_operand")
+         (match_operand:VFP16_HW 2 "vsx_register_operand")
+         (neg:VFP16_HW
+          (match_operand:VFP16_HW 3 "vsx_register_operand")))))]
+  "can_create_pseudo_p ()"
+  "#"
+  "&& 1"
+  [(pc)]
+{
+  fp16_vectorization (FMA, operands[0], operands[1], operands[2],
+                     operands[3], FP16_NFMS);
+  DONE;
+})
+
+;; If we do multiple __bfloat16 operations, between the first and
+;; second operation, GCC will want to convert the first operation from
+;; V4SFmode to SFmode and then reconvert it back to V4SFmode.  On the
+;; PowerPC, this is complicated because internally in the vector
+;; register, SFmode values are stored as DFmode values.
+;;
+;; For example, if we have:
+;;
+;;     __bfloat16 a, b, c, d;
+;;     a = b + c + d;
+;;
+;; We would generate:
+;;
+;;      lxsihzx 0,4,2           // load b as BFmode
+;;      lxsihzx 11,5,2          // load c as BFmode
+;;      lxsihzx 12,6,2          // load d as BFmode
+;;      xxspltw 0,0,1           // shift b into bits 16..31
+;;      xxspltw 11,11,1         // shift c into bits 16..31
+;;      xxspltw 12,12,1         // shift d into bits 16..31
+;;      xvcvbf16spn 0,0         // convert b into V4SFmode
+;;      xvcvbf16spn 11,11       // convert c into V4SFmode
+;;      xvcvbf16spn 12,12       // convert d into V4SFmode
+;;      xvaddsp 0,0,11          // calculate b+c as V4SFmode
+;;      xscvspdp 0,0            // convert b+c into DFmode memory format
+;;      xscvdpspn 0,0           // convert b+c into SFmode memory format
+;;      xxspltw 0,0,0           // convert b+c into V4SFmode
+;;      xvaddsp 12,12,0         // calculate b+c+d as V4SFmode
+;;      xvcvspbf16 12,12        // convert b+c+d into BFmode memory format
+;;      stxsihx 12,3,2          // store b+c+d
+;;
+;; With this peephole2, we can eliminate the xscvspdp and xscvdpspn
+;; instructions.
+;;
+;; We keep the xxspltw between the two xvaddsp's in case the user
+;; explicitly did a SFmode extract of element 0 and did a splat
+;; operation.
+
+(define_peephole2
+  [(set (match_operand:SF 0 "vsx_register_operand")
+       (unspec:SF
+        [(match_operand:V4SF 1 "vsx_register_operand")]
+        UNSPEC_VSX_CVSPDP))
+   (set (match_operand:V4SF 2 "vsx_register_operand")
+       (unspec:V4SF [(match_dup 0)] UNSPEC_VSX_CVDPSPN))]
+  "REGNO (operands[1]) == REGNO (operands[2])
+   || peep2_reg_dead_p (1, operands[1])"
+  [(set (match_dup 2) (match_dup 1))])
+
+;; Vector Pack support.
+
+(define_expand "vec_pack_trunc_v4sf_v8hf"
+  [(match_operand:V8HF 0 "vfloat_operand")
+   (match_operand:V4SF 1 "vfloat_operand")
+   (match_operand:V4SF 2 "vfloat_operand")]
+  "TARGET_FLOAT16_HW"
+{
+  rtx r1 = gen_reg_rtx (V8HFmode);
+  rtx r2 = gen_reg_rtx (V8HFmode);
+
+  emit_insn (gen_xvcvsphp_v8hf (r1, operands[1]));
+  emit_insn (gen_xvcvsphp_v8hf (r2, operands[2]));
+  rs6000_expand_extract_even (operands[0], r1, r2);
+  DONE;
+})
+
+(define_expand "vec_pack_trunc_v4sf_v8bf"
+  [(match_operand:V8BF 0 "vfloat_operand")
+   (match_operand:V4SF 1 "vfloat_operand")
+   (match_operand:V4SF 2 "vfloat_operand")]
+  "TARGET_BFLOAT16_HW"
+{
+  rtx r1 = gen_reg_rtx (V8BFmode);
+  rtx r2 = gen_reg_rtx (V8BFmode);
+
+  emit_insn (gen_xvcvspbf16_v8bf (r1, operands[1]));
+  emit_insn (gen_xvcvspbf16_v8bf (r2, operands[2]));
+  rs6000_expand_extract_even (operands[0], r1, r2);
+  DONE;
+})
+
+;; Unfortunately the machine independent code assumes there is only one
+;; 16-bit floating point type.  This means we have to choose whether to
+;; support packing _Float16 or __bfloat16.  It looks like __bfloat16 is
+;; more popular, so we choose __bfloat16 to be the default.
+
+(define_expand "vec_pack_trunc_v4sf"
+  [(match_operand:V8BF 0 "vfloat_operand")
+   (match_operand:V4SF 1 "vfloat_operand")
+   (match_operand:V4SF 2 "vfloat_operand")]
+  "TARGET_BFLOAT16_HW"
+{
+  rtx r1 = gen_reg_rtx (V8BFmode);
+  rtx r2 = gen_reg_rtx (V8BFmode);
+
+  emit_insn (gen_xvcvspbf16_v8bf (r1, operands[1]));
+  emit_insn (gen_xvcvspbf16_v8bf (r2, operands[2]));
+  rs6000_expand_extract_even (operands[0], r1, r2);
+  DONE;
+})
+
+;; Used for vector conversion to _Float16
+(define_insn "xvcvsphp_v8hf"
+  [(set (match_operand:V8HF 0 "vsx_register_operand" "=wa")
+       (unspec:V8HF [(match_operand:V4SF 1 "vsx_register_operand" "wa")]
+                    UNSPEC_XVCVSPHP_V8HF))]
+  "TARGET_FLOAT16_HW"
+  "xvcvsphp %x0,%x1"
+[(set_attr "type" "vecfloat")])
+
+;; Used for vector conversion to __bfloat16
+(define_insn "xvcvspbf16_v8bf"
+  [(set (match_operand:V8BF 0 "vsx_register_operand" "=wa")
+       (unspec:V8BF [(match_operand:V4SF 1 "vsx_register_operand" "wa")]
+                    UNSPEC_XVCVSPBF16_V8BF))]
+  "TARGET_BFLOAT16_HW"
+  "xvcvspbf16 %x0,%x1"
+  [(set_attr "type" "vecfloat")])
+
+;; Vector unpack support.  Given the name is for the type being
+;; unpacked, we can unpack both __bfloat16 and _Float16.
+
+;; Unpack vector _Float16
+(define_expand "vec_unpacks_hi_v8hf"
+  [(match_operand:V4SF 0 "vfloat_operand")
+   (match_operand:V8HF 1 "vfloat_operand")]
+  "TARGET_FLOAT16_HW"
+{
+  rtx reg = gen_reg_rtx (V8HFmode);
+
+  rs6000_expand_interleave (reg, operands[1], operands[1], BYTES_BIG_ENDIAN);
+  emit_insn (gen_xvcvhpsp_v8hf (operands[0], reg));
+  DONE;
+})
+
+(define_expand "vec_unpacks_lo_v8hf"
+  [(match_operand:V4SF 0 "vfloat_operand")
+   (match_operand:V8HF 1 "vfloat_operand")]
+  "TARGET_FLOAT16_HW"
+{
+  rtx reg = gen_reg_rtx (V8HFmode);
+
+  rs6000_expand_interleave (reg, operands[1], operands[1], !BYTES_BIG_ENDIAN);
+  emit_insn (gen_xvcvhpsp_v8hf (operands[0], reg));
+  DONE;
+})
+
+;; Used for vector conversion from _Float16
+(define_insn "xvcvhpsp_v8hf"
+  [(set (match_operand:V4SF 0 "vsx_register_operand" "=wa")
+       (unspec:V4SF [(match_operand:V8HF 1 "vsx_register_operand" "wa")]
+                    UNSPEC_CVT_FP16_TO_V4SF))]
+  "TARGET_BFLOAT16_HW"
+  "xvcvhpsp %x0,%x1"
+  [(set_attr "type" "vecperm")])
+
+;; Unpack vector __bfloat16
+(define_expand "vec_unpacks_hi_v8bf"
+  [(match_operand:V4SF 0 "vfloat_operand")
+   (match_operand:V8BF 1 "vfloat_operand")]
+  "TARGET_BFLOAT16_HW"
+{
+  rtx reg = gen_reg_rtx (V8BFmode);
+
+  rs6000_expand_interleave (reg, operands[1], operands[1], BYTES_BIG_ENDIAN);
+  emit_insn (gen_xvcvbf16spn_v8bf (operands[0], reg));
+  DONE;
+})
+
+(define_expand "vec_unpacks_lo_v8bf"
+  [(match_operand:V4SF 0 "vfloat_operand")
+   (match_operand:V8BF 1 "vfloat_operand")]
+  "TARGET_BFLOAT16_HW"
+{
+  rtx reg = gen_reg_rtx (V8BFmode);
+
+  rs6000_expand_interleave (reg, operands[1], operands[1], !BYTES_BIG_ENDIAN);
+  emit_insn (gen_xvcvbf16spn_v8bf (operands[0], reg));
+  DONE;
+})
+
+;; Used for vector conversion from __bfloat16
+(define_insn "xvcvbf16spn_v8bf"
+  [(set (match_operand:V4SF 0 "vsx_register_operand" "=wa")
+       (unspec:V4SF [(match_operand:V8BF 1 "vsx_register_operand" "wa")]
+                    UNSPEC_CVT_FP16_TO_V4SF))]
+  "TARGET_BFLOAT16_HW"
+  "xvcvbf16spn %x0,%x1"
+  [(set_attr "type" "vecperm")])
diff --git a/gcc/config/rs6000/predicates.md b/gcc/config/rs6000/predicates.md
index e9ddc61e3a8a..408caf4521f0 100644
--- a/gcc/config/rs6000/predicates.md
+++ b/gcc/config/rs6000/predicates.md
@@ -2192,3 +2192,79 @@
 
   return constant_generates_xxspltiw (&vsx_const);
 })
+
+;; Return 1 if this is a 16-bit floating point operand that can be used
+;; in an add, subtract, or multiply operation that uses the vector
+;; conversion function.
+(define_predicate "fp16_reg_or_constant_operand"
+  (match_code "reg,subreg,const_double")
+{
+  if (REG_P (op) || SUBREG_P (op))
+    return vsx_register_operand (op, mode);
+
+  if (CONST_DOUBLE_P (op))
+    return fp16_xxspltiw_constant (op, mode);
+
+  return false;
+})
+
+;; Match binary operators where we convert a BFmode operand into a
+;; SFmode operand so that we can optimize the BFmode operation to do
+;; the operation in vector mode rather than converting the BFmode to a
+;; V8BFmode vector, converting that V8BFmode vector to V4SFmode, and
+;; then converting the V4SFmode element to SFmode scalar.
+(define_predicate "fp16_binary_operator"
+  (match_code "plus,minus,mult,smax,smin"))
+
+;; Match bfloat16/float operands that can be optimized to do the
+;; operation in V4SFmode.
+(define_predicate "bfloat16_v4sf_operand"
+  (match_code "reg,subreg,const_double,float_extend,float_truncate")
+{
+  if (mode != BFmode && mode != SFmode)
+    return false;
+
+  if (REG_P (op) || SUBREG_P (op))
+    return register_operand (op, mode);
+
+  if (CONST_DOUBLE_P (op))
+    return true;
+
+  if (GET_CODE (op) == FLOAT_EXTEND)
+    {
+      rtx op_arg = XEXP (op, 0);
+      return (mode == SFmode
+             && GET_MODE (op_arg) == BFmode
+             && (REG_P (op_arg) || SUBREG_P (op_arg)));
+    }
+
+  if (GET_CODE (op) == FLOAT_TRUNCATE)
+    {
+      rtx op_arg = XEXP (op, 0);
+      return (mode == BFmode
+             && GET_MODE (op_arg) == SFmode
+             && (REG_P (op_arg) || SUBREG_P (op_arg)));
+    }
+
+  return false;
+})
+
+;; Match an operand that originally was a BFmode value to prevent
+;; operations involving only SFmode values from being converted to
+;; BFmode.
+(define_predicate "bfloat16_bf_operand"
+  (match_code "reg,subreg,const_double,float_extend")
+{
+  if (mode == BFmode || GET_MODE (op) == BFmode)
+    return true;
+
+  if (mode != SFmode)
+    return false;
+
+  if (GET_MODE (op) == SFmode
+      && GET_CODE (op) == FLOAT_EXTEND
+      && GET_MODE (XEXP (op, 0)) == BFmode)
+    return true;
+
+  return false;
+})
diff --git a/gcc/config/rs6000/rs6000-c.cc b/gcc/config/rs6000/rs6000-c.cc
index b3ace1166f43..cc27c39d8d23 100644
--- a/gcc/config/rs6000/rs6000-c.cc
+++ b/gcc/config/rs6000/rs6000-c.cc
@@ -591,6 +591,10 @@ rs6000_target_modify_macros (bool define_p,
     {
       rs6000_define_or_undefine_macro (define_p, "__FLOAT16__");
       rs6000_define_or_undefine_macro (define_p, "__BFLOAT16__");
+      if ((cpu_option & CPU_OPTION_POWER9_MASK) != 0)
+       rs6000_define_or_undefine_macro (define_p, "__FLOAT16_HW__");
+      if ((cpu_option & CPU_OPTION_POWER10_MASK) != 0)
+       rs6000_define_or_undefine_macro (define_p, "__BFLOAT16_HW__");
     }
   /* Tell the user if we are targeting CELL.  */
   if (rs6000_cpu == PROCESSOR_CELL)
diff --git a/gcc/config/rs6000/rs6000-call.cc b/gcc/config/rs6000/rs6000-call.cc
index 41c0d4f71590..4d55ea2d1b36 100644
--- a/gcc/config/rs6000/rs6000-call.cc
+++ b/gcc/config/rs6000/rs6000-call.cc
@@ -685,17 +685,27 @@ init_cumulative_args (CUMULATIVE_ARGS *cum, tree fntype,
             " to enable them", "-maltivec");
     }
 
+#if !POWERPC_FLOAT16_DISABLE_WARNING
   /* Warn that __bfloat16 and _Float16 might be returned differently in the
      future.  The issue is currently 16-bit floating point is returned in
      floating point register #1 in 16-bit format.  We may or may not want to
      return it as a scalar 64-bit value.  */
   if (fntype && warn_psabi && !cum->libcall)
     {
-      machine_mode ret_mode = TYPE_MODE (TREE_TYPE (fntype));
-      if (ret_mode == BFmode || ret_mode == HFmode)
-       warning (OPT_Wpsabi, "%s might be returned differently in the future",
-                ret_mode == BFmode ? "__bfloat16" : "_Float16");
+      static bool warned_about_float16_return = false;
+
+      if (!warned_about_float16_return)
+       {
+         machine_mode ret_mode = TYPE_MODE (TREE_TYPE (fntype));
+
+         warned_about_float16_return = true;
+         if (ret_mode == BFmode || ret_mode == HFmode)
+           warning (OPT_Wpsabi,
+                    "%s might be returned differently in the future",
+                    ret_mode == BFmode ? "__bfloat16" : "_Float16");
+       }
     }
+#endif
 }
 
 
@@ -1653,13 +1663,23 @@ rs6000_function_arg (cumulative_args_t cum_v, const 
function_arg_info &arg)
       return NULL_RTX;
     }
 
+#if !POWERPC_FLOAT16_DISABLE_WARNING
   /* Warn that _Float16 and __bfloat16 might be passed differently in the
      future.  The issue is currently 16-bit floating point values are passed in
      floating point registers in the native 16-bit format.  We may or may not
      want to pass the value it as a scalar 64-bit value.  */
-  if (warn_psabi && !cum->libcall && (mode == BFmode || mode == HFmode))
-    warning (OPT_Wpsabi, "%s might be passed differently in the future",
-            mode == BFmode ? "__bfloat16" : "_Float16");
+  if (warn_psabi && !cum->libcall && FP16_SCALAR_MODE_P (mode))
+    {
+      static bool warned_about_float16_call = false;
+
+      if (!warned_about_float16_call)
+       {
+         warned_about_float16_call = true;
+         warning (OPT_Wpsabi, "%s might be passed differently in the future",
+                  mode == BFmode ? "__bfloat16" : "_Float16");
+       }
+    }
+#endif
 
   /* Return a marker to indicate whether CR1 needs to set or clear the
      bit that V.4 uses to say fp args were passed in registers.
diff --git a/gcc/config/rs6000/rs6000-cpus.def 
b/gcc/config/rs6000/rs6000-cpus.def
index 233f01e9c615..0add89c28792 100644
--- a/gcc/config/rs6000/rs6000-cpus.def
+++ b/gcc/config/rs6000/rs6000-cpus.def
@@ -85,6 +85,15 @@
                                 | OPTION_MASK_ALTIVEC                  \
                                 | OPTION_MASK_VSX)
 
+/* Determine whether to enable 16-bit floating point types on power8 systems
+   and above.  */
+#if POWERPC_FLOAT16_DEFAULT
+#define TARGET_16BIT_FLOATING_POINT    OPTION_MASK_FLOAT16
+
+#else
+#define TARGET_16BIT_FLOATING_POINT    0
+#endif
+
 /* For now, don't provide an embedded version of ISA 2.07.  Do not set power8
    fusion here, instead set it in rs6000.cc if we are tuning for a power8
    system.  */
@@ -93,7 +102,8 @@
                                 | OPTION_MASK_CRYPTO                   \
                                 | OPTION_MASK_EFFICIENT_UNALIGNED_VSX  \
                                 | OPTION_MASK_QUAD_MEMORY              \
-                                | OPTION_MASK_QUAD_MEMORY_ATOMIC)
+                                | OPTION_MASK_QUAD_MEMORY_ATOMIC       \
+                                | TARGET_16BIT_FLOATING_POINT)
 
 /* ISA masks setting fusion options.  */
 #define OTHER_FUSION_MASKS     (OPTION_MASK_P8_FUSION                  \
@@ -160,6 +170,7 @@
                                 | OPTION_MASK_EFFICIENT_UNALIGNED_VSX  \
                                 | OPTION_MASK_FLOAT128_HW              \
                                 | OPTION_MASK_FLOAT128_KEYWORD         \
+                                | OPTION_MASK_FLOAT16                  \
                                 | OPTION_MASK_FPRND                    \
                                 | OPTION_MASK_P10_FUSION               \
                                 | OPTION_MASK_HTM                      \
diff --git a/gcc/config/rs6000/rs6000-protos.h 
b/gcc/config/rs6000/rs6000-protos.h
index 9bf971370d41..001dc1fc7f4b 100644
--- a/gcc/config/rs6000/rs6000-protos.h
+++ b/gcc/config/rs6000/rs6000-protos.h
@@ -258,6 +258,21 @@ extern bool vec_const_128bit_to_bytes (rtx, machine_mode,
 extern unsigned constant_generates_lxvkq (vec_const_128bit_type *);
 extern unsigned constant_generates_xxspltiw (vec_const_128bit_type *);
 extern unsigned constant_generates_xxspltidp (vec_const_128bit_type *);
+
+/* From float16.cc.  */
+/* Optimize bfloat16 and float16 operations.  */
+enum fp16_operation {
+  FP16_BINARY,                         /* Bfloat16/float16 binary op.  */
+  FP16_FMA,                            /* (a * b) + c.  */
+  FP16_FMS,                            /* (a * b) - c.  */
+  FP16_NFMA,                           /* - ((a * b) + c).  */
+  FP16_NFMS                            /* - ((a * b) - c).  */
+};
+
+extern void fp16_vectorization (enum rtx_code, rtx, rtx, rtx, rtx,
+                               enum fp16_operation);
+extern void bfloat16_operation_as_v4sf (enum rtx_code, rtx, rtx, rtx, rtx,
+                                       enum fp16_operation);
 #endif /* RTX_CODE */
 
 #ifdef TREE_CODE
diff --git a/gcc/config/rs6000/t-rs6000 b/gcc/config/rs6000/t-rs6000
index a5d1c27424f3..c8f19865311c 100644
--- a/gcc/config/rs6000/t-rs6000
+++ b/gcc/config/rs6000/t-rs6000
@@ -87,6 +87,10 @@ rs6000-c.o: $(srcdir)/config/rs6000/rs6000-c.cc 
rs6000-builtins.h
        $(COMPILE) $<
        $(POSTCOMPILE)
 
+float16.o: $(srcdir)/config/rs6000/float16.cc
+       $(COMPILE) $<
+       $(POSTCOMPILE)
+
 #$(srcdir)/config/rs6000/fusion.md: $(srcdir)/config/rs6000/genfusion.pl
 #      $(srcdir)/config/rs6000/genfusion.pl > $(srcdir)/config/rs6000/fusion.md

Reply via email to