This patch adds vectorization support for 16-bit floating point values.
This is done by converting the 16-bit floating point vectors to 32-bit
floating point vectors, doing the operation in 32 bits, and converting the
result back to the appropriate 16-bit floating point format.
I have committed all of the patches in my backlog (dense math registers, other
-mcpu=future instructions, random bug fixes, support for _Float16 and
__bfloat16, and optimizations for vector logical operations on power10/power11)
into the IBM vendor branch:
vendors/ibm/gcc-17-future
2026-07-01 Michael Meissner <[email protected]>
gcc/
* config.gcc (powerpc*-*-*): Add float16.o.
* config/rs6000/float16.cc: New file to add 16-bit floating point
vectorization.
* config/rs6000/float16.md (FP16_BINARY_OP): New code iterator.
(fp16_names): New code attribute.
(UNSPEC_XVCVSPHP_V8HF): New unspec.
(UNSPEC_XVCVSPBF16_V8BF): Likewise.
(UNSPEC_CVT_FP16_TO_V4SF): Likewise.
(<fp16_names><mode>3): New insns to support vectorization of 16-bit
floating point.
(fma<mode>4): Likewise.
(*fms<mode>4): Likewise.
(*nfma<mode>4): Likewise.
(*nfms<mode>4): Likewise.
(vec_pack_trunc_v4sf_v8hf): Likewise.
(vec_pack_trunc_v4sf_v8bf): Likewise.
(vec_pack_trunc_v4sf): Likewise.
(xvcvsphp_v8hf): Likewise.
(xvcvspbf16_v8bf): Likewise.
(vec_unpacks_hi_v8hf): Likewise.
(vec_unpacks_lo_v8hf): Likewise.
(xvcvhpsp_v8hf): Likewise.
(vec_unpacks_hi_v8bf): Likewise.
(vec_unpacks_lo_v8bf): Likewise.
(xvcvbf16spn_v8bf): Likewise.
* config/rs6000/rs6000-protos.h (enum fp16_operation): New enumeration
for vectorizing 16-bit floating point.
(fp16_vectorization): New declaration.
* config/rs6000/t-rs6000 (float16.o): Add build rules.
---
gcc/config.gcc | 1 +
gcc/config/rs6000/float16.cc | 185 ++++++++++++++++++++++
gcc/config/rs6000/float16.md | 244 ++++++++++++++++++++++++++++++
gcc/config/rs6000/rs6000-protos.h | 13 ++
gcc/config/rs6000/t-rs6000 | 4 +
5 files changed, 447 insertions(+)
create mode 100644 gcc/config/rs6000/float16.cc
diff --git a/gcc/config.gcc b/gcc/config.gcc
index 739ba98b28d..3a99bd03647 100644
--- a/gcc/config.gcc
+++ b/gcc/config.gcc
@@ -541,6 +541,7 @@ powerpc*-*-*)
extra_objs="rs6000-string.o rs6000-p8swap.o rs6000-logue.o"
extra_objs="${extra_objs} rs6000-call.o rs6000-pcrel-opt.o"
extra_objs="${extra_objs} rs6000-builtins.o rs6000-builtin.o"
+ extra_objs="${extra_objs} float16.o"
extra_headers="ppc-asm.h altivec.h htmintrin.h htmxlintrin.h"
extra_headers="${extra_headers} bmi2intrin.h bmiintrin.h"
extra_headers="${extra_headers} xmmintrin.h mm_malloc.h emmintrin.h"
diff --git a/gcc/config/rs6000/float16.cc b/gcc/config/rs6000/float16.cc
new file mode 100644
index 00000000000..5274a0df962
--- /dev/null
+++ b/gcc/config/rs6000/float16.cc
@@ -0,0 +1,185 @@
+/* 16-bit floating point vectorization support for the PowerPC architecture.
+ Copyright (C) 2002-2025 Free Software Foundation, Inc.
+
+ Contributed by Zack Weinberg <[email protected]>
+ and Paolo Bonzini <[email protected]>
+
+ This file is part of GCC.
+
+ GCC is free software; you can redistribute it and/or modify it
+ under the terms of the GNU General Public License as published
+ by the Free Software Foundation; either version 3, or (at your
+ option) any later version.
+
+ GCC is distributed in the hope that it will be useful, but WITHOUT
+ ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+ or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public
+ License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with GCC; see the file COPYING3. If not see
+ <http://www.gnu.org/licenses/>. */
+
+/* 16-bit floating point support. */
+
+#include "config.h"
+#include "system.h"
+#include "coretypes.h"
+#include "backend.h"
+#include "rtl.h"
+#include "tree.h"
+#include "memmodel.h"
+#include "tm_p.h"
+#include "stringpool.h"
+#include "expmed.h"
+#include "optabs.h"
+#include "regs.h"
+#include "insn-attr.h"
+#include "flags.h"
+#include "attribs.h"
+#include "explow.h"
+#include "expr.h"
+#include "common/common-target.h"
+#include "rs6000-internal.h"
+
+/* Expand a V8HFmode/V8BFmode operation by doing it as two V4SFmode operations:
+
+ ICODE: RTL code of the operation (only used for FP16_BINARY).
+ RESULT: Result of the operation (its mode selects V8HFmode vs. V8BFmode).
+ OP1: Input operand1.
+ OP2: Input operand2.
+ OP3: Input operand3 (FMA variants) or NULL_RTX (FP16_BINARY).
+ SUBTYPE: Which fp16_operation to expand (binary, FMA, FMS, NFMA, or NFMS). */
+
+void
+fp16_vectorization (enum rtx_code icode,
+ rtx result,
+ rtx op1,
+ rtx op2,
+ rtx op3,
+ enum fp16_operation subtype)
+{
+ gcc_assert (can_create_pseudo_p ());
+
+ machine_mode result_mode = GET_MODE (result);
+ rtx op_orig[3] = { op1, op2, op3 };
+ rtx op_hi[3];
+ rtx op_lo[3];
+ rtx result_hi;
+ rtx result_lo;
+ size_t n_opts;
+
+ switch (subtype)
+ {
+ case FP16_BINARY:
+ n_opts = 2;
+ break;
+
+ case FP16_FMA:
+ case FP16_FMS:
+ case FP16_NFMA:
+ case FP16_NFMS:
+ n_opts = 3;
+ break;
+
+ default:
+ gcc_unreachable ();
+ }
+
+ /* Allocate V4SFmode temporaries for the two halves of the result. */
+ result_hi = gen_reg_rtx (V4SFmode);
+ result_lo = gen_reg_rtx (V4SFmode);
+
+ for (size_t i = 0; i < n_opts; i++)
+ {
+ gcc_assert (op_orig[i] != NULL_RTX);
+ op_hi[i] = gen_reg_rtx (V4SFmode); /* high register. */
+ op_lo[i] = gen_reg_rtx (V4SFmode); /* low register. */
+
+ rtx interleave_hi = gen_reg_rtx (result_mode);
+ rtx interleave_lo = gen_reg_rtx (result_mode);
+ rtx orig = op_orig[i];
+ /* NOTE(review): hi/lo are swapped vs. vec_unpacks_{hi,lo}_*; confirm. */
+ rs6000_expand_interleave (interleave_hi, orig, orig, !BYTES_BIG_ENDIAN);
+ rs6000_expand_interleave (interleave_lo, orig, orig, BYTES_BIG_ENDIAN);
+
+ if (result_mode == V8HFmode)
+ {
+ emit_insn (gen_xvcvhpsp_v8hf (op_hi[i], interleave_hi));
+ emit_insn (gen_xvcvhpsp_v8hf (op_lo[i], interleave_lo));
+ }
+
+ else if (result_mode == V8BFmode)
+ {
+ emit_insn (gen_xvcvbf16spn_v8bf (op_hi[i], interleave_hi));
+ emit_insn (gen_xvcvbf16spn_v8bf (op_lo[i], interleave_lo));
+ }
+
+ else
+ gcc_unreachable ();
+ }
+
+ /* Do the operation separately on each V4SFmode half. */
+ switch (subtype)
+ {
+ case FP16_BINARY:
+ emit_insn (gen_rtx_SET (result_hi,
+ gen_rtx_fmt_ee (icode, V4SFmode,
+ op_hi[0],
+ op_hi[1])));
+
+ emit_insn (gen_rtx_SET (result_lo,
+ gen_rtx_fmt_ee (icode, V4SFmode,
+ op_lo[0],
+ op_lo[1])));
+ break;
+
+ case FP16_FMA:
+ case FP16_FMS:
+ case FP16_NFMA:
+ case FP16_NFMS:
+ {
+ rtx op1_hi = op_hi[0];
+ rtx op2_hi = op_hi[1];
+ rtx op3_hi = op_hi[2];
+
+ rtx op1_lo = op_lo[0];
+ rtx op2_lo = op_lo[1];
+ rtx op3_lo = op_lo[2];
+
+ if (subtype == FP16_FMS || subtype == FP16_NFMS)
+ {
+ op3_hi = gen_rtx_NEG (V4SFmode, op3_hi);
+ op3_lo = gen_rtx_NEG (V4SFmode, op3_lo);
+ }
+
+ rtx op_fma_hi = gen_rtx_FMA (V4SFmode, op1_hi, op2_hi, op3_hi);
+ rtx op_fma_lo = gen_rtx_FMA (V4SFmode, op1_lo, op2_lo, op3_lo);
+
+ if (subtype == FP16_NFMA || subtype == FP16_NFMS)
+ {
+ op_fma_hi = gen_rtx_NEG (V4SFmode, op_fma_hi);
+ op_fma_lo = gen_rtx_NEG (V4SFmode, op_fma_lo);
+ }
+
+ emit_insn (gen_rtx_SET (result_hi, op_fma_hi));
+ emit_insn (gen_rtx_SET (result_lo, op_fma_lo));
+ }
+ break;
+
+ default:
+ gcc_unreachable ();
+ }
+
+ /* Pack the 2 V4SFmode results back into one V8HFmode/V8BFmode vector. */
+ if (result_mode == V8HFmode)
+ emit_insn (gen_vec_pack_trunc_v4sf_v8hf (result, result_hi, result_lo));
+
+ else if (result_mode == V8BFmode)
+ emit_insn (gen_vec_pack_trunc_v4sf_v8bf (result, result_hi, result_lo));
+
+ else
+ gcc_unreachable ();
+
+ return;
+}
diff --git a/gcc/config/rs6000/float16.md b/gcc/config/rs6000/float16.md
index e695febf9ca..2b79998197b 100644
--- a/gcc/config/rs6000/float16.md
+++ b/gcc/config/rs6000/float16.md
@@ -67,11 +67,27 @@ (define_mode_attr FP16_VECTOR4 [(BF "V4BF")
(V8BF "V4BF")
(V8HF "V4HF")])
+;; RTL codes of the binary operators that get bfloat16/float16 vectorization.
+(define_code_iterator FP16_BINARY_OP [plus minus mult smax smin])
+
+;; Map each RTL code to the name used to build the standard pattern name.
+(define_code_attr fp16_names [(abs "abs")
+ (fma "fma")
+ (plus "add")
+ (minus "sub")
+ (mult "mul")
+ (neg "neg")
+ (smax "smax")
+ (smin "smin")])
+
;; UNSPEC constants
(define_c_enum "unspec"
[UNSPEC_BF_SHIFT_LEFT_16BIT
UNSPEC_XXSPLTW_FP16
UNSPEC_XVCVSPBF16_BF
+ UNSPEC_XVCVSPHP_V8HF
+ UNSPEC_XVCVSPBF16_V8BF
+ UNSPEC_CVT_FP16_TO_V4SF
UNSPEC_CVT_V4SF_TO_FP16])
;; _Float16 and __bfloat16 moves
@@ -856,3 +872,231 @@ (define_insn "*boolcc<mode>3"
xxl%q3 %x0,%x1,%x2
%q3 %0,%1,%2"
[(set_attr "type" "veclogical,logical")])
+
+;; Add vectorization support for 16-bit floating point.
+
+;; Vectorized binary operators, split into V4SFmode ops by fp16_vectorization.
+(define_insn_and_split "<fp16_names><mode>3"
+ [(set (match_operand:VFP16_HW 0 "vsx_register_operand")
+ (FP16_BINARY_OP:VFP16_HW
+ (match_operand:VFP16_HW 1 "vsx_register_operand")
+ (match_operand:VFP16_HW 2 "vsx_register_operand")))]
+ "can_create_pseudo_p ()"
+ "#"
+ "&& 1"
+ [(pc)]
+{
+ fp16_vectorization (<CODE>, operands[0], operands[1], operands[2], NULL_RTX,
+ FP16_BINARY);
+ DONE;
+})
+
+;; Vectorized fma, *fms, *nfma and *nfms, split by fp16_vectorization.
+(define_insn_and_split "fma<mode>4"
+ [(set (match_operand:VFP16_HW 0 "vsx_register_operand")
+ (fma:VFP16_HW
+ (match_operand:VFP16_HW 1 "vsx_register_operand")
+ (match_operand:VFP16_HW 2 "vsx_register_operand")
+ (match_operand:VFP16_HW 3 "vsx_register_operand")))]
+ "can_create_pseudo_p ()"
+ "#"
+ "&& 1"
+ [(pc)]
+{
+ fp16_vectorization (FMA, operands[0], operands[1], operands[2],
+ operands[3], FP16_FMA);
+ DONE;
+})
+
+(define_insn_and_split "*fms<mode>4"
+ [(set (match_operand:VFP16_HW 0 "vsx_register_operand")
+ (fma:VFP16_HW
+ (match_operand:VFP16_HW 1 "vsx_register_operand")
+ (match_operand:VFP16_HW 2 "vsx_register_operand")
+ (neg:VFP16_HW
+ (match_operand:VFP16_HW 3 "vsx_register_operand"))))]
+ "can_create_pseudo_p ()"
+ "#"
+ "&& 1"
+ [(pc)]
+{
+ fp16_vectorization (FMA, operands[0], operands[1], operands[2],
+ operands[3], FP16_FMS);
+ DONE;
+})
+
+(define_insn_and_split "*nfma<mode>4"
+ [(set (match_operand:VFP16_HW 0 "vsx_register_operand")
+ (neg:VFP16_HW
+ (fma:VFP16_HW
+ (match_operand:VFP16_HW 1 "vsx_register_operand")
+ (match_operand:VFP16_HW 2 "vsx_register_operand")
+ (match_operand:VFP16_HW 3 "vsx_register_operand"))))]
+ "can_create_pseudo_p ()"
+ "#"
+ "&& 1"
+ [(pc)]
+{
+ fp16_vectorization (FMA, operands[0], operands[1], operands[2],
+ operands[3], FP16_NFMA);
+ DONE;
+})
+
+(define_insn_and_split "*nfms<mode>4"
+ [(set (match_operand:VFP16_HW 0 "vsx_register_operand")
+ (neg:VFP16_HW
+ (fma:VFP16_HW
+ (match_operand:VFP16_HW 1 "vsx_register_operand")
+ (match_operand:VFP16_HW 2 "vsx_register_operand")
+ (neg:VFP16_HW
+ (match_operand:VFP16_HW 3 "vsx_register_operand")))))]
+ "can_create_pseudo_p ()"
+ "#"
+ "&& 1"
+ [(pc)]
+{
+ fp16_vectorization (FMA, operands[0], operands[1], operands[2],
+ operands[3], FP16_NFMS);
+ DONE;
+})
+
+;; Vector pack support: convert each V4SF input, then extract even elements.
+
+(define_expand "vec_pack_trunc_v4sf_v8hf"
+ [(match_operand:V8HF 0 "vfloat_operand")
+ (match_operand:V4SF 1 "vfloat_operand")
+ (match_operand:V4SF 2 "vfloat_operand")]
+ "TARGET_FLOAT16_HW"
+{
+ rtx r1 = gen_reg_rtx (V8HFmode);
+ rtx r2 = gen_reg_rtx (V8HFmode);
+
+ emit_insn (gen_xvcvsphp_v8hf (r1, operands[1]));
+ emit_insn (gen_xvcvsphp_v8hf (r2, operands[2]));
+ rs6000_expand_extract_even (operands[0], r1, r2);
+ DONE;
+})
+
+(define_expand "vec_pack_trunc_v4sf_v8bf"
+ [(match_operand:V8BF 0 "vfloat_operand")
+ (match_operand:V4SF 1 "vfloat_operand")
+ (match_operand:V4SF 2 "vfloat_operand")]
+ "TARGET_BFLOAT16_HW"
+{
+ rtx r1 = gen_reg_rtx (V8BFmode);
+ rtx r2 = gen_reg_rtx (V8BFmode);
+
+ emit_insn (gen_xvcvspbf16_v8bf (r1, operands[1]));
+ emit_insn (gen_xvcvspbf16_v8bf (r2, operands[2]));
+ rs6000_expand_extract_even (operands[0], r1, r2);
+ DONE;
+})
+
+;; The machine independent code assumes there is only one 16-bit floating
+;; point type, so the generic vec_pack_trunc_v4sf name can only pack to one of
+;; _Float16 or __bfloat16. __bfloat16 looks more popular, so it is the
+;; default; the expansion is the same as vec_pack_trunc_v4sf_v8bf.
+
+(define_expand "vec_pack_trunc_v4sf"
+ [(match_operand:V8BF 0 "vfloat_operand")
+ (match_operand:V4SF 1 "vfloat_operand")
+ (match_operand:V4SF 2 "vfloat_operand")]
+ "TARGET_BFLOAT16_HW"
+{
+ rtx r1 = gen_reg_rtx (V8BFmode);
+ rtx r2 = gen_reg_rtx (V8BFmode);
+
+ emit_insn (gen_xvcvspbf16_v8bf (r1, operands[1]));
+ emit_insn (gen_xvcvspbf16_v8bf (r2, operands[2]));
+ rs6000_expand_extract_even (operands[0], r1, r2);
+ DONE;
+})
+
+;; V4SF to V8HF conversion (xvcvsphp), used by vec_pack_trunc_v4sf_v8hf.
+(define_insn "xvcvsphp_v8hf"
+ [(set (match_operand:V8HF 0 "vsx_register_operand" "=wa")
+ (unspec:V8HF [(match_operand:V4SF 1 "vsx_register_operand" "wa")]
+ UNSPEC_XVCVSPHP_V8HF))]
+ "TARGET_FLOAT16_HW"
+ "xvcvsphp %x0,%x1"
+[(set_attr "type" "vecfloat")])
+
+;; V4SF to V8BF conversion (xvcvspbf16), used by the V8BF pack patterns.
+(define_insn "xvcvspbf16_v8bf"
+ [(set (match_operand:V8BF 0 "vsx_register_operand" "=wa")
+ (unspec:V8BF [(match_operand:V4SF 1 "vsx_register_operand" "wa")]
+ UNSPEC_XVCVSPBF16_V8BF))]
+ "TARGET_BFLOAT16_HW"
+ "xvcvspbf16 %x0,%x1"
+ [(set_attr "type" "vecfloat")])
+
+;; Vector unpack support. Since the standard name encodes the mode being
+;; unpacked, both __bfloat16 and _Float16 can be supported.
+
+;; Unpack vector _Float16: interleave the input with itself, then convert.
+(define_expand "vec_unpacks_hi_v8hf"
+ [(match_operand:V4SF 0 "vfloat_operand")
+ (match_operand:V8HF 1 "vfloat_operand")]
+ "TARGET_FLOAT16_HW"
+{
+ rtx reg = gen_reg_rtx (V8HFmode);
+
+ rs6000_expand_interleave (reg, operands[1], operands[1], BYTES_BIG_ENDIAN);
+ emit_insn (gen_xvcvhpsp_v8hf (operands[0], reg));
+ DONE;
+})
+
+(define_expand "vec_unpacks_lo_v8hf"
+ [(match_operand:V4SF 0 "vfloat_operand")
+ (match_operand:V8HF 1 "vfloat_operand")]
+ "TARGET_FLOAT16_HW"
+{
+ rtx reg = gen_reg_rtx (V8HFmode);
+
+ rs6000_expand_interleave (reg, operands[1], operands[1], !BYTES_BIG_ENDIAN);
+ emit_insn (gen_xvcvhpsp_v8hf (operands[0], reg));
+ DONE;
+})
+
+;; V8HF to V4SF conversion for the unpack patterns and fp16_vectorization.
+(define_insn "xvcvhpsp_v8hf"
+ [(set (match_operand:V4SF 0 "vsx_register_operand" "=wa")
+ (unspec:V4SF [(match_operand:V8HF 1 "vsx_register_operand" "wa")]
+ UNSPEC_CVT_FP16_TO_V4SF))]
+ "TARGET_FLOAT16_HW"
+ "xvcvhpsp %x0,%x1"
+ [(set_attr "type" "vecperm")])
+
+;; Unpack vector __bfloat16: interleave the input with itself, then convert.
+(define_expand "vec_unpacks_hi_v8bf"
+ [(match_operand:V4SF 0 "vfloat_operand")
+ (match_operand:V8BF 1 "vfloat_operand")]
+ "TARGET_BFLOAT16_HW"
+{
+ rtx reg = gen_reg_rtx (V8BFmode);
+
+ rs6000_expand_interleave (reg, operands[1], operands[1], BYTES_BIG_ENDIAN);
+ emit_insn (gen_xvcvbf16spn_v8bf (operands[0], reg));
+ DONE;
+})
+
+(define_expand "vec_unpacks_lo_v8bf"
+ [(match_operand:V4SF 0 "vfloat_operand")
+ (match_operand:V8BF 1 "vfloat_operand")]
+ "TARGET_BFLOAT16_HW"
+{
+ rtx reg = gen_reg_rtx (V8BFmode);
+
+ rs6000_expand_interleave (reg, operands[1], operands[1], !BYTES_BIG_ENDIAN);
+ emit_insn (gen_xvcvbf16spn_v8bf (operands[0], reg));
+ DONE;
+})
+
+;; V8BF to V4SF conversion for the unpack patterns and fp16_vectorization.
+(define_insn "xvcvbf16spn_v8bf"
+ [(set (match_operand:V4SF 0 "vsx_register_operand" "=wa")
+ (unspec:V4SF [(match_operand:V8BF 1 "vsx_register_operand" "wa")]
+ UNSPEC_CVT_FP16_TO_V4SF))]
+ "TARGET_BFLOAT16_HW"
+ "xvcvbf16spn %x0,%x1"
+ [(set_attr "type" "vecperm")])
diff --git a/gcc/config/rs6000/rs6000-protos.h b/gcc/config/rs6000/rs6000-protos.h
index e507562ab8d..2f41a9ae182 100644
--- a/gcc/config/rs6000/rs6000-protos.h
+++ b/gcc/config/rs6000/rs6000-protos.h
@@ -258,6 +258,19 @@ extern bool vec_const_128bit_to_bytes (rtx, machine_mode,
extern unsigned constant_generates_lxvkq (vec_const_128bit_type *);
extern unsigned constant_generates_xxspltiw (vec_const_128bit_type *);
extern unsigned constant_generates_xxspltidp (vec_const_128bit_type *);
+
+/* From float16.cc. */
+/* Which bfloat16/float16 vector operation fp16_vectorization expands. */
+enum fp16_operation {
+ FP16_BINARY, /* Binary op given by the rtx_code argument. */
+ FP16_FMA, /* (a * b) + c. */
+ FP16_FMS, /* (a * b) - c. */
+ FP16_NFMA, /* - ((a * b) + c). */
+ FP16_NFMS /* - ((a * b) - c). */
+};
+
+extern void fp16_vectorization (enum rtx_code, rtx, rtx, rtx, rtx,
+ enum fp16_operation);
#endif /* RTX_CODE */
#ifdef TREE_CODE
diff --git a/gcc/config/rs6000/t-rs6000 b/gcc/config/rs6000/t-rs6000
index dc5aa87c9c7..48c455f917d 100644
--- a/gcc/config/rs6000/t-rs6000
+++ b/gcc/config/rs6000/t-rs6000
@@ -87,6 +87,10 @@ rs6000-c.o: $(srcdir)/config/rs6000/rs6000-c.cc
rs6000-builtins.h
$(COMPILE) $<
$(POSTCOMPILE)
+float16.o: $(srcdir)/config/rs6000/float16.cc
+ $(COMPILE) $<
+ $(POSTCOMPILE)
+
#$(srcdir)/config/rs6000/fusion.md: $(srcdir)/config/rs6000/genfusion.pl
# $(srcdir)/config/rs6000/genfusion.pl > $(srcdir)/config/rs6000/fusion.md
--
2.54.0
--
Michael Meissner, IBM
PO Box 98, Ayer, Massachusetts, USA, 01432
email: [email protected]