https://gcc.gnu.org/g:5b6529709f13057c05a80d818b7e5f6321949568
commit 5b6529709f13057c05a80d818b7e5f6321949568 Author: Michael Meissner <[email protected]> Date: Thu Nov 6 00:27:31 2025 -0500 Revert changes Diff: --- gcc/ChangeLog.test | 30 +- gcc/config.gcc | 19 + gcc/config/rs6000/float16.md | 1139 +++++++++++++++++++++++++++++++++++++ gcc/config/rs6000/predicates.md | 76 +++ gcc/config/rs6000/rs6000-c.cc | 4 + gcc/config/rs6000/rs6000-call.cc | 34 +- gcc/config/rs6000/rs6000-cpus.def | 13 +- gcc/config/rs6000/rs6000-protos.h | 15 + gcc/config/rs6000/t-rs6000 | 4 + 9 files changed, 1325 insertions(+), 9 deletions(-) diff --git a/gcc/ChangeLog.test b/gcc/ChangeLog.test index f47c02f47d9e..79b0fb7849d3 100644 --- a/gcc/ChangeLog.test +++ b/gcc/ChangeLog.test @@ -44,7 +44,35 @@ gcc/ Define __FLOAT16_HW__ if we have hardware support for _Float16 conversions. -==================== Branch work223-test, patch #430 patch #460 was reverted ==================== +==================== Branch work223-test, patch #430 patch #460 ==================== + +Add __bfloat16 optimizations. + +2025-11-05 Michael Meissner <[email protected]> + +gcc/ + + * config/rs6000/float16.cc (bfloat16_operation_as_v4sf): New function to + add __bfloat16 optimizations. + * config/rs6000/float16.md (VFP16): New mode iterator. + (bfloat16_binary_op_internal1): Add optimizations for __bfloat16. + (bfloat16_binary_op_internal2): Likewise. + (bfloat16_fma_internal1): Likewise. + (bfloat16_fma_internal2): Likewise. + (bfloat16_fms_internal1): Likewise. + (bfloat16_fms_internal2): Likewise. + (bfloat16_nfma_internal1): Likewise. + (bfloat16_nfma_internal2): Likewise. + (bfloat16_nfms_internal1): Likewise. + (bfloat16_nfms_internal2): Likewise. + (bfloat16_nfms_internal3): Likewise. + (__bfloat16 peephole): New peephole. + * config/rs6000/predicates.md (fp16_reg_or_constant_operand): New + predicate. + (bfloat16_v4sf_operand): Likewise. + (bfloat16_bf_operand): Likewise. + * config/rs6000/rs6000-protos.h (bfloat16_operation_as_v4sf): New + declaration. 
==================== Branch work223-test, patch #430 patch #459 ==================== diff --git a/gcc/config.gcc b/gcc/config.gcc index a753c018ae1c..ad8f0bd4e26c 100644 --- a/gcc/config.gcc +++ b/gcc/config.gcc @@ -533,6 +533,7 @@ powerpc*-*-*) extra_objs="rs6000-string.o rs6000-p8swap.o rs6000-logue.o" extra_objs="${extra_objs} rs6000-call.o rs6000-pcrel-opt.o" extra_objs="${extra_objs} rs6000-builtins.o rs6000-builtin.o" + extra_objs="${extra_objs} float16.o" extra_headers="ppc-asm.h altivec.h htmintrin.h htmxlintrin.h" extra_headers="${extra_headers} bmi2intrin.h bmiintrin.h" extra_headers="${extra_headers} xmmintrin.h mm_malloc.h emmintrin.h" @@ -5798,6 +5799,24 @@ case "${target}" in elif test x$with_long_double_format = xibm; then tm_defines="${tm_defines} TARGET_IEEEQUAD_DEFAULT=0" fi + + # Test if we should enable 16-bit floating point on the platforms + # where we can support __bfloat16 and _Float16. + if test x$with_powerpc_float16 = xyes; then + tm_defines="${tm_defines} POWERPC_FLOAT16_DEFAULT=1" + + elif test x$with_powerpc_float16 = xno; then + tm_defines="${tm_defines} POWERPC_FLOAT16_DEFAULT=0" + fi + + # Test if we should disable the warning about passing + # and returning 16-bit floating point values. 
+ if test x$with_powerpc_float16_disable_warning = xyes; then + tm_defines="${tm_defines} POWERPC_FLOAT16_DISABLE_WARNING=1" + + elif test x$with_powerpc_float16_disable_warning = xno; then + tm_defines="${tm_defines} POWERPC_FLOAT16_DISABLE_WARNING=0" + fi ;; s390*-*-*) diff --git a/gcc/config/rs6000/float16.md b/gcc/config/rs6000/float16.md index fec4bb87fd09..6060746cc4ca 100644 --- a/gcc/config/rs6000/float16.md +++ b/gcc/config/rs6000/float16.md @@ -25,6 +25,28 @@ (define_mode_iterator FP16 [(BF "TARGET_FLOAT16") (HF "TARGET_FLOAT16")]) +(define_mode_iterator VFP16 [(V8BF "TARGET_BFLOAT16") + (V8HF "TARGET_FLOAT16")]) + +;; Mode iterator for 16-bit floating point modes on machines with +;; hardware support both as a scalar and as a vector. +(define_mode_iterator FP16_HW [(BF "TARGET_BFLOAT16_HW") + (HF "TARGET_FLOAT16_HW")]) + +(define_mode_iterator VFP16_HW [(V8BF "TARGET_BFLOAT16_HW") + (V8HF "TARGET_FLOAT16_HW")]) + +;; Mode iterator for floating point modes other than SF/DFmode that we +;; convert to/from _Float16 (HFmode) via DFmode. +(define_mode_iterator fp16_float_convert [TF KF IF SD DD TD]) + +;; Mode attribute giving the instruction to convert the even +;; V8HFmode or V8BFmode elements to V4SFmode +(define_mode_attr cvt_fp16_to_v4sf_insn [(BF "xvcvbf16spn") + (HF "xvcvhpsp") + (V8BF "xvcvbf16spn") + (V8HF "xvcvhpsp")]) + ;; Mode attribute giving the vector mode for a 16-bit floating point ;; scalar in both upper and lower case. (define_mode_attr FP16_VECTOR8 [(BF "V8BF") @@ -32,6 +54,35 @@ (define_mode_attr fp16_vector8 [(BF "v8bf") (HF "v8hf")]) + +;; Mode attribute giving the vector mode with 4 16-bit floating point +;; elements given a scalar or 8 element vector. +(define_mode_attr FP16_VECTOR4 [(BF "V4BF") + (HF "V4HF") + (V8BF "V4BF") + (V8HF "V4HF")]) + +;; Binary operators for bfloat16/float16 vectorization. 
+(define_code_iterator FP16_BINARY_OP [plus minus mult smax smin]) + +;; Standard names for the unary/binary/ternary operators +(define_code_attr fp16_names [(abs "abs") + (fma "fma") + (plus "add") + (minus "sub") + (mult "mul") + (neg "neg") + (smax "smax") + (smin "smin")]) + +;; UNSPEC constants +(define_c_enum "unspec" + [UNSPEC_FP16_SHIFT_LEFT_32BIT + UNSPEC_CVT_FP16_TO_V4SF + UNSPEC_XXSPLTW_FP16 + UNSPEC_XVCVSPBF16_BF + UNSPEC_XVCVSPHP_V8HF + UNSPEC_XVCVSPBF16_V8BF]) ;; _Float16 and __bfloat16 moves (define_expand "mov<mode>" @@ -122,3 +173,1091 @@ } [(set_attr "type" "veclogical,vecperm") (set_attr "prefixed" "*,yes")]) + +;; Convert IEEE 16-bit floating point to/from other floating point modes. + +(define_insn "extendhf<mode>2" + [(set (match_operand:SFDF 0 "vsx_register_operand" "=wa") + (float_extend:SFDF + (match_operand:HF 1 "vsx_register_operand" "wa")))] + "TARGET_FLOAT16_HW" + "xscvhpdp %x0,%x1" + [(set_attr "type" "fpsimple")]) + +(define_insn "trunc<mode>hf2" + [(set (match_operand:HF 0 "vsx_register_operand" "=wa") + (float_truncate:HF + (match_operand:SFDF 1 "vsx_register_operand" "wa")))] + "TARGET_FLOAT16_HW" + "xscvdphp %x0,%x1" + [(set_attr "type" "fpsimple")]) + +;; Convert BFmode to SFmode/DFmode. +;; 3 instructions are generated: +;; VSPLTH -- duplicate BFmode into all elements +;; XVCVBF16SPN -- convert even BFmode elements to SFmode +;; XSCVSPNDP -- convert memory format of SFmode to DFmode. +(define_insn_and_split "extendbf<mode>2" + [(set (match_operand:SFDF 0 "vsx_register_operand" "=wa") + (float_extend:SFDF + (match_operand:BF 1 "vsx_register_operand" "v"))) + (clobber (match_scratch:V8BF 2 "=v"))] + "TARGET_BFLOAT16_HW" + "#" + "&& 1" + [(pc)] +{ + rtx op0 = operands[0]; + rtx op1 = operands[1]; + rtx op2_v8bf = operands[2]; + + if (GET_CODE (op2_v8bf) == SCRATCH) + op2_v8bf = gen_reg_rtx (V8BFmode); + + rtx op2_v4sf = gen_lowpart (V4SFmode, op2_v8bf); + + /* XXSLDWI -- shift BFmode element into the upper 32 bits. 
*/ + emit_insn (gen_v8bf_shift_left_32bit (op2_v8bf, op1)); + + /* XVCVBF16SPN -- convert even V8BFmode elements to V4SFmode. */ + emit_insn (gen_cvt_fp16_to_v4sf_v8bf (op2_v4sf, op2_v8bf)); + + /* XSCVSPDPN -- convert single V4SFmode element to DFmode. */ + emit_insn (GET_MODE (op0) == SFmode + ? gen_xscvspdpn_sf (op0, op2_v4sf) + : gen_vsx_xscvspdpn (op0, op2_v4sf)); + + DONE; +} + [(set_attr "type" "fpsimple") + (set_attr "length" "12")]) + +;; Convert a SFmode scalar represented as DFmode to elements 0 and 1 of +;; V4SFmode. +(define_insn "xscvdpspn_sf" + [(set (match_operand:V4SF 0 "vsx_register_operand" "=wa") + (unspec:V4SF [(match_operand:SF 1 "vsx_register_operand" "wa")] + UNSPEC_VSX_CVDPSPN))] + "VECTOR_UNIT_VSX_P (SFmode)" + "xscvdpspn %x0,%x1" + [(set_attr "type" "fp")]) + +;; Convert element 0 of a V4SFmode to scalar SFmode (which on the +;; PowerPC uses the DFmode encoding). +(define_insn "xscvspdpn_sf" + [(set (match_operand:SF 0 "vsx_register_operand" "=wa") + (unspec:SF [(match_operand:V4SF 1 "vsx_register_operand" "wa")] + UNSPEC_VSX_CVSPDPN))] + "TARGET_XSCVSPDPN" + "xscvspdpn %x0,%x1" + [(set_attr "type" "fp")]) + +;; Vector shift left by 32 bits to get the 16-bit floating point value +;; into the upper 32 bits for the conversion. +(define_insn "<fp16_vector8>_shift_left_32bit" + [(set (match_operand:<FP16_VECTOR8> 0 "vsx_register_operand" "=wa") + (unspec:<FP16_VECTOR8> + [(match_operand:FP16_HW 1 "vsx_register_operand" "wa")] + UNSPEC_FP16_SHIFT_LEFT_32BIT))] + "" + "xxsldwi %x0,%x1,%x1,1" + [(set_attr "type" "vecperm")]) + +;; Convert SFmode/DFmode to BFmode. 
+;; 2 instructions are generated: +;; XSCVDPSPN -- convert SFmode/DFmode scalar to V4SFmode +;; XVCVSPBF16 -- convert V4SFmode to even V8BFmode + +(define_insn_and_split "trunc<mode>bf2" + [(set (match_operand:BF 0 "vsx_register_operand" "=wa") + (float_truncate:BF + (match_operand:SFDF 1 "vsx_register_operand" "wa"))) + (clobber (match_scratch:V4SF 2 "=wa"))] + "TARGET_BFLOAT16_HW" + "#" + "&& 1" + [(pc)] +{ + rtx op0 = operands[0]; + rtx op1 = operands[1]; + rtx op2 = operands[2]; + + if (GET_CODE (op2) == SCRATCH) + op2 = gen_reg_rtx (V4SFmode); + + emit_insn (GET_MODE (op1) == SFmode + ? gen_xscvdpspn_sf (op2, op1) + : gen_vsx_xscvdpspn (op2, op1)); + + emit_insn (gen_xvcvspbf16_bf (op0, op2)); + DONE; +} + [(set_attr "type" "fpsimple")]) + +(define_insn "vsx_xscvdpspn_sf" + [(set (match_operand:V4SF 0 "vsx_register_operand" "=wa") + (unspec:V4SF [(match_operand:SF 1 "vsx_register_operand" "wa")] + UNSPEC_VSX_CVDPSPN))] + "TARGET_XSCVDPSPN" + "xscvdpspn %x0,%x1" + [(set_attr "type" "fp")]) + +;; Convert the even elements of a vector 16-bit floating point to +;; V4SFmode. Deal with little endian vs. big endian element ordering +;; in identifying which elements are converted. + +(define_expand "cvt_fp16_to_v4sf_<mode>" + [(set (match_operand:V4SF 0 "vsx_register_operand") + (float_extend:V4SF + (vec_select:<FP16_VECTOR4> + (match_operand:VFP16_HW 1 "vsx_register_operand") + (parallel [(match_dup 2) + (match_dup 3) + (match_dup 4) + (match_dup 5)]))))] + "" +{ + int endian_adjust = WORDS_BIG_ENDIAN ? 
0 : 1; + operands[2] = GEN_INT (0 + endian_adjust); + operands[3] = GEN_INT (2 + endian_adjust); + operands[4] = GEN_INT (4 + endian_adjust); + operands[5] = GEN_INT (6 + endian_adjust); +}) + +(define_insn "*cvt_fp16_to_v4sf_<mode>_le" + [(set (match_operand:V4SF 0 "vsx_register_operand") + (float_extend:V4SF + (vec_select:<FP16_VECTOR4> + (match_operand:VFP16_HW 1 "vsx_register_operand") + (parallel [(const_int 1) + (const_int 3) + (const_int 5) + (const_int 7)]))))] + "!WORDS_BIG_ENDIAN" + "<cvt_fp16_to_v4sf_insn> %x0,%x1" + [(set_attr "type" "vecfloat")]) + +(define_insn "*cvt_fp16_to_v4sf_<mode>_be" + [(set (match_operand:V4SF 0 "vsx_register_operand") + (float_extend:V4SF + (vec_select:<FP16_VECTOR4> + (match_operand:VFP16_HW 1 "vsx_register_operand") + (parallel [(const_int 0) + (const_int 2) + (const_int 4) + (const_int 6)]))))] + "WORDS_BIG_ENDIAN" + "<cvt_fp16_to_v4sf_insn> %x0,%x1" + [(set_attr "type" "vecfloat")]) + +;; Duplicate and convert a 16-bit floating point scalar to V4SFmode. + +(define_insn_and_split "*dup_<mode>_to_v4sf" + [(set (match_operand:V4SF 0 "vsx_register_operand" "=wa") + (vec_duplicate:V4SF + (float_extend:SF + (match_operand:FP16_HW 1 "vsx_register_operand" "wa"))))] + "" + "#" + "&& 1" + [(pc)] +{ + rtx op0 = operands[0]; + rtx op1 = operands[1]; + rtx op0_vfp16 = gen_lowpart (<FP16_VECTOR8>mode, op0); + + emit_insn (gen_xxspltw_<mode> (op0, op1)); + emit_insn (gen_cvt_fp16_to_v4sf_<fp16_vector8> (op0, op0_vfp16)); + DONE; +} + [(set_attr "length" "8") + (set_attr "type" "vecperm")]) + +;; Duplicate a HF/BF value so it can be used for xvcvhpspn/xvcvbf16spn. +;; Because xvcvhpspn/xvcvbf16spn only uses the even elements, we can +;; use xxspltw instead of vspltw. This has the advantage that the +;; register allocator can use any of the 64 VSX registers instead of +;; being limited to the 32 Altivec registers that VSPLTH would require. 
+ +(define_insn "xxspltw_<mode>" + [(set (match_operand:V4SF 0 "vsx_register_operand" "=wa") + (unspec:V4SF [(match_operand:FP16_HW 1 "vsx_register_operand" "wa")] + UNSPEC_XXSPLTW_FP16))] + "" + "xxspltw %x0,%x1,1" + [(set_attr "type" "vecperm")]) + +;; Convert a bfloat16 floating point scalar that has been splatted to +;; V4SFmode. + +(define_insn "xvcvbf16spn_bf" + [(set (match_operand:V4SF 0 "vsx_register_operand" "=wa") + (unspec:V4SF [(match_operand:V4SF 1 "vsx_register_operand" "wa")] + UNSPEC_CVT_FP16_TO_V4SF))] + "TARGET_BFLOAT16_HW" + "xvcvbf16spn %x0,%x1" + [(set_attr "type" "vecperm")]) + +;; Convert a V4SFmode vector to a 16-bit floating point scalar. We +;; only care about the 2nd V4SFmode element, which is the element we +;; converted the 16-bit scalar (4th element) to V4SFmode to do the +;; operation, and converted it back. + +(define_insn "xvcvspbf16_bf" + [(set (match_operand:BF 0 "vsx_register_operand" "=wa") + (unspec:BF [(match_operand:V4SF 1 "vsx_register_operand" "wa")] + UNSPEC_XVCVSPBF16_BF))] + "TARGET_BFLOAT16_HW" + "xvcvspbf16 %x0,%x1" + [(set_attr "type" "vecfloat")]) + +;; Convert between HFmode/BFmode and 128-bit binary floating point and +;; decimal floating point types. We use convert_move since some of the +;; types might not have valid RTX expanders. We use DFmode as the +;; intermediate conversion destination. 
+ +(define_expand "extend<FP16_HW:mode><fp16_float_convert:mode>2" + [(set (match_operand:fp16_float_convert 0 "vsx_register_operand") + (float_extend:fp16_float_convert + (match_operand:FP16_HW 1 "vsx_register_operand")))] + "" +{ + rtx df_tmp = gen_reg_rtx (DFmode); + emit_insn (gen_extend<FP16_HW:mode>df2 (df_tmp, operands[1])); + convert_move (operands[0], df_tmp, 0); + DONE; +}) + +(define_expand "trunc<fp16_float_convert:mode><FP16_HW:mode>2" + [(set (match_operand:FP16_HW 0 "vsx_register_operand") + (float_truncate:FP16_HW + (match_operand:fp16_float_convert 1 "vsx_register_operand")))] + "" +{ + rtx df_tmp = gen_reg_rtx (DFmode); + + convert_move (df_tmp, operands[1], 0); + emit_insn (gen_truncdf<FP16_HW:mode>2 (operands[0], df_tmp)); + DONE; +}) + +;; Convert integers to 16-bit floating point modes. +(define_expand "float<GPR:mode><FP16_HW:mode>2" + [(set (match_operand:FP16_HW 0 "vsx_register_operand") + (float:FP16_HW + (match_operand:GPR 1 "nonimmediate_operand")))] + "" +{ + rtx df_tmp = gen_reg_rtx (DFmode); + emit_insn (gen_float<GPR:mode>df2 (df_tmp, operands[1])); + emit_insn (gen_truncdf<FP16_HW:mode>2 (operands[0], df_tmp)); + DONE; +}) + +(define_expand "floatuns<GPR:mode><FP16_HW:mode>2" + [(set (match_operand:FP16_HW 0 "vsx_register_operand") + (unsigned_float:FP16_HW + (match_operand:GPR 1 "nonimmediate_operand")))] + "" +{ + rtx df_tmp = gen_reg_rtx (DFmode); + emit_insn (gen_floatuns<GPR:mode>df2 (df_tmp, operands[1])); + emit_insn (gen_truncdf<FP16_HW:mode>2 (operands[0], df_tmp)); + DONE; +}) + +;; Convert 16-bit floating point modes to integers +(define_expand "fix_trunc<FP16_HW:mode><GPR:mode>2" + [(set (match_operand:GPR 0 "vsx_register_operand") + (fix:GPR + (match_operand:FP16_HW 1 "vsx_register_operand")))] + "" +{ + rtx df_tmp = gen_reg_rtx (DFmode); + emit_insn (gen_extend<FP16_HW:mode>df2 (df_tmp, operands[1])); + emit_insn (gen_fix_truncdf<GPR:mode>2 (operands[0], df_tmp)); + DONE; +}) + +(define_expand 
"fixuns_trunc<FP16_HW:mode><GPR:mode>2" + [(set (match_operand:GPR 0 "vsx_register_operand") + (unsigned_fix:GPR + (match_operand:FP16_HW 1 "vsx_register_operand")))] + "" +{ + rtx df_tmp = gen_reg_rtx (DFmode); + emit_insn (gen_extend<FP16_HW:mode>df2 (df_tmp, operands[1])); + emit_insn (gen_fixuns_truncdf<GPR:mode>2 (operands[0], df_tmp)); + DONE; +}) + +;; Negate 16-bit floating point by XOR with -0.0. + +(define_insn_and_split "neg<mode>2" + [(set (match_operand:FP16 0 "gpc_reg_operand" "=wa,?wr") + (neg:FP16 (match_operand:FP16 1 "gpc_reg_operand" "wa,wr"))) + (clobber (match_scratch:FP16 2 "=&wa,&r"))] + "" + "#" + "&& 1" + [(set (match_dup 2) + (match_dup 3)) + (set (match_dup 0) + (xor:FP16 (match_dup 1) + (match_dup 2)))] +{ + if (GET_CODE (operands[2]) == SCRATCH) + operands[2] = gen_reg_rtx (<MODE>mode); + + REAL_VALUE_TYPE dconst; + + gcc_assert (real_from_string (&dconst, "-0.0") == 0); + + rtx rc = const_double_from_real_value (dconst, <MODE>mode); + if (!TARGET_PREFIXED) + rc = force_const_mem (<MODE>mode, rc); + + operands[3] = rc; +} + [(set_attr "type" "veclogical,integer") + (set_attr "length" "16")]) + +;; 16-bit floating point absolute value + +(define_insn_and_split "abs<mode>2" + [(set (match_operand:FP16 0 "gpc_reg_operand" "=wa,?wr") + (abs:FP16 + (match_operand:FP16 1 "gpc_reg_operand" "wa,wr"))) + (clobber (match_scratch:FP16 2 "=&wa,&r"))] + "" + "#" + "&& 1" + [(set (match_dup 2) + (match_dup 3)) + (set (match_dup 0) + (and:FP16 (match_dup 1) + (not:FP16 (match_dup 2))))] +{ + if (GET_CODE (operands[2]) == SCRATCH) + operands[2] = gen_reg_rtx (<MODE>mode); + + REAL_VALUE_TYPE dconst; + + gcc_assert (real_from_string (&dconst, "-0.0") == 0); + + rtx rc = const_double_from_real_value (dconst, <MODE>mode); + + if (!TARGET_PREFIXED) + rc = force_const_mem (<MODE>mode, rc); + + operands[3] = rc; +} + [(set_attr "type" "veclogical,integer") + (set_attr "length" "16")]) + +;; 16-bit negative floating point absolute value + 
+(define_insn_and_split "*nabs<mode>2" + [(set (match_operand:FP16 0 "gpc_reg_operand" "=wa,?wr") + (neg:FP16 + (abs:FP16 + (match_operand:FP16 1 "gpc_reg_operand" "wa,wr")))) + (clobber (match_scratch:FP16 2 "=&wa,&r"))] + "" + "#" + "&& 1" + [(set (match_dup 2) + (match_dup 3)) + (set (match_dup 0) + (ior:FP16 (match_dup 1) + (match_dup 2)))] +{ + if (GET_CODE (operands[2]) == SCRATCH) + operands[2] = gen_reg_rtx (<MODE>mode); + + REAL_VALUE_TYPE dconst; + + gcc_assert (real_from_string (&dconst, "-0.0") == 0); + rtx rc = const_double_from_real_value (dconst, <MODE>mode); + + if (!TARGET_PREFIXED) + rc = force_const_mem (<MODE>mode, rc); + + operands[3] = rc; +} + [(set_attr "type" "veclogical,integer") + (set_attr "length" "16")]) + +;; Add logical operations for 16-bit floating point types that are used +;; for things like negate, abs, and negative abs. Possibly in the +;; future we might need logical operators for extracting exponents and +;; mantissas. +(define_expand "and<mode>3" + [(set (match_operand:FP16 0 "gpc_reg_operand") + (and:FP16 (match_operand:FP16 1 "gpc_reg_operand") + (match_operand:FP16 2 "gpc_reg_operand")))] + "" + "") + +(define_expand "ior<mode>3" + [(set (match_operand:FP16 0 "gpc_reg_operand") + (ior:FP16 (match_operand:FP16 1 "gpc_reg_operand") + (match_operand:FP16 2 "gpc_reg_operand")))] + "" + "") + +(define_expand "xor<mode>3" + [(set (match_operand:FP16 0 "gpc_reg_operand") + (xor:FP16 (match_operand:FP16 1 "gpc_reg_operand") + (match_operand:FP16 2 "gpc_reg_operand")))] + "" + "") + +(define_expand "nor<mode>3" + [(set (match_operand:FP16 0 "gpc_reg_operand") + (and:FP16 + (not:FP16 (match_operand:FP16 1 "gpc_reg_operand")) + (not:FP16 (match_operand:FP16 2 "gpc_reg_operand"))))] + "" + "") + +(define_expand "andn<mode>3" + [(set (match_operand:FP16 0 "gpc_reg_operand") + (and:FP16 + (not:FP16 (match_operand:FP16 2 "gpc_reg_operand")) + (match_operand:FP16 1 "gpc_reg_operand")))] + "" + "") + +(define_expand "eqv<mode>3" + [(set 
(match_operand:FP16 0 "gpc_reg_operand") + (not:FP16 + (xor:FP16 (match_operand:FP16 1 "gpc_reg_operand") + (match_operand:FP16 2 "gpc_reg_operand"))))] + "" + "") + +;; Rewrite nand into canonical form +(define_expand "nand<mode>3" + [(set (match_operand:FP16 0 "gpc_reg_operand") + (ior:FP16 + (not:FP16 (match_operand:FP16 1 "gpc_reg_operand")) + (not:FP16 (match_operand:FP16 2 "gpc_reg_operand"))))] + "" + "") + +;; The canonical form is to have the negated element first, so we need to +;; reverse arguments. +(define_expand "iorn<mode>3" + [(set (match_operand:FP16 0 "gpc_reg_operand") + (ior:FP16 + (not:FP16 (match_operand:FP16 2 "gpc_reg_operand")) + (match_operand:FP16 1 "gpc_reg_operand")))] + "" + "") + +;; AND, IOR, and XOR insns. Unlike HImode operations prefer using +;; floating point/vector registers over GPRs. +(define_insn "*bool<mode>3" + [(set (match_operand:FP16 0 "gpc_reg_operand" "=wa,r") + (match_operator:FP16 3 "boolean_operator" + [(match_operand:FP16 1 "gpc_reg_operand" "wa,r") + (match_operand:FP16 2 "gpc_reg_operand" "wa,r")]))] + "" + "@ + xxl%q3 %x0,%x1,%x2 + %q3 %0,%1,%2" + [(set_attr "type" "veclogical,logical")]) + +;; ANDC, IORC, and EQV insns. +(define_insn "*boolc<mode>3" + [(set (match_operand:FP16 0 "gpc_reg_operand" "=wa,r") + (match_operator:FP16 3 "boolean_operator" + [(not:FP16 (match_operand:FP16 2 "gpc_reg_operand" "wa,r")) + (match_operand:FP16 1 "gpc_reg_operand" "wa,r")]))] + "" + "@ + xxl%q3 %x0,%x1,%x2 + %q3 %0,%1,%2" + [(set_attr "type" "veclogical,logical")]) + +(define_insn "*boolc<mode>3" + [(set (match_operand:FP16 0 "gpc_reg_operand" "=wa,r") + (match_operator:FP16 3 "boolean_operator" + [(match_operand:FP16 1 "gpc_reg_operand" "wa,r") + (not:FP16 (match_operand:FP16 2 "gpc_reg_operand" "wa,r"))]))] + "" + "@ + xxl%q3 %x0,%x1,%x2 + %q3 %0,%1,%2" + [(set_attr "type" "veclogical,logical")]) + +;; NOR and NAND insns. 
+(define_insn "*boolcc<mode>3" + [(set (match_operand:FP16 0 "gpc_reg_operand" "=wa,r") + (match_operator:FP16 3 "boolean_operator" + [(not:FP16 (match_operand:FP16 1 "gpc_reg_operand" "wa,r")) + (not:FP16 (match_operand:FP16 2 "gpc_reg_operand" "wa,r"))]))] + "" + "@ + xxl%q3 %x0,%x1,%x2 + %q3 %0,%1,%2" + [(set_attr "type" "veclogical,logical")]) + +;; Optimize __bfloat16 binary operations. Unlike _Float16 where we +;; have instructions to convert between HFmode and SFmode as scalar +;; values, with BFmode, we only have vector conversions. Thus to do: +;; +;; __bfloat16 a, b, c; +;; a = b + c; +;; +;; the GCC compiler would normally generate: +;; +;; lxsihzx 0,4,2 // load __bfloat16 value b +;; lxsihzx 12,5,2 // load __bfloat16 value c +;; xxsldwi 0,0,0,1 // shift b into bits 16..31 +;; xxsldwi 12,12,12,1 // shift c into bits 16..31 +;; xvcvbf16spn 0,0 // vector convert b into V4SFmode +;; xvcvbf16spn 12,12 // vector convert c into V4SFmode +;; xscvspdpn 0,0 // convert b into SFmode scalar +;; xscvspdpn 12,12 // convert c into SFmode scalar +;; fadds 0,0,12 // add b+c +;; xscvdpspn 0,0 // convert b+c into SFmode memory format +;; xvcvspbf16 0,0 // convert b+c into BFmode memory format +;; stxsihx 0,3,2 // store b+c +;; +;; Using the following combiner patterns, the code generated would now +;; be: +;; +;; lxsihzx 12,4,2 // load __bfloat16 value b +;; lxsihzx 0,5,2 // load __bfloat16 value c +;; xxspltw 12,12,1 // shift b into bits 16..31 +;; xxspltw 0,0,1 // shift c into bits 16..31 +;; xvcvbf16spn 12,12 // vector convert b into V4SFmode +;; xvcvbf16spn 0,0 // vector convert c into V4SFmode +;; xvaddsp 0,0,12 // vector b+c in V4SFmode +;; xvcvspbf16 0,0 // convert b+c into BFmode memory format +;; stxsihx 0,3,2 // store b+c +;; +;; We cannot just define insns like 'addbf3' to keep the operation as +;; BFmode because GCC will not generate these patterns unless the user +;; uses -Ofast. Without -Ofast, it will always convert BFmode into +;; SFmode. 
+ +(define_insn_and_split "*bfloat16_binary_op_internal1" + [(set (match_operand:SF 0 "vsx_register_operand") + (match_operator:SF 1 "fp16_binary_operator" + [(match_operand:SF 2 "bfloat16_v4sf_operand") + (match_operand:SF 3 "bfloat16_v4sf_operand")]))] + "TARGET_BFLOAT16_HW && can_create_pseudo_p () + && (bfloat16_bf_operand (operands[2], SFmode) + || bfloat16_bf_operand (operands[3], SFmode))" + "#" + "&& 1" + [(pc)] +{ + bfloat16_operation_as_v4sf (GET_CODE (operands[1]), operands[0], operands[2], + operands[3], NULL_RTX, FP16_BINARY); + DONE; +}) + +(define_insn_and_split "*bfloat16_binary_op_internal2" + [(set (match_operand:BF 0 "vsx_register_operand") + (float_truncate:BF + (match_operator:SF 1 "fp16_binary_operator" + [(match_operand:SF 2 "bfloat16_v4sf_operand") + (match_operand:SF 3 "bfloat16_v4sf_operand")])))] + "TARGET_BFLOAT16_HW && can_create_pseudo_p () + && (bfloat16_bf_operand (operands[2], SFmode) + || bfloat16_bf_operand (operands[3], SFmode))" + "#" + "&& 1" + [(pc)] +{ + bfloat16_operation_as_v4sf (GET_CODE (operands[1]), operands[0], operands[2], + operands[3], NULL_RTX, FP16_BINARY); + DONE; +}) + +(define_insn_and_split "*bfloat16_fma_internal1" + [(set (match_operand:SF 0 "vsx_register_operand") + (fma:SF + (match_operand:SF 1 "bfloat16_v4sf_operand") + (match_operand:SF 2 "bfloat16_v4sf_operand") + (match_operand:SF 3 "bfloat16_v4sf_operand")))] + "TARGET_BFLOAT16_HW && can_create_pseudo_p () + && (bfloat16_bf_operand (operands[1], SFmode) + + bfloat16_bf_operand (operands[2], SFmode) + + bfloat16_bf_operand (operands[3], SFmode) >= 2)" + "#" + "&& 1" + [(pc)] +{ + bfloat16_operation_as_v4sf (FMA, operands[0], operands[1], operands[2], + operands[3], FP16_FMA); + DONE; +}) + +(define_insn_and_split "*bfloat16_fma_internal2" + [(set (match_operand:BF 0 "vsx_register_operand" "=wa") + (float_truncate:BF + (fma:SF + (match_operand:SF 1 "bfloat16_v4sf_operand") + (match_operand:SF 2 "bfloat16_v4sf_operand") + (match_operand:SF 3 
"bfloat16_v4sf_operand"))))] + "TARGET_BFLOAT16_HW && can_create_pseudo_p () + && (bfloat16_bf_operand (operands[1], SFmode) + + bfloat16_bf_operand (operands[2], SFmode) + + bfloat16_bf_operand (operands[3], SFmode) >= 2)" + "#" + "&& 1" + [(pc)] +{ + bfloat16_operation_as_v4sf (FMA, operands[0], operands[1], operands[2], + operands[3], FP16_FMA); + DONE; +}) + +(define_insn_and_split "*bfloat16_fms_internal1" + [(set (match_operand:SF 0 "vsx_register_operand") + (fma:SF + (match_operand:SF 1 "bfloat16_v4sf_operand") + (match_operand:SF 2 "bfloat16_v4sf_operand") + (neg:SF + (match_operand:SF 3 "bfloat16_v4sf_operand"))))] + "TARGET_BFLOAT16_HW && can_create_pseudo_p () + && (bfloat16_bf_operand (operands[1], SFmode) + + bfloat16_bf_operand (operands[2], SFmode) + + bfloat16_bf_operand (operands[3], SFmode) >= 2)" + "#" + "&& 1" + [(pc)] +{ + bfloat16_operation_as_v4sf (FMA, operands[0], operands[1], operands[2], + operands[3], FP16_FMS); + DONE; +}) + +(define_insn_and_split "*bfloat16_fms_internal2" + [(set (match_operand:BF 0 "vsx_register_operand") + (float_truncate:BF + (fma:SF + (match_operand:SF 1 "bfloat16_v4sf_operand") + (match_operand:SF 2 "bfloat16_v4sf_operand") + (neg:SF + (match_operand:SF 3 "bfloat16_v4sf_operand")))))] + "TARGET_BFLOAT16_HW && can_create_pseudo_p () + && (bfloat16_bf_operand (operands[1], SFmode) + + bfloat16_bf_operand (operands[2], SFmode) + + bfloat16_bf_operand (operands[3], SFmode) >= 2)" + "#" + "&& 1" + [(pc)] +{ + bfloat16_operation_as_v4sf (FMA, operands[0], operands[1], operands[2], + operands[3], FP16_FMS); + DONE; +}) + +(define_insn_and_split "*bfloat16_nfma_internal1" + [(set (match_operand:SF 0 "vsx_register_operand") + (neg:SF + (fma:SF + (match_operand:SF 1 "bfloat16_v4sf_operand") + (match_operand:SF 2 "bfloat16_v4sf_operand") + (match_operand:SF 3 "bfloat16_v4sf_operand"))))] + "TARGET_BFLOAT16_HW && can_create_pseudo_p () + && (bfloat16_bf_operand (operands[1], SFmode) + + bfloat16_bf_operand (operands[2], 
SFmode) + + bfloat16_bf_operand (operands[3], SFmode) >= 2)" + "#" + "&& 1" + [(pc)] +{ + bfloat16_operation_as_v4sf (FMA, operands[0], operands[1], operands[2], + operands[3], FP16_NFMA); + DONE; +}) + +(define_insn_and_split "*bfloat16_nfma_internal2" + [(set (match_operand:BF 0 "vsx_register_operand" "=wa") + (float_truncate:BF + (neg:SF + (fma:SF + (match_operand:SF 1 "bfloat16_v4sf_operand") + (match_operand:SF 2 "bfloat16_v4sf_operand") + (match_operand:SF 3 "bfloat16_v4sf_operand")))))] + "TARGET_BFLOAT16_HW && can_create_pseudo_p () + && (bfloat16_bf_operand (operands[1], SFmode) + + bfloat16_bf_operand (operands[2], SFmode) + + bfloat16_bf_operand (operands[3], SFmode) >= 2)" + "#" + "&& 1" + [(pc)] +{ + bfloat16_operation_as_v4sf (FMA, operands[0], operands[1], operands[2], + operands[3], FP16_NFMA); + DONE; +}) + +(define_insn_and_split "*bfloat16_nfma_internal3" + [(set (match_operand:BF 0 "vsx_register_operand" "=wa") + (neg:BF + (float_truncate:BF + (fma:SF + (match_operand:SF 1 "bfloat16_v4sf_operand") + (match_operand:SF 2 "bfloat16_v4sf_operand") + (match_operand:SF 3 "bfloat16_v4sf_operand")))))] + "TARGET_BFLOAT16_HW && can_create_pseudo_p () + && (bfloat16_bf_operand (operands[1], SFmode) + + bfloat16_bf_operand (operands[2], SFmode) + + bfloat16_bf_operand (operands[3], SFmode) >= 2)" + "#" + "&& 1" + [(pc)] +{ + bfloat16_operation_as_v4sf (FMA, operands[0], operands[1], operands[2], + operands[3], FP16_NFMA); + DONE; +}) + +(define_insn_and_split "*bfloat16_nfms_internal1" + [(set (match_operand:SF 0 "vsx_register_operand") + (neg:SF + (fma:SF + (match_operand:SF 1 "bfloat16_v4sf_operand") + (match_operand:SF 2 "bfloat16_v4sf_operand") + (neg:SF + (match_operand:SF 3 "bfloat16_v4sf_operand")))))] + "TARGET_BFLOAT16_HW && can_create_pseudo_p () + && (bfloat16_bf_operand (operands[1], SFmode) + + bfloat16_bf_operand (operands[2], SFmode) + + bfloat16_bf_operand (operands[3], SFmode) >= 2)" + "#" + "&& 1" + [(pc)] +{ + bfloat16_operation_as_v4sf 
(FMA, operands[0], operands[1], operands[2], + operands[3], FP16_NFMS); + DONE; +}) + +(define_insn_and_split "*bfloat16_nfms_internal2" + [(set (match_operand:BF 0 "vsx_register_operand") + (float_truncate:BF + (neg:SF + (fma:SF + (match_operand:SF 1 "bfloat16_v4sf_operand") + (match_operand:SF 2 "bfloat16_v4sf_operand") + (neg:SF + (match_operand:SF 3 "bfloat16_v4sf_operand"))))))] + "TARGET_BFLOAT16_HW && can_create_pseudo_p () + && (bfloat16_bf_operand (operands[1], SFmode) + + bfloat16_bf_operand (operands[2], SFmode) + + bfloat16_bf_operand (operands[3], SFmode) >= 2)" + "#" + "&& 1" + [(pc)] +{ + bfloat16_operation_as_v4sf (FMA, operands[0], operands[1], operands[2], + operands[3], FP16_NFMS); + DONE; +}) + +(define_insn_and_split "*bfloat16_nfms_internal3" + [(set (match_operand:BF 0 "vsx_register_operand") + (neg:BF + (float_truncate:BF + (fma:SF + (match_operand:SF 1 "bfloat16_v4sf_operand") + (match_operand:SF 2 "bfloat16_v4sf_operand") + (neg:SF + (match_operand:SF 3 "bfloat16_v4sf_operand"))))))] + "TARGET_BFLOAT16_HW && can_create_pseudo_p () + && (bfloat16_bf_operand (operands[1], SFmode) + + bfloat16_bf_operand (operands[2], SFmode) + + bfloat16_bf_operand (operands[3], SFmode) >= 2)" + "#" + "&& 1" + [(pc)] +{ + bfloat16_operation_as_v4sf (FMA, operands[0], operands[1], operands[2], + operands[3], FP16_NFMS); + DONE; +}) + +;; Add vectorization support for 16-bit floating point. + +;; Binary operators being vectorized. +(define_insn_and_split "<fp16_names><mode>3" + [(set (match_operand:VFP16_HW 0 "vsx_register_operand") + (FP16_BINARY_OP:VFP16_HW + (match_operand:VFP16_HW 1 "vsx_register_operand") + (match_operand:VFP16_HW 2 "vsx_register_operand")))] + "can_create_pseudo_p ()" + "#" + "&& 1" + [(pc)] +{ + fp16_vectorization (<CODE>, operands[0], operands[1], operands[2], NULL_RTX, + FP16_BINARY); + DONE; +}) + +;; FMA operations being vectorized. 
+(define_insn_and_split "fma<mode>4" + [(set (match_operand:VFP16_HW 0 "vsx_register_operand") + (fma:VFP16_HW + (match_operand:VFP16_HW 1 "vsx_register_operand") + (match_operand:VFP16_HW 2 "vsx_register_operand") + (match_operand:VFP16_HW 3 "vsx_register_operand")))] + "can_create_pseudo_p ()" + "#" + "&& 1" + [(pc)] +{ + fp16_vectorization (FMA, operands[0], operands[1], operands[2], + operands[3], FP16_FMA); + DONE; +}) + +(define_insn_and_split "*fms<mode>4" + [(set (match_operand:VFP16_HW 0 "vsx_register_operand") + (fma:VFP16_HW + (match_operand:VFP16_HW 1 "vsx_register_operand") + (match_operand:VFP16_HW 2 "vsx_register_operand") + (neg:VFP16_HW + (match_operand:VFP16_HW 3 "vsx_register_operand"))))] + "can_create_pseudo_p ()" + "#" + "&& 1" + [(pc)] +{ + fp16_vectorization (FMA, operands[0], operands[1], operands[2], + operands[3], FP16_FMS); + DONE; +}) + +(define_insn_and_split "*nfma<mode>4" + [(set (match_operand:VFP16_HW 0 "vsx_register_operand") + (neg:VFP16_HW + (fma:VFP16_HW + (match_operand:VFP16_HW 1 "vsx_register_operand") + (match_operand:VFP16_HW 2 "vsx_register_operand") + (match_operand:VFP16_HW 3 "vsx_register_operand"))))] + "can_create_pseudo_p ()" + "#" + "&& 1" + [(pc)] +{ + fp16_vectorization (FMA, operands[0], operands[1], operands[2], + operands[3], FP16_NFMA); + DONE; +}) + +(define_insn_and_split "*nfms<mode>4" + [(set (match_operand:VFP16_HW 0 "vsx_register_operand") + (neg:VFP16_HW + (fma:VFP16_HW + (match_operand:VFP16_HW 1 "vsx_register_operand") + (match_operand:VFP16_HW 2 "vsx_register_operand") + (neg:VFP16_HW + (match_operand:VFP16_HW 3 "vsx_register_operand")))))] + "can_create_pseudo_p ()" + "#" + "&& 1" + [(pc)] +{ + fp16_vectorization (FMA, operands[0], operands[1], operands[2], + operands[3], FP16_NFMS); + DONE; +}) + +;; If we do multiple __bfloat16 operations, between the first and +;; second operation, GCC will want to convert the first operation from +;; V4SFmode to SFmode and then reconvert it back to V4SFmode. 
On the +;; PowerPC, this is complicated because internally in the vector +;; register, SFmode values are stored as DFmode values. +;; +;; For example, if we have: +;; +;; __bfloat16 a, b, c, d; +;; a = b + c + d; +;; +;; We would generate: +;; +;; lxsihzx 0,4,2 // load b as BFmode +;; lxsihzx 11,5,2 // load c as BFmode +;; lxsihzx 12,6,2 // load d as BFmode +;; xxspltw 0,0,1 // shift b into bits 16..31 +;; xxspltw 11,11,1 // shift c into bits 16..31 +;; xxspltw 12,12,1 // shift d into bits 16..31 +;; xvcvbf16spn 0,0 // convert b into V4SFmode +;; xvcvbf16spn 11,11 // convert c into V4SFmode +;; xvcvbf16spn 12,12 // convert d into V4SFmode +;; xvaddsp 0,0,11 // calculate b+c as V4SFmode +;; xscvspdp 0,0 // convert b+c into DFmode memory format +;; xscvdpspn 0,0 // convert b+c into SFmode memory format +;; xxspltw 0,0,0 // convert b+c into V4SFmode +;; xvaddsp 12,12,0 // calculate b+c+d as V4SFmode +;; xvcvspbf16 12,12 // convert b+c+d into BFmode memory format +;; stxsihx 12,3,2 // store b+c+d +;; +;; With this peephole2, we can eliminate the xscvspdp and xscvdpspn +;; instructions. +;; +;; We keep the xxspltw between the two xvaddsp's in case the user +;; explicitly did a SFmode extract of element 0 and did a splat +;; operation. + +(define_peephole2 + [(set (match_operand:SF 0 "vsx_register_operand") + (unspec:SF + [(match_operand:V4SF 1 "vsx_register_operand")] + UNSPEC_VSX_CVSPDP)) + (set (match_operand:V4SF 2 "vsx_register_operand") + (unspec:V4SF [(match_dup 0)] UNSPEC_VSX_CVDPSPN))] + "REGNO (operands[1]) == REGNO (operands[2]) + || peep2_reg_dead_p (1, operands[1])" + [(set (match_dup 2) (match_dup 1))]) + +;; Vector Pack support. 
+ +(define_expand "vec_pack_trunc_v4sf_v8hf" + [(match_operand:V8HF 0 "vfloat_operand") + (match_operand:V4SF 1 "vfloat_operand") + (match_operand:V4SF 2 "vfloat_operand")] + "TARGET_FLOAT16_HW" +{ + rtx r1 = gen_reg_rtx (V8HFmode); + rtx r2 = gen_reg_rtx (V8HFmode); + + emit_insn (gen_xvcvsphp_v8hf (r1, operands[1])); + emit_insn (gen_xvcvsphp_v8hf (r2, operands[2])); + rs6000_expand_extract_even (operands[0], r1, r2); + DONE; +}) + +(define_expand "vec_pack_trunc_v4sf_v8bf" + [(match_operand:V8BF 0 "vfloat_operand") + (match_operand:V4SF 1 "vfloat_operand") + (match_operand:V4SF 2 "vfloat_operand")] + "TARGET_BFLOAT16_HW" +{ + rtx r1 = gen_reg_rtx (V8BFmode); + rtx r2 = gen_reg_rtx (V8BFmode); + + emit_insn (gen_xvcvspbf16_v8bf (r1, operands[1])); + emit_insn (gen_xvcvspbf16_v8bf (r2, operands[2])); + rs6000_expand_extract_even (operands[0], r1, r2); + DONE; +}) + +;; Unfortunately the machine independent code assumes there is only one +;; 16-bit floating point type. This means we have to choose whether to +;; support packing _Float16 or __bfloat16. It looks like __bfloat16 is +;; more popular, so we choose __bfloat16 to be the default. 
+ +(define_expand "vec_pack_trunc_v4sf" + [(match_operand:V8BF 0 "vfloat_operand") + (match_operand:V4SF 1 "vfloat_operand") + (match_operand:V4SF 2 "vfloat_operand")] + "TARGET_BFLOAT16_HW" +{ + rtx r1 = gen_reg_rtx (V8BFmode); + rtx r2 = gen_reg_rtx (V8BFmode); + + emit_insn (gen_xvcvspbf16_v8bf (r1, operands[1])); + emit_insn (gen_xvcvspbf16_v8bf (r2, operands[2])); + rs6000_expand_extract_even (operands[0], r1, r2); + DONE; +}) + +;; Used for vector conversion to _Float16 +(define_insn "xvcvsphp_v8hf" + [(set (match_operand:V8HF 0 "vsx_register_operand" "=wa") + (unspec:V8HF [(match_operand:V4SF 1 "vsx_register_operand" "wa")] + UNSPEC_XVCVSPHP_V8HF))] + "TARGET_FLOAT16_HW" + "xvcvsphp %x0,%x1" +[(set_attr "type" "vecfloat")]) + +;; Used for vector conversion to __bfloat16 +(define_insn "xvcvspbf16_v8bf" + [(set (match_operand:V8BF 0 "vsx_register_operand" "=wa") + (unspec:V8BF [(match_operand:V4SF 1 "vsx_register_operand" "wa")] + UNSPEC_XVCVSPBF16_V8BF))] + "TARGET_BFLOAT16_HW" + "xvcvspbf16 %x0,%x1" + [(set_attr "type" "vecfloat")]) + +;; Vector unpack support. Given the name is for the type being +;; unpacked, we can unpack both __bfloat16 and _Float16. 
+
+;; Unpack vector _Float16
+(define_expand "vec_unpacks_hi_v8hf"
+  [(match_operand:V4SF 0 "vfloat_operand")
+   (match_operand:V8HF 1 "vfloat_operand")]
+  "TARGET_FLOAT16_HW"
+{
+  rtx reg = gen_reg_rtx (V8HFmode);
+
+  rs6000_expand_interleave (reg, operands[1], operands[1], BYTES_BIG_ENDIAN);
+  emit_insn (gen_xvcvhpsp_v8hf (operands[0], reg));
+  DONE;
+})
+
+(define_expand "vec_unpacks_lo_v8hf"
+  [(match_operand:V4SF 0 "vfloat_operand")
+   (match_operand:V8HF 1 "vfloat_operand")]
+  "TARGET_FLOAT16_HW"
+{
+  rtx reg = gen_reg_rtx (V8HFmode);
+
+  rs6000_expand_interleave (reg, operands[1], operands[1], !BYTES_BIG_ENDIAN);
+  emit_insn (gen_xvcvhpsp_v8hf (operands[0], reg));
+  DONE;
+})
+
+;; Used for vector conversion from _Float16; guarded by TARGET_FLOAT16_HW
+(define_insn "xvcvhpsp_v8hf"
+  [(set (match_operand:V4SF 0 "vsx_register_operand" "=wa")
+	(unspec:V4SF [(match_operand:V8HF 1 "vsx_register_operand" "wa")]
+		     UNSPEC_CVT_FP16_TO_V4SF))]
+  "TARGET_FLOAT16_HW"
+  "xvcvhpsp %x0,%x1"
+  [(set_attr "type" "vecperm")])
+
+;; Unpack vector __bfloat16
+(define_expand "vec_unpacks_hi_v8bf"
+  [(match_operand:V4SF 0 "vfloat_operand")
+   (match_operand:V8BF 1 "vfloat_operand")]
+  "TARGET_BFLOAT16_HW"
+{
+  rtx reg = gen_reg_rtx (V8BFmode);
+
+  rs6000_expand_interleave (reg, operands[1], operands[1], BYTES_BIG_ENDIAN);
+  emit_insn (gen_xvcvbf16spn_v8bf (operands[0], reg));
+  DONE;
+})
+
+(define_expand "vec_unpacks_lo_v8bf"
+  [(match_operand:V4SF 0 "vfloat_operand")
+   (match_operand:V8BF 1 "vfloat_operand")]
+  "TARGET_BFLOAT16_HW"
+{
+  rtx reg = gen_reg_rtx (V8BFmode);
+
+  rs6000_expand_interleave (reg, operands[1], operands[1], !BYTES_BIG_ENDIAN);
+  emit_insn (gen_xvcvbf16spn_v8bf (operands[0], reg));
+  DONE;
+})
+
+;; Used for vector conversion from __bfloat16
+(define_insn "xvcvbf16spn_v8bf"
+  [(set (match_operand:V4SF 0 "vsx_register_operand" "=wa")
+	(unspec:V4SF [(match_operand:V8BF 1 "vsx_register_operand" "wa")]
+		     UNSPEC_CVT_FP16_TO_V4SF))]
+  "TARGET_BFLOAT16_HW"
+  "xvcvbf16spn %x0,%x1"
+  [(set_attr
"type" "vecperm")]) diff --git a/gcc/config/rs6000/predicates.md b/gcc/config/rs6000/predicates.md index e9ddc61e3a8a..408caf4521f0 100644 --- a/gcc/config/rs6000/predicates.md +++ b/gcc/config/rs6000/predicates.md @@ -2192,3 +2192,79 @@ return constant_generates_xxspltiw (&vsx_const); }) + +;; Return 1 if this is a 16-bit floating point operand that can be used +;; in an add, subtract, or multiply operation that uses the vector +;; conversion function. +(define_predicate "fp16_reg_or_constant_operand" + (match_code "reg,subreg,const_double") +{ + if (REG_P (op) || SUBREG_P (op)) + return vsx_register_operand (op, mode); + + if (CONST_DOUBLE_P (op)) + return fp16_xxspltiw_constant (op, mode); + + return false; +}) + +;; Match binary operators where we convert a BFmode operand into a +;; SFmode operand so that we can optimize the BFmode operation to do +;; the operation in vector mode rather than convverting the BFmode to a +;; V8BFmode vector, converting that V8BFmode vector to V4SFmode, and +;; then converting the V4SFmode element to SFmode scalar. +(define_predicate "fp16_binary_operator" + (match_code "plus,minus,mult,smax,smin")) + +;; Match bfloat16/float operands that can be optimized to do the +;; operation in V4SFmode. 
+(define_predicate "bfloat16_v4sf_operand"
+  (match_code "reg,subreg,const_double,float_extend,float_truncate")
+{
+  if (mode != BFmode && mode != SFmode)
+    return false;
+
+  if (REG_P (op) || SUBREG_P (op))
+    return register_operand (op, mode);
+
+  if (CONST_DOUBLE_P (op))
+    return true;
+
+  if (GET_CODE (op) == FLOAT_EXTEND)
+    {
+      rtx op_arg = XEXP (op, 0);
+      return (mode == SFmode
+	      && GET_MODE (op_arg) == BFmode
+	      && (REG_P (op_arg) || SUBREG_P (op_arg)));
+    }
+
+  if (GET_CODE (op) == FLOAT_TRUNCATE)
+    {
+      rtx op_arg = XEXP (op, 0);
+      return (mode == BFmode
+	      && GET_MODE (op_arg) == SFmode
+	      && (REG_P (op_arg) || SUBREG_P (op_arg)));
+    }
+
+  return false;
+})
+
+;; Match an operand that originally was a BFmode value to prevent
+;; operations involving only SFmode values from being converted to
+;; BFmode.
+(define_predicate "bfloat16_bf_operand"
+  (match_code "reg,subreg,const_double,float_extend")
+{
+  if (mode == BFmode || GET_MODE (op) == BFmode)
+    return true;
+
+  if (mode != SFmode)
+    return false;
+
+  if (GET_MODE (op) == SFmode
+      && GET_CODE (op) == FLOAT_EXTEND
+      && GET_MODE (XEXP (op, 0)) == BFmode)
+    return true;
+
+  return false;
+})
diff --git a/gcc/config/rs6000/rs6000-c.cc b/gcc/config/rs6000/rs6000-c.cc
index b3ace1166f43..cc27c39d8d23 100644
--- a/gcc/config/rs6000/rs6000-c.cc
+++ b/gcc/config/rs6000/rs6000-c.cc
@@ -591,6 +591,10 @@ rs6000_target_modify_macros (bool define_p,
     {
       rs6000_define_or_undefine_macro (define_p, "__FLOAT16__");
       rs6000_define_or_undefine_macro (define_p, "__BFLOAT16__");
+      if ((cpu_option & CPU_OPTION_POWER9_MASK) != 0)
+	rs6000_define_or_undefine_macro (define_p, "__FLOAT16_HW__");
+      if ((cpu_option & CPU_OPTION_POWER10_MASK) != 0)
+	rs6000_define_or_undefine_macro (define_p, "__BFLOAT16_HW__");
     }
 
   /* Tell the user if we are targeting CELL.
*/ if (rs6000_cpu == PROCESSOR_CELL) diff --git a/gcc/config/rs6000/rs6000-call.cc b/gcc/config/rs6000/rs6000-call.cc index 41c0d4f71590..4d55ea2d1b36 100644 --- a/gcc/config/rs6000/rs6000-call.cc +++ b/gcc/config/rs6000/rs6000-call.cc @@ -685,17 +685,27 @@ init_cumulative_args (CUMULATIVE_ARGS *cum, tree fntype, " to enable them", "-maltivec"); } +#if !POWERPC_FLOAT16_DISABLE_WARNING /* Warn that __bfloat16 and _Float16 might be returned differently in the future. The issue is currently 16-bit floating point is returned in floating point register #1 in 16-bit format. We may or may not want to return it as a scalar 64-bit value. */ if (fntype && warn_psabi && !cum->libcall) { - machine_mode ret_mode = TYPE_MODE (TREE_TYPE (fntype)); - if (ret_mode == BFmode || ret_mode == HFmode) - warning (OPT_Wpsabi, "%s might be returned differently in the future", - ret_mode == BFmode ? "__bfloat16" : "_Float16"); + static bool warned_about_float16_return = false; + + if (!warned_about_float16_return) + { + machine_mode ret_mode = TYPE_MODE (TREE_TYPE (fntype)); + + warned_about_float16_return = true; + if (ret_mode == BFmode || ret_mode == HFmode) + warning (OPT_Wpsabi, + "%s might be returned differently in the future", + ret_mode == BFmode ? "__bfloat16" : "_Float16"); + } } +#endif } @@ -1653,13 +1663,23 @@ rs6000_function_arg (cumulative_args_t cum_v, const function_arg_info &arg) return NULL_RTX; } +#if !POWERPC_FLOAT16_DISABLE_WARNING /* Warn that _Float16 and __bfloat16 might be passed differently in the future. The issue is currently 16-bit floating point values are passed in floating point registers in the native 16-bit format. We may or may not want to pass the value it as a scalar 64-bit value. */ - if (warn_psabi && !cum->libcall && (mode == BFmode || mode == HFmode)) - warning (OPT_Wpsabi, "%s might be passed differently in the future", - mode == BFmode ? 
"__bfloat16" : "_Float16"); + if (warn_psabi && !cum->libcall && FP16_SCALAR_MODE_P (mode)) + { + static bool warned_about_float16_call = false; + + if (!warned_about_float16_call) + { + warned_about_float16_call = true; + warning (OPT_Wpsabi, "%s might be passed differently in the future", + mode == BFmode ? "__bfloat16" : "_Float16"); + } + } +#endif /* Return a marker to indicate whether CR1 needs to set or clear the bit that V.4 uses to say fp args were passed in registers. diff --git a/gcc/config/rs6000/rs6000-cpus.def b/gcc/config/rs6000/rs6000-cpus.def index 233f01e9c615..0add89c28792 100644 --- a/gcc/config/rs6000/rs6000-cpus.def +++ b/gcc/config/rs6000/rs6000-cpus.def @@ -85,6 +85,15 @@ | OPTION_MASK_ALTIVEC \ | OPTION_MASK_VSX) +/* Determine whether to enable 16-bit floating point types on power8 systems + and above. */ +#if POWERPC_FLOAT16_DEFAULT +#define TARGET_16BIT_FLOATING_POINT OPTION_MASK_FLOAT16 + +#else +#define TARGET_16BIT_FLOATING_POINT 0 +#endif + /* For now, don't provide an embedded version of ISA 2.07. Do not set power8 fusion here, instead set it in rs6000.cc if we are tuning for a power8 system. */ @@ -93,7 +102,8 @@ | OPTION_MASK_CRYPTO \ | OPTION_MASK_EFFICIENT_UNALIGNED_VSX \ | OPTION_MASK_QUAD_MEMORY \ - | OPTION_MASK_QUAD_MEMORY_ATOMIC) + | OPTION_MASK_QUAD_MEMORY_ATOMIC \ + | TARGET_16BIT_FLOATING_POINT) /* ISA masks setting fusion options. 
*/ #define OTHER_FUSION_MASKS (OPTION_MASK_P8_FUSION \ @@ -160,6 +170,7 @@ | OPTION_MASK_EFFICIENT_UNALIGNED_VSX \ | OPTION_MASK_FLOAT128_HW \ | OPTION_MASK_FLOAT128_KEYWORD \ + | OPTION_MASK_FLOAT16 \ | OPTION_MASK_FPRND \ | OPTION_MASK_P10_FUSION \ | OPTION_MASK_HTM \ diff --git a/gcc/config/rs6000/rs6000-protos.h b/gcc/config/rs6000/rs6000-protos.h index 9bf971370d41..001dc1fc7f4b 100644 --- a/gcc/config/rs6000/rs6000-protos.h +++ b/gcc/config/rs6000/rs6000-protos.h @@ -258,6 +258,21 @@ extern bool vec_const_128bit_to_bytes (rtx, machine_mode, extern unsigned constant_generates_lxvkq (vec_const_128bit_type *); extern unsigned constant_generates_xxspltiw (vec_const_128bit_type *); extern unsigned constant_generates_xxspltidp (vec_const_128bit_type *); + +/* From float16.cc. */ +/* Optimize bfloat16 and float16 operations. */ +enum fp16_operation { + FP16_BINARY, /* Bfloat16/float16 binary op. */ + FP16_FMA, /* (a * b) + c. */ + FP16_FMS, /* (a * b) - c. */ + FP16_NFMA, /* - ((a * b) + c). */ + FP16_NFMS /* - ((a * b) - c). */ +}; + +extern void fp16_vectorization (enum rtx_code, rtx, rtx, rtx, rtx, + enum fp16_operation); +extern void bfloat16_operation_as_v4sf (enum rtx_code, rtx, rtx, rtx, rtx, + enum fp16_operation); #endif /* RTX_CODE */ #ifdef TREE_CODE diff --git a/gcc/config/rs6000/t-rs6000 b/gcc/config/rs6000/t-rs6000 index a5d1c27424f3..c8f19865311c 100644 --- a/gcc/config/rs6000/t-rs6000 +++ b/gcc/config/rs6000/t-rs6000 @@ -87,6 +87,10 @@ rs6000-c.o: $(srcdir)/config/rs6000/rs6000-c.cc rs6000-builtins.h $(COMPILE) $< $(POSTCOMPILE) +float16.o: $(srcdir)/config/rs6000/float16.cc + $(COMPILE) $< + $(POSTCOMPILE) + #$(srcdir)/config/rs6000/fusion.md: $(srcdir)/config/rs6000/genfusion.pl # $(srcdir)/config/rs6000/genfusion.pl > $(srcdir)/config/rs6000/fusion.md
