https://gcc.gnu.org/g:4590846fd98e79d3f5e952e705ea806aa91cfc2d
commit 4590846fd98e79d3f5e952e705ea806aa91cfc2d Author: Michael Meissner <[email protected]> Date: Thu Nov 6 00:01:48 2025 -0500 Revert changes Diff: --- gcc/config/rs6000/float16.cc | 150 ----------------- gcc/config/rs6000/float16.md | 333 -------------------------------------- gcc/config/rs6000/predicates.md | 76 --------- gcc/config/rs6000/rs6000-protos.h | 2 - 4 files changed, 561 deletions(-) diff --git a/gcc/config/rs6000/float16.cc b/gcc/config/rs6000/float16.cc index 2c7b6278a16a..5274a0df962f 100644 --- a/gcc/config/rs6000/float16.cc +++ b/gcc/config/rs6000/float16.cc @@ -183,153 +183,3 @@ fp16_vectorization (enum rtx_code icode, return; } - -/* Expand a bfloat16 scalar floating point operation: - - ICODE: Operation to perform. - RESULT: Result of the operation. - OP1: Input operand1. - OP2: Input operand2. - OP3: Input operand3 or NULL_RTX. - SUBTYPE: Describe the operation. - - The operation is done as a V4SFmode vector operation. This is because - converting BFmode from a scalar BFmode to SFmode to do the operation and - back again takes quite a bit of time. GCC will only generate the native - operation if -Ofast is used. The float16.md code that calls this function - adds various combine operations to do the operation in V4SFmode instead of - SFmode. */ - -void -bfloat16_operation_as_v4sf (enum rtx_code icode, - rtx result, - rtx op1, - rtx op2, - rtx op3, - enum fp16_operation subtype) -{ - gcc_assert (can_create_pseudo_p ()); - - rtx result_v4sf = gen_reg_rtx (V4SFmode); - rtx ops_orig[3] = { op1, op2, op3 }; - rtx ops_v4sf[3]; - size_t n_opts; - - switch (subtype) - { - case FP16_BINARY: - n_opts = 2; - gcc_assert (op3 == NULL_RTX); - break; - - case FP16_FMA: - case FP16_FMS: - case FP16_NFMA: - case FP16_NFMS: - gcc_assert (icode == FMA); - n_opts = 3; - break; - - default: - gcc_unreachable (); - } - - for (size_t i = 0; i < n_opts; i++) - { - rtx op = ops_orig[i]; - rtx tmp = ops_v4sf[i] = gen_reg_rtx (V4SFmode); - - gcc_assert (op != NULL_RTX); - - /* Remove truncation/extend added. */ - if (GET_CODE (op) == FLOAT_EXTEND || GET_CODE (op) == FLOAT_TRUNCATE) - op = XEXP (op, 0); - - /* Convert operands to V4SFmode format. We use SPLAT for registers to - get the value into the upper 32-bits. We can use XXSPLTW to splat - words instead of VSPLTIH since the XVCVBF16SPN instruction ignores the - odd half-words, and XXSPLTW can operate on all VSX registers instead - of just the Altivec registers. Using SPLAT instead of a shift also - insure that other bits are not a signalling NaN. If we are using - XXSPLTIW or XXSPLTIB to load the constant the other bits are - duplicated. */ - - if (op == CONST0_RTX (SFmode) || op == CONST0_RTX (BFmode)) - emit_move_insn (tmp, CONST0_RTX (V4SFmode)); - - else if (GET_MODE (op) == BFmode) - { - emit_insn (gen_xxspltw_bf (tmp, force_reg (BFmode, op))); - emit_insn (gen_xvcvbf16spn_bf (tmp, tmp)); - } - - else if (GET_MODE (op) == SFmode) - { - if (GET_CODE (op) == CONST_DOUBLE) - { - rtvec v = rtvec_alloc (4); - - for (size_t i = 0; i < 4; i++) - RTVEC_ELT (v, i) = op; - - emit_insn (gen_rtx_SET (tmp, - gen_rtx_CONST_VECTOR (V4SFmode, v))); - } - - else - emit_insn (gen_vsx_splat_v4sf (tmp, - force_reg (SFmode, op))); - } - - else - gcc_unreachable (); - } - - /* Do the operation in V4SFmode. */ - switch (subtype) - { - case FP16_BINARY: - emit_insn (gen_rtx_SET (result_v4sf, - gen_rtx_fmt_ee (icode, V4SFmode, - ops_v4sf[0], - ops_v4sf[1]))); - break; - - case FP16_FMA: - case FP16_FMS: - case FP16_NFMA: - case FP16_NFMS: - { - rtx op1 = ops_v4sf[0]; - rtx op2 = ops_v4sf[1]; - rtx op3 = ops_v4sf[2]; - - if (subtype == FP16_FMS || subtype == FP16_NFMS) - op3 = gen_rtx_NEG (V4SFmode, op3); - - rtx op_fma = gen_rtx_FMA (V4SFmode, op1, op2, op3); - - if (subtype == FP16_NFMA || subtype == FP16_NFMS) - op_fma = gen_rtx_NEG (V4SFmode, op_fma); - - emit_insn (gen_rtx_SET (result_v4sf, op_fma)); - } - break; - - default: - gcc_unreachable (); - } - - /* Convert V4SF result back to scalar mode. */ - if (GET_MODE (result) == BFmode) - emit_insn (gen_xvcvspbf16_bf (result, result_v4sf)); - - else if (GET_MODE (result) == SFmode) - { - rtx element = GEN_INT (WORDS_BIG_ENDIAN ? 2 : 3); - emit_insn (gen_vsx_extract_v4sf (result, result_v4sf, element)); - } - - else - gcc_unreachable (); -} diff --git a/gcc/config/rs6000/float16.md b/gcc/config/rs6000/float16.md index 6060746cc4ca..55cca8fa7e19 100644 --- a/gcc/config/rs6000/float16.md +++ b/gcc/config/rs6000/float16.md @@ -25,9 +25,6 @@ (define_mode_iterator FP16 [(BF "TARGET_FLOAT16") (HF "TARGET_FLOAT16")]) -(define_mode_iterator VFP16 [(V8BF "TARGET_BFLOAT16") - (V8HF "TARGET_FLOAT16")]) - ;; Mode iterator for 16-bit floating point modes on machines with ;; hardware support both as a scalar and as a vector. (define_mode_iterator FP16_HW [(BF "TARGET_BFLOAT16_HW") @@ -704,288 +701,6 @@ %q3 %0,%1,%2" [(set_attr "type" "veclogical,logical")]) -;; Optimize __bfloat16 binary operations. Unlike _Float16 where we -;; have instructions to convert between HFmode and SFmode as scalar -;; values, with BFmode, we only have vector conversions. Thus to do: -;; -;; __bfloat16 a, b, c; -;; a = b + c; -;; -;; the GCC compiler would normally generate: -;; -;; lxsihzx 0,4,2 // load __bfloat16 value b -;; lxsihzx 12,5,2 // load __bfloat16 value c -;; xxsldwi 0,0,0,1 // shift b into bits 16..31 -;; xxsldwi 12,12,12,1 // shift c into bits 16..31 -;; xvcvbf16spn 0,0 // vector convert b into V4SFmode -;; xvcvbf16spn 12,12 // vector convert c into V4SFmode -;; xscvspdpn 0,0 // convert b into SFmode scalar -;; xscvspdpn 12,12 // convert c into SFmode scalar -;; fadds 0,0,12 // add b+c -;; xscvdpspn 0,0 // convert b+c into SFmode memory format -;; xvcvspbf16 0,0 // convert b+c into BFmode memory format -;; stxsihx 0,3,2 // store b+c -;; -;; Using the following combiner patterns, the code generated would now -;; be: -;; -;; lxsihzx 12,4,2 // load __bfloat16 value b -;; lxsihzx 0,5,2 // load __bfloat16 value c -;; xxspltw 12,12,1 // shift b into bits 16..31 -;; xxspltw 0,0,1 // shift c into bits 16..31 -;; xvcvbf16spn 12,12 // vector convert b into V4SFmode -;; xvcvbf16spn 0,0 // vector convert c into V4SFmode -;; xvaddsp 0,0,12 // vector b+c in V4SFmode -;; xvcvspbf16 0,0 // convert b+c into BFmode memory format -;; stxsihx 0,3,2 // store b+c -;; -;; We cannot just define insns like 'addbf3' to keep the operation as -;; BFmode because GCC will not generate these patterns unless the user -;; uses -Ofast. Without -Ofast, it will always convert BFmode into -;; SFmode. - -(define_insn_and_split "*bfloat16_binary_op_internal1" - [(set (match_operand:SF 0 "vsx_register_operand") - (match_operator:SF 1 "fp16_binary_operator" - [(match_operand:SF 2 "bfloat16_v4sf_operand") - (match_operand:SF 3 "bfloat16_v4sf_operand")]))] - "TARGET_BFLOAT16_HW && can_create_pseudo_p () - && (bfloat16_bf_operand (operands[2], SFmode) - || bfloat16_bf_operand (operands[3], SFmode))" - "#" - "&& 1" - [(pc)] -{ - bfloat16_operation_as_v4sf (GET_CODE (operands[1]), operands[0], operands[2], - operands[3], NULL_RTX, FP16_BINARY); - DONE; -}) - -(define_insn_and_split "*bfloat16_binary_op_internal2" - [(set (match_operand:BF 0 "vsx_register_operand") - (float_truncate:BF - (match_operator:SF 1 "fp16_binary_operator" - [(match_operand:SF 2 "bfloat16_v4sf_operand") - (match_operand:SF 3 "bfloat16_v4sf_operand")])))] - "TARGET_BFLOAT16_HW && can_create_pseudo_p () - && (bfloat16_bf_operand (operands[2], SFmode) - || bfloat16_bf_operand (operands[3], SFmode))" - "#" - "&& 1" - [(pc)] -{ - bfloat16_operation_as_v4sf (GET_CODE (operands[1]), operands[0], operands[2], - operands[3], NULL_RTX, FP16_BINARY); - DONE; -}) - -(define_insn_and_split "*bfloat16_fma_internal1" - [(set (match_operand:SF 0 "vsx_register_operand") - (fma:SF - (match_operand:SF 1 "bfloat16_v4sf_operand") - (match_operand:SF 2 "bfloat16_v4sf_operand") - (match_operand:SF 3 "bfloat16_v4sf_operand")))] - "TARGET_BFLOAT16_HW && can_create_pseudo_p () - && (bfloat16_bf_operand (operands[1], SFmode) - + bfloat16_bf_operand (operands[2], SFmode) - + bfloat16_bf_operand (operands[3], SFmode) >= 2)" - "#" - "&& 1" - [(pc)] -{ - bfloat16_operation_as_v4sf (FMA, operands[0], operands[1], operands[2], - operands[3], FP16_FMA); - DONE; -}) - -(define_insn_and_split "*bfloat16_fma_internal2" - [(set (match_operand:BF 0 "vsx_register_operand" "=wa") - (float_truncate:BF - (fma:SF - (match_operand:SF 1 "bfloat16_v4sf_operand") - (match_operand:SF 2 "bfloat16_v4sf_operand") - (match_operand:SF 3 "bfloat16_v4sf_operand"))))] - "TARGET_BFLOAT16_HW && can_create_pseudo_p () - && (bfloat16_bf_operand (operands[1], SFmode) - + bfloat16_bf_operand (operands[2], SFmode) - + bfloat16_bf_operand (operands[3], SFmode) >= 2)" - "#" - "&& 1" - [(pc)] -{ - bfloat16_operation_as_v4sf (FMA, operands[0], operands[1], operands[2], - operands[3], FP16_FMA); - DONE; -}) - -(define_insn_and_split "*bfloat16_fms_internal1" - [(set (match_operand:SF 0 "vsx_register_operand") - (fma:SF - (match_operand:SF 1 "bfloat16_v4sf_operand") - (match_operand:SF 2 "bfloat16_v4sf_operand") - (neg:SF - (match_operand:SF 3 "bfloat16_v4sf_operand"))))] - "TARGET_BFLOAT16_HW && can_create_pseudo_p () - && (bfloat16_bf_operand (operands[1], SFmode) - + bfloat16_bf_operand (operands[2], SFmode) - + bfloat16_bf_operand (operands[3], SFmode) >= 2)" - "#" - "&& 1" - [(pc)] -{ - bfloat16_operation_as_v4sf (FMA, operands[0], operands[1], operands[2], - operands[3], FP16_FMS); - DONE; -}) - -(define_insn_and_split "*bfloat16_fms_internal2" - [(set (match_operand:BF 0 "vsx_register_operand") - (float_truncate:BF - (fma:SF - (match_operand:SF 1 "bfloat16_v4sf_operand") - (match_operand:SF 2 "bfloat16_v4sf_operand") - (neg:SF - (match_operand:SF 3 "bfloat16_v4sf_operand")))))] - "TARGET_BFLOAT16_HW && can_create_pseudo_p () - && (bfloat16_bf_operand (operands[1], SFmode) - + bfloat16_bf_operand (operands[2], SFmode) - + bfloat16_bf_operand (operands[3], SFmode) >= 2)" - "#" - "&& 1" - [(pc)] -{ - bfloat16_operation_as_v4sf (FMA, operands[0], operands[1], operands[2], - operands[3], FP16_FMS); - DONE; -}) - -(define_insn_and_split "*bfloat16_nfma_internal1" - [(set (match_operand:SF 0 "vsx_register_operand") - (neg:SF - (fma:SF - (match_operand:SF 1 "bfloat16_v4sf_operand") - (match_operand:SF 2 "bfloat16_v4sf_operand") - (match_operand:SF 3 "bfloat16_v4sf_operand"))))] - "TARGET_BFLOAT16_HW && can_create_pseudo_p () - && (bfloat16_bf_operand (operands[1], SFmode) - + bfloat16_bf_operand (operands[2], SFmode) - + bfloat16_bf_operand (operands[3], SFmode) >= 2)" - "#" - "&& 1" - [(pc)] -{ - bfloat16_operation_as_v4sf (FMA, operands[0], operands[1], operands[2], - operands[3], FP16_NFMA); - DONE; -}) - -(define_insn_and_split "*bfloat16_nfma_internal2" - [(set (match_operand:BF 0 "vsx_register_operand" "=wa") - (float_truncate:BF - (neg:SF - (fma:SF - (match_operand:SF 1 "bfloat16_v4sf_operand") - (match_operand:SF 2 "bfloat16_v4sf_operand") - (match_operand:SF 3 "bfloat16_v4sf_operand")))))] - "TARGET_BFLOAT16_HW && can_create_pseudo_p () - && (bfloat16_bf_operand (operands[1], SFmode) - + bfloat16_bf_operand (operands[2], SFmode) - + bfloat16_bf_operand (operands[3], SFmode) >= 2)" - "#" - "&& 1" - [(pc)] -{ - bfloat16_operation_as_v4sf (FMA, operands[0], operands[1], operands[2], - operands[3], FP16_NFMA); - DONE; -}) - -(define_insn_and_split "*bfloat16_nfma_internal3" - [(set (match_operand:BF 0 "vsx_register_operand" "=wa") - (neg:BF - (float_truncate:BF - (fma:SF - (match_operand:SF 1 "bfloat16_v4sf_operand") - (match_operand:SF 2 "bfloat16_v4sf_operand") - (match_operand:SF 3 "bfloat16_v4sf_operand")))))] - "TARGET_BFLOAT16_HW && can_create_pseudo_p () - && (bfloat16_bf_operand (operands[1], SFmode) - + bfloat16_bf_operand (operands[2], SFmode) - + bfloat16_bf_operand (operands[3], SFmode) >= 2)" - "#" - "&& 1" - [(pc)] -{ - bfloat16_operation_as_v4sf (FMA, operands[0], operands[1], operands[2], - operands[3], FP16_NFMA); - DONE; -}) - -(define_insn_and_split "*bfloat16_nfms_internal1" - [(set (match_operand:SF 0 "vsx_register_operand") - (neg:SF - (fma:SF - (match_operand:SF 1 "bfloat16_v4sf_operand") - (match_operand:SF 2 "bfloat16_v4sf_operand") - (neg:SF - (match_operand:SF 3 "bfloat16_v4sf_operand")))))] - "TARGET_BFLOAT16_HW && can_create_pseudo_p () - && (bfloat16_bf_operand (operands[1], SFmode) - + bfloat16_bf_operand (operands[2], SFmode) - + bfloat16_bf_operand (operands[3], SFmode) >= 2)" - "#" - "&& 1" - [(pc)] -{ - bfloat16_operation_as_v4sf (FMA, operands[0], operands[1], operands[2], - operands[3], FP16_NFMS); - DONE; -}) - -(define_insn_and_split "*bfloat16_nfms_internal2" - [(set (match_operand:BF 0 "vsx_register_operand") - (float_truncate:BF - (neg:SF - (fma:SF - (match_operand:SF 1 "bfloat16_v4sf_operand") - (match_operand:SF 2 "bfloat16_v4sf_operand") - (neg:SF - (match_operand:SF 3 "bfloat16_v4sf_operand"))))))] - "TARGET_BFLOAT16_HW && can_create_pseudo_p () - && (bfloat16_bf_operand (operands[1], SFmode) - + bfloat16_bf_operand (operands[2], SFmode) - + bfloat16_bf_operand (operands[3], SFmode) >= 2)" - "#" - "&& 1" - [(pc)] -{ - bfloat16_operation_as_v4sf (FMA, operands[0], operands[1], operands[2], - operands[3], FP16_NFMS); - DONE; -}) - -(define_insn_and_split "*bfloat16_nfms_internal3" - [(set (match_operand:BF 0 "vsx_register_operand") - (neg:BF - (float_truncate:BF - (fma:SF - (match_operand:SF 1 "bfloat16_v4sf_operand") - (match_operand:SF 2 "bfloat16_v4sf_operand") - (neg:SF - (match_operand:SF 3 "bfloat16_v4sf_operand"))))))] - "TARGET_BFLOAT16_HW && can_create_pseudo_p () - && (bfloat16_bf_operand (operands[1], SFmode) - + bfloat16_bf_operand (operands[2], SFmode) - + bfloat16_bf_operand (operands[3], SFmode) >= 2)" - "#" - "&& 1" - [(pc)] -{ - bfloat16_operation_as_v4sf (FMA, operands[0], operands[1], operands[2], - operands[3], FP16_NFMS); - DONE; -}) - ;; Add vectorization support for 16-bit floating point. ;; Binary operators being vectorized. @@ -1073,54 +788,6 @@ DONE; }) -;; If we do multiple __bfloat16 operations, between the first and -;; second operation, GCC will want to convert the first operation from -;; V4SFmode to SFmode and then reconvert it back to V4SFmode. On the -;; PowerPC, this is complicated because internally in the vector -;; register, SFmode values are stored as DFmode values. -;; -;; For example, if we have: -;; -;; __bfloat16 a, b, c, d; -;; a = b + c + d; -;; -;; We would generate: -;; -;; lxsihzx 0,4,2 // load b as BFmode -;; lxsihzx 11,5,2 // load c as BFmode -;; lxsihzx 12,6,2 // load d as BFmode -;; xxspltw 0,0,1 // shift b into bits 16..31 -;; xxspltw 11,11,1 // shift c into bits 16..31 -;; xxspltw 12,12,1 // shift d into bits 16..31 -;; xvcvbf16spn 0,0 // convert b into V4SFmode -;; xvcvbf16spn 11,11 // convert c into V4SFmode -;; xvcvbf16spn 12,12 // convert d into V4SFmode -;; xvaddsp 0,0,11 // calculate b+c as V4SFmode -;; xscvspdp 0,0 // convert b+c into DFmode memory format -;; xscvdpspn 0,0 // convert b+c into SFmode memory format -;; xxspltw 0,0,0 // convert b+c into V4SFmode -;; xvaddsp 12,12,0 // calculate b+c+d as V4SFmode -;; xvcvspbf16 12,12 // convert b+c+d into BFmode memory format -;; stxsihx 12,3,2 // store b+c+d -;; -;; With this peephole2, we can eliminate the xscvspdp and xscvdpspn -;; instructions. -;; -;; We keep the xxspltw between the two xvaddsp's in case the user -;; explicitly did a SFmode extract of element 0 and did a splat -;; operation. - -(define_peephole2 - [(set (match_operand:SF 0 "vsx_register_operand") - (unspec:SF - [(match_operand:V4SF 1 "vsx_register_operand")] - UNSPEC_VSX_CVSPDP)) - (set (match_operand:V4SF 2 "vsx_register_operand") - (unspec:V4SF [(match_dup 0)] UNSPEC_VSX_CVDPSPN))] - "REGNO (operands[1]) == REGNO (operands[2]) - || peep2_reg_dead_p (1, operands[1])" - [(set (match_dup 2) (match_dup 1))]) - ;; Vector Pack support. (define_expand "vec_pack_trunc_v4sf_v8hf" diff --git a/gcc/config/rs6000/predicates.md b/gcc/config/rs6000/predicates.md index 408caf4521f0..e9ddc61e3a8a 100644 --- a/gcc/config/rs6000/predicates.md +++ b/gcc/config/rs6000/predicates.md @@ -2192,79 +2192,3 @@ return constant_generates_xxspltiw (&vsx_const); }) - -;; Return 1 if this is a 16-bit floating point operand that can be used -;; in an add, subtract, or multiply operation that uses the vector -;; conversion function. -(define_predicate "fp16_reg_or_constant_operand" - (match_code "reg,subreg,const_double") -{ - if (REG_P (op) || SUBREG_P (op)) - return vsx_register_operand (op, mode); - - if (CONST_DOUBLE_P (op)) - return fp16_xxspltiw_constant (op, mode); - - return false; -}) - -;; Match binary operators where we convert a BFmode operand into a -;; SFmode operand so that we can optimize the BFmode operation to do -;; the operation in vector mode rather than convverting the BFmode to a -;; V8BFmode vector, converting that V8BFmode vector to V4SFmode, and -;; then converting the V4SFmode element to SFmode scalar. -(define_predicate "fp16_binary_operator" - (match_code "plus,minus,mult,smax,smin")) - -;; Match bfloat16/float operands that can be optimized to do the -;; operation in V4SFmode. -(define_predicate "bfloat16_v4sf_operand" - (match_code "reg,subreg,const_double,float_extend,float_truncate") -{ - if (mode != BFmode && mode != SFmode) - return false; - - if (REG_P (op) || SUBREG_P (op)) - return register_operand (op, mode); - - if (CONST_DOUBLE_P (op)) - return true; - - if (GET_CODE (op) == FLOAT_EXTEND) - { - rtx op_arg = XEXP (op, 0); - return (mode == SFmode - && GET_MODE (op_arg) == BFmode - && (REG_P (op_arg) || SUBREG_P (op_arg))); - } - - if (GET_CODE (op) == FLOAT_TRUNCATE) - { - rtx op_arg = XEXP (op, 0); - return (mode == BFmode - && GET_MODE (op_arg) == SFmode - && (REG_P (op_arg) || SUBREG_P (op_arg))); - } - - return false; -}) - -;; Match an operand that originally was an BFmode value to prevent -;; operations involing only SFmode values from being converted to -;; BFmode. -(define_predicate "bfloat16_bf_operand" - (match_code "reg,subreg,const_double,float_extend") -{ - if (mode == BFmode || GET_MODE (op) == BFmode) - return true; - - if (mode != SFmode) - return false; - - if (GET_MODE (op) == SFmode - && GET_CODE (op) == FLOAT_EXTEND - && GET_MODE (XEXP (op, 0)) == BFmode) - return true; - - return false; -}) diff --git a/gcc/config/rs6000/rs6000-protos.h b/gcc/config/rs6000/rs6000-protos.h index 001dc1fc7f4b..6b17112ed0da 100644 --- a/gcc/config/rs6000/rs6000-protos.h +++ b/gcc/config/rs6000/rs6000-protos.h @@ -271,8 +271,6 @@ enum fp16_operation { extern void fp16_vectorization (enum rtx_code, rtx, rtx, rtx, rtx, enum fp16_operation); -extern void bfloat16_operation_as_v4sf (enum rtx_code, rtx, rtx, rtx, rtx, - enum fp16_operation); #endif /* RTX_CODE */ #ifdef TREE_CODE
