llvmbot wrote:
<!--LLVM PR SUMMARY COMMENT--> @llvm/pr-subscribers-backend-amdgpu Author: Matt Arsenault (arsenm) <details> <summary>Changes</summary> Forming fmed3/clamp directly from the minimumnum(maximumnum(x, K0), K1) pattern makes little difference in the final output, as we already manage to form it after these operations are lowered to the _ieee operations. It does result in fewer steps in the DAG, and helps prepare for changing the handling of minnum/maxnum. --- Full diff: https://github.com/llvm/llvm-project/pull/141048.diff 2 Files Affected: - (modified) llvm/lib/Target/AMDGPU/SIISelLowering.cpp (+31-2) - (modified) llvm/test/CodeGen/AMDGPU/clamp.ll (+8-8) ``````````diff diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp index 2d337fafe6dc2..ade88a16193b8 100644 --- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp @@ -13593,10 +13593,34 @@ SDValue SITargetLowering::performFPMed3ImmCombine(SelectionDAG &DAG, if (K0->getValueAPF() > K1->getValueAPF()) return SDValue(); + // med3 with a nan input acts like + // v_min_f32(v_min_f32(S0.f32, S1.f32), S2.f32) + // + // So the result depends on whether the IEEE mode bit is enabled or not with a + // signaling nan input. + // ieee=1 + // s0 snan: yields s2 + // s1 snan: yields s2 + // s2 snan: qnan + + // s0 qnan: min(s1, s2) + // s1 qnan: min(s0, s2) + // s2 qnan: min(s0, s1) + + // ieee=0 + // s0 snan: min(s1, s2) + // s1 snan: min(s0, s2) + // s2 snan: qnan + + // s0 qnan: min(s1, s2) + // s1 qnan: min(s0, s2) + // s2 qnan: min(s0, s1) const MachineFunction &MF = DAG.getMachineFunction(); const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>(); - // TODO: Check IEEE bit enabled? + // TODO: Check IEEE bit enabled. We can form fmed3 with IEEE=0 regardless of + // whether the input is a signaling nan if op0 is fmaximum or fmaximumnum. We + // can only form if op0 is fmaxnum_ieee if IEEE=1. EVT VT = Op0.getValueType(); if (Info->getMode().DX10Clamp) { // If dx10_clamp is enabled, NaNs clamp to 0.0. 
This is the same as the @@ -13714,9 +13738,14 @@ SDValue SITargetLowering::performMinMaxCombine(SDNode *N, return Med3; } - // fminnum(fmaxnum(x, K0), K1), K0 < K1 && !is_snan(x) -> fmed3(x, K0, K1) + // if !is_snan(x): + // fminnum(fmaxnum(x, K0), K1), K0 < K1 -> fmed3(x, K0, K1) + // fminnum_ieee(fmaxnum_ieee(x, K0), K1), K0 < K1 -> fmed3(x, K0, K1) + // fminnumnum(fmaxnumnum(x, K0), K1), K0 < K1 -> fmed3(x, K0, K1) + // fmin_legacy(fmax_legacy(x, K0), K1), K0 < K1 -> fmed3(x, K0, K1) if (((Opc == ISD::FMINNUM && Op0.getOpcode() == ISD::FMAXNUM) || (Opc == ISD::FMINNUM_IEEE && Op0.getOpcode() == ISD::FMAXNUM_IEEE) || + (Opc == ISD::FMINIMUMNUM && Op0.getOpcode() == ISD::FMAXIMUMNUM) || (Opc == AMDGPUISD::FMIN_LEGACY && Op0.getOpcode() == AMDGPUISD::FMAX_LEGACY)) && (VT == MVT::f32 || VT == MVT::f64 || diff --git a/llvm/test/CodeGen/AMDGPU/clamp.ll b/llvm/test/CodeGen/AMDGPU/clamp.ll index 01681e7d8b8ed..6274b38a63fe0 100644 --- a/llvm/test/CodeGen/AMDGPU/clamp.ll +++ b/llvm/test/CodeGen/AMDGPU/clamp.ll @@ -4115,13 +4115,13 @@ define float @v_clamp_f32_daz_minimumnum_maximumnum(float %a) #0 { ; GFX6-LABEL: v_clamp_f32_daz_minimumnum_maximumnum: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: v_mul_f32_e64 v0, 1.0, v0 clamp +; GFX6-NEXT: v_max_f32_e64 v0, v0, v0 clamp ; GFX6-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_clamp_f32_daz_minimumnum_maximumnum: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_mul_f32_e64 v0, 1.0, v0 clamp +; GFX8-NEXT: v_max_f32_e64 v0, v0, v0 clamp ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_clamp_f32_daz_minimumnum_maximumnum: @@ -4154,13 +4154,13 @@ define float @v_clamp_f32_minimumnum_maximumnum(float %a) #1 { ; GFX6-LABEL: v_clamp_f32_minimumnum_maximumnum: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: v_mul_f32_e64 v0, 1.0, v0 clamp +; GFX6-NEXT: v_max_f32_e64 v0, v0, v0 clamp ; GFX6-NEXT: s_setpc_b64 s[30:31] ; ; 
GFX8-LABEL: v_clamp_f32_minimumnum_maximumnum: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_mul_f32_e64 v0, 1.0, v0 clamp +; GFX8-NEXT: v_max_f32_e64 v0, v0, v0 clamp ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_clamp_f32_minimumnum_maximumnum: @@ -4193,13 +4193,13 @@ define float @v_clamp_f32_neg_minimumnum_maximumnum(float %a) #1 { ; GFX6-LABEL: v_clamp_f32_neg_minimumnum_maximumnum: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: v_mul_f32_e64 v0, -1.0, v0 clamp +; GFX6-NEXT: v_max_f32_e64 v0, -v0, -v0 clamp ; GFX6-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_clamp_f32_neg_minimumnum_maximumnum: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_mul_f32_e64 v0, -1.0, v0 clamp +; GFX8-NEXT: v_max_f32_e64 v0, -v0, -v0 clamp ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_clamp_f32_neg_minimumnum_maximumnum: @@ -4233,13 +4233,13 @@ define float @v_clamp_f32_minimumnum_maximumnum_no_ieee(float %a) #5 { ; GFX6-LABEL: v_clamp_f32_minimumnum_maximumnum_no_ieee: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: v_mul_f32_e64 v0, 1.0, v0 clamp +; GFX6-NEXT: v_max_f32_e64 v0, v0, v0 clamp ; GFX6-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_clamp_f32_minimumnum_maximumnum_no_ieee: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_mul_f32_e64 v0, 1.0, v0 clamp +; GFX8-NEXT: v_max_f32_e64 v0, v0, v0 clamp ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_clamp_f32_minimumnum_maximumnum_no_ieee: `````````` </details> https://github.com/llvm/llvm-project/pull/141048 _______________________________________________ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits