https://github.com/arsenm updated https://github.com/llvm/llvm-project/pull/167264
From 0696fe61de175a41ac8a67c265a97e1611b9c5ab Mon Sep 17 00:00:00 2001
From: Matt Arsenault <[email protected]>
Date: Sun, 9 Nov 2025 18:45:32 -0800
Subject: [PATCH 1/2] DAG: Add AssertNoFPClass from call return attributes

This defends against regressions in future patches. This excludes the
target intrinsic case for now; I'm worried introducing an intermediate
AssertNoFPClass is likely to break combines.
---
 .../SelectionDAG/SelectionDAGBuilder.cpp       | 17 +++++++++++++++++
 .../CodeGen/SelectionDAG/SelectionDAGBuilder.h |  4 ++++
 llvm/test/CodeGen/AMDGPU/nofpclass-call.ll     | 16 ++++------------
 3 files changed, 25 insertions(+), 12 deletions(-)

diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
index 88b0809b767b5..6a9022dff41ad 100644
--- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
@@ -4638,6 +4638,12 @@ static std::optional<ConstantRange> getRange(const Instruction &I) {
   return std::nullopt;
 }
 
+static FPClassTest getNoFPClass(const Instruction &I) {
+  if (const auto *CB = dyn_cast<CallBase>(&I))
+    return CB->getRetNoFPClass();
+  return fcNone;
+}
+
 void SelectionDAGBuilder::visitLoad(const LoadInst &I) {
   if (I.isAtomic())
     return visitAtomicLoad(I);
@@ -9132,6 +9138,7 @@ void SelectionDAGBuilder::LowerCallTo(const CallBase &CB, SDValue Callee,
 
   if (Result.first.getNode()) {
     Result.first = lowerRangeToAssertZExt(DAG, CB, Result.first);
+    Result.first = lowerNoFPClassToAssertNoFPClass(DAG, CB, Result.first);
     setValue(&CB, Result.first);
   }
 
@@ -10718,6 +10725,16 @@ SDValue SelectionDAGBuilder::lowerRangeToAssertZExt(SelectionDAG &DAG,
   return DAG.getMergeValues(Ops, SL);
 }
 
+SDValue SelectionDAGBuilder::lowerNoFPClassToAssertNoFPClass(
+    SelectionDAG &DAG, const Instruction &I, SDValue Op) {
+  FPClassTest Classes = getNoFPClass(I);
+  if (Classes == fcNone)
+    return Op;
+
+  return DAG.getNode(ISD::AssertNoFPClass, SDLoc(Op), Op.getValueType(), Op,
+                     DAG.getTargetConstant(Classes, SDLoc(), MVT::i32));
+}
+
 /// Populate a CallLowerinInfo (into \p CLI) based on the properties of
 /// the call being lowered.
 ///
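A minimal sketch of what this lowering enables (illustrative IR only, not
part of the patch; @callee and @caller are invented names): a call whose
return is annotated nofpclass(nan) now reaches instruction selection
wrapped in an ISD::AssertNoFPClass node, so the backend can assume the
value is never a nan. On AMDGPU that removes the v_max v, v quieting
normally emitted before v_min for minnum, which is the improvement the
nofpclass-call.ll diff below checks.

  declare nofpclass(nan) float @callee(float)

  define float @caller(float %x, float %y) {
    ; Both call results are known non-nan via nofpclass(nan), so minnum
    ; can select v_min_f32 directly instead of first quieting each
    ; operand with v_max_f32 v, v.
    %a = call nofpclass(nan) float @callee(float %x)
    %b = call nofpclass(nan) float @callee(float %y)
    %min = call float @llvm.minnum.f32(float %a, float %b)
    ret float %min
  }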
diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.h b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.h
index ed63bee58c957..13e2daa783147 100644
--- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.h
+++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.h
@@ -429,6 +429,10 @@ class SelectionDAGBuilder {
   SDValue lowerRangeToAssertZExt(SelectionDAG &DAG, const Instruction &I,
                                  SDValue Op);
 
+  // Lower nofpclass attributes to AssertNoFPClass
+  SDValue lowerNoFPClassToAssertNoFPClass(SelectionDAG &DAG,
+                                          const Instruction &I, SDValue Op);
+
   void populateCallLoweringInfo(TargetLowering::CallLoweringInfo &CLI,
                                 const CallBase *Call, unsigned ArgIdx,
                                 unsigned NumArgs, SDValue Callee,
diff --git a/llvm/test/CodeGen/AMDGPU/nofpclass-call.ll b/llvm/test/CodeGen/AMDGPU/nofpclass-call.ll
index 1861f02ec8b1c..5f303cc2a1eef 100644
--- a/llvm/test/CodeGen/AMDGPU/nofpclass-call.ll
+++ b/llvm/test/CodeGen/AMDGPU/nofpclass-call.ll
@@ -35,9 +35,7 @@ define float @call_nofpclass_funcs_f32(ptr addrspace(1) %ptr) {
 ; CHECK-NEXT:    v_mov_b32_e32 v3, v0
 ; CHECK-NEXT:    v_mov_b32_e32 v0, v2
 ; CHECK-NEXT:    s_swappc_b64 s[30:31], s[16:17]
-; CHECK-NEXT:    v_max_f32_e32 v1, v3, v3
-; CHECK-NEXT:    v_max_f32_e32 v0, v0, v0
-; CHECK-NEXT:    v_min_f32_e32 v0, v1, v0
+; CHECK-NEXT:    v_min_f32_e32 v0, v3, v0
 ; CHECK-NEXT:    v_readlane_b32 s31, v4, 1
 ; CHECK-NEXT:    v_readlane_b32 s30, v4, 0
 ; CHECK-NEXT:    s_mov_b32 s32, s33
@@ -87,12 +85,8 @@ define <2 x float> @call_nofpclass_funcs_v2f32(ptr addrspace(1) %ptr) {
 ; CHECK-NEXT:    v_mov_b32_e32 v0, v3
 ; CHECK-NEXT:    v_mov_b32_e32 v1, v2
 ; CHECK-NEXT:    s_swappc_b64 s[30:31], s[16:17]
-; CHECK-NEXT:    v_max_f32_e32 v2, v4, v4
-; CHECK-NEXT:    v_max_f32_e32 v0, v0, v0
-; CHECK-NEXT:    v_min_f32_e32 v0, v2, v0
-; CHECK-NEXT:    v_max_f32_e32 v2, v5, v5
-; CHECK-NEXT:    v_max_f32_e32 v1, v1, v1
-; CHECK-NEXT:    v_min_f32_e32 v1, v2, v1
+; CHECK-NEXT:    v_min_f32_e32 v0, v4, v0
+; CHECK-NEXT:    v_min_f32_e32 v1, v5, v1
 ; CHECK-NEXT:    v_readlane_b32 s31, v6, 1
 ; CHECK-NEXT:    v_readlane_b32 s30, v6, 0
 ; CHECK-NEXT:    s_mov_b32 s32, s33
@@ -142,12 +136,10 @@ define double @call_nofpclass_funcs_f64(ptr addrspace(1) %ptr) {
 ; CHECK-NEXT:    v_mov_b32_e32 v0, v5
 ; CHECK-NEXT:    v_mov_b32_e32 v1, v4
 ; CHECK-NEXT:    s_swappc_b64 s[30:31], s[16:17]
-; CHECK-NEXT:    v_max_f64 v[2:3], v[2:3], v[2:3]
-; CHECK-NEXT:    v_max_f64 v[0:1], v[0:1], v[0:1]
+; CHECK-NEXT:    v_min_f64 v[0:1], v[2:3], v[0:1]
 ; CHECK-NEXT:    v_readlane_b32 s31, v6, 1
 ; CHECK-NEXT:    v_readlane_b32 s30, v6, 0
 ; CHECK-NEXT:    s_mov_b32 s32, s33
-; CHECK-NEXT:    v_min_f64 v[0:1], v[2:3], v[0:1]
 ; CHECK-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; CHECK-NEXT:    buffer_load_dword v6, off, s[0:3], s33 ; 4-byte Folded Reload
 ; CHECK-NEXT:    s_mov_b64 exec, s[4:5]

From 669fbce2fbc118a533cc5f3560e2fd29088b71ce Mon Sep 17 00:00:00 2001
From: Matt Arsenault <[email protected]>
Date: Mon, 10 Nov 2025 08:01:51 -0800
Subject: [PATCH 2/2] AMDGPU: Add baseline tests for copysign with known
 signmask input (#167265)
---
 .../AMDGPU/copysign-simplify-demanded-bits.ll | 108 +++++++++
 .../AMDGPU/copysign-to-disjoint-or-combine.ll | 208 ++++++++++++++++++
 2 files changed, 316 insertions(+)
 create mode 100644 llvm/test/CodeGen/AMDGPU/copysign-to-disjoint-or-combine.ll
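For context, the rewrite these baseline tests set up reasons about
copysign's bitwise expansion: clear the sign bit of the magnitude, keep
only the sign bit of the sign operand, and or the two halves together.
An illustrative expansion in IR (a sketch for exposition only;
@copysign_expanded is an invented name, not code from the patch):

  define float @copysign_expanded(float %mag, float %sgn) {
    %mag.bits = bitcast float %mag to i32
    %sgn.bits = bitcast float %sgn to i32
    %keep.mag = and i32 %mag.bits, 2147483647   ; 0x7fffffff clears the sign bit
    %keep.sgn = and i32 %sgn.bits, -2147483648  ; 0x80000000 keeps only the sign bit
    %bits = or i32 %keep.mag, %keep.sgn
    %res = bitcast i32 %bits to float
    ret float %res
  }

In the tests below the sign operand is built as sign << 31 (a pure
signmask), so the second and is already redundant; the question each
test probes is whether the magnitude's sign bit is provably zero.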
diff --git a/llvm/test/CodeGen/AMDGPU/copysign-simplify-demanded-bits.ll b/llvm/test/CodeGen/AMDGPU/copysign-simplify-demanded-bits.ll
index f5227eed458d6..0be2b5c70c93b 100644
--- a/llvm/test/CodeGen/AMDGPU/copysign-simplify-demanded-bits.ll
+++ b/llvm/test/CodeGen/AMDGPU/copysign-simplify-demanded-bits.ll
@@ -370,4 +370,112 @@ define float @test_copysign_pow_fast_f32__integral_y(float %x, i32 %y.i) {
   ret float %pow_sign1
 }
 
+define double @test_pow_fast_f64integral_y(double %x, i32 %y.i) #0 {
+; GFX9-LABEL: test_pow_fast_f64integral_y:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    s_mov_b32 s16, s33
+; GFX9-NEXT:    s_mov_b32 s33, s32
+; GFX9-NEXT:    s_or_saveexec_b64 s[18:19], -1
+; GFX9-NEXT:    buffer_store_dword v43, off, s[0:3], s33 offset:12 ; 4-byte Folded Spill
+; GFX9-NEXT:    s_mov_b64 exec, s[18:19]
+; GFX9-NEXT:    v_writelane_b32 v43, s16, 15
+; GFX9-NEXT:    v_writelane_b32 v43, s30, 0
+; GFX9-NEXT:    v_writelane_b32 v43, s31, 1
+; GFX9-NEXT:    v_writelane_b32 v43, s34, 2
+; GFX9-NEXT:    v_writelane_b32 v43, s35, 3
+; GFX9-NEXT:    v_writelane_b32 v43, s36, 4
+; GFX9-NEXT:    v_writelane_b32 v43, s37, 5
+; GFX9-NEXT:    v_writelane_b32 v43, s38, 6
+; GFX9-NEXT:    v_writelane_b32 v43, s39, 7
+; GFX9-NEXT:    v_writelane_b32 v43, s48, 8
+; GFX9-NEXT:    v_writelane_b32 v43, s49, 9
+; GFX9-NEXT:    v_writelane_b32 v43, s50, 10
+; GFX9-NEXT:    v_writelane_b32 v43, s51, 11
+; GFX9-NEXT:    s_addk_i32 s32, 0x800
+; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill
+; GFX9-NEXT:    buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
+; GFX9-NEXT:    buffer_store_dword v42, off, s[0:3], s33 ; 4-byte Folded Spill
+; GFX9-NEXT:    v_writelane_b32 v43, s52, 12
+; GFX9-NEXT:    v_mov_b32_e32 v42, v1
+; GFX9-NEXT:    v_writelane_b32 v43, s53, 13
+; GFX9-NEXT:    v_and_b32_e32 v1, 0x7fffffff, v42
+; GFX9-NEXT:    s_getpc_b64 s[16:17]
+; GFX9-NEXT:    s_add_u32 s16, s16, _Z4log2d@rel32@lo+4
+; GFX9-NEXT:    s_addc_u32 s17, s17, _Z4log2d@rel32@hi+12
+; GFX9-NEXT:    v_writelane_b32 v43, s54, 14
+; GFX9-NEXT:    v_mov_b32_e32 v40, v31
+; GFX9-NEXT:    v_mov_b32_e32 v41, v2
+; GFX9-NEXT:    s_mov_b32 s50, s15
+; GFX9-NEXT:    s_mov_b32 s51, s14
+; GFX9-NEXT:    s_mov_b32 s52, s13
+; GFX9-NEXT:    s_mov_b32 s53, s12
+; GFX9-NEXT:    s_mov_b64 s[34:35], s[10:11]
+; GFX9-NEXT:    s_mov_b64 s[36:37], s[8:9]
+; GFX9-NEXT:    s_mov_b64 s[38:39], s[6:7]
+; GFX9-NEXT:    s_mov_b64 s[48:49], s[4:5]
+; GFX9-NEXT:    s_brev_b32 s54, -2
+; GFX9-NEXT:    s_swappc_b64 s[30:31], s[16:17]
+; GFX9-NEXT:    v_cvt_f64_i32_e32 v[2:3], v41
+; GFX9-NEXT:    s_getpc_b64 s[16:17]
+; GFX9-NEXT:    s_add_u32 s16, s16, _Z4exp2d@rel32@lo+4
+; GFX9-NEXT:    s_addc_u32 s17, s17, _Z4exp2d@rel32@hi+12
+; GFX9-NEXT:    s_mov_b64 s[4:5], s[48:49]
+; GFX9-NEXT:    s_mov_b64 s[6:7], s[38:39]
+; GFX9-NEXT:    v_mul_f64 v[0:1], v[0:1], v[2:3]
+; GFX9-NEXT:    s_mov_b64 s[8:9], s[36:37]
+; GFX9-NEXT:    s_mov_b64 s[10:11], s[34:35]
+; GFX9-NEXT:    s_mov_b32 s12, s53
+; GFX9-NEXT:    s_mov_b32 s13, s52
+; GFX9-NEXT:    s_mov_b32 s14, s51
+; GFX9-NEXT:    s_mov_b32 s15, s50
+; GFX9-NEXT:    v_mov_b32_e32 v31, v40
+; GFX9-NEXT:    s_swappc_b64 s[30:31], s[16:17]
+; GFX9-NEXT:    v_lshlrev_b32_e32 v2, 31, v41
+; GFX9-NEXT:    v_and_b32_e32 v2, v2, v42
+; GFX9-NEXT:    buffer_load_dword v42, off, s[0:3], s33 ; 4-byte Folded Reload
+; GFX9-NEXT:    buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
+; GFX9-NEXT:    buffer_load_dword v40, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload
+; GFX9-NEXT:    v_bfi_b32 v1, s54, v1, v2
+; GFX9-NEXT:    v_readlane_b32 s54, v43, 14
+; GFX9-NEXT:    v_readlane_b32 s53, v43, 13
+; GFX9-NEXT:    v_readlane_b32 s52, v43, 12
+; GFX9-NEXT:    v_readlane_b32 s51, v43, 11
+; GFX9-NEXT:    v_readlane_b32 s50, v43, 10
+; GFX9-NEXT:    v_readlane_b32 s49, v43, 9
+; GFX9-NEXT:    v_readlane_b32 s48, v43, 8
+; GFX9-NEXT:    v_readlane_b32 s39, v43, 7
+; GFX9-NEXT:    v_readlane_b32 s38, v43, 6
+; GFX9-NEXT:    v_readlane_b32 s37, v43, 5
+; GFX9-NEXT:    v_readlane_b32 s36, v43, 4
+; GFX9-NEXT:    v_readlane_b32 s35, v43, 3
+; GFX9-NEXT:    v_readlane_b32 s34, v43, 2
+; GFX9-NEXT:    v_readlane_b32 s31, v43, 1
+; GFX9-NEXT:    v_readlane_b32 s30, v43, 0
+; GFX9-NEXT:    s_mov_b32 s32, s33
+; GFX9-NEXT:    v_readlane_b32 s4, v43, 15
+; GFX9-NEXT:    s_or_saveexec_b64 s[6:7], -1
+; GFX9-NEXT:    buffer_load_dword v43, off, s[0:3], s33 offset:12 ; 4-byte Folded Reload
+; GFX9-NEXT:    s_mov_b64 exec, s[6:7]
+; GFX9-NEXT:    s_mov_b32 s33, s4
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+  %fabs = call fast double @llvm.fabs.f64(double %x)
+  %log2 = call fast double @_Z4log2d(double %fabs)
+  %pownI2F = sitofp i32 %y.i to double
+  %ylogx = fmul fast double %log2, %pownI2F
+  %exp2 = call fast nofpclass(nan ninf nzero nsub nnorm) double @_Z4exp2d(double %ylogx)
+  %ytou = zext i32 %y.i to i64
+  %yeven = shl i64 %ytou, 63
+  %x.i64 = bitcast double %x to i64
+  %pow_sign = and i64 %yeven, %x.i64
+  %pow_sign.f64 = bitcast i64 %pow_sign to double
+  %pow_sign1 = call fast double @llvm.copysign.f64(double %exp2, double %pow_sign.f64)
+  ret double %pow_sign1
+}
+
+declare hidden double @_Z4exp2d(double) #1
+declare hidden double @_Z4log2d(double) #1
+
 attributes #0 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) }
+attributes #1 = { norecurse nounwind memory(read) }
diff --git a/llvm/test/CodeGen/AMDGPU/copysign-to-disjoint-or-combine.ll b/llvm/test/CodeGen/AMDGPU/copysign-to-disjoint-or-combine.ll
new file mode 100644
index 0000000000000..b99b64316b62f
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/copysign-to-disjoint-or-combine.ll
@@ -0,0 +1,208 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck -check-prefix=GFX9 %s
+
+; Negative test, don't know %x is positive
+define half @copysign_known_signmask_f16(half %x, i16 %sign) {
+; GFX9-LABEL: copysign_known_signmask_f16:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_lshlrev_b16_e32 v1, 15, v1
+; GFX9-NEXT:    s_movk_i32 s4, 0x7fff
+; GFX9-NEXT:    v_bfi_b32 v0, s4, v0, v1
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+  %signmask = shl i16 %sign, 15
+  %signmask.bitcast = bitcast i16 %signmask to half
+  %result = call half @llvm.copysign.f16(half %x, half %signmask.bitcast)
+  ret half %result
+}
+
+; Negative test, don't know %x is positive
+define float @copysign_known_signmask_f32(float %x, i32 %sign) {
+; GFX9-LABEL: copysign_known_signmask_f32:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 31, v1
+; GFX9-NEXT:    s_brev_b32 s4, -2
+; GFX9-NEXT:    v_bfi_b32 v0, s4, v0, v1
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+  %signmask = shl i32 %sign, 31
+  %signmask.bitcast = bitcast i32 %signmask to float
+  %result = call float @llvm.copysign.f32(float %x, float %signmask.bitcast)
+  ret float %result
+}
+
+; Negative test, don't know %x is positive
+define double @copysign_known_signmask_f64(double %x, i64 %sign) {
+; GFX9-LABEL: copysign_known_signmask_f64:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_lshlrev_b32_e32 v2, 31, v2
+; GFX9-NEXT:    s_brev_b32 s4, -2
+; GFX9-NEXT:    v_bfi_b32 v1, s4, v1, v2
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+  %signmask = shl i64 %sign, 63
+  %signmask.bitcast = bitcast i64 %signmask to double
+  %result = call double @llvm.copysign.f64(double %x, double %signmask.bitcast)
+  ret double %result
+}
+
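When the magnitude operand is nofpclass(nan ninf nzero nsub nnorm),
every remaining class is positive, so its sign bit is known zero and the
or in the expansion sketched earlier is provably disjoint. A rough
sketch of the form the anticipated combine could produce (hypothetical;
the baseline tests in this file still select v_bfi_b32, and
@copysign_as_disjoint_or is an invented name):

  define float @copysign_as_disjoint_or(float nofpclass(nan ninf nzero nsub nnorm) %mag, i32 %sign) {
    %signmask = shl i32 %sign, 31          ; at most bit 31 is set
    %mag.bits = bitcast float %mag to i32  ; bit 31 is known zero
    %bits = or disjoint i32 %mag.bits, %signmask
    %res = bitcast i32 %bits to float
    ret float %res
  }

Dropping nan from the attribute re-admits a set sign bit (the sign of a
nan is unconstrained), as does dropping nzero; the next two negative
tests pin that down.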
+; Negative test, the magnitude may still be a nan with a set sign bit
+define float @copysign_known_signmask_f32_known_not_known_positive_mag_maybe_nan(float nofpclass(ninf nzero nsub nnorm) %sign.bit.known.zero, i32 %sign) {
+; GFX9-LABEL: copysign_known_signmask_f32_known_not_known_positive_mag_maybe_nan:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 31, v1
+; GFX9-NEXT:    s_brev_b32 s4, -2
+; GFX9-NEXT:    v_bfi_b32 v0, s4, v0, v1
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+  %signmask = shl i32 %sign, 31
+  %signmask.bitcast = bitcast i32 %signmask to float
+  %result = call float @llvm.copysign.f32(float %sign.bit.known.zero, float %signmask.bitcast)
+  ret float %result
+}
+
+; Negative test, the magnitude may still be -0.0
+define float @copysign_known_signmask_f32_known_not_known_positive_mag_maybe_negzero(float nofpclass(nan ninf nsub nnorm) %sign.bit.known.zero, i32 %sign) {
+; GFX9-LABEL: copysign_known_signmask_f32_known_not_known_positive_mag_maybe_negzero:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 31, v1
+; GFX9-NEXT:    s_brev_b32 s4, -2
+; GFX9-NEXT:    v_bfi_b32 v0, s4, v0, v1
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+  %signmask = shl i32 %sign, 31
+  %signmask.bitcast = bitcast i32 %signmask to float
+  %result = call float @llvm.copysign.f32(float %sign.bit.known.zero, float %signmask.bitcast)
+  ret float %result
+}
+
+define half @copysign_known_signmask_f16_known_positive_mag(half nofpclass(nan ninf nzero nsub nnorm) %sign.bit.known.zero, i16 %sign) {
+; GFX9-LABEL: copysign_known_signmask_f16_known_positive_mag:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_lshlrev_b16_e32 v1, 15, v1
+; GFX9-NEXT:    s_movk_i32 s4, 0x7fff
+; GFX9-NEXT:    v_bfi_b32 v0, s4, v0, v1
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+  %signmask = shl i16 %sign, 15
+  %signmask.bitcast = bitcast i16 %signmask to half
+  %result = call half @llvm.copysign.f16(half %sign.bit.known.zero, half %signmask.bitcast)
+  ret half %result
+}
+
+define float @copysign_known_signmask_f32_known_positive_mag(float nofpclass(nan ninf nzero nsub nnorm) %sign.bit.known.zero, i32 %sign) {
+; GFX9-LABEL: copysign_known_signmask_f32_known_positive_mag:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 31, v1
+; GFX9-NEXT:    s_brev_b32 s4, -2
+; GFX9-NEXT:    v_bfi_b32 v0, s4, v0, v1
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+  %signmask = shl i32 %sign, 31
+  %signmask.bitcast = bitcast i32 %signmask to float
+  %result = call float @llvm.copysign.f32(float %sign.bit.known.zero, float %signmask.bitcast)
+  ret float %result
+}
+
+define double @copysign_known_signmask_f64_known_positive_mag(double nofpclass(nan ninf nzero nsub nnorm) %sign.bit.known.zero, i64 %sign) {
+; GFX9-LABEL: copysign_known_signmask_f64_known_positive_mag:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_lshlrev_b32_e32 v2, 31, v2
+; GFX9-NEXT:    s_brev_b32 s4, -2
+; GFX9-NEXT:    v_bfi_b32 v1, s4, v1, v2
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+  %signmask = shl i64 %sign, 63
+  %signmask.bitcast = bitcast i64 %signmask to double
+  %result = call double @llvm.copysign.f64(double %sign.bit.known.zero, double %signmask.bitcast)
+  ret double %result
+}
+
+; exp always returns a positive result, excluding the unknown nan sign
+; bit.
+define float @copysign_known_signmask_f32_known_positive_mag__nnan_exp(float %x, i32 %sign) {
+; GFX9-LABEL: copysign_known_signmask_f32_known_positive_mag__nnan_exp:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    s_mov_b32 s4, 0xc2aeac50
+; GFX9-NEXT:    v_add_f32_e32 v2, 0x42800000, v0
+; GFX9-NEXT:    v_cmp_gt_f32_e32 vcc, s4, v0
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
+; GFX9-NEXT:    v_mul_f32_e32 v0, 0x3fb8aa3b, v0
+; GFX9-NEXT:    v_exp_f32_e32 v0, v0
+; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 31, v1
+; GFX9-NEXT:    s_brev_b32 s4, -2
+; GFX9-NEXT:    v_mul_f32_e32 v2, 0x114b4ea4, v0
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
+; GFX9-NEXT:    v_bfi_b32 v0, s4, v0, v1
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+  %signbit.known.zero = call nnan afn float @llvm.exp.f32(float %x)
+  %signmask = shl i32 %sign, 31
+  %signmask.bitcast = bitcast i32 %signmask to float
+  %result = call float @llvm.copysign.f32(float %signbit.known.zero, float %signmask.bitcast)
+  ret float %result
+}
+
+define float @copysign_known_signmask_f32_known_positive_mag__nnan_exp2(float %x, i32 %sign) {
+; GFX9-LABEL: copysign_known_signmask_f32_known_positive_mag__nnan_exp2:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    s_mov_b32 s4, 0xc2fc0000
+; GFX9-NEXT:    v_cmp_gt_f32_e32 vcc, s4, v0
+; GFX9-NEXT:    v_mov_b32_e32 v3, 0x42800000
+; GFX9-NEXT:    v_cndmask_b32_e32 v3, 0, v3, vcc
+; GFX9-NEXT:    v_add_f32_e32 v0, v0, v3
+; GFX9-NEXT:    v_exp_f32_e32 v0, v0
+; GFX9-NEXT:    v_not_b32_e32 v2, 63
+; GFX9-NEXT:    v_cndmask_b32_e32 v2, 0, v2, vcc
+; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 31, v1
+; GFX9-NEXT:    v_ldexp_f32 v0, v0, v2
+; GFX9-NEXT:    s_brev_b32 s4, -2
+; GFX9-NEXT:    v_bfi_b32 v0, s4, v0, v1
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+  %signbit.known.zero = call nnan afn float @llvm.exp2.f32(float %x)
+  %signmask = shl i32 %sign, 31
+  %signmask.bitcast = bitcast i32 %signmask to float
+  %result = call float @llvm.copysign.f32(float %signbit.known.zero, float %signmask.bitcast)
+  ret float %result
+}
+
+define float @copysign_known_signmask_f32_known_positive_mag__nnan_exp10(float %x, i32 %sign) {
+; GFX9-LABEL: copysign_known_signmask_f32_known_positive_mag__nnan_exp10:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    s_mov_b32 s4, 0xc2fc0000
+; GFX9-NEXT:    v_cmp_gt_f32_e32 vcc, s4, v0
+; GFX9-NEXT:    v_mov_b32_e32 v3, 0x42800000
+; GFX9-NEXT:    v_cndmask_b32_e32 v3, 0, v3, vcc
+; GFX9-NEXT:    v_add_f32_e32 v0, v0, v3
+; GFX9-NEXT:    v_exp_f32_e32 v0, v0
+; GFX9-NEXT:    v_not_b32_e32 v2, 63
+; GFX9-NEXT:    v_cndmask_b32_e32 v2, 0, v2, vcc
+; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 31, v1
+; GFX9-NEXT:    v_ldexp_f32 v0, v0, v2
+; GFX9-NEXT:    s_brev_b32 s4, -2
+; GFX9-NEXT:    v_bfi_b32 v0, s4, v0, v1
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+  %signbit.known.zero = call nnan afn float @llvm.exp2.f32(float %x)
+  %signmask = shl i32 %sign, 31
+  %signmask.bitcast = bitcast i32 %signmask to float
+  %result = call float @llvm.copysign.f32(float %signbit.known.zero, float %signmask.bitcast)
+  ret float %result
+}
+
+define float @copysign_known_signmask_f32_known_positive_mag_through_fence(float nofpclass(nan ninf nzero nsub nnorm) %sign.bit.known.zero, i32 %sign) {
+; GFX9-LABEL: copysign_known_signmask_f32_known_positive_mag_through_fence:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 31, v1
+; GFX9-NEXT:    ;ARITH_FENCE
+; GFX9-NEXT:    s_brev_b32 s4, -2
+; GFX9-NEXT:    v_bfi_b32 v0, s4, v0, v1
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+  %signmask = shl i32 %sign, 31
+  %signmask.bitcast = bitcast i32 %signmask to float
+  %fence = call float @llvm.arithmetic.fence.f32(float %sign.bit.known.zero)
+  %result = call float @llvm.copysign.f32(float %fence, float %signmask.bitcast)
+  ret float %result
+}
