https://github.com/stomfaig updated https://github.com/llvm/llvm-project/pull/171227
>From 62647bf9b0323e8ca161dd87657e56e5d6aa20b1 Mon Sep 17 00:00:00 2001 From: stomfaig <[email protected]> Date: Sun, 7 Dec 2025 23:37:58 +0000 Subject: [PATCH 01/19] adding initial handlers --- clang/lib/CodeGen/TargetBuiltins/X86.cpp | 98 ++++++++++++++++++++++++ 1 file changed, 98 insertions(+) diff --git a/clang/lib/CodeGen/TargetBuiltins/X86.cpp b/clang/lib/CodeGen/TargetBuiltins/X86.cpp index be2b7d442645e..a3e5c48629228 100644 --- a/clang/lib/CodeGen/TargetBuiltins/X86.cpp +++ b/clang/lib/CodeGen/TargetBuiltins/X86.cpp @@ -840,6 +840,104 @@ Value *CodeGenFunction::EmitX86BuiltinExpr(unsigned BuiltinID, Ops[0]); return Builder.CreateExtractValue(Call, 0); } + case X86::BI__builtin_ia32_roundps: + case X86::BI__builtin_ia32_roundpd: + case X86::BI__builtin_ia32_roundps256: + case X86::BI__builtin_ia32_roundpd256: { + unsigned M = cast<ConstantInt>(Ops[1])->getZExtValue(); + unsigned roundingModeAndPE = M & 0b111; + unsigned updatePE = M & 0b100; + unsigned use_MXCSR = M & 0b1000; + + Intrinsic::ID ID; + + // Currently no ops for MXCSR bit set, so lower directly to SSE41 instruction + if (use_MXCSR) { + switch (BuiltinID) { + case X86::BI__builtin_ia32_roundps: ID = Intrinsic::x86_sse41_round_ps; break; + case X86::BI__builtin_ia32_roundpd: ID = Intrinsic::x86_sse41_round_pd; break; + } + return nullptr; + } else { + switch (roundingModeAndPE) { + default: return nullptr; + case 0b000: ID = Intrinsic::nearbyint; break; + case 0b001: ID = Intrinsic::floor; break; + case 0b010: ID = Intrinsic::ceil; break; + case 0b011: ID = Intrinsic::trunc; break; + case 0b100: ID = Intrinsic::experimental_constrained_floor; break; // TODO: replace with actual op + case 0b101: ID = Intrinsic::experimental_constrained_floor; break; + case 0b110: ID = Intrinsic::experimental_constrained_ceil; break; + case 0b111: ID = Intrinsic::experimental_constrained_trunc; break; + } + } + + Function *F = CGM.getIntrinsic(ID, Ops[0]->getType()); + + if (updatePE) { + LLVMContext &Ctx = CGM.getLLVMContext(); + + Value *ExceptMode =MetadataAsValue::get( + Ctx, + MDString::get(Ctx, "fpexcept.strict") + ); + + return Builder.CreateCall(F, {Ops[0], ExceptMode}); + } + + return Builder.CreateCall(F, {Ops[0]}); + } + case X86::BI__builtin_ia32_roundss: + case X86::BI__builtin_ia32_roundsd: { + unsigned M = cast<ConstantInt>(Ops[2])->getZExtValue(); + unsigned roundingModeAndPE = M & 0b111; + unsigned updatePE = M & 0b100; + unsigned use_MXCSR = M & 0b1000; + + Intrinsic::ID ID; + + // Currently no ops for MXCSR bit set, so lower directly to SSE41 instruction + if (use_MXCSR) { + switch (BuiltinID) { + case X86::BI__builtin_ia32_roundss: ID = Intrinsic::x86_sse41_round_ss; break; + case X86::BI__builtin_ia32_roundsd: ID = Intrinsic::x86_sse41_round_sd; break; + } + return nullptr; + } else { + switch (roundingModeAndPE) { + default: return nullptr; + case 0b000: ID = Intrinsic::nearbyint; break; + case 0b001: ID = Intrinsic::floor; break; + case 0b010: ID = Intrinsic::ceil; break; + case 0b011: ID = Intrinsic::trunc; break; + case 0b100: ID = Intrinsic::experimental_constrained_floor; break; // TODO: replace with actual op + case 0b101: ID = Intrinsic::experimental_constrained_floor; break; + case 0b110: ID = Intrinsic::experimental_constrained_ceil; break; + case 0b111: ID = Intrinsic::experimental_constrained_trunc; break; + } + } + + Value *idx = Builder.getInt32(0); + Value *b0 = Builder.CreateExtractElement(Ops[1], idx); + Value *rounded0; + + Function *F = CGM.getIntrinsic(ID, b0->getType()); + + if (updatePE) { + LLVMContext &Ctx = CGM.getLLVMContext(); + + Value *ExceptMode =MetadataAsValue::get( + Ctx, + MDString::get(Ctx, "fpexcept.strict") + ); + + rounded0 = Builder.CreateCall(F, {b0, ExceptMode}); + } else { + rounded0 = Builder.CreateCall(F, {b0}); + } + + return Builder.CreateInsertElement(Ops[0], rounded0, idx); + } case X86::BI__builtin_ia32_lzcnt_u16: case X86::BI__builtin_ia32_lzcnt_u32: case X86::BI__builtin_ia32_lzcnt_u64: { >From 56f72b858744ff3c837170aac2f6b9654dd9be0e Mon Sep 17 00:00:00 2001 From: stomfaig <[email protected]> Date: Sun, 7 Dec 2025 23:38:35 +0000 Subject: [PATCH 02/19] modify relevant tests --- clang/test/CodeGen/X86/avx-builtins.c | 12 ++++----- clang/test/CodeGen/X86/pr51324.c | 2 +- clang/test/CodeGen/X86/sse41-builtins.c | 36 ++++++++++++++++--------- 3 files changed, 31 insertions(+), 19 deletions(-) diff --git a/clang/test/CodeGen/X86/avx-builtins.c b/clang/test/CodeGen/X86/avx-builtins.c index 13da4292c5b92..506327bc910c7 100644 --- a/clang/test/CodeGen/X86/avx-builtins.c +++ b/clang/test/CodeGen/X86/avx-builtins.c @@ -246,13 +246,13 @@ TEST_CONSTEXPR(match_m128i(_mm256_castsi256_si128((__m256i)(__v4du){0xBFF0000000 __m256d test_mm256_ceil_pd(__m256d x) { // CHECK-LABEL: test_mm256_ceil_pd - // CHECK: call {{.*}}<4 x double> @llvm.x86.avx.round.pd.256(<4 x double> %{{.*}}, i32 2) + // CHECK: %{{.*}} = call <4 x double> @llvm.ceil.v4f64(<4 x double> %{{.*}}) return _mm256_ceil_pd(x); } __m256 test_mm_ceil_ps(__m256 x) { // CHECK-LABEL: test_mm_ceil_ps - // CHECK: call {{.*}}<8 x float> @llvm.x86.avx.round.ps.256(<8 x float> %{{.*}}, i32 2) + // CHECK: %{{.*}} = call <8 x float> @llvm.ceil.v8f32(<8 x float> %{{.*}}) return _mm256_ceil_ps(x); } @@ -1095,13 +1095,13 @@ TEST_CONSTEXPR(match_m128i(_mm256_extractf128_si256(((__m256i){0ULL, 1ULL, 2ULL, __m256d test_mm256_floor_pd(__m256d x) { // CHECK-LABEL: test_mm256_floor_pd - // CHECK: call {{.*}}<4 x double> @llvm.x86.avx.round.pd.256(<4 x double> %{{.*}}, i32 1) + // CHECK: %{{.*}} = call <4 x double> @llvm.floor.v4f64(<4 x double> %{{.*}}) return _mm256_floor_pd(x); } __m256 test_mm_floor_ps(__m256 x) { // CHECK-LABEL: test_mm_floor_ps - // CHECK: call {{.*}}<8 x float> @llvm.x86.avx.round.ps.256(<8 x float> %{{.*}}, i32 1) + // CHECK: %{{.*}} = call <8 x float> @llvm.floor.v8f32(<8 x float> %{{.*}}) return _mm256_floor_ps(x); } @@ -1511,13 +1511,13 @@ __m256 test_mm256_rcp_ps(__m256 A) { __m256d test_mm256_round_pd(__m256d x) { // CHECK-LABEL: test_mm256_round_pd - // CHECK: call {{.*}}<4 x double> @llvm.x86.avx.round.pd.256(<4 x double> %{{.*}}, i32 4) + // CHECK: %{{.*}} = call <4 x double> @llvm.experimental.constrained.floor.v4f64(<4 x double> %{{.*}}, metadata !"fpexcept.strict") return _mm256_round_pd(x, 4); } __m256 test_mm256_round_ps(__m256 x) { // CHECK-LABEL: test_mm256_round_ps - // CHECK: call {{.*}}<8 x float> @llvm.x86.avx.round.ps.256(<8 x float> %{{.*}}, i32 4) + // CHECK: %{{.*}} = call <8 x float> @llvm.experimental.constrained.floor.v8f32(<8 x float> %{{.*}}, metadata !"fpexcept.strict") return _mm256_round_ps(x, 4); } diff --git a/clang/test/CodeGen/X86/pr51324.c b/clang/test/CodeGen/X86/pr51324.c index 10d1ba3c84b85..de97183aa6613 100644 --- a/clang/test/CodeGen/X86/pr51324.c +++ b/clang/test/CodeGen/X86/pr51324.c @@ -9,7 +9,7 @@ // Make sure brackets work after macro intrinsics. float pr51324(__m128 a) { // CHECK-LABEL: pr51324 - // CHECK: call <4 x float> @llvm.x86.sse41.round.ps(<4 x float> %{{.*}}, i32 0) + // call <4 x float> @llvm.nearbyint.v4f32(<4 x float> %{{.*}}) // CHECK: extractelement <4 x float> %{{.*}}, i32 0 return _mm_round_ps(a, 0)[0]; } diff --git a/clang/test/CodeGen/X86/sse41-builtins.c b/clang/test/CodeGen/X86/sse41-builtins.c index 35fa65a99836b..9163b14a9fc11 100644 --- a/clang/test/CodeGen/X86/sse41-builtins.c +++ b/clang/test/CodeGen/X86/sse41-builtins.c @@ -75,25 +75,29 @@ TEST_CONSTEXPR(match_m128(_mm_blendv_ps((__m128)(__v4sf){0.0f, 1.0f, 2.0f, 3.0f} __m128d test_mm_ceil_pd(__m128d x) { // CHECK-LABEL: test_mm_ceil_pd - // CHECK: call {{.*}}<2 x double> @llvm.x86.sse41.round.pd(<2 x double> %{{.*}}, i32 2) + // CHECK %{{.*}} = call <2 x double> @llvm.ceil.v2f64(<2 x double> %{{.*}}) return _mm_ceil_pd(x); } __m128 test_mm_ceil_ps(__m128 x) { // CHECK-LABEL: test_mm_ceil_ps - // CHECK: call {{.*}}<4 x float> @llvm.x86.sse41.round.ps(<4 x float> %{{.*}}, i32 2) + // CHECK: %{{.*}} = call <4 x float> @llvm.ceil.v4f32(<4 x float> %{{.*}}) return _mm_ceil_ps(x); } __m128d test_mm_ceil_sd(__m128d x, __m128d y) { // CHECK-LABEL: test_mm_ceil_sd - // CHECK: call {{.*}}<2 x double> @llvm.x86.sse41.round.sd(<2 x double> %{{.*}}, <2 x double> %{{.*}}, i32 2) + // CHECK: %[[A:.*]] = extractelement <2 x double> %{{.*}}, i32 0 + // CHECK: %[[B:.*]] = call double @llvm.ceil.f64(double %[[A:.*]]) + // CHECK: %{{.*}} = insertelement <2 x double> %0, double %[[B:.*]], i32 0 return _mm_ceil_sd(x, y); } __m128 test_mm_ceil_ss(__m128 x, __m128 y) { // CHECK-LABEL: test_mm_ceil_ss - // CHECK: call {{.*}}<4 x float> @llvm.x86.sse41.round.ss(<4 x float> %{{.*}}, <4 x float> %{{.*}}, i32 2) + // CHECK: %[[A:.*]] = extractelement <4 x float> %{{.*}}, i32 0 + // CHECK: %[[B:.*]] = call float @llvm.ceil.f32(float %[[A:.*]]) + // CHECK: %{{.*}} = insertelement <4 x float> %0, float %[[B:.*]], i32 0 return _mm_ceil_ss(x, y); } @@ -256,25 +260,29 @@ TEST_CONSTEXPR(_mm_extract_ps(((__m128){1.25f, 2.5f, 3.75f, 5.0f}), 6) == __buil __m128d test_mm_floor_pd(__m128d x) { // CHECK-LABEL: test_mm_floor_pd - // CHECK: call {{.*}}<2 x double> @llvm.x86.sse41.round.pd(<2 x double> %{{.*}}, i32 1) + // CHECK: %{{.*}} = call <2 x double> @llvm.floor.v2f64(<2 x double> %{{.*}}) return _mm_floor_pd(x); } __m128 test_mm_floor_ps(__m128 x) { // CHECK-LABEL: test_mm_floor_ps - // CHECK: call {{.*}}<4 x float> @llvm.x86.sse41.round.ps(<4 x float> %{{.*}}, i32 1) + // CHECK: %{{.*}} = call <4 x float> @llvm.floor.v4f32(<4 x float> %{{.*}}) return _mm_floor_ps(x); } __m128d test_mm_floor_sd(__m128d x, __m128d y) { // CHECK-LABEL: test_mm_floor_sd - // CHECK: call {{.*}}<2 x double> @llvm.x86.sse41.round.sd(<2 x double> %{{.*}}, <2 x double> %{{.*}}, i32 1) + // CHECK: %[[A:.*]] = extractelement <2 x double> %{{.*}}, i32 0 + // CHECK: %[[B:.*]] = call double @llvm.floor.f64(double %[[A:.*]]) + // CHECK: %{{.*}} = insertelement <2 x double> %0, double %[[B:.*]], i32 0 return _mm_floor_sd(x, y); } __m128 test_mm_floor_ss(__m128 x, __m128 y) { // CHECK-LABEL: test_mm_floor_ss - // CHECK: call {{.*}}<4 x float> @llvm.x86.sse41.round.ss(<4 x float> %{{.*}}, <4 x float> %{{.*}}, i32 1) + // CHECK: %[[A:.*]] = extractelement <4 x float> %{{.*}}, i32 0 + // CHECK: %[[B:.*]] = call float @llvm.floor.f32(float %[[A:.*]]) + // CHECK: %{{.*}} = insertelement <4 x float> %0, float %[[B:.*]], i32 0 return _mm_floor_ss(x, y); } @@ -430,25 +438,29 @@ TEST_CONSTEXPR(match_v8hi(_mm_packus_epi32((__m128i)(__v4si){40000, -50000, 3276 __m128d test_mm_round_pd(__m128d x) { // CHECK-LABEL: test_mm_round_pd - // CHECK: call {{.*}}<2 x double> @llvm.x86.sse41.round.pd(<2 x double> %{{.*}}, i32 4) + // CHECK: %{{.*}} = call <2 x double> @llvm.experimental.constrained.floor.v2f64(<2 x double> %{{.*}}, metadata !"fpexcept.strict") return _mm_round_pd(x, 4); } __m128 test_mm_round_ps(__m128 x) { // CHECK-LABEL: test_mm_round_ps - // CHECK: call {{.*}}<4 x float> @llvm.x86.sse41.round.ps(<4 x float> %{{.*}}, i32 4) + // CHECK: %{{.*}} = call <4 x float> @llvm.experimental.constrained.floor.v4f32(<4 x float> %{{.*}}, metadata !"fpexcept.strict") return _mm_round_ps(x, 4); } __m128d test_mm_round_sd(__m128d x, __m128d y) { // CHECK-LABEL: test_mm_round_sd - // CHECK: call {{.*}}<2 x double> @llvm.x86.sse41.round.sd(<2 x double> %{{.*}}, <2 x double> %{{.*}}, i32 4) + // CHECK: %[[A:.*]] = extractelement <2 x double> %{{.*}}, i32 0 + // CHECK: %[[B:.*]] = call double @llvm.experimental.constrained.floor.f64(double %[[A:.*]], metadata !"fpexcept.strict") + // CHECK: %{{.*}} = insertelement <2 x double> %0, double %[[B:.*]], i32 0 return _mm_round_sd(x, y, 4); } __m128 test_mm_round_ss(__m128 x, __m128 y) { // CHECK-LABEL: test_mm_round_ss - // CHECK: call {{.*}}<4 x float> @llvm.x86.sse41.round.ss(<4 x float> %{{.*}}, <4 x float> %{{.*}}, i32 4) + // CHECK: %[[A:.*]] = extractelement <4 x float> %{{.*}}, i32 0 + // CHECK: %[[B:.*]] = call float @llvm.experimental.constrained.floor.f32(float %[[A:.*]], metadata !"fpexcept.strict") + // CHECK: %{{.*}} = insertelement <4 x float> %0, float %[[B:.*]], i32 0 return _mm_round_ss(x, y, 4); } >From aab58a9b7309e3daa7b95ddc49da33ddebfef2cb Mon Sep 17 00:00:00 2001 From: stomfaig <[email protected]> Date: Sun, 7 Dec 2025 23:39:05 +0000 Subject: [PATCH 03/19] remove ClangBuiltin from ops --- llvm/include/llvm/IR/IntrinsicsX86.td | 18 ++++++------------ 1 file changed, 6 insertions(+), 12 deletions(-) diff --git a/llvm/include/llvm/IR/IntrinsicsX86.td b/llvm/include/llvm/IR/IntrinsicsX86.td index 1dd23f60c7e1e..6369e97f807fb 100644 --- a/llvm/include/llvm/IR/IntrinsicsX86.td +++ b/llvm/include/llvm/IR/IntrinsicsX86.td @@ -626,17 +626,13 @@ let TargetPrefix = "x86" in { // All intrinsics start with "llvm.x86.". // FP rounding ops let TargetPrefix = "x86" in { // All intrinsics start with "llvm.x86.". - def int_x86_sse41_round_ss : ClangBuiltin<"__builtin_ia32_roundss">, - DefaultAttrsIntrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty, llvm_v4f32_ty, + def int_x86_sse41_round_ss : DefaultAttrsIntrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty, llvm_v4f32_ty, llvm_i32_ty], [IntrNoMem, ImmArg<ArgIndex<2>>]>; - def int_x86_sse41_round_ps : ClangBuiltin<"__builtin_ia32_roundps">, - DefaultAttrsIntrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty, + def int_x86_sse41_round_ps : DefaultAttrsIntrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty, llvm_i32_ty], [IntrNoMem, ImmArg<ArgIndex<1>>]>; - def int_x86_sse41_round_sd : ClangBuiltin<"__builtin_ia32_roundsd">, - DefaultAttrsIntrinsic<[llvm_v2f64_ty], [llvm_v2f64_ty, llvm_v2f64_ty, + def int_x86_sse41_round_sd : DefaultAttrsIntrinsic<[llvm_v2f64_ty], [llvm_v2f64_ty, llvm_v2f64_ty, llvm_i32_ty], [IntrNoMem, ImmArg<ArgIndex<2>>]>; - def int_x86_sse41_round_pd : ClangBuiltin<"__builtin_ia32_roundpd">, - DefaultAttrsIntrinsic<[llvm_v2f64_ty], [llvm_v2f64_ty, + def int_x86_sse41_round_pd : DefaultAttrsIntrinsic<[llvm_v2f64_ty], [llvm_v2f64_ty, llvm_i32_ty], [IntrNoMem, ImmArg<ArgIndex<1>>]>; } @@ -921,11 +917,9 @@ let TargetPrefix = "x86" in { // All intrinsics start with "llvm.x86.". def int_x86_avx_rcp_ps_256 : ClangBuiltin<"__builtin_ia32_rcpps256">, DefaultAttrsIntrinsic<[llvm_v8f32_ty], [llvm_v8f32_ty], [IntrNoMem]>; - def int_x86_avx_round_pd_256 : ClangBuiltin<"__builtin_ia32_roundpd256">, - DefaultAttrsIntrinsic<[llvm_v4f64_ty], [llvm_v4f64_ty, llvm_i32_ty], + def int_x86_avx_round_pd_256 : DefaultAttrsIntrinsic<[llvm_v4f64_ty], [llvm_v4f64_ty, llvm_i32_ty], [IntrNoMem, ImmArg<ArgIndex<1>>]>; - def int_x86_avx_round_ps_256 : ClangBuiltin<"__builtin_ia32_roundps256">, - DefaultAttrsIntrinsic<[llvm_v8f32_ty], [llvm_v8f32_ty, llvm_i32_ty], + def int_x86_avx_round_ps_256 : DefaultAttrsIntrinsic<[llvm_v8f32_ty], [llvm_v8f32_ty, llvm_i32_ty], [IntrNoMem, ImmArg<ArgIndex<1>>]>; } >From 156d2aa3b72e90699885973f85aaacc0eb930435 Mon Sep 17 00:00:00 2001 From: stomfaig <[email protected]> Date: Mon, 8 Dec 2025 23:06:21 +0000 Subject: [PATCH 04/19] moving rounding functionality to helper --- clang/lib/CodeGen/TargetBuiltins/X86.cpp | 155 ++++++++++------------- 1 file changed, 69 insertions(+), 86 deletions(-) diff --git a/clang/lib/CodeGen/TargetBuiltins/X86.cpp b/clang/lib/CodeGen/TargetBuiltins/X86.cpp index a3e5c48629228..167ad4478e6b1 100644 --- a/clang/lib/CodeGen/TargetBuiltins/X86.cpp +++ b/clang/lib/CodeGen/TargetBuiltins/X86.cpp @@ -75,6 +75,70 @@ static Value *getMaskVecValue(CodeGenFunction &CGF, Value *Mask, return MaskVec; } +static Value *emitX86Round(CodeGenFunction &CGF, + Value *X, + unsigned M) { + unsigned RoundingMask = 0b11; + unsigned UpdatePEBit = 0b100; + unsigned UseMXCSRBit = 0b1000; + + unsigned roundingMode = M & RoundingMask; + bool updatePE = M & UpdatePEBit; + bool useMXCSR = M & UseMXCSRBit; + + Intrinsic::ID ID = Intrinsic::not_intrinsic; + LLVMContext &Ctx = CGF.CGM.getLLVMContext(); + + if (useMXCSR) { + ID = Intrinsic::experimental_constrained_nearbyint; + + auto PE_metatadata = updatePE ? "fpexcept.strict" : "fpexcept.ignore"; + + Value *ExceptMode = MetadataAsValue::get( + Ctx, + MDString::get(Ctx, PE_metatadata) + ); + + Value *RoundingMode = MetadataAsValue::get( + Ctx, + MDString::get(Ctx, "rounding.dynamic") + ); + + Function *F = CGF.CGM.getIntrinsic(ID, X->getType()); + return CGF.Builder.CreateCall(F, {X, ExceptMode, RoundingMode}); + } + + if (updatePE) { + switch (roundingMode) { + case 0b00: ID = Intrinsic::experimental_constrained_roundeven; break; + case 0b01: ID = Intrinsic::experimental_constrained_floor; break; + case 0b10: ID = Intrinsic::experimental_constrained_ceil; break; + case 0b11: ID = Intrinsic::experimental_constrained_trunc; break; + default: llvm_unreachable("Invalid rounding mode"); + } + + Value *ExceptMode =MetadataAsValue::get( + Ctx, + MDString::get(Ctx, "fpexcept.strict") + ); + + Function *F = CGF.CGM.getIntrinsic(ID, X->getType()); + return CGF.Builder.CreateCall(F, {X, ExceptMode}); + } + + // Otherwise we can use the standard ops + switch (roundingMode) { + case 0b00: ID = Intrinsic::roundeven; break; + case 0b01: ID = Intrinsic::floor; break; + case 0b10: ID = Intrinsic::ceil; break; + case 0b11: ID = Intrinsic::trunc; break; + default: llvm_unreachable("Invalid rounding mode"); + } + + Function *F = CGF.CGM.getIntrinsic(ID, X->getType()); + return CGF.Builder.CreateCall(F, {X}); +} + static Value *EmitX86MaskedStore(CodeGenFunction &CGF, ArrayRef<Value *> Ops, Align Alignment) { Value *Ptr = Ops[0]; @@ -843,100 +907,19 @@ Value *CodeGenFunction::EmitX86BuiltinExpr(unsigned BuiltinID, case X86::BI__builtin_ia32_roundps: case X86::BI__builtin_ia32_roundpd: case X86::BI__builtin_ia32_roundps256: - case X86::BI__builtin_ia32_roundpd256: { + case X86::BI__builtin_ia32_roundpd256: { unsigned M = cast<ConstantInt>(Ops[1])->getZExtValue(); - unsigned roundingModeAndPE = M & 0b111; - unsigned updatePE = M & 0b100; - unsigned use_MXCSR = M & 0b1000; - - Intrinsic::ID ID; - - // Currently no ops for MXCSR bit set, so lower directly to SSE41 instruction - if (use_MXCSR) { - switch (BuiltinID) { - case X86::BI__builtin_ia32_roundps: ID = Intrinsic::x86_sse41_round_ps; break; - case X86::BI__builtin_ia32_roundpd: ID = Intrinsic::x86_sse41_round_pd; break; - } - return nullptr; - } else { - switch (roundingModeAndPE) { - default: return nullptr; - case 0b000: ID = Intrinsic::nearbyint; break; - case 0b001: ID = Intrinsic::floor; break; - case 0b010: ID = Intrinsic::ceil; break; - case 0b011: ID = Intrinsic::trunc; break; - case 0b100: ID = Intrinsic::experimental_constrained_floor; break; // TODO: replace with actual op - case 0b101: ID = Intrinsic::experimental_constrained_floor; break; - case 0b110: ID = Intrinsic::experimental_constrained_ceil; break; - case 0b111: ID = Intrinsic::experimental_constrained_trunc; break; - } - } - - Function *F = CGM.getIntrinsic(ID, Ops[0]->getType()); - - if (updatePE) { - LLVMContext &Ctx = CGM.getLLVMContext(); - - Value *ExceptMode =MetadataAsValue::get( - Ctx, - MDString::get(Ctx, "fpexcept.strict") - ); - - return Builder.CreateCall(F, {Ops[0], ExceptMode}); - } - - return Builder.CreateCall(F, {Ops[0]}); + return emitX86Round(*this, Ops[0], M); } case X86::BI__builtin_ia32_roundss: case X86::BI__builtin_ia32_roundsd: { unsigned M = cast<ConstantInt>(Ops[2])->getZExtValue(); - unsigned roundingModeAndPE = M & 0b111; - unsigned updatePE = M & 0b100; - unsigned use_MXCSR = M & 0b1000; - Intrinsic::ID ID; - - // Currently no ops for MXCSR bit set, so lower directly to SSE41 instruction - if (use_MXCSR) { - switch (BuiltinID) { - case X86::BI__builtin_ia32_roundss: ID = Intrinsic::x86_sse41_round_ss; break; - case X86::BI__builtin_ia32_roundsd: ID = Intrinsic::x86_sse41_round_sd; break; - } - return nullptr; - } else { - switch (roundingModeAndPE) { - default: return nullptr; - case 0b000: ID = Intrinsic::nearbyint; break; - case 0b001: ID = Intrinsic::floor; break; - case 0b010: ID = Intrinsic::ceil; break; - case 0b011: ID = Intrinsic::trunc; break; - case 0b100: ID = Intrinsic::experimental_constrained_floor; break; // TODO: replace with actual op - case 0b101: ID = Intrinsic::experimental_constrained_floor; break; - case 0b110: ID = Intrinsic::experimental_constrained_ceil; break; - case 0b111: ID = Intrinsic::experimental_constrained_trunc; break; - } - } - Value *idx = Builder.getInt32(0); - Value *b0 = Builder.CreateExtractElement(Ops[1], idx); - Value *rounded0; - - Function *F = CGM.getIntrinsic(ID, b0->getType()); - - if (updatePE) { - LLVMContext &Ctx = CGM.getLLVMContext(); - - Value *ExceptMode =MetadataAsValue::get( - Ctx, - MDString::get(Ctx, "fpexcept.strict") - ); - - rounded0 = Builder.CreateCall(F, {b0, ExceptMode}); - } else { - rounded0 = Builder.CreateCall(F, {b0}); - } + Value *ValAt0 = Builder.CreateExtractElement(Ops[1], idx); + Value *RoundedAt0 = emitX86Round(*this, ValAt0, M); - return Builder.CreateInsertElement(Ops[0], rounded0, idx); + return Builder.CreateInsertElement(Ops[0], RoundedAt0, idx); } case X86::BI__builtin_ia32_lzcnt_u16: case X86::BI__builtin_ia32_lzcnt_u32: >From 7ab45f89c6e3092b210ab2c12a24bd706b8de41c Mon Sep 17 00:00:00 2001 From: stomfaig <[email protected]> Date: Mon, 8 Dec 2025 23:06:41 +0000 Subject: [PATCH 05/19] update tests --- clang/test/CodeGen/X86/avx-builtins.c | 4 ++-- clang/test/CodeGen/X86/sse41-builtins.c | 8 ++++---- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/clang/test/CodeGen/X86/avx-builtins.c b/clang/test/CodeGen/X86/avx-builtins.c index 506327bc910c7..f3844adf0a498 100644 --- a/clang/test/CodeGen/X86/avx-builtins.c +++ b/clang/test/CodeGen/X86/avx-builtins.c @@ -1511,13 +1511,13 @@ __m256 test_mm256_rcp_ps(__m256 A) { __m256d test_mm256_round_pd(__m256d x) { // CHECK-LABEL: test_mm256_round_pd - // CHECK: %{{.*}} = call <4 x double> @llvm.experimental.constrained.floor.v4f64(<4 x double> %{{.*}}, metadata !"fpexcept.strict") + // CHECK: %{{.*}} = call <4 x double> @llvm.experimental.constrained.roundeven.v4f64(<4 x double> %{{.*}}, metadata !"fpexcept.strict") return _mm256_round_pd(x, 4); } __m256 test_mm256_round_ps(__m256 x) { // CHECK-LABEL: test_mm256_round_ps - // CHECK: %{{.*}} = call <8 x float> @llvm.experimental.constrained.floor.v8f32(<8 x float> %{{.*}}, metadata !"fpexcept.strict") + // CHECK: %{{.*}} = call <8 x float> @llvm.experimental.constrained.roundeven.v8f32(<8 x float> %{{.*}}, metadata !"fpexcept.strict") return _mm256_round_ps(x, 4); } diff --git a/clang/test/CodeGen/X86/sse41-builtins.c b/clang/test/CodeGen/X86/sse41-builtins.c index 9163b14a9fc11..f084e1dfade15 100644 --- a/clang/test/CodeGen/X86/sse41-builtins.c +++ b/clang/test/CodeGen/X86/sse41-builtins.c @@ -438,20 +438,20 @@ TEST_CONSTEXPR(match_v8hi(_mm_packus_epi32((__m128i)(__v4si){40000, -50000, 3276 __m128d test_mm_round_pd(__m128d x) { // CHECK-LABEL: test_mm_round_pd - // CHECK: %{{.*}} = call <2 x double> @llvm.experimental.constrained.floor.v2f64(<2 x double> %{{.*}}, metadata !"fpexcept.strict") + // CHECK: %{{.*}} = call <2 x double> @llvm.experimental.constrained.roundeven.v2f64(<2 x double> %{{.*}}, metadata !"fpexcept.strict") return _mm_round_pd(x, 4); } __m128 test_mm_round_ps(__m128 x) { // CHECK-LABEL: test_mm_round_ps - // CHECK: %{{.*}} = call <4 x float> @llvm.experimental.constrained.floor.v4f32(<4 x float> %{{.*}}, metadata !"fpexcept.strict") + // CHECK: %{{.*}} = call <4 x float> @llvm.experimental.constrained.roundeven.v4f32(<4 x float> %{{.*}}, metadata !"fpexcept.strict") return _mm_round_ps(x, 4); } __m128d test_mm_round_sd(__m128d x, __m128d y) { // CHECK-LABEL: test_mm_round_sd // CHECK: %[[A:.*]] = extractelement <2 x double> %{{.*}}, i32 0 - // CHECK: %[[B:.*]] = call double @llvm.experimental.constrained.floor.f64(double %[[A:.*]], metadata !"fpexcept.strict") + // CHECK: %[[B:.*]] = call double @llvm.experimental.constrained.roundeven.f64(double %[[A:.*]], metadata !"fpexcept.strict") // CHECK: %{{.*}} = insertelement <2 x double> %0, double %[[B:.*]], i32 0 return _mm_round_sd(x, y, 4); } @@ -459,7 +459,7 @@ __m128d test_mm_round_sd(__m128d x, __m128d y) { __m128 test_mm_round_ss(__m128 x, __m128 y) { // CHECK-LABEL: test_mm_round_ss // CHECK: %[[A:.*]] = extractelement <4 x float> %{{.*}}, i32 0 - // CHECK: %[[B:.*]] = call float @llvm.experimental.constrained.floor.f32(float %[[A:.*]], metadata !"fpexcept.strict") + // CHECK: %[[B:.*]] = call float @llvm.experimental.constrained.roundeven.f32(float %[[A:.*]], metadata !"fpexcept.strict") // CHECK: %{{.*}} = insertelement <4 x float> %0, float %[[B:.*]], i32 0 return _mm_round_ss(x, y, 4); } >From 62a18f9a34347c6243c15e0c627b1d74cfe916a7 Mon Sep 17 00:00:00 2001 From: stomfaig <[email protected]> Date: Mon, 8 Dec 2025 23:07:42 +0000 Subject: [PATCH 06/19] format --- clang/lib/CodeGen/TargetBuiltins/X86.cpp | 76 ++++++++++++++---------- llvm/include/llvm/IR/IntrinsicsX86.td | 32 ++++++---- 2 files changed, 63 insertions(+), 45 deletions(-) diff --git a/clang/lib/CodeGen/TargetBuiltins/X86.cpp b/clang/lib/CodeGen/TargetBuiltins/X86.cpp index 167ad4478e6b1..c8b55e855e717 100644 --- a/clang/lib/CodeGen/TargetBuiltins/X86.cpp +++ b/clang/lib/CodeGen/TargetBuiltins/X86.cpp @@ -75,52 +75,53 @@ static Value *getMaskVecValue(CodeGenFunction &CGF, Value *Mask, return MaskVec; } -static Value *emitX86Round(CodeGenFunction &CGF, - Value *X, - unsigned M) { +static Value *emitX86Round(CodeGenFunction &CGF, Value *X, unsigned M) { unsigned RoundingMask = 0b11; unsigned UpdatePEBit = 0b100; unsigned UseMXCSRBit = 0b1000; - + unsigned roundingMode = M & RoundingMask; bool updatePE = M & UpdatePEBit; bool useMXCSR = M & UseMXCSRBit; - + Intrinsic::ID ID = Intrinsic::not_intrinsic; LLVMContext &Ctx = CGF.CGM.getLLVMContext(); - + if (useMXCSR) { ID = Intrinsic::experimental_constrained_nearbyint; - + auto PE_metatadata = updatePE ? "fpexcept.strict" : "fpexcept.ignore"; - Value *ExceptMode = MetadataAsValue::get( - Ctx, - MDString::get(Ctx, PE_metatadata) - ); + Value *ExceptMode = + MetadataAsValue::get(Ctx, MDString::get(Ctx, PE_metatadata)); - Value *RoundingMode = MetadataAsValue::get( - Ctx, - MDString::get(Ctx, "rounding.dynamic") - ); + Value *RoundingMode = + MetadataAsValue::get(Ctx, MDString::get(Ctx, "rounding.dynamic")); Function *F = CGF.CGM.getIntrinsic(ID, X->getType()); return CGF.Builder.CreateCall(F, {X, ExceptMode, RoundingMode}); - } + } if (updatePE) { switch (roundingMode) { - case 0b00: ID = Intrinsic::experimental_constrained_roundeven; break; - case 0b01: ID = Intrinsic::experimental_constrained_floor; break; - case 0b10: ID = Intrinsic::experimental_constrained_ceil; break; - case 0b11: ID = Intrinsic::experimental_constrained_trunc; break; - default: llvm_unreachable("Invalid rounding mode"); + case 0b00: + ID = Intrinsic::experimental_constrained_roundeven; + break; + case 0b01: + ID = Intrinsic::experimental_constrained_floor; + break; + case 0b10: + ID = Intrinsic::experimental_constrained_ceil; + break; + case 0b11: + ID = Intrinsic::experimental_constrained_trunc; + break; + default: + llvm_unreachable("Invalid rounding mode"); } - Value *ExceptMode =MetadataAsValue::get( - Ctx, - MDString::get(Ctx, "fpexcept.strict") - ); + Value *ExceptMode = + MetadataAsValue::get(Ctx, MDString::get(Ctx, "fpexcept.strict")); Function *F = CGF.CGM.getIntrinsic(ID, X->getType()); return CGF.Builder.CreateCall(F, {X, ExceptMode}); @@ -128,11 +129,20 @@ static Value *emitX86Round(CodeGenFunction &CGF, // Otherwise we can use the standard ops switch (roundingMode) { - case 0b00: ID = Intrinsic::roundeven; break; - case 0b01: ID = Intrinsic::floor; break; - case 0b10: ID = Intrinsic::ceil; break; - case 0b11: ID = Intrinsic::trunc; break; - default: llvm_unreachable("Invalid rounding mode"); + case 0b00: + ID = Intrinsic::roundeven; + break; + case 0b01: + ID = Intrinsic::floor; + break; + case 0b10: + ID = Intrinsic::ceil; + break; + case 0b11: + ID = Intrinsic::trunc; + break; + default: + llvm_unreachable("Invalid rounding mode"); } Function *F = CGF.CGM.getIntrinsic(ID, X->getType()); @@ -907,18 +917,18 @@ Value *CodeGenFunction::EmitX86BuiltinExpr(unsigned BuiltinID, case X86::BI__builtin_ia32_roundps: case X86::BI__builtin_ia32_roundpd: case X86::BI__builtin_ia32_roundps256: - case X86::BI__builtin_ia32_roundpd256: { + case X86::BI__builtin_ia32_roundpd256: { unsigned M = cast<ConstantInt>(Ops[1])->getZExtValue(); return emitX86Round(*this, Ops[0], M); } case X86::BI__builtin_ia32_roundss: case X86::BI__builtin_ia32_roundsd: { unsigned M = cast<ConstantInt>(Ops[2])->getZExtValue(); - + Value *idx = Builder.getInt32(0); Value *ValAt0 = Builder.CreateExtractElement(Ops[1], idx); Value *RoundedAt0 = emitX86Round(*this, ValAt0, M); - + return Builder.CreateInsertElement(Ops[0], RoundedAt0, idx); } case X86::BI__builtin_ia32_lzcnt_u16: diff --git a/llvm/include/llvm/IR/IntrinsicsX86.td b/llvm/include/llvm/IR/IntrinsicsX86.td index 6369e97f807fb..7838e410badd7 100644 --- a/llvm/include/llvm/IR/IntrinsicsX86.td +++ b/llvm/include/llvm/IR/IntrinsicsX86.td @@ -626,14 +626,20 @@ let TargetPrefix = "x86" in { // All intrinsics start with "llvm.x86.". // FP rounding ops let TargetPrefix = "x86" in { // All intrinsics start with "llvm.x86.". - def int_x86_sse41_round_ss : DefaultAttrsIntrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty, llvm_v4f32_ty, - llvm_i32_ty], [IntrNoMem, ImmArg<ArgIndex<2>>]>; - def int_x86_sse41_round_ps : DefaultAttrsIntrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty, - llvm_i32_ty], [IntrNoMem, ImmArg<ArgIndex<1>>]>; - def int_x86_sse41_round_sd : DefaultAttrsIntrinsic<[llvm_v2f64_ty], [llvm_v2f64_ty, llvm_v2f64_ty, - llvm_i32_ty], [IntrNoMem, ImmArg<ArgIndex<2>>]>; - def int_x86_sse41_round_pd : DefaultAttrsIntrinsic<[llvm_v2f64_ty], [llvm_v2f64_ty, - llvm_i32_ty], [IntrNoMem, ImmArg<ArgIndex<1>>]>; + def int_x86_sse41_round_ss + : DefaultAttrsIntrinsic<[llvm_v4f32_ty], + [llvm_v4f32_ty, llvm_v4f32_ty, llvm_i32_ty], + [IntrNoMem, ImmArg<ArgIndex<2>>]>; + def int_x86_sse41_round_ps + : DefaultAttrsIntrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty, llvm_i32_ty], + [IntrNoMem, ImmArg<ArgIndex<1>>]>; + def int_x86_sse41_round_sd + : DefaultAttrsIntrinsic<[llvm_v2f64_ty], + [llvm_v2f64_ty, llvm_v2f64_ty, llvm_i32_ty], + [IntrNoMem, ImmArg<ArgIndex<2>>]>; + def int_x86_sse41_round_pd + : DefaultAttrsIntrinsic<[llvm_v2f64_ty], [llvm_v2f64_ty, llvm_i32_ty], + [IntrNoMem, ImmArg<ArgIndex<1>>]>; } // Vector min element @@ -917,10 +923,12 @@ let TargetPrefix = "x86" in { // All intrinsics start with "llvm.x86.". def int_x86_avx_rcp_ps_256 : ClangBuiltin<"__builtin_ia32_rcpps256">, DefaultAttrsIntrinsic<[llvm_v8f32_ty], [llvm_v8f32_ty], [IntrNoMem]>; - def int_x86_avx_round_pd_256 : DefaultAttrsIntrinsic<[llvm_v4f64_ty], [llvm_v4f64_ty, llvm_i32_ty], - [IntrNoMem, ImmArg<ArgIndex<1>>]>; - def int_x86_avx_round_ps_256 : DefaultAttrsIntrinsic<[llvm_v8f32_ty], [llvm_v8f32_ty, llvm_i32_ty], - [IntrNoMem, ImmArg<ArgIndex<1>>]>; + def int_x86_avx_round_pd_256 + : DefaultAttrsIntrinsic<[llvm_v4f64_ty], [llvm_v4f64_ty, llvm_i32_ty], + [IntrNoMem, ImmArg<ArgIndex<1>>]>; + def int_x86_avx_round_ps_256 + : DefaultAttrsIntrinsic<[llvm_v8f32_ty], [llvm_v8f32_ty, llvm_i32_ty], + [IntrNoMem, ImmArg<ArgIndex<1>>]>; } // Horizontal ops >From c4eff0d6076ebaeaaef481f4adc914bfc349ec4a Mon Sep 17 00:00:00 2001 From: stomfaig <[email protected]> Date: Tue, 9 Dec 2025 18:04:23 +0000 Subject: [PATCH 07/19] resolving comments --- clang/lib/CodeGen/TargetBuiltins/X86.cpp | 43 ++++-------------------- clang/test/CodeGen/X86/avx-builtins.c | 4 +-- clang/test/CodeGen/X86/sse41-builtins.c | 8 ++--- 3 files changed, 13 insertions(+), 42 deletions(-) diff --git a/clang/lib/CodeGen/TargetBuiltins/X86.cpp b/clang/lib/CodeGen/TargetBuiltins/X86.cpp index c8b55e855e717..d4c25cdc8b0ab 100644 --- a/clang/lib/CodeGen/TargetBuiltins/X86.cpp +++ b/clang/lib/CodeGen/TargetBuiltins/X86.cpp @@ -75,14 +75,13 @@ static Value *getMaskVecValue(CodeGenFunction &CGF, Value *Mask, return MaskVec; } -static Value *emitX86Round(CodeGenFunction &CGF, Value *X, unsigned M) { - unsigned RoundingMask = 0b11; - unsigned UpdatePEBit = 0b100; - unsigned UseMXCSRBit = 0b1000; +// Emit rounding for the value X according to the rounding RoundingControl. +static Value *emitX86Round(CodeGenFunction &CGF, Value *X, unsigned RoundingControl) { + unsigned roundingMask = 0b11; + unsigned useMXCSRBit = 0b1000; - unsigned roundingMode = M & RoundingMask; - bool updatePE = M & UpdatePEBit; - bool useMXCSR = M & UseMXCSRBit; + unsigned roundingMode = RoundingControl & roundingMask; + bool useMXCSR = RoundingControl & useMXCSRBit; Intrinsic::ID ID = Intrinsic::not_intrinsic; LLVMContext &Ctx = CGF.CGM.getLLVMContext(); @@ -90,10 +89,8 @@ static Value *emitX86Round(CodeGenFunction &CGF, Value *X, unsigned M) { if (useMXCSR) { ID = Intrinsic::experimental_constrained_nearbyint; - auto PE_metatadata = updatePE ? "fpexcept.strict" : "fpexcept.ignore"; - Value *ExceptMode = - MetadataAsValue::get(Ctx, MDString::get(Ctx, PE_metatadata)); + MetadataAsValue::get(Ctx, MDString::get(Ctx, "fpexcept.ignore")); Value *RoundingMode = MetadataAsValue::get(Ctx, MDString::get(Ctx, "rounding.dynamic")); @@ -102,32 +99,6 @@ static Value *emitX86Round(CodeGenFunction &CGF, Value *X, unsigned M) { return CGF.Builder.CreateCall(F, {X, ExceptMode, RoundingMode}); } - if (updatePE) { - switch (roundingMode) { - case 0b00: - ID = Intrinsic::experimental_constrained_roundeven; - break; - case 0b01: - ID = Intrinsic::experimental_constrained_floor; - break; - case 0b10: - ID = Intrinsic::experimental_constrained_ceil; - break; - case 0b11: - ID = Intrinsic::experimental_constrained_trunc; - break; - default: - llvm_unreachable("Invalid rounding mode"); - } - - Value *ExceptMode = - MetadataAsValue::get(Ctx, MDString::get(Ctx, "fpexcept.strict")); - - Function *F = CGF.CGM.getIntrinsic(ID, X->getType()); - return CGF.Builder.CreateCall(F, {X, ExceptMode}); - } - - // Otherwise we can use the standard ops switch (roundingMode) { case 0b00: ID = Intrinsic::roundeven; diff --git a/clang/test/CodeGen/X86/avx-builtins.c b/clang/test/CodeGen/X86/avx-builtins.c index f3844adf0a498..e6b8b57b8cb30 100644 --- a/clang/test/CodeGen/X86/avx-builtins.c +++ b/clang/test/CodeGen/X86/avx-builtins.c @@ -1511,13 +1511,13 @@ __m256 test_mm256_rcp_ps(__m256 A) { __m256d test_mm256_round_pd(__m256d x) { // CHECK-LABEL: test_mm256_round_pd - // CHECK: %{{.*}} = call <4 x double> @llvm.experimental.constrained.roundeven.v4f64(<4 x double> %{{.*}}, metadata !"fpexcept.strict") + // CHECK: %{{.*}} = call <4 x double> @llvm.roundeven.v4f64(<4 x double> %{{.*}}) return _mm256_round_pd(x, 4); } __m256 test_mm256_round_ps(__m256 x) { // CHECK-LABEL: test_mm256_round_ps - // CHECK: %{{.*}} = call <8 x float> @llvm.experimental.constrained.roundeven.v8f32(<8 x float> %{{.*}}, metadata !"fpexcept.strict") + // CHECK: %{{.*}} = call <8 x float> @llvm.roundeven.v8f32(<8 x float> %{{.*}}) return _mm256_round_ps(x, 4); } diff --git a/clang/test/CodeGen/X86/sse41-builtins.c b/clang/test/CodeGen/X86/sse41-builtins.c index f084e1dfade15..3d2eacfcb9287 100644 --- a/clang/test/CodeGen/X86/sse41-builtins.c +++ b/clang/test/CodeGen/X86/sse41-builtins.c @@ -438,20 +438,20 @@ TEST_CONSTEXPR(match_v8hi(_mm_packus_epi32((__m128i)(__v4si){40000, -50000, 3276 __m128d test_mm_round_pd(__m128d x) { // CHECK-LABEL: test_mm_round_pd - // CHECK: %{{.*}} = call <2 x double> @llvm.experimental.constrained.roundeven.v2f64(<2 x double> %{{.*}}, metadata !"fpexcept.strict") + // CHECK: %{{.*}} = call <2 x double> @llvm.roundeven.v2f64(<2 x double> %{{.*}}) return _mm_round_pd(x, 4); } __m128 test_mm_round_ps(__m128 x) { // CHECK-LABEL: test_mm_round_ps - // CHECK: %{{.*}} = call <4 x float> @llvm.experimental.constrained.roundeven.v4f32(<4 x float> %{{.*}}, metadata !"fpexcept.strict") + // CHECK: %{{.*}} = call <4 x float> @llvm.roundeven.v4f32(<4 x float> %{{.*}}) return _mm_round_ps(x, 4); } __m128d test_mm_round_sd(__m128d x, __m128d y) { // CHECK-LABEL: test_mm_round_sd // CHECK: %[[A:.*]] = extractelement <2 x double> %{{.*}}, i32 0 - // CHECK: %[[B:.*]] = call double @llvm.experimental.constrained.roundeven.f64(double %[[A:.*]], metadata !"fpexcept.strict") + // CHECK: %[[B:.*]] = call double @llvm.roundeven.f64(double %[[A:.*]]) // CHECK: %{{.*}} = insertelement <2 x double> %0, double %[[B:.*]], i32 0 return _mm_round_sd(x, y, 4); } @@ -459,7 +459,7 @@ __m128d test_mm_round_sd(__m128d x, __m128d y) { __m128 test_mm_round_ss(__m128 x, __m128 y) { // CHECK-LABEL: test_mm_round_ss // CHECK: %[[A:.*]] = extractelement <4 x float> %{{.*}}, i32 0 - // CHECK: %[[B:.*]] = call float @llvm.experimental.constrained.roundeven.f32(float %[[A:.*]], metadata !"fpexcept.strict") + // CHECK: %[[B:.*]] = call float @llvm.roundeven.f32(float %[[A:.*]]) // CHECK: %{{.*}} = insertelement <4 x float> %0, float %[[B:.*]], i32 0 return _mm_round_ss(x, y, 4); } >From 9b2cda2ccaa0631133d0dee1e378951d1db09cce Mon Sep 17 00:00:00 2001 From: stomfaig <[email protected]> Date: Tue, 9 Dec 2025 18:04:39 +0000 Subject: [PATCH 08/19] format --- clang/lib/CodeGen/TargetBuiltins/X86.cpp | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/clang/lib/CodeGen/TargetBuiltins/X86.cpp b/clang/lib/CodeGen/TargetBuiltins/X86.cpp index d4c25cdc8b0ab..fc10f460e6dc4 100644 --- a/clang/lib/CodeGen/TargetBuiltins/X86.cpp +++ b/clang/lib/CodeGen/TargetBuiltins/X86.cpp @@ -76,7 +76,8 @@ static Value *getMaskVecValue(CodeGenFunction &CGF, Value *Mask, } // Emit rounding for the value X according to the rounding RoundingControl. -static Value *emitX86Round(CodeGenFunction &CGF, Value *X, unsigned RoundingControl) { +static Value *emitX86Round(CodeGenFunction &CGF, Value *X, + unsigned RoundingControl) { unsigned roundingMask = 0b11; unsigned useMXCSRBit = 0b1000; >From 4202a3a74c1a45660236351477c9707f74251233 Mon Sep 17 00:00:00 2001 From: stomfaig <[email protected]> Date: Thu, 11 Dec 2025 01:24:00 +0000 Subject: [PATCH 09/19] save --- llvm/lib/Target/X86/X86InstrSSE.td | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/llvm/lib/Target/X86/X86InstrSSE.td b/llvm/lib/Target/X86/X86InstrSSE.td index e4aaa1e1b594a..fada8ccb9808a 100644 --- a/llvm/lib/Target/X86/X86InstrSSE.td +++ b/llvm/lib/Target/X86/X86InstrSSE.td @@ -5707,6 +5707,23 @@ let Predicates = [UseSSE41, OptForSize] in { (ROUNDSDmi addr:$src1, timm:$src2)>; } +multiclass test<SDPatternOperator OpNode, string OpcPrefix, SDNode Move, + ValueType VT, Predicate BasePredicate> { + let Predicates = [BasePredicate] in { + def : Pat<(VT (insertelt VT:$dst, (OpNode (extractelt VT:$src, 0)), 0)), + (!cast<Instruction>(OpcPrefix#ri_Int) $dst, $src, 0)>; + } + + // Repeat for AVX versions of the instructions. + let Predicates = [UseAVX] in { + def : Pat<(VT (Move VT:$dst, (scalar_to_vector + (OpNode (extractelt VT:$src, 0))))), + (!cast<Instruction>("V"#OpcPrefix#ri_Int) VT:$dst, VT:$src, 0)>; + } +} + +defm : test<any_fceil, "ROUNDSS", X86Movss, v4f32, UseSSE41>; + //===----------------------------------------------------------------------===// // SSE4.1 - Packed Bit Test //===----------------------------------------------------------------------===// >From a1825fb75bccb95560333b3b624acfccaa3363a2 Mon Sep 17 00:00:00 2001 From: stomfaig <[email protected]> Date: Thu, 11 Dec 2025 17:05:57 +0000 Subject: [PATCH 10/19] add pattern to eliminate round + blend in asm --- llvm/lib/Target/X86/X86InstrSSE.td | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/llvm/lib/Target/X86/X86InstrSSE.td b/llvm/lib/Target/X86/X86InstrSSE.td index fada8ccb9808a..24d19b40ad41d 100644 --- a/llvm/lib/Target/X86/X86InstrSSE.td +++ b/llvm/lib/Target/X86/X86InstrSSE.td @@ -5710,19 +5710,20 @@ let Predicates = [UseSSE41, OptForSize] in { multiclass test<SDPatternOperator OpNode, string OpcPrefix, SDNode Move, ValueType VT, Predicate BasePredicate> { let Predicates = [BasePredicate] in { - def : Pat<(VT (insertelt VT:$dst, (OpNode (extractelt VT:$src, 0)), 0)), - (!cast<Instruction>(OpcPrefix#ri_Int) $dst, $src, 0)>; + def : Pat<(VT (Move VT:$dst, (scalar_to_vector + (OpNode (extractelt VT:$src, (i64 0)), i32:$imm)))), + (!cast<Instruction>(OpcPrefix#ri_Int) VT:$dst, VT:$src, i32:$imm)>; } // Repeat for AVX versions of the instructions. let Predicates = [UseAVX] in { def : Pat<(VT (Move VT:$dst, (scalar_to_vector - (OpNode (extractelt VT:$src, 0))))), - (!cast<Instruction>("V"#OpcPrefix#ri_Int) VT:$dst, VT:$src, 0)>; + (OpNode (extractelt VT:$src, (i64 0)), i32:$imm)))), + (!cast<Instruction>("V"#OpcPrefix#ri_Int) VT:$dst, VT:$src, i32:$imm)>; } } - -defm : test<any_fceil, "ROUNDSS", X86Movss, v4f32, UseSSE41>; +defm : test<X86any_VRndScale, "ROUNDSS", X86Movss, v4f32, UseSSE41>; +defm : test<X86any_VRndScale, "ROUNDSD", X86Movsd, v2f64, UseSSE41>; //===----------------------------------------------------------------------===// // SSE4.1 - Packed Bit Test >From 7f84d4919098c7b1e80596f3ff45079e7d83aba1 Mon Sep 17 00:00:00 2001 From: stomfaig <[email protected]> Date: Thu, 11 Dec 2025 17:07:41 +0000 Subject: [PATCH 11/19] style: varnames are capitalised --- clang/lib/CodeGen/TargetBuiltins/X86.cpp | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/clang/lib/CodeGen/TargetBuiltins/X86.cpp b/clang/lib/CodeGen/TargetBuiltins/X86.cpp index fc10f460e6dc4..8d43fd90a5247 100644 --- a/clang/lib/CodeGen/TargetBuiltins/X86.cpp +++ b/clang/lib/CodeGen/TargetBuiltins/X86.cpp @@ -78,16 +78,16 @@ static Value *getMaskVecValue(CodeGenFunction &CGF, Value *Mask, // Emit rounding for the value X according to the rounding RoundingControl. static Value *emitX86Round(CodeGenFunction &CGF, Value *X, unsigned RoundingControl) { - unsigned roundingMask = 0b11; - unsigned useMXCSRBit = 0b1000; + unsigned RoundingMask = 0b11; + unsigned UseMXCSRBit = 0b1000; - unsigned roundingMode = RoundingControl & roundingMask; - bool useMXCSR = RoundingControl & useMXCSRBit; + unsigned RoundingMode = RoundingControl & RoundingMask; + bool UseMXCSR = RoundingControl & UseMXCSRBit; Intrinsic::ID ID = Intrinsic::not_intrinsic; LLVMContext &Ctx = CGF.CGM.getLLVMContext(); - if (useMXCSR) { + if (UseMXCSR) { ID = Intrinsic::experimental_constrained_nearbyint; Value *ExceptMode = @@ -100,7 +100,7 @@ static Value *emitX86Round(CodeGenFunction &CGF, Value *X, return CGF.Builder.CreateCall(F, {X, ExceptMode, RoundingMode}); } - switch (roundingMode) { + switch (RoundingMode) { case 0b00: ID = Intrinsic::roundeven; break; >From c9cfd3df5cd6c50d61d916a8e107d1d71a0ea4b1 Mon Sep 17 00:00:00 2001 From: stomfaig <[email protected]> Date: Thu, 11 Dec 2025 22:57:19 +0000 Subject: [PATCH 12/19] fix: correct order and value for metadata args --- clang/lib/CodeGen/TargetBuiltins/X86.cpp | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/clang/lib/CodeGen/TargetBuiltins/X86.cpp b/clang/lib/CodeGen/TargetBuiltins/X86.cpp index 8d43fd90a5247..8b804ab561e09 100644 --- a/clang/lib/CodeGen/TargetBuiltins/X86.cpp +++ b/clang/lib/CodeGen/TargetBuiltins/X86.cpp @@ -90,14 +90,14 @@ static Value *emitX86Round(CodeGenFunction &CGF, Value *X, if (UseMXCSR) { ID = Intrinsic::experimental_constrained_nearbyint; + Value *RoundingMode = + MetadataAsValue::get(Ctx, MDString::get(Ctx, "round.dynamic")); + Value *ExceptMode = MetadataAsValue::get(Ctx, MDString::get(Ctx, "fpexcept.ignore")); - Value *RoundingMode = - MetadataAsValue::get(Ctx, MDString::get(Ctx, "rounding.dynamic")); - Function *F = CGF.CGM.getIntrinsic(ID, X->getType()); - return CGF.Builder.CreateCall(F, {X, ExceptMode, RoundingMode}); + return CGF.Builder.CreateCall(F, {X, RoundingMode, ExceptMode}); } switch (RoundingMode) { >From c9923c145b8bb60fe24a388ce37d7f09c42f048d Mon Sep 17 00:00:00 2001 From: stomfaig <[email protected]> Date: Thu, 11 Dec 2025 22:58:34 +0000 Subject: [PATCH 13/19] test: add tests for constrained ops --- .../CodeGen/X86/avx-builtins-constrainted.c | 36 ++++++++++++++ .../CodeGen/X86/sse41-builtins-constrained.c | 49 +++++++++++++++++++ 2 files changed, 85 insertions(+) create mode 100644 clang/test/CodeGen/X86/avx-builtins-constrainted.c create mode 100644 clang/test/CodeGen/X86/sse41-builtins-constrained.c diff --git a/clang/test/CodeGen/X86/avx-builtins-constrainted.c b/clang/test/CodeGen/X86/avx-builtins-constrainted.c new file mode 100644 index 0000000000000..cbd4060364139 --- /dev/null +++ b/clang/test/CodeGen/X86/avx-builtins-constrainted.c @@ -0,0 +1,36 @@ +// RUN: %clang_cc1 -x c -flax-vector-conversions=none -ffreestanding %s -triple=x86_64-apple-darwin -target-feature +avx -emit-llvm -o - -Wall -Werror | FileCheck %s --check-prefixes=CHECK +// RUN: %clang_cc1 -x c -flax-vector-conversions=none -ffreestanding %s -triple=x86_64-apple-darwin -target-feature +avx -fno-signed-char -emit-llvm -o - -Wall -Werror | FileCheck %s --check-prefixes=CHECK +// RUN: %clang_cc1 -x c -flax-vector-conversions=none -ffreestanding %s -triple=i386-apple-darwin -target-feature +avx -emit-llvm -o - -Wall -Werror | FileCheck %s --check-prefixes=CHECK +// RUN: %clang_cc1 -x c -flax-vector-conversions=none -ffreestanding %s -triple=i386-apple-darwin -target-feature +avx -fno-signed-char -emit-llvm -o - -Wall -Werror | FileCheck %s --check-prefixes=CHECK +// RUN: %clang_cc1 -x c -flax-vector-conversions=none -fms-extensions -fms-compatibility -ffreestanding %s -triple=x86_64-windows-msvc -target-feature +avx -emit-llvm -o - -Wall -Werror | FileCheck %s --check-prefixes=CHECK +// RUN: %clang_cc1 -x c++ -flax-vector-conversions=none -ffreestanding %s -triple=x86_64-apple-darwin -target-feature +avx -emit-llvm -o - -Wall -Werror | FileCheck %s --check-prefixes=CHECK +// RUN: %clang_cc1 -x c++ -flax-vector-conversions=none -ffreestanding %s -triple=x86_64-apple-darwin -target-feature +avx -fno-signed-char -emit-llvm -o - -Wall -Werror | FileCheck %s --check-prefixes=CHECK +// RUN: %clang_cc1 -x c++ -flax-vector-conversions=none -ffreestanding %s -triple=i386-apple-darwin -target-feature +avx -emit-llvm -o - -Wall -Werror | FileCheck %s --check-prefixes=CHECK +// RUN: %clang_cc1 -x c++ -flax-vector-conversions=none -ffreestanding %s -triple=i386-apple-darwin -target-feature +avx -fno-signed-char -emit-llvm -o - -Wall -Werror | FileCheck %s --check-prefixes=CHECK +// RUN: %clang_cc1 -x c++ -flax-vector-conversions=none -fms-extensions -fms-compatibility -ffreestanding %s -triple=x86_64-windows-msvc -target-feature +avx -emit-llvm -o - -Wall -Werror | FileCheck %s --check-prefixes=CHECK + +// RUN: %clang_cc1 -x c -flax-vector-conversions=none -ffreestanding %s -triple=x86_64-apple-darwin -target-feature +avx -emit-llvm -o - -Wall -Werror -fexperimental-new-constant-interpreter | FileCheck %s --check-prefixes=CHECK +// RUN: %clang_cc1 -x c -flax-vector-conversions=none -ffreestanding %s -triple=x86_64-apple-darwin -target-feature +avx -fno-signed-char -emit-llvm -o - -Wall -Werror -fexperimental-new-constant-interpreter | FileCheck %s --check-prefixes=CHECK +// RUN: %clang_cc1 -x c -flax-vector-conversions=none -ffreestanding %s -triple=i386-apple-darwin -target-feature +avx -emit-llvm -o - -Wall -Werror -fexperimental-new-constant-interpreter | FileCheck %s --check-prefixes=CHECK +// RUN: %clang_cc1 -x c -flax-vector-conversions=none -ffreestanding %s -triple=i386-apple-darwin -target-feature +avx -fno-signed-char -emit-llvm -o - -Wall -Werror -fexperimental-new-constant-interpreter | FileCheck %s --check-prefixes=CHECK +// RUN: %clang_cc1 -x c -flax-vector-conversions=none -fms-extensions -fms-compatibility -ffreestanding %s -triple=x86_64-windows-msvc -target-feature +avx -emit-llvm -o - -Wall -Werror -fexperimental-new-constant-interpreter | FileCheck %s --check-prefixes=CHECK +// RUN: %clang_cc1 -x c++ -flax-vector-conversions=none -ffreestanding %s -triple=x86_64-apple-darwin -target-feature +avx -emit-llvm -o - -Wall -Werror -fexperimental-new-constant-interpreter | FileCheck %s --check-prefixes=CHECK +// RUN: %clang_cc1 -x c++ -flax-vector-conversions=none -ffreestanding %s -triple=x86_64-apple-darwin -target-feature +avx -fno-signed-char -emit-llvm -o - -Wall -Werror -fexperimental-new-constant-interpreter | FileCheck %s --check-prefixes=CHECK +// RUN: %clang_cc1 -x c++ -flax-vector-conversions=none -ffreestanding %s -triple=i386-apple-darwin -target-feature +avx -emit-llvm -o - -Wall -Werror -fexperimental-new-constant-interpreter | FileCheck %s --check-prefixes=CHECK +// RUN: %clang_cc1 -x c++ -flax-vector-conversions=none -ffreestanding %s -triple=i386-apple-darwin -target-feature +avx -fno-signed-char -emit-llvm -o - -Wall -Werror -fexperimental-new-constant-interpreter | FileCheck %s --check-prefixes=CHECK +// RUN: %clang_cc1 -x c++ -flax-vector-conversions=none -fms-extensions -fms-compatibility -ffreestanding %s -triple=x86_64-windows-msvc -target-feature +avx -emit-llvm -o - -Wall -Werror -fexperimental-new-constant-interpreter | FileCheck %s --check-prefixes=CHECK + +#include <immintrin.h> +#include "builtin_test_helpers.h" + +__m256d test_mm256_round_pd(__m256d x) { + // CHECK-LABEL: test_mm256_round_pd + // CHECK: %{{.*}} = call <4 x double> @llvm.experimental.constrained.nearbyint.v4f64(<4 x double> %{{.*}}, metadata !"round.dynamic", metadata !"fpexcept.ignore") + return _mm256_round_pd(x, 8); +} + +__m256 test_mm256_round_ps(__m256 x) { + // CHECK-LABEL: test_mm256_round_ps + // CHECK: %{{.*}} = call <8 x float> @llvm.experimental.constrained.nearbyint.v8f32(<8 x float> %{{.*}}, metadata !"round.dynamic", metadata !"fpexcept.ignore") + return _mm256_round_ps(x, 8); +} \ No newline at end of file diff --git a/clang/test/CodeGen/X86/sse41-builtins-constrained.c b/clang/test/CodeGen/X86/sse41-builtins-constrained.c new file mode 100644 index 0000000000000..ed6c95d37872f --- /dev/null +++ b/clang/test/CodeGen/X86/sse41-builtins-constrained.c @@ -0,0 +1,49 @@ +// RUN: %clang_cc1 -x c -flax-vector-conversions=none -ffreestanding %s -triple=x86_64-apple-darwin -target-feature +sse4.1 -emit-llvm -o - -Wall -Werror | FileCheck %s --check-prefixes=CHECK +// RUN: %clang_cc1 -x c -flax-vector-conversions=none -ffreestanding %s -triple=x86_64-apple-darwin -target-feature +sse4.1 -fno-signed-char -emit-llvm -o - -Wall -Werror | FileCheck %s --check-prefixes=CHECK +// RUN: %clang_cc1 -x c -flax-vector-conversions=none -ffreestanding %s -triple=i386-apple-darwin -target-feature +sse4.1 -emit-llvm -o - -Wall -Werror | FileCheck %s --check-prefixes=CHECK +// RUN: %clang_cc1 -x c -flax-vector-conversions=none -ffreestanding %s -triple=i386-apple-darwin -target-feature +sse4.1 -fno-signed-char -emit-llvm -o - -Wall -Werror | FileCheck %s --check-prefixes=CHECK +// RUN: %clang_cc1 -x c++ -flax-vector-conversions=none -ffreestanding %s -triple=x86_64-apple-darwin -target-feature +sse4.1 -emit-llvm -o - -Wall -Werror | FileCheck %s --check-prefixes=CHECK +// RUN: %clang_cc1 -x c++ -flax-vector-conversions=none -ffreestanding %s -triple=x86_64-apple-darwin -target-feature +sse4.1 -fno-signed-char -emit-llvm -o - -Wall -Werror | FileCheck %s --check-prefixes=CHECK +// RUN: %clang_cc1 -x c++ -flax-vector-conversions=none -ffreestanding %s -triple=i386-apple-darwin -target-feature +sse4.1 -emit-llvm -o - -Wall -Werror | FileCheck %s --check-prefixes=CHECK +// RUN: %clang_cc1 -x c++ -flax-vector-conversions=none -ffreestanding %s -triple=i386-apple-darwin -target-feature +sse4.1 -fno-signed-char -emit-llvm -o - -Wall -Werror | FileCheck %s --check-prefixes=CHECK + +// RUN: %clang_cc1 -x c -flax-vector-conversions=none -ffreestanding %s -triple=x86_64-apple-darwin -target-feature +sse4.1 -emit-llvm -o - -Wall -Werror -fexperimental-new-constant-interpreter | FileCheck %s --check-prefixes=CHECK +// RUN: %clang_cc1 -x c -flax-vector-conversions=none -ffreestanding %s -triple=x86_64-apple-darwin -target-feature +sse4.1 -fno-signed-char -emit-llvm -o - -Wall -Werror -fexperimental-new-constant-interpreter | FileCheck %s --check-prefixes=CHECK +// RUN: %clang_cc1 -x c -flax-vector-conversions=none -ffreestanding %s -triple=i386-apple-darwin -target-feature +sse4.1 -emit-llvm -o - -Wall -Werror -fexperimental-new-constant-interpreter | FileCheck %s --check-prefixes=CHECK +// RUN: %clang_cc1 -x c -flax-vector-conversions=none -ffreestanding %s -triple=i386-apple-darwin -target-feature +sse4.1 -fno-signed-char -emit-llvm -o - -Wall -Werror -fexperimental-new-constant-interpreter | FileCheck %s --check-prefixes=CHECK +// RUN: %clang_cc1 -x c++ -flax-vector-conversions=none -ffreestanding %s -triple=x86_64-apple-darwin -target-feature +sse4.1 -emit-llvm -o - -Wall -Werror -fexperimental-new-constant-interpreter | FileCheck %s --check-prefixes=CHECK +// RUN: %clang_cc1 -x c++ -flax-vector-conversions=none -ffreestanding %s -triple=x86_64-apple-darwin -target-feature +sse4.1 -fno-signed-char -emit-llvm -o - -Wall -Werror -fexperimental-new-constant-interpreter | FileCheck %s --check-prefixes=CHECK +// RUN: %clang_cc1 -x c++ -flax-vector-conversions=none -ffreestanding %s -triple=i386-apple-darwin -target-feature +sse4.1 -emit-llvm -o - -Wall -Werror -fexperimental-new-constant-interpreter | FileCheck %s --check-prefixes=CHECK +// RUN: %clang_cc1 -x c++ -flax-vector-conversions=none -ffreestanding %s -triple=i386-apple-darwin -target-feature +sse4.1 -fno-signed-char -emit-llvm -o - -Wall -Werror -fexperimental-new-constant-interpreter | FileCheck %s --check-prefixes=CHECK + + +#include <immintrin.h> +#include "builtin_test_helpers.h" + +__m128d test_mm_round_pd(__m128d x) { + // CHECK-LABEL: test_mm_round_pd + // CHECK: %{{.*}} = call <2 x double> @llvm.experimental.constrained.nearbyint.v2f64(<2 x double> %{{.*}}, metadata !"round.dynamic", metadata !"fpexcept.ignore") + return _mm_round_pd(x, 8); +} + +__m128 test_mm_round_ps(__m128 x) { + // CHECK-LABEL: test_mm_round_ps + // CHECK: %{{.*}} = call <4 x float> @llvm.experimental.constrained.nearbyint.v4f32(<4 x float> %{{.*}}, metadata !"round.dynamic", metadata !"fpexcept.ignore") + return _mm_round_ps(x, 8); +} + +__m128d test_mm_round_sd(__m128d x, __m128d y) { + // CHECK-LABEL: test_mm_round_sd + // CHECK: %[[A:.*]] = extractelement <2 x double> %{{.*}}, i32 0 + // CHECK: %[[B:.*]] = call double @llvm.experimental.constrained.nearbyint.f64(double %[[A:.*]], metadata !"round.dynamic", metadata !"fpexcept.ignore") + // CHECK: %{{.*}} = insertelement <2 x double> %0, double %[[B:.*]], i32 0 + return _mm_round_sd(x, y, 8); +} + +__m128 test_mm_round_ss(__m128 x, __m128 y) { + // CHECK-LABEL: test_mm_round_ss + // CHECK: %[[A:.*]] = extractelement <4 x float> %{{.*}}, i32 0 + // CHECK: %[[B:.*]] = call float @llvm.experimental.constrained.nearbyint.f32(float %[[A:.*]], metadata !"round.dynamic", metadata !"fpexcept.ignore") + // CHECK: %{{.*}} = insertelement <4 x float> %0, float %[[B:.*]], i32 0 + return _mm_round_ss(x, y, 8); +} \ No newline at end of file >From 0cc45b674d42d84586f54b4b0d0f65dd7c17abff Mon Sep 17 00:00:00 2001 From: stomfaig <[email protected]> Date: Thu, 11 Dec 2025 23:02:21 +0000 Subject: [PATCH 14/19] format --- llvm/lib/Target/X86/X86InstrSSE.td | 18 +++++++++++------- 1 file changed, 11 insertions(+), 7 deletions(-) diff --git a/llvm/lib/Target/X86/X86InstrSSE.td b/llvm/lib/Target/X86/X86InstrSSE.td index 24d19b40ad41d..d4996c27dbe8d 100644 --- a/llvm/lib/Target/X86/X86InstrSSE.td +++ b/llvm/lib/Target/X86/X86InstrSSE.td @@ -5708,18 +5708,22 @@ let Predicates = [UseSSE41, OptForSize] in { } multiclass test<SDPatternOperator OpNode, string OpcPrefix, SDNode Move, - ValueType VT, Predicate BasePredicate> { + ValueType VT, Predicate BasePredicate> { let Predicates = [BasePredicate] in { - def : Pat<(VT (Move VT:$dst, (scalar_to_vector - (OpNode (extractelt VT:$src, (i64 0)), i32:$imm)))), - (!cast<Instruction>(OpcPrefix#ri_Int) VT:$dst, VT:$src, i32:$imm)>; + def : Pat<(VT(Move VT:$dst, (scalar_to_vector(OpNode + (extractelt VT:$src, (i64 0)), + i32:$imm)))), + (!cast<Instruction>(OpcPrefix#ri_Int) VT:$dst, VT:$src, + i32:$imm)>; } // Repeat for AVX versions of the instructions. let Predicates = [UseAVX] in { - def : Pat<(VT (Move VT:$dst, (scalar_to_vector - (OpNode (extractelt VT:$src, (i64 0)), i32:$imm)))), - (!cast<Instruction>("V"#OpcPrefix#ri_Int) VT:$dst, VT:$src, i32:$imm)>; + def : Pat<(VT(Move VT:$dst, (scalar_to_vector(OpNode + (extractelt VT:$src, (i64 0)), + i32:$imm)))), + (!cast<Instruction>("V"#OpcPrefix#ri_Int) VT:$dst, VT:$src, + i32:$imm)>; } } defm : test<X86any_VRndScale, "ROUNDSS", X86Movss, v4f32, UseSSE41>; >From d69d36a8b50a3ea63433c59343b9cc2daadd127e Mon Sep 17 00:00:00 2001 From: stomfaig <[email protected]> Date: Thu, 11 Dec 2025 23:48:50 +0000 Subject: [PATCH 15/19] tests: fix vec_floor tests --- llvm/test/CodeGen/X86/vec_floor.ll | 12 ++++-------- 1 file changed, 4 insertions(+), 8 deletions(-) diff --git a/llvm/test/CodeGen/X86/vec_floor.ll b/llvm/test/CodeGen/X86/vec_floor.ll index 7f4ed3394d10d..2327036a8e1d2 100644 --- a/llvm/test/CodeGen/X86/vec_floor.ll +++ b/llvm/test/CodeGen/X86/vec_floor.ll @@ -821,8 +821,7 @@ define <4 x float> @const_trunc_v4f32() { define <4 x float> @floor_ss(<4 x float> %x, <4 x float> %y) nounwind { ; SSE41-LABEL: floor_ss: ; SSE41: ## %bb.0: -; SSE41-NEXT: roundss $9, %xmm0, %xmm0 -; SSE41-NEXT: blendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3] +; SSE41-NEXT: roundss $9, %xmm0, %xmm1 ; SSE41-NEXT: retq ; ; AVX-LABEL: floor_ss: @@ -846,8 +845,7 @@ declare float @llvm.floor.f32(float %s) define <2 x double> @floor_sd(<2 x double> %x, <2 x double> %y) nounwind { ; SSE41-LABEL: floor_sd: ; SSE41: ## %bb.0: -; SSE41-NEXT: roundsd $9, %xmm0, %xmm0 -; SSE41-NEXT: blendpd {{.*#+}} xmm0 = xmm0[0],xmm1[1] +; SSE41-NEXT: roundsd $9, %xmm0, %xmm1 ; SSE41-NEXT: retq ; ; AVX-LABEL: floor_sd: @@ -1811,8 +1809,7 @@ define <2 x double> @floor_maskz_sd_mask8(<2 x double> %x, <2 x double> %y) noun define <4 x float> @ceil_ss(<4 x float> %x, <4 x float> %y) nounwind { ; SSE41-LABEL: ceil_ss: ; SSE41: ## %bb.0: -; SSE41-NEXT: roundss $10, %xmm0, %xmm0 -; SSE41-NEXT: blendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3] +; SSE41-NEXT: roundss $10, %xmm0, %xmm1 ; SSE41-NEXT: retq ; ; AVX-LABEL: ceil_ss: @@ -1836,8 +1833,7 @@ declare float @llvm.ceil.f32(float %s) define <2 x double> @ceil_sd(<2 x double> %x, <2 x double> %y) nounwind { ; SSE41-LABEL: ceil_sd: ; SSE41: ## %bb.0: -; SSE41-NEXT: roundsd $10, %xmm0, %xmm0 -; SSE41-NEXT: blendpd {{.*#+}} xmm0 = xmm0[0],xmm1[1] +; SSE41-NEXT: roundsd $10, %xmm0, %xmm1 ; SSE41-NEXT: retq ; ; AVX-LABEL: ceil_sd: >From 7fc5bdbfcb6b244cdb0c6415cb21eef606156bb7 Mon Sep 17 00:00:00 2001 From: stomfaig <[email protected]> Date: Fri, 12 Dec 2025 09:58:16 +0000 Subject: [PATCH 16/19] remove rewrite pattern and associated test changes --- llvm/lib/Target/X86/X86InstrSSE.td | 22 ---------------------- llvm/test/CodeGen/X86/vec_floor.ll | 12 ++++++++---- 2 files changed, 8 insertions(+), 26 deletions(-) diff --git a/llvm/lib/Target/X86/X86InstrSSE.td b/llvm/lib/Target/X86/X86InstrSSE.td index d4996c27dbe8d..e4aaa1e1b594a 100644 --- a/llvm/lib/Target/X86/X86InstrSSE.td +++ b/llvm/lib/Target/X86/X86InstrSSE.td @@ -5707,28 +5707,6 @@ let Predicates = [UseSSE41, OptForSize] in { (ROUNDSDmi addr:$src1, timm:$src2)>; } -multiclass test<SDPatternOperator OpNode, string OpcPrefix, SDNode Move, - ValueType VT, Predicate BasePredicate> { - let Predicates = [BasePredicate] in { - def : Pat<(VT(Move VT:$dst, (scalar_to_vector(OpNode - (extractelt VT:$src, (i64 0)), - i32:$imm)))), - (!cast<Instruction>(OpcPrefix#ri_Int) VT:$dst, VT:$src, - i32:$imm)>; - } - - // Repeat for AVX versions of the instructions. - let Predicates = [UseAVX] in { - def : Pat<(VT(Move VT:$dst, (scalar_to_vector(OpNode - (extractelt VT:$src, (i64 0)), - i32:$imm)))), - (!cast<Instruction>("V"#OpcPrefix#ri_Int) VT:$dst, VT:$src, - i32:$imm)>; - } -} -defm : test<X86any_VRndScale, "ROUNDSS", X86Movss, v4f32, UseSSE41>; -defm : test<X86any_VRndScale, "ROUNDSD", X86Movsd, v2f64, UseSSE41>; - //===----------------------------------------------------------------------===// // SSE4.1 - Packed Bit Test //===----------------------------------------------------------------------===// diff --git a/llvm/test/CodeGen/X86/vec_floor.ll b/llvm/test/CodeGen/X86/vec_floor.ll index 2327036a8e1d2..7f4ed3394d10d 100644 --- a/llvm/test/CodeGen/X86/vec_floor.ll +++ b/llvm/test/CodeGen/X86/vec_floor.ll @@ -821,7 +821,8 @@ define <4 x float> @const_trunc_v4f32() { define <4 x float> @floor_ss(<4 x float> %x, <4 x float> %y) nounwind { ; SSE41-LABEL: floor_ss: ; SSE41: ## %bb.0: -; SSE41-NEXT: roundss $9, %xmm0, %xmm1 +; SSE41-NEXT: roundss $9, %xmm0, %xmm0 +; SSE41-NEXT: blendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3] ; SSE41-NEXT: retq ; ; AVX-LABEL: floor_ss: @@ -845,7 +846,8 @@ declare float @llvm.floor.f32(float %s) define <2 x double> @floor_sd(<2 x double> %x, <2 x double> %y) nounwind { ; SSE41-LABEL: floor_sd: ; SSE41: ## %bb.0: -; SSE41-NEXT: roundsd $9, %xmm0, %xmm1 +; SSE41-NEXT: roundsd $9, %xmm0, %xmm0 +; SSE41-NEXT: blendpd {{.*#+}} xmm0 = xmm0[0],xmm1[1] ; SSE41-NEXT: retq ; ; AVX-LABEL: floor_sd: @@ -1809,7 +1811,8 @@ define <2 x double> @floor_maskz_sd_mask8(<2 x double> %x, <2 x double> %y) noun define <4 x float> @ceil_ss(<4 x float> %x, <4 x float> %y) nounwind { ; SSE41-LABEL: ceil_ss: ; SSE41: ## %bb.0: -; SSE41-NEXT: roundss $10, %xmm0, %xmm1 +; SSE41-NEXT: roundss $10, %xmm0, %xmm0 +; SSE41-NEXT: blendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3] ; SSE41-NEXT: retq ; ; AVX-LABEL: ceil_ss: @@ -1833,7 +1836,8 @@ declare float @llvm.ceil.f32(float %s) define <2 x double> @ceil_sd(<2 x double> %x, <2 x double> %y) nounwind { ; SSE41-LABEL: ceil_sd: ; SSE41: ## %bb.0: -; SSE41-NEXT: roundsd $10, %xmm0, %xmm1 +; SSE41-NEXT: roundsd $10, %xmm0, %xmm0 +; SSE41-NEXT: blendpd {{.*#+}} xmm0 = xmm0[0],xmm1[1] ; SSE41-NEXT: retq ; ; AVX-LABEL: ceil_sd: >From c06eeaede07dc357a35ec40d8469e817f9164bcc Mon Sep 17 00:00:00 2001 From: stomfaig <[email protected]> Date: Fri, 12 Dec 2025 11:10:44 +0000 Subject: [PATCH 17/19] tests: adjust tests for experimentals --- clang/lib/CodeGen/TargetBuiltins/X86.cpp | 34 +++++++-- .../CodeGen/X86/avx-builtins-constrainted.c | 59 +++++++++------ .../CodeGen/X86/sse41-builtins-constrained.c | 75 ++++++++++++------- 3 files changed, 113 insertions(+), 55 deletions(-) diff --git a/clang/lib/CodeGen/TargetBuiltins/X86.cpp b/clang/lib/CodeGen/TargetBuiltins/X86.cpp index 8b804ab561e09..a4bd8fb900d8d 100644 --- a/clang/lib/CodeGen/TargetBuiltins/X86.cpp +++ b/clang/lib/CodeGen/TargetBuiltins/X86.cpp @@ -86,18 +86,40 @@ static Value *emitX86Round(CodeGenFunction &CGF, Value *X, Intrinsic::ID ID = Intrinsic::not_intrinsic; LLVMContext &Ctx = CGF.CGM.getLLVMContext(); + if (CGF.Builder.getIsFPConstrained()) { - if (UseMXCSR) { - ID = Intrinsic::experimental_constrained_nearbyint; + Value *ExceptMode = + MetadataAsValue::get(Ctx, MDString::get(Ctx, "fpexcept.ignore")); + + if (UseMXCSR) { + ID = Intrinsic::experimental_constrained_nearbyint; - Value *RoundingMode = + Value *RoundingMode = MetadataAsValue::get(Ctx, MDString::get(Ctx, "round.dynamic")); - Value *ExceptMode = - MetadataAsValue::get(Ctx, MDString::get(Ctx, "fpexcept.ignore")); + Function *F = CGF.CGM.getIntrinsic(ID, X->getType()); + return CGF.Builder.CreateCall(F, {X, RoundingMode, ExceptMode}); + } + + switch (RoundingMode) { + case 0b00: + ID = Intrinsic::experimental_constrained_roundeven; + break; + case 0b01: + ID = Intrinsic::experimental_constrained_floor; + break; + case 0b10: + ID = Intrinsic::experimental_constrained_ceil; + break; + case 0b11: + ID = Intrinsic::experimental_constrained_trunc; + break; + default: + llvm_unreachable("Invalid rounding mode"); + } Function *F = CGF.CGM.getIntrinsic(ID, X->getType()); - return CGF.Builder.CreateCall(F, {X, RoundingMode, ExceptMode}); + return CGF.Builder.CreateCall(F, {X, ExceptMode}); } switch (RoundingMode) { diff --git a/clang/test/CodeGen/X86/avx-builtins-constrainted.c b/clang/test/CodeGen/X86/avx-builtins-constrainted.c index cbd4060364139..459bbebbf7b95 100644 --- a/clang/test/CodeGen/X86/avx-builtins-constrainted.c +++ b/clang/test/CodeGen/X86/avx-builtins-constrainted.c @@ -1,36 +1,47 @@ -// RUN: %clang_cc1 -x c -flax-vector-conversions=none -ffreestanding %s -triple=x86_64-apple-darwin -target-feature +avx -emit-llvm -o - -Wall -Werror | FileCheck %s --check-prefixes=CHECK -// RUN: %clang_cc1 -x c -flax-vector-conversions=none -ffreestanding %s -triple=x86_64-apple-darwin -target-feature +avx -fno-signed-char -emit-llvm -o - -Wall -Werror | FileCheck %s --check-prefixes=CHECK -// RUN: %clang_cc1 -x c -flax-vector-conversions=none -ffreestanding %s -triple=i386-apple-darwin -target-feature +avx -emit-llvm -o - -Wall -Werror | FileCheck %s --check-prefixes=CHECK -// RUN: %clang_cc1 -x c -flax-vector-conversions=none -ffreestanding %s -triple=i386-apple-darwin -target-feature +avx -fno-signed-char -emit-llvm -o - -Wall -Werror | FileCheck %s --check-prefixes=CHECK -// RUN: %clang_cc1 -x c -flax-vector-conversions=none -fms-extensions -fms-compatibility -ffreestanding %s -triple=x86_64-windows-msvc -target-feature +avx -emit-llvm -o - -Wall -Werror | FileCheck %s --check-prefixes=CHECK -// RUN: %clang_cc1 -x c++ -flax-vector-conversions=none -ffreestanding %s -triple=x86_64-apple-darwin -target-feature +avx -emit-llvm -o - -Wall -Werror | FileCheck %s --check-prefixes=CHECK -// RUN: %clang_cc1 -x c++ -flax-vector-conversions=none -ffreestanding %s -triple=x86_64-apple-darwin -target-feature +avx -fno-signed-char -emit-llvm -o - -Wall -Werror | FileCheck %s --check-prefixes=CHECK -// RUN: %clang_cc1 -x c++ -flax-vector-conversions=none -ffreestanding %s -triple=i386-apple-darwin -target-feature +avx -emit-llvm -o - -Wall -Werror | FileCheck %s --check-prefixes=CHECK -// RUN: %clang_cc1 -x c++ -flax-vector-conversions=none -ffreestanding %s -triple=i386-apple-darwin -target-feature +avx -fno-signed-char -emit-llvm -o - -Wall -Werror | FileCheck %s --check-prefixes=CHECK -// RUN: %clang_cc1 -x c++ -flax-vector-conversions=none -fms-extensions -fms-compatibility -ffreestanding %s -triple=x86_64-windows-msvc -target-feature +avx -emit-llvm -o - -Wall -Werror | FileCheck %s --check-prefixes=CHECK +// RUN: %clang_cc1 -x c -frounding-math -flax-vector-conversions=none -ffreestanding %s -triple=x86_64-apple-darwin -target-feature +avx -emit-llvm -o - -Wall -Werror | FileCheck %s --check-prefixes=CHECK +// RUN: %clang_cc1 -x c -frounding-math -flax-vector-conversions=none -ffreestanding %s -triple=x86_64-apple-darwin -target-feature +avx -fno-signed-char -emit-llvm -o - -Wall -Werror | FileCheck %s --check-prefixes=CHECK +// RUN: %clang_cc1 -x c -frounding-math -flax-vector-conversions=none -ffreestanding %s -triple=i386-apple-darwin -target-feature +avx -emit-llvm -o - -Wall -Werror | FileCheck %s --check-prefixes=CHECK +// RUN: %clang_cc1 -x c -frounding-math -flax-vector-conversions=none -ffreestanding %s -triple=i386-apple-darwin -target-feature +avx -fno-signed-char -emit-llvm -o - -Wall -Werror | FileCheck %s --check-prefixes=CHECK +// RUN: %clang_cc1 -x c -frounding-math -flax-vector-conversions=none -fms-extensions -fms-compatibility -ffreestanding %s -triple=x86_64-windows-msvc -target-feature +avx -emit-llvm -o - -Wall -Werror | FileCheck %s --check-prefixes=CHECK +// RUN: %clang_cc1 -x c++ -frounding-math -flax-vector-conversions=none -ffreestanding %s -triple=x86_64-apple-darwin -target-feature +avx -emit-llvm -o - -Wall -Werror | FileCheck %s --check-prefixes=CHECK +// RUN: %clang_cc1 -x c++ -frounding-math -flax-vector-conversions=none -ffreestanding %s -triple=x86_64-apple-darwin -target-feature +avx -fno-signed-char -emit-llvm -o - -Wall -Werror | FileCheck %s --check-prefixes=CHECK +// RUN: %clang_cc1 -x c++ -frounding-math -flax-vector-conversions=none -ffreestanding %s -triple=i386-apple-darwin -target-feature +avx -emit-llvm -o - -Wall -Werror | FileCheck %s --check-prefixes=CHECK +// RUN: %clang_cc1 -x c++ -frounding-math -flax-vector-conversions=none -ffreestanding %s -triple=i386-apple-darwin -target-feature +avx -fno-signed-char -emit-llvm -o - -Wall -Werror | FileCheck %s --check-prefixes=CHECK +// RUN: %clang_cc1 -x c++ -frounding-math -flax-vector-conversions=none -fms-extensions -fms-compatibility -ffreestanding %s -triple=x86_64-windows-msvc -target-feature +avx -emit-llvm -o - -Wall -Werror | FileCheck %s --check-prefixes=CHECK -// RUN: %clang_cc1 -x c -flax-vector-conversions=none -ffreestanding %s -triple=x86_64-apple-darwin -target-feature +avx -emit-llvm -o - -Wall -Werror -fexperimental-new-constant-interpreter | FileCheck %s --check-prefixes=CHECK -// RUN: %clang_cc1 -x c -flax-vector-conversions=none -ffreestanding %s -triple=x86_64-apple-darwin -target-feature +avx -fno-signed-char -emit-llvm -o - -Wall -Werror -fexperimental-new-constant-interpreter | FileCheck %s --check-prefixes=CHECK -// RUN: %clang_cc1 -x c -flax-vector-conversions=none -ffreestanding %s -triple=i386-apple-darwin -target-feature +avx -emit-llvm -o - -Wall -Werror -fexperimental-new-constant-interpreter | FileCheck %s --check-prefixes=CHECK -// RUN: %clang_cc1 -x c -flax-vector-conversions=none -ffreestanding %s -triple=i386-apple-darwin -target-feature +avx -fno-signed-char -emit-llvm -o - -Wall -Werror -fexperimental-new-constant-interpreter | FileCheck %s --check-prefixes=CHECK -// RUN: %clang_cc1 -x c -flax-vector-conversions=none -fms-extensions -fms-compatibility -ffreestanding %s -triple=x86_64-windows-msvc -target-feature +avx -emit-llvm -o - -Wall -Werror -fexperimental-new-constant-interpreter | FileCheck %s --check-prefixes=CHECK -// RUN: %clang_cc1 -x c++ -flax-vector-conversions=none -ffreestanding %s -triple=x86_64-apple-darwin -target-feature +avx -emit-llvm -o - -Wall -Werror -fexperimental-new-constant-interpreter | FileCheck %s --check-prefixes=CHECK -// RUN: %clang_cc1 -x c++ -flax-vector-conversions=none -ffreestanding %s -triple=x86_64-apple-darwin -target-feature +avx -fno-signed-char -emit-llvm -o - -Wall -Werror -fexperimental-new-constant-interpreter | FileCheck %s --check-prefixes=CHECK -// RUN: %clang_cc1 -x c++ -flax-vector-conversions=none -ffreestanding %s -triple=i386-apple-darwin -target-feature +avx -emit-llvm -o - -Wall -Werror -fexperimental-new-constant-interpreter | FileCheck %s --check-prefixes=CHECK -// RUN: %clang_cc1 -x c++ -flax-vector-conversions=none -ffreestanding %s -triple=i386-apple-darwin -target-feature +avx -fno-signed-char -emit-llvm -o - -Wall -Werror -fexperimental-new-constant-interpreter | FileCheck %s --check-prefixes=CHECK -// RUN: %clang_cc1 -x c++ -flax-vector-conversions=none -fms-extensions -fms-compatibility -ffreestanding %s -triple=x86_64-windows-msvc -target-feature +avx -emit-llvm -o - -Wall -Werror -fexperimental-new-constant-interpreter | FileCheck %s --check-prefixes=CHECK +// RUN: %clang_cc1 -x c -frounding-math -flax-vector-conversions=none -ffreestanding %s -triple=x86_64-apple-darwin -target-feature +avx -emit-llvm -o - -Wall -Werror -fexperimental-new-constant-interpreter | FileCheck %s --check-prefixes=CHECK +// RUN: %clang_cc1 -x c -frounding-math -flax-vector-conversions=none -ffreestanding %s -triple=x86_64-apple-darwin -target-feature +avx -fno-signed-char -emit-llvm -o - -Wall -Werror -fexperimental-new-constant-interpreter | FileCheck %s --check-prefixes=CHECK +// RUN: %clang_cc1 -x c -frounding-math -flax-vector-conversions=none -ffreestanding %s -triple=i386-apple-darwin -target-feature +avx -emit-llvm -o - -Wall -Werror -fexperimental-new-constant-interpreter | FileCheck %s --check-prefixes=CHECK +// RUN: %clang_cc1 -x c -frounding-math -flax-vector-conversions=none -ffreestanding %s -triple=i386-apple-darwin -target-feature +avx -fno-signed-char -emit-llvm -o - -Wall -Werror -fexperimental-new-constant-interpreter | FileCheck %s --check-prefixes=CHECK +// RUN: %clang_cc1 -x c -frounding-math -flax-vector-conversions=none -fms-extensions -fms-compatibility -ffreestanding %s -triple=x86_64-windows-msvc -target-feature +avx -emit-llvm -o - -Wall -Werror -fexperimental-new-constant-interpreter | FileCheck %s --check-prefixes=CHECK +// RUN: %clang_cc1 -x c++ -frounding-math -flax-vector-conversions=none -ffreestanding %s -triple=x86_64-apple-darwin -target-feature +avx -emit-llvm -o - -Wall -Werror -fexperimental-new-constant-interpreter | FileCheck %s --check-prefixes=CHECK +// RUN: %clang_cc1 -x c++ -frounding-math -flax-vector-conversions=none -ffreestanding %s -triple=x86_64-apple-darwin -target-feature +avx -fno-signed-char -emit-llvm -o - -Wall -Werror -fexperimental-new-constant-interpreter | FileCheck %s --check-prefixes=CHECK +// RUN: %clang_cc1 -x c++ -frounding-math -flax-vector-conversions=none -ffreestanding %s -triple=i386-apple-darwin -target-feature +avx -emit-llvm -o - -Wall -Werror -fexperimental-new-constant-interpreter | FileCheck %s --check-prefixes=CHECK +// RUN: %clang_cc1 -x c++ -frounding-math -flax-vector-conversions=none -ffreestanding %s -triple=i386-apple-darwin -target-feature +avx -fno-signed-char -emit-llvm -o - -Wall -Werror -fexperimental-new-constant-interpreter | FileCheck %s --check-prefixes=CHECK +// RUN: %clang_cc1 -x c++ -frounding-math -flax-vector-conversions=none -fms-extensions -fms-compatibility -ffreestanding %s -triple=x86_64-windows-msvc -target-feature +avx -emit-llvm -o - -Wall -Werror -fexperimental-new-constant-interpreter | FileCheck %s --check-prefixes=CHECK #include <immintrin.h> #include "builtin_test_helpers.h" -__m256d test_mm256_round_pd(__m256d x) { - // CHECK-LABEL: test_mm256_round_pd +__m256d test_mm256_round_pd1(__m256d x) { + // CHECK-LABEL: test_mm256_round_pd1 // CHECK: %{{.*}} = call <4 x double> @llvm.experimental.constrained.nearbyint.v4f64(<4 x double> %{{.*}}, metadata !"round.dynamic", metadata !"fpexcept.ignore") return _mm256_round_pd(x, 8); } -__m256 test_mm256_round_ps(__m256 x) { - // CHECK-LABEL: test_mm256_round_ps +__m256d test_mm256_round_pd2(__m256d x) { + // CHECK-LABEL: test_mm256_round_pd2 + // CHECK: %{{.*}} = call <4 x double> @llvm.experimental.constrained.trunc.v4f64(<4 x double> %{{.*}}, metadata !"fpexcept.ignore") + return _mm256_round_pd(x, 3); +} + +__m256 test_mm256_round_ps1(__m256 x) { + // CHECK-LABEL: test_mm256_round_ps1 // CHECK: %{{.*}} = call <8 x float> @llvm.experimental.constrained.nearbyint.v8f32(<8 x float> %{{.*}}, metadata !"round.dynamic", metadata !"fpexcept.ignore") return _mm256_round_ps(x, 8); +} +__m256 test_mm256_round_ps2(__m256 x) { + // CHECK-LABEL: test_mm256_round_ps2 + // CHECK: %{{.*}} = call <8 x float> @llvm.experimental.constrained.trunc.v8f32(<8 x float> %{{.*}}, metadata !"fpexcept.ignore") + return _mm256_round_ps(x, 3); } \ No newline at end of file diff --git a/clang/test/CodeGen/X86/sse41-builtins-constrained.c b/clang/test/CodeGen/X86/sse41-builtins-constrained.c index ed6c95d37872f..94ff706ec8835 100644 --- a/clang/test/CodeGen/X86/sse41-builtins-constrained.c +++ b/clang/test/CodeGen/X86/sse41-builtins-constrained.c @@ -1,49 +1,74 @@ -// RUN: %clang_cc1 -x c -flax-vector-conversions=none -ffreestanding %s -triple=x86_64-apple-darwin -target-feature +sse4.1 -emit-llvm -o - -Wall -Werror | FileCheck %s --check-prefixes=CHECK -// RUN: %clang_cc1 -x c -flax-vector-conversions=none -ffreestanding %s -triple=x86_64-apple-darwin -target-feature +sse4.1 -fno-signed-char -emit-llvm -o - -Wall -Werror | FileCheck %s --check-prefixes=CHECK -// RUN: %clang_cc1 -x c -flax-vector-conversions=none -ffreestanding %s -triple=i386-apple-darwin -target-feature +sse4.1 -emit-llvm -o - -Wall -Werror | FileCheck %s --check-prefixes=CHECK -// RUN: %clang_cc1 -x c -flax-vector-conversions=none -ffreestanding %s -triple=i386-apple-darwin -target-feature +sse4.1 -fno-signed-char -emit-llvm -o - -Wall -Werror | FileCheck %s --check-prefixes=CHECK -// RUN: %clang_cc1 -x c++ -flax-vector-conversions=none -ffreestanding %s -triple=x86_64-apple-darwin -target-feature +sse4.1 -emit-llvm -o - -Wall -Werror | FileCheck %s --check-prefixes=CHECK -// RUN: %clang_cc1 -x c++ -flax-vector-conversions=none -ffreestanding %s -triple=x86_64-apple-darwin -target-feature +sse4.1 -fno-signed-char -emit-llvm -o - -Wall -Werror | FileCheck %s --check-prefixes=CHECK -// RUN: %clang_cc1 -x c++ -flax-vector-conversions=none -ffreestanding %s -triple=i386-apple-darwin -target-feature +sse4.1 -emit-llvm -o - -Wall -Werror | FileCheck %s --check-prefixes=CHECK -// RUN: %clang_cc1 -x c++ -flax-vector-conversions=none -ffreestanding %s -triple=i386-apple-darwin -target-feature +sse4.1 -fno-signed-char -emit-llvm -o - -Wall -Werror | FileCheck %s --check-prefixes=CHECK - -// RUN: %clang_cc1 -x c -flax-vector-conversions=none -ffreestanding %s -triple=x86_64-apple-darwin -target-feature +sse4.1 -emit-llvm -o - -Wall -Werror -fexperimental-new-constant-interpreter | FileCheck %s --check-prefixes=CHECK -// RUN: %clang_cc1 -x c -flax-vector-conversions=none -ffreestanding %s -triple=x86_64-apple-darwin -target-feature +sse4.1 -fno-signed-char -emit-llvm -o - -Wall -Werror -fexperimental-new-constant-interpreter | FileCheck %s --check-prefixes=CHECK -// RUN: %clang_cc1 -x c -flax-vector-conversions=none -ffreestanding %s -triple=i386-apple-darwin -target-feature +sse4.1 -emit-llvm -o - -Wall -Werror -fexperimental-new-constant-interpreter | FileCheck %s --check-prefixes=CHECK -// RUN: %clang_cc1 -x c -flax-vector-conversions=none -ffreestanding %s -triple=i386-apple-darwin -target-feature +sse4.1 -fno-signed-char -emit-llvm -o - -Wall -Werror -fexperimental-new-constant-interpreter | FileCheck %s --check-prefixes=CHECK -// RUN: %clang_cc1 -x c++ -flax-vector-conversions=none -ffreestanding %s -triple=x86_64-apple-darwin -target-feature +sse4.1 -emit-llvm -o - -Wall -Werror -fexperimental-new-constant-interpreter | FileCheck %s --check-prefixes=CHECK -// RUN: %clang_cc1 -x c++ -flax-vector-conversions=none -ffreestanding %s -triple=x86_64-apple-darwin -target-feature +sse4.1 -fno-signed-char -emit-llvm -o - -Wall -Werror -fexperimental-new-constant-interpreter | FileCheck %s --check-prefixes=CHECK -// RUN: %clang_cc1 -x c++ -flax-vector-conversions=none -ffreestanding %s -triple=i386-apple-darwin -target-feature +sse4.1 -emit-llvm -o - -Wall -Werror -fexperimental-new-constant-interpreter | FileCheck %s --check-prefixes=CHECK -// RUN: %clang_cc1 -x c++ -flax-vector-conversions=none -ffreestanding %s -triple=i386-apple-darwin -target-feature +sse4.1 -fno-signed-char -emit-llvm -o - -Wall -Werror -fexperimental-new-constant-interpreter | FileCheck %s --check-prefixes=CHECK +// RUN: %clang_cc1 -x c -frounding-math -flax-vector-conversions=none -ffreestanding %s -triple=x86_64-apple-darwin -target-feature +sse4.1 -emit-llvm -o - -Wall -Werror | FileCheck %s --check-prefixes=CHECK +// RUN: %clang_cc1 -x c -frounding-math -flax-vector-conversions=none -ffreestanding %s -triple=x86_64-apple-darwin -target-feature +sse4.1 -fno-signed-char -emit-llvm -o - -Wall -Werror | FileCheck %s --check-prefixes=CHECK +// RUN: %clang_cc1 -x c -frounding-math -flax-vector-conversions=none -ffreestanding %s -triple=i386-apple-darwin -target-feature +sse4.1 -emit-llvm -o - -Wall -Werror | FileCheck %s --check-prefixes=CHECK +// RUN: %clang_cc1 -x c -frounding-math -flax-vector-conversions=none -ffreestanding %s -triple=i386-apple-darwin -target-feature +sse4.1 -fno-signed-char -emit-llvm -o - -Wall -Werror | FileCheck %s --check-prefixes=CHECK +// RUN: %clang_cc1 -x c++ -frounding-math -flax-vector-conversions=none -ffreestanding %s -triple=x86_64-apple-darwin -target-feature +sse4.1 -emit-llvm -o - -Wall -Werror | FileCheck %s --check-prefixes=CHECK +// RUN: %clang_cc1 -x c++ -frounding-math -flax-vector-conversions=none -ffreestanding %s -triple=x86_64-apple-darwin -target-feature +sse4.1 -fno-signed-char -emit-llvm -o - -Wall -Werror | FileCheck %s --check-prefixes=CHECK +// RUN: %clang_cc1 -x c++ -frounding-math -flax-vector-conversions=none -ffreestanding %s -triple=i386-apple-darwin -target-feature +sse4.1 -emit-llvm -o - -Wall -Werror | FileCheck %s --check-prefixes=CHECK +// RUN: %clang_cc1 -x c++ -frounding-math -flax-vector-conversions=none -ffreestanding %s -triple=i386-apple-darwin -target-feature +sse4.1 -fno-signed-char -emit-llvm -o - -Wall -Werror | FileCheck %s --check-prefixes=CHECK + +// RUN: %clang_cc1 -x c -frounding-math -flax-vector-conversions=none -ffreestanding %s -triple=x86_64-apple-darwin -target-feature +sse4.1 -emit-llvm -o - -Wall -Werror -fexperimental-new-constant-interpreter | FileCheck %s --check-prefixes=CHECK +// RUN: %clang_cc1 -x c -frounding-math -flax-vector-conversions=none -ffreestanding %s -triple=x86_64-apple-darwin -target-feature +sse4.1 -fno-signed-char -emit-llvm -o - -Wall -Werror -fexperimental-new-constant-interpreter | FileCheck %s --check-prefixes=CHECK +// RUN: %clang_cc1 -x c -frounding-math -flax-vector-conversions=none -ffreestanding %s -triple=i386-apple-darwin -target-feature +sse4.1 -emit-llvm -o - -Wall -Werror -fexperimental-new-constant-interpreter | FileCheck %s --check-prefixes=CHECK +// RUN: %clang_cc1 -x c -frounding-math -flax-vector-conversions=none -ffreestanding %s -triple=i386-apple-darwin -target-feature +sse4.1 -fno-signed-char -emit-llvm -o - -Wall -Werror -fexperimental-new-constant-interpreter | FileCheck %s --check-prefixes=CHECK +// RUN: %clang_cc1 -x c++ -frounding-math -flax-vector-conversions=none -ffreestanding %s -triple=x86_64-apple-darwin -target-feature +sse4.1 -emit-llvm -o - -Wall -Werror -fexperimental-new-constant-interpreter | FileCheck %s --check-prefixes=CHECK +// RUN: %clang_cc1 -x c++ -frounding-math -flax-vector-conversions=none -ffreestanding %s -triple=x86_64-apple-darwin -target-feature +sse4.1 -fno-signed-char -emit-llvm -o - -Wall -Werror -fexperimental-new-constant-interpreter | FileCheck %s --check-prefixes=CHECK +// RUN: %clang_cc1 -x c++ -frounding-math -flax-vector-conversions=none -ffreestanding %s -triple=i386-apple-darwin -target-feature +sse4.1 -emit-llvm -o - -Wall -Werror -fexperimental-new-constant-interpreter | FileCheck %s --check-prefixes=CHECK +// RUN: %clang_cc1 -x c++ -frounding-math -flax-vector-conversions=none -ffreestanding %s -triple=i386-apple-darwin -target-feature +sse4.1 -fno-signed-char -emit-llvm -o - -Wall -Werror -fexperimental-new-constant-interpreter | FileCheck %s --check-prefixes=CHECK #include <immintrin.h> #include "builtin_test_helpers.h" -__m128d test_mm_round_pd(__m128d x) { - // CHECK-LABEL: test_mm_round_pd +__m128d test_mm_round_pd1(__m128d x) { + // CHECK-LABEL: test_mm_round_pd1 // CHECK: %{{.*}} = call <2 x double> @llvm.experimental.constrained.nearbyint.v2f64(<2 x double> %{{.*}}, metadata !"round.dynamic", metadata !"fpexcept.ignore") return _mm_round_pd(x, 8); } +__m128d test_mm_round_pd2(__m128d x) { + // CHECK-LABEL: test_mm_round_pd2 + // CHECK: %{{.*}} = call <2 x double> @llvm.experimental.constrained.roundeven.v2f64(<2 x double> %{{.*}}, metadata !"fpexcept.ignore") + return _mm_round_pd(x, 0); +} -__m128 test_mm_round_ps(__m128 x) { - // CHECK-LABEL: test_mm_round_ps +__m128 test_mm_round_ps1(__m128 x) { + // CHECK-LABEL: test_mm_round_ps1 // CHECK: %{{.*}} = call <4 x float> @llvm.experimental.constrained.nearbyint.v4f32(<4 x float> %{{.*}}, metadata !"round.dynamic", metadata !"fpexcept.ignore") return _mm_round_ps(x, 8); } +__m128 test_mm_round_ps2(__m128 x) { + // CHECK-LABEL: test_mm_round_ps2 + // CHECK: %{{.*}} = call <4 x float> @llvm.experimental.constrained.floor.v4f32(<4 x float> %{{.*}}, metadata !"fpexcept.ignore") + return _mm_round_ps(x, 1); +} -__m128d test_mm_round_sd(__m128d x, __m128d y) { - // CHECK-LABEL: test_mm_round_sd +__m128d test_mm_round_sd1(__m128d x, __m128d y) { + // CHECK-LABEL: test_mm_round_sd1 // CHECK: %[[A:.*]] = extractelement <2 x double> %{{.*}}, i32 0 // CHECK: %[[B:.*]] = call double @llvm.experimental.constrained.nearbyint.f64(double %[[A:.*]], metadata !"round.dynamic", metadata !"fpexcept.ignore") // CHECK: %{{.*}} = insertelement <2 x double> %0, double %[[B:.*]], i32 0 return _mm_round_sd(x, y, 8); } +__m128d test_mm_round_sd2(__m128d x, __m128d y) { + // CHECK-LABEL: test_mm_round_sd2 + // CHECK: %[[A:.*]] = extractelement <2 x double> %{{.*}}, i32 0 + // CHECK: %[[B:.*]] = call double @llvm.experimental.constrained.ceil.f64(double %[[A:.*]], metadata !"fpexcept.ignore") + // CHECK: %{{.*}} = insertelement <2 x double> %0, double %[[B:.*]], i32 0 + return _mm_round_sd(x, y, 2); +} -__m128 test_mm_round_ss(__m128 x, __m128 y) { - // CHECK-LABEL: test_mm_round_ss +__m128 test_mm_round_ss1(__m128 x, __m128 y) { + // CHECK-LABEL: test_mm_round_ss1 // CHECK: %[[A:.*]] = extractelement <4 x float> %{{.*}}, i32 0 // CHECK: %[[B:.*]] = call float @llvm.experimental.constrained.nearbyint.f32(float %[[A:.*]], metadata !"round.dynamic", metadata !"fpexcept.ignore") // CHECK: %{{.*}} = insertelement <4 x float> %0, float %[[B:.*]], i32 0 return _mm_round_ss(x, y, 8); +} + +__m128 test_mm_round_ss2(__m128 x, __m128 y) { + // CHECK-LABEL: test_mm_round_ss2 + // CHECK: %[[A:.*]] = extractelement <4 x float> %{{.*}}, i32 0 + // CHECK: %[[B:.*]] = call float @llvm.experimental.constrained.trunc.f32(float %[[A:.*]], metadata !"fpexcept.ignore") + // CHECK: %{{.*}} = insertelement <4 x float> %0, float %[[B:.*]], i32 0 + return _mm_round_ss(x, y, 3); } \ No newline at end of file >From c7d0e7f87e23ed125dc791baac9695745f726233 Mon Sep 17 00:00:00 2001 From: stomfaig <[email protected]> Date: Fri, 12 Dec 2025 11:11:33 +0000 Subject: [PATCH 18/19] chore: format --- clang/lib/CodeGen/TargetBuiltins/X86.cpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/clang/lib/CodeGen/TargetBuiltins/X86.cpp b/clang/lib/CodeGen/TargetBuiltins/X86.cpp index a4bd8fb900d8d..0d803eef79d38 100644 --- a/clang/lib/CodeGen/TargetBuiltins/X86.cpp +++ b/clang/lib/CodeGen/TargetBuiltins/X86.cpp @@ -90,17 +90,17 @@ static Value *emitX86Round(CodeGenFunction &CGF, Value *X, Value *ExceptMode = MetadataAsValue::get(Ctx, MDString::get(Ctx, "fpexcept.ignore")); - + if (UseMXCSR) { ID = Intrinsic::experimental_constrained_nearbyint; Value *RoundingMode = - MetadataAsValue::get(Ctx, MDString::get(Ctx, "round.dynamic")); + MetadataAsValue::get(Ctx, MDString::get(Ctx, "round.dynamic")); Function *F = CGF.CGM.getIntrinsic(ID, X->getType()); return CGF.Builder.CreateCall(F, {X, RoundingMode, ExceptMode}); } - + switch (RoundingMode) { case 0b00: ID = Intrinsic::experimental_constrained_roundeven; >From 04636feba7582a594100f032b94d4ffd3a48ddce Mon Sep 17 00:00:00 2001 From: stomfaig <[email protected]> Date: Fri, 12 Dec 2025 11:12:33 +0000 Subject: [PATCH 19/19] style: idx -> Idx --- clang/lib/CodeGen/TargetBuiltins/X86.cpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/clang/lib/CodeGen/TargetBuiltins/X86.cpp b/clang/lib/CodeGen/TargetBuiltins/X86.cpp index 0d803eef79d38..cbb46381a02f9 100644 --- a/clang/lib/CodeGen/TargetBuiltins/X86.cpp +++ b/clang/lib/CodeGen/TargetBuiltins/X86.cpp @@ -919,11 +919,11 @@ Value *CodeGenFunction::EmitX86BuiltinExpr(unsigned BuiltinID, case X86::BI__builtin_ia32_roundsd: { unsigned M = cast<ConstantInt>(Ops[2])->getZExtValue(); - Value *idx = Builder.getInt32(0); - Value *ValAt0 = Builder.CreateExtractElement(Ops[1], idx); + Value *Idx = Builder.getInt32(0); + Value *ValAt0 = Builder.CreateExtractElement(Ops[1], Idx); Value *RoundedAt0 = emitX86Round(*this, ValAt0, M); - return Builder.CreateInsertElement(Ops[0], RoundedAt0, idx); + return Builder.CreateInsertElement(Ops[0], RoundedAt0, Idx); } case X86::BI__builtin_ia32_lzcnt_u16: case X86::BI__builtin_ia32_lzcnt_u32: _______________________________________________ cfe-commits mailing list [email protected] https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
