https://github.com/rohitaggarwal007 updated https://github.com/llvm/llvm-project/pull/203521
>From 00dbfba3b0f83aec53adf3b4492f167dee88c07a Mon Sep 17 00:00:00 2001 From: Rohit Aggarwal <[email protected]> Date: Mon, 8 Jun 2026 16:22:13 +0530 Subject: [PATCH 1/3] [X86] Try to preserve the MXCSR flag behaviour while lowering the intrinsics --- llvm/lib/Target/X86/X86ISelLowering.cpp | 24 +++ .../Target/X86/X86InstCombineIntrinsic.cpp | 18 +- .../X86/avx512-cur-direction-rounding.ll | 61 +++++++ .../X86/x86-avx512-cur-direction-rounding.ll | 168 ++++++++++++++++++ 4 files changed, 269 insertions(+), 2 deletions(-) create mode 100644 llvm/test/CodeGen/X86/avx512-cur-direction-rounding.ll create mode 100644 llvm/test/Transforms/InstCombine/X86/x86-avx512-cur-direction-rounding.ll diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp index 8bb44e55d713f..1b902d3af8345 100644 --- a/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -27093,6 +27093,30 @@ SDValue X86TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, DAG.getTargetConstant(RC, dl, MVT::i32)); if (!isRoundModeCurDirection(Rnd)) return SDValue(); + + // CUR_DIRECTION means "use the current MXCSR rounding mode". In a + // function that accesses the FP environment (strictfp), lowering to a + // plain FADD/FSUB/FMUL/FDIV would let the DAG constant-fold the + // operation under round-to-nearest-even, discarding the live MXCSR + // rounding mode. Emit the corresponding strict node instead: it is not + // constant-folded and lowers to a real instruction that reads MXCSR. 
+ if (DAG.getMachineFunction().getFunction().hasFnAttribute( + Attribute::StrictFP)) { + unsigned StrictOpc = 0; + switch (IntrData->Opc0) { + case ISD::FADD: StrictOpc = ISD::STRICT_FADD; break; + case ISD::FSUB: StrictOpc = ISD::STRICT_FSUB; break; + case ISD::FMUL: StrictOpc = ISD::STRICT_FMUL; break; + case ISD::FDIV: StrictOpc = ISD::STRICT_FDIV; break; + default: break; + } + if (StrictOpc) { + SDValue StrictNode = DAG.getNode( + StrictOpc, dl, DAG.getVTList(Op.getValueType(), MVT::Other), + {DAG.getEntryNode(), Op.getOperand(1), Src2}); + return StrictNode.getValue(0); + } + } } return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(), diff --git a/llvm/lib/Target/X86/X86InstCombineIntrinsic.cpp b/llvm/lib/Target/X86/X86InstCombineIntrinsic.cpp index 932b4a416a8d3..2e48406a3599c 100644 --- a/llvm/lib/Target/X86/X86InstCombineIntrinsic.cpp +++ b/llvm/lib/Target/X86/X86InstCombineIntrinsic.cpp @@ -2451,7 +2451,14 @@ X86TTIImpl::instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) const { case Intrinsic::x86_avx512_mul_pd_512: case Intrinsic::x86_avx512_sub_pd_512: // If the rounding mode is CUR_DIRECTION(4) we can turn these into regular - // IR operations. + // IR operations. A plain fadd/fsub/fmul/fdiv is unconstrained FP and + // assumes the default rounding mode (round-to-nearest-even), whereas + // CUR_DIRECTION must honor whatever rounding the live MXCSR selects. Only + // fold when the function does not access the FP environment; inside a + // strictfp function MXCSR may have been changed (e.g. via fesetround), so + // the intrinsic must be preserved. 
+ if (II.getFunction()->getAttributes().hasFnAttr(Attribute::StrictFP)) + break; if (auto *R = dyn_cast<ConstantInt>(II.getArgOperand(2))) { if (R->getValue() == 4) { Value *Arg0 = II.getArgOperand(0); @@ -2493,7 +2500,14 @@ X86TTIImpl::instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) const { case Intrinsic::x86_avx512_mask_mul_sd_round: case Intrinsic::x86_avx512_mask_sub_sd_round: // If the rounding mode is CUR_DIRECTION(4) we can turn these into regular - // IR operations. + // IR operations. A plain fadd/fsub/fmul/fdiv is unconstrained FP and + // assumes the default rounding mode (round-to-nearest-even), whereas + // CUR_DIRECTION must honor whatever rounding the live MXCSR selects. Only + // fold when the function does not access the FP environment; inside a + // strictfp function MXCSR may have been changed (e.g. via fesetround), so + // the intrinsic must be preserved. + if (II.getFunction()->getAttributes().hasFnAttr(Attribute::StrictFP)) + break; if (auto *R = dyn_cast<ConstantInt>(II.getArgOperand(4))) { if (R->getValue() == 4) { // Extract the element as scalars. diff --git a/llvm/test/CodeGen/X86/avx512-cur-direction-rounding.ll b/llvm/test/CodeGen/X86/avx512-cur-direction-rounding.ll new file mode 100644 index 0000000000000..5c13f2196da52 --- /dev/null +++ b/llvm/test/CodeGen/X86/avx512-cur-direction-rounding.ll @@ -0,0 +1,61 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512f | FileCheck %s + +; Backend behavior for the AVX-512 packed add/sub/mul/div "_round" intrinsics +; with rounding-mode operand == 4 (_MM_FROUND_CUR_DIRECTION, "use current +; MXCSR"). +; +; - With symbolic operands the intrinsic lowers to a plain vaddps that reads the +; live MXCSR rounding mode at run time (correct), including under strictfp. 
+; - With constant operands in a non-strictfp function the SelectionDAG +; constant-folds the result using round-to-nearest-even (0x3F800001 == +; 1.00000012f). This is allowed under the default-FP-environment contract. +; - With constant operands in a strictfp function the lowering emits a strict +; node, which is NOT constant-folded, so a real runtime vaddps is produced +; that honors the live MXCSR rounding mode (no round-to-nearest value is +; baked in). + +declare <16 x float> @llvm.x86.avx512.add.ps.512(<16 x float>, <16 x float>, i32) + +define <16 x float> @add_ps_512_symbolic(<16 x float> %a, <16 x float> %b) { +; CHECK-LABEL: add_ps_512_symbolic: +; CHECK: # %bb.0: +; CHECK-NEXT: vaddps %zmm1, %zmm0, %zmm0 +; CHECK-NEXT: retq + %r = call <16 x float> @llvm.x86.avx512.add.ps.512(<16 x float> %a, <16 x float> %b, i32 4) + ret <16 x float> %r +} + +define <16 x float> @add_ps_512_symbolic_strictfp(<16 x float> %a, <16 x float> %b) strictfp { +; CHECK-LABEL: add_ps_512_symbolic_strictfp: +; CHECK: # %bb.0: +; CHECK-NEXT: vaddps %zmm1, %zmm0, %zmm0 +; CHECK-NEXT: retq + %r = call <16 x float> @llvm.x86.avx512.add.ps.512(<16 x float> %a, <16 x float> %b, i32 4) strictfp + ret <16 x float> %r +} + +define <16 x float> @add_ps_512_constant() { +; CHECK-LABEL: add_ps_512_constant: +; CHECK: # %bb.0: +; CHECK-NEXT: vbroadcastss {{.*#+}} zmm0 = [1.00000012E+0,1.00000012E+0,1.00000012E+0,1.00000012E+0,1.00000012E+0,1.00000012E+0,1.00000012E+0,1.00000012E+0,1.00000012E+0,1.00000012E+0,1.00000012E+0,1.00000012E+0,1.00000012E+0,1.00000012E+0,1.00000012E+0,1.00000012E+0] +; CHECK-NEXT: retq + %r = call <16 x float> @llvm.x86.avx512.add.ps.512( + <16 x float> splat (float 1.0), + <16 x float> splat (float 0x3E78000000000000), + i32 4) + ret <16 x float> %r +} + +define <16 x float> @add_ps_512_constant_strictfp() strictfp { +; CHECK-LABEL: add_ps_512_constant_strictfp: +; CHECK: # %bb.0: +; CHECK-NEXT: vbroadcastss {{.*#+}} zmm0 = 
[1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] +; CHECK-NEXT: vaddps {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %zmm0 +; CHECK-NEXT: retq + %r = call <16 x float> @llvm.x86.avx512.add.ps.512( + <16 x float> splat (float 1.0), + <16 x float> splat (float 0x3E78000000000000), + i32 4) strictfp + ret <16 x float> %r +} diff --git a/llvm/test/Transforms/InstCombine/X86/x86-avx512-cur-direction-rounding.ll b/llvm/test/Transforms/InstCombine/X86/x86-avx512-cur-direction-rounding.ll new file mode 100644 index 0000000000000..26d8ec1e6c57a --- /dev/null +++ b/llvm/test/Transforms/InstCombine/X86/x86-avx512-cur-direction-rounding.ll @@ -0,0 +1,168 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; RUN: opt < %s -passes=instcombine -mtriple=x86_64-unknown-unknown -S | FileCheck %s +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" + +; The AVX512 packed/scalar "_round" arithmetic intrinsics take a rounding-mode +; immediate. A value of 4 (_MM_FROUND_CUR_DIRECTION) means "use whatever the +; MXCSR register currently selects". InstCombine rewrites these to plain +; unconstrained FP operations, which are semantically round-to-nearest-even and +; are then free to be constant-folded. +; +; These tests document that behavior. The 'constfold' tests are the dangerous +; case: with constant operands chosen so that round-to-nearest and +; round-toward-zero disagree, the result is baked in as the round-to-nearest +; value (0x3F800001 == 1.00000012f), even though a caller that set MXCSR to +; round-toward-zero before the call expects 1.0 (0x3F800000). 
+ +declare <16 x float> @llvm.x86.avx512.add.ps.512(<16 x float>, <16 x float>, i32) +declare <16 x float> @llvm.x86.avx512.sub.ps.512(<16 x float>, <16 x float>, i32) +declare <16 x float> @llvm.x86.avx512.mul.ps.512(<16 x float>, <16 x float>, i32) +declare <16 x float> @llvm.x86.avx512.div.ps.512(<16 x float>, <16 x float>, i32) +declare <8 x double> @llvm.x86.avx512.add.pd.512(<8 x double>, <8 x double>, i32) +declare <4 x float> @llvm.x86.avx512.mask.add.ss.round(<4 x float>, <4 x float>, <4 x float>, i8, i32) + +; In a strictfp function the FP environment may have been changed (e.g. via +; fesetround), so CUR_DIRECTION must NOT be folded to an unconstrained fadd: +; the intrinsic has to be preserved so the live MXCSR rounding mode is honored. +define <16 x float> @add_ps_512_cur_direction_strictfp(<16 x float> %a, <16 x float> %b) strictfp { +; CHECK-LABEL: @add_ps_512_cur_direction_strictfp( +; CHECK-NEXT: [[R:%.*]] = call <16 x float> @llvm.x86.avx512.add.ps.512(<16 x float> [[A:%.*]], <16 x float> [[B:%.*]], i32 4) #[[ATTR1:[0-9]+]] +; CHECK-NEXT: ret <16 x float> [[R]] +; + %r = call <16 x float> @llvm.x86.avx512.add.ps.512(<16 x float> %a, <16 x float> %b, i32 4) strictfp + ret <16 x float> %r +} + +; Same constants as the constfold test below, but strictfp: must stay an +; intrinsic instead of constant-folding to the round-to-nearest value. 
+define <16 x float> @add_ps_512_cur_direction_constfold_strictfp() strictfp { +; CHECK-LABEL: @add_ps_512_cur_direction_constfold_strictfp( +; CHECK-NEXT: [[R:%.*]] = call <16 x float> @llvm.x86.avx512.add.ps.512(<16 x float> splat (float 1.000000e+00), <16 x float> splat (float f0x33C00000), i32 4) #[[ATTR1]] +; CHECK-NEXT: ret <16 x float> [[R]] +; + %r = call <16 x float> @llvm.x86.avx512.add.ps.512( + <16 x float> <float 1.0, float 1.0, float 1.0, float 1.0, + float 1.0, float 1.0, float 1.0, float 1.0, + float 1.0, float 1.0, float 1.0, float 1.0, + float 1.0, float 1.0, float 1.0, float 1.0>, + <16 x float> <float 0x3E78000000000000, float 0x3E78000000000000, + float 0x3E78000000000000, float 0x3E78000000000000, + float 0x3E78000000000000, float 0x3E78000000000000, + float 0x3E78000000000000, float 0x3E78000000000000, + float 0x3E78000000000000, float 0x3E78000000000000, + float 0x3E78000000000000, float 0x3E78000000000000, + float 0x3E78000000000000, float 0x3E78000000000000, + float 0x3E78000000000000, float 0x3E78000000000000>, + i32 4) strictfp + ret <16 x float> %r +} + +; Masked scalar variant must likewise be preserved under strictfp. +define <4 x float> @mask_add_ss_round_cur_direction_strictfp(<4 x float> %a, <4 x float> %b, <4 x float> %c) strictfp { +; CHECK-LABEL: @mask_add_ss_round_cur_direction_strictfp( +; CHECK-NEXT: [[R:%.*]] = call <4 x float> @llvm.x86.avx512.mask.add.ss.round(<4 x float> [[A:%.*]], <4 x float> [[B:%.*]], <4 x float> [[C:%.*]], i8 -1, i32 4) #[[ATTR1]] +; CHECK-NEXT: ret <4 x float> [[R]] +; + %r = call <4 x float> @llvm.x86.avx512.mask.add.ss.round(<4 x float> %a, <4 x float> %b, <4 x float> %c, i8 -1, i32 4) strictfp + ret <4 x float> %r +} + +; The rounding-mode operand is dropped: CUR_DIRECTION folds to a plain fadd that +; no longer carries any MXCSR dependence. 
+define <16 x float> @add_ps_512_cur_direction(<16 x float> %a, <16 x float> %b) { +; CHECK-LABEL: @add_ps_512_cur_direction( +; CHECK-NEXT: [[R:%.*]] = fadd <16 x float> [[A:%.*]], [[B:%.*]] +; CHECK-NEXT: ret <16 x float> [[R]] +; + %r = call <16 x float> @llvm.x86.avx512.add.ps.512(<16 x float> %a, <16 x float> %b, i32 4) + ret <16 x float> %r +} + +define <16 x float> @sub_ps_512_cur_direction(<16 x float> %a, <16 x float> %b) { +; CHECK-LABEL: @sub_ps_512_cur_direction( +; CHECK-NEXT: [[R:%.*]] = fsub <16 x float> [[A:%.*]], [[B:%.*]] +; CHECK-NEXT: ret <16 x float> [[R]] +; + %r = call <16 x float> @llvm.x86.avx512.sub.ps.512(<16 x float> %a, <16 x float> %b, i32 4) + ret <16 x float> %r +} + +define <16 x float> @mul_ps_512_cur_direction(<16 x float> %a, <16 x float> %b) { +; CHECK-LABEL: @mul_ps_512_cur_direction( +; CHECK-NEXT: [[R:%.*]] = fmul <16 x float> [[A:%.*]], [[B:%.*]] +; CHECK-NEXT: ret <16 x float> [[R]] +; + %r = call <16 x float> @llvm.x86.avx512.mul.ps.512(<16 x float> %a, <16 x float> %b, i32 4) + ret <16 x float> %r +} + +define <16 x float> @div_ps_512_cur_direction(<16 x float> %a, <16 x float> %b) { +; CHECK-LABEL: @div_ps_512_cur_direction( +; CHECK-NEXT: [[R:%.*]] = fdiv <16 x float> [[A:%.*]], [[B:%.*]] +; CHECK-NEXT: ret <16 x float> [[R]] +; + %r = call <16 x float> @llvm.x86.avx512.div.ps.512(<16 x float> %a, <16 x float> %b, i32 4) + ret <16 x float> %r +} + +define <8 x double> @add_pd_512_cur_direction(<8 x double> %a, <8 x double> %b) { +; CHECK-LABEL: @add_pd_512_cur_direction( +; CHECK-NEXT: [[R:%.*]] = fadd <8 x double> [[A:%.*]], [[B:%.*]] +; CHECK-NEXT: ret <8 x double> [[R]] +; + %r = call <8 x double> @llvm.x86.avx512.add.pd.512(<8 x double> %a, <8 x double> %b, i32 4) + ret <8 x double> %r +} + +; A non-CUR_DIRECTION rounding mode (8 == round-to-nearest, no exceptions) must +; NOT be folded, since it overrides MXCSR. Kept as a negative control. 
+define <16 x float> @add_ps_512_round_to_nearest(<16 x float> %a, <16 x float> %b) { +; CHECK-LABEL: @add_ps_512_round_to_nearest( +; CHECK-NEXT: [[R:%.*]] = call <16 x float> @llvm.x86.avx512.add.ps.512(<16 x float> [[A:%.*]], <16 x float> [[B:%.*]], i32 8) +; CHECK-NEXT: ret <16 x float> [[R]] +; + %r = call <16 x float> @llvm.x86.avx512.add.ps.512(<16 x float> %a, <16 x float> %b, i32 8) + ret <16 x float> %r +} + +; Constant operands: a = 1.0, b = 3*2^-25 (0x3E78000000000000 in double-encoded +; literal form == 0x33C00000 as f32). 1.0 + b lies between 1.0 and 1.0+ulp and +; is closer to the upper neighbor, so: +; round-to-nearest-even -> 0x3F800001 (1.00000012) +; round-toward-zero -> 0x3F800000 (1.0) +; The fold bakes in the round-to-nearest value, which is wrong whenever the +; caller's live MXCSR rounding mode is not round-to-nearest. +define <16 x float> @add_ps_512_cur_direction_constfold() { +; CHECK-LABEL: @add_ps_512_cur_direction_constfold( +; CHECK-NEXT: ret <16 x float> splat (float f0x3F800001) +; + %r = call <16 x float> @llvm.x86.avx512.add.ps.512( + <16 x float> <float 1.0, float 1.0, float 1.0, float 1.0, + float 1.0, float 1.0, float 1.0, float 1.0, + float 1.0, float 1.0, float 1.0, float 1.0, + float 1.0, float 1.0, float 1.0, float 1.0>, + <16 x float> <float 0x3E78000000000000, float 0x3E78000000000000, + float 0x3E78000000000000, float 0x3E78000000000000, + float 0x3E78000000000000, float 0x3E78000000000000, + float 0x3E78000000000000, float 0x3E78000000000000, + float 0x3E78000000000000, float 0x3E78000000000000, + float 0x3E78000000000000, float 0x3E78000000000000, + float 0x3E78000000000000, float 0x3E78000000000000, + float 0x3E78000000000000, float 0x3E78000000000000>, + i32 4) + ret <16 x float> %r +} + +; Masked scalar variant (arg 4 is the rounding mode); all-ones mask + constants. +; Lane 0 likewise constant-folds to the round-to-nearest value 0x3F800001. 
+define <4 x float> @mask_add_ss_round_cur_direction_constfold() { +; CHECK-LABEL: @mask_add_ss_round_cur_direction_constfold( +; CHECK-NEXT: ret <4 x float> <float f0x3F800001, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00> +; + %r = call <4 x float> @llvm.x86.avx512.mask.add.ss.round( + <4 x float> <float 1.0, float 0.0, float 0.0, float 0.0>, + <4 x float> <float 0x3E78000000000000, float 0.0, float 0.0, float 0.0>, + <4 x float> zeroinitializer, + i8 -1, i32 4) + ret <4 x float> %r +} >From 666415fca7e1a32dc0b7537b3dc2791d758a1e13 Mon Sep 17 00:00:00 2001 From: Rohit Aggarwal <[email protected]> Date: Fri, 12 Jun 2026 17:01:38 +0530 Subject: [PATCH 2/3] [X86] Add clang test for AVX-512 CUR_DIRECTION rounding MXCSR behaviour Add a CodeGen test covering the packed add/sub/mul/div _round builtins with _MM_FROUND_CUR_DIRECTION. It checks that with -ffp-exception-behavior=strict the call and enclosing function are marked strictfp (so the operation is preserved and honors the live MXCSR rounding mode), while without it the intrinsic is foldable to a plain IR op at -O2. 
--- .../X86/avx512f-cur-direction-rounding.c | 134 ++++++++++++++++++ 1 file changed, 134 insertions(+) create mode 100644 clang/test/CodeGen/X86/avx512f-cur-direction-rounding.c diff --git a/clang/test/CodeGen/X86/avx512f-cur-direction-rounding.c b/clang/test/CodeGen/X86/avx512f-cur-direction-rounding.c new file mode 100644 index 0000000000000..fbbb0efdbebf2 --- /dev/null +++ b/clang/test/CodeGen/X86/avx512f-cur-direction-rounding.c @@ -0,0 +1,134 @@ +// REQUIRES: x86-registered-target +// RUN: %clang_cc1 -flax-vector-conversions=none -ffreestanding %s -triple=x86_64-unknown-linux-gnu -target-feature +avx512f -emit-llvm -o - -Wall -Werror | FileCheck --check-prefixes=COMMON,UNCONSTRAINED %s +// RUN: %clang_cc1 -flax-vector-conversions=none -ffreestanding %s -triple=x86_64-unknown-linux-gnu -target-feature +avx512f -ffp-exception-behavior=strict -emit-llvm -o - -Wall -Werror | FileCheck --check-prefixes=COMMON,STRICT %s +// RUN: %clang_cc1 -flax-vector-conversions=none -ffreestanding %s -triple=x86_64-unknown-linux-gnu -target-feature +avx512f -S -o - -Wall -Werror | FileCheck --check-prefix=CHECK-ASM %s +// RUN: %clang_cc1 -flax-vector-conversions=none -ffreestanding %s -triple=x86_64-unknown-linux-gnu -target-feature +avx512f -ffp-exception-behavior=strict -S -o - -Wall -Werror | FileCheck --check-prefix=CHECK-ASM %s +// +// At -O2 the default (non-strictfp) intrinsic is folded to a plain fadd, while +// the strictfp form is preserved (see test_mm512_add_round_ps_fold below). 
+// RUN: %clang_cc1 -flax-vector-conversions=none -ffreestanding %s -triple=x86_64-unknown-linux-gnu -target-feature +avx512f -O2 -emit-llvm -o - -Wall -Werror | FileCheck --check-prefix=FOLD %s +// RUN: %clang_cc1 -flax-vector-conversions=none -ffreestanding %s -triple=x86_64-unknown-linux-gnu -target-feature +avx512f -ffp-exception-behavior=strict -O2 -emit-llvm -o - -Wall -Werror | FileCheck --check-prefix=KEEP %s + +// The packed add/sub/mul/div "_round" builtins with _MM_FROUND_CUR_DIRECTION +// lower to the unmasked x86 intrinsic with rounding operand 4. +// +// Without -ffp-exception-behavior=strict the call is a plain (non-strictfp) +// intrinsic call: under the default FP environment the optimizer is free to +// fold it to round-to-nearest IR. +// +// With -ffp-exception-behavior=strict the enclosing function and the call are +// marked "strictfp". That attribute is what makes the rest of the pipeline +// (InstCombine and the X86 SelectionDAG lowering) preserve the operation so it +// honors the live MXCSR rounding mode instead of constant-folding it. 
+ +#include <immintrin.h> + +__m512 test_mm512_add_round_ps(__m512 a, __m512 b) { + // COMMON-LABEL: test_mm512_add_round_ps + // UNCONSTRAINED: call <16 x float> @llvm.x86.avx512.add.ps.512(<16 x float> %{{.*}}, <16 x float> %{{.*}}, i32 4) + // STRICT: call <16 x float> @llvm.x86.avx512.add.ps.512(<16 x float> %{{.*}}, <16 x float> %{{.*}}, i32 4) #[[ATTR:[0-9]+]] + // CHECK-ASM: vaddps + return _mm512_add_round_ps(a, b, _MM_FROUND_CUR_DIRECTION); +} + +__m512 test_mm512_sub_round_ps(__m512 a, __m512 b) { + // COMMON-LABEL: test_mm512_sub_round_ps + // UNCONSTRAINED: call <16 x float> @llvm.x86.avx512.sub.ps.512(<16 x float> %{{.*}}, <16 x float> %{{.*}}, i32 4) + // STRICT: call <16 x float> @llvm.x86.avx512.sub.ps.512(<16 x float> %{{.*}}, <16 x float> %{{.*}}, i32 4) #[[ATTR]] + // CHECK-ASM: vsubps + return _mm512_sub_round_ps(a, b, _MM_FROUND_CUR_DIRECTION); +} + +__m512 test_mm512_mul_round_ps(__m512 a, __m512 b) { + // COMMON-LABEL: test_mm512_mul_round_ps + // UNCONSTRAINED: call <16 x float> @llvm.x86.avx512.mul.ps.512(<16 x float> %{{.*}}, <16 x float> %{{.*}}, i32 4) + // STRICT: call <16 x float> @llvm.x86.avx512.mul.ps.512(<16 x float> %{{.*}}, <16 x float> %{{.*}}, i32 4) #[[ATTR]] + // CHECK-ASM: vmulps + return _mm512_mul_round_ps(a, b, _MM_FROUND_CUR_DIRECTION); +} + +__m512 test_mm512_div_round_ps(__m512 a, __m512 b) { + // COMMON-LABEL: test_mm512_div_round_ps + // UNCONSTRAINED: call <16 x float> @llvm.x86.avx512.div.ps.512(<16 x float> %{{.*}}, <16 x float> %{{.*}}, i32 4) + // STRICT: call <16 x float> @llvm.x86.avx512.div.ps.512(<16 x float> %{{.*}}, <16 x float> %{{.*}}, i32 4) #[[ATTR]] + // CHECK-ASM: vdivps + return _mm512_div_round_ps(a, b, _MM_FROUND_CUR_DIRECTION); +} + +// Optimized (-O2) view of the same operation, equivalent to the InstCombine +// test add_ps_512_cur_direction: without strictfp the rounding-mode operand is +// dropped and the call becomes a plain fadd that no longer carries any MXCSR +// dependence; with strictfp 
the intrinsic (and its MXCSR dependence) survives. +__m512 test_mm512_add_round_ps_fold(__m512 a, __m512 b) { + // FOLD-LABEL: @test_mm512_add_round_ps_fold( + // FOLD: fadd <16 x float> %{{.*}}, %{{.*}} + // KEEP-LABEL: @test_mm512_add_round_ps_fold( + // KEEP: call <16 x float> @llvm.x86.avx512.add.ps.512(<16 x float> %{{.*}}, <16 x float> %{{.*}}, i32 4) + return _mm512_add_round_ps(a, b, _MM_FROUND_CUR_DIRECTION); +} + +__m512 test_mm512_sub_round_ps_fold(__m512 a, __m512 b) { + // FOLD-LABEL: @test_mm512_sub_round_ps_fold( + // FOLD: fsub <16 x float> %{{.*}}, %{{.*}} + // KEEP-LABEL: @test_mm512_sub_round_ps_fold( + // KEEP: call <16 x float> @llvm.x86.avx512.sub.ps.512(<16 x float> %{{.*}}, <16 x float> %{{.*}}, i32 4) + return _mm512_sub_round_ps(a, b, _MM_FROUND_CUR_DIRECTION); +} + +__m512 test_mm512_mul_round_ps_fold(__m512 a, __m512 b) { + // FOLD-LABEL: @test_mm512_mul_round_ps_fold( + // FOLD: fmul <16 x float> %{{.*}}, %{{.*}} + // KEEP-LABEL: @test_mm512_mul_round_ps_fold( + // KEEP: call <16 x float> @llvm.x86.avx512.mul.ps.512(<16 x float> %{{.*}}, <16 x float> %{{.*}}, i32 4) + return _mm512_mul_round_ps(a, b, _MM_FROUND_CUR_DIRECTION); +} + +__m512 test_mm512_div_round_ps_fold(__m512 a, __m512 b) { + // FOLD-LABEL: @test_mm512_div_round_ps_fold( + // FOLD: fdiv <16 x float> %{{.*}}, %{{.*}} + // KEEP-LABEL: @test_mm512_div_round_ps_fold( + // KEEP: call <16 x float> @llvm.x86.avx512.div.ps.512(<16 x float> %{{.*}}, <16 x float> %{{.*}}, i32 4) + return _mm512_div_round_ps(a, b, _MM_FROUND_CUR_DIRECTION); +} + +__m512d test_mm512_add_round_pd_fold(__m512d a, __m512d b) { + // FOLD-LABEL: @test_mm512_add_round_pd_fold( + // FOLD: fadd <8 x double> %{{.*}}, %{{.*}} + // KEEP-LABEL: @test_mm512_add_round_pd_fold( + // KEEP: call <8 x double> @llvm.x86.avx512.add.pd.512(<8 x double> %{{.*}}, <8 x double> %{{.*}}, i32 4) + return _mm512_add_round_pd(a, b, _MM_FROUND_CUR_DIRECTION); +} + +__m512d test_mm512_sub_round_pd_fold(__m512d a, __m512d b) { + 
// FOLD-LABEL: @test_mm512_sub_round_pd_fold( + // FOLD: fsub <8 x double> %{{.*}}, %{{.*}} + // KEEP-LABEL: @test_mm512_sub_round_pd_fold( + // KEEP: call <8 x double> @llvm.x86.avx512.sub.pd.512(<8 x double> %{{.*}}, <8 x double> %{{.*}}, i32 4) + return _mm512_sub_round_pd(a, b, _MM_FROUND_CUR_DIRECTION); +} + +__m512d test_mm512_mul_round_pd_fold(__m512d a, __m512d b) { + // FOLD-LABEL: @test_mm512_mul_round_pd_fold( + // FOLD: fmul <8 x double> %{{.*}}, %{{.*}} + // KEEP-LABEL: @test_mm512_mul_round_pd_fold( + // KEEP: call <8 x double> @llvm.x86.avx512.mul.pd.512(<8 x double> %{{.*}}, <8 x double> %{{.*}}, i32 4) + return _mm512_mul_round_pd(a, b, _MM_FROUND_CUR_DIRECTION); +} + +__m512d test_mm512_div_round_pd_fold(__m512d a, __m512d b) { + // FOLD-LABEL: @test_mm512_div_round_pd_fold( + // FOLD: fdiv <8 x double> %{{.*}}, %{{.*}} + // KEEP-LABEL: @test_mm512_div_round_pd_fold( + // KEEP: call <8 x double> @llvm.x86.avx512.div.pd.512(<8 x double> %{{.*}}, <8 x double> %{{.*}}, i32 4) + return _mm512_div_round_pd(a, b, _MM_FROUND_CUR_DIRECTION); +} + +__m512d test_mm512_add_round_pd(__m512d a, __m512d b) { + // COMMON-LABEL: test_mm512_add_round_pd + // UNCONSTRAINED: call <8 x double> @llvm.x86.avx512.add.pd.512(<8 x double> %{{.*}}, <8 x double> %{{.*}}, i32 4) + // STRICT: call <8 x double> @llvm.x86.avx512.add.pd.512(<8 x double> %{{.*}}, <8 x double> %{{.*}}, i32 4) #[[ATTR]] + // CHECK-ASM: vaddpd + return _mm512_add_round_pd(a, b, _MM_FROUND_CUR_DIRECTION); +} + +// STRICT: attributes #[[ATTR]] = { strictfp } >From a467c81ea67b7b3ff476dbc568a10d92cd61cbd2 Mon Sep 17 00:00:00 2001 From: Rohit Aggarwal <[email protected]> Date: Fri, 12 Jun 2026 18:38:53 +0530 Subject: [PATCH 3/3] Fix formatting issue in llvm/lib/Target/X86/X86ISelLowering.cpp --- llvm/lib/Target/X86/X86ISelLowering.cpp | 19 ++++++++++++++----- 1 file changed, 14 insertions(+), 5 deletions(-) diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 1b902d3af8345..417d3205e57cc 100644 --- a/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -27104,11 +27104,20 @@ SDValue X86TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, Attribute::StrictFP)) { unsigned StrictOpc = 0; switch (IntrData->Opc0) { - case ISD::FADD: StrictOpc = ISD::STRICT_FADD; break; - case ISD::FSUB: StrictOpc = ISD::STRICT_FSUB; break; - case ISD::FMUL: StrictOpc = ISD::STRICT_FMUL; break; - case ISD::FDIV: StrictOpc = ISD::STRICT_FDIV; break; - default: break; + case ISD::FADD: + StrictOpc = ISD::STRICT_FADD; + break; + case ISD::FSUB: + StrictOpc = ISD::STRICT_FSUB; + break; + case ISD::FMUL: + StrictOpc = ISD::STRICT_FMUL; + break; + case ISD::FDIV: + StrictOpc = ISD::STRICT_FDIV; + break; + default: + break; } if (StrictOpc) { SDValue StrictNode = DAG.getNode( _______________________________________________ cfe-commits mailing list [email protected] https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
