llvmbot wrote:

<!--LLVM PR SUMMARY COMMENT-->
@llvm/pr-subscribers-backend-systemz

@llvm/pr-subscribers-backend-aarch64

Author: Serge Pavlov (spavloff)

<details>
<summary>Changes</summary>

The previous default implementation unrolled strictfp vector operations, 
producing their scalar versions. This behavior can result in low performance in 
some cases. A better behavior is to convert strictfp nodes into default-mode 
operations. This method has already been used to support strictfp nodes on 
targets with limited support for strictfp operations.

Many targets already implement lowering of strictfp and default-mode nodes 
identically. A small number of deviations from this rule are fixed in this 
change. Identical treatment of strictfp and default-mode nodes is natural 
because both represent the same operation in the resulting code; the only 
difference is the additional restrictions that the compiler must apply when it 
handles strictfp nodes.

Unrolling vector operations can also violate user expectations. In strictfp 
mode, vector operations cannot be created by auto-vectorization because of the 
side effects of constrained functions, so any vector operations come from the 
source code. A user may use vector operations to improve performance, but 
unrolling negates that intended performance gain.

---

Patch is 213.99 KiB, truncated to 20.00 KiB below, full version: 
https://github.com/llvm/llvm-project/pull/179276.diff


17 Files Affected:

- (modified) llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp (+19-4) 
- (modified) llvm/test/CodeGen/AArch64/fp-intrinsics-vector.ll (+1-2) 
- (modified) llvm/test/CodeGen/AMDGPU/strict_fadd.f16.ll (+28-96) 
- (modified) llvm/test/CodeGen/AMDGPU/strict_fma.f16.ll (+12-53) 
- (modified) llvm/test/CodeGen/AMDGPU/strict_fma.f32.ll (+4-4) 
- (modified) llvm/test/CodeGen/AMDGPU/strict_fma.f64.ll (+4-4) 
- (modified) llvm/test/CodeGen/AMDGPU/strict_fmul.f16.ll (+22-97) 
- (modified) llvm/test/CodeGen/AMDGPU/strict_fptrunc.ll (+14-16) 
- (modified) llvm/test/CodeGen/AMDGPU/strict_fsub.f16.ll (+99-331) 
- (modified) llvm/test/CodeGen/AMDGPU/strict_ldexp.f16.ll (+69-73) 
- (modified) llvm/test/CodeGen/AMDGPU/strict_ldexp.f32.ll (+69-144) 
- (modified) llvm/test/CodeGen/ARM/fp-intrinsics-vector.ll (+9-91) 
- (modified) llvm/test/CodeGen/PowerPC/fp-strict-round.ll (+2-86) 
- (modified) llvm/test/CodeGen/PowerPC/fp-strict.ll (+9-182) 
- (modified) llvm/test/CodeGen/SystemZ/vector-constrained-fp-intrinsics.ll 
(+9-168) 
- (modified) llvm/test/CodeGen/X86/avx512fp16-frem.ll (+156-156) 
- (modified) llvm/test/CodeGen/X86/vector-constrained-fp-intrinsics.ll 
(+169-521) 


``````````diff
diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp 
b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp
index 8f6eddefa57ac..16f431e0315be 100644
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp
@@ -257,12 +257,24 @@ SDValue VectorLegalizer::TranslateLegalizeResults(SDValue 
Op, SDNode *Result) {
 SDValue
 VectorLegalizer::RecursivelyLegalizeResults(SDValue Op,
                                             MutableArrayRef<SDValue> Results) {
-  assert(Results.size() == Op->getNumValues() &&
-         "Unexpected number of results");
+  // If a strictfp node was lowered to non-strictfp one, the array Results has
+  // one value more than the number of values produced by Op. The additional
+  // value is the chain produced by the original strictfp operation.
+  if (Results.size() != Op->getNumValues()) {
+    assert(Results.size() == (Op->getNumValues() + 1) &&
+           "Unexpected number of results");
+    if (Op.getResNo() >= Op->getNumValues()) {
+      assert(Op.getResNo() + 1 == Results.size());
+      SDValue Chain = Results.back();
+      assert(Chain.getValueType() == MVT::Other);
+      return Chain;
+    }
+  }
   // Make sure that the generated code is itself legal.
   for (unsigned i = 0, e = Results.size(); i != e; ++i) {
     Results[i] = LegalizeOp(Results[i]);
-    AddLegalizedOperand(Op.getValue(i), Results[i]);
+    if (i < Op->getNumValues())
+      AddLegalizedOperand(Op.getValue(i), Results[i]);
   }
 
   return Results[Op.getResNo()];
@@ -2238,7 +2250,10 @@ void VectorLegalizer::ExpandStrictFPOp(SDNode *Node,
     return;
   }
 
-  UnrollStrictFPOp(Node, Results);
+  SDValue Chain = Node->getOperand(0);
+  assert(Chain.getValueType() == MVT::Other && "Wrong type of input chain");
+  Results.push_back(SDValue(DAG.mutateStrictFPToFP(Node), 0));
+  Results.push_back(Chain);
 }
 
 void VectorLegalizer::ExpandREM(SDNode *Node,
diff --git a/llvm/test/CodeGen/AArch64/fp-intrinsics-vector.ll 
b/llvm/test/CodeGen/AArch64/fp-intrinsics-vector.ll
index c3da22757f1d2..50c894623fc17 100644
--- a/llvm/test/CodeGen/AArch64/fp-intrinsics-vector.ll
+++ b/llvm/test/CodeGen/AArch64/fp-intrinsics-vector.ll
@@ -760,8 +760,7 @@ define <1 x double> @sitofp_v1f64_v1i64(<1 x i64> %x) #0 {
 ; CHECK-LABEL: sitofp_v1f64_v1i64:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 def $q0
-; CHECK-NEXT:    fmov x8, d0
-; CHECK-NEXT:    scvtf d0, x8
+; CHECK-NEXT:    scvtf d0, d0
 ; CHECK-NEXT:    ret
   %val = call <1 x double> 
@llvm.experimental.constrained.sitofp.v1f64.v1i64(<1 x i64> %x, metadata 
!"round.tonearest", metadata !"fpexcept.strict") #0
   ret <1 x double> %val
diff --git a/llvm/test/CodeGen/AMDGPU/strict_fadd.f16.ll 
b/llvm/test/CodeGen/AMDGPU/strict_fadd.f16.ll
index c68a0e6f43578..7b17e98d9d135 100644
--- a/llvm/test/CodeGen/AMDGPU/strict_fadd.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/strict_fadd.f16.ll
@@ -353,24 +353,12 @@ define <3 x half> 
@v_constained_fadd_v3f16_fpexcept_strict(<3 x half> %x, <3 x h
 
 ; FIXME: Scalarized
 define <4 x half> @v_constained_fadd_v4f16_fpexcept_strict(<4 x half> %x, <4 x 
half> %y) #0 {
-; GFX9-SDAG-LABEL: v_constained_fadd_v4f16_fpexcept_strict:
-; GFX9-SDAG:       ; %bb.0:
-; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-SDAG-NEXT:    v_add_f16_sdwa v4, v1, v3 dst_sel:DWORD 
dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX9-SDAG-NEXT:    v_add_f16_sdwa v5, v0, v2 dst_sel:DWORD 
dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX9-SDAG-NEXT:    v_add_f16_e32 v1, v1, v3
-; GFX9-SDAG-NEXT:    v_add_f16_e32 v0, v0, v2
-; GFX9-SDAG-NEXT:    s_mov_b32 s4, 0x5040100
-; GFX9-SDAG-NEXT:    v_perm_b32 v0, v5, v0, s4
-; GFX9-SDAG-NEXT:    v_perm_b32 v1, v4, v1, s4
-; GFX9-SDAG-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX9-GISEL-LABEL: v_constained_fadd_v4f16_fpexcept_strict:
-; GFX9-GISEL:       ; %bb.0:
-; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-GISEL-NEXT:    v_pk_add_f16 v0, v0, v2
-; GFX9-GISEL-NEXT:    v_pk_add_f16 v1, v1, v3
-; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
+; GFX9-LABEL: v_constained_fadd_v4f16_fpexcept_strict:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_pk_add_f16 v0, v0, v2
+; GFX9-NEXT:    v_pk_add_f16 v1, v1, v3
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-SDAG-LABEL: v_constained_fadd_v4f16_fpexcept_strict:
 ; GFX8-SDAG:       ; %bb.0:
@@ -394,86 +382,30 @@ define <4 x half> 
@v_constained_fadd_v4f16_fpexcept_strict(<4 x half> %x, <4 x h
 ; GFX8-GISEL-NEXT:    v_or_b32_e32 v1, v2, v1
 ; GFX8-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX10-SDAG-LABEL: v_constained_fadd_v4f16_fpexcept_strict:
-; GFX10-SDAG:       ; %bb.0:
-; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-SDAG-NEXT:    v_add_f16_sdwa v4, v1, v3 dst_sel:DWORD 
dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX10-SDAG-NEXT:    v_add_f16_sdwa v5, v0, v2 dst_sel:DWORD 
dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX10-SDAG-NEXT:    v_add_f16_e32 v0, v0, v2
-; GFX10-SDAG-NEXT:    v_add_f16_e32 v1, v1, v3
-; GFX10-SDAG-NEXT:    v_perm_b32 v0, v5, v0, 0x5040100
-; GFX10-SDAG-NEXT:    v_perm_b32 v1, v4, v1, 0x5040100
-; GFX10-SDAG-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX10-GISEL-LABEL: v_constained_fadd_v4f16_fpexcept_strict:
-; GFX10-GISEL:       ; %bb.0:
-; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-GISEL-NEXT:    v_pk_add_f16 v0, v0, v2
-; GFX10-GISEL-NEXT:    v_pk_add_f16 v1, v1, v3
-; GFX10-GISEL-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX11-SDAG-TRUE16-LABEL: v_constained_fadd_v4f16_fpexcept_strict:
-; GFX11-SDAG-TRUE16:       ; %bb.0:
-; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-SDAG-TRUE16-NEXT:    v_add_f16_e32 v1.h, v1.h, v3.h
-; GFX11-SDAG-TRUE16-NEXT:    v_add_f16_e32 v0.h, v0.h, v2.h
-; GFX11-SDAG-TRUE16-NEXT:    v_add_f16_e32 v0.l, v0.l, v2.l
-; GFX11-SDAG-TRUE16-NEXT:    v_add_f16_e32 v1.l, v1.l, v3.l
-; GFX11-SDAG-TRUE16-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX11-SDAG-FAKE16-LABEL: v_constained_fadd_v4f16_fpexcept_strict:
-; GFX11-SDAG-FAKE16:       ; %bb.0:
-; GFX11-SDAG-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-SDAG-FAKE16-NEXT:    v_lshrrev_b32_e32 v4, 16, v3
-; GFX11-SDAG-FAKE16-NEXT:    v_lshrrev_b32_e32 v5, 16, v2
-; GFX11-SDAG-FAKE16-NEXT:    v_lshrrev_b32_e32 v6, 16, v0
-; GFX11-SDAG-FAKE16-NEXT:    v_lshrrev_b32_e32 v7, 16, v1
-; GFX11-SDAG-FAKE16-NEXT:    v_add_f16_e32 v1, v1, v3
-; GFX11-SDAG-FAKE16-NEXT:    v_add_f16_e32 v0, v0, v2
-; GFX11-SDAG-FAKE16-NEXT:    v_add_f16_e32 v2, v6, v5
-; GFX11-SDAG-FAKE16-NEXT:    v_add_f16_e32 v3, v7, v4
-; GFX11-SDAG-FAKE16-NEXT:    v_perm_b32 v0, v2, v0, 0x5040100
-; GFX11-SDAG-FAKE16-NEXT:    v_perm_b32 v1, v3, v1, 0x5040100
-; GFX11-SDAG-FAKE16-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX11-GISEL-LABEL: v_constained_fadd_v4f16_fpexcept_strict:
-; GFX11-GISEL:       ; %bb.0:
-; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-GISEL-NEXT:    v_pk_add_f16 v0, v0, v2
-; GFX11-GISEL-NEXT:    v_pk_add_f16 v1, v1, v3
-; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
+; GFX10-LABEL: v_constained_fadd_v4f16_fpexcept_strict:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    v_pk_add_f16 v0, v0, v2
+; GFX10-NEXT:    v_pk_add_f16 v1, v1, v3
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX12-SDAG-LABEL: v_constained_fadd_v4f16_fpexcept_strict:
-; GFX12-SDAG:       ; %bb.0:
-; GFX12-SDAG-NEXT:    s_wait_loadcnt_dscnt 0x0
-; GFX12-SDAG-NEXT:    s_wait_expcnt 0x0
-; GFX12-SDAG-NEXT:    s_wait_samplecnt 0x0
-; GFX12-SDAG-NEXT:    s_wait_bvhcnt 0x0
-; GFX12-SDAG-NEXT:    s_wait_kmcnt 0x0
-; GFX12-SDAG-NEXT:    v_lshrrev_b32_e32 v4, 16, v3
-; GFX12-SDAG-NEXT:    v_lshrrev_b32_e32 v5, 16, v2
-; GFX12-SDAG-NEXT:    v_lshrrev_b32_e32 v6, 16, v0
-; GFX12-SDAG-NEXT:    v_lshrrev_b32_e32 v7, 16, v1
-; GFX12-SDAG-NEXT:    v_add_f16_e32 v1, v1, v3
-; GFX12-SDAG-NEXT:    v_add_f16_e32 v0, v0, v2
-; GFX12-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | 
instid1(VALU_DEP_4)
-; GFX12-SDAG-NEXT:    v_add_f16_e32 v2, v6, v5
-; GFX12-SDAG-NEXT:    v_add_f16_e32 v3, v7, v4
-; GFX12-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | 
instid1(VALU_DEP_2)
-; GFX12-SDAG-NEXT:    v_perm_b32 v0, v2, v0, 0x5040100
-; GFX12-SDAG-NEXT:    v_perm_b32 v1, v3, v1, 0x5040100
-; GFX12-SDAG-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-LABEL: v_constained_fadd_v4f16_fpexcept_strict:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_pk_add_f16 v0, v0, v2
+; GFX11-NEXT:    v_pk_add_f16 v1, v1, v3
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX12-GISEL-LABEL: v_constained_fadd_v4f16_fpexcept_strict:
-; GFX12-GISEL:       ; %bb.0:
-; GFX12-GISEL-NEXT:    s_wait_loadcnt_dscnt 0x0
-; GFX12-GISEL-NEXT:    s_wait_expcnt 0x0
-; GFX12-GISEL-NEXT:    s_wait_samplecnt 0x0
-; GFX12-GISEL-NEXT:    s_wait_bvhcnt 0x0
-; GFX12-GISEL-NEXT:    s_wait_kmcnt 0x0
-; GFX12-GISEL-NEXT:    v_pk_add_f16 v0, v0, v2
-; GFX12-GISEL-NEXT:    v_pk_add_f16 v1, v1, v3
-; GFX12-GISEL-NEXT:    s_setpc_b64 s[30:31]
+; GFX12-LABEL: v_constained_fadd_v4f16_fpexcept_strict:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    v_pk_add_f16 v0, v0, v2
+; GFX12-NEXT:    v_pk_add_f16 v1, v1, v3
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
   %val = call <4 x half> @llvm.experimental.constrained.fadd.v4f16(<4 x half> 
%x, <4 x half> %y, metadata !"round.tonearest", metadata !"fpexcept.strict")
   ret <4 x half> %val
 }
diff --git a/llvm/test/CodeGen/AMDGPU/strict_fma.f16.ll 
b/llvm/test/CodeGen/AMDGPU/strict_fma.f16.ll
index f695526737311..28d9d12c78c53 100644
--- a/llvm/test/CodeGen/AMDGPU/strict_fma.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/strict_fma.f16.ll
@@ -117,19 +117,8 @@ define <4 x half> 
@v_constained_fma_v4f16_fpexcept_strict(<4 x half> %x, <4 x ha
 ; GFX9-LABEL: v_constained_fma_v4f16_fpexcept_strict:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_lshrrev_b32_e32 v6, 16, v5
-; GFX9-NEXT:    v_lshrrev_b32_e32 v7, 16, v3
-; GFX9-NEXT:    v_lshrrev_b32_e32 v8, 16, v1
-; GFX9-NEXT:    v_fma_f16 v6, v8, v7, v6
-; GFX9-NEXT:    v_lshrrev_b32_e32 v7, 16, v4
-; GFX9-NEXT:    v_lshrrev_b32_e32 v8, 16, v2
-; GFX9-NEXT:    v_lshrrev_b32_e32 v9, 16, v0
-; GFX9-NEXT:    v_fma_f16 v7, v9, v8, v7
-; GFX9-NEXT:    v_fma_f16 v1, v1, v3, v5
-; GFX9-NEXT:    v_fma_f16 v0, v0, v2, v4
-; GFX9-NEXT:    s_mov_b32 s4, 0x5040100
-; GFX9-NEXT:    v_perm_b32 v0, v7, v0, s4
-; GFX9-NEXT:    v_perm_b32 v1, v6, v1, s4
+; GFX9-NEXT:    v_pk_fma_f16 v0, v0, v2, v4
+; GFX9-NEXT:    v_pk_fma_f16 v1, v1, v3, v5
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_constained_fma_v4f16_fpexcept_strict:
@@ -154,46 +143,16 @@ define <4 x half> 
@v_constained_fma_v4f16_fpexcept_strict(<4 x half> %x, <4 x ha
 ; GFX10-LABEL: v_constained_fma_v4f16_fpexcept_strict:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    v_lshrrev_b32_e32 v6, 16, v5
-; GFX10-NEXT:    v_lshrrev_b32_e32 v7, 16, v3
-; GFX10-NEXT:    v_lshrrev_b32_e32 v8, 16, v1
-; GFX10-NEXT:    v_lshrrev_b32_e32 v9, 16, v4
-; GFX10-NEXT:    v_lshrrev_b32_e32 v10, 16, v2
-; GFX10-NEXT:    v_lshrrev_b32_e32 v11, 16, v0
-; GFX10-NEXT:    v_fmac_f16_e32 v4, v0, v2
-; GFX10-NEXT:    v_fmac_f16_e32 v6, v8, v7
-; GFX10-NEXT:    v_fmac_f16_e32 v5, v1, v3
-; GFX10-NEXT:    v_fmac_f16_e32 v9, v11, v10
-; GFX10-NEXT:    v_perm_b32 v1, v6, v5, 0x5040100
-; GFX10-NEXT:    v_perm_b32 v0, v9, v4, 0x5040100
+; GFX10-NEXT:    v_pk_fma_f16 v0, v0, v2, v4
+; GFX10-NEXT:    v_pk_fma_f16 v1, v1, v3, v5
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-TRUE16-LABEL: v_constained_fma_v4f16_fpexcept_strict:
-; GFX11-TRUE16:       ; %bb.0:
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-TRUE16-NEXT:    v_fma_f16 v5.h, v1.h, v3.h, v5.h
-; GFX11-TRUE16-NEXT:    v_fma_f16 v4.h, v0.h, v2.h, v4.h
-; GFX11-TRUE16-NEXT:    v_fmac_f16_e32 v4.l, v0.l, v2.l
-; GFX11-TRUE16-NEXT:    v_fmac_f16_e32 v5.l, v1.l, v3.l
-; GFX11-TRUE16-NEXT:    v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5
-; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX11-FAKE16-LABEL: v_constained_fma_v4f16_fpexcept_strict:
-; GFX11-FAKE16:       ; %bb.0:
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v6, 16, v5
-; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v7, 16, v3
-; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v8, 16, v1
-; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v9, 16, v4
-; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v10, 16, v2
-; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v11, 16, v0
-; GFX11-FAKE16-NEXT:    v_fmac_f16_e32 v4, v0, v2
-; GFX11-FAKE16-NEXT:    v_fmac_f16_e32 v6, v8, v7
-; GFX11-FAKE16-NEXT:    v_fmac_f16_e32 v5, v1, v3
-; GFX11-FAKE16-NEXT:    v_fmac_f16_e32 v9, v11, v10
-; GFX11-FAKE16-NEXT:    v_perm_b32 v1, v6, v5, 0x5040100
-; GFX11-FAKE16-NEXT:    v_perm_b32 v0, v9, v4, 0x5040100
-; GFX11-FAKE16-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-LABEL: v_constained_fma_v4f16_fpexcept_strict:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_pk_fma_f16 v0, v0, v2, v4
+; GFX11-NEXT:    v_pk_fma_f16 v1, v1, v3, v5
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %val = call <4 x half> @llvm.experimental.constrained.fma.v4f16(<4 x half> 
%x, <4 x half> %y, <4 x half> %z, metadata !"round.tonearest", metadata 
!"fpexcept.strict")
   ret <4 x half> %val
 }
@@ -300,9 +259,9 @@ define <2 x half> 
@v_constained_fma_v2f16_fpexcept_strict_fneg_fneg(<2 x half> %
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v3, 16, v2
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v4, 16, v1
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v5, 16, v0
-; GFX8-NEXT:    v_fma_f16 v3, -v5, -v4, v3
+; GFX8-NEXT:    v_fma_f16 v3, v5, v4, v3
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
-; GFX8-NEXT:    v_fma_f16 v0, -v0, -v1, v2
+; GFX8-NEXT:    v_fma_f16 v0, v0, v1, v2
 ; GFX8-NEXT:    v_or_b32_e32 v0, v0, v3
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
diff --git a/llvm/test/CodeGen/AMDGPU/strict_fma.f32.ll 
b/llvm/test/CodeGen/AMDGPU/strict_fma.f32.ll
index 922a4dcbea351..6447ff9409a3e 100644
--- a/llvm/test/CodeGen/AMDGPU/strict_fma.f32.ll
+++ b/llvm/test/CodeGen/AMDGPU/strict_fma.f32.ll
@@ -136,15 +136,15 @@ define <2 x float> 
@v_constained_fma_v2f32_fpexcept_strict_fneg_fneg(<2 x float>
 ; GCN-LABEL: v_constained_fma_v2f32_fpexcept_strict_fneg_fneg:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    v_fma_f32 v0, -v0, -v2, v4
-; GCN-NEXT:    v_fma_f32 v1, -v1, -v3, v5
+; GCN-NEXT:    v_fma_f32 v0, v0, v2, v4
+; GCN-NEXT:    v_fma_f32 v1, v1, v3, v5
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_constained_fma_v2f32_fpexcept_strict_fneg_fneg:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    v_fma_f32 v0, -v0, -v2, v4
-; GFX10-NEXT:    v_fma_f32 v1, -v1, -v3, v5
+; GFX10-NEXT:    v_fma_f32 v0, v0, v2, v4
+; GFX10-NEXT:    v_fma_f32 v1, v1, v3, v5
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
   %neg.x = fneg <2 x float> %x
   %neg.y = fneg <2 x float> %y
diff --git a/llvm/test/CodeGen/AMDGPU/strict_fma.f64.ll 
b/llvm/test/CodeGen/AMDGPU/strict_fma.f64.ll
index 5cb293ab25806..904471775b3a3 100644
--- a/llvm/test/CodeGen/AMDGPU/strict_fma.f64.ll
+++ b/llvm/test/CodeGen/AMDGPU/strict_fma.f64.ll
@@ -136,15 +136,15 @@ define <2 x double> 
@v_constained_fma_v2f64_fpexcept_strict_fneg_fneg(<2 x doubl
 ; GCN-LABEL: v_constained_fma_v2f64_fpexcept_strict_fneg_fneg:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    v_fma_f64 v[0:1], -v[0:1], -v[4:5], v[8:9]
-; GCN-NEXT:    v_fma_f64 v[2:3], -v[2:3], -v[6:7], v[10:11]
+; GCN-NEXT:    v_fma_f64 v[0:1], v[0:1], v[4:5], v[8:9]
+; GCN-NEXT:    v_fma_f64 v[2:3], v[2:3], v[6:7], v[10:11]
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_constained_fma_v2f64_fpexcept_strict_fneg_fneg:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    v_fma_f64 v[0:1], -v[0:1], -v[4:5], v[8:9]
-; GFX10-NEXT:    v_fma_f64 v[2:3], -v[2:3], -v[6:7], v[10:11]
+; GFX10-NEXT:    v_fma_f64 v[0:1], v[0:1], v[4:5], v[8:9]
+; GFX10-NEXT:    v_fma_f64 v[2:3], v[2:3], v[6:7], v[10:11]
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
   %neg.x = fneg <2 x double> %x
   %neg.y = fneg <2 x double> %y
diff --git a/llvm/test/CodeGen/AMDGPU/strict_fmul.f16.ll 
b/llvm/test/CodeGen/AMDGPU/strict_fmul.f16.ll
index bdb2128bf609b..cbd9351a505f2 100644
--- a/llvm/test/CodeGen/AMDGPU/strict_fmul.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/strict_fmul.f16.ll
@@ -425,24 +425,12 @@ define <3 x half> 
@v_constained_fmul_v3f16_fpexcept_strict(<3 x half> %x, <3 x h
 
 ; FIXME: Scalarized
 define <4 x half> @v_constained_fmul_v4f16_fpexcept_strict(<4 x half> %x, <4 x 
half> %y) #0 {
-; GFX9-SDAG-LABEL: v_constained_fmul_v4f16_fpexcept_strict:
-; GFX9-SDAG:       ; %bb.0:
-; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-SDAG-NEXT:    v_mul_f16_sdwa v4, v1, v3 dst_sel:DWORD 
dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX9-SDAG-NEXT:    v_mul_f16_sdwa v5, v0, v2 dst_sel:DWORD 
dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX9-SDAG-NEXT:    v_mul_f16_e32 v1, v1, v3
-; GFX9-SDAG-NEXT:    v_mul_f16_e32 v0, v0, v2
-; GFX9-SDAG-NEXT:    s_mov_b32 s4, 0x5040100
-; GFX9-SDAG-NEXT:    v_perm_b32 v0, v5, v0, s4
-; GFX9-SDAG-NEXT:    v_perm_b32 v1, v4, v1, s4
-; GFX9-SDAG-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX9-GISEL-LABEL: v_constained_fmul_v4f16_fpexcept_strict:
-; GFX9-GISEL:       ; %bb.0:
-; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-GISEL-NEXT:    v_pk_mul_f16 v0, v0, v2
-; GFX9-GISEL-NEXT:    v_pk_mul_f16 v1, v1, v3
-; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
+; GFX9-LABEL: v_constained_fmul_v4f16_fpexcept_strict:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_pk_mul_f16 v0, v0, v2
+; GFX9-NEXT:    v_pk_mul_f16 v1, v1, v3
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-SDAG-LABEL: v_constained_fmul_v4f16_fpexcept_strict:
 ; GFX8-SDAG:       ; %bb.0:
@@ -466,86 +454,23 @@ define <4 x half> 
@v_constained_fmul_v4f16_fpexcept_strict(<4 x half> %x, <4 x h
 ; GFX8-GISEL-NEXT:    v_or_b32_e32 v1, v2, v1
 ; GFX8-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX10-SDAG-LABEL: v_constained_fmul_v4f16_fpexcept_strict:
-; GFX10-SDAG:       ; %bb.0:
-; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-SDAG-NEXT:    v_mul_f16_sdwa v4, v1, v3 dst_sel:DWORD 
dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX10-SDAG-NEXT:    v_mul_f16_sdwa v5, v0, v2 dst_sel:DWORD 
dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX10-SDAG-NEXT:    v_mul_f16_e32 v0, v0, v2
-; GFX10-SDAG-NEXT:    v_mul_f16_e32 v1, v1, v3
-; GFX10-SDAG-NEXT:    v_perm_b32 v0, v5, v0, 0x5040100
-; GFX10-SDAG-NEXT:    v_perm_b32 v1, v4, v1, 0x5040100
-; GFX10-SDAG-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX10-GISEL-LABEL: v_constained_fmul_v4f16_fpexcept_strict:
-; GFX10-GISEL:       ; %bb.0:
-; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-GISEL-NEXT:    v_pk_mul_f16 v0, v0, v2
-; GFX10-GISEL-NEXT:    v_pk_mul_f16 v1, v1, v3
-; GFX10-GISEL-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX11-SDAG-TRUE16-LABEL: v_constained_fmul_v4f16_fpexcept_strict:
-; GFX11-SDAG-TRUE16:       ; %bb.0:
-; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-SDAG-TRUE16-NEXT:    v_mul_f16_e32 v1.h, v1.h, v3.h
-; GFX11-SDAG-TRUE16-NEXT:    v_mul_f16_e32 v0.h, v0.h, v2.h
-; GFX11-SDAG-TRUE16-NEXT:    v_mul_f16_e32 v0.l, v0.l, v2.l
-; GFX11-SDAG-TRUE16-NEXT:    v_mul_f16_e32 v1.l, v1.l, v3.l
-; GFX11-SDAG-TRUE16-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX11-SDAG-FAKE16-LABEL: v_constained_fmul_v4f16_fpexcept_strict:
-; GFX11-SDAG-FAKE16:       ; %bb.0:
-; GFX11-SDAG-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-SDAG-FAKE16-NEXT:    v_lshrrev_b32_e32 v4, 16, v3
-; GFX11-SDAG-FAKE16-NEXT:    v_lshrrev_b32_e32 v5, 16, v2
-; GFX11-SDAG-FAKE16-NEXT:    v_lshr...
[truncated]

``````````

</details>


https://github.com/llvm/llvm-project/pull/179276
_______________________________________________
llvm-branch-commits mailing list
[email protected]
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits

Reply via email to