llvmbot wrote:
<!--LLVM PR SUMMARY COMMENT--> @llvm/pr-subscribers-backend-amdgpu Author: Matt Arsenault (arsenm) <details> <summary>Changes</summary> For multipass instructions, an overlap between VDST and the SRC operands would result in a hardware race and undefined results. Co-authored-by: Pravin Jagtap <Pravin.Jagtap@<!-- -->amd.com> --- Patch is 68.02 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/117822.diff 5 Files Affected: - (modified) llvm/lib/Target/AMDGPU/VOP3Instructions.td (+7-5) - (modified) llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.scalef32.pk.gfx950.ll (+154-60) - (modified) llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.scalef32.pk.ll (+84-84) - (modified) llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.scalef32.sr.pk.gfx950.ll (+24-24) - (modified) llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.scalef32.sr.pk.ll (+84-84) ``````````diff diff --git a/llvm/lib/Target/AMDGPU/VOP3Instructions.td b/llvm/lib/Target/AMDGPU/VOP3Instructions.td index 00caea1f923391..9ef52c0feb7233 100644 --- a/llvm/lib/Target/AMDGPU/VOP3Instructions.td +++ b/llvm/lib/Target/AMDGPU/VOP3Instructions.td @@ -1088,9 +1088,11 @@ let SubtargetPredicate = HasFP4ConversionScaleInsts, mayRaiseFPException = 0 in defm V_CVT_SCALEF32_PK_F32_FP4 : VOP3Inst<"v_cvt_scalef32_pk_f32_fp4", VOP3_CVT_SCALE_PK_F16BF16F32_FP4FP8BF8_Profile<v2f32>>; let Constraints = "$vdst = $vdst_in", DisableEncoding="$vdst_in" in { defm V_CVT_SCALEF32_PK_FP4_F32 : VOP3Inst<"v_cvt_scalef32_pk_fp4_f32", VOP3_CVT_SCALE_FP4FP8BF8_F32_TiedInput_Profile<VOP_I32_F32_F32_F32>>; - defm V_CVT_SCALEF32_SR_PK_FP4_F16: VOP3Inst<"v_cvt_scalef32_sr_pk_fp4_f16", VOP3_CVT_SCALE_SR_PK_F4_F16BF16_TiedInput_Profile<v2f16>>; - defm V_CVT_SCALEF32_SR_PK_FP4_BF16: VOP3Inst<"v_cvt_scalef32_sr_pk_fp4_bf16", VOP3_CVT_SCALE_SR_PK_F4_F16BF16_TiedInput_Profile<v2bf16>>; - defm V_CVT_SCALEF32_SR_PK_FP4_F32: VOP3Inst<"v_cvt_scalef32_sr_pk_fp4_f32", VOP3_CVT_SCALE_SR_PK_F4_F32_TiedInput_Profile>; + let Constraints = "@earlyclobber $vdst" in { + defm 
V_CVT_SCALEF32_SR_PK_FP4_F16: VOP3Inst<"v_cvt_scalef32_sr_pk_fp4_f16", VOP3_CVT_SCALE_SR_PK_F4_F16BF16_TiedInput_Profile<v2f16>>; + defm V_CVT_SCALEF32_SR_PK_FP4_BF16: VOP3Inst<"v_cvt_scalef32_sr_pk_fp4_bf16", VOP3_CVT_SCALE_SR_PK_F4_F16BF16_TiedInput_Profile<v2bf16>>; + defm V_CVT_SCALEF32_SR_PK_FP4_F32: VOP3Inst<"v_cvt_scalef32_sr_pk_fp4_f32", VOP3_CVT_SCALE_SR_PK_F4_F32_TiedInput_Profile>; + } } defm V_CVT_SCALEF32_PK_F16_FP4 : VOP3Inst<"v_cvt_scalef32_pk_f16_fp4", VOP3_CVT_SCALE_PK_F16BF16F32_FP4FP8BF8_Profile<v2f16>>; defm V_CVT_SCALEF32_PK_BF16_FP4 : VOP3Inst<"v_cvt_scalef32_pk_bf16_fp4", VOP3_CVT_SCALE_PK_F16BF16F32_FP4FP8BF8_Profile<v2bf16>>; @@ -1103,7 +1105,7 @@ let SubtargetPredicate = HasFP4ConversionScaleInsts, mayRaiseFPException = 0 in } } -let SubtargetPredicate = HasFP6BF6ConversionScaleInsts, mayRaiseFPException = 0 in { +let SubtargetPredicate = HasFP6BF6ConversionScaleInsts, mayRaiseFPException = 0, Constraints = "@earlyclobber $vdst" in { defm V_CVT_SCALEF32_PK32_F32_FP6 : VOP3Inst<"v_cvt_scalef32_pk32_f32_fp6", VOP3_CVT_SCALEF32_PK_F864_Profile<VOP_V32F32_V6I32_F32>, int_amdgcn_cvt_scalef32_pk32_f32_fp6>; defm V_CVT_SCALEF32_PK32_F32_BF6 : VOP3Inst<"v_cvt_scalef32_pk32_f32_bf6", VOP3_CVT_SCALEF32_PK_F864_Profile<VOP_V32F32_V6I32_F32>, int_amdgcn_cvt_scalef32_pk32_f32_bf6>; defm V_CVT_SCALEF32_PK32_F16_FP6 : VOP3Inst<"v_cvt_scalef32_pk32_f16_fp6", VOP3_CVT_SCALEF32_PK_F864_Profile<VOP_V32F16_V6I32_F32>, int_amdgcn_cvt_scalef32_pk32_f16_fp6>; @@ -1112,7 +1114,7 @@ let SubtargetPredicate = HasFP6BF6ConversionScaleInsts, mayRaiseFPException = 0 defm V_CVT_SCALEF32_PK32_BF16_BF6 : VOP3Inst<"v_cvt_scalef32_pk32_bf16_bf6", VOP3_CVT_SCALEF32_PK_F864_Profile<VOP_V32BF16_V6I32_F32>, int_amdgcn_cvt_scalef32_pk32_bf16_bf6>; } -let SubtargetPredicate = HasF16BF16ToFP6BF6ConversionScaleInsts, mayRaiseFPException = 0 in { +let SubtargetPredicate = HasF16BF16ToFP6BF6ConversionScaleInsts, mayRaiseFPException = 0, Constraints = "@earlyclobber $vdst" in { defm 
V_CVT_SCALEF32_PK32_FP6_F16 : VOP3Inst<"v_cvt_scalef32_pk32_fp6_f16", VOP3_CVT_SCALEF32_PK_F864_Profile<VOP_V6I32_V32F16_F32>, int_amdgcn_cvt_scalef32_pk32_fp6_f16>; defm V_CVT_SCALEF32_PK32_BF6_F16 : VOP3Inst<"v_cvt_scalef32_pk32_bf6_f16", VOP3_CVT_SCALEF32_PK_F864_Profile<VOP_V6I32_V32F16_F32>, int_amdgcn_cvt_scalef32_pk32_bf6_f16>; defm V_CVT_SCALEF32_PK32_FP6_BF16 : VOP3Inst<"v_cvt_scalef32_pk32_fp6_bf16", VOP3_CVT_SCALEF32_PK_F864_Profile<VOP_V6I32_V32BF16_F32>, int_amdgcn_cvt_scalef32_pk32_fp6_bf16>; diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.scalef32.pk.gfx950.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.scalef32.pk.gfx950.ll index 6d627186d25816..f80f2935856e36 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.scalef32.pk.gfx950.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.scalef32.pk.gfx950.ll @@ -864,31 +864,91 @@ define <2 x bfloat> @test_cvt_scale_bf16_fp4_byte3(i32 %src, float %scale) { } define <32 x float> @test_cvt_scale_pk32_f32_fp6(<6 x i32> %src, float %scale) { -; GCN-LABEL: test_cvt_scale_pk32_f32_fp6: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cvt_scalef32_pk32_f32_fp6 v[0:31], v[0:5], v6 -; GCN-NEXT: s_setpc_b64 s[30:31] +; GFX950-SDAG-LABEL: test_cvt_scale_pk32_f32_fp6: +; GFX950-SDAG: ; %bb.0: +; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-SDAG-NEXT: v_mov_b32_e32 v38, v6 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v37, v5 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v36, v4 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v35, v3 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v34, v2 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v33, v1 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v32, v0 +; GFX950-SDAG-NEXT: v_cvt_scalef32_pk32_f32_fp6 v[0:31], v[32:37], v38 +; GFX950-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-GISEL-LABEL: test_cvt_scale_pk32_f32_fp6: +; GFX950-GISEL: ; %bb.0: +; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-GISEL-NEXT: v_mov_b32_e32 v32, v0 +; GFX950-GISEL-NEXT: v_mov_b32_e32 
v33, v1 +; GFX950-GISEL-NEXT: v_mov_b32_e32 v34, v2 +; GFX950-GISEL-NEXT: v_mov_b32_e32 v35, v3 +; GFX950-GISEL-NEXT: v_mov_b32_e32 v36, v4 +; GFX950-GISEL-NEXT: v_mov_b32_e32 v37, v5 +; GFX950-GISEL-NEXT: v_mov_b32_e32 v38, v6 +; GFX950-GISEL-NEXT: v_cvt_scalef32_pk32_f32_fp6 v[0:31], v[32:37], v38 +; GFX950-GISEL-NEXT: s_setpc_b64 s[30:31] %ret = tail call <32 x float> @llvm.amdgcn.cvt.scalef32.pk32.f32.fp6(<6 x i32> %src, float %scale) ret <32 x float> %ret } define <32 x float> @test_cvt_scale_pk32_f32_bf6(<6 x i32> %src, float %scale) { -; GCN-LABEL: test_cvt_scale_pk32_f32_bf6: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cvt_scalef32_pk32_f32_bf6 v[0:31], v[0:5], v6 -; GCN-NEXT: s_setpc_b64 s[30:31] +; GFX950-SDAG-LABEL: test_cvt_scale_pk32_f32_bf6: +; GFX950-SDAG: ; %bb.0: +; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-SDAG-NEXT: v_mov_b32_e32 v38, v6 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v37, v5 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v36, v4 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v35, v3 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v34, v2 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v33, v1 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v32, v0 +; GFX950-SDAG-NEXT: v_cvt_scalef32_pk32_f32_bf6 v[0:31], v[32:37], v38 +; GFX950-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-GISEL-LABEL: test_cvt_scale_pk32_f32_bf6: +; GFX950-GISEL: ; %bb.0: +; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-GISEL-NEXT: v_mov_b32_e32 v32, v0 +; GFX950-GISEL-NEXT: v_mov_b32_e32 v33, v1 +; GFX950-GISEL-NEXT: v_mov_b32_e32 v34, v2 +; GFX950-GISEL-NEXT: v_mov_b32_e32 v35, v3 +; GFX950-GISEL-NEXT: v_mov_b32_e32 v36, v4 +; GFX950-GISEL-NEXT: v_mov_b32_e32 v37, v5 +; GFX950-GISEL-NEXT: v_mov_b32_e32 v38, v6 +; GFX950-GISEL-NEXT: v_cvt_scalef32_pk32_f32_bf6 v[0:31], v[32:37], v38 +; GFX950-GISEL-NEXT: s_setpc_b64 s[30:31] %ret = tail call <32 x float> @llvm.amdgcn.cvt.scalef32.pk32.f32.bf6(<6 x i32> %src, float %scale) ret <32 x float> %ret } 
define <32 x half> @test_cvt_scalef32_pk32_f16_fp6_vv(<6 x i32> %src, float %scale) { -; GCN-LABEL: test_cvt_scalef32_pk32_f16_fp6_vv: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cvt_scalef32_pk32_f16_fp6 v[0:15], v[0:5], v6 -; GCN-NEXT: s_setpc_b64 s[30:31] +; GFX950-SDAG-LABEL: test_cvt_scalef32_pk32_f16_fp6_vv: +; GFX950-SDAG: ; %bb.0: +; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-SDAG-NEXT: v_mov_b32_e32 v22, v6 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v21, v5 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v20, v4 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v19, v3 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v18, v2 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v17, v1 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v16, v0 +; GFX950-SDAG-NEXT: v_cvt_scalef32_pk32_f16_fp6 v[0:15], v[16:21], v22 +; GFX950-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-GISEL-LABEL: test_cvt_scalef32_pk32_f16_fp6_vv: +; GFX950-GISEL: ; %bb.0: +; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-GISEL-NEXT: v_mov_b32_e32 v16, v0 +; GFX950-GISEL-NEXT: v_mov_b32_e32 v17, v1 +; GFX950-GISEL-NEXT: v_mov_b32_e32 v18, v2 +; GFX950-GISEL-NEXT: v_mov_b32_e32 v19, v3 +; GFX950-GISEL-NEXT: v_mov_b32_e32 v20, v4 +; GFX950-GISEL-NEXT: v_mov_b32_e32 v21, v5 +; GFX950-GISEL-NEXT: v_mov_b32_e32 v22, v6 +; GFX950-GISEL-NEXT: v_cvt_scalef32_pk32_f16_fp6 v[0:15], v[16:21], v22 +; GFX950-GISEL-NEXT: s_setpc_b64 s[30:31] %ret = tail call <32 x half> @llvm.amdgcn.cvt.scalef32.pk32.f16.fp6(<6 x i32> %src, float %scale) ret <32 x half> %ret } @@ -897,14 +957,14 @@ define <32 x half> @test_cvt_scalef32_pk32_f16_fp6_sl(<6 x i32> inreg %src) { ; GFX950-SDAG-LABEL: test_cvt_scalef32_pk32_f16_fp6_sl: ; GFX950-SDAG: ; %bb.0: ; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX950-SDAG-NEXT: v_mov_b32_e32 v0, s0 -; GFX950-SDAG-NEXT: v_mov_b32_e32 v1, s1 -; GFX950-SDAG-NEXT: v_mov_b32_e32 v2, s2 -; GFX950-SDAG-NEXT: v_mov_b32_e32 v3, s3 -; GFX950-SDAG-NEXT: v_mov_b32_e32 v4, 
s16 -; GFX950-SDAG-NEXT: v_mov_b32_e32 v5, s17 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v16, s0 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v17, s1 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v18, s2 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v19, s3 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v20, s16 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v21, s17 ; GFX950-SDAG-NEXT: s_mov_b32 s0, 0x42c80000 -; GFX950-SDAG-NEXT: v_cvt_scalef32_pk32_f16_fp6 v[0:15], v[0:5], s0 +; GFX950-SDAG-NEXT: v_cvt_scalef32_pk32_f16_fp6 v[0:15], v[16:21], s0 ; GFX950-SDAG-NEXT: s_setpc_b64 s[30:31] ; ; GFX950-GISEL-LABEL: test_cvt_scalef32_pk32_f16_fp6_sl: @@ -912,11 +972,11 @@ define <32 x half> @test_cvt_scalef32_pk32_f16_fp6_sl(<6 x i32> inreg %src) { ; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX950-GISEL-NEXT: s_mov_b32 s4, s16 ; GFX950-GISEL-NEXT: s_mov_b32 s5, s17 -; GFX950-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[0:1] -; GFX950-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] -; GFX950-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[4:5] -; GFX950-GISEL-NEXT: v_mov_b32_e32 v6, 0x42c80000 -; GFX950-GISEL-NEXT: v_cvt_scalef32_pk32_f16_fp6 v[0:15], v[0:5], v6 +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[20:21], s[4:5] +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[18:19], s[2:3] +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[16:17], s[0:1] +; GFX950-GISEL-NEXT: v_mov_b32_e32 v22, 0x42c80000 +; GFX950-GISEL-NEXT: v_cvt_scalef32_pk32_f16_fp6 v[0:15], v[16:21], v22 ; GFX950-GISEL-NEXT: s_setpc_b64 s[30:31] %ret = tail call <32 x half> @llvm.amdgcn.cvt.scalef32.pk32.f16.fp6(<6 x i32> %src, float 100.0) ret <32 x half> %ret @@ -926,7 +986,14 @@ define <32 x bfloat> @test_cvt_scalef32_pk32_bf16_fp6_vv(<6 x i32> %src, float % ; GFX950-SDAG-LABEL: test_cvt_scalef32_pk32_bf16_fp6_vv: ; GFX950-SDAG: ; %bb.0: ; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX950-SDAG-NEXT: v_cvt_scalef32_pk32_bf16_fp6 v[0:15], v[0:5], v6 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v22, v6 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v21, v5 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v20, v4 +; 
GFX950-SDAG-NEXT: v_mov_b32_e32 v19, v3 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v18, v2 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v17, v1 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v16, v0 +; GFX950-SDAG-NEXT: v_cvt_scalef32_pk32_bf16_fp6 v[0:15], v[16:21], v22 ; GFX950-SDAG-NEXT: s_setpc_b64 s[30:31] ; ; GFX950-GISEL-LABEL: test_cvt_scalef32_pk32_bf16_fp6_vv: @@ -958,14 +1025,14 @@ define <32 x bfloat> @test_cvt_scalef32_pk32_bf16_fp6_sl(<6 x i32> inreg %src) { ; GFX950-SDAG-LABEL: test_cvt_scalef32_pk32_bf16_fp6_sl: ; GFX950-SDAG: ; %bb.0: ; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX950-SDAG-NEXT: v_mov_b32_e32 v0, s0 -; GFX950-SDAG-NEXT: v_mov_b32_e32 v1, s1 -; GFX950-SDAG-NEXT: v_mov_b32_e32 v2, s2 -; GFX950-SDAG-NEXT: v_mov_b32_e32 v3, s3 -; GFX950-SDAG-NEXT: v_mov_b32_e32 v4, s16 -; GFX950-SDAG-NEXT: v_mov_b32_e32 v5, s17 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v16, s0 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v17, s1 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v18, s2 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v19, s3 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v20, s16 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v21, s17 ; GFX950-SDAG-NEXT: s_mov_b32 s0, 0x42c80000 -; GFX950-SDAG-NEXT: v_cvt_scalef32_pk32_bf16_fp6 v[0:15], v[0:5], s0 +; GFX950-SDAG-NEXT: v_cvt_scalef32_pk32_bf16_fp6 v[0:15], v[16:21], s0 ; GFX950-SDAG-NEXT: s_setpc_b64 s[30:31] ; ; GFX950-GISEL-LABEL: test_cvt_scalef32_pk32_bf16_fp6_sl: @@ -1000,11 +1067,31 @@ define <32 x bfloat> @test_cvt_scalef32_pk32_bf16_fp6_sl(<6 x i32> inreg %src) { } define <32 x half> @test_cvt_scalef32_pk32_f16_bf6_vv(<6 x i32> %src, float %scale) { -; GCN-LABEL: test_cvt_scalef32_pk32_f16_bf6_vv: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cvt_scalef32_pk32_f16_bf6 v[0:15], v[0:5], v6 -; GCN-NEXT: s_setpc_b64 s[30:31] +; GFX950-SDAG-LABEL: test_cvt_scalef32_pk32_f16_bf6_vv: +; GFX950-SDAG: ; %bb.0: +; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-SDAG-NEXT: v_mov_b32_e32 v22, v6 +; 
GFX950-SDAG-NEXT: v_mov_b32_e32 v21, v5 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v20, v4 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v19, v3 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v18, v2 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v17, v1 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v16, v0 +; GFX950-SDAG-NEXT: v_cvt_scalef32_pk32_f16_bf6 v[0:15], v[16:21], v22 +; GFX950-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-GISEL-LABEL: test_cvt_scalef32_pk32_f16_bf6_vv: +; GFX950-GISEL: ; %bb.0: +; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-GISEL-NEXT: v_mov_b32_e32 v16, v0 +; GFX950-GISEL-NEXT: v_mov_b32_e32 v17, v1 +; GFX950-GISEL-NEXT: v_mov_b32_e32 v18, v2 +; GFX950-GISEL-NEXT: v_mov_b32_e32 v19, v3 +; GFX950-GISEL-NEXT: v_mov_b32_e32 v20, v4 +; GFX950-GISEL-NEXT: v_mov_b32_e32 v21, v5 +; GFX950-GISEL-NEXT: v_mov_b32_e32 v22, v6 +; GFX950-GISEL-NEXT: v_cvt_scalef32_pk32_f16_bf6 v[0:15], v[16:21], v22 +; GFX950-GISEL-NEXT: s_setpc_b64 s[30:31] %ret = tail call <32 x half> @llvm.amdgcn.cvt.scalef32.pk32.f16.bf6(<6 x i32> %src, float %scale) ret <32 x half> %ret } @@ -1013,14 +1100,14 @@ define <32 x half> @test_cvt_scalef32_pk32_f16_bf6_sl(<6 x i32> inreg %src) { ; GFX950-SDAG-LABEL: test_cvt_scalef32_pk32_f16_bf6_sl: ; GFX950-SDAG: ; %bb.0: ; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX950-SDAG-NEXT: v_mov_b32_e32 v0, s0 -; GFX950-SDAG-NEXT: v_mov_b32_e32 v1, s1 -; GFX950-SDAG-NEXT: v_mov_b32_e32 v2, s2 -; GFX950-SDAG-NEXT: v_mov_b32_e32 v3, s3 -; GFX950-SDAG-NEXT: v_mov_b32_e32 v4, s16 -; GFX950-SDAG-NEXT: v_mov_b32_e32 v5, s17 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v16, s0 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v17, s1 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v18, s2 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v19, s3 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v20, s16 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v21, s17 ; GFX950-SDAG-NEXT: s_mov_b32 s0, 0x42c80000 -; GFX950-SDAG-NEXT: v_cvt_scalef32_pk32_f16_bf6 v[0:15], v[0:5], s0 +; GFX950-SDAG-NEXT: v_cvt_scalef32_pk32_f16_bf6 v[0:15], 
v[16:21], s0 ; GFX950-SDAG-NEXT: s_setpc_b64 s[30:31] ; ; GFX950-GISEL-LABEL: test_cvt_scalef32_pk32_f16_bf6_sl: @@ -1028,11 +1115,11 @@ define <32 x half> @test_cvt_scalef32_pk32_f16_bf6_sl(<6 x i32> inreg %src) { ; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX950-GISEL-NEXT: s_mov_b32 s4, s16 ; GFX950-GISEL-NEXT: s_mov_b32 s5, s17 -; GFX950-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[0:1] -; GFX950-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] -; GFX950-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[4:5] -; GFX950-GISEL-NEXT: v_mov_b32_e32 v6, 0x42c80000 -; GFX950-GISEL-NEXT: v_cvt_scalef32_pk32_f16_bf6 v[0:15], v[0:5], v6 +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[20:21], s[4:5] +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[18:19], s[2:3] +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[16:17], s[0:1] +; GFX950-GISEL-NEXT: v_mov_b32_e32 v22, 0x42c80000 +; GFX950-GISEL-NEXT: v_cvt_scalef32_pk32_f16_bf6 v[0:15], v[16:21], v22 ; GFX950-GISEL-NEXT: s_setpc_b64 s[30:31] %ret = tail call <32 x half> @llvm.amdgcn.cvt.scalef32.pk32.f16.bf6(<6 x i32> %src, float 100.0) ret <32 x half> %ret @@ -1042,7 +1129,14 @@ define <32 x bfloat> @test_cvt_scalef32_pk32_bf16_bf6_vv(<6 x i32> %src, float % ; GFX950-SDAG-LABEL: test_cvt_scalef32_pk32_bf16_bf6_vv: ; GFX950-SDAG: ; %bb.0: ; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX950-SDAG-NEXT: v_cvt_scalef32_pk32_bf16_bf6 v[0:15], v[0:5], v6 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v22, v6 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v21, v5 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v20, v4 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v19, v3 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v18, v2 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v17, v1 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v16, v0 +; GFX950-SDAG-NEXT: v_cvt_scalef32_pk32_bf16_bf6 v[0:15], v[16:21], v22 ; GFX950-SDAG-NEXT: s_setpc_b64 s[30:31] ; ; GFX950-GISEL-LABEL: test_cvt_scalef32_pk32_bf16_bf6_vv: @@ -1074,14 +1168,14 @@ define <32 x bfloat> @test_cvt_scalef32_pk32_bf16_bf6_sl(<6 x i32> inreg %src) { ; GFX950-SDAG-LABEL: 
test_cvt_scalef32_pk32_bf16_bf6_sl: ; GFX950-SDAG: ; %bb.0: ; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX950-SDAG-NEXT: v_mov_b32_e32 v0, s0 -; GFX950-SDAG-NEXT: v_mov_b32_e32 v1, s1 -; GFX950-SDAG-NEXT: v_mov_b32_e32 v2, s2 -; GFX950-SDAG-NEXT: v_mov_b32_e32 v3, s3 -; GFX950-SDAG-NEXT: v_mov_b32_e32 v4, s16 -; GFX950-SDAG-NEXT: v_mov_b32_e32 v5, s17 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v16, s0 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v17, s1 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v18, s2 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v19, s3 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v20, s16 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v21, s17 ; GFX950-SDAG-NEXT: s_mov_b32 s0, 0x42c80000 -; GFX950-SDAG-NEXT: v_cvt_scalef32_pk32_bf16_bf6 v[0:15], v[0:5], s0 +; GFX950-SDAG-NEXT: v_cvt_scalef32_pk32_bf16_bf6 v[0:15], v[16:21], s0 ; GFX950-SDAG-NEXT: s_setpc_b64 s[30:31] ; ; GFX950-GISEL-LABEL: test_cvt_scalef32_pk32_bf16_bf6_sl: diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.scalef32.pk.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.scalef32.pk.ll index 4153bc8f43563b..f9fd7e253b1243 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.scalef32.pk.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.scalef32.pk.ll @@ -10,24 +10,24 @@ declare <6 x i32> @llvm.amdgcn.cvt.scalef32.pk32.fp6.f16(<32 x half> %src, float define amdgpu_ps void @test_scalef32_pk32_bf6_bf16_vv(<32 x bfloat> %src, float %scale, ptr addrspace(1) %out) { ; GFX950-SDAG-LABEL: test_scalef32_pk32_bf6_bf16_vv: ; GFX950-SDAG: ; %bb.0: -; GFX950-SDAG-NEXT: v_mov_b32_e32 v19, v18 -; GFX950-SDAG-NEXT: v_mov_b32_e32 v18, v17 -; GFX950-SDAG-NEXT: v_cvt_scalef32_pk32_bf6_bf16 v[0:5], v[0:15], v16 -; GFX950-SDAG-NEXT: global_store_dwordx2 v[18:19], v[4:5], off offset:16 -; GFX950-SDAG-NEXT: global_store_dwordx4 v[18:19], v[0:3], off +; GFX950-SDAG-NEXT: v_mov_b32_e32 v25, v18 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v24, v17 +; GFX950-SDAG-NEXT: v_cvt_scalef32_pk32_bf6_bf16 v[18:23], v[0:15], v16 +; GFX950-SDAG-NEXT: 
global_store_dwordx2 v[24:25], v[22:23], off offset:16 +; GFX950-SDAG-NEXT: global_store_dwordx4 v[24:25], v[18:21], off ; GFX950-SDAG-NEXT: s_endpgm ; ; GFX950-GISEL-LABEL: test_scalef32_pk32_bf6_bf16_vv: ; GFX950-GISEL: ; %bb.0: -; GFX950-GISEL-NEXT: v_mov_b32_e32 v20, v17 -; GFX950-GISEL-NEXT: v_mov_b32_e32 v21, v18 +; GFX950-GISEL-NEXT: v_mov_b32_e32 v24, v17 +; GFX950-GISEL-NEXT: v_mov_b... [truncated] `````````` </details> https://github.com/llvm/llvm-project/pull/117822 _______________________________________________ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits