https://github.com/rampitec updated https://github.com/llvm/llvm-project/pull/151765
>From b0154731dc024d812de24ba138270b3cdf2b0d4b Mon Sep 17 00:00:00 2001 From: Stanislav Mekhanoshin <stanislav.mekhanos...@amd.com> Date: Fri, 1 Aug 2025 13:10:57 -0700 Subject: [PATCH] [AMDGPU] gfx1250 v_cvt_scalef32_sr_pk8_* instructions --- clang/include/clang/Basic/BuiltinsAMDGPU.def | 9 + .../CodeGenOpenCL/builtins-amdgcn-gfx1250.cl | 106 +++++ llvm/include/llvm/IR/IntrinsicsAMDGPU.td | 19 +- .../Target/AMDGPU/AMDGPURegisterBankInfo.cpp | 9 + llvm/lib/Target/AMDGPU/SIInstrInfo.td | 6 + llvm/lib/Target/AMDGPU/VOP3Instructions.td | 21 + .../llvm.amdgcn.cvt.scalef32.sr.pk.gfx1250.ll | 385 ++++++++++++++++++ llvm/test/MC/AMDGPU/gfx1250_asm_vop3-fake16.s | 54 +++ llvm/test/MC/AMDGPU/gfx1250_asm_vop3.s | 54 +++ .../Disassembler/AMDGPU/gfx1250_dasm_vop3.txt | 54 +++ 10 files changed, 712 insertions(+), 5 deletions(-) create mode 100644 llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.scalef32.sr.pk.gfx1250.ll diff --git a/clang/include/clang/Basic/BuiltinsAMDGPU.def b/clang/include/clang/Basic/BuiltinsAMDGPU.def index e117e993fc572..9196f5583e45f 100644 --- a/clang/include/clang/Basic/BuiltinsAMDGPU.def +++ b/clang/include/clang/Basic/BuiltinsAMDGPU.def @@ -725,6 +725,15 @@ TARGET_BUILTIN(__builtin_amdgcn_cvt_scalef32_pk8_bf8_f32, "V2UiV8ff", "nc", "gfx TARGET_BUILTIN(__builtin_amdgcn_cvt_scalef32_pk8_fp4_f32, "UiV8ff", "nc", "gfx1250-insts") TARGET_BUILTIN(__builtin_amdgcn_cvt_scalef32_pk8_fp4_f16, "UiV8hf", "nc", "gfx1250-insts") TARGET_BUILTIN(__builtin_amdgcn_cvt_scalef32_pk8_fp4_bf16, "UiV8yf", "nc", "gfx1250-insts") +TARGET_BUILTIN(__builtin_amdgcn_cvt_scalef32_sr_pk8_fp8_bf16, "V2UiV8yUif", "nc", "gfx1250-insts") +TARGET_BUILTIN(__builtin_amdgcn_cvt_scalef32_sr_pk8_bf8_bf16, "V2UiV8yUif", "nc", "gfx1250-insts") +TARGET_BUILTIN(__builtin_amdgcn_cvt_scalef32_sr_pk8_fp8_f16, "V2UiV8hUif", "nc", "gfx1250-insts") +TARGET_BUILTIN(__builtin_amdgcn_cvt_scalef32_sr_pk8_bf8_f16, "V2UiV8hUif", "nc", "gfx1250-insts") +TARGET_BUILTIN(__builtin_amdgcn_cvt_scalef32_sr_pk8_fp8_f32, "V2UiV8fUif", "nc", "gfx1250-insts") +TARGET_BUILTIN(__builtin_amdgcn_cvt_scalef32_sr_pk8_bf8_f32, "V2UiV8fUif", "nc", "gfx1250-insts") +TARGET_BUILTIN(__builtin_amdgcn_cvt_scalef32_sr_pk8_fp4_f32, "UiV8fUif", "nc", "gfx1250-insts") +TARGET_BUILTIN(__builtin_amdgcn_cvt_scalef32_sr_pk8_fp4_f16, "UiV8hUif", "nc", "gfx1250-insts") +TARGET_BUILTIN(__builtin_amdgcn_cvt_scalef32_sr_pk8_fp4_bf16, "UiV8yUif", "nc", "gfx1250-insts") TARGET_BUILTIN(__builtin_amdgcn_cvt_pk_fp8_f32_e5m3, "iffiIb", "nc", "fp8e5m3-insts") TARGET_BUILTIN(__builtin_amdgcn_cvt_sr_fp8_f32_e5m3, "ifiiIi", "nc", "fp8e5m3-insts") TARGET_BUILTIN(__builtin_amdgcn_sat_pk4_i4_i8, "UsUi", "nc", "gfx1250-insts") diff --git a/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx1250.cl b/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx1250.cl index 150c6ce0b76ee..177df6c1e555a 100644 --- a/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx1250.cl +++ b/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx1250.cl @@ -768,6 +768,112 @@ void test_cvt_scalef32_pk(global uint2 *out2, bfloat8 srcbf8, half8 srch8, float *out1 = __builtin_amdgcn_cvt_scalef32_pk8_fp4_bf16(srcbf8, scale); } +// CHECK-LABEL: @test_cvt_scalef32_sr_pk( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[OUT2_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5) +// CHECK-NEXT: [[SRCBF8_ADDR:%.*]] = alloca <8 x bfloat>, align 16, addrspace(5) +// CHECK-NEXT: [[SRCH8_ADDR:%.*]] = alloca <8 x half>, align 16, addrspace(5) +// CHECK-NEXT: [[SRCF8_ADDR:%.*]] = alloca <8 x float>, align 32, addrspace(5) +// CHECK-NEXT: [[OUT3_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5) +// CHECK-NEXT: [[SRCBF16_ADDR:%.*]] = alloca <16 x bfloat>, align 32, addrspace(5) +// CHECK-NEXT: [[SRCH16_ADDR:%.*]] = alloca <16 x half>, align 32, addrspace(5) +// CHECK-NEXT: [[SRCF16_ADDR:%.*]] = alloca <16 x float>, align 64, addrspace(5) +// CHECK-NEXT: [[OUT1_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5) +// CHECK-NEXT: [[SR_ADDR:%.*]] = alloca i32, align 4, addrspace(5) +// CHECK-NEXT: [[SCALE_ADDR:%.*]] = alloca float, align 4, addrspace(5) +// CHECK-NEXT: [[OUT2_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[OUT2_ADDR]] to ptr +// CHECK-NEXT: [[SRCBF8_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[SRCBF8_ADDR]] to ptr +// CHECK-NEXT: [[SRCH8_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[SRCH8_ADDR]] to ptr +// CHECK-NEXT: [[SRCF8_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[SRCF8_ADDR]] to ptr +// CHECK-NEXT: [[OUT3_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[OUT3_ADDR]] to ptr +// CHECK-NEXT: [[SRCBF16_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[SRCBF16_ADDR]] to ptr +// CHECK-NEXT: [[SRCH16_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[SRCH16_ADDR]] to ptr +// CHECK-NEXT: [[SRCF16_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[SRCF16_ADDR]] to ptr +// CHECK-NEXT: [[OUT1_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[OUT1_ADDR]] to ptr +// CHECK-NEXT: [[SR_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[SR_ADDR]] to ptr +// CHECK-NEXT: [[SCALE_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[SCALE_ADDR]] to ptr +// CHECK-NEXT: store ptr addrspace(1) [[OUT2:%.*]], ptr [[OUT2_ADDR_ASCAST]], align 8 +// CHECK-NEXT: store <8 x bfloat> [[SRCBF8:%.*]], ptr [[SRCBF8_ADDR_ASCAST]], align 16 +// CHECK-NEXT: store <8 x half> [[SRCH8:%.*]], ptr [[SRCH8_ADDR_ASCAST]], align 16 +// CHECK-NEXT: store <8 x float> [[SRCF8:%.*]], ptr [[SRCF8_ADDR_ASCAST]], align 32 +// CHECK-NEXT: store ptr addrspace(1) [[OUT3:%.*]], ptr [[OUT3_ADDR_ASCAST]], align 8 +// CHECK-NEXT: store <16 x bfloat> [[SRCBF16:%.*]], ptr [[SRCBF16_ADDR_ASCAST]], align 32 +// CHECK-NEXT: store <16 x half> [[SRCH16:%.*]], ptr [[SRCH16_ADDR_ASCAST]], align 32 +// CHECK-NEXT: store <16 x float> [[SRCF16:%.*]], ptr [[SRCF16_ADDR_ASCAST]], align 64 +// CHECK-NEXT: store ptr addrspace(1) [[OUT1:%.*]], ptr [[OUT1_ADDR_ASCAST]], align 8 +// CHECK-NEXT: store i32 [[SR:%.*]], ptr [[SR_ADDR_ASCAST]], align 4 +// CHECK-NEXT: store float [[SCALE:%.*]], ptr [[SCALE_ADDR_ASCAST]], align 4 +// CHECK-NEXT: [[TMP0:%.*]] = load <8 x bfloat>, ptr [[SRCBF8_ADDR_ASCAST]], align 16 +// CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr [[SR_ADDR_ASCAST]], align 4 +// CHECK-NEXT: [[TMP2:%.*]] = load float, ptr [[SCALE_ADDR_ASCAST]], align 4 +// CHECK-NEXT: [[TMP3:%.*]] = call <2 x i32> @llvm.amdgcn.cvt.scalef32.sr.pk8.fp8.bf16(<8 x bfloat> [[TMP0]], i32 [[TMP1]], float [[TMP2]]) +// CHECK-NEXT: [[TMP4:%.*]] = load ptr addrspace(1), ptr [[OUT2_ADDR_ASCAST]], align 8 +// CHECK-NEXT: store <2 x i32> [[TMP3]], ptr addrspace(1) [[TMP4]], align 8 +// CHECK-NEXT: [[TMP5:%.*]] = load <8 x bfloat>, ptr [[SRCBF8_ADDR_ASCAST]], align 16 +// CHECK-NEXT: [[TMP6:%.*]] = load i32, ptr [[SR_ADDR_ASCAST]], align 4 +// CHECK-NEXT: [[TMP7:%.*]] = load float, ptr [[SCALE_ADDR_ASCAST]], align 4 +// CHECK-NEXT: [[TMP8:%.*]] = call <2 x i32> @llvm.amdgcn.cvt.scalef32.sr.pk8.bf8.bf16(<8 x bfloat> [[TMP5]], i32 [[TMP6]], float [[TMP7]]) +// CHECK-NEXT: [[TMP9:%.*]] = load ptr addrspace(1), ptr [[OUT2_ADDR_ASCAST]], align 8 +// CHECK-NEXT: store <2 x i32> [[TMP8]], ptr addrspace(1) [[TMP9]], align 8 +// CHECK-NEXT: [[TMP10:%.*]] = load <8 x half>, ptr [[SRCH8_ADDR_ASCAST]], align 16 +// CHECK-NEXT: [[TMP11:%.*]] = load i32, ptr [[SR_ADDR_ASCAST]], align 4 +// CHECK-NEXT: [[TMP12:%.*]] = load float, ptr [[SCALE_ADDR_ASCAST]], align 4 +// CHECK-NEXT: [[TMP13:%.*]] = call <2 x i32> @llvm.amdgcn.cvt.scalef32.sr.pk8.fp8.f16(<8 x half> [[TMP10]], i32 [[TMP11]], float [[TMP12]]) +// CHECK-NEXT: [[TMP14:%.*]] = load ptr addrspace(1), ptr [[OUT2_ADDR_ASCAST]], align 8 +// CHECK-NEXT: store <2 x i32> [[TMP13]], ptr addrspace(1) [[TMP14]], align 8 +// CHECK-NEXT: [[TMP15:%.*]] = load <8 x half>, ptr [[SRCH8_ADDR_ASCAST]], align 16 +// CHECK-NEXT: [[TMP16:%.*]] = load i32, ptr [[SR_ADDR_ASCAST]], align 4 +// CHECK-NEXT: [[TMP17:%.*]] = load float, ptr [[SCALE_ADDR_ASCAST]], align 4 +// CHECK-NEXT: [[TMP18:%.*]] = call <2 x i32> @llvm.amdgcn.cvt.scalef32.sr.pk8.bf8.f16(<8 x half> [[TMP15]], i32 [[TMP16]], float [[TMP17]]) +// CHECK-NEXT: [[TMP19:%.*]] = load ptr addrspace(1), ptr [[OUT2_ADDR_ASCAST]], align 8 +// CHECK-NEXT: store <2 x i32> [[TMP18]], ptr addrspace(1) [[TMP19]], align 8 +// CHECK-NEXT: [[TMP20:%.*]] = load <8 x float>, ptr [[SRCF8_ADDR_ASCAST]], align 32 +// CHECK-NEXT: [[TMP21:%.*]] = load i32, ptr [[SR_ADDR_ASCAST]], align 4 +// CHECK-NEXT: [[TMP22:%.*]] = load float, ptr [[SCALE_ADDR_ASCAST]], align 4 +// CHECK-NEXT: [[TMP23:%.*]] = call <2 x i32> @llvm.amdgcn.cvt.scalef32.sr.pk8.fp8.f32(<8 x float> [[TMP20]], i32 [[TMP21]], float [[TMP22]]) +// CHECK-NEXT: [[TMP24:%.*]] = load ptr addrspace(1), ptr [[OUT2_ADDR_ASCAST]], align 8 +// CHECK-NEXT: store <2 x i32> [[TMP23]], ptr addrspace(1) [[TMP24]], align 8 +// CHECK-NEXT: [[TMP25:%.*]] = load <8 x float>, ptr [[SRCF8_ADDR_ASCAST]], align 32 +// CHECK-NEXT: [[TMP26:%.*]] = load i32, ptr [[SR_ADDR_ASCAST]], align 4 +// CHECK-NEXT: [[TMP27:%.*]] = load float, ptr [[SCALE_ADDR_ASCAST]], align 4 +// CHECK-NEXT: [[TMP28:%.*]] = call <2 x i32> @llvm.amdgcn.cvt.scalef32.sr.pk8.bf8.f32(<8 x float> [[TMP25]], i32 [[TMP26]], float [[TMP27]]) +// CHECK-NEXT: [[TMP29:%.*]] = load ptr addrspace(1), ptr [[OUT2_ADDR_ASCAST]], align 8 +// CHECK-NEXT: store <2 x i32> [[TMP28]], ptr addrspace(1) [[TMP29]], align 8 +// CHECK-NEXT: [[TMP30:%.*]] = load <8 x float>, ptr [[SRCF8_ADDR_ASCAST]], align 32 +// CHECK-NEXT: [[TMP31:%.*]] = load i32, ptr [[SR_ADDR_ASCAST]], align 4 +// CHECK-NEXT: [[TMP32:%.*]] = load float, ptr [[SCALE_ADDR_ASCAST]], align 4 +// CHECK-NEXT: [[TMP33:%.*]] = call i32 @llvm.amdgcn.cvt.scalef32.sr.pk8.fp4.f32(<8 x float> [[TMP30]], i32 [[TMP31]], float [[TMP32]]) +// CHECK-NEXT: [[TMP34:%.*]] = load ptr addrspace(1), ptr [[OUT1_ADDR_ASCAST]], align 8 +// CHECK-NEXT: store i32 [[TMP33]], ptr addrspace(1) [[TMP34]], align 4 +// CHECK-NEXT: [[TMP35:%.*]] = load <8 x half>, ptr [[SRCH8_ADDR_ASCAST]], align 16 +// CHECK-NEXT: [[TMP36:%.*]] = load i32, ptr [[SR_ADDR_ASCAST]], align 4 +// CHECK-NEXT: [[TMP37:%.*]] = load float, ptr [[SCALE_ADDR_ASCAST]], align 4 +// CHECK-NEXT: [[TMP38:%.*]] = call i32 @llvm.amdgcn.cvt.scalef32.sr.pk8.fp4.f16(<8 x half> [[TMP35]], i32 [[TMP36]], float [[TMP37]]) +// CHECK-NEXT: [[TMP39:%.*]] = load ptr addrspace(1), ptr [[OUT1_ADDR_ASCAST]], align 8 +// CHECK-NEXT: store i32 [[TMP38]], ptr addrspace(1) [[TMP39]], align 4 +// CHECK-NEXT: [[TMP40:%.*]] = load <8 x bfloat>, ptr [[SRCBF8_ADDR_ASCAST]], align 16 +// CHECK-NEXT: [[TMP41:%.*]] = load i32, ptr [[SR_ADDR_ASCAST]], align 4 +// CHECK-NEXT: [[TMP42:%.*]] = load float, ptr [[SCALE_ADDR_ASCAST]], align 4 +// CHECK-NEXT: [[TMP43:%.*]] = call i32 @llvm.amdgcn.cvt.scalef32.sr.pk8.fp4.bf16(<8 x bfloat> [[TMP40]], i32 [[TMP41]], float [[TMP42]]) +// CHECK-NEXT: [[TMP44:%.*]] = load ptr addrspace(1), ptr [[OUT1_ADDR_ASCAST]], align 8 +// CHECK-NEXT: store i32 [[TMP43]], ptr addrspace(1) [[TMP44]], align 4 +// CHECK-NEXT: ret void +// +void test_cvt_scalef32_sr_pk(global uint2 *out2, bfloat8 srcbf8, half8 srch8, float8 srcf8, + global uint3 *out3, bfloat16 srcbf16, half16 srch16, float16 srcf16, + global uint *out1, uint sr, float scale) +{ + *out2 = __builtin_amdgcn_cvt_scalef32_sr_pk8_fp8_bf16(srcbf8, sr, scale); + *out2 = __builtin_amdgcn_cvt_scalef32_sr_pk8_bf8_bf16(srcbf8, sr, scale); + *out2 = __builtin_amdgcn_cvt_scalef32_sr_pk8_fp8_f16(srch8, sr, scale); + *out2 = __builtin_amdgcn_cvt_scalef32_sr_pk8_bf8_f16(srch8, sr, scale); + *out2 = __builtin_amdgcn_cvt_scalef32_sr_pk8_fp8_f32(srcf8, sr, scale); + *out2 = __builtin_amdgcn_cvt_scalef32_sr_pk8_bf8_f32(srcf8, sr, scale); + *out1 = __builtin_amdgcn_cvt_scalef32_sr_pk8_fp4_f32(srcf8, sr, scale); + *out1 = __builtin_amdgcn_cvt_scalef32_sr_pk8_fp4_f16(srch8, sr, scale); + *out1 = __builtin_amdgcn_cvt_scalef32_sr_pk8_fp4_bf16(srcbf8, sr, scale); +} + // CHECK-LABEL: @test_sat_pk4_i4_i8( // CHECK-NEXT: entry: // CHECK-NEXT: [[OUT_ADDR:%.*]] = alloca ptr, align 8, addrspace(5) diff --git a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td index e85f9864cb1ce..af7b757f6ebe9 100644 --- a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td +++ b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td @@ -674,12 +674,21 @@ def int_amdgcn_cvt_scalef32_pk8_fp4_f32 : AMDGPUCvtScaleF32Intrinsic<llvm_i32_ def int_amdgcn_cvt_scalef32_pk8_fp4_f16 : AMDGPUCvtScaleF32Intrinsic<llvm_i32_ty, llvm_v8f16_ty, "cvt_scalef32_pk8_fp4_f16">; def int_amdgcn_cvt_scalef32_pk8_fp4_bf16 : AMDGPUCvtScaleF32Intrinsic<llvm_i32_ty, llvm_v8bf16_ty, "cvt_scalef32_pk8_fp4_bf16">; -def int_amdgcn_cvt_scalef32_sr_pk32_bf6_bf16 : AMDGPUCvtScaleF32SRIntrinsic<llvm_v6i32_ty, llvm_v32bf16_ty, "cvt_scalef32_sr_pk32_bf6_bf16">; -def int_amdgcn_cvt_scalef32_sr_pk32_bf6_f16 : AMDGPUCvtScaleF32SRIntrinsic<llvm_v6i32_ty, llvm_v32f16_ty, "cvt_scalef32_sr_pk32_bf6_f16">; -def int_amdgcn_cvt_scalef32_sr_pk32_bf6_f32 : AMDGPUCvtScaleF32SRIntrinsic<llvm_v6i32_ty, llvm_v32f32_ty, "cvt_scalef32_sr_pk32_bf6_f32">; +def int_amdgcn_cvt_scalef32_sr_pk32_fp6_f32 : AMDGPUCvtScaleF32SRIntrinsic<llvm_v6i32_ty, llvm_v32f32_ty, "cvt_scalef32_sr_pk32_fp6_f32">; +def int_amdgcn_cvt_scalef32_sr_pk32_bf6_f32 : AMDGPUCvtScaleF32SRIntrinsic<llvm_v6i32_ty, llvm_v32f32_ty, "cvt_scalef32_sr_pk32_bf6_f32">; +def int_amdgcn_cvt_scalef32_sr_pk32_fp6_f16 : AMDGPUCvtScaleF32SRIntrinsic<llvm_v6i32_ty, llvm_v32f16_ty, "cvt_scalef32_sr_pk32_fp6_f16">; +def int_amdgcn_cvt_scalef32_sr_pk32_bf6_f16 : AMDGPUCvtScaleF32SRIntrinsic<llvm_v6i32_ty, llvm_v32f16_ty, "cvt_scalef32_sr_pk32_bf6_f16">; def int_amdgcn_cvt_scalef32_sr_pk32_fp6_bf16 : AMDGPUCvtScaleF32SRIntrinsic<llvm_v6i32_ty, llvm_v32bf16_ty, "cvt_scalef32_sr_pk32_fp6_bf16">; -def int_amdgcn_cvt_scalef32_sr_pk32_fp6_f16 : AMDGPUCvtScaleF32SRIntrinsic<llvm_v6i32_ty, llvm_v32f16_ty, "cvt_scalef32_sr_pk32_fp6_f16">; -def int_amdgcn_cvt_scalef32_sr_pk32_fp6_f32 : AMDGPUCvtScaleF32SRIntrinsic<llvm_v6i32_ty, llvm_v32f32_ty, "cvt_scalef32_sr_pk32_fp6_f32">; +def int_amdgcn_cvt_scalef32_sr_pk32_bf6_bf16 : AMDGPUCvtScaleF32SRIntrinsic<llvm_v6i32_ty, llvm_v32bf16_ty, "cvt_scalef32_sr_pk32_bf6_bf16">; +def int_amdgcn_cvt_scalef32_sr_pk8_fp8_bf16 : AMDGPUCvtScaleF32SRIntrinsic<llvm_v2i32_ty, llvm_v8bf16_ty, "cvt_scalef32_sr_pk8_fp8_bf16">; +def int_amdgcn_cvt_scalef32_sr_pk8_bf8_bf16 : AMDGPUCvtScaleF32SRIntrinsic<llvm_v2i32_ty, llvm_v8bf16_ty, "cvt_scalef32_sr_pk8_bf8_bf16">; +def int_amdgcn_cvt_scalef32_sr_pk8_fp8_f16 : AMDGPUCvtScaleF32SRIntrinsic<llvm_v2i32_ty, llvm_v8f16_ty, "cvt_scalef32_sr_pk8_fp8_f16">; +def int_amdgcn_cvt_scalef32_sr_pk8_bf8_f16 : AMDGPUCvtScaleF32SRIntrinsic<llvm_v2i32_ty, llvm_v8f16_ty, "cvt_scalef32_sr_pk8_bf8_f16">; +def int_amdgcn_cvt_scalef32_sr_pk8_fp8_f32 : AMDGPUCvtScaleF32SRIntrinsic<llvm_v2i32_ty, llvm_v8f32_ty, "cvt_scalef32_sr_pk8_fp8_f32">; +def int_amdgcn_cvt_scalef32_sr_pk8_bf8_f32 : AMDGPUCvtScaleF32SRIntrinsic<llvm_v2i32_ty, llvm_v8f32_ty, "cvt_scalef32_sr_pk8_bf8_f32">; +def int_amdgcn_cvt_scalef32_sr_pk8_fp4_f32 : AMDGPUCvtScaleF32SRIntrinsic<llvm_i32_ty, llvm_v8f32_ty, "cvt_scalef32_sr_pk8_fp4_f32">; +def int_amdgcn_cvt_scalef32_sr_pk8_fp4_f16 : AMDGPUCvtScaleF32SRIntrinsic<llvm_i32_ty, llvm_v8f16_ty, "cvt_scalef32_sr_pk8_fp4_f16">; +def int_amdgcn_cvt_scalef32_sr_pk8_fp4_bf16 : AMDGPUCvtScaleF32SRIntrinsic<llvm_i32_ty, llvm_v8bf16_ty, "cvt_scalef32_sr_pk8_fp4_bf16">; def int_amdgcn_cvt_scalef32_2xpk16_fp6_f32 : AMDGPUCvtScaleF32ToFP6BF6Intrinsic<llvm_v6i32_ty, llvm_v16f32_ty, llvm_v16f32_ty, "cvt_scalef32_2xpk16_fp6_f32">; def int_amdgcn_cvt_scalef32_2xpk16_bf6_f32 : AMDGPUCvtScaleF32ToFP6BF6Intrinsic<llvm_v6i32_ty, llvm_v16f32_ty, llvm_v16f32_ty, "cvt_scalef32_2xpk16_bf6_f32">; diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp index 0894e26a9a42d..6537884017040 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp @@ -4612,6 +4612,15 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const { case Intrinsic::amdgcn_cvt_scalef32_pk8_fp4_f32: case Intrinsic::amdgcn_cvt_scalef32_pk8_fp4_f16: case Intrinsic::amdgcn_cvt_scalef32_pk8_fp4_bf16: + case Intrinsic::amdgcn_cvt_scalef32_sr_pk8_fp8_bf16: + case Intrinsic::amdgcn_cvt_scalef32_sr_pk8_bf8_bf16: + case Intrinsic::amdgcn_cvt_scalef32_sr_pk8_fp8_f16: + case Intrinsic::amdgcn_cvt_scalef32_sr_pk8_bf8_f16: + case Intrinsic::amdgcn_cvt_scalef32_sr_pk8_fp8_f32: + case Intrinsic::amdgcn_cvt_scalef32_sr_pk8_bf8_f32: + case Intrinsic::amdgcn_cvt_scalef32_sr_pk8_fp4_f32: + case Intrinsic::amdgcn_cvt_scalef32_sr_pk8_fp4_f16: + case Intrinsic::amdgcn_cvt_scalef32_sr_pk8_fp4_bf16: case Intrinsic::amdgcn_sat_pk4_i4_i8: case Intrinsic::amdgcn_sat_pk4_u4_u8: case Intrinsic::amdgcn_fmed3: diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.td b/llvm/lib/Target/AMDGPU/SIInstrInfo.td index 38b609ca47f90..d9a336175b97e 100644 --- a/llvm/lib/Target/AMDGPU/SIInstrInfo.td +++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.td @@ -2966,6 +2966,12 @@ def VOP_I32_F32_I32_F32 : VOPProfile<[i32, f32, i32, f32]>; def VOP_V6I32_V32BF16_I32_F32 : VOPProfile<[v6i32, v32bf16, i32, f32]>; def VOP_V6I32_V32F16_I32_F32 : VOPProfile<[v6i32, v32f16, i32, f32]>; def VOP_V6I32_V32F32_I32_F32 : VOPProfile<[v6i32, v32f32, i32, f32]>; +def VOP_V2I32_V8BF16_I32_F32 : VOPProfile<[v2i32, v8bf16, i32, f32]>; +def VOP_V2I32_V8F16_I32_F32 : VOPProfile<[v2i32, v8f16, i32, f32]>; +def VOP_V2I32_V8F32_I32_F32 : VOPProfile<[v2i32, v8f32, i32, f32]>; +def VOP_I32_V8F32_I32_F32 : VOPProfile<[i32, v8f32, i32, f32]>; +def VOP_I32_V8F16_I32_F32 : VOPProfile<[i32, v8f16, i32, f32]>; +def VOP_I32_V8BF16_I32_F32 : VOPProfile<[i32, v8bf16, i32, f32]>; def VOP_I64_I64_I32 : VOPProfile <[i64, i64, i32, untyped]>; def VOP_I64_I32_I64 : VOPProfile <[i64, i32, i64, untyped]>; diff --git a/llvm/lib/Target/AMDGPU/VOP3Instructions.td b/llvm/lib/Target/AMDGPU/VOP3Instructions.td index f1ed9380f8449..421938a8c041a 100644 --- a/llvm/lib/Target/AMDGPU/VOP3Instructions.td +++ b/llvm/lib/Target/AMDGPU/VOP3Instructions.td @@ -1790,6 +1790,18 @@ let SubtargetPredicate = isGFX1250Plus in { defm V_CVT_SCALEF32_PK8_FP4_F16 : VOP3Inst<"v_cvt_scalef32_pk8_fp4_f16", VOP3_CVT_SCALEF32_PK_F864_Profile<VOP_I32_V8F16_F32>, int_amdgcn_cvt_scalef32_pk8_fp4_f16>; defm V_CVT_SCALEF32_PK8_FP4_BF16 : VOP3Inst<"v_cvt_scalef32_pk8_fp4_bf16", VOP3_CVT_SCALEF32_PK_F864_Profile<VOP_I32_V8BF16_F32>, int_amdgcn_cvt_scalef32_pk8_fp4_bf16>; } // End WaveSizePredicate = isWave32 + + let WaveSizePredicate = isWave32 in { + defm V_CVT_SCALEF32_SR_PK8_FP8_BF16 : VOP3Inst<"v_cvt_scalef32_sr_pk8_fp8_bf16", VOP3_CVT_SCALEF32_PK_F864_Profile<VOP_V2I32_V8BF16_I32_F32>, int_amdgcn_cvt_scalef32_sr_pk8_fp8_bf16>; + defm V_CVT_SCALEF32_SR_PK8_BF8_BF16 : VOP3Inst<"v_cvt_scalef32_sr_pk8_bf8_bf16", VOP3_CVT_SCALEF32_PK_F864_Profile<VOP_V2I32_V8BF16_I32_F32>, int_amdgcn_cvt_scalef32_sr_pk8_bf8_bf16>; + defm V_CVT_SCALEF32_SR_PK8_FP8_F16 : VOP3Inst<"v_cvt_scalef32_sr_pk8_fp8_f16", VOP3_CVT_SCALEF32_PK_F864_Profile<VOP_V2I32_V8F16_I32_F32>, int_amdgcn_cvt_scalef32_sr_pk8_fp8_f16>; + defm V_CVT_SCALEF32_SR_PK8_BF8_F16 : VOP3Inst<"v_cvt_scalef32_sr_pk8_bf8_f16", VOP3_CVT_SCALEF32_PK_F864_Profile<VOP_V2I32_V8F16_I32_F32>, int_amdgcn_cvt_scalef32_sr_pk8_bf8_f16>; + defm V_CVT_SCALEF32_SR_PK8_FP8_F32 : VOP3Inst<"v_cvt_scalef32_sr_pk8_fp8_f32", VOP3_CVT_SCALEF32_PK_F864_Profile<VOP_V2I32_V8F32_I32_F32>, int_amdgcn_cvt_scalef32_sr_pk8_fp8_f32>; + defm V_CVT_SCALEF32_SR_PK8_BF8_F32 : VOP3Inst<"v_cvt_scalef32_sr_pk8_bf8_f32", VOP3_CVT_SCALEF32_PK_F864_Profile<VOP_V2I32_V8F32_I32_F32>, int_amdgcn_cvt_scalef32_sr_pk8_bf8_f32>; + defm V_CVT_SCALEF32_SR_PK8_FP4_F32 : VOP3Inst<"v_cvt_scalef32_sr_pk8_fp4_f32", VOP3_CVT_SCALEF32_PK_F864_Profile<VOP_I32_V8F32_I32_F32>, int_amdgcn_cvt_scalef32_sr_pk8_fp4_f32>; + defm V_CVT_SCALEF32_SR_PK8_FP4_F16 : VOP3Inst<"v_cvt_scalef32_sr_pk8_fp4_f16", VOP3_CVT_SCALEF32_PK_F864_Profile<VOP_I32_V8F16_I32_F32>, int_amdgcn_cvt_scalef32_sr_pk8_fp4_f16>; + defm V_CVT_SCALEF32_SR_PK8_FP4_BF16 : VOP3Inst<"v_cvt_scalef32_sr_pk8_fp4_bf16", VOP3_CVT_SCALEF32_PK_F864_Profile<VOP_I32_V8BF16_I32_F32>, int_amdgcn_cvt_scalef32_sr_pk8_fp4_bf16>; + } // End WaveSizePredicate = isWave32 } // End Constraints = "@earlyclobber $vdst" let True16Predicate = UseRealTrue16Insts in { @@ -2221,6 +2233,15 @@ defm V_CVT_SCALEF32_PK8_FP8_F32 : VOP3Only_Real_Base_gfx1250<0x2c3>; defm V_CVT_SCALEF32_PK8_FP8_F16 : VOP3Only_Real_Base_gfx1250<0x2c4>; defm V_CVT_SCALEF32_PK8_BF8_F32 : VOP3Only_Real_Base_gfx1250<0x2c5>; defm V_CVT_SCALEF32_PK8_BF8_F16 : VOP3Only_Real_Base_gfx1250<0x2c6>; +defm V_CVT_SCALEF32_SR_PK8_FP4_F32 : VOP3Only_Real_Base_gfx1250<0x297>; +defm V_CVT_SCALEF32_SR_PK8_FP8_F32 : VOP3Only_Real_Base_gfx1250<0x298>; +defm V_CVT_SCALEF32_SR_PK8_BF8_F32 : VOP3Only_Real_Base_gfx1250<0x299>; +defm V_CVT_SCALEF32_SR_PK8_FP4_F16 : VOP3Only_Real_Base_gfx1250<0x2b9>; +defm V_CVT_SCALEF32_SR_PK8_FP4_BF16 : VOP3Only_Real_Base_gfx1250<0x2bc>; +defm V_CVT_SCALEF32_SR_PK8_FP8_F16 : VOP3Only_Real_Base_gfx1250<0x2bf>; +defm V_CVT_SCALEF32_SR_PK8_FP8_BF16 : VOP3Only_Real_Base_gfx1250<0x2c0>; +defm V_CVT_SCALEF32_SR_PK8_BF8_F16 : VOP3Only_Real_Base_gfx1250<0x2c1>; +defm V_CVT_SCALEF32_SR_PK8_BF8_BF16 : VOP3Only_Real_Base_gfx1250<0x2c2>; defm V_CVT_PK_BF16_F32 : VOP3Only_Realtriple_gfx1250<0x36d>; defm V_CVT_SR_PK_BF16_F32 : VOP3Only_Realtriple_gfx1250<0x36e>; defm V_CVT_PK_F16_F32 : VOP3Only_Realtriple_gfx1250<0x36f>; diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.scalef32.sr.pk.gfx1250.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.scalef32.sr.pk.gfx1250.ll new file mode 100644 index 0000000000000..d33acf6ca7f76 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.scalef32.sr.pk.gfx1250.ll @@ -0,0 +1,385 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4 +; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1250 < %s | FileCheck -check-prefix=GFX1250-SDAG %s +; RUN: llc -global-isel=1 -global-isel-abort=2 -mtriple=amdgcn -mcpu=gfx1250 < %s | FileCheck -check-prefix=GFX1250-GISEL %s + +declare <2 x i32> @llvm.amdgcn.cvt.scalef32.sr.pk8.fp8.bf16(<8 x bfloat> %src, i32 %sr, float %scale) +declare <2 x i32> @llvm.amdgcn.cvt.scalef32.sr.pk8.bf8.bf16(<8 x bfloat> %src, i32 %sr, float %scale) +declare <2 x i32> @llvm.amdgcn.cvt.scalef32.sr.pk8.fp8.f16(<8 x half> %src, i32 %sr, float %scale) +declare <2 x i32> @llvm.amdgcn.cvt.scalef32.sr.pk8.bf8.f16(<8 x half> %src, i32 %sr, float %scale) +declare <2 x i32> @llvm.amdgcn.cvt.scalef32.sr.pk8.fp8.f32(<8 x float> %src, i32 %sr, float %scale) +declare <2 x i32> @llvm.amdgcn.cvt.scalef32.sr.pk8.bf8.f32(<8 x float> %src, i32 %sr, float %scale) +declare i32 @llvm.amdgcn.cvt.scalef32.sr.pk8.fp4.f32(<8 x float> %src, i32 %sr, float %scale) +declare i32 @llvm.amdgcn.cvt.scalef32.sr.pk8.fp4.f16(<8 x half> %src, i32 %sr, float %scale) +declare i32 @llvm.amdgcn.cvt.scalef32.sr.pk8.fp4.bf16(<8 x bfloat> %src, i32 %sr, float %scale) + +define amdgpu_ps void @test_scalef32_sr_pk8_fp8_bf16_vv(<8 x bfloat> %src, i32 %sr, float %scale, ptr addrspace(1) %out) { +; GFX1250-SDAG-LABEL: test_scalef32_sr_pk8_fp8_bf16_vv: +; GFX1250-SDAG: ; %bb.0: +; GFX1250-SDAG-NEXT: v_cvt_scalef32_sr_pk8_fp8_bf16 v[8:9], v[0:3], v4, v5 +; GFX1250-SDAG-NEXT: global_store_b64 v[6:7], v[8:9], off +; GFX1250-SDAG-NEXT: s_endpgm +; +; GFX1250-GISEL-LABEL: test_scalef32_sr_pk8_fp8_bf16_vv: +; GFX1250-GISEL: ; %bb.0: +; GFX1250-GISEL-NEXT: v_cvt_scalef32_sr_pk8_fp8_bf16 v[8:9], v[0:3], v4, v5 +; GFX1250-GISEL-NEXT: global_store_b64 v[6:7], v[8:9], off +; GFX1250-GISEL-NEXT: s_endpgm + %cvt = tail call <2 x i32> @llvm.amdgcn.cvt.scalef32.sr.pk8.fp8.bf16(<8 x bfloat> %src, i32 %sr, float %scale) + store <2 x i32> %cvt, ptr addrspace(1) %out, align 8 + ret void +} + +define amdgpu_ps void @test_scalef32_sr_pk8_fp8_bf16_sl(<8 x bfloat> inreg %src, i32 inreg %sr, ptr addrspace(1) %out) { +; GFX1250-SDAG-LABEL: test_scalef32_sr_pk8_fp8_bf16_sl: +; GFX1250-SDAG: ; %bb.0: +; GFX1250-SDAG-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1 +; GFX1250-SDAG-NEXT: v_dual_mov_b32 v4, s2 :: v_dual_mov_b32 v5, s3 +; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-SDAG-NEXT: v_cvt_scalef32_sr_pk8_fp8_bf16 v[6:7], v[2:5], s4, 0x42c80000 +; GFX1250-SDAG-NEXT: global_store_b64 v[0:1], v[6:7], off +; GFX1250-SDAG-NEXT: s_endpgm +; +; GFX1250-GISEL-LABEL: test_scalef32_sr_pk8_fp8_bf16_sl: +; GFX1250-GISEL: ; %bb.0: +; GFX1250-GISEL-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1 +; GFX1250-GISEL-NEXT: v_dual_mov_b32 v4, s2 :: v_dual_mov_b32 v5, s3 +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-GISEL-NEXT: v_cvt_scalef32_sr_pk8_fp8_bf16 v[6:7], v[2:5], s4, 0x42c80000 +; GFX1250-GISEL-NEXT: global_store_b64 v[0:1], v[6:7], off +; GFX1250-GISEL-NEXT: s_endpgm + %cvt = tail call <2 x i32> @llvm.amdgcn.cvt.scalef32.sr.pk8.fp8.bf16(<8 x bfloat> %src, i32 %sr, float 100.0) + store <2 x i32> %cvt, ptr addrspace(1) %out, align 8 + ret void +} + +define amdgpu_ps void @test_scalef32_sr_pk8_bf8_bf16_vv(<8 x bfloat> %src, i32 %sr, float %scale, ptr addrspace(1) %out) { +; GFX1250-SDAG-LABEL: test_scalef32_sr_pk8_bf8_bf16_vv: +; GFX1250-SDAG: ; %bb.0: +; GFX1250-SDAG-NEXT: v_cvt_scalef32_sr_pk8_bf8_bf16 v[8:9], v[0:3], v4, v5 +; GFX1250-SDAG-NEXT: global_store_b64 v[6:7], v[8:9], off +; GFX1250-SDAG-NEXT: s_endpgm +; +; GFX1250-GISEL-LABEL: test_scalef32_sr_pk8_bf8_bf16_vv: +; GFX1250-GISEL: ; %bb.0: +; GFX1250-GISEL-NEXT: v_cvt_scalef32_sr_pk8_bf8_bf16 v[8:9], v[0:3], v4, v5 +; GFX1250-GISEL-NEXT: global_store_b64 v[6:7], v[8:9], off +; GFX1250-GISEL-NEXT: s_endpgm + %cvt = tail call <2 x i32> @llvm.amdgcn.cvt.scalef32.sr.pk8.bf8.bf16(<8 x bfloat> %src, i32 %sr, float %scale) + store <2 x i32> %cvt, ptr addrspace(1) %out, align 8 + ret void +} + +define amdgpu_ps void @test_scalef32_sr_pk8_bf8_bf16_sl(<8 x bfloat> inreg %src, i32 inreg %sr, ptr addrspace(1) %out) { +; GFX1250-SDAG-LABEL: test_scalef32_sr_pk8_bf8_bf16_sl: +; GFX1250-SDAG: ; %bb.0: +; GFX1250-SDAG-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1 +; GFX1250-SDAG-NEXT: v_dual_mov_b32 v4, s2 :: v_dual_mov_b32 v5, s3 +; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-SDAG-NEXT: v_cvt_scalef32_sr_pk8_bf8_bf16 v[6:7], v[2:5], s4, 0x42c80000 +; GFX1250-SDAG-NEXT: global_store_b64 v[0:1], v[6:7], off +; GFX1250-SDAG-NEXT: s_endpgm +; +; GFX1250-GISEL-LABEL: test_scalef32_sr_pk8_bf8_bf16_sl: +; GFX1250-GISEL: ; %bb.0: +; GFX1250-GISEL-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1 +; GFX1250-GISEL-NEXT: v_dual_mov_b32 v4, s2 :: v_dual_mov_b32 v5, s3 +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-GISEL-NEXT: v_cvt_scalef32_sr_pk8_bf8_bf16 v[6:7], v[2:5], s4, 0x42c80000 +; GFX1250-GISEL-NEXT: global_store_b64 v[0:1], v[6:7], off +; GFX1250-GISEL-NEXT: s_endpgm + %cvt = tail call <2 x i32> @llvm.amdgcn.cvt.scalef32.sr.pk8.bf8.bf16(<8 x bfloat> %src, i32 %sr, float 100.0) + store <2 x i32> %cvt, ptr addrspace(1) %out, align 8 + ret void +} + +define amdgpu_ps void @test_scalef32_sr_pk8_fp8_f16_vv(<8 x half> %src, i32 %sr, float %scale, ptr addrspace(1) %out) { +; GFX1250-SDAG-LABEL: test_scalef32_sr_pk8_fp8_f16_vv: +; GFX1250-SDAG: ; %bb.0: +; GFX1250-SDAG-NEXT: v_cvt_scalef32_sr_pk8_fp8_f16 v[8:9], v[0:3], v4, v5 +; GFX1250-SDAG-NEXT: global_store_b64 v[6:7], v[8:9], off +; GFX1250-SDAG-NEXT: s_endpgm +; +; GFX1250-GISEL-LABEL: test_scalef32_sr_pk8_fp8_f16_vv: +; GFX1250-GISEL: ; %bb.0: +; GFX1250-GISEL-NEXT: v_cvt_scalef32_sr_pk8_fp8_f16 v[8:9], v[0:3], v4, v5 +; GFX1250-GISEL-NEXT: global_store_b64 v[6:7], v[8:9], off +; GFX1250-GISEL-NEXT: s_endpgm + %cvt = tail call <2 x i32> @llvm.amdgcn.cvt.scalef32.sr.pk8.fp8.f16(<8 x half> %src, i32 %sr, float %scale) + store <2 x i32> %cvt, ptr addrspace(1) %out, align 8 + ret void +} + +define amdgpu_ps void @test_scalef32_sr_pk8_fp8_f16_sl(<8 x half> inreg %src, i32 inreg %sr, ptr addrspace(1) %out) { +; GFX1250-SDAG-LABEL: test_scalef32_sr_pk8_fp8_f16_sl: +; GFX1250-SDAG: ; %bb.0: +; GFX1250-SDAG-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1 +; GFX1250-SDAG-NEXT: v_dual_mov_b32 v4, s2 :: v_dual_mov_b32 v5, s3 +; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-SDAG-NEXT: v_cvt_scalef32_sr_pk8_fp8_f16 v[6:7], v[2:5], s4, 0x42c80000 +; GFX1250-SDAG-NEXT: global_store_b64 v[0:1], v[6:7], off +; GFX1250-SDAG-NEXT: s_endpgm +; +; GFX1250-GISEL-LABEL: test_scalef32_sr_pk8_fp8_f16_sl: +; GFX1250-GISEL: ; %bb.0: +; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[2:3] +; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[0:1] +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-GISEL-NEXT: v_cvt_scalef32_sr_pk8_fp8_f16 v[6:7], v[2:5], s4, 0x42c80000 +; GFX1250-GISEL-NEXT: global_store_b64 v[0:1], v[6:7], off +; GFX1250-GISEL-NEXT: s_endpgm + %cvt = tail call <2 x i32> @llvm.amdgcn.cvt.scalef32.sr.pk8.fp8.f16(<8 x half> %src, i32 %sr, float 100.0) + store <2 x i32> %cvt, ptr addrspace(1) %out, align 8 + ret void +} + +define amdgpu_ps void @test_scalef32_sr_pk8_bf8_f16_vv(<8 x half> %src, i32 %sr, float %scale, ptr addrspace(1) %out) { +; GFX1250-SDAG-LABEL: test_scalef32_sr_pk8_bf8_f16_vv: +; GFX1250-SDAG: ; %bb.0: +; GFX1250-SDAG-NEXT: v_cvt_scalef32_sr_pk8_bf8_f16 v[8:9], v[0:3], v4, v5 +; GFX1250-SDAG-NEXT: global_store_b64 v[6:7], v[8:9], off +; GFX1250-SDAG-NEXT: s_endpgm +; +; GFX1250-GISEL-LABEL: test_scalef32_sr_pk8_bf8_f16_vv: +; GFX1250-GISEL: ; %bb.0: +; GFX1250-GISEL-NEXT: v_cvt_scalef32_sr_pk8_bf8_f16 v[8:9], v[0:3], v4, v5 +; GFX1250-GISEL-NEXT: global_store_b64 v[6:7], v[8:9], off +; GFX1250-GISEL-NEXT: s_endpgm + %cvt = tail call <2 x i32> @llvm.amdgcn.cvt.scalef32.sr.pk8.bf8.f16(<8 x half> %src, i32 %sr, float %scale) + store <2 x i32> %cvt, ptr addrspace(1) %out, align 8 + ret void +} + +define amdgpu_ps void @test_scalef32_sr_pk8_bf8_f16_sl(<8 x half> inreg %src, i32 inreg %sr, ptr addrspace(1) %out) { +; GFX1250-SDAG-LABEL: test_scalef32_sr_pk8_bf8_f16_sl: +; GFX1250-SDAG: ; %bb.0: +; GFX1250-SDAG-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1 +; GFX1250-SDAG-NEXT: v_dual_mov_b32 v4, s2 :: v_dual_mov_b32 v5, s3 +; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-SDAG-NEXT: v_cvt_scalef32_sr_pk8_bf8_f16 v[6:7], v[2:5], s4, 0x42c80000 +; GFX1250-SDAG-NEXT: global_store_b64 v[0:1], v[6:7], off +; GFX1250-SDAG-NEXT: s_endpgm +; +; GFX1250-GISEL-LABEL: test_scalef32_sr_pk8_bf8_f16_sl: +; GFX1250-GISEL: ; %bb.0: +; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[2:3] +; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[0:1] +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-GISEL-NEXT: v_cvt_scalef32_sr_pk8_bf8_f16 v[6:7], v[2:5], s4, 0x42c80000 +; GFX1250-GISEL-NEXT: global_store_b64 v[0:1], v[6:7], off +; GFX1250-GISEL-NEXT: s_endpgm + %cvt = tail call <2 x i32> @llvm.amdgcn.cvt.scalef32.sr.pk8.bf8.f16(<8 x half> %src, i32 %sr, float 100.0) + store <2 x i32> %cvt, ptr addrspace(1) %out, align 8 + ret void +} + +define amdgpu_ps void @test_scalef32_sr_pk8_bf8_f32_vv(<8 x float> %src, i32 %sr, float %scale, ptr addrspace(1) %out) { +; GFX1250-SDAG-LABEL: test_scalef32_sr_pk8_bf8_f32_vv: +; GFX1250-SDAG: ; %bb.0: +; GFX1250-SDAG-NEXT: v_cvt_scalef32_sr_pk8_bf8_f32 v[12:13], v[0:7], v8, v9 +; GFX1250-SDAG-NEXT: global_store_b64 v[10:11], v[12:13], off +; GFX1250-SDAG-NEXT: s_endpgm +; +; GFX1250-GISEL-LABEL: test_scalef32_sr_pk8_bf8_f32_vv: +; GFX1250-GISEL: ; %bb.0: +; GFX1250-GISEL-NEXT: v_cvt_scalef32_sr_pk8_bf8_f32 v[12:13], v[0:7], v8, v9 +; GFX1250-GISEL-NEXT: global_store_b64 v[10:11], v[12:13], off +; GFX1250-GISEL-NEXT: s_endpgm + %cvt = tail call <2 x i32> @llvm.amdgcn.cvt.scalef32.sr.pk8.bf8.f32(<8 x float> %src, i32 %sr, float %scale) + store <2 x i32> %cvt, ptr addrspace(1) %out, align 8 + ret void +} + +define amdgpu_ps void @test_scalef32_sr_pk8_bf8_f32_sl(<8 x float> inreg %src, i32 inreg %sr, ptr addrspace(1) %out) { +; GFX1250-SDAG-LABEL: test_scalef32_sr_pk8_bf8_f32_sl: +; GFX1250-SDAG: ; %bb.0: +; GFX1250-SDAG-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1 +; GFX1250-SDAG-NEXT: v_dual_mov_b32 v4, s2 :: v_dual_mov_b32 v5, s3 +; GFX1250-SDAG-NEXT: v_dual_mov_b32 v6, s4 :: v_dual_mov_b32 v7, s5 +; GFX1250-SDAG-NEXT: v_dual_mov_b32 v8, s6 :: v_dual_mov_b32 v9, s7 +; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-SDAG-NEXT: v_cvt_scalef32_sr_pk8_bf8_f32 v[10:11], v[2:9], s8, 0x42c80000 +; GFX1250-SDAG-NEXT: global_store_b64 v[0:1], v[10:11], off +; GFX1250-SDAG-NEXT: s_endpgm +; +; GFX1250-GISEL-LABEL: test_scalef32_sr_pk8_bf8_f32_sl: +; GFX1250-GISEL: ; %bb.0: +; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[8:9], s[6:7] +; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[6:7], s[4:5] +; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[2:3] +; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[0:1] +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-GISEL-NEXT: v_cvt_scalef32_sr_pk8_bf8_f32 v[10:11], v[2:9], s8, 0x42c80000 +; GFX1250-GISEL-NEXT: global_store_b64 v[0:1], v[10:11], off +; GFX1250-GISEL-NEXT: s_endpgm + %cvt = tail call <2 x i32> @llvm.amdgcn.cvt.scalef32.sr.pk8.bf8.f32(<8 x float> %src, i32 %sr, float 100.0) + store <2 x i32> %cvt, ptr addrspace(1) %out, align 8 + ret void +} + +define amdgpu_ps void @test_scalef32_sr_pk8_fp8_f32_vv(<8 x float> %src, i32 %sr, float %scale, ptr addrspace(1) %out) { +; GFX1250-SDAG-LABEL: test_scalef32_sr_pk8_fp8_f32_vv: +; GFX1250-SDAG: ; %bb.0: +; GFX1250-SDAG-NEXT: v_cvt_scalef32_sr_pk8_fp8_f32 v[12:13], v[0:7], v8, v9 +; GFX1250-SDAG-NEXT: global_store_b64 v[10:11], v[12:13], off +; GFX1250-SDAG-NEXT: s_endpgm +; +; GFX1250-GISEL-LABEL: test_scalef32_sr_pk8_fp8_f32_vv: +; GFX1250-GISEL: ; %bb.0: +; GFX1250-GISEL-NEXT: v_cvt_scalef32_sr_pk8_fp8_f32 v[12:13], v[0:7], v8, v9 +; GFX1250-GISEL-NEXT: global_store_b64 v[10:11], v[12:13], off +; GFX1250-GISEL-NEXT: s_endpgm + %cvt = tail call <2 x i32> @llvm.amdgcn.cvt.scalef32.sr.pk8.fp8.f32(<8 x float> %src, i32 %sr, float %scale) + store <2 x i32> %cvt, ptr addrspace(1) %out, align 8 + ret void +} + +define amdgpu_ps void @test_scalef32_sr_pk8_fp8_f32_sl(<8 x float> inreg %src, i32 inreg %sr, ptr addrspace(1) %out) { +; GFX1250-SDAG-LABEL: test_scalef32_sr_pk8_fp8_f32_sl: +; GFX1250-SDAG: ; %bb.0: +; GFX1250-SDAG-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1 +; GFX1250-SDAG-NEXT: v_dual_mov_b32 v4, s2 :: v_dual_mov_b32 v5, s3 +; GFX1250-SDAG-NEXT: v_dual_mov_b32 v6, s4 :: v_dual_mov_b32 v7, s5 +; GFX1250-SDAG-NEXT: v_dual_mov_b32 v8, s6 :: v_dual_mov_b32 v9, s7 +; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-SDAG-NEXT: v_cvt_scalef32_sr_pk8_fp8_f32 v[10:11], v[2:9], s8, 0x42c80000 +; GFX1250-SDAG-NEXT: global_store_b64 v[0:1], v[10:11], off +; GFX1250-SDAG-NEXT: s_endpgm +; +; GFX1250-GISEL-LABEL: test_scalef32_sr_pk8_fp8_f32_sl: +; GFX1250-GISEL: ; %bb.0: +; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[8:9], s[6:7] +; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[6:7], s[4:5] +; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[2:3] +; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[0:1] +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-GISEL-NEXT: v_cvt_scalef32_sr_pk8_fp8_f32 v[10:11], v[2:9], s8, 0x42c80000 +; GFX1250-GISEL-NEXT: global_store_b64 v[0:1], v[10:11], off +; GFX1250-GISEL-NEXT: s_endpgm + %cvt = tail call <2 x i32> @llvm.amdgcn.cvt.scalef32.sr.pk8.fp8.f32(<8 x float> %src, i32 %sr, float 100.0) + store <2 x i32> %cvt, ptr addrspace(1) %out, align 8 + ret void +} + +define amdgpu_ps void @test_scalef32_sr_pk8_fp4_f32_vv(<8 x float> %src, i32 %sr, float %scale, ptr addrspace(1) %out) { +; GFX1250-SDAG-LABEL: test_scalef32_sr_pk8_fp4_f32_vv: +; GFX1250-SDAG: ; %bb.0: +; GFX1250-SDAG-NEXT: v_cvt_scalef32_sr_pk8_fp4_f32 v12, v[0:7], v8, v9 +; GFX1250-SDAG-NEXT: global_store_b32 v[10:11], v12, off +; GFX1250-SDAG-NEXT: s_endpgm +; +; GFX1250-GISEL-LABEL: test_scalef32_sr_pk8_fp4_f32_vv: +; GFX1250-GISEL: ; %bb.0: +; GFX1250-GISEL-NEXT: v_cvt_scalef32_sr_pk8_fp4_f32 v12, v[0:7], v8, v9 +; GFX1250-GISEL-NEXT: global_store_b32 v[10:11], v12, off +; GFX1250-GISEL-NEXT: s_endpgm + %cvt = tail call i32 @llvm.amdgcn.cvt.scalef32.sr.pk8.fp4.f32(<8 x float> %src, i32 %sr, float %scale) + store i32 %cvt, ptr addrspace(1) %out, align 4 + ret void +} + +define amdgpu_ps void @test_scalef32_sr_pk8_fp4_f32_sl(<8 x float> inreg %src, i32 inreg %sr, ptr addrspace(1) %out) { +; GFX1250-SDAG-LABEL: test_scalef32_sr_pk8_fp4_f32_sl: +; GFX1250-SDAG: ; %bb.0: +; GFX1250-SDAG-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1 +; GFX1250-SDAG-NEXT: v_dual_mov_b32 v4, s2 :: v_dual_mov_b32 v5, s3 +; GFX1250-SDAG-NEXT: v_dual_mov_b32 v6, s4 :: v_dual_mov_b32 v7, s5 +; GFX1250-SDAG-NEXT: v_dual_mov_b32 v8, s6 :: v_dual_mov_b32 v9, s7 +; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-SDAG-NEXT: v_cvt_scalef32_sr_pk8_fp4_f32 v10, v[2:9], s8, 0x42c80000 +; GFX1250-SDAG-NEXT: global_store_b32 v[0:1], v10, off +; GFX1250-SDAG-NEXT: s_endpgm +; +; GFX1250-GISEL-LABEL: test_scalef32_sr_pk8_fp4_f32_sl: +; GFX1250-GISEL: ; %bb.0: +; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[8:9], s[6:7] +; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[6:7], s[4:5] +; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[2:3] +; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[0:1] +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-GISEL-NEXT: v_cvt_scalef32_sr_pk8_fp4_f32 v10, v[2:9], s8, 0x42c80000 +; GFX1250-GISEL-NEXT: global_store_b32 v[0:1], v10, off +; GFX1250-GISEL-NEXT: s_endpgm + %cvt = tail call i32 @llvm.amdgcn.cvt.scalef32.sr.pk8.fp4.f32(<8 x float> %src, i32 %sr, float 100.0) + store i32 %cvt, ptr addrspace(1) %out, align 4 + ret void +} + +define amdgpu_ps void @test_scalef32_sr_pk8_fp4_f16_vv(<8 x half> %src, i32 %sr, float %scale, ptr addrspace(1) %out) { +; GFX1250-SDAG-LABEL: test_scalef32_sr_pk8_fp4_f16_vv: +; GFX1250-SDAG: ; %bb.0: +; GFX1250-SDAG-NEXT: v_cvt_scalef32_sr_pk8_fp4_f16 v8, v[0:3], v4, v5 +; GFX1250-SDAG-NEXT: global_store_b32 v[6:7], v8, off +; GFX1250-SDAG-NEXT: s_endpgm +; +; GFX1250-GISEL-LABEL: test_scalef32_sr_pk8_fp4_f16_vv: +; GFX1250-GISEL: ; %bb.0: +; GFX1250-GISEL-NEXT: v_cvt_scalef32_sr_pk8_fp4_f16 v8, v[0:3], v4, v5 +; GFX1250-GISEL-NEXT: global_store_b32 v[6:7], v8, off +; GFX1250-GISEL-NEXT: s_endpgm + %cvt = tail call i32 @llvm.amdgcn.cvt.scalef32.sr.pk8.fp4.f16(<8 x half> %src, i32 %sr, float %scale) + store i32 %cvt, ptr addrspace(1) %out, align 4 + ret void +} + +define amdgpu_ps void @test_scalef32_sr_pk8_fp4_f16_sl(<8 x half> inreg %src, i32 inreg %sr, ptr addrspace(1) %out) { +; GFX1250-SDAG-LABEL: test_scalef32_sr_pk8_fp4_f16_sl: +; GFX1250-SDAG: ; %bb.0: +; GFX1250-SDAG-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1 +; GFX1250-SDAG-NEXT: v_dual_mov_b32 v4, s2 :: v_dual_mov_b32 v5, s3 +; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-SDAG-NEXT: v_cvt_scalef32_sr_pk8_fp4_f16 v6, v[2:5], s4, 0x42c80000 +; GFX1250-SDAG-NEXT: global_store_b32 v[0:1], v6, off +; GFX1250-SDAG-NEXT: s_endpgm +; +; GFX1250-GISEL-LABEL: test_scalef32_sr_pk8_fp4_f16_sl: +; GFX1250-GISEL: ; %bb.0: +; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[2:3] +; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[0:1] +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-GISEL-NEXT: v_cvt_scalef32_sr_pk8_fp4_f16 v6, v[2:5], s4, 0x42c80000 +; GFX1250-GISEL-NEXT: global_store_b32 v[0:1], v6, off +; GFX1250-GISEL-NEXT: s_endpgm + %cvt = tail call i32 @llvm.amdgcn.cvt.scalef32.sr.pk8.fp4.f16(<8 x half> %src, i32 %sr, float 100.0) + store i32 %cvt, ptr addrspace(1) %out, align 4 + ret void +} + +define amdgpu_ps void @test_scalef32_sr_pk8_fp4_bf16_vv(<8 x bfloat> %src, i32 %sr, float %scale, ptr addrspace(1) %out) { +; GFX1250-SDAG-LABEL: test_scalef32_sr_pk8_fp4_bf16_vv: +; GFX1250-SDAG: ; %bb.0: +; GFX1250-SDAG-NEXT: v_cvt_scalef32_sr_pk8_fp4_bf16 v8, v[0:3], v4, v5 +; GFX1250-SDAG-NEXT: global_store_b32 v[6:7], v8, off +; GFX1250-SDAG-NEXT: s_endpgm +; +; GFX1250-GISEL-LABEL: test_scalef32_sr_pk8_fp4_bf16_vv: +; GFX1250-GISEL: ; %bb.0: +; GFX1250-GISEL-NEXT: v_cvt_scalef32_sr_pk8_fp4_bf16 v8, v[0:3], v4, v5 +; GFX1250-GISEL-NEXT: global_store_b32 v[6:7], v8, off +; GFX1250-GISEL-NEXT: s_endpgm + %cvt = tail call i32 @llvm.amdgcn.cvt.scalef32.sr.pk8.fp4.bf16(<8 x bfloat> %src, i32 %sr, float %scale) + store i32 %cvt, ptr addrspace(1) %out, align 4 + ret void +} + +define amdgpu_ps void @test_scalef32_sr_pk8_fp4_bf16_sl(<8 x bfloat> inreg %src, i32 inreg %sr, ptr addrspace(1) %out) { +; GFX1250-SDAG-LABEL: test_scalef32_sr_pk8_fp4_bf16_sl: +; GFX1250-SDAG: ; %bb.0: +; GFX1250-SDAG-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1 +; GFX1250-SDAG-NEXT: v_dual_mov_b32 v4, s2 :: v_dual_mov_b32 v5, s3 +; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-SDAG-NEXT: v_cvt_scalef32_sr_pk8_fp4_bf16 v6, v[2:5], s4, 0x42c80000 +; GFX1250-SDAG-NEXT: global_store_b32 v[0:1], v6, off +; GFX1250-SDAG-NEXT: s_endpgm +; +; GFX1250-GISEL-LABEL: test_scalef32_sr_pk8_fp4_bf16_sl: +; GFX1250-GISEL: ; %bb.0: +; GFX1250-GISEL-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1 +; GFX1250-GISEL-NEXT: v_dual_mov_b32 v4, s2 :: v_dual_mov_b32 v5, s3 +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-GISEL-NEXT: v_cvt_scalef32_sr_pk8_fp4_bf16 v6, v[2:5], s4, 0x42c80000 +; GFX1250-GISEL-NEXT: global_store_b32 v[0:1], v6, off +; GFX1250-GISEL-NEXT: s_endpgm + %cvt = tail call i32 @llvm.amdgcn.cvt.scalef32.sr.pk8.fp4.bf16(<8 x bfloat> %src, i32 %sr, float 100.0) + store i32 %cvt, ptr addrspace(1) %out, align 4 + ret void +} diff --git a/llvm/test/MC/AMDGPU/gfx1250_asm_vop3-fake16.s b/llvm/test/MC/AMDGPU/gfx1250_asm_vop3-fake16.s index 9a63afc2c3fed..93d5cf3edc801 100644 --- a/llvm/test/MC/AMDGPU/gfx1250_asm_vop3-fake16.s +++ b/llvm/test/MC/AMDGPU/gfx1250_asm_vop3-fake16.s @@ -940,3 +940,57 @@ v_cvt_scalef32_pk8_fp4_bf16 v10, v[20:23], v8 v_cvt_scalef32_pk8_fp4_bf16 v10, v[20:23], 100.0 // GFX1250: v_cvt_scalef32_pk8_fp4_bf16 v10, v[20:23], 0x42c80000 ; encoding: [0x0a,0x00,0xb8,0xd6,0x14,0xff,0x01,0x00,0x00,0x00,0xc8,0x42] + +v_cvt_scalef32_sr_pk8_fp8_bf16 v[10:11], v[20:23], v4, v8 +// GFX1250: v_cvt_scalef32_sr_pk8_fp8_bf16 v[10:11], v[20:23], v4, v8 ; encoding: [0x0a,0x00,0xc0,0xd6,0x14,0x09,0x22,0x04] + +v_cvt_scalef32_sr_pk8_fp8_bf16 v[10:11], v[20:23], s4, 100.0 +// GFX1250: v_cvt_scalef32_sr_pk8_fp8_bf16 v[10:11], v[20:23], s4, 0x42c80000 ; encoding: [0x0a,0x00,0xc0,0xd6,0x14,0x09,0xfc,0x03,0x00,0x00,0xc8,0x42] + +v_cvt_scalef32_sr_pk8_bf8_bf16 v[10:11], v[20:23], v4, v8 +// GFX1250: v_cvt_scalef32_sr_pk8_bf8_bf16 v[10:11], v[20:23], v4, v8 ; encoding: [0x0a,0x00,0xc2,0xd6,0x14,0x09,0x22,0x04] + +v_cvt_scalef32_sr_pk8_bf8_bf16 v[10:11], v[20:23], s4, 100.0 +// GFX1250: v_cvt_scalef32_sr_pk8_bf8_bf16 v[10:11], v[20:23], s4, 0x42c80000 ; encoding: [0x0a,0x00,0xc2,0xd6,0x14,0x09,0xfc,0x03,0x00,0x00,0xc8,0x42] + +v_cvt_scalef32_sr_pk8_fp8_f16 v[10:11], v[20:23], v4, v8 +// GFX1250: v_cvt_scalef32_sr_pk8_fp8_f16 v[10:11], v[20:23], v4, v8 ; encoding: [0x0a,0x00,0xbf,0xd6,0x14,0x09,0x22,0x04] + +v_cvt_scalef32_sr_pk8_fp8_f16 v[10:11], v[20:23], s4, 100.0 +// GFX1250: v_cvt_scalef32_sr_pk8_fp8_f16 v[10:11], v[20:23], s4, 0x42c80000 ; encoding: [0x0a,0x00,0xbf,0xd6,0x14,0x09,0xfc,0x03,0x00,0x00,0xc8,0x42] + +v_cvt_scalef32_sr_pk8_bf8_f16 v[10:11], v[20:23], v4, v8 +// GFX1250: v_cvt_scalef32_sr_pk8_bf8_f16 v[10:11], v[20:23], v4, v8 ; encoding: [0x0a,0x00,0xc1,0xd6,0x14,0x09,0x22,0x04] + +v_cvt_scalef32_sr_pk8_bf8_f16 v[10:11], v[20:23], s4, 100.0 +// GFX1250: v_cvt_scalef32_sr_pk8_bf8_f16 v[10:11], v[20:23], s4, 0x42c80000 ; encoding: [0x0a,0x00,0xc1,0xd6,0x14,0x09,0xfc,0x03,0x00,0x00,0xc8,0x42] + +v_cvt_scalef32_sr_pk8_fp8_f32 v[10:11], v[20:27], v4, v8 +// GFX1250: v_cvt_scalef32_sr_pk8_fp8_f32 v[10:11], v[20:27], v4, v8 ; encoding: [0x0a,0x00,0x98,0xd6,0x14,0x09,0x22,0x04] + +v_cvt_scalef32_sr_pk8_fp8_f32 v[10:11], v[20:27], s4, 100.0 +// GFX1250: v_cvt_scalef32_sr_pk8_fp8_f32 v[10:11], v[20:27], s4, 0x42c80000 ; encoding: [0x0a,0x00,0x98,0xd6,0x14,0x09,0xfc,0x03,0x00,0x00,0xc8,0x42] + +v_cvt_scalef32_sr_pk8_bf8_f32 v[10:11], v[20:27], v4, v8 +// GFX1250: v_cvt_scalef32_sr_pk8_bf8_f32 v[10:11], v[20:27], v4, v8 ; encoding: [0x0a,0x00,0x99,0xd6,0x14,0x09,0x22,0x04] + +v_cvt_scalef32_sr_pk8_bf8_f32 v[10:11], v[20:27], s4, 100.0 +// GFX1250: v_cvt_scalef32_sr_pk8_bf8_f32 v[10:11], v[20:27], s4, 0x42c80000 ; encoding: [0x0a,0x00,0x99,0xd6,0x14,0x09,0xfc,0x03,0x00,0x00,0xc8,0x42] + +v_cvt_scalef32_sr_pk8_fp4_f32 v10, v[20:27], v4, v8 +// GFX1250: v_cvt_scalef32_sr_pk8_fp4_f32 v10, v[20:27], v4, v8 ; encoding: [0x0a,0x00,0x97,0xd6,0x14,0x09,0x22,0x04] + +v_cvt_scalef32_sr_pk8_fp4_f32 v10, v[20:27], s4, 100.0 +// GFX1250: v_cvt_scalef32_sr_pk8_fp4_f32 v10, v[20:27], s4, 0x42c80000 ; encoding: [0x0a,0x00,0x97,0xd6,0x14,0x09,0xfc,0x03,0x00,0x00,0xc8,0x42] + +v_cvt_scalef32_sr_pk8_fp4_f16 v10, v[20:23], v4, v8 +// GFX1250: v_cvt_scalef32_sr_pk8_fp4_f16 v10, v[20:23], v4, v8 ; encoding: [0x0a,0x00,0xb9,0xd6,0x14,0x09,0x22,0x04] + +v_cvt_scalef32_sr_pk8_fp4_f16 v10, v[20:23], s4, 100.0 +// GFX1250: v_cvt_scalef32_sr_pk8_fp4_f16 v10, v[20:23], s4, 0x42c80000 ; encoding: [0x0a,0x00,0xb9,0xd6,0x14,0x09,0xfc,0x03,0x00,0x00,0xc8,0x42] + +v_cvt_scalef32_sr_pk8_fp4_bf16 v10, v[20:23], v4, v8 +// GFX1250: v_cvt_scalef32_sr_pk8_fp4_bf16 v10, v[20:23], v4, v8 ; encoding: [0x0a,0x00,0xbc,0xd6,0x14,0x09,0x22,0x04] + +v_cvt_scalef32_sr_pk8_fp4_bf16 v10, v[20:23], s4, 100.0 +// GFX1250: v_cvt_scalef32_sr_pk8_fp4_bf16 v10, v[20:23], s4, 0x42c80000 ; encoding: [0x0a,0x00,0xbc,0xd6,0x14,0x09,0xfc,0x03,0x00,0x00,0xc8,0x42] diff --git a/llvm/test/MC/AMDGPU/gfx1250_asm_vop3.s b/llvm/test/MC/AMDGPU/gfx1250_asm_vop3.s index 7f1185906d3f9..2244839a6b161 100644 --- a/llvm/test/MC/AMDGPU/gfx1250_asm_vop3.s +++ b/llvm/test/MC/AMDGPU/gfx1250_asm_vop3.s @@ -940,3 +940,57 @@ v_cvt_scalef32_pk8_fp4_bf16 v10, v[20:23], v8 v_cvt_scalef32_pk8_fp4_bf16 v10, v[20:23], 100.0 // GFX1250: v_cvt_scalef32_pk8_fp4_bf16 v10, v[20:23], 0x42c80000 ; encoding: [0x0a,0x00,0xb8,0xd6,0x14,0xff,0x01,0x00,0x00,0x00,0xc8,0x42] + +v_cvt_scalef32_sr_pk8_fp8_bf16 v[10:11], v[20:23], v4, v8 +// GFX1250: v_cvt_scalef32_sr_pk8_fp8_bf16 v[10:11], v[20:23], v4, v8 ; encoding: [0x0a,0x00,0xc0,0xd6,0x14,0x09,0x22,0x04] + +v_cvt_scalef32_sr_pk8_fp8_bf16 v[10:11], v[20:23], s4, 100.0 +// GFX1250: v_cvt_scalef32_sr_pk8_fp8_bf16 v[10:11], v[20:23], s4, 0x42c80000 ; encoding: [0x0a,0x00,0xc0,0xd6,0x14,0x09,0xfc,0x03,0x00,0x00,0xc8,0x42] + +v_cvt_scalef32_sr_pk8_bf8_bf16 v[10:11], v[20:23], v4, v8 +// GFX1250: v_cvt_scalef32_sr_pk8_bf8_bf16 v[10:11], v[20:23], v4, v8 ; encoding: [0x0a,0x00,0xc2,0xd6,0x14,0x09,0x22,0x04] + +v_cvt_scalef32_sr_pk8_bf8_bf16 v[10:11], v[20:23], s4, 100.0 +// GFX1250: v_cvt_scalef32_sr_pk8_bf8_bf16 v[10:11], v[20:23], s4, 0x42c80000 ; encoding: [0x0a,0x00,0xc2,0xd6,0x14,0x09,0xfc,0x03,0x00,0x00,0xc8,0x42] + +v_cvt_scalef32_sr_pk8_fp8_f16 v[10:11], v[20:23], v4, v8 +// GFX1250: v_cvt_scalef32_sr_pk8_fp8_f16 v[10:11], v[20:23], v4, v8 ; encoding: [0x0a,0x00,0xbf,0xd6,0x14,0x09,0x22,0x04] + +v_cvt_scalef32_sr_pk8_fp8_f16 v[10:11], v[20:23], s4, 100.0 +// GFX1250: v_cvt_scalef32_sr_pk8_fp8_f16 v[10:11], v[20:23], s4, 0x42c80000 ; encoding: [0x0a,0x00,0xbf,0xd6,0x14,0x09,0xfc,0x03,0x00,0x00,0xc8,0x42] + +v_cvt_scalef32_sr_pk8_bf8_f16 v[10:11], v[20:23], v4, v8 +// GFX1250: v_cvt_scalef32_sr_pk8_bf8_f16 v[10:11], v[20:23], v4, v8 ; encoding: [0x0a,0x00,0xc1,0xd6,0x14,0x09,0x22,0x04] + +v_cvt_scalef32_sr_pk8_bf8_f16 v[10:11], v[20:23], s4, 100.0 +// GFX1250: v_cvt_scalef32_sr_pk8_bf8_f16 v[10:11], v[20:23], s4, 0x42c80000 ; encoding: [0x0a,0x00,0xc1,0xd6,0x14,0x09,0xfc,0x03,0x00,0x00,0xc8,0x42] + +v_cvt_scalef32_sr_pk8_fp8_f32 v[10:11], v[20:27], v4, v8 +// GFX1250: v_cvt_scalef32_sr_pk8_fp8_f32 v[10:11], v[20:27], v4, v8 ; encoding: [0x0a,0x00,0x98,0xd6,0x14,0x09,0x22,0x04] + +v_cvt_scalef32_sr_pk8_fp8_f32 v[10:11], v[20:27], s4, 100.0 +// GFX1250: v_cvt_scalef32_sr_pk8_fp8_f32 v[10:11], v[20:27], s4, 0x42c80000 ; encoding: [0x0a,0x00,0x98,0xd6,0x14,0x09,0xfc,0x03,0x00,0x00,0xc8,0x42] + +v_cvt_scalef32_sr_pk8_bf8_f32 v[10:11], v[20:27], v4, v8 +// GFX1250: v_cvt_scalef32_sr_pk8_bf8_f32 v[10:11], v[20:27], v4, v8 ; encoding: [0x0a,0x00,0x99,0xd6,0x14,0x09,0x22,0x04] + +v_cvt_scalef32_sr_pk8_bf8_f32 v[10:11], v[20:27], s4, 100.0 +// GFX1250: v_cvt_scalef32_sr_pk8_bf8_f32 v[10:11], v[20:27], s4, 0x42c80000 ; encoding: [0x0a,0x00,0x99,0xd6,0x14,0x09,0xfc,0x03,0x00,0x00,0xc8,0x42] + +v_cvt_scalef32_sr_pk8_fp4_f32 v10, v[20:27], v4, v8 +// GFX1250: v_cvt_scalef32_sr_pk8_fp4_f32 v10, v[20:27], v4, v8 ; encoding: [0x0a,0x00,0x97,0xd6,0x14,0x09,0x22,0x04] + +v_cvt_scalef32_sr_pk8_fp4_f32 v10, v[20:27], s4, 100.0 +// GFX1250: v_cvt_scalef32_sr_pk8_fp4_f32 v10, v[20:27], s4, 0x42c80000 ; encoding: [0x0a,0x00,0x97,0xd6,0x14,0x09,0xfc,0x03,0x00,0x00,0xc8,0x42] + +v_cvt_scalef32_sr_pk8_fp4_f16 v10, v[20:23], v4, v8 +// GFX1250: v_cvt_scalef32_sr_pk8_fp4_f16 v10, v[20:23], v4, v8 ; encoding: [0x0a,0x00,0xb9,0xd6,0x14,0x09,0x22,0x04] + +v_cvt_scalef32_sr_pk8_fp4_f16 v10, v[20:23], s4, 100.0 +// GFX1250: v_cvt_scalef32_sr_pk8_fp4_f16 v10, v[20:23], s4, 0x42c80000 ; encoding: [0x0a,0x00,0xb9,0xd6,0x14,0x09,0xfc,0x03,0x00,0x00,0xc8,0x42] + +v_cvt_scalef32_sr_pk8_fp4_bf16 v10, v[20:23], v4, v8 +// GFX1250: v_cvt_scalef32_sr_pk8_fp4_bf16 v10, v[20:23], v4, v8 ; encoding: [0x0a,0x00,0xbc,0xd6,0x14,0x09,0x22,0x04] + +v_cvt_scalef32_sr_pk8_fp4_bf16 v10, v[20:23], s4, 100.0 +// GFX1250: v_cvt_scalef32_sr_pk8_fp4_bf16 v10, v[20:23], s4, 0x42c80000 ; encoding: [0x0a,0x00,0xbc,0xd6,0x14,0x09,0xfc,0x03,0x00,0x00,0xc8,0x42] diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_vop3.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_vop3.txt index 53b795844b5d2..0710393cd4762 100644 --- a/llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_vop3.txt +++ b/llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_vop3.txt @@ -991,3 +991,57 @@ 0x0a,0x00,0xb8,0xd6,0x14,0x11,0x02,0x00 # GFX1250: v_cvt_scalef32_pk8_fp4_bf16 v10, v[20:23], v8 ; encoding: [0x0a,0x00,0xb8,0xd6,0x14,0x11,0x02,0x00] + +0x0a,0x00,0xc2,0xd6,0x14,0x09,0xfc,0x03,0x00,0x00,0xc8,0x42 +# GFX1250: v_cvt_scalef32_sr_pk8_bf8_bf16 v[10:11], v[20:23], s4, 0x42c80000 ; encoding: [0x0a,0x00,0xc2,0xd6,0x14,0x09,0xfc,0x03,0x00,0x00,0xc8,0x42] + +0x0a,0x00,0xc2,0xd6,0x14,0x09,0x22,0x04 +# GFX1250: v_cvt_scalef32_sr_pk8_bf8_bf16 v[10:11], v[20:23], v4, v8 ; encoding: [0x0a,0x00,0xc2,0xd6,0x14,0x09,0x22,0x04] + +0x0a,0x00,0xc1,0xd6,0x14,0x09,0xfc,0x03,0x00,0x00,0xc8,0x42 +# GFX1250: v_cvt_scalef32_sr_pk8_bf8_f16 v[10:11], v[20:23], s4, 0x42c80000 ; encoding: [0x0a,0x00,0xc1,0xd6,0x14,0x09,0xfc,0x03,0x00,0x00,0xc8,0x42] + +0x0a,0x00,0xc1,0xd6,0x14,0x09,0x22,0x04 +# GFX1250: v_cvt_scalef32_sr_pk8_bf8_f16 v[10:11], v[20:23], v4, v8 ; encoding: [0x0a,0x00,0xc1,0xd6,0x14,0x09,0x22,0x04] + +0x0a,0x00,0x99,0xd6,0x14,0x09,0xfc,0x03,0x00,0x00,0xc8,0x42 +# GFX1250: v_cvt_scalef32_sr_pk8_bf8_f32 v[10:11], v[20:27], s4, 0x42c80000 ; encoding: [0x0a,0x00,0x99,0xd6,0x14,0x09,0xfc,0x03,0x00,0x00,0xc8,0x42] + +0x0a,0x00,0x99,0xd6,0x14,0x09,0x22,0x04 +# GFX1250: v_cvt_scalef32_sr_pk8_bf8_f32 v[10:11], v[20:27], v4, v8 ; encoding: [0x0a,0x00,0x99,0xd6,0x14,0x09,0x22,0x04] + +0x0a,0x00,0xbc,0xd6,0x14,0x09,0xfc,0x03,0x00,0x00,0xc8,0x42 +# GFX1250: v_cvt_scalef32_sr_pk8_fp4_bf16 v10, v[20:23], s4, 0x42c80000 ; encoding: [0x0a,0x00,0xbc,0xd6,0x14,0x09,0xfc,0x03,0x00,0x00,0xc8,0x42] + +0x0a,0x00,0xbc,0xd6,0x14,0x09,0x22,0x04 +# GFX1250: v_cvt_scalef32_sr_pk8_fp4_bf16 v10, v[20:23], v4, v8 ; encoding: [0x0a,0x00,0xbc,0xd6,0x14,0x09,0x22,0x04] + +0x0a,0x00,0xb9,0xd6,0x14,0x09,0xfc,0x03,0x00,0x00,0xc8,0x42 +# GFX1250: v_cvt_scalef32_sr_pk8_fp4_f16 v10, v[20:23], s4, 0x42c80000 ; encoding: [0x0a,0x00,0xb9,0xd6,0x14,0x09,0xfc,0x03,0x00,0x00,0xc8,0x42] + +0x0a,0x00,0xb9,0xd6,0x14,0x09,0x22,0x04 +# GFX1250: v_cvt_scalef32_sr_pk8_fp4_f16 v10, v[20:23], v4, v8 ; encoding: [0x0a,0x00,0xb9,0xd6,0x14,0x09,0x22,0x04] + +0x0a,0x00,0x97,0xd6,0x14,0x09,0xfc,0x03,0x00,0x00,0xc8,0x42 +# GFX1250: v_cvt_scalef32_sr_pk8_fp4_f32 v10, v[20:27], s4, 0x42c80000 ; encoding: [0x0a,0x00,0x97,0xd6,0x14,0x09,0xfc,0x03,0x00,0x00,0xc8,0x42] + +0x0a,0x00,0x97,0xd6,0x14,0x09,0x22,0x04 +# GFX1250: v_cvt_scalef32_sr_pk8_fp4_f32 v10, v[20:27], v4, v8 ; encoding: [0x0a,0x00,0x97,0xd6,0x14,0x09,0x22,0x04] + +0x0a,0x00,0xc0,0xd6,0x14,0x09,0xfc,0x03,0x00,0x00,0xc8,0x42 +# GFX1250: v_cvt_scalef32_sr_pk8_fp8_bf16 v[10:11], v[20:23], s4, 0x42c80000 ; encoding: [0x0a,0x00,0xc0,0xd6,0x14,0x09,0xfc,0x03,0x00,0x00,0xc8,0x42] + +0x0a,0x00,0xc0,0xd6,0x14,0x09,0x22,0x04 +# GFX1250: v_cvt_scalef32_sr_pk8_fp8_bf16 v[10:11], v[20:23], v4, v8 ; encoding: [0x0a,0x00,0xc0,0xd6,0x14,0x09,0x22,0x04] + +0x0a,0x00,0xbf,0xd6,0x14,0x09,0xfc,0x03,0x00,0x00,0xc8,0x42 +# GFX1250: v_cvt_scalef32_sr_pk8_fp8_f16 v[10:11], v[20:23], s4, 0x42c80000 ; encoding: [0x0a,0x00,0xbf,0xd6,0x14,0x09,0xfc,0x03,0x00,0x00,0xc8,0x42] + +0x0a,0x00,0xbf,0xd6,0x14,0x09,0x22,0x04 +# GFX1250: v_cvt_scalef32_sr_pk8_fp8_f16 v[10:11], v[20:23], v4, v8 ; encoding: [0x0a,0x00,0xbf,0xd6,0x14,0x09,0x22,0x04] + +0x0a,0x00,0x98,0xd6,0x14,0x09,0xfc,0x03,0x00,0x00,0xc8,0x42 +# GFX1250: v_cvt_scalef32_sr_pk8_fp8_f32 v[10:11], v[20:27], s4, 0x42c80000 ; encoding: [0x0a,0x00,0x98,0xd6,0x14,0x09,0xfc,0x03,0x00,0x00,0xc8,0x42] + +0x0a,0x00,0x98,0xd6,0x14,0x09,0x22,0x04 +# GFX1250: v_cvt_scalef32_sr_pk8_fp8_f32 v[10:11], v[20:27], v4, v8 ; encoding: [0x0a,0x00,0x98,0xd6,0x14,0x09,0x22,0x04] _______________________________________________ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits