https://github.com/rampitec updated https://github.com/llvm/llvm-project/pull/151807
>From 41253bc2a507f36d7f313801ddcda7822907e378 Mon Sep 17 00:00:00 2001 From: Stanislav Mekhanoshin <stanislav.mekhanos...@amd.com> Date: Sat, 2 Aug 2025 01:48:31 -0700 Subject: [PATCH] [AMDGPU] v_cvt_scalef32_pk16_* gfx1250 instructions --- clang/include/clang/Basic/BuiltinsAMDGPU.def | 6 + .../CodeGenOpenCL/builtins-amdgcn-gfx1250.cl | 36 +++ llvm/include/llvm/IR/IntrinsicsAMDGPU.td | 6 + .../Target/AMDGPU/AMDGPURegisterBankInfo.cpp | 6 + llvm/lib/Target/AMDGPU/SIInstrInfo.td | 3 + llvm/lib/Target/AMDGPU/VOP3Instructions.td | 12 + .../llvm.amdgcn.cvt.scalef32.pk16.gfx1250.ll | 303 ++++++++++++++++++ llvm/test/MC/AMDGPU/gfx1250_asm_vop3-fake16.s | 36 +++ llvm/test/MC/AMDGPU/gfx1250_asm_vop3.s | 36 +++ .../Disassembler/AMDGPU/gfx1250_dasm_vop3.txt | 36 +++ 10 files changed, 480 insertions(+) create mode 100644 llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.scalef32.pk16.gfx1250.ll diff --git a/clang/include/clang/Basic/BuiltinsAMDGPU.def b/clang/include/clang/Basic/BuiltinsAMDGPU.def index 3773031e187c1..9125315310306 100644 --- a/clang/include/clang/Basic/BuiltinsAMDGPU.def +++ b/clang/include/clang/Basic/BuiltinsAMDGPU.def @@ -731,6 +731,12 @@ TARGET_BUILTIN(__builtin_amdgcn_cvt_scalef32_pk8_bf8_f32, "V2UiV8ff", "nc", "gfx TARGET_BUILTIN(__builtin_amdgcn_cvt_scalef32_pk8_fp4_f32, "UiV8ff", "nc", "gfx1250-insts") TARGET_BUILTIN(__builtin_amdgcn_cvt_scalef32_pk8_fp4_f16, "UiV8hf", "nc", "gfx1250-insts") TARGET_BUILTIN(__builtin_amdgcn_cvt_scalef32_pk8_fp4_bf16, "UiV8yf", "nc", "gfx1250-insts") +TARGET_BUILTIN(__builtin_amdgcn_cvt_scalef32_pk16_fp6_f32, "V3UiV16ff", "nc", "gfx1250-insts") +TARGET_BUILTIN(__builtin_amdgcn_cvt_scalef32_pk16_bf6_f32, "V3UiV16ff", "nc", "gfx1250-insts") +TARGET_BUILTIN(__builtin_amdgcn_cvt_scalef32_pk16_fp6_f16, "V3UiV16hf", "nc", "gfx1250-insts") +TARGET_BUILTIN(__builtin_amdgcn_cvt_scalef32_pk16_bf6_f16, "V3UiV16hf", "nc", "gfx1250-insts") +TARGET_BUILTIN(__builtin_amdgcn_cvt_scalef32_pk16_fp6_bf16, "V3UiV16yf", "nc", "gfx1250-insts") +TARGET_BUILTIN(__builtin_amdgcn_cvt_scalef32_pk16_bf6_bf16, "V3UiV16yf", "nc", "gfx1250-insts") TARGET_BUILTIN(__builtin_amdgcn_cvt_scalef32_sr_pk8_fp8_bf16, "V2UiV8yUif", "nc", "gfx1250-insts") TARGET_BUILTIN(__builtin_amdgcn_cvt_scalef32_sr_pk8_bf8_bf16, "V2UiV8yUif", "nc", "gfx1250-insts") TARGET_BUILTIN(__builtin_amdgcn_cvt_scalef32_sr_pk8_fp8_f16, "V2UiV8hUif", "nc", "gfx1250-insts") diff --git a/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx1250.cl b/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx1250.cl index c25aaf11bb0e1..e50ab77f48c79 100644 --- a/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx1250.cl +++ b/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx1250.cl @@ -787,6 +787,36 @@ void test_cvt_scale_pk(global half8 *outh8, global bfloat8 *outy8, uint2 src2, // CHECK-NEXT: [[TMP34:%.*]] = call i32 @llvm.amdgcn.cvt.scalef32.pk8.fp4.bf16(<8 x bfloat> [[TMP32]], float [[TMP33]]) // CHECK-NEXT: [[TMP35:%.*]] = load ptr addrspace(1), ptr [[OUT1_ADDR_ASCAST]], align 8 // CHECK-NEXT: store i32 [[TMP34]], ptr addrspace(1) [[TMP35]], align 4 +// CHECK-NEXT: [[TMP36:%.*]] = load <16 x bfloat>, ptr [[SRCBF16_ADDR_ASCAST]], align 32 +// CHECK-NEXT: [[TMP37:%.*]] = load float, ptr [[SCALE_ADDR_ASCAST]], align 4 +// CHECK-NEXT: [[TMP38:%.*]] = call <3 x i32> @llvm.amdgcn.cvt.scalef32.pk16.bf6.bf16(<16 x bfloat> [[TMP36]], float [[TMP37]]) +// CHECK-NEXT: [[TMP39:%.*]] = load ptr addrspace(1), ptr [[OUT3_ADDR_ASCAST]], align 8 +// CHECK-NEXT: store <3 x i32> [[TMP38]], ptr addrspace(1) [[TMP39]], align 16 +// CHECK-NEXT: [[TMP40:%.*]] = load <16 x half>, ptr [[SRCH16_ADDR_ASCAST]], align 32 +// CHECK-NEXT: [[TMP41:%.*]] = load float, ptr [[SCALE_ADDR_ASCAST]], align 4 +// CHECK-NEXT: [[TMP42:%.*]] = call <3 x i32> @llvm.amdgcn.cvt.scalef32.pk16.bf6.f16(<16 x half> [[TMP40]], float [[TMP41]]) +// CHECK-NEXT: [[TMP43:%.*]] = load ptr addrspace(1), ptr [[OUT3_ADDR_ASCAST]], align 8 +// CHECK-NEXT: store <3 x i32> [[TMP42]], ptr addrspace(1) [[TMP43]], align 16 +// CHECK-NEXT: [[TMP44:%.*]] = load <16 x bfloat>, ptr [[SRCBF16_ADDR_ASCAST]], align 32 +// CHECK-NEXT: [[TMP45:%.*]] = load float, ptr [[SCALE_ADDR_ASCAST]], align 4 +// CHECK-NEXT: [[TMP46:%.*]] = call <3 x i32> @llvm.amdgcn.cvt.scalef32.pk16.fp6.bf16(<16 x bfloat> [[TMP44]], float [[TMP45]]) +// CHECK-NEXT: [[TMP47:%.*]] = load ptr addrspace(1), ptr [[OUT3_ADDR_ASCAST]], align 8 +// CHECK-NEXT: store <3 x i32> [[TMP46]], ptr addrspace(1) [[TMP47]], align 16 +// CHECK-NEXT: [[TMP48:%.*]] = load <16 x half>, ptr [[SRCH16_ADDR_ASCAST]], align 32 +// CHECK-NEXT: [[TMP49:%.*]] = load float, ptr [[SCALE_ADDR_ASCAST]], align 4 +// CHECK-NEXT: [[TMP50:%.*]] = call <3 x i32> @llvm.amdgcn.cvt.scalef32.pk16.fp6.f16(<16 x half> [[TMP48]], float [[TMP49]]) +// CHECK-NEXT: [[TMP51:%.*]] = load ptr addrspace(1), ptr [[OUT3_ADDR_ASCAST]], align 8 +// CHECK-NEXT: store <3 x i32> [[TMP50]], ptr addrspace(1) [[TMP51]], align 16 +// CHECK-NEXT: [[TMP52:%.*]] = load <16 x float>, ptr [[SRCF16_ADDR_ASCAST]], align 64 +// CHECK-NEXT: [[TMP53:%.*]] = load float, ptr [[SCALE_ADDR_ASCAST]], align 4 +// CHECK-NEXT: [[TMP54:%.*]] = call <3 x i32> @llvm.amdgcn.cvt.scalef32.pk16.bf6.f32(<16 x float> [[TMP52]], float [[TMP53]]) +// CHECK-NEXT: [[TMP55:%.*]] = load ptr addrspace(1), ptr [[OUT3_ADDR_ASCAST]], align 8 +// CHECK-NEXT: store <3 x i32> [[TMP54]], ptr addrspace(1) [[TMP55]], align 16 +// CHECK-NEXT: [[TMP56:%.*]] = load <16 x float>, ptr [[SRCF16_ADDR_ASCAST]], align 64 +// CHECK-NEXT: [[TMP57:%.*]] = load float, ptr [[SCALE_ADDR_ASCAST]], align 4 +// CHECK-NEXT: [[TMP58:%.*]] = call <3 x i32> @llvm.amdgcn.cvt.scalef32.pk16.fp6.f32(<16 x float> [[TMP56]], float [[TMP57]]) +// CHECK-NEXT: [[TMP59:%.*]] = load ptr addrspace(1), ptr [[OUT3_ADDR_ASCAST]], align 8 +// CHECK-NEXT: store <3 x i32> [[TMP58]], ptr addrspace(1) [[TMP59]], align 16 // CHECK-NEXT: ret void // void test_cvt_scalef32_pk(global uint2 *out2, bfloat8 srcbf8, half8 srch8, float8 srcf8, @@ -802,6 +832,12 @@ void test_cvt_scalef32_pk(global uint2 *out2, bfloat8 srcbf8, half8 srch8, float *out1 = __builtin_amdgcn_cvt_scalef32_pk8_fp4_f32(srcf8, scale); *out1 = __builtin_amdgcn_cvt_scalef32_pk8_fp4_f16(srch8, scale); *out1 = __builtin_amdgcn_cvt_scalef32_pk8_fp4_bf16(srcbf8, scale); + *out3 = __builtin_amdgcn_cvt_scalef32_pk16_bf6_bf16(srcbf16, scale); + *out3 = __builtin_amdgcn_cvt_scalef32_pk16_bf6_f16(srch16, scale); + *out3 = __builtin_amdgcn_cvt_scalef32_pk16_fp6_bf16(srcbf16, scale); + *out3 = __builtin_amdgcn_cvt_scalef32_pk16_fp6_f16(srch16, scale); + *out3 = __builtin_amdgcn_cvt_scalef32_pk16_bf6_f32(srcf16, scale); + *out3 = __builtin_amdgcn_cvt_scalef32_pk16_fp6_f32(srcf16, scale); } // CHECK-LABEL: @test_cvt_scalef32_sr_pk( diff --git a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td index fcb4b02a594a0..af06fe7a09d7e 100644 --- a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td +++ b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td @@ -679,6 +679,12 @@ def int_amdgcn_cvt_scalef32_pk8_bf8_f32 : AMDGPUCvtScaleF32Intrinsic<llvm_v2i3 def int_amdgcn_cvt_scalef32_pk8_fp4_f32 : AMDGPUCvtScaleF32Intrinsic<llvm_i32_ty, llvm_v8f32_ty, "cvt_scalef32_pk8_fp4_f32">; def int_amdgcn_cvt_scalef32_pk8_fp4_f16 : AMDGPUCvtScaleF32Intrinsic<llvm_i32_ty, llvm_v8f16_ty, "cvt_scalef32_pk8_fp4_f16">; def int_amdgcn_cvt_scalef32_pk8_fp4_bf16 : AMDGPUCvtScaleF32Intrinsic<llvm_i32_ty, llvm_v8bf16_ty, "cvt_scalef32_pk8_fp4_bf16">; +def int_amdgcn_cvt_scalef32_pk16_fp6_f32 : AMDGPUCvtScaleF32Intrinsic<llvm_v3i32_ty, llvm_v16f32_ty, "cvt_scalef32_pk16_fp6_f32">; +def int_amdgcn_cvt_scalef32_pk16_bf6_f32 : AMDGPUCvtScaleF32Intrinsic<llvm_v3i32_ty, llvm_v16f32_ty, "cvt_scalef32_pk16_bf6_f32">; +def int_amdgcn_cvt_scalef32_pk16_fp6_f16 : AMDGPUCvtScaleF32Intrinsic<llvm_v3i32_ty, llvm_v16f16_ty, "cvt_scalef32_pk16_fp6_f16">; +def int_amdgcn_cvt_scalef32_pk16_bf6_f16 : AMDGPUCvtScaleF32Intrinsic<llvm_v3i32_ty, llvm_v16f16_ty, "cvt_scalef32_pk16_bf6_f16">; +def int_amdgcn_cvt_scalef32_pk16_fp6_bf16 : AMDGPUCvtScaleF32Intrinsic<llvm_v3i32_ty, llvm_v16bf16_ty, "cvt_scalef32_pk16_fp6_bf16">; +def int_amdgcn_cvt_scalef32_pk16_bf6_bf16 : AMDGPUCvtScaleF32Intrinsic<llvm_v3i32_ty, llvm_v16bf16_ty, "cvt_scalef32_pk16_bf6_bf16">; def int_amdgcn_cvt_scalef32_sr_pk32_fp6_f32 : AMDGPUCvtScaleF32SRIntrinsic<llvm_v6i32_ty, llvm_v32f32_ty, "cvt_scalef32_sr_pk32_fp6_f32">; def int_amdgcn_cvt_scalef32_sr_pk32_bf6_f32 : AMDGPUCvtScaleF32SRIntrinsic<llvm_v6i32_ty, llvm_v32f32_ty, "cvt_scalef32_sr_pk32_bf6_f32">; diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp index ff64613402776..ddb1e1081da8a 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp @@ -4618,6 +4618,12 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const { case Intrinsic::amdgcn_cvt_scalef32_pk8_fp4_f32: case Intrinsic::amdgcn_cvt_scalef32_pk8_fp4_f16: case Intrinsic::amdgcn_cvt_scalef32_pk8_fp4_bf16: + case Intrinsic::amdgcn_cvt_scalef32_pk16_fp6_f32: + case Intrinsic::amdgcn_cvt_scalef32_pk16_bf6_f32: + case Intrinsic::amdgcn_cvt_scalef32_pk16_fp6_f16: + case Intrinsic::amdgcn_cvt_scalef32_pk16_bf6_f16: + case Intrinsic::amdgcn_cvt_scalef32_pk16_fp6_bf16: + case Intrinsic::amdgcn_cvt_scalef32_pk16_bf6_bf16: case Intrinsic::amdgcn_cvt_scalef32_sr_pk8_fp8_bf16: case Intrinsic::amdgcn_cvt_scalef32_sr_pk8_bf8_bf16: case Intrinsic::amdgcn_cvt_scalef32_sr_pk8_fp8_f16: diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.td b/llvm/lib/Target/AMDGPU/SIInstrInfo.td index 392bf42d3f377..350a31885e629 100644 --- a/llvm/lib/Target/AMDGPU/SIInstrInfo.td +++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.td @@ -2937,6 +2937,9 @@ def VOP_V2BF16_F32_F32_I32 : VOPProfile <[v2bf16, f32, f32, i32]>; def VOP_V2F16_F32_F32_I32 : VOPProfile <[v2f16, f32, f32, i32]>; def VOP_V6I32_V32F16_F32 : VOPProfile<[v6i32, v32f16, f32, untyped]>; def VOP_V6I32_V32BF16_F32 : VOPProfile<[v6i32, v32bf16, f32, untyped]>; +def VOP_V3I32_V16F16_F32 : VOPProfile<[v3i32, v16f16, f32, untyped]>; +def VOP_V3I32_V16BF16_F32 : VOPProfile<[v3i32, v16bf16, f32, untyped]>; +def VOP_V3I32_V16F32_F32 : VOPProfile<[v3i32, v16f32, f32, untyped]>; def VOP_V6I32_V16F32_V16F32_F32 : VOPProfile<[v6i32, v16f32, v16f32, f32]>; def VOP_V2F16_I32_F32 : VOPProfile<[v2f16, i32, f32, untyped]>; def VOP_V2I16_F32_F32_F32 : VOPProfile<[v2i16, f32, f32, f32]>; diff --git a/llvm/lib/Target/AMDGPU/VOP3Instructions.td b/llvm/lib/Target/AMDGPU/VOP3Instructions.td index 9e0f4f131abf9..100bb2de9abc7 100644 --- a/llvm/lib/Target/AMDGPU/VOP3Instructions.td +++ b/llvm/lib/Target/AMDGPU/VOP3Instructions.td @@ -1802,6 +1802,12 @@ let SubtargetPredicate = isGFX1250Plus in { defm V_CVT_SCALEF32_PK8_FP4_F16 : VOP3Inst<"v_cvt_scalef32_pk8_fp4_f16", VOP3_CVT_SCALEF32_PK_F864_Profile<VOP_I32_V8F16_F32>, int_amdgcn_cvt_scalef32_pk8_fp4_f16>; defm V_CVT_SCALEF32_PK8_FP4_BF16 : VOP3Inst<"v_cvt_scalef32_pk8_fp4_bf16", VOP3_CVT_SCALEF32_PK_F864_Profile<VOP_I32_V8BF16_F32>, int_amdgcn_cvt_scalef32_pk8_fp4_bf16>; } // End WaveSizePredicate = isWave32 + defm V_CVT_SCALEF32_PK16_FP6_F32 : VOP3Inst<"v_cvt_scalef32_pk16_fp6_f32", VOP3_CVT_SCALEF32_PK_F864_Profile<VOP_V3I32_V16F32_F32>, int_amdgcn_cvt_scalef32_pk16_fp6_f32>; + defm V_CVT_SCALEF32_PK16_BF6_F32 : VOP3Inst<"v_cvt_scalef32_pk16_bf6_f32", VOP3_CVT_SCALEF32_PK_F864_Profile<VOP_V3I32_V16F32_F32>, int_amdgcn_cvt_scalef32_pk16_bf6_f32>; + defm V_CVT_SCALEF32_PK16_FP6_F16 : VOP3Inst<"v_cvt_scalef32_pk16_fp6_f16", VOP3_CVT_SCALEF32_PK_F864_Profile<VOP_V3I32_V16F16_F32>, int_amdgcn_cvt_scalef32_pk16_fp6_f16>; + defm V_CVT_SCALEF32_PK16_BF6_F16 : VOP3Inst<"v_cvt_scalef32_pk16_bf6_f16", VOP3_CVT_SCALEF32_PK_F864_Profile<VOP_V3I32_V16F16_F32>, int_amdgcn_cvt_scalef32_pk16_bf6_f16>; + defm V_CVT_SCALEF32_PK16_FP6_BF16 : VOP3Inst<"v_cvt_scalef32_pk16_fp6_bf16", VOP3_CVT_SCALEF32_PK_F864_Profile<VOP_V3I32_V16BF16_F32>, int_amdgcn_cvt_scalef32_pk16_fp6_bf16>; + defm V_CVT_SCALEF32_PK16_BF6_BF16 : VOP3Inst<"v_cvt_scalef32_pk16_bf6_bf16", VOP3_CVT_SCALEF32_PK_F864_Profile<VOP_V3I32_V16BF16_F32>, int_amdgcn_cvt_scalef32_pk16_bf6_bf16>; let WaveSizePredicate = isWave32 in { defm V_CVT_SCALEF32_SR_PK8_FP8_BF16 : VOP3Inst<"v_cvt_scalef32_sr_pk8_fp8_bf16", VOP3_CVT_SCALEF32_PK_F864_Profile<VOP_V2I32_V8BF16_I32_F32>, int_amdgcn_cvt_scalef32_sr_pk8_fp8_bf16>; @@ -2260,6 +2266,12 @@ defm V_CVT_SCALE_PK16_F32_FP6 : VOP3Only_ScaleSel_Real_gfx1250<0x2c9>; defm V_CVT_SCALE_PK16_F16_BF6 : VOP3Only_ScaleSel_Real_gfx1250<0x2ca>; defm V_CVT_SCALE_PK16_BF16_BF6 : VOP3Only_ScaleSel_Real_gfx1250<0x2cb>; defm V_CVT_SCALE_PK16_F32_BF6 : VOP3Only_ScaleSel_Real_gfx1250<0x2cc>; +defm V_CVT_SCALEF32_PK16_FP6_F32 : VOP3Only_Real_Base_gfx1250<0x2cd>; +defm V_CVT_SCALEF32_PK16_BF6_F32 : VOP3Only_Real_Base_gfx1250<0x2ce>; +defm V_CVT_SCALEF32_PK16_FP6_F16 : VOP3Only_Real_Base_gfx1250<0x2cf>; +defm V_CVT_SCALEF32_PK16_BF6_F16 : VOP3Only_Real_Base_gfx1250<0x2d0>; +defm V_CVT_SCALEF32_PK16_FP6_BF16 : VOP3Only_Real_Base_gfx1250<0x2d1>; +defm V_CVT_SCALEF32_PK16_BF6_BF16 : VOP3Only_Real_Base_gfx1250<0x2d2>; defm V_CVT_SCALEF32_SR_PK8_FP4_F32 : VOP3Only_Real_Base_gfx1250<0x297>; defm V_CVT_SCALEF32_SR_PK8_FP8_F32 : VOP3Only_Real_Base_gfx1250<0x298>; defm V_CVT_SCALEF32_SR_PK8_BF8_F32 : VOP3Only_Real_Base_gfx1250<0x299>; diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.scalef32.pk16.gfx1250.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.scalef32.pk16.gfx1250.ll new file mode 100644 index 0000000000000..dfb908930750f --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.scalef32.pk16.gfx1250.ll @@ -0,0 +1,303 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4 +; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1250 < %s | FileCheck -check-prefix=GFX1250-SDAG %s +; RUN: llc -global-isel=1 -global-isel-abort=2 -mtriple=amdgcn -mcpu=gfx1250 < %s | FileCheck -check-prefix=GFX1250-GISEL %s + +declare <3 x i32> @llvm.amdgcn.cvt.scalef32.pk16.bf6.f32(<16 x float> %src, float %scale) +declare <3 x i32> @llvm.amdgcn.cvt.scalef32.pk16.fp6.f32(<16 x float> %src, float %scale) +declare <3 x i32> @llvm.amdgcn.cvt.scalef32.pk16.bf6.bf16(<16 x bfloat> %src, float %scale) +declare <3 x i32> @llvm.amdgcn.cvt.scalef32.pk16.bf6.f16(<16 x half> %src, float %scale) +declare <3 x i32> @llvm.amdgcn.cvt.scalef32.pk16.fp6.bf16(<16 x bfloat> %src, float %scale) +declare <3 x i32> @llvm.amdgcn.cvt.scalef32.pk16.fp6.f16(<16 x half> %src, float %scale) + +define amdgpu_ps void @test_scalef32_pk16_bf6_f32_vv(<16 x float> %src, float %scale, ptr addrspace(1) %out) { +; GFX1210-SDAG-LABEL: test_scalef32_pk16_bf6_f32_vv: +; GFX1250-SDAG-LABEL: test_scalef32_pk16_bf6_f32_vv: +; GFX1250-SDAG: ; %bb.0: +; GFX1250-SDAG-NEXT: v_dual_mov_b32 v23, v18 :: v_dual_mov_b32 v22, v17 +; GFX1250-SDAG-NEXT: v_cvt_scalef32_pk16_bf6_f32 v[18:20], v[0:15], v16 +; GFX1250-SDAG-NEXT: global_store_b96 v[22:23], v[18:20], off +; GFX1250-SDAG-NEXT: s_endpgm +; +; GFX1250-GISEL-LABEL: test_scalef32_pk16_bf6_f32_vv: +; GFX1250-GISEL: ; %bb.0: +; GFX1250-GISEL-NEXT: v_dual_mov_b32 v22, v17 :: v_dual_mov_b32 v23, v18 +; GFX1250-GISEL-NEXT: v_cvt_scalef32_pk16_bf6_f32 v[18:20], v[0:15], v16 +; GFX1250-GISEL-NEXT: global_store_b96 v[22:23], v[18:20], off +; GFX1250-GISEL-NEXT: s_endpgm + %cvt = tail call <3 x i32> @llvm.amdgcn.cvt.scalef32.pk16.bf6.f32(<16 x float> %src, float %scale) + store <3 x i32> %cvt, ptr addrspace(1) %out, align 8 + ret void +} + +define amdgpu_ps void @test_scalef32_pk16_bf6_f32_sl(<16 x float> inreg %src, ptr addrspace(1) %out) { +; GFX1250-SDAG-LABEL: test_scalef32_pk16_bf6_f32_sl: +; GFX1250-SDAG: ; %bb.0: +; GFX1250-SDAG-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1 +; GFX1250-SDAG-NEXT: v_dual_mov_b32 v4, s2 :: v_dual_mov_b32 v5, s3 +; GFX1250-SDAG-NEXT: v_dual_mov_b32 v6, s4 :: v_dual_mov_b32 v7, s5 +; GFX1250-SDAG-NEXT: v_dual_mov_b32 v8, s6 :: v_dual_mov_b32 v9, s7 +; GFX1250-SDAG-NEXT: v_dual_mov_b32 v10, s8 :: v_dual_mov_b32 v11, s9 +; GFX1250-SDAG-NEXT: v_dual_mov_b32 v12, s10 :: v_dual_mov_b32 v13, s11 +; GFX1250-SDAG-NEXT: v_dual_mov_b32 v14, s12 :: v_dual_mov_b32 v15, s13 +; GFX1250-SDAG-NEXT: v_dual_mov_b32 v16, s14 :: v_dual_mov_b32 v17, s15 +; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-SDAG-NEXT: v_cvt_scalef32_pk16_bf6_f32 v[18:20], v[2:17], 0x42c80000 +; GFX1250-SDAG-NEXT: global_store_b96 v[0:1], v[18:20], off +; GFX1250-SDAG-NEXT: s_endpgm +; +; GFX1250-GISEL-LABEL: test_scalef32_pk16_bf6_f32_sl: +; GFX1250-GISEL: ; %bb.0: +; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[16:17], s[14:15] +; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[14:15], s[12:13] +; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[12:13], s[10:11] +; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[10:11], s[8:9] +; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[8:9], s[6:7] +; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[6:7], s[4:5] +; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[2:3] +; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[0:1] +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-GISEL-NEXT: v_cvt_scalef32_pk16_bf6_f32 v[18:20], v[2:17], 0x42c80000 +; GFX1250-GISEL-NEXT: global_store_b96 v[0:1], v[18:20], off +; GFX1250-GISEL-NEXT: s_endpgm + %cvt = tail call <3 x i32> @llvm.amdgcn.cvt.scalef32.pk16.bf6.f32(<16 x float> %src, float 100.0) + store <3 x i32> %cvt, ptr addrspace(1) %out, align 8 + ret void +} + +define amdgpu_ps void @test_scalef32_pk16_fp6_f32_vv(<16 x float> %src, float %scale, ptr addrspace(1) %out) { +; GFX1250-SDAG-LABEL: test_scalef32_pk16_fp6_f32_vv: +; GFX1250-SDAG: ; %bb.0: +; GFX1250-SDAG-NEXT: v_dual_mov_b32 v23, v18 :: v_dual_mov_b32 v22, v17 +; GFX1250-SDAG-NEXT: v_cvt_scalef32_pk16_fp6_f32 v[18:20], v[0:15], v16 +; GFX1250-SDAG-NEXT: global_store_b96 v[22:23], v[18:20], off +; GFX1250-SDAG-NEXT: s_endpgm +; +; GFX1250-GISEL-LABEL: test_scalef32_pk16_fp6_f32_vv: +; GFX1250-GISEL: ; %bb.0: +; GFX1250-GISEL-NEXT: v_dual_mov_b32 v22, v17 :: v_dual_mov_b32 v23, v18 +; GFX1250-GISEL-NEXT: v_cvt_scalef32_pk16_fp6_f32 v[18:20], v[0:15], v16 +; GFX1250-GISEL-NEXT: global_store_b96 v[22:23], v[18:20], off +; GFX1250-GISEL-NEXT: s_endpgm + %cvt = tail call <3 x i32> @llvm.amdgcn.cvt.scalef32.pk16.fp6.f32(<16 x float> %src, float %scale) + store <3 x i32> %cvt, ptr addrspace(1) %out, align 8 + ret void +} + +define amdgpu_ps void @test_scalef32_pk16_fp6_f32_sl(<16 x float> inreg %src, ptr addrspace(1) %out) { +; GFX1250-SDAG-LABEL: test_scalef32_pk16_fp6_f32_sl: +; GFX1250-SDAG: ; %bb.0: +; GFX1250-SDAG-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1 +; GFX1250-SDAG-NEXT: v_dual_mov_b32 v4, s2 :: v_dual_mov_b32 v5, s3 +; GFX1250-SDAG-NEXT: v_dual_mov_b32 v6, s4 :: v_dual_mov_b32 v7, s5 +; GFX1250-SDAG-NEXT: v_dual_mov_b32 v8, s6 :: v_dual_mov_b32 v9, s7 +; GFX1250-SDAG-NEXT: v_dual_mov_b32 v10, s8 :: v_dual_mov_b32 v11, s9 +; GFX1250-SDAG-NEXT: v_dual_mov_b32 v12, s10 :: v_dual_mov_b32 v13, s11 +; GFX1250-SDAG-NEXT: v_dual_mov_b32 v14, s12 :: v_dual_mov_b32 v15, s13 +; GFX1250-SDAG-NEXT: v_dual_mov_b32 v16, s14 :: v_dual_mov_b32 v17, s15 +; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-SDAG-NEXT: v_cvt_scalef32_pk16_fp6_f32 v[18:20], v[2:17], 0x42c80000 +; GFX1250-SDAG-NEXT: global_store_b96 v[0:1], v[18:20], off +; GFX1250-SDAG-NEXT: s_endpgm +; +; GFX1250-GISEL-LABEL: test_scalef32_pk16_fp6_f32_sl: +; GFX1250-GISEL: ; %bb.0: +; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[16:17], s[14:15] +; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[14:15], s[12:13] +; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[12:13], s[10:11] +; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[10:11], s[8:9] +; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[8:9], s[6:7] +; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[6:7], s[4:5] +; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[2:3] +; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[0:1] +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-GISEL-NEXT: v_cvt_scalef32_pk16_fp6_f32 v[18:20], v[2:17], 0x42c80000 +; GFX1250-GISEL-NEXT: global_store_b96 v[0:1], v[18:20], off +; GFX1250-GISEL-NEXT: s_endpgm + %cvt = tail call <3 x i32> @llvm.amdgcn.cvt.scalef32.pk16.fp6.f32(<16 x float> %src, float 100.0) + store <3 x i32> %cvt, ptr addrspace(1) %out, align 8 + ret void +} + +define amdgpu_ps void @test_scalef32_pk16_bf6_bf16_vv(<16 x bfloat> %src, float %scale, ptr addrspace(1) %out) { +; GFX1250-SDAG-LABEL: test_scalef32_pk16_bf6_bf16_vv: +; GFX1250-SDAG: ; %bb.0: +; GFX1250-SDAG-NEXT: v_dual_mov_b32 v15, v10 :: v_dual_mov_b32 v14, v9 +; GFX1250-SDAG-NEXT: v_cvt_scalef32_pk16_bf6_bf16 v[10:12], v[0:7], v8 +; GFX1250-SDAG-NEXT: global_store_b96 v[14:15], v[10:12], off +; GFX1250-SDAG-NEXT: s_endpgm +; +; GFX1250-GISEL-LABEL: test_scalef32_pk16_bf6_bf16_vv: +; GFX1250-GISEL: ; %bb.0: +; GFX1250-GISEL-NEXT: v_dual_mov_b32 v15, v10 :: v_dual_mov_b32 v14, v9 +; GFX1250-GISEL-NEXT: v_cvt_scalef32_pk16_bf6_bf16 v[10:12], v[0:7], v8 +; GFX1250-GISEL-NEXT: global_store_b96 v[14:15], v[10:12], off +; GFX1250-GISEL-NEXT: s_endpgm + %cvt = tail call <3 x i32> @llvm.amdgcn.cvt.scalef32.pk16.bf6.bf16(<16 x bfloat> %src, float %scale) + store <3 x i32> %cvt, ptr addrspace(1) %out, align 8 + ret void +} + +define amdgpu_ps void @test_scalef32_pk16_bf6_bf16_sl(<16 x bfloat> inreg %src, ptr addrspace(1) %out) { +; GFX1250-SDAG-LABEL: test_scalef32_pk16_bf6_bf16_sl: +; GFX1250-SDAG: ; %bb.0: +; GFX1250-SDAG-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1 +; GFX1250-SDAG-NEXT: v_dual_mov_b32 v4, s2 :: v_dual_mov_b32 v5, s3 +; GFX1250-SDAG-NEXT: v_dual_mov_b32 v6, s4 :: v_dual_mov_b32 v7, s5 +; GFX1250-SDAG-NEXT: v_dual_mov_b32 v8, s6 :: v_dual_mov_b32 v9, s7 +; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-SDAG-NEXT: v_cvt_scalef32_pk16_bf6_bf16 v[10:12], v[2:9], 0x42c80000 +; GFX1250-SDAG-NEXT: global_store_b96 v[0:1], v[10:12], off +; GFX1250-SDAG-NEXT: s_endpgm +; +; GFX1250-GISEL-LABEL: test_scalef32_pk16_bf6_bf16_sl: +; GFX1250-GISEL: ; %bb.0: +; GFX1250-GISEL-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1 +; GFX1250-GISEL-NEXT: v_dual_mov_b32 v4, s2 :: v_dual_mov_b32 v5, s3 +; GFX1250-GISEL-NEXT: v_dual_mov_b32 v6, s4 :: v_dual_mov_b32 v7, s5 +; GFX1250-GISEL-NEXT: v_dual_mov_b32 v8, s6 :: v_dual_mov_b32 v9, s7 +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-GISEL-NEXT: v_cvt_scalef32_pk16_bf6_bf16 v[10:12], v[2:9], 0x42c80000 +; GFX1250-GISEL-NEXT: global_store_b96 v[0:1], v[10:12], off +; GFX1250-GISEL-NEXT: s_endpgm + %cvt = tail call <3 x i32> @llvm.amdgcn.cvt.scalef32.pk16.bf6.bf16(<16 x bfloat> %src, float 100.0) + store <3 x i32> %cvt, ptr addrspace(1) %out, align 8 + ret void +} + +define amdgpu_ps void @test_scalef32_pk16_bf6_f16_vv(<16 x half> %src, float %scale, ptr addrspace(1) %out) { +; GFX1250-SDAG-LABEL: test_scalef32_pk16_bf6_f16_vv: +; GFX1250-SDAG: ; %bb.0: +; GFX1250-SDAG-NEXT: v_dual_mov_b32 v15, v10 :: v_dual_mov_b32 v14, v9 +; GFX1250-SDAG-NEXT: v_cvt_scalef32_pk16_bf6_f16 v[10:12], v[0:7], v8 +; GFX1250-SDAG-NEXT: global_store_b96 v[14:15], v[10:12], off +; GFX1250-SDAG-NEXT: s_endpgm +; +; GFX1250-GISEL-LABEL: test_scalef32_pk16_bf6_f16_vv: +; GFX1250-GISEL: ; %bb.0: +; GFX1250-GISEL-NEXT: v_dual_mov_b32 v14, v9 :: v_dual_mov_b32 v15, v10 +; GFX1250-GISEL-NEXT: v_cvt_scalef32_pk16_bf6_f16 v[10:12], v[0:7], v8 +; GFX1250-GISEL-NEXT: global_store_b96 v[14:15], v[10:12], off +; GFX1250-GISEL-NEXT: s_endpgm + %cvt = tail call <3 x i32> @llvm.amdgcn.cvt.scalef32.pk16.bf6.f16(<16 x half> %src, float %scale) + store <3 x i32> %cvt, ptr addrspace(1) %out, align 8 + ret void +} + +define amdgpu_ps void @test_scalef32_pk16_bf6_f16_sl(<16 x half> inreg %src, ptr addrspace(1) %out) { +; GFX1250-SDAG-LABEL: test_scalef32_pk16_bf6_f16_sl: +; GFX1250-SDAG: ; %bb.0: +; GFX1250-SDAG-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1 +; GFX1250-SDAG-NEXT: v_dual_mov_b32 v4, s2 :: v_dual_mov_b32 v5, s3 +; GFX1250-SDAG-NEXT: v_dual_mov_b32 v6, s4 :: v_dual_mov_b32 v7, s5 +; GFX1250-SDAG-NEXT: v_dual_mov_b32 v8, s6 :: v_dual_mov_b32 v9, s7 +; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-SDAG-NEXT: v_cvt_scalef32_pk16_bf6_f16 v[10:12], v[2:9], 0x42c80000 +; GFX1250-SDAG-NEXT: global_store_b96 v[0:1], v[10:12], off +; GFX1250-SDAG-NEXT: s_endpgm +; +; GFX1250-GISEL-LABEL: test_scalef32_pk16_bf6_f16_sl: +; GFX1250-GISEL: ; %bb.0: +; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[8:9], s[6:7] +; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[6:7], s[4:5] +; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[2:3] +; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[0:1] +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-GISEL-NEXT: v_cvt_scalef32_pk16_bf6_f16 v[10:12], v[2:9], 0x42c80000 +; GFX1250-GISEL-NEXT: global_store_b96 v[0:1], v[10:12], off +; GFX1250-GISEL-NEXT: s_endpgm + %cvt = tail call <3 x i32> @llvm.amdgcn.cvt.scalef32.pk16.bf6.f16(<16 x half> %src, float 100.0) + store <3 x i32> %cvt, ptr addrspace(1) %out, align 8 + ret void +} + +define amdgpu_ps void @test_scalef32_pk16_fp6_bf16_vv(<16 x bfloat> %src, float %scale, ptr addrspace(1) %out) { +; GFX1250-SDAG-LABEL: test_scalef32_pk16_fp6_bf16_vv: +; GFX1250-SDAG: ; %bb.0: +; GFX1250-SDAG-NEXT: v_dual_mov_b32 v15, v10 :: v_dual_mov_b32 v14, v9 +; GFX1250-SDAG-NEXT: v_cvt_scalef32_pk16_fp6_bf16 v[10:12], v[0:7], v8 +; GFX1250-SDAG-NEXT: global_store_b96 v[14:15], v[10:12], off +; GFX1250-SDAG-NEXT: s_endpgm +; +; GFX1250-GISEL-LABEL: test_scalef32_pk16_fp6_bf16_vv: +; GFX1250-GISEL: ; %bb.0: +; GFX1250-GISEL-NEXT: v_dual_mov_b32 v15, v10 :: v_dual_mov_b32 v14, v9 +; GFX1250-GISEL-NEXT: v_cvt_scalef32_pk16_fp6_bf16 v[10:12], v[0:7], v8 +; GFX1250-GISEL-NEXT: global_store_b96 v[14:15], v[10:12], off +; GFX1250-GISEL-NEXT: s_endpgm + %cvt = tail call <3 x i32> @llvm.amdgcn.cvt.scalef32.pk16.fp6.bf16(<16 x bfloat> %src, float %scale) + store <3 x i32> %cvt, ptr addrspace(1) %out, align 8 + ret void +} + +define amdgpu_ps void @test_scalef32_pk16_fp6_bf16_sl(<16 x bfloat> inreg %src, ptr addrspace(1) %out) { +; GFX1250-SDAG-LABEL: test_scalef32_pk16_fp6_bf16_sl: +; GFX1250-SDAG: ; %bb.0: +; GFX1250-SDAG-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1 +; GFX1250-SDAG-NEXT: v_dual_mov_b32 v4, s2 :: v_dual_mov_b32 v5, s3 +; GFX1250-SDAG-NEXT: v_dual_mov_b32 v6, s4 :: v_dual_mov_b32 v7, s5 +; GFX1250-SDAG-NEXT: v_dual_mov_b32 v8, s6 :: v_dual_mov_b32 v9, s7 +; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-SDAG-NEXT: v_cvt_scalef32_pk16_fp6_bf16 v[10:12], v[2:9], 0x42c80000 +; GFX1250-SDAG-NEXT: global_store_b96 v[0:1], v[10:12], off +; GFX1250-SDAG-NEXT: s_endpgm +; +; GFX1250-GISEL-LABEL: test_scalef32_pk16_fp6_bf16_sl: +; GFX1250-GISEL: ; %bb.0: +; GFX1250-GISEL-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1 +; GFX1250-GISEL-NEXT: v_dual_mov_b32 v4, s2 :: v_dual_mov_b32 v5, s3 +; GFX1250-GISEL-NEXT: v_dual_mov_b32 v6, s4 :: v_dual_mov_b32 v7, s5 +; GFX1250-GISEL-NEXT: v_dual_mov_b32 v8, s6 :: v_dual_mov_b32 v9, s7 +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-GISEL-NEXT: v_cvt_scalef32_pk16_fp6_bf16 v[10:12], v[2:9], 0x42c80000 +; GFX1250-GISEL-NEXT: global_store_b96 v[0:1], v[10:12], off +; GFX1250-GISEL-NEXT: s_endpgm + %cvt = tail call <3 x i32> @llvm.amdgcn.cvt.scalef32.pk16.fp6.bf16(<16 x bfloat> %src, float 100.0) + store <3 x i32> %cvt, ptr addrspace(1) %out, align 8 + ret void +} + +define amdgpu_ps void @test_scalef32_pk16_fp6_f16_vv(<16 x half> %src, float %scale, ptr addrspace(1) %out) { +; GFX1250-SDAG-LABEL: test_scalef32_pk16_fp6_f16_vv: +; GFX1250-SDAG: ; %bb.0: +; GFX1250-SDAG-NEXT: v_dual_mov_b32 v15, v10 :: v_dual_mov_b32 v14, v9 +; GFX1250-SDAG-NEXT: v_cvt_scalef32_pk16_fp6_f16 v[10:12], v[0:7], v8 +; GFX1250-SDAG-NEXT: global_store_b96 v[14:15], v[10:12], off +; GFX1250-SDAG-NEXT: s_endpgm +; +; GFX1250-GISEL-LABEL: test_scalef32_pk16_fp6_f16_vv: +; GFX1250-GISEL: ; %bb.0: +; GFX1250-GISEL-NEXT: v_dual_mov_b32 v14, v9 :: v_dual_mov_b32 v15, v10 +; GFX1250-GISEL-NEXT: v_cvt_scalef32_pk16_fp6_f16 v[10:12], v[0:7], v8 +; GFX1250-GISEL-NEXT: global_store_b96 v[14:15], v[10:12], off +; GFX1250-GISEL-NEXT: s_endpgm + %cvt = tail call <3 x i32> @llvm.amdgcn.cvt.scalef32.pk16.fp6.f16(<16 x half> %src, float %scale) + store <3 x i32> %cvt, ptr addrspace(1) %out, align 8 + ret void +} + +define amdgpu_ps void @test_scalef32_pk16_fp6_f16_sl(<16 x half> inreg %src, ptr addrspace(1) %out) { +; GFX1250-SDAG-LABEL: test_scalef32_pk16_fp6_f16_sl: +; GFX1250-SDAG: ; %bb.0: +; GFX1250-SDAG-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1 +; GFX1250-SDAG-NEXT: v_dual_mov_b32 v4, s2 :: v_dual_mov_b32 v5, s3 +; GFX1250-SDAG-NEXT: v_dual_mov_b32 v6, s4 :: v_dual_mov_b32 v7, s5 +; GFX1250-SDAG-NEXT: v_dual_mov_b32 v8, s6 :: v_dual_mov_b32 v9, s7 +; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-SDAG-NEXT: v_cvt_scalef32_pk16_fp6_f16 v[10:12], v[2:9], 0x42c80000 +; GFX1250-SDAG-NEXT: global_store_b96 v[0:1], v[10:12], off +; GFX1250-SDAG-NEXT: s_endpgm +; +; GFX1250-GISEL-LABEL: test_scalef32_pk16_fp6_f16_sl: +; GFX1250-GISEL: ; %bb.0: +; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[8:9], s[6:7] +; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[6:7], s[4:5] +; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[2:3] +; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[0:1] +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-GISEL-NEXT: v_cvt_scalef32_pk16_fp6_f16 v[10:12], v[2:9], 0x42c80000 +; GFX1250-GISEL-NEXT: global_store_b96 v[0:1], v[10:12], off +; GFX1250-GISEL-NEXT: s_endpgm + %cvt = tail call <3 x i32> @llvm.amdgcn.cvt.scalef32.pk16.fp6.f16(<16 x half> %src, float 100.0) + store <3 x i32> %cvt, ptr addrspace(1) %out, align 8 + ret void +} diff --git a/llvm/test/MC/AMDGPU/gfx1250_asm_vop3-fake16.s b/llvm/test/MC/AMDGPU/gfx1250_asm_vop3-fake16.s index 9a6ddf5299be0..59a3aa59fe098 100644 --- a/llvm/test/MC/AMDGPU/gfx1250_asm_vop3-fake16.s +++ b/llvm/test/MC/AMDGPU/gfx1250_asm_vop3-fake16.s @@ -1093,3 +1093,39 @@ v_cvt_scale_pk16_f32_bf6 v[10:25], v[20:22], 0xcf00 v_cvt_scale_pk16_f32_bf6 v[10:25], v[20:22], v8 scale_sel:5 // GFX1250: v_cvt_scale_pk16_f32_bf6 v[10:25], v[20:22], v8 scale_sel:5 ; encoding: [0x0a,0x28,0xcc,0xd6,0x14,0x11,0x02,0x00] + +v_cvt_scalef32_pk16_bf6_bf16 v[10:12], v[20:27], 100.0 +// GFX1250: v_cvt_scalef32_pk16_bf6_bf16 v[10:12], v[20:27], 0x42c80000 ; encoding: [0x0a,0x00,0xd2,0xd6,0x14,0xff,0x01,0x00,0x00,0x00,0xc8,0x42] + +v_cvt_scalef32_pk16_bf6_bf16 v[10:12], v[20:27], v8 +// GFX1250: v_cvt_scalef32_pk16_bf6_bf16 v[10:12], v[20:27], v8 ; encoding: [0x0a,0x00,0xd2,0xd6,0x14,0x11,0x02,0x00] + +v_cvt_scalef32_pk16_bf6_f16 v[10:12], v[20:27], 100.0 +// GFX1250: v_cvt_scalef32_pk16_bf6_f16 v[10:12], v[20:27], 0x42c80000 ; encoding: [0x0a,0x00,0xd0,0xd6,0x14,0xff,0x01,0x00,0x00,0x00,0xc8,0x42] + +v_cvt_scalef32_pk16_bf6_f16 v[10:12], v[20:27], v8 +// GFX1250: v_cvt_scalef32_pk16_bf6_f16 v[10:12], v[20:27], v8 ; encoding: [0x0a,0x00,0xd0,0xd6,0x14,0x11,0x02,0x00] + +v_cvt_scalef32_pk16_bf6_f32 v[10:12], v[20:35], 100.0 +// GFX1250: v_cvt_scalef32_pk16_bf6_f32 v[10:12], v[20:35], 0x42c80000 ; encoding: [0x0a,0x00,0xce,0xd6,0x14,0xff,0x01,0x00,0x00,0x00,0xc8,0x42] + +v_cvt_scalef32_pk16_bf6_f32 v[10:12], v[20:35], v8 +// GFX1250: v_cvt_scalef32_pk16_bf6_f32 v[10:12], v[20:35], v8 ; encoding: [0x0a,0x00,0xce,0xd6,0x14,0x11,0x02,0x00] + +v_cvt_scalef32_pk16_fp6_bf16 v[10:12], v[20:27], 100.0 +// GFX1250: v_cvt_scalef32_pk16_fp6_bf16 v[10:12], v[20:27], 0x42c80000 ; encoding: [0x0a,0x00,0xd1,0xd6,0x14,0xff,0x01,0x00,0x00,0x00,0xc8,0x42] + +v_cvt_scalef32_pk16_fp6_bf16 v[10:12], v[20:27], v8 +// GFX1250: v_cvt_scalef32_pk16_fp6_bf16 v[10:12], v[20:27], v8 ; encoding: [0x0a,0x00,0xd1,0xd6,0x14,0x11,0x02,0x00] + +v_cvt_scalef32_pk16_fp6_f16 v[10:12], v[20:27], 100.0 +// GFX1250: v_cvt_scalef32_pk16_fp6_f16 v[10:12], v[20:27], 0x42c80000 ; encoding: [0x0a,0x00,0xcf,0xd6,0x14,0xff,0x01,0x00,0x00,0x00,0xc8,0x42] + +v_cvt_scalef32_pk16_fp6_f16 v[10:12], v[20:27], v8 +// GFX1250: v_cvt_scalef32_pk16_fp6_f16 v[10:12], v[20:27], v8 ; encoding: [0x0a,0x00,0xcf,0xd6,0x14,0x11,0x02,0x00] + +v_cvt_scalef32_pk16_fp6_f32 v[10:12], v[20:35], 100.0 +// GFX1250: v_cvt_scalef32_pk16_fp6_f32 v[10:12], v[20:35], 0x42c80000 ; encoding: [0x0a,0x00,0xcd,0xd6,0x14,0xff,0x01,0x00,0x00,0x00,0xc8,0x42] + +v_cvt_scalef32_pk16_fp6_f32 v[10:12], v[20:35], v8 +// GFX1250: v_cvt_scalef32_pk16_fp6_f32 v[10:12], v[20:35], v8 ; encoding: [0x0a,0x00,0xcd,0xd6,0x14,0x11,0x02,0x00] diff --git a/llvm/test/MC/AMDGPU/gfx1250_asm_vop3.s b/llvm/test/MC/AMDGPU/gfx1250_asm_vop3.s index 988408cf12534..37eb12f898656 100644 --- a/llvm/test/MC/AMDGPU/gfx1250_asm_vop3.s +++ b/llvm/test/MC/AMDGPU/gfx1250_asm_vop3.s @@ -1093,3 +1093,39 @@ v_cvt_scale_pk16_f32_bf6 v[10:25], v[20:22], 0xcf00 v_cvt_scale_pk16_f32_bf6 v[10:25], v[20:22], v8 scale_sel:5 // GFX1250: v_cvt_scale_pk16_f32_bf6 v[10:25], v[20:22], v8 scale_sel:5 ; encoding: [0x0a,0x28,0xcc,0xd6,0x14,0x11,0x02,0x00] + +v_cvt_scalef32_pk16_bf6_bf16 v[10:12], v[20:27], 100.0 +// GFX1250: v_cvt_scalef32_pk16_bf6_bf16 v[10:12], v[20:27], 0x42c80000 ; encoding: [0x0a,0x00,0xd2,0xd6,0x14,0xff,0x01,0x00,0x00,0x00,0xc8,0x42] + +v_cvt_scalef32_pk16_bf6_bf16 v[10:12], v[20:27], v8 +// GFX1250: v_cvt_scalef32_pk16_bf6_bf16 v[10:12], v[20:27], v8 ; encoding: [0x0a,0x00,0xd2,0xd6,0x14,0x11,0x02,0x00] + +v_cvt_scalef32_pk16_bf6_f16 v[10:12], v[20:27], 100.0 +// GFX1250: v_cvt_scalef32_pk16_bf6_f16 v[10:12], v[20:27], 0x42c80000 ; encoding: [0x0a,0x00,0xd0,0xd6,0x14,0xff,0x01,0x00,0x00,0x00,0xc8,0x42] + +v_cvt_scalef32_pk16_bf6_f16 v[10:12], v[20:27], v8 +// GFX1250: v_cvt_scalef32_pk16_bf6_f16 v[10:12], v[20:27], v8 ; encoding: [0x0a,0x00,0xd0,0xd6,0x14,0x11,0x02,0x00] + +v_cvt_scalef32_pk16_bf6_f32 v[10:12], v[20:35], 100.0 +// GFX1250: v_cvt_scalef32_pk16_bf6_f32 v[10:12], v[20:35], 0x42c80000 ; encoding: [0x0a,0x00,0xce,0xd6,0x14,0xff,0x01,0x00,0x00,0x00,0xc8,0x42] + +v_cvt_scalef32_pk16_bf6_f32 v[10:12], v[20:35], v8 +// GFX1250: v_cvt_scalef32_pk16_bf6_f32 v[10:12], v[20:35], v8 ; encoding: [0x0a,0x00,0xce,0xd6,0x14,0x11,0x02,0x00] + +v_cvt_scalef32_pk16_fp6_bf16 v[10:12], v[20:27], 100.0 +// GFX1250: v_cvt_scalef32_pk16_fp6_bf16 v[10:12], v[20:27], 0x42c80000 ; encoding: [0x0a,0x00,0xd1,0xd6,0x14,0xff,0x01,0x00,0x00,0x00,0xc8,0x42] + +v_cvt_scalef32_pk16_fp6_bf16 v[10:12], v[20:27], v8 +// GFX1250: v_cvt_scalef32_pk16_fp6_bf16 v[10:12], v[20:27], v8 ; encoding: [0x0a,0x00,0xd1,0xd6,0x14,0x11,0x02,0x00] + +v_cvt_scalef32_pk16_fp6_f16 v[10:12], v[20:27], 100.0 +// GFX1250: v_cvt_scalef32_pk16_fp6_f16 v[10:12], v[20:27], 0x42c80000 ; encoding: [0x0a,0x00,0xcf,0xd6,0x14,0xff,0x01,0x00,0x00,0x00,0xc8,0x42] + +v_cvt_scalef32_pk16_fp6_f16 v[10:12], v[20:27], v8 +// GFX1250: v_cvt_scalef32_pk16_fp6_f16 v[10:12], v[20:27], v8 ; encoding: [0x0a,0x00,0xcf,0xd6,0x14,0x11,0x02,0x00] + +v_cvt_scalef32_pk16_fp6_f32 v[10:12], v[20:35], 100.0 +// GFX1250: v_cvt_scalef32_pk16_fp6_f32 v[10:12], v[20:35], 0x42c80000 ; encoding: [0x0a,0x00,0xcd,0xd6,0x14,0xff,0x01,0x00,0x00,0x00,0xc8,0x42] + +v_cvt_scalef32_pk16_fp6_f32 v[10:12], v[20:35], v8 +// GFX1250: v_cvt_scalef32_pk16_fp6_f32 v[10:12], v[20:35], v8 ; encoding: [0x0a,0x00,0xcd,0xd6,0x14,0x11,0x02,0x00] diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_vop3.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_vop3.txt index cc485312ccbae..3885d746746bf 100644 --- a/llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_vop3.txt +++ b/llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_vop3.txt @@ -1144,3 +1144,39 @@ 0x0a,0x20,0xc9,0xd6,0x14,0x11,0x02,0x00 # GFX1250: v_cvt_scale_pk16_f32_fp6 v[10:25], v[20:22], v8 scale_sel:4 ; encoding: [0x0a,0x20,0xc9,0xd6,0x14,0x11,0x02,0x00] + +0x0a,0x00,0xd2,0xd6,0x14,0xff,0x01,0x00,0x00,0x00,0xc8,0x42 +# GFX1250: v_cvt_scalef32_pk16_bf6_bf16 v[10:12], v[20:27], 0x42c80000 ; encoding: [0x0a,0x00,0xd2,0xd6,0x14,0xff,0x01,0x00,0x00,0x00,0xc8,0x42] + +0x0a,0x00,0xd2,0xd6,0x14,0x11,0x02,0x00 +# GFX1250: v_cvt_scalef32_pk16_bf6_bf16 v[10:12], v[20:27], v8 ; encoding: [0x0a,0x00,0xd2,0xd6,0x14,0x11,0x02,0x00] + +0x0a,0x00,0xd0,0xd6,0x14,0xff,0x01,0x00,0x00,0x00,0xc8,0x42 +# GFX1250: v_cvt_scalef32_pk16_bf6_f16 v[10:12], v[20:27], 0x42c80000 ; encoding: [0x0a,0x00,0xd0,0xd6,0x14,0xff,0x01,0x00,0x00,0x00,0xc8,0x42] + +0x0a,0x00,0xd0,0xd6,0x14,0x11,0x02,0x00 +# GFX1250: v_cvt_scalef32_pk16_bf6_f16 v[10:12], v[20:27], v8 ; encoding: [0x0a,0x00,0xd0,0xd6,0x14,0x11,0x02,0x00] + +0x0a,0x00,0xce,0xd6,0x14,0xff,0x01,0x00,0x00,0x00,0xc8,0x42 +# GFX1250: v_cvt_scalef32_pk16_bf6_f32 v[10:12], v[20:35], 0x42c80000 ; encoding: [0x0a,0x00,0xce,0xd6,0x14,0xff,0x01,0x00,0x00,0x00,0xc8,0x42] + +0x0a,0x00,0xce,0xd6,0x14,0x11,0x02,0x00 +# GFX1250: v_cvt_scalef32_pk16_bf6_f32 v[10:12], v[20:35], v8 ; encoding: [0x0a,0x00,0xce,0xd6,0x14,0x11,0x02,0x00] + +0x0a,0x00,0xd1,0xd6,0x14,0xff,0x01,0x00,0x00,0x00,0xc8,0x42 +# GFX1250: v_cvt_scalef32_pk16_fp6_bf16 v[10:12], v[20:27], 0x42c80000 ; encoding: [0x0a,0x00,0xd1,0xd6,0x14,0xff,0x01,0x00,0x00,0x00,0xc8,0x42] + +0x0a,0x00,0xd1,0xd6,0x14,0x11,0x02,0x00 +# GFX1250: v_cvt_scalef32_pk16_fp6_bf16 v[10:12], v[20:27], v8 ; encoding: [0x0a,0x00,0xd1,0xd6,0x14,0x11,0x02,0x00] + +0x0a,0x00,0xcf,0xd6,0x14,0xff,0x01,0x00,0x00,0x00,0xc8,0x42 +# GFX1250: v_cvt_scalef32_pk16_fp6_f16 v[10:12], v[20:27], 0x42c80000 ; encoding: [0x0a,0x00,0xcf,0xd6,0x14,0xff,0x01,0x00,0x00,0x00,0xc8,0x42] + +0x0a,0x00,0xcf,0xd6,0x14,0x11,0x02,0x00 +# GFX1250: v_cvt_scalef32_pk16_fp6_f16 v[10:12], v[20:27], v8 ; encoding: [0x0a,0x00,0xcf,0xd6,0x14,0x11,0x02,0x00] + +0x0a,0x00,0xcd,0xd6,0x14,0xff,0x01,0x00,0x00,0x00,0xc8,0x42 +# GFX1250: v_cvt_scalef32_pk16_fp6_f32 v[10:12], v[20:35], 0x42c80000 ; encoding: [0x0a,0x00,0xcd,0xd6,0x14,0xff,0x01,0x00,0x00,0x00,0xc8,0x42] + +0x0a,0x00,0xcd,0xd6,0x14,0x11,0x02,0x00 +# GFX1250: v_cvt_scalef32_pk16_fp6_f32 v[10:12], v[20:35], v8 ; encoding: [0x0a,0x00,0xcd,0xd6,0x14,0x11,0x02,0x00] _______________________________________________ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits