https://github.com/rampitec updated https://github.com/llvm/llvm-project/pull/151616
>From ab3017a04a86f64329c37bf3ba22dc17a7630f3e Mon Sep 17 00:00:00 2001 From: Stanislav Mekhanoshin <stanislav.mekhanos...@amd.com> Date: Thu, 31 Jul 2025 17:14:33 -0700 Subject: [PATCH] [AMDGPU] Add v_cvt_scale_pk8_* gfx1250 instructions --- clang/include/clang/Basic/BuiltinsAMDGPU.def | 9 + clang/lib/Sema/SemaAMDGPU.cpp | 10 ++ .../CodeGenOpenCL/builtins-amdgcn-gfx1250.cl | 111 ++++++++++++ .../builtins-amdgcn-error-gfx1250-param.cl | 40 +++++ llvm/include/llvm/IR/IntrinsicsAMDGPU.td | 23 ++- .../Target/AMDGPU/AMDGPURegisterBankInfo.cpp | 9 + .../AMDGPU/AsmParser/AMDGPUAsmParser.cpp | 6 + .../AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp | 10 ++ .../AMDGPU/MCTargetDesc/AMDGPUInstPrinter.h | 2 + llvm/lib/Target/AMDGPU/SIInstrInfo.td | 11 ++ llvm/lib/Target/AMDGPU/VOP3Instructions.td | 39 +++++ llvm/lib/Target/AMDGPU/VOPInstructions.td | 14 ++ .../AMDGPU/llvm.amdgcn.cvt.scale.pk.ll | 164 ++++++++++++++++++ llvm/test/MC/AMDGPU/gfx1250_asm_vop3-fake16.s | 81 +++++++++ llvm/test/MC/AMDGPU/gfx1250_asm_vop3.s | 81 +++++++++ llvm/test/MC/AMDGPU/gfx1250_asm_vop3_err.s | 40 +++++ .../Disassembler/AMDGPU/gfx1250_dasm_vop3.txt | 84 +++++++++ 17 files changed, 730 insertions(+), 4 deletions(-) create mode 100644 llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.scale.pk.ll diff --git a/clang/include/clang/Basic/BuiltinsAMDGPU.def b/clang/include/clang/Basic/BuiltinsAMDGPU.def index 1879a9da753e5..bb3953ea1253d 100644 --- a/clang/include/clang/Basic/BuiltinsAMDGPU.def +++ b/clang/include/clang/Basic/BuiltinsAMDGPU.def @@ -707,6 +707,15 @@ TARGET_BUILTIN(__builtin_amdgcn_cvt_pk_fp8_f16, "sV2h", "nc", "gfx1250-insts") TARGET_BUILTIN(__builtin_amdgcn_cvt_pk_bf8_f16, "sV2h", "nc", "gfx1250-insts") TARGET_BUILTIN(__builtin_amdgcn_cvt_sr_fp8_f16, "ihiUiIi", "nc", "gfx1250-insts") TARGET_BUILTIN(__builtin_amdgcn_cvt_sr_bf8_f16, "ihiUiIi", "nc", "gfx1250-insts") +TARGET_BUILTIN(__builtin_amdgcn_cvt_scale_pk8_f16_fp8, "V8hV2UiUiIUi", "nc", "gfx1250-insts") +TARGET_BUILTIN(__builtin_amdgcn_cvt_scale_pk8_bf16_fp8, "V8yV2UiUiIUi", "nc", "gfx1250-insts") +TARGET_BUILTIN(__builtin_amdgcn_cvt_scale_pk8_f16_bf8, "V8hV2UiUiIUi", "nc", "gfx1250-insts") +TARGET_BUILTIN(__builtin_amdgcn_cvt_scale_pk8_bf16_bf8, "V8yV2UiUiIUi", "nc", "gfx1250-insts") +TARGET_BUILTIN(__builtin_amdgcn_cvt_scale_pk8_f16_fp4, "V8hUiUiIUi", "nc", "gfx1250-insts") +TARGET_BUILTIN(__builtin_amdgcn_cvt_scale_pk8_bf16_fp4, "V8yUiUiIUi", "nc", "gfx1250-insts") +TARGET_BUILTIN(__builtin_amdgcn_cvt_scale_pk8_f32_fp8, "V8fV2UiUiIUi", "nc", "gfx1250-insts") +TARGET_BUILTIN(__builtin_amdgcn_cvt_scale_pk8_f32_bf8, "V8fV2UiUiIUi", "nc", "gfx1250-insts") +TARGET_BUILTIN(__builtin_amdgcn_cvt_scale_pk8_f32_fp4, "V8fUiUiIUi", "nc", "gfx1250-insts") TARGET_BUILTIN(__builtin_amdgcn_cvt_pk_fp8_f32_e5m3, "iffiIb", "nc", "fp8e5m3-insts") TARGET_BUILTIN(__builtin_amdgcn_cvt_sr_fp8_f32_e5m3, "ifiiIi", "nc", "fp8e5m3-insts") TARGET_BUILTIN(__builtin_amdgcn_sat_pk4_i4_i8, "UsUi", "nc", "gfx1250-insts") diff --git a/clang/lib/Sema/SemaAMDGPU.cpp b/clang/lib/Sema/SemaAMDGPU.cpp index c23c98aa3aaeb..8580de2f0c03c 100644 --- a/clang/lib/Sema/SemaAMDGPU.cpp +++ b/clang/lib/Sema/SemaAMDGPU.cpp @@ -84,6 +84,16 @@ bool SemaAMDGPU::CheckAMDGCNBuiltinFunctionCall(unsigned BuiltinID, return checkMovDPPFunctionCall(TheCall, 2, 1); case AMDGPU::BI__builtin_amdgcn_update_dpp: { return checkMovDPPFunctionCall(TheCall, 6, 2); + case AMDGPU::BI__builtin_amdgcn_cvt_scale_pk8_f16_fp8: + case AMDGPU::BI__builtin_amdgcn_cvt_scale_pk8_bf16_fp8: + case AMDGPU::BI__builtin_amdgcn_cvt_scale_pk8_f16_bf8: + case AMDGPU::BI__builtin_amdgcn_cvt_scale_pk8_bf16_bf8: + case AMDGPU::BI__builtin_amdgcn_cvt_scale_pk8_f16_fp4: + case AMDGPU::BI__builtin_amdgcn_cvt_scale_pk8_bf16_fp4: + case AMDGPU::BI__builtin_amdgcn_cvt_scale_pk8_f32_fp8: + case AMDGPU::BI__builtin_amdgcn_cvt_scale_pk8_f32_bf8: + case AMDGPU::BI__builtin_amdgcn_cvt_scale_pk8_f32_fp4: + return SemaRef.BuiltinConstantArgRange(TheCall, 2, 0, 7); } default: return false; diff --git a/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx1250.cl b/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx1250.cl index 67cb742ea32ef..51ab970655b4a 100644 --- a/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx1250.cl +++ b/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx1250.cl @@ -7,8 +7,20 @@ typedef unsigned int uint; typedef unsigned short int ushort; typedef unsigned int __attribute__((ext_vector_type(2))) uint2; +typedef unsigned int __attribute__((ext_vector_type(3))) uint3; +typedef unsigned int __attribute__((ext_vector_type(4))) uint4; typedef __bf16 __attribute__((ext_vector_type(2))) bfloat2; +typedef __bf16 __attribute__((ext_vector_type(8))) bfloat8; +typedef __bf16 __attribute__((ext_vector_type(16))) bfloat16; +typedef __bf16 __attribute__((ext_vector_type(32))) bfloat32; typedef half __attribute__((ext_vector_type(2))) half2; +typedef half __attribute__((ext_vector_type(8))) half8; +typedef half __attribute__((ext_vector_type(16))) half16; +typedef half __attribute__((ext_vector_type(32))) half32; +typedef float __attribute__((ext_vector_type(8))) float8; +typedef float __attribute__((ext_vector_type(16))) float16; +typedef float __attribute__((ext_vector_type(32))) float32; +typedef short __attribute__((ext_vector_type(2))) short2; // CHECK-LABEL: @test_setprio_inc_wg( // CHECK-NEXT: entry: @@ -563,6 +575,105 @@ void test_cvt_sr_fp8_f16(global int* out, half a, short sr, int old) *out = __builtin_amdgcn_cvt_sr_fp8_f16(a, sr, old, 3); } +// CHECK-LABEL: @test_cvt_scale_pk( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[OUTH8_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5) +// CHECK-NEXT: [[OUTY8_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5) +// CHECK-NEXT: [[SRC2_ADDR:%.*]] = alloca <2 x i32>, align 8, addrspace(5) +// CHECK-NEXT: [[OUTF32_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5) +// CHECK-NEXT: [[OUTF8_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5) +// CHECK-NEXT: [[OUTH16_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5) +// CHECK-NEXT: [[OUTY16_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5) +// CHECK-NEXT: [[OUTF16_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5) +// CHECK-NEXT: [[SRC3_ADDR:%.*]] = alloca <3 x i32>, align 16, addrspace(5) +// CHECK-NEXT: [[SRC1_ADDR:%.*]] = alloca i32, align 4, addrspace(5) +// CHECK-NEXT: [[SCALE_ADDR:%.*]] = alloca i32, align 4, addrspace(5) +// CHECK-NEXT: [[OUTH8_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[OUTH8_ADDR]] to ptr +// CHECK-NEXT: [[OUTY8_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[OUTY8_ADDR]] to ptr +// CHECK-NEXT: [[SRC2_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[SRC2_ADDR]] to ptr +// CHECK-NEXT: [[OUTF32_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[OUTF32_ADDR]] to ptr +// CHECK-NEXT: [[OUTF8_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[OUTF8_ADDR]] to ptr +// CHECK-NEXT: [[OUTH16_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[OUTH16_ADDR]] to ptr +// CHECK-NEXT: [[OUTY16_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[OUTY16_ADDR]] to ptr +// CHECK-NEXT: [[OUTF16_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[OUTF16_ADDR]] to ptr +// CHECK-NEXT: [[SRC3_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[SRC3_ADDR]] to ptr +// CHECK-NEXT: [[SRC1_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[SRC1_ADDR]] to ptr +// CHECK-NEXT: [[SCALE_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[SCALE_ADDR]] to ptr +// CHECK-NEXT: store ptr addrspace(1) [[OUTH8:%.*]], ptr [[OUTH8_ADDR_ASCAST]], align 8 +// CHECK-NEXT: store ptr addrspace(1) [[OUTY8:%.*]], ptr [[OUTY8_ADDR_ASCAST]], align 8 +// CHECK-NEXT: store <2 x i32> [[SRC2:%.*]], ptr [[SRC2_ADDR_ASCAST]], align 8 +// CHECK-NEXT: store ptr addrspace(1) [[OUTF32:%.*]], ptr [[OUTF32_ADDR_ASCAST]], align 8 +// CHECK-NEXT: store ptr addrspace(1) [[OUTF8:%.*]], ptr [[OUTF8_ADDR_ASCAST]], align 8 +// CHECK-NEXT: store ptr addrspace(1) [[OUTH16:%.*]], ptr [[OUTH16_ADDR_ASCAST]], align 8 +// CHECK-NEXT: store ptr addrspace(1) [[OUTY16:%.*]], ptr [[OUTY16_ADDR_ASCAST]], align 8 +// CHECK-NEXT: store ptr addrspace(1) [[OUTF16:%.*]], ptr [[OUTF16_ADDR_ASCAST]], align 8 +// CHECK-NEXT: store <3 x i32> [[SRC3:%.*]], ptr [[SRC3_ADDR_ASCAST]], align 16 +// CHECK-NEXT: store i32 [[SRC1:%.*]], ptr [[SRC1_ADDR_ASCAST]], align 4 +// CHECK-NEXT: store i32 [[SCALE:%.*]], ptr [[SCALE_ADDR_ASCAST]], align 4 +// CHECK-NEXT: [[TMP0:%.*]] = load <2 x i32>, ptr [[SRC2_ADDR_ASCAST]], align 8 +// CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr [[SCALE_ADDR_ASCAST]], align 4 +// CHECK-NEXT: [[TMP2:%.*]] = call <8 x half> @llvm.amdgcn.cvt.scale.pk8.f16.fp8(<2 x i32> [[TMP0]], i32 [[TMP1]], i32 4) +// CHECK-NEXT: [[TMP3:%.*]] = load ptr addrspace(1), ptr [[OUTH8_ADDR_ASCAST]], align 8 +// CHECK-NEXT: store <8 x half> [[TMP2]], ptr addrspace(1) [[TMP3]], align 16 +// CHECK-NEXT: [[TMP4:%.*]] = load <2 x i32>, ptr [[SRC2_ADDR_ASCAST]], align 8 +// CHECK-NEXT: [[TMP5:%.*]] = load i32, ptr [[SCALE_ADDR_ASCAST]], align 4 +// CHECK-NEXT: [[TMP6:%.*]] = call <8 x bfloat> @llvm.amdgcn.cvt.scale.pk8.bf16.fp8(<2 x i32> [[TMP4]], i32 [[TMP5]], i32 5) +// CHECK-NEXT: [[TMP7:%.*]] = load ptr addrspace(1), ptr [[OUTY8_ADDR_ASCAST]], align 8 +// CHECK-NEXT: store <8 x bfloat> [[TMP6]], ptr addrspace(1) [[TMP7]], align 16 +// CHECK-NEXT: [[TMP8:%.*]] = load <2 x i32>, ptr [[SRC2_ADDR_ASCAST]], align 8 +// CHECK-NEXT: [[TMP9:%.*]] = load i32, ptr [[SCALE_ADDR_ASCAST]], align 4 +// CHECK-NEXT: [[TMP10:%.*]] = call <8 x half> @llvm.amdgcn.cvt.scale.pk8.f16.bf8(<2 x i32> [[TMP8]], i32 [[TMP9]], i32 6) +// CHECK-NEXT: [[TMP11:%.*]] = load ptr addrspace(1), ptr [[OUTH8_ADDR_ASCAST]], align 8 +// CHECK-NEXT: store <8 x half> [[TMP10]], ptr addrspace(1) [[TMP11]], align 16 +// CHECK-NEXT: [[TMP12:%.*]] = load <2 x i32>, ptr [[SRC2_ADDR_ASCAST]], align 8 +// CHECK-NEXT: [[TMP13:%.*]] = load i32, ptr [[SCALE_ADDR_ASCAST]], align 4 +// CHECK-NEXT: [[TMP14:%.*]] = call <8 x bfloat> @llvm.amdgcn.cvt.scale.pk8.bf16.bf8(<2 x i32> [[TMP12]], i32 [[TMP13]], i32 7) +// CHECK-NEXT: [[TMP15:%.*]] = load ptr addrspace(1), ptr [[OUTY8_ADDR_ASCAST]], align 8 +// CHECK-NEXT: store <8 x bfloat> [[TMP14]], ptr addrspace(1) [[TMP15]], align 16 +// CHECK-NEXT: [[TMP16:%.*]] = load i32, ptr [[SRC1_ADDR_ASCAST]], align 4 +// CHECK-NEXT: [[TMP17:%.*]] = load i32, ptr [[SCALE_ADDR_ASCAST]], align 4 +// CHECK-NEXT: [[TMP18:%.*]] = call <8 x half> @llvm.amdgcn.cvt.scale.pk8.f16.fp4(i32 [[TMP16]], i32 [[TMP17]], i32 1) +// CHECK-NEXT: [[TMP19:%.*]] = load ptr addrspace(1), ptr [[OUTH8_ADDR_ASCAST]], align 8 +// CHECK-NEXT: store <8 x half> [[TMP18]], ptr addrspace(1) [[TMP19]], align 16 +// CHECK-NEXT: [[TMP20:%.*]] = load i32, ptr [[SRC1_ADDR_ASCAST]], align 4 +// CHECK-NEXT: [[TMP21:%.*]] = load i32, ptr [[SCALE_ADDR_ASCAST]], align 4 +// CHECK-NEXT: [[TMP22:%.*]] = call <8 x bfloat> @llvm.amdgcn.cvt.scale.pk8.bf16.fp4(i32 [[TMP20]], i32 [[TMP21]], i32 2) +// CHECK-NEXT: [[TMP23:%.*]] = load ptr addrspace(1), ptr [[OUTY8_ADDR_ASCAST]], align 8 +// CHECK-NEXT: store <8 x bfloat> [[TMP22]], ptr addrspace(1) [[TMP23]], align 16 +// CHECK-NEXT: [[TMP24:%.*]] = load <2 x i32>, ptr [[SRC2_ADDR_ASCAST]], align 8 +// CHECK-NEXT: [[TMP25:%.*]] = load i32, ptr [[SCALE_ADDR_ASCAST]], align 4 +// CHECK-NEXT: [[TMP26:%.*]] = call <8 x float> @llvm.amdgcn.cvt.scale.pk8.f32.fp8(<2 x i32> [[TMP24]], i32 [[TMP25]], i32 5) +// CHECK-NEXT: [[TMP27:%.*]] = load ptr addrspace(1), ptr [[OUTF8_ADDR_ASCAST]], align 8 +// CHECK-NEXT: store <8 x float> [[TMP26]], ptr addrspace(1) [[TMP27]], align 32 +// CHECK-NEXT: [[TMP28:%.*]] = load <2 x i32>, ptr [[SRC2_ADDR_ASCAST]], align 8 +// CHECK-NEXT: [[TMP29:%.*]] = load i32, ptr [[SCALE_ADDR_ASCAST]], align 4 +// CHECK-NEXT: [[TMP30:%.*]] = call <8 x float> @llvm.amdgcn.cvt.scale.pk8.f32.bf8(<2 x i32> [[TMP28]], i32 [[TMP29]], i32 6) +// CHECK-NEXT: [[TMP31:%.*]] = load ptr addrspace(1), ptr [[OUTF8_ADDR_ASCAST]], align 8 +// CHECK-NEXT: store <8 x float> [[TMP30]], ptr addrspace(1) [[TMP31]], align 32 +// CHECK-NEXT: [[TMP32:%.*]] = load i32, ptr [[SRC1_ADDR_ASCAST]], align 4 +// CHECK-NEXT: [[TMP33:%.*]] = load i32, ptr [[SCALE_ADDR_ASCAST]], align 4 +// CHECK-NEXT: [[TMP34:%.*]] = call <8 x float> @llvm.amdgcn.cvt.scale.pk8.f32.fp4(i32 [[TMP32]], i32 [[TMP33]], i32 7) +// CHECK-NEXT: [[TMP35:%.*]] = load ptr addrspace(1), ptr [[OUTF8_ADDR_ASCAST]], align 8 +// CHECK-NEXT: store <8 x float> [[TMP34]], ptr addrspace(1) [[TMP35]], align 32 +// CHECK-NEXT: ret void +// +void test_cvt_scale_pk(global half8 *outh8, global bfloat8 *outy8, uint2 src2, + global float32 *outf32, global float8 *outf8, + global half16 *outh16, global bfloat16 *outy16, + global float16 *outf16, uint3 src3, + uint src1, uint scale) +{ + *outh8 = __builtin_amdgcn_cvt_scale_pk8_f16_fp8(src2, scale, 4); + *outy8 = __builtin_amdgcn_cvt_scale_pk8_bf16_fp8(src2, scale, 5); + *outh8 = __builtin_amdgcn_cvt_scale_pk8_f16_bf8(src2, scale, 6); + *outy8 = __builtin_amdgcn_cvt_scale_pk8_bf16_bf8(src2, scale, 7); + *outh8 = __builtin_amdgcn_cvt_scale_pk8_f16_fp4(src1, scale, 1); + *outy8 = __builtin_amdgcn_cvt_scale_pk8_bf16_fp4(src1, scale, 2); + *outf8 = __builtin_amdgcn_cvt_scale_pk8_f32_fp8(src2, scale, 5); + *outf8 = __builtin_amdgcn_cvt_scale_pk8_f32_bf8(src2, scale, 6); + *outf8 = __builtin_amdgcn_cvt_scale_pk8_f32_fp4(src1, scale, 7); +} + // CHECK-LABEL: @test_sat_pk4_i4_i8( // CHECK-NEXT: entry: // CHECK-NEXT: [[OUT_ADDR:%.*]] = alloca ptr, align 8, addrspace(5) diff --git a/clang/test/SemaOpenCL/builtins-amdgcn-error-gfx1250-param.cl b/clang/test/SemaOpenCL/builtins-amdgcn-error-gfx1250-param.cl index 32473808208f8..83c63f1465a8b 100644 --- a/clang/test/SemaOpenCL/builtins-amdgcn-error-gfx1250-param.cl +++ b/clang/test/SemaOpenCL/builtins-amdgcn-error-gfx1250-param.cl @@ -1,7 +1,21 @@ // REQUIRES: amdgpu-registered-target // RUN: %clang_cc1 -cl-std=CL2.0 -triple amdgcn-- -target-cpu gfx1250 -verify -S -o - %s +typedef unsigned int uint; +typedef unsigned short int ushort; typedef int v2i __attribute__((ext_vector_type(2))); +typedef unsigned int __attribute__((ext_vector_type(2))) uint2; +typedef unsigned int __attribute__((ext_vector_type(3))) uint3; +typedef __bf16 __attribute__((ext_vector_type(8))) bfloat8; +typedef __bf16 __attribute__((ext_vector_type(16))) bfloat16; +typedef __bf16 __attribute__((ext_vector_type(32))) bfloat32; +typedef half __attribute__((ext_vector_type(8))) half8; +typedef half __attribute__((ext_vector_type(16))) half16; +typedef half __attribute__((ext_vector_type(32))) half32; +typedef float __attribute__((ext_vector_type(8))) float8; +typedef float __attribute__((ext_vector_type(16))) float16; +typedef float __attribute__((ext_vector_type(32))) float32; + typedef int v4i __attribute__((ext_vector_type(4))); typedef int v8i __attribute__((ext_vector_type(8))); @@ -29,6 +43,32 @@ void test__builtin_amdgcn_cvt_f16_bf8(int a, int b) { __builtin_amdgcn_cvt_f16_bf8(a, b); // expected-error {{'__builtin_amdgcn_cvt_f16_bf8' must be a constant integer}} } +void test_cvt_scale_pk(global half8 *outh8, global bfloat8 *outy8, uint2 src2, + global float32 *outf32, global half16 *outh16, global bfloat16 *outy16, + global float16 *outf16, uint3 src3, + global float8 *outf8, uint src1, uint scale, uint scale_sel) +{ + *outh8 = __builtin_amdgcn_cvt_scale_pk8_f16_fp8(src2, scale, scale_sel); // expected-error {{'__builtin_amdgcn_cvt_scale_pk8_f16_fp8' must be a constant integer}} + *outy8 = __builtin_amdgcn_cvt_scale_pk8_bf16_fp8(src2, scale, scale_sel); // expected-error {{'__builtin_amdgcn_cvt_scale_pk8_bf16_fp8' must be a constant integer}} + *outh8 = __builtin_amdgcn_cvt_scale_pk8_f16_bf8(src2, scale, scale_sel); // expected-error {{'__builtin_amdgcn_cvt_scale_pk8_f16_bf8' must be a constant integer}} + *outy8 = __builtin_amdgcn_cvt_scale_pk8_bf16_bf8(src2, scale, scale_sel); // expected-error {{'__builtin_amdgcn_cvt_scale_pk8_bf16_bf8' must be a constant integer}} + *outh8 = __builtin_amdgcn_cvt_scale_pk8_f16_fp4(src1, scale, scale_sel); // expected-error {{'__builtin_amdgcn_cvt_scale_pk8_f16_fp4' must be a constant integer}} + *outy8 = __builtin_amdgcn_cvt_scale_pk8_bf16_fp4(src1, scale, scale_sel); // expected-error {{'__builtin_amdgcn_cvt_scale_pk8_bf16_fp4' must be a constant integer}} + *outf8 = __builtin_amdgcn_cvt_scale_pk8_f32_fp8(src2, scale, scale_sel); // expected-error {{'__builtin_amdgcn_cvt_scale_pk8_f32_fp8' must be a constant integer}} + *outf8 = __builtin_amdgcn_cvt_scale_pk8_f32_bf8(src2, scale, scale_sel); // expected-error {{'__builtin_amdgcn_cvt_scale_pk8_f32_bf8' must be a constant integer}} + *outf8 = __builtin_amdgcn_cvt_scale_pk8_f32_fp4(src1, scale, scale_sel); // expected-error {{'__builtin_amdgcn_cvt_scale_pk8_f32_fp4' must be a constant integer}} + + *outh8 = __builtin_amdgcn_cvt_scale_pk8_f16_fp8(src2, scale, 8); // expected-error {{argument value 8 is outside the valid range [0, 7]}} + *outy8 = __builtin_amdgcn_cvt_scale_pk8_bf16_fp8(src2, scale, 8); // expected-error {{argument value 8 is outside the valid range [0, 7]}} + *outh8 = __builtin_amdgcn_cvt_scale_pk8_f16_bf8(src2, scale, 8); // expected-error {{argument value 8 is outside the valid range [0, 7]}} + *outy8 = __builtin_amdgcn_cvt_scale_pk8_bf16_bf8(src2, scale, 8); // expected-error {{argument value 8 is outside the valid range [0, 7]}} + *outh8 = __builtin_amdgcn_cvt_scale_pk8_f16_fp4(src1, scale, 8); // expected-error {{argument value 8 is outside the valid range [0, 7]}} + *outy8 = __builtin_amdgcn_cvt_scale_pk8_bf16_fp4(src1, scale, 8); // expected-error {{argument value 8 is outside the valid range [0, 7]}} + *outf8 = __builtin_amdgcn_cvt_scale_pk8_f32_fp8(src2, scale, 8); // expected-error {{argument value 8 is outside the valid range [0, 7]}} + *outf8 = __builtin_amdgcn_cvt_scale_pk8_f32_bf8(src2, scale, 8); // expected-error {{argument value 8 is outside the valid range [0, 7]}} + *outf8 = __builtin_amdgcn_cvt_scale_pk8_f32_fp4(src1, scale, 8); // expected-error {{argument value 8 is outside the valid range [0, 7]}} +} + void test_amdgcn_load_monitor(global int* b32gaddr, global v2i* b64gaddr, global v4i* b128gaddr, int *b32faddr, v2i* b64faddr, v4i *b128faddr, global int* b32out, global v2i* b64out, global v4i* b128out, int cpol) { diff --git a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td index a58e26c7d2224..7265a76294c4c 100644 --- a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td +++ b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td @@ -633,18 +633,33 @@ def int_amdgcn_cvt_sr_bf8_f16 : DefaultAttrsIntrinsic< [IntrNoMem, IntrSpeculatable, ImmArg<ArgIndex<3>>] >, ClangBuiltin<"__builtin_amdgcn_cvt_sr_bf8_f16">; -class AMDGPUCvtScaleF32Intrinsic<LLVMType DstTy, LLVMType Src0Ty, string name> : DefaultAttrsIntrinsic< - [DstTy], [Src0Ty, llvm_float_ty], [IntrNoMem, IntrSpeculatable] +// llvm.amdgcn.cvt.scale.pk32.f16.bf6 v32f16 vdst, v6i32 src0, i32 scale_sel [0..7] +class AMDGPUCvtScaleIntrinsic<LLVMType DstTy, LLVMType Src0Ty, string name> : DefaultAttrsIntrinsic< + [DstTy], [Src0Ty, llvm_i32_ty, llvm_i32_ty], [IntrNoMem, IntrSpeculatable, ImmArg<ArgIndex<2>>] >, ClangBuiltin<"__builtin_amdgcn_"#name>; -class AMDGPUCvtScaleF32ToFP6BF6Intrinsic<LLVMType DstTy, LLVMType Src0Ty, LLVMType Src1Ty, string name> : DefaultAttrsIntrinsic< - [DstTy], [Src0Ty, Src1Ty, llvm_float_ty], [IntrNoMem, IntrSpeculatable] +class AMDGPUCvtScaleF32Intrinsic<LLVMType DstTy, LLVMType Src0Ty, string name> : DefaultAttrsIntrinsic< + [DstTy], [Src0Ty, llvm_float_ty], [IntrNoMem, IntrSpeculatable] >, ClangBuiltin<"__builtin_amdgcn_"#name>; class AMDGPUCvtScaleF32SRIntrinsic<LLVMType DstTy, LLVMType Src0Ty, string name> : DefaultAttrsIntrinsic< [DstTy], [Src0Ty, llvm_i32_ty, llvm_float_ty], [IntrNoMem, IntrSpeculatable] >, ClangBuiltin<"__builtin_amdgcn_"#name>; +def int_amdgcn_cvt_scale_pk8_f16_fp8 : AMDGPUCvtScaleIntrinsic<llvm_v8f16_ty, llvm_v2i32_ty, "cvt_scale_pk8_f16_fp8">; +def int_amdgcn_cvt_scale_pk8_bf16_fp8 : AMDGPUCvtScaleIntrinsic<llvm_v8bf16_ty, llvm_v2i32_ty, "cvt_scale_pk8_bf16_fp8">; +def int_amdgcn_cvt_scale_pk8_f16_bf8 : AMDGPUCvtScaleIntrinsic<llvm_v8f16_ty, llvm_v2i32_ty, "cvt_scale_pk8_f16_bf8">; +def int_amdgcn_cvt_scale_pk8_bf16_bf8 : AMDGPUCvtScaleIntrinsic<llvm_v8bf16_ty, llvm_v2i32_ty, "cvt_scale_pk8_bf16_bf8">; +def int_amdgcn_cvt_scale_pk8_f16_fp4 : AMDGPUCvtScaleIntrinsic<llvm_v8f16_ty, llvm_i32_ty, "cvt_scale_pk8_f16_fp4">; +def int_amdgcn_cvt_scale_pk8_bf16_fp4 : AMDGPUCvtScaleIntrinsic<llvm_v8bf16_ty, llvm_i32_ty, "cvt_scale_pk8_bf16_fp4">; +def int_amdgcn_cvt_scale_pk8_f32_fp8 : AMDGPUCvtScaleIntrinsic<llvm_v8f32_ty, llvm_v2i32_ty, "cvt_scale_pk8_f32_fp8">; +def int_amdgcn_cvt_scale_pk8_f32_bf8 : AMDGPUCvtScaleIntrinsic<llvm_v8f32_ty, llvm_v2i32_ty, "cvt_scale_pk8_f32_bf8">; +def int_amdgcn_cvt_scale_pk8_f32_fp4 : AMDGPUCvtScaleIntrinsic<llvm_v8f32_ty, llvm_i32_ty, "cvt_scale_pk8_f32_fp4">; + +class AMDGPUCvtScaleF32ToFP6BF6Intrinsic<LLVMType DstTy, LLVMType Src0Ty, LLVMType Src1Ty, string name> : DefaultAttrsIntrinsic< + [DstTy], [Src0Ty, Src1Ty, llvm_float_ty], [IntrNoMem, IntrSpeculatable] +>, ClangBuiltin<"__builtin_amdgcn_"#name>; + def int_amdgcn_cvt_scalef32_pk32_fp6_f16 : AMDGPUCvtScaleF32Intrinsic<llvm_v6i32_ty, llvm_v32f16_ty, "cvt_scalef32_pk32_fp6_f16">; def int_amdgcn_cvt_scalef32_pk32_bf6_f16 : AMDGPUCvtScaleF32Intrinsic<llvm_v6i32_ty, llvm_v32f16_ty, "cvt_scalef32_pk32_bf6_f16">; def int_amdgcn_cvt_scalef32_pk32_fp6_bf16 : AMDGPUCvtScaleF32Intrinsic<llvm_v6i32_ty, llvm_v32bf16_ty, "cvt_scalef32_pk32_fp6_bf16">; diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp index 0f1d7edad12f7..c8e45d47c3660 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp @@ -4582,6 +4582,15 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const { case Intrinsic::amdgcn_cvt_pk_bf8_f16: case Intrinsic::amdgcn_cvt_sr_fp8_f16: case Intrinsic::amdgcn_cvt_sr_bf8_f16: + case Intrinsic::amdgcn_cvt_scale_pk8_f16_fp8: + case Intrinsic::amdgcn_cvt_scale_pk8_bf16_fp8: + case Intrinsic::amdgcn_cvt_scale_pk8_f16_bf8: + case Intrinsic::amdgcn_cvt_scale_pk8_bf16_bf8: + case Intrinsic::amdgcn_cvt_scale_pk8_f16_fp4: + case Intrinsic::amdgcn_cvt_scale_pk8_bf16_fp4: + case Intrinsic::amdgcn_cvt_scale_pk8_f32_fp8: + case Intrinsic::amdgcn_cvt_scale_pk8_f32_bf8: + case Intrinsic::amdgcn_cvt_scale_pk8_f32_fp4: case Intrinsic::amdgcn_sat_pk4_i4_i8: case Intrinsic::amdgcn_sat_pk4_u4_u8: case Intrinsic::amdgcn_fmed3: diff --git a/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp b/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp index 700701d503853..a83caa0db8a69 100644 --- a/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp +++ b/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp @@ -180,6 +180,7 @@ class AMDGPUOperand : public MCParsedAsmOperand { ImmTyMatrixBFMT, ImmTyMatrixAReuse, ImmTyMatrixBReuse, + ImmTyScaleSel, ImmTyByteSel, }; @@ -1184,6 +1185,7 @@ class AMDGPUOperand : public MCParsedAsmOperand { case ImmTyMatrixBFMT: OS << "ImmTyMatrixBFMT"; break; case ImmTyMatrixAReuse: OS << "ImmTyMatrixAReuse"; break; case ImmTyMatrixBReuse: OS << "ImmTyMatrixBReuse"; break; + case ImmTyScaleSel: OS << "ScaleSel" ; break; case ImmTyByteSel: OS << "ByteSel" ; break; } // clang-format on @@ -9366,6 +9368,10 @@ void AMDGPUAsmParser::cvtVOP3(MCInst &Inst, const OperandVector &Operands, } } + if (AMDGPU::hasNamedOperand(Opc, AMDGPU::OpName::scale_sel)) + addOptionalImmOperand(Inst, Operands, OptionalIdx, + AMDGPUOperand::ImmTyScaleSel); + if (AMDGPU::hasNamedOperand(Opc, AMDGPU::OpName::clamp)) addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyClamp); diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp index 15088ac25863f..42c4d8b8a9717 100644 --- a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp +++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp @@ -1793,4 +1793,14 @@ void AMDGPUInstPrinter::printBitOp3(const MCInst *MI, unsigned OpNo, O << formatHex(static_cast<uint64_t>(Imm)); } +void AMDGPUInstPrinter::printScaleSel(const MCInst *MI, unsigned OpNo, + const MCSubtargetInfo &STI, + raw_ostream &O) { + uint8_t Imm = MI->getOperand(OpNo).getImm(); + if (!Imm) + return; + + O << " scale_sel:" << formatDec(Imm); +} + #include "AMDGPUGenAsmWriter.inc" diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.h b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.h index e0b7aa5799e62..f6739b14926e1 100644 --- a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.h +++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.h @@ -173,6 +173,8 @@ class AMDGPUInstPrinter : public MCInstPrinter { const MCSubtargetInfo &STI, raw_ostream &O, StringRef Prefix, bool PrintInHex, bool AlwaysPrint); + void printScaleSel(const MCInst *MI, unsigned OpNo, + const MCSubtargetInfo &STI, raw_ostream &O); void printBitOp3(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI, raw_ostream &O); diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.td b/llvm/lib/Target/AMDGPU/SIInstrInfo.td index c5931fcd5d909..a3e20baa9e298 100644 --- a/llvm/lib/Target/AMDGPU/SIInstrInfo.td +++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.td @@ -1313,6 +1313,10 @@ def MatrixBFMT : CustomOperand<i32, 1, "MatrixBFMT">; def MatrixAReuse : NamedBitOperand<"matrix_a_reuse">; def MatrixBReuse : NamedBitOperand<"matrix_b_reuse">; +def ScaleSel : NamedIntOperand<"scale_sel"> { + let Validator = "isUInt<3>"; +} + class KImmFPOperand<ValueType vt> : ImmOperand<vt> { let OperandNamespace = "AMDGPU"; let OperandType = "OPERAND_KIMM"#vt.Size; @@ -2944,6 +2948,13 @@ def VOP_BF16_F32_I32 : VOPProfile<[bf16, f32, i32, untyped]>; def VOP_F16_F32_I32 : VOPProfile<[f16, f32, i32, untyped]>; def VOP_I32_BF16_I32_F32 : VOPProfile<[i32, bf16, i32, f32]>; def VOP_I32_F16_I32_F32 : VOPProfile<[i32, f16, i32, f32]>; +def VOP_V8F16_V2I32_I32 : VOPProfile<[v8f16, v2i32, i32, untyped]>; +def VOP_V8BF16_V2I32_I32 : VOPProfile<[v8bf16, v2i32, i32, untyped]>; +def VOP_V8F16_I32_I32 : VOPProfile<[v8f16, i32, i32, untyped]>; +def VOP_V8BF16_I32_I32 : VOPProfile<[v8bf16, i32, i32, untyped]>; +def VOP_V16F32_V3I32_I32 : VOPProfile<[v16f32, v3i32, i32, untyped]>; +def VOP_V8F32_V2I32_I32 : VOPProfile<[v8f32, v2i32, i32, untyped]>; +def VOP_V8F32_I32_I32 : VOPProfile<[v8f32, i32, i32, untyped]>; def VOP_I32_F32_I32_F32 : VOPProfile<[i32, f32, i32, f32]>; def VOP_V6I32_V32BF16_I32_F32 : VOPProfile<[v6i32, v32bf16, i32, f32]>; diff --git a/llvm/lib/Target/AMDGPU/VOP3Instructions.td b/llvm/lib/Target/AMDGPU/VOP3Instructions.td index 7e922abd695c0..1ffe39dc5cba5 100644 --- a/llvm/lib/Target/AMDGPU/VOP3Instructions.td +++ b/llvm/lib/Target/AMDGPU/VOP3Instructions.td @@ -1675,6 +1675,23 @@ let SubtargetPredicate = HasBF16ConversionInsts in { (V_CVT_PK_BF16_F32_e64 $src0_modifiers, $src0, 0, (f32 (IMPLICIT_DEF)))>; } +class VOP3_CVT_SCALE_PK_F16_F864_Profile<VOPProfile P> : VOP3_CVT_SCALEF32_PK_F864_Profile<P> { + let Src0RC64 = getVOP3VRegSrcForVT<Src0VT>.ret; + let Ins64 = !con(getIns64<Src0RC64, Src1RC64, Src2RC64, NumSrcArgs, + HasClamp, HasModifiers, HasSrc2Mods, + HasOMod, Src0Mod, Src1Mod, Src2Mod>.ret, + (ins ScaleSel:$scale_sel)); + let Asm64 = getAsmVOP3Base<NumSrcArgs, HasDst, HasClamp, + HasOpSel, HasOMod, IsVOP3P, HasNeg, HasSrc0Mods, HasSrc1Mods, + HasSrc2Mods, DstVT>.ret # "$scale_sel"; +} + +multiclass VOP3CvtScaleSelInst<string OpName, VOPProfile P, SDPatternOperator node> { + def _e64 : VOP3InstBase<OpName, VOP3_CVT_SCALE_PK_F16_F864_Profile<P>> { + let Pattern = [(set P.DstVT:$vdst, (node (P.Src0VT (VOP3Mods0 P.Src0VT:$src0)), i32:$src1, i32:$scale_sel))]; + } +} + let Src0RC64 = VSrc_NoInline_v2f16 in { def VOP3_CVT_PK_F8_F16_Profile : VOP3_Profile<VOP_I16_V2F16>; def VOP3_CVT_PK_F8_F16_True16_Profile : VOP3_Profile_True16<VOP3_CVT_PK_F8_F16_Profile>; @@ -1712,6 +1729,19 @@ let SubtargetPredicate = isGFX1250Plus in { defm V_CVT_SR_BF8_F16 : VOP3Inst_t16_with_profiles<"v_cvt_sr_bf8_f16", VOP3_CVT_SR_F8_F16_Profile, VOP3_CVT_SR_F8_F16_True16_Profile, VOP3_CVT_SR_F8_F16_Fake16_Profile>; } + + let Constraints = "@earlyclobber $vdst" in { + defm V_CVT_SCALE_PK8_F16_FP8 : VOP3CvtScaleSelInst<"v_cvt_scale_pk8_f16_fp8", VOP_V8F16_V2I32_I32, int_amdgcn_cvt_scale_pk8_f16_fp8>; + defm V_CVT_SCALE_PK8_BF16_FP8 : VOP3CvtScaleSelInst<"v_cvt_scale_pk8_bf16_fp8", VOP_V8BF16_V2I32_I32, int_amdgcn_cvt_scale_pk8_bf16_fp8>; + defm V_CVT_SCALE_PK8_F16_BF8 : VOP3CvtScaleSelInst<"v_cvt_scale_pk8_f16_bf8", VOP_V8F16_V2I32_I32, int_amdgcn_cvt_scale_pk8_f16_bf8>; + defm V_CVT_SCALE_PK8_BF16_BF8 : VOP3CvtScaleSelInst<"v_cvt_scale_pk8_bf16_bf8", VOP_V8BF16_V2I32_I32, int_amdgcn_cvt_scale_pk8_bf16_bf8>; + defm V_CVT_SCALE_PK8_F32_FP8 : VOP3CvtScaleSelInst<"v_cvt_scale_pk8_f32_fp8", VOP_V8F32_V2I32_I32, int_amdgcn_cvt_scale_pk8_f32_fp8>; + defm V_CVT_SCALE_PK8_F32_BF8 : VOP3CvtScaleSelInst<"v_cvt_scale_pk8_f32_bf8", VOP_V8F32_V2I32_I32, int_amdgcn_cvt_scale_pk8_f32_bf8>; + } // End Constraints = "@earlyclobber $vdst" + + defm V_CVT_SCALE_PK8_F16_FP4 : VOP3CvtScaleSelInst<"v_cvt_scale_pk8_f16_fp4", VOP_V8F16_I32_I32, int_amdgcn_cvt_scale_pk8_f16_fp4>; + defm V_CVT_SCALE_PK8_BF16_FP4 : VOP3CvtScaleSelInst<"v_cvt_scale_pk8_bf16_fp4", VOP_V8BF16_I32_I32, int_amdgcn_cvt_scale_pk8_bf16_fp4>; + defm V_CVT_SCALE_PK8_F32_FP4 : VOP3CvtScaleSelInst<"v_cvt_scale_pk8_f32_fp4", VOP_V8F32_I32_I32, int_amdgcn_cvt_scale_pk8_f32_fp4>; } // End ReadsModeReg = 0 let True16Predicate = UseRealTrue16Insts in { @@ -2120,6 +2150,15 @@ let AssemblerPredicate = isGFX11Plus in { defm V_LSHL_ADD_U64 : VOP3Only_Realtriple_gfx1250<0x252>; defm V_ASHR_PK_I8_I32 : VOP3Only_Realtriple_gfx1250<0x290>; defm V_ASHR_PK_U8_I32 : VOP3Only_Realtriple_gfx1250<0x291>; +defm V_CVT_SCALE_PK8_F16_FP4 : VOP3Only_ScaleSel_Real_gfx1250<0x29f>; +defm V_CVT_SCALE_PK8_BF16_FP4 : VOP3Only_ScaleSel_Real_gfx1250<0x2a0>; +defm V_CVT_SCALE_PK8_F32_FP4 : VOP3Only_ScaleSel_Real_gfx1250<0x2a1>; +defm V_CVT_SCALE_PK8_F16_FP8 : VOP3Only_ScaleSel_Real_gfx1250<0x2a8>; +defm V_CVT_SCALE_PK8_BF16_FP8 : VOP3Only_ScaleSel_Real_gfx1250<0x2a9>; +defm V_CVT_SCALE_PK8_F32_FP8 : VOP3Only_ScaleSel_Real_gfx1250<0x2aa>; +defm V_CVT_SCALE_PK8_F16_BF8 : VOP3Only_ScaleSel_Real_gfx1250<0x2ab>; +defm V_CVT_SCALE_PK8_BF16_BF8 : VOP3Only_ScaleSel_Real_gfx1250<0x2ac>; +defm V_CVT_SCALE_PK8_F32_BF8 : VOP3Only_ScaleSel_Real_gfx1250<0x2ad>; defm V_CVT_PK_BF16_F32 : VOP3Only_Realtriple_gfx1250<0x36d>; defm V_CVT_SR_PK_BF16_F32 : VOP3Only_Realtriple_gfx1250<0x36e>; defm V_CVT_PK_F16_F32 : VOP3Only_Realtriple_gfx1250<0x36f>; diff --git a/llvm/lib/Target/AMDGPU/VOPInstructions.td b/llvm/lib/Target/AMDGPU/VOPInstructions.td index 0858b0475eb07..f027ab05c546c 100644 --- a/llvm/lib/Target/AMDGPU/VOPInstructions.td +++ b/llvm/lib/Target/AMDGPU/VOPInstructions.td @@ -414,6 +414,13 @@ class VOP3a_BITOP3_gfx12<bits<10> op, VOPProfile p> : VOP3e_gfx11_gfx12<op, p> { let Inst{14} = !if(p.HasOpSel, src0_modifiers{3}, 0); } +class VOP3a_ScaleSel_gfx1250<bits<10> op, VOPProfile p> : VOP3e_gfx11_gfx12<op, p> { + bits<3> scale_sel; + + let Inst{13-11} = scale_sel; + let Inst{14} = 0; +} + class VOP3Interp_gfx10<bits<10> op, VOPProfile p> : VOP3e_gfx10<op, p> { bits<6> attr; bits<2> attrchan; @@ -2010,6 +2017,13 @@ multiclass VOP3_BITOP3_Real_Base<GFXGen Gen, bits<10> op, string asmName> { } } +multiclass VOP3Only_ScaleSel_Real_gfx1250<bits<10> op> { + defvar ps = !cast<VOP_Pseudo>(NAME#"_e64"); + def _e64_gfx1250 : + VOP3_Real_Gen<ps, GFX1250Gen>, + VOP3a_ScaleSel_gfx1250<op, ps.Pfl>; +} + multiclass VOP3Only_Realtriple_t16_gfx11_gfx12_not_gfx1250<bits<10> op, string asmName, string opName = NAME, string pseudo_mnemonic = "", bit isSingle = 0> : VOP3_Realtriple_with_name<GFX11Gen, op, opName, asmName, pseudo_mnemonic, isSingle>, diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.scale.pk.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.scale.pk.ll new file mode 100644 index 0000000000000..4309cfbe1b124 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.scale.pk.ll @@ -0,0 +1,164 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4 +; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1250 < %s | FileCheck -check-prefixes=GFX1250,GFX1250-SDAG %s +; RUN: llc -global-isel=1 -global-isel-abort=2 -mtriple=amdgcn -mcpu=gfx1250 < %s | FileCheck -check-prefixes=GFX1250,GFX1250-GISEL %s + +declare <8 x half> @llvm.amdgcn.cvt.scale.pk8.f16.fp8(<2 x i32> %src, i32 %scale, i32 %scale_sel) +declare <8 x bfloat> @llvm.amdgcn.cvt.scale.pk8.bf16.fp8(<2 x i32> %src, i32 %scale, i32 %scale_sel) +declare <8 x half> @llvm.amdgcn.cvt.scale.pk8.f16.bf8(<2 x i32> %src, i32 %scale, i32 %scale_sel) +declare <8 x bfloat> @llvm.amdgcn.cvt.scale.pk8.bf16.bf8(<2 x i32> %src, i32 %scale, i32 %scale_sel) +declare <8 x half> @llvm.amdgcn.cvt.scale.pk8.f16.fp4(i32 %src, i32 %scale, i32 %scale_sel) +declare <8 x bfloat> @llvm.amdgcn.cvt.scale.pk8.bf16.fp4(i32 %src, i32 %scale, i32 %scale_sel) +declare <8 x float> @llvm.amdgcn.cvt.scale.pk8.f32.fp8(<2 x i32> %src, i32 %scale, i32 %scale_sel) +declare <8 x float> @llvm.amdgcn.cvt.scale.pk8.f32.bf8(<2 x i32> %src, i32 %scale, i32 %scale_sel) +declare <8 x float> @llvm.amdgcn.cvt.scale.pk8.f32.fp4(i32 %src, i32 %scale, i32 %scale_sel) + +define amdgpu_ps void @test_cvt_scale_pk8_f16_fp8_vv(<2 x i32> %src, i32 %scale, ptr addrspace(1) %out) { +; GFX1250-SDAG-LABEL: test_cvt_scale_pk8_f16_fp8_vv: +; GFX1250-SDAG: ; %bb.0: +; GFX1250-SDAG-NEXT: v_dual_mov_b32 v9, v4 :: v_dual_mov_b32 v8, v3 +; GFX1250-SDAG-NEXT: v_cvt_scale_pk8_f16_fp8 v[4:7], v[0:1], v2 scale_sel:1 +; GFX1250-SDAG-NEXT: global_store_b128 v[8:9], v[4:7], off +; GFX1250-SDAG-NEXT: s_endpgm +; +; GFX1250-GISEL-LABEL: test_cvt_scale_pk8_f16_fp8_vv: +; GFX1250-GISEL: ; %bb.0: +; GFX1250-GISEL-NEXT: v_dual_mov_b32 v8, v3 :: v_dual_mov_b32 v9, v4 +; GFX1250-GISEL-NEXT: v_cvt_scale_pk8_f16_fp8 v[4:7], v[0:1], v2 scale_sel:1 +; GFX1250-GISEL-NEXT: global_store_b128 v[8:9], v[4:7], off +; GFX1250-GISEL-NEXT: s_endpgm + %cvt = tail call <8 x half> @llvm.amdgcn.cvt.scale.pk8.f16.fp8(<2 x i32> %src, i32 %scale, i32 1) + store <8 x half> %cvt, ptr addrspace(1) %out, align 8 + ret void +} + +define amdgpu_ps void @test_cvt_scale_pk8_f16_bf8_vv(<2 x i32> %src, i32 %scale, ptr addrspace(1) %out) { +; GFX1250-SDAG-LABEL: test_cvt_scale_pk8_f16_bf8_vv: +; GFX1250-SDAG: ; %bb.0: +; GFX1250-SDAG-NEXT: v_dual_mov_b32 v9, v4 :: v_dual_mov_b32 v8, v3 +; GFX1250-SDAG-NEXT: v_cvt_scale_pk8_f16_bf8 v[4:7], v[0:1], v2 +; GFX1250-SDAG-NEXT: global_store_b128 v[8:9], v[4:7], off +; GFX1250-SDAG-NEXT: s_endpgm +; +; GFX1250-GISEL-LABEL: test_cvt_scale_pk8_f16_bf8_vv: +; GFX1250-GISEL: ; %bb.0: +; GFX1250-GISEL-NEXT: v_dual_mov_b32 v8, v3 :: v_dual_mov_b32 v9, v4 +; GFX1250-GISEL-NEXT: v_cvt_scale_pk8_f16_bf8 v[4:7], v[0:1], v2 +; GFX1250-GISEL-NEXT: global_store_b128 v[8:9], v[4:7], off +; GFX1250-GISEL-NEXT: s_endpgm + %cvt = tail call <8 x half> @llvm.amdgcn.cvt.scale.pk8.f16.bf8(<2 x i32> %src, i32 %scale, i32 0) + store <8 x half> %cvt, ptr addrspace(1) %out, align 8 + ret void +} + +define amdgpu_ps void @test_cvt_scale_pk8_bf16_fp8_vv(<2 x i32> %src, i32 %scale, ptr addrspace(1) %out) { +; GFX1250-LABEL: test_cvt_scale_pk8_bf16_fp8_vv: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: v_dual_mov_b32 v9, v4 :: v_dual_mov_b32 v8, v3 +; GFX1250-NEXT: v_cvt_scale_pk8_bf16_fp8 v[4:7], v[0:1], v2 scale_sel:1 +; GFX1250-NEXT: global_store_b128 v[8:9], v[4:7], off +; GFX1250-NEXT: s_endpgm + %cvt = tail call <8 x bfloat> @llvm.amdgcn.cvt.scale.pk8.bf16.fp8(<2 x i32> %src, i32 %scale, i32 1) + store <8 x bfloat> %cvt, ptr addrspace(1) %out, align 8 + ret void +} + +define amdgpu_ps void @test_cvt_scale_pk8_bf16_bf8_vv(<2 x i32> %src, i32 %scale, ptr addrspace(1) %out) { +; GFX1250-LABEL: test_cvt_scale_pk8_bf16_bf8_vv: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: v_dual_mov_b32 v9, v4 :: v_dual_mov_b32 v8, v3 +; GFX1250-NEXT: v_cvt_scale_pk8_bf16_bf8 v[4:7], v[0:1], v2 scale_sel:2 +; GFX1250-NEXT: global_store_b128 v[8:9], v[4:7], off +; GFX1250-NEXT: s_endpgm + %cvt = tail call <8 x bfloat> @llvm.amdgcn.cvt.scale.pk8.bf16.bf8(<2 x i32> %src, i32 %scale, i32 2) + store <8 x bfloat> %cvt, ptr addrspace(1) %out, align 8 + ret void +} + +define amdgpu_ps void @test_cvt_scale_pk8_f16_fp4_vv(i32 %src, i32 %scale, ptr addrspace(1) %out) { +; GFX1250-LABEL: test_cvt_scale_pk8_f16_fp4_vv: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: v_cvt_scale_pk8_f16_fp4 v[4:7], v0, v1 scale_sel:3 +; GFX1250-NEXT: global_store_b128 v[2:3], v[4:7], off +; GFX1250-NEXT: s_endpgm + %cvt = tail call <8 x half> @llvm.amdgcn.cvt.scale.pk8.f16.fp4(i32 %src, i32 %scale, i32 3) + store <8 x half> %cvt, ptr addrspace(1) %out, align 16 + ret void +} + +define amdgpu_ps void @test_cvt_scale_pk8_bf16_fp4_vv(i32 %src, i32 %scale, ptr addrspace(1) %out) { +; GFX1250-LABEL: test_cvt_scale_pk8_bf16_fp4_vv: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: v_cvt_scale_pk8_bf16_fp4 v[4:7], v0, v1 scale_sel:4 +; GFX1250-NEXT: global_store_b128 v[2:3], v[4:7], off +; GFX1250-NEXT: s_endpgm + %cvt = tail call <8 x bfloat> @llvm.amdgcn.cvt.scale.pk8.bf16.fp4(i32 %src, i32 %scale, i32 4) + store <8 x bfloat> %cvt, ptr addrspace(1) %out, align 16 + ret void +} + +define amdgpu_ps void @test_cvt_scale_pk8_f32_fp8_vv(<2 x i32> %src, i32 %scale, ptr addrspace(1) %out) { +; GFX1250-SDAG-LABEL: test_cvt_scale_pk8_f32_fp8_vv: +; GFX1250-SDAG: ; %bb.0: +; GFX1250-SDAG-NEXT: v_dual_mov_b32 v13, v4 :: v_dual_mov_b32 v12, v3 +; GFX1250-SDAG-NEXT: v_cvt_scale_pk8_f32_fp8 v[4:11], v[0:1], v2 scale_sel:7 +; GFX1250-SDAG-NEXT: s_clause 0x1 +; GFX1250-SDAG-NEXT: global_store_b128 v[12:13], v[8:11], off offset:16 +; GFX1250-SDAG-NEXT: global_store_b128 v[12:13], v[4:7], off +; GFX1250-SDAG-NEXT: s_endpgm +; +; GFX1250-GISEL-LABEL: test_cvt_scale_pk8_f32_fp8_vv: +; GFX1250-GISEL: ; %bb.0: +; GFX1250-GISEL-NEXT: v_dual_mov_b32 v12, v3 :: v_dual_mov_b32 v13, v4 +; GFX1250-GISEL-NEXT: v_cvt_scale_pk8_f32_fp8 v[4:11], v[0:1], v2 scale_sel:7 +; GFX1250-GISEL-NEXT: s_clause 0x1 +; GFX1250-GISEL-NEXT: global_store_b128 v[12:13], v[4:7], off +; GFX1250-GISEL-NEXT: global_store_b128 v[12:13], v[8:11], off offset:16 +; GFX1250-GISEL-NEXT: s_endpgm + %cvt = tail call <8 x float> @llvm.amdgcn.cvt.scale.pk8.f32.fp8(<2 x i32> %src, i32 %scale, i32 7) + store <8 x float> %cvt, ptr addrspace(1) %out, align 16 + ret void +} + +define amdgpu_ps void @test_cvt_scale_pk8_f32_bf8_vv(<2 x i32> %src, i32 %scale, ptr addrspace(1) %out) { +; GFX1250-SDAG-LABEL: test_cvt_scale_pk8_f32_bf8_vv: +; GFX1250-SDAG: ; %bb.0: +; GFX1250-SDAG-NEXT: v_dual_mov_b32 v13, v4 :: v_dual_mov_b32 v12, v3 +; GFX1250-SDAG-NEXT: v_cvt_scale_pk8_f32_bf8 v[4:11], v[0:1], v2 +; GFX1250-SDAG-NEXT: s_clause 0x1 +; GFX1250-SDAG-NEXT: global_store_b128 v[12:13], v[8:11], off offset:16 +; GFX1250-SDAG-NEXT: global_store_b128 v[12:13], v[4:7], off +; GFX1250-SDAG-NEXT: s_endpgm +; +; GFX1250-GISEL-LABEL: test_cvt_scale_pk8_f32_bf8_vv: +; GFX1250-GISEL: ; %bb.0: +; GFX1250-GISEL-NEXT: v_dual_mov_b32 v12, v3 :: v_dual_mov_b32 v13, v4 +; GFX1250-GISEL-NEXT: v_cvt_scale_pk8_f32_bf8 v[4:11], v[0:1], v2 +; GFX1250-GISEL-NEXT: s_clause 0x1 +; GFX1250-GISEL-NEXT: global_store_b128 v[12:13], v[4:7], off +; GFX1250-GISEL-NEXT: global_store_b128 v[12:13], v[8:11], off offset:16 +; GFX1250-GISEL-NEXT: s_endpgm + %cvt = tail call <8 x float> @llvm.amdgcn.cvt.scale.pk8.f32.bf8(<2 x i32> %src, i32 %scale, i32 0) + store <8 x float> %cvt, ptr addrspace(1) %out, align 16 + ret void +} + +define amdgpu_ps void @test_cvt_scale_pk8_f32_fp4_vv(i32 %src, i32 %scale, ptr addrspace(1) %out) { +; GFX1250-SDAG-LABEL: test_cvt_scale_pk8_f32_fp4_vv: +; GFX1250-SDAG: ; %bb.0: +; GFX1250-SDAG-NEXT: v_cvt_scale_pk8_f32_fp4 v[4:11], v0, v1 scale_sel:1 +; GFX1250-SDAG-NEXT: s_clause 0x1 +; GFX1250-SDAG-NEXT: global_store_b128 v[2:3], v[8:11], off offset:16 +; GFX1250-SDAG-NEXT: global_store_b128 v[2:3], v[4:7], off +; GFX1250-SDAG-NEXT: s_endpgm +; +; GFX1250-GISEL-LABEL: test_cvt_scale_pk8_f32_fp4_vv: +; GFX1250-GISEL: ; %bb.0: +; GFX1250-GISEL-NEXT: v_cvt_scale_pk8_f32_fp4 v[4:11], v0, v1 scale_sel:1 +; GFX1250-GISEL-NEXT: s_clause 0x1 +; GFX1250-GISEL-NEXT: global_store_b128 v[2:3], v[4:7], off +; GFX1250-GISEL-NEXT: global_store_b128 v[2:3], v[8:11], off offset:16 +; GFX1250-GISEL-NEXT: s_endpgm + %cvt = tail call <8 x float> @llvm.amdgcn.cvt.scale.pk8.f32.fp4(i32 %src, i32 %scale, i32 1) + store <8 x float> %cvt, ptr addrspace(1) %out, align 32 + ret void +} diff --git a/llvm/test/MC/AMDGPU/gfx1250_asm_vop3-fake16.s b/llvm/test/MC/AMDGPU/gfx1250_asm_vop3-fake16.s index d73e214f1bedf..1f40a3249ff94 100644 --- a/llvm/test/MC/AMDGPU/gfx1250_asm_vop3-fake16.s +++ b/llvm/test/MC/AMDGPU/gfx1250_asm_vop3-fake16.s @@ -685,3 +685,84 @@ v_cvt_sr_bf8_f32 v10, s2, v5 v_cvt_sr_bf8_f32 v5, -|v255|, v4 // GFX1250: v_cvt_sr_bf8_f32 v5, -|v255|, v4 ; encoding: [0x05,0x01,0x6c,0xd7,0xff,0x09,0x02,0x20] + +v_cvt_scale_pk8_f16_fp8 v[10:13], v[20:21], v8 +// GFX1250: v_cvt_scale_pk8_f16_fp8 v[10:13], v[20:21], v8 ; encoding: [0x0a,0x00,0xa8,0xd6,0x14,0x11,0x02,0x00] + +v_cvt_scale_pk8_f16_fp8 v[10:13], v[20:21], 0xcf00 +// GFX1250: v_cvt_scale_pk8_f16_fp8 v[10:13], v[20:21], 0xcf00 ; encoding: [0x0a,0x00,0xa8,0xd6,0x14,0xff,0x01,0x00,0x00,0xcf,0x00,0x00] + +v_cvt_scale_pk8_f16_fp8 v[10:13], v[20:21], v8 scale_sel:5 +// GFX1250: v_cvt_scale_pk8_f16_fp8 v[10:13], v[20:21], v8 scale_sel:5 ; encoding: [0x0a,0x28,0xa8,0xd6,0x14,0x11,0x02,0x00] + +v_cvt_scale_pk8_bf16_fp8 v[10:13], v[20:21], v8 +// GFX1250: v_cvt_scale_pk8_bf16_fp8 v[10:13], v[20:21], v8 ; encoding: [0x0a,0x00,0xa9,0xd6,0x14,0x11,0x02,0x00] + +v_cvt_scale_pk8_bf16_fp8 v[10:13], v[20:21], 0xcf00 +// GFX1250: v_cvt_scale_pk8_bf16_fp8 v[10:13], v[20:21], 0xcf00 ; encoding: [0x0a,0x00,0xa9,0xd6,0x14,0xff,0x01,0x00,0x00,0xcf,0x00,0x00] + +v_cvt_scale_pk8_bf16_fp8 v[10:13], v[20:21], v8 scale_sel:6 +// GFX1250: v_cvt_scale_pk8_bf16_fp8 v[10:13], v[20:21], v8 scale_sel:6 ; encoding: [0x0a,0x30,0xa9,0xd6,0x14,0x11,0x02,0x00] + +v_cvt_scale_pk8_f16_bf8 v[10:13], v[20:21], v8 +// GFX1250: v_cvt_scale_pk8_f16_bf8 v[10:13], v[20:21], v8 ; encoding: [0x0a,0x00,0xab,0xd6,0x14,0x11,0x02,0x00] + +v_cvt_scale_pk8_f16_bf8 v[10:13], v[20:21], 0xcf00 +// GFX1250: v_cvt_scale_pk8_f16_bf8 v[10:13], v[20:21], 0xcf00 ; encoding: [0x0a,0x00,0xab,0xd6,0x14,0xff,0x01,0x00,0x00,0xcf,0x00,0x00] + +v_cvt_scale_pk8_f16_bf8 v[10:13], v[20:21], v8 scale_sel:7 +// GFX1250: v_cvt_scale_pk8_f16_bf8 v[10:13], v[20:21], v8 scale_sel:7 ; encoding: [0x0a,0x38,0xab,0xd6,0x14,0x11,0x02,0x00] + +v_cvt_scale_pk8_bf16_bf8 v[10:13], v[20:21], v8 +// GFX1250: v_cvt_scale_pk8_bf16_bf8 v[10:13], v[20:21], v8 ; encoding: [0x0a,0x00,0xac,0xd6,0x14,0x11,0x02,0x00] + +v_cvt_scale_pk8_bf16_bf8 v[10:13], v[20:21], 0xcf00 +// GFX1250: v_cvt_scale_pk8_bf16_bf8 v[10:13], v[20:21], 0xcf00 ; encoding: [0x0a,0x00,0xac,0xd6,0x14,0xff,0x01,0x00,0x00,0xcf,0x00,0x00] + +v_cvt_scale_pk8_bf16_bf8 v[10:13], v[20:21], v8 scale_sel:1 +// GFX1250: v_cvt_scale_pk8_bf16_bf8 v[10:13], v[20:21], v8 scale_sel:1 ; encoding: [0x0a,0x08,0xac,0xd6,0x14,0x11,0x02,0x00] + +v_cvt_scale_pk8_bf16_fp4 v[10:13], v20, v8 +// GFX1250: v_cvt_scale_pk8_bf16_fp4 v[10:13], v20, v8 ; encoding: [0x0a,0x00,0xa0,0xd6,0x14,0x11,0x02,0x00] + +v_cvt_scale_pk8_bf16_fp4 v[10:13], v20, 0xcf00 +// GFX1250: v_cvt_scale_pk8_bf16_fp4 v[10:13], v20, 0xcf00 ; encoding: [0x0a,0x00,0xa0,0xd6,0x14,0xff,0x01,0x00,0x00,0xcf,0x00,0x00] + +v_cvt_scale_pk8_bf16_fp4 v[10:13], v20, v8 scale_sel:2 +// GFX1250: v_cvt_scale_pk8_bf16_fp4 v[10:13], v20, v8 scale_sel:2 ; encoding: [0x0a,0x10,0xa0,0xd6,0x14,0x11,0x02,0x00] + +v_cvt_scale_pk8_f16_fp4 v[10:13], v20, v8 +// GFX1250: v_cvt_scale_pk8_f16_fp4 v[10:13], v20, v8 ; encoding: [0x0a,0x00,0x9f,0xd6,0x14,0x11,0x02,0x00] + +v_cvt_scale_pk8_f16_fp4 v[10:13], v20, 0xcf00 +// GFX1250: v_cvt_scale_pk8_f16_fp4 v[10:13], v20, 0xcf00 ; encoding: [0x0a,0x00,0x9f,0xd6,0x14,0xff,0x01,0x00,0x00,0xcf,0x00,0x00] + +v_cvt_scale_pk8_f16_fp4 v[10:13], v20, v8 scale_sel:3 +// GFX1250: v_cvt_scale_pk8_f16_fp4 v[10:13], v20, v8 scale_sel:3 ; encoding: [0x0a,0x18,0x9f,0xd6,0x14,0x11,0x02,0x00] + +v_cvt_scale_pk8_f32_fp8 v[10:17], v[20:21], v8 +// GFX1250: v_cvt_scale_pk8_f32_fp8 v[10:17], v[20:21], v8 ; encoding: [0x0a,0x00,0xaa,0xd6,0x14,0x11,0x02,0x00] + +v_cvt_scale_pk8_f32_fp8 v[10:17], v[20:21], 0xcf00 +// GFX1250: v_cvt_scale_pk8_f32_fp8 v[10:17], v[20:21], 0xcf00 ; encoding: [0x0a,0x00,0xaa,0xd6,0x14,0xff,0x01,0x00,0x00,0xcf,0x00,0x00] + +v_cvt_scale_pk8_f32_fp8 v[10:17], v[20:21], v8 scale_sel:6 +// GFX1250: v_cvt_scale_pk8_f32_fp8 v[10:17], v[20:21], v8 scale_sel:6 ; encoding: [0x0a,0x30,0xaa,0xd6,0x14,0x11,0x02,0x00] + +v_cvt_scale_pk8_f32_bf8 v[10:17], v[20:21], v8 +// GFX1250: v_cvt_scale_pk8_f32_bf8 v[10:17], v[20:21], v8 ; encoding: [0x0a,0x00,0xad,0xd6,0x14,0x11,0x02,0x00] + +v_cvt_scale_pk8_f32_bf8 v[10:17], v[20:21], 0xcf00 +// GFX1250: v_cvt_scale_pk8_f32_bf8 v[10:17], v[20:21], 0xcf00 ; encoding: [0x0a,0x00,0xad,0xd6,0x14,0xff,0x01,0x00,0x00,0xcf,0x00,0x00] + +v_cvt_scale_pk8_f32_bf8 v[10:17], v[20:21], v8 scale_sel:7 +// GFX1250: v_cvt_scale_pk8_f32_bf8 v[10:17], v[20:21], v8 scale_sel:7 ; encoding: [0x0a,0x38,0xad,0xd6,0x14,0x11,0x02,0x00] + +v_cvt_scale_pk8_f32_fp4 v[10:17], v20, v8 +// GFX1250: v_cvt_scale_pk8_f32_fp4 v[10:17], v20, v8 ; encoding: [0x0a,0x00,0xa1,0xd6,0x14,0x11,0x02,0x00] + +v_cvt_scale_pk8_f32_fp4 v[10:17], v20, 0xcf00 +// GFX1250: v_cvt_scale_pk8_f32_fp4 v[10:17], v20, 0xcf00 ; encoding: [0x0a,0x00,0xa1,0xd6,0x14,0xff,0x01,0x00,0x00,0xcf,0x00,0x00] + +v_cvt_scale_pk8_f32_fp4 v[10:17], v20, v8 scale_sel:1 +// GFX1250: v_cvt_scale_pk8_f32_fp4 v[10:17], v20, v8 scale_sel:1 ; encoding: [0x0a,0x08,0xa1,0xd6,0x14,0x11,0x02,0x00] diff --git a/llvm/test/MC/AMDGPU/gfx1250_asm_vop3.s b/llvm/test/MC/AMDGPU/gfx1250_asm_vop3.s index 33b003b4377c8..03f642d8ef33b 100644 --- a/llvm/test/MC/AMDGPU/gfx1250_asm_vop3.s +++ b/llvm/test/MC/AMDGPU/gfx1250_asm_vop3.s @@ -685,3 +685,84 @@ v_cvt_sr_bf8_f32 v10, s2, v5 v_cvt_sr_bf8_f32 v5, -|v255|, v4 // GFX1250: v_cvt_sr_bf8_f32 v5, -|v255|, v4 ; encoding: [0x05,0x01,0x6c,0xd7,0xff,0x09,0x02,0x20] + +v_cvt_scale_pk8_f16_fp8 v[10:13], v[20:21], v8 +// GFX1250: v_cvt_scale_pk8_f16_fp8 v[10:13], v[20:21], v8 ; encoding: [0x0a,0x00,0xa8,0xd6,0x14,0x11,0x02,0x00] + +v_cvt_scale_pk8_f16_fp8 v[10:13], v[20:21], 0xcf00 +// GFX1250: v_cvt_scale_pk8_f16_fp8 v[10:13], v[20:21], 0xcf00 ; encoding: [0x0a,0x00,0xa8,0xd6,0x14,0xff,0x01,0x00,0x00,0xcf,0x00,0x00] + +v_cvt_scale_pk8_f16_fp8 v[10:13], v[20:21], v8 scale_sel:5 +// GFX1250: v_cvt_scale_pk8_f16_fp8 v[10:13], v[20:21], v8 scale_sel:5 ; encoding: [0x0a,0x28,0xa8,0xd6,0x14,0x11,0x02,0x00] + +v_cvt_scale_pk8_bf16_fp8 v[10:13], v[20:21], v8 +// GFX1250: v_cvt_scale_pk8_bf16_fp8 v[10:13], v[20:21], v8 ; encoding: [0x0a,0x00,0xa9,0xd6,0x14,0x11,0x02,0x00] + +v_cvt_scale_pk8_bf16_fp8 v[10:13], v[20:21], 0xcf00 +// GFX1250: v_cvt_scale_pk8_bf16_fp8 v[10:13], v[20:21], 0xcf00 ; encoding: [0x0a,0x00,0xa9,0xd6,0x14,0xff,0x01,0x00,0x00,0xcf,0x00,0x00] + +v_cvt_scale_pk8_bf16_fp8 v[10:13], v[20:21], v8 scale_sel:6 +// GFX1250: v_cvt_scale_pk8_bf16_fp8 v[10:13], v[20:21], v8 scale_sel:6 ; encoding: [0x0a,0x30,0xa9,0xd6,0x14,0x11,0x02,0x00] + +v_cvt_scale_pk8_f16_bf8 v[10:13], v[20:21], v8 +// GFX1250: v_cvt_scale_pk8_f16_bf8 v[10:13], v[20:21], v8 ; encoding: [0x0a,0x00,0xab,0xd6,0x14,0x11,0x02,0x00] + +v_cvt_scale_pk8_f16_bf8 v[10:13], v[20:21], 0xcf00 +// GFX1250: v_cvt_scale_pk8_f16_bf8 v[10:13], v[20:21], 0xcf00 ; encoding: [0x0a,0x00,0xab,0xd6,0x14,0xff,0x01,0x00,0x00,0xcf,0x00,0x00] + +v_cvt_scale_pk8_f16_bf8 v[10:13], v[20:21], v8 scale_sel:7 +// GFX1250: v_cvt_scale_pk8_f16_bf8 v[10:13], v[20:21], v8 scale_sel:7 ; encoding: [0x0a,0x38,0xab,0xd6,0x14,0x11,0x02,0x00] + +v_cvt_scale_pk8_bf16_bf8 v[10:13], v[20:21], v8 +// GFX1250: v_cvt_scale_pk8_bf16_bf8 v[10:13], v[20:21], v8 ; encoding: [0x0a,0x00,0xac,0xd6,0x14,0x11,0x02,0x00] + +v_cvt_scale_pk8_bf16_bf8 v[10:13], v[20:21], 0xcf00 +// GFX1250: v_cvt_scale_pk8_bf16_bf8 v[10:13], v[20:21], 0xcf00 ; encoding: [0x0a,0x00,0xac,0xd6,0x14,0xff,0x01,0x00,0x00,0xcf,0x00,0x00] + +v_cvt_scale_pk8_bf16_bf8 v[10:13], v[20:21], v8 scale_sel:1 +// GFX1250: v_cvt_scale_pk8_bf16_bf8 v[10:13], v[20:21], v8 scale_sel:1 ; encoding: [0x0a,0x08,0xac,0xd6,0x14,0x11,0x02,0x00] + +v_cvt_scale_pk8_bf16_fp4 v[10:13], v20, v8 +// GFX1250: v_cvt_scale_pk8_bf16_fp4 v[10:13], v20, v8 ; encoding: [0x0a,0x00,0xa0,0xd6,0x14,0x11,0x02,0x00] + +v_cvt_scale_pk8_bf16_fp4 v[10:13], v20, 0xcf00 +// GFX1250: v_cvt_scale_pk8_bf16_fp4 v[10:13], v20, 0xcf00 ; encoding: [0x0a,0x00,0xa0,0xd6,0x14,0xff,0x01,0x00,0x00,0xcf,0x00,0x00] + +v_cvt_scale_pk8_bf16_fp4 v[10:13], v20, v8 scale_sel:2 +// GFX1250: v_cvt_scale_pk8_bf16_fp4 v[10:13], v20, v8 scale_sel:2 ; encoding: [0x0a,0x10,0xa0,0xd6,0x14,0x11,0x02,0x00] + +v_cvt_scale_pk8_f16_fp4 v[10:13], v20, v8 +// GFX1250: v_cvt_scale_pk8_f16_fp4 v[10:13], v20, v8 ; encoding: [0x0a,0x00,0x9f,0xd6,0x14,0x11,0x02,0x00] + +v_cvt_scale_pk8_f16_fp4 v[10:13], v20, 0xcf00 +// GFX1250: v_cvt_scale_pk8_f16_fp4 v[10:13], v20, 0xcf00 ; encoding: [0x0a,0x00,0x9f,0xd6,0x14,0xff,0x01,0x00,0x00,0xcf,0x00,0x00] + +v_cvt_scale_pk8_f16_fp4 v[10:13], v20, v8 scale_sel:3 +// GFX1250: v_cvt_scale_pk8_f16_fp4 v[10:13], v20, v8 scale_sel:3 ; encoding: [0x0a,0x18,0x9f,0xd6,0x14,0x11,0x02,0x00] + +v_cvt_scale_pk8_f32_fp8 v[10:17], v[20:21], v8 +// GFX1250: v_cvt_scale_pk8_f32_fp8 v[10:17], v[20:21], v8 ; encoding: [0x0a,0x00,0xaa,0xd6,0x14,0x11,0x02,0x00] + +v_cvt_scale_pk8_f32_fp8 v[10:17], v[20:21], 0xcf00 +// GFX1250: v_cvt_scale_pk8_f32_fp8 v[10:17], v[20:21], 0xcf00 ; encoding: [0x0a,0x00,0xaa,0xd6,0x14,0xff,0x01,0x00,0x00,0xcf,0x00,0x00] + +v_cvt_scale_pk8_f32_fp8 v[10:17], v[20:21], v8 scale_sel:6 +// GFX1250: v_cvt_scale_pk8_f32_fp8 v[10:17], v[20:21], v8 scale_sel:6 ; encoding: [0x0a,0x30,0xaa,0xd6,0x14,0x11,0x02,0x00] + +v_cvt_scale_pk8_f32_bf8 v[10:17], v[20:21], v8 +// GFX1250: v_cvt_scale_pk8_f32_bf8 v[10:17], v[20:21], v8 ; encoding: [0x0a,0x00,0xad,0xd6,0x14,0x11,0x02,0x00] + +v_cvt_scale_pk8_f32_bf8 v[10:17], v[20:21], 0xcf00 +// GFX1250: v_cvt_scale_pk8_f32_bf8 v[10:17], v[20:21], 0xcf00 ; encoding: [0x0a,0x00,0xad,0xd6,0x14,0xff,0x01,0x00,0x00,0xcf,0x00,0x00] + +v_cvt_scale_pk8_f32_bf8 v[10:17], v[20:21], v8 scale_sel:7 +// GFX1250: v_cvt_scale_pk8_f32_bf8 v[10:17], v[20:21], v8 scale_sel:7 ; encoding: [0x0a,0x38,0xad,0xd6,0x14,0x11,0x02,0x00] + +v_cvt_scale_pk8_f32_fp4 v[10:17], v20, v8 +// GFX1250: v_cvt_scale_pk8_f32_fp4 v[10:17], v20, v8 ; encoding: [0x0a,0x00,0xa1,0xd6,0x14,0x11,0x02,0x00] + +v_cvt_scale_pk8_f32_fp4 v[10:17], v20, 0xcf00 +// GFX1250: v_cvt_scale_pk8_f32_fp4 v[10:17], v20, 0xcf00 ; encoding: [0x0a,0x00,0xa1,0xd6,0x14,0xff,0x01,0x00,0x00,0xcf,0x00,0x00] + +v_cvt_scale_pk8_f32_fp4 v[10:17], v20, v8 scale_sel:1 +// GFX1250: v_cvt_scale_pk8_f32_fp4 v[10:17], v20, v8 scale_sel:1 ; encoding: [0x0a,0x08,0xa1,0xd6,0x14,0x11,0x02,0x00] diff --git a/llvm/test/MC/AMDGPU/gfx1250_asm_vop3_err.s b/llvm/test/MC/AMDGPU/gfx1250_asm_vop3_err.s index 301cfdd217b9f..c5bd00c004a43 100644 --- a/llvm/test/MC/AMDGPU/gfx1250_asm_vop3_err.s +++ b/llvm/test/MC/AMDGPU/gfx1250_asm_vop3_err.s @@ -117,7 +117,47 @@ v_cvt_sr_fp8_f16 v1, v2, v3 mul:2 // GFX125X-ERR-NEXT:{{^}}v_cvt_sr_fp8_f16 v1, v2, v3 mul:2 // GFX125X-ERR-NEXT:{{^}} ^ +v_cvt_scale_pk8_f32_fp8 v[10:17], v[20:21], v8 scale_sel:8 +// GFX125X-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid scale_sel value. +// GFX125X-ERR-NEXT:{{^}}v_cvt_scale_pk8_f32_fp8 v[10:17], v[20:21], v8 scale_sel:8 +// GFX125X-ERR-NEXT:{{^}} ^ + v_cvt_sr_bf8_f16 v1, v2, v3 byte_sel:4 // GFX125X-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid byte_sel value. // GFX125X-ERR-NEXT:{{^}}v_cvt_sr_bf8_f16 v1, v2, v3 byte_sel:4 // GFX125X-ERR-NEXT:{{^}} ^ + +v_cvt_scale_pk8_f16_fp8 v[10:13], s[20:21], v8 +// GFX125X-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction +// GFX125X-ERR-NEXT:{{^}}v_cvt_scale_pk8_f16_fp8 v[10:13], s[20:21], v8 +// GFX125X-ERR-NEXT:{{^}} ^ + +v_cvt_scale_pk8_f16_fp8 v[10:13], 1, v8 +// GFX125X-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction +// GFX125X-ERR-NEXT:{{^}}v_cvt_scale_pk8_f16_fp8 v[10:13], 1, v8 +// GFX125X-ERR-NEXT:{{^}} ^ + +v_cvt_scale_pk8_bf16_fp8 v[10:13], s[20:21], v8 +// GFX125X-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction +// GFX125X-ERR-NEXT:{{^}}v_cvt_scale_pk8_bf16_fp8 v[10:13], s[20:21], v8 +// GFX125X-ERR-NEXT:{{^}} ^ + +v_cvt_scale_pk8_f32_fp8 v[10:17], s[20:21], v8 +// GFX125X-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction +// GFX125X-ERR-NEXT:{{^}}v_cvt_scale_pk8_f32_fp8 v[10:17], s[20:21], v8 +// GFX125X-ERR-NEXT:{{^}} ^ + +v_cvt_scale_pk8_f16_fp4 v[10:13], s20, v8 +// GFX125X-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction +// GFX125X-ERR-NEXT:{{^}}v_cvt_scale_pk8_f16_fp4 v[10:13], s20, v8 +// GFX125X-ERR-NEXT:{{^}} ^ + +v_cvt_scale_pk8_bf16_fp4 v[10:13], s20, v8 +// GFX125X-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction +// GFX125X-ERR-NEXT:{{^}}v_cvt_scale_pk8_bf16_fp4 v[10:13], s20, v8 +// GFX125X-ERR-NEXT:{{^}} ^ + +v_cvt_scale_pk8_f32_fp4 v[10:17], s20, v8 +// GFX125X-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction +// GFX125X-ERR-NEXT:{{^}}v_cvt_scale_pk8_f32_fp4 v[10:17], s20, v8 +// GFX125X-ERR-NEXT:{{^}} ^ diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_vop3.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_vop3.txt index cf6a999d645be..ce8cfcbc1e987 100644 --- a/llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_vop3.txt +++ b/llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_vop3.txt @@ -736,3 +736,87 @@ 0x05,0x01,0x6c,0xd7,0xff,0x09,0x02,0x20 # GFX1250: v_cvt_sr_bf8_f32 v5, -|v255|, v4 ; encoding: [0x05,0x01,0x6c,0xd7,0xff,0x09,0x02,0x20] + +0x0a,0x00,0xac,0xd6,0x14,0xff,0x01,0x00,0x00,0xcf,0x00,0x00 +# GFX1250: v_cvt_scale_pk8_bf16_bf8 v[10:13], v[20:21], 0xcf00 ; encoding: [0x0a,0x00,0xac,0xd6,0x14,0xff,0x01,0x00,0x00,0xcf,0x00,0x00] + +0x0a,0x00,0xac,0xd6,0x14,0x11,0x02,0x00 +# GFX1250: v_cvt_scale_pk8_bf16_bf8 v[10:13], v[20:21], v8 ; encoding: [0x0a,0x00,0xac,0xd6,0x14,0x11,0x02,0x00] + +0x0a,0x08,0xac,0xd6,0x14,0x11,0x02,0x00 +# GFX1250: v_cvt_scale_pk8_bf16_bf8 v[10:13], v[20:21], v8 scale_sel:1 ; encoding: [0x0a,0x08,0xac,0xd6,0x14,0x11,0x02,0x00] + +0x0a,0x00,0xa9,0xd6,0x14,0xff,0x01,0x00,0x00,0xcf,0x00,0x00 +# GFX1250: v_cvt_scale_pk8_bf16_fp8 v[10:13], v[20:21], 0xcf00 ; encoding: [0x0a,0x00,0xa9,0xd6,0x14,0xff,0x01,0x00,0x00,0xcf,0x00,0x00] + +0x0a,0x00,0xa9,0xd6,0x14,0x11,0x02,0x00 +# GFX1250: v_cvt_scale_pk8_bf16_fp8 v[10:13], v[20:21], v8 ; encoding: [0x0a,0x00,0xa9,0xd6,0x14,0x11,0x02,0x00] + +0x0a,0x00,0xab,0xd6,0x14,0xff,0x01,0x00,0x00,0xcf,0x00,0x00 +# GFX1250: v_cvt_scale_pk8_f16_bf8 v[10:13], v[20:21], 0xcf00 ; encoding: [0x0a,0x00,0xab,0xd6,0x14,0xff,0x01,0x00,0x00,0xcf,0x00,0x00] + +0x0a,0x00,0xab,0xd6,0x14,0x11,0x02,0x00 +# GFX1250: v_cvt_scale_pk8_f16_bf8 v[10:13], v[20:21], v8 ; encoding: [0x0a,0x00,0xab,0xd6,0x14,0x11,0x02,0x00] + +0x0a,0x38,0xab,0xd6,0x14,0x11,0x02,0x00 +# GFX1250: v_cvt_scale_pk8_f16_bf8 v[10:13], v[20:21], v8 scale_sel:7 ; encoding: [0x0a,0x38,0xab,0xd6,0x14,0x11,0x02,0x00] + +0x0a,0x00,0xa8,0xd6,0x14,0xff,0x01,0x00,0x00,0xcf,0x00,0x00 +# GFX1250: v_cvt_scale_pk8_f16_fp8 v[10:13], v[20:21], 0xcf00 ; encoding: [0x0a,0x00,0xa8,0xd6,0x14,0xff,0x01,0x00,0x00,0xcf,0x00,0x00] + +0x0a,0x00,0xa8,0xd6,0x14,0x11,0x02,0x00 +# GFX1250: v_cvt_scale_pk8_f16_fp8 v[10:13], v[20:21], v8 ; encoding: [0x0a,0x00,0xa8,0xd6,0x14,0x11,0x02,0x00] + +0x0a,0x28,0xa8,0xd6,0x14,0x11,0x02,0x00 +# GFX1250: v_cvt_scale_pk8_f16_fp8 v[10:13], v[20:21], v8 scale_sel:5 ; encoding: [0x0a,0x28,0xa8,0xd6,0x14,0x11,0x02,0x00] + +0x0a,0x30,0xa9,0xd6,0x14,0x11,0x02,0x00 +# GFX1250: v_cvt_scale_pk8_bf16_fp8 v[10:13], v[20:21], v8 scale_sel:6 ; encoding: [0x0a,0x30,0xa9,0xd6,0x14,0x11,0x02,0x00] + +0x0a,0x00,0xa0,0xd6,0x14,0xff,0x01,0x00,0x00,0xcf,0x00,0x00 +# GFX1250: v_cvt_scale_pk8_bf16_fp4 v[10:13], v20, 0xcf00 ; encoding: [0x0a,0x00,0xa0,0xd6,0x14,0xff,0x01,0x00,0x00,0xcf,0x00,0x00] + +0x0a,0x00,0xa0,0xd6,0x14,0x11,0x02,0x00 +# GFX1250: v_cvt_scale_pk8_bf16_fp4 v[10:13], v20, v8 ; encoding: [0x0a,0x00,0xa0,0xd6,0x14,0x11,0x02,0x00] + +0x0a,0x10,0xa0,0xd6,0x14,0x11,0x02,0x00 +# GFX1250: v_cvt_scale_pk8_bf16_fp4 v[10:13], v20, v8 scale_sel:2 ; encoding: [0x0a,0x10,0xa0,0xd6,0x14,0x11,0x02,0x00] + +0x0a,0x00,0x9f,0xd6,0x14,0xff,0x01,0x00,0x00,0xcf,0x00,0x00 +# GFX1250: v_cvt_scale_pk8_f16_fp4 v[10:13], v20, 0xcf00 ; encoding: [0x0a,0x00,0x9f,0xd6,0x14,0xff,0x01,0x00,0x00,0xcf,0x00,0x00] + +0x0a,0x00,0x9f,0xd6,0x14,0x11,0x02,0x00 +# GFX1250: v_cvt_scale_pk8_f16_fp4 v[10:13], v20, v8 ; encoding: [0x0a,0x00,0x9f,0xd6,0x14,0x11,0x02,0x00] + +0x0a,0x18,0x9f,0xd6,0x14,0x11,0x02,0x00 +# GFX1250: v_cvt_scale_pk8_f16_fp4 v[10:13], v20, v8 scale_sel:3 ; encoding: [0x0a,0x18,0x9f,0xd6,0x14,0x11,0x02,0x00] + +0x0a,0x00,0xad,0xd6,0x14,0xff,0x01,0x00,0x00,0xcf,0x00,0x00 +# GFX1250: v_cvt_scale_pk8_f32_bf8 v[10:17], v[20:21], 0xcf00 ; encoding: [0x0a,0x00,0xad,0xd6,0x14,0xff,0x01,0x00,0x00,0xcf,0x00,0x00] + +0x0a,0x00,0xad,0xd6,0x14,0x11,0x02,0x00 +# GFX1250: v_cvt_scale_pk8_f32_bf8 v[10:17], v[20:21], v8 ; encoding: [0x0a,0x00,0xad,0xd6,0x14,0x11,0x02,0x00] + +0x0a,0x38,0xad,0xd6,0x14,0x11,0x02,0x00 +# GFX1250: v_cvt_scale_pk8_f32_bf8 v[10:17], v[20:21], v8 scale_sel:7 ; encoding: [0x0a,0x38,0xad,0xd6,0x14,0x11,0x02,0x00] + +0x0a,0x00,0xaa,0xd6,0x14,0xff,0x01,0x00,0x00,0xcf,0x00,0x00 +# GFX1250: v_cvt_scale_pk8_f32_fp8 v[10:17], v[20:21], 0xcf00 ; encoding: [0x0a,0x00,0xaa,0xd6,0x14,0xff,0x01,0x00,0x00,0xcf,0x00,0x00] + +0x0a,0x00,0xaa,0xd6,0x14,0x11,0x02,0x00 +# GFX1250: v_cvt_scale_pk8_f32_fp8 v[10:17], v[20:21], v8 ; encoding: [0x0a,0x00,0xaa,0xd6,0x14,0x11,0x02,0x00] + +0x0a,0x30,0xaa,0xd6,0x14,0x11,0x02,0x00 +# GFX1250: v_cvt_scale_pk8_f32_fp8 v[10:17], v[20:21], v8 scale_sel:6 ; encoding: [0x0a,0x30,0xaa,0xd6,0x14,0x11,0x02,0x00] + +0x0a,0x00,0xa1,0xd6,0x14,0xff,0x01,0x00,0x00,0xcf,0x00,0x00 +# GFX1250: v_cvt_scale_pk8_f32_fp4 v[10:17], v20, 0xcf00 ; encoding: [0x0a,0x00,0xa1,0xd6,0x14,0xff,0x01,0x00,0x00,0xcf,0x00,0x00] + +0x0a,0x00,0xa1,0xd6,0x14,0x11,0x02,0x00 +# GFX1250: v_cvt_scale_pk8_f32_fp4 v[10:17], v20, v8 ; encoding: [0x0a,0x00,0xa1,0xd6,0x14,0x11,0x02,0x00] + +0x0a,0x00,0xa1,0xd6,0x14,0xff,0x01,0x00,0x00,0xcf,0x00,0x00 +# GFX1250: v_cvt_scale_pk8_f32_fp4 v[10:17], v20, 0xcf00 ; encoding: [0x0a,0x00,0xa1,0xd6,0x14,0xff,0x01,0x00,0x00,0xcf,0x00,0x00] + +0x0a,0x08,0xa1,0xd6,0x14,0x11,0x02,0x00 +# GFX1250: v_cvt_scale_pk8_f32_fp4 v[10:17], v20, v8 scale_sel:1 ; encoding: [0x0a,0x08,0xa1,0xd6,0x14,0x11,0x02,0x00] _______________________________________________ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits