https://github.com/rampitec updated https://github.com/llvm/llvm-project/pull/151773
>From dead6ea1ef30c5dba70e2709faad18293ae3895f Mon Sep 17 00:00:00 2001 From: Stanislav Mekhanoshin <stanislav.mekhanos...@amd.com> Date: Fri, 1 Aug 2025 14:09:42 -0700 Subject: [PATCH] [AMDGPU] gfx1250 v_perm_pk16_* instructions --- clang/include/clang/Basic/BuiltinsAMDGPU.def | 4 ++ clang/test/CodeGenOpenCL/amdgpu-features.cl | 2 +- .../CodeGenOpenCL/builtins-amdgcn-gfx1250.cl | 55 ++++++++++++++++ llvm/include/llvm/IR/IntrinsicsAMDGPU.td | 12 ++++ llvm/lib/Target/AMDGPU/AMDGPU.td | 10 +++ .../Target/AMDGPU/AMDGPURegisterBankInfo.cpp | 3 + llvm/lib/Target/AMDGPU/GCNSubtarget.h | 3 + llvm/lib/Target/AMDGPU/SIInstrInfo.td | 2 + llvm/lib/Target/AMDGPU/SIRegisterInfo.td | 1 + llvm/lib/Target/AMDGPU/VOP3Instructions.td | 15 +++++ llvm/lib/TargetParser/TargetParser.cpp | 1 + .../CodeGen/AMDGPU/llvm.amdgcn.perm.pk.ll | 66 +++++++++++++++++++ llvm/test/MC/AMDGPU/gfx1250_asm_vop3-fake16.s | 45 +++++++++++++ llvm/test/MC/AMDGPU/gfx1250_asm_vop3.s | 45 +++++++++++++ .../Disassembler/AMDGPU/gfx1250_dasm_vop3.txt | 45 +++++++++++++ 15 files changed, 308 insertions(+), 1 deletion(-) create mode 100644 llvm/test/CodeGen/AMDGPU/llvm.amdgcn.perm.pk.ll diff --git a/clang/include/clang/Basic/BuiltinsAMDGPU.def b/clang/include/clang/Basic/BuiltinsAMDGPU.def index 9196f5583e45f5..a2e109b416b9d6 100644 --- a/clang/include/clang/Basic/BuiltinsAMDGPU.def +++ b/clang/include/clang/Basic/BuiltinsAMDGPU.def @@ -745,6 +745,10 @@ TARGET_BUILTIN(__builtin_amdgcn_permlane_down, "iiii", "nc", "gfx1250-insts,wave TARGET_BUILTIN(__builtin_amdgcn_permlane_xor, "iiii", "nc", "gfx1250-insts,wavefrontsize32") TARGET_BUILTIN(__builtin_amdgcn_permlane_idx_gen, "iii", "nc", "gfx1250-insts,wavefrontsize32") +TARGET_BUILTIN(__builtin_amdgcn_perm_pk16_b4_u4, "V2UiUiUiV2Ui", "nc", "tensor-cvt-lut-insts") +TARGET_BUILTIN(__builtin_amdgcn_perm_pk16_b6_u4, "V3UiUiULiV2Ui", "nc", "tensor-cvt-lut-insts") +TARGET_BUILTIN(__builtin_amdgcn_perm_pk16_b8_u4, "V4UiULiULiV2Ui", "nc", "tensor-cvt-lut-insts") + // GFX1250 WMMA builtins TARGET_BUILTIN(__builtin_amdgcn_wmma_f32_16x16x4_f32, "V8fIbV2fIbV2fIsV8fIbIb", "nc", "gfx1250-insts,wavefrontsize32") TARGET_BUILTIN(__builtin_amdgcn_wmma_f32_16x16x32_bf16, "V8fIbV16yIbV16yIsV8fIbIb", "nc", "gfx1250-insts,wavefrontsize32") diff --git a/clang/test/CodeGenOpenCL/amdgpu-features.cl b/clang/test/CodeGenOpenCL/amdgpu-features.cl index df71ead39f48ce..9ae947985e5457 100644 --- a/clang/test/CodeGenOpenCL/amdgpu-features.cl +++ b/clang/test/CodeGenOpenCL/amdgpu-features.cl @@ -108,7 +108,7 @@ // GFX1153: "target-features"="+16-bit-insts,+atomic-fadd-rtn-insts,+ci-insts,+dl-insts,+dot10-insts,+dot12-insts,+dot5-insts,+dot7-insts,+dot8-insts,+dot9-insts,+dpp,+gfx10-3-insts,+gfx10-insts,+gfx11-insts,+gfx8-insts,+gfx9-insts,+wavefrontsize32" // GFX1200: "target-features"="+16-bit-insts,+atomic-buffer-global-pk-add-f16-insts,+atomic-buffer-pk-add-bf16-inst,+atomic-ds-pk-add-16-insts,+atomic-fadd-rtn-insts,+atomic-flat-pk-add-16-insts,+atomic-global-pk-add-bf16-inst,+ci-insts,+dl-insts,+dot10-insts,+dot11-insts,+dot12-insts,+dot7-insts,+dot8-insts,+dot9-insts,+dpp,+fp8-conversion-insts,+gfx10-3-insts,+gfx10-insts,+gfx11-insts,+gfx12-insts,+gfx8-insts,+gfx9-insts,+wavefrontsize32" // GFX1201: "target-features"="+16-bit-insts,+atomic-buffer-global-pk-add-f16-insts,+atomic-buffer-pk-add-bf16-inst,+atomic-ds-pk-add-16-insts,+atomic-fadd-rtn-insts,+atomic-flat-pk-add-16-insts,+atomic-global-pk-add-bf16-inst,+ci-insts,+dl-insts,+dot10-insts,+dot11-insts,+dot12-insts,+dot7-insts,+dot8-insts,+dot9-insts,+dpp,+fp8-conversion-insts,+gfx10-3-insts,+gfx10-insts,+gfx11-insts,+gfx12-insts,+gfx8-insts,+gfx9-insts,+wavefrontsize32" -// GFX1250: "target-features"="+16-bit-insts,+ashr-pk-insts,+atomic-buffer-global-pk-add-f16-insts,+atomic-buffer-pk-add-bf16-inst,+atomic-ds-pk-add-16-insts,+atomic-fadd-rtn-insts,+atomic-flat-pk-add-16-insts,+atomic-global-pk-add-bf16-inst,+bf16-cvt-insts,+bf16-trans-insts,+bitop3-insts,+ci-insts,+dl-insts,+dot7-insts,+dot8-insts,+dpp,+fp8-conversion-insts,+fp8e5m3-insts,+gfx10-3-insts,+gfx10-insts,+gfx11-insts,+gfx12-insts,+gfx1250-insts,+gfx8-insts,+gfx9-insts,+permlane16-swap,+prng-inst,+setprio-inc-wg-inst,+tanh-insts,+transpose-load-f4f6-insts,+vmem-pref-insts,+wavefrontsize32" +// GFX1250: "target-features"="+16-bit-insts,+ashr-pk-insts,+atomic-buffer-global-pk-add-f16-insts,+atomic-buffer-pk-add-bf16-inst,+atomic-ds-pk-add-16-insts,+atomic-fadd-rtn-insts,+atomic-flat-pk-add-16-insts,+atomic-global-pk-add-bf16-inst,+bf16-cvt-insts,+bf16-trans-insts,+bitop3-insts,+ci-insts,+dl-insts,+dot7-insts,+dot8-insts,+dpp,+fp8-conversion-insts,+fp8e5m3-insts,+gfx10-3-insts,+gfx10-insts,+gfx11-insts,+gfx12-insts,+gfx1250-insts,+gfx8-insts,+gfx9-insts,+permlane16-swap,+prng-inst,+setprio-inc-wg-inst,+tanh-insts,+tensor-cvt-lut-insts,+transpose-load-f4f6-insts,+vmem-pref-insts,+wavefrontsize32" // GFX1103-W64: "target-features"="+16-bit-insts,+atomic-fadd-rtn-insts,+ci-insts,+dl-insts,+dot10-insts,+dot12-insts,+dot5-insts,+dot7-insts,+dot8-insts,+dot9-insts,+dpp,+gfx10-3-insts,+gfx10-insts,+gfx11-insts,+gfx8-insts,+gfx9-insts,+wavefrontsize64" diff --git a/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx1250.cl b/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx1250.cl index 177df6c1e555ac..7ca6106432e501 100644 --- a/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx1250.cl +++ b/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx1250.cl @@ -1070,6 +1070,61 @@ void test_permlane_idx_gen(global uint* out, uint src0, uint src1) { *out = __builtin_amdgcn_permlane_idx_gen(src0, src1); } +// CHECK-LABEL: @test_perm_pk( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[A32_ADDR:%.*]] = alloca i32, align 4, addrspace(5) +// CHECK-NEXT: [[A64_ADDR:%.*]] = alloca i32, align 4, addrspace(5) +// CHECK-NEXT: [[B32_ADDR:%.*]] = alloca i32, align 4, addrspace(5) +// CHECK-NEXT: [[B64_ADDR:%.*]] = alloca i32, align 4, addrspace(5) +// CHECK-NEXT: [[C_ADDR:%.*]] = alloca <2 x i32>, align 8, addrspace(5) +// CHECK-NEXT: [[OUT2_ADDR:%.*]] = alloca ptr, align 8, addrspace(5) +// CHECK-NEXT: [[OUT3_ADDR:%.*]] = alloca ptr, align 8, addrspace(5) +// CHECK-NEXT: [[OUT4_ADDR:%.*]] = alloca ptr, align 8, addrspace(5) +// CHECK-NEXT: [[A32_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[A32_ADDR]] to ptr +// CHECK-NEXT: [[A64_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[A64_ADDR]] to ptr +// CHECK-NEXT: [[B32_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[B32_ADDR]] to ptr +// CHECK-NEXT: [[B64_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[B64_ADDR]] to ptr +// CHECK-NEXT: [[C_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[C_ADDR]] to ptr +// CHECK-NEXT: [[OUT2_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[OUT2_ADDR]] to ptr +// CHECK-NEXT: [[OUT3_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[OUT3_ADDR]] to ptr +// CHECK-NEXT: [[OUT4_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[OUT4_ADDR]] to ptr +// CHECK-NEXT: store i32 [[A32:%.*]], ptr [[A32_ADDR_ASCAST]], align 4 +// CHECK-NEXT: store i32 [[A64:%.*]], ptr [[A64_ADDR_ASCAST]], align 4 +// CHECK-NEXT: store i32 [[B32:%.*]], ptr [[B32_ADDR_ASCAST]], align 4 +// CHECK-NEXT: store i32 [[B64:%.*]], ptr [[B64_ADDR_ASCAST]], align 4 +// CHECK-NEXT: store <2 x i32> [[C:%.*]], ptr [[C_ADDR_ASCAST]], align 8 +// CHECK-NEXT: store ptr [[OUT2:%.*]], ptr [[OUT2_ADDR_ASCAST]], align 8 +// CHECK-NEXT: store ptr [[OUT3:%.*]], ptr [[OUT3_ADDR_ASCAST]], align 8 +// CHECK-NEXT: store ptr [[OUT4:%.*]], ptr [[OUT4_ADDR_ASCAST]], align 8 +// CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr [[A32_ADDR_ASCAST]], align 4 +// CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr [[B32_ADDR_ASCAST]], align 4 +// CHECK-NEXT: [[TMP2:%.*]] = load <2 x i32>, ptr [[C_ADDR_ASCAST]], align 8 +// CHECK-NEXT: [[TMP3:%.*]] = call <2 x i32> @llvm.amdgcn.perm.pk16.b4.u4(i32 [[TMP0]], i32 [[TMP1]], <2 x i32> [[TMP2]]) +// CHECK-NEXT: [[TMP4:%.*]] = load ptr, ptr [[OUT2_ADDR_ASCAST]], align 8 +// CHECK-NEXT: store <2 x i32> [[TMP3]], ptr [[TMP4]], align 8 +// CHECK-NEXT: [[TMP5:%.*]] = load i32, ptr [[A32_ADDR_ASCAST]], align 4 +// CHECK-NEXT: [[TMP6:%.*]] = load i32, ptr [[B64_ADDR_ASCAST]], align 4 +// CHECK-NEXT: [[CONV:%.*]] = zext i32 [[TMP6]] to i64 +// CHECK-NEXT: [[TMP7:%.*]] = load <2 x i32>, ptr [[C_ADDR_ASCAST]], align 8 +// CHECK-NEXT: [[TMP8:%.*]] = call <3 x i32> @llvm.amdgcn.perm.pk16.b6.u4(i32 [[TMP5]], i64 [[CONV]], <2 x i32> [[TMP7]]) +// CHECK-NEXT: [[TMP9:%.*]] = load ptr, ptr [[OUT3_ADDR_ASCAST]], align 8 +// CHECK-NEXT: store <3 x i32> [[TMP8]], ptr [[TMP9]], align 16 +// CHECK-NEXT: [[TMP10:%.*]] = load i32, ptr [[A64_ADDR_ASCAST]], align 4 +// CHECK-NEXT: [[CONV1:%.*]] = zext i32 [[TMP10]] to i64 +// CHECK-NEXT: [[TMP11:%.*]] = load i32, ptr [[B64_ADDR_ASCAST]], align 4 +// CHECK-NEXT: [[CONV2:%.*]] = zext i32 [[TMP11]] to i64 +// CHECK-NEXT: [[TMP12:%.*]] = load <2 x i32>, ptr [[C_ADDR_ASCAST]], align 8 +// CHECK-NEXT: [[TMP13:%.*]] = call <4 x i32> @llvm.amdgcn.perm.pk16.b8.u4(i64 [[CONV1]], i64 [[CONV2]], <2 x i32> [[TMP12]]) +// CHECK-NEXT: [[TMP14:%.*]] = load ptr, ptr [[OUT4_ADDR_ASCAST]], align 8 +// CHECK-NEXT: store <4 x i32> [[TMP13]], ptr [[TMP14]], align 16 +// CHECK-NEXT: ret void +// +void test_perm_pk(uint a32, uint a64, uint b32, uint b64, uint2 c, uint2 *out2, uint3 *out3, uint4 *out4) { + *out2 = __builtin_amdgcn_perm_pk16_b4_u4(a32, b32, c); + *out3 = __builtin_amdgcn_perm_pk16_b6_u4(a32, b64, c); + *out4 = __builtin_amdgcn_perm_pk16_b8_u4(a64, b64, c); +} + // CHECK-LABEL: @test_prefetch( // CHECK-NEXT: entry: // CHECK-NEXT: [[FPTR_ADDR:%.*]] = alloca ptr, align 8, addrspace(5) diff --git a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td index af7b757f6ebe93..3e331d06359103 100644 --- a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td +++ b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td @@ -3705,6 +3705,18 @@ def int_amdgcn_permlane_idx_gen : ClangBuiltin<"__builtin_amdgcn_permlane_idx_ge [llvm_i32_ty, llvm_i32_ty], [IntrNoMem, IntrConvergent, IntrWillReturn, IntrNoCallback, IntrNoFree]>; +def int_amdgcn_perm_pk16_b4_u4 : ClangBuiltin<"__builtin_amdgcn_perm_pk16_b4_u4">, + DefaultAttrsIntrinsic<[llvm_v2i32_ty], [llvm_i32_ty, llvm_i32_ty, llvm_v2i32_ty], + [IntrNoMem, IntrSpeculatable]>; + +def int_amdgcn_perm_pk16_b6_u4 : ClangBuiltin<"__builtin_amdgcn_perm_pk16_b6_u4">, + DefaultAttrsIntrinsic<[llvm_v3i32_ty], [llvm_i32_ty, llvm_i64_ty, llvm_v2i32_ty], + [IntrNoMem, IntrSpeculatable]>; + +def int_amdgcn_perm_pk16_b8_u4 : ClangBuiltin<"__builtin_amdgcn_perm_pk16_b8_u4">, + DefaultAttrsIntrinsic<[llvm_v4i32_ty], [llvm_i64_ty, llvm_i64_ty, llvm_v2i32_ty], + [IntrNoMem, IntrSpeculatable]>; + //===----------------------------------------------------------------------===// // Special Intrinsics for backend internal use only. No frontend // should emit calls to these. diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.td b/llvm/lib/Target/AMDGPU/AMDGPU.td index 8a0c4ac6ed8d7f..18f3c4761748ad 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPU.td +++ b/llvm/lib/Target/AMDGPU/AMDGPU.td @@ -1160,6 +1160,12 @@ def FeatureTanhInsts : SubtargetFeature<"tanh-insts", "Has v_tanh_f32/f16 instructions" >; +def FeatureTensorCvtLutInsts : SubtargetFeature<"tensor-cvt-lut-insts", + "HasTensorCvtLutInsts", + "true", + "Has v_perm_pk16* instructions" +>; + def FeatureTransposeLoadF4F6Insts : SubtargetFeature<"transpose-load-f4f6-insts", "HasTransposeLoadF4F6Insts", "true", @@ -2030,6 +2036,7 @@ def FeatureISAVersion12_50 : FeatureSet< FeatureDPPSrc1SGPR, FeatureBitOp3Insts, FeatureTanhInsts, + FeatureTensorCvtLutInsts, FeatureTransposeLoadF4F6Insts, FeatureBF16TransInsts, FeatureBF16ConversionInsts, @@ -2785,6 +2792,9 @@ def HasBitOp3Insts : Predicate<"Subtarget->hasBitOp3Insts()">, def HasTanhInsts : Predicate<"Subtarget->hasTanhInsts()">, AssemblerPredicate<(all_of FeatureTanhInsts)>; +def HasTensorCvtLutInsts : Predicate<"Subtarget->hasTensorCvtLutInsts()">, + AssemblerPredicate<(all_of FeatureTensorCvtLutInsts)>; + def HasTransposeLoadF4F6Insts : Predicate<"Subtarget->hasTransposeLoadF4F6Insts()">, AssemblerPredicate<(all_of FeatureTransposeLoadF4F6Insts)>; diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp index 65378840170409..df4244db6de8d9 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp @@ -4795,6 +4795,9 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const { case Intrinsic::amdgcn_swmmac_f16_16x16x128_bf8_fp8: case Intrinsic::amdgcn_swmmac_f16_16x16x128_bf8_bf8: case Intrinsic::amdgcn_swmmac_i32_16x16x128_iu8: + case Intrinsic::amdgcn_perm_pk16_b4_u4: + case Intrinsic::amdgcn_perm_pk16_b6_u4: + case Intrinsic::amdgcn_perm_pk16_b8_u4: return getDefaultMappingVOP(MI); case Intrinsic::amdgcn_log: case Intrinsic::amdgcn_exp2: diff --git a/llvm/lib/Target/AMDGPU/GCNSubtarget.h b/llvm/lib/Target/AMDGPU/GCNSubtarget.h index 6fe3abc98b5d55..c84ba1a0a9d47c 100644 --- a/llvm/lib/Target/AMDGPU/GCNSubtarget.h +++ b/llvm/lib/Target/AMDGPU/GCNSubtarget.h @@ -236,6 +236,7 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo, bool Has64BitLiterals = false; bool HasBitOp3Insts = false; bool HasTanhInsts = false; + bool HasTensorCvtLutInsts = false; bool HasTransposeLoadF4F6Insts = false; bool HasPrngInst = false; bool HasBVHDualAndBVH8Insts = false; @@ -1411,6 +1412,8 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo, bool hasTanhInsts() const { return HasTanhInsts; } + bool hasTensorCvtLutInsts() const { return HasTensorCvtLutInsts; } + bool hasAddPC64Inst() const { return GFX1250Insts; } bool hasMinimum3Maximum3PKF16() const { diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.td b/llvm/lib/Target/AMDGPU/SIInstrInfo.td index d9a336175b97e0..78f45447d1fc7f 100644 --- a/llvm/lib/Target/AMDGPU/SIInstrInfo.td +++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.td @@ -1774,6 +1774,7 @@ class getVALUDstForVT<ValueType VT, bit IsTrue16 = 0, bit IsVOP3Encoding = 0> { !eq(VT.Size, 256) : VOPDstOperand<VReg_256>, !eq(VT.Size, 192) : VOPDstOperand<VReg_192>, !eq(VT.Size, 128) : VOPDstOperand<VReg_128>, + !eq(VT.Size, 96) : VOPDstOperand<VReg_96>, !eq(VT.Size, 64) : VOPDstOperand<VReg_64>, !eq(VT.Size, 32) : VOPDstOperand<VGPR_32>, !eq(VT.Size, 16) : op16, @@ -1924,6 +1925,7 @@ class getVOP3DPPSrcForVT<ValueType VT, bit IsFake16 = 1> { !eq(VT, v2f16) : VCSrc_v2f16, !eq(VT, v2bf16) : VCSrc_v2bf16, !eq(VT, f32) : VCSrc_f32, + !eq(VT, v2i32) : VCSrc_v2b32, 1 : VCSrc_b32); } diff --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.td b/llvm/lib/Target/AMDGPU/SIRegisterInfo.td index 36d1a3b9442ec7..08d07c927e4c47 100644 --- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.td +++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.td @@ -1302,6 +1302,7 @@ def VCSrc_f64 : SrcRegOrImm9 <VS_64, "OPERAND_REG_INLINE_C_FP64">; def VCSrc_v2b16 : SrcRegOrImm9 <VS_32, "OPERAND_REG_INLINE_C_V2INT16">; def VCSrc_v2bf16: SrcRegOrImm9 <VS_32, "OPERAND_REG_INLINE_C_V2BF16">; def VCSrc_v2f16 : SrcRegOrImm9 <VS_32, "OPERAND_REG_INLINE_C_V2FP16">; +def VCSrc_v2b32 : SrcRegOrImm9 <VS_64, "OPERAND_REG_INLINE_C_V2INT32">; // True 16 Operands def VCSrcT_b16 : SrcRegOrImm9_t16 <"OPERAND_REG_INLINE_C_INT16">; diff --git a/llvm/lib/Target/AMDGPU/VOP3Instructions.td b/llvm/lib/Target/AMDGPU/VOP3Instructions.td index 421938a8c041a2..63f83e0850849d 100644 --- a/llvm/lib/Target/AMDGPU/VOP3Instructions.td +++ b/llvm/lib/Target/AMDGPU/VOP3Instructions.td @@ -1726,6 +1726,12 @@ multiclass VOP3CvtScaleSelInst<string OpName, VOPProfile P, SDPatternOperator no } } +let HasExtVOP3DPP = 0, HasModifiers = 0 in { +def VOP3_V2I32_I32_I32_V2I32 : VOP3_Profile<VOPProfile<[v2i32, i32, i32, v2i32]>>; +def VOP3_V3I32_I32_I64_V2I32 : VOP3_Profile<VOPProfile<[v3i32, i32, i64, v2i32]>>; +def VOP3_V4I32_I64_I64_V2I32 : VOP3_Profile<VOPProfile<[v4i32, i64, i64, v2i32]>>; +} + let Src0RC64 = VSrc_NoInline_v2f16 in { def VOP3_CVT_PK_F8_F16_Profile : VOP3_Profile<VOP_I16_V2F16>; def VOP3_CVT_PK_F8_F16_True16_Profile : VOP3_Profile_True16<VOP3_CVT_PK_F8_F16_Profile>; @@ -1814,6 +1820,12 @@ let SubtargetPredicate = isGFX1250Plus in { } } // End SubtargetPredicate = isGFX1250Plus +let SubtargetPredicate = HasTensorCvtLutInsts in { + defm V_PERM_PK16_B4_U4 : VOP3Inst<"v_perm_pk16_b4_u4", VOP3_V2I32_I32_I32_V2I32, int_amdgcn_perm_pk16_b4_u4>; + defm V_PERM_PK16_B6_U4 : VOP3Inst<"v_perm_pk16_b6_u4", VOP3_V3I32_I32_I64_V2I32, int_amdgcn_perm_pk16_b6_u4>; + defm V_PERM_PK16_B8_U4 : VOP3Inst<"v_perm_pk16_b8_u4", VOP3_V4I32_I64_I64_V2I32, int_amdgcn_perm_pk16_b8_u4>; +} // End SubtargetPredicate = HasTensorCvtLutInsts + class Cvt_Scale_Sr_F32ToBF16F16_Pat<SDPatternOperator node, VOP3_Pseudo inst, ValueType DstTy> : GCNPat< (DstTy (node DstTy:$vdst_in, f32:$src0, i32:$src1, timm:$word_sel)), (inst (DstSelToOpSelXForm $word_sel), $src0, 0, $src1, VGPR_32:$vdst_in) @@ -2212,6 +2224,9 @@ let AssemblerPredicate = isGFX11Plus in { } // These instructions differ from GFX12 variant by supporting DPP: +defm V_PERM_PK16_B4_U4 : VOP3Only_Real_Base_gfx1250<0x23f>; +defm V_PERM_PK16_B6_U4 : VOP3Only_Real_Base_gfx1250<0x242>; +defm V_PERM_PK16_B8_U4 : VOP3Only_Real_Base_gfx1250<0x243>; defm V_LSHL_ADD_U64 : VOP3Only_Realtriple_gfx1250<0x252>; defm V_ASHR_PK_I8_I32 : VOP3Only_Realtriple_gfx1250<0x290>; defm V_ASHR_PK_U8_I32 : VOP3Only_Realtriple_gfx1250<0x291>; diff --git a/llvm/lib/TargetParser/TargetParser.cpp b/llvm/lib/TargetParser/TargetParser.cpp index 126be71a4cda57..e5d2e1ca2f7020 100644 --- a/llvm/lib/TargetParser/TargetParser.cpp +++ b/llvm/lib/TargetParser/TargetParser.cpp @@ -444,6 +444,7 @@ void AMDGPU::fillAMDGPUFeatureMap(StringRef GPU, const Triple &T, Features["bitop3-insts"] = true; Features["prng-inst"] = true; Features["tanh-insts"] = true; + Features["tensor-cvt-lut-insts"] = true; Features["transpose-load-f4f6-insts"] = true; Features["bf16-trans-insts"] = true; Features["bf16-cvt-insts"] = true; diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.perm.pk.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.perm.pk.ll new file mode 100644 index 00000000000000..d2f96c402d50ea --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.perm.pk.ll @@ -0,0 +1,66 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4 +; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1250 < %s | FileCheck -check-prefixes=GFX1250,GFX1250-SDAG %s +; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1250 < %s | FileCheck -check-prefixes=GFX1250,GFX1250-GISEL %s + +declare <2 x i32> @llvm.amdgcn.perm.pk16.b4.u4(i32, i32, <2 x i32>) +declare <3 x i32> @llvm.amdgcn.perm.pk16.b6.u4(i32, i64, <2 x i32>) +declare <4 x i32> @llvm.amdgcn.perm.pk16.b8.u4(i64, i64, <2 x i32>) + +define void @test_perm_pk16_b4_u4(i32 %a, i32 %b, <2 x i32> %c, ptr %out) { +; GFX1250-LABEL: test_perm_pk16_b4_u4: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_perm_pk16_b4_u4 v[0:1], v0, v1, v[2:3] +; GFX1250-NEXT: flat_store_b64 v[4:5], v[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] + %ret = tail call <2 x i32> @llvm.amdgcn.perm.pk16.b4.u4(i32 %a, i32 %b, <2 x i32> %c) + store <2 x i32> %ret, ptr %out, align 8 + ret void +} + +define void @test_perm_pk16_b6_u4(i32 %a, i64 %b, <2 x i32> %c, ptr %out) { +; GFX1250-SDAG-LABEL: test_perm_pk16_b6_u4: +; GFX1250-SDAG: ; %bb.0: +; GFX1250-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-SDAG-NEXT: s_wait_kmcnt 0x0 +; GFX1250-SDAG-NEXT: v_dual_mov_b32 v7, v6 :: v_dual_mov_b32 v9, v4 +; GFX1250-SDAG-NEXT: v_dual_mov_b32 v8, v3 :: v_dual_mov_b32 v3, v2 +; GFX1250-SDAG-NEXT: v_dual_mov_b32 v2, v1 :: v_dual_mov_b32 v6, v5 +; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-SDAG-NEXT: v_perm_pk16_b6_u4 v[0:2], v0, v[2:3], v[8:9] +; GFX1250-SDAG-NEXT: flat_store_b96 v[6:7], v[0:2] scope:SCOPE_SE +; GFX1250-SDAG-NEXT: s_wait_dscnt 0x0 +; GFX1250-SDAG-NEXT: s_set_pc_i64 s[30:31] +; +; GFX1250-GISEL-LABEL: test_perm_pk16_b6_u4: +; GFX1250-GISEL: ; %bb.0: +; GFX1250-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-GISEL-NEXT: s_wait_kmcnt 0x0 +; GFX1250-GISEL-NEXT: v_dual_mov_b32 v8, v1 :: v_dual_mov_b32 v9, v2 +; GFX1250-GISEL-NEXT: v_dual_mov_b32 v2, v3 :: v_dual_mov_b32 v3, v4 +; GFX1250-GISEL-NEXT: v_dual_mov_b32 v4, v5 :: v_dual_mov_b32 v5, v6 +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX1250-GISEL-NEXT: v_perm_pk16_b6_u4 v[0:2], v0, v[8:9], v[2:3] +; GFX1250-GISEL-NEXT: flat_store_b96 v[4:5], v[0:2] scope:SCOPE_SE +; GFX1250-GISEL-NEXT: s_wait_dscnt 0x0 +; GFX1250-GISEL-NEXT: s_set_pc_i64 s[30:31] + %ret = tail call <3 x i32> @llvm.amdgcn.perm.pk16.b6.u4(i32 %a, i64 %b, <2 x i32> %c) + store <3 x i32> %ret, ptr %out, align 16 + ret void +} + +define void @test_perm_pk16_b8_u4(i64 %a, i64 %b, <2 x i32> %c, ptr %out) { +; GFX1250-LABEL: test_perm_pk16_b8_u4: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_perm_pk16_b8_u4 v[0:3], v[0:1], v[2:3], v[4:5] +; GFX1250-NEXT: flat_store_b128 v[6:7], v[0:3] scope:SCOPE_SE +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] + %ret = tail call <4 x i32> @llvm.amdgcn.perm.pk16.b8.u4(i64 %a, i64 %b, <2 x i32> %c) + store <4 x i32> %ret, ptr %out, align 16 + ret void +} diff --git a/llvm/test/MC/AMDGPU/gfx1250_asm_vop3-fake16.s b/llvm/test/MC/AMDGPU/gfx1250_asm_vop3-fake16.s index 93d5cf3edc8017..10c76733f0b950 100644 --- a/llvm/test/MC/AMDGPU/gfx1250_asm_vop3-fake16.s +++ b/llvm/test/MC/AMDGPU/gfx1250_asm_vop3-fake16.s @@ -994,3 +994,48 @@ v_cvt_scalef32_sr_pk8_fp4_bf16 v10, v[20:23], v4, v8 v_cvt_scalef32_sr_pk8_fp4_bf16 v10, v[20:23], s4, 100.0 // GFX1250: v_cvt_scalef32_sr_pk8_fp4_bf16 v10, v[20:23], s4, 0x42c80000 ; encoding: [0x0a,0x00,0xbc,0xd6,0x14,0x09,0xfc,0x03,0x00,0x00,0xc8,0x42] + +v_perm_pk16_b4_u4 v[2:3], v4, v5, v[6:7] +// GFX1250: v_perm_pk16_b4_u4 v[2:3], v4, v5, v[6:7] ; encoding: [0x02,0x00,0x3f,0xd6,0x04,0x0b,0x1a,0x04] + +v_perm_pk16_b4_u4 v[2:3], v4, ttmp5, s[6:7] +// GFX1250: v_perm_pk16_b4_u4 v[2:3], v4, ttmp5, s[6:7] ; encoding: [0x02,0x00,0x3f,0xd6,0x04,0xe3,0x18,0x00] + +v_perm_pk16_b4_u4 v[2:3], s4, v5, v[6:7] +// GFX1250: v_perm_pk16_b4_u4 v[2:3], s4, v5, v[6:7] ; encoding: [0x02,0x00,0x3f,0xd6,0x04,0x0a,0x1a,0x04] + +v_perm_pk16_b4_u4 v[2:3], v4, v5, 100 +// GFX1250: v_perm_pk16_b4_u4 v[2:3], v4, v5, 0x64 ; encoding: [0x02,0x00,0x3f,0xd6,0x04,0x0b,0xfe,0x03,0x64,0x00,0x00,0x00] + +v_perm_pk16_b4_u4 v[2:3], v4, v5, 4 +// GFX1250: v_perm_pk16_b4_u4 v[2:3], v4, v5, 4 ; encoding: [0x02,0x00,0x3f,0xd6,0x04,0x0b,0x12,0x02] + +v_perm_pk16_b6_u4 v[2:4], v4, v[8:9], v[6:7] +// GFX1250: v_perm_pk16_b6_u4 v[2:4], v4, v[8:9], v[6:7] ; encoding: [0x02,0x00,0x42,0xd6,0x04,0x11,0x1a,0x04] + +v_perm_pk16_b6_u4 v[2:4], v4, ttmp[4:5], s[6:7] +// GFX1250: v_perm_pk16_b6_u4 v[2:4], v4, ttmp[4:5], s[6:7] ; encoding: [0x02,0x00,0x42,0xd6,0x04,0xe1,0x18,0x00] + +v_perm_pk16_b6_u4 v[2:4], s4, v[4:5], v[6:7] +// GFX1250: v_perm_pk16_b6_u4 v[2:4], s4, v[4:5], v[6:7] ; encoding: [0x02,0x00,0x42,0xd6,0x04,0x08,0x1a,0x04] + +v_perm_pk16_b6_u4 v[2:4], v4, v[4:5], 100 +// GFX1250: v_perm_pk16_b6_u4 v[2:4], v4, v[4:5], 0x64 ; encoding: [0x02,0x00,0x42,0xd6,0x04,0x09,0xfe,0x03,0x64,0x00,0x00,0x00] + +v_perm_pk16_b6_u4 v[2:4], v4, v[4:5], 4 +// GFX1250: v_perm_pk16_b6_u4 v[2:4], v4, v[4:5], 4 ; encoding: [0x02,0x00,0x42,0xd6,0x04,0x09,0x12,0x02] + +v_perm_pk16_b8_u4 v[2:5], v[4:5], v[8:9], v[6:7] +// GFX1250: v_perm_pk16_b8_u4 v[2:5], v[4:5], v[8:9], v[6:7] ; encoding: [0x02,0x00,0x43,0xd6,0x04,0x11,0x1a,0x04] + +v_perm_pk16_b8_u4 v[2:5], v[4:5], ttmp[4:5], s[6:7] +// GFX1250: v_perm_pk16_b8_u4 v[2:5], v[4:5], ttmp[4:5], s[6:7] ; encoding: [0x02,0x00,0x43,0xd6,0x04,0xe1,0x18,0x00] + +v_perm_pk16_b8_u4 v[2:5], s[4:5], v[4:5], v[6:7] +// GFX1250: v_perm_pk16_b8_u4 v[2:5], s[4:5], v[4:5], v[6:7] ; encoding: [0x02,0x00,0x43,0xd6,0x04,0x08,0x1a,0x04] + +v_perm_pk16_b8_u4 v[2:5], v[4:5], v[4:5], 100 +// GFX1250: v_perm_pk16_b8_u4 v[2:5], v[4:5], v[4:5], 0x64 ; encoding: [0x02,0x00,0x43,0xd6,0x04,0x09,0xfe,0x03,0x64,0x00,0x00,0x00] + +v_perm_pk16_b8_u4 v[2:5], v[4:5], v[4:5], 4 +// GFX1250: v_perm_pk16_b8_u4 v[2:5], v[4:5], v[4:5], 4 ; encoding: [0x02,0x00,0x43,0xd6,0x04,0x09,0x12,0x02] diff --git a/llvm/test/MC/AMDGPU/gfx1250_asm_vop3.s b/llvm/test/MC/AMDGPU/gfx1250_asm_vop3.s index 2244839a6b1612..16f8425ca8d05b 100644 --- a/llvm/test/MC/AMDGPU/gfx1250_asm_vop3.s +++ b/llvm/test/MC/AMDGPU/gfx1250_asm_vop3.s @@ -994,3 +994,48 @@ v_cvt_scalef32_sr_pk8_fp4_bf16 v10, v[20:23], v4, v8 v_cvt_scalef32_sr_pk8_fp4_bf16 v10, v[20:23], s4, 100.0 // GFX1250: v_cvt_scalef32_sr_pk8_fp4_bf16 v10, v[20:23], s4, 0x42c80000 ; encoding: [0x0a,0x00,0xbc,0xd6,0x14,0x09,0xfc,0x03,0x00,0x00,0xc8,0x42] + +v_perm_pk16_b4_u4 v[2:3], v4, v5, v[6:7] +// GFX1250: v_perm_pk16_b4_u4 v[2:3], v4, v5, v[6:7] ; encoding: [0x02,0x00,0x3f,0xd6,0x04,0x0b,0x1a,0x04] + +v_perm_pk16_b4_u4 v[2:3], v4, ttmp5, s[6:7] +// GFX1250: v_perm_pk16_b4_u4 v[2:3], v4, ttmp5, s[6:7] ; encoding: [0x02,0x00,0x3f,0xd6,0x04,0xe3,0x18,0x00] + +v_perm_pk16_b4_u4 v[2:3], s4, v5, v[6:7] +// GFX1250: v_perm_pk16_b4_u4 v[2:3], s4, v5, v[6:7] ; encoding: [0x02,0x00,0x3f,0xd6,0x04,0x0a,0x1a,0x04] + +v_perm_pk16_b4_u4 v[2:3], v4, v5, 100 +// GFX1250: v_perm_pk16_b4_u4 v[2:3], v4, v5, 0x64 ; encoding: [0x02,0x00,0x3f,0xd6,0x04,0x0b,0xfe,0x03,0x64,0x00,0x00,0x00] + +v_perm_pk16_b4_u4 v[2:3], v4, v5, 4 +// GFX1250: v_perm_pk16_b4_u4 v[2:3], v4, v5, 4 ; encoding: [0x02,0x00,0x3f,0xd6,0x04,0x0b,0x12,0x02] + +v_perm_pk16_b6_u4 v[2:4], v4, v[8:9], v[6:7] +// GFX1250: v_perm_pk16_b6_u4 v[2:4], v4, v[8:9], v[6:7] ; encoding: [0x02,0x00,0x42,0xd6,0x04,0x11,0x1a,0x04] + +v_perm_pk16_b6_u4 v[2:4], v4, ttmp[4:5], s[6:7] +// GFX1250: v_perm_pk16_b6_u4 v[2:4], v4, ttmp[4:5], s[6:7] ; encoding: [0x02,0x00,0x42,0xd6,0x04,0xe1,0x18,0x00] + +v_perm_pk16_b6_u4 v[2:4], s4, v[4:5], v[6:7] +// GFX1250: v_perm_pk16_b6_u4 v[2:4], s4, v[4:5], v[6:7] ; encoding: [0x02,0x00,0x42,0xd6,0x04,0x08,0x1a,0x04] + +v_perm_pk16_b6_u4 v[2:4], v4, v[4:5], 100 +// GFX1250: v_perm_pk16_b6_u4 v[2:4], v4, v[4:5], 0x64 ; encoding: [0x02,0x00,0x42,0xd6,0x04,0x09,0xfe,0x03,0x64,0x00,0x00,0x00] + +v_perm_pk16_b6_u4 v[2:4], v4, v[4:5], 4 +// GFX1250: v_perm_pk16_b6_u4 v[2:4], v4, v[4:5], 4 ; encoding: [0x02,0x00,0x42,0xd6,0x04,0x09,0x12,0x02] + +v_perm_pk16_b8_u4 v[2:5], v[4:5], v[8:9], v[6:7] +// GFX1250: v_perm_pk16_b8_u4 v[2:5], v[4:5], v[8:9], v[6:7] ; encoding: [0x02,0x00,0x43,0xd6,0x04,0x11,0x1a,0x04] + +v_perm_pk16_b8_u4 v[2:5], v[4:5], ttmp[4:5], s[6:7] +// GFX1250: v_perm_pk16_b8_u4 v[2:5], v[4:5], ttmp[4:5], s[6:7] ; encoding: [0x02,0x00,0x43,0xd6,0x04,0xe1,0x18,0x00] + +v_perm_pk16_b8_u4 v[2:5], s[4:5], v[4:5], v[6:7] +// GFX1250: v_perm_pk16_b8_u4 v[2:5], s[4:5], v[4:5], v[6:7] ; encoding: [0x02,0x00,0x43,0xd6,0x04,0x08,0x1a,0x04] + +v_perm_pk16_b8_u4 v[2:5], v[4:5], v[4:5], 100 +// GFX1250: v_perm_pk16_b8_u4 v[2:5], v[4:5], v[4:5], 0x64 ; encoding: [0x02,0x00,0x43,0xd6,0x04,0x09,0xfe,0x03,0x64,0x00,0x00,0x00] + +v_perm_pk16_b8_u4 v[2:5], v[4:5], v[4:5], 4 +// GFX1250: v_perm_pk16_b8_u4 v[2:5], v[4:5], v[4:5], 4 ; encoding: [0x02,0x00,0x43,0xd6,0x04,0x09,0x12,0x02] diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_vop3.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_vop3.txt index 0710393cd4762d..6ac25062181308 100644 --- a/llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_vop3.txt +++ b/llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_vop3.txt @@ -1045,3 +1045,48 @@ 0x0a,0x00,0x98,0xd6,0x14,0x09,0x22,0x04 # GFX1250: v_cvt_scalef32_sr_pk8_fp8_f32 v[10:11], v[20:27], v4, v8 ; encoding: [0x0a,0x00,0x98,0xd6,0x14,0x09,0x22,0x04] + +0x02,0x00,0x3f,0xd6,0x04,0x0a,0x1a,0x04 +# GFX1250: v_perm_pk16_b4_u4 v[2:3], s4, v5, v[6:7] ; encoding: [0x02,0x00,0x3f,0xd6,0x04,0x0a,0x1a,0x04] + +0x02,0x00,0x3f,0xd6,0x04,0xe3,0x18,0x00 +# GFX1250: v_perm_pk16_b4_u4 v[2:3], v4, ttmp5, s[6:7] ; encoding: [0x02,0x00,0x3f,0xd6,0x04,0xe3,0x18,0x00] + +0x02,0x00,0x3f,0xd6,0x04,0x0b,0xfe,0x03,0x64,0x00,0x00,0x00 +# GFX1250: v_perm_pk16_b4_u4 v[2:3], v4, v5, 0x64 ; encoding: [0x02,0x00,0x3f,0xd6,0x04,0x0b,0xfe,0x03,0x64,0x00,0x00,0x00] + +0x02,0x00,0x3f,0xd6,0x04,0x0b,0x12,0x02 +# GFX1250: v_perm_pk16_b4_u4 v[2:3], v4, v5, 4 ; encoding: [0x02,0x00,0x3f,0xd6,0x04,0x0b,0x12,0x02] + +0x02,0x00,0x3f,0xd6,0x04,0x0b,0x1a,0x04 +# GFX1250: v_perm_pk16_b4_u4 v[2:3], v4, v5, v[6:7] ; encoding: [0x02,0x00,0x3f,0xd6,0x04,0x0b,0x1a,0x04] + +0x02,0x00,0x42,0xd6,0x04,0x08,0x1a,0x04 +# GFX1250: v_perm_pk16_b6_u4 v[2:4], s4, v[4:5], v[6:7] ; encoding: [0x02,0x00,0x42,0xd6,0x04,0x08,0x1a,0x04] + +0x02,0x00,0x42,0xd6,0x04,0xe1,0x18,0x00 +# GFX1250: v_perm_pk16_b6_u4 v[2:4], v4, ttmp[4:5], s[6:7] ; encoding: [0x02,0x00,0x42,0xd6,0x04,0xe1,0x18,0x00] + +0x02,0x00,0x42,0xd6,0x04,0x09,0xfe,0x03,0x64,0x00,0x00,0x00 +# GFX1250: v_perm_pk16_b6_u4 v[2:4], v4, v[4:5], 0x64 ; encoding: [0x02,0x00,0x42,0xd6,0x04,0x09,0xfe,0x03,0x64,0x00,0x00,0x00] + +0x02,0x00,0x42,0xd6,0x04,0x09,0x12,0x02 +# GFX1250: v_perm_pk16_b6_u4 v[2:4], v4, v[4:5], 4 ; encoding: [0x02,0x00,0x42,0xd6,0x04,0x09,0x12,0x02] + +0x02,0x00,0x42,0xd6,0x04,0x11,0x1a,0x04 +# GFX1250: v_perm_pk16_b6_u4 v[2:4], v4, v[8:9], v[6:7] ; encoding: [0x02,0x00,0x42,0xd6,0x04,0x11,0x1a,0x04] + +0x02,0x00,0x43,0xd6,0x04,0x08,0x1a,0x04 +# GFX1250: v_perm_pk16_b8_u4 v[2:5], s[4:5], v[4:5], v[6:7] ; encoding: [0x02,0x00,0x43,0xd6,0x04,0x08,0x1a,0x04] + +0x02,0x00,0x43,0xd6,0x04,0xe1,0x18,0x00 +# GFX1250: v_perm_pk16_b8_u4 v[2:5], v[4:5], ttmp[4:5], s[6:7] ; encoding: [0x02,0x00,0x43,0xd6,0x04,0xe1,0x18,0x00] + +0x02,0x00,0x43,0xd6,0x04,0x09,0xfe,0x03,0x64,0x00,0x00,0x00 +# GFX1250: v_perm_pk16_b8_u4 v[2:5], v[4:5], v[4:5], 0x64 ; encoding: [0x02,0x00,0x43,0xd6,0x04,0x09,0xfe,0x03,0x64,0x00,0x00,0x00] + +0x02,0x00,0x43,0xd6,0x04,0x09,0x12,0x02 +# GFX1250: v_perm_pk16_b8_u4 v[2:5], v[4:5], v[4:5], 4 ; encoding: [0x02,0x00,0x43,0xd6,0x04,0x09,0x12,0x02] + +0x02,0x00,0x43,0xd6,0x04,0x11,0x1a,0x04 +# GFX1250: v_perm_pk16_b8_u4 v[2:5], v[4:5], v[8:9], v[6:7] ; encoding: [0x02,0x00,0x43,0xd6,0x04,0x11,0x1a,0x04] _______________________________________________ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits