Author: Matt Arsenault Date: 2024-11-25T09:39:04-08:00 New Revision: e97fb2207e1ef6235a6268dbbd3cc08d437b07ef
URL: https://github.com/llvm/llvm-project/commit/e97fb2207e1ef6235a6268dbbd3cc08d437b07ef DIFF: https://github.com/llvm/llvm-project/commit/e97fb2207e1ef6235a6268dbbd3cc08d437b07ef.diff LOG: AMDGPU: Add support for load transpose instructions for gfx950 (#117378) This patch adds support for intrinsics in clang, as well as assembly instructions in the backend. Co-authored-by: Sirish Pande <sirish.pa...@amd.com> Added: clang/test/CodeGenOpenCL/builtins-amdgcn-gfx950-read-tr.cl llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ds.read.tr.gfx950.ll llvm/test/MC/AMDGPU/gfx950_asm_read_tr.s llvm/test/MC/Disassembler/AMDGPU/gfx950_dasm_ds_read_tr.txt Modified: clang/include/clang/Basic/BuiltinsAMDGPU.def clang/lib/CodeGen/CGBuiltin.cpp llvm/include/llvm/IR/IntrinsicsAMDGPU.td llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp llvm/lib/Target/AMDGPU/AMDGPUSearchableTables.td llvm/lib/Target/AMDGPU/DSInstructions.td llvm/lib/Target/AMDGPU/SIISelLowering.cpp llvm/test/Analysis/UniformityAnalysis/AMDGPU/intrinsics.ll llvm/test/MC/AMDGPU/gfx950-unsupported.s Removed: ################################################################################ diff --git a/clang/include/clang/Basic/BuiltinsAMDGPU.def b/clang/include/clang/Basic/BuiltinsAMDGPU.def index 548bcc8ad55f48..a42ad56ce4f998 100644 --- a/clang/include/clang/Basic/BuiltinsAMDGPU.def +++ b/clang/include/clang/Basic/BuiltinsAMDGPU.def @@ -462,6 +462,11 @@ TARGET_BUILTIN(__builtin_amdgcn_smfmac_f32_32x32x64_fp8_fp8, "V16fV4iV8iV16fiIiI TARGET_BUILTIN(__builtin_amdgcn_permlane16_swap, "V2UiUiUiIbIb", "nc", "permlane16-swap") TARGET_BUILTIN(__builtin_amdgcn_permlane32_swap, "V2UiUiUiIbIb", "nc", "permlane32-swap") +TARGET_BUILTIN(__builtin_amdgcn_ds_read_tr4_b64_v2i32, "V2iV2i*3", "nc", "gfx950-insts") +TARGET_BUILTIN(__builtin_amdgcn_ds_read_tr6_b96_v3i32, "V3iV3i*3", "nc", "gfx950-insts") +TARGET_BUILTIN(__builtin_amdgcn_ds_read_tr8_b64_v2i32, "V2iV2i*3", "nc", "gfx950-insts") +TARGET_BUILTIN(__builtin_amdgcn_ds_read_tr16_b64_v4i16, 
"V4sV4s*3", "nc", "gfx950-insts") + //===----------------------------------------------------------------------===// // GFX12+ only builtins. //===----------------------------------------------------------------------===// diff --git a/clang/lib/CodeGen/CGBuiltin.cpp b/clang/lib/CodeGen/CGBuiltin.cpp index 3db439c87fa326..91b70b4fdf3d20 100644 --- a/clang/lib/CodeGen/CGBuiltin.cpp +++ b/clang/lib/CodeGen/CGBuiltin.cpp @@ -19697,8 +19697,11 @@ Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned BuiltinID, case AMDGPU::BI__builtin_amdgcn_global_load_tr_b128_v4bf16: case AMDGPU::BI__builtin_amdgcn_global_load_tr_b128_v8i16: case AMDGPU::BI__builtin_amdgcn_global_load_tr_b128_v8f16: - case AMDGPU::BI__builtin_amdgcn_global_load_tr_b128_v8bf16: { - + case AMDGPU::BI__builtin_amdgcn_global_load_tr_b128_v8bf16: + case AMDGPU::BI__builtin_amdgcn_ds_read_tr4_b64_v2i32: + case AMDGPU::BI__builtin_amdgcn_ds_read_tr8_b64_v2i32: + case AMDGPU::BI__builtin_amdgcn_ds_read_tr6_b96_v3i32: + case AMDGPU::BI__builtin_amdgcn_ds_read_tr16_b64_v4i16: { Intrinsic::ID IID; switch (BuiltinID) { case AMDGPU::BI__builtin_amdgcn_global_load_tr_b64_i32: @@ -19713,6 +19716,18 @@ Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned BuiltinID, case AMDGPU::BI__builtin_amdgcn_global_load_tr_b128_v8bf16: IID = Intrinsic::amdgcn_global_load_tr_b128; break; + case AMDGPU::BI__builtin_amdgcn_ds_read_tr4_b64_v2i32: + IID = Intrinsic::amdgcn_ds_read_tr4_b64; + break; + case AMDGPU::BI__builtin_amdgcn_ds_read_tr8_b64_v2i32: + IID = Intrinsic::amdgcn_ds_read_tr8_b64; + break; + case AMDGPU::BI__builtin_amdgcn_ds_read_tr6_b96_v3i32: + IID = Intrinsic::amdgcn_ds_read_tr6_b96; + break; + case AMDGPU::BI__builtin_amdgcn_ds_read_tr16_b64_v4i16: + IID = Intrinsic::amdgcn_ds_read_tr16_b64; + break; } llvm::Type *LoadTy = ConvertType(E->getType()); llvm::Value *Addr = EmitScalarExpr(E->getArg(0)); diff --git a/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx950-read-tr.cl 
b/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx950-read-tr.cl new file mode 100644 index 00000000000000..39fa46d5845f42 --- /dev/null +++ b/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx950-read-tr.cl @@ -0,0 +1,50 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 4 +// RUN: %clang_cc1 -triple amdgcn-unknown-unknown -target-cpu gfx950 -emit-llvm -o - %s | FileCheck --check-prefix=GFX950 %s + +typedef int v2i __attribute__((ext_vector_type(2))); +typedef int v3i __attribute__((ext_vector_type(3))); +typedef short v4s __attribute__((ext_vector_type(4))); + +// GFX950-LABEL: define dso_local <2 x i32> @test_amdgcn_ds_read_b64_tr_b4_v2i32( +// GFX950-SAME: ptr addrspace(3) nocapture noundef readonly [[INPTR:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] { +// GFX950-NEXT: entry: +// GFX950-NEXT: [[TMP0:%.*]] = tail call <2 x i32> @llvm.amdgcn.ds.read.tr4.b64.v2i32(ptr addrspace(3) [[INPTR]]) +// GFX950-NEXT: ret <2 x i32> [[TMP0]] +// +v2i test_amdgcn_ds_read_b64_tr_b4_v2i32(local v2i* inptr) +{ + return __builtin_amdgcn_ds_read_tr4_b64_v2i32(inptr); +} + +// GFX950-LABEL: define dso_local <3 x i32> @test_amdgcn_ds_read_b96_tr_b6_v3i32( +// GFX950-SAME: ptr addrspace(3) nocapture noundef readonly [[INPTR:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] { +// GFX950-NEXT: entry: +// GFX950-NEXT: [[TMP0:%.*]] = tail call <3 x i32> @llvm.amdgcn.ds.read.tr6.b96.v3i32(ptr addrspace(3) [[INPTR]]) +// GFX950-NEXT: ret <3 x i32> [[TMP0]] +// +v3i test_amdgcn_ds_read_b96_tr_b6_v3i32(local v3i* inptr) +{ + return __builtin_amdgcn_ds_read_tr6_b96_v3i32(inptr); +} + +// GFX950-LABEL: define dso_local <2 x i32> @test_amdgcn_ds_read_b64_tr_b8_v2i32( +// GFX950-SAME: ptr addrspace(3) nocapture noundef readonly [[INPTR:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] { +// GFX950-NEXT: entry: +// GFX950-NEXT: [[TMP0:%.*]] = tail call <2 x i32> @llvm.amdgcn.ds.read.tr8.b64.v2i32(ptr addrspace(3) [[INPTR]]) +// GFX950-NEXT: ret <2 x i32> [[TMP0]] +// 
+v2i test_amdgcn_ds_read_b64_tr_b8_v2i32(local v2i* inptr) +{ + return __builtin_amdgcn_ds_read_tr8_b64_v2i32(inptr); +} + +// GFX950-LABEL: define dso_local <4 x i16> @test_amdgcn_ds_read_b64_tr_b16_v2i16( +// GFX950-SAME: ptr addrspace(3) nocapture noundef readonly [[INPTR:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] { +// GFX950-NEXT: entry: +// GFX950-NEXT: [[TMP0:%.*]] = tail call <4 x i16> @llvm.amdgcn.ds.read.tr16.b64.v4i16(ptr addrspace(3) [[INPTR]]) +// GFX950-NEXT: ret <4 x i16> [[TMP0]] +// +v4s test_amdgcn_ds_read_b64_tr_b16_v2i16(local v4s* inptr) +{ + return __builtin_amdgcn_ds_read_tr16_b64_v4i16(inptr); +} diff --git a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td index 73599851f50003..b46fe668ea7afd 100644 --- a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td +++ b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td @@ -2726,6 +2726,10 @@ class AMDGPULoadIntrinsic<LLVMType ptr_ty>: def int_amdgcn_global_load_tr_b64 : AMDGPULoadIntrinsic<global_ptr_ty>; def int_amdgcn_global_load_tr_b128 : AMDGPULoadIntrinsic<global_ptr_ty>; +def int_amdgcn_ds_read_tr4_b64 : AMDGPULoadIntrinsic<local_ptr_ty>; +def int_amdgcn_ds_read_tr6_b96 : AMDGPULoadIntrinsic<local_ptr_ty>; +def int_amdgcn_ds_read_tr8_b64 : AMDGPULoadIntrinsic<local_ptr_ty>; +def int_amdgcn_ds_read_tr16_b64 : AMDGPULoadIntrinsic<local_ptr_ty>; // i32 @llvm.amdgcn.wave.id() def int_amdgcn_wave_id : diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp index 8c050348f753bb..b3a6f0fd09ea02 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp @@ -4967,6 +4967,10 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const { case Intrinsic::amdgcn_global_atomic_ordered_add_b64: case Intrinsic::amdgcn_global_load_tr_b64: case Intrinsic::amdgcn_global_load_tr_b128: + case Intrinsic::amdgcn_ds_read_tr4_b64: + case Intrinsic::amdgcn_ds_read_tr6_b96: 
+ case Intrinsic::amdgcn_ds_read_tr8_b64: + case Intrinsic::amdgcn_ds_read_tr16_b64: return getDefaultMappingAllVGPR(MI); case Intrinsic::amdgcn_ds_ordered_add: case Intrinsic::amdgcn_ds_ordered_swap: { diff --git a/llvm/lib/Target/AMDGPU/AMDGPUSearchableTables.td b/llvm/lib/Target/AMDGPU/AMDGPUSearchableTables.td index bc8b373d06e01a..10175557fadc71 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUSearchableTables.td +++ b/llvm/lib/Target/AMDGPU/AMDGPUSearchableTables.td @@ -345,6 +345,11 @@ def : SourceOfDivergence<intr>; def : SourceOfDivergence<int_amdgcn_global_load_tr_b64>; def : SourceOfDivergence<int_amdgcn_global_load_tr_b128>; +def : SourceOfDivergence<int_amdgcn_ds_read_tr4_b64>; +def : SourceOfDivergence<int_amdgcn_ds_read_tr6_b96>; +def : SourceOfDivergence<int_amdgcn_ds_read_tr8_b64>; +def : SourceOfDivergence<int_amdgcn_ds_read_tr16_b64>; + // The dummy boolean output is divergent from the IR's perspective, // but the mask results are uniform. These produce a divergent and // uniform result, so the returned struct is collectively divergent. 
diff --git a/llvm/lib/Target/AMDGPU/DSInstructions.td b/llvm/lib/Target/AMDGPU/DSInstructions.td index 061ffda2498f45..7cbd6d2dc62097 100644 --- a/llvm/lib/Target/AMDGPU/DSInstructions.td +++ b/llvm/lib/Target/AMDGPU/DSInstructions.td @@ -294,6 +294,12 @@ multiclass DS_1A_RET_mc<string opName, RegisterClass rc = VGPR_32, bit HasTiedOu } } +multiclass DS_1A_RET_NoM0<string opName, RegisterClass rc = VGPR_32> { + let has_m0_read = 0 in { + def "" : DS_1A_RET<opName, rc>; + } +} + class DS_1A_RET_Tied<string opName, RegisterClass rc = VGPR_32> : DS_1A_RET<opName, rc, 1>; @@ -744,6 +750,13 @@ multiclass DSAtomicRetNoRetPatIntrinsic_mc<DS_Pseudo inst, DS_Pseudo noRetInst, defm : DSAtomicRetNoRetPatIntrinsic_mc<DS_COND_SUB_RTN_U32, DS_COND_SUB_U32, i32, "int_amdgcn_atomic_cond_sub_u32">; } // let SubtargetPredicate = isGFX12Plus +let WaveSizePredicate = isWave64, SubtargetPredicate = HasGFX950Insts, mayStore = 0 in { + defm DS_READ_B64_TR_B4 : DS_1A_RET_NoM0<"ds_read_b64_tr_b4", VReg_64>; + defm DS_READ_B64_TR_B8 : DS_1A_RET_NoM0<"ds_read_b64_tr_b8", VReg_64>; + defm DS_READ_B64_TR_B16 : DS_1A_RET_NoM0<"ds_read_b64_tr_b16", VReg_64>; + defm DS_READ_B96_TR_B6 : DS_1A_RET_NoM0<"ds_read_b96_tr_b6", VReg_96>; +} + //===----------------------------------------------------------------------===// // DS Patterns //===----------------------------------------------------------------------===// @@ -1179,6 +1192,18 @@ def : GCNPat < sub0) >; +class DSLoadTrPat <DS_Pseudo inst, ValueType vt, SDPatternOperator node> : GCNPat < + (vt (node (DS1Addr1Offset i32:$ptr, i32:$offset))), + (inst $ptr, Offset:$offset, (i1 0)) +>; + +let SubtargetPredicate = HasGFX950Insts in { + def : DSLoadTrPat <DS_READ_B64_TR_B4, v2i32, int_amdgcn_ds_read_tr4_b64>; + def : DSLoadTrPat <DS_READ_B64_TR_B8, v2i32, int_amdgcn_ds_read_tr8_b64>; + def : DSLoadTrPat <DS_READ_B96_TR_B6, v3i32, int_amdgcn_ds_read_tr6_b96>; + def : DSLoadTrPat <DS_READ_B64_TR_B16, v4i16, int_amdgcn_ds_read_tr16_b64>; +} + 
//===----------------------------------------------------------------------===// // Target-specific instruction encodings. //===----------------------------------------------------------------------===// @@ -1748,3 +1773,11 @@ def DS_PK_ADD_F16_vi : DS_Real_vi<0x17, DS_PK_ADD_F16>; def DS_PK_ADD_RTN_F16_vi : DS_Real_vi<0xb7, DS_PK_ADD_RTN_F16>; def DS_PK_ADD_BF16_vi : DS_Real_vi<0x18, DS_PK_ADD_BF16>; def DS_PK_ADD_RTN_BF16_vi : DS_Real_vi<0xb8, DS_PK_ADD_RTN_BF16>; + +//===----------------------------------------------------------------------===// +// GFX950. +//===----------------------------------------------------------------------===// +def DS_READ_B64_TR_B4_vi : DS_Real_vi<0x0e0, DS_READ_B64_TR_B4>; +def DS_READ_B96_TR_B6_vi : DS_Real_vi<0x0e1, DS_READ_B96_TR_B6>; +def DS_READ_B64_TR_B8_vi : DS_Real_vi<0x0e2, DS_READ_B64_TR_B8>; +def DS_READ_B64_TR_B16_vi : DS_Real_vi<0x0e3, DS_READ_B64_TR_B16>; diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp index f3b5e6985e8e0d..d35bb15ac6566a 100644 --- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp @@ -1382,7 +1382,11 @@ bool SITargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info, return true; } case Intrinsic::amdgcn_global_load_tr_b64: - case Intrinsic::amdgcn_global_load_tr_b128: { + case Intrinsic::amdgcn_global_load_tr_b128: + case Intrinsic::amdgcn_ds_read_tr4_b64: + case Intrinsic::amdgcn_ds_read_tr6_b96: + case Intrinsic::amdgcn_ds_read_tr8_b64: + case Intrinsic::amdgcn_ds_read_tr16_b64: { Info.opc = ISD::INTRINSIC_W_CHAIN; Info.memVT = MVT::getVT(CI.getType()); Info.ptrVal = CI.getOperand(0); @@ -1477,6 +1481,10 @@ bool SITargetLowering::getAddrModeArguments(IntrinsicInst *II, case Intrinsic::amdgcn_atomic_cond_sub_u32: case Intrinsic::amdgcn_ds_append: case Intrinsic::amdgcn_ds_consume: + case Intrinsic::amdgcn_ds_read_tr4_b64: + case Intrinsic::amdgcn_ds_read_tr6_b96: + case Intrinsic::amdgcn_ds_read_tr8_b64: + 
case Intrinsic::amdgcn_ds_read_tr16_b64: case Intrinsic::amdgcn_ds_ordered_add: case Intrinsic::amdgcn_ds_ordered_swap: case Intrinsic::amdgcn_flat_atomic_fmax_num: diff --git a/llvm/test/Analysis/UniformityAnalysis/AMDGPU/intrinsics.ll b/llvm/test/Analysis/UniformityAnalysis/AMDGPU/intrinsics.ll index 0dfd1d880f9cf2..fb1420ee340043 100644 --- a/llvm/test/Analysis/UniformityAnalysis/AMDGPU/intrinsics.ll +++ b/llvm/test/Analysis/UniformityAnalysis/AMDGPU/intrinsics.ll @@ -261,6 +261,50 @@ bb: ret void } +declare <2 x i32> @llvm.amdgcn.ds.read.tr4.b64.v2i32(ptr addrspace(3)) + +; CHECK: DIVERGENT: %tmp0 = call <2 x i32> @llvm.amdgcn.ds.read.tr4.b64.v2i32(ptr addrspace(3) %gep) +define amdgpu_kernel void @ds_read_b64_tr4_v2i32(ptr addrspace(3) %addr, ptr addrspace(1) %out) { +bb: + %gep = getelementptr i64, ptr addrspace(3) %addr, i32 4 + %tmp0 = call <2 x i32> @llvm.amdgcn.ds.read.tr4.b64.v2i32(ptr addrspace(3) %gep) + store <2 x i32> %tmp0, ptr addrspace(1) %out, align 8 + ret void +} + +declare <3 x i32> @llvm.amdgcn.ds.read.tr6.b96.v3i32(ptr addrspace(3)) + +; CHECK: DIVERGENT: %tmp0 = call <3 x i32> @llvm.amdgcn.ds.read.tr6.b96.v3i32(ptr addrspace(3) %gep) +define amdgpu_kernel void @ds_read_b96_tr6_v3i32(ptr addrspace(3) %addr, ptr addrspace(1) %out) { +bb: + %gep = getelementptr i64, ptr addrspace(3) %addr, i32 4 + %tmp0 = call <3 x i32> @llvm.amdgcn.ds.read.tr6.b96.v3i32(ptr addrspace(3) %gep) + store <3 x i32> %tmp0, ptr addrspace(1) %out, align 16 + ret void +} + +declare <2 x i32> @llvm.amdgcn.ds.read.tr8.b64.v2i32(ptr addrspace(3)) + +; CHECK: DIVERGENT: %tmp0 = call <2 x i32> @llvm.amdgcn.ds.read.tr8.b64.v2i32(ptr addrspace(3) %gep) +define amdgpu_kernel void @ds_read_b64_tr8_v2i32(ptr addrspace(3) %addr, ptr addrspace(1) %out) { +bb: + %gep = getelementptr i64, ptr addrspace(3) %addr, i32 4 + %tmp0 = call <2 x i32> @llvm.amdgcn.ds.read.tr8.b64.v2i32(ptr addrspace(3) %gep) + store <2 x i32> %tmp0, ptr addrspace(1) %out, align 8 + ret void +} + +declare <4 
x i16> @llvm.amdgcn.ds.read.tr16.b64.v4i16(ptr addrspace(3)) + +; CHECK: DIVERGENT: %tmp0 = call <4 x i16> @llvm.amdgcn.ds.read.tr16.b64.v4i16(ptr addrspace(3) %gep) +define amdgpu_kernel void @ds_read_b64_tr_b16_v4i16(ptr addrspace(3) %addr, ptr addrspace(1) %out) { +bb: + %gep = getelementptr i64, ptr addrspace(3) %addr, i16 4 + %tmp0 = call <4 x i16> @llvm.amdgcn.ds.read.tr16.b64.v4i16(ptr addrspace(3) %gep) + store <4 x i16> %tmp0, ptr addrspace(1) %out, align 16 + ret void +} + declare <4 x float> @llvm.amdgcn.mfma.f32.16x16x32.f16(<8 x half>, <8 x half>, <4 x float>, i32 immarg, i32 immarg, i32 immarg) declare <16 x float> @llvm.amdgcn.mfma.f32.32x32x16.f16(<8 x half>, <8 x half>, <16 x float>, i32 immarg, i32 immarg, i32 immarg) diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ds.read.tr.gfx950.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ds.read.tr.gfx950.ll new file mode 100644 index 00000000000000..0689af0d56268d --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ds.read.tr.gfx950.ll @@ -0,0 +1,108 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4 +; RUN: llc -global-isel=0 -march=amdgcn -mcpu=gfx950 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX950-SDAG %s +; RUN: llc -global-isel=1 -march=amdgcn -mcpu=gfx950 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX950-GISEL %s + +declare <2 x i32> @llvm.amdgcn.ds.read.tr4.b64.v2i32.p3(ptr addrspace(3)) +declare <2 x i32> @llvm.amdgcn.ds.read.tr8.b64.v2i32.p3(ptr addrspace(3)) +declare <3 x i32> @llvm.amdgcn.ds.read.tr6.b64.v3i32.p3(ptr addrspace(3)) +declare <4 x i16> @llvm.amdgcn.ds.read.tr16.b64.v4i16.p3(ptr addrspace(3)) + +define amdgpu_ps void @ds_read_b64_tr_b4(ptr addrspace(3) %addr, ptr addrspace(1) %use) { +; GFX950-SDAG-LABEL: ds_read_b64_tr_b4: +; GFX950-SDAG: ; %bb.0: ; %entry +; GFX950-SDAG-NEXT: v_mov_b32_e32 v3, v2 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v2, v1 +; GFX950-SDAG-NEXT: ds_read_b64_tr_b4 v[0:1], v0 offset:32 +; 
GFX950-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX950-SDAG-NEXT: global_store_dwordx2 v[2:3], v[0:1], off +; GFX950-SDAG-NEXT: s_endpgm +; +; GFX950-GISEL-LABEL: ds_read_b64_tr_b4: +; GFX950-GISEL: ; %bb.0: ; %entry +; GFX950-GISEL-NEXT: v_mov_b32_e32 v4, v1 +; GFX950-GISEL-NEXT: ds_read_b64_tr_b4 v[0:1], v0 offset:32 +; GFX950-GISEL-NEXT: v_mov_b32_e32 v5, v2 +; GFX950-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX950-GISEL-NEXT: global_store_dwordx2 v[4:5], v[0:1], off +; GFX950-GISEL-NEXT: s_endpgm +entry: + %gep = getelementptr i64, ptr addrspace(3) %addr, i32 4 + %val = call <2 x i32> @llvm.amdgcn.ds.read.tr4.b64.v2i32.p3(ptr addrspace(3) %gep) + store <2 x i32> %val, ptr addrspace(1) %use + ret void +} + +define amdgpu_ps void @ds_read_b96_tr_b6(ptr addrspace(3) %addr, ptr addrspace(1) %use) { +; GFX950-SDAG-LABEL: ds_read_b96_tr_b6: +; GFX950-SDAG: ; %bb.0: ; %entry +; GFX950-SDAG-NEXT: v_mov_b32_e32 v5, v2 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v4, v1 +; GFX950-SDAG-NEXT: ds_read_b96_tr_b6 v[0:2], v0 offset:32 +; GFX950-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX950-SDAG-NEXT: global_store_dwordx3 v[4:5], v[0:2], off +; GFX950-SDAG-NEXT: s_endpgm +; +; GFX950-GISEL-LABEL: ds_read_b96_tr_b6: +; GFX950-GISEL: ; %bb.0: ; %entry +; GFX950-GISEL-NEXT: v_mov_b32_e32 v4, v1 +; GFX950-GISEL-NEXT: v_mov_b32_e32 v5, v2 +; GFX950-GISEL-NEXT: ds_read_b96_tr_b6 v[0:2], v0 offset:32 +; GFX950-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX950-GISEL-NEXT: global_store_dwordx3 v[4:5], v[0:2], off +; GFX950-GISEL-NEXT: s_endpgm +entry: + %gep = getelementptr i64, ptr addrspace(3) %addr, i32 4 + %val = call <3 x i32> @llvm.amdgcn.ds.read.tr6.b96.v3i32.p3(ptr addrspace(3) %gep) + store <3 x i32> %val, ptr addrspace(1) %use + ret void +} + +define amdgpu_ps void @ds_read_b64_tr_b8(ptr addrspace(3) %addr, ptr addrspace(1) %use) { +; GFX950-SDAG-LABEL: ds_read_b64_tr_b8: +; GFX950-SDAG: ; %bb.0: ; %entry +; GFX950-SDAG-NEXT: v_mov_b32_e32 v3, v2 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v2, v1 +; GFX950-SDAG-NEXT: 
ds_read_b64_tr_b8 v[0:1], v0 offset:32 +; GFX950-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX950-SDAG-NEXT: global_store_dwordx2 v[2:3], v[0:1], off +; GFX950-SDAG-NEXT: s_endpgm +; +; GFX950-GISEL-LABEL: ds_read_b64_tr_b8: +; GFX950-GISEL: ; %bb.0: ; %entry +; GFX950-GISEL-NEXT: v_mov_b32_e32 v4, v1 +; GFX950-GISEL-NEXT: ds_read_b64_tr_b8 v[0:1], v0 offset:32 +; GFX950-GISEL-NEXT: v_mov_b32_e32 v5, v2 +; GFX950-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX950-GISEL-NEXT: global_store_dwordx2 v[4:5], v[0:1], off +; GFX950-GISEL-NEXT: s_endpgm +entry: + %gep = getelementptr i64, ptr addrspace(3) %addr, i32 4 + %val = call <2 x i32> @llvm.amdgcn.ds.read.tr8.b64.v2i32.p3(ptr addrspace(3) %gep) + store <2 x i32> %val, ptr addrspace(1) %use + ret void +} + +define amdgpu_ps void @ds_read_b64_tr_b16(ptr addrspace(3) %addr, ptr addrspace(1) %use) { +; GFX950-SDAG-LABEL: ds_read_b64_tr_b16: +; GFX950-SDAG: ; %bb.0: ; %entry +; GFX950-SDAG-NEXT: v_mov_b32_e32 v3, v2 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v2, v1 +; GFX950-SDAG-NEXT: ds_read_b64_tr_b16 v[0:1], v0 offset:32 +; GFX950-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX950-SDAG-NEXT: global_store_dwordx2 v[2:3], v[0:1], off +; GFX950-SDAG-NEXT: s_endpgm +; +; GFX950-GISEL-LABEL: ds_read_b64_tr_b16: +; GFX950-GISEL: ; %bb.0: ; %entry +; GFX950-GISEL-NEXT: v_mov_b32_e32 v4, v1 +; GFX950-GISEL-NEXT: ds_read_b64_tr_b16 v[0:1], v0 offset:32 +; GFX950-GISEL-NEXT: v_mov_b32_e32 v5, v2 +; GFX950-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX950-GISEL-NEXT: global_store_dwordx2 v[4:5], v[0:1], off +; GFX950-GISEL-NEXT: s_endpgm +entry: + %gep = getelementptr i64, ptr addrspace(3) %addr, i32 4 + %val = call <4 x i16> @llvm.amdgcn.ds.read.tr16.b64.v4i16.p3(ptr addrspace(3) %gep) + store <4 x i16> %val, ptr addrspace(1) %use + ret void +} diff --git a/llvm/test/MC/AMDGPU/gfx950-unsupported.s b/llvm/test/MC/AMDGPU/gfx950-unsupported.s index f8bbd40b700fd8..225784177ae185 100644 --- a/llvm/test/MC/AMDGPU/gfx950-unsupported.s +++ 
b/llvm/test/MC/AMDGPU/gfx950-unsupported.s @@ -1,4 +1,5 @@ // RUN: not llvm-mc -triple=amdgcn -mcpu=gfx950 %s 2>&1 | FileCheck -check-prefix=ERR %s +// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx950 -mattr=+wavefrontsize32,-wavefrontsize64 %s 2>&1 | FileCheck -check-prefix=W32-ERR %s //===----------------------------------------------------------------------===// // v_mfma_f32_32x32x4_xf32 @@ -177,3 +178,79 @@ v_mfma_f32_16x16x8_xf32 a[0:3], v[0:3], v[0:3], a[4:7] v_mfma_f32_16x16x8_xf32 v[0:3], a[0:3], a[0:3], v[4:7] // ERR: :[[@LINE-1]]:{{[0-9]+}}: error: instruction not supported on this GPU + +//===----------------------------------------------------------------------===// +// ds_read_b64_tr_b4 +//===----------------------------------------------------------------------===// +ds_read_b64_tr_b4 v[1:2], v0 +// ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid register class: vgpr tuples must be 64 bit aligned +// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: instruction not supported on this GPU + +ds_read_b64_tr_b4 v1, v0 +// ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction +// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: instruction not supported on this GPU + +ds_read_b64_tr_b4 v[0:1], s0 +// ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction +// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: instruction not supported on this GPU + +ds_read_b64_tr_b4 v[2:3], v2 offset:-64 +// ERR: :[[@LINE-1]]:{{[0-9]+}}: error: expected a 16-bit unsigned offset +// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: instruction not supported on this GPU + +//===----------------------------------------------------------------------===// +//ds_read_b64_tr_b8 +//===----------------------------------------------------------------------===// +ds_read_b64_tr_b8 v[1:2], v0 +// ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid register class: vgpr tuples must be 64 bit aligned +// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: instruction not supported on this GPU + +ds_read_b64_tr_b8 v1, 
v0 +// ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction +// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: instruction not supported on this GPU + +ds_read_b64_tr_b8 v[0:1], s0 +// ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction +// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: instruction not supported on this GPU + +ds_read_b64_tr_b8 v[2:3], v2 offset:-64 +// ERR: :[[@LINE-1]]:{{[0-9]+}}: error: expected a 16-bit unsigned offset +// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: instruction not supported on this GPU + +//===----------------------------------------------------------------------===// +// ds_read_b64_tr_b16 +//===----------------------------------------------------------------------===// +ds_read_b64_tr_b16 v[1:2], v0 +// ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid register class: vgpr tuples must be 64 bit aligned +// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: instruction not supported on this GPU + +ds_read_b64_tr_b16 v1, v0 +// ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction +// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: instruction not supported on this GPU + +ds_read_b64_tr_b16 v[0:1], s0 +// ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction +// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: instruction not supported on this GPU + +ds_read_b64_tr_b16 v[2:3], v2 offset:-64 +// ERR: :[[@LINE-1]]:{{[0-9]+}}: error: expected a 16-bit unsigned offset +// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: instruction not supported on this GPU + +//===----------------------------------------------------------------------===// +// ds_read_b96_tr_b6 +//===----------------------------------------------------------------------===// +ds_read_b96_tr_b6 v[1:3], v0 +// ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid register class: vgpr tuples must be 64 bit aligned +// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: instruction not supported on this GPU + +ds_read_b96_tr_b6 v1, v0 +// ERR: :[[@LINE-1]]:{{[0-9]+}}: error: 
invalid operand for instruction +// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: instruction not supported on this GPU + +ds_read_b96_tr_b6 v[0:3], s0 +// ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction +// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: instruction not supported on this GPU + +ds_read_b96_tr_b6 v[2:4], v2 offset:-64 +// ERR: :[[@LINE-1]]:{{[0-9]+}}: error: expected a 16-bit unsigned offset +// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: instruction not supported on this GPU diff --git a/llvm/test/MC/AMDGPU/gfx950_asm_read_tr.s b/llvm/test/MC/AMDGPU/gfx950_asm_read_tr.s new file mode 100644 index 00000000000000..93d015f790c862 --- /dev/null +++ b/llvm/test/MC/AMDGPU/gfx950_asm_read_tr.s @@ -0,0 +1,34 @@ +// RUN: llvm-mc -arch=amdgcn -mcpu=gfx950 -show-encoding %s | FileCheck --check-prefix=GFX950 %s +// RUN: not llvm-mc -arch=amdgcn -mcpu=gfx940 %s 2>&1 | FileCheck --check-prefix=GFX940-ERR --implicit-check-not=error: %s + +ds_read_b64_tr_b4 v[0:1], v1 +// GFX940-ERR: [[@LINE-1]]:{{[0-9]+}}: error: instruction not supported on this GPU +// GFX950: encoding: [0x00,0x00,0xc0,0xd9,0x01,0x00,0x00,0x00] + +ds_read_b64_tr_b4 v[2:3], v3 offset:64 +// GFX940-ERR: [[@LINE-1]]:{{[0-9]+}}: error: instruction not supported on this GPU +// GFX950: encoding: [0x40,0x00,0xc0,0xd9,0x03,0x00,0x00,0x02] + +ds_read_b64_tr_b8 v[0:1], v1 +// GFX940-ERR: [[@LINE-1]]:{{[0-9]+}}: error: instruction not supported on this GPU +// GFX950: encoding: [0x00,0x00,0xc4,0xd9,0x01,0x00,0x00,0x00] + +ds_read_b64_tr_b8 v[2:3], v3 offset:64 +// GFX940-ERR: [[@LINE-1]]:{{[0-9]+}}: error: instruction not supported on this GPU +// GFX950: encoding: [0x40,0x00,0xc4,0xd9,0x03,0x00,0x00,0x02] + +ds_read_b64_tr_b16 v[0:1], v1 +// GFX940-ERR: [[@LINE-1]]:{{[0-9]+}}: error: instruction not supported on this GPU +// GFX950: encoding: [0x00,0x00,0xc6,0xd9,0x01,0x00,0x00,0x00] + +ds_read_b64_tr_b16 v[2:3], v3 offset:64 +// GFX940-ERR: [[@LINE-1]]:{{[0-9]+}}: error: instruction not 
supported on this GPU +// GFX950: encoding: [0x40,0x00,0xc6,0xd9,0x03,0x00,0x00,0x02] + +ds_read_b96_tr_b6 v[0:2], v0 +// GFX940-ERR: [[@LINE-1]]:{{[0-9]+}}: error: instruction not supported on this GPU +// GFX950: encoding: [0x00,0x00,0xc2,0xd9,0x00,0x00,0x00,0x00] + +ds_read_b96_tr_b6 v[2:4], v2 offset:64 +// GFX940-ERR: [[@LINE-1]]:{{[0-9]+}}: error: instruction not supported on this GPU +// GFX950: encoding: [0x40,0x00,0xc2,0xd9,0x02,0x00,0x00,0x02] diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx950_dasm_ds_read_tr.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx950_dasm_ds_read_tr.txt new file mode 100644 index 00000000000000..1efd2d7b996d48 --- /dev/null +++ b/llvm/test/MC/Disassembler/AMDGPU/gfx950_dasm_ds_read_tr.txt @@ -0,0 +1,37 @@ +# RUN: llvm-mc -arch=amdgcn -mcpu=gfx950 -disassemble -show-encoding %s | FileCheck -check-prefix=GFX950 %s + +# GFX950: ds_read_b64_tr_b4 v[0:1], v0 ; encoding: [0x00,0x00,0xc0,0xd9,0x00,0x00,0x00,0x00] +0x00,0x00,0xc0,0xd9,0x00,0x00,0x00,0x00 + +# GFX950: ds_read_b64_tr_b4 v[2:3], v2 ; encoding: [0x00,0x00,0xc0,0xd9,0x02,0x00,0x00,0x02] +0x00,0x00,0xc0,0xd9,0x02,0x00,0x00,0x02 + +# GFX950: ds_read_b64_tr_b4 v[2:3], v2 offset:64 ; encoding: [0x40,0x00,0xc0,0xd9,0x02,0x00,0x00,0x02] +0x40,0x00,0xc0,0xd9,0x02,0x00,0x00,0x02 + +# GFX950: ds_read_b64_tr_b8 v[0:1], v0 ; encoding: [0x00,0x00,0xc4,0xd9,0x00,0x00,0x00,0x00] +0x00,0x00,0xc4,0xd9,0x00,0x00,0x00,0x00 + +# GFX950: ds_read_b64_tr_b8 v[2:3], v2 ; encoding: [0x00,0x00,0xc4,0xd9,0x02,0x00,0x00,0x02] +0x00,0x00,0xc4,0xd9,0x02,0x00,0x00,0x02 + +# GFX950: ds_read_b64_tr_b8 v[2:3], v2 offset:64 ; encoding: [0x40,0x00,0xc4,0xd9,0x02,0x00,0x00,0x02] +0x40,0x00,0xc4,0xd9,0x02,0x00,0x00,0x02 + +# GFX950: ds_read_b64_tr_b16 v[0:1], v0 ; encoding: [0x00,0x00,0xc6,0xd9,0x00,0x00,0x00,0x00] +0x00,0x00,0xc6,0xd9,0x00,0x00,0x00,0x00 + +# GFX950: ds_read_b64_tr_b16 v[2:3], v2 ; encoding: [0x00,0x00,0xc6,0xd9,0x02,0x00,0x00,0x02] +0x00,0x00,0xc6,0xd9,0x02,0x00,0x00,0x02 + +# GFX950: 
ds_read_b64_tr_b16 v[2:3], v2 offset:64 ; encoding: [0x40,0x00,0xc6,0xd9,0x02,0x00,0x00,0x02] +0x40,0x00,0xc6,0xd9,0x02,0x00,0x00,0x02 + +# GFX950: ds_read_b96_tr_b6 v[0:2], v0 ; encoding: [0x00,0x00,0xc2,0xd9,0x00,0x00,0x00,0x00] +0x00,0x00,0xc2,0xd9,0x00,0x00,0x00,0x00 + +# GFX950: ds_read_b96_tr_b6 v[2:4], v2 ; encoding: [0x00,0x00,0xc2,0xd9,0x02,0x00,0x00,0x02] +0x00,0x00,0xc2,0xd9,0x02,0x00,0x00,0x02 + +# GFX950: ds_read_b96_tr_b6 v[2:4], v2 offset:64 ; encoding: [0x40,0x00,0xc2,0xd9,0x02,0x00,0x00,0x02] +0x40,0x00,0xc2,0xd9,0x02,0x00,0x00,0x02 _______________________________________________ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits