Hi, the attached patches fix some LDS bugs on SI and add support for local memory atomic add on R600 and SI.
Please review. -Tom
>From 0447b0918efca9000e66414e9ff1bd291936f702 Mon Sep 17 00:00:00 2001 From: Tom Stellard <[email protected]> Date: Wed, 21 Aug 2013 11:35:27 -0400 Subject: [PATCH 1/4] R600/SI: Don't emit S_WQM_B64 instruction for compute shaders --- lib/Target/R600/SILowerControlFlow.cpp | 3 ++- test/CodeGen/R600/load.ll | 13 +++++++++++++ test/CodeGen/R600/local-memory.ll | 1 + 3 files changed, 16 insertions(+), 1 deletion(-) diff --git a/lib/Target/R600/SILowerControlFlow.cpp b/lib/Target/R600/SILowerControlFlow.cpp index c2e8f02..09cf25b 100644 --- a/lib/Target/R600/SILowerControlFlow.cpp +++ b/lib/Target/R600/SILowerControlFlow.cpp @@ -409,6 +409,7 @@ void SILowerControlFlowPass::IndirectDst(MachineInstr &MI) { bool SILowerControlFlowPass::runOnMachineFunction(MachineFunction &MF) { TII = MF.getTarget().getInstrInfo(); TRI = MF.getTarget().getRegisterInfo(); + SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); bool HaveKill = false; bool NeedM0 = false; @@ -508,7 +509,7 @@ bool SILowerControlFlowPass::runOnMachineFunction(MachineFunction &MF) { AMDGPU::M0).addImm(0xffffffff); } - if (NeedWQM) { + if (NeedWQM && MFI->ShaderType != ShaderType::COMPUTE) { MachineBasicBlock &MBB = MF.front(); BuildMI(MBB, MBB.getFirstNonPHI(), DebugLoc(), TII->get(AMDGPU::S_WQM_B64), AMDGPU::EXEC).addReg(AMDGPU::EXEC); diff --git a/test/CodeGen/R600/load.ll b/test/CodeGen/R600/load.ll index c7fe611..8829ff5 100644 --- a/test/CodeGen/R600/load.ll +++ b/test/CodeGen/R600/load.ll @@ -446,6 +446,7 @@ define void @load_const_addrspace_f32(float addrspace(1)* %out, float addrspace( ; R600-CHECK: @load_i8_local ; R600-CHECK: LDS_UBYTE_READ_RET ; SI-CHECK: @load_i8_local +; SI-CHECK-NOT: S_WQM_B64 ; SI-CHECK: DS_READ_U8 define void @load_i8_local(i32 addrspace(1)* %out, i8 addrspace(3)* %in) { %1 = load i8 addrspace(3)* %in @@ -458,6 +459,7 @@ define void @load_i8_local(i32 addrspace(1)* %out, i8 addrspace(3)* %in) { ; R600-CHECK: LDS_UBYTE_READ_RET ; R600-CHECK: ASHR ; SI-CHECK: 
@load_i8_sext_local +; SI-CHECK-NOT: S_WQM_B64 ; SI-CHECK: DS_READ_I8 define void @load_i8_sext_local(i32 addrspace(1)* %out, i8 addrspace(3)* %in) { entry: @@ -471,6 +473,7 @@ entry: ; R600-CHECK: LDS_UBYTE_READ_RET ; R600-CHECK: LDS_UBYTE_READ_RET ; SI-CHECK: @load_v2i8_local +; SI-CHECK-NOT: S_WQM_B64 ; SI-CHECK: DS_READ_U8 ; SI-CHECK: DS_READ_U8 define void @load_v2i8_local(<2 x i32> addrspace(1)* %out, <2 x i8> addrspace(3)* %in) { @@ -487,6 +490,7 @@ entry: ; R600-CHECK-DAG: ASHR ; R600-CHECK-DAG: ASHR ; SI-CHECK: @load_v2i8_sext_local +; SI-CHECK-NOT: S_WQM_B64 ; SI-CHECK: DS_READ_I8 ; SI-CHECK: DS_READ_I8 define void @load_v2i8_sext_local(<2 x i32> addrspace(1)* %out, <2 x i8> addrspace(3)* %in) { @@ -503,6 +507,7 @@ entry: ; R600-CHECK: LDS_UBYTE_READ_RET ; R600-CHECK: LDS_UBYTE_READ_RET ; SI-CHECK: @load_v4i8_local +; SI-CHECK-NOT: S_WQM_B64 ; SI-CHECK: DS_READ_U8 ; SI-CHECK: DS_READ_U8 ; SI-CHECK: DS_READ_U8 @@ -525,6 +530,7 @@ entry: ; R600-CHECK-DAG: ASHR ; R600-CHECK-DAG: ASHR ; SI-CHECK: @load_v4i8_sext_local +; SI-CHECK-NOT: S_WQM_B64 ; SI-CHECK: DS_READ_I8 ; SI-CHECK: DS_READ_I8 ; SI-CHECK: DS_READ_I8 @@ -541,6 +547,7 @@ entry: ; R600-CHECK: @load_i16_local ; R600-CHECK: LDS_USHORT_READ_RET ; SI-CHECK: @load_i16_local +; SI-CHECK-NOT: S_WQM_B64 ; SI-CHECK: DS_READ_U16 define void @load_i16_local(i32 addrspace(1)* %out, i16 addrspace(3)* %in) { entry: @@ -554,6 +561,7 @@ entry: ; R600-CHECK: LDS_USHORT_READ_RET ; R600-CHECK: ASHR ; SI-CHECK: @load_i16_sext_local +; SI-CHECK-NOT: S_WQM_B64 ; SI-CHECK: DS_READ_I16 define void @load_i16_sext_local(i32 addrspace(1)* %out, i16 addrspace(3)* %in) { entry: @@ -567,6 +575,7 @@ entry: ; R600-CHECK: LDS_USHORT_READ_RET ; R600-CHECK: LDS_USHORT_READ_RET ; SI-CHECK: @load_v2i16_local +; SI-CHECK-NOT: S_WQM_B64 ; SI-CHECK: DS_READ_U16 ; SI-CHECK: DS_READ_U16 define void @load_v2i16_local(<2 x i32> addrspace(1)* %out, <2 x i16> addrspace(3)* %in) { @@ -583,6 +592,7 @@ entry: ; R600-CHECK-DAG: ASHR ; 
R600-CHECK-DAG: ASHR ; SI-CHECK: @load_v2i16_sext_local +; SI-CHECK-NOT: S_WQM_B64 ; SI-CHECK: DS_READ_I16 ; SI-CHECK: DS_READ_I16 define void @load_v2i16_sext_local(<2 x i32> addrspace(1)* %out, <2 x i16> addrspace(3)* %in) { @@ -599,6 +609,7 @@ entry: ; R600-CHECK: LDS_USHORT_READ_RET ; R600-CHECK: LDS_USHORT_READ_RET ; SI-CHECK: @load_v4i16_local +; SI-CHECK-NOT: S_WQM_B64 ; SI-CHECK: DS_READ_U16 ; SI-CHECK: DS_READ_U16 ; SI-CHECK: DS_READ_U16 @@ -621,6 +632,7 @@ entry: ; R600-CHECK-DAG: ASHR ; R600-CHECK-DAG: ASHR ; SI-CHECK: @load_v4i16_sext_local +; SI-CHECK-NOT: S_WQM_B64 ; SI-CHECK: DS_READ_I16 ; SI-CHECK: DS_READ_I16 ; SI-CHECK: DS_READ_I16 @@ -637,6 +649,7 @@ entry: ; R600-CHECK: @load_i32_local ; R600-CHECK: LDS_READ_RET ; SI-CHECK: @load_i32_local +; SI-CHECK-NOT: S_WQM_B64 ; SI-CHECK: DS_READ_B32 define void @load_i32_local(i32 addrspace(1)* %out, i32 addrspace(3)* %in) { entry: diff --git a/test/CodeGen/R600/local-memory.ll b/test/CodeGen/R600/local-memory.ll index 9ebb769..bd0d59c 100644 --- a/test/CodeGen/R600/local-memory.ll +++ b/test/CodeGen/R600/local-memory.ll @@ -13,6 +13,7 @@ ; SI-CHECK-NEXT: .long 32768 ; EG-CHECK: LDS_WRITE +; SI-CHECK-NOT: S_WQM_B64 ; SI-CHECK: DS_WRITE_B32 0 ; GROUP_BARRIER must be the last instruction in a clause -- 1.7.11.4
>From a3606062a83cbe95916b2983f5e966a80540e6ff Mon Sep 17 00:00:00 2001 From: Tom Stellard <[email protected]> Date: Tue, 20 Aug 2013 13:17:22 -0700 Subject: [PATCH 2/4] R600: Fix incorrect LDS size calculation GlobalAddress nodes that appeared in more than one basic block were being counted twice. --- lib/Target/R600/AMDGPUISelLowering.cpp | 14 ++++++++++---- lib/Target/R600/AMDGPUMachineFunction.h | 4 ++++ test/CodeGen/R600/lds-size.ll | 26 ++++++++++++++++++++++++++ 3 files changed, 40 insertions(+), 4 deletions(-) create mode 100644 test/CodeGen/R600/lds-size.ll diff --git a/lib/Target/R600/AMDGPUISelLowering.cpp b/lib/Target/R600/AMDGPUISelLowering.cpp index 24b826b..5497356 100644 --- a/lib/Target/R600/AMDGPUISelLowering.cpp +++ b/lib/Target/R600/AMDGPUISelLowering.cpp @@ -248,12 +248,18 @@ SDValue AMDGPUTargetLowering::LowerGlobalAddress(AMDGPUMachineFunction* MFI, assert(G->getOffset() == 0 && "Do not know what to do with an non-zero offset"); - unsigned Offset = MFI->LDSSize; const GlobalValue *GV = G->getGlobal(); - uint64_t Size = TD->getTypeAllocSize(GV->getType()->getElementType()); - // XXX: Account for alignment? - MFI->LDSSize += Size; + unsigned Offset; + if (MFI->LocalMemoryObjects.count(GV) == 0) { + uint64_t Size = TD->getTypeAllocSize(GV->getType()->getElementType()); + Offset = MFI->LDSSize; + MFI->LocalMemoryObjects[GV] = Offset; + // XXX: Account for alignment? 
+ MFI->LDSSize += Size; + } else { + Offset = MFI->LocalMemoryObjects[GV]; + } return DAG.getConstant(Offset, getPointerTy(G->getAddressSpace())); } diff --git a/lib/Target/R600/AMDGPUMachineFunction.h b/lib/Target/R600/AMDGPUMachineFunction.h index 789b96a..fe80ce3 100644 --- a/lib/Target/R600/AMDGPUMachineFunction.h +++ b/lib/Target/R600/AMDGPUMachineFunction.h @@ -14,6 +14,7 @@ #define AMDGPUMACHINEFUNCTION_H #include "llvm/CodeGen/MachineFunction.h" +#include <map> namespace llvm { @@ -21,6 +22,9 @@ class AMDGPUMachineFunction : public MachineFunctionInfo { public: AMDGPUMachineFunction(const MachineFunction &MF); unsigned ShaderType; + /// A map to keep track of local memory objects and their offsets within + /// the local memory space. + std::map<const GlobalValue *, unsigned> LocalMemoryObjects; /// Number of bytes in the LDS that are being used. unsigned LDSSize; }; diff --git a/test/CodeGen/R600/lds-size.ll b/test/CodeGen/R600/lds-size.ll new file mode 100644 index 0000000..2185180 --- /dev/null +++ b/test/CodeGen/R600/lds-size.ll @@ -0,0 +1,26 @@ +; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s + +; This test makes sure we do not double count global values when they are +; used in different basic blocks. + +; CHECK-LABEL: @test +; CHECK: .long 166120 +; CHECK-NEXT: .long 1 +@lds = internal addrspace(3) unnamed_addr global i32 zeroinitializer, align 4 + +define void @test(i32 addrspace(1)* %out, i32 %cond) { +entry: + %0 = icmp eq i32 %cond, 0 + br i1 %0, label %if, label %else + +if: + store i32 1, i32 addrspace(3)* @lds + br label %endif + +else: + store i32 2, i32 addrspace(3)* @lds + br label %endif + +endif: + ret void +} -- 1.7.11.4
>From 48dce43f670fe6c58987e4c8a3c0d31c3e3f9c3a Mon Sep 17 00:00:00 2001 From: Tom Stellard <[email protected]> Date: Mon, 19 Aug 2013 07:28:48 -0700 Subject: [PATCH 3/4] R600: Expand SELECT nodes rather than custom lowering them --- lib/Target/R600/R600ISelLowering.cpp | 20 +++++----------- lib/Target/R600/R600ISelLowering.h | 1 - test/CodeGen/R600/select.ll | 46 ++++++++++++++++++++++++++++++++++++ 3 files changed, 52 insertions(+), 15 deletions(-) create mode 100644 test/CodeGen/R600/select.ll diff --git a/lib/Target/R600/R600ISelLowering.cpp b/lib/Target/R600/R600ISelLowering.cpp index f0242b8..450e2a8 100644 --- a/lib/Target/R600/R600ISelLowering.cpp +++ b/lib/Target/R600/R600ISelLowering.cpp @@ -60,8 +60,12 @@ R600TargetLowering::R600TargetLowering(TargetMachine &TM) : setOperationAction(ISD::SETCC, MVT::f32, Expand); setOperationAction(ISD::FP_TO_UINT, MVT::i1, Custom); - setOperationAction(ISD::SELECT, MVT::i32, Custom); - setOperationAction(ISD::SELECT, MVT::f32, Custom); + setOperationAction(ISD::SELECT, MVT::i32, Expand); + setOperationAction(ISD::SELECT, MVT::f32, Expand); + setOperationAction(ISD::SELECT, MVT::v2i32, Expand); + setOperationAction(ISD::SELECT, MVT::v2f32, Expand); + setOperationAction(ISD::SELECT, MVT::v4i32, Expand); + setOperationAction(ISD::SELECT, MVT::v4f32, Expand); // Legalize loads and stores to the private address space. 
setOperationAction(ISD::LOAD, MVT::i32, Custom); @@ -480,7 +484,6 @@ SDValue R600TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const case ISD::FCOS: case ISD::FSIN: return LowerTrig(Op, DAG); case ISD::SELECT_CC: return LowerSELECT_CC(Op, DAG); - case ISD::SELECT: return LowerSELECT(Op, DAG); case ISD::STORE: return LowerSTORE(Op, DAG); case ISD::LOAD: return LowerLOAD(Op, DAG); case ISD::FrameIndex: return LowerFrameIndex(Op, DAG); @@ -930,17 +933,6 @@ SDValue R600TargetLowering::LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const DAG.getCondCode(ISD::SETNE)); } -SDValue R600TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const { - return DAG.getNode(ISD::SELECT_CC, - SDLoc(Op), - Op.getValueType(), - Op.getOperand(0), - DAG.getConstant(0, MVT::i32), - Op.getOperand(1), - Op.getOperand(2), - DAG.getCondCode(ISD::SETNE)); -} - /// LLVM generates byte-addresed pointers. For indirect addressing, we need to /// convert these pointers to a register index. Each register holds /// 16 bytes, (4 x 32bit sub-register), but we need to take into account the diff --git a/lib/Target/R600/R600ISelLowering.h b/lib/Target/R600/R600ISelLowering.h index a033fcb..811850d 100644 --- a/lib/Target/R600/R600ISelLowering.h +++ b/lib/Target/R600/R600ISelLowering.h @@ -56,7 +56,6 @@ private: SDValue LowerROTL(SDValue Op, SelectionDAG &DAG) const; SDValue LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const; - SDValue LowerSELECT(SDValue Op, SelectionDAG &DAG) const; SDValue LowerSTORE(SDValue Op, SelectionDAG &DAG) const; SDValue LowerFPTOUINT(SDValue Op, SelectionDAG &DAG) const; SDValue LowerLOAD(SDValue Op, SelectionDAG &DAG) const; diff --git a/test/CodeGen/R600/select.ll b/test/CodeGen/R600/select.ll new file mode 100644 index 0000000..f940142 --- /dev/null +++ b/test/CodeGen/R600/select.ll @@ -0,0 +1,46 @@ +; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s + +; Normally icmp + select is optimized to select_cc, when this happens the +; DAGLegalizer 
never sees the select and doesn't have a chance to legalize it. +; +; In order to avoid the select_cc optimization, this test case calculates the +; condition for the select in a separate basic block. + +; CHECK-LABEL: @select +; CHECK-DAG: MEM_RAT_CACHELESS STORE_RAW T{{[0-9]+}}.X +; CHECK-DAG: MEM_RAT_CACHELESS STORE_RAW T{{[0-9]+}}.X +; CHECK-DAG: MEM_RAT_CACHELESS STORE_RAW T{{[0-9]+}}.XY +; CHECK-DAG: MEM_RAT_CACHELESS STORE_RAW T{{[0-9]+}}.XY +; CHECK-DAG: MEM_RAT_CACHELESS STORE_RAW T{{[0-9]+}}.XYZW +; CHECK-DAG: MEM_RAT_CACHELESS STORE_RAW T{{[0-9]+}}.XYZW +define void @select (i32 addrspace(1)* %i32out, float addrspace(1)* %f32out, + <2 x i32> addrspace(1)* %v2i32out, <2 x float> addrspace(1)* %v2f32out, + <4 x i32> addrspace(1)* %v4i32out, <4 x float> addrspace(1)* %v4f32out, + i32 %cond) { +entry: + br label %for +body: + %inc = add i32 %i, 1 + %br_cmp.i = icmp eq i1 %br_cmp, 0 + br label %for +for: + %i = phi i32 [ %inc, %body], [ 0, %entry ] + %br_cmp = phi i1 [ %br_cmp.i, %body ], [ 0, %entry ] + %0 = icmp eq i32 %cond, %i + %1 = select i1 %br_cmp, i32 2, i32 3 + %2 = select i1 %br_cmp, float 2.0 , float 5.0 + %3 = select i1 %br_cmp, <2 x i32> <i32 2, i32 3>, <2 x i32> <i32 4, i32 5> + %4 = select i1 %br_cmp, <2 x float> <float 2.0, float 3.0>, <2 x float> <float 4.0, float 5.0> + %5 = select i1 %br_cmp, <4 x i32> <i32 2 , i32 3, i32 4, i32 5>, <4 x i32> <i32 6, i32 7, i32 8, i32 9> + %6 = select i1 %br_cmp, <4 x float> <float 2.0, float 3.0, float 4.0, float 5.0>, <4 x float> <float 6.0, float 7.0, float 8.0, float 9.0> + br i1 %0, label %body, label %done + +done: + store i32 %1, i32 addrspace(1)* %i32out + store float %2, float addrspace(1)* %f32out + store <2 x i32> %3, <2 x i32> addrspace(1)* %v2i32out + store <2 x float> %4, <2 x float> addrspace(1)* %v2f32out + store <4 x i32> %5, <4 x i32> addrspace(1)* %v4i32out + store <4 x float> %6, <4 x float> addrspace(1)* %v4f32out + ret void +} -- 1.7.11.4
>From 8f258501b8e45434125fed4d0b88bfaadc0a62ce Mon Sep 17 00:00:00 2001 From: Tom Stellard <[email protected]> Date: Tue, 20 Aug 2013 13:22:28 -0700 Subject: [PATCH 4/4] R600: Add support for local memory atomic add --- lib/Target/R600/AMDGPUInstructions.td | 5 ++++ lib/Target/R600/R600ISelLowering.cpp | 22 ++++++++++++------ lib/Target/R600/R600InstrInfo.h | 6 +++++ lib/Target/R600/R600Instructions.td | 42 ++++++++++++++++++++++++++++------ lib/Target/R600/SIInstrInfo.td | 12 ++++++++++ lib/Target/R600/SIInstructions.td | 4 ++++ lib/Target/R600/SILowerControlFlow.cpp | 1 + test/CodeGen/R600/atomic_load_add.ll | 23 +++++++++++++++++++ 8 files changed, 101 insertions(+), 14 deletions(-) create mode 100644 test/CodeGen/R600/atomic_load_add.ll diff --git a/lib/Target/R600/AMDGPUInstructions.td b/lib/Target/R600/AMDGPUInstructions.td index dec6082..6745fed 100644 --- a/lib/Target/R600/AMDGPUInstructions.td +++ b/lib/Target/R600/AMDGPUInstructions.td @@ -191,6 +191,11 @@ def local_load : PatFrag<(ops node:$ptr), (load node:$ptr), [{ return isLocalLoad(dyn_cast<LoadSDNode>(N)); }]>; +def atomic_load_add_local : PatFrag<(ops node:$ptr, node:$value), + (atomic_load_add node:$ptr, node:$value), [{ + return dyn_cast<MemSDNode>(N)->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS; +}]>; + def mskor_global : PatFrag<(ops node:$val, node:$ptr), (AMDGPUstore_mskor node:$val, node:$ptr), [{ return dyn_cast<MemSDNode>(N)->getAddressSpace() == AMDGPUAS::GLOBAL_ADDRESS; diff --git a/lib/Target/R600/R600ISelLowering.cpp b/lib/Target/R600/R600ISelLowering.cpp index 450e2a8..ff9ba52 100644 --- a/lib/Target/R600/R600ISelLowering.cpp +++ b/lib/Target/R600/R600ISelLowering.cpp @@ -109,16 +109,24 @@ MachineBasicBlock * R600TargetLowering::EmitInstrWithCustomInserter( switch (MI->getOpcode()) { default: - if (TII->get(MI->getOpcode()).TSFlags & R600_InstFlag::LDS_1A) { - MachineInstrBuilder NewMI = BuildMI(*BB, I, BB->findDebugLoc(I), - TII->get(MI->getOpcode()), - AMDGPU::OQAP); + if 
(TII->isLDSInstr(MI->getOpcode()) && + TII->getOperandIdx(MI->getOpcode(), AMDGPU::OpName::dst) != -1) { + int DstIdx = TII->getOperandIdx(MI->getOpcode(), AMDGPU::OpName::dst); + assert(DstIdx != -1); + MachineInstrBuilder NewMI; + if (!MRI.use_empty(MI->getOperand(DstIdx).getReg())) { + NewMI = BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(MI->getOpcode()), + AMDGPU::OQAP); + TII->buildDefaultInstruction(*BB, I, AMDGPU::MOV, + MI->getOperand(0).getReg(), + AMDGPU::OQAP); + } else { + NewMI = BuildMI(*BB, I, BB->findDebugLoc(I), + TII->get(AMDGPU::getLDSNoRetOp(MI->getOpcode()))); + } for (unsigned i = 1, e = MI->getNumOperands(); i < e; ++i) { NewMI.addOperand(MI->getOperand(i)); } - TII->buildDefaultInstruction(*BB, I, AMDGPU::MOV, - MI->getOperand(0).getReg(), - AMDGPU::OQAP); } else { return AMDGPUTargetLowering::EmitInstrWithCustomInserter(MI, BB); } diff --git a/lib/Target/R600/R600InstrInfo.h b/lib/Target/R600/R600InstrInfo.h index e28d771..189d062 100644 --- a/lib/Target/R600/R600InstrInfo.h +++ b/lib/Target/R600/R600InstrInfo.h @@ -273,6 +273,12 @@ namespace llvm { void clearFlag(MachineInstr *MI, unsigned Operand, unsigned Flag) const; }; +namespace AMDGPU { + +int getLDSNoRetOp(uint16_t Opcode); + +} //End namespace AMDGPU + } // End llvm namespace #endif // R600INSTRINFO_H_ diff --git a/lib/Target/R600/R600Instructions.td b/lib/Target/R600/R600Instructions.td index f5c0266..76c3c4f 100644 --- a/lib/Target/R600/R600Instructions.td +++ b/lib/Target/R600/R600Instructions.td @@ -1640,23 +1640,39 @@ class R600_LDS_1A <bits<6> lds_op, string name, list<dag> pattern> : R600_LDS < let DisableEncoding = "$dst"; } -class R600_LDS_1A1D <bits<6> lds_op, string name, list<dag> pattern> : +class R600_LDS_1A1D <bits<6> lds_op, dag outs, string name, list<dag> pattern, + string dst =""> : R600_LDS < - lds_op, - (outs), + lds_op, outs, (ins R600_Reg32:$src0, REL:$src0_rel, SEL:$src0_sel, R600_Reg32:$src1, REL:$src1_rel, SEL:$src1_sel, LAST:$last, R600_Pred:$pred_sel, 
BANK_SWIZZLE:$bank_swizzle), - " "#name#" $last $src0$src0_rel, $src1$src1_rel, $pred_sel", + " "#name#" $last "#dst#"$src0$src0_rel, $src1$src1_rel, $pred_sel", pattern > { + field string BaseOp; + let src2 = 0; let src2_rel = 0; let LDS_1A1D = 1; } +class R600_LDS_1A1D_NORET <bits<6> lds_op, string name, list<dag> pattern> : + R600_LDS_1A1D <lds_op, (outs), name, pattern> { + let BaseOp = name; +} + +class R600_LDS_1A1D_RET <bits<6> lds_op, string name, list<dag> pattern> : + R600_LDS_1A1D <lds_op, (outs R600_Reg32:$dst), name##"_RET", pattern, "OQAP, "> { + + let BaseOp = name; + let usesCustomInserter = 1; + let DisableEncoding = "$dst"; + let Defs = [OQAP]; +} + class R600_LDS_1A2D <bits<6> lds_op, string name, list<dag> pattern> : R600_LDS < lds_op, @@ -1670,15 +1686,19 @@ class R600_LDS_1A2D <bits<6> lds_op, string name, list<dag> pattern> : let LDS_1A2D = 1; } -def LDS_WRITE : R600_LDS_1A1D <0xD, "LDS_WRITE", +def LDS_ADD : R600_LDS_1A1D_NORET <0x0, "LDS_ADD", [] >; +def LDS_WRITE : R600_LDS_1A1D_NORET <0xD, "LDS_WRITE", [(local_store (i32 R600_Reg32:$src1), R600_Reg32:$src0)] >; -def LDS_BYTE_WRITE : R600_LDS_1A1D<0x12, "LDS_BYTE_WRITE", +def LDS_BYTE_WRITE : R600_LDS_1A1D_NORET<0x12, "LDS_BYTE_WRITE", [(truncstorei8_local i32:$src1, i32:$src0)] >; -def LDS_SHORT_WRITE : R600_LDS_1A1D<0x13, "LDS_SHORT_WRITE", +def LDS_SHORT_WRITE : R600_LDS_1A1D_NORET<0x13, "LDS_SHORT_WRITE", [(truncstorei16_local i32:$src1, i32:$src0)] >; +def LDS_ADD_RET : R600_LDS_1A1D_RET <0x20, "LDS_ADD", + [(set i32:$dst, (atomic_load_add_local i32:$src0, i32:$src1))] +>; def LDS_READ_RET : R600_LDS_1A <0x32, "LDS_READ_RET", [(set (i32 R600_Reg32:$dst), (local_load R600_Reg32:$src0))] >; @@ -2442,3 +2462,11 @@ def : BitConvert <v4i32, v4f32, R600_Reg128>; def : DwordAddrPat <i32, R600_Reg32>; } // End isR600toCayman Predicate + +def getLDSNoRetOp : InstrMapping { + let FilterClass = "R600_LDS_1A1D"; + let RowFields = ["BaseOp"]; + let ColFields = ["DisableEncoding"]; + let KeyCol = 
["$dst"]; + let ValueCols = [[""""]]; +} diff --git a/lib/Target/R600/SIInstrInfo.td b/lib/Target/R600/SIInstrInfo.td index ecc4718..09d5f01 100644 --- a/lib/Target/R600/SIInstrInfo.td +++ b/lib/Target/R600/SIInstrInfo.td @@ -362,6 +362,18 @@ class DS_Store_Helper <bits<8> op, string asm, RegisterClass regClass> : DS < let vdst = 0; } +class DS_1A1D_RET <bits<8> op, string asm, RegisterClass rc> : DS < + op, + (outs rc:$vdst), + (ins i1imm:$gds, VReg_32:$addr, VReg_32:$data0, i8imm:$offset0, + i8imm:$offset1), + asm#" $gds, $vdst, $addr, $data0, $offset0, $offset1, [M0]", + []> { + let mayStore = 1; + let mayLoad = 1; + let data1 = 0; +} + class MTBUF_Store_Helper <bits<3> op, string asm, RegisterClass regClass> : MTBUF < op, (outs), diff --git a/lib/Target/R600/SIInstructions.td b/lib/Target/R600/SIInstructions.td index 136f69c..31a5ad2 100644 --- a/lib/Target/R600/SIInstructions.td +++ b/lib/Target/R600/SIInstructions.td @@ -391,6 +391,7 @@ defm V_CMPX_CLASS_F64 : VOPC_64 <0x000000b8, "V_CMPX_CLASS_F64">; } // End isCompare = 1 +def DS_ADD_U32_RTN : DS_1A1D_RET <0x20, "DS_ADD_U32_RTN", VReg_32>; def DS_WRITE_B32 : DS_Store_Helper <0x0000000d, "DS_WRITE_B32", VReg_32>; def DS_WRITE_B8 : DS_Store_Helper <0x00000001e, "DS_WRITE_B8", VReg_32>; def DS_WRITE_B16 : DS_Store_Helper <0x00000001f, "DS_WRITE_B16", VReg_32>; @@ -1775,6 +1776,9 @@ def : DSWritePat <DS_WRITE_B8, i32, truncstorei8_local>; def : DSWritePat <DS_WRITE_B16, i32, truncstorei16_local>; def : DSWritePat <DS_WRITE_B32, i32, local_store>; +def : Pat <(atomic_load_add_local i32:$ptr, i32:$val), + (DS_ADD_U32_RTN 0, $ptr, $val, 0, 0)>; + /********** ================== **********/ /********** SMRD Patterns **********/ /********** ================== **********/ diff --git a/lib/Target/R600/SILowerControlFlow.cpp b/lib/Target/R600/SILowerControlFlow.cpp index 09cf25b..a6c43bb 100644 --- a/lib/Target/R600/SILowerControlFlow.cpp +++ b/lib/Target/R600/SILowerControlFlow.cpp @@ -488,6 +488,7 @@ bool 
SILowerControlFlowPass::runOnMachineFunction(MachineFunction &MF) { NeedWQM = true; // Fall through case AMDGPU::DS_WRITE_B32: + case AMDGPU::DS_ADD_U32_RTN: NeedM0 = true; break; diff --git a/test/CodeGen/R600/atomic_load_add.ll b/test/CodeGen/R600/atomic_load_add.ll new file mode 100644 index 0000000..054d9cd --- /dev/null +++ b/test/CodeGen/R600/atomic_load_add.ll @@ -0,0 +1,23 @@ +; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s --check-prefix=R600-CHECK +; RUN: llc < %s -march=r600 -mcpu=SI | FileCheck %s --check-prefix=SI-CHECK + +; R600-CHECK-LABEL: @atomic_add_local +; R600-CHECK: LDS_ADD * +; SI-CHECK-LABEL: @atomic_add_local +; SI-CHECK: DS_ADD_U32_RTN 0 +define void @atomic_add_local(i32 addrspace(3)* %local) { +entry: + %0 = atomicrmw volatile add i32 addrspace(3)* %local, i32 5 seq_cst + ret void +} + +; R600-CHECK-LABEL: @atomic_add_ret_local +; R600-CHECK: LDS_ADD_RET * +; SI-CHECK-LABEL: @atomic_add_ret_local +; SI-CHECK: DS_ADD_U32_RTN 0 +define void @atomic_add_ret_local(i32 addrspace(1)* %out, i32 addrspace(3)* %local) { +entry: + %0 = atomicrmw volatile add i32 addrspace(3)* %local, i32 5 seq_cst + store i32 %0, i32 addrspace(1)* %out + ret void +} -- 1.7.11.4
_______________________________________________ mesa-dev mailing list [email protected] http://lists.freedesktop.org/mailman/listinfo/mesa-dev
