On Fri, Nov 09, 2012 at 04:13:51PM +0100, Michel Dänzer wrote: > From: Michel Dänzer <[email protected]> > >
I've always used "Flow Control", but Wikipedia and one person in #llvm agree that "Control Flow" is correct. Thanks for pointing out this mistake. Now, I can sound smart when I talk to other compiler writers. :) Reviewed-by: Tom Stellard <[email protected]> > Signed-off-by: Michel Dänzer <[email protected]> > --- > > This patch applies on top of Christian's SGPR liveness patch. > > lib/Target/AMDGPU/AMDGPU.h | 2 +- > lib/Target/AMDGPU/AMDGPUTargetMachine.cpp | 2 +- > lib/Target/AMDGPU/CMakeLists.txt | 2 +- > lib/Target/AMDGPU/SIFixSGPRLiveness.cpp | 2 +- > lib/Target/AMDGPU/SILowerControlFlow.cpp | 193 > +++++++++++++++++++++++++++++ > lib/Target/AMDGPU/SILowerFlowControl.cpp | 193 > ----------------------------- > 6 files changed, 197 insertions(+), 197 deletions(-) > create mode 100644 lib/Target/AMDGPU/SILowerControlFlow.cpp > delete mode 100644 lib/Target/AMDGPU/SILowerFlowControl.cpp > > diff --git a/lib/Target/AMDGPU/AMDGPU.h b/lib/Target/AMDGPU/AMDGPU.h > index 33a74dc..2a06ade 100644 > --- a/lib/Target/AMDGPU/AMDGPU.h > +++ b/lib/Target/AMDGPU/AMDGPU.h > @@ -25,7 +25,7 @@ FunctionPass > *createR600ExpandSpecialInstrsPass(TargetMachine &tm); > > // SI Passes > FunctionPass *createSIAssignInterpRegsPass(TargetMachine &tm); > -FunctionPass *createSILowerFlowControlPass(TargetMachine &tm); > +FunctionPass *createSILowerControlFlowPass(TargetMachine &tm); > FunctionPass *createSICodeEmitterPass(formatted_raw_ostream &OS); > FunctionPass *createSILowerLiteralConstantsPass(TargetMachine &tm); > FunctionPass *createSIFixSGPRLivenessPass(TargetMachine &tm); > diff --git a/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp > b/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp > index 024ff3d..5c4af91 100644 > --- a/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp > +++ b/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp > @@ -137,7 +137,7 @@ bool AMDGPUPassConfig::addPreEmitPass() { > addPass(&FinalizeMachineBundlesID); > } else { > addPass(createSILowerLiteralConstantsPass(*TM)); > - 
addPass(createSILowerFlowControlPass(*TM)); > + addPass(createSILowerControlFlowPass(*TM)); > } > > return false; > diff --git a/lib/Target/AMDGPU/CMakeLists.txt > b/lib/Target/AMDGPU/CMakeLists.txt > index 5e013f6..cd3f174 100644 > --- a/lib/Target/AMDGPU/CMakeLists.txt > +++ b/lib/Target/AMDGPU/CMakeLists.txt > @@ -41,7 +41,7 @@ add_llvm_target(AMDGPUCodeGen > SIInstrInfo.cpp > SIISelLowering.cpp > SILowerLiteralConstants.cpp > - SILowerFlowControl.cpp > + SILowerControlFlow.cpp > SIMachineFunctionInfo.cpp > SIRegisterInfo.cpp > SIPreColorSGPRs.cpp > diff --git a/lib/Target/AMDGPU/SIFixSGPRLiveness.cpp > b/lib/Target/AMDGPU/SIFixSGPRLiveness.cpp > index 028753e..f92eff5 100644 > --- a/lib/Target/AMDGPU/SIFixSGPRLiveness.cpp > +++ b/lib/Target/AMDGPU/SIFixSGPRLiveness.cpp > @@ -7,7 +7,7 @@ > // > > //===----------------------------------------------------------------------===// > // > -// SGPRs are not affected by flow control. This pass adjust SGPR liveness in > +// SGPRs are not affected by control flow. This pass adjust SGPR liveness in > // so that the register allocator can still correctly allocate them. > // > > //===----------------------------------------------------------------------===// > diff --git a/lib/Target/AMDGPU/SILowerControlFlow.cpp > b/lib/Target/AMDGPU/SILowerControlFlow.cpp > new file mode 100644 > index 0000000..b43fdeb > --- /dev/null > +++ b/lib/Target/AMDGPU/SILowerControlFlow.cpp > @@ -0,0 +1,193 @@ > +//===-- SILowerControlFlow.cpp - Use predicates for control flow > ----------===// > +// > +// The LLVM Compiler Infrastructure > +// > +// This file is distributed under the University of Illinois Open Source > +// License. See LICENSE.TXT for details. > +// > +//===----------------------------------------------------------------------===// > +// > +// This pass lowers the pseudo control flow instructions (SI_IF_NZ, ELSE, > ENDIF) > +// to predicated instructions. 
> +// > +// All control flow (except loops) is handled using predicated instructions > and > +// a predicate stack. Each Scalar ALU controls the operations of 64 Vector > +// ALUs. The Scalar ALU can update the predicate for any of the Vector ALUs > +// by writting to the 64-bit EXEC register (each bit corresponds to a > +// single vector ALU). Typically, for predicates, a vector ALU will write > +// to its bit of the VCC register (like EXEC VCC is 64-bits, one for each > +// Vector ALU) and then the ScalarALU will AND the VCC register with the > +// EXEC to update the predicates. > +// > +// For example: > +// %VCC = V_CMP_GT_F32 %VGPR1, %VGPR2 > +// SI_IF_NZ %VCC > +// %VGPR0 = V_ADD_F32 %VGPR0, %VGPR0 > +// ELSE > +// %VGPR0 = V_SUB_F32 %VGPR0, %VGPR0 > +// ENDIF > +// > +// becomes: > +// > +// %SGPR0 = S_MOV_B64 %EXEC // Save the current exec mask > +// %EXEC = S_AND_B64 %VCC, %EXEC // Update the exec mask > +// %SGPR0 = S_XOR_B64 %SGPR0, %EXEC // Clear live bits from saved exec mask > +// S_CBRANCH_EXECZ label0 // This instruction is an > +// // optimization which allows us to > +// // branch if all the bits of > +// // EXEC are zero. > +// %VGPR0 = V_ADD_F32 %VGPR0, %VGPR0 // Do the IF block of the branch > +// > +// label0: > +// %SGPR2 = S_MOV_B64 %EXEC // Save the current exec mask > +// %EXEC = S_MOV_B64 %SGPR0 // Restore the exec mask for the Then > block > +// %SGPR0 = S_MOV_B64 %SGPR2 // Save the exec mask from the If block > +// S_BRANCH_EXECZ label1 // Use our branch optimization > +// // instruction again. 
> +// %VGPR0 = V_SUB_F32 %VGPR0, %VGPR // Do the THEN block > +// label1: > +// %EXEC = S_OR_B64 %EXEC, %SGPR0 // Re-enable saved exec mask bits > +//===----------------------------------------------------------------------===// > + > +#include "AMDGPU.h" > +#include "SIInstrInfo.h" > +#include "SIMachineFunctionInfo.h" > +#include "llvm/CodeGen/MachineFunction.h" > +#include "llvm/CodeGen/MachineFunctionPass.h" > +#include "llvm/CodeGen/MachineInstrBuilder.h" > +#include "llvm/CodeGen/MachineRegisterInfo.h" > + > +using namespace llvm; > + > +namespace { > + > +class SILowerControlFlowPass : public MachineFunctionPass { > + > +private: > + static char ID; > + const TargetInstrInfo *TII; > + std::vector<unsigned> PredicateStack; > + std::vector<unsigned> UnusedRegisters; > + > + void pushExecMask(MachineBasicBlock &MBB, MachineBasicBlock::iterator I); > + void popExecMask(MachineBasicBlock &MBB, MachineBasicBlock::iterator I); > + > +public: > + SILowerControlFlowPass(TargetMachine &tm) : > + MachineFunctionPass(ID), TII(tm.getInstrInfo()) { } > + > + virtual bool runOnMachineFunction(MachineFunction &MF); > + > + const char *getPassName() const { > + return "SI Lower control flow instructions"; > + } > + > +}; > + > +} // End anonymous namespace > + > +char SILowerControlFlowPass::ID = 0; > + > +FunctionPass *llvm::createSILowerControlFlowPass(TargetMachine &tm) { > + return new SILowerControlFlowPass(tm); > +} > + > +bool SILowerControlFlowPass::runOnMachineFunction(MachineFunction &MF) { > + > + // Find all the unused registers that can be used for the predicate stack. 
> + for (TargetRegisterClass::iterator I = AMDGPU::SReg_64RegClass.begin(), > + S = AMDGPU::SReg_64RegClass.end(); > + I != S; ++I) { > + unsigned Reg = *I; > + if (!MF.getRegInfo().isPhysRegUsed(Reg)) { > + UnusedRegisters.insert(UnusedRegisters.begin(), Reg); > + } > + } > + > + for (MachineFunction::iterator BB = MF.begin(), BB_E = MF.end(); > + BB != BB_E; ++BB) { > + MachineBasicBlock &MBB = *BB; > + for (MachineBasicBlock::iterator I = MBB.begin(), Next = llvm::next(I); > + I != MBB.end(); I = Next) { > + Next = llvm::next(I); > + MachineInstr &MI = *I; > + switch (MI.getOpcode()) { > + default: break; > + case AMDGPU::SI_IF_NZ: > + pushExecMask(MBB, I); > + BuildMI(MBB, I, MBB.findDebugLoc(I), TII->get(AMDGPU::S_AND_B64), > + AMDGPU::EXEC) > + .addOperand(MI.getOperand(0)) // VCC > + .addReg(AMDGPU::EXEC); > + BuildMI(MBB, I, MBB.findDebugLoc(I), TII->get(AMDGPU::S_XOR_B64), > + PredicateStack.back()) > + .addReg(PredicateStack.back()) > + .addReg(AMDGPU::EXEC); > + MI.eraseFromParent(); > + break; > + case AMDGPU::ELSE: > + BuildMI(MBB, I, MBB.findDebugLoc(I), TII->get(AMDGPU::S_MOV_B64), > + UnusedRegisters.back()) > + .addReg(AMDGPU::EXEC); > + BuildMI(MBB, I, MBB.findDebugLoc(I), TII->get(AMDGPU::S_MOV_B64), > + AMDGPU::EXEC) > + .addReg(PredicateStack.back()); > + BuildMI(MBB, I, MBB.findDebugLoc(I), TII->get(AMDGPU::S_MOV_B64), > + PredicateStack.back()) > + .addReg(UnusedRegisters.back()); > + MI.eraseFromParent(); > + break; > + case AMDGPU::ENDIF: > + popExecMask(MBB, I); > + if (MF.getInfo<SIMachineFunctionInfo>()->ShaderType == > ShaderType::PIXEL && > + PredicateStack.empty()) { > + // If the exec mask is non-zero, skip the next two instructions > + BuildMI(MBB, I, MBB.findDebugLoc(I), > TII->get(AMDGPU::S_CBRANCH_EXECNZ)) > + .addImm(3) > + .addReg(AMDGPU::EXEC); > + > + // Exec mask is zero: Export to NULL target... 
> + BuildMI(MBB, I, MBB.findDebugLoc(I), TII->get(AMDGPU::EXP)) > + .addImm(0) > + .addImm(0x09) // V_008DFC_SQ_EXP_NULL > + .addImm(0) > + .addImm(1) > + .addImm(1) > + .addReg(AMDGPU::SREG_LIT_0) > + .addReg(AMDGPU::SREG_LIT_0) > + .addReg(AMDGPU::SREG_LIT_0) > + .addReg(AMDGPU::SREG_LIT_0); > + > + // ... and terminate wavefront > + BuildMI(MBB, I, MBB.findDebugLoc(I), TII->get(AMDGPU::S_ENDPGM)); > + } > + MI.eraseFromParent(); > + break; > + } > + } > + } > + return false; > +} > + > +void SILowerControlFlowPass::pushExecMask(MachineBasicBlock &MBB, > + MachineBasicBlock::iterator I) { > + > + assert(!UnusedRegisters.empty() && "Ran out of registers for predicate > stack"); > + unsigned StackReg = UnusedRegisters.back(); > + UnusedRegisters.pop_back(); > + PredicateStack.push_back(StackReg); > + BuildMI(MBB, I, MBB.findDebugLoc(I), TII->get(AMDGPU::S_MOV_B64), > + StackReg) > + .addReg(AMDGPU::EXEC); > +} > + > +void SILowerControlFlowPass::popExecMask(MachineBasicBlock &MBB, > + MachineBasicBlock::iterator I) { > + unsigned StackReg = PredicateStack.back(); > + PredicateStack.pop_back(); > + UnusedRegisters.push_back(StackReg); > + BuildMI(MBB, I, MBB.findDebugLoc(I), TII->get(AMDGPU::S_OR_B64), > + AMDGPU::EXEC) > + .addReg(AMDGPU::EXEC) > + .addReg(StackReg); > +} > diff --git a/lib/Target/AMDGPU/SILowerFlowControl.cpp > b/lib/Target/AMDGPU/SILowerFlowControl.cpp > deleted file mode 100644 > index 0d90c13..0000000 > --- a/lib/Target/AMDGPU/SILowerFlowControl.cpp > +++ /dev/null > @@ -1,193 +0,0 @@ > -//===-- SILowerFlowControl.cpp - Use predicates for flow control > ----------===// > -// > -// The LLVM Compiler Infrastructure > -// > -// This file is distributed under the University of Illinois Open Source > -// License. See LICENSE.TXT for details. 
> -// > -//===----------------------------------------------------------------------===// > -// > -// This pass lowers the pseudo flow control instructions (SI_IF_NZ, ELSE, > ENDIF) > -// to predicated instructions. > -// > -// All flow control (except loops) is handled using predicated instructions > and > -// a predicate stack. Each Scalar ALU controls the operations of 64 Vector > -// ALUs. The Scalar ALU can update the predicate for any of the Vector ALUs > -// by writting to the 64-bit EXEC register (each bit corresponds to a > -// single vector ALU). Typically, for predicates, a vector ALU will write > -// to its bit of the VCC register (like EXEC VCC is 64-bits, one for each > -// Vector ALU) and then the ScalarALU will AND the VCC register with the > -// EXEC to update the predicates. > -// > -// For example: > -// %VCC = V_CMP_GT_F32 %VGPR1, %VGPR2 > -// SI_IF_NZ %VCC > -// %VGPR0 = V_ADD_F32 %VGPR0, %VGPR0 > -// ELSE > -// %VGPR0 = V_SUB_F32 %VGPR0, %VGPR0 > -// ENDIF > -// > -// becomes: > -// > -// %SGPR0 = S_MOV_B64 %EXEC // Save the current exec mask > -// %EXEC = S_AND_B64 %VCC, %EXEC // Update the exec mask > -// %SGPR0 = S_XOR_B64 %SGPR0, %EXEC // Clear live bits from saved exec mask > -// S_CBRANCH_EXECZ label0 // This instruction is an > -// // optimization which allows us to > -// // branch if all the bits of > -// // EXEC are zero. > -// %VGPR0 = V_ADD_F32 %VGPR0, %VGPR0 // Do the IF block of the branch > -// > -// label0: > -// %SGPR2 = S_MOV_B64 %EXEC // Save the current exec mask > -// %EXEC = S_MOV_B64 %SGPR0 // Restore the exec mask for the Then > block > -// %SGPR0 = S_MOV_B64 %SGPR2 // Save the exec mask from the If block > -// S_BRANCH_EXECZ label1 // Use our branch optimization > -// // instruction again. 
> -// %VGPR0 = V_SUB_F32 %VGPR0, %VGPR // Do the THEN block > -// label1: > -// %EXEC = S_OR_B64 %EXEC, %SGPR0 // Re-enable saved exec mask bits > -//===----------------------------------------------------------------------===// > - > -#include "AMDGPU.h" > -#include "SIInstrInfo.h" > -#include "SIMachineFunctionInfo.h" > -#include "llvm/CodeGen/MachineFunction.h" > -#include "llvm/CodeGen/MachineFunctionPass.h" > -#include "llvm/CodeGen/MachineInstrBuilder.h" > -#include "llvm/CodeGen/MachineRegisterInfo.h" > - > -using namespace llvm; > - > -namespace { > - > -class SILowerFlowControlPass : public MachineFunctionPass { > - > -private: > - static char ID; > - const TargetInstrInfo *TII; > - std::vector<unsigned> PredicateStack; > - std::vector<unsigned> UnusedRegisters; > - > - void pushExecMask(MachineBasicBlock &MBB, MachineBasicBlock::iterator I); > - void popExecMask(MachineBasicBlock &MBB, MachineBasicBlock::iterator I); > - > -public: > - SILowerFlowControlPass(TargetMachine &tm) : > - MachineFunctionPass(ID), TII(tm.getInstrInfo()) { } > - > - virtual bool runOnMachineFunction(MachineFunction &MF); > - > - const char *getPassName() const { > - return "SI Lower flow control instructions"; > - } > - > -}; > - > -} // End anonymous namespace > - > -char SILowerFlowControlPass::ID = 0; > - > -FunctionPass *llvm::createSILowerFlowControlPass(TargetMachine &tm) { > - return new SILowerFlowControlPass(tm); > -} > - > -bool SILowerFlowControlPass::runOnMachineFunction(MachineFunction &MF) { > - > - // Find all the unused registers that can be used for the predicate stack. 
> - for (TargetRegisterClass::iterator I = AMDGPU::SReg_64RegClass.begin(), > - S = AMDGPU::SReg_64RegClass.end(); > - I != S; ++I) { > - unsigned Reg = *I; > - if (!MF.getRegInfo().isPhysRegUsed(Reg)) { > - UnusedRegisters.insert(UnusedRegisters.begin(), Reg); > - } > - } > - > - for (MachineFunction::iterator BB = MF.begin(), BB_E = MF.end(); > - BB != BB_E; ++BB) { > - MachineBasicBlock &MBB = *BB; > - for (MachineBasicBlock::iterator I = MBB.begin(), Next = llvm::next(I); > - I != MBB.end(); I = Next) { > - Next = llvm::next(I); > - MachineInstr &MI = *I; > - switch (MI.getOpcode()) { > - default: break; > - case AMDGPU::SI_IF_NZ: > - pushExecMask(MBB, I); > - BuildMI(MBB, I, MBB.findDebugLoc(I), TII->get(AMDGPU::S_AND_B64), > - AMDGPU::EXEC) > - .addOperand(MI.getOperand(0)) // VCC > - .addReg(AMDGPU::EXEC); > - BuildMI(MBB, I, MBB.findDebugLoc(I), TII->get(AMDGPU::S_XOR_B64), > - PredicateStack.back()) > - .addReg(PredicateStack.back()) > - .addReg(AMDGPU::EXEC); > - MI.eraseFromParent(); > - break; > - case AMDGPU::ELSE: > - BuildMI(MBB, I, MBB.findDebugLoc(I), TII->get(AMDGPU::S_MOV_B64), > - UnusedRegisters.back()) > - .addReg(AMDGPU::EXEC); > - BuildMI(MBB, I, MBB.findDebugLoc(I), TII->get(AMDGPU::S_MOV_B64), > - AMDGPU::EXEC) > - .addReg(PredicateStack.back()); > - BuildMI(MBB, I, MBB.findDebugLoc(I), TII->get(AMDGPU::S_MOV_B64), > - PredicateStack.back()) > - .addReg(UnusedRegisters.back()); > - MI.eraseFromParent(); > - break; > - case AMDGPU::ENDIF: > - popExecMask(MBB, I); > - if (MF.getInfo<SIMachineFunctionInfo>()->ShaderType == > ShaderType::PIXEL && > - PredicateStack.empty()) { > - // If the exec mask is non-zero, skip the next two instructions > - BuildMI(MBB, I, MBB.findDebugLoc(I), > TII->get(AMDGPU::S_CBRANCH_EXECNZ)) > - .addImm(3) > - .addReg(AMDGPU::EXEC); > - > - // Exec mask is zero: Export to NULL target... 
> - BuildMI(MBB, I, MBB.findDebugLoc(I), TII->get(AMDGPU::EXP)) > - .addImm(0) > - .addImm(0x09) // V_008DFC_SQ_EXP_NULL > - .addImm(0) > - .addImm(1) > - .addImm(1) > - .addReg(AMDGPU::SREG_LIT_0) > - .addReg(AMDGPU::SREG_LIT_0) > - .addReg(AMDGPU::SREG_LIT_0) > - .addReg(AMDGPU::SREG_LIT_0); > - > - // ... and terminate wavefront > - BuildMI(MBB, I, MBB.findDebugLoc(I), TII->get(AMDGPU::S_ENDPGM)); > - } > - MI.eraseFromParent(); > - break; > - } > - } > - } > - return false; > -} > - > -void SILowerFlowControlPass::pushExecMask(MachineBasicBlock &MBB, > - MachineBasicBlock::iterator I) { > - > - assert(!UnusedRegisters.empty() && "Ran out of registers for predicate > stack"); > - unsigned StackReg = UnusedRegisters.back(); > - UnusedRegisters.pop_back(); > - PredicateStack.push_back(StackReg); > - BuildMI(MBB, I, MBB.findDebugLoc(I), TII->get(AMDGPU::S_MOV_B64), > - StackReg) > - .addReg(AMDGPU::EXEC); > -} > - > -void SILowerFlowControlPass::popExecMask(MachineBasicBlock &MBB, > - MachineBasicBlock::iterator I) { > - unsigned StackReg = PredicateStack.back(); > - PredicateStack.pop_back(); > - UnusedRegisters.push_back(StackReg); > - BuildMI(MBB, I, MBB.findDebugLoc(I), TII->get(AMDGPU::S_OR_B64), > - AMDGPU::EXEC) > - .addReg(AMDGPU::EXEC) > - .addReg(StackReg); > -} > -- > 1.7.10.4 > > _______________________________________________ > mesa-dev mailing list > [email protected] > http://lists.freedesktop.org/mailman/listinfo/mesa-dev _______________________________________________ mesa-dev mailing list [email protected] http://lists.freedesktop.org/mailman/listinfo/mesa-dev
