g in SI code.

Tom Stellard Fri, 09 Nov 2012 12:08:17 -0800

On Fri, Nov 09, 2012 at 04:13:51PM +0100, Michel Dänzer wrote:
> From: Michel Dänzer <[email protected]>
> 
>


I've always used "Flow Control", but Wikipedia and one person in
#llvm agree that "Control Flow" is correct.  Thanks for pointing out
this mistake.  Now I can sound smart when I talk to other compiler
writers. :)


Reviewed-by: Tom Stellard <[email protected]>
> Signed-off-by: Michel Dänzer <[email protected]>
> ---
> 
> This patch applies on top of Christian's SGPR liveness patch.
> 
>  lib/Target/AMDGPU/AMDGPU.h                |    2 +-
>  lib/Target/AMDGPU/AMDGPUTargetMachine.cpp |    2 +-
>  lib/Target/AMDGPU/CMakeLists.txt          |    2 +-
>  lib/Target/AMDGPU/SIFixSGPRLiveness.cpp   |    2 +-
>  lib/Target/AMDGPU/SILowerControlFlow.cpp  |  193 
> +++++++++++++++++++++++++++++
>  lib/Target/AMDGPU/SILowerFlowControl.cpp  |  193 
> -----------------------------
>  6 files changed, 197 insertions(+), 197 deletions(-)
>  create mode 100644 lib/Target/AMDGPU/SILowerControlFlow.cpp
>  delete mode 100644 lib/Target/AMDGPU/SILowerFlowControl.cpp
> 
> diff --git a/lib/Target/AMDGPU/AMDGPU.h b/lib/Target/AMDGPU/AMDGPU.h
> index 33a74dc..2a06ade 100644
> --- a/lib/Target/AMDGPU/AMDGPU.h
> +++ b/lib/Target/AMDGPU/AMDGPU.h
> @@ -25,7 +25,7 @@ FunctionPass 
> *createR600ExpandSpecialInstrsPass(TargetMachine &tm);
>  
>  // SI Passes
>  FunctionPass *createSIAssignInterpRegsPass(TargetMachine &tm);
> -FunctionPass *createSILowerFlowControlPass(TargetMachine &tm);
> +FunctionPass *createSILowerControlFlowPass(TargetMachine &tm);
>  FunctionPass *createSICodeEmitterPass(formatted_raw_ostream &OS);
>  FunctionPass *createSILowerLiteralConstantsPass(TargetMachine &tm);
>  FunctionPass *createSIFixSGPRLivenessPass(TargetMachine &tm);
> diff --git a/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp 
> b/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
> index 024ff3d..5c4af91 100644
> --- a/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
> +++ b/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
> @@ -137,7 +137,7 @@ bool AMDGPUPassConfig::addPreEmitPass() {
>      addPass(&FinalizeMachineBundlesID);
>    } else {
>      addPass(createSILowerLiteralConstantsPass(*TM));
> -    addPass(createSILowerFlowControlPass(*TM));
> +    addPass(createSILowerControlFlowPass(*TM));
>    }
>  
>    return false;
> diff --git a/lib/Target/AMDGPU/CMakeLists.txt 
> b/lib/Target/AMDGPU/CMakeLists.txt
> index 5e013f6..cd3f174 100644
> --- a/lib/Target/AMDGPU/CMakeLists.txt
> +++ b/lib/Target/AMDGPU/CMakeLists.txt
> @@ -41,7 +41,7 @@ add_llvm_target(AMDGPUCodeGen
>    SIInstrInfo.cpp
>    SIISelLowering.cpp
>    SILowerLiteralConstants.cpp
> -  SILowerFlowControl.cpp
> +  SILowerControlFlow.cpp
>    SIMachineFunctionInfo.cpp
>    SIRegisterInfo.cpp
>    SIPreColorSGPRs.cpp
> diff --git a/lib/Target/AMDGPU/SIFixSGPRLiveness.cpp 
> b/lib/Target/AMDGPU/SIFixSGPRLiveness.cpp
> index 028753e..f92eff5 100644
> --- a/lib/Target/AMDGPU/SIFixSGPRLiveness.cpp
> +++ b/lib/Target/AMDGPU/SIFixSGPRLiveness.cpp
> @@ -7,7 +7,7 @@
>  //
>  
> //===----------------------------------------------------------------------===//
>  //
> -// SGPRs are not affected by flow control. This pass adjust SGPR liveness in
> +// SGPRs are not affected by control flow. This pass adjust SGPR liveness in
>  // so that the register allocator can still correctly allocate them.
>  //
>  
> //===----------------------------------------------------------------------===//
> diff --git a/lib/Target/AMDGPU/SILowerControlFlow.cpp 
> b/lib/Target/AMDGPU/SILowerControlFlow.cpp
> new file mode 100644
> index 0000000..b43fdeb
> --- /dev/null
> +++ b/lib/Target/AMDGPU/SILowerControlFlow.cpp
> @@ -0,0 +1,193 @@
> +//===-- SILowerControlFlow.cpp - Use predicates for control flow 
> ----------===//
> +//
> +//                     The LLVM Compiler Infrastructure
> +//
> +// This file is distributed under the University of Illinois Open Source
> +// License. See LICENSE.TXT for details.
> +//
> +//===----------------------------------------------------------------------===//
> +//
> +// This pass lowers the pseudo control flow instructions (SI_IF_NZ, ELSE, 
> ENDIF)
> +// to predicated instructions.
> +//
> +// All control flow (except loops) is handled using predicated instructions 
> and
> +// a predicate stack.  Each Scalar ALU controls the operations of 64 Vector
> +// ALUs.  The Scalar ALU can update the predicate for any of the Vector ALUs
> +// by writting to the 64-bit EXEC register (each bit corresponds to a
> +// single vector ALU).  Typically, for predicates, a vector ALU will write
> +// to its bit of the VCC register (like EXEC VCC is 64-bits, one for each
> +// Vector ALU) and then the ScalarALU will AND the VCC register with the
> +// EXEC to update the predicates.
> +//
> +// For example:
> +// %VCC = V_CMP_GT_F32 %VGPR1, %VGPR2
> +// SI_IF_NZ %VCC
> +//   %VGPR0 = V_ADD_F32 %VGPR0, %VGPR0
> +// ELSE
> +//   %VGPR0 = V_SUB_F32 %VGPR0, %VGPR0
> +// ENDIF
> +//
> +// becomes:
> +//
> +// %SGPR0 = S_MOV_B64 %EXEC          // Save the current exec mask
> +// %EXEC = S_AND_B64 %VCC, %EXEC     // Update the exec mask
> +// %SGPR0 = S_XOR_B64 %SGPR0, %EXEC  // Clear live bits from saved exec mask
> +// S_CBRANCH_EXECZ label0            // This instruction is an
> +//                                   // optimization which allows us to
> +//                                   // branch if all the bits of
> +//                                   // EXEC are zero.
> +// %VGPR0 = V_ADD_F32 %VGPR0, %VGPR0 // Do the IF block of the branch
> +//
> +// label0:
> +// %SGPR2 = S_MOV_B64 %EXEC           // Save the current exec mask
> +// %EXEC = S_MOV_B64 %SGPR0           // Restore the exec mask for the Then 
> block
> +// %SGPR0 = S_MOV_B64 %SGPR2          // Save the exec mask from the If block
> +// S_BRANCH_EXECZ label1              // Use our branch optimization
> +//                                    // instruction again.
> +// %VGPR0 = V_SUB_F32 %VGPR0, %VGPR   // Do the THEN block
> +// label1:
> +// %EXEC = S_OR_B64 %EXEC, %SGPR0     // Re-enable saved exec mask bits
> +//===----------------------------------------------------------------------===//
> +
> +#include "AMDGPU.h"
> +#include "SIInstrInfo.h"
> +#include "SIMachineFunctionInfo.h"
> +#include "llvm/CodeGen/MachineFunction.h"
> +#include "llvm/CodeGen/MachineFunctionPass.h"
> +#include "llvm/CodeGen/MachineInstrBuilder.h"
> +#include "llvm/CodeGen/MachineRegisterInfo.h"
> +
> +using namespace llvm;
> +
> +namespace {
> +
> +class SILowerControlFlowPass : public MachineFunctionPass {
> +
> +private:
> +  static char ID;
> +  const TargetInstrInfo *TII;
> +  std::vector<unsigned> PredicateStack;
> +  std::vector<unsigned> UnusedRegisters;
> +
> +  void pushExecMask(MachineBasicBlock &MBB, MachineBasicBlock::iterator I);
> +  void popExecMask(MachineBasicBlock &MBB, MachineBasicBlock::iterator I);
> +
> +public:
> +  SILowerControlFlowPass(TargetMachine &tm) :
> +    MachineFunctionPass(ID), TII(tm.getInstrInfo()) { }
> +
> +  virtual bool runOnMachineFunction(MachineFunction &MF);
> +
> +  const char *getPassName() const {
> +    return "SI Lower control flow instructions";
> +  }
> +
> +};
> +
> +} // End anonymous namespace
> +
> +char SILowerControlFlowPass::ID = 0;
> +
> +FunctionPass *llvm::createSILowerControlFlowPass(TargetMachine &tm) {
> +  return new SILowerControlFlowPass(tm);
> +}
> +
> +bool SILowerControlFlowPass::runOnMachineFunction(MachineFunction &MF) {
> +
> +  // Find all the unused registers that can be used for the predicate stack.
> +  for (TargetRegisterClass::iterator I = AMDGPU::SReg_64RegClass.begin(),
> +                                     S = AMDGPU::SReg_64RegClass.end();
> +                                     I != S; ++I) {
> +    unsigned Reg = *I;
> +    if (!MF.getRegInfo().isPhysRegUsed(Reg)) {
> +      UnusedRegisters.insert(UnusedRegisters.begin(), Reg);
> +    }
> +  }
> +
> +  for (MachineFunction::iterator BB = MF.begin(), BB_E = MF.end();
> +                                                  BB != BB_E; ++BB) {
> +    MachineBasicBlock &MBB = *BB;
> +    for (MachineBasicBlock::iterator I = MBB.begin(), Next = llvm::next(I);
> +                               I != MBB.end(); I = Next) {
> +      Next = llvm::next(I);
> +      MachineInstr &MI = *I;
> +      switch (MI.getOpcode()) {
> +        default: break;
> +        case AMDGPU::SI_IF_NZ:
> +          pushExecMask(MBB, I);
> +          BuildMI(MBB, I, MBB.findDebugLoc(I), TII->get(AMDGPU::S_AND_B64),
> +                  AMDGPU::EXEC)
> +                  .addOperand(MI.getOperand(0)) // VCC
> +                  .addReg(AMDGPU::EXEC);
> +          BuildMI(MBB, I, MBB.findDebugLoc(I), TII->get(AMDGPU::S_XOR_B64),
> +                  PredicateStack.back())
> +                  .addReg(PredicateStack.back())
> +                  .addReg(AMDGPU::EXEC);
> +          MI.eraseFromParent();
> +          break;
> +        case AMDGPU::ELSE:
> +          BuildMI(MBB, I, MBB.findDebugLoc(I), TII->get(AMDGPU::S_MOV_B64),
> +                  UnusedRegisters.back())
> +                  .addReg(AMDGPU::EXEC);
> +          BuildMI(MBB, I, MBB.findDebugLoc(I), TII->get(AMDGPU::S_MOV_B64),
> +                  AMDGPU::EXEC)
> +                  .addReg(PredicateStack.back());
> +          BuildMI(MBB, I, MBB.findDebugLoc(I), TII->get(AMDGPU::S_MOV_B64),
> +                  PredicateStack.back())
> +                  .addReg(UnusedRegisters.back());
> +          MI.eraseFromParent();
> +          break;
> +        case AMDGPU::ENDIF:
> +          popExecMask(MBB, I);
> +       if (MF.getInfo<SIMachineFunctionInfo>()->ShaderType == 
> ShaderType::PIXEL &&
> +           PredicateStack.empty()) {
> +            // If the exec mask is non-zero, skip the next two instructions
> +            BuildMI(MBB, I, MBB.findDebugLoc(I), 
> TII->get(AMDGPU::S_CBRANCH_EXECNZ))
> +                    .addImm(3)
> +                    .addReg(AMDGPU::EXEC);
> +
> +            // Exec mask is zero: Export to NULL target...
> +            BuildMI(MBB, I, MBB.findDebugLoc(I), TII->get(AMDGPU::EXP))
> +                    .addImm(0)
> +                    .addImm(0x09) // V_008DFC_SQ_EXP_NULL
> +                    .addImm(0)
> +                    .addImm(1)
> +                    .addImm(1)
> +                    .addReg(AMDGPU::SREG_LIT_0)
> +                    .addReg(AMDGPU::SREG_LIT_0)
> +                    .addReg(AMDGPU::SREG_LIT_0)
> +                    .addReg(AMDGPU::SREG_LIT_0);
> +
> +            // ... and terminate wavefront
> +            BuildMI(MBB, I, MBB.findDebugLoc(I), TII->get(AMDGPU::S_ENDPGM));
> +       }
> +          MI.eraseFromParent();
> +          break;
> +      }
> +    }
> +  }
> +  return false;
> +}
> +
> +void SILowerControlFlowPass::pushExecMask(MachineBasicBlock &MBB,
> +                                          MachineBasicBlock::iterator I) {
> +
> +  assert(!UnusedRegisters.empty() && "Ran out of registers for predicate 
> stack");
> +  unsigned StackReg = UnusedRegisters.back();
> +  UnusedRegisters.pop_back();
> +  PredicateStack.push_back(StackReg);
> +  BuildMI(MBB, I, MBB.findDebugLoc(I), TII->get(AMDGPU::S_MOV_B64),
> +          StackReg)
> +          .addReg(AMDGPU::EXEC);
> +}
> +
> +void SILowerControlFlowPass::popExecMask(MachineBasicBlock &MBB,
> +                                        MachineBasicBlock::iterator I) {
> +  unsigned StackReg = PredicateStack.back();
> +  PredicateStack.pop_back();
> +  UnusedRegisters.push_back(StackReg);
> +  BuildMI(MBB, I, MBB.findDebugLoc(I), TII->get(AMDGPU::S_OR_B64),
> +          AMDGPU::EXEC)
> +          .addReg(AMDGPU::EXEC)
> +          .addReg(StackReg);
> +}
> diff --git a/lib/Target/AMDGPU/SILowerFlowControl.cpp 
> b/lib/Target/AMDGPU/SILowerFlowControl.cpp
> deleted file mode 100644
> index 0d90c13..0000000
> --- a/lib/Target/AMDGPU/SILowerFlowControl.cpp
> +++ /dev/null
> @@ -1,193 +0,0 @@
> -//===-- SILowerFlowControl.cpp - Use predicates for flow control 
> ----------===//
> -//
> -//                     The LLVM Compiler Infrastructure
> -//
> -// This file is distributed under the University of Illinois Open Source
> -// License. See LICENSE.TXT for details.
> -//
> -//===----------------------------------------------------------------------===//
> -//
> -// This pass lowers the pseudo flow control instructions (SI_IF_NZ, ELSE, 
> ENDIF)
> -// to predicated instructions.
> -//
> -// All flow control (except loops) is handled using predicated instructions 
> and
> -// a predicate stack.  Each Scalar ALU controls the operations of 64 Vector
> -// ALUs.  The Scalar ALU can update the predicate for any of the Vector ALUs
> -// by writting to the 64-bit EXEC register (each bit corresponds to a
> -// single vector ALU).  Typically, for predicates, a vector ALU will write
> -// to its bit of the VCC register (like EXEC VCC is 64-bits, one for each
> -// Vector ALU) and then the ScalarALU will AND the VCC register with the
> -// EXEC to update the predicates.
> -//
> -// For example:
> -// %VCC = V_CMP_GT_F32 %VGPR1, %VGPR2
> -// SI_IF_NZ %VCC
> -//   %VGPR0 = V_ADD_F32 %VGPR0, %VGPR0
> -// ELSE
> -//   %VGPR0 = V_SUB_F32 %VGPR0, %VGPR0
> -// ENDIF
> -//
> -// becomes:
> -//
> -// %SGPR0 = S_MOV_B64 %EXEC          // Save the current exec mask
> -// %EXEC = S_AND_B64 %VCC, %EXEC     // Update the exec mask
> -// %SGPR0 = S_XOR_B64 %SGPR0, %EXEC  // Clear live bits from saved exec mask
> -// S_CBRANCH_EXECZ label0            // This instruction is an
> -//                                   // optimization which allows us to
> -//                                   // branch if all the bits of
> -//                                   // EXEC are zero.
> -// %VGPR0 = V_ADD_F32 %VGPR0, %VGPR0 // Do the IF block of the branch
> -//
> -// label0:
> -// %SGPR2 = S_MOV_B64 %EXEC           // Save the current exec mask
> -// %EXEC = S_MOV_B64 %SGPR0           // Restore the exec mask for the Then 
> block
> -// %SGPR0 = S_MOV_B64 %SGPR2          // Save the exec mask from the If block
> -// S_BRANCH_EXECZ label1              // Use our branch optimization
> -//                                    // instruction again.
> -// %VGPR0 = V_SUB_F32 %VGPR0, %VGPR   // Do the THEN block
> -// label1:
> -// %EXEC = S_OR_B64 %EXEC, %SGPR0     // Re-enable saved exec mask bits
> -//===----------------------------------------------------------------------===//
> -
> -#include "AMDGPU.h"
> -#include "SIInstrInfo.h"
> -#include "SIMachineFunctionInfo.h"
> -#include "llvm/CodeGen/MachineFunction.h"
> -#include "llvm/CodeGen/MachineFunctionPass.h"
> -#include "llvm/CodeGen/MachineInstrBuilder.h"
> -#include "llvm/CodeGen/MachineRegisterInfo.h"
> -
> -using namespace llvm;
> -
> -namespace {
> -
> -class SILowerFlowControlPass : public MachineFunctionPass {
> -
> -private:
> -  static char ID;
> -  const TargetInstrInfo *TII;
> -  std::vector<unsigned> PredicateStack;
> -  std::vector<unsigned> UnusedRegisters;
> -
> -  void pushExecMask(MachineBasicBlock &MBB, MachineBasicBlock::iterator I);
> -  void popExecMask(MachineBasicBlock &MBB, MachineBasicBlock::iterator I);
> -
> -public:
> -  SILowerFlowControlPass(TargetMachine &tm) :
> -    MachineFunctionPass(ID), TII(tm.getInstrInfo()) { }
> -
> -  virtual bool runOnMachineFunction(MachineFunction &MF);
> -
> -  const char *getPassName() const {
> -    return "SI Lower flow control instructions";
> -  }
> -
> -};
> -
> -} // End anonymous namespace
> -
> -char SILowerFlowControlPass::ID = 0;
> -
> -FunctionPass *llvm::createSILowerFlowControlPass(TargetMachine &tm) {
> -  return new SILowerFlowControlPass(tm);
> -}
> -
> -bool SILowerFlowControlPass::runOnMachineFunction(MachineFunction &MF) {
> -
> -  // Find all the unused registers that can be used for the predicate stack.
> -  for (TargetRegisterClass::iterator I = AMDGPU::SReg_64RegClass.begin(),
> -                                     S = AMDGPU::SReg_64RegClass.end();
> -                                     I != S; ++I) {
> -    unsigned Reg = *I;
> -    if (!MF.getRegInfo().isPhysRegUsed(Reg)) {
> -      UnusedRegisters.insert(UnusedRegisters.begin(), Reg);
> -    }
> -  }
> -
> -  for (MachineFunction::iterator BB = MF.begin(), BB_E = MF.end();
> -                                                  BB != BB_E; ++BB) {
> -    MachineBasicBlock &MBB = *BB;
> -    for (MachineBasicBlock::iterator I = MBB.begin(), Next = llvm::next(I);
> -                               I != MBB.end(); I = Next) {
> -      Next = llvm::next(I);
> -      MachineInstr &MI = *I;
> -      switch (MI.getOpcode()) {
> -        default: break;
> -        case AMDGPU::SI_IF_NZ:
> -          pushExecMask(MBB, I);
> -          BuildMI(MBB, I, MBB.findDebugLoc(I), TII->get(AMDGPU::S_AND_B64),
> -                  AMDGPU::EXEC)
> -                  .addOperand(MI.getOperand(0)) // VCC
> -                  .addReg(AMDGPU::EXEC);
> -          BuildMI(MBB, I, MBB.findDebugLoc(I), TII->get(AMDGPU::S_XOR_B64),
> -                  PredicateStack.back())
> -                  .addReg(PredicateStack.back())
> -                  .addReg(AMDGPU::EXEC);
> -          MI.eraseFromParent();
> -          break;
> -        case AMDGPU::ELSE:
> -          BuildMI(MBB, I, MBB.findDebugLoc(I), TII->get(AMDGPU::S_MOV_B64),
> -                  UnusedRegisters.back())
> -                  .addReg(AMDGPU::EXEC);
> -          BuildMI(MBB, I, MBB.findDebugLoc(I), TII->get(AMDGPU::S_MOV_B64),
> -                  AMDGPU::EXEC)
> -                  .addReg(PredicateStack.back());
> -          BuildMI(MBB, I, MBB.findDebugLoc(I), TII->get(AMDGPU::S_MOV_B64),
> -                  PredicateStack.back())
> -                  .addReg(UnusedRegisters.back());
> -          MI.eraseFromParent();
> -          break;
> -        case AMDGPU::ENDIF:
> -          popExecMask(MBB, I);
> -       if (MF.getInfo<SIMachineFunctionInfo>()->ShaderType == 
> ShaderType::PIXEL &&
> -           PredicateStack.empty()) {
> -            // If the exec mask is non-zero, skip the next two instructions
> -            BuildMI(MBB, I, MBB.findDebugLoc(I), 
> TII->get(AMDGPU::S_CBRANCH_EXECNZ))
> -                    .addImm(3)
> -                    .addReg(AMDGPU::EXEC);
> -
> -            // Exec mask is zero: Export to NULL target...
> -            BuildMI(MBB, I, MBB.findDebugLoc(I), TII->get(AMDGPU::EXP))
> -                    .addImm(0)
> -                    .addImm(0x09) // V_008DFC_SQ_EXP_NULL
> -                    .addImm(0)
> -                    .addImm(1)
> -                    .addImm(1)
> -                    .addReg(AMDGPU::SREG_LIT_0)
> -                    .addReg(AMDGPU::SREG_LIT_0)
> -                    .addReg(AMDGPU::SREG_LIT_0)
> -                    .addReg(AMDGPU::SREG_LIT_0);
> -
> -            // ... and terminate wavefront
> -            BuildMI(MBB, I, MBB.findDebugLoc(I), TII->get(AMDGPU::S_ENDPGM));
> -       }
> -          MI.eraseFromParent();
> -          break;
> -      }
> -    }
> -  }
> -  return false;
> -}
> -
> -void SILowerFlowControlPass::pushExecMask(MachineBasicBlock &MBB,
> -                                          MachineBasicBlock::iterator I) {
> -
> -  assert(!UnusedRegisters.empty() && "Ran out of registers for predicate 
> stack");
> -  unsigned StackReg = UnusedRegisters.back();
> -  UnusedRegisters.pop_back();
> -  PredicateStack.push_back(StackReg);
> -  BuildMI(MBB, I, MBB.findDebugLoc(I), TII->get(AMDGPU::S_MOV_B64),
> -          StackReg)
> -          .addReg(AMDGPU::EXEC);
> -}
> -
> -void SILowerFlowControlPass::popExecMask(MachineBasicBlock &MBB,
> -                                        MachineBasicBlock::iterator I) {
> -  unsigned StackReg = PredicateStack.back();
> -  PredicateStack.pop_back();
> -  UnusedRegisters.push_back(StackReg);
> -  BuildMI(MBB, I, MBB.findDebugLoc(I), TII->get(AMDGPU::S_OR_B64),
> -          AMDGPU::EXEC)
> -          .addReg(AMDGPU::EXEC)
> -          .addReg(StackReg);
> -}
> -- 
> 1.7.10.4
> 
> _______________________________________________
> mesa-dev mailing list
> [email protected]
> http://lists.freedesktop.org/mailman/listinfo/mesa-dev
_______________________________________________
mesa-dev mailing list
[email protected]
http://lists.freedesktop.org/mailman/listinfo/mesa-dev

Re: [Mesa-dev] [PATCH] AMDGPU: s/flow control/control flow/g in SI code.

Reply via email to