Re: [Mesa-dev] [PATCH] R600/SI: add a workaround for a scalar-memory-read hw bug on CI

2013-10-30 Thread Christian König

Am 30.10.2013 14:23, schrieb Marek Olšák:

From: Marek Olšák marek.ol...@amd.com

This also fixes scalar compare instructions which were always eliminated,
because they didn't have a destination of SCC.


Uff, that looks like quite a bit of overhead, isn't there a simpler 
approach? Like setting the the NumRecord to one and letting unused 
constants pointing to a dummy buffer or soemthing like this?


Christian.



Signed-off-by: Marek Olšák marek.ol...@amd.com
---
  lib/Target/R600/SIISelLowering.cpp | 30 ++
  lib/Target/R600/SIInsertWaits.cpp  |  6 ++
  lib/Target/R600/SIInstrInfo.td |  5 +
  lib/Target/R600/SIInstructions.td  | 26 +++---
  4 files changed, 52 insertions(+), 15 deletions(-)

diff --git a/lib/Target/R600/SIISelLowering.cpp 
b/lib/Target/R600/SIISelLowering.cpp
index 371572e..e9f4035 100644
--- a/lib/Target/R600/SIISelLowering.cpp
+++ b/lib/Target/R600/SIISelLowering.cpp
@@ -14,6 +14,7 @@
  
  #include SIISelLowering.h

  #include AMDGPU.h
+#include AMDGPUSubtarget.h
  #include AMDILIntrinsicInfo.h
  #include SIInstrInfo.h
  #include SIMachineFunctionInfo.h
@@ -302,14 +303,37 @@ MachineBasicBlock * 
SITargetLowering::EmitInstrWithCustomInserter(
  MachineInstr * MI, MachineBasicBlock * BB) const {
  
MachineBasicBlock::iterator I = *MI;

+  const SIInstrInfo *TII =
+static_castconst SIInstrInfo*(getTargetMachine().getInstrInfo());
+
+  // Sea Islands must conditionally execute SMRD instructions depending
+  // on the value of SQ_BUF_RSRC_WORD2.NUM_RECORDS, because the hardware
+  // doesn't skip the instructions if NUM_RECORDS is 0.
+  if (TII-isSMRD(MI-getOpcode())) {
+if (getTargetMachine().getSubtargetAMDGPUSubtarget().getGeneration() !=
+AMDGPUSubtarget::SEA_ISLANDS)
+  return BB;
+
+MachineRegisterInfo MRI = BB-getParent()-getRegInfo();
+unsigned NumRecords = MRI.createVirtualRegister(AMDGPU::SReg_32RegClass);
+
+// XXX should we save and restore the SCC register?
+BuildMI(*BB, I, MI-getDebugLoc(), TII-get(AMDGPU::COPY), NumRecords)
+.addReg(MI-getOperand(1).getReg(), 0, AMDGPU::sub2);
+BuildMI(*BB, I, MI-getDebugLoc(), TII-get(AMDGPU::S_CMPK_EQ_U32), 
AMDGPU::SCC)
+.addReg(NumRecords)
+.addImm(0);
+BuildMI(*BB, I, MI-getDebugLoc(), TII-get(AMDGPU::S_CBRANCH_SCC1))
+.addImm(1)
+.addReg(AMDGPU::SCC);
+return BB;
+  }
  
switch (MI-getOpcode()) {

default:
  return AMDGPUTargetLowering::EmitInstrWithCustomInserter(MI, BB);
case AMDGPU::BRANCH: return BB;
case AMDGPU::SI_ADDR64_RSRC: {
-const SIInstrInfo *TII =
-  static_castconst SIInstrInfo*(getTargetMachine().getInstrInfo());
  MachineRegisterInfo MRI = BB-getParent()-getRegInfo();
  unsigned SuperReg = MI-getOperand(0).getReg();
  unsigned SubRegLo = MRI.createVirtualRegister(AMDGPU::SReg_64RegClass);
@@ -336,8 +360,6 @@ MachineBasicBlock * 
SITargetLowering::EmitInstrWithCustomInserter(
  break;
}
case AMDGPU::V_SUB_F64: {
-const SIInstrInfo *TII =
-  static_castconst SIInstrInfo*(getTargetMachine().getInstrInfo());
  BuildMI(*BB, I, MI-getDebugLoc(), TII-get(AMDGPU::V_ADD_F64),
  MI-getOperand(0).getReg())
  .addReg(MI-getOperand(1).getReg())
diff --git a/lib/Target/R600/SIInsertWaits.cpp 
b/lib/Target/R600/SIInsertWaits.cpp
index 7e42fb7..2e47346 100644
--- a/lib/Target/R600/SIInsertWaits.cpp
+++ b/lib/Target/R600/SIInsertWaits.cpp
@@ -294,6 +294,12 @@ bool SIInsertWaits::insertWait(MachineBasicBlock MBB,
if (Counts.Named.EXP == 0)
  ExpInstrTypesSeen = 0;
  
+  // Ensure S_WAITCNT is inserted before S_CBRANCH.

+  MachineBasicBlock::iterator beforeI = I;
+  --beforeI;
+  if (beforeI-getOpcode() == AMDGPU::S_CBRANCH_SCC1)
+I = beforeI;
+
// Build the wait instruction
BuildMI(MBB, I, DebugLoc(), TII-get(AMDGPU::S_WAITCNT))
.addImm((Counts.Named.VM  0xF) |
diff --git a/lib/Target/R600/SIInstrInfo.td b/lib/Target/R600/SIInstrInfo.td
index ed42a2a..9567879 100644
--- a/lib/Target/R600/SIInstrInfo.td
+++ b/lib/Target/R600/SIInstrInfo.td
@@ -177,6 +177,11 @@ class SOPC_32 bits7 op, string opName, listdag pattern 
: SOPC 
opName# $dst, $src0, $src1, pattern
  ;
  
+class SOPCK_32 bits7 op, string opName, listdag pattern : SOPC 

+  op, (outs SCCReg:$dst), (ins SReg_32:$src0, i16imm:$src1),
+  opName# $dst, $src0, $src1, pattern
+;
+
  class SOPC_64 bits7 op, string opName, listdag pattern : SOPC 
op, (outs SCCReg:$dst), (ins SSrc_64:$src0, SSrc_64:$src1),
opName# $dst, $src0, $src1, pattern
diff --git a/lib/Target/R600/SIInstructions.td 
b/lib/Target/R600/SIInstructions.td
index 048c157..1b275a7 100644
--- a/lib/Target/R600/SIInstructions.td
+++ b/lib/Target/R600/SIInstructions.td
@@ -115,17 +115,17 @@ def S_CMPK_EQ_I32 : SOPK 
  */
  
  let isCompare = 1 in {

-def S_CMPK_LG_I32 : SOPK_32 0x0004, S_CMPK_LG_I32, [];
-def S_CMPK_GT_I32 : SOPK_32 

Re: [Mesa-dev] [PATCH] R600/SI: add a workaround for a scalar-memory-read hw bug on CI

2013-10-30 Thread Marek Olšák
I thought that doing S_CMPK followed by S_CBRANCH has less overhead
than doing a memory read. If we used one of
S_BUFFER_LOAD_DWORDX2,4,8,16, it wouldn't be so bad. I don't know.

Marek

On Wed, Oct 30, 2013 at 2:48 PM, Christian König
deathsim...@vodafone.de wrote:
 Am 30.10.2013 14:23, schrieb Marek Olšák:

 From: Marek Olšák marek.ol...@amd.com

 This also fixes scalar compare instructions which were always eliminated,
 because they didn't have a destination of SCC.


 Uff, that looks like quite a bit of overhead, isn't there a simpler
 approach? Like setting the the NumRecord to one and letting unused constants
 pointing to a dummy buffer or soemthing like this?

 Christian.



 Signed-off-by: Marek Olšák marek.ol...@amd.com
 ---
   lib/Target/R600/SIISelLowering.cpp | 30 ++
   lib/Target/R600/SIInsertWaits.cpp  |  6 ++
   lib/Target/R600/SIInstrInfo.td |  5 +
   lib/Target/R600/SIInstructions.td  | 26 +++---
   4 files changed, 52 insertions(+), 15 deletions(-)

 diff --git a/lib/Target/R600/SIISelLowering.cpp
 b/lib/Target/R600/SIISelLowering.cpp
 index 371572e..e9f4035 100644
 --- a/lib/Target/R600/SIISelLowering.cpp
 +++ b/lib/Target/R600/SIISelLowering.cpp
 @@ -14,6 +14,7 @@
 #include SIISelLowering.h
   #include AMDGPU.h
 +#include AMDGPUSubtarget.h
   #include AMDILIntrinsicInfo.h
   #include SIInstrInfo.h
   #include SIMachineFunctionInfo.h
 @@ -302,14 +303,37 @@ MachineBasicBlock *
 SITargetLowering::EmitInstrWithCustomInserter(
   MachineInstr * MI, MachineBasicBlock * BB) const {
   MachineBasicBlock::iterator I = *MI;
 +  const SIInstrInfo *TII =
 +static_castconst SIInstrInfo*(getTargetMachine().getInstrInfo());
 +
 +  // Sea Islands must conditionally execute SMRD instructions depending
 +  // on the value of SQ_BUF_RSRC_WORD2.NUM_RECORDS, because the hardware
 +  // doesn't skip the instructions if NUM_RECORDS is 0.
 +  if (TII-isSMRD(MI-getOpcode())) {
 +if
 (getTargetMachine().getSubtargetAMDGPUSubtarget().getGeneration() !=
 +AMDGPUSubtarget::SEA_ISLANDS)
 +  return BB;
 +
 +MachineRegisterInfo MRI = BB-getParent()-getRegInfo();
 +unsigned NumRecords =
 MRI.createVirtualRegister(AMDGPU::SReg_32RegClass);
 +
 +// XXX should we save and restore the SCC register?
 +BuildMI(*BB, I, MI-getDebugLoc(), TII-get(AMDGPU::COPY),
 NumRecords)
 +.addReg(MI-getOperand(1).getReg(), 0, AMDGPU::sub2);
 +BuildMI(*BB, I, MI-getDebugLoc(), TII-get(AMDGPU::S_CMPK_EQ_U32),
 AMDGPU::SCC)
 +.addReg(NumRecords)
 +.addImm(0);
 +BuildMI(*BB, I, MI-getDebugLoc(), TII-get(AMDGPU::S_CBRANCH_SCC1))
 +.addImm(1)
 +.addReg(AMDGPU::SCC);
 +return BB;
 +  }
   switch (MI-getOpcode()) {
 default:
   return AMDGPUTargetLowering::EmitInstrWithCustomInserter(MI, BB);
 case AMDGPU::BRANCH: return BB;
 case AMDGPU::SI_ADDR64_RSRC: {
 -const SIInstrInfo *TII =
 -  static_castconst SIInstrInfo*(getTargetMachine().getInstrInfo());
   MachineRegisterInfo MRI = BB-getParent()-getRegInfo();
   unsigned SuperReg = MI-getOperand(0).getReg();
   unsigned SubRegLo =
 MRI.createVirtualRegister(AMDGPU::SReg_64RegClass);
 @@ -336,8 +360,6 @@ MachineBasicBlock *
 SITargetLowering::EmitInstrWithCustomInserter(
   break;
 }
 case AMDGPU::V_SUB_F64: {
 -const SIInstrInfo *TII =
 -  static_castconst SIInstrInfo*(getTargetMachine().getInstrInfo());
   BuildMI(*BB, I, MI-getDebugLoc(), TII-get(AMDGPU::V_ADD_F64),
   MI-getOperand(0).getReg())
   .addReg(MI-getOperand(1).getReg())
 diff --git a/lib/Target/R600/SIInsertWaits.cpp
 b/lib/Target/R600/SIInsertWaits.cpp
 index 7e42fb7..2e47346 100644
 --- a/lib/Target/R600/SIInsertWaits.cpp
 +++ b/lib/Target/R600/SIInsertWaits.cpp
 @@ -294,6 +294,12 @@ bool SIInsertWaits::insertWait(MachineBasicBlock
 MBB,
 if (Counts.Named.EXP == 0)
   ExpInstrTypesSeen = 0;
   +  // Ensure S_WAITCNT is inserted before S_CBRANCH.
 +  MachineBasicBlock::iterator beforeI = I;
 +  --beforeI;
 +  if (beforeI-getOpcode() == AMDGPU::S_CBRANCH_SCC1)
 +I = beforeI;
 +
 // Build the wait instruction
 BuildMI(MBB, I, DebugLoc(), TII-get(AMDGPU::S_WAITCNT))
 .addImm((Counts.Named.VM  0xF) |
 diff --git a/lib/Target/R600/SIInstrInfo.td
 b/lib/Target/R600/SIInstrInfo.td
 index ed42a2a..9567879 100644
 --- a/lib/Target/R600/SIInstrInfo.td
 +++ b/lib/Target/R600/SIInstrInfo.td
 @@ -177,6 +177,11 @@ class SOPC_32 bits7 op, string opName, listdag
 pattern : SOPC 
 opName# $dst, $src0, $src1, pattern
   ;
   +class SOPCK_32 bits7 op, string opName, listdag pattern : SOPC 
 +  op, (outs SCCReg:$dst), (ins SReg_32:$src0, i16imm:$src1),
 +  opName# $dst, $src0, $src1, pattern
 +;
 +
   class SOPC_64 bits7 op, string opName, listdag pattern : SOPC 
 op, (outs SCCReg:$dst), (ins SSrc_64:$src0, SSrc_64:$src1),
 opName# $dst, $src0, $src1, pattern
 diff 

Re: [Mesa-dev] [PATCH] R600/SI: add a workaround for a scalar-memory-read hw bug on CI

2013-10-30 Thread Christian König
Mhm, I'm assumed that having NumRecord zero is actually something quite 
unusual. E.g. a shader that accesses a not defined constant buffer or 
something like that. So I would rather optimize for the common use case.


Anyway branch instructions are quite expensive, you can issue something 
between 12 and 16 arithmetic instructions before they make sense to use 
instead of a conditional move. Not sure how the relation is with memory 
moves but I guess that it would be better to avoid them.


Christian.

Am 30.10.2013 14:59, schrieb Marek Olšák:

I thought that doing S_CMPK followed by S_CBRANCH has less overhead
than doing a memory read. If we used one of
S_BUFFER_LOAD_DWORDX2,4,8,16, it wouldn't be so bad. I don't know.

Marek

On Wed, Oct 30, 2013 at 2:48 PM, Christian König
deathsim...@vodafone.de wrote:

Am 30.10.2013 14:23, schrieb Marek Olšák:


From: Marek Olšák marek.ol...@amd.com

This also fixes scalar compare instructions which were always eliminated,
because they didn't have a destination of SCC.


Uff, that looks like quite a bit of overhead, isn't there a simpler
approach? Like setting the the NumRecord to one and letting unused constants
pointing to a dummy buffer or soemthing like this?

Christian.



Signed-off-by: Marek Olšák marek.ol...@amd.com
---
   lib/Target/R600/SIISelLowering.cpp | 30 ++
   lib/Target/R600/SIInsertWaits.cpp  |  6 ++
   lib/Target/R600/SIInstrInfo.td |  5 +
   lib/Target/R600/SIInstructions.td  | 26 +++---
   4 files changed, 52 insertions(+), 15 deletions(-)

diff --git a/lib/Target/R600/SIISelLowering.cpp
b/lib/Target/R600/SIISelLowering.cpp
index 371572e..e9f4035 100644
--- a/lib/Target/R600/SIISelLowering.cpp
+++ b/lib/Target/R600/SIISelLowering.cpp
@@ -14,6 +14,7 @@
 #include SIISelLowering.h
   #include AMDGPU.h
+#include AMDGPUSubtarget.h
   #include AMDILIntrinsicInfo.h
   #include SIInstrInfo.h
   #include SIMachineFunctionInfo.h
@@ -302,14 +303,37 @@ MachineBasicBlock *
SITargetLowering::EmitInstrWithCustomInserter(
   MachineInstr * MI, MachineBasicBlock * BB) const {
   MachineBasicBlock::iterator I = *MI;
+  const SIInstrInfo *TII =
+static_castconst SIInstrInfo*(getTargetMachine().getInstrInfo());
+
+  // Sea Islands must conditionally execute SMRD instructions depending
+  // on the value of SQ_BUF_RSRC_WORD2.NUM_RECORDS, because the hardware
+  // doesn't skip the instructions if NUM_RECORDS is 0.
+  if (TII-isSMRD(MI-getOpcode())) {
+if
(getTargetMachine().getSubtargetAMDGPUSubtarget().getGeneration() !=
+AMDGPUSubtarget::SEA_ISLANDS)
+  return BB;
+
+MachineRegisterInfo MRI = BB-getParent()-getRegInfo();
+unsigned NumRecords =
MRI.createVirtualRegister(AMDGPU::SReg_32RegClass);
+
+// XXX should we save and restore the SCC register?
+BuildMI(*BB, I, MI-getDebugLoc(), TII-get(AMDGPU::COPY),
NumRecords)
+.addReg(MI-getOperand(1).getReg(), 0, AMDGPU::sub2);
+BuildMI(*BB, I, MI-getDebugLoc(), TII-get(AMDGPU::S_CMPK_EQ_U32),
AMDGPU::SCC)
+.addReg(NumRecords)
+.addImm(0);
+BuildMI(*BB, I, MI-getDebugLoc(), TII-get(AMDGPU::S_CBRANCH_SCC1))
+.addImm(1)
+.addReg(AMDGPU::SCC);
+return BB;
+  }
   switch (MI-getOpcode()) {
 default:
   return AMDGPUTargetLowering::EmitInstrWithCustomInserter(MI, BB);
 case AMDGPU::BRANCH: return BB;
 case AMDGPU::SI_ADDR64_RSRC: {
-const SIInstrInfo *TII =
-  static_castconst SIInstrInfo*(getTargetMachine().getInstrInfo());
   MachineRegisterInfo MRI = BB-getParent()-getRegInfo();
   unsigned SuperReg = MI-getOperand(0).getReg();
   unsigned SubRegLo =
MRI.createVirtualRegister(AMDGPU::SReg_64RegClass);
@@ -336,8 +360,6 @@ MachineBasicBlock *
SITargetLowering::EmitInstrWithCustomInserter(
   break;
 }
 case AMDGPU::V_SUB_F64: {
-const SIInstrInfo *TII =
-  static_castconst SIInstrInfo*(getTargetMachine().getInstrInfo());
   BuildMI(*BB, I, MI-getDebugLoc(), TII-get(AMDGPU::V_ADD_F64),
   MI-getOperand(0).getReg())
   .addReg(MI-getOperand(1).getReg())
diff --git a/lib/Target/R600/SIInsertWaits.cpp
b/lib/Target/R600/SIInsertWaits.cpp
index 7e42fb7..2e47346 100644
--- a/lib/Target/R600/SIInsertWaits.cpp
+++ b/lib/Target/R600/SIInsertWaits.cpp
@@ -294,6 +294,12 @@ bool SIInsertWaits::insertWait(MachineBasicBlock
MBB,
 if (Counts.Named.EXP == 0)
   ExpInstrTypesSeen = 0;
   +  // Ensure S_WAITCNT is inserted before S_CBRANCH.
+  MachineBasicBlock::iterator beforeI = I;
+  --beforeI;
+  if (beforeI-getOpcode() == AMDGPU::S_CBRANCH_SCC1)
+I = beforeI;
+
 // Build the wait instruction
 BuildMI(MBB, I, DebugLoc(), TII-get(AMDGPU::S_WAITCNT))
 .addImm((Counts.Named.VM  0xF) |
diff --git a/lib/Target/R600/SIInstrInfo.td
b/lib/Target/R600/SIInstrInfo.td
index ed42a2a..9567879 100644
--- a/lib/Target/R600/SIInstrInfo.td
+++ b/lib/Target/R600/SIInstrInfo.td
@@ 

Re: [Mesa-dev] [PATCH] R600/SI: add a workaround for a scalar-memory-read hw bug on CI

2013-10-30 Thread Marek Olšák
Yeah, it's unusual.

What if S_BUFFER_LOAD is also used by something else, like texture
buffers, or OpenCL? Will we have to fix that as well?

Marek

On Wed, Oct 30, 2013 at 3:32 PM, Christian König
deathsim...@vodafone.de wrote:
 Mhm, I'm assumed that having NumRecord zero is actually something quite
 unusual. E.g. a shader that accesses a not defined constant buffer or
 something like that. So I would rather optimize for the common use case.

 Anyway branch instructions are quite expensive, you can issue something
 between 12 and 16 arithmetic instructions before they make sense to use
 instead of a conditional move. Not sure how the relation is with memory
 moves but I guess that it would be better to avoid them.

 Christian.

 Am 30.10.2013 14:59, schrieb Marek Olšák:

 I thought that doing S_CMPK followed by S_CBRANCH has less overhead
 than doing a memory read. If we used one of
 S_BUFFER_LOAD_DWORDX2,4,8,16, it wouldn't be so bad. I don't know.

 Marek

 On Wed, Oct 30, 2013 at 2:48 PM, Christian König
 deathsim...@vodafone.de wrote:

 Am 30.10.2013 14:23, schrieb Marek Olšák:

 From: Marek Olšák marek.ol...@amd.com

 This also fixes scalar compare instructions which were always
 eliminated,
 because they didn't have a destination of SCC.


 Uff, that looks like quite a bit of overhead, isn't there a simpler
 approach? Like setting the the NumRecord to one and letting unused
 constants
 pointing to a dummy buffer or soemthing like this?

 Christian.


 Signed-off-by: Marek Olšák marek.ol...@amd.com
 ---
lib/Target/R600/SIISelLowering.cpp | 30
 ++
lib/Target/R600/SIInsertWaits.cpp  |  6 ++
lib/Target/R600/SIInstrInfo.td |  5 +
lib/Target/R600/SIInstructions.td  | 26 +++---
4 files changed, 52 insertions(+), 15 deletions(-)

 diff --git a/lib/Target/R600/SIISelLowering.cpp
 b/lib/Target/R600/SIISelLowering.cpp
 index 371572e..e9f4035 100644
 --- a/lib/Target/R600/SIISelLowering.cpp
 +++ b/lib/Target/R600/SIISelLowering.cpp
 @@ -14,6 +14,7 @@
  #include SIISelLowering.h
#include AMDGPU.h
 +#include AMDGPUSubtarget.h
#include AMDILIntrinsicInfo.h
#include SIInstrInfo.h
#include SIMachineFunctionInfo.h
 @@ -302,14 +303,37 @@ MachineBasicBlock *
 SITargetLowering::EmitInstrWithCustomInserter(
MachineInstr * MI, MachineBasicBlock * BB) const {
MachineBasicBlock::iterator I = *MI;
 +  const SIInstrInfo *TII =
 +static_castconst SIInstrInfo*(getTargetMachine().getInstrInfo());
 +
 +  // Sea Islands must conditionally execute SMRD instructions depending
 +  // on the value of SQ_BUF_RSRC_WORD2.NUM_RECORDS, because the
 hardware
 +  // doesn't skip the instructions if NUM_RECORDS is 0.
 +  if (TII-isSMRD(MI-getOpcode())) {
 +if
 (getTargetMachine().getSubtargetAMDGPUSubtarget().getGeneration() !=
 +AMDGPUSubtarget::SEA_ISLANDS)
 +  return BB;
 +
 +MachineRegisterInfo MRI = BB-getParent()-getRegInfo();
 +unsigned NumRecords =
 MRI.createVirtualRegister(AMDGPU::SReg_32RegClass);
 +
 +// XXX should we save and restore the SCC register?
 +BuildMI(*BB, I, MI-getDebugLoc(), TII-get(AMDGPU::COPY),
 NumRecords)
 +.addReg(MI-getOperand(1).getReg(), 0, AMDGPU::sub2);
 +BuildMI(*BB, I, MI-getDebugLoc(), TII-get(AMDGPU::S_CMPK_EQ_U32),
 AMDGPU::SCC)
 +.addReg(NumRecords)
 +.addImm(0);
 +BuildMI(*BB, I, MI-getDebugLoc(),
 TII-get(AMDGPU::S_CBRANCH_SCC1))
 +.addImm(1)
 +.addReg(AMDGPU::SCC);
 +return BB;
 +  }
switch (MI-getOpcode()) {
  default:
return AMDGPUTargetLowering::EmitInstrWithCustomInserter(MI, BB);
  case AMDGPU::BRANCH: return BB;
  case AMDGPU::SI_ADDR64_RSRC: {
 -const SIInstrInfo *TII =
 -  static_castconst
 SIInstrInfo*(getTargetMachine().getInstrInfo());
MachineRegisterInfo MRI = BB-getParent()-getRegInfo();
unsigned SuperReg = MI-getOperand(0).getReg();
unsigned SubRegLo =
 MRI.createVirtualRegister(AMDGPU::SReg_64RegClass);
 @@ -336,8 +360,6 @@ MachineBasicBlock *
 SITargetLowering::EmitInstrWithCustomInserter(
break;
  }
  case AMDGPU::V_SUB_F64: {
 -const SIInstrInfo *TII =
 -  static_castconst
 SIInstrInfo*(getTargetMachine().getInstrInfo());
BuildMI(*BB, I, MI-getDebugLoc(), TII-get(AMDGPU::V_ADD_F64),
MI-getOperand(0).getReg())
.addReg(MI-getOperand(1).getReg())
 diff --git a/lib/Target/R600/SIInsertWaits.cpp
 b/lib/Target/R600/SIInsertWaits.cpp
 index 7e42fb7..2e47346 100644
 --- a/lib/Target/R600/SIInsertWaits.cpp
 +++ b/lib/Target/R600/SIInsertWaits.cpp
 @@ -294,6 +294,12 @@ bool SIInsertWaits::insertWait(MachineBasicBlock
 MBB,
  if (Counts.Named.EXP == 0)
ExpInstrTypesSeen = 0;
+  // Ensure S_WAITCNT is inserted before S_CBRANCH.
 +  MachineBasicBlock::iterator beforeI = I;
 +  --beforeI;
 +  if (beforeI-getOpcode() == AMDGPU::S_CBRANCH_SCC1)
 +I = 

Re: [Mesa-dev] [PATCH] R600/SI: add a workaround for a scalar-memory-read hw bug on CI

2013-10-30 Thread Christian König
Off hand I don't know any use case exept constant buffers where we use 
S_BUFFER_LOAD, but anybody who uses it should be aware how to use it.


What are the symptoms of issuing a S_BUFFER_LOAD with NumRecords=0? 
Hangs or just undefined behaviour?


Christian.

Am 30.10.2013 17:00, schrieb Marek Olšák:

Yeah, it's unusual.

What if S_BUFFER_LOAD is also used by something else, like texture
buffers, or OpenCL? Will we have to fix that as well?

Marek

On Wed, Oct 30, 2013 at 3:32 PM, Christian König
deathsim...@vodafone.de wrote:

Mhm, I'm assumed that having NumRecord zero is actually something quite
unusual. E.g. a shader that accesses a not defined constant buffer or
something like that. So I would rather optimize for the common use case.

Anyway branch instructions are quite expensive, you can issue something
between 12 and 16 arithmetic instructions before they make sense to use
instead of a conditional move. Not sure how the relation is with memory
moves but I guess that it would be better to avoid them.

Christian.

Am 30.10.2013 14:59, schrieb Marek Olšák:


I thought that doing S_CMPK followed by S_CBRANCH has less overhead
than doing a memory read. If we used one of
S_BUFFER_LOAD_DWORDX2,4,8,16, it wouldn't be so bad. I don't know.

Marek

On Wed, Oct 30, 2013 at 2:48 PM, Christian König
deathsim...@vodafone.de wrote:

Am 30.10.2013 14:23, schrieb Marek Olšák:


From: Marek Olšák marek.ol...@amd.com

This also fixes scalar compare instructions which were always
eliminated,
because they didn't have a destination of SCC.


Uff, that looks like quite a bit of overhead, isn't there a simpler
approach? Like setting the the NumRecord to one and letting unused
constants
pointing to a dummy buffer or soemthing like this?

Christian.



Signed-off-by: Marek Olšák marek.ol...@amd.com
---
lib/Target/R600/SIISelLowering.cpp | 30
++
lib/Target/R600/SIInsertWaits.cpp  |  6 ++
lib/Target/R600/SIInstrInfo.td |  5 +
lib/Target/R600/SIInstructions.td  | 26 +++---
4 files changed, 52 insertions(+), 15 deletions(-)

diff --git a/lib/Target/R600/SIISelLowering.cpp
b/lib/Target/R600/SIISelLowering.cpp
index 371572e..e9f4035 100644
--- a/lib/Target/R600/SIISelLowering.cpp
+++ b/lib/Target/R600/SIISelLowering.cpp
@@ -14,6 +14,7 @@
  #include SIISelLowering.h
#include AMDGPU.h
+#include AMDGPUSubtarget.h
#include AMDILIntrinsicInfo.h
#include SIInstrInfo.h
#include SIMachineFunctionInfo.h
@@ -302,14 +303,37 @@ MachineBasicBlock *
SITargetLowering::EmitInstrWithCustomInserter(
MachineInstr * MI, MachineBasicBlock * BB) const {
MachineBasicBlock::iterator I = *MI;
+  const SIInstrInfo *TII =
+static_castconst SIInstrInfo*(getTargetMachine().getInstrInfo());
+
+  // Sea Islands must conditionally execute SMRD instructions depending
+  // on the value of SQ_BUF_RSRC_WORD2.NUM_RECORDS, because the
hardware
+  // doesn't skip the instructions if NUM_RECORDS is 0.
+  if (TII-isSMRD(MI-getOpcode())) {
+if
(getTargetMachine().getSubtargetAMDGPUSubtarget().getGeneration() !=
+AMDGPUSubtarget::SEA_ISLANDS)
+  return BB;
+
+MachineRegisterInfo MRI = BB-getParent()-getRegInfo();
+unsigned NumRecords =
MRI.createVirtualRegister(AMDGPU::SReg_32RegClass);
+
+// XXX should we save and restore the SCC register?
+BuildMI(*BB, I, MI-getDebugLoc(), TII-get(AMDGPU::COPY),
NumRecords)
+.addReg(MI-getOperand(1).getReg(), 0, AMDGPU::sub2);
+BuildMI(*BB, I, MI-getDebugLoc(), TII-get(AMDGPU::S_CMPK_EQ_U32),
AMDGPU::SCC)
+.addReg(NumRecords)
+.addImm(0);
+BuildMI(*BB, I, MI-getDebugLoc(),
TII-get(AMDGPU::S_CBRANCH_SCC1))
+.addImm(1)
+.addReg(AMDGPU::SCC);
+return BB;
+  }
switch (MI-getOpcode()) {
  default:
return AMDGPUTargetLowering::EmitInstrWithCustomInserter(MI, BB);
  case AMDGPU::BRANCH: return BB;
  case AMDGPU::SI_ADDR64_RSRC: {
-const SIInstrInfo *TII =
-  static_castconst
SIInstrInfo*(getTargetMachine().getInstrInfo());
MachineRegisterInfo MRI = BB-getParent()-getRegInfo();
unsigned SuperReg = MI-getOperand(0).getReg();
unsigned SubRegLo =
MRI.createVirtualRegister(AMDGPU::SReg_64RegClass);
@@ -336,8 +360,6 @@ MachineBasicBlock *
SITargetLowering::EmitInstrWithCustomInserter(
break;
  }
  case AMDGPU::V_SUB_F64: {
-const SIInstrInfo *TII =
-  static_castconst
SIInstrInfo*(getTargetMachine().getInstrInfo());
BuildMI(*BB, I, MI-getDebugLoc(), TII-get(AMDGPU::V_ADD_F64),
MI-getOperand(0).getReg())
.addReg(MI-getOperand(1).getReg())
diff --git a/lib/Target/R600/SIInsertWaits.cpp
b/lib/Target/R600/SIInsertWaits.cpp
index 7e42fb7..2e47346 100644
--- a/lib/Target/R600/SIInsertWaits.cpp
+++ b/lib/Target/R600/SIInsertWaits.cpp
@@ -294,6 +294,12 @@ bool SIInsertWaits::insertWait(MachineBasicBlock
MBB,
  if 

Re: [Mesa-dev] [PATCH] R600/SI: add a workaround for a scalar-memory-read hw bug on CI

2013-10-30 Thread Marek Olšák
The symptom is a VM protection fault with the address of 0 (probably
because the whole descriptor contains zeros), which
should be harmless, but it spams dmesg.

Marek

On Wed, Oct 30, 2013 at 5:19 PM, Christian König
deathsim...@vodafone.de wrote:
 Off hand I don't know any use case exept constant buffers where we use
 S_BUFFER_LOAD, but anybody who uses it should be aware how to use it.

 What are the symptoms of issuing a S_BUFFER_LOAD with NumRecords=0? Hangs or
 just undefined behaviour?

 Christian.

 Am 30.10.2013 17:00, schrieb Marek Olšák:

 Yeah, it's unusual.

 What if S_BUFFER_LOAD is also used by something else, like texture
 buffers, or OpenCL? Will we have to fix that as well?

 Marek

 On Wed, Oct 30, 2013 at 3:32 PM, Christian König
 deathsim...@vodafone.de wrote:

 Mhm, I'm assumed that having NumRecord zero is actually something quite
 unusual. E.g. a shader that accesses a not defined constant buffer or
 something like that. So I would rather optimize for the common use case.

 Anyway branch instructions are quite expensive, you can issue something
 between 12 and 16 arithmetic instructions before they make sense to use
 instead of a conditional move. Not sure how the relation is with memory
 moves but I guess that it would be better to avoid them.

 Christian.

 Am 30.10.2013 14:59, schrieb Marek Olšák:

 I thought that doing S_CMPK followed by S_CBRANCH has less overhead
 than doing a memory read. If we used one of
 S_BUFFER_LOAD_DWORDX2,4,8,16, it wouldn't be so bad. I don't know.

 Marek

 On Wed, Oct 30, 2013 at 2:48 PM, Christian König
 deathsim...@vodafone.de wrote:

 Am 30.10.2013 14:23, schrieb Marek Olšák:

 From: Marek Olšák marek.ol...@amd.com

 This also fixes scalar compare instructions which were always
 eliminated,
 because they didn't have a destination of SCC.


 Uff, that looks like quite a bit of overhead, isn't there a simpler
 approach? Like setting the the NumRecord to one and letting unused
 constants
 pointing to a dummy buffer or soemthing like this?

 Christian.


 Signed-off-by: Marek Olšák marek.ol...@amd.com
 ---
 lib/Target/R600/SIISelLowering.cpp | 30
 ++
 lib/Target/R600/SIInsertWaits.cpp  |  6 ++
 lib/Target/R600/SIInstrInfo.td |  5 +
 lib/Target/R600/SIInstructions.td  | 26 +++---
 4 files changed, 52 insertions(+), 15 deletions(-)

 diff --git a/lib/Target/R600/SIISelLowering.cpp
 b/lib/Target/R600/SIISelLowering.cpp
 index 371572e..e9f4035 100644
 --- a/lib/Target/R600/SIISelLowering.cpp
 +++ b/lib/Target/R600/SIISelLowering.cpp
 @@ -14,6 +14,7 @@
   #include SIISelLowering.h
 #include AMDGPU.h
 +#include AMDGPUSubtarget.h
 #include AMDILIntrinsicInfo.h
 #include SIInstrInfo.h
 #include SIMachineFunctionInfo.h
 @@ -302,14 +303,37 @@ MachineBasicBlock *
 SITargetLowering::EmitInstrWithCustomInserter(
 MachineInstr * MI, MachineBasicBlock * BB) const {
 MachineBasicBlock::iterator I = *MI;
 +  const SIInstrInfo *TII =
 +static_castconst
 SIInstrInfo*(getTargetMachine().getInstrInfo());
 +
 +  // Sea Islands must conditionally execute SMRD instructions
 depending
 +  // on the value of SQ_BUF_RSRC_WORD2.NUM_RECORDS, because the
 hardware
 +  // doesn't skip the instructions if NUM_RECORDS is 0.
 +  if (TII-isSMRD(MI-getOpcode())) {
 +if
 (getTargetMachine().getSubtargetAMDGPUSubtarget().getGeneration() !=
 +AMDGPUSubtarget::SEA_ISLANDS)
 +  return BB;
 +
 +MachineRegisterInfo MRI = BB-getParent()-getRegInfo();
 +unsigned NumRecords =
 MRI.createVirtualRegister(AMDGPU::SReg_32RegClass);
 +
 +// XXX should we save and restore the SCC register?
 +BuildMI(*BB, I, MI-getDebugLoc(), TII-get(AMDGPU::COPY),
 NumRecords)
 +.addReg(MI-getOperand(1).getReg(), 0, AMDGPU::sub2);
 +BuildMI(*BB, I, MI-getDebugLoc(),
 TII-get(AMDGPU::S_CMPK_EQ_U32),
 AMDGPU::SCC)
 +.addReg(NumRecords)
 +.addImm(0);
 +BuildMI(*BB, I, MI-getDebugLoc(),
 TII-get(AMDGPU::S_CBRANCH_SCC1))
 +.addImm(1)
 +.addReg(AMDGPU::SCC);
 +return BB;
 +  }
 switch (MI-getOpcode()) {
   default:
 return AMDGPUTargetLowering::EmitInstrWithCustomInserter(MI,
 BB);
   case AMDGPU::BRANCH: return BB;
   case AMDGPU::SI_ADDR64_RSRC: {
 -const SIInstrInfo *TII =
 -  static_castconst
 SIInstrInfo*(getTargetMachine().getInstrInfo());
 MachineRegisterInfo MRI = BB-getParent()-getRegInfo();
 unsigned SuperReg = MI-getOperand(0).getReg();
 unsigned SubRegLo =
 MRI.createVirtualRegister(AMDGPU::SReg_64RegClass);
 @@ -336,8 +360,6 @@ MachineBasicBlock *
 SITargetLowering::EmitInstrWithCustomInserter(
 break;
   }
   case AMDGPU::V_SUB_F64: {
 -const SIInstrInfo *TII =
 -  static_castconst
 SIInstrInfo*(getTargetMachine().getInstrInfo());
 BuildMI(*BB, I, MI-getDebugLoc(),
 TII-get(AMDGPU::V_ADD_F64),
 

Re: [Mesa-dev] [PATCH] R600/SI: add a workaround for a scalar-memory-read hw bug on CI

2013-10-30 Thread Marek Olšák
Please disregard this patch. I've implemented the workaround in Mesa
as Christian suggested.

Marek

On Wed, Oct 30, 2013 at 5:19 PM, Christian König
deathsim...@vodafone.de wrote:
 Off hand I don't know any use case exept constant buffers where we use
 S_BUFFER_LOAD, but anybody who uses it should be aware how to use it.

 What are the symptoms of issuing a S_BUFFER_LOAD with NumRecords=0? Hangs or
 just undefined behaviour?

 Christian.

 Am 30.10.2013 17:00, schrieb Marek Olšák:

 Yeah, it's unusual.

 What if S_BUFFER_LOAD is also used by something else, like texture
 buffers, or OpenCL? Will we have to fix that as well?

 Marek

 On Wed, Oct 30, 2013 at 3:32 PM, Christian König
 deathsim...@vodafone.de wrote:

 Mhm, I'm assumed that having NumRecord zero is actually something quite
 unusual. E.g. a shader that accesses a not defined constant buffer or
 something like that. So I would rather optimize for the common use case.

 Anyway branch instructions are quite expensive, you can issue something
 between 12 and 16 arithmetic instructions before they make sense to use
 instead of a conditional move. Not sure how the relation is with memory
 moves but I guess that it would be better to avoid them.

 Christian.

 Am 30.10.2013 14:59, schrieb Marek Olšák:

 I thought that doing S_CMPK followed by S_CBRANCH has less overhead
 than doing a memory read. If we used one of
 S_BUFFER_LOAD_DWORDX2,4,8,16, it wouldn't be so bad. I don't know.

 Marek

 On Wed, Oct 30, 2013 at 2:48 PM, Christian König
 deathsim...@vodafone.de wrote:

 Am 30.10.2013 14:23, schrieb Marek Olšák:

 From: Marek Olšák marek.ol...@amd.com

 This also fixes scalar compare instructions which were always
 eliminated,
 because they didn't have a destination of SCC.


 Uff, that looks like quite a bit of overhead, isn't there a simpler
 approach? Like setting the the NumRecord to one and letting unused
 constants
 pointing to a dummy buffer or soemthing like this?

 Christian.


 Signed-off-by: Marek Olšák marek.ol...@amd.com
 ---
 lib/Target/R600/SIISelLowering.cpp | 30
 ++
 lib/Target/R600/SIInsertWaits.cpp  |  6 ++
 lib/Target/R600/SIInstrInfo.td |  5 +
 lib/Target/R600/SIInstructions.td  | 26 +++---
 4 files changed, 52 insertions(+), 15 deletions(-)

 diff --git a/lib/Target/R600/SIISelLowering.cpp
 b/lib/Target/R600/SIISelLowering.cpp
 index 371572e..e9f4035 100644
 --- a/lib/Target/R600/SIISelLowering.cpp
 +++ b/lib/Target/R600/SIISelLowering.cpp
 @@ -14,6 +14,7 @@
   #include SIISelLowering.h
 #include AMDGPU.h
 +#include AMDGPUSubtarget.h
 #include AMDILIntrinsicInfo.h
 #include SIInstrInfo.h
 #include SIMachineFunctionInfo.h
 @@ -302,14 +303,37 @@ MachineBasicBlock *
 SITargetLowering::EmitInstrWithCustomInserter(
 MachineInstr * MI, MachineBasicBlock * BB) const {
 MachineBasicBlock::iterator I = *MI;
 +  const SIInstrInfo *TII =
 +static_castconst
 SIInstrInfo*(getTargetMachine().getInstrInfo());
 +
 +  // Sea Islands must conditionally execute SMRD instructions
 depending
 +  // on the value of SQ_BUF_RSRC_WORD2.NUM_RECORDS, because the
 hardware
 +  // doesn't skip the instructions if NUM_RECORDS is 0.
 +  if (TII-isSMRD(MI-getOpcode())) {
 +if
 (getTargetMachine().getSubtargetAMDGPUSubtarget().getGeneration() !=
 +AMDGPUSubtarget::SEA_ISLANDS)
 +  return BB;
 +
 +MachineRegisterInfo MRI = BB-getParent()-getRegInfo();
 +unsigned NumRecords =
 MRI.createVirtualRegister(AMDGPU::SReg_32RegClass);
 +
 +// XXX should we save and restore the SCC register?
 +BuildMI(*BB, I, MI-getDebugLoc(), TII-get(AMDGPU::COPY),
 NumRecords)
 +.addReg(MI-getOperand(1).getReg(), 0, AMDGPU::sub2);
 +BuildMI(*BB, I, MI-getDebugLoc(),
 TII-get(AMDGPU::S_CMPK_EQ_U32),
 AMDGPU::SCC)
 +.addReg(NumRecords)
 +.addImm(0);
 +BuildMI(*BB, I, MI-getDebugLoc(),
 TII-get(AMDGPU::S_CBRANCH_SCC1))
 +.addImm(1)
 +.addReg(AMDGPU::SCC);
 +return BB;
 +  }
 switch (MI-getOpcode()) {
   default:
 return AMDGPUTargetLowering::EmitInstrWithCustomInserter(MI,
 BB);
   case AMDGPU::BRANCH: return BB;
   case AMDGPU::SI_ADDR64_RSRC: {
 -const SIInstrInfo *TII =
 -  static_castconst
 SIInstrInfo*(getTargetMachine().getInstrInfo());
 MachineRegisterInfo MRI = BB-getParent()-getRegInfo();
 unsigned SuperReg = MI-getOperand(0).getReg();
 unsigned SubRegLo =
 MRI.createVirtualRegister(AMDGPU::SReg_64RegClass);
 @@ -336,8 +360,6 @@ MachineBasicBlock *
 SITargetLowering::EmitInstrWithCustomInserter(
 break;
   }
   case AMDGPU::V_SUB_F64: {
 -const SIInstrInfo *TII =
 -  static_castconst
 SIInstrInfo*(getTargetMachine().getInstrInfo());
 BuildMI(*BB, I, MI-getDebugLoc(),
 TII-get(AMDGPU::V_ADD_F64),
 MI-getOperand(0).getReg())