Re: [Mesa-dev] [PATCH] R600/SI: add a workaround for a scalar-memory-read hw bug on CI
Am 30.10.2013 14:23, schrieb Marek Olšák: From: Marek Olšák marek.ol...@amd.com This also fixes scalar compare instructions which were always eliminated, because they didn't have a destination of SCC. Uff, that looks like quite a bit of overhead, isn't there a simpler approach? Like setting the the NumRecord to one and letting unused constants pointing to a dummy buffer or soemthing like this? Christian. Signed-off-by: Marek Olšák marek.ol...@amd.com --- lib/Target/R600/SIISelLowering.cpp | 30 ++ lib/Target/R600/SIInsertWaits.cpp | 6 ++ lib/Target/R600/SIInstrInfo.td | 5 + lib/Target/R600/SIInstructions.td | 26 +++--- 4 files changed, 52 insertions(+), 15 deletions(-) diff --git a/lib/Target/R600/SIISelLowering.cpp b/lib/Target/R600/SIISelLowering.cpp index 371572e..e9f4035 100644 --- a/lib/Target/R600/SIISelLowering.cpp +++ b/lib/Target/R600/SIISelLowering.cpp @@ -14,6 +14,7 @@ #include SIISelLowering.h #include AMDGPU.h +#include AMDGPUSubtarget.h #include AMDILIntrinsicInfo.h #include SIInstrInfo.h #include SIMachineFunctionInfo.h @@ -302,14 +303,37 @@ MachineBasicBlock * SITargetLowering::EmitInstrWithCustomInserter( MachineInstr * MI, MachineBasicBlock * BB) const { MachineBasicBlock::iterator I = *MI; + const SIInstrInfo *TII = +static_castconst SIInstrInfo*(getTargetMachine().getInstrInfo()); + + // Sea Islands must conditionally execute SMRD instructions depending + // on the value of SQ_BUF_RSRC_WORD2.NUM_RECORDS, because the hardware + // doesn't skip the instructions if NUM_RECORDS is 0. + if (TII-isSMRD(MI-getOpcode())) { +if (getTargetMachine().getSubtargetAMDGPUSubtarget().getGeneration() != +AMDGPUSubtarget::SEA_ISLANDS) + return BB; + +MachineRegisterInfo MRI = BB-getParent()-getRegInfo(); +unsigned NumRecords = MRI.createVirtualRegister(AMDGPU::SReg_32RegClass); + +// XXX should we save and restore the SCC register? +BuildMI(*BB, I, MI-getDebugLoc(), TII-get(AMDGPU::COPY), NumRecords) +.addReg(MI-getOperand(1).getReg(), 0, AMDGPU::sub2); +BuildMI(*BB, I, MI-getDebugLoc(), TII-get(AMDGPU::S_CMPK_EQ_U32), AMDGPU::SCC) +.addReg(NumRecords) +.addImm(0); +BuildMI(*BB, I, MI-getDebugLoc(), TII-get(AMDGPU::S_CBRANCH_SCC1)) +.addImm(1) +.addReg(AMDGPU::SCC); +return BB; + } switch (MI-getOpcode()) { default: return AMDGPUTargetLowering::EmitInstrWithCustomInserter(MI, BB); case AMDGPU::BRANCH: return BB; case AMDGPU::SI_ADDR64_RSRC: { -const SIInstrInfo *TII = - static_castconst SIInstrInfo*(getTargetMachine().getInstrInfo()); MachineRegisterInfo MRI = BB-getParent()-getRegInfo(); unsigned SuperReg = MI-getOperand(0).getReg(); unsigned SubRegLo = MRI.createVirtualRegister(AMDGPU::SReg_64RegClass); @@ -336,8 +360,6 @@ MachineBasicBlock * SITargetLowering::EmitInstrWithCustomInserter( break; } case AMDGPU::V_SUB_F64: { -const SIInstrInfo *TII = - static_castconst SIInstrInfo*(getTargetMachine().getInstrInfo()); BuildMI(*BB, I, MI-getDebugLoc(), TII-get(AMDGPU::V_ADD_F64), MI-getOperand(0).getReg()) .addReg(MI-getOperand(1).getReg()) diff --git a/lib/Target/R600/SIInsertWaits.cpp b/lib/Target/R600/SIInsertWaits.cpp index 7e42fb7..2e47346 100644 --- a/lib/Target/R600/SIInsertWaits.cpp +++ b/lib/Target/R600/SIInsertWaits.cpp @@ -294,6 +294,12 @@ bool SIInsertWaits::insertWait(MachineBasicBlock MBB, if (Counts.Named.EXP == 0) ExpInstrTypesSeen = 0; + // Ensure S_WAITCNT is inserted before S_CBRANCH. + MachineBasicBlock::iterator beforeI = I; + --beforeI; + if (beforeI-getOpcode() == AMDGPU::S_CBRANCH_SCC1) +I = beforeI; + // Build the wait instruction BuildMI(MBB, I, DebugLoc(), TII-get(AMDGPU::S_WAITCNT)) .addImm((Counts.Named.VM 0xF) | diff --git a/lib/Target/R600/SIInstrInfo.td b/lib/Target/R600/SIInstrInfo.td index ed42a2a..9567879 100644 --- a/lib/Target/R600/SIInstrInfo.td +++ b/lib/Target/R600/SIInstrInfo.td @@ -177,6 +177,11 @@ class SOPC_32 bits7 op, string opName, listdag pattern : SOPC opName# $dst, $src0, $src1, pattern ; +class SOPCK_32 bits7 op, string opName, listdag pattern : SOPC + op, (outs SCCReg:$dst), (ins SReg_32:$src0, i16imm:$src1), + opName# $dst, $src0, $src1, pattern +; + class SOPC_64 bits7 op, string opName, listdag pattern : SOPC op, (outs SCCReg:$dst), (ins SSrc_64:$src0, SSrc_64:$src1), opName# $dst, $src0, $src1, pattern diff --git a/lib/Target/R600/SIInstructions.td b/lib/Target/R600/SIInstructions.td index 048c157..1b275a7 100644 --- a/lib/Target/R600/SIInstructions.td +++ b/lib/Target/R600/SIInstructions.td @@ -115,17 +115,17 @@ def S_CMPK_EQ_I32 : SOPK */ let isCompare = 1 in { -def S_CMPK_LG_I32 : SOPK_32 0x0004, S_CMPK_LG_I32, []; -def S_CMPK_GT_I32 : SOPK_32
Re: [Mesa-dev] [PATCH] R600/SI: add a workaround for a scalar-memory-read hw bug on CI
I thought that doing S_CMPK followed by S_CBRANCH has less overhead than doing a memory read. If we used one of S_BUFFER_LOAD_DWORDX2,4,8,16, it wouldn't be so bad. I don't know. Marek On Wed, Oct 30, 2013 at 2:48 PM, Christian König deathsim...@vodafone.de wrote: Am 30.10.2013 14:23, schrieb Marek Olšák: From: Marek Olšák marek.ol...@amd.com This also fixes scalar compare instructions which were always eliminated, because they didn't have a destination of SCC. Uff, that looks like quite a bit of overhead, isn't there a simpler approach? Like setting the the NumRecord to one and letting unused constants pointing to a dummy buffer or soemthing like this? Christian. Signed-off-by: Marek Olšák marek.ol...@amd.com --- lib/Target/R600/SIISelLowering.cpp | 30 ++ lib/Target/R600/SIInsertWaits.cpp | 6 ++ lib/Target/R600/SIInstrInfo.td | 5 + lib/Target/R600/SIInstructions.td | 26 +++--- 4 files changed, 52 insertions(+), 15 deletions(-) diff --git a/lib/Target/R600/SIISelLowering.cpp b/lib/Target/R600/SIISelLowering.cpp index 371572e..e9f4035 100644 --- a/lib/Target/R600/SIISelLowering.cpp +++ b/lib/Target/R600/SIISelLowering.cpp @@ -14,6 +14,7 @@ #include SIISelLowering.h #include AMDGPU.h +#include AMDGPUSubtarget.h #include AMDILIntrinsicInfo.h #include SIInstrInfo.h #include SIMachineFunctionInfo.h @@ -302,14 +303,37 @@ MachineBasicBlock * SITargetLowering::EmitInstrWithCustomInserter( MachineInstr * MI, MachineBasicBlock * BB) const { MachineBasicBlock::iterator I = *MI; + const SIInstrInfo *TII = +static_castconst SIInstrInfo*(getTargetMachine().getInstrInfo()); + + // Sea Islands must conditionally execute SMRD instructions depending + // on the value of SQ_BUF_RSRC_WORD2.NUM_RECORDS, because the hardware + // doesn't skip the instructions if NUM_RECORDS is 0. + if (TII-isSMRD(MI-getOpcode())) { +if (getTargetMachine().getSubtargetAMDGPUSubtarget().getGeneration() != +AMDGPUSubtarget::SEA_ISLANDS) + return BB; + +MachineRegisterInfo MRI = BB-getParent()-getRegInfo(); +unsigned NumRecords = MRI.createVirtualRegister(AMDGPU::SReg_32RegClass); + +// XXX should we save and restore the SCC register? +BuildMI(*BB, I, MI-getDebugLoc(), TII-get(AMDGPU::COPY), NumRecords) +.addReg(MI-getOperand(1).getReg(), 0, AMDGPU::sub2); +BuildMI(*BB, I, MI-getDebugLoc(), TII-get(AMDGPU::S_CMPK_EQ_U32), AMDGPU::SCC) +.addReg(NumRecords) +.addImm(0); +BuildMI(*BB, I, MI-getDebugLoc(), TII-get(AMDGPU::S_CBRANCH_SCC1)) +.addImm(1) +.addReg(AMDGPU::SCC); +return BB; + } switch (MI-getOpcode()) { default: return AMDGPUTargetLowering::EmitInstrWithCustomInserter(MI, BB); case AMDGPU::BRANCH: return BB; case AMDGPU::SI_ADDR64_RSRC: { -const SIInstrInfo *TII = - static_castconst SIInstrInfo*(getTargetMachine().getInstrInfo()); MachineRegisterInfo MRI = BB-getParent()-getRegInfo(); unsigned SuperReg = MI-getOperand(0).getReg(); unsigned SubRegLo = MRI.createVirtualRegister(AMDGPU::SReg_64RegClass); @@ -336,8 +360,6 @@ MachineBasicBlock * SITargetLowering::EmitInstrWithCustomInserter( break; } case AMDGPU::V_SUB_F64: { -const SIInstrInfo *TII = - static_castconst SIInstrInfo*(getTargetMachine().getInstrInfo()); BuildMI(*BB, I, MI-getDebugLoc(), TII-get(AMDGPU::V_ADD_F64), MI-getOperand(0).getReg()) .addReg(MI-getOperand(1).getReg()) diff --git a/lib/Target/R600/SIInsertWaits.cpp b/lib/Target/R600/SIInsertWaits.cpp index 7e42fb7..2e47346 100644 --- a/lib/Target/R600/SIInsertWaits.cpp +++ b/lib/Target/R600/SIInsertWaits.cpp @@ -294,6 +294,12 @@ bool SIInsertWaits::insertWait(MachineBasicBlock MBB, if (Counts.Named.EXP == 0) ExpInstrTypesSeen = 0; + // Ensure S_WAITCNT is inserted before S_CBRANCH. + MachineBasicBlock::iterator beforeI = I; + --beforeI; + if (beforeI-getOpcode() == AMDGPU::S_CBRANCH_SCC1) +I = beforeI; + // Build the wait instruction BuildMI(MBB, I, DebugLoc(), TII-get(AMDGPU::S_WAITCNT)) .addImm((Counts.Named.VM 0xF) | diff --git a/lib/Target/R600/SIInstrInfo.td b/lib/Target/R600/SIInstrInfo.td index ed42a2a..9567879 100644 --- a/lib/Target/R600/SIInstrInfo.td +++ b/lib/Target/R600/SIInstrInfo.td @@ -177,6 +177,11 @@ class SOPC_32 bits7 op, string opName, listdag pattern : SOPC opName# $dst, $src0, $src1, pattern ; +class SOPCK_32 bits7 op, string opName, listdag pattern : SOPC + op, (outs SCCReg:$dst), (ins SReg_32:$src0, i16imm:$src1), + opName# $dst, $src0, $src1, pattern +; + class SOPC_64 bits7 op, string opName, listdag pattern : SOPC op, (outs SCCReg:$dst), (ins SSrc_64:$src0, SSrc_64:$src1), opName# $dst, $src0, $src1, pattern diff
Re: [Mesa-dev] [PATCH] R600/SI: add a workaround for a scalar-memory-read hw bug on CI
Mhm, I'm assumed that having NumRecord zero is actually something quite unusual. E.g. a shader that accesses a not defined constant buffer or something like that. So I would rather optimize for the common use case. Anyway branch instructions are quite expensive, you can issue something between 12 and 16 arithmetic instructions before they make sense to use instead of a conditional move. Not sure how the relation is with memory moves but I guess that it would be better to avoid them. Christian. Am 30.10.2013 14:59, schrieb Marek Olšák: I thought that doing S_CMPK followed by S_CBRANCH has less overhead than doing a memory read. If we used one of S_BUFFER_LOAD_DWORDX2,4,8,16, it wouldn't be so bad. I don't know. Marek On Wed, Oct 30, 2013 at 2:48 PM, Christian König deathsim...@vodafone.de wrote: Am 30.10.2013 14:23, schrieb Marek Olšák: From: Marek Olšák marek.ol...@amd.com This also fixes scalar compare instructions which were always eliminated, because they didn't have a destination of SCC. Uff, that looks like quite a bit of overhead, isn't there a simpler approach? Like setting the the NumRecord to one and letting unused constants pointing to a dummy buffer or soemthing like this? Christian. Signed-off-by: Marek Olšák marek.ol...@amd.com --- lib/Target/R600/SIISelLowering.cpp | 30 ++ lib/Target/R600/SIInsertWaits.cpp | 6 ++ lib/Target/R600/SIInstrInfo.td | 5 + lib/Target/R600/SIInstructions.td | 26 +++--- 4 files changed, 52 insertions(+), 15 deletions(-) diff --git a/lib/Target/R600/SIISelLowering.cpp b/lib/Target/R600/SIISelLowering.cpp index 371572e..e9f4035 100644 --- a/lib/Target/R600/SIISelLowering.cpp +++ b/lib/Target/R600/SIISelLowering.cpp @@ -14,6 +14,7 @@ #include SIISelLowering.h #include AMDGPU.h +#include AMDGPUSubtarget.h #include AMDILIntrinsicInfo.h #include SIInstrInfo.h #include SIMachineFunctionInfo.h @@ -302,14 +303,37 @@ MachineBasicBlock * SITargetLowering::EmitInstrWithCustomInserter( MachineInstr * MI, MachineBasicBlock * BB) const { MachineBasicBlock::iterator I = *MI; + const SIInstrInfo *TII = +static_castconst SIInstrInfo*(getTargetMachine().getInstrInfo()); + + // Sea Islands must conditionally execute SMRD instructions depending + // on the value of SQ_BUF_RSRC_WORD2.NUM_RECORDS, because the hardware + // doesn't skip the instructions if NUM_RECORDS is 0. + if (TII-isSMRD(MI-getOpcode())) { +if (getTargetMachine().getSubtargetAMDGPUSubtarget().getGeneration() != +AMDGPUSubtarget::SEA_ISLANDS) + return BB; + +MachineRegisterInfo MRI = BB-getParent()-getRegInfo(); +unsigned NumRecords = MRI.createVirtualRegister(AMDGPU::SReg_32RegClass); + +// XXX should we save and restore the SCC register? +BuildMI(*BB, I, MI-getDebugLoc(), TII-get(AMDGPU::COPY), NumRecords) +.addReg(MI-getOperand(1).getReg(), 0, AMDGPU::sub2); +BuildMI(*BB, I, MI-getDebugLoc(), TII-get(AMDGPU::S_CMPK_EQ_U32), AMDGPU::SCC) +.addReg(NumRecords) +.addImm(0); +BuildMI(*BB, I, MI-getDebugLoc(), TII-get(AMDGPU::S_CBRANCH_SCC1)) +.addImm(1) +.addReg(AMDGPU::SCC); +return BB; + } switch (MI-getOpcode()) { default: return AMDGPUTargetLowering::EmitInstrWithCustomInserter(MI, BB); case AMDGPU::BRANCH: return BB; case AMDGPU::SI_ADDR64_RSRC: { -const SIInstrInfo *TII = - static_castconst SIInstrInfo*(getTargetMachine().getInstrInfo()); MachineRegisterInfo MRI = BB-getParent()-getRegInfo(); unsigned SuperReg = MI-getOperand(0).getReg(); unsigned SubRegLo = MRI.createVirtualRegister(AMDGPU::SReg_64RegClass); @@ -336,8 +360,6 @@ MachineBasicBlock * SITargetLowering::EmitInstrWithCustomInserter( break; } case AMDGPU::V_SUB_F64: { -const SIInstrInfo *TII = - static_castconst SIInstrInfo*(getTargetMachine().getInstrInfo()); BuildMI(*BB, I, MI-getDebugLoc(), TII-get(AMDGPU::V_ADD_F64), MI-getOperand(0).getReg()) .addReg(MI-getOperand(1).getReg()) diff --git a/lib/Target/R600/SIInsertWaits.cpp b/lib/Target/R600/SIInsertWaits.cpp index 7e42fb7..2e47346 100644 --- a/lib/Target/R600/SIInsertWaits.cpp +++ b/lib/Target/R600/SIInsertWaits.cpp @@ -294,6 +294,12 @@ bool SIInsertWaits::insertWait(MachineBasicBlock MBB, if (Counts.Named.EXP == 0) ExpInstrTypesSeen = 0; + // Ensure S_WAITCNT is inserted before S_CBRANCH. + MachineBasicBlock::iterator beforeI = I; + --beforeI; + if (beforeI-getOpcode() == AMDGPU::S_CBRANCH_SCC1) +I = beforeI; + // Build the wait instruction BuildMI(MBB, I, DebugLoc(), TII-get(AMDGPU::S_WAITCNT)) .addImm((Counts.Named.VM 0xF) | diff --git a/lib/Target/R600/SIInstrInfo.td b/lib/Target/R600/SIInstrInfo.td index ed42a2a..9567879 100644 --- a/lib/Target/R600/SIInstrInfo.td +++ b/lib/Target/R600/SIInstrInfo.td @@
Re: [Mesa-dev] [PATCH] R600/SI: add a workaround for a scalar-memory-read hw bug on CI
Yeah, it's unusual. What if S_BUFFER_LOAD is also used by something else, like texture buffers, or OpenCL? Will we have to fix that as well? Marek On Wed, Oct 30, 2013 at 3:32 PM, Christian König deathsim...@vodafone.de wrote: Mhm, I'm assumed that having NumRecord zero is actually something quite unusual. E.g. a shader that accesses a not defined constant buffer or something like that. So I would rather optimize for the common use case. Anyway branch instructions are quite expensive, you can issue something between 12 and 16 arithmetic instructions before they make sense to use instead of a conditional move. Not sure how the relation is with memory moves but I guess that it would be better to avoid them. Christian. Am 30.10.2013 14:59, schrieb Marek Olšák: I thought that doing S_CMPK followed by S_CBRANCH has less overhead than doing a memory read. If we used one of S_BUFFER_LOAD_DWORDX2,4,8,16, it wouldn't be so bad. I don't know. Marek On Wed, Oct 30, 2013 at 2:48 PM, Christian König deathsim...@vodafone.de wrote: Am 30.10.2013 14:23, schrieb Marek Olšák: From: Marek Olšák marek.ol...@amd.com This also fixes scalar compare instructions which were always eliminated, because they didn't have a destination of SCC. Uff, that looks like quite a bit of overhead, isn't there a simpler approach? Like setting the the NumRecord to one and letting unused constants pointing to a dummy buffer or soemthing like this? Christian. Signed-off-by: Marek Olšák marek.ol...@amd.com --- lib/Target/R600/SIISelLowering.cpp | 30 ++ lib/Target/R600/SIInsertWaits.cpp | 6 ++ lib/Target/R600/SIInstrInfo.td | 5 + lib/Target/R600/SIInstructions.td | 26 +++--- 4 files changed, 52 insertions(+), 15 deletions(-) diff --git a/lib/Target/R600/SIISelLowering.cpp b/lib/Target/R600/SIISelLowering.cpp index 371572e..e9f4035 100644 --- a/lib/Target/R600/SIISelLowering.cpp +++ b/lib/Target/R600/SIISelLowering.cpp @@ -14,6 +14,7 @@ #include SIISelLowering.h #include AMDGPU.h +#include AMDGPUSubtarget.h #include AMDILIntrinsicInfo.h #include SIInstrInfo.h #include SIMachineFunctionInfo.h @@ -302,14 +303,37 @@ MachineBasicBlock * SITargetLowering::EmitInstrWithCustomInserter( MachineInstr * MI, MachineBasicBlock * BB) const { MachineBasicBlock::iterator I = *MI; + const SIInstrInfo *TII = +static_castconst SIInstrInfo*(getTargetMachine().getInstrInfo()); + + // Sea Islands must conditionally execute SMRD instructions depending + // on the value of SQ_BUF_RSRC_WORD2.NUM_RECORDS, because the hardware + // doesn't skip the instructions if NUM_RECORDS is 0. + if (TII-isSMRD(MI-getOpcode())) { +if (getTargetMachine().getSubtargetAMDGPUSubtarget().getGeneration() != +AMDGPUSubtarget::SEA_ISLANDS) + return BB; + +MachineRegisterInfo MRI = BB-getParent()-getRegInfo(); +unsigned NumRecords = MRI.createVirtualRegister(AMDGPU::SReg_32RegClass); + +// XXX should we save and restore the SCC register? +BuildMI(*BB, I, MI-getDebugLoc(), TII-get(AMDGPU::COPY), NumRecords) +.addReg(MI-getOperand(1).getReg(), 0, AMDGPU::sub2); +BuildMI(*BB, I, MI-getDebugLoc(), TII-get(AMDGPU::S_CMPK_EQ_U32), AMDGPU::SCC) +.addReg(NumRecords) +.addImm(0); +BuildMI(*BB, I, MI-getDebugLoc(), TII-get(AMDGPU::S_CBRANCH_SCC1)) +.addImm(1) +.addReg(AMDGPU::SCC); +return BB; + } switch (MI-getOpcode()) { default: return AMDGPUTargetLowering::EmitInstrWithCustomInserter(MI, BB); case AMDGPU::BRANCH: return BB; case AMDGPU::SI_ADDR64_RSRC: { -const SIInstrInfo *TII = - static_castconst SIInstrInfo*(getTargetMachine().getInstrInfo()); MachineRegisterInfo MRI = BB-getParent()-getRegInfo(); unsigned SuperReg = MI-getOperand(0).getReg(); unsigned SubRegLo = MRI.createVirtualRegister(AMDGPU::SReg_64RegClass); @@ -336,8 +360,6 @@ MachineBasicBlock * SITargetLowering::EmitInstrWithCustomInserter( break; } case AMDGPU::V_SUB_F64: { -const SIInstrInfo *TII = - static_castconst SIInstrInfo*(getTargetMachine().getInstrInfo()); BuildMI(*BB, I, MI-getDebugLoc(), TII-get(AMDGPU::V_ADD_F64), MI-getOperand(0).getReg()) .addReg(MI-getOperand(1).getReg()) diff --git a/lib/Target/R600/SIInsertWaits.cpp b/lib/Target/R600/SIInsertWaits.cpp index 7e42fb7..2e47346 100644 --- a/lib/Target/R600/SIInsertWaits.cpp +++ b/lib/Target/R600/SIInsertWaits.cpp @@ -294,6 +294,12 @@ bool SIInsertWaits::insertWait(MachineBasicBlock MBB, if (Counts.Named.EXP == 0) ExpInstrTypesSeen = 0; + // Ensure S_WAITCNT is inserted before S_CBRANCH. + MachineBasicBlock::iterator beforeI = I; + --beforeI; + if (beforeI-getOpcode() == AMDGPU::S_CBRANCH_SCC1) +I =
Re: [Mesa-dev] [PATCH] R600/SI: add a workaround for a scalar-memory-read hw bug on CI
Off hand I don't know any use case exept constant buffers where we use S_BUFFER_LOAD, but anybody who uses it should be aware how to use it. What are the symptoms of issuing a S_BUFFER_LOAD with NumRecords=0? Hangs or just undefined behaviour? Christian. Am 30.10.2013 17:00, schrieb Marek Olšák: Yeah, it's unusual. What if S_BUFFER_LOAD is also used by something else, like texture buffers, or OpenCL? Will we have to fix that as well? Marek On Wed, Oct 30, 2013 at 3:32 PM, Christian König deathsim...@vodafone.de wrote: Mhm, I'm assumed that having NumRecord zero is actually something quite unusual. E.g. a shader that accesses a not defined constant buffer or something like that. So I would rather optimize for the common use case. Anyway branch instructions are quite expensive, you can issue something between 12 and 16 arithmetic instructions before they make sense to use instead of a conditional move. Not sure how the relation is with memory moves but I guess that it would be better to avoid them. Christian. Am 30.10.2013 14:59, schrieb Marek Olšák: I thought that doing S_CMPK followed by S_CBRANCH has less overhead than doing a memory read. If we used one of S_BUFFER_LOAD_DWORDX2,4,8,16, it wouldn't be so bad. I don't know. Marek On Wed, Oct 30, 2013 at 2:48 PM, Christian König deathsim...@vodafone.de wrote: Am 30.10.2013 14:23, schrieb Marek Olšák: From: Marek Olšák marek.ol...@amd.com This also fixes scalar compare instructions which were always eliminated, because they didn't have a destination of SCC. Uff, that looks like quite a bit of overhead, isn't there a simpler approach? Like setting the the NumRecord to one and letting unused constants pointing to a dummy buffer or soemthing like this? Christian. Signed-off-by: Marek Olšák marek.ol...@amd.com --- lib/Target/R600/SIISelLowering.cpp | 30 ++ lib/Target/R600/SIInsertWaits.cpp | 6 ++ lib/Target/R600/SIInstrInfo.td | 5 + lib/Target/R600/SIInstructions.td | 26 +++--- 4 files changed, 52 insertions(+), 15 deletions(-) diff --git a/lib/Target/R600/SIISelLowering.cpp b/lib/Target/R600/SIISelLowering.cpp index 371572e..e9f4035 100644 --- a/lib/Target/R600/SIISelLowering.cpp +++ b/lib/Target/R600/SIISelLowering.cpp @@ -14,6 +14,7 @@ #include SIISelLowering.h #include AMDGPU.h +#include AMDGPUSubtarget.h #include AMDILIntrinsicInfo.h #include SIInstrInfo.h #include SIMachineFunctionInfo.h @@ -302,14 +303,37 @@ MachineBasicBlock * SITargetLowering::EmitInstrWithCustomInserter( MachineInstr * MI, MachineBasicBlock * BB) const { MachineBasicBlock::iterator I = *MI; + const SIInstrInfo *TII = +static_castconst SIInstrInfo*(getTargetMachine().getInstrInfo()); + + // Sea Islands must conditionally execute SMRD instructions depending + // on the value of SQ_BUF_RSRC_WORD2.NUM_RECORDS, because the hardware + // doesn't skip the instructions if NUM_RECORDS is 0. + if (TII-isSMRD(MI-getOpcode())) { +if (getTargetMachine().getSubtargetAMDGPUSubtarget().getGeneration() != +AMDGPUSubtarget::SEA_ISLANDS) + return BB; + +MachineRegisterInfo MRI = BB-getParent()-getRegInfo(); +unsigned NumRecords = MRI.createVirtualRegister(AMDGPU::SReg_32RegClass); + +// XXX should we save and restore the SCC register? +BuildMI(*BB, I, MI-getDebugLoc(), TII-get(AMDGPU::COPY), NumRecords) +.addReg(MI-getOperand(1).getReg(), 0, AMDGPU::sub2); +BuildMI(*BB, I, MI-getDebugLoc(), TII-get(AMDGPU::S_CMPK_EQ_U32), AMDGPU::SCC) +.addReg(NumRecords) +.addImm(0); +BuildMI(*BB, I, MI-getDebugLoc(), TII-get(AMDGPU::S_CBRANCH_SCC1)) +.addImm(1) +.addReg(AMDGPU::SCC); +return BB; + } switch (MI-getOpcode()) { default: return AMDGPUTargetLowering::EmitInstrWithCustomInserter(MI, BB); case AMDGPU::BRANCH: return BB; case AMDGPU::SI_ADDR64_RSRC: { -const SIInstrInfo *TII = - static_castconst SIInstrInfo*(getTargetMachine().getInstrInfo()); MachineRegisterInfo MRI = BB-getParent()-getRegInfo(); unsigned SuperReg = MI-getOperand(0).getReg(); unsigned SubRegLo = MRI.createVirtualRegister(AMDGPU::SReg_64RegClass); @@ -336,8 +360,6 @@ MachineBasicBlock * SITargetLowering::EmitInstrWithCustomInserter( break; } case AMDGPU::V_SUB_F64: { -const SIInstrInfo *TII = - static_castconst SIInstrInfo*(getTargetMachine().getInstrInfo()); BuildMI(*BB, I, MI-getDebugLoc(), TII-get(AMDGPU::V_ADD_F64), MI-getOperand(0).getReg()) .addReg(MI-getOperand(1).getReg()) diff --git a/lib/Target/R600/SIInsertWaits.cpp b/lib/Target/R600/SIInsertWaits.cpp index 7e42fb7..2e47346 100644 --- a/lib/Target/R600/SIInsertWaits.cpp +++ b/lib/Target/R600/SIInsertWaits.cpp @@ -294,6 +294,12 @@ bool SIInsertWaits::insertWait(MachineBasicBlock MBB, if
Re: [Mesa-dev] [PATCH] R600/SI: add a workaround for a scalar-memory-read hw bug on CI
The symptom is a VM protection fault with the address of 0 (probably because the whole descriptor contains zeros), which should be harmless, but it spams dmesg. Marek On Wed, Oct 30, 2013 at 5:19 PM, Christian König deathsim...@vodafone.de wrote: Off hand I don't know any use case exept constant buffers where we use S_BUFFER_LOAD, but anybody who uses it should be aware how to use it. What are the symptoms of issuing a S_BUFFER_LOAD with NumRecords=0? Hangs or just undefined behaviour? Christian. Am 30.10.2013 17:00, schrieb Marek Olšák: Yeah, it's unusual. What if S_BUFFER_LOAD is also used by something else, like texture buffers, or OpenCL? Will we have to fix that as well? Marek On Wed, Oct 30, 2013 at 3:32 PM, Christian König deathsim...@vodafone.de wrote: Mhm, I'm assumed that having NumRecord zero is actually something quite unusual. E.g. a shader that accesses a not defined constant buffer or something like that. So I would rather optimize for the common use case. Anyway branch instructions are quite expensive, you can issue something between 12 and 16 arithmetic instructions before they make sense to use instead of a conditional move. Not sure how the relation is with memory moves but I guess that it would be better to avoid them. Christian. Am 30.10.2013 14:59, schrieb Marek Olšák: I thought that doing S_CMPK followed by S_CBRANCH has less overhead than doing a memory read. If we used one of S_BUFFER_LOAD_DWORDX2,4,8,16, it wouldn't be so bad. I don't know. Marek On Wed, Oct 30, 2013 at 2:48 PM, Christian König deathsim...@vodafone.de wrote: Am 30.10.2013 14:23, schrieb Marek Olšák: From: Marek Olšák marek.ol...@amd.com This also fixes scalar compare instructions which were always eliminated, because they didn't have a destination of SCC. Uff, that looks like quite a bit of overhead, isn't there a simpler approach? Like setting the the NumRecord to one and letting unused constants pointing to a dummy buffer or soemthing like this? Christian. Signed-off-by: Marek Olšák marek.ol...@amd.com --- lib/Target/R600/SIISelLowering.cpp | 30 ++ lib/Target/R600/SIInsertWaits.cpp | 6 ++ lib/Target/R600/SIInstrInfo.td | 5 + lib/Target/R600/SIInstructions.td | 26 +++--- 4 files changed, 52 insertions(+), 15 deletions(-) diff --git a/lib/Target/R600/SIISelLowering.cpp b/lib/Target/R600/SIISelLowering.cpp index 371572e..e9f4035 100644 --- a/lib/Target/R600/SIISelLowering.cpp +++ b/lib/Target/R600/SIISelLowering.cpp @@ -14,6 +14,7 @@ #include SIISelLowering.h #include AMDGPU.h +#include AMDGPUSubtarget.h #include AMDILIntrinsicInfo.h #include SIInstrInfo.h #include SIMachineFunctionInfo.h @@ -302,14 +303,37 @@ MachineBasicBlock * SITargetLowering::EmitInstrWithCustomInserter( MachineInstr * MI, MachineBasicBlock * BB) const { MachineBasicBlock::iterator I = *MI; + const SIInstrInfo *TII = +static_castconst SIInstrInfo*(getTargetMachine().getInstrInfo()); + + // Sea Islands must conditionally execute SMRD instructions depending + // on the value of SQ_BUF_RSRC_WORD2.NUM_RECORDS, because the hardware + // doesn't skip the instructions if NUM_RECORDS is 0. + if (TII-isSMRD(MI-getOpcode())) { +if (getTargetMachine().getSubtargetAMDGPUSubtarget().getGeneration() != +AMDGPUSubtarget::SEA_ISLANDS) + return BB; + +MachineRegisterInfo MRI = BB-getParent()-getRegInfo(); +unsigned NumRecords = MRI.createVirtualRegister(AMDGPU::SReg_32RegClass); + +// XXX should we save and restore the SCC register? +BuildMI(*BB, I, MI-getDebugLoc(), TII-get(AMDGPU::COPY), NumRecords) +.addReg(MI-getOperand(1).getReg(), 0, AMDGPU::sub2); +BuildMI(*BB, I, MI-getDebugLoc(), TII-get(AMDGPU::S_CMPK_EQ_U32), AMDGPU::SCC) +.addReg(NumRecords) +.addImm(0); +BuildMI(*BB, I, MI-getDebugLoc(), TII-get(AMDGPU::S_CBRANCH_SCC1)) +.addImm(1) +.addReg(AMDGPU::SCC); +return BB; + } switch (MI-getOpcode()) { default: return AMDGPUTargetLowering::EmitInstrWithCustomInserter(MI, BB); case AMDGPU::BRANCH: return BB; case AMDGPU::SI_ADDR64_RSRC: { -const SIInstrInfo *TII = - static_castconst SIInstrInfo*(getTargetMachine().getInstrInfo()); MachineRegisterInfo MRI = BB-getParent()-getRegInfo(); unsigned SuperReg = MI-getOperand(0).getReg(); unsigned SubRegLo = MRI.createVirtualRegister(AMDGPU::SReg_64RegClass); @@ -336,8 +360,6 @@ MachineBasicBlock * SITargetLowering::EmitInstrWithCustomInserter( break; } case AMDGPU::V_SUB_F64: { -const SIInstrInfo *TII = - static_castconst SIInstrInfo*(getTargetMachine().getInstrInfo()); BuildMI(*BB, I, MI-getDebugLoc(), TII-get(AMDGPU::V_ADD_F64),
Re: [Mesa-dev] [PATCH] R600/SI: add a workaround for a scalar-memory-read hw bug on CI
Please disregard this patch. I've implemented the workaround in Mesa as Christian suggested. Marek On Wed, Oct 30, 2013 at 5:19 PM, Christian König deathsim...@vodafone.de wrote: Off hand I don't know any use case exept constant buffers where we use S_BUFFER_LOAD, but anybody who uses it should be aware how to use it. What are the symptoms of issuing a S_BUFFER_LOAD with NumRecords=0? Hangs or just undefined behaviour? Christian. Am 30.10.2013 17:00, schrieb Marek Olšák: Yeah, it's unusual. What if S_BUFFER_LOAD is also used by something else, like texture buffers, or OpenCL? Will we have to fix that as well? Marek On Wed, Oct 30, 2013 at 3:32 PM, Christian König deathsim...@vodafone.de wrote: Mhm, I'm assumed that having NumRecord zero is actually something quite unusual. E.g. a shader that accesses a not defined constant buffer or something like that. So I would rather optimize for the common use case. Anyway branch instructions are quite expensive, you can issue something between 12 and 16 arithmetic instructions before they make sense to use instead of a conditional move. Not sure how the relation is with memory moves but I guess that it would be better to avoid them. Christian. Am 30.10.2013 14:59, schrieb Marek Olšák: I thought that doing S_CMPK followed by S_CBRANCH has less overhead than doing a memory read. If we used one of S_BUFFER_LOAD_DWORDX2,4,8,16, it wouldn't be so bad. I don't know. Marek On Wed, Oct 30, 2013 at 2:48 PM, Christian König deathsim...@vodafone.de wrote: Am 30.10.2013 14:23, schrieb Marek Olšák: From: Marek Olšák marek.ol...@amd.com This also fixes scalar compare instructions which were always eliminated, because they didn't have a destination of SCC. Uff, that looks like quite a bit of overhead, isn't there a simpler approach? Like setting the the NumRecord to one and letting unused constants pointing to a dummy buffer or soemthing like this? Christian. Signed-off-by: Marek Olšák marek.ol...@amd.com --- lib/Target/R600/SIISelLowering.cpp | 30 ++ lib/Target/R600/SIInsertWaits.cpp | 6 ++ lib/Target/R600/SIInstrInfo.td | 5 + lib/Target/R600/SIInstructions.td | 26 +++--- 4 files changed, 52 insertions(+), 15 deletions(-) diff --git a/lib/Target/R600/SIISelLowering.cpp b/lib/Target/R600/SIISelLowering.cpp index 371572e..e9f4035 100644 --- a/lib/Target/R600/SIISelLowering.cpp +++ b/lib/Target/R600/SIISelLowering.cpp @@ -14,6 +14,7 @@ #include SIISelLowering.h #include AMDGPU.h +#include AMDGPUSubtarget.h #include AMDILIntrinsicInfo.h #include SIInstrInfo.h #include SIMachineFunctionInfo.h @@ -302,14 +303,37 @@ MachineBasicBlock * SITargetLowering::EmitInstrWithCustomInserter( MachineInstr * MI, MachineBasicBlock * BB) const { MachineBasicBlock::iterator I = *MI; + const SIInstrInfo *TII = +static_castconst SIInstrInfo*(getTargetMachine().getInstrInfo()); + + // Sea Islands must conditionally execute SMRD instructions depending + // on the value of SQ_BUF_RSRC_WORD2.NUM_RECORDS, because the hardware + // doesn't skip the instructions if NUM_RECORDS is 0. + if (TII-isSMRD(MI-getOpcode())) { +if (getTargetMachine().getSubtargetAMDGPUSubtarget().getGeneration() != +AMDGPUSubtarget::SEA_ISLANDS) + return BB; + +MachineRegisterInfo MRI = BB-getParent()-getRegInfo(); +unsigned NumRecords = MRI.createVirtualRegister(AMDGPU::SReg_32RegClass); + +// XXX should we save and restore the SCC register? +BuildMI(*BB, I, MI-getDebugLoc(), TII-get(AMDGPU::COPY), NumRecords) +.addReg(MI-getOperand(1).getReg(), 0, AMDGPU::sub2); +BuildMI(*BB, I, MI-getDebugLoc(), TII-get(AMDGPU::S_CMPK_EQ_U32), AMDGPU::SCC) +.addReg(NumRecords) +.addImm(0); +BuildMI(*BB, I, MI-getDebugLoc(), TII-get(AMDGPU::S_CBRANCH_SCC1)) +.addImm(1) +.addReg(AMDGPU::SCC); +return BB; + } switch (MI-getOpcode()) { default: return AMDGPUTargetLowering::EmitInstrWithCustomInserter(MI, BB); case AMDGPU::BRANCH: return BB; case AMDGPU::SI_ADDR64_RSRC: { -const SIInstrInfo *TII = - static_castconst SIInstrInfo*(getTargetMachine().getInstrInfo()); MachineRegisterInfo MRI = BB-getParent()-getRegInfo(); unsigned SuperReg = MI-getOperand(0).getReg(); unsigned SubRegLo = MRI.createVirtualRegister(AMDGPU::SReg_64RegClass); @@ -336,8 +360,6 @@ MachineBasicBlock * SITargetLowering::EmitInstrWithCustomInserter( break; } case AMDGPU::V_SUB_F64: { -const SIInstrInfo *TII = - static_castconst SIInstrInfo*(getTargetMachine().getInstrInfo()); BuildMI(*BB, I, MI-getDebugLoc(), TII-get(AMDGPU::V_ADD_F64), MI-getOperand(0).getReg())