Re: [Mesa-dev] [PATCH] R600/SI: Custom select 64-bit ADD
On Thu, Feb 13, 2014 at 07:56:26AM -0800, Matt Arsenault wrote: On Feb 7, 2014, at 7:46 AM, Tom Stellard t...@stellard.net wrote: From: Tom Stellard thomas.stell...@amd.com --- lib/Target/R600/AMDGPUISelDAGToDAG.cpp | 48 ++ lib/Target/R600/SIISelLowering.cpp | 29 lib/Target/R600/SIISelLowering.h | 1 - test/CodeGen/R600/add.ll | 10 +++ test/CodeGen/R600/add_i64.ll | 23 +++- 5 files changed, 75 insertions(+), 36 deletions(-) diff --git a/lib/Target/R600/AMDGPUISelDAGToDAG.cpp b/lib/Target/R600/AMDGPUISelDAGToDAG.cpp index a989135..fea875c 100644 --- a/lib/Target/R600/AMDGPUISelDAGToDAG.cpp +++ b/lib/Target/R600/AMDGPUISelDAGToDAG.cpp @@ -200,6 +200,54 @@ SDNode *AMDGPUDAGToDAGISel::Select(SDNode *N) { } switch (Opc) { default: break; + // We are selecting i64 ADD here instead of custom lower it during + // DAG legalization, so we can fold some i64 ADDs used for address + // calculation into the LOAD and STORE instructions. + case ISD::ADD: { +const AMDGPUSubtarget ST = TM.getSubtargetAMDGPUSubtarget(); +if (N-getValueType(0) != MVT::i64 || +ST.getGeneration() AMDGPUSubtarget::SOUTHERN_ISLANDS) + break; + +SDLoc DL(N); +SDValue LHS = N-getOperand(0); +SDValue RHS = N-getOperand(1); + +SDValue Sub0 = CurDAG-getTargetConstant(AMDGPU::sub0, MVT::i32); +SDValue Sub1 = CurDAG-getTargetConstant(AMDGPU::sub1, MVT::i32); + +SDNode *Lo0 = CurDAG-getMachineNode(TargetOpcode::EXTRACT_SUBREG, + DL, MVT::i32, LHS, Sub0); +SDNode *Hi0 = CurDAG-getMachineNode(TargetOpcode::EXTRACT_SUBREG, + DL, MVT::i32, LHS, Sub1); + +SDNode *Lo1 = CurDAG-getMachineNode(TargetOpcode::EXTRACT_SUBREG, + DL, MVT::i32, RHS, Sub0); +SDNode *Hi1 = CurDAG-getMachineNode(TargetOpcode::EXTRACT_SUBREG, + DL, MVT::i32, RHS, Sub1); + +SDVTList VTList = CurDAG-getVTList(MVT::i32, MVT::Glue); + +SmallVectorSDValue, 8 AddLoArgs; +AddLoArgs.push_back(SDValue(Lo0, 0)); +AddLoArgs.push_back(SDValue(Lo1, 0)); + +SDNode *AddLo = CurDAG-getMachineNode(AMDGPU::S_ADD_I32, DL, + VTList, AddLoArgs); +SDValue Carry = SDValue(AddLo, 1); +SDNode *AddHi = CurDAG-getMachineNode(AMDGPU::S_ADDC_U32, DL, + MVT::i32, SDValue(Hi0, 0), + SDValue(Hi1, 0), Carry); + +SDValue Args[5] = { + CurDAG-getTargetConstant(AMDGPU::SReg_64RegClassID, MVT::i32), + SDValue(AddLo,0), + Sub0, + SDValue(AddHi,0), + Sub1, +}; +return CurDAG-SelectNodeTo(N, AMDGPU::REG_SEQUENCE, MVT::i64, Args, 5); + } case ISD::BUILD_VECTOR: { unsigned RegClassID; const AMDGPUSubtarget ST = TM.getSubtargetAMDGPUSubtarget(); diff --git a/lib/Target/R600/SIISelLowering.cpp b/lib/Target/R600/SIISelLowering.cpp index 0a22d16..4d2f370 100644 --- a/lib/Target/R600/SIISelLowering.cpp +++ b/lib/Target/R600/SIISelLowering.cpp @@ -76,7 +76,6 @@ SITargetLowering::SITargetLowering(TargetMachine TM) : setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v16i32, Expand); setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v16f32, Expand); - setOperationAction(ISD::ADD, MVT::i64, Legal); Would it be better to mark this as custom lowered, and then just return SDValue() for it? That way it won’t be incorrectly reported as legal for anything that might be checking. That's an interesting idea, but if we do that then the legalizer will try to expand the nodes, which isn't much of a problem now since there is no ExpandNode implementation for ISD:ADD, but someone may want to add one in the future. I think most places use isLegalOrCustom(), so it may not make much of a difference anyway. -Tom setOperationAction(ISD::ADD, MVT::i32, Legal); setOperationAction(ISD::ADDC, MVT::i32, Legal); setOperationAction(ISD::ADDE, MVT::i32, Legal); @@ -475,7 +474,6 @@ SDValue SITargetLowering::LowerOperation(SDValue Op, SelectionDAG DAG) const { SIMachineFunctionInfo *MFI = MF.getInfoSIMachineFunctionInfo(); switch (Op.getOpcode()) { default: return AMDGPUTargetLowering::LowerOperation(Op, DAG); - case ISD::ADD: return LowerADD(Op, DAG); case ISD::BRCOND: return LowerBRCOND(Op, DAG); case ISD::LOAD: { LoadSDNode *Load = dyn_castLoadSDNode(Op); @@ -613,33 +611,6 @@ SDValue SITargetLowering::LowerOperation(SDValue Op, SelectionDAG DAG) const { return SDValue(); } -SDValue SITargetLowering::LowerADD(SDValue Op, - SelectionDAG DAG) const { - if
Re: [Mesa-dev] [PATCH] R600/SI: Custom select 64-bit ADD
On Feb 7, 2014, at 7:46 AM, Tom Stellard t...@stellard.net wrote: From: Tom Stellard thomas.stell...@amd.com --- lib/Target/R600/AMDGPUISelDAGToDAG.cpp | 48 ++ lib/Target/R600/SIISelLowering.cpp | 29 lib/Target/R600/SIISelLowering.h | 1 - test/CodeGen/R600/add.ll | 10 +++ test/CodeGen/R600/add_i64.ll | 23 +++- 5 files changed, 75 insertions(+), 36 deletions(-) diff --git a/lib/Target/R600/AMDGPUISelDAGToDAG.cpp b/lib/Target/R600/AMDGPUISelDAGToDAG.cpp index a989135..fea875c 100644 --- a/lib/Target/R600/AMDGPUISelDAGToDAG.cpp +++ b/lib/Target/R600/AMDGPUISelDAGToDAG.cpp @@ -200,6 +200,54 @@ SDNode *AMDGPUDAGToDAGISel::Select(SDNode *N) { } switch (Opc) { default: break; + // We are selecting i64 ADD here instead of custom lower it during + // DAG legalization, so we can fold some i64 ADDs used for address + // calculation into the LOAD and STORE instructions. + case ISD::ADD: { +const AMDGPUSubtarget ST = TM.getSubtargetAMDGPUSubtarget(); +if (N-getValueType(0) != MVT::i64 || +ST.getGeneration() AMDGPUSubtarget::SOUTHERN_ISLANDS) + break; + +SDLoc DL(N); +SDValue LHS = N-getOperand(0); +SDValue RHS = N-getOperand(1); + +SDValue Sub0 = CurDAG-getTargetConstant(AMDGPU::sub0, MVT::i32); +SDValue Sub1 = CurDAG-getTargetConstant(AMDGPU::sub1, MVT::i32); + +SDNode *Lo0 = CurDAG-getMachineNode(TargetOpcode::EXTRACT_SUBREG, + DL, MVT::i32, LHS, Sub0); +SDNode *Hi0 = CurDAG-getMachineNode(TargetOpcode::EXTRACT_SUBREG, + DL, MVT::i32, LHS, Sub1); + +SDNode *Lo1 = CurDAG-getMachineNode(TargetOpcode::EXTRACT_SUBREG, + DL, MVT::i32, RHS, Sub0); +SDNode *Hi1 = CurDAG-getMachineNode(TargetOpcode::EXTRACT_SUBREG, + DL, MVT::i32, RHS, Sub1); + +SDVTList VTList = CurDAG-getVTList(MVT::i32, MVT::Glue); + +SmallVectorSDValue, 8 AddLoArgs; +AddLoArgs.push_back(SDValue(Lo0, 0)); +AddLoArgs.push_back(SDValue(Lo1, 0)); + +SDNode *AddLo = CurDAG-getMachineNode(AMDGPU::S_ADD_I32, DL, + VTList, AddLoArgs); +SDValue Carry = SDValue(AddLo, 1); +SDNode *AddHi = CurDAG-getMachineNode(AMDGPU::S_ADDC_U32, DL, + MVT::i32, SDValue(Hi0, 0), + SDValue(Hi1, 0), Carry); + +SDValue Args[5] = { + CurDAG-getTargetConstant(AMDGPU::SReg_64RegClassID, MVT::i32), + SDValue(AddLo,0), + Sub0, + SDValue(AddHi,0), + Sub1, +}; +return CurDAG-SelectNodeTo(N, AMDGPU::REG_SEQUENCE, MVT::i64, Args, 5); + } case ISD::BUILD_VECTOR: { unsigned RegClassID; const AMDGPUSubtarget ST = TM.getSubtargetAMDGPUSubtarget(); diff --git a/lib/Target/R600/SIISelLowering.cpp b/lib/Target/R600/SIISelLowering.cpp index 0a22d16..4d2f370 100644 --- a/lib/Target/R600/SIISelLowering.cpp +++ b/lib/Target/R600/SIISelLowering.cpp @@ -76,7 +76,6 @@ SITargetLowering::SITargetLowering(TargetMachine TM) : setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v16i32, Expand); setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v16f32, Expand); - setOperationAction(ISD::ADD, MVT::i64, Legal); Would it be better to mark this as custom lowered, and then just return SDValue() for it? That way it won’t be incorrectly reported as legal for anything that might be checking. setOperationAction(ISD::ADD, MVT::i32, Legal); setOperationAction(ISD::ADDC, MVT::i32, Legal); setOperationAction(ISD::ADDE, MVT::i32, Legal); @@ -475,7 +474,6 @@ SDValue SITargetLowering::LowerOperation(SDValue Op, SelectionDAG DAG) const { SIMachineFunctionInfo *MFI = MF.getInfoSIMachineFunctionInfo(); switch (Op.getOpcode()) { default: return AMDGPUTargetLowering::LowerOperation(Op, DAG); - case ISD::ADD: return LowerADD(Op, DAG); case ISD::BRCOND: return LowerBRCOND(Op, DAG); case ISD::LOAD: { LoadSDNode *Load = dyn_castLoadSDNode(Op); @@ -613,33 +611,6 @@ SDValue SITargetLowering::LowerOperation(SDValue Op, SelectionDAG DAG) const { return SDValue(); } -SDValue SITargetLowering::LowerADD(SDValue Op, - SelectionDAG DAG) const { - if (Op.getValueType() != MVT::i64) -return SDValue(); - - SDLoc DL(Op); - SDValue LHS = Op.getOperand(0); - SDValue RHS = Op.getOperand(1); - - SDValue Zero = DAG.getConstant(0, MVT::i32); - SDValue One = DAG.getConstant(1, MVT::i32); - - SDValue Lo0 = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, MVT::i32, LHS, Zero); - SDValue Hi0 = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, MVT::i32, LHS, One); - - SDValue Lo1 = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, MVT::i32, RHS, Zero); - SDValue Hi1 =
Re: [Mesa-dev] [PATCH] R600/SI: Custom select 64-bit ADD
I didn't think to try this. Where is the address folding happening? On 02/07/2014 07:46 AM, Tom Stellard wrote: From: Tom Stellard thomas.stell...@amd.com --- lib/Target/R600/AMDGPUISelDAGToDAG.cpp | 48 ++ lib/Target/R600/SIISelLowering.cpp | 29 lib/Target/R600/SIISelLowering.h | 1 - test/CodeGen/R600/add.ll | 10 +++ test/CodeGen/R600/add_i64.ll | 23 +++- 5 files changed, 75 insertions(+), 36 deletions(-) diff --git a/lib/Target/R600/AMDGPUISelDAGToDAG.cpp b/lib/Target/R600/AMDGPUISelDAGToDAG.cpp index a989135..fea875c 100644 --- a/lib/Target/R600/AMDGPUISelDAGToDAG.cpp +++ b/lib/Target/R600/AMDGPUISelDAGToDAG.cpp @@ -200,6 +200,54 @@ SDNode *AMDGPUDAGToDAGISel::Select(SDNode *N) { } switch (Opc) { default: break; + // We are selecting i64 ADD here instead of custom lower it during + // DAG legalization, so we can fold some i64 ADDs used for address + // calculation into the LOAD and STORE instructions. + case ISD::ADD: { +const AMDGPUSubtarget ST = TM.getSubtargetAMDGPUSubtarget(); +if (N-getValueType(0) != MVT::i64 || +ST.getGeneration() AMDGPUSubtarget::SOUTHERN_ISLANDS) + break; + +SDLoc DL(N); +SDValue LHS = N-getOperand(0); +SDValue RHS = N-getOperand(1); + +SDValue Sub0 = CurDAG-getTargetConstant(AMDGPU::sub0, MVT::i32); +SDValue Sub1 = CurDAG-getTargetConstant(AMDGPU::sub1, MVT::i32); + +SDNode *Lo0 = CurDAG-getMachineNode(TargetOpcode::EXTRACT_SUBREG, + DL, MVT::i32, LHS, Sub0); +SDNode *Hi0 = CurDAG-getMachineNode(TargetOpcode::EXTRACT_SUBREG, + DL, MVT::i32, LHS, Sub1); + +SDNode *Lo1 = CurDAG-getMachineNode(TargetOpcode::EXTRACT_SUBREG, + DL, MVT::i32, RHS, Sub0); +SDNode *Hi1 = CurDAG-getMachineNode(TargetOpcode::EXTRACT_SUBREG, + DL, MVT::i32, RHS, Sub1); + +SDVTList VTList = CurDAG-getVTList(MVT::i32, MVT::Glue); + +SmallVectorSDValue, 8 AddLoArgs; +AddLoArgs.push_back(SDValue(Lo0, 0)); +AddLoArgs.push_back(SDValue(Lo1, 0)); + +SDNode *AddLo = CurDAG-getMachineNode(AMDGPU::S_ADD_I32, DL, + VTList, AddLoArgs); +SDValue Carry = SDValue(AddLo, 1); +SDNode *AddHi = CurDAG-getMachineNode(AMDGPU::S_ADDC_U32, DL, + MVT::i32, SDValue(Hi0, 0), + SDValue(Hi1, 0), Carry); + +SDValue Args[5] = { + CurDAG-getTargetConstant(AMDGPU::SReg_64RegClassID, MVT::i32), + SDValue(AddLo,0), + Sub0, + SDValue(AddHi,0), + Sub1, +}; +return CurDAG-SelectNodeTo(N, AMDGPU::REG_SEQUENCE, MVT::i64, Args, 5); + } case ISD::BUILD_VECTOR: { unsigned RegClassID; const AMDGPUSubtarget ST = TM.getSubtargetAMDGPUSubtarget(); diff --git a/lib/Target/R600/SIISelLowering.cpp b/lib/Target/R600/SIISelLowering.cpp index 0a22d16..4d2f370 100644 --- a/lib/Target/R600/SIISelLowering.cpp +++ b/lib/Target/R600/SIISelLowering.cpp @@ -76,7 +76,6 @@ SITargetLowering::SITargetLowering(TargetMachine TM) : setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v16i32, Expand); setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v16f32, Expand); - setOperationAction(ISD::ADD, MVT::i64, Legal); setOperationAction(ISD::ADD, MVT::i32, Legal); setOperationAction(ISD::ADDC, MVT::i32, Legal); setOperationAction(ISD::ADDE, MVT::i32, Legal); @@ -475,7 +474,6 @@ SDValue SITargetLowering::LowerOperation(SDValue Op, SelectionDAG DAG) const { SIMachineFunctionInfo *MFI = MF.getInfoSIMachineFunctionInfo(); switch (Op.getOpcode()) { default: return AMDGPUTargetLowering::LowerOperation(Op, DAG); - case ISD::ADD: return LowerADD(Op, DAG); case ISD::BRCOND: return LowerBRCOND(Op, DAG); case ISD::LOAD: { LoadSDNode *Load = dyn_castLoadSDNode(Op); @@ -613,33 +611,6 @@ SDValue SITargetLowering::LowerOperation(SDValue Op, SelectionDAG DAG) const { return SDValue(); } -SDValue SITargetLowering::LowerADD(SDValue Op, - SelectionDAG DAG) const { - if (Op.getValueType() != MVT::i64) -return SDValue(); - - SDLoc DL(Op); - SDValue LHS = Op.getOperand(0); - SDValue RHS = Op.getOperand(1); - - SDValue Zero = DAG.getConstant(0, MVT::i32); - SDValue One = DAG.getConstant(1, MVT::i32); - - SDValue Lo0 = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, MVT::i32, LHS, Zero); - SDValue Hi0 = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, MVT::i32, LHS, One); - - SDValue Lo1 = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, MVT::i32, RHS, Zero); - SDValue Hi1 = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, MVT::i32, RHS, One); - - SDVTList VTList = DAG.getVTList(MVT::i32, MVT::Glue); - - SDValue AddLo = DAG.getNode(ISD::ADDC, DL, VTList, Lo0, Lo1); - SDValue Carry =
Re: [Mesa-dev] [PATCH] R600/SI: Custom select 64-bit ADD
On Fri, Feb 07, 2014 at 10:31:20AM -0800, Matt Arsenault wrote: I didn't think to try this. Where is the address folding happening? There are TableGen patterns that do the folding. I recently added several new ones: r200932-r200935. -Tom On 02/07/2014 07:46 AM, Tom Stellard wrote: From: Tom Stellard thomas.stell...@amd.com --- lib/Target/R600/AMDGPUISelDAGToDAG.cpp | 48 ++ lib/Target/R600/SIISelLowering.cpp | 29 lib/Target/R600/SIISelLowering.h | 1 - test/CodeGen/R600/add.ll | 10 +++ test/CodeGen/R600/add_i64.ll | 23 +++- 5 files changed, 75 insertions(+), 36 deletions(-) diff --git a/lib/Target/R600/AMDGPUISelDAGToDAG.cpp b/lib/Target/R600/AMDGPUISelDAGToDAG.cpp index a989135..fea875c 100644 --- a/lib/Target/R600/AMDGPUISelDAGToDAG.cpp +++ b/lib/Target/R600/AMDGPUISelDAGToDAG.cpp @@ -200,6 +200,54 @@ SDNode *AMDGPUDAGToDAGISel::Select(SDNode *N) { } switch (Opc) { default: break; + // We are selecting i64 ADD here instead of custom lower it during + // DAG legalization, so we can fold some i64 ADDs used for address + // calculation into the LOAD and STORE instructions. + case ISD::ADD: { +const AMDGPUSubtarget ST = TM.getSubtargetAMDGPUSubtarget(); +if (N-getValueType(0) != MVT::i64 || +ST.getGeneration() AMDGPUSubtarget::SOUTHERN_ISLANDS) + break; + +SDLoc DL(N); +SDValue LHS = N-getOperand(0); +SDValue RHS = N-getOperand(1); + +SDValue Sub0 = CurDAG-getTargetConstant(AMDGPU::sub0, MVT::i32); +SDValue Sub1 = CurDAG-getTargetConstant(AMDGPU::sub1, MVT::i32); + +SDNode *Lo0 = CurDAG-getMachineNode(TargetOpcode::EXTRACT_SUBREG, + DL, MVT::i32, LHS, Sub0); +SDNode *Hi0 = CurDAG-getMachineNode(TargetOpcode::EXTRACT_SUBREG, + DL, MVT::i32, LHS, Sub1); + +SDNode *Lo1 = CurDAG-getMachineNode(TargetOpcode::EXTRACT_SUBREG, + DL, MVT::i32, RHS, Sub0); +SDNode *Hi1 = CurDAG-getMachineNode(TargetOpcode::EXTRACT_SUBREG, + DL, MVT::i32, RHS, Sub1); + +SDVTList VTList = CurDAG-getVTList(MVT::i32, MVT::Glue); + +SmallVectorSDValue, 8 AddLoArgs; +AddLoArgs.push_back(SDValue(Lo0, 0)); +AddLoArgs.push_back(SDValue(Lo1, 0)); + +SDNode *AddLo = CurDAG-getMachineNode(AMDGPU::S_ADD_I32, DL, + VTList, AddLoArgs); +SDValue Carry = SDValue(AddLo, 1); +SDNode *AddHi = CurDAG-getMachineNode(AMDGPU::S_ADDC_U32, DL, + MVT::i32, SDValue(Hi0, 0), + SDValue(Hi1, 0), Carry); + +SDValue Args[5] = { + CurDAG-getTargetConstant(AMDGPU::SReg_64RegClassID, MVT::i32), + SDValue(AddLo,0), + Sub0, + SDValue(AddHi,0), + Sub1, +}; +return CurDAG-SelectNodeTo(N, AMDGPU::REG_SEQUENCE, MVT::i64, Args, 5); + } case ISD::BUILD_VECTOR: { unsigned RegClassID; const AMDGPUSubtarget ST = TM.getSubtargetAMDGPUSubtarget(); diff --git a/lib/Target/R600/SIISelLowering.cpp b/lib/Target/R600/SIISelLowering.cpp index 0a22d16..4d2f370 100644 --- a/lib/Target/R600/SIISelLowering.cpp +++ b/lib/Target/R600/SIISelLowering.cpp @@ -76,7 +76,6 @@ SITargetLowering::SITargetLowering(TargetMachine TM) : setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v16i32, Expand); setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v16f32, Expand); - setOperationAction(ISD::ADD, MVT::i64, Legal); setOperationAction(ISD::ADD, MVT::i32, Legal); setOperationAction(ISD::ADDC, MVT::i32, Legal); setOperationAction(ISD::ADDE, MVT::i32, Legal); @@ -475,7 +474,6 @@ SDValue SITargetLowering::LowerOperation(SDValue Op, SelectionDAG DAG) const { SIMachineFunctionInfo *MFI = MF.getInfoSIMachineFunctionInfo(); switch (Op.getOpcode()) { default: return AMDGPUTargetLowering::LowerOperation(Op, DAG); - case ISD::ADD: return LowerADD(Op, DAG); case ISD::BRCOND: return LowerBRCOND(Op, DAG); case ISD::LOAD: { LoadSDNode *Load = dyn_castLoadSDNode(Op); @@ -613,33 +611,6 @@ SDValue SITargetLowering::LowerOperation(SDValue Op, SelectionDAG DAG) const { return SDValue(); } -SDValue SITargetLowering::LowerADD(SDValue Op, - SelectionDAG DAG) const { - if (Op.getValueType() != MVT::i64) -return SDValue(); - - SDLoc DL(Op); - SDValue LHS = Op.getOperand(0); - SDValue RHS = Op.getOperand(1); - - SDValue Zero = DAG.getConstant(0, MVT::i32); - SDValue One = DAG.getConstant(1, MVT::i32); - - SDValue Lo0 = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, MVT::i32, LHS, Zero); - SDValue Hi0 = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, MVT::i32, LHS, One); - - SDValue Lo1 =