On Feb 10, 2008, at 8:19 PM, Nate Begeman wrote: > > + > + if (Subtarget->hasSSE41()) { > ... ... > > + if (Subtarget->is64Bit()) { > + setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v2i64, Legal); > + setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v2f64, Legal); > + > + setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2i64, Legal); > + setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2f64, Legal); > + } > + }
I don't see the corresponding patterns? This breaks X86/illegal-insert.ll on SSE4-capable machines. I am going to change them from 'legal' to 'custom' for now. Please fix if that's not right. Thanks, Evan > > > // We want to custom lower some of our intrinsics. > setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom); > @@ -3655,10 +3682,34 @@ > } > > SDOperand > +X86TargetLowering::LowerEXTRACT_VECTOR_ELT_SSE4(SDOperand Op, > + SelectionDAG &DAG) { > + MVT::ValueType VT = Op.getValueType(); > + if (MVT::getSizeInBits(VT) == 8) { > + SDOperand Extract = DAG.getNode(X86ISD::PEXTRB, MVT::i32, > + Op.getOperand(0), > Op.getOperand(1)); > + SDOperand Assert = DAG.getNode(ISD::AssertZext, MVT::i32, > Extract, > + DAG.getValueType(VT)); > + return DAG.getNode(ISD::TRUNCATE, VT, Assert); > + } else if (MVT::getSizeInBits(VT) == 16) { > + SDOperand Extract = DAG.getNode(X86ISD::PEXTRW, MVT::i32, > + Op.getOperand(0), > Op.getOperand(1)); > + SDOperand Assert = DAG.getNode(ISD::AssertZext, MVT::i32, > Extract, > + DAG.getValueType(VT)); > + return DAG.getNode(ISD::TRUNCATE, VT, Assert); > + } > + return SDOperand(); > +} > + > + > +SDOperand > X86TargetLowering::LowerEXTRACT_VECTOR_ELT(SDOperand Op, > SelectionDAG &DAG) { > if (!isa<ConstantSDNode>(Op.getOperand(1))) > return SDOperand(); > > + if (Subtarget->hasSSE41()) > + return LowerEXTRACT_VECTOR_ELT_SSE4(Op, DAG); > + > MVT::ValueType VT = Op.getValueType(); > // TODO: handle v16i8. > if (MVT::getSizeInBits(VT) == 16) { > @@ -3699,6 +3750,9 @@ > return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, VT, Vec, > DAG.getIntPtrConstant(0)); > } else if (MVT::getSizeInBits(VT) == 64) { > + // FIXME: .td only matches this for <2 x f64>, not <2 x i64> on > 32b > + // FIXME: seems like this should be unnecessary if mov{h,l}pd > were taught > + // to match extract_elt for f64. 
> unsigned Idx = cast<ConstantSDNode>(Op.getOperand(1))->getValue(); > if (Idx == 0) > return Op; > @@ -3724,9 +3778,47 @@ > } > > SDOperand > +X86TargetLowering::LowerINSERT_VECTOR_ELT_SSE4(SDOperand Op, > SelectionDAG &DAG){ > + MVT::ValueType VT = Op.getValueType(); > + MVT::ValueType EVT = MVT::getVectorElementType(VT); > + > + SDOperand N0 = Op.getOperand(0); > + SDOperand N1 = Op.getOperand(1); > + SDOperand N2 = Op.getOperand(2); > + > + if ((MVT::getSizeInBits(EVT) == 8) || (MVT::getSizeInBits(EVT) == > 16)) { > + unsigned Opc = (MVT::getSizeInBits(EVT) == 8) ? X86ISD::PINSRB > + : X86ISD::PINSRW; > + // Transform it so it match pinsr{b,w} which expects a GR32 as > its second > + // argument. > + if (N1.getValueType() != MVT::i32) > + N1 = DAG.getNode(ISD::ANY_EXTEND, MVT::i32, N1); > + if (N2.getValueType() != MVT::i32) > + N2 = DAG.getIntPtrConstant(cast<ConstantSDNode>(N2)- > >getValue()); > + return DAG.getNode(Opc, VT, N0, N1, N2); > + } else if (EVT == MVT::f32) { > + // Bits [7:6] of the constant are the source select. This will > always be > + // zero here. The DAG Combiner may combine an extract_elt > index into these > + // bits. For example (insert (extract, 3), 2) could be > matched by putting > + // the '3' into bits [7:6] of X86ISD::INSERTPS. > + // Bits [5:4] of the constant are the destination select. This > is the > + // value of the incoming immediate. > + // Bits [3:0] of the constant are the zero mask. The DAG > Combiner may > + // combine either bitwise AND or insert of float 0.0 to set > these bits. 
> + N2 = DAG.getIntPtrConstant(cast<ConstantSDNode>(N2)->getValue() > << 4); > + return DAG.getNode(X86ISD::INSERTPS, VT, N0, N1, N2); > + } > + return SDOperand(); > +} > + > +SDOperand > X86TargetLowering::LowerINSERT_VECTOR_ELT(SDOperand Op, SelectionDAG > &DAG) { > MVT::ValueType VT = Op.getValueType(); > MVT::ValueType EVT = MVT::getVectorElementType(VT); > + > + if (Subtarget->hasSSE41()) > + return LowerINSERT_VECTOR_ELT_SSE4(Op, DAG); > + > if (EVT == MVT::i8) > return SDOperand(); > > @@ -5273,7 +5365,10 @@ > case X86ISD::GlobalBaseReg: return "X86ISD::GlobalBaseReg"; > case X86ISD::Wrapper: return "X86ISD::Wrapper"; > case X86ISD::S2VEC: return "X86ISD::S2VEC"; > + case X86ISD::PEXTRB: return "X86ISD::PEXTRB"; > case X86ISD::PEXTRW: return "X86ISD::PEXTRW"; > + case X86ISD::INSERTPS: return "X86ISD::INSERTPS"; > + case X86ISD::PINSRB: return "X86ISD::PINSRB"; > case X86ISD::PINSRW: return "X86ISD::PINSRW"; > case X86ISD::FMAX: return "X86ISD::FMAX"; > case X86ISD::FMIN: return "X86ISD::FMIN"; > > Modified: llvm/trunk/lib/Target/X86/X86ISelLowering.h > URL: > http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/X86/X86ISelLowering.h?rev=46949&r1=46948&r2=46949&view=diff > > = > = > = > = > = > = > = > = > ====================================================================== > --- llvm/trunk/lib/Target/X86/X86ISelLowering.h (original) > +++ llvm/trunk/lib/Target/X86/X86ISelLowering.h Sun Feb 10 22:19:36 > 2008 > @@ -170,10 +170,22 @@ > /// have to match the operand type. > S2VEC, > > + /// PEXTRB - Extract an 8-bit value from a vector and zero > extend it to > + /// i32, corresponds to X86::PEXTRB. > + PEXTRB, > + > /// PEXTRW - Extract a 16-bit value from a vector and zero > extend it to > /// i32, corresponds to X86::PEXTRW. > PEXTRW, > > + /// INSERTPS - Insert any element of a 4 x float vector into > any element > + /// of a destination 4 x floatvector. 
> + INSERTPS, > + > + /// PINSRB - Insert the lower 8-bits of a 32-bit value to a > vector, > + /// corresponds to X86::PINSRB. > + PINSRB, > + > /// PINSRW - Insert the lower 16-bits of a 32-bit value to a > vector, > /// corresponds to X86::PINSRW. > PINSRW, > @@ -493,7 +505,9 @@ > SDOperand LowerBUILD_VECTOR(SDOperand Op, SelectionDAG &DAG); > SDOperand LowerVECTOR_SHUFFLE(SDOperand Op, SelectionDAG &DAG); > SDOperand LowerEXTRACT_VECTOR_ELT(SDOperand Op, SelectionDAG > &DAG); > + SDOperand LowerEXTRACT_VECTOR_ELT_SSE4(SDOperand Op, > SelectionDAG &DAG); > SDOperand LowerINSERT_VECTOR_ELT(SDOperand Op, SelectionDAG &DAG); > + SDOperand LowerINSERT_VECTOR_ELT_SSE4(SDOperand Op, > SelectionDAG &DAG); > SDOperand LowerSCALAR_TO_VECTOR(SDOperand Op, SelectionDAG &DAG); > SDOperand LowerConstantPool(SDOperand Op, SelectionDAG &DAG); > SDOperand LowerGlobalAddress(SDOperand Op, SelectionDAG &DAG); > > Modified: llvm/trunk/lib/Target/X86/X86InstrSSE.td > URL: > http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/X86/X86InstrSSE.td?rev=46949&r1=46948&r2=46949&view=diff > > = > = > = > = > = > = > = > = > ====================================================================== > --- llvm/trunk/lib/Target/X86/X86InstrSSE.td (original) > +++ llvm/trunk/lib/Target/X86/X86InstrSSE.td Sun Feb 10 22:19:36 2008 > @@ -35,8 +35,19 @@ > def X86comi : SDNode<"X86ISD::COMI", SDTX86CmpTest>; > def X86ucomi : SDNode<"X86ISD::UCOMI", SDTX86CmpTest>; > def X86s2vec : SDNode<"X86ISD::S2VEC", SDTypeProfile<1, 1, []>, > []>; > -def X86pextrw : SDNode<"X86ISD::PEXTRW", SDTypeProfile<1, 2, []>, > []>; > -def X86pinsrw : SDNode<"X86ISD::PINSRW", SDTypeProfile<1, 3, []>, > []>; > +def X86pextrb : SDNode<"X86ISD::PEXTRB", > + SDTypeProfile<1, 2, [SDTCisVT<0, i32>, > SDTCisPtrTy<2>]>>; > +def X86pextrw : SDNode<"X86ISD::PEXTRW", > + SDTypeProfile<1, 2, [SDTCisVT<0, i32>, > SDTCisPtrTy<2>]>>; > +def X86pinsrb : SDNode<"X86ISD::PINSRB", > + SDTypeProfile<1, 3, [SDTCisVT<0, v16i8>, > 
SDTCisSameAs<0,1>, > + SDTCisVT<2, i32>, > SDTCisPtrTy<3>]>>; > +def X86pinsrw : SDNode<"X86ISD::PINSRW", > + SDTypeProfile<1, 3, [SDTCisVT<0, v8i16>, > SDTCisSameAs<0,1>, > + SDTCisVT<2, i32>, > SDTCisPtrTy<3>]>>; > +def X86insrtps : SDNode<"X86ISD::INSERTPS", > + SDTypeProfile<1, 3, [SDTCisVT<0, v4f32>, > SDTCisSameAs<0,1>, > + SDTCisVT<2, f32>, > SDTCisPtrTy<3>]>>; > > // > = > = > = > ----------------------------------------------------------------------= > ==// > // SSE 'Special' Instructions > @@ -2087,23 +2098,21 @@ > (outs GR32:$dst), (ins VR128:$src1, i32i8imm: > $src2), > "pextrw\t{$src2, $src1, $dst|$dst, $src1, $src2}", > [(set GR32:$dst, (X86pextrw (v8i16 VR128:$src1), > - (iPTR imm:$src2)))]>; > + imm:$src2))]>; > let isTwoAddress = 1 in { > def PINSRWrri : PDIi8<0xC4, MRMSrcReg, > (outs VR128:$dst), (ins VR128:$src1, > GR32:$src2, i32i8imm:$src3), > "pinsrw\t{$src3, $src2, $dst|$dst, $src2, > $src3}", > [(set VR128:$dst, > - (v8i16 (X86pinsrw (v8i16 VR128:$src1), > - GR32:$src2, (iPTR imm:$src3))))]>; > + (X86pinsrw VR128:$src1, GR32:$src2, imm: > $src3))]>; > def PINSRWrmi : PDIi8<0xC4, MRMSrcMem, > (outs VR128:$dst), (ins VR128:$src1, > i16mem:$src2, i32i8imm:$src3), > "pinsrw\t{$src3, $src2, $dst|$dst, $src2, > $src3}", > - [(set VR128:$dst, > - (v8i16 (X86pinsrw (v8i16 VR128:$src1), > - (i32 (anyext (loadi16 addr:$src2))), > - (iPTR imm:$src3))))]>; > + [(set VR128:$dst, > + (X86pinsrw VR128:$src1, (extloadi16 addr: > $src2), > + imm:$src3))]>; > } > > // Mask creation > @@ -3255,7 +3264,7 @@ > > > /// SS41I_binop_rmi_int - SSE 4.1 binary operator with immediate > -let isTwoAddress = 1 in { > +let Uses = [XMM0], isTwoAddress = 1 in { > multiclass SS41I_ternary_int<bits<8> opc, string OpcodeStr, > Intrinsic IntId> { > def rr0 : SS48I<opc, MRMSrcReg, (outs VR128:$dst), > (ins VR128:$src1, VR128:$src2), > @@ -3328,26 +3337,44 @@ > defm PMOVZXBQ : SS41I_binop_rm_int2<0x32, "pmovsxbq", > int_x86_sse41_pmovzxbq>; > > > -/// SS41I_binop_ext8 - SSE 4.1 
binary operator with immediate > -multiclass SS41I_binop_ext8<bits<8> opc, string OpcodeStr> { > +/// SS41I_binop_ext8 - SSE 4.1 extract 8 bits to 32 bit reg or 8 > bit mem > +multiclass SS41I_extract8<bits<8> opc, string OpcodeStr> { > def rr : SS4AI<opc, MRMSrcReg, (outs GR32:$dst), > (ins VR128:$src1, i32i8imm:$src2), > !strconcat(OpcodeStr, > "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), > - [(set GR32:$dst, (zext > - (extractelt (v16i8 VR128:$src1), imm:$src2)))]>, > OpSize; > + [(set GR32:$dst, (X86pextrb (v16i8 VR128:$src1), > imm:$src2))]>, > + OpSize; > def mr : SS4AI<opc, MRMDestMem, (outs), > (ins i8mem:$dst, VR128:$src1, i32i8imm:$src2), > !strconcat(OpcodeStr, > "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), > - [(store (extractelt (v16i8 VR128:$src1), imm:$src2), > - addr:$dst)]>, OpSize; > + []>, OpSize; > +// FIXME: > +// There's an AssertZext in the way of writing the store pattern > +// (store (i8 (trunc (X86pextrb (v16i8 VR128:$src1), imm:$src2))), > addr:$dst) > +} > + > +defm PEXTRB : SS41I_extract8<0x14, "pextrb">; > + > + > +/// SS41I_extract16 - SSE 4.1 extract 16 bits to memory destination > +multiclass SS41I_extract16<bits<8> opc, string OpcodeStr> { > + def mr : SS4AI<opc, MRMDestMem, (outs), > + (ins i16mem:$dst, VR128:$src1, i32i8imm:$src2), > + !strconcat(OpcodeStr, > + "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), > + []>, OpSize; > +// FIXME: > +// There's an AssertZext in the way of writing the store pattern > +// (store (i16 (trunc (X86pextrw (v16i8 VR128:$src1), imm:$src2))), > addr:$dst) > } > > -defm PEXTRB : SS41I_binop_ext8<0x14, "pextrb">; > +defm PEXTRW : SS41I_extract16<0x15, "pextrw">; > + > > -/// SS41I_binop_ext32 - SSE 4.1 binary operator with immediate > -multiclass SS41I_binop_ext32<bits<8> opc, string OpcodeStr> { > +/// SS41I_extract32 - SSE 4.1 extract 32 bits to int reg or memory > destination > +multiclass SS41I_extract32<bits<8> opc, string OpcodeStr> { > def rr : SS4AI<opc, MRMSrcReg, (outs GR32:$dst), > (ins 
VR128:$src1, i32i8imm:$src2), > !strconcat(OpcodeStr, > @@ -3362,10 +3389,11 @@ > addr:$dst)]>, OpSize; > } > > -defm PEXTRD : SS41I_binop_ext32<0x16, "pextrd">; > +defm PEXTRD : SS41I_extract32<0x16, "pextrd">; > > -/// SS41I_binop_extf32 - SSE 4.1 binary operator with immediate > -multiclass SS41I_binop_extf32<bits<8> opc, string OpcodeStr> { > + > +/// SS41I_extractf32 - SSE 4.1 extract 32 bits to fp reg or memory > destination > +multiclass SS41I_extractf32<bits<8> opc, string OpcodeStr> { > def rr : SS4AI<opc, MRMSrcReg, (outs FR32:$dst), > (ins VR128:$src1, i32i8imm:$src2), > !strconcat(OpcodeStr, > @@ -3380,5 +3408,65 @@ > addr:$dst)]>, OpSize; > } > > -defm EXTRACTPS : SS41I_binop_extf32<0x17, "extractps">; > +defm EXTRACTPS : SS41I_extractf32<0x17, "extractps">; > + > +let isTwoAddress = 1 in { > + multiclass SS41I_insert8<bits<8> opc, string OpcodeStr> { > + def rr : SS4AI<opc, MRMSrcReg, (outs VR128:$dst), > + (ins VR128:$src1, GR32:$src2, i32i8imm:$src3), > + !strconcat(OpcodeStr, > + "\t{$src3, $src2, $dst|$dst, $src2, $src3}"), > + [(set VR128:$dst, > + (X86pinsrb VR128:$src1, GR32:$src2, imm: > $src3))]>, OpSize; > + def rm : SS4AI<opc, MRMSrcMem, (outs VR128:$dst), > + (ins VR128:$src1, i8mem:$src2, i32i8imm:$src3), > + !strconcat(OpcodeStr, > + "\t{$src3, $src2, $dst|$dst, $src2, $src3}"), > + [(set VR128:$dst, > + (X86pinsrb VR128:$src1, (extloadi8 addr:$src2), > + imm:$src3))]>, OpSize; > + } > +} > + > +defm PINSRB : SS41I_insert8<0x20, "pinsrb">; > + > +let isTwoAddress = 1 in { > + multiclass SS41I_insert32<bits<8> opc, string OpcodeStr> { > + def rr : SS4AI<opc, MRMSrcReg, (outs VR128:$dst), > + (ins VR128:$src1, GR32:$src2, i32i8imm:$src3), > + !strconcat(OpcodeStr, > + "\t{$src3, $src2, $dst|$dst, $src2, $src3}"), > + [(set VR128:$dst, > + (v4i32 (insertelt VR128:$src1, GR32:$src2, imm: > $src3)))]>, > + OpSize; > + def rm : SS4AI<opc, MRMSrcMem, (outs VR128:$dst), > + (ins VR128:$src1, i32mem:$src2, i32i8imm:$src3), > + 
!strconcat(OpcodeStr, > + "\t{$src3, $src2, $dst|$dst, $src2, $src3}"), > + [(set VR128:$dst, > + (v4i32 (insertelt VR128:$src1, (loadi32 addr: > $src2), > + imm:$src3)))]>, OpSize; > + } > +} > + > +defm PINSRD : SS41I_insert32<0x22, "pinsrd">; > + > +let isTwoAddress = 1 in { > + multiclass SS41I_insertf32<bits<8> opc, string OpcodeStr> { > + def rr : SS4AI<opc, MRMSrcReg, (outs VR128:$dst), > + (ins VR128:$src1, FR32:$src2, i32i8imm:$src3), > + !strconcat(OpcodeStr, > + "\t{$src3, $src2, $dst|$dst, $src2, $src3}"), > + [(set VR128:$dst, > + (X86insrtps VR128:$src1, FR32:$src2, imm: > $src3))]>, OpSize; > + def rm : SS4AI<opc, MRMSrcMem, (outs VR128:$dst), > + (ins VR128:$src1, f32mem:$src2, i32i8imm:$src3), > + !strconcat(OpcodeStr, > + "\t{$src3, $src2, $dst|$dst, $src2, $src3}"), > + [(set VR128:$dst, > + (X86insrtps VR128:$src1, (loadf32 addr:$src2), > + imm:$src3))]>, OpSize; > + } > +} > > +defm INSERTPS : SS41I_insertf32<0x31, "insertps">; > > > _______________________________________________ > llvm-commits mailing list > llvm-commits@cs.uiuc.edu > http://lists.cs.uiuc.edu/mailman/listinfo/llvm-commits _______________________________________________ llvm-commits mailing list llvm-commits@cs.uiuc.edu http://lists.cs.uiuc.edu/mailman/listinfo/llvm-commits