https://github.com/RKSimon created https://github.com/llvm/llvm-project/pull/204144
* Remove X86ISD::PDEP/PEXT and use ISD::PDEP/PEXT instead * AutoUpgrade x86 pdep/pext intrinsics to llvm.pdep/pext generics * Move X86 DAG knownbits/demandedbits handling to generic (unchanged) * Move X86 InstCombine folds to generic (unchanged) * Updated clang builtins to emit generics >From 8e21095611e2cd2c4a384b105c0944c490d41dab Mon Sep 17 00:00:00 2001 From: Simon Pilgrim <[email protected]> Date: Tue, 16 Jun 2026 14:19:33 +0100 Subject: [PATCH] [WIP][X86] Replace X86 specific pdep/pext handling with generic PDEP/PEXT intrinsics * Remove X86ISD::PDEP/PEXT and use ISD::PDEP/PEXT instead * AutoUpgrade x86 pdep/pext intrinsics to llvm.pdep/pext generics * Move X86 DAG knownbits/demandedbits handling to generic (unchanged) * Move X86 InstCombine folds to generic (unchanged) * Updated clang builtins to emit generics --- clang/lib/CodeGen/TargetBuiltins/X86.cpp | 10 +++ clang/test/CodeGen/X86/bmi2-builtins.c | 8 +- llvm/include/llvm/IR/IntrinsicsX86.td | 12 --- llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp | 6 ++ .../lib/CodeGen/SelectionDAG/SelectionDAG.cpp | 18 ++++ .../CodeGen/SelectionDAG/TargetLowering.cpp | 28 ++++++ llvm/lib/IR/AutoUpgrade.cpp | 8 ++ llvm/lib/Target/X86/X86ISelLowering.cpp | 50 +---------- .../Target/X86/X86InstCombineIntrinsic.cpp | 88 ------------------- llvm/lib/Target/X86/X86InstrFragments.td | 4 - llvm/lib/Target/X86/X86InstrMisc.td | 54 ++---------- llvm/lib/Target/X86/X86IntrinsicsInfo.h | 4 - .../InstCombine/InstCombineCalls.cpp | 58 ++++++++++++ .../Instrumentation/MemorySanitizer.cpp | 4 - llvm/test/CodeGen/X86/bmi2.ll | 23 +++-- 15 files changed, 152 insertions(+), 223 deletions(-) diff --git a/clang/lib/CodeGen/TargetBuiltins/X86.cpp b/clang/lib/CodeGen/TargetBuiltins/X86.cpp index acfeb9967cd2f..50125a71fcd5f 100644 --- a/clang/lib/CodeGen/TargetBuiltins/X86.cpp +++ b/clang/lib/CodeGen/TargetBuiltins/X86.cpp @@ -976,6 +976,16 @@ Value *CodeGenFunction::EmitX86BuiltinExpr(unsigned BuiltinID, Function *F = 
CGM.getIntrinsic(Intrinsic::cttz, Ops[0]->getType()); return Builder.CreateCall(F, {Ops[0], Builder.getInt1(false)}); } + case X86::BI__builtin_ia32_pdep_si: + case X86::BI__builtin_ia32_pdep_di: { + Function *F = CGM.getIntrinsic(Intrinsic::pdep, Ops[0]->getType()); + return Builder.CreateCall(F, Ops); + } + case X86::BI__builtin_ia32_pext_si: + case X86::BI__builtin_ia32_pext_di: { + Function *F = CGM.getIntrinsic(Intrinsic::pext, Ops[0]->getType()); + return Builder.CreateCall(F, Ops); + } case X86::BI__builtin_ia32_undef128: case X86::BI__builtin_ia32_undef256: case X86::BI__builtin_ia32_undef512: diff --git a/clang/test/CodeGen/X86/bmi2-builtins.c b/clang/test/CodeGen/X86/bmi2-builtins.c index 1b2cb9048adb2..c83cc43d9fc3f 100644 --- a/clang/test/CodeGen/X86/bmi2-builtins.c +++ b/clang/test/CodeGen/X86/bmi2-builtins.c @@ -17,12 +17,12 @@ unsigned int test_bzhi_u32(unsigned int __X, unsigned int __Y) { } unsigned int test_pdep_u32(unsigned int __X, unsigned int __Y) { - // CHECK: @llvm.x86.bmi.pdep.32 + // CHECK: @llvm.pdep.i32 return _pdep_u32(__X, __Y); } unsigned int test_pext_u32(unsigned int __X, unsigned int __Y) { - // CHECK: @llvm.x86.bmi.pext.32 + // CHECK: @llvm.pext.i32 return _pext_u32(__X, __Y); } @@ -41,12 +41,12 @@ unsigned long long test_bzhi_u64(unsigned long long __X, unsigned long long __Y) } unsigned long long test_pdep_u64(unsigned long long __X, unsigned long long __Y) { - // CHECK: @llvm.x86.bmi.pdep.64 + // CHECK: @llvm.pdep.i64 return _pdep_u64(__X, __Y); } unsigned long long test_pext_u64(unsigned long long __X, unsigned long long __Y) { - // CHECK: @llvm.x86.bmi.pext.64 + // CHECK: @llvm.pext.i64 return _pext_u64(__X, __Y); } diff --git a/llvm/include/llvm/IR/IntrinsicsX86.td b/llvm/include/llvm/IR/IntrinsicsX86.td index b75a0485d6263..5c7785731111c 100644 --- a/llvm/include/llvm/IR/IntrinsicsX86.td +++ b/llvm/include/llvm/IR/IntrinsicsX86.td @@ -2575,18 +2575,6 @@ let TargetPrefix = "x86" in { // All intrinsics start with "llvm.x86.". 
def int_x86_bmi_bzhi_64 : ClangBuiltin<"__builtin_ia32_bzhi_di">, DefaultAttrsIntrinsic<[llvm_i64_ty], [llvm_i64_ty, llvm_i64_ty], [IntrNoMem]>; - def int_x86_bmi_pdep_32 : ClangBuiltin<"__builtin_ia32_pdep_si">, - DefaultAttrsIntrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty], - [IntrNoMem]>; - def int_x86_bmi_pdep_64 : ClangBuiltin<"__builtin_ia32_pdep_di">, - DefaultAttrsIntrinsic<[llvm_i64_ty], [llvm_i64_ty, llvm_i64_ty], - [IntrNoMem]>; - def int_x86_bmi_pext_32 : ClangBuiltin<"__builtin_ia32_pext_si">, - DefaultAttrsIntrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty], - [IntrNoMem]>; - def int_x86_bmi_pext_64 : ClangBuiltin<"__builtin_ia32_pext_di">, - DefaultAttrsIntrinsic<[llvm_i64_ty], [llvm_i64_ty, llvm_i64_ty], - [IntrNoMem]>; } //===----------------------------------------------------------------------===// diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp index 7fdaacff0582d..c2c8e930abcf7 100644 --- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp @@ -12245,12 +12245,18 @@ SDValue DAGCombiner::visitPDEP(SDNode *N) { // pdep(x, 0) -> 0 if (isNullOrNullSplat(N1)) return DAG.getConstant(0, DL, VT); + // pdep(x, -1) -> x (all positions selected, bits deposited at identity) if (isAllOnesOrAllOnesSplat(N1)) return N0; + // fold pdep(c1, c2) -> expandBits(c1, c2) if (SDValue C = DAG.FoldConstantArithmetic(ISD::PDEP, DL, VT, {N0, N1})) return C; + + if (SimplifyDemandedBits(SDValue(N, 0))) + return SDValue(N, 0); + return SDValue(); } diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp index 534ba48bd5d1a..ed94d757b0509 100644 --- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp @@ -3952,6 +3952,24 @@ KnownBits SelectionDAG::computeKnownBits(SDValue Op, const APInt &DemandedElts, Known.Zero.setBitsFrom(1); break; } + case ISD::PDEP: { + Known = 
computeKnownBits(Op.getOperand(1), DemandedElts, Depth + 1); + Known2 = computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1); + // Zeros are retained from the mask operand. But not ones. + Known.One.clearAllBits(); + // The result will have at least as many trailing zeros as the non-mask + // operand since bits can only map to the same or higher bit position. + Known.Zero.setLowBits(Known2.countMinTrailingZeros()); + break; + } + case ISD::PEXT: { + Known = computeKnownBits(Op.getOperand(1), DemandedElts, Depth + 1); + // The result has as many leading zeros as the number of zeroes in the mask. + unsigned Count = Known.Zero.popcount(); + Known.Zero = APInt::getHighBitsSet(BitWidth, Count); + Known.One.clearAllBits(); + break; + } case ISD::CLMUL: { Known = computeKnownBits(Op.getOperand(1), DemandedElts, Depth + 1); Known2 = computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1); diff --git a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp index 524e084b8afa8..530a46a9331c1 100644 --- a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp @@ -2463,6 +2463,34 @@ bool TargetLowering::SimplifyDemandedBits( Known = TLO.DAG.computeKnownBits(Op, DemandedElts, Depth); break; } + case ISD::PDEP: { + SDValue Op0 = Op.getOperand(0); + SDValue Op1 = Op.getOperand(1); + + unsigned DemandedBitsLZ = OriginalDemandedBits.countl_zero(); + APInt LoMask = APInt::getLowBitsSet(BitWidth, BitWidth - DemandedBitsLZ); + + // If the demanded bits has leading zeroes, we don't demand those from the + // mask. + if (SimplifyDemandedBits(Op1, LoMask, Known, TLO, Depth + 1)) + return true; + + // The number of possible 1s in the mask determines the number of LSBs of + // operand 0 used. Undemanded bits from the mask don't matter so filter + // them before counting. 
+ KnownBits Known2; + uint64_t Count = (~Known.Zero & LoMask).popcount(); + APInt DemandedMask(APInt::getLowBitsSet(BitWidth, Count)); + if (SimplifyDemandedBits(Op0, DemandedMask, Known2, TLO, Depth + 1)) + return true; + + // Zeroes are retained from the mask, but not ones. + Known.One.clearAllBits(); + // The result will have at least as many trailing zeros as the non-mask + // operand since bits can only map to the same or higher bit position. + Known.Zero.setLowBits(Known2.countMinTrailingZeros()); + break; + } case ISD::SIGN_EXTEND_INREG: { SDValue Op0 = Op.getOperand(0); EVT ExVT = cast<VTSDNode>(Op.getOperand(1))->getVT(); diff --git a/llvm/lib/IR/AutoUpgrade.cpp b/llvm/lib/IR/AutoUpgrade.cpp index 814e985ebf7be..9422fc6129efd 100644 --- a/llvm/lib/IR/AutoUpgrade.cpp +++ b/llvm/lib/IR/AutoUpgrade.cpp @@ -533,6 +533,10 @@ static bool shouldUpgradeX86Intrinsic(Function *F, StringRef Name) { Name.starts_with("vpcom") || // Added in 3.2, Updated in 9.0 Name.starts_with("vprot")); // Added in 8.0 + if (Name.consume_front("bmi.")) + return (Name.starts_with("pdep.") || // Added in 23.0 + Name.starts_with("pext.")); // Added in 23.0 + return (Name == "addcarry.u32" || // Added in 8.0 Name == "addcarry.u64" || // Added in 8.0 Name == "addcarryx.u32" || // Added in 8.0 @@ -4616,6 +4620,10 @@ static Value *upgradeX86IntrinsicCall(StringRef Name, CallBase *CI, Function *F, } else if (Name.starts_with("avx512.mask.") && upgradeAVX512MaskToSelect(Name, Builder, *CI, Rep)) { // Rep will be updated by the call in the condition. 
+ } else if (Name.starts_with("bmi.pdep.")) { + Rep = upgradeX86BinaryIntrinsics(Builder, *CI, Intrinsic::pdep); + } else if (Name.starts_with("bmi.pext.")) { + Rep = upgradeX86BinaryIntrinsics(Builder, *CI, Intrinsic::pext); } else reportFatalUsageErrorWithCI("Unexpected intrinsic", CI); diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp index 196ee8775c7f3..aaf7bbe75268a 100644 --- a/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -39732,25 +39732,6 @@ void X86TargetLowering::computeKnownBitsForTargetNode(const SDValue Op, Known.One.clearAllBits(); break; } - case X86ISD::PDEP: { - KnownBits Known2; - Known = DAG.computeKnownBits(Op.getOperand(1), DemandedElts, Depth + 1); - Known2 = DAG.computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1); - // Zeros are retained from the mask operand. But not ones. - Known.One.clearAllBits(); - // The result will have at least as many trailing zeros as the non-mask - // operand since bits can only map to the same or higher bit position. - Known.Zero.setLowBits(Known2.countMinTrailingZeros()); - break; - } - case X86ISD::PEXT: { - Known = DAG.computeKnownBits(Op.getOperand(1), DemandedElts, Depth + 1); - // The result has as many leading zeros as the number of zeroes in the mask. - unsigned Count = Known.Zero.popcount(); - Known.Zero = APInt::getHighBitsSet(BitWidth, Count); - Known.One.clearAllBits(); - break; - } case X86ISD::VTRUNC: case X86ISD::VTRUNCS: case X86ISD::VTRUNCUS: @@ -45985,34 +45966,6 @@ bool X86TargetLowering::SimplifyDemandedBitsForTargetNode( break; } - case X86ISD::PDEP: { - SDValue Op0 = Op.getOperand(0); - SDValue Op1 = Op.getOperand(1); - - unsigned DemandedBitsLZ = OriginalDemandedBits.countl_zero(); - APInt LoMask = APInt::getLowBitsSet(BitWidth, BitWidth - DemandedBitsLZ); - - // If the demanded bits has leading zeroes, we don't demand those from the - // mask. 
- if (SimplifyDemandedBits(Op1, LoMask, Known, TLO, Depth + 1)) - return true; - - // The number of possible 1s in the mask determines the number of LSBs of - // operand 0 used. Undemanded bits from the mask don't matter so filter - // them before counting. - KnownBits Known2; - uint64_t Count = (~Known.Zero & LoMask).popcount(); - APInt DemandedMask(APInt::getLowBitsSet(BitWidth, Count)); - if (SimplifyDemandedBits(Op0, DemandedMask, Known2, TLO, Depth + 1)) - return true; - - // Zeroes are retained from the mask, but not ones. - Known.One.clearAllBits(); - // The result will have at least as many trailing zeros as the non-mask - // operand since bits can only map to the same or higher bit position. - Known.Zero.setLowBits(Known2.countMinTrailingZeros()); - return false; - } case X86ISD::VPMADD52L: case X86ISD::VPMADD52H: { KnownBits KnownOp0, KnownOp1, KnownOp2; @@ -63393,8 +63346,7 @@ SDValue X86TargetLowering::PerformDAGCombine(SDNode *N, case X86ISD::MOVDQ2Q: return combineMOVDQ2Q(N, DAG); case X86ISD::BEXTR: case X86ISD::BEXTRI: - case X86ISD::BZHI: - case X86ISD::PDEP: return combineBMI(N, DAG, DCI); + case X86ISD::BZHI: return combineBMI(N, DAG, DCI); case X86ISD::PCLMULQDQ: return combinePCLMULQDQ(N, DAG, DCI); case ISD::INTRINSIC_WO_CHAIN: return combineINTRINSIC_WO_CHAIN(N, DAG, DCI); case ISD::INTRINSIC_W_CHAIN: return combineINTRINSIC_W_CHAIN(N, DAG, DCI); diff --git a/llvm/lib/Target/X86/X86InstCombineIntrinsic.cpp b/llvm/lib/Target/X86/X86InstCombineIntrinsic.cpp index 4999581489e82..ad1c171428671 100644 --- a/llvm/lib/Target/X86/X86InstCombineIntrinsic.cpp +++ b/llvm/lib/Target/X86/X86InstCombineIntrinsic.cpp @@ -2259,94 +2259,6 @@ X86TTIImpl::instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) const { // TODO should we convert this to an AND if the RHS is constant? 
} break; - case Intrinsic::x86_bmi_pext_32: - case Intrinsic::x86_bmi_pext_64: - if (auto *MaskC = dyn_cast<ConstantInt>(II.getArgOperand(1))) { - if (MaskC->isNullValue()) { - return IC.replaceInstUsesWith(II, ConstantInt::get(II.getType(), 0)); - } - if (MaskC->isAllOnesValue()) { - return IC.replaceInstUsesWith(II, II.getArgOperand(0)); - } - - unsigned MaskIdx, MaskLen; - if (MaskC->getValue().isShiftedMask(MaskIdx, MaskLen)) { - // any single contingous sequence of 1s anywhere in the mask simply - // describes a subset of the input bits shifted to the appropriate - // position. Replace with the straight forward IR. - Value *Input = II.getArgOperand(0); - Value *Masked = IC.Builder.CreateAnd(Input, II.getArgOperand(1)); - Value *ShiftAmt = ConstantInt::get(II.getType(), MaskIdx); - Value *Shifted = IC.Builder.CreateLShr(Masked, ShiftAmt); - return IC.replaceInstUsesWith(II, Shifted); - } - - if (auto *SrcC = dyn_cast<ConstantInt>(II.getArgOperand(0))) { - uint64_t Src = SrcC->getZExtValue(); - uint64_t Mask = MaskC->getZExtValue(); - uint64_t Result = 0; - uint64_t BitToSet = 1; - - while (Mask) { - // Isolate lowest set bit. - uint64_t BitToTest = Mask & -Mask; - if (BitToTest & Src) - Result |= BitToSet; - - BitToSet <<= 1; - // Clear lowest set bit. - Mask &= Mask - 1; - } - - return IC.replaceInstUsesWith(II, - ConstantInt::get(II.getType(), Result)); - } - } - break; - case Intrinsic::x86_bmi_pdep_32: - case Intrinsic::x86_bmi_pdep_64: - if (auto *MaskC = dyn_cast<ConstantInt>(II.getArgOperand(1))) { - if (MaskC->isNullValue()) { - return IC.replaceInstUsesWith(II, ConstantInt::get(II.getType(), 0)); - } - if (MaskC->isAllOnesValue()) { - return IC.replaceInstUsesWith(II, II.getArgOperand(0)); - } - - unsigned MaskIdx, MaskLen; - if (MaskC->getValue().isShiftedMask(MaskIdx, MaskLen)) { - // any single contingous sequence of 1s anywhere in the mask simply - // describes a subset of the input bits shifted to the appropriate - // position. 
Replace with the straight forward IR. - Value *Input = II.getArgOperand(0); - Value *ShiftAmt = ConstantInt::get(II.getType(), MaskIdx); - Value *Shifted = IC.Builder.CreateShl(Input, ShiftAmt); - Value *Masked = IC.Builder.CreateAnd(Shifted, II.getArgOperand(1)); - return IC.replaceInstUsesWith(II, Masked); - } - - if (auto *SrcC = dyn_cast<ConstantInt>(II.getArgOperand(0))) { - uint64_t Src = SrcC->getZExtValue(); - uint64_t Mask = MaskC->getZExtValue(); - uint64_t Result = 0; - uint64_t BitToTest = 1; - - while (Mask) { - // Isolate lowest set bit. - uint64_t BitToSet = Mask & -Mask; - if (BitToTest & Src) - Result |= BitToSet; - - BitToTest <<= 1; - // Clear lowest set bit; - Mask &= Mask - 1; - } - - return IC.replaceInstUsesWith(II, - ConstantInt::get(II.getType(), Result)); - } - } - break; case Intrinsic::x86_sse_cvtss2si: case Intrinsic::x86_sse_cvtss2si64: diff --git a/llvm/lib/Target/X86/X86InstrFragments.td b/llvm/lib/Target/X86/X86InstrFragments.td index 9316360c5e02a..923b968382866 100644 --- a/llvm/lib/Target/X86/X86InstrFragments.td +++ b/llvm/lib/Target/X86/X86InstrFragments.td @@ -424,10 +424,6 @@ def X86bextri : SDNode<"X86ISD::BEXTRI", SDTIntBinOp>; // Zero High Bits Starting with Specified Bit Position. def X86bzhi : SDNode<"X86ISD::BZHI", SDTIntBinOp>; -// Parallel extract and deposit. -def X86pdep : SDNode<"X86ISD::PDEP", SDTIntBinOp>; -def X86pext : SDNode<"X86ISD::PEXT", SDTIntBinOp>; - // X86-specific multiply by immediate. 
def X86mul_imm : SDNode<"X86ISD::MUL_IMM", SDTIntBinOp>; diff --git a/llvm/lib/Target/X86/X86InstrMisc.td b/llvm/lib/Target/X86/X86InstrMisc.td index 613a431fe365a..c6acaa697fdc7 100644 --- a/llvm/lib/Target/X86/X86InstrMisc.td +++ b/llvm/lib/Target/X86/X86InstrMisc.td @@ -1391,55 +1391,17 @@ multiclass PdepPext<string m, X86TypeInfo t, SDPatternOperator node, } let Predicates = [HasBMI2, NoEGPR] in { - defm PDEP32 : PdepPext<"pdep", Xi32, X86pdep>, XD, VEX; - defm PDEP64 : PdepPext<"pdep", Xi64, X86pdep>, XD, REX_W, VEX; - defm PEXT32 : PdepPext<"pext", Xi32, X86pext>, XS, VEX; - defm PEXT64 : PdepPext<"pext", Xi64, X86pext>, XS, REX_W, VEX; + defm PDEP32 : PdepPext<"pdep", Xi32, pdep>, XD, VEX; + defm PDEP64 : PdepPext<"pdep", Xi64, pdep>, XD, REX_W, VEX; + defm PEXT32 : PdepPext<"pext", Xi32, pext>, XS, VEX; + defm PEXT64 : PdepPext<"pext", Xi64, pext>, XS, REX_W, VEX; } let Predicates = [HasBMI2, HasEGPR] in { - defm PDEP32 : PdepPext<"pdep", Xi32, X86pdep, "_EVEX">, XD, EVEX; - defm PDEP64 : PdepPext<"pdep", Xi64, X86pdep, "_EVEX">, XD, REX_W, EVEX; - defm PEXT32 : PdepPext<"pext", Xi32, X86pext, "_EVEX">, XS, EVEX; - defm PEXT64 : PdepPext<"pext", Xi64, X86pext, "_EVEX">, XS, REX_W, EVEX; -} - -let Predicates = [HasBMI2, NoEGPR] in { - def : Pat<(i32 (pext GR32:$src, GR32:$mask)), - (PEXT32rr GR32:$src, GR32:$mask)>; - def : Pat<(i32 (pext GR32:$src, (loadi32 addr:$mask))), - (PEXT32rm GR32:$src, i32mem:$mask)>; - def : Pat<(i64 (pext GR64:$src, GR64:$mask)), - (PEXT64rr GR64:$src, GR64:$mask)>; - def : Pat<(i64 (pext GR64:$src, (loadi64 addr:$mask))), - (PEXT64rm GR64:$src, i64mem:$mask)>; - def : Pat<(i32 (pdep GR32:$src, GR32:$mask)), - (PDEP32rr GR32:$src, GR32:$mask)>; - def : Pat<(i32 (pdep GR32:$src, (loadi32 addr:$mask))), - (PDEP32rm GR32:$src, i32mem:$mask)>; - def : Pat<(i64 (pdep GR64:$src, GR64:$mask)), - (PDEP64rr GR64:$src, GR64:$mask)>; - def : Pat<(i64 (pdep GR64:$src, (loadi64 addr:$mask))), - (PDEP64rm GR64:$src, i64mem:$mask)>; -} - -let 
Predicates = [HasBMI2, HasEGPR] in { - def : Pat<(i32 (pext GR32:$src, GR32:$mask)), - (PEXT32rr_EVEX GR32:$src, GR32:$mask)>; - def : Pat<(i32 (pext GR32:$src, (loadi32 addr:$mask))), - (PEXT32rm_EVEX GR32:$src, i32mem:$mask)>; - def : Pat<(i64 (pext GR64:$src, GR64:$mask)), - (PEXT64rr_EVEX GR64:$src, GR64:$mask)>; - def : Pat<(i64 (pext GR64:$src, (loadi64 addr:$mask))), - (PEXT64rm_EVEX GR64:$src, i64mem:$mask)>; - def : Pat<(i32 (pdep GR32:$src, GR32:$mask)), - (PDEP32rr_EVEX GR32:$src, GR32:$mask)>; - def : Pat<(i32 (pdep GR32:$src, (loadi32 addr:$mask))), - (PDEP32rm_EVEX GR32:$src, i32mem:$mask)>; - def : Pat<(i64 (pdep GR64:$src, GR64:$mask)), - (PDEP64rr_EVEX GR64:$src, GR64:$mask)>; - def : Pat<(i64 (pdep GR64:$src, (loadi64 addr:$mask))), - (PDEP64rm_EVEX GR64:$src, i64mem:$mask)>; + defm PDEP32 : PdepPext<"pdep", Xi32, pdep, "_EVEX">, XD, EVEX; + defm PDEP64 : PdepPext<"pdep", Xi64, pdep, "_EVEX">, XD, REX_W, EVEX; + defm PEXT32 : PdepPext<"pext", Xi32, pext, "_EVEX">, XS, EVEX; + defm PEXT64 : PdepPext<"pext", Xi64, pext, "_EVEX">, XS, REX_W, EVEX; } //===----------------------------------------------------------------------===// diff --git a/llvm/lib/Target/X86/X86IntrinsicsInfo.h b/llvm/lib/Target/X86/X86IntrinsicsInfo.h index 9e32ca23dafe2..a6b0db0230cf3 100644 --- a/llvm/lib/Target/X86/X86IntrinsicsInfo.h +++ b/llvm/lib/Target/X86/X86IntrinsicsInfo.h @@ -1837,10 +1837,6 @@ static const IntrinsicData IntrinsicsWithoutChain[] = { X86_INTRINSIC_DATA(bmi_bextr_64, INTR_TYPE_2OP, X86ISD::BEXTR, 0), X86_INTRINSIC_DATA(bmi_bzhi_32, INTR_TYPE_2OP, X86ISD::BZHI, 0), X86_INTRINSIC_DATA(bmi_bzhi_64, INTR_TYPE_2OP, X86ISD::BZHI, 0), - X86_INTRINSIC_DATA(bmi_pdep_32, INTR_TYPE_2OP, X86ISD::PDEP, 0), - X86_INTRINSIC_DATA(bmi_pdep_64, INTR_TYPE_2OP, X86ISD::PDEP, 0), - X86_INTRINSIC_DATA(bmi_pext_32, INTR_TYPE_2OP, X86ISD::PEXT, 0), - X86_INTRINSIC_DATA(bmi_pext_64, INTR_TYPE_2OP, X86ISD::PEXT, 0), X86_INTRINSIC_DATA(fma_vfmaddsub_pd, INTR_TYPE_3OP, 
X86ISD::FMADDSUB, 0), X86_INTRINSIC_DATA(fma_vfmaddsub_pd_256, INTR_TYPE_3OP, X86ISD::FMADDSUB, 0), diff --git a/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp b/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp index ffa6504053f0d..4c9d3b8bc2100 100644 --- a/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp +++ b/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp @@ -2660,6 +2660,64 @@ Instruction *InstCombinerImpl::visitCallInst(CallInst &CI) { return &CI; break; } + case Intrinsic::pdep: + if (auto *MaskC = dyn_cast<ConstantInt>(II->getArgOperand(1))) { + if (MaskC->isNullValue()) + return replaceInstUsesWith(*II, ConstantInt::get(II->getType(), 0)); + + if (MaskC->isAllOnesValue()) + return replaceInstUsesWith(*II, II->getArgOperand(0)); + + unsigned MaskIdx, MaskLen; + if (MaskC->getValue().isShiftedMask(MaskIdx, MaskLen)) { + // any single contingous sequence of 1s anywhere in the mask simply + // describes a subset of the input bits shifted to the appropriate + // position. Replace with the straight forward IR. + Value *Input = II->getArgOperand(0); + Value *ShiftAmt = ConstantInt::get(II->getType(), MaskIdx); + Value *Shifted = Builder.CreateShl(Input, ShiftAmt); + Value *Masked = Builder.CreateAnd(Shifted, II->getArgOperand(1)); + return replaceInstUsesWith(*II, Masked); + } + + if (auto *SrcC = dyn_cast<ConstantInt>(II->getArgOperand(0))) { + // constant folding. 
+ APInt Result = + llvm::APIntOps::expandBits(SrcC->getValue(), MaskC->getValue()); + return replaceInstUsesWith(*II, + ConstantInt::get(II->getType(), Result)); + } + } + break; + case Intrinsic::pext: + if (auto *MaskC = dyn_cast<ConstantInt>(II->getArgOperand(1))) { + if (MaskC->isNullValue()) + return replaceInstUsesWith(*II, ConstantInt::get(II->getType(), 0)); + + if (MaskC->isAllOnesValue()) + return replaceInstUsesWith(*II, II->getArgOperand(0)); + + unsigned MaskIdx, MaskLen; + if (MaskC->getValue().isShiftedMask(MaskIdx, MaskLen)) { + // any single contingous sequence of 1s anywhere in the mask simply + // describes a subset of the input bits shifted to the appropriate + // position. Replace with the straight forward IR. + Value *Input = II->getArgOperand(0); + Value *Masked = Builder.CreateAnd(Input, II->getArgOperand(1)); + Value *ShiftAmt = ConstantInt::get(II->getType(), MaskIdx); + Value *Shifted = Builder.CreateLShr(Masked, ShiftAmt); + return replaceInstUsesWith(*II, Shifted); + } + + if (auto *SrcC = dyn_cast<ConstantInt>(II->getArgOperand(0))) { + // constant folding. 
+ APInt Result = + llvm::APIntOps::compressBits(SrcC->getValue(), MaskC->getValue()); + return replaceInstUsesWith(*II, + ConstantInt::get(II->getType(), Result)); + } + } + break; case Intrinsic::ptrmask: { unsigned BitWidth = DL.getPointerTypeSizeInBits(II->getType()); KnownBits Known(BitWidth); diff --git a/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp b/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp index 383f503a2e87f..72d6bbc462d00 100644 --- a/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp +++ b/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp @@ -6504,10 +6504,6 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> { case Intrinsic::x86_bmi_bextr_64: case Intrinsic::x86_bmi_bzhi_32: case Intrinsic::x86_bmi_bzhi_64: - case Intrinsic::x86_bmi_pdep_32: - case Intrinsic::x86_bmi_pdep_64: - case Intrinsic::x86_bmi_pext_32: - case Intrinsic::x86_bmi_pext_64: handleBmiIntrinsic(I); break; diff --git a/llvm/test/CodeGen/X86/bmi2.ll b/llvm/test/CodeGen/X86/bmi2.ll index cabeebb0c3f36..41585bde9a696 100644 --- a/llvm/test/CodeGen/X86/bmi2.ll +++ b/llvm/test/CodeGen/X86/bmi2.ll @@ -128,7 +128,7 @@ define i32 @pdep32_load(i32 %x, ptr %y) { define i32 @pdep32_anyext(i16 %x) { ; X86-LABEL: pdep32_anyext: ; X86: # %bb.0: -; X86-NEXT: movswl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: movl $-1431655766, %ecx # imm = 0xAAAAAAAA ; X86-NEXT: pdepl %ecx, %eax, %eax ; X86-NEXT: retl @@ -178,7 +178,7 @@ define i32 @pdep32_demandedbits(i32 %x) { define i32 @pdep32_demandedbits2(i32 %x, i32 %y) { ; X86-LABEL: pdep32_demandedbits2: ; X86: # %bb.0: -; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: pdepl {{[0-9]+}}(%esp), %eax, %eax ; X86-NEXT: andl $128, %eax ; X86-NEXT: retl @@ -203,9 +203,8 @@ define i32 @pdep32_demandedbits2(i32 %x, i32 %y) { define i32 @pdep32_demandedbits_mask(i32 %x, i16 %y) { ; X86-LABEL: pdep32_demandedbits_mask: ; X86: # 
%bb.0: -; X86-NEXT: movswl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: pdepl %eax, %ecx, %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: pdepl {{[0-9]+}}(%esp), %eax, %eax ; X86-NEXT: andl $32768, %eax # imm = 0x8000 ; X86-NEXT: retl ; @@ -230,9 +229,8 @@ define i32 @pdep32_demandedbits_mask(i32 %x, i16 %y) { define i32 @pdep32_demandedbits_mask2(i32 %x, i16 %y) { ; X86-LABEL: pdep32_demandedbits_mask2: ; X86: # %bb.0: -; X86-NEXT: movswl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: pdepl %eax, %ecx, %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: pdepl {{[0-9]+}}(%esp), %eax, %eax ; X86-NEXT: movzwl %ax, %eax ; X86-NEXT: retl ; @@ -285,22 +283,23 @@ define i32 @pdep32_knownbits(i32 %x) { define i32 @pdep32_knownbits2(i32 %x, i32 %y) { ; X86-LABEL: pdep32_knownbits2: ; X86: # %bb.0: -; X86-NEXT: movl $-256, %eax -; X86-NEXT: andl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax +; X86-NEXT: shll $8, %eax ; X86-NEXT: pdepl {{[0-9]+}}(%esp), %eax, %eax ; X86-NEXT: imull %eax, %eax ; X86-NEXT: retl ; ; X64-LABEL: pdep32_knownbits2: ; X64: # %bb.0: -; X64-NEXT: andl $-256, %edi +; X64-NEXT: andl $16776960, %edi # imm = 0xFFFF00 ; X64-NEXT: pdepl %esi, %edi, %eax ; X64-NEXT: imull %eax, %eax ; X64-NEXT: retq ; ; EGPR-LABEL: pdep32_knownbits2: ; EGPR: # %bb.0: -; EGPR-NEXT: andl $-256, %edi # encoding: [0x81,0xe7,0x00,0xff,0xff,0xff] +; EGPR-NEXT: andl $16776960, %edi # encoding: [0x81,0xe7,0x00,0xff,0xff,0x00] +; EGPR-NEXT: # imm = 0xFFFF00 ; EGPR-NEXT: pdepl %esi, %edi, %eax # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x43,0xf5,0xc6] ; EGPR-NEXT: imull %eax, %eax # encoding: [0x0f,0xaf,0xc0] ; EGPR-NEXT: retq # encoding: [0xc3] _______________________________________________ cfe-commits mailing list [email protected] https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
