https://github.com/RKSimon created 
https://github.com/llvm/llvm-project/pull/204144

* Remove X86ISD::PDEP/PEXT and use ISD::PDEP/PEXT instead
* AutoUpgrade x86 pdep/pext intrinsics to llvm.pdep/pext generics
* Move X86 DAG knownbits/demandedbits handling to generic (unchanged)
* Move X86 InstCombine folds to generic (unchanged)
* Update clang builtins to emit generics

>From 8e21095611e2cd2c4a384b105c0944c490d41dab Mon Sep 17 00:00:00 2001
From: Simon Pilgrim <[email protected]>
Date: Tue, 16 Jun 2026 14:19:33 +0100
Subject: [PATCH] [WIP][X86] Replace X86 specific pdep/pext handling with
 generic PDEP/PEXT intrinsics

* Remove X86ISD::PDEP/PEXT and use ISD::PDEP/PEXT instead
* AutoUpgrade x86 pdep/pext intrinsics to llvm.pdep/pext generics
* Move X86 DAG knownbits/demandedbits handling to generic (unchanged)
* Move X86 InstCombine folds to generic (unchanged)
* Update clang builtins to emit generics
---
 clang/lib/CodeGen/TargetBuiltins/X86.cpp      | 10 +++
 clang/test/CodeGen/X86/bmi2-builtins.c        |  8 +-
 llvm/include/llvm/IR/IntrinsicsX86.td         | 12 ---
 llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp |  6 ++
 .../lib/CodeGen/SelectionDAG/SelectionDAG.cpp | 18 ++++
 .../CodeGen/SelectionDAG/TargetLowering.cpp   | 28 ++++++
 llvm/lib/IR/AutoUpgrade.cpp                   |  8 ++
 llvm/lib/Target/X86/X86ISelLowering.cpp       | 50 +----------
 .../Target/X86/X86InstCombineIntrinsic.cpp    | 88 -------------------
 llvm/lib/Target/X86/X86InstrFragments.td      |  4 -
 llvm/lib/Target/X86/X86InstrMisc.td           | 54 ++----------
 llvm/lib/Target/X86/X86IntrinsicsInfo.h       |  4 -
 .../InstCombine/InstCombineCalls.cpp          | 58 ++++++++++++
 .../Instrumentation/MemorySanitizer.cpp       |  4 -
 llvm/test/CodeGen/X86/bmi2.ll                 | 23 +++--
 15 files changed, 152 insertions(+), 223 deletions(-)

diff --git a/clang/lib/CodeGen/TargetBuiltins/X86.cpp 
b/clang/lib/CodeGen/TargetBuiltins/X86.cpp
index acfeb9967cd2f..50125a71fcd5f 100644
--- a/clang/lib/CodeGen/TargetBuiltins/X86.cpp
+++ b/clang/lib/CodeGen/TargetBuiltins/X86.cpp
@@ -976,6 +976,16 @@ Value *CodeGenFunction::EmitX86BuiltinExpr(unsigned 
BuiltinID,
     Function *F = CGM.getIntrinsic(Intrinsic::cttz, Ops[0]->getType());
     return Builder.CreateCall(F, {Ops[0], Builder.getInt1(false)});
   }
+  case X86::BI__builtin_ia32_pdep_si:
+  case X86::BI__builtin_ia32_pdep_di: {
+    Function *F = CGM.getIntrinsic(Intrinsic::pdep, Ops[0]->getType());
+    return Builder.CreateCall(F, Ops);
+  }
+  case X86::BI__builtin_ia32_pext_si:
+  case X86::BI__builtin_ia32_pext_di: {
+    Function *F = CGM.getIntrinsic(Intrinsic::pext, Ops[0]->getType());
+    return Builder.CreateCall(F, Ops);
+  }
   case X86::BI__builtin_ia32_undef128:
   case X86::BI__builtin_ia32_undef256:
   case X86::BI__builtin_ia32_undef512:
diff --git a/clang/test/CodeGen/X86/bmi2-builtins.c 
b/clang/test/CodeGen/X86/bmi2-builtins.c
index 1b2cb9048adb2..c83cc43d9fc3f 100644
--- a/clang/test/CodeGen/X86/bmi2-builtins.c
+++ b/clang/test/CodeGen/X86/bmi2-builtins.c
@@ -17,12 +17,12 @@ unsigned int test_bzhi_u32(unsigned int __X, unsigned int 
__Y) {
 }
 
 unsigned int test_pdep_u32(unsigned int __X, unsigned int __Y) {
-  // CHECK: @llvm.x86.bmi.pdep.32
+  // CHECK: @llvm.pdep.i32
   return _pdep_u32(__X, __Y);
 }
 
 unsigned int test_pext_u32(unsigned int __X, unsigned int __Y) {
-  // CHECK: @llvm.x86.bmi.pext.32
+  // CHECK: @llvm.pext.i32
   return _pext_u32(__X, __Y);
 }
 
@@ -41,12 +41,12 @@ unsigned long long test_bzhi_u64(unsigned long long __X, 
unsigned long long __Y)
 }
 
 unsigned long long test_pdep_u64(unsigned long long __X, unsigned long long 
__Y) {
-  // CHECK: @llvm.x86.bmi.pdep.64
+  // CHECK: @llvm.pdep.i64
   return _pdep_u64(__X, __Y);
 }
 
 unsigned long long test_pext_u64(unsigned long long __X, unsigned long long 
__Y) {
-  // CHECK: @llvm.x86.bmi.pext.64
+  // CHECK: @llvm.pext.i64
   return _pext_u64(__X, __Y);
 }
 
diff --git a/llvm/include/llvm/IR/IntrinsicsX86.td 
b/llvm/include/llvm/IR/IntrinsicsX86.td
index b75a0485d6263..5c7785731111c 100644
--- a/llvm/include/llvm/IR/IntrinsicsX86.td
+++ b/llvm/include/llvm/IR/IntrinsicsX86.td
@@ -2575,18 +2575,6 @@ let TargetPrefix = "x86" in {  // All intrinsics start 
with "llvm.x86.".
   def int_x86_bmi_bzhi_64 : ClangBuiltin<"__builtin_ia32_bzhi_di">,
       DefaultAttrsIntrinsic<[llvm_i64_ty], [llvm_i64_ty, llvm_i64_ty],
                             [IntrNoMem]>;
-  def int_x86_bmi_pdep_32 : ClangBuiltin<"__builtin_ia32_pdep_si">,
-      DefaultAttrsIntrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty],
-                            [IntrNoMem]>;
-  def int_x86_bmi_pdep_64 : ClangBuiltin<"__builtin_ia32_pdep_di">,
-      DefaultAttrsIntrinsic<[llvm_i64_ty], [llvm_i64_ty, llvm_i64_ty],
-                            [IntrNoMem]>;
-  def int_x86_bmi_pext_32 : ClangBuiltin<"__builtin_ia32_pext_si">,
-      DefaultAttrsIntrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty],
-                            [IntrNoMem]>;
-  def int_x86_bmi_pext_64 : ClangBuiltin<"__builtin_ia32_pext_di">,
-      DefaultAttrsIntrinsic<[llvm_i64_ty], [llvm_i64_ty, llvm_i64_ty],
-                            [IntrNoMem]>;
 }
 
 
//===----------------------------------------------------------------------===//
diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp 
b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index 7fdaacff0582d..c2c8e930abcf7 100644
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -12245,12 +12245,18 @@ SDValue DAGCombiner::visitPDEP(SDNode *N) {
   // pdep(x, 0) -> 0
   if (isNullOrNullSplat(N1))
     return DAG.getConstant(0, DL, VT);
+
   // pdep(x, -1) -> x  (all positions selected, bits deposited at identity)
   if (isAllOnesOrAllOnesSplat(N1))
     return N0;
+
   // fold pdep(c1, c2) -> expandBits(c1, c2)
   if (SDValue C = DAG.FoldConstantArithmetic(ISD::PDEP, DL, VT, {N0, N1}))
     return C;
+
+  if (SimplifyDemandedBits(SDValue(N, 0)))
+    return SDValue(N, 0);
+
   return SDValue();
 }
 
diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp 
b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
index 534ba48bd5d1a..ed94d757b0509 100644
--- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
@@ -3952,6 +3952,24 @@ KnownBits SelectionDAG::computeKnownBits(SDValue Op, 
const APInt &DemandedElts,
     Known.Zero.setBitsFrom(1);
     break;
   }
+  case ISD::PDEP: {
+    Known = computeKnownBits(Op.getOperand(1), DemandedElts, Depth + 1);
+    Known2 = computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1);
+    // Zeros are retained from the mask operand. But not ones.
+    Known.One.clearAllBits();
+    // The result will have at least as many trailing zeros as the non-mask
+    // operand since bits can only map to the same or higher bit position.
+    Known.Zero.setLowBits(Known2.countMinTrailingZeros());
+    break;
+  }
+  case ISD::PEXT: {
+    Known = computeKnownBits(Op.getOperand(1), DemandedElts, Depth + 1);
+    // The result has as many leading zeros as the number of zeroes in the 
mask.
+    unsigned Count = Known.Zero.popcount();
+    Known.Zero = APInt::getHighBitsSet(BitWidth, Count);
+    Known.One.clearAllBits();
+    break;
+  }
   case ISD::CLMUL: {
     Known = computeKnownBits(Op.getOperand(1), DemandedElts, Depth + 1);
     Known2 = computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1);
diff --git a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp 
b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
index 524e084b8afa8..530a46a9331c1 100644
--- a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
@@ -2463,6 +2463,34 @@ bool TargetLowering::SimplifyDemandedBits(
     Known = TLO.DAG.computeKnownBits(Op, DemandedElts, Depth);
     break;
   }
+  case ISD::PDEP: {
+    SDValue Op0 = Op.getOperand(0);
+    SDValue Op1 = Op.getOperand(1);
+
+    unsigned DemandedBitsLZ = OriginalDemandedBits.countl_zero();
+    APInt LoMask = APInt::getLowBitsSet(BitWidth, BitWidth - DemandedBitsLZ);
+
+    // If the demanded bits has leading zeroes, we don't demand those from the
+    // mask.
+    if (SimplifyDemandedBits(Op1, LoMask, Known, TLO, Depth + 1))
+      return true;
+
+    // The number of possible 1s in the mask determines the number of LSBs of
+    // operand 0 used. Undemanded bits from the mask don't matter so filter
+    // them before counting.
+    KnownBits Known2;
+    uint64_t Count = (~Known.Zero & LoMask).popcount();
+    APInt DemandedMask(APInt::getLowBitsSet(BitWidth, Count));
+    if (SimplifyDemandedBits(Op0, DemandedMask, Known2, TLO, Depth + 1))
+      return true;
+
+    // Zeroes are retained from the mask, but not ones.
+    Known.One.clearAllBits();
+    // The result will have at least as many trailing zeros as the non-mask
+    // operand since bits can only map to the same or higher bit position.
+    Known.Zero.setLowBits(Known2.countMinTrailingZeros());
+    break;
+  }
   case ISD::SIGN_EXTEND_INREG: {
     SDValue Op0 = Op.getOperand(0);
     EVT ExVT = cast<VTSDNode>(Op.getOperand(1))->getVT();
diff --git a/llvm/lib/IR/AutoUpgrade.cpp b/llvm/lib/IR/AutoUpgrade.cpp
index 814e985ebf7be..9422fc6129efd 100644
--- a/llvm/lib/IR/AutoUpgrade.cpp
+++ b/llvm/lib/IR/AutoUpgrade.cpp
@@ -533,6 +533,10 @@ static bool shouldUpgradeX86Intrinsic(Function *F, 
StringRef Name) {
             Name.starts_with("vpcom") || // Added in 3.2, Updated in 9.0
             Name.starts_with("vprot"));  // Added in 8.0
 
+  if (Name.consume_front("bmi."))
+    return (Name.starts_with("pdep.") || // Added in 23.0
+            Name.starts_with("pext."));  // Added in 23.0
+
   return (Name == "addcarry.u32" ||        // Added in 8.0
           Name == "addcarry.u64" ||        // Added in 8.0
           Name == "addcarryx.u32" ||       // Added in 8.0
@@ -4616,6 +4620,10 @@ static Value *upgradeX86IntrinsicCall(StringRef Name, 
CallBase *CI, Function *F,
   } else if (Name.starts_with("avx512.mask.") &&
              upgradeAVX512MaskToSelect(Name, Builder, *CI, Rep)) {
     // Rep will be updated by the call in the condition.
+  } else if (Name.starts_with("bmi.pdep.")) {
+    Rep = upgradeX86BinaryIntrinsics(Builder, *CI, Intrinsic::pdep);
+  } else if (Name.starts_with("bmi.pext.")) {
+    Rep = upgradeX86BinaryIntrinsics(Builder, *CI, Intrinsic::pext);
   } else
     reportFatalUsageErrorWithCI("Unexpected intrinsic", CI);
 
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp 
b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 196ee8775c7f3..aaf7bbe75268a 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -39732,25 +39732,6 @@ void 
X86TargetLowering::computeKnownBitsForTargetNode(const SDValue Op,
     Known.One.clearAllBits();
     break;
   }
-  case X86ISD::PDEP: {
-    KnownBits Known2;
-    Known = DAG.computeKnownBits(Op.getOperand(1), DemandedElts, Depth + 1);
-    Known2 = DAG.computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1);
-    // Zeros are retained from the mask operand. But not ones.
-    Known.One.clearAllBits();
-    // The result will have at least as many trailing zeros as the non-mask
-    // operand since bits can only map to the same or higher bit position.
-    Known.Zero.setLowBits(Known2.countMinTrailingZeros());
-    break;
-  }
-  case X86ISD::PEXT: {
-    Known = DAG.computeKnownBits(Op.getOperand(1), DemandedElts, Depth + 1);
-    // The result has as many leading zeros as the number of zeroes in the 
mask.
-    unsigned Count = Known.Zero.popcount();
-    Known.Zero = APInt::getHighBitsSet(BitWidth, Count);
-    Known.One.clearAllBits();
-    break;
-  }
   case X86ISD::VTRUNC:
   case X86ISD::VTRUNCS:
   case X86ISD::VTRUNCUS:
@@ -45985,34 +45966,6 @@ bool 
X86TargetLowering::SimplifyDemandedBitsForTargetNode(
 
     break;
   }
-  case X86ISD::PDEP: {
-    SDValue Op0 = Op.getOperand(0);
-    SDValue Op1 = Op.getOperand(1);
-
-    unsigned DemandedBitsLZ = OriginalDemandedBits.countl_zero();
-    APInt LoMask = APInt::getLowBitsSet(BitWidth, BitWidth - DemandedBitsLZ);
-
-    // If the demanded bits has leading zeroes, we don't demand those from the
-    // mask.
-    if (SimplifyDemandedBits(Op1, LoMask, Known, TLO, Depth + 1))
-      return true;
-
-    // The number of possible 1s in the mask determines the number of LSBs of
-    // operand 0 used. Undemanded bits from the mask don't matter so filter
-    // them before counting.
-    KnownBits Known2;
-    uint64_t Count = (~Known.Zero & LoMask).popcount();
-    APInt DemandedMask(APInt::getLowBitsSet(BitWidth, Count));
-    if (SimplifyDemandedBits(Op0, DemandedMask, Known2, TLO, Depth + 1))
-      return true;
-
-    // Zeroes are retained from the mask, but not ones.
-    Known.One.clearAllBits();
-    // The result will have at least as many trailing zeros as the non-mask
-    // operand since bits can only map to the same or higher bit position.
-    Known.Zero.setLowBits(Known2.countMinTrailingZeros());
-    return false;
-  }
   case X86ISD::VPMADD52L:
   case X86ISD::VPMADD52H: {
     KnownBits KnownOp0, KnownOp1, KnownOp2;
@@ -63393,8 +63346,7 @@ SDValue X86TargetLowering::PerformDAGCombine(SDNode *N,
   case X86ISD::MOVDQ2Q:     return combineMOVDQ2Q(N, DAG);
   case X86ISD::BEXTR:
   case X86ISD::BEXTRI:
-  case X86ISD::BZHI:
-  case X86ISD::PDEP:        return combineBMI(N, DAG, DCI);
+  case X86ISD::BZHI:        return combineBMI(N, DAG, DCI);
   case X86ISD::PCLMULQDQ:   return combinePCLMULQDQ(N, DAG, DCI);
   case ISD::INTRINSIC_WO_CHAIN:  return combineINTRINSIC_WO_CHAIN(N, DAG, DCI);
   case ISD::INTRINSIC_W_CHAIN:  return combineINTRINSIC_W_CHAIN(N, DAG, DCI);
diff --git a/llvm/lib/Target/X86/X86InstCombineIntrinsic.cpp 
b/llvm/lib/Target/X86/X86InstCombineIntrinsic.cpp
index 4999581489e82..ad1c171428671 100644
--- a/llvm/lib/Target/X86/X86InstCombineIntrinsic.cpp
+++ b/llvm/lib/Target/X86/X86InstCombineIntrinsic.cpp
@@ -2259,94 +2259,6 @@ X86TTIImpl::instCombineIntrinsic(InstCombiner &IC, 
IntrinsicInst &II) const {
       // TODO should we convert this to an AND if the RHS is constant?
     }
     break;
-  case Intrinsic::x86_bmi_pext_32:
-  case Intrinsic::x86_bmi_pext_64:
-    if (auto *MaskC = dyn_cast<ConstantInt>(II.getArgOperand(1))) {
-      if (MaskC->isNullValue()) {
-        return IC.replaceInstUsesWith(II, ConstantInt::get(II.getType(), 0));
-      }
-      if (MaskC->isAllOnesValue()) {
-        return IC.replaceInstUsesWith(II, II.getArgOperand(0));
-      }
-
-      unsigned MaskIdx, MaskLen;
-      if (MaskC->getValue().isShiftedMask(MaskIdx, MaskLen)) {
-        // any single contingous sequence of 1s anywhere in the mask simply
-        // describes a subset of the input bits shifted to the appropriate
-        // position.  Replace with the straight forward IR.
-        Value *Input = II.getArgOperand(0);
-        Value *Masked = IC.Builder.CreateAnd(Input, II.getArgOperand(1));
-        Value *ShiftAmt = ConstantInt::get(II.getType(), MaskIdx);
-        Value *Shifted = IC.Builder.CreateLShr(Masked, ShiftAmt);
-        return IC.replaceInstUsesWith(II, Shifted);
-      }
-
-      if (auto *SrcC = dyn_cast<ConstantInt>(II.getArgOperand(0))) {
-        uint64_t Src = SrcC->getZExtValue();
-        uint64_t Mask = MaskC->getZExtValue();
-        uint64_t Result = 0;
-        uint64_t BitToSet = 1;
-
-        while (Mask) {
-          // Isolate lowest set bit.
-          uint64_t BitToTest = Mask & -Mask;
-          if (BitToTest & Src)
-            Result |= BitToSet;
-
-          BitToSet <<= 1;
-          // Clear lowest set bit.
-          Mask &= Mask - 1;
-        }
-
-        return IC.replaceInstUsesWith(II,
-                                      ConstantInt::get(II.getType(), Result));
-      }
-    }
-    break;
-  case Intrinsic::x86_bmi_pdep_32:
-  case Intrinsic::x86_bmi_pdep_64:
-    if (auto *MaskC = dyn_cast<ConstantInt>(II.getArgOperand(1))) {
-      if (MaskC->isNullValue()) {
-        return IC.replaceInstUsesWith(II, ConstantInt::get(II.getType(), 0));
-      }
-      if (MaskC->isAllOnesValue()) {
-        return IC.replaceInstUsesWith(II, II.getArgOperand(0));
-      }
-
-      unsigned MaskIdx, MaskLen;
-      if (MaskC->getValue().isShiftedMask(MaskIdx, MaskLen)) {
-        // any single contingous sequence of 1s anywhere in the mask simply
-        // describes a subset of the input bits shifted to the appropriate
-        // position.  Replace with the straight forward IR.
-        Value *Input = II.getArgOperand(0);
-        Value *ShiftAmt = ConstantInt::get(II.getType(), MaskIdx);
-        Value *Shifted = IC.Builder.CreateShl(Input, ShiftAmt);
-        Value *Masked = IC.Builder.CreateAnd(Shifted, II.getArgOperand(1));
-        return IC.replaceInstUsesWith(II, Masked);
-      }
-
-      if (auto *SrcC = dyn_cast<ConstantInt>(II.getArgOperand(0))) {
-        uint64_t Src = SrcC->getZExtValue();
-        uint64_t Mask = MaskC->getZExtValue();
-        uint64_t Result = 0;
-        uint64_t BitToTest = 1;
-
-        while (Mask) {
-          // Isolate lowest set bit.
-          uint64_t BitToSet = Mask & -Mask;
-          if (BitToTest & Src)
-            Result |= BitToSet;
-
-          BitToTest <<= 1;
-          // Clear lowest set bit;
-          Mask &= Mask - 1;
-        }
-
-        return IC.replaceInstUsesWith(II,
-                                      ConstantInt::get(II.getType(), Result));
-      }
-    }
-    break;
 
   case Intrinsic::x86_sse_cvtss2si:
   case Intrinsic::x86_sse_cvtss2si64:
diff --git a/llvm/lib/Target/X86/X86InstrFragments.td 
b/llvm/lib/Target/X86/X86InstrFragments.td
index 9316360c5e02a..923b968382866 100644
--- a/llvm/lib/Target/X86/X86InstrFragments.td
+++ b/llvm/lib/Target/X86/X86InstrFragments.td
@@ -424,10 +424,6 @@ def X86bextri : SDNode<"X86ISD::BEXTRI", SDTIntBinOp>;
 // Zero High Bits Starting with Specified Bit Position.
 def X86bzhi   : SDNode<"X86ISD::BZHI",   SDTIntBinOp>;
 
-// Parallel extract and deposit.
-def X86pdep   : SDNode<"X86ISD::PDEP",   SDTIntBinOp>;
-def X86pext   : SDNode<"X86ISD::PEXT",   SDTIntBinOp>;
-
 // X86-specific multiply by immediate.
 def X86mul_imm : SDNode<"X86ISD::MUL_IMM", SDTIntBinOp>;
 
diff --git a/llvm/lib/Target/X86/X86InstrMisc.td 
b/llvm/lib/Target/X86/X86InstrMisc.td
index 613a431fe365a..c6acaa697fdc7 100644
--- a/llvm/lib/Target/X86/X86InstrMisc.td
+++ b/llvm/lib/Target/X86/X86InstrMisc.td
@@ -1391,55 +1391,17 @@ multiclass PdepPext<string m, X86TypeInfo t, 
SDPatternOperator node,
 }
 
 let Predicates = [HasBMI2, NoEGPR] in {
-  defm PDEP32 : PdepPext<"pdep", Xi32, X86pdep>, XD, VEX;
-  defm PDEP64 : PdepPext<"pdep", Xi64, X86pdep>, XD, REX_W, VEX;
-  defm PEXT32 : PdepPext<"pext", Xi32, X86pext>, XS, VEX;
-  defm PEXT64 : PdepPext<"pext", Xi64, X86pext>, XS, REX_W, VEX;
+  defm PDEP32 : PdepPext<"pdep", Xi32, pdep>, XD, VEX;
+  defm PDEP64 : PdepPext<"pdep", Xi64, pdep>, XD, REX_W, VEX;
+  defm PEXT32 : PdepPext<"pext", Xi32, pext>, XS, VEX;
+  defm PEXT64 : PdepPext<"pext", Xi64, pext>, XS, REX_W, VEX;
 }
 
 let Predicates = [HasBMI2, HasEGPR] in {
-  defm PDEP32 : PdepPext<"pdep", Xi32, X86pdep, "_EVEX">, XD, EVEX;
-  defm PDEP64 : PdepPext<"pdep", Xi64, X86pdep, "_EVEX">, XD, REX_W, EVEX;
-  defm PEXT32 : PdepPext<"pext", Xi32, X86pext, "_EVEX">, XS, EVEX;
-  defm PEXT64 : PdepPext<"pext", Xi64, X86pext, "_EVEX">, XS, REX_W, EVEX;
-}
-
-let Predicates = [HasBMI2, NoEGPR] in {
-  def : Pat<(i32 (pext GR32:$src, GR32:$mask)),
-            (PEXT32rr GR32:$src, GR32:$mask)>;
-  def : Pat<(i32 (pext GR32:$src, (loadi32 addr:$mask))),
-            (PEXT32rm GR32:$src, i32mem:$mask)>;
-  def : Pat<(i64 (pext GR64:$src, GR64:$mask)),
-            (PEXT64rr GR64:$src, GR64:$mask)>;
-  def : Pat<(i64 (pext GR64:$src, (loadi64 addr:$mask))),
-            (PEXT64rm GR64:$src, i64mem:$mask)>;
-  def : Pat<(i32 (pdep GR32:$src, GR32:$mask)),
-            (PDEP32rr GR32:$src, GR32:$mask)>;
-  def : Pat<(i32 (pdep GR32:$src, (loadi32 addr:$mask))),
-            (PDEP32rm GR32:$src, i32mem:$mask)>;
-  def : Pat<(i64 (pdep GR64:$src, GR64:$mask)),
-            (PDEP64rr GR64:$src, GR64:$mask)>;
-  def : Pat<(i64 (pdep GR64:$src, (loadi64 addr:$mask))),
-            (PDEP64rm GR64:$src, i64mem:$mask)>;
-}
-
-let Predicates = [HasBMI2, HasEGPR] in {
-  def : Pat<(i32 (pext GR32:$src, GR32:$mask)),
-            (PEXT32rr_EVEX GR32:$src, GR32:$mask)>;
-  def : Pat<(i32 (pext GR32:$src, (loadi32 addr:$mask))),
-            (PEXT32rm_EVEX GR32:$src, i32mem:$mask)>;
-  def : Pat<(i64 (pext GR64:$src, GR64:$mask)),
-            (PEXT64rr_EVEX GR64:$src, GR64:$mask)>;
-  def : Pat<(i64 (pext GR64:$src, (loadi64 addr:$mask))),
-            (PEXT64rm_EVEX GR64:$src, i64mem:$mask)>;
-  def : Pat<(i32 (pdep GR32:$src, GR32:$mask)),
-            (PDEP32rr_EVEX GR32:$src, GR32:$mask)>;
-  def : Pat<(i32 (pdep GR32:$src, (loadi32 addr:$mask))),
-            (PDEP32rm_EVEX GR32:$src, i32mem:$mask)>;
-  def : Pat<(i64 (pdep GR64:$src, GR64:$mask)),
-            (PDEP64rr_EVEX GR64:$src, GR64:$mask)>;
-  def : Pat<(i64 (pdep GR64:$src, (loadi64 addr:$mask))),
-            (PDEP64rm_EVEX GR64:$src, i64mem:$mask)>;
+  defm PDEP32 : PdepPext<"pdep", Xi32, pdep, "_EVEX">, XD, EVEX;
+  defm PDEP64 : PdepPext<"pdep", Xi64, pdep, "_EVEX">, XD, REX_W, EVEX;
+  defm PEXT32 : PdepPext<"pext", Xi32, pext, "_EVEX">, XS, EVEX;
+  defm PEXT64 : PdepPext<"pext", Xi64, pext, "_EVEX">, XS, REX_W, EVEX;
 }
 
 
//===----------------------------------------------------------------------===//
diff --git a/llvm/lib/Target/X86/X86IntrinsicsInfo.h 
b/llvm/lib/Target/X86/X86IntrinsicsInfo.h
index 9e32ca23dafe2..a6b0db0230cf3 100644
--- a/llvm/lib/Target/X86/X86IntrinsicsInfo.h
+++ b/llvm/lib/Target/X86/X86IntrinsicsInfo.h
@@ -1837,10 +1837,6 @@ static const IntrinsicData IntrinsicsWithoutChain[] = {
     X86_INTRINSIC_DATA(bmi_bextr_64, INTR_TYPE_2OP, X86ISD::BEXTR, 0),
     X86_INTRINSIC_DATA(bmi_bzhi_32, INTR_TYPE_2OP, X86ISD::BZHI, 0),
     X86_INTRINSIC_DATA(bmi_bzhi_64, INTR_TYPE_2OP, X86ISD::BZHI, 0),
-    X86_INTRINSIC_DATA(bmi_pdep_32, INTR_TYPE_2OP, X86ISD::PDEP, 0),
-    X86_INTRINSIC_DATA(bmi_pdep_64, INTR_TYPE_2OP, X86ISD::PDEP, 0),
-    X86_INTRINSIC_DATA(bmi_pext_32, INTR_TYPE_2OP, X86ISD::PEXT, 0),
-    X86_INTRINSIC_DATA(bmi_pext_64, INTR_TYPE_2OP, X86ISD::PEXT, 0),
     X86_INTRINSIC_DATA(fma_vfmaddsub_pd, INTR_TYPE_3OP, X86ISD::FMADDSUB, 0),
     X86_INTRINSIC_DATA(fma_vfmaddsub_pd_256, INTR_TYPE_3OP, X86ISD::FMADDSUB,
                        0),
diff --git a/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp 
b/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp
index ffa6504053f0d..4c9d3b8bc2100 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp
@@ -2660,6 +2660,64 @@ Instruction *InstCombinerImpl::visitCallInst(CallInst 
&CI) {
       return &CI;
     break;
   }
+  case Intrinsic::pdep:
+    if (auto *MaskC = dyn_cast<ConstantInt>(II->getArgOperand(1))) {
+      if (MaskC->isNullValue())
+        return replaceInstUsesWith(*II, ConstantInt::get(II->getType(), 0));
+
+      if (MaskC->isAllOnesValue())
+        return replaceInstUsesWith(*II, II->getArgOperand(0));
+
+      unsigned MaskIdx, MaskLen;
+      if (MaskC->getValue().isShiftedMask(MaskIdx, MaskLen)) {
+        // any single contiguous sequence of 1s anywhere in the mask simply
+        // describes a subset of the input bits shifted to the appropriate
+        // position.  Replace with the straight forward IR.
+        Value *Input = II->getArgOperand(0);
+        Value *ShiftAmt = ConstantInt::get(II->getType(), MaskIdx);
+        Value *Shifted = Builder.CreateShl(Input, ShiftAmt);
+        Value *Masked = Builder.CreateAnd(Shifted, II->getArgOperand(1));
+        return replaceInstUsesWith(*II, Masked);
+      }
+
+      if (auto *SrcC = dyn_cast<ConstantInt>(II->getArgOperand(0))) {
+        // constant folding.
+        APInt Result =
+            llvm::APIntOps::expandBits(SrcC->getValue(), MaskC->getValue());
+        return replaceInstUsesWith(*II,
+                                   ConstantInt::get(II->getType(), Result));
+      }
+    }
+    break;
+  case Intrinsic::pext:
+    if (auto *MaskC = dyn_cast<ConstantInt>(II->getArgOperand(1))) {
+      if (MaskC->isNullValue())
+        return replaceInstUsesWith(*II, ConstantInt::get(II->getType(), 0));
+
+      if (MaskC->isAllOnesValue())
+        return replaceInstUsesWith(*II, II->getArgOperand(0));
+
+      unsigned MaskIdx, MaskLen;
+      if (MaskC->getValue().isShiftedMask(MaskIdx, MaskLen)) {
+        // any single contiguous sequence of 1s anywhere in the mask simply
+        // describes a subset of the input bits shifted to the appropriate
+        // position.  Replace with the straight forward IR.
+        Value *Input = II->getArgOperand(0);
+        Value *Masked = Builder.CreateAnd(Input, II->getArgOperand(1));
+        Value *ShiftAmt = ConstantInt::get(II->getType(), MaskIdx);
+        Value *Shifted = Builder.CreateLShr(Masked, ShiftAmt);
+        return replaceInstUsesWith(*II, Shifted);
+      }
+
+      if (auto *SrcC = dyn_cast<ConstantInt>(II->getArgOperand(0))) {
+        // constant folding.
+        APInt Result =
+            llvm::APIntOps::compressBits(SrcC->getValue(), MaskC->getValue());
+        return replaceInstUsesWith(*II,
+                                   ConstantInt::get(II->getType(), Result));
+      }
+    }
+    break;
   case Intrinsic::ptrmask: {
     unsigned BitWidth = DL.getPointerTypeSizeInBits(II->getType());
     KnownBits Known(BitWidth);
diff --git a/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp 
b/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp
index 383f503a2e87f..72d6bbc462d00 100644
--- a/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp
+++ b/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp
@@ -6504,10 +6504,6 @@ struct MemorySanitizerVisitor : public 
InstVisitor<MemorySanitizerVisitor> {
     case Intrinsic::x86_bmi_bextr_64:
     case Intrinsic::x86_bmi_bzhi_32:
     case Intrinsic::x86_bmi_bzhi_64:
-    case Intrinsic::x86_bmi_pdep_32:
-    case Intrinsic::x86_bmi_pdep_64:
-    case Intrinsic::x86_bmi_pext_32:
-    case Intrinsic::x86_bmi_pext_64:
       handleBmiIntrinsic(I);
       break;
 
diff --git a/llvm/test/CodeGen/X86/bmi2.ll b/llvm/test/CodeGen/X86/bmi2.ll
index cabeebb0c3f36..41585bde9a696 100644
--- a/llvm/test/CodeGen/X86/bmi2.ll
+++ b/llvm/test/CodeGen/X86/bmi2.ll
@@ -128,7 +128,7 @@ define i32 @pdep32_load(i32 %x, ptr %y)   {
 define i32 @pdep32_anyext(i16 %x)   {
 ; X86-LABEL: pdep32_anyext:
 ; X86:       # %bb.0:
-; X86-NEXT:    movswl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movl $-1431655766, %ecx # imm = 0xAAAAAAAA
 ; X86-NEXT:    pdepl %ecx, %eax, %eax
 ; X86-NEXT:    retl
@@ -178,7 +178,7 @@ define i32 @pdep32_demandedbits(i32 %x) {
 define i32 @pdep32_demandedbits2(i32 %x, i32 %y) {
 ; X86-LABEL: pdep32_demandedbits2:
 ; X86:       # %bb.0:
-; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    pdepl {{[0-9]+}}(%esp), %eax, %eax
 ; X86-NEXT:    andl $128, %eax
 ; X86-NEXT:    retl
@@ -203,9 +203,8 @@ define i32 @pdep32_demandedbits2(i32 %x, i32 %y) {
 define i32 @pdep32_demandedbits_mask(i32 %x, i16 %y) {
 ; X86-LABEL: pdep32_demandedbits_mask:
 ; X86:       # %bb.0:
-; X86-NEXT:    movswl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    pdepl %eax, %ecx, %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    pdepl {{[0-9]+}}(%esp), %eax, %eax
 ; X86-NEXT:    andl $32768, %eax # imm = 0x8000
 ; X86-NEXT:    retl
 ;
@@ -230,9 +229,8 @@ define i32 @pdep32_demandedbits_mask(i32 %x, i16 %y) {
 define i32 @pdep32_demandedbits_mask2(i32 %x, i16 %y) {
 ; X86-LABEL: pdep32_demandedbits_mask2:
 ; X86:       # %bb.0:
-; X86-NEXT:    movswl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    pdepl %eax, %ecx, %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    pdepl {{[0-9]+}}(%esp), %eax, %eax
 ; X86-NEXT:    movzwl %ax, %eax
 ; X86-NEXT:    retl
 ;
@@ -285,22 +283,23 @@ define i32 @pdep32_knownbits(i32 %x) {
 define i32 @pdep32_knownbits2(i32 %x, i32 %y) {
 ; X86-LABEL: pdep32_knownbits2:
 ; X86:       # %bb.0:
-; X86-NEXT:    movl $-256, %eax
-; X86-NEXT:    andl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    shll $8, %eax
 ; X86-NEXT:    pdepl {{[0-9]+}}(%esp), %eax, %eax
 ; X86-NEXT:    imull %eax, %eax
 ; X86-NEXT:    retl
 ;
 ; X64-LABEL: pdep32_knownbits2:
 ; X64:       # %bb.0:
-; X64-NEXT:    andl $-256, %edi
+; X64-NEXT:    andl $16776960, %edi # imm = 0xFFFF00
 ; X64-NEXT:    pdepl %esi, %edi, %eax
 ; X64-NEXT:    imull %eax, %eax
 ; X64-NEXT:    retq
 ;
 ; EGPR-LABEL: pdep32_knownbits2:
 ; EGPR:       # %bb.0:
-; EGPR-NEXT:    andl $-256, %edi # encoding: [0x81,0xe7,0x00,0xff,0xff,0xff]
+; EGPR-NEXT:    andl $16776960, %edi # encoding: 
[0x81,0xe7,0x00,0xff,0xff,0x00]
+; EGPR-NEXT:    # imm = 0xFFFF00
 ; EGPR-NEXT:    pdepl %esi, %edi, %eax # EVEX TO VEX Compression encoding: 
[0xc4,0xe2,0x43,0xf5,0xc6]
 ; EGPR-NEXT:    imull %eax, %eax # encoding: [0x0f,0xaf,0xc0]
 ; EGPR-NEXT:    retq # encoding: [0xc3]

_______________________________________________
cfe-commits mailing list
[email protected]
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits

Reply via email to