================
@@ -5386,6 +5386,130 @@ bool 
AMDGPULegalizerInfo::legalizeDSAtomicFPIntrinsic(LegalizerHelper &Helper,
   return true;
 }
 
+bool AMDGPULegalizerInfo::legalizeLaneOp(LegalizerHelper &Helper,
+                                         MachineInstr &MI,
+                                         Intrinsic::ID IID) const {
+
+  MachineIRBuilder &B = Helper.MIRBuilder;
+  MachineRegisterInfo &MRI = *B.getMRI();
+
+  Register DstReg = MI.getOperand(0).getReg();
+  Register Src0 = MI.getOperand(2).getReg();
+
+  auto createLaneOp = [&](Register &Src0, Register &Src1,
+                          Register &Src2) -> Register {
+    auto LaneOpDst = B.buildIntrinsic(IID, {S32}).addUse(Src0);
+    if (Src2.isValid())
+      return (LaneOpDst.addUse(Src1).addUse(Src2)).getReg(0);
+    if (Src1.isValid())
+      return (LaneOpDst.addUse(Src1)).getReg(0);
+    return LaneOpDst.getReg(0);
+  };
+
+  Register Src1, Src2, Src0Valid, Src2Valid;
+  if (IID == Intrinsic::amdgcn_readlane || IID == Intrinsic::amdgcn_writelane) 
{
+    Src1 = MI.getOperand(3).getReg();
+    if (IID == Intrinsic::amdgcn_writelane) {
+      Src2 = MI.getOperand(4).getReg();
+    }
+  }
+
+  LLT Ty = MRI.getType(DstReg);
+  unsigned Size = Ty.getSizeInBits();
+
+  if (Size == 32) {
+    if (Ty.isScalar())
+      // Already legal
+      return true;
+
+    Register Src0Valid = B.buildBitcast(S32, Src0).getReg(0);
+    if (Src2.isValid())
+      Src2Valid = B.buildBitcast(S32, Src2).getReg(0);
+    Register LaneOp = createLaneOp(Src0Valid, Src1, Src2Valid);
+    B.buildBitcast(DstReg, LaneOp);
+    MI.eraseFromParent();
+    return true;
+  }
+
+  if (Size < 32) {
+    Register Src0Cast = MRI.getType(Src0).isScalar()
+                            ? Src0
+                            : B.buildBitcast(LLT::scalar(Size), 
Src0).getReg(0);
+    Src0Valid = B.buildAnyExt(S32, Src0Cast).getReg(0);
+
+    if (Src2.isValid()) {
+      Register Src2Cast =
+          MRI.getType(Src2).isScalar()
+              ? Src2
+              : B.buildBitcast(LLT::scalar(Size), Src2).getReg(0);
+      Src2Valid = B.buildAnyExt(LLT::scalar(32), Src2Cast).getReg(0);
+    }
+    Register LaneOp = createLaneOp(Src0Valid, Src1, Src2Valid);
+    if (Ty.isScalar())
+      B.buildTrunc(DstReg, LaneOp);
+    else {
+      auto Trunc = B.buildTrunc(LLT::scalar(Size), LaneOp);
+      B.buildBitcast(DstReg, Trunc);
+    }
+
+    MI.eraseFromParent();
+    return true;
+  }
+
+  if ((Size % 32) == 0) {
+    SmallVector<Register, 2> PartialRes;
+    unsigned NumParts = Size / 32;
+    auto Src0Parts = B.buildUnmerge(S32, Src0);
+
+    switch (IID) {
+    case Intrinsic::amdgcn_readlane: {
+      Register Src1 = MI.getOperand(3).getReg();
+      for (unsigned i = 0; i < NumParts; ++i)
+        PartialRes.push_back(
+            (B.buildIntrinsic(Intrinsic::amdgcn_readlane, {S32})
+                 .addUse(Src0Parts.getReg(i))
+                 .addUse(Src1))
+                .getReg(0));
----------------
arsenm wrote:

We should really add a `buildIntrinsic` overload that just takes the array of
inputs, as the builders for other instructions already do.

https://github.com/llvm/llvm-project/pull/89217
_______________________________________________
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits

Reply via email to