Hi,

This patch implements the ACLE intrinsics that use the newly introduced poly128_t
type. Because of current Clang and LLVM limitations (much of the infrastructure
for i128 does not exist yet), the LLVM intrinsics are defined on v16i8 instead of
i128. This is a workaround, but within the current framework it is the simplest
way to get poly128_t supported. Please review. Thanks.
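
For reviewers' context, the C-level usage this is ultimately aiming at looks
roughly like the sketch below (assuming the arm_neon.h signatures from the ACLE
spec; vmull_p64 additionally requires the crypto extension). At the IR level,
each poly128_t value is carried as <16 x i8> and bitcast to/from i128, which is
exactly what the new tests exercise.

    #include <arm_neon.h>

    /* Round-trip a poly128_t through memory. vldrq_p128/vstrq_p128 are
       expected to lower to plain q-register ldr/str. */
    void copy_p128(poly128_t *dst, poly128_t const *src) {
      poly128_t v = vldrq_p128(src);
      vstrq_p128(dst, v);
    }

    /* 64x64 -> 128-bit polynomial (carry-less) multiply, expected to
       lower to pmull via int_aarch64_neon_vmull_p64. */
    poly128_t mul_p64(poly64_t a, poly64_t b) {
      return vmull_p64(a, b);
    }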

http://llvm-reviews.chandlerc.com/D2344

Files:
  include/llvm/IR/IntrinsicsAArch64.td
  lib/Target/AArch64/AArch64ISelDAGToDAG.cpp
  lib/Target/AArch64/AArch64ISelLowering.cpp
  lib/Target/AArch64/AArch64InstrNEON.td
  test/CodeGen/AArch64/128bit_load_store.ll
  test/CodeGen/AArch64/neon-3vdiff.ll
Index: include/llvm/IR/IntrinsicsAArch64.td
===================================================================
--- include/llvm/IR/IntrinsicsAArch64.td
+++ include/llvm/IR/IntrinsicsAArch64.td
@@ -103,6 +103,14 @@
 def int_aarch64_neon_vsqrshrn : Neon_N2V_Narrow_Intrinsic;
 def int_aarch64_neon_vuqrshrn : Neon_N2V_Narrow_Intrinsic;
 
+// 128-bit load/store
+def int_aarch64_vldrq : Intrinsic<[llvm_anyvector_ty],
+                                  [llvm_ptr_ty, llvm_i32_ty],
+                                  [IntrReadArgMem]>;
+def int_aarch64_vstrq : Intrinsic<[],
+                                  [llvm_ptr_ty, llvm_anyvector_ty,
+                                  llvm_i32_ty], [IntrReadWriteArgMem]>;
+
 // Vector across
 class Neon_Across_Intrinsic
   : Intrinsic<[llvm_anyvector_ty], [llvm_anyvector_ty], [IntrNoMem]>;
@@ -325,6 +333,9 @@
 // Signed Saturating Doubling Multiply-Subtract Long
 def int_aarch64_neon_vqdmlsl : Neon_3Arg_Long_Intrinsic;
 
+def int_aarch64_neon_vmull_p64 :
+  Intrinsic<[llvm_v16i8_ty], [llvm_v1i64_ty, llvm_v1i64_ty], [IntrNoMem]>;
+
 class Neon_2Arg_ShiftImm_Intrinsic
   : Intrinsic<[llvm_v1i64_ty], [llvm_v1i64_ty, llvm_i32_ty], [IntrNoMem]>;
 
Index: lib/Target/AArch64/AArch64ISelDAGToDAG.cpp
===================================================================
--- lib/Target/AArch64/AArch64ISelDAGToDAG.cpp
+++ lib/Target/AArch64/AArch64ISelDAGToDAG.cpp
@@ -1559,6 +1559,58 @@
       };
       return SelectVLDSTLane(Node, false, false, 4, Opcodes);
     }
+    case Intrinsic::aarch64_vldrq: {
+      SDLoc dl(Node);
+      SmallVector<SDValue, 2> Ops;
+
+      // Push back the Memory Address.
+      Ops.push_back(Node->getOperand(2));
+      // Push back the offset 0.
+      Ops.push_back(CurDAG->getConstant(0, MVT::i32, false));
+      // Push back the Chain
+      Ops.push_back(Node->getOperand(0));
+
+      SmallVector<EVT, 3> ResTys;
+      // Push back the type of return super register
+      ResTys.push_back(Node->getValueType(0));
+      ResTys.push_back(MVT::Other); // Type of the Chain
+      SDNode *VLd = CurDAG->getMachineNode(AArch64::LSFP128_LDR,
+                                           dl, ResTys, Ops);
+
+      // Transfer memoperands.
+      MachineSDNode::mmo_iterator MemOp = MF->allocateMemRefsArray(1);
+      MemOp[0] = cast<MemIntrinsicSDNode>(Node)->getMemOperand();
+      cast<MachineSDNode>(VLd)->setMemRefs(MemOp, MemOp + 1);
+      return VLd;
+    }
+    case Intrinsic::aarch64_vstrq: {
+      SDLoc dl(Node);
+      SmallVector<SDValue, 6> Ops;
+
+      unsigned Vec0Idx = 3;
+      SmallVector<SDValue, 4> Regs(Node->op_begin() + Vec0Idx,
+                                   Node->op_begin() + Vec0Idx + 1);
+      // Push back the source register
+      Ops.push_back(createQTuple(Regs));
+      // Push back the Memory Address.
+      Ops.push_back(Node->getOperand(2));
+      // Push back the offset 0.
+      Ops.push_back(CurDAG->getConstant(0, MVT::i32, false));
+      // Push back the Chain
+      Ops.push_back(Node->getOperand(0));
+
+      SmallVector<EVT, 2> ResTys;
+      ResTys.push_back(MVT::Other); // Type for the Chain
+      SDNode *VSt = CurDAG->getMachineNode(AArch64::LSFP128_STR,
+                                           dl, ResTys, Ops);
+
+      // Transfer memoperands.
+      MachineSDNode::mmo_iterator MemOp = MF->allocateMemRefsArray(1);
+      MemOp[0] = cast<MemIntrinsicSDNode>(Node)->getMemOperand();
+      cast<MachineSDNode>(VSt)->setMemRefs(MemOp, MemOp + 1);
+
+      return VSt;
+    }
     } // End of switch IntNo
     break;
   } // End of case ISD::INTRINSIC_VOID and :ISD::INTRINSIC_W_CHAIN
Index: lib/Target/AArch64/AArch64ISelLowering.cpp
===================================================================
--- lib/Target/AArch64/AArch64ISelLowering.cpp
+++ lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -4543,6 +4543,7 @@
   case Intrinsic::arm_neon_vld2:
   case Intrinsic::arm_neon_vld3:
   case Intrinsic::arm_neon_vld4:
+  case Intrinsic::aarch64_vldrq:
   case Intrinsic::aarch64_neon_vld1x2:
   case Intrinsic::aarch64_neon_vld1x3:
   case Intrinsic::aarch64_neon_vld1x4:
@@ -4566,6 +4567,7 @@
   case Intrinsic::arm_neon_vst2:
   case Intrinsic::arm_neon_vst3:
   case Intrinsic::arm_neon_vst4:
+  case Intrinsic::aarch64_vstrq:
   case Intrinsic::aarch64_neon_vst1x2:
   case Intrinsic::aarch64_neon_vst1x3:
   case Intrinsic::aarch64_neon_vst1x4:
Index: lib/Target/AArch64/AArch64InstrNEON.td
===================================================================
--- lib/Target/AArch64/AArch64InstrNEON.td
+++ lib/Target/AArch64/AArch64InstrNEON.td
@@ -3021,19 +3021,19 @@
                                              int_arm_neon_vqsubs>;
 
 multiclass NeonI_3VDL_v3<bit u, bits<4> opcode, string asmop,
-                         SDPatternOperator opnode, bit Commutable = 0> {
+                         SDPatternOperator opnode_8h8b,
+                         SDPatternOperator opnode_1q1d, bit Commutable = 0> {
   let isCommutable = Commutable in {
     def _8h8b : NeonI_3VD_2Op<0b0, u, 0b00, opcode, asmop, "8h", "8b",
-                              opnode, VPR128, VPR64, v8i16, v8i8>;
+                              opnode_8h8b, VPR128, VPR64, v8i16, v8i8>;
 
-    def _1q1d : NeonI_3VDiff<0b0, u, 0b11, opcode,
-                             (outs VPR128:$Rd), (ins VPR64:$Rn, VPR64:$Rm),
-                             asmop # "\t$Rd.1q, $Rn.1d, $Rm.1d",
-                             [], NoItinerary>;
+    def _1q1d : NeonI_3VD_2Op<0b0, u, 0b11, opcode, asmop, "1q", "1d",
+                              opnode_1q1d, VPR128, VPR64, v16i8, v1i64>;
   }
 }
 
-defm PMULLvvv : NeonI_3VDL_v3<0b0, 0b1110, "pmull", int_arm_neon_vmullp, 1>;
+defm PMULLvvv : NeonI_3VDL_v3<0b0, 0b1110, "pmull", int_arm_neon_vmullp,
+                              int_aarch64_neon_vmull_p64, 1>;
 
 multiclass NeonI_3VDL2_2Op_mull_v3<bit u, bits<4> opcode, string asmop,
                                    string opnode, bit Commutable = 0> {
@@ -3042,10 +3042,17 @@
                                       !cast<PatFrag>(opnode # "_16B"),
                                       v8i16, v16i8>;
 
-    def _1q2d : NeonI_3VDiff<0b1, u, 0b11, opcode,
-                             (outs VPR128:$Rd), (ins VPR128:$Rn, VPR128:$Rm),
-                             asmop # "\t$Rd.1q, $Rn.2d, $Rm.2d",
-                             [], NoItinerary>;
+    def _1q2d : 
+      NeonI_3VDiff<0b1, u, 0b11, opcode,
+                   (outs VPR128:$Rd), (ins VPR128:$Rn, VPR128:$Rm),
+                   asmop # "\t$Rd.1q, $Rn.2d, $Rm.2d",
+                   [(set (v16i8 VPR128:$Rd),
+                      (v16i8 (int_aarch64_neon_vmull_p64 
+                        (v1i64 (scalar_to_vector
+                          (i64 (vector_extract (v2i64 VPR128:$Rn), 1)))),
+                        (v1i64 (scalar_to_vector
+                          (i64 (vector_extract (v2i64 VPR128:$Rm), 1)))))))],
+                   NoItinerary>;
   }
 }
 
Index: test/CodeGen/AArch64/128bit_load_store.ll
===================================================================
--- /dev/null
+++ test/CodeGen/AArch64/128bit_load_store.ll
@@ -0,0 +1,25 @@
+; RUN: llc < %s -verify-machineinstrs -mtriple=aarch64-none-linux-gnu -mattr=neon | FileCheck %s
+
+define void @test_vstrq_p128(i128* %ptr, i128 %val) #0 {
+; CHECK: test_vstrq_p128
+; CHECK: str {{q[0-9]+}}, [{{x[0-9]+}}]
+entry:
+  %0 = bitcast i128* %ptr to i8*
+  %1 = bitcast i128 %val to <16 x i8>
+  tail call void @llvm.aarch64.vstrq.v16i8(i8* %0, <16 x i8> %1, i32 16)
+  ret void
+}
+
+declare void @llvm.aarch64.vstrq.v16i8(i8*, <16 x i8>, i32) #1
+
+define i128 @test_vldrq_p128(i128* readonly %ptr) #2 {
+; CHECK: test_vldrq_p128
+; CHECK: ldr {{q[0-9]+}}, [{{x[0-9]+}}]
+entry:
+  %0 = bitcast i128* %ptr to i8*
+  %vldrq = tail call <16 x i8> @llvm.aarch64.vldrq.v16i8(i8* %0, i32 16)
+  %conv = bitcast <16 x i8> %vldrq to i128
+  ret i128 %conv
+}
+
+declare <16 x i8> @llvm.aarch64.vldrq.v16i8(i8*, i32) #3
Index: test/CodeGen/AArch64/neon-3vdiff.ll
===================================================================
--- test/CodeGen/AArch64/neon-3vdiff.ll
+++ test/CodeGen/AArch64/neon-3vdiff.ll
@@ -1804,3 +1804,30 @@
   ret <8 x i16> %vmull.i.i
 }
 
+define i128 @test_vmull_p64(i64 %a, i64 %b) #4 {
+; CHECK: test_vmull_p64
+; CHECK: pmull {{v[0-9]+}}.1q, {{v[0-9]+}}.1d, {{v[0-9]+}}.1d
+entry:
+  %vmull.i = insertelement <1 x i64> undef, i64 %a, i32 0
+  %vmull1.i = insertelement <1 x i64> undef, i64 %b, i32 0
+  %vmull2.i = tail call <16 x i8> @llvm.aarch64.neon.vmull.p64(<1 x i64> %vmull.i, <1 x i64> %vmull1.i) #1
+  %vmull3.i = bitcast <16 x i8> %vmull2.i to i128
+  ret i128 %vmull3.i
+}
+
+define i128 @test_vmull_high_p64(<2 x i64> %a, <2 x i64> %b) #4 {
+; CHECK: test_vmull_high_p64
+; CHECK: pmull2 {{v[0-9]+}}.1q, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d
+entry:
+  %0 = extractelement <2 x i64> %a, i32 1
+  %1 = extractelement <2 x i64> %b, i32 1
+  %vmull.i.i = insertelement <1 x i64> undef, i64 %0, i32 0
+  %vmull1.i.i = insertelement <1 x i64> undef, i64 %1, i32 0
+  %vmull2.i.i = tail call <16 x i8> @llvm.aarch64.neon.vmull.p64(<1 x i64> %vmull.i.i, <1 x i64> %vmull1.i.i) #1
+  %vmull3.i.i = bitcast <16 x i8> %vmull2.i.i to i128
+  ret i128 %vmull3.i.i
+}
+
+declare <16 x i8> @llvm.aarch64.neon.vmull.p64(<1 x i64>, <1 x i64>) #5
+
+