https://github.com/arsenm created 
https://github.com/llvm/llvm-project/pull/166982

sincospi/sincospif/sincospil does not appear to exist on common
targets. Darwin targets have __sincospi and __sincospif, so define
and use those implementations. I have no idea what version added
those calls, so I'm just guessing it's the same conditions as
__sincos_stret.

Most of this patch is working to preserve codegen when a vector
library is explicitly enabled. This only covers sleef and armpl,
as those are the only cases tested.

The multiple result libcalls have an aberrant process where the
legalizer looks for the scalar type's libcall in RuntimeLibcalls,
and then cross references TargetLibraryInfo to find a matching
vector call. This was unworkable in the sincospi case, since in the
common case there is no scalar call available. To preserve
codegen if the call is available, first try to match a libcall
with the vector type before falling back on the old scalar search.

Eventually all of this logic should be contained in RuntimeLibcalls,
without the link to TargetLibraryInfo. In principle we should perform
the same legalization logic as for an ordinary operation, trying
to find a matching subvector type with a libcall.

>From ca4a71a0341594aef41c88b533966bc16158e9c1 Mon Sep 17 00:00:00 2001
From: Matt Arsenault <[email protected]>
Date: Thu, 6 Nov 2025 11:34:51 -0800
Subject: [PATCH] RuntimeLibcalls: Remove incorrect sincospi from most targets

sincospi/sincospif/sincospil does not appear to exist on common
targets. Darwin targets have __sincospi and __sincospif, so define
and use those implementations. I have no idea what version added
those calls, so I'm just guessing it's the same conditions as
__sincos_stret.

Most of this patch is working to preserve codegen when a vector
library is explicitly enabled. This only covers sleef and armpl,
as those are the only cases tested.

The multiple result libcalls have an aberrant process where the
legalizer looks for the scalar type's libcall in RuntimeLibcalls,
and then cross references TargetLibraryInfo to find a matching
vector call. This was unworkable in the sincospi case, since in the
common case there is no scalar call available. To preserve
codegen if the call is available, first try to match a libcall
with the vector type before falling back on the old scalar search.

Eventually all of this logic should be contained in RuntimeLibcalls,
without the link to TargetLibraryInfo. In principle we should perform
the same legalization logic as for an ordinary operation, trying
to find a matching subvector type with a libcall.
---
 llvm/include/llvm/CodeGen/BasicTTIImpl.h      |  30 +-
 llvm/include/llvm/CodeGen/SelectionDAG.h      |  10 +-
 llvm/include/llvm/IR/RuntimeLibcalls.td       |  17 +-
 llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp |  14 +-
 .../SelectionDAG/LegalizeFloatTypes.cpp       |   3 +-
 .../SelectionDAG/LegalizeVectorOps.cpp        |  26 +-
 .../lib/CodeGen/SelectionDAG/SelectionDAG.cpp |  53 ++-
 llvm/lib/Target/X86/X86ISelLowering.cpp       |   1 +
 .../CodeGen/AArch64/llvm.sincospi.error.ll    |  13 +
 llvm/test/CodeGen/AArch64/llvm.sincospi.ll    | 308 +++++++++---------
 llvm/test/CodeGen/ARM/llvm.sincospi.ll        | 249 ++++++++++++++
 llvm/test/CodeGen/PowerPC/llvm.sincos.ll      |  72 ----
 llvm/test/CodeGen/PowerPC/llvm.sincospi.ll    |  21 ++
 .../CodeGen/PowerPC/llvm.sincospi.ppcfp128.ll |  25 ++
 llvm/test/CodeGen/X86/llvm.sincospi.ll        | 233 +++++++++++++
 15 files changed, 801 insertions(+), 274 deletions(-)
 create mode 100644 llvm/test/CodeGen/AArch64/llvm.sincospi.error.ll
 create mode 100644 llvm/test/CodeGen/ARM/llvm.sincospi.ll
 create mode 100644 llvm/test/CodeGen/PowerPC/llvm.sincospi.ll
 create mode 100644 llvm/test/CodeGen/PowerPC/llvm.sincospi.ppcfp128.ll
 create mode 100644 llvm/test/CodeGen/X86/llvm.sincospi.ll

diff --git a/llvm/include/llvm/CodeGen/BasicTTIImpl.h 
b/llvm/include/llvm/CodeGen/BasicTTIImpl.h
index f58525754d7a5..1c167af4b0478 100644
--- a/llvm/include/llvm/CodeGen/BasicTTIImpl.h
+++ b/llvm/include/llvm/CodeGen/BasicTTIImpl.h
@@ -316,12 +316,22 @@ class BasicTTIImplBase : public 
TargetTransformInfoImplCRTPBase<T> {
     EVT ScalarVT = VT.getScalarType();
     RTLIB::Libcall LC = RTLIB::UNKNOWN_LIBCALL;
 
+    /// Migration flag. IsVectorCall cases directly know about the vector
+    /// libcall in RuntimeLibcallsInfo and shouldn't try to use
+    /// LibInfo->getVectorMappingInfo.
+    bool IsVectorCall = false;
+
     switch (ICA.getID()) {
     case Intrinsic::modf:
       LC = RTLIB::getMODF(ScalarVT);
       break;
     case Intrinsic::sincospi:
-      LC = RTLIB::getSINCOSPI(ScalarVT);
+      LC = RTLIB::getSINCOSPI(VT);
+      if (LC == RTLIB::UNKNOWN_LIBCALL)
+        LC = RTLIB::getSINCOSPI(ScalarVT);
+      else if (VT.isVector())
+        IsVectorCall = true;
+
       break;
     case Intrinsic::sincos:
       LC = RTLIB::getSINCOS(ScalarVT);
@@ -345,17 +355,23 @@ class BasicTTIImplBase : public 
TargetTransformInfoImplCRTPBase<T> {
     LLVMContext &Ctx = RetTy->getContext();
     ElementCount VF = getVectorizedTypeVF(RetTy);
     VecDesc const *VD = nullptr;
-    for (bool Masked : {false, true}) {
-      if ((VD = LibInfo->getVectorMappingInfo(LCName, VF, Masked)))
-        break;
+
+    if (!IsVectorCall) {
+      for (bool Masked : {false, true}) {
+        if ((VD = LibInfo->getVectorMappingInfo(LCName, VF, Masked)))
+          break;
+      }
+      if (!VD)
+        return std::nullopt;
     }
-    if (!VD)
-      return std::nullopt;
 
     // Cost the call + mask.
     auto Cost =
         thisT()->getCallInstrCost(nullptr, RetTy, ICA.getArgTypes(), CostKind);
-    if (VD->isMasked()) {
+
+    if ((VD && VD->isMasked()) ||
+        (IsVectorCall &&
+         RTLIB::RuntimeLibcallsInfo::hasVectorMaskArgument(LibcallImpl))) {
       auto VecTy = VectorType::get(IntegerType::getInt1Ty(Ctx), VF);
       Cost += thisT()->getShuffleCost(TargetTransformInfo::SK_Broadcast, VecTy,
                                       VecTy, {}, CostKind, 0, nullptr, {});
diff --git a/llvm/include/llvm/CodeGen/SelectionDAG.h 
b/llvm/include/llvm/CodeGen/SelectionDAG.h
index 0dd4f23c6d85f..c8db97cc42a30 100644
--- a/llvm/include/llvm/CodeGen/SelectionDAG.h
+++ b/llvm/include/llvm/CodeGen/SelectionDAG.h
@@ -1725,9 +1725,17 @@ class SelectionDAG {
   /// value.
   LLVM_ABI bool
   expandMultipleResultFPLibCall(RTLIB::Libcall LC, SDNode *Node,
-                                SmallVectorImpl<SDValue> &Results,
+                                SmallVectorImpl<SDValue> &Results, EVT 
CallType,
                                 std::optional<unsigned> CallRetResNo = {});
 
+  // FIXME: This should be removed, and the form using RTLIB::Libcall should
+  // be preferred. Callers should resolve the exact type libcall to use.
+  LLVM_ABI bool
+  expandMultipleResultFPLibCall(StringRef LibcallName, CallingConv::ID CC,
+                                SDNode *Node, SmallVectorImpl<SDValue> 
&Results,
+                                std::optional<unsigned> CallRetResNo = {},
+                                bool IsVectorMasked = false);
+
   /// Expand the specified \c ISD::VAARG node as the Legalize pass would.
   LLVM_ABI SDValue expandVAArg(SDNode *Node);
 
diff --git a/llvm/include/llvm/IR/RuntimeLibcalls.td 
b/llvm/include/llvm/IR/RuntimeLibcalls.td
index 1a752aed31992..a0b52395498c5 100644
--- a/llvm/include/llvm/IR/RuntimeLibcalls.td
+++ b/llvm/include/llvm/IR/RuntimeLibcalls.td
@@ -975,10 +975,6 @@ def frexpf : RuntimeLibcallImpl<FREXP_F32>;
 def frexp : RuntimeLibcallImpl<FREXP_F64>;
 defm frexpl : LibmLongDoubleLibCall;
 
-def sincospif : RuntimeLibcallImpl<SINCOSPI_F32>;
-def sincospi : RuntimeLibcallImpl<SINCOSPI_F64>;
-defm sincospil : LibmLongDoubleLibCall;
-
 def modff : RuntimeLibcallImpl<MODF_F32>;
 def modf : RuntimeLibcallImpl<MODF_F64>;
 defm modfl : LibmLongDoubleLibCall;
@@ -1055,6 +1051,15 @@ def sincosf : RuntimeLibcallImpl<SINCOS_F32>;
 def sincos : RuntimeLibcallImpl<SINCOS_F64>;
 defm sincosl : LibmLongDoubleLibCall;
 
+// Exists in the Sun math library
+def sincospif : RuntimeLibcallImpl<SINCOSPI_F32>;
+def sincospi : RuntimeLibcallImpl<SINCOSPI_F64>;
+defm sincospil : LibmLongDoubleLibCall;
+
+// Exists on macOS
+def __sincospif : RuntimeLibcallImpl<SINCOSPI_F32>;
+def __sincospi : RuntimeLibcallImpl<SINCOSPI_F64>;
+
 def bzero : RuntimeLibcallImpl<BZERO>;
 def __bzero : RuntimeLibcallImpl<BZERO>;
 
@@ -1232,7 +1237,9 @@ defvar DefaultLibcallImpls32 = (add 
DefaultRuntimeLibcallImpls);
 defvar DefaultLibcallImpls64 = (add DefaultRuntimeLibcallImpls,
                                     Int128RTLibcalls);
 
-defvar DarwinSinCosStret = LibcallImpls<(add __sincosf_stret, __sincos_stret),
+// TODO: Guessing sincospi was added at the same time as sincos_stret
+defvar DarwinSinCosStret = LibcallImpls<(add __sincosf_stret, __sincos_stret,
+                                             __sincospif, __sincospi),
                                         darwinHasSinCosStret>;
 defvar DarwinExp10 = LibcallImpls<(add __exp10f, __exp10), darwinHasExp10>;
 
diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp 
b/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
index 316aacdf6978e..a0baf821698a8 100644
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
@@ -4842,9 +4842,15 @@ void SelectionDAGLegalize::ConvertNodeToLibcall(SDNode 
*Node) {
     RTLIB::Libcall LC = Node->getOpcode() == ISD::FSINCOS
                             ? RTLIB::getSINCOS(VT)
                             : RTLIB::getSINCOSPI(VT);
-    bool Expanded = DAG.expandMultipleResultFPLibCall(LC, Node, Results);
-    if (!Expanded)
-      llvm_unreachable("Expected scalar FSINCOS[PI] to expand to libcall!");
+    bool Expanded = DAG.expandMultipleResultFPLibCall(LC, Node, Results, VT);
+    if (!Expanded) {
+      DAG.getContext()->emitError(Twine("no libcall available for ") +
+                                  Node->getOperationName(&DAG));
+      SDValue Poison = DAG.getPOISON(VT);
+      Results.push_back(Poison);
+      Results.push_back(Poison);
+    }
+
     break;
   }
   case ISD::FLOG:
@@ -4934,7 +4940,7 @@ void SelectionDAGLegalize::ConvertNodeToLibcall(SDNode 
*Node) {
     EVT VT = Node->getValueType(0);
     RTLIB::Libcall LC = Node->getOpcode() == ISD::FMODF ? RTLIB::getMODF(VT)
                                                         : RTLIB::getFREXP(VT);
-    bool Expanded = DAG.expandMultipleResultFPLibCall(LC, Node, Results,
+    bool Expanded = DAG.expandMultipleResultFPLibCall(LC, Node, Results, VT,
                                                       /*CallRetResNo=*/0);
     if (!Expanded)
       llvm_unreachable("Expected scalar FFREXP/FMODF to expand to libcall!");
diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeFloatTypes.cpp 
b/llvm/lib/CodeGen/SelectionDAG/LegalizeFloatTypes.cpp
index 58983cb57d7f6..29c4dac12a81a 100644
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeFloatTypes.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeFloatTypes.cpp
@@ -1726,7 +1726,8 @@ void 
DAGTypeLegalizer::ExpandFloatRes_UnaryWithTwoFPResults(
     SDNode *N, RTLIB::Libcall LC, std::optional<unsigned> CallRetResNo) {
   assert(!N->isStrictFPOpcode() && "strictfp not implemented");
   SmallVector<SDValue> Results;
-  DAG.expandMultipleResultFPLibCall(LC, N, Results, CallRetResNo);
+  DAG.expandMultipleResultFPLibCall(LC, N, Results, N->getValueType(0),
+                                    CallRetResNo);
   for (auto [ResNo, Res] : enumerate(Results)) {
     SDValue Lo, Hi;
     GetPairElements(Res, Lo, Hi);
diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp 
b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp
index 94751be5b7986..f5a54497c8a98 100644
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp
@@ -1268,20 +1268,30 @@ void VectorLegalizer::Expand(SDNode *Node, 
SmallVectorImpl<SDValue> &Results) {
       return;
 
     break;
-  case ISD::FSINCOS:
+
   case ISD::FSINCOSPI: {
+    EVT VT = Node->getValueType(0);
+    RTLIB::Libcall LC = RTLIB::getSINCOSPI(VT);
+    if (LC != RTLIB::UNKNOWN_LIBCALL &&
+        DAG.expandMultipleResultFPLibCall(LC, Node, Results, VT))
+      return;
+
+    // TODO: Try to see if there's a narrower call available to use before
+    // scalarizing.
+    break;
+  }
+  case ISD::FSINCOS: {
+    // FIXME: Try to directly match vector case like fsincospi
     EVT VT = Node->getValueType(0).getVectorElementType();
-    RTLIB::Libcall LC = Node->getOpcode() == ISD::FSINCOS
-                            ? RTLIB::getSINCOS(VT)
-                            : RTLIB::getSINCOSPI(VT);
-    if (DAG.expandMultipleResultFPLibCall(LC, Node, Results))
+    RTLIB::Libcall LC = RTLIB::getSINCOS(VT);
+    if (DAG.expandMultipleResultFPLibCall(LC, Node, Results, VT))
       return;
     break;
   }
   case ISD::FMODF: {
-    RTLIB::Libcall LC =
-        RTLIB::getMODF(Node->getValueType(0).getVectorElementType());
-    if (DAG.expandMultipleResultFPLibCall(LC, Node, Results,
+    EVT VT = Node->getValueType(0).getVectorElementType();
+    RTLIB::Libcall LC = RTLIB::getMODF(VT);
+    if (DAG.expandMultipleResultFPLibCall(LC, Node, Results, VT,
                                           /*CallRetResNo=*/0))
       return;
     break;
diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp 
b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
index bbc1d734cfef5..a69216e1a0e7f 100644
--- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
@@ -2515,18 +2515,20 @@ static bool 
canFoldStoreIntoLibCallOutputPointers(StoreSDNode *StoreNode,
 
 bool SelectionDAG::expandMultipleResultFPLibCall(
     RTLIB::Libcall LC, SDNode *Node, SmallVectorImpl<SDValue> &Results,
-    std::optional<unsigned> CallRetResNo) {
-  LLVMContext &Ctx = *getContext();
-  EVT VT = Node->getValueType(0);
-  unsigned NumResults = Node->getNumValues();
-
+    EVT CallVT, std::optional<unsigned> CallRetResNo) {
   if (LC == RTLIB::UNKNOWN_LIBCALL)
     return false;
 
-  const char *LCName = TLI->getLibcallName(LC);
-  if (!LCName)
+  EVT VT = Node->getValueType(0);
+
+  RTLIB::LibcallImpl Impl = TLI->getLibcallImpl(LC);
+  if (Impl == RTLIB::Unsupported)
     return false;
 
+  StringRef LCName = TLI->getLibcallImplName(Impl);
+
+  // FIXME: This should not use TargetLibraryInfo. There should be
+  // RTLIB::Libcall entries for each used vector type, and directly matched.
   auto getVecDesc = [&]() -> VecDesc const * {
     for (bool Masked : {false, true}) {
       if (VecDesc const *VD = getLibInfo().getVectorMappingInfo(
@@ -2539,9 +2541,34 @@ bool SelectionDAG::expandMultipleResultFPLibCall(
 
   // For vector types, we must find a vector mapping for the libcall.
   VecDesc const *VD = nullptr;
-  if (VT.isVector() && !(VD = getVecDesc()))
+  if (VT.isVector() && !CallVT.isVector() && !(VD = getVecDesc()))
     return false;
 
+  bool IsMasked = (VD && VD->isMasked()) ||
+                  RTLIB::RuntimeLibcallsInfo::hasVectorMaskArgument(Impl);
+
+  // This wrapper function exists because getVectorMappingInfo works in terms 
of
+  // function names instead of RTLIB enums.
+
+  // FIXME: If we used a vector mapping, this assumes the calling convention of
+  // the vector function is the same as the scalar.
+
+  StringRef Name = VD ? VD->getVectorFnName() : LCName;
+
+  return expandMultipleResultFPLibCall(Name,
+                                       TLI->getLibcallImplCallingConv(Impl),
+                                       Node, Results, CallRetResNo, IsMasked);
+}
+
+// FIXME: This belongs in TargetLowering
+bool SelectionDAG::expandMultipleResultFPLibCall(
+    StringRef Name, CallingConv::ID CC, SDNode *Node,
+    SmallVectorImpl<SDValue> &Results, std::optional<unsigned> CallRetResNo,
+    bool IsMasked) {
+  LLVMContext &Ctx = *getContext();
+  EVT VT = Node->getValueType(0);
+  unsigned NumResults = Node->getNumValues();
+
   // Find users of the node that store the results (and share input chains). 
The
   // destination pointers can be used instead of creating stack allocations.
   SDValue StoresInChain;
@@ -2599,7 +2626,7 @@ bool SelectionDAG::expandMultipleResultFPLibCall(
   SDLoc DL(Node);
 
   // Pass the vector mask (if required).
-  if (VD && VD->isMasked()) {
+  if (IsMasked) {
     EVT MaskVT = TLI->getSetCCResultType(getDataLayout(), Ctx, VT);
     SDValue Mask = getBoolConstant(true, DL, MaskVT, VT);
     Args.emplace_back(Mask, MaskVT.getTypeForEVT(Ctx));
@@ -2609,11 +2636,11 @@ bool SelectionDAG::expandMultipleResultFPLibCall(
                       ? Node->getValueType(*CallRetResNo).getTypeForEVT(Ctx)
                       : Type::getVoidTy(Ctx);
   SDValue InChain = StoresInChain ? StoresInChain : getEntryNode();
-  SDValue Callee = getExternalSymbol(VD ? VD->getVectorFnName().data() : 
LCName,
-                                     TLI->getPointerTy(getDataLayout()));
+  SDValue Callee =
+      getExternalSymbol(Name.data(), TLI->getPointerTy(getDataLayout()));
   TargetLowering::CallLoweringInfo CLI(*this);
-  CLI.setDebugLoc(DL).setChain(InChain).setLibCallee(
-      TLI->getLibcallCallingConv(LC), RetType, Callee, std::move(Args));
+  CLI.setDebugLoc(DL).setChain(InChain).setLibCallee(CC, RetType, Callee,
+                                                     std::move(Args));
 
   auto [Call, CallChain] = TLI->LowerCallTo(CLI);
 
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp 
b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 05a854a0bf3fa..5bce539c45341 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -635,6 +635,7 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine 
&TM,
     setOperationAction(ISD::FROUNDEVEN, VT, Action);
     setOperationAction(ISD::FTRUNC, VT, Action);
     setOperationAction(ISD::FLDEXP, VT, Action);
+    setOperationAction(ISD::FSINCOSPI, VT, Action);
   };
 
   if (!Subtarget.useSoftFloat() && Subtarget.hasSSE2()) {
diff --git a/llvm/test/CodeGen/AArch64/llvm.sincospi.error.ll 
b/llvm/test/CodeGen/AArch64/llvm.sincospi.error.ll
new file mode 100644
index 0000000000000..d074d9ae24641
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/llvm.sincospi.error.ll
@@ -0,0 +1,13 @@
+; RUN: not llc -mtriple=aarch64-gnu-linux -filetype=null %s 2>&1 | FileCheck %s
+
+; CHECK: error: no libcall available for fsincospi
+define { float, float } @test_sincospi_f32(float %a) {
+  %result = call { float, float } @llvm.sincospi.f32(float %a)
+  ret { float, float } %result
+}
+
+; CHECK: error: no libcall available for fsincospi
+define { double, double } @test_sincospi_f64(double %a) {
+  %result = call { double, double } @llvm.sincospi.f64(double %a)
+  ret { double, double } %result
+}
diff --git a/llvm/test/CodeGen/AArch64/llvm.sincospi.ll 
b/llvm/test/CodeGen/AArch64/llvm.sincospi.ll
index d1d7d92adc05a..b386df077c09d 100644
--- a/llvm/test/CodeGen/AArch64/llvm.sincospi.ll
+++ b/llvm/test/CodeGen/AArch64/llvm.sincospi.ll
@@ -1,268 +1,250 @@
-; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py 
UTC_ARGS: --version 2
-; RUN: llc -mtriple=aarch64-gnu-linux < %s | FileCheck -check-prefixes=CHECK %s
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py 
UTC_ARGS: --version 6
+; RUN: llc -mtriple=arm64-apple-macosx10.9 < %s | FileCheck %s
 
-define { half, half } @test_sincospi_f16(half %a) {
+define { half, half } @test_sincospi_f16(half %a) #0 {
 ; CHECK-LABEL: test_sincospi_f16:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    str x30, [sp, #-16]! // 8-byte Folded Spill
-; CHECK-NEXT:    .cfi_def_cfa_offset 16
-; CHECK-NEXT:    .cfi_offset w30, -16
+; CHECK:       ; %bb.0:
+; CHECK-NEXT:    sub sp, sp, #32
 ; CHECK-NEXT:    fcvt s0, h0
 ; CHECK-NEXT:    add x0, sp, #12
 ; CHECK-NEXT:    add x1, sp, #8
-; CHECK-NEXT:    bl sincospif
+; CHECK-NEXT:    stp x29, x30, [sp, #16] ; 16-byte Folded Spill
+; CHECK-NEXT:    bl ___sincospif
 ; CHECK-NEXT:    ldp s1, s0, [sp, #8]
+; CHECK-NEXT:    ldp x29, x30, [sp, #16] ; 16-byte Folded Reload
 ; CHECK-NEXT:    fcvt h0, s0
 ; CHECK-NEXT:    fcvt h1, s1
-; CHECK-NEXT:    ldr x30, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT:    add sp, sp, #32
 ; CHECK-NEXT:    ret
   %result = call { half, half } @llvm.sincospi.f16(half %a)
   ret { half, half } %result
 }
 
-define half @test_sincospi_f16_only_use_sin(half %a) {
+define half @test_sincospi_f16_only_use_sin(half %a) #0 {
 ; CHECK-LABEL: test_sincospi_f16_only_use_sin:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    str x30, [sp, #-16]! // 8-byte Folded Spill
-; CHECK-NEXT:    .cfi_def_cfa_offset 16
-; CHECK-NEXT:    .cfi_offset w30, -16
+; CHECK:       ; %bb.0:
+; CHECK-NEXT:    sub sp, sp, #32
 ; CHECK-NEXT:    fcvt s0, h0
 ; CHECK-NEXT:    add x0, sp, #12
 ; CHECK-NEXT:    add x1, sp, #8
-; CHECK-NEXT:    bl sincospif
+; CHECK-NEXT:    stp x29, x30, [sp, #16] ; 16-byte Folded Spill
+; CHECK-NEXT:    bl ___sincospif
 ; CHECK-NEXT:    ldr s0, [sp, #12]
+; CHECK-NEXT:    ldp x29, x30, [sp, #16] ; 16-byte Folded Reload
 ; CHECK-NEXT:    fcvt h0, s0
-; CHECK-NEXT:    ldr x30, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT:    add sp, sp, #32
 ; CHECK-NEXT:    ret
   %result = call { half, half } @llvm.sincospi.f16(half %a)
   %result.0 = extractvalue { half, half } %result, 0
   ret half %result.0
 }
 
-define half @test_sincospi_f16_only_use_cos(half %a) {
+define half @test_sincospi_f16_only_use_cos(half %a) #0 {
 ; CHECK-LABEL: test_sincospi_f16_only_use_cos:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    str x30, [sp, #-16]! // 8-byte Folded Spill
-; CHECK-NEXT:    .cfi_def_cfa_offset 16
-; CHECK-NEXT:    .cfi_offset w30, -16
+; CHECK:       ; %bb.0:
+; CHECK-NEXT:    sub sp, sp, #32
 ; CHECK-NEXT:    fcvt s0, h0
 ; CHECK-NEXT:    add x0, sp, #12
 ; CHECK-NEXT:    add x1, sp, #8
-; CHECK-NEXT:    bl sincospif
+; CHECK-NEXT:    stp x29, x30, [sp, #16] ; 16-byte Folded Spill
+; CHECK-NEXT:    bl ___sincospif
 ; CHECK-NEXT:    ldr s0, [sp, #8]
+; CHECK-NEXT:    ldp x29, x30, [sp, #16] ; 16-byte Folded Reload
 ; CHECK-NEXT:    fcvt h0, s0
-; CHECK-NEXT:    ldr x30, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT:    add sp, sp, #32
 ; CHECK-NEXT:    ret
   %result = call { half, half } @llvm.sincospi.f16(half %a)
   %result.1 = extractvalue { half, half } %result, 1
   ret half %result.1
 }
 
-define { <2 x half>, <2 x half> } @test_sincospi_v2f16(<2 x half> %a) {
+define { <2 x half>, <2 x half> } @test_sincospi_v2f16(<2 x half> %a) #0 {
 ; CHECK-LABEL: test_sincospi_v2f16:
-; CHECK:       // %bb.0:
+; CHECK:       ; %bb.0:
 ; CHECK-NEXT:    sub sp, sp, #64
-; CHECK-NEXT:    str x30, [sp, #48] // 8-byte Folded Spill
-; CHECK-NEXT:    .cfi_def_cfa_offset 64
-; CHECK-NEXT:    .cfi_offset w30, -16
-; CHECK-NEXT:    // kill: def $d0 killed $d0 def $q0
-; CHECK-NEXT:    mov h1, v0.h[1]
-; CHECK-NEXT:    str q0, [sp] // 16-byte Folded Spill
-; CHECK-NEXT:    add x0, sp, #36
-; CHECK-NEXT:    add x1, sp, #32
-; CHECK-NEXT:    fcvt s0, h1
-; CHECK-NEXT:    bl sincospif
-; CHECK-NEXT:    ldr q0, [sp] // 16-byte Folded Reload
+; CHECK-NEXT:    ; kill: def $d0 killed $d0 def $q0
+; CHECK-NEXT:    mov h1, v0[1]
+; CHECK-NEXT:    str q0, [sp] ; 16-byte Folded Spill
 ; CHECK-NEXT:    add x0, sp, #28
 ; CHECK-NEXT:    add x1, sp, #24
+; CHECK-NEXT:    stp x29, x30, [sp, #48] ; 16-byte Folded Spill
+; CHECK-NEXT:    fcvt s0, h1
+; CHECK-NEXT:    bl ___sincospif
+; CHECK-NEXT:    ldr q0, [sp] ; 16-byte Folded Reload
+; CHECK-NEXT:    add x0, sp, #20
+; CHECK-NEXT:    add x1, sp, #16
 ; CHECK-NEXT:    fcvt s0, h0
-; CHECK-NEXT:    bl sincospif
-; CHECK-NEXT:    ldr q0, [sp] // 16-byte Folded Reload
+; CHECK-NEXT:    bl ___sincospif
+; CHECK-NEXT:    ldr q0, [sp] ; 16-byte Folded Reload
+; CHECK-NEXT:    add x0, sp, #36
+; CHECK-NEXT:    add x1, sp, #32
+; CHECK-NEXT:    mov h0, v0[2]
+; CHECK-NEXT:    fcvt s0, h0
+; CHECK-NEXT:    bl ___sincospif
+; CHECK-NEXT:    ldr q0, [sp] ; 16-byte Folded Reload
 ; CHECK-NEXT:    add x0, sp, #44
 ; CHECK-NEXT:    add x1, sp, #40
-; CHECK-NEXT:    mov h0, v0.h[2]
-; CHECK-NEXT:    fcvt s0, h0
-; CHECK-NEXT:    bl sincospif
-; CHECK-NEXT:    ldr q0, [sp] // 16-byte Folded Reload
-; CHECK-NEXT:    add x0, sp, #60
-; CHECK-NEXT:    add x1, sp, #56
-; CHECK-NEXT:    mov h0, v0.h[3]
+; CHECK-NEXT:    mov h0, v0[3]
 ; CHECK-NEXT:    fcvt s0, h0
-; CHECK-NEXT:    bl sincospif
-; CHECK-NEXT:    ldp s2, s0, [sp, #32]
-; CHECK-NEXT:    ldr x30, [sp, #48] // 8-byte Folded Reload
-; CHECK-NEXT:    ldp s3, s1, [sp, #24]
+; CHECK-NEXT:    bl ___sincospif
+; CHECK-NEXT:    ldp s2, s0, [sp, #24]
+; CHECK-NEXT:    ldp s3, s1, [sp, #16]
+; CHECK-NEXT:    ldp x29, x30, [sp, #48] ; 16-byte Folded Reload
 ; CHECK-NEXT:    fcvt h4, s0
 ; CHECK-NEXT:    fcvt h2, s2
 ; CHECK-NEXT:    fcvt h0, s1
 ; CHECK-NEXT:    fcvt h1, s3
-; CHECK-NEXT:    ldp s5, s3, [sp, #40]
+; CHECK-NEXT:    ldp s5, s3, [sp, #32]
 ; CHECK-NEXT:    fcvt h3, s3
-; CHECK-NEXT:    mov v0.h[1], v4.h[0]
+; CHECK-NEXT:    mov.h v0[1], v4[0]
 ; CHECK-NEXT:    fcvt h4, s5
-; CHECK-NEXT:    mov v1.h[1], v2.h[0]
-; CHECK-NEXT:    ldp s5, s2, [sp, #56]
-; CHECK-NEXT:    mov v0.h[2], v3.h[0]
+; CHECK-NEXT:    mov.h v1[1], v2[0]
+; CHECK-NEXT:    ldp s5, s2, [sp, #40]
+; CHECK-NEXT:    mov.h v0[2], v3[0]
 ; CHECK-NEXT:    fcvt h2, s2
 ; CHECK-NEXT:    fcvt h3, s5
-; CHECK-NEXT:    mov v1.h[2], v4.h[0]
-; CHECK-NEXT:    mov v0.h[3], v2.h[0]
-; CHECK-NEXT:    mov v1.h[3], v3.h[0]
-; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $q0
-; CHECK-NEXT:    // kill: def $d1 killed $d1 killed $q1
+; CHECK-NEXT:    mov.h v1[2], v4[0]
+; CHECK-NEXT:    mov.h v0[3], v2[0]
+; CHECK-NEXT:    mov.h v1[3], v3[0]
+; CHECK-NEXT:    ; kill: def $d0 killed $d0 killed $q0
+; CHECK-NEXT:    ; kill: def $d1 killed $d1 killed $q1
 ; CHECK-NEXT:    add sp, sp, #64
 ; CHECK-NEXT:    ret
   %result = call { <2 x half>, <2 x half> } @llvm.sincospi.v2f16(<2 x half> %a)
   ret { <2 x half>, <2 x half> } %result
 }
 
-define { float, float } @test_sincospi_f32(float %a) {
+define { float, float } @test_sincospi_f32(float %a) #0 {
 ; CHECK-LABEL: test_sincospi_f32:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    str x30, [sp, #-16]! // 8-byte Folded Spill
-; CHECK-NEXT:    .cfi_def_cfa_offset 16
-; CHECK-NEXT:    .cfi_offset w30, -16
+; CHECK:       ; %bb.0:
+; CHECK-NEXT:    sub sp, sp, #32
 ; CHECK-NEXT:    add x0, sp, #12
 ; CHECK-NEXT:    add x1, sp, #8
-; CHECK-NEXT:    bl sincospif
+; CHECK-NEXT:    stp x29, x30, [sp, #16] ; 16-byte Folded Spill
+; CHECK-NEXT:    bl ___sincospif
 ; CHECK-NEXT:    ldp s1, s0, [sp, #8]
-; CHECK-NEXT:    ldr x30, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT:    ldp x29, x30, [sp, #16] ; 16-byte Folded Reload
+; CHECK-NEXT:    add sp, sp, #32
 ; CHECK-NEXT:    ret
   %result = call { float, float } @llvm.sincospi.f32(float %a)
   ret { float, float } %result
 }
 
-define { <3 x float>, <3 x float> } @test_sincospi_v3f32(<3 x float> %a) {
+define { <3 x float>, <3 x float> } @test_sincospi_v3f32(<3 x float> %a) #0 {
 ; CHECK-LABEL: test_sincospi_v3f32:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    sub sp, sp, #80
-; CHECK-NEXT:    str x30, [sp, #32] // 8-byte Folded Spill
-; CHECK-NEXT:    stp x22, x21, [sp, #48] // 16-byte Folded Spill
-; CHECK-NEXT:    stp x20, x19, [sp, #64] // 16-byte Folded Spill
-; CHECK-NEXT:    .cfi_def_cfa_offset 80
-; CHECK-NEXT:    .cfi_offset w19, -8
-; CHECK-NEXT:    .cfi_offset w20, -16
-; CHECK-NEXT:    .cfi_offset w21, -24
-; CHECK-NEXT:    .cfi_offset w22, -32
-; CHECK-NEXT:    .cfi_offset w30, -48
-; CHECK-NEXT:    add x0, sp, #20
-; CHECK-NEXT:    add x1, sp, #16
-; CHECK-NEXT:    str q0, [sp] // 16-byte Folded Spill
-; CHECK-NEXT:    // kill: def $s0 killed $s0 killed $q0
-; CHECK-NEXT:    bl sincospif
-; CHECK-NEXT:    ldr q0, [sp] // 16-byte Folded Reload
+; CHECK:       ; %bb.0:
+; CHECK-NEXT:    sub sp, sp, #96
 ; CHECK-NEXT:    add x0, sp, #28
 ; CHECK-NEXT:    add x1, sp, #24
-; CHECK-NEXT:    add x19, sp, #28
-; CHECK-NEXT:    add x20, sp, #24
-; CHECK-NEXT:    mov s0, v0.s[1]
-; CHECK-NEXT:    bl sincospif
-; CHECK-NEXT:    ldr q0, [sp] // 16-byte Folded Reload
+; CHECK-NEXT:    stp x22, x21, [sp, #48] ; 16-byte Folded Spill
+; CHECK-NEXT:    stp x20, x19, [sp, #64] ; 16-byte Folded Spill
+; CHECK-NEXT:    stp x29, x30, [sp, #80] ; 16-byte Folded Spill
+; CHECK-NEXT:    str q0, [sp] ; 16-byte Folded Spill
+; CHECK-NEXT:    ; kill: def $s0 killed $s0 killed $q0
+; CHECK-NEXT:    bl ___sincospif
+; CHECK-NEXT:    ldr q0, [sp] ; 16-byte Folded Reload
+; CHECK-NEXT:    add x0, sp, #36
+; CHECK-NEXT:    add x1, sp, #32
+; CHECK-NEXT:    add x19, sp, #36
+; CHECK-NEXT:    add x20, sp, #32
+; CHECK-NEXT:    mov s0, v0[1]
+; CHECK-NEXT:    bl ___sincospif
+; CHECK-NEXT:    ldr q0, [sp] ; 16-byte Folded Reload
 ; CHECK-NEXT:    add x0, sp, #44
 ; CHECK-NEXT:    add x1, sp, #40
 ; CHECK-NEXT:    add x21, sp, #44
 ; CHECK-NEXT:    add x22, sp, #40
-; CHECK-NEXT:    mov s0, v0.s[2]
-; CHECK-NEXT:    bl sincospif
-; CHECK-NEXT:    ldp s1, s0, [sp, #16]
-; CHECK-NEXT:    ldr x30, [sp, #32] // 8-byte Folded Reload
-; CHECK-NEXT:    ld1 { v0.s }[1], [x19]
-; CHECK-NEXT:    ld1 { v1.s }[1], [x20]
-; CHECK-NEXT:    ldp x20, x19, [sp, #64] // 16-byte Folded Reload
-; CHECK-NEXT:    ld1 { v0.s }[2], [x21]
-; CHECK-NEXT:    ld1 { v1.s }[2], [x22]
-; CHECK-NEXT:    ldp x22, x21, [sp, #48] // 16-byte Folded Reload
-; CHECK-NEXT:    add sp, sp, #80
+; CHECK-NEXT:    mov s0, v0[2]
+; CHECK-NEXT:    bl ___sincospif
+; CHECK-NEXT:    ldp s1, s0, [sp, #24]
+; CHECK-NEXT:    ldp x29, x30, [sp, #80] ; 16-byte Folded Reload
+; CHECK-NEXT:    ld1.s { v0 }[1], [x19]
+; CHECK-NEXT:    ld1.s { v1 }[1], [x20]
+; CHECK-NEXT:    ldp x20, x19, [sp, #64] ; 16-byte Folded Reload
+; CHECK-NEXT:    ld1.s { v0 }[2], [x21]
+; CHECK-NEXT:    ld1.s { v1 }[2], [x22]
+; CHECK-NEXT:    ldp x22, x21, [sp, #48] ; 16-byte Folded Reload
+; CHECK-NEXT:    add sp, sp, #96
 ; CHECK-NEXT:    ret
   %result = call { <3 x float>, <3 x float> } @llvm.sincospi.v3f32(<3 x float> 
%a)
   ret { <3 x float>, <3 x float> } %result
 }
 
-define { <2 x float>, <2 x float> } @test_sincospi_v2f32(<2 x float> %a) {
+define { <2 x float>, <2 x float> } @test_sincospi_v2f32(<2 x float> %a) #0 {
 ; CHECK-LABEL: test_sincospi_v2f32:
-; CHECK:       // %bb.0:
+; CHECK:       ; %bb.0:
 ; CHECK-NEXT:    sub sp, sp, #64
-; CHECK-NEXT:    str x30, [sp, #32] // 8-byte Folded Spill
-; CHECK-NEXT:    stp x20, x19, [sp, #48] // 16-byte Folded Spill
-; CHECK-NEXT:    .cfi_def_cfa_offset 64
-; CHECK-NEXT:    .cfi_offset w19, -8
-; CHECK-NEXT:    .cfi_offset w20, -16
-; CHECK-NEXT:    .cfi_offset w30, -32
-; CHECK-NEXT:    // kill: def $d0 killed $d0 def $q0
-; CHECK-NEXT:    add x0, sp, #44
-; CHECK-NEXT:    add x1, sp, #40
-; CHECK-NEXT:    str q0, [sp] // 16-byte Folded Spill
-; CHECK-NEXT:    // kill: def $s0 killed $s0 killed $q0
-; CHECK-NEXT:    bl sincospif
-; CHECK-NEXT:    ldr q0, [sp] // 16-byte Folded Reload
+; CHECK-NEXT:    ; kill: def $d0 killed $d0 def $q0
 ; CHECK-NEXT:    add x0, sp, #28
 ; CHECK-NEXT:    add x1, sp, #24
-; CHECK-NEXT:    add x19, sp, #28
-; CHECK-NEXT:    add x20, sp, #24
-; CHECK-NEXT:    mov s0, v0.s[1]
-; CHECK-NEXT:    bl sincospif
-; CHECK-NEXT:    ldp s1, s0, [sp, #40]
-; CHECK-NEXT:    ldr x30, [sp, #32] // 8-byte Folded Reload
-; CHECK-NEXT:    ld1 { v0.s }[1], [x19]
-; CHECK-NEXT:    ld1 { v1.s }[1], [x20]
-; CHECK-NEXT:    ldp x20, x19, [sp, #48] // 16-byte Folded Reload
-; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $q0
-; CHECK-NEXT:    // kill: def $d1 killed $d1 killed $q1
+; CHECK-NEXT:    stp x20, x19, [sp, #32] ; 16-byte Folded Spill
+; CHECK-NEXT:    stp x29, x30, [sp, #48] ; 16-byte Folded Spill
+; CHECK-NEXT:    str q0, [sp] ; 16-byte Folded Spill
+; CHECK-NEXT:    ; kill: def $s0 killed $s0 killed $q0
+; CHECK-NEXT:    bl ___sincospif
+; CHECK-NEXT:    ldr q0, [sp] ; 16-byte Folded Reload
+; CHECK-NEXT:    add x0, sp, #20
+; CHECK-NEXT:    add x1, sp, #16
+; CHECK-NEXT:    add x19, sp, #20
+; CHECK-NEXT:    add x20, sp, #16
+; CHECK-NEXT:    mov s0, v0[1]
+; CHECK-NEXT:    bl ___sincospif
+; CHECK-NEXT:    ldp s1, s0, [sp, #24]
+; CHECK-NEXT:    ldp x29, x30, [sp, #48] ; 16-byte Folded Reload
+; CHECK-NEXT:    ld1.s { v0 }[1], [x19]
+; CHECK-NEXT:    ld1.s { v1 }[1], [x20]
+; CHECK-NEXT:    ldp x20, x19, [sp, #32] ; 16-byte Folded Reload
+; CHECK-NEXT:    ; kill: def $d0 killed $d0 killed $q0
+; CHECK-NEXT:    ; kill: def $d1 killed $d1 killed $q1
 ; CHECK-NEXT:    add sp, sp, #64
 ; CHECK-NEXT:    ret
   %result = call { <2 x float>, <2 x float> } @llvm.sincospi.v2f32(<2 x float> 
%a)
   ret { <2 x float>, <2 x float> } %result
 }
 
-define { double, double } @test_sincospi_f64(double %a) {
+define { double, double } @test_sincospi_f64(double %a) #0 {
 ; CHECK-LABEL: test_sincospi_f64:
-; CHECK:       // %bb.0:
+; CHECK:       ; %bb.0:
 ; CHECK-NEXT:    sub sp, sp, #32
-; CHECK-NEXT:    str x30, [sp, #16] // 8-byte Folded Spill
-; CHECK-NEXT:    .cfi_def_cfa_offset 32
-; CHECK-NEXT:    .cfi_offset w30, -16
-; CHECK-NEXT:    add x0, sp, #24
-; CHECK-NEXT:    add x1, sp, #8
-; CHECK-NEXT:    bl sincospi
-; CHECK-NEXT:    ldr d0, [sp, #24]
-; CHECK-NEXT:    ldr d1, [sp, #8]
-; CHECK-NEXT:    ldr x30, [sp, #16] // 8-byte Folded Reload
+; CHECK-NEXT:    add x0, sp, #8
+; CHECK-NEXT:    mov x1, sp
+; CHECK-NEXT:    stp x29, x30, [sp, #16] ; 16-byte Folded Spill
+; CHECK-NEXT:    bl ___sincospi
+; CHECK-NEXT:    ldp d1, d0, [sp]
+; CHECK-NEXT:    ldp x29, x30, [sp, #16] ; 16-byte Folded Reload
 ; CHECK-NEXT:    add sp, sp, #32
 ; CHECK-NEXT:    ret
   %result = call { double, double } @llvm.sincospi.f64(double %a)
   ret { double, double } %result
 }
 
-define { <2 x double>, <2 x double> } @test_sincospi_v2f64(<2 x double> %a) {
+define { <2 x double>, <2 x double> } @test_sincospi_v2f64(<2 x double> %a) #0 
{
 ; CHECK-LABEL: test_sincospi_v2f64:
-; CHECK:       // %bb.0:
+; CHECK:       ; %bb.0:
 ; CHECK-NEXT:    sub sp, sp, #80
-; CHECK-NEXT:    str x30, [sp, #48] // 8-byte Folded Spill
-; CHECK-NEXT:    stp x20, x19, [sp, #64] // 16-byte Folded Spill
-; CHECK-NEXT:    .cfi_def_cfa_offset 80
-; CHECK-NEXT:    .cfi_offset w19, -8
-; CHECK-NEXT:    .cfi_offset w20, -16
-; CHECK-NEXT:    .cfi_offset w30, -32
-; CHECK-NEXT:    add x0, sp, #56
-; CHECK-NEXT:    add x1, sp, #40
-; CHECK-NEXT:    str q0, [sp] // 16-byte Folded Spill
-; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $q0
-; CHECK-NEXT:    bl sincospi
-; CHECK-NEXT:    ldr q0, [sp] // 16-byte Folded Reload
-; CHECK-NEXT:    add x0, sp, #32
-; CHECK-NEXT:    add x1, sp, #24
-; CHECK-NEXT:    add x19, sp, #32
-; CHECK-NEXT:    add x20, sp, #24
-; CHECK-NEXT:    mov d0, v0.d[1]
-; CHECK-NEXT:    bl sincospi
-; CHECK-NEXT:    ldr d0, [sp, #56]
-; CHECK-NEXT:    ldr d1, [sp, #40]
-; CHECK-NEXT:    ldr x30, [sp, #48] // 8-byte Folded Reload
-; CHECK-NEXT:    ld1 { v0.d }[1], [x19]
-; CHECK-NEXT:    ld1 { v1.d }[1], [x20]
-; CHECK-NEXT:    ldp x20, x19, [sp, #64] // 16-byte Folded Reload
+; CHECK-NEXT:    add x0, sp, #40
+; CHECK-NEXT:    add x1, sp, #32
+; CHECK-NEXT:    stp x20, x19, [sp, #48] ; 16-byte Folded Spill
+; CHECK-NEXT:    stp x29, x30, [sp, #64] ; 16-byte Folded Spill
+; CHECK-NEXT:    str q0, [sp] ; 16-byte Folded Spill
+; CHECK-NEXT:    ; kill: def $d0 killed $d0 killed $q0
+; CHECK-NEXT:    bl ___sincospi
+; CHECK-NEXT:    ldr q0, [sp] ; 16-byte Folded Reload
+; CHECK-NEXT:    add x0, sp, #24
+; CHECK-NEXT:    add x1, sp, #16
+; CHECK-NEXT:    add x19, sp, #24
+; CHECK-NEXT:    add x20, sp, #16
+; CHECK-NEXT:    mov d0, v0[1]
+; CHECK-NEXT:    bl ___sincospi
+; CHECK-NEXT:    ldp d1, d0, [sp, #32]
+; CHECK-NEXT:    ldp x29, x30, [sp, #64] ; 16-byte Folded Reload
+; CHECK-NEXT:    ld1.d { v0 }[1], [x19]
+; CHECK-NEXT:    ld1.d { v1 }[1], [x20]
+; CHECK-NEXT:    ldp x20, x19, [sp, #48] ; 16-byte Folded Reload
 ; CHECK-NEXT:    add sp, sp, #80
 ; CHECK-NEXT:    ret
   %result = call { <2 x double>, <2 x double> } @llvm.sincospi.v2f64(<2 x 
double> %a)
   ret { <2 x double>, <2 x double> } %result
 }
+
+attributes #0 = { nounwind }
diff --git a/llvm/test/CodeGen/ARM/llvm.sincospi.ll 
b/llvm/test/CodeGen/ARM/llvm.sincospi.ll
new file mode 100644
index 0000000000000..91bf0aaf1806a
--- /dev/null
+++ b/llvm/test/CodeGen/ARM/llvm.sincospi.ll
@@ -0,0 +1,249 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py 
UTC_ARGS: --version 6
+; RUN: llc -mtriple=thumbv7-apple-ios7.0.0 < %s | FileCheck %s
+
+define { half, half } @test_sincospi_f16(half %a) #0 {
+; CHECK-LABEL: test_sincospi_f16:
+; CHECK:       @ %bb.0:
+; CHECK-NEXT:    push {r4, lr}
+; CHECK-NEXT:    sub sp, #8
+; CHECK-NEXT:    bl ___extendhfsf2
+; CHECK-NEXT:    add r1, sp, #4
+; CHECK-NEXT:    mov r2, sp
+; CHECK-NEXT:    bl ___sincospif
+; CHECK-NEXT:    ldr r0, [sp, #4]
+; CHECK-NEXT:    bl ___truncsfhf2
+; CHECK-NEXT:    mov r4, r0
+; CHECK-NEXT:    ldr r0, [sp]
+; CHECK-NEXT:    bl ___truncsfhf2
+; CHECK-NEXT:    mov r1, r0
+; CHECK-NEXT:    mov r0, r4
+; CHECK-NEXT:    add sp, #8
+; CHECK-NEXT:    pop {r4, pc}
+  %result = call { half, half } @llvm.sincospi.f16(half %a)
+  ret { half, half } %result
+}
+
+define half @test_sincospi_f16_only_use_sin(half %a) #0 {
+; CHECK-LABEL: test_sincospi_f16_only_use_sin:
+; CHECK:       @ %bb.0:
+; CHECK-NEXT:    str lr, [sp, #-4]!
+; CHECK-NEXT:    sub sp, #8
+; CHECK-NEXT:    bl ___extendhfsf2
+; CHECK-NEXT:    add r1, sp, #4
+; CHECK-NEXT:    mov r2, sp
+; CHECK-NEXT:    bl ___sincospif
+; CHECK-NEXT:    ldr r0, [sp, #4]
+; CHECK-NEXT:    bl ___truncsfhf2
+; CHECK-NEXT:    add sp, #8
+; CHECK-NEXT:    ldr lr, [sp], #4
+; CHECK-NEXT:    bx lr
+  %result = call { half, half } @llvm.sincospi.f16(half %a)
+  %result.0 = extractvalue { half, half } %result, 0
+  ret half %result.0
+}
+
+define half @test_sincospi_f16_only_use_cos(half %a) #0 {
+; CHECK-LABEL: test_sincospi_f16_only_use_cos:
+; CHECK:       @ %bb.0:
+; CHECK-NEXT:    str lr, [sp, #-4]!
+; CHECK-NEXT:    sub sp, #8
+; CHECK-NEXT:    bl ___extendhfsf2
+; CHECK-NEXT:    add r1, sp, #4
+; CHECK-NEXT:    mov r2, sp
+; CHECK-NEXT:    bl ___sincospif
+; CHECK-NEXT:    ldr r0, [sp]
+; CHECK-NEXT:    bl ___truncsfhf2
+; CHECK-NEXT:    add sp, #8
+; CHECK-NEXT:    ldr lr, [sp], #4
+; CHECK-NEXT:    bx lr
+  %result = call { half, half } @llvm.sincospi.f16(half %a)
+  %result.1 = extractvalue { half, half } %result, 1
+  ret half %result.1
+}
+
+define { <2 x half>, <2 x half> } @test_sincospi_v2f16(<2 x half> %a) #0 {
+; CHECK-LABEL: test_sincospi_v2f16:
+; CHECK:       @ %bb.0:
+; CHECK-NEXT:    push {r4, lr}
+; CHECK-NEXT:    vpush {d8}
+; CHECK-NEXT:    sub sp, #24
+; CHECK-NEXT:    mov r4, r0
+; CHECK-NEXT:    mov r0, r1
+; CHECK-NEXT:    bl ___extendhfsf2
+; CHECK-NEXT:    add r1, sp, #12
+; CHECK-NEXT:    add r2, sp, #8
+; CHECK-NEXT:    bl ___sincospif
+; CHECK-NEXT:    mov r0, r4
+; CHECK-NEXT:    bl ___extendhfsf2
+; CHECK-NEXT:    add r1, sp, #4
+; CHECK-NEXT:    mov r2, sp
+; CHECK-NEXT:    bl ___sincospif
+; CHECK-NEXT:    ldr r0, [sp, #12]
+; CHECK-NEXT:    bl ___truncsfhf2
+; CHECK-NEXT:    ldr r1, [sp, #4]
+; CHECK-NEXT:    strh.w r0, [sp, #22]
+; CHECK-NEXT:    mov r0, r1
+; CHECK-NEXT:    bl ___truncsfhf2
+; CHECK-NEXT:    strh.w r0, [sp, #20]
+; CHECK-NEXT:    add r0, sp, #20
+; CHECK-NEXT:    vld1.32 {d8[0]}, [r0:32]
+; CHECK-NEXT:    ldr r0, [sp, #8]
+; CHECK-NEXT:    bl ___truncsfhf2
+; CHECK-NEXT:    ldr r1, [sp]
+; CHECK-NEXT:    strh.w r0, [sp, #18]
+; CHECK-NEXT:    mov r0, r1
+; CHECK-NEXT:    bl ___truncsfhf2
+; CHECK-NEXT:    strh.w r0, [sp, #16]
+; CHECK-NEXT:    add r0, sp, #16
+; CHECK-NEXT:    vmovl.u16 q9, d8
+; CHECK-NEXT:    vld1.32 {d16[0]}, [r0:32]
+; CHECK-NEXT:    vmovl.u16 q8, d16
+; CHECK-NEXT:    vmov.32 r0, d18[0]
+; CHECK-NEXT:    vmov.32 r1, d18[1]
+; CHECK-NEXT:    vmov.32 r2, d16[0]
+; CHECK-NEXT:    vmov.32 r3, d16[1]
+; CHECK-NEXT:    add sp, #24
+; CHECK-NEXT:    vpop {d8}
+; CHECK-NEXT:    pop {r4, pc}
+  %result = call { <2 x half>, <2 x half> } @llvm.sincospi.v2f16(<2 x half> %a)
+  ret { <2 x half>, <2 x half> } %result
+}
+
+define { float, float } @test_sincospi_f32(float %a) #0 {
+; CHECK-LABEL: test_sincospi_f32:
+; CHECK:       @ %bb.0:
+; CHECK-NEXT:    str lr, [sp, #-4]!
+; CHECK-NEXT:    sub sp, #8
+; CHECK-NEXT:    add r1, sp, #4
+; CHECK-NEXT:    mov r2, sp
+; CHECK-NEXT:    bl ___sincospif
+; CHECK-NEXT:    ldrd r1, r0, [sp], #8
+; CHECK-NEXT:    ldr lr, [sp], #4
+; CHECK-NEXT:    bx lr
+  %result = call { float, float } @llvm.sincospi.f32(float %a)
+  ret { float, float } %result
+}
+
+define { <2 x float>, <2 x float> } @test_sincospi_v2f32(<2 x float> %a) #0 {
+; CHECK-LABEL: test_sincospi_v2f32:
+; CHECK:       @ %bb.0:
+; CHECK-NEXT:    str lr, [sp, #-4]!
+; CHECK-NEXT:    vpush {d8}
+; CHECK-NEXT:    sub sp, #16
+; CHECK-NEXT:    vmov d8, r0, r1
+; CHECK-NEXT:    add r1, sp, #4
+; CHECK-NEXT:    mov r2, sp
+; CHECK-NEXT:    vmov r0, s17
+; CHECK-NEXT:    bl ___sincospif
+; CHECK-NEXT:    vmov r0, s16
+; CHECK-NEXT:    add r1, sp, #12
+; CHECK-NEXT:    add r2, sp, #8
+; CHECK-NEXT:    bl ___sincospif
+; CHECK-NEXT:    vldr s1, [sp, #4]
+; CHECK-NEXT:    vldr s3, [sp]
+; CHECK-NEXT:    vldr s0, [sp, #12]
+; CHECK-NEXT:    vldr s2, [sp, #8]
+; CHECK-NEXT:    vmov r0, r1, d0
+; CHECK-NEXT:    vmov r2, r3, d1
+; CHECK-NEXT:    add sp, #16
+; CHECK-NEXT:    vpop {d8}
+; CHECK-NEXT:    ldr lr, [sp], #4
+; CHECK-NEXT:    bx lr
+  %result = call { <2 x float>, <2 x float> } @llvm.sincospi.v2f32(<2 x float> 
%a)
+  ret { <2 x float>, <2 x float> } %result
+}
+
+define { <3 x float>, <3 x float> } @test_sincospi_v3f32(<3 x float> %a) #0 {
+; CHECK-LABEL: test_sincospi_v3f32:
+; CHECK:       @ %bb.0:
+; CHECK-NEXT:    push {r4, r5, r6, r7, lr}
+; CHECK-NEXT:    sub sp, #16
+; CHECK-NEXT:    mov r6, r2
+; CHECK-NEXT:    mov r7, r1
+; CHECK-NEXT:    add r1, sp, #12
+; CHECK-NEXT:    add r2, sp, #8
+; CHECK-NEXT:    mov r4, r0
+; CHECK-NEXT:    mov r0, r6
+; CHECK-NEXT:    mov r5, r3
+; CHECK-NEXT:    bl ___sincospif
+; CHECK-NEXT:    add r1, sp, #4
+; CHECK-NEXT:    mov r2, sp
+; CHECK-NEXT:    mov r0, r7
+; CHECK-NEXT:    bl ___sincospif
+; CHECK-NEXT:    ldr r0, [sp, #36]
+; CHECK-NEXT:    vmov d0, r7, r6
+; CHECK-NEXT:    mov r1, r4
+; CHECK-NEXT:    add.w r2, r4, #16
+; CHECK-NEXT:    vmov d1, r5, r0
+; CHECK-NEXT:    vmov r0, s2
+; CHECK-NEXT:    vldr s1, [sp, #8]
+; CHECK-NEXT:    vldr s3, [sp, #12]
+; CHECK-NEXT:    vldr s2, [sp, #4]
+; CHECK-NEXT:    vldr s0, [sp]
+; CHECK-NEXT:    vst1.32 {d1}, [r1:64]!
+; CHECK-NEXT:    vst1.32 {d0}, [r2:64]!
+; CHECK-NEXT:    bl ___sincospif
+; CHECK-NEXT:    add sp, #16
+; CHECK-NEXT:    pop {r4, r5, r6, r7, pc}
+  %result = call { <3 x float>, <3 x float> } @llvm.sincospi.v3f32(<3 x float> 
%a)
+  ret { <3 x float>, <3 x float> } %result
+}
+
+define { double, double } @test_sincospi_f64(double %a) #0 {
+; CHECK-LABEL: test_sincospi_f64:
+; CHECK:       @ %bb.0:
+; CHECK-NEXT:    push {r4, r7, lr}
+; CHECK-NEXT:    add r7, sp, #4
+; CHECK-NEXT:    sub sp, #20
+; CHECK-NEXT:    mov r4, sp
+; CHECK-NEXT:    bfc r4, #0, #3
+; CHECK-NEXT:    mov sp, r4
+; CHECK-NEXT:    add r2, sp, #8
+; CHECK-NEXT:    mov r3, sp
+; CHECK-NEXT:    bl ___sincospi
+; CHECK-NEXT:    subs r4, r7, #4
+; CHECK-NEXT:    ldrd r0, r1, [sp, #8]
+; CHECK-NEXT:    ldrd r2, r3, [sp]
+; CHECK-NEXT:    mov sp, r4
+; CHECK-NEXT:    pop {r4, r7, pc}
+  %result = call { double, double } @llvm.sincospi.f64(double %a)
+  ret { double, double } %result
+}
+
+define { <2 x double>, <2 x double> } @test_sincospi_v2f64(<2 x double> %a) #0 
{
+; CHECK-LABEL: test_sincospi_v2f64:
+; CHECK:       @ %bb.0:
+; CHECK-NEXT:    push.w {r4, r5, r6, r7, r8, lr}
+; CHECK-NEXT:    add r7, sp, #16
+; CHECK-NEXT:    sub sp, #32
+; CHECK-NEXT:    mov r4, sp
+; CHECK-NEXT:    bfc r4, #0, #3
+; CHECK-NEXT:    mov sp, r4
+; CHECK-NEXT:    mov r6, r1
+; CHECK-NEXT:    ldr r1, [r7, #8]
+; CHECK-NEXT:    mov r5, r3
+; CHECK-NEXT:    mov r8, r2
+; CHECK-NEXT:    add r2, sp, #24
+; CHECK-NEXT:    add r3, sp, #16
+; CHECK-NEXT:    mov r4, r0
+; CHECK-NEXT:    mov r0, r5
+; CHECK-NEXT:    bl ___sincospi
+; CHECK-NEXT:    add r2, sp, #8
+; CHECK-NEXT:    mov r3, sp
+; CHECK-NEXT:    mov r0, r6
+; CHECK-NEXT:    mov r1, r8
+; CHECK-NEXT:    bl ___sincospi
+; CHECK-NEXT:    vldr d19, [sp, #24]
+; CHECK-NEXT:    vldr d18, [sp, #8]
+; CHECK-NEXT:    vldr d17, [sp, #16]
+; CHECK-NEXT:    vldr d16, [sp]
+; CHECK-NEXT:    vst1.32 {d18, d19}, [r4]!
+; CHECK-NEXT:    vst1.32 {d16, d17}, [r4]
+; CHECK-NEXT:    sub.w r4, r7, #16
+; CHECK-NEXT:    mov sp, r4
+; CHECK-NEXT:    pop.w {r4, r5, r6, r7, r8, pc}
+  %result = call { <2 x double>, <2 x double> } @llvm.sincospi.v2f64(<2 x 
double> %a)
+  ret { <2 x double>, <2 x double> } %result
+}
+
+attributes #0 = { nounwind }
diff --git a/llvm/test/CodeGen/PowerPC/llvm.sincos.ll 
b/llvm/test/CodeGen/PowerPC/llvm.sincos.ll
index aaf81ff814488..5b4e91c449522 100644
--- a/llvm/test/CodeGen/PowerPC/llvm.sincos.ll
+++ b/llvm/test/CodeGen/PowerPC/llvm.sincos.ll
@@ -26,30 +26,6 @@ define { ppc_fp128, ppc_fp128 } 
@test_sincos_ppcf128(ppc_fp128 %a) {
   ret { ppc_fp128, ppc_fp128 } %result
 }
 
-define { ppc_fp128, ppc_fp128 } @test_sincospi_ppcf128(ppc_fp128 %a) {
-; CHECK-LABEL: test_sincospi_ppcf128:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    mflr r0
-; CHECK-NEXT:    stdu r1, -64(r1)
-; CHECK-NEXT:    std r0, 80(r1)
-; CHECK-NEXT:    .cfi_def_cfa_offset 64
-; CHECK-NEXT:    .cfi_offset lr, 16
-; CHECK-NEXT:    addi r5, r1, 48
-; CHECK-NEXT:    addi r6, r1, 32
-; CHECK-NEXT:    bl sincospil
-; CHECK-NEXT:    nop
-; CHECK-NEXT:    lfd f1, 48(r1)
-; CHECK-NEXT:    lfd f2, 56(r1)
-; CHECK-NEXT:    lfd f3, 32(r1)
-; CHECK-NEXT:    lfd f4, 40(r1)
-; CHECK-NEXT:    addi r1, r1, 64
-; CHECK-NEXT:    ld r0, 16(r1)
-; CHECK-NEXT:    mtlr r0
-; CHECK-NEXT:    blr
-  %result = call { ppc_fp128, ppc_fp128 } @llvm.sincospi.ppcf128(ppc_fp128 %a)
-  ret { ppc_fp128, ppc_fp128 } %result
-}
-
 ; FIXME: This could be made a tail call with the default expansion of 
llvm.sincos.
 define void @test_sincos_ppcf128_void_tail_call(ppc_fp128 %a, ptr noalias 
%out_sin, ptr noalias %out_cos) {
 ; CHECK-LABEL: test_sincos_ppcf128_void_tail_call:
@@ -73,29 +49,6 @@ define void @test_sincos_ppcf128_void_tail_call(ppc_fp128 
%a, ptr noalias %out_s
   ret void
 }
 
-; FIXME: This could be made a tail call with the default expansion of 
llvm.sincospi.
-define void @test_sincospi_ppcf128_void_tail_call(ppc_fp128 %a, ptr noalias 
%out_sin, ptr noalias %out_cos) {
-; CHECK-LABEL: test_sincospi_ppcf128_void_tail_call:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    mflr r0
-; CHECK-NEXT:    stdu r1, -32(r1)
-; CHECK-NEXT:    std r0, 48(r1)
-; CHECK-NEXT:    .cfi_def_cfa_offset 32
-; CHECK-NEXT:    .cfi_offset lr, 16
-; CHECK-NEXT:    bl sincospil
-; CHECK-NEXT:    nop
-; CHECK-NEXT:    addi r1, r1, 32
-; CHECK-NEXT:    ld r0, 16(r1)
-; CHECK-NEXT:    mtlr r0
-; CHECK-NEXT:    blr
-  %result = tail call { ppc_fp128, ppc_fp128 } 
@llvm.sincospi.ppcf128(ppc_fp128 %a)
-  %result.0 = extractvalue { ppc_fp128, ppc_fp128 } %result, 0
-  %result.1 = extractvalue { ppc_fp128, ppc_fp128 } %result, 1
-  store ppc_fp128 %result.0, ptr %out_sin, align 16
-  store ppc_fp128 %result.1, ptr %out_cos, align 16
-  ret void
-}
-
 ; NOTE: This would need a struct-return library call for llvm.sincos to become 
a tail call.
 define { ppc_fp128, ppc_fp128 } @test_sincos_ppcf128_tail_call(ppc_fp128 %a) {
 ; CHECK-LABEL: test_sincos_ppcf128_tail_call:
@@ -120,28 +73,3 @@ define { ppc_fp128, ppc_fp128 } 
@test_sincos_ppcf128_tail_call(ppc_fp128 %a) {
   %result = tail call { ppc_fp128, ppc_fp128 } @llvm.sincos.ppcf128(ppc_fp128 
%a)
   ret { ppc_fp128, ppc_fp128 } %result
 }
-
-; NOTE: This would need a struct-return library call for llvm.sincospi to 
become a tail call.
-define { ppc_fp128, ppc_fp128 } @test_sincospi_ppcf128_tail_call(ppc_fp128 %a) 
{
-; CHECK-LABEL: test_sincospi_ppcf128_tail_call:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    mflr r0
-; CHECK-NEXT:    stdu r1, -64(r1)
-; CHECK-NEXT:    std r0, 80(r1)
-; CHECK-NEXT:    .cfi_def_cfa_offset 64
-; CHECK-NEXT:    .cfi_offset lr, 16
-; CHECK-NEXT:    addi r5, r1, 48
-; CHECK-NEXT:    addi r6, r1, 32
-; CHECK-NEXT:    bl sincospil
-; CHECK-NEXT:    nop
-; CHECK-NEXT:    lfd f1, 48(r1)
-; CHECK-NEXT:    lfd f2, 56(r1)
-; CHECK-NEXT:    lfd f3, 32(r1)
-; CHECK-NEXT:    lfd f4, 40(r1)
-; CHECK-NEXT:    addi r1, r1, 64
-; CHECK-NEXT:    ld r0, 16(r1)
-; CHECK-NEXT:    mtlr r0
-; CHECK-NEXT:    blr
-  %result = tail call { ppc_fp128, ppc_fp128 } 
@llvm.sincospi.ppcf128(ppc_fp128 %a)
-  ret { ppc_fp128, ppc_fp128 } %result
-}
diff --git a/llvm/test/CodeGen/PowerPC/llvm.sincospi.ll 
b/llvm/test/CodeGen/PowerPC/llvm.sincospi.ll
new file mode 100644
index 0000000000000..75e7559386f16
--- /dev/null
+++ b/llvm/test/CodeGen/PowerPC/llvm.sincospi.ll
@@ -0,0 +1,21 @@
+; RUN: not llc -mtriple=powerpc64le-gnu-linux -filetype=null %s 2>&1 | 
FileCheck %s
+
+; CHECK: error: no libcall available for fsincospi
+define { half, half } @test_sincospi_f16(half %a) #0 {
+  %result = call { half, half } @llvm.sincospi.f16(half %a)
+  ret { half, half } %result
+}
+
+; CHECK: error: no libcall available for fsincospi
+define { float, float } @test_sincospi_f32(float %a) #0 {
+  %result = call { float, float } @llvm.sincospi.f32(float %a)
+  ret { float, float } %result
+}
+
+; CHECK: error: no libcall available for fsincospi
+define { double, double } @test_sincospi_f64(double %a) #0 {
+  %result = call { double, double } @llvm.sincospi.f64(double %a)
+  ret { double, double } %result
+}
+
+attributes #0 = { nounwind }
diff --git a/llvm/test/CodeGen/PowerPC/llvm.sincospi.ppcfp128.ll 
b/llvm/test/CodeGen/PowerPC/llvm.sincospi.ppcfp128.ll
new file mode 100644
index 0000000000000..bc656bb785e9e
--- /dev/null
+++ b/llvm/test/CodeGen/PowerPC/llvm.sincospi.ppcfp128.ll
@@ -0,0 +1,25 @@
+; XFAIL: *
+; FIXME: asserts
+; RUN: llc -mcpu=pwr9 -mtriple=powerpc64le-gnu-linux -filetype=null \
+; RUN:   -ppc-vsr-nums-as-vr -ppc-asm-full-reg-names %s
+
+define { ppc_fp128, ppc_fp128 } @test_sincospi_ppcf128(ppc_fp128 %a) {
+  %result = call { ppc_fp128, ppc_fp128 } @llvm.sincospi.ppcf128(ppc_fp128 %a)
+  ret { ppc_fp128, ppc_fp128 } %result
+}
+
+; FIXME: This could be made a tail call with the default expansion of 
llvm.sincospi.
+define void @test_sincospi_ppcf128_void_tail_call(ppc_fp128 %a, ptr noalias 
%out_sin, ptr noalias %out_cos) {
+  %result = tail call { ppc_fp128, ppc_fp128 } 
@llvm.sincospi.ppcf128(ppc_fp128 %a)
+  %result.0 = extractvalue { ppc_fp128, ppc_fp128 } %result, 0
+  %result.1 = extractvalue { ppc_fp128, ppc_fp128 } %result, 1
+  store ppc_fp128 %result.0, ptr %out_sin, align 16
+  store ppc_fp128 %result.1, ptr %out_cos, align 16
+  ret void
+}
+
+; NOTE: This would need a struct-return library call for llvm.sincospi to 
become a tail call.
+define { ppc_fp128, ppc_fp128 } @test_sincospi_ppcf128_tail_call(ppc_fp128 %a) 
{
+  %result = tail call { ppc_fp128, ppc_fp128 } 
@llvm.sincospi.ppcf128(ppc_fp128 %a)
+  ret { ppc_fp128, ppc_fp128 } %result
+}
diff --git a/llvm/test/CodeGen/X86/llvm.sincospi.ll 
b/llvm/test/CodeGen/X86/llvm.sincospi.ll
new file mode 100644
index 0000000000000..5546c66deba30
--- /dev/null
+++ b/llvm/test/CodeGen/X86/llvm.sincospi.ll
@@ -0,0 +1,233 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py 
UTC_ARGS: --version 6
+; RUN: llc -mtriple=x86_64-apple-macosx10.9 < %s | FileCheck %s
+
+define { half, half } @test_sincospi_f16(half %a) #0 {
+; CHECK-LABEL: test_sincospi_f16:
+; CHECK:       ## %bb.0:
+; CHECK-NEXT:    subq $40, %rsp
+; CHECK-NEXT:    pextrw $0, %xmm0, %eax
+; CHECK-NEXT:    movzwl %ax, %edi
+; CHECK-NEXT:    callq ___extendhfsf2
+; CHECK-NEXT:    leaq {{[0-9]+}}(%rsp), %rdi
+; CHECK-NEXT:    leaq {{[0-9]+}}(%rsp), %rsi
+; CHECK-NEXT:    callq ___sincospif
+; CHECK-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; CHECK-NEXT:    callq ___truncsfhf2
+; CHECK-NEXT:    ## kill: def $ax killed $ax def $eax
+; CHECK-NEXT:    pinsrw $0, %eax, %xmm0
+; CHECK-NEXT:    movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill
+; CHECK-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; CHECK-NEXT:    callq ___truncsfhf2
+; CHECK-NEXT:    ## kill: def $ax killed $ax def $eax
+; CHECK-NEXT:    pinsrw $0, %eax, %xmm1
+; CHECK-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Reload
+; CHECK-NEXT:    addq $40, %rsp
+; CHECK-NEXT:    retq
+  %result = call { half, half } @llvm.sincospi.f16(half %a)
+  ret { half, half } %result
+}
+
+define half @test_sincospi_f16_only_use_sin(half %a) #0 {
+; CHECK-LABEL: test_sincospi_f16_only_use_sin:
+; CHECK:       ## %bb.0:
+; CHECK-NEXT:    pushq %rax
+; CHECK-NEXT:    pextrw $0, %xmm0, %eax
+; CHECK-NEXT:    movzwl %ax, %edi
+; CHECK-NEXT:    callq ___extendhfsf2
+; CHECK-NEXT:    movq %rsp, %rdi
+; CHECK-NEXT:    leaq {{[0-9]+}}(%rsp), %rsi
+; CHECK-NEXT:    callq ___sincospif
+; CHECK-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; CHECK-NEXT:    callq ___truncsfhf2
+; CHECK-NEXT:    ## kill: def $ax killed $ax def $eax
+; CHECK-NEXT:    pinsrw $0, %eax, %xmm0
+; CHECK-NEXT:    popq %rax
+; CHECK-NEXT:    retq
+  %result = call { half, half } @llvm.sincospi.f16(half %a)
+  %result.0 = extractvalue { half, half } %result, 0
+  ret half %result.0
+}
+
+define half @test_sincospi_f16_only_use_cos(half %a) #0 {
+; CHECK-LABEL: test_sincospi_f16_only_use_cos:
+; CHECK:       ## %bb.0:
+; CHECK-NEXT:    pushq %rax
+; CHECK-NEXT:    pextrw $0, %xmm0, %eax
+; CHECK-NEXT:    movzwl %ax, %edi
+; CHECK-NEXT:    callq ___extendhfsf2
+; CHECK-NEXT:    leaq {{[0-9]+}}(%rsp), %rdi
+; CHECK-NEXT:    movq %rsp, %rsi
+; CHECK-NEXT:    callq ___sincospif
+; CHECK-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; CHECK-NEXT:    callq ___truncsfhf2
+; CHECK-NEXT:    ## kill: def $ax killed $ax def $eax
+; CHECK-NEXT:    pinsrw $0, %eax, %xmm0
+; CHECK-NEXT:    popq %rax
+; CHECK-NEXT:    retq
+  %result = call { half, half } @llvm.sincospi.f16(half %a)
+  %result.1 = extractvalue { half, half } %result, 1
+  ret half %result.1
+}
+
+define { <2 x half>, <2 x half> } @test_sincospi_v2f16(<2 x half> %a) #0 {
+; CHECK-LABEL: test_sincospi_v2f16:
+; CHECK:       ## %bb.0:
+; CHECK-NEXT:    pushq %rbx
+; CHECK-NEXT:    subq $64, %rsp
+; CHECK-NEXT:    pextrw $0, %xmm0, %ebx
+; CHECK-NEXT:    psrld $16, %xmm0
+; CHECK-NEXT:    pextrw $0, %xmm0, %eax
+; CHECK-NEXT:    movzwl %ax, %edi
+; CHECK-NEXT:    callq ___extendhfsf2
+; CHECK-NEXT:    leaq {{[0-9]+}}(%rsp), %rdi
+; CHECK-NEXT:    movq %rsp, %rsi
+; CHECK-NEXT:    callq ___sincospif
+; CHECK-NEXT:    movzwl %bx, %edi
+; CHECK-NEXT:    callq ___extendhfsf2
+; CHECK-NEXT:    leaq {{[0-9]+}}(%rsp), %rdi
+; CHECK-NEXT:    leaq {{[0-9]+}}(%rsp), %rsi
+; CHECK-NEXT:    callq ___sincospif
+; CHECK-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; CHECK-NEXT:    callq ___truncsfhf2
+; CHECK-NEXT:    ## kill: def $ax killed $ax def $eax
+; CHECK-NEXT:    pinsrw $0, %eax, %xmm0
+; CHECK-NEXT:    movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill
+; CHECK-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; CHECK-NEXT:    callq ___truncsfhf2
+; CHECK-NEXT:    ## kill: def $ax killed $ax def $eax
+; CHECK-NEXT:    pinsrw $0, %eax, %xmm0
+; CHECK-NEXT:    movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill
+; CHECK-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; CHECK-NEXT:    callq ___truncsfhf2
+; CHECK-NEXT:    ## kill: def $ax killed $ax def $eax
+; CHECK-NEXT:    pinsrw $0, %eax, %xmm0
+; CHECK-NEXT:    movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill
+; CHECK-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; CHECK-NEXT:    callq ___truncsfhf2
+; CHECK-NEXT:    ## kill: def $ax killed $ax def $eax
+; CHECK-NEXT:    pinsrw $0, %eax, %xmm0
+; CHECK-NEXT:    punpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Folded 
Reload
+; CHECK-NEXT:    ## xmm0 = 
xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
+; CHECK-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 ## 16-byte Reload
+; CHECK-NEXT:    punpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 ## 16-byte Folded 
Reload
+; CHECK-NEXT:    ## xmm1 = 
xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3]
+; CHECK-NEXT:    addq $64, %rsp
+; CHECK-NEXT:    popq %rbx
+; CHECK-NEXT:    retq
+  %result = call { <2 x half>, <2 x half> } @llvm.sincospi.v2f16(<2 x half> %a)
+  ret { <2 x half>, <2 x half> } %result
+}
+
+define { float, float } @test_sincospi_f32(float %a) #0 {
+; CHECK-LABEL: test_sincospi_f32:
+; CHECK:       ## %bb.0:
+; CHECK-NEXT:    pushq %rax
+; CHECK-NEXT:    leaq {{[0-9]+}}(%rsp), %rdi
+; CHECK-NEXT:    movq %rsp, %rsi
+; CHECK-NEXT:    callq ___sincospif
+; CHECK-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; CHECK-NEXT:    movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; CHECK-NEXT:    popq %rax
+; CHECK-NEXT:    retq
+  %result = call { float, float } @llvm.sincospi.f32(float %a)
+  ret { float, float } %result
+}
+
+define { <2 x float>, <2 x float> } @test_sincospi_v2f32(<2 x float> %a) #0 {
+; CHECK-LABEL: test_sincospi_v2f32:
+; CHECK:       ## %bb.0:
+; CHECK-NEXT:    subq $40, %rsp
+; CHECK-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill
+; CHECK-NEXT:    leaq {{[0-9]+}}(%rsp), %rdi
+; CHECK-NEXT:    movq %rsp, %rsi
+; CHECK-NEXT:    callq ___sincospif
+; CHECK-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Reload
+; CHECK-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,1,1,1]
+; CHECK-NEXT:    leaq {{[0-9]+}}(%rsp), %rdi
+; CHECK-NEXT:    leaq {{[0-9]+}}(%rsp), %rsi
+; CHECK-NEXT:    callq ___sincospif
+; CHECK-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; CHECK-NEXT:    movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; CHECK-NEXT:    unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; CHECK-NEXT:    movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; CHECK-NEXT:    movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
+; CHECK-NEXT:    unpcklps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
+; CHECK-NEXT:    addq $40, %rsp
+; CHECK-NEXT:    retq
+  %result = call { <2 x float>, <2 x float> } @llvm.sincospi.v2f32(<2 x float> 
%a)
+  ret { <2 x float>, <2 x float> } %result
+}
+
+define { <3 x float>, <3 x float> } @test_sincospi_v3f32(<3 x float> %a) #0 {
+; CHECK-LABEL: test_sincospi_v3f32:
+; CHECK:       ## %bb.0:
+; CHECK-NEXT:    subq $56, %rsp
+; CHECK-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill
+; CHECK-NEXT:    leaq {{[0-9]+}}(%rsp), %rdi
+; CHECK-NEXT:    leaq {{[0-9]+}}(%rsp), %rsi
+; CHECK-NEXT:    callq ___sincospif
+; CHECK-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Reload
+; CHECK-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,1,1,1]
+; CHECK-NEXT:    leaq {{[0-9]+}}(%rsp), %rdi
+; CHECK-NEXT:    leaq {{[0-9]+}}(%rsp), %rsi
+; CHECK-NEXT:    callq ___sincospif
+; CHECK-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Reload
+; CHECK-NEXT:    movhlps {{.*#+}} xmm0 = xmm0[1,1]
+; CHECK-NEXT:    leaq {{[0-9]+}}(%rsp), %rdi
+; CHECK-NEXT:    leaq {{[0-9]+}}(%rsp), %rsi
+; CHECK-NEXT:    callq ___sincospif
+; CHECK-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; CHECK-NEXT:    movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; CHECK-NEXT:    unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; CHECK-NEXT:    movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; CHECK-NEXT:    movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; CHECK-NEXT:    movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; CHECK-NEXT:    movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
+; CHECK-NEXT:    unpcklps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
+; CHECK-NEXT:    movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
+; CHECK-NEXT:    movlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0]
+; CHECK-NEXT:    addq $56, %rsp
+; CHECK-NEXT:    retq
+  %result = call { <3 x float>, <3 x float> } @llvm.sincospi.v3f32(<3 x float> 
%a)
+  ret { <3 x float>, <3 x float> } %result
+}
+
+define { double, double } @test_sincospi_f64(double %a) #0 {
+; CHECK-LABEL: test_sincospi_f64:
+; CHECK:       ## %bb.0:
+; CHECK-NEXT:    subq $24, %rsp
+; CHECK-NEXT:    leaq {{[0-9]+}}(%rsp), %rdi
+; CHECK-NEXT:    leaq {{[0-9]+}}(%rsp), %rsi
+; CHECK-NEXT:    callq ___sincospi
+; CHECK-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
+; CHECK-NEXT:    movsd {{.*#+}} xmm1 = mem[0],zero
+; CHECK-NEXT:    addq $24, %rsp
+; CHECK-NEXT:    retq
+  %result = call { double, double } @llvm.sincospi.f64(double %a)
+  ret { double, double } %result
+}
+
+define { <2 x double>, <2 x double> } @test_sincospi_v2f64(<2 x double> %a) #0 
{
+; CHECK-LABEL: test_sincospi_v2f64:
+; CHECK:       ## %bb.0:
+; CHECK-NEXT:    subq $56, %rsp
+; CHECK-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill
+; CHECK-NEXT:    leaq {{[0-9]+}}(%rsp), %rdi
+; CHECK-NEXT:    leaq {{[0-9]+}}(%rsp), %rsi
+; CHECK-NEXT:    callq ___sincospi
+; CHECK-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Reload
+; CHECK-NEXT:    movhlps {{.*#+}} xmm0 = xmm0[1,1]
+; CHECK-NEXT:    leaq {{[0-9]+}}(%rsp), %rdi
+; CHECK-NEXT:    movq %rsp, %rsi
+; CHECK-NEXT:    callq ___sincospi
+; CHECK-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
+; CHECK-NEXT:    movhps {{.*#+}} xmm0 = xmm0[0,1],mem[0,1]
+; CHECK-NEXT:    movsd {{.*#+}} xmm1 = mem[0],zero
+; CHECK-NEXT:    movhps {{.*#+}} xmm1 = xmm1[0,1],mem[0,1]
+; CHECK-NEXT:    addq $56, %rsp
+; CHECK-NEXT:    retq
+  %result = call { <2 x double>, <2 x double> } @llvm.sincospi.v2f64(<2 x 
double> %a)
+  ret { <2 x double>, <2 x double> } %result
+}
+
+attributes #0 = { nounwind }

_______________________________________________
llvm-branch-commits mailing list
[email protected]
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits

Reply via email to