https://github.com/arsenm updated 
https://github.com/llvm/llvm-project/pull/168290

>From 6b6155931582b2f8924a76b268f06d9e2696d489 Mon Sep 17 00:00:00 2001
From: Matt Arsenault <[email protected]>
Date: Fri, 14 Nov 2025 21:47:44 -0800
Subject: [PATCH] DAG: Use poison for some vector result widening

---
 .../SelectionDAG/LegalizeVectorTypes.cpp      |  24 +-
 .../AArch64/sve-extract-scalable-vector.ll    |   7 -
 .../vector-constrained-fp-intrinsics.ll       | 266 +++++------
 llvm/test/CodeGen/X86/matrix-multiply.ll      |  74 +--
 .../X86/vector-constrained-fp-intrinsics.ll   | 434 +++++++++---------
 5 files changed, 399 insertions(+), 406 deletions(-)

diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
index ef53ee6df9f06..10d5f7a9b4f65 100644
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
@@ -5654,7 +5654,7 @@ SDValue DAGTypeLegalizer::WidenVecRes_Convert(SDNode *N) {
       // Widen the input and call convert on the widened input vector.
       unsigned NumConcat =
           WidenEC.getKnownMinValue() / InVTEC.getKnownMinValue();
-      SmallVector<SDValue, 16> Ops(NumConcat, DAG.getUNDEF(InVT));
+      SmallVector<SDValue, 16> Ops(NumConcat, DAG.getPOISON(InVT));
       Ops[0] = InOp;
       SDValue InVec = DAG.getNode(ISD::CONCAT_VECTORS, DL, InWidenVT, Ops);
       if (N->getNumOperands() == 1)
@@ -5673,7 +5673,7 @@ SDValue DAGTypeLegalizer::WidenVecRes_Convert(SDNode *N) {
 
   // Otherwise unroll into some nasty scalar code and rebuild the vector.
   EVT EltVT = WidenVT.getVectorElementType();
-  SmallVector<SDValue, 16> Ops(WidenEC.getFixedValue(), DAG.getUNDEF(EltVT));
+  SmallVector<SDValue, 16> Ops(WidenEC.getFixedValue(), DAG.getPOISON(EltVT));
   // Use the original element count so we don't do more scalar opts than
   // necessary.
   unsigned MinElts = N->getValueType(0).getVectorNumElements();
@@ -5756,7 +5756,7 @@ SDValue DAGTypeLegalizer::WidenVecRes_Convert_StrictFP(SDNode *N) {
   // Otherwise unroll into some nasty scalar code and rebuild the vector.
   EVT EltVT = WidenVT.getVectorElementType();
   std::array<EVT, 2> EltVTs = {{EltVT, MVT::Other}};
-  SmallVector<SDValue, 16> Ops(WidenNumElts, DAG.getUNDEF(EltVT));
+  SmallVector<SDValue, 16> Ops(WidenNumElts, DAG.getPOISON(EltVT));
   SmallVector<SDValue, 32> OpChains;
   // Use the original element count so we don't do more scalar opts than
   // necessary.
@@ -5819,7 +5819,7 @@ SDValue DAGTypeLegalizer::WidenVecRes_EXTEND_VECTOR_INREG(SDNode *N) {
   }
 
   while (Ops.size() != WidenNumElts)
-    Ops.push_back(DAG.getUNDEF(WidenSVT));
+    Ops.push_back(DAG.getPOISON(WidenSVT));
 
   return DAG.getBuildVector(WidenVT, DL, Ops);
 }
@@ -6026,7 +6026,7 @@ SDValue DAGTypeLegalizer::WidenVecRes_BITCAST(SDNode *N) {
         // input and then widening it. To avoid this, we widen the input only if
         // it results in a legal type.
         if (WidenSize % InSize == 0) {
-          SmallVector<SDValue, 16> Ops(NewNumParts, DAG.getUNDEF(InVT));
+          SmallVector<SDValue, 16> Ops(NewNumParts, DAG.getPOISON(InVT));
           Ops[0] = InOp;
 
           NewVec = DAG.getNode(ISD::CONCAT_VECTORS, dl, NewInVT, Ops);
@@ -6034,7 +6034,7 @@ SDValue DAGTypeLegalizer::WidenVecRes_BITCAST(SDNode *N) {
           SmallVector<SDValue, 16> Ops;
           DAG.ExtractVectorElements(InOp, Ops);
           Ops.append(WidenSize / InScalarSize - Ops.size(),
-                     DAG.getUNDEF(InVT.getVectorElementType()));
+                     DAG.getPOISON(InVT.getVectorElementType()));
 
           NewVec = DAG.getNode(ISD::BUILD_VECTOR, dl, NewInVT, Ops);
         }
@@ -6088,7 +6088,7 @@ SDValue DAGTypeLegalizer::WidenVecRes_CONCAT_VECTORS(SDNode *N) {
     if (WidenNumElts % NumInElts == 0) {
       // Add undef vectors to widen to correct length.
       unsigned NumConcat = WidenNumElts / NumInElts;
-      SDValue UndefVal = DAG.getUNDEF(InVT);
+      SDValue UndefVal = DAG.getPOISON(InVT);
       SmallVector<SDValue, 16> Ops(NumConcat);
       for (unsigned i=0; i < NumOperands; ++i)
         Ops[i] = N->getOperand(i);
@@ -6146,7 +6146,7 @@ SDValue DAGTypeLegalizer::WidenVecRes_CONCAT_VECTORS(SDNode *N) {
     for (unsigned j = 0; j < NumInElts; ++j)
       Ops[Idx++] = DAG.getExtractVectorElt(dl, EltVT, InOp, j);
   }
-  SDValue UndefVal = DAG.getUNDEF(EltVT);
+  SDValue UndefVal = DAG.getPOISON(EltVT);
   for (; Idx < WidenNumElts; ++Idx)
     Ops[Idx] = UndefVal;
   return DAG.getBuildVector(WidenVT, dl, Ops);
@@ -6213,7 +6213,7 @@ SDValue DAGTypeLegalizer::WidenVecRes_EXTRACT_SUBVECTOR(SDNode *N) {
         Parts.push_back(
             DAG.getExtractSubvector(dl, PartVT, InOp, IdxVal + I * GCD));
       for (; I < WidenNumElts / GCD; ++I)
-        Parts.push_back(DAG.getUNDEF(PartVT));
+        Parts.push_back(DAG.getPOISON(PartVT));
 
       return DAG.getNode(ISD::CONCAT_VECTORS, dl, WidenVT, Parts);
     }
@@ -6229,7 +6229,7 @@ SDValue DAGTypeLegalizer::WidenVecRes_EXTRACT_SUBVECTOR(SDNode *N) {
   for (i = 0; i < VTNumElts; ++i)
     Ops[i] = DAG.getExtractVectorElt(dl, EltVT, InOp, IdxVal + i);
 
-  SDValue UndefVal = DAG.getUNDEF(EltVT);
+  SDValue UndefVal = DAG.getPOISON(EltVT);
   for (; i < WidenNumElts; ++i)
     Ops[i] = UndefVal;
   return DAG.getBuildVector(WidenVT, dl, Ops);
@@ -6903,7 +6903,7 @@ SDValue DAGTypeLegalizer::WidenVecRes_VECTOR_REVERSE(SDNode *N) {
       Parts.push_back(
           DAG.getExtractSubvector(dl, PartVT, ReverseVal, IdxVal + i * GCD));
     for (; i < WidenNumElts / GCD; ++i)
-      Parts.push_back(DAG.getUNDEF(PartVT));
+      Parts.push_back(DAG.getPOISON(PartVT));
 
     return DAG.getNode(ISD::CONCAT_VECTORS, dl, WidenVT, Parts);
   }
@@ -6992,7 +6992,7 @@ SDValue DAGTypeLegalizer::WidenVecRes_STRICT_FSETCC(SDNode *N) {
   EVT TmpEltVT = LHS.getValueType().getVectorElementType();
 
   // Fully unroll and reassemble.
-  SmallVector<SDValue, 8> Scalars(WidenNumElts, DAG.getUNDEF(EltVT));
+  SmallVector<SDValue, 8> Scalars(WidenNumElts, DAG.getPOISON(EltVT));
   SmallVector<SDValue, 8> Chains(NumElts);
   for (unsigned i = 0; i != NumElts; ++i) {
     SDValue LHSElem = DAG.getExtractVectorElt(dl, TmpEltVT, LHS, i);
diff --git a/llvm/test/CodeGen/AArch64/sve-extract-scalable-vector.ll b/llvm/test/CodeGen/AArch64/sve-extract-scalable-vector.ll
index f6251ff66299e..8fc27248abac3 100644
--- a/llvm/test/CodeGen/AArch64/sve-extract-scalable-vector.ll
+++ b/llvm/test/CodeGen/AArch64/sve-extract-scalable-vector.ll
@@ -612,13 +612,6 @@ define <vscale x 14 x i8> @extract_nxv14i8_nxv28i8_14(<vscale x 28 x i8> %in) {
 ; CHECK-NEXT:    uunpkhi z3.d, z3.s
 ; CHECK-NEXT:    uzp1 z1.s, z1.s, z3.s
 ; CHECK-NEXT:    uzp1 z1.h, z2.h, z1.h
-; CHECK-NEXT:    uzp1 z1.b, z0.b, z1.b
-; CHECK-NEXT:    uunpkhi z1.h, z1.b
-; CHECK-NEXT:    uunpkhi z2.s, z1.h
-; CHECK-NEXT:    uunpklo z1.s, z1.h
-; CHECK-NEXT:    uunpklo z2.d, z2.s
-; CHECK-NEXT:    uzp1 z2.s, z2.s, z0.s
-; CHECK-NEXT:    uzp1 z1.h, z1.h, z2.h
 ; CHECK-NEXT:    uzp1 z0.b, z0.b, z1.b
 ; CHECK-NEXT:    ret
   %res = call <vscale x 14 x i8> @llvm.vector.extract.nxv14i8.nxv28i8(<vscale x 28 x i8> %in, i64 14)
diff --git a/llvm/test/CodeGen/PowerPC/vector-constrained-fp-intrinsics.ll b/llvm/test/CodeGen/PowerPC/vector-constrained-fp-intrinsics.ll
index 71c3069a406fe..08ca1d153248e 100644
--- a/llvm/test/CodeGen/PowerPC/vector-constrained-fp-intrinsics.ll
+++ b/llvm/test/CodeGen/PowerPC/vector-constrained-fp-intrinsics.ll
@@ -5286,16 +5286,16 @@ entry:
 define <3 x i32> @constrained_vector_fptosi_v3i32_v3f32(<3 x float> %x) #0 {
 ; PC64LE-LABEL: constrained_vector_fptosi_v3i32_v3f32:
 ; PC64LE:       # %bb.0: # %entry
-; PC64LE-NEXT:    xxsldwi 0, 34, 34, 3
-; PC64LE-NEXT:    xxswapd 1, 34
+; PC64LE-NEXT:    xxswapd 0, 34
+; PC64LE-NEXT:    xxsldwi 1, 34, 34, 3
 ; PC64LE-NEXT:    xscvspdpn 0, 0
 ; PC64LE-NEXT:    xscvspdpn 1, 1
 ; PC64LE-NEXT:    xxsldwi 2, 34, 34, 1
 ; PC64LE-NEXT:    xscvdpsxws 0, 0
 ; PC64LE-NEXT:    xscvdpsxws 1, 1
 ; PC64LE-NEXT:    mffprwz 3, 0
-; PC64LE-NEXT:    mtfprwz 0, 3
-; PC64LE-NEXT:    mffprwz 3, 1
+; PC64LE-NEXT:    mffprwz 4, 1
+; PC64LE-NEXT:    mtfprwz 0, 4
 ; PC64LE-NEXT:    mtfprwz 1, 3
 ; PC64LE-NEXT:    addis 3, 2, .LCPI97_0@toc@ha
 ; PC64LE-NEXT:    addi 3, 3, .LCPI97_0@toc@l
@@ -5311,25 +5311,25 @@ define <3 x i32> @constrained_vector_fptosi_v3i32_v3f32(<3 x float> %x) #0 {
 ;
 ; PC64LE9-LABEL: constrained_vector_fptosi_v3i32_v3f32:
 ; PC64LE9:       # %bb.0: # %entry
-; PC64LE9-NEXT:    xxsldwi 0, 34, 34, 3
-; PC64LE9-NEXT:    xxswapd 1, 34
+; PC64LE9-NEXT:    xxsldwi 0, 34, 34, 1
 ; PC64LE9-NEXT:    xscvspdpn 0, 0
-; PC64LE9-NEXT:    xscvspdpn 1, 1
 ; PC64LE9-NEXT:    xscvdpsxws 0, 0
-; PC64LE9-NEXT:    xscvdpsxws 1, 1
 ; PC64LE9-NEXT:    mffprwz 3, 0
-; PC64LE9-NEXT:    mtfprwz 0, 3
-; PC64LE9-NEXT:    mffprwz 3, 1
-; PC64LE9-NEXT:    mtfprwz 1, 3
-; PC64LE9-NEXT:    addis 3, 2, .LCPI97_0@toc@ha
-; PC64LE9-NEXT:    xxmrghw 35, 1, 0
-; PC64LE9-NEXT:    xxsldwi 1, 34, 34, 1
-; PC64LE9-NEXT:    addi 3, 3, .LCPI97_0@toc@l
-; PC64LE9-NEXT:    lxv 0, 0(3)
-; PC64LE9-NEXT:    xscvspdpn 1, 1
-; PC64LE9-NEXT:    xscvdpsxws 1, 1
-; PC64LE9-NEXT:    mffprwz 3, 1
+; PC64LE9-NEXT:    xxswapd 0, 34
+; PC64LE9-NEXT:    xscvspdpn 0, 0
+; PC64LE9-NEXT:    xscvdpsxws 0, 0
+; PC64LE9-NEXT:    mffprwz 4, 0
+; PC64LE9-NEXT:    xxsldwi 0, 34, 34, 3
 ; PC64LE9-NEXT:    mtvsrwz 34, 3
+; PC64LE9-NEXT:    mtfprwz 1, 4
+; PC64LE9-NEXT:    addis 4, 2, .LCPI97_0@toc@ha
+; PC64LE9-NEXT:    xscvspdpn 0, 0
+; PC64LE9-NEXT:    addi 4, 4, .LCPI97_0@toc@l
+; PC64LE9-NEXT:    xscvdpsxws 0, 0
+; PC64LE9-NEXT:    mffprwz 5, 0
+; PC64LE9-NEXT:    mtfprwz 0, 5
+; PC64LE9-NEXT:    xxmrghw 35, 1, 0
+; PC64LE9-NEXT:    lxv 0, 0(4)
 ; PC64LE9-NEXT:    xxperm 34, 35, 0
 ; PC64LE9-NEXT:    blr
 entry:
@@ -5558,11 +5558,11 @@ entry:
 define <3 x i32> @constrained_vector_fptosi_v3i32_v3f64(<3 x double> %x) #0 {
 ; PC64LE-LABEL: constrained_vector_fptosi_v3i32_v3f64:
 ; PC64LE:       # %bb.0: # %entry
-; PC64LE-NEXT:    xscvdpsxws 0, 1
-; PC64LE-NEXT:    xscvdpsxws 1, 2
+; PC64LE-NEXT:    xscvdpsxws 0, 2
+; PC64LE-NEXT:    xscvdpsxws 1, 1
 ; PC64LE-NEXT:    mffprwz 3, 0
-; PC64LE-NEXT:    mtfprwz 0, 3
-; PC64LE-NEXT:    mffprwz 3, 1
+; PC64LE-NEXT:    mffprwz 4, 1
+; PC64LE-NEXT:    mtfprwz 0, 4
 ; PC64LE-NEXT:    mtfprwz 1, 3
 ; PC64LE-NEXT:    addis 3, 2, .LCPI105_0@toc@ha
 ; PC64LE-NEXT:    addi 3, 3, .LCPI105_0@toc@l
@@ -5577,19 +5577,19 @@ define <3 x i32> @constrained_vector_fptosi_v3i32_v3f64(<3 x double> %x) #0 {
 ;
 ; PC64LE9-LABEL: constrained_vector_fptosi_v3i32_v3f64:
 ; PC64LE9:       # %bb.0: # %entry
-; PC64LE9-NEXT:    xscvdpsxws 0, 1
-; PC64LE9-NEXT:    xscvdpsxws 1, 2
+; PC64LE9-NEXT:    xscvdpsxws 0, 3
 ; PC64LE9-NEXT:    mffprwz 3, 0
-; PC64LE9-NEXT:    mtfprwz 0, 3
-; PC64LE9-NEXT:    mffprwz 3, 1
-; PC64LE9-NEXT:    mtfprwz 1, 3
-; PC64LE9-NEXT:    addis 3, 2, .LCPI105_0@toc@ha
-; PC64LE9-NEXT:    xxmrghw 35, 1, 0
-; PC64LE9-NEXT:    xscvdpsxws 1, 3
-; PC64LE9-NEXT:    addi 3, 3, .LCPI105_0@toc@l
-; PC64LE9-NEXT:    lxv 0, 0(3)
-; PC64LE9-NEXT:    mffprwz 3, 1
+; PC64LE9-NEXT:    xscvdpsxws 0, 2
 ; PC64LE9-NEXT:    mtvsrwz 34, 3
+; PC64LE9-NEXT:    mffprwz 4, 0
+; PC64LE9-NEXT:    xscvdpsxws 0, 1
+; PC64LE9-NEXT:    mtfprwz 1, 4
+; PC64LE9-NEXT:    addis 4, 2, .LCPI105_0@toc@ha
+; PC64LE9-NEXT:    addi 4, 4, .LCPI105_0@toc@l
+; PC64LE9-NEXT:    mffprwz 5, 0
+; PC64LE9-NEXT:    mtfprwz 0, 5
+; PC64LE9-NEXT:    xxmrghw 35, 1, 0
+; PC64LE9-NEXT:    lxv 0, 0(4)
 ; PC64LE9-NEXT:    xxperm 34, 35, 0
 ; PC64LE9-NEXT:    blr
 entry:
@@ -5783,16 +5783,16 @@ entry:
 define <3 x i32> @constrained_vector_fptoui_v3i32_v3f32(<3 x float> %x) #0 {
 ; PC64LE-LABEL: constrained_vector_fptoui_v3i32_v3f32:
 ; PC64LE:       # %bb.0: # %entry
-; PC64LE-NEXT:    xxsldwi 0, 34, 34, 3
-; PC64LE-NEXT:    xxswapd 1, 34
+; PC64LE-NEXT:    xxswapd 0, 34
+; PC64LE-NEXT:    xxsldwi 1, 34, 34, 3
 ; PC64LE-NEXT:    xscvspdpn 0, 0
 ; PC64LE-NEXT:    xscvspdpn 1, 1
 ; PC64LE-NEXT:    xxsldwi 2, 34, 34, 1
 ; PC64LE-NEXT:    xscvdpuxws 0, 0
 ; PC64LE-NEXT:    xscvdpuxws 1, 1
 ; PC64LE-NEXT:    mffprwz 3, 0
-; PC64LE-NEXT:    mtfprwz 0, 3
-; PC64LE-NEXT:    mffprwz 3, 1
+; PC64LE-NEXT:    mffprwz 4, 1
+; PC64LE-NEXT:    mtfprwz 0, 4
 ; PC64LE-NEXT:    mtfprwz 1, 3
 ; PC64LE-NEXT:    addis 3, 2, .LCPI113_0@toc@ha
 ; PC64LE-NEXT:    addi 3, 3, .LCPI113_0@toc@l
@@ -5808,25 +5808,25 @@ define <3 x i32> @constrained_vector_fptoui_v3i32_v3f32(<3 x float> %x) #0 {
 ;
 ; PC64LE9-LABEL: constrained_vector_fptoui_v3i32_v3f32:
 ; PC64LE9:       # %bb.0: # %entry
-; PC64LE9-NEXT:    xxsldwi 0, 34, 34, 3
-; PC64LE9-NEXT:    xxswapd 1, 34
+; PC64LE9-NEXT:    xxsldwi 0, 34, 34, 1
 ; PC64LE9-NEXT:    xscvspdpn 0, 0
-; PC64LE9-NEXT:    xscvspdpn 1, 1
 ; PC64LE9-NEXT:    xscvdpuxws 0, 0
-; PC64LE9-NEXT:    xscvdpuxws 1, 1
 ; PC64LE9-NEXT:    mffprwz 3, 0
-; PC64LE9-NEXT:    mtfprwz 0, 3
-; PC64LE9-NEXT:    mffprwz 3, 1
-; PC64LE9-NEXT:    mtfprwz 1, 3
-; PC64LE9-NEXT:    addis 3, 2, .LCPI113_0@toc@ha
-; PC64LE9-NEXT:    xxmrghw 35, 1, 0
-; PC64LE9-NEXT:    xxsldwi 1, 34, 34, 1
-; PC64LE9-NEXT:    addi 3, 3, .LCPI113_0@toc@l
-; PC64LE9-NEXT:    lxv 0, 0(3)
-; PC64LE9-NEXT:    xscvspdpn 1, 1
-; PC64LE9-NEXT:    xscvdpuxws 1, 1
-; PC64LE9-NEXT:    mffprwz 3, 1
+; PC64LE9-NEXT:    xxswapd 0, 34
+; PC64LE9-NEXT:    xscvspdpn 0, 0
+; PC64LE9-NEXT:    xscvdpuxws 0, 0
+; PC64LE9-NEXT:    mffprwz 4, 0
+; PC64LE9-NEXT:    xxsldwi 0, 34, 34, 3
 ; PC64LE9-NEXT:    mtvsrwz 34, 3
+; PC64LE9-NEXT:    mtfprwz 1, 4
+; PC64LE9-NEXT:    addis 4, 2, .LCPI113_0@toc@ha
+; PC64LE9-NEXT:    xscvspdpn 0, 0
+; PC64LE9-NEXT:    addi 4, 4, .LCPI113_0@toc@l
+; PC64LE9-NEXT:    xscvdpuxws 0, 0
+; PC64LE9-NEXT:    mffprwz 5, 0
+; PC64LE9-NEXT:    mtfprwz 0, 5
+; PC64LE9-NEXT:    xxmrghw 35, 1, 0
+; PC64LE9-NEXT:    lxv 0, 0(4)
 ; PC64LE9-NEXT:    xxperm 34, 35, 0
 ; PC64LE9-NEXT:    blr
 entry:
@@ -6054,11 +6054,11 @@ entry:
 define <3 x i32> @constrained_vector_fptoui_v3i32_v3f64(<3 x double> %x) #0 {
 ; PC64LE-LABEL: constrained_vector_fptoui_v3i32_v3f64:
 ; PC64LE:       # %bb.0: # %entry
-; PC64LE-NEXT:    xscvdpuxws 0, 1
-; PC64LE-NEXT:    xscvdpuxws 1, 2
+; PC64LE-NEXT:    xscvdpuxws 0, 2
+; PC64LE-NEXT:    xscvdpuxws 1, 1
 ; PC64LE-NEXT:    mffprwz 3, 0
-; PC64LE-NEXT:    mtfprwz 0, 3
-; PC64LE-NEXT:    mffprwz 3, 1
+; PC64LE-NEXT:    mffprwz 4, 1
+; PC64LE-NEXT:    mtfprwz 0, 4
 ; PC64LE-NEXT:    mtfprwz 1, 3
 ; PC64LE-NEXT:    addis 3, 2, .LCPI121_0@toc@ha
 ; PC64LE-NEXT:    addi 3, 3, .LCPI121_0@toc@l
@@ -6073,19 +6073,19 @@ define <3 x i32> @constrained_vector_fptoui_v3i32_v3f64(<3 x double> %x) #0 {
 ;
 ; PC64LE9-LABEL: constrained_vector_fptoui_v3i32_v3f64:
 ; PC64LE9:       # %bb.0: # %entry
-; PC64LE9-NEXT:    xscvdpuxws 0, 1
-; PC64LE9-NEXT:    xscvdpuxws 1, 2
+; PC64LE9-NEXT:    xscvdpuxws 0, 3
 ; PC64LE9-NEXT:    mffprwz 3, 0
-; PC64LE9-NEXT:    mtfprwz 0, 3
-; PC64LE9-NEXT:    mffprwz 3, 1
-; PC64LE9-NEXT:    mtfprwz 1, 3
-; PC64LE9-NEXT:    addis 3, 2, .LCPI121_0@toc@ha
-; PC64LE9-NEXT:    xxmrghw 35, 1, 0
-; PC64LE9-NEXT:    xscvdpuxws 1, 3
-; PC64LE9-NEXT:    addi 3, 3, .LCPI121_0@toc@l
-; PC64LE9-NEXT:    lxv 0, 0(3)
-; PC64LE9-NEXT:    mffprwz 3, 1
+; PC64LE9-NEXT:    xscvdpuxws 0, 2
 ; PC64LE9-NEXT:    mtvsrwz 34, 3
+; PC64LE9-NEXT:    mffprwz 4, 0
+; PC64LE9-NEXT:    xscvdpuxws 0, 1
+; PC64LE9-NEXT:    mtfprwz 1, 4
+; PC64LE9-NEXT:    addis 4, 2, .LCPI121_0@toc@ha
+; PC64LE9-NEXT:    addi 4, 4, .LCPI121_0@toc@l
+; PC64LE9-NEXT:    mffprwz 5, 0
+; PC64LE9-NEXT:    mtfprwz 0, 5
+; PC64LE9-NEXT:    xxmrghw 35, 1, 0
+; PC64LE9-NEXT:    lxv 0, 0(4)
 ; PC64LE9-NEXT:    xxperm 34, 35, 0
 ; PC64LE9-NEXT:    blr
 entry:
@@ -6269,33 +6269,33 @@ entry:
 define <3 x float> @constrained_vector_fptrunc_v3f64(<3 x double> %x) #0 {
 ; PC64LE-LABEL: constrained_vector_fptrunc_v3f64:
 ; PC64LE:       # %bb.0: # %entry
-; PC64LE-NEXT:    xsrsp 0, 1
-; PC64LE-NEXT:    xsrsp 1, 2
+; PC64LE-NEXT:    xsrsp 0, 3
+; PC64LE-NEXT:    xsrsp 2, 2
 ; PC64LE-NEXT:    addis 3, 2, .LCPI129_0@toc@ha
 ; PC64LE-NEXT:    addi 3, 3, .LCPI129_0@toc@l
-; PC64LE-NEXT:    xscvdpspn 0, 0
+; PC64LE-NEXT:    xsrsp 1, 1
 ; PC64LE-NEXT:    xscvdpspn 1, 1
-; PC64LE-NEXT:    xxmrghw 34, 1, 0
-; PC64LE-NEXT:    lxvd2x 0, 0, 3
-; PC64LE-NEXT:    xxswapd 35, 0
-; PC64LE-NEXT:    xsrsp 0, 3
+; PC64LE-NEXT:    xscvdpspn 2, 2
 ; PC64LE-NEXT:    xscvdpspn 36, 0
+; PC64LE-NEXT:    xxmrghw 34, 2, 1
+; PC64LE-NEXT:    lxvd2x 1, 0, 3
+; PC64LE-NEXT:    xxswapd 35, 1
 ; PC64LE-NEXT:    vperm 2, 4, 2, 3
 ; PC64LE-NEXT:    blr
 ;
 ; PC64LE9-LABEL: constrained_vector_fptrunc_v3f64:
 ; PC64LE9:       # %bb.0: # %entry
-; PC64LE9-NEXT:    xsrsp 0, 1
-; PC64LE9-NEXT:    xsrsp 1, 2
+; PC64LE9-NEXT:    xsrsp 0, 3
+; PC64LE9-NEXT:    xsrsp 2, 2
+; PC64LE9-NEXT:    xsrsp 1, 1
 ; PC64LE9-NEXT:    addis 3, 2, .LCPI129_0@toc@ha
 ; PC64LE9-NEXT:    addi 3, 3, .LCPI129_0@toc@l
-; PC64LE9-NEXT:    xscvdpspn 0, 0
 ; PC64LE9-NEXT:    xscvdpspn 1, 1
-; PC64LE9-NEXT:    xxmrghw 35, 1, 0
-; PC64LE9-NEXT:    xsrsp 1, 3
-; PC64LE9-NEXT:    lxv 0, 0(3)
-; PC64LE9-NEXT:    xscvdpspn 34, 1
-; PC64LE9-NEXT:    xxperm 34, 35, 0
+; PC64LE9-NEXT:    xscvdpspn 2, 2
+; PC64LE9-NEXT:    xscvdpspn 34, 0
+; PC64LE9-NEXT:    xxmrghw 35, 2, 1
+; PC64LE9-NEXT:    lxv 1, 0(3)
+; PC64LE9-NEXT:    xxperm 34, 35, 1
 ; PC64LE9-NEXT:    blr
 entry:
   %result = call <3 x float> @llvm.experimental.constrained.fptrunc.v3f32.v3f64(
@@ -7142,8 +7142,8 @@ entry:
 define <3 x float> @constrained_vector_sitofp_v3f32_v3i32(<3 x i32> %x) #0 {
 ; PC64LE-LABEL: constrained_vector_sitofp_v3f32_v3i32:
 ; PC64LE:       # %bb.0: # %entry
-; PC64LE-NEXT:    xxswapd 0, 34
-; PC64LE-NEXT:    xxsldwi 1, 34, 34, 1
+; PC64LE-NEXT:    xxsldwi 0, 34, 34, 1
+; PC64LE-NEXT:    xxswapd 1, 34
 ; PC64LE-NEXT:    mffprwz 3, 0
 ; PC64LE-NEXT:    mtfprwa 0, 3
 ; PC64LE-NEXT:    mffprwz 3, 1
@@ -7154,7 +7154,7 @@ define <3 x float> @constrained_vector_sitofp_v3f32_v3i32(<3 x i32> %x) #0 {
 ; PC64LE-NEXT:    xscvsxdsp 1, 1
 ; PC64LE-NEXT:    xscvdpspn 0, 0
 ; PC64LE-NEXT:    xscvdpspn 1, 1
-; PC64LE-NEXT:    xxmrghw 35, 1, 0
+; PC64LE-NEXT:    xxmrghw 35, 0, 1
 ; PC64LE-NEXT:    lxvd2x 0, 0, 3
 ; PC64LE-NEXT:    mfvsrwz 3, 34
 ; PC64LE-NEXT:    xxswapd 36, 0
@@ -7166,24 +7166,24 @@ define <3 x float> @constrained_vector_sitofp_v3f32_v3i32(<3 x i32> %x) #0 {
 ;
 ; PC64LE9-LABEL: constrained_vector_sitofp_v3f32_v3i32:
 ; PC64LE9:       # %bb.0: # %entry
-; PC64LE9-NEXT:    li 3, 0
+; PC64LE9-NEXT:    li 3, 4
 ; PC64LE9-NEXT:    vextuwrx 3, 3, 2
 ; PC64LE9-NEXT:    mtfprwa 0, 3
-; PC64LE9-NEXT:    li 3, 4
+; PC64LE9-NEXT:    li 3, 0
 ; PC64LE9-NEXT:    vextuwrx 3, 3, 2
 ; PC64LE9-NEXT:    xscvsxdsp 0, 0
 ; PC64LE9-NEXT:    mtfprwa 1, 3
-; PC64LE9-NEXT:    addis 3, 2, .LCPI161_0@toc@ha
+; PC64LE9-NEXT:    mfvsrwz 3, 34
 ; PC64LE9-NEXT:    xscvsxdsp 1, 1
-; PC64LE9-NEXT:    addi 3, 3, .LCPI161_0@toc@l
+; PC64LE9-NEXT:    mtfprwa 2, 3
+; PC64LE9-NEXT:    addis 3, 2, .LCPI161_0@toc@ha
+; PC64LE9-NEXT:    xscvsxdsp 2, 2
 ; PC64LE9-NEXT:    xscvdpspn 0, 0
+; PC64LE9-NEXT:    addi 3, 3, .LCPI161_0@toc@l
 ; PC64LE9-NEXT:    xscvdpspn 1, 1
-; PC64LE9-NEXT:    xxmrghw 35, 1, 0
+; PC64LE9-NEXT:    xscvdpspn 34, 2
+; PC64LE9-NEXT:    xxmrghw 35, 0, 1
 ; PC64LE9-NEXT:    lxv 0, 0(3)
-; PC64LE9-NEXT:    mfvsrwz 3, 34
-; PC64LE9-NEXT:    mtfprwa 1, 3
-; PC64LE9-NEXT:    xscvsxdsp 1, 1
-; PC64LE9-NEXT:    xscvdpspn 34, 1
 ; PC64LE9-NEXT:    xxperm 34, 35, 0
 ; PC64LE9-NEXT:    blr
 entry:
@@ -7225,15 +7225,15 @@ entry:
 define <3 x float> @constrained_vector_sitofp_v3f32_v3i64(<3 x i64> %x) #0 {
 ; PC64LE-LABEL: constrained_vector_sitofp_v3f32_v3i64:
 ; PC64LE:       # %bb.0: # %entry
-; PC64LE-NEXT:    mtfprd 0, 3
-; PC64LE-NEXT:    mtfprd 1, 4
+; PC64LE-NEXT:    mtfprd 0, 4
+; PC64LE-NEXT:    mtfprd 1, 3
 ; PC64LE-NEXT:    addis 3, 2, .LCPI163_0@toc@ha
 ; PC64LE-NEXT:    addi 3, 3, .LCPI163_0@toc@l
 ; PC64LE-NEXT:    xscvsxdsp 0, 0
 ; PC64LE-NEXT:    xscvsxdsp 1, 1
-; PC64LE-NEXT:    xscvdpspn 0, 0
 ; PC64LE-NEXT:    xscvdpspn 1, 1
-; PC64LE-NEXT:    xxmrghw 34, 1, 0
+; PC64LE-NEXT:    xscvdpspn 0, 0
+; PC64LE-NEXT:    xxmrghw 34, 0, 1
 ; PC64LE-NEXT:    lxvd2x 0, 0, 3
 ; PC64LE-NEXT:    xxswapd 35, 0
 ; PC64LE-NEXT:    mtfprd 0, 5
@@ -7244,20 +7244,20 @@ define <3 x float> @constrained_vector_sitofp_v3f32_v3i64(<3 x i64> %x) #0 {
 ;
 ; PC64LE9-LABEL: constrained_vector_sitofp_v3f32_v3i64:
 ; PC64LE9:       # %bb.0: # %entry
-; PC64LE9-NEXT:    mtfprd 0, 3
 ; PC64LE9-NEXT:    mtfprd 1, 4
+; PC64LE9-NEXT:    mtfprd 2, 3
+; PC64LE9-NEXT:    mtfprd 0, 5
 ; PC64LE9-NEXT:    addis 3, 2, .LCPI163_0@toc@ha
-; PC64LE9-NEXT:    xscvsxdsp 0, 0
 ; PC64LE9-NEXT:    xscvsxdsp 1, 1
+; PC64LE9-NEXT:    xscvsxdsp 2, 2
+; PC64LE9-NEXT:    xscvsxdsp 0, 0
 ; PC64LE9-NEXT:    addi 3, 3, .LCPI163_0@toc@l
-; PC64LE9-NEXT:    xscvdpspn 0, 0
+; PC64LE9-NEXT:    xscvdpspn 2, 2
 ; PC64LE9-NEXT:    xscvdpspn 1, 1
-; PC64LE9-NEXT:    xxmrghw 35, 1, 0
-; PC64LE9-NEXT:    mtfprd 1, 5
-; PC64LE9-NEXT:    lxv 0, 0(3)
-; PC64LE9-NEXT:    xscvsxdsp 1, 1
-; PC64LE9-NEXT:    xscvdpspn 34, 1
-; PC64LE9-NEXT:    xxperm 34, 35, 0
+; PC64LE9-NEXT:    xscvdpspn 34, 0
+; PC64LE9-NEXT:    xxmrghw 35, 1, 2
+; PC64LE9-NEXT:    lxv 1, 0(3)
+; PC64LE9-NEXT:    xxperm 34, 35, 1
 ; PC64LE9-NEXT:    blr
 entry:
   %result = call <3 x float>
@@ -7709,8 +7709,8 @@ entry:
 define <3 x float> @constrained_vector_uitofp_v3f32_v3i32(<3 x i32> %x) #0 {
 ; PC64LE-LABEL: constrained_vector_uitofp_v3f32_v3i32:
 ; PC64LE:       # %bb.0: # %entry
-; PC64LE-NEXT:    xxswapd 0, 34
-; PC64LE-NEXT:    xxsldwi 1, 34, 34, 1
+; PC64LE-NEXT:    xxsldwi 0, 34, 34, 1
+; PC64LE-NEXT:    xxswapd 1, 34
 ; PC64LE-NEXT:    mffprwz 3, 0
 ; PC64LE-NEXT:    mtfprwz 0, 3
 ; PC64LE-NEXT:    mffprwz 3, 1
@@ -7721,7 +7721,7 @@ define <3 x float> @constrained_vector_uitofp_v3f32_v3i32(<3 x i32> %x) #0 {
 ; PC64LE-NEXT:    xscvuxdsp 1, 1
 ; PC64LE-NEXT:    xscvdpspn 0, 0
 ; PC64LE-NEXT:    xscvdpspn 1, 1
-; PC64LE-NEXT:    xxmrghw 35, 1, 0
+; PC64LE-NEXT:    xxmrghw 35, 0, 1
 ; PC64LE-NEXT:    lxvd2x 0, 0, 3
 ; PC64LE-NEXT:    mfvsrwz 3, 34
 ; PC64LE-NEXT:    xxswapd 36, 0
@@ -7733,24 +7733,24 @@ define <3 x float> @constrained_vector_uitofp_v3f32_v3i32(<3 x i32> %x) #0 {
 ;
 ; PC64LE9-LABEL: constrained_vector_uitofp_v3f32_v3i32:
 ; PC64LE9:       # %bb.0: # %entry
-; PC64LE9-NEXT:    li 3, 0
+; PC64LE9-NEXT:    li 3, 4
 ; PC64LE9-NEXT:    vextuwrx 3, 3, 2
 ; PC64LE9-NEXT:    mtfprwz 0, 3
-; PC64LE9-NEXT:    li 3, 4
+; PC64LE9-NEXT:    li 3, 0
 ; PC64LE9-NEXT:    vextuwrx 3, 3, 2
 ; PC64LE9-NEXT:    xscvuxdsp 0, 0
 ; PC64LE9-NEXT:    mtfprwz 1, 3
-; PC64LE9-NEXT:    addis 3, 2, .LCPI179_0@toc@ha
+; PC64LE9-NEXT:    mfvsrwz 3, 34
 ; PC64LE9-NEXT:    xscvuxdsp 1, 1
-; PC64LE9-NEXT:    addi 3, 3, .LCPI179_0@toc@l
+; PC64LE9-NEXT:    mtfprwz 2, 3
+; PC64LE9-NEXT:    addis 3, 2, .LCPI179_0@toc@ha
+; PC64LE9-NEXT:    xscvuxdsp 2, 2
 ; PC64LE9-NEXT:    xscvdpspn 0, 0
+; PC64LE9-NEXT:    addi 3, 3, .LCPI179_0@toc@l
 ; PC64LE9-NEXT:    xscvdpspn 1, 1
-; PC64LE9-NEXT:    xxmrghw 35, 1, 0
+; PC64LE9-NEXT:    xscvdpspn 34, 2
+; PC64LE9-NEXT:    xxmrghw 35, 0, 1
 ; PC64LE9-NEXT:    lxv 0, 0(3)
-; PC64LE9-NEXT:    mfvsrwz 3, 34
-; PC64LE9-NEXT:    mtfprwz 1, 3
-; PC64LE9-NEXT:    xscvuxdsp 1, 1
-; PC64LE9-NEXT:    xscvdpspn 34, 1
 ; PC64LE9-NEXT:    xxperm 34, 35, 0
 ; PC64LE9-NEXT:    blr
 entry:
@@ -7792,15 +7792,15 @@ entry:
 define <3 x float> @constrained_vector_uitofp_v3f32_v3i64(<3 x i64> %x) #0 {
 ; PC64LE-LABEL: constrained_vector_uitofp_v3f32_v3i64:
 ; PC64LE:       # %bb.0: # %entry
-; PC64LE-NEXT:    mtfprd 0, 3
-; PC64LE-NEXT:    mtfprd 1, 4
+; PC64LE-NEXT:    mtfprd 0, 4
+; PC64LE-NEXT:    mtfprd 1, 3
 ; PC64LE-NEXT:    addis 3, 2, .LCPI181_0@toc@ha
 ; PC64LE-NEXT:    addi 3, 3, .LCPI181_0@toc@l
 ; PC64LE-NEXT:    xscvuxdsp 0, 0
 ; PC64LE-NEXT:    xscvuxdsp 1, 1
-; PC64LE-NEXT:    xscvdpspn 0, 0
 ; PC64LE-NEXT:    xscvdpspn 1, 1
-; PC64LE-NEXT:    xxmrghw 34, 1, 0
+; PC64LE-NEXT:    xscvdpspn 0, 0
+; PC64LE-NEXT:    xxmrghw 34, 0, 1
 ; PC64LE-NEXT:    lxvd2x 0, 0, 3
 ; PC64LE-NEXT:    xxswapd 35, 0
 ; PC64LE-NEXT:    mtfprd 0, 5
@@ -7811,20 +7811,20 @@ define <3 x float> @constrained_vector_uitofp_v3f32_v3i64(<3 x i64> %x) #0 {
 ;
 ; PC64LE9-LABEL: constrained_vector_uitofp_v3f32_v3i64:
 ; PC64LE9:       # %bb.0: # %entry
-; PC64LE9-NEXT:    mtfprd 0, 3
 ; PC64LE9-NEXT:    mtfprd 1, 4
+; PC64LE9-NEXT:    mtfprd 2, 3
+; PC64LE9-NEXT:    mtfprd 0, 5
 ; PC64LE9-NEXT:    addis 3, 2, .LCPI181_0@toc@ha
-; PC64LE9-NEXT:    xscvuxdsp 0, 0
 ; PC64LE9-NEXT:    xscvuxdsp 1, 1
+; PC64LE9-NEXT:    xscvuxdsp 2, 2
+; PC64LE9-NEXT:    xscvuxdsp 0, 0
 ; PC64LE9-NEXT:    addi 3, 3, .LCPI181_0@toc@l
-; PC64LE9-NEXT:    xscvdpspn 0, 0
+; PC64LE9-NEXT:    xscvdpspn 2, 2
 ; PC64LE9-NEXT:    xscvdpspn 1, 1
-; PC64LE9-NEXT:    xxmrghw 35, 1, 0
-; PC64LE9-NEXT:    mtfprd 1, 5
-; PC64LE9-NEXT:    lxv 0, 0(3)
-; PC64LE9-NEXT:    xscvuxdsp 1, 1
-; PC64LE9-NEXT:    xscvdpspn 34, 1
-; PC64LE9-NEXT:    xxperm 34, 35, 0
+; PC64LE9-NEXT:    xscvdpspn 34, 0
+; PC64LE9-NEXT:    xxmrghw 35, 1, 2
+; PC64LE9-NEXT:    lxv 1, 0(3)
+; PC64LE9-NEXT:    xxperm 34, 35, 1
 ; PC64LE9-NEXT:    blr
 entry:
   %result = call <3 x float>
diff --git a/llvm/test/CodeGen/X86/matrix-multiply.ll b/llvm/test/CodeGen/X86/matrix-multiply.ll
index 388d8528a2b80..f38b769fe4987 100644
--- a/llvm/test/CodeGen/X86/matrix-multiply.ll
+++ b/llvm/test/CodeGen/X86/matrix-multiply.ll
@@ -368,46 +368,47 @@ define <9 x float> @test_mul3x3_f32(<9 x float> %a0, <9 x float> %a1) nounwind {
 ; AVX512F-NEXT:    vaddps %xmm4, %xmm9, %xmm9
 ; AVX512F-NEXT:    vshufpd {{.*#+}} xmm4 = xmm0[1,0]
 ; AVX512F-NEXT:    vmulss %xmm1, %xmm4, %xmm10
-; AVX512F-NEXT:    vmovshdup {{.*#+}} xmm5 = xmm5[1,1,3,3]
-; AVX512F-NEXT:    vmulss %xmm6, %xmm5, %xmm6
-; AVX512F-NEXT:    vaddss %xmm6, %xmm10, %xmm6
-; AVX512F-NEXT:    vextractf32x4 $2, %zmm0, %xmm10
-; AVX512F-NEXT:    vmulss %xmm8, %xmm10, %xmm8
-; AVX512F-NEXT:    vaddss %xmm6, %xmm8, %xmm6
-; AVX512F-NEXT:    vinsertps {{.*#+}} xmm6 = xmm9[0,1],xmm6[0],xmm9[3]
+; AVX512F-NEXT:    vmovshdup {{.*#+}} xmm11 = xmm5[1,1,3,3]
+; AVX512F-NEXT:    vmulss %xmm6, %xmm11, %xmm5
+; AVX512F-NEXT:    vaddss %xmm5, %xmm10, %xmm5
+; AVX512F-NEXT:    vextractf32x4 $2, %zmm0, %xmm6
+; AVX512F-NEXT:    vmulss %xmm6, %xmm8, %xmm8
+; AVX512F-NEXT:    vaddss %xmm5, %xmm8, %xmm5
+; AVX512F-NEXT:    vinsertps {{.*#+}} xmm5 = xmm9[0,1],xmm5[0],xmm9[3]
 ; AVX512F-NEXT:    vmulps %xmm7, %xmm0, %xmm8
 ; AVX512F-NEXT:    vextractf128 $1, %ymm1, %xmm9
-; AVX512F-NEXT:    vmovsldup {{.*#+}} xmm11 = xmm9[0,0,2,2]
-; AVX512F-NEXT:    vmulps %xmm2, %xmm11, %xmm11
-; AVX512F-NEXT:    vaddps %xmm11, %xmm8, %xmm8
-; AVX512F-NEXT:    vmovshdup {{.*#+}} xmm11 = xmm9[1,1,3,3]
-; AVX512F-NEXT:    vmulps %xmm3, %xmm11, %xmm12
+; AVX512F-NEXT:    vmovsldup {{.*#+}} xmm10 = xmm9[0,0,2,2]
+; AVX512F-NEXT:    vmulps %xmm2, %xmm10, %xmm10
+; AVX512F-NEXT:    vaddps %xmm10, %xmm8, %xmm8
+; AVX512F-NEXT:    vmovshdup {{.*#+}} xmm10 = xmm9[1,1,3,3]
+; AVX512F-NEXT:    vmulps %xmm3, %xmm10, %xmm12
 ; AVX512F-NEXT:    vaddps %xmm12, %xmm8, %xmm8
 ; AVX512F-NEXT:    vmulss %xmm7, %xmm4, %xmm7
-; AVX512F-NEXT:    vmulss %xmm5, %xmm9, %xmm12
+; AVX512F-NEXT:    vmulss %xmm9, %xmm11, %xmm12
 ; AVX512F-NEXT:    vaddss %xmm7, %xmm12, %xmm7
-; AVX512F-NEXT:    vmulss %xmm11, %xmm10, %xmm11
-; AVX512F-NEXT:    vaddss %xmm7, %xmm11, %xmm7
-; AVX512F-NEXT:    vinsertps {{.*#+}} xmm7 = xmm8[0,1],xmm7[0],xmm8[3]
-; AVX512F-NEXT:    vshufps {{.*#+}} xmm8 = xmm9[3,3,3,3]
-; AVX512F-NEXT:    vshufpd {{.*#+}} xmm11 = xmm9[1,0]
+; AVX512F-NEXT:    vmulss %xmm6, %xmm10, %xmm10
+; AVX512F-NEXT:    vaddss %xmm7, %xmm10, %xmm7
+; AVX512F-NEXT:    vshufps {{.*#+}} xmm10 = xmm9[3,3,3,3]
+; AVX512F-NEXT:    vshufpd {{.*#+}} xmm12 = xmm9[1,0]
 ; AVX512F-NEXT:    vshufps {{.*#+}} xmm9 = xmm9[2,2,2,2]
 ; AVX512F-NEXT:    vmulps %xmm0, %xmm9, %xmm0
-; AVX512F-NEXT:    vmulps %xmm2, %xmm8, %xmm2
+; AVX512F-NEXT:    vmulps %xmm2, %xmm10, %xmm2
 ; AVX512F-NEXT:    vaddps %xmm2, %xmm0, %xmm0
 ; AVX512F-NEXT:    vextractf32x4 $2, %zmm1, %xmm1
 ; AVX512F-NEXT:    vbroadcastss %xmm1, %xmm2
 ; AVX512F-NEXT:    vmulps %xmm2, %xmm3, %xmm2
 ; AVX512F-NEXT:    vaddps %xmm2, %xmm0, %xmm0
-; AVX512F-NEXT:    vmulss %xmm4, %xmm11, %xmm2
-; AVX512F-NEXT:    vmulss %xmm5, %xmm8, %xmm3
+; AVX512F-NEXT:    vmulss %xmm4, %xmm12, %xmm2
+; AVX512F-NEXT:    vmulss %xmm10, %xmm11, %xmm3
 ; AVX512F-NEXT:    vaddss %xmm3, %xmm2, %xmm2
-; AVX512F-NEXT:    vmulss %xmm1, %xmm10, %xmm1
+; AVX512F-NEXT:    vmulss %xmm1, %xmm6, %xmm1
 ; AVX512F-NEXT:    vaddss %xmm1, %xmm2, %xmm1
-; AVX512F-NEXT:    vinsertps {{.*#+}} xmm1 = xmm0[0,1],xmm1[0],xmm0[3]
-; AVX512F-NEXT:    vinsertf32x4 $1, %xmm7, %zmm6, %zmm2
-; AVX512F-NEXT:    vpmovsxbd {{.*#+}} zmm0 = [0,1,2,4,5,6,16,17,18,0,0,0,0,0,0,0]
-; AVX512F-NEXT:    vpermi2ps %zmm1, %zmm2, %zmm0
+; AVX512F-NEXT:    vmovshdup {{.*#+}} xmm2 = xmm8[1,1,3,3]
+; AVX512F-NEXT:    vinsertps {{.*#+}} xmm2 = xmm2[0],xmm7[0],xmm2[2,3]
+; AVX512F-NEXT:    vmovlhps {{.*#+}} xmm0 = xmm2[0],xmm0[0]
+; AVX512F-NEXT:    vinsertps {{.*#+}} xmm2 = xmm5[0,1,2],xmm8[0]
+; AVX512F-NEXT:    vinsertf128 $1, %xmm0, %ymm2, %ymm0
+; AVX512F-NEXT:    vinsertf32x4 $2, %xmm1, %zmm0, %zmm0
 ; AVX512F-NEXT:    retq
 ;
 ; AVX512VL-LABEL: test_mul3x3_f32:
@@ -447,26 +448,27 @@ define <9 x float> @test_mul3x3_f32(<9 x float> %a0, <9 x float> %a1) nounwind {
 ; AVX512VL-NEXT:    vaddss %xmm7, %xmm12, %xmm7
 ; AVX512VL-NEXT:    vmulss %xmm11, %xmm10, %xmm11
 ; AVX512VL-NEXT:    vaddss %xmm7, %xmm11, %xmm7
-; AVX512VL-NEXT:    vinsertps {{.*#+}} xmm5 = xmm5[0,1],xmm7[0],xmm5[3]
-; AVX512VL-NEXT:    vshufps {{.*#+}} xmm7 = xmm8[3,3,3,3]
-; AVX512VL-NEXT:    vshufpd {{.*#+}} xmm11 = xmm8[1,0]
+; AVX512VL-NEXT:    vshufps {{.*#+}} xmm11 = xmm8[3,3,3,3]
+; AVX512VL-NEXT:    vshufpd {{.*#+}} xmm12 = xmm8[1,0]
 ; AVX512VL-NEXT:    vshufps {{.*#+}} xmm8 = xmm8[2,2,2,2]
 ; AVX512VL-NEXT:    vmulps %xmm0, %xmm8, %xmm0
-; AVX512VL-NEXT:    vmulps %xmm7, %xmm2, %xmm2
+; AVX512VL-NEXT:    vmulps %xmm2, %xmm11, %xmm2
 ; AVX512VL-NEXT:    vaddps %xmm2, %xmm0, %xmm0
 ; AVX512VL-NEXT:    vextractf32x4 $2, %zmm1, %xmm1
 ; AVX512VL-NEXT:    vbroadcastss %xmm1, %xmm2
 ; AVX512VL-NEXT:    vmulps %xmm2, %xmm6, %xmm2
 ; AVX512VL-NEXT:    vaddps %xmm2, %xmm0, %xmm0
-; AVX512VL-NEXT:    vmulss %xmm11, %xmm9, %xmm2
-; AVX512VL-NEXT:    vmulss %xmm7, %xmm4, %xmm4
+; AVX512VL-NEXT:    vmulss %xmm12, %xmm9, %xmm2
+; AVX512VL-NEXT:    vmulss %xmm4, %xmm11, %xmm4
 ; AVX512VL-NEXT:    vaddss %xmm4, %xmm2, %xmm2
 ; AVX512VL-NEXT:    vmulss %xmm1, %xmm10, %xmm1
 ; AVX512VL-NEXT:    vaddss %xmm1, %xmm2, %xmm1
-; AVX512VL-NEXT:    vinsertps {{.*#+}} xmm1 = xmm0[0,1],xmm1[0],xmm0[3]
-; AVX512VL-NEXT:    vinsertf32x4 $1, %xmm5, %zmm3, %zmm2
-; AVX512VL-NEXT:    vpmovsxbd {{.*#+}} zmm0 = [0,1,2,4,5,6,16,17,18,0,0,0,0,0,0,0]
-; AVX512VL-NEXT:    vpermi2ps %zmm1, %zmm2, %zmm0
+; AVX512VL-NEXT:    vmovshdup {{.*#+}} xmm2 = xmm5[1,1,3,3]
+; AVX512VL-NEXT:    vinsertps {{.*#+}} xmm2 = xmm2[0],xmm7[0],xmm2[2,3]
+; AVX512VL-NEXT:    vmovlhps {{.*#+}} xmm0 = xmm2[0],xmm0[0]
+; AVX512VL-NEXT:    vinsertps {{.*#+}} xmm2 = xmm3[0,1,2],xmm5[0]
+; AVX512VL-NEXT:    vinsertf128 $1, %xmm0, %ymm2, %ymm0
+; AVX512VL-NEXT:    vinsertf32x4 $2, %xmm1, %zmm0, %zmm0
 ; AVX512VL-NEXT:    retq
 entry:
   %block = shufflevector <9 x float> %a0, <9 x float> poison, <2 x i32> <i32 0, i32 1>
diff --git a/llvm/test/CodeGen/X86/vector-constrained-fp-intrinsics.ll b/llvm/test/CodeGen/X86/vector-constrained-fp-intrinsics.ll
index 4a5b4277c3cca..88d3ad181d766 100644
--- a/llvm/test/CodeGen/X86/vector-constrained-fp-intrinsics.ll
+++ b/llvm/test/CodeGen/X86/vector-constrained-fp-intrinsics.ll
@@ -4143,11 +4143,11 @@ define <3 x i32> @constrained_vector_fptosi_v3i32_v3f32() #0 {
 ; CHECK-LABEL: constrained_vector_fptosi_v3i32_v3f32:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    cvttss2si {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %eax
-; CHECK-NEXT:    movd %eax, %xmm1
-; CHECK-NEXT:    cvttss2si {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %eax
-; CHECK-NEXT:    movd %eax, %xmm0
+; CHECK-NEXT:    cvttss2si {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ecx
+; CHECK-NEXT:    cvttss2si {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %edx
+; CHECK-NEXT:    movd %edx, %xmm1
+; CHECK-NEXT:    movd %ecx, %xmm0
 ; CHECK-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; CHECK-NEXT:    cvttss2si {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %eax
 ; CHECK-NEXT:    movd %eax, %xmm1
 ; CHECK-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
 ; CHECK-NEXT:    retq
@@ -4155,10 +4155,10 @@ define <3 x i32> @constrained_vector_fptosi_v3i32_v3f32() #0 {
 ; AVX-LABEL: constrained_vector_fptosi_v3i32_v3f32:
 ; AVX:       # %bb.0: # %entry
 ; AVX-NEXT:    vcvttss2si {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %eax
-; AVX-NEXT:    vmovd %eax, %xmm0
-; AVX-NEXT:    vcvttss2si {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %eax
-; AVX-NEXT:    vpinsrd $1, %eax, %xmm0, %xmm0
-; AVX-NEXT:    vcvttss2si {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %eax
+; AVX-NEXT:    vcvttss2si {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ecx
+; AVX-NEXT:    vcvttss2si {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %edx
+; AVX-NEXT:    vmovd %edx, %xmm0
+; AVX-NEXT:    vpinsrd $1, %ecx, %xmm0, %xmm0
 ; AVX-NEXT:    vpinsrd $2, %eax, %xmm0, %xmm0
 ; AVX-NEXT:    retq
 entry:
@@ -4256,11 +4256,11 @@ define <3 x i64> @constrained_vector_fptosi_v3i64_v3f32() #0 {
 ; AVX1-LABEL: constrained_vector_fptosi_v3i64_v3f32:
 ; AVX1:       # %bb.0: # %entry
 ; AVX1-NEXT:    vcvttss2si {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %rax
-; AVX1-NEXT:    vmovq %rax, %xmm0
-; AVX1-NEXT:    vcvttss2si {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %rax
-; AVX1-NEXT:    vmovq %rax, %xmm1
+; AVX1-NEXT:    vcvttss2si {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %rcx
+; AVX1-NEXT:    vcvttss2si {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %rdx
+; AVX1-NEXT:    vmovq %rdx, %xmm0
+; AVX1-NEXT:    vmovq %rcx, %xmm1
 ; AVX1-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
-; AVX1-NEXT:    vcvttss2si {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %rax
 ; AVX1-NEXT:    vmovq %rax, %xmm1
 ; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
 ; AVX1-NEXT:    retq
@@ -4268,11 +4268,11 @@ define <3 x i64> @constrained_vector_fptosi_v3i64_v3f32() #0 {
 ; AVX512-LABEL: constrained_vector_fptosi_v3i64_v3f32:
 ; AVX512:       # %bb.0: # %entry
 ; AVX512-NEXT:    vcvttss2si {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %rax
-; AVX512-NEXT:    vmovq %rax, %xmm0
-; AVX512-NEXT:    vcvttss2si {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %rax
-; AVX512-NEXT:    vmovq %rax, %xmm1
+; AVX512-NEXT:    vcvttss2si {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %rcx
+; AVX512-NEXT:    vcvttss2si {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %rdx
+; AVX512-NEXT:    vmovq %rdx, %xmm0
+; AVX512-NEXT:    vmovq %rcx, %xmm1
 ; AVX512-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
-; AVX512-NEXT:    vcvttss2si {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %rax
 ; AVX512-NEXT:    vmovq %rax, %xmm1
 ; AVX512-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
 ; AVX512-NEXT:    retq
@@ -4382,11 +4382,11 @@ define <3 x i32> @constrained_vector_fptosi_v3i32_v3f64() #0 {
 ; CHECK-LABEL: constrained_vector_fptosi_v3i32_v3f64:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    cvttsd2si {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %eax
-; CHECK-NEXT:    movd %eax, %xmm1
-; CHECK-NEXT:    cvttsd2si {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %eax
-; CHECK-NEXT:    movd %eax, %xmm0
+; CHECK-NEXT:    cvttsd2si {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ecx
+; CHECK-NEXT:    cvttsd2si {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %edx
+; CHECK-NEXT:    movd %edx, %xmm1
+; CHECK-NEXT:    movd %ecx, %xmm0
 ; CHECK-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; CHECK-NEXT:    cvttsd2si {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %eax
 ; CHECK-NEXT:    movd %eax, %xmm1
 ; CHECK-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
 ; CHECK-NEXT:    retq
@@ -4394,10 +4394,10 @@ define <3 x i32> @constrained_vector_fptosi_v3i32_v3f64() #0 {
 ; AVX-LABEL: constrained_vector_fptosi_v3i32_v3f64:
 ; AVX:       # %bb.0: # %entry
 ; AVX-NEXT:    vcvttsd2si {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %eax
-; AVX-NEXT:    vmovd %eax, %xmm0
-; AVX-NEXT:    vcvttsd2si {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %eax
-; AVX-NEXT:    vpinsrd $1, %eax, %xmm0, %xmm0
-; AVX-NEXT:    vcvttsd2si {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %eax
+; AVX-NEXT:    vcvttsd2si {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ecx
+; AVX-NEXT:    vcvttsd2si {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %edx
+; AVX-NEXT:    vmovd %edx, %xmm0
+; AVX-NEXT:    vpinsrd $1, %ecx, %xmm0, %xmm0
 ; AVX-NEXT:    vpinsrd $2, %eax, %xmm0, %xmm0
 ; AVX-NEXT:    retq
 entry:
@@ -4498,11 +4498,11 @@ define <3 x i64> @constrained_vector_fptosi_v3i64_v3f64() #0 {
 ; AVX1-LABEL: constrained_vector_fptosi_v3i64_v3f64:
 ; AVX1:       # %bb.0: # %entry
 ; AVX1-NEXT:    vcvttsd2si {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %rax
-; AVX1-NEXT:    vmovq %rax, %xmm0
-; AVX1-NEXT:    vcvttsd2si {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %rax
-; AVX1-NEXT:    vmovq %rax, %xmm1
+; AVX1-NEXT:    vcvttsd2si {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %rcx
+; AVX1-NEXT:    vcvttsd2si {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %rdx
+; AVX1-NEXT:    vmovq %rdx, %xmm0
+; AVX1-NEXT:    vmovq %rcx, %xmm1
 ; AVX1-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
-; AVX1-NEXT:    vcvttsd2si {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %rax
 ; AVX1-NEXT:    vmovq %rax, %xmm1
 ; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
 ; AVX1-NEXT:    retq
@@ -4510,11 +4510,11 @@ define <3 x i64> @constrained_vector_fptosi_v3i64_v3f64() #0 {
 ; AVX512-LABEL: constrained_vector_fptosi_v3i64_v3f64:
 ; AVX512:       # %bb.0: # %entry
 ; AVX512-NEXT:    vcvttsd2si {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %rax
-; AVX512-NEXT:    vmovq %rax, %xmm0
-; AVX512-NEXT:    vcvttsd2si {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %rax
-; AVX512-NEXT:    vmovq %rax, %xmm1
+; AVX512-NEXT:    vcvttsd2si {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %rcx
+; AVX512-NEXT:    vcvttsd2si {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %rdx
+; AVX512-NEXT:    vmovq %rdx, %xmm0
+; AVX512-NEXT:    vmovq %rcx, %xmm1
 ; AVX512-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
-; AVX512-NEXT:    vcvttsd2si {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %rax
 ; AVX512-NEXT:    vmovq %rax, %xmm1
 ; AVX512-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
 ; AVX512-NEXT:    retq
@@ -4645,11 +4645,11 @@ define <3 x i32> @constrained_vector_fptoui_v3i32_v3f32() #0 {
 ; CHECK-LABEL: constrained_vector_fptoui_v3i32_v3f32:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    cvttss2si {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %rax
-; CHECK-NEXT:    movd %eax, %xmm1
-; CHECK-NEXT:    cvttss2si {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %rax
-; CHECK-NEXT:    movd %eax, %xmm0
+; CHECK-NEXT:    cvttss2si {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %rcx
+; CHECK-NEXT:    cvttss2si {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %rdx
+; CHECK-NEXT:    movd %edx, %xmm1
+; CHECK-NEXT:    movd %ecx, %xmm0
 ; CHECK-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; CHECK-NEXT:    cvttss2si {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %rax
 ; CHECK-NEXT:    movd %eax, %xmm1
 ; CHECK-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
 ; CHECK-NEXT:    retq
@@ -4658,19 +4658,19 @@ define <3 x i32> @constrained_vector_fptoui_v3i32_v3f32() #0 {
 ; AVX1:       # %bb.0: # %entry
 ; AVX1-NEXT:    vcvttss2si {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %rax
 ; AVX1-NEXT:    vcvttss2si {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %rcx
-; AVX1-NEXT:    vmovd %ecx, %xmm0
-; AVX1-NEXT:    vpinsrd $1, %eax, %xmm0, %xmm0
-; AVX1-NEXT:    vcvttss2si {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %rax
+; AVX1-NEXT:    vcvttss2si {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %rdx
+; AVX1-NEXT:    vmovd %edx, %xmm0
+; AVX1-NEXT:    vpinsrd $1, %ecx, %xmm0, %xmm0
 ; AVX1-NEXT:    vpinsrd $2, %eax, %xmm0, %xmm0
 ; AVX1-NEXT:    retq
 ;
 ; AVX512-LABEL: constrained_vector_fptoui_v3i32_v3f32:
 ; AVX512:       # %bb.0: # %entry
 ; AVX512-NEXT:    vcvttss2usi {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %eax
-; AVX512-NEXT:    vmovd %eax, %xmm0
-; AVX512-NEXT:    vcvttss2usi {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %eax
-; AVX512-NEXT:    vpinsrd $1, %eax, %xmm0, %xmm0
-; AVX512-NEXT:    vcvttss2usi {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %eax
+; AVX512-NEXT:    vcvttss2usi {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ecx
+; AVX512-NEXT:    vcvttss2usi {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %edx
+; AVX512-NEXT:    vmovd %edx, %xmm0
+; AVX512-NEXT:    vpinsrd $1, %ecx, %xmm0, %xmm0
 ; AVX512-NEXT:    vpinsrd $2, %eax, %xmm0, %xmm0
 ; AVX512-NEXT:    retq
 entry:
@@ -4911,7 +4911,7 @@ define <3 x i64> @constrained_vector_fptoui_v3i64_v3f32() #0 {
 ;
 ; AVX1-LABEL: constrained_vector_fptoui_v3i64_v3f32:
 ; AVX1:       # %bb.0: # %entry
-; AVX1-NEXT:    vmovss {{.*#+}} xmm2 = [4.3E+1,0.0E+0,0.0E+0,0.0E+0]
+; AVX1-NEXT:    vmovss {{.*#+}} xmm2 = [4.4E+1,0.0E+0,0.0E+0,0.0E+0]
 ; AVX1-NEXT:    vmovss {{.*#+}} xmm0 = [9.22337203E+18,0.0E+0,0.0E+0,0.0E+0]
 ; AVX1-NEXT:    vcomiss %xmm2, %xmm0
 ; AVX1-NEXT:    vxorps %xmm1, %xmm1, %xmm1
@@ -4921,51 +4921,51 @@ define <3 x i64> @constrained_vector_fptoui_v3i64_v3f32() #0 {
 ; AVX1-NEXT:    vmovaps %xmm0, %xmm3
 ; AVX1-NEXT:  .LBB123_2: # %entry
 ; AVX1-NEXT:    vsubss %xmm3, %xmm2, %xmm2
-; AVX1-NEXT:    vcvttss2si %xmm2, %rax
-; AVX1-NEXT:    setbe %cl
-; AVX1-NEXT:    movzbl %cl, %ecx
-; AVX1-NEXT:    shlq $63, %rcx
-; AVX1-NEXT:    xorq %rax, %rcx
-; AVX1-NEXT:    vmovq %rcx, %xmm2
-; AVX1-NEXT:    vmovss {{.*#+}} xmm3 = [4.2E+1,0.0E+0,0.0E+0,0.0E+0]
-; AVX1-NEXT:    vcomiss %xmm3, %xmm0
-; AVX1-NEXT:    vxorps %xmm4, %xmm4, %xmm4
+; AVX1-NEXT:    vcvttss2si %xmm2, %rcx
+; AVX1-NEXT:    setbe %al
+; AVX1-NEXT:    movzbl %al, %eax
+; AVX1-NEXT:    shlq $63, %rax
+; AVX1-NEXT:    xorq %rcx, %rax
+; AVX1-NEXT:    vmovss {{.*#+}} xmm2 = [4.2E+1,0.0E+0,0.0E+0,0.0E+0]
+; AVX1-NEXT:    vcomiss %xmm2, %xmm0
+; AVX1-NEXT:    vxorps %xmm3, %xmm3, %xmm3
 ; AVX1-NEXT:    ja .LBB123_4
 ; AVX1-NEXT:  # %bb.3: # %entry
-; AVX1-NEXT:    vmovaps %xmm0, %xmm4
+; AVX1-NEXT:    vmovaps %xmm0, %xmm3
 ; AVX1-NEXT:  .LBB123_4: # %entry
-; AVX1-NEXT:    vsubss %xmm4, %xmm3, %xmm3
-; AVX1-NEXT:    vcvttss2si %xmm3, %rax
+; AVX1-NEXT:    vsubss %xmm3, %xmm2, %xmm2
+; AVX1-NEXT:    vcvttss2si %xmm2, %rdx
 ; AVX1-NEXT:    setbe %cl
 ; AVX1-NEXT:    movzbl %cl, %ecx
 ; AVX1-NEXT:    shlq $63, %rcx
-; AVX1-NEXT:    xorq %rax, %rcx
-; AVX1-NEXT:    vmovq %rcx, %xmm3
-; AVX1-NEXT:    vpunpcklqdq {{.*#+}} xmm2 = xmm3[0],xmm2[0]
-; AVX1-NEXT:    vmovss {{.*#+}} xmm3 = [4.4E+1,0.0E+0,0.0E+0,0.0E+0]
-; AVX1-NEXT:    vcomiss %xmm3, %xmm0
+; AVX1-NEXT:    xorq %rdx, %rcx
+; AVX1-NEXT:    vmovss {{.*#+}} xmm2 = [4.3E+1,0.0E+0,0.0E+0,0.0E+0]
+; AVX1-NEXT:    vcomiss %xmm2, %xmm0
 ; AVX1-NEXT:    ja .LBB123_6
 ; AVX1-NEXT:  # %bb.5: # %entry
 ; AVX1-NEXT:    vmovaps %xmm0, %xmm1
 ; AVX1-NEXT:  .LBB123_6: # %entry
-; AVX1-NEXT:    vsubss %xmm1, %xmm3, %xmm0
-; AVX1-NEXT:    vcvttss2si %xmm0, %rax
-; AVX1-NEXT:    setbe %cl
-; AVX1-NEXT:    movzbl %cl, %ecx
-; AVX1-NEXT:    shlq $63, %rcx
-; AVX1-NEXT:    xorq %rax, %rcx
-; AVX1-NEXT:    vmovq %rcx, %xmm0
-; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm2, %ymm0
+; AVX1-NEXT:    vsubss %xmm1, %xmm2, %xmm0
+; AVX1-NEXT:    vcvttss2si %xmm0, %rdx
+; AVX1-NEXT:    setbe %sil
+; AVX1-NEXT:    movzbl %sil, %esi
+; AVX1-NEXT:    shlq $63, %rsi
+; AVX1-NEXT:    xorq %rdx, %rsi
+; AVX1-NEXT:    vmovq %rsi, %xmm0
+; AVX1-NEXT:    vmovq %rcx, %xmm1
+; AVX1-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
+; AVX1-NEXT:    vmovq %rax, %xmm1
+; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
 ; AVX1-NEXT:    retq
 ;
 ; AVX512-LABEL: constrained_vector_fptoui_v3i64_v3f32:
 ; AVX512:       # %bb.0: # %entry
 ; AVX512-NEXT:    vcvttss2usi {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %rax
-; AVX512-NEXT:    vmovq %rax, %xmm0
-; AVX512-NEXT:    vcvttss2usi {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %rax
-; AVX512-NEXT:    vmovq %rax, %xmm1
+; AVX512-NEXT:    vcvttss2usi {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %rcx
+; AVX512-NEXT:    vcvttss2usi {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %rdx
+; AVX512-NEXT:    vmovq %rdx, %xmm0
+; AVX512-NEXT:    vmovq %rcx, %xmm1
 ; AVX512-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
-; AVX512-NEXT:    vcvttss2usi {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %rax
 ; AVX512-NEXT:    vmovq %rax, %xmm1
 ; AVX512-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
 ; AVX512-NEXT:    retq
@@ -5194,11 +5194,11 @@ define <3 x i32> @constrained_vector_fptoui_v3i32_v3f64() #0 {
 ; CHECK-LABEL: constrained_vector_fptoui_v3i32_v3f64:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    cvttsd2si {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %rax
-; CHECK-NEXT:    movd %eax, %xmm1
-; CHECK-NEXT:    cvttsd2si {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %rax
-; CHECK-NEXT:    movd %eax, %xmm0
+; CHECK-NEXT:    cvttsd2si {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %rcx
+; CHECK-NEXT:    cvttsd2si {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %rdx
+; CHECK-NEXT:    movd %edx, %xmm1
+; CHECK-NEXT:    movd %ecx, %xmm0
 ; CHECK-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; CHECK-NEXT:    cvttsd2si {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %rax
 ; CHECK-NEXT:    movd %eax, %xmm1
 ; CHECK-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
 ; CHECK-NEXT:    retq
@@ -5207,19 +5207,19 @@ define <3 x i32> @constrained_vector_fptoui_v3i32_v3f64() #0 {
 ; AVX1:       # %bb.0: # %entry
 ; AVX1-NEXT:    vcvttsd2si {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %rax
 ; AVX1-NEXT:    vcvttsd2si {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %rcx
-; AVX1-NEXT:    vmovd %ecx, %xmm0
-; AVX1-NEXT:    vpinsrd $1, %eax, %xmm0, %xmm0
-; AVX1-NEXT:    vcvttsd2si {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %rax
+; AVX1-NEXT:    vcvttsd2si {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %rdx
+; AVX1-NEXT:    vmovd %edx, %xmm0
+; AVX1-NEXT:    vpinsrd $1, %ecx, %xmm0, %xmm0
 ; AVX1-NEXT:    vpinsrd $2, %eax, %xmm0, %xmm0
 ; AVX1-NEXT:    retq
 ;
 ; AVX512-LABEL: constrained_vector_fptoui_v3i32_v3f64:
 ; AVX512:       # %bb.0: # %entry
 ; AVX512-NEXT:    vcvttsd2usi {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %eax
-; AVX512-NEXT:    vmovd %eax, %xmm0
-; AVX512-NEXT:    vcvttsd2usi {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %eax
-; AVX512-NEXT:    vpinsrd $1, %eax, %xmm0, %xmm0
-; AVX512-NEXT:    vcvttsd2usi {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %eax
+; AVX512-NEXT:    vcvttsd2usi {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ecx
+; AVX512-NEXT:    vcvttsd2usi {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %edx
+; AVX512-NEXT:    vmovd %edx, %xmm0
+; AVX512-NEXT:    vpinsrd $1, %ecx, %xmm0, %xmm0
 ; AVX512-NEXT:    vpinsrd $2, %eax, %xmm0, %xmm0
 ; AVX512-NEXT:    retq
 entry:
@@ -5466,7 +5466,7 @@ define <3 x i64> @constrained_vector_fptoui_v3i64_v3f64() #0 {
 ;
 ; AVX1-LABEL: constrained_vector_fptoui_v3i64_v3f64:
 ; AVX1:       # %bb.0: # %entry
-; AVX1-NEXT:    vmovsd {{.*#+}} xmm2 = [4.2200000000000003E+1,0.0E+0]
+; AVX1-NEXT:    vmovsd {{.*#+}} xmm2 = [4.2299999999999997E+1,0.0E+0]
 ; AVX1-NEXT:    vmovsd {{.*#+}} xmm0 = [9.2233720368547758E+18,0.0E+0]
 ; AVX1-NEXT:    vcomisd %xmm2, %xmm0
 ; AVX1-NEXT:    vxorpd %xmm1, %xmm1, %xmm1
@@ -5476,51 +5476,51 @@ define <3 x i64> @constrained_vector_fptoui_v3i64_v3f64() #0 {
 ; AVX1-NEXT:    vmovapd %xmm0, %xmm3
 ; AVX1-NEXT:  .LBB131_2: # %entry
 ; AVX1-NEXT:    vsubsd %xmm3, %xmm2, %xmm2
-; AVX1-NEXT:    vcvttsd2si %xmm2, %rax
-; AVX1-NEXT:    setbe %cl
-; AVX1-NEXT:    movzbl %cl, %ecx
-; AVX1-NEXT:    shlq $63, %rcx
-; AVX1-NEXT:    xorq %rax, %rcx
-; AVX1-NEXT:    vmovq %rcx, %xmm2
-; AVX1-NEXT:    vmovsd {{.*#+}} xmm3 = [4.2100000000000001E+1,0.0E+0]
-; AVX1-NEXT:    vcomisd %xmm3, %xmm0
-; AVX1-NEXT:    vxorpd %xmm4, %xmm4, %xmm4
+; AVX1-NEXT:    vcvttsd2si %xmm2, %rcx
+; AVX1-NEXT:    setbe %al
+; AVX1-NEXT:    movzbl %al, %eax
+; AVX1-NEXT:    shlq $63, %rax
+; AVX1-NEXT:    xorq %rcx, %rax
+; AVX1-NEXT:    vmovsd {{.*#+}} xmm2 = [4.2100000000000001E+1,0.0E+0]
+; AVX1-NEXT:    vcomisd %xmm2, %xmm0
+; AVX1-NEXT:    vxorpd %xmm3, %xmm3, %xmm3
 ; AVX1-NEXT:    ja .LBB131_4
 ; AVX1-NEXT:  # %bb.3: # %entry
-; AVX1-NEXT:    vmovapd %xmm0, %xmm4
+; AVX1-NEXT:    vmovapd %xmm0, %xmm3
 ; AVX1-NEXT:  .LBB131_4: # %entry
-; AVX1-NEXT:    vsubsd %xmm4, %xmm3, %xmm3
-; AVX1-NEXT:    vcvttsd2si %xmm3, %rax
+; AVX1-NEXT:    vsubsd %xmm3, %xmm2, %xmm2
+; AVX1-NEXT:    vcvttsd2si %xmm2, %rdx
 ; AVX1-NEXT:    setbe %cl
 ; AVX1-NEXT:    movzbl %cl, %ecx
 ; AVX1-NEXT:    shlq $63, %rcx
-; AVX1-NEXT:    xorq %rax, %rcx
-; AVX1-NEXT:    vmovq %rcx, %xmm3
-; AVX1-NEXT:    vpunpcklqdq {{.*#+}} xmm2 = xmm3[0],xmm2[0]
-; AVX1-NEXT:    vmovsd {{.*#+}} xmm3 = [4.2299999999999997E+1,0.0E+0]
-; AVX1-NEXT:    vcomisd %xmm3, %xmm0
+; AVX1-NEXT:    xorq %rdx, %rcx
+; AVX1-NEXT:    vmovsd {{.*#+}} xmm2 = [4.2200000000000003E+1,0.0E+0]
+; AVX1-NEXT:    vcomisd %xmm2, %xmm0
 ; AVX1-NEXT:    ja .LBB131_6
 ; AVX1-NEXT:  # %bb.5: # %entry
 ; AVX1-NEXT:    vmovapd %xmm0, %xmm1
 ; AVX1-NEXT:  .LBB131_6: # %entry
-; AVX1-NEXT:    vsubsd %xmm1, %xmm3, %xmm0
-; AVX1-NEXT:    vcvttsd2si %xmm0, %rax
-; AVX1-NEXT:    setbe %cl
-; AVX1-NEXT:    movzbl %cl, %ecx
-; AVX1-NEXT:    shlq $63, %rcx
-; AVX1-NEXT:    xorq %rax, %rcx
-; AVX1-NEXT:    vmovq %rcx, %xmm0
-; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm2, %ymm0
+; AVX1-NEXT:    vsubsd %xmm1, %xmm2, %xmm0
+; AVX1-NEXT:    vcvttsd2si %xmm0, %rdx
+; AVX1-NEXT:    setbe %sil
+; AVX1-NEXT:    movzbl %sil, %esi
+; AVX1-NEXT:    shlq $63, %rsi
+; AVX1-NEXT:    xorq %rdx, %rsi
+; AVX1-NEXT:    vmovq %rsi, %xmm0
+; AVX1-NEXT:    vmovq %rcx, %xmm1
+; AVX1-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
+; AVX1-NEXT:    vmovq %rax, %xmm1
+; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
 ; AVX1-NEXT:    retq
 ;
 ; AVX512-LABEL: constrained_vector_fptoui_v3i64_v3f64:
 ; AVX512:       # %bb.0: # %entry
 ; AVX512-NEXT:    vcvttsd2usi {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %rax
-; AVX512-NEXT:    vmovq %rax, %xmm0
-; AVX512-NEXT:    vcvttsd2usi {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %rax
-; AVX512-NEXT:    vmovq %rax, %xmm1
+; AVX512-NEXT:    vcvttsd2usi {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %rcx
+; AVX512-NEXT:    vcvttsd2usi {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %rdx
+; AVX512-NEXT:    vmovq %rdx, %xmm0
+; AVX512-NEXT:    vmovq %rcx, %xmm1
 ; AVX512-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
-; AVX512-NEXT:    vcvttsd2usi {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %rax
 ; AVX512-NEXT:    vmovq %rax, %xmm1
 ; AVX512-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
 ; AVX512-NEXT:    retq
@@ -5731,26 +5731,26 @@ entry:
 define <3 x float> @constrained_vector_fptrunc_v3f64() #0 {
 ; CHECK-LABEL: constrained_vector_fptrunc_v3f64:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    movsd {{.*#+}} xmm0 = [4.2200000000000003E+1,0.0E+0]
+; CHECK-NEXT:    movsd {{.*#+}} xmm0 = [4.2299999999999997E+1,0.0E+0]
 ; CHECK-NEXT:    cvtsd2ss %xmm0, %xmm1
 ; CHECK-NEXT:    movsd {{.*#+}} xmm0 = [4.2100000000000001E+1,0.0E+0]
 ; CHECK-NEXT:    cvtsd2ss %xmm0, %xmm0
-; CHECK-NEXT:    unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; CHECK-NEXT:    movsd {{.*#+}} xmm1 = [4.2299999999999997E+1,0.0E+0]
-; CHECK-NEXT:    cvtsd2ss %xmm1, %xmm1
+; CHECK-NEXT:    movsd {{.*#+}} xmm2 = [4.2200000000000003E+1,0.0E+0]
+; CHECK-NEXT:    cvtsd2ss %xmm2, %xmm2
+; CHECK-NEXT:    unpcklps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
 ; CHECK-NEXT:    movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
 ; CHECK-NEXT:    retq
 ;
 ; AVX-LABEL: constrained_vector_fptrunc_v3f64:
 ; AVX:       # %bb.0: # %entry
-; AVX-NEXT:    vmovsd {{.*#+}} xmm0 = [4.2200000000000003E+1,0.0E+0]
+; AVX-NEXT:    vmovsd {{.*#+}} xmm0 = [4.2299999999999997E+1,0.0E+0]
 ; AVX-NEXT:    vcvtsd2ss %xmm0, %xmm0, %xmm0
 ; AVX-NEXT:    vmovsd {{.*#+}} xmm1 = [4.2100000000000001E+1,0.0E+0]
 ; AVX-NEXT:    vcvtsd2ss %xmm1, %xmm1, %xmm1
-; AVX-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[2,3]
-; AVX-NEXT:    vmovsd {{.*#+}} xmm1 = [4.2299999999999997E+1,0.0E+0]
-; AVX-NEXT:    vcvtsd2ss %xmm1, %xmm1, %xmm1
-; AVX-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0],xmm0[3]
+; AVX-NEXT:    vmovsd {{.*#+}} xmm2 = [4.2200000000000003E+1,0.0E+0]
+; AVX-NEXT:    vcvtsd2ss %xmm2, %xmm2, %xmm2
+; AVX-NEXT:    vinsertps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[2,3]
+; AVX-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0,1],xmm0[0],xmm1[3]
 ; AVX-NEXT:    retq
 entry:
   %result = call <3 x float> @llvm.experimental.constrained.fptrunc.v3f32.v3f64(
@@ -5834,14 +5834,14 @@ define <3 x double> @constrained_vector_fpext_v3f32() #0 {
 ;
 ; AVX-LABEL: constrained_vector_fpext_v3f32:
 ; AVX:       # %bb.0: # %entry
-; AVX-NEXT:    vmovss {{.*#+}} xmm0 = [4.3E+1,0.0E+0,0.0E+0,0.0E+0]
+; AVX-NEXT:    vmovss {{.*#+}} xmm0 = [4.4E+1,0.0E+0,0.0E+0,0.0E+0]
 ; AVX-NEXT:    vcvtss2sd %xmm0, %xmm0, %xmm0
 ; AVX-NEXT:    vmovss {{.*#+}} xmm1 = [4.2E+1,0.0E+0,0.0E+0,0.0E+0]
 ; AVX-NEXT:    vcvtss2sd %xmm1, %xmm1, %xmm1
-; AVX-NEXT:    vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0]
-; AVX-NEXT:    vmovss {{.*#+}} xmm1 = [4.4E+1,0.0E+0,0.0E+0,0.0E+0]
-; AVX-NEXT:    vcvtss2sd %xmm1, %xmm1, %xmm1
-; AVX-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX-NEXT:    vmovss {{.*#+}} xmm2 = [4.3E+1,0.0E+0,0.0E+0,0.0E+0]
+; AVX-NEXT:    vcvtss2sd %xmm2, %xmm2, %xmm2
+; AVX-NEXT:    vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0]
+; AVX-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
 ; AVX-NEXT:    retq
 entry:
   %result = call <3 x double> @llvm.experimental.constrained.fpext.v3f64.v3f32(
@@ -6702,14 +6702,14 @@ define <3 x double> @constrained_vector_sitofp_v3f64_v3i32(<3 x i32> %x) #0 {
 ;
 ; AVX-LABEL: constrained_vector_sitofp_v3f64_v3i32:
 ; AVX:       # %bb.0: # %entry
-; AVX-NEXT:    vextractps $1, %xmm0, %eax
+; AVX-NEXT:    vextractps $2, %xmm0, %eax
 ; AVX-NEXT:    vcvtsi2sd %eax, %xmm15, %xmm1
 ; AVX-NEXT:    vmovd %xmm0, %eax
 ; AVX-NEXT:    vcvtsi2sd %eax, %xmm15, %xmm2
-; AVX-NEXT:    vunpcklpd {{.*#+}} xmm1 = xmm2[0],xmm1[0]
-; AVX-NEXT:    vpextrd $2, %xmm0, %eax
+; AVX-NEXT:    vpextrd $1, %xmm0, %eax
 ; AVX-NEXT:    vcvtsi2sd %eax, %xmm15, %xmm0
-; AVX-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX-NEXT:    vunpcklpd {{.*#+}} xmm0 = xmm2[0],xmm0[0]
+; AVX-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
 ; AVX-NEXT:    retq
 entry:
   %result = call <3 x double>
@@ -6722,31 +6722,31 @@ entry:
 define <3 x float> @constrained_vector_sitofp_v3f32_v3i32(<3 x i32> %x) #0 {
 ; CHECK-LABEL: constrained_vector_sitofp_v3f32_v3i32:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    movd %xmm0, %eax
+; CHECK-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
+; CHECK-NEXT:    movd %xmm1, %eax
+; CHECK-NEXT:    xorps %xmm1, %xmm1
 ; CHECK-NEXT:    cvtsi2ss %eax, %xmm1
 ; CHECK-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[1,1,1,1]
 ; CHECK-NEXT:    movd %xmm2, %eax
 ; CHECK-NEXT:    xorps %xmm2, %xmm2
 ; CHECK-NEXT:    cvtsi2ss %eax, %xmm2
-; CHECK-NEXT:    unpcklps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
-; CHECK-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
 ; CHECK-NEXT:    movd %xmm0, %eax
 ; CHECK-NEXT:    xorps %xmm0, %xmm0
 ; CHECK-NEXT:    cvtsi2ss %eax, %xmm0
-; CHECK-NEXT:    movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0]
-; CHECK-NEXT:    movaps %xmm1, %xmm0
+; CHECK-NEXT:    unpcklps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
+; CHECK-NEXT:    movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
 ; CHECK-NEXT:    retq
 ;
 ; AVX-LABEL: constrained_vector_sitofp_v3f32_v3i32:
 ; AVX:       # %bb.0: # %entry
-; AVX-NEXT:    vextractps $1, %xmm0, %eax
+; AVX-NEXT:    vextractps $2, %xmm0, %eax
 ; AVX-NEXT:    vcvtsi2ss %eax, %xmm15, %xmm1
 ; AVX-NEXT:    vmovd %xmm0, %eax
 ; AVX-NEXT:    vcvtsi2ss %eax, %xmm15, %xmm2
-; AVX-NEXT:    vinsertps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[2,3]
-; AVX-NEXT:    vpextrd $2, %xmm0, %eax
+; AVX-NEXT:    vpextrd $1, %xmm0, %eax
 ; AVX-NEXT:    vcvtsi2ss %eax, %xmm15, %xmm0
-; AVX-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0,1],xmm0[0],xmm1[3]
+; AVX-NEXT:    vinsertps {{.*#+}} xmm0 = xmm2[0],xmm0[0],xmm2[2,3]
+; AVX-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0],xmm0[3]
 ; AVX-NEXT:    retq
 entry:
   %result = call <3 x float>
@@ -6769,28 +6769,28 @@ define <3 x double> @constrained_vector_sitofp_v3f64_v3i64(<3 x i64> %x) #0 {
 ;
 ; AVX1-LABEL: constrained_vector_sitofp_v3f64_v3i64:
 ; AVX1:       # %bb.0: # %entry
-; AVX1-NEXT:    vpextrq $1, %xmm0, %rax
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
+; AVX1-NEXT:    vmovq %xmm1, %rax
 ; AVX1-NEXT:    vcvtsi2sd %rax, %xmm15, %xmm1
 ; AVX1-NEXT:    vmovq %xmm0, %rax
 ; AVX1-NEXT:    vcvtsi2sd %rax, %xmm15, %xmm2
-; AVX1-NEXT:    vunpcklpd {{.*#+}} xmm1 = xmm2[0],xmm1[0]
-; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
-; AVX1-NEXT:    vmovq %xmm0, %rax
+; AVX1-NEXT:    vpextrq $1, %xmm0, %rax
 ; AVX1-NEXT:    vcvtsi2sd %rax, %xmm15, %xmm0
-; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1-NEXT:    vunpcklpd {{.*#+}} xmm0 = xmm2[0],xmm0[0]
+; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
 ; AVX1-NEXT:    retq
 ;
 ; AVX512-LABEL: constrained_vector_sitofp_v3f64_v3i64:
 ; AVX512:       # %bb.0: # %entry
-; AVX512-NEXT:    vpextrq $1, %xmm0, %rax
+; AVX512-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; AVX512-NEXT:    vmovq %xmm1, %rax
 ; AVX512-NEXT:    vcvtsi2sd %rax, %xmm15, %xmm1
 ; AVX512-NEXT:    vmovq %xmm0, %rax
 ; AVX512-NEXT:    vcvtsi2sd %rax, %xmm15, %xmm2
-; AVX512-NEXT:    vunpcklpd {{.*#+}} xmm1 = xmm2[0],xmm1[0]
-; AVX512-NEXT:    vextracti128 $1, %ymm0, %xmm0
-; AVX512-NEXT:    vmovq %xmm0, %rax
+; AVX512-NEXT:    vpextrq $1, %xmm0, %rax
 ; AVX512-NEXT:    vcvtsi2sd %rax, %xmm15, %xmm0
-; AVX512-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX512-NEXT:    vunpcklpd {{.*#+}} xmm0 = xmm2[0],xmm0[0]
+; AVX512-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
 ; AVX512-NEXT:    retq
 entry:
   %result = call <3 x double>
@@ -6803,39 +6803,38 @@ entry:
 define <3 x float> @constrained_vector_sitofp_v3f32_v3i64(<3 x i64> %x) #0 {
 ; CHECK-LABEL: constrained_vector_sitofp_v3f32_v3i64:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    cvtsi2ss %rsi, %xmm1
-; CHECK-NEXT:    cvtsi2ss %rdi, %xmm0
-; CHECK-NEXT:    unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; CHECK-NEXT:    xorps %xmm1, %xmm1
 ; CHECK-NEXT:    cvtsi2ss %rdx, %xmm1
+; CHECK-NEXT:    cvtsi2ss %rdi, %xmm0
+; CHECK-NEXT:    cvtsi2ss %rsi, %xmm2
+; CHECK-NEXT:    unpcklps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
 ; CHECK-NEXT:    movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
 ; CHECK-NEXT:    retq
 ;
 ; AVX1-LABEL: constrained_vector_sitofp_v3f32_v3i64:
 ; AVX1:       # %bb.0: # %entry
-; AVX1-NEXT:    vpextrq $1, %xmm0, %rax
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
+; AVX1-NEXT:    vmovq %xmm1, %rax
 ; AVX1-NEXT:    vcvtsi2ss %rax, %xmm15, %xmm1
 ; AVX1-NEXT:    vmovq %xmm0, %rax
 ; AVX1-NEXT:    vcvtsi2ss %rax, %xmm15, %xmm2
-; AVX1-NEXT:    vinsertps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[2,3]
-; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
-; AVX1-NEXT:    vmovq %xmm0, %rax
+; AVX1-NEXT:    vpextrq $1, %xmm0, %rax
 ; AVX1-NEXT:    vcvtsi2ss %rax, %xmm15, %xmm0
-; AVX1-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0,1],xmm0[0],xmm1[3]
+; AVX1-NEXT:    vinsertps {{.*#+}} xmm0 = xmm2[0],xmm0[0],xmm2[2,3]
+; AVX1-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0],xmm0[3]
 ; AVX1-NEXT:    vzeroupper
 ; AVX1-NEXT:    retq
 ;
 ; AVX512-LABEL: constrained_vector_sitofp_v3f32_v3i64:
 ; AVX512:       # %bb.0: # %entry
-; AVX512-NEXT:    vpextrq $1, %xmm0, %rax
+; AVX512-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; AVX512-NEXT:    vmovq %xmm1, %rax
 ; AVX512-NEXT:    vcvtsi2ss %rax, %xmm15, %xmm1
 ; AVX512-NEXT:    vmovq %xmm0, %rax
 ; AVX512-NEXT:    vcvtsi2ss %rax, %xmm15, %xmm2
-; AVX512-NEXT:    vinsertps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[2,3]
-; AVX512-NEXT:    vextracti128 $1, %ymm0, %xmm0
-; AVX512-NEXT:    vmovq %xmm0, %rax
+; AVX512-NEXT:    vpextrq $1, %xmm0, %rax
 ; AVX512-NEXT:    vcvtsi2ss %rax, %xmm15, %xmm0
-; AVX512-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0,1],xmm0[0],xmm1[3]
+; AVX512-NEXT:    vinsertps {{.*#+}} xmm0 = xmm2[0],xmm0[0],xmm2[2,3]
+; AVX512-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0],xmm0[3]
 ; AVX512-NEXT:    vzeroupper
 ; AVX512-NEXT:    retq
 entry:
@@ -7415,26 +7414,26 @@ define <3 x double> @constrained_vector_uitofp_v3f64_v3i32(<3 x i32> %x) #0 {
 ;
 ; AVX1-LABEL: constrained_vector_uitofp_v3f64_v3i32:
 ; AVX1:       # %bb.0: # %entry
-; AVX1-NEXT:    vextractps $1, %xmm0, %eax
+; AVX1-NEXT:    vextractps $2, %xmm0, %eax
 ; AVX1-NEXT:    vcvtsi2sd %rax, %xmm15, %xmm1
 ; AVX1-NEXT:    vmovd %xmm0, %eax
 ; AVX1-NEXT:    vcvtsi2sd %rax, %xmm15, %xmm2
-; AVX1-NEXT:    vunpcklpd {{.*#+}} xmm1 = xmm2[0],xmm1[0]
-; AVX1-NEXT:    vpextrd $2, %xmm0, %eax
+; AVX1-NEXT:    vpextrd $1, %xmm0, %eax
 ; AVX1-NEXT:    vcvtsi2sd %rax, %xmm15, %xmm0
-; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1-NEXT:    vunpcklpd {{.*#+}} xmm0 = xmm2[0],xmm0[0]
+; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
 ; AVX1-NEXT:    retq
 ;
 ; AVX512-LABEL: constrained_vector_uitofp_v3f64_v3i32:
 ; AVX512:       # %bb.0: # %entry
-; AVX512-NEXT:    vextractps $1, %xmm0, %eax
+; AVX512-NEXT:    vextractps $2, %xmm0, %eax
 ; AVX512-NEXT:    vcvtusi2sd %eax, %xmm15, %xmm1
 ; AVX512-NEXT:    vmovd %xmm0, %eax
 ; AVX512-NEXT:    vcvtusi2sd %eax, %xmm15, %xmm2
-; AVX512-NEXT:    vunpcklpd {{.*#+}} xmm1 = xmm2[0],xmm1[0]
-; AVX512-NEXT:    vpextrd $2, %xmm0, %eax
+; AVX512-NEXT:    vpextrd $1, %xmm0, %eax
 ; AVX512-NEXT:    vcvtusi2sd %eax, %xmm15, %xmm0
-; AVX512-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX512-NEXT:    vunpcklpd {{.*#+}} xmm0 = xmm2[0],xmm0[0]
+; AVX512-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
 ; AVX512-NEXT:    retq
 entry:
   %result = call <3 x double>
@@ -7447,43 +7446,43 @@ entry:
 define <3 x float> @constrained_vector_uitofp_v3f32_v3i32(<3 x i32> %x) #0 {
 ; CHECK-LABEL: constrained_vector_uitofp_v3f32_v3i32:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    movd %xmm0, %eax
+; CHECK-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
+; CHECK-NEXT:    movd %xmm1, %eax
+; CHECK-NEXT:    xorps %xmm1, %xmm1
 ; CHECK-NEXT:    cvtsi2ss %rax, %xmm1
 ; CHECK-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[1,1,1,1]
 ; CHECK-NEXT:    movd %xmm2, %eax
 ; CHECK-NEXT:    xorps %xmm2, %xmm2
 ; CHECK-NEXT:    cvtsi2ss %rax, %xmm2
-; CHECK-NEXT:    unpcklps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
-; CHECK-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
 ; CHECK-NEXT:    movd %xmm0, %eax
 ; CHECK-NEXT:    xorps %xmm0, %xmm0
 ; CHECK-NEXT:    cvtsi2ss %rax, %xmm0
-; CHECK-NEXT:    movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0]
-; CHECK-NEXT:    movaps %xmm1, %xmm0
+; CHECK-NEXT:    unpcklps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
+; CHECK-NEXT:    movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
 ; CHECK-NEXT:    retq
 ;
 ; AVX1-LABEL: constrained_vector_uitofp_v3f32_v3i32:
 ; AVX1:       # %bb.0: # %entry
-; AVX1-NEXT:    vextractps $1, %xmm0, %eax
+; AVX1-NEXT:    vextractps $2, %xmm0, %eax
 ; AVX1-NEXT:    vcvtsi2ss %rax, %xmm15, %xmm1
 ; AVX1-NEXT:    vmovd %xmm0, %eax
 ; AVX1-NEXT:    vcvtsi2ss %rax, %xmm15, %xmm2
-; AVX1-NEXT:    vinsertps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[2,3]
-; AVX1-NEXT:    vpextrd $2, %xmm0, %eax
+; AVX1-NEXT:    vpextrd $1, %xmm0, %eax
 ; AVX1-NEXT:    vcvtsi2ss %rax, %xmm15, %xmm0
-; AVX1-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0,1],xmm0[0],xmm1[3]
+; AVX1-NEXT:    vinsertps {{.*#+}} xmm0 = xmm2[0],xmm0[0],xmm2[2,3]
+; AVX1-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0],xmm0[3]
 ; AVX1-NEXT:    retq
 ;
 ; AVX512-LABEL: constrained_vector_uitofp_v3f32_v3i32:
 ; AVX512:       # %bb.0: # %entry
-; AVX512-NEXT:    vextractps $1, %xmm0, %eax
+; AVX512-NEXT:    vextractps $2, %xmm0, %eax
 ; AVX512-NEXT:    vcvtusi2ss %eax, %xmm15, %xmm1
 ; AVX512-NEXT:    vmovd %xmm0, %eax
 ; AVX512-NEXT:    vcvtusi2ss %eax, %xmm15, %xmm2
-; AVX512-NEXT:    vinsertps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[2,3]
-; AVX512-NEXT:    vpextrd $2, %xmm0, %eax
+; AVX512-NEXT:    vpextrd $1, %xmm0, %eax
 ; AVX512-NEXT:    vcvtusi2ss %eax, %xmm15, %xmm0
-; AVX512-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0,1],xmm0[0],xmm1[3]
+; AVX512-NEXT:    vinsertps {{.*#+}} xmm0 = xmm2[0],xmm0[0],xmm2[2,3]
+; AVX512-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0],xmm0[3]
 ; AVX512-NEXT:    retq
 entry:
   %result = call <3 x float>
@@ -7539,7 +7538,8 @@ define <3 x double> @constrained_vector_uitofp_v3f64_v3i64(<3 x i64> %x) #0 {
 ;
 ; AVX1-LABEL: constrained_vector_uitofp_v3f64_v3i64:
 ; AVX1:       # %bb.0: # %entry
-; AVX1-NEXT:    vpextrq $1, %xmm0, %rax
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
+; AVX1-NEXT:    vmovq %xmm1, %rax
 ; AVX1-NEXT:    movq %rax, %rcx
 ; AVX1-NEXT:    shrq %rcx
 ; AVX1-NEXT:    movl %eax, %edx
@@ -7565,9 +7565,7 @@ define <3 x double> @constrained_vector_uitofp_v3f64_v3i64(<3 x i64> %x) #0 {
 ; AVX1-NEXT:  # %bb.3:
 ; AVX1-NEXT:    vaddsd %xmm2, %xmm2, %xmm2
 ; AVX1-NEXT:  .LBB183_4: # %entry
-; AVX1-NEXT:    vunpcklpd {{.*#+}} xmm1 = xmm2[0],xmm1[0]
-; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
-; AVX1-NEXT:    vmovq %xmm0, %rax
+; AVX1-NEXT:    vpextrq $1, %xmm0, %rax
 ; AVX1-NEXT:    movq %rax, %rcx
 ; AVX1-NEXT:    shrq %rcx
 ; AVX1-NEXT:    movl %eax, %edx
@@ -7580,20 +7578,21 @@ define <3 x double> @constrained_vector_uitofp_v3f64_v3i64(<3 x i64> %x) #0 {
 ; AVX1-NEXT:  # %bb.5:
 ; AVX1-NEXT:    vaddsd %xmm0, %xmm0, %xmm0
 ; AVX1-NEXT:  .LBB183_6: # %entry
-; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1-NEXT:    vunpcklpd {{.*#+}} xmm0 = xmm2[0],xmm0[0]
+; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
 ; AVX1-NEXT:    retq
 ;
 ; AVX512-LABEL: constrained_vector_uitofp_v3f64_v3i64:
 ; AVX512:       # %bb.0: # %entry
-; AVX512-NEXT:    vpextrq $1, %xmm0, %rax
+; AVX512-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; AVX512-NEXT:    vmovq %xmm1, %rax
 ; AVX512-NEXT:    vcvtusi2sd %rax, %xmm15, %xmm1
 ; AVX512-NEXT:    vmovq %xmm0, %rax
 ; AVX512-NEXT:    vcvtusi2sd %rax, %xmm15, %xmm2
-; AVX512-NEXT:    vunpcklpd {{.*#+}} xmm1 = xmm2[0],xmm1[0]
-; AVX512-NEXT:    vextracti128 $1, %ymm0, %xmm0
-; AVX512-NEXT:    vmovq %xmm0, %rax
+; AVX512-NEXT:    vpextrq $1, %xmm0, %rax
 ; AVX512-NEXT:    vcvtusi2sd %rax, %xmm15, %xmm0
-; AVX512-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX512-NEXT:    vunpcklpd {{.*#+}} xmm0 = xmm2[0],xmm0[0]
+; AVX512-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
 ; AVX512-NEXT:    retq
 entry:
   %result = call <3 x double>
@@ -7606,13 +7605,13 @@ entry:
 define <3 x float> @constrained_vector_uitofp_v3f32_v3i64(<3 x i64> %x) #0 {
 ; CHECK-LABEL: constrained_vector_uitofp_v3f32_v3i64:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    movq %rsi, %rax
+; CHECK-NEXT:    movq %rdx, %rax
 ; CHECK-NEXT:    shrq %rax
-; CHECK-NEXT:    movl %esi, %ecx
+; CHECK-NEXT:    movl %edx, %ecx
 ; CHECK-NEXT:    andl $1, %ecx
 ; CHECK-NEXT:    orq %rax, %rcx
-; CHECK-NEXT:    testq %rsi, %rsi
-; CHECK-NEXT:    cmovnsq %rsi, %rcx
+; CHECK-NEXT:    testq %rdx, %rdx
+; CHECK-NEXT:    cmovnsq %rdx, %rcx
 ; CHECK-NEXT:    cvtsi2ss %rcx, %xmm1
 ; CHECK-NEXT:    jns .LBB184_2
 ; CHECK-NEXT:  # %bb.1:
@@ -7630,26 +7629,26 @@ define <3 x float> @constrained_vector_uitofp_v3f32_v3i64(<3 x i64> %x) #0 {
 ; CHECK-NEXT:  # %bb.3:
 ; CHECK-NEXT:    addss %xmm0, %xmm0
 ; CHECK-NEXT:  .LBB184_4: # %entry
-; CHECK-NEXT:    unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; CHECK-NEXT:    movq %rdx, %rax
+; CHECK-NEXT:    movq %rsi, %rax
 ; CHECK-NEXT:    shrq %rax
-; CHECK-NEXT:    movl %edx, %ecx
+; CHECK-NEXT:    movl %esi, %ecx
 ; CHECK-NEXT:    andl $1, %ecx
 ; CHECK-NEXT:    orq %rax, %rcx
-; CHECK-NEXT:    testq %rdx, %rdx
-; CHECK-NEXT:    cmovnsq %rdx, %rcx
-; CHECK-NEXT:    xorps %xmm1, %xmm1
-; CHECK-NEXT:    cvtsi2ss %rcx, %xmm1
+; CHECK-NEXT:    testq %rsi, %rsi
+; CHECK-NEXT:    cmovnsq %rsi, %rcx
+; CHECK-NEXT:    cvtsi2ss %rcx, %xmm2
 ; CHECK-NEXT:    jns .LBB184_6
 ; CHECK-NEXT:  # %bb.5:
-; CHECK-NEXT:    addss %xmm1, %xmm1
+; CHECK-NEXT:    addss %xmm2, %xmm2
 ; CHECK-NEXT:  .LBB184_6: # %entry
+; CHECK-NEXT:    unpcklps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
 ; CHECK-NEXT:    movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
 ; CHECK-NEXT:    retq
 ;
 ; AVX1-LABEL: constrained_vector_uitofp_v3f32_v3i64:
 ; AVX1:       # %bb.0: # %entry
-; AVX1-NEXT:    vpextrq $1, %xmm0, %rax
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
+; AVX1-NEXT:    vmovq %xmm1, %rax
 ; AVX1-NEXT:    movq %rax, %rcx
 ; AVX1-NEXT:    shrq %rcx
 ; AVX1-NEXT:    movl %eax, %edx
@@ -7675,9 +7674,7 @@ define <3 x float> @constrained_vector_uitofp_v3f32_v3i64(<3 x i64> %x) #0 {
 ; AVX1-NEXT:  # %bb.3:
 ; AVX1-NEXT:    vaddss %xmm2, %xmm2, %xmm2
 ; AVX1-NEXT:  .LBB184_4: # %entry
-; AVX1-NEXT:    vinsertps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[2,3]
-; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
-; AVX1-NEXT:    vmovq %xmm0, %rax
+; AVX1-NEXT:    vpextrq $1, %xmm0, %rax
 ; AVX1-NEXT:    movq %rax, %rcx
 ; AVX1-NEXT:    shrq %rcx
 ; AVX1-NEXT:    movl %eax, %edx
@@ -7690,21 +7687,22 @@ define <3 x float> @constrained_vector_uitofp_v3f32_v3i64(<3 x i64> %x) #0 {
 ; AVX1-NEXT:  # %bb.5:
 ; AVX1-NEXT:    vaddss %xmm0, %xmm0, %xmm0
 ; AVX1-NEXT:  .LBB184_6: # %entry
-; AVX1-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0,1],xmm0[0],xmm1[3]
+; AVX1-NEXT:    vinsertps {{.*#+}} xmm0 = xmm2[0],xmm0[0],xmm2[2,3]
+; AVX1-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0],xmm0[3]
 ; AVX1-NEXT:    vzeroupper
 ; AVX1-NEXT:    retq
 ;
 ; AVX512-LABEL: constrained_vector_uitofp_v3f32_v3i64:
 ; AVX512:       # %bb.0: # %entry
-; AVX512-NEXT:    vpextrq $1, %xmm0, %rax
+; AVX512-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; AVX512-NEXT:    vmovq %xmm1, %rax
 ; AVX512-NEXT:    vcvtusi2ss %rax, %xmm15, %xmm1
 ; AVX512-NEXT:    vmovq %xmm0, %rax
 ; AVX512-NEXT:    vcvtusi2ss %rax, %xmm15, %xmm2
-; AVX512-NEXT:    vinsertps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[2,3]
-; AVX512-NEXT:    vextracti128 $1, %ymm0, %xmm0
-; AVX512-NEXT:    vmovq %xmm0, %rax
+; AVX512-NEXT:    vpextrq $1, %xmm0, %rax
 ; AVX512-NEXT:    vcvtusi2ss %rax, %xmm15, %xmm0
-; AVX512-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0,1],xmm0[0],xmm1[3]
+; AVX512-NEXT:    vinsertps {{.*#+}} xmm0 = xmm2[0],xmm0[0],xmm2[2,3]
+; AVX512-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0],xmm0[3]
 ; AVX512-NEXT:    vzeroupper
 ; AVX512-NEXT:    retq
 entry:

_______________________________________________
llvm-branch-commits mailing list
[email protected]
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits

Reply via email to