Index: test/CodeGen/arm-abi-vector.c
===================================================================
--- test/CodeGen/arm-abi-vector.c	(revision 0)
+++ test/CodeGen/arm-abi-vector.c	(revision 0)
@@ -0,0 +1,228 @@
+// RUN: %clang_cc1 -triple armv7-apple-darwin -target-abi aapcs -emit-llvm -o - %s | FileCheck %s
+// RUN: %clang_cc1 -triple armv7-apple-darwin -target-abi apcs-gnu -emit-llvm -o - %s | FileCheck -check-prefix=APCS-GNU %s
+
+#include <stdarg.h>
+
+typedef __attribute__(( ext_vector_type(2) ))  int __int2;
+typedef __attribute__(( ext_vector_type(3) ))  char __char3;
+typedef __attribute__(( ext_vector_type(5) ))  char __char5;
+typedef __attribute__(( ext_vector_type(9) ))  char __char9;
+typedef __attribute__(( ext_vector_type(19) )) char __char19;
+typedef __attribute__(( ext_vector_type(3) ))  short __short3;
+typedef __attribute__(( ext_vector_type(5) ))  short __short5;
+
+// Passing legal vector types as varargs.
+double varargs_vec_2i(int fixed, ...) {
+// CHECK: varargs_vec_2i
+// CHECK: %c3 = alloca <2 x i32>, align 8
+// CHECK: %3 = and i32 %2, -8
+// CHECK: %ap.align = inttoptr i32 %3 to i8*
+// CHECK: %ap.next = getelementptr i8* %ap.align, i32 8
+// CHECK: bitcast i8* %ap.align to <2 x i32>*
+// APCS-GNU: varargs_vec_2i
+// APCS-GNU: %c3 = alloca <2 x i32>, align 8
+// APCS-GNU: %var.align = alloca <2 x i32>
+// APCS-GNU: %ap.next = getelementptr i8* %ap.cur, i32 8
+// APCS-GNU: %1 = bitcast <2 x i32>* %var.align to i8*
+// APCS-GNU: call void @llvm.memcpy
+// APCS-GNU: %2 = load <2 x i32>* %var.align
+  va_list ap;
+  double sum = fixed;
+  va_start(ap, fixed);
+  __int2 c3 = va_arg(ap, __int2);
+  sum = sum + c3.x + c3.y;
+  va_end(ap);
+  return sum;
+}
+
+double test_2i(__int2 *in) {
+// CHECK: test_2i
+// CHECK: call arm_aapcscc double (i32, ...)* @varargs_vec_2i(i32 3, <2 x i32> %1)
+// APCS-GNU: test_2i
+// APCS-GNU: call double (i32, ...)* @varargs_vec_2i(i32 3, <2 x i32> %1)
+  return varargs_vec_2i(3, *in);
+}
+
+double varargs_vec_3c(int fixed, ...) {
+// CHECK: varargs_vec_3c
+// CHECK: %c3 = alloca <3 x i8>, align 4
+// CHECK: %ap.next = getelementptr i8* %ap.cur, i32 4
+// CHECK: %1 = bitcast i8* %ap.cur to <3 x i8>*
+// APCS-GNU: varargs_vec_3c
+// APCS-GNU: %c3 = alloca <3 x i8>, align 4
+// APCS-GNU: %ap.next = getelementptr i8* %ap.cur, i32 4
+// APCS-GNU: bitcast i8* %ap.cur to <3 x i8>*
+  va_list ap;
+  double sum = fixed;
+  va_start(ap, fixed);
+  __char3 c3 = va_arg(ap, __char3);
+  sum = sum + c3.x + c3.y;
+  va_end(ap);
+  return sum;
+}
+
+double test_3c(__char3 *in) {
+// CHECK: test_3c
+// CHECK: call arm_aapcscc double (i32, ...)* @varargs_vec_3c(i32 3, i32 %2)
+// APCS-GNU: test_3c
+// APCS-GNU: call double (i32, ...)* @varargs_vec_3c(i32 3, i32 %2)
+  return varargs_vec_3c(3, *in);
+}
+
+double varargs_vec_5c(int fixed, ...) {
+// CHECK: varargs_vec_5c
+// CHECK: %c5 = alloca <5 x i8>, align 8
+// CHECK: %3 = and i32 %2, -8
+// CHECK: %ap.align = inttoptr i32 %3 to i8*
+// CHECK: %ap.next = getelementptr i8* %ap.align, i32 8
+// CHECK: bitcast i8* %ap.align to <5 x i8>*
+// APCS-GNU: varargs_vec_5c
+// APCS-GNU: %c5 = alloca <5 x i8>, align 8
+// APCS-GNU: %var.align = alloca <5 x i8>
+// APCS-GNU: %ap.next = getelementptr i8* %ap.cur, i32 8
+// APCS-GNU: %1 = bitcast <5 x i8>* %var.align to i8*
+// APCS-GNU: call void @llvm.memcpy
+// APCS-GNU: %2 = load <5 x i8>* %var.align
+  va_list ap;
+  double sum = fixed;
+  va_start(ap, fixed);
+  __char5 c5 = va_arg(ap, __char5);
+  sum = sum + c5.x + c5.y;
+  va_end(ap);
+  return sum;
+}
+
+double test_5c(__char5 *in) {
+// CHECK: test_5c
+// CHECK: call arm_aapcscc double (i32, ...)* @varargs_vec_5c(i32 5, <2 x i32> %3)
+// APCS-GNU: test_5c
+// APCS-GNU: call double (i32, ...)* @varargs_vec_5c(i32 5, <2 x i32> %3)
+  return varargs_vec_5c(5, *in);
+}
+
+double varargs_vec_9c(int fixed, ...) {
+// CHECK: varargs_vec_9c
+// CHECK: %c9 = alloca <9 x i8>, align 16
+// CHECK: %var.align = alloca <9 x i8>
+// CHECK: %3 = and i32 %2, -8
+// CHECK: %ap.align = inttoptr i32 %3 to i8*
+// CHECK: %ap.next = getelementptr i8* %ap.align, i32 16
+// CHECK: %4 = bitcast <9 x i8>* %var.align to i8*
+// CHECK: call void @llvm.memcpy
+// CHECK: %5 = load <9 x i8>* %var.align
+// APCS-GNU: varargs_vec_9c
+// APCS-GNU: %c9 = alloca <9 x i8>, align 16
+// APCS-GNU: %var.align = alloca <9 x i8>
+// APCS-GNU: %ap.next = getelementptr i8* %ap.cur, i32 16
+// APCS-GNU: %1 = bitcast <9 x i8>* %var.align to i8*
+// APCS-GNU: call void @llvm.memcpy
+// APCS-GNU: %2 = load <9 x i8>* %var.align
+  va_list ap;
+  double sum = fixed;
+  va_start(ap, fixed);
+  __char9 c9 = va_arg(ap, __char9);
+  sum = sum + c9.x + c9.y;
+  va_end(ap);
+  return sum;
+}
+
+double test_9c(__char9 *in) {
+// CHECK: test_9c
+// CHECK: call arm_aapcscc double (i32, ...)* @varargs_vec_9c(i32 9, <4 x i32> %3)
+// APCS-GNU: test_9c
+// APCS-GNU: call double (i32, ...)* @varargs_vec_9c(i32 9, <4 x i32> %3)
+  return varargs_vec_9c(9, *in);
+}
+
+double varargs_vec_19c(int fixed, ...) {
+// CHECK: varargs_vec_19c
+// CHECK: %ap.next = getelementptr i8* %ap.cur, i32 4
+// CHECK: %1 = bitcast i8* %ap.cur to i8**
+// CHECK: %2 = load i8** %1
+// CHECK: bitcast i8* %2 to <19 x i8>*
+// APCS-GNU: varargs_vec_19c
+// APCS-GNU: %ap.next = getelementptr i8* %ap.cur, i32 4
+// APCS-GNU: %1 = bitcast i8* %ap.cur to i8**
+// APCS-GNU: %2 = load i8** %1
+// APCS-GNU: bitcast i8* %2 to <19 x i8>*
+  va_list ap;
+  double sum = fixed;
+  va_start(ap, fixed);
+  __char19 c19 = va_arg(ap, __char19);
+  sum = sum + c19.x + c19.y;
+  va_end(ap);
+  return sum;
+}
+
+double test_19c(__char19 *in) {
+// CHECK: test_19c
+// CHECK: call arm_aapcscc double (i32, ...)* @varargs_vec_19c(i32 19, <19 x i8>* %tmp)
+// APCS-GNU: test_19c
+// APCS-GNU: call double (i32, ...)* @varargs_vec_19c(i32 19, <19 x i8>* %tmp)
+  return varargs_vec_19c(19, *in);
+}
+
+double varargs_vec_3s(int fixed, ...) {
+// CHECK: varargs_vec_3s
+// CHECK: %c3 = alloca <3 x i16>, align 8
+// CHECK: %3 = and i32 %2, -8
+// CHECK: %ap.align = inttoptr i32 %3 to i8*
+// CHECK: %ap.next = getelementptr i8* %ap.align, i32 8
+// CHECK: bitcast i8* %ap.align to <3 x i16>*
+// APCS-GNU: varargs_vec_3s
+// APCS-GNU: %c3 = alloca <3 x i16>, align 8
+// APCS-GNU: %var.align = alloca <3 x i16>
+// APCS-GNU: %ap.next = getelementptr i8* %ap.cur, i32 8
+// APCS-GNU: %1 = bitcast <3 x i16>* %var.align to i8*
+// APCS-GNU: call void @llvm.memcpy
+// APCS-GNU: %2 = load <3 x i16>* %var.align
+  va_list ap;
+  double sum = fixed;
+  va_start(ap, fixed);
+  __short3 c3 = va_arg(ap, __short3);
+  sum = sum + c3.x + c3.y;
+  va_end(ap);
+  return sum;
+}
+
+double test_3s(__short3 *in) {
+// CHECK: test_3s
+// CHECK: call arm_aapcscc double (i32, ...)* @varargs_vec_3s(i32 3, <2 x i32> %2)
+// APCS-GNU: test_3s
+// APCS-GNU: call double (i32, ...)* @varargs_vec_3s(i32 3, <2 x i32> %2)
+  return varargs_vec_3s(3, *in);
+}
+
+double varargs_vec_5s(int fixed, ...) {
+// CHECK: varargs_vec_5s
+// CHECK: %c5 = alloca <5 x i16>, align 16
+// CHECK: %var.align = alloca <5 x i16>
+// CHECK: %3 = and i32 %2, -8
+// CHECK: %ap.align = inttoptr i32 %3 to i8*
+// CHECK: %ap.next = getelementptr i8* %ap.align, i32 16
+// CHECK: %4 = bitcast <5 x i16>* %var.align to i8*
+// CHECK: call void @llvm.memcpy
+// CHECK: %5 = load <5 x i16>* %var.align
+// APCS-GNU: varargs_vec_5s
+// APCS-GNU: %c5 = alloca <5 x i16>, align 16
+// APCS-GNU: %var.align = alloca <5 x i16>
+// APCS-GNU: %ap.next = getelementptr i8* %ap.cur, i32 16
+// APCS-GNU: %1 = bitcast <5 x i16>* %var.align to i8*
+// APCS-GNU: call void @llvm.memcpy
+// APCS-GNU: %2 = load <5 x i16>* %var.align
+  va_list ap;
+  double sum = fixed;
+  va_start(ap, fixed);
+  __short5 c5 = va_arg(ap, __short5);
+  sum = sum + c5.x + c5.y;
+  va_end(ap);
+  return sum;
+}
+
+double test_5s(__short5 *in) {
+// CHECK: test_5s
+// CHECK: call arm_aapcscc double (i32, ...)* @varargs_vec_5s(i32 5, <4 x i32> %3)
+// APCS-GNU: test_5s
+// APCS-GNU: call double (i32, ...)* @varargs_vec_5s(i32 5, <4 x i32> %3)
+  return varargs_vec_5s(5, *in);
+}
Index: lib/CodeGen/TargetInfo.cpp
===================================================================
--- lib/CodeGen/TargetInfo.cpp	(revision 165846)
+++ lib/CodeGen/TargetInfo.cpp	(working copy)
@@ -2797,6 +2797,7 @@
 
   ABIArgInfo classifyReturnType(QualType RetTy) const;
   ABIArgInfo classifyArgumentType(QualType RetTy) const;
+  bool isIllegalVectorType(QualType Ty) const;
 
   virtual void computeInfo(CGFunctionInfo &FI) const;
 
@@ -2943,6 +2944,27 @@
 }
 
 ABIArgInfo ARMABIInfo::classifyArgumentType(QualType Ty) const {
+  // Handle illegal vector types here.
+  if (isIllegalVectorType(Ty)) {
+    uint64_t Size = getContext().getTypeSize(Ty);
+    if (Size <= 32) {
+      llvm::Type *ResType =
+          llvm::Type::getInt32Ty(getVMContext());
+      return ABIArgInfo::getDirect(ResType);
+    }
+    if (Size == 64) {
+      llvm::Type *ResType = llvm::VectorType::get(
+          llvm::Type::getInt32Ty(getVMContext()), 2);
+      return ABIArgInfo::getDirect(ResType);
+    }
+    if (Size == 128) {
+      llvm::Type *ResType = llvm::VectorType::get(
+          llvm::Type::getInt32Ty(getVMContext()), 4);
+      return ABIArgInfo::getDirect(ResType);
+    }
+    return ABIArgInfo::getIndirect(0, /*ByVal=*/false);
+  }
+
   if (!isAggregateTypeForABI(Ty)) {
     // Treat an enum type as its underlying type.
     if (const EnumType *EnumTy = Ty->getAs<EnumType>())
@@ -3159,6 +3181,21 @@
   return ABIArgInfo::getIndirect(0);
 }
 
+/// isIllegalVector - check whether Ty is an illegal vector type.
+bool ARMABIInfo::isIllegalVectorType(QualType Ty) const {
+  if (const VectorType *VT = Ty->getAs<VectorType>()) {
+    // Check whether VT is legal.
+    unsigned NumElements = VT->getNumElements();
+    uint64_t Size = getContext().getTypeSize(VT);
+    // NumElements should be power of 2.
+    if ((NumElements & (NumElements - 1)) != 0)
+      return true;
+    // Size should be greater than 32 bits.
+    return Size <= 32;
+  }
+  return false;
+}
+
 llvm::Value *ARMABIInfo::EmitVAArg(llvm::Value *VAListAddr, QualType Ty,
                                    CodeGenFunction &CGF) const {
   llvm::Type *BP = CGF.Int8PtrTy;
@@ -3167,27 +3204,67 @@
   CGBuilderTy &Builder = CGF.Builder;
   llvm::Value *VAListAddrAsBPP = Builder.CreateBitCast(VAListAddr, BPP, "ap");
   llvm::Value *Addr = Builder.CreateLoad(VAListAddrAsBPP, "ap.cur");
+
+  bool IsIndirect = false;
+  uint64_t Size = CGF.getContext().getTypeSize(Ty) / 8;
+  uint64_t TyAlign = CGF.getContext().getTypeAlign(Ty) / 8;
+
+  // The ABI alignment for vectors is 8 for AAPCS and 4 for APCS.
+  if (Ty->getAs<VectorType>() && Size >= 8) {
+    if (getABIKind() == ARMABIInfo::AAPCS_VFP ||
+        getABIKind() == ARMABIInfo::AAPCS)
+      TyAlign = 8;
+    else
+      TyAlign = 4;
+  }
+  // Use indirect if size of the illegal vector is bigger than 16 bytes.
+  if (isIllegalVectorType(Ty) && Size > 16) {
+    IsIndirect = true;
+    Size = 4;
+    TyAlign = 4;
+  }
+
   // Handle address alignment for type alignment > 32 bits
-  uint64_t TyAlign = CGF.getContext().getTypeAlign(Ty) / 8;
   if (TyAlign > 4) {
     assert((TyAlign & (TyAlign - 1)) == 0 &&
            "Alignment is not power of 2!");
     llvm::Value *AddrAsInt = Builder.CreatePtrToInt(Addr, CGF.Int32Ty);
     AddrAsInt = Builder.CreateAdd(AddrAsInt, Builder.getInt32(TyAlign - 1));
     AddrAsInt = Builder.CreateAnd(AddrAsInt, Builder.getInt32(~(TyAlign - 1)));
-    Addr = Builder.CreateIntToPtr(AddrAsInt, BP);
+    Addr = Builder.CreateIntToPtr(AddrAsInt, BP, "ap.align");
   }
-  llvm::Type *PTy =
-    llvm::PointerType::getUnqual(CGF.ConvertType(Ty));
-  llvm::Value *AddrTyped = Builder.CreateBitCast(Addr, PTy);
 
   uint64_t Offset =
-    llvm::RoundUpToAlignment(CGF.getContext().getTypeSize(Ty) / 8, 4);
+    llvm::RoundUpToAlignment(Size, 4);
   llvm::Value *NextAddr =
     Builder.CreateGEP(Addr, llvm::ConstantInt::get(CGF.Int32Ty, Offset),
                       "ap.next");
   Builder.CreateStore(NextAddr, VAListAddrAsBPP);
 
+  if (IsIndirect)
+    Addr = Builder.CreateLoad(Builder.CreateBitCast(Addr, BPP));
+  else if (Ty->getAs<VectorType>() && ((Size == 8 && TyAlign < 8) ||
+           (Size >= 16 && TyAlign < 16))) {
+    // We can't directly cast ap.cur to pointer to a vector type, since ap.cur
+    // may not be correctly aligned for the vector type. We create an aligned
+    // temporary space and copy the content over from ap.cur to the temporary
+    // space. This is necessary if TyAlign < 8 for Size == 8, or
+    // TyAlign < 16 for Size >= 16.
+    llvm::Type *I8PtrTy = Builder.getInt8PtrTy();
+    CharUnits CharSize = getContext().getTypeSizeInChars(Ty);
+    llvm::Value *AlignedTemp = CGF.CreateTempAlloca(CGF.ConvertType(Ty),
+                                                    "var.align");
+    llvm::Value *Dst = Builder.CreateBitCast(AlignedTemp, I8PtrTy);
+    llvm::Value *Src = Builder.CreateBitCast(Addr, I8PtrTy);
+    Builder.CreateMemCpy(Dst, Src,
+        llvm::ConstantInt::get(CGF.IntPtrTy, CharSize.getQuantity()),
+        TyAlign, false);
+    Addr = AlignedTemp; //The content is in aligned location.
+  }
+  llvm::Type *PTy =
+    llvm::PointerType::getUnqual(CGF.ConvertType(Ty));
+  llvm::Value *AddrTyped = Builder.CreateBitCast(Addr, PTy);
+
   return AddrTyped;
 }
 
