Index: osprey/be/cg/cg.h
===================================================================
--- osprey/be/cg/cg.h	(revision 3368)
+++ osprey/be/cg/cg.h	(working copy)
@@ -82,6 +82,8 @@
 extern struct tn*  Local_Dynamic_TLS_Base;    // return value for __get_tls_addr
 extern BOOL PU_References_GOT; // for -m32 -fpic
 extern BOOL PU_has_avx128; // cause emit of vzeroupper
+extern BOOL PU_has_builtin_apply_args; // __builtin_apply_args
+extern BOOL PU_has_builtin_apply; // __builtin_apply
 #endif
 
 extern BOOL CG_PU_Has_Feedback;
Index: osprey/be/cg/calls.cxx
===================================================================
--- osprey/be/cg/calls.cxx	(revision 3368)
+++ osprey/be/cg/calls.cxx	(working copy)
@@ -623,6 +623,12 @@
     ENTRYINFO_sp_adj(ent_info) = OPS_last(&ops);
 #endif //ABI_PROPERTY_stack_ptr
 
+#ifdef TARG_X8664
+    if (PU_has_builtin_apply_args) {
+        Setup_Builtin_Apply_Args(&ops);
+    }
+#endif
+
 #ifdef TARG_SL
     // insert break after sp adjust
     if (DEBUG_Stack_Check & STACK_ENTRY_CHECK) {
Index: osprey/be/cg/localize.cxx
===================================================================
--- osprey/be/cg/localize.cxx	(revision 3368)
+++ osprey/be/cg/localize.cxx	(working copy)
@@ -77,6 +77,7 @@
 #include "cg_internal.h"
 #include "targ_sim.h"
 #include "whirl2ops.h"
+#include "cg.h"
 #if defined(TARG_PPC32)
 #include <queue>
 #include <set>
@@ -462,6 +463,11 @@
 		else if (def && is_func_arg && !BB_call(current_bb)) {
 			Localize_Global_Param_Reg (current_bb, tn);
 		} 
+#ifdef TARG_X8664
+                else if (def && is_func_retval && PU_has_builtin_apply) {
+                        ;       // okay
+                }
+#endif
 		else if (def && is_func_retval && !BB_exit(current_bb)) {
 			Localize_Global_Return_Reg_Def (current_bb, tn);
 		}
@@ -479,6 +485,9 @@
 		else if (!def && regnum == RDX &&
 			 BB_entry(current_bb) && BB_handler(current_bb))
 		  ;   // okay because RAX and RDX will be saved at the entry of a handler
+                else if (!def && regnum == RDX &&
+                        PU_has_builtin_apply_args && BB_entry(current_bb))
+                    ;  // okay because RDX will be saved at the function entry
                 else if (!def && Is_Target_32bit() && BB_like_entry(current_bb) &&
                           ( (regnum==RAX && TY_register_parm( Get_Current_PU_TY() ) > 0 ) ||
                             (regnum==RDX && TY_register_parm( Get_Current_PU_TY() ) > 1 ) ||
Index: osprey/be/cg/cgexp.h
===================================================================
--- osprey/be/cg/cgexp.h	(revision 3368)
+++ osprey/be/cg/cgexp.h	(working copy)
@@ -351,6 +351,10 @@
 			 TN *result2 = NULL, TN *src2 = NULL);
 #endif
 
+#ifdef TARG_X8664
+extern void Setup_Builtin_Apply_Args(OPS *ops);
+#endif
+
 /* Predicate manipulation routines.
  *
  * Most if not all of these routines take two result TNs. The second
Index: osprey/be/cg/x8664/expand.cxx
===================================================================
--- osprey/be/cg/x8664/expand.cxx	(revision 3368)
+++ osprey/be/cg/x8664/expand.cxx	(working copy)
@@ -100,6 +100,7 @@
 #include "targ_const_private.h"
 #include "config_opt.h" /* For Force_IEEE_Comparisons */
 #include "intrn_info.h" // for INTRN_rt_name
+#include "cg.h"
 #ifdef KEY
 #include "ebo.h"
 #endif
@@ -145,6 +146,9 @@
 static TN *Exp_Lock_Release     (TN *addr, TYPE_ID mtype, OPS *ops);
 static TN *Exp_Compare_and_Swap (TN *addr, TN *opnd1, TN *opnd2, TYPE_ID mtype, OPS *ops); 
 static TN *Exp_Bool_Compare_and_Swap (TN *addr, TN *opnd1, TN *opnd2, TYPE_ID mtype, OPS *ops); 
+static TN *Exp_Builtin_Apply_Args (OPS *ops);
+static TN *Exp_Builtin_Apply    (TN *addr, TN *args, TN *argsize, OPS *ops);
+static TN *Exp_Builtin_Return   (TN *result, OPS *ops);
 
 static void Store_To_Temp_Stack(TYPE_ID desc, TN *src, const char *sym_name, TN **mem_base_tn,
 		    TN **mem_ofst_tn, OPS *ops);
@@ -6615,9 +6619,9 @@
 {
   TY_IDX tyi;
   TY& ty = New_TY(tyi);
-  TY_Init(ty, 144, KIND_STRUCT, MTYPE_M,
+  TY_Init(ty, Is_Target_32bit() ? 144 : 192, KIND_STRUCT, MTYPE_M,
           Save_Str("__apply_arg"));
-  Set_TY_align(tyi, 8);
+  Set_TY_align(tyi, 16);
   tmp_apply_arg = New_ST(CURRENT_SYMTAB);
   ST_Init(tmp_apply_arg, TY_name_idx(ty),
           CLASS_VAR, SCLASS_AUTO, EXPORT_LOCAL, tyi);
@@ -9465,6 +9469,15 @@
   case INTRN_BOOL_COMPARE_AND_SWAP_I8:
     result = Exp_Bool_Compare_and_Swap(op0, op1, op2, MTYPE_I8, ops);
     break;
+  case INTRN_APPLY_ARGS:
+    result = Exp_Builtin_Apply_Args(ops);
+    break;
+  case INTRN_APPLY:
+    result = Exp_Builtin_Apply(op0, op1, op2, ops);
+    break;
+  case INTRN_RETURN:
+    result = Exp_Builtin_Return(op0, ops);
+    break;
 
   default:  
     FmtAssert(FALSE, ("Exp_Intrinsic_Call: unimplemented"));
@@ -9472,6 +9485,183 @@
   return result;
 }
 
+
+/* __builtin_apply_args must be expanded at the beginning of the function,
+ * not at the point where it is called. */
+void
+Setup_Builtin_Apply_Args(OPS *ops) {
+    // Appends to 'ops' the stores that fill tmp_apply_arg: the dedicated argument
+    // registers go into the slots after slot 0, and slot 0 gets an FP-relative address.
+    FmtAssert(PU_has_builtin_apply_args,
+            ("Setup_Builtin_Apply_Args: __builtin_apply_args is not available in current PU"));
+    TYPE_ID type = Is_Target_32bit() ? MTYPE_U4 : MTYPE_U8;
+    INT size = Is_Target_32bit() ? 4 : 8;
+
+    // Store register parameters after slot 0; Exp_Builtin_Apply reloads them in the same order.
+    INT ofst = size;
+    if (Is_Target_32bit()) { // 32 bits
+        REGISTER int_regs[] = {RAX, RDX, RCX};
+        for (int i = 0; i < sizeof(int_regs) / sizeof(REGISTER); i++) {
+            TN *tn = Build_Dedicated_TN(ISA_REGISTER_CLASS_integer, int_regs[i], size);
+            Exp_Store(type, tn, tmp_apply_arg, ofst, ops, 0);
+            ofst += size;
+        }
+        if (Is_Target_SSE()) {
+            REGISTER sse_regs[] = {XMM0, XMM1, XMM2};
+            for (int i = 0; i < sizeof(sse_regs) / sizeof(REGISTER); i++) {
+                TN *tn = Build_Dedicated_TN(ISA_REGISTER_CLASS_float, sse_regs[i] - Float_Preg_Min_Offset + 1, 16);
+                Build_OP(TOP_stups, tn, FP_TN, Gen_Symbol_TN(tmp_apply_arg, ofst, 0), ops);
+                ofst += 16;
+            }
+        }
+        if (Is_Target_MMX()) {
+            REGISTER mmx_regs[] = {MM0, MM1, MM2};
+            for (int i = 0; i < sizeof(mmx_regs) / sizeof(REGISTER); i++) {
+                TN *tn = Build_Dedicated_TN(ISA_REGISTER_CLASS_mmx, mmx_regs[i] - MMX_Preg_Min_Offset + 1, 8);
+                Build_OP(TOP_store64_fm, tn, FP_TN, Gen_Symbol_TN(tmp_apply_arg, ofst, 0), ops);
+                ofst += 8;
+            }
+        }
+    } else { // 64 bits
+        REGISTER int_regs[] = {RAX, RDX, RCX, RSI, RDI, R8, R9};
+        for (int i = 0; i < sizeof(int_regs) / sizeof(REGISTER); i++) {
+            TN *tn = Build_Dedicated_TN(ISA_REGISTER_CLASS_integer, int_regs[i], size);
+            Exp_Store(type, tn, tmp_apply_arg, ofst, ops, 0);
+            ofst += size;
+        }
+        REGISTER sse_regs[] = {XMM0, XMM1, XMM2, XMM3, XMM4, XMM5, XMM6, XMM7 };
+        for (int i = 0; i < sizeof(sse_regs) / sizeof(REGISTER); i++) {
+            TN *tn = Build_Dedicated_TN(ISA_REGISTER_CLASS_float, sse_regs[i] - Float_Preg_Min_Offset + 1, 16);
+            Build_OP(TOP_stdqu, tn, FP_TN, Gen_Symbol_TN(tmp_apply_arg, ofst, 0), ops);
+            ofst += 16;
+        }
+    }
+    // Slot 0: address FP_Sym + 8 (32-bit) or + 16 (64-bit); presumably where the stack-passed arguments start.
+    TN* function_args = Gen_Register_TN(ISA_REGISTER_CLASS_integer, size);
+    Exp_Lda(type, function_args, FP_Sym, Is_Target_32bit() ? 8 : 16, OPR_STID, ops);
+    Exp_Store(type, function_args, tmp_apply_arg, 0, ops, 0);
+}
+
+TN*
+Exp_Builtin_Apply_Args(OPS *ops)
+{
+    const BOOL is_32bit = Is_Target_32bit();
+    // First expansion in this PU: raise the flag that makes the entry code
+    // call Setup_Builtin_Apply_Args, then run Generate_Temp_Apply_Arg.
+    // Later expansions in the same PU skip this step.
+    if (PU_has_builtin_apply_args == FALSE) {
+        PU_has_builtin_apply_args = TRUE;
+        Generate_Temp_Apply_Arg();
+    }
+
+    // The value of the intrinsic is FP_TN plus the symbol TN of tmp_apply_arg.
+    TN *area_addr = Build_TN_Of_Mtype(is_32bit ? MTYPE_U4 : MTYPE_U8);
+    Exp_OP2(is_32bit ? OPC_I4ADD : OPC_I8ADD, area_addr, FP_TN, Gen_Symbol_TN(tmp_apply_arg, 0, 0), ops);
+    return area_addr;
+}
+
+TN*
+Exp_Builtin_Apply(TN *addr, TN *args, TN *argsize, OPS *ops)
+{
+    // Reloads the dedicated argument registers from the block 'args' points to.
+    // 'addr' and 'argsize' are not referenced here; the result is always NULL.
+    PU_has_builtin_apply = TRUE;
+
+    INT size = Is_Target_32bit() ? 4 : 8;
+
+    // All OPs generated here must set volatile flag, otherwise 
+    // EBO_Remove_Unused_Ops may remove them later
+
+    // Restore register parameters; they start after the pointer-sized slot 0
+    INT ofst = size;
+    if (Is_Target_32bit()) { // 32 bits
+        REGISTER int_regs[] = {RAX, RDX, RCX};
+        for (int i = 0; i < sizeof(int_regs) / sizeof(REGISTER); i++) {
+            TN *tn = Build_Dedicated_TN(ISA_REGISTER_CLASS_integer, int_regs[i], size);
+            OP *op = Mk_OP(TOP_ld32, tn, args, Gen_Literal_TN(ofst, 4));
+            Set_OP_volatile(op);
+            OPS_Append_Op(ops, op);
+            ofst += size;
+        }
+        if (Is_Target_SSE()) {
+            REGISTER sse_regs[] = {XMM0, XMM1, XMM2};
+            for (int i = 0; i < sizeof(sse_regs) / sizeof(REGISTER); i++) {
+                TN *tn = Build_Dedicated_TN(ISA_REGISTER_CLASS_float, sse_regs[i] - Float_Preg_Min_Offset + 1, 16);
+                OP *op = Mk_OP(TOP_ldups, tn, args, Gen_Literal_TN(ofst, 4));
+                Set_OP_volatile(op);
+                OPS_Append_Op(ops, op);
+                ofst += 16;
+            }
+        }
+        if (Is_Target_MMX()) {
+            REGISTER mmx_regs[] = {MM0, MM1, MM2};
+            for (int i = 0; i < sizeof(mmx_regs) / sizeof(REGISTER); i++) {
+                TN *tn = Build_Dedicated_TN(ISA_REGISTER_CLASS_mmx, mmx_regs[i] - MMX_Preg_Min_Offset + 1, 8);
+                OP *op = Mk_OP(TOP_ld64_2m, tn, args, Gen_Literal_TN(ofst, 4));
+                Set_OP_volatile(op);
+                OPS_Append_Op(ops, op);
+                ofst += 8;
+            }
+            // Need to emit emms when mixing MMX and x87 FPU instructions
+            Build_OP(TOP_emms, ops);
+        }
+    } else { // 64 bits
+        REGISTER int_regs[] = {RAX, RDX, RCX, RSI, RDI, R8, R9};
+        for (int i = 0; i < sizeof(int_regs) / sizeof(REGISTER); i++) {
+            TN *tn = Build_Dedicated_TN(ISA_REGISTER_CLASS_integer, int_regs[i], size);
+            OP *op = Mk_OP(TOP_ld64, tn, args, Gen_Literal_TN(ofst, 4));
+            Set_OP_volatile(op);
+            OPS_Append_Op(ops, op);
+            ofst += size;
+        }
+        REGISTER sse_regs[] = {XMM0, XMM1, XMM2, XMM3, XMM4, XMM5, XMM6, XMM7 };
+        for (int i = 0; i < sizeof(sse_regs) / sizeof(REGISTER); i++) {
+            TN *tn = Build_Dedicated_TN(ISA_REGISTER_CLASS_float, sse_regs[i] - Float_Preg_Min_Offset + 1, 16);
+            OP *op = Mk_OP(TOP_lddqu, tn, args, Gen_Literal_TN(ofst, 4));
+            Set_OP_volatile(op);
+            OPS_Append_Op(ops, op);
+            ofst += 16;
+        }
+        // with variable arguments, %rax contains the number of vector registers used; this replaces the %rax value loaded above
+        OP *op = Mk_OP(TOP_ldc64, Build_Dedicated_TN(ISA_REGISTER_CLASS_integer, RAX, size),
+                Gen_Literal_TN(sizeof(sse_regs) / sizeof(REGISTER), 8));
+        Set_OP_volatile(op);
+        OPS_Append_Op(ops, op);
+    }
+    return NULL;
+}
+
+TN*
+Exp_Builtin_Return(TN *result, OPS *ops)
+{
+    // Reloads RAX, RDX, ST0 and XMM0 from the block 'result' points to: two
+    // integer words, then two slots 16 bytes wide each.  Always returns NULL.
+    const BOOL is_32bit = Is_Target_32bit();
+    const INT word = is_32bit ? 4 : 8;
+    const TOP int_load = is_32bit ? TOP_ld32 : TOP_ld64;
+    const TOP sse_load = is_32bit ? TOP_ldups : TOP_lddqu;
+    const INT rax_ofst = 0;
+    const INT rdx_ofst = rax_ofst + word;
+    const INT st0_ofst = rdx_ofst + word;
+    const INT xmm0_ofst = st0_ofst + 16;
+
+    // Integer pair: RAX from the first word, RDX from the second.
+    TN *rax_tn = Build_Dedicated_TN(ISA_REGISTER_CLASS_integer, RAX, word);
+    Build_OP(int_load, rax_tn, result, Gen_Literal_TN(rax_ofst, 4), ops);
+    TN *rdx_tn = Build_Dedicated_TN(ISA_REGISTER_CLASS_integer, RDX, word);
+    Build_OP(int_load, rdx_tn, result, Gen_Literal_TN(rdx_ofst, 4), ops);
+
+    // x87 register ST0, loaded with fldt.
+    TN *st0_tn = Build_Dedicated_TN(ISA_REGISTER_CLASS_x87, ST0 - X87_Preg_Min_Offset + 1, 8);
+    Build_OP(TOP_fldt, st0_tn, result, Gen_Literal_TN(st0_ofst, 4), ops);
+
+    // SSE register XMM0, loaded with an unaligned 16-byte load.
+    TN *xmm0_tn = Build_Dedicated_TN(ISA_REGISTER_CLASS_float, XMM0 - Float_Preg_Min_Offset + 1, 16);
+    Build_OP(sse_load, xmm0_tn, result, Gen_Literal_TN(xmm0_ofst, 4), ops);
+
+    return NULL;
+}
+
 /* Expansion of INTRN_SAVEXMMS into TOP_savexmms pseudo instruction */
 void
 Exp_Savexmms_Intrinsic(WN *intrncall, TN *rax_tn, LABEL_IDX *label, OPS *ops) 
Index: osprey/be/cg/cg.cxx
===================================================================
--- osprey/be/cg/cg.cxx	(revision 3368)
+++ osprey/be/cg/cg.cxx	(working copy)
@@ -193,6 +193,8 @@
 TN*  Local_Dynamic_TLS_Base;
 BOOL PU_References_GOT;  // for -m32 -fpic
 BOOL PU_has_avx128;      // cause emit of vzeroupper 
+BOOL PU_has_builtin_apply_args; // __builtin_apply_args
+BOOL PU_has_builtin_apply; // __builtin_apply
 #endif
 
 BOOL edge_done = FALSE;
@@ -277,6 +279,9 @@
 
   PU_References_GOT = FALSE;
 
+  PU_has_builtin_apply_args = FALSE;
+  PU_has_builtin_apply = FALSE;
+
   if (CG_localize_x87_tns && Is_Target_SSE2()) {
     fprintf(stderr,
 	    "Ignoring CG_localize_x87_tns since it has no effect under SSE2\n");
Index: osprey/be/com/wn_lower.cxx
===================================================================
--- osprey/be/com/wn_lower.cxx	(revision 3368)
+++ osprey/be/com/wn_lower.cxx	(working copy)
@@ -11567,11 +11567,21 @@
     if (TY_is_varargs(call_ty) ||
 	(! TY_has_prototype(call_ty) && 
 	 (callee_st == NULL || ST_sclass(callee_st) != SCLASS_TEXT))) {
-      WN *sse_cnt_wn = WN_Intconst(MTYPE_I4, MIN(8, sse_args));
-      WN *sse_cnt_stid = WN_StidIntoPreg(MTYPE_I4, RAX, Int32_Preg, sse_cnt_wn);
-	
-      WN_Set_Linenum(sse_cnt_stid, srcpos);
-      WN_INSERT_BlockLast(callblock, sse_cnt_stid);
+
+      // check whether the previous WN is an INTRINSIC_CALL to APPLY or not
+      BOOL skip = FALSE;
+      WN *previous_wn = WN_prev(tree);
+      if (previous_wn != NULL && WN_operator_is(previous_wn, OPR_INTRINSIC_CALL) &&
+              WN_intrinsic(previous_wn) == INTRN_APPLY) 
+          skip = TRUE;
+
+      if (!skip) {
+          WN *sse_cnt_wn = WN_Intconst(MTYPE_I4, MIN(8, sse_args));
+          WN *sse_cnt_stid = WN_StidIntoPreg(MTYPE_I4, RAX, Int32_Preg, sse_cnt_wn);
+
+          WN_Set_Linenum(sse_cnt_stid, srcpos);
+          WN_INSERT_BlockLast(callblock, sse_cnt_stid);
+      }
     }
   }
 #endif
Index: osprey/common/com/symtab_access.h
===================================================================
--- osprey/common/com/symtab_access.h	(revision 3368)
+++ osprey/common/com/symtab_access.h	(working copy)
@@ -1113,6 +1113,13 @@
 inline void
 Clear_PU_nothrow (PU& pu)		{ pu.flags &= ~PU_NOTHROW; }
 
+inline BOOL                             // nonzero iff PU_HAS_APPLY_ARGS is set in pu.flags
+PU_has_apply_args (const PU& pu)        { return (pu.flags & PU_HAS_APPLY_ARGS) != 0;}
+inline void                             // sets the PU_HAS_APPLY_ARGS bit in pu.flags
+Set_PU_has_apply_args (PU& pu)          { pu.flags |= PU_HAS_APPLY_ARGS; }
+inline void                             // clears the PU_HAS_APPLY_ARGS bit in pu.flags
+Clear_PU_has_apply_args (PU& pu)        { pu.flags &= ~PU_HAS_APPLY_ARGS; }
+
 inline UINT64
 PU_src_lang (const PU& pu)		{ return pu.src_lang; }
 
Index: osprey/common/com/symtab_defs.h
===================================================================
--- osprey/common/com/symtab_defs.h	(revision 3368)
+++ osprey/common/com/symtab_defs.h	(working copy)
@@ -750,6 +750,7 @@
 #define PU_HAS_GOTO_OUTER_BLOCK	0x0001000000000000LL // has GOTO_OUTER_BLOCK stmt
 #define PU_IS_CDECL             0x0002000000000000LL // __attribute__((cdecl)) semantic
 #define PU_NOTHROW              0x0004000000000000LL // doesn't throw, e.g. decl as "void foo() throw()".
+#define PU_HAS_APPLY_ARGS       0x0008000000000000LL // __builtin_apply_args
 
 enum PU_SRC_LANG_FLAGS
 {
Index: osprey/wgen/wgen_expr.cxx
===================================================================
--- osprey/wgen/wgen_expr.cxx	(revision 3368)
+++ osprey/wgen/wgen_expr.cxx	(working copy)
@@ -8714,6 +8714,7 @@
 		ErrMsg(EC_Unimplemented_Feature, "__builtin_apply_args",
 		  Orig_Src_File_Name?Orig_Src_File_Name:Src_File_Name, lineno);
 #endif
+                Set_PU_has_apply_args(Get_Current_PU());
 		Set_PU_has_alloca(Get_Current_PU());
 		iopc = INTRN_APPLY_ARGS;
 		break;	
@@ -8788,16 +8789,21 @@
 		  WGEN_Stmt_Append (alloca_kid1, Get_Srcpos());
 
 		  // The src is actually in 0(kid1)
-		  kid1 = 
-		    WN_CreateIload (OPR_ILOAD, MTYPE_I4, MTYPE_I4, 0,
-				    MTYPE_To_TY(MTYPE_I4), 
-				    Make_Pointer_Type(MTYPE_To_TY(MTYPE_U8)), 
-				    kid1, 0);
+                  if (Is_Target_32bit())
+                      kid1 = WN_CreateIload (OPR_ILOAD, MTYPE_I4, MTYPE_I4, 0,
+                              MTYPE_To_TY(MTYPE_I4), 
+                              Make_Pointer_Type(MTYPE_To_TY(MTYPE_U8)), 
+                              kid1, 0);
+                  else
+                      kid1 = WN_CreateIload (OPR_ILOAD, MTYPE_I8, MTYPE_I8, 0,
+                              MTYPE_To_TY(MTYPE_I8), 
+                              Make_Pointer_Type(MTYPE_To_TY(MTYPE_U8)), 
+                              kid1, 0);
 		  load_wn = 
 		    WN_CreateMload (0, 
 				    Make_Pointer_Type(MTYPE_To_TY(MTYPE_U8)), 
 				    kid1, kid2);
-		  sp_addr = WN_LdidPreg(MTYPE_U4, 29); // $sp
+		  sp_addr = WN_LdidPreg(Is_Target_32bit() ? MTYPE_U4 : MTYPE_U8, Stack_Pointer_Preg_Offset); // $sp
 		  WGEN_Stmt_Append(WN_CreateMstore (0, 
 			      Make_Pointer_Type(MTYPE_To_TY(MTYPE_U8)), 
 						   load_wn,
@@ -8810,33 +8816,69 @@
 		  call_wn = WN_Create (OPR_ICALL, ret_mtype, MTYPE_V, 1);
 		  WN_kid(call_wn, 0) = 
 		    WGEN_Expand_Expr (gs_tree_value (gs_tree_operand (exp, 1)));
+#if defined(TARG_X8664)
+                  // We assume the function returns a floating-point value so that
+                  // Repair_Call_BB does not manually insert a "fldz" OP later.
+                  WN_set_ty (call_wn, Make_Function_Type(MTYPE_To_TY(MTYPE_F10)));
+#else
 		  WN_set_ty (call_wn, TY_pointed(Get_TY(
 			    gs_tree_type (gs_tree_value(gs_tree_operand (exp, 1))))));
+#endif
 		  WGEN_Stmt_Append (call_wn, Get_Srcpos());		
 
 		  TY_IDX tyi;
 		  TY& ty = New_TY(tyi);
-		  TY_Init(ty, 16, KIND_STRUCT, MTYPE_M,
+		  TY_Init(ty, 48, KIND_STRUCT, MTYPE_M,
 			  Save_Str("__apply"));
 		  Set_TY_align(tyi, 8);
 		  ST *tmpst = New_ST(CURRENT_SYMTAB);
 		  ST_Init(tmpst, TY_name_idx(ty),
 			  CLASS_VAR, SCLASS_AUTO, EXPORT_LOCAL, tyi);
 		  Set_ST_is_temp_var(tmpst);
+
 		  WN *load, *store;
-		  load = WN_LdidPreg(MTYPE_I8, 2); // $v0
-		  store = WN_Stid(MTYPE_I8, 
-				  (WN_OFFSET)0, tmpst, Spill_Int_Type, load);
+                  WN_OFFSET offset = 0;
+
+#if defined(TARG_X8664)
+                  // Both %rax and %rdx must be saved on x8664; however, using MTYPE_M as
+                  // the return type fails because of a check in the lower_return_mstid function.
+                  if (Is_Target_64bit()) {
+                      TY_IDX int_return_tyi;
+                      TY& int_return_ty = New_TY(int_return_tyi);
+                      TY_Init(int_return_ty, 16, KIND_STRUCT, MTYPE_M, STR_IDX_ZERO);
+                      Set_TY_align(int_return_tyi, 8);
+
+                      load = WN_Ldid(MTYPE_M, -1, Return_Val_Preg, int_return_tyi);
+                      store = WN_Stid(MTYPE_M, 
+                              offset, tmpst, int_return_tyi, load);
+                      offset += int_return_ty.size;
+                  } else {
+#endif
+                  load = WN_Ldid(MTYPE_I8, -1, Return_Val_Preg, MTYPE_To_TY(MTYPE_I8));
+                  store = WN_Stid(MTYPE_I8, 
+                          offset, tmpst, Spill_Int_Type, load);
+                  offset += 8;
+#if defined(TARG_X8664)
+                  }
+#endif
 		  WGEN_Stmt_Append (store, Get_Srcpos());		
 #if !defined(TARG_SL)
 		  // SL do not have float-point register
-		  load = WN_LdidPreg(MTYPE_F8, 32); //$f0
-		  store = WN_Stid(MTYPE_F8, 
-				  (WN_OFFSET)8, tmpst, Spill_Int_Type, load);
+		  load = WN_Ldid(MTYPE_F10, -1, Return_Val_Preg, MTYPE_To_TY(MTYPE_F10)); 
+		  store = WN_Stid(MTYPE_F10, 
+				  offset, tmpst, Spill_Int_Type, load);
 		  WGEN_Stmt_Append (store, Get_Srcpos());		
+                  offset += 16;
 #endif
-		  wn = WN_Lda (Pointer_Mtype, 0, tmpst, 
-			       Make_Pointer_Type (ST_type(tmpst), FALSE));
+#if defined(TARG_X8664)
+                  // xmm0
+		  load = WN_Ldid(MTYPE_V16F8, -1, Return_Val_Preg, MTYPE_To_TY(MTYPE_V16F8)); 
+		  store = WN_Stid(MTYPE_V16F8, 
+				  offset, tmpst, Spill_Int_Type, load);
+		  WGEN_Stmt_Append (store, Get_Srcpos());		
+                  offset += 16;
+#endif
+		  wn = WN_Lda (Pointer_Mtype, 0, tmpst);
 
 		  // Dealloca/Restore SP
 		  WN *dealloca_wn = WN_CreateDealloca (2);
@@ -8858,6 +8900,28 @@
 #endif
 		Set_PU_has_alloca(Get_Current_PU());
 		iopc = INTRN_RETURN;
+
+                call_wn = WN_Create (OPR_INTRINSIC_CALL, ret_mtype, MTYPE_V, 
+                        num_args);
+                WN_intrinsic (call_wn) = iopc;
+                WN_Set_Linenum (call_wn, Get_Srcpos());
+                WN_Set_Call_Default_Flags (call_wn);
+                i = 0;
+                for (list = gs_tree_operand (exp, 1);
+                        list;
+                        list = gs_tree_chain (list)) {
+                    arg_wn     = WGEN_Expand_Expr (gs_tree_value (list));
+                    arg_ty_idx = Get_TY(gs_tree_type(gs_tree_value(list)));
+                    arg_mtype  = TY_mtype(arg_ty_idx);
+                    arg_wn = WN_CreateParm (Mtype_comparison (arg_mtype), 
+                            arg_wn,
+                            arg_ty_idx, WN_PARM_BY_VALUE);
+                    WN_kid (call_wn, i++) = arg_wn;
+                }
+                WGEN_Stmt_Append (call_wn, Get_Srcpos());
+                WGEN_Stmt_Append (WN_CreateReturn(), Get_Srcpos());
+
+                whirl_generated = TRUE;
 		break;	
 
                 // Implement built-in versions of the ISO C99 floating point
Index: osprey/ipa/main/analyze/ipa_cg.cxx
===================================================================
--- osprey/ipa/main/analyze/ipa_cg.cxx	(revision 3368)
+++ osprey/ipa/main/analyze/ipa_cg.cxx	(working copy)
@@ -3701,6 +3701,7 @@
 fprintf(fp, "Reason37: formal parameter is a loop index\n");
 fprintf(fp, "Reason38: not inlining nested functions\n");
 fprintf(fp, "Reason39: not inlining non-tiny noreturn functions\n");
+fprintf(fp, "Reason40: not inlining __builtin_apply_args functions\n");
 #endif
 fprintf(fp, SBar);
   
Index: osprey/ipa/main/analyze/ipa_inline.cxx
===================================================================
--- osprey/ipa/main/analyze/ipa_inline.cxx	(revision 3368)
+++ osprey/ipa/main/analyze/ipa_inline.cxx	(working copy)
@@ -1787,6 +1787,13 @@
             reason = "not inlining non-tiny noreturn functions";
             ed->Set_reason_id (39);
     }
+#ifdef KEY
+    else if (PU_has_apply_args(callee->Get_PU())) {
+            result = FALSE;
+            reason = "not inlining __builtin_apply_args functions";
+            ed->Set_reason_id(40);
+    }
+#endif
     // The following else-if must be last
     else if (!IPA_Enable_Lang) {
 	if ((callee->Summary_Proc()->Get_lang() == LANG_F77) || 
@@ -2527,6 +2534,8 @@
             return "not inlining nested functions";
   case 39:
             return "not inlining non-tiny noreturn functions";
+  case 40:
+            return "not inlining __builtin_apply_args functions";
   default:
     return "unknown reason";
   } 
