Author: mberg Date: 2011-07-13 14:35:48 -0400 (Wed, 13 Jul 2011) New Revision: 3690
Modified: trunk/osprey/be/cg/bb.h trunk/osprey/be/cg/cg.cxx trunk/osprey/be/cg/cg_loop.cxx trunk/osprey/be/cg/ebo.cxx trunk/osprey/be/cg/ebo_special.h trunk/osprey/be/cg/lra.cxx trunk/osprey/be/cg/lra.h trunk/osprey/be/cg/oputil.cxx trunk/osprey/be/cg/x8664/cgemit_targ.cxx trunk/osprey/be/cg/x8664/cgtarget.cxx trunk/osprey/be/cg/x8664/ebo_special.cxx trunk/osprey/be/cg/x8664/expand.cxx trunk/osprey/common/targ_info/isa/x8664/isa_print.cxx trunk/osprey/common/targ_info/isa/x8664/isa_properties.cxx trunk/osprey/common/targ_info/proc/x8664/orochi_si.cxx Log: These are the 4.2.5.1 updates including loop coalescing, register pressure utility updates/additions, movddup updates, a jump table update for pic code, fma disassociation, BD scheduling info updates and CG dep graph updates. CR by Sun. Modified: trunk/osprey/be/cg/bb.h =================================================================== --- trunk/osprey/be/cg/bb.h 2011-07-13 02:39:29 UTC (rev 3689) +++ trunk/osprey/be/cg/bb.h 2011-07-13 18:35:48 UTC (rev 3690) @@ -486,7 +486,7 @@ #if defined(TARG_X8664) /* array of all target register classes used to supply pressure info */ INT64 offset; - bool has_regpressure[ISA_REGISTER_CLASS_MAX+1]; + INT32 has_regpressure[ISA_REGISTER_CLASS_MAX+1]; #endif #if defined(TARG_IA64) || defined(TARG_SL) || defined(TARG_MIPS) || defined(TARG_LOONGSON) INT bb_cycle; @@ -539,7 +539,7 @@ } #if defined(TARG_X8664) -inline void Set_BB_regpressure(BB *bb, bool x, ISA_REGISTER_CLASS cl) { +inline void Set_BB_regpressure(BB *bb, INT32 x, ISA_REGISTER_CLASS cl) { bb->has_regpressure[cl] = x; } #endif Modified: trunk/osprey/be/cg/cg.cxx =================================================================== --- trunk/osprey/be/cg/cg.cxx 2011-07-13 02:39:29 UTC (rev 3689) +++ trunk/osprey/be/cg/cg.cxx 2011-07-13 18:35:48 UTC (rev 3690) @@ -1494,6 +1494,7 @@ #if !defined(TARG_PPC32) // PPC IGLS_Schedule_Region bugs IGLS_Schedule_Region (TRUE /* before register allocation */); #ifdef TARG_X8664 + 
Examine_Loop_Info("after prescheduling", TRUE); void Counter_Merge (char*); if (CG_merge_counters_x86 == TRUE && CG_opt_level > 1) { if (Enable_CG_Peephole) { Modified: trunk/osprey/be/cg/cg_loop.cxx =================================================================== --- trunk/osprey/be/cg/cg_loop.cxx 2011-07-13 02:39:29 UTC (rev 3689) +++ trunk/osprey/be/cg/cg_loop.cxx 2011-07-13 18:35:48 UTC (rev 3690) @@ -190,6 +190,10 @@ #include "lra.h" #include "calls.h" +#if defined(TARG_X8664) +#include "config_lno.h" +#endif + #if defined(TARG_SL) #include "tag.h" #include "label_util.h" @@ -6947,6 +6951,136 @@ } #endif +#if defined(TARG_X8664) +BOOL LOOP_Block_Merge( LOOP_DESCR* loop ) +{ + BOOL is_merge_cand = FALSE; + BOOL fully_unrolled_segment_in_body = FALSE; + + // Test if there is no internal flow excepting the back edge + // for the collection of loop blocks if more than one. + BB_SET *bbs = LOOP_DESCR_bbset(loop); + BB *head = LOOP_DESCR_loophead(loop); + if ( ( BB_SET_Size(bbs) > 1 ) && + ( BB_innermost(head) == FALSE ) && + ( LOOP_DESCR_Find_Unique_Tail(loop) != NULL ) && + ( LOOP_DESCR_Has_Side_Entrance(loop) == FALSE ) ){ + BOOL loop_is_canonical = FALSE; + BB *bb, *succ, *pred, *bb_head, *bb_tail; + BBLIST *lst; + INT num_bbs = BB_SET_Size(bbs); + MEM_POOL_Push(&MEM_local_nz_pool); + BB **orig_bbs = TYPE_MEM_POOL_ALLOC_N(BB *, &MEM_local_nz_pool, num_bbs); + + if( sort_topologically(loop, orig_bbs) ){ + bb_head = orig_bbs[0]; + bb_tail = orig_bbs[num_bbs-1]; + BOOL loop_body_linear = TRUE; + for (INT bbi = 1; bbi < (num_bbs - 1); bbi++) { + bb = orig_bbs[bbi]; + + // The only branch must be at bb_tail + if( BB_branch_op(bb) ){ + loop_body_linear = FALSE; + break; + } + + // We are looking for an unrolled loop body which is reachable by + // the loop head. We know it is if loop_body_linear remains true. 
+ if( ( BB_unrolled_fully(bb) ) && + ( BB_length(bb) <= CG_LOOP_unrolled_size_max ) ){ + fully_unrolled_segment_in_body = TRUE; + } + } + // Now check if the loop is in canonical form + if( loop_body_linear && + ( bb_head == head ) && + ( BB_succs_len(head) == 1 ) ){ + // one of the successor blocks of bb_tail must be the head block + if( BB_branch_op(bb_tail) && fully_unrolled_segment_in_body ){ + for( lst = BB_succs(bb_tail); lst != NULL; lst = BBLIST_next(lst) ){ + if( succ = BBLIST_item(lst) ){ + if( succ == bb_head ){ + loop_is_canonical = TRUE; + break; + } + } + } + } + } + } + + // The sorted sub-graph of the loop must have only 1 branch + if( loop_is_canonical ){ + BOOL has_call = FALSE; + for (INT bbi = 0; bbi < num_bbs; bbi++) { + bb = orig_bbs[bbi]; + if( BB_call(bb) ){ + has_call = TRUE; + break; + } + } + if( has_call == FALSE ) + is_merge_cand = TRUE; + } + + // We are going to merge a flowless loop with multiple blocks into + // a single block. Scheduling, ebo and register allocation can + // take advantage of this. 
+ if( is_merge_cand ){ + INT num_merged = 0; + for (INT bbi = 1; bbi < num_bbs; bbi++) { + BBLIST *lst_next; + bb = orig_bbs[bbi]; + + // Add all of bb's ops to head + BB_Append_All(head, bb); + + if( BB_SET_MemberP(LOOP_DESCR_bbset(loop), bb) ){ + BB_MAP_Set(LOOP_DESCR_map, bb, loop); + LOOP_DESCR_Delete_BB(loop, bb); + } + + for ( lst = BB_succs(bb); lst != NULL; lst = lst_next ) { + lst_next = BBLIST_next(lst); + if( succ = BBLIST_item(lst) ){ + // preserve outgoing edges + if( bb == bb_tail ){ + if( succ == head ) { + Link_Pred_Succ_with_Prob(head, head, 1.0); + BB_branch_wn(head) = BB_branch_wn(bb_tail); // assign (was a no-op '=='): head takes over the tail's branch wn + } else { + Link_Pred_Succ_with_Prob(head, succ, 0.0); + } + } + Unlink_Pred_Succ(bb, succ); + } + } + + for ( lst = BB_preds(bb); lst != NULL; lst = lst_next ) { + lst_next = BBLIST_next(lst); + if (pred = BBLIST_item(lst)) + Unlink_Pred_Succ(pred, bb); + } + + BB_Remove_All(bb); + Remove_BB(bb); + num_merged++; + } + + // now mark it so we can optimize further + if( num_merged == (num_bbs -1) ) + Set_BB_innermost(head); + else + is_merge_cand = FALSE; + } + // now clear the top sort allocation + MEM_POOL_Pop(&MEM_local_nz_pool); + } + return is_merge_cand; +} +#endif + // Perform loop optimizations for one loop // #if defined(TARG_IA64) || defined(TARG_SL) || defined(TARG_MIPS) @@ -6987,6 +7121,18 @@ if (CG_LOOP_unroll_level == 0) return FALSE; + BOOL trace_loop_opt = Get_Trace(TP_CGLOOP, 0x4); + +#ifdef TARG_X8664 + if (LNO_Simd_Rm_Unity_Remainder && + !LOOP_Block_Merge(loop) && + trace_loop_opt) { + fprintf(TFile, "Merge Blocks failed on current loop\n"); + BB_SET_Print(LOOP_DESCR_bbset(loop), TFile); + fprintf(TFile, "\n"); + } +#endif + // if (Is_Inner_Loop(loop)) { if (!BB_innermost(LOOP_DESCR_loophead(loop))) return FALSE; @@ -6994,8 +7140,6 @@ if (Skip_Loop_For_Reason(loop)) return FALSE; - BOOL trace_loop_opt = Get_Trace(TP_CGLOOP, 0x4); - // Determine how to optimize the loop // LOOP_OPT_ACTION action = NO_LOOP_OPT; @@ -8374,13 +8518,8 @@ BOOL 
after_prescheduling, MEM_POOL *pool) { - // This func is a debug trace utility - if (Get_Trace(TP_CGLOOP, 1) == FALSE) - return; - BB *bb = LOOP_DESCR_loophead(loop); - if (BB_unrollings(bb) && - (BB_SET_Size(LOOP_DESCR_bbset(loop)) == 1)) { + if (BB_SET_Size(LOOP_DESCR_bbset(loop)) == 1) { BOOL saved_state_sched_est; BOOL toggle_sched_est = false; @@ -8416,6 +8555,7 @@ TN *tn; BOOL first_time = TRUE; BOOL changed = TRUE; + BOOL trace_general = Get_Trace(TP_CGLOOP, 2); #ifdef TARG_X8664 // now adjust the number of gpr regs as per the ABI @@ -8437,6 +8577,14 @@ &D_f, ISA_REGISTER_CLASS_float) + 1; + // This routine fires after prescheduling, so we now have + // accurate register pressure components to fill in the currently + // boolean setting to the notion that the scheduler saw register pressure. + // So here we fill in the value for lra to use if needed. + if (BB_regpressure(bb,ISA_REGISTER_CLASS_float)) { + Set_BB_regpressure(bb, D_f, ISA_REGISTER_CLASS_float); + } + // compute the number of gpr Regs Predicted conflict_map_i = Calculate_All_Conflicts(bb, regs_in_use, ISA_REGISTER_CLASS_integer); @@ -8447,19 +8595,27 @@ &D_i, ISA_REGISTER_CLASS_integer) + 1; - // Now print the details of this loop - printf("unrolled loop(%d):size = %d, ntimes=%d\n", - BB_id(bb), BB_length(bb), BB_unrollings(bb)); - printf("%s bb = %d, init_II = %d\n", usage_str, BB_id(bb), init_II); - printf("R_f = %d, D_f = %d, N_f = %d, avg_degree_f = %d\n", - R_f, D_f, N_f, avg_conflicts_f); - printf("R_i = %d, D_i = %d, N_i = %d, avg_degree_i = %d\n", - R_i, D_i, N_i, avg_conflicts_i); + // Now do the same for int regs + if (BB_regpressure(bb,ISA_REGISTER_CLASS_integer)) { + Set_BB_regpressure(bb, D_i, ISA_REGISTER_CLASS_integer); + } + if (trace_general) { + // Now print the details of this loop + fprintf(TFile, "unrolled loop(%d):size = %d, ntimes=%d\n", + BB_id(bb), BB_length(bb), BB_unrollings(bb)); + fprintf(TFile, "%s bb = %d, init_II = %d\n", + usage_str, BB_id(bb), init_II); + 
fprintf(TFile, "R_f = %d, D_f = %d, N_f = %d, avg_degree_f = %d\n", + R_f, D_f, N_f, avg_conflicts_f); + fprintf(TFile, "R_i = %d, D_i = %d, N_i = %d, avg_degree_i = %d\n", + R_i, D_i, N_i, avg_conflicts_i); + } + TN_MAP_Delete(conflict_map_f); TN_MAP_Delete(conflict_map_i); MEM_POOL_Pop(pool); - } + } } void Examine_Loop_Info(char *usage_str, BOOL after_presched) Modified: trunk/osprey/be/cg/ebo.cxx =================================================================== --- trunk/osprey/be/cg/ebo.cxx 2011-07-13 02:39:29 UTC (rev 3689) +++ trunk/osprey/be/cg/ebo.cxx 2011-07-13 18:35:48 UTC (rev 3690) @@ -3006,12 +3006,20 @@ } #ifdef TARG_X8664 + if ( !op_replaced && + Is_Target_Orochi() && + Is_Target_FMA4() && + EBO_Is_FMA4(op) && + !EBO_in_peep && + !EBO_in_loop ) + op_replaced = EBO_Disassociate_FMA( op ); + if( do_load_execute && !op_replaced && !OP_effectively_copy(op) ){ op_replaced = EBO_Load_Execution( op, opnd_tn, orig_tninfo, cmp_merge_idx ); } - + if ( !op_replaced && (CG_LOOP_unroll_level == 2) && (OP_code(op) == TOP_leax32) && Modified: trunk/osprey/be/cg/ebo_special.h =================================================================== --- trunk/osprey/be/cg/ebo_special.h 2011-07-13 02:39:29 UTC (rev 3689) +++ trunk/osprey/be/cg/ebo_special.h 2011-07-13 18:35:48 UTC (rev 3690) @@ -199,6 +199,8 @@ BOOL EBO_Not_Load_Exec_Opnd( OP* ); BOOL EBO_Fold_Lea_Const_Component( OP* ); BOOL EBO_Opt_Const_Array( OP*, LOOP_DESCR*, INT ); +BOOL EBO_Is_FMA4( OP* ); +BOOL EBO_Disassociate_FMA( OP* ); BOOL EBO_Load_Execution( OP*, TN**, EBO_TN_INFO**, int ); BOOL EBO_Lea_Insertion( OP*, TN**, EBO_TN_INFO** ); BOOL EBO_Fold_Load_Duplicate( OP*, TN**, EBO_TN_INFO** ); Modified: trunk/osprey/be/cg/lra.cxx =================================================================== --- trunk/osprey/be/cg/lra.cxx 2011-07-13 02:39:29 UTC (rev 3689) +++ trunk/osprey/be/cg/lra.cxx 2011-07-13 18:35:48 UTC (rev 3690) @@ -118,6 +118,7 @@ #ifdef KEY static BOOL 
large_asm_clobber_set[ISA_REGISTER_CLASS_MAX+1]; +#define AVX_FP_REG_FACTOR 2.5 #endif #ifdef TARG_IA64 #define FIRST_INPUT_REG (32+REGISTER_MIN) @@ -981,6 +982,40 @@ } +INT +Find_Degree_For_TN(TN *tn, INT *regs_in_use) +{ + LIVE_RANGE *lr = LR_For_TN(tn); + return Find_Max_Degree_For_LR(regs_in_use, lr); +} + + +OP * +Find_UseOp_For_TN(TN *tn) +{ + // only use with sdsu live ranges + LIVE_RANGE *lr = LR_For_TN(tn); + INT opnum = LR_last_use(lr); + OP *cur_op = OP_VECTOR_element (Insts_Vector, opnum); + return cur_op; +} + + +bool +Is_TN_Sdsu(TN *tn) +{ + bool has_sdsu = false; + LIVE_RANGE *lr = LR_For_TN(tn); + if ((LR_def_cnt(lr) == 1) && (LR_upward_exposed_use(lr) == 0)) { + if (LR_use_cnt(lr) == 1) { + has_sdsu = true; + } + } + + return has_sdsu; +} + + void Merge_Live_Ranges(TN *tn1, TN *tn2, bool make_tn1_span) { LIVE_RANGE *lr1 = LR_For_TN(tn1); @@ -2257,6 +2292,24 @@ } +static BOOL check_uses_destructive_dest(TN *tn, BB *bb) +{ + BOOL uses_destructive_dest = FALSE; + +#ifdef TARG_X8664 + if( BB_regpressure(bb,TN_register_class(tn)) && + (TN_register_class(tn) == ISA_REGISTER_CLASS_float) ){ + INT num_pr = REGISTER_CLASS_register_count(ISA_REGISTER_CLASS_float); + INT num_measured = BB_regpressure(bb,ISA_REGISTER_CLASS_float); + if (num_measured > (AVX_FP_REG_FACTOR * num_pr)) + uses_destructive_dest = TRUE; + } +#endif + + return uses_destructive_dest; +} + + static BOOL Assign_Registers_For_OP (OP *op, INT opnum, TN **spill_tn, BB *bb) { @@ -2621,8 +2674,14 @@ continue; } - if( opndnum == 0 && - (OP_sse5( op ) == FALSE) && + if ( Is_Target_Orochi() && OP_sse5( op ) ) { + // now the test + if ( check_uses_destructive_dest(tn, bb) && + opndnum == 0 && + result_reg <= REGISTER_MAX ){ + prefer_reg = result_reg; + } + } else if( opndnum == 0 && OP_x86_style( op ) && result_reg <= REGISTER_MAX ){ prefer_reg = result_reg; @@ -4218,9 +4277,15 @@ Set_TN_spill(new_tn, spill_loc); local_spills++;global_spills++; + BOOL uses_destructive_dest = FALSE; + if( 
Is_Target_Orochi() && OP_sse5( op ) ){ + uses_destructive_dest = check_uses_destructive_dest(prev_tn, bb); + } + if ((OP_same_res(op) #ifdef TARG_X8664 || OP_x86_style(op) // bug 4721 + || uses_destructive_dest #endif ) && TN_Pair_In_OP(op, spill_tn, prev_tn)) { @@ -5492,16 +5557,31 @@ FOR_ALL_BB_OPs( bb, op ){ opnum++; - if( !OP_x86_style( op ) ) - continue; - - if ( OP_sse5(op) ) - continue; - TN* result = OP_result( op, 0 ); TN* opnd0 = OP_opnd( op, 0 ); TN* opnd1 = OP_opnd( op, 1 ); + if( Is_Target_Orochi() && OP_sse5( op ) ){ + if( check_uses_destructive_dest(result, bb) == FALSE ) + continue; + + bool same_class = FALSE; + for( int opnd = 1; opnd < OP_opnds(op); opnd++ ){ + TN* tn = OP_opnd( op, opnd ); + if( TN_is_register(tn) == FALSE ) continue; + if( TN_register_class(result) == TN_register_class(tn) ){ + same_class = TRUE; + break; + } + } + if ( same_class == FALSE ) + continue; + + } else { + if( !OP_x86_style( op ) ) + continue; + } + if( TNs_Are_Equivalent( result, opnd0 ) ) continue; @@ -5877,7 +5957,14 @@ Maintain the "result==OP_opnd(op,0)" property for x86-style operations after register preallocation. 
*/ - if( OP_x86_style( op ) && + if( Is_Target_Orochi() && OP_sse5( op ) ) { + if( check_uses_destructive_dest(old_tn, bb) && + new_result_tn != NULL ){ + Exp_COPY( new_result_tn, old_tn, &pre_ops ); + OP_srcpos(OPS_last(&pre_ops)) = OP_srcpos(op); + Set_OP_opnd( op, 0, new_result_tn ); + } + } else if( OP_x86_style( op ) && new_result_tn != NULL ){ Exp_COPY( new_result_tn, old_tn, &pre_ops ); OP_srcpos(OPS_last(&pre_ops)) = OP_srcpos(op); Modified: trunk/osprey/be/cg/lra.h =================================================================== --- trunk/osprey/be/cg/lra.h 2011-07-13 02:39:29 UTC (rev 3689) +++ trunk/osprey/be/cg/lra.h 2011-07-13 18:35:48 UTC (rev 3690) @@ -103,6 +103,9 @@ INT *outgoing_conflicts, ISA_REGISTER_CLASS rclass); extern void Truncate_LRs_For_OP(OP *op); +extern INT Find_Degree_For_TN(TN *tn, INT *regs_in_use); +extern OP *Find_UseOp_For_TN(TN *tn); +extern bool Is_TN_Sdsu(TN *tn); /* Returns the number of registers LRA is requesting from GRA for * the class <cl> in the basic block <bb>. 
If we run the scheduling Modified: trunk/osprey/be/cg/oputil.cxx =================================================================== --- trunk/osprey/be/cg/oputil.cxx 2011-07-13 02:39:29 UTC (rev 3689) +++ trunk/osprey/be/cg/oputil.cxx 2011-07-13 18:35:48 UTC (rev 3690) @@ -1863,6 +1863,7 @@ {TOP_xor128v16, TOP_vxor128v16}, {TOP_xor128v32, TOP_vxor128v32}, {TOP_xor128v64, TOP_vxor128v64}, + {TOP_pxor, TOP_vxor128v8}, {TOP_fxor128v32, TOP_vfxor128v32}, {TOP_fxor128v64, TOP_vfxor128v64}, {TOP_andps, TOP_vandps}, Modified: trunk/osprey/be/cg/x8664/cgemit_targ.cxx =================================================================== --- trunk/osprey/be/cg/x8664/cgemit_targ.cxx 2011-07-13 02:39:29 UTC (rev 3689) +++ trunk/osprey/be/cg/x8664/cgemit_targ.cxx 2011-07-13 18:35:48 UTC (rev 3690) @@ -1599,10 +1599,6 @@ OP_Name[TOP_vpclmulqdqx] = "vpclmulqdq"; OP_Name[TOP_vpclmulqdqxx] = "vpclmulqdq"; OP_Name[TOP_vpclmulqdqxxx] = "vpclmulqdq"; - OP_Name[TOP_vlddqu] = "vlddqu"; - OP_Name[TOP_vlddqux] = "vlddqu"; - OP_Name[TOP_vlddquxx] = "vlddqu"; - OP_Name[TOP_vlddqu_n32] = "vlddqu"; OP_Name[TOP_vfadd128v64] = "vaddpd"; OP_Name[TOP_vfaddx128v64] = "vaddpd"; OP_Name[TOP_vfaddxx128v64] = "vaddpd"; @@ -3708,6 +3704,8 @@ if ( !Is_Target_SSE3() || ! CG_use_lddqu) { OP_Name[TOP_lddqu] = "movdqu"; OP_Name[TOP_lddqu_n32] = "movdqu"; + OP_Name[TOP_vlddqu] = "vmovdqu"; + OP_Name[TOP_vlddqu_n32] = "vmovdqu"; } else OP_Name[TOP_lddqu_n32] = "lddqu"; @@ -3741,9 +3739,10 @@ OP_Name[TOP_stdqax] = "movdqa"; OP_Name[TOP_stntpdx]= "movntpd"; OP_Name[TOP_stntpsx]= "movntps"; - if ( !Is_Target_SSE3() || ! CG_use_lddqu) + if ( !Is_Target_SSE3() || ! CG_use_lddqu) { OP_Name[TOP_lddqux] = "movdqu"; - else + OP_Name[TOP_vlddqux] = "vmovdqu"; + } else OP_Name[TOP_lddqux] = "lddqu"; OP_Name[TOP_stdqux] = "movdqu"; OP_Name[TOP_ldhpsx] = "movhps"; @@ -3754,9 +3753,10 @@ OP_Name[TOP_stdqaxx] = "movdqa"; OP_Name[TOP_stntpdxx]= "movntpd"; OP_Name[TOP_stntpsxx]= "movntps"; - if ( !Is_Target_SSE3() || ! 
CG_use_lddqu) + if ( !Is_Target_SSE3() || ! CG_use_lddqu) { OP_Name[TOP_lddquxx] = "movdqu"; - else + OP_Name[TOP_vlddquxx] = "vmovdqu"; + } else OP_Name[TOP_lddquxx] = "lddqu"; OP_Name[TOP_stdquxx] = "movdqu"; OP_Name[TOP_ldhpsxx] = "movhps"; Modified: trunk/osprey/be/cg/x8664/cgtarget.cxx =================================================================== --- trunk/osprey/be/cg/x8664/cgtarget.cxx 2011-07-13 02:39:29 UTC (rev 3689) +++ trunk/osprey/be/cg/x8664/cgtarget.cxx 2011-07-13 18:35:48 UTC (rev 3690) @@ -266,6 +266,12 @@ case TOP_vfnmsubxrss: case TOP_vfnmsubxxrss: case TOP_vfnmsubxxxrss: + case TOP_vfsqrtxss: + case TOP_vfsqrtxxss: + case TOP_vfsqrtxxxss: + case TOP_vfrsqrtxss: + case TOP_vfrsqrtxxss: + case TOP_vfrsqrtxxxss: return 4; case TOP_ldsd: // 64 bit @@ -382,6 +388,12 @@ case TOP_vfnmsubxrsd: case TOP_vfnmsubxxrsd: case TOP_vfnmsubxxxrsd: + case TOP_vfsqrtxsd: + case TOP_vfsqrtxxsd: + case TOP_vfsqrtxxxsd: + case TOP_vmovddupx: + case TOP_vmovddupxx: + case TOP_vmovddupxxx: return 8; case TOP_lddqa: // 128 bit @@ -462,14 +474,11 @@ case TOP_vldapdxx: case TOP_vldapsxx: case TOP_vmovsldupx: + case TOP_vmovsldupxx: + case TOP_vmovsldupxxx: case TOP_vmovshdupx: - case TOP_vmovddupx: - case TOP_vmovsldupxx: case TOP_vmovshdupxx: - case TOP_vmovddupxx: - case TOP_vmovsldupxxx: case TOP_vmovshdupxxx: - case TOP_vmovddupxxx: case TOP_vstdqa: case TOP_vstdqa_n32: case TOP_vstntpd: @@ -838,6 +847,32 @@ case TOP_lock_xor32: case TOP_lock_sub32: case TOP_lock_xadd32: + // AVX + case TOP_vldss: + case TOP_vldss_n32: + case TOP_vldssx: + case TOP_vldssxx: + case TOP_vfaddxss: + case TOP_vsubxss: + case TOP_vfaddxxss: + case TOP_vsubxxss: + case TOP_vfaddxxxss: + case TOP_vsubxxxss: + case TOP_vmulxss: + case TOP_vmulxxss: + case TOP_vmulxxxss: + case TOP_vdivxss: + case TOP_vdivxxss: + case TOP_vdivxxxss: + case TOP_vcomixss: + case TOP_vcomixxss: + case TOP_vcomixxxss: + case TOP_vcvtsd2ssx: + case TOP_vcvtsd2ssxx: + case TOP_vcvtsd2ssxxx: + case 
TOP_vcvtsi2ssx: + case TOP_vcvtsi2ssxx: + case TOP_vcvtsi2ssxxx: return 4; case TOP_addxr64: @@ -982,6 +1017,44 @@ case TOP_fmovsldupxxx: case TOP_fmovshdupxxx: case TOP_fmovddupxxx: + // AVX + case TOP_vldsd_n32: + case TOP_vldsd: + case TOP_vldsdx: + case TOP_vldsdxx: + case TOP_vfaddxsd: + case TOP_vfaddxxsd: + case TOP_vfaddxxxsd: + case TOP_vsubxsd: + case TOP_vsubxxsd: + case TOP_vsubxxxsd: + case TOP_vmulxsd: + case TOP_vmulxxsd: + case TOP_vmulxxxsd: + case TOP_vdivxsd: + case TOP_vdivxxsd: + case TOP_vdivxxxsd: + case TOP_vcomixsd: + case TOP_vcomixxsd: + case TOP_vcomixxxsd: + case TOP_vmovsldupx: + case TOP_vmovsldupxx: + case TOP_vmovsldupxxx: + case TOP_vmovshdupx: + case TOP_vmovshdupxx: + case TOP_vmovshdupxxx: + case TOP_vmovddupx: + case TOP_vmovddupxx: + case TOP_vmovddupxxx: + case TOP_vcvtsi2sdx: + case TOP_vcvtsi2sdxx: + case TOP_vcvtsi2sdxxx: + case TOP_vcvtsi2sdqx: + case TOP_vcvtsi2sdqxx: + case TOP_vcvtsi2sdqxxx: + case TOP_vcvtsi2ssqx: + case TOP_vcvtsi2ssqxx: + case TOP_vcvtsi2ssqxxx: return 8; case TOP_fldt: Modified: trunk/osprey/be/cg/x8664/ebo_special.cxx =================================================================== --- trunk/osprey/be/cg/x8664/ebo_special.cxx 2011-07-13 02:39:29 UTC (rev 3689) +++ trunk/osprey/be/cg/x8664/ebo_special.cxx 2011-07-13 18:35:48 UTC (rev 3690) @@ -4574,6 +4574,7 @@ {TOP_UNDEFINED, TOP_vldsd, TOP_vldsdx, TOP_vldsdxx, TOP_vldsd_n32}, {TOP_UNDEFINED, TOP_vlddqa, TOP_vlddqax, TOP_vlddqaxx, TOP_UNDEFINED}, {TOP_UNDEFINED, TOP_vldupd, TOP_vldupdx, TOP_vldupdxx, TOP_vldupd_n32}, + {TOP_UNDEFINED, TOP_vldups, TOP_vldupsx, TOP_vldupsxx, TOP_vldups_n32}, {TOP_UNDEFINED, TOP_vlddqu, TOP_vlddqux, TOP_vlddquxx, TOP_vlddqu_n32}, {TOP_UNDEFINED, TOP_vldlps, TOP_vldlpsx, TOP_vldlpsxx, TOP_vldlps_n32}, {TOP_UNDEFINED, TOP_vldlpd, TOP_vldlpdx, TOP_vldlpdxx, TOP_vldlpd_n32}, @@ -4662,6 +4663,15 @@ {TOP_mulsd, TOP_mulxsd, TOP_mulxxsd, TOP_mulxxxsd, TOP_UNDEFINED}, {TOP_vmulss, TOP_vmulxss, TOP_vmulxxss, TOP_vmulxxxss, 
TOP_UNDEFINED}, {TOP_vmulsd, TOP_vmulxsd, TOP_vmulxxsd, TOP_vmulxxxsd, TOP_UNDEFINED}, + {TOP_vfaddss, TOP_vfaddxss, TOP_vfaddxxss, TOP_vfaddxxxss, TOP_UNDEFINED}, + {TOP_vfaddsd, TOP_vfaddxsd, TOP_vfaddxxsd, TOP_vfaddxxxsd, TOP_UNDEFINED}, + {TOP_vfsqrtss, TOP_vfsqrtxss, TOP_vfsqrtxxss, TOP_vfsqrtxxxss, TOP_UNDEFINED}, + {TOP_vfsqrtsd, TOP_vfsqrtxsd, TOP_vfsqrtxxsd, TOP_vfsqrtxxxsd, TOP_UNDEFINED}, + {TOP_vfrsqrtss, TOP_vfrsqrtxss, TOP_vfrsqrtxxss, TOP_vfrsqrtxxxss, TOP_UNDEFINED}, + {TOP_vfrsqrt128v32, TOP_vfrsqrtx128v32, TOP_vfrsqrtxx128v32, TOP_vfrsqrtxxx128v32, TOP_UNDEFINED}, + {TOP_vfsqrt128v64, TOP_vfsqrtx128v64, TOP_vfsqrtxx128v64, TOP_vfsqrtxxx128v64, TOP_UNDEFINED}, + {TOP_vfsqrt128v32, TOP_vfsqrtx128v32, TOP_vfsqrtxx128v32, TOP_vfsqrtxxx128v32, TOP_UNDEFINED}, + {TOP_vfrcp128v32, TOP_vfrcpx128v32, TOP_vfrcpxx128v32, TOP_vfrcpxxx128v32, TOP_UNDEFINED}, {TOP_fmul128v32, TOP_fmulx128v32, TOP_fmulxx128v32, TOP_fmulxxx128v32, TOP_UNDEFINED}, {TOP_fmul128v64, TOP_fmulx128v64, TOP_fmulxx128v64, TOP_fmulxxx128v64, TOP_UNDEFINED}, {TOP_cmpgt128v8, TOP_cmpgtx128v8, TOP_cmpgtxx128v8, TOP_cmpgtxxx128v8, TOP_UNDEFINED}, @@ -4797,24 +4807,24 @@ {TOP_vfmaddsd, TOP_vfmaddxsd, TOP_vfmaddxxsd, TOP_vfmaddxxxsd, TOP_UNDEFINED}, {TOP_vfmaddps, TOP_vfmaddxps, TOP_vfmaddxxps, TOP_vfmaddxxxps, TOP_UNDEFINED}, {TOP_vfmaddpd, TOP_vfmaddxpd, TOP_vfmaddxxpd, TOP_vfmaddxxxpd, TOP_UNDEFINED}, - {TOP_vfmaddsubps, TOP_vfmaddsubxps, TOP_vfmaddsubxxps, TOP_vfmaddsubxxxps, TOP_UNDEFINED}, - {TOP_vfmaddsubpd, TOP_vfmaddsubxpd, TOP_vfmaddsubxxpd, TOP_vfmaddsubxxxpd, TOP_UNDEFINED}, {TOP_UNDEFINED, TOP_vfmaddxrss, TOP_vfmaddxxrss, TOP_vfmaddxxxrss, TOP_UNDEFINED}, {TOP_UNDEFINED, TOP_vfmaddxrsd, TOP_vfmaddxxrsd, TOP_vfmaddxxxrsd, TOP_UNDEFINED}, {TOP_UNDEFINED, TOP_vfmaddxrps, TOP_vfmaddxxrps, TOP_vfmaddxxxrps, TOP_UNDEFINED}, {TOP_UNDEFINED, TOP_vfmaddxrpd, TOP_vfmaddxxrpd, TOP_vfmaddxxxrpd, TOP_UNDEFINED}, + {TOP_vfmaddsubps, TOP_vfmaddsubxps, TOP_vfmaddsubxxps, TOP_vfmaddsubxxxps, 
TOP_UNDEFINED}, + {TOP_vfmaddsubpd, TOP_vfmaddsubxpd, TOP_vfmaddsubxxpd, TOP_vfmaddsubxxxpd, TOP_UNDEFINED}, {TOP_UNDEFINED, TOP_vfmaddsubxrps, TOP_vfmaddsubxxrps, TOP_vfmaddsubxxxrps, TOP_UNDEFINED}, {TOP_UNDEFINED, TOP_vfmaddsubxrpd, TOP_vfmaddsubxxrpd, TOP_vfmaddsubxxxrpd, TOP_UNDEFINED}, {TOP_vfmsubss, TOP_vfmsubxss, TOP_vfmsubxxss, TOP_vfmsubxxxss, TOP_UNDEFINED}, {TOP_vfmsubsd, TOP_vfmsubxsd, TOP_vfmsubxxsd, TOP_vfmsubxxxsd, TOP_UNDEFINED}, {TOP_vfmsubps, TOP_vfmsubxps, TOP_vfmsubxxps, TOP_vfmsubxxxps, TOP_UNDEFINED}, {TOP_vfmsubpd, TOP_vfmsubxpd, TOP_vfmsubxxpd, TOP_vfmsubxxxpd, TOP_UNDEFINED}, - {TOP_vfmsubaddps, TOP_vfmsubaddxps, TOP_vfmsubaddxxps, TOP_vfmsubaddxxxps, TOP_UNDEFINED}, - {TOP_vfmsubaddpd, TOP_vfmsubaddxpd, TOP_vfmsubaddxxpd, TOP_vfmsubaddxxxpd, TOP_UNDEFINED}, {TOP_UNDEFINED, TOP_vfmsubxrss, TOP_vfmsubxxrss, TOP_vfmsubxxxrss, TOP_UNDEFINED}, {TOP_UNDEFINED, TOP_vfmsubxrsd, TOP_vfmsubxxrsd, TOP_vfmsubxxxrsd, TOP_UNDEFINED}, {TOP_UNDEFINED, TOP_vfmsubxrps, TOP_vfmsubxxrps, TOP_vfmsubxxxrps, TOP_UNDEFINED}, {TOP_UNDEFINED, TOP_vfmsubxrpd, TOP_vfmsubxxrpd, TOP_vfmsubxxxrpd, TOP_UNDEFINED}, + {TOP_vfmsubaddps, TOP_vfmsubaddxps, TOP_vfmsubaddxxps, TOP_vfmsubaddxxxps, TOP_UNDEFINED}, + {TOP_vfmsubaddpd, TOP_vfmsubaddxpd, TOP_vfmsubaddxxpd, TOP_vfmsubaddxxxpd, TOP_UNDEFINED}, {TOP_UNDEFINED, TOP_vfmsubaddxrps, TOP_vfmsubaddxxrps, TOP_vfmsubaddxxxrps, TOP_UNDEFINED}, {TOP_UNDEFINED, TOP_vfmsubaddxrpd, TOP_vfmsubaddxxrpd, TOP_vfmsubaddxxxrpd, TOP_UNDEFINED}, {TOP_icall, TOP_icallx, TOP_icallxx, TOP_icallxxx, TOP_UNDEFINED}, @@ -5200,9 +5210,6 @@ } if (OP_load(op) || OP_store(op) || OP_prefetch(op)) { if (Is_Target_Orochi() && Is_Target_AVX() && OP_load(op) && - (old_top != TOP_vldsd) && - (old_top != TOP_vldsdx) && - (old_top != TOP_vldsdxx) && (OP_vec_lo_ldst(op) || OP_vec_hi_ldst(op))) { new_op = Mk_OP( new_top, storeval, OP_opnd( op, 0 ), base, index, scale, offset ); @@ -7623,12 +7630,27 @@ else new_op = Mk_OP( new_top, storeval, base, index, 
scale, offset ); } else { - if (mode == N32_MODE) - new_op = Mk_OP( new_top, storeval, offset ); - else if (mode == INDEX_MODE) - new_op = Mk_OP( new_top, storeval, index, scale, offset ); - else - new_op = Mk_OP( new_top, storeval, base, offset, index, scale ); + if (Is_Target_Orochi() && Is_Target_AVX() && OP_load(op) && + (OP_vec_lo_ldst(op) || OP_vec_hi_ldst(op))) { + if (mode == N32_MODE) + new_op = Mk_OP( new_top, storeval, OP_opnd( op, 0 ), offset ); + else if (mode == INDEX_MODE) + new_op = Mk_OP( new_top, storeval, OP_opnd( op, 0 ), + index, scale, offset ); + else if (mode == BASE_INDEX_MODE) + new_op = Mk_OP( new_top, storeval, OP_opnd( op, 0 ), + base, index, scale, offset ); + else + new_op = Mk_OP( new_top, storeval, OP_opnd( op, 0 ), + base, offset ); + } else { + if (mode == N32_MODE) + new_op = Mk_OP( new_top, storeval, offset ); + else if (mode == INDEX_MODE) + new_op = Mk_OP( new_top, storeval, index, scale, offset ); + else + new_op = Mk_OP( new_top, storeval, base, offset, index, scale ); + } } } @@ -7754,11 +7776,8 @@ return FALSE; } - const TOP old_top = OP_code(op); if (Is_Target_Orochi() && Is_Target_AVX() && - (old_top != TOP_vldsd) && - (old_top != TOP_vldsdx) && - (old_top != TOP_vldsdxx) && + (CG_load_execute == 0) && (OP_vec_lo_ldst(op) || OP_vec_hi_ldst(op))) { return FALSE; } @@ -9248,6 +9267,217 @@ return rval; } +static void Get_Disassociated_FMA_TOP_Codes( OP *alu_op, + TOP *mul_top, + TOP *arith_top ) +{ + const TOP top = OP_code(alu_op); + TOP new_mul_top; + TOP new_arith_top; + + switch (top) { + // fused multiply-adds + case TOP_vfmaddss: + new_mul_top = TOP_vmulss; + new_arith_top = TOP_vfaddss; + break; + case TOP_vfmaddsd: + new_mul_top = TOP_vmulsd; + new_arith_top = TOP_vfaddsd; + break; + case TOP_vfmaddps: + new_mul_top = TOP_vfmul128v32; + new_arith_top = TOP_vfadd128v32; + break; + case TOP_vfmaddpd: + new_mul_top = TOP_vfmul128v64; + new_arith_top = TOP_vfadd128v64; + break; + + // fused multiply-addsubs + case 
TOP_vfmaddsubps: + new_mul_top = TOP_vfmul128v32; + new_arith_top = TOP_vfaddsub128v32; + break; + case TOP_vfmaddsubpd: + new_mul_top = TOP_vfmul128v64; + new_arith_top = TOP_vfaddsub128v64; + break; + + // fused multiply-subs + case TOP_vfmsubss: + new_mul_top = TOP_vmulss; + new_arith_top = TOP_vsubss; + break; + case TOP_vfmsubsd: + new_mul_top = TOP_vmulsd; + new_arith_top = TOP_vsubsd; + break; + case TOP_vfmsubps: + new_mul_top = TOP_vfmul128v32; + new_arith_top = TOP_vfsub128v32; + break; + case TOP_vfmsubpd: + new_mul_top = TOP_vfmul128v64; + new_arith_top = TOP_vfsub128v64; + break; + + // everything else + default: + new_mul_top = TOP_UNDEFINED; + new_arith_top = TOP_UNDEFINED; + break; + } + + *mul_top = new_mul_top; + *arith_top = new_arith_top; +} + +static BOOL Is_Benefitial_To_Load_Exec_Float_OP( OP *ld_op, OP *alu_op ) +{ + BOOL ret_val = TRUE; + TN *result = NULL; + + // find an appropriate result tn + for (INT i = 0; i < OP_results(ld_op); i++){ + result = OP_result(ld_op, i); + if( TN_is_register(result) ){ + if( TN_register_class(result) == ISA_REGISTER_CLASS_float ) + break; + } + } + + // Bypass this process if register class is not float + if( ( result != NULL ) && + ( TN_register_class(result) == ISA_REGISTER_CLASS_float ) ){ + BB *bb = OP_bb(ld_op); + + const INT len = BB_length(bb); + INT *regs_in_use = (INT*)alloca(sizeof(INT) * (len+1)); + mINT8 fatpoint[ISA_REGISTER_CLASS_MAX+1]; + TN_MAP conflict_map; + + MEM_POOL load_exe_pool; + MEM_POOL_Initialize(&load_exe_pool, "live_range_info", TRUE); + + MEM_POOL_Push(&load_exe_pool); + LRA_Estimate_Fat_Points(bb, fatpoint, regs_in_use, &load_exe_pool); + + conflict_map = Calculate_All_Conflicts(bb, regs_in_use, + TN_register_class(result)); + + INT P_x = REGISTER_CLASS_register_count(TN_register_class(result)); + INT local_conflicts = Find_Degree_For_TN(result, regs_in_use); + + TN_MAP_Delete(conflict_map); + MEM_POOL_Pop(&load_exe_pool); + + // In the case where register pressure is 
manageable, defer this choice + // until later (after register allocation) + if( local_conflicts < P_x ) + ret_val = FALSE; + + if( EBO_Trace_Data_Flow && ret_val ) { + fprintf(TFile, "load-exec(%d) on %s and %s\n", + EBO_in_peep, + TOP_Name(OP_code(ld_op)), TOP_Name(OP_code(alu_op))); + } + } + + return ret_val; +} + +BOOL EBO_Disassociate_FMA( OP* alu_op ) +{ + BOOL ret_val = FALSE; + + if( CG_load_execute == 0 ) + return ret_val; + + // Look for situations where fma insns exist under register pressure. + // We lose opportunities in this scenario for eliminating + // live ranges as our default fma behavior is reg-reg only. It is + // also worth indicating here that load-executing the fma instructions + // themselves reduces throughput, making this action desirable for both cases. + // This operation precedes load_execution, so we can handle both scenarios + // here. This functionality is under control of CG_load_execute. We resolve + // the register pressure dilemma by disassociation of the fma components + // so that we can load execute two components or remove 2 live ranges via + // the memory forms of the fma components. 
+ if( TOP_is_load_exe(OP_code(alu_op)) == FALSE ) { + BB *bb = OP_bb(alu_op); + TN *mul_opnd1 = OP_opnd( alu_op, 0 ); + TN *mul_opnd2 = OP_opnd( alu_op, 1 ); + TN *arith_opnd = OP_opnd( alu_op, 2 ); + TN *result = OP_result(alu_op, 0); + BOOL fma_chained = FALSE; + const INT len = BB_length(bb); + INT *regs_in_use = (INT*)alloca(sizeof(INT) * (len+1)); + mINT8 fatpoint[ISA_REGISTER_CLASS_MAX+1]; + TN_MAP conflict_map; + TOP mul_top; + TOP arith_top; + + MEM_POOL fma_exe_pool; + MEM_POOL_Initialize(&fma_exe_pool, "live_range_info", TRUE); + + MEM_POOL_Push(&fma_exe_pool); + LRA_Estimate_Fat_Points(bb, fatpoint, regs_in_use, &fma_exe_pool); + + conflict_map = Calculate_All_Conflicts(bb, regs_in_use, + ISA_REGISTER_CLASS_float); + + INT P_x = REGISTER_CLASS_register_count(ISA_REGISTER_CLASS_float); + INT local_conflicts = Find_Degree_For_TN(result, regs_in_use); + + TN_MAP_Delete(conflict_map); + MEM_POOL_Pop(&fma_exe_pool); + + Get_Disassociated_FMA_TOP_Codes( alu_op, &mul_top, &arith_top ); + + // Chained single use fma instructions produce simple live ranges + // which are better left in this form. + if( Is_TN_Sdsu( result ) ){ + OP *use_op = Find_UseOp_For_TN( result ); + if( use_op && EBO_Is_FMA4( use_op ) ) + fma_chained = TRUE; + } + + // Now if we successfully mapped a translation, add the new code + // for scenarios where we have at least 2 live ranges greater + // than the number of fp registers, as we will be giving potentially + // two back from the load exec forms for the new insns. 
+ if( ( local_conflicts > ( P_x + 2 ) ) && + ( fma_chained == FALSE ) && + ( mul_top != TOP_UNDEFINED ) && + ( arith_top != TOP_UNDEFINED ) ){ + TN *mul_result = Build_TN_Like(result); + OP *mul_op = Mk_OP( mul_top, mul_result, mul_opnd1, mul_opnd2 ); + OP *arith_op = Mk_OP( arith_top, result, mul_result, arith_opnd ); + + // Add the mul component of the fma + Set_OP_unrolling( mul_op, OP_unrolling(alu_op) ); + Set_OP_orig_idx( mul_op, OP_map_idx(alu_op) ); + Set_OP_unroll_bb( mul_op, OP_unroll_bb(alu_op) ); + + OP_srcpos( mul_op ) = OP_srcpos( alu_op ); + BB_Insert_Op_After( bb, alu_op, mul_op ); + + // Now add the arithmetic (add, sub, addsub or subadd part) + Set_OP_unrolling( arith_op, OP_unrolling(alu_op) ); + Set_OP_orig_idx( arith_op, OP_map_idx(alu_op) ); + Set_OP_unroll_bb( arith_op, OP_unroll_bb(alu_op) ); + + OP_srcpos( arith_op ) = OP_srcpos( alu_op ); + BB_Insert_Op_After( bb, mul_op, arith_op ); + + ret_val = TRUE; + } + } + + return ret_val; +} + BOOL EBO_Load_Execution( OP* alu_op, TN** opnd_tn, EBO_TN_INFO** actual_tninfo, @@ -9259,6 +9489,7 @@ const TOP top = OP_code(alu_op); BOOL opnds_swapped = FALSE; BOOL rval = FALSE; + BOOL pressure_check_bypass = FALSE; if( top == TOP_xor64 || top == TOP_or64 || @@ -9410,6 +9641,29 @@ OP_code(alu_op) == TOP_cvtps2pd) return FALSE; + // For AVX code, it is too attractive not to eliminate 3 ops for 1 + // TODO: possibly confine to vectorized loops? 
For now it's a heuristic + if( Is_Target_Orochi() && Is_Target_AVX() && OP_vec_lo_ldst(ld_op) ) { + if( ( PU_src_lang(Get_Current_PU()) == PU_F77_LANG ) || + ( PU_src_lang(Get_Current_PU()) == PU_F90_LANG ) ) { + if( ( OP_code(alu_op) == TOP_vcvtdq2pd ) || + ( OP_code(alu_op) == TOP_vcvtps2pd ) ){ + pressure_check_bypass = TRUE; + } + } else { + pressure_check_bypass = TRUE; + } + } + + // Gate load execute before register allocation based on + // localized register pressure + if( !EBO_in_peep ){ + if( !Is_Benefitial_To_Load_Exec_Float_OP( ld_op, alu_op ) && + !pressure_check_bypass ) { + return Process_Side_Effects(opnd_tn, actual_tninfo, rval, opnds_swapped); + } + } + /* Check <index> and <base> will not be re-defined between <ld_op> and <alu_op>, inclusive. */ @@ -10668,28 +10922,19 @@ TOP topcode; if (base && offset && index && scale) { - topcode = TOP_fmovddupxx; - if (Is_Target_Orochi() && Is_Target_AVX()) - topcode = TOP_vmovddupxx; - new_op = Mk_OP (topcode, + new_op = Mk_OP (TOP_fmovddupxx, OP_result(op, 0), OP_opnd(load, 0), OP_opnd(load, 1), OP_opnd(load, 2), OP_opnd(load, 3)); } else if (base && offset) { - topcode = TOP_fmovddupx; - if (Is_Target_Orochi() && Is_Target_AVX()) - topcode = TOP_vmovddupx; - new_op = Mk_OP (topcode, + new_op = Mk_OP (TOP_fmovddupx, OP_result(op, 0), OP_opnd(load, 0), OP_opnd(load, 1)); } else if (index && scale && offset) { - topcode = TOP_fmovddupxxx; - if (Is_Target_Orochi() && Is_Target_AVX()) - topcode = TOP_vmovddupxxx; - new_op = Mk_OP (topcode, + new_op = Mk_OP (TOP_fmovddupxxx, OP_result(op, 0), OP_opnd(load, 0), OP_opnd(load, 1), Modified: trunk/osprey/be/cg/x8664/expand.cxx =================================================================== --- trunk/osprey/be/cg/x8664/expand.cxx 2011-07-13 02:39:29 UTC (rev 3689) +++ trunk/osprey/be/cg/x8664/expand.cxx 2011-07-13 18:35:48 UTC (rev 3690) @@ -6227,14 +6227,23 @@ ST* st = Gen_Temp_Symbol( ty, "movd" ); Allocate_Temp_To_Memory( st ); Exp_Store( MTYPE_I8, op1, st, 0, 
ops, 0); - Exp_Load( MTYPE_F8, MTYPE_F8, tmp, st, 0, ops, 0); - Expand_Copy(result, tmp, MTYPE_F8, ops); - Build_OP(TOP_unpcklpd, result, result, tmp, ops); + if (Is_Target_Orochi() && Is_Target_AVX()) { + Exp_Load( MTYPE_F8, MTYPE_F8, tmp, st, 0, ops, 0); + Build_OP(TOP_fmovddup, result, tmp, tmp, ops); + } else { + Exp_Load( MTYPE_F8, MTYPE_F8, tmp, st, 0, ops, 0); + Expand_Copy(result, tmp, MTYPE_F8, ops); + Build_OP(TOP_unpcklpd, result, result, tmp, ops); + } break; } case OPC_V16F8F8REPLICA: - Expand_Copy(result, op1, MTYPE_F8, ops); - Build_OP(TOP_unpcklpd, result, result, op1, ops); + if (Is_Target_Orochi() && Is_Target_AVX()) { + Build_OP(TOP_fmovddup, result, op1, op1, ops); + } else { + Expand_Copy(result, op1, MTYPE_F8, ops); + Build_OP(TOP_unpcklpd, result, result, op1, ops); + } break; case OPC_V16I4I4REPLICA: { @@ -6242,16 +6251,27 @@ ST* st = Gen_Temp_Symbol( ty, "movd" ); Allocate_Temp_To_Memory( st ); Exp_Store( MTYPE_I4, op1, st, 0, ops, 0); - Exp_Load( MTYPE_F4, MTYPE_F4, tmp, st, 0, ops, 0); - Expand_Copy(result, tmp, MTYPE_F4, ops); - Build_OP(TOP_unpcklps, result, result, tmp, ops); - Build_OP(TOP_unpcklps, result, result, result, ops); + if (Is_Target_Orochi() && Is_Target_AVX()) { + Exp_Load( MTYPE_F4, MTYPE_F4, tmp, st, 0, ops, 0); + Build_OP(TOP_unpcklps, result, tmp, tmp, ops); + Build_OP(TOP_unpcklps, result, result, result, ops); + } else { + Exp_Load( MTYPE_F4, MTYPE_F4, tmp, st, 0, ops, 0); + Expand_Copy(result, tmp, MTYPE_F4, ops); + Build_OP(TOP_unpcklps, result, result, tmp, ops); + Build_OP(TOP_unpcklps, result, result, result, ops); + } break; } case OPC_V16F4F4REPLICA: - Expand_Copy(result, op1, MTYPE_F4, ops); - Build_OP(TOP_unpcklps, result, result, op1, ops); - Build_OP(TOP_unpcklps, result, result, result, ops); + if (Is_Target_Orochi() && Is_Target_AVX()) { + Build_OP(TOP_unpcklps, result, op1, op1, ops); + Build_OP(TOP_unpcklps, result, result, result, ops); + } else { + Expand_Copy(result, op1, MTYPE_F4, ops); + 
Build_OP(TOP_unpcklps, result, result, op1, ops); + Build_OP(TOP_unpcklps, result, result, result, ops); + } break; case OPC_V16I2I2REPLICA: { @@ -10115,8 +10135,10 @@ if (CG_NoClear_Avx_Simd == false) Build_OP(TOP_vzeroupper, ops ); - Build_OP(TOP_leaxx64, r11_tn, rax_tn, Gen_Literal_TN(8, 4), - Gen_Literal_TN(4*(num_xmms-8), 4), ops); + // The insn size for vstaps is 5 bytes, note that + // leaq 0(%rax,%rax,4) is (5 * %rax) + Build_OP(TOP_leax64, r11_tn, rax_tn, rax_tn, Gen_Literal_TN(4, 4), + Gen_Literal_TN(5*(num_xmms-8), 4), ops); } else { Build_OP(TOP_leaxx64, r11_tn, rax_tn, Gen_Literal_TN(4, 4), Gen_Literal_TN(4*(num_xmms-8), 4), ops); @@ -10134,21 +10156,10 @@ } Build_OP(TOP_lea64, rax_tn, OP_opnd(op, 2), OP_opnd(op, 3), ops); Build_OP(TOP_ijmp, r11_tn, ops); - if (Is_Target_Orochi() && Is_Target_AVX()) { - // The insn size for vstaps is 5 bytes, so we need the 3 byte nop - // below to pad for the scale of 8 in the jump table. - for (INT i = 1; i <= num_xmms; i++) { - Build_OP(TOP_staps, PREG_To_TN(Int_Preg, XMM0+(8-i)), - rax_tn, Gen_Literal_TN(16 * (num_xmms-i) + 1, 4), ops); - Build_OP( TOP_mov64, rax_tn, rax_tn, ops ); - } - } else { - for (INT i = 1; i <= num_xmms; i++) { - Build_OP(TOP_staps, PREG_To_TN(Int_Preg, XMM0+(8-i)), - rax_tn, Gen_Literal_TN(16 * (num_xmms-i) + 1, 4), ops); - } + for (INT i = 1; i <= num_xmms; i++) { + Build_OP(TOP_staps, PREG_To_TN(Int_Preg, XMM0+(8-i)), + rax_tn, Gen_Literal_TN(16 * (num_xmms-i) + 1, 4), ops); } - break; } default: Modified: trunk/osprey/common/targ_info/isa/x8664/isa_print.cxx =================================================================== --- trunk/osprey/common/targ_info/isa/x8664/isa_print.cxx 2011-07-13 02:39:29 UTC (rev 3689) +++ trunk/osprey/common/targ_info/isa/x8664/isa_print.cxx 2011-07-13 18:35:48 UTC (rev 3690) @@ -3610,6 +3610,7 @@ /* One result / one mem opnd with scaled-index with base */ ISA_PRINT_TYPE rmemindex = ISA_Print_Type_Create("rmemindex", "%s %s%s(%s,%s,%s),%s"); Name(); + 
Segment(); Operand(3); Operand(0); Operand(1); @@ -3693,6 +3694,7 @@ /* One result / one mem opnd with scaled-index w/o base */ ISA_PRINT_TYPE rmemindexx = ISA_Print_Type_Create("rmemindexx", "%s %s%s(,%s,%s),%s"); Name(); + Segment(); Operand(2); Operand(0); Operand(1); Modified: trunk/osprey/common/targ_info/isa/x8664/isa_properties.cxx =================================================================== --- trunk/osprey/common/targ_info/isa/x8664/isa_properties.cxx 2011-07-13 02:39:29 UTC (rev 3689) +++ trunk/osprey/common/targ_info/isa/x8664/isa_properties.cxx 2011-07-13 18:35:48 UTC (rev 3690) @@ -8,7 +8,7 @@ /* * Copyright (C) 2007 QLogic Corporation. All Rights Reserved. - */ + / /* * Copyright 2003, 2004, 2005, 2006 PathScale, Inc. All Rights Reserved. @@ -6404,6 +6404,9 @@ TOP_vfblendvx128v32, TOP_vfblendvxx128v32, TOP_vfblendvxxx128v32, + TOP_vmovddupx, + TOP_vmovddupxx, + TOP_vmovddupxxx, /* INTEL FMA instructions */ TOP_xfmadd132xpd, TOP_xfmadd132xxpd, @@ -9105,17 +9108,29 @@ TOP_ldupsx, TOP_ldupsxx, /* AVX instructions */ - TOP_vlddqu_n32, + TOP_vldlps, + TOP_vldlpsx, + TOP_vldlpsxx, + TOP_vldhps, + TOP_vldhpsx, + TOP_vldhpsxx, + TOP_vldlpd, + TOP_vldlpdx, + TOP_vldlpdxx, + TOP_vldhpd, + TOP_vldhpdx, + TOP_vldhpdxx, TOP_vlddqu, TOP_vlddqux, TOP_vlddquxx, + TOP_vlddqu_n32, TOP_vldupd, TOP_vldupdx, TOP_vldupdxx, + TOP_vldupd_n32, TOP_vldups, TOP_vldupsx, TOP_vldupsxx, - TOP_vldupd_n32, TOP_vldups_n32, TOP_UNDEFINED); @@ -12731,14 +12746,6 @@ TOP_vpmovzxwqx, TOP_vpmovzxwqxx, TOP_vpmovzxwqxxx, - TOP_vfrcpss, - TOP_vfrcpxss, - TOP_vfrcpxxss, - TOP_vfrcpxxxss, - TOP_vmovddup, - TOP_vmovddupx, - TOP_vmovddupxx, - TOP_vmovddupxxx, TOP_vmovshdup, TOP_vmovshdupx, TOP_vmovshdupxx, @@ -12777,12 +12784,6 @@ TOP_vldhps, TOP_vldhpsx, TOP_vldhpsxx, - TOP_vldsd, - TOP_vldsdx, - TOP_vldsdxx, - TOP_vldss, - TOP_vldssx, - TOP_vldssxx, TOP_vabs128v8, TOP_vabsx128v8, TOP_vabsxx128v8, @@ -15126,18 +15127,19 @@ TOP_stlpsxx, TOP_storelpd, /* AVX instructions */ - TOP_vldsd, - 
TOP_vldsdx, - TOP_vldsdxx, - TOP_vldlps, - TOP_vldlpsx, - TOP_vldlpsxx, - TOP_vstlps, - TOP_vstlpsx, - TOP_vstlpsxx, - TOP_vldsd_n32, - TOP_vldlps_n32, - TOP_vstlps_n32, + TOP_vldlps, + TOP_vldlpsx, + TOP_vldlpsxx, + TOP_vldlpd, + TOP_vldlpdx, + TOP_vldlpdxx, + TOP_vstlpd, + TOP_vstlpdx, + TOP_vstlpdxx, + TOP_vstlps, + TOP_vstlpsx, + TOP_vstlpsxx, + TOP_vstorelpd, TOP_UNDEFINED); /* ===== Instructions that load and store the higher 64-bits of a xmm register */ @@ -15169,10 +15171,7 @@ TOP_vsthps, TOP_vsthpsx, TOP_vsthpsxx, - TOP_vldhpd_n32, - TOP_vldhps_n32, - TOP_vsthpd_n32, - TOP_vsthps_n32, + TOP_vmovlhps, TOP_UNDEFINED); vector_packed_single = ISA_Property_Create ("vector_packed_single"); Modified: trunk/osprey/common/targ_info/proc/x8664/orochi_si.cxx =================================================================== --- trunk/osprey/common/targ_info/proc/x8664/orochi_si.cxx 2011-07-13 02:39:29 UTC (rev 3689) +++ trunk/osprey/common/targ_info/proc/x8664/orochi_si.cxx 2011-07-13 18:35:48 UTC (rev 3690) @@ -289,10 +289,6 @@ TOP_andxxx8, TOP_andxxx16, TOP_andxxx32, - TOP_addxxxss, - TOP_addxxxsd, - TOP_subxxxss, - TOP_subxxxsd, TOP_andxxx64, TOP_cmpxxx8, TOP_cmpxxx16, @@ -977,7 +973,7 @@ TOP_vmovaps, TOP_UNDEFINED ); Any_Operand_Access_Time(0); - Any_Result_Available_Time(1); + Any_Result_Available_Time(2); Resource_Requirement(res_issue, 0); Resource_Requirement(res_fadd, 0); @@ -1011,6 +1007,18 @@ TOP_lddqa_n32, TOP_lddqax, TOP_lddqaxx, + TOP_vldss_n32, + TOP_vldsd_n32, + TOP_vldss, + TOP_vldsd, + TOP_vldssx, + TOP_vldssxx, + TOP_vldsdx, + TOP_vldsdxx, + TOP_vlddqa, + TOP_vlddqa_n32, + TOP_vlddqax, + TOP_vlddqaxx, TOP_UNDEFINED ); Any_Operand_Access_Time(0); Any_Result_Available_Time(4); @@ -1563,6 +1571,10 @@ TOP_fisttpl, TOP_fisttpll, TOP_fldz, + TOP_vfaddsd, + TOP_vfaddss, + TOP_vsubsd, + TOP_vsubss, TOP_UNDEFINED); Any_Operand_Access_Time(0); Any_Result_Available_Time(5); @@ -1597,8 +1609,6 @@ TOP_vfaddsub128v32, TOP_vfadd128v64, TOP_vfadd128v32, - 
TOP_vfaddsd, - TOP_vfaddss, TOP_UNDEFINED); Any_Operand_Access_Time(0); Any_Result_Available_Time(5); @@ -1648,12 +1658,6 @@ TOP_vfaddx128v32, TOP_vfaddxx128v32, TOP_vfaddxxx128v32, - TOP_vfaddxsd, - TOP_vfaddxxsd, - TOP_vfaddxxxsd, - TOP_vfaddxss, - TOP_vfaddxxss, - TOP_vfaddxxxss, TOP_UNDEFINED); Any_Operand_Access_Time(0); Any_Result_Available_Time(10); @@ -2392,16 +2396,32 @@ Instruction_Group("float-alu w/ memory operand", TOP_addxss, + TOP_addxxss, + TOP_addxxxss, TOP_addxsd, - TOP_addxxss, TOP_addxxsd, + TOP_addxxxsd, + TOP_subxss, + TOP_subxxss, + TOP_subxxxss, TOP_subxsd, - TOP_subxss, TOP_subxxsd, - TOP_subxxss, + TOP_subxxxsd, TOP_filds, TOP_fildl, TOP_fildll, + TOP_vfaddxsd, + TOP_vfaddxxsd, + TOP_vfaddxxxsd, + TOP_vfaddxss, + TOP_vfaddxxss, + TOP_vfaddxxxss, + TOP_vsubxsd, + TOP_vsubxxsd, + TOP_vsubxxxsd, + TOP_vsubxss, + TOP_vsubxxss, + TOP_vsubxxxss, TOP_UNDEFINED); Any_Operand_Access_Time(0); Any_Result_Available_Time(10); @@ -2461,9 +2481,11 @@ Instruction_Group("float-mul", TOP_mulsd, TOP_mulss, + TOP_vmulsd, + TOP_vmulss, TOP_UNDEFINED); Any_Operand_Access_Time(0); - Any_Result_Available_Time(5); + Any_Result_Available_Time(6); Resource_Requirement(res_issue, 0); Resource_Requirement(res_fmul, 0); @@ -2474,6 +2496,12 @@ TOP_mulxss, TOP_mulxxss, TOP_mulxxxss, + TOP_vmulxsd, + TOP_vmulxxsd, + TOP_vmulxxxsd, + TOP_vmulxss, + TOP_vmulxxss, + TOP_vmulxxxss, TOP_UNDEFINED); Any_Operand_Access_Time(0); Any_Result_Available_Time(10); @@ -4627,10 +4655,6 @@ TOP_vldx64_2sse, TOP_vldxx64_2sse, TOP_vld64_2sse_n32, - TOP_vlddqa, - TOP_vlddqax, - TOP_vlddqaxx, - TOP_vlddqa_n32, TOP_vldhpd, TOP_vldhpdx, TOP_vldhpdxx, @@ -4639,10 +4663,6 @@ TOP_vldhpsx, TOP_vldhpsxx, TOP_vldhps_n32, - TOP_vldsd, - TOP_vldsdx, - TOP_vldsdxx, - TOP_vldsd_n32, TOP_vldlps, TOP_vldlpsx, TOP_vldlpsxx, @@ -4654,10 +4674,6 @@ TOP_vldlpdx, TOP_vldlpdxx, TOP_vldlpd_n32, - TOP_vldss, - TOP_vldssx, - TOP_vldssxx, - TOP_vldss_n32, TOP_UNDEFINED); Any_Operand_Access_Time(0); 
Any_Result_Available_Time(4); @@ -4946,8 +4962,6 @@ Instruction_Group( "avx fp arith reg opnd 2", TOP_vfrcp128v32, TOP_vfrcpss, - TOP_vsubsd, - TOP_vsubss, TOP_UNDEFINED); Any_Operand_Access_Time(0); Any_Result_Available_Time(5); @@ -5116,12 +5130,6 @@ TOP_vfrcpxss, TOP_vfrcpxxss, TOP_vfrcpxxxss, - TOP_vsubxsd, - TOP_vsubxxsd, - TOP_vsubxxxsd, - TOP_vsubxss, - TOP_vsubxxss, - TOP_vsubxxxss, TOP_UNDEFINED); Any_Operand_Access_Time(0); Any_Result_Available_Time(10); @@ -5264,6 +5272,8 @@ Any_Result_Available_Time(4); Resource_Requirement(res_issue, 0); Resource_Requirement(res_fstore, 0); + Resource_Requirement(res_loadstore, 0); + Store_Available_Time(4); Instruction_Group( "avx broadcast reg opnd", TOP_vfbroadcastss, @@ -5298,12 +5308,11 @@ Instruction_Group( "avx fp mul reg opnd", TOP_vfmul128v64, TOP_vfmul128v32, - TOP_vmulsd, - TOP_vmulss, TOP_UNDEFINED); Any_Operand_Access_Time(0); Any_Result_Available_Time(6); Resource_Requirement(res_issue, 0); + Resource_Requirement(res_fmul, 0); Instruction_Group( "avx fp mul mem opnd", TOP_vmpsadbwx, @@ -5322,17 +5331,12 @@ TOP_vfmulx128v32, TOP_vfmulxx128v32, TOP_vfmulxxx128v32, - TOP_vmulxsd, - TOP_vmulxxsd, - TOP_vmulxxxsd, - TOP_vmulxss, - TOP_vmulxxss, - TOP_vmulxxxss, TOP_UNDEFINED); Any_Operand_Access_Time(0); Any_Result_Available_Time(10); Resource_Requirement(res_issue, 0); Resource_Requirement(res_fstore, 0); + Resource_Requirement(res_loadstore, 0); Instruction_Group( "avx ptest reg opnd", TOP_vptest128, @@ -5412,7 +5416,9 @@ Any_Operand_Access_Time(0); Any_Result_Available_Time(4); Resource_Requirement(res_issue, 0); - Resource_Requirement(res_fstore, 0); + Resource_Requirement(res_fadd, 0); + Resource_Requirement(res_loadstore, 0); + Load_Access_Time(4); Instruction_Group( "avx unalign reg opnd transfer", TOP_vstupd, ------------------------------------------------------------------------------ AppSumo Presents a FREE Video for the SourceForge Community by Eric Ries, the creator of the Lean Startup Methodology 
on "Lean Startup Secrets Revealed." This video shows you how to validate your ideas, optimize your ideas and identify your business strategy. http://p.sf.net/sfu/appsumosfdev2dev _______________________________________________ Open64-devel mailing list Open64-devel@lists.sourceforge.net https://lists.sourceforge.net/lists/listinfo/open64-devel