Author: mberg Date: 2011-06-17 15:45:27 -0400 (Fri, 17 Jun 2011) New Revision: 3651
Modified: trunk/osprey/be/cg/cg_flags.cxx trunk/osprey/be/cg/cg_flags.h trunk/osprey/be/cg/cg_loop.cxx trunk/osprey/be/cg/cg_loop.h trunk/osprey/be/cg/cgdriver.cxx trunk/osprey/be/cg/lra.cxx trunk/osprey/be/cg/lra.h trunk/osprey/be/cg/op.h trunk/osprey/be/cg/x8664/ebo_special.cxx trunk/osprey/be/cg/x8664/exp_loadstore.cxx trunk/osprey/be/cg/x8664/expand.cxx trunk/osprey/common/targ_info/isa/x8664/isa_operands.cxx trunk/osprey/common/targ_info/isa/x8664/isa_print.cxx trunk/osprey/common/targ_info/isa/x8664/isa_properties.cxx Log: Best fit unrolling implementation added along with vmovlps update. The algorithm is controlled by -CG:nobest_fit=<on|off>, the behavior is on by default only on x86 targets under default unroll optimizations(unroll by 4 and size of 128). Other targets will need to revisit default values and add some support in todo marked areas to utilize this code. CR by Jian-Xin. Modified: trunk/osprey/be/cg/cg_flags.cxx =================================================================== --- trunk/osprey/be/cg/cg_flags.cxx 2011-06-16 07:17:12 UTC (rev 3650) +++ trunk/osprey/be/cg/cg_flags.cxx 2011-06-17 19:45:27 UTC (rev 3651) @@ -124,6 +124,7 @@ BOOL CG_128bitstore = TRUE; BOOL CG_branch_fuse = TRUE; BOOL CG_dispatch_schedule = FALSE; +BOOL CG_LOOP_nounroll_best_fit_set = FALSE; BOOL CG_strcmp_expand = TRUE; BOOL CG_merge_counters_x86 = FALSE; BOOL CG_interior_ptrs_x86 = FALSE; Modified: trunk/osprey/be/cg/cg_flags.h =================================================================== --- trunk/osprey/be/cg/cg_flags.h 2011-06-16 07:17:12 UTC (rev 3650) +++ trunk/osprey/be/cg/cg_flags.h 2011-06-17 19:45:27 UTC (rev 3651) @@ -101,6 +101,9 @@ * BOOL CG_dispatch_schedule * Enable dispatch scheduling for Orochi style architectures. * + * BOOL CG_LOOP_unroll_best_fit + * Toggle default state of unroll best fit behavior. + * * BOOL CG_128bitstore * Enable 128bit unaligned stores optimization which emits movup{s|d} * instead of movhp{s|d} with movlp{s|d}. @@ -530,6 +533,7 @@ extern BOOL CG_cmp_load_exec; extern BOOL CG_fma4_load_exec; extern BOOL CG_dispatch_schedule; +extern BOOL CG_LOOP_nounroll_best_fit_set; extern BOOL CG_128bitstore; extern BOOL CG_branch_fuse; extern BOOL CG_strcmp_expand; Modified: trunk/osprey/be/cg/cg_loop.cxx =================================================================== --- trunk/osprey/be/cg/cg_loop.cxx 2011-06-16 07:17:12 UTC (rev 3650) +++ trunk/osprey/be/cg/cg_loop.cxx 2011-06-17 19:45:27 UTC (rev 3651) @@ -187,6 +187,8 @@ #include "ebo.h" #include "hb.h" #include "gra_live.h" +#include "lra.h" +#include "calls.h" #if defined(TARG_SL) #include "tag.h" @@ -256,6 +258,7 @@ BOOL CG_LOOP_optimize_multi_targ = FALSE; BOOL CG_LOOP_optimize_lno_winddown_cache = TRUE; BOOL CG_LOOP_optimize_lno_winddown_reg = TRUE; +BOOL CG_LOOP_unroll_best_fit = FALSE; /* Note: To set default unroll parameters, modify the initialization * of OPT_unroll_times/size in "config.c". @@ -5335,6 +5338,275 @@ } +// This algorithm is based in part on a paper by ma and carr +// for determining register pressure for unrolled loops and +// the most profitable unroll factor, if any, to unroll by. 
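The selection criterion the new function implements is small; as a standalone sketch (the helpers ii() and penalty() here are hypothetical stand-ins for the schedule estimate and the Ma/Carr pressure penalty that the code below actually computes):

    #include <climits>
    #include <functional>

    // ii(k): estimated cycles for k unrolled copies of the loop body.
    // penalty(k): extra cycles charged when the predicted register
    //             pressure of the k-times-unrolled body exceeds the
    //             physical register count.
    int best_unroll_factor(const std::function<int(int)>& ii,
                           const std::function<int(int)>& penalty,
                           int max_factor) {
      int best_k = 1, best_unit_ii = INT_MAX;
      for (int k = 1; k <= max_factor; k *= 2) {    // powers of two only
        int unit_ii = (ii(k) + penalty(k)) / k;     // cost per original iteration
        if (unit_ii <= best_unit_ii) {              // ties go to the larger factor
          best_unit_ii = unit_ii;
          best_k = k;
        }
      }
      return best_k;
    }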
+void CG_LOOP::Determine_Best_Unit_Iteration_Interval(BOOL can_refit) +{ + BB *bb = LOOP_DESCR_loophead(loop); + INT init_II[5]; + BOOL saved_state_sched_est; + BOOL toggle_sched_est = false; + + // only single block loops + if (BB_SET_Size(LOOP_DESCR_bbset(loop)) != 1) + return; + + // This is now the default single block unroll factor calculation + // algorithm, if the user specified any other unroll by or size + // other than the default, the heuristics below will not be use to + // determine unroll factor. + if (CG_LOOP_unroll_best_fit == false) + return; + + MEM_POOL_Push(&MEM_phase_nz_pool); + + mINT8 fatpoint[ISA_REGISTER_CLASS_MAX+1]; + const INT len = BB_length(bb); + INT *regs_in_use = (INT*)alloca(sizeof(INT) * (len+1)); + INT max_conf = 0; + INT R_f = 0; + INT P_f = REGISTER_CLASS_register_count(ISA_REGISTER_CLASS_float); + INT N_f = 0; + INT D_f = 0; + INT R_i = 0; + INT P_i = REGISTER_CLASS_register_count(ISA_REGISTER_CLASS_integer); + INT N_i = 0; + INT D_i = 0; + INT avg_conflicts = 0; + INT k_conflicts = 0; + TN_MAP conflict_map_f; + TN_MAP conflict_map_i; + TN *tn; + LRA_Estimate_Fat_Points(bb, fatpoint, regs_in_use, &MEM_phase_nz_pool); + +#ifdef TARG_X8664 + // now adjust the number of gpr regs as per the ABI + P_i--; + if (Is_Target_32bit() && Gen_Frame_Pointer) + P_i--; +#endif + + // compute the number of fp Regs Predicted, func return degree so add 1 + conflict_map_f = Calculate_All_Conflicts(bb, regs_in_use, + ISA_REGISTER_CLASS_float); + R_f = Find_Max_Conflicts(conflict_map_f, + &avg_conflicts, + &k_conflicts, + &N_f, + &D_f, + ISA_REGISTER_CLASS_float) + 1; + + // compute the number of gpr Regs Predicted, func return degree so add 1 + conflict_map_i = Calculate_All_Conflicts(bb, regs_in_use, + ISA_REGISTER_CLASS_integer); + R_i = Find_Max_Conflicts(conflict_map_i, + &avg_conflicts, + &k_conflicts, + &N_i, + &D_i, + ISA_REGISTER_CLASS_integer) + 1; + + TN_MAP_Delete(conflict_map_f); + TN_MAP_Delete(conflict_map_i); + + // Now figure out E, the total number of cross iteration edges for + // both float and integer regs by counting the live out regs of each + // type for the loop + INT E_f = 0; + INT E_i = 0; + + // Exposed uses which are updated are loop-carried dependences. + for (tn = GTN_SET_Choose(BB_live_use(bb)); + tn != GTN_SET_CHOOSE_FAILURE; + tn = GTN_SET_Choose_Next(BB_live_use(bb),tn)) { + bool exposed_use_is_updated = false; + for( OP* op = BB_first_op(bb); op != NULL; op = OP_next(op) ){ + if (OP_Defs_TN(op, tn)) { + exposed_use_is_updated = true; + break; + } + } + if (exposed_use_is_updated == false) continue; + if (TN_register_class(tn) == ISA_REGISTER_CLASS_float) + E_f++; + if (TN_register_class(tn) == ISA_REGISTER_CLASS_integer) + E_i++; + } + + // Count the number of prefetch insns, as unrolled loop bodies + // will only recieve a single copy for the whole iteration for + // these kind of instructions, same is true for loop carried + // dependences(E_i and E_f). This is not from ma and carr but + // is benefitial in that we miss upper bound opportunities otherwise. + INT num_prefetch = 0; + for (OP *op = BB_first_op(bb); op != NULL; op = OP_next(op)) + if (OP_prefetch(op)) num_prefetch++; + + // Try to refit the next unroll factor by 2 into the current + // size threshold if it will fit using the above data for the + // total loop size(it is more accurate than the prior calc). 
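As a hypothetical illustration of the refit check below: for a 20-op body with E_i + E_f + num_prefetch = 4, growing the factor from 2 to 4 is estimated at 20 + (20 - 4) * (4 - 1) = 68 ops, which fits the default 128-op budget, while growing from 4 to 8 is estimated at 20 + 16 * 7 = 132 ops and is rejected.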
+ if ((Unroll_fully() == false) && + (can_refit) && + is_power_of_two(unroll_factor) && + ((unroll_factor * 2) < CG_LOOP_unroll_times_max)) { + INT loop_size = BB_length(bb); + INT next_factor = unroll_factor * 2; + INT max_size = (loop_size + (loop_size - (E_i + E_f + num_prefetch)) * + (next_factor - 1)); + if (max_size < CG_LOOP_unrolled_size_max) + Set_unroll_factor(next_factor); + } else if ((Unroll_fully() == false) && + (can_refit) && + ((unroll_factor + 1) < CG_LOOP_unroll_times_max)) { + INT loop_size = BB_length(bb); + INT next_factor = unroll_factor + 1; + INT max_size = (loop_size + (loop_size - (E_i + E_f + num_prefetch)) * + (next_factor - 1)); + if (max_size < CG_LOOP_unrolled_size_max) + Set_unroll_factor(next_factor); + } + +#ifdef TARG_X8664 + // calculate each unroll factors init_II + INT A_spill_f = CGTARG_Latency(TOP_ldupd); + INT A_spill_i = CGTARG_Latency(TOP_ldx64); +#else + INT A_spill_f = 1; // stubbed, todo - fill in correctly per target + INT A_spill_i = 1; // stubbed, todo - fill in correctly per target +#endif + + INT unit_II[5]; + INT II_penalty_f; + INT II_penalty_i; + INT j, i; + INT iter_j = 0; + INT chose_j = 0; + INT min_unitII = INT_MAX; + INT ntimes = 1; + INT upper_bound = unroll_factor; + + // Refitted unroll factors and already assigned unroll factors of power + // 2 only are utlized here. + if (is_power_of_two(unroll_factor)) { + CG_SCHED_EST *loop_se = CG_SCHED_EST_Create(bb, &MEM_local_nz_pool, + SCHED_EST_FOR_UNROLL); + CG_SCHED_EST *unroll_se = CG_SCHED_EST_Create(bb, &MEM_local_nz_pool, + SCHED_EST_FOR_UNROLL | + SCHED_EST_IGNORE_PREFETCH | + SCHED_EST_IGNORE_BRANCH | + SCHED_EST_IGNORE_LOH_OPS | + SCHED_EST_IGNORE_INT_OPS); + init_II[0] = CG_SCHED_EST_Cycles(loop_se); + INT rolling_init_II; + for (j = 2; j <= upper_bound; j++) { + CG_SCHED_EST_Append_Scheds(loop_se, unroll_se); + rolling_init_II = CG_SCHED_EST_Cycles(loop_se); + switch (j) { + case 2: + init_II[1] = rolling_init_II; + break; + case 4: + init_II[2] = rolling_init_II; + break; + case 8: + init_II[3] = rolling_init_II; + break; + case 16: + init_II[4] = rolling_init_II; + break; + } + } + + // Divergences from ma and carr: We have the degree and live + // range info for loop carried dependences, so E_i and E_f are + // not treated as additive components of the unit_II penalty calc, + // also we figure prefetch and E components into N components for + // penalty calc. This is more accurate than ma and carr. And finally, + // we use the defaults as upper bounds for finding the best fit + // or minimal unit_II, where if we do not find a best fit that is other + // than unroll by 1, we defer to the original unroll factor. 
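With hypothetical numbers, the per-class penalty computed below behaves like this at i = 2: given R_f = 20 predicted float live ranges against P_f = 16 registers, with N_f = 10, D_f = 6, E_f = 2 and A_spill_f = 4, then N_adj_f = (10 - 2) * 1 + 10 = 18, Tot_D_f = 6 + (6 - 2) * 1 = 10, and II_penalty_f = ((20 - 16) * 10 * 4) / 18 = 8 cycles, which is added to init_II[1] and the integer penalty before dividing by i to form unit_II.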
+ int Tot_D_f = D_f; + int Tot_D_i = D_i; + for (i = 1, j = 0; i <= upper_bound; i = i * 2, j++) { + // calculate the II_penalty for float regs + II_penalty_f = 0; + if (N_f) { + int N_adj_f = ((N_f - E_f) * (i - 1)) + N_f; + if (i > 1) + Tot_D_f += (D_f - E_f)*(i-1); + II_penalty_f = ((R_f - P_f) * (Tot_D_f) * A_spill_f); + II_penalty_f = II_penalty_f / N_adj_f; + } + + // calculate the II_penalty for integer regs + II_penalty_i = 0; + if (N_i) { + int N_adj_i = ((N_i - (E_i + num_prefetch)) * (i - 1)) + N_i; + if (i > 1) + Tot_D_i += (D_i - E_i)*(i-1); + II_penalty_i = ((R_i - P_i) * (Tot_D_i) * A_spill_i); + II_penalty_i = II_penalty_i / N_adj_i; + } + + // Now calculate the unified unit_II for both components + unit_II[j] = (init_II[j] + II_penalty_i + II_penalty_f) / i; + if (min_unitII >= unit_II[j]) { + min_unitII = unit_II[j]; + ntimes = i; + iter_j = j; + } else if ((min_unitII < 0) && + (unit_II[j] <= 0) && + (min_unitII < unit_II[j])) { + min_unitII = unit_II[j]; + ntimes = i; + iter_j = j; + } + if (unroll_factor == i) + chose_j = j; + } + } + + // This is also new, and not from ma and carr, evict unrolls that + // fit a clear profile of bad register pressure. + if ((R_f > P_f) || (R_i > P_i)) { + if (R_i > P_i) { + INT pressure_calc_i = (R_i - P_i) * A_spill_i; + INT benefit_calc_i = (num_prefetch / 2) + E_i; + if (E_i > P_i) { + // these regs will require a spill and a reload as they are updated + INT adjust_calc_i = (E_i - P_i) * (A_spill_i * 2); + pressure_calc_i += adjust_calc_i; + } + // prefetch insns are usually clumped and can issue 2 at a time + if (pressure_calc_i > benefit_calc_i) { + ntimes = 1; + Set_unroll_factor(ntimes); + } + } else if (R_f > P_f) { + INT pressure_calc_f = (R_f - P_f) * A_spill_f; + INT benefit_calc_f = E_f; + if (E_f > P_f) { + // these regs will require a spill and a reload as they are updated + INT adjust_calc_f = (E_f - P_f) * (A_spill_f * 2); + pressure_calc_f += adjust_calc_f; + } + if (pressure_calc_f > benefit_calc_f) { + ntimes = 1; + Set_unroll_factor(ntimes); + } + } + } + + // If ntimes is 1, use what we have already, this means that if we + // retained the orig it was either the min value or we did not find one + // or we have a register pressure case. + if ((Unroll_fully() == false) && (ntimes != 1) && (unroll_factor != ntimes)) + Set_unroll_factor(ntimes); + MEM_POOL_Pop(&MEM_phase_nz_pool); +} + + void CG_LOOP::Determine_Unroll_Factor() { LOOPINFO *info = LOOP_DESCR_loopinfo(Loop()); @@ -5385,6 +5657,9 @@ ntimes--; Set_unroll_factor(ntimes); +#ifdef TARG_X8664 + Determine_Best_Unit_Iteration_Interval(TRUE); +#endif } else { BOOL const_trip = TN_is_constant(trip_count_tn); @@ -5437,6 +5712,9 @@ ntimes /= 2; } Set_unroll_factor(ntimes); +#ifdef TARG_X8664 + Determine_Best_Unit_Iteration_Interval(!const_trip); +#endif } } @@ -8091,7 +8369,115 @@ } #endif +void Report_Loop_Info(LOOP_DESCR *loop, + char *usage_str, + BOOL after_prescheduling, + MEM_POOL *pool) +{ + // This func is a debug trace utility + if (Get_Trace(TP_CGLOOP, 1) == FALSE) + return; + BB *bb = LOOP_DESCR_loophead(loop); + if (BB_unrollings(bb) && + (BB_SET_Size(LOOP_DESCR_bbset(loop)) == 1)) { + BOOL saved_state_sched_est; + BOOL toggle_sched_est = false; + + // calculate or obtain the init_II cycle time + // from either the locs scheduler if we have not yet + // prescheduled the code, or from the last ready time + // cycle of the scheduled code if we have. 
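For the register-pressure eviction test at the end of Determine_Best_Unit_Iteration_Interval above, a hypothetical instance: with R_i = 18 predicted integer ranges, P_i = 14 registers and A_spill_i = 4, pressure_calc_i = (18 - 14) * 4 = 16; with two prefetches and E_i = 3 (which fits in the register file, so no reload adjustment applies), benefit_calc_i = 2 / 2 + 3 = 4; since 16 > 4, the unroll factor is reset to 1.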
+ INT init_II = 0; + if (after_prescheduling) { + init_II = OP_scycle(BB_last_op(bb)); + } else { + SCHED_EST_TYPE type = (SCHED_EST_FOR_UNROLL); + init_II = (INT32)CG_SCHED_EST_BB_Cycles(bb, type); + } + + mINT8 fatpoint[ISA_REGISTER_CLASS_MAX+1]; + const INT len = BB_length(bb); + INT *regs_in_use = (INT*)alloca(sizeof(INT) * (len+1)); + INT max_conf = 0; + INT R_f = 0; + INT P_f = REGISTER_CLASS_register_count(ISA_REGISTER_CLASS_float); + INT N_f = 0; + INT D_f = 0; + INT R_i = 0; + INT P_i = REGISTER_CLASS_register_count(ISA_REGISTER_CLASS_integer); + INT N_i = 0; + INT D_i = 0; + INT avg_conflicts_i = 0; + INT avg_conflicts_f = 0; + INT k_conflicts = 0; + TN_MAP conflict_map_f; + TN_MAP conflict_map_i; + TN *tn; + BOOL first_time = TRUE; + BOOL changed = TRUE; + +#ifdef TARG_X8664 + // now adjust the number of gpr regs as per the ABI + P_i--; + if (Is_Target_32bit() && Gen_Frame_Pointer) + P_i--; +#endif + + MEM_POOL_Push(pool); + LRA_Estimate_Fat_Points(bb, fatpoint, regs_in_use, pool); + + // compute the number of fp Regs Predicted + conflict_map_f = Calculate_All_Conflicts(bb, regs_in_use, + ISA_REGISTER_CLASS_float); + R_f = Find_Max_Conflicts(conflict_map_f, + &avg_conflicts_f, + &k_conflicts, + &N_f, + &D_f, + ISA_REGISTER_CLASS_float) + 1; + + // compute the number of gpr Regs Predicted + conflict_map_i = Calculate_All_Conflicts(bb, regs_in_use, + ISA_REGISTER_CLASS_integer); + R_i = Find_Max_Conflicts(conflict_map_i, + &avg_conflicts_i, + &k_conflicts, + &N_i, + &D_i, + ISA_REGISTER_CLASS_integer) + 1; + + // Now print the details of this loop + printf("unrolled loop(%d):size = %d, ntimes=%d\n", + BB_id(bb), BB_length(bb), BB_unrollings(bb)); + printf("%s bb = %d, init_II = %d\n", usage_str, BB_id(bb), init_II); + printf("R_f = %d, D_f = %d, N_f = %d, avg_degree_f = %d\n", + R_f, D_f, N_f, avg_conflicts_f); + printf("R_i = %d, D_i = %d, N_i = %d, avg_degree_i = %d\n", + R_i, D_i, N_i, avg_conflicts_i); + + TN_MAP_Delete(conflict_map_f); + TN_MAP_Delete(conflict_map_i); + MEM_POOL_Pop(pool); + } +} + +void Examine_Loop_Info(char *usage_str, BOOL after_presched) +{ + if (CG_opt_level > 0) { + MEM_POOL loop_descr_pool; + MEM_POOL_Initialize(&loop_descr_pool, "loop_descriptors", TRUE); + + Calculate_Dominators(); + for (LOOP_DESCR *loop = LOOP_DESCR_Detect_Loops(&loop_descr_pool); + loop; + loop = LOOP_DESCR_next(loop)) { + Report_Loop_Info(loop, usage_str, after_presched, &loop_descr_pool); + } + Free_Dominators_Memory(); + } +} + // Perform loop optimizations for all inner loops // in the PU. 
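With TP_CGLOOP tracing enabled, Report_Loop_Info prints one such block per unrolled single-block loop, roughly of the following shape (all values and the leading "after unrolling" tag are hypothetical; the tag is whatever usage_str the caller passes to Examine_Loop_Info):

    unrolled loop(7):size = 64, ntimes=4
    after unrolling bb = 7, init_II = 22
    R_f = 11, D_f = 5, N_f = 18, avg_degree_f = 6
    R_i = 9, D_i = 4, N_i = 27, avg_degree_i = 5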
// Modified: trunk/osprey/be/cg/cg_loop.h =================================================================== --- trunk/osprey/be/cg/cg_loop.h 2011-06-16 07:17:12 UTC (rev 3650) +++ trunk/osprey/be/cg/cg_loop.h 2011-06-17 19:45:27 UTC (rev 3651) @@ -468,6 +468,7 @@ extern BOOL CG_LOOP_unroll_remainder_fully; extern UINT32 CG_LOOP_unroll_min_trip; extern BOOL CG_LOOP_unroll_analysis; +extern BOOL CG_LOOP_unroll_best_fit; extern BOOL CG_LOOP_ooo_unroll_heuristics; extern BOOL CG_LOOP_ooo_unroll_heuristics_set; extern UINT32 CG_LOOP_reorder_buffer_size; @@ -632,6 +633,7 @@ void Recompute_Liveness(); bool Determine_Unroll_Fully(BOOL count_multi_bb); + void Determine_Best_Unit_Iteration_Interval(BOOL can_refit); void Determine_Unroll_Factor(); void Determine_SWP_Unroll_Factor(); void Build_CG_LOOP_Info(BOOL single_bb); @@ -704,6 +706,8 @@ extern CG_LOOP *Current_CG_LOOP; +extern void Examine_Loop_Info(char *usage_str, BOOL after_presched); + #if defined(TARG_IA64) || defined(TARG_SL) || defined(TARG_MIPS) extern void Perform_Loop_Optimizations(void *rgn_loop_update=NULL); Modified: trunk/osprey/be/cg/cgdriver.cxx =================================================================== --- trunk/osprey/be/cg/cgdriver.cxx 2011-06-16 07:17:12 UTC (rev 3650) +++ trunk/osprey/be/cg/cgdriver.cxx 2011-06-17 19:45:27 UTC (rev 3651) @@ -128,6 +128,9 @@ #include "flags.h" #endif #include "cg_swp.h" +#ifdef TARG_X8664 +#include "config_wopt.h" +#endif extern void Set_File_In_Printsrc(char *); /* defined in printsrc.c */ @@ -469,6 +472,8 @@ 0, 0, 0, &CG_fma4_load_exec, NULL }, { OVK_BOOL, OV_VISIBLE, TRUE, "dsched", "", 0, 0, 0, &CG_dispatch_schedule, NULL }, + { OVK_BOOL, OV_VISIBLE, TRUE, "nobest_fit", "", + 0, 0, 0, &CG_LOOP_nounroll_best_fit_set, NULL }, { OVK_BOOL, OV_VISIBLE, TRUE, "unalign_st", "", 0, 0, 0, &CG_128bitstore, NULL }, { OVK_BOOL, OV_VISIBLE, TRUE, "brfuse", "", @@ -2021,6 +2026,18 @@ OPT_unroll_size = 128; #endif +#ifdef TARG_X8664 + if (Is_Target_Orochi() || Is_Target_Barcelona()) { + // check if default to determine if we use best fit unrolling or not + if ((OPT_unroll_size == 128) && + (OPT_unroll_times == 4) && + (WOPT_Enable_WN_Unroll == 1)) { + if (CG_LOOP_nounroll_best_fit_set == false) + CG_LOOP_unroll_best_fit = TRUE; + } + } +#endif + if ( OPT_Unroll_Analysis_Set ) { CG_LOOP_unroll_analysis = OPT_Unroll_Analysis; Modified: trunk/osprey/be/cg/lra.cxx =================================================================== --- trunk/osprey/be/cg/lra.cxx 2011-06-16 07:17:12 UTC (rev 3650) +++ trunk/osprey/be/cg/lra.cxx 2011-06-17 19:45:27 UTC (rev 3651) @@ -810,34 +810,66 @@ } -static int -Calculate_Conflicting_Live_Ranges(TN *tn) +void +Populate_Init_Degrees(BB *bb, INT *regs_in_use) { - LIVE_RANGE *cur_lr = LR_For_TN(tn); - int conflict_count = 0; - for (LIVE_RANGE *lr = Live_Range_List; lr != NULL; lr = LR_next(lr)) { - if (cur_lr == lr) continue; - if ((LR_first_def(lr) == 0) && (LR_last_use(lr) == 0)) continue; - if (LR_use_cnt(lr) == 0) continue; - TN *cur_tn = LR_tn(lr); - if (TN_register_class(tn) != TN_register_class(cur_tn)) continue; - if (LR_conflicts_with_reg_LR(lr, cur_lr)) conflict_count++; + for (INT opnum = 0; opnum < BB_length(bb); opnum++) { + regs_in_use[opnum] = 0; } - return conflict_count; } +void +Populate_Degrees_Over_LRs(INT *regs_in_use, LIVE_RANGE *lr) +{ + INT opnum; + + // populate the live range, first def to last use, + // exposed uses will cause the live range to expand. 
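Taken out of context, the conflict estimate these new lra.cxx helpers compute is an interval histogram over the ops of the block; a self-contained sketch with simplified, hypothetical types (the real code walks LIVE_RANGEs and regs_in_use, as in the loop that follows):

    #include <algorithm>
    #include <vector>

    struct Range { int first_def, last_use; };   // simplified stand-in for LIVE_RANGE

    // For each range, the estimated conflict count is the maximum number
    // of ranges simultaneously live at any op it spans, minus the range
    // itself (Populate_Degrees_Over_LRs plus Find_Max_Degree_For_LR - 1).
    std::vector<int> conflicts(const std::vector<Range>& ranges, int n_ops) {
      std::vector<int> depth(n_ops, 0);          // regs_in_use analogue
      for (const Range& lr : ranges)
        for (int op = lr.first_def; op < lr.last_use; ++op)
          ++depth[op];
      std::vector<int> result;
      for (const Range& lr : ranges) {
        int max_degree = 0;
        for (int op = lr.first_def; op < lr.last_use; ++op)
          max_degree = std::max(max_degree, depth[op]);
        result.push_back(max_degree - 1);
      }
      return result;
    }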
+ for (opnum = LR_first_def(lr); opnum < LR_last_use(lr); opnum++) { + INT32 degree = regs_in_use[opnum]; + degree++; + regs_in_use[opnum] = degree; + } +} + + +INT +Find_Max_Degree_For_LR(INT *regs_in_use, LIVE_RANGE *lr) +{ + INT opnum; + INT32 max_degree = 0; + for (opnum = LR_first_def(lr); opnum < LR_last_use(lr); opnum++) { + INT32 degree = regs_in_use[opnum]; + max_degree = (degree > max_degree) ? degree : max_degree; + } + return max_degree; +} + + TN_MAP -Calculate_All_Conflicts(ISA_REGISTER_CLASS rclass) +Calculate_All_Conflicts(BB *bb, INT *regs_in_use, ISA_REGISTER_CLASS rclass) { TN_MAP conflict_map = TN_MAP_Create(); + // calculate degrees for live range intervals, op by op. + Populate_Init_Degrees(bb, regs_in_use); for (LIVE_RANGE *lr = Live_Range_List; lr != NULL; lr = LR_next(lr)) { TN *tn = LR_tn(lr); + if (TN_register_class(tn) != rclass) continue; + if (LR_use_cnt(lr) == 0) continue; + if (LR_last_use(lr) == 0) continue; + Populate_Degrees_Over_LRs(regs_in_use, lr); + } + + for (LIVE_RANGE *lr = Live_Range_List; lr != NULL; lr = LR_next(lr)) { + TN *tn = LR_tn(lr); INT num_conflicts; if (TN_register_class(tn) != rclass) continue; if (LR_use_cnt(lr) == 0) continue; - num_conflicts = Calculate_Conflicting_Live_Ranges(tn); + if (LR_last_use(lr) == 0) continue; + // true degree does not include the live range itself. + num_conflicts = Find_Max_Degree_For_LR(regs_in_use, lr) - 1; TN_MAP_Set(conflict_map, tn, (void*)num_conflicts); } @@ -845,6 +877,73 @@ } +void +Print_Range_And_Conflict_Info(TN_MAP conflict_map, + ISA_REGISTER_CLASS rclass) +{ + for (LIVE_RANGE *lr = Live_Range_List; lr != NULL; lr = LR_next(lr)) { + TN *tn = LR_tn(lr); + INT num_conflicts; + if (TN_register_class(tn) != rclass) continue; + if (LR_use_cnt(lr) == 0) continue; + if (LR_last_use(lr) == 0) continue; + num_conflicts = (INTPTR)TN_MAP_Get(conflict_map, tn); + printf("lr conflicts(%d) :", num_conflicts); + Print_Live_Range (lr); + } +} + + +INT +Find_Max_Conflicts(TN_MAP conflict_map, + INT *average_conflicts, + INT *num_k_conflicts, + INT *num_edges, + INT *outgoing_edges, + ISA_REGISTER_CLASS rclass) +{ + INT max_conflicts = 0; + INT sum_conflicts = 0; + INT n_ranges = 0; + INT n_edges = 0; + INT n_defs = 0; + INT total_def_degree = 0; + INT k_conflicts = 0; + INT num_pr = REGISTER_CLASS_register_count(rclass); + LIVE_RANGE *last_lr = NULL; + + for (LIVE_RANGE *lr = Live_Range_List; lr != NULL; lr = LR_next(lr)) { + TN *tn = LR_tn(lr); + INT num_conflicts; + if (TN_register_class(tn) != rclass) continue; + if (LR_use_cnt(lr) == 0) continue; + if (LR_last_use(lr) == 0) continue; + num_conflicts = (INTPTR)TN_MAP_Get(conflict_map, tn); + if (num_conflicts > max_conflicts) + max_conflicts = num_conflicts; + if (num_conflicts > num_pr) + k_conflicts++; + n_edges += LR_use_cnt(lr); + n_ranges++; + sum_conflicts += num_conflicts; + last_lr = lr; + if (LR_first_def(lr) != 0) { + n_defs += LR_def_cnt(lr); + if (total_def_degree < num_conflicts) + total_def_degree = num_conflicts; + } + } + if (n_ranges) { + TN *tn = LR_tn(last_lr); + *average_conflicts = (sum_conflicts/n_ranges); + *num_k_conflicts = k_conflicts; + *num_edges = n_edges; + *outgoing_edges = total_def_degree; + } + return max_conflicts; +} + + bool Query_Conflicts_Improved(TN_MAP orig_map, TN_MAP new_map, @@ -877,8 +976,6 @@ } } *num_ranges_mitigated = num_ranges_moved_below_pr_pressure; - TN_MAP_Delete(orig_map); - TN_MAP_Delete(new_map); return (num_improved > num_degraded); } @@ -918,6 +1015,61 @@ } +void +Truncate_LRs_For_OP (OP *op) +{ 
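+  // Remove this op's contribution from the live-range bookkeeping:
+  // clear the first_def/exposed-use markers it established, drop its
+  // def and use counts, and pull last_use back to the previous
+  // reference, so that ops pruned from consideration (e.g. the SIB
+  // address adds in ebo_special.cxx) do not inflate the degree and
+  // conflict estimates.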
+ if (op == NULL) return; + + BB *bb = OP_bb(op); + INT i; + INT cur_opnum; + + // Find our current OP's opnum + for (cur_opnum = 1; cur_opnum < BB_length(bb); cur_opnum++) { + OP *cur_op = OP_VECTOR_element (Insts_Vector, cur_opnum); + if (op == cur_op) + break; + } + // did we find it? + if (cur_opnum == BB_length(bb)) + return; + + for (i = 0; i < OP_results(op); i++) { + TN *res = OP_result(op, i); + if (TN_is_register(res)) { + LIVE_RANGE *lr = LR_For_TN(res); + if (LR_first_def(lr) == cur_opnum) { + LR_first_def(lr) = 0; + if (LR_upward_exposed_use(lr) == cur_opnum) { + if (LR_exposed_use(lr) == LR_upward_exposed_use(lr)) + LR_exposed_use(lr) = 0; + LR_upward_exposed_use(lr) = 0; + } + } + if (LR_def_cnt(lr) > 1) + LR_def_cnt(lr)--; + } + } + for (i = 0; i < OP_opnds(op); i++) { + TN *opnd_tn = OP_opnd(op,i); + if (TN_is_register(opnd_tn)) { + LIVE_RANGE *lr = LR_For_TN(opnd_tn); + LR_use_cnt(lr)--; + if (LR_last_use(lr) == cur_opnum) { + // walk up to the closest use, else the last use is 0 + LR_last_use(lr) = 0; + for (INT opnum = cur_opnum; opnum > 0; opnum--) { + OP *cur_op = OP_VECTOR_element (Insts_Vector, opnum); + if (OP_Refs_TN(cur_op, opnd_tn)) { + LR_last_use(lr) = opnum; + } + } + } + } + } +} + + /* Mark that TN is used in OP. */ static void Mark_Use (TN *tn, OP *op, INT opnum, BB *bb, BOOL in_lra, Modified: trunk/osprey/be/cg/lra.h =================================================================== --- trunk/osprey/be/cg/lra.h 2011-06-16 07:17:12 UTC (rev 3650) +++ trunk/osprey/be/cg/lra.h 2011-06-17 19:45:27 UTC (rev 3651) @@ -85,13 +85,24 @@ * final result, and which comparisons of live range pressure * can be made. */ -extern TN_MAP Calculate_All_Conflicts(ISA_REGISTER_CLASS rclass); +extern TN_MAP Calculate_All_Conflicts(BB *bb, + INT *regs_in_use, + ISA_REGISTER_CLASS rclass); extern void Merge_Live_Ranges(TN *tn1, TN *tn2, bool make_tn1_span); extern bool Query_Conflicts_Improved(TN_MAP orig_map, TN_MAP new_map, INT num_reserved, INT *num_ranges_mitigated, ISA_REGISTER_CLASS rclass); +extern void Print_Range_And_Conflict_Info(TN_MAP conflict_map, + ISA_REGISTER_CLASS rclass); +extern INT Find_Max_Conflicts(TN_MAP conflict_map, + INT *average_conflicts, + INT *num_k_conflicts, + INT *num_edges, + INT *outgoing_conflicts, + ISA_REGISTER_CLASS rclass); +extern void Truncate_LRs_For_OP(OP *op); /* Returns the number of registers LRA is requesting from GRA for * the class <cl> in the basic block <bb>. 
If we run the scheduling Modified: trunk/osprey/be/cg/op.h =================================================================== --- trunk/osprey/be/cg/op.h 2011-06-16 07:17:12 UTC (rev 3650) +++ trunk/osprey/be/cg/op.h 2011-06-17 19:45:27 UTC (rev 3651) @@ -727,6 +727,8 @@ #define OP_memory(o) (OP_load(o) | OP_store(o) | OP_prefetch(o)) #define OP_mcode(o) (TOP_is_mcode(OP_code(o))) #define OP_is4(o) (TOP_is_is4_reg(OP_code(o))) +#define OP_vec_lo_ldst(o) (TOP_is_vector_lo_loadstore(OP_code(o))) +#define OP_vec_hi_ldst(o) (TOP_is_vector_high_loadstore(OP_code(o))) #else #define OP_memory(o) (OP_load(o) | OP_store(o) | OP_prefetch(o)) #endif Modified: trunk/osprey/be/cg/x8664/ebo_special.cxx =================================================================== --- trunk/osprey/be/cg/x8664/ebo_special.cxx 2011-06-16 07:17:12 UTC (rev 3650) +++ trunk/osprey/be/cg/x8664/ebo_special.cxx 2011-06-17 19:45:27 UTC (rev 3651) @@ -5186,6 +5186,7 @@ OP* new_op = NULL; ADDR_MODE mode = BASE_INDEX_MODE; const TOP new_top = Get_Top_For_Addr_Mode(OP_code(op), mode); + const TOP old_top = OP_code(op); FmtAssert( new_top != TOP_UNDEFINED, ("Compose_Mem_Op: unknown top") ); if( TOP_is_prefetch( new_top ) ){ new_op = Mk_OP( new_top, OP_opnd( op, 0 ), base, offset, index, scale ); @@ -5198,7 +5199,16 @@ storeval = OP_result( op, 0 ); } if (OP_load(op) || OP_store(op) || OP_prefetch(op)) { - new_op = Mk_OP( new_top, storeval, base, offset, index, scale ); + if (Is_Target_Orochi() && Is_Target_AVX() && OP_load(op) && + (old_top != TOP_vldsd) && + (old_top != TOP_vldsdx) && + (old_top != TOP_vldsdxx) && + (OP_vec_lo_ldst(op) || OP_vec_hi_ldst(op))) { + new_op = Mk_OP( new_top, storeval, OP_opnd( op, 0 ), + base, index, scale, offset ); + } else { + new_op = Mk_OP( new_top, storeval, base, offset, index, scale ); + } } else if (OP_load_exe(op)) { if (OP_opnds(op) == 2) { FmtAssert ((storeval != NULL), @@ -5325,6 +5335,22 @@ return found; } +static bool tn_find(std::deque<TN*>& tn_queue, TN *tn) +{ + bool found = false; + std::deque<TN*>::iterator tn_queue_it; + for (tn_queue_it = tn_queue.begin(); + tn_queue_it != tn_queue.end(); + ++tn_queue_it) { + TN* cur_tn = *tn_queue_it; + if (cur_tn == tn) { + found = true; + break; + } + } + return found; +} + // compare two def trees and mark attributes concerning input tn's tree static void compare_def_tree(TN *tn, INT *num, @@ -6124,6 +6150,23 @@ TN_MAP_Delete(def_map); } +static void prune_adds_from_live_range_analysis( + std::deque<TN*>& add_tns, + std::set<TN*>& counted_base_regs, + std::map<TN*,OP*>& add_map) +{ + std::set<TN*>::const_iterator counted_base_regs_it; + for (counted_base_regs_it = counted_base_regs.begin(); + counted_base_regs_it != counted_base_regs.end(); + ++counted_base_regs_it) { + TN *tn = *counted_base_regs_it; + if (tn_find(add_tns, tn) == false) { + OP* add_op = add_map[tn]; + Truncate_LRs_For_OP(add_op); + } + } +} + // After building interior pointers candidates, remove // all the effected counted_base_regs from SIB processing so // that we do not translate them in SIB translation. 
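A note on the AVX handling in Compose_Mem_Op above and in the expansion changes below: the non-destructive vmovlps/vmovhps (and vmovlpd/vmovhpd) forms take a second xmm source that supplies the half of the destination not loaded from memory (roughly, in AT&T syntax, vmovlps (%rax), %xmm2, %xmm1 fills the low 64 bits of %xmm1 from memory and the high 64 bits from %xmm2). The vldhp*/vldlp* TOPs are therefore moved into operand groups that carry an extra fp128 source, Compose_Mem_Op forwards the original xmm operand when it folds an address mode, and the expanders materialize a zeroed register with TOP_xzero128v32 to feed that operand when no meaningful upper half exists.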
@@ -6133,7 +6176,8 @@ std::map<INT,std::deque<ST*> >& correlated_addr_map, std::map<ST*,std::deque<TN*> >& symbol_addr_map, std::set<TN*>& counted_base_regs, - BB *lhead, + std::map<TN*,OP*>& add_map, + BB *bb, bool loop_vectorized, MEM_POOL *pool) { INT num_cands = 0; @@ -6189,15 +6233,49 @@ TN_MAP orig_conflict_map; TN_MAP new_conflict_map; mINT8 fatpoint[ISA_REGISTER_CLASS_MAX+1]; - const INT len = BB_length(lhead); + const INT len = BB_length(bb); INT* regs_in_use = (INT *)alloca(sizeof(INT) * (len+1)); + std::deque<TN*> add_tns; MEM_POOL_Push(pool); // Calculate the current live ranges - LRA_Estimate_Fat_Points(lhead, fatpoint, regs_in_use, pool); - orig_conflict_map = Calculate_All_Conflicts(ISA_REGISTER_CLASS_integer); + LRA_Estimate_Fat_Points(bb, fatpoint, regs_in_use, pool); + // build a list of tns interior pointers will operate on + for (counted_addr_sts_it = counted_addr_sts.begin(); + counted_addr_sts_it != counted_addr_sts.end(); + ++counted_addr_sts_it) { + std::deque<TN*> st_tns; + st_tns = symbol_addr_map[*counted_addr_sts_it]; + // Skip the non interior pointer cands + if (st_tns.empty()) continue; + + INT cor_addr_index = st_tns.size(); + if (cor_addr_index != max_pattern) continue; + + if (!correlated_addr_map[cor_addr_index].empty()) { + std::deque<TN*>::iterator st_tns_iter; + for (st_tns_iter = st_tns.begin(); + st_tns_iter != st_tns.end(); + ++st_tns_iter) { + TN *tn1 = *st_tns_iter; + ++st_tns_iter; + TN *tn2 = *st_tns_iter; + add_tns.push_front(tn2); + } + } + } + + // remove all the sib adds from our live range maps so that we get + // an accurate picture for analysis. + prune_adds_from_live_range_analysis(add_tns, counted_base_regs, add_map); + add_tns.clear(); + + // Do the initial live range analysis + orig_conflict_map = Calculate_All_Conflicts(bb, regs_in_use, + ISA_REGISTER_CLASS_integer); + // Merge all pairs live ranges on the minor basereg bool first_time = true; for (counted_addr_sts_it = counted_addr_sts.begin(); @@ -6242,13 +6320,31 @@ // If Query_Conflicts_Improved returns with a state that indicates // that introduction of interior pointers does not benefit live // range pressure, we do not proceed with the allowing the translation. - new_conflict_map = Calculate_All_Conflicts(ISA_REGISTER_CLASS_integer); + new_conflict_map = Calculate_All_Conflicts(bb, regs_in_use, + ISA_REGISTER_CLASS_integer); + INT N_i = 0; + INT D_i = 0; + INT avg_conflicts = 0; + INT k_conflicts = 0; if (Query_Conflicts_Improved(orig_conflict_map, new_conflict_map, 3, &num_ranges_mitigated, - ISA_REGISTER_CLASS_integer) == false) + ISA_REGISTER_CLASS_integer) == false) { clear_all = true; + } else if (Find_Max_Conflicts(orig_conflict_map, + &avg_conflicts, + &k_conflicts, + &N_i, + &D_i, + ISA_REGISTER_CLASS_integer) == num_pr) { + // For vectorized loops we want more than k-conflicts as max in the + // original conflict map context. 
+ if (loop_vectorized) + min_reclaimable = 4; + } + TN_MAP_Delete(orig_conflict_map); + TN_MAP_Delete(new_conflict_map); MEM_POOL_Pop(pool); } @@ -7006,6 +7102,7 @@ correlated_addr_map, symbol_addr_map, counted_base_regs, + add_map, lhead, loop_vectorized, pool); @@ -7423,7 +7520,7 @@ } } if (dontdoit) continue; - + // // collect the LiveIn sets of pdoms of loop header // which are not part of the loop @@ -7657,6 +7754,15 @@ return FALSE; } + const TOP old_top = OP_code(op); + if (Is_Target_Orochi() && Is_Target_AVX() && + (old_top != TOP_vldsd) && + (old_top != TOP_vldsdx) && + (old_top != TOP_vldsdxx) && + (OP_vec_lo_ldst(op) || OP_vec_hi_ldst(op))) { + return FALSE; + } + const INT op_base_idx = OP_find_opnd_use( op, OU_base ); EBO_TN_INFO* base_tninfo = op_base_idx >= 0 ? actual_tninfo[op_base_idx] : NULL; OP* addr_op = (base_tninfo != NULL) ? base_tninfo->in_op : NULL; Modified: trunk/osprey/be/cg/x8664/exp_loadstore.cxx =================================================================== --- trunk/osprey/be/cg/x8664/exp_loadstore.cxx 2011-06-16 07:17:12 UTC (rev 3650) +++ trunk/osprey/be/cg/x8664/exp_loadstore.cxx 2011-06-17 19:45:27 UTC (rev 3651) @@ -382,6 +382,16 @@ } } } + if (Is_Target_Orochi() && Is_Target_AVX() && + ((top == TOP_ldlps) || + (top == TOP_ldhps) || + (top == TOP_ldlpd) || + (top == TOP_ldhpd))){ + TN *xzero = Build_TN_Like(result); + Build_OP( TOP_xzero128v32, xzero, ops ); + Build_OP( top, result, xzero, base, ofst, ops ); + return; + } Build_OP (top, result, base, ofst, ops); } @@ -634,10 +644,34 @@ } else if (mtype == MTYPE_V8I1 || mtype == MTYPE_V8I2 || mtype == MTYPE_V8I4 || mtype == MTYPE_V8I8) { - if (base != NULL) - Build_OP(!Is_Target_SSE2() ? TOP_ldlps : TOP_ld64_2sse, result, base, disp, ops); - else Build_OP(!Is_Target_SSE2() ? TOP_ldlps_n32 : TOP_ld64_2sse_n32, result, disp, ops); + if (Is_Target_Orochi() && Is_Target_AVX()){ + TN *xzero = Build_TN_Like(result); + Build_OP(TOP_xzero128v32, xzero, ops); + if (base != NULL) + Build_OP(TOP_ldlps, result, xzero, base, disp, ops); + else + Build_OP(TOP_ldlps_n32, result, xzero, disp, ops); + } else { + if (base != NULL) + Build_OP(!Is_Target_SSE2() ? TOP_ldlps : TOP_ld64_2sse, + result, base, disp, ops); + else + Build_OP(!Is_Target_SSE2() ? 
TOP_ldlps_n32 : TOP_ld64_2sse_n32, + result, disp, ops); + } } + else if (mtype == MTYPE_V8F4 ) { + if (Is_Target_Orochi() && Is_Target_AVX()){ + TN *xzero = Build_TN_Like(result); + Build_OP(TOP_xzero128v32, xzero, ops); + if (base != NULL) + Build_OP(TOP_ldlps, result, xzero, base, disp, ops); + else + Build_OP(TOP_ldlps_n32, result, xzero, disp, ops); + } else { + Expand_Composed_Load (op, result, base, disp, variant, ops); + } + } else if (mtype == MTYPE_V16F8 || mtype == MTYPE_V16C8) { if(Is_Target_Barcelona() || Is_Target_Orochi()){ if(base != NULL) Modified: trunk/osprey/be/cg/x8664/expand.cxx =================================================================== --- trunk/osprey/be/cg/x8664/expand.cxx 2011-06-16 07:17:12 UTC (rev 3650) +++ trunk/osprey/be/cg/x8664/expand.cxx 2011-06-17 19:45:27 UTC (rev 3651) @@ -7988,7 +7988,13 @@ Build_OP( TOP_ldsd, result, op1, Gen_Literal_TN (0,4), ops ); break; case INTRN_LOADHPD: - Build_OP( TOP_ldhpd, result, op1, Gen_Literal_TN (0,4), ops ); + if (Is_Target_Orochi() && Is_Target_AVX()){ + TN *xzero = Build_TN_Like(result); + Build_OP( TOP_xzero128v32, xzero, ops ); + Build_OP( TOP_ldhpd, result, xzero, op1, Gen_Literal_TN (0,4), ops ); + } else { + Build_OP( TOP_ldhpd, result, op1, Gen_Literal_TN (0,4), ops ); + } break; case INTRN_UNPCKLPD: Build_OP( TOP_unpcklpd, result, op0, op1, ops ); @@ -8208,10 +8214,22 @@ Build_OP( TOP_ldupd, result, op0, Gen_Literal_TN (0,4), ops ); break; case INTRN_LOADHPS: - Build_OP( TOP_ldhps, result, op1, Gen_Literal_TN (0,4), ops ); + if (Is_Target_Orochi() && Is_Target_AVX()){ + TN *xzero = Build_TN_Like(result); + Build_OP( TOP_xzero128v32, xzero, ops ); + Build_OP( TOP_ldhps, result, xzero, op1, Gen_Literal_TN (0,4), ops ); + } else { + Build_OP( TOP_ldhps, result, op1, Gen_Literal_TN (0,4), ops ); + } break; case INTRN_LOADLPS: - Build_OP( TOP_ldlps, result, op1, Gen_Literal_TN (0,4), ops ); + if (Is_Target_Orochi() && Is_Target_AVX()){ + TN *xzero = Build_TN_Like(result); + Build_OP( TOP_xzero128v32, xzero, ops ); + Build_OP( TOP_ldlps, result, xzero, op1, Gen_Literal_TN (0,4), ops ); + } else { + Build_OP( TOP_ldlps, result, op1, Gen_Literal_TN (0,4), ops ); + } break; case INTRN_MOVMSKPS: Build_OP( TOP_movmskps, result, op0, ops ); Modified: trunk/osprey/common/targ_info/isa/x8664/isa_operands.cxx =================================================================== --- trunk/osprey/common/targ_info/isa/x8664/isa_operands.cxx 2011-06-16 07:17:12 UTC (rev 3650) +++ trunk/osprey/common/targ_info/isa/x8664/isa_operands.cxx 2011-06-17 19:45:27 UTC (rev 3651) @@ -1545,6 +1545,10 @@ TOP_vaesenclastx, TOP_vaesdecx, TOP_vaesdeclastx, + TOP_vldhpd, + TOP_vldlpd, + TOP_vldhps, + TOP_vldlps, TOP_UNDEFINED); Result(0, fp128); Operand(0, fp128, opnd1); @@ -1785,6 +1789,10 @@ TOP_vaesenclastxx, TOP_vaesdecxx, TOP_vaesdeclastxx, + TOP_vldhpdx, + TOP_vldhpsx, + TOP_vldlpdx, + TOP_vldlpsx, TOP_UNDEFINED); Result(0, fp128); Operand(0, fp128, opnd1); @@ -2025,6 +2033,10 @@ TOP_vaesenclastxxx, TOP_vaesdecxxx, TOP_vaesdeclastxxx, + TOP_vldhpdxx, + TOP_vldhpsxx, + TOP_vldlpdxx, + TOP_vldlpsxx, TOP_UNDEFINED); Result(0, fp128); Operand(0, fp128, opnd1); @@ -4286,15 +4298,21 @@ TOP_vlddqa_n32, TOP_vldapd_n32, TOP_vldaps_n32, - TOP_vldlpd_n32, TOP_vldupd_n32, TOP_vldups_n32, + TOP_UNDEFINED); + Result(0, fp128); + Operand(0, simm32, offset); + + Instruction_Group("avx float load vector w/o base or index", TOP_vldhpd_n32, + TOP_vldlpd_n32, TOP_vldhps_n32, TOP_vldlps_n32, TOP_UNDEFINED); Result(0, fp128); - Operand(0, simm32, 
offset); + Operand(0, fp128, opnd1); + Operand(1, simm32, offset); Instruction_Group("float load vector", TOP_lddqa, @@ -4318,14 +4336,10 @@ TOP_vldntdqa, TOP_vldapd, TOP_vldaps, - TOP_vldlpd, TOP_vldss, TOP_vldupd, TOP_vldups, - TOP_vldhpd, - TOP_vldhps, TOP_vldsd, - TOP_vldlps, TOP_UNDEFINED); Result(0, fp128); Operand(0, int64, base); @@ -5676,14 +5690,10 @@ TOP_vldntdqax, TOP_vldapdx, TOP_vldapsx, - TOP_vldlpdx, TOP_vldssx, TOP_vldupdx, TOP_vldupsx, - TOP_vldhpdx, - TOP_vldhpsx, TOP_vldsdx, - TOP_vldlpsx, TOP_UNDEFINED); Result(0, fp128); Operand(0, int64, base); @@ -5745,14 +5755,10 @@ TOP_vldntdqaxx, TOP_vldapdxx, TOP_vldapsxx, - TOP_vldlpdxx, TOP_vldssxx, TOP_vldupdxx, TOP_vldupsxx, - TOP_vldhpdxx, - TOP_vldhpsxx, TOP_vldsdxx, - TOP_vldlpsxx, TOP_UNDEFINED); Result(0, fp128); Operand(0, int64, index); Modified: trunk/osprey/common/targ_info/isa/x8664/isa_print.cxx =================================================================== --- trunk/osprey/common/targ_info/isa/x8664/isa_print.cxx 2011-06-16 07:17:12 UTC (rev 3650) +++ trunk/osprey/common/targ_info/isa/x8664/isa_print.cxx 2011-06-17 19:45:27 UTC (rev 3651) @@ -1982,6 +1982,10 @@ TOP_vaesdecx, TOP_vaesdeclastx, TOP_vaeskeygenassistx, + TOP_vldhpd, + TOP_vldlpd, + TOP_vldhps, + TOP_vldlps, TOP_UNDEFINED); /* dest=op(memop, reg), non-x86-style */ @@ -2198,6 +2202,10 @@ TOP_vaesdecxx, TOP_vaesdeclastxx, TOP_vaeskeygenassistxx, + TOP_vldhpdx, + TOP_vldhpsx, + TOP_vldlpdx, + TOP_vldlpsx, TOP_UNDEFINED); /* dest=op(memop with scaled index with base, reg), non-x86-style */ @@ -2415,6 +2423,10 @@ TOP_vaesdecxxx, TOP_vaesdeclastxxx, TOP_vaeskeygenassistxxx, + TOP_vldhpdxx, + TOP_vldhpsxx, + TOP_vldlpdxx, + TOP_vldlpsxx, TOP_UNDEFINED); /* dest=op(memop with scaled index without base, reg), non-x86-style */ @@ -3422,12 +3434,8 @@ TOP_vlddqa_n32, TOP_vldapd_n32, TOP_vldaps_n32, - TOP_vldlpd_n32, TOP_vldupd_n32, TOP_vldups_n32, - TOP_vldhpd_n32, - TOP_vldhps_n32, - TOP_vldlps_n32, TOP_vstdqa_n32, TOP_vstdqu_n32, TOP_vstapd_n32, @@ -3441,6 +3449,19 @@ TOP_vaesimc, TOP_UNDEFINED ); + /* One result / two operands */ + ISA_PRINT_TYPE rop2 = ISA_Print_Type_Create("rop2", "%s %s %s,%s"); + Name(); + Operand(1); + Operand(0); + Result(0); + Instruction_Print_Group( rop, + TOP_vldhpd_n32, + TOP_vldhps_n32, + TOP_vldlpd_n32, + TOP_vldlps_n32, + TOP_UNDEFINED ); + /* One result / one mem opnd */ ISA_PRINT_TYPE rmem = ISA_Print_Type_Create("rmem", "%s %s%s(%s),%s"); Name(); @@ -3459,6 +3480,7 @@ TOP_pmovzxbwx, TOP_pmovsxbdx, TOP_pmovzxbdx, + TOP_vldlpd_n32, TOP_pmovsxbqx, TOP_pmovzxbqx, TOP_pmovsxwdx, @@ -3797,14 +3819,10 @@ TOP_vldntdqa, TOP_vldapd, TOP_vldaps, - TOP_vldlpd, TOP_vldss, TOP_vldupd, TOP_vldups, - TOP_vldhpd, - TOP_vldhps, TOP_vldsd, - TOP_vldlps, TOP_vfbroadcastss, TOP_vfbroadcastsd, TOP_vfbroadcastf128, @@ -3928,14 +3946,10 @@ TOP_vldntdqax, TOP_vldapdx, TOP_vldapsx, - TOP_vldlpdx, TOP_vldssx, TOP_vldupdx, TOP_vldupsx, - TOP_vldhpdx, - TOP_vldhpsx, TOP_vldsdx, - TOP_vldlpsx, TOP_vfbroadcastxss, TOP_vfbroadcastxsd, TOP_vfbroadcastxf128, @@ -3985,14 +3999,10 @@ TOP_vldntdqaxx, TOP_vldapdxx, TOP_vldapsxx, - TOP_vldlpdxx, TOP_vldssxx, TOP_vldupdxx, TOP_vldupsxx, - TOP_vldhpdxx, - TOP_vldhpsxx, TOP_vldsdxx, - TOP_vldlpsxx, TOP_vfbroadcastxxss, TOP_vfbroadcastxxsd, TOP_vfbroadcastxxf128, Modified: trunk/osprey/common/targ_info/isa/x8664/isa_properties.cxx =================================================================== --- trunk/osprey/common/targ_info/isa/x8664/isa_properties.cxx 2011-06-16 07:17:12 UTC (rev 3650) +++ 
trunk/osprey/common/targ_info/isa/x8664/isa_properties.cxx 2011-06-17 19:45:27 UTC (rev 3651) @@ -5651,6 +5651,18 @@ TOP_vpmadcswdxx, TOP_vpmadcswdxxx, /* AVX instructions */ + TOP_vldhpd, + TOP_vldhps, + TOP_vldlpd, + TOP_vldlps, + TOP_vldhpdx, + TOP_vldhpsx, + TOP_vldlpdx, + TOP_vldlpsx, + TOP_vldhpdxx, + TOP_vldhpsxx, + TOP_vldlpdxx, + TOP_vldlpsxx, TOP_vcmpestrix, TOP_vcmpestrixx, TOP_vcmpestrixxx,