Author: mberg
Date: 2011-06-17 15:45:27 -0400 (Fri, 17 Jun 2011)
New Revision: 3651
Modified:
trunk/osprey/be/cg/cg_flags.cxx
trunk/osprey/be/cg/cg_flags.h
trunk/osprey/be/cg/cg_loop.cxx
trunk/osprey/be/cg/cg_loop.h
trunk/osprey/be/cg/cgdriver.cxx
trunk/osprey/be/cg/lra.cxx
trunk/osprey/be/cg/lra.h
trunk/osprey/be/cg/op.h
trunk/osprey/be/cg/x8664/ebo_special.cxx
trunk/osprey/be/cg/x8664/exp_loadstore.cxx
trunk/osprey/be/cg/x8664/expand.cxx
trunk/osprey/common/targ_info/isa/x8664/isa_operands.cxx
trunk/osprey/common/targ_info/isa/x8664/isa_print.cxx
trunk/osprey/common/targ_info/isa/x8664/isa_properties.cxx
Log:
Best fit unrolling implementation added along with vmovlps update.
The algorithm is controlled by -CG:nobest_fit=<on|off>; the behavior is
on by default only on x86 targets under the default unroll
optimizations (unroll by 4 and a size of 128). Other targets will need
to revisit the default values and add some support in the TODO-marked
areas to utilize this code.
CR by Jian-Xin.
Modified: trunk/osprey/be/cg/cg_flags.cxx
===================================================================
--- trunk/osprey/be/cg/cg_flags.cxx 2011-06-16 07:17:12 UTC (rev 3650)
+++ trunk/osprey/be/cg/cg_flags.cxx 2011-06-17 19:45:27 UTC (rev 3651)
@@ -124,6 +124,7 @@
BOOL CG_128bitstore = TRUE;
BOOL CG_branch_fuse = TRUE;
BOOL CG_dispatch_schedule = FALSE;
+BOOL CG_LOOP_nounroll_best_fit_set = FALSE;
BOOL CG_strcmp_expand = TRUE;
BOOL CG_merge_counters_x86 = FALSE;
BOOL CG_interior_ptrs_x86 = FALSE;
Modified: trunk/osprey/be/cg/cg_flags.h
===================================================================
--- trunk/osprey/be/cg/cg_flags.h 2011-06-16 07:17:12 UTC (rev 3650)
+++ trunk/osprey/be/cg/cg_flags.h 2011-06-17 19:45:27 UTC (rev 3651)
@@ -101,6 +101,9 @@
* BOOL CG_dispatch_schedule
* Enable dispatch scheduling for Orochi style architectures.
*
+ * BOOL CG_LOOP_unroll_best_fit
+ * Toggle default state of unroll best fit behavior.
+ *
* BOOL CG_128bitstore
* Enable 128bit unaligned stores optimization which emits movup{s|d}
* instead of movhp{s|d} with movlp{s|d}.
@@ -530,6 +533,7 @@
extern BOOL CG_cmp_load_exec;
extern BOOL CG_fma4_load_exec;
extern BOOL CG_dispatch_schedule;
+extern BOOL CG_LOOP_nounroll_best_fit_set;
extern BOOL CG_128bitstore;
extern BOOL CG_branch_fuse;
extern BOOL CG_strcmp_expand;
Modified: trunk/osprey/be/cg/cg_loop.cxx
===================================================================
--- trunk/osprey/be/cg/cg_loop.cxx 2011-06-16 07:17:12 UTC (rev 3650)
+++ trunk/osprey/be/cg/cg_loop.cxx 2011-06-17 19:45:27 UTC (rev 3651)
@@ -187,6 +187,8 @@
#include "ebo.h"
#include "hb.h"
#include "gra_live.h"
+#include "lra.h"
+#include "calls.h"
#if defined(TARG_SL)
#include "tag.h"
@@ -256,6 +258,7 @@
BOOL CG_LOOP_optimize_multi_targ = FALSE;
BOOL CG_LOOP_optimize_lno_winddown_cache = TRUE;
BOOL CG_LOOP_optimize_lno_winddown_reg = TRUE;
+BOOL CG_LOOP_unroll_best_fit = FALSE;
/* Note: To set default unroll parameters, modify the initialization
* of OPT_unroll_times/size in "config.c".
@@ -5335,6 +5338,275 @@
}
+// This algorithm is based in part on a paper by Ma and Carr
+// on determining register pressure for unrolled loops and
+// the most profitable unroll factor, if any, to unroll by.
+void CG_LOOP::Determine_Best_Unit_Iteration_Interval(BOOL can_refit)
+{
+ BB *bb = LOOP_DESCR_loophead(loop);
+ INT init_II[5];
+ BOOL saved_state_sched_est;
+ BOOL toggle_sched_est = false;
+
+ // only single block loops
+ if (BB_SET_Size(LOOP_DESCR_bbset(loop)) != 1)
+ return;
+
+ // This is now the default single-block unroll factor calculation
+ // algorithm; if the user specified any unroll factor or size
+ // other than the default, the heuristics below will not be used to
+ // determine the unroll factor.
+ if (CG_LOOP_unroll_best_fit == false)
+ return;
+
+ MEM_POOL_Push(&MEM_phase_nz_pool);
+
+ mINT8 fatpoint[ISA_REGISTER_CLASS_MAX+1];
+ const INT len = BB_length(bb);
+ INT *regs_in_use = (INT*)alloca(sizeof(INT) * (len+1));
+ INT max_conf = 0;
+ INT R_f = 0;
+ INT P_f = REGISTER_CLASS_register_count(ISA_REGISTER_CLASS_float);
+ INT N_f = 0;
+ INT D_f = 0;
+ INT R_i = 0;
+ INT P_i = REGISTER_CLASS_register_count(ISA_REGISTER_CLASS_integer);
+ INT N_i = 0;
+ INT D_i = 0;
+ INT avg_conflicts = 0;
+ INT k_conflicts = 0;
+ TN_MAP conflict_map_f;
+ TN_MAP conflict_map_i;
+ TN *tn;
+ LRA_Estimate_Fat_Points(bb, fatpoint, regs_in_use, &MEM_phase_nz_pool);
+
+#ifdef TARG_X8664
+ // now adjust the number of gpr regs as per the ABI
+ P_i--;
+ if (Is_Target_32bit() && Gen_Frame_Pointer)
+ P_i--;
+#endif
+
+ // compute the number of fp Regs Predicted, func return degree so add 1
+ conflict_map_f = Calculate_All_Conflicts(bb, regs_in_use,
+ ISA_REGISTER_CLASS_float);
+ R_f = Find_Max_Conflicts(conflict_map_f,
+ &avg_conflicts,
+ &k_conflicts,
+ &N_f,
+ &D_f,
+ ISA_REGISTER_CLASS_float) + 1;
+
+ // compute the number of gpr Regs Predicted, func return degree so add 1
+ conflict_map_i = Calculate_All_Conflicts(bb, regs_in_use,
+ ISA_REGISTER_CLASS_integer);
+ R_i = Find_Max_Conflicts(conflict_map_i,
+ &avg_conflicts,
+ &k_conflicts,
+ &N_i,
+ &D_i,
+ ISA_REGISTER_CLASS_integer) + 1;
+
+ TN_MAP_Delete(conflict_map_f);
+ TN_MAP_Delete(conflict_map_i);
+
+ // Now figure out E, the total number of cross iteration edges for
+ // both float and integer regs by counting the live out regs of each
+ // type for the loop
+ INT E_f = 0;
+ INT E_i = 0;
+
+ // Exposed uses which are updated are loop-carried dependences.
+ for (tn = GTN_SET_Choose(BB_live_use(bb));
+ tn != GTN_SET_CHOOSE_FAILURE;
+ tn = GTN_SET_Choose_Next(BB_live_use(bb),tn)) {
+ bool exposed_use_is_updated = false;
+ for( OP* op = BB_first_op(bb); op != NULL; op = OP_next(op) ){
+ if (OP_Defs_TN(op, tn)) {
+ exposed_use_is_updated = true;
+ break;
+ }
+ }
+ if (exposed_use_is_updated == false) continue;
+ if (TN_register_class(tn) == ISA_REGISTER_CLASS_float)
+ E_f++;
+ if (TN_register_class(tn) == ISA_REGISTER_CLASS_integer)
+ E_i++;
+ }
+
+ // Count the number of prefetch insns, as unrolled loop bodies
+ // will only receive a single copy for the whole iteration for
+ // these kinds of instructions; the same is true for loop-carried
+ // dependences (E_i and E_f). This is not from Ma and Carr but
+ // is beneficial in that we would miss upper-bound opportunities otherwise.
+ INT num_prefetch = 0;
+ for (OP *op = BB_first_op(bb); op != NULL; op = OP_next(op))
+ if (OP_prefetch(op)) num_prefetch++;
+
+ // Try to refit the next unroll factor by 2 into the current
+ // size threshold if it will fit using the above data for the
+ // total loop size(it is more accurate than the prior calc).
+ if ((Unroll_fully() == false) &&
+ (can_refit) &&
+ is_power_of_two(unroll_factor) &&
+ ((unroll_factor * 2) < CG_LOOP_unroll_times_max)) {
+ INT loop_size = BB_length(bb);
+ INT next_factor = unroll_factor * 2;
+ INT max_size = (loop_size + (loop_size - (E_i + E_f + num_prefetch)) *
+ (next_factor - 1));
+ if (max_size < CG_LOOP_unrolled_size_max)
+ Set_unroll_factor(next_factor);
+ } else if ((Unroll_fully() == false) &&
+ (can_refit) &&
+ ((unroll_factor + 1) < CG_LOOP_unroll_times_max)) {
+ INT loop_size = BB_length(bb);
+ INT next_factor = unroll_factor + 1;
+ INT max_size = (loop_size + (loop_size - (E_i + E_f + num_prefetch)) *
+ (next_factor - 1));
+ if (max_size < CG_LOOP_unrolled_size_max)
+ Set_unroll_factor(next_factor);
+ }
+
+#ifdef TARG_X8664
+ // calculate each unroll factors init_II
+ INT A_spill_f = CGTARG_Latency(TOP_ldupd);
+ INT A_spill_i = CGTARG_Latency(TOP_ldx64);
+#else
+ INT A_spill_f = 1; // stubbed, todo - fill in correctly per target
+ INT A_spill_i = 1; // stubbed, todo - fill in correctly per target
+#endif
+
+ INT unit_II[5];
+ INT II_penalty_f;
+ INT II_penalty_i;
+ INT j, i;
+ INT iter_j = 0;
+ INT chose_j = 0;
+ INT min_unitII = INT_MAX;
+ INT ntimes = 1;
+ INT upper_bound = unroll_factor;
+
+ // Only refitted unroll factors and already-assigned unroll factors
+ // that are powers of 2 are utilized here.
+ if (is_power_of_two(unroll_factor)) {
+ CG_SCHED_EST *loop_se = CG_SCHED_EST_Create(bb, &MEM_local_nz_pool,
+ SCHED_EST_FOR_UNROLL);
+ CG_SCHED_EST *unroll_se = CG_SCHED_EST_Create(bb, &MEM_local_nz_pool,
+ SCHED_EST_FOR_UNROLL |
+ SCHED_EST_IGNORE_PREFETCH |
+ SCHED_EST_IGNORE_BRANCH |
+ SCHED_EST_IGNORE_LOH_OPS |
+ SCHED_EST_IGNORE_INT_OPS);
+ init_II[0] = CG_SCHED_EST_Cycles(loop_se);
+ INT rolling_init_II;
+ for (j = 2; j <= upper_bound; j++) {
+ CG_SCHED_EST_Append_Scheds(loop_se, unroll_se);
+ rolling_init_II = CG_SCHED_EST_Cycles(loop_se);
+ switch (j) {
+ case 2:
+ init_II[1] = rolling_init_II;
+ break;
+ case 4:
+ init_II[2] = rolling_init_II;
+ break;
+ case 8:
+ init_II[3] = rolling_init_II;
+ break;
+ case 16:
+ init_II[4] = rolling_init_II;
+ break;
+ }
+ }
+
+ // Divergences from Ma and Carr: we have the degree and live
+ // range info for loop-carried dependences, so E_i and E_f are
+ // not treated as additive components of the unit_II penalty calc;
+ // we also figure the prefetch and E components into the N components
+ // for the penalty calc. This is more accurate than Ma and Carr. Finally,
+ // we use the defaults as upper bounds for finding the best fit
+ // or minimal unit_II, where if we do not find a best fit other
+ // than unroll by 1, we defer to the original unroll factor.
+ int Tot_D_f = D_f;
+ int Tot_D_i = D_i;
+ for (i = 1, j = 0; i <= upper_bound; i = i * 2, j++) {
+ // calculate the II_penalty for float regs
+ II_penalty_f = 0;
+ if (N_f) {
+ int N_adj_f = ((N_f - E_f) * (i - 1)) + N_f;
+ if (i > 1)
+ Tot_D_f += (D_f - E_f)*(i-1);
+ II_penalty_f = ((R_f - P_f) * (Tot_D_f) * A_spill_f);
+ II_penalty_f = II_penalty_f / N_adj_f;
+ }
+
+ // calculate the II_penalty for integer regs
+ II_penalty_i = 0;
+ if (N_i) {
+ int N_adj_i = ((N_i - (E_i + num_prefetch)) * (i - 1)) + N_i;
+ if (i > 1)
+ Tot_D_i += (D_i - E_i)*(i-1);
+ II_penalty_i = ((R_i - P_i) * (Tot_D_i) * A_spill_i);
+ II_penalty_i = II_penalty_i / N_adj_i;
+ }
+
+ // Now calculate the unified unit_II for both components
+ unit_II[j] = (init_II[j] + II_penalty_i + II_penalty_f) / i;
+ if (min_unitII >= unit_II[j]) {
+ min_unitII = unit_II[j];
+ ntimes = i;
+ iter_j = j;
+ } else if ((min_unitII < 0) &&
+ (unit_II[j] <= 0) &&
+ (min_unitII < unit_II[j])) {
+ min_unitII = unit_II[j];
+ ntimes = i;
+ iter_j = j;
+ }
+ if (unroll_factor == i)
+ chose_j = j;
+ }
+ }
+
+ // This is also new, and not from Ma and Carr: evict unrolls that
+ // fit a clear profile of bad register pressure.
+ if ((R_f > P_f) || (R_i > P_i)) {
+ if (R_i > P_i) {
+ INT pressure_calc_i = (R_i - P_i) * A_spill_i;
+ INT benefit_calc_i = (num_prefetch / 2) + E_i;
+ if (E_i > P_i) {
+ // these regs will require a spill and a reload as they are updated
+ INT adjust_calc_i = (E_i - P_i) * (A_spill_i * 2);
+ pressure_calc_i += adjust_calc_i;
+ }
+ // prefetch insns are usually clumped and can issue 2 at a time
+ if (pressure_calc_i > benefit_calc_i) {
+ ntimes = 1;
+ Set_unroll_factor(ntimes);
+ }
+ } else if (R_f > P_f) {
+ INT pressure_calc_f = (R_f - P_f) * A_spill_f;
+ INT benefit_calc_f = E_f;
+ if (E_f > P_f) {
+ // these regs will require a spill and a reload as they are updated
+ INT adjust_calc_f = (E_f - P_f) * (A_spill_f * 2);
+ pressure_calc_f += adjust_calc_f;
+ }
+ if (pressure_calc_f > benefit_calc_f) {
+ ntimes = 1;
+ Set_unroll_factor(ntimes);
+ }
+ }
+ }
+
+ // If ntimes is 1, use what we have already, this means that if we
+ // retained the orig it was either the min value or we did not find one
+ // or we have a register pressure case.
+ if ((Unroll_fully() == false) && (ntimes != 1) && (unroll_factor != ntimes))
+ Set_unroll_factor(ntimes);
+ MEM_POOL_Pop(&MEM_phase_nz_pool);
+}
+
+
void CG_LOOP::Determine_Unroll_Factor()
{
LOOPINFO *info = LOOP_DESCR_loopinfo(Loop());
@@ -5385,6 +5657,9 @@
ntimes--;
Set_unroll_factor(ntimes);
+#ifdef TARG_X8664
+ Determine_Best_Unit_Iteration_Interval(TRUE);
+#endif
} else {
BOOL const_trip = TN_is_constant(trip_count_tn);
@@ -5437,6 +5712,9 @@
ntimes /= 2;
}
Set_unroll_factor(ntimes);
+#ifdef TARG_X8664
+ Determine_Best_Unit_Iteration_Interval(!const_trip);
+#endif
}
}
@@ -8091,7 +8369,115 @@
}
#endif
+void Report_Loop_Info(LOOP_DESCR *loop,
+ char *usage_str,
+ BOOL after_prescheduling,
+ MEM_POOL *pool)
+{
+ // This func is a debug trace utility
+ if (Get_Trace(TP_CGLOOP, 1) == FALSE)
+ return;
+ BB *bb = LOOP_DESCR_loophead(loop);
+ if (BB_unrollings(bb) &&
+ (BB_SET_Size(LOOP_DESCR_bbset(loop)) == 1)) {
+ BOOL saved_state_sched_est;
+ BOOL toggle_sched_est = false;
+
+ // calculate or obtain the init_II cycle time
+ // from either the locs scheduler if we have not yet
+ // prescheduled the code, or from the last ready time
+ // cycle of the scheduled code if we have.
+ INT init_II = 0;
+ if (after_prescheduling) {
+ init_II = OP_scycle(BB_last_op(bb));
+ } else {
+ SCHED_EST_TYPE type = (SCHED_EST_FOR_UNROLL);
+ init_II = (INT32)CG_SCHED_EST_BB_Cycles(bb, type);
+ }
+
+ mINT8 fatpoint[ISA_REGISTER_CLASS_MAX+1];
+ const INT len = BB_length(bb);
+ INT *regs_in_use = (INT*)alloca(sizeof(INT) * (len+1));
+ INT max_conf = 0;
+ INT R_f = 0;
+ INT P_f = REGISTER_CLASS_register_count(ISA_REGISTER_CLASS_float);
+ INT N_f = 0;
+ INT D_f = 0;
+ INT R_i = 0;
+ INT P_i = REGISTER_CLASS_register_count(ISA_REGISTER_CLASS_integer);
+ INT N_i = 0;
+ INT D_i = 0;
+ INT avg_conflicts_i = 0;
+ INT avg_conflicts_f = 0;
+ INT k_conflicts = 0;
+ TN_MAP conflict_map_f;
+ TN_MAP conflict_map_i;
+ TN *tn;
+ BOOL first_time = TRUE;
+ BOOL changed = TRUE;
+
+#ifdef TARG_X8664
+ // now adjust the number of gpr regs as per the ABI
+ P_i--;
+ if (Is_Target_32bit() && Gen_Frame_Pointer)
+ P_i--;
+#endif
+
+ MEM_POOL_Push(pool);
+ LRA_Estimate_Fat_Points(bb, fatpoint, regs_in_use, pool);
+
+ // compute the number of fp Regs Predicted
+ conflict_map_f = Calculate_All_Conflicts(bb, regs_in_use,
+ ISA_REGISTER_CLASS_float);
+ R_f = Find_Max_Conflicts(conflict_map_f,
+ &avg_conflicts_f,
+ &k_conflicts,
+ &N_f,
+ &D_f,
+ ISA_REGISTER_CLASS_float) + 1;
+
+ // compute the number of gpr Regs Predicted
+ conflict_map_i = Calculate_All_Conflicts(bb, regs_in_use,
+ ISA_REGISTER_CLASS_integer);
+ R_i = Find_Max_Conflicts(conflict_map_i,
+ &avg_conflicts_i,
+ &k_conflicts,
+ &N_i,
+ &D_i,
+ ISA_REGISTER_CLASS_integer) + 1;
+
+ // Now print the details of this loop
+ printf("unrolled loop(%d):size = %d, ntimes=%d\n",
+ BB_id(bb), BB_length(bb), BB_unrollings(bb));
+ printf("%s bb = %d, init_II = %d\n", usage_str, BB_id(bb), init_II);
+ printf("R_f = %d, D_f = %d, N_f = %d, avg_degree_f = %d\n",
+ R_f, D_f, N_f, avg_conflicts_f);
+ printf("R_i = %d, D_i = %d, N_i = %d, avg_degree_i = %d\n",
+ R_i, D_i, N_i, avg_conflicts_i);
+
+ TN_MAP_Delete(conflict_map_f);
+ TN_MAP_Delete(conflict_map_i);
+ MEM_POOL_Pop(pool);
+ }
+}
+
+void Examine_Loop_Info(char *usage_str, BOOL after_presched)
+{
+ if (CG_opt_level > 0) {
+ MEM_POOL loop_descr_pool;
+ MEM_POOL_Initialize(&loop_descr_pool, "loop_descriptors", TRUE);
+
+ Calculate_Dominators();
+ for (LOOP_DESCR *loop = LOOP_DESCR_Detect_Loops(&loop_descr_pool);
+ loop;
+ loop = LOOP_DESCR_next(loop)) {
+ Report_Loop_Info(loop, usage_str, after_presched, &loop_descr_pool);
+ }
+ Free_Dominators_Memory();
+ }
+}
+
// Perform loop optimizations for all inner loops
// in the PU.
//
Modified: trunk/osprey/be/cg/cg_loop.h
===================================================================
--- trunk/osprey/be/cg/cg_loop.h 2011-06-16 07:17:12 UTC (rev 3650)
+++ trunk/osprey/be/cg/cg_loop.h 2011-06-17 19:45:27 UTC (rev 3651)
@@ -468,6 +468,7 @@
extern BOOL CG_LOOP_unroll_remainder_fully;
extern UINT32 CG_LOOP_unroll_min_trip;
extern BOOL CG_LOOP_unroll_analysis;
+extern BOOL CG_LOOP_unroll_best_fit;
extern BOOL CG_LOOP_ooo_unroll_heuristics;
extern BOOL CG_LOOP_ooo_unroll_heuristics_set;
extern UINT32 CG_LOOP_reorder_buffer_size;
@@ -632,6 +633,7 @@
void Recompute_Liveness();
bool Determine_Unroll_Fully(BOOL count_multi_bb);
+ void Determine_Best_Unit_Iteration_Interval(BOOL can_refit);
void Determine_Unroll_Factor();
void Determine_SWP_Unroll_Factor();
void Build_CG_LOOP_Info(BOOL single_bb);
@@ -704,6 +706,8 @@
extern CG_LOOP *Current_CG_LOOP;
+extern void Examine_Loop_Info(char *usage_str, BOOL after_presched);
+
#if defined(TARG_IA64) || defined(TARG_SL) || defined(TARG_MIPS)
extern void Perform_Loop_Optimizations(void *rgn_loop_update=NULL);
Modified: trunk/osprey/be/cg/cgdriver.cxx
===================================================================
--- trunk/osprey/be/cg/cgdriver.cxx 2011-06-16 07:17:12 UTC (rev 3650)
+++ trunk/osprey/be/cg/cgdriver.cxx 2011-06-17 19:45:27 UTC (rev 3651)
@@ -128,6 +128,9 @@
#include "flags.h"
#endif
#include "cg_swp.h"
+#ifdef TARG_X8664
+#include "config_wopt.h"
+#endif
extern void Set_File_In_Printsrc(char *); /* defined in printsrc.c */
@@ -469,6 +472,8 @@
0, 0, 0, &CG_fma4_load_exec, NULL },
{ OVK_BOOL, OV_VISIBLE, TRUE, "dsched", "",
0, 0, 0, &CG_dispatch_schedule, NULL },
+ { OVK_BOOL, OV_VISIBLE, TRUE, "nobest_fit", "",
+ 0, 0, 0, &CG_LOOP_nounroll_best_fit_set, NULL },
{ OVK_BOOL, OV_VISIBLE, TRUE, "unalign_st", "",
0, 0, 0, &CG_128bitstore, NULL },
{ OVK_BOOL, OV_VISIBLE, TRUE, "brfuse", "",
@@ -2021,6 +2026,18 @@
OPT_unroll_size = 128;
#endif
+#ifdef TARG_X8664
+ if (Is_Target_Orochi() || Is_Target_Barcelona()) {
+ // check if default to determine if we use best fit unrolling or not
+ if ((OPT_unroll_size == 128) &&
+ (OPT_unroll_times == 4) &&
+ (WOPT_Enable_WN_Unroll == 1)) {
+ if (CG_LOOP_nounroll_best_fit_set == false)
+ CG_LOOP_unroll_best_fit = TRUE;
+ }
+ }
+#endif
+
if ( OPT_Unroll_Analysis_Set )
{
CG_LOOP_unroll_analysis = OPT_Unroll_Analysis;
Modified: trunk/osprey/be/cg/lra.cxx
===================================================================
--- trunk/osprey/be/cg/lra.cxx 2011-06-16 07:17:12 UTC (rev 3650)
+++ trunk/osprey/be/cg/lra.cxx 2011-06-17 19:45:27 UTC (rev 3651)
@@ -810,34 +810,66 @@
}
-static int
-Calculate_Conflicting_Live_Ranges(TN *tn)
+void
+Populate_Init_Degrees(BB *bb, INT *regs_in_use)
{
- LIVE_RANGE *cur_lr = LR_For_TN(tn);
- int conflict_count = 0;
- for (LIVE_RANGE *lr = Live_Range_List; lr != NULL; lr = LR_next(lr)) {
- if (cur_lr == lr) continue;
- if ((LR_first_def(lr) == 0) && (LR_last_use(lr) == 0)) continue;
- if (LR_use_cnt(lr) == 0) continue;
- TN *cur_tn = LR_tn(lr);
- if (TN_register_class(tn) != TN_register_class(cur_tn)) continue;
- if (LR_conflicts_with_reg_LR(lr, cur_lr)) conflict_count++;
+ for (INT opnum = 0; opnum < BB_length(bb); opnum++) {
+ regs_in_use[opnum] = 0;
}
- return conflict_count;
}
+void
+Populate_Degrees_Over_LRs(INT *regs_in_use, LIVE_RANGE *lr)
+{
+ INT opnum;
+
+ // populate the live range, first def to last use,
+ // exposed uses will cause the live range to expand.
+ for (opnum = LR_first_def(lr); opnum < LR_last_use(lr); opnum++) {
+ INT32 degree = regs_in_use[opnum];
+ degree++;
+ regs_in_use[opnum] = degree;
+ }
+}
+
+
+INT
+Find_Max_Degree_For_LR(INT *regs_in_use, LIVE_RANGE *lr)
+{
+ INT opnum;
+ INT32 max_degree = 0;
+ for (opnum = LR_first_def(lr); opnum < LR_last_use(lr); opnum++) {
+ INT32 degree = regs_in_use[opnum];
+ max_degree = (degree > max_degree) ? degree : max_degree;
+ }
+ return max_degree;
+}
+
+
TN_MAP
-Calculate_All_Conflicts(ISA_REGISTER_CLASS rclass)
+Calculate_All_Conflicts(BB *bb, INT *regs_in_use, ISA_REGISTER_CLASS rclass)
{
TN_MAP conflict_map = TN_MAP_Create();
+ // calculate degrees for live range intervals, op by op.
+ Populate_Init_Degrees(bb, regs_in_use);
for (LIVE_RANGE *lr = Live_Range_List; lr != NULL; lr = LR_next(lr)) {
TN *tn = LR_tn(lr);
+ if (TN_register_class(tn) != rclass) continue;
+ if (LR_use_cnt(lr) == 0) continue;
+ if (LR_last_use(lr) == 0) continue;
+ Populate_Degrees_Over_LRs(regs_in_use, lr);
+ }
+
+ for (LIVE_RANGE *lr = Live_Range_List; lr != NULL; lr = LR_next(lr)) {
+ TN *tn = LR_tn(lr);
INT num_conflicts;
if (TN_register_class(tn) != rclass) continue;
if (LR_use_cnt(lr) == 0) continue;
- num_conflicts = Calculate_Conflicting_Live_Ranges(tn);
+ if (LR_last_use(lr) == 0) continue;
+ // true degree does not include the live range itself.
+ num_conflicts = Find_Max_Degree_For_LR(regs_in_use, lr) - 1;
TN_MAP_Set(conflict_map, tn, (void*)num_conflicts);
}
@@ -845,6 +877,73 @@
}
+void
+Print_Range_And_Conflict_Info(TN_MAP conflict_map,
+ ISA_REGISTER_CLASS rclass)
+{
+ for (LIVE_RANGE *lr = Live_Range_List; lr != NULL; lr = LR_next(lr)) {
+ TN *tn = LR_tn(lr);
+ INT num_conflicts;
+ if (TN_register_class(tn) != rclass) continue;
+ if (LR_use_cnt(lr) == 0) continue;
+ if (LR_last_use(lr) == 0) continue;
+ num_conflicts = (INTPTR)TN_MAP_Get(conflict_map, tn);
+ printf("lr conflicts(%d) :", num_conflicts);
+ Print_Live_Range (lr);
+ }
+}
+
+
+INT
+Find_Max_Conflicts(TN_MAP conflict_map,
+ INT *average_conflicts,
+ INT *num_k_conflicts,
+ INT *num_edges,
+ INT *outgoing_edges,
+ ISA_REGISTER_CLASS rclass)
+{
+ INT max_conflicts = 0;
+ INT sum_conflicts = 0;
+ INT n_ranges = 0;
+ INT n_edges = 0;
+ INT n_defs = 0;
+ INT total_def_degree = 0;
+ INT k_conflicts = 0;
+ INT num_pr = REGISTER_CLASS_register_count(rclass);
+ LIVE_RANGE *last_lr = NULL;
+
+ for (LIVE_RANGE *lr = Live_Range_List; lr != NULL; lr = LR_next(lr)) {
+ TN *tn = LR_tn(lr);
+ INT num_conflicts;
+ if (TN_register_class(tn) != rclass) continue;
+ if (LR_use_cnt(lr) == 0) continue;
+ if (LR_last_use(lr) == 0) continue;
+ num_conflicts = (INTPTR)TN_MAP_Get(conflict_map, tn);
+ if (num_conflicts > max_conflicts)
+ max_conflicts = num_conflicts;
+ if (num_conflicts > num_pr)
+ k_conflicts++;
+ n_edges += LR_use_cnt(lr);
+ n_ranges++;
+ sum_conflicts += num_conflicts;
+ last_lr = lr;
+ if (LR_first_def(lr) != 0) {
+ n_defs += LR_def_cnt(lr);
+ if (total_def_degree < num_conflicts)
+ total_def_degree = num_conflicts;
+ }
+ }
+ if (n_ranges) {
+ TN *tn = LR_tn(last_lr);
+ *average_conflicts = (sum_conflicts/n_ranges);
+ *num_k_conflicts = k_conflicts;
+ *num_edges = n_edges;
+ *outgoing_edges = total_def_degree;
+ }
+ return max_conflicts;
+}
+
+
bool
Query_Conflicts_Improved(TN_MAP orig_map,
TN_MAP new_map,
@@ -877,8 +976,6 @@
}
}
*num_ranges_mitigated = num_ranges_moved_below_pr_pressure;
- TN_MAP_Delete(orig_map);
- TN_MAP_Delete(new_map);
return (num_improved > num_degraded);
}
@@ -918,6 +1015,61 @@
}
+void
+Truncate_LRs_For_OP (OP *op)
+{
+ if (op == NULL) return;
+
+ BB *bb = OP_bb(op);
+ INT i;
+ INT cur_opnum;
+
+ // Find our current OP's opnum
+ for (cur_opnum = 1; cur_opnum < BB_length(bb); cur_opnum++) {
+ OP *cur_op = OP_VECTOR_element (Insts_Vector, cur_opnum);
+ if (op == cur_op)
+ break;
+ }
+ // did we find it?
+ if (cur_opnum == BB_length(bb))
+ return;
+
+ for (i = 0; i < OP_results(op); i++) {
+ TN *res = OP_result(op, i);
+ if (TN_is_register(res)) {
+ LIVE_RANGE *lr = LR_For_TN(res);
+ if (LR_first_def(lr) == cur_opnum) {
+ LR_first_def(lr) = 0;
+ if (LR_upward_exposed_use(lr) == cur_opnum) {
+ if (LR_exposed_use(lr) == LR_upward_exposed_use(lr))
+ LR_exposed_use(lr) = 0;
+ LR_upward_exposed_use(lr) = 0;
+ }
+ }
+ if (LR_def_cnt(lr) > 1)
+ LR_def_cnt(lr)--;
+ }
+ }
+ for (i = 0; i < OP_opnds(op); i++) {
+ TN *opnd_tn = OP_opnd(op,i);
+ if (TN_is_register(opnd_tn)) {
+ LIVE_RANGE *lr = LR_For_TN(opnd_tn);
+ LR_use_cnt(lr)--;
+ if (LR_last_use(lr) == cur_opnum) {
+ // walk up to the closest use, else the last use is 0
+ LR_last_use(lr) = 0;
+ for (INT opnum = cur_opnum; opnum > 0; opnum--) {
+ OP *cur_op = OP_VECTOR_element (Insts_Vector, opnum);
+ if (OP_Refs_TN(cur_op, opnd_tn)) {
+ LR_last_use(lr) = opnum;
+ }
+ }
+ }
+ }
+ }
+}
+
+
/* Mark that TN is used in OP. */
static void
Mark_Use (TN *tn, OP *op, INT opnum, BB *bb, BOOL in_lra,
Modified: trunk/osprey/be/cg/lra.h
===================================================================
--- trunk/osprey/be/cg/lra.h 2011-06-16 07:17:12 UTC (rev 3650)
+++ trunk/osprey/be/cg/lra.h 2011-06-17 19:45:27 UTC (rev 3651)
@@ -85,13 +85,24 @@
* final result, and which comparisons of live range pressure
* can be made.
*/
-extern TN_MAP Calculate_All_Conflicts(ISA_REGISTER_CLASS rclass);
+extern TN_MAP Calculate_All_Conflicts(BB *bb,
+ INT *regs_in_use,
+ ISA_REGISTER_CLASS rclass);
extern void Merge_Live_Ranges(TN *tn1, TN *tn2, bool make_tn1_span);
extern bool Query_Conflicts_Improved(TN_MAP orig_map,
TN_MAP new_map,
INT num_reserved,
INT *num_ranges_mitigated,
ISA_REGISTER_CLASS rclass);
+extern void Print_Range_And_Conflict_Info(TN_MAP conflict_map,
+ ISA_REGISTER_CLASS rclass);
+extern INT Find_Max_Conflicts(TN_MAP conflict_map,
+ INT *average_conflicts,
+ INT *num_k_conflicts,
+ INT *num_edges,
+ INT *outgoing_conflicts,
+ ISA_REGISTER_CLASS rclass);
+extern void Truncate_LRs_For_OP(OP *op);
/* Returns the number of registers LRA is requesting from GRA for
* the class <cl> in the basic block <bb>. If we run the scheduling
Modified: trunk/osprey/be/cg/op.h
===================================================================
--- trunk/osprey/be/cg/op.h 2011-06-16 07:17:12 UTC (rev 3650)
+++ trunk/osprey/be/cg/op.h 2011-06-17 19:45:27 UTC (rev 3651)
@@ -727,6 +727,8 @@
#define OP_memory(o) (OP_load(o) | OP_store(o) | OP_prefetch(o))
#define OP_mcode(o) (TOP_is_mcode(OP_code(o)))
#define OP_is4(o) (TOP_is_is4_reg(OP_code(o)))
+#define OP_vec_lo_ldst(o) (TOP_is_vector_lo_loadstore(OP_code(o)))
+#define OP_vec_hi_ldst(o) (TOP_is_vector_high_loadstore(OP_code(o)))
#else
#define OP_memory(o) (OP_load(o) | OP_store(o) | OP_prefetch(o))
#endif
Modified: trunk/osprey/be/cg/x8664/ebo_special.cxx
===================================================================
--- trunk/osprey/be/cg/x8664/ebo_special.cxx 2011-06-16 07:17:12 UTC (rev
3650)
+++ trunk/osprey/be/cg/x8664/ebo_special.cxx 2011-06-17 19:45:27 UTC (rev
3651)
@@ -5186,6 +5186,7 @@
OP* new_op = NULL;
ADDR_MODE mode = BASE_INDEX_MODE;
const TOP new_top = Get_Top_For_Addr_Mode(OP_code(op), mode);
+ const TOP old_top = OP_code(op);
FmtAssert( new_top != TOP_UNDEFINED, ("Compose_Mem_Op: unknown top") );
if( TOP_is_prefetch( new_top ) ){
new_op = Mk_OP( new_top, OP_opnd( op, 0 ), base, offset, index, scale );
@@ -5198,7 +5199,16 @@
storeval = OP_result( op, 0 );
}
if (OP_load(op) || OP_store(op) || OP_prefetch(op)) {
- new_op = Mk_OP( new_top, storeval, base, offset, index, scale );
+ if (Is_Target_Orochi() && Is_Target_AVX() && OP_load(op) &&
+ (old_top != TOP_vldsd) &&
+ (old_top != TOP_vldsdx) &&
+ (old_top != TOP_vldsdxx) &&
+ (OP_vec_lo_ldst(op) || OP_vec_hi_ldst(op))) {
+ new_op = Mk_OP( new_top, storeval, OP_opnd( op, 0 ),
+ base, index, scale, offset );
+ } else {
+ new_op = Mk_OP( new_top, storeval, base, offset, index, scale );
+ }
} else if (OP_load_exe(op)) {
if (OP_opnds(op) == 2) {
FmtAssert ((storeval != NULL),
@@ -5325,6 +5335,22 @@
return found;
}
+static bool tn_find(std::deque<TN*>& tn_queue, TN *tn)
+{
+ bool found = false;
+ std::deque<TN*>::iterator tn_queue_it;
+ for (tn_queue_it = tn_queue.begin();
+ tn_queue_it != tn_queue.end();
+ ++tn_queue_it) {
+ TN* cur_tn = *tn_queue_it;
+ if (cur_tn == tn) {
+ found = true;
+ break;
+ }
+ }
+ return found;
+}
+
// compare two def trees and mark attributes concerning input tn's tree
static void compare_def_tree(TN *tn,
INT *num,
@@ -6124,6 +6150,23 @@
TN_MAP_Delete(def_map);
}
+static void prune_adds_from_live_range_analysis(
+ std::deque<TN*>& add_tns,
+ std::set<TN*>& counted_base_regs,
+ std::map<TN*,OP*>& add_map)
+{
+ std::set<TN*>::const_iterator counted_base_regs_it;
+ for (counted_base_regs_it = counted_base_regs.begin();
+ counted_base_regs_it != counted_base_regs.end();
+ ++counted_base_regs_it) {
+ TN *tn = *counted_base_regs_it;
+ if (tn_find(add_tns, tn) == false) {
+ OP* add_op = add_map[tn];
+ Truncate_LRs_For_OP(add_op);
+ }
+ }
+}
+
// After building interior pointers candidates, remove
// all the effected counted_base_regs from SIB processing so
// that we do not translate them in SIB translation.
@@ -6133,7 +6176,8 @@
std::map<INT,std::deque<ST*> >& correlated_addr_map,
std::map<ST*,std::deque<TN*> >& symbol_addr_map,
std::set<TN*>& counted_base_regs,
- BB *lhead,
+ std::map<TN*,OP*>& add_map,
+ BB *bb,
bool loop_vectorized,
MEM_POOL *pool) {
INT num_cands = 0;
@@ -6189,15 +6233,49 @@
TN_MAP orig_conflict_map;
TN_MAP new_conflict_map;
mINT8 fatpoint[ISA_REGISTER_CLASS_MAX+1];
- const INT len = BB_length(lhead);
+ const INT len = BB_length(bb);
INT* regs_in_use = (INT *)alloca(sizeof(INT) * (len+1));
+ std::deque<TN*> add_tns;
MEM_POOL_Push(pool);
// Calculate the current live ranges
- LRA_Estimate_Fat_Points(lhead, fatpoint, regs_in_use, pool);
- orig_conflict_map = Calculate_All_Conflicts(ISA_REGISTER_CLASS_integer);
+ LRA_Estimate_Fat_Points(bb, fatpoint, regs_in_use, pool);
+ // build a list of tns interior pointers will operate on
+ for (counted_addr_sts_it = counted_addr_sts.begin();
+ counted_addr_sts_it != counted_addr_sts.end();
+ ++counted_addr_sts_it) {
+ std::deque<TN*> st_tns;
+ st_tns = symbol_addr_map[*counted_addr_sts_it];
+ // Skip the non interior pointer cands
+ if (st_tns.empty()) continue;
+
+ INT cor_addr_index = st_tns.size();
+ if (cor_addr_index != max_pattern) continue;
+
+ if (!correlated_addr_map[cor_addr_index].empty()) {
+ std::deque<TN*>::iterator st_tns_iter;
+ for (st_tns_iter = st_tns.begin();
+ st_tns_iter != st_tns.end();
+ ++st_tns_iter) {
+ TN *tn1 = *st_tns_iter;
+ ++st_tns_iter;
+ TN *tn2 = *st_tns_iter;
+ add_tns.push_front(tn2);
+ }
+ }
+ }
+
+ // remove all the sib adds from our live range maps so that we get
+ // an accurate picture for analysis.
+ prune_adds_from_live_range_analysis(add_tns, counted_base_regs, add_map);
+ add_tns.clear();
+
+ // Do the initial live range analysis
+ orig_conflict_map = Calculate_All_Conflicts(bb, regs_in_use,
+ ISA_REGISTER_CLASS_integer);
+
// Merge all pairs live ranges on the minor basereg
bool first_time = true;
for (counted_addr_sts_it = counted_addr_sts.begin();
@@ -6242,13 +6320,31 @@
// If Query_Conflicts_Improved returns with a state that indicates
// that introduction of interior pointers does not benefit live
// range pressure, we do not proceed with the allowing the translation.
- new_conflict_map = Calculate_All_Conflicts(ISA_REGISTER_CLASS_integer);
+ new_conflict_map = Calculate_All_Conflicts(bb, regs_in_use,
+ ISA_REGISTER_CLASS_integer);
+ INT N_i = 0;
+ INT D_i = 0;
+ INT avg_conflicts = 0;
+ INT k_conflicts = 0;
if (Query_Conflicts_Improved(orig_conflict_map,
new_conflict_map,
3,
&num_ranges_mitigated,
- ISA_REGISTER_CLASS_integer) == false)
+ ISA_REGISTER_CLASS_integer) == false) {
clear_all = true;
+ } else if (Find_Max_Conflicts(orig_conflict_map,
+ &avg_conflicts,
+ &k_conflicts,
+ &N_i,
+ &D_i,
+ ISA_REGISTER_CLASS_integer) == num_pr) {
+ // For vectorized loops we want more than k-conflicts as max in the
+ // original conflict map context.
+ if (loop_vectorized)
+ min_reclaimable = 4;
+ }
+ TN_MAP_Delete(orig_conflict_map);
+ TN_MAP_Delete(new_conflict_map);
MEM_POOL_Pop(pool);
}
@@ -7006,6 +7102,7 @@
correlated_addr_map,
symbol_addr_map,
counted_base_regs,
+ add_map,
lhead,
loop_vectorized,
pool);
@@ -7423,7 +7520,7 @@
}
}
if (dontdoit) continue;
-
+
//
// collect the LiveIn sets of pdoms of loop header
// which are not part of the loop
@@ -7657,6 +7754,15 @@
return FALSE;
}
+ const TOP old_top = OP_code(op);
+ if (Is_Target_Orochi() && Is_Target_AVX() &&
+ (old_top != TOP_vldsd) &&
+ (old_top != TOP_vldsdx) &&
+ (old_top != TOP_vldsdxx) &&
+ (OP_vec_lo_ldst(op) || OP_vec_hi_ldst(op))) {
+ return FALSE;
+ }
+
const INT op_base_idx = OP_find_opnd_use( op, OU_base );
EBO_TN_INFO* base_tninfo = op_base_idx >= 0 ? actual_tninfo[op_base_idx] :
NULL;
OP* addr_op = (base_tninfo != NULL) ? base_tninfo->in_op : NULL;
Modified: trunk/osprey/be/cg/x8664/exp_loadstore.cxx
===================================================================
--- trunk/osprey/be/cg/x8664/exp_loadstore.cxx 2011-06-16 07:17:12 UTC (rev
3650)
+++ trunk/osprey/be/cg/x8664/exp_loadstore.cxx 2011-06-17 19:45:27 UTC (rev
3651)
@@ -382,6 +382,16 @@
}
}
}
+ if (Is_Target_Orochi() && Is_Target_AVX() &&
+ ((top == TOP_ldlps) ||
+ (top == TOP_ldhps) ||
+ (top == TOP_ldlpd) ||
+ (top == TOP_ldhpd))){
+ TN *xzero = Build_TN_Like(result);
+ Build_OP( TOP_xzero128v32, xzero, ops );
+ Build_OP( top, result, xzero, base, ofst, ops );
+ return;
+ }
Build_OP (top, result, base, ofst, ops);
}
@@ -634,10 +644,34 @@
}
else if (mtype == MTYPE_V8I1 || mtype == MTYPE_V8I2 ||
mtype == MTYPE_V8I4 || mtype == MTYPE_V8I8) {
- if (base != NULL)
- Build_OP(!Is_Target_SSE2() ? TOP_ldlps : TOP_ld64_2sse, result, base,
disp, ops);
- else Build_OP(!Is_Target_SSE2() ? TOP_ldlps_n32 : TOP_ld64_2sse_n32,
result, disp, ops);
+ if (Is_Target_Orochi() && Is_Target_AVX()){
+ TN *xzero = Build_TN_Like(result);
+ Build_OP(TOP_xzero128v32, xzero, ops);
+ if (base != NULL)
+ Build_OP(TOP_ldlps, result, xzero, base, disp, ops);
+ else
+ Build_OP(TOP_ldlps_n32, result, xzero, disp, ops);
+ } else {
+ if (base != NULL)
+ Build_OP(!Is_Target_SSE2() ? TOP_ldlps : TOP_ld64_2sse,
+ result, base, disp, ops);
+ else
+ Build_OP(!Is_Target_SSE2() ? TOP_ldlps_n32 : TOP_ld64_2sse_n32,
+ result, disp, ops);
+ }
}
+ else if (mtype == MTYPE_V8F4 ) {
+ if (Is_Target_Orochi() && Is_Target_AVX()){
+ TN *xzero = Build_TN_Like(result);
+ Build_OP(TOP_xzero128v32, xzero, ops);
+ if (base != NULL)
+ Build_OP(TOP_ldlps, result, xzero, base, disp, ops);
+ else
+ Build_OP(TOP_ldlps_n32, result, xzero, disp, ops);
+ } else {
+ Expand_Composed_Load (op, result, base, disp, variant, ops);
+ }
+ }
else if (mtype == MTYPE_V16F8 || mtype == MTYPE_V16C8) {
if(Is_Target_Barcelona() || Is_Target_Orochi()){
if(base != NULL)
Modified: trunk/osprey/be/cg/x8664/expand.cxx
===================================================================
--- trunk/osprey/be/cg/x8664/expand.cxx 2011-06-16 07:17:12 UTC (rev 3650)
+++ trunk/osprey/be/cg/x8664/expand.cxx 2011-06-17 19:45:27 UTC (rev 3651)
@@ -7988,7 +7988,13 @@
Build_OP( TOP_ldsd, result, op1, Gen_Literal_TN (0,4), ops );
break;
case INTRN_LOADHPD:
- Build_OP( TOP_ldhpd, result, op1, Gen_Literal_TN (0,4), ops );
+ if (Is_Target_Orochi() && Is_Target_AVX()){
+ TN *xzero = Build_TN_Like(result);
+ Build_OP( TOP_xzero128v32, xzero, ops );
+ Build_OP( TOP_ldhpd, result, xzero, op1, Gen_Literal_TN (0,4), ops );
+ } else {
+ Build_OP( TOP_ldhpd, result, op1, Gen_Literal_TN (0,4), ops );
+ }
break;
case INTRN_UNPCKLPD:
Build_OP( TOP_unpcklpd, result, op0, op1, ops );
@@ -8208,10 +8214,22 @@
Build_OP( TOP_ldupd, result, op0, Gen_Literal_TN (0,4), ops );
break;
case INTRN_LOADHPS:
- Build_OP( TOP_ldhps, result, op1, Gen_Literal_TN (0,4), ops );
+ if (Is_Target_Orochi() && Is_Target_AVX()){
+ TN *xzero = Build_TN_Like(result);
+ Build_OP( TOP_xzero128v32, xzero, ops );
+ Build_OP( TOP_ldhps, result, xzero, op1, Gen_Literal_TN (0,4), ops );
+ } else {
+ Build_OP( TOP_ldhps, result, op1, Gen_Literal_TN (0,4), ops );
+ }
break;
case INTRN_LOADLPS:
- Build_OP( TOP_ldlps, result, op1, Gen_Literal_TN (0,4), ops );
+ if (Is_Target_Orochi() && Is_Target_AVX()){
+ TN *xzero = Build_TN_Like(result);
+ Build_OP( TOP_xzero128v32, xzero, ops );
+ Build_OP( TOP_ldlps, result, xzero, op1, Gen_Literal_TN (0,4), ops );
+ } else {
+ Build_OP( TOP_ldlps, result, op1, Gen_Literal_TN (0,4), ops );
+ }
break;
case INTRN_MOVMSKPS:
Build_OP( TOP_movmskps, result, op0, ops );
Modified: trunk/osprey/common/targ_info/isa/x8664/isa_operands.cxx
===================================================================
--- trunk/osprey/common/targ_info/isa/x8664/isa_operands.cxx 2011-06-16
07:17:12 UTC (rev 3650)
+++ trunk/osprey/common/targ_info/isa/x8664/isa_operands.cxx 2011-06-17
19:45:27 UTC (rev 3651)
@@ -1545,6 +1545,10 @@
TOP_vaesenclastx,
TOP_vaesdecx,
TOP_vaesdeclastx,
+ TOP_vldhpd,
+ TOP_vldlpd,
+ TOP_vldhps,
+ TOP_vldlps,
TOP_UNDEFINED);
Result(0, fp128);
Operand(0, fp128, opnd1);
@@ -1785,6 +1789,10 @@
TOP_vaesenclastxx,
TOP_vaesdecxx,
TOP_vaesdeclastxx,
+ TOP_vldhpdx,
+ TOP_vldhpsx,
+ TOP_vldlpdx,
+ TOP_vldlpsx,
TOP_UNDEFINED);
Result(0, fp128);
Operand(0, fp128, opnd1);
@@ -2025,6 +2033,10 @@
TOP_vaesenclastxxx,
TOP_vaesdecxxx,
TOP_vaesdeclastxxx,
+ TOP_vldhpdxx,
+ TOP_vldhpsxx,
+ TOP_vldlpdxx,
+ TOP_vldlpsxx,
TOP_UNDEFINED);
Result(0, fp128);
Operand(0, fp128, opnd1);
@@ -4286,15 +4298,21 @@
TOP_vlddqa_n32,
TOP_vldapd_n32,
TOP_vldaps_n32,
- TOP_vldlpd_n32,
TOP_vldupd_n32,
TOP_vldups_n32,
+ TOP_UNDEFINED);
+ Result(0, fp128);
+ Operand(0, simm32, offset);
+
+ Instruction_Group("avx float load vector w/o base or index",
TOP_vldhpd_n32,
+ TOP_vldlpd_n32,
TOP_vldhps_n32,
TOP_vldlps_n32,
TOP_UNDEFINED);
Result(0, fp128);
- Operand(0, simm32, offset);
+ Operand(0, fp128, opnd1);
+ Operand(1, simm32, offset);
Instruction_Group("float load vector",
TOP_lddqa,
@@ -4318,14 +4336,10 @@
TOP_vldntdqa,
TOP_vldapd,
TOP_vldaps,
- TOP_vldlpd,
TOP_vldss,
TOP_vldupd,
TOP_vldups,
- TOP_vldhpd,
- TOP_vldhps,
TOP_vldsd,
- TOP_vldlps,
TOP_UNDEFINED);
Result(0, fp128);
Operand(0, int64, base);
@@ -5676,14 +5690,10 @@
TOP_vldntdqax,
TOP_vldapdx,
TOP_vldapsx,
- TOP_vldlpdx,
TOP_vldssx,
TOP_vldupdx,
TOP_vldupsx,
- TOP_vldhpdx,
- TOP_vldhpsx,
TOP_vldsdx,
- TOP_vldlpsx,
TOP_UNDEFINED);
Result(0, fp128);
Operand(0, int64, base);
@@ -5745,14 +5755,10 @@
TOP_vldntdqaxx,
TOP_vldapdxx,
TOP_vldapsxx,
- TOP_vldlpdxx,
TOP_vldssxx,
TOP_vldupdxx,
TOP_vldupsxx,
- TOP_vldhpdxx,
- TOP_vldhpsxx,
TOP_vldsdxx,
- TOP_vldlpsxx,
TOP_UNDEFINED);
Result(0, fp128);
Operand(0, int64, index);
Modified: trunk/osprey/common/targ_info/isa/x8664/isa_print.cxx
===================================================================
--- trunk/osprey/common/targ_info/isa/x8664/isa_print.cxx 2011-06-16
07:17:12 UTC (rev 3650)
+++ trunk/osprey/common/targ_info/isa/x8664/isa_print.cxx 2011-06-17
19:45:27 UTC (rev 3651)
@@ -1982,6 +1982,10 @@
TOP_vaesdecx,
TOP_vaesdeclastx,
TOP_vaeskeygenassistx,
+ TOP_vldhpd,
+ TOP_vldlpd,
+ TOP_vldhps,
+ TOP_vldlps,
TOP_UNDEFINED);
/* dest=op(memop, reg), non-x86-style */
@@ -2198,6 +2202,10 @@
TOP_vaesdecxx,
TOP_vaesdeclastxx,
TOP_vaeskeygenassistxx,
+ TOP_vldhpdx,
+ TOP_vldhpsx,
+ TOP_vldlpdx,
+ TOP_vldlpsx,
TOP_UNDEFINED);
/* dest=op(memop with scaled index with base, reg), non-x86-style */
@@ -2415,6 +2423,10 @@
TOP_vaesdecxxx,
TOP_vaesdeclastxxx,
TOP_vaeskeygenassistxxx,
+ TOP_vldhpdxx,
+ TOP_vldhpsxx,
+ TOP_vldlpdxx,
+ TOP_vldlpsxx,
TOP_UNDEFINED);
/* dest=op(memop with scaled index without base, reg), non-x86-style */
@@ -3422,12 +3434,8 @@
TOP_vlddqa_n32,
TOP_vldapd_n32,
TOP_vldaps_n32,
- TOP_vldlpd_n32,
TOP_vldupd_n32,
TOP_vldups_n32,
- TOP_vldhpd_n32,
- TOP_vldhps_n32,
- TOP_vldlps_n32,
TOP_vstdqa_n32,
TOP_vstdqu_n32,
TOP_vstapd_n32,
@@ -3441,6 +3449,19 @@
TOP_vaesimc,
TOP_UNDEFINED );
+ /* One result / two operands */
+ ISA_PRINT_TYPE rop2 = ISA_Print_Type_Create("rop2", "%s %s %s,%s");
+ Name();
+ Operand(1);
+ Operand(0);
+ Result(0);
+ Instruction_Print_Group( rop2,
+ TOP_vldhpd_n32,
+ TOP_vldhps_n32,
+ TOP_vldlpd_n32,
+ TOP_vldlps_n32,
+ TOP_UNDEFINED );
+
/* One result / one mem opnd */
ISA_PRINT_TYPE rmem = ISA_Print_Type_Create("rmem", "%s %s%s(%s),%s");
Name();
@@ -3459,6 +3480,7 @@
TOP_pmovzxbwx,
TOP_pmovsxbdx,
TOP_pmovzxbdx,
+ TOP_vldlpd_n32,
TOP_pmovsxbqx,
TOP_pmovzxbqx,
TOP_pmovsxwdx,
@@ -3797,14 +3819,10 @@
TOP_vldntdqa,
TOP_vldapd,
TOP_vldaps,
- TOP_vldlpd,
TOP_vldss,
TOP_vldupd,
TOP_vldups,
- TOP_vldhpd,
- TOP_vldhps,
TOP_vldsd,
- TOP_vldlps,
TOP_vfbroadcastss,
TOP_vfbroadcastsd,
TOP_vfbroadcastf128,
@@ -3928,14 +3946,10 @@
TOP_vldntdqax,
TOP_vldapdx,
TOP_vldapsx,
- TOP_vldlpdx,
TOP_vldssx,
TOP_vldupdx,
TOP_vldupsx,
- TOP_vldhpdx,
- TOP_vldhpsx,
TOP_vldsdx,
- TOP_vldlpsx,
TOP_vfbroadcastxss,
TOP_vfbroadcastxsd,
TOP_vfbroadcastxf128,
@@ -3985,14 +3999,10 @@
TOP_vldntdqaxx,
TOP_vldapdxx,
TOP_vldapsxx,
- TOP_vldlpdxx,
TOP_vldssxx,
TOP_vldupdxx,
TOP_vldupsxx,
- TOP_vldhpdxx,
- TOP_vldhpsxx,
TOP_vldsdxx,
- TOP_vldlpsxx,
TOP_vfbroadcastxxss,
TOP_vfbroadcastxxsd,
TOP_vfbroadcastxxf128,
Modified: trunk/osprey/common/targ_info/isa/x8664/isa_properties.cxx
===================================================================
--- trunk/osprey/common/targ_info/isa/x8664/isa_properties.cxx 2011-06-16
07:17:12 UTC (rev 3650)
+++ trunk/osprey/common/targ_info/isa/x8664/isa_properties.cxx 2011-06-17
19:45:27 UTC (rev 3651)
@@ -5651,6 +5651,18 @@
TOP_vpmadcswdxx,
TOP_vpmadcswdxxx,
/* AVX instructions */
+ TOP_vldhpd,
+ TOP_vldhps,
+ TOP_vldlpd,
+ TOP_vldlps,
+ TOP_vldhpdx,
+ TOP_vldhpsx,
+ TOP_vldlpdx,
+ TOP_vldlpsx,
+ TOP_vldhpdxx,
+ TOP_vldhpsxx,
+ TOP_vldlpdxx,
+ TOP_vldlpsxx,
TOP_vcmpestrix,
TOP_vcmpestrixx,
TOP_vcmpestrixxx,
------------------------------------------------------------------------------
EditLive Enterprise is the world's most technically advanced content
authoring tool. Experience the power of Track Changes, Inline Image
Editing and ensure content is compliant with Accessibility Checking.
http://p.sf.net/sfu/ephox-dev2dev
_______________________________________________
Open64-devel mailing list
[email protected]
https://lists.sourceforge.net/lists/listinfo/open64-devel