Author: mberg
Date: 2011-06-17 15:45:27 -0400 (Fri, 17 Jun 2011)
New Revision: 3651

Modified:
   trunk/osprey/be/cg/cg_flags.cxx
   trunk/osprey/be/cg/cg_flags.h
   trunk/osprey/be/cg/cg_loop.cxx
   trunk/osprey/be/cg/cg_loop.h
   trunk/osprey/be/cg/cgdriver.cxx
   trunk/osprey/be/cg/lra.cxx
   trunk/osprey/be/cg/lra.h
   trunk/osprey/be/cg/op.h
   trunk/osprey/be/cg/x8664/ebo_special.cxx
   trunk/osprey/be/cg/x8664/exp_loadstore.cxx
   trunk/osprey/be/cg/x8664/expand.cxx
   trunk/osprey/common/targ_info/isa/x8664/isa_operands.cxx
   trunk/osprey/common/targ_info/isa/x8664/isa_print.cxx
   trunk/osprey/common/targ_info/isa/x8664/isa_properties.cxx
Log:
Best fit unrolling implementation added along with vmovlps update.
The algorithm is controlled by -CG:nobest_fit=<on|off>. The behavior is
on by default only on x86 targets under the default unroll
optimizations (unroll by 4 and size of 128). Other targets will need to
revisit default values and add some support in TODO-marked areas to
utilize this code.

CR by Jian-Xin.  


Modified: trunk/osprey/be/cg/cg_flags.cxx
===================================================================
--- trunk/osprey/be/cg/cg_flags.cxx     2011-06-16 07:17:12 UTC (rev 3650)
+++ trunk/osprey/be/cg/cg_flags.cxx     2011-06-17 19:45:27 UTC (rev 3651)
@@ -124,6 +124,7 @@
 BOOL CG_128bitstore = TRUE;
 BOOL CG_branch_fuse = TRUE;
 BOOL CG_dispatch_schedule = FALSE;
+BOOL CG_LOOP_nounroll_best_fit_set = FALSE;
 BOOL CG_strcmp_expand = TRUE;
 BOOL CG_merge_counters_x86 = FALSE;
 BOOL CG_interior_ptrs_x86 = FALSE;

Modified: trunk/osprey/be/cg/cg_flags.h
===================================================================
--- trunk/osprey/be/cg/cg_flags.h       2011-06-16 07:17:12 UTC (rev 3650)
+++ trunk/osprey/be/cg/cg_flags.h       2011-06-17 19:45:27 UTC (rev 3651)
@@ -101,6 +101,9 @@
  *  BOOL CG_dispatch_schedule
  *     Enable dispatch scheduling for Orochi style architectures.
  *
+ *  BOOL CG_LOOP_nounroll_best_fit_set
+ *     Set by -CG:nobest_fit; when TRUE, best fit unrolling is not turned on by default.
+ *
  *  BOOL CG_128bitstore
  *     Enable 128bit unaligned stores optimization which emits movup{s|d}
  *     instead of movhp{s|d} with movlp{s|d}.
@@ -530,6 +533,7 @@
 extern BOOL CG_cmp_load_exec;
 extern BOOL CG_fma4_load_exec;
 extern BOOL CG_dispatch_schedule;
+extern BOOL CG_LOOP_nounroll_best_fit_set;
 extern BOOL CG_128bitstore;
 extern BOOL CG_branch_fuse;
 extern BOOL CG_strcmp_expand;

Modified: trunk/osprey/be/cg/cg_loop.cxx
===================================================================
--- trunk/osprey/be/cg/cg_loop.cxx      2011-06-16 07:17:12 UTC (rev 3650)
+++ trunk/osprey/be/cg/cg_loop.cxx      2011-06-17 19:45:27 UTC (rev 3651)
@@ -187,6 +187,8 @@
 #include "ebo.h"
 #include "hb.h"
 #include "gra_live.h"
+#include "lra.h"
+#include "calls.h"
 
 #if defined(TARG_SL)
 #include "tag.h"
@@ -256,6 +258,7 @@
 BOOL CG_LOOP_optimize_multi_targ = FALSE;
 BOOL CG_LOOP_optimize_lno_winddown_cache = TRUE;
 BOOL CG_LOOP_optimize_lno_winddown_reg = TRUE;
+BOOL CG_LOOP_unroll_best_fit = FALSE;
 
 /* Note: To set default unroll parameters, modify the initialization
  *      of OPT_unroll_times/size in "config.c".
@@ -5335,6 +5338,275 @@
 }
 
 
+// This algorithm is based in part on a paper by Ma and Carr
+// for determining register pressure for unrolled loops and
+// the most profitable unroll factor, if any, to unroll by.
+void CG_LOOP::Determine_Best_Unit_Iteration_Interval(BOOL can_refit)
+{
+  BB *bb = LOOP_DESCR_loophead(loop);
+  INT init_II[5];
+  BOOL saved_state_sched_est;
+  BOOL toggle_sched_est = false;
+
+  // only single block loops
+  if (BB_SET_Size(LOOP_DESCR_bbset(loop)) != 1)
+    return;
+
+  // This is now the default single block unroll factor calculation
+  // algorithm. If the user specified any unroll-by or size value
+  // other than the default, the heuristics below will not be used to
+  // determine the unroll factor.
+  if (CG_LOOP_unroll_best_fit == false)
+    return;
+
+  MEM_POOL_Push(&MEM_phase_nz_pool);
+
+  mINT8 fatpoint[ISA_REGISTER_CLASS_MAX+1];
+  const INT len = BB_length(bb);
+  INT *regs_in_use = (INT*)alloca(sizeof(INT) * (len+1));
+  INT max_conf = 0;
+  INT R_f = 0;
+  INT P_f = REGISTER_CLASS_register_count(ISA_REGISTER_CLASS_float);
+  INT N_f = 0;
+  INT D_f = 0;
+  INT R_i = 0;
+  INT P_i = REGISTER_CLASS_register_count(ISA_REGISTER_CLASS_integer);
+  INT N_i = 0;
+  INT D_i = 0;
+  INT avg_conflicts = 0;
+  INT k_conflicts = 0;
+  TN_MAP conflict_map_f;
+  TN_MAP conflict_map_i;
+  TN *tn;
+  LRA_Estimate_Fat_Points(bb, fatpoint, regs_in_use, &MEM_phase_nz_pool);
+
+#ifdef TARG_X8664
+  // now adjust the number of gpr regs as per the ABI
+  P_i--;
+  if (Is_Target_32bit() && Gen_Frame_Pointer)
+    P_i--;
+#endif
+
+  // compute the predicted number of fp regs; the function returns a degree, so add 1
+  conflict_map_f = Calculate_All_Conflicts(bb, regs_in_use, 
+                                           ISA_REGISTER_CLASS_float);
+  R_f = Find_Max_Conflicts(conflict_map_f,
+                           &avg_conflicts,
+                           &k_conflicts,
+                           &N_f,
+                           &D_f,
+                           ISA_REGISTER_CLASS_float) + 1;
+
+  // compute the predicted number of gpr regs; the function returns a degree, so add 1
+  conflict_map_i = Calculate_All_Conflicts(bb, regs_in_use, 
+                                           ISA_REGISTER_CLASS_integer);
+  R_i = Find_Max_Conflicts(conflict_map_i,
+                           &avg_conflicts,
+                           &k_conflicts,
+                           &N_i,
+                           &D_i,
+                           ISA_REGISTER_CLASS_integer) + 1;
+
+  TN_MAP_Delete(conflict_map_f);
+  TN_MAP_Delete(conflict_map_i);
+
+  // Now figure out E, the total number of cross iteration edges for
+  // both float and integer regs by counting the live out regs of each
+  // type for the loop
+  INT E_f = 0;
+  INT E_i = 0;
+
+  // Exposed uses which are updated are loop-carried dependences.
+  for (tn = GTN_SET_Choose(BB_live_use(bb));
+       tn != GTN_SET_CHOOSE_FAILURE;
+       tn = GTN_SET_Choose_Next(BB_live_use(bb),tn)) {
+    bool exposed_use_is_updated = false;
+    for( OP* op = BB_first_op(bb); op != NULL; op = OP_next(op) ){
+      if (OP_Defs_TN(op, tn)) {
+        exposed_use_is_updated = true;
+        break;
+      }
+    }
+    if (exposed_use_is_updated == false) continue;
+    if (TN_register_class(tn) == ISA_REGISTER_CLASS_float)
+      E_f++;
+    if (TN_register_class(tn) == ISA_REGISTER_CLASS_integer)
+      E_i++;
+  }
+
+  // Count the number of prefetch insns, as unrolled loop bodies
+  // will only receive a single copy for the whole iteration for
+  // these kinds of instructions; the same is true for loop-carried
+  // dependences (E_i and E_f).  This is not from Ma and Carr but
+  // is beneficial in that we miss upper bound opportunities otherwise.
+  INT num_prefetch = 0;
+  for (OP *op = BB_first_op(bb); op != NULL; op = OP_next(op))
+    if (OP_prefetch(op)) num_prefetch++;
+
+  // Try to refit the next unroll factor by 2 into the current
+  // size threshold if it will fit using the above data for the
+  // total loop size(it is more accurate than the prior calc).
+  if ((Unroll_fully() == false) &&
+      (can_refit) &&
+      is_power_of_two(unroll_factor) &&
+      ((unroll_factor * 2) < CG_LOOP_unroll_times_max)) {
+    INT loop_size = BB_length(bb);
+    INT next_factor = unroll_factor * 2;
+    INT max_size = (loop_size + (loop_size - (E_i + E_f + num_prefetch)) * 
+                                (next_factor - 1)); 
+    if (max_size < CG_LOOP_unrolled_size_max)
+      Set_unroll_factor(next_factor);
+  } else if ((Unroll_fully() == false) &&
+             (can_refit) &&
+             ((unroll_factor + 1) < CG_LOOP_unroll_times_max)) {
+    INT loop_size = BB_length(bb);
+    INT next_factor = unroll_factor + 1;
+    INT max_size = (loop_size + (loop_size - (E_i + E_f + num_prefetch)) * 
+                                (next_factor - 1)); 
+    if (max_size < CG_LOOP_unrolled_size_max)
+      Set_unroll_factor(next_factor);
+  }
+
+#ifdef TARG_X8664
+  // calculate each unroll factors init_II
+  INT A_spill_f = CGTARG_Latency(TOP_ldupd);
+  INT A_spill_i = CGTARG_Latency(TOP_ldx64);
+#else
+  INT A_spill_f = 1; // stubbed, todo - fill in correctly per target
+  INT A_spill_i = 1; // stubbed, todo - fill in correctly per target
+#endif
+
+  INT unit_II[5];
+  INT II_penalty_f;
+  INT II_penalty_i;
+  INT j, i;
+  INT iter_j = 0;
+  INT chose_j = 0;
+  INT min_unitII = INT_MAX;
+  INT ntimes = 1;
+  INT upper_bound = unroll_factor;
+
+  // Only unroll factors (refitted or already assigned) that are a
+  // power of 2 are utilized here.
+  if (is_power_of_two(unroll_factor)) {
+    CG_SCHED_EST *loop_se = CG_SCHED_EST_Create(bb, &MEM_local_nz_pool,
+                                                SCHED_EST_FOR_UNROLL);
+    CG_SCHED_EST *unroll_se = CG_SCHED_EST_Create(bb, &MEM_local_nz_pool,
+                                                  SCHED_EST_FOR_UNROLL |
+                                                  SCHED_EST_IGNORE_PREFETCH |
+                                                  SCHED_EST_IGNORE_BRANCH |
+                                                  SCHED_EST_IGNORE_LOH_OPS |
+                                                  SCHED_EST_IGNORE_INT_OPS);
+    init_II[0] = CG_SCHED_EST_Cycles(loop_se);
+    INT rolling_init_II;
+    for (j = 2; j <= upper_bound; j++) {
+      CG_SCHED_EST_Append_Scheds(loop_se, unroll_se);
+      rolling_init_II = CG_SCHED_EST_Cycles(loop_se);
+      switch (j) {
+      case 2:
+        init_II[1] = rolling_init_II;
+        break;
+      case 4:
+        init_II[2] = rolling_init_II;
+        break;
+      case 8:
+        init_II[3] = rolling_init_II;
+        break;
+      case 16:
+        init_II[4] = rolling_init_II;
+        break;
+      }
+    }
+
+    // Divergences from Ma and Carr: We have the degree and live
+    // range info for loop carried dependences, so E_i and E_f are
+    // not treated as additive components of the unit_II penalty calc,
+    // also we figure prefetch and E components into N components for
+    // penalty calc.  This is more accurate than Ma and Carr.  And finally,
+    // we use the defaults as upper bounds for finding the best fit
+    // or minimal unit_II, where if we do not find a best fit that is other
+    // than unroll by 1, we defer to the original unroll factor.
+    int Tot_D_f = D_f;
+    int Tot_D_i = D_i;
+    for (i = 1, j = 0; i <= upper_bound; i = i * 2, j++) {
+      // calculate the II_penalty for float regs
+      II_penalty_f = 0;
+      if (N_f) {
+        int N_adj_f = ((N_f - E_f) * (i - 1)) + N_f;
+        if (i > 1)
+          Tot_D_f += (D_f - E_f)*(i-1);
+        II_penalty_f = ((R_f - P_f) * (Tot_D_f) * A_spill_f);
+        II_penalty_f = II_penalty_f / N_adj_f;
+      }
+  
+      // calculate the II_penalty for integer regs
+      II_penalty_i = 0;
+      if (N_i) {
+        int N_adj_i = ((N_i - (E_i + num_prefetch)) * (i - 1)) + N_i;
+        if (i > 1)
+          Tot_D_i += (D_i - E_i)*(i-1);
+        II_penalty_i = ((R_i - P_i) * (Tot_D_i) * A_spill_i);
+        II_penalty_i = II_penalty_i / N_adj_i;
+      }
+
+      // Now calculate the unified unit_II for both components
+      unit_II[j] = (init_II[j] + II_penalty_i + II_penalty_f) / i;
+      if (min_unitII >= unit_II[j]) {
+        min_unitII = unit_II[j];
+        ntimes = i;
+        iter_j = j;
+      } else if ((min_unitII < 0) && 
+                 (unit_II[j] <= 0) &&
+                 (min_unitII < unit_II[j])) {
+        min_unitII = unit_II[j];
+        ntimes = i;
+        iter_j = j;
+      }
+      if (unroll_factor == i)
+        chose_j = j;
+    }
+  }
+
+  // This is also new, and not from Ma and Carr: evict unrolls that
+  // fit a clear profile of bad register pressure.
+  if ((R_f > P_f) || (R_i > P_i)) {
+    if (R_i > P_i) {
+      INT pressure_calc_i = (R_i - P_i) * A_spill_i;
+      INT benefit_calc_i = (num_prefetch / 2) + E_i; 
+      if (E_i > P_i) {
+        // these regs will require a spill and a reload as they are updated
+        INT adjust_calc_i = (E_i - P_i) * (A_spill_i * 2);
+        pressure_calc_i += adjust_calc_i;
+      }
+      // prefetch insns are usually clumped and can issue 2 at a time
+      if (pressure_calc_i > benefit_calc_i) {
+        ntimes = 1;
+        Set_unroll_factor(ntimes);
+      }
+    } else if (R_f > P_f) {
+      INT pressure_calc_f = (R_f - P_f) * A_spill_f;
+      INT benefit_calc_f = E_f;
+      if (E_f > P_f) {
+        // these regs will require a spill and a reload as they are updated
+        INT adjust_calc_f = (E_f - P_f) * (A_spill_f * 2);
+        pressure_calc_f += adjust_calc_f;
+      }
+      if (pressure_calc_f > benefit_calc_f) {
+        ntimes = 1;
+        Set_unroll_factor(ntimes);
+      }
+    }
+  }
+
+  // If ntimes is 1, use what we have already, this means that if we
+  // retained the orig it was either the min value or we did not find one
+  // or we have a register pressure case.
+  if ((Unroll_fully() == false) && (ntimes != 1) && (unroll_factor != ntimes))
+    Set_unroll_factor(ntimes);
+  MEM_POOL_Pop(&MEM_phase_nz_pool);
+}
+
+
 void CG_LOOP::Determine_Unroll_Factor()
 { 
   LOOPINFO *info = LOOP_DESCR_loopinfo(Loop());
@@ -5385,6 +5657,9 @@
       ntimes--;
     Set_unroll_factor(ntimes);
 
+#ifdef TARG_X8664
+    Determine_Best_Unit_Iteration_Interval(TRUE);
+#endif
   } else {
 
     BOOL const_trip = TN_is_constant(trip_count_tn);
@@ -5437,6 +5712,9 @@
          ntimes /= 2;
       }
       Set_unroll_factor(ntimes);
+#ifdef TARG_X8664
+      Determine_Best_Unit_Iteration_Interval(!const_trip);
+#endif
     }
   }
 
@@ -8091,7 +8369,115 @@
 }
 #endif
 
+void Report_Loop_Info(LOOP_DESCR *loop, 
+                      char *usage_str, 
+                      BOOL after_prescheduling,
+                      MEM_POOL *pool)
+{
+  // This func is a debug trace utility
+  if (Get_Trace(TP_CGLOOP, 1) == FALSE)
+    return;
 
+  BB *bb = LOOP_DESCR_loophead(loop);
+  if (BB_unrollings(bb) && 
+      (BB_SET_Size(LOOP_DESCR_bbset(loop)) == 1)) {
+    BOOL saved_state_sched_est;
+    BOOL toggle_sched_est = false; 
+
+    // calculate or obtain the init_II cycle time
+    // from either the locs scheduler if we have not yet
+    // prescheduled the code, or from the last ready time
+    // cycle of the scheduled code if we have.
+    INT init_II = 0;
+    if (after_prescheduling) {
+      init_II = OP_scycle(BB_last_op(bb));
+    } else {
+      SCHED_EST_TYPE type = (SCHED_EST_FOR_UNROLL);
+      init_II = (INT32)CG_SCHED_EST_BB_Cycles(bb, type);
+    }
+
+    mINT8 fatpoint[ISA_REGISTER_CLASS_MAX+1];
+    const INT len = BB_length(bb);
+    INT *regs_in_use = (INT*)alloca(sizeof(INT) * (len+1));
+    INT max_conf = 0;
+    INT R_f = 0;
+    INT P_f = REGISTER_CLASS_register_count(ISA_REGISTER_CLASS_float);
+    INT N_f = 0;
+    INT D_f = 0;
+    INT R_i = 0;
+    INT P_i = REGISTER_CLASS_register_count(ISA_REGISTER_CLASS_integer);
+    INT N_i = 0;
+    INT D_i = 0;
+    INT avg_conflicts_i = 0;
+    INT avg_conflicts_f = 0;
+    INT k_conflicts = 0;
+    TN_MAP conflict_map_f;
+    TN_MAP conflict_map_i;
+    TN *tn;
+    BOOL first_time = TRUE;
+    BOOL changed = TRUE;
+
+#ifdef TARG_X8664
+    // now adjust the number of gpr regs as per the ABI
+    P_i--;
+    if (Is_Target_32bit() && Gen_Frame_Pointer)
+      P_i--;
+#endif
+
+    MEM_POOL_Push(pool);
+    LRA_Estimate_Fat_Points(bb, fatpoint, regs_in_use, pool);
+
+    // compute the number of fp Regs Predicted
+    conflict_map_f = Calculate_All_Conflicts(bb, regs_in_use, 
+                                             ISA_REGISTER_CLASS_float);
+    R_f = Find_Max_Conflicts(conflict_map_f,
+                             &avg_conflicts_f,
+                             &k_conflicts,
+                             &N_f,
+                             &D_f,
+                             ISA_REGISTER_CLASS_float) + 1;
+
+    // compute the number of gpr Regs Predicted
+    conflict_map_i = Calculate_All_Conflicts(bb, regs_in_use, 
+                                             ISA_REGISTER_CLASS_integer);
+    R_i = Find_Max_Conflicts(conflict_map_i,
+                             &avg_conflicts_i,
+                             &k_conflicts,
+                             &N_i,
+                             &D_i,
+                             ISA_REGISTER_CLASS_integer) + 1;
+
+    // Now print the details of this loop
+    printf("unrolled loop(%d):size = %d,  ntimes=%d\n", 
+           BB_id(bb), BB_length(bb), BB_unrollings(bb));
+    printf("%s bb = %d, init_II = %d\n", usage_str, BB_id(bb), init_II);
+    printf("R_f = %d, D_f = %d, N_f = %d, avg_degree_f = %d\n",
+             R_f, D_f, N_f, avg_conflicts_f);
+    printf("R_i = %d, D_i = %d, N_i = %d, avg_degree_i = %d\n",
+           R_i, D_i, N_i, avg_conflicts_i);
+
+    TN_MAP_Delete(conflict_map_f);
+    TN_MAP_Delete(conflict_map_i);
+    MEM_POOL_Pop(pool);
+  }
+}
+
+void Examine_Loop_Info(char *usage_str, BOOL after_presched)
+{
+  if (CG_opt_level > 0) {
+    MEM_POOL loop_descr_pool;
+    MEM_POOL_Initialize(&loop_descr_pool, "loop_descriptors", TRUE);
+
+    Calculate_Dominators();
+    for (LOOP_DESCR *loop = LOOP_DESCR_Detect_Loops(&loop_descr_pool);
+         loop;
+         loop = LOOP_DESCR_next(loop)) {
+      Report_Loop_Info(loop, usage_str, after_presched, &loop_descr_pool);
+    }
+    Free_Dominators_Memory();
+  }
+}
+
 // Perform loop optimizations for all inner loops
 // in the PU.
 //

Modified: trunk/osprey/be/cg/cg_loop.h
===================================================================
--- trunk/osprey/be/cg/cg_loop.h        2011-06-16 07:17:12 UTC (rev 3650)
+++ trunk/osprey/be/cg/cg_loop.h        2011-06-17 19:45:27 UTC (rev 3651)
@@ -468,6 +468,7 @@
 extern BOOL CG_LOOP_unroll_remainder_fully;
 extern UINT32 CG_LOOP_unroll_min_trip;
 extern BOOL CG_LOOP_unroll_analysis;
+extern BOOL CG_LOOP_unroll_best_fit;
 extern BOOL CG_LOOP_ooo_unroll_heuristics;
 extern BOOL CG_LOOP_ooo_unroll_heuristics_set;
 extern UINT32 CG_LOOP_reorder_buffer_size;
@@ -632,6 +633,7 @@
 
   void Recompute_Liveness();
   bool Determine_Unroll_Fully(BOOL count_multi_bb);
+  void Determine_Best_Unit_Iteration_Interval(BOOL can_refit);
   void Determine_Unroll_Factor();
   void Determine_SWP_Unroll_Factor();
   void Build_CG_LOOP_Info(BOOL single_bb);
@@ -704,6 +706,8 @@
 
 extern CG_LOOP *Current_CG_LOOP;
 
+extern void Examine_Loop_Info(char *usage_str, BOOL after_presched);
+
 #if defined(TARG_IA64) || defined(TARG_SL)  || defined(TARG_MIPS)
 extern void Perform_Loop_Optimizations(void *rgn_loop_update=NULL);
 

Modified: trunk/osprey/be/cg/cgdriver.cxx
===================================================================
--- trunk/osprey/be/cg/cgdriver.cxx     2011-06-16 07:17:12 UTC (rev 3650)
+++ trunk/osprey/be/cg/cgdriver.cxx     2011-06-17 19:45:27 UTC (rev 3651)
@@ -128,6 +128,9 @@
 #include "flags.h"
 #endif
 #include "cg_swp.h"
+#ifdef TARG_X8664
+#include "config_wopt.h"
+#endif
 
 extern void Set_File_In_Printsrc(char *);      /* defined in printsrc.c */
 
@@ -469,6 +472,8 @@
     0, 0, 0,   &CG_fma4_load_exec, NULL },
   { OVK_BOOL,  OV_VISIBLE, TRUE, "dsched", "",
     0, 0, 0,   &CG_dispatch_schedule, NULL },
+  { OVK_BOOL,   OV_VISIBLE, TRUE, "nobest_fit", "",
+    0, 0, 0,    &CG_LOOP_nounroll_best_fit_set, NULL },
   { OVK_BOOL,  OV_VISIBLE, TRUE, "unalign_st", "",
     0, 0, 0,   &CG_128bitstore, NULL },
   { OVK_BOOL,  OV_VISIBLE, TRUE, "brfuse", "",
@@ -2021,6 +2026,18 @@
       OPT_unroll_size = 128;
 #endif
   
+#ifdef TARG_X8664
+  if (Is_Target_Orochi() || Is_Target_Barcelona()) {
+     // use best fit unrolling only if the unroll settings are at their defaults
+    if ((OPT_unroll_size == 128) && 
+        (OPT_unroll_times == 4) && 
+        (WOPT_Enable_WN_Unroll == 1)) {
+      if (CG_LOOP_nounroll_best_fit_set == false)
+        CG_LOOP_unroll_best_fit = TRUE;
+    }
+  }
+#endif
+
   if ( OPT_Unroll_Analysis_Set )
   {
     CG_LOOP_unroll_analysis = OPT_Unroll_Analysis;

Modified: trunk/osprey/be/cg/lra.cxx
===================================================================
--- trunk/osprey/be/cg/lra.cxx  2011-06-16 07:17:12 UTC (rev 3650)
+++ trunk/osprey/be/cg/lra.cxx  2011-06-17 19:45:27 UTC (rev 3651)
@@ -810,34 +810,66 @@
 }
 
 
-static int
-Calculate_Conflicting_Live_Ranges(TN *tn)
+void
+Populate_Init_Degrees(BB *bb, INT *regs_in_use)
 {
-  LIVE_RANGE *cur_lr = LR_For_TN(tn);
-  int conflict_count = 0;
-  for (LIVE_RANGE *lr = Live_Range_List; lr != NULL; lr = LR_next(lr)) {
-    if (cur_lr == lr) continue;
-    if ((LR_first_def(lr) == 0) && (LR_last_use(lr) == 0)) continue;
-    if (LR_use_cnt(lr) == 0) continue;
-    TN *cur_tn = LR_tn(lr);
-    if (TN_register_class(tn) != TN_register_class(cur_tn)) continue;
-    if (LR_conflicts_with_reg_LR(lr, cur_lr)) conflict_count++;
+  for (INT opnum = 0; opnum < BB_length(bb); opnum++) {
+    regs_in_use[opnum] = 0;
   }
-  return conflict_count;
 }
 
 
+void
+Populate_Degrees_Over_LRs(INT *regs_in_use, LIVE_RANGE *lr)
+{
+  INT opnum;
+
+  // populate the live range, first def to last use,
+  // exposed uses will cause the live range to expand.
+  for (opnum = LR_first_def(lr); opnum < LR_last_use(lr); opnum++) {
+    INT32 degree = regs_in_use[opnum];
+    degree++;
+    regs_in_use[opnum] = degree;
+  }
+}
+
+
+INT
+Find_Max_Degree_For_LR(INT *regs_in_use, LIVE_RANGE *lr)
+{
+  INT opnum;
+  INT32 max_degree = 0;
+  for (opnum = LR_first_def(lr); opnum < LR_last_use(lr); opnum++) {
+    INT32 degree = regs_in_use[opnum];
+    max_degree = (degree > max_degree) ? degree : max_degree;
+  }
+  return max_degree;
+}
+
+
 TN_MAP
-Calculate_All_Conflicts(ISA_REGISTER_CLASS rclass)
+Calculate_All_Conflicts(BB *bb, INT *regs_in_use, ISA_REGISTER_CLASS rclass)
 {
   TN_MAP conflict_map = TN_MAP_Create();
 
+  // calculate degrees for live range intervals, op by op.
+  Populate_Init_Degrees(bb, regs_in_use);
   for (LIVE_RANGE *lr = Live_Range_List; lr != NULL; lr = LR_next(lr)) {
     TN *tn = LR_tn(lr);
+    if (TN_register_class(tn) != rclass) continue;
+    if (LR_use_cnt(lr) == 0) continue;
+    if (LR_last_use(lr) == 0) continue;
+    Populate_Degrees_Over_LRs(regs_in_use, lr);
+  }
+
+  for (LIVE_RANGE *lr = Live_Range_List; lr != NULL; lr = LR_next(lr)) {
+    TN *tn = LR_tn(lr);
     INT num_conflicts;
     if (TN_register_class(tn) != rclass) continue;
     if (LR_use_cnt(lr) == 0) continue;
-    num_conflicts = Calculate_Conflicting_Live_Ranges(tn);
+    if (LR_last_use(lr) == 0) continue;
+    // true degree does not include the live range itself.
+    num_conflicts = Find_Max_Degree_For_LR(regs_in_use, lr) - 1;
     TN_MAP_Set(conflict_map, tn, (void*)num_conflicts);
   }
 
@@ -845,6 +877,73 @@
 }
 
 
+void
+Print_Range_And_Conflict_Info(TN_MAP conflict_map, 
+                              ISA_REGISTER_CLASS rclass)
+{
+  for (LIVE_RANGE *lr = Live_Range_List; lr != NULL; lr = LR_next(lr)) {
+    TN *tn = LR_tn(lr);
+    INT num_conflicts;
+    if (TN_register_class(tn) != rclass) continue;
+    if (LR_use_cnt(lr) == 0) continue;
+    if (LR_last_use(lr) == 0) continue;
+    num_conflicts = (INTPTR)TN_MAP_Get(conflict_map, tn);
+    printf("lr conflicts(%d) :", num_conflicts);
+    Print_Live_Range (lr);  
+  }
+}
+
+
+INT
+Find_Max_Conflicts(TN_MAP conflict_map,
+                   INT *average_conflicts,
+                   INT *num_k_conflicts,
+                   INT *num_edges,
+                   INT *outgoing_edges,
+                   ISA_REGISTER_CLASS rclass)
+{
+  INT max_conflicts = 0;
+  INT sum_conflicts = 0;
+  INT n_ranges = 0;
+  INT n_edges = 0;
+  INT n_defs = 0;
+  INT total_def_degree = 0;
+  INT k_conflicts = 0;
+  INT num_pr = REGISTER_CLASS_register_count(rclass);
+  LIVE_RANGE *last_lr = NULL;
+
+  for (LIVE_RANGE *lr = Live_Range_List; lr != NULL; lr = LR_next(lr)) {
+    TN *tn = LR_tn(lr);
+    INT num_conflicts;
+    if (TN_register_class(tn) != rclass) continue;
+    if (LR_use_cnt(lr) == 0) continue;
+    if (LR_last_use(lr) == 0) continue;
+    num_conflicts = (INTPTR)TN_MAP_Get(conflict_map, tn);
+    if (num_conflicts > max_conflicts)
+      max_conflicts = num_conflicts;
+    if (num_conflicts > num_pr)
+      k_conflicts++;
+    n_edges += LR_use_cnt(lr);
+    n_ranges++;
+    sum_conflicts += num_conflicts;
+    last_lr = lr;
+    if (LR_first_def(lr) != 0) {
+      n_defs += LR_def_cnt(lr);
+      if (total_def_degree < num_conflicts)
+        total_def_degree = num_conflicts;
+    }
+  }
+  if (n_ranges) {
+    TN *tn = LR_tn(last_lr);
+    *average_conflicts = (sum_conflicts/n_ranges);
+    *num_k_conflicts = k_conflicts;
+    *num_edges = n_edges;
+    *outgoing_edges = total_def_degree;
+  }
+  return max_conflicts;
+}
+
+
 bool
 Query_Conflicts_Improved(TN_MAP orig_map, 
                          TN_MAP new_map, 
@@ -877,8 +976,6 @@
     }
   }
   *num_ranges_mitigated = num_ranges_moved_below_pr_pressure; 
-  TN_MAP_Delete(orig_map);
-  TN_MAP_Delete(new_map);
  
   return (num_improved > num_degraded);
 }
@@ -918,6 +1015,61 @@
 }
 
 
+void
+Truncate_LRs_For_OP (OP *op)
+{
+  if (op == NULL) return;
+
+  BB *bb = OP_bb(op);
+  INT i;
+  INT cur_opnum;
+
+  // Find our current OP's opnum
+  for (cur_opnum = 1; cur_opnum < BB_length(bb); cur_opnum++) {
+    OP *cur_op = OP_VECTOR_element (Insts_Vector, cur_opnum);
+    if (op == cur_op)
+      break;
+  }
+  // did we find it?
+  if (cur_opnum == BB_length(bb))
+    return;
+
+  for (i = 0; i < OP_results(op); i++) {
+    TN *res = OP_result(op, i);  
+    if (TN_is_register(res)) {
+      LIVE_RANGE *lr = LR_For_TN(res);
+      if (LR_first_def(lr) == cur_opnum) {
+        LR_first_def(lr) = 0;
+        if (LR_upward_exposed_use(lr) == cur_opnum) {
+          if (LR_exposed_use(lr) == LR_upward_exposed_use(lr))
+            LR_exposed_use(lr) = 0;
+          LR_upward_exposed_use(lr) = 0;
+        } 
+      }
+      if (LR_def_cnt(lr) > 1)
+        LR_def_cnt(lr)--;
+    }
+  }
+  for (i = 0; i < OP_opnds(op); i++) {
+    TN *opnd_tn = OP_opnd(op,i);
+    if (TN_is_register(opnd_tn)) {
+      LIVE_RANGE *lr = LR_For_TN(opnd_tn);
+      LR_use_cnt(lr)--;
+      if (LR_last_use(lr) == cur_opnum) {
+        // walk up to the closest use, else the last use is 0
+        LR_last_use(lr) = 0;
+        for (INT opnum = cur_opnum; opnum > 0; opnum--) {
+          OP *cur_op = OP_VECTOR_element (Insts_Vector, opnum);
+          if (OP_Refs_TN(cur_op, opnd_tn)) {
+            LR_last_use(lr) = opnum;
+          }
+        }
+      }
+    }
+  }
+}
+
+
 /* Mark that TN is used in OP. */
 static void
 Mark_Use (TN *tn, OP *op, INT opnum, BB *bb, BOOL in_lra,

Modified: trunk/osprey/be/cg/lra.h
===================================================================
--- trunk/osprey/be/cg/lra.h    2011-06-16 07:17:12 UTC (rev 3650)
+++ trunk/osprey/be/cg/lra.h    2011-06-17 19:45:27 UTC (rev 3651)
@@ -85,13 +85,24 @@
  * final result, and which comparisons of live range pressure
  * can be made.
  */
-extern TN_MAP Calculate_All_Conflicts(ISA_REGISTER_CLASS rclass);
+extern TN_MAP Calculate_All_Conflicts(BB *bb, 
+                                      INT *regs_in_use,
+                                      ISA_REGISTER_CLASS rclass);
 extern void Merge_Live_Ranges(TN *tn1, TN *tn2, bool make_tn1_span);
 extern bool Query_Conflicts_Improved(TN_MAP orig_map,
                                      TN_MAP new_map,
                                      INT num_reserved,
                                      INT *num_ranges_mitigated,
                                      ISA_REGISTER_CLASS rclass);
+extern void Print_Range_And_Conflict_Info(TN_MAP conflict_map,
+                                          ISA_REGISTER_CLASS rclass);
+extern INT Find_Max_Conflicts(TN_MAP conflict_map,
+                              INT *average_conflicts,
+                              INT *num_k_conflicts,
+                              INT *num_edges,
+                              INT *outgoing_conflicts,
+                              ISA_REGISTER_CLASS rclass);
+extern void Truncate_LRs_For_OP(OP *op);
 
 /* Returns the number of registers LRA is requesting from GRA for
  * the class <cl> in the basic block <bb>. If we run the scheduling

Modified: trunk/osprey/be/cg/op.h
===================================================================
--- trunk/osprey/be/cg/op.h     2011-06-16 07:17:12 UTC (rev 3650)
+++ trunk/osprey/be/cg/op.h     2011-06-17 19:45:27 UTC (rev 3651)
@@ -727,6 +727,8 @@
 #define OP_memory(o)           (OP_load(o) | OP_store(o) | OP_prefetch(o))
 #define OP_mcode(o)             (TOP_is_mcode(OP_code(o)))
 #define OP_is4(o)               (TOP_is_is4_reg(OP_code(o)))
+#define OP_vec_lo_ldst(o)       (TOP_is_vector_lo_loadstore(OP_code(o)))
+#define OP_vec_hi_ldst(o)       (TOP_is_vector_high_loadstore(OP_code(o)))
 #else
 #define OP_memory(o)           (OP_load(o) | OP_store(o) | OP_prefetch(o))
 #endif

Modified: trunk/osprey/be/cg/x8664/ebo_special.cxx
===================================================================
--- trunk/osprey/be/cg/x8664/ebo_special.cxx    2011-06-16 07:17:12 UTC (rev 3650)
+++ trunk/osprey/be/cg/x8664/ebo_special.cxx    2011-06-17 19:45:27 UTC (rev 3651)
@@ -5186,6 +5186,7 @@
   OP* new_op = NULL;
   ADDR_MODE mode = BASE_INDEX_MODE;
   const TOP new_top = Get_Top_For_Addr_Mode(OP_code(op), mode);
+  const TOP old_top = OP_code(op);
   FmtAssert( new_top != TOP_UNDEFINED, ("Compose_Mem_Op: unknown top") );
   if( TOP_is_prefetch( new_top ) ){
       new_op = Mk_OP( new_top, OP_opnd( op, 0 ), base, offset, index, scale );
@@ -5198,7 +5199,16 @@
       storeval = OP_result( op, 0 );
     }
     if (OP_load(op) || OP_store(op) || OP_prefetch(op)) {
-        new_op = Mk_OP( new_top, storeval, base, offset, index, scale );
+        if (Is_Target_Orochi() && Is_Target_AVX() && OP_load(op) &&
+            (old_top != TOP_vldsd) &&
+            (old_top != TOP_vldsdx) &&
+            (old_top != TOP_vldsdxx) &&
+            (OP_vec_lo_ldst(op) || OP_vec_hi_ldst(op))) {
+            new_op = Mk_OP( new_top, storeval, OP_opnd( op, 0 ), 
+                            base, index, scale, offset );
+        } else {
+            new_op = Mk_OP( new_top, storeval, base, offset, index, scale );
+        }
     } else if (OP_load_exe(op)) {
         if (OP_opnds(op) == 2) {
             FmtAssert ((storeval != NULL), 
@@ -5325,6 +5335,22 @@
   return found;
 }
 
+static bool tn_find(std::deque<TN*>& tn_queue, TN *tn)
+{
+  bool found = false;
+  std::deque<TN*>::iterator tn_queue_it;
+  for (tn_queue_it = tn_queue.begin();
+       tn_queue_it != tn_queue.end();
+       ++tn_queue_it) {
+    TN* cur_tn = *tn_queue_it;
+    if (cur_tn == tn) {
+      found = true;
+      break;
+    }
+  }
+  return found;
+}
+
 // compare two def trees and mark attributes concerning input tn's tree
 static void compare_def_tree(TN *tn,
                              INT *num,
@@ -6124,6 +6150,23 @@
   TN_MAP_Delete(def_map);
 }
 
+static void prune_adds_from_live_range_analysis(
+            std::deque<TN*>& add_tns,
+            std::set<TN*>& counted_base_regs,
+            std::map<TN*,OP*>& add_map)
+{
+  std::set<TN*>::const_iterator counted_base_regs_it;
+  for (counted_base_regs_it = counted_base_regs.begin();
+       counted_base_regs_it != counted_base_regs.end();
+       ++counted_base_regs_it) {
+    TN *tn = *counted_base_regs_it;
+    if (tn_find(add_tns, tn) == false) {
+      OP* add_op = add_map[tn];
+      Truncate_LRs_For_OP(add_op);
+    }
+  }
+}
+
 // After building interior pointers candidates, remove
 // all the effected counted_base_regs from SIB processing so
 // that we do not translate them in SIB translation.
@@ -6133,7 +6176,8 @@
             std::map<INT,std::deque<ST*> >& correlated_addr_map,
             std::map<ST*,std::deque<TN*> >& symbol_addr_map,
             std::set<TN*>& counted_base_regs,
-            BB *lhead,
+            std::map<TN*,OP*>& add_map,
+            BB *bb,
             bool loop_vectorized,
             MEM_POOL *pool) {
   INT num_cands = 0;
@@ -6189,15 +6233,49 @@
     TN_MAP orig_conflict_map;
     TN_MAP new_conflict_map;
     mINT8 fatpoint[ISA_REGISTER_CLASS_MAX+1];
-    const INT len = BB_length(lhead);
+    const INT len = BB_length(bb);
     INT* regs_in_use = (INT *)alloca(sizeof(INT) * (len+1));
+    std::deque<TN*> add_tns;
 
     MEM_POOL_Push(pool);
 
     // Calculate the current live ranges
-    LRA_Estimate_Fat_Points(lhead, fatpoint, regs_in_use, pool);
-    orig_conflict_map = Calculate_All_Conflicts(ISA_REGISTER_CLASS_integer);
+    LRA_Estimate_Fat_Points(bb, fatpoint, regs_in_use, pool);
+    // build a list of tns interior pointers will operate on
+    for (counted_addr_sts_it = counted_addr_sts.begin();
+         counted_addr_sts_it != counted_addr_sts.end();
+       ++counted_addr_sts_it) {
+      std::deque<TN*> st_tns;
+      st_tns = symbol_addr_map[*counted_addr_sts_it];
 
+      // Skip the non interior pointer cands
+      if (st_tns.empty()) continue;
+
+      INT cor_addr_index = st_tns.size();
+      if (cor_addr_index != max_pattern) continue;
+
+      if (!correlated_addr_map[cor_addr_index].empty()) {
+        std::deque<TN*>::iterator st_tns_iter;
+        for (st_tns_iter = st_tns.begin();
+             st_tns_iter != st_tns.end();
+             ++st_tns_iter) {
+          TN *tn1 = *st_tns_iter;
+          ++st_tns_iter;
+          TN *tn2 = *st_tns_iter;
+          add_tns.push_front(tn2);
+        }
+      }
+    }
+
+    // remove all the sib adds from our live range maps so that we get
+    // an accurate picture for analysis.
+    prune_adds_from_live_range_analysis(add_tns, counted_base_regs, add_map);
+    add_tns.clear();
+
+    // Do the initial live range analysis
+    orig_conflict_map = Calculate_All_Conflicts(bb, regs_in_use, 
+                                                ISA_REGISTER_CLASS_integer);
+
     // Merge all pairs live ranges on the minor basereg
     bool first_time = true;
     for (counted_addr_sts_it = counted_addr_sts.begin();
@@ -6242,13 +6320,31 @@
     // If Query_Conflicts_Improved returns with a state that indicates
     // that introduction of interior pointers does not benefit live
     // range pressure, we do not proceed with the allowing the translation.
-    new_conflict_map = Calculate_All_Conflicts(ISA_REGISTER_CLASS_integer);
+    new_conflict_map = Calculate_All_Conflicts(bb, regs_in_use, 
+                                               ISA_REGISTER_CLASS_integer);
+    INT N_i = 0;
+    INT D_i = 0;
+    INT avg_conflicts = 0;
+    INT k_conflicts = 0;
     if (Query_Conflicts_Improved(orig_conflict_map, 
                                  new_conflict_map,
                                  3,
                                  &num_ranges_mitigated,
-                                 ISA_REGISTER_CLASS_integer) == false)
+                                 ISA_REGISTER_CLASS_integer) == false) {
       clear_all = true;
+    } else if (Find_Max_Conflicts(orig_conflict_map,
+                                  &avg_conflicts,
+                                  &k_conflicts,
+                                  &N_i,
+                                  &D_i,
+                                  ISA_REGISTER_CLASS_integer) == num_pr) {
+      // For vectorized loops we want more than k-conflicts as max in the
+      // original conflict map context.
+      if (loop_vectorized)
+        min_reclaimable = 4; 
+    }
+    TN_MAP_Delete(orig_conflict_map);
+    TN_MAP_Delete(new_conflict_map);
 
     MEM_POOL_Pop(pool);
   }
@@ -7006,6 +7102,7 @@
                                                        correlated_addr_map,
                                                        symbol_addr_map,
                                                        counted_base_regs,
+                                                       add_map,
                                                        lhead, 
                                                        loop_vectorized,
                                                        pool);
@@ -7423,7 +7520,7 @@
         }
       }
       if (dontdoit) continue;
-
+    
       //
       // collect the LiveIn sets of pdoms of loop header
       // which are not part of the loop
@@ -7657,6 +7754,15 @@
     return FALSE;
   }
 
+  const TOP old_top = OP_code(op);
+  if (Is_Target_Orochi() && Is_Target_AVX() && 
+      (old_top != TOP_vldsd) &&
+      (old_top != TOP_vldsdx) &&
+      (old_top != TOP_vldsdxx) &&
+      (OP_vec_lo_ldst(op) || OP_vec_hi_ldst(op))) {
+    return FALSE;
+  }
+
   const INT op_base_idx = OP_find_opnd_use( op, OU_base );
   EBO_TN_INFO* base_tninfo = op_base_idx >= 0 ? actual_tninfo[op_base_idx] : 
NULL;
   OP* addr_op = (base_tninfo != NULL) ? base_tninfo->in_op : NULL;

Modified: trunk/osprey/be/cg/x8664/exp_loadstore.cxx
===================================================================
--- trunk/osprey/be/cg/x8664/exp_loadstore.cxx  2011-06-16 07:17:12 UTC (rev 
3650)
+++ trunk/osprey/be/cg/x8664/exp_loadstore.cxx  2011-06-17 19:45:27 UTC (rev 
3651)
@@ -382,6 +382,16 @@
       }
     }
   }
+  if (Is_Target_Orochi() && Is_Target_AVX() && 
+      ((top == TOP_ldlps) ||
+       (top == TOP_ldhps) ||
+       (top == TOP_ldlpd) || 
+       (top == TOP_ldhpd))){
+     TN *xzero = Build_TN_Like(result);
+     Build_OP( TOP_xzero128v32, xzero, ops );
+     Build_OP( top, result, xzero, base, ofst, ops );
+     return;
+  }
 
   Build_OP (top, result, base, ofst, ops);
 }
@@ -634,10 +644,34 @@
   }
   else if (mtype == MTYPE_V8I1 || mtype == MTYPE_V8I2 ||
           mtype == MTYPE_V8I4 || mtype == MTYPE_V8I8) {
-    if (base != NULL)
-      Build_OP(!Is_Target_SSE2() ? TOP_ldlps : TOP_ld64_2sse, result, base, 
disp, ops);    
-    else Build_OP(!Is_Target_SSE2() ? TOP_ldlps_n32 : TOP_ld64_2sse_n32, 
result, disp, ops);    
+    if (Is_Target_Orochi() && Is_Target_AVX()){
+      TN *xzero = Build_TN_Like(result);
+      Build_OP(TOP_xzero128v32, xzero, ops);
+      if (base != NULL)
+        Build_OP(TOP_ldlps, result, xzero, base, disp, ops);
+      else 
+        Build_OP(TOP_ldlps_n32, result, xzero, disp, ops);
+    } else {
+      if (base != NULL)
+        Build_OP(!Is_Target_SSE2() ? TOP_ldlps : TOP_ld64_2sse, 
+                 result, base, disp, ops);    
+      else 
+        Build_OP(!Is_Target_SSE2() ? TOP_ldlps_n32 : TOP_ld64_2sse_n32, 
+                 result, disp, ops);    
+    }
   }
+  else if (mtype == MTYPE_V8F4 ) {
+    if (Is_Target_Orochi() && Is_Target_AVX()){
+      TN *xzero = Build_TN_Like(result);
+      Build_OP(TOP_xzero128v32, xzero, ops);
+      if (base != NULL)
+        Build_OP(TOP_ldlps, result, xzero, base, disp, ops);
+      else 
+        Build_OP(TOP_ldlps_n32, result, xzero, disp, ops);
+    } else {
+      Expand_Composed_Load (op, result, base, disp, variant, ops);
+    }
+  }
   else if (mtype == MTYPE_V16F8 || mtype == MTYPE_V16C8) {
     if(Is_Target_Barcelona() || Is_Target_Orochi()){
      if(base != NULL)

Modified: trunk/osprey/be/cg/x8664/expand.cxx
===================================================================
--- trunk/osprey/be/cg/x8664/expand.cxx 2011-06-16 07:17:12 UTC (rev 3650)
+++ trunk/osprey/be/cg/x8664/expand.cxx 2011-06-17 19:45:27 UTC (rev 3651)
@@ -7988,7 +7988,13 @@
     Build_OP( TOP_ldsd, result, op1, Gen_Literal_TN (0,4), ops );
     break;
   case INTRN_LOADHPD:
-    Build_OP( TOP_ldhpd, result, op1, Gen_Literal_TN (0,4), ops );
+    if (Is_Target_Orochi() && Is_Target_AVX()){
+      TN *xzero = Build_TN_Like(result);
+      Build_OP( TOP_xzero128v32, xzero, ops );
+      Build_OP( TOP_ldhpd, result, xzero, op1, Gen_Literal_TN (0,4), ops );
+    } else {
+      Build_OP( TOP_ldhpd, result, op1, Gen_Literal_TN (0,4), ops );
+    }
     break;
   case INTRN_UNPCKLPD:
     Build_OP( TOP_unpcklpd, result, op0, op1, ops );
@@ -8208,10 +8214,22 @@
     Build_OP( TOP_ldupd, result, op0, Gen_Literal_TN (0,4), ops );
     break;
   case INTRN_LOADHPS:
-    Build_OP( TOP_ldhps, result, op1, Gen_Literal_TN (0,4), ops );
+    if (Is_Target_Orochi() && Is_Target_AVX()){
+      TN *xzero = Build_TN_Like(result);
+      Build_OP( TOP_xzero128v32, xzero, ops );
+      Build_OP( TOP_ldhps, result, xzero, op1, Gen_Literal_TN (0,4), ops );
+    } else {
+      Build_OP( TOP_ldhps, result, op1, Gen_Literal_TN (0,4), ops );
+    }
     break;
   case INTRN_LOADLPS:
-    Build_OP( TOP_ldlps, result, op1, Gen_Literal_TN (0,4), ops );
+    if (Is_Target_Orochi() && Is_Target_AVX()){
+      TN *xzero = Build_TN_Like(result);
+      Build_OP( TOP_xzero128v32, xzero, ops );
+      Build_OP( TOP_ldlps, result, xzero, op1, Gen_Literal_TN (0,4), ops );
+    } else {
+      Build_OP( TOP_ldlps, result, op1, Gen_Literal_TN (0,4), ops );
+    }
     break;
   case INTRN_MOVMSKPS:
     Build_OP( TOP_movmskps, result, op0, ops );

Modified: trunk/osprey/common/targ_info/isa/x8664/isa_operands.cxx
===================================================================
--- trunk/osprey/common/targ_info/isa/x8664/isa_operands.cxx    2011-06-16 
07:17:12 UTC (rev 3650)
+++ trunk/osprey/common/targ_info/isa/x8664/isa_operands.cxx    2011-06-17 
19:45:27 UTC (rev 3651)
@@ -1545,6 +1545,10 @@
                            TOP_vaesenclastx,
                            TOP_vaesdecx,
                            TOP_vaesdeclastx,
+                           TOP_vldhpd,
+                           TOP_vldlpd,
+                           TOP_vldhps,
+                           TOP_vldlps,
                           TOP_UNDEFINED);
   Result(0, fp128);
   Operand(0, fp128, opnd1);
@@ -1785,6 +1789,10 @@
                            TOP_vaesenclastxx,
                            TOP_vaesdecxx,
                            TOP_vaesdeclastxx,
+                           TOP_vldhpdx,
+                           TOP_vldhpsx,
+                           TOP_vldlpdx,
+                           TOP_vldlpsx,
                           TOP_UNDEFINED);
   Result(0, fp128);
   Operand(0, fp128, opnd1);
@@ -2025,6 +2033,10 @@
                            TOP_vaesenclastxxx,
                            TOP_vaesdecxxx,
                            TOP_vaesdeclastxxx,
+                           TOP_vldhpdxx,
+                           TOP_vldhpsxx,
+                           TOP_vldlpdxx,
+                           TOP_vldlpsxx,
                           TOP_UNDEFINED);
   Result(0, fp128);
   Operand(0, fp128, opnd1);
@@ -4286,15 +4298,21 @@
                     TOP_vlddqa_n32,
                     TOP_vldapd_n32,
                     TOP_vldaps_n32,
-                    TOP_vldlpd_n32,
                     TOP_vldupd_n32,
                     TOP_vldups_n32,
+                   TOP_UNDEFINED);
+  Result(0, fp128);
+  Operand(0, simm32, offset);
+
+  Instruction_Group("avx float load vector w/o base or index",
                     TOP_vldhpd_n32,
+                    TOP_vldlpd_n32,
                     TOP_vldhps_n32,
                     TOP_vldlps_n32,
                    TOP_UNDEFINED);
   Result(0, fp128);
-  Operand(0, simm32, offset);
+  Operand(0, fp128, opnd1);
+  Operand(1, simm32, offset);
 
   Instruction_Group("float load vector",
                    TOP_lddqa,
@@ -4318,14 +4336,10 @@
                     TOP_vldntdqa,
                     TOP_vldapd,
                     TOP_vldaps,
-                    TOP_vldlpd,
                     TOP_vldss,
                     TOP_vldupd,
                     TOP_vldups,
-                    TOP_vldhpd,
-                    TOP_vldhps,
                     TOP_vldsd,
-                    TOP_vldlps,
                    TOP_UNDEFINED);
   Result(0, fp128);
   Operand(0, int64, base);
@@ -5676,14 +5690,10 @@
                     TOP_vldntdqax,
                     TOP_vldapdx,
                     TOP_vldapsx,
-                    TOP_vldlpdx,
                     TOP_vldssx,
                     TOP_vldupdx,
                     TOP_vldupsx,
-                    TOP_vldhpdx,
-                    TOP_vldhpsx,
                     TOP_vldsdx,
-                    TOP_vldlpsx,
                    TOP_UNDEFINED);
   Result(0,  fp128);
   Operand(0, int64, base);
@@ -5745,14 +5755,10 @@
                     TOP_vldntdqaxx,
                     TOP_vldapdxx,
                     TOP_vldapsxx,
-                    TOP_vldlpdxx,
                     TOP_vldssxx,
                     TOP_vldupdxx,
                     TOP_vldupsxx,
-                    TOP_vldhpdxx,
-                    TOP_vldhpsxx,
                     TOP_vldsdxx,
-                    TOP_vldlpsxx,
                    TOP_UNDEFINED);
   Result(0,  fp128);
   Operand(0, int64, index);

Modified: trunk/osprey/common/targ_info/isa/x8664/isa_print.cxx
===================================================================
--- trunk/osprey/common/targ_info/isa/x8664/isa_print.cxx       2011-06-16 
07:17:12 UTC (rev 3650)
+++ trunk/osprey/common/targ_info/isa/x8664/isa_print.cxx       2011-06-17 
19:45:27 UTC (rev 3651)
@@ -1982,6 +1982,10 @@
                            TOP_vaesdecx,
                            TOP_vaesdeclastx,
                            TOP_vaeskeygenassistx,
+                           TOP_vldhpd,
+                           TOP_vldlpd,
+                           TOP_vldhps,
+                           TOP_vldlps,
                            TOP_UNDEFINED);
 
   /* dest=op(memop, reg), non-x86-style */
@@ -2198,6 +2202,10 @@
                            TOP_vaesdecxx,
                            TOP_vaesdeclastxx,
                            TOP_vaeskeygenassistxx,
+                           TOP_vldhpdx,
+                           TOP_vldhpsx,
+                           TOP_vldlpdx,
+                           TOP_vldlpsx,
                            TOP_UNDEFINED);
 
   /* dest=op(memop with scaled index with base, reg), non-x86-style */
@@ -2415,6 +2423,10 @@
                            TOP_vaesdecxxx,
                            TOP_vaesdeclastxxx,
                            TOP_vaeskeygenassistxxx,
+                           TOP_vldhpdxx,
+                           TOP_vldhpsxx,
+                           TOP_vldlpdxx,
+                           TOP_vldlpsxx,
                            TOP_UNDEFINED);
 
   /* dest=op(memop with scaled index without base, reg), non-x86-style */
@@ -3422,12 +3434,8 @@
                            TOP_vlddqa_n32,
                            TOP_vldapd_n32,
                            TOP_vldaps_n32,
-                           TOP_vldlpd_n32,
                            TOP_vldupd_n32,
                            TOP_vldups_n32,
-                           TOP_vldhpd_n32,
-                           TOP_vldhps_n32,
-                           TOP_vldlps_n32,
                            TOP_vstdqa_n32,
                            TOP_vstdqu_n32,
                            TOP_vstapd_n32,
@@ -3441,6 +3449,19 @@
                            TOP_vaesimc,
                           TOP_UNDEFINED );
 
+  /* One result / two operands: name, operand 1, operand 0, then result */
+  ISA_PRINT_TYPE rop2 =  ISA_Print_Type_Create("rop2", "%s %s %s,%s");
+  Name();
+  Operand(1);
+  Operand(0);
+  Result(0);
+  Instruction_Print_Group( rop2, /* the type created above, not rop */
+                           TOP_vldhpd_n32,
+                           TOP_vldhps_n32,
+                           TOP_vldlpd_n32,
+                           TOP_vldlps_n32,
+                          TOP_UNDEFINED );
+
   /* One result / one mem opnd */
   ISA_PRINT_TYPE rmem =  ISA_Print_Type_Create("rmem", "%s %s%s(%s),%s");
   Name();
@@ -3459,6 +3480,7 @@
                            TOP_pmovzxbwx,
                            TOP_pmovsxbdx,
                            TOP_pmovzxbdx,
+                           TOP_vldlpd_n32,
                            TOP_pmovsxbqx,
                            TOP_pmovzxbqx,
                            TOP_pmovsxwdx,
@@ -3797,14 +3819,10 @@
                            TOP_vldntdqa,
                            TOP_vldapd,
                            TOP_vldaps,
-                           TOP_vldlpd,
                            TOP_vldss,
                            TOP_vldupd,
                            TOP_vldups,
-                           TOP_vldhpd,
-                           TOP_vldhps,
                            TOP_vldsd,
-                           TOP_vldlps,
                            TOP_vfbroadcastss,
                            TOP_vfbroadcastsd,
                            TOP_vfbroadcastf128,
@@ -3928,14 +3946,10 @@
                            TOP_vldntdqax,
                            TOP_vldapdx,
                            TOP_vldapsx,
-                           TOP_vldlpdx,
                            TOP_vldssx,
                            TOP_vldupdx,
                            TOP_vldupsx,
-                           TOP_vldhpdx,
-                           TOP_vldhpsx,
                            TOP_vldsdx,
-                           TOP_vldlpsx,
                            TOP_vfbroadcastxss,
                            TOP_vfbroadcastxsd,
                            TOP_vfbroadcastxf128,
@@ -3985,14 +3999,10 @@
                            TOP_vldntdqaxx,
                            TOP_vldapdxx,
                            TOP_vldapsxx,
-                           TOP_vldlpdxx,
                            TOP_vldssxx,
                            TOP_vldupdxx,
                            TOP_vldupsxx,
-                           TOP_vldhpdxx,
-                           TOP_vldhpsxx,
                            TOP_vldsdxx,
-                           TOP_vldlpsxx,
                            TOP_vfbroadcastxxss,
                            TOP_vfbroadcastxxsd,
                            TOP_vfbroadcastxxf128,

Modified: trunk/osprey/common/targ_info/isa/x8664/isa_properties.cxx
===================================================================
--- trunk/osprey/common/targ_info/isa/x8664/isa_properties.cxx  2011-06-16 
07:17:12 UTC (rev 3650)
+++ trunk/osprey/common/targ_info/isa/x8664/isa_properties.cxx  2011-06-17 
19:45:27 UTC (rev 3651)
@@ -5651,6 +5651,18 @@
                      TOP_vpmadcswdxx,
                      TOP_vpmadcswdxxx,
                      /* AVX instructions */
+                     TOP_vldhpd,
+                     TOP_vldhps,
+                     TOP_vldlpd,
+                     TOP_vldlps,
+                     TOP_vldhpdx,
+                     TOP_vldhpsx,
+                     TOP_vldlpdx,
+                     TOP_vldlpsx,
+                     TOP_vldhpdxx,
+                     TOP_vldhpsxx,
+                     TOP_vldlpdxx,
+                     TOP_vldlpsxx,
                      TOP_vcmpestrix,
                      TOP_vcmpestrixx,
                      TOP_vcmpestrixxx,


------------------------------------------------------------------------------
EditLive Enterprise is the world's most technically advanced content
authoring tool. Experience the power of Track Changes, Inline Image
Editing and ensure content is compliant with Accessibility Checking.
http://p.sf.net/sfu/ephox-dev2dev
_______________________________________________
Open64-devel mailing list
Open64-devel@lists.sourceforge.net
https://lists.sourceforge.net/lists/listinfo/open64-devel

Reply via email to