Author: mberg
Date: 2012-04-20 17:30:33 -0400 (Fri, 20 Apr 2012)
New Revision: 3915
Modified:
trunk/osprey/be/cg/annotations.h
trunk/osprey/be/cg/bbutil.cxx
trunk/osprey/be/cg/cg_loop.cxx
trunk/osprey/be/cg/cgemit.cxx
trunk/osprey/be/cg/whirl2ops.cxx
trunk/osprey/be/lno/can.cxx
trunk/osprey/be/lno/lnopt_main.cxx
trunk/osprey/be/lno/lnopt_main.h
trunk/osprey/be/lno/pf_loop.cxx
trunk/osprey/be/lno/simd.cxx
trunk/osprey/common/com/config_lno.cxx
trunk/osprey/common/com/config_lno.h
trunk/osprey/common/com/wn_core.h
Log:
Add multiversioning for alignment. The flag that controls this
functionality is -LNO:simd_peel_align=<on|off>, and it is off by
default. This code creates two versions of selected vector loops.
One version is peeled for alignment and is guarded by a run-time
test that checks whether it can be executed; otherwise the
original version is executed.
CR by Mei Ye.
Modified: trunk/osprey/be/cg/annotations.h
===================================================================
--- trunk/osprey/be/cg/annotations.h 2012-04-20 00:01:13 UTC (rev 3914)
+++ trunk/osprey/be/cg/annotations.h 2012-04-20 21:30:33 UTC (rev 3915)
@@ -178,6 +178,7 @@
SRCPOS srcpos; /* source position of start of body */
BOOL is_multiversion;
BOOL is_vectorized; /* loop is vectorized (yes/no) */
+ BOOL is_align_peeled; /* loop is peeled for alignment (yes/no) */
} LOOPINFO;
#define LOOPINFO_wn(x) ((x)->wn)
@@ -186,6 +187,7 @@
#define LOOPINFO_trip_count_tn(x) ((x)->trip_count_tn)
#define LOOPINFO_multiversion(x) ((x)->is_multiversion)
#define LOOPINFO_vectorized(x) ((x)->is_vectorized)
+#define LOOPINFO_align_peeled(x) ((x)->is_align_peeled)
typedef struct entryinfo {
Modified: trunk/osprey/be/cg/bbutil.cxx
===================================================================
--- trunk/osprey/be/cg/bbutil.cxx 2012-04-20 00:01:13 UTC (rev 3914)
+++ trunk/osprey/be/cg/bbutil.cxx 2012-04-20 21:30:33 UTC (rev 3915)
@@ -953,6 +953,7 @@
if (WN_Loop_Up_Trip(loop_info)) fprintf(TFile, "UP_TRIP ");
if (LOOPINFO_multiversion(info)) fprintf(TFile, "LMV");
if (LOOPINFO_vectorized(info)) fprintf(TFile, "VEC");
+ if (LOOPINFO_align_peeled(info)) fprintf(TFile, "ALIGN_PEELED");
fprintf(TFile, "\n");
if (LOOPINFO_trip_count_tn(info)) {
fprintf(TFile, " trip count TN = ");
Modified: trunk/osprey/be/cg/cg_loop.cxx
===================================================================
--- trunk/osprey/be/cg/cg_loop.cxx 2012-04-20 00:01:13 UTC (rev 3914)
+++ trunk/osprey/be/cg/cg_loop.cxx 2012-04-20 21:30:33 UTC (rev 3915)
@@ -2809,6 +2809,7 @@
LOOPINFO_wn(copied_info) = wn;
LOOPINFO_srcpos(copied_info) = LOOPINFO_srcpos(info);
LOOPINFO_vectorized(copied_info) = LOOPINFO_vectorized(info);
+ LOOPINFO_align_peeled(copied_info) = LOOPINFO_align_peeled(info);
if (TN_is_constant(trip_count))
LOOPINFO_trip_count_tn(copied_info) =
Gen_Literal_TN(new_trip_count_val, TN_size(trip_count));
@@ -2917,6 +2918,7 @@
LOOPINFO_wn(unrolled_info) = wn;
LOOPINFO_srcpos(unrolled_info) = LOOPINFO_srcpos(info);
LOOPINFO_vectorized(unrolled_info) = LOOPINFO_vectorized(info);
+ LOOPINFO_align_peeled(unrolled_info) = LOOPINFO_align_peeled(info);
if (TN_is_constant(trip_count))
LOOPINFO_trip_count_tn(unrolled_info) =
Gen_Literal_TN(new_trip_count_val, TN_size(trip_count));
Modified: trunk/osprey/be/cg/cgemit.cxx
===================================================================
--- trunk/osprey/be/cg/cgemit.cxx 2012-04-20 00:01:13 UTC (rev 3914)
+++ trunk/osprey/be/cg/cgemit.cxx 2012-04-20 21:30:33 UTC (rev 3915)
@@ -4699,6 +4699,14 @@
: ", nesting depth: %d, %siterations: %lld";
fprintf (file, fmt, depth, estimated, trip_count);
+
+ if (LOOPINFO_vectorized(info)) {
+ fprintf (file, "\n #<loop> vectorized");
+ if (LOOPINFO_align_peeled(info))
+ fprintf (file, "\n #<loop> vector loop : peeled for alignment");
+ } else if (LOOPINFO_align_peeled(info)) {
+ fprintf (file, "\n #<loop> scalar loop : peeled iter to align");
+ }
}
fputc ('\n', file);
Modified: trunk/osprey/be/cg/whirl2ops.cxx
===================================================================
--- trunk/osprey/be/cg/whirl2ops.cxx 2012-04-20 00:01:13 UTC (rev 3914)
+++ trunk/osprey/be/cg/whirl2ops.cxx 2012-04-20 21:30:33 UTC (rev 3915)
@@ -7403,6 +7403,7 @@
LOOPINFO_trip_count_tn(info) = trip_tn;
LOOPINFO_multiversion(info) = WN_Loop_Multiversion_Alias(loop_info);
LOOPINFO_vectorized(info) = WN_Loop_Vectorized(loop_info);
+ LOOPINFO_align_peeled(info) = WN_Loop_Align_Peeled(loop_info);
#ifndef TARG_NVISA
if (!CG_PU_Has_Feedback && WN_loop_trip_est(loop_info) == 0)
Modified: trunk/osprey/be/lno/can.cxx
===================================================================
--- trunk/osprey/be/lno/can.cxx 2012-04-20 00:01:13 UTC (rev 3914)
+++ trunk/osprey/be/lno/can.cxx 2012-04-20 21:30:33 UTC (rev 3915)
@@ -417,6 +417,7 @@
if (loop_info) {
dli->Multiversion_Alias = (WN_Loop_Multiversion_Alias(loop_info) != 0);
dli->Loop_Vectorized = (WN_Loop_Vectorized(loop_info) != 0);
+ dli->Loop_Align_Peeled = (WN_Loop_Align_Peeled(loop_info) != 0);
}
WN_MAP_Set(LNO_Info_Map,wn,(void *)dli);
} else {
Modified: trunk/osprey/be/lno/lnopt_main.cxx
===================================================================
--- trunk/osprey/be/lno/lnopt_main.cxx 2012-04-20 00:01:13 UTC (rev 3914)
+++ trunk/osprey/be/lno/lnopt_main.cxx 2012-04-20 21:30:33 UTC (rev 3915)
@@ -2219,6 +2219,7 @@
Has_Barriers = FALSE;
Multiversion_Alias = FALSE;
Loop_Vectorized = FALSE;
+ Loop_Align_Peeled = FALSE;
Is_Ivdep = FALSE;
Is_Concurrent_Call = FALSE;
Concurrent_Directive = FALSE;
@@ -2328,6 +2329,7 @@
Has_Barriers = dli->Has_Barriers;
Multiversion_Alias = dli->Multiversion_Alias;
Loop_Vectorized = dli->Loop_Vectorized;
+ Loop_Align_Peeled = dli->Loop_Align_Peeled;
Is_Ivdep = dli->Is_Ivdep;
Is_Concurrent_Call = dli->Is_Concurrent_Call;
Concurrent_Directive = dli->Concurrent_Directive;
Modified: trunk/osprey/be/lno/lnopt_main.h
===================================================================
--- trunk/osprey/be/lno/lnopt_main.h 2012-04-20 00:01:13 UTC (rev 3914)
+++ trunk/osprey/be/lno/lnopt_main.h 2012-04-20 21:30:33 UTC (rev 3915)
@@ -894,6 +894,7 @@
mBOOL Has_Barriers;
mBOOL Multiversion_Alias;
mBOOL Loop_Vectorized; // attribute to mark loops which are vectorized
+ mBOOL Loop_Align_Peeled; // attribute to mark loops that are peeled for align
mINT8 Required_Unroll;
mINT8 Prefer_Fuse;
mINT8 Has_Precom_Def;
Modified: trunk/osprey/be/lno/pf_loop.cxx
===================================================================
--- trunk/osprey/be/lno/pf_loop.cxx 2012-04-20 00:01:13 UTC (rev 3914)
+++ trunk/osprey/be/lno/pf_loop.cxx 2012-04-20 21:30:33 UTC (rev 3915)
@@ -999,6 +999,9 @@
LNO_Num_Iters < 100)))
single_small_trip_loop = TRUE;
}
+ if (dli->Loop_Align_Peeled && !dli->Loop_Vectorized)
+ single_small_trip_loop = TRUE;
+
if ((LNO_Run_Prefetch > SOME_PREFETCH ||
(LNO_Run_Prefetch == SOME_PREFETCH && !Is_Multi_BB (w))) &&
// !simple_copy_loop && // bug 8560 disable this
Modified: trunk/osprey/be/lno/simd.cxx
===================================================================
--- trunk/osprey/be/lno/simd.cxx 2012-04-20 00:01:13 UTC (rev 3914)
+++ trunk/osprey/be/lno/simd.cxx 2012-04-20 21:30:33 UTC (rev 3915)
@@ -92,6 +92,15 @@
#include "simd_util.h"
#include "small_trips.h" // for Remove_Unity_Trip_Loop
+#define simd_util_INCLUDED
+//#include "simd_util.h"
+
+#include <vector>
+#include <set>
+#include <map>
+#include <list>
+#include <deque>
+
#define ABS(a) ((a<0)?-(a):(a))
#define BINARY_OP(opr) (opr == OPR_ADD || opr == OPR_SUB || opr == OPR_MPY ||
opr == OPR_SHL)
@@ -4289,6 +4298,38 @@
}
}
+static WN *Simd_Align_Generate_Peel_MV_Loops(WN *vloop, DO_LOOP_INFO *dli, ST
*first_st)
+{
+ WN *wn_if = NULL;
+ if (first_st &&
+ LNO_Simd_peel_align) {
+ SYMBOL symbol(WN_index(vloop));
+ OPCODE ld_opc = WN_opcode(UBvar(WN_end(vloop)));
+ OPCODE op_lda = OPCODE_make_op(OPR_LDA, Pointer_type, MTYPE_V);
+ WN *wn_lda = WN_CreateLda(op_lda, 0,
+ Make_Pointer_Type(ST_type(first_st)),
+ first_st);
+
+ TYPE_ID exp_type = OPCODE_rtype(ld_opc);
+ WN *align_cond = NULL;
+ WN *wn_align_val = LWN_Make_Icon(exp_type, 15);
+ OPCODE and_opc = OPCODE_make_op(OPR_BAND, exp_type, MTYPE_V);
+ WN *wn_and = LWN_CreateExp2(and_opc, wn_lda, wn_align_val);
+ WN *wn_zero = LWN_Make_Icon(exp_type, 0);
+ OPCODE opeq = OPCODE_make_op(OPR_NE, exp_type, exp_type);
+ align_cond = LWN_CreateExp2(opeq, wn_and, wn_zero);
+
+ WN *stmt_before_loop = WN_prev(vloop);
+ WN *parent_block = LWN_Get_Parent(vloop);
+ wn_if = Version_Loop(vloop);
+ WN_if_test(wn_if) = align_cond;
+ LWN_Insert_Block_After(parent_block, stmt_before_loop, wn_if);
+ LWN_Parentize(wn_if);
+ }
+
+ return wn_if;
+}
+
//generate peeled loop for alignment
static void Simd_Align_Generate_Peel_Loop(WN *vloop, INT best_peel,
DO_LOOP_INFO *dli)
{
@@ -4343,6 +4384,10 @@
Add_Vertices(WN_do_body(ploop));
adg->Fission_Dep_Update(ploop, 1);
adg->Fission_Dep_Update(vloop, 1);
+
+ dli->Loop_Align_Peeled = TRUE;
+ DO_LOOP_INFO* peel_dli = Get_Do_Loop_Info (ploop);
+ peel_dli->Loop_Align_Peeled = TRUE;
}
static INT Simd_Count_Good_Vector(STACK_OF_WN *vec_simd_ops, SIMD_KIND
*simd_op_kind)
@@ -6450,9 +6495,101 @@
}
}
+static ST *Build_Sym_Queues( STACK_OF_WN *vec_simd_ops,
+ std::set<ST*>& counted_load_store_sts,
+ std::map<ST*,std::deque<WN*> >& symbol_wn_map )
+{
+ INT num_loads = 0;
+
+ if (!LNO_Simd_peel_align)
+ return NULL;
+
+ for (INT i=0; i<vec_simd_ops->Elements(); i++){
+ SIMD_KIND simd_kind = simd_op_kind[i];
+
+ if (simd_kind == INVALID)
+ continue;
+
+ WN *simd_op = vec_simd_ops->Top_nth(i);
+ if ((simd_kind == V16I8) && (WN_rtype(simd_op) == MTYPE_F8)) {
+ for (INT kid = 0; kid < WN_kid_count(simd_op); kid ++) {
+ WN *wn = WN_kid(simd_op, kid);
+ if ((WN_operator(wn) == OPR_LDID) || (WN_operator(wn) == OPR_STID)) {
+ ST *st = WN_st(wn);
+ if ((st != NULL) && (ST_class(st) != CLASS_PREG)) {
+ if (symbol_wn_map[st].empty()) {
+ counted_load_store_sts.insert(st);
+ }
+ symbol_wn_map[st].push_front(wn);
+ }
+ }
+ }
+ }
+ }
+
+ // Next stop: Prune the sym queues down to a single entry or no entries.
+ ST *largest_st = NULL;
+ std::set<ST*>::const_iterator counted_load_store_sts_it;
+ for (counted_load_store_sts_it = counted_load_store_sts.begin();
+ counted_load_store_sts_it != counted_load_store_sts.end();
+ ++counted_load_store_sts_it) {
+ ST *st = *counted_load_store_sts_it;
+ num_loads += symbol_wn_map[st].size();
+ if (largest_st == NULL)
+ largest_st = st;
+ else if (symbol_wn_map[largest_st].size() > symbol_wn_map[st].size())
+ largest_st = st;
+ }
+
+ // now remove all the smaller st maps, and do not worry about the race
+ // on the largest, it is sufficient to take the first one
+ for (counted_load_store_sts_it = counted_load_store_sts.begin();
+ counted_load_store_sts_it != counted_load_store_sts.end();
+ ++counted_load_store_sts_it) {
+ ST *st = *counted_load_store_sts_it;
+ if (st != largest_st)
+ symbol_wn_map[st].clear();
+ }
+
+ // Now the final check, what is the ratio of our chosen syms loads vs the total
+ if (largest_st != NULL) {
+ INT best_size = symbol_wn_map[largest_st].size();
+ if (best_size < 3) {
+ // Heuristic: Do not multiversion a vector loop with fewer than 3
+ // alignable loads.
+ symbol_wn_map[largest_st].clear();
+ largest_st = 0;
+ } else if ((num_loads / best_size) > 3) {
+ // Heuristic: If alignable loads are less than 1/3 of all loads, do
+ // not multiversion the vector loop.
+ symbol_wn_map[largest_st].clear();
+ largest_st = 0;
+ }
+ } else {
+ largest_st = 0;
+ }
+ return largest_st;
+}
+
+static void Clear_Sym_Queues( STACK_OF_WN *vec_simd_ops,
+ std::set<ST*>& counted_load_store_sts,
+ std::map<ST*,std::deque<WN*> >& symbol_wn_map )
+{
+ std::set<ST*>::const_iterator counted_load_store_sts_it;
+ for (counted_load_store_sts_it = counted_load_store_sts.begin();
+ counted_load_store_sts_it != counted_load_store_sts.end();
+ ++counted_load_store_sts_it) {
+ ST *st = *counted_load_store_sts_it;
+ symbol_wn_map[st].clear();
+ }
+}
+
// Vectorize an innerloop
static INT Simd(WN* innerloop)
{
+ std::map<ST*,std::deque<WN*> > sym_wn_map;
+ std::set<ST*> counted_load_store_sts;
+
if (!Simd_vect_conf.Arch_Has_Vect ())
return 0;
@@ -6504,6 +6641,16 @@
simd_op_best_align[k] =
CXX_NEW_ARRAY(INT,vec_simd_ops->Elements(),&SIMD_default_pool);
BOOL ubound_variable = Simd_Align_UB_Variable(innerloop);
+
+ INT num_loads = 0;
+ ST *first_st = NULL;
+ if(dli->Loop_Align_Peeled == FALSE){
+ first_st = Build_Sym_Queues(vec_simd_ops,
+ counted_load_store_sts,
+ sym_wn_map);
+ if (first_st)
+ num_loads = sym_wn_map[first_st].size();
+ }
for (INT i=vec_simd_ops->Elements()-1; i >= 0; i--) {
simd_op=vec_simd_ops->Top_nth(i);
@@ -6566,6 +6713,36 @@
Simd_Align_Array_References(vec_simd_ops,simd_op_kind, //align iloads and
istores
simd_op_best_align,best_peel,innerloop);
}
+
+ // Emit multiversion loops for peeled alignment
+ if (LNO_Simd_peel_align && (dli->Loop_Align_Peeled == FALSE)) {
+ if (num_loads) {
+ WN *wn_if = Simd_Align_Generate_Peel_MV_Loops(innerloop, dli, first_st);
+ WN *peel_loop = WN_first(WN_then(wn_if));
+
+ Simd_Align_Generate_Peel_Loop(peel_loop, 1, dli);
+ innerloop = WN_first(WN_else(wn_if));
+ dli = Get_Do_Loop_Info (innerloop);
+ // prevent processing this loop complex again the same way
+ dli->Loop_Align_Peeled = TRUE;
+
+ // now simd-ize the new mv loop pair
+ if (Simd(peel_loop)) {
+ WN *loop_info = WN_do_loop_info(peel_loop);
+ WN_Set_Vectorized(loop_info);
+ WN_Set_Align_Peeled(loop_info);
+ if (Simd(innerloop)) {
+ loop_info = WN_do_loop_info(innerloop);
+ WN_Set_Vectorized(loop_info);
+ }
+ }
+ dli->Loop_Align_Peeled = FALSE;
+ Clear_Sym_Queues(vec_simd_ops, counted_load_store_sts, sym_wn_map);
+ MEM_POOL_Pop(&SIMD_default_pool);
+ return 1;
+ }
+ }
+
//END: Alignment Module
#ifdef Is_True_On //internal debug purpose
Modified: trunk/osprey/common/com/config_lno.cxx
===================================================================
--- trunk/osprey/common/com/config_lno.cxx 2012-04-20 00:01:13 UTC (rev
3914)
+++ trunk/osprey/common/com/config_lno.cxx 2012-04-20 21:30:33 UTC (rev
3915)
@@ -180,10 +180,12 @@
0, /* Fission */
TRUE, /* Serial_distribute */
1, /* Iter_threshold */
+ FALSE, /* Simd_peel_align */
#else
1, /* Fission */
FALSE, /* Serial_distribute */
1, /* Iter_threshold */
+ FALSE, /* Simd_peel_align */
#endif
0, /* Fission_inner_register_limit */
TRUE, /* Forward_substitution */
@@ -404,10 +406,12 @@
0, /* Fission */
TRUE, /* Serial_distribute */
1, /* Iter_threshold */
+ FALSE, /* Simd_peel_align */
#else
1, /* Fission */
FALSE, /* Serial_distribute */
1, /* Iter_threshold */
+ FALSE, /* Simd_peel_align */
#endif
0, /* Fission_inner_register_limit */
TRUE, /* Forward_substitution */
@@ -732,10 +736,12 @@
LNOPT_U32 ( "fission", "fis", 0,0,2, Fission ),
LNOPT_BOOL ( "distribute", NULL, Serial_distribute ),
LNOPT_U32 ( "iter_threshold", NULL, 0,0,16, Iter_threshold ),
+ LNOPT_BOOL ( "simd_peel_align", NULL, Simd_peel_align ),
#else
LNOPT_U32 ( "fission", "fis", 1,0,2, Fission ),
LNOPT_BOOL ( "distribute", NULL, Serial_distribute ),
LNOPT_U32 ( "iter_threshold", NULL, 0,0,16, Iter_threshold ),
+ LNOPT_BOOL ( "simd_peel_align", NULL, Simd_peel_align ),
#endif
LNOPT_U32 ( "fission_inner_register_limit", NULL, 32,0,99999,
Fission_inner_register_limit ),
@@ -1241,5 +1247,12 @@
Mhd_Options.L[i].TLB_Miss_Penalty;
}
}
+
+ if(LNO_Simd_peel_align) {
+ // Do not align peel when unity rem transforms are on
+ if(LNO_Simd_Rm_Unity_Remainder) {
+ LNO_Simd_peel_align = 0;
+ }
+ }
}
Modified: trunk/osprey/common/com/config_lno.h
===================================================================
--- trunk/osprey/common/com/config_lno.h 2012-04-20 00:01:13 UTC (rev
3914)
+++ trunk/osprey/common/com/config_lno.h 2012-04-20 21:30:33 UTC (rev
3915)
@@ -248,6 +248,7 @@
UINT32 Fission;
BOOL Serial_distribute;
UINT32 Iter_threshold;
+ BOOL Simd_peel_align;
UINT32 Fission_inner_register_limit;
BOOL Forward_substitution;
UINT32 Fusion;
@@ -498,6 +499,7 @@
#define LNO_Fission Current_LNO->Fission
#define LNO_Serial_Distribute Current_LNO->Serial_distribute
#define LNO_Iter_threshold Current_LNO->Iter_threshold
+#define LNO_Simd_peel_align Current_LNO->Simd_peel_align
#define LNO_Fission_Inner_Register_Limit \
Current_LNO->Fission_inner_register_limit
#define LNO_Forward_Substitution Current_LNO->Forward_substitution
Modified: trunk/osprey/common/com/wn_core.h
===================================================================
--- trunk/osprey/common/com/wn_core.h 2012-04-20 00:01:13 UTC (rev 3914)
+++ trunk/osprey/common/com/wn_core.h 2012-04-20 21:30:33 UTC (rev 3915)
@@ -1402,6 +1402,9 @@
/* Is the loop vectorized in Simd */
#define WN_LOOP_VECTORIZED 0x10000
+/* Is the loop align peeled in Simd */
+#define WN_LOOP_ALIGN_PEELED 0x20000
+
/* Is the loop an innermost loop */
#define WN_Loop_Innermost(x) (WN_loop_flag(x) & WN_LOOP_INNERMOST)
#define WN_Set_Loop_Innermost(x) (WN_loop_flag(x) |= WN_LOOP_INNERMOST)
@@ -1475,6 +1478,16 @@
#define WN_Reset_Vectorized(x) \
(WN_loop_flag(x) &= ~WN_LOOP_VECTORIZED)
+/* Mark the fact that we peeled a loop for alignment so that the optimizer
+ * and code generator can make use of it.
+ */
+#define WN_Loop_Align_Peeled(x) \
+ (WN_loop_flag(x) & WN_LOOP_ALIGN_PEELED)
+#define WN_Set_Align_Peeled(x) \
+ (WN_loop_flag(x) |= WN_LOOP_ALIGN_PEELED)
+#define WN_Reset_Align_Peeled(x) \
+ (WN_loop_flag(x) &= ~WN_LOOP_ALIGN_PEELED)
+
#define WN_LABEL_HANDLER_BEGIN 0x2
#define WN_Label_Is_Handler_Begin(x) (WN_label_flag(x) & \
WN_LABEL_HANDLER_BEGIN)
------------------------------------------------------------------------------
For Developers, A Lot Can Happen In A Second.
Boundary is the first to Know...and Tell You.
Monitor Your Applications in Ultra-Fine Resolution. Try it FREE!
http://p.sf.net/sfu/Boundary-d2dvs2
_______________________________________________
Open64-devel mailing list
[email protected]
https://lists.sourceforge.net/lists/listinfo/open64-devel