Author: mberg Date: 2012-04-20 17:30:33 -0400 (Fri, 20 Apr 2012) New Revision: 3915
Modified: trunk/osprey/be/cg/annotations.h trunk/osprey/be/cg/bbutil.cxx trunk/osprey/be/cg/cg_loop.cxx trunk/osprey/be/cg/cgemit.cxx trunk/osprey/be/cg/whirl2ops.cxx trunk/osprey/be/lno/can.cxx trunk/osprey/be/lno/lnopt_main.cxx trunk/osprey/be/lno/lnopt_main.h trunk/osprey/be/lno/pf_loop.cxx trunk/osprey/be/lno/simd.cxx trunk/osprey/common/com/config_lno.cxx trunk/osprey/common/com/config_lno.h trunk/osprey/common/com/wn_core.h Log: Adding functionality for multiversioning for alignment. The flag that controls this functionality is -LNO:simd_peel_align=<on|off>, which is off by default. This code creates two versions of select vector loops. One version is peeled for alignment and guarded by a run-time test that checks whether it can be executed; otherwise the original version is executed. CR by Mei Ye. Modified: trunk/osprey/be/cg/annotations.h =================================================================== --- trunk/osprey/be/cg/annotations.h 2012-04-20 00:01:13 UTC (rev 3914) +++ trunk/osprey/be/cg/annotations.h 2012-04-20 21:30:33 UTC (rev 3915) @@ -178,6 +178,7 @@ SRCPOS srcpos; /* source position of start of body */ BOOL is_multiversion; BOOL is_vectorized; /* loop is vectorized (yes/no) */ + BOOL is_align_peeled; /* loop is peeled for alignment (yes/no) */ } LOOPINFO; #define LOOPINFO_wn(x) ((x)->wn) @@ -186,6 +187,7 @@ #define LOOPINFO_trip_count_tn(x) ((x)->trip_count_tn) #define LOOPINFO_multiversion(x) ((x)->is_multiversion) #define LOOPINFO_vectorized(x) ((x)->is_vectorized) +#define LOOPINFO_align_peeled(x) ((x)->is_align_peeled) typedef struct entryinfo { Modified: trunk/osprey/be/cg/bbutil.cxx =================================================================== --- trunk/osprey/be/cg/bbutil.cxx 2012-04-20 00:01:13 UTC (rev 3914) +++ trunk/osprey/be/cg/bbutil.cxx 2012-04-20 21:30:33 UTC (rev 3915) @@ -953,6 +953,7 @@ if (WN_Loop_Up_Trip(loop_info)) fprintf(TFile, "UP_TRIP "); if (LOOPINFO_multiversion(info)) fprintf(TFile, "LMV"); if 
(LOOPINFO_vectorized(info)) fprintf(TFile, "VEC"); + if (LOOPINFO_align_peeled(info)) fprintf(TFile, "ALIGN_PEELED"); fprintf(TFile, "\n"); if (LOOPINFO_trip_count_tn(info)) { fprintf(TFile, " trip count TN = "); Modified: trunk/osprey/be/cg/cg_loop.cxx =================================================================== --- trunk/osprey/be/cg/cg_loop.cxx 2012-04-20 00:01:13 UTC (rev 3914) +++ trunk/osprey/be/cg/cg_loop.cxx 2012-04-20 21:30:33 UTC (rev 3915) @@ -2809,6 +2809,7 @@ LOOPINFO_wn(copied_info) = wn; LOOPINFO_srcpos(copied_info) = LOOPINFO_srcpos(info); LOOPINFO_vectorized(copied_info) = LOOPINFO_vectorized(info); + LOOPINFO_align_peeled(copied_info) = LOOPINFO_align_peeled(info); if (TN_is_constant(trip_count)) LOOPINFO_trip_count_tn(copied_info) = Gen_Literal_TN(new_trip_count_val, TN_size(trip_count)); @@ -2917,6 +2918,7 @@ LOOPINFO_wn(unrolled_info) = wn; LOOPINFO_srcpos(unrolled_info) = LOOPINFO_srcpos(info); LOOPINFO_vectorized(unrolled_info) = LOOPINFO_vectorized(info); + LOOPINFO_align_peeled(unrolled_info) = LOOPINFO_align_peeled(info); if (TN_is_constant(trip_count)) LOOPINFO_trip_count_tn(unrolled_info) = Gen_Literal_TN(new_trip_count_val, TN_size(trip_count)); Modified: trunk/osprey/be/cg/cgemit.cxx =================================================================== --- trunk/osprey/be/cg/cgemit.cxx 2012-04-20 00:01:13 UTC (rev 3914) +++ trunk/osprey/be/cg/cgemit.cxx 2012-04-20 21:30:33 UTC (rev 3915) @@ -4699,6 +4699,14 @@ : ", nesting depth: %d, %siterations: %lld"; fprintf (file, fmt, depth, estimated, trip_count); + + if (LOOPINFO_vectorized(info)) { + fprintf (file, "\n #<loop> vectorized"); + if (LOOPINFO_align_peeled(info)) + fprintf (file, "\n #<loop> vector loop : peeled for alignment"); + } else if (LOOPINFO_align_peeled(info)) { + fprintf (file, "\n #<loop> scalar loop : peeled iter to align"); + } } fputc ('\n', file); Modified: trunk/osprey/be/cg/whirl2ops.cxx =================================================================== --- 
trunk/osprey/be/cg/whirl2ops.cxx 2012-04-20 00:01:13 UTC (rev 3914) +++ trunk/osprey/be/cg/whirl2ops.cxx 2012-04-20 21:30:33 UTC (rev 3915) @@ -7403,6 +7403,7 @@ LOOPINFO_trip_count_tn(info) = trip_tn; LOOPINFO_multiversion(info) = WN_Loop_Multiversion_Alias(loop_info); LOOPINFO_vectorized(info) = WN_Loop_Vectorized(loop_info); + LOOPINFO_align_peeled(info) = WN_Loop_Align_Peeled(loop_info); #ifndef TARG_NVISA if (!CG_PU_Has_Feedback && WN_loop_trip_est(loop_info) == 0) Modified: trunk/osprey/be/lno/can.cxx =================================================================== --- trunk/osprey/be/lno/can.cxx 2012-04-20 00:01:13 UTC (rev 3914) +++ trunk/osprey/be/lno/can.cxx 2012-04-20 21:30:33 UTC (rev 3915) @@ -417,6 +417,7 @@ if (loop_info) { dli->Multiversion_Alias = (WN_Loop_Multiversion_Alias(loop_info) != 0); dli->Loop_Vectorized = (WN_Loop_Vectorized(loop_info) != 0); + dli->Loop_Align_Peeled = (WN_Loop_Align_Peeled(loop_info) != 0); } WN_MAP_Set(LNO_Info_Map,wn,(void *)dli); } else { Modified: trunk/osprey/be/lno/lnopt_main.cxx =================================================================== --- trunk/osprey/be/lno/lnopt_main.cxx 2012-04-20 00:01:13 UTC (rev 3914) +++ trunk/osprey/be/lno/lnopt_main.cxx 2012-04-20 21:30:33 UTC (rev 3915) @@ -2219,6 +2219,7 @@ Has_Barriers = FALSE; Multiversion_Alias = FALSE; Loop_Vectorized = FALSE; + Loop_Align_Peeled = FALSE; Is_Ivdep = FALSE; Is_Concurrent_Call = FALSE; Concurrent_Directive = FALSE; @@ -2328,6 +2329,7 @@ Has_Barriers = dli->Has_Barriers; Multiversion_Alias = dli->Multiversion_Alias; Loop_Vectorized = dli->Loop_Vectorized; + Loop_Align_Peeled = dli->Loop_Align_Peeled; Is_Ivdep = dli->Is_Ivdep; Is_Concurrent_Call = dli->Is_Concurrent_Call; Concurrent_Directive = dli->Concurrent_Directive; Modified: trunk/osprey/be/lno/lnopt_main.h =================================================================== --- trunk/osprey/be/lno/lnopt_main.h 2012-04-20 00:01:13 UTC (rev 3914) +++ trunk/osprey/be/lno/lnopt_main.h 
2012-04-20 21:30:33 UTC (rev 3915) @@ -894,6 +894,7 @@ mBOOL Has_Barriers; mBOOL Multiversion_Alias; mBOOL Loop_Vectorized; // attribute to mark loops which are vectorized + mBOOL Loop_Align_Peeled; // attribute to mark loops that are peeled for align mINT8 Required_Unroll; mINT8 Prefer_Fuse; mINT8 Has_Precom_Def; Modified: trunk/osprey/be/lno/pf_loop.cxx =================================================================== --- trunk/osprey/be/lno/pf_loop.cxx 2012-04-20 00:01:13 UTC (rev 3914) +++ trunk/osprey/be/lno/pf_loop.cxx 2012-04-20 21:30:33 UTC (rev 3915) @@ -999,6 +999,9 @@ LNO_Num_Iters < 100))) single_small_trip_loop = TRUE; } + if (dli->Loop_Align_Peeled && !dli->Loop_Vectorized) + single_small_trip_loop = TRUE; + if ((LNO_Run_Prefetch > SOME_PREFETCH || (LNO_Run_Prefetch == SOME_PREFETCH && !Is_Multi_BB (w))) && // !simple_copy_loop && // bug 8560 disable this Modified: trunk/osprey/be/lno/simd.cxx =================================================================== --- trunk/osprey/be/lno/simd.cxx 2012-04-20 00:01:13 UTC (rev 3914) +++ trunk/osprey/be/lno/simd.cxx 2012-04-20 21:30:33 UTC (rev 3915) @@ -92,6 +92,15 @@ #include "simd_util.h" #include "small_trips.h" // for Remove_Unity_Trip_Loop +#define simd_util_INCLUDED +//#include "simd_util.h" + +#include <vector> +#include <set> +#include <map> +#include <list> +#include <deque> + #define ABS(a) ((a<0)?-(a):(a)) #define BINARY_OP(opr) (opr == OPR_ADD || opr == OPR_SUB || opr == OPR_MPY || opr == OPR_SHL) @@ -4289,6 +4298,38 @@ } } +static WN *Simd_Align_Generate_Peel_MV_Loops(WN *vloop, DO_LOOP_INFO *dli, ST *first_st) +{ + WN *wn_if = NULL; + if (first_st && + LNO_Simd_peel_align) { + SYMBOL symbol(WN_index(vloop)); + OPCODE ld_opc = WN_opcode(UBvar(WN_end(vloop))); + OPCODE op_lda = OPCODE_make_op(OPR_LDA, Pointer_type, MTYPE_V); + WN *wn_lda = WN_CreateLda(op_lda, 0, + Make_Pointer_Type(ST_type(first_st)), + first_st); + + TYPE_ID exp_type = OPCODE_rtype(ld_opc); + WN *align_cond = NULL; + WN 
*wn_align_val = LWN_Make_Icon(exp_type, 15); + OPCODE and_opc = OPCODE_make_op(OPR_BAND, exp_type, MTYPE_V); + WN *wn_and = LWN_CreateExp2(and_opc, wn_lda, wn_align_val); + WN *wn_zero = LWN_Make_Icon(exp_type, 0); + OPCODE opeq = OPCODE_make_op(OPR_NE, exp_type, exp_type); + align_cond = LWN_CreateExp2(opeq, wn_and, wn_zero); + + WN *stmt_before_loop = WN_prev(vloop); + WN *parent_block = LWN_Get_Parent(vloop); + wn_if = Version_Loop(vloop); + WN_if_test(wn_if) = align_cond; + LWN_Insert_Block_After(parent_block, stmt_before_loop, wn_if); + LWN_Parentize(wn_if); + } + + return wn_if; +} + //generate peeled loop for alignment static void Simd_Align_Generate_Peel_Loop(WN *vloop, INT best_peel, DO_LOOP_INFO *dli) { @@ -4343,6 +4384,10 @@ Add_Vertices(WN_do_body(ploop)); adg->Fission_Dep_Update(ploop, 1); adg->Fission_Dep_Update(vloop, 1); + + dli->Loop_Align_Peeled = TRUE; + DO_LOOP_INFO* peel_dli = Get_Do_Loop_Info (ploop); + peel_dli->Loop_Align_Peeled = TRUE; } static INT Simd_Count_Good_Vector(STACK_OF_WN *vec_simd_ops, SIMD_KIND *simd_op_kind) @@ -6450,9 +6495,101 @@ } } +static ST *Build_Sym_Queues( STACK_OF_WN *vec_simd_ops, + std::set<ST*>& counted_load_store_sts, + std::map<ST*,std::deque<WN*> >& symbol_wn_map ) +{ + INT num_loads = 0; + + if (!LNO_Simd_peel_align) + return NULL; + + for (INT i=0; i<vec_simd_ops->Elements(); i++){ + SIMD_KIND simd_kind = simd_op_kind[i]; + + if (simd_kind == INVALID) + continue; + + WN *simd_op = vec_simd_ops->Top_nth(i); + if ((simd_kind == V16I8) && (WN_rtype(simd_op) == MTYPE_F8)) { + for (INT kid = 0; kid < WN_kid_count(simd_op); kid ++) { + WN *wn = WN_kid(simd_op, kid); + if ((WN_operator(wn) == OPR_LDID) || (WN_operator(wn) == OPR_STID)) { + ST *st = WN_st(wn); + if ((st != NULL) && (ST_class(st) != CLASS_PREG)) { + if (symbol_wn_map[st].empty()) { + counted_load_store_sts.insert(st); + } + symbol_wn_map[st].push_front(wn); + } + } + } + } + } + + // Next stop: Prune the sym queues down to a single entry or no 
entries. + ST *largest_st = NULL; + std::set<ST*>::const_iterator counted_load_store_sts_it; + for (counted_load_store_sts_it = counted_load_store_sts.begin(); + counted_load_store_sts_it != counted_load_store_sts.end(); + ++counted_load_store_sts_it) { + ST *st = *counted_load_store_sts_it; + num_loads += symbol_wn_map[st].size(); + if (largest_st == NULL) + largest_st = st; + else if (symbol_wn_map[largest_st].size() > symbol_wn_map[st].size()) + largest_st = st; + } + + // now remove all the smaller st maps, and do not worry about the race + // on the largest, it is sufficient to take the first one + for (counted_load_store_sts_it = counted_load_store_sts.begin(); + counted_load_store_sts_it != counted_load_store_sts.end(); + ++counted_load_store_sts_it) { + ST *st = *counted_load_store_sts_it; + if (st != largest_st) + symbol_wn_map[st].clear(); + } + + // Now the final check, what is the ratio of our chosen syms loads vs the total + if (largest_st != NULL) { + INT best_size = symbol_wn_map[largest_st].size(); + if (best_size < 3) { + // Heuristic: Do not multiversion a vector loop with fewer than 3 + // alignable loads. + symbol_wn_map[largest_st].clear(); + largest_st = 0; + } else if ((num_loads / best_size) > 3) { + // Hueristic: If alignable loads are less than 1/3 of all loads, do + // not multiversion the vector loop. 
+ symbol_wn_map[largest_st].clear(); + largest_st = 0; + } + } else { + largest_st = 0; + } + return largest_st; +} + +static void Clear_Sym_Queues( STACK_OF_WN *vec_simd_ops, + std::set<ST*>& counted_load_store_sts, + std::map<ST*,std::deque<WN*> >& symbol_wn_map ) +{ + std::set<ST*>::const_iterator counted_load_store_sts_it; + for (counted_load_store_sts_it = counted_load_store_sts.begin(); + counted_load_store_sts_it != counted_load_store_sts.end(); + ++counted_load_store_sts_it) { + ST *st = *counted_load_store_sts_it; + symbol_wn_map[st].clear(); + } +} + // Vectorize an innerloop static INT Simd(WN* innerloop) { + std::map<ST*,std::deque<WN*> > sym_wn_map; + std::set<ST*> counted_load_store_sts; + if (!Simd_vect_conf.Arch_Has_Vect ()) return 0; @@ -6504,6 +6641,16 @@ simd_op_best_align[k] = CXX_NEW_ARRAY(INT,vec_simd_ops->Elements(),&SIMD_default_pool); BOOL ubound_variable = Simd_Align_UB_Variable(innerloop); + + INT num_loads = 0; + ST *first_st = NULL; + if(dli->Loop_Align_Peeled == FALSE){ + first_st = Build_Sym_Queues(vec_simd_ops, + counted_load_store_sts, + sym_wn_map); + if (first_st) + num_loads = sym_wn_map[first_st].size(); + } for (INT i=vec_simd_ops->Elements()-1; i >= 0; i--) { simd_op=vec_simd_ops->Top_nth(i); @@ -6566,6 +6713,36 @@ Simd_Align_Array_References(vec_simd_ops,simd_op_kind, //align iloads and istores simd_op_best_align,best_peel,innerloop); } + + // Emit multiversion loops for peeled alignment + if (LNO_Simd_peel_align && (dli->Loop_Align_Peeled == FALSE)) { + if (num_loads) { + WN *wn_if = Simd_Align_Generate_Peel_MV_Loops(innerloop, dli, first_st); + WN *peel_loop = WN_first(WN_then(wn_if)); + + Simd_Align_Generate_Peel_Loop(peel_loop, 1, dli); + innerloop = WN_first(WN_else(wn_if)); + dli = Get_Do_Loop_Info (innerloop); + // prevent processing this loop complex again the same way + dli->Loop_Align_Peeled = TRUE; + + // now simd-ize the new mv loop pair + if (Simd(peel_loop)) { + WN *loop_info = WN_do_loop_info(peel_loop); + 
WN_Set_Vectorized(loop_info); + WN_Set_Align_Peeled(loop_info); + if (Simd(innerloop)) { + loop_info = WN_do_loop_info(innerloop); + WN_Set_Vectorized(loop_info); + } + } + dli->Loop_Align_Peeled = FALSE; + Clear_Sym_Queues(vec_simd_ops, counted_load_store_sts, sym_wn_map); + MEM_POOL_Pop(&SIMD_default_pool); + return 1; + } + } + //END: Alignment Module #ifdef Is_True_On //internal debug purpose Modified: trunk/osprey/common/com/config_lno.cxx =================================================================== --- trunk/osprey/common/com/config_lno.cxx 2012-04-20 00:01:13 UTC (rev 3914) +++ trunk/osprey/common/com/config_lno.cxx 2012-04-20 21:30:33 UTC (rev 3915) @@ -180,10 +180,12 @@ 0, /* Fission */ TRUE, /* Serial_distribute */ 1, /* Iter_threshold */ + FALSE, /* Simd_peel_align */ #else 1, /* Fission */ FALSE, /* Serial_distribute */ 1, /* Iter_threshold */ + FALSE, /* Simd_peel_align */ #endif 0, /* Fission_inner_register_limit */ TRUE, /* Forward_substitution */ @@ -404,10 +406,12 @@ 0, /* Fission */ TRUE, /* Serial_distribute */ 1, /* Iter_threshold */ + FALSE, /* Simd_peel_align */ #else 1, /* Fission */ FALSE, /* Serial_distribute */ 1, /* Iter_threshold */ + FALSE, /* Simd_peel_align */ #endif 0, /* Fission_inner_register_limit */ TRUE, /* Forward_substitution */ @@ -732,10 +736,12 @@ LNOPT_U32 ( "fission", "fis", 0,0,2, Fission ), LNOPT_BOOL ( "distribute", NULL, Serial_distribute ), LNOPT_U32 ( "iter_threshold", NULL, 0,0,16, Iter_threshold ), + LNOPT_BOOL ( "simd_peel_align", NULL, Simd_peel_align ), #else LNOPT_U32 ( "fission", "fis", 1,0,2, Fission ), LNOPT_BOOL ( "distribute", NULL, Serial_distribute ), LNOPT_U32 ( "iter_threshold", NULL, 0,0,16, Iter_threshold ), + LNOPT_BOOL ( "simd_peel_align", NULL, Simd_peel_align ), #endif LNOPT_U32 ( "fission_inner_register_limit", NULL, 32,0,99999, Fission_inner_register_limit ), @@ -1241,5 +1247,12 @@ Mhd_Options.L[i].TLB_Miss_Penalty; } } + + if(LNO_Simd_peel_align) { + // Do not align peel when unity rem 
transforms are on + if(LNO_Simd_Rm_Unity_Remainder) { + LNO_Simd_peel_align = 0; + } + } } Modified: trunk/osprey/common/com/config_lno.h =================================================================== --- trunk/osprey/common/com/config_lno.h 2012-04-20 00:01:13 UTC (rev 3914) +++ trunk/osprey/common/com/config_lno.h 2012-04-20 21:30:33 UTC (rev 3915) @@ -248,6 +248,7 @@ UINT32 Fission; BOOL Serial_distribute; UINT32 Iter_threshold; + BOOL Simd_peel_align; UINT32 Fission_inner_register_limit; BOOL Forward_substitution; UINT32 Fusion; @@ -498,6 +499,7 @@ #define LNO_Fission Current_LNO->Fission #define LNO_Serial_Distribute Current_LNO->Serial_distribute #define LNO_Iter_threshold Current_LNO->Iter_threshold +#define LNO_Simd_peel_align Current_LNO->Simd_peel_align #define LNO_Fission_Inner_Register_Limit \ Current_LNO->Fission_inner_register_limit #define LNO_Forward_Substitution Current_LNO->Forward_substitution Modified: trunk/osprey/common/com/wn_core.h =================================================================== --- trunk/osprey/common/com/wn_core.h 2012-04-20 00:01:13 UTC (rev 3914) +++ trunk/osprey/common/com/wn_core.h 2012-04-20 21:30:33 UTC (rev 3915) @@ -1402,6 +1402,9 @@ /* Is the loop vectorized in Simd */ #define WN_LOOP_VECTORIZED 0x10000 +/* Is the loop align peeled in Simd */ +#define WN_LOOP_ALIGN_PEELED 0x20000 + /* Is the loop an innermost loop */ #define WN_Loop_Innermost(x) (WN_loop_flag(x) & WN_LOOP_INNERMOST) #define WN_Set_Loop_Innermost(x) (WN_loop_flag(x) |= WN_LOOP_INNERMOST) @@ -1475,6 +1478,16 @@ #define WN_Reset_Vectorized(x) \ (WN_loop_flag(x) &= ~WN_LOOP_VECTORIZED) +/* Mark the fact that we peeled a loop for alignment so that the optimizer + * and code generator can make use it. 
+ */ +#define WN_Loop_Align_Peeled(x) \ + (WN_loop_flag(x) & WN_LOOP_ALIGN_PEELED) +#define WN_Set_Align_Peeled(x) \ + (WN_loop_flag(x) |= WN_LOOP_ALIGN_PEELED) +#define WN_Reset_Align_Peeled(x) \ + (WN_loop_flag(x) &= ~WN_LOOP_ALIGN_PEELED) + #define WN_LABEL_HANDLER_BEGIN 0x2 #define WN_Label_Is_Handler_Begin(x) (WN_label_flag(x) & \ WN_LABEL_HANDLER_BEGIN) ------------------------------------------------------------------------------ For Developers, A Lot Can Happen In A Second. Boundary is the first to Know...and Tell You. Monitor Your Applications in Ultra-Fine Resolution. Try it FREE! http://p.sf.net/sfu/Boundary-d2dvs2 _______________________________________________ Open64-devel mailing list Open64-devel@lists.sourceforge.net https://lists.sourceforge.net/lists/listinfo/open64-devel