Author: mlai
Date: 2012-03-13 04:20:27 -0400 (Tue, 13 Mar 2012)
New Revision: 3881
Modified: trunk/osprey/be/cg/cg_flags.cxx trunk/osprey/be/cg/cg_flags.h trunk/osprey/be/cg/cg_loop.cxx trunk/osprey/be/cg/cgdriver.cxx trunk/osprey/be/cg/x8664/cgtarget.cxx Log: Added non-temporal store generation support for single basic block while loops. Extended the option -CG:movnti to include -1 as a means for the user to instruct the compiler to emit non-temporal stores unconditionally. (Behavior of existing functionality is not affected.) Some TOP code cleanup for non-temporal store. Approved by Jian-Xin Lai. Modified: trunk/osprey/be/cg/cg_flags.cxx =================================================================== --- trunk/osprey/be/cg/cg_flags.cxx 2012-03-12 21:10:05 UTC (rev 3880) +++ trunk/osprey/be/cg/cg_flags.cxx 2012-03-13 08:20:27 UTC (rev 3881) @@ -291,7 +291,7 @@ BOOL CG_use_short_form = FALSE; UINT64 CG_p2align_freq = 10000; UINT32 CG_p2align_max_skip_bytes = 3; -UINT32 CG_movnti = 1000; +INT32 CG_movnti = 1000; BOOL CG_use_incdec = TRUE; BOOL CG_use_xortozero = TRUE; // bug 8592 BOOL CG_use_xortozero_Set = FALSE; Modified: trunk/osprey/be/cg/cg_flags.h =================================================================== --- trunk/osprey/be/cg/cg_flags.h 2012-03-12 21:10:05 UTC (rev 3880) +++ trunk/osprey/be/cg/cg_flags.h 2012-03-13 08:20:27 UTC (rev 3881) @@ -881,7 +881,7 @@ extern BOOL CG_compute_to; extern UINT64 CG_p2align_freq; extern UINT32 CG_p2align_max_skip_bytes; -extern UINT32 CG_movnti; +extern INT32 CG_movnti; extern BOOL CG_use_xortozero; extern BOOL CG_use_xortozero_Set; extern BOOL CG_use_incdec; Modified: trunk/osprey/be/cg/cg_loop.cxx =================================================================== --- trunk/osprey/be/cg/cg_loop.cxx 2012-03-12 21:10:05 UTC (rev 3880) +++ trunk/osprey/be/cg/cg_loop.cxx 2012-03-13 08:20:27 UTC (rev 3881) @@ -7442,6 +7442,9 @@ CG_LOOP_Trace_Loop(loop, "*** Before SINGLE_BB_WHILELOOP_UNROLL ***"); cg_loop.Build_CG_LOOP_Info(TRUE); +#ifdef TARG_X8664 + CGTARG_LOOP_Optimize(loop); +#endif 
cg_loop.Determine_Unroll_Factor(); Unroll_Dowhile_Loop(loop, cg_loop.Unroll_factor()); cg_loop.Recompute_Liveness(); Modified: trunk/osprey/be/cg/cgdriver.cxx =================================================================== --- trunk/osprey/be/cg/cgdriver.cxx 2012-03-12 21:10:05 UTC (rev 3880) +++ trunk/osprey/be/cg/cgdriver.cxx 2012-03-13 08:20:27 UTC (rev 3881) @@ -1237,8 +1237,8 @@ { OVK_BOOL, OV_INTERNAL, TRUE, "idivbyconst_opt", "", 0, 0, 0, &CG_idivbyconst_opt, NULL }, { OVK_UINT32, OV_INTERNAL, TRUE, "movnti", "", - 120, 0, UINT32_MAX>>1, &CG_movnti, NULL, - "Use x86-64's movnti instead of mov when writing memory blocks of this size or larger (in KB)" }, + 120, -1, UINT32_MAX>>1, &CG_movnti, NULL, + "Use x86-64's movnti instead of mov when writing memory blocks of this size or larger (in KB). Value of -1 generates movnti unconditionally." }, { OVK_BOOL, OV_INTERNAL, TRUE, "cloop", "", 0, 0, 0, &CG_LOOP_cloop, NULL }, { OVK_BOOL, OV_INTERNAL, TRUE,"use_lddqu", "", Modified: trunk/osprey/be/cg/x8664/cgtarget.cxx =================================================================== --- trunk/osprey/be/cg/x8664/cgtarget.cxx 2012-03-12 21:10:05 UTC (rev 3880) +++ trunk/osprey/be/cg/x8664/cgtarget.cxx 2012-03-13 08:20:27 UTC (rev 3881) @@ -111,6 +111,8 @@ #include "stblock.h" //for Base_Symbol_And_Offset_For_Addressing #include "be_symtab.h" //Preg_Lda +#define UNCONDITIONAL_MOVNTI -1 + UINT32 CGTARG_branch_taken_penalty; BOOL CGTARG_branch_taken_penalty_overridden = FALSE; @@ -377,9 +379,6 @@ case TOP_vstsd: case TOP_vstsd_n32: case TOP_vstsdx: - case TOP_vstntsd: - case TOP_vstntsdx: - case TOP_vstntsdxx: case TOP_vstorelpd: case TOP_vdivxxxsd: case TOP_vfaddxxxsd: @@ -3232,12 +3231,20 @@ case TOP_stsd: return TOP_stntsd; break; case TOP_stsdx: return TOP_stntsdx; break; case TOP_stsdxx: return TOP_stntsdxx; break; - case TOP_vstss: return TOP_vstntss; break; - case TOP_vstssx: return TOP_vstntssx; break; - case TOP_vstssxx: return TOP_vstntssxx; break; - case 
TOP_vstsd: return TOP_vstntsd; break; - case TOP_vstsdx: return TOP_vstntsdx; break; - case TOP_vstsdxx: return TOP_vstntsdxx; break; + + /* following cases asserted by CG_movnti == UNCONDITIONAL_MOVNTI */ + case TOP_stups: return TOP_stntps; break; + case TOP_stupsx: return TOP_stntpsx; break; + case TOP_stupsxx: return TOP_stntpsxx; break; + case TOP_vstups: return TOP_vstntps; break; + case TOP_vstupsx: return TOP_vstntpsx; break; + case TOP_vstupsxx: return TOP_vstntpsxx; break; + case TOP_stupd: return TOP_stntpd; break; + case TOP_stupdx: return TOP_stntpdx; break; + case TOP_stupdxx: return TOP_stntpdxx; break; + case TOP_vstupd: return TOP_vstntpd; break; + case TOP_vstupdx: return TOP_vstntpdx; break; + case TOP_vstupdxx: return TOP_vstntpdxx; break; } FmtAssert(FALSE,("Non-Temporal Store: not supported!")); return TOP_UNDEFINED; @@ -3294,43 +3301,52 @@ { if(CG_movnti==0) return; - UINT32 trip_count = 0; - TN* trip_count_tn = CG_LOOP_Trip_Count(loop); BB* body = LOOP_DESCR_loophead(loop); + OP* op = NULL; - if( trip_count_tn != NULL && - TN_is_constant(trip_count_tn) ){ - trip_count = TN_value( trip_count_tn ); + if (CG_movnti != UNCONDITIONAL_MOVNTI) + { + UINT32 trip_count = 0; + TN* trip_count_tn = CG_LOOP_Trip_Count(loop); - } else { - const ANNOTATION* annot = ANNOT_Get(BB_annotations(body), ANNOT_LOOPINFO); - const LOOPINFO* info = ANNOT_loopinfo(annot); + if( trip_count_tn != NULL && + TN_is_constant(trip_count_tn) ){ + trip_count = TN_value( trip_count_tn ); - trip_count = WN_loop_trip_est(LOOPINFO_wn(info)); - } + } else { + const ANNOTATION* annot = ANNOT_Get(BB_annotations(body), ANNOT_LOOPINFO); + const LOOPINFO* info = NULL; + if (annot != NULL) + info = ANNOT_loopinfo(annot); + if (info != NULL) + trip_count = WN_loop_trip_est(LOOPINFO_wn(info)); + } - OP* op = NULL; - INT64 size = 0; + INT64 size = 0; - Working_Set.Clear(); + Working_Set.Clear(); - /* First, estimate the totol size (in bytes) that this loop will - bring to the cache. 
- */ - FOR_ALL_BB_OPs_FWD( body, op ){ - if(((OP_store( op ) && !TOP_is_nt_store(OP_code(op))) || //stores - OP_load(op) ) && //loads - !Op_In_Working_Set(op)){ //that were not in the working set - size += CGTARG_Mem_Ref_Bytes(op); + /* First, estimate the totol size (in bytes) that this loop will + bring to the cache. + */ + FOR_ALL_BB_OPs_FWD( body, op ){ + if(((OP_store( op ) && !TOP_is_nt_store(OP_code(op))) || //stores + OP_load(op) ) && //loads + !Op_In_Working_Set(op)){ //that were not in the working set + size += CGTARG_Mem_Ref_Bytes(op); + } } - } - size *= trip_count; + size *= trip_count; - const INT64 cache_size = CG_movnti * 1024; + const INT64 cache_size = CG_movnti * 1024; - if( size < cache_size ) - return; + if( size < cache_size ) + return; + } + // if CG_movnti == UNCONDITIONAL_MOVNTI: generate non-temporal stores + // unconditionally + FOR_ALL_BB_OPs_FWD( body, op ){ if( OP_prefetch( op ) ){ /* Get rid of any prefetchw operation, because it "loads the prefetched @@ -3395,6 +3411,17 @@ new_top = Movnti_Top(OP_code(op)); break; } + case TOP_stups: + case TOP_stupsx: + case TOP_stupsxx: + case TOP_vstups: + case TOP_vstupsx: + case TOP_vstupsxx: + { + if (CG_movnti == UNCONDITIONAL_MOVNTI) + new_top = Movnti_Top(OP_code(op)); + break; + } //SSE2 support case TOP_stapd: case TOP_stapdx: @@ -3419,24 +3446,30 @@ new_top = Movnti_Top(OP_code(op)); break; } - //SSE4a support + //SSE4a/SSE41 support case TOP_stss: case TOP_stssx: case TOP_stssxx: case TOP_stsd: case TOP_stsdx: case TOP_stsdxx: - case TOP_vstss: - case TOP_vstssx: - case TOP_vstssxx: - case TOP_vstsd: - case TOP_vstsdx: - case TOP_vstsdxx: { - if(Is_Target_SSE4a()) + if(Is_Target_SSE4a() || Is_Target_SSE41()) new_top = Movnti_Top(OP_code(op)); break; } + case TOP_stupd: + case TOP_stupdx: + case TOP_stupdxx: + case TOP_vstupd: + case TOP_vstupdx: + case TOP_vstupdxx: + { + if (CG_movnti == UNCONDITIONAL_MOVNTI && + (Is_Target_SSE4a() || Is_Target_SSE41())) + new_top = 
Movnti_Top(OP_code(op)); + break; + } }//end switch if( new_top != TOP_UNDEFINED ) ------------------------------------------------------------------------------ Keep Your Developer Skills Current with LearnDevNow! The most comprehensive online learning library for Microsoft developers is just $99.99! Visual Studio, SharePoint, SQL - plus HTML5, CSS3, MVC3, Metro Style Apps, more. Free future releases when you subscribe now! http://p.sf.net/sfu/learndevnow-d2d _______________________________________________ Open64-devel mailing list Open64-devel@lists.sourceforge.net https://lists.sourceforge.net/lists/listinfo/open64-devel