Author: mberg Date: 2011-07-16 23:36:42 -0400 (Sat, 16 Jul 2011) New Revision: 3695
Modified: trunk/osprey/be/cg/cgemit.cxx trunk/osprey/be/cg/lra.cxx trunk/osprey/be/cg/whirl2ops.cxx trunk/osprey/be/cg/x8664/ebo_special.cxx trunk/osprey/be/cg/x8664/expand.cxx trunk/osprey/common/targ_info/proc/x8664/orochi_si.cxx Log: Updates for: * Alignment usage on BD * EBO register pressure heuristic tuning * horizontal add peephole opts * updates on replicate forms * fma neg additions * CG dep graph updates for BD CR by Jian-Xin Modified: trunk/osprey/be/cg/cgemit.cxx =================================================================== --- trunk/osprey/be/cg/cgemit.cxx 2011-07-15 07:51:29 UTC (rev 3694) +++ trunk/osprey/be/cg/cgemit.cxx 2011-07-17 03:36:42 UTC (rev 3695) @@ -4805,9 +4805,11 @@ */ if(max_skip_bytes > 0) { - if(!Is_Target_Barcelona() && !Is_Target_Orochi() || CG_p2align != 2){ + if(!Is_Target_Barcelona() || CG_p2align != 2){ if (max_skip_bytes > 15) max_skip_bytes = 15; + if(Is_Target_Orochi()) + fprintf(Asm_File, "\t.p2align 3,,\n"); fprintf(Asm_File, "\t.p2align 4,,%d\n", max_skip_bytes); } else Modified: trunk/osprey/be/cg/lra.cxx =================================================================== --- trunk/osprey/be/cg/lra.cxx 2011-07-15 07:51:29 UTC (rev 3694) +++ trunk/osprey/be/cg/lra.cxx 2011-07-17 03:36:42 UTC (rev 3695) @@ -1008,7 +1008,8 @@ LIVE_RANGE *lr = LR_For_TN(tn); if ((LR_def_cnt(lr) == 1) && (LR_upward_exposed_use(lr) == 0)) { if (LR_use_cnt(lr) == 1) { - has_sdsu = true; + // globals are not simple live ranges + has_sdsu = (TN_is_global_reg(tn)) ? has_sdsu : true; } } Modified: trunk/osprey/be/cg/whirl2ops.cxx =================================================================== --- trunk/osprey/be/cg/whirl2ops.cxx 2011-07-15 07:51:29 UTC (rev 3694) +++ trunk/osprey/be/cg/whirl2ops.cxx 2011-07-17 03:36:42 UTC (rev 3695) @@ -4732,6 +4732,61 @@ } static TN* +Handle_Fnma_Operation(WN* expr, TN* result, WN *mul_wn, BOOL mul_kid0) +{ + + WN* add_wn = (mul_kid0) ? 
WN_kid1(expr) : WN_kid0(expr); + TN* opnd0; + TN* opnd1; + TN* opnd2; + TOP opcode; + TYPE_ID rtype = OPCODE_rtype(WN_opcode(expr)); + BOOL is_vector = MTYPE_is_vector(rtype); + + // now match a scalar or vector fma4 + switch (WN_opcode(mul_wn)) { + case OPC_F4MPY: + opcode = TOP_vfnmaddss; + break; + case OPC_F8MPY: + opcode = TOP_vfnmaddsd; + break; + case OPC_V16F4MPY: + case OPC_V16C4MPY: + FmtAssert(is_vector, ("unexpected fma vector form")); + opcode = TOP_vfnmaddps; + break; + case OPC_V16F8MPY: + case OPC_V16C8MPY: + FmtAssert(is_vector, ("unexpected fma vector form")); + opcode = TOP_vfnmaddpd; + break; + default: + FmtAssert(FALSE, ("unexpected fma form")); + break; + } + + opnd2 = Expand_Expr(add_wn, expr, NULL); + opnd1 = Expand_Expr(WN_kid1(mul_wn), mul_wn, NULL); + opnd0 = Expand_Expr(WN_kid0(mul_wn), mul_wn, NULL); + + if(result == NULL) + result = Allocate_Result_TN(expr, NULL); + + // Position tn's from loads on operand 1's position if possible. + if (OPCODE_is_load(WN_opcode(WN_kid0(mul_wn)))) + Build_OP(opcode, result, opnd1, opnd0, opnd2, &New_OPs); + else + Build_OP(opcode, result, opnd0, opnd1, opnd2, &New_OPs); + + // TODO: add operand size check for 256-bit + if (PU_has_avx128 == FALSE) + PU_has_avx128 = TRUE; + + return result; +} + +static TN* Handle_Fms_Operation(WN* expr, TN* result, WN *mul_wn, BOOL mul_kid0) { WN* sub_wn = (mul_kid0) ? 
WN_kid1(expr) : WN_kid0(expr); @@ -5402,7 +5457,11 @@ if (MTYPE_is_float(rtype) || MTYPE_is_vector(rtype)) { if (WN_operator(expr) == OPR_ADD) { return Handle_Fma_Operation(expr, result, mul_wn, FALSE); - } + } else if ((WN_operator(expr) == OPR_SUB) && + (WN_opcode(expr) != OPC_V16C4SUB) && + (WN_opcode(expr) != OPC_V16C8SUB)) { + return Handle_Fnma_Operation(expr, result, mul_wn, FALSE); + } } } } Modified: trunk/osprey/be/cg/x8664/ebo_special.cxx =================================================================== --- trunk/osprey/be/cg/x8664/ebo_special.cxx 2011-07-15 07:51:29 UTC (rev 3694) +++ trunk/osprey/be/cg/x8664/ebo_special.cxx 2011-07-17 03:36:42 UTC (rev 3695) @@ -4827,6 +4827,22 @@ {TOP_vfmsubaddpd, TOP_vfmsubaddxpd, TOP_vfmsubaddxxpd, TOP_vfmsubaddxxxpd, TOP_UNDEFINED}, {TOP_UNDEFINED, TOP_vfmsubaddxrps, TOP_vfmsubaddxxrps, TOP_vfmsubaddxxxrps, TOP_UNDEFINED}, {TOP_UNDEFINED, TOP_vfmsubaddxrpd, TOP_vfmsubaddxxrpd, TOP_vfmsubaddxxxrpd, TOP_UNDEFINED}, + {TOP_vfnmaddss, TOP_vfnmaddxss, TOP_vfnmaddxxss, TOP_vfnmaddxxxss, TOP_UNDEFINED}, + {TOP_vfnmaddsd, TOP_vfnmaddxsd, TOP_vfnmaddxxsd, TOP_vfnmaddxxxsd, TOP_UNDEFINED}, + {TOP_vfnmaddps, TOP_vfnmaddxps, TOP_vfnmaddxxps, TOP_vfnmaddxxxps, TOP_UNDEFINED}, + {TOP_vfnmaddpd, TOP_vfnmaddxpd, TOP_vfnmaddxxpd, TOP_vfnmaddxxxpd, TOP_UNDEFINED}, + {TOP_UNDEFINED, TOP_vfnmaddxrss, TOP_vfnmaddxxrss, TOP_vfnmaddxxxrss, TOP_UNDEFINED}, + {TOP_UNDEFINED, TOP_vfnmaddxrsd, TOP_vfnmaddxxrsd, TOP_vfnmaddxxxrsd, TOP_UNDEFINED}, + {TOP_UNDEFINED, TOP_vfnmaddxrps, TOP_vfnmaddxxrps, TOP_vfnmaddxxxrps, TOP_UNDEFINED}, + {TOP_UNDEFINED, TOP_vfnmaddxrpd, TOP_vfnmaddxxrpd, TOP_vfnmaddxxxrpd, TOP_UNDEFINED}, + {TOP_vfnmsubss, TOP_vfnmsubxss, TOP_vfnmsubxxss, TOP_vfnmsubxxxss, TOP_UNDEFINED}, + {TOP_vfnmsubsd, TOP_vfnmsubxsd, TOP_vfnmsubxxsd, TOP_vfnmsubxxxsd, TOP_UNDEFINED}, + {TOP_vfnmsubps, TOP_vfnmsubxps, TOP_vfnmsubxxps, TOP_vfnmsubxxxps, TOP_UNDEFINED}, + {TOP_vfnmsubpd, TOP_vfnmsubxpd, TOP_vfnmsubxxpd, TOP_vfnmsubxxxpd, 
TOP_UNDEFINED}, + {TOP_UNDEFINED, TOP_vfnmsubxrss, TOP_vfnmsubxxrss, TOP_vfnmsubxxxrss, TOP_UNDEFINED}, + {TOP_UNDEFINED, TOP_vfnmsubxrsd, TOP_vfnmsubxxrsd, TOP_vfnmsubxxxrsd, TOP_UNDEFINED}, + {TOP_UNDEFINED, TOP_vfnmsubxrps, TOP_vfnmsubxxrps, TOP_vfnmsubxxxrps, TOP_UNDEFINED}, + {TOP_UNDEFINED, TOP_vfnmsubxrpd, TOP_vfnmsubxxrpd, TOP_vfnmsubxxxrpd, TOP_UNDEFINED}, {TOP_icall, TOP_icallx, TOP_icallxx, TOP_icallxxx, TOP_UNDEFINED}, {TOP_ijmp, TOP_ijmpx, TOP_ijmpxx, TOP_ijmpxxx, TOP_UNDEFINED}, {TOP_cvtsd2ss, TOP_cvtsd2ss_x, TOP_cvtsd2ss_xx, TOP_cvtsd2ss_xxx, TOP_UNDEFINED}, @@ -9218,6 +9234,17 @@ case TOP_vfmsubaddpd: ret_val = TRUE; break; + case TOP_vfnmaddss: + case TOP_vfnmaddsd: + case TOP_vfnmaddps: + case TOP_vfnmaddpd: + case TOP_vfnmsubss: + case TOP_vfnmsubsd: + case TOP_vfnmsubps: + case TOP_vfnmsubpd: + ret_val = TRUE; + break; + default: ret_val = FALSE; break; @@ -9226,6 +9253,31 @@ return ret_val; } +BOOL EBO_Is_FMA4_NEG( OP* alu_op) +{ + const TOP top = OP_code(alu_op); + BOOL ret_val; + + switch (top) { + case TOP_vfnmaddss: + case TOP_vfnmaddsd: + case TOP_vfnmaddps: + case TOP_vfnmaddpd: + case TOP_vfnmsubss: + case TOP_vfnmsubsd: + case TOP_vfnmsubps: + case TOP_vfnmsubpd: + ret_val = TRUE; + break; + + default: + ret_val = FALSE; + break; + } + + return ret_val; +} + static BOOL EBO_Allowable_Unaligned_Vector( OP *alu_op ) { const TOP top = OP_code(alu_op); @@ -9305,18 +9357,22 @@ break; // fused multiply-subs + case TOP_vfnmaddss: case TOP_vfmsubss: new_mul_top = TOP_vmulss; new_arith_top = TOP_vsubss; break; + case TOP_vfnmaddsd: case TOP_vfmsubsd: new_mul_top = TOP_vmulsd; new_arith_top = TOP_vsubsd; break; + case TOP_vfnmaddps: case TOP_vfmsubps: new_mul_top = TOP_vfmul128v32; new_arith_top = TOP_vfsub128v32; break; + case TOP_vfnmaddpd: case TOP_vfmsubpd: new_mul_top = TOP_vfmul128v64; new_arith_top = TOP_vfsub128v64; @@ -9453,7 +9509,11 @@ ( arith_top != TOP_UNDEFINED ) ){ TN *mul_result = Build_TN_Like(result); OP *mul_op = Mk_OP( 
mul_top, mul_result, mul_opnd1, mul_opnd2 ); - OP *arith_op = Mk_OP( arith_top, result, mul_result, arith_opnd ); + OP *arith_op; + if( EBO_Is_FMA4_NEG( alu_op ) ) + arith_op = Mk_OP( arith_top, result, arith_opnd, mul_result ); + else + arith_op = Mk_OP( arith_top, result, mul_result, arith_opnd ); // Add the mul component of the fma Set_OP_unrolling( mul_op, OP_unrolling(alu_op) ); Modified: trunk/osprey/be/cg/x8664/expand.cxx =================================================================== --- trunk/osprey/be/cg/x8664/expand.cxx 2011-07-15 07:51:29 UTC (rev 3694) +++ trunk/osprey/be/cg/x8664/expand.cxx 2011-07-17 03:36:42 UTC (rev 3695) @@ -5896,7 +5896,7 @@ Build_OP(TOP_shufpd, tmp7, tmp1, tmp1, Gen_Literal_TN(1, 1), ops); if ((CG_opt_level > 1) && Is_Target_Orochi() && Is_Target_AVX() && Is_Target_FMA4()) { - Build_OP(TOP_fhadd128v64, tmp9, tmp3, tmp3, ops); + Expand_Reduce_Add(OPC_F8V16F8REDUCE_ADD, tmp9, tmp3, ops); Build_OP(TOP_shufps, tmp10, src1, src1, Gen_Literal_TN(238, 1), ops); Build_OP(TOP_cvtps2pd, tmp11, tmp10, ops); Build_OP(TOP_vfmaddsubpd, tmp12, tmp7, tmp4, tmp6, ops); @@ -5919,7 +5919,7 @@ Build_OP(TOP_shufpd, tmp22, tmp11, tmp11, Gen_Literal_TN(1, 1), ops); if ((CG_opt_level > 1) && Is_Target_Orochi() && Is_Target_AVX() && Is_Target_FMA4()) { - Build_OP(TOP_fhadd128v64, tmp24, tmp18, tmp18, ops); + Expand_Reduce_Add(OPC_F8V16F8REDUCE_ADD, tmp25, tmp18, ops); Build_OP(TOP_vfmaddsubpd, tmp25, tmp22, tmp19, tmp21, ops); } else { Build_OP(TOP_fmul128v64, tmp23, tmp22, tmp19, ops); @@ -6306,7 +6306,7 @@ { TN* tmp = Build_TN_Like(op1); Build_OP(TOP_movapd, tmp, op1, ops); - if ( Is_Target_SSE3() ) { + if ( Is_Target_SSE3() && !Is_Target_Orochi() ) { Build_OP(TOP_fhadd128v64, result, tmp, tmp, ops); } else { TN* tmp_a = Build_TN_Like(op1); @@ -6319,7 +6319,7 @@ { TN* tmp = Build_TN_Like(op1); Build_OP(TOP_movaps, tmp, op1, ops); - if ( Is_Target_SSE3() ) { + if ( Is_Target_SSE3() && !Is_Target_Orochi() ) { Build_OP(TOP_fhadd128v32, tmp, op1, op1, 
ops); Build_OP(TOP_fhadd128v32, result, tmp, tmp, ops); } else { @@ -6795,17 +6795,34 @@ case TOP_fmovsldupxxx: new_op = TOP_fmovsldup; break; + case TOP_vmovsldup: + case TOP_vmovsldupx: + case TOP_vmovsldupxx: + case TOP_vmovsldupxxx: + new_op = TOP_vmovsldup; + break; case TOP_fmovshdup: case TOP_fmovshdupx: case TOP_fmovshdupxx: case TOP_fmovshdupxxx: new_op = TOP_fmovshdup; break; + case TOP_vmovshdup: + case TOP_vmovshdupx: + case TOP_vmovshdupxx: + case TOP_vmovshdupxxx: + new_op = TOP_vmovshdup; + break; case TOP_fmovddupx: case TOP_fmovddupxx: case TOP_fmovddupxxx: new_op = TOP_fmovddup; break; + case TOP_vmovddupx: + case TOP_vmovddupxx: + case TOP_vmovddupxxx: + new_op = TOP_vmovddup; + break; default: FmtAssert( FALSE, ("Exp_COPY_Ext: Unsupported opcode (%s)", TOP_Name(opcode)) ); Modified: trunk/osprey/common/targ_info/proc/x8664/orochi_si.cxx =================================================================== --- trunk/osprey/common/targ_info/proc/x8664/orochi_si.cxx 2011-07-15 07:51:29 UTC (rev 3694) +++ trunk/osprey/common/targ_info/proc/x8664/orochi_si.cxx 2011-07-17 03:36:42 UTC (rev 3695) @@ -1609,6 +1609,8 @@ TOP_vfaddsub128v32, TOP_vfadd128v64, TOP_vfadd128v32, + TOP_vfsub128v64, + TOP_vfsub128v32, TOP_UNDEFINED); Any_Operand_Access_Time(0); Any_Result_Available_Time(5); @@ -1658,6 +1660,12 @@ TOP_vfaddx128v32, TOP_vfaddxx128v32, TOP_vfaddxxx128v32, + TOP_vfsubx128v64, + TOP_vfsubxx128v64, + TOP_vfsubxxx128v64, + TOP_vfsubx128v32, + TOP_vfsubxx128v32, + TOP_vfsubxxx128v32, TOP_UNDEFINED); Any_Operand_Access_Time(0); Any_Result_Available_Time(10); @@ -4940,8 +4948,6 @@ TOP_vfshuf128v64, TOP_vshufps, TOP_vfshuf128v32, - TOP_vfsub128v64, - TOP_vfsub128v32, TOP_vunpckh128v64, TOP_vunpckh128v32, TOP_vunpckl128v64, @@ -5118,12 +5124,6 @@ Resource_Requirement(res_fstore, 0); Instruction_Group( "avx fp arith mem opnd 4", - TOP_vfsubx128v64, - TOP_vfsubxx128v64, - TOP_vfsubxxx128v64, - TOP_vfsubx128v32, - TOP_vfsubxx128v32, - TOP_vfsubxxx128v32, 
TOP_vfrcpx128v32, TOP_vfrcpxx128v32, TOP_vfrcpxxx128v32, ------------------------------------------------------------------------------ AppSumo Presents a FREE Video for the SourceForge Community by Eric Ries, the creator of the Lean Startup Methodology on "Lean Startup Secrets Revealed." This video shows you how to validate your ideas, optimize your ideas and identify your business strategy. http://p.sf.net/sfu/appsumosfdev2dev _______________________________________________ Open64-devel mailing list Open64-devel@lists.sourceforge.net https://lists.sourceforge.net/lists/listinfo/open64-devel