Author: pallavimathew Date: 2011-07-06 15:11:17 -0400 (Wed, 06 Jul 2011) New Revision: 3681
Modified: trunk/osprey/be/cg/cgexp_internals.h trunk/osprey/be/cg/whirl2ops.cxx trunk/osprey/be/cg/x8664/expand.cxx trunk/osprey/be/com/x8664/betarget.cxx trunk/osprey/be/lno/simd.cxx trunk/osprey/common/com/config_lno.cxx trunk/osprey/common/com/config_lno.h trunk/osprey/common/com/opcode_gen_core.cxx trunk/osprey/common/com/opcode_gen_core.h trunk/osprey/common/com/wn_util.cxx trunk/osprey/common/com/wn_util.h Log: This patch introduces a framework for vectorization of IF-statements of the following forms: - if (x != 0) { single_istore_statement } else {empty_body} - if (x != 0) { if (y != 0) {single_istore_statement} else {empty_body}} else {empty_body} This optimization is turned on by default and can be controlled by -LNO:simd_vect_if={on/off}. Sample program: long array[2000000]; void sample() { long i; for(i = 0; i < 2000000; i++) { if (array[i]) array[i] ^= i; } return; } Vectorization of such an if-statement is done by first vectorizing its subexpressions. The result of the vectorized if-condition is computed by 'pcmpeqq' (V16I8V16I8EQ) and is used to select between storing the result of executing the statement in the if-body and leaving the array element unchanged. This selection is performed by the 'pblendvb' (V16I1V16I1SELECT) operation. Both of these are SSE4.1 instructions. This patch extends the supported types of: - OPR_SELECT to include V16I1 (to support generation of the pblendvb operation). - OPR_EQ to include V16I8 (to support generation of the pcmpeqq operation). This patch also recognizes and handles vectorization of invariants rooted at OPR_ADD, OPR_SUB and OPR_MPY. C.R. by Fred Chow, Mei Ye and Jian-Xin Lai. 
Modified: trunk/osprey/be/cg/cgexp_internals.h =================================================================== --- trunk/osprey/be/cg/cgexp_internals.h 2011-07-05 23:09:12 UTC (rev 3680) +++ trunk/osprey/be/cg/cgexp_internals.h 2011-07-06 19:11:17 UTC (rev 3681) @@ -129,6 +129,7 @@ extern void Expand_Max (TN *dest, TN *src1, TN *src2, TYPE_ID mtype, OPS *ops); extern void Expand_MinMax (TN *dest, TN *dest2, TN *src1, TN *src2, TYPE_ID mtype, OPS *ops); extern void Expand_Select (TN *dest_tn, TN *cond_tn, TN *true_tn, TN *false_tn, TYPE_ID mtype, BOOL float_cond, OPS *ops); +extern void Expand_Select_To_Blend (TYPE_ID mtype, TN* result, TN* op0, TN* op1, TN* op2, OPS *ops); extern void Expand_Flop (OPCODE opcode, TN *result, TN *src1, TN *src2, TN *src3, OPS *ops); #ifdef TARG_X8664 Modified: trunk/osprey/be/cg/whirl2ops.cxx =================================================================== --- trunk/osprey/be/cg/whirl2ops.cxx 2011-07-05 23:09:12 UTC (rev 3680) +++ trunk/osprey/be/cg/whirl2ops.cxx 2011-07-06 19:11:17 UTC (rev 3681) @@ -3166,9 +3166,18 @@ WN *compare; VARIANT variant; + if (opcode == OPC_V16I1V16I1SELECT) { + TN* op1 = Expand_Expr(WN_kid0(select), select, NULL); + TN* op2 = Expand_Expr(WN_kid1(select), select, NULL); + TN* op3 = Expand_Expr(WN_kid2(select), select, NULL); + if (result == NULL) + result = Allocate_Result_TN (select, NULL); + + Expand_Select(result, op1, op2, op3, MTYPE_V16I1, FALSE, &New_OPs); //FALSE passed as dummy arg + return result; + } - /* * Expand the true/false before the condition */ Modified: trunk/osprey/be/cg/x8664/expand.cxx =================================================================== --- trunk/osprey/be/cg/x8664/expand.cxx 2011-07-05 23:09:12 UTC (rev 3680) +++ trunk/osprey/be/cg/x8664/expand.cxx 2011-07-06 19:11:17 UTC (rev 3681) @@ -4233,6 +4233,11 @@ BOOL float_cond, OPS *ops) { + if (mtype == MTYPE_V16I1) { + Expand_Select_To_Blend(mtype, dest_tn, cond_tn, true_tn, false_tn, ops); + return; + } + 
Is_True( TN_register_class(cond_tn) == ISA_REGISTER_CLASS_integer, ("Handle this case in Expand_Select") ); const BOOL non_sse2_fp = MTYPE_is_F10(mtype) || @@ -4314,7 +4319,54 @@ } } +//Vector type SELECT are expanded to *blend* operation. +//For now we only handle vector type V16I1. void +Expand_Select_To_Blend (TYPE_ID mtype, TN* result, TN* op0, TN* op1, TN* op2, OPS *ops) +{ + FmtAssert(mtype == MTYPE_V16I1, ("Non-vector type passed to Expand_Select_To_Blend")); + TN* xmm0; + if( Trace_Exp ) { + fprintf(TFile, "expand %s: ", mtype == MTYPE_V16I1? OPCODE_name(OPC_V16I1V16I1SELECT): "***Unsupported opcode***"); + if (result) Print_TN(result,FALSE); + fprintf(TFile, " :- "); + if (op0) Print_TN(op0,FALSE); + fprintf(TFile, " "); + if (op1) Print_TN(op1,FALSE); + fprintf(TFile, " "); + if (op2) Print_TN(op2,FALSE); + fprintf(TFile, " "); + fprintf(TFile, "\n"); + } + + if (!Is_Target_AVX()) { + //pblendvb (non-AVX) uses the 'xmm0' register as an implicit argument containing the mask. + //To build a TN dedicated to reg xmm0, pass value "1" to Build_Dedicated_TN + //instead of "XMM0(enum value of 17)". This avoids a bug in out of bound access + //of the array 'v16_ded_tns' which is size 17. Need to file this bug. + xmm0 = Build_Dedicated_TN(ISA_REGISTER_CLASS_float,1,16); + Exp_COPY(xmm0, op2, ops); + Set_TN_is_global_reg(xmm0); + } + switch(mtype) { + case MTYPE_V16I1: + if (Is_Target_Orochi() && Is_Target_AVX()) + Build_OP(TOP_blendv128v8, result, op0, op1, op2, ops); + else + Build_OP(TOP_blendv128v8, result, op0, xmm0, op1, ops); + break; + default: + FmtAssert(FALSE, + ("Expand_Select_To_Blend: Unsupported mtype (%d)", mtype)); + } + + if (Trace_Exp) { + //Print_OPS appears to be printing extra characters at end of string "into ||| ..." 
+ fprintf(TFile, " into "); Print_OPS (ops); + } +} + +void Expand_Min (TN *dest, TN *src1, TN *src2, TYPE_ID mtype, OPS *ops) { Is_True( !TN_has_value( src1 ), ("Expand_Min: src1 has value") ); Modified: trunk/osprey/be/com/x8664/betarget.cxx =================================================================== --- trunk/osprey/be/com/x8664/betarget.cxx 2011-07-05 23:09:12 UTC (rev 3680) +++ trunk/osprey/be/com/x8664/betarget.cxx 2011-07-06 19:11:17 UTC (rev 3681) @@ -135,6 +135,11 @@ case OPR_TRAP: FmtAssert( FALSE, ("UNIMPLEMENTED") ); + case OPR_EQ: + if(rtype == MTYPE_V16I8) + return TOP_cmpeq128v64; + else + return TOP_UNDEFINED; default: return TOP_UNDEFINED; } Modified: trunk/osprey/be/lno/simd.cxx =================================================================== --- trunk/osprey/be/lno/simd.cxx 2011-07-05 23:09:12 UTC (rev 3680) +++ trunk/osprey/be/lno/simd.cxx 2011-07-06 19:11:17 UTC (rev 3681) @@ -94,8 +94,13 @@ #include "small_trips.h" // for Remove_Unity_Trip_Loop #define ABS(a) ((a<0)?-(a):(a)) +#define BINARY_OP(opr) (opr == OPR_ADD || opr == OPR_SUB || opr == OPR_MPY || opr == OPR_SHL) +static void WN_collect_iloads_nr(std::list<WN *> *wn_list, WN *wn); BOOL debug; +BOOL under_if = FALSE; // sub-expression of an if-statement, + // also indicates if if-vectorization is turned on. 
+BOOL nested_if = FALSE; // sub-expression of a nested-if-statement extern WN *Split_Using_Preg(WN* stmt, WN* simd_op, ARRAY_DIRECTED_GRAPH16* dep_graph, @@ -120,6 +125,7 @@ static REDUCTION_MANAGER *curr_simd_red_manager; static void Simd_Mark_Code (WN* wn); +static TYPE_ID Simd_Get_Vector_Type(WN *istore); static INT Last_Vectorizable_Loop_Id = 0; SIMD_VECTOR_CONF Simd_vect_conf; @@ -263,8 +269,12 @@ case OPR_LT: case OPR_GT: case OPR_LE: case OPR_GE: if (MTYPE_is_float(desc) && MTYPE_is_integral(rtype)) return TRUE; - else + else { + if (under_if && + (opr == OPR_NE && MTYPE_is_integral(desc) && MTYPE_is_integral(rtype))) + return TRUE; return FALSE; + } case OPR_TRUNC: if (rtype == MTYPE_I4 && desc == MTYPE_F4) return TRUE; @@ -357,6 +367,157 @@ } } + +/*Determine whether an if-statement meets criteria for vectorization. + The if-statement can be one of the following forms: + if (x != 0) { single_istore_statement } else {empty_body} + if (x != 0) { if (y !=0) {single_istore_statement} else {empty_body}} else {empty_body} + The check on whether the sub-expressions of if-statement are vectorizable + is done as for other statements by Gather_Vectorizable_Ops_Helper. + To extend the vectorization of if-statements to include triple-nested + statements or more, consider using a bitmask in place of under_if/nested_if. 
+*/ +BOOL Is_Vectorizable_If(WN* wn, BOOL inner) { + BOOL safe; + + FmtAssert(WN_operator(wn) == OPR_IF, ("Not an IF statement\n")); + WN *if_test = WN_if_test(wn); + WN *if_then = WN_then(wn); + WN *if_else = WN_else(wn); + + FmtAssert(WN_operator(if_else) == OPR_BLOCK, ("Else part of IF is not a block")); + if (WN_first(if_else) != NULL) + //non-empty else-clause + return FALSE; + + // if-condition should be of the form (expr != 0) + if (WN_operator(if_test) != OPR_NE) + return FALSE; + if (!((WN_operator(WN_kid1(if_test)) == OPR_INTCONST) && + WN_const_val(WN_kid1(if_test)) == 0)) + return FALSE; + + FmtAssert(WN_operator(if_then) == OPR_BLOCK, ("Then part of IF is not a block")); + if (WN_first(if_then) != WN_last(if_then)) + //multiple statements in the body + return FALSE; + + //Nested-if: if (cond) {if (inner-cond) {...}} + if (WN_operator(WN_first(if_then)) == OPR_IF) { + if (inner) + //triply nested-if. + return FALSE; + if (!Is_Vectorizable_If(WN_first(if_then), TRUE)) + return FALSE; + //Handle nested-ifs only if the vector lengths of cond and inner-cond match, + //since these two conditions will be fused for vectorization. + WN *nested_if_test = WN_if_test(WN_first(if_then)); + int vlength_cond = Simd_Get_Vector_Type(if_test); + int vlength_inner_cond = Simd_Get_Vector_Type(nested_if_test); + if (vlength_cond != vlength_inner_cond) + return FALSE; + //Vector length must be V16I8 in order to use SSE4.1 instruction pcmpeqq and pblendvb + if (vlength_cond != MTYPE_V16I8) + return FALSE; + // The nested-if-condition must not have side effects, since its evaluation is speculative. + if (WN_has_side_effects(nested_if_test)) + return FALSE; + + //Check if iloads in nested-if-condition are safe to speculate. + std::list<WN *> outer_loads, inner_loads; + //Collect iloads in the outer and inner if-conditions. + WN_collect_iloads(&outer_loads, if_test); + //Collect iloads non-recursively since a match for the outer-most iload implies a match for any inner iloads. 
+ WN_collect_iloads_nr(&inner_loads, nested_if_test); + std::list<WN *>::iterator outer_ld_iter, inner_ld_iter; + for (inner_ld_iter = inner_loads.begin(); inner_ld_iter != inner_loads.end(); ++inner_ld_iter) { + WN *inner_iload = *inner_ld_iter; + safe = FALSE; + //For a speculative iload in the inner if-condition to be safe, + //it must also be present in the outer if-condition. + //The if-condition contains a single condition, so no need to check for short-circuit. + for (outer_ld_iter = outer_loads.begin(); outer_ld_iter != outer_loads.end(); ++outer_ld_iter) { + WN *outer_iload = *outer_ld_iter; + if (Tree_Equiv(inner_iload, outer_iload)) { + safe = TRUE; + break; + } + } + if (!safe) + return FALSE; + } + } + else { + //the single statement in the if-body should be an istore. + if (WN_operator(WN_first(if_then)) != OPR_ISTORE) + return FALSE; + //the RHS of the istore must not have side-effects because after vectorization, + //the RHS is always evaluated regardless of the if-condition. + if (WN_has_side_effects(WN_kid0(WN_first(if_then)))) + return FALSE; + + //Check if iloads in if-body are safe to speculate. + WN *outer_if_test = if_test; + if(inner) { + // This is a nested-if. Check for safe speculation must be done against outermost if-condition. + WN *loop = Enclosing_Do_Loop(if_test); + outer_if_test = WN_if_test(WN_first((WN_do_body(loop)))); + } + std::list<WN *> outer_loads, then_loads; + //Collect iloads in the outer if-condition. + WN_collect_iloads(&outer_loads, outer_if_test); + //Collect iloads from RHS of the istore in the if-body. + WN_collect_iloads_nr(&then_loads, WN_kid0(WN_first(if_then))); + std::list<WN *>::iterator outer_ld_iter, then_ld_iter; + for (then_ld_iter = then_loads.begin(); then_ld_iter != then_loads.end(); ++then_ld_iter) { + WN *then_iload = *then_ld_iter; + safe = FALSE; + //For a speculative iload in the if-body to be safe, + //it must also be present in the outer if-condition. 
+ for (outer_ld_iter = outer_loads.begin(); outer_ld_iter != outer_loads.end(); ++outer_ld_iter) { + WN *outer_iload = *outer_ld_iter; + if (Tree_Equiv(then_iload, outer_iload)) { + safe = TRUE; + break; + } + } + if (!safe) + return FALSE; + } + //Check whether the istore in the if-body is safe to speculate. + WN *istore = WN_first(if_then); + //For the speculative istore (in the if-body) to be safe, + //there must be an iload from the same address present in the outer if-condition. + safe = FALSE; + for (outer_ld_iter = outer_loads.begin(); outer_ld_iter != outer_loads.end(); ++outer_ld_iter) { + WN *outer_iload = *outer_ld_iter; + if (Tree_Equiv(WN_kid1(istore), WN_kid0(outer_iload))) { + if ((WN_offset(istore) == WN_offset(outer_iload)) && + (WN_ty(istore) == WN_load_addr_ty(outer_iload)) && + (WN_desc(istore) == WN_desc(outer_iload))) { + safe = TRUE; + break; + } + } + } + if (!safe) + return FALSE; + } + //Vector length of condition and body of if-statement must match + int vlength_cond = Simd_Get_Vector_Type(if_test); + int vlength_body = Simd_Get_Vector_Type(WN_first(if_then)); + if (vlength_cond != vlength_body) + return FALSE; + if (vlength_cond != MTYPE_V16I8) + return FALSE; + + WN *if_parent =LWN_Get_Parent(wn); + if (WN_operator(if_parent) != OPR_BLOCK) + return FALSE; + + return TRUE; +} + extern WN *find_loop_var_in_simple_ub(WN* loop); // defined in vintr_fission.cxx typedef enum { @@ -376,9 +537,9 @@ opr=WN_operator(wn); } - // Recognize an invariant expression rooted at OPR_SHL. + // Recognize 2 operand invariant expressions. // Should eventually be generalized to any 2 operand operation. 
- if (opr == OPR_SHL) { + if (BINARY_OP(opr)) { if ((simd_operand_kind(WN_kid0(wn), loop) == Invariant) && (simd_operand_kind(WN_kid1(wn), loop) == Invariant)) return Invariant; @@ -394,7 +555,6 @@ if (symbol1==symbol2) return Complex; DEF_LIST* def_list=Du_Mgr->Ud_Get_Def(wn); - WN* loop_stmt=def_list->Loop_stmt(); WN* body=WN_do_body(loop); DEF_LIST_ITER d_iter(def_list); for (DU_NODE* dnode=d_iter.First(); !d_iter.Is_Empty(); @@ -958,7 +1118,8 @@ return FALSE; } - if (OPCODE_is_compare(WN_opcode(wn)) && WN_operator(parent) != OPR_SELECT) + if (OPCODE_is_compare(WN_opcode(wn)) && (WN_operator(parent) != OPR_SELECT) + && (!under_if && WN_operator(parent) != OPR_IF)) return FALSE; //Bug 10148: don't vectoorize F8RECIP if it is MPY's child @@ -1029,6 +1190,7 @@ return FALSE; if (WN_operator(parent) != OPR_ISTORE && WN_operator(parent) != OPR_STID && + !(under_if && WN_operator(parent) == OPR_IF) && !is_vectorizable_op(WN_operator(parent), WN_rtype(parent), WN_desc(parent))) return FALSE; @@ -1534,22 +1696,40 @@ { SIMD_KIND smallest_kind = INVALID; + //Should not check under_if in conjunction with OPR_IF here + //since Find_Simd_Kind is not called in the context of a single + //expression but a stack of vectorizable nodes. + for (INT i=0; i<vec_simd_ops->Elements(); i++){ WN* simd_op=vec_simd_ops->Top_nth(i); WN* istore=LWN_Get_Parent(simd_op); // bug 2336 - trace up the correct type while(istore && !OPCODE_is_store(WN_opcode(istore)) && + (WN_operator(istore) != OPR_IF) && WN_operator(istore) != OPR_DO_LOOP) istore = LWN_Get_Parent(istore); - FmtAssert(istore || WN_operator(istore) == OPR_DO_LOOP, ("NYI")); - + FmtAssert(!istore || + WN_operator(istore) == OPR_DO_LOOP || + (WN_operator(istore) == OPR_IF) || + OPCODE_is_store(WN_opcode(istore)), ("NYI")); TYPE_ID type; if (WN_desc(istore) == MTYPE_V) type = WN_rtype(istore); else type = WN_desc(istore); + if (WN_operator(istore) == OPR_IF) { + //simd_op is part of the if-condition. 
+ type = WN_rtype(simd_op); + if (WN_operator(simd_op) == OPR_NE) { + //We're assuming that OPR_NE is the root of the condition + //and none of its sub-expr will contain OPR_NE. + //Further work: We need to ensure that this is indeed the case. + type = WN_desc(simd_op); + } + } + switch(type) { case MTYPE_C4: if (smallest_kind > V16C4) @@ -1646,19 +1826,40 @@ TYPE_ID rtype = WN_rtype(wn); TYPE_ID desc = WN_desc(wn); - // Recognize invariant sub-expression rooted at OPR_SHL and do not - // push it onto the stack of vectorizable operations. + // Recognize 2 operand invariant sub-expression + // and do not push it onto the stack of vectorizable operations. // Should eventually be generalized to prevent any 2 operand invariant // from being vectorized. - if (opr == OPR_SHL && - simd_operand_kind(WN_kid0(wn), LWN_Get_Parent(WN_do_body(loop))) == Invariant && - simd_operand_kind(WN_kid1(wn), LWN_Get_Parent(WN_do_body(loop))) == Invariant) - if (is_vectorizable_op(WN_operator(wn), WN_rtype(wn), WN_desc(wn))) - return TRUE; - if (opr == OPR_IF || opr == OPR_REGION){ + WN* body_parent = LWN_Get_Parent(WN_do_body(loop)); + if (BINARY_OP(opr) && + simd_operand_kind(WN_kid0(wn), body_parent) == Invariant && + simd_operand_kind(WN_kid1(wn), body_parent) == Invariant) + if (is_vectorizable_op(WN_operator(wn), WN_rtype(wn), WN_desc(wn))) { + WN* parent = LWN_Get_Parent(wn); + // Invariant children of a store need to be vectorized as they will not be replicated. + if (parent && !OPCODE_is_store(WN_opcode(parent))) + return TRUE; + } + if (opr == OPR_REGION){ Report_Non_Vectorizable_Op(wn); return FALSE; } + if (WN_operator(wn) == OPR_IF) { + if (!Simd_vect_conf.Is_SSE41() || !LNO_Simd_Vect_If) { + Report_Non_Vectorizable_Op(wn); + return FALSE; + } + if(!Is_Vectorizable_If(wn, FALSE)) { + //ok to always pass FALSE. + //Is_Vectorizable_If will correctly pass TRUE for inner-if on recursive call. 
+ return FALSE; + } + if (!under_if) + under_if = TRUE; + else + nested_if = TRUE; + } + if (is_vectorizable_op(opr, rtype, desc)){ if ((opr != OPR_INTRINSIC_OP && Is_Well_Formed_Simd(wn, loop)) || @@ -1872,6 +2073,13 @@ } } + if (WN_operator(wn) == OPR_IF) { + // Done with processing if statement + if (!nested_if) + under_if = FALSE; + else + nested_if = FALSE; + } return TRUE; } @@ -3198,8 +3406,9 @@ while((stmt1=LWN_Get_Parent(stmt)) != body){ stmt = stmt1; if (WN_opcode(stmt)==OPC_BLOCK){ - under_scf=TRUE; - break; + if (!(LNO_Simd_Vect_If && Simd_vect_conf.Is_SSE41()) || (WN_opcode(LWN_Get_Parent(stmt)) != OPC_IF)) + under_scf=TRUE; + break; } } if (under_scf) @@ -3207,11 +3416,14 @@ TYPE_ID rtype = WN_rtype(simd_op); TYPE_ID desc = WN_desc(simd_op); // CHANGED + if ((LNO_Simd_Vect_If && Simd_vect_conf.Is_SSE41()) && (WN_operator(LWN_Get_Parent(simd_op)) == OPR_IF)) + under_if = TRUE; FmtAssert(is_vectorizable_op(WN_operator(simd_op), rtype, desc), ("Handle this piece")); if (!is_vectorizable_op(WN_operator(simd_op), rtype, desc)) continue; //will never happen due to the above assert - + if ((LNO_Simd_Vect_If && Simd_vect_conf.Is_SSE41()) && (WN_operator(LWN_Get_Parent(simd_op)) == OPR_IF) && (under_if)) + under_if = FALSE; for (INT kid_no=0; kid_no<WN_kid_count(simd_op); kid_no++){ WN* tmp=WN_kid(simd_op,kid_no); SIMD_OPERAND_KIND kind=simd_operand_kind(tmp,LWN_Get_Parent(body)); @@ -4349,12 +4561,23 @@ WN_operator(stmt) != OPR_DO_LOOP && // Bug 5225 - trace up should stop at a CVT or a TRUNC. WN_operator(stmt) != OPR_CVT && + WN_operator(stmt) != OPR_IF && WN_operator(stmt) != OPR_TRUNC) { stmt = LWN_Get_Parent(stmt); } if (!stmt || WN_operator(stmt) == OPR_DO_LOOP) type = WN_rtype(istore); //use parent's desc - else type = WN_desc(stmt); //use store's desc + else { + if(WN_operator(stmt) == OPR_IF) + // istore is (part of) the if-condition since if-body will have ISTORE as parent stmt. + // This returns desc of OPR_NE (root of if-condition). 
+ if (WN_operator(istore) == OPR_IF) + type = WN_desc(WN_kid0(istore)); + else + type = WN_desc(istore); + else + type = WN_desc(stmt); //use store's desc + } } else type = WN_desc(istore);//parent is a store switch(type) { case MTYPE_V16C8: case MTYPE_C8: @@ -4385,6 +4608,8 @@ case MTYPE_U8: vmtype = MTYPE_V16I8; break; + default: + DevWarn("Unexpected type in Simd_Get_Vector_Type"); } return vmtype; } @@ -4433,6 +4658,14 @@ if (WN_operator(simd_op) == OPR_CVT || WN_operator(simd_op) == OPR_TRUNC) type = WN_rtype(const_wn); + if ((LNO_Simd_Vect_If && Simd_vect_conf.Is_SSE41()) && (WN_operator(istore) == OPR_IF)) { + FmtAssert(WN_operator(simd_op) == OPR_NE, ("Condition of OPC_IF must be rooted at OPR_NE")); + //Match the vector type of parent simd_op. + //We know that this constant is a zero and so its vectorized version + //can be made to match the size of the vectorized parent. + type = WN_desc(simd_op); + } + if (WN_operator(simd_op) == OPR_PARM && WN_operator(istore) == OPR_INTRINSIC_OP && WN_intrinsic(istore) == INTRN_SUBSU2) { @@ -4492,8 +4725,13 @@ else type = WN_desc(istore); - if (WN_operator(simd_op) == OPR_CVT || WN_operator(simd_op) == OPR_TRUNC) - type = desc; + OPERATOR opr = WN_operator(inv_wn); + if (WN_operator(simd_op) == OPR_CVT || WN_operator(simd_op) == OPR_TRUNC) { + if (WN_operator(simd_op) == OPR_CVT && (BINARY_OP(opr))) + type = WN_rtype(inv_wn); + else + type = desc; + } switch (type) { case MTYPE_V16C8: case MTYPE_C8: @@ -4511,12 +4749,12 @@ break; case MTYPE_V16F4: case MTYPE_F4: inv_wn = - LWN_CreateExp1(OPCODE_make_op(OPR_REPLICATE, MTYPE_V16F4, desc), + LWN_CreateExp1(OPCODE_make_op(OPR_REPLICATE, MTYPE_V16F4, MTYPE_F4), inv_wn); break; case MTYPE_V16F8: case MTYPE_F8: inv_wn = - LWN_CreateExp1(OPCODE_make_op(OPR_REPLICATE, MTYPE_V16F8, desc), + LWN_CreateExp1(OPCODE_make_op(OPR_REPLICATE, MTYPE_V16F8, MTYPE_F8), inv_wn); break; case MTYPE_V16I1: case MTYPE_U1: case MTYPE_I1: @@ -5183,6 +5421,207 @@ CXX_DELETE(equivalence_class, 
&LNO_local_pool); } +/* + Vectorize an if-statement. + Vectorization of if-statement uses the following SSE4.1 operations: + 1. pcmpeqq: evaluates pairs of if-conditions. + 2. pblendvb: represents the vectorized if-body by selecting the value + to store depending on the result of the pcmpeqq. + + By the time a vectorizable if statement is passed to + Simd_Vectorize_If, its sub-expressions have already been vectorized. + + Original Input to Output of + Simd_Vectorize_If Simd_Vectorize_If + if if vexpr + expr vexpr v0 + 0 v0 veq + ne vne vkid0 + then ===> then ===> kid1 + kid0 vkid0 vblend + kid1 kid1 kid1 + istore vistore vistore +*/ +static void Simd_Vectorize_If(WN *simd_op) { + WN *if_test, *blend_mask, *then_body, *then_istore, *blend_source, + *blend_dest_array, *blend_dest, *blend_parent, *blend, *loop_block, + *inner_if_test, *blend_mask_kid0, *blend_mask_kid1; + BOOL nested = FALSE; + BOOL special = FALSE; + + //step 1: verify that form of the if-statement is as expected. + if_test = WN_if_test(simd_op); + then_body = WN_then(simd_op); + FmtAssert(WN_operator(if_test) == OPR_NE, + ("Unexpected condition in IF-expression\n")); + FmtAssert(WN_rtype(if_test) == MTYPE_V16I8, + ("Unexpected vector type of if-condition")); + FmtAssert(WN_operator(then_body) == OPR_BLOCK, + ("body of if must be a block\n ")); + then_istore = WN_first(then_body); + + //step 2: check for nested-if statement and if so access the istore in the inner body. 
+ if (WN_operator(then_istore) == OPR_IF) { + nested = TRUE; + inner_if_test = WN_if_test(then_istore); + FmtAssert(WN_operator(WN_then(then_istore)) == OPR_BLOCK, + ("body of if must be a block\n ")); + then_istore = WN_first(WN_then(then_istore)); + FmtAssert(WN_operator(inner_if_test) == OPR_NE, + ("Unexpected condition in nested-IF-expression\n")); + } + FmtAssert(WN_operator(then_istore) == OPR_ISTORE, + ("if-body must be an istore\n ")); + + if (!nested) { + //step 3: construct blend mask from the if-condition by + //replacing OPR_NE with OPR_EQ (generates pcmpeqq). + blend_mask = LWN_Copy_Tree(if_test); + LWN_Copy_Def_Use(if_test, blend_mask, Du_Mgr); + FmtAssert(WN_rtype(blend_mask) == MTYPE_V16I8, ("Unexpected rtype for vectorized NE")); + WN_set_operator(blend_mask, OPR_EQ); + WN_desc(blend_mask) == MTYPE_V16I8; + } + else { + // Nested-if: + WN *testval1, *testval2, *testval2kid1, *testval1kid1, *shlkid1, *shlkid2, + *mask1, *mask2, *bitmask, *bitmask_copy; + + testval1 = WN_kid0(if_test); // represents vexpr1 + testval2 = WN_kid0(inner_if_test); //represents vexpr2 + + /* + Step 4: Check if nested-if conditions are of a special form (shown below in original form): + if (x & (1 << c1)) // extract single bit + if (x & (1 << c2)) // extract another single bit + s1; + */ + if (WN_Equiv(testval1, testval2) && WN_operator(testval1) == OPR_BAND) { + if (Tree_Equiv(WN_kid0(testval1), WN_kid0(testval2))) { + testval1kid1 = WN_kid1(testval1); + testval2kid1 = WN_kid1(testval2); + // the sub-expression (1 << c1) is an invariant, so its vectorized form will have OPR_REPLICATE + if (WN_Equiv(testval1kid1, testval2kid1) && WN_operator(testval1kid1) == OPR_REPLICATE) { + shlkid1 = WN_kid0(testval1kid1); + shlkid2 = WN_kid0(testval2kid1); + if (WN_Equiv(shlkid1, shlkid2) && WN_operator(shlkid1) == OPR_SHL) { + if (WN_Equiv(WN_kid0(shlkid1), WN_kid0(shlkid2))) { + if (WN_operator(WN_kid0(shlkid1)) == OPR_INTCONST && + WN_const_val(WN_kid0(shlkid1)) == 1) { + special = 
TRUE; + } + } + } + } + } + } + if (special) { + /* Step 5: Construct blend mask for special if-condition by + converting it to the following equivalent (shown below in original form): + if ((x & (1<<c1 |1<<c2)) == (1<<c1 | 1<<c2)) // extract the two bits simultaneously + and then vectorize. + */ + blend_mask = LWN_Copy_Tree(if_test); + LWN_Copy_Def_Use(if_test, blend_mask, Du_Mgr); + + //bitmask = por(testval1kid1, testval2kid1); + mask1 = LWN_Copy_Tree(testval1kid1); + LWN_Copy_Def_Use(testval1kid1, mask1, Du_Mgr); + mask2 = LWN_Copy_Tree(testval2kid1); + LWN_Copy_Def_Use(testval2kid1, mask2, Du_Mgr); + //bitmask = (1<<c1|1<<c2, 1<<c1|1<<c2) + bitmask = WN_CreateExp2(OPR_BIOR, WN_rtype(mask1), MTYPE_V, + mask1, mask2); + + // make extract = pand(WN_kid0(testval1, bitmask)); + // extract = ((x[i] & (1<<c1|1<<c2)), (x[i+1] & (1<<c1|1<<c2))) + WN_kid1(WN_kid0(blend_mask)) = bitmask; + // blend_mask = pcmpeqq(extract, bitmask); + bitmask_copy = LWN_Copy_Tree(bitmask); + LWN_Copy_Def_Use(bitmask, bitmask_copy, Du_Mgr); + WN_kid1(blend_mask) = bitmask_copy; + WN_set_operator(blend_mask, OPR_EQ); + } + else { + //Step 6: Construct blend mask for non-special nested if-conditions by + //fusing the two condtions with a por. 
+ /* + Input to Output of + Original Simd_Vectorize_If Simd_Vectorize_If + if if + expr1 vexpr1 vexpr1 + 0 v0 v0 + ne vne veq + then ==> then ==> vexpr2 + if if v0 + expr2 vexpr2 veq + 0 v0 vor + ne vne vkid0 + then then kid1 + kid0 vkid0 vblend + kid1 kid1 kid1 + istore vistore vistore + */ + + blend_mask_kid0 = LWN_Copy_Tree(if_test); + LWN_Copy_Def_Use(if_test, blend_mask_kid0, Du_Mgr); + WN_set_operator(blend_mask_kid0, OPR_EQ); + + blend_mask_kid1 = LWN_Copy_Tree(inner_if_test); + LWN_Copy_Def_Use(inner_if_test, blend_mask_kid1, Du_Mgr); + WN_set_operator(blend_mask_kid1, OPR_EQ); + + blend_mask = WN_CreateExp2(OPR_BIOR, WN_rtype(blend_mask_kid1), MTYPE_V, + blend_mask_kid0, blend_mask_kid1); + } + } + + //Step 7: Construct kid1 of blend from LHS of the istore statement. + blend_source = WN_kid0(then_istore); + + //Step 8: Construct kid2 of blend from RHS of the istore statement. + blend_dest_array = LWN_Copy_Tree(WN_kid1(then_istore)); + LWN_Copy_Def_Use(WN_kid1(then_istore), blend_dest_array, Du_Mgr); + blend_dest = WN_Iload(WN_desc(then_istore), WN_offset(then_istore), + Make_Pointer_Type(MTYPE_To_TY(WN_desc(then_istore))), + blend_dest_array, WN_field_id(then_istore)); + + blend_parent = then_istore; + if(special) { + blend = WN_CreateExp3(OPR_SELECT, MTYPE_V16I1, MTYPE_V16I1, + blend_dest, blend_source, blend_mask); + } + else { + blend = WN_CreateExp3(OPR_SELECT, MTYPE_V16I1, MTYPE_V16I1, + blend_source, blend_dest, blend_mask); + } + + //Step 9: Store result of blend is stored in the LHS of the istore + WN_kid0(then_istore) = blend; + LWN_Set_Parent(blend, blend_parent); + LWN_Parentize(blend); + + loop_block = LWN_Get_Parent(simd_op); + LWN_Set_Parent(blend_parent, loop_block); + + //Step 10: Replace the if-statement with the istore expression containing the blend-tree + if (simd_op == WN_first(loop_block)) + WN_first(loop_block) = blend_parent; + else { + WN_next(WN_prev(simd_op)) = blend_parent; + WN_prev(blend_parent) = WN_prev(simd_op); + } + if 
(simd_op == WN_last(loop_block)) { + WN_last(loop_block) = blend_parent; + WN_next(blend_parent) = NULL; + } + else { + WN_next(blend_parent) = WN_next(simd_op); + WN_prev(WN_next(simd_op)) = blend_parent; + } + return; +} + static void Simd_Vectorize_SimdOp_And_Kids(WN *simd_op, TYPE_ID vmtype, BOOL *invarkid) { @@ -5244,6 +5683,18 @@ WN *istore = LWN_Get_Parent(simd_op); if(WN_operator(istore) == OPR_SHUFFLE) istore = LWN_Get_Parent(istore); //up one level + + if(WN_operator(istore) == OPR_IF) { + if (WN_operator(LWN_Get_Parent(istore)) == OPR_BLOCK && + WN_operator(LWN_Get_Parent(LWN_Get_Parent(istore))) == OPR_IF) { + // This is the inner if of a nested-if. Do nothing here as + // Simd_Vectorize_SimdOp_And_Kids on the parent will vectorize this inner if. + // LWN_Get_Parent(LWN_Get_Parent(istore)) is the parent if. + } + else + Simd_Vectorize_If(istore); + } + if (WN_operator(istore) != OPR_STID && WN_operator(istore) != OPR_CVT && WN_operator(istore) != OPR_TRUNC && !OPCODE_is_compare(WN_opcode(istore))) { @@ -6373,3 +6824,22 @@ // Bug 3617 : Num_Vec() from ACCESS_ARRAY may not be in synch with // WN_num_dim(array) dues to delinearization. If we were to access different // kids in array, WN_num_dim(array) is the reliable source to find #kids. + +// Collect the indirect loads in a whirl tree. +// Does not recursively inspect kids of iloads. 
+static void WN_collect_iloads_nr(std::list<WN *> *wn_list, WN *wn) +{ + if (!wn_list || !wn) return; + + if (OPCODE_operator(WN_opcode(wn))==OPR_ILOAD) + wn_list->push_back(wn); + else + { + int i; + for (i = 0; i < WN_kid_count(wn); i++) + { + WN *kid = WN_kid(wn,i); + WN_collect_iloads_nr(wn_list,kid); + } + } +} Modified: trunk/osprey/common/com/config_lno.cxx =================================================================== --- trunk/osprey/common/com/config_lno.cxx 2011-07-05 23:09:12 UTC (rev 3680) +++ trunk/osprey/common/com/config_lno.cxx 2011-07-06 19:11:17 UTC (rev 3681) @@ -179,11 +179,11 @@ #ifdef TARG_X8664 0, /* Fission */ TRUE, /* Serial_distribute */ - 0, /* Iter_threshold */ + 1, /* Iter_threshold */ #else 1, /* Fission */ FALSE, /* Serial_distribute */ - 0, /* Iter_threshold */ + 1, /* Iter_threshold */ #endif 0, /* Fission_inner_register_limit */ TRUE, /* Forward_substitution */ @@ -271,6 +271,7 @@ TRUE, /* Simd_Reduction */ TRUE, /* Simd_Avoid_Fusion */ FALSE, /* Simd_Rm_Unity_Remainder */ + TRUE, /* Simd_Vect_If */ TRUE, /* Run_hoistif */ TRUE, /* Ignore_Feedback */ TRUE, /* Run_unswitch */ @@ -401,11 +402,11 @@ #ifdef TARG_X8664 0, /* Fission */ TRUE, /* Serial_distribute */ - 0, /* Iter_threshold */ + 1, /* Iter_threshold */ #else 1, /* Fission */ FALSE, /* Serial_distribute */ - 0, /* Iter_threshold */ + 1, /* Iter_threshold */ #endif 0, /* Fission_inner_register_limit */ TRUE, /* Forward_substitution */ @@ -492,7 +493,8 @@ FALSE, /* Simd_Verbose */ TRUE, /* Simd_Reduction */ TRUE, /* Simd_Avoid_Fusion */ - FALSE, /* Simd_Rm_Unity_Remainder*/ + FALSE, /* Simd_Rm_Unity_Remainder*/ + TRUE, /* Simd_Vect_If */ TRUE, /* Run_hoistif */ TRUE, /* Ignore_Feedback */ TRUE, /* Run_unswitch */ @@ -879,6 +881,7 @@ LNOPT_BOOL ( "simd_reduction", "simd_red", Simd_Reduction ), LNOPT_BOOL ( "simd_avoid_fusion", NULL, Simd_Avoid_Fusion ), LNOPT_BOOL ( "simd_rm_unity_remainder", NULL, Simd_Rm_Unity_Remainder), + LNOPT_BOOL ( "simd_vect_if", NULL, 
Simd_Vect_If ), LNOPT_BOOL ( "hoistif", NULL, Run_hoistif ), LNOPT_BOOL ( "ignore_feedback", NULL, Ignore_Feedback ), LNOPT_BOOL ( "unswitch", NULL, Run_unswitch ), @@ -1224,5 +1227,12 @@ Mhd_Options.L[i].TLB_Miss_Penalty; } } + + /* Value of 1 for LNO_Iter_threshold is interpreted as default in which case + the flag is set based on target. Otherwise use user-specified value. + */ + if(LNO_Iter_threshold == 1) { + LNO_Iter_threshold = (Is_Target_SSE41())? 8 : 0; + } } Modified: trunk/osprey/common/com/config_lno.h =================================================================== --- trunk/osprey/common/com/config_lno.h 2011-07-05 23:09:12 UTC (rev 3680) +++ trunk/osprey/common/com/config_lno.h 2011-07-06 19:11:17 UTC (rev 3681) @@ -323,6 +323,7 @@ BOOL Simd_Reduction; BOOL Simd_Avoid_Fusion; BOOL Simd_Rm_Unity_Remainder; + BOOL Simd_Vect_If; BOOL Run_hoistif; BOOL Ignore_Feedback; BOOL Run_unswitch; @@ -575,6 +576,7 @@ #define LNO_Simd_Reduction Current_LNO->Simd_Reduction #define LNO_Simd_Avoid_Fusion Current_LNO->Simd_Avoid_Fusion #define LNO_Simd_Rm_Unity_Remainder Current_LNO->Simd_Rm_Unity_Remainder +#define LNO_Simd_Vect_If Current_LNO->Simd_Vect_If #define LNO_Run_hoistif Current_LNO->Run_hoistif #define LNO_Ignore_Feedback Current_LNO->Ignore_Feedback #define LNO_Run_Unswitch Current_LNO->Run_unswitch Modified: trunk/osprey/common/com/opcode_gen_core.cxx =================================================================== --- trunk/osprey/common/com/opcode_gen_core.cxx 2011-07-05 23:09:12 UTC (rev 3680) +++ trunk/osprey/common/com/opcode_gen_core.cxx 2011-07-06 19:11:17 UTC (rev 3681) @@ -3270,9 +3270,13 @@ break; case OPR_SELECT: - // [RTYPE] : b,f,i,p,z [DESC] : V,B + // [RTYPE] : b,f,i,p,z,V16 [DESC] : V,B,V16 valid = Is_MTYPE_b_f_i_p_z [rtype] && (desc == MTYPE_V || desc == MTYPE_B); +#ifdef TARG_X8664 + // add more valid vector types later. 
+ valid = valid || (rtype == MTYPE_V16I1 && desc == MTYPE_V16I1); +#endif break; case OPR_TAS: @@ -3631,7 +3635,7 @@ break; case OPR_SELECT: - // [RTYPE] : b,f,i,p,z [DESC] : V,b + // [RTYPE] : b,f,i,p,z,V16 [DESC] : V,b,V16 sprintf (buffer, "OPC_%s%s%s", MTYPE_name(rtype), desc == MTYPE_V ? "" : MTYPE_name(desc), &OPERATOR_info [opr]._name [4]); break; Modified: trunk/osprey/common/com/opcode_gen_core.h =================================================================== --- trunk/osprey/common/com/opcode_gen_core.h 2011-07-05 23:09:12 UTC (rev 3680) +++ trunk/osprey/common/com/opcode_gen_core.h 2011-07-06 19:11:17 UTC (rev 3681) @@ -1034,6 +1034,11 @@ OPC_U8I16EQ = OPR_EQ + RTYPE(MTYPE_U8) + DESC(MTYPE_I16), OPC_U8U16EQ = OPR_EQ + RTYPE(MTYPE_U8) + DESC(MTYPE_U16), #endif /* TARG_X8664 */ +#ifdef TARG_X8664 + // Note that the result of the comparison of individual elements (for OPC_V16I8V16I8EQ) + // should be either 0 or the bit mask of all 1's (NOT 0 or 1). + OPC_V16I8V16I8EQ = OPR_EQ + RTYPE(MTYPE_V16I8) + DESC(MTYPE_V16I8), +#endif OPC_EVAL = OPR_EVAL + RTYPE(MTYPE_V) + DESC(MTYPE_V), OPC_EXC_SCOPE_BEGIN = OPR_EXC_SCOPE_BEGIN + RTYPE(MTYPE_V) + DESC(MTYPE_V), OPC_EXC_SCOPE_END = OPR_EXC_SCOPE_END + RTYPE(MTYPE_V) + DESC(MTYPE_V), @@ -2351,6 +2356,7 @@ #else OPC_V16F4SELECT = OPR_SELECT + RTYPE(MTYPE_V16F4) + DESC(MTYPE_V), OPC_V16F8SELECT = OPR_SELECT + RTYPE(MTYPE_V16F8) + DESC(MTYPE_V), + OPC_V16I1V16I1SELECT = OPR_SELECT + RTYPE(MTYPE_V16I1) + DESC(MTYPE_V16I1), #endif /* TARG_X8664 */ OPC_BBSELECT = OPR_SELECT + RTYPE(MTYPE_B) + DESC(MTYPE_B), OPC_I4BSELECT = OPR_SELECT + RTYPE(MTYPE_I4) + DESC(MTYPE_B), Modified: trunk/osprey/common/com/wn_util.cxx =================================================================== --- trunk/osprey/common/com/wn_util.cxx 2011-07-05 23:09:12 UTC (rev 3680) +++ trunk/osprey/common/com/wn_util.cxx 2011-07-06 19:11:17 UTC (rev 3681) @@ -1983,3 +1983,23 @@ } } 
+/*********************************************************************** + * + * This routine collects all the indirect loads in a whirl tree. + * + ***********************************************************************/ +void +WN_collect_iloads(std::list<WN*>* wn_list, WN* wn) +{ + if (!wn_list || !wn) return; + + if (WN_operator(wn) == OPR_ILOAD) + wn_list->push_back(wn); + + for (int i = 0; i < WN_kid_count(wn); i++) + { + WN *kid = WN_kid(wn, i); + WN_collect_iloads(wn_list, kid); + } +} + Modified: trunk/osprey/common/com/wn_util.h =================================================================== --- trunk/osprey/common/com/wn_util.h 2011-07-05 23:09:12 UTC (rev 3680) +++ trunk/osprey/common/com/wn_util.h 2011-07-06 19:11:17 UTC (rev 3681) @@ -481,6 +481,7 @@ /* Needed for the STL vector class used below */ #include "vector" +#include "list" #include "mempool_allocator.h" typedef mempool_allocator<WN*> VEC_POOL_ALLOCATOR; @@ -494,6 +495,8 @@ WN_MAP parent_map, BOOL make_compiler_generated); +extern void WN_collect_iloads(std::list<WN*>*, WN*); + #endif /* __cplusplus */ #endif /* wn_util_INCLUDED */ ------------------------------------------------------------------------------ All of the data generated in your IT infrastructure is seriously valuable. Why? It contains a definitive record of application performance, security threats, fraudulent activity, and more. Splunk takes this data and makes sense of it. IT sense. And common sense. http://p.sf.net/sfu/splunk-d2d-c2 _______________________________________________ Open64-devel mailing list Open64-devel@lists.sourceforge.net https://lists.sourceforge.net/lists/listinfo/open64-devel