Author: pallavimathew
Date: 2011-04-18 17:53:10 -0400 (Mon, 18 Apr 2011)
New Revision: 3558
Modified:
   trunk/osprey/be/cg/x8664/expand.cxx
   trunk/osprey/be/lno/simd.cxx
   trunk/osprey/common/com/opcode_gen_core.h

Log:
This patch
1. enables vectorization of the BAND, BXOR, and SHL operations, and adds
   support for expanding the vectorized shift-left operation.
2. fixes a bug in vectorizing expressions with a negative loop coefficient.
   An example program is:

      int a[];
      double b[], x;

      void foo (void)
      {
        int i;
        for (i = 0; i < 1000; i++)
          b[i] += a[1000-i] * x;
      }

   Note that for "(double)a[1000-i]":
     - the coefficient of the loop index is -1 (so the access is still
       unit-strided), and
     - sizeof(a[*]) == 4.

   Only *TWO* elements are packed into a single vector: the V16F8 result
   holds two doubles, so only two 4-byte integers are loaded and converted
   per vector. However, the current SIMD implementation takes for granted
   that four elements are involved. On the other hand, since the coefficient
   is negative, the elements of the vector need to be reversed. There was no
   OPCODE capable of reversing two 4-byte elements, hence V8I4V8I4SHUFFLE is
   introduced.

   The wrong vectorized WHIRL of the expression was:

           U8LDA 0 <1,52,a> T<57,anon_ptr.,8>
           U4INTCONST 0 (0x0)
             I4INTCONST 997 (0x3e5)                        /* wrong starting addr */
             I4I4LDID 49 <1,4,.preg_I4> T<4,.predef_I4,4>  # i
            I4SUB
          U8ARRAY 1 4
        V8I4V8I4ILOAD 0 T<4,.predef_I4,4> T<58,anon_ptr.,8>
       V16I4V16I4SHUFFLE 0                                 /* wrong shuffle */
      V16F8V8I4CVT

   With the fix, it is:

           U8LDA 0 <1,52,a> T<57,anon_ptr.,8>
           U4INTCONST 0 (0x0)
             I4INTCONST 999 (0x3e7)
             I4I4LDID 49 <1,4,.preg_I4> T<4,.predef_I4,4>  # i
            I4SUB
          U8ARRAY 1 4
        V8I4V8I4ILOAD 0 T<4,.predef_I4,4> T<58,anon_ptr.,8>
       V8I4V8I4SHUFFLE 0
      V16F8V8I4CVT

Reviewed by Min Zhao and approved by Mei Ye.

Modified: trunk/osprey/be/cg/x8664/expand.cxx
===================================================================
--- trunk/osprey/be/cg/x8664/expand.cxx	2011-04-18 18:42:38 UTC (rev 3557)
+++ trunk/osprey/be/cg/x8664/expand.cxx	2011-04-18 21:53:10 UTC (rev 3558)
@@ -1879,10 +1879,26 @@
     }
   }
 
-  if( OP_NEED_PAIR( mtype ) )
-    Expand_Split_Shift( kind, result, src1, src2, ops );
-  else
-    Build_OP(top, result, src1, src2, ops);
+
+  switch (mtype) {
+  case MTYPE_V16I1:
+    if (kind == shift_left)
+      Build_OP(TOP_psllw, result, src1, src2, ops);
+    break;
+  case MTYPE_V16I2:
+    if (kind == shift_left)
+      Build_OP(TOP_psllw, result, src1, src2, ops);
+    break;
+  case MTYPE_V16I4:
+    if (kind == shift_left)
+      Build_OP(TOP_pslld, result, src1, src2, ops);
+    break;
+  case MTYPE_V16I8:
+    if (kind == shift_left)
+      Build_OP(TOP_psllq, result, src1, src2, ops);
+    break;
+  default:
+    if( OP_NEED_PAIR( mtype ) )
+      Expand_Split_Shift( kind, result, src1, src2, ops );
+    else
+      Build_OP(top, result, src1, src2, ops);
+  }
 }
 
 void
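For reference, the packed shifts selected by the new switch (psllw, pslld,
psllq) shift every element of the first operand by the single count held in
the low 64 bits of the second operand; there is no per-element count. Below
is a minimal standalone sketch of that behavior using SSE2 intrinsics (an
illustration only, not part of the patch):

    // Illustration: pslld shifts all four 32-bit lanes by one shared count.
    #include <emmintrin.h>
    #include <cstdio>

    int main() {
      int a[4] = {1, 2, 3, 4};
      __m128i v = _mm_loadu_si128((const __m128i *)a);
      // The count (3) sits in the low 64 bits of the second operand.
      __m128i r = _mm_sll_epi32(v, _mm_cvtsi32_si128(3));
      int out[4];
      _mm_storeu_si128((__m128i *)out, r);
      printf("%d %d %d %d\n", out[0], out[1], out[2], out[3]);  // 8 16 24 32
      return 0;
    }

This single shared count is what forces the loop-invariance check added to
simd.cxx further below.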
@@ -6502,6 +6518,15 @@
   case OPC_V16I4V16I4SHUFFLE:
     Build_OP(TOP_pshufd, result, op1, Gen_Literal_TN(0x1B, 1), ops);
     break;
+
+  case OPC_V8I4V8I4SHUFFLE:
+  case OPC_V8F4V8F4SHUFFLE:
+    // Transpose elements 0 and 1. The contents of elements 2 and 3 are
+    // immaterial.
+    //
+    Build_OP (TOP_pshufd, result, op1, Gen_Literal_TN(0x1, 1), ops);
+    break;
+
   case OPC_V16I8V16I8SHUFFLE:
   case OPC_V16F8V16F8SHUFFLE:
     if (Is_Target_Orochi() && Is_Target_AVX()) {
@@ -6528,7 +6553,7 @@
     break;
   }
   default:
-    FmtAssert(FALSE, ("NYI"));
+    FmtAssert(FALSE, ("expand %s, NYI", OPCODE_name(opc)));
   }
   return;
 }
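The pshufd immediates above encode, two bits per result element, which source
element to take: 0x1B (0b00011011) selects elements 3,2,1,0 and reverses all
four dwords, while 0x1 (0b00000001) selects 1,0,0,0, transposing elements 0
and 1 and filling the immaterial upper lanes with element 0. A standalone
sketch of the difference (SSE2 intrinsics, not from the patch):

    #include <emmintrin.h>
    #include <cstdio>

    static void dump(__m128i v) {
      int x[4];
      _mm_storeu_si128((__m128i *)x, v);
      printf("%d %d %d %d\n", x[0], x[1], x[2], x[3]);
    }

    int main() {
      // Lanes, low to high: 0 1 2 3.
      __m128i v = _mm_set_epi32(3, 2, 1, 0);
      dump(_mm_shuffle_epi32(v, 0x1B));  // 3 2 1 0 : full four-dword reversal
      dump(_mm_shuffle_epi32(v, 0x01));  // 1 0 0 0 : lanes 0 and 1 swapped;
                                         // lanes 2 and 3 are "immaterial"
      return 0;
    }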
Modified: trunk/osprey/be/lno/simd.cxx
===================================================================
--- trunk/osprey/be/lno/simd.cxx	2011-04-18 18:42:38 UTC (rev 3557)
+++ trunk/osprey/be/lno/simd.cxx	2011-04-18 21:53:10 UTC (rev 3558)
@@ -287,6 +287,13 @@
       return TRUE;
     else
       return FALSE;
+  case OPR_BAND:
+  case OPR_SHL:
+  case OPR_BXOR:
+    if (MTYPE_is_integral(rtype))
+      return TRUE;
+    else
+      return FALSE;
   case OPR_SQRT:
     if (rtype == MTYPE_F4 || rtype == MTYPE_F8)
       return TRUE;
@@ -335,6 +342,14 @@
     opr=WN_operator(wn);
   }
 
+  // Recognize an invariant expression rooted at OPR_SHL. This should
+  // eventually be generalized to any two-operand operation.
+  if (opr == OPR_SHL) {
+    if ((simd_operand_kind(WN_kid0(wn), loop) == Invariant) &&
+        (simd_operand_kind(WN_kid1(wn), loop) == Invariant))
+      return Invariant;
+  }
+
   if (opr==OPR_CONST || opr==OPR_INTCONST) {
     return Invariant;
   } else if (opr==OPR_LDA) {
@@ -734,6 +749,20 @@
     kid1 = WN_kid2(wn);
   }
 
+  // In all vectorized versions of the shift-left operation, psll(w|d|q|dq),
+  // every w/d/q/dq element of the first operand is shifted left by the same
+  // number of bits, given by the second operand. Hence, for a scalar shift
+  // in a loop to be vectorized, the second operand of the shift must be
+  // loop-invariant.
+  if (WN_operator(wn) == OPR_SHL) {
+    SIMD_OPERAND_KIND shl_op_kind =
+      simd_operand_kind(WN_kid1(wn), LWN_Get_Parent(WN_do_body(loop)));
+    if (shl_op_kind != Invariant)
+      return FALSE;
+    // A 128-bit or 8-bit shift cannot be vectorized, since there is no
+    // corresponding vector instruction.
+    if (WN_rtype(wn) == MTYPE_I16 || WN_rtype(wn) == MTYPE_U16 ||
+        WN_rtype(wn) == MTYPE_I1 || WN_rtype(wn) == MTYPE_U1)
+      return FALSE;
+  }
+
   if (OPCODE_is_compare(WN_opcode(wn)) && WN_operator(parent) != OPR_SELECT)
     return FALSE;
@@ -1418,6 +1447,15 @@
   TYPE_ID rtype = WN_rtype(wn);
   TYPE_ID desc = WN_desc(wn);
 
+  // Recognize an invariant sub-expression rooted at OPR_SHL and do not
+  // push it onto the stack of vectorizable operations. This should
+  // eventually be generalized to prevent any two-operand invariant
+  // from being vectorized.
+  if (opr == OPR_SHL &&
+      simd_operand_kind(WN_kid0(wn), LWN_Get_Parent(WN_do_body(loop))) == Invariant &&
+      simd_operand_kind(WN_kid1(wn), LWN_Get_Parent(WN_do_body(loop))) == Invariant)
+    if (is_vectorizable_op(WN_operator(wn), WN_rtype(wn), WN_desc(wn)))
+      return TRUE;
+
   if (opr == OPR_IF || opr == OPR_REGION){
     Report_Non_Vectorizable_Op(wn);
     return FALSE;
@@ -3828,6 +3866,70 @@
   return remainderloop;
 }
 
+// Simd_Handle_Negative_Coefficient_Helper() is a helper function of
+// Simd_Handle_Negative_Coefficient(). It handles a vectorizable
+// expression like "(double)a[i]" where a[i] is either a 4-byte integer
+// or a 4-byte floating-point value.
+//
+static void Simd_Handle_Negative_Coefficient_Helper(
+  WN *parent,      // shuffle's parent
+  INT which_kid,   // which kid
+  WN *array,       // array to shuffle
+  WN *loop,        // the loop
+  BOOL no_shuffle) {
+
+  // Step 1: derive the vector length etc.
+  //
+  // This function is supposed to be called only by
+  // Simd_Handle_Negative_Coefficient().
+  //
+  Is_True (WN_operator(parent) == OPR_CVT, ("wrong caller"));
+
+  INT vect_len = 16/MTYPE_byte_size (WN_rtype(parent));
+  Is_True (vect_len == 2 && WN_element_size(array) == 4,
+           ("For now, this function only handles F8I4CVT and F8F4CVT"));
+
+  ACCESS_ARRAY* aa = (ACCESS_ARRAY*)WN_MAP_Get (LNO_Info_Map, array);
+  Is_True (aa->Dim(aa->Num_Vec()-1)->Loop_Coeff(Do_Loop_Depth(loop)) == -1,
+           ("loop coefficient is not -1"));
+
+  WN *opnd = LWN_Get_Parent(array);
+  TYPE_ID vect_ty = MTYPE_is_float (WN_desc(opnd)) ? MTYPE_V8F4 : MTYPE_V8I4;
+
+  // Step 2: adjust the array index. E.g., if the vectorizable expression
+  // is "(double)a[i]" where sizeof(a[i]) == 4, the index needs to be
+  // decreased by "vector_length - 1". In this case vect_len == 2, so the
+  // final vectorized expression looks like
+  // "V16F8V8I4CVT shuffle (*(V8I4*)&a[i-1])".
+  //
+  TYPE_ID idx_ty = WN_rtype(WN_end(loop));
+  OPCODE adjust = OPCODE_make_op (OPR_INTCONST, idx_ty, MTYPE_V);
+  OPCODE sub_opc = OPCODE_make_op (OPR_SUB,
+                     Mtype_TransferSign (MTYPE_I4, idx_ty), MTYPE_V);
+
+  WN* orig_idx = WN_array_index (array, WN_num_dim(array)-1);
+  WN_array_index (array, WN_num_dim(array)-1) =
+    LWN_CreateExp2 (sub_opc, orig_idx,
+                    WN_CreateIntconst(adjust, vect_len-1));
+
+  LWN_Parentize (array);
+  if (!no_shuffle) {
+    WN_kid (parent, which_kid) =
+      LWN_CreateExp1 (OPCODE_make_op(OPR_SHUFFLE, vect_ty, vect_ty),
+                      WN_kid(parent, which_kid));
+    // "0" means reverse the vector elements. As of this writing, CG does
+    // not respect this parameter -- it blindly reverses the elements
+    // regardless of whether WN_offset() is 0 or not.
+    //
+    // Since the vector involved here (8 bytes) is shorter than what the
+    // underlying machine provides, care must be taken by CG to swap only
+    // elements 0 and 1, instead of all four elements.
+    //
+    WN_offset (WN_kid(parent, which_kid)) = 0;
+  }
+
+  LWN_Parentize(parent);
+}
+
 //handle negative loop coefficient
 static void Simd_Handle_Negative_Coefficient(
     WN *parent,/*shffle's parent*/
@@ -3837,9 +3939,29 @@
     BOOL no_shuffle)
 {
   FmtAssert(WN_element_size(array), ("NYI"));
+
+  ACCESS_ARRAY* aa = (ACCESS_ARRAY*)WN_MAP_Get(LNO_Info_Map, array);
+  if (aa->Dim(aa->Num_Vec()-1)->Loop_Coeff(Do_Loop_Depth(loop)) != -1)
+    return;
+
+  TYPE_ID res_ty = WN_rtype (parent);
+  TYPE_ID desc_ty = WN_desc (parent);
+  if (WN_operator (parent) == OPR_CVT &&
+      MTYPE_is_float(res_ty) &&
+      MTYPE_byte_size(res_ty) != MTYPE_byte_size(desc_ty)) {
+    if (MTYPE_byte_size(res_ty) == 8 && MTYPE_byte_size(desc_ty) == 4) {
+      Simd_Handle_Negative_Coefficient_Helper (parent, which_kid, array,
+                                               loop, no_shuffle);
+      return;
+    } else {
+      FmtAssert (FALSE, ("Don't know how to handle %s",
+                         OPCODE_name (WN_opcode(parent))));
+    }
+  }
+
   INT incr = 16/ABS(WN_element_size(array));
-  ACCESS_ARRAY* aa = (ACCESS_ARRAY*)WN_MAP_Get(LNO_Info_Map,array);
-  if (aa->Dim(aa->Num_Vec()-1)->Loop_Coeff(Do_Loop_Depth(loop))==-1){
+
+  {
     TYPE_ID vector_type;
     WN *opnd = LWN_Get_Parent(array);
     switch(ABS(WN_element_size(array))) {
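To make the index adjustment and two-element shuffle concrete, here is a
hedged scalar model (plain C++, not compiler output; array bounds chosen
only for illustration) of how the example loop from the log executes after
the fix, with vect_len == 2:

    #include <cstdio>

    int a[1001];
    double b[1000], x = 2.0;

    int main() {
      for (int i = 0; i <= 1000; i++) a[i] = i;
      for (int i = 0; i < 1000; i += 2) {
        // V8I4 load at &a[(1000-i) - 1]: index decreased by vect_len-1 = 1.
        int lane0 = a[999 - i];
        int lane1 = a[999 - i + 1];
        // V8I4V8I4SHUFFLE: swap the two lanes to restore descending order.
        int t = lane0; lane0 = lane1; lane1 = t;
        b[i]     += (double)lane0 * x;   // lane0 == a[1000-i]
        b[i + 1] += (double)lane1 * x;   // lane1 == a[1000-(i+1)]
      }
      printf("%g %g\n", b[0], b[1]);     // 2000 1998
      return 0;
    }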
@@ -3952,6 +4074,14 @@
 
 }
 
+// When vectorizing constants and invariants, care must be taken to vectorize
+// the second operand of OPR_SHL appropriately. Most constants/invariants can
+// be vectorized by replicating them into each b/w/d/q of the xmm register,
+// according to the type of the vector. For a packed shift left (psllw/d/q),
+// however, the second operand must always be loaded into the lower 64 bits
+// of the 128-bit xmm register or memory operand. Note that a constant second
+// argument could instead be placed in a 1-byte immediate if it fits; the
+// first option was chosen because it fits more easily into the existing
+// framework.
+
 static WN *Simd_Vectorize_Constants(WN *const_wn,//to be vectorized
                                     WN *istore,  //parent of simd_op
                                     WN *simd_op) //const_wn's parent
@@ -4000,10 +4130,16 @@
     const_wn = WN_CreateConst (OPR_CONST, MTYPE_V16I1, MTYPE_V, sym);
     break;
   case MTYPE_U2: case MTYPE_I2: case MTYPE_V16I2:
-    const_wn = WN_CreateConst (OPR_CONST, MTYPE_V16I2, MTYPE_V, sym);
+    if (WN_operator(simd_op) == OPR_SHL && WN_kid1(simd_op) == const_wn)
+      const_wn = WN_CreateConst (OPR_CONST, MTYPE_V16I8, MTYPE_V, sym);
+    else
+      const_wn = WN_CreateConst (OPR_CONST, MTYPE_V16I2, MTYPE_V, sym);
     break;
   case MTYPE_U4: case MTYPE_I4: case MTYPE_V16I4:
-    const_wn = WN_CreateConst (OPR_CONST, MTYPE_V16I4, MTYPE_V, sym);
+    if (WN_operator(simd_op) == OPR_SHL && WN_kid1(simd_op) == const_wn)
+      const_wn = WN_CreateConst (OPR_CONST, MTYPE_V16I8, MTYPE_V, sym);
+    else
+      const_wn = WN_CreateConst (OPR_CONST, MTYPE_V16I4, MTYPE_V, sym);
     break;
   case MTYPE_U8: case MTYPE_I8: case MTYPE_V16I8:
     const_wn = WN_CreateConst (OPR_CONST, MTYPE_V16I8, MTYPE_V, sym);
@@ -4057,14 +4193,32 @@
                      inv_wn);
     break;
   case MTYPE_V16I2: case MTYPE_U2: case MTYPE_I2:
-    inv_wn =
-      LWN_CreateExp1(OPCODE_make_op(OPR_REPLICATE, MTYPE_V16I2, MTYPE_I2),
-                     inv_wn);
+    if (WN_operator(simd_op) == OPR_SHL && WN_kid1(simd_op) == inv_wn) {
+      WN* cvt_wn =
+        LWN_CreateExp1(OPCODE_make_op(OPR_CVT, MTYPE_I8, MTYPE_I2),
+                       inv_wn);
+      inv_wn =
+        LWN_CreateExp1(OPCODE_make_op(OPR_REPLICATE, MTYPE_V16I8, MTYPE_I8),
+                       cvt_wn);
+    }
+    else
+      inv_wn =
+        LWN_CreateExp1(OPCODE_make_op(OPR_REPLICATE, MTYPE_V16I2, MTYPE_I2),
+                       inv_wn);
     break;
   case MTYPE_V16I4: case MTYPE_U4: case MTYPE_I4:
-    inv_wn =
-      LWN_CreateExp1(OPCODE_make_op(OPR_REPLICATE, MTYPE_V16I4, MTYPE_I4),
-                     inv_wn);
+    if (WN_operator(simd_op) == OPR_SHL && WN_kid1(simd_op) == inv_wn) {
+      WN* cvt_wn =
+        LWN_CreateExp1(OPCODE_make_op(OPR_CVT, MTYPE_I8, MTYPE_I4),
+                       inv_wn);
+      inv_wn =
+        LWN_CreateExp1(OPCODE_make_op(OPR_REPLICATE, MTYPE_V16I8, MTYPE_I8),
+                       cvt_wn);
+    }
+    else
+      inv_wn =
+        LWN_CreateExp1(OPCODE_make_op(OPR_REPLICATE, MTYPE_V16I4, MTYPE_I4),
+                       inv_wn);
     break;
   case MTYPE_V16I8: case MTYPE_U8: case MTYPE_I8:
     inv_wn =

Modified: trunk/osprey/common/com/opcode_gen_core.h
===================================================================
--- trunk/osprey/common/com/opcode_gen_core.h	2011-04-18 18:42:38 UTC (rev 3557)
+++ trunk/osprey/common/com/opcode_gen_core.h	2011-04-18 21:53:10 UTC (rev 3558)
@@ -2706,6 +2706,8 @@
   OPC_V16I2V16I2SHUFFLE = OPR_SHUFFLE + RTYPE(MTYPE_V16I2) + DESC(MTYPE_V16I2),
   OPC_V16I4V16I4SHUFFLE = OPR_SHUFFLE + RTYPE(MTYPE_V16I4) + DESC(MTYPE_V16I4),
   OPC_V16I8V16I8SHUFFLE = OPR_SHUFFLE + RTYPE(MTYPE_V16I8) + DESC(MTYPE_V16I8),
+  OPC_V8I4V8I4SHUFFLE = OPR_SHUFFLE + RTYPE(MTYPE_V8I4) + DESC(MTYPE_V8I4),
+  OPC_V8F4V8F4SHUFFLE = OPR_SHUFFLE + RTYPE(MTYPE_V8F4) + DESC(MTYPE_V8F4),
   OPC_V16F4V16F4SHUFFLE = OPR_SHUFFLE + RTYPE(MTYPE_V16F4) + DESC(MTYPE_V16F4),
   OPC_V16F8V16F8SHUFFLE = OPR_SHUFFLE + RTYPE(MTYPE_V16F8) + DESC(MTYPE_V16F8),
   OPC_V16C8V16C8SHUFFLE = OPR_SHUFFLE + RTYPE(MTYPE_V16C8) + DESC(MTYPE_V16C8),
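Returning to the Simd_Vectorize_Constants/Invariants changes above: the
CVT-to-I8-plus-REPLICATE-as-V16I8 pattern presumably exists because the
psll* count is read from the entire low quadword of the xmm operand, so
replicating a 16- or 32-bit invariant into every lane (as is done for other
invariants) would produce a garbage count. A hedged SSE2 sketch of the
difference (not from the patch):

    #include <emmintrin.h>
    #include <cstdio>

    int main() {
      short a[8] = {1, 2, 3, 4, 5, 6, 7, 8};
      short k = 2;                     // loop-invariant I2 shift count
      __m128i v = _mm_loadu_si128((const __m128i *)a);

      // Correct: widen k to 64 bits so the low quadword holds exactly 2
      // (mirrors CVT I8I2 followed by REPLICATE into V16I8).
      __m128i good = _mm_sll_epi16(v, _mm_cvtsi64_si128((long long)k));

      // Wrong: replicating k into every 16-bit lane makes the low quadword
      // 0x0002000200020002, an enormous count, so every lane becomes 0.
      __m128i bad = _mm_sll_epi16(v, _mm_set1_epi16(k));

      short g[8], b[8];
      _mm_storeu_si128((__m128i *)g, good);
      _mm_storeu_si128((__m128i *)b, bad);
      printf("good: %d %d  bad: %d %d\n", g[0], g[1], b[0], b[1]); // 4 8  0 0
      return 0;
    }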