Author: pallavimathew
Date: 2011-04-18 17:53:10 -0400 (Mon, 18 Apr 2011)
New Revision: 3558

Modified:
   trunk/osprey/be/cg/x8664/expand.cxx
   trunk/osprey/be/lno/simd.cxx
   trunk/osprey/common/com/opcode_gen_core.h
Log:
This patch
1. enables vectorization of BAND, BXOR and SHL operations, and adds
support for expanding the vectorized shift-left operation.
2. fixes a bug in vectorizing expressions with a negative coefficient. An
example program is:
  int a[];
  double b[], x;
  void foo (void) {
      int i;
      for (i = 0; i < 1000; i++)
          b[i] += a[1000-i] * x;
  }

  Note that for "(double)a[1000-i]":
    - the coefficient of the loop index is -1 (so, still unit-strided),
    - and sizeof(a[*]) == 4

  There are only *TWO* elements encapsulated in a single vector. However, the
current SIMD implementation takes for granted that there
  are four elements involved. On the other hand, since the coefficient is
negative, the elements of the vector need to be reversed.
  There was no OPCODE capable of reversing two 4-byte elements, hence 
V8I4V8I4SHUFFLE is introduced.

  The wrong vectorized WHIRL of the expression was:
      1          U8LDA 0 <1,52,a> T<57,anon_ptr.,8>
      2          U4INTCONST 0 (0x0)
      3           I4INTCONST 997 (0x3e5) /* wrong starting addr */
      4           I4I4LDID 49 <1,4,.preg_I4> T<4,.predef_I4,4> # i
      5          I4SUB
      6         U8ARRAY 1 4
      7        V8I4V8I4ILOAD 0 T<4,.predef_I4,4> T<58,anon_ptr.,8>
      8       V16I4V16I4SHUFFLE 0  /* wrong shuffle */
      9      V16F8V8I4CVT

  with the fix it is:
      1          U8LDA 0 <1,52,a> T<57,anon_ptr.,8>
      2          U4INTCONST 0 (0x0)
      3           I4INTCONST 999 (0x3e7)
      4           I4I4LDID 49 <1,4,.preg_I4> T<4,.predef_I4,4> # i
      5          I4SUB
      6         U8ARRAY 1 4
      7        V8I4V8I4ILOAD 0 T<4,.predef_I4,4> T<58,anon_ptr.,8>
      8       V8I4V8I4SHUFFLE 0
      9      V16F8V8I4CVT 

Reviewed by Min Zhao and approved by Mei Ye.


Modified: trunk/osprey/be/cg/x8664/expand.cxx
===================================================================
--- trunk/osprey/be/cg/x8664/expand.cxx 2011-04-18 18:42:38 UTC (rev 3557)
+++ trunk/osprey/be/cg/x8664/expand.cxx 2011-04-18 21:53:10 UTC (rev 3558)
@@ -1879,10 +1879,26 @@
     }
   }
 
-  if( OP_NEED_PAIR( mtype ) )
-    Expand_Split_Shift( kind, result, src1, src2, ops );
-  else
-    Build_OP(top, result, src1, src2, ops);
+
+  switch(mtype) {
+  case MTYPE_V16I1: 
+    if (kind == shift_left)
+      Build_OP(TOP_psllw, result, src1, src2, ops); break;
+  case MTYPE_V16I2: 
+    if (kind == shift_left)
+      Build_OP(TOP_psllw, result, src1, src2, ops); break;
+  case MTYPE_V16I4: 
+    if (kind == shift_left)
+      Build_OP(TOP_pslld, result, src1, src2, ops); break;
+  case MTYPE_V16I8: 
+    if (kind == shift_left)
+      Build_OP(TOP_psllq, result, src1, src2, ops); break;
+  default:
+    if( OP_NEED_PAIR( mtype ) )
+      Expand_Split_Shift( kind, result, src1, src2, ops );
+    else
+      Build_OP(top, result, src1, src2, ops);
+  }
 }
 
 void
@@ -6502,6 +6518,15 @@
   case OPC_V16I4V16I4SHUFFLE:
     Build_OP(TOP_pshufd, result, op1, Gen_Literal_TN(0x1B, 1), ops);
     break;
+
+  case OPC_V8I4V8I4SHUFFLE:
+  case OPC_V8F4V8F4SHUFFLE:
+    // Transpose elements 0 and 1. The content of element 2 and 3 are 
+    // immaterial.
+    //
+    Build_OP (TOP_pshufd, result, op1, Gen_Literal_TN(0x1, 1), ops);
+    break;
+
   case OPC_V16I8V16I8SHUFFLE:
   case OPC_V16F8V16F8SHUFFLE:
     if (Is_Target_Orochi() && Is_Target_AVX()) {
@@ -6528,7 +6553,7 @@
       break;
     }
   default:
-    FmtAssert(FALSE, ("NYI"));
+    FmtAssert(FALSE, ("expand %s, NYI", OPCODE_name(opc)));
   }
   return;
 }

Modified: trunk/osprey/be/lno/simd.cxx
===================================================================
--- trunk/osprey/be/lno/simd.cxx        2011-04-18 18:42:38 UTC (rev 3557)
+++ trunk/osprey/be/lno/simd.cxx        2011-04-18 21:53:10 UTC (rev 3558)
@@ -287,6 +287,13 @@
       return TRUE;
     else
       return FALSE;
+  case OPR_BAND:
+  case OPR_SHL:
+  case OPR_BXOR:
+    if (MTYPE_is_integral(rtype))
+      return TRUE;
+    else
+      return FALSE;    
   case OPR_SQRT:
     if (rtype == MTYPE_F4 || rtype == MTYPE_F8)
       return TRUE;
@@ -335,6 +342,14 @@
     opr=WN_operator(wn);
   }
 
+  // Recognize an invariant expression rooted at OPR_SHL.
+  // Should eventually be generalized to any 2 operand operation.
+  if (opr == OPR_SHL) {
+    if ((simd_operand_kind(WN_kid0(wn), loop) == Invariant) &&
+        (simd_operand_kind(WN_kid1(wn), loop) == Invariant))
+      return Invariant;
+  }
+
   if (opr==OPR_CONST || opr==OPR_INTCONST) {
     return Invariant;
   } else if (opr==OPR_LDA) {
@@ -734,6 +749,20 @@
     kid1 = WN_kid2(wn);
   }
  
+  // For all vectorized versions of the shift-left operation psll(w/d/q/dq), 
+  // each  w/d/q/dq in the first operand is left shifted by the same number 
+  // of bits given by the second argument. Hence for a scalar shift in a 
+  // loop to be vectorized, the second operand to the shift must be a loop 
invariant. 
+  if (WN_operator(wn) == OPR_SHL) {
+    SIMD_OPERAND_KIND shl_op_kind = simd_operand_kind(WN_kid1(wn), 
LWN_Get_Parent(WN_do_body(loop)));
+    if (shl_op_kind != Invariant)
+      return FALSE;
+    // cannot vectorize a 128-bit or 8-bit shift since there is no 
corresponding vectorized instruction.
+    if (WN_rtype(wn) == MTYPE_I16 || WN_rtype(wn) == MTYPE_U16 ||
+       WN_rtype(wn) == MTYPE_I1  || WN_rtype(wn) == MTYPE_U1)
+      return FALSE;
+  }
+
   if (OPCODE_is_compare(WN_opcode(wn)) && WN_operator(parent) != OPR_SELECT)
     return FALSE;
 
@@ -1418,6 +1447,15 @@
   TYPE_ID rtype = WN_rtype(wn);
   TYPE_ID desc = WN_desc(wn);
   
+  // Recognize invariant sub-expression rooted at OPR_SHL and do not
+  // push it onto the stack of vectorizable operations. 
+  // Should eventually be generalized to prevent any 2 operand invariant
+  // from being vectorized.
+  if (opr == OPR_SHL && 
+      simd_operand_kind(WN_kid0(wn), LWN_Get_Parent(WN_do_body(loop))) == 
Invariant &&
+      simd_operand_kind(WN_kid1(wn), LWN_Get_Parent(WN_do_body(loop))) == 
Invariant)
+    if (is_vectorizable_op(WN_operator(wn), WN_rtype(wn), WN_desc(wn)))
+      return TRUE;
   if (opr == OPR_IF || opr == OPR_REGION){
     Report_Non_Vectorizable_Op(wn);
     return FALSE;
@@ -3828,6 +3866,70 @@
  return remainderloop; 
 } 
 
+// Simd_Handle_Negative_Coefficient_Helper() is helper function of 
+// Simd_Handle_Negative_Coefficient(). This function is to handle 
+// the vectorizable expression like "(double)a[i]" where a[i] is 
+// either 4 byte integer or floating point.
+//
+static void Simd_Handle_Negative_Coefficient_Helper(
+  WN *parent,     // shffle's parent
+  INT which_kid,  // which kid 
+  WN *array,      // array to shuffle
+  WN *loop,       // the loop
+  BOOL no_shuffle) {
+
+  // step 1: derive vector length etc
+  //
+
+  // This func is supposed to be called only by 
Simd_Handle_Negative_Coefficient.
+  //
+  Is_True (WN_operator(parent) == OPR_CVT, ("wrong caller"));
+
+  INT vect_len = 16/MTYPE_byte_size (WN_rtype(parent)); 
+  Is_True (vect_len == 2 && WN_element_size(array) == 4, 
+           ("For now, this func only handle F8I4CVT and F8F4CVT"));
+
+  ACCESS_ARRAY* aa = (ACCESS_ARRAY*)WN_MAP_Get (LNO_Info_Map,array);
+  Is_True (aa->Dim(aa->Num_Vec()-1)->Loop_Coeff(Do_Loop_Depth(loop))==-1,
+           ("loop coefficient is not -1"));
+
+  WN *opnd = LWN_Get_Parent(array);
+  TYPE_ID vect_ty = MTYPE_is_float (WN_desc(opnd)) ? MTYPE_V8F4 : MTYPE_V8I4;
+
+  // step 2: adjust array index. e.g. If the vectorizable expression is 
+  //  "(double)a[i]" where sizeof(a[i]) = 4, the index need to subtract by 
+  //  "vector_length - 1". In this case, vect-len = 2, so, the ultimate
+  //  vectorized expression is like "F16F8V8I4CVT shuffle ((*(V8I4*)&a[i-1])).
+  //
+  TYPE_ID idx_ty = WN_rtype(WN_end(loop));
+  OPCODE adjust = OPCODE_make_op (OPR_INTCONST, idx_ty, MTYPE_V);
+  OPCODE sub_opc = OPCODE_make_op (OPR_SUB,
+                    Mtype_TransferSign (MTYPE_I4, idx_ty), MTYPE_V);
+
+  WN* orig_idx = WN_array_index (array, WN_num_dim(array)-1);
+  WN_array_index (array, WN_num_dim(array)-1) = 
+          LWN_CreateExp2 (sub_opc, orig_idx,
+                          WN_CreateIntconst(adjust, vect_len-1));
+
+  LWN_Parentize (array);
+  if (!no_shuffle) {
+    WN_kid (parent, which_kid) = 
+      LWN_CreateExp1 (OPCODE_make_op(OPR_SHUFFLE, vect_ty, vect_ty),
+                             WN_kid(parent, which_kid));
+    // "0" means reverse vector elements. As of I write this note, 
+    // CG doesn't respect this parameter -- it bindly reverses elements 
+    // regardless WN_offset() is 0 or not.
+    //
+    // Since the vector involved here is shorter (8 byte) than underlying 
+    // machine is providing, care must be take by CG to only swap elements 
+    // 0 and 1, instead of all four elements.
+    //
+    WN_offset (WN_kid(parent, which_kid)) = 0; 
+  }
+
+  LWN_Parentize(parent);
+}
+
 //handle negative loop coefficient
 static void Simd_Handle_Negative_Coefficient(
                                       WN *parent,/*shffle's parent*/
@@ -3837,9 +3939,29 @@
                                       BOOL no_shuffle)
 {
   FmtAssert(WN_element_size(array), ("NYI"));
+
+  ACCESS_ARRAY* aa = (ACCESS_ARRAY*)WN_MAP_Get(LNO_Info_Map,array);
+  if (aa->Dim(aa->Num_Vec()-1)->Loop_Coeff(Do_Loop_Depth(loop)) != -1)
+    return;
+
+  TYPE_ID res_ty = WN_rtype (parent);
+  TYPE_ID desc_ty = WN_desc (parent);
+  if (WN_operator (parent) == OPR_CVT && 
+      MTYPE_is_float(res_ty) && 
+      MTYPE_byte_size(res_ty) != MTYPE_byte_size(desc_ty)) {
+    if (MTYPE_byte_size(res_ty) == 8 && MTYPE_byte_size(desc_ty) == 4) {
+      Simd_Handle_Negative_Coefficient_Helper (parent, which_kid, array, 
+                                               loop, no_shuffle);
+      return;
+    } else {
+      FmtAssert (FALSE, ("Don't know how to handle %s", 
+                 OPCODE_name (WN_opcode(parent))));
+    }
+  }
+
   INT incr = 16/ABS(WN_element_size(array));
-  ACCESS_ARRAY* aa = (ACCESS_ARRAY*)WN_MAP_Get(LNO_Info_Map,array);
-  if (aa->Dim(aa->Num_Vec()-1)->Loop_Coeff(Do_Loop_Depth(loop))==-1){
+    
+  {
       TYPE_ID vector_type;
       WN *opnd = LWN_Get_Parent(array);
       switch(ABS(WN_element_size(array))) {
@@ -3952,6 +4074,14 @@
 }
 
 
+// When vectorizing constants and invariants, care must be taken to 
appropriately 
+// vectorize the second operand of OPR_SHL. Most constants/invariant can be 
vectorized 
+// by replicating them in each b/w/d/q of the xmm register as per the type of 
the vector.
+// In the case of packed shift left (psllw/d/q), the second operand must 
always be 
+// loaded into the lower 64-bits of the 128-bit xmm reg or memory.  Note that 
if the 
+// second argument is a constant it can be placed in a 1 byte immediate if it 
fits. 
+// But the first option has been chosen because it fits easier with the 
existing framework.
+
 static WN *Simd_Vectorize_Constants(WN *const_wn,//to be vectorized 
                                     WN *istore,  //parent of simd_op
                                     WN *simd_op) //const_wn's parent
@@ -4000,10 +4130,16 @@
           const_wn = WN_CreateConst (OPR_CONST, MTYPE_V16I1, MTYPE_V, sym);
           break;
      case MTYPE_U2: case MTYPE_I2: case MTYPE_V16I2:
-          const_wn = WN_CreateConst (OPR_CONST, MTYPE_V16I2, MTYPE_V, sym);
+          if (WN_operator(simd_op) == OPR_SHL && WN_kid1(simd_op) == const_wn)
+           const_wn = WN_CreateConst (OPR_CONST, MTYPE_V16I8, MTYPE_V, sym);
+         else
+           const_wn = WN_CreateConst (OPR_CONST, MTYPE_V16I2, MTYPE_V, sym);
           break;
      case MTYPE_U4: case MTYPE_I4: case MTYPE_V16I4:
-          const_wn = WN_CreateConst (OPR_CONST, MTYPE_V16I4, MTYPE_V, sym);
+          if (WN_operator(simd_op) == OPR_SHL && WN_kid1(simd_op) == const_wn)
+           const_wn = WN_CreateConst (OPR_CONST, MTYPE_V16I8, MTYPE_V, sym);
+         else
+           const_wn = WN_CreateConst (OPR_CONST, MTYPE_V16I4, MTYPE_V, sym);
           break;
      case MTYPE_U8: case MTYPE_I8: case MTYPE_V16I8:
           const_wn = WN_CreateConst (OPR_CONST, MTYPE_V16I8, MTYPE_V, sym);
@@ -4057,14 +4193,32 @@
                            inv_wn);
           break;
      case MTYPE_V16I2: case MTYPE_U2: case MTYPE_I2:
-          inv_wn =
-            LWN_CreateExp1(OPCODE_make_op(OPR_REPLICATE, MTYPE_V16I2, 
MTYPE_I2),
-                           inv_wn);
+          if (WN_operator(simd_op) == OPR_SHL && WN_kid1(simd_op) == inv_wn) {
+           WN* cvt_wn = 
+             LWN_CreateExp1(OPCODE_make_op(OPR_CVT, MTYPE_I8, MTYPE_I2),
+                            inv_wn);
+           inv_wn =
+             LWN_CreateExp1(OPCODE_make_op(OPR_REPLICATE, MTYPE_V16I8, 
MTYPE_I8),
+                            cvt_wn);
+         }
+         else
+           inv_wn =
+             LWN_CreateExp1(OPCODE_make_op(OPR_REPLICATE, MTYPE_V16I2, 
MTYPE_I2),
+                            inv_wn);
           break;
      case MTYPE_V16I4: case MTYPE_U4: case MTYPE_I4:
-          inv_wn =
-            LWN_CreateExp1(OPCODE_make_op(OPR_REPLICATE, MTYPE_V16I4, 
MTYPE_I4),
-                           inv_wn);
+          if (WN_operator(simd_op) == OPR_SHL && WN_kid1(simd_op) == inv_wn) {
+           WN* cvt_wn = 
+             LWN_CreateExp1(OPCODE_make_op(OPR_CVT, MTYPE_I8, MTYPE_I4),
+                            inv_wn);
+           inv_wn =
+             LWN_CreateExp1(OPCODE_make_op(OPR_REPLICATE, MTYPE_V16I8, 
MTYPE_I8),
+                            inv_wn);
+         }
+         else
+           inv_wn =
+             LWN_CreateExp1(OPCODE_make_op(OPR_REPLICATE, MTYPE_V16I4, 
MTYPE_I4),
+                            inv_wn);
           break;
      case MTYPE_V16I8: case MTYPE_U8: case MTYPE_I8:
           inv_wn =

Modified: trunk/osprey/common/com/opcode_gen_core.h
===================================================================
--- trunk/osprey/common/com/opcode_gen_core.h   2011-04-18 18:42:38 UTC (rev 
3557)
+++ trunk/osprey/common/com/opcode_gen_core.h   2011-04-18 21:53:10 UTC (rev 
3558)
@@ -2706,6 +2706,8 @@
   OPC_V16I2V16I2SHUFFLE = OPR_SHUFFLE + RTYPE(MTYPE_V16I2) + DESC(MTYPE_V16I2),
   OPC_V16I4V16I4SHUFFLE = OPR_SHUFFLE + RTYPE(MTYPE_V16I4) + DESC(MTYPE_V16I4),
   OPC_V16I8V16I8SHUFFLE = OPR_SHUFFLE + RTYPE(MTYPE_V16I8) + DESC(MTYPE_V16I8),
+  OPC_V8I4V8I4SHUFFLE   = OPR_SHUFFLE + RTYPE(MTYPE_V8I4)  + DESC(MTYPE_V8I4),
+  OPC_V8F4V8F4SHUFFLE   = OPR_SHUFFLE + RTYPE(MTYPE_V8F4)  + DESC(MTYPE_V8F4),
   OPC_V16F4V16F4SHUFFLE = OPR_SHUFFLE + RTYPE(MTYPE_V16F4) + DESC(MTYPE_V16F4),
   OPC_V16F8V16F8SHUFFLE = OPR_SHUFFLE + RTYPE(MTYPE_V16F8) + DESC(MTYPE_V16F8),
   OPC_V16C8V16C8SHUFFLE = OPR_SHUFFLE + RTYPE(MTYPE_V16C8) + DESC(MTYPE_V16C8),


------------------------------------------------------------------------------
Benefiting from Server Virtualization: Beyond Initial Workload 
Consolidation -- Increasing the use of server virtualization is a top
priority.Virtualization can reduce costs, simplify management, and improve 
application availability and disaster protection. Learn more about boosting 
the value of server virtualization. http://p.sf.net/sfu/vmware-sfdev2dev
_______________________________________________
Open64-devel mailing list
Open64-devel@lists.sourceforge.net
https://lists.sourceforge.net/lists/listinfo/open64-devel

Reply via email to