Hi, The little-endian implementation of vec_sums is incorrect. I had misread the specification and thought that the fields not containing the result value were undefined, but in fact they are defined to contain zero. My previous implementation used a vector splat to copy the field from BE element 3 to LE element 3. The corrected implementation will use a vector shift left to move the field and fill the remaining fields with zeros.
When I fixed this, I discovered I had also missed a use of gen_altivec_vsumsws, which should now use gen_altivec_vsumsws_direct instead. This is fixed in this patch as well. Bootstrapped and tested on powerpc64le-unknown-linux-gnu with no regressions. Bootstrap and regression test on powerpc64-unknown-linux-gnu is in progress. If no big-endian regressions are found, is this ok for trunk? Thanks, Bill gcc: 2014-02-21 Bill Schmidt <wschm...@linux.vnet.ibm.com> * config/rs6000/altivec.md (altivec_vsumsws): Replace second vspltw with vsldoi. (reduc_uplus_v16qi): Use gen_altivec_vsumsws_direct instead of gen_altivec_vsumsws. gcc/testsuite: 2014-02-21 Bill Schmidt <wschm...@linux.vnet.ibm.com> * gcc.dg/vmx/vsums.c: Check entire result vector. * gcc.dg/vmx/vsums-be-order.c: Likewise. Index: gcc/config/rs6000/altivec.md =================================================================== --- gcc/config/rs6000/altivec.md (revision 207967) +++ gcc/config/rs6000/altivec.md (working copy) @@ -1651,7 +1651,7 @@ if (VECTOR_ELT_ORDER_BIG) return "vsumsws %0,%1,%2"; else - return "vspltw %3,%2,0\n\tvsumsws %3,%1,%3\n\tvspltw %0,%3,3"; + return "vspltw %3,%2,0\n\tvsumsws %3,%1,%3\n\tvsldoi %0,%3,%3,12"; } [(set_attr "type" "veccomplex") (set (attr "length") @@ -2483,7 +2539,7 @@ emit_insn (gen_altivec_vspltisw (vzero, const0_rtx)); emit_insn (gen_altivec_vsum4ubs (vtmp1, operands[1], vzero)); - emit_insn (gen_altivec_vsumsws (dest, vtmp1, vzero)); + emit_insn (gen_altivec_vsumsws_direct (dest, vtmp1, vzero)); DONE; }) Index: gcc/testsuite/gcc.dg/vmx/vsums-be-order.c =================================================================== --- gcc/testsuite/gcc.dg/vmx/vsums-be-order.c (revision 207967) +++ gcc/testsuite/gcc.dg/vmx/vsums-be-order.c (working copy) @@ -8,12 +8,13 @@ static void test() #if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ vector signed int vb = {128,0,0,0}; + vector signed int evd = {136,0,0,0}; #else vector signed int vb = {0,0,0,128}; + vector signed int evd = {0,0,0,136}; #endif vector signed int vd = vec_sums (va, vb); - signed int r = vec_extract (vd, 3); - check (r == 136, "sums"); + check (vec_all_eq (vd, evd), "sums"); } Index: gcc/testsuite/gcc.dg/vmx/vsums.c =================================================================== --- gcc/testsuite/gcc.dg/vmx/vsums.c (revision 207967) +++ gcc/testsuite/gcc.dg/vmx/vsums.c (working copy) @@ -4,9 +4,9 @@ static void test() { vector signed int va = {-7,11,-13,17}; vector signed int vb = {0,0,0,128}; + vector signed int evd = {0,0,0,136}; vector signed int vd = vec_sums (va, vb); - signed int r = vec_extract (vd, 3); - check (r == 136, "sums"); + check (vec_all_eq (vd, evd), "sums"); }