Hi,

       There is an off-by-one error in arm_evpc_neon_vrev which means it
rarely gets triggered. The inner loop looks at d->perm[i + j] for j from
0 to diff inclusive, so if the outer loop advances i by only diff, each
block starts on the element where the previous block finished: the
blocks overlap and the check against i + diff - j fails for perfectly
good reverse masks. The outer loop needs to step by diff + 1. Given this
I think the patch should go to trunk, and to the 4.7 branch after
appropriate testing and provided the release managers don't object to
such a change. The point is that this lets us generate proper
vect_reverse-style instructions for arm_neon. There is scope for a
follow-up patch that rewrites the intrinsics to be essentially identical
to the functions in the test I've added (except for a couple of missing
v2sf and v4sf operations, which should boil down to the same
implementation; it's a permute, so the type really doesn't matter). That
will need some ML magic to generate the permutation masks, which should
prove to be good fun on a plane trip.
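
To make the failure mode concrete, here is a minimal standalone sketch
(not part of the patch; the helper name is mine) that runs the old and
the new stepping over the vrev16-style byte mask used in the testcase:

#include <stdio.h>

/* Check whether PERM (NELT entries) is a block-reverse mask for the
   given DIFF, stepping the outer loop by STEP.  This mirrors the shape
   of the check in arm_evpc_neon_vrev.  */
static int
is_vrev_mask (const unsigned char *perm, unsigned int nelt,
              unsigned int diff, unsigned int step)
{
  unsigned int i, j;

  for (i = 0; i < nelt; i += step)
    for (j = 0; j <= diff; j += 1)
      if (perm[i + j] != i + diff - j)
        return 0;
  return 1;
}

int
main (void)
{
  /* vrev16.8 on a 16-byte vector swaps adjacent bytes, i.e. diff == 1.  */
  static const unsigned char mask[16]
    = { 1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14 };

  /* Old stepping (i += diff): the overlapping block starting at i == 1
     expects perm[1] == 2, but perm[1] is 0, so the mask is rejected.  */
  printf ("step diff     -> %d\n", is_vrev_mask (mask, 16, 1, 1));

  /* New stepping (i += diff + 1): the blocks do not overlap and the
     mask is recognised.  */
  printf ("step diff + 1 -> %d\n", is_vrev_mask (mask, 16, 1, 2));
  return 0;
}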

Testing is currently in progress; if there are no regressions I intend
to commit this to trunk and, after a while, to backport it to the 4.7
branch if there are no objections from the release managers.
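
(For reference, the new test can be exercised on its own with something
like make check-gcc RUNTESTFLAGS="arm.exp=neon-vrev.c" from the gcc
build directory; the exact invocation depends on how the tree is
configured.)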

 RichardH, since you did the original patch, would you mind having a
quick look to check that I haven't missed anything obvious?



        * config/arm/arm.c (arm_evpc_neon_vrev): Fix off-by-one error.
        * gcc.target/arm/neon-vrev.c: New test.


regards,
Ramana



For one example from the testcase, here is the code generated before and
after the patch (new compiler first, old compiler second). With the fix
the shuffle is matched to a single vrev16.8 instead of a pair of vtbl.8
instructions driven by a literal-pool mask:

New compiler:

vrev16q2_u8:
        @ args = 0, pretend = 0, frame = 0
        @ frame_needed = 0, uses_anonymous_args = 0
        @ link register save eliminated.
        vmov    d16, r0, r1  @ v16qi
        vmov    d17, r2, r3
        vrev16.8        q8, q8
        vmov    r0, r1, d16  @ v16qi
        vmov    r2, r3, d17
        bx      lr
        .size   vrev16q2_u8, .-vrev16q2_u8

Old compiler:

vrev16q2_u8:
        @ args = 0, pretend = 0, frame = 0
        @ frame_needed = 0, uses_anonymous_args = 0
        @ link register save eliminated.
        vmov    d20, r0, r1  @ v16qi
        vmov    d21, r2, r3
        vldr    d16, .L41
        vldr    d17, .L41+8
        vtbl.8  d18, {d20, d21}, d16
        vtbl.8  d19, {d20, d21}, d17
        vmov    r0, r1, d18  @ v16qi
        vmov    r2, r3, d19
        bx      lr
.L42:
        .align  3
.L41:
        .byte   1
        .byte   0
        .byte   3
        .byte   2
        .byte   5
        .byte   4
        .byte   7
        .byte   6
        .byte   9
        .byte   8
        .byte   11
        .byte   10
        .byte   13
        .byte   12
        .byte   15
        .byte   14
        .size   vrev16q2_u8, .-vrev16q2_u8

diff --git a/gcc/config/arm/arm.c b/gcc/config/arm/arm.c
index 321e6b5..bcad0b9 100644
--- a/gcc/config/arm/arm.c
+++ b/gcc/config/arm/arm.c
@@ -25668,10 +25668,18 @@ arm_evpc_neon_vrev (struct expand_vec_perm_d *d)
       return false;
     }
 
-  for (i = 0; i < nelt; i += diff)
+  for (i = 0; i < nelt; i += (diff + 1))
     for (j = 0; j <= diff; j += 1)
-      if (d->perm[i + j] != i + diff - j)
-       return false;
+      {
+       /* This is guaranteed to hold: diff is one of 7, 3 or 1
+          and nelt is a multiple of diff + 1, so each block of
+          diff + 1 elements stays within the vector.  A mask
+          implying any other value of diff means something has
+          gone wrong by the time we get here.  */
+       gcc_assert ((i + j) < nelt);
+       if (d->perm[i + j] != i + diff - j)
+         return false;
+      }
 
   /* Success! */
   if (d->testing_p)
--- /dev/null   2012-05-25 12:51:24.801630363 +0100
+++ gcc/testsuite/gcc.target/arm/neon-vrev.c    2012-05-26 03:18:02.095635775 +0100
@@ -0,0 +1,105 @@
+/* { dg-do compile } */
+/* { dg-require-effective-target arm_neon_ok } */
+/* { dg-options "-O2" } */
+/* { dg-add-options arm_neon } */
+
+#include <arm_neon.h>
+
+uint16x4_t
+tst_vrev642_u16 (uint16x4_t __a)
+{
+  uint16x4_t __rv;
+  uint16x4_t __mask1 = { 3, 2, 1, 0};
+  return __builtin_shuffle ( __a, __mask1) ;
+}
+
+uint16x8_t
+tst_vrev64q2_u16 (uint16x8_t __a)
+{
+  uint16x8_t __rv;
+  uint16x8_t __mask1 = {3, 2, 1, 0, 7, 6, 5, 4 };
+  return __builtin_shuffle ( __a, __mask1) ;
+}
+
+uint8x8_t
+tst_vrev642_u8 (uint8x8_t __a)
+{
+  uint8x8_t __rv;
+  uint8x8_t __mask1 = { 7, 6, 5, 4, 3, 2, 1, 0};
+  return __builtin_shuffle ( __a, __mask1) ;
+}
+
+uint8x16_t
+tst_vrev64q2_u8 (uint8x16_t __a)
+{
+  uint8x16_t __rv;
+  uint8x16_t __mask1 = {7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8};
+  return __builtin_shuffle ( __a, __mask1) ;
+
+}
+
+uint32x2_t
+tst_vrev642_u32 (uint32x2_t __a)
+{
+  uint32x2_t __rv;
+  uint32x2_t __mask1 = {1, 0};
+  return __builtin_shuffle ( __a, __mask1) ;
+
+}
+
+uint32x4_t
+tst_vrev64q2_u32 (uint32x4_t __a)
+{
+  uint32x4_t __rv;
+  uint32x4_t __mask1 = {1, 0, 3, 2};
+  return __builtin_shuffle ( __a, __mask1) ;
+}
+
+uint16x4_t
+tst_vrev322_u16 (uint16x4_t __a)
+{
+  uint16x4_t __mask1 = { 1, 0, 3, 2 };
+  return __builtin_shuffle (__a, __mask1);
+}
+
+uint16x8_t
+tst_vrev32q2_u16 (uint16x8_t __a)
+{
+  uint16x8_t __mask1 = { 1, 0, 3, 2, 5, 4, 7, 6 }; 
+  return __builtin_shuffle (__a, __mask1);
+}
+
+uint8x8_t
+tst_vrev322_u8 (uint8x8_t __a)
+{
+  uint8x8_t __mask1 = { 3, 2, 1, 0, 7, 6, 5, 4};
+  return __builtin_shuffle (__a, __mask1);
+}
+
+uint8x16_t
+tst_vrev32q2_u8 (uint8x16_t __a)
+{
+  uint8x16_t __mask1 = { 3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12};
+  return __builtin_shuffle (__a, __mask1);
+}
+
+uint8x8_t
+tst_vrev162_u8 (uint8x8_t __a)
+{
+  uint8x8_t __mask = { 1, 0, 3, 2, 5, 4, 7, 6};
+  return __builtin_shuffle (__a, __mask);
+}
+
+uint8x16_t
+tst_vrev16q2_u8 (uint8x16_t __a)
+{
+  uint8x16_t __mask = { 1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14};
+  return __builtin_shuffle (__a, __mask);
+}
+
+/* { dg-final {scan-assembler-times "vrev32\.16\\t" 2} }  */
+/* { dg-final {scan-assembler-times "vrev32\.8\\t" 2} }  */ 
+/* { dg-final {scan-assembler-times "vrev16\.8\\t" 2} }  */
+/* { dg-final {scan-assembler-times "vrev64\.8\\t" 2} }  */
+/* { dg-final {scan-assembler-times "vrev64\.32\\t" 2} }  */
+/* { dg-final {scan-assembler-times "vrev64\.16\\t" 2} }  */
