From e28b4a137b061f8ce755e20e0de8fdd635eab214 Mon Sep 17 00:00:00 2001
From: Rong Yan <rongyan236@gmail.com>
Date: Tue, 4 Nov 2014 06:27:58 +0000
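Subject: [PATCH 2/3] libavcodec/ppc/hpeldsp_altivec.c: fix
 ff_put_pixels16_altivec(), put_no_rnd_pixels16_xy2_altivec(),
 put_no_rnd_pixels8_xy2_altivec(), avg_pixels8_altivec(),
 avg_pixels8_xy2_altivec(), put_pixels16_xy2_altivec(),
 put_pixels8_xy2_altivec() and ff_avg_pixels16_altivec() for POWER LE

Select the load/merge sequence with HAVE_BIGENDIAN: big-endian builds keep
the vec_ld() + vec_lvsl() + vec_perm() loads, while little-endian builds
load with vec_vsx_ld() and pass vczero as the second operand of
vec_mergeh()/vec_mergel(). The separate HAVE_VSX version of
ff_put_pixels16_altivec() is folded into the little-endian branch.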
---
 libavcodec/ppc/hpeldsp_altivec.c | 147 ++++++++++++++++++++++++++++++---------
 1 file changed, 113 insertions(+), 34 deletions(-)

diff --git a/libavcodec/ppc/hpeldsp_altivec.c b/libavcodec/ppc/hpeldsp_altivec.c
index 79c2af8..5167a14 100644
--- a/libavcodec/ppc/hpeldsp_altivec.c
+++ b/libavcodec/ppc/hpeldsp_altivec.c
@@ -36,46 +36,22 @@
 
 #if HAVE_ALTIVEC
 /* next one assumes that ((line_size % 16) == 0) */
-#if HAVE_VSX
-void ff_put_pixels16_altivec(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
-{
-  register vector unsigned char pixelsv1;
-  register vector unsigned char pixelsv1B;
-  register vector unsigned char pixelsv1C;
-  register vector unsigned char pixelsv1D;
-
-  int i;
-  register ptrdiff_t line_size_2 = line_size << 1;
-  register ptrdiff_t line_size_3 = line_size + line_size_2;
-  register ptrdiff_t line_size_4 = line_size << 2;
-
-// hand-unrolling the loop by 4 gains about 15%
-// mininum execution time goes from 74 to 60 cycles
-// it's faster than -funroll-loops, but using
-// -funroll-loops w/ this is bad - 74 cycles again.
-// all this is on a 7450, tuning for the 7450
-  for (i = 0; i < h; i += 4) {
-    pixelsv1  = vec_vsx_ld( 0, pixels);
-    pixelsv1B = vec_vsx_ld(line_size, pixels);
-    pixelsv1C = vec_vsx_ld(line_size_2, pixels);
-    pixelsv1D = vec_vsx_ld(line_size_3, pixels);
-    vec_vsx_st(pixelsv1, 0, (unsigned char*)block);
-    vec_vsx_st(pixelsv1B, line_size, (unsigned char*)block);
-    vec_vsx_st(pixelsv1C, line_size_2, (unsigned char*)block);
-    vec_st(pixelsv1D, line_size_3, (unsigned char*)block);
-    pixels+=line_size_4;
-    block +=line_size_4;
-  }
-}
-#else
 void ff_put_pixels16_altivec(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
 {
+#if HAVE_BIGENDIAN
     register vector unsigned char pixelsv1, pixelsv2;
     register vector unsigned char pixelsv1B, pixelsv2B;
     register vector unsigned char pixelsv1C, pixelsv2C;
     register vector unsigned char pixelsv1D, pixelsv2D;
 
     register vector unsigned char perm = vec_lvsl(0, pixels);
+#else /* HAVE_BIGENDIAN */
+    register vector unsigned char pixelsv1;
+    register vector unsigned char pixelsv1B;
+    register vector unsigned char pixelsv1C;
+    register vector unsigned char pixelsv1D;
+#endif /* HAVE_BIGENDIAN */
+
     int i;
     register ptrdiff_t line_size_2 = line_size << 1;
     register ptrdiff_t line_size_3 = line_size + line_size_2;
@@ -87,6 +63,7 @@ void ff_put_pixels16_altivec(uint8_t *block, const uint8_t *pixels, ptrdiff_t li
 // -funroll-loops w/ this is bad - 74 cycles again.
 // all this is on a 7450, tuning for the 7450
     for (i = 0; i < h; i += 4) {
+#if HAVE_BIGENDIAN
         pixelsv1  = vec_ld( 0, pixels);
         pixelsv2  = vec_ld(15, pixels);
         pixelsv1B = vec_ld(line_size, pixels);
@@ -103,26 +80,42 @@ void ff_put_pixels16_altivec(uint8_t *block, const uint8_t *pixels, ptrdiff_t li
                line_size_2, (unsigned char*)block);
         vec_st(vec_perm(pixelsv1D, pixelsv2D, perm),
                line_size_3, (unsigned char*)block);
+#else /* HAVE_BIGENDIAN */
+        pixelsv1  = vec_vsx_ld( 0, pixels);
+        pixelsv1B = vec_vsx_ld(line_size, pixels);
+        pixelsv1C = vec_vsx_ld(line_size_2, pixels);
+        pixelsv1D = vec_vsx_ld(line_size_3, pixels);
+        vec_vsx_st(pixelsv1, 0, (unsigned char*)block);
+        vec_vsx_st(pixelsv1B, line_size, (unsigned char*)block);
+        vec_vsx_st(pixelsv1C, line_size_2, (unsigned char*)block);
+        vec_st(pixelsv1D, line_size_3, (unsigned char*)block);
+#endif /* HAVE_BIGENDIAN */
         pixels+=line_size_4;
         block +=line_size_4;
     }
 }
 
-#endif /* HAVE_VSX */
-
 /* next one assumes that ((line_size % 16) == 0) */
 #define op_avg(a,b)  a = ( ((a)|(b)) - ((((a)^(b))&0xFEFEFEFEUL)>>1) )
 void ff_avg_pixels16_altivec(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
 {
     register vector unsigned char pixelsv1, pixelsv2, pixelsv, blockv;
+#if HAVE_BIGENDIAN
     register vector unsigned char perm = vec_lvsl(0, pixels);
+#endif /* HAVE_BIGENDIAN */
     int i;
 
     for (i = 0; i < h; i++) {
+#if HAVE_BIGENDIAN
         pixelsv1 = vec_ld( 0, pixels);
         pixelsv2 = vec_ld(16,pixels);
+#endif /* HAVE_BIGENDIAN */
         blockv = vec_ld(0, block);
+#if HAVE_BIGENDIAN
         pixelsv = vec_perm(pixelsv1, pixelsv2, perm);
+#else /* HAVE_BIGENDIAN */
+        pixelsv = vec_vsx_ld( 0, pixels);
+#endif /* HAVE_BIGENDIAN */
         blockv = vec_avg(blockv,pixelsv);
         vec_st(blockv, 0, (unsigned char*)block);
         pixels+=line_size;
@@ -142,9 +135,13 @@ static void avg_pixels8_altivec(uint8_t * block, const uint8_t * pixels, ptrdiff
        int rightside = ((unsigned long)block & 0x0000000F);
 
        blockv = vec_ld(0, block);
+#if HAVE_BIGENDIAN
        pixelsv1 = vec_ld( 0, pixels);
        pixelsv2 = vec_ld(16, pixels);
        pixelsv = vec_perm(pixelsv1, pixelsv2, vec_lvsl(0, pixels));
+#else /* HAVE_BIGENDIAN */
+       pixelsv = vec_vsx_ld( 0, pixels);
+#endif /* HAVE_BIGENDIAN */
 
        if (rightside) {
            pixelsv = vec_perm(blockv, pixelsv, vcprm(0,1,s0,s1));
@@ -171,6 +168,7 @@ static void put_pixels8_xy2_altivec(uint8_t *block, const uint8_t *pixels, ptrdi
     register const vector unsigned char vczero = (const vector unsigned char)vec_splat_u8(0);
     register const vector unsigned short vctwo = (const vector unsigned short)vec_splat_u16(2);
 
+#if HAVE_BIGENDIAN
     temp1 = vec_ld(0, pixels);
     temp2 = vec_ld(16, pixels);
     pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(0, pixels));
@@ -181,6 +179,12 @@ static void put_pixels8_xy2_altivec(uint8_t *block, const uint8_t *pixels, ptrdi
     }
     pixelsv1 = vec_mergeh(vczero, pixelsv1);
     pixelsv2 = vec_mergeh(vczero, pixelsv2);
+#else /* HAVE_BIGENDIAN */
+    pixelsv1 = vec_vsx_ld(0, pixels);
+    pixelsv2 = vec_vsx_ld(1, pixels);
+    pixelsv1 = vec_mergeh(pixelsv1, vczero);
+    pixelsv2 = vec_mergeh(pixelsv2, vczero);
+#endif /* HAVE_BIGENDIAN */
     pixelssum1 = vec_add((vector unsigned short)pixelsv1,
                          (vector unsigned short)pixelsv2);
     pixelssum1 = vec_add(pixelssum1, vctwo);
@@ -189,6 +193,7 @@ static void put_pixels8_xy2_altivec(uint8_t *block, const uint8_t *pixels, ptrdi
         int rightside = ((unsigned long)block & 0x0000000F);
         blockv = vec_ld(0, block);
 
+#if HAVE_BIGENDIAN
         temp1 = vec_ld(line_size, pixels);
         temp2 = vec_ld(line_size + 16, pixels);
         pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(line_size, pixels));
@@ -200,6 +205,12 @@ static void put_pixels8_xy2_altivec(uint8_t *block, const uint8_t *pixels, ptrdi
 
         pixelsv1 = vec_mergeh(vczero, pixelsv1);
         pixelsv2 = vec_mergeh(vczero, pixelsv2);
+#else /* HAVE_BIGENDIAN */
+        pixelsv1 = vec_vsx_ld(line_size, pixels);
+        pixelsv2 = vec_vsx_ld(line_size+1, pixels);
+        pixelsv1 = vec_mergeh(pixelsv1, vczero);
+        pixelsv2 = vec_mergeh(pixelsv2, vczero);
+#endif /* HAVE_BIGENDIAN */
         pixelssum2 = vec_add((vector unsigned short)pixelsv1,
                              (vector unsigned short)pixelsv2);
         temp3 = vec_add(pixelssum1, pixelssum2);
@@ -231,6 +242,7 @@ static void put_no_rnd_pixels8_xy2_altivec(uint8_t *block, const uint8_t *pixels
     register const vector unsigned short vcone = (const vector unsigned short)vec_splat_u16(1);
     register const vector unsigned short vctwo = (const vector unsigned short)vec_splat_u16(2);
 
+#if HAVE_BIGENDIAN
     temp1 = vec_ld(0, pixels);
     temp2 = vec_ld(16, pixels);
     pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(0, pixels));
@@ -241,6 +253,12 @@ static void put_no_rnd_pixels8_xy2_altivec(uint8_t *block, const uint8_t *pixels
     }
     pixelsv1 = vec_mergeh(vczero, pixelsv1);
     pixelsv2 = vec_mergeh(vczero, pixelsv2);
+#else /* HAVE_BIGENDIAN */
+    pixelsv1 = vec_vsx_ld(0, pixels);
+    pixelsv2 = vec_vsx_ld(1, pixels);
+    pixelsv1 = vec_mergeh(pixelsv1, vczero);
+    pixelsv2 = vec_mergeh(pixelsv2, vczero);
+#endif /* HAVE_BIGENDIAN */
     pixelssum1 = vec_add((vector unsigned short)pixelsv1,
                          (vector unsigned short)pixelsv2);
     pixelssum1 = vec_add(pixelssum1, vcone);
@@ -249,6 +267,7 @@ static void put_no_rnd_pixels8_xy2_altivec(uint8_t *block, const uint8_t *pixels
         int rightside = ((unsigned long)block & 0x0000000F);
         blockv = vec_ld(0, block);
 
+#if HAVE_BIGENDIAN
         temp1 = vec_ld(line_size, pixels);
         temp2 = vec_ld(line_size + 16, pixels);
         pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(line_size, pixels));
@@ -260,6 +279,12 @@ static void put_no_rnd_pixels8_xy2_altivec(uint8_t *block, const uint8_t *pixels
 
         pixelsv1 = vec_mergeh(vczero, pixelsv1);
         pixelsv2 = vec_mergeh(vczero, pixelsv2);
+#else /* HAVE_BIGENDIAN */
+        pixelsv1 = vec_vsx_ld(line_size, pixels);
+        pixelsv2 = vec_vsx_ld(line_size+1, pixels);
+        pixelsv1 = vec_mergeh(pixelsv1, vczero);
+        pixelsv2 = vec_mergeh(pixelsv2, vczero);
+#endif /* HAVE_BIGENDIAN */
         pixelssum2 = vec_add((vector unsigned short)pixelsv1,
                              (vector unsigned short)pixelsv2);
         temp3 = vec_add(pixelssum1, pixelssum2);
@@ -291,6 +316,7 @@ static void put_pixels16_xy2_altivec(uint8_t * block, const uint8_t * pixels, pt
     register const vector unsigned char vczero = (const vector unsigned char)vec_splat_u8(0);
     register const vector unsigned short vctwo = (const vector unsigned short)vec_splat_u16(2);
 
+#if HAVE_BIGENDIAN
     temp1 = vec_ld(0, pixels);
     temp2 = vec_ld(16, pixels);
     pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(0, pixels));
@@ -303,6 +329,14 @@ static void put_pixels16_xy2_altivec(uint8_t * block, const uint8_t * pixels, pt
     pixelsv4 = vec_mergel(vczero, pixelsv2);
     pixelsv1 = vec_mergeh(vczero, pixelsv1);
     pixelsv2 = vec_mergeh(vczero, pixelsv2);
+#else /* HAVE_BIGENDIAN */
+    pixelsv1 = vec_vsx_ld(0, pixels);
+    pixelsv2 = vec_vsx_ld(1, pixels);
+    pixelsv3 = vec_mergel(pixelsv1, vczero);
+    pixelsv4 = vec_mergel(pixelsv2, vczero);
+    pixelsv1 = vec_mergeh(pixelsv1, vczero);
+    pixelsv2 = vec_mergeh(pixelsv2, vczero);
+#endif /* HAVE_BIGENDIAN */
     pixelssum3 = vec_add((vector unsigned short)pixelsv3,
                          (vector unsigned short)pixelsv4);
     pixelssum3 = vec_add(pixelssum3, vctwo);
@@ -313,6 +347,7 @@ static void put_pixels16_xy2_altivec(uint8_t * block, const uint8_t * pixels, pt
     for (i = 0; i < h ; i++) {
         blockv = vec_ld(0, block);
 
+#if HAVE_BIGENDIAN
         temp1 = vec_ld(line_size, pixels);
         temp2 = vec_ld(line_size + 16, pixels);
         pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(line_size, pixels));
@@ -326,6 +361,14 @@ static void put_pixels16_xy2_altivec(uint8_t * block, const uint8_t * pixels, pt
         pixelsv4 = vec_mergel(vczero, pixelsv2);
         pixelsv1 = vec_mergeh(vczero, pixelsv1);
         pixelsv2 = vec_mergeh(vczero, pixelsv2);
+#else /* HAVE_BIGENDIAN */
+        pixelsv1 = vec_vsx_ld(line_size, pixels);
+        pixelsv2 = vec_vsx_ld(line_size+1, pixels);
+        pixelsv3 = vec_mergel(pixelsv1,vczero);
+        pixelsv4 = vec_mergel(pixelsv2, vczero);
+        pixelsv1 = vec_mergeh(pixelsv1, vczero);
+        pixelsv2 = vec_mergeh(pixelsv2, vczero);
+#endif /* HAVE_BIGENDIAN */
 
         pixelssum4 = vec_add((vector unsigned short)pixelsv3,
                              (vector unsigned short)pixelsv4);
@@ -360,6 +403,7 @@ static void put_no_rnd_pixels16_xy2_altivec(uint8_t * block, const uint8_t * pix
     register const vector unsigned short vcone = (const vector unsigned short)vec_splat_u16(1);
     register const vector unsigned short vctwo = (const vector unsigned short)vec_splat_u16(2);
 
+#if HAVE_BIGENDIAN
     temp1 = vec_ld(0, pixels);
     temp2 = vec_ld(16, pixels);
     pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(0, pixels));
@@ -372,6 +416,14 @@ static void put_no_rnd_pixels16_xy2_altivec(uint8_t * block, const uint8_t * pix
     pixelsv4 = vec_mergel(vczero, pixelsv2);
     pixelsv1 = vec_mergeh(vczero, pixelsv1);
     pixelsv2 = vec_mergeh(vczero, pixelsv2);
+#else /* HAVE_BIGENDIAN */
+    pixelsv1 = vec_vsx_ld(0, pixels);
+    pixelsv2 = vec_vsx_ld(1, pixels);
+    pixelsv3 = vec_mergel(pixelsv1, vczero);
+    pixelsv4 = vec_mergel(pixelsv2, vczero);
+    pixelsv1 = vec_mergeh(pixelsv1, vczero);
+    pixelsv2 = vec_mergeh(pixelsv2, vczero);
+#endif /* HAVE_BIGENDIAN */
     pixelssum3 = vec_add((vector unsigned short)pixelsv3,
                          (vector unsigned short)pixelsv4);
     pixelssum3 = vec_add(pixelssum3, vcone);
@@ -380,6 +432,7 @@ static void put_no_rnd_pixels16_xy2_altivec(uint8_t * block, const uint8_t * pix
     pixelssum1 = vec_add(pixelssum1, vcone);
 
     for (i = 0; i < h ; i++) {
+#if HAVE_BIGENDIAN
         blockv = vec_ld(0, block);
 
         temp1 = vec_ld(line_size, pixels);
@@ -395,6 +448,14 @@ static void put_no_rnd_pixels16_xy2_altivec(uint8_t * block, const uint8_t * pix
         pixelsv4 = vec_mergel(vczero, pixelsv2);
         pixelsv1 = vec_mergeh(vczero, pixelsv1);
         pixelsv2 = vec_mergeh(vczero, pixelsv2);
+#else /* HAVE_BIGENDIAN */
+        pixelsv1 = vec_vsx_ld(line_size, pixels);
+        pixelsv2 = vec_vsx_ld(line_size+1, pixels);
+        pixelsv3 = vec_mergel(pixelsv1, vczero);
+        pixelsv4 = vec_mergel(pixelsv2, vczero);
+        pixelsv1 = vec_mergeh(pixelsv1, vczero);
+        pixelsv2 = vec_mergeh(pixelsv2, vczero);
+#endif /* HAVE_BIGENDIAN */
 
         pixelssum4 = vec_add((vector unsigned short)pixelsv3,
                              (vector unsigned short)pixelsv4);
@@ -410,7 +471,11 @@ static void put_no_rnd_pixels16_xy2_altivec(uint8_t * block, const uint8_t * pix
 
         blockv = vec_packsu(temp3, temp4);
 
+#if HAVE_BIGENDIAN
         vec_st(blockv, 0, block);
+#else /* HAVE_BIGENDIAN */
+        vec_vsx_st(blockv, 0, block);
+#endif /* HAVE_BIGENDIAN */
 
         block += line_size;
         pixels += line_size;
@@ -430,6 +495,7 @@ static void avg_pixels8_xy2_altivec(uint8_t *block, const uint8_t *pixels, ptrdi
     register const vector unsigned short vctwo = (const vector unsigned short)
                                         vec_splat_u16(2);
 
+#if HAVE_BIGENDIAN
     temp1 = vec_ld(0, pixels);
     temp2 = vec_ld(16, pixels);
     pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(0, pixels));
@@ -440,6 +506,12 @@ static void avg_pixels8_xy2_altivec(uint8_t *block, const uint8_t *pixels, ptrdi
     }
     pixelsv1 = vec_mergeh(vczero, pixelsv1);
     pixelsv2 = vec_mergeh(vczero, pixelsv2);
+#else /* HAVE_BIGENDIAN */
+    pixelsv1 = vec_vsx_ld(0, pixels);
+    pixelsv2 = vec_vsx_ld(1, pixels);
+    pixelsv1 = vec_mergeh(pixelsv1, vczero);
+    pixelsv2 = vec_mergeh(pixelsv2, vczero);
+#endif /* HAVE_BIGENDIAN */
     pixelssum1 = vec_add((vector unsigned short)pixelsv1,
                          (vector unsigned short)pixelsv2);
     pixelssum1 = vec_add(pixelssum1, vctwo);
@@ -448,6 +520,7 @@ static void avg_pixels8_xy2_altivec(uint8_t *block, const uint8_t *pixels, ptrdi
         int rightside = ((unsigned long)block & 0x0000000F);
         blockv = vec_ld(0, block);
 
+#if HAVE_BIGENDIAN
         temp1 = vec_ld(line_size, pixels);
         temp2 = vec_ld(line_size + 16, pixels);
         pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(line_size, pixels));
@@ -459,6 +532,12 @@ static void avg_pixels8_xy2_altivec(uint8_t *block, const uint8_t *pixels, ptrdi
 
         pixelsv1 = vec_mergeh(vczero, pixelsv1);
         pixelsv2 = vec_mergeh(vczero, pixelsv2);
+#else /* HAVE_BIGENDIAN */
+        pixelsv1 = vec_vsx_ld(line_size, pixels);
+        pixelsv2 = vec_vsx_ld(line_size+1, pixels);
+        pixelsv1 = vec_mergeh(pixelsv1, vczero);
+        pixelsv2 = vec_mergeh(pixelsv2, vczero);
+#endif /* HAVE_BIGENDIAN */
         pixelssum2 = vec_add((vector unsigned short)pixelsv1,
                              (vector unsigned short)pixelsv2);
         temp3 = vec_add(pixelssum1, pixelssum2);
-- 
1.9.1

