./ffmpeg_g -f rawvideo -pix_fmt rgb24 -s hd1080 -i /dev/zero -pix_fmt yuv420p16be \
-s 1920x1728 -f null -vframes 100 -v error -nostats -
The 9-14 bit functions get about a 6x speedup; the 16-bit function gets about 15x.
FATE passes; each format was tested with an image-to-video conversion.
Only POWER8 includes 32-bit vector multiplies, so POWER7 is locked out
of the 16-bit function. This includes the vec_mulo/mule functions too,
not just vmuluwm.
yuv420p9le
12341 UNITS in planarX, 130976 runs, 96 skips
73752 UNITS in planarX, 131066 runs, 6 skips
yuv420p9be
12364 UNITS in planarX, 131025 runs, 47 skips
73001 UNITS in planarX, 131055 runs, 17 skips
yuv420p10le
12386 UNITS in planarX, 131042 runs, 30 skips
72735 UNITS in planarX, 131062 runs, 10 skips
yuv420p10be
12337 UNITS in planarX, 131045 runs, 27 skips
72734 UNITS in planarX, 131057 runs, 15 skips
yuv420p12le
12236 UNITS in planarX, 131058 runs, 14 skips
73029 UNITS in planarX, 131062 runs, 10 skips
yuv420p12be
12218 UNITS in planarX, 130973 runs, 99 skips
72402 UNITS in planarX, 131069 runs, 3 skips
yuv420p14le
12168 UNITS in planarX, 131067 runs, 5 skips
72480 UNITS in planarX, 131069 runs, 3 skips
yuv420p14be
12358 UNITS in planarX, 130948 runs,124 skips
73772 UNITS in planarX, 131063 runs, 9 skips
yuv420p16le
10439 UNITS in planarX, 130911 runs,161 skips
157923 UNITS in planarX, 131068 runs, 4 skips
yuv420p16be
10463 UNITS in planarX, 130874 runs,198 skips
154405 UNITS in planarX, 131061 runs, 11 skips
Signed-off-by: Lauri Kasanen
---
The existing VSX yuv2plane1 is also ifdefed out for POWER7, even though it
works there.
This is mainly for cleanliness; separating the macros would be a bit uglier.
If we have POWER7 users who need that one, please speak up.
libswscale/ppc/swscale_ppc_template.c | 4 +-
libswscale/ppc/swscale_vsx.c | 177 +-
2 files changed, 178 insertions(+), 3 deletions(-)
diff --git a/libswscale/ppc/swscale_ppc_template.c b/libswscale/ppc/swscale_ppc_template.c
index 00e4b99..11decab 100644
--- a/libswscale/ppc/swscale_ppc_template.c
+++ b/libswscale/ppc/swscale_ppc_template.c
@@ -21,7 +21,7 @@
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
-static void FUNC(yuv2planeX_16)(const int16_t *filter, int filterSize,
+static void FUNC(yuv2planeX_8_16)(const int16_t *filter, int filterSize,
const int16_t **src, uint8_t *dest,
const uint8_t *dither, int offset, int x)
{
@@ -88,7 +88,7 @@ static void FUNC(yuv2planeX)(const int16_t *filter, int filterSize,
yuv2planeX_u(filter, filterSize, src, dest, dst_u, dither, offset, 0);
for (i = dst_u; i < dstW - 15; i += 16)
-FUNC(yuv2planeX_16)(filter, filterSize, src, dest + i, dither,
+FUNC(yuv2planeX_8_16)(filter, filterSize, src, dest + i, dither,
offset, i);
yuv2planeX_u(filter, filterSize, src, dest, dstW, dither, offset, i);
diff --git a/libswscale/ppc/swscale_vsx.c b/libswscale/ppc/swscale_vsx.c
index 70da6ae..baca36c 100644
--- a/libswscale/ppc/swscale_vsx.c
+++ b/libswscale/ppc/swscale_vsx.c
@@ -83,6 +83,8 @@
#include "swscale_ppc_template.c"
#undef FUNC
+#undef vzero
+
#endif /* !HAVE_BIGENDIAN */
static void yuv2plane1_8_u(const int16_t *src, uint8_t *dest, int dstW,
@@ -180,6 +182,76 @@ static void yuv2plane1_nbps_vsx(const int16_t *src, uint16_t *dest, int dstW,
yuv2plane1_nbps_u(src, dest, dstW, big_endian, output_bits, i);
}
+static void yuv2planeX_nbps_u(const int16_t *filter, int filterSize,
+ const int16_t **src, uint16_t *dest, int dstW,
+ int big_endian, int output_bits, int start)
+{ /* Scalar path: for each i in [start, dstW), sum src[j][i] * filter[j] over all filterSize taps and pass the sum to output_pixel(). */
+int i;
+int shift = 11 + 16 - output_bits; /* grows as output_bits shrinks; presumably also read inside output_pixel() (macro not visible here) */
+
+for (i = start; i < dstW; i++) {
+int val = 1 << (shift - 1); /* start at half of (1 << shift); presumably rounds a later right shift by `shift` */
+int j;
+
+for (j = 0; j < filterSize; j++)
+val += src[j][i] * filter[j]; /* tap j contributes src[j][i] scaled by filter[j] */
+
+output_pixel(&dest[i], val); /* target is element i of dest; big_endian/output_bits are presumably consumed by the macro */
+}
+}
+
+static void yuv2planeX_nbps_vsx(const int16_t *filter, int filterSize,
+const int16_t **src, uint16_t *dest, int dstW,
+int big_endian, int output_bits)
+{
+const int dst_u = -(uintptr_t)dest & 7;
+const int shift = 11 + 16 - output_bits;
+const int add = (1 << (shift - 1));
+const int clip = (1 << output_bits) - 1;
+const uint16_t swap = big_endian ? 8 : 0;
+const vector uint32_t vadd = (vector uint32_t) {add, add, add, add};
+const vector uint32_t vshift = (vector uint32_t) {shift, shift, shift,
shift};
+const vector uint16_t vswap = (vector uint16_t) {swap, swap, swap, swap,
swap, swap, swap, swap};
+const vector uint16_t vlargest = (vector uint16_t) {clip, clip, clip,
clip, clip, clip, clip, clip};