Benchmarked with the following, with $i looping over the pixel formats
listed below:

./ffmpeg -f lavfi -i yuvtestsrc=duration=1:size=1200x1440 \
-sws_flags fast_bilinear -s 1200x720 -f null -vframes 100 \
-pix_fmt $i -nostats -cpuflags 0 -v error -
Uses 32-bit multiplies, so it is POWER8-only.
~2x speedup (in each pair below, the first line is the C version, the
second the VSX version):
rgb24
24431 UNITS in yuv2packed2, 16384 runs, 0 skips
13783 UNITS in yuv2packed2, 16383 runs, 1 skips
bgr24
24396 UNITS in yuv2packed2, 16384 runs, 0 skips
14059 UNITS in yuv2packed2, 16384 runs, 0 skips
rgba
26815 UNITS in yuv2packed2, 16383 runs, 1 skips
12797 UNITS in yuv2packed2, 16383 runs, 1 skips
bgra
27060 UNITS in yuv2packed2, 16384 runs, 0 skips
13138 UNITS in yuv2packed2, 16384 runs, 0 skips
argb
26998 UNITS in yuv2packed2, 16384 runs, 0 skips
12728 UNITS in yuv2packed2, 16381 runs, 3 skips
abgr
26651 UNITS in yuv2packed2, 16384 runs, 0 skips
13124 UNITS in yuv2packed2, 16384 runs, 0 skips
This is a modest speedup, but the x86 MMX version also achieves only ~2x.
The MMX version is also heavily inaccurate, while the VSX version keeps
high accuracy.
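
For reference, this path first does the usual yuv2packed2 two-row vertical
blend with 12-bit weights, then multiplies the 32-bit intermediates by the
32-bit conversion coefficients; the latter is the POWER8-only 32-bit
multiply mentioned above. A rough scalar sketch of the blend follows
(blend_rows and its normalizing shift are illustrative only; the real code
keeps the extra precision and folds the normalization into the conversion
shifts):

#include <stdint.h>

/* Blend two scaler rows with complementary 12-bit weights.
 * yalpha in [0, 4096] weighs buf1; 4096 - yalpha weighs buf0.
 * The 16x16-bit products need 32-bit intermediates, which the
 * vector version then feeds into 32-bit coefficient multiplies. */
static void blend_rows(const int16_t *buf0, const int16_t *buf1,
                       int16_t *out, int width, int yalpha)
{
    const int yalpha1 = 4096 - yalpha;
    for (int i = 0; i < width; i++)
        out[i] = (buf0[i] * yalpha1 + buf1[i] * yalpha) >> 12;
}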
Signed-off-by: Lauri Kasanen
---
libswscale/ppc/swscale_vsx.c | 188 +++
1 file changed, 188 insertions(+)
diff --git a/libswscale/ppc/swscale_vsx.c b/libswscale/ppc/swscale_vsx.c
index e05f9ec..ba00791 100644
--- a/libswscale/ppc/swscale_vsx.c
+++ b/libswscale/ppc/swscale_vsx.c
@@ -793,6 +793,180 @@ yuv2rgb_full_2_vsx_template(SwsContext *c, const int16_t *buf[2],
}
}
+static av_always_inline void
+yuv2rgb_2_vsx_template(SwsContext *c, const int16_t *buf[2],
+                       const int16_t *ubuf[2], const int16_t *vbuf[2],
+                       const int16_t *abuf[2], uint8_t *dest, int dstW,
+                       int yalpha, int uvalpha, int y,
+                       enum AVPixelFormat target, int hasAlpha)
+{
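+    // Two-row bilinear output: blend buf[0]/buf[1] with 12-bit weights,
+    // convert to RGB and pack to the target layout. Like
+    // yuv2rgb_full_2_vsx_template above, but on this path each chroma
+    // sample covers two output pixels.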
+    const int16_t *buf0  = buf[0],  *buf1  = buf[1],
+                  *ubuf0 = ubuf[0], *ubuf1 = ubuf[1],
+                  *vbuf0 = vbuf[0], *vbuf1 = vbuf[1],
+                  *abuf0 = hasAlpha ? abuf[0] : NULL,
+                  *abuf1 = hasAlpha ? abuf[1] : NULL;
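+    // Complementary 12-bit blend weights: yalpha/uvalpha weigh the
+    // second row (buf1/ubuf1/vbuf1), 4096 - yalpha/uvalpha the first.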
+    const int16_t yalpha1  = 4096 - yalpha;
+    const int16_t uvalpha1 = 4096 - uvalpha;
+    vector int16_t vy, vu, vv, A = vec_splat_s16(0);
+    vector int32_t vy32_l, vy32_r, vu32_l, vu32_r, vv32_l, vv32_r, tmp32;
+    vector int32_t R_l, R_r, G_l, G_r, B_l, B_r, vud32_l, vud32_r, vvd32_l, vvd32_r;
+    vector int32_t tmp, tmp2, tmp3, tmp4, tmp5, tmp6;
+    vector uint16_t rd16, gd16, bd16;
+    vector uint8_t rd, bd, gd, ad, out0, out1, tmp8;
+    // Blend weights splatted across all vector lanes
+    const vector int16_t vyalpha1 = vec_splats(yalpha1);
+    const vector int16_t vuvalpha1 = vec_splats(uvalpha1);
+    const vector int16_t vyalpha = vec_splats((int16_t) yalpha);
+    const vector int16_t vuvalpha = vec_splats((int16_t) uvalpha);
+    const vector uint16_t zero16 = vec_splat_u16(0);
+    // YUV->RGB conversion constants from the SwsContext
+    const vector int32_t y_offset = vec_splats(c->yuv2rgb_y_offset);
+    const vector int32_t y_coeff = vec_splats(c->yuv2rgb_y_coeff);
+    const vector int32_t y_add = vec_splats(1 << 21);    // rounding bias for the >> 22
+    const vector int32_t v2r_coeff = vec_splats(c->yuv2rgb_v2r_coeff);
+    const vector int32_t v2g_coeff = vec_splats(c->yuv2rgb_v2g_coeff);
+    const vector int32_t u2g_coeff = vec_splats(c->yuv2rgb_u2g_coeff);
+    const vector int32_t u2b_coeff = vec_splats(c->yuv2rgb_u2b_coeff);
+    const vector int32_t rgbclip = vec_splats(1 << 30);
+    const vector int32_t zero32 = vec_splat_s32(0);
+    const vector uint32_t shift19 = vec_splats(19U);
+    const vector uint32_t shift22 = vec_splats(22U);
+    const vector uint32_t shift10 = vec_splat_u32(10);
+    const vector int32_t dec128 = vec_splats(128 << 19); // chroma bias (128) at the 19-bit scale
+    const vector int32_t add18 = vec_splats(1 << 18);    // rounding bias for the >> 19
+    int i;
+
+    // Various permutations
+    // doubleleft/doubleright duplicate each 32-bit element of the low/high
+    // half: one interpolated chroma value is reused for two adjacent pixels.
+    const vector uint8_t doubleleft = (vector uint8_t) {0, 1, 2, 3,
+                                                        0, 1, 2, 3,
+                                                        4, 5, 6, 7,
+                                                        4, 5, 6, 7 };
+    const vector uint8_t doubleright = (vector uint8_t) {8, 9, 10, 11,
+                                                         8, 9, 10, 11,
+                                                         12, 13, 14, 15,
+                                                         12, 13, 14, 15 };
+    // perm3rg* interleave R and G bytes into 3-byte groups; the zero slots
+    // are filled with B by a later permute (rgb24/bgr24 packing).
+    const vector uint8_t perm3rg0 = (vector uint8_t) {0x0, 0x10, 0,
+                                                      0x1, 0x11, 0,
+                                                      0x2, 0x12, 0,
+                                                      0x3, 0x13, 0,
+                                                      0x4, 0x14, 0,
+                                                      0x5 };
+    const vector uint8_t perm3rg1 = (vector uint8_t) {     0x15, 0,
+                                                      0x6, 0x16, 0,
+                                                      0x7, 0x17, 0