Re: [FFmpeg-devel] [PATCH] swscale/ppc: VSX-optimize non-full-chroma yuv2rgb_2

2019-04-11 Thread Lauri Kasanen
On Fri, 5 Apr 2019 11:41:19 +0300
Lauri Kasanen  wrote:

> ./ffmpeg -f lavfi -i yuvtestsrc=duration=1:size=1200x1440 \
> -sws_flags fast_bilinear \
> -s 1200x720 -f null -vframes 100 -pix_fmt $i -nostats \
> -cpuflags 0 -v error -
>
> 32-bit mul, power8 only.
>
> ~2x speedup:
>
> rgb24
>   24431 UNITS in yuv2packed2,   16384 runs,  0 skips
>   13783 UNITS in yuv2packed2,   16383 runs,  1 skips
> bgr24
>   24396 UNITS in yuv2packed2,   16384 runs,  0 skips
>   14059 UNITS in yuv2packed2,   16384 runs,  0 skips
> rgba
>   26815 UNITS in yuv2packed2,   16383 runs,  1 skips
>   12797 UNITS in yuv2packed2,   16383 runs,  1 skips
> bgra
>   27060 UNITS in yuv2packed2,   16384 runs,  0 skips
>   13138 UNITS in yuv2packed2,   16384 runs,  0 skips
> argb
>   26998 UNITS in yuv2packed2,   16384 runs,  0 skips
>   12728 UNITS in yuv2packed2,   16381 runs,  3 skips
> abgr
>   26651 UNITS in yuv2packed2,   16384 runs,  0 skips
>   13124 UNITS in yuv2packed2,   16384 runs,  0 skips
>
> This is a modest speedup, but the x86 MMX version also gets only ~2x.
> The MMX version is also heavily inaccurate, while the VSX version has
> high accuracy.
>
> Signed-off-by: Lauri Kasanen 
> ---
>  libswscale/ppc/swscale_vsx.c | 188 +++
>  1 file changed, 188 insertions(+)

Applying.

- Lauri

[FFmpeg-devel] [PATCH] swscale/ppc: VSX-optimize non-full-chroma yuv2rgb_2

2019-04-05 Thread Lauri Kasanen
./ffmpeg -f lavfi -i yuvtestsrc=duration=1:size=1200x1440 \
-sws_flags fast_bilinear \
-s 1200x720 -f null -vframes 100 -pix_fmt $i -nostats \
-cpuflags 0 -v error -

32-bit mul, power8 only.
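
The products here are 12-bit blend weights times roughly 15-bit samples,
so they overflow 16 bits and need 32-bit multiplies; the 32-bit vector
multiply (vmuluwm) only exists on POWER8 and later, hence the gating.
As a scalar sketch of the two-row blend this vectorizes (after the
pattern of the C reference in libswscale/output.c, not the exact code):

    #include <stdint.h>

    /* Two-row blend: yalpha is a 12-bit weight, samples ~15-bit, so
     * each product needs a 32-bit intermediate before the >> 19. */
    static void blend_row(int16_t *dst, const int16_t *buf0,
                          const int16_t *buf1, int n, int yalpha)
    {
        const int yalpha1 = 4096 - yalpha;
        for (int i = 0; i < n; i++)
            dst[i] = (buf0[i] * yalpha1 + buf1[i] * yalpha) >> 19;
    }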

~2x speedup:

rgb24
  24431 UNITS in yuv2packed2,   16384 runs,  0 skips
  13783 UNITS in yuv2packed2,   16383 runs,  1 skips
bgr24
  24396 UNITS in yuv2packed2,   16384 runs,  0 skips
  14059 UNITS in yuv2packed2,   16384 runs,  0 skips
rgba
  26815 UNITS in yuv2packed2,   16383 runs,  1 skips
  12797 UNITS in yuv2packed2,   16383 runs,  1 skips
bgra
  27060 UNITS in yuv2packed2,   16384 runs,  0 skips
  13138 UNITS in yuv2packed2,   16384 runs,  0 skips
argb
  26998 UNITS in yuv2packed2,   16384 runs,  0 skips
  12728 UNITS in yuv2packed2,   16381 runs,  3 skips
abgr
  26651 UNITS in yuv2packed2,   16384 runs,  0 skips
  13124 UNITS in yuv2packed2,   16384 runs,  0 skips

This is a modest speedup, but the x86 MMX version also gets only ~2x.
The MMX version is also heavily inaccurate, while the VSX version has
high accuracy.
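
The accuracy comes from doing the full fixed-point matrix per pixel
instead of lower-precision arithmetic. In scalar form, the per-pixel
conversion the vector code below implements looks roughly like this
(a sketch; the coefficient arguments stand for the c->yuv2rgb_* fields
the patch splats into vectors, and the 1 << 21 / >> 22 terms match its
y_add and shift22 constants):

    #include <stdint.h>

    static uint8_t clip8(int x) { return x < 0 ? 0 : x > 255 ? 255 : x; }

    /* Y, U, V are the blended intermediates, with the chroma bias
     * (128 << 19, "dec128" in the patch) already removed from U/V. */
    static void yuv2rgb_px(int Y, int U, int V,
                           int y_offset, int y_coeff,
                           int v2r, int v2g, int u2g, int u2b,
                           uint8_t *r, uint8_t *g, uint8_t *b)
    {
        const int Yt = (Y - y_offset) * y_coeff + (1 << 21);
        *r = clip8((Yt + V * v2r)           >> 22);
        *g = clip8((Yt + V * v2g + U * u2g) >> 22);
        *b = clip8((Yt + U * u2b)           >> 22);
    }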

Signed-off-by: Lauri Kasanen 
---
 libswscale/ppc/swscale_vsx.c | 188 +++
 1 file changed, 188 insertions(+)
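
A note on the permutation tables in the diff: for the 24-bit rgb24 and
bgr24 outputs, vec_perm tables like perm3rg0/perm3rg1 interleave the R
and G byte lanes while leaving every third byte as a placeholder that a
later permute fills with B. In vec_perm, indices 0x00-0x0f select bytes
from the first operand and 0x10-0x1f from the second. A minimal
standalone illustration of the idea (not the patch's exact tables; the
0xAA/0xBB fill bytes are arbitrary):

    #include <altivec.h>
    #include <stdio.h>

    int main(void)
    {
        /* Stand-ins for 16 red and 16 green bytes. */
        vector unsigned char r = vec_splats((unsigned char) 0xAA);
        vector unsigned char g = vec_splats((unsigned char) 0xBB);
        /* R,G,hole triplets; the "0" slots merely pick r[0] and are
         * overwritten with blue by a second permute in the real code. */
        const vector unsigned char perm_rg = {
            0x00, 0x10, 0, 0x01, 0x11, 0,
            0x02, 0x12, 0, 0x03, 0x13, 0,
            0x04, 0x14, 0, 0x05
        };
        vector unsigned char rg = vec_perm(r, g, perm_rg);
        for (int i = 0; i < 16; i++)
            printf("%02x ", ((unsigned char *) &rg)[i]);
        printf("\n"); /* aa bb aa, aa bb aa, ... */
        return 0;
    }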

diff --git a/libswscale/ppc/swscale_vsx.c b/libswscale/ppc/swscale_vsx.c
index e05f9ec..ba00791 100644
--- a/libswscale/ppc/swscale_vsx.c
+++ b/libswscale/ppc/swscale_vsx.c
@@ -793,6 +793,180 @@ yuv2rgb_full_2_vsx_template(SwsContext *c, const int16_t *buf[2],
     }
 }

+static av_always_inline void
+yuv2rgb_2_vsx_template(SwsContext *c, const int16_t *buf[2],
+                       const int16_t *ubuf[2], const int16_t *vbuf[2],
+                       const int16_t *abuf[2], uint8_t *dest, int dstW,
+                       int yalpha, int uvalpha, int y,
+                       enum AVPixelFormat target, int hasAlpha)
+{
+    const int16_t *buf0  = buf[0],  *buf1  = buf[1],
+                  *ubuf0 = ubuf[0], *ubuf1 = ubuf[1],
+                  *vbuf0 = vbuf[0], *vbuf1 = vbuf[1],
+                  *abuf0 = hasAlpha ? abuf[0] : NULL,
+                  *abuf1 = hasAlpha ? abuf[1] : NULL;
+    const int16_t  yalpha1 = 4096 - yalpha;
+    const int16_t uvalpha1 = 4096 - uvalpha;
+    vector int16_t vy, vu, vv, A = vec_splat_s16(0);
+    vector int32_t vy32_l, vy32_r, vu32_l, vu32_r, vv32_l, vv32_r, tmp32;
+    vector int32_t R_l, R_r, G_l, G_r, B_l, B_r, vud32_l, vud32_r, vvd32_l, vvd32_r;
+    vector int32_t tmp, tmp2, tmp3, tmp4, tmp5, tmp6;
+    vector uint16_t rd16, gd16, bd16;
+    vector uint8_t rd, bd, gd, ad, out0, out1, tmp8;
+    const vector int16_t vyalpha1 = vec_splats(yalpha1);
+    const vector int16_t vuvalpha1 = vec_splats(uvalpha1);
+    const vector int16_t vyalpha = vec_splats((int16_t) yalpha);
+    const vector int16_t vuvalpha = vec_splats((int16_t) uvalpha);
+    const vector uint16_t zero16 = vec_splat_u16(0);
+    const vector int32_t y_offset = vec_splats(c->yuv2rgb_y_offset);
+    const vector int32_t y_coeff = vec_splats(c->yuv2rgb_y_coeff);
+    const vector int32_t y_add = vec_splats(1 << 21);
+    const vector int32_t v2r_coeff = vec_splats(c->yuv2rgb_v2r_coeff);
+    const vector int32_t v2g_coeff = vec_splats(c->yuv2rgb_v2g_coeff);
+    const vector int32_t u2g_coeff = vec_splats(c->yuv2rgb_u2g_coeff);
+    const vector int32_t u2b_coeff = vec_splats(c->yuv2rgb_u2b_coeff);
+    const vector int32_t rgbclip = vec_splats(1 << 30);
+    const vector int32_t zero32 = vec_splat_s32(0);
+    const vector uint32_t shift19 = vec_splats(19U);
+    const vector uint32_t shift22 = vec_splats(22U);
+    const vector uint32_t shift10 = vec_splat_u32(10);
+    const vector int32_t dec128 = vec_splats(128 << 19);
+    const vector int32_t add18 = vec_splats(1 << 18);
+    int i;
+
+    // Various permutations
+    const vector uint8_t doubleleft = (vector uint8_t) {0, 1, 2, 3,
+                                                        0, 1, 2, 3,
+                                                        4, 5, 6, 7,
+                                                        4, 5, 6, 7 };
+    const vector uint8_t doubleright = (vector uint8_t) {8, 9, 10, 11,
+                                                         8, 9, 10, 11,
+                                                         12, 13, 14, 15,
+                                                         12, 13, 14, 15 };
+    const vector uint8_t perm3rg0 = (vector uint8_t) {0x0, 0x10, 0,
+                                                      0x1, 0x11, 0,
+                                                      0x2, 0x12, 0,
+                                                      0x3, 0x13, 0,
+                                                      0x4, 0x14, 0,
+                                                      0x5 };
+    const vector uint8_t perm3rg1 = (vector uint8_t) {     0x15, 0,
+                                                      0x6, 0x16, 0,
+                                                      0x7, 0x17, 0