[FFmpeg-devel] [PATCH] configure: use -r, not -E, for sed

2018-11-15 Thread Lauri Kasanen
Old versions of sed do not support the -E option.

Signed-off-by: Lauri Kasanen 
---
 configure | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/configure b/configure
index b02b4cc..51f1227 100755
--- a/configure
+++ b/configure
@@ -3722,7 +3722,7 @@ find_things_extern(){
 find_filters_extern(){
 file=$source_path/$1
 #sed -n "s/^extern AVFilter 
ff_\([avfsinkrc]\{2,5\}\)_\(\w\+\);/\2_filter/p" $file
-sed -E -n "s/^extern AVFilter 
ff_([avfsinkrc]{2,5})_([a-zA-Z0-9_]+);/\2_filter/p" $file
+sed -r -n "s/^extern AVFilter 
ff_([avfsinkrc]{2,5})_([a-zA-Z0-9_]+);/\2_filter/p" $file
 }
 
 FILTER_LIST=$(find_filters_extern libavfilter/allfilters.c)
@@ -5188,7 +5188,7 @@ case $target_os in
 is_in -isysroot $ld $LDFLAGS  || check_ldflags  -isysroot 
$sysroot
 fi
 version_script='-exported_symbols_list'
-VERSION_SCRIPT_POSTPROCESS_CMD='tr " " "\n" | sed -n 
/global:/,/local:/p | grep ";" | tr ";" "\n" | sed -E "s/(.+)/_\1/g" | sed -E 
"s/(.+[^*])/\1*/"'
+VERSION_SCRIPT_POSTPROCESS_CMD='tr " " "\n" | sed -n 
/global:/,/local:/p | grep ";" | tr ";" "\n" | sed -r "s/(.+)/_\1/g" | sed -r 
"s/(.+[^*])/\1*/"'
 ;;
 msys*)
 die "Native MSYS builds are discouraged, please use the MINGW 
environment."
-- 
2.6.2

___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
http://ffmpeg.org/mailman/listinfo/ffmpeg-devel


Re: [FFmpeg-devel] [PATCH] configure: use -r, not -E, for sed

2018-11-16 Thread Lauri Kasanen
On Fri, 16 Nov 2018 22:36:16 +0100
Carl Eugen Hoyos  wrote:

> 2018-11-15 15:00 GMT+01:00, Lauri Kasanen :
> > Old versions of sed do not support the -E option.
> 
> > -VERSION_SCRIPT_POSTPROCESS_CMD='tr " " "\n" | sed -n
> > /global:/,/local:/p | grep ";" | tr ";" "\n" | sed -E "s/(.+)/_\1/g" | sed
> > -E "s/(.+[^*])/\1*/"'
> > +VERSION_SCRIPT_POSTPROCESS_CMD='tr " " "\n" | sed -n
> > /global:/,/local:/p | grep ";" | tr ";" "\n" | sed -r "s/(.+)/_\1/g" | sed
> > -r "s/(.+[^*])/\1*/"'
> 
> Could you try to replace the current command with one that
> needs neither "-E" nor "-r"?
> Your suggestion fixes antique Linux systems but not current
> non-Linux POSIX systems (and contradicts the documentation).

Regexes tend to be write-only. I'm not sure I can parse what that one
tries to do well enough to rewrite it as a basic RE that POSIX sed supports.

What do you mean by contradicts docs?

- Lauri
___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
http://ffmpeg.org/mailman/listinfo/ffmpeg-devel


[FFmpeg-devel] [PATCH v2] swscale/output: Altivec-optimize yuv2plane1_8

2018-11-17 Thread Lauri Kasanen
./ffmpeg_g -f rawvideo -pix_fmt rgb24 -s hd1080 -i /dev/zero -pix_fmt yuv420p \
-f null -vframes 100 -v error -nostats -

1158 UNITS in planar1,   65528 runs,  8 skips

-cpuflags 0

19082 UNITS in planar1,   65533 runs,  3 skips

16.48 speedup ratio. On x86, SSE2 is ~7. Curiously, the Power C version
takes as many cycles as the x86 SSE2 version, yikes it's fast.

Note that this function uses VSX instructions, but is not marked as such.
This is because several existing functions also make that mistake.
I'll submit a patch moving them once this is reviewed.

v2: Remove !BE check
Signed-off-by: Lauri Kasanen 
---
 libswscale/ppc/swscale_altivec.c | 53 
 1 file changed, 53 insertions(+)

diff --git a/libswscale/ppc/swscale_altivec.c b/libswscale/ppc/swscale_altivec.c
index 2fb2337..8c6056d 100644
--- a/libswscale/ppc/swscale_altivec.c
+++ b/libswscale/ppc/swscale_altivec.c
@@ -324,6 +324,53 @@ static void hScale_altivec_real(SwsContext *c, int16_t 
*dst, int dstW,
 }
 }
 }
+
+static void yuv2plane1_8_u(const int16_t *src, uint8_t *dest, int dstW,
+   const uint8_t *dither, int offset, int start)
+{
+int i;
+for (i = start; i < dstW; i++) {
+int val = (src[i] + dither[(i + offset) & 7]) >> 7;
+dest[i] = av_clip_uint8(val);
+}
+}
+
+static void yuv2plane1_8_altivec(const int16_t *src, uint8_t *dest, int dstW,
+   const uint8_t *dither, int offset)
+{
+const int dst_u = -(uintptr_t)dest & 15;
+int i, j;
+LOCAL_ALIGNED(16, int16_t, val, [16]);
+const vector uint16_t shifts = (vector uint16_t) {7, 7, 7, 7, 7, 7, 7, 7};
+vector int16_t vi, vileft, ditherleft, ditherright;
+vector uint8_t vd;
+
+for (j = 0; j < 16; j++) {
+val[j] = dither[(dst_u + offset + j) & 7];
+}
+
+ditherleft = vec_ld(0, val);
+ditherright = vec_ld(0, &val[8]);
+
+yuv2plane1_8_u(src, dest, dst_u, dither, offset, 0);
+
+for (i = dst_u; i < dstW - 15; i += 16) {
+
+vi = vec_vsx_ld(0, &src[i]);
+vi = vec_adds(ditherleft, vi);
+vileft = vec_sra(vi, shifts);
+
+vi = vec_vsx_ld(0, &src[i + 8]);
+vi = vec_adds(ditherright, vi);
+vi = vec_sra(vi, shifts);
+
+vd = vec_packsu(vileft, vi);
+vec_st(vd, 0, &dest[i]);
+}
+
+yuv2plane1_8_u(src, dest, dstW, dither, offset, i);
+}
+
 #endif /* HAVE_ALTIVEC */
 
 av_cold void ff_sws_init_swscale_ppc(SwsContext *c)
@@ -367,6 +414,12 @@ av_cold void ff_sws_init_swscale_ppc(SwsContext *c)
 c->yuv2packedX = ff_yuv2rgb24_X_altivec;
 break;
 }
+
+switch (c->dstBpc) {
+case 8:
+c->yuv2plane1 = yuv2plane1_8_altivec;
+break;
+}
 }
 #endif /* HAVE_ALTIVEC */
 }
-- 
2.6.2

___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
http://ffmpeg.org/mailman/listinfo/ffmpeg-devel


Re: [FFmpeg-devel] [PATCH] swscale/output: Altivec-optimize yuv2plane1_8

2018-11-17 Thread Lauri Kasanen
On Fri, 16 Nov 2018 22:09:25 +0100
Carl Eugen Hoyos  wrote:

> (This is less important atm, but I believe all functions currently
> in libswscale/ppc compile and run fine on - old - 32bit be hardware
> as your new function does.
> My completely inexperienced suspicion is that the instruction that
> you call "VSX" also exists on Altivec.)

Ref
http://gcc.gnu.org/onlinedocs/gcc/PowerPC-AltiVec-Built-in-Functions-Available-on-ISA-2_002e06.html#PowerPC-AltiVec-Built-in-Functions-Available-on-ISA-2_002e06

VSX functions such as vec_vsx_ld were added in ISA 2.06, aka POWER7.
They shouldn't compile on earlier PPC like Apple G4/G5. Is your machine
at least POWER7?

> I wanted to write that this hunk breaks compilation on big-endian
> (you should be able to test with "#if 0" instead of "#if !HAVE_BIGENDIAN")
> but the good news is that your patch works fine on big-endian,
> just remove the if-endif block. (Tested visually with lena on 32 and 64bit 
> be.)

Thanks, will do.

> Are you aware of the bounty that is offered for this task?
> https://trac.ffmpeg.org/ticket/5568
> (and #5569, #5570)

Yes, I admit that's why I started. Looking to make some extra, and
helping IBM is not a bad way to do so. I'm considering getting a Raptor
Blackbird when it comes out next year.

> There is a bug report about one altivec routine that works on
> big-endian but breaks the output visually on little-endian while
> many other functions work on both, could you have a look?
> https://trac.ffmpeg.org/ticket/7124

I'll try. This patch was my first time playing with Power vectors.

- Lauri
___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
http://ffmpeg.org/mailman/listinfo/ffmpeg-devel


Re: [FFmpeg-devel] [PATCH] swscale/output: VSX-optimize 9-16 bit yuv2planeX

2019-01-06 Thread Lauri Kasanen
On Sun, 6 Jan 2019 13:23:43 +0100
Carl Eugen Hoyos  wrote:

> 2019-01-04 20:43 GMT+01:00, Lauri Kasanen :
> > +#ifdef __POWER8_VECTOR__
> 
> If this is correct, I assume it fixes a bug in the current code
> and should be a separate patch, no?
> 
> >  case 16:
> >  c->yuv2plane1 = isBE(dstFormat) ? yuv2plane1_16BE_vsx  :
> > yuv2plane1_16LE_vsx;
> > +c->yuv2planeX = isBE(dstFormat) ? yuv2planeX_16BE_vsx  :
> > yuv2planeX_16LE_vsx;
> >  break;
> > -#endif
> > +#endif /* __POWER8_VECTOR__ */

These mails do tend to get long with so many bench results, but that was
covered:
> The existing VSX yuv2plane1 is also ifdefed out for POWER7, even though it 
> works there.
> This is for cleanliness mainly, separating the macros would be a bit
> uglier. If we have POWER7 users who need that one, please speak up.

- Lauri
___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
http://ffmpeg.org/mailman/listinfo/ffmpeg-devel


Re: [FFmpeg-devel] [PATCH] avutil/ppc/cpu: Fix power8 linux detection

2019-01-16 Thread Lauri Kasanen
On Tue, 8 Jan 2019 11:08:04 +0200
Lauri Kasanen  wrote:

> The existing code was in no released kernel that I can see. The corrected code
> was added in 3.9.
> 
> Signed-off-by: Lauri Kasanen 
> ---
>  libavutil/ppc/cpu.c | 10 +-
>  1 file changed, 5 insertions(+), 5 deletions(-)

Ping.

- Lauri
___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
http://ffmpeg.org/mailman/listinfo/ffmpeg-devel


[FFmpeg-devel] [PATCH] swscale/output: VSX-optimize 16-bit yuv2plane1

2018-12-13 Thread Lauri Kasanen
./ffmpeg_g -f rawvideo -pix_fmt rgb24 -s hd1080 -i /dev/zero -pix_fmt 
yuv420p16le \
-f null -vframes 100 -v error -nostats -

19157 UNITS in planar1,   65512 runs, 24 skips

-cpuflags 0

2120 UNITS in planar1,   65393 runs,143 skips

9.03632 speedup, 16be similarly.

Fate passes, each format tested with an image to video conversion.

Signed-off-by: Lauri Kasanen 
---
 libswscale/ppc/swscale_vsx.c | 59 
 1 file changed, 59 insertions(+)

diff --git a/libswscale/ppc/swscale_vsx.c b/libswscale/ppc/swscale_vsx.c
index 6462c11..70da6ae 100644
--- a/libswscale/ppc/swscale_vsx.c
+++ b/libswscale/ppc/swscale_vsx.c
@@ -180,6 +180,60 @@ static void yuv2plane1_nbps_vsx(const int16_t *src, 
uint16_t *dest, int dstW,
 yuv2plane1_nbps_u(src, dest, dstW, big_endian, output_bits, i);
 }
 
+#undef output_pixel
+
+#define output_pixel(pos, val, bias, signedness) \
+if (big_endian) { \
+AV_WB16(pos, bias + av_clip_ ## signedness ## 16(val >> shift)); \
+} else { \
+AV_WL16(pos, bias + av_clip_ ## signedness ## 16(val >> shift)); \
+}
+
+static void yuv2plane1_16_u(const int32_t *src, uint16_t *dest, int dstW,
+  int big_endian, int output_bits, int start)
+{
+int i;
+const int shift = 3;
+
+for (i = start; i < dstW; i++) {
+int val = src[i] + (1 << (shift - 1));
+output_pixel(&dest[i], val, 0, uint);
+}
+}
+
+static void yuv2plane1_16_vsx(const int32_t *src, uint16_t *dest, int dstW,
+   int big_endian, int output_bits)
+{
+const int dst_u = -(uintptr_t)dest & 7;
+const int shift = 3;
+const int add = (1 << (shift - 1));
+const vector uint32_t vadd = (vector uint32_t) {add, add, add, add};
+const vector uint16_t vswap = (vector uint16_t) vec_splat_u16(big_endian ? 
8 : 0);
+const vector uint32_t vshift = (vector uint32_t) vec_splat_u32(shift);
+vector uint32_t v, v2;
+vector uint16_t vd;
+int i;
+
+yuv2plane1_16_u(src, dest, dst_u, big_endian, output_bits, 0);
+
+for (i = dst_u; i < dstW - 7; i += 8) {
+v = vec_vsx_ld(0, (const uint32_t *) &src[i]);
+v = vec_add(v, vadd);
+v = vec_sr(v, vshift);
+
+v2 = vec_vsx_ld(0, (const uint32_t *) &src[i + 4]);
+v2 = vec_add(v2, vadd);
+v2 = vec_sr(v2, vshift);
+
+vd = vec_packsu(v, v2);
+vd = vec_rl(vd, vswap);
+
+vec_st(vd, 0, &dest[i]);
+}
+
+yuv2plane1_16_u(src, dest, dstW, big_endian, output_bits, i);
+}
+
 #define yuv2NBPS(bits, BE_LE, is_be, template_size, typeX_t) \
 static void yuv2plane1_ ## bits ## BE_LE ## _vsx(const int16_t *src, \
  uint8_t *dest, int dstW, \
@@ -197,6 +251,8 @@ yuv2NBPS(12, BE, 1, nbps, int16_t)
 yuv2NBPS(12, LE, 0, nbps, int16_t)
 yuv2NBPS(14, BE, 1, nbps, int16_t)
 yuv2NBPS(14, LE, 0, nbps, int16_t)
+yuv2NBPS(16, BE, 1, 16, int32_t)
+yuv2NBPS(16, LE, 0, 16, int32_t)
 
 #endif /* !HAVE_BIGENDIAN */
 
@@ -240,6 +296,9 @@ av_cold void ff_sws_init_swscale_vsx(SwsContext *c)
 case 14:
 c->yuv2plane1 = isBE(dstFormat) ? yuv2plane1_14BE_vsx  : 
yuv2plane1_14LE_vsx;
 break;
+case 16:
+c->yuv2plane1 = isBE(dstFormat) ? yuv2plane1_16BE_vsx  : 
yuv2plane1_16LE_vsx;
+break;
 #endif
 }
 }
-- 
2.6.2

___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
http://ffmpeg.org/mailman/listinfo/ffmpeg-devel


[FFmpeg-devel] [PATCH v2] swscale/output: VSX-optimize 16-bit yuv2plane1

2018-12-13 Thread Lauri Kasanen
./ffmpeg_g -f rawvideo -pix_fmt rgb24 -s hd1080 -i /dev/zero -pix_fmt 
yuv420p16le \
-f null -vframes 100 -v error -nostats -

2120 UNITS in planar1,   65393 runs,143 skips

-cpuflags 0

19157 UNITS in planar1,   65512 runs, 24 skips

9.03632 speedup, 16be similarly.

Fate passes, each format tested with an image to video conversion.

Signed-off-by: Lauri Kasanen 
---

v2: Copy-pasted rows were flipped.

 libswscale/ppc/swscale_vsx.c | 59 
 1 file changed, 59 insertions(+)

diff --git a/libswscale/ppc/swscale_vsx.c b/libswscale/ppc/swscale_vsx.c
index 6462c11..70da6ae 100644
--- a/libswscale/ppc/swscale_vsx.c
+++ b/libswscale/ppc/swscale_vsx.c
@@ -180,6 +180,60 @@ static void yuv2plane1_nbps_vsx(const int16_t *src, 
uint16_t *dest, int dstW,
 yuv2plane1_nbps_u(src, dest, dstW, big_endian, output_bits, i);
 }
 
+#undef output_pixel
+
+#define output_pixel(pos, val, bias, signedness) \
+if (big_endian) { \
+AV_WB16(pos, bias + av_clip_ ## signedness ## 16(val >> shift)); \
+} else { \
+AV_WL16(pos, bias + av_clip_ ## signedness ## 16(val >> shift)); \
+}
+
+static void yuv2plane1_16_u(const int32_t *src, uint16_t *dest, int dstW,
+  int big_endian, int output_bits, int start)
+{
+int i;
+const int shift = 3;
+
+for (i = start; i < dstW; i++) {
+int val = src[i] + (1 << (shift - 1));
+output_pixel(&dest[i], val, 0, uint);
+}
+}
+
+static void yuv2plane1_16_vsx(const int32_t *src, uint16_t *dest, int dstW,
+   int big_endian, int output_bits)
+{
+const int dst_u = -(uintptr_t)dest & 7;
+const int shift = 3;
+const int add = (1 << (shift - 1));
+const vector uint32_t vadd = (vector uint32_t) {add, add, add, add};
+const vector uint16_t vswap = (vector uint16_t) vec_splat_u16(big_endian ? 
8 : 0);
+const vector uint32_t vshift = (vector uint32_t) vec_splat_u32(shift);
+vector uint32_t v, v2;
+vector uint16_t vd;
+int i;
+
+yuv2plane1_16_u(src, dest, dst_u, big_endian, output_bits, 0);
+
+for (i = dst_u; i < dstW - 7; i += 8) {
+v = vec_vsx_ld(0, (const uint32_t *) &src[i]);
+v = vec_add(v, vadd);
+v = vec_sr(v, vshift);
+
+v2 = vec_vsx_ld(0, (const uint32_t *) &src[i + 4]);
+v2 = vec_add(v2, vadd);
+v2 = vec_sr(v2, vshift);
+
+vd = vec_packsu(v, v2);
+vd = vec_rl(vd, vswap);
+
+vec_st(vd, 0, &dest[i]);
+}
+
+yuv2plane1_16_u(src, dest, dstW, big_endian, output_bits, i);
+}
+
 #define yuv2NBPS(bits, BE_LE, is_be, template_size, typeX_t) \
 static void yuv2plane1_ ## bits ## BE_LE ## _vsx(const int16_t *src, \
  uint8_t *dest, int dstW, \
@@ -197,6 +251,8 @@ yuv2NBPS(12, BE, 1, nbps, int16_t)
 yuv2NBPS(12, LE, 0, nbps, int16_t)
 yuv2NBPS(14, BE, 1, nbps, int16_t)
 yuv2NBPS(14, LE, 0, nbps, int16_t)
+yuv2NBPS(16, BE, 1, 16, int32_t)
+yuv2NBPS(16, LE, 0, 16, int32_t)
 
 #endif /* !HAVE_BIGENDIAN */
 
@@ -240,6 +296,9 @@ av_cold void ff_sws_init_swscale_vsx(SwsContext *c)
 case 14:
 c->yuv2plane1 = isBE(dstFormat) ? yuv2plane1_14BE_vsx  : 
yuv2plane1_14LE_vsx;
 break;
+case 16:
+c->yuv2plane1 = isBE(dstFormat) ? yuv2plane1_16BE_vsx  : 
yuv2plane1_16LE_vsx;
+break;
 #endif
 }
 }
-- 
2.6.2

___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
http://ffmpeg.org/mailman/listinfo/ffmpeg-devel


Re: [FFmpeg-devel] [PATCH] swscale/output: Altivec-optimize float yuv2plane1

2018-12-16 Thread Lauri Kasanen
On Sun, 16 Dec 2018 00:22:00 +0100
Michael Niedermayer  wrote:

> On Sat, Dec 15, 2018 at 06:32:31PM +0200, Lauri Kasanen wrote:
> > Tested on POWER8 LE. Testing on earlier ppc and/or BE appreciated.
> > 
> >  libswscale/ppc/swscale_altivec.c | 139 
> > ++-
> >  1 file changed, 137 insertions(+), 2 deletions(-)
> 
> breaks build:
> src/libswscale/ppc/swscale_altivec.c: In function ‘yuv2plane1_float_altivec’:
> src/libswscale/ppc/swscale_altivec.c:158:80: error: expected declaration 
> specifiers or ‘...’ before ‘(’ token
>  const vector float vzero = (vector float) {0, 0, 0, 0};

Thanks for testing. I missed the vzero define at the top, I wonder why
my gcc did not break. Patch v2 coming.

- Lauri
___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
http://ffmpeg.org/mailman/listinfo/ffmpeg-devel


[FFmpeg-devel] [PATCH v2] swscale/output: Altivec-optimize float yuv2plane1

2018-12-16 Thread Lauri Kasanen
This function wouldn't benefit from VSX instructions, so I put it
under altivec.

./ffmpeg_g -f rawvideo -pix_fmt rgb24 -s hd1080 -i /dev/zero -pix_fmt grayf32le 
\
-f null -vframes 100 -v error -nostats -

3743 UNITS in planar1,   65495 runs, 41 skips

-cpuflags 0

23511 UNITS in planar1,   65530 runs,  6 skips

grayf32be

4647 UNITS in planar1,   65449 runs, 87 skips

-cpuflags 0

28608 UNITS in planar1,   65530 runs,  6 skips

The native speedup is 6.28133, and the bswapping one 6.15623.
Fate passes, each format tested with an image to video conversion.

Signed-off-by: Lauri Kasanen 
---

Tested on POWER8 LE. Testing on earlier ppc and/or BE appreciated.

v2: Added #undef vzero, that define broke the build on older gcc. Thanks Michael

 libswscale/ppc/swscale_altivec.c | 141 ++-
 1 file changed, 139 insertions(+), 2 deletions(-)

diff --git a/libswscale/ppc/swscale_altivec.c b/libswscale/ppc/swscale_altivec.c
index 1d2b2fa..d72ed1e 100644
--- a/libswscale/ppc/swscale_altivec.c
+++ b/libswscale/ppc/swscale_altivec.c
@@ -31,7 +31,8 @@
 #include "yuv2rgb_altivec.h"
 #include "libavutil/ppc/util_altivec.h"
 
-#if HAVE_ALTIVEC && HAVE_BIGENDIAN
+#if HAVE_ALTIVEC
+#if HAVE_BIGENDIAN
 #define vzero vec_splat_s32(0)
 
 #define  GET_LS(a,b,c,s) {\
@@ -102,7 +103,137 @@
 #include "swscale_ppc_template.c"
 #undef FUNC
 
-#endif /* HAVE_ALTIVEC && HAVE_BIGENDIAN */
+#undef vzero
+
+#endif /* HAVE_BIGENDIAN */
+
+#define output_pixel(pos, val, bias, signedness) \
+if (big_endian) { \
+AV_WB16(pos, bias + av_clip_ ## signedness ## 16(val >> shift)); \
+} else { \
+AV_WL16(pos, bias + av_clip_ ## signedness ## 16(val >> shift)); \
+}
+
+static void
+yuv2plane1_float_u(const int32_t *src, float *dest, int dstW, int start)
+{
+static const int big_endian = HAVE_BIGENDIAN;
+static const int shift = 3;
+static const float float_mult = 1.0f / 65535.0f;
+int i, val;
+uint16_t val_uint;
+
+for (i = start; i < dstW; ++i){
+val = src[i] + (1 << (shift - 1));
+output_pixel(&val_uint, val, 0, uint);
+dest[i] = float_mult * (float)val_uint;
+}
+}
+
+static void
+yuv2plane1_float_bswap_u(const int32_t *src, uint32_t *dest, int dstW, int 
start)
+{
+static const int big_endian = HAVE_BIGENDIAN;
+static const int shift = 3;
+static const float float_mult = 1.0f / 65535.0f;
+int i, val;
+uint16_t val_uint;
+
+for (i = start; i < dstW; ++i){
+val = src[i] + (1 << (shift - 1));
+output_pixel(&val_uint, val, 0, uint);
+dest[i] = av_bswap32(av_float2int(float_mult * (float)val_uint));
+}
+}
+
+static void yuv2plane1_float_altivec(const int32_t *src, float *dest, int dstW)
+{
+const int dst_u = -(uintptr_t)dest & 3;
+const int shift = 3;
+const int add = (1 << (shift - 1));
+const int clip = (1 << 16) - 1;
+const float fmult = 1.0f / 65535.0f;
+const vector uint32_t vadd = (vector uint32_t) {add, add, add, add};
+const vector uint32_t vshift = (vector uint32_t) vec_splat_u32(shift);
+const vector uint32_t vlargest = (vector uint32_t) {clip, clip, clip, 
clip};
+const vector float vmul = (vector float) {fmult, fmult, fmult, fmult};
+const vector float vzero = (vector float) {0, 0, 0, 0};
+vector uint32_t v;
+vector float vd;
+int i;
+
+yuv2plane1_float_u(src, dest, dst_u, 0);
+
+for (i = dst_u; i < dstW - 3; i += 4) {
+v = vec_ld(0, (const uint32_t *) &src[i]);
+v = vec_add(v, vadd);
+v = vec_sr(v, vshift);
+v = vec_min(v, vlargest);
+
+vd = vec_ctf(v, 0);
+vd = vec_madd(vd, vmul, vzero);
+
+vec_st(vd, 0, &dest[i]);
+}
+
+yuv2plane1_float_u(src, dest, dstW, i);
+}
+
+static void yuv2plane1_float_bswap_altivec(const int32_t *src, uint32_t *dest, 
int dstW)
+{
+const int dst_u = -(uintptr_t)dest & 3;
+const int shift = 3;
+const int add = (1 << (shift - 1));
+const int clip = (1 << 16) - 1;
+const float fmult = 1.0f / 65535.0f;
+const vector uint32_t vadd = (vector uint32_t) {add, add, add, add};
+const vector uint32_t vshift = (vector uint32_t) vec_splat_u32(shift);
+const vector uint32_t vlargest = (vector uint32_t) {clip, clip, clip, 
clip};
+const vector float vmul = (vector float) {fmult, fmult, fmult, fmult};
+const vector float vzero = (vector float) {0, 0, 0, 0};
+const vector uint32_t vswapbig = (vector uint32_t) {16, 16, 16, 16};
+const vector uint16_t vswapsmall = vec_splat_u16(8);
+vector uint32_t v;
+vector float vd;
+int i;
+
+yuv2plane1_float_bswap_u(src, dest, dst_u, 0);
+
+for (i = dst_u; i < dstW - 3; i += 4) {
+v = vec_ld(0, (const uint32_t *) &src[i]);
+v = vec_add(v, vadd);
+v = vec_sr(v, vshift);
+v = vec_min(v, vlargest);
+
+

Re: [FFmpeg-devel] [PATCH] swscale/ppc: Move VSX-using code to its own file

2018-12-10 Thread Lauri Kasanen
On Thu, 6 Dec 2018 21:47:18 +0100
Michael Niedermayer  wrote:

> On Tue, Dec 04, 2018 at 02:27:22PM +0100, Michael Niedermayer wrote:
> > > > > On Mon, Dec 03, 2018 at 09:24:47AM +0200, Lauri Kasanen wrote:
> > > > > > Also ping on "swscale/output: VSX-optimize
> > > > > > nbps yuv2plane1".
> > > > > 
> > > > > This IIUC has not been tested on BE yet
> > > > > 
> > > > > my ppc emulation setup is a bit broken and my ppc hw ive not tried 
> > > > > using
> > > > > since years and it was not in good shape last i used it.
> > > > > So i cant just quickly test this ...
> > these are more suggestions than i expected :)
> > but i just got cross build working again and i also just eliminated a
> > mysterious ld.so related segfault
> > ATM iam re rerunning fate with a freshly rebuilt qemu
> > (the past one had an issue with altivec)
> 
> i have cross build with ppc and qemu partly working
> but it appears gcc or something is just buggy

Hi,

Carl Eugen Hoyos reported that it builds fine on BE, the guards being
in correct place not to affect BE. How are things on your side?

- Lauri
___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
http://ffmpeg.org/mailman/listinfo/ffmpeg-devel


Re: [FFmpeg-devel] [PATCH v2] swscale/output: Altivec-optimize float yuv2plane1

2018-12-16 Thread Lauri Kasanen
On Mon, 17 Dec 2018 01:03:36 +0100
Carl Eugen Hoyos  wrote:

> 2018-12-16 10:06 GMT+01:00, Lauri Kasanen :
> > This function wouldn't benefit from VSX instructions, so I put it
> > under altivec.
> >
> > ./ffmpeg_g -f rawvideo -pix_fmt rgb24 -s hd1080 -i /dev/zero -pix_fmt
> > grayf32le \
> > -f null -vframes 100 -v error -nostats -
> >
> > 3743 UNITS in planar1,   65495 runs, 41 skips
> >
> > -cpuflags 0
> >
> > 23511 UNITS in planar1,   65530 runs,  6 skips
> >
> > grayf32be
> >
> > 4647 UNITS in planar1,   65449 runs, 87 skips
> >
> > -cpuflags 0
> >
> > 28608 UNITS in planar1,   65530 runs,  6 skips
> >
> > The native speedup is 6.28133, and the bswapping one 6.15623.
> 
> > Fate passes
> 
> I wonder a little how, given that grayf32 already breaks fate as-is...

Are the tests for it disabled? fate.ffmpeg.org reports 100% success for
many platforms.

> Note that this function / this pix_fmt currently has no real use-case
> afaict.

Is there a list of which pix fmts are useful? Of course I don't want to
waste both my and reviewers' time, if the format is considered for
removal or otherwise broken.

- Lauri
___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
http://ffmpeg.org/mailman/listinfo/ffmpeg-devel


Re: [FFmpeg-devel] [PATCH v2] swscale/output: Altivec-optimize float yuv2plane1

2018-12-17 Thread Lauri Kasanen
On Mon, 17 Dec 2018 14:52:49 +0100
Carl Eugen Hoyos  wrote:

> >> Note that this function / this pix_fmt currently has no real use-case
> >> afaict.
> >
> > Is there a list of which pix fmts are useful? Of course I don't want to
> > waste both my and reviewers' time, if the format is considered for
> > removal or otherwise broken.
> 
> The pix_fmt is not deprecated (it's new), what I meant was that it is
> currently only used for obscure monochrome Photoshop images
> and one filter, so I am not sure optimizing this colour conversion
> will help often.

Oh, thanks for the clarification. I'm going roughly in difficulty
order, doing the easy functions first.

- Lauri
___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
http://ffmpeg.org/mailman/listinfo/ffmpeg-devel


Re: [FFmpeg-devel] [PATCH v2] swscale/output: Altivec-optimize float yuv2plane1

2018-12-24 Thread Lauri Kasanen
On Sun, 16 Dec 2018 11:06:53 +0200
Lauri Kasanen  wrote:

> This function wouldn't benefit from VSX instructions, so I put it
> under altivec.
> 
> ./ffmpeg_g -f rawvideo -pix_fmt rgb24 -s hd1080 -i /dev/zero -pix_fmt 
> grayf32le \
> -f null -vframes 100 -v error -nostats -
> 
> 3743 UNITS in planar1,   65495 runs, 41 skips
> 
> -cpuflags 0
> 
> 23511 UNITS in planar1,   65530 runs,  6 skips
> 
> grayf32be
> 
> 4647 UNITS in planar1,   65449 runs, 87 skips
> 
> -cpuflags 0
> 
> 28608 UNITS in planar1,   65530 runs,  6 skips
> 
> The native speedup is 6.28133, and the bswapping one 6.15623.
> Fate passes, each format tested with an image to video conversion.
> 
> Signed-off-by: Lauri Kasanen 
> ---
> 
> Tested on POWER8 LE. Testing on earlier ppc and/or BE appreciated.
> 
> v2: Added #undef vzero, that define broke the build on older gcc. Thanks 
> Michael

Ping. And of course it's not gcc version dependent, but rather it was
the BE ifdef; it was too early in the morning.

- Lauri
___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
http://ffmpeg.org/mailman/listinfo/ffmpeg-devel


[FFmpeg-devel] [PATCH] swscale/output: Altivec-optimize float yuv2plane1

2018-12-15 Thread Lauri Kasanen
This function wouldn't benefit from VSX instructions, and input
and output share alignment, so I put it under altivec.

./ffmpeg_g -f rawvideo -pix_fmt rgb24 -s hd1080 -i /dev/zero -pix_fmt grayf32le 
\
-f null -vframes 100 -v error -nostats -

3743 UNITS in planar1,   65495 runs, 41 skips

-cpuflags 0

23511 UNITS in planar1,   65530 runs,  6 skips

grayf32be

4647 UNITS in planar1,   65449 runs, 87 skips

-cpuflags 0

28608 UNITS in planar1,   65530 runs,  6 skips

The native speedup is 6.28133, and the bswapping one 6.15623.
Fate passes, each format tested with an image to video conversion.

Signed-off-by: Lauri Kasanen 
---

Tested on POWER8 LE. Testing on earlier ppc and/or BE appreciated.

 libswscale/ppc/swscale_altivec.c | 139 ++-
 1 file changed, 137 insertions(+), 2 deletions(-)

diff --git a/libswscale/ppc/swscale_altivec.c b/libswscale/ppc/swscale_altivec.c
index 1d2b2fa..2ef5257 100644
--- a/libswscale/ppc/swscale_altivec.c
+++ b/libswscale/ppc/swscale_altivec.c
@@ -31,7 +31,8 @@
 #include "yuv2rgb_altivec.h"
 #include "libavutil/ppc/util_altivec.h"
 
-#if HAVE_ALTIVEC && HAVE_BIGENDIAN
+#if HAVE_ALTIVEC
+#if HAVE_BIGENDIAN
 #define vzero vec_splat_s32(0)
 
 #define  GET_LS(a,b,c,s) {\
@@ -102,7 +103,135 @@
 #include "swscale_ppc_template.c"
 #undef FUNC
 
-#endif /* HAVE_ALTIVEC && HAVE_BIGENDIAN */
+#endif /* HAVE_BIGENDIAN */
+
+#define output_pixel(pos, val, bias, signedness) \
+if (big_endian) { \
+AV_WB16(pos, bias + av_clip_ ## signedness ## 16(val >> shift)); \
+} else { \
+AV_WL16(pos, bias + av_clip_ ## signedness ## 16(val >> shift)); \
+}
+
+static void
+yuv2plane1_float_u(const int32_t *src, float *dest, int dstW, int start)
+{
+static const int big_endian = HAVE_BIGENDIAN;
+static const int shift = 3;
+static const float float_mult = 1.0f / 65535.0f;
+int i, val;
+uint16_t val_uint;
+
+for (i = start; i < dstW; ++i){
+val = src[i] + (1 << (shift - 1));
+output_pixel(&val_uint, val, 0, uint);
+dest[i] = float_mult * (float)val_uint;
+}
+}
+
+static void
+yuv2plane1_float_bswap_u(const int32_t *src, uint32_t *dest, int dstW, int 
start)
+{
+static const int big_endian = HAVE_BIGENDIAN;
+static const int shift = 3;
+static const float float_mult = 1.0f / 65535.0f;
+int i, val;
+uint16_t val_uint;
+
+for (i = start; i < dstW; ++i){
+val = src[i] + (1 << (shift - 1));
+output_pixel(&val_uint, val, 0, uint);
+dest[i] = av_bswap32(av_float2int(float_mult * (float)val_uint));
+}
+}
+
+static void yuv2plane1_float_altivec(const int32_t *src, float *dest, int dstW)
+{
+const int dst_u = -(uintptr_t)dest & 3;
+const int shift = 3;
+const int add = (1 << (shift - 1));
+const int clip = (1 << 16) - 1;
+const float fmult = 1.0f / 65535.0f;
+const vector uint32_t vadd = (vector uint32_t) {add, add, add, add};
+const vector uint32_t vshift = (vector uint32_t) vec_splat_u32(shift);
+const vector uint32_t vlargest = (vector uint32_t) {clip, clip, clip, 
clip};
+const vector float vmul = (vector float) {fmult, fmult, fmult, fmult};
+const vector float vzero = (vector float) {0, 0, 0, 0};
+vector uint32_t v;
+vector float vd;
+int i;
+
+yuv2plane1_float_u(src, dest, dst_u, 0);
+
+for (i = dst_u; i < dstW - 3; i += 4) {
+v = vec_ld(0, (const uint32_t *) &src[i]);
+v = vec_add(v, vadd);
+v = vec_sr(v, vshift);
+v = vec_min(v, vlargest);
+
+vd = vec_ctf(v, 0);
+vd = vec_madd(vd, vmul, vzero);
+
+vec_st(vd, 0, &dest[i]);
+}
+
+yuv2plane1_float_u(src, dest, dstW, i);
+}
+
+static void yuv2plane1_float_bswap_altivec(const int32_t *src, uint32_t *dest, 
int dstW)
+{
+const int dst_u = -(uintptr_t)dest & 3;
+const int shift = 3;
+const int add = (1 << (shift - 1));
+const int clip = (1 << 16) - 1;
+const float fmult = 1.0f / 65535.0f;
+const vector uint32_t vadd = (vector uint32_t) {add, add, add, add};
+const vector uint32_t vshift = (vector uint32_t) vec_splat_u32(shift);
+const vector uint32_t vlargest = (vector uint32_t) {clip, clip, clip, 
clip};
+const vector float vmul = (vector float) {fmult, fmult, fmult, fmult};
+const vector float vzero = (vector float) {0, 0, 0, 0};
+const vector uint32_t vswapbig = (vector uint32_t) {16, 16, 16, 16};
+const vector uint16_t vswapsmall = vec_splat_u16(8);
+vector uint32_t v;
+vector float vd;
+int i;
+
+yuv2plane1_float_bswap_u(src, dest, dst_u, 0);
+
+for (i = dst_u; i < dstW - 3; i += 4) {
+v = vec_ld(0, (const uint32_t *) &src[i]);
+v = vec_add(v, vadd);
+v = vec_sr(v, vshift);
+v = vec_min(v, vlargest);
+
+vd = vec_ctf(v, 0);
+vd = vec_madd(vd, vmul, 

Re: [FFmpeg-devel] [PATCH] swscale/output: VSX-optimize nbps yuv2plane1

2018-12-07 Thread Lauri Kasanen
On Fri, 7 Dec 2018 13:50:12 +0100
Carl Eugen Hoyos  wrote:

> > Carl Eugen Hoyos  wrote:
> >> 2018-11-27 14:26 GMT+01:00, Lauri Kasanen :
> >> > Fate passes, each format tested with an image to video conversion.
> >> >
> >> > Depends on "swscale/ppc: Move VSX-using code to its own file".
> >>
> >> > Only tested on LE.
> >>
> >> This patch breaks output on BE, tested with fate-v410enc and:
> >> $ ffmpeg -i fate-suite/lena.pnm -pix_fmt yuv420p10 -vcodec ffv1 out.nut
> >
> > Just checking, was that with the !BE guards removed?
> 
> Correct, sorry for being unclear.
> 
> > Otherwise I don't see how it could affect BE?
> 
> Yes.

Okay, so it otherwise didn't affect BE. Can it be applied, or is BE a
requirement? This is a simple function, and I can guess how to change
it, but for future, more complex functions I would rather not
try blindly.

LE is the common case for newer POWER really, many distros don't even
support BE.

- Lauri
___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
http://ffmpeg.org/mailman/listinfo/ffmpeg-devel


Re: [FFmpeg-devel] [PATCH] swscale/ppc: Move VSX-using code to its own file

2018-12-03 Thread Lauri Kasanen
On Tue, 4 Dec 2018 03:21:30 +0100
Michael Niedermayer  wrote:

> On Mon, Dec 03, 2018 at 09:24:47AM +0200, Lauri Kasanen wrote:
> > Also ping on "swscale/output: VSX-optimize
> > nbps yuv2plane1".
> 
> This IIUC has not been tested on BE yet
> 
> my ppc emulation setup is a bit broken, and I have not tried using my ppc hw
> in years; it was not in good shape when I last used it.
> So I can't just quickly test this ...

Raptor offers free POWER9 VMs to open source projects. Since you're the
leader of ffmpeg, if you asked, I'm sure they'd give one or two for
ffmpeg build and fate testing.

Ref
https://mobile.twitter.com/RaptorCompSys/status/1067018060777832449?p=v
https://mobile.twitter.com/RaptorCompSys/status/1067029086273486848?p=v

"We offer free access to cloud VPS for libre software projects in
partnership with @Integricloud, would that help?"

"Contact sa...@integricloud.com and tell them what you want to use a
VPS or two for. They will generally grant access to the resources."

(I'm developing on a POWER8 VM intended for devs, but ordered a
Blackbird from the cyber monday sale ;))

- Lauri
___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
http://ffmpeg.org/mailman/listinfo/ffmpeg-devel


Re: [FFmpeg-devel] [PATCH] swscale/output: VSX-optimize nbps yuv2plane1

2018-12-06 Thread Lauri Kasanen
On Thu, 6 Dec 2018 22:36:01 +0100
Carl Eugen Hoyos  wrote:

> 2018-11-27 14:26 GMT+01:00, Lauri Kasanen :
> > ./ffmpeg_g -f rawvideo -pix_fmt rgb24 -s hd1080 -i /dev/zero -pix_fmt
> > yuv420p9le \
> > -f null -vframes 100 -v error -nostats -
> >
> > Speedups:
> > yuv2plane1_9BE_vsx  11.2042
> > yuv2plane1_9LE_vsx  11.156
> > yuv2plane1_10BE_vsx 9.89428
> > yuv2plane1_10LE_vsx 10.3637
> > yuv2plane1_12BE_vsx 9.71923
> > yuv2plane1_12LE_vsx 11.0404
> > yuv2plane1_14BE_vsx 10.1763
> > yuv2plane1_14LE_vsx 11.2728
> >
> > Fate passes, each format tested with an image to video conversion.
> >
> > Depends on "swscale/ppc: Move VSX-using code to its own file".
> 
> > Only tested on LE.
> 
> This patch breaks output on BE, tested with fate-v410enc and:
> $ ffmpeg -i fate-suite/lena.pnm -pix_fmt yuv420p10 -vcodec ffv1 out.nut

Just checking, was that with the !BE guards removed? Otherwise I don't
see how it could affect BE?

- Lauri
___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
http://ffmpeg.org/mailman/listinfo/ffmpeg-devel


Re: [FFmpeg-devel] [PATCH] swscale/ppc: Move VSX-using code to its own file

2018-11-29 Thread Lauri Kasanen
On Mon, 26 Nov 2018 14:24:15 +0200
Lauri Kasanen  wrote:

> Passes fate on LE (with "lavc/jrevdct: Avoid an aliasing violation" applied). 
> Can anyone test BE?

Ping.

- Lauri
___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
http://ffmpeg.org/mailman/listinfo/ffmpeg-devel


Re: [FFmpeg-devel] [PATCH] swscale/ppc: Move VSX-using code to its own file

2018-11-30 Thread Lauri Kasanen
On Fri, 30 Nov 2018 12:30:58 +0300
Michael Kostylev  wrote:

> 
> >> Passes fate on LE (with "lavc/jrevdct: Avoid an aliasing violation" 
> >> applied). Can anyone test BE?
> >
> > Ping.
> 
> FATE becomes green as much as possible, I haven't performed any benchmarking 
> though.

Thanks for testing. This patch is not expected to change performance,
it's just moving functions around and putting them under proper VSX
guards.

- Lauri

PS: Your mail did not make it to the list, was it meant for me only?
___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
http://ffmpeg.org/mailman/listinfo/ffmpeg-devel


Re: [FFmpeg-devel] [PATCH v2] swscale/output: Altivec-optimize yuv2plane1_8

2018-11-23 Thread Lauri Kasanen
On Fri, 23 Nov 2018 03:26:50 +0100
Michael Niedermayer  wrote:

> On Wed, Nov 21, 2018 at 07:19:45PM +0200, Lauri Kasanen wrote:
> > On Wed, 21 Nov 2018 17:22:36 +0100
> > Michael Niedermayer  wrote:
> > > the full fate tests must be run, many of these tests use swscale without
> > > having "scale" in their name
> > > and yes on lower end hardware 20min and longer is possible
> > 
> > I get failures on the baseline, without my patch. What is the procedure
> > here? Is there a var to skip those tests, or?
> 
> procedure ?
> First i try to convince you to attempt to fix some of these failures ;)
> because well, everyone would benefit if they are fixed ...

I mean, if my patch adds no failures, is that enough to apply it?

- Lauri
___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
http://ffmpeg.org/mailman/listinfo/ffmpeg-devel


Re: [FFmpeg-devel] [PATCH v2] swscale/output: Altivec-optimize yuv2plane1_8

2018-11-24 Thread Lauri Kasanen
On Fri, 23 Nov 2018 23:01:02 +0100
Michael Niedermayer  wrote:

> On Fri, Nov 23, 2018 at 10:38:13AM +0200, Lauri Kasanen wrote:
> > I mean, if my patch adds no failures, is that enough to apply it?
> 
> yes that and the tests failing should still fail the same way with the
> same checksums
> This of course assumes no one finds an issue in the patch

Okay, ran both with -k. No new failures, and fate-rv20-1239 failed with
the same checksums in both cases. That was the only failing test, did
not try with THREADS.

Curiously "make CPUFLAGS=0 fate-rv20-1239" also fails, so it's not
Altivec code that breaks that test, but C (?).

- Lauri
___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
http://ffmpeg.org/mailman/listinfo/ffmpeg-devel


[FFmpeg-devel] fate-rv20-1239 failure on power8, aliasing bug

2018-11-25 Thread Lauri Kasanen
Hi,

The lone power8 fate failing test seems like an aliasing issue.
I've isolated it into the attached standalone test case. Compiling it
with
gcc -std=c11 -maltivec -mabi=altivec -mvsx -O3 -fno-tree-vectorize
-o test test.c

reproduces on gcc 8.2.0, dropping the optimization level fixes it. This
was one of the "adding a printf made it work" things too. 

-Wstrict-aliasing=1 complains about the "register int *idataptr =
(int*)dataptr;" cast. If I put "typedef int __attribute__((may_alias))
int_alias;" at the top and change the cast and type to int_alias, the
results become correct.

This code would probably crash on systems where unaligned access is
prohibited, I think the incoming block is just 16-bit aligned. How do
you prefer to fix alignment/aliasing issues?

- Lauri


test.c
Description: Binary data
___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
http://ffmpeg.org/mailman/listinfo/ffmpeg-devel


Re: [FFmpeg-devel] fate-rv20-1239 failure on power8, aliasing bug

2018-11-25 Thread Lauri Kasanen
On Sun, 25 Nov 2018 17:17:58 +0200
Lauri Kasanen  wrote:

> This code would probably crash on systems where unaligned access is
> prohibited, I think the incoming block is just 16-bit aligned.

I see the block comes from aligned malloc, so scratch that part, it's at
least 128-bit aligned.

- Lauri
___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
http://ffmpeg.org/mailman/listinfo/ffmpeg-devel


Re: [FFmpeg-devel] [PATCH v2] swscale/output: Altivec-optimize yuv2plane1_8

2018-11-21 Thread Lauri Kasanen
> ./ffmpeg_g -f rawvideo -pix_fmt rgb24 -s hd1080 -i /dev/zero -pix_fmt yuv420p 
> \
> -f null -vframes 100 -v error -nostats -
> 
> 1158 UNITS in planar1,   65528 runs,  8 skips
> 
> -cpuflags 0
> 
> 19082 UNITS in planar1,   65533 runs,  3 skips
> 
> 16.48 speedup ratio. On x86, SSE2 is ~7. Curiously, the Power C version
> takes as many cycles as the x86 SSE2 version, yikes it's fast.
> 
> Note that this function uses VSX instructions, but is not marked so.
> This is because several existing functions also make that mistake.
> I'll submit a patch moving them once this is reviewed.
> 
> v2: Remove !BE check
> Signed-off-by: Lauri Kasanen 

Ping. It seems not many ffmpeg devs are interested in ppc.

- Lauri
___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
http://ffmpeg.org/mailman/listinfo/ffmpeg-devel


Re: [FFmpeg-devel] [PATCH v2] swscale/output: Altivec-optimize yuv2plane1_8

2018-11-21 Thread Lauri Kasanen
On Wed, 21 Nov 2018 13:21:58 +0100
Michael Niedermayer  wrote:

> On Wed, Nov 21, 2018 at 10:12:48AM +0200, Lauri Kasanen wrote:
> > > ./ffmpeg_g -f rawvideo -pix_fmt rgb24 -s hd1080 -i /dev/zero -pix_fmt 
> > > yuv420p \
> > > -f null -vframes 100 -v error -nostats -
> > > 
> > > 1158 UNITS in planar1,   65528 runs,  8 skips
> > > 
> > > -cpuflags 0
> > > 
> > > 19082 UNITS in planar1,   65533 runs,  3 skips
> > > 
> > > 16.48 speedup ratio. On x86, SSE2 is ~7. Curiously, the Power C version
> > > takes as many cycles as the x86 SSE2 version, yikes it's fast.
> > > 
> > > Note that this function uses VSX instructions, but is not marked so.
> > > This is because several existing functions also make that mistake.
> > > I'll submit a patch moving them once this is reviewed.
> > > 
> > > v2: Remove !BE check
> > > Signed-off-by: Lauri Kasanen 
> > 
> > Ping. Seems not many ffmpeg devs interested in ppc.
> 
> have you tried "make fate" with this patch (note you need to configure with
> "fate samples" so all tests are run)?

I ran those fate tests containing "scale" in the name, I gather the
full suite takes > 20min. Otherwise I tested with a PNG to video
conversion on LE, and Carl Eugen Hoyos tested with Lena on BE.

- Lauri
___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
http://ffmpeg.org/mailman/listinfo/ffmpeg-devel


Re: [FFmpeg-devel] [PATCH v2] swscale/output: Altivec-optimize yuv2plane1_8

2018-11-21 Thread Lauri Kasanen
On Wed, 21 Nov 2018 17:22:36 +0100
Michael Niedermayer  wrote:

> the full fate tests must be run, many of these tests use swscale without
> having "scale" in their name
> and yes on lower end hardware 20min and longer is possible

I get failures on the baseline, without my patch. What is the procedure
here? Is there a var to skip those tests, or?

First I ran with THREADS=3, baseline blew up in
fate-h264-conformance-frext-hpcafl_bcrm_c

Then I ran without THREADS, it got further, but blew up in
fate-rv20-1239

- Lauri
___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
http://ffmpeg.org/mailman/listinfo/ffmpeg-devel


Re: [FFmpeg-devel] [PATCH v2] swscale/output: Altivec-optimize yuv2plane1_8

2018-11-26 Thread Lauri Kasanen
On Mon, 26 Nov 2018 11:03:55 +0300
Michael Kostylev  wrote:

> 
> http://fate.xffm.org/?sort=arch
> /ppc

Yeah, mentioned in the commit message. Follow-up patch coming today.

- Lauri
___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
http://ffmpeg.org/mailman/listinfo/ffmpeg-devel


Re: [FFmpeg-devel] fate-rv20-1239 failure on power8, aliasing bug

2018-11-26 Thread Lauri Kasanen
On Mon, 26 Nov 2018 00:45:26 +0100
Carl Eugen Hoyos  wrote:

> 2018-11-25 16:17 GMT+01:00, Lauri Kasanen :
> > Hi,
> >
> > The lone power8 fate failing test seems like an aliasing issue.
> > I've isolated it into the attached standalone test case. Compiling it
> > with
> > gcc -std=c11 -maltivec -mabi=altivec -mvsx -O3 -fno-tree-vectorize
> > -o test test.c
> >
> > reproduces on gcc 8.2.0, dropping the optimization level fixes it. This
> > was one of the "adding a printf made it work" things too.
> >
> > -Wstrict-aliasing=1 complains about the "register int *idataptr =
> > (int*)dataptr;" cast. If I put "typedef int __attribute__((may_alias))
> > int_alias;" at the top and change the cast and type to int_alias, the
> > results become correct.
> 
> Thank you for the analysis!
> 
> Patch attached, Carl Eugen

Tested, fixes the fate test for me.

- Lauri
___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
http://ffmpeg.org/mailman/listinfo/ffmpeg-devel


[FFmpeg-devel] [PATCH] swscale/output: VSX-optimize nbps yuv2plane1

2018-11-27 Thread Lauri Kasanen
./ffmpeg_g -f rawvideo -pix_fmt rgb24 -s hd1080 -i /dev/zero -pix_fmt 
yuv420p9le \
-f null -vframes 100 -v error -nostats -

Speedups:
yuv2plane1_9BE_vsx  11.2042
yuv2plane1_9LE_vsx  11.156
yuv2plane1_10BE_vsx 9.89428
yuv2plane1_10LE_vsx 10.3637
yuv2plane1_12BE_vsx 9.71923
yuv2plane1_12LE_vsx 11.0404
yuv2plane1_14BE_vsx 10.1763
yuv2plane1_14LE_vsx 11.2728

Fate passes, each format tested with an image to video conversion.

Depends on "swscale/ppc: Move VSX-using code to its own file". Only tested on 
LE.

Signed-off-by: Lauri Kasanen 
---
 libswscale/ppc/swscale_vsx.c | 83 
 1 file changed, 83 insertions(+)

diff --git a/libswscale/ppc/swscale_vsx.c b/libswscale/ppc/swscale_vsx.c
index 853b587..6462c11 100644
--- a/libswscale/ppc/swscale_vsx.c
+++ b/libswscale/ppc/swscale_vsx.c
@@ -131,6 +131,75 @@ static void yuv2plane1_8_vsx(const int16_t *src, uint8_t 
*dest, int dstW,
 yuv2plane1_8_u(src, dest, dstW, dither, offset, i);
 }
 
+#if !HAVE_BIGENDIAN
+
+#define output_pixel(pos, val) \
+if (big_endian) { \
+AV_WB16(pos, av_clip_uintp2(val >> shift, output_bits)); \
+} else { \
+AV_WL16(pos, av_clip_uintp2(val >> shift, output_bits)); \
+}
+
+static void yuv2plane1_nbps_u(const int16_t *src, uint16_t *dest, int dstW,
+  int big_endian, int output_bits, int start)
+{
+int i;
+int shift = 15 - output_bits;
+
+for (i = start; i < dstW; i++) {
+int val = src[i] + (1 << (shift - 1));
+output_pixel(&dest[i], val);
+}
+}
+
+static void yuv2plane1_nbps_vsx(const int16_t *src, uint16_t *dest, int dstW,
+   int big_endian, int output_bits)
+{
+const int dst_u = -(uintptr_t)dest & 7;
+const int shift = 15 - output_bits;
+const int add = (1 << (shift - 1));
+const int clip = (1 << output_bits) - 1;
+const vector uint16_t vadd = (vector uint16_t) {add, add, add, add, add, 
add, add, add};
+const vector uint16_t vswap = (vector uint16_t) vec_splat_u16(big_endian ? 
8 : 0);
+const vector uint16_t vshift = (vector uint16_t) vec_splat_u16(shift);
+const vector uint16_t vlargest = (vector uint16_t) {clip, clip, clip, 
clip, clip, clip, clip, clip};
+vector uint16_t v;
+int i;
+
+yuv2plane1_nbps_u(src, dest, dst_u, big_endian, output_bits, 0);
+
+for (i = dst_u; i < dstW - 7; i += 8) {
+v = vec_vsx_ld(0, (const uint16_t *) &src[i]);
+v = vec_add(v, vadd);
+v = vec_sr(v, vshift);
+v = vec_min(v, vlargest);
+v = vec_rl(v, vswap);
+vec_st(v, 0, &dest[i]);
+}
+
+yuv2plane1_nbps_u(src, dest, dstW, big_endian, output_bits, i);
+}
+
+#define yuv2NBPS(bits, BE_LE, is_be, template_size, typeX_t) \
+static void yuv2plane1_ ## bits ## BE_LE ## _vsx(const int16_t *src, \
+ uint8_t *dest, int dstW, \
+ const uint8_t *dither, int offset) \
+{ \
+yuv2plane1_ ## template_size ## _vsx((const typeX_t *) src, \
+ (uint16_t *) dest, dstW, is_be, bits); \
+}
+
+yuv2NBPS( 9, BE, 1, nbps, int16_t)
+yuv2NBPS( 9, LE, 0, nbps, int16_t)
+yuv2NBPS(10, BE, 1, nbps, int16_t)
+yuv2NBPS(10, LE, 0, nbps, int16_t)
+yuv2NBPS(12, BE, 1, nbps, int16_t)
+yuv2NBPS(12, LE, 0, nbps, int16_t)
+yuv2NBPS(14, BE, 1, nbps, int16_t)
+yuv2NBPS(14, LE, 0, nbps, int16_t)
+
+#endif /* !HAVE_BIGENDIAN */
+
 #endif /* HAVE_VSX */
 
 av_cold void ff_sws_init_swscale_vsx(SwsContext *c)
@@ -158,6 +227,20 @@ av_cold void ff_sws_init_swscale_vsx(SwsContext *c)
 case 8:
 c->yuv2plane1 = yuv2plane1_8_vsx;
 break;
+#if !HAVE_BIGENDIAN
+case 9:
+c->yuv2plane1 = isBE(dstFormat) ? yuv2plane1_9BE_vsx  : 
yuv2plane1_9LE_vsx;
+break;
+case 10:
+c->yuv2plane1 = isBE(dstFormat) ? yuv2plane1_10BE_vsx  : 
yuv2plane1_10LE_vsx;
+break;
+case 12:
+c->yuv2plane1 = isBE(dstFormat) ? yuv2plane1_12BE_vsx  : 
yuv2plane1_12LE_vsx;
+break;
+case 14:
+c->yuv2plane1 = isBE(dstFormat) ? yuv2plane1_14BE_vsx  : 
yuv2plane1_14LE_vsx;
+break;
+#endif
 }
 }
 #endif /* HAVE_VSX */
-- 
2.6.2
___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
http://ffmpeg.org/mailman/listinfo/ffmpeg-devel


Re: [FFmpeg-devel] [PATCH] swscale/output: Altivec-optimize yuv2plane1_8

2018-11-17 Thread Lauri Kasanen
On Sat, 17 Nov 2018 15:20:08 +0100
Carl Eugen Hoyos  wrote:

> 2018-11-17 9:09 GMT+01:00, Lauri Kasanen :
> > Carl Eugen Hoyos  wrote:
> >> (This is less important atm, but I believe all functions currently
> >> in libswscale/ppc compile and run fine on - old - 32bit be hardware
> >> as your new function does.
> >> My completely inexperienced suspicion is that the instruction that
> >> you call "VSX" also exists on Altivec.)
> >
> > Ref
> > http://gcc.gnu.org/onlinedocs/gcc/PowerPC-AltiVec-Built-in-Functions-Available-on-ISA-2_002e06.html#PowerPC-AltiVec-Built-in-Functions-Available-on-ISA-2_002e06
> >
> > VSX functions such as vec_vsx_ld were added in ISA 2.06, aka POWER7.
> 
> The instruction vec_vsx_ld is currently only used for little-endian ppc
> which I thought did not exist before power7, am I wrong?

Looks like there were LE Powers like the 440 already in 1999:
https://lwn.net/Articles/408051/
datasheets.chipdb.org/IBM/PowerPC/440/PowerPC-440-Core.pdf

- Lauri
___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
http://ffmpeg.org/mailman/listinfo/ffmpeg-devel


[FFmpeg-devel] [PATCH] swscale/ppc: Move VSX-using code to its own file

2018-11-26 Thread Lauri Kasanen
Passes fate on LE (with "lavc/jrevdct: Avoid an aliasing violation" applied). 
Can anyone test BE?

Signed-off-by: Lauri Kasanen 
---
 libswscale/ppc/Makefile   |   1 +
 libswscale/ppc/swscale_altivec.c  | 291 ++
 libswscale/ppc/swscale_ppc_template.c | 217 +
 libswscale/ppc/swscale_vsx.c  | 164 +++
 libswscale/swscale_internal.h |   1 +
 5 files changed, 393 insertions(+), 281 deletions(-)
 create mode 100644 libswscale/ppc/swscale_ppc_template.c
 create mode 100644 libswscale/ppc/swscale_vsx.c

diff --git a/libswscale/ppc/Makefile b/libswscale/ppc/Makefile
index d1b596e..0a31a30 100644
--- a/libswscale/ppc/Makefile
+++ b/libswscale/ppc/Makefile
@@ -1,3 +1,4 @@
 OBJS += ppc/swscale_altivec.o   \
 ppc/yuv2rgb_altivec.o   \
 ppc/yuv2yuv_altivec.o   \
+ppc/swscale_vsx.o
diff --git a/libswscale/ppc/swscale_altivec.c b/libswscale/ppc/swscale_altivec.c
index 8c6056d..1d2b2fa 100644
--- a/libswscale/ppc/swscale_altivec.c
+++ b/libswscale/ppc/swscale_altivec.c
@@ -31,21 +31,14 @@
 #include "yuv2rgb_altivec.h"
 #include "libavutil/ppc/util_altivec.h"
 
-#if HAVE_ALTIVEC
+#if HAVE_ALTIVEC && HAVE_BIGENDIAN
 #define vzero vec_splat_s32(0)
 
-#if HAVE_BIGENDIAN
 #define  GET_LS(a,b,c,s) {\
 vector signed short l2  = vec_ld(((b) << 1) + 16, s);\
 ls  = vec_perm(a, l2, c);\
 a = l2;\
 }
-#else
-#define  GET_LS(a,b,c,s) {\
-ls  = a;\
-a = vec_vsx_ld(((b) << 1)  + 16, s);\
-}
-#endif
 
 #define yuv2planeX_8(d1, d2, l1, src, x, perm, filter) do {\
 vector signed short ls;\
@@ -59,7 +52,6 @@
 d2 = vec_add(d2, vf2);\
 } while (0)
 
-#if HAVE_BIGENDIAN
 #define LOAD_FILTER(vf,f) {\
 vector unsigned char perm0 = vec_lvsl(joffset, f);\
 vf = vec_ld(joffset, f);\
@@ -69,89 +61,7 @@
 p = vec_lvsl(xoffset, s);\
 ll1   = vec_ld(xoffset, s);\
 }
-#else
-#define LOAD_FILTER(vf,f) {\
-vf = vec_vsx_ld(joffset, f);\
-}
-#define LOAD_L1(ll1,s,p){\
-ll1  = vec_vsx_ld(xoffset, s);\
-}
-#endif
-
-static void yuv2planeX_16_altivec(const int16_t *filter, int filterSize,
-  const int16_t **src, uint8_t *dest,
-  const uint8_t *dither, int offset, int x)
-{
-register int i, j;
-LOCAL_ALIGNED(16, int, val, [16]);
-vector signed int vo1, vo2, vo3, vo4;
-vector unsigned short vs1, vs2;
-vector unsigned char vf;
-vector unsigned int altivec_vectorShiftInt19 =
-vec_add(vec_splat_u32(10), vec_splat_u32(9));
-
-for (i = 0; i < 16; i++)
-val[i] = dither[(x + i + offset) & 7] << 12;
-
-vo1 = vec_ld(0,  val);
-vo2 = vec_ld(16, val);
-vo3 = vec_ld(32, val);
-vo4 = vec_ld(48, val);
-
-for (j = 0; j < filterSize; j++) {
-unsigned int joffset=j<<1;
-unsigned int xoffset=x<<1;
-vector unsigned char perm;
-vector signed short l1,vLumFilter;
-LOAD_FILTER(vLumFilter,filter);
-vLumFilter = vec_splat(vLumFilter, 0);
-LOAD_L1(l1,src[j],perm);
-yuv2planeX_8(vo1, vo2, l1, src[j], x, perm, vLumFilter);
-yuv2planeX_8(vo3, vo4, l1, src[j], x + 8, perm, vLumFilter);
-}
-
-vo1 = vec_sra(vo1, altivec_vectorShiftInt19);
-vo2 = vec_sra(vo2, altivec_vectorShiftInt19);
-vo3 = vec_sra(vo3, altivec_vectorShiftInt19);
-vo4 = vec_sra(vo4, altivec_vectorShiftInt19);
-vs1 = vec_packsu(vo1, vo2);
-vs2 = vec_packsu(vo3, vo4);
-vf  = vec_packsu(vs1, vs2);
-VEC_ST(vf, 0, dest);
-}
-
-
-static inline void yuv2planeX_u(const int16_t *filter, int filterSize,
-const int16_t **src, uint8_t *dest, int dstW,
-const uint8_t *dither, int offset, int x)
-{
-int i, j;
-
-for (i = x; i < dstW; i++) {
-int t = dither[(i + offset) & 7] << 12;
-for (j = 0; j < filterSize; j++)
-t += src[j][i] * filter[j];
-dest[i] = av_clip_uint8(t >> 19);
-}
-}
-
-static void yuv2planeX_altivec(const int16_t *filter, int filterSize,
-   const int16_t **src, uint8_t *dest, int dstW,
-   const uint8_t *dither, int offset)
-{
-int dst_u = -(uintptr_t)dest & 15;
-int i;
-
-yuv2planeX_u(filter, filterSize, src, dest, dst_u, dither, offset, 0);
-
-for (i = dst_u; i < dstW - 15; i += 16)
-yuv2planeX_16_altivec(filter, filterSize, src, dest + i, dither,
-  offset, i);
-
-yuv2planeX_u(filter, filterSize, src, dest, dstW, dither, offset, i);
-}
 
-#if HAVE_BIGENDIAN
 // The 3 above is 2 (filterSize =

Re: [FFmpeg-devel] [PATCH v3] libswscale/ppc: VSX-optimize 9-16 bit yuv2planeX

2019-01-10 Thread Lauri Kasanen
On Wed, 9 Jan 2019 22:26:25 +0100
Carl Eugen Hoyos  wrote:

> > +#ifdef __GNUC__
> > +// GCC does not support vmuluwm yet. Bug open.
> > +__asm__("vmuluwm %0, %1, %2" : "=v"(vtmp) : "v"(vin32l),
> > "v"(vfilter[j]));
> > +vleft = vec_add(vleft, vtmp);
> > +__asm__("vmuluwm %0, %1, %2" : "=v"(vtmp) : "v"(vin32r),
> > "v"(vfilter[j]));
> > +vright = vec_add(vright, vtmp);
> > +#else
> > +// No idea which compilers this works in, untested. Copied from
> > libsimdpp
> > +vtmp = vec_vmuluwm(vin32l, vfilter[j]);
> > +vleft = vec_add(vleft, vtmp);
> > +vtmp = vec_vmuluwm(vin32r, vfilter[j]);
> > +vright = vec_add(vright, vtmp);
> > +#endif
> 
> Is there no xlc installed on your test system?
> I suspect an earlier patch from you already
> broke xlc compilation...

No, I don't really care about proprietary compilers. You reported
previously that xlc created invalid code anyway?

- Lauri
___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
http://ffmpeg.org/mailman/listinfo/ffmpeg-devel


[FFmpeg-devel] [PATCH v4] libswscale/ppc: VSX-optimize 9-16 bit yuv2planeX

2019-01-10 Thread Lauri Kasanen
./ffmpeg_g -f rawvideo -pix_fmt rgb24 -s hd1080 -i /dev/zero -pix_fmt 
yuv420p16be \
-s 1920x1728 -f null -vframes 100 -v error -nostats -

9-14 bit funcs get about 6x speedup, 16-bit gets about 15x.
Fate passes, each format tested with an image to video conversion.

Only POWER8 includes 32-bit vector multiplies, so POWER7 is locked out
of the 16-bit function. This includes the vec_mulo/mule functions too,
not just vmuluwm.

yuv420p9le
  12341 UNITS in planarX,  130976 runs, 96 skips
  73752 UNITS in planarX,  131066 runs,  6 skips
yuv420p9be
  12364 UNITS in planarX,  131025 runs, 47 skips
  73001 UNITS in planarX,  131055 runs, 17 skips
yuv420p10le
  12386 UNITS in planarX,  131042 runs, 30 skips
  72735 UNITS in planarX,  131062 runs, 10 skips
yuv420p10be
  12337 UNITS in planarX,  131045 runs, 27 skips
  72734 UNITS in planarX,  131057 runs, 15 skips
yuv420p12le
  12236 UNITS in planarX,  131058 runs, 14 skips
  73029 UNITS in planarX,  131062 runs, 10 skips
yuv420p12be
  12218 UNITS in planarX,  130973 runs, 99 skips
  72402 UNITS in planarX,  131069 runs,  3 skips
yuv420p14le
  12168 UNITS in planarX,  131067 runs,  5 skips
  72480 UNITS in planarX,  131069 runs,  3 skips
yuv420p14be
  12358 UNITS in planarX,  130948 runs,124 skips
  73772 UNITS in planarX,  131063 runs,  9 skips
yuv420p16le
  10439 UNITS in planarX,  130911 runs,161 skips
 157923 UNITS in planarX,  131068 runs,  4 skips
yuv420p16be
  10463 UNITS in planarX,  130874 runs,198 skips
 154405 UNITS in planarX,  131061 runs, 11 skips

Signed-off-by: Lauri Kasanen 
---

v2: Separate macros so that yuv2plane1_16_vsx remains available for power7
v3: Remove accidental tabs, switch to HAVE_POWER8 from configure + runtime check
v4: #if HAVE_POWER8

 libswscale/ppc/swscale_ppc_template.c |   4 +-
 libswscale/ppc/swscale_vsx.c  | 195 +-
 2 files changed, 193 insertions(+), 6 deletions(-)

diff --git a/libswscale/ppc/swscale_ppc_template.c 
b/libswscale/ppc/swscale_ppc_template.c
index 00e4b99..11decab 100644
--- a/libswscale/ppc/swscale_ppc_template.c
+++ b/libswscale/ppc/swscale_ppc_template.c
@@ -21,7 +21,7 @@
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
-static void FUNC(yuv2planeX_16)(const int16_t *filter, int filterSize,
+static void FUNC(yuv2planeX_8_16)(const int16_t *filter, int filterSize,
   const int16_t **src, uint8_t *dest,
   const uint8_t *dither, int offset, int x)
 {
@@ -88,7 +88,7 @@ static void FUNC(yuv2planeX)(const int16_t *filter, int 
filterSize,
 yuv2planeX_u(filter, filterSize, src, dest, dst_u, dither, offset, 0);
 
 for (i = dst_u; i < dstW - 15; i += 16)
-FUNC(yuv2planeX_16)(filter, filterSize, src, dest + i, dither,
+FUNC(yuv2planeX_8_16)(filter, filterSize, src, dest + i, dither,
   offset, i);
 
 yuv2planeX_u(filter, filterSize, src, dest, dstW, dither, offset, i);
diff --git a/libswscale/ppc/swscale_vsx.c b/libswscale/ppc/swscale_vsx.c
index 70da6ae..12effe2 100644
--- a/libswscale/ppc/swscale_vsx.c
+++ b/libswscale/ppc/swscale_vsx.c
@@ -83,6 +83,8 @@
 #include "swscale_ppc_template.c"
 #undef FUNC
 
+#undef vzero
+
 #endif /* !HAVE_BIGENDIAN */
 
 static void yuv2plane1_8_u(const int16_t *src, uint8_t *dest, int dstW,
@@ -180,6 +182,76 @@ static void yuv2plane1_nbps_vsx(const int16_t *src, 
uint16_t *dest, int dstW,
 yuv2plane1_nbps_u(src, dest, dstW, big_endian, output_bits, i);
 }
 
+static void yuv2planeX_nbps_u(const int16_t *filter, int filterSize,
+  const int16_t **src, uint16_t *dest, int dstW,
+  int big_endian, int output_bits, int start)
+{
+int i;
+int shift = 11 + 16 - output_bits;
+
+for (i = start; i < dstW; i++) {
+int val = 1 << (shift - 1);
+int j;
+
+for (j = 0; j < filterSize; j++)
+val += src[j][i] * filter[j];
+
+output_pixel(&dest[i], val);
+}
+}
+
+static void yuv2planeX_nbps_vsx(const int16_t *filter, int filterSize,
+const int16_t **src, uint16_t *dest, int dstW,
+int big_endian, int output_bits)
+{
+const int dst_u = -(uintptr_t)dest & 7;
+const int shift = 11 + 16 - output_bits;
+const int add = (1 << (shift - 1));
+const int clip = (1 << output_bits) - 1;
+const uint16_t swap = big_endian ? 8 : 0;
+const vector uint32_t vadd = (vector uint32_t) {add, add, add, add};
+const vector uint32_t vshift = (vector uint32_t) {shift, shift, shift, 
shift};
+const vector uint16_t vswap = (vector uint16_t) {swap, swap, swap, swap, 
swap, swap, swap, swap};
+const vector uint16_t vlargest = (vector uint16_t) {clip, clip, clip, 
clip, clip, clip, clip, clip};
+con

Re: [FFmpeg-devel] [PATCH] avutil/ppc/cpu: Fix power8 linux detection

2019-01-10 Thread Lauri Kasanen
On Wed, 9 Jan 2019 21:55:30 +0100
Carl Eugen Hoyos  wrote:

> 2019-01-08 10:08 GMT+01:00, Lauri Kasanen :
> > The existing code was in no released kernel that I can see. The corrected
> > code
> > was added in 3.9.
> >
> > Signed-off-by: Lauri Kasanen 
> > ---
> >  libavutil/ppc/cpu.c | 10 +-
> >  1 file changed, 5 insertions(+), 5 deletions(-)
> >
> > diff --git a/libavutil/ppc/cpu.c b/libavutil/ppc/cpu.c
> > index 7bb7cd8..b022149 100644
> > --- a/libavutil/ppc/cpu.c
> > +++ b/libavutil/ppc/cpu.c
> > @@ -93,13 +93,13 @@ int ff_get_cpu_flags_ppc(void)
> >  if (buf[i + 1] & PPC_FEATURE_HAS_VSX)
> >  ret |= AV_CPU_FLAG_VSX;
> >  #endif
> > -#ifdef PPC_FEATURE_ARCH_2_07
> > -if (buf[i + 1] & PPC_FEATURE_HAS_POWER8)
> > -ret |= AV_CPU_FLAG_POWER8;
> > -#endif
> >  if (ret & AV_CPU_FLAG_VSX)
> >  av_assert0(ret & AV_CPU_FLAG_ALTIVEC);
> 
> > -goto out;
> 
> This seems like an unrelated change.

It's necessary. HWCAP appears before HWCAP2 in the array, so if the
code jumps out in HWCAP, it never gets to checking the CAP2 bits like
power8.

- Lauri
___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
http://ffmpeg.org/mailman/listinfo/ffmpeg-devel


Re: [FFmpeg-devel] [PATCH] avutil/ppc/cpu: Fix power8 linux detection

2019-01-10 Thread Lauri Kasanen
On Thu, 10 Jan 2019 18:09:21 +0100
Carl Eugen Hoyos  wrote:

> >> > -goto out;
> >>
> >> This seems like an unrelated change.
> >
> > It's necessary. HWCAP appears before HWCAP2 in the array, so if the
> > code jumps out in HWCAP, it never gets to checking the CAP2 bits like
> > power8.
> 
> The next line (that I unfortunately cut) is:
> } else if (buf[i] == AT_HWCAP2) {
> indicating afaict that it is only reached if buf[i] is not equal
> to HWCAP.
> What do I miss?

The surrounding context is a loop over all bytes:
for (i = 0; i < count / sizeof(*buf); i += 2) {

While the out: label is after the loop.

- Lauri
___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
http://ffmpeg.org/mailman/listinfo/ffmpeg-devel


Re: [FFmpeg-devel] Video codec design for very low-end decoder

2019-01-07 Thread Lauri Kasanen
On Mon, 7 Jan 2019 13:44:56 +0100
Michael Niedermayer  wrote:

> > The modern approaches, DCT, FFT, wavelets and such transforms, are all
> > likely too slow to decode.
> 
> you said it can do mpeg1 and xvid, these are DCT based
> have you tried H.264 ? (i imagine that might with asm optimizations
> and avoidance of more complex features like CABAC and the loop filter
> work maybe, maybe not)
> also if h.264 with everything disabled works maybe some features can
> be turned on sometimes like the loop filter for key frames, that 
> might then help compression ...
> 
> and beating an existing codec, while certainly possible might be hard

According to a 2010 comparison
https://keyj.emphy.de/video-encoder-comparison/
x264 constrained baseline (everything off) takes something like 30%
longer to decode vs xvid at the same rate. Probably more because that
site used xvid's full features, while I used it "everything off".

The issue with xvid simple and mpeg1 was that they were slightly too
slow, and looked too bad. The platform does not have any SIMD, so I
doubt asm optimizations will help much.

Cinepak is almost 30 years old, surely it should be possible to match
the decoding & quality, but at a 5x lower bitrate :P

- Lauri
___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
http://ffmpeg.org/mailman/listinfo/ffmpeg-devel


Re: [FFmpeg-devel] Video codec design for very low-end decoder

2019-01-07 Thread Lauri Kasanen
On Mon, 7 Jan 2019 17:42:58 +0100
Michael Niedermayer  wrote:

> > According to a 2010 comparison
> > https://keyj.emphy.de/video-encoder-comparison/
> > x264 constrained baseline (everything off) takes something like 30%
> > longer to decode vs xvid at the same rate. Probably more because that
> > site used xvid's full features, while I used it "everything off".
> 
> constrained baseline is not "everything off"

Wikipedia's table shows CBP as "all off", but perhaps it doesn't list
every option. It lists CABAC etc, but not deblocking. Do you think the
unlisted options could account for 30%?

> > The issue with xvid simple and mpeg1 were that they were slightly too
> > slow, and looked too bad. The platform does not have any SIMD, so I
> > doubt asm optimizations will help much.
> 
> I would guess that with rare or odd architectures
> compilers are not so good when it comes to generating efficient code.
> 
> I would not be surprised if someone who knows the target CPUs pipeline
> and timings could beat the compiler by quite some amount.
> This is one part where the amount of man hours needed is significant
> of course. Would that be worth it? Well, it's your project; you have
> to know what amount of work you are willing to do for this, 
> I wouldn't do that work ;)
> 
> besides, why this low end chip ?

Just for fun ;)

MIPS does not have any timings, all instructions complete in the same
amount of cycles (except floating point, cache misses, interrupts,
etc). This makes it fairly suitable for a compiler I think, limiting
what could be gotten from hand-writing asm.

On Mon, 7 Jan 2019 12:02:58 -0500
"Ronald S. Bultje"  wrote:

> Have you considered vp8? It may sound weird but this is basically what vp8
> was great at: being really simple to decode.

VP8 has a reputation of being slow, so I didn't consider it. Benchmarks
show it as decoding slower than h264. Perhaps it too has features that
can be disabled?

- Lauri
___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
http://ffmpeg.org/mailman/listinfo/ffmpeg-devel


[FFmpeg-devel] [PATCH v2] libswscale/ppc: VSX-optimize 9-16 bit yuv2planeX

2019-01-06 Thread Lauri Kasanen
./ffmpeg_g -f rawvideo -pix_fmt rgb24 -s hd1080 -i /dev/zero -pix_fmt yuv420p16be \
-s 1920x1728 -f null -vframes 100 -v error -nostats -

9-14 bit funcs get about 6x speedup, 16-bit gets about 15x.
Fate passes, each format tested with an image to video conversion.

Only POWER8 includes 32-bit vector multiplies, so POWER7 is locked out
of the 16-bit function. This includes the vec_mulo/mule functions too,
not just vmuluwm.

yuv420p9le
  12341 UNITS in planarX,  130976 runs, 96 skips
  73752 UNITS in planarX,  131066 runs,  6 skips
yuv420p9be
  12364 UNITS in planarX,  131025 runs, 47 skips
  73001 UNITS in planarX,  131055 runs, 17 skips
yuv420p10le
  12386 UNITS in planarX,  131042 runs, 30 skips
  72735 UNITS in planarX,  131062 runs, 10 skips
yuv420p10be
  12337 UNITS in planarX,  131045 runs, 27 skips
  72734 UNITS in planarX,  131057 runs, 15 skips
yuv420p12le
  12236 UNITS in planarX,  131058 runs, 14 skips
  73029 UNITS in planarX,  131062 runs, 10 skips
yuv420p12be
  12218 UNITS in planarX,  130973 runs, 99 skips
  72402 UNITS in planarX,  131069 runs,  3 skips
yuv420p14le
  12168 UNITS in planarX,  131067 runs,  5 skips
  72480 UNITS in planarX,  131069 runs,  3 skips
yuv420p14be
  12358 UNITS in planarX,  130948 runs,124 skips
  73772 UNITS in planarX,  131063 runs,  9 skips
yuv420p16le
  10439 UNITS in planarX,  130911 runs,161 skips
 157923 UNITS in planarX,  131068 runs,  4 skips
yuv420p16be
  10463 UNITS in planarX,  130874 runs,198 skips
 154405 UNITS in planarX,  131061 runs, 11 skips

Signed-off-by: Lauri Kasanen 
---

v2: Separate macros so that yuv2plane1_16_vsx remains available for power7

 libswscale/ppc/swscale_ppc_template.c |   4 +-
 libswscale/ppc/swscale_vsx.c  | 190 +-
 2 files changed, 189 insertions(+), 5 deletions(-)

diff --git a/libswscale/ppc/swscale_ppc_template.c 
b/libswscale/ppc/swscale_ppc_template.c
index 00e4b99..11decab 100644
--- a/libswscale/ppc/swscale_ppc_template.c
+++ b/libswscale/ppc/swscale_ppc_template.c
@@ -21,7 +21,7 @@
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
-static void FUNC(yuv2planeX_16)(const int16_t *filter, int filterSize,
+static void FUNC(yuv2planeX_8_16)(const int16_t *filter, int filterSize,
   const int16_t **src, uint8_t *dest,
   const uint8_t *dither, int offset, int x)
 {
@@ -88,7 +88,7 @@ static void FUNC(yuv2planeX)(const int16_t *filter, int 
filterSize,
 yuv2planeX_u(filter, filterSize, src, dest, dst_u, dither, offset, 0);
 
 for (i = dst_u; i < dstW - 15; i += 16)
-FUNC(yuv2planeX_16)(filter, filterSize, src, dest + i, dither,
+FUNC(yuv2planeX_8_16)(filter, filterSize, src, dest + i, dither,
   offset, i);
 
 yuv2planeX_u(filter, filterSize, src, dest, dstW, dither, offset, i);
diff --git a/libswscale/ppc/swscale_vsx.c b/libswscale/ppc/swscale_vsx.c
index 70da6ae..1fd392e 100644
--- a/libswscale/ppc/swscale_vsx.c
+++ b/libswscale/ppc/swscale_vsx.c
@@ -83,6 +83,8 @@
 #include "swscale_ppc_template.c"
 #undef FUNC
 
+#undef vzero
+
 #endif /* !HAVE_BIGENDIAN */
 
 static void yuv2plane1_8_u(const int16_t *src, uint8_t *dest, int dstW,
@@ -180,6 +182,76 @@ static void yuv2plane1_nbps_vsx(const int16_t *src, 
uint16_t *dest, int dstW,
 yuv2plane1_nbps_u(src, dest, dstW, big_endian, output_bits, i);
 }
 
+static void yuv2planeX_nbps_u(const int16_t *filter, int filterSize,
+  const int16_t **src, uint16_t *dest, int dstW,
+  int big_endian, int output_bits, int start)
+{
+int i;
+int shift = 11 + 16 - output_bits;
+
+for (i = start; i < dstW; i++) {
+int val = 1 << (shift - 1);
+int j;
+
+for (j = 0; j < filterSize; j++)
+val += src[j][i] * filter[j];
+
+output_pixel([i], val);
+}
+}
+
+static void yuv2planeX_nbps_vsx(const int16_t *filter, int filterSize,
+const int16_t **src, uint16_t *dest, int dstW,
+int big_endian, int output_bits)
+{
+const int dst_u = -(uintptr_t)dest & 7;
+const int shift = 11 + 16 - output_bits;
+const int add = (1 << (shift - 1));
+const int clip = (1 << output_bits) - 1;
+const uint16_t swap = big_endian ? 8 : 0;
+const vector uint32_t vadd = (vector uint32_t) {add, add, add, add};
+const vector uint32_t vshift = (vector uint32_t) {shift, shift, shift, 
shift};
+const vector uint16_t vswap = (vector uint16_t) {swap, swap, swap, swap, 
swap, swap, swap, swap};
+const vector uint16_t vlargest = (vector uint16_t) {clip, clip, clip, 
clip, clip, clip, clip, clip};
+const vector int16_t vzero = vec_splat_s16(0);
+const vector uint8_t vperm = (vector uint8_t) {0, 1

[FFmpeg-devel] Video codec design for very low-end decoder

2019-01-07 Thread Lauri Kasanen
Hi,

If you were to design a video codec for a very low-end decoder, what
would it look like?

My target is MIPS 100MHz, and it should decode 320x240x30 at full speed
in software, with headroom for audio too. It seems all the codec research
in the last 20 years has been about more quality with more overhead, with
nobody looking into "improve quality without more overhead".

Currently I'm thinking it would have to be a variant of vector
quantization, like Cinepak. The target bitrates however are ~250 kbps
or lower, where Cinepak targeted 1200 or higher. Are there any tricks
that would improve quality with only encoder-side effort? What is the
current top-of-the-line interframe prediction, that is still fast to
decode?

The platform is fast enough to play back mpeg1, and xvid simple
profile L3 barely. Cinepak should also work, but I'd like the quality
to be higher than these three.

The last relevant VQ paper I found was 
https://arxiv.org/abs/1710.05311 which used a genetic algorithm to seed
the codebook generation, improving PSNR by a few dB over previous
approaches. I've implemented that (for a single grayscale frame), but it
looks too bad at reasonable bitrates.

The modern approaches, DCT, FFT, wavelets and such transforms, are all
likely too slow to decode.

Not sure if this would be better off on other MLs, didn't seem to apply
to ffmpeg-user really.

- Lauri
___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
http://ffmpeg.org/mailman/listinfo/ffmpeg-devel


[FFmpeg-devel] [PATCH] swscale/output: VSX-optimize 9-16 bit yuv2planeX

2019-01-04 Thread Lauri Kasanen
./ffmpeg_g -f rawvideo -pix_fmt rgb24 -s hd1080 -i /dev/zero -pix_fmt yuv420p16be \
-s 1920x1728 -f null -vframes 100 -v error -nostats -

9-14 bit funcs get about 6x speedup, 16-bit gets about 15x.
Fate passes, each format tested with an image to video conversion.

Only POWER8 includes 32-bit vector multiplies, so POWER7 is locked out
of the 16-bit function. This includes the vec_mulo/mule functions too,
not just vmuluwm.

yuv420p9le
  12341 UNITS in planarX,  130976 runs, 96 skips
  73752 UNITS in planarX,  131066 runs,  6 skips
yuv420p9be
  12364 UNITS in planarX,  131025 runs, 47 skips
  73001 UNITS in planarX,  131055 runs, 17 skips
yuv420p10le
  12386 UNITS in planarX,  131042 runs, 30 skips
  72735 UNITS in planarX,  131062 runs, 10 skips
yuv420p10be
  12337 UNITS in planarX,  131045 runs, 27 skips
  72734 UNITS in planarX,  131057 runs, 15 skips
yuv420p12le
  12236 UNITS in planarX,  131058 runs, 14 skips
  73029 UNITS in planarX,  131062 runs, 10 skips
yuv420p12be
  12218 UNITS in planarX,  130973 runs, 99 skips
  72402 UNITS in planarX,  131069 runs,  3 skips
yuv420p14le
  12168 UNITS in planarX,  131067 runs,  5 skips
  72480 UNITS in planarX,  131069 runs,  3 skips
yuv420p14be
  12358 UNITS in planarX,  130948 runs,124 skips
  73772 UNITS in planarX,  131063 runs,  9 skips
yuv420p16le
  10439 UNITS in planarX,  130911 runs,161 skips
 157923 UNITS in planarX,  131068 runs,  4 skips
yuv420p16be
  10463 UNITS in planarX,  130874 runs,198 skips
 154405 UNITS in planarX,  131061 runs, 11 skips

Signed-off-by: Lauri Kasanen 
---

The existing VSX yuv2plane1 is also ifdefed out for POWER7, even though it
works there. This is mainly for cleanliness; separating the macros would be
a bit uglier. If we have POWER7 users who need that one, please speak up.

 libswscale/ppc/swscale_ppc_template.c |   4 +-
 libswscale/ppc/swscale_vsx.c  | 177 +-
 2 files changed, 178 insertions(+), 3 deletions(-)

diff --git a/libswscale/ppc/swscale_ppc_template.c 
b/libswscale/ppc/swscale_ppc_template.c
index 00e4b99..11decab 100644
--- a/libswscale/ppc/swscale_ppc_template.c
+++ b/libswscale/ppc/swscale_ppc_template.c
@@ -21,7 +21,7 @@
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
-static void FUNC(yuv2planeX_16)(const int16_t *filter, int filterSize,
+static void FUNC(yuv2planeX_8_16)(const int16_t *filter, int filterSize,
   const int16_t **src, uint8_t *dest,
   const uint8_t *dither, int offset, int x)
 {
@@ -88,7 +88,7 @@ static void FUNC(yuv2planeX)(const int16_t *filter, int 
filterSize,
 yuv2planeX_u(filter, filterSize, src, dest, dst_u, dither, offset, 0);
 
 for (i = dst_u; i < dstW - 15; i += 16)
-FUNC(yuv2planeX_16)(filter, filterSize, src, dest + i, dither,
+FUNC(yuv2planeX_8_16)(filter, filterSize, src, dest + i, dither,
   offset, i);
 
 yuv2planeX_u(filter, filterSize, src, dest, dstW, dither, offset, i);
diff --git a/libswscale/ppc/swscale_vsx.c b/libswscale/ppc/swscale_vsx.c
index 70da6ae..baca36c 100644
--- a/libswscale/ppc/swscale_vsx.c
+++ b/libswscale/ppc/swscale_vsx.c
@@ -83,6 +83,8 @@
 #include "swscale_ppc_template.c"
 #undef FUNC
 
+#undef vzero
+
 #endif /* !HAVE_BIGENDIAN */
 
 static void yuv2plane1_8_u(const int16_t *src, uint8_t *dest, int dstW,
@@ -180,6 +182,76 @@ static void yuv2plane1_nbps_vsx(const int16_t *src, 
uint16_t *dest, int dstW,
 yuv2plane1_nbps_u(src, dest, dstW, big_endian, output_bits, i);
 }
 
+static void yuv2planeX_nbps_u(const int16_t *filter, int filterSize,
+  const int16_t **src, uint16_t *dest, int dstW,
+  int big_endian, int output_bits, int start)
+{
+int i;
+int shift = 11 + 16 - output_bits;
+
+for (i = start; i < dstW; i++) {
+int val = 1 << (shift - 1);
+int j;
+
+for (j = 0; j < filterSize; j++)
+val += src[j][i] * filter[j];
+
+output_pixel([i], val);
+}
+}
+
+static void yuv2planeX_nbps_vsx(const int16_t *filter, int filterSize,
+const int16_t **src, uint16_t *dest, int dstW,
+int big_endian, int output_bits)
+{
+const int dst_u = -(uintptr_t)dest & 7;
+const int shift = 11 + 16 - output_bits;
+const int add = (1 << (shift - 1));
+const int clip = (1 << output_bits) - 1;
+const uint16_t swap = big_endian ? 8 : 0;
+const vector uint32_t vadd = (vector uint32_t) {add, add, add, add};
+const vector uint32_t vshift = (vector uint32_t) {shift, shift, shift, 
shift};
+const vector uint16_t vswap = (vector uint16_t) {swap, swap, swap, swap, 
swap, swap, swap, swap};
+const vector uint16_t vlargest = (vector uint16_t) {clip, 

Re: [FFmpeg-devel] [PATCH v4] libswscale/ppc: VSX-optimize 9-16 bit yuv2planeX

2019-01-11 Thread Lauri Kasanen
On Fri, 11 Jan 2019 09:56:15 +0100
Michael Niedermayer  wrote:

> > +#ifdef __GNUC__
> > +// GCC does not support vmuluwm yet. Bug open.
> 
> this should probably be tested by configure similar to how other
> compiler limitations are tested

We can't really test for it, because there is no standard name for it. I
don't know what name the gcc devs will pick for it, it could be vec_mul,
vec_vmuluwm or something different.

- Lauri
___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
http://ffmpeg.org/mailman/listinfo/ffmpeg-devel


Re: [FFmpeg-devel] [PATCH v5] libswscale/ppc: VSX-optimize 9-16 bit yuv2planeX

2019-01-12 Thread Lauri Kasanen
On Sat, 12 Jan 2019 14:52:07 +0100
Michael Niedermayer  wrote:

> On Sat, Jan 12, 2019 at 10:47:50AM +0200, Lauri Kasanen wrote:
> > ./ffmpeg_g -f rawvideo -pix_fmt rgb24 -s hd1080 -i /dev/zero -pix_fmt 
> > yuv420p16be \
> > -s 1920x1728 -f null -vframes 100 -v error -nostats -
> > 
> > 9-14 bit funcs get about 6x speedup, 16-bit gets about 15x.
> > Fate passes, each format tested with an image to video conversion.
> > 
> > Only POWER8 includes 32-bit vector multiplies, so POWER7 is locked out
> > of the 16-bit function. This includes the vec_mulo/mule functions too,
> > not just vmuluwm.
> > 
> > yuv420p9le
> >   12341 UNITS in planarX,  130976 runs, 96 skips
> >   73752 UNITS in planarX,  131066 runs,  6 skips
> > yuv420p9be
> >   12364 UNITS in planarX,  131025 runs, 47 skips
> >   73001 UNITS in planarX,  131055 runs, 17 skips
> > yuv420p10le
> >   12386 UNITS in planarX,  131042 runs, 30 skips
> >   72735 UNITS in planarX,  131062 runs, 10 skips
> > yuv420p10be
> >   12337 UNITS in planarX,  131045 runs, 27 skips
> >   72734 UNITS in planarX,  131057 runs, 15 skips
> > yuv420p12le
> >   12236 UNITS in planarX,  131058 runs, 14 skips
> >   73029 UNITS in planarX,  131062 runs, 10 skips
> > yuv420p12be
> >   12218 UNITS in planarX,  130973 runs, 99 skips
> >   72402 UNITS in planarX,  131069 runs,  3 skips
> > yuv420p14le
> >   12168 UNITS in planarX,  131067 runs,  5 skips
> >   72480 UNITS in planarX,  131069 runs,  3 skips
> > yuv420p14be
> >   12358 UNITS in planarX,  130948 runs,124 skips
> >   73772 UNITS in planarX,  131063 runs,  9 skips
> > yuv420p16le
> >   10439 UNITS in planarX,  130911 runs,161 skips
> >  157923 UNITS in planarX,  131068 runs,  4 skips
> > yuv420p16be
> >   10463 UNITS in planarX,  130874 runs,198 skips
> >  154405 UNITS in planarX,  131061 runs, 11 skips
> 
> The number of skips in the benchmark is much larger on one
> side. That way the numbers become hard to compare as
> more cases are skipped on one side
> 
> please adjust the parameters so the skip counts are comparable
> or redo the tests until the numbers are more similar
> thanks

How do I do that? It's a VM, so there are going to be pauses no matter
what, when other VMs run. Or should I take the largest run count with
about the same skips?

- Lauri
___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
http://ffmpeg.org/mailman/listinfo/ffmpeg-devel


[FFmpeg-devel] [PATCH v5] libswscale/ppc: VSX-optimize 9-16 bit yuv2planeX

2019-01-12 Thread Lauri Kasanen
./ffmpeg_g -f rawvideo -pix_fmt rgb24 -s hd1080 -i /dev/zero -pix_fmt yuv420p16be \
-s 1920x1728 -f null -vframes 100 -v error -nostats -

9-14 bit funcs get about 6x speedup, 16-bit gets about 15x.
Fate passes, each format tested with an image to video conversion.

Only POWER8 includes 32-bit vector multiplies, so POWER7 is locked out
of the 16-bit function. This includes the vec_mulo/mule functions too,
not just vmuluwm.

yuv420p9le
  12341 UNITS in planarX,  130976 runs, 96 skips
  73752 UNITS in planarX,  131066 runs,  6 skips
yuv420p9be
  12364 UNITS in planarX,  131025 runs, 47 skips
  73001 UNITS in planarX,  131055 runs, 17 skips
yuv420p10le
  12386 UNITS in planarX,  131042 runs, 30 skips
  72735 UNITS in planarX,  131062 runs, 10 skips
yuv420p10be
  12337 UNITS in planarX,  131045 runs, 27 skips
  72734 UNITS in planarX,  131057 runs, 15 skips
yuv420p12le
  12236 UNITS in planarX,  131058 runs, 14 skips
  73029 UNITS in planarX,  131062 runs, 10 skips
yuv420p12be
  12218 UNITS in planarX,  130973 runs, 99 skips
  72402 UNITS in planarX,  131069 runs,  3 skips
yuv420p14le
  12168 UNITS in planarX,  131067 runs,  5 skips
  72480 UNITS in planarX,  131069 runs,  3 skips
yuv420p14be
  12358 UNITS in planarX,  130948 runs,124 skips
  73772 UNITS in planarX,  131063 runs,  9 skips
yuv420p16le
  10439 UNITS in planarX,  130911 runs,161 skips
 157923 UNITS in planarX,  131068 runs,  4 skips
yuv420p16be
  10463 UNITS in planarX,  130874 runs,198 skips
 154405 UNITS in planarX,  131061 runs, 11 skips

Signed-off-by: Lauri Kasanen 
---
 libswscale/ppc/swscale_ppc_template.c |   4 +-
 libswscale/ppc/swscale_vsx.c  | 186 +-
 2 files changed, 184 insertions(+), 6 deletions(-)

v2: Separate macros so that yuv2plane1_16_vsx remains available for power7
v3: Remove accidental tabs, switch to HAVE_POWER8 from configure + runtime check
v4: #if HAVE_POWER8
v5: Get rid of the mul #if, turns out gcc vec_mul works

diff --git a/libswscale/ppc/swscale_ppc_template.c 
b/libswscale/ppc/swscale_ppc_template.c
index 00e4b99..11decab 100644
--- a/libswscale/ppc/swscale_ppc_template.c
+++ b/libswscale/ppc/swscale_ppc_template.c
@@ -21,7 +21,7 @@
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
-static void FUNC(yuv2planeX_16)(const int16_t *filter, int filterSize,
+static void FUNC(yuv2planeX_8_16)(const int16_t *filter, int filterSize,
   const int16_t **src, uint8_t *dest,
   const uint8_t *dither, int offset, int x)
 {
@@ -88,7 +88,7 @@ static void FUNC(yuv2planeX)(const int16_t *filter, int 
filterSize,
 yuv2planeX_u(filter, filterSize, src, dest, dst_u, dither, offset, 0);
 
 for (i = dst_u; i < dstW - 15; i += 16)
-FUNC(yuv2planeX_16)(filter, filterSize, src, dest + i, dither,
+FUNC(yuv2planeX_8_16)(filter, filterSize, src, dest + i, dither,
   offset, i);
 
 yuv2planeX_u(filter, filterSize, src, dest, dstW, dither, offset, i);
diff --git a/libswscale/ppc/swscale_vsx.c b/libswscale/ppc/swscale_vsx.c
index 70da6ae..f6c7f1d 100644
--- a/libswscale/ppc/swscale_vsx.c
+++ b/libswscale/ppc/swscale_vsx.c
@@ -83,6 +83,8 @@
 #include "swscale_ppc_template.c"
 #undef FUNC
 
+#undef vzero
+
 #endif /* !HAVE_BIGENDIAN */
 
 static void yuv2plane1_8_u(const int16_t *src, uint8_t *dest, int dstW,
@@ -180,6 +182,76 @@ static void yuv2plane1_nbps_vsx(const int16_t *src, 
uint16_t *dest, int dstW,
 yuv2plane1_nbps_u(src, dest, dstW, big_endian, output_bits, i);
 }
 
+static void yuv2planeX_nbps_u(const int16_t *filter, int filterSize,
+  const int16_t **src, uint16_t *dest, int dstW,
+  int big_endian, int output_bits, int start)
+{
+int i;
+int shift = 11 + 16 - output_bits;
+
+for (i = start; i < dstW; i++) {
+int val = 1 << (shift - 1);
+int j;
+
+for (j = 0; j < filterSize; j++)
+val += src[j][i] * filter[j];
+
+output_pixel([i], val);
+}
+}
+
+static void yuv2planeX_nbps_vsx(const int16_t *filter, int filterSize,
+const int16_t **src, uint16_t *dest, int dstW,
+int big_endian, int output_bits)
+{
+const int dst_u = -(uintptr_t)dest & 7;
+const int shift = 11 + 16 - output_bits;
+const int add = (1 << (shift - 1));
+const int clip = (1 << output_bits) - 1;
+const uint16_t swap = big_endian ? 8 : 0;
+const vector uint32_t vadd = (vector uint32_t) {add, add, add, add};
+const vector uint32_t vshift = (vector uint32_t) {shift, shift, shift, 
shift};
+const vector uint16_t vswap = (vector uint16_t) {swap, swap, swap, swap, 
swap, swap, swap, swap};
+const vector uint16_t vlargest = (vector uint16_t) {

[FFmpeg-devel] [PATCH v6] libswscale/ppc: VSX-optimize 9-16 bit yuv2planeX

2019-01-13 Thread Lauri Kasanen
./ffmpeg_g -f rawvideo -pix_fmt rgb24 -s hd1080 -i /dev/zero -pix_fmt yuv420p16be \
-s 1920x1728 -f null -vframes 100 -v error -nostats -

9-14 bit funcs get about 6x speedup, 16-bit gets about 15x.
Fate passes, each format tested with an image to video conversion.

Only POWER8 includes 32-bit vector multiplies, so POWER7 is locked out
of the 16-bit function. This includes the vec_mulo/mule functions too,
not just vmuluwm.

With TIMER_REPORT skips disabled:
yuv420p9le
  12412 UNITS in planarX,  131072 runs,  0 skips
  73136 UNITS in planarX,  131072 runs,  0 skips
yuv420p9be
  12481 UNITS in planarX,  131072 runs,  0 skips
  73410 UNITS in planarX,  131072 runs,  0 skips
yuv420p10le
  12322 UNITS in planarX,  131072 runs,  0 skips
  72546 UNITS in planarX,  131072 runs,  0 skips
yuv420p10be
  12291 UNITS in planarX,  131072 runs,  0 skips
  72935 UNITS in planarX,  131072 runs,  0 skips
yuv420p12le
  12316 UNITS in planarX,  131072 runs,  0 skips
  72708 UNITS in planarX,  131072 runs,  0 skips
yuv420p12be
  12319 UNITS in planarX,  131072 runs,  0 skips
  72577 UNITS in planarX,  131072 runs,  0 skips
yuv420p14le
  12259 UNITS in planarX,  131072 runs,  0 skips
  72516 UNITS in planarX,  131072 runs,  0 skips
yuv420p14be
  12440 UNITS in planarX,  131072 runs,  0 skips
  72962 UNITS in planarX,  131072 runs,  0 skips
yuv420p16le
  10548 UNITS in planarX,  131072 runs,  0 skips
  73429 UNITS in planarX,  131072 runs,  0 skips
yuv420p16be
  10634 UNITS in planarX,  131072 runs,  0 skips
 150959 UNITS in planarX,  131072 runs,  0 skips

Signed-off-by: Lauri Kasanen 
---
 libswscale/ppc/swscale_ppc_template.c |   4 +-
 libswscale/ppc/swscale_vsx.c  | 186 +-
 2 files changed, 184 insertions(+), 6 deletions(-)

v6: No patch changes, updated bench numbers without skips.

diff --git a/libswscale/ppc/swscale_ppc_template.c 
b/libswscale/ppc/swscale_ppc_template.c
index 00e4b99..11decab 100644
--- a/libswscale/ppc/swscale_ppc_template.c
+++ b/libswscale/ppc/swscale_ppc_template.c
@@ -21,7 +21,7 @@
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
-static void FUNC(yuv2planeX_16)(const int16_t *filter, int filterSize,
+static void FUNC(yuv2planeX_8_16)(const int16_t *filter, int filterSize,
   const int16_t **src, uint8_t *dest,
   const uint8_t *dither, int offset, int x)
 {
@@ -88,7 +88,7 @@ static void FUNC(yuv2planeX)(const int16_t *filter, int 
filterSize,
 yuv2planeX_u(filter, filterSize, src, dest, dst_u, dither, offset, 0);
 
 for (i = dst_u; i < dstW - 15; i += 16)
-FUNC(yuv2planeX_16)(filter, filterSize, src, dest + i, dither,
+FUNC(yuv2planeX_8_16)(filter, filterSize, src, dest + i, dither,
   offset, i);
 
 yuv2planeX_u(filter, filterSize, src, dest, dstW, dither, offset, i);
diff --git a/libswscale/ppc/swscale_vsx.c b/libswscale/ppc/swscale_vsx.c
index 70da6ae..f6c7f1d 100644
--- a/libswscale/ppc/swscale_vsx.c
+++ b/libswscale/ppc/swscale_vsx.c
@@ -83,6 +83,8 @@
 #include "swscale_ppc_template.c"
 #undef FUNC
 
+#undef vzero
+
 #endif /* !HAVE_BIGENDIAN */
 
 static void yuv2plane1_8_u(const int16_t *src, uint8_t *dest, int dstW,
@@ -180,6 +182,76 @@ static void yuv2plane1_nbps_vsx(const int16_t *src, 
uint16_t *dest, int dstW,
 yuv2plane1_nbps_u(src, dest, dstW, big_endian, output_bits, i);
 }
 
+static void yuv2planeX_nbps_u(const int16_t *filter, int filterSize,
+  const int16_t **src, uint16_t *dest, int dstW,
+  int big_endian, int output_bits, int start)
+{
+int i;
+int shift = 11 + 16 - output_bits;
+
+for (i = start; i < dstW; i++) {
+int val = 1 << (shift - 1);
+int j;
+
+for (j = 0; j < filterSize; j++)
+val += src[j][i] * filter[j];
+
+output_pixel([i], val);
+}
+}
+
+static void yuv2planeX_nbps_vsx(const int16_t *filter, int filterSize,
+const int16_t **src, uint16_t *dest, int dstW,
+int big_endian, int output_bits)
+{
+const int dst_u = -(uintptr_t)dest & 7;
+const int shift = 11 + 16 - output_bits;
+const int add = (1 << (shift - 1));
+const int clip = (1 << output_bits) - 1;
+const uint16_t swap = big_endian ? 8 : 0;
+const vector uint32_t vadd = (vector uint32_t) {add, add, add, add};
+const vector uint32_t vshift = (vector uint32_t) {shift, shift, shift, 
shift};
+const vector uint16_t vswap = (vector uint16_t) {swap, swap, swap, swap, 
swap, swap, swap, swap};
+const vector uint16_t vlargest = (vector uint16_t) {clip, clip, clip, 
clip, clip, clip, clip, clip};
+const vector int16_t vzero = vec_splat_s16(0);
+const vector uint8_t vperm = (vector 

Re: [FFmpeg-devel] Video codec design for very low-end decoder

2019-01-13 Thread Lauri Kasanen
On Mon, 7 Jan 2019 12:37:01 -0500
"Ronald S. Bultje"  wrote:

> On Mon, Jan 7, 2019 at 12:22 PM Lauri Kasanen  wrote:
> > "Ronald S. Bultje"  wrote:
> >
> > > Have you considered vp8? It may sound weird but this is basically what
> > > vp8 was great at: being really simple to decode.
> >
> > VP8 has a reputation of being slow, so I didn't consider it. Benchmarks
> > show it as decoding slower than h264.
> 
> It is faster than h264 when comparing ffh264 vs. ffvp8

I tried VP8 on the target platform (libvpx 1.7.0). It took 32% longer
to decode the test vid than xvid, and given xvid was already a bit
under realtime, VP8 is out.

Curiously, VP8 also added very objectionable artifacts. Some blocks
*moved* around in frames. That looked very bad, neither xvid nor h264
caused that, they were just blocky or blurry. VP8 also looked worst of
the three, by eye.

x264 "everything disabled AFAICT" actually looks very good for the
bitrate. Too bad I can't use H.264 due to the patent situation, so not
going to spend time benching it either.

Settings used:

vpxenc -p 2 --profile=3 --target-bitrate=250 --best --end-usage=vbr
--codec=vp8 --min-q=0 --max-q=60 --ivf

mencoder -ovc x264 -x264encopts
preset=veryslow:pass=2:bitrate=250:tune=fastdecode:profile=baseline

(tune=fastdecode disables deblocking, the result file confirms all
heavy options are off)

- Lauri
___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
http://ffmpeg.org/mailman/listinfo/ffmpeg-devel


Re: [FFmpeg-devel] [PATCH v4] libswscale/ppc: VSX-optimize 9-16 bit yuv2planeX

2019-01-12 Thread Lauri Kasanen
On Sat, 12 Jan 2019 01:03:09 +0100
Michael Niedermayer  wrote:

> On Fri, Jan 11, 2019 at 11:16:20AM +0200, Lauri Kasanen wrote:
> > On Fri, 11 Jan 2019 09:56:15 +0100
> > Michael Niedermayer  wrote:
> > 
> > > > +#ifdef __GNUC__
> > > > +// GCC does not support vmuluwm yet. Bug open.
> > > 
> > > this should probably be tested by configure similar to how other
> > > compiler limitations are tested
> > 
> > We can't really test for it, because there is no standard name for it. I
> > don't know what name the gcc devs will pick for it, it could be vec_mul,
> > vec_vmuluwm or something different.
> 
> the code contains a #if and a #else case
> so i thought there was something else than the __GNUC__ case and gcc
> would follow that

It's second-hand info from libsimdpp. I don't know where they got it.

However, I found out yesterday that gcc docs are wrong, and vec_mul for
gcc does use the correct instruction on power8. Respinning.

- Lauri
___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
http://ffmpeg.org/mailman/listinfo/ffmpeg-devel


Re: [FFmpeg-devel] Armada 370 problem causes ffmpeg segmentation fault

2019-01-09 Thread Lauri Kasanen
On Tue, 08 Jan 2019 21:32:30 +
Simon Nash  wrote:

> I have encountered a problem with ffmpeg (a segmentation fault) that
> occurs only when running ffmpeg on the Marvell Armada 370 processor.
...
> When the 32-bit floating-point multiply instruction
> 0x0018a8f2 :   vmla.f32 s12, s15, s15
> at activate+1690 is executed, there is a segmentation fault.

You don't want to go whack-a-mole on this, since there could be 1500
other places in just ffmpeg that could hit this. You want to fix this
in your compiler, it already has similar errata workarounds for almost
every processor. Then every such case will work automatically.

So,
1) Find the errata from the processor manufacturer
2) Report bug with that to gcc/clang/whatever compiler you use

If there is no known errata for this, and you managed to find a new
one, contact the processor manufacturer.

- Lauri
___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
http://ffmpeg.org/mailman/listinfo/ffmpeg-devel


[FFmpeg-devel] [PATCH] avutil/ppc/cpu: Fix power8 linux detection

2019-01-08 Thread Lauri Kasanen
The existing code was in no released kernel that I can see. The corrected code
was added in 3.9.

Signed-off-by: Lauri Kasanen 
---
 libavutil/ppc/cpu.c | 10 +-
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/libavutil/ppc/cpu.c b/libavutil/ppc/cpu.c
index 7bb7cd8..b022149 100644
--- a/libavutil/ppc/cpu.c
+++ b/libavutil/ppc/cpu.c
@@ -93,13 +93,13 @@ int ff_get_cpu_flags_ppc(void)
 if (buf[i + 1] & PPC_FEATURE_HAS_VSX)
 ret |= AV_CPU_FLAG_VSX;
 #endif
-#ifdef PPC_FEATURE_ARCH_2_07
-if (buf[i + 1] & PPC_FEATURE_HAS_POWER8)
-ret |= AV_CPU_FLAG_POWER8;
-#endif
 if (ret & AV_CPU_FLAG_VSX)
 av_assert0(ret & AV_CPU_FLAG_ALTIVEC);
-goto out;
+} else if (buf[i] == AT_HWCAP2) {
+#ifdef PPC_FEATURE2_ARCH_2_07
+if (buf[i + 1] & PPC_FEATURE2_ARCH_2_07)
+ret |= AV_CPU_FLAG_POWER8;
+#endif
 }
 }
 }
-- 
2.6.2

___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
http://ffmpeg.org/mailman/listinfo/ffmpeg-devel


[FFmpeg-devel] [PATCH v3] libswscale/ppc: VSX-optimize 9-16 bit yuv2planeX

2019-01-08 Thread Lauri Kasanen
./ffmpeg_g -f rawvideo -pix_fmt rgb24 -s hd1080 -i /dev/zero -pix_fmt yuv420p16be \
-s 1920x1728 -f null -vframes 100 -v error -nostats -

9-14 bit funcs get about 6x speedup, 16-bit gets about 15x.
Fate passes, each format tested with an image to video conversion.

Only POWER8 includes 32-bit vector multiplies, so POWER7 is locked out
of the 16-bit function. This includes the vec_mulo/mule functions too,
not just vmuluwm.

yuv420p9le
  12341 UNITS in planarX,  130976 runs, 96 skips
  73752 UNITS in planarX,  131066 runs,  6 skips
yuv420p9be
  12364 UNITS in planarX,  131025 runs, 47 skips
  73001 UNITS in planarX,  131055 runs, 17 skips
yuv420p10le
  12386 UNITS in planarX,  131042 runs, 30 skips
  72735 UNITS in planarX,  131062 runs, 10 skips
yuv420p10be
  12337 UNITS in planarX,  131045 runs, 27 skips
  72734 UNITS in planarX,  131057 runs, 15 skips
yuv420p12le
  12236 UNITS in planarX,  131058 runs, 14 skips
  73029 UNITS in planarX,  131062 runs, 10 skips
yuv420p12be
  12218 UNITS in planarX,  130973 runs, 99 skips
  72402 UNITS in planarX,  131069 runs,  3 skips
yuv420p14le
  12168 UNITS in planarX,  131067 runs,  5 skips
  72480 UNITS in planarX,  131069 runs,  3 skips
yuv420p14be
  12358 UNITS in planarX,  130948 runs,124 skips
  73772 UNITS in planarX,  131063 runs,  9 skips
yuv420p16le
  10439 UNITS in planarX,  130911 runs,161 skips
 157923 UNITS in planarX,  131068 runs,  4 skips
yuv420p16be
  10463 UNITS in planarX,  130874 runs,198 skips
 154405 UNITS in planarX,  131061 runs, 11 skips

Signed-off-by: Lauri Kasanen 
---

v2: Separate macros so that yuv2plane1_16_vsx remains available for power7
v3: Remove accidental tabs, switch to HAVE_POWER8 from configure + runtime check

As far as I can tell, for HAVE_POWER8 to be defined, -march has to be at least
power8, meaning with the current setup such a binary wouldn't run on POWER7.
However using the configure define lets it be disabled in configure like Michael
pointed out, and having the runtime check doesn't hurt any (it allows for future
splits like on x86, where one binary can run on low cpu but use higher ISA if
available).

 libswscale/ppc/swscale_ppc_template.c |   4 +-
 libswscale/ppc/swscale_vsx.c  | 195 +-
 2 files changed, 193 insertions(+), 6 deletions(-)

diff --git a/libswscale/ppc/swscale_ppc_template.c 
b/libswscale/ppc/swscale_ppc_template.c
index 00e4b99..11decab 100644
--- a/libswscale/ppc/swscale_ppc_template.c
+++ b/libswscale/ppc/swscale_ppc_template.c
@@ -21,7 +21,7 @@
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
-static void FUNC(yuv2planeX_16)(const int16_t *filter, int filterSize,
+static void FUNC(yuv2planeX_8_16)(const int16_t *filter, int filterSize,
   const int16_t **src, uint8_t *dest,
   const uint8_t *dither, int offset, int x)
 {
@@ -88,7 +88,7 @@ static void FUNC(yuv2planeX)(const int16_t *filter, int filterSize,
 yuv2planeX_u(filter, filterSize, src, dest, dst_u, dither, offset, 0);
 
 for (i = dst_u; i < dstW - 15; i += 16)
-FUNC(yuv2planeX_16)(filter, filterSize, src, dest + i, dither,
+FUNC(yuv2planeX_8_16)(filter, filterSize, src, dest + i, dither,
   offset, i);
 
 yuv2planeX_u(filter, filterSize, src, dest, dstW, dither, offset, i);
diff --git a/libswscale/ppc/swscale_vsx.c b/libswscale/ppc/swscale_vsx.c
index 70da6ae..77680f8 100644
--- a/libswscale/ppc/swscale_vsx.c
+++ b/libswscale/ppc/swscale_vsx.c
@@ -83,6 +83,8 @@
 #include "swscale_ppc_template.c"
 #undef FUNC
 
+#undef vzero
+
 #endif /* !HAVE_BIGENDIAN */
 
 static void yuv2plane1_8_u(const int16_t *src, uint8_t *dest, int dstW,
@@ -180,6 +182,76 @@ static void yuv2plane1_nbps_vsx(const int16_t *src, uint16_t *dest, int dstW,
 yuv2plane1_nbps_u(src, dest, dstW, big_endian, output_bits, i);
 }
 
+static void yuv2planeX_nbps_u(const int16_t *filter, int filterSize,
+  const int16_t **src, uint16_t *dest, int dstW,
+  int big_endian, int output_bits, int start)
+{
+int i;
+int shift = 11 + 16 - output_bits;
+
+for (i = start; i < dstW; i++) {
+int val = 1 << (shift - 1);
+int j;
+
+for (j = 0; j < filterSize; j++)
+val += src[j][i] * filter[j];
+
+output_pixel(&dest[i], val);
+}
+}
+
+static void yuv2planeX_nbps_vsx(const int16_t *filter, int filterSize,
+const int16_t **src, uint16_t *dest, int dstW,
+int big_endian, int output_bits)
+{
+const int dst_u = -(uintptr_t)dest & 7;
+const int shift = 11 + 16 - output_bits;
+const int add = (1 << (shift - 1));
+const int clip = (1 << output_bits) - 1;
+const uint16_t swap = big_endian

Re: [FFmpeg-devel] [PATCH] swscale/ppc: Move VSX-using code to its own file

2018-12-02 Thread Lauri Kasanen
On Fri, 30 Nov 2018 14:05:26 +0200
Lauri Kasanen  wrote:

> On Fri, 30 Nov 2018 12:30:58 +0300
> Michael Kostylev  wrote:
> > 
> > >> Passes fate on LE (with "lavc/jrevdct: Avoid an aliasing violation" 
> > >> applied). Can anyone test BE?
> > >
> > > Ping.
> > 
> > FATE becomes green as much as possible, I haven't performed any 
> > benchmarking though.
> 
> Thanks for testing. This patch is not expected to change performance,
> it's just moving functions around and putting them under proper VSX
> guards.

Could this patch be applied? Also ping on "swscale/output: VSX-optimize
nbps yuv2plane1".

- Lauri
___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
http://ffmpeg.org/mailman/listinfo/ffmpeg-devel


Re: [FFmpeg-devel] [PATCH v6] libswscale/ppc: VSX-optimize 9-16 bit yuv2planeX

2019-01-27 Thread Lauri Kasanen
On Mon, 14 Jan 2019 16:13:52 +0100
Michael Niedermayer  wrote:

> On Sun, Jan 13, 2019 at 10:26:20AM +0200, Lauri Kasanen wrote:
> > ./ffmpeg_g -f rawvideo -pix_fmt rgb24 -s hd1080 -i /dev/zero -pix_fmt 
> > yuv420p16be \
> > -s 1920x1728 -f null -vframes 100 -v error -nostats -
> > 
> > 9-14 bit funcs get about 6x speedup, 16-bit gets about 15x.
> > Fate passes, each format tested with an image to video conversion.
> > 
> > Only POWER8 includes 32-bit vector multiplies, so POWER7 is locked out
> > of the 16-bit function. This includes the vec_mulo/mule functions too,
> > not just vmuluwm.
...
> > v6: No patch changes, updated bench numbers without skips.
> 
> fate does not get worse from this patch on qemu ppc32be and ppc64le 

Ping

- Lauri
___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
http://ffmpeg.org/mailman/listinfo/ffmpeg-devel


Re: [FFmpeg-devel] [PATCH] avutil/ppc/cpu: Fix power8 linux detection

2019-01-27 Thread Lauri Kasanen
On Thu, 17 Jan 2019 09:40:09 +0200
Lauri Kasanen  wrote:

> On Tue, 8 Jan 2019 11:08:04 +0200
> Lauri Kasanen  wrote:
> 
> > The existing code was in no released kernel that I can see. The corrected 
> > code
> > was added in 3.9.
> > 
> > Signed-off-by: Lauri Kasanen 
> > ---
> >  libavutil/ppc/cpu.c | 10 +-
> >  1 file changed, 5 insertions(+), 5 deletions(-)
> 
> Ping.

Ping. Carl Eugen, you were the only one who looked at it - could you
apply it?

Given the low interest in power patches, should I be applying for
commit rights?

- Lauri
___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
http://ffmpeg.org/mailman/listinfo/ffmpeg-devel


[FFmpeg-devel] [PATCH] MAINTAINERS: add myself to the PPC section

2019-01-27 Thread Lauri Kasanen
Signed-off-by: Lauri Kasanen 
---
 MAINTAINERS | 1 +
 1 file changed, 1 insertion(+)

Ref http://ffmpeg.org/pipermail/ffmpeg-devel/2019-January/239357.html
Requesting commit access so I don't have to constantly bug Michael.

diff --git a/MAINTAINERS b/MAINTAINERS
index bc2ae13..e3a80e9 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -526,6 +526,7 @@ Alpha   Falk Hueffner
 MIPSManojkumar Bhosale, Shiyou Yin
 Mac OS X / PowerPC  Romain Dolbeau, Guillaume Poirier
 Amiga / PowerPC Colin Ward
+Linux / PowerPC Lauri Kasanen
 Windows MinGW   Alex Beregszaszi, Ramiro Polla
 Windows Cygwin  Victor Paesa
 Windows MSVCMatthew Oliver, Hendrik Leppkes
-- 
2.6.2

___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
http://ffmpeg.org/mailman/listinfo/ffmpeg-devel


Re: [FFmpeg-devel] [PATCH] This patch addresses Trac ticket #5570. The optimized functions are in file libswscale/ppc/input_vsx.c. Each optimized function name is a concatenation of the corresponding

2019-03-29 Thread Lauri Kasanen
On Fri, 29 Mar 2019 17:00:38 +0300
Вячеслав  wrote:

> ---
>  libswscale/ppc/Makefile   |3 +-
>  libswscale/ppc/input_vsx.c| 3801 
> +
>  libswscale/swscale.c  |3 +
>  libswscale/swscale_internal.h |1 +
>  4 files changed, 3807 insertions(+), 1 deletion(-)
>  create mode 100644 libswscale/ppc/input_vsx.c

Please include performance benchmarks for each function. The
description should go in the body of the commit message, not in the title.

- Lauri
___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".

[FFmpeg-devel] [PATCH v2 resend] swscale/ppc: VSX-optimize yuv2rgb_full

2019-03-21 Thread Lauri Kasanen
./ffmpeg -f lavfi -i yuvtestsrc=duration=1:size=1200x1440 \
-s 1200x1440 -f null -vframes 100 -pix_fmt $i -nostats \
-cpuflags 0 -v error -

This uses 32-bit mul, so POWER8 only.

The following output formats get about 4.5x speedup:

rgb24
  39980 UNITS in yuv2packed1,   32768 runs,  0 skips
   8774 UNITS in yuv2packed1,   32768 runs,  0 skips
bgr24
  40069 UNITS in yuv2packed1,   32768 runs,  0 skips
   8772 UNITS in yuv2packed1,   32766 runs,  2 skips
rgba
  39759 UNITS in yuv2packed1,   32768 runs,  0 skips
   8681 UNITS in yuv2packed1,   32767 runs,  1 skips
bgra
  39729 UNITS in yuv2packed1,   32768 runs,  0 skips
   8696 UNITS in yuv2packed1,   32766 runs,  2 skips
argb
  39766 UNITS in yuv2packed1,   32768 runs,  0 skips
   8672 UNITS in yuv2packed1,   32766 runs,  2 skips
bgra
  39784 UNITS in yuv2packed1,   32768 runs,  0 skips
   8659 UNITS in yuv2packed1,   32767 runs,  1 skips

Signed-off-by: Lauri Kasanen 
---
 libswscale/ppc/swscale_vsx.c | 291 +++
 1 file changed, 291 insertions(+)

v2: HAVE_POWER8 from ifdef to if
Resending due to mail client troubles

diff --git a/libswscale/ppc/swscale_vsx.c b/libswscale/ppc/swscale_vsx.c
index 01eb46c..062ab0d 100644
--- a/libswscale/ppc/swscale_vsx.c
+++ b/libswscale/ppc/swscale_vsx.c
@@ -422,6 +422,248 @@ yuv2NBPSX(16, BE, 1, 16, int32_t)
 yuv2NBPSX(16, LE, 0, 16, int32_t)
 #endif

+static av_always_inline void
+yuv2rgb_full_1_vsx_template(SwsContext *c, const int16_t *buf0,
+ const int16_t *ubuf[2], const int16_t *vbuf[2],
+ const int16_t *abuf0, uint8_t *dest, int dstW,
+ int uvalpha, int y, enum AVPixelFormat target,
+ int hasAlpha)
+{
+const int16_t *ubuf0 = ubuf[0], *vbuf0 = vbuf[0];
+const int16_t *ubuf1 = ubuf[1], *vbuf1 = vbuf[1];
+vector int16_t vy, vu, vv, A = vec_splat_s16(0), tmp16;
+vector int32_t vy32_l, vy32_r, vu32_l, vu32_r, vv32_l, vv32_r, tmp32, tmp32_2;
+vector int32_t R_l, R_r, G_l, G_r, B_l, B_r;
+vector uint16_t rd16, gd16, bd16;
+vector uint8_t rd, bd, gd, ad, out0, out1, tmp8;
+const vector uint16_t zero16 = vec_splat_u16(0);
+const vector int32_t y_offset = vec_splats(c->yuv2rgb_y_offset);
+const vector int32_t y_coeff = vec_splats(c->yuv2rgb_y_coeff);
+const vector int32_t y_add = vec_splats(1 << 21);
+const vector int32_t v2r_coeff = vec_splats(c->yuv2rgb_v2r_coeff);
+const vector int32_t v2g_coeff = vec_splats(c->yuv2rgb_v2g_coeff);
+const vector int32_t u2g_coeff = vec_splats(c->yuv2rgb_u2g_coeff);
+const vector int32_t u2b_coeff = vec_splats(c->yuv2rgb_u2b_coeff);
+const vector int32_t rgbclip = vec_splats(1 << 30);
+const vector int32_t zero32 = vec_splat_s32(0);
+const vector uint32_t shift2 = vec_splat_u32(2);
+const vector uint32_t shift22 = vec_splats(22U);
+const vector uint16_t sub7 = vec_splats((uint16_t) (128 << 7));
+const vector uint16_t sub8 = vec_splats((uint16_t) (128 << 8));
+const vector int16_t mul4 = vec_splat_s16(4);
+const vector int16_t mul8 = vec_splat_s16(8);
+const vector int16_t add64 = vec_splat_s16(64);
+const vector uint16_t shift7 = vec_splat_u16(7);
+const vector int16_t max255 = vec_splat_s16(255);
+int i;
+
+// Various permutations
+const vector uint8_t perm3rg0 = (vector uint8_t) {0x0, 0x10, 0,
+  0x1, 0x11, 0,
+  0x2, 0x12, 0,
+  0x3, 0x13, 0,
+  0x4, 0x14, 0,
+  0x5 };
+const vector uint8_t perm3rg1 = (vector uint8_t) { 0x15, 0,
+  0x6, 0x16, 0,
+  0x7, 0x17, 0 };
+const vector uint8_t perm3tb0 = (vector uint8_t) {0x0, 0x1, 0x10,
+  0x3, 0x4, 0x11,
+  0x6, 0x7, 0x12,
+  0x9, 0xa, 0x13,
+  0xc, 0xd, 0x14,
+  0xf };
+const vector uint8_t perm3tb1 = (vector uint8_t) { 0x0, 0x15,
+  0x2, 0x3, 0x16,
+  0x5, 0x6, 0x17 };
+
+for (i = 0; i < dstW; i += 8) { // The x86 asm also overwrites padding bytes.
+vy = vec_ld(0, &buf0[i]);
+vy32_l = vec_unpackh(vy);
+vy32_r = vec_unpackl(vy);
+vy32_l = vec_sl(vy32_l, shift2);
+vy32_r = vec_sl(vy32_r, shift2);
+
+vu

Re: [FFmpeg-devel] [PATCH]lavf: Constify the probe function argument

2019-03-21 Thread Lauri Kasanen
On Thu, 21 Mar 2019 01:20:21 +0100
Carl Eugen Hoyos  wrote:

> Hi!
>
> Attached patch makes the only argument to the common probe() function const.
>
> Please comment, Carl Eugen

LGTM

- Lauri
___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel


Re: [FFmpeg-devel] [PATCH v2] swscale: Remove duplicated code

2019-03-27 Thread Lauri Kasanen
On Tue, 26 Mar 2019 22:00:54 +0100
Michael Niedermayer  wrote:

> On Tue, Mar 26, 2019 at 08:58:34AM +0200, Lauri Kasanen wrote:
> > In this function, the exact same clamping happens both in the if and 
> > unconditionally.
> >
> > Signed-off-by: Lauri Kasanen 
> > ---
> >  libswscale/output.c | 10 --
> >  1 file changed, 10 deletions(-)
> >
> > v2: Remove the unconditional instead of the if'd clipping.
> > I'll leave changing the bit pattern to others, there's so many funcs using 
> > 0x100.
> >
> > diff --git a/libswscale/output.c b/libswscale/output.c
> > index d7c53e6..d3401f0 100644
> > --- a/libswscale/output.c
> > +++ b/libswscale/output.c
>
> should be ok
>
> thanks

Applying.

- Lauri
___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".

Re: [FFmpeg-devel] [PATCH v2 resend] swscale/ppc: VSX-optimize yuv2rgb_full

2019-03-27 Thread Lauri Kasanen
On Thu, 21 Mar 2019 09:54:17 +0200
Lauri Kasanen  wrote:

> ./ffmpeg -f lavfi -i yuvtestsrc=duration=1:size=1200x1440 \
> -s 1200x1440 -f null -vframes 100 -pix_fmt $i -nostats \
> -cpuflags 0 -v error -
>
> This uses 32-bit mul, so POWER8 only.
>
> The following output formats get about 4.5x speedup:
>
> rgb24
>   39980 UNITS in yuv2packed1,   32768 runs,  0 skips
>8774 UNITS in yuv2packed1,   32768 runs,  0 skips
> bgr24
>   40069 UNITS in yuv2packed1,   32768 runs,  0 skips
>8772 UNITS in yuv2packed1,   32766 runs,  2 skips
> rgba
>   39759 UNITS in yuv2packed1,   32768 runs,  0 skips
>8681 UNITS in yuv2packed1,   32767 runs,  1 skips
> bgra
>   39729 UNITS in yuv2packed1,   32768 runs,  0 skips
>8696 UNITS in yuv2packed1,   32766 runs,  2 skips
> argb
>   39766 UNITS in yuv2packed1,   32768 runs,  0 skips
>8672 UNITS in yuv2packed1,   32766 runs,  2 skips
> bgra
>   39784 UNITS in yuv2packed1,   32768 runs,  0 skips
>8659 UNITS in yuv2packed1,   32767 runs,  1 skips
>
> Signed-off-by: Lauri Kasanen 
> ---
>  libswscale/ppc/swscale_vsx.c | 291 
> +++
>  1 file changed, 291 insertions(+)
>
> v2: HAVE_POWER8 from ifdef to if
> Resending due to mail client troubles

Applying.

- Lauri
___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".

[FFmpeg-devel] [PATCH v2] swscale: Remove duplicated code

2019-03-26 Thread Lauri Kasanen
In this function, the exact same clamping happens both in the if and 
unconditionally.

Signed-off-by: Lauri Kasanen 
---
 libswscale/output.c | 10 --
 1 file changed, 10 deletions(-)

v2: Remove the unconditional instead of the if'd clipping.
I'll leave changing the bit pattern to others, as there are so many funcs using
0x100.

diff --git a/libswscale/output.c b/libswscale/output.c
index d7c53e6..d3401f0 100644
--- a/libswscale/output.c
+++ b/libswscale/output.c
@@ -853,11 +853,6 @@ yuv2422_1_c_template(SwsContext *c, const int16_t *buf0,
 V  = av_clip_uint8(V);
 }

-Y1 = av_clip_uint8(Y1);
-Y2 = av_clip_uint8(Y2);
-U  = av_clip_uint8(U);
-V  = av_clip_uint8(V);
-
 output_pixels(i * 4, Y1, U, Y2, V);
 }
 } else {
@@ -875,11 +870,6 @@ yuv2422_1_c_template(SwsContext *c, const int16_t *buf0,
 V  = av_clip_uint8(V);
 }

-Y1 = av_clip_uint8(Y1);
-Y2 = av_clip_uint8(Y2);
-U  = av_clip_uint8(U);
-V  = av_clip_uint8(V);
-
 output_pixels(i * 4, Y1, U, Y2, V);
 }
 }
--
2.6.2

___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".

[FFmpeg-devel] [PATCH] swscale: Remove duplicated code

2019-03-24 Thread Lauri Kasanen
In this function, the exact same clamping happens both in the if and 
unconditionally.

Signed-off-by: Lauri Kasanen 
---
 libswscale/output.c | 14 --
 1 file changed, 14 deletions(-)

diff --git a/libswscale/output.c b/libswscale/output.c
index d7c53e6..8441ddd 100644
--- a/libswscale/output.c
+++ b/libswscale/output.c
@@ -846,13 +846,6 @@ yuv2422_1_c_template(SwsContext *c, const int16_t *buf0,
 int U  = (ubuf0[i]   +64) >> 7;
 int V  = (vbuf0[i]   +64) >> 7;

-if ((Y1 | Y2 | U | V) & 0x100) {
-Y1 = av_clip_uint8(Y1);
-Y2 = av_clip_uint8(Y2);
-U  = av_clip_uint8(U);
-V  = av_clip_uint8(V);
-}
-
 Y1 = av_clip_uint8(Y1);
 Y2 = av_clip_uint8(Y2);
 U  = av_clip_uint8(U);
@@ -868,13 +861,6 @@ yuv2422_1_c_template(SwsContext *c, const int16_t *buf0,
 int U  = (ubuf0[i] + ubuf1[i]+128) >> 8;
 int V  = (vbuf0[i] + vbuf1[i]+128) >> 8;

-if ((Y1 | Y2 | U | V) & 0x100) {
-Y1 = av_clip_uint8(Y1);
-Y2 = av_clip_uint8(Y2);
-U  = av_clip_uint8(U);
-V  = av_clip_uint8(V);
-}
-
 Y1 = av_clip_uint8(Y1);
 Y2 = av_clip_uint8(Y2);
 U  = av_clip_uint8(U);
--
2.6.2

___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".

[FFmpeg-devel] [PATCH 3/3] swscale/ppc: VSX-optimize yuv2422_X

2019-03-24 Thread Lauri Kasanen
./ffmpeg -f lavfi -i yuvtestsrc=duration=1:size=1200x1440 \
-s 1200x720 -f null -vframes 100 -pix_fmt $i -nostats \
-cpuflags 0 -v error -

7.2x speedup:

yuyv422
 126354 UNITS in yuv2packedX,   16384 runs,  0 skips
  16383 UNITS in yuv2packedX,   16382 runs,  2 skips
yvyu422
 117669 UNITS in yuv2packedX,   16384 runs,  0 skips
  16271 UNITS in yuv2packedX,   16379 runs,  5 skips
uyvy422
 117310 UNITS in yuv2packedX,   16384 runs,  0 skips
  16226 UNITS in yuv2packedX,   16382 runs,  2 skips

Signed-off-by: Lauri Kasanen 
---
 libswscale/ppc/swscale_vsx.c | 104 +++
 1 file changed, 104 insertions(+)

diff --git a/libswscale/ppc/swscale_vsx.c b/libswscale/ppc/swscale_vsx.c
index 1c4051b..36b4c33 100644
--- a/libswscale/ppc/swscale_vsx.c
+++ b/libswscale/ppc/swscale_vsx.c
@@ -726,6 +726,93 @@ write422(const vector int16_t vy1, const vector int16_t vy2,
 }
 }

+static av_always_inline void
+yuv2422_X_vsx_template(SwsContext *c, const int16_t *lumFilter,
+ const int16_t **lumSrc, int lumFilterSize,
+ const int16_t *chrFilter, const int16_t **chrUSrc,
+ const int16_t **chrVSrc, int chrFilterSize,
+ const int16_t **alpSrc, uint8_t *dest, int dstW,
+ int y, enum AVPixelFormat target)
+{
+int i, j;
+vector int16_t vy1, vy2, vu, vv;
+vector int32_t vy32[4], vu32[2], vv32[2], tmp, tmp2, tmp3, tmp4;
+vector int16_t vlumFilter[MAX_FILTER_SIZE], vchrFilter[MAX_FILTER_SIZE];
+const vector int32_t start = vec_splats(1 << 18);
+const vector uint32_t shift19 = vec_splats(19U);
+
+for (i = 0; i < lumFilterSize; i++)
+vlumFilter[i] = vec_splats(lumFilter[i]);
+for (i = 0; i < chrFilterSize; i++)
+vchrFilter[i] = vec_splats(chrFilter[i]);
+
+for (i = 0; i < ((dstW + 1) >> 1); i += 8) {
+vy32[0] =
+vy32[1] =
+vy32[2] =
+vy32[3] =
+vu32[0] =
+vu32[1] =
+vv32[0] =
+vv32[1] = start;
+
+for (j = 0; j < lumFilterSize; j++) {
+vv = vec_ld(0, &lumSrc[j][i * 2]);
+tmp = vec_mule(vv, vlumFilter[j]);
+tmp2 = vec_mulo(vv, vlumFilter[j]);
+tmp3 = vec_mergeh(tmp, tmp2);
+tmp4 = vec_mergel(tmp, tmp2);
+
+vy32[0] = vec_adds(vy32[0], tmp3);
+vy32[1] = vec_adds(vy32[1], tmp4);
+
+vv = vec_ld(0, &lumSrc[j][(i + 4) * 2]);
+tmp = vec_mule(vv, vlumFilter[j]);
+tmp2 = vec_mulo(vv, vlumFilter[j]);
+tmp3 = vec_mergeh(tmp, tmp2);
+tmp4 = vec_mergel(tmp, tmp2);
+
+vy32[2] = vec_adds(vy32[2], tmp3);
+vy32[3] = vec_adds(vy32[3], tmp4);
+}
+
+for (j = 0; j < chrFilterSize; j++) {
+vv = vec_ld(0, &chrUSrc[j][i]);
+tmp = vec_mule(vv, vchrFilter[j]);
+tmp2 = vec_mulo(vv, vchrFilter[j]);
+tmp3 = vec_mergeh(tmp, tmp2);
+tmp4 = vec_mergel(tmp, tmp2);
+
+vu32[0] = vec_adds(vu32[0], tmp3);
+vu32[1] = vec_adds(vu32[1], tmp4);
+
+vv = vec_ld(0, &chrVSrc[j][i]);
+tmp = vec_mule(vv, vchrFilter[j]);
+tmp2 = vec_mulo(vv, vchrFilter[j]);
+tmp3 = vec_mergeh(tmp, tmp2);
+tmp4 = vec_mergel(tmp, tmp2);
+
+vv32[0] = vec_adds(vv32[0], tmp3);
+vv32[1] = vec_adds(vv32[1], tmp4);
+}
+
+   for (j = 0; j < 4; j++) {
+   vy32[j] = vec_sra(vy32[j], shift19);
+   }
+   for (j = 0; j < 2; j++) {
+   vu32[j] = vec_sra(vu32[j], shift19);
+   vv32[j] = vec_sra(vv32[j], shift19);
+   }
+
+vy1 = vec_packs(vy32[0], vy32[1]);
+vy2 = vec_packs(vy32[2], vy32[3]);
+vu = vec_packs(vu32[0], vu32[1]);
+vv = vec_packs(vv32[0], vv32[1]);
+
+write422(vy1, vy2, vu, vv, &dest[i * 4], target);
+}
+}
+
 #define SETUP(x, buf0, buf1, alpha) { \
 x = vec_ld(0, buf0); \
 tmp = vec_mule(x, alpha); \
@@ -841,7 +928,21 @@ yuv2422_1_vsx_template(SwsContext *c, const int16_t *buf0,
 }
 }

+#define YUV2PACKEDWRAPPERX(name, base, ext, fmt) \
+static void name ## ext ## _X_vsx(SwsContext *c, const int16_t *lumFilter, \
+const int16_t **lumSrc, int lumFilterSize, \
+const int16_t *chrFilter, const int16_t **chrUSrc, \
+const int16_t **chrVSrc, int chrFilterSize, \
+const int16_t **alpSrc, uint8_t *dest, int dstW, \
+int y) \
+{ \
+name ## base ## _X_vsx_template(c, lumFilter, lumSrc, lumFilterSize, \
+  chrFilter, chrUSrc, chrVSrc, chrFilterSize, \
+  alpSrc, dest, dstW, y, fmt); \
+}
+
 #define YUV2PA

[FFmpeg-devel] [PATCH 1/3] swscale/ppc: VSX-optimize yuv2422_1

2019-03-24 Thread Lauri Kasanen
./ffmpeg -f lavfi -i yuvtestsrc=duration=1:size=1200x1440 \
-s 1200x1440 -f null -vframes 100 -pix_fmt $i -nostats \
-cpuflags 0 -v error -

15.3x speedup:

yuyv422
  14513 UNITS in yuv2packed1,   32768 runs,  0 skips
949 UNITS in yuv2packed1,   32767 runs,  1 skips
yvyu422
  14516 UNITS in yuv2packed1,   32767 runs,  1 skips
943 UNITS in yuv2packed1,   32767 runs,  1 skips
uyvy422
  14530 UNITS in yuv2packed1,   32767 runs,  1 skips
941 UNITS in yuv2packed1,   32766 runs,  2 skips

Signed-off-by: Lauri Kasanen 
---
 libswscale/ppc/swscale_vsx.c | 149 +++
 1 file changed, 149 insertions(+)

Series on top of "swscale/ppc: VSX-optimize yuv2rgb_full".

diff --git a/libswscale/ppc/swscale_vsx.c b/libswscale/ppc/swscale_vsx.c
index 062ab0d..0bb82ac 100644
--- a/libswscale/ppc/swscale_vsx.c
+++ b/libswscale/ppc/swscale_vsx.c
@@ -664,6 +664,143 @@ YUV2RGBWRAPPER(yuv2, rgb_full, xbgr32_full, AV_PIX_FMT_ABGR,  0)
 YUV2RGBWRAPPER(yuv2, rgb_full, rgb24_full,  AV_PIX_FMT_RGB24, 0)
 YUV2RGBWRAPPER(yuv2, rgb_full, bgr24_full,  AV_PIX_FMT_BGR24, 0)

+static av_always_inline void
+write422(const vector int16_t vy1, const vector int16_t vy2,
+ const vector int16_t vu, const vector int16_t vv,
+ uint8_t *dest, const enum AVPixelFormat target)
+{
+vector uint8_t vd1, vd2, tmp;
+const vector uint8_t yuyv1 = (vector uint8_t) {
+ 0x0, 0x10, 0x1, 0x18,
+ 0x2, 0x11, 0x3, 0x19,
+ 0x4, 0x12, 0x5, 0x1a,
+ 0x6, 0x13, 0x7, 0x1b };
+const vector uint8_t yuyv2 = (vector uint8_t) {
+ 0x8, 0x14, 0x9, 0x1c,
+ 0xa, 0x15, 0xb, 0x1d,
+ 0xc, 0x16, 0xd, 0x1e,
+ 0xe, 0x17, 0xf, 0x1f };
+const vector uint8_t yvyu1 = (vector uint8_t) {
+ 0x0, 0x18, 0x1, 0x10,
+ 0x2, 0x19, 0x3, 0x11,
+ 0x4, 0x1a, 0x5, 0x12,
+ 0x6, 0x1b, 0x7, 0x13 };
+const vector uint8_t yvyu2 = (vector uint8_t) {
+ 0x8, 0x1c, 0x9, 0x14,
+ 0xa, 0x1d, 0xb, 0x15,
+ 0xc, 0x1e, 0xd, 0x16,
+ 0xe, 0x1f, 0xf, 0x17 };
+const vector uint8_t uyvy1 = (vector uint8_t) {
+ 0x10, 0x0, 0x18, 0x1,
+ 0x11, 0x2, 0x19, 0x3,
+ 0x12, 0x4, 0x1a, 0x5,
+ 0x13, 0x6, 0x1b, 0x7 };
+const vector uint8_t uyvy2 = (vector uint8_t) {
+ 0x14, 0x8, 0x1c, 0x9,
+ 0x15, 0xa, 0x1d, 0xb,
+ 0x16, 0xc, 0x1e, 0xd,
+ 0x17, 0xe, 0x1f, 0xf };
+
+vd1 = vec_packsu(vy1, vy2);
+vd2 = vec_packsu(vu, vv);
+
+switch (target) {
+case AV_PIX_FMT_YUYV422:
+tmp = vec_perm(vd1, vd2, yuyv1);
+vec_st(tmp, 0, dest);
+tmp = vec_perm(vd1, vd2, yuyv2);
+vec_st(tmp, 16, dest);
+break;
+case AV_PIX_FMT_YVYU422:
+tmp = vec_perm(vd1, vd2, yvyu1);
+vec_st(tmp, 0, dest);
+tmp = vec_perm(vd1, vd2, yvyu2);
+vec_st(tmp, 16, dest);
+break;
+case AV_PIX_FMT_UYVY422:
+tmp = vec_perm(vd1, vd2, uyvy1);
+vec_st(tmp, 0, dest);
+tmp = vec_perm(vd1, vd2, uyvy2);
+vec_st(tmp, 16, dest);
+break;
+}
+}
+
+static av_always_inline void
+yuv2422_1_vsx_template(SwsContext *c, const int16_t *buf0,
+ const int16_t *ubuf[2], const int16_t *vbuf[2],
+ const int16_t *abuf0, uint8_t *dest, int dstW,
+ int uvalpha, int y, enum AVPixelFormat target)
+{
+const int16_t *ubuf0 = ubuf[0], *vbuf0 = vbuf[0];
+vector int16_t vy1, vy2, vu, vv, tmp;
+const vector int16_t add64 = vec_splats((int16_t) 64);
+const vector int16_t add128 = vec_splats((int16_t) 128);
+const vector uint16_t shift7 = vec_splat_u16(7);
+const vector uint16_t shift8 = vec_splat_u16(8);
+int i;
+
+if (uvalpha < 2048) {
+for (i = 0; i < ((dstW + 1) >> 1); i += 8) {
+vy1 = vec_ld(0, &buf0[i * 2]);
+vy2 = vec_ld(0, &buf0[(i + 4) * 2]);
+vu = vec_ld(0, &ubuf0[i]);
+vv = vec_ld(0, &vbuf0[i]);
+
+vy1 = vec_add(vy1, add64);
+vy2 = vec_add(vy2, add64);
+vu = vec_add(vu, add64);
+vv = vec_add(vv, add64);
+
+vy1 = vec_sra(vy1, shift7);
+vy2 = vec_sra(vy2, shift7);
+vu = vec_sra(vu, shift7);
+vv = vec_sra(vv, shift7);
+
+write422(vy1, vy2, v

[FFmpeg-devel] [PATCH 2/3] swscale/ppc: VSX-optimize yuv2422_2

2019-03-24 Thread Lauri Kasanen
./ffmpeg -f lavfi -i yuvtestsrc=duration=1:size=1200x1440 -sws_flags area \
-s 1200x720 -f null -vframes 100 -pix_fmt $i -nostats \
-cpuflags 0 -v error -

5.1x speedup:

yuyv422
  19339 UNITS in yuv2packed2,   16384 runs,  0 skips
   3718 UNITS in yuv2packed2,   16383 runs,  1 skips
yvyu422
  19438 UNITS in yuv2packed2,   16384 runs,  0 skips
   3800 UNITS in yuv2packed2,   16380 runs,  4 skips
uyvy422
  19128 UNITS in yuv2packed2,   16384 runs,  0 skips
   3721 UNITS in yuv2packed2,   16380 runs,  4 skips

Signed-off-by: Lauri Kasanen 
---
 libswscale/ppc/swscale_vsx.c | 69 
 1 file changed, 69 insertions(+)

diff --git a/libswscale/ppc/swscale_vsx.c b/libswscale/ppc/swscale_vsx.c
index 0bb82ac..1c4051b 100644
--- a/libswscale/ppc/swscale_vsx.c
+++ b/libswscale/ppc/swscale_vsx.c
@@ -726,6 +726,61 @@ write422(const vector int16_t vy1, const vector int16_t vy2,
 }
 }

+#define SETUP(x, buf0, buf1, alpha) { \
+x = vec_ld(0, buf0); \
+tmp = vec_mule(x, alpha); \
+tmp2 = vec_mulo(x, alpha); \
+tmp3 = vec_mergeh(tmp, tmp2); \
+tmp4 = vec_mergel(tmp, tmp2); \
+\
+x = vec_ld(0, buf1); \
+tmp = vec_mule(x, alpha); \
+tmp2 = vec_mulo(x, alpha); \
+tmp5 = vec_mergeh(tmp, tmp2); \
+tmp6 = vec_mergel(tmp, tmp2); \
+\
+tmp3 = vec_add(tmp3, tmp5); \
+tmp4 = vec_add(tmp4, tmp6); \
+\
+tmp3 = vec_sra(tmp3, shift19); \
+tmp4 = vec_sra(tmp4, shift19); \
+x = vec_packs(tmp3, tmp4); \
+}
+
+static av_always_inline void
+yuv2422_2_vsx_template(SwsContext *c, const int16_t *buf[2],
+ const int16_t *ubuf[2], const int16_t *vbuf[2],
+ const int16_t *abuf[2], uint8_t *dest, int dstW,
+ int yalpha, int uvalpha, int y,
+ enum AVPixelFormat target)
+{
+const int16_t *buf0  = buf[0],  *buf1  = buf[1],
+  *ubuf0 = ubuf[0], *ubuf1 = ubuf[1],
+  *vbuf0 = vbuf[0], *vbuf1 = vbuf[1];
+const int16_t  yalpha1 = 4096 - yalpha;
+const int16_t uvalpha1 = 4096 - uvalpha;
+vector int16_t vy1, vy2, vu, vv;
+vector int32_t tmp, tmp2, tmp3, tmp4, tmp5, tmp6;
+const vector int16_t vyalpha1 = vec_splats(yalpha1);
+const vector int16_t vuvalpha1 = vec_splats(uvalpha1);
+const vector uint32_t shift19 = vec_splats(19U);
+int i;
+av_assert2(yalpha  <= 4096U);
+av_assert2(uvalpha <= 4096U);
+
+for (i = 0; i < ((dstW + 1) >> 1); i += 8) {
+
+SETUP(vy1, &buf0[i * 2], &buf1[i * 2], vyalpha1)
+SETUP(vy2, &buf0[(i + 4) * 2], &buf1[(i + 4) * 2], vyalpha1)
+SETUP(vu, &ubuf0[i], &ubuf1[i], vuvalpha1)
+SETUP(vv, &vbuf0[i], &vbuf1[i], vuvalpha1)
+
+write422(vy1, vy2, vu, vv, &dest[i * 4], target);
+}
+}
+
+#undef SETUP
+
 static av_always_inline void
 yuv2422_1_vsx_template(SwsContext *c, const int16_t *buf0,
  const int16_t *ubuf[2], const int16_t *vbuf[2],
@@ -786,7 +841,18 @@ yuv2422_1_vsx_template(SwsContext *c, const int16_t *buf0,
 }
 }

+#define YUV2PACKEDWRAPPER2(name, base, ext, fmt) \
+static void name ## ext ## _2_vsx(SwsContext *c, const int16_t *buf[2], \
+const int16_t *ubuf[2], const int16_t *vbuf[2], \
+const int16_t *abuf[2], uint8_t *dest, int dstW, \
+int yalpha, int uvalpha, int y) \
+{ \
+name ## base ## _2_vsx_template(c, buf, ubuf, vbuf, abuf, \
+  dest, dstW, yalpha, uvalpha, y, fmt); \
+}
+
 #define YUV2PACKEDWRAPPER(name, base, ext, fmt) \
+YUV2PACKEDWRAPPER2(name, base, ext, fmt) \
 static void name ## ext ## _1_vsx(SwsContext *c, const int16_t *buf0, \
 const int16_t *ubuf[2], const int16_t *vbuf[2], \
 const int16_t *abuf0, uint8_t *dest, int dstW, \
@@ -909,12 +975,15 @@ av_cold void ff_sws_init_swscale_vsx(SwsContext *c)
 switch (dstFormat) {
 case AV_PIX_FMT_YUYV422:
 c->yuv2packed1 = yuv2yuyv422_1_vsx;
+c->yuv2packed2 = yuv2yuyv422_2_vsx;
 break;
 case AV_PIX_FMT_YVYU422:
 c->yuv2packed1 = yuv2yvyu422_1_vsx;
+c->yuv2packed2 = yuv2yvyu422_2_vsx;
 break;
 case AV_PIX_FMT_UYVY422:
 c->yuv2packed1 = yuv2uyvy422_1_vsx;
+c->yuv2packed2 = yuv2uyvy422_2_vsx;
 break;
 }
 }
--
2.6.2

___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".

[FFmpeg-devel] [PATCH 3/3 v2] swscale/ppc: VSX-optimize yuv2422_X

2019-03-25 Thread Lauri Kasanen
./ffmpeg -f lavfi -i yuvtestsrc=duration=1:size=1200x1440 \
  -s 1200x720 -f null -vframes 100 -pix_fmt $i -nostats \
  -cpuflags 0 -v error -

7.2x speedup:

yuyv422
 126354 UNITS in yuv2packedX,   16384 runs,  0 skips
  16383 UNITS in yuv2packedX,   16382 runs,  2 skips
yvyu422
 117669 UNITS in yuv2packedX,   16384 runs,  0 skips
  16271 UNITS in yuv2packedX,   16379 runs,  5 skips
uyvy422
 117310 UNITS in yuv2packedX,   16384 runs,  0 skips
  16226 UNITS in yuv2packedX,   16382 runs,  2 skips

Signed-off-by: Lauri Kasanen 
---
 libswscale/ppc/swscale_vsx.c | 104 +++
 1 file changed, 104 insertions(+)

v2: Fix accidental tabs. No code changes

diff --git a/libswscale/ppc/swscale_vsx.c b/libswscale/ppc/swscale_vsx.c
index 1c4051b..69ec63d 100644
--- a/libswscale/ppc/swscale_vsx.c
+++ b/libswscale/ppc/swscale_vsx.c
@@ -726,6 +726,93 @@ write422(const vector int16_t vy1, const vector int16_t vy2,
 }
 }

+static av_always_inline void
+yuv2422_X_vsx_template(SwsContext *c, const int16_t *lumFilter,
+ const int16_t **lumSrc, int lumFilterSize,
+ const int16_t *chrFilter, const int16_t **chrUSrc,
+ const int16_t **chrVSrc, int chrFilterSize,
+ const int16_t **alpSrc, uint8_t *dest, int dstW,
+ int y, enum AVPixelFormat target)
+{
+int i, j;
+vector int16_t vy1, vy2, vu, vv;
+vector int32_t vy32[4], vu32[2], vv32[2], tmp, tmp2, tmp3, tmp4;
+vector int16_t vlumFilter[MAX_FILTER_SIZE], vchrFilter[MAX_FILTER_SIZE];
+const vector int32_t start = vec_splats(1 << 18);
+const vector uint32_t shift19 = vec_splats(19U);
+
+for (i = 0; i < lumFilterSize; i++)
+vlumFilter[i] = vec_splats(lumFilter[i]);
+for (i = 0; i < chrFilterSize; i++)
+vchrFilter[i] = vec_splats(chrFilter[i]);
+
+for (i = 0; i < ((dstW + 1) >> 1); i += 8) {
+vy32[0] =
+vy32[1] =
+vy32[2] =
+vy32[3] =
+vu32[0] =
+vu32[1] =
+vv32[0] =
+vv32[1] = start;
+
+for (j = 0; j < lumFilterSize; j++) {
+vv = vec_ld(0, &lumSrc[j][i * 2]);
+tmp = vec_mule(vv, vlumFilter[j]);
+tmp2 = vec_mulo(vv, vlumFilter[j]);
+tmp3 = vec_mergeh(tmp, tmp2);
+tmp4 = vec_mergel(tmp, tmp2);
+
+vy32[0] = vec_adds(vy32[0], tmp3);
+vy32[1] = vec_adds(vy32[1], tmp4);
+
+vv = vec_ld(0, &lumSrc[j][(i + 4) * 2]);
+tmp = vec_mule(vv, vlumFilter[j]);
+tmp2 = vec_mulo(vv, vlumFilter[j]);
+tmp3 = vec_mergeh(tmp, tmp2);
+tmp4 = vec_mergel(tmp, tmp2);
+
+vy32[2] = vec_adds(vy32[2], tmp3);
+vy32[3] = vec_adds(vy32[3], tmp4);
+}
+
+for (j = 0; j < chrFilterSize; j++) {
+vv = vec_ld(0, &chrUSrc[j][i]);
+tmp = vec_mule(vv, vchrFilter[j]);
+tmp2 = vec_mulo(vv, vchrFilter[j]);
+tmp3 = vec_mergeh(tmp, tmp2);
+tmp4 = vec_mergel(tmp, tmp2);
+
+vu32[0] = vec_adds(vu32[0], tmp3);
+vu32[1] = vec_adds(vu32[1], tmp4);
+
+vv = vec_ld(0, &chrVSrc[j][i]);
+tmp = vec_mule(vv, vchrFilter[j]);
+tmp2 = vec_mulo(vv, vchrFilter[j]);
+tmp3 = vec_mergeh(tmp, tmp2);
+tmp4 = vec_mergel(tmp, tmp2);
+
+vv32[0] = vec_adds(vv32[0], tmp3);
+vv32[1] = vec_adds(vv32[1], tmp4);
+}
+
+for (j = 0; j < 4; j++) {
+vy32[j] = vec_sra(vy32[j], shift19);
+}
+for (j = 0; j < 2; j++) {
+vu32[j] = vec_sra(vu32[j], shift19);
+vv32[j] = vec_sra(vv32[j], shift19);
+}
+
+vy1 = vec_packs(vy32[0], vy32[1]);
+vy2 = vec_packs(vy32[2], vy32[3]);
+vu = vec_packs(vu32[0], vu32[1]);
+vv = vec_packs(vv32[0], vv32[1]);
+
+write422(vy1, vy2, vu, vv, &dest[i * 4], target);
+}
+}
+
 #define SETUP(x, buf0, buf1, alpha) { \
 x = vec_ld(0, buf0); \
 tmp = vec_mule(x, alpha); \
@@ -841,7 +928,21 @@ yuv2422_1_vsx_template(SwsContext *c, const int16_t *buf0,
 }
 }

+#define YUV2PACKEDWRAPPERX(name, base, ext, fmt) \
+static void name ## ext ## _X_vsx(SwsContext *c, const int16_t *lumFilter, \
+const int16_t **lumSrc, int lumFilterSize, \
+const int16_t *chrFilter, const int16_t 
**chrUSrc, \
+const int16_t **chrVSrc, int chrFilterSize, \
+const int16_t **alpSrc, uint8_t *dest, int 
dstW, \
+int y) \
+{ \
+name ## base ## _X_vsx_template(c, lumFilter, lumSrc, lumFilterSize, \
+  chrFilter, chrUSrc, chrVSrc, chrFilterSize, \
+  alpSrc, dest, dstW, y, fmt); \
+}
+

Re: [FFmpeg-devel] [PATCH] swscale: Remove duplicated code

2019-03-25 Thread Lauri Kasanen
On Mon, 25 Mar 2019 11:17:38 +0100
Michael Niedermayer  wrote:

> On Sun, Mar 24, 2019 at 01:04:51PM +0200, Lauri Kasanen wrote:
> > In this function, the exact same clamping happens both in the if and 
> > unconditionally.
> >
> > Signed-off-by: Lauri Kasanen 
> > ---
> >  libswscale/output.c | 14 --
> >  1 file changed, 14 deletions(-)
>
> The removed code is the one that should stay, the other should be
> removed.
> one check for a rarely true condition should be faster than 4 checks

Yes, I thought so too, but the commit that added the unconditional code
says it fixes a bug...

- Lauri
___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".

[FFmpeg-devel] [PATCH] swscale/ppc: VSX-optimize non-full-chroma yuv2rgb_2

2019-04-05 Thread Lauri Kasanen
./ffmpeg -f lavfi -i yuvtestsrc=duration=1:size=1200x1440 -sws_flags 
fast_bilinear \
-s 1200x720 -f null -vframes 100 -pix_fmt $i -nostats \
-cpuflags 0 -v error -

32-bit mul, power8 only.

~2x speedup:

rgb24
  24431 UNITS in yuv2packed2,   16384 runs,  0 skips
  13783 UNITS in yuv2packed2,   16383 runs,  1 skips
bgr24
  24396 UNITS in yuv2packed2,   16384 runs,  0 skips
  14059 UNITS in yuv2packed2,   16384 runs,  0 skips
rgba
  26815 UNITS in yuv2packed2,   16383 runs,  1 skips
  12797 UNITS in yuv2packed2,   16383 runs,  1 skips
bgra
  27060 UNITS in yuv2packed2,   16384 runs,  0 skips
  13138 UNITS in yuv2packed2,   16384 runs,  0 skips
argb
  26998 UNITS in yuv2packed2,   16384 runs,  0 skips
  12728 UNITS in yuv2packed2,   16381 runs,  3 skips
bgra
  26651 UNITS in yuv2packed2,   16384 runs,  0 skips
  13124 UNITS in yuv2packed2,   16384 runs,  0 skips

This is a low speedup, but the x86 mmx version also gets only ~2x. The mmx
version is also heavily inaccurate, while the vsx version has high accuracy.

Signed-off-by: Lauri Kasanen 
---
 libswscale/ppc/swscale_vsx.c | 188 +++
 1 file changed, 188 insertions(+)

diff --git a/libswscale/ppc/swscale_vsx.c b/libswscale/ppc/swscale_vsx.c
index e05f9ec..ba00791 100644
--- a/libswscale/ppc/swscale_vsx.c
+++ b/libswscale/ppc/swscale_vsx.c
@@ -793,6 +793,180 @@ yuv2rgb_full_2_vsx_template(SwsContext *c, const int16_t 
*buf[2],
 }
 }

+static av_always_inline void
+yuv2rgb_2_vsx_template(SwsContext *c, const int16_t *buf[2],
+ const int16_t *ubuf[2], const int16_t *vbuf[2],
+ const int16_t *abuf[2], uint8_t *dest, int dstW,
+ int yalpha, int uvalpha, int y,
+ enum AVPixelFormat target, int hasAlpha)
+{
+const int16_t *buf0  = buf[0],  *buf1  = buf[1],
+  *ubuf0 = ubuf[0], *ubuf1 = ubuf[1],
+  *vbuf0 = vbuf[0], *vbuf1 = vbuf[1],
+  *abuf0 = hasAlpha ? abuf[0] : NULL,
+  *abuf1 = hasAlpha ? abuf[1] : NULL;
+const int16_t  yalpha1 = 4096 - yalpha;
+const int16_t uvalpha1 = 4096 - uvalpha;
+vector int16_t vy, vu, vv, A = vec_splat_s16(0);
+vector int32_t vy32_l, vy32_r, vu32_l, vu32_r, vv32_l, vv32_r, tmp32;
+vector int32_t R_l, R_r, G_l, G_r, B_l, B_r, vud32_l, vud32_r, vvd32_l, 
vvd32_r;
+vector int32_t tmp, tmp2, tmp3, tmp4, tmp5, tmp6;
+vector uint16_t rd16, gd16, bd16;
+vector uint8_t rd, bd, gd, ad, out0, out1, tmp8;
+const vector int16_t vyalpha1 = vec_splats(yalpha1);
+const vector int16_t vuvalpha1 = vec_splats(uvalpha1);
+const vector int16_t vyalpha = vec_splats((int16_t) yalpha);
+const vector int16_t vuvalpha = vec_splats((int16_t) uvalpha);
+const vector uint16_t zero16 = vec_splat_u16(0);
+const vector int32_t y_offset = vec_splats(c->yuv2rgb_y_offset);
+const vector int32_t y_coeff = vec_splats(c->yuv2rgb_y_coeff);
+const vector int32_t y_add = vec_splats(1 << 21);
+const vector int32_t v2r_coeff = vec_splats(c->yuv2rgb_v2r_coeff);
+const vector int32_t v2g_coeff = vec_splats(c->yuv2rgb_v2g_coeff);
+const vector int32_t u2g_coeff = vec_splats(c->yuv2rgb_u2g_coeff);
+const vector int32_t u2b_coeff = vec_splats(c->yuv2rgb_u2b_coeff);
+const vector int32_t rgbclip = vec_splats(1 << 30);
+const vector int32_t zero32 = vec_splat_s32(0);
+const vector uint32_t shift19 = vec_splats(19U);
+const vector uint32_t shift22 = vec_splats(22U);
+const vector uint32_t shift10 = vec_splat_u32(10);
+const vector int32_t dec128 = vec_splats(128 << 19);
+const vector int32_t add18 = vec_splats(1 << 18);
+int i;
+
+// Various permutations
+const vector uint8_t doubleleft = (vector uint8_t) {0, 1, 2, 3,
+0, 1, 2, 3,
+4, 5, 6, 7,
+4, 5, 6, 7 };
+const vector uint8_t doubleright = (vector uint8_t) {8, 9, 10, 11,
+8, 9, 10, 11,
+12, 13, 14, 15,
+12, 13, 14, 15 };
+const vector uint8_t perm3rg0 = (vector uint8_t) {0x0, 0x10, 0,
+  0x1, 0x11, 0,
+  0x2, 0x12, 0,
+  0x3, 0x13, 0,
+  0x4, 0x14, 0,
+  0x5 };
+const vector uint8_t perm3rg1 = (vector uint8_t) { 0x15, 0,
+  0x6, 0x16, 0,
+

[FFmpeg-devel] [PATCH] swscale/ppc: VSX-optimize non-full-chroma yuv2rgb_1

2019-03-31 Thread Lauri Kasanen
./ffmpeg -f lavfi -i yuvtestsrc=duration=1:size=1200x1440 -sws_flags 
fast_bilinear \
-s 1200x1440 -f null -vframes 100 -pix_fmt $i -nostats \
-cpuflags 0 -v error -

32-bit mul, power8 only.

1.8-2.3x speedup:

rgb24
  18192 UNITS in yuv2packed1,   32767 runs,  1 skips
   9983 UNITS in yuv2packed1,   32760 runs,  8 skips
bgr24
  18665 UNITS in yuv2packed1,   32766 runs,  2 skips
   9925 UNITS in yuv2packed1,   32763 runs,  5 skips
rgba
  20239 UNITS in yuv2packed1,   32767 runs,  1 skips
   8794 UNITS in yuv2packed1,   32759 runs,  9 skips
bgra
  20354 UNITS in yuv2packed1,   32768 runs,  0 skips
   8770 UNITS in yuv2packed1,   32761 runs,  7 skips
argb
  20185 UNITS in yuv2packed1,   32768 runs,  0 skips
   8761 UNITS in yuv2packed1,   32761 runs,  7 skips
bgra
  20360 UNITS in yuv2packed1,   32766 runs,  2 skips
   8759 UNITS in yuv2packed1,   32764 runs,  4 skips

This is a low speedup, but the x86 mmx version also gets only ~2x. The mmx
version is also heavily inaccurate, while the vsx version has high accuracy.

Signed-off-by: Lauri Kasanen 
---
 libswscale/ppc/swscale_vsx.c | 425 +--
 1 file changed, 330 insertions(+), 95 deletions(-)

Okay, so I'm a bit unsure what to do here. I'm sure it could be faster if made
as inaccurate as the mmx version, but that differs a lot from the C version,
which itself is inaccurate vs the _full C and vsx versions. There are no other
versions than mmx and C to compare against.

I took the approach of using the accurate _full YUV logic, just writing two
pixels for each UV value. The C version uses a LUT, resulting in ~1/255
rounding errors in most pixels compared to the accurate C/VSX _full logic. The
MMX version does low accuracy logic, differing from the C LUT as much as
10/255 per pixel.

Speed or accuracy? IMHO the mmx errors are far too large.

diff --git a/libswscale/ppc/swscale_vsx.c b/libswscale/ppc/swscale_vsx.c
index 69ec63d..0ac8cac 100644
--- a/libswscale/ppc/swscale_vsx.c
+++ b/libswscale/ppc/swscale_vsx.c
@@ -422,6 +422,104 @@ yuv2NBPSX(16, BE, 1, 16, int32_t)
 yuv2NBPSX(16, LE, 0, 16, int32_t)
 #endif

+#define WRITERGB \
+R_l = vec_max(R_l, zero32); \
+R_r = vec_max(R_r, zero32); \
+G_l = vec_max(G_l, zero32); \
+G_r = vec_max(G_r, zero32); \
+B_l = vec_max(B_l, zero32); \
+B_r = vec_max(B_r, zero32); \
+\
+R_l = vec_min(R_l, rgbclip); \
+R_r = vec_min(R_r, rgbclip); \
+G_l = vec_min(G_l, rgbclip); \
+G_r = vec_min(G_r, rgbclip); \
+B_l = vec_min(B_l, rgbclip); \
+B_r = vec_min(B_r, rgbclip); \
+\
+R_l = vec_sr(R_l, shift22); \
+R_r = vec_sr(R_r, shift22); \
+G_l = vec_sr(G_l, shift22); \
+G_r = vec_sr(G_r, shift22); \
+B_l = vec_sr(B_l, shift22); \
+B_r = vec_sr(B_r, shift22); \
+\
+rd16 = vec_packsu(R_l, R_r); \
+gd16 = vec_packsu(G_l, G_r); \
+bd16 = vec_packsu(B_l, B_r); \
+rd = vec_packsu(rd16, zero16); \
+gd = vec_packsu(gd16, zero16); \
+bd = vec_packsu(bd16, zero16); \
+\
+switch(target) { \
+case AV_PIX_FMT_RGB24: \
+out0 = vec_perm(rd, gd, perm3rg0); \
+out0 = vec_perm(out0, bd, perm3tb0); \
+out1 = vec_perm(rd, gd, perm3rg1); \
+out1 = vec_perm(out1, bd, perm3tb1); \
+\
+vec_vsx_st(out0, 0, dest); \
+vec_vsx_st(out1, 16, dest); \
+\
+dest += 24; \
+break; \
+case AV_PIX_FMT_BGR24: \
+out0 = vec_perm(bd, gd, perm3rg0); \
+out0 = vec_perm(out0, rd, perm3tb0); \
+out1 = vec_perm(bd, gd, perm3rg1); \
+out1 = vec_perm(out1, rd, perm3tb1); \
+\
+vec_vsx_st(out0, 0, dest); \
+vec_vsx_st(out1, 16, dest); \
+\
+dest += 24; \
+break; \
+case AV_PIX_FMT_BGRA: \
+out0 = vec_mergeh(bd, gd); \
+out1 = vec_mergeh(rd, ad); \
+\
+tmp8 = (vector uint8_t) vec_mergeh((vector uint16_t) out0, (vector 
uint16_t) out1); \
+vec_vsx_st(tmp8, 0, dest); \
+tmp8 = (vector uint8_t) vec_mergel((vector uint16_t) out0, (vector 
uint16_t) out1); \
+vec_vsx_st(tmp8, 16, dest); \
+\
+dest += 32; \
+break; \
+case AV_PIX_FMT_RGBA: \
+out0 = vec_mergeh(rd, gd); \
+out1 = vec_mergeh(bd, ad); \
+\
+tmp8 = (vector uint8_t) vec_mergeh((vector uint16_t) out0, (vector 
uint16_t) out1); \
+vec_vsx_st(tmp8, 0, dest); \
+tmp8 = (vector uint8_t) vec_mergel((vector uint16_t) out0, (vector 
uint16_t) out1); \
+vec_vsx_st(tmp8, 16, dest); \
+\
+dest += 32; \
+break; \
+case AV_PIX_FMT_ARGB: \
+out0 = vec_mergeh(ad, rd); \
+out1 = vec_mergeh(gd, bd

Re: [FFmpeg-devel] [PATCH 1/3] swscale/ppc: VSX-optimize yuv2422_1

2019-03-31 Thread Lauri Kasanen
On Sun, 24 Mar 2019 15:10:35 +0200
Lauri Kasanen  wrote:

> ./ffmpeg -f lavfi -i yuvtestsrc=duration=1:size=1200x1440 \
> -s 1200x1440 -f null -vframes 100 -pix_fmt $i -nostats \
> -cpuflags 0 -v error -
>
> 15.3x speedup:
>
> yuyv422
>   14513 UNITS in yuv2packed1,   32768 runs,  0 skips
> 949 UNITS in yuv2packed1,   32767 runs,  1 skips
> yvyu422
>   14516 UNITS in yuv2packed1,   32767 runs,  1 skips
> 943 UNITS in yuv2packed1,   32767 runs,  1 skips
> uyvy422
>   14530 UNITS in yuv2packed1,   32767 runs,  1 skips
> 941 UNITS in yuv2packed1,   32766 runs,  2 skips
>
> Signed-off-by: Lauri Kasanen 
> ---
>  libswscale/ppc/swscale_vsx.c | 149 
> +++
>  1 file changed, 149 insertions(+)

Applying these.

- Lauri
___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".

Re: [FFmpeg-devel] [PATCH] This patch addresses Trac ticket #5570. The optimized functions are in file libswscale/ppc/input_vsx.c. Each optimized function name is a concatenation of the corresponding

2019-04-01 Thread Lauri Kasanen
On Mon, 1 Apr 2019 09:07:48 +0300
slava  wrote:

> Sorry for the title. It is my first experience with git send-email. Can I
> make a benchmark with handwritten tests, or is there a standard tool in
> ffmpeg? And will a benchmark on x86-64 be informative?

We have standard bench macros, START_TIMER and STOP_TIMER. Put those
around the function's callsite, then do some ffmpeg run that calls that
specific function. Then add "-cpuflags 0" to the call to get the C
results, and from the numbers you can calculate the speedup.

Both the C and VSX runs should be done on the POWER machine. A Qemu VM,
emulating POWER instructions on x86-64, would probably be useless for
benchmark purposes. There are free POWER VMs available for testing from
a few places.

- Lauri
___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".

[FFmpeg-devel] [PATCH] swscale/ppc: VSX-optimize yuv2rgb_full_2

2019-04-01 Thread Lauri Kasanen
./ffmpeg -f lavfi -i yuvtestsrc=duration=1:size=1200x1440 -sws_flags area \
-s 1200x720 -f null -vframes 100 -pix_fmt $i -nostats \
-cpuflags 0 -v error -

32-bit mul, power8 only.

~4x speedup:

rgb24
  52763 UNITS in yuv2packed2,   16384 runs,  0 skips
  13453 UNITS in yuv2packed2,   16384 runs,  0 skips
bgr24
  53144 UNITS in yuv2packed2,   16384 runs,  0 skips
  13616 UNITS in yuv2packed2,   16384 runs,  0 skips
rgba
  52796 UNITS in yuv2packed2,   16384 runs,  0 skips
  12904 UNITS in yuv2packed2,   16384 runs,  0 skips
bgra
  52732 UNITS in yuv2packed2,   16384 runs,  0 skips
  13262 UNITS in yuv2packed2,   16384 runs,  0 skips
argb
  52661 UNITS in yuv2packed2,   16384 runs,  0 skips
  12879 UNITS in yuv2packed2,   16384 runs,  0 skips
bgra
  52662 UNITS in yuv2packed2,   16384 runs,  0 skips
  12932 UNITS in yuv2packed2,   16384 runs,  0 skips

Signed-off-by: Lauri Kasanen 
---
 libswscale/ppc/swscale_vsx.c | 166 +++
 1 file changed, 166 insertions(+)

diff --git a/libswscale/ppc/swscale_vsx.c b/libswscale/ppc/swscale_vsx.c
index 0ac8cac..6ff8b62 100644
--- a/libswscale/ppc/swscale_vsx.c
+++ b/libswscale/ppc/swscale_vsx.c
@@ -520,6 +520,148 @@ yuv2NBPSX(16, LE, 0, 16, int32_t)
 break; \
 }

+#define SETUP(x, buf0, alpha1, buf1, alpha) { \
+x = vec_ld(0, buf0); \
+tmp = vec_mule(x, alpha1); \
+tmp2 = vec_mulo(x, alpha1); \
+tmp3 = vec_mergeh(tmp, tmp2); \
+tmp4 = vec_mergel(tmp, tmp2); \
+\
+x = vec_ld(0, buf1); \
+tmp = vec_mule(x, alpha); \
+tmp2 = vec_mulo(x, alpha); \
+tmp5 = vec_mergeh(tmp, tmp2); \
+tmp6 = vec_mergel(tmp, tmp2); \
+\
+tmp3 = vec_add(tmp3, tmp5); \
+tmp4 = vec_add(tmp4, tmp6); \
+}
+
+
+static av_always_inline void
+yuv2rgb_full_2_vsx_template(SwsContext *c, const int16_t *buf[2],
+ const int16_t *ubuf[2], const int16_t *vbuf[2],
+ const int16_t *abuf[2], uint8_t *dest, int dstW,
+ int yalpha, int uvalpha, int y,
+ enum AVPixelFormat target, int hasAlpha)
+{
+const int16_t *buf0  = buf[0],  *buf1  = buf[1],
+  *ubuf0 = ubuf[0], *ubuf1 = ubuf[1],
+  *vbuf0 = vbuf[0], *vbuf1 = vbuf[1],
+  *abuf0 = hasAlpha ? abuf[0] : NULL,
+  *abuf1 = hasAlpha ? abuf[1] : NULL;
+const int16_t  yalpha1 = 4096 - yalpha;
+const int16_t uvalpha1 = 4096 - uvalpha;
+vector int16_t vy, vu, vv, A = vec_splat_s16(0);
+vector int32_t vy32_l, vy32_r, vu32_l, vu32_r, vv32_l, vv32_r, tmp32;
+vector int32_t R_l, R_r, G_l, G_r, B_l, B_r;
+vector int32_t tmp, tmp2, tmp3, tmp4, tmp5, tmp6;
+vector uint16_t rd16, gd16, bd16;
+vector uint8_t rd, bd, gd, ad, out0, out1, tmp8;
+const vector int16_t vyalpha1 = vec_splats(yalpha1);
+const vector int16_t vuvalpha1 = vec_splats(uvalpha1);
+const vector int16_t vyalpha = vec_splats((int16_t) yalpha);
+const vector int16_t vuvalpha = vec_splats((int16_t) uvalpha);
+const vector uint16_t zero16 = vec_splat_u16(0);
+const vector int32_t y_offset = vec_splats(c->yuv2rgb_y_offset);
+const vector int32_t y_coeff = vec_splats(c->yuv2rgb_y_coeff);
+const vector int32_t y_add = vec_splats(1 << 21);
+const vector int32_t v2r_coeff = vec_splats(c->yuv2rgb_v2r_coeff);
+const vector int32_t v2g_coeff = vec_splats(c->yuv2rgb_v2g_coeff);
+const vector int32_t u2g_coeff = vec_splats(c->yuv2rgb_u2g_coeff);
+const vector int32_t u2b_coeff = vec_splats(c->yuv2rgb_u2b_coeff);
+const vector int32_t rgbclip = vec_splats(1 << 30);
+const vector int32_t zero32 = vec_splat_s32(0);
+const vector uint32_t shift19 = vec_splats(19U);
+const vector uint32_t shift22 = vec_splats(22U);
+const vector uint32_t shift10 = vec_splat_u32(10);
+const vector int32_t dec128 = vec_splats(128 << 19);
+const vector int32_t add18 = vec_splats(1 << 18);
+int i;
+
+// Various permutations
+const vector uint8_t perm3rg0 = (vector uint8_t) {0x0, 0x10, 0,
+  0x1, 0x11, 0,
+  0x2, 0x12, 0,
+  0x3, 0x13, 0,
+  0x4, 0x14, 0,
+  0x5 };
+const vector uint8_t perm3rg1 = (vector uint8_t) { 0x15, 0,
+  0x6, 0x16, 0,
+  0x7, 0x17, 0 };
+const vector uint8_t perm3tb0 = (vector uint8_t) {0x0, 0x1, 0x10,
+  0x3, 0x4, 0x11,
+  0x6, 0x7, 0x12,
+  

[FFmpeg-devel] [PATCH] swscale/ppc: VSX-optimize yuv2rgb_full_X

2019-04-01 Thread Lauri Kasanen
./ffmpeg -f lavfi -i yuvtestsrc=duration=1:size=1200x1440 \
-s 1200x720 -f null -vframes 100 -pix_fmt $i -nostats \
-cpuflags 0 -v error -

32-bit mul, power8 only.

~6.4x speedup:

rgb24
 214278 UNITS in yuv2packedX,   16384 runs,  0 skips
  33249 UNITS in yuv2packedX,   16384 runs,  0 skips
bgr24
 214616 UNITS in yuv2packedX,   16384 runs,  0 skips
  33233 UNITS in yuv2packedX,   16384 runs,  0 skips
rgba
 214517 UNITS in yuv2packedX,   16384 runs,  0 skips
  33271 UNITS in yuv2packedX,   16384 runs,  0 skips
bgra
 214973 UNITS in yuv2packedX,   16384 runs,  0 skips
  33397 UNITS in yuv2packedX,   16384 runs,  0 skips
argb
 214613 UNITS in yuv2packedX,   16384 runs,  0 skips
  33310 UNITS in yuv2packedX,   16384 runs,  0 skips
bgra
 214637 UNITS in yuv2packedX,   16384 runs,  0 skips
  0 UNITS in yuv2packedX,   16384 runs,  0 skips

Signed-off-by: Lauri Kasanen 
---
 libswscale/ppc/swscale_vsx.c | 160 +++
 1 file changed, 160 insertions(+)

diff --git a/libswscale/ppc/swscale_vsx.c b/libswscale/ppc/swscale_vsx.c
index 6ff8b62..e05f9ec 100644
--- a/libswscale/ppc/swscale_vsx.c
+++ b/libswscale/ppc/swscale_vsx.c
@@ -520,6 +520,139 @@ yuv2NBPSX(16, LE, 0, 16, int32_t)
 break; \
 }

+static av_always_inline void
+yuv2rgb_full_X_vsx_template(SwsContext *c, const int16_t *lumFilter,
+  const int16_t **lumSrc, int lumFilterSize,
+  const int16_t *chrFilter, const int16_t **chrUSrc,
+  const int16_t **chrVSrc, int chrFilterSize,
+  const int16_t **alpSrc, uint8_t *dest,
+  int dstW, int y, enum AVPixelFormat target, int 
hasAlpha)
+{
+vector int16_t vv;
+vector int32_t vy32_l, vy32_r, vu32_l, vu32_r, vv32_l, vv32_r, tmp32;
+vector int32_t R_l, R_r, G_l, G_r, B_l, B_r;
+vector int32_t tmp, tmp2, tmp3, tmp4;
+vector uint16_t rd16, gd16, bd16;
+vector uint8_t rd, bd, gd, ad, out0, out1, tmp8;
+vector int16_t vlumFilter[MAX_FILTER_SIZE], vchrFilter[MAX_FILTER_SIZE];
+const vector int32_t ystart = vec_splats(1 << 9);
+const vector int32_t uvstart = vec_splats((1 << 9) - (128 << 19));
+const vector uint16_t zero16 = vec_splat_u16(0);
+const vector int32_t y_offset = vec_splats(c->yuv2rgb_y_offset);
+const vector int32_t y_coeff = vec_splats(c->yuv2rgb_y_coeff);
+const vector int32_t y_add = vec_splats(1 << 21);
+const vector int32_t v2r_coeff = vec_splats(c->yuv2rgb_v2r_coeff);
+const vector int32_t v2g_coeff = vec_splats(c->yuv2rgb_v2g_coeff);
+const vector int32_t u2g_coeff = vec_splats(c->yuv2rgb_u2g_coeff);
+const vector int32_t u2b_coeff = vec_splats(c->yuv2rgb_u2b_coeff);
+const vector int32_t rgbclip = vec_splats(1 << 30);
+const vector int32_t zero32 = vec_splat_s32(0);
+const vector uint32_t shift22 = vec_splats(22U);
+const vector uint32_t shift10 = vec_splat_u32(10);
+int i, j;
+
+// Various permutations
+const vector uint8_t perm3rg0 = (vector uint8_t) {0x0, 0x10, 0,
+  0x1, 0x11, 0,
+  0x2, 0x12, 0,
+  0x3, 0x13, 0,
+  0x4, 0x14, 0,
+  0x5 };
+const vector uint8_t perm3rg1 = (vector uint8_t) { 0x15, 0,
+  0x6, 0x16, 0,
+  0x7, 0x17, 0 };
+const vector uint8_t perm3tb0 = (vector uint8_t) {0x0, 0x1, 0x10,
+  0x3, 0x4, 0x11,
+  0x6, 0x7, 0x12,
+  0x9, 0xa, 0x13,
+  0xc, 0xd, 0x14,
+  0xf };
+const vector uint8_t perm3tb1 = (vector uint8_t) { 0x0, 0x15,
+  0x2, 0x3, 0x16,
+  0x5, 0x6, 0x17 };
+
+ad = vec_splats((uint8_t) 255);
+
+for (i = 0; i < lumFilterSize; i++)
+vlumFilter[i] = vec_splats(lumFilter[i]);
+for (i = 0; i < chrFilterSize; i++)
+vchrFilter[i] = vec_splats(chrFilter[i]);
+
+for (i = 0; i < dstW; i += 8) {
+vy32_l =
+vy32_r = ystart;
+vu32_l =
+vu32_r =
+vv32_l =
+vv32_r = uvstart;
+
+for (j = 0; j < lumFilterSize; j++) {
+vv = vec_ld(0, &lumSrc[j][i]);
+tmp = vec_mule(vv, vlumFilter[j]);
+tmp2 = vec_mulo(vv, vlumFilter[j]);
+

Re: [FFmpeg-devel] [PATCH v2] Added XV Support

2019-04-08 Thread Lauri Kasanen
On Mon, 8 Apr 2019 06:39:27 +0800
Steven Liu  wrote:

> >+.long_name  = NULL_IF_CONFIG_SMALL("Xunlie Video File"),

XV is a video output format, so please make the title something like
"flv: Add XV (Xunlie Video) support".

- Lauri
___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".

Re: [FFmpeg-devel] [PATCH 2/2] avcodec/pnm: Avoid structure pointer dereferences in inner loop in pnm_get()

2019-02-21 Thread Lauri Kasanen
On Thu, 21 Feb 2019 20:34:29 +0100
Michael Niedermayer  wrote:

> Improves speed from 5.4 to 4.2 seconds
> Fixes: 
> 13149/clusterfuzz-testcase-minimized-ffmpeg_AV_CODEC_ID_PGM_fuzzer-5760833622114304

LGTM

Though, I really would expect the compiler to detect and optimize that.
I wonder if "PNMContext * const sc" would help it any.

- Lauri
___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel


Re: [FFmpeg-devel] [PATCH 1/2] swscale/ppc: Clean up some mixed decl warnings

2019-03-18 Thread Lauri Kasanen
On Mon, 18 Mar 2019 14:06:15 +0100
Carl Eugen Hoyos  wrote:
> 
> This looks good to me if you tested it and it reduces the number of warnings.

Tested on power8. With these two patches, swscale/ppc has no warnings.

- Lauri
___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel


Re: [FFmpeg-devel] [PATCH 2/2] swscale/ppc: Add av_unused to template vars only used in one includer

2019-03-20 Thread Lauri Kasanen
On Mon, 18 Mar 2019 13:56:52 +0200
Lauri Kasanen  wrote:

> Signed-off-by: Lauri Kasanen 
> ---
>  libswscale/ppc/swscale_ppc_template.c | 21 +++--
>  1 file changed, 11 insertions(+), 10 deletions(-)

Applying these two.

- Lauri
___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel


Re: [FFmpeg-devel] [PATCH v2] swscale/ppc: VSX-optimize yuv2rgb_full

2019-03-20 Thread Lauri Kasanen
On Wed, 20 Mar 2019 16:31:57 +0100
Carl Eugen Hoyos  wrote:

> 2019-03-20 16:06 GMT+01:00, Lauri Kasanen :
> > On Wed, 20 Mar 2019 15:51:20 +0100
> > Carl Eugen Hoyos  wrote:
> >
> >> 2019-03-20 15:06 GMT+01:00, Lauri Kasanen :
> >>
> >> > +case AV_PIX_FMT_BGRA:
> >> > +if (HAVE_POWER8 && cpu_flags & AV_CPU_FLAG_POWER8) {
> >> > +if (!c->needAlpha) {
> >> > +c->yuv2packed1 = yuv2bgrx32_full_1_vsx;
> >>
> >> If only non-alpha is supported, I would have expected the
> >> exact same function to also work for AV_PIX_FMT_BGR0.
> >
> > I'll check that, and RGB0 as well.

No need for changes, it seems. The swscale internals never see those zero
pixfmts:

libswscale/utils.c:case AV_PIX_FMT_RGB0: *format =
AV_PIX_FMT_RGBA   ; return 4;

- Lauri
___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel


[FFmpeg-devel] [PATCH v2] swscale/ppc: VSX-optimize yuv2rgb_full

2019-03-20 Thread Lauri Kasanen
./ffmpeg -f lavfi -i yuvtestsrc=duration=1:size=1200x1440 \
-s 1200x1440 -f null -vframes 100 -pix_fmt $i -nostats \
-cpuflags 0 -v error -

This uses 32-bit mul, so POWER8 only.

The following output formats get about 4.5x speedup:

rgb24
  39980 UNITS in yuv2packed1,   32768 runs,  0 skips
   8774 UNITS in yuv2packed1,   32768 runs,  0 skips
bgr24
  40069 UNITS in yuv2packed1,   32768 runs,  0 skips
   8772 UNITS in yuv2packed1,   32766 runs,  2 skips
rgba
  39759 UNITS in yuv2packed1,   32768 runs,  0 skips
   8681 UNITS in yuv2packed1,   32767 runs,  1 skips
bgra
  39729 UNITS in yuv2packed1,   32768 runs,  0 skips
   8696 UNITS in yuv2packed1,   32766 runs,  2 skips
argb
  39766 UNITS in yuv2packed1,   32768 runs,  0 skips
   8672 UNITS in yuv2packed1,   32766 runs,  2 skips
bgra
  39784 UNITS in yuv2packed1,   32768 runs,  0 skips
   8659 UNITS in yuv2packed1,   32767 runs,  1 skips

Signed-off-by: Lauri Kasanen 
---
 libswscale/ppc/swscale_vsx.c | 291 +++
 1 file changed, 291 insertions(+)

v2: HAVE_POWER8 from ifdef to if

diff --git a/libswscale/ppc/swscale_vsx.c b/libswscale/ppc/swscale_vsx.c
index 01eb46c..062ab0d 100644
--- a/libswscale/ppc/swscale_vsx.c
+++ b/libswscale/ppc/swscale_vsx.c
@@ -422,6 +422,248 @@ yuv2NBPSX(16, BE, 1, 16, int32_t)
 yuv2NBPSX(16, LE, 0, 16, int32_t)
 #endif

+static av_always_inline void
+yuv2rgb_full_1_vsx_template(SwsContext *c, const int16_t *buf0,
+ const int16_t *ubuf[2], const int16_t *vbuf[2],
+ const int16_t *abuf0, uint8_t *dest, int dstW,
+ int uvalpha, int y, enum AVPixelFormat target,
+ int hasAlpha)
+{
+const int16_t *ubuf0 = ubuf[0], *vbuf0 = vbuf[0];
+const int16_t *ubuf1 = ubuf[1], *vbuf1 = vbuf[1];
+vector int16_t vy, vu, vv, A = vec_splat_s16(0), tmp16;
+vector int32_t vy32_l, vy32_r, vu32_l, vu32_r, vv32_l, vv32_r,
tmp32, tmp32_2;
+vector int32_t R_l, R_r, G_l, G_r, B_l, B_r;
+vector uint16_t rd16, gd16, bd16;
+vector uint8_t rd, bd, gd, ad, out0, out1, tmp8;
+const vector uint16_t zero16 = vec_splat_u16(0);
+const vector int32_t y_offset = vec_splats(c->yuv2rgb_y_offset);
+const vector int32_t y_coeff = vec_splats(c->yuv2rgb_y_coeff);
+const vector int32_t y_add = vec_splats(1 << 21);
+const vector int32_t v2r_coeff = vec_splats(c->yuv2rgb_v2r_coeff);
+const vector int32_t v2g_coeff = vec_splats(c->yuv2rgb_v2g_coeff);
+const vector int32_t u2g_coeff = vec_splats(c->yuv2rgb_u2g_coeff);
+const vector int32_t u2b_coeff = vec_splats(c->yuv2rgb_u2b_coeff);
+const vector int32_t rgbclip = vec_splats(1 << 30);
+const vector int32_t zero32 = vec_splat_s32(0);
+const vector uint32_t shift2 = vec_splat_u32(2);
+const vector uint32_t shift22 = vec_splats(22U);
+const vector uint16_t sub7 = vec_splats((uint16_t) (128 << 7));
+const vector uint16_t sub8 = vec_splats((uint16_t) (128 << 8));
+const vector int16_t mul4 = vec_splat_s16(4);
+const vector int16_t mul8 = vec_splat_s16(8);
+const vector int16_t add64 = vec_splat_s16(64);
+const vector uint16_t shift7 = vec_splat_u16(7);
+const vector int16_t max255 = vec_splat_s16(255);
+int i;
+
+// Various permutations
+const vector uint8_t perm3rg0 = (vector uint8_t) {0x0, 0x10, 0,
+  0x1, 0x11, 0,
+  0x2, 0x12, 0,
+  0x3, 0x13, 0,
+  0x4, 0x14, 0,
+  0x5 };
+const vector uint8_t perm3rg1 = (vector uint8_t) { 0x15, 0,
+  0x6, 0x16, 0,
+  0x7, 0x17, 0 };
+const vector uint8_t perm3tb0 = (vector uint8_t) {0x0, 0x1, 0x10,
+  0x3, 0x4, 0x11,
+  0x6, 0x7, 0x12,
+  0x9, 0xa, 0x13,
+  0xc, 0xd, 0x14,
+  0xf };
+const vector uint8_t perm3tb1 = (vector uint8_t) { 0x0, 0x15,
+  0x2, 0x3, 0x16,
+  0x5, 0x6, 0x17 };
+
+for (i = 0; i < dstW; i += 8) { // The x86 asm also overwrites
padding bytes.
+vy = vec_ld(0, &buf0[i]);
+vy32_l = vec_unpackh(vy);
+vy32_r = vec_unpackl(vy);
+vy32_l = vec_sl(vy32_l, shift2);
+vy32_r = vec_sl(vy32_r, shift2);
+
+vu = vec_ld(0, &ubuf0[i]);
+vv = vec_ld(0, &vbuf0[i]);
+if

Re: [FFmpeg-devel] [PATCH] swscale/ppc: VSX-optimize yuv2rgb_full

2019-03-20 Thread Lauri Kasanen
On Wed, 20 Mar 2019 14:41:27 +0100
Carl Eugen Hoyos  wrote:

> 2019-03-20 13:37 GMT+01:00, Lauri Kasanen :
>
> > @@ -480,5 +722,66 @@ av_cold void ff_sws_init_swscale_vsx(SwsContext *c)
>
> Are there followup patches?
> Or why is the following hunk so convoluted?

I plan to add the _2 and _X variants later. I don't know yet if they
need power8; if one doesn't, then there'd be plenty of ifdef sprinkling.

> > +if (c->flags & SWS_BITEXACT)
> > +return;
>
> > +#if !HAVE_BIGENDIAN
>
> Are you planning to add big-endian support?

No, I can only test LE.

> > +if (c->flags & SWS_FULL_CHR_H_INT) {
>
> Iiuc, the first if above and this one can be merged.

I plan to add other formats that are used without that flag.

> > +switch (dstFormat) {
> > +case AV_PIX_FMT_RGB24:
>
> > +#if HAVE_POWER8
> > +if (cpu_flags & AV_CPU_FLAG_POWER8) {
>
> if (HAVE_POWER8 && cpu_flags & AV_CPU_FLAG_POWER8)

Will do.

- Lauri
___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel


[FFmpeg-devel] [PATCH] swscale/ppc: VSX-optimize yuv2rgb_full

2019-03-20 Thread Lauri Kasanen
./ffmpeg -f lavfi -i yuvtestsrc=duration=1:size=1200x1440 \
-s 1200x1440 -f null -vframes 100 -pix_fmt $i -nostats \
-cpuflags 0 -v error -

This uses 32-bit mul, so POWER8 only.

The following output formats get about 4.5x speedup:

rgb24
  39980 UNITS in yuv2packed1,   32768 runs,  0 skips
   8774 UNITS in yuv2packed1,   32768 runs,  0 skips
bgr24
  40069 UNITS in yuv2packed1,   32768 runs,  0 skips
   8772 UNITS in yuv2packed1,   32766 runs,  2 skips
rgba
  39759 UNITS in yuv2packed1,   32768 runs,  0 skips
   8681 UNITS in yuv2packed1,   32767 runs,  1 skips
bgra
  39729 UNITS in yuv2packed1,   32768 runs,  0 skips
   8696 UNITS in yuv2packed1,   32766 runs,  2 skips
argb
  39766 UNITS in yuv2packed1,   32768 runs,  0 skips
   8672 UNITS in yuv2packed1,   32766 runs,  2 skips
bgra
  39784 UNITS in yuv2packed1,   32768 runs,  0 skips
   8659 UNITS in yuv2packed1,   32767 runs,  1 skips

Signed-off-by: Lauri Kasanen 
---
 libswscale/ppc/swscale_vsx.c | 303 +++
 1 file changed, 303 insertions(+)

diff --git a/libswscale/ppc/swscale_vsx.c b/libswscale/ppc/swscale_vsx.c
index 01eb46c..f20c11e 100644
--- a/libswscale/ppc/swscale_vsx.c
+++ b/libswscale/ppc/swscale_vsx.c
@@ -422,6 +422,248 @@ yuv2NBPSX(16, BE, 1, 16, int32_t)
 yuv2NBPSX(16, LE, 0, 16, int32_t)
 #endif

+static av_always_inline void
+yuv2rgb_full_1_vsx_template(SwsContext *c, const int16_t *buf0,
+ const int16_t *ubuf[2], const int16_t *vbuf[2],
+ const int16_t *abuf0, uint8_t *dest, int dstW,
+ int uvalpha, int y, enum AVPixelFormat target,
+ int hasAlpha)
+{
+const int16_t *ubuf0 = ubuf[0], *vbuf0 = vbuf[0];
+const int16_t *ubuf1 = ubuf[1], *vbuf1 = vbuf[1];
+vector int16_t vy, vu, vv, A = vec_splat_s16(0), tmp16;
+vector int32_t vy32_l, vy32_r, vu32_l, vu32_r, vv32_l, vv32_r, tmp32, 
tmp32_2;
+vector int32_t R_l, R_r, G_l, G_r, B_l, B_r;
+vector uint16_t rd16, gd16, bd16;
+vector uint8_t rd, bd, gd, ad, out0, out1, tmp8;
+const vector uint16_t zero16 = vec_splat_u16(0);
+const vector int32_t y_offset = vec_splats(c->yuv2rgb_y_offset);
+const vector int32_t y_coeff = vec_splats(c->yuv2rgb_y_coeff);
+const vector int32_t y_add = vec_splats(1 << 21);
+const vector int32_t v2r_coeff = vec_splats(c->yuv2rgb_v2r_coeff);
+const vector int32_t v2g_coeff = vec_splats(c->yuv2rgb_v2g_coeff);
+const vector int32_t u2g_coeff = vec_splats(c->yuv2rgb_u2g_coeff);
+const vector int32_t u2b_coeff = vec_splats(c->yuv2rgb_u2b_coeff);
+const vector int32_t rgbclip = vec_splats(1 << 30);
+const vector int32_t zero32 = vec_splat_s32(0);
+const vector uint32_t shift2 = vec_splat_u32(2);
+const vector uint32_t shift22 = vec_splats(22U);
+const vector uint16_t sub7 = vec_splats((uint16_t) (128 << 7));
+const vector uint16_t sub8 = vec_splats((uint16_t) (128 << 8));
+const vector int16_t mul4 = vec_splat_s16(4);
+const vector int16_t mul8 = vec_splat_s16(8);
+const vector int16_t add64 = vec_splat_s16(64);
+const vector uint16_t shift7 = vec_splat_u16(7);
+const vector int16_t max255 = vec_splat_s16(255);
+int i;
+
+// Various permutations
+const vector uint8_t perm3rg0 = (vector uint8_t) {0x0, 0x10, 0,
+  0x1, 0x11, 0,
+  0x2, 0x12, 0,
+  0x3, 0x13, 0,
+  0x4, 0x14, 0,
+  0x5 };
+const vector uint8_t perm3rg1 = (vector uint8_t) { 0x15, 0,
+  0x6, 0x16, 0,
+  0x7, 0x17, 0 };
+const vector uint8_t perm3tb0 = (vector uint8_t) {0x0, 0x1, 0x10,
+  0x3, 0x4, 0x11,
+  0x6, 0x7, 0x12,
+  0x9, 0xa, 0x13,
+  0xc, 0xd, 0x14,
+  0xf };
+const vector uint8_t perm3tb1 = (vector uint8_t) { 0x0, 0x15,
+  0x2, 0x3, 0x16,
+  0x5, 0x6, 0x17 };
+
+for (i = 0; i < dstW; i += 8) { // The x86 asm also overwrites padding 
bytes.
+vy = vec_ld(0, &buf0[i]);
+vy32_l = vec_unpackh(vy);
+vy32_r = vec_unpackl(vy);
+vy32_l = vec_sl(vy32_l, shift2);
+vy32_r = vec_sl(vy32_r, shift2);
+
+vu = vec_ld(0, &ubuf0[i]);
+vv = vec_ld(0, &vbuf0[i]);
+if (uvalpha < 2048) {
+ 

Re: [FFmpeg-devel] [PATCH] avcodec/tiff: Add support for recognizing DNG files

2019-03-18 Thread Lauri Kasanen
On Mon, 18 Mar 2019 09:13:01 +0100
Moritz Barsnick  wrote:

> On Sun, Mar 17, 2019 at 23:05:01 +0100, Paul B Mahol wrote:
> > Still wrong, You can decode images you linked just fine (albeit with
> > incorrect colors) with command:
> > 
> > ffmpeg -subimage 1 -i IMAGE.dng rest of command.
> 
> Shouldn't, ideally, these image files be demuxed as two image streams?
> Perhaps with the "main" image as the first stream.

The DNG spec is pretty massive, and there's a huge amount of
variations. There can easily be far more than two streams, there could
be several "main" images and several previews in different sizes. Their
order can vary too, it's not always the thumbnail first; thumbnails can
also be omitted entirely. There's also several different
encodings/compression types for the "main" images.

I've used their libdng for a project. It's a big LGPL library
implementing pretty much everything, but no distro really ships it, so
it'd have to be embedded or built manually by the user.

- Lauri
___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel


[FFmpeg-devel] [PATCH 2/2] swscale/ppc: Add av_unused to template vars only used in one includer

2019-03-18 Thread Lauri Kasanen
Signed-off-by: Lauri Kasanen 
---
 libswscale/ppc/swscale_ppc_template.c | 21 +++--
 1 file changed, 11 insertions(+), 10 deletions(-)

diff --git a/libswscale/ppc/swscale_ppc_template.c 
b/libswscale/ppc/swscale_ppc_template.c
index 3964a7a..aff2dd7 100644
--- a/libswscale/ppc/swscale_ppc_template.c
+++ b/libswscale/ppc/swscale_ppc_template.c
@@ -44,7 +44,7 @@ static void FUNC(yuv2planeX_8_16)(const int16_t *filter, int 
filterSize,
 for (j = 0; j < filterSize; j++) {
 unsigned int joffset=j<<1;
 unsigned int xoffset=x<<1;
-vector unsigned char perm;
+vector unsigned char av_unused perm;
 vector signed short l1,vLumFilter;
 LOAD_FILTER(vLumFilter,filter);
 vLumFilter = vec_splat(vLumFilter, 0);
@@ -133,8 +133,8 @@ static void FUNC(hScale_real)(SwsContext *c, int16_t *dst, 
int dstW,
 case 8:
 for (i = 0; i < dstW; i++) {
 register int srcPos = filterPos[i];
-vector unsigned char src_vF, src_v0, src_v1;
-vector unsigned char permS;
+vector unsigned char src_vF, av_unused src_v0, av_unused 
src_v1;
+vector unsigned char av_unused permS;
 vector signed short src_v, filter_v;
 vector signed int val_v, val_s;
 FIRST_LOAD(src_v0, srcPos, src, permS);
@@ -173,18 +173,19 @@ static void FUNC(hScale_real)(SwsContext *c, int16_t 
*dst, int dstW,
 
 default:
 for (i = 0; i < dstW; i++) {
-register int j, offset = i * 2 * filterSize;
+register int j, av_unused offset = i * 2 * filterSize;
 register int srcPos = filterPos[i];
 
 vector signed int val_s, val_v = (vector signed int)vzero;
-vector signed short filter_v0R;
-vector unsigned char permF, src_v0, permS;
+vector signed short av_unused filter_v0R;
+vector unsigned char av_unused permF, av_unused src_v0, 
av_unused permS;
 FIRST_LOAD(filter_v0R, offset, filter, permF);
 FIRST_LOAD(src_v0, srcPos, src, permS);
 
 for (j = 0; j < filterSize - 15; j += 16) {
-vector unsigned char src_v1, src_vF;
-vector signed short filter_v1R, filter_v2R, filter_v0, 
filter_v1, src_vA, src_vB;
+vector unsigned char av_unused src_v1, src_vF;
+vector signed short av_unused filter_v1R, av_unused 
filter_v2R,
+filter_v0, filter_v1, src_vA, src_vB;
 vector signed int val_acc;
 LOAD_SRCV(srcPos, j, src, permS, src_v0, src_v1, src_vF);
 src_vA = // vec_unpackh sign-extends...
@@ -201,8 +202,8 @@ static void FUNC(hScale_real)(SwsContext *c, int16_t *dst, 
int dstW,
 
 if (j < filterSize - 7) {
 // loading src_v0 is useless, it's already done above
-vector unsigned char src_v1, src_vF;
-vector signed short src_v, filter_v1R, filter_v;
+vector unsigned char av_unused src_v1, src_vF;
+vector signed short src_v, av_unused filter_v1R, filter_v;
 LOAD_SRCV8(srcPos, j, src, permS, src_v0, src_v1, src_vF);
 src_v = // vec_unpackh sign-extends...
 (vector signed short)(VEC_MERGEH((vector unsigned 
char)vzero, src_vF));
-- 
2.6.2

___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel


[FFmpeg-devel] [PATCH 1/2] swscale/ppc: Clean up some mixed decl warnings

2019-03-18 Thread Lauri Kasanen
Signed-off-by: Lauri Kasanen 
---
 libswscale/ppc/swscale_altivec.c  | 6 +++---
 libswscale/ppc/swscale_ppc_template.c | 9 +
 libswscale/ppc/swscale_vsx.c  | 6 +++---
 3 files changed, 11 insertions(+), 10 deletions(-)

diff --git a/libswscale/ppc/swscale_altivec.c b/libswscale/ppc/swscale_altivec.c
index d72ed1e..3cd9782 100644
--- a/libswscale/ppc/swscale_altivec.c
+++ b/libswscale/ppc/swscale_altivec.c
@@ -43,10 +43,10 @@
 
 #define yuv2planeX_8(d1, d2, l1, src, x, perm, filter) do {\
 vector signed short ls;\
+vector signed int   vf1, vf2, i1, i2;\
 GET_LS(l1, x, perm, src);\
-vector signed int   i1  = vec_mule(filter, ls);\
-vector signed int   i2  = vec_mulo(filter, ls);\
-vector signed int   vf1, vf2;\
+i1  = vec_mule(filter, ls);\
+i2  = vec_mulo(filter, ls);\
 vf1 = vec_mergeh(i1, i2);\
 vf2 = vec_mergel(i1, i2);\
 d1 = vec_add(d1, vf1);\
diff --git a/libswscale/ppc/swscale_ppc_template.c 
b/libswscale/ppc/swscale_ppc_template.c
index 11decab..3964a7a 100644
--- a/libswscale/ppc/swscale_ppc_template.c
+++ b/libswscale/ppc/swscale_ppc_template.c
@@ -184,16 +184,17 @@ static void FUNC(hScale_real)(SwsContext *c, int16_t 
*dst, int dstW,
 
 for (j = 0; j < filterSize - 15; j += 16) {
 vector unsigned char src_v1, src_vF;
-vector signed short filter_v1R, filter_v2R, filter_v0, 
filter_v1;
+vector signed short filter_v1R, filter_v2R, filter_v0, 
filter_v1, src_vA, src_vB;
+vector signed int val_acc;
 LOAD_SRCV(srcPos, j, src, permS, src_v0, src_v1, src_vF);
-vector signed short src_vA = // vec_unpackh sign-extends...
+src_vA = // vec_unpackh sign-extends...
  (vector signed 
short)(VEC_MERGEH((vector unsigned char)vzero, src_vF));
-vector signed short src_vB = // vec_unpackh sign-extends...
+src_vB = // vec_unpackh sign-extends...
  (vector signed 
short)(VEC_MERGEL((vector unsigned char)vzero, src_vF));
 GET_VFD(i, j, filter, filter_v0R, filter_v1R, permF, 
filter_v0, 0);
 GET_VFD(i, j, filter, filter_v1R, filter_v2R, permF, 
filter_v1, 16);
 
-vector signed int val_acc = vec_msums(src_vA, filter_v0, 
val_v);
+val_acc = vec_msums(src_vA, filter_v0, val_v);
 val_v = vec_msums(src_vB, filter_v1, val_acc);
 UPDATE_PTR(filter_v2R, filter_v0R, src_v1, src_v0);
 }
diff --git a/libswscale/ppc/swscale_vsx.c b/libswscale/ppc/swscale_vsx.c
index f6c7f1d..01eb46c 100644
--- a/libswscale/ppc/swscale_vsx.c
+++ b/libswscale/ppc/swscale_vsx.c
@@ -42,10 +42,10 @@
 
 #define yuv2planeX_8(d1, d2, l1, src, x, perm, filter) do {\
 vector signed short ls;\
+vector signed int   vf1, vf2, i1, i2;\
 GET_LS(l1, x, perm, src);\
-vector signed int   i1  = vec_mule(filter, ls);\
-vector signed int   i2  = vec_mulo(filter, ls);\
-vector signed int   vf1, vf2;\
+i1  = vec_mule(filter, ls);\
+i2  = vec_mulo(filter, ls);\
 vf1 = vec_mergeh(i1, i2);\
 vf2 = vec_mergel(i1, i2);\
 d1 = vec_add(d1, vf1);\
-- 
2.6.2

___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel


Re: [FFmpeg-devel] [PATCH v6] libswscale/ppc: VSX-optimize 9-16 bit yuv2planeX

2019-02-04 Thread Lauri Kasanen
On Sun, 13 Jan 2019 10:26:20 +0200
Lauri Kasanen  wrote:

> ./ffmpeg_g -f rawvideo -pix_fmt rgb24 -s hd1080 -i /dev/zero -pix_fmt 
> yuv420p16be \
> -s 1920x1728 -f null -vframes 100 -v error -nostats -
> 
> 9-14 bit funcs get about 6x speedup, 16-bit gets about 15x.
> Fate passes, each format tested with an image to video conversion.
> 
> Only POWER8 includes 32-bit vector multiplies, so POWER7 is locked out
> of the 16-bit function. This includes the vec_mulo/mule functions too,
> not just vmuluwm.

Applying.

- Lauri
___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
http://ffmpeg.org/mailman/listinfo/ffmpeg-devel


Re: [FFmpeg-devel] [PATCH] avutil/ppc/cpu: Fix power8 linux detection

2019-02-04 Thread Lauri Kasanen
On Tue, 8 Jan 2019 11:08:04 +0200
Lauri Kasanen  wrote:

> The existing code was in no released kernel that I can see. The corrected code
> was added in 3.9.
> 
> Signed-off-by: Lauri Kasanen 
> ---
>  libavutil/ppc/cpu.c | 10 +-
>  1 file changed, 5 insertions(+), 5 deletions(-)

Applying.

- Lauri
___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
http://ffmpeg.org/mailman/listinfo/ffmpeg-devel


Re: [FFmpeg-devel] [PATCH] swscale/ppc: VSX-optimize non-full-chroma yuv2rgb_2

2019-04-11 Thread Lauri Kasanen
On Fri, 5 Apr 2019 11:41:19 +0300
Lauri Kasanen  wrote:

> ./ffmpeg -f lavfi -i yuvtestsrc=duration=1:size=1200x1440 -sws_flags 
> fast_bilinear \
> -s 1200x720 -f null -vframes 100 -pix_fmt $i -nostats \
> -cpuflags 0 -v error -
>
> 32-bit mul, power8 only.
>
> ~2x speedup:
>
> rgb24
>   24431 UNITS in yuv2packed2,   16384 runs,  0 skips
>   13783 UNITS in yuv2packed2,   16383 runs,  1 skips
> bgr24
>   24396 UNITS in yuv2packed2,   16384 runs,  0 skips
>   14059 UNITS in yuv2packed2,   16384 runs,  0 skips
> rgba
>   26815 UNITS in yuv2packed2,   16383 runs,  1 skips
>   12797 UNITS in yuv2packed2,   16383 runs,  1 skips
> bgra
>   27060 UNITS in yuv2packed2,   16384 runs,  0 skips
>   13138 UNITS in yuv2packed2,   16384 runs,  0 skips
> argb
>   26998 UNITS in yuv2packed2,   16384 runs,  0 skips
>   12728 UNITS in yuv2packed2,   16381 runs,  3 skips
> abgr
>   26651 UNITS in yuv2packed2,   16384 runs,  0 skips
>   13124 UNITS in yuv2packed2,   16384 runs,  0 skips
>
> This is a low speedup, but the x86 mmx version also gets only ~2x. The mmx 
> version
> is also heavily inaccurate, while the vsx version has high accuracy.
>
> Signed-off-by: Lauri Kasanen 
> ---
>  libswscale/ppc/swscale_vsx.c | 188 
> +++
>  1 file changed, 188 insertions(+)

Applying.

- Lauri
___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".

Re: [FFmpeg-devel] [PATCH] swscale/ppc: VSX-optimize yuv2rgb_full_X

2019-04-07 Thread Lauri Kasanen
On Mon, 1 Apr 2019 13:37:32 +0300
Lauri Kasanen  wrote:

> ./ffmpeg -f lavfi -i yuvtestsrc=duration=1:size=1200x1440 \
> -s 1200x720 -f null -vframes 100 -pix_fmt $i -nostats \
> -cpuflags 0 -v error -
>
> 32-bit mul, power8 only.
>
> ~6.4x speedup:
>
> rgb24
>  214278 UNITS in yuv2packedX,   16384 runs,  0 skips
>   33249 UNITS in yuv2packedX,   16384 runs,  0 skips
> bgr24
>  214616 UNITS in yuv2packedX,   16384 runs,  0 skips
>   33233 UNITS in yuv2packedX,   16384 runs,  0 skips
> rgba
>  214517 UNITS in yuv2packedX,   16384 runs,  0 skips
>   33271 UNITS in yuv2packedX,   16384 runs,  0 skips
> bgra
>  214973 UNITS in yuv2packedX,   16384 runs,  0 skips
>   33397 UNITS in yuv2packedX,   16384 runs,  0 skips
> argb
>  214613 UNITS in yuv2packedX,   16384 runs,  0 skips
>   33310 UNITS in yuv2packedX,   16384 runs,  0 skips
> abgr
>  214637 UNITS in yuv2packedX,   16384 runs,  0 skips
>   33330 UNITS in yuv2packedX,   16384 runs,  0 skips
>
> Signed-off-by: Lauri Kasanen 
> ---
>  libswscale/ppc/swscale_vsx.c | 160 
> +++
>  1 file changed, 160 insertions(+)

Applying.

- Lauri
___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".

Re: [FFmpeg-devel] [PATCH] swscale/ppc: VSX-optimize yuv2rgb_full_2

2019-04-07 Thread Lauri Kasanen
On Mon, 1 Apr 2019 13:13:59 +0300
Lauri Kasanen  wrote:

> ./ffmpeg -f lavfi -i yuvtestsrc=duration=1:size=1200x1440 -sws_flags area \
> -s 1200x720 -f null -vframes 100 -pix_fmt $i -nostats \
> -cpuflags 0 -v error -
>
> 32-bit mul, power8 only.
>
> ~4x speedup:
>
> rgb24
>   52763 UNITS in yuv2packed2,   16384 runs,  0 skips
>   13453 UNITS in yuv2packed2,   16384 runs,  0 skips
> bgr24
>   53144 UNITS in yuv2packed2,   16384 runs,  0 skips
>   13616 UNITS in yuv2packed2,   16384 runs,  0 skips
> rgba
>   52796 UNITS in yuv2packed2,   16384 runs,  0 skips
>   12904 UNITS in yuv2packed2,   16384 runs,  0 skips
> bgra
>   52732 UNITS in yuv2packed2,   16384 runs,  0 skips
>   13262 UNITS in yuv2packed2,   16384 runs,  0 skips
> argb
>   52661 UNITS in yuv2packed2,   16384 runs,  0 skips
>   12879 UNITS in yuv2packed2,   16384 runs,  0 skips
> abgr
>   52662 UNITS in yuv2packed2,   16384 runs,  0 skips
>   12932 UNITS in yuv2packed2,   16384 runs,  0 skips
>
> Signed-off-by: Lauri Kasanen 
> ---
>  libswscale/ppc/swscale_vsx.c | 166 
> +++
>  1 file changed, 166 insertions(+)

Applying.

- Lauri
___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".

Re: [FFmpeg-devel] [PATCH] swscale/ppc: VSX-optimize non-full-chroma yuv2rgb_1

2019-04-07 Thread Lauri Kasanen
On Sun, 31 Mar 2019 17:18:47 +0300
Lauri Kasanen  wrote:

> ./ffmpeg -f lavfi -i yuvtestsrc=duration=1:size=1200x1440 -sws_flags 
> fast_bilinear \
> -s 1200x1440 -f null -vframes 100 -pix_fmt $i -nostats \
> -cpuflags 0 -v error -
>
> 32-bit mul, power8 only.
>
> 1.8-2.3x speedup:
>
> rgb24
>   18192 UNITS in yuv2packed1,   32767 runs,  1 skips
>9983 UNITS in yuv2packed1,   32760 runs,  8 skips
> bgr24
>   18665 UNITS in yuv2packed1,   32766 runs,  2 skips
>9925 UNITS in yuv2packed1,   32763 runs,  5 skips
> rgba
>   20239 UNITS in yuv2packed1,   32767 runs,  1 skips
>8794 UNITS in yuv2packed1,   32759 runs,  9 skips
> bgra
>   20354 UNITS in yuv2packed1,   32768 runs,  0 skips
>8770 UNITS in yuv2packed1,   32761 runs,  7 skips
> argb
>   20185 UNITS in yuv2packed1,   32768 runs,  0 skips
>8761 UNITS in yuv2packed1,   32761 runs,  7 skips
> abgr
>   20360 UNITS in yuv2packed1,   32766 runs,  2 skips
>8759 UNITS in yuv2packed1,   32764 runs,  4 skips
>
> This is a low speedup, but the x86 mmx version also gets only ~2x. The mmx 
> version
> is also heavily inaccurate, while the vsx version has high accuracy.

Applying.

- Lauri
___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".

Re: [FFmpeg-devel] [PATCH]lavc/alac: Make a variable unsigned

2019-04-18 Thread Lauri Kasanen
On Thu, 18 Apr 2019 13:53:37 +0200
Carl Eugen Hoyos  wrote:

> Hi!
>
> Attached patch silences a warning that is shown with some gcc versions.

It pokes my style sense to have different things in the sizeof() and
the var. How about uint32_t in both?

- Lauri
___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".

Re: [FFmpeg-devel] [PATCH]lavc/alac: Make a variable unsigned

2019-04-18 Thread Lauri Kasanen
On Thu, 18 Apr 2019 15:07:03 +0200
Hendrik Leppkes  wrote:

> On Thu, Apr 18, 2019 at 2:54 PM Lauri Kasanen  wrote:
> >
> > On Thu, 18 Apr 2019 13:53:37 +0200
> > Carl Eugen Hoyos  wrote:
> >
> > > Hi!
> > >
> > > Attached patch silences a warning that is shown with some gcc versions.
> >
> > It pokes my style sense to have different things in the sizeof() and
> > the var. How about uint32_t in both?
> >
>
> Those two things are entirely unrelated types, though.

Indeed, my bad. Please ignore.

- Lauri
___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".

Re: [FFmpeg-devel] yuv420_bgr24_mmxext conversion taking significant time

2019-06-08 Thread Lauri Kasanen
On Fri, 7 Jun 2019 08:38:35 -0700
Adrian Tong  wrote:

> Hi
>
> I have a workload which spends a significant amount of time (~10%) in
> the yuv420_bgr24_mmxext function in FFmpeg.
>
> I looked at the assembly and profile and see MMX (64 bit) registers are
> used. I wonder whether we can have a SSE2 version which has a register bit
> width of 128.
>
> I am very interested in implementing such support if it is possible.

I'm not well versed in x86 vectors, so I can't say if SSE2 is enough or
some other SSE version would be needed, but certainly YUV to RGB
conversion can be done faster than with MMX. Please do send a patch.

- Lauri
___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".

Re: [FFmpeg-devel] yuv420_bgr24_mmxext conversion taking significant time

2019-06-08 Thread Lauri Kasanen
On Sat, 8 Jun 2019 06:51:51 -0700
Adrian Tong  wrote:

> Hi Lauri.
>
> Thanks for the reply, any reason why this has not been implemented before ?
> it seems to me that this would be a pretty important/hot function.

Just the usual, nobody has had the interest. There are other places too
where the only x86 accel is mmx.

- Lauri
___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".

Re: [FFmpeg-devel] yuv420_bgr24_mmxext conversion taking significant time

2019-06-11 Thread Lauri Kasanen
On Mon, 10 Jun 2019 17:42:00 -0700
Adrian Tong  wrote:

> I have been trying to implement yuv420_to_bgr24 using SSE2 instruction. I
> ran into the case where the output of C implemented yuv420_to_bgr24 has
> slightly different resulting bgr24 image from MMX implemented
> yuv420_to_bgr24. Is this expected behavior ?

Yes, some of the MMX implementations choose speed over accuracy; I ran
into that myself when doing the PPC versions. For an SSE version, if an
accurate version is fast enough, please try to match the C version.
Otherwise try to match MMX.

- Lauri
___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".

Re: [FFmpeg-devel] [PATCH] swscale: Add support for NV24 and NV42

2019-05-10 Thread Lauri Kasanen
On Thu,  9 May 2019 22:59:12 -0700
Philip Langdale  wrote:

> I don't think this is terribly useful, as the only thing out there that
> can even handle NV24 content is VDPAU and the only time you have to
> deal with it is when doing VDPAU OpenGL interop where swscale is
> irrelevant. In the other cases you can use YV24 (YUV444P).
>
> But anyway, I was asked to do this for the sake of completeness.
>
> The implementation is pretty straight-forward. Most of the existing
> NV12 codepaths work regardless of subsampling and are re-used as is.
> Where necessary I wrote the slightly different NV24 versions.
>
> Finally, the one thing that confused me for a long time was the
> asm specific x86 path that did an explicit exclusion check for NV12.
> I replaced that with a semi-planar check and also updated the
> equivalent PPC code, but which I cannot test.

I'm having trouble making out what formats exactly isSemiPlanarYUV()
matches. Are you sure it's an equivalent check?

- Lauri
___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".

Re: [FFmpeg-devel] [PATCH] swscale: Add support for NV24 and NV42

2019-05-10 Thread Lauri Kasanen
On Fri, 10 May 2019 08:07:45 -0700
Philip Langdale  wrote:

> On Fri, 10 May 2019 09:35:40 +0300
> Lauri Kasanen  wrote:
>
> >
> > I'm having trouble making out what formats exactly isSemiPlanarYUV()
> > matches. Are you sure it's an equivalent check?
> >
>
> Well, the check's been in there for quite a while; that's not new.
>
> (isPlanarYUV(pix_fmt) && desc->comp[1].plane == desc->comp[2].plane);
>
> So, any planar yuv format where component 1 and component 2 are on the
> same plane. Except for semi planar formats, you expect either all
> components on the same plane (packed, so not planar) or every component
> on a separate plane (normal planar).

Yes, I understand that. I mean: can you list all formats that function
matches?

- Lauri
___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".

Re: [FFmpeg-devel] [PATCH] swscale: Add support for NV24 and NV42

2019-05-10 Thread Lauri Kasanen
On Fri, 10 May 2019 10:08:57 -0700
Philip Langdale  wrote:

> On 2019-05-10 08:12, Lauri Kasanen wrote:
> > On Fri, 10 May 2019 08:07:45 -0700
> > Philip Langdale  wrote:
> >
> >> On Fri, 10 May 2019 09:35:40 +0300
> >> Lauri Kasanen  wrote:
> >>
> >> >
> >> > I'm having trouble making out what formats exactly isSemiPlanarYUV()
> >> > matches. Are you sure it's an equivalent check?
> >> >
> >>
> >> Well, the check's been in there for quite a while; that's not new.
> >>
> >> (isPlanarYUV(pix_fmt) && desc->comp[1].plane == desc->comp[2].plane);
> >>
> >> So, any planar yuv format where component 1 and component 2 are on the
> >> same plane. Except for semi planar formats, you expect either all
> >> components on the same plane (packed, so not planar) or every
> >> component
> >> on a separate plane (normal planar).
> >
> > Yes, I understand that. I mean: can you list all formats that function
> > matches?
>
> For formats that swscale understands:
>
> NV12, NV21
> P010(BE|LE)
> P016(BE|LE)
>
> and now NV24, NV42.
>
> There are also NV16 and NV20(BE|LE) formats which are not supported by
> swscale.

Thanks. Then the ppc part looks ok to me. Please include that list in
the commit message too.

- Lauri
___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".

Re: [FFmpeg-devel] [PATCH 1/4] swscale/ppc: VSX-optimize hScale8To19_vsx

2019-05-01 Thread Lauri Kasanen
Copy-paste thinko in the title I see. Will remove the _vsx suffix from
the title.

- Lauri
___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".

Re: [FFmpeg-devel] [PATCH V5 1/2] configure: sort decoder/encoder/filter/... names in alphabet order

2019-05-01 Thread Lauri Kasanen
On Wed, 1 May 2019 22:57:47 +0200
Carl Eugen Hoyos  wrote:

> 2019-04-28 3:18 GMT+02:00, Alexander Strasser :
>
> > What do you think about using awk instead of shell?
>
> Do we only use awk for --enable-random and the dependency
> files so far? Does configure also work without awk now and
> would this change?

It seems awk is unconditionally required already. However I wanted to
say that it's a very nice dep to have: easy to build, present almost
everywhere, even in busybox. Nothing like perl/tcl or worse,
python/java/rust/go.

- Lauri
___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".

[FFmpeg-devel] [PATCH 1/4] swscale/ppc: VSX-optimize hScale8To19_vsx

2019-04-30 Thread Lauri Kasanen
./ffmpeg -f lavfi -i yuvtestsrc=duration=1:size=1200x1440 \
-s 2400x720 -f rawvideo -y -vframes 5 -pix_fmt yuv420p16le -nostats test.raw

2.26 speedup (x86 SSE2 is 2.32):
  23772 UNITS in hscale,4096 runs,  0 skips
  53862 UNITS in hscale,4096 runs,  0 skips

Signed-off-by: Lauri Kasanen 
---
 libswscale/ppc/swscale_vsx.c | 64 +++-
 1 file changed, 63 insertions(+), 1 deletion(-)

diff --git a/libswscale/ppc/swscale_vsx.c b/libswscale/ppc/swscale_vsx.c
index 2e20ab3..a82cf95 100644
--- a/libswscale/ppc/swscale_vsx.c
+++ b/libswscale/ppc/swscale_vsx.c
@@ -1853,6 +1853,64 @@ static void hcscale_fast_vsx(SwsContext *c, int16_t *dst1, int16_t *dst2,

 #undef HCSCALE

+static void hScale8To19_vsx(SwsContext *c, int16_t *_dst, int dstW,
+const uint8_t *src, const int16_t *filter,
+const int32_t *filterPos, int filterSize)
+{
+int i, j;
+int32_t *dst = (int32_t *) _dst;
+vector int16_t vfilter, vin;
+vector uint8_t vin8;
+vector int32_t vout;
+const vector uint8_t vzero = vec_splat_u8(0);
+const vector uint8_t vunusedtab[8] = {
+(vector uint8_t) {0x0, 0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7,
+  0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf},
+(vector uint8_t) {0x0, 0x1, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10,
+  0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10},
+(vector uint8_t) {0x0, 0x1, 0x2, 0x3, 0x10, 0x10, 0x10, 0x10,
+  0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10},
+(vector uint8_t) {0x0, 0x1, 0x2, 0x3, 0x4, 0x5, 0x10, 0x10,
+  0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10},
+(vector uint8_t) {0x0, 0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7,
+  0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10},
+(vector uint8_t) {0x0, 0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7,
+  0x8, 0x9, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10},
+(vector uint8_t) {0x0, 0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7,
+  0x8, 0x9, 0xa, 0xb, 0x10, 0x10, 0x10, 0x10},
+(vector uint8_t) {0x0, 0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7,
+  0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0x10, 0x10},
+};
+const vector uint8_t vunused = vunusedtab[filterSize % 8];
+
+if (filterSize == 1) {
+for (i = 0; i < dstW; i++) {
+int srcPos = filterPos[i];
+int val= 0;
+for (j = 0; j < filterSize; j++) {
+val += ((int)src[srcPos + j]) * filter[filterSize * i + j];
+}
+dst[i] = FFMIN(val >> 3, (1 << 19) - 1); // the cubic equation does overflow ...
+}
+} else {
+for (i = 0; i < dstW; i++) {
+const int srcPos = filterPos[i];
+vout = vec_splat_s32(0);
+for (j = 0; j < filterSize; j += 8) {
+vin8 = vec_vsx_ld(0, &src[srcPos + j]);
+vin = (vector int16_t) vec_mergeh(vin8, vzero);
+if (j + 8 > filterSize) // Remove the unused elements on the last round
+vin = vec_perm(vin, (vector int16_t) vzero, vunused);
+
+vfilter = vec_vsx_ld(0, &filter[filterSize * i + j]);
+vout = vec_msums(vin, vfilter, vout);
+}
+vout = vec_sums(vout, (vector int32_t) vzero);
+dst[i] = FFMIN(vout[3] >> 3, (1 << 19) - 1);
+}
+}
+}
+
 #endif /* !HAVE_BIGENDIAN */

 #endif /* HAVE_VSX */
@@ -1867,12 +1925,16 @@ av_cold void ff_sws_init_swscale_vsx(SwsContext *c)
 return;

 #if !HAVE_BIGENDIAN
-if (c->srcBpc == 8 && c->dstBpc <= 14) {
+if (c->srcBpc == 8) {
+if (c->dstBpc <= 14) {
 c->hyScale = c->hcScale = hScale_real_vsx;
 if (c->flags & SWS_FAST_BILINEAR && c->dstW >= c->srcW && c->chrDstW >= c->chrSrcW) {
 c->hyscale_fast = hyscale_fast_vsx;
 c->hcscale_fast = hcscale_fast_vsx;
 }
+} else {
+c->hyScale = c->hcScale = hScale8To19_vsx;
+}
 }
 if (!is16BPS(dstFormat) && !isNBPS(dstFormat) &&
 dstFormat != AV_PIX_FMT_NV12 && dstFormat != AV_PIX_FMT_NV21 &&
--
2.6.2

___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".

  1   2   >