On Tue, Jan 08, 2019 at 11:11:56AM +0200, Lauri Kasanen wrote:
> ./ffmpeg_g -f rawvideo -pix_fmt rgb24 -s hd1080 -i /dev/zero -pix_fmt yuv420p16be \
> -s 1920x1728 -f null -vframes 100 -v error -nostats -
> 
> 9-14 bit funcs get about 6x speedup, 16-bit gets about 15x.
> Fate passes, each format tested with an image to video conversion.
> 
> Only POWER8 includes 32-bit vector multiplies, so POWER7 is locked out
> of the 16-bit function. This includes the vec_mulo/mule functions too,
> not just vmuluwm.
> 
> yuv420p9le
>   12341 UNITS in planarX,  130976 runs,     96 skips
>   73752 UNITS in planarX,  131066 runs,      6 skips
> yuv420p9be
>   12364 UNITS in planarX,  131025 runs,     47 skips
>   73001 UNITS in planarX,  131055 runs,     17 skips
> yuv420p10le
>   12386 UNITS in planarX,  131042 runs,     30 skips
>   72735 UNITS in planarX,  131062 runs,     10 skips
> yuv420p10be
>   12337 UNITS in planarX,  131045 runs,     27 skips
>   72734 UNITS in planarX,  131057 runs,     15 skips
> yuv420p12le
>   12236 UNITS in planarX,  131058 runs,     14 skips
>   73029 UNITS in planarX,  131062 runs,     10 skips
> yuv420p12be
>   12218 UNITS in planarX,  130973 runs,     99 skips
>   72402 UNITS in planarX,  131069 runs,      3 skips
> yuv420p14le
>   12168 UNITS in planarX,  131067 runs,      5 skips
>   72480 UNITS in planarX,  131069 runs,      3 skips
> yuv420p14be
>   12358 UNITS in planarX,  130948 runs,    124 skips
>   73772 UNITS in planarX,  131063 runs,      9 skips
> yuv420p16le
>   10439 UNITS in planarX,  130911 runs,    161 skips
>  157923 UNITS in planarX,  131068 runs,      4 skips
> yuv420p16be
>   10463 UNITS in planarX,  130874 runs,    198 skips
>  154405 UNITS in planarX,  131061 runs,     11 skips
> 
> Signed-off-by: Lauri Kasanen <c...@gmx.com>
> ---
> 
> v2: Separate macros so that yuv2plane1_16_vsx remains available for power7
> v3: Remove accidental tabs, switch to HAVE_POWER8 from configure + runtime check
> 
> As far as I can tell, for HAVE_POWER8 to be defined, -march has to be at least
> power8, meaning with the current setup such a binary wouldn't run on POWER7.
> However, using the configure define lets it be disabled in configure, as Michael
> pointed out, and having the runtime check doesn't hurt (it allows for future
> splits like on x86, where one binary can run on a low-end CPU but use a higher
> ISA if available).
> 
>  libswscale/ppc/swscale_ppc_template.c |   4 +-
>  libswscale/ppc/swscale_vsx.c          | 195 
> +++++++++++++++++++++++++++++++++-
>  2 files changed, 193 insertions(+), 6 deletions(-)
> 
> diff --git a/libswscale/ppc/swscale_ppc_template.c 
> b/libswscale/ppc/swscale_ppc_template.c
> index 00e4b99..11decab 100644
> --- a/libswscale/ppc/swscale_ppc_template.c
> +++ b/libswscale/ppc/swscale_ppc_template.c
> @@ -21,7 +21,7 @@
>   * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 
> USA
>   */
>  
> -static void FUNC(yuv2planeX_16)(const int16_t *filter, int filterSize,
> +static void FUNC(yuv2planeX_8_16)(const int16_t *filter, int filterSize,
>                                    const int16_t **src, uint8_t *dest,
>                                    const uint8_t *dither, int offset, int x)
>  {
> @@ -88,7 +88,7 @@ static void FUNC(yuv2planeX)(const int16_t *filter, int 
> filterSize,
>      yuv2planeX_u(filter, filterSize, src, dest, dst_u, dither, offset, 0);
>  
>      for (i = dst_u; i < dstW - 15; i += 16)
> -        FUNC(yuv2planeX_16)(filter, filterSize, src, dest + i, dither,
> +        FUNC(yuv2planeX_8_16)(filter, filterSize, src, dest + i, dither,
>                                offset, i);
>  
>      yuv2planeX_u(filter, filterSize, src, dest, dstW, dither, offset, i);
> diff --git a/libswscale/ppc/swscale_vsx.c b/libswscale/ppc/swscale_vsx.c
> index 70da6ae..77680f8 100644
> --- a/libswscale/ppc/swscale_vsx.c
> +++ b/libswscale/ppc/swscale_vsx.c
> @@ -83,6 +83,8 @@
>  #include "swscale_ppc_template.c"
>  #undef FUNC
>  
> +#undef vzero
> +
>  #endif /* !HAVE_BIGENDIAN */
>  
>  static void yuv2plane1_8_u(const int16_t *src, uint8_t *dest, int dstW,
> @@ -180,6 +182,76 @@ static void yuv2plane1_nbps_vsx(const int16_t *src, 
> uint16_t *dest, int dstW,
>      yuv2plane1_nbps_u(src, dest, dstW, big_endian, output_bits, i);
>  }
>  
> +static void yuv2planeX_nbps_u(const int16_t *filter, int filterSize,
> +                              const int16_t **src, uint16_t *dest, int dstW,
> +                              int big_endian, int output_bits, int start)
> +{
> +    int i;
> +    int shift = 11 + 16 - output_bits;
> +
> +    for (i = start; i < dstW; i++) {
> +        int val = 1 << (shift - 1);
> +        int j;
> +
> +        for (j = 0; j < filterSize; j++)
> +            val += src[j][i] * filter[j];
> +
> +        output_pixel(&dest[i], val);
> +    }
> +}
> +
> +static void yuv2planeX_nbps_vsx(const int16_t *filter, int filterSize,
> +                                const int16_t **src, uint16_t *dest, int 
> dstW,
> +                                int big_endian, int output_bits)
> +{
> +    const int dst_u = -(uintptr_t)dest & 7;
> +    const int shift = 11 + 16 - output_bits;
> +    const int add = (1 << (shift - 1));
> +    const int clip = (1 << output_bits) - 1;
> +    const uint16_t swap = big_endian ? 8 : 0;
> +    const vector uint32_t vadd = (vector uint32_t) {add, add, add, add};
> +    const vector uint32_t vshift = (vector uint32_t) {shift, shift, shift, 
> shift};
> +    const vector uint16_t vswap = (vector uint16_t) {swap, swap, swap, swap, 
> swap, swap, swap, swap};
> +    const vector uint16_t vlargest = (vector uint16_t) {clip, clip, clip, 
> clip, clip, clip, clip, clip};
> +    const vector int16_t vzero = vec_splat_s16(0);
> +    const vector uint8_t vperm = (vector uint8_t) {0, 1, 8, 9, 2, 3, 10, 11, 
> 4, 5, 12, 13, 6, 7, 14, 15};
> +    vector int16_t vfilter[MAX_FILTER_SIZE], vin;
> +    vector uint16_t v;
> +    vector uint32_t vleft, vright, vtmp;
> +    int i, j;
> +
> +    for (i = 0; i < filterSize; i++) {
> +        vfilter[i] = (vector int16_t) {filter[i], filter[i], filter[i], 
> filter[i],
> +                                       filter[i], filter[i], filter[i], 
> filter[i]};
> +    }
> +
> +    yuv2planeX_nbps_u(filter, filterSize, src, dest, dst_u, big_endian, 
> output_bits, 0);
> +
> +    for (i = dst_u; i < dstW - 7; i += 8) {
> +        vleft = vright = vadd;
> +
> +        for (j = 0; j < filterSize; j++) {
> +            vin = vec_vsx_ld(0, &src[j][i]);
> +            vtmp = (vector uint32_t) vec_mule(vin, vfilter[j]);
> +            vleft = vec_add(vleft, vtmp);
> +            vtmp = (vector uint32_t) vec_mulo(vin, vfilter[j]);
> +            vright = vec_add(vright, vtmp);
> +        }
> +
> +        vleft = vec_sra(vleft, vshift);
> +        vright = vec_sra(vright, vshift);
> +        v = vec_packsu(vleft, vright);
> +        v = (vector uint16_t) vec_max((vector int16_t) v, vzero);
> +        v = vec_min(v, vlargest);
> +        v = vec_rl(v, vswap);
> +        v = vec_perm(v, v, vperm);
> +        vec_st(v, 0, &dest[i]);
> +    }
> +
> +    yuv2planeX_nbps_u(filter, filterSize, src, dest, dstW, big_endian, 
> output_bits, i);
> +}
> +
> +
>  #undef output_pixel
>  
>  #define output_pixel(pos, val, bias, signedness) \
> @@ -234,7 +306,97 @@ static void yuv2plane1_16_vsx(const int32_t *src, 
> uint16_t *dest, int dstW,
>      yuv2plane1_16_u(src, dest, dstW, big_endian, output_bits, i);
>  }
>  
> +#ifdef HAVE_POWER8

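This should probably be #if rather than #ifdef (configure defines the HAVE_*
macros to 0 or 1, so #ifdef is true either way); the same applies to the other
such checks in this patch.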

[...]
-- 
Michael     GnuPG fingerprint: 9FF2128B147EF6730BADF133611EC787040B0FAB

In fact, the RIAA has been known to suggest that students drop out
of college or go to community college in order to be able to afford
settlements. -- The RIAA

Attachment: signature.asc
Description: PGP signature

_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
http://ffmpeg.org/mailman/listinfo/ffmpeg-devel

Reply via email to