On Tue, Oct 04, 2011 at 10:32:16PM +0200, Janne Grunau wrote:
> 2.5-3 times faster, 5% overall speedup on bourne.rvmb
> ---
>  libavcodec/arm/Makefile            |    4 +
>  libavcodec/arm/rv30dsp_init_neon.c |   36 +++++++++++
>  libavcodec/arm/rv34dsp_neon.S      |  121 ++++++++++++++++++++++++++++++++++++
>  libavcodec/arm/rv40dsp_init_neon.c |    6 ++
>  libavcodec/rv30dsp.c               |    3 +
>  libavcodec/rv34dsp.h               |    2 +
>  6 files changed, 172 insertions(+), 0 deletions(-)
>  create mode 100644 libavcodec/arm/rv30dsp_init_neon.c
>  create mode 100644 libavcodec/arm/rv34dsp_neon.S
> 
> diff --git a/libavcodec/arm/Makefile b/libavcodec/arm/Makefile
> index d6a6961..d680f6e 100644
> --- a/libavcodec/arm/Makefile
> +++ b/libavcodec/arm/Makefile
> @@ -62,7 +62,10 @@ NEON-OBJS-$(CONFIG_AC3DSP)             += arm/ac3dsp_neon.o
>  NEON-OBJS-$(CONFIG_DCA_DECODER)        += arm/dcadsp_neon.o             \
>                                            arm/synth_filter_neon.o       \
>  
> +NEON-OBJS-$(CONFIG_RV30_DECODER)       += arm/rv34dsp_neon.o            \
> +
>  NEON-OBJS-$(CONFIG_RV40_DECODER)       += arm/rv40dsp_neon.o            \
> +                                          arm/rv34dsp_neon.o            \
>  
>  NEON-OBJS-$(CONFIG_VP3_DECODER)        += arm/vp3dsp_neon.o
>  
> @@ -79,6 +82,7 @@ OBJS-$(HAVE_NEON)                      += arm/dsputil_init_neon.o       \
>                                            arm/fmtconvert_neon.o         \
>                                            arm/int_neon.o                \
>                                            arm/mpegvideo_neon.o          \
> +                                          arm/rv30dsp_init_neon.o       \
>                                            arm/rv40dsp_init_neon.o       \
>                                            arm/simple_idct_neon.o        \
>                                            $(NEON-OBJS-yes)
> diff --git a/libavcodec/arm/rv30dsp_init_neon.c b/libavcodec/arm/rv30dsp_init_neon.c
> new file mode 100644
> index 0000000..0943db4
> --- /dev/null
> +++ b/libavcodec/arm/rv30dsp_init_neon.c
> @@ -0,0 +1,36 @@
> +/*
> + * ARM NEON optimised DSP functions
> + * Copyright (c) 2011 Janne Grunau <[email protected]>
> + *
> + * This file is part of Libav.
> + *
> + * Libav is free software; you can redistribute it and/or
> + * modify it under the terms of the GNU Lesser General Public
> + * License as published by the Free Software Foundation; either
> + * version 2.1 of the License, or (at your option) any later version.
> + *
> + * Libav is distributed in the hope that it will be useful,
> + * but WITHOUT ANY WARRANTY; without even the implied warranty of
> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> + * Lesser General Public License for more details.
> + *
> + * You should have received a copy of the GNU Lesser General Public
> + * License along with Libav; if not, write to the Free Software
> + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
> + */
> +
> +#include <stdint.h>
> +
> +#include "libavcodec/avcodec.h"
> +#include "libavcodec/rv34dsp.h"
> +
> +void ff_rv34_inv_transform_neon(DCTELEM *block);
> +void ff_rv34_inv_transform_noround_neon(DCTELEM *block);
> +
> +void ff_rv30dsp_init_neon(RV34DSPContext *c, DSPContext* dsp)
> +{
> +    c->rv34_inv_transform_tab[0] = ff_rv34_inv_transform_neon;
> +    c->rv34_inv_transform_tab[1] = ff_rv34_inv_transform_noround_neon;
> +
> +    return;

This explicit `return;` at the end of a void function is redundant; please drop it.

> +}
> diff --git a/libavcodec/arm/rv34dsp_neon.S b/libavcodec/arm/rv34dsp_neon.S
> new file mode 100644
> index 0000000..9414db2
> --- /dev/null
> +++ b/libavcodec/arm/rv34dsp_neon.S
> @@ -0,0 +1,121 @@
> +/*
> + * Copyright (c) 2011 Janne Grunau <[email protected]>
> + *
> + * This file is part of Libav.
> + *
> + * Libav is free software; you can redistribute it and/or
> + * modify it under the terms of the GNU Lesser General Public
> + * License as published by the Free Software Foundation; either
> + * version 2.1 of the License, or (at your option) any later version.
> + *
> + * Libav is distributed in the hope that it will be useful,
> + * but WITHOUT ANY WARRANTY; without even the implied warranty of
> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> + * Lesser General Public License for more details.
> + *
> + * You should have received a copy of the GNU Lesser General Public
> + * License along with Libav; if not, write to the Free Software
> + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
> + */
> +
> +#include "libavcodec/arm/asm.S"
> +
> +.macro rv34_row_transform
> +        mov        r1,     #16
> +        mov        r3,     #13
> +        vmov.s32   d0[0],  r3
> +        vld1.16    {d28},  [r0], r1        @ load block[i+8*0]
> +        vld1.16    {d29},  [r0], r1        @ load block[i+8*1]
> +        mov        r3,     #7
> +        vmov.s16   d0[2],  r3
> +        vmov.s32   d1[0],  r3
> +        mov        r3,     #17
> +        vmov.s16   d0[3],  r3
> +        vmov.s32   d1[1],  r3
> +        vld1.16    {d30},  [r0], r1        @ load block[i+8*2]
> +        vmull.s16  q12,    d29,  d0[2]     @ z2 = block[i+8*1]*7
> +        vld1.16    {d31},  [r0], r1        @ load block[i+8*3]
> +        vmull.s16  q13,    d29,  d0[3]     @ z3 = block[i+8*1]*17
> +        vaddl.s16  q10,    d28,  d30       @ block[i+8*0] + block[i+8*2]
> +        vmul.i32   q10,    q10,  d0[0]     @ z0 = 13*(block[i+8*0] + block[i+8*2])
> +        vsubl.s16  q11,    d28,  d30       @ block[i+8*0] - block[i+8*2]
> +        vmul.i32   q11,    q11,  d0[0]     @ z1 = 13*(block[i+8*0] - block[i+8*2])
> +        vmlsl.s16  q12,    d31,  d0[3]     @ z2 = block[i+8*1]*7  - block[i+8*3]*17
> +        vadd.s32   q2,     q11,  q12       @  z1 + z2
> +        vsub.s32   q3,     q11,  q12       @  z1 - z2
> +        vmlal.s16  q13,    d31,  d0[2]     @ z3 = block[i+8*1]*17 + block[i+8*3]*7
> +        vadd.s32   q1,     q10,  q13       @  z0 + z3
> +        vsub.s32   q8,     q10,  q13       @  z0 - z3
> +.endm

Also, wouldn't it be faster to replace some of the multiplications by small
constants with the classic shift-and-add trick (e.g. X*7 = (X << 3) - X,
X*17 = (X << 4) + X)?
_______________________________________________
libav-devel mailing list
[email protected]
https://lists.libav.org/mailman/listinfo/libav-devel

Reply via email to