On Tue, Oct 04, 2011 at 10:32:16PM +0200, Janne Grunau wrote: > 2.5-3 times faster, 5% overall speedup on bourne.rvmb > --- > libavcodec/arm/Makefile | 4 + > libavcodec/arm/rv30dsp_init_neon.c | 36 +++++++++++ > libavcodec/arm/rv34dsp_neon.S | 121 > ++++++++++++++++++++++++++++++++++++ > libavcodec/arm/rv40dsp_init_neon.c | 6 ++ > libavcodec/rv30dsp.c | 3 + > libavcodec/rv34dsp.h | 2 + > 6 files changed, 172 insertions(+), 0 deletions(-) > create mode 100644 libavcodec/arm/rv30dsp_init_neon.c > create mode 100644 libavcodec/arm/rv34dsp_neon.S > > diff --git a/libavcodec/arm/Makefile b/libavcodec/arm/Makefile > index d6a6961..d680f6e 100644 > --- a/libavcodec/arm/Makefile > +++ b/libavcodec/arm/Makefile > @@ -62,7 +62,10 @@ NEON-OBJS-$(CONFIG_AC3DSP) += arm/ac3dsp_neon.o > NEON-OBJS-$(CONFIG_DCA_DECODER) += arm/dcadsp_neon.o \ > arm/synth_filter_neon.o \ > > +NEON-OBJS-$(CONFIG_RV30_DECODER) += arm/rv34dsp_neon.o \ > + > NEON-OBJS-$(CONFIG_RV40_DECODER) += arm/rv40dsp_neon.o \ > + arm/rv34dsp_neon.o \ > > NEON-OBJS-$(CONFIG_VP3_DECODER) += arm/vp3dsp_neon.o > > @@ -79,6 +82,7 @@ OBJS-$(HAVE_NEON) += > arm/dsputil_init_neon.o \ > arm/fmtconvert_neon.o \ > arm/int_neon.o \ > arm/mpegvideo_neon.o \ > + arm/rv30dsp_init_neon.o \ > arm/rv40dsp_init_neon.o \ > arm/simple_idct_neon.o \ > $(NEON-OBJS-yes) > diff --git a/libavcodec/arm/rv30dsp_init_neon.c > b/libavcodec/arm/rv30dsp_init_neon.c > new file mode 100644 > index 0000000..0943db4 > --- /dev/null > +++ b/libavcodec/arm/rv30dsp_init_neon.c > @@ -0,0 +1,36 @@ > +/* > + * ARM NEON optimised DSP functions > + * Copyright (c) 2011 Janne Grunau <[email protected]> > + * > + * This file is part of Libav. > + * > + * Libav is free software; you can redistribute it and/or > + * modify it under the terms of the GNU Lesser General Public > + * License as published by the Free Software Foundation; either > + * version 2.1 of the License, or (at your option) any later version. 
> + * > + * Libav is distributed in the hope that it will be useful, > + * but WITHOUT ANY WARRANTY; without even the implied warranty of > + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU > + * Lesser General Public License for more details. > + * > + * You should have received a copy of the GNU Lesser General Public > + * License along with Libav; if not, write to the Free Software > + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 > USA > + */ > + > +#include <stdint.h> > + > +#include "libavcodec/avcodec.h" > +#include "libavcodec/rv34dsp.h" > + > +void ff_rv34_inv_transform_neon(DCTELEM *block); > +void ff_rv34_inv_transform_noround_neon(DCTELEM *block); > + > +void ff_rv30dsp_init_neon(RV34DSPContext *c, DSPContext* dsp) > +{ > + c->rv34_inv_transform_tab[0] = ff_rv34_inv_transform_neon; > + c->rv34_inv_transform_tab[1] = ff_rv34_inv_transform_noround_neon; > + > + return;
This return is redundant: the function returns void and ends right here anyway. > +} > diff --git a/libavcodec/arm/rv34dsp_neon.S b/libavcodec/arm/rv34dsp_neon.S > new file mode 100644 > index 0000000..9414db2 > --- /dev/null > +++ b/libavcodec/arm/rv34dsp_neon.S > @@ -0,0 +1,121 @@ > +/* > + * Copyright (c) 2011 Janne Grunau <[email protected]> > + * > + * This file is part of Libav. > + * > + * Libav is free software; you can redistribute it and/or > + * modify it under the terms of the GNU Lesser General Public > + * License as published by the Free Software Foundation; either > + * version 2.1 of the License, or (at your option) any later version. > + * > + * Libav is distributed in the hope that it will be useful, > + * but WITHOUT ANY WARRANTY; without even the implied warranty of > + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU > + * Lesser General Public License for more details. > + * > + * You should have received a copy of the GNU Lesser General Public > + * License along with Libav; if not, write to the Free Software > + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 > USA > + */ > + > +#include "libavcodec/arm/asm.S" > + > +.macro rv34_row_transform > + mov r1, #16 > + mov r3, #13 > + vmov.s32 d0[0], r3 > + vld1.16 {d28}, [r0], r1 @ load block[i+8*0] > + vld1.16 {d29}, [r0], r1 @ load block[i+8*1] > + mov r3, #7 > + vmov.s16 d0[2], r3 > + vmov.s32 d1[0], r3 > + mov r3, #17 > + vmov.s16 d0[3], r3 > + vmov.s32 d1[1], r3 > + vld1.16 {d30}, [r0], r1 @ load block[i+8*2] > + vmull.s16 q12, d29, d0[2] @ z2 = block[i+8*1]*7 > + vld1.16 {d31}, [r0], r1 @ load block[i+8*3] > + vmull.s16 q13, d29, d0[3] @ z3 = block[i+8*1]*17 > + vaddl.s16 q10, d28, d30 @ block[i+8*0] + block[i+8*2] > + vmul.i32 q10, q10, d0[0] @ z0 = 13*(block[i+8*0] + > block[i+8*2]) > + vsubl.s16 q11, d28, d30 @ block[i+8*0] - block[i+8*2] > + vmul.i32 q11, q11, d0[0] @ z1 = 13*(block[i+8*0] - > block[i+8*2]) > + vmlsl.s16 q12, d31, d0[3] @ z2 = block[i+8*1]*7 - > block[i+8*3]*17 > + vadd.s32 q2, q11, 
q12 @ z1 + z2 > + vsub.s32 q3, q11, q12 @ z1 - z2 > + vmlal.s16 q13, d31, d0[2] @ z3 = block[i+8*1]*17 + > block[i+8*3]*7 > + vadd.s32 q1, q10, q13 @ z0 + z3 > + vsub.s32 q8, q10, q13 @ z0 - z3 > +.endm Also, wouldn't it be faster to use the oldest trick in the book for some of the multiplications, i.e. replacing a multiply by a constant with a shift plus an add or subtract (e.g. X*7 = (X << 3) - X, X*17 = (X << 4) + X, etc.)? _______________________________________________ libav-devel mailing list [email protected] https://lists.libav.org/mailman/listinfo/libav-devel
