Janne Grunau <[email protected]> writes:
> Overall 3% faster, idct_add down from 340 to 101 cycles, idct_dc_add
> down from 96 to 54 cycles.
> ---
> libavcodec/arm/rv34dsp_init_neon.c | 6 +++
> libavcodec/arm/rv34dsp_neon.S | 65
> ++++++++++++++++++++++++++++++++++++
> 2 files changed, 71 insertions(+), 0 deletions(-)
>
> diff --git a/libavcodec/arm/rv34dsp_init_neon.c
> b/libavcodec/arm/rv34dsp_init_neon.c
> index 3984d43..744818c 100644
> --- a/libavcodec/arm/rv34dsp_init_neon.c
> +++ b/libavcodec/arm/rv34dsp_init_neon.c
> @@ -27,8 +27,14 @@ void ff_rv34_inv_transform_noround_neon(DCTELEM *block);
>
> void ff_rv34_inv_transform_noround_dc_neon(DCTELEM *block);
>
> +void ff_rv34_idct_add_neon(uint8_t *dst, int stride, DCTELEM *block);
> +void ff_rv34_idct_dc_add_neon(uint8_t *dst, int stride, int dc);
> +
> void ff_rv34dsp_init_neon(RV34DSPContext *c, DSPContext* dsp)
> {
> c->rv34_inv_transform = ff_rv34_inv_transform_noround_neon;
> c->rv34_inv_transform_dc = ff_rv34_inv_transform_noround_dc_neon;
> +
> + c->rv34_idct_add = ff_rv34_idct_add_neon;
> + c->rv34_idct_dc_add = ff_rv34_idct_dc_add_neon;
> }
> diff --git a/libavcodec/arm/rv34dsp_neon.S b/libavcodec/arm/rv34dsp_neon.S
> index a156412..f3ae0a6 100644
> --- a/libavcodec/arm/rv34dsp_neon.S
> +++ b/libavcodec/arm/rv34dsp_neon.S
> @@ -66,6 +66,42 @@
> vsub.s32 q15, q14, q9 @ z0 - z3
> .endm
>
> +/* void rv34_idct_add_c(uint8_t *dst, int stride, DCTELEM *block) */
> +function ff_rv34_idct_add_neon, export=1
> + mov r3, r0
> + mov r0, r2
> + rv34_inv_transform
> + mov r0, r3
Change the macro to take the base register as an argument and avoid
shuffling the registers around here.
> + vrshrn.s32 d22, q15, #10 @ (z0 - z3) >> 10
> + veor.s16 q12, q12
> + veor.s16 q13, q13
Use vmov.i16 dst, #0 to clear the registers instead of veor.
> + vld4.8 {d28[0], d29[0], d30[0], d31[0]}, [r0], r1
> + vld4.8 {d28[1], d29[1], d30[1], d31[1]}, [r0], r1
> + vld4.8 {d28[2], d29[2], d30[2], d31[2]}, [r0], r1
> + vld4.8 {d28[3], d29[3], d30[3], d31[3]}, [r0], r1
Transpose the idct output instead (after shift/narrow). Then load the
pixel values packing two lines into each d register for more efficient
processing (see below).
> + vrshrn.s32 d16, q1, #10 @ (z0 + z3) >> 10
> + vrshrn.s32 d18, q2, #10 @ (z1 + z2) >> 10
> + vrshrn.s32 d20, q3, #10 @ (z1 - z2) >> 10
Why the strange order for these? Since q15 is produced last in the
macro, it makes no sense at all to shift it before the others.
Furthermore, keeping them all together makes the code easier to read
without harming performance.
> + vst1.16 {q12-q13}, [r2,:128] @ memset(block, 0, 16*2)
This is no faster than two 16-byte stores, so you could do that instead
if you're short on registers.
> + vmovl.u8 q0, d28
> + vmovl.u8 q1, d29
> + vmovl.u8 q2, d30
> + vmovl.u8 q3, d31
> + vadd.s16 d24, d0, d16 @ dst[i*4 + 0] + ((z0 + z3) >> 10)
> + vadd.s16 d26, d2, d18 @ dst[i*4 + 1] + ((z1 + z2) >> 10)
> + vadd.s16 d28, d4, d20 @ dst[i*4 + 2] + ((z1 - z2) >> 10)
> + vadd.s16 d30, d6, d22 @ dst[i*4 + 3] + ((z0 - z3) >> 10)
Drop the vmovl and use vaddw.u8. It doesn't matter that you're adding
to signed values. If you arrange the idct output properly, you can add
two lines at once.
> + vqmovun.s16 d0, q12
> + vqmovun.s16 d1, q13
> + vqmovun.s16 d2, q14
> + vqmovun.s16 d3, q15
> + vst4.8 {d0[0], d1[0], d2[0], d3[0]}, [r3], r1
> + vst4.8 {d0[1], d1[1], d2[1], d3[1]}, [r3], r1
> + vst4.8 {d0[2], d1[2], d2[2], d3[2]}, [r3], r1
> + vst4.8 {d0[3], d1[3], d2[3], d3[3]}, [r3], r1
Use aligned stores and adapt as necessary to the changes suggested above.
> + bx lr
> +endfunc
> +
> /* void rv34_inv_transform_noround_neon(DCTELEM *block); */
> function ff_rv34_inv_transform_noround_neon, export=1
> rv34_inv_transform
> @@ -88,6 +124,35 @@ function ff_rv34_inv_transform_noround_neon, export=1
> bx lr
> endfunc
>
> +/* void ff_rv34_idct_dc_add_neon(uint8_t *dst, int stride, int dc) */
> +function ff_rv34_idct_dc_add_neon, export=1
> + mov r3, r0
> + vld4.8 {d28[0], d29[0], d30[0], d31[0]}, [r0], r1
> + vld4.8 {d28[1], d29[1], d30[1], d31[1]}, [r0], r1
> + vld4.8 {d28[2], d29[2], d30[2], d31[2]}, [r0], r1
> + vld4.8 {d28[3], d29[3], d30[3], d31[3]}, [r0], r1
Use aligned vld1.32 instructions packing two rows per d register.
> + vdup.16 d0, r2
> + vmov.s16 d1, #169
> + vmovl.u8 q8, d28
> + vmull.s16 q1, d0, d1 @ dc * 13 * 13
> + vmovl.u8 q9, d29
> + vrshrn.s32 d0, q1, #10 @ (dc * 13 * 13 + 0x200) >> 10
> + vmovl.u8 q10, d30
> + vmovl.u8 q11, d31
> + vadd.s16 d4, d0, d16
> + vadd.s16 d6, d0, d18
> + vadd.s16 d24, d0, d20
> + vadd.s16 d26, d0, d22
Drop the vmovl and use two vaddw.u8 instructions processing two rows
each.
> + vqmovun.s16 d28, q2
> + vqmovun.s16 d29, q3
> + vqmovun.s16 d30, q12
> + vqmovun.s16 d31, q13
> + vst4.8 {d28[0], d29[0], d30[0], d31[0]}, [r3], r1
> + vst4.8 {d28[1], d29[1], d30[1], d31[1]}, [r3], r1
> + vst4.8 {d28[2], d29[2], d30[2], d31[2]}, [r3], r1
> + vst4.8 {d28[3], d29[3], d30[3], d31[3]}, [r3], r1
Use aligned stores and adapt to the changes requested.
> + bx lr
> +endfunc
>
> /* void rv34_inv_transform_dc_noround_c(DCTELEM *block) */
> function ff_rv34_inv_transform_noround_dc_neon, export=1
> --
> 1.7.8.3
>
--
Måns Rullgård
[email protected]
_______________________________________________
libav-devel mailing list
[email protected]
https://lists.libav.org/mailman/listinfo/libav-devel