Janne Grunau <[email protected]> writes:

> Overall 3% faster, idct_add down from 340 to 101 cycles, idct_dc_add
> down from 96 to 54 cycles.
> ---
>  libavcodec/arm/rv34dsp_init_neon.c |    6 +++
>  libavcodec/arm/rv34dsp_neon.S      |   65 ++++++++++++++++++++++++++++++++++++
>  2 files changed, 71 insertions(+), 0 deletions(-)
>
> diff --git a/libavcodec/arm/rv34dsp_init_neon.c b/libavcodec/arm/rv34dsp_init_neon.c
> index 3984d43..744818c 100644
> --- a/libavcodec/arm/rv34dsp_init_neon.c
> +++ b/libavcodec/arm/rv34dsp_init_neon.c
> @@ -27,8 +27,14 @@ void ff_rv34_inv_transform_noround_neon(DCTELEM *block);
>
>  void ff_rv34_inv_transform_noround_dc_neon(DCTELEM *block);
>
> +void ff_rv34_idct_add_neon(uint8_t *dst, int stride, DCTELEM *block);
> +void ff_rv34_idct_dc_add_neon(uint8_t *dst, int stride, int dc);
> +
>  void ff_rv34dsp_init_neon(RV34DSPContext *c, DSPContext* dsp)
>  {
>      c->rv34_inv_transform    = ff_rv34_inv_transform_noround_neon;
>      c->rv34_inv_transform_dc = ff_rv34_inv_transform_noround_dc_neon;
> +
> +    c->rv34_idct_add    = ff_rv34_idct_add_neon;
> +    c->rv34_idct_dc_add = ff_rv34_idct_dc_add_neon;
>  }
> diff --git a/libavcodec/arm/rv34dsp_neon.S b/libavcodec/arm/rv34dsp_neon.S
> index a156412..f3ae0a6 100644
> --- a/libavcodec/arm/rv34dsp_neon.S
> +++ b/libavcodec/arm/rv34dsp_neon.S
> @@ -66,6 +66,42 @@
>          vsub.s32        q15, q14, q9    @ z0 - z3
>  .endm
>
> +/* void rv34_idct_add_c(uint8_t *dst, int stride, DCTELEM *block) */
> +function ff_rv34_idct_add_neon, export=1
> +        mov             r3,  r0
> +        mov             r0,  r2
> +        rv34_inv_transform
> +        mov             r0,  r3

Change the macro to take the base register as an argument, so there is
no need to shuffle registers around here.

> +        vrshrn.s32      d22, q15, #10   @ (z0 - z3) >> 10
> +        veor.s16        q12, q12
> +        veor.s16        q13, q13

Use "vmov.i16 dst, #0" to zero a register instead of veor-ing it with
itself.

> +        vld4.8          {d28[0], d29[0], d30[0], d31[0]}, [r0], r1
> +        vld4.8          {d28[1], d29[1], d30[1], d31[1]}, [r0], r1
> +        vld4.8          {d28[2], d29[2], d30[2], d31[2]}, [r0], r1
> +        vld4.8          {d28[3], d29[3], d30[3], d31[3]}, [r0], r1

Transpose the idct output instead (after shift/narrow).  Then load the
pixel values packing two lines into each d register for more efficient
processing (see below).

> +        vrshrn.s32      d16, q1,  #10   @ (z0 + z3) >> 10
> +        vrshrn.s32      d18, q2,  #10   @ (z1 + z2) >> 10
> +        vrshrn.s32      d20, q3,  #10   @ (z1 - z2) >> 10

Why the strange order for these?  Since q15 is produced last in the
macro, it makes no sense at all to shift it before the others.
Furthermore, keeping all four shifts together makes the code easier to
read without harming performance.

> +        vst1.16         {q12-q13}, [r2,:128]    @ memset(block, 0, 16*2)

This is no faster than two 16-byte stores, so you could do that instead
if you're short on registers.

> +        vmovl.u8        q0,  d28
> +        vmovl.u8        q1,  d29
> +        vmovl.u8        q2,  d30
> +        vmovl.u8        q3,  d31
> +        vadd.s16        d24, d0,  d16   @ dst[i*4 + 0] + ((z0 + z3) >> 10)
> +        vadd.s16        d26, d2,  d18   @ dst[i*4 + 1] + ((z1 + z2) >> 10)
> +        vadd.s16        d28, d4,  d20   @ dst[i*4 + 2] + ((z1 - z2) >> 10)
> +        vadd.s16        d30, d6,  d22   @ dst[i*4 + 3] + ((z0 - z3) >> 10)

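Drop the vmovl and use vaddw.u8.  It doesn't matter that you're adding
to signed values: the pixels are zero-extended to 16 bits, and a
two's-complement add produces the same bits either way.  If you arrange
the idct output properly, you can add two lines at once.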

> +        vqmovun.s16     d0,  q12
> +        vqmovun.s16     d1,  q13
> +        vqmovun.s16     d2,  q14
> +        vqmovun.s16     d3,  q15
> +        vst4.8          {d0[0], d1[0], d2[0], d3[0]}, [r3], r1
> +        vst4.8          {d0[1], d1[1], d2[1], d3[1]}, [r3], r1
> +        vst4.8          {d0[2], d1[2], d2[2], d3[2]}, [r3], r1
> +        vst4.8          {d0[3], d1[3], d2[3], d3[3]}, [r3], r1

Use aligned stores and adapt as necessary to the changes suggested above.

> +        bx              lr
> +endfunc
> +
>  /* void rv34_inv_transform_noround_neon(DCTELEM *block); */
>  function ff_rv34_inv_transform_noround_neon, export=1
>          rv34_inv_transform
> @@ -88,6 +124,35 @@ function ff_rv34_inv_transform_noround_neon, export=1
>          bx              lr
>  endfunc
>
> +/* void ff_rv34_idct_dc_add_neon(uint8_t *dst, int stride, int dc) */
> +function ff_rv34_idct_dc_add_neon, export=1
> +        mov             r3,  r0
> +        vld4.8          {d28[0], d29[0], d30[0], d31[0]}, [r0], r1
> +        vld4.8          {d28[1], d29[1], d30[1], d31[1]}, [r0], r1
> +        vld4.8          {d28[2], d29[2], d30[2], d31[2]}, [r0], r1
> +        vld4.8          {d28[3], d29[3], d30[3], d31[3]}, [r0], r1

Use aligned vld1.32 instructions packing two rows per d register.

> +        vdup.16         d0,  r2
> +        vmov.s16        d1,  #169
> +        vmovl.u8        q8,  d28
> +        vmull.s16       q1,  d0,  d1    @ dc * 13 * 13
> +        vmovl.u8        q9,  d29
> +        vrshrn.s32      d0,  q1,  #10   @ (dc * 13 * 13 + 0x200) >> 10
> +        vmovl.u8        q10, d30
> +        vmovl.u8        q11, d31
> +        vadd.s16        d4,  d0,  d16
> +        vadd.s16        d6,  d0,  d18
> +        vadd.s16        d24, d0,  d20
> +        vadd.s16        d26, d0,  d22

Drop the vmovl and use two vaddw.u8 instructions processing two rows
each.

> +        vqmovun.s16     d28, q2
> +        vqmovun.s16     d29, q3
> +        vqmovun.s16     d30, q12
> +        vqmovun.s16     d31, q13
> +        vst4.8          {d28[0], d29[0], d30[0], d31[0]}, [r3], r1
> +        vst4.8          {d28[1], d29[1], d30[1], d31[1]}, [r3], r1
> +        vst4.8          {d28[2], d29[2], d30[2], d31[2]}, [r3], r1
> +        vst4.8          {d28[3], d29[3], d30[3], d31[3]}, [r3], r1

Use aligned stores and adapt to the changes requested.

> +        bx              lr
> +endfunc
>
>  /* void rv34_inv_transform_dc_noround_c(DCTELEM *block) */
>  function ff_rv34_inv_transform_noround_dc_neon, export=1
> -- 
> 1.7.8.3
>

-- 
Måns Rullgård
[email protected]
_______________________________________________
libav-devel mailing list
[email protected]
https://lists.libav.org/mailman/listinfo/libav-devel

Reply via email to