On 2016-11-23 15:00:51 +0200, Martin Storsjö wrote:
> This work is sponsored by, and copyright, Google.
> 
> Previously all subpartitions except the eob=1 (DC) case ran with
> the same runtime:
> 
> vp9_inv_dct_dct_16x16_sub16_add_neon:   3189.0   2486.8   2509.9   1964.1
> vp9_inv_dct_dct_32x32_sub32_add_neon:  18448.1  16682.0  14235.4  11993.4
> 
> By skipping individual 4x16 or 4x32 pixel slices in the first pass,
> we reduce the runtime of these functions like this:
> 
> vp9_inv_dct_dct_16x16_sub1_add_neon:     271.5    188.7    211.6    235.1
> vp9_inv_dct_dct_16x16_sub4_add_neon:    2079.7   1606.3   1772.1   1264.8
> vp9_inv_dct_dct_16x16_sub8_add_neon:    2449.2   1834.3   2046.5   1499.7
> vp9_inv_dct_dct_16x16_sub12_add_neon:   2826.2   2109.2   2295.9   1758.2
> vp9_inv_dct_dct_16x16_sub16_add_neon:   3224.1   2476.5   2533.1   1985.7
> vp9_inv_dct_dct_32x32_sub1_add_neon:     752.5    457.5    863.7    554.7
> vp9_inv_dct_dct_32x32_sub4_add_neon:   10689.2   8013.4   8592.9   6785.9
> vp9_inv_dct_dct_32x32_sub8_add_neon:   12217.8   9068.1   9420.4   7518.3
> vp9_inv_dct_dct_32x32_sub12_add_neon:  12967.3  10455.5  10223.9   8275.7
> vp9_inv_dct_dct_32x32_sub16_add_neon:  14084.1  11933.7  10998.9   9012.5
> vp9_inv_dct_dct_32x32_sub20_add_neon:  15171.4  13335.0  11820.6   9757.2
> vp9_inv_dct_dct_32x32_sub24_add_neon:  16229.6  15185.7  12614.4  10504.9
> vp9_inv_dct_dct_32x32_sub28_add_neon:  17338.1  15955.3  13445.0  11248.4
> vp9_inv_dct_dct_32x32_sub32_add_neon:  18465.7  16974.6  14239.2  11999.1
> 
> I.e. in general a very minor overhead for the full subpartition case due
> to the additional cmps, but a significant speedup for the cases when we
> only need to process a small part of the actual input data.
> 
> In common VP9 content in a few inspected clips, 70-90% of the non-dc-only
> 16x16 and 32x32 IDCTs only have nonzero coefficients in the upper left
> 8x8 or 16x16 subpartitions respectively.
> ---
> This goes on top of the checkasm vp9dsp patch that adds benchmarking
> of generic subpartitions in the itxfm.
> ---
>  libavcodec/arm/vp9itxfm_neon.S | 70 ++++++++++++++++++++++++++++++++++++------
>  tests/checkasm/vp9dsp.c        |  6 ++--
>  2 files changed, 64 insertions(+), 12 deletions(-)
> 
> diff --git a/libavcodec/arm/vp9itxfm_neon.S b/libavcodec/arm/vp9itxfm_neon.S
> index 01944bd..769579a 100644
> --- a/libavcodec/arm/vp9itxfm_neon.S
> +++ b/libavcodec/arm/vp9itxfm_neon.S
> @@ -659,10 +659,17 @@ endfunc
>  @ Read a vertical 4x16 slice out of a 16x16 matrix, do a transform on it,
>  @ transpose into a horizontal 16x4 slice and store.
>  @ r0 = dst (temp buffer)
> -@ r1 = unused
> +@ r1 = slice offset
>  @ r2 = src
> -@ r3 = slice offset
> +@ r3 = eob
> +@ r9 = min eob
>  function \txfm\()16_1d_4x16_pass1_neon
> +.ifc \txfm,idct
> +        @ Check if this whole input slice is zero
> +        cmp             r3,  r9
> +        ble             2f

Once this check is true, it stays true for all remaining slices, so we
should move it out into the main function.

> +.endif
> +
>          mov             r12, #32
>          vmov.s16        q2, #0
>  .irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
> @@ -678,14 +685,14 @@ function \txfm\()16_1d_4x16_pass1_neon
>          transpose16_q_4x_4x4 q8,  q9,  q10, q11, q12, q13, q14, q15, d16, d17, d18, d19, d20, d21, d22, d23, d24, d25, d26, d27, d28, d29, d30, d31
>  
>          @ Store the transposed 4x4 blocks horizontally.
> -        cmp             r3,  #12
> +        cmp             r1,  #12
>          beq             1f
>  .irp i, 16, 20, 24, 28, 17, 21, 25, 29, 18, 22, 26, 30, 19, 23, 27, 31
>          vst1.16         {d\i}, [r0,:64]!
>  .endr
>          bx              lr
>  1:
> -        @ Special case: For the last input column (r3 == 12),
> +        @ Special case: For the last input column (r1 == 12),
>          @ which would be stored as the last row in the temp buffer,
>          @ don't store the first 4x4 block, but keep it in registers
>          @ for the first slice of the second pass (where it is the
> @@ -711,6 +718,18 @@ function \txfm\()16_1d_4x16_pass1_neon
>          vmov            d30, d18
>          vmov            d31, d19
>          bx              lr
> +
> +.ifc \txfm,idct
> +2:
> +        @ Set d28-d31 to zero, for the in-register passthrough of coefficients to pass 2
> +        vmov.i16        q14, #0
> +        vmov.i16        q15, #0
> +        @ Write zeros to the temp buffer for pass 2
> +.rept 4
> +        vst1.16         {q14-q15}, [r0,:128]!
> +.endr
> +        bx              lr
> +.endif
>  endfunc
>  
>  @ Read a vertical 4x16 slice out of a 16x16 matrix, do a transform on it,
> @@ -781,15 +800,23 @@ endfunc
>  itxfm16_1d_funcs idct
>  itxfm16_1d_funcs iadst
>  
> +@ This is the minimum eob value for each subpartition, in increments of 4
> +const min_eob_idct_idct_16, align=4
> +        .short  0, 10, 38, 89
> +endconst
> +
>  .macro itxfm_func16x16 txfm1, txfm2
>  function ff_vp9_\txfm1\()_\txfm2\()_16x16_add_neon, export=1
>  .ifc \txfm1\()_\txfm2,idct_idct
>          cmp             r3,  #1
>          beq             idct16x16_dc_add_neon
>  .endif
> -        push            {r4-r7,lr}
> +        push            {r4-r9,lr}
>  .ifnc \txfm1\()_\txfm2,idct_idct
>          vpush           {q4-q7}
> +        mov             r9,  #0
> +.else
> +        movrel          r8,  min_eob_idct_idct_16
>  .endif
>  
>          @ Align the stack, allocate a temp buffer
> @@ -810,8 +837,11 @@ A       and             r7,  sp,  #15
>  
>  .irp i, 0, 4, 8, 12
>          add             r0,  sp,  #(\i*32)
> +        mov             r1,  #\i
>          add             r2,  r6,  #(\i*2)
> -        mov             r3,  #\i
> +.ifc \txfm1\()_\txfm2,idct_idct
> +        ldrh            r9,  [r8, #(\i/2)]

Using the writeback variant would, IMO, look clearer, although it increases
the code size for Thumb (if we care about that).

> +.endif

Move this to the beginning and load into r1, cmp with eob, conditionally
store how much stack space needs to be cleared, and jump out of the '.irp'.
That saves r9, and also several jumps and comparisons if eob is small.

>          bl              \txfm1\()16_1d_4x16_pass1_neon
>  .endr
>  .ifc \txfm2,idct
> @@ -830,7 +860,7 @@ A       and             r7,  sp,  #15
>  .ifnc \txfm1\()_\txfm2,idct_idct
>          vpop            {q4-q7}
>  .endif
> -        pop             {r4-r7,pc}
> +        pop             {r4-r9,pc}
>  endfunc
>  .endm
>  
> @@ -944,9 +974,14 @@ endfunc
>  @ each output written twice), followed by a separate 16-point IDCT
>  @ of the odd inputs, added/subtracted onto the outputs of the first idct16.
>  @ r0 = dst (temp buffer)
> -@ r1 = unused
> +@ r1 = min eob
>  @ r2 = src
> +@ r3 = eob
>  function idct32_1d_4x32_pass1_neon
> +        @ Check if this whole input slice is zero
> +        cmp             r3,  r1
> +        ble             1f

The same applies here as for the 16x16 idct.

Janne
_______________________________________________
libav-devel mailing list
[email protected]
https://lists.libav.org/mailman/listinfo/libav-devel

Reply via email to