On 2016-11-23 15:00:51 +0200, Martin Storsjö wrote:
> This work is sponsored by, and copyright, Google.
>
> Previously all subpartitions except the eob=1 (DC) case ran with
> the same runtime:
>
> vp9_inv_dct_dct_16x16_sub16_add_neon: 3189.0 2486.8 2509.9 1964.1
> vp9_inv_dct_dct_32x32_sub32_add_neon: 18448.1 16682.0 14235.4 11993.4
>
> By skipping individual 4x16 or 4x32 pixel slices in the first pass,
> we reduce the runtime of these functions like this:
>
> vp9_inv_dct_dct_16x16_sub1_add_neon: 271.5 188.7 211.6 235.1
> vp9_inv_dct_dct_16x16_sub4_add_neon: 2079.7 1606.3 1772.1 1264.8
> vp9_inv_dct_dct_16x16_sub8_add_neon: 2449.2 1834.3 2046.5 1499.7
> vp9_inv_dct_dct_16x16_sub12_add_neon: 2826.2 2109.2 2295.9 1758.2
> vp9_inv_dct_dct_16x16_sub16_add_neon: 3224.1 2476.5 2533.1 1985.7
> vp9_inv_dct_dct_32x32_sub1_add_neon: 752.5 457.5 863.7 554.7
> vp9_inv_dct_dct_32x32_sub4_add_neon: 10689.2 8013.4 8592.9 6785.9
> vp9_inv_dct_dct_32x32_sub8_add_neon: 12217.8 9068.1 9420.4 7518.3
> vp9_inv_dct_dct_32x32_sub12_add_neon: 12967.3 10455.5 10223.9 8275.7
> vp9_inv_dct_dct_32x32_sub16_add_neon: 14084.1 11933.7 10998.9 9012.5
> vp9_inv_dct_dct_32x32_sub20_add_neon: 15171.4 13335.0 11820.6 9757.2
> vp9_inv_dct_dct_32x32_sub24_add_neon: 16229.6 15185.7 12614.4 10504.9
> vp9_inv_dct_dct_32x32_sub28_add_neon: 17338.1 15955.3 13445.0 11248.4
> vp9_inv_dct_dct_32x32_sub32_add_neon: 18465.7 16974.6 14239.2 11999.1
>
> I.e. in general a very minor overhead for the full subpartition case due
> to the additional cmps, but a significant speedup for the cases when we
> only need to process a small part of the actual input data.
>
> In common VP9 content in a few inspected clips, 70-90% of the non-dc-only
> 16x16 and 32x32 IDCTs only have nonzero coefficients in the upper left
> 8x8 or 16x16 subpartitions respectively.
> ---
> This goes on top of the checkasm vp9dsp patch that adds benchmarking
> of generic subpartitions in the itxfm.
> ---
> libavcodec/arm/vp9itxfm_neon.S | 70
> ++++++++++++++++++++++++++++++++++++------
> tests/checkasm/vp9dsp.c | 6 ++--
> 2 files changed, 64 insertions(+), 12 deletions(-)
>
> diff --git a/libavcodec/arm/vp9itxfm_neon.S b/libavcodec/arm/vp9itxfm_neon.S
> index 01944bd..769579a 100644
> --- a/libavcodec/arm/vp9itxfm_neon.S
> +++ b/libavcodec/arm/vp9itxfm_neon.S
> @@ -659,10 +659,17 @@ endfunc
> @ Read a vertical 4x16 slice out of a 16x16 matrix, do a transform on it,
> @ transpose into a horizontal 16x4 slice and store.
> @ r0 = dst (temp buffer)
> -@ r1 = unused
> +@ r1 = slice offset
> @ r2 = src
> -@ r3 = slice offset
> +@ r3 = eob
> +@ r9 = min eob
> function \txfm\()16_1d_4x16_pass1_neon
> +.ifc \txfm,idct
> + @ Check if this whole input slice is zero
> + cmp r3, r9
> + ble 2f
Once this check is true, it stays true for all remaining slices (the min eob
thresholds only increase), so we should move it out to the main function.
> +.endif
> +
> mov r12, #32
> vmov.s16 q2, #0
> .irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
> @@ -678,14 +685,14 @@ function \txfm\()16_1d_4x16_pass1_neon
> transpose16_q_4x_4x4 q8, q9, q10, q11, q12, q13, q14, q15, d16,
> d17, d18, d19, d20, d21, d22, d23, d24, d25, d26, d27, d28, d29, d30, d31
>
> @ Store the transposed 4x4 blocks horizontally.
> - cmp r3, #12
> + cmp r1, #12
> beq 1f
> .irp i, 16, 20, 24, 28, 17, 21, 25, 29, 18, 22, 26, 30, 19, 23, 27, 31
> vst1.16 {d\i}, [r0,:64]!
> .endr
> bx lr
> 1:
> - @ Special case: For the last input column (r3 == 12),
> + @ Special case: For the last input column (r1 == 12),
> @ which would be stored as the last row in the temp buffer,
> @ don't store the first 4x4 block, but keep it in registers
> @ for the first slice of the second pass (where it is the
> @@ -711,6 +718,18 @@ function \txfm\()16_1d_4x16_pass1_neon
> vmov d30, d18
> vmov d31, d19
> bx lr
> +
> +.ifc \txfm,idct
> +2:
> + @ Set d28-d31 to zero, for the in-register passthrough of
> coefficients to pass 2
> + vmov.i16 q14, #0
> + vmov.i16 q15, #0
> + @ Write zeros to the temp buffer for pass 2
> +.rept 4
> + vst1.16 {q14-q15}, [r0,:128]!
> +.endr
> + bx lr
> +.endif
> endfunc
>
> @ Read a vertical 4x16 slice out of a 16x16 matrix, do a transform on it,
> @@ -781,15 +800,23 @@ endfunc
> itxfm16_1d_funcs idct
> itxfm16_1d_funcs iadst
>
> +@ This is the minimum eob value for each subpartition, in increments of 4
> +const min_eob_idct_idct_16, align=4
> + .short 0, 10, 38, 89
> +endconst
> +
> .macro itxfm_func16x16 txfm1, txfm2
> function ff_vp9_\txfm1\()_\txfm2\()_16x16_add_neon, export=1
> .ifc \txfm1\()_\txfm2,idct_idct
> cmp r3, #1
> beq idct16x16_dc_add_neon
> .endif
> - push {r4-r7,lr}
> + push {r4-r9,lr}
> .ifnc \txfm1\()_\txfm2,idct_idct
> vpush {q4-q7}
> + mov r9, #0
> +.else
> + movrel r8, min_eob_idct_idct_16
> .endif
>
> @ Align the stack, allocate a temp buffer
> @@ -810,8 +837,11 @@ A and r7, sp, #15
>
> .irp i, 0, 4, 8, 12
> add r0, sp, #(\i*32)
> + mov r1, #\i
> add r2, r6, #(\i*2)
> - mov r3, #\i
> +.ifc \txfm1\()_\txfm2,idct_idct
> + ldrh r9, [r8, #(\i/2)]
Using the writeback variant would, IMO, look clearer, although it increases
the code size for Thumb (if we care about that).
> +.endif
Move this to the beginning and load into r1, cmp it with eob, conditionally
store how much stack space needs to be cleared, and jump out of the '.irp'.
That saves r9, as well as several jumps and comparisons if eob is small.
> bl \txfm1\()16_1d_4x16_pass1_neon
> .endr
> .ifc \txfm2,idct
> @@ -830,7 +860,7 @@ A and r7, sp, #15
> .ifnc \txfm1\()_\txfm2,idct_idct
> vpop {q4-q7}
> .endif
> - pop {r4-r7,pc}
> + pop {r4-r9,pc}
> endfunc
> .endm
>
> @@ -944,9 +974,14 @@ endfunc
> @ each output written twice), followed by a separate 16-point IDCT
> @ of the odd inputs, added/subtracted onto the outputs of the first idct16.
> @ r0 = dst (temp buffer)
> -@ r1 = unused
> +@ r1 = min eob
> @ r2 = src
> +@ r3 = eob
> function idct32_1d_4x32_pass1_neon
> + @ Check if this whole input slice is zero
> + cmp r3, r1
> + ble 1f
The same applies here as for the 16x16 idct.
Janne
_______________________________________________
libav-devel mailing list
[email protected]
https://lists.libav.org/mailman/listinfo/libav-devel