On 2016-11-28 11:26:02 +0200, Martin Storsjö wrote:
> This work is sponsored by, and copyright, Google.
>
> Previously all subpartitions except the eob=1 (DC) case ran with
> the same runtime:
>
> vp9_inv_dct_dct_16x16_sub16_add_neon: 1373.2
> vp9_inv_dct_dct_32x32_sub32_add_neon: 8089.0
>
> By skipping individual 8x16 or 8x32 pixel slices in the first pass,
> we reduce the runtime of these functions like this:
>
> vp9_inv_dct_dct_16x16_sub1_add_neon: 235.3
> vp9_inv_dct_dct_16x16_sub2_add_neon: 1043.7
> vp9_inv_dct_dct_16x16_sub4_add_neon: 1045.3
> vp9_inv_dct_dct_16x16_sub8_add_neon: 1043.7
> vp9_inv_dct_dct_16x16_sub12_add_neon: 1374.0
> vp9_inv_dct_dct_16x16_sub16_add_neon: 1368.7
> vp9_inv_dct_dct_32x32_sub1_add_neon: 555.6
> vp9_inv_dct_dct_32x32_sub2_add_neon: 5180.0
> vp9_inv_dct_dct_32x32_sub4_add_neon: 5175.1
> vp9_inv_dct_dct_32x32_sub8_add_neon: 5186.6
> vp9_inv_dct_dct_32x32_sub12_add_neon: 6159.5
> vp9_inv_dct_dct_32x32_sub16_add_neon: 6162.7
> vp9_inv_dct_dct_32x32_sub20_add_neon: 7129.0
> vp9_inv_dct_dct_32x32_sub24_add_neon: 7133.1
> vp9_inv_dct_dct_32x32_sub28_add_neon: 8107.1
> vp9_inv_dct_dct_32x32_sub32_add_neon: 8105.6
>
> I.e. in general a very minor overhead for the full subpartition case due
> to the additional cmps, but a significant speedup for the cases when we
> only need to process a small part of the actual input data.
> ---
> Updated based on Janne's review of the arm version.
> ---
> libavcodec/aarch64/vp9itxfm_neon.S | 60
> ++++++++++++++++++++++++++++++++++----
> 1 file changed, 55 insertions(+), 5 deletions(-)
>
> diff --git a/libavcodec/aarch64/vp9itxfm_neon.S
> b/libavcodec/aarch64/vp9itxfm_neon.S
> index f4194a6..9d2ba11 100644
> --- a/libavcodec/aarch64/vp9itxfm_neon.S
> +++ b/libavcodec/aarch64/vp9itxfm_neon.S
> @@ -588,6 +588,9 @@ endfunc
> .macro store i, dst, inc
> st1 {v\i\().8h}, [\dst], \inc
> .endm
> +.macro movi_v i, size, imm
> + movi v\i\()\size, \imm
> +.endm
> .macro load_clear i, src, inc
> ld1 {v\i\().8h}, [\src]
> st1 {v2.8h}, [\src], \inc
> @@ -596,9 +599,8 @@ endfunc
> // Read a vertical 8x16 slice out of a 16x16 matrix, do a transform on it,
> // transpose into a horizontal 16x8 slice and store.
> // x0 = dst (temp buffer)
> -// x1 = unused
> +// x1 = slice offset
> // x2 = src
> -// x3 = slice offset
> // x9 = input stride
> .macro itxfm16_1d_funcs txfm
> function \txfm\()16_1d_8x16_pass1_neon
> @@ -616,14 +618,14 @@ function \txfm\()16_1d_8x16_pass1_neon
> transpose_8x8H v24, v25, v26, v27, v28, v29, v30, v31, v2, v3
>
> // Store the transposed 8x8 blocks horizontally.
> - cmp x3, #8
> + cmp x1, #8
> b.eq 1f
> .irp i, 16, 24, 17, 25, 18, 26, 19, 27, 20, 28, 21, 29, 22, 30, 23, 31
> store \i, x0, #16
> .endr
> ret
> 1:
> - // Special case: For the last input column (x3 == 8),
> + // Special case: For the last input column (x1 == 8),
> // which would be stored as the last row in the temp buffer,
> // don't store the first 8x8 block, but keep it in registers
> // for the first slice of the second pass (where it is the
> @@ -751,13 +753,35 @@ function ff_vp9_\txfm1\()_\txfm2\()_16x16_add_neon,
> export=1
>
> .irp i, 0, 8
> add x0, sp, #(\i*32)
> +.ifc \txfm1\()_\txfm2,idct_idct
> +.if \i == 8
> + cmp w3, #38
> + b.le 1f
> +.endif
> +.endif
> + mov x1, #\i
> add x2, x6, #(\i*2)
> - mov x3, #\i
> bl \txfm1\()16_1d_8x16_pass1_neon
> .endr
> .ifc \txfm1\()_\txfm2,iadst_idct
> ld1 {v0.8h,v1.8h}, [x10]
> .endif
> +
> +.ifc \txfm1\()_\txfm2,idct_idct
> + b 3f
> +1:
> + // Set v24-v31 to zero, for the in-register passthrough of
> + // coefficients to pass 2. Since we only do two slices, this can
> + // only ever happen for the second slice. So we only need to store
> + // zeros to the temp buffer for the second half of the buffer.
> +.irp i, 24, 25, 26, 27, 28, 29, 30, 31
> + add x0, x0, #16
> + movi_v \i, .16b, #0
> + store 24, x0, #16
> +.endr
Not really pretty; unfortunately I don't see much room for improvement.
IIRC we should already have a GPR which holds #32: move the `add` out of the
`.irp` block and use that register as the post-increment writeback instead.
> +3:
> +.endif
> +
> .irp i, 0, 8
> add x0, x4, #(\i)
> mov x1, x5
> @@ -1073,12 +1097,17 @@ function idct32_1d_8x32_pass2_neon
> ret
> endfunc
>
> +const min_eob_idct_idct_32, align=4
> + .short 0, 34, 135, 336
> +endconst
> +
> function ff_vp9_idct_idct_32x32_add_neon, export=1
> cmp w3, #1
> b.eq idct32x32_dc_add_neon
>
> movrel x10, idct_coeffs
> add x11, x10, #32
> + movrel x12, min_eob_idct_idct_32 + 2
>
> mov x15, x30
>
> @@ -1099,9 +1128,30 @@ function ff_vp9_idct_idct_32x32_add_neon, export=1
>
> .irp i, 0, 8, 16, 24
> add x0, sp, #(\i*64)
> +.if \i > 0
> + ldrh w1, [x12], #2
> + cmp w3, w1
> + mov x1, #(32 - \i)/4
> + b.le 1f
> +.endif
> add x2, x6, #(\i*2)
> bl idct32_1d_8x32_pass1_neon
> .endr
> + b 3f
> +
> +1:
> + // Write zeros to the temp buffer for pass 2
> + movi v16.8h, #0
> + movi v17.8h, #0
> + movi v18.8h, #0
> + movi v19.8h, #0
> +2:
> + subs x1, x1, #1
> +.rept 4
> + st1 {v16.8h-v19.8h}, [x0], #64
> +.endr
> + b.ne 2b
> +3:
> .irp i, 0, 8, 16, 24
> add x0, x4, #(\i)
> mov x1, x5
Otherwise this looks OK to me.
Janne
_______________________________________________
libav-devel mailing list
[email protected]
https://lists.libav.org/mailman/listinfo/libav-devel