On 2016-11-28 11:26:02 +0200, Martin Storsjö wrote: > This work is sponsored by, and copyright, Google. > > Previously all subpartitions except the eob=1 (DC) case ran with > the same runtime: > > vp9_inv_dct_dct_16x16_sub16_add_neon: 1373.2 > vp9_inv_dct_dct_32x32_sub32_add_neon: 8089.0 > > By skipping individual 8x16 or 8x32 pixel slices in the first pass, > we reduce the runtime of these functions like this: > > vp9_inv_dct_dct_16x16_sub1_add_neon: 235.3 > vp9_inv_dct_dct_16x16_sub2_add_neon: 1043.7 > vp9_inv_dct_dct_16x16_sub4_add_neon: 1045.3 > vp9_inv_dct_dct_16x16_sub8_add_neon: 1043.7 > vp9_inv_dct_dct_16x16_sub12_add_neon: 1374.0 > vp9_inv_dct_dct_16x16_sub16_add_neon: 1368.7 > vp9_inv_dct_dct_32x32_sub1_add_neon: 555.6 > vp9_inv_dct_dct_32x32_sub2_add_neon: 5180.0 > vp9_inv_dct_dct_32x32_sub4_add_neon: 5175.1 > vp9_inv_dct_dct_32x32_sub8_add_neon: 5186.6 > vp9_inv_dct_dct_32x32_sub12_add_neon: 6159.5 > vp9_inv_dct_dct_32x32_sub16_add_neon: 6162.7 > vp9_inv_dct_dct_32x32_sub20_add_neon: 7129.0 > vp9_inv_dct_dct_32x32_sub24_add_neon: 7133.1 > vp9_inv_dct_dct_32x32_sub28_add_neon: 8107.1 > vp9_inv_dct_dct_32x32_sub32_add_neon: 8105.6 > > I.e. in general a very minor overhead for the full subpartition case due > to the additional cmps, but a significant speedup for the cases when we > only need to process a small part of the actual input data. > --- > Updated based on Janne's review of the arm version. 
> --- > libavcodec/aarch64/vp9itxfm_neon.S | 60 > ++++++++++++++++++++++++++++++++++---- > 1 file changed, 55 insertions(+), 5 deletions(-) > > diff --git a/libavcodec/aarch64/vp9itxfm_neon.S > b/libavcodec/aarch64/vp9itxfm_neon.S > index f4194a6..9d2ba11 100644 > --- a/libavcodec/aarch64/vp9itxfm_neon.S > +++ b/libavcodec/aarch64/vp9itxfm_neon.S > @@ -588,6 +588,9 @@ endfunc > .macro store i, dst, inc > st1 {v\i\().8h}, [\dst], \inc > .endm > +.macro movi_v i, size, imm > + movi v\i\()\size, \imm > +.endm > .macro load_clear i, src, inc > ld1 {v\i\().8h}, [\src] > st1 {v2.8h}, [\src], \inc > @@ -596,9 +599,8 @@ endfunc > // Read a vertical 8x16 slice out of a 16x16 matrix, do a transform on it, > // transpose into a horizontal 16x8 slice and store. > // x0 = dst (temp buffer) > -// x1 = unused > +// x1 = slice offset > // x2 = src > -// x3 = slice offset > // x9 = input stride > .macro itxfm16_1d_funcs txfm > function \txfm\()16_1d_8x16_pass1_neon > @@ -616,14 +618,14 @@ function \txfm\()16_1d_8x16_pass1_neon > transpose_8x8H v24, v25, v26, v27, v28, v29, v30, v31, v2, v3 > > // Store the transposed 8x8 blocks horizontally. 
> - cmp x3, #8 > + cmp x1, #8 > b.eq 1f > .irp i, 16, 24, 17, 25, 18, 26, 19, 27, 20, 28, 21, 29, 22, 30, 23, 31 > store \i, x0, #16 > .endr > ret > 1: > - // Special case: For the last input column (x3 == 8), > + // Special case: For the last input column (x1 == 8), > // which would be stored as the last row in the temp buffer, > // don't store the first 8x8 block, but keep it in registers > // for the first slice of the second pass (where it is the > @@ -751,13 +753,35 @@ function ff_vp9_\txfm1\()_\txfm2\()_16x16_add_neon, > export=1 > > .irp i, 0, 8 > add x0, sp, #(\i*32) > +.ifc \txfm1\()_\txfm2,idct_idct > +.if \i == 8 > + cmp w3, #38 > + b.le 1f > +.endif > +.endif > + mov x1, #\i > add x2, x6, #(\i*2) > - mov x3, #\i > bl \txfm1\()16_1d_8x16_pass1_neon > .endr > .ifc \txfm1\()_\txfm2,iadst_idct > ld1 {v0.8h,v1.8h}, [x10] > .endif > + > +.ifc \txfm1\()_\txfm2,idct_idct > + b 3f > +1: > + // Set v24-v31 to zero, for the in-register passthrough of > + // coefficients to pass 2. Since we only do two slices, this can > + // only ever happen for the second slice. So we only need to store > + // zeros to the temp buffer for the second half of the buffer. > +.irp i, 24, 25, 26, 27, 28, 29, 30, 31 > + add x0, x0, #16 > + movi_v \i, .16b, #0 > + store 24, x0, #16 > +.endr
Not really pretty; unfortunately I don't see much room for improvement. IIRC we should have a GPR which holds #32: move the add out of the .irp and use that register as the writeback increment. > +3: > +.endif > + > .irp i, 0, 8 > add x0, x4, #(\i) > mov x1, x5 > @@ -1073,12 +1097,17 @@ function idct32_1d_8x32_pass2_neon > ret > endfunc > > +const min_eob_idct_idct_32, align=4 > + .short 0, 34, 135, 336 > +endconst > + > function ff_vp9_idct_idct_32x32_add_neon, export=1 > cmp w3, #1 > b.eq idct32x32_dc_add_neon > > movrel x10, idct_coeffs > add x11, x10, #32 > + movrel x12, min_eob_idct_idct_32 + 2 > > mov x15, x30 > > @@ -1099,9 +1128,30 @@ function ff_vp9_idct_idct_32x32_add_neon, export=1 > > .irp i, 0, 8, 16, 24 > add x0, sp, #(\i*64) > +.if \i > 0 > + ldrh w1, [x12], #2 > + cmp w3, w1 > + mov x1, #(32 - \i)/4 > + b.le 1f > +.endif > add x2, x6, #(\i*2) > bl idct32_1d_8x32_pass1_neon > .endr > + b 3f > + > +1: > + // Write zeros to the temp buffer for pass 2 > + movi v16.8h, #0 > + movi v17.8h, #0 > + movi v18.8h, #0 > + movi v19.8h, #0 > +2: > + subs x1, x1, #1 > +.rept 4 > + st1 {v16.8h-v19.8h}, [x0], #64 > +.endr > + b.ne 2b > +3: > .irp i, 0, 8, 16, 24 > add x0, x4, #(\i) > mov x1, x5 Otherwise OK. Janne _______________________________________________ libav-devel mailing list libav-devel@libav.org https://lists.libav.org/mailman/listinfo/libav-devel