On 2016-12-01 11:26:58 +0200, Martin Storsjö wrote:
> This work is sponsored by, and copyright, Google.
>
> This makes it easier to avoid filling the temp buffer with zeros for the
> skipped slices, and leads to slightly more straightforward code for these
> cases (for the 16x16 case, where the special case pass functions are written
> out instead of templated from the same macro), instead of riddling the common
> code with special case branches or macro .ifs.
>
> The code size increases from 12388 bytes to 19932 bytes.
>
> Before: Cortex A7 A8 A9 A53
> vp9_inv_dct_dct_16x16_sub1_add_neon: 273.0 189.7 211.9 235.8
> vp9_inv_dct_dct_16x16_sub2_add_neon: 2056.7 1521.2 1734.8 1262.0
> vp9_inv_dct_dct_16x16_sub4_add_neon: 2060.8 1608.5 1735.7 1262.0
> vp9_inv_dct_dct_16x16_sub8_add_neon: 2444.9 1801.6 2007.8 1508.5
> vp9_inv_dct_dct_16x16_sub12_add_neon: 2902.1 2116.7 2285.1 1751.7
> vp9_inv_dct_dct_16x16_sub16_add_neon: 3211.2 2443.5 2546.1 1999.5
> vp9_inv_dct_dct_32x32_sub1_add_neon: 752.0 456.7 866.0 553.9
> vp9_inv_dct_dct_32x32_sub2_add_neon: 11042.7 8127.5 8582.7 6822.8
> vp9_inv_dct_dct_32x32_sub4_add_neon: 10682.0 8043.8 8581.3 6810.1
> vp9_inv_dct_dct_32x32_sub8_add_neon: 11908.0 9281.8 9381.9 7562.4
> vp9_inv_dct_dct_32x32_sub12_add_neon: 13015.2 10791.1 10220.3 8318.9
> vp9_inv_dct_dct_32x32_sub16_add_neon: 14150.3 11886.2 11032.6 9064.8
> vp9_inv_dct_dct_32x32_sub20_add_neon: 15165.7 12993.8 11847.0 9816.7
> vp9_inv_dct_dct_32x32_sub24_add_neon: 16280.8 15111.2 12658.6 10576.8
> vp9_inv_dct_dct_32x32_sub28_add_neon: 17412.6 15549.4 13462.7 11325.6
> vp9_inv_dct_dct_32x32_sub32_add_neon: 18522.4 17277.4 14286.7 12087.9
>
> After:
> vp9_inv_dct_dct_16x16_sub1_add_neon: 274.4 189.5 211.7 235.8
> vp9_inv_dct_dct_16x16_sub2_add_neon: 1214.2 962.0 1034.4 764.0
> vp9_inv_dct_dct_16x16_sub4_add_neon: 1214.5 911.0 1034.7 763.9
> vp9_inv_dct_dct_16x16_sub8_add_neon: 2000.6 1601.9 1729.0 1286.4
> vp9_inv_dct_dct_16x16_sub12_add_neon: 2854.3 2122.2 2292.9 1757.6
> vp9_inv_dct_dct_16x16_sub16_add_neon: 3231.1 2477.9 2544.6 2005.7
> vp9_inv_dct_dct_32x32_sub1_add_neon: 756.1 460.3 865.3 553.9
> vp9_inv_dct_dct_32x32_sub2_add_neon: 7603.7 5469.8 6046.2 4922.6
> vp9_inv_dct_dct_32x32_sub4_add_neon: 7586.9 5740.2 6061.5 4921.5
> vp9_inv_dct_dct_32x32_sub8_add_neon: 8380.7 6554.4 6600.4 5476.3
> vp9_inv_dct_dct_32x32_sub12_add_neon: 11005.8 9856.2 9242.4 7462.3
> vp9_inv_dct_dct_32x32_sub16_add_neon: 11959.7 10698.5 9998.0 8134.5
> vp9_inv_dct_dct_32x32_sub20_add_neon: 15250.8 13175.6 11854.4 9825.7
> vp9_inv_dct_dct_32x32_sub24_add_neon: 16382.3 14501.4 12671.7 10579.5
> vp9_inv_dct_dct_32x32_sub28_add_neon: 17521.2 16403.8 13486.3 11331.2
> vp9_inv_dct_dct_32x32_sub32_add_neon: 18630.8 17398.7 14383.2 12089.4
>
> ---
> If we wouldn't have made the core transforms standalone functions,
> the code size would end up at around 26 KB.
>
> The binary output is 4 KB larger than in the other alternative,
> but is more straightforward and gives better opportunities to
> special case them, and is a couple hundred cycles faster for the
> small subpartitions.
> ---
> libavcodec/arm/vp9itxfm_neon.S | 645
> ++++++++++++++++++++++++++++++++++++++---
> 1 file changed, 601 insertions(+), 44 deletions(-)
>
> diff --git a/libavcodec/arm/vp9itxfm_neon.S b/libavcodec/arm/vp9itxfm_neon.S
> index 22e63e5..4bba4b9 100644
> --- a/libavcodec/arm/vp9itxfm_neon.S
> +++ b/libavcodec/arm/vp9itxfm_neon.S
> @@ -74,6 +74,14 @@ endconst
> vrshrn.s32 \out2, \tmpq4, #14
> .endm
>
> +@ Same as mbutterfly0 above, but treating the input in in2 as zero,
> +@ writing the same output into both out1 and out2.
> +.macro mbutterfly0_h out1, out2, in1, in2, tmpd1, tmpd2, tmpq3, tmpq4
> + vmull.s16 \tmpq3, \in1, d0[0]
> + vrshrn.s32 \out1, \tmpq3, #14
> + vmov \out2, \out1
> +.endm
The same comment as for the other mbutterfly0 variant applies here.
> +
> @ out1,out2 = ((in1 + in2) * d0[0] + (1 << 13)) >> 14
> @ out3,out4 = ((in1 - in2) * d0[0] + (1 << 13)) >> 14
> @ Same as mbutterfly0, but with input being 2 q registers, output
> @@ -137,6 +145,23 @@ endconst
> vrshrn.s32 \inout2, \tmp2, #14
> .endm
>
> +@ Same as mbutterfly above, but treating the input in inout2 as zero
> +.macro mbutterfly_h1 inout1, inout2, coef1, coef2, tmp1, tmp2
> + vmull.s16 \tmp1, \inout1, \coef1
> + vmull.s16 \tmp2, \inout1, \coef2
> + vrshrn.s32 \inout1, \tmp1, #14
> + vrshrn.s32 \inout2, \tmp2, #14
> +.endm
> +
> +@ Same as mbutterfly above, but treating the input in inout1 as zero
> +.macro mbutterfly_h2 inout1, inout2, coef1, coef2, tmp1, tmp2
> + vmull.s16 \tmp1, \inout2, \coef2
> + vmull.s16 \tmp2, \inout2, \coef1
> + vneg.s32 \tmp1, \tmp1
> + vrshrn.s32 \inout2, \tmp2, #14
> + vrshrn.s32 \inout1, \tmp1, #14
> +.endm
> +
> @ inout1,inout2 = (inout1,inout2 * coef1 - inout3,inout4 * coef2 + (1 <<
> 13)) >> 14
> @ inout3,inout4 = (inout1,inout2 * coef2 + inout3,inout4 * coef1 + (1 <<
> 13)) >> 14
> @ inout are 4 d registers, tmp are 4 q registers
> @@ -534,7 +559,7 @@ function idct16x16_dc_add_neon
> endfunc
> .ltorg
>
> -function idct16
> +.macro idct16_full
> mbutterfly0 d16, d24, d16, d24, d4, d6, q2, q3 @ d16 = t0a,
> d24 = t1a
> mbutterfly d20, d28, d0[1], d0[2], q2, q3 @ d20 = t2a, d28 =
> t3a
> mbutterfly d18, d30, d0[3], d1[0], q2, q3 @ d18 = t4a, d30 =
> t7a
> @@ -556,7 +581,10 @@ function idct16
> mbutterfly0 d22, d26, d22, d26, d18, d30, q9, q15 @ d22 = t6a,
> d26 = t5a
> mbutterfly d23, d25, d0[1], d0[2], q9, q15 @ d23 = t9a,
> d25 = t14a
> mbutterfly d27, d21, d0[1], d0[2], q9, q15, neg=1 @ d27 =
> t13a, d21 = t10a
> + idct16_end
> +.endm
>
> +.macro idct16_end
> butterfly d18, d7, d4, d7 @ d18 = t0a, d7 =
> t7a
> butterfly d19, d22, d5, d22 @ d19 = t1a, d22 =
> t6
> butterfly d4, d26, d20, d26 @ d4 = t2a, d26 =
> t5
> @@ -581,6 +609,66 @@ function idct16
> butterfly d20, d27, d6, d27 @ d20 = out[4], d27
> = out[11]
> butterfly d21, d26, d26, d4 @ d21 = out[5], d26
> = out[10]
> bx lr
> +.endm
> +
> +function idct16
> + idct16_full
> +endfunc
> +
> +function idct16_half
> + mbutterfly0_h d16, d24, d16, d24, d4, d6, q2, q3 @ d16 = t0a,
> d24 = t1a
> + mbutterfly_h1 d20, d28, d0[1], d0[2], q2, q3 @ d20 = t2a, d28 =
> t3a
> + mbutterfly_h1 d18, d30, d0[3], d1[0], q2, q3 @ d18 = t4a, d30 =
> t7a
> + mbutterfly_h2 d26, d22, d1[1], d1[2], q2, q3 @ d26 = t5a, d22 =
> t6a
> + mbutterfly_h1 d17, d31, d1[3], d2[0], q2, q3 @ d17 = t8a, d31 =
> t15a
> + mbutterfly_h2 d25, d23, d2[1], d2[2], q2, q3 @ d25 = t9a, d23 =
> t14a
> + mbutterfly_h1 d21, d27, d2[3], d3[0], q2, q3 @ d21 = t10a, d27 =
> t13a
> + mbutterfly_h2 d29, d19, d3[1], d3[2], q2, q3 @ d29 = t11a, d19 =
> t12a
> +
> + butterfly d4, d28, d16, d28 @ d4 = t0, d28 =
> t3
> + butterfly d5, d20, d24, d20 @ d5 = t1, d20 =
> t2
> + butterfly d6, d26, d18, d26 @ d6 = t4, d26 =
> t5
> + butterfly d7, d22, d30, d22 @ d7 = t7, d22 =
> t6
> + butterfly d16, d25, d17, d25 @ d16 = t8, d25 =
> t9
> + butterfly d24, d21, d29, d21 @ d24 = t11, d21 =
> t10
> + butterfly d17, d27, d19, d27 @ d17 = t12, d27 =
> t13
> + butterfly d29, d23, d31, d23 @ d29 = t15, d23 =
> t14
> +
> + mbutterfly0 d22, d26, d22, d26, d18, d30, q9, q15 @ d22 = t6a,
> d26 = t5a
> + mbutterfly d23, d25, d0[1], d0[2], q9, q15 @ d23 = t9a,
> d25 = t14a
> + mbutterfly d27, d21, d0[1], d0[2], q9, q15, neg=1 @ d27 =
> t13a, d21 = t10a
> + idct16_end
> +endfunc
> +
> +function idct16_quarter
> + vmull.s16 q12, d19, d3[2]
> + vmull.s16 q2, d17, d1[3]
> + vmull.s16 q3, d18, d1[0]
> + vmull.s16 q15, d18, d0[3]
> + vneg.s32 q12, q12
> + vmull.s16 q14, d17, d2[0]
> + vmull.s16 q13, d19, d3[1]
> + vmull.s16 q11, d16, d0[0]
> + vrshrn.s32 d24, q12, #14
> + vrshrn.s32 d16, q2, #14
> + vrshrn.s32 d7, q3, #14
> + vrshrn.s32 d6, q15, #14
> + vrshrn.s32 d29, q14, #14
> + vrshrn.s32 d17, q13, #14
> + vrshrn.s32 d28, q11, #14
> +
> + mbutterfly_l q10, q11, d17, d24, d0[1], d0[2]
> + mbutterfly_l q9, q15, d29, d16, d0[1], d0[2]
> + vneg.s32 q11, q11
> + vrshrn.s32 d27, q10, #14
> + vrshrn.s32 d21, q11, #14
> + vrshrn.s32 d23, q9, #14
> + vrshrn.s32 d25, q15, #14
> + vmov d4, d28
> + vmov d5, d28
> + mbutterfly0 d22, d26, d7, d6, d18, d30, q9, q15
> + vmov d20, d28
> + idct16_end
> endfunc
>
> function iadst16
> @@ -657,6 +745,42 @@ function iadst16
> bx lr
> endfunc
>
> +.macro load_add_store coef0, coef1, coef2, coef3
> + vrshr.s16 \coef0, \coef0, #6
> + vrshr.s16 \coef1, \coef1, #6
> +
> + vld1.32 {d4[]}, [r0,:32], r1
> + vld1.32 {d4[1]}, [r3,:32], r1
> + vrshr.s16 \coef2, \coef2, #6
> + vrshr.s16 \coef3, \coef3, #6
> + vld1.32 {d5[]}, [r0,:32], r1
> + vld1.32 {d5[1]}, [r3,:32], r1
> + vaddw.u8 \coef0, \coef0, d4
> + vld1.32 {d6[]}, [r0,:32], r1
> + vld1.32 {d6[1]}, [r3,:32], r1
> + vaddw.u8 \coef1, \coef1, d5
> + vld1.32 {d7[]}, [r0,:32], r1
> + vld1.32 {d7[1]}, [r3,:32], r1
> +
> + vqmovun.s16 d4, \coef0
> + vqmovun.s16 d5, \coef1
> + sub r0, r0, r1, lsl #2
> + sub r3, r3, r1, lsl #2
> + vaddw.u8 \coef2, \coef2, d6
> + vaddw.u8 \coef3, \coef3, d7
> + vst1.32 {d4[0]}, [r0,:32], r1
> + vst1.32 {d4[1]}, [r3,:32], r1
> + vqmovun.s16 d6, \coef2
> + vst1.32 {d5[0]}, [r0,:32], r1
> + vst1.32 {d5[1]}, [r3,:32], r1
> + vqmovun.s16 d7, \coef3
> +
> + vst1.32 {d6[0]}, [r0,:32], r1
> + vst1.32 {d6[1]}, [r3,:32], r1
> + vst1.32 {d7[0]}, [r0,:32], r1
> + vst1.32 {d7[1]}, [r3,:32], r1
> +.endm
> +
> .macro itxfm16_1d_funcs txfm
> @ Read a vertical 4x16 slice out of a 16x16 matrix, do a transform on it,
> @ transpose into a horizontal 16x4 slice and store.
> @@ -739,44 +863,8 @@ function \txfm\()16_1d_4x16_pass2_neon
> lsl r1, r1, #1
> bl \txfm\()16
>
> -.macro load_add_store coef0, coef1, coef2, coef3
> - vrshr.s16 \coef0, \coef0, #6
> - vrshr.s16 \coef1, \coef1, #6
> -
> - vld1.32 {d4[]}, [r0,:32], r1
> - vld1.32 {d4[1]}, [r3,:32], r1
> - vrshr.s16 \coef2, \coef2, #6
> - vrshr.s16 \coef3, \coef3, #6
> - vld1.32 {d5[]}, [r0,:32], r1
> - vld1.32 {d5[1]}, [r3,:32], r1
> - vaddw.u8 \coef0, \coef0, d4
> - vld1.32 {d6[]}, [r0,:32], r1
> - vld1.32 {d6[1]}, [r3,:32], r1
> - vaddw.u8 \coef1, \coef1, d5
> - vld1.32 {d7[]}, [r0,:32], r1
> - vld1.32 {d7[1]}, [r3,:32], r1
> -
> - vqmovun.s16 d4, \coef0
> - vqmovun.s16 d5, \coef1
> - sub r0, r0, r1, lsl #2
> - sub r3, r3, r1, lsl #2
> - vaddw.u8 \coef2, \coef2, d6
> - vaddw.u8 \coef3, \coef3, d7
> - vst1.32 {d4[0]}, [r0,:32], r1
> - vst1.32 {d4[1]}, [r3,:32], r1
> - vqmovun.s16 d6, \coef2
> - vst1.32 {d5[0]}, [r0,:32], r1
> - vst1.32 {d5[1]}, [r3,:32], r1
> - vqmovun.s16 d7, \coef3
> -
> - vst1.32 {d6[0]}, [r0,:32], r1
> - vst1.32 {d6[1]}, [r3,:32], r1
> - vst1.32 {d7[0]}, [r0,:32], r1
> - vst1.32 {d7[1]}, [r3,:32], r1
> -.endm
> load_add_store q8, q9, q10, q11
> load_add_store q12, q13, q14, q15
> -.purgem load_add_store
>
> pop {pc}
> endfunc
> @@ -795,6 +883,10 @@ function ff_vp9_\txfm1\()_\txfm2\()_16x16_add_neon,
> export=1
> .ifc \txfm1\()_\txfm2,idct_idct
> cmp r3, #1
> beq idct16x16_dc_add_neon
> + cmp r3, #10
> + ble idct16x16_quarter_add_neon
> + cmp r3, #38
> + ble idct16x16_half_add_neon
> .endif
> push {r4-r8,lr}
> .ifnc \txfm1\()_\txfm2,idct_idct
> @@ -877,6 +969,210 @@ itxfm_func16x16 idct, iadst
> itxfm_func16x16 iadst, iadst
> .ltorg
>
> +function idct16_1d_4x16_pass1_quarter_neon
> + push {lr}
> + mov r12, #32
> + vmov.s16 q2, #0
> +.irp i, 16, 17, 18, 19
> + vld1.16 {d\i}, [r2,:64]
> + vst1.16 {d4}, [r2,:64], r12
> +.endr
> +
> + bl idct16_quarter
> +
> + @ Do four 4x4 transposes. Originally, d16-d31 contain the
> + @ 16 rows. Afterwards, d16-d19, d20-d23, d24-d27, d28-d31
> + @ contain the transposed 4x4 blocks.
> + transpose16_q_4x_4x4 q8, q9, q10, q11, q12, q13, q14, q15, d16,
> d17, d18, d19, d20, d21, d22, d23, d24, d25, d26, d27, d28, d29, d30, d31
> +
> + @ Store the transposed 4x4 blocks horizontally.
> + @ The first 4x4 block is kept in registers for the second pass,
> + @ store the rest in the temp buffer.
> + add r0, r0, #8
> +.irp i, 20, 24, 28
> + vst1.16 {d\i}, [r0,:64]!
> +.endr
Just writing the .irp out would use the same number of lines and is imho
clearer; the same applies below.
> + add r0, r0, #8
> +.irp i, 21, 25, 29
> + vst1.16 {d\i}, [r0,:64]!
> +.endr
> + add r0, r0, #8
> +.irp i, 22, 26, 30
> + vst1.16 {d\i}, [r0,:64]!
> +.endr
> + add r0, r0, #8
> +.irp i, 23, 27, 31
> + vst1.16 {d\i}, [r0,:64]!
> +.endr
> + pop {pc}
> +endfunc
> +
> +function idct16_1d_4x16_pass2_quarter_neon
> + push {lr}
> + cmp r3, #0
> + beq 1f
> + mov r12, #32
> + @ Only load the top 4 lines, and only do it for the later slices.
> + @ For the first slice, d16-d19 is kept in registers from the first
> pass.
The comment is oddly placed; please move it above the cmp.
> +.irp i, 16, 17, 18, 19
> + vld1.16 {d\i}, [r2,:64], r12
> +.endr
> +1:
> +
> + add r3, r0, r1
> + lsl r1, r1, #1
> + bl idct16_quarter
> +
> + load_add_store q8, q9, q10, q11
> + load_add_store q12, q13, q14, q15
> +
> + pop {pc}
> +endfunc
> +
> +function idct16_1d_4x16_pass1_half_neon
> + push {lr}
> + mov r12, #32
> + vmov.s16 q2, #0
> +.irp i, 16, 17, 18, 19, 20, 21, 22, 23
> + vld1.16 {d\i}, [r2,:64]
> + vst1.16 {d4}, [r2,:64], r12
> +.endr
> +
> + bl idct16_half
> +
> + @ Do four 4x4 transposes. Originally, d16-d31 contain the
> + @ 16 rows. Afterwards, d16-d19, d20-d23, d24-d27, d28-d31
> + @ contain the transposed 4x4 blocks.
> + transpose16_q_4x_4x4 q8, q9, q10, q11, q12, q13, q14, q15, d16,
> d17, d18, d19, d20, d21, d22, d23, d24, d25, d26, d27, d28, d29, d30, d31
> +
> + @ Store the transposed 4x4 blocks horizontally.
> + cmp r1, #4
> + beq 1f
> +.irp i, 16, 20, 24, 28, 17, 21, 25, 29, 18, 22, 26, 30, 19, 23, 27, 31
> + vst1.16 {d\i}, [r0,:64]!
> +.endr
> + pop {pc}
> +1:
> + @ Special case: For the second input column (r1 == 4),
> + @ which would be stored as the second row in the temp buffer,
> + @ don't store the first 4x4 block, but keep it in registers
> + @ for the first slice of the second pass (where it is the
> + @ second 4x4 block).
> + add r0, r0, #8
> +.irp i, 20, 24, 28
> + vst1.16 {d\i}, [r0,:64]!
> +.endr
> + add r0, r0, #8
> +.irp i, 21, 25, 29
> + vst1.16 {d\i}, [r0,:64]!
> +.endr
> + add r0, r0, #8
> +.irp i, 22, 26, 30
> + vst1.16 {d\i}, [r0,:64]!
> +.endr
> + add r0, r0, #8
> +.irp i, 23, 27, 31
> + vst1.16 {d\i}, [r0,:64]!
> +.endr
> + vmov d20, d16
> + vmov d21, d17
> + vmov d22, d18
> + vmov d23, d19
> + pop {pc}
> +endfunc
> +
> +function idct16_1d_4x16_pass2_half_neon
> + push {lr}
> + mov r12, #32
> + cmp r3, #0
> +.irp i, 16, 17, 18, 19
> + vld1.16 {d\i}, [r2,:64], r12
> +.endr
> + beq 1f
> +.irp i, 20, 21, 22, 23
> + vld1.16 {d\i}, [r2,:64], r12
> +.endr
> +1:
> +
> + add r3, r0, r1
> + lsl r1, r1, #1
> + bl idct16_half
> +
> + load_add_store q8, q9, q10, q11
> + load_add_store q12, q13, q14, q15
> +
> + pop {pc}
> +endfunc
> +.purgem load_add_store
> +
> +function idct16x16_quarter_add_neon
> + push {r4-r9,lr}
> +
> + @ Align the stack, allocate a temp buffer
> +T mov r7, sp
> +T and r7, r7, #15
> +A and r7, sp, #15
> + add r7, r7, #512
> + sub sp, sp, r7
> +
> + mov r4, r0
> + mov r5, r1
> + mov r6, r2
> +
> + movrel r12, idct_coeffs
> + vld1.16 {q0-q1}, [r12,:128]
> +
> +.irp i, 0
Not needed, unless idct16x16_half_add and idct16x16_quarter_add are
templated using a macro. The first .irp is still annoying though.
I would probably use a '.ifc \size, quarter ...; .else ...; .endif'.
Does the full idct have the same format?
> + add r0, sp, #(\i*32)
> + add r2, r6, #(\i*2)
> + bl idct16_1d_4x16_pass1_quarter_neon
> +.endr
> +.irp i, 0, 4, 8, 12
> + add r0, r4, #(\i)
> + mov r1, r5
> + add r2, sp, #(\i*2)
> + mov r3, #\i
> + bl idct16_1d_4x16_pass2_quarter_neon
> +.endr
> +
> + add sp, sp, r7
> + pop {r4-r9,pc}
> +endfunc
> +
> +function idct16x16_half_add_neon
> + push {r4-r9,lr}
> +
> + @ Align the stack, allocate a temp buffer
> +T mov r7, sp
> +T and r7, r7, #15
> +A and r7, sp, #15
> + add r7, r7, #512
> + sub sp, sp, r7
> +
> + mov r4, r0
> + mov r5, r1
> + mov r6, r2
> +
> + movrel r12, idct_coeffs
> + vld1.16 {q0-q1}, [r12,:128]
> +
> +.irp i, 0, 4
> + add r0, sp, #(\i*32)
> + mov r1, #\i
> + add r2, r6, #(\i*2)
> + bl idct16_1d_4x16_pass1_half_neon
> +.endr
> +.irp i, 0, 4, 8, 12
> + add r0, r4, #(\i)
> + mov r1, r5
> + add r2, sp, #(\i*2)
> + mov r3, #\i
> + bl idct16_1d_4x16_pass2_half_neon
> +.endr
> +
> + add sp, sp, r7
> + pop {r4-r9,pc}
> +endfunc
>
> function idct32x32_dc_add_neon
> movrel r12, idct_coeffs
> @@ -913,7 +1209,7 @@ function idct32x32_dc_add_neon
> bx lr
> endfunc
>
> -function idct32_odd
> +.macro idct32_odd_full
> movrel r12, idct_coeffs
> add r12, r12, #32
> vld1.16 {q0-q1}, [r12,:128]
> @@ -943,7 +1239,10 @@ function idct32_odd
> mbutterfly d27, d20, d0[3], d1[0], q8, q9, neg=1 @ d27 = t29a,
> d20 = t18a
> mbutterfly d21, d26, d1[1], d1[2], q8, q9 @ d21 = t21a,
> d26 = t26a
> mbutterfly d25, d22, d1[1], d1[2], q8, q9, neg=1 @ d25 = t25a,
> d22 = t22a
> + idct32_end
> +.endm
>
> +.macro idct32_end
> butterfly d16, d5, d4, d5 @ d16 = t16a, d5 = t19a
> butterfly d17, d20, d23, d20 @ d17 = t17, d20 = t18
> butterfly d18, d6, d7, d6 @ d18 = t23a, d6 = t20a
> @@ -973,8 +1272,94 @@ function idct32_odd
> mbutterfly0 d25, d22, d25, d22, d4, d6, q2, q3 @ d25 = t25, d22
> = t22
> mbutterfly0 d24, d23, d24, d23, d4, d6, q2, q3 @ d24 = t24a, d23
> = t23a
> bx lr
> +.endm
> +
> +function idct32_odd
> + idct32_odd_full
> +endfunc
> +
> +function idct32_odd_half
> + movrel r12, idct_coeffs
> + add r12, r12, #32
> + vld1.16 {q0-q1}, [r12,:128]
> +
> + mbutterfly_h1 d16, d31, d0[0], d0[1], q2, q3 @ d16 = t16a, d31 =
> t31a
> + mbutterfly_h2 d24, d23, d0[2], d0[3], q2, q3 @ d24 = t17a, d23 =
> t30a
> + mbutterfly_h1 d20, d27, d1[0], d1[1], q2, q3 @ d20 = t18a, d27 =
> t29a
> + mbutterfly_h2 d28, d19, d1[2], d1[3], q2, q3 @ d28 = t19a, d19 =
> t28a
> + mbutterfly_h1 d18, d29, d2[0], d2[1], q2, q3 @ d18 = t20a, d29 =
> t27a
> + mbutterfly_h2 d26, d21, d2[2], d2[3], q2, q3 @ d26 = t21a, d21 =
> t26a
> + mbutterfly_h1 d22, d25, d3[0], d3[1], q2, q3 @ d22 = t22a, d25 =
> t25a
> + mbutterfly_h2 d30, d17, d3[2], d3[3], q2, q3 @ d30 = t23a, d17 =
> t24a
> +
> + sub r12, r12, #32
> + vld1.16 {q0}, [r12,:128]
> +
> + butterfly d4, d24, d16, d24 @ d4 = t16, d24 = t17
> + butterfly d5, d20, d28, d20 @ d5 = t19, d20 = t18
> + butterfly d6, d26, d18, d26 @ d6 = t20, d26 = t21
> + butterfly d7, d22, d30, d22 @ d7 = t23, d22 = t22
> + butterfly d28, d25, d17, d25 @ d28 = t24, d25 = t25
> + butterfly d30, d21, d29, d21 @ d30 = t27, d21 = t26
> + butterfly d29, d23, d31, d23 @ d29 = t31, d23 = t30
> + butterfly d31, d27, d19, d27 @ d31 = t28, d27 = t29
> +
> + mbutterfly d23, d24, d0[3], d1[0], q8, q9 @ d23 = t17a,
> d24 = t30a
> + mbutterfly d27, d20, d0[3], d1[0], q8, q9, neg=1 @ d27 = t29a,
> d20 = t18a
> + mbutterfly d21, d26, d1[1], d1[2], q8, q9 @ d21 = t21a,
> d26 = t26a
> + mbutterfly d25, d22, d1[1], d1[2], q8, q9, neg=1 @ d25 = t25a,
> d22 = t22a
> +
> + idct32_end
> +endfunc
> +
> +function idct32_odd_quarter
> + movrel r12, idct_coeffs
> + add r12, r12, #32
> + vld1.16 {q0-q1}, [r12,:128]
> +
> + vmull.s16 q2, d16, d0[0]
> + vmull.s16 q14, d19, d1[3]
> + vmull.s16 q15, d16, d0[1]
> + vmull.s16 q11, d17, d3[2]
> + vmull.s16 q3, d17, d3[3]
> + vmull.s16 q13, d19, d1[2]
> + vmull.s16 q10, d18, d2[0]
> + vmull.s16 q12, d18, d2[1]
> +
> + sub r12, r12, #32
> + vld1.16 {q0}, [r12,:128]
> +
> + vneg.s32 q14, q14
> + vneg.s32 q3, q3
> +
> + vrshrn.s32 d4, q2, #14
> + vrshrn.s32 d5, q14, #14
> + vrshrn.s32 d29, q15, #14
> + vrshrn.s32 d28, q11, #14
> + vrshrn.s32 d7, q3, #14
> + vrshrn.s32 d31, q13, #14
> + vrshrn.s32 d6, q10, #14
> + vrshrn.s32 d30, q12, #14
> +
> + mbutterfly_l q8, q9, d29, d4, d0[3], d1[0]
> + mbutterfly_l q13, q10, d31, d5, d0[3], d1[0]
> + vrshrn.s32 d23, q8, #14
> + vrshrn.s32 d24, q9, #14
> + vneg.s32 q10, q10
> + vrshrn.s32 d27, q13, #14
> + vrshrn.s32 d20, q10, #14
> + mbutterfly_l q8, q9, d30, d6, d1[1], d1[2]
> + vrshrn.s32 d21, q8, #14
> + vrshrn.s32 d26, q9, #14
> + mbutterfly_l q8, q9, d28, d7, d1[1], d1[2]
> + vrshrn.s32 d25, q8, #14
> + vneg.s32 q9, q9
> + vrshrn.s32 d22, q9, #14
> +
> + idct32_end
> endfunc
>
> +.macro idct32_funcs suffix
> @ Do an 32-point IDCT of a 4x32 slice out of a 32x32 matrix.
> @ We don't have register space to do a single pass IDCT of 4x32 though,
> @ but the 32-point IDCT can be decomposed into two 16-point IDCTs;
> @@ -984,7 +1369,7 @@ endfunc
> @ r0 = dst (temp buffer)
> @ r1 = unused
> @ r2 = src
> -function idct32_1d_4x32_pass1_neon
> +function idct32_1d_4x32_pass1\suffix\()_neon
> push {lr}
>
> movrel r12, idct_coeffs
> @@ -995,12 +1380,26 @@ function idct32_1d_4x32_pass1_neon
> vmov.s16 d4, #0
>
> @ d16 = IN(0), d17 = IN(2) ... d31 = IN(30)
> +.ifb \suffix
> .irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
> vld1.16 {d\i}, [r2,:64]
> vst1.16 {d4}, [r2,:64], r12
> .endr
> +.endif
> +.ifc \suffix,_quarter
> +.irp i, 16, 17, 18, 19
> + vld1.16 {d\i}, [r2,:64]
> + vst1.16 {d4}, [r2,:64], r12
> +.endr
> +.endif
> +.ifc \suffix,_half
> +.irp i, 16, 17, 18, 19, 20, 21, 22, 23
> + vld1.16 {d\i}, [r2,:64]
> + vst1.16 {d4}, [r2,:64], r12
> +.endr
> +.endif
>
> - bl idct16
> + bl idct16\suffix
>
> @ Do four 4x4 transposes. Originally, d16-d31 contain the
> @ 16 rows. Afterwards, d16-d19, d20-d23, d24-d27, d28-d31
> @@ -1026,17 +1425,39 @@ function idct32_1d_4x32_pass1_neon
>
> @ Move r2 back to the start of the input, and move
> @ to the first odd row
> +.ifb \suffix
> sub r2, r2, r12, lsl #4
> +.endif
> +.ifc \suffix,_quarter
> + sub r2, r2, r12, lsl #2
> +.endif
> +.ifc \suffix,_half
> + sub r2, r2, r12, lsl #3
> +.endif
> add r2, r2, #64
>
> vmov.s16 d4, #0
> @ d16 = IN(1), d17 = IN(3) ... d31 = IN(31)
> +.ifb \suffix
> .irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
> vld1.16 {d\i}, [r2,:64]
> vst1.16 {d4}, [r2,:64], r12
> .endr
> +.endif
> +.ifc \suffix,_quarter
> +.irp i, 16, 17, 18, 19
> + vld1.16 {d\i}, [r2,:64]
> + vst1.16 {d4}, [r2,:64], r12
> +.endr
> +.endif
> +.ifc \suffix,_half
> +.irp i, 16, 17, 18, 19, 20, 21, 22, 23
> + vld1.16 {d\i}, [r2,:64]
> + vst1.16 {d4}, [r2,:64], r12
> +.endr
> +.endif
>
> - bl idct32_odd
> + bl idct32_odd\suffix
>
> transpose16_q_4x_4x4 q15, q14, q13, q12, q11, q10, q9, q8, d31,
> d30, d29, d28, d27, d26, d25, d24, d23, d22, d21, d20, d19, d18, d17, d16
>
> @@ -1072,19 +1493,33 @@ endfunc
> @ r0 = dst
> @ r1 = dst stride
> @ r2 = src (temp buffer)
> -function idct32_1d_4x32_pass2_neon
> +function idct32_1d_4x32_pass2\suffix\()_neon
> push {lr}
> movrel r12, idct_coeffs
> vld1.16 {q0-q1}, [r12,:128]
>
> mov r12, #128
> @ d16 = IN(0), d17 = IN(2) ... d31 = IN(30)
> +.ifb \suffix
> .irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
> vld1.16 {d\i}, [r2,:64], r12
> .endr
> sub r2, r2, r12, lsl #4
> +.endif
> +.ifc \suffix,_quarter
> +.irp i, 16, 17, 18, 19
> + vld1.16 {d\i}, [r2,:64], r12
> +.endr
> + sub r2, r2, r12, lsl #2
> +.endif
> +.ifc \suffix,_half
> +.irp i, 16, 17, 18, 19, 20, 21, 22, 23
> + vld1.16 {d\i}, [r2,:64], r12
> +.endr
> + sub r2, r2, r12, lsl #3
> +.endif
>
> - bl idct16
> + bl idct16\suffix
>
> .irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
> vst1.16 {d\i}, [r2,:64], r12
> @@ -1094,13 +1529,27 @@ function idct32_1d_4x32_pass2_neon
> add r2, r2, #64
>
> @ d16 = IN(1), d17 = IN(3) ... d31 = IN(31)
> +.ifb \suffix
> .irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
> vld1.16 {d\i}, [r2,:64], r12
> .endr
> sub r2, r2, r12, lsl #4
> +.endif
> +.ifc \suffix,_quarter
> +.irp i, 16, 17, 18, 19
> + vld1.16 {d\i}, [r2,:64], r12
> +.endr
> + sub r2, r2, r12, lsl #2
> +.endif
> +.ifc \suffix,_half
> +.irp i, 16, 17, 18, 19, 20, 21, 22, 23
> + vld1.16 {d\i}, [r2,:64], r12
> +.endr
> + sub r2, r2, r12, lsl #3
> +.endif
> sub r2, r2, #64
>
> - bl idct32_odd
> + bl idct32_odd\suffix
>
> mov r12, #128
> .macro load_acc_store a, b, c, d, neg=0
> @@ -1150,6 +1599,11 @@ function idct32_1d_4x32_pass2_neon
> .purgem load_acc_store
> pop {pc}
> endfunc
> +.endm
> +
> +idct32_funcs
> +idct32_funcs _quarter
> +idct32_funcs _half
>
> const min_eob_idct_idct_32, align=4
> .short 0, 9, 34, 70, 135, 240, 336, 448
> @@ -1158,6 +1612,10 @@ endconst
> function ff_vp9_idct_idct_32x32_add_neon, export=1
> cmp r3, #1
> beq idct32x32_dc_add_neon
> + cmp r3, #34
> + ble idct32x32_quarter_add_neon
> + cmp r3, #135
> + ble idct32x32_half_add_neon
> push {r4-r8,lr}
> vpush {q4-q7}
> movrel r8, min_eob_idct_idct_32 + 2
> @@ -1209,3 +1667,102 @@ A and r7, sp, #15
> vpop {q4-q7}
> pop {r4-r8,pc}
> endfunc
> +
> +function idct32x32_quarter_add_neon
> + push {r4-r7,lr}
> + vpush {q4-q7}
> +
> + @ Align the stack, allocate a temp buffer
> +T mov r7, sp
> +T and r7, r7, #15
> +A and r7, sp, #15
> + add r7, r7, #2048
> + sub sp, sp, r7
> +
> + mov r4, r0
> + mov r5, r1
> + mov r6, r2
> +
> +.irp i, 0, 4
> + add r0, sp, #(\i*64)
> +.if \i == 4
> + cmp r3, #9
> + ble 1f
> +.endif
> + add r2, r6, #(\i*2)
> + bl idct32_1d_4x32_pass1_quarter_neon
> +.endr
> + b 3f
> +
> +1:
> + @ Write zeros to the temp buffer for pass 2
> + vmov.i16 q14, #0
> + vmov.i16 q15, #0
> +.rept 8
> + vst1.16 {q14-q15}, [r0,:128]!
> +.endr
> +3:
> +.irp i, 0, 4, 8, 12, 16, 20, 24, 28
> + add r0, r4, #(\i)
> + mov r1, r5
> + add r2, sp, #(\i*2)
> + bl idct32_1d_4x32_pass2_quarter_neon
> +.endr
> +
> + add sp, sp, r7
> + vpop {q4-q7}
> + pop {r4-r7,pc}
> +endfunc
> +
> +function idct32x32_half_add_neon
> + push {r4-r8,lr}
> + vpush {q4-q7}
> + movrel r8, min_eob_idct_idct_32 + 2
> +
> + @ Align the stack, allocate a temp buffer
> +T mov r7, sp
> +T and r7, r7, #15
> +A and r7, sp, #15
> + add r7, r7, #2048
> + sub sp, sp, r7
> +
> + mov r4, r0
> + mov r5, r1
> + mov r6, r2
> +
> +.irp i, 0, 4, 8, 12
> + add r0, sp, #(\i*64)
> +.if \i > 0
> + ldrh_post r1, r8, #2
> + cmp r3, r1
> + it le
> + movle r1, #(16 - \i)/2
> + ble 1f
> +.endif
> + add r2, r6, #(\i*2)
> + bl idct32_1d_4x32_pass1_half_neon
> +.endr
> + b 3f
> +
> +1:
> + @ Write zeros to the temp buffer for pass 2
> + vmov.i16 q14, #0
> + vmov.i16 q15, #0
> +2:
> + subs r1, r1, #1
> +.rept 4
> + vst1.16 {q14-q15}, [r0,:128]!
> +.endr
> + bne 2b
> +3:
> +.irp i, 0, 4, 8, 12, 16, 20, 24, 28
> + add r0, r4, #(\i)
> + mov r1, r5
> + add r2, sp, #(\i*2)
> + bl idct32_1d_4x32_pass2_half_neon
> +.endr
> +
> + add sp, sp, r7
> + vpop {q4-q7}
> + pop {r4-r8,pc}
> +endfunc
Otherwise OK.
I'm not really sure which variant I prefer. Is the speed difference
measurable for idct-heavy real-world samples? If you have a preference
for one or the other variant, I trust your judgement.
Janne
_______________________________________________
libav-devel mailing list
[email protected]
https://lists.libav.org/mailman/listinfo/libav-devel