[libav-devel] [PATCH] fate.sh: Allow setting other make flags for running tests
If makeopts_fate is set, these makeopts are used for running the tests instead of the normal makeopts. If it isn't set, the normal makeopts variable is used as before. This is useful if remote testing on a lesser machine where a large number of parallel jobs might be undesireable, while wanting to speed up the build with many parallel processes. --- tests/fate.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/fate.sh b/tests/fate.sh index b8ee1ae..f7ca891 100755 --- a/tests/fate.sh +++ b/tests/fate.sh @@ -73,7 +73,7 @@ compile()( fate()( test "$build_only" = "yes" && return cd ${build} || return -${make} ${makeopts} -k fate +${make} ${makeopts_fate-${makeopts}} -k fate ) clean(){ -- 2.9.3 (Apple Git-75) ___ libav-devel mailing list libav-devel@libav.org https://lists.libav.org/mailman/listinfo/libav-devel
[libav-devel] [PATCH 04/11] arm: vp9itxfm: Skip empty slices in the first pass of idct_idct 16x16 and 32x32
This work is sponsored by, and copyright, Google. Previously all subpartitions except the eob=1 (DC) case ran with the same runtime: vp9_inv_dct_dct_16x16_sub16_add_neon: 3189.0 2486.8 2509.9 1964.1 vp9_inv_dct_dct_32x32_sub32_add_neon: 18448.1 16682.0 14235.4 11993.4 By skipping individual 4x16 or 4x32 pixel slices in the first pass, we reduce the runtime of these functions like this: vp9_inv_dct_dct_16x16_sub1_add_neon: 271.5188.7211.6235.1 vp9_inv_dct_dct_16x16_sub4_add_neon:2079.7 1606.3 1772.1 1264.8 vp9_inv_dct_dct_16x16_sub8_add_neon:2449.2 1834.3 2046.5 1499.7 vp9_inv_dct_dct_16x16_sub12_add_neon: 2826.2 2109.2 2295.9 1758.2 vp9_inv_dct_dct_16x16_sub16_add_neon: 3224.1 2476.5 2533.1 1985.7 vp9_inv_dct_dct_32x32_sub1_add_neon: 752.5457.5863.7554.7 vp9_inv_dct_dct_32x32_sub4_add_neon: 10689.2 8013.4 8592.9 6785.9 vp9_inv_dct_dct_32x32_sub8_add_neon: 12217.8 9068.1 9420.4 7518.3 vp9_inv_dct_dct_32x32_sub12_add_neon: 12967.3 10455.5 10223.9 8275.7 vp9_inv_dct_dct_32x32_sub16_add_neon: 14084.1 11933.7 10998.9 9012.5 vp9_inv_dct_dct_32x32_sub20_add_neon: 15171.4 13335.0 11820.6 9757.2 vp9_inv_dct_dct_32x32_sub24_add_neon: 16229.6 15185.7 12614.4 10504.9 vp9_inv_dct_dct_32x32_sub28_add_neon: 17338.1 15955.3 13445.0 11248.4 vp9_inv_dct_dct_32x32_sub32_add_neon: 18465.7 16974.6 14239.2 11999.1 I.e. in general a very minor overhead for the full subpartition case due to the additional cmps, but a significant speedup for the cases when we only need to process a small part of the actual input data. In common VP9 content in a few inspected clips, 70-90% of the non-dc-only 16x16 and 32x32 IDCTs only have nonzero coefficients in the upper left 8x8 or 16x16 subpartitions respectively. --- This goes on top of the checkasm vp9dsp patch that adds benchmarking of generic subpartitions in the itxfm. --- libavcodec/arm/vp9itxfm_neon.S | 70 -- tests/checkasm/vp9dsp.c| 6 ++-- 2 files changed, 64 insertions(+), 12 deletions(-) diff --git a/libavcodec/arm/vp9itxfm_neon.S b/libavcodec/arm/vp9itxfm_neon.S index 01944bd..769579a 100644 --- a/libavcodec/arm/vp9itxfm_neon.S +++ b/libavcodec/arm/vp9itxfm_neon.S @@ -659,10 +659,17 @@ endfunc @ Read a vertical 4x16 slice out of a 16x16 matrix, do a transform on it, @ transpose into a horizontal 16x4 slice and store. @ r0 = dst (temp buffer) -@ r1 = unused +@ r1 = slice offset @ r2 = src -@ r3 = slice offset +@ r3 = eob +@ r9 = min eob function \txfm\()16_1d_4x16_pass1_neon +.ifc \txfm,idct +@ Check if this whole input slice is zero +cmp r3, r9 +ble 2f +.endif + mov r12, #32 vmov.s16q2, #0 .irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31 @@ -678,14 +685,14 @@ function \txfm\()16_1d_4x16_pass1_neon transpose16_q_4x_4x4 q8, q9, q10, q11, q12, q13, q14, q15, d16, d17, d18, d19, d20, d21, d22, d23, d24, d25, d26, d27, d28, d29, d30, d31 @ Store the transposed 4x4 blocks horizontally. -cmp r3, #12 +cmp r1, #12 beq 1f .irp i, 16, 20, 24, 28, 17, 21, 25, 29, 18, 22, 26, 30, 19, 23, 27, 31 vst1.16 {d\i}, [r0,:64]! .endr bx lr 1: -@ Special case: For the last input column (r3 == 12), +@ Special case: For the last input column (r1 == 12), @ which would be stored as the last row in the temp buffer, @ don't store the first 4x4 block, but keep it in registers @ for the first slice of the second pass (where it is the @@ -711,6 +718,18 @@ function \txfm\()16_1d_4x16_pass1_neon vmovd30, d18 vmovd31, d19 bx lr + +.ifc \txfm,idct +2: +@ Set d28-d31 to zero, for the in-register passthrough of coefficients to pass 2 +vmov.i16q14, #0 +vmov.i16q15, #0 +@ Write zeros to the temp buffer for pass 2 +.rept 4 +vst1.16 {q14-q15}, [r0,:128]! +.endr +bx lr +.endif endfunc @ Read a vertical 4x16 slice out of a 16x16 matrix, do a transform on it, @@ -781,15 +800,23 @@ endfunc itxfm16_1d_funcs idct itxfm16_1d_funcs iadst +@ This is the minimum eob value for each subpartition, in increments of 4 +const min_eob_idct_idct_16, align=4 +.short 0, 10, 38, 89 +endconst + .macro itxfm_func16x16 txfm1, txfm2 function ff_vp9_\txfm1\()_\txfm2\()_16x16_add_neon, export=1 .ifc \txfm1\()_\txfm2,idct_idct cmp r3, #1 beq idct16x16_dc_add_neon .endif -push{r4-r7,lr} +push{r4-r9,lr} .ifnc \txfm1\()_\txfm2,idct_idct vpush {q4-q7} +mov r9, #0 +.else +movrel r8, min_eob_
[libav-devel] [PATCH 02/11] aarch64: vp9itxfm: Don't repeatedly set x9 when nothing overwrites it
--- libavcodec/aarch64/vp9itxfm_neon.S | 2 -- 1 file changed, 2 deletions(-) diff --git a/libavcodec/aarch64/vp9itxfm_neon.S b/libavcodec/aarch64/vp9itxfm_neon.S index 2dc6b75..21352f0 100644 --- a/libavcodec/aarch64/vp9itxfm_neon.S +++ b/libavcodec/aarch64/vp9itxfm_neon.S @@ -1001,7 +1001,6 @@ function idct32_1d_8x32_pass2_neon idct16 -mov x9, #128 .irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31 st1 {v\i\().8h}, [x2], x9 .endr @@ -1018,7 +1017,6 @@ function idct32_1d_8x32_pass2_neon idct32_odd -mov x9, #128 .macro load_acc_store a, b, c, d, neg=0 ld1 {v4.8h}, [x2], x9 ld1 {v5.8h}, [x2], x9 -- 2.7.4 ___ libav-devel mailing list libav-devel@libav.org https://lists.libav.org/mailman/listinfo/libav-devel
[libav-devel] [PATCH 03/11] arm: vp9itxfm: Rename a macro parameter to fit better
Since the same parameter is used for both input and output, the name inout is more fitting. This matches the naming used below in the dmbutterfly macro. --- libavcodec/arm/vp9itxfm_neon.S | 14 +++--- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/libavcodec/arm/vp9itxfm_neon.S b/libavcodec/arm/vp9itxfm_neon.S index acb1c6d..01944bd 100644 --- a/libavcodec/arm/vp9itxfm_neon.S +++ b/libavcodec/arm/vp9itxfm_neon.S @@ -125,16 +125,16 @@ endconst vmlal.s16 \out4, \in4, \coef1 .endm -@ in1 = (in1 * coef1 - in2 * coef2 + (1 << 13)) >> 14 -@ in2 = (in1 * coef2 + in2 * coef1 + (1 << 13)) >> 14 -@ in are 2 d registers, tmp are 2 q registers -.macro mbutterfly in1, in2, coef1, coef2, tmp1, tmp2, neg=0 -mbutterfly_l\tmp1, \tmp2, \in1, \in2, \coef1, \coef2 +@ inout1 = (inout1 * coef1 - inout2 * coef2 + (1 << 13)) >> 14 +@ inout2 = (inout1 * coef2 + inout2 * coef1 + (1 << 13)) >> 14 +@ inout are 2 d registers, tmp are 2 q registers +.macro mbutterfly inout1, inout2, coef1, coef2, tmp1, tmp2, neg=0 +mbutterfly_l\tmp1, \tmp2, \inout1, \inout2, \coef1, \coef2 .if \neg > 0 vneg.s32\tmp2, \tmp2 .endif -vrshrn.s32 \in1, \tmp1, #14 -vrshrn.s32 \in2, \tmp2, #14 +vrshrn.s32 \inout1, \tmp1, #14 +vrshrn.s32 \inout2, \tmp2, #14 .endm @ inout1,inout2 = (inout1,inout2 * coef1 - inout3,inout4 * coef2 + (1 << 13)) >> 14 -- 2.7.4 ___ libav-devel mailing list libav-devel@libav.org https://lists.libav.org/mailman/listinfo/libav-devel
[libav-devel] [PATCH 01/11] arm/aarch64: vp9itxfm: Fix indentation of macro arguments
--- libavcodec/aarch64/vp9itxfm_neon.S | 16 libavcodec/arm/vp9itxfm_neon.S | 8 2 files changed, 12 insertions(+), 12 deletions(-) diff --git a/libavcodec/aarch64/vp9itxfm_neon.S b/libavcodec/aarch64/vp9itxfm_neon.S index 65406b9..2dc6b75 100644 --- a/libavcodec/aarch64/vp9itxfm_neon.S +++ b/libavcodec/aarch64/vp9itxfm_neon.S @@ -969,14 +969,14 @@ function idct32_1d_8x32_pass1_neon st1 {v7.8h}, [x0], #16 .endm -store_rev 31, 23 -store_rev 30, 22 -store_rev 29, 21 -store_rev 28, 20 -store_rev 27, 19 -store_rev 26, 18 -store_rev 25, 17 -store_rev 24, 16 +store_rev 31, 23 +store_rev 30, 22 +store_rev 29, 21 +store_rev 28, 20 +store_rev 27, 19 +store_rev 26, 18 +store_rev 25, 17 +store_rev 24, 16 .purgem store_rev ret endfunc diff --git a/libavcodec/arm/vp9itxfm_neon.S b/libavcodec/arm/vp9itxfm_neon.S index 5d73d84..acb1c6d 100644 --- a/libavcodec/arm/vp9itxfm_neon.S +++ b/libavcodec/arm/vp9itxfm_neon.S @@ -1017,10 +1017,10 @@ function idct32_1d_4x32_pass1_neon .endr .endm -store_rev 31, 27, 23, 19 -store_rev 30, 26, 22, 18 -store_rev 29, 25, 21, 17 -store_rev 28, 24, 20, 16 +store_rev 31, 27, 23, 19 +store_rev 30, 26, 22, 18 +store_rev 29, 25, 21, 17 +store_rev 28, 24, 20, 16 .purgem store_rev bx lr endfunc -- 2.7.4 ___ libav-devel mailing list libav-devel@libav.org https://lists.libav.org/mailman/listinfo/libav-devel
[libav-devel] [PATCH 05/11] arm: vp9itxfm: Make the larger core transforms standalone functions
This work is sponsored by, and copyright, Google. This reduces the code size of libavcodec/arm/vp9itxfm_neon.o from 15252 to 12316 bytes. This gives a small slowdown of a couple tens of cycles, up to a few hundred cycles for the full case of the largest transform, but makes it more feasible to add more optimized versions of these transforms. Before: vp9_inv_dct_dct_16x16_sub4_add_neon:2079.7 1606.3 1772.1 1264.8 vp9_inv_dct_dct_16x16_sub16_add_neon: 3224.1 2476.5 2533.1 1985.7 vp9_inv_dct_dct_32x32_sub4_add_neon: 10689.2 8013.4 8592.9 6785.9 vp9_inv_dct_dct_32x32_sub32_add_neon: 18465.7 16974.6 14239.2 11999.1 After: vp9_inv_dct_dct_16x16_sub4_add_neon:2214.2 1617.2 1767.9 1286.5 vp9_inv_dct_dct_16x16_sub16_add_neon: 3256.8 2489.2 2554.8 2002.9 vp9_inv_dct_dct_32x32_sub4_add_neon: 10891.4 8124.4 8614.0 6828.5 vp9_inv_dct_dct_32x32_sub32_add_neon: 18626.4 17264.0 14298.7 12067.5 --- libavcodec/arm/vp9itxfm_neon.S | 41 - 1 file changed, 24 insertions(+), 17 deletions(-) diff --git a/libavcodec/arm/vp9itxfm_neon.S b/libavcodec/arm/vp9itxfm_neon.S index 769579a..d10de1e 100644 --- a/libavcodec/arm/vp9itxfm_neon.S +++ b/libavcodec/arm/vp9itxfm_neon.S @@ -534,7 +534,7 @@ function idct16x16_dc_add_neon endfunc .ltorg -.macro idct16 +function idct16 mbutterfly0 d16, d24, d16, d24, d4, d6, q2, q3 @ d16 = t0a, d24 = t1a mbutterfly d20, d28, d0[1], d0[2], q2, q3 @ d20 = t2a, d28 = t3a mbutterfly d18, d30, d0[3], d1[0], q2, q3 @ d18 = t4a, d30 = t7a @@ -580,9 +580,10 @@ endfunc vmovd4, d21 @ d4 = t10a butterfly d20, d27, d6, d27 @ d20 = out[4], d27 = out[11] butterfly d21, d26, d26, d4@ d21 = out[5], d26 = out[10] -.endm +bx lr +endfunc -.macro iadst16 +function iadst16 movrel r12, iadst16_coeffs vld1.16 {q0-q1}, [r12,:128] @@ -653,7 +654,8 @@ endfunc vmovd16, d2 vmovd30, d4 -.endm +bx lr +endfunc .macro itxfm16_1d_funcs txfm @ Read a vertical 4x16 slice out of a 16x16 matrix, do a transform on it, @@ -669,6 +671,7 @@ function \txfm\()16_1d_4x16_pass1_neon cmp r3, r9 ble 2f .endif +push{lr} mov r12, #32 vmov.s16q2, #0 @@ -677,7 +680,7 @@ function \txfm\()16_1d_4x16_pass1_neon vst1.16 {d4}, [r2,:64], r12 .endr -\txfm\()16 +bl \txfm\()16 @ Do four 4x4 transposes. Originally, d16-d31 contain the @ 16 rows. Afterwards, d16-d19, d20-d23, d24-d27, d28-d31 @@ -690,7 +693,7 @@ function \txfm\()16_1d_4x16_pass1_neon .irp i, 16, 20, 24, 28, 17, 21, 25, 29, 18, 22, 26, 30, 19, 23, 27, 31 vst1.16 {d\i}, [r0,:64]! .endr -bx lr +pop {pc} 1: @ Special case: For the last input column (r1 == 12), @ which would be stored as the last row in the temp buffer, @@ -717,7 +720,7 @@ function \txfm\()16_1d_4x16_pass1_neon vmovd29, d17 vmovd30, d18 vmovd31, d19 -bx lr +pop {pc} .ifc \txfm,idct 2: @@ -739,6 +742,7 @@ endfunc @ r2 = src (temp buffer) @ r3 = slice offset function \txfm\()16_1d_4x16_pass2_neon +push{lr} mov r12, #32 .irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27 vld1.16 {d\i}, [r2,:64], r12 @@ -752,7 +756,7 @@ function \txfm\()16_1d_4x16_pass2_neon add r3, r0, r1 lsl r1, r1, #1 -\txfm\()16 +bl \txfm\()16 .macro load_add_store coef0, coef1, coef2, coef3 vrshr.s16 \coef0, \coef0, #6 @@ -793,7 +797,7 @@ function \txfm\()16_1d_4x16_pass2_neon load_add_store q12, q13, q14, q15 .purgem load_add_store -bx lr +pop {pc} endfunc .endm @@ -906,7 +910,7 @@ function idct32x32_dc_add_neon bx lr endfunc -.macro idct32_odd +function idct32_odd movrel r12, idct_coeffs add r12, r12, #32 vld1.16 {q0-q1}, [r12,:128] @@ -965,7 +969,8 @@ endfunc mbutterfly0 d26, d21, d26, d21, d4, d6, q2, q3 @ d26 = t26a, d21 = t21a mbutterfly0 d25, d22, d25, d22, d4, d6, q2, q3 @ d25 = t25, d22 = t22 mbutterfly0 d24, d23, d24, d23, d4, d6, q2, q3 @ d24 = t24a, d23 = t23a -.endm +bx lr +endfunc @ Do an 32-point IDCT of a 4x32 slice out of a 32x32 matrix. @ We don't have register space to do a single pass IDCT of 4x32 though, @@ -981,6 +986,7 @@ function id
[libav-devel] [PATCH 08/11] aarch64: vp9itxfm: Skip empty slices in the first pass of idct_idct 16x16 and 32x32
This work is sponsored by, and copyright, Google. Previously all subpartitions except the eob=1 (DC) case ran with the same runtime: vp9_inv_dct_dct_16x16_sub16_add_neon: 1372.2 vp9_inv_dct_dct_32x32_sub32_add_neon: 8083.0 By skipping individual 8x16 or 8x32 pixel slices in the first pass, we reduce the runtime of these functions like this: vp9_inv_dct_dct_16x16_sub1_add_neon: 236.7 vp9_inv_dct_dct_16x16_sub4_add_neon:1051.7 vp9_inv_dct_dct_16x16_sub8_add_neon:1051.7 vp9_inv_dct_dct_16x16_sub12_add_neon: 1373.7 vp9_inv_dct_dct_16x16_sub16_add_neon: 1377.4 vp9_inv_dct_dct_32x32_sub1_add_neon: 555.1 vp9_inv_dct_dct_32x32_sub4_add_neon:5186.6 vp9_inv_dct_dct_32x32_sub8_add_neon:5195.7 vp9_inv_dct_dct_32x32_sub12_add_neon: 6151.5 vp9_inv_dct_dct_32x32_sub16_add_neon: 6160.0 vp9_inv_dct_dct_32x32_sub20_add_neon: 7121.0 vp9_inv_dct_dct_32x32_sub24_add_neon: 7120.6 vp9_inv_dct_dct_32x32_sub28_add_neon: 8090.1 vp9_inv_dct_dct_32x32_sub32_add_neon: 8095.2 I.e. in general a very minor overhead for the full subpartition case due to the additional cmps, but a significant speedup for the cases when we only need to process a small part of the actual input data. --- libavcodec/aarch64/vp9itxfm_neon.S | 69 ++ 1 file changed, 63 insertions(+), 6 deletions(-) diff --git a/libavcodec/aarch64/vp9itxfm_neon.S b/libavcodec/aarch64/vp9itxfm_neon.S index 21352f0..dc58652 100644 --- a/libavcodec/aarch64/vp9itxfm_neon.S +++ b/libavcodec/aarch64/vp9itxfm_neon.S @@ -588,6 +588,9 @@ endfunc .macro store i, dst, inc st1 {v\i\().8h}, [\dst], \inc .endm +.macro movi_v i, size, imm +moviv\i\()\size, \imm +.endm .macro load_clear i, src, inc ld1 {v\i\().8h}, [\src] st1 {v2.8h}, [\src], \inc @@ -596,11 +599,18 @@ endfunc // Read a vertical 8x16 slice out of a 16x16 matrix, do a transform on it, // transpose into a horizontal 16x8 slice and store. // x0 = dst (temp buffer) -// x1 = unused +// x1 = slice offset // x2 = src -// x3 = slice offset +// w3 = eob +// w7 = min eob .macro itxfm16_1d_funcs txfm function \txfm\()16_1d_8x16_pass1_neon +.ifc \txfm,idct +// Check if this whole input slice is zero +cmp w3, w7 +b.le2f +.endif + mov x9, #32 moviv2.8h, #0 .irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31 @@ -616,14 +626,14 @@ function \txfm\()16_1d_8x16_pass1_neon transpose_8x8H v24, v25, v26, v27, v28, v29, v30, v31, v2, v3 // Store the transposed 8x8 blocks horizontally. -cmp x3, #8 +cmp x1, #8 b.eq1f .irp i, 16, 24, 17, 25, 18, 26, 19, 27, 20, 28, 21, 29, 22, 30, 23, 31 store \i, x0, #16 .endr ret 1: -// Special case: For the last input column (x3 == 8), +// Special case: For the last input column (x1 == 8), // which would be stored as the last row in the temp buffer, // don't store the first 8x8 block, but keep it in registers // for the first slice of the second pass (where it is the @@ -641,6 +651,20 @@ function \txfm\()16_1d_8x16_pass1_neon mov v30.16b, v22.16b mov v31.16b, v23.16b ret + +.ifc \txfm,idct +2: +// Set v24-v31 to zero, for the in-register passthrough of +// coefficients to pass 2. Since we only do two slices, this can +// only ever happen for the second slice. So we only need to store +// zeros to the temp buffer for the second half of the buffer. +.irp i, 24, 25, 26, 27, 28, 29, 30, 31 +add x0, x0, #16 +movi_v \i, .16b, #0 +store 24, x0, #16 +.endr +ret +.endif endfunc // Read a vertical 8x16 slice out of a 16x16 matrix, do a transform on it, @@ -719,6 +743,11 @@ endfunc itxfm16_1d_funcs idct itxfm16_1d_funcs iadst +// This is the minimum eob value for each subpartition, in increments of 8 +const min_eob_idct_idct_16, align=4 +.short 0, 38 +endconst + .macro itxfm_func16x16 txfm1, txfm2 function ff_vp9_\txfm1\()_\txfm2\()_16x16_add_neon, export=1 .ifc \txfm1\()_\txfm2,idct_idct @@ -743,6 +772,9 @@ function ff_vp9_\txfm1\()_\txfm2\()_16x16_add_neon, export=1 movrel x10, idct_coeffs .ifnc \txfm1\()_\txfm2,idct_idct movrel x11, iadst16_coeffs +mov x7, #0 +.else +movrel x12, min_eob_idct_idct_16 .endif .ifc \txfm1,idct ld1 {v0.8h,v1.8h}, [x10] @@ -750,8 +782,11 @@ function ff_vp9_\txfm1\()_\txfm2\()_16x16_add_neon, export=1 .irp i, 0, 8 add x0, sp, #(\i*32) +mov x1, #\i add x2, x6, #(\i*2) -mov x3, #\i +.ifc \txfm1\()_\txfm
[libav-devel] [PATCH 09/11] aarch64: vp9itxfm: Make the larger core transforms standalone functions
This work is sponsored by, and copyright, Google. This reduces the code size of libavcodec/aarch64/vp9itxfm_neon.o from 19540 to 14784 bytes. This gives a small slowdown of a couple of tens of cycles, and makes it more feasible to add more optimized versions of these transforms. Before: vp9_inv_dct_dct_16x16_sub4_add_neon: 1051.8 vp9_inv_dct_dct_16x16_sub16_add_neon: 1378.8 vp9_inv_dct_dct_32x32_sub4_add_neon: 5190.8 vp9_inv_dct_dct_32x32_sub32_add_neon: 8091.7 After: vp9_inv_dct_dct_16x16_sub4_add_neon: 1065.0 vp9_inv_dct_dct_16x16_sub16_add_neon: 1390.7 vp9_inv_dct_dct_32x32_sub4_add_neon: 5208.6 vp9_inv_dct_dct_32x32_sub32_add_neon: 8121.9 --- libavcodec/aarch64/vp9itxfm_neon.S | 41 ++ 1 file changed, 24 insertions(+), 17 deletions(-) diff --git a/libavcodec/aarch64/vp9itxfm_neon.S b/libavcodec/aarch64/vp9itxfm_neon.S index dc58652..ef40a5a 100644 --- a/libavcodec/aarch64/vp9itxfm_neon.S +++ b/libavcodec/aarch64/vp9itxfm_neon.S @@ -463,7 +463,7 @@ function idct16x16_dc_add_neon ret endfunc -.macro idct16 +function idct16 dmbutterfly0v16, v24, v16, v24, v2, v3, v4, v5, v6, v7 // v16 = t0a, v24 = t1a dmbutterfly v20, v28, v0.h[1], v0.h[2], v2, v3, v4, v5 // v20 = t2a, v28 = t3a dmbutterfly v18, v30, v0.h[3], v0.h[4], v2, v3, v4, v5 // v18 = t4a, v30 = t7a @@ -506,9 +506,10 @@ endfunc butterfly_8hv19, v28, v5, v28 // v19 = out[3], v28 = out[12] butterfly_8hv20, v27, v6, v27 // v20 = out[4], v27 = out[11] butterfly_8hv21, v26, v26, v3// v21 = out[5], v26 = out[10] -.endm +ret +endfunc -.macro iadst16 +function iadst16 ld1 {v0.8h,v1.8h}, [x11] dmbutterfly_l v6, v7, v4, v5, v31, v16, v0.h[1], v0.h[0] // v6,v7 = t1, v4,v5 = t0 @@ -577,7 +578,8 @@ endfunc mov v16.16b, v2.16b mov v30.16b, v4.16b -.endm +ret +endfunc // Helper macros; we can't use these expressions directly within // e.g. .irp due to the extra concatenation \(). Therefore wrap @@ -610,6 +612,7 @@ function \txfm\()16_1d_8x16_pass1_neon cmp w3, w7 b.le2f .endif +mov x14, x30 mov x9, #32 moviv2.8h, #0 @@ -617,7 +620,7 @@ function \txfm\()16_1d_8x16_pass1_neon load_clear \i, x2, x9 .endr -\txfm\()16 +bl \txfm\()16 // Do two 8x8 transposes. Originally, v16-v31 contain the // 16 rows. Afterwards, v16-v23 and v24-v31 contain the two @@ -631,7 +634,7 @@ function \txfm\()16_1d_8x16_pass1_neon .irp i, 16, 24, 17, 25, 18, 26, 19, 27, 20, 28, 21, 29, 22, 30, 23, 31 store \i, x0, #16 .endr -ret +br x14 1: // Special case: For the last input column (x1 == 8), // which would be stored as the last row in the temp buffer, @@ -650,7 +653,7 @@ function \txfm\()16_1d_8x16_pass1_neon mov v29.16b, v21.16b mov v30.16b, v22.16b mov v31.16b, v23.16b -ret +br x14 .ifc \txfm,idct 2: @@ -674,6 +677,7 @@ endfunc // x2 = src (temp buffer) // x3 = slice offset function \txfm\()16_1d_8x16_pass2_neon +mov x14, x30 mov x9, #32 .irp i, 16, 17, 18, 19, 20, 21, 22, 23 load\i, x2, x9 @@ -686,7 +690,7 @@ function \txfm\()16_1d_8x16_pass2_neon add x3, x0, x1 lsl x1, x1, #1 -\txfm\()16 +bl \txfm\()16 .macro load_add_store coef0, coef1, coef2, coef3, coef4, coef5, coef6, coef7, tmp1, tmp2 srshr \coef0, \coef0, #6 @@ -736,7 +740,7 @@ function \txfm\()16_1d_8x16_pass2_neon load_add_store v24.8h, v25.8h, v26.8h, v27.8h, v28.8h, v29.8h, v30.8h, v31.8h, v16.8b, v17.8b .purgem load_add_store -ret +br x14 endfunc .endm @@ -852,7 +856,7 @@ function idct32x32_dc_add_neon ret endfunc -.macro idct32_odd +function idct32_odd ld1 {v0.8h,v1.8h}, [x11] dmbutterfly v16, v31, v0.h[0], v0.h[1], v4, v5, v6, v7 // v16 = t16a, v31 = t31a @@ -907,7 +911,8 @@ endfunc dmbutterfly0v26, v21, v26, v21, v2, v3, v4, v5, v6, v7 // v26 = t26a, v21 = t21a dmbutterfly0v25, v22, v25, v22, v2, v3, v4, v5, v6, v7 // v25 = t25, v22 = t22 dmbutterfly0v24, v23, v24, v23, v2, v3, v4, v5, v6, v7 // v24 = t24a, v23 = t23a -.endm +ret +endfunc // Do an 32-point IDCT of a 8x32 slice out of a 32x32 matrix. // The 32-point IDCT can be decomposed into two 16-point IDCTs; @@ -925,6 +930,7 @@ function idct32_1d_8x32_pass1_neon cmp w3,
[libav-devel] [PATCH 06/11] arm: vp9itxfm: Do a simpler half/quarter idct16/idct32 when possible
This work is sponsored by, and copyright, Google. This increases the code size of libavcodec/arm/vp9itxfm_neon.o from 12316 to 15000 bytes. Before: vp9_inv_dct_dct_16x16_sub1_add_neon: 271.5188.7211.7235.1 vp9_inv_dct_dct_16x16_sub4_add_neon: 2086.9 1530.7 1769.8 1286.3 vp9_inv_dct_dct_16x16_sub8_add_neon: 2461.8 1805.2 2026.4 1527.5 vp9_inv_dct_dct_16x16_sub12_add_neon: 2838.8 2193.9 2297.6 1775.9 vp9_inv_dct_dct_16x16_sub16_add_neon: 3213.9 2468.6 2554.7 2005.5 vp9_inv_dct_dct_32x32_sub1_add_neon: 752.5457.5864.4554.7 vp9_inv_dct_dct_32x32_sub4_add_neon: 10687.2 7931.0 8621.0 6826.8 vp9_inv_dct_dct_32x32_sub8_add_neon: 11802.3 9222.5 9423.6 7571.4 vp9_inv_dct_dct_32x32_sub12_add_neon:12922.5 10579.7 10244.9 8321.2 vp9_inv_dct_dct_32x32_sub16_add_neon:14039.9 11840.0 11032.5 9068.3 vp9_inv_dct_dct_32x32_sub20_add_neon:15188.7 1.9 11849.5 9822.9 vp9_inv_dct_dct_32x32_sub24_add_neon:16277.9 14917.8 12653.1 10570.2 vp9_inv_dct_dct_32x32_sub28_add_neon:17392.6 16043.6 13424.9 11329.9 vp9_inv_dct_dct_32x32_sub32_add_neon:18508.1 17207.1 14236.0 12066.9 After: vp9_inv_dct_dct_16x16_sub1_add_neon: 271.5188.7211.7235.1 vp9_inv_dct_dct_16x16_sub4_add_neon: 1336.5 1012.5 1225.9860.7 vp9_inv_dct_dct_16x16_sub8_add_neon: 2023.2 1768.8 1868.1 1358.0 vp9_inv_dct_dct_16x16_sub12_add_neon: 2947.1 2228.9 2304.8 1795.7 vp9_inv_dct_dct_16x16_sub16_add_neon: 3247.9 2536.7 2547.0 2036.1 vp9_inv_dct_dct_32x32_sub1_add_neon: 751.5456.7863.5553.9 vp9_inv_dct_dct_32x32_sub4_add_neon: 8019.6 5868.0 6632.6 5134.4 vp9_inv_dct_dct_32x32_sub8_add_neon: 8808.1 6966.8 7198.0 5690.6 vp9_inv_dct_dct_32x32_sub12_add_neon:11291.5 10146.7 9628.8 7566.7 vp9_inv_dct_dct_32x32_sub16_add_neon:12159.2 11004.2 10373.3 8237.7 vp9_inv_dct_dct_32x32_sub20_add_neon:15230.9 13467.6 11841.1 9748.8 vp9_inv_dct_dct_32x32_sub24_add_neon:16361.5 14854.5 12677.6 10505.0 vp9_inv_dct_dct_32x32_sub28_add_neon:17497.8 15833.3 13493.0 11254.0 vp9_inv_dct_dct_32x32_sub32_add_neon:18591.8 17348.5 14355.5 12001.7 --- libavcodec/arm/vp9itxfm_neon.S | 351 ++--- 1 file changed, 331 insertions(+), 20 deletions(-) diff --git a/libavcodec/arm/vp9itxfm_neon.S b/libavcodec/arm/vp9itxfm_neon.S index d10de1e..99a5e1f 100644 --- a/libavcodec/arm/vp9itxfm_neon.S +++ b/libavcodec/arm/vp9itxfm_neon.S @@ -74,6 +74,14 @@ endconst vrshrn.s32 \out2, \tmpq4, #14 .endm +@ Same as mbutterfly0 above, but treating the input in in2 as zero, +@ writing the same output into both out1 and out2. +.macro mbutterfly0_h out1, out2, in1, in2, tmpd1, tmpd2, tmpq3, tmpq4 +vmull.s16 \tmpq3, \in1, d0[0] +vrshrn.s32 \out1, \tmpq3, #14 +vmov\out2, \out1 +.endm + @ out1,out2 = ((in1 + in2) * d0[0] + (1 << 13)) >> 14 @ out3,out4 = ((in1 - in2) * d0[0] + (1 << 13)) >> 14 @ Same as mbutterfly0, but with input being 2 q registers, output @@ -137,6 +145,23 @@ endconst vrshrn.s32 \inout2, \tmp2, #14 .endm +@ Same as mbutterfly above, but treating the input in inout2 as zero +.macro mbutterfly_h1 inout1, inout2, coef1, coef2, tmp1, tmp2 +vmull.s16 \tmp1, \inout1, \coef1 +vmull.s16 \tmp2, \inout1, \coef2 +vrshrn.s32 \inout1, \tmp1, #14 +vrshrn.s32 \inout2, \tmp2, #14 +.endm + +@ Same as mbutterfly above, but treating the input in inout1 as zero +.macro mbutterfly_h2 inout1, inout2, coef1, coef2, tmp1, tmp2 +vmull.s16 \tmp1, \inout2, \coef2 +vmull.s16 \tmp2, \inout2, \coef1 +vneg.s32\tmp1, \tmp1 +vrshrn.s32 \inout2, \tmp2, #14 +vrshrn.s32 \inout1, \tmp1, #14 +.endm + @ inout1,inout2 = (inout1,inout2 * coef1 - inout3,inout4 * coef2 + (1 << 13)) >> 14 @ inout3,inout4 = (inout1,inout2 * coef2 + inout3,inout4 * coef1 + (1 << 13)) >> 14 @ inout are 4 d registers, tmp are 4 q registers @@ -534,7 +559,7 @@ function idct16x16_dc_add_neon endfunc .ltorg -function idct16 +.macro idct16_full mbutterfly0 d16, d24, d16, d24, d4, d6, q2, q3 @ d16 = t0a, d24 = t1a mbutterfly d20, d28, d0[1], d0[2], q2, q3 @ d20 = t2a, d28 = t3a mbutterfly d18, d30, d0[3], d1[0], q2, q3 @ d18 = t4a, d30 = t7a @@ -556,7 +581,10 @@ function idct16 mbutterfly0 d22, d26, d22, d26, d18, d30, q9, q15 @ d22 = t6a, d26 = t5a mbutterfly d23, d25, d0[1], d0[2], q9, q15@ d23 = t9a, d25 = t14a mbutterfly d27, d21, d0[1], d0[2], q9, q15, neg=1 @ d27 = t13a, d21 = t10a +idct16_end +.endm +.macro idct16_end butterfly d18, d7, d4, d7@ d18 = t0a,
[libav-devel] [PATCH 10/11] aarch64: vp9itxfm: Do a simpler half/quarter idct16/idct32 when possible
This work is sponsored by, and copyright, Google. This increases the code size of libavcodec/aarch64/vp9itxfm_neon.o from 14784 to 18548 bytes. Before: vp9_inv_dct_dct_16x16_sub1_add_neon: 236.7 vp9_inv_dct_dct_16x16_sub4_add_neon: 1065.0 vp9_inv_dct_dct_16x16_sub8_add_neon: 1065.0 vp9_inv_dct_dct_16x16_sub12_add_neon: 1390.5 vp9_inv_dct_dct_16x16_sub16_add_neon: 1390.3 vp9_inv_dct_dct_32x32_sub1_add_neon: 556.5 vp9_inv_dct_dct_32x32_sub4_add_neon: 5203.8 vp9_inv_dct_dct_32x32_sub8_add_neon: 5199.8 vp9_inv_dct_dct_32x32_sub12_add_neon: 6172.3 vp9_inv_dct_dct_32x32_sub16_add_neon: 6176.1 vp9_inv_dct_dct_32x32_sub20_add_neon: 7144.5 vp9_inv_dct_dct_32x32_sub24_add_neon: 7143.7 vp9_inv_dct_dct_32x32_sub28_add_neon: 8114.2 vp9_inv_dct_dct_32x32_sub32_add_neon: 8112.0 After: vp9_inv_dct_dct_16x16_sub1_add_neon: 236.7 vp9_inv_dct_dct_16x16_sub4_add_neon: 714.2 vp9_inv_dct_dct_16x16_sub8_add_neon: 926.8 vp9_inv_dct_dct_16x16_sub12_add_neon: 1402.3 vp9_inv_dct_dct_16x16_sub16_add_neon: 1405.9 vp9_inv_dct_dct_32x32_sub1_add_neon: 554.1 vp9_inv_dct_dct_32x32_sub4_add_neon: 3958.8 vp9_inv_dct_dct_32x32_sub8_add_neon: 3958.8 vp9_inv_dct_dct_32x32_sub12_add_neon: 5461.1 vp9_inv_dct_dct_32x32_sub16_add_neon: 5467.4 vp9_inv_dct_dct_32x32_sub20_add_neon: 7175.4 vp9_inv_dct_dct_32x32_sub24_add_neon: 7172.5 vp9_inv_dct_dct_32x32_sub28_add_neon: 8136.8 vp9_inv_dct_dct_32x32_sub32_add_neon: 8135.9 I.e. in general a very minor overhead for the full subpartition case due to the additional cmps, but a significant speedup for the cases when we only need to process a small part of the actual input data. --- libavcodec/aarch64/vp9itxfm_neon.S | 369 +++-- 1 file changed, 349 insertions(+), 20 deletions(-) diff --git a/libavcodec/aarch64/vp9itxfm_neon.S b/libavcodec/aarch64/vp9itxfm_neon.S index ef40a5a..d74245f 100644 --- a/libavcodec/aarch64/vp9itxfm_neon.S +++ b/libavcodec/aarch64/vp9itxfm_neon.S @@ -75,6 +75,16 @@ endconst .endif .endm +// Same as dmbutterfly0 above, but treating the input in in2 as zero, +// writing the same output into both out1 and out2. +.macro dmbutterfly0_h out1, out2, in1, in2, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6 +smull \tmp1\().4s, \in1\().4h, v0.h[0] +smull2 \tmp2\().4s, \in1\().8h, v0.h[0] +rshrn \out1\().4h, \tmp1\().4s, #14 +rshrn2 \out1\().8h, \tmp2\().4s, #14 +mov \out2\().16b, \out1\().16b +.endm + // out1,out2 = in1 * coef1 - in2 * coef2 // out3,out4 = in1 * coef2 + in2 * coef1 // out are 4 x .4s registers, in are 2 x .8h registers @@ -104,6 +114,43 @@ endconst rshrn2 \inout2\().8h, \tmp4\().4s, #14 .endm +// Same as dmbutterfly above, but treating the input in inout2 as zero +.macro dmbutterfly_h1 inout1, inout2, coef1, coef2, tmp1, tmp2, tmp3, tmp4 +smull \tmp1\().4s, \inout1\().4h, \coef1 +smull2 \tmp2\().4s, \inout1\().8h, \coef1 +smull \tmp3\().4s, \inout1\().4h, \coef2 +smull2 \tmp4\().4s, \inout1\().8h, \coef2 +rshrn \inout1\().4h, \tmp1\().4s, #14 +rshrn2 \inout1\().8h, \tmp2\().4s, #14 +rshrn \inout2\().4h, \tmp3\().4s, #14 +rshrn2 \inout2\().8h, \tmp4\().4s, #14 +.endm + +// Same as dmbutterfly above, but treating the input in inout1 as zero +.macro dmbutterfly_h2 inout1, inout2, coef1, coef2, tmp1, tmp2, tmp3, tmp4 +smull \tmp1\().4s, \inout2\().4h, \coef2 +smull2 \tmp2\().4s, \inout2\().8h, \coef2 +smull \tmp3\().4s, \inout2\().4h, \coef1 +smull2 \tmp4\().4s, \inout2\().8h, \coef1 +neg \tmp1\().4s, \tmp1\().4s +neg \tmp2\().4s, \tmp2\().4s +rshrn \inout2\().4h, \tmp3\().4s, #14 +rshrn2 \inout2\().8h, \tmp4\().4s, #14 +rshrn \inout1\().4h, \tmp1\().4s, #14 +rshrn2 \inout1\().8h, \tmp2\().4s, #14 +.endm + +.macro dsmull_h out1, out2, in, coef +smull \out1\().4s, \in\().4h, \coef +smull2 \out2\().4s, \in\().8h, \coef +.endm + +.macro drshrn_h out, in1, in2, shift +rshrn \out\().4h, \in1\().4s, \shift +rshrn2 \out\().8h, \in2\().4s, \shift +.endm + + // out1 = in1 + in2 // out2 = in1 - in2 .macro butterfly_8h out1, out2, in1, in2 @@ -463,7 +510,7 @@ function idct16x16_dc_add_neon ret endfunc -function idct16 +.macro idct16_full dmbutterfly0v16, v24, v16, v24, v2, v3, v4, v5, v6, v7 // v16 = t0a, v24 = t1a dmbutterfly v20, v28, v0.h[1], v0.h[2], v2, v3, v4, v5 // v20 = t2a, v28 = t3a dmbutterfly v18, v30, v0.h[3], v0.h[4], v2, v3, v4, v5 // v18 = t4a, v30 = t7a
[libav-devel] [PATCH 07/11] arm: vp9itxfm: Do full, separate functions for half/quarter idct16 and idct32
This work is sponsored by, and copyright, Google. This avoids having to fill the temp buffer with zeros for the skipped slices, and leads to slightly more straightforward code for these cases (for the 16x16 case, where the special case pass functions are written out instead of templated from the same macro), instead of riddling the common code with special case branches or macro .ifs. The code size increases from 15000 bytes to 19864 bytes. Before: vp9_inv_dct_dct_16x16_sub1_add_neon: 271.5188.7211.7235.1 vp9_inv_dct_dct_16x16_sub4_add_neon: 1336.5 1012.5 1225.9860.7 vp9_inv_dct_dct_16x16_sub8_add_neon: 2023.2 1768.8 1868.1 1358.0 vp9_inv_dct_dct_16x16_sub12_add_neon: 2947.1 2228.9 2304.8 1795.7 vp9_inv_dct_dct_16x16_sub16_add_neon: 3247.9 2536.7 2547.0 2036.1 vp9_inv_dct_dct_32x32_sub1_add_neon: 751.5456.7863.5553.9 vp9_inv_dct_dct_32x32_sub4_add_neon: 8019.6 5868.0 6632.6 5134.4 vp9_inv_dct_dct_32x32_sub8_add_neon: 8808.1 6966.8 7198.0 5690.6 vp9_inv_dct_dct_32x32_sub12_add_neon:11291.5 10146.7 9628.8 7566.7 vp9_inv_dct_dct_32x32_sub16_add_neon:12159.2 11004.2 10373.3 8237.7 vp9_inv_dct_dct_32x32_sub20_add_neon:15230.9 13467.6 11841.1 9748.8 vp9_inv_dct_dct_32x32_sub24_add_neon:16361.5 14854.5 12677.6 10505.0 vp9_inv_dct_dct_32x32_sub28_add_neon:17497.8 15833.3 13493.0 11254.0 vp9_inv_dct_dct_32x32_sub32_add_neon:18591.8 17348.5 14355.5 12001.7 After: vp9_inv_dct_dct_16x16_sub1_add_neon: 271.5188.7211.7235.1 vp9_inv_dct_dct_16x16_sub4_add_neon: 1209.5863.9 1034.7764.7 vp9_inv_dct_dct_16x16_sub8_add_neon: 1915.8 1590.9 1739.0 1281.7 vp9_inv_dct_dct_16x16_sub12_add_neon: 2850.5 2204.3 2292.1 1779.8 vp9_inv_dct_dct_16x16_sub16_add_neon: 3240.1 2490.6 2555.8 2009.9 vp9_inv_dct_dct_32x32_sub1_add_neon: 751.5458.9863.5553.9 vp9_inv_dct_dct_32x32_sub4_add_neon: 7566.3 5721.3 6043.8 4920.7 vp9_inv_dct_dct_32x32_sub8_add_neon: 8366.1 6786.5 6594.1 5476.2 vp9_inv_dct_dct_32x32_sub12_add_neon:10980.0 9885.5 9237.7 7436.5 vp9_inv_dct_dct_32x32_sub16_add_neon:11917.3 11156.8 9963.0 8113.0 vp9_inv_dct_dct_32x32_sub20_add_neon:15201.3 13632.9 11844.2 9819.9 vp9_inv_dct_dct_32x32_sub24_add_neon:16333.8 14541.2 12654.5 10580.7 vp9_inv_dct_dct_32x32_sub28_add_neon:17459.1 16165.8 13450.0 11325.3 vp9_inv_dct_dct_32x32_sub32_add_neon:18612.2 17386.7 14281.6 12065.8 --- This reverts parts of the previous commit (changing some register uses to another register); if both are to be applied, they should be applied squashed together. (And similarly for review, it's much easier to squash the two and review the end result.) They are presented sequentially as two steps, to show the effect on runtime and code size of each alternative. --- libavcodec/arm/vp9itxfm_neon.S | 532 + 1 file changed, 374 insertions(+), 158 deletions(-) diff --git a/libavcodec/arm/vp9itxfm_neon.S b/libavcodec/arm/vp9itxfm_neon.S index 99a5e1f..b6c23c8 100644 --- a/libavcodec/arm/vp9itxfm_neon.S +++ b/libavcodec/arm/vp9itxfm_neon.S @@ -745,6 +745,42 @@ function iadst16 bx lr endfunc +.macro load_add_store coef0, coef1, coef2, coef3 +vrshr.s16 \coef0, \coef0, #6 +vrshr.s16 \coef1, \coef1, #6 + +vld1.32 {d4[]}, [r0,:32], r1 +vld1.32 {d4[1]}, [r3,:32], r1 +vrshr.s16 \coef2, \coef2, #6 +vrshr.s16 \coef3, \coef3, #6 +vld1.32 {d5[]}, [r0,:32], r1 +vld1.32 {d5[1]}, [r3,:32], r1 +vaddw.u8\coef0, \coef0, d4 +vld1.32 {d6[]}, [r0,:32], r1 +vld1.32 {d6[1]}, [r3,:32], r1 +vaddw.u8\coef1, \coef1, d5 +vld1.32 {d7[]}, [r0,:32], r1 +vld1.32 {d7[1]}, [r3,:32], r1 + +vqmovun.s16 d4, \coef0 +vqmovun.s16 d5, \coef1 +sub r0, r0, r1, lsl #2 +sub r3, r3, r1, lsl #2 +vaddw.u8\coef2, \coef2, d6 +vaddw.u8\coef3, \coef3, d7 +vst1.32 {d4[0]}, [r0,:32], r1 +vst1.32 {d4[1]}, [r3,:32], r1 +vqmovun.s16 d6, \coef2 +vst1.32 {d5[0]}, [r0,:32], r1 +vst1.32 {d5[1]}, [r3,:32], r1 +vqmovun.s16 d7, \coef3 + +vst1.32 {d6[0]}, [r0,:32], r1 +vst1.32 {d6[1]}, [r3,:32], r1 +vst1.32 {d7[0]}, [r0,:32], r1 +vst1.32 {d7[1]}, [r3,:32], r1 +.endm + .macro itxfm16_1d_funcs txfm @ Read a vertical 4x16 slice out of a 16x16 matrix, do a transform on it, @ transpose into a horizontal 16x4 slice and store. @@ -763,40 +799,13 @@ function \txfm\()16_1d_4x16_pass1_neon
[libav-devel] [PATCH 11/11] aarch64: vp9itxfm: Do full, separate functions for half/quarter idct16 and idct32
This work is sponsored by, and copyright, Google. This avoids having to fill the temp buffer with zeros for the skipped slices, and leads to slightly more straightforward code for these cases (for the 16x16 case, where the special case pass functions are written out instead of templated from the same macro), instead of riddling the common code with special case branches or macro .ifs. The code size increases from 18548 bytes to 24580 bytes. Before: vp9_inv_dct_dct_16x16_sub1_add_neon: 236.7 vp9_inv_dct_dct_16x16_sub4_add_neon: 714.2 vp9_inv_dct_dct_16x16_sub8_add_neon: 926.8 vp9_inv_dct_dct_16x16_sub12_add_neon: 1402.3 vp9_inv_dct_dct_16x16_sub16_add_neon: 1405.9 vp9_inv_dct_dct_32x32_sub1_add_neon: 554.1 vp9_inv_dct_dct_32x32_sub4_add_neon: 3958.8 vp9_inv_dct_dct_32x32_sub8_add_neon: 3958.8 vp9_inv_dct_dct_32x32_sub12_add_neon: 5461.1 vp9_inv_dct_dct_32x32_sub16_add_neon: 5467.4 vp9_inv_dct_dct_32x32_sub20_add_neon: 7175.4 vp9_inv_dct_dct_32x32_sub24_add_neon: 7172.5 vp9_inv_dct_dct_32x32_sub28_add_neon: 8136.8 vp9_inv_dct_dct_32x32_sub32_add_neon: 8135.9 After: vp9_inv_dct_dct_16x16_sub1_add_neon: 236.7 vp9_inv_dct_dct_16x16_sub4_add_neon: 644.0 vp9_inv_dct_dct_16x16_sub8_add_neon: 854.0 vp9_inv_dct_dct_16x16_sub12_add_neon: 1393.8 vp9_inv_dct_dct_16x16_sub16_add_neon: 1392.6 vp9_inv_dct_dct_32x32_sub1_add_neon: 556.6 vp9_inv_dct_dct_32x32_sub4_add_neon: 3684.3 vp9_inv_dct_dct_32x32_sub8_add_neon: 3682.6 vp9_inv_dct_dct_32x32_sub12_add_neon: 5316.3 vp9_inv_dct_dct_32x32_sub16_add_neon: 5315.9 vp9_inv_dct_dct_32x32_sub20_add_neon: 7146.4 vp9_inv_dct_dct_32x32_sub24_add_neon: 7151.5 vp9_inv_dct_dct_32x32_sub28_add_neon: 8118.8 vp9_inv_dct_dct_32x32_sub32_add_neon: 8117.5 --- This reverts parts of the previous commit (changing some register uses to another register); if both are to be applied, they should be applied squashed together. (And similarly for review, it's much easier to squash the two and review the end result.) They are presented sequentially as two steps, to show the effect on runtime and code size of each alternative. --- libavcodec/aarch64/vp9itxfm_neon.S | 737 +++-- 1 file changed, 458 insertions(+), 279 deletions(-) diff --git a/libavcodec/aarch64/vp9itxfm_neon.S b/libavcodec/aarch64/vp9itxfm_neon.S index d74245f..78041d3 100644 --- a/libavcodec/aarch64/vp9itxfm_neon.S +++ b/libavcodec/aarch64/vp9itxfm_neon.S @@ -710,6 +710,51 @@ endfunc st1 {v2.8h}, [\src], \inc .endm +.macro load_add_store coef0, coef1, coef2, coef3, coef4, coef5, coef6, coef7, tmp1, tmp2 +srshr \coef0, \coef0, #6 +ld1 {v2.8b}, [x0], x1 +srshr \coef1, \coef1, #6 +ld1 {v3.8b}, [x3], x1 +srshr \coef2, \coef2, #6 +ld1 {v4.8b}, [x0], x1 +srshr \coef3, \coef3, #6 +uaddw \coef0, \coef0, v2.8b +ld1 {v5.8b}, [x3], x1 +uaddw \coef1, \coef1, v3.8b +srshr \coef4, \coef4, #6 +ld1 {v6.8b}, [x0], x1 +srshr \coef5, \coef5, #6 +ld1 {v7.8b}, [x3], x1 +sqxtun v2.8b, \coef0 +srshr \coef6, \coef6, #6 +sqxtun v3.8b, \coef1 +srshr \coef7, \coef7, #6 +uaddw \coef2, \coef2, v4.8b +ld1 {\tmp1}, [x0], x1 +uaddw \coef3, \coef3, v5.8b +ld1 {\tmp2}, [x3], x1 +sqxtun v4.8b, \coef2 +sub x0, x0, x1, lsl #2 +sub x3, x3, x1, lsl #2 +sqxtun v5.8b, \coef3 +uaddw \coef4, \coef4, v6.8b +st1 {v2.8b}, [x0], x1 +uaddw \coef5, \coef5, v7.8b +st1 {v3.8b}, [x3], x1 +sqxtun v6.8b, \coef4 +st1 {v4.8b}, [x0], x1 +sqxtun v7.8b, \coef5 +st1 {v5.8b}, [x3], x1 +uaddw \coef6, \coef6, \tmp1 +st1 {v6.8b}, [x0], x1 +uaddw \coef7, \coef7, \tmp2 +st1 {v7.8b}, [x3], x1 +sqxtun \tmp1, \coef6 +sqxtun \tmp2, \coef7 +st1 {\tmp1}, [x0], x1 +st1 {\tmp2}, [x3], x1 +.endm + // Read a vertical 8x16 slice out of a 16x16 matrix, do a transform on it, // transpose into a horizontal 16x8 slice and store. // x0 = dst (temp buffer) @@ -728,37 +773,12 @@ function \txfm\()16_1d_8x16_pass1_neon mov x9, #32 moviv2.8h, #0 - -.ifc \txfm,idct -cmp w3, #10 -b.le3f -cmp w3, #38 -b.le4f
[libav-devel] [PATCH] rtmpdh: Do global initialization before running the test
The rtmpdh code can use crypto libraries which may require a process global init. (gcrypt is one of the libraries where the rtmpdh test code can fail if global init hasn't been done, depending on gcrypt version.) --- libavformat/tests/rtmpdh.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/libavformat/tests/rtmpdh.c b/libavformat/tests/rtmpdh.c index c25ca91..be7ea5a 100644 --- a/libavformat/tests/rtmpdh.c +++ b/libavformat/tests/rtmpdh.c @@ -17,6 +17,7 @@ */ #include "libavformat/rtmpdh.c" +#include "libavformat/avformat.h" #include @@ -150,6 +151,7 @@ fail: int main(void) { +avformat_network_init(); if (test_random_shared_secret() < 0) return 1; if (test_ref_data() < 0) -- 2.7.4 ___ libav-devel mailing list libav-devel@libav.org https://lists.libav.org/mailman/listinfo/libav-devel
Re: [libav-devel] [PATCH] rtsp: Refactor ff_rtsp_fetch_packet
On Wed, 23 Nov 2016, Luca Barbato wrote: Move the actual data parsing and the queued stream selection in separate functions. --- Before I continue chopping that function in smaller bits I want your opinion. The plan is to further reshape this quite complex loop so at least it is easier to see what's happening. Then I'd add move the receive buffer size as an option and possibly keep the networking side in a separate thread to avoid overruns when the bitrate is much larger than the current design expects. I'm not too eager about this in particular, but some refactoring here probably can be good, as long as it doesn't break anything. libavformat/rtsp.c | 106 - libavformat/rtsp.h | 1 + 2 files changed, 66 insertions(+), 41 deletions(-) diff --git a/libavformat/rtsp.c b/libavformat/rtsp.c index 7e59857..15ea393 100644 --- a/libavformat/rtsp.c +++ b/libavformat/rtsp.c @@ -2043,11 +2043,68 @@ static int pick_stream(AVFormatContext *s, RTSPStream **rtsp_st, return AVERROR(EAGAIN); } +static int64_t parse_packet(RTSPState *rt, AVPacket *pkt) +{ +int ret; I'm not sure this name is quite right; this only does the simpler version of parsing out more packets out of the existing buffers; further down in ff_rtsp_fetch_packet there's the full version which also sends RTCP RR and other RTCP feedback, and syncs the RTCP based timestamps for other streams. Ideally I guess we should integrate both of them into the same function, and just do the RTCP sync and feedback if a specific flag is set to the function. + +if (rt->transport == RTSP_TRANSPORT_RDT) { +ret = ff_rdt_parse_packet(rt->cur_transport_priv, pkt, NULL, 0); +} else if (rt->transport == RTSP_TRANSPORT_RTP) { +ret = ff_rtp_parse_packet(rt->cur_transport_priv, pkt, NULL, 0); +} else if (CONFIG_RTPDEC && rt->ts) { +ret = ff_mpegts_parse_packet(rt->ts, pkt, + rt->recvbuf + rt->recvbuf_pos, + rt->recvbuf_len - rt->recvbuf_pos); +if (ret >= 0) { +rt->recvbuf_pos += ret; +ret = rt->recvbuf_pos < rt->recvbuf_len; +} +} else +ret = -1; + +if (ret == 0) { +rt->cur_transport_priv = NULL; +return 0; +} else if (ret == 1) +return 0; + +return ret; +} + +static int select_queue_stream(AVFormatContext *s) +{ +RTSPState *rt = s->priv_data; +int i; +int64_t wait_end; +int64_t first_queue_time = 0; + +for (i = 0; i < rt->nb_rtsp_streams; i++) { +RTPDemuxContext *rtpctx = rt->rtsp_streams[i]->transport_priv; +int64_t queue_time; +if (!rtpctx) +continue; +queue_time = ff_rtp_queued_packet_time(rtpctx); +if (queue_time && (queue_time - first_queue_time < 0 || + !first_queue_time)) { +first_queue_time = queue_time; +rt->first_queue_st = rt->rtsp_streams[i]; +} +} +if (first_queue_time) { +wait_end = first_queue_time + s->max_delay; +} else { +wait_end = 0; +rt->first_queue_st = NULL; +} I'd prefer this to pass first_queue_st as a pointer to this function, keeping it as a local variable in the caller for now. // Martin ___ libav-devel mailing list libav-devel@libav.org https://lists.libav.org/mailman/listinfo/libav-devel
[libav-devel] [PATCHv2] aarch64: vp9itxfm: Don't repeatedly set x9 when nothing overwrites it
--- libavcodec/aarch64/vp9itxfm_neon.S | 26 +++--- 1 file changed, 15 insertions(+), 11 deletions(-) diff --git a/libavcodec/aarch64/vp9itxfm_neon.S b/libavcodec/aarch64/vp9itxfm_neon.S index 2dc6b75..f4194a6 100644 --- a/libavcodec/aarch64/vp9itxfm_neon.S +++ b/libavcodec/aarch64/vp9itxfm_neon.S @@ -599,9 +599,9 @@ endfunc // x1 = unused // x2 = src // x3 = slice offset +// x9 = input stride .macro itxfm16_1d_funcs txfm function \txfm\()16_1d_8x16_pass1_neon -mov x9, #32 moviv2.8h, #0 .irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31 load_clear \i, x2, x9 @@ -649,8 +649,8 @@ endfunc // x1 = dst stride // x2 = src (temp buffer) // x3 = slice offset +// x9 = temp buffer stride function \txfm\()16_1d_8x16_pass2_neon -mov x9, #32 .irp i, 16, 17, 18, 19, 20, 21, 22, 23 load\i, x2, x9 .endr @@ -747,6 +747,7 @@ function ff_vp9_\txfm1\()_\txfm2\()_16x16_add_neon, export=1 .ifc \txfm1,idct ld1 {v0.8h,v1.8h}, [x10] .endif +mov x9, #32 .irp i, 0, 8 add x0, sp, #(\i*32) @@ -882,13 +883,12 @@ endfunc // x0 = dst (temp buffer) // x1 = unused // x2 = src +// x9 = double input stride // x10 = idct_coeffs // x11 = idct_coeffs + 32 function idct32_1d_8x32_pass1_neon ld1 {v0.8h,v1.8h}, [x10] -// Double stride of the input, since we only read every other line -mov x9, #128 moviv4.8h, #0 // v16 = IN(0), v17 = IN(2) ... v31 = IN(30) @@ -987,12 +987,13 @@ endfunc // x0 = dst // x1 = dst stride // x2 = src (temp buffer) +// x7 = negative double temp buffer stride +// x9 = double temp buffer stride // x10 = idct_coeffs // x11 = idct_coeffs + 32 function idct32_1d_8x32_pass2_neon ld1 {v0.8h,v1.8h}, [x10] -mov x9, #128 // v16 = IN(0), v17 = IN(2) ... v31 = IN(30) .irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31 ld1 {v\i\().8h}, [x2], x9 @@ -1001,7 +1002,6 @@ function idct32_1d_8x32_pass2_neon idct16 -mov x9, #128 .irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31 st1 {v\i\().8h}, [x2], x9 .endr @@ -1018,11 +1018,10 @@ function idct32_1d_8x32_pass2_neon idct32_odd -mov x9, #128 .macro load_acc_store a, b, c, d, neg=0 +.if \neg == 0 ld1 {v4.8h}, [x2], x9 ld1 {v5.8h}, [x2], x9 -.if \neg == 0 add v4.8h, v4.8h, v\a\().8h ld1 {v6.8h}, [x2], x9 add v5.8h, v5.8h, v\b\().8h @@ -1030,10 +1029,12 @@ function idct32_1d_8x32_pass2_neon add v6.8h, v6.8h, v\c\().8h add v7.8h, v7.8h, v\d\().8h .else +ld1 {v4.8h}, [x2], x7 +ld1 {v5.8h}, [x2], x7 sub v4.8h, v4.8h, v\a\().8h -ld1 {v6.8h}, [x2], x9 +ld1 {v6.8h}, [x2], x7 sub v5.8h, v5.8h, v\b\().8h -ld1 {v7.8h}, [x2], x9 +ld1 {v7.8h}, [x2], x7 sub v6.8h, v6.8h, v\c\().8h sub v7.8h, v7.8h, v\d\().8h .endif @@ -1064,7 +1065,6 @@ function idct32_1d_8x32_pass2_neon load_acc_store 23, 22, 21, 20 load_acc_store 19, 18, 17, 16 sub x2, x2, x9 -neg x9, x9 load_acc_store 16, 17, 18, 19, 1 load_acc_store 20, 21, 22, 23, 1 load_acc_store 24, 25, 26, 27, 1 @@ -1093,6 +1093,10 @@ function ff_vp9_idct_idct_32x32_add_neon, export=1 mov x5, x1 mov x6, x2 +// Double stride of the input, since we only read every other line +mov x9, #128 +neg x7, x9 + .irp i, 0, 8, 16, 24 add x0, sp, #(\i*64) add x2, x6, #(\i*2) -- 2.7.4 ___ libav-devel mailing list libav-devel@libav.org https://lists.libav.org/mailman/listinfo/libav-devel
Re: [libav-devel] [PATCH] rtmpdh: Do global initialization before running the test
On Thu, 24 Nov 2016, Diego Biurrun wrote: On Wed, Nov 23, 2016 at 11:35:08PM +0200, Martin Storsjö wrote: The rtmpdh code can use crypto libraries which may require a process global init. (gcrypt is one of the libraries where the rtmpdh test code can fail if global init hasn't been done, depending on gcrypt version.) --- libavformat/tests/rtmpdh.c | 2 ++ 1 file changed, 2 insertions(+) As you predicted, this does not fail my particular test case. does not fix, I presume To fix that, you could just as an experiment try this: diff --git a/libavformat/tls_gnutls.c b/libavformat/tls_gnutls.c index 3e29a45..abc9e69 100644 --- a/libavformat/tls_gnutls.c +++ b/libavformat/tls_gnutls.c @@ -35,7 +35,8 @@ #include "libavutil/opt.h" #include "libavutil/parseutils.h" -#if HAVE_THREADS && GNUTLS_VERSION_NUMBER <= 0x020b00 +#if CONFIG_GCRYPT #include #include "libavutil/thread.h" GCRY_THREAD_OPTION_PTHREAD_IMPL; @@ -52,10 +53,10 @@ typedef struct TLSContext { void ff_gnutls_init(void) { avpriv_lock_avformat(); -#if HAVE_THREADS && GNUTLS_VERSION_NUMBER < 0x020b00 +#if CONFIG_GCRYPT if (gcry_control(GCRYCTL_ANY_INITIALIZATION_P) == 0) gcry_control(GCRYCTL_SET_THREAD_CBS, &gcry_threads_pthread); #endif gnutls_global_init(); avpriv_unlock_avformat(); } But I'm quite ok with dropping gcrypt support altogether. --- a/libavformat/tests/rtmpdh.c +++ b/libavformat/tests/rtmpdh.c @@ -17,6 +17,7 @@ #include "libavformat/rtmpdh.c" +#include "libavformat/avformat.h" nit: order Fixed @@ -150,6 +151,7 @@ fail: int main(void) { +avformat_network_init(); if (test_random_shared_secret() < 0) return 1; if (test_ref_data() < 0) LGTM Pushed to master Please push to the 12 branch as well. Will try to remember to do that later, I'm short on time right now. Or feel free to do that for me if you beat me to it. // Martin ___ libav-devel mailing list libav-devel@libav.org https://lists.libav.org/mailman/listinfo/libav-devel
Re: [libav-devel] [PATCH] rtmpdh: Do global initialization before running the test
On Thu, 24 Nov 2016, Diego Biurrun wrote: To fix that, you could just as an experiment try this: diff --git a/libavformat/tls_gnutls.c b/libavformat/tls_gnutls.c index 3e29a45..abc9e69 100644 --- a/libavformat/tls_gnutls.c +++ b/libavformat/tls_gnutls.c @@ -35,7 +35,8 @@ #include "libavutil/opt.h" #include "libavutil/parseutils.h" -#if HAVE_THREADS && GNUTLS_VERSION_NUMBER <= 0x020b00 +#if CONFIG_GCRYPT #include #include "libavutil/thread.h" GCRY_THREAD_OPTION_PTHREAD_IMPL; @@ -52,10 +53,10 @@ typedef struct TLSContext { void ff_gnutls_init(void) { avpriv_lock_avformat(); -#if HAVE_THREADS && GNUTLS_VERSION_NUMBER < 0x020b00 +#if CONFIG_GCRYPT if (gcry_control(GCRYCTL_ANY_INITIALIZATION_P) == 0) gcry_control(GCRYCTL_SET_THREAD_CBS, &gcry_threads_pthread); #endif gnutls_global_init(); avpriv_unlock_avformat(); } That does the trick, please push :) No, since this fix is not appropriate. It's not quite straightforward though. Notice how we don't have a separate --enable-gcrypt or --enable-gmp; this is kinda intentional. You're not supposed to use gcrypt because you want to, only if this happens to be the backend that your gnutls backend uses. Now the existing code is the correct (AFAIK) way of initializing gnutls for use in a threadsafe way; in older versions when gcrypt was the only available backend, the caller had to do it manually. In preparation for making gnutls use a different backend, this was folded into gnutls_global_init, and in newer gnutls versions, nettle/hogweed/gmp is the only backend. (And they don't need the global init.) If we'd consider gcrypt and gmp features that can be enabled/disabled separately from gnutls, we'd need to have some sort of init like this somewhere, but tls_gnutls.c surely isn't the right spot for it in that case. Prior to 57cde2b180fcec0eaf60aad65f436ab6420546f5, we had this init in network.c, where it was done if gnutls and gcrypt were enabled. But after that commit, this init actually only gets done if the tls protocol is enabled. This was pointed out in review IIRC, but the cleanup achieved is worth more than the fact that a build with --enable-gnutls --disable-protocol=tls won't init gnutls/gcrypt correctly for the rtmpdh code. To fix this correctly, we should have a separate, similar init for the crypto backends used by the rtmpdh code, not related to the tls protocols (called from within the same avformat_network_init). So far, this have felt like too much of a corner case to need to be cared about though. If we drop gcrypt, the issue goes away even more, since gmp doesn't need any global init. The libcrypt half of OpenSSL does need some sort of init in order to be threadsafe (but it doesn't fail if not inited, like some versions of gcrypt). So in such a setup (--enable-openssl --disable-protocol=tls), the rtmpdh code won't be threadsafe. // Martin ___ libav-devel mailing list libav-devel@libav.org https://lists.libav.org/mailman/listinfo/libav-devel
Re: [libav-devel] [PATCH 05/15] build: Drop gcrypt support
On Thu, 24 Nov 2016, Diego Biurrun wrote: GnuTLS in combination with gcrypt has been deprecated since 2010. --- This should be done after Martin pushes his patch to fix gcrypt support. No, there's nothing more to fix wrt gcrypt right now, feel free to proceed with this. // Martin ___ libav-devel mailing list libav-devel@libav.org https://lists.libav.org/mailman/listinfo/libav-devel
[libav-devel] [PATCH 3/3] aarch64: vp9itxfm: Skip empty slices in the first pass of idct_idct 16x16 and 32x32
This work is sponsored by, and copyright, Google. Previously all subpartitions except the eob=1 (DC) case ran with the same runtime: vp9_inv_dct_dct_16x16_sub16_add_neon: 1373.2 vp9_inv_dct_dct_32x32_sub32_add_neon: 8089.0 By skipping individual 8x16 or 8x32 pixel slices in the first pass, we reduce the runtime of these functions like this: vp9_inv_dct_dct_16x16_sub1_add_neon: 235.3 vp9_inv_dct_dct_16x16_sub2_add_neon:1043.7 vp9_inv_dct_dct_16x16_sub4_add_neon:1045.3 vp9_inv_dct_dct_16x16_sub8_add_neon:1043.7 vp9_inv_dct_dct_16x16_sub12_add_neon: 1374.0 vp9_inv_dct_dct_16x16_sub16_add_neon: 1368.7 vp9_inv_dct_dct_32x32_sub1_add_neon: 555.6 vp9_inv_dct_dct_32x32_sub2_add_neon:5180.0 vp9_inv_dct_dct_32x32_sub4_add_neon:5175.1 vp9_inv_dct_dct_32x32_sub8_add_neon:5186.6 vp9_inv_dct_dct_32x32_sub12_add_neon: 6159.5 vp9_inv_dct_dct_32x32_sub16_add_neon: 6162.7 vp9_inv_dct_dct_32x32_sub20_add_neon: 7129.0 vp9_inv_dct_dct_32x32_sub24_add_neon: 7133.1 vp9_inv_dct_dct_32x32_sub28_add_neon: 8107.1 vp9_inv_dct_dct_32x32_sub32_add_neon: 8105.6 I.e. in general a very minor overhead for the full subpartition case due to the additional cmps, but a significant speedup for the cases when we only need to process a small part of the actual input data. --- Updated based on Janne's review of the arm version. --- libavcodec/aarch64/vp9itxfm_neon.S | 60 ++ 1 file changed, 55 insertions(+), 5 deletions(-) diff --git a/libavcodec/aarch64/vp9itxfm_neon.S b/libavcodec/aarch64/vp9itxfm_neon.S index f4194a6..9d2ba11 100644 --- a/libavcodec/aarch64/vp9itxfm_neon.S +++ b/libavcodec/aarch64/vp9itxfm_neon.S @@ -588,6 +588,9 @@ endfunc .macro store i, dst, inc st1 {v\i\().8h}, [\dst], \inc .endm +.macro movi_v i, size, imm +moviv\i\()\size, \imm +.endm .macro load_clear i, src, inc ld1 {v\i\().8h}, [\src] st1 {v2.8h}, [\src], \inc @@ -596,9 +599,8 @@ endfunc // Read a vertical 8x16 slice out of a 16x16 matrix, do a transform on it, // transpose into a horizontal 16x8 slice and store. // x0 = dst (temp buffer) -// x1 = unused +// x1 = slice offset // x2 = src -// x3 = slice offset // x9 = input stride .macro itxfm16_1d_funcs txfm function \txfm\()16_1d_8x16_pass1_neon @@ -616,14 +618,14 @@ function \txfm\()16_1d_8x16_pass1_neon transpose_8x8H v24, v25, v26, v27, v28, v29, v30, v31, v2, v3 // Store the transposed 8x8 blocks horizontally. -cmp x3, #8 +cmp x1, #8 b.eq1f .irp i, 16, 24, 17, 25, 18, 26, 19, 27, 20, 28, 21, 29, 22, 30, 23, 31 store \i, x0, #16 .endr ret 1: -// Special case: For the last input column (x3 == 8), +// Special case: For the last input column (x1 == 8), // which would be stored as the last row in the temp buffer, // don't store the first 8x8 block, but keep it in registers // for the first slice of the second pass (where it is the @@ -751,13 +753,35 @@ function ff_vp9_\txfm1\()_\txfm2\()_16x16_add_neon, export=1 .irp i, 0, 8 add x0, sp, #(\i*32) +.ifc \txfm1\()_\txfm2,idct_idct +.if \i == 8 +cmp w3, #38 +b.le1f +.endif +.endif +mov x1, #\i add x2, x6, #(\i*2) -mov x3, #\i bl \txfm1\()16_1d_8x16_pass1_neon .endr .ifc \txfm1\()_\txfm2,iadst_idct ld1 {v0.8h,v1.8h}, [x10] .endif + +.ifc \txfm1\()_\txfm2,idct_idct +b 3f +1: +// Set v24-v31 to zero, for the in-register passthrough of +// coefficients to pass 2. Since we only do two slices, this can +// only ever happen for the second slice. So we only need to store +// zeros to the temp buffer for the second half of the buffer. +.irp i, 24, 25, 26, 27, 28, 29, 30, 31 +add x0, x0, #16 +movi_v \i, .16b, #0 +store 24, x0, #16 +.endr +3: +.endif + .irp i, 0, 8 add x0, x4, #(\i) mov x1, x5 @@ -1073,12 +1097,17 @@ function idct32_1d_8x32_pass2_neon ret endfunc +const min_eob_idct_idct_32, align=4 +.short 0, 34, 135, 336 +endconst + function ff_vp9_idct_idct_32x32_add_neon, export=1 cmp w3, #1 b.eqidct32x32_dc_add_neon movrel x10, idct_coeffs add x11, x10, #32 +movrel x12, min_eob_idct_idct_32 + 2 mov x15, x30 @@ -1099,9 +1128,30 @@ function ff_vp9_idct_idct_32x32_add_neon, export=1 .irp i, 0, 8, 16, 24 add x0, sp, #(\i*64) +.if \i > 0 +ldrhw1, [x12], #2 +cmp w3, w1 +mov x
[libav-devel] [PATCH 1/3] arm: vp9itxfm: Only reload the idct coeffs for the iadst_idct combination
This avoids reloading them if they haven't been clobbered, if the first pass also was idct. This is similar to what was done in the aarch64 version. --- libavcodec/arm/vp9itxfm_neon.S | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/libavcodec/arm/vp9itxfm_neon.S b/libavcodec/arm/vp9itxfm_neon.S index 01944bd..2049241 100644 --- a/libavcodec/arm/vp9itxfm_neon.S +++ b/libavcodec/arm/vp9itxfm_neon.S @@ -814,7 +814,7 @@ A and r7, sp, #15 mov r3, #\i bl \txfm1\()16_1d_4x16_pass1_neon .endr -.ifc \txfm2,idct +.ifc \txfm1\()_\txfm2,iadst_idct movrel r12, idct_coeffs vld1.16 {q0-q1}, [r12,:128] .endif -- 2.7.4 ___ libav-devel mailing list libav-devel@libav.org https://lists.libav.org/mailman/listinfo/libav-devel
[libav-devel] [PATCH 2/3] arm: vp9itxfm: Skip empty slices in the first pass of idct_idct 16x16 and 32x32
This work is sponsored by, and copyright, Google. Previously all subpartitions except the eob=1 (DC) case ran with the same runtime: vp9_inv_dct_dct_16x16_sub16_add_neon: 3188.1 2435.4 2499.0 1969.0 vp9_inv_dct_dct_32x32_sub32_add_neon: 18531.7 16582.3 14207.6 12000.3 By skipping individual 4x16 or 4x32 pixel slices in the first pass, we reduce the runtime of these functions like this: vp9_inv_dct_dct_16x16_sub1_add_neon: 274.6189.5211.7235.8 vp9_inv_dct_dct_16x16_sub2_add_neon:2064.0 1534.8 1719.4 1248.7 vp9_inv_dct_dct_16x16_sub4_add_neon:2135.0 1477.2 1736.3 1249.5 vp9_inv_dct_dct_16x16_sub8_add_neon:2446.7 1828.7 1993.6 1494.7 vp9_inv_dct_dct_16x16_sub12_add_neon: 2832.4 2118.3 2266.5 1735.1 vp9_inv_dct_dct_16x16_sub16_add_neon: 3211.7 2475.3 2523.5 1983.1 vp9_inv_dct_dct_32x32_sub1_add_neon: 756.2456.7862.0553.9 vp9_inv_dct_dct_32x32_sub2_add_neon: 10682.2 8190.4 8539.2 6762.5 vp9_inv_dct_dct_32x32_sub4_add_neon: 10813.5 8014.9 8518.3 6762.8 vp9_inv_dct_dct_32x32_sub8_add_neon: 11859.6 9313.0 9347.4 7514.5 vp9_inv_dct_dct_32x32_sub12_add_neon: 12946.6 10752.4 10192.2 8280.2 vp9_inv_dct_dct_32x32_sub16_add_neon: 14074.6 11946.5 11001.4 9008.6 vp9_inv_dct_dct_32x32_sub20_add_neon: 15269.9 13662.7 11816.1 9762.6 vp9_inv_dct_dct_32x32_sub24_add_neon: 16327.9 14940.1 12626.7 10516.0 vp9_inv_dct_dct_32x32_sub28_add_neon: 17462.7 15776.1 13446.2 11264.7 vp9_inv_dct_dct_32x32_sub32_add_neon: 18575.5 17157.0 14249.3 12015.1 I.e. in general a very minor overhead for the full subpartition case due to the additional loads and cmps, but a significant speedup for the cases when we only need to process a small part of the actual input data. In common VP9 content in a few inspected clips, 70-90% of the non-dc-only 16x16 and 32x32 IDCTs only have nonzero coefficients in the upper left 8x8 or 16x16 subpartitions respectively. --- Updated with Janne's suggestions. The weird speedup for vp9_inv_dct_dct_16x16_sub16_add_neon on the Cortex A8 in the previous iteration of the patch seems to be mostly within noise for that test; it does still appear occasionally when testing. --- libavcodec/arm/vp9itxfm_neon.S | 75 +- tests/checkasm/vp9dsp.c| 6 ++-- 2 files changed, 70 insertions(+), 11 deletions(-) diff --git a/libavcodec/arm/vp9itxfm_neon.S b/libavcodec/arm/vp9itxfm_neon.S index 2049241..5abe435 100644 --- a/libavcodec/arm/vp9itxfm_neon.S +++ b/libavcodec/arm/vp9itxfm_neon.S @@ -659,9 +659,8 @@ endfunc @ Read a vertical 4x16 slice out of a 16x16 matrix, do a transform on it, @ transpose into a horizontal 16x4 slice and store. @ r0 = dst (temp buffer) -@ r1 = unused +@ r1 = slice offset @ r2 = src -@ r3 = slice offset function \txfm\()16_1d_4x16_pass1_neon mov r12, #32 vmov.s16q2, #0 @@ -678,14 +677,14 @@ function \txfm\()16_1d_4x16_pass1_neon transpose16_q_4x_4x4 q8, q9, q10, q11, q12, q13, q14, q15, d16, d17, d18, d19, d20, d21, d22, d23, d24, d25, d26, d27, d28, d29, d30, d31 @ Store the transposed 4x4 blocks horizontally. -cmp r3, #12 +cmp r1, #12 beq 1f .irp i, 16, 20, 24, 28, 17, 21, 25, 29, 18, 22, 26, 30, 19, 23, 27, 31 vst1.16 {d\i}, [r0,:64]! .endr bx lr 1: -@ Special case: For the last input column (r3 == 12), +@ Special case: For the last input column (r1 == 12), @ which would be stored as the last row in the temp buffer, @ don't store the first 4x4 block, but keep it in registers @ for the first slice of the second pass (where it is the @@ -781,15 +780,22 @@ endfunc itxfm16_1d_funcs idct itxfm16_1d_funcs iadst +@ This is the minimum eob value for each subpartition, in increments of 4 +const min_eob_idct_idct_16, align=4 +.short 0, 10, 38, 89 +endconst + .macro itxfm_func16x16 txfm1, txfm2 function ff_vp9_\txfm1\()_\txfm2\()_16x16_add_neon, export=1 .ifc \txfm1\()_\txfm2,idct_idct cmp r3, #1 beq idct16x16_dc_add_neon .endif -push{r4-r7,lr} +push{r4-r8,lr} .ifnc \txfm1\()_\txfm2,idct_idct vpush {q4-q7} +.else +movrel r8, min_eob_idct_idct_16 + 2 .endif @ Align the stack, allocate a temp buffer @@ -810,10 +816,36 @@ A and r7, sp, #15 .irp i, 0, 4, 8, 12 add r0, sp, #(\i*32) +.ifc \txfm1\()_\txfm2,idct_idct +.if \i > 0 +ldrh_post r1, r8, #2 +cmp r3, r1 +it le +movle r1, #(16 - \i)/4 +ble 1f +.endif +.endif +mov r1, #\i add r2, r6, #(\i*2) -mov r3, #\i
[libav-devel] [PATCH] vp9dsp: add DC only versions for idct/idct.
From: Clément Bœsch before: time ./avconv -v 0 -nostats -threads 1 -i sintel_vp9_500kbps.webm -f null - real0m11.125s user0m11.059s sys 0m0.050s time ./avconv -v 0 -nostats -threads 1 -i sintel_vp9_500kbps.webm -f null - real0m10.944s user0m10.819s sys 0m0.064s after: time ./avconv -v 0 -nostats -threads 1 -i sintel_vp9_500kbps.webm -f null - real0m8.153s user0m8.034s sys 0m0.050s time ./avconv -v 0 -nostats -threads 1 -i sintel_vp9_500kbps.webm -f null - real0m8.038s user0m7.980s sys 0m0.039s --- libavcodec/vp9dsp.c | 32 1 file changed, 24 insertions(+), 8 deletions(-) diff --git a/libavcodec/vp9dsp.c b/libavcodec/vp9dsp.c index 73006fa..ead2f88 100644 --- a/libavcodec/vp9dsp.c +++ b/libavcodec/vp9dsp.c @@ -944,7 +944,7 @@ static av_cold void vp9dsp_intrapred_init(VP9DSPContext *dsp) #undef init_intra_pred } -#define itxfm_wrapper(type_a, type_b, sz, bits) \ +#define itxfm_wrapper(type_a, type_b, sz, bits, has_dconly) \ static void \ type_a ## _ ## type_b ## _ ## sz ## x ## sz ## _add_c(uint8_t *dst, \ ptrdiff_t stride, \ @@ -953,6 +953,22 @@ type_a ## _ ## type_b ## _ ## sz ## x ## sz ## _add_c(uint8_t *dst, \ { \ int i, j; \ int16_t tmp[sz * sz], out[sz]; \ +\ +if (has_dconly && eob == 1) { \ +const int t = (((block[0] * 11585 + (1 << 13)) >> 14) \ + * 11585 + (1 << 13)) >> 14; \ +block[0] = 0; \ +for (i = 0; i < sz; i++) { \ +for (j = 0; j < sz; j++)\ +dst[j * stride] = av_clip_uint8(dst[j * stride] + \ +(bits ? \ + (t + (1 << (bits - 1))) >> bits : \ + t)); \ +dst++; \ +} \ +return; \ +} \ +\ for (i = 0; i < sz; i++)\ type_a ## sz ## _1d(tmp + i * sz, block + i, sz, 0);\ memset(block, 0, sz * sz * sizeof(*block)); \ @@ -967,11 +983,11 @@ type_a ## _ ## type_b ## _ ## sz ## x ## sz ## _add_c(uint8_t *dst, \ } \ } -#define itxfm_wrap(sz, bits) \ -itxfm_wrapper(idct, idct, sz, bits) \ -itxfm_wrapper(iadst, idct, sz, bits) \ -itxfm_wrapper(idct, iadst, sz, bits) \ -itxfm_wrapper(iadst, iadst, sz, bits) +#define itxfm_wrap(sz, bits) \ +itxfm_wrapper(idct, idct, sz, bits, 1) \ +itxfm_wrapper(iadst, idct, sz, bits, 0) \ +itxfm_wrapper(idct, iadst, sz, bits, 0) \ +itxfm_wrapper(iadst, iadst, sz, bits, 0) #define IN(x) in[x * stride] @@ -1490,7 +1506,7 @@ static av_always_inline void idct32_1d(int16_t *out, const int16_t *in, out[31] = t0 - t31; } -itxfm_wrapper(idct, idct, 32, 6) +itxfm_wrapper(idct, idct, 32, 6, 1) static av_always_inline void iwht4_1d(int16_t *out, const int16_t *in, ptrdiff_t stride, int pass) @@ -1523,7 +1539,7 @@ static av_always_inline void iwht4_1d(int16_t *out, const int16_t *in, out[3] = t3; } -itxfm_wrapper(iwht, iwht, 4, 0) +itxfm_wrapper(iwht, iwht, 4, 0, 0) #undef IN #undef itxfm_wrapper -- 2.9.3 (Apple Git-75) ___ libav-devel mailing list libav-devel@libav.org https://lists.libav.org/mailman/listinfo/libav-devel
Re: [libav-devel] [PATCH] vp9dsp: add DC only versions for idct/idct.
On Tue, 29 Nov 2016, Diego Biurrun wrote: idct/idct? The vp9 inverse transforms can be different combos for the vertical and horizontal passes; it can be iwht_iwht, idct_idct, idct_iadst, iadst_idct, iadst_iadst. This optimization is only valid for the idct_idct case. --- a/libavcodec/vp9dsp.c +++ b/libavcodec/vp9dsp.c @@ -953,6 +953,22 @@ type_a ## _ ## type_b ## _ ## sz ## x ## sz ## _add_c(uint8_t *dst, \ +if (has_dconly && eob == 1) { \ +const int t = (((block[0] * 11585 + (1 << 13)) >> 14) \ + * 11585 + (1 << 13)) >> 14; \ +block[0] = 0; \ +for (i = 0; i < sz; i++) { \ +for (j = 0; j < sz; j++)\ +dst[j * stride] = av_clip_uint8(dst[j * stride] + \ +(bits ? \ + (t + (1 << (bits - 1))) >> bits : \ + t)); \ FTLIW: dst[j * stride] = \ av_clip_uint8(dst[j * stride] +\ (bits ? (t + (1 << (bits - 1))) >> bits \ : t)); \ FTLIW? This looks more readable, sure. // Martin ___ libav-devel mailing list libav-devel@libav.org https://lists.libav.org/mailman/listinfo/libav-devel
[libav-devel] [PATCH 4/5] aarch64: vp9itxfm: Restructure the idct32 store macros
This avoids concatenation, which can't be used if the whole macro is wrapped within another macro. --- libavcodec/aarch64/vp9itxfm_neon.S | 80 +++--- 1 file changed, 40 insertions(+), 40 deletions(-) diff --git a/libavcodec/aarch64/vp9itxfm_neon.S b/libavcodec/aarch64/vp9itxfm_neon.S index 5a080a4..be9643e 100644 --- a/libavcodec/aarch64/vp9itxfm_neon.S +++ b/libavcodec/aarch64/vp9itxfm_neon.S @@ -942,23 +942,23 @@ function idct32_1d_8x32_pass1_neon .macro store_rev a, b // There's no rev128 instruction, but we reverse each 64 bit // half, and then flip them using an ext with 8 bytes offset. -rev64 v1.8h, v\b\().8h -st1 {v\a\().8h}, [x0], #16 -rev64 v0.8h, v\a\().8h +rev64 v1.8h, \b +st1 {\a}, [x0], #16 +rev64 v0.8h, \a ext v1.16b, v1.16b, v1.16b, #8 -st1 {v\b\().8h}, [x0], #16 +st1 {\b}, [x0], #16 ext v0.16b, v0.16b, v0.16b, #8 st1 {v1.8h}, [x0], #16 st1 {v0.8h}, [x0], #16 .endm -store_rev 16, 24 -store_rev 17, 25 -store_rev 18, 26 -store_rev 19, 27 -store_rev 20, 28 -store_rev 21, 29 -store_rev 22, 30 -store_rev 23, 31 +store_rev v16.8h, v24.8h +store_rev v17.8h, v25.8h +store_rev v18.8h, v26.8h +store_rev v19.8h, v27.8h +store_rev v20.8h, v28.8h +store_rev v21.8h, v29.8h +store_rev v22.8h, v30.8h +store_rev v23.8h, v31.8h sub x0, x0, #512 .purgem store_rev @@ -984,14 +984,14 @@ function idct32_1d_8x32_pass1_neon // subtracted from the output. .macro store_rev a, b ld1 {v4.8h}, [x0] -rev64 v1.8h, v\b\().8h -add v4.8h, v4.8h, v\a\().8h -rev64 v0.8h, v\a\().8h +rev64 v1.8h, \b +add v4.8h, v4.8h, \a +rev64 v0.8h, \a st1 {v4.8h}, [x0], #16 ext v1.16b, v1.16b, v1.16b, #8 ld1 {v5.8h}, [x0] ext v0.16b, v0.16b, v0.16b, #8 -add v5.8h, v5.8h, v\b\().8h +add v5.8h, v5.8h, \b st1 {v5.8h}, [x0], #16 ld1 {v6.8h}, [x0] sub v6.8h, v6.8h, v1.8h @@ -1001,14 +1001,14 @@ function idct32_1d_8x32_pass1_neon st1 {v7.8h}, [x0], #16 .endm -store_rev 31, 23 -store_rev 30, 22 -store_rev 29, 21 -store_rev 28, 20 -store_rev 27, 19 -store_rev 26, 18 -store_rev 25, 17 -store_rev 24, 16 +store_rev v31.8h, v23.8h +store_rev v30.8h, v22.8h +store_rev v29.8h, v21.8h +store_rev v28.8h, v20.8h +store_rev v27.8h, v19.8h +store_rev v26.8h, v18.8h +store_rev v25.8h, v17.8h +store_rev v24.8h, v16.8h .purgem store_rev br x14 endfunc @@ -1055,21 +1055,21 @@ function idct32_1d_8x32_pass2_neon .if \neg == 0 ld1 {v4.8h}, [x2], x9 ld1 {v5.8h}, [x2], x9 -add v4.8h, v4.8h, v\a\().8h +add v4.8h, v4.8h, \a ld1 {v6.8h}, [x2], x9 -add v5.8h, v5.8h, v\b\().8h +add v5.8h, v5.8h, \b ld1 {v7.8h}, [x2], x9 -add v6.8h, v6.8h, v\c\().8h -add v7.8h, v7.8h, v\d\().8h +add v6.8h, v6.8h, \c +add v7.8h, v7.8h, \d .else ld1 {v4.8h}, [x2], x7 ld1 {v5.8h}, [x2], x7 -sub v4.8h, v4.8h, v\a\().8h +sub v4.8h, v4.8h, \a ld1 {v6.8h}, [x2], x7 -sub v5.8h, v5.8h, v\b\().8h +sub v5.8h, v5.8h, \b ld1 {v7.8h}, [x2], x7 -sub v6.8h, v6.8h, v\c\().8h -sub v7.8h, v7.8h, v\d\().8h +sub v6.8h, v6.8h, \c +sub v7.8h, v7.8h, \d .endif ld1 {v0.8b}, [x0], x1 ld1 {v1.8b}, [x0], x1 @@ -1093,15 +1093,15 @@ function idct32_1d_8x32_pass2_neon st1 {v6.8b}, [x0], x1 st1 {v7.8b}, [x0], x1 .endm -load_acc_store 31, 30, 29, 28 -load_acc_store 27, 26, 25, 24 -load_acc_store 23, 22, 21, 20 -load_acc_store 19, 18, 17, 16 +load_acc_store v31.8h, v30.8h, v2
[libav-devel] [PATCH 2/5] arm: vp9itxfm: Do a simpler half/quarter idct16/idct32 when possible (alternative 1)
This work is sponsored by, and copyright, Google. This increases the code size of libavcodec/arm/vp9itxfm_neon.o from 12388 to 15064 bytes. Before: Cortex A7 A8 A9 A53 vp9_inv_dct_dct_16x16_sub1_add_neon: 273.0189.7211.9235.8 vp9_inv_dct_dct_16x16_sub2_add_neon:2056.7 1521.2 1734.8 1262.0 vp9_inv_dct_dct_16x16_sub4_add_neon:2060.8 1608.5 1735.7 1262.0 vp9_inv_dct_dct_16x16_sub8_add_neon:2444.9 1801.6 2007.8 1508.5 vp9_inv_dct_dct_16x16_sub12_add_neon: 2902.1 2116.7 2285.1 1751.7 vp9_inv_dct_dct_16x16_sub16_add_neon: 3211.2 2443.5 2546.1 1999.5 vp9_inv_dct_dct_32x32_sub1_add_neon: 752.0456.7866.0553.9 vp9_inv_dct_dct_32x32_sub2_add_neon: 11042.7 8127.5 8582.7 6822.8 vp9_inv_dct_dct_32x32_sub4_add_neon: 10682.0 8043.8 8581.3 6810.1 vp9_inv_dct_dct_32x32_sub8_add_neon: 11908.0 9281.8 9381.9 7562.4 vp9_inv_dct_dct_32x32_sub12_add_neon: 13015.2 10791.1 10220.3 8318.9 vp9_inv_dct_dct_32x32_sub16_add_neon: 14150.3 11886.2 11032.6 9064.8 vp9_inv_dct_dct_32x32_sub20_add_neon: 15165.7 12993.8 11847.0 9816.7 vp9_inv_dct_dct_32x32_sub24_add_neon: 16280.8 15111.2 12658.6 10576.8 vp9_inv_dct_dct_32x32_sub28_add_neon: 17412.6 15549.4 13462.7 11325.6 vp9_inv_dct_dct_32x32_sub32_add_neon: 18522.4 17277.4 14286.7 12087.9 After: vp9_inv_dct_dct_16x16_sub1_add_neon: 273.0189.5211.5236.1 vp9_inv_dct_dct_16x16_sub2_add_neon:1448.2994.0 1191.3836.0 vp9_inv_dct_dct_16x16_sub4_add_neon:1437.0991.0 1191.6836.0 vp9_inv_dct_dct_16x16_sub8_add_neon:2114.5 1757.9 1855.3 1335.3 vp9_inv_dct_dct_16x16_sub12_add_neon: 2862.7 2141.5 2293.3 1772.7 vp9_inv_dct_dct_16x16_sub16_add_neon: 3299.6 2419.1 2552.7 2033.0 vp9_inv_dct_dct_32x32_sub1_add_neon: 753.0457.5864.3554.8 vp9_inv_dct_dct_32x32_sub2_add_neon:7867.8 5978.6 6594.6 5109.9 vp9_inv_dct_dct_32x32_sub4_add_neon:7871.0 5772.5 6582.2 5108.5 vp9_inv_dct_dct_32x32_sub8_add_neon:8694.8 6925.7 7125.7 5671.4 vp9_inv_dct_dct_32x32_sub12_add_neon: 11250.3 9654.7 9557.6 7540.5 vp9_inv_dct_dct_32x32_sub16_add_neon: 12129.5 11061.1 10295.0 8220.7 vp9_inv_dct_dct_32x32_sub20_add_neon: 15218.4 13580.8 11841.3 9739.9 vp9_inv_dct_dct_32x32_sub24_add_neon: 16343.5 15097.0 12629.2 10496.6 vp9_inv_dct_dct_32x32_sub28_add_neon: 17482.2 15516.4 13476.0 11261.0 vp9_inv_dct_dct_32x32_sub32_add_neon: 18586.7 16817.5 14289.3 12019.0 --- If we wouldn't have made the core transforms standalone functions in the previous patch, the code size would increase to around 21 KB (which isn't too bad), but the idct32 pass1/2 functions would bloat up so much that they would require literal pools within the functions themselves. --- libavcodec/arm/vp9itxfm_neon.S | 351 ++--- 1 file changed, 331 insertions(+), 20 deletions(-) diff --git a/libavcodec/arm/vp9itxfm_neon.S b/libavcodec/arm/vp9itxfm_neon.S index 22e63e5..bd3f678 100644 --- a/libavcodec/arm/vp9itxfm_neon.S +++ b/libavcodec/arm/vp9itxfm_neon.S @@ -74,6 +74,14 @@ endconst vrshrn.s32 \out2, \tmpq4, #14 .endm +@ Same as mbutterfly0 above, but treating the input in in2 as zero, +@ writing the same output into both out1 and out2. +.macro mbutterfly0_h out1, out2, in1, in2, tmpd1, tmpd2, tmpq3, tmpq4 +vmull.s16 \tmpq3, \in1, d0[0] +vrshrn.s32 \out1, \tmpq3, #14 +vmov\out2, \out1 +.endm + @ out1,out2 = ((in1 + in2) * d0[0] + (1 << 13)) >> 14 @ out3,out4 = ((in1 - in2) * d0[0] + (1 << 13)) >> 14 @ Same as mbutterfly0, but with input being 2 q registers, output @@ -137,6 +145,23 @@ endconst vrshrn.s32 \inout2, \tmp2, #14 .endm +@ Same as mbutterfly above, but treating the input in inout2 as zero +.macro mbutterfly_h1 inout1, inout2, coef1, coef2, tmp1, tmp2 +vmull.s16 \tmp1, \inout1, \coef1 +vmull.s16 \tmp2, \inout1, \coef2 +vrshrn.s32 \inout1, \tmp1, #14 +vrshrn.s32 \inout2, \tmp2, #14 +.endm + +@ Same as mbutterfly above, but treating the input in inout1 as zero +.macro mbutterfly_h2 inout1, inout2, coef1, coef2, tmp1, tmp2 +vmull.s16 \tmp1, \inout2, \coef2 +vmull.s16 \tmp2, \inout2, \coef1 +vneg.s32\tmp1, \tmp1 +vrshrn.s32 \inout2, \tmp2, #14 +vrshrn.s32 \inout1, \tmp1, #14 +.endm + @ inout1,inout2 = (inout1,inout2 * coef1 - inout3,inout4 * coef2 + (1 << 13)) >> 14 @ inout3,inout4 = (inout1,inout2 * coef2 + inout3,inout4 * coef1 + (1 << 13)) >> 14 @ inout are 4 d registers, tmp are 4 q registers @@ -534,7 +559,7 @@ function idct16x16_dc_add_neon endfunc .ltorg -function idct16 +.macro idct16_full mbutterfly0 d16, d24, d16, d24, d4, d6, q2, q3 @ d16 = t0a, d24 = t1a
[libav-devel] [PATCH 1/5] arm: vp9itxfm: Make the larger core transforms standalone functions
This work is sponsored by, and copyright, Google. This reduces the code size of libavcodec/arm/vp9itxfm_neon.o from 15324 to 12388 bytes. This gives a small slowdown of a couple tens of cycles, up to around 150 cycles for the full case of the largest transform, but makes it more feasible to add more optimized versions of these transforms. Before: Cortex A7 A8 A9 A53 vp9_inv_dct_dct_16x16_sub4_add_neon:2063.4 1516.0 1719.5 1245.1 vp9_inv_dct_dct_16x16_sub16_add_neon: 3279.3 2454.5 2525.2 1982.3 vp9_inv_dct_dct_32x32_sub4_add_neon: 10750.0 7955.4 8525.6 6754.2 vp9_inv_dct_dct_32x32_sub32_add_neon: 18574.0 17108.4 14216.7 12010.2 After: vp9_inv_dct_dct_16x16_sub4_add_neon:2060.8 1608.5 1735.7 1262.0 vp9_inv_dct_dct_16x16_sub16_add_neon: 3211.2 2443.5 2546.1 1999.5 vp9_inv_dct_dct_32x32_sub4_add_neon: 10682.0 8043.8 8581.3 6810.1 vp9_inv_dct_dct_32x32_sub32_add_neon: 18522.4 17277.4 14286.7 12087.9 --- libavcodec/arm/vp9itxfm_neon.S | 43 +- 1 file changed, 26 insertions(+), 17 deletions(-) diff --git a/libavcodec/arm/vp9itxfm_neon.S b/libavcodec/arm/vp9itxfm_neon.S index 5abe435..22e63e5 100644 --- a/libavcodec/arm/vp9itxfm_neon.S +++ b/libavcodec/arm/vp9itxfm_neon.S @@ -534,7 +534,7 @@ function idct16x16_dc_add_neon endfunc .ltorg -.macro idct16 +function idct16 mbutterfly0 d16, d24, d16, d24, d4, d6, q2, q3 @ d16 = t0a, d24 = t1a mbutterfly d20, d28, d0[1], d0[2], q2, q3 @ d20 = t2a, d28 = t3a mbutterfly d18, d30, d0[3], d1[0], q2, q3 @ d18 = t4a, d30 = t7a @@ -580,9 +580,10 @@ endfunc vmovd4, d21 @ d4 = t10a butterfly d20, d27, d6, d27 @ d20 = out[4], d27 = out[11] butterfly d21, d26, d26, d4@ d21 = out[5], d26 = out[10] -.endm +bx lr +endfunc -.macro iadst16 +function iadst16 movrel r12, iadst16_coeffs vld1.16 {q0-q1}, [r12,:128] @@ -653,7 +654,8 @@ endfunc vmovd16, d2 vmovd30, d4 -.endm +bx lr +endfunc .macro itxfm16_1d_funcs txfm @ Read a vertical 4x16 slice out of a 16x16 matrix, do a transform on it, @@ -662,6 +664,8 @@ endfunc @ r1 = slice offset @ r2 = src function \txfm\()16_1d_4x16_pass1_neon +push{lr} + mov r12, #32 vmov.s16q2, #0 .irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31 @@ -669,7 +673,7 @@ function \txfm\()16_1d_4x16_pass1_neon vst1.16 {d4}, [r2,:64], r12 .endr -\txfm\()16 +bl \txfm\()16 @ Do four 4x4 transposes. Originally, d16-d31 contain the @ 16 rows. Afterwards, d16-d19, d20-d23, d24-d27, d28-d31 @@ -682,7 +686,7 @@ function \txfm\()16_1d_4x16_pass1_neon .irp i, 16, 20, 24, 28, 17, 21, 25, 29, 18, 22, 26, 30, 19, 23, 27, 31 vst1.16 {d\i}, [r0,:64]! .endr -bx lr +pop {pc} 1: @ Special case: For the last input column (r1 == 12), @ which would be stored as the last row in the temp buffer, @@ -709,7 +713,7 @@ function \txfm\()16_1d_4x16_pass1_neon vmovd29, d17 vmovd30, d18 vmovd31, d19 -bx lr +pop {pc} endfunc @ Read a vertical 4x16 slice out of a 16x16 matrix, do a transform on it, @@ -719,6 +723,7 @@ endfunc @ r2 = src (temp buffer) @ r3 = slice offset function \txfm\()16_1d_4x16_pass2_neon +push{lr} mov r12, #32 .irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27 vld1.16 {d\i}, [r2,:64], r12 @@ -732,7 +737,7 @@ function \txfm\()16_1d_4x16_pass2_neon add r3, r0, r1 lsl r1, r1, #1 -\txfm\()16 +bl \txfm\()16 .macro load_add_store coef0, coef1, coef2, coef3 vrshr.s16 \coef0, \coef0, #6 @@ -773,7 +778,7 @@ function \txfm\()16_1d_4x16_pass2_neon load_add_store q12, q13, q14, q15 .purgem load_add_store -bx lr +pop {pc} endfunc .endm @@ -908,7 +913,7 @@ function idct32x32_dc_add_neon bx lr endfunc -.macro idct32_odd +function idct32_odd movrel r12, idct_coeffs add r12, r12, #32 vld1.16 {q0-q1}, [r12,:128] @@ -967,7 +972,8 @@ endfunc mbutterfly0 d26, d21, d26, d21, d4, d6, q2, q3 @ d26 = t26a, d21 = t21a mbutterfly0 d25, d22, d25, d22, d4, d6, q2, q3 @ d25 = t25, d22 = t22 mbutterfly0 d24, d23, d24, d23, d4, d6, q2, q3 @ d24 = t24a, d23 = t23a -.endm +bx lr +endf
[libav-devel] [PATCH 5/5] aarch64: vp9itxfm: Do a simpler half/quarter idct16/idct32 when possible (alternative 1)
This work is sponsored by, and copyright, Google. This increases the code size of libavcodec/aarch64/vp9itxfm_neon.o from 14740 to 18504 bytes. Before: vp9_inv_dct_dct_16x16_sub1_add_neon: 235.3 vp9_inv_dct_dct_16x16_sub2_add_neon:1051.0 vp9_inv_dct_dct_16x16_sub4_add_neon:1051.0 vp9_inv_dct_dct_16x16_sub8_add_neon:1051.0 vp9_inv_dct_dct_16x16_sub12_add_neon: 1390.3 vp9_inv_dct_dct_16x16_sub16_add_neon: 1390.1 vp9_inv_dct_dct_32x32_sub1_add_neon: 556.5 vp9_inv_dct_dct_32x32_sub2_add_neon:5199.1 vp9_inv_dct_dct_32x32_sub4_add_neon:5199.9 vp9_inv_dct_dct_32x32_sub8_add_neon:5196.9 vp9_inv_dct_dct_32x32_sub12_add_neon: 6171.6 vp9_inv_dct_dct_32x32_sub16_add_neon: 6170.9 vp9_inv_dct_dct_32x32_sub20_add_neon: 7147.1 vp9_inv_dct_dct_32x32_sub24_add_neon: 7147.0 vp9_inv_dct_dct_32x32_sub28_add_neon: 8118.8 vp9_inv_dct_dct_32x32_sub32_add_neon: 8125.8 After: vp9_inv_dct_dct_16x16_sub1_add_neon: 235.3 vp9_inv_dct_dct_16x16_sub2_add_neon: 697.0 vp9_inv_dct_dct_16x16_sub4_add_neon: 697.0 vp9_inv_dct_dct_16x16_sub8_add_neon: 908.0 vp9_inv_dct_dct_16x16_sub12_add_neon: 1399.6 vp9_inv_dct_dct_16x16_sub16_add_neon: 1403.3 vp9_inv_dct_dct_32x32_sub1_add_neon: 554.1 vp9_inv_dct_dct_32x32_sub2_add_neon:3879.7 vp9_inv_dct_dct_32x32_sub4_add_neon:3952.2 vp9_inv_dct_dct_32x32_sub8_add_neon:3948.4 vp9_inv_dct_dct_32x32_sub12_add_neon: 5462.1 vp9_inv_dct_dct_32x32_sub16_add_neon: 5461.7 vp9_inv_dct_dct_32x32_sub20_add_neon: 7169.2 vp9_inv_dct_dct_32x32_sub24_add_neon: 7162.4 vp9_inv_dct_dct_32x32_sub28_add_neon: 8137.4 vp9_inv_dct_dct_32x32_sub32_add_neon: 8136.7 I.e. in general a very minor overhead for the full subpartition case due to the additional cmps, but a significant speedup for the cases when we only need to process a small part of the actual input data. --- If we wouldn't have made the core transforms standalone functions, the code size would end up at around 28 KB. --- libavcodec/aarch64/vp9itxfm_neon.S | 367 +++-- 1 file changed, 347 insertions(+), 20 deletions(-) diff --git a/libavcodec/aarch64/vp9itxfm_neon.S b/libavcodec/aarch64/vp9itxfm_neon.S index be9643e..bb79348 100644 --- a/libavcodec/aarch64/vp9itxfm_neon.S +++ b/libavcodec/aarch64/vp9itxfm_neon.S @@ -75,6 +75,16 @@ endconst .endif .endm +// Same as dmbutterfly0 above, but treating the input in in2 as zero, +// writing the same output into both out1 and out2. +.macro dmbutterfly0_h out1, out2, in1, in2, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6 +smull \tmp1\().4s, \in1\().4h, v0.h[0] +smull2 \tmp2\().4s, \in1\().8h, v0.h[0] +rshrn \out1\().4h, \tmp1\().4s, #14 +rshrn2 \out1\().8h, \tmp2\().4s, #14 +mov \out2\().16b, \out1\().16b +.endm + // out1,out2 = in1 * coef1 - in2 * coef2 // out3,out4 = in1 * coef2 + in2 * coef1 // out are 4 x .4s registers, in are 2 x .8h registers @@ -104,6 +114,43 @@ endconst rshrn2 \inout2\().8h, \tmp4\().4s, #14 .endm +// Same as dmbutterfly above, but treating the input in inout2 as zero +.macro dmbutterfly_h1 inout1, inout2, coef1, coef2, tmp1, tmp2, tmp3, tmp4 +smull \tmp1\().4s, \inout1\().4h, \coef1 +smull2 \tmp2\().4s, \inout1\().8h, \coef1 +smull \tmp3\().4s, \inout1\().4h, \coef2 +smull2 \tmp4\().4s, \inout1\().8h, \coef2 +rshrn \inout1\().4h, \tmp1\().4s, #14 +rshrn2 \inout1\().8h, \tmp2\().4s, #14 +rshrn \inout2\().4h, \tmp3\().4s, #14 +rshrn2 \inout2\().8h, \tmp4\().4s, #14 +.endm + +// Same as dmbutterfly above, but treating the input in inout1 as zero +.macro dmbutterfly_h2 inout1, inout2, coef1, coef2, tmp1, tmp2, tmp3, tmp4 +smull \tmp1\().4s, \inout2\().4h, \coef2 +smull2 \tmp2\().4s, \inout2\().8h, \coef2 +smull \tmp3\().4s, \inout2\().4h, \coef1 +smull2 \tmp4\().4s, \inout2\().8h, \coef1 +neg \tmp1\().4s, \tmp1\().4s +neg \tmp2\().4s, \tmp2\().4s +rshrn \inout2\().4h, \tmp3\().4s, #14 +rshrn2 \inout2\().8h, \tmp4\().4s, #14 +rshrn \inout1\().4h, \tmp1\().4s, #14 +rshrn2 \inout1\().8h, \tmp2\().4s, #14 +.endm + +.macro dsmull_h out1, out2, in, coef +smull \out1\().4s, \in\().4h, \coef +smull2 \out2\().4s, \in\().8h, \coef +.endm + +.macro drshrn_h out, in1, in2, shift +rshrn \out\().4h, \in1\().4s, \shift +rshrn2 \out\().8h, \in2\().4s, \shift +.endm + + // out1 = in1 + in2 // out2 = in1 - in2 .macro butterfly_8h out1, out2, in1, in2 @@ -463,7 +510,7 @@ function idct16x16_dc_add_neon ret endfunc -function idct16 +.macro idct16_full dmbutterfly0v16, v24,
[libav-devel] [PATCH 5/5] aarch64: vp9itxfm: Do separate functions for half/quarter idct16 and idct32 (alternative 2)
This work is sponsored by, and copyright, Google. This makes it easier to avoid filling the temp buffer with zeros for the skipped slices, and leads to slightly more straightforward code for these cases (for the 16x16 case, where the special case pass functions are written out instead of templated from the same macro), instead of riddling the common code with special case branches or macro .ifs. The code size increases from 14740 bytes to 24472 bytes. Before: vp9_inv_dct_dct_16x16_sub1_add_neon: 235.3 vp9_inv_dct_dct_16x16_sub2_add_neon:1051.0 vp9_inv_dct_dct_16x16_sub4_add_neon:1051.0 vp9_inv_dct_dct_16x16_sub8_add_neon:1051.0 vp9_inv_dct_dct_16x16_sub12_add_neon: 1390.3 vp9_inv_dct_dct_16x16_sub16_add_neon: 1390.1 vp9_inv_dct_dct_32x32_sub1_add_neon: 556.5 vp9_inv_dct_dct_32x32_sub2_add_neon:5199.1 vp9_inv_dct_dct_32x32_sub4_add_neon:5199.9 vp9_inv_dct_dct_32x32_sub8_add_neon:5196.9 vp9_inv_dct_dct_32x32_sub12_add_neon: 6171.6 vp9_inv_dct_dct_32x32_sub16_add_neon: 6170.9 vp9_inv_dct_dct_32x32_sub20_add_neon: 7147.1 vp9_inv_dct_dct_32x32_sub24_add_neon: 7147.0 vp9_inv_dct_dct_32x32_sub28_add_neon: 8118.8 vp9_inv_dct_dct_32x32_sub32_add_neon: 8125.8 After: vp9_inv_dct_dct_16x16_sub1_add_neon: 235.3 vp9_inv_dct_dct_16x16_sub2_add_neon: 639.0 vp9_inv_dct_dct_16x16_sub4_add_neon: 639.0 vp9_inv_dct_dct_16x16_sub8_add_neon: 845.0 vp9_inv_dct_dct_16x16_sub12_add_neon: 1389.4 vp9_inv_dct_dct_16x16_sub16_add_neon: 1389.3 vp9_inv_dct_dct_32x32_sub1_add_neon: 556.5 vp9_inv_dct_dct_32x32_sub2_add_neon:3684.1 vp9_inv_dct_dct_32x32_sub4_add_neon:3682.6 vp9_inv_dct_dct_32x32_sub8_add_neon:3684.1 vp9_inv_dct_dct_32x32_sub12_add_neon: 5319.0 vp9_inv_dct_dct_32x32_sub16_add_neon: 5323.5 vp9_inv_dct_dct_32x32_sub20_add_neon: 7149.8 vp9_inv_dct_dct_32x32_sub24_add_neon: 7148.2 vp9_inv_dct_dct_32x32_sub28_add_neon: 8124.5 vp9_inv_dct_dct_32x32_sub32_add_neon: 8122.1 --- If we wouldn't have made the core transforms standalone functions, the code size would end up at around 34 KB. The binary output is 6 KB larger than in the other alternative, but is more straightforward and gives better opportunities to special case them further. In the arm version, there was a significant speedup compared to the other alternative (having cmps within the functions), skipping zeroing of the temp buffer. Here there's much less difference. --- libavcodec/aarch64/vp9itxfm_neon.S | 628 + 1 file changed, 566 insertions(+), 62 deletions(-) diff --git a/libavcodec/aarch64/vp9itxfm_neon.S b/libavcodec/aarch64/vp9itxfm_neon.S index be9643e..9910170 100644 --- a/libavcodec/aarch64/vp9itxfm_neon.S +++ b/libavcodec/aarch64/vp9itxfm_neon.S @@ -75,6 +75,16 @@ endconst .endif .endm +// Same as dmbutterfly0 above, but treating the input in in2 as zero, +// writing the same output into both out1 and out2. +.macro dmbutterfly0_h out1, out2, in1, in2, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6 +smull \tmp1\().4s, \in1\().4h, v0.h[0] +smull2 \tmp2\().4s, \in1\().8h, v0.h[0] +rshrn \out1\().4h, \tmp1\().4s, #14 +rshrn2 \out1\().8h, \tmp2\().4s, #14 +mov \out2\().16b, \out1\().16b +.endm + // out1,out2 = in1 * coef1 - in2 * coef2 // out3,out4 = in1 * coef2 + in2 * coef1 // out are 4 x .4s registers, in are 2 x .8h registers @@ -104,6 +114,43 @@ endconst rshrn2 \inout2\().8h, \tmp4\().4s, #14 .endm +// Same as dmbutterfly above, but treating the input in inout2 as zero +.macro dmbutterfly_h1 inout1, inout2, coef1, coef2, tmp1, tmp2, tmp3, tmp4 +smull \tmp1\().4s, \inout1\().4h, \coef1 +smull2 \tmp2\().4s, \inout1\().8h, \coef1 +smull \tmp3\().4s, \inout1\().4h, \coef2 +smull2 \tmp4\().4s, \inout1\().8h, \coef2 +rshrn \inout1\().4h, \tmp1\().4s, #14 +rshrn2 \inout1\().8h, \tmp2\().4s, #14 +rshrn \inout2\().4h, \tmp3\().4s, #14 +rshrn2 \inout2\().8h, \tmp4\().4s, #14 +.endm + +// Same as dmbutterfly above, but treating the input in inout1 as zero +.macro dmbutterfly_h2 inout1, inout2, coef1, coef2, tmp1, tmp2, tmp3, tmp4 +smull \tmp1\().4s, \inout2\().4h, \coef2 +smull2 \tmp2\().4s, \inout2\().8h, \coef2 +smull \tmp3\().4s, \inout2\().4h, \coef1 +smull2 \tmp4\().4s, \inout2\().8h, \coef1 +neg \tmp1\().4s, \tmp1\().4s +neg \tmp2\().4s, \tmp2\().4s +rshrn \inout2\().4h, \tmp3\().4s, #14 +rshrn2 \inout2\().8h, \tmp4\().4s, #14 +rshrn \inout1\().4h, \tmp1\().4s, #14 +rshrn2 \inout1\().8h, \tmp2\().4s, #14 +.endm + +.macro dsmull_h out1, out2, in, coef +smull \out1\().4s, \in\().4h, \coe
[libav-devel] [PATCH 3/5] aarch64: vp9itxfm: Make the larger core transforms standalone functions
This work is sponsored by, and copyright, Google. This reduces the code size of libavcodec/aarch64/vp9itxfm_neon.o from 19496 to 14740 bytes. This gives a small slowdown of a couple of tens of cycles, but makes it more feasible to add more optimized versions of these transforms. Before: vp9_inv_dct_dct_16x16_sub4_add_neon:1036.7 vp9_inv_dct_dct_16x16_sub16_add_neon: 1372.2 vp9_inv_dct_dct_32x32_sub4_add_neon:5180.0 vp9_inv_dct_dct_32x32_sub32_add_neon: 8095.7 After: vp9_inv_dct_dct_16x16_sub4_add_neon:1051.0 vp9_inv_dct_dct_16x16_sub16_add_neon: 1390.1 vp9_inv_dct_dct_32x32_sub4_add_neon:5199.9 vp9_inv_dct_dct_32x32_sub32_add_neon: 8125.8 --- libavcodec/aarch64/vp9itxfm_neon.S | 42 +++--- 1 file changed, 25 insertions(+), 17 deletions(-) diff --git a/libavcodec/aarch64/vp9itxfm_neon.S b/libavcodec/aarch64/vp9itxfm_neon.S index 053d46f..5a080a4 100644 --- a/libavcodec/aarch64/vp9itxfm_neon.S +++ b/libavcodec/aarch64/vp9itxfm_neon.S @@ -463,7 +463,7 @@ function idct16x16_dc_add_neon ret endfunc -.macro idct16 +function idct16 dmbutterfly0v16, v24, v16, v24, v2, v3, v4, v5, v6, v7 // v16 = t0a, v24 = t1a dmbutterfly v20, v28, v0.h[1], v0.h[2], v2, v3, v4, v5 // v20 = t2a, v28 = t3a dmbutterfly v18, v30, v0.h[3], v0.h[4], v2, v3, v4, v5 // v18 = t4a, v30 = t7a @@ -506,9 +506,10 @@ endfunc butterfly_8hv19, v28, v5, v28 // v19 = out[3], v28 = out[12] butterfly_8hv20, v27, v6, v27 // v20 = out[4], v27 = out[11] butterfly_8hv21, v26, v26, v3// v21 = out[5], v26 = out[10] -.endm +ret +endfunc -.macro iadst16 +function iadst16 ld1 {v0.8h,v1.8h}, [x11] dmbutterfly_l v6, v7, v4, v5, v31, v16, v0.h[1], v0.h[0] // v6,v7 = t1, v4,v5 = t0 @@ -577,7 +578,8 @@ endfunc mov v16.16b, v2.16b mov v30.16b, v4.16b -.endm +ret +endfunc // Helper macros; we can't use these expressions directly within // e.g. .irp due to the extra concatenation \(). Therefore wrap @@ -604,12 +606,14 @@ endfunc // x9 = input stride .macro itxfm16_1d_funcs txfm function \txfm\()16_1d_8x16_pass1_neon +mov x14, x30 + moviv2.8h, #0 .irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31 load_clear \i, x2, x9 .endr -\txfm\()16 +bl \txfm\()16 // Do two 8x8 transposes. Originally, v16-v31 contain the // 16 rows. Afterwards, v16-v23 and v24-v31 contain the two @@ -623,7 +627,7 @@ function \txfm\()16_1d_8x16_pass1_neon .irp i, 16, 24, 17, 25, 18, 26, 19, 27, 20, 28, 21, 29, 22, 30, 23, 31 store \i, x0, #16 .endr -ret +br x14 1: // Special case: For the last input column (x1 == 8), // which would be stored as the last row in the temp buffer, @@ -642,7 +646,7 @@ function \txfm\()16_1d_8x16_pass1_neon mov v29.16b, v21.16b mov v30.16b, v22.16b mov v31.16b, v23.16b -ret +br x14 endfunc // Read a vertical 8x16 slice out of a 16x16 matrix, do a transform on it, @@ -653,6 +657,7 @@ endfunc // x3 = slice offset // x9 = temp buffer stride function \txfm\()16_1d_8x16_pass2_neon +mov x14, x30 .irp i, 16, 17, 18, 19, 20, 21, 22, 23 load\i, x2, x9 .endr @@ -664,7 +669,7 @@ function \txfm\()16_1d_8x16_pass2_neon add x3, x0, x1 lsl x1, x1, #1 -\txfm\()16 +bl \txfm\()16 .macro load_add_store coef0, coef1, coef2, coef3, coef4, coef5, coef6, coef7, tmp1, tmp2 srshr \coef0, \coef0, #6 @@ -714,7 +719,7 @@ function \txfm\()16_1d_8x16_pass2_neon load_add_store v24.8h, v25.8h, v26.8h, v27.8h, v28.8h, v29.8h, v30.8h, v31.8h, v16.8b, v17.8b .purgem load_add_store -ret +br x14 endfunc .endm @@ -843,7 +848,7 @@ function idct32x32_dc_add_neon ret endfunc -.macro idct32_odd +function idct32_odd ld1 {v0.8h,v1.8h}, [x11] dmbutterfly v16, v31, v0.h[0], v0.h[1], v4, v5, v6, v7 // v16 = t16a, v31 = t31a @@ -898,7 +903,8 @@ endfunc dmbutterfly0v26, v21, v26, v21, v2, v3, v4, v5, v6, v7 // v26 = t26a, v21 = t21a dmbutterfly0v25, v22, v25, v22, v2, v3, v4, v5, v6, v7 // v25 = t25, v22 = t22 dmbutterfly0v24, v23, v24, v23, v2, v3, v4, v5, v6, v7 // v24 = t24a, v23 = t23a -.endm +ret +endfunc // Do an 32-point IDCT of a 8x32 slice out of a 32x32 matrix. // The 32-point IDCT can be decomposed into two 16-point IDCTs; @@ -912,6 +918,7 @@ endfunc // x10 = idct_coeffs // x11 = idct_coeffs + 32 function
[libav-devel] [PATCH 2/5] arm: vp9itxfm: Do separate functions for half/quarter idct16 and idct32 (alternative 2)
This work is sponsored by, and copyright, Google. This makes it easier to avoid filling the temp buffer with zeros for the skipped slices, and leads to slightly more straightforward code for these cases (for the 16x16 case, where the special case pass functions are written out instead of templated from the same macro), instead of riddling the common code with special case branches or macro .ifs. The code size increases from 12388 bytes to 19932 bytes. Before: Cortex A7 A8 A9 A53 vp9_inv_dct_dct_16x16_sub1_add_neon: 273.0189.7211.9235.8 vp9_inv_dct_dct_16x16_sub2_add_neon:2056.7 1521.2 1734.8 1262.0 vp9_inv_dct_dct_16x16_sub4_add_neon:2060.8 1608.5 1735.7 1262.0 vp9_inv_dct_dct_16x16_sub8_add_neon:2444.9 1801.6 2007.8 1508.5 vp9_inv_dct_dct_16x16_sub12_add_neon: 2902.1 2116.7 2285.1 1751.7 vp9_inv_dct_dct_16x16_sub16_add_neon: 3211.2 2443.5 2546.1 1999.5 vp9_inv_dct_dct_32x32_sub1_add_neon: 752.0456.7866.0553.9 vp9_inv_dct_dct_32x32_sub2_add_neon: 11042.7 8127.5 8582.7 6822.8 vp9_inv_dct_dct_32x32_sub4_add_neon: 10682.0 8043.8 8581.3 6810.1 vp9_inv_dct_dct_32x32_sub8_add_neon: 11908.0 9281.8 9381.9 7562.4 vp9_inv_dct_dct_32x32_sub12_add_neon: 13015.2 10791.1 10220.3 8318.9 vp9_inv_dct_dct_32x32_sub16_add_neon: 14150.3 11886.2 11032.6 9064.8 vp9_inv_dct_dct_32x32_sub20_add_neon: 15165.7 12993.8 11847.0 9816.7 vp9_inv_dct_dct_32x32_sub24_add_neon: 16280.8 15111.2 12658.6 10576.8 vp9_inv_dct_dct_32x32_sub28_add_neon: 17412.6 15549.4 13462.7 11325.6 vp9_inv_dct_dct_32x32_sub32_add_neon: 18522.4 17277.4 14286.7 12087.9 After: vp9_inv_dct_dct_16x16_sub1_add_neon: 274.4189.5211.7235.8 vp9_inv_dct_dct_16x16_sub2_add_neon:1214.2962.0 1034.4764.0 vp9_inv_dct_dct_16x16_sub4_add_neon:1214.5911.0 1034.7763.9 vp9_inv_dct_dct_16x16_sub8_add_neon:2000.6 1601.9 1729.0 1286.4 vp9_inv_dct_dct_16x16_sub12_add_neon: 2854.3 2122.2 2292.9 1757.6 vp9_inv_dct_dct_16x16_sub16_add_neon: 3231.1 2477.9 2544.6 2005.7 vp9_inv_dct_dct_32x32_sub1_add_neon: 756.1460.3865.3553.9 vp9_inv_dct_dct_32x32_sub2_add_neon:7603.7 5469.8 6046.2 4922.6 vp9_inv_dct_dct_32x32_sub4_add_neon:7586.9 5740.2 6061.5 4921.5 vp9_inv_dct_dct_32x32_sub8_add_neon:8380.7 6554.4 6600.4 5476.3 vp9_inv_dct_dct_32x32_sub12_add_neon: 11005.8 9856.2 9242.4 7462.3 vp9_inv_dct_dct_32x32_sub16_add_neon: 11959.7 10698.5 9998.0 8134.5 vp9_inv_dct_dct_32x32_sub20_add_neon: 15250.8 13175.6 11854.4 9825.7 vp9_inv_dct_dct_32x32_sub24_add_neon: 16382.3 14501.4 12671.7 10579.5 vp9_inv_dct_dct_32x32_sub28_add_neon: 17521.2 16403.8 13486.3 11331.2 vp9_inv_dct_dct_32x32_sub32_add_neon: 18630.8 17398.7 14383.2 12089.4 --- If we wouldn't have made the core transforms standalone functions, the code size would end up at around 26 KB. The binary output is 4 KB larger than in the other alternative, but is more straightforward and gives better opportunities to special case them, and is a couple hundred cycles faster for the small subpartitions. --- libavcodec/arm/vp9itxfm_neon.S | 645 ++--- 1 file changed, 601 insertions(+), 44 deletions(-) diff --git a/libavcodec/arm/vp9itxfm_neon.S b/libavcodec/arm/vp9itxfm_neon.S index 22e63e5..4bba4b9 100644 --- a/libavcodec/arm/vp9itxfm_neon.S +++ b/libavcodec/arm/vp9itxfm_neon.S @@ -74,6 +74,14 @@ endconst vrshrn.s32 \out2, \tmpq4, #14 .endm +@ Same as mbutterfly0 above, but treating the input in in2 as zero, +@ writing the same output into both out1 and out2. +.macro mbutterfly0_h out1, out2, in1, in2, tmpd1, tmpd2, tmpq3, tmpq4 +vmull.s16 \tmpq3, \in1, d0[0] +vrshrn.s32 \out1, \tmpq3, #14 +vmov\out2, \out1 +.endm + @ out1,out2 = ((in1 + in2) * d0[0] + (1 << 13)) >> 14 @ out3,out4 = ((in1 - in2) * d0[0] + (1 << 13)) >> 14 @ Same as mbutterfly0, but with input being 2 q registers, output @@ -137,6 +145,23 @@ endconst vrshrn.s32 \inout2, \tmp2, #14 .endm +@ Same as mbutterfly above, but treating the input in inout2 as zero +.macro mbutterfly_h1 inout1, inout2, coef1, coef2, tmp1, tmp2 +vmull.s16 \tmp1, \inout1, \coef1 +vmull.s16 \tmp2, \inout1, \coef2 +vrshrn.s32 \inout1, \tmp1, #14 +vrshrn.s32 \inout2, \tmp2, #14 +.endm + +@ Same as mbutterfly above, but treating the input in inout1 as zero +.macro mbutterfly_h2 inout1, inout2, coef1, coef2, tmp1, tmp2 +vmull.s16 \tmp1, \inout2, \coef2 +vmull.s16 \tmp2, \inout2, \coef1 +vneg.s32\tmp1, \tmp1 +vrshrn.s32 \inout2, \tmp2, #14 +vrshrn.s32 \inout1, \tmp1, #14 +.endm + @ inout1,inout2 = (inout1,inout2 * coef1 - inout3,inout4
Re: [libav-devel] [PATCH 4/5] configure: Simplify MMAL check
On Sat, 3 Dec 2016, Diego Biurrun wrote: --- configure | 14 ++ 1 file changed, 6 insertions(+), 8 deletions(-) diff --git a/configure b/configure index f838a53..44fc147 100755 --- a/configure +++ b/configure @@ -4677,14 +4677,12 @@ enabled libx265 && require_pkg_config x265 x265.h x265_api_get && die "ERROR: libx265 version must be >= 57."; } enabled libxavs && require libxavs "stdint.h xavs.h" xavs_encoder_encode -lxavs enabled libxvid && require libxvid xvid.h xvid_global -lxvidcore -enabled mmal && { check_lib interface/mmal/mmal.h mmal_port_connect -lmmal_core -lmmal_util -lmmal_vc_client -lbcm_host || -{ ! enabled cross_compile && { -add_cflags -isystem/opt/vc/include/ -isystem/opt/vc/include/interface/vmcs_host/linux -isystem/opt/vc/include/interface/vcos/pthreads -fgnu89-inline ; -add_extralibs -L/opt/vc/lib/ -lmmal_core -lmmal_util -lmmal_vc_client -lbcm_host ; -check_lib interface/mmal/mmal.h mmal_port_connect ; } -check_lib interface/mmal/mmal.h mmal_port_connect ; } || - die "ERROR: mmal not found"; } -enabled mmal && check_func_headers interface/mmal/mmal.h "MMAL_PARAMETER_VIDEO_MAX_NUM_CALLBACKS" +enabled mmal && { { ! enabled cross_compile && + add_cflags -isystem/opt/vc/include/ -isystem/opt/vc/include/interface/vmcs_host/linux -isystem/opt/vc/include/interface/vcos/pthreads -fgnu89-inline && + add_ldflags -L/opt/vc/lib/ ; } + check_lib interface/mmal/mmal.h mmal_port_connect -lmmal_core -lmmal_util -lmmal_vc_client -lbcm_host || + die "ERROR: mmal not found"; } && + check_func_headers interface/mmal/mmal.h "MMAL_PARAMETER_VIDEO_MAX_NUM_CALLBACKS" enabled omx && { check_header OMX_Core.h || die "ERROR: OpenMAX IL headers not found"; } enabled omx_rpi && { ! enabled cross_compile && add_cflags -isystem/opt/vc/include/IL check_header OMX_Core.h || die "ERROR: OpenMAX IL headers not found"; } -- 2.1.4 So, previously this first tried to check for the headers/libs in the existing include paths, before trying to add the /opt/vc/include paths (for non-cross builds) and trying again. Now for a non-cross build, it'd hardcode adding /opt/vc/include paths even if the headers would be found in the existing include path. Ditto for the OMX patch. // Martin ___ libav-devel mailing list libav-devel@libav.org https://lists.libav.org/mailman/listinfo/libav-devel
Re: [libav-devel] [PATCH 3/8] decode: restructure the core decoding code
On Wed, 30 Nov 2016, Anton Khirnov wrote: Currently, the new decoding API is pretty much just a wrapper around the old deprecated one. This is problematic, since it interferes with making full use of the flexibility added by the new API. The old API should also be removed at some future point. Reorganize the code so that the new send_packet/receive_frame functions call the actual decoding directly and change the old deprecated avcodec_decode_* functions into wrappers around the new API. The new internal API for decoders is now changing as well. Before this commit, it mirrors the public API, so the decoders need to implement send_packet() and receive_frame() callbacks. This turns out to require awkward constructs in both the decoders and the generic code. After this commit, the decoders only implement the receive_frame() callback and call a new internal function, ff_decode_get_packet() to obtain input data, in the same manner to how the bitstream filters now work. avcodec will now always make a reference to the input packet, which means that non-refcounted input packets will be copied. Keeping the previous behaviour, where this copy could sometimes be avoided, would make the code significantly more complex and fragile for only dubious gains, since packets are typically small and everyone who cares about performance should use refcounted packets anyway. --- libavcodec/avcodec.h | 13 +- libavcodec/decode.c | 391 +++--- libavcodec/decode.h | 35 + libavcodec/internal.h | 17 +++ libavcodec/utils.c| 22 ++- 5 files changed, 293 insertions(+), 185 deletions(-) create mode 100644 libavcodec/decode.h diff --git a/libavcodec/avcodec.h b/libavcodec/avcodec.h index e75d300..e34ea44 100644 --- a/libavcodec/avcodec.h +++ b/libavcodec/avcodec.h @@ -3211,20 +3211,25 @@ typedef struct AVCodec { int (*decode)(AVCodecContext *, void *outdata, int *outdata_size, AVPacket *avpkt); int (*close)(AVCodecContext *); /** - * Decode/encode API with decoupled packet/frame dataflow. The API is the + * Encode API with decoupled packet/frame dataflow. The API is the * same as the avcodec_ prefixed APIs (avcodec_send_frame() etc.), except * that: * - never called if the codec is closed or the wrong type, * - AVPacket parameter change side data is applied right before calling * AVCodec->send_packet, Shouldn't this bullet point also be moved away, since send_packet is moved away/removed? - * - if AV_CODEC_CAP_DELAY is not set, drain packets or frames are never sent, + * - if AV_CODEC_CAP_DELAY is not set, drain packets are never sent, Isn't this the other way around; if this only is for encoders now, the encoders would get drain frames, not packets? * - only one drain packet is ever passed down (until the next flush()), * - a drain AVPacket is always NULL (no need to check for avpkt->size). */ int (*send_frame)(AVCodecContext *avctx, const AVFrame *frame); -int (*send_packet)(AVCodecContext *avctx, const AVPacket *avpkt); -int (*receive_frame)(AVCodecContext *avctx, AVFrame *frame); int (*receive_packet)(AVCodecContext *avctx, AVPacket *avpkt); + +/** + * Decode API with decoupled packet/frame dataflow. This function is called + * to get one output frame. It should call ff_decode_get_packet() to obtain + * input data. + */ +int (*receive_frame)(AVCodecContext *avctx, AVFrame *frame); /** * Flush buffers. * Will be called when seeking // Martin ___ libav-devel mailing list libav-devel@libav.org https://lists.libav.org/mailman/listinfo/libav-devel
Re: [libav-devel] [PATCH 4/5] configure: Simplify MMAL check
On Sat, 3 Dec 2016, Diego Biurrun wrote: On Sat, Dec 03, 2016 at 12:33:04PM +0200, Martin Storsjö wrote: On Sat, 3 Dec 2016, Diego Biurrun wrote: >--- a/configure >+++ b/configure >@@ -4677,14 +4677,12 @@ enabled libx265 && require_pkg_config x265 x265.h x265_api_get && >enabled libxavs && require libxavs "stdint.h xavs.h" xavs_encoder_encode -lxavs >enabled libxvid && require libxvid xvid.h xvid_global -lxvidcore >-enabled mmal && { check_lib interface/mmal/mmal.h mmal_port_connect -lmmal_core -lmmal_util -lmmal_vc_client -lbcm_host || >-{ ! enabled cross_compile && { >-add_cflags -isystem/opt/vc/include/ -isystem/opt/vc/include/interface/vmcs_host/linux -isystem/opt/vc/include/interface/vcos/pthreads -fgnu89-inline ; >-add_extralibs -L/opt/vc/lib/ -lmmal_core -lmmal_util -lmmal_vc_client -lbcm_host ; >-check_lib interface/mmal/mmal.h mmal_port_connect ; } >-check_lib interface/mmal/mmal.h mmal_port_connect ; } || >- die "ERROR: mmal not found"; } >-enabled mmal && check_func_headers interface/mmal/mmal.h "MMAL_PARAMETER_VIDEO_MAX_NUM_CALLBACKS" >+enabled mmal && { { ! enabled cross_compile && >+ add_cflags -isystem/opt/vc/include/ -isystem/opt/vc/include/interface/vmcs_host/linux -isystem/opt/vc/include/interface/vcos/pthreads -fgnu89-inline && >+ add_ldflags -L/opt/vc/lib/ ; } >+ check_lib interface/mmal/mmal.h mmal_port_connect -lmmal_core -lmmal_util -lmmal_vc_client -lbcm_host || >+ die "ERROR: mmal not found"; } && >+ check_func_headers interface/mmal/mmal.h "MMAL_PARAMETER_VIDEO_MAX_NUM_CALLBACKS" >enabled omx && { check_header OMX_Core.h || die "ERROR: OpenMAX IL headers not found"; } >enabled omx_rpi && { ! enabled cross_compile && add_cflags -isystem/opt/vc/include/IL > check_header OMX_Core.h || die "ERROR: OpenMAX IL headers not found"; } >-- >2.1.4 So, previously this first tried to check for the headers/libs in the existing include paths, before trying to add the /opt/vc/include paths (for non-cross builds) and trying again. Now for a non-cross build, it'd hardcode adding /opt/vc/include paths even if the headers would be found in the existing include path. OK, it was a tad hard for me to be sure what the original intention was. (Note that it redundantly checks twice after setting the flags.) It should be one check before and one after setting the flags? Yes, that's the intent. First try checking if it is found in the existing paths, like we do for any normal lib. If not found, but requested and not cross compiling, try adding these paths and see if it is found then. // Martin ___ libav-devel mailing list libav-devel@libav.org https://lists.libav.org/mailman/listinfo/libav-devel
Re: [libav-devel] [PATCH 3/5] configure: Simplify OMX check
On Sat, 3 Dec 2016, Diego Biurrun wrote: On Sat, Dec 03, 2016 at 02:50:31PM +0100, Luca Barbato wrote: On 03/12/2016 11:07, Diego Biurrun wrote: On Sat, Dec 03, 2016 at 11:00:01AM +0100, Luca Barbato wrote: On 03/12/2016 07:49, Diego Biurrun wrote: +enabled omx && { check_header OMX_Core.h || die "ERROR: OpenMAX IL headers not found"; } +enabled omx_rpi && { ! enabled cross_compile && add_cflags -isystem/opt/vc/include/IL + check_header OMX_Core.h || die "ERROR: OpenMAX IL headers not found"; } Flip the order and it should work. At this point in configure, omx_rpi has not turned on omx yet, so there should not be any duplicate checking. so it would simply die if you are on a rpi and enable omx? Isn't that what the omx-rpi option is for? I'm somewhat mystified by what the intended behavior is... Right, if you do --enable-omx --enable-omx-rpi (which is kind of tautological, because omx-rpi changes the behaviour of the omx module; you don't get two encoders, but you get the normal omx encoder, but configured for the alternate ABI and other broadcom peculiarities), it would previously succeed, while it now would fail to find the headers. I don't think that's a case worth caring about though, in that case, just drop the redundant --enable-omx. // Martin ___ libav-devel mailing list libav-devel@libav.org https://lists.libav.org/mailman/listinfo/libav-devel
Re: [libav-devel] [PATCH] configure: Simplify OMX check
On Sun, 4 Dec 2016, Diego Biurrun wrote: --- Switched the order of the checks, as suggested by Luca. configure | 9 - 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/configure b/configure index e816eb5..455d4fb 100755 --- a/configure +++ b/configure @@ -2190,6 +2190,7 @@ nvenc_deps_any="dlopen LoadLibrary" nvenc_extralibs='$ldl' omx_deps="dlopen pthreads" omx_extralibs='$ldl' +omx_rpi_select="omx" qsvdec_select="qsv" qsvenc_select="qsv" vaapi_encode_deps="vaapi" @@ -4685,12 +4686,10 @@ enabled mmal && { check_lib interface/mmal/mmal.h mmal_port_connect check_lib interface/mmal/mmal.h mmal_port_connect ; } || die "ERROR: mmal not found"; } enabled mmal && check_func_headers interface/mmal/mmal.h "MMAL_PARAMETER_VIDEO_MAX_NUM_CALLBACKS" -enabled omx_rpi && enable omx -enabled omx && { check_header OMX_Core.h || -{ ! enabled cross_compile && enabled omx_rpi && { -add_cflags -isystem/opt/vc/include/IL ; } -check_header OMX_Core.h ; } || +enabled omx_rpi && { check_header OMX_Core.h || + { ! enabled cross_compile && add_cflags -isystem/opt/vc/include/IL && check_header OMX_Core.h ; } || die "ERROR: OpenMAX IL headers not found"; } +enabled omx && { check_header OMX_Core.h || die "ERROR: OpenMAX IL headers not found"; } enabled openssl && { { check_pkg_config openssl openssl/ssl.h OPENSSL_init_ssl || check_pkg_config openssl openssl/ssl.h SSL_library_init; } && { add_cflags $openssl_cflags && add_extralibs $openssl_libs; }|| -- 2.1.4 I was almost about to say that it seems ok, but it fails one cruicial detail: After running ./configure --enable-omx-rpi, I get the following in config.h: #define CONFIG_OMX 1 #define CONFIG_OMX_RPI 1 #define CONFIG_H264_OMX_ENCODER 0 #define CONFIG_MPEG4_OMX_ENCODER 0 With the current master, they all end up as 1 as intended. // Martin ___ libav-devel mailing list libav-devel@libav.org https://lists.libav.org/mailman/listinfo/libav-devel
Re: [libav-devel] [PATCH] configure: Simplify MMAL check
On Sun, 4 Dec 2016, Diego Biurrun wrote: --- Third time's the charm? configure | 13 ++--- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/configure b/configure index 849226a..533a37a 100755 --- a/configure +++ b/configure @@ -4682,13 +4682,12 @@ enabled libx265 && require_pkg_config x265 x265.h x265_api_get && enabled libxavs && require libxavs "stdint.h xavs.h" xavs_encoder_encode -lxavs enabled libxvid && require libxvid xvid.h xvid_global -lxvidcore enabled mmal && { check_lib interface/mmal/mmal.h mmal_port_connect -lmmal_core -lmmal_util -lmmal_vc_client -lbcm_host || -{ ! enabled cross_compile && { -add_cflags -isystem/opt/vc/include/ -isystem/opt/vc/include/interface/vmcs_host/linux -isystem/opt/vc/include/interface/vcos/pthreads -fgnu89-inline ; -add_extralibs -L/opt/vc/lib/ -lmmal_core -lmmal_util -lmmal_vc_client -lbcm_host ; -check_lib interface/mmal/mmal.h mmal_port_connect ; } -check_lib interface/mmal/mmal.h mmal_port_connect ; } || - die "ERROR: mmal not found"; } -enabled mmal && check_func_headers interface/mmal/mmal.h "MMAL_PARAMETER_VIDEO_MAX_NUM_CALLBACKS" + { ! enabled cross_compile && + add_cflags -isystem/opt/vc/include/ -isystem/opt/vc/include/interface/vmcs_host/linux -isystem/opt/vc/include/interface/vcos/pthreads -fgnu89-inline && + add_ldflags -L/opt/vc/lib/ && + check_lib interface/mmal/mmal.h mmal_port_connect -lmmal_core -lmmal_util -lmmal_vc_client -lbcm_host; } || + die "ERROR: mmal not found" && + check_func_headers interface/mmal/mmal.h "MMAL_PARAMETER_VIDEO_MAX_NUM_CALLBACKS"; } enabled omx && { check_header OMX_Core.h || die "ERROR: OpenMAX IL headers not found"; } enabled omx_rpi && { check_header OMX_Core.h || { ! enabled cross_compile && add_cflags -isystem/opt/vc/include/IL && check_header OMX_Core.h ; } || -- 2.1.4 This seems to work as intended, so ok. (Tested in both cross setups with manually set paths, and directly on device with automatically added paths.) // Martin ___ libav-devel mailing list libav-devel@libav.org https://lists.libav.org/mailman/listinfo/libav-devel
Re: [libav-devel] [PATCH] configure: Move COMPONENT_LIST to the bottom of CONFIG_LIST
On Mon, 5 Dec 2016, Diego Biurrun wrote: This ensures that dependencies are resolved correctly. COMPONENT_LIST can contain parts that depend on previous entries of CONFIG_LIST. --- This fixes the issue that Martin noticed with omx_rpi and the actual omx encoders not getting enabled. configure | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/configure b/configure index b4680e4..52a09f7 100755 --- a/configure +++ b/configure @@ -1362,8 +1362,8 @@ SUBSYSTEM_LIST=" rdft " +# COMPONENT_LIST needs to be the last to ensure correct dependency checking CONFIG_LIST=" -$COMPONENT_LIST $EXAMPLE_LIST $EXTERNAL_LIBRARY_LIST $HWACCEL_LIBRARY_LIST @@ -1379,6 +1379,7 @@ CONFIG_LIST=" thumb valgrind_backtrace xmm_clobber_test +$COMPONENT_LIST " THREADS_LIST=" -- 2.1.4 Seems to work for me // Martin ___ libav-devel mailing list libav-devel@libav.org https://lists.libav.org/mailman/listinfo/libav-devel
Re: [libav-devel] [PATCH] configure: Simplify OMX check
On Mon, 5 Dec 2016, Diego Biurrun wrote: On Sun, Dec 04, 2016 at 11:20:51PM +0200, Martin Storsjö wrote: On Sun, 4 Dec 2016, Diego Biurrun wrote: >--- a/configure >+++ b/configure >@@ -2190,6 +2190,7 @@ nvenc_deps_any="dlopen LoadLibrary" >nvenc_extralibs='$ldl' >omx_deps="dlopen pthreads" >omx_extralibs='$ldl' >+omx_rpi_select="omx" >qsvdec_select="qsv" >qsvenc_select="qsv" >vaapi_encode_deps="vaapi" >@@ -4685,12 +4686,10 @@ enabled mmal && { check_lib interface/mmal/mmal.h mmal_port_connect >check_lib interface/mmal/mmal.h mmal_port_connect ; } || > die "ERROR: mmal not found"; } >enabled mmal && check_func_headers interface/mmal/mmal.h "MMAL_PARAMETER_VIDEO_MAX_NUM_CALLBACKS" >-enabled omx_rpi && enable omx >-enabled omx && { check_header OMX_Core.h || >-{ ! enabled cross_compile && enabled omx_rpi && { >-add_cflags -isystem/opt/vc/include/IL ; } >-check_header OMX_Core.h ; } || >+enabled omx_rpi && { check_header OMX_Core.h || >+ { ! enabled cross_compile && add_cflags -isystem/opt/vc/include/IL && check_header OMX_Core.h ; } || > die "ERROR: OpenMAX IL headers not found"; } >+enabled omx && { check_header OMX_Core.h || die "ERROR: OpenMAX IL headers not found"; } >enabled openssl && { { check_pkg_config openssl openssl/ssl.h OPENSSL_init_ssl || > check_pkg_config openssl openssl/ssl.h SSL_library_init; } && { > add_cflags $openssl_cflags && add_extralibs $openssl_libs; }|| I was almost about to say that it seems ok, but it fails one cruicial detail: After running ./configure --enable-omx-rpi, I get the following in config.h: #define CONFIG_OMX 1 #define CONFIG_OMX_RPI 1 #define CONFIG_H264_OMX_ENCODER 0 #define CONFIG_MPEG4_OMX_ENCODER 0 With the current master, they all end up as 1 as intended. This should not happen of course. I know the reason, this patch is the symptom of the underlying problem. Patch coming up shortly. Thanks for noticing. This patch is ok with the other fix // Martin ___ libav-devel mailing list libav-devel@libav.org https://lists.libav.org/mailman/listinfo/libav-devel
Re: [libav-devel] [PATCH] omx: Fix OOM check
On Mon, 5 Dec 2016, Timothy Gu wrote: Also use av_mallocz_array(). Found by Coverity in FFmpeg. --- libavcodec/omx.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/libavcodec/omx.c b/libavcodec/omx.c index 0c61c2f..05c8743 100644 --- a/libavcodec/omx.c +++ b/libavcodec/omx.c @@ -352,12 +352,12 @@ static av_cold int find_component(OMXContext *omx_context, void *logctx, av_log(logctx, AV_LOG_WARNING, "No component for role %s found\n", role); return AVERROR_ENCODER_NOT_FOUND; } -components = av_mallocz(sizeof(char*) * num); +components = av_mallocz_array(num, sizeof(*components)); if (!components) return AVERROR(ENOMEM); for (i = 0; i < num; i++) { components[i] = av_mallocz(OMX_MAX_STRINGNAME_SIZE); -if (!components) { +if (!components[i]) { ret = AVERROR(ENOMEM); goto end; } -- 2.10.2 Thanks, this looks ok to me. // Martin ___ libav-devel mailing list libav-devel@libav.org https://lists.libav.org/mailman/listinfo/libav-devel
Re: [libav-devel] [libav-commits] build: Move entries related to building TOOLS to a subdirectory Makefile
On Wed, 7 Dec 2016, Diego Biurrun wrote: Module: libav Branch: master Commit: 3e105d08848162b90d886bde59c010d4b0362a4b Author:Diego Biurrun Committer: Diego Biurrun Date: Mon Dec 5 18:38:53 2016 +0100 build: Move entries related to building TOOLS to a subdirectory Makefile --- Makefile |8 +--- tools/Makefile | 11 +++ 2 files changed, 12 insertions(+), 7 deletions(-) Since this commit, a plain "make" does nothing - some other make target than "all" seems to be the first target? Please fix. // Martin ___ libav-devel mailing list libav-devel@libav.org https://lists.libav.org/mailman/listinfo/libav-devel
Re: [libav-devel] [PATCH 1/1] arm64: replace 'bic' with immediate with 'and' with inverted immediate
On Thu, 8 Dec 2016, Janne Grunau wrote: The former is not an official pseudo instruction although gas and llvm's internal assembler support it. Fixes a build error with xcode 6.2 reported by Memphiz on github. --- libavcodec/aarch64/synth_filter_neon.S | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/libavcodec/aarch64/synth_filter_neon.S b/libavcodec/aarch64/synth_filter_neon.S index 9551bff8e3..b001c737da 100644 --- a/libavcodec/aarch64/synth_filter_neon.S +++ b/libavcodec/aarch64/synth_filter_neon.S @@ -50,7 +50,7 @@ function ff_synth_filter_float_neon, export=1 add x1, x1, x7, lsl #2 // synth_buf sub w8, w7, #32 stp x5, x1, [sp, #16] -bic x7, x7, #63 +and x7, x7, #~63 and w8, w8, #511 stp x7, x30, [sp, #32] str w8, [x2] -- 2.11.0 Ok // Martin ___ libav-devel mailing list libav-devel@libav.org https://lists.libav.org/mailman/listinfo/libav-devel
Re: [libav-devel] [PATCH] configure: use -O1 with MSVC by default
On Tue, 13 Dec 2016, Steve Lhomme wrote: From: Steve Lhomme Otherwise some ARM and other unsupported CPU/OS is linked with -O0 --- configure | 1 + 1 file changed, 1 insertion(+) I find the commit message quite hard to understand and easily misleading. Would this make more sense? ---8<--- configure: Use -O1 with MSVC for --disable-optimizations Without any optimization flags, MSVC does no dead code elimination (DCE) at all, even for the most trivial cases. DCE is a prerequisite for building libav correctly, otherwise there are undefined references to functions for other architectures and disabled components. -O1 is the minimal optimization flags for MSVC that does include DCE. ---8<--- // Martin ___ libav-devel mailing list libav-devel@libav.org https://lists.libav.org/mailman/listinfo/libav-devel
[libav-devel] [PATCH] libavformat: Fix a faulty api deprecation guard in prepare_input_packet
This seems to have been added by mistake in 11de006b, by not noticing the negation for the existing condition. This block does not contain any code that accesses the codec field in AVStream. This function is meant to serve as a complement to compute_pkt_fields2, which is guarded by FF_API_COMPUTE_PKT_FIELDS2 && FF_API_LAVF_AVCTX. --- libavformat/mux.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/libavformat/mux.c b/libavformat/mux.c index 37c4541..2561a6d 100644 --- a/libavformat/mux.c +++ b/libavformat/mux.c @@ -423,7 +423,7 @@ static int prepare_input_packet(AVFormatContext *s, AVPacket *pkt) if (ret < 0) return ret; -#if !FF_API_COMPUTE_PKT_FIELDS2 && FF_API_LAVF_AVCTX +#if !FF_API_COMPUTE_PKT_FIELDS2 || !FF_API_LAVF_AVCTX /* sanitize the timestamps */ if (!(s->oformat->flags & AVFMT_NOTIMESTAMPS)) { AVStream *st = s->streams[pkt->stream_index]; -- 2.10.1 (Apple Git-78) ___ libav-devel mailing list libav-devel@libav.org https://lists.libav.org/mailman/listinfo/libav-devel
[libav-devel] [PATCH] rtmppkt: Check for packet size mismatches
From: Michael Niedermayer Fixes out of array access. Found-by: Paul Cher Reviewed-by: Paul Cher CC: libav-sta...@libav.org --- libavformat/rtmppkt.c | 7 +++ 1 file changed, 7 insertions(+) diff --git a/libavformat/rtmppkt.c b/libavformat/rtmppkt.c index f8c51d0..373c3ea 100644 --- a/libavformat/rtmppkt.c +++ b/libavformat/rtmppkt.c @@ -235,6 +235,13 @@ static int rtmp_packet_read_one_chunk(URLContext *h, RTMPPacket *p, if (hdr != RTMP_PS_TWELVEBYTES) timestamp += prev_pkt[channel_id].timestamp; +if (prev_pkt[channel_id].read && size != prev_pkt[channel_id].size) { +av_log(h, AV_LOG_ERROR, "RTMP packet size mismatch %d != %d\n", +size, prev_pkt[channel_id].size); +ff_rtmp_packet_destroy(&prev_pkt[channel_id]); +prev_pkt[channel_id].read = 0; +} + if (!prev_pkt[channel_id].read) { if ((ret = ff_rtmp_packet_create(p, channel_id, type, timestamp, size)) < 0) -- 2.10.1 (Apple Git-78) ___ libav-devel mailing list libav-devel@libav.org https://lists.libav.org/mailman/listinfo/libav-devel
Re: [libav-devel] [PATCH 2/2] hmac: Explicitly convert types at function pointer assignment
On Thu, 15 Dec 2016, Diego Biurrun wrote: Fixes a number of warnings of the type libavutil/hmac.c:61:21: warning: assignment from incompatible pointer type --- Now with the preliminaries requested by Anton. libavutil/hmac.c | 30 ++ 1 file changed, 18 insertions(+), 12 deletions(-) diff --git a/libavutil/hmac.c b/libavutil/hmac.c index 378be62..2f4fa52 100644 --- a/libavutil/hmac.c +++ b/libavutil/hmac.c @@ -18,6 +18,8 @@ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA */ +#include +#include #include #include "attributes.h" @@ -29,12 +31,16 @@ #define MAX_HASHLEN 32 #define MAX_BLOCKLEN 64 +typedef void (*hmac_final)(void *ctx, uint8_t *dst); +typedef void (*hmac_update)(void *ctx, const uint8_t *src, size_t len); +typedef void (*hmac_init)(void *ctx); + struct AVHMAC { void *hash; int blocklen, hashlen; -void (*final)(void*, uint8_t*); -void (*update)(void*, const uint8_t*, int len); -void (*init)(void*); +hmac_final final; +hmac_update update; +hmac_init init; uint8_t key[MAX_BLOCKLEN]; int keylen; }; @@ -58,33 +64,33 @@ AVHMAC *av_hmac_alloc(enum AVHMACType type) case AV_HMAC_MD5: c->blocklen = 64; c->hashlen = 16; -c->init = av_md5_init; -c->update = av_md5_update; -c->final= av_md5_final; +c->init = (hmac_init) av_md5_init; +c->update = (hmac_update) av_md5_update; +c->final= (hmac_final) av_md5_final; c->hash = av_md5_alloc(); break; Until we actually do the major bump and change the parameter type, this is absolutely wrong. Until the next major bump, the actual implementation of the function av_md5_update takes an int parameter, but here we'd ignore that and pretend that it takes a size_t. If we're unlucky, a size_t is passed differently as parameter than an (un)signed int on some architecture, and we'd have a complete breakage. Yes it does give warnings right now, but the ABI of the functions at least match, contrary to what it would do after this patch. Additionally, this patch would hide any future ABI incompatibility completely by silencing all that with the cast. // Martin ___ libav-devel mailing list libav-devel@libav.org https://lists.libav.org/mailman/listinfo/libav-devel
Re: [libav-devel] [PATCH] rtmppkt: Check for packet size mismatches
On Thu, 15 Dec 2016, Luca Barbato wrote: On 15/12/2016 09:12, Martin Storsjö wrote: From: Michael Niedermayer Fixes out of array access. Found-by: Paul Cher Reviewed-by: Paul Cher CC: libav-sta...@libav.org --- libavformat/rtmppkt.c | 7 +++ 1 file changed, 7 insertions(+) diff --git a/libavformat/rtmppkt.c b/libavformat/rtmppkt.c index f8c51d0..373c3ea 100644 --- a/libavformat/rtmppkt.c +++ b/libavformat/rtmppkt.c @@ -235,6 +235,13 @@ static int rtmp_packet_read_one_chunk(URLContext *h, RTMPPacket *p, if (hdr != RTMP_PS_TWELVEBYTES) timestamp += prev_pkt[channel_id].timestamp; +if (prev_pkt[channel_id].read && size != prev_pkt[channel_id].size) { +av_log(h, AV_LOG_ERROR, "RTMP packet size mismatch %d != %d\n", +size, prev_pkt[channel_id].size); +ff_rtmp_packet_destroy(&prev_pkt[channel_id]); +prev_pkt[channel_id].read = 0; +} + if (!prev_pkt[channel_id].read) { if ((ret = ff_rtmp_packet_create(p, channel_id, type, timestamp, size)) < 0) Why it happens? When you have fragmented packets, the first packet declares the size and the later ones (normally) are small follow-on packets that don't repeat the size and all that. But technically the later fragments also can have a full header, declaring a different size than the previous packet. In those cases we didn't use to check that the partial packet that was allocated earlier actually matches the size in the current packet. If it is ok not to forward an error the log message could be demoted to verbose. I guess we can and should return an error here directly as well. // Martin ___ libav-devel mailing list libav-devel@libav.org https://lists.libav.org/mailman/listinfo/libav-devel
[libav-devel] [PATCH] rtmppkt: Check for packet size mismatches
From: Michael Niedermayer When receiving fragmented packets, the first packet declares the size, and the later ones normally are small follow-on packets that don't repeat the size and the other header fields. But technically, the later fragments also can have a full header, declaring a different size than the previous packet. If the follow-on packet declares a larger size than the initial one, we could end up writing outside of the allocation. This fixes out of bounds writes. Found-by: Paul Cher Reviewed-by: Paul Cher CC: libav-sta...@libav.org --- Now with error return and improved commit message. --- libavformat/rtmppkt.c | 8 1 file changed, 8 insertions(+) diff --git a/libavformat/rtmppkt.c b/libavformat/rtmppkt.c index f8c51d0..1cb3078 100644 --- a/libavformat/rtmppkt.c +++ b/libavformat/rtmppkt.c @@ -235,6 +235,14 @@ static int rtmp_packet_read_one_chunk(URLContext *h, RTMPPacket *p, if (hdr != RTMP_PS_TWELVEBYTES) timestamp += prev_pkt[channel_id].timestamp; +if (prev_pkt[channel_id].read && size != prev_pkt[channel_id].size) { +av_log(h, AV_LOG_ERROR, "RTMP packet size mismatch %d != %d\n", +size, prev_pkt[channel_id].size); +ff_rtmp_packet_destroy(&prev_pkt[channel_id]); +prev_pkt[channel_id].read = 0; +return AVERROR_INVALIDDATA; +} + if (!prev_pkt[channel_id].read) { if ((ret = ff_rtmp_packet_create(p, channel_id, type, timestamp, size)) < 0) -- 2.10.1 (Apple Git-78) ___ libav-devel mailing list libav-devel@libav.org https://lists.libav.org/mailman/listinfo/libav-devel
[libav-devel] [PATCH] aarch64: vp9itxfm: Use the offset parameter to movrel
This fixes build failures for iOS, broken since cad42fadcd2c. --- libavcodec/aarch64/vp9itxfm_neon.S | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/libavcodec/aarch64/vp9itxfm_neon.S b/libavcodec/aarch64/vp9itxfm_neon.S index 053d46f..7ce6df0 100644 --- a/libavcodec/aarch64/vp9itxfm_neon.S +++ b/libavcodec/aarch64/vp9itxfm_neon.S @@ -1108,7 +1108,7 @@ function ff_vp9_idct_idct_32x32_add_neon, export=1 movrel x10, idct_coeffs add x11, x10, #32 -movrel x12, min_eob_idct_idct_32 + 2 +movrel x12, min_eob_idct_idct_32, 2 mov x15, x30 -- 2.10.1 (Apple Git-78) ___ libav-devel mailing list libav-devel@libav.org https://lists.libav.org/mailman/listinfo/libav-devel
[libav-devel] Skipping checkasm on MSVC DLL setups?
Hi, It seems like checkasm has started failing to build in MSVC DLL setups recently, see e.g. this instance: https://fate.libav.org/i686-msvc-10-dll-wine The reason is that once you build with --enable-shared with MSVC, you can't link those object files via static libraries any longer. (See d66c52c2b369401ba4face1c171ccb19130b7a31, that added support for MSVC built DLLs, for the full explanation.) And checkasm links to the libs statically in order to link to non-exported fucntions. In practice, this hasn't been an issue so far, but since 972c71e9cb63e24f57ee481e413199c7d88a8813, checkasm seems to pull in enough object files from libavcodec to start exhibiting the problem I explained in the commit when MSVC DLL support as added above. So it has only worked by coincidence so far. We could of course try to figure out what part of the dsp object files that checkasm links suddenly pulls in bsf code in this link, but that'd also be just hacking around the problem. Since checkasm in MSVC DLL builds isn't that important, since it's covered pretty well in other test setups anyway, I would suggest to just disable checkasm in this case. We easily have got CONFIG_SHARED available in makefile, but I guess we'd need to add something else to identify MSVC. Or just a "disable checkasm" or similar, somewhere in configure? // Martin ___ libav-devel mailing list libav-devel@libav.org https://lists.libav.org/mailman/listinfo/libav-devel
[libav-devel] [PATCH] http: Check for negative chunk sizes
A negative chunk size is illegal and would end up used as length for memcpy, where it would lead to memory accesses out of bounds. Found-by: Paul Cher CC: libav-sta...@libav.org --- libavformat/http.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/libavformat/http.c b/libavformat/http.c index 8fe8d11..7e3708e 100644 --- a/libavformat/http.c +++ b/libavformat/http.c @@ -784,6 +784,8 @@ static int http_read_stream(URLContext *h, uint8_t *buf, int size) av_log(NULL, AV_LOG_TRACE, "Chunked encoding data size: %"PRId64"'\n", s->chunksize); +if (s->chunksize < 0) +return AVERROR_INVALIDDATA; if (!s->chunksize) return 0; -- 2.10.1 (Apple Git-78) ___ libav-devel mailing list libav-devel@libav.org https://lists.libav.org/mailman/listinfo/libav-devel
Re: [libav-devel] [PATCH] http: Check for negative chunk sizes
On Mon, 19 Dec 2016, Vittorio Giovara wrote: On Mon, Dec 19, 2016 at 9:56 PM, Martin Storsjö wrote: A negative chunk size is illegal and would end up used as length for memcpy, where it would lead to memory accesses out of bounds. Found-by: Paul Cher CC: libav-sta...@libav.org --- libavformat/http.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/libavformat/http.c b/libavformat/http.c index 8fe8d11..7e3708e 100644 --- a/libavformat/http.c +++ b/libavformat/http.c @@ -784,6 +784,8 @@ static int http_read_stream(URLContext *h, uint8_t *buf, int size) av_log(NULL, AV_LOG_TRACE, "Chunked encoding data size: %"PRId64"'\n", s->chunksize); +if (s->chunksize < 0) +return AVERROR_INVALIDDATA; if (!s->chunksize) return 0; This is mostly a nit, but would it make sense to coalesce the second `if` into a `else if`? Ok with me either way. Pushed with this changed as suggested. // Martin ___ libav-devel mailing list libav-devel@libav.org https://lists.libav.org/mailman/listinfo/libav-devel
[libav-devel] [PATCH] fate: Add --ignore-tests configure option for omitting specific FATE tests
From: Janne Grunau This can be useful to filter out noise in known-broken scenarios like miscompilation by legacy compilers and similar. Originally based on a patch by Diego Biurrun. --- This is the common agreed upon consensus from Diego's and Janne's discussion, as far as I read it. --- configure | 11 +++ doc/fate.texi | 1 + tests/Makefile| 4 +++- tests/fate-run.sh | 9 +++-- tests/fate.sh | 1 + 5 files changed, 23 insertions(+), 3 deletions(-) diff --git a/configure b/configure index 8e40238..8472145 100755 --- a/configure +++ b/configure @@ -348,6 +348,8 @@ Developer options (useful when working on Libav itself): --random-seed=VALUE seed value for --enable/disable-random --disable-valgrind-backtrace do not print a backtrace under Valgrind (only applies to --disable-optimizations builds) + --ignore-tests=TESTS comma-separated list (without "fate-" prefix + in the name) of tests which result is ignored NOTE: Object files are built at the place where configure is launched. EOF @@ -1815,6 +1817,7 @@ CMDLINE_SET=" host_ld host_ldflags host_os +ignore_tests ld logfile malloc_prefix @@ -5192,6 +5195,13 @@ for type in decoder encoder hwaccel parser demuxer muxer protocol filter bsf ind echo done +if test -n "$ignore_tests"; then +ignore_tests=$(echo $ignore_tests | tr ',' ' ') +echo "Ignored FATE tests:" +echo $ignore_tests | print_3_columns +echo +fi + license="LGPL version 2.1 or later" if enabled nonfree; then license="nonfree and unredistributable" @@ -5316,6 +5326,7 @@ SLIB_INSTALL_EXTRA_LIB=${SLIB_INSTALL_EXTRA_LIB} SLIB_INSTALL_EXTRA_SHLIB=${SLIB_INSTALL_EXTRA_SHLIB} VERSION_SCRIPT_POSTPROCESS_CMD=${VERSION_SCRIPT_POSTPROCESS_CMD} SAMPLES:=${samples:-\$(LIBAV_SAMPLES)} +IGNORE_TESTS=$ignore_tests EOF map 'eval echo "${v}_FFLIBS=\$${v}_deps" >> avbuild/config.mak' $LIBRARY_LIST diff --git a/doc/fate.texi b/doc/fate.texi index 9e654e7..b1bfa2e 100644 --- a/doc/fate.texi +++ b/doc/fate.texi @@ -139,6 +139,7 @@ workdir=# directory in which to do all the work fate_recv="ssh -T fate@@fate.libav.org" # command to submit report comment=# optional description build_only= # set to "yes" for a compile-only instance that skips tests +ignore_tests= # the following are optional and map to configure options arch= diff --git a/tests/Makefile b/tests/Makefile index 0e475a2..30e06e8 100644 --- a/tests/Makefile +++ b/tests/Makefile @@ -143,11 +143,13 @@ endif FATE_UTILS = base64 tiny_psnr +$(addprefix fate-, $(IGNORE_TESTS)): REPORT=ignore + fate: $(FATE) $(FATE): $(FATE_UTILS:%=tests/%$(HOSTEXESUF)) @echo "TEST$(@:fate-%=%)" - $(Q)$(SRC_PATH)/tests/fate-run.sh $@ "$(SAMPLES)" "$(TARGET_EXEC)" "$(TARGET_PATH)" '$(CMD)' '$(CMP)' '$(REF)' '$(FUZZ)' '$(THREADS)' '$(THREAD_TYPE)' '$(CPUFLAGS)' '$(CMP_SHIFT)' '$(CMP_TARGET)' '$(SIZE_TOLERANCE)' '$(CMP_UNIT)' '$(GEN)' '$(HWACCEL)' + $(Q)$(SRC_PATH)/tests/fate-run.sh $@ "$(SAMPLES)" "$(TARGET_EXEC)" "$(TARGET_PATH)" '$(CMD)' '$(CMP)' '$(REF)' '$(FUZZ)' '$(THREADS)' '$(THREAD_TYPE)' '$(CPUFLAGS)' '$(CMP_SHIFT)' '$(CMP_TARGET)' '$(SIZE_TOLERANCE)' '$(CMP_UNIT)' '$(GEN)' '$(HWACCEL)' '$(REPORT)' fate-list: @printf '%s\n' $(sort $(FATE)) diff --git a/tests/fate-run.sh b/tests/fate-run.sh index b1b299a..3982a18 100755 --- a/tests/fate-run.sh +++ b/tests/fate-run.sh @@ -24,6 +24,7 @@ size_tolerance=${14:-0} cmp_unit=${15:-2} gen=${16:-no} hwaccel=${17:-none} +report_type=${18:-standard} outdir="tests/data/fate" outfile="${outdir}/${test}" @@ -212,13 +213,17 @@ if test -e "$ref" || test $cmp = "oneline" ; then esac cmperr=$? test $err = 0 && err=$cmperr -test $err = 0 || cat $cmpfile +if [ "$report_type" = "ignore" ]; then +test $err = 0 || echo "ignoring fate-${test}" && err=0 +else +test $err = 0 || cat $cmpfile +fi else echo "reference file '$ref' not found" err=1 fi -if [ $err -eq 0 ]; then +if [ $err -eq 0 ] && test $report_type = "standard" ; then unset cmpo erro else cmpo="$($base64 <$cmpfile)" diff --git a/tests/fate.sh b/tests/fate.sh index 4608d2d..c93e20a 100755 --- a/tests/fate.sh +++ b/tests/fate.sh @@ -47,6 +47,7 @@ configure()( --prefix="${inst}" \ --samples="${samples}" \ --enable-gpl\ +${ignore_tests:+--ignore-tests="$ignore_tests"} \ ${arch:+--arch=$arch} \ ${cpu:+--cpu="$cpu"}\ ${toolchain:+--toolchain="$toolchain"} \ -- 2.10.1 (Apple Git-78) ___
[libav-devel] [PATCH] fate: Unset the sig variable if ignoring a test failure
Otherwise the .rep file would still contain a signal instead of a zero, even if the process returned success. --- Also, I think the fate- prefix should be omitted in the IGNORE line. See https://fate.libav.org/i386-apple-darwin-gcc-4.0/20161228201333/test for an example of what it looks like. And instead of "IGNORE " with double spaces, I think the more correct/consistent thing to use would be a \t, to match what the makefiles use for the CC etc lines. --- tests/fate-run.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/fate-run.sh b/tests/fate-run.sh index 27cd626..e1aaf64 100755 --- a/tests/fate-run.sh +++ b/tests/fate-run.sh @@ -214,7 +214,7 @@ if test -e "$ref" || test $cmp = "oneline" ; then cmperr=$? test $err = 0 && err=$cmperr if [ "$report_type" = "ignore" ]; then -test $err = 0 || echo "IGNORE fate-${test}" && err=0 +test $err = 0 || echo "IGNORE fate-${test}" && err=0 && unset sig else test $err = 0 || cat $cmpfile fi -- 2.10.1 (Apple Git-78) ___ libav-devel mailing list libav-devel@libav.org https://lists.libav.org/mailman/listinfo/libav-devel
[libav-devel] [PATCH] fate: Tweak printing of ignored tests
Use a tab instead of two spaces, skip the fate prefix for the test name. This makes IGNORE line fit in even better with the other make printouts. --- tests/fate-run.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/fate-run.sh b/tests/fate-run.sh index e1aaf64..623fd63 100755 --- a/tests/fate-run.sh +++ b/tests/fate-run.sh @@ -214,7 +214,7 @@ if test -e "$ref" || test $cmp = "oneline" ; then cmperr=$? test $err = 0 && err=$cmperr if [ "$report_type" = "ignore" ]; then -test $err = 0 || echo "IGNORE fate-${test}" && err=0 && unset sig +test $err = 0 || echo "IGNORE\t${test}" && err=0 && unset sig else test $err = 0 || cat $cmpfile fi -- 2.10.1 (Apple Git-78) ___ libav-devel mailing list libav-devel@libav.org https://lists.libav.org/mailman/listinfo/libav-devel
[libav-devel] [PATCH] fate: Skip the checkasm test if CONFIG_STATIC is disabled
When building DLLs with MSVC, CONFIG_STATIC is disabled (see d66c52c2b3694 for a more verbose explanation) since the built object files can't be linked statically (which checkasm does). This worked up until recently, only by luck. --- tests/fate/checkasm.mak | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/fate/checkasm.mak b/tests/fate/checkasm.mak index 939eb56..5721f1f 100644 --- a/tests/fate/checkasm.mak +++ b/tests/fate/checkasm.mak @@ -19,5 +19,5 @@ $(FATE_CHECKASM): tests/checkasm/checkasm$(EXESUF) $(FATE_CHECKASM): CMD = run tests/checkasm/checkasm --test=$(@:fate-checkasm-%=%) $(FATE_CHECKASM): REF = /dev/null -FATE += $(FATE_CHECKASM) +FATE-$(CONFIG_STATIC) += $(FATE_CHECKASM) fate-checkasm: $(FATE_CHECKASM) -- 2.7.4 ___ libav-devel mailing list libav-devel@libav.org https://lists.libav.org/mailman/listinfo/libav-devel
Re: [libav-devel] Skipping checkasm on MSVC DLL setups?
On Sat, 31 Dec 2016, Diego Biurrun wrote: Hmmm... On Mon, Dec 19, 2016 at 11:51:36AM +0200, Martin Storsjö wrote: It seems like checkasm has started failing to build in MSVC DLL setups recently, see e.g. this instance: https://fate.libav.org/i686-msvc-10-dll-wine The reason is that once you build with --enable-shared with MSVC, you can't link those object files via static libraries any longer. (See d66c52c2b369401ba4face1c171ccb19130b7a31, that added support for MSVC built DLLs, for the full explanation.) And checkasm links to the libs statically in order to link to non-exported fucntions. In practice, this hasn't been an issue so far, but since 972c71e9cb63e24f57ee481e413199c7d88a8813, checkasm seems to pull in enough object files from libavcodec to start exhibiting the problem I explained in the commit when MSVC DLL support as added above. So it has only worked by coincidence so far. We could of course try to figure out what part of the dsp object files that checkasm links suddenly pulls in bsf code in this link, but that'd also be just hacking around the problem. I'm afraid we're hacking around the problem either way and the only proper solution is to rid ourselves of avpriv_, which is basically the devil. Yes. In this particular case, getting rid of the avpriv data symbols is enough; the avpriv functions are harmless wrt this. Since checkasm in MSVC DLL builds isn't that important, since it's covered pretty well in other test setups anyway, I would suggest to just disable checkasm in this case. We easily have got CONFIG_SHARED available in makefile, but I guess we'd need to add something else to identify MSVC. Or just a "disable checkasm" or similar, somewhere in configure? I pointed out how you could achieve this by adding a CONFIG_STATIC condition in the right place, but what about the other TESTPROGS? The situation wrt them seems equally brittle. At any point in the future some avpriv_ symbol might get pulled in and break linking them. Disabling all the tests that use TESTPROGS also seems undesirable since we're speaking of 200+ tests. The seek tests are among them... I don't see that as being too bad. Given the choice between removing these fate instances altogether (or, playing devil's advocate even further, removing the MSVC DLL support altogether, since it's something of a hack wrt avpriv data), or disabling a couple hundred of the tests, I much prefer disabling these tests. We've got corresponding fate instances that test the library internals in general, thoroughly, without building as DLLs. When building as DLLs, the only thing that you really can test is avconv.exe. This looks like a dilemma where all the alternatives are hacks to me... Well, partially yes. But the patch I posted based on your suggestion isn't all that wrong either (although incomplete wrt testprogs); if CONFIG_STATIC is disabled, one probably shouldn't try to link checkasm (and the others). Basically, as far as I know, the only fate instances with CONFIG_STATIC disabled would be my MSVC DLL ones. It would probably hit people building with --disable-static (wanting to redistribute only e.g. shared libs), making a "make check" in their builds test things a little less thoroughly, but I would see that as an acceptable compromise. // Martin ___ libav-devel mailing list libav-devel@libav.org https://lists.libav.org/mailman/listinfo/libav-devel
[libav-devel] [PATCH 4/6] aarch64: vp9mc: Simplify the extmla macro parameters
Fold the field lengths into the macro. This makes the macro invocations much more readable, when the lines are shorter. This also makes it easier to use only half the registers within the macro. --- libavcodec/aarch64/vp9mc_neon.S | 50 - 1 file changed, 25 insertions(+), 25 deletions(-) diff --git a/libavcodec/aarch64/vp9mc_neon.S b/libavcodec/aarch64/vp9mc_neon.S index c1f1876..99f1809 100644 --- a/libavcodec/aarch64/vp9mc_neon.S +++ b/libavcodec/aarch64/vp9mc_neon.S @@ -193,41 +193,41 @@ endfunc // for size >= 16), and multiply-accumulate into dst1 and dst3 (or // dst1-dst2 and dst3-dst4 for size >= 16) .macro extmla dst1, dst2, dst3, dst4, src1, src2, src3, src4, src5, src6, offset, size -ext v20.16b, \src1, \src2, #(2*\offset) -ext v22.16b, \src4, \src5, #(2*\offset) +ext v20.16b, \src1\().16b, \src2\().16b, #(2*\offset) +ext v22.16b, \src4\().16b, \src5\().16b, #(2*\offset) .if \size >= 16 -mla \dst1, v20.8h, v0.h[\offset] -ext v21.16b, \src2, \src3, #(2*\offset) -mla \dst3, v22.8h, v0.h[\offset] -ext v23.16b, \src5, \src6, #(2*\offset) -mla \dst2, v21.8h, v0.h[\offset] -mla \dst4, v23.8h, v0.h[\offset] +mla \dst1\().8h, v20.8h, v0.h[\offset] +ext v21.16b, \src2\().16b, \src3\().16b, #(2*\offset) +mla \dst3\().8h, v22.8h, v0.h[\offset] +ext v23.16b, \src5\().16b, \src6\().16b, #(2*\offset) +mla \dst2\().8h, v21.8h, v0.h[\offset] +mla \dst4\().8h, v23.8h, v0.h[\offset] .else -mla \dst1, v20.8h, v0.h[\offset] -mla \dst3, v22.8h, v0.h[\offset] +mla \dst1\().8h, v20.8h, v0.h[\offset] +mla \dst3\().8h, v22.8h, v0.h[\offset] .endif .endm // The same as above, but don't accumulate straight into the // destination, but use a temp register and accumulate with saturation. .macro extmulqadd dst1, dst2, dst3, dst4, src1, src2, src3, src4, src5, src6, offset, size -ext v20.16b, \src1, \src2, #(2*\offset) -ext v22.16b, \src4, \src5, #(2*\offset) +ext v20.16b, \src1\().16b, \src2\().16b, #(2*\offset) +ext v22.16b, \src4\().16b, \src5\().16b, #(2*\offset) .if \size >= 16 mul v20.8h, v20.8h, v0.h[\offset] -ext v21.16b, \src2, \src3, #(2*\offset) +ext v21.16b, \src2\().16b, \src3\().16b, #(2*\offset) mul v22.8h, v22.8h, v0.h[\offset] -ext v23.16b, \src5, \src6, #(2*\offset) +ext v23.16b, \src5\().16b, \src6\().16b, #(2*\offset) mul v21.8h, v21.8h, v0.h[\offset] mul v23.8h, v23.8h, v0.h[\offset] .else mul v20.8h, v20.8h, v0.h[\offset] mul v22.8h, v22.8h, v0.h[\offset] .endif -sqadd \dst1, \dst1, v20.8h -sqadd \dst3, \dst3, v22.8h +sqadd \dst1\().8h, \dst1\().8h, v20.8h +sqadd \dst3\().8h, \dst3\().8h, v22.8h .if \size >= 16 -sqadd \dst2, \dst2, v21.8h -sqadd \dst4, \dst4, v23.8h +sqadd \dst2\().8h, \dst2\().8h, v21.8h +sqadd \dst4\().8h, \dst4\().8h, v23.8h .endif .endm @@ -292,13 +292,13 @@ function \type\()_8tap_\size\()h_\idx1\idx2 mul v2.8h, v5.8h, v0.h[0] mul v25.8h, v17.8h, v0.h[0] .endif -extmla v1.8h, v2.8h, v24.8h, v25.8h, v4.16b, v5.16b, v6.16b, v16.16b, v17.16b, v18.16b, 1, \size -extmla v1.8h, v2.8h, v24.8h, v25.8h, v4.16b, v5.16b, v6.16b, v16.16b, v17.16b, v18.16b, 2, \size -extmla v1.8h, v2.8h, v24.8h, v25.8h, v4.16b, v5.16b, v6.16b, v16.16b, v17.16b, v18.16b, \idx1, \size -extmla v1.8h, v2.8h, v24.8h, v25.8h, v4.16b, v5.16b, v6.16b, v16.16b, v17.16b, v18.16b, 5, \size -extmla v1.8h, v2.8h, v24.8h, v25.8h, v4.16b, v5.16b, v6.16b, v16.16b, v17.16b, v18.16b, 6, \size -extmla v1.8h, v2.8h, v24.8h, v25.8h, v4.16b, v5.16b, v6.16b, v16.16b, v17.16b, v18.16b, 7, \size -extmulqadd v1.8h, v2.8h, v24.8h, v25.8h, v4.16b, v5.16b, v6.16b, v16.16b, v17.16b, v18.16b, \idx2, \size +extmla v1, v2, v24, v25, v4, v5, v6, v16, v17, v18, 1, \size +extmla v1, v2, v24, v25, v4, v5, v6, v16, v17, v18, 2, \size +extmla v1, v2, v24, v25, v4, v5, v6, v16, v17, v18, \idx1, \size +extmla v1, v2, v24, v25, v4, v5, v6, v16, v17, v18, 5, \s
[libav-devel] [PATCH 2/6] aarch64: vp9dsp: Fix vertical alignment in the init file
--- libavcodec/aarch64/vp9dsp_init_aarch64.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/libavcodec/aarch64/vp9dsp_init_aarch64.c b/libavcodec/aarch64/vp9dsp_init_aarch64.c index 486ae1e..2ac9e19 100644 --- a/libavcodec/aarch64/vp9dsp_init_aarch64.c +++ b/libavcodec/aarch64/vp9dsp_init_aarch64.c @@ -125,7 +125,7 @@ static av_cold void vp9dsp_mc_init_aarch64(VP9DSPContext *dsp) #define init_mc_func(idx1, idx2, op, filter, fname, dir, mx, my, sz, pfx) \ dsp->mc[idx1][filter][idx2][mx][my] = pfx##op##_##fname##sz##_##dir##_neon -#define init_mc_funcs(idx, dir, mx, my, sz, pfx) \ +#define init_mc_funcs(idx, dir, mx, my, sz, pfx) \ init_mc_func(idx, 0, put, FILTER_8TAP_REGULAR, regular, dir, mx, my, sz, pfx); \ init_mc_func(idx, 0, put, FILTER_8TAP_SHARP, sharp, dir, mx, my, sz, pfx); \ init_mc_func(idx, 0, put, FILTER_8TAP_SMOOTH, smooth, dir, mx, my, sz, pfx); \ @@ -133,7 +133,7 @@ static av_cold void vp9dsp_mc_init_aarch64(VP9DSPContext *dsp) init_mc_func(idx, 1, avg, FILTER_8TAP_SHARP, sharp, dir, mx, my, sz, pfx); \ init_mc_func(idx, 1, avg, FILTER_8TAP_SMOOTH, smooth, dir, mx, my, sz, pfx) -#define init_mc_funcs_dirs(idx, sz) \ +#define init_mc_funcs_dirs(idx, sz)\ init_mc_funcs(idx, h, 1, 0, sz, ff_vp9_); \ init_mc_funcs(idx, v, 0, 1, sz, ff_vp9_); \ init_mc_funcs(idx, hv, 1, 1, sz,) -- 2.7.4 ___ libav-devel mailing list libav-devel@libav.org https://lists.libav.org/mailman/listinfo/libav-devel
[libav-devel] [PATCH 6/6] aarch64: vp9mc: Calculate less unused data in the 4 pixel wide horizontal filter
No measured speedup on an Cortex A53, but other cores might benefit. --- libavcodec/aarch64/vp9mc_neon.S | 15 +-- 1 file changed, 13 insertions(+), 2 deletions(-) diff --git a/libavcodec/aarch64/vp9mc_neon.S b/libavcodec/aarch64/vp9mc_neon.S index 99f1809..95ed26c 100644 --- a/libavcodec/aarch64/vp9mc_neon.S +++ b/libavcodec/aarch64/vp9mc_neon.S @@ -202,9 +202,12 @@ endfunc ext v23.16b, \src5\().16b, \src6\().16b, #(2*\offset) mla \dst2\().8h, v21.8h, v0.h[\offset] mla \dst4\().8h, v23.8h, v0.h[\offset] -.else +.elseif \size == 8 mla \dst1\().8h, v20.8h, v0.h[\offset] mla \dst3\().8h, v22.8h, v0.h[\offset] +.else +mla \dst1\().4h, v20.4h, v0.h[\offset] +mla \dst3\().4h, v22.4h, v0.h[\offset] .endif .endm // The same as above, but don't accumulate straight into the @@ -219,16 +222,24 @@ endfunc ext v23.16b, \src5\().16b, \src6\().16b, #(2*\offset) mul v21.8h, v21.8h, v0.h[\offset] mul v23.8h, v23.8h, v0.h[\offset] -.else +.elseif \size == 8 mul v20.8h, v20.8h, v0.h[\offset] mul v22.8h, v22.8h, v0.h[\offset] +.else +mul v20.4h, v20.4h, v0.h[\offset] +mul v22.4h, v22.4h, v0.h[\offset] .endif +.if \size == 4 +sqadd \dst1\().4h, \dst1\().4h, v20.4h +sqadd \dst3\().4h, \dst3\().4h, v22.4h +.else sqadd \dst1\().8h, \dst1\().8h, v20.8h sqadd \dst3\().8h, \dst3\().8h, v22.8h .if \size >= 16 sqadd \dst2\().8h, \dst2\().8h, v21.8h sqadd \dst4\().8h, \dst4\().8h, v23.8h .endif +.endif .endm -- 2.7.4 ___ libav-devel mailing list libav-devel@libav.org https://lists.libav.org/mailman/listinfo/libav-devel
[libav-devel] [PATCH 5/6] arm: vp9mc: Calculate less unused data in the 4 pixel wide horizontal filter
Before:Cortex A7 A8 A9 A53 vp9_put_8tap_smooth_4h_neon: 378.1 273.2 340.7 229.5 After: vp9_put_8tap_smooth_4h_neon: 352.1 222.2 290.5 229.5 --- libavcodec/arm/vp9mc_neon.S | 33 ++--- 1 file changed, 22 insertions(+), 11 deletions(-) diff --git a/libavcodec/arm/vp9mc_neon.S b/libavcodec/arm/vp9mc_neon.S index a5413a3..8d43ff1 100644 --- a/libavcodec/arm/vp9mc_neon.S +++ b/libavcodec/arm/vp9mc_neon.S @@ -209,7 +209,7 @@ endfunc @ Extract a vector from src1-src2 and src4-src5 (src1-src3 and src4-src6 @ for size >= 16), and multiply-accumulate into dst1 and dst3 (or @ dst1-dst2 and dst3-dst4 for size >= 16) -.macro extmla dst1, dst2, dst3, dst4, src1, src2, src3, src4, src5, src6, offset, size +.macro extmla dst1, dst2, dst3, dst4, dst1d, dst3d, src1, src2, src3, src4, src5, src6, offset, size vext.8 q14, \src1, \src2, #(2*\offset) vext.8 q15, \src4, \src5, #(2*\offset) .if \size >= 16 @@ -219,14 +219,17 @@ endfunc vext.8 q6, \src5, \src6, #(2*\offset) vmla_lane \dst2, q5, \offset vmla_lane \dst4, q6, \offset -.else +.elseif \size == 8 vmla_lane \dst1, q14, \offset vmla_lane \dst3, q15, \offset +.else +vmla_lane \dst1d, d28, \offset +vmla_lane \dst3d, d30, \offset .endif .endm @ The same as above, but don't accumulate straight into the @ destination, but use a temp register and accumulate with saturation. -.macro extmulqadd dst1, dst2, dst3, dst4, src1, src2, src3, src4, src5, src6, offset, size +.macro extmulqadd dst1, dst2, dst3, dst4, dst1d, dst3d, src1, src2, src3, src4, src5, src6, offset, size vext.8 q14, \src1, \src2, #(2*\offset) vext.8 q15, \src4, \src5, #(2*\offset) .if \size >= 16 @@ -236,16 +239,24 @@ endfunc vext.8 q6, \src5, \src6, #(2*\offset) vmul_lane q5, q5, \offset vmul_lane q6, q6, \offset -.else +.elseif \size == 8 vmul_lane q14, q14, \offset vmul_lane q15, q15, \offset +.else +vmul_lane d28, d28, \offset +vmul_lane d30, d30, \offset .endif +.if \size == 4 +vqadd.s16 \dst1d, \dst1d, d28 +vqadd.s16 \dst3d, \dst3d, d30 +.else vqadd.s16 \dst1, \dst1, q14 vqadd.s16 \dst3, \dst3, q15 .if \size >= 16 vqadd.s16 \dst2, \dst2, q5 vqadd.s16 \dst4, \dst4, q6 .endif +.endif .endm @@ -309,13 +320,13 @@ function \type\()_8tap_\size\()h_\idx1\idx2 vmul.s16q2, q9, d0[0] vmul.s16q4, q12, d0[0] .endif -extmla q1, q2, q3, q4, q8, q9, q10, q11, q12, q13, 1, \size -extmla q1, q2, q3, q4, q8, q9, q10, q11, q12, q13, 2, \size -extmla q1, q2, q3, q4, q8, q9, q10, q11, q12, q13, \idx1, \size -extmla q1, q2, q3, q4, q8, q9, q10, q11, q12, q13, 5, \size -extmla q1, q2, q3, q4, q8, q9, q10, q11, q12, q13, 6, \size -extmla q1, q2, q3, q4, q8, q9, q10, q11, q12, q13, 7, \size -extmulqadd q1, q2, q3, q4, q8, q9, q10, q11, q12, q13, \idx2, \size +extmla q1, q2, q3, q4, d2, d6, q8, q9, q10, q11, q12, q13, 1, \size +extmla q1, q2, q3, q4, d2, d6, q8, q9, q10, q11, q12, q13, 2, \size +extmla q1, q2, q3, q4, d2, d6, q8, q9, q10, q11, q12, q13, \idx1, \size +extmla q1, q2, q3, q4, d2, d6, q8, q9, q10, q11, q12, q13, 5, \size +extmla q1, q2, q3, q4, d2, d6, q8, q9, q10, q11, q12, q13, 6, \size +extmla q1, q2, q3, q4, d2, d6, q8, q9, q10, q11, q12, q13, 7, \size +extmulqadd q1, q2, q3, q4, d2, d6, q8, q9, q10, q11, q12, q13, \idx2, \size @ Round, shift and saturate vqrshrun.s16d2, q1, #7 -- 2.7.4 ___ libav-devel mailing list libav-devel@libav.org https://lists.libav.org/mailman/listinfo/libav-devel
[libav-devel] [PATCH 1/6] arm: vp9mc: Fix vertical alignment of operands
--- libavcodec/arm/vp9mc_neon.S | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/libavcodec/arm/vp9mc_neon.S b/libavcodec/arm/vp9mc_neon.S index d3d2c7f..a5413a3 100644 --- a/libavcodec/arm/vp9mc_neon.S +++ b/libavcodec/arm/vp9mc_neon.S @@ -79,7 +79,7 @@ function ff_vp9_avg32_neon, export=1 vrhadd.u8 q0, q0, q2 vrhadd.u8 q1, q1, q3 subsr12, r12, #1 -vst1.8 {q0, q1}, [r0, :128], r1 +vst1.8 {q0, q1}, [r0, :128], r1 bne 1b bx lr endfunc @@ -408,7 +408,7 @@ function ff_vp9_\type\()_\filter\()\size\()_h_neon, export=1 add r12, r12, 120*\offset - 8 cmp r5, #8 add r12, r12, r5, lsl #3 -mov r5, #\size +mov r5, #\size .if \size >= 16 bge \type\()_8tap_16h_34 b \type\()_8tap_16h_43 @@ -543,7 +543,7 @@ function \type\()_8tap_8v_\idx1\idx2 vld1.8 {d0}, [r12, :64] vmovl.s8q0, d0 1: -mov r12, r4 +mov r12, r4 loadl q5, q6, q7 loadl q8, q9, q10, q11 -- 2.7.4 ___ libav-devel mailing list libav-devel@libav.org https://lists.libav.org/mailman/listinfo/libav-devel
[libav-devel] [PATCH 3/6] aarch64: vp9mc: Fix a comment to refer to a register with the right name
--- libavcodec/aarch64/vp9mc_neon.S | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/libavcodec/aarch64/vp9mc_neon.S b/libavcodec/aarch64/vp9mc_neon.S index 720273b..c1f1876 100644 --- a/libavcodec/aarch64/vp9mc_neon.S +++ b/libavcodec/aarch64/vp9mc_neon.S @@ -250,7 +250,7 @@ function \type\()_8tap_\size\()h_\idx1\idx2 .if \size >= 16 sub x1, x1, x5 .endif -// size >= 16 loads two qwords and increments r2, +// size >= 16 loads two qwords and increments x2, // for size 4/8 it's enough with one qword and no // postincrement .if \size >= 16 -- 2.7.4 ___ libav-devel mailing list libav-devel@libav.org https://lists.libav.org/mailman/listinfo/libav-devel
[libav-devel] [PATCH 1/2] arm: vp9itxfm: Optimize 16x16 and 32x32 idct dc by unrolling
This work is sponsored by, and copyright, Google. Before:Cortex A7 A8 A9 A53 vp9_inv_dct_dct_16x16_sub1_add_neon: 273.0 189.5 211.7 235.8 vp9_inv_dct_dct_32x32_sub1_add_neon: 752.0 459.2 862.2 553.9 After: vp9_inv_dct_dct_16x16_sub1_add_neon: 226.5 145.0 225.1 171.8 vp9_inv_dct_dct_32x32_sub1_add_neon: 721.2 415.7 727.6 475.0 --- libavcodec/arm/vp9itxfm_neon.S | 54 -- 1 file changed, 36 insertions(+), 18 deletions(-) diff --git a/libavcodec/arm/vp9itxfm_neon.S b/libavcodec/arm/vp9itxfm_neon.S index 5abe435..a81240b 100644 --- a/libavcodec/arm/vp9itxfm_neon.S +++ b/libavcodec/arm/vp9itxfm_neon.S @@ -518,16 +518,23 @@ function idct16x16_dc_add_neon vrshr.s16 q8, q8, #6 +mov r3, r0 mov r12, #16 1: @ Loop to add the constant from q8 into all 16x16 outputs -vld1.8 {q3}, [r0,:128] -vaddw.u8q10, q8, d6 -vaddw.u8q11, q8, d7 -vqmovun.s16 d6, q10 -vqmovun.s16 d7, q11 -vst1.8 {q3}, [r0,:128], r1 -subsr12, r12, #1 +subsr12, r12, #2 +vld1.8 {q2}, [r0,:128], r1 +vaddw.u8q10, q8, d4 +vld1.8 {q3}, [r0,:128], r1 +vaddw.u8q11, q8, d5 +vaddw.u8q12, q8, d6 +vaddw.u8q13, q8, d7 +vqmovun.s16 d4, q10 +vqmovun.s16 d5, q11 +vqmovun.s16 d6, q12 +vst1.8 {q2}, [r3,:128], r1 +vqmovun.s16 d7, q13 +vst1.8 {q3}, [r3,:128], r1 bne 1b bx lr @@ -889,20 +896,31 @@ function idct32x32_dc_add_neon vrshr.s16 q8, q8, #6 +mov r3, r0 mov r12, #32 1: @ Loop to add the constant from q8 into all 32x32 outputs -vld1.8 {q2-q3}, [r0,:128] -vaddw.u8q10, q8, d4 -vaddw.u8q11, q8, d5 -vaddw.u8q12, q8, d6 -vaddw.u8q13, q8, d7 -vqmovun.s16 d4, q10 -vqmovun.s16 d5, q11 -vqmovun.s16 d6, q12 -vqmovun.s16 d7, q13 -vst1.8 {q2-q3}, [r0,:128], r1 -subsr12, r12, #1 +subsr12, r12, #2 +vld1.8 {q0-q1}, [r0,:128], r1 +vaddw.u8q9, q8, d0 +vaddw.u8q10, q8, d1 +vld1.8 {q2-q3}, [r0,:128], r1 +vaddw.u8q11, q8, d2 +vaddw.u8q12, q8, d3 +vaddw.u8q13, q8, d4 +vaddw.u8q14, q8, d5 +vaddw.u8q15, q8, d6 +vqmovun.s16 d0, q9 +vaddw.u8q9, q8, d7 +vqmovun.s16 d1, q10 +vqmovun.s16 d2, q11 +vqmovun.s16 d3, q12 +vqmovun.s16 d4, q13 +vqmovun.s16 d5, q14 +vst1.8 {q0-q1}, [r3,:128], r1 +vqmovun.s16 d6, q15 +vqmovun.s16 d7, q9 +vst1.8 {q2-q3}, [r3,:128], r1 bne 1b bx lr -- 2.7.4 ___ libav-devel mailing list libav-devel@libav.org https://lists.libav.org/mailman/listinfo/libav-devel
[libav-devel] [PATCH 2/2] aarch64: vp9itxfm: Optimize 16x16 and 32x32 idct dc by unrolling
This work is sponsored by, and copyright, Google. Before: Cortex A53 vp9_inv_dct_dct_16x16_sub1_add_neon: 235.3 vp9_inv_dct_dct_32x32_sub1_add_neon: 555.1 After: vp9_inv_dct_dct_16x16_sub1_add_neon: 180.2 vp9_inv_dct_dct_32x32_sub1_add_neon: 475.3 --- libavcodec/aarch64/vp9itxfm_neon.S | 54 +- 1 file changed, 36 insertions(+), 18 deletions(-) diff --git a/libavcodec/aarch64/vp9itxfm_neon.S b/libavcodec/aarch64/vp9itxfm_neon.S index 7ce6df0..a4284e3 100644 --- a/libavcodec/aarch64/vp9itxfm_neon.S +++ b/libavcodec/aarch64/vp9itxfm_neon.S @@ -448,16 +448,23 @@ function idct16x16_dc_add_neon srshr v2.8h, v2.8h, #6 +mov x3, x0 mov x4, #16 1: // Loop to add the constant from v2 into all 16x16 outputs -ld1 {v3.16b}, [x0] -uaddw v4.8h, v2.8h, v3.8b -uaddw2 v5.8h, v2.8h, v3.16b -sqxtun v4.8b, v4.8h -sqxtun2 v4.16b, v5.8h -st1 {v4.16b}, [x0], x1 -subsx4, x4, #1 +subsx4, x4, #2 +ld1 {v3.16b}, [x0], x1 +ld1 {v4.16b}, [x0], x1 +uaddw v16.8h, v2.8h, v3.8b +uaddw2 v17.8h, v2.8h, v3.16b +uaddw v18.8h, v2.8h, v4.8b +uaddw2 v19.8h, v2.8h, v4.16b +sqxtun v3.8b, v16.8h +sqxtun2 v3.16b, v17.8h +sqxtun v4.8b, v18.8h +sqxtun2 v4.16b, v19.8h +st1 {v3.16b}, [x3], x1 +st1 {v4.16b}, [x3], x1 b.ne1b ret @@ -824,20 +831,31 @@ function idct32x32_dc_add_neon srshr v0.8h, v2.8h, #6 +mov x3, x0 mov x4, #32 1: // Loop to add the constant v0 into all 32x32 outputs -ld1 {v1.16b,v2.16b}, [x0] -uaddw v3.8h, v0.8h, v1.8b -uaddw2 v4.8h, v0.8h, v1.16b -uaddw v5.8h, v0.8h, v2.8b -uaddw2 v6.8h, v0.8h, v2.16b -sqxtun v3.8b, v3.8h -sqxtun2 v3.16b, v4.8h -sqxtun v4.8b, v5.8h -sqxtun2 v4.16b, v6.8h -st1 {v3.16b,v4.16b}, [x0], x1 -subsx4, x4, #1 +subsx4, x4, #2 +ld1 {v1.16b,v2.16b}, [x0], x1 +uaddw v16.8h, v0.8h, v1.8b +uaddw2 v17.8h, v0.8h, v1.16b +ld1 {v3.16b,v4.16b}, [x0], x1 +uaddw v18.8h, v0.8h, v2.8b +uaddw2 v19.8h, v0.8h, v2.16b +uaddw v20.8h, v0.8h, v3.8b +uaddw2 v21.8h, v0.8h, v3.16b +uaddw v22.8h, v0.8h, v4.8b +uaddw2 v23.8h, v0.8h, v4.16b +sqxtun v1.8b, v16.8h +sqxtun2 v1.16b, v17.8h +sqxtun v2.8b, v18.8h +sqxtun2 v2.16b, v19.8h +sqxtun v3.8b, v20.8h +sqxtun2 v3.16b, v21.8h +st1 {v1.16b,v2.16b}, [x3], x1 +sqxtun v4.8b, v22.8h +sqxtun2 v4.16b, v23.8h +st1 {v3.16b,v4.16b}, [x3], x1 b.ne1b ret -- 2.7.4 ___ libav-devel mailing list libav-devel@libav.org https://lists.libav.org/mailman/listinfo/libav-devel
Re: [libav-devel] [PATCH 11/11] rtp: Convert to the new bitstream reader
On Fri, 13 Jan 2017, Anton Khirnov wrote: Quoting Diego Biurrun (2017-01-11 18:10:08) From: Alexandra Hájková --- libavformat/rtpdec_h261.c | 19 ++- libavformat/rtpdec_h263_rfc2190.c | 19 ++- libavformat/rtpdec_latm.c | 24 +--- libavformat/rtpdec_mpeg4.c| 16 +--- libavformat/rtpdec_qt.c | 31 --- libavformat/rtpenc_h263_rfc2190.c | 29 +++-- 6 files changed, 73 insertions(+), 65 deletions(-) diff --git a/libavformat/rtpdec_mpeg4.c b/libavformat/rtpdec_mpeg4.c index 00a732b..b3cd6c9 100644 --- a/libavformat/rtpdec_mpeg4.c +++ b/libavformat/rtpdec_mpeg4.c @@ -27,11 +27,13 @@ * @author Romain Degez */ -#include "rtpdec_formats.h" -#include "internal.h" #include "libavutil/attributes.h" #include "libavutil/avstring.h" -#include "libavcodec/get_bits.h" + +#include "libavcodec/bitstream.h" + +#include "rtpdec_formats.h" +#include "internal.h" #define MAX_AAC_HBR_FRAME_SIZE 8191 @@ -113,7 +115,7 @@ static int parse_fmtp_config(AVCodecParameters *par, const char *value) static int rtp_parse_mp4_au(PayloadContext *data, const uint8_t *buf, int len) { int au_headers_length, au_header_size, i; -GetBitContext getbitcontext; +BitstreamContext bctx; if (len < 2) return AVERROR_INVALIDDATA; @@ -134,7 +136,7 @@ static int rtp_parse_mp4_au(PayloadContext *data, const uint8_t *buf, int len) if (len < data->au_headers_length_bytes) return AVERROR_INVALIDDATA; -init_get_bits(&getbitcontext, buf, data->au_headers_length_bytes * 8); +bitstream_init(&bctx, buf, data->au_headers_length_bytes * 8); /* XXX: Wrong if optional additional sections are present (cts, dts etc...) */ au_header_size = data->sizelength + data->indexlength; @@ -151,8 +153,8 @@ static int rtp_parse_mp4_au(PayloadContext *data, const uint8_t *buf, int len) } for (i = 0; i < data->nb_au_headers; ++i) { -data->au_headers[i].size = get_bits_long(&getbitcontext, data->sizelength); -data->au_headers[i].index = get_bits_long(&getbitcontext, data->indexlength); +data->au_headers[i].size = bitstream_read(&bctx, data->sizelength); +data->au_headers[i].index = bitstream_read(&bctx, data->indexlength); Not really a problem introduced by this patch, but those two fields don't seem to be bound-checked anywhere. The values are parsed in parse_fmtp (with an AVOption like mechanism), where they are limited to a maximum of 32. // Martin ___ libav-devel mailing list libav-devel@libav.org https://lists.libav.org/mailman/listinfo/libav-devel
[libav-devel] [PATCH 3/6] arm: vp9lpf: Use orrs instead of orr+cmp
--- libavcodec/arm/vp9lpf_neon.S | 12 1 file changed, 4 insertions(+), 8 deletions(-) diff --git a/libavcodec/arm/vp9lpf_neon.S b/libavcodec/arm/vp9lpf_neon.S index 5e154f6..9be4cef 100644 --- a/libavcodec/arm/vp9lpf_neon.S +++ b/libavcodec/arm/vp9lpf_neon.S @@ -77,8 +77,7 @@ vdup.u8 d3, r3 @ H vmovr2, r3, d4 -orr r2, r2, r3 -cmp r2, #0 +orrsr2, r2, r3 @ If no pixels need filtering, just exit as soon as possible beq 9f @@ -191,8 +190,7 @@ .if \wd >= 8 vmovr2, r3, d6 -orr r2, r2, r3 -cmp r2, #0 +orrsr2, r2, r3 @ If no pixels need flat8in, jump to flat8out @ (or to a writeout of the inner 4 pixels, for wd=8) beq 6f @@ -247,14 +245,12 @@ 6: vorrd2, d6, d7 vmovr2, r3, d2 -orr r2, r2, r3 -cmp r2, #0 +orrsr2, r2, r3 @ If no pixels needed flat8in nor flat8out, jump to a @ writeout of the inner 4 pixels beq 7f vmovr2, r3, d7 -orr r2, r2, r3 -cmp r2, #0 +orrsr2, r2, r3 @ If no pixels need flat8out, jump to a writeout of the inner 6 pixels beq 8f -- 2.7.4 ___ libav-devel mailing list libav-devel@libav.org https://lists.libav.org/mailman/listinfo/libav-devel
[libav-devel] [PATCH 4/6] arm: vp9lpf: Interleave the start of flat8in into the calculation above
This adds lots of extra .ifs, but speeds it up by a couple cycles, by avoiding stalls. --- libavcodec/arm/vp9lpf_neon.S | 8 ++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/libavcodec/arm/vp9lpf_neon.S b/libavcodec/arm/vp9lpf_neon.S index 9be4cef..e31c807 100644 --- a/libavcodec/arm/vp9lpf_neon.S +++ b/libavcodec/arm/vp9lpf_neon.S @@ -181,16 +181,20 @@ vmovl.u8q0, d22@ p1 vmovl.u8q1, d25@ q1 +.if \wd >= 8 +vmovr2, r3, d6 +.endif vaddw.s8q0, q0, \tmp3 @ p1 + f vsubw.s8q1, q1, \tmp3 @ q1 - f +.if \wd >= 8 +orrsr2, r2, r3 +.endif vqmovun.s16 d0, q0 @ out p1 vqmovun.s16 d2, q1 @ out q1 vbitd22, d0, d5@ if (!hev && fm && !flat8in) vbitd25, d2, d5 .if \wd >= 8 -vmovr2, r3, d6 -orrsr2, r2, r3 @ If no pixels need flat8in, jump to flat8out @ (or to a writeout of the inner 4 pixels, for wd=8) beq 6f -- 2.7.4 ___ libav-devel mailing list libav-devel@libav.org https://lists.libav.org/mailman/listinfo/libav-devel
[libav-devel] [PATCH 5/6] aarch64: vp9lpf: Interleave the start of flat8in into the calculation above
--- libavcodec/aarch64/vp9lpf_neon.S | 16 +--- 1 file changed, 13 insertions(+), 3 deletions(-) diff --git a/libavcodec/aarch64/vp9lpf_neon.S b/libavcodec/aarch64/vp9lpf_neon.S index 4553173..3894307 100644 --- a/libavcodec/aarch64/vp9lpf_neon.S +++ b/libavcodec/aarch64/vp9lpf_neon.S @@ -316,20 +316,30 @@ uxtl_sz v0.8h, v1.8h, v22, \sz// p1 uxtl_sz v2.8h, v3.8h, v25, \sz// q1 +.if \wd >= 8 +mov x5, v6.d[0] +.endif saddw_szv0.8h, v1.8h, v0.8h, v1.8h, \tmp3, \sz // p1 + f ssubw_szv2.8h, v3.8h, v2.8h, v3.8h, \tmp3, \sz // q1 - f +.if \wd >= 8 +.ifc \sz, .16b +mov x6, v6.d[1] +.endif +.endif sqxtun_sz v0, v0.8h, v1.8h, \sz // out p1 sqxtun_sz v2, v2.8h, v3.8h, \sz // out q1 +.if \wd >= 8 +.ifc \sz, .16b +addsx5, x5, x6 +.endif +.endif bit v22\sz, v0\sz, v5\sz // if (!hev && fm && !flat8in) bit v25\sz, v2\sz, v5\sz // If no pixels need flat8in, jump to flat8out // (or to a writeout of the inner 4 pixels, for wd=8) .if \wd >= 8 -mov x5, v6.d[0] .ifc \sz, .16b -mov x6, v6.d[1] -addsx5, x5, x6 b.eq6f .else cbz x5, 6f -- 2.7.4 ___ libav-devel mailing list libav-devel@libav.org https://lists.libav.org/mailman/listinfo/libav-devel
[libav-devel] [PATCH 2/6] arm/aarch64: vp9lpf: Keep the comparison to E within 8 bit
The theoretical maximum value of E is 193, so we can just saturate the addition to 255. Before: Cortex A7 A8 A9 A53 A53/AArch64 vp9_loop_filter_v_4_8_neon: 143.0 127.7 114.888.0 87.7 vp9_loop_filter_v_8_8_neon: 241.0 197.2 173.7 140.0136.7 vp9_loop_filter_v_16_8_neon:497.0 419.5 379.7 293.0275.7 vp9_loop_filter_v_16_16_neon: 965.2 818.7 731.4 579.0452.0 After: vp9_loop_filter_v_4_8_neon: 136.0 125.7 112.684.0 83.0 vp9_loop_filter_v_8_8_neon: 234.0 195.5 171.5 136.0133.7 vp9_loop_filter_v_16_8_neon:490.0 417.5 377.7 289.0271.0 vp9_loop_filter_v_16_16_neon: 951.2 814.7 732.3 571.0446.7 --- libavcodec/aarch64/vp9lpf_neon.S | 40 +--- libavcodec/arm/vp9lpf_neon.S | 11 +-- 2 files changed, 14 insertions(+), 37 deletions(-) diff --git a/libavcodec/aarch64/vp9lpf_neon.S b/libavcodec/aarch64/vp9lpf_neon.S index 3b8e6eb..4553173 100644 --- a/libavcodec/aarch64/vp9lpf_neon.S +++ b/libavcodec/aarch64/vp9lpf_neon.S @@ -51,13 +51,6 @@ // see the arm version instead. -.macro uabdl_sz dst1, dst2, in1, in2, sz -uabdl \dst1, \in1\().8b, \in2\().8b -.ifc \sz, .16b -uabdl2 \dst2, \in1\().16b, \in2\().16b -.endif -.endm - .macro add_sz dst1, dst2, in1, in2, in3, in4, sz add \dst1, \in1, \in3 .ifc \sz, .16b @@ -86,20 +79,6 @@ .endif .endm -.macro cmhs_sz dst1, dst2, in1, in2, in3, in4, sz -cmhs\dst1, \in1, \in3 -.ifc \sz, .16b -cmhs\dst2, \in2, \in4 -.endif -.endm - -.macro xtn_sz dst, in1, in2, sz -xtn \dst\().8b, \in1 -.ifc \sz, .16b -xtn2\dst\().16b, \in2 -.endif -.endm - .macro usubl_sz dst1, dst2, in1, in2, sz usubl \dst1, \in1\().8b, \in2\().8b .ifc \sz, .16b @@ -179,20 +158,20 @@ // tmpq2 == tmp3 + tmp4, etc. .macro loop_filter wd, sz, mix, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, tmp8 .if \mix == 0 -dup v0.8h, w2// E -dup v1.8h, w2// E +dup v0\sz, w2// E dup v2\sz, w3// I dup v3\sz, w4// H .else -dup v0.8h, w2// E +dup v0.8b, w2// E dup v2.8b, w3// I dup v3.8b, w4// H +lsr w5, w2, #8 lsr w6, w3, #8 lsr w7, w4, #8 -ushrv1.8h, v0.8h, #8 // E +dup v1.8b, w5// E dup v4.8b, w6// I -bic v0.8h, #255, lsl 8 // E dup v5.8b, w7// H +trn1v0.2d, v0.2d, v1.2d trn1v2.2d, v2.2d, v4.2d trn1v3.2d, v3.2d, v5.2d .endif @@ -206,16 +185,15 @@ umaxv4\sz, v4\sz, v5\sz umaxv5\sz, v6\sz, v7\sz umax\tmp1\sz, \tmp1\sz, \tmp2\sz -uabdl_szv6.8h, v7.8h, v23, v24, \sz // abs(p0 - q0) +uabdv6\sz, v23\sz, v24\sz// abs(p0 - q0) umaxv4\sz, v4\sz, v5\sz -add_sz v6.8h, v7.8h, v6.8h, v7.8h, v6.8h, v7.8h, \sz // abs(p0 - q0) * 2 +uqadd v6\sz, v6\sz, v6\sz // abs(p0 - q0) * 2 uabdv5\sz, v22\sz, v25\sz// abs(p1 - q1) umaxv4\sz, v4\sz, \tmp1\sz // max(abs(p3 - p2), ..., abs(q2 - q3)) ushrv5\sz, v5\sz, #1 cmhsv4\sz, v2\sz, v4\sz // max(abs()) <= I -uaddw_szv6.8h, v7.8h, v6.8h, v7.8h, v5, \sz // abs(p0 - q0) * 2 + abs(p1 - q1) >> 1 -cmhs_sz v6.8h, v7.8h, v0.8h, v1.8h, v6.8h, v7.8h, \sz -xtn_sz v5, v6.8h, v7.8h, \sz +uqadd v6\sz, v6\sz, v5\sz // abs(p0 - q0) * 2 + abs(p1 - q1) >> 1 +cmhsv5\sz, v0\sz, v6\sz and v4\sz, v4\sz, v5\sz // fm // If no pixels need filtering, just exit as soon as possible diff --git a/libavcodec/arm/vp9lpf_neon.S b/libavcodec/arm/vp9lpf_neon.S index c57c0e9..5e154f6 100644 --- a/libavcodec/arm/vp9lpf_neon.S +++ b/libavcodec/arm/vp9lpf_neon.S @@ -51,7 +51,7 @@ @ and d28-d31 as temp registers, or d8-d15. @ tmp1,tmp2 = tmpq1, tmp3,tmp4 = tmpq2, tmp5,tmp6 = tmpq3, tmp7,tmp8 = tmpq4 .macro loop_filter wd, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, tmp8, tmpq1, tmpq2, tmpq3, tmpq4 -vdup.u16q0, r2 @ E +vdup.u8 d0, r2 @ E vdup.u8 d2, r3 @ I ldr r3, [sp] @@ -64,16 +64,15 @@
[libav-devel] [PATCH 1/6] arm/aarch64: vp9lpf: Calculate !hev directly
Previously we first calculated hev, and then negated it. Since we were able to schedule the negation in the middle of another calculation, we don't see any gain in all cases. Before: Cortex A7 A8 A9 A53 A53/AArch64 vp9_loop_filter_v_4_8_neon: 147.0 129.0 115.889.0 88.7 vp9_loop_filter_v_8_8_neon: 242.0 198.5 174.7 140.0136.7 vp9_loop_filter_v_16_8_neon:500.0 419.5 382.7 293.0275.7 vp9_loop_filter_v_16_16_neon: 971.2 825.5 731.5 579.0453.0 After: vp9_loop_filter_v_4_8_neon: 143.0 127.7 114.888.0 87.7 vp9_loop_filter_v_8_8_neon: 241.0 197.2 173.7 140.0136.7 vp9_loop_filter_v_16_8_neon:497.0 419.5 379.7 293.0275.7 vp9_loop_filter_v_16_16_neon: 965.2 818.7 731.4 579.0452.0 --- libavcodec/aarch64/vp9lpf_neon.S | 5 ++--- libavcodec/arm/vp9lpf_neon.S | 5 ++--- 2 files changed, 4 insertions(+), 6 deletions(-) diff --git a/libavcodec/aarch64/vp9lpf_neon.S b/libavcodec/aarch64/vp9lpf_neon.S index e9c7d9e..3b8e6eb 100644 --- a/libavcodec/aarch64/vp9lpf_neon.S +++ b/libavcodec/aarch64/vp9lpf_neon.S @@ -292,7 +292,7 @@ .if \mix != 0 sxtlv1.8h, v1.8b .endif -cmhiv5\sz, v5\sz, v3\sz // hev +cmhsv5\sz, v3\sz, v5\sz // !hev .if \wd == 8 // If a 4/8 or 8/4 mix is used, clear the relevant half of v6 .if \mix != 0 @@ -306,11 +306,10 @@ .elseif \wd == 8 bic v4\sz, v4\sz, v6\sz // fm && !flat8in .endif -mvn v5\sz, v5\sz // !hev +and v5\sz, v5\sz, v4\sz // !hev && fm && !flat8in .if \wd == 16 and v7\sz, v7\sz, v6\sz // flat8out && flat8in && fm .endif -and v5\sz, v5\sz, v4\sz // !hev && fm && !flat8in mul_sz \tmp3\().8h, \tmp4\().8h, \tmp3\().8h, \tmp4\().8h, \tmp5\().8h, \tmp5\().8h, \sz // 3 * (q0 - p0) bic \tmp1\sz, \tmp1\sz, v5\sz// if (!hev) av_clip_int8 = 0 diff --git a/libavcodec/arm/vp9lpf_neon.S b/libavcodec/arm/vp9lpf_neon.S index fbf2901..c57c0e9 100644 --- a/libavcodec/arm/vp9lpf_neon.S +++ b/libavcodec/arm/vp9lpf_neon.S @@ -141,7 +141,7 @@ .if \wd == 8 vcle.u8 d6, d6, d0@ flat8in .endif -vcgt.u8 d5, d5, d3@ hev +vcle.u8 d5, d5, d3@ !hev .if \wd == 8 vandd6, d6, d4@ flat8in && fm .endif @@ -151,11 +151,10 @@ .elseif \wd == 8 vbicd4, d4, d6@ fm && !flat8in .endif -vmvnd5, d5 @ !hev +vandd5, d5, d4@ !hev && fm && !flat8in .if \wd == 16 vandd7, d7, d6@ flat8out && flat8in && fm .endif -vandd5, d5, d4@ !hev && fm && !flat8in vmul.s16\tmpq2, \tmpq2, \tmpq3 @ 3 * (q0 - p0) vbic\tmp1, \tmp1, d5@ if (!hev) av_clip_int8 = 0 -- 2.7.4 ___ libav-devel mailing list libav-devel@libav.org https://lists.libav.org/mailman/listinfo/libav-devel
[libav-devel] [PATCH 6/6] arm: vp9lpf: Implement the mix2_44 function with one single filter pass
For this case, with 8 inputs but only changing 4 of them, we can fit all 16 input pixels into a q register, and still have enough temporary registers for doing the loop filter. The wd=8 filters would require too many temporary registers for processing all 16 pixels at once though. Before: Cortex A7 A8 A9 A53 vp9_loop_filter_mix2_v_44_16_neon: 289.7 256.2 237.5 181.2 After: vp9_loop_filter_mix2_v_44_16_neon: 221.2 150.5 177.7 138.0 --- libavcodec/arm/vp9dsp_init_arm.c | 7 +- libavcodec/arm/vp9lpf_neon.S | 191 +++ 2 files changed, 195 insertions(+), 3 deletions(-) diff --git a/libavcodec/arm/vp9dsp_init_arm.c b/libavcodec/arm/vp9dsp_init_arm.c index e99d931..1ede170 100644 --- a/libavcodec/arm/vp9dsp_init_arm.c +++ b/libavcodec/arm/vp9dsp_init_arm.c @@ -194,6 +194,8 @@ define_loop_filters(8, 8); define_loop_filters(16, 8); define_loop_filters(16, 16); +define_loop_filters(44, 16); + #define lf_mix_fn(dir, wd1, wd2, stridea) \ static void loop_filter_##dir##_##wd1##wd2##_16_neon(uint8_t *dst, \ ptrdiff_t stride, \ @@ -207,7 +209,6 @@ static void loop_filter_##dir##_##wd1##wd2##_16_neon(uint8_t *dst, lf_mix_fn(h, wd1, wd2, stride) \ lf_mix_fn(v, wd1, wd2, sizeof(uint8_t)) -lf_mix_fns(4, 4) lf_mix_fns(4, 8) lf_mix_fns(8, 4) lf_mix_fns(8, 8) @@ -227,8 +228,8 @@ static av_cold void vp9dsp_loopfilter_init_arm(VP9DSPContext *dsp) dsp->loop_filter_16[0] = ff_vp9_loop_filter_h_16_16_neon; dsp->loop_filter_16[1] = ff_vp9_loop_filter_v_16_16_neon; -dsp->loop_filter_mix2[0][0][0] = loop_filter_h_44_16_neon; -dsp->loop_filter_mix2[0][0][1] = loop_filter_v_44_16_neon; +dsp->loop_filter_mix2[0][0][0] = ff_vp9_loop_filter_h_44_16_neon; +dsp->loop_filter_mix2[0][0][1] = ff_vp9_loop_filter_v_44_16_neon; dsp->loop_filter_mix2[0][1][0] = loop_filter_h_48_16_neon; dsp->loop_filter_mix2[0][1][1] = loop_filter_v_48_16_neon; dsp->loop_filter_mix2[1][0][0] = loop_filter_h_84_16_neon; diff --git a/libavcodec/arm/vp9lpf_neon.S b/libavcodec/arm/vp9lpf_neon.S index e31c807..12984a9 100644 --- a/libavcodec/arm/vp9lpf_neon.S +++ b/libavcodec/arm/vp9lpf_neon.S @@ -44,6 +44,109 @@ vtrn.8 \r2, \r3 .endm +@ The input to and output from this macro is in the registers q8-q15, +@ and q0-q7 are used as scratch registers. +@ p3 = q8, p0 = q11, q0 = q12, q3 = q15 +.macro loop_filter_q +vdup.u8 d0, r2 @ E +lsr r2, r2, #8 +vdup.u8 d2, r3 @ I +lsr r3, r3, #8 +vdup.u8 d1, r2 @ E +vdup.u8 d3, r3 @ I + +vabd.u8 q2, q8, q9 @ abs(p3 - p2) +vabd.u8 q3, q9, q10@ abs(p2 - p1) +vabd.u8 q4, q10, q11@ abs(p1 - p0) +vabd.u8 q5, q12, q13@ abs(q0 - q1) +vabd.u8 q6, q13, q14@ abs(q1 - q2) +vabd.u8 q7, q14, q15@ abs(q2 - q3) +vmax.u8 q2, q2, q3 +vmax.u8 q3, q4, q5 +vmax.u8 q4, q6, q7 +vabd.u8 q5, q11, q12@ abs(p0 - q0) +vmax.u8 q2, q2, q3 +vqadd.u8q5, q5, q5 @ abs(p0 - q0) * 2 +vabd.u8 q7, q10, q13@ abs(p1 - q1) +vmax.u8 q2, q2, q4 @ max(abs(p3 - p2), ..., abs(q2 - q3)) +vshr.u8 q7, q7, #1 +vcle.u8 q2, q2, q1 @ max(abs()) <= I +vqadd.u8q5, q5, q7 @ abs(p0 - q0) * 2 + abs(p1 - q1) >> 1 +vcle.u8 q5, q5, q0 +vandq2, q2, q5 @ fm + +vshrn.u16 d10, q2, #4 +vmovr2, r3, d10 +orrsr2, r2, r3 +@ If no pixels need filtering, just exit as soon as possible +beq 9f + +@ Calculate the normal inner loop filter for 2 or 4 pixels +ldr r3, [sp, #64] +vabd.u8 q3, q10, q11@ abs(p1 - p0) +vabd.u8 q4, q13, q12@ abs(q1 - q0) + +vsubl.u8q5, d20, d26@ p1 - q1 +vsubl.u8q6, d21, d27@ p1 - q1 +vmax.u8 q3, q3, q4 @ max(abs(p1 - p0), abs(q1 - q0)) +vqmovn.s16 d10, q5 @ av_clip_int8p(p1 - q1) +vqmovn.s16 d11, q6 @ av_clip_int8p(p1 - q1) +vdup.u8 d8, r3 @ H +lsr r3, r3, #8 +vdup.u8 d9, r3 @ H +vsubl.u8q6, d24, d22@ q0 - p0 +vsubl.u8q7, d25, d23@ q0 - p0 +vcle.u8 q3, q3, q4 @ hev +
[libav-devel] [PATCH] rtmpproto: Make bytes_read variables 64bit.
From: Carl Eugen Hoyos When bytes_read overflowed, last_bytes_read did not yet overflow and no bytes-read report was created leading to a timeout. Analyzed-by: Thomas Bernhard --- libavformat/rtmpproto.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/libavformat/rtmpproto.c b/libavformat/rtmpproto.c index 5298c18..09d22eb 100644 --- a/libavformat/rtmpproto.c +++ b/libavformat/rtmpproto.c @@ -94,8 +94,8 @@ typedef struct RTMPContext { int flv_nb_packets; ///< number of flv packets published RTMPPacketout_pkt;///< rtmp packet, created from flv a/v or metadata (for output) uint32_t client_report_size; ///< number of bytes after which client should report to server -uint32_t bytes_read; ///< number of bytes read from server -uint32_t last_bytes_read;///< number of bytes read last reported to server +uint64_t bytes_read; ///< number of bytes read from server +uint64_t last_bytes_read;///< number of bytes read last reported to server uint32_t last_timestamp; ///< last timestamp received in a packet int skip_bytes; ///< number of bytes to skip from the input FLV stream in the next write call int has_audio; ///< presence of audio data -- 2.10.1 (Apple Git-78) ___ libav-devel mailing list libav-devel@libav.org https://lists.libav.org/mailman/listinfo/libav-devel
Re: [libav-devel] [PATCH] rtmpproto: Make bytes_read variables 64bit.
On Wed, 25 Jan 2017, Luca Barbato wrote: On 25/01/2017 09:29, Martin Storsjö wrote: From: Carl Eugen Hoyos When bytes_read overflowed, last_bytes_read did not yet overflow and no bytes-read report was created leading to a timeout. Analyzed-by: Thomas Bernhard --- libavformat/rtmpproto.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) Not against it, but theoretically wouldn't the problem present itself unchanged once you shove enough data in? (yes would take quite a bit of time given the current network fabrics available). Theoretically yes, but I'm quite ok with a solution that breaks after sending/receiving 2^64 bytes. // Martin ___ libav-devel mailing list libav-devel@libav.org https://lists.libav.org/mailman/listinfo/libav-devel
[libav-devel] [PATCH] configure: Add quotes around a variable which might be empty
If we only have a target compiler but no host compiler, the $type variable will be empty once. (Currently we fail to do a cross build if no host compiler is available due to using the host compiler for processing option lists though. But despite that, this comparison in configure needs quotes.) --- configure | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/configure b/configure index f186753..7570c0a 100755 --- a/configure +++ b/configure @@ -5033,7 +5033,7 @@ fi for pfx in "" host_; do varname=${pfx%_}cc_type eval "type=\$$varname" -if [ $type = "msvc" ]; then +if [ "$type" = "msvc" ]; then check_${pfx}cc
Re: [libav-devel] [PATCH] build: Detect blocks C language extension and add it as VDA dependency
On Wed, 25 Jan 2017, Diego Biurrun wrote: Newer versions of OS X use the blocks extension in VDA-related headers. --- configure | 7 ++- 1 file changed, 6 insertions(+), 1 deletion(-) It might be useful for a reader to include a comment on the indented usecase. I.e. if using a third party compiler such as gcc, these frameworks have to be avoided. // Martin ___ libav-devel mailing list libav-devel@libav.org https://lists.libav.org/mailman/listinfo/libav-devel
Re: [libav-devel] [PATCH] rtmp: Account bytes_read may wrap around
On Wed, 25 Jan 2017, Luca Barbato wrote: The servers seem to be happy to receive the wrapped around value as long they receive a report, otherwise they would timeout. Initially reported and analyzed by Thomas Bernhard. --- Here the alternative patch discussed on IRC. libavformat/rtmpproto.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/libavformat/rtmpproto.c b/libavformat/rtmpproto.c index 5298c18..49a40dd 100644 --- a/libavformat/rtmpproto.c +++ b/libavformat/rtmpproto.c @@ -2416,7 +2416,7 @@ static int get_packet(URLContext *s, int for_header) rt->last_timestamp = rpkt.timestamp; rt->bytes_read += ret; -if (rt->bytes_read > rt->last_bytes_read + rt->client_report_size) { +if (rt->bytes_read - rt->last_bytes_read > rt->client_report_size) { av_log(s, AV_LOG_DEBUG, "Sending bytes read report\n"); if ((ret = gen_bytes_read(s, rt, rpkt.timestamp + 1)) < 0) return ret; -- 2.9.2 Seems ok to me, although I'm not able to test the issue easily. // Martin ___ libav-devel mailing list libav-devel@libav.org https://lists.libav.org/mailman/listinfo/libav-devel
Re: [libav-devel] [PATCH 01/12] dashenc: fix ISO8601 UTC parsing
On Fri, 27 Jan 2017, Peter Große wrote: From: Anton Schubert Appends Z to timestamp to force ISO8601 datetime parsing as UTC. Without Z, some browsers (Chrome) interpret the timestamp as localtime and others (Firefox) interpret it as UTC. Signed-off-by: Anton Schubert --- libavformat/dashenc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/libavformat/dashenc.c b/libavformat/dashenc.c index ce01860..2b27950 100644 --- a/libavformat/dashenc.c +++ b/libavformat/dashenc.c @@ -429,7 +429,7 @@ static void format_date_now(char *buf, int size) struct tm *ptm, tmbuf; ptm = gmtime_r(&t, &tmbuf); if (ptm) { -if (!strftime(buf, size, "%Y-%m-%dT%H:%M:%S", ptm)) +if (!strftime(buf, size, "%Y-%m-%dT%H:%M:%SZ", ptm)) buf[0] = '\0'; } } -- 2.10.2 LGTM, pushed // Martin ___ libav-devel mailing list libav-devel@libav.org https://lists.libav.org/mailman/listinfo/libav-devel
Re: [libav-devel] [PATCH 02/12] dashenc: add mandatory id to AdaptationSet and Period in manifest
On Fri, 27 Jan 2017, Peter Große wrote: From: Peter Große Signed-off-by: Peter Große --- libavformat/dashenc.c | 10 +- 1 file changed, 5 insertions(+), 5 deletions(-) LGTM, will push // Martin ___ libav-devel mailing list libav-devel@libav.org https://lists.libav.org/mailman/listinfo/libav-devel
Re: [libav-devel] [PATCH 03/12] dashenc: increase buffer time hint in the manifest
On Fri, 27 Jan 2017, Peter Große wrote: From: Anton Schubert to avoid rebuffering on the clientside for difficult network conditions. Signed-off-by: Anton Schubert --- libavformat/dashenc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/libavformat/dashenc.c b/libavformat/dashenc.c index 865f50a..44785cd 100644 --- a/libavformat/dashenc.c +++ b/libavformat/dashenc.c @@ -481,7 +481,7 @@ static int write_manifest(AVFormatContext *s, int final) } } avio_printf(out, "\tminBufferTime=\""); -write_time(out, c->last_duration); +write_time(out, c->last_duration * 2); avio_printf(out, "\">\n"); avio_printf(out, "\t\n"); if (title) { -- 2.10.2 I guess this is ok. I tried to keep the buffer amount low to allow as reasonably low latency as possible in practice, but I guess this is a safer compromise. // Martin ___ libav-devel mailing list libav-devel@libav.org https://lists.libav.org/mailman/listinfo/libav-devel
Re: [libav-devel] [PATCH 04/12] dashenc: add option to provide UTC timing source
On Fri, 27 Jan 2017, Peter Große wrote: From: Peter Große If set, adds a UTCTime tag in the manifest. UTCTiming, not UTCTime, right? See http://vm2.dashif.org/dash.js/docs/jsdocs/MediaPlayer.html#addUTCTimingSource for more information. Usable default: "https://time.akamai.com/?iso"; Hmm, I don't see this documented in ISO/IEC 23009-1:2014 at least, and the documentation from dashif don't seem to define it either. I won't mind including it, but I'd like to know what standard defines it before merging. // Martin ___ libav-devel mailing list libav-devel@libav.org https://lists.libav.org/mailman/listinfo/libav-devel
Re: [libav-devel] [PATCH 05/12] dashenc: separate segments based on current segment duration
On Fri, 27 Jan 2017, Peter Große wrote: The current implementation creates new segments comparing pkt->pts - first_pts > total_duration I guess this would be more understandable written like this: pkt->pts - first_pts > nb_segs * min_seg_duration or total_duration > nb_segs * min_seg_duration This works fine, but if the keyframe interval is smaller than "min_seg_duration" segments shorter than the minimum segment duration are created. Example: keyint=50, min_seg_duration=300 segment 1 contains keyframe 1 (duration=2s < total_duration=3s) and keyframe 2 (duration=4s >= total_duration=3s) segment 2 contains keyframe 3 (duration=6s >= total_duration=6s) segment 3 contains keyframe 4 (duration=8s < total_duration=9s) and keyframe 5 (duration=10s >= total_duration=9s) ... Segment 2 is only 2s long, shorter than min_seg_duration = 3s. FWIW, this was the indended behaviour originally, but I agree that the option name isn't quite optimal given this. To fix this, new segments are created based on the actual written duration. Otherwise the option name "min_seg_duration" is misleading. Signed-off-by: Peter Große --- libavformat/dashenc.c | 17 - 1 file changed, 8 insertions(+), 9 deletions(-) diff --git a/libavformat/dashenc.c b/libavformat/dashenc.c index 198932c..93c292f 100644 --- a/libavformat/dashenc.c +++ b/libavformat/dashenc.c @@ -62,7 +62,7 @@ typedef struct OutputStream { int ctx_inited; uint8_t iobuf[32768]; AVIOContext *out; -int packets_written; +int duration_written; char initfile[1024]; int64_t init_start_pos; int init_range_length; @@ -785,7 +785,7 @@ static int dash_flush(AVFormatContext *s, int final, int stream) int64_t start_pos; int range_length, index_length = 0; -if (!os->packets_written) +if (!os->duration_written) continue; // Flush the single stream that got a keyframe right now. @@ -823,7 +823,7 @@ static int dash_flush(AVFormatContext *s, int final, int stream) av_write_frame(os->ctx, NULL); avio_flush(os->ctx->pb); -os->packets_written = 0; +os->duration_written = 0; range_length = avio_tell(os->ctx->pb) - start_pos; if (c->single_file) { @@ -868,7 +868,6 @@ static int dash_write_packet(AVFormatContext *s, AVPacket *pkt) DASHContext *c = s->priv_data; AVStream *st = s->streams[pkt->stream_index]; OutputStream *os = &c->streams[pkt->stream_index]; -int64_t seg_end_duration = (os->segment_index) * (int64_t) c->min_seg_duration; int ret; ret = update_stream_extradata(s, os, st->codecpar); @@ -897,9 +896,9 @@ static int dash_write_packet(AVFormatContext *s, AVPacket *pkt) os->first_pts = pkt->pts; if ((!c->has_video || st->codecpar->codec_type == AVMEDIA_TYPE_VIDEO) && -pkt->flags & AV_PKT_FLAG_KEY && os->packets_written && -av_compare_ts(pkt->pts - os->first_pts, st->time_base, - seg_end_duration, AV_TIME_BASE_Q) >= 0) { +pkt->flags & AV_PKT_FLAG_KEY && os->duration_written && +av_compare_ts(os->duration_written + pkt->duration, st->time_base, + c->min_seg_duration, AV_TIME_BASE_Q) >= 0) { int64_t prev_duration = c->last_duration; c->last_duration = av_rescale_q(pkt->pts - os->start_pts, @@ -922,7 +921,7 @@ static int dash_write_packet(AVFormatContext *s, AVPacket *pkt) return ret; } -if (!os->packets_written) { +if (!os->duration_written) { // If we wrote a previous segment, adjust the start time of the segment // to the end of the previous one (which is the same as the mp4 muxer // does). This avoids gaps in the timeline. @@ -935,7 +934,7 @@ static int dash_write_packet(AVFormatContext *s, AVPacket *pkt) os->max_pts = pkt->pts + pkt->duration; else os->max_pts = FFMAX(os->max_pts, pkt->pts + pkt->duration); -os->packets_written++; +os->duration_written += pkt->duration; return ff_write_chained(os->ctx, 0, pkt, s); } This isn't the best way of achieving what you're trying to do. Don't accumulate pkt->duration values. In general we can't really assume that pkt->duration is set sensibly. In some places we need an estimate of the duration of a packet, and in those cases we fill it in using the duration of the previous packet (i.e. the diff between this packets timestamp and the previous one) if we don't have a definitive value available. In those cases where we use an estimate of the duration, we make sure to correct it as soon as possible when we get the real timestamps of the next packet. But we shouldn't accumulate the values since that would end up with accumulating the error, blowing it out of propertion at worst. To get the behaviour you want, couldn't you just change the comparison like this: -av_compare_ts(pkt->pts - os->first_pts, st->
Re: [libav-devel] [PATCH 06/12] dashenc: support for hinting stream bandwidth using metadata option
On Fri, 27 Jan 2017, Peter Große wrote: From: Peter Große Bandwidth information is required in the manifest, but not always provided by the demuxer. So enable hinting the stream bandwidth via a metadata field, supports same values as codec bitrate setting. Example: -metadata:s:v:0 bitrate=3500k Signed-off-by: Peter Große --- libavformat/dashenc.c | 12 1 file changed, 12 insertions(+) This is kinda abusing the metadata mechanism, which is mostly intended to be used for user-visible metadata like author/title and such. I'd like to ask for Anton's opinion on this (although I'm pretty sure we've discussed it before). // Martin ___ libav-devel mailing list libav-devel@libav.org https://lists.libav.org/mailman/listinfo/libav-devel
Re: [libav-devel] [PATCH 07/12] dashenc: calculate stream bitrate from first segment if not available
On Fri, 27 Jan 2017, Peter Große wrote: Bandwidth information is required in the manifest, but not always provided by the demuxer. If not available via metadata calculate bandwith based on the size and duration of the first segment. Signed-off-by: Peter Große --- libavformat/dashenc.c | 11 +++ 1 file changed, 11 insertions(+) diff --git a/libavformat/dashenc.c b/libavformat/dashenc.c index 1cd9563..1b12d4f 100644 --- a/libavformat/dashenc.c +++ b/libavformat/dashenc.c @@ -846,6 +846,17 @@ static int dash_flush(AVFormatContext *s, int final, int stream) if (ret < 0) break; } + +if (!os->bit_rate) { +// calculate average bitrate of first segment +double bitrate = (int)( (double) range_length * 8.0 * AV_TIME_BASE / (double)(os->max_pts - os->start_pts) ); +if (bitrate >= 0 && bitrate <= INT64_MAX) { +os->bit_rate = bitrate; +snprintf(os->bandwidth_str, sizeof(os->bandwidth_str), + " bandwidth=\"%d\"", os->bit_rate); +} +} + add_segment(os, filename, os->start_pts, os->max_pts - os->start_pts, start_pos, range_length, index_length); av_log(s, AV_LOG_VERBOSE, "Representation %d media segment %d written to: %s\n", i, os->segment_index, full_path); } -- 2.10.2 This probably is fine with me. We can have $Bandwidth$ as part of the segment filename template - I guess this is too late to get it right for those cases? Although I don't mind if we don't support that case when we don't have a nominal bitrate set and try to use it in the template. // Martin ___ libav-devel mailing list libav-devel@libav.org https://lists.libav.org/mailman/listinfo/libav-devel
Re: [libav-devel] [PATCH 08/12] dashenc: add support for assigning streams to AdaptationSets
On Fri, 27 Jan 2017, Peter Große wrote: This patch is based on the stream assignment code in webmdashenc. This muxer isn't available in libav, so the reference doesn't really work in context here. Additional changes: * Default to one AdaptationSet per stream Previously all mapped streams of a media type (video, audio) where assigned to a single AdaptationSet. Using the DASH live profile it is mandatory, that the segments of all representations are aligned, which is currently not enforced. This leads to problems when using video streams with different key frame intervals. So to play safe, default to one AdaptationSet per stream, unless overwritten by explicit assignment Is there any way to easily get it back to the original behaviour, e.g. tell it to group all audio tracks together and all video tracks together, without having to manually specify the stream numbers? * Make sure all streams are assigned to exactly one AdaptationSet Signed-off-by: Peter Große --- libavformat/dashenc.c | 193 ++ 1 file changed, 162 insertions(+), 31 deletions(-) diff --git a/libavformat/dashenc.c b/libavformat/dashenc.c index 1b12d4f..c561ad1 100644 --- a/libavformat/dashenc.c +++ b/libavformat/dashenc.c @@ -24,6 +24,7 @@ #include #endif +#include "libavutil/avutil.h" #include "libavutil/avstring.h" #include "libavutil/eval.h" #include "libavutil/intreadwrite.h" @@ -58,9 +59,15 @@ typedef struct Segment { int n; } Segment; +typedef struct AdaptationSet { +char id[10]; +enum AVMediaType media_type; +AVDictionary *metadata; +} AdaptationSet; + typedef struct OutputStream { AVFormatContext *ctx; -int ctx_inited; +int ctx_inited, as_idx; uint8_t iobuf[32768]; AVIOContext *out; int duration_written; @@ -79,6 +86,9 @@ typedef struct OutputStream { typedef struct DASHContext { const AVClass *class; /* Class for private options. */ +char *adaptation_sets; +AdaptationSet *as; +int nb_as; int window_size; int extra_window_size; int min_seg_duration; @@ -87,7 +97,7 @@ typedef struct DASHContext { int use_timeline; int single_file; OutputStream *streams; -int has_video, has_audio; +int has_video; int64_t last_duration; int64_t total_duration; char availability_start_time[100]; @@ -176,6 +186,16 @@ static void dash_free(AVFormatContext *s) { DASHContext *c = s->priv_data; int i, j; + +if (c->as) { +for (i = 0; i < c->nb_as; i++) { +if (&c->as[i].metadata) +av_dict_free(&c->as[i].metadata); +} +av_freep(&c->as); +c->nb_as = 0; +} + if (!c->streams) return; for (i = 0; i < s->nb_streams; i++) { @@ -436,12 +456,144 @@ static void format_date_now(char *buf, int size) } } +static int write_adaptation_set(AVFormatContext *s, AVIOContext *out, int as_index) +{ +DASHContext *c = s->priv_data; +AdaptationSet *as = &c->as[as_index]; +int i; + +avio_printf(out, "\t\t\n", +as->id, as->media_type == AVMEDIA_TYPE_VIDEO ? "video" : "audio"); + +for (i = 0; i < s->nb_streams; i++) { +OutputStream *os = &c->streams[i]; + +if (os->as_idx - 1 != as_index) +continue; + +if (as->media_type == AVMEDIA_TYPE_VIDEO) { +avio_printf(out, "\t\t\t\n", +i, os->codec_str, os->bandwidth_str, s->streams[i]->codecpar->width, s->streams[i]->codecpar->height); +} else { +avio_printf(out, "\t\t\t\n", +i, os->codec_str, os->bandwidth_str, s->streams[i]->codecpar->sample_rate); +avio_printf(out, "\t\t\t\t\n", +s->streams[i]->codecpar->channels); +} +output_segment_list(os, out, c); +avio_printf(out, "\t\t\t\n"); +} +avio_printf(out, "\t\t\n"); + +return 0; +} + +static int parse_adaptation_sets(AVFormatContext *s) +{ +DASHContext *c = s->priv_data; +char *p = c->adaptation_sets; +char *q; +enum { new_set, parse_id, parsing_streams } state; +int i; + +/* default: one AdaptationSet for each stream */ +if (!p) { +void *mem = av_mallocz(sizeof(*c->as) * s->nb_streams); +if (!mem) +return AVERROR(ENOMEM); +c->as = mem; +c->nb_as = s->nb_streams; + +for (i = 0; i < s->nb_streams; i++) { +AdaptationSet *as = &c->as[i]; +OutputStream *os = &c->streams[i]; +snprintf(as->id, sizeof(as->id), "%d", i); +as->metadata = NULL; +as->media_type = s->streams[i]->codecpar->codec_type; +os->as_idx = i + 1; +} +return 0; +} + +/* syntax id=0,streams=0,1,2 id=1,streams=3,4 and so on */ +state = new_set; +while (p < c->adaptation_sets + strlen(c->adaptation_sets)) { Isn't this equivalent to "while (*p) {"? +if (*p == ' ')
Re: [libav-devel] [PATCH 08/12] dashenc: add support for assigning streams to AdaptationSets
On Fri, 27 Jan 2017, Peter Große wrote: This patch is based on the stream assignment code in webmdashenc. Additional changes: * Default to one AdaptationSet per stream Previously all mapped streams of a media type (video, audio) where assigned to a single AdaptationSet. Using the DASH live profile it is mandatory, that the segments of all representations are aligned, which is currently not enforced. This leads to problems when using video streams with different key frame intervals. So to play safe, default to one AdaptationSet per stream, unless overwritten by explicit assignment * Make sure all streams are assigned to exactly one AdaptationSet Signed-off-by: Peter Große --- libavformat/dashenc.c | 193 ++ 1 file changed, 162 insertions(+), 31 deletions(-) diff --git a/libavformat/dashenc.c b/libavformat/dashenc.c index 1b12d4f..c561ad1 100644 --- a/libavformat/dashenc.c +++ b/libavformat/dashenc.c @@ -24,6 +24,7 @@ #include #endif +#include "libavutil/avutil.h" #include "libavutil/avstring.h" #include "libavutil/eval.h" #include "libavutil/intreadwrite.h" @@ -58,9 +59,15 @@ typedef struct Segment { int n; } Segment; +typedef struct AdaptationSet { +char id[10]; +enum AVMediaType media_type; +AVDictionary *metadata; +} AdaptationSet; Oh, I also forgot to say; as far as I can see (on a sloppy read-through), metadata is unused in this patch. If that's really true, skip it here and only add it in the next patch where it is used. That makes this patch slightly less confusing. // Martin ___ libav-devel mailing list libav-devel@libav.org https://lists.libav.org/mailman/listinfo/libav-devel
Re: [libav-devel] [PATCH 09/12] dashenc: copy language and role metadata from streams assigned to sets
On Fri, 27 Jan 2017, Peter Große wrote: Signed-off-by: Peter Große --- libavformat/dashenc.c | 24 +++- 1 file changed, 23 insertions(+), 1 deletion(-) diff --git a/libavformat/dashenc.c b/libavformat/dashenc.c index c561ad1..a0c7811 100644 --- a/libavformat/dashenc.c +++ b/libavformat/dashenc.c @@ -460,10 +460,19 @@ static int write_adaptation_set(AVFormatContext *s, AVIOContext *out, int as_ind { DASHContext *c = s->priv_data; AdaptationSet *as = &c->as[as_index]; +AVDictionaryEntry *lang, *role; int i; -avio_printf(out, "\t\t\n", +avio_printf(out, "\t\tid, as->media_type == AVMEDIA_TYPE_VIDEO ? "video" : "audio"); +lang = av_dict_get(as->metadata, "language", NULL, 0); +if (lang) +avio_printf(out, " lang=\"%s\"", lang->value); +avio_printf(out, ">\n"); + +role = av_dict_get(as->metadata, "role", NULL, 0); +if (role) +avio_printf(out, "\t\t\t\n", role->value); for (i = 0; i < s->nb_streams; i++) { OutputStream *os = &c->streams[i]; @@ -669,6 +678,14 @@ static int write_manifest(AVFormatContext *s, int final) return ff_rename(temp_filename, s->filename); } +static int dict_copy_entry(AVDictionary **dst, const AVDictionary *src, const char *key) +{ +AVDictionaryEntry *entry = av_dict_get(src, key, NULL, 0); +if (entry) +av_dict_set(dst, key, entry->value, AV_DICT_DONT_OVERWRITE); +return 0; +} + static int dash_write_header(AVFormatContext *s) { DASHContext *c = s->priv_data; @@ -713,6 +730,7 @@ static int dash_write_header(AVFormatContext *s) for (i = 0; i < s->nb_streams; i++) { OutputStream *os = &c->streams[i]; +AdaptationSet *as = &c->as[os->as_idx - 1]; AVFormatContext *ctx; AVStream *st; AVDictionary *opts = NULL; @@ -743,6 +761,10 @@ static int dash_write_header(AVFormatContext *s) } } +// copy AdaptationSet language and role from stream metadata +dict_copy_entry(&as->metadata, s->streams[i]->metadata, "language"); +dict_copy_entry(&as->metadata, s->streams[i]->metadata, "role"); + ctx = avformat_alloc_context(); if (!ctx) { ret = AVERROR(ENOMEM); -- 2.10.2 This seems ok, even though it (mis)uses metadata for this kind of info. Or what does Anton think of this one? // Martin ___ libav-devel mailing list libav-devel@libav.org https://lists.libav.org/mailman/listinfo/libav-devel
Re: [libav-devel] [PATCH 10/12] dashenc: use avio_dynbuf instead of packet_write callback
On Fri, 27 Jan 2017, Peter Große wrote: The dash_write function drops data, if no IOContext is initialized. This might happen when a subordinate muxer calls avio_flush(). Using a dynamic buffer fixes that. Signed-off-by: Peter Große --- libavformat/dashenc.c | 87 +-- 1 file changed, 50 insertions(+), 37 deletions(-) This patch at least needs a mention of what muxer would do that. Currently, the only subordinate muxer you can use is the mp4 one, and that one (with the configured options) doesn't flush except when told to. diff --git a/libavformat/dashenc.c b/libavformat/dashenc.c index a0c7811..86b454e 100644 --- a/libavformat/dashenc.c +++ b/libavformat/dashenc.c @@ -68,11 +68,10 @@ typedef struct AdaptationSet { typedef struct OutputStream { AVFormatContext *ctx; int ctx_inited, as_idx; -uint8_t iobuf[32768]; AVIOContext *out; int duration_written; char initfile[1024]; -int64_t init_start_pos; +int64_t init_start_pos, pos; int init_range_length; int nb_segments, segments_size, segment_index; Segment **segments; @@ -108,14 +107,6 @@ typedef struct DASHContext { const char *utc_timing_url; } DASHContext; -static int dash_write(void *opaque, uint8_t *buf, int buf_size) -{ -OutputStream *os = opaque; -if (os->out) -avio_write(os->out, buf, buf_size); -return buf_size; -} - // RFC 6381 static void set_codec_str(AVFormatContext *s, AVCodecParameters *par, char *str, int size) @@ -182,6 +173,29 @@ static void set_codec_str(AVFormatContext *s, AVCodecParameters *par, } } +static int flush_dynbuf(OutputStream *os, int *range_length) +{ +uint8_t *buffer; +int ret; + +if (!os->ctx->pb) { +return AVERROR(EINVAL); +} + +// flush +av_write_frame(os->ctx, NULL); +avio_flush(os->ctx->pb); + +// write out to file +*range_length = avio_close_dyn_buf(os->ctx->pb, &buffer); +avio_write(os->out, buffer, *range_length); +av_free(buffer); + +// re-open buffer +ret = avio_open_dyn_buf(&os->ctx->pb); +return ret; The intermediate variable feels a little superfluous here +} + static void dash_free(AVFormatContext *s) { DASHContext *c = s->priv_data; @@ -203,7 +217,7 @@ static void dash_free(AVFormatContext *s) if (os->ctx && os->ctx_inited) av_write_trailer(os->ctx); if (os->ctx && os->ctx->pb) -av_free(os->ctx->pb); +ffio_free_dyn_buf(&os->ctx->pb); ff_format_io_close(s, &os->out); if (os->ctx) avformat_free_context(os->ctx); @@ -689,7 +703,7 @@ static int dict_copy_entry(AVDictionary **dst, const AVDictionary *src, const ch static int dash_write_header(AVFormatContext *s) { DASHContext *c = s->priv_data; -int ret = 0, i; +int ret = 0, i, range_length; AVOutputFormat *oformat; char *ptr; char basename[1024]; @@ -786,11 +800,9 @@ static int dash_write_header(AVFormatContext *s) st->time_base = s->streams[i]->time_base; ctx->avoid_negative_ts = s->avoid_negative_ts; -ctx->pb = avio_alloc_context(os->iobuf, sizeof(os->iobuf), AVIO_FLAG_WRITE, os, NULL, dash_write, NULL); -if (!ctx->pb) { -ret = AVERROR(ENOMEM); +if ((ret = avio_open_dyn_buf(&ctx->pb)) < 0) goto fail; -} +ctx->pb->seekable = 0; if (c->single_file) { if (c->single_file_name) @@ -807,11 +819,17 @@ static int dash_write_header(AVFormatContext *s) os->init_start_pos = 0; av_dict_set(&opts, "movflags", "frag_custom+dash+delay_moov", 0); -if ((ret = avformat_write_header(ctx, &opts)) < 0) { - goto fail; -} +if ((ret = avformat_write_header(ctx, &opts)) < 0) +goto fail; + +if ((ret = flush_dynbuf(os, &range_length)) < 0) +goto fail; + +if (!c->single_file) +ff_format_io_close(s, &os->out); + +os->pos = os->init_range_length = range_length; This isn't correct. See c5e7ea13d2d4da0c5da91973a547afff6fe9e011. We intentionally defer writing the init segment until we have passed the first segment of packets into the muxer, to handle the cases where the init segments depend on knowledge from the actual first few packets. (Proper edit lists for handling b-frame delay needs to know the timestamp of the first packet, and for AC3, the moov box takes a copy of the full first packet.) For using the webm muxer, you probably might need to have this behaviour conditional on the kind of muxer. os->ctx_inited = 1; -avio_flush(ctx->pb); av_dict_free(&opts); av_log(s, AV_LOG_VERBOSE, "Representation %d init segment will be written to: %s\n", i, filename); @@ -946,7 +964,6 @@ static int dash_flush(AVFormatContext *s, int final, int stream) for (i = 0; i < s->nb_streams; i++) { OutputStream
Re: [libav-devel] [PATCH 11/12] dashenc: add webm support
On Fri, 27 Jan 2017, Peter Große wrote: Use webm muxer for VP8, VP9 and Opus codec, mp4 muxer otherwise. Also copy stream metadata to output stream. Signed-off-by: Peter Große --- libavformat/dashenc.c | 68 --- 1 file changed, 54 insertions(+), 14 deletions(-) diff --git a/libavformat/dashenc.c b/libavformat/dashenc.c index 86b454e..24665cd 100644 --- a/libavformat/dashenc.c +++ b/libavformat/dashenc.c @@ -113,6 +113,23 @@ static void set_codec_str(AVFormatContext *s, AVCodecParameters *par, { const AVCodecTag *tags[2] = { NULL, NULL }; uint32_t tag; + +// common Webm codecs are not part of RFC 6381 +switch (par->codec_id) { +case AV_CODEC_ID_VP8: +snprintf(str, size, "vp8"); +return; +case AV_CODEC_ID_VP9: +snprintf(str, size, "vp9"); +return; +case AV_CODEC_ID_VORBIS: +snprintf(str, size, "vorbis"); +return; +case AV_CODEC_ID_OPUS: +snprintf(str, size, "opus"); +return; +} + Hmm, I'm pondering if it'd be worth to store these in some more compact form, like a table, for all codecs which just have a single simple plain string? if (par->codec_type == AVMEDIA_TYPE_VIDEO) tags[0] = ff_codec_movvideo_tags; else if (par->codec_type == AVMEDIA_TYPE_AUDIO) @@ -495,11 +512,11 @@ static int write_adaptation_set(AVFormatContext *s, AVIOContext *out, int as_ind continue; if (as->media_type == AVMEDIA_TYPE_VIDEO) { -avio_printf(out, "\t\t\t\n", -i, os->codec_str, os->bandwidth_str, s->streams[i]->codecpar->width, s->streams[i]->codecpar->height); +avio_printf(out, "\t\t\t\n", +i, os->ctx->oformat->name, os->codec_str, os->bandwidth_str, s->streams[i]->codecpar->width, s->streams[i]->codecpar->height); } else { -avio_printf(out, "\t\t\t\n", -i, os->codec_str, os->bandwidth_str, s->streams[i]->codecpar->sample_rate); +avio_printf(out, "\t\t\t\n", +i, os->ctx->oformat->name, os->codec_str, os->bandwidth_str, s->streams[i]->codecpar->sample_rate); Using oformat->name here feels a little fragile. I think I'd rather have that stored as a string somewhere in the context. (Not sure if it's better to have it "flexible" to compose it as video/ and audio/, or store the full "video/mp4" etc as audioMime and videoMime.) avio_printf(out, "\t\t\t\t\n", s->streams[i]->codecpar->channels); } @@ -700,11 +717,18 @@ static int dict_copy_entry(AVDictionary **dst, const AVDictionary *src, const ch return 0; } +static int dict_set_int(AVDictionary **pm, const char *key, int64_t value, int flags) +{ +char valuestr[22]; +snprintf(valuestr, sizeof(valuestr), "%"PRId64, value); +flags &= ~AV_DICT_DONT_STRDUP_VAL; +return av_dict_set(pm, key, valuestr, flags); +} + static int dash_write_header(AVFormatContext *s) { DASHContext *c = s->priv_data; int ret = 0, i, range_length; -AVOutputFormat *oformat; char *ptr; char basename[1024]; @@ -727,12 +751,6 @@ static int dash_write_header(AVFormatContext *s) if (ptr) *ptr = '\0'; -oformat = av_guess_format("mp4", NULL, NULL); -if (!oformat) { -ret = AVERROR_MUXER_NOT_FOUND; -goto fail; -} - c->streams = av_mallocz(sizeof(*c->streams) * s->nb_streams); if (!c->streams) { ret = AVERROR(ENOMEM); @@ -784,12 +802,25 @@ static int dash_write_header(AVFormatContext *s) ret = AVERROR(ENOMEM); goto fail; } + +// choose muxer based on codec: webm for VP8/9 and opus, mp4 otherwise +if (s->streams[i]->codecpar->codec_id == AV_CODEC_ID_VP8 || +s->streams[i]->codecpar->codec_id == AV_CODEC_ID_VP9 || +s->streams[i]->codecpar->codec_id == AV_CODEC_ID_OPUS) { +ctx->oformat = av_guess_format("webm", NULL, NULL); +} else { +ctx->oformat = av_guess_format("mp4", NULL, NULL); +} +if (!ctx->oformat) { +ret = AVERROR_MUXER_NOT_FOUND; +goto fail; +} os->ctx = ctx; -ctx->oformat = oformat; ctx->interrupt_callback = s->interrupt_callback; ctx->opaque = s->opaque; ctx->io_close = s->io_close; ctx->io_open= s->io_open; +av_dict_copy(&ctx->metadata, s->metadata, 0); Does the webm muxer need some specific metadata which we don't pass through right now, or is it just for making sure that metadata ends up set on the chained muxer and included in the stream? I'd at least want a comment in the commit message explaining why this is necessary. if (!(st = avformat_new_stream(ctx, NULL))) { ret = AVERROR(ENOMEM); @@ -798,7 +829,10 @@ static int dash_write_header(AVFormatContext *s) avcodec_parameters_copy(s
Re: [libav-devel] [PATCH 12/12] doc: add dash muxer
On Fri, 27 Jan 2017, Peter Große wrote: Signed-off-by: Peter Große --- doc/muxers.texi | 53 + 1 file changed, 53 insertions(+) diff --git a/doc/muxers.texi b/doc/muxers.texi index 5430da7..95d8051 100644 --- a/doc/muxers.texi +++ b/doc/muxers.texi @@ -53,6 +53,59 @@ avconv -i INPUT -c:a pcm_u8 -c:v mpeg2video -f crc - See also the @ref{framecrc} muxer. +@anchor{dash} +@section dash + +Dynamic Adaptive Streaming over HTTP (DASH) muxer that creates segments +and manifest files according to the MPEG-DASH standard ISO/IEC 23009-1:2014. + +For more information see: + +@itemize @bullet +@item +WebM DASH Specification: @url{https://sites.google.com/a/webmproject.org/wiki/adaptive-streaming/webm-dash-specification} +@item +ISO DASH Specification: @url{http://standards.iso.org/ittf/PubliclyAvailableStandards/c065274_ISO_IEC_23009-1_2014.zip} +@end itemize I'd rather have the actual ISO spec first :P + +It creates a MPD manifest file and segment files for each stream. + +The segment filename might contain pre-defined identifiers used with SegmentTemplate +as defined in section 5.3.9.4.4 of the standard. Available identifiers are "$RepresentationID$", +"$Number$", "$Bandwidth$" and "$Time$". + +@example +avconv -i in.nut -f dash +@end example It might be of value to show a full testcase also, where you use multiple quality streams for both audio and video. I've used a command line like this locally for testing that: ./avconv -re -i -map 0 -map 0 -acodec libfdk_aac -vcodec libx264 -b:v:0 800k -b:v:1 300k -s:v:1 320x170 -profile:v:1 baseline -profile:v:0 main -bf 1 -keyint_min 120 -g 120 -sc_threshold 0 -b_strategy 0 -use_timeline 1 -use_template 1 -window_size 5 -f dash -ar:a:1 22050 /path/to/out.mpd This should afaik produce well-aligned keyframes in all substreams. Feel free to pick through this and see what of it makes sense to include in an example. // Martin ___ libav-devel mailing list libav-devel@libav.org https://lists.libav.org/mailman/listinfo/libav-devel
Re: [libav-devel] [PATCH 04/12] dashenc: add option to provide UTC timing source
On Fri, 27 Jan 2017, Peter Große wrote: On Fri, 27 Jan 2017 14:58:21 +0200 (EET) Martin Storsjö wrote: On Fri, 27 Jan 2017, Peter Große wrote: If set, adds a UTCTime tag in the manifest. UTCTiming, not UTCTime, right? Right. See http://vm2.dashif.org/dash.js/docs/jsdocs/MediaPlayer.html#addUTCTimingSource for more information. Usable default: "https://time.akamai.com/?iso"; Hmm, I don't see this documented in ISO/IEC 23009-1:2014 at least, and the documentation from dashif don't seem to define it either. I won't mind including it, but I'd like to know what standard defines it before merging. Mh true, this is not part of the standard, but introduced in their "Guidelines for Implementations: DASH-IF Interoperability Points" [1] in Section 4.7. Which URL shall I reference in the commit message? The guidelines link page, the PDF, or not at all but mentioning the document's title? I'd at least refer to the guidelines page, but linking explicitly to the PDF as well probably is good. The "usable default" is just a hint. It is not mentioned in any document, it was just what I found in player implementations (like mentioned in the message above). I could drop it. No, do keep it, any such extra info is useful. But I started wondering, is there any actual use for this, when we explicitly list all the segments (either as individual segment filenames or via the template). As far as I can see, this is mostly necessary if you implicitly calculate the live edge based on availabilityStartTime and the segment length. So is there actually any gain from using it in our configuration? // Martin ___ libav-devel mailing list libav-devel@libav.org https://lists.libav.org/mailman/listinfo/libav-devel
Re: [libav-devel] [PATCH 07/12] dashenc: calculate stream bitrate from first segment if not available
On Fri, 27 Jan 2017, Peter Große wrote: On Fri, 27 Jan 2017 15:21:20 +0200 (EET) Martin Storsjö wrote: On Fri, 27 Jan 2017, Peter Große wrote: Bandwidth information is required in the manifest, but not always provided by the demuxer. If not available via metadata calculate bandwith based on the size and duration of the first segment. This probably is fine with me. We can have $Bandwidth$ as part of the segment filename template - I guess this is too late to get it right for those cases? Although I don't mind if we don't support that case when we don't have a nominal bitrate set and try to use it in the template. Than we have to postpone opening any file handle until the first call to dash_flush, right? Which should be possible, but requires some code to be moved. Using a dyn_buf (patch 10) might also help here, so writing to files happens as late a possible. Hmm, indeed. If we postpone this patch until after that, this should be easier to handle right. (The alternative would be to just count the bitrate of the codec payload as we pass into the chained muxer; the bit_rate value we use otherwise also refers to only the codec payload data not including the container, but since we're probably doing the dyn_buf stuff anyway, that's even simpler.) // Martin ___ libav-devel mailing list libav-devel@libav.org https://lists.libav.org/mailman/listinfo/libav-devel
Re: [libav-devel] [PATCH 08/12] dashenc: add support for assigning streams to AdaptationSets
On Fri, 27 Jan 2017, Peter Große wrote: On Fri, 27 Jan 2017 15:39:30 +0200 (EET) Martin Storsjö wrote: On Fri, 27 Jan 2017, Peter Große wrote: This patch is based on the stream assignment code in webmdashenc. This muxer isn't available in libav, so the reference doesn't really work in context here. True, I thought I changed to something like "This patch is based on the stream assignment code of a webm dash muxer by Vignesh Venkatasubramanian". So far only a big part of the parse_adaption_set function is copied over, the rest is basically my own code. But it looks like, I'll to also make changes to that part (regarding your comments below) so I might not need the reference anymore?! It might be ok to drop the reference altogether. Or perhaps just add something like "originally based partially on code by V.V."? That way, you have something like proper attribution, given the amount of code taken (i.e. much inspiration but not necessarily much literal lines). // Martin ___ libav-devel mailing list libav-devel@libav.org https://lists.libav.org/mailman/listinfo/libav-devel
Re: [libav-devel] [PATCH 10/12] dashenc: use avio_dynbuf instead of packet_write callback
On Fri, 27 Jan 2017, Peter Große wrote: On Fri, 27 Jan 2017 16:02:23 +0200 (EET) Martin Storsjö wrote: On Fri, 27 Jan 2017, Peter Große wrote: av_dict_set(&opts, "movflags", "frag_custom+dash+delay_moov", 0); -if ((ret = avformat_write_header(ctx, &opts)) < 0) { - goto fail; -} +if ((ret = avformat_write_header(ctx, &opts)) < 0) +goto fail; + +if ((ret = flush_dynbuf(os, &range_length)) < 0) +goto fail; + +if (!c->single_file) +ff_format_io_close(s, &os->out); + +os->pos = os->init_range_length = range_length; This isn't correct. See c5e7ea13d2d4da0c5da91973a547afff6fe9e011. We intentionally defer writing the init segment until we have passed the first segment of packets into the muxer, to handle the cases where the init segments depend on knowledge from the actual first few packets. (Proper edit lists for handling b-frame delay needs to know the timestamp of the first packet, and for AC3, the moov box takes a copy of the full first packet.) For using the webm muxer, you probably might need to have this behaviour conditional on the kind of muxer. So how do you avoid spilling actual packets into the init segment? Is it that av_write_frame only writes to the file, what was flushed before using avio_flush? This relies on the fact that in frag_custom mode, the muxer itself will never spontaneously flush any output, it only does that when explicitly flushed (with av_write_frame(NULL)). When you have the delay_moov flag added, it won't write anything at all during av_write_header, and will write the init segment during the first av_write_frame(NULL). So if changing existing code, if you add delay_moov, you need to add one extra flush to get the init segment. (If you flush earlier, the moov will be based more on guesses and less on actual data.) This code resulted from dealing with matroskaenc having his own idea when to flush and start new clusters. So my first idea would to the move the "writing out the init segment" code to a function and call it conditionally depending on the muxer. If webm, call it in write_header. Then in dash_flush call it, if os->init_range_length is zero. Yeah, that sounds sensible. Or rather invert the condition - instead of if (webm), do if (!mp4), since it's specific to the mp4+delay_moov combo. // Martin; ___ libav-devel mailing list libav-devel@libav.org https://lists.libav.org/mailman/listinfo/libav-devel
Re: [libav-devel] [PATCH 11/12] dashenc: add webm support
On Fri, 27 Jan 2017, Peter Große wrote: On Fri, 27 Jan 2017 16:09:06 +0200 (EET) Martin Storsjö wrote: +// common Webm codecs are not part of RFC 6381 +switch (par->codec_id) { +case AV_CODEC_ID_VP8: +snprintf(str, size, "vp8"); +return; +case AV_CODEC_ID_VP9: +snprintf(str, size, "vp9"); +return; +case AV_CODEC_ID_VORBIS: +snprintf(str, size, "vorbis"); +return; +case AV_CODEC_ID_OPUS: +snprintf(str, size, "opus"); +return; +} + Hmm, I'm pondering if it'd be worth to store these in some more compact form, like a table, for all codecs which just have a single simple plain string? Do you have some example code? I'm not sure if I get how to do tables in C. Like with a struct? struct codec_string { int codec_id; const char *codec_str; }; struct codec_string codecs[] = { {AV_CODEC_ID_VP8, "vp8"}, {AV_CODEC_ID_VP9, "vp9"}, ... }; Yes, something like this. If favouring compact code (which we tend to do), something like this would do: static struct codec_string { int id; const char *str; } codecs[] = { { AV_CODEC_ID_VP8, "vp8" }, ... { 0, NULL } }; int i; for (i = 0; codecs[i].id; i++) if (codecs[i].id == id) return codecs[i].str; All in all, the number of lines probably might end up higher than before, but the bulk of the repetitive part is more compact, and adding another entry is less annoying. Using oformat->name here feels a little fragile. I think I'd rather have that stored as a string somewhere in the context. (Not sure if it's better to have it "flexible" to compose it as video/ and audio/, or store the full "video/mp4" etc as audioMime and videoMime.) I might opt for the short version, since I use the format name also for conditionals like if (!strcmp(os->ctx->oformat->name, "mp4")) Therefor maybe a char *format_name in the OutputStream struct? That might be ok. Maybe a comment at the place where it's assigned, saying that this will be used as part of a mime string (so future other formats won't accidentally create bogus mime strings). ctx->interrupt_callback = s->interrupt_callback; ctx->opaque = s->opaque; ctx->io_close = s->io_close; ctx->io_open= s->io_open; +av_dict_copy(&ctx->metadata, s->metadata, 0); Does the webm muxer need some specific metadata which we don't pass through right now, or is it just for making sure that metadata ends up set on the chained muxer and included in the stream? I'd at least want a comment in the commit message explaining why this is necessary. No specific metadata is needed. I can move this to a separate commit. if (!(st = avformat_new_stream(ctx, NULL))) { ret = AVERROR(ENOMEM); @@ -798,7 +829,10 @@ static int dash_write_header(AVFormatContext *s) avcodec_parameters_copy(st->codecpar, s->streams[i]->codecpar); st->sample_aspect_ratio = s->streams[i]->sample_aspect_ratio; st->time_base = s->streams[i]->time_base; +st->avg_frame_rate = s->streams[i]->avg_frame_rate; ctx->avoid_negative_ts = s->avoid_negative_ts; +ctx->flags = s->flags; +ctx->max_delay = s->max_delay; Are these new settings strictly necessary for chained-webm? If not, I'd rather add them in a separate commit afterwards, with an explanation on what it helps for. To be honest, I haven't checked. I just looked at what other muxers having subordinate muxers do, when they initialize the context. Ok - absolutely in another commit then, if at all. In general it might seem useful to pass on as many fields as possible, but so far, I've taken the inverse approach; only forward the ones I absolutely know I need. I probably don't mind adding this with a commit message with a good motivation though. // Martin ___ libav-devel mailing list libav-devel@libav.org https://lists.libav.org/mailman/listinfo/libav-devel
Re: [libav-devel] [PATCH 07/12] dashenc: calculate stream bitrate from first segment if not available
On Sat, 28 Jan 2017, Peter Große wrote: On Fri, 27 Jan 2017 22:24:35 +0200 (EET) Martin Storsjö wrote: Than we have to postpone opening any file handle until the first call to dash_flush, right? Which should be possible, but requires some code to be moved. Using a dyn_buf (patch 10) might also help here, so writing to files happens as late a possible. Hmm, indeed. If we postpone this patch until after that, this should be easier to handle right. (The alternative would be to just count the bitrate of the codec payload as we pass into the chained muxer; the bit_rate value we use otherwise also refers to only the codec payload data not including the container, but since we're probably doing the dyn_buf stuff anyway, that's even simpler.) Mh, after thinking about this a bit more, I'm not sure the complexity is worth it. I'd have to store the contents of the dyn_buf containing the init data somewhere until the first packet is ready to be flushed so I can calculate the average bitrate to be used in the filenames. Only then I know the init segment filename. This might be ok for mp4, since everything happens within the first call to dash_flush. But for webm this doesn't work that easy. So maybe the patch is ok for now. Ok - it's not worth complicating the code too much just for that pathological case. // Martin ___ libav-devel mailing list libav-devel@libav.org https://lists.libav.org/mailman/listinfo/libav-devel
Re: [libav-devel] [PATCH v2 01/10] dashenc: add option to provide UTC timing source
On Sun, 29 Jan 2017, Peter Große wrote: From: Peter Große If set, adds a UTCTiming tag in the manifest. This is part of the recommendations listed in the "Guidelines for Implementations: DASH-IF Interoperability Points" [1][2] Section 4.7 describes means for the Availability Time Synchronization. A usable default is "https://time.akamai.com/?iso"; [1] http://dashif.org/guidelines/ [2] http://dashif.org/wp-content/uploads/2016/12/DASH-IF-IOP-v4.0-clean.pdf (current version as of writing) Signed-off-by: Peter Große --- v2: add references to IOP Content-wise, the patch looks fine. But I'm still missing a comment on a question from the previous review: But I started wondering, is there any actual use for this, when we explicitly list all the segments (either as individual segment filenames or via the template). As far as I can see, this is mostly necessary if you implicitly calculate the live edge based on availabilityStartTime and the segment length. So is there actually any gain from using it in our configuration? // Martin ___ libav-devel mailing list libav-devel@libav.org https://lists.libav.org/mailman/listinfo/libav-devel
Re: [libav-devel] [PATCH v2 02/10] dashenc: separate segments based on current segment duration
On Sun, 29 Jan 2017, Peter Große wrote: The current implementation creates new segments comparing pkt->pts - first_pts > nb_segs * min_seg_duration This works fine, but if the keyframe interval is smaller than "min_seg_duration" segments shorter than the minimum segment duration are created. Example: keyint=50, min_seg_duration=300 segment 1 contains keyframe 1 (duration=2s < total_duration=3s) and keyframe 2 (duration=4s >= total_duration=3s) segment 2 contains keyframe 3 (duration=6s >= total_duration=6s) segment 3 contains keyframe 4 (duration=8s < total_duration=9s) and keyframe 5 (duration=10s >= total_duration=9s) ... Segment 2 is only 2s long, shorter than min_seg_duration = 3s. To fix this, new segments are created based on the actual written duration. Otherwise the option name "min_seg_duration" is misleading. Signed-off-by: Peter Große --- v2: use pts difference instead of fragile duration calculation Seems ok to me. // Martin ___ libav-devel mailing list libav-devel@libav.org https://lists.libav.org/mailman/listinfo/libav-devel
Re: [libav-devel] [PATCH v2 03/10] dashenc: calculate stream bitrate from first segment if not available
On Sun, 29 Jan 2017, Peter Große wrote: Bandwidth information is required in the manifest, but not always provided by the demuxer. In that case calculate the bandwith based on the size and duration of the first segment. Signed-off-by: Peter Große --- v2: reword commit message --- libavformat/dashenc.c | 10 ++ 1 file changed, 10 insertions(+) diff --git a/libavformat/dashenc.c b/libavformat/dashenc.c index 98722cd..ae1bb0b 100644 --- a/libavformat/dashenc.c +++ b/libavformat/dashenc.c @@ -834,6 +834,16 @@ static int dash_flush(AVFormatContext *s, int final, int stream) if (ret < 0) break; } + +if (!os->bit_rate) { +// calculate average bitrate of first segment +double bitrate = (int)( (double) range_length * 8.0 * AV_TIME_BASE / (double)(os->max_pts - os->start_pts) ); Sorry for not pointing it out in the last round, but promoting this to double feels a bit superfluous. Just cast to int64_t instead, that should be just fine for this use case. (Floats can end up rounded differently on different platforms, and even though it doesn't really matter here, having it completely determinisic when easily possible is desireable.) // Martin ___ libav-devel mailing list libav-devel@libav.org https://lists.libav.org/mailman/listinfo/libav-devel
Re: [libav-devel] [PATCH v2 04/10] dashenc: add support for assigning streams to AdaptationSets
On Sun, 29 Jan 2017, Peter Große wrote: Also makes sure all streams are assigned to exactly one AdaptationSet. This patch is originally based partially on code by Vignesh Venkatasubramanian. Signed-off-by: Peter Große --- v2: * changing default stream assignment moved to separate patch * removed metadata field from AdaptationSet * check strtol result in parse_adaptation_set to detect non-numeric input * move allocation of adaptation set to separate function * fix string parsing problems pointed out by Martin --- libavformat/dashenc.c | 201 ++ 1 file changed, 170 insertions(+), 31 deletions(-) Very nice improvement from the previous iteration! I appreciate the fact that the default behaviour isn't changed here as well. +os = &c->streams[i]; +if (as->media_type == AVMEDIA_TYPE_UNKNOWN) { +as->media_type = s->streams[i]->codecpar->codec_type; +} else if (as->media_type != s->streams[i]->codecpar->codec_type) { +av_log(s, AV_LOG_ERROR, "Mixing codec types within an AdaptationSet is not allowed\n"); +return AVERROR(EINVAL); +} else if (os->as_idx) { +av_log(s, AV_LOG_ERROR, "Assigning a stream to more than one AdaptationSet is not allowed\n"); +return AVERROR(EINVAL); +} +os->as_idx = c->nb_as; + +if (*p == ' ') state = new_set; +if (*p) +p++; Nitpick: The state = new_set; statement could be on a separate line. I can change that before pushing. Other than that, I think this is good to go. // Martin ___ libav-devel mailing list libav-devel@libav.org https://lists.libav.org/mailman/listinfo/libav-devel
Re: [libav-devel] [PATCH v2 05/10] dashenc: allow assigning all streams of a media type to an AdaptationSet
On Sun, 29 Jan 2017, Peter Große wrote: Using the characters "v" or "a" instead of stream index numbers for assigning streams in the adaption_set option, all streams matching that given type will be added to the AdaptationSet. Signed-off-by: Peter Große --- libavformat/dashenc.c | 61 +-- 1 file changed, 45 insertions(+), 16 deletions(-) Nice, this looks good // Martin ___ libav-devel mailing list libav-devel@libav.org https://lists.libav.org/mailman/listinfo/libav-devel