[libav-devel] [PATCH] fate.sh: Allow setting other make flags for running tests

2016-11-22 Thread Martin Storsjö
If makeopts_fate is set, these makeopts are used for running the
tests instead of the normal makeopts. If it isn't set, the normal
makeopts variable is used as before.

This is useful if remote testing on a lesser machine where a large
number of parallel jobs might be undesireable, while wanting to speed
up the build with many parallel processes.
---
 tests/fate.sh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/fate.sh b/tests/fate.sh
index b8ee1ae..f7ca891 100755
--- a/tests/fate.sh
+++ b/tests/fate.sh
@@ -73,7 +73,7 @@ compile()(
 fate()(
 test "$build_only" = "yes" && return
 cd ${build} || return
-${make} ${makeopts} -k fate
+${make} ${makeopts_fate-${makeopts}} -k fate
 )
 
 clean(){
-- 
2.9.3 (Apple Git-75)

___
libav-devel mailing list
libav-devel@libav.org
https://lists.libav.org/mailman/listinfo/libav-devel


[libav-devel] [PATCH 04/11] arm: vp9itxfm: Skip empty slices in the first pass of idct_idct 16x16 and 32x32

2016-11-23 Thread Martin Storsjö
This work is sponsored by, and copyright, Google.

Previously all subpartitions except the eob=1 (DC) case ran with
the same runtime:

vp9_inv_dct_dct_16x16_sub16_add_neon:   3189.0   2486.8   2509.9   1964.1
vp9_inv_dct_dct_32x32_sub32_add_neon:  18448.1  16682.0  14235.4  11993.4

By skipping individual 4x16 or 4x32 pixel slices in the first pass,
we reduce the runtime of these functions like this:

vp9_inv_dct_dct_16x16_sub1_add_neon: 271.5188.7211.6235.1
vp9_inv_dct_dct_16x16_sub4_add_neon:2079.7   1606.3   1772.1   1264.8
vp9_inv_dct_dct_16x16_sub8_add_neon:2449.2   1834.3   2046.5   1499.7
vp9_inv_dct_dct_16x16_sub12_add_neon:   2826.2   2109.2   2295.9   1758.2
vp9_inv_dct_dct_16x16_sub16_add_neon:   3224.1   2476.5   2533.1   1985.7
vp9_inv_dct_dct_32x32_sub1_add_neon: 752.5457.5863.7554.7
vp9_inv_dct_dct_32x32_sub4_add_neon:   10689.2   8013.4   8592.9   6785.9
vp9_inv_dct_dct_32x32_sub8_add_neon:   12217.8   9068.1   9420.4   7518.3
vp9_inv_dct_dct_32x32_sub12_add_neon:  12967.3  10455.5  10223.9   8275.7
vp9_inv_dct_dct_32x32_sub16_add_neon:  14084.1  11933.7  10998.9   9012.5
vp9_inv_dct_dct_32x32_sub20_add_neon:  15171.4  13335.0  11820.6   9757.2
vp9_inv_dct_dct_32x32_sub24_add_neon:  16229.6  15185.7  12614.4  10504.9
vp9_inv_dct_dct_32x32_sub28_add_neon:  17338.1  15955.3  13445.0  11248.4
vp9_inv_dct_dct_32x32_sub32_add_neon:  18465.7  16974.6  14239.2  11999.1

I.e. in general a very minor overhead for the full subpartition case due
to the additional cmps, but a significant speedup for the cases when we
only need to process a small part of the actual input data.

In common VP9 content in a few inspected clips, 70-90% of the non-dc-only
16x16 and 32x32 IDCTs only have nonzero coefficients in the upper left
8x8 or 16x16 subpartitions respectively.
---
This goes on top of the checkasm vp9dsp patch that adds benchmarking
of generic subpartitions in the itxfm.
---
 libavcodec/arm/vp9itxfm_neon.S | 70 --
 tests/checkasm/vp9dsp.c|  6 ++--
 2 files changed, 64 insertions(+), 12 deletions(-)

diff --git a/libavcodec/arm/vp9itxfm_neon.S b/libavcodec/arm/vp9itxfm_neon.S
index 01944bd..769579a 100644
--- a/libavcodec/arm/vp9itxfm_neon.S
+++ b/libavcodec/arm/vp9itxfm_neon.S
@@ -659,10 +659,17 @@ endfunc
 @ Read a vertical 4x16 slice out of a 16x16 matrix, do a transform on it,
 @ transpose into a horizontal 16x4 slice and store.
 @ r0 = dst (temp buffer)
-@ r1 = unused
+@ r1 = slice offset
 @ r2 = src
-@ r3 = slice offset
+@ r3 = eob
+@ r9 = min eob
 function \txfm\()16_1d_4x16_pass1_neon
+.ifc \txfm,idct
+@ Check if this whole input slice is zero
+cmp r3,  r9
+ble 2f
+.endif
+
 mov r12, #32
 vmov.s16q2, #0
 .irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
@@ -678,14 +685,14 @@ function \txfm\()16_1d_4x16_pass1_neon
 transpose16_q_4x_4x4 q8,  q9,  q10, q11, q12, q13, q14, q15, d16, d17, 
d18, d19, d20, d21, d22, d23, d24, d25, d26, d27, d28, d29, d30, d31
 
 @ Store the transposed 4x4 blocks horizontally.
-cmp r3,  #12
+cmp r1,  #12
 beq 1f
 .irp i, 16, 20, 24, 28, 17, 21, 25, 29, 18, 22, 26, 30, 19, 23, 27, 31
 vst1.16 {d\i}, [r0,:64]!
 .endr
 bx  lr
 1:
-@ Special case: For the last input column (r3 == 12),
+@ Special case: For the last input column (r1 == 12),
 @ which would be stored as the last row in the temp buffer,
 @ don't store the first 4x4 block, but keep it in registers
 @ for the first slice of the second pass (where it is the
@@ -711,6 +718,18 @@ function \txfm\()16_1d_4x16_pass1_neon
 vmovd30, d18
 vmovd31, d19
 bx  lr
+
+.ifc \txfm,idct
+2:
+@ Set d28-d31 to zero, for the in-register passthrough of coefficients 
to pass 2
+vmov.i16q14, #0
+vmov.i16q15, #0
+@ Write zeros to the temp buffer for pass 2
+.rept 4
+vst1.16 {q14-q15}, [r0,:128]!
+.endr
+bx  lr
+.endif
 endfunc
 
 @ Read a vertical 4x16 slice out of a 16x16 matrix, do a transform on it,
@@ -781,15 +800,23 @@ endfunc
 itxfm16_1d_funcs idct
 itxfm16_1d_funcs iadst
 
+@ This is the minimum eob value for each subpartition, in increments of 4
+const min_eob_idct_idct_16, align=4
+.short  0, 10, 38, 89
+endconst
+
 .macro itxfm_func16x16 txfm1, txfm2
 function ff_vp9_\txfm1\()_\txfm2\()_16x16_add_neon, export=1
 .ifc \txfm1\()_\txfm2,idct_idct
 cmp r3,  #1
 beq idct16x16_dc_add_neon
 .endif
-push{r4-r7,lr}
+push{r4-r9,lr}
 .ifnc \txfm1\()_\txfm2,idct_idct
 vpush   {q4-q7}
+mov r9,  #0
+.else
+movrel  r8,  min_eob_

[libav-devel] [PATCH 02/11] aarch64: vp9itxfm: Don't repeatedly set x9 when nothing overwrites it

2016-11-23 Thread Martin Storsjö
---
 libavcodec/aarch64/vp9itxfm_neon.S | 2 --
 1 file changed, 2 deletions(-)

diff --git a/libavcodec/aarch64/vp9itxfm_neon.S 
b/libavcodec/aarch64/vp9itxfm_neon.S
index 2dc6b75..21352f0 100644
--- a/libavcodec/aarch64/vp9itxfm_neon.S
+++ b/libavcodec/aarch64/vp9itxfm_neon.S
@@ -1001,7 +1001,6 @@ function idct32_1d_8x32_pass2_neon
 
 idct16
 
-mov x9,  #128
 .irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
 st1 {v\i\().8h}, [x2], x9
 .endr
@@ -1018,7 +1017,6 @@ function idct32_1d_8x32_pass2_neon
 
 idct32_odd
 
-mov x9,  #128
 .macro load_acc_store a, b, c, d, neg=0
 ld1 {v4.8h},  [x2], x9
 ld1 {v5.8h},  [x2], x9
-- 
2.7.4

___
libav-devel mailing list
libav-devel@libav.org
https://lists.libav.org/mailman/listinfo/libav-devel


[libav-devel] [PATCH 03/11] arm: vp9itxfm: Rename a macro parameter to fit better

2016-11-23 Thread Martin Storsjö
Since the same parameter is used for both input and output,
the name inout is more fitting.

This matches the naming used below in the dmbutterfly macro.
---
 libavcodec/arm/vp9itxfm_neon.S | 14 +++---
 1 file changed, 7 insertions(+), 7 deletions(-)

diff --git a/libavcodec/arm/vp9itxfm_neon.S b/libavcodec/arm/vp9itxfm_neon.S
index acb1c6d..01944bd 100644
--- a/libavcodec/arm/vp9itxfm_neon.S
+++ b/libavcodec/arm/vp9itxfm_neon.S
@@ -125,16 +125,16 @@ endconst
 vmlal.s16   \out4, \in4, \coef1
 .endm
 
-@ in1 = (in1 * coef1 - in2 * coef2 + (1 << 13)) >> 14
-@ in2 = (in1 * coef2 + in2 * coef1 + (1 << 13)) >> 14
-@ in are 2 d registers, tmp are 2 q registers
-.macro mbutterfly in1, in2, coef1, coef2, tmp1, tmp2, neg=0
-mbutterfly_l\tmp1, \tmp2, \in1, \in2, \coef1, \coef2
+@ inout1 = (inout1 * coef1 - inout2 * coef2 + (1 << 13)) >> 14
+@ inout2 = (inout1 * coef2 + inout2 * coef1 + (1 << 13)) >> 14
+@ inout are 2 d registers, tmp are 2 q registers
+.macro mbutterfly inout1, inout2, coef1, coef2, tmp1, tmp2, neg=0
+mbutterfly_l\tmp1, \tmp2, \inout1, \inout2, \coef1, \coef2
 .if \neg > 0
 vneg.s32\tmp2, \tmp2
 .endif
-vrshrn.s32  \in1, \tmp1,  #14
-vrshrn.s32  \in2, \tmp2,  #14
+vrshrn.s32  \inout1, \tmp1,  #14
+vrshrn.s32  \inout2, \tmp2,  #14
 .endm
 
 @ inout1,inout2 = (inout1,inout2 * coef1 - inout3,inout4 * coef2 + (1 << 13)) 
>> 14
-- 
2.7.4

___
libav-devel mailing list
libav-devel@libav.org
https://lists.libav.org/mailman/listinfo/libav-devel


[libav-devel] [PATCH 01/11] arm/aarch64: vp9itxfm: Fix indentation of macro arguments

2016-11-23 Thread Martin Storsjö
---
 libavcodec/aarch64/vp9itxfm_neon.S | 16 
 libavcodec/arm/vp9itxfm_neon.S |  8 
 2 files changed, 12 insertions(+), 12 deletions(-)

diff --git a/libavcodec/aarch64/vp9itxfm_neon.S 
b/libavcodec/aarch64/vp9itxfm_neon.S
index 65406b9..2dc6b75 100644
--- a/libavcodec/aarch64/vp9itxfm_neon.S
+++ b/libavcodec/aarch64/vp9itxfm_neon.S
@@ -969,14 +969,14 @@ function idct32_1d_8x32_pass1_neon
 st1 {v7.8h},  [x0], #16
 .endm
 
-store_rev 31, 23
-store_rev 30, 22
-store_rev 29, 21
-store_rev 28, 20
-store_rev 27, 19
-store_rev 26, 18
-store_rev 25, 17
-store_rev 24, 16
+store_rev   31, 23
+store_rev   30, 22
+store_rev   29, 21
+store_rev   28, 20
+store_rev   27, 19
+store_rev   26, 18
+store_rev   25, 17
+store_rev   24, 16
 .purgem store_rev
 ret
 endfunc
diff --git a/libavcodec/arm/vp9itxfm_neon.S b/libavcodec/arm/vp9itxfm_neon.S
index 5d73d84..acb1c6d 100644
--- a/libavcodec/arm/vp9itxfm_neon.S
+++ b/libavcodec/arm/vp9itxfm_neon.S
@@ -1017,10 +1017,10 @@ function idct32_1d_4x32_pass1_neon
 .endr
 .endm
 
-store_rev 31, 27, 23, 19
-store_rev 30, 26, 22, 18
-store_rev 29, 25, 21, 17
-store_rev 28, 24, 20, 16
+store_rev   31, 27, 23, 19
+store_rev   30, 26, 22, 18
+store_rev   29, 25, 21, 17
+store_rev   28, 24, 20, 16
 .purgem store_rev
 bx  lr
 endfunc
-- 
2.7.4

___
libav-devel mailing list
libav-devel@libav.org
https://lists.libav.org/mailman/listinfo/libav-devel


[libav-devel] [PATCH 05/11] arm: vp9itxfm: Make the larger core transforms standalone functions

2016-11-23 Thread Martin Storsjö
This work is sponsored by, and copyright, Google.

This reduces the code size of libavcodec/arm/vp9itxfm_neon.o from
15252 to 12316 bytes.

This gives a small slowdown of a couple tens of cycles, up to a few
hundred cycles for the full case of the largest transform, but makes
it more feasible to add more optimized versions of these transforms.

Before:
vp9_inv_dct_dct_16x16_sub4_add_neon:2079.7   1606.3   1772.1   1264.8
vp9_inv_dct_dct_16x16_sub16_add_neon:   3224.1   2476.5   2533.1   1985.7
vp9_inv_dct_dct_32x32_sub4_add_neon:   10689.2   8013.4   8592.9   6785.9
vp9_inv_dct_dct_32x32_sub32_add_neon:  18465.7  16974.6  14239.2  11999.1

After:
vp9_inv_dct_dct_16x16_sub4_add_neon:2214.2   1617.2   1767.9   1286.5
vp9_inv_dct_dct_16x16_sub16_add_neon:   3256.8   2489.2   2554.8   2002.9
vp9_inv_dct_dct_32x32_sub4_add_neon:   10891.4   8124.4   8614.0   6828.5
vp9_inv_dct_dct_32x32_sub32_add_neon:  18626.4  17264.0  14298.7  12067.5
---
 libavcodec/arm/vp9itxfm_neon.S | 41 -
 1 file changed, 24 insertions(+), 17 deletions(-)

diff --git a/libavcodec/arm/vp9itxfm_neon.S b/libavcodec/arm/vp9itxfm_neon.S
index 769579a..d10de1e 100644
--- a/libavcodec/arm/vp9itxfm_neon.S
+++ b/libavcodec/arm/vp9itxfm_neon.S
@@ -534,7 +534,7 @@ function idct16x16_dc_add_neon
 endfunc
 .ltorg
 
-.macro idct16
+function idct16
 mbutterfly0 d16, d24, d16, d24, d4, d6,  q2,  q3 @ d16 = t0a,  d24 
= t1a
 mbutterfly  d20, d28, d0[1], d0[2], q2,  q3  @ d20 = t2a,  d28 = 
t3a
 mbutterfly  d18, d30, d0[3], d1[0], q2,  q3  @ d18 = t4a,  d30 = 
t7a
@@ -580,9 +580,10 @@ endfunc
 vmovd4,  d21 @ d4  = t10a
 butterfly   d20, d27, d6,  d27   @ d20 = out[4], d27 = 
out[11]
 butterfly   d21, d26, d26, d4@ d21 = out[5], d26 = 
out[10]
-.endm
+bx  lr
+endfunc
 
-.macro iadst16
+function iadst16
 movrel  r12, iadst16_coeffs
 vld1.16 {q0-q1}, [r12,:128]
 
@@ -653,7 +654,8 @@ endfunc
 
 vmovd16, d2
 vmovd30, d4
-.endm
+bx  lr
+endfunc
 
 .macro itxfm16_1d_funcs txfm
 @ Read a vertical 4x16 slice out of a 16x16 matrix, do a transform on it,
@@ -669,6 +671,7 @@ function \txfm\()16_1d_4x16_pass1_neon
 cmp r3,  r9
 ble 2f
 .endif
+push{lr}
 
 mov r12, #32
 vmov.s16q2, #0
@@ -677,7 +680,7 @@ function \txfm\()16_1d_4x16_pass1_neon
 vst1.16 {d4},  [r2,:64], r12
 .endr
 
-\txfm\()16
+bl  \txfm\()16
 
 @ Do four 4x4 transposes. Originally, d16-d31 contain the
 @ 16 rows. Afterwards, d16-d19, d20-d23, d24-d27, d28-d31
@@ -690,7 +693,7 @@ function \txfm\()16_1d_4x16_pass1_neon
 .irp i, 16, 20, 24, 28, 17, 21, 25, 29, 18, 22, 26, 30, 19, 23, 27, 31
 vst1.16 {d\i}, [r0,:64]!
 .endr
-bx  lr
+pop {pc}
 1:
 @ Special case: For the last input column (r1 == 12),
 @ which would be stored as the last row in the temp buffer,
@@ -717,7 +720,7 @@ function \txfm\()16_1d_4x16_pass1_neon
 vmovd29, d17
 vmovd30, d18
 vmovd31, d19
-bx  lr
+pop {pc}
 
 .ifc \txfm,idct
 2:
@@ -739,6 +742,7 @@ endfunc
 @ r2 = src (temp buffer)
 @ r3 = slice offset
 function \txfm\()16_1d_4x16_pass2_neon
+push{lr}
 mov r12, #32
 .irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27
 vld1.16 {d\i}, [r2,:64], r12
@@ -752,7 +756,7 @@ function \txfm\()16_1d_4x16_pass2_neon
 
 add r3,  r0,  r1
 lsl r1,  r1,  #1
-\txfm\()16
+bl  \txfm\()16
 
 .macro load_add_store coef0, coef1, coef2, coef3
 vrshr.s16   \coef0, \coef0, #6
@@ -793,7 +797,7 @@ function \txfm\()16_1d_4x16_pass2_neon
 load_add_store  q12, q13, q14, q15
 .purgem load_add_store
 
-bx  lr
+pop {pc}
 endfunc
 .endm
 
@@ -906,7 +910,7 @@ function idct32x32_dc_add_neon
 bx  lr
 endfunc
 
-.macro idct32_odd
+function idct32_odd
 movrel  r12, idct_coeffs
 add r12, r12, #32
 vld1.16 {q0-q1}, [r12,:128]
@@ -965,7 +969,8 @@ endfunc
 mbutterfly0 d26, d21, d26, d21, d4, d6, q2, q3 @ d26 = t26a, d21 = 
t21a
 mbutterfly0 d25, d22, d25, d22, d4, d6, q2, q3 @ d25 = t25,  d22 = 
t22
 mbutterfly0 d24, d23, d24, d23, d4, d6, q2, q3 @ d24 = t24a, d23 = 
t23a
-.endm
+bx  lr
+endfunc
 
 @ Do an 32-point IDCT of a 4x32 slice out of a 32x32 matrix.
 @ We don't have register space to do a single pass IDCT of 4x32 though,
@@ -981,6 +986,7 @@ function id

[libav-devel] [PATCH 08/11] aarch64: vp9itxfm: Skip empty slices in the first pass of idct_idct 16x16 and 32x32

2016-11-23 Thread Martin Storsjö
This work is sponsored by, and copyright, Google.

Previously all subpartitions except the eob=1 (DC) case ran with
the same runtime:

vp9_inv_dct_dct_16x16_sub16_add_neon:   1372.2
vp9_inv_dct_dct_32x32_sub32_add_neon:   8083.0

By skipping individual 8x16 or 8x32 pixel slices in the first pass,
we reduce the runtime of these functions like this:

vp9_inv_dct_dct_16x16_sub1_add_neon: 236.7
vp9_inv_dct_dct_16x16_sub4_add_neon:1051.7
vp9_inv_dct_dct_16x16_sub8_add_neon:1051.7
vp9_inv_dct_dct_16x16_sub12_add_neon:   1373.7
vp9_inv_dct_dct_16x16_sub16_add_neon:   1377.4
vp9_inv_dct_dct_32x32_sub1_add_neon: 555.1
vp9_inv_dct_dct_32x32_sub4_add_neon:5186.6
vp9_inv_dct_dct_32x32_sub8_add_neon:5195.7
vp9_inv_dct_dct_32x32_sub12_add_neon:   6151.5
vp9_inv_dct_dct_32x32_sub16_add_neon:   6160.0
vp9_inv_dct_dct_32x32_sub20_add_neon:   7121.0
vp9_inv_dct_dct_32x32_sub24_add_neon:   7120.6
vp9_inv_dct_dct_32x32_sub28_add_neon:   8090.1
vp9_inv_dct_dct_32x32_sub32_add_neon:   8095.2

I.e. in general a very minor overhead for the full subpartition case due
to the additional cmps, but a significant speedup for the cases when we
only need to process a small part of the actual input data.
---
 libavcodec/aarch64/vp9itxfm_neon.S | 69 ++
 1 file changed, 63 insertions(+), 6 deletions(-)

diff --git a/libavcodec/aarch64/vp9itxfm_neon.S 
b/libavcodec/aarch64/vp9itxfm_neon.S
index 21352f0..dc58652 100644
--- a/libavcodec/aarch64/vp9itxfm_neon.S
+++ b/libavcodec/aarch64/vp9itxfm_neon.S
@@ -588,6 +588,9 @@ endfunc
 .macro store i, dst, inc
 st1 {v\i\().8h},  [\dst], \inc
 .endm
+.macro movi_v i, size, imm
+moviv\i\()\size,  \imm
+.endm
 .macro load_clear i, src, inc
 ld1 {v\i\().8h}, [\src]
 st1 {v2.8h},  [\src], \inc
@@ -596,11 +599,18 @@ endfunc
 // Read a vertical 8x16 slice out of a 16x16 matrix, do a transform on it,
 // transpose into a horizontal 16x8 slice and store.
 // x0 = dst (temp buffer)
-// x1 = unused
+// x1 = slice offset
 // x2 = src
-// x3 = slice offset
+// w3 = eob
+// w7 = min eob
 .macro itxfm16_1d_funcs txfm
 function \txfm\()16_1d_8x16_pass1_neon
+.ifc \txfm,idct
+// Check if this whole input slice is zero
+cmp w3, w7
+b.le2f
+.endif
+
 mov x9, #32
 moviv2.8h, #0
 .irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
@@ -616,14 +626,14 @@ function \txfm\()16_1d_8x16_pass1_neon
 transpose_8x8H  v24, v25, v26, v27, v28, v29, v30, v31, v2, v3
 
 // Store the transposed 8x8 blocks horizontally.
-cmp x3,  #8
+cmp x1,  #8
 b.eq1f
 .irp i, 16, 24, 17, 25, 18, 26, 19, 27, 20, 28, 21, 29, 22, 30, 23, 31
 store   \i,  x0,  #16
 .endr
 ret
 1:
-// Special case: For the last input column (x3 == 8),
+// Special case: For the last input column (x1 == 8),
 // which would be stored as the last row in the temp buffer,
 // don't store the first 8x8 block, but keep it in registers
 // for the first slice of the second pass (where it is the
@@ -641,6 +651,20 @@ function \txfm\()16_1d_8x16_pass1_neon
 mov v30.16b, v22.16b
 mov v31.16b, v23.16b
 ret
+
+.ifc \txfm,idct
+2:
+// Set v24-v31 to zero, for the in-register passthrough of
+// coefficients to pass 2. Since we only do two slices, this can
+// only ever happen for the second slice. So we only need to store
+// zeros to the temp buffer for the second half of the buffer.
+.irp i, 24, 25, 26, 27, 28, 29, 30, 31
+add x0,  x0,  #16
+movi_v  \i,  .16b, #0
+store   24,  x0,  #16
+.endr
+ret
+.endif
 endfunc
 
 // Read a vertical 8x16 slice out of a 16x16 matrix, do a transform on it,
@@ -719,6 +743,11 @@ endfunc
 itxfm16_1d_funcs idct
 itxfm16_1d_funcs iadst
 
+// This is the minimum eob value for each subpartition, in increments of 8
+const min_eob_idct_idct_16, align=4
+.short  0, 38
+endconst
+
 .macro itxfm_func16x16 txfm1, txfm2
 function ff_vp9_\txfm1\()_\txfm2\()_16x16_add_neon, export=1
 .ifc \txfm1\()_\txfm2,idct_idct
@@ -743,6 +772,9 @@ function ff_vp9_\txfm1\()_\txfm2\()_16x16_add_neon, export=1
 movrel  x10, idct_coeffs
 .ifnc \txfm1\()_\txfm2,idct_idct
 movrel  x11, iadst16_coeffs
+mov x7,  #0
+.else
+movrel  x12, min_eob_idct_idct_16
 .endif
 .ifc \txfm1,idct
 ld1 {v0.8h,v1.8h}, [x10]
@@ -750,8 +782,11 @@ function ff_vp9_\txfm1\()_\txfm2\()_16x16_add_neon, 
export=1
 
 .irp i, 0, 8
 add x0,  sp,  #(\i*32)
+mov x1,  #\i
 add x2,  x6,  #(\i*2)
-mov x3,  #\i
+.ifc \txfm1\()_\txfm

[libav-devel] [PATCH 09/11] aarch64: vp9itxfm: Make the larger core transforms standalone functions

2016-11-23 Thread Martin Storsjö
This work is sponsored by, and copyright, Google.

This reduces the code size of libavcodec/aarch64/vp9itxfm_neon.o from
19540 to 14784 bytes.

This gives a small slowdown of a couple of tens of cycles, and makes
it more feasible to add more optimized versions of these transforms.

Before:
vp9_inv_dct_dct_16x16_sub4_add_neon:  1051.8
vp9_inv_dct_dct_16x16_sub16_add_neon: 1378.8
vp9_inv_dct_dct_32x32_sub4_add_neon:  5190.8
vp9_inv_dct_dct_32x32_sub32_add_neon: 8091.7

After:
vp9_inv_dct_dct_16x16_sub4_add_neon:  1065.0
vp9_inv_dct_dct_16x16_sub16_add_neon: 1390.7
vp9_inv_dct_dct_32x32_sub4_add_neon:  5208.6
vp9_inv_dct_dct_32x32_sub32_add_neon: 8121.9
---
 libavcodec/aarch64/vp9itxfm_neon.S | 41 ++
 1 file changed, 24 insertions(+), 17 deletions(-)

diff --git a/libavcodec/aarch64/vp9itxfm_neon.S 
b/libavcodec/aarch64/vp9itxfm_neon.S
index dc58652..ef40a5a 100644
--- a/libavcodec/aarch64/vp9itxfm_neon.S
+++ b/libavcodec/aarch64/vp9itxfm_neon.S
@@ -463,7 +463,7 @@ function idct16x16_dc_add_neon
 ret
 endfunc
 
-.macro idct16
+function idct16
 dmbutterfly0v16, v24, v16, v24, v2, v3, v4, v5, v6, v7 // v16 = 
t0a,  v24 = t1a
 dmbutterfly v20, v28, v0.h[1], v0.h[2], v2, v3, v4, v5 // v20 = 
t2a,  v28 = t3a
 dmbutterfly v18, v30, v0.h[3], v0.h[4], v2, v3, v4, v5 // v18 = 
t4a,  v30 = t7a
@@ -506,9 +506,10 @@ endfunc
 butterfly_8hv19, v28, v5,  v28   // v19 = out[3], v28 
= out[12]
 butterfly_8hv20, v27, v6,  v27   // v20 = out[4], v27 
= out[11]
 butterfly_8hv21, v26, v26, v3// v21 = out[5], v26 
= out[10]
-.endm
+ret
+endfunc
 
-.macro iadst16
+function iadst16
 ld1 {v0.8h,v1.8h}, [x11]
 
 dmbutterfly_l   v6,  v7,  v4,  v5,  v31, v16, v0.h[1], v0.h[0]   // 
v6,v7   = t1,   v4,v5   = t0
@@ -577,7 +578,8 @@ endfunc
 
 mov v16.16b, v2.16b
 mov v30.16b, v4.16b
-.endm
+ret
+endfunc
 
 // Helper macros; we can't use these expressions directly within
 // e.g. .irp due to the extra concatenation \(). Therefore wrap
@@ -610,6 +612,7 @@ function \txfm\()16_1d_8x16_pass1_neon
 cmp w3, w7
 b.le2f
 .endif
+mov x14, x30
 
 mov x9, #32
 moviv2.8h, #0
@@ -617,7 +620,7 @@ function \txfm\()16_1d_8x16_pass1_neon
 load_clear  \i,  x2,  x9
 .endr
 
-\txfm\()16
+bl  \txfm\()16
 
 // Do two 8x8 transposes. Originally, v16-v31 contain the
 // 16 rows. Afterwards, v16-v23 and v24-v31 contain the two
@@ -631,7 +634,7 @@ function \txfm\()16_1d_8x16_pass1_neon
 .irp i, 16, 24, 17, 25, 18, 26, 19, 27, 20, 28, 21, 29, 22, 30, 23, 31
 store   \i,  x0,  #16
 .endr
-ret
+br  x14
 1:
 // Special case: For the last input column (x1 == 8),
 // which would be stored as the last row in the temp buffer,
@@ -650,7 +653,7 @@ function \txfm\()16_1d_8x16_pass1_neon
 mov v29.16b, v21.16b
 mov v30.16b, v22.16b
 mov v31.16b, v23.16b
-ret
+br  x14
 
 .ifc \txfm,idct
 2:
@@ -674,6 +677,7 @@ endfunc
 // x2 = src (temp buffer)
 // x3 = slice offset
 function \txfm\()16_1d_8x16_pass2_neon
+mov x14, x30
 mov x9, #32
 .irp i, 16, 17, 18, 19, 20, 21, 22, 23
 load\i,  x2,  x9
@@ -686,7 +690,7 @@ function \txfm\()16_1d_8x16_pass2_neon
 
 add x3,  x0,  x1
 lsl x1,  x1,  #1
-\txfm\()16
+bl  \txfm\()16
 
 .macro load_add_store coef0, coef1, coef2, coef3, coef4, coef5, coef6, coef7, 
tmp1, tmp2
 srshr   \coef0, \coef0, #6
@@ -736,7 +740,7 @@ function \txfm\()16_1d_8x16_pass2_neon
 load_add_store  v24.8h, v25.8h, v26.8h, v27.8h, v28.8h, v29.8h, 
v30.8h, v31.8h, v16.8b, v17.8b
 .purgem load_add_store
 
-ret
+br  x14
 endfunc
 .endm
 
@@ -852,7 +856,7 @@ function idct32x32_dc_add_neon
 ret
 endfunc
 
-.macro idct32_odd
+function idct32_odd
 ld1 {v0.8h,v1.8h}, [x11]
 
 dmbutterfly v16, v31, v0.h[0], v0.h[1], v4, v5, v6, v7 // v16 = 
t16a, v31 = t31a
@@ -907,7 +911,8 @@ endfunc
 dmbutterfly0v26, v21, v26, v21, v2, v3, v4, v5, v6, v7 // v26 = 
t26a, v21 = t21a
 dmbutterfly0v25, v22, v25, v22, v2, v3, v4, v5, v6, v7 // v25 = 
t25,  v22 = t22
 dmbutterfly0v24, v23, v24, v23, v2, v3, v4, v5, v6, v7 // v24 = 
t24a, v23 = t23a
-.endm
+ret
+endfunc
 
 // Do an 32-point IDCT of a 8x32 slice out of a 32x32 matrix.
 // The 32-point IDCT can be decomposed into two 16-point IDCTs;
@@ -925,6 +930,7 @@ function idct32_1d_8x32_pass1_neon
 cmp w3, 

[libav-devel] [PATCH 06/11] arm: vp9itxfm: Do a simpler half/quarter idct16/idct32 when possible

2016-11-23 Thread Martin Storsjö
This work is sponsored by, and copyright, Google.

This increases the code size of libavcodec/arm/vp9itxfm_neon.o
from 12316 to 15000 bytes.

Before:
vp9_inv_dct_dct_16x16_sub1_add_neon:   271.5188.7211.7235.1
vp9_inv_dct_dct_16x16_sub4_add_neon:  2086.9   1530.7   1769.8   1286.3
vp9_inv_dct_dct_16x16_sub8_add_neon:  2461.8   1805.2   2026.4   1527.5
vp9_inv_dct_dct_16x16_sub12_add_neon: 2838.8   2193.9   2297.6   1775.9
vp9_inv_dct_dct_16x16_sub16_add_neon: 3213.9   2468.6   2554.7   2005.5
vp9_inv_dct_dct_32x32_sub1_add_neon:   752.5457.5864.4554.7
vp9_inv_dct_dct_32x32_sub4_add_neon: 10687.2   7931.0   8621.0   6826.8
vp9_inv_dct_dct_32x32_sub8_add_neon: 11802.3   9222.5   9423.6   7571.4
vp9_inv_dct_dct_32x32_sub12_add_neon:12922.5  10579.7  10244.9   8321.2
vp9_inv_dct_dct_32x32_sub16_add_neon:14039.9  11840.0  11032.5   9068.3
vp9_inv_dct_dct_32x32_sub20_add_neon:15188.7  1.9  11849.5   9822.9
vp9_inv_dct_dct_32x32_sub24_add_neon:16277.9  14917.8  12653.1  10570.2
vp9_inv_dct_dct_32x32_sub28_add_neon:17392.6  16043.6  13424.9  11329.9
vp9_inv_dct_dct_32x32_sub32_add_neon:18508.1  17207.1  14236.0  12066.9

After:
vp9_inv_dct_dct_16x16_sub1_add_neon:   271.5188.7211.7235.1
vp9_inv_dct_dct_16x16_sub4_add_neon:  1336.5   1012.5   1225.9860.7
vp9_inv_dct_dct_16x16_sub8_add_neon:  2023.2   1768.8   1868.1   1358.0
vp9_inv_dct_dct_16x16_sub12_add_neon: 2947.1   2228.9   2304.8   1795.7
vp9_inv_dct_dct_16x16_sub16_add_neon: 3247.9   2536.7   2547.0   2036.1
vp9_inv_dct_dct_32x32_sub1_add_neon:   751.5456.7863.5553.9
vp9_inv_dct_dct_32x32_sub4_add_neon:  8019.6   5868.0   6632.6   5134.4
vp9_inv_dct_dct_32x32_sub8_add_neon:  8808.1   6966.8   7198.0   5690.6
vp9_inv_dct_dct_32x32_sub12_add_neon:11291.5  10146.7   9628.8   7566.7
vp9_inv_dct_dct_32x32_sub16_add_neon:12159.2  11004.2  10373.3   8237.7
vp9_inv_dct_dct_32x32_sub20_add_neon:15230.9  13467.6  11841.1   9748.8
vp9_inv_dct_dct_32x32_sub24_add_neon:16361.5  14854.5  12677.6  10505.0
vp9_inv_dct_dct_32x32_sub28_add_neon:17497.8  15833.3  13493.0  11254.0
vp9_inv_dct_dct_32x32_sub32_add_neon:18591.8  17348.5  14355.5  12001.7
---
 libavcodec/arm/vp9itxfm_neon.S | 351 ++---
 1 file changed, 331 insertions(+), 20 deletions(-)

diff --git a/libavcodec/arm/vp9itxfm_neon.S b/libavcodec/arm/vp9itxfm_neon.S
index d10de1e..99a5e1f 100644
--- a/libavcodec/arm/vp9itxfm_neon.S
+++ b/libavcodec/arm/vp9itxfm_neon.S
@@ -74,6 +74,14 @@ endconst
 vrshrn.s32  \out2, \tmpq4, #14
 .endm
 
+@ Same as mbutterfly0 above, but treating the input in in2 as zero,
+@ writing the same output into both out1 and out2.
+.macro mbutterfly0_h out1, out2, in1, in2, tmpd1, tmpd2, tmpq3, tmpq4
+vmull.s16   \tmpq3, \in1, d0[0]
+vrshrn.s32  \out1,  \tmpq3, #14
+vmov\out2,  \out1
+.endm
+
 @ out1,out2 = ((in1 + in2) * d0[0] + (1 << 13)) >> 14
 @ out3,out4 = ((in1 - in2) * d0[0] + (1 << 13)) >> 14
 @ Same as mbutterfly0, but with input being 2 q registers, output
@@ -137,6 +145,23 @@ endconst
 vrshrn.s32  \inout2, \tmp2,  #14
 .endm
 
+@ Same as mbutterfly above, but treating the input in inout2 as zero
+.macro mbutterfly_h1 inout1, inout2, coef1, coef2, tmp1, tmp2
+vmull.s16   \tmp1,   \inout1, \coef1
+vmull.s16   \tmp2,   \inout1, \coef2
+vrshrn.s32  \inout1, \tmp1,   #14
+vrshrn.s32  \inout2, \tmp2,   #14
+.endm
+
+@ Same as mbutterfly above, but treating the input in inout1 as zero
+.macro mbutterfly_h2 inout1, inout2, coef1, coef2, tmp1, tmp2
+vmull.s16   \tmp1,   \inout2, \coef2
+vmull.s16   \tmp2,   \inout2, \coef1
+vneg.s32\tmp1,   \tmp1
+vrshrn.s32  \inout2, \tmp2,   #14
+vrshrn.s32  \inout1, \tmp1,   #14
+.endm
+
 @ inout1,inout2 = (inout1,inout2 * coef1 - inout3,inout4 * coef2 + (1 << 13)) 
>> 14
 @ inout3,inout4 = (inout1,inout2 * coef2 + inout3,inout4 * coef1 + (1 << 13)) 
>> 14
 @ inout are 4 d registers, tmp are 4 q registers
@@ -534,7 +559,7 @@ function idct16x16_dc_add_neon
 endfunc
 .ltorg
 
-function idct16
+.macro idct16_full
 mbutterfly0 d16, d24, d16, d24, d4, d6,  q2,  q3 @ d16 = t0a,  d24 
= t1a
 mbutterfly  d20, d28, d0[1], d0[2], q2,  q3  @ d20 = t2a,  d28 = 
t3a
 mbutterfly  d18, d30, d0[3], d1[0], q2,  q3  @ d18 = t4a,  d30 = 
t7a
@@ -556,7 +581,10 @@ function idct16
 mbutterfly0 d22, d26, d22, d26, d18, d30, q9,  q15  @ d22 = t6a, 
d26 = t5a
 mbutterfly  d23, d25, d0[1], d0[2], q9,  q15@ d23 = t9a,  
d25 = t14a
 mbutterfly  d27, d21, d0[1], d0[2], q9,  q15, neg=1 @ d27 = t13a, 
d21 = t10a
+idct16_end
+.endm
 
+.macro idct16_end
 butterfly   d18, d7,  d4,  d7@ d18 = t0a,  

[libav-devel] [PATCH 10/11] aarch64: vp9itxfm: Do a simpler half/quarter idct16/idct32 when possible

2016-11-23 Thread Martin Storsjö
This work is sponsored by, and copyright, Google.

This increases the code size of libavcodec/aarch64/vp9itxfm_neon.o
from 14784 to 18548 bytes.

Before:
vp9_inv_dct_dct_16x16_sub1_add_neon:   236.7
vp9_inv_dct_dct_16x16_sub4_add_neon:  1065.0
vp9_inv_dct_dct_16x16_sub8_add_neon:  1065.0
vp9_inv_dct_dct_16x16_sub12_add_neon: 1390.5
vp9_inv_dct_dct_16x16_sub16_add_neon: 1390.3
vp9_inv_dct_dct_32x32_sub1_add_neon:   556.5
vp9_inv_dct_dct_32x32_sub4_add_neon:  5203.8
vp9_inv_dct_dct_32x32_sub8_add_neon:  5199.8
vp9_inv_dct_dct_32x32_sub12_add_neon: 6172.3
vp9_inv_dct_dct_32x32_sub16_add_neon: 6176.1
vp9_inv_dct_dct_32x32_sub20_add_neon: 7144.5
vp9_inv_dct_dct_32x32_sub24_add_neon: 7143.7
vp9_inv_dct_dct_32x32_sub28_add_neon: 8114.2
vp9_inv_dct_dct_32x32_sub32_add_neon: 8112.0

After:
vp9_inv_dct_dct_16x16_sub1_add_neon:   236.7
vp9_inv_dct_dct_16x16_sub4_add_neon:   714.2
vp9_inv_dct_dct_16x16_sub8_add_neon:   926.8
vp9_inv_dct_dct_16x16_sub12_add_neon: 1402.3
vp9_inv_dct_dct_16x16_sub16_add_neon: 1405.9
vp9_inv_dct_dct_32x32_sub1_add_neon:   554.1
vp9_inv_dct_dct_32x32_sub4_add_neon:  3958.8
vp9_inv_dct_dct_32x32_sub8_add_neon:  3958.8
vp9_inv_dct_dct_32x32_sub12_add_neon: 5461.1
vp9_inv_dct_dct_32x32_sub16_add_neon: 5467.4
vp9_inv_dct_dct_32x32_sub20_add_neon: 7175.4
vp9_inv_dct_dct_32x32_sub24_add_neon: 7172.5
vp9_inv_dct_dct_32x32_sub28_add_neon: 8136.8
vp9_inv_dct_dct_32x32_sub32_add_neon: 8135.9

I.e. in general a very minor overhead for the full subpartition case due
to the additional cmps, but a significant speedup for the cases when we
only need to process a small part of the actual input data.
---
 libavcodec/aarch64/vp9itxfm_neon.S | 369 +++--
 1 file changed, 349 insertions(+), 20 deletions(-)

diff --git a/libavcodec/aarch64/vp9itxfm_neon.S 
b/libavcodec/aarch64/vp9itxfm_neon.S
index ef40a5a..d74245f 100644
--- a/libavcodec/aarch64/vp9itxfm_neon.S
+++ b/libavcodec/aarch64/vp9itxfm_neon.S
@@ -75,6 +75,16 @@ endconst
 .endif
 .endm
 
+// Same as dmbutterfly0 above, but treating the input in in2 as zero,
+// writing the same output into both out1 and out2.
+.macro dmbutterfly0_h out1, out2, in1, in2, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6
+smull   \tmp1\().4s,  \in1\().4h,  v0.h[0]
+smull2  \tmp2\().4s,  \in1\().8h,  v0.h[0]
+rshrn   \out1\().4h,  \tmp1\().4s, #14
+rshrn2  \out1\().8h,  \tmp2\().4s, #14
+mov \out2\().16b, \out1\().16b
+.endm
+
 // out1,out2 = in1 * coef1 - in2 * coef2
 // out3,out4 = in1 * coef2 + in2 * coef1
 // out are 4 x .4s registers, in are 2 x .8h registers
@@ -104,6 +114,43 @@ endconst
 rshrn2  \inout2\().8h, \tmp4\().4s,  #14
 .endm
 
+// Same as dmbutterfly above, but treating the input in inout2 as zero
+.macro dmbutterfly_h1 inout1, inout2, coef1, coef2, tmp1, tmp2, tmp3, tmp4
+smull   \tmp1\().4s, \inout1\().4h, \coef1
+smull2  \tmp2\().4s, \inout1\().8h, \coef1
+smull   \tmp3\().4s, \inout1\().4h, \coef2
+smull2  \tmp4\().4s, \inout1\().8h, \coef2
+rshrn   \inout1\().4h, \tmp1\().4s, #14
+rshrn2  \inout1\().8h, \tmp2\().4s, #14
+rshrn   \inout2\().4h, \tmp3\().4s, #14
+rshrn2  \inout2\().8h, \tmp4\().4s, #14
+.endm
+
+// Same as dmbutterfly above, but treating the input in inout1 as zero
+.macro dmbutterfly_h2 inout1, inout2, coef1, coef2, tmp1, tmp2, tmp3, tmp4
+smull   \tmp1\().4s, \inout2\().4h, \coef2
+smull2  \tmp2\().4s, \inout2\().8h, \coef2
+smull   \tmp3\().4s, \inout2\().4h, \coef1
+smull2  \tmp4\().4s, \inout2\().8h, \coef1
+neg \tmp1\().4s, \tmp1\().4s
+neg \tmp2\().4s, \tmp2\().4s
+rshrn   \inout2\().4h, \tmp3\().4s, #14
+rshrn2  \inout2\().8h, \tmp4\().4s, #14
+rshrn   \inout1\().4h, \tmp1\().4s, #14
+rshrn2  \inout1\().8h, \tmp2\().4s, #14
+.endm
+
+.macro dsmull_h out1, out2, in, coef
+smull   \out1\().4s, \in\().4h, \coef
+smull2  \out2\().4s, \in\().8h, \coef
+.endm
+
+.macro drshrn_h out, in1, in2, shift
+rshrn   \out\().4h, \in1\().4s, \shift
+rshrn2  \out\().8h, \in2\().4s, \shift
+.endm
+
+
 // out1 = in1 + in2
 // out2 = in1 - in2
 .macro butterfly_8h out1, out2, in1, in2
@@ -463,7 +510,7 @@ function idct16x16_dc_add_neon
 ret
 endfunc
 
-function idct16
+.macro idct16_full
 dmbutterfly0v16, v24, v16, v24, v2, v3, v4, v5, v6, v7 // v16 = 
t0a,  v24 = t1a
 dmbutterfly v20, v28, v0.h[1], v0.h[2], v2, v3, v4, v5 // v20 = 
t2a,  v28 = t3a
 dmbutterfly v18, v30, v0.h[3], v0.h[4], v2, v3, v4, v5 // v18 = 
t4a,  v30 = t7a

[libav-devel] [PATCH 07/11] arm: vp9itxfm: Do full, separate functions for half/quarter idct16 and idct32

2016-11-23 Thread Martin Storsjö
This work is sponsored by, and copyright, Google.

This avoids having to fill the temp buffer with zeros for the
skipped slices, and leads to slightly more straightforward code
for these cases (for the 16x16 case, where the special case pass functions
are written out instead of templated from the same macro), instead of
riddling the common code with special case branches or macro .ifs.

The code size increases from 15000 bytes to 19864 bytes.

Before:
vp9_inv_dct_dct_16x16_sub1_add_neon:   271.5188.7211.7235.1
vp9_inv_dct_dct_16x16_sub4_add_neon:  1336.5   1012.5   1225.9860.7
vp9_inv_dct_dct_16x16_sub8_add_neon:  2023.2   1768.8   1868.1   1358.0
vp9_inv_dct_dct_16x16_sub12_add_neon: 2947.1   2228.9   2304.8   1795.7
vp9_inv_dct_dct_16x16_sub16_add_neon: 3247.9   2536.7   2547.0   2036.1
vp9_inv_dct_dct_32x32_sub1_add_neon:   751.5456.7863.5553.9
vp9_inv_dct_dct_32x32_sub4_add_neon:  8019.6   5868.0   6632.6   5134.4
vp9_inv_dct_dct_32x32_sub8_add_neon:  8808.1   6966.8   7198.0   5690.6
vp9_inv_dct_dct_32x32_sub12_add_neon:11291.5  10146.7   9628.8   7566.7
vp9_inv_dct_dct_32x32_sub16_add_neon:12159.2  11004.2  10373.3   8237.7
vp9_inv_dct_dct_32x32_sub20_add_neon:15230.9  13467.6  11841.1   9748.8
vp9_inv_dct_dct_32x32_sub24_add_neon:16361.5  14854.5  12677.6  10505.0
vp9_inv_dct_dct_32x32_sub28_add_neon:17497.8  15833.3  13493.0  11254.0
vp9_inv_dct_dct_32x32_sub32_add_neon:18591.8  17348.5  14355.5  12001.7

After:
vp9_inv_dct_dct_16x16_sub1_add_neon:   271.5188.7211.7235.1
vp9_inv_dct_dct_16x16_sub4_add_neon:  1209.5863.9   1034.7764.7
vp9_inv_dct_dct_16x16_sub8_add_neon:  1915.8   1590.9   1739.0   1281.7
vp9_inv_dct_dct_16x16_sub12_add_neon: 2850.5   2204.3   2292.1   1779.8
vp9_inv_dct_dct_16x16_sub16_add_neon: 3240.1   2490.6   2555.8   2009.9
vp9_inv_dct_dct_32x32_sub1_add_neon:   751.5458.9863.5553.9
vp9_inv_dct_dct_32x32_sub4_add_neon:  7566.3   5721.3   6043.8   4920.7
vp9_inv_dct_dct_32x32_sub8_add_neon:  8366.1   6786.5   6594.1   5476.2
vp9_inv_dct_dct_32x32_sub12_add_neon:10980.0   9885.5   9237.7   7436.5
vp9_inv_dct_dct_32x32_sub16_add_neon:11917.3  11156.8   9963.0   8113.0
vp9_inv_dct_dct_32x32_sub20_add_neon:15201.3  13632.9  11844.2   9819.9
vp9_inv_dct_dct_32x32_sub24_add_neon:16333.8  14541.2  12654.5  10580.7
vp9_inv_dct_dct_32x32_sub28_add_neon:17459.1  16165.8  13450.0  11325.3
vp9_inv_dct_dct_32x32_sub32_add_neon:18612.2  17386.7  14281.6  12065.8

---
This reverts parts of the previous commit (changing some register uses to
another register); if both are to be applied, they should be applied
squashed together. (And similarly for review, it's much easier to squash the
two and review the end result.) They are presented sequentially as two steps,
to show the effect on runtime and code size of each alternative.
---
 libavcodec/arm/vp9itxfm_neon.S | 532 +
 1 file changed, 374 insertions(+), 158 deletions(-)

diff --git a/libavcodec/arm/vp9itxfm_neon.S b/libavcodec/arm/vp9itxfm_neon.S
index 99a5e1f..b6c23c8 100644
--- a/libavcodec/arm/vp9itxfm_neon.S
+++ b/libavcodec/arm/vp9itxfm_neon.S
@@ -745,6 +745,42 @@ function iadst16
 bx  lr
 endfunc
 
+.macro load_add_store coef0, coef1, coef2, coef3
+vrshr.s16   \coef0, \coef0, #6
+vrshr.s16   \coef1, \coef1, #6
+
+vld1.32 {d4[]},   [r0,:32], r1
+vld1.32 {d4[1]},  [r3,:32], r1
+vrshr.s16   \coef2, \coef2, #6
+vrshr.s16   \coef3, \coef3, #6
+vld1.32 {d5[]},   [r0,:32], r1
+vld1.32 {d5[1]},  [r3,:32], r1
+vaddw.u8\coef0, \coef0, d4
+vld1.32 {d6[]},   [r0,:32], r1
+vld1.32 {d6[1]},  [r3,:32], r1
+vaddw.u8\coef1, \coef1, d5
+vld1.32 {d7[]},   [r0,:32], r1
+vld1.32 {d7[1]},  [r3,:32], r1
+
+vqmovun.s16 d4,  \coef0
+vqmovun.s16 d5,  \coef1
+sub r0,  r0,  r1, lsl #2
+sub r3,  r3,  r1, lsl #2
+vaddw.u8\coef2, \coef2, d6
+vaddw.u8\coef3, \coef3, d7
+vst1.32 {d4[0]},  [r0,:32], r1
+vst1.32 {d4[1]},  [r3,:32], r1
+vqmovun.s16 d6,  \coef2
+vst1.32 {d5[0]},  [r0,:32], r1
+vst1.32 {d5[1]},  [r3,:32], r1
+vqmovun.s16 d7,  \coef3
+
+vst1.32 {d6[0]},  [r0,:32], r1
+vst1.32 {d6[1]},  [r3,:32], r1
+vst1.32 {d7[0]},  [r0,:32], r1
+vst1.32 {d7[1]},  [r3,:32], r1
+.endm
+
 .macro itxfm16_1d_funcs txfm
 @ Read a vertical 4x16 slice out of a 16x16 matrix, do a transform on it,
 @ transpose into a horizontal 16x4 slice and store.
@@ -763,40 +799,13 @@ function \txfm\()16_1d_4x16_pass1_neon

[libav-devel] [PATCH 11/11] aarch64: vp9itxfm: Do full, separate functions for half/quarter idct16 and idct32

2016-11-23 Thread Martin Storsjö
This work is sponsored by, and copyright, Google.

This avoids having to fill the temp buffer with zeros for the
skipped slices, and leads to slightly more straightforward code
for these cases (for the 16x16 case, where the special case pass functions
are written out instead of templated from the same macro), instead of
riddling the common code with special case branches or macro .ifs.

The code size increases from 18548 bytes to 24580 bytes.

Before:
vp9_inv_dct_dct_16x16_sub1_add_neon:   236.7
vp9_inv_dct_dct_16x16_sub4_add_neon:   714.2
vp9_inv_dct_dct_16x16_sub8_add_neon:   926.8
vp9_inv_dct_dct_16x16_sub12_add_neon: 1402.3
vp9_inv_dct_dct_16x16_sub16_add_neon: 1405.9
vp9_inv_dct_dct_32x32_sub1_add_neon:   554.1
vp9_inv_dct_dct_32x32_sub4_add_neon:  3958.8
vp9_inv_dct_dct_32x32_sub8_add_neon:  3958.8
vp9_inv_dct_dct_32x32_sub12_add_neon: 5461.1
vp9_inv_dct_dct_32x32_sub16_add_neon: 5467.4
vp9_inv_dct_dct_32x32_sub20_add_neon: 7175.4
vp9_inv_dct_dct_32x32_sub24_add_neon: 7172.5
vp9_inv_dct_dct_32x32_sub28_add_neon: 8136.8
vp9_inv_dct_dct_32x32_sub32_add_neon: 8135.9

After:
vp9_inv_dct_dct_16x16_sub1_add_neon:   236.7
vp9_inv_dct_dct_16x16_sub4_add_neon:   644.0
vp9_inv_dct_dct_16x16_sub8_add_neon:   854.0
vp9_inv_dct_dct_16x16_sub12_add_neon: 1393.8
vp9_inv_dct_dct_16x16_sub16_add_neon: 1392.6
vp9_inv_dct_dct_32x32_sub1_add_neon:   556.6
vp9_inv_dct_dct_32x32_sub4_add_neon:  3684.3
vp9_inv_dct_dct_32x32_sub8_add_neon:  3682.6
vp9_inv_dct_dct_32x32_sub12_add_neon: 5316.3
vp9_inv_dct_dct_32x32_sub16_add_neon: 5315.9
vp9_inv_dct_dct_32x32_sub20_add_neon: 7146.4
vp9_inv_dct_dct_32x32_sub24_add_neon: 7151.5
vp9_inv_dct_dct_32x32_sub28_add_neon: 8118.8
vp9_inv_dct_dct_32x32_sub32_add_neon: 8117.5

---
This reverts parts of the previous commit (changing some register uses to
another register); if both are to be applied, they should be applied
squashed together. (And similarly for review, it's much easier to squash the
two and review the end result.) They are presented sequentially as two steps,
to show the effect on runtime and code size of each alternative.
---
 libavcodec/aarch64/vp9itxfm_neon.S | 737 +++--
 1 file changed, 458 insertions(+), 279 deletions(-)

diff --git a/libavcodec/aarch64/vp9itxfm_neon.S 
b/libavcodec/aarch64/vp9itxfm_neon.S
index d74245f..78041d3 100644
--- a/libavcodec/aarch64/vp9itxfm_neon.S
+++ b/libavcodec/aarch64/vp9itxfm_neon.S
@@ -710,6 +710,51 @@ endfunc
 st1 {v2.8h},  [\src], \inc
 .endm
 
+.macro load_add_store coef0, coef1, coef2, coef3, coef4, coef5, coef6, coef7, 
tmp1, tmp2
+srshr   \coef0, \coef0, #6
+ld1 {v2.8b},  [x0], x1
+srshr   \coef1, \coef1, #6
+ld1 {v3.8b},  [x3], x1
+srshr   \coef2, \coef2, #6
+ld1 {v4.8b},  [x0], x1
+srshr   \coef3, \coef3, #6
+uaddw   \coef0, \coef0, v2.8b
+ld1 {v5.8b},  [x3], x1
+uaddw   \coef1, \coef1, v3.8b
+srshr   \coef4, \coef4, #6
+ld1 {v6.8b},  [x0], x1
+srshr   \coef5, \coef5, #6
+ld1 {v7.8b},  [x3], x1
+sqxtun  v2.8b,  \coef0
+srshr   \coef6, \coef6, #6
+sqxtun  v3.8b,  \coef1
+srshr   \coef7, \coef7, #6
+uaddw   \coef2, \coef2, v4.8b
+ld1 {\tmp1},  [x0], x1
+uaddw   \coef3, \coef3, v5.8b
+ld1 {\tmp2},  [x3], x1
+sqxtun  v4.8b,  \coef2
+sub x0,  x0,  x1, lsl #2
+sub x3,  x3,  x1, lsl #2
+sqxtun  v5.8b,  \coef3
+uaddw   \coef4, \coef4, v6.8b
+st1 {v2.8b},  [x0], x1
+uaddw   \coef5, \coef5, v7.8b
+st1 {v3.8b},  [x3], x1
+sqxtun  v6.8b,  \coef4
+st1 {v4.8b},  [x0], x1
+sqxtun  v7.8b,  \coef5
+st1 {v5.8b},  [x3], x1
+uaddw   \coef6, \coef6, \tmp1
+st1 {v6.8b},  [x0], x1
+uaddw   \coef7, \coef7, \tmp2
+st1 {v7.8b},  [x3], x1
+sqxtun  \tmp1,  \coef6
+sqxtun  \tmp2,  \coef7
+st1 {\tmp1},  [x0], x1
+st1 {\tmp2},  [x3], x1
+.endm
+
 // Read a vertical 8x16 slice out of a 16x16 matrix, do a transform on it,
 // transpose into a horizontal 16x8 slice and store.
 // x0 = dst (temp buffer)
@@ -728,37 +773,12 @@ function \txfm\()16_1d_8x16_pass1_neon
 
 mov x9, #32
 moviv2.8h, #0
-
-.ifc \txfm,idct
-cmp w3, #10
-b.le3f
-cmp w3, #38
-b.le4f

[libav-devel] [PATCH] rtmpdh: Do global initialization before running the test

2016-11-23 Thread Martin Storsjö
The rtmpdh code can use crypto libraries which may require
a process global init. (gcrypt is one of the libraries
where the rtmpdh test code can fail if global init hasn't been
done, depending on gcrypt version.)
---
 libavformat/tests/rtmpdh.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/libavformat/tests/rtmpdh.c b/libavformat/tests/rtmpdh.c
index c25ca91..be7ea5a 100644
--- a/libavformat/tests/rtmpdh.c
+++ b/libavformat/tests/rtmpdh.c
@@ -17,6 +17,7 @@
  */
 
 #include "libavformat/rtmpdh.c"
+#include "libavformat/avformat.h"
 
 #include 
 
@@ -150,6 +151,7 @@ fail:
 
 int main(void)
 {
+avformat_network_init();
 if (test_random_shared_secret() < 0)
 return 1;
 if (test_ref_data() < 0)
-- 
2.7.4

___
libav-devel mailing list
libav-devel@libav.org
https://lists.libav.org/mailman/listinfo/libav-devel


Re: [libav-devel] [PATCH] rtsp: Refactor ff_rtsp_fetch_packet

2016-11-23 Thread Martin Storsjö

On Wed, 23 Nov 2016, Luca Barbato wrote:


Move the actual data parsing and the queued stream selection
in separate functions.
---

Before I continue chopping that function in smaller bits I want your opinion.
The plan is to further reshape this quite complex loop so at least it is easier
to see what's happening.

Then I'd add move the receive buffer size as an option and possibly keep the
networking side in a separate thread to avoid overruns when the bitrate is much
larger than the current design expects.


I'm not too eager about this in particular, but some refactoring here 
probably can be good, as long as it doesn't break anything.



libavformat/rtsp.c | 106 -
libavformat/rtsp.h |   1 +
2 files changed, 66 insertions(+), 41 deletions(-)

diff --git a/libavformat/rtsp.c b/libavformat/rtsp.c
index 7e59857..15ea393 100644
--- a/libavformat/rtsp.c
+++ b/libavformat/rtsp.c
@@ -2043,11 +2043,68 @@ static int pick_stream(AVFormatContext *s, RTSPStream 
**rtsp_st,
return AVERROR(EAGAIN);
}

+static int64_t parse_packet(RTSPState *rt, AVPacket *pkt)
+{
+int ret;


I'm not sure this name is quite right; this only does the simpler version 
of parsing out more packets out of the existing buffers; further down in 
ff_rtsp_fetch_packet there's the full version which also sends RTCP RR and 
other RTCP feedback, and syncs the RTCP based timestamps for other 
streams.


Ideally I guess we should integrate both of them into the same function, 
and just do the RTCP sync and feedback if a specific flag is set to the 
function.



+
+if (rt->transport == RTSP_TRANSPORT_RDT) {
+ret = ff_rdt_parse_packet(rt->cur_transport_priv, pkt, NULL, 0);
+} else if (rt->transport == RTSP_TRANSPORT_RTP) {
+ret = ff_rtp_parse_packet(rt->cur_transport_priv, pkt, NULL, 0);
+} else if (CONFIG_RTPDEC && rt->ts) {
+ret = ff_mpegts_parse_packet(rt->ts, pkt,
+ rt->recvbuf + rt->recvbuf_pos,
+ rt->recvbuf_len - rt->recvbuf_pos);
+if (ret >= 0) {
+rt->recvbuf_pos += ret;
+ret = rt->recvbuf_pos < rt->recvbuf_len;
+}
+} else
+ret = -1;
+
+if (ret == 0) {
+rt->cur_transport_priv = NULL;
+return 0;
+} else if (ret == 1)
+return 0;
+
+return ret;
+}
+
+static int select_queue_stream(AVFormatContext *s)
+{
+RTSPState *rt = s->priv_data;
+int i;
+int64_t wait_end;
+int64_t first_queue_time = 0;
+
+for (i = 0; i < rt->nb_rtsp_streams; i++) {
+RTPDemuxContext *rtpctx = rt->rtsp_streams[i]->transport_priv;
+int64_t queue_time;
+if (!rtpctx)
+continue;
+queue_time = ff_rtp_queued_packet_time(rtpctx);
+if (queue_time && (queue_time - first_queue_time < 0 ||
+   !first_queue_time)) {
+first_queue_time   = queue_time;
+rt->first_queue_st = rt->rtsp_streams[i];
+}
+}
+if (first_queue_time) {
+wait_end = first_queue_time + s->max_delay;
+} else {
+wait_end = 0;
+rt->first_queue_st = NULL;
+}


I'd prefer this to pass first_queue_st as a pointer to this function, 
keeping it as a local variable in the caller for now.


// Martin
___
libav-devel mailing list
libav-devel@libav.org
https://lists.libav.org/mailman/listinfo/libav-devel


[libav-devel] [PATCHv2] aarch64: vp9itxfm: Don't repeatedly set x9 when nothing overwrites it

2016-11-23 Thread Martin Storsjö
---
 libavcodec/aarch64/vp9itxfm_neon.S | 26 +++---
 1 file changed, 15 insertions(+), 11 deletions(-)

diff --git a/libavcodec/aarch64/vp9itxfm_neon.S 
b/libavcodec/aarch64/vp9itxfm_neon.S
index 2dc6b75..f4194a6 100644
--- a/libavcodec/aarch64/vp9itxfm_neon.S
+++ b/libavcodec/aarch64/vp9itxfm_neon.S
@@ -599,9 +599,9 @@ endfunc
 // x1 = unused
 // x2 = src
 // x3 = slice offset
+// x9 = input stride
 .macro itxfm16_1d_funcs txfm
 function \txfm\()16_1d_8x16_pass1_neon
-mov x9, #32
 moviv2.8h, #0
 .irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
 load_clear  \i,  x2,  x9
@@ -649,8 +649,8 @@ endfunc
 // x1 = dst stride
 // x2 = src (temp buffer)
 // x3 = slice offset
+// x9 = temp buffer stride
 function \txfm\()16_1d_8x16_pass2_neon
-mov x9, #32
 .irp i, 16, 17, 18, 19, 20, 21, 22, 23
 load\i,  x2,  x9
 .endr
@@ -747,6 +747,7 @@ function ff_vp9_\txfm1\()_\txfm2\()_16x16_add_neon, export=1
 .ifc \txfm1,idct
 ld1 {v0.8h,v1.8h}, [x10]
 .endif
+mov x9, #32
 
 .irp i, 0, 8
 add x0,  sp,  #(\i*32)
@@ -882,13 +883,12 @@ endfunc
 // x0 = dst (temp buffer)
 // x1 = unused
 // x2 = src
+// x9 = double input stride
 // x10 = idct_coeffs
 // x11 = idct_coeffs + 32
 function idct32_1d_8x32_pass1_neon
 ld1 {v0.8h,v1.8h}, [x10]
 
-// Double stride of the input, since we only read every other line
-mov x9,  #128
 moviv4.8h, #0
 
 // v16 = IN(0), v17 = IN(2) ... v31 = IN(30)
@@ -987,12 +987,13 @@ endfunc
 // x0 = dst
 // x1 = dst stride
 // x2 = src (temp buffer)
+// x7 = negative double temp buffer stride
+// x9 = double temp buffer stride
 // x10 = idct_coeffs
 // x11 = idct_coeffs + 32
 function idct32_1d_8x32_pass2_neon
 ld1 {v0.8h,v1.8h}, [x10]
 
-mov x9, #128
 // v16 = IN(0), v17 = IN(2) ... v31 = IN(30)
 .irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
 ld1 {v\i\().8h}, [x2], x9
@@ -1001,7 +1002,6 @@ function idct32_1d_8x32_pass2_neon
 
 idct16
 
-mov x9,  #128
 .irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
 st1 {v\i\().8h}, [x2], x9
 .endr
@@ -1018,11 +1018,10 @@ function idct32_1d_8x32_pass2_neon
 
 idct32_odd
 
-mov x9,  #128
 .macro load_acc_store a, b, c, d, neg=0
+.if \neg == 0
 ld1 {v4.8h},  [x2], x9
 ld1 {v5.8h},  [x2], x9
-.if \neg == 0
 add v4.8h, v4.8h, v\a\().8h
 ld1 {v6.8h},  [x2], x9
 add v5.8h, v5.8h, v\b\().8h
@@ -1030,10 +1029,12 @@ function idct32_1d_8x32_pass2_neon
 add v6.8h, v6.8h, v\c\().8h
 add v7.8h, v7.8h, v\d\().8h
 .else
+ld1 {v4.8h},  [x2], x7
+ld1 {v5.8h},  [x2], x7
 sub v4.8h, v4.8h, v\a\().8h
-ld1 {v6.8h},  [x2], x9
+ld1 {v6.8h},  [x2], x7
 sub v5.8h, v5.8h, v\b\().8h
-ld1 {v7.8h},  [x2], x9
+ld1 {v7.8h},  [x2], x7
 sub v6.8h, v6.8h, v\c\().8h
 sub v7.8h, v7.8h, v\d\().8h
 .endif
@@ -1064,7 +1065,6 @@ function idct32_1d_8x32_pass2_neon
 load_acc_store  23, 22, 21, 20
 load_acc_store  19, 18, 17, 16
 sub x2,  x2,  x9
-neg x9,  x9
 load_acc_store  16, 17, 18, 19, 1
 load_acc_store  20, 21, 22, 23, 1
 load_acc_store  24, 25, 26, 27, 1
@@ -1093,6 +1093,10 @@ function ff_vp9_idct_idct_32x32_add_neon, export=1
 mov x5,  x1
 mov x6,  x2
 
+// Double stride of the input, since we only read every other line
+mov x9,  #128
+neg x7,  x9
+
 .irp i, 0, 8, 16, 24
 add x0,  sp,  #(\i*64)
 add x2,  x6,  #(\i*2)
-- 
2.7.4

___
libav-devel mailing list
libav-devel@libav.org
https://lists.libav.org/mailman/listinfo/libav-devel


Re: [libav-devel] [PATCH] rtmpdh: Do global initialization before running the test

2016-11-24 Thread Martin Storsjö

On Thu, 24 Nov 2016, Diego Biurrun wrote:


On Wed, Nov 23, 2016 at 11:35:08PM +0200, Martin Storsjö wrote:

The rtmpdh code can use crypto libraries which may require
a process global init. (gcrypt is one of the libraries
where the rtmpdh test code can fail if global init hasn't been
done, depending on gcrypt version.)
---
 libavformat/tests/rtmpdh.c | 2 ++
 1 file changed, 2 insertions(+)


As you predicted, this does not fail my particular test case.


does not fix, I presume

To fix that, you could just as an experiment try this:

diff --git a/libavformat/tls_gnutls.c b/libavformat/tls_gnutls.c
index 3e29a45..abc9e69 100644
--- a/libavformat/tls_gnutls.c
+++ b/libavformat/tls_gnutls.c
@@ -35,7 +35,8 @@
 #include "libavutil/opt.h"
 #include "libavutil/parseutils.h"

-#if HAVE_THREADS && GNUTLS_VERSION_NUMBER <= 0x020b00
+#if CONFIG_GCRYPT
 #include 
 #include "libavutil/thread.h"
 GCRY_THREAD_OPTION_PTHREAD_IMPL;
@@ -52,10 +53,10 @@ typedef struct TLSContext {
 void ff_gnutls_init(void)
 {
 avpriv_lock_avformat();
-#if HAVE_THREADS && GNUTLS_VERSION_NUMBER < 0x020b00
+#if CONFIG_GCRYPT
 if (gcry_control(GCRYCTL_ANY_INITIALIZATION_P) == 0)
 gcry_control(GCRYCTL_SET_THREAD_CBS, &gcry_threads_pthread);
#endif
 gnutls_global_init();
 avpriv_unlock_avformat();
 }


But I'm quite ok with dropping gcrypt support altogether.



--- a/libavformat/tests/rtmpdh.c
+++ b/libavformat/tests/rtmpdh.c
@@ -17,6 +17,7 @@

 #include "libavformat/rtmpdh.c"
+#include "libavformat/avformat.h"


nit: order


Fixed


@@ -150,6 +151,7 @@ fail:

 int main(void)
 {
+avformat_network_init();
 if (test_random_shared_secret() < 0)
 return 1;
 if (test_ref_data() < 0)


LGTM


Pushed to master


Please push to the 12 branch as well.


Will try to remember to do that later, I'm short on time right now. Or 
feel free to do that for me if you beat me to it.


// Martin
___
libav-devel mailing list
libav-devel@libav.org
https://lists.libav.org/mailman/listinfo/libav-devel

Re: [libav-devel] [PATCH] rtmpdh: Do global initialization before running the test

2016-11-24 Thread Martin Storsjö

On Thu, 24 Nov 2016, Diego Biurrun wrote:


To fix that, you could just as an experiment try this:

diff --git a/libavformat/tls_gnutls.c b/libavformat/tls_gnutls.c
index 3e29a45..abc9e69 100644
--- a/libavformat/tls_gnutls.c
+++ b/libavformat/tls_gnutls.c
@@ -35,7 +35,8 @@
 #include "libavutil/opt.h"
 #include "libavutil/parseutils.h"

-#if HAVE_THREADS && GNUTLS_VERSION_NUMBER <= 0x020b00
+#if CONFIG_GCRYPT
 #include 
 #include "libavutil/thread.h"
 GCRY_THREAD_OPTION_PTHREAD_IMPL;
@@ -52,10 +53,10 @@ typedef struct TLSContext {
 void ff_gnutls_init(void)
 {
 avpriv_lock_avformat();
-#if HAVE_THREADS && GNUTLS_VERSION_NUMBER < 0x020b00
+#if CONFIG_GCRYPT
 if (gcry_control(GCRYCTL_ANY_INITIALIZATION_P) == 0)
 gcry_control(GCRYCTL_SET_THREAD_CBS, &gcry_threads_pthread);
#endif
 gnutls_global_init();
 avpriv_unlock_avformat();
 }


That does the trick, please push :)


No, since this fix is not appropriate.

It's not quite straightforward though.

Notice how we don't have a separate --enable-gcrypt or --enable-gmp; this 
is kinda intentional. You're not supposed to use gcrypt because you want 
to, only if this happens to be the backend that your gnutls backend uses.


Now the existing code is the correct (AFAIK) way of initializing gnutls 
for use in a threadsafe way; in older versions when gcrypt was the only 
available backend, the caller had to do it manually. In preparation for 
making gnutls use a different backend, this was folded into 
gnutls_global_init, and in newer gnutls versions, nettle/hogweed/gmp is 
the only backend. (And they don't need the global init.)


If we'd consider gcrypt and gmp features that can be enabled/disabled 
separately from gnutls, we'd need to have some sort of init like this 
somewhere, but tls_gnutls.c surely isn't the right spot for it in that 
case.


Prior to 57cde2b180fcec0eaf60aad65f436ab6420546f5, we had this init in 
network.c, where it was done if gnutls and gcrypt were enabled. But after 
that commit, this init actually only gets done if the tls protocol is 
enabled. This was pointed out in review IIRC, but the cleanup achieved is 
worth more than the fact that a build with --enable-gnutls 
--disable-protocol=tls won't init gnutls/gcrypt correctly for the rtmpdh 
code.


To fix this correctly, we should have a separate, similar init for the 
crypto backends used by the rtmpdh code, not related to the tls protocols 
(called from within the same avformat_network_init). So far, this have 
felt like too much of a corner case to need to be cared about though.


If we drop gcrypt, the issue goes away even more, since gmp doesn't need 
any global init. The libcrypt half of OpenSSL does need some sort of init 
in order to be threadsafe (but it doesn't fail if not inited, like some 
versions of gcrypt). So in such a setup (--enable-openssl 
--disable-protocol=tls), the rtmpdh code won't be threadsafe.


// Martin
___
libav-devel mailing list
libav-devel@libav.org
https://lists.libav.org/mailman/listinfo/libav-devel


Re: [libav-devel] [PATCH 05/15] build: Drop gcrypt support

2016-11-24 Thread Martin Storsjö

On Thu, 24 Nov 2016, Diego Biurrun wrote:


GnuTLS in combination with gcrypt has been deprecated since 2010.
---

This should be done after Martin pushes his patch to fix gcrypt support.


No, there's nothing more to fix wrt gcrypt right now, feel free to proceed 
with this.


// Martin
___
libav-devel mailing list
libav-devel@libav.org
https://lists.libav.org/mailman/listinfo/libav-devel


[libav-devel] [PATCH 3/3] aarch64: vp9itxfm: Skip empty slices in the first pass of idct_idct 16x16 and 32x32

2016-11-28 Thread Martin Storsjö
This work is sponsored by, and copyright, Google.

Previously all subpartitions except the eob=1 (DC) case ran with
the same runtime:

vp9_inv_dct_dct_16x16_sub16_add_neon:   1373.2
vp9_inv_dct_dct_32x32_sub32_add_neon:   8089.0

By skipping individual 8x16 or 8x32 pixel slices in the first pass,
we reduce the runtime of these functions like this:

vp9_inv_dct_dct_16x16_sub1_add_neon: 235.3
vp9_inv_dct_dct_16x16_sub2_add_neon:1043.7
vp9_inv_dct_dct_16x16_sub4_add_neon:1045.3
vp9_inv_dct_dct_16x16_sub8_add_neon:1043.7
vp9_inv_dct_dct_16x16_sub12_add_neon:   1374.0
vp9_inv_dct_dct_16x16_sub16_add_neon:   1368.7
vp9_inv_dct_dct_32x32_sub1_add_neon: 555.6
vp9_inv_dct_dct_32x32_sub2_add_neon:5180.0
vp9_inv_dct_dct_32x32_sub4_add_neon:5175.1
vp9_inv_dct_dct_32x32_sub8_add_neon:5186.6
vp9_inv_dct_dct_32x32_sub12_add_neon:   6159.5
vp9_inv_dct_dct_32x32_sub16_add_neon:   6162.7
vp9_inv_dct_dct_32x32_sub20_add_neon:   7129.0
vp9_inv_dct_dct_32x32_sub24_add_neon:   7133.1
vp9_inv_dct_dct_32x32_sub28_add_neon:   8107.1
vp9_inv_dct_dct_32x32_sub32_add_neon:   8105.6

I.e. in general a very minor overhead for the full subpartition case due
to the additional cmps, but a significant speedup for the cases when we
only need to process a small part of the actual input data.
---
Updated based on Janne's review of the arm version.
---
 libavcodec/aarch64/vp9itxfm_neon.S | 60 ++
 1 file changed, 55 insertions(+), 5 deletions(-)

diff --git a/libavcodec/aarch64/vp9itxfm_neon.S 
b/libavcodec/aarch64/vp9itxfm_neon.S
index f4194a6..9d2ba11 100644
--- a/libavcodec/aarch64/vp9itxfm_neon.S
+++ b/libavcodec/aarch64/vp9itxfm_neon.S
@@ -588,6 +588,9 @@ endfunc
 .macro store i, dst, inc
 st1 {v\i\().8h},  [\dst], \inc
 .endm
+.macro movi_v i, size, imm
+moviv\i\()\size,  \imm
+.endm
 .macro load_clear i, src, inc
 ld1 {v\i\().8h}, [\src]
 st1 {v2.8h},  [\src], \inc
@@ -596,9 +599,8 @@ endfunc
 // Read a vertical 8x16 slice out of a 16x16 matrix, do a transform on it,
 // transpose into a horizontal 16x8 slice and store.
 // x0 = dst (temp buffer)
-// x1 = unused
+// x1 = slice offset
 // x2 = src
-// x3 = slice offset
 // x9 = input stride
 .macro itxfm16_1d_funcs txfm
 function \txfm\()16_1d_8x16_pass1_neon
@@ -616,14 +618,14 @@ function \txfm\()16_1d_8x16_pass1_neon
 transpose_8x8H  v24, v25, v26, v27, v28, v29, v30, v31, v2, v3
 
 // Store the transposed 8x8 blocks horizontally.
-cmp x3,  #8
+cmp x1,  #8
 b.eq1f
 .irp i, 16, 24, 17, 25, 18, 26, 19, 27, 20, 28, 21, 29, 22, 30, 23, 31
 store   \i,  x0,  #16
 .endr
 ret
 1:
-// Special case: For the last input column (x3 == 8),
+// Special case: For the last input column (x1 == 8),
 // which would be stored as the last row in the temp buffer,
 // don't store the first 8x8 block, but keep it in registers
 // for the first slice of the second pass (where it is the
@@ -751,13 +753,35 @@ function ff_vp9_\txfm1\()_\txfm2\()_16x16_add_neon, 
export=1
 
 .irp i, 0, 8
 add x0,  sp,  #(\i*32)
+.ifc \txfm1\()_\txfm2,idct_idct
+.if \i == 8
+cmp w3,  #38
+b.le1f
+.endif
+.endif
+mov x1,  #\i
 add x2,  x6,  #(\i*2)
-mov x3,  #\i
 bl  \txfm1\()16_1d_8x16_pass1_neon
 .endr
 .ifc \txfm1\()_\txfm2,iadst_idct
 ld1 {v0.8h,v1.8h}, [x10]
 .endif
+
+.ifc \txfm1\()_\txfm2,idct_idct
+b   3f
+1:
+// Set v24-v31 to zero, for the in-register passthrough of
+// coefficients to pass 2. Since we only do two slices, this can
+// only ever happen for the second slice. So we only need to store
+// zeros to the temp buffer for the second half of the buffer.
+.irp i, 24, 25, 26, 27, 28, 29, 30, 31
+add x0,  x0,  #16
+movi_v  \i,  .16b, #0
+store   24,  x0,  #16
+.endr
+3:
+.endif
+
 .irp i, 0, 8
 add x0,  x4,  #(\i)
 mov x1,  x5
@@ -1073,12 +1097,17 @@ function idct32_1d_8x32_pass2_neon
 ret
 endfunc
 
+const min_eob_idct_idct_32, align=4
+.short  0, 34, 135, 336
+endconst
+
 function ff_vp9_idct_idct_32x32_add_neon, export=1
 cmp w3,  #1
 b.eqidct32x32_dc_add_neon
 
 movrel  x10, idct_coeffs
 add x11, x10, #32
+movrel  x12, min_eob_idct_idct_32 + 2
 
 mov x15, x30
 
@@ -1099,9 +1128,30 @@ function ff_vp9_idct_idct_32x32_add_neon, export=1
 
 .irp i, 0, 8, 16, 24
 add x0,  sp,  #(\i*64)
+.if \i > 0
+ldrhw1,  [x12], #2
+cmp w3,  w1
+mov x

[libav-devel] [PATCH 1/3] arm: vp9itxfm: Only reload the idct coeffs for the iadst_idct combination

2016-11-28 Thread Martin Storsjö
This avoids reloading them if they haven't been clobbered, if the
first pass also was idct.

This is similar to what was done in the aarch64 version.
---
 libavcodec/arm/vp9itxfm_neon.S | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/libavcodec/arm/vp9itxfm_neon.S b/libavcodec/arm/vp9itxfm_neon.S
index 01944bd..2049241 100644
--- a/libavcodec/arm/vp9itxfm_neon.S
+++ b/libavcodec/arm/vp9itxfm_neon.S
@@ -814,7 +814,7 @@ A   and r7,  sp,  #15
 mov r3,  #\i
 bl  \txfm1\()16_1d_4x16_pass1_neon
 .endr
-.ifc \txfm2,idct
+.ifc \txfm1\()_\txfm2,iadst_idct
 movrel  r12, idct_coeffs
 vld1.16 {q0-q1}, [r12,:128]
 .endif
-- 
2.7.4

___
libav-devel mailing list
libav-devel@libav.org
https://lists.libav.org/mailman/listinfo/libav-devel


[libav-devel] [PATCH 2/3] arm: vp9itxfm: Skip empty slices in the first pass of idct_idct 16x16 and 32x32

2016-11-28 Thread Martin Storsjö
This work is sponsored by, and copyright, Google.

Previously all subpartitions except the eob=1 (DC) case ran with
the same runtime:

vp9_inv_dct_dct_16x16_sub16_add_neon:   3188.1   2435.4   2499.0   1969.0
vp9_inv_dct_dct_32x32_sub32_add_neon:  18531.7  16582.3  14207.6  12000.3

By skipping individual 4x16 or 4x32 pixel slices in the first pass,
we reduce the runtime of these functions like this:

vp9_inv_dct_dct_16x16_sub1_add_neon: 274.6189.5211.7235.8
vp9_inv_dct_dct_16x16_sub2_add_neon:2064.0   1534.8   1719.4   1248.7
vp9_inv_dct_dct_16x16_sub4_add_neon:2135.0   1477.2   1736.3   1249.5
vp9_inv_dct_dct_16x16_sub8_add_neon:2446.7   1828.7   1993.6   1494.7
vp9_inv_dct_dct_16x16_sub12_add_neon:   2832.4   2118.3   2266.5   1735.1
vp9_inv_dct_dct_16x16_sub16_add_neon:   3211.7   2475.3   2523.5   1983.1
vp9_inv_dct_dct_32x32_sub1_add_neon: 756.2456.7862.0553.9
vp9_inv_dct_dct_32x32_sub2_add_neon:   10682.2   8190.4   8539.2   6762.5
vp9_inv_dct_dct_32x32_sub4_add_neon:   10813.5   8014.9   8518.3   6762.8
vp9_inv_dct_dct_32x32_sub8_add_neon:   11859.6   9313.0   9347.4   7514.5
vp9_inv_dct_dct_32x32_sub12_add_neon:  12946.6  10752.4  10192.2   8280.2
vp9_inv_dct_dct_32x32_sub16_add_neon:  14074.6  11946.5  11001.4   9008.6
vp9_inv_dct_dct_32x32_sub20_add_neon:  15269.9  13662.7  11816.1   9762.6
vp9_inv_dct_dct_32x32_sub24_add_neon:  16327.9  14940.1  12626.7  10516.0
vp9_inv_dct_dct_32x32_sub28_add_neon:  17462.7  15776.1  13446.2  11264.7
vp9_inv_dct_dct_32x32_sub32_add_neon:  18575.5  17157.0  14249.3  12015.1

I.e. in general a very minor overhead for the full subpartition case due
to the additional loads and cmps, but a significant speedup for the cases
when we only need to process a small part of the actual input data.

In common VP9 content in a few inspected clips, 70-90% of the non-dc-only
16x16 and 32x32 IDCTs only have nonzero coefficients in the upper left
8x8 or 16x16 subpartitions respectively.
---
Updated with Janne's suggestions. The weird speedup for
vp9_inv_dct_dct_16x16_sub16_add_neon on the Cortex A8 in the previous
iteration of the patch seems to be mostly within noise for that test; it
does still appear occasionally when testing.
---
 libavcodec/arm/vp9itxfm_neon.S | 75 +-
 tests/checkasm/vp9dsp.c|  6 ++--
 2 files changed, 70 insertions(+), 11 deletions(-)

diff --git a/libavcodec/arm/vp9itxfm_neon.S b/libavcodec/arm/vp9itxfm_neon.S
index 2049241..5abe435 100644
--- a/libavcodec/arm/vp9itxfm_neon.S
+++ b/libavcodec/arm/vp9itxfm_neon.S
@@ -659,9 +659,8 @@ endfunc
 @ Read a vertical 4x16 slice out of a 16x16 matrix, do a transform on it,
 @ transpose into a horizontal 16x4 slice and store.
 @ r0 = dst (temp buffer)
-@ r1 = unused
+@ r1 = slice offset
 @ r2 = src
-@ r3 = slice offset
 function \txfm\()16_1d_4x16_pass1_neon
 mov r12, #32
 vmov.s16q2, #0
@@ -678,14 +677,14 @@ function \txfm\()16_1d_4x16_pass1_neon
 transpose16_q_4x_4x4 q8,  q9,  q10, q11, q12, q13, q14, q15, d16, d17, 
d18, d19, d20, d21, d22, d23, d24, d25, d26, d27, d28, d29, d30, d31
 
 @ Store the transposed 4x4 blocks horizontally.
-cmp r3,  #12
+cmp r1,  #12
 beq 1f
 .irp i, 16, 20, 24, 28, 17, 21, 25, 29, 18, 22, 26, 30, 19, 23, 27, 31
 vst1.16 {d\i}, [r0,:64]!
 .endr
 bx  lr
 1:
-@ Special case: For the last input column (r3 == 12),
+@ Special case: For the last input column (r1 == 12),
 @ which would be stored as the last row in the temp buffer,
 @ don't store the first 4x4 block, but keep it in registers
 @ for the first slice of the second pass (where it is the
@@ -781,15 +780,22 @@ endfunc
 itxfm16_1d_funcs idct
 itxfm16_1d_funcs iadst
 
+@ This is the minimum eob value for each subpartition, in increments of 4
+const min_eob_idct_idct_16, align=4
+.short  0, 10, 38, 89
+endconst
+
 .macro itxfm_func16x16 txfm1, txfm2
 function ff_vp9_\txfm1\()_\txfm2\()_16x16_add_neon, export=1
 .ifc \txfm1\()_\txfm2,idct_idct
 cmp r3,  #1
 beq idct16x16_dc_add_neon
 .endif
-push{r4-r7,lr}
+push{r4-r8,lr}
 .ifnc \txfm1\()_\txfm2,idct_idct
 vpush   {q4-q7}
+.else
+movrel  r8,  min_eob_idct_idct_16 + 2
 .endif
 
 @ Align the stack, allocate a temp buffer
@@ -810,10 +816,36 @@ A   and r7,  sp,  #15
 
 .irp i, 0, 4, 8, 12
 add r0,  sp,  #(\i*32)
+.ifc \txfm1\()_\txfm2,idct_idct
+.if \i > 0
+ldrh_post   r1,  r8,  #2
+cmp r3,  r1
+it  le
+movle   r1,  #(16 - \i)/4
+ble 1f
+.endif
+.endif
+mov r1,  #\i
 add r2,  r6,  #(\i*2)
-mov r3,  #\i
   

[libav-devel] [PATCH] vp9dsp: add DC only versions for idct/idct.

2016-11-29 Thread Martin Storsjö
From: Clément Bœsch 

before:

time ./avconv -v 0 -nostats -threads 1 -i sintel_vp9_500kbps.webm -f null -
real0m11.125s
user0m11.059s
sys 0m0.050s

time ./avconv -v 0 -nostats -threads 1 -i sintel_vp9_500kbps.webm -f null -
real0m10.944s
user0m10.819s
sys 0m0.064s

after:

time ./avconv -v 0 -nostats -threads 1 -i sintel_vp9_500kbps.webm -f null -
real0m8.153s
user0m8.034s
sys 0m0.050s

time ./avconv -v 0 -nostats -threads 1 -i sintel_vp9_500kbps.webm -f null -
real0m8.038s
user0m7.980s
sys 0m0.039s
---
 libavcodec/vp9dsp.c | 32 
 1 file changed, 24 insertions(+), 8 deletions(-)

diff --git a/libavcodec/vp9dsp.c b/libavcodec/vp9dsp.c
index 73006fa..ead2f88 100644
--- a/libavcodec/vp9dsp.c
+++ b/libavcodec/vp9dsp.c
@@ -944,7 +944,7 @@ static av_cold void vp9dsp_intrapred_init(VP9DSPContext 
*dsp)
 #undef init_intra_pred
 }
 
-#define itxfm_wrapper(type_a, type_b, sz, bits) \
+#define itxfm_wrapper(type_a, type_b, sz, bits, has_dconly) \
 static void \
 type_a ## _ ## type_b ## _ ## sz ## x ## sz ## _add_c(uint8_t *dst, \
   ptrdiff_t stride, \
@@ -953,6 +953,22 @@ type_a ## _ ## type_b ## _ ## sz ## x ## sz ## 
_add_c(uint8_t *dst, \
 {   \
 int i, j;   \
 int16_t tmp[sz * sz], out[sz];  \
+\
+if (has_dconly && eob == 1) {   \
+const int t  = (((block[0] * 11585 + (1 << 13)) >> 14)  \
+   * 11585 + (1 << 13)) >> 14;  \
+block[0] = 0;   \
+for (i = 0; i < sz; i++) {  \
+for (j = 0; j < sz; j++)\
+dst[j * stride] = av_clip_uint8(dst[j * stride] +   \
+(bits ? \
+ (t + (1 << (bits - 1))) >> 
bits : \
+ t));   \
+dst++;  \
+}   \
+return; \
+}   \
+\
 for (i = 0; i < sz; i++)\
 type_a ## sz ## _1d(tmp + i * sz, block + i, sz, 0);\
 memset(block, 0, sz * sz * sizeof(*block)); \
@@ -967,11 +983,11 @@ type_a ## _ ## type_b ## _ ## sz ## x ## sz ## 
_add_c(uint8_t *dst, \
 }   \
 }
 
-#define itxfm_wrap(sz, bits) \
-itxfm_wrapper(idct, idct, sz, bits)  \
-itxfm_wrapper(iadst, idct, sz, bits) \
-itxfm_wrapper(idct, iadst, sz, bits) \
-itxfm_wrapper(iadst, iadst, sz, bits)
+#define itxfm_wrap(sz, bits) \
+itxfm_wrapper(idct,  idct,  sz, bits, 1) \
+itxfm_wrapper(iadst, idct,  sz, bits, 0) \
+itxfm_wrapper(idct,  iadst, sz, bits, 0) \
+itxfm_wrapper(iadst, iadst, sz, bits, 0)
 
 #define IN(x) in[x * stride]
 
@@ -1490,7 +1506,7 @@ static av_always_inline void idct32_1d(int16_t *out, 
const int16_t *in,
 out[31] = t0   - t31;
 }
 
-itxfm_wrapper(idct, idct, 32, 6)
+itxfm_wrapper(idct, idct, 32, 6, 1)
 
 static av_always_inline void iwht4_1d(int16_t *out, const int16_t *in,
   ptrdiff_t stride, int pass)
@@ -1523,7 +1539,7 @@ static av_always_inline void iwht4_1d(int16_t *out, const 
int16_t *in,
 out[3] = t3;
 }
 
-itxfm_wrapper(iwht, iwht, 4, 0)
+itxfm_wrapper(iwht, iwht, 4, 0, 0)
 
 #undef IN
 #undef itxfm_wrapper
-- 
2.9.3 (Apple Git-75)

___
libav-devel mailing list
libav-devel@libav.org
https://lists.libav.org/mailman/listinfo/libav-devel

Re: [libav-devel] [PATCH] vp9dsp: add DC only versions for idct/idct.

2016-11-29 Thread Martin Storsjö

On Tue, 29 Nov 2016, Diego Biurrun wrote:


idct/idct?


The vp9 inverse transforms can be different combos for the vertical and 
horizontal passes; it can be iwht_iwht, idct_idct, idct_iadst, iadst_idct, 
iadst_iadst. This optimization is only valid for the idct_idct case.





--- a/libavcodec/vp9dsp.c
+++ b/libavcodec/vp9dsp.c
@@ -953,6 +953,22 @@ type_a ## _ ## type_b ## _ ## sz ## x ## sz ## 
_add_c(uint8_t *dst, \
+if (has_dconly && eob == 1) {   \
+const int t  = (((block[0] * 11585 + (1 << 13)) >> 14)  \
+   * 11585 + (1 << 13)) >> 14;  \
+block[0] = 0;   \
+for (i = 0; i < sz; i++) {  \
+for (j = 0; j < sz; j++)\
+dst[j * stride] = av_clip_uint8(dst[j * stride] +   \
+(bits ? \
+ (t + (1 << (bits - 1))) >> 
bits : \
+ t));   \


FTLIW:

  dst[j * stride] =   \
   av_clip_uint8(dst[j * stride] +\
 (bits ? (t + (1 << (bits - 1))) >> bits  \
   : t)); \


FTLIW?

This looks more readable, sure.

// Martin
___
libav-devel mailing list
libav-devel@libav.org
https://lists.libav.org/mailman/listinfo/libav-devel


[libav-devel] [PATCH 4/5] aarch64: vp9itxfm: Restructure the idct32 store macros

2016-12-01 Thread Martin Storsjö
This avoids concatenation, which can't be used if the whole macro
is wrapped within another macro.
---
 libavcodec/aarch64/vp9itxfm_neon.S | 80 +++---
 1 file changed, 40 insertions(+), 40 deletions(-)

diff --git a/libavcodec/aarch64/vp9itxfm_neon.S 
b/libavcodec/aarch64/vp9itxfm_neon.S
index 5a080a4..be9643e 100644
--- a/libavcodec/aarch64/vp9itxfm_neon.S
+++ b/libavcodec/aarch64/vp9itxfm_neon.S
@@ -942,23 +942,23 @@ function idct32_1d_8x32_pass1_neon
 .macro store_rev a, b
 // There's no rev128 instruction, but we reverse each 64 bit
 // half, and then flip them using an ext with 8 bytes offset.
-rev64   v1.8h, v\b\().8h
-st1 {v\a\().8h},  [x0], #16
-rev64   v0.8h, v\a\().8h
+rev64   v1.8h, \b
+st1 {\a},  [x0], #16
+rev64   v0.8h, \a
 ext v1.16b, v1.16b, v1.16b, #8
-st1 {v\b\().8h},  [x0], #16
+st1 {\b},  [x0], #16
 ext v0.16b, v0.16b, v0.16b, #8
 st1 {v1.8h},  [x0], #16
 st1 {v0.8h},  [x0], #16
 .endm
-store_rev   16, 24
-store_rev   17, 25
-store_rev   18, 26
-store_rev   19, 27
-store_rev   20, 28
-store_rev   21, 29
-store_rev   22, 30
-store_rev   23, 31
+store_rev   v16.8h, v24.8h
+store_rev   v17.8h, v25.8h
+store_rev   v18.8h, v26.8h
+store_rev   v19.8h, v27.8h
+store_rev   v20.8h, v28.8h
+store_rev   v21.8h, v29.8h
+store_rev   v22.8h, v30.8h
+store_rev   v23.8h, v31.8h
 sub x0,  x0,  #512
 .purgem store_rev
 
@@ -984,14 +984,14 @@ function idct32_1d_8x32_pass1_neon
 // subtracted from the output.
 .macro store_rev a, b
 ld1 {v4.8h},  [x0]
-rev64   v1.8h, v\b\().8h
-add v4.8h, v4.8h, v\a\().8h
-rev64   v0.8h, v\a\().8h
+rev64   v1.8h, \b
+add v4.8h, v4.8h, \a
+rev64   v0.8h, \a
 st1 {v4.8h},  [x0], #16
 ext v1.16b, v1.16b, v1.16b, #8
 ld1 {v5.8h},  [x0]
 ext v0.16b, v0.16b, v0.16b, #8
-add v5.8h, v5.8h, v\b\().8h
+add v5.8h, v5.8h, \b
 st1 {v5.8h},  [x0], #16
 ld1 {v6.8h},  [x0]
 sub v6.8h, v6.8h, v1.8h
@@ -1001,14 +1001,14 @@ function idct32_1d_8x32_pass1_neon
 st1 {v7.8h},  [x0], #16
 .endm
 
-store_rev   31, 23
-store_rev   30, 22
-store_rev   29, 21
-store_rev   28, 20
-store_rev   27, 19
-store_rev   26, 18
-store_rev   25, 17
-store_rev   24, 16
+store_rev   v31.8h, v23.8h
+store_rev   v30.8h, v22.8h
+store_rev   v29.8h, v21.8h
+store_rev   v28.8h, v20.8h
+store_rev   v27.8h, v19.8h
+store_rev   v26.8h, v18.8h
+store_rev   v25.8h, v17.8h
+store_rev   v24.8h, v16.8h
 .purgem store_rev
 br  x14
 endfunc
@@ -1055,21 +1055,21 @@ function idct32_1d_8x32_pass2_neon
 .if \neg == 0
 ld1 {v4.8h},  [x2], x9
 ld1 {v5.8h},  [x2], x9
-add v4.8h, v4.8h, v\a\().8h
+add v4.8h, v4.8h, \a
 ld1 {v6.8h},  [x2], x9
-add v5.8h, v5.8h, v\b\().8h
+add v5.8h, v5.8h, \b
 ld1 {v7.8h},  [x2], x9
-add v6.8h, v6.8h, v\c\().8h
-add v7.8h, v7.8h, v\d\().8h
+add v6.8h, v6.8h, \c
+add v7.8h, v7.8h, \d
 .else
 ld1 {v4.8h},  [x2], x7
 ld1 {v5.8h},  [x2], x7
-sub v4.8h, v4.8h, v\a\().8h
+sub v4.8h, v4.8h, \a
 ld1 {v6.8h},  [x2], x7
-sub v5.8h, v5.8h, v\b\().8h
+sub v5.8h, v5.8h, \b
 ld1 {v7.8h},  [x2], x7
-sub v6.8h, v6.8h, v\c\().8h
-sub v7.8h, v7.8h, v\d\().8h
+sub v6.8h, v6.8h, \c
+sub v7.8h, v7.8h, \d
 .endif
 ld1 {v0.8b}, [x0], x1
 ld1 {v1.8b}, [x0], x1
@@ -1093,15 +1093,15 @@ function idct32_1d_8x32_pass2_neon
 st1 {v6.8b}, [x0], x1
 st1 {v7.8b}, [x0], x1
 .endm
-load_acc_store  31, 30, 29, 28
-load_acc_store  27, 26, 25, 24
-load_acc_store  23, 22, 21, 20
-load_acc_store  19, 18, 17, 16
+load_acc_store  v31.8h, v30.8h, v2

[libav-devel] [PATCH 2/5] arm: vp9itxfm: Do a simpler half/quarter idct16/idct32 when possible (alternative 1)

2016-12-01 Thread Martin Storsjö
This work is sponsored by, and copyright, Google.

This increases the code size of libavcodec/arm/vp9itxfm_neon.o
from 12388 to 15064 bytes.

Before:  Cortex A7   A8   A9  A53
vp9_inv_dct_dct_16x16_sub1_add_neon: 273.0189.7211.9235.8
vp9_inv_dct_dct_16x16_sub2_add_neon:2056.7   1521.2   1734.8   1262.0
vp9_inv_dct_dct_16x16_sub4_add_neon:2060.8   1608.5   1735.7   1262.0
vp9_inv_dct_dct_16x16_sub8_add_neon:2444.9   1801.6   2007.8   1508.5
vp9_inv_dct_dct_16x16_sub12_add_neon:   2902.1   2116.7   2285.1   1751.7
vp9_inv_dct_dct_16x16_sub16_add_neon:   3211.2   2443.5   2546.1   1999.5
vp9_inv_dct_dct_32x32_sub1_add_neon: 752.0456.7866.0553.9
vp9_inv_dct_dct_32x32_sub2_add_neon:   11042.7   8127.5   8582.7   6822.8
vp9_inv_dct_dct_32x32_sub4_add_neon:   10682.0   8043.8   8581.3   6810.1
vp9_inv_dct_dct_32x32_sub8_add_neon:   11908.0   9281.8   9381.9   7562.4
vp9_inv_dct_dct_32x32_sub12_add_neon:  13015.2  10791.1  10220.3   8318.9
vp9_inv_dct_dct_32x32_sub16_add_neon:  14150.3  11886.2  11032.6   9064.8
vp9_inv_dct_dct_32x32_sub20_add_neon:  15165.7  12993.8  11847.0   9816.7
vp9_inv_dct_dct_32x32_sub24_add_neon:  16280.8  15111.2  12658.6  10576.8
vp9_inv_dct_dct_32x32_sub28_add_neon:  17412.6  15549.4  13462.7  11325.6
vp9_inv_dct_dct_32x32_sub32_add_neon:  18522.4  17277.4  14286.7  12087.9

After:
vp9_inv_dct_dct_16x16_sub1_add_neon: 273.0189.5211.5236.1
vp9_inv_dct_dct_16x16_sub2_add_neon:1448.2994.0   1191.3836.0
vp9_inv_dct_dct_16x16_sub4_add_neon:1437.0991.0   1191.6836.0
vp9_inv_dct_dct_16x16_sub8_add_neon:2114.5   1757.9   1855.3   1335.3
vp9_inv_dct_dct_16x16_sub12_add_neon:   2862.7   2141.5   2293.3   1772.7
vp9_inv_dct_dct_16x16_sub16_add_neon:   3299.6   2419.1   2552.7   2033.0
vp9_inv_dct_dct_32x32_sub1_add_neon: 753.0457.5864.3554.8
vp9_inv_dct_dct_32x32_sub2_add_neon:7867.8   5978.6   6594.6   5109.9
vp9_inv_dct_dct_32x32_sub4_add_neon:7871.0   5772.5   6582.2   5108.5
vp9_inv_dct_dct_32x32_sub8_add_neon:8694.8   6925.7   7125.7   5671.4
vp9_inv_dct_dct_32x32_sub12_add_neon:  11250.3   9654.7   9557.6   7540.5
vp9_inv_dct_dct_32x32_sub16_add_neon:  12129.5  11061.1  10295.0   8220.7
vp9_inv_dct_dct_32x32_sub20_add_neon:  15218.4  13580.8  11841.3   9739.9
vp9_inv_dct_dct_32x32_sub24_add_neon:  16343.5  15097.0  12629.2  10496.6
vp9_inv_dct_dct_32x32_sub28_add_neon:  17482.2  15516.4  13476.0  11261.0
vp9_inv_dct_dct_32x32_sub32_add_neon:  18586.7  16817.5  14289.3  12019.0

---
If we wouldn't have made the core transforms standalone functions
in the previous patch, the code size would increase to around 21 KB (which
isn't too bad), but the idct32 pass1/2 functions would bloat up so much
that they would require literal pools within the functions themselves.
---
 libavcodec/arm/vp9itxfm_neon.S | 351 ++---
 1 file changed, 331 insertions(+), 20 deletions(-)

diff --git a/libavcodec/arm/vp9itxfm_neon.S b/libavcodec/arm/vp9itxfm_neon.S
index 22e63e5..bd3f678 100644
--- a/libavcodec/arm/vp9itxfm_neon.S
+++ b/libavcodec/arm/vp9itxfm_neon.S
@@ -74,6 +74,14 @@ endconst
 vrshrn.s32  \out2, \tmpq4, #14
 .endm
 
+@ Same as mbutterfly0 above, but treating the input in in2 as zero,
+@ writing the same output into both out1 and out2.
+.macro mbutterfly0_h out1, out2, in1, in2, tmpd1, tmpd2, tmpq3, tmpq4
+vmull.s16   \tmpq3, \in1, d0[0]
+vrshrn.s32  \out1,  \tmpq3, #14
+vmov\out2,  \out1
+.endm
+
 @ out1,out2 = ((in1 + in2) * d0[0] + (1 << 13)) >> 14
 @ out3,out4 = ((in1 - in2) * d0[0] + (1 << 13)) >> 14
 @ Same as mbutterfly0, but with input being 2 q registers, output
@@ -137,6 +145,23 @@ endconst
 vrshrn.s32  \inout2, \tmp2,  #14
 .endm
 
+@ Same as mbutterfly above, but treating the input in inout2 as zero
+.macro mbutterfly_h1 inout1, inout2, coef1, coef2, tmp1, tmp2
+vmull.s16   \tmp1,   \inout1, \coef1
+vmull.s16   \tmp2,   \inout1, \coef2
+vrshrn.s32  \inout1, \tmp1,   #14
+vrshrn.s32  \inout2, \tmp2,   #14
+.endm
+
+@ Same as mbutterfly above, but treating the input in inout1 as zero
+.macro mbutterfly_h2 inout1, inout2, coef1, coef2, tmp1, tmp2
+vmull.s16   \tmp1,   \inout2, \coef2
+vmull.s16   \tmp2,   \inout2, \coef1
+vneg.s32\tmp1,   \tmp1
+vrshrn.s32  \inout2, \tmp2,   #14
+vrshrn.s32  \inout1, \tmp1,   #14
+.endm
+
 @ inout1,inout2 = (inout1,inout2 * coef1 - inout3,inout4 * coef2 + (1 << 13)) 
>> 14
 @ inout3,inout4 = (inout1,inout2 * coef2 + inout3,inout4 * coef1 + (1 << 13)) 
>> 14
 @ inout are 4 d registers, tmp are 4 q registers
@@ -534,7 +559,7 @@ function idct16x16_dc_add_neon
 endfunc
 .ltorg
 
-function idct16
+.macro idct16_full
 mbutterfly0 d16, d24, d16, d24, d4, d6,  q2,  q3 @ d16 = t0a,  d24 
= t1a

[libav-devel] [PATCH 1/5] arm: vp9itxfm: Make the larger core transforms standalone functions

2016-12-01 Thread Martin Storsjö
This work is sponsored by, and copyright, Google.

This reduces the code size of libavcodec/arm/vp9itxfm_neon.o from
15324 to 12388 bytes.

This gives a small slowdown of a couple tens of cycles, up to around
150 cycles for the full case of the largest transform, but makes
it more feasible to add more optimized versions of these transforms.

Before:  Cortex A7   A8   A9  A53
vp9_inv_dct_dct_16x16_sub4_add_neon:2063.4   1516.0   1719.5   1245.1
vp9_inv_dct_dct_16x16_sub16_add_neon:   3279.3   2454.5   2525.2   1982.3
vp9_inv_dct_dct_32x32_sub4_add_neon:   10750.0   7955.4   8525.6   6754.2
vp9_inv_dct_dct_32x32_sub32_add_neon:  18574.0  17108.4  14216.7  12010.2

After:
vp9_inv_dct_dct_16x16_sub4_add_neon:2060.8   1608.5   1735.7   1262.0
vp9_inv_dct_dct_16x16_sub16_add_neon:   3211.2   2443.5   2546.1   1999.5
vp9_inv_dct_dct_32x32_sub4_add_neon:   10682.0   8043.8   8581.3   6810.1
vp9_inv_dct_dct_32x32_sub32_add_neon:  18522.4  17277.4  14286.7  12087.9
---
 libavcodec/arm/vp9itxfm_neon.S | 43 +-
 1 file changed, 26 insertions(+), 17 deletions(-)

diff --git a/libavcodec/arm/vp9itxfm_neon.S b/libavcodec/arm/vp9itxfm_neon.S
index 5abe435..22e63e5 100644
--- a/libavcodec/arm/vp9itxfm_neon.S
+++ b/libavcodec/arm/vp9itxfm_neon.S
@@ -534,7 +534,7 @@ function idct16x16_dc_add_neon
 endfunc
 .ltorg
 
-.macro idct16
+function idct16
 mbutterfly0 d16, d24, d16, d24, d4, d6,  q2,  q3 @ d16 = t0a,  d24 
= t1a
 mbutterfly  d20, d28, d0[1], d0[2], q2,  q3  @ d20 = t2a,  d28 = 
t3a
 mbutterfly  d18, d30, d0[3], d1[0], q2,  q3  @ d18 = t4a,  d30 = 
t7a
@@ -580,9 +580,10 @@ endfunc
 vmovd4,  d21 @ d4  = t10a
 butterfly   d20, d27, d6,  d27   @ d20 = out[4], d27 = 
out[11]
 butterfly   d21, d26, d26, d4@ d21 = out[5], d26 = 
out[10]
-.endm
+bx  lr
+endfunc
 
-.macro iadst16
+function iadst16
 movrel  r12, iadst16_coeffs
 vld1.16 {q0-q1}, [r12,:128]
 
@@ -653,7 +654,8 @@ endfunc
 
 vmovd16, d2
 vmovd30, d4
-.endm
+bx  lr
+endfunc
 
 .macro itxfm16_1d_funcs txfm
 @ Read a vertical 4x16 slice out of a 16x16 matrix, do a transform on it,
@@ -662,6 +664,8 @@ endfunc
 @ r1 = slice offset
 @ r2 = src
 function \txfm\()16_1d_4x16_pass1_neon
+push{lr}
+
 mov r12, #32
 vmov.s16q2, #0
 .irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
@@ -669,7 +673,7 @@ function \txfm\()16_1d_4x16_pass1_neon
 vst1.16 {d4},  [r2,:64], r12
 .endr
 
-\txfm\()16
+bl  \txfm\()16
 
 @ Do four 4x4 transposes. Originally, d16-d31 contain the
 @ 16 rows. Afterwards, d16-d19, d20-d23, d24-d27, d28-d31
@@ -682,7 +686,7 @@ function \txfm\()16_1d_4x16_pass1_neon
 .irp i, 16, 20, 24, 28, 17, 21, 25, 29, 18, 22, 26, 30, 19, 23, 27, 31
 vst1.16 {d\i}, [r0,:64]!
 .endr
-bx  lr
+pop {pc}
 1:
 @ Special case: For the last input column (r1 == 12),
 @ which would be stored as the last row in the temp buffer,
@@ -709,7 +713,7 @@ function \txfm\()16_1d_4x16_pass1_neon
 vmovd29, d17
 vmovd30, d18
 vmovd31, d19
-bx  lr
+pop {pc}
 endfunc
 
 @ Read a vertical 4x16 slice out of a 16x16 matrix, do a transform on it,
@@ -719,6 +723,7 @@ endfunc
 @ r2 = src (temp buffer)
 @ r3 = slice offset
 function \txfm\()16_1d_4x16_pass2_neon
+push{lr}
 mov r12, #32
 .irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27
 vld1.16 {d\i}, [r2,:64], r12
@@ -732,7 +737,7 @@ function \txfm\()16_1d_4x16_pass2_neon
 
 add r3,  r0,  r1
 lsl r1,  r1,  #1
-\txfm\()16
+bl  \txfm\()16
 
 .macro load_add_store coef0, coef1, coef2, coef3
 vrshr.s16   \coef0, \coef0, #6
@@ -773,7 +778,7 @@ function \txfm\()16_1d_4x16_pass2_neon
 load_add_store  q12, q13, q14, q15
 .purgem load_add_store
 
-bx  lr
+pop {pc}
 endfunc
 .endm
 
@@ -908,7 +913,7 @@ function idct32x32_dc_add_neon
 bx  lr
 endfunc
 
-.macro idct32_odd
+function idct32_odd
 movrel  r12, idct_coeffs
 add r12, r12, #32
 vld1.16 {q0-q1}, [r12,:128]
@@ -967,7 +972,8 @@ endfunc
 mbutterfly0 d26, d21, d26, d21, d4, d6, q2, q3 @ d26 = t26a, d21 = 
t21a
 mbutterfly0 d25, d22, d25, d22, d4, d6, q2, q3 @ d25 = t25,  d22 = 
t22
 mbutterfly0 d24, d23, d24, d23, d4, d6, q2, q3 @ d24 = t24a, d23 = 
t23a
-.endm
+bx  lr
+endf

[libav-devel] [PATCH 5/5] aarch64: vp9itxfm: Do a simpler half/quarter idct16/idct32 when possible (alternative 1)

2016-12-01 Thread Martin Storsjö
This work is sponsored by, and copyright, Google.

This increases the code size of libavcodec/aarch64/vp9itxfm_neon.o
from 14740 to 18504 bytes.

Before:
vp9_inv_dct_dct_16x16_sub1_add_neon: 235.3
vp9_inv_dct_dct_16x16_sub2_add_neon:1051.0
vp9_inv_dct_dct_16x16_sub4_add_neon:1051.0
vp9_inv_dct_dct_16x16_sub8_add_neon:1051.0
vp9_inv_dct_dct_16x16_sub12_add_neon:   1390.3
vp9_inv_dct_dct_16x16_sub16_add_neon:   1390.1
vp9_inv_dct_dct_32x32_sub1_add_neon: 556.5
vp9_inv_dct_dct_32x32_sub2_add_neon:5199.1
vp9_inv_dct_dct_32x32_sub4_add_neon:5199.9
vp9_inv_dct_dct_32x32_sub8_add_neon:5196.9
vp9_inv_dct_dct_32x32_sub12_add_neon:   6171.6
vp9_inv_dct_dct_32x32_sub16_add_neon:   6170.9
vp9_inv_dct_dct_32x32_sub20_add_neon:   7147.1
vp9_inv_dct_dct_32x32_sub24_add_neon:   7147.0
vp9_inv_dct_dct_32x32_sub28_add_neon:   8118.8
vp9_inv_dct_dct_32x32_sub32_add_neon:   8125.8

After:
vp9_inv_dct_dct_16x16_sub1_add_neon: 235.3
vp9_inv_dct_dct_16x16_sub2_add_neon: 697.0
vp9_inv_dct_dct_16x16_sub4_add_neon: 697.0
vp9_inv_dct_dct_16x16_sub8_add_neon: 908.0
vp9_inv_dct_dct_16x16_sub12_add_neon:   1399.6
vp9_inv_dct_dct_16x16_sub16_add_neon:   1403.3
vp9_inv_dct_dct_32x32_sub1_add_neon: 554.1
vp9_inv_dct_dct_32x32_sub2_add_neon:3879.7
vp9_inv_dct_dct_32x32_sub4_add_neon:3952.2
vp9_inv_dct_dct_32x32_sub8_add_neon:3948.4
vp9_inv_dct_dct_32x32_sub12_add_neon:   5462.1
vp9_inv_dct_dct_32x32_sub16_add_neon:   5461.7
vp9_inv_dct_dct_32x32_sub20_add_neon:   7169.2
vp9_inv_dct_dct_32x32_sub24_add_neon:   7162.4
vp9_inv_dct_dct_32x32_sub28_add_neon:   8137.4
vp9_inv_dct_dct_32x32_sub32_add_neon:   8136.7

I.e. in general a very minor overhead for the full subpartition case due
to the additional cmps, but a significant speedup for the cases when we
only need to process a small part of the actual input data.
---
If we wouldn't have made the core transforms standalone functions,
the code size would end up at around 28 KB.
---
 libavcodec/aarch64/vp9itxfm_neon.S | 367 +++--
 1 file changed, 347 insertions(+), 20 deletions(-)

diff --git a/libavcodec/aarch64/vp9itxfm_neon.S 
b/libavcodec/aarch64/vp9itxfm_neon.S
index be9643e..bb79348 100644
--- a/libavcodec/aarch64/vp9itxfm_neon.S
+++ b/libavcodec/aarch64/vp9itxfm_neon.S
@@ -75,6 +75,16 @@ endconst
 .endif
 .endm
 
+// Same as dmbutterfly0 above, but treating the input in in2 as zero,
+// writing the same output into both out1 and out2.
+.macro dmbutterfly0_h out1, out2, in1, in2, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6
+smull   \tmp1\().4s,  \in1\().4h,  v0.h[0]
+smull2  \tmp2\().4s,  \in1\().8h,  v0.h[0]
+rshrn   \out1\().4h,  \tmp1\().4s, #14
+rshrn2  \out1\().8h,  \tmp2\().4s, #14
+mov \out2\().16b, \out1\().16b
+.endm
+
 // out1,out2 = in1 * coef1 - in2 * coef2
 // out3,out4 = in1 * coef2 + in2 * coef1
 // out are 4 x .4s registers, in are 2 x .8h registers
@@ -104,6 +114,43 @@ endconst
 rshrn2  \inout2\().8h, \tmp4\().4s,  #14
 .endm
 
+// Same as dmbutterfly above, but treating the input in inout2 as zero
+.macro dmbutterfly_h1 inout1, inout2, coef1, coef2, tmp1, tmp2, tmp3, tmp4
+smull   \tmp1\().4s, \inout1\().4h, \coef1
+smull2  \tmp2\().4s, \inout1\().8h, \coef1
+smull   \tmp3\().4s, \inout1\().4h, \coef2
+smull2  \tmp4\().4s, \inout1\().8h, \coef2
+rshrn   \inout1\().4h, \tmp1\().4s, #14
+rshrn2  \inout1\().8h, \tmp2\().4s, #14
+rshrn   \inout2\().4h, \tmp3\().4s, #14
+rshrn2  \inout2\().8h, \tmp4\().4s, #14
+.endm
+
+// Same as dmbutterfly above, but treating the input in inout1 as zero
+.macro dmbutterfly_h2 inout1, inout2, coef1, coef2, tmp1, tmp2, tmp3, tmp4
+smull   \tmp1\().4s, \inout2\().4h, \coef2
+smull2  \tmp2\().4s, \inout2\().8h, \coef2
+smull   \tmp3\().4s, \inout2\().4h, \coef1
+smull2  \tmp4\().4s, \inout2\().8h, \coef1
+neg \tmp1\().4s, \tmp1\().4s
+neg \tmp2\().4s, \tmp2\().4s
+rshrn   \inout2\().4h, \tmp3\().4s, #14
+rshrn2  \inout2\().8h, \tmp4\().4s, #14
+rshrn   \inout1\().4h, \tmp1\().4s, #14
+rshrn2  \inout1\().8h, \tmp2\().4s, #14
+.endm
+
+.macro dsmull_h out1, out2, in, coef
+smull   \out1\().4s, \in\().4h, \coef
+smull2  \out2\().4s, \in\().8h, \coef
+.endm
+
+.macro drshrn_h out, in1, in2, shift
+rshrn   \out\().4h, \in1\().4s, \shift
+rshrn2  \out\().8h, \in2\().4s, \shift
+.endm
+
+
 // out1 = in1 + in2
 // out2 = in1 - in2
 .macro butterfly_8h out1, out2, in1, in2
@@ -463,7 +510,7 @@ function idct16x16_dc_add_neon
 ret
 endfunc
 
-function idct16
+.macro idct16_full
 dmbutterfly0v16, v24,

[libav-devel] [PATCH 5/5] aarch64: vp9itxfm: Do separate functions for half/quarter idct16 and idct32 (alternative 2)

2016-12-01 Thread Martin Storsjö
This work is sponsored by, and copyright, Google.

This makes it easier to avoid filling the temp buffer with zeros for the
skipped slices, and leads to slightly more straightforward code for these
cases (for the 16x16 case, where the special case pass functions are
written out instead of templated from the same macro), instead of riddling
the common code with special case branches or macro .ifs.

The code size increases from 14740 bytes to 24472 bytes.

Before:
vp9_inv_dct_dct_16x16_sub1_add_neon: 235.3
vp9_inv_dct_dct_16x16_sub2_add_neon:1051.0
vp9_inv_dct_dct_16x16_sub4_add_neon:1051.0
vp9_inv_dct_dct_16x16_sub8_add_neon:1051.0
vp9_inv_dct_dct_16x16_sub12_add_neon:   1390.3
vp9_inv_dct_dct_16x16_sub16_add_neon:   1390.1
vp9_inv_dct_dct_32x32_sub1_add_neon: 556.5
vp9_inv_dct_dct_32x32_sub2_add_neon:5199.1
vp9_inv_dct_dct_32x32_sub4_add_neon:5199.9
vp9_inv_dct_dct_32x32_sub8_add_neon:5196.9
vp9_inv_dct_dct_32x32_sub12_add_neon:   6171.6
vp9_inv_dct_dct_32x32_sub16_add_neon:   6170.9
vp9_inv_dct_dct_32x32_sub20_add_neon:   7147.1
vp9_inv_dct_dct_32x32_sub24_add_neon:   7147.0
vp9_inv_dct_dct_32x32_sub28_add_neon:   8118.8
vp9_inv_dct_dct_32x32_sub32_add_neon:   8125.8

After:
vp9_inv_dct_dct_16x16_sub1_add_neon: 235.3
vp9_inv_dct_dct_16x16_sub2_add_neon: 639.0
vp9_inv_dct_dct_16x16_sub4_add_neon: 639.0
vp9_inv_dct_dct_16x16_sub8_add_neon: 845.0
vp9_inv_dct_dct_16x16_sub12_add_neon:   1389.4
vp9_inv_dct_dct_16x16_sub16_add_neon:   1389.3
vp9_inv_dct_dct_32x32_sub1_add_neon: 556.5
vp9_inv_dct_dct_32x32_sub2_add_neon:3684.1
vp9_inv_dct_dct_32x32_sub4_add_neon:3682.6
vp9_inv_dct_dct_32x32_sub8_add_neon:3684.1
vp9_inv_dct_dct_32x32_sub12_add_neon:   5319.0
vp9_inv_dct_dct_32x32_sub16_add_neon:   5323.5
vp9_inv_dct_dct_32x32_sub20_add_neon:   7149.8
vp9_inv_dct_dct_32x32_sub24_add_neon:   7148.2
vp9_inv_dct_dct_32x32_sub28_add_neon:   8124.5
vp9_inv_dct_dct_32x32_sub32_add_neon:   8122.1

---
If we wouldn't have made the core transforms standalone functions,
the code size would end up at around 34 KB.

The binary output is 6 KB larger than in the other alternative,
but is more straightforward and gives better opportunities to
special case them further.

In the arm version, there was a significant speedup compared to the
other alternative (having cmps within the functions), skipping
zeroing of the temp buffer. Here there's much less difference.
---
 libavcodec/aarch64/vp9itxfm_neon.S | 628 +
 1 file changed, 566 insertions(+), 62 deletions(-)

diff --git a/libavcodec/aarch64/vp9itxfm_neon.S 
b/libavcodec/aarch64/vp9itxfm_neon.S
index be9643e..9910170 100644
--- a/libavcodec/aarch64/vp9itxfm_neon.S
+++ b/libavcodec/aarch64/vp9itxfm_neon.S
@@ -75,6 +75,16 @@ endconst
 .endif
 .endm
 
+// Same as dmbutterfly0 above, but treating the input in in2 as zero,
+// writing the same output into both out1 and out2.
+.macro dmbutterfly0_h out1, out2, in1, in2, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6
+smull   \tmp1\().4s,  \in1\().4h,  v0.h[0]
+smull2  \tmp2\().4s,  \in1\().8h,  v0.h[0]
+rshrn   \out1\().4h,  \tmp1\().4s, #14
+rshrn2  \out1\().8h,  \tmp2\().4s, #14
+mov \out2\().16b, \out1\().16b
+.endm
+
 // out1,out2 = in1 * coef1 - in2 * coef2
 // out3,out4 = in1 * coef2 + in2 * coef1
 // out are 4 x .4s registers, in are 2 x .8h registers
@@ -104,6 +114,43 @@ endconst
 rshrn2  \inout2\().8h, \tmp4\().4s,  #14
 .endm
 
+// Same as dmbutterfly above, but treating the input in inout2 as zero
+.macro dmbutterfly_h1 inout1, inout2, coef1, coef2, tmp1, tmp2, tmp3, tmp4
+smull   \tmp1\().4s, \inout1\().4h, \coef1
+smull2  \tmp2\().4s, \inout1\().8h, \coef1
+smull   \tmp3\().4s, \inout1\().4h, \coef2
+smull2  \tmp4\().4s, \inout1\().8h, \coef2
+rshrn   \inout1\().4h, \tmp1\().4s, #14
+rshrn2  \inout1\().8h, \tmp2\().4s, #14
+rshrn   \inout2\().4h, \tmp3\().4s, #14
+rshrn2  \inout2\().8h, \tmp4\().4s, #14
+.endm
+
+// Same as dmbutterfly above, but treating the input in inout1 as zero
+.macro dmbutterfly_h2 inout1, inout2, coef1, coef2, tmp1, tmp2, tmp3, tmp4
+smull   \tmp1\().4s, \inout2\().4h, \coef2
+smull2  \tmp2\().4s, \inout2\().8h, \coef2
+smull   \tmp3\().4s, \inout2\().4h, \coef1
+smull2  \tmp4\().4s, \inout2\().8h, \coef1
+neg \tmp1\().4s, \tmp1\().4s
+neg \tmp2\().4s, \tmp2\().4s
+rshrn   \inout2\().4h, \tmp3\().4s, #14
+rshrn2  \inout2\().8h, \tmp4\().4s, #14
+rshrn   \inout1\().4h, \tmp1\().4s, #14
+rshrn2  \inout1\().8h, \tmp2\().4s, #14
+.endm
+
+.macro dsmull_h out1, out2, in, coef
+smull   \out1\().4s, \in\().4h, \coe

[libav-devel] [PATCH 3/5] aarch64: vp9itxfm: Make the larger core transforms standalone functions

2016-12-01 Thread Martin Storsjö
This work is sponsored by, and copyright, Google.

This reduces the code size of libavcodec/aarch64/vp9itxfm_neon.o from
19496 to 14740 bytes.

This gives a small slowdown of a couple of tens of cycles, but makes
it more feasible to add more optimized versions of these transforms.

Before:
vp9_inv_dct_dct_16x16_sub4_add_neon:1036.7
vp9_inv_dct_dct_16x16_sub16_add_neon:   1372.2
vp9_inv_dct_dct_32x32_sub4_add_neon:5180.0
vp9_inv_dct_dct_32x32_sub32_add_neon:   8095.7

After:
vp9_inv_dct_dct_16x16_sub4_add_neon:1051.0
vp9_inv_dct_dct_16x16_sub16_add_neon:   1390.1
vp9_inv_dct_dct_32x32_sub4_add_neon:5199.9
vp9_inv_dct_dct_32x32_sub32_add_neon:   8125.8
---
 libavcodec/aarch64/vp9itxfm_neon.S | 42 +++---
 1 file changed, 25 insertions(+), 17 deletions(-)

diff --git a/libavcodec/aarch64/vp9itxfm_neon.S 
b/libavcodec/aarch64/vp9itxfm_neon.S
index 053d46f..5a080a4 100644
--- a/libavcodec/aarch64/vp9itxfm_neon.S
+++ b/libavcodec/aarch64/vp9itxfm_neon.S
@@ -463,7 +463,7 @@ function idct16x16_dc_add_neon
 ret
 endfunc
 
-.macro idct16
+function idct16
 dmbutterfly0v16, v24, v16, v24, v2, v3, v4, v5, v6, v7 // v16 = 
t0a,  v24 = t1a
 dmbutterfly v20, v28, v0.h[1], v0.h[2], v2, v3, v4, v5 // v20 = 
t2a,  v28 = t3a
 dmbutterfly v18, v30, v0.h[3], v0.h[4], v2, v3, v4, v5 // v18 = 
t4a,  v30 = t7a
@@ -506,9 +506,10 @@ endfunc
 butterfly_8hv19, v28, v5,  v28   // v19 = out[3], v28 
= out[12]
 butterfly_8hv20, v27, v6,  v27   // v20 = out[4], v27 
= out[11]
 butterfly_8hv21, v26, v26, v3// v21 = out[5], v26 
= out[10]
-.endm
+ret
+endfunc
 
-.macro iadst16
+function iadst16
 ld1 {v0.8h,v1.8h}, [x11]
 
 dmbutterfly_l   v6,  v7,  v4,  v5,  v31, v16, v0.h[1], v0.h[0]   // 
v6,v7   = t1,   v4,v5   = t0
@@ -577,7 +578,8 @@ endfunc
 
 mov v16.16b, v2.16b
 mov v30.16b, v4.16b
-.endm
+ret
+endfunc
 
 // Helper macros; we can't use these expressions directly within
 // e.g. .irp due to the extra concatenation \(). Therefore wrap
@@ -604,12 +606,14 @@ endfunc
 // x9 = input stride
 .macro itxfm16_1d_funcs txfm
 function \txfm\()16_1d_8x16_pass1_neon
+mov x14, x30
+
 moviv2.8h, #0
 .irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
 load_clear  \i,  x2,  x9
 .endr
 
-\txfm\()16
+bl  \txfm\()16
 
 // Do two 8x8 transposes. Originally, v16-v31 contain the
 // 16 rows. Afterwards, v16-v23 and v24-v31 contain the two
@@ -623,7 +627,7 @@ function \txfm\()16_1d_8x16_pass1_neon
 .irp i, 16, 24, 17, 25, 18, 26, 19, 27, 20, 28, 21, 29, 22, 30, 23, 31
 store   \i,  x0,  #16
 .endr
-ret
+br  x14
 1:
 // Special case: For the last input column (x1 == 8),
 // which would be stored as the last row in the temp buffer,
@@ -642,7 +646,7 @@ function \txfm\()16_1d_8x16_pass1_neon
 mov v29.16b, v21.16b
 mov v30.16b, v22.16b
 mov v31.16b, v23.16b
-ret
+br  x14
 endfunc
 
 // Read a vertical 8x16 slice out of a 16x16 matrix, do a transform on it,
@@ -653,6 +657,7 @@ endfunc
 // x3 = slice offset
 // x9 = temp buffer stride
 function \txfm\()16_1d_8x16_pass2_neon
+mov x14, x30
 .irp i, 16, 17, 18, 19, 20, 21, 22, 23
 load\i,  x2,  x9
 .endr
@@ -664,7 +669,7 @@ function \txfm\()16_1d_8x16_pass2_neon
 
 add x3,  x0,  x1
 lsl x1,  x1,  #1
-\txfm\()16
+bl  \txfm\()16
 
 .macro load_add_store coef0, coef1, coef2, coef3, coef4, coef5, coef6, coef7, 
tmp1, tmp2
 srshr   \coef0, \coef0, #6
@@ -714,7 +719,7 @@ function \txfm\()16_1d_8x16_pass2_neon
 load_add_store  v24.8h, v25.8h, v26.8h, v27.8h, v28.8h, v29.8h, 
v30.8h, v31.8h, v16.8b, v17.8b
 .purgem load_add_store
 
-ret
+br  x14
 endfunc
 .endm
 
@@ -843,7 +848,7 @@ function idct32x32_dc_add_neon
 ret
 endfunc
 
-.macro idct32_odd
+function idct32_odd
 ld1 {v0.8h,v1.8h}, [x11]
 
 dmbutterfly v16, v31, v0.h[0], v0.h[1], v4, v5, v6, v7 // v16 = 
t16a, v31 = t31a
@@ -898,7 +903,8 @@ endfunc
 dmbutterfly0v26, v21, v26, v21, v2, v3, v4, v5, v6, v7 // v26 = 
t26a, v21 = t21a
 dmbutterfly0v25, v22, v25, v22, v2, v3, v4, v5, v6, v7 // v25 = 
t25,  v22 = t22
 dmbutterfly0v24, v23, v24, v23, v2, v3, v4, v5, v6, v7 // v24 = 
t24a, v23 = t23a
-.endm
+ret
+endfunc
 
 // Do an 32-point IDCT of a 8x32 slice out of a 32x32 matrix.
 // The 32-point IDCT can be decomposed into two 16-point IDCTs;
@@ -912,6 +918,7 @@ endfunc
 // x10 = idct_coeffs
 // x11 = idct_coeffs + 32
 function

[libav-devel] [PATCH 2/5] arm: vp9itxfm: Do separate functions for half/quarter idct16 and idct32 (alternative 2)

2016-12-01 Thread Martin Storsjö
This work is sponsored by, and copyright, Google.

This makes it easier to avoid filling the temp buffer with zeros for the
skipped slices, and leads to slightly more straightforward code for these
cases (for the 16x16 case, where the special case pass functions are written
out instead of templated from the same macro), instead of riddling the common
code with special case branches or macro .ifs.

The code size increases from 12388 bytes to 19932 bytes.

Before:  Cortex A7   A8   A9  A53
vp9_inv_dct_dct_16x16_sub1_add_neon: 273.0189.7211.9235.8
vp9_inv_dct_dct_16x16_sub2_add_neon:2056.7   1521.2   1734.8   1262.0
vp9_inv_dct_dct_16x16_sub4_add_neon:2060.8   1608.5   1735.7   1262.0
vp9_inv_dct_dct_16x16_sub8_add_neon:2444.9   1801.6   2007.8   1508.5
vp9_inv_dct_dct_16x16_sub12_add_neon:   2902.1   2116.7   2285.1   1751.7
vp9_inv_dct_dct_16x16_sub16_add_neon:   3211.2   2443.5   2546.1   1999.5
vp9_inv_dct_dct_32x32_sub1_add_neon: 752.0456.7866.0553.9
vp9_inv_dct_dct_32x32_sub2_add_neon:   11042.7   8127.5   8582.7   6822.8
vp9_inv_dct_dct_32x32_sub4_add_neon:   10682.0   8043.8   8581.3   6810.1
vp9_inv_dct_dct_32x32_sub8_add_neon:   11908.0   9281.8   9381.9   7562.4
vp9_inv_dct_dct_32x32_sub12_add_neon:  13015.2  10791.1  10220.3   8318.9
vp9_inv_dct_dct_32x32_sub16_add_neon:  14150.3  11886.2  11032.6   9064.8
vp9_inv_dct_dct_32x32_sub20_add_neon:  15165.7  12993.8  11847.0   9816.7
vp9_inv_dct_dct_32x32_sub24_add_neon:  16280.8  15111.2  12658.6  10576.8
vp9_inv_dct_dct_32x32_sub28_add_neon:  17412.6  15549.4  13462.7  11325.6
vp9_inv_dct_dct_32x32_sub32_add_neon:  18522.4  17277.4  14286.7  12087.9

After:
vp9_inv_dct_dct_16x16_sub1_add_neon: 274.4189.5211.7235.8
vp9_inv_dct_dct_16x16_sub2_add_neon:1214.2962.0   1034.4764.0
vp9_inv_dct_dct_16x16_sub4_add_neon:1214.5911.0   1034.7763.9
vp9_inv_dct_dct_16x16_sub8_add_neon:2000.6   1601.9   1729.0   1286.4
vp9_inv_dct_dct_16x16_sub12_add_neon:   2854.3   2122.2   2292.9   1757.6
vp9_inv_dct_dct_16x16_sub16_add_neon:   3231.1   2477.9   2544.6   2005.7
vp9_inv_dct_dct_32x32_sub1_add_neon: 756.1460.3865.3553.9
vp9_inv_dct_dct_32x32_sub2_add_neon:7603.7   5469.8   6046.2   4922.6
vp9_inv_dct_dct_32x32_sub4_add_neon:7586.9   5740.2   6061.5   4921.5
vp9_inv_dct_dct_32x32_sub8_add_neon:8380.7   6554.4   6600.4   5476.3
vp9_inv_dct_dct_32x32_sub12_add_neon:  11005.8   9856.2   9242.4   7462.3
vp9_inv_dct_dct_32x32_sub16_add_neon:  11959.7  10698.5   9998.0   8134.5
vp9_inv_dct_dct_32x32_sub20_add_neon:  15250.8  13175.6  11854.4   9825.7
vp9_inv_dct_dct_32x32_sub24_add_neon:  16382.3  14501.4  12671.7  10579.5
vp9_inv_dct_dct_32x32_sub28_add_neon:  17521.2  16403.8  13486.3  11331.2
vp9_inv_dct_dct_32x32_sub32_add_neon:  18630.8  17398.7  14383.2  12089.4

---
If we wouldn't have made the core transforms standalone functions,
the code size would end up at around 26 KB.

The binary output is 4 KB larger than in the other alternative,
but is more straightforward and gives better opportunities to
special case them, and is a couple hundred cycles faster for the
small subpartitions.
---
 libavcodec/arm/vp9itxfm_neon.S | 645 ++---
 1 file changed, 601 insertions(+), 44 deletions(-)

diff --git a/libavcodec/arm/vp9itxfm_neon.S b/libavcodec/arm/vp9itxfm_neon.S
index 22e63e5..4bba4b9 100644
--- a/libavcodec/arm/vp9itxfm_neon.S
+++ b/libavcodec/arm/vp9itxfm_neon.S
@@ -74,6 +74,14 @@ endconst
 vrshrn.s32  \out2, \tmpq4, #14
 .endm
 
+@ Same as mbutterfly0 above, but treating the input in in2 as zero,
+@ writing the same output into both out1 and out2.
+.macro mbutterfly0_h out1, out2, in1, in2, tmpd1, tmpd2, tmpq3, tmpq4
+vmull.s16   \tmpq3, \in1, d0[0]
+vrshrn.s32  \out1,  \tmpq3, #14
+vmov\out2,  \out1
+.endm
+
 @ out1,out2 = ((in1 + in2) * d0[0] + (1 << 13)) >> 14
 @ out3,out4 = ((in1 - in2) * d0[0] + (1 << 13)) >> 14
 @ Same as mbutterfly0, but with input being 2 q registers, output
@@ -137,6 +145,23 @@ endconst
 vrshrn.s32  \inout2, \tmp2,  #14
 .endm
 
+@ Same as mbutterfly above, but treating the input in inout2 as zero
+.macro mbutterfly_h1 inout1, inout2, coef1, coef2, tmp1, tmp2
+vmull.s16   \tmp1,   \inout1, \coef1
+vmull.s16   \tmp2,   \inout1, \coef2
+vrshrn.s32  \inout1, \tmp1,   #14
+vrshrn.s32  \inout2, \tmp2,   #14
+.endm
+
+@ Same as mbutterfly above, but treating the input in inout1 as zero
+.macro mbutterfly_h2 inout1, inout2, coef1, coef2, tmp1, tmp2
+vmull.s16   \tmp1,   \inout2, \coef2
+vmull.s16   \tmp2,   \inout2, \coef1
+vneg.s32\tmp1,   \tmp1
+vrshrn.s32  \inout2, \tmp2,   #14
+vrshrn.s32  \inout1, \tmp1,   #14
+.endm
+
 @ inout1,inout2 = (inout1,inout2 * coef1 - inout3,inout4 

Re: [libav-devel] [PATCH 4/5] configure: Simplify MMAL check

2016-12-03 Thread Martin Storsjö

On Sat, 3 Dec 2016, Diego Biurrun wrote:


---
configure | 14 ++
1 file changed, 6 insertions(+), 8 deletions(-)

diff --git a/configure b/configure
index f838a53..44fc147 100755
--- a/configure
+++ b/configure
@@ -4677,14 +4677,12 @@ enabled libx265   && require_pkg_config x265 x265.h 
x265_api_get &&
   die "ERROR: libx265 version must be >= 57."; }
enabled libxavs   && require libxavs "stdint.h xavs.h" 
xavs_encoder_encode -lxavs
enabled libxvid   && require libxvid xvid.h xvid_global -lxvidcore
-enabled mmal  && { check_lib interface/mmal/mmal.h 
mmal_port_connect -lmmal_core -lmmal_util -lmmal_vc_client -lbcm_host ||
-{ ! enabled cross_compile && {
-add_cflags -isystem/opt/vc/include/ 
-isystem/opt/vc/include/interface/vmcs_host/linux 
-isystem/opt/vc/include/interface/vcos/pthreads -fgnu89-inline ;
-add_extralibs -L/opt/vc/lib/ -lmmal_core 
-lmmal_util -lmmal_vc_client -lbcm_host ;
-check_lib interface/mmal/mmal.h 
mmal_port_connect ; }
-check_lib interface/mmal/mmal.h 
mmal_port_connect ; } ||
-   die "ERROR: mmal not found"; }
-enabled mmal && check_func_headers interface/mmal/mmal.h 
"MMAL_PARAMETER_VIDEO_MAX_NUM_CALLBACKS"
+enabled mmal  && { { ! enabled cross_compile &&
+ add_cflags -isystem/opt/vc/include/ 
-isystem/opt/vc/include/interface/vmcs_host/linux 
-isystem/opt/vc/include/interface/vcos/pthreads -fgnu89-inline &&
+ add_ldflags -L/opt/vc/lib/ ; }
+   check_lib interface/mmal/mmal.h 
mmal_port_connect -lmmal_core -lmmal_util -lmmal_vc_client -lbcm_host ||
+   die "ERROR: mmal not found"; } &&
+ check_func_headers interface/mmal/mmal.h 
"MMAL_PARAMETER_VIDEO_MAX_NUM_CALLBACKS"
enabled omx   && { check_header OMX_Core.h || die "ERROR: OpenMAX IL 
headers not found"; }
enabled omx_rpi   && { ! enabled cross_compile && add_cflags 
-isystem/opt/vc/include/IL
   check_header OMX_Core.h  || die "ERROR: OpenMAX IL 
headers not found"; }
--
2.1.4


So, previously this first tried to check for the headers/libs in the 
existing include paths, before trying to add the /opt/vc/include paths 
(for non-cross builds) and trying again.


Now for a non-cross build, it'd hardcode adding /opt/vc/include paths even 
if the headers would be found in the existing include path.


Ditto for the OMX patch.

// Martin
___
libav-devel mailing list
libav-devel@libav.org
https://lists.libav.org/mailman/listinfo/libav-devel


Re: [libav-devel] [PATCH 3/8] decode: restructure the core decoding code

2016-12-03 Thread Martin Storsjö

On Wed, 30 Nov 2016, Anton Khirnov wrote:


Currently, the new decoding API is pretty much just a wrapper around the
old deprecated one. This is problematic, since it interferes with making
full use of the flexibility added by the new API. The old API should
also be removed at some future point.

Reorganize the code so that the new send_packet/receive_frame functions
call the actual decoding directly and change the old deprecated
avcodec_decode_* functions into wrappers around the new API.

The new internal API for decoders is now changing as well. Before this
commit, it mirrors the public API, so the decoders need to implement
send_packet() and receive_frame() callbacks. This turns out to require
awkward constructs in both the decoders and the generic code. After this
commit, the decoders only implement the receive_frame() callback and
call a new internal function, ff_decode_get_packet() to obtain input
data, in the same manner to how the bitstream filters now work.

avcodec will now always make a reference to the input packet, which means
that non-refcounted input packets will be copied. Keeping the previous
behaviour, where this copy could sometimes be avoided, would make the
code significantly more complex and fragile for only dubious gains,
since packets are typically small and everyone who cares about
performance should use refcounted packets anyway.
---
libavcodec/avcodec.h  |  13 +-
libavcodec/decode.c   | 391 +++---
libavcodec/decode.h   |  35 +
libavcodec/internal.h |  17 +++
libavcodec/utils.c|  22 ++-
5 files changed, 293 insertions(+), 185 deletions(-)
create mode 100644 libavcodec/decode.h

diff --git a/libavcodec/avcodec.h b/libavcodec/avcodec.h
index e75d300..e34ea44 100644
--- a/libavcodec/avcodec.h
+++ b/libavcodec/avcodec.h
@@ -3211,20 +3211,25 @@ typedef struct AVCodec {
int (*decode)(AVCodecContext *, void *outdata, int *outdata_size, AVPacket 
*avpkt);
int (*close)(AVCodecContext *);
/**
- * Decode/encode API with decoupled packet/frame dataflow. The API is the
+ * Encode API with decoupled packet/frame dataflow. The API is the
 * same as the avcodec_ prefixed APIs (avcodec_send_frame() etc.), except
 * that:
 * - never called if the codec is closed or the wrong type,
 * - AVPacket parameter change side data is applied right before calling
 *   AVCodec->send_packet,


Shouldn't this bullet point also be moved away, since send_packet is 
moved away/removed?



- * - if AV_CODEC_CAP_DELAY is not set, drain packets or frames are never 
sent,
+ * - if AV_CODEC_CAP_DELAY is not set, drain packets are never sent,


Isn't this the other way around; if this only is for encoders now, the 
encoders would get drain frames, not packets?



 * - only one drain packet is ever passed down (until the next flush()),
 * - a drain AVPacket is always NULL (no need to check for avpkt->size).
 */
int (*send_frame)(AVCodecContext *avctx, const AVFrame *frame);
-int (*send_packet)(AVCodecContext *avctx, const AVPacket *avpkt);
-int (*receive_frame)(AVCodecContext *avctx, AVFrame *frame);
int (*receive_packet)(AVCodecContext *avctx, AVPacket *avpkt);
+
+/**
+ * Decode API with decoupled packet/frame dataflow. This function is called
+ * to get one output frame. It should call ff_decode_get_packet() to obtain
+ * input data.
+ */
+int (*receive_frame)(AVCodecContext *avctx, AVFrame *frame);
/**
 * Flush buffers.
 * Will be called when seeking


// Martin
___
libav-devel mailing list
libav-devel@libav.org
https://lists.libav.org/mailman/listinfo/libav-devel


Re: [libav-devel] [PATCH 4/5] configure: Simplify MMAL check

2016-12-03 Thread Martin Storsjö

On Sat, 3 Dec 2016, Diego Biurrun wrote:


On Sat, Dec 03, 2016 at 12:33:04PM +0200, Martin Storsjö wrote:

On Sat, 3 Dec 2016, Diego Biurrun wrote:
>--- a/configure
>+++ b/configure
>@@ -4677,14 +4677,12 @@ enabled libx265   && require_pkg_config x265 x265.h 
x265_api_get &&
>enabled libxavs   && require libxavs "stdint.h xavs.h" 
xavs_encoder_encode -lxavs
>enabled libxvid   && require libxvid xvid.h xvid_global -lxvidcore
>-enabled mmal  && { check_lib interface/mmal/mmal.h 
mmal_port_connect -lmmal_core -lmmal_util -lmmal_vc_client -lbcm_host ||
>-{ ! enabled cross_compile && {
>-add_cflags -isystem/opt/vc/include/ 
-isystem/opt/vc/include/interface/vmcs_host/linux 
-isystem/opt/vc/include/interface/vcos/pthreads -fgnu89-inline ;
>-add_extralibs -L/opt/vc/lib/ -lmmal_core 
-lmmal_util -lmmal_vc_client -lbcm_host ;
>-check_lib interface/mmal/mmal.h 
mmal_port_connect ; }
>-check_lib interface/mmal/mmal.h 
mmal_port_connect ; } ||
>-   die "ERROR: mmal not found"; }
>-enabled mmal && check_func_headers interface/mmal/mmal.h 
"MMAL_PARAMETER_VIDEO_MAX_NUM_CALLBACKS"
>+enabled mmal  && { { ! enabled cross_compile &&
>+ add_cflags -isystem/opt/vc/include/ 
-isystem/opt/vc/include/interface/vmcs_host/linux 
-isystem/opt/vc/include/interface/vcos/pthreads -fgnu89-inline &&
>+ add_ldflags -L/opt/vc/lib/ ; }
>+   check_lib interface/mmal/mmal.h 
mmal_port_connect -lmmal_core -lmmal_util -lmmal_vc_client -lbcm_host ||
>+   die "ERROR: mmal not found"; } &&
>+ check_func_headers interface/mmal/mmal.h 
"MMAL_PARAMETER_VIDEO_MAX_NUM_CALLBACKS"
>enabled omx   && { check_header OMX_Core.h || die "ERROR: OpenMAX IL 
headers not found"; }
>enabled omx_rpi   && { ! enabled cross_compile && add_cflags 
-isystem/opt/vc/include/IL
>   check_header OMX_Core.h  || die "ERROR: OpenMAX IL 
headers not found"; }
>-- 
>2.1.4


So, previously this first tried to check for the headers/libs in the
existing include paths, before trying to add the /opt/vc/include paths (for
non-cross builds) and trying again.

Now for a non-cross build, it'd hardcode adding /opt/vc/include paths even
if the headers would be found in the existing include path.


OK, it was a tad hard for me to be sure what the original intention was.
(Note that it redundantly checks twice after setting the flags.)

It should be one check before and one after setting the flags?


Yes, that's the intent. First try checking if it is found in the existing 
paths, like we do for any normal lib. If not found, but requested and not 
cross compiling, try adding these paths and see if it is found then.


// Martin
___
libav-devel mailing list
libav-devel@libav.org
https://lists.libav.org/mailman/listinfo/libav-devel

Re: [libav-devel] [PATCH 3/5] configure: Simplify OMX check

2016-12-03 Thread Martin Storsjö

On Sat, 3 Dec 2016, Diego Biurrun wrote:


On Sat, Dec 03, 2016 at 02:50:31PM +0100, Luca Barbato wrote:

On 03/12/2016 11:07, Diego Biurrun wrote:

On Sat, Dec 03, 2016 at 11:00:01AM +0100, Luca Barbato wrote:

On 03/12/2016 07:49, Diego Biurrun wrote:

+enabled omx   && { check_header OMX_Core.h || die
"ERROR: OpenMAX IL headers not found"; } +enabled omx_rpi
&& { ! enabled cross_compile && add_cflags
-isystem/opt/vc/include/IL +
check_header OMX_Core.h  || die "ERROR: OpenMAX IL headers not
found"; }


Flip the order and it should work.


At this point in configure, omx_rpi has not turned on omx yet, so
there should not be any duplicate checking.


so it would simply die if you are on a rpi and enable omx?


Isn't that what the omx-rpi option is for? I'm somewhat mystified by
what the intended behavior is...


Right, if you do --enable-omx --enable-omx-rpi (which is kind of 
tautological, because omx-rpi changes the behaviour of the omx module; you 
don't get two encoders, but you get the normal omx encoder, but configured 
for the alternate ABI and other broadcom peculiarities), it would 
previously succeed, while it now would fail to find the headers.


I don't think that's a case worth caring about though, in that case, just 
drop the redundant --enable-omx.


// Martin
___
libav-devel mailing list
libav-devel@libav.org
https://lists.libav.org/mailman/listinfo/libav-devel


Re: [libav-devel] [PATCH] configure: Simplify OMX check

2016-12-04 Thread Martin Storsjö

On Sun, 4 Dec 2016, Diego Biurrun wrote:


---

Switched the order of the checks, as suggested by Luca.

configure | 9 -
1 file changed, 4 insertions(+), 5 deletions(-)

diff --git a/configure b/configure
index e816eb5..455d4fb 100755
--- a/configure
+++ b/configure
@@ -2190,6 +2190,7 @@ nvenc_deps_any="dlopen LoadLibrary"
nvenc_extralibs='$ldl'
omx_deps="dlopen pthreads"
omx_extralibs='$ldl'
+omx_rpi_select="omx"
qsvdec_select="qsv"
qsvenc_select="qsv"
vaapi_encode_deps="vaapi"
@@ -4685,12 +4686,10 @@ enabled mmal  && { check_lib 
interface/mmal/mmal.h mmal_port_connect
check_lib interface/mmal/mmal.h 
mmal_port_connect ; } ||
   die "ERROR: mmal not found"; }
enabled mmal && check_func_headers interface/mmal/mmal.h 
"MMAL_PARAMETER_VIDEO_MAX_NUM_CALLBACKS"
-enabled omx_rpi && enable omx
-enabled omx   && { check_header OMX_Core.h ||
-{ ! enabled cross_compile && enabled omx_rpi 
&& {
-add_cflags -isystem/opt/vc/include/IL ; }
-check_header OMX_Core.h ; } ||
+enabled omx_rpi   && { check_header OMX_Core.h ||
+   { ! enabled cross_compile && add_cflags 
-isystem/opt/vc/include/IL && check_header OMX_Core.h ; } ||
   die "ERROR: OpenMAX IL headers not found"; }
+enabled omx   && { check_header OMX_Core.h || die "ERROR: OpenMAX IL 
headers not found"; }
enabled openssl   && { { check_pkg_config openssl openssl/ssl.h 
OPENSSL_init_ssl ||
 check_pkg_config openssl openssl/ssl.h 
SSL_library_init; } && {
   add_cflags $openssl_cflags && add_extralibs 
$openssl_libs; }||
--
2.1.4


I was almost about to say that it seems ok, but it fails one cruicial 
detail:


After running ./configure --enable-omx-rpi, I get the following in 
config.h:


#define CONFIG_OMX 1
#define CONFIG_OMX_RPI 1
#define CONFIG_H264_OMX_ENCODER 0
#define CONFIG_MPEG4_OMX_ENCODER 0

With the current master, they all end up as 1 as intended.

// Martin
___
libav-devel mailing list
libav-devel@libav.org
https://lists.libav.org/mailman/listinfo/libav-devel


Re: [libav-devel] [PATCH] configure: Simplify MMAL check

2016-12-04 Thread Martin Storsjö

On Sun, 4 Dec 2016, Diego Biurrun wrote:


---

Third time's the charm?

configure | 13 ++---
1 file changed, 6 insertions(+), 7 deletions(-)

diff --git a/configure b/configure
index 849226a..533a37a 100755
--- a/configure
+++ b/configure
@@ -4682,13 +4682,12 @@ enabled libx265   && require_pkg_config x265 x265.h 
x265_api_get &&
enabled libxavs   && require libxavs "stdint.h xavs.h" 
xavs_encoder_encode -lxavs
enabled libxvid   && require libxvid xvid.h xvid_global -lxvidcore
enabled mmal  && { check_lib interface/mmal/mmal.h 
mmal_port_connect -lmmal_core -lmmal_util -lmmal_vc_client -lbcm_host ||
-{ ! enabled cross_compile && {
-add_cflags -isystem/opt/vc/include/ 
-isystem/opt/vc/include/interface/vmcs_host/linux 
-isystem/opt/vc/include/interface/vcos/pthreads -fgnu89-inline ;
-add_extralibs -L/opt/vc/lib/ -lmmal_core 
-lmmal_util -lmmal_vc_client -lbcm_host ;
-check_lib interface/mmal/mmal.h 
mmal_port_connect ; }
-check_lib interface/mmal/mmal.h 
mmal_port_connect ; } ||
-   die "ERROR: mmal not found"; }
-enabled mmal && check_func_headers interface/mmal/mmal.h 
"MMAL_PARAMETER_VIDEO_MAX_NUM_CALLBACKS"
+   { ! enabled cross_compile &&
+ add_cflags -isystem/opt/vc/include/ 
-isystem/opt/vc/include/interface/vmcs_host/linux 
-isystem/opt/vc/include/interface/vcos/pthreads -fgnu89-inline &&
+ add_ldflags -L/opt/vc/lib/ &&
+ check_lib interface/mmal/mmal.h 
mmal_port_connect -lmmal_core -lmmal_util -lmmal_vc_client -lbcm_host; } ||
+   die "ERROR: mmal not found" &&
+   check_func_headers interface/mmal/mmal.h 
"MMAL_PARAMETER_VIDEO_MAX_NUM_CALLBACKS"; }
enabled omx   && { check_header OMX_Core.h || die "ERROR: OpenMAX IL 
headers not found"; }
enabled omx_rpi   && { check_header OMX_Core.h ||
   { ! enabled cross_compile && add_cflags 
-isystem/opt/vc/include/IL && check_header OMX_Core.h ; } ||
--
2.1.4


This seems to work as intended, so ok. (Tested in both cross setups with 
manually set paths, and directly on device with automatically added 
paths.)


// Martin
___
libav-devel mailing list
libav-devel@libav.org
https://lists.libav.org/mailman/listinfo/libav-devel


Re: [libav-devel] [PATCH] configure: Move COMPONENT_LIST to the bottom of CONFIG_LIST

2016-12-05 Thread Martin Storsjö

On Mon, 5 Dec 2016, Diego Biurrun wrote:


This ensures that dependencies are resolved correctly. COMPONENT_LIST
can contain parts that depend on previous entries of CONFIG_LIST.
---

This fixes the issue that Martin noticed with omx_rpi and the actual
omx encoders not getting enabled.

configure | 3 ++-
1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/configure b/configure
index b4680e4..52a09f7 100755
--- a/configure
+++ b/configure
@@ -1362,8 +1362,8 @@ SUBSYSTEM_LIST="
rdft
"

+# COMPONENT_LIST needs to be the last to ensure correct dependency checking
CONFIG_LIST="
-$COMPONENT_LIST
$EXAMPLE_LIST
$EXTERNAL_LIBRARY_LIST
$HWACCEL_LIBRARY_LIST
@@ -1379,6 +1379,7 @@ CONFIG_LIST="
thumb
valgrind_backtrace
xmm_clobber_test
+$COMPONENT_LIST
"

THREADS_LIST="
--
2.1.4


Seems to work for me

// Martin
___
libav-devel mailing list
libav-devel@libav.org
https://lists.libav.org/mailman/listinfo/libav-devel


Re: [libav-devel] [PATCH] configure: Simplify OMX check

2016-12-05 Thread Martin Storsjö

On Mon, 5 Dec 2016, Diego Biurrun wrote:


On Sun, Dec 04, 2016 at 11:20:51PM +0200, Martin Storsjö wrote:

On Sun, 4 Dec 2016, Diego Biurrun wrote:
>--- a/configure
>+++ b/configure
>@@ -2190,6 +2190,7 @@ nvenc_deps_any="dlopen LoadLibrary"
>nvenc_extralibs='$ldl'
>omx_deps="dlopen pthreads"
>omx_extralibs='$ldl'
>+omx_rpi_select="omx"
>qsvdec_select="qsv"
>qsvenc_select="qsv"
>vaapi_encode_deps="vaapi"
>@@ -4685,12 +4686,10 @@ enabled mmal  && { check_lib 
interface/mmal/mmal.h mmal_port_connect
>check_lib interface/mmal/mmal.h 
mmal_port_connect ; } ||
>   die "ERROR: mmal not found"; }
>enabled mmal && check_func_headers interface/mmal/mmal.h 
"MMAL_PARAMETER_VIDEO_MAX_NUM_CALLBACKS"
>-enabled omx_rpi && enable omx
>-enabled omx   && { check_header OMX_Core.h ||
>-{ ! enabled cross_compile && enabled omx_rpi 
&& {
>-add_cflags -isystem/opt/vc/include/IL ; }
>-check_header OMX_Core.h ; } ||
>+enabled omx_rpi   && { check_header OMX_Core.h ||
>+   { ! enabled cross_compile && add_cflags 
-isystem/opt/vc/include/IL && check_header OMX_Core.h ; } ||
>   die "ERROR: OpenMAX IL headers not found"; }
>+enabled omx   && { check_header OMX_Core.h || die "ERROR: OpenMAX IL 
headers not found"; }
>enabled openssl   && { { check_pkg_config openssl openssl/ssl.h 
OPENSSL_init_ssl ||
> check_pkg_config openssl openssl/ssl.h 
SSL_library_init; } && {
>   add_cflags $openssl_cflags && add_extralibs 
$openssl_libs; }||

I was almost about to say that it seems ok, but it fails one cruicial
detail:

After running ./configure --enable-omx-rpi, I get the following in config.h:

#define CONFIG_OMX 1
#define CONFIG_OMX_RPI 1
#define CONFIG_H264_OMX_ENCODER 0
#define CONFIG_MPEG4_OMX_ENCODER 0

With the current master, they all end up as 1 as intended.


This should not happen of course. I know the reason, this patch is the
symptom of the underlying problem. Patch coming up shortly.
Thanks for noticing.


This patch is ok with the other fix

// Martin
___
libav-devel mailing list
libav-devel@libav.org
https://lists.libav.org/mailman/listinfo/libav-devel

Re: [libav-devel] [PATCH] omx: Fix OOM check

2016-12-05 Thread Martin Storsjö

On Mon, 5 Dec 2016, Timothy Gu wrote:


Also use av_mallocz_array().

Found by Coverity in FFmpeg.
---
libavcodec/omx.c | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/libavcodec/omx.c b/libavcodec/omx.c
index 0c61c2f..05c8743 100644
--- a/libavcodec/omx.c
+++ b/libavcodec/omx.c
@@ -352,12 +352,12 @@ static av_cold int find_component(OMXContext 
*omx_context, void *logctx,
av_log(logctx, AV_LOG_WARNING, "No component for role %s found\n", 
role);
return AVERROR_ENCODER_NOT_FOUND;
}
-components = av_mallocz(sizeof(char*) * num);
+components = av_mallocz_array(num, sizeof(*components));
if (!components)
return AVERROR(ENOMEM);
for (i = 0; i < num; i++) {
components[i] = av_mallocz(OMX_MAX_STRINGNAME_SIZE);
-if (!components) {
+if (!components[i]) {
ret = AVERROR(ENOMEM);
goto end;
}
--
2.10.2


Thanks, this looks ok to me.

// Martin
___
libav-devel mailing list
libav-devel@libav.org
https://lists.libav.org/mailman/listinfo/libav-devel


Re: [libav-devel] [libav-commits] build: Move entries related to building TOOLS to a subdirectory Makefile

2016-12-07 Thread Martin Storsjö

On Wed, 7 Dec 2016, Diego Biurrun  wrote:


Module: libav
Branch: master
Commit: 3e105d08848162b90d886bde59c010d4b0362a4b

Author:Diego Biurrun 
Committer: Diego Biurrun 
Date:  Mon Dec  5 18:38:53 2016 +0100

build: Move entries related to building TOOLS to a subdirectory Makefile

---

Makefile   |8 +---
tools/Makefile |   11 +++
2 files changed, 12 insertions(+), 7 deletions(-)


Since this commit, a plain "make" does nothing - some other make target 
than "all" seems to be the first target?


Please fix.

// Martin
___
libav-devel mailing list
libav-devel@libav.org
https://lists.libav.org/mailman/listinfo/libav-devel


Re: [libav-devel] [PATCH 1/1] arm64: replace 'bic' with immediate with 'and' with inverted immediate

2016-12-08 Thread Martin Storsjö

On Thu, 8 Dec 2016, Janne Grunau wrote:


The former is not an official pseudo instruction although gas and llvm's
internal assembler support it. Fixes a build error with xcode 6.2
reported by Memphiz on github.
---
libavcodec/aarch64/synth_filter_neon.S | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/libavcodec/aarch64/synth_filter_neon.S 
b/libavcodec/aarch64/synth_filter_neon.S
index 9551bff8e3..b001c737da 100644
--- a/libavcodec/aarch64/synth_filter_neon.S
+++ b/libavcodec/aarch64/synth_filter_neon.S
@@ -50,7 +50,7 @@ function ff_synth_filter_float_neon, export=1
add x1,  x1,  x7,  lsl #2   // synth_buf
sub w8,  w7,  #32
stp x5,  x1,  [sp, #16]
-bic x7,  x7,  #63
+and x7,  x7,  #~63
and w8,  w8,  #511
stp x7,  x30, [sp, #32]
str w8,  [x2]
--
2.11.0


Ok

// Martin
___
libav-devel mailing list
libav-devel@libav.org
https://lists.libav.org/mailman/listinfo/libav-devel


Re: [libav-devel] [PATCH] configure: use -O1 with MSVC by default

2016-12-14 Thread Martin Storsjö

On Tue, 13 Dec 2016, Steve Lhomme wrote:


From: Steve Lhomme 

Otherwise some ARM and other unsupported CPU/OS is linked with -O0
---
configure | 1 +
1 file changed, 1 insertion(+)


I find the commit message quite hard to understand and easily misleading. 
Would this make more sense?


---8<---
configure: Use -O1 with MSVC for --disable-optimizations

Without any optimization flags, MSVC does no dead code elimination (DCE) 
at all, even for the most trivial cases. DCE is a prerequisite for 
building libav correctly, otherwise there are undefined references to 
functions for other architectures and disabled components.


-O1 is the minimal optimization flags for MSVC that does include DCE.
---8<---

// Martin
___
libav-devel mailing list
libav-devel@libav.org
https://lists.libav.org/mailman/listinfo/libav-devel


[libav-devel] [PATCH] libavformat: Fix a faulty api deprecation guard in prepare_input_packet

2016-12-14 Thread Martin Storsjö
This seems to have been added by mistake in 11de006b, by not
noticing the negation for the existing condition. This block does
not contain any code that accesses the codec field in AVStream.

This function is meant to serve as a complement to compute_pkt_fields2,
which is guarded by FF_API_COMPUTE_PKT_FIELDS2 && FF_API_LAVF_AVCTX.
---
 libavformat/mux.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/libavformat/mux.c b/libavformat/mux.c
index 37c4541..2561a6d 100644
--- a/libavformat/mux.c
+++ b/libavformat/mux.c
@@ -423,7 +423,7 @@ static int prepare_input_packet(AVFormatContext *s, 
AVPacket *pkt)
 if (ret < 0)
 return ret;
 
-#if !FF_API_COMPUTE_PKT_FIELDS2 && FF_API_LAVF_AVCTX
+#if !FF_API_COMPUTE_PKT_FIELDS2 || !FF_API_LAVF_AVCTX
 /* sanitize the timestamps */
 if (!(s->oformat->flags & AVFMT_NOTIMESTAMPS)) {
 AVStream *st = s->streams[pkt->stream_index];
-- 
2.10.1 (Apple Git-78)

___
libav-devel mailing list
libav-devel@libav.org
https://lists.libav.org/mailman/listinfo/libav-devel


[libav-devel] [PATCH] rtmppkt: Check for packet size mismatches

2016-12-15 Thread Martin Storsjö
From: Michael Niedermayer 

Fixes out of array access.

Found-by: Paul Cher 
Reviewed-by: Paul Cher 

CC: libav-sta...@libav.org
---
 libavformat/rtmppkt.c | 7 +++
 1 file changed, 7 insertions(+)

diff --git a/libavformat/rtmppkt.c b/libavformat/rtmppkt.c
index f8c51d0..373c3ea 100644
--- a/libavformat/rtmppkt.c
+++ b/libavformat/rtmppkt.c
@@ -235,6 +235,13 @@ static int rtmp_packet_read_one_chunk(URLContext *h, 
RTMPPacket *p,
 if (hdr != RTMP_PS_TWELVEBYTES)
 timestamp += prev_pkt[channel_id].timestamp;
 
+if (prev_pkt[channel_id].read && size != prev_pkt[channel_id].size) {
+av_log(h, AV_LOG_ERROR, "RTMP packet size mismatch %d != %d\n",
+size, prev_pkt[channel_id].size);
+ff_rtmp_packet_destroy(&prev_pkt[channel_id]);
+prev_pkt[channel_id].read = 0;
+}
+
 if (!prev_pkt[channel_id].read) {
 if ((ret = ff_rtmp_packet_create(p, channel_id, type, timestamp,
  size)) < 0)
-- 
2.10.1 (Apple Git-78)

___
libav-devel mailing list
libav-devel@libav.org
https://lists.libav.org/mailman/listinfo/libav-devel


Re: [libav-devel] [PATCH 2/2] hmac: Explicitly convert types at function pointer assignment

2016-12-15 Thread Martin Storsjö

On Thu, 15 Dec 2016, Diego Biurrun wrote:


Fixes a number of warnings of the type
libavutil/hmac.c:61:21: warning: assignment from incompatible pointer type
---

Now with the preliminaries requested by Anton.

libavutil/hmac.c | 30 ++
1 file changed, 18 insertions(+), 12 deletions(-)

diff --git a/libavutil/hmac.c b/libavutil/hmac.c
index 378be62..2f4fa52 100644
--- a/libavutil/hmac.c
+++ b/libavutil/hmac.c
@@ -18,6 +18,8 @@
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

+#include 
+#include 
#include 

#include "attributes.h"
@@ -29,12 +31,16 @@
#define MAX_HASHLEN 32
#define MAX_BLOCKLEN 64

+typedef void (*hmac_final)(void *ctx, uint8_t *dst);
+typedef void (*hmac_update)(void *ctx, const uint8_t *src, size_t len);
+typedef void (*hmac_init)(void *ctx);
+
struct AVHMAC {
void *hash;
int blocklen, hashlen;
-void (*final)(void*, uint8_t*);
-void (*update)(void*, const uint8_t*, int len);
-void (*init)(void*);
+hmac_final  final;
+hmac_update update;
+hmac_init   init;
uint8_t key[MAX_BLOCKLEN];
int keylen;
};
@@ -58,33 +64,33 @@ AVHMAC *av_hmac_alloc(enum AVHMACType type)
case AV_HMAC_MD5:
c->blocklen = 64;
c->hashlen  = 16;
-c->init = av_md5_init;
-c->update   = av_md5_update;
-c->final= av_md5_final;
+c->init = (hmac_init) av_md5_init;
+c->update   = (hmac_update) av_md5_update;
+c->final= (hmac_final) av_md5_final;
c->hash = av_md5_alloc();
break;


Until we actually do the major bump and change the parameter type, this is 
absolutely wrong.


Until the next major bump, the actual implementation of the function 
av_md5_update takes an int parameter, but here we'd ignore that and 
pretend that it takes a size_t. If we're unlucky, a size_t is passed 
differently as parameter than an (un)signed int on some architecture, and 
we'd have a complete breakage.


Yes it does give warnings right now, but the ABI of the functions at least 
match, contrary to what it would do after this patch. Additionally, this 
patch would hide any future ABI incompatibility completely by silencing 
all that with the cast.


// Martin
___
libav-devel mailing list
libav-devel@libav.org
https://lists.libav.org/mailman/listinfo/libav-devel


Re: [libav-devel] [PATCH] rtmppkt: Check for packet size mismatches

2016-12-15 Thread Martin Storsjö

On Thu, 15 Dec 2016, Luca Barbato wrote:


On 15/12/2016 09:12, Martin Storsjö wrote:

From: Michael Niedermayer 

Fixes out of array access.

Found-by: Paul Cher 
Reviewed-by: Paul Cher 

CC: libav-sta...@libav.org
---
 libavformat/rtmppkt.c | 7 +++
 1 file changed, 7 insertions(+)

diff --git a/libavformat/rtmppkt.c b/libavformat/rtmppkt.c
index f8c51d0..373c3ea 100644
--- a/libavformat/rtmppkt.c
+++ b/libavformat/rtmppkt.c
@@ -235,6 +235,13 @@ static int rtmp_packet_read_one_chunk(URLContext *h, 
RTMPPacket *p,
 if (hdr != RTMP_PS_TWELVEBYTES)
 timestamp += prev_pkt[channel_id].timestamp;

+if (prev_pkt[channel_id].read && size != prev_pkt[channel_id].size) {
+av_log(h, AV_LOG_ERROR, "RTMP packet size mismatch %d != %d\n",
+size, prev_pkt[channel_id].size);
+ff_rtmp_packet_destroy(&prev_pkt[channel_id]);
+prev_pkt[channel_id].read = 0;
+}
+
 if (!prev_pkt[channel_id].read) {
 if ((ret = ff_rtmp_packet_create(p, channel_id, type, timestamp,
  size)) < 0)



Why it happens?


When you have fragmented packets, the first packet declares the size and 
the later ones (normally) are small follow-on packets that don't repeat 
the size and all that. But technically the later fragments also can have a 
full header, declaring a different size than the previous packet. In those 
cases we didn't use to check that the partial packet that was allocated 
earlier actually matches the size in the current packet.



If it is ok not to forward an error the log message could be demoted to
verbose.


I guess we can and should return an error here directly as well.

// Martin
___
libav-devel mailing list
libav-devel@libav.org
https://lists.libav.org/mailman/listinfo/libav-devel


[libav-devel] [PATCH] rtmppkt: Check for packet size mismatches

2016-12-15 Thread Martin Storsjö
From: Michael Niedermayer 

When receiving fragmented packets, the first packet declares the size,
and the later ones normally are small follow-on packets that don't repeat
the size and the other header fields. But technically, the later fragments
also can have a full header, declaring a different size than the previous
packet.

If the follow-on packet declares a larger size than the initial one, we
could end up writing outside of the allocation.

This fixes out of bounds writes.

Found-by: Paul Cher 
Reviewed-by: Paul Cher 

CC: libav-sta...@libav.org
---
Now with error return and improved commit message.
---
 libavformat/rtmppkt.c | 8 
 1 file changed, 8 insertions(+)

diff --git a/libavformat/rtmppkt.c b/libavformat/rtmppkt.c
index f8c51d0..1cb3078 100644
--- a/libavformat/rtmppkt.c
+++ b/libavformat/rtmppkt.c
@@ -235,6 +235,14 @@ static int rtmp_packet_read_one_chunk(URLContext *h, 
RTMPPacket *p,
 if (hdr != RTMP_PS_TWELVEBYTES)
 timestamp += prev_pkt[channel_id].timestamp;
 
+if (prev_pkt[channel_id].read && size != prev_pkt[channel_id].size) {
+av_log(h, AV_LOG_ERROR, "RTMP packet size mismatch %d != %d\n",
+size, prev_pkt[channel_id].size);
+ff_rtmp_packet_destroy(&prev_pkt[channel_id]);
+prev_pkt[channel_id].read = 0;
+return AVERROR_INVALIDDATA;
+}
+
 if (!prev_pkt[channel_id].read) {
 if ((ret = ff_rtmp_packet_create(p, channel_id, type, timestamp,
  size)) < 0)
-- 
2.10.1 (Apple Git-78)

___
libav-devel mailing list
libav-devel@libav.org
https://lists.libav.org/mailman/listinfo/libav-devel


[libav-devel] [PATCH] aarch64: vp9itxfm: Use the offset parameter to movrel

2016-12-19 Thread Martin Storsjö
This fixes build failures for iOS, broken since cad42fadcd2c.
---
 libavcodec/aarch64/vp9itxfm_neon.S | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/libavcodec/aarch64/vp9itxfm_neon.S 
b/libavcodec/aarch64/vp9itxfm_neon.S
index 053d46f..7ce6df0 100644
--- a/libavcodec/aarch64/vp9itxfm_neon.S
+++ b/libavcodec/aarch64/vp9itxfm_neon.S
@@ -1108,7 +1108,7 @@ function ff_vp9_idct_idct_32x32_add_neon, export=1
 
 movrel  x10, idct_coeffs
 add x11, x10, #32
-movrel  x12, min_eob_idct_idct_32 + 2
+movrel  x12, min_eob_idct_idct_32, 2
 
 mov x15, x30
 
-- 
2.10.1 (Apple Git-78)

___
libav-devel mailing list
libav-devel@libav.org
https://lists.libav.org/mailman/listinfo/libav-devel


[libav-devel] Skipping checkasm on MSVC DLL setups?

2016-12-19 Thread Martin Storsjö

Hi,

It seems like checkasm has started failing to build in MSVC DLL setups 
recently, see e.g. this instance: 
https://fate.libav.org/i686-msvc-10-dll-wine


The reason is that once you build with --enable-shared with MSVC, you 
can't link those object files via static libraries any longer. (See 
d66c52c2b369401ba4face1c171ccb19130b7a31, that added support for MSVC 
built DLLs, for the full explanation.) And checkasm links to the libs 
statically in order to link to non-exported fucntions.


In practice, this hasn't been an issue so far, but since 
972c71e9cb63e24f57ee481e413199c7d88a8813, checkasm seems to pull in enough 
object files from libavcodec to start exhibiting the problem I explained 
in the commit when MSVC DLL support as added above. So it has only worked 
by coincidence so far.


We could of course try to figure out what part of the dsp object files 
that checkasm links suddenly pulls in bsf code in this link, but that'd 
also be just hacking around the problem.


Since checkasm in MSVC DLL builds isn't that important, since it's covered 
pretty well in other test setups anyway, I would suggest to just disable 
checkasm in this case. We easily have got CONFIG_SHARED available in 
makefile, but I guess we'd need to add something else to identify MSVC. Or 
just a "disable checkasm" or similar, somewhere in configure?


// Martin
___
libav-devel mailing list
libav-devel@libav.org
https://lists.libav.org/mailman/listinfo/libav-devel


[libav-devel] [PATCH] http: Check for negative chunk sizes

2016-12-19 Thread Martin Storsjö
A negative chunk size is illegal and would end up used as
length for memcpy, where it would lead to memory accesses
out of bounds.

Found-by: Paul Cher 

CC: libav-sta...@libav.org
---
 libavformat/http.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/libavformat/http.c b/libavformat/http.c
index 8fe8d11..7e3708e 100644
--- a/libavformat/http.c
+++ b/libavformat/http.c
@@ -784,6 +784,8 @@ static int http_read_stream(URLContext *h, uint8_t *buf, 
int size)
 
 av_log(NULL, AV_LOG_TRACE, "Chunked encoding data size: 
%"PRId64"'\n",
 s->chunksize);
+if (s->chunksize < 0)
+return AVERROR_INVALIDDATA;
 
 if (!s->chunksize)
 return 0;
-- 
2.10.1 (Apple Git-78)

___
libav-devel mailing list
libav-devel@libav.org
https://lists.libav.org/mailman/listinfo/libav-devel


Re: [libav-devel] [PATCH] http: Check for negative chunk sizes

2016-12-23 Thread Martin Storsjö

On Mon, 19 Dec 2016, Vittorio Giovara wrote:


On Mon, Dec 19, 2016 at 9:56 PM, Martin Storsjö  wrote:

A negative chunk size is illegal and would end up used as
length for memcpy, where it would lead to memory accesses
out of bounds.

Found-by: Paul Cher 

CC: libav-sta...@libav.org
---
 libavformat/http.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/libavformat/http.c b/libavformat/http.c
index 8fe8d11..7e3708e 100644
--- a/libavformat/http.c
+++ b/libavformat/http.c
@@ -784,6 +784,8 @@ static int http_read_stream(URLContext *h, uint8_t *buf, 
int size)

 av_log(NULL, AV_LOG_TRACE, "Chunked encoding data size: 
%"PRId64"'\n",
 s->chunksize);
+if (s->chunksize < 0)
+return AVERROR_INVALIDDATA;

 if (!s->chunksize)
 return 0;


This is mostly a nit, but would it make sense to coalesce the second
`if` into a `else if`?
Ok with me either way.


Pushed with this changed as suggested.

// Martin
___
libav-devel mailing list
libav-devel@libav.org
https://lists.libav.org/mailman/listinfo/libav-devel

[libav-devel] [PATCH] fate: Add --ignore-tests configure option for omitting specific FATE tests

2016-12-27 Thread Martin Storsjö
From: Janne Grunau 

This can be useful to filter out noise in known-broken scenarios like
miscompilation by legacy compilers and similar.

Originally based on a patch by Diego Biurrun.
---
This is the common agreed upon consensus from Diego's and Janne's
discussion, as far as I read it.
---
 configure | 11 +++
 doc/fate.texi |  1 +
 tests/Makefile|  4 +++-
 tests/fate-run.sh |  9 +++--
 tests/fate.sh |  1 +
 5 files changed, 23 insertions(+), 3 deletions(-)

diff --git a/configure b/configure
index 8e40238..8472145 100755
--- a/configure
+++ b/configure
@@ -348,6 +348,8 @@ Developer options (useful when working on Libav itself):
   --random-seed=VALUE  seed value for --enable/disable-random
   --disable-valgrind-backtrace do not print a backtrace under Valgrind
(only applies to --disable-optimizations builds)
+  --ignore-tests=TESTS comma-separated list (without "fate-" prefix
+   in the name) of tests which result is ignored
 
 NOTE: Object files are built at the place where configure is launched.
 EOF
@@ -1815,6 +1817,7 @@ CMDLINE_SET="
 host_ld
 host_ldflags
 host_os
+ignore_tests
 ld
 logfile
 malloc_prefix
@@ -5192,6 +5195,13 @@ for type in decoder encoder hwaccel parser demuxer muxer 
protocol filter bsf ind
 echo
 done
 
+if test -n "$ignore_tests"; then
+ignore_tests=$(echo $ignore_tests | tr ',' ' ')
+echo "Ignored FATE tests:"
+echo $ignore_tests | print_3_columns
+echo
+fi
+
 license="LGPL version 2.1 or later"
 if enabled nonfree; then
 license="nonfree and unredistributable"
@@ -5316,6 +5326,7 @@ SLIB_INSTALL_EXTRA_LIB=${SLIB_INSTALL_EXTRA_LIB}
 SLIB_INSTALL_EXTRA_SHLIB=${SLIB_INSTALL_EXTRA_SHLIB}
 VERSION_SCRIPT_POSTPROCESS_CMD=${VERSION_SCRIPT_POSTPROCESS_CMD}
 SAMPLES:=${samples:-\$(LIBAV_SAMPLES)}
+IGNORE_TESTS=$ignore_tests
 EOF
 
 map 'eval echo "${v}_FFLIBS=\$${v}_deps" >> avbuild/config.mak' $LIBRARY_LIST
diff --git a/doc/fate.texi b/doc/fate.texi
index 9e654e7..b1bfa2e 100644
--- a/doc/fate.texi
+++ b/doc/fate.texi
@@ -139,6 +139,7 @@ workdir=# directory in 
which to do all the work
 fate_recv="ssh -T fate@@fate.libav.org"  # command to submit report
 comment=# optional description
 build_only= # set to "yes" for a compile-only instance that skips tests
+ignore_tests=
 
 # the following are optional and map to configure options
 arch=
diff --git a/tests/Makefile b/tests/Makefile
index 0e475a2..30e06e8 100644
--- a/tests/Makefile
+++ b/tests/Makefile
@@ -143,11 +143,13 @@ endif
 
 FATE_UTILS = base64 tiny_psnr
 
+$(addprefix fate-, $(IGNORE_TESTS)): REPORT=ignore
+
 fate: $(FATE)
 
 $(FATE): $(FATE_UTILS:%=tests/%$(HOSTEXESUF))
@echo "TEST$(@:fate-%=%)"
-   $(Q)$(SRC_PATH)/tests/fate-run.sh $@ "$(SAMPLES)" "$(TARGET_EXEC)" 
"$(TARGET_PATH)" '$(CMD)' '$(CMP)' '$(REF)' '$(FUZZ)' '$(THREADS)' 
'$(THREAD_TYPE)' '$(CPUFLAGS)' '$(CMP_SHIFT)' '$(CMP_TARGET)' 
'$(SIZE_TOLERANCE)' '$(CMP_UNIT)' '$(GEN)' '$(HWACCEL)'
+   $(Q)$(SRC_PATH)/tests/fate-run.sh $@ "$(SAMPLES)" "$(TARGET_EXEC)" 
"$(TARGET_PATH)" '$(CMD)' '$(CMP)' '$(REF)' '$(FUZZ)' '$(THREADS)' 
'$(THREAD_TYPE)' '$(CPUFLAGS)' '$(CMP_SHIFT)' '$(CMP_TARGET)' 
'$(SIZE_TOLERANCE)' '$(CMP_UNIT)' '$(GEN)' '$(HWACCEL)' '$(REPORT)'
 
 fate-list:
@printf '%s\n' $(sort $(FATE))
diff --git a/tests/fate-run.sh b/tests/fate-run.sh
index b1b299a..3982a18 100755
--- a/tests/fate-run.sh
+++ b/tests/fate-run.sh
@@ -24,6 +24,7 @@ size_tolerance=${14:-0}
 cmp_unit=${15:-2}
 gen=${16:-no}
 hwaccel=${17:-none}
+report_type=${18:-standard}
 
 outdir="tests/data/fate"
 outfile="${outdir}/${test}"
@@ -212,13 +213,17 @@ if test -e "$ref" || test $cmp = "oneline" ; then
 esac
 cmperr=$?
 test $err = 0 && err=$cmperr
-test $err = 0 || cat $cmpfile
+if [ "$report_type" = "ignore" ]; then
+test $err = 0 || echo "ignoring fate-${test}" && err=0
+else
+test $err = 0 || cat $cmpfile
+fi
 else
 echo "reference file '$ref' not found"
 err=1
 fi
 
-if [ $err -eq 0 ]; then
+if [ $err -eq 0 ] && test $report_type = "standard" ; then
 unset cmpo erro
 else
 cmpo="$($base64 <$cmpfile)"
diff --git a/tests/fate.sh b/tests/fate.sh
index 4608d2d..c93e20a 100755
--- a/tests/fate.sh
+++ b/tests/fate.sh
@@ -47,6 +47,7 @@ configure()(
 --prefix="${inst}"  \
 --samples="${samples}"  \
 --enable-gpl\
+${ignore_tests:+--ignore-tests="$ignore_tests"} \
 ${arch:+--arch=$arch}   \
 ${cpu:+--cpu="$cpu"}\
 ${toolchain:+--toolchain="$toolchain"}  \
-- 
2.10.1 (Apple Git-78)

___

[libav-devel] [PATCH] fate: Unset the sig variable if ignoring a test failure

2016-12-28 Thread Martin Storsjö
Otherwise the .rep file would still contain a signal instead of a
zero, even if the process returned success.
---
Also, I think the fate- prefix should be omitted in the IGNORE line.
See https://fate.libav.org/i386-apple-darwin-gcc-4.0/20161228201333/test
for an example of what it looks like.

And instead of "IGNORE  " with double spaces, I think the more
correct/consistent thing to use would be a \t, to match what the makefiles
use for the CC etc lines.
---
 tests/fate-run.sh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/fate-run.sh b/tests/fate-run.sh
index 27cd626..e1aaf64 100755
--- a/tests/fate-run.sh
+++ b/tests/fate-run.sh
@@ -214,7 +214,7 @@ if test -e "$ref" || test $cmp = "oneline" ; then
 cmperr=$?
 test $err = 0 && err=$cmperr
 if [ "$report_type" = "ignore" ]; then
-test $err = 0 || echo "IGNORE  fate-${test}" && err=0
+test $err = 0 || echo "IGNORE  fate-${test}" && err=0 && unset sig
 else
 test $err = 0 || cat $cmpfile
 fi
-- 
2.10.1 (Apple Git-78)

___
libav-devel mailing list
libav-devel@libav.org
https://lists.libav.org/mailman/listinfo/libav-devel


[libav-devel] [PATCH] fate: Tweak printing of ignored tests

2016-12-29 Thread Martin Storsjö
Use a tab instead of two spaces, skip the fate prefix for the test name.

This makes IGNORE line fit in even better with the other make printouts.
---
 tests/fate-run.sh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/fate-run.sh b/tests/fate-run.sh
index e1aaf64..623fd63 100755
--- a/tests/fate-run.sh
+++ b/tests/fate-run.sh
@@ -214,7 +214,7 @@ if test -e "$ref" || test $cmp = "oneline" ; then
 cmperr=$?
 test $err = 0 && err=$cmperr
 if [ "$report_type" = "ignore" ]; then
-test $err = 0 || echo "IGNORE  fate-${test}" && err=0 && unset sig
+test $err = 0 || echo "IGNORE\t${test}" && err=0 && unset sig
 else
 test $err = 0 || cat $cmpfile
 fi
-- 
2.10.1 (Apple Git-78)

___
libav-devel mailing list
libav-devel@libav.org
https://lists.libav.org/mailman/listinfo/libav-devel


[libav-devel] [PATCH] fate: Skip the checkasm test if CONFIG_STATIC is disabled

2016-12-30 Thread Martin Storsjö
When building DLLs with MSVC, CONFIG_STATIC is disabled (see
d66c52c2b3694 for a more verbose explanation) since the built
object files can't be linked statically (which checkasm does).

This worked up until recently, only by luck.
---
 tests/fate/checkasm.mak | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/fate/checkasm.mak b/tests/fate/checkasm.mak
index 939eb56..5721f1f 100644
--- a/tests/fate/checkasm.mak
+++ b/tests/fate/checkasm.mak
@@ -19,5 +19,5 @@ $(FATE_CHECKASM): tests/checkasm/checkasm$(EXESUF)
 $(FATE_CHECKASM): CMD = run tests/checkasm/checkasm 
--test=$(@:fate-checkasm-%=%)
 $(FATE_CHECKASM): REF = /dev/null
 
-FATE += $(FATE_CHECKASM)
+FATE-$(CONFIG_STATIC) += $(FATE_CHECKASM)
 fate-checkasm: $(FATE_CHECKASM)
-- 
2.7.4

___
libav-devel mailing list
libav-devel@libav.org
https://lists.libav.org/mailman/listinfo/libav-devel


Re: [libav-devel] Skipping checkasm on MSVC DLL setups?

2016-12-31 Thread Martin Storsjö

On Sat, 31 Dec 2016, Diego Biurrun wrote:


Hmmm...

On Mon, Dec 19, 2016 at 11:51:36AM +0200, Martin Storsjö wrote:

It seems like checkasm has started failing to build in MSVC DLL setups
recently, see e.g. this instance:
https://fate.libav.org/i686-msvc-10-dll-wine

The reason is that once you build with --enable-shared with MSVC, you can't
link those object files via static libraries any longer. (See
d66c52c2b369401ba4face1c171ccb19130b7a31, that added support for MSVC built
DLLs, for the full explanation.) And checkasm links to the libs statically
in order to link to non-exported fucntions.

In practice, this hasn't been an issue so far, but since
972c71e9cb63e24f57ee481e413199c7d88a8813, checkasm seems to pull in enough
object files from libavcodec to start exhibiting the problem I explained in
the commit when MSVC DLL support as added above. So it has only worked by
coincidence so far.

We could of course try to figure out what part of the dsp object files that
checkasm links suddenly pulls in bsf code in this link, but that'd also be
just hacking around the problem.


I'm afraid we're hacking around the problem either way and the only proper
solution is to rid ourselves of avpriv_, which is basically the devil.


Yes. In this particular case, getting rid of the avpriv data symbols is 
enough; the avpriv functions are harmless wrt this.



Since checkasm in MSVC DLL builds isn't that important, since it's covered
pretty well in other test setups anyway, I would suggest to just disable
checkasm in this case. We easily have got CONFIG_SHARED available in
makefile, but I guess we'd need to add something else to identify MSVC. Or
just a "disable checkasm" or similar, somewhere in configure?


I pointed out how you could achieve this by adding a CONFIG_STATIC condition
in the right place, but what about the other TESTPROGS? The situation wrt
them seems equally brittle. At any point in the future some avpriv_ symbol
might get pulled in and break linking them. Disabling all the tests that
use TESTPROGS also seems undesirable since we're speaking of 200+ tests.
The seek tests are among them...


I don't see that as being too bad. Given the choice between removing these 
fate instances altogether (or, playing devil's advocate even further, 
removing the MSVC DLL support altogether, since it's something of a hack 
wrt avpriv data), or disabling a couple hundred of the tests, I much 
prefer disabling these tests.


We've got corresponding fate instances that test the library internals in 
general, thoroughly, without building as DLLs. When building as DLLs, the 
only thing that you really can test is avconv.exe.



This looks like a dilemma where all the alternatives are hacks to me...


Well, partially yes. But the patch I posted based on your suggestion isn't 
all that wrong either (although incomplete wrt testprogs); if 
CONFIG_STATIC is disabled, one probably shouldn't try to link checkasm 
(and the others).


Basically, as far as I know, the only fate instances with CONFIG_STATIC 
disabled would be my MSVC DLL ones.


It would probably hit people building with --disable-static (wanting to 
redistribute only e.g. shared libs), making a "make check" in their builds 
test things a little less thoroughly, but I would see that as an 
acceptable compromise.


// Martin
___
libav-devel mailing list
libav-devel@libav.org
https://lists.libav.org/mailman/listinfo/libav-devel

[libav-devel] [PATCH 4/6] aarch64: vp9mc: Simplify the extmla macro parameters

2017-01-02 Thread Martin Storsjö
Fold the field lengths into the macro.

This makes the macro invocations much more readable, when the
lines are shorter.

This also makes it easier to use only half the registers within
the macro.
---
 libavcodec/aarch64/vp9mc_neon.S | 50 -
 1 file changed, 25 insertions(+), 25 deletions(-)

diff --git a/libavcodec/aarch64/vp9mc_neon.S b/libavcodec/aarch64/vp9mc_neon.S
index c1f1876..99f1809 100644
--- a/libavcodec/aarch64/vp9mc_neon.S
+++ b/libavcodec/aarch64/vp9mc_neon.S
@@ -193,41 +193,41 @@ endfunc
 // for size >= 16), and multiply-accumulate into dst1 and dst3 (or
 // dst1-dst2 and dst3-dst4 for size >= 16)
 .macro extmla dst1, dst2, dst3, dst4, src1, src2, src3, src4, src5, src6, 
offset, size
-ext v20.16b, \src1, \src2, #(2*\offset)
-ext v22.16b, \src4, \src5, #(2*\offset)
+ext v20.16b, \src1\().16b, \src2\().16b, #(2*\offset)
+ext v22.16b, \src4\().16b, \src5\().16b, #(2*\offset)
 .if \size >= 16
-mla \dst1, v20.8h, v0.h[\offset]
-ext v21.16b, \src2, \src3, #(2*\offset)
-mla \dst3, v22.8h, v0.h[\offset]
-ext v23.16b, \src5, \src6, #(2*\offset)
-mla \dst2, v21.8h, v0.h[\offset]
-mla \dst4, v23.8h, v0.h[\offset]
+mla \dst1\().8h, v20.8h, v0.h[\offset]
+ext v21.16b, \src2\().16b, \src3\().16b, #(2*\offset)
+mla \dst3\().8h, v22.8h, v0.h[\offset]
+ext v23.16b, \src5\().16b, \src6\().16b, #(2*\offset)
+mla \dst2\().8h, v21.8h, v0.h[\offset]
+mla \dst4\().8h, v23.8h, v0.h[\offset]
 .else
-mla \dst1, v20.8h, v0.h[\offset]
-mla \dst3, v22.8h, v0.h[\offset]
+mla \dst1\().8h, v20.8h, v0.h[\offset]
+mla \dst3\().8h, v22.8h, v0.h[\offset]
 .endif
 .endm
 // The same as above, but don't accumulate straight into the
 // destination, but use a temp register and accumulate with saturation.
 .macro extmulqadd dst1, dst2, dst3, dst4, src1, src2, src3, src4, src5, src6, 
offset, size
-ext v20.16b, \src1, \src2, #(2*\offset)
-ext v22.16b, \src4, \src5, #(2*\offset)
+ext v20.16b, \src1\().16b, \src2\().16b, #(2*\offset)
+ext v22.16b, \src4\().16b, \src5\().16b, #(2*\offset)
 .if \size >= 16
 mul v20.8h, v20.8h, v0.h[\offset]
-ext v21.16b, \src2, \src3, #(2*\offset)
+ext v21.16b, \src2\().16b, \src3\().16b, #(2*\offset)
 mul v22.8h, v22.8h, v0.h[\offset]
-ext v23.16b, \src5, \src6, #(2*\offset)
+ext v23.16b, \src5\().16b, \src6\().16b, #(2*\offset)
 mul v21.8h, v21.8h, v0.h[\offset]
 mul v23.8h, v23.8h, v0.h[\offset]
 .else
 mul v20.8h, v20.8h, v0.h[\offset]
 mul v22.8h, v22.8h, v0.h[\offset]
 .endif
-sqadd   \dst1, \dst1, v20.8h
-sqadd   \dst3, \dst3, v22.8h
+sqadd   \dst1\().8h, \dst1\().8h, v20.8h
+sqadd   \dst3\().8h, \dst3\().8h, v22.8h
 .if \size >= 16
-sqadd   \dst2, \dst2, v21.8h
-sqadd   \dst4, \dst4, v23.8h
+sqadd   \dst2\().8h, \dst2\().8h, v21.8h
+sqadd   \dst4\().8h, \dst4\().8h, v23.8h
 .endif
 .endm
 
@@ -292,13 +292,13 @@ function \type\()_8tap_\size\()h_\idx1\idx2
 mul v2.8h,  v5.8h,  v0.h[0]
 mul v25.8h, v17.8h, v0.h[0]
 .endif
-extmla  v1.8h,  v2.8h,  v24.8h, v25.8h, v4.16b,  v5.16b,  
v6.16b,  v16.16b, v17.16b, v18.16b, 1, \size
-extmla  v1.8h,  v2.8h,  v24.8h, v25.8h, v4.16b,  v5.16b,  
v6.16b,  v16.16b, v17.16b, v18.16b, 2, \size
-extmla  v1.8h,  v2.8h,  v24.8h, v25.8h, v4.16b,  v5.16b,  
v6.16b,  v16.16b, v17.16b, v18.16b, \idx1, \size
-extmla  v1.8h,  v2.8h,  v24.8h, v25.8h, v4.16b,  v5.16b,  
v6.16b,  v16.16b, v17.16b, v18.16b, 5, \size
-extmla  v1.8h,  v2.8h,  v24.8h, v25.8h, v4.16b,  v5.16b,  
v6.16b,  v16.16b, v17.16b, v18.16b, 6, \size
-extmla  v1.8h,  v2.8h,  v24.8h, v25.8h, v4.16b,  v5.16b,  
v6.16b,  v16.16b, v17.16b, v18.16b, 7, \size
-extmulqadd  v1.8h,  v2.8h,  v24.8h, v25.8h, v4.16b,  v5.16b,  
v6.16b,  v16.16b, v17.16b, v18.16b, \idx2, \size
+extmla  v1,  v2,  v24, v25, v4,  v5,  v6,  v16, v17, v18, 1,   
  \size
+extmla  v1,  v2,  v24, v25, v4,  v5,  v6,  v16, v17, v18, 2,   
  \size
+extmla  v1,  v2,  v24, v25, v4,  v5,  v6,  v16, v17, v18, 
\idx1, \size
+extmla  v1,  v2,  v24, v25, v4,  v5,  v6,  v16, v17, v18, 5,   
  \s

[libav-devel] [PATCH 2/6] aarch64: vp9dsp: Fix vertical alignment in the init file

2017-01-02 Thread Martin Storsjö
---
 libavcodec/aarch64/vp9dsp_init_aarch64.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/libavcodec/aarch64/vp9dsp_init_aarch64.c 
b/libavcodec/aarch64/vp9dsp_init_aarch64.c
index 486ae1e..2ac9e19 100644
--- a/libavcodec/aarch64/vp9dsp_init_aarch64.c
+++ b/libavcodec/aarch64/vp9dsp_init_aarch64.c
@@ -125,7 +125,7 @@ static av_cold void vp9dsp_mc_init_aarch64(VP9DSPContext 
*dsp)
 #define init_mc_func(idx1, idx2, op, filter, fname, dir, mx, my, sz, pfx) \
 dsp->mc[idx1][filter][idx2][mx][my] = pfx##op##_##fname##sz##_##dir##_neon
 
-#define init_mc_funcs(idx, dir, mx, my, sz, pfx) \
+#define init_mc_funcs(idx, dir, mx, my, sz, pfx)   
\
 init_mc_func(idx, 0, put, FILTER_8TAP_REGULAR, regular, dir, mx, my, sz, 
pfx); \
 init_mc_func(idx, 0, put, FILTER_8TAP_SHARP,   sharp,   dir, mx, my, sz, 
pfx); \
 init_mc_func(idx, 0, put, FILTER_8TAP_SMOOTH,  smooth,  dir, mx, my, sz, 
pfx); \
@@ -133,7 +133,7 @@ static av_cold void vp9dsp_mc_init_aarch64(VP9DSPContext 
*dsp)
 init_mc_func(idx, 1, avg, FILTER_8TAP_SHARP,   sharp,   dir, mx, my, sz, 
pfx); \
 init_mc_func(idx, 1, avg, FILTER_8TAP_SMOOTH,  smooth,  dir, mx, my, sz, 
pfx)
 
-#define init_mc_funcs_dirs(idx, sz) \
+#define init_mc_funcs_dirs(idx, sz)\
 init_mc_funcs(idx, h,  1, 0, sz, ff_vp9_); \
 init_mc_funcs(idx, v,  0, 1, sz, ff_vp9_); \
 init_mc_funcs(idx, hv, 1, 1, sz,)
-- 
2.7.4

___
libav-devel mailing list
libav-devel@libav.org
https://lists.libav.org/mailman/listinfo/libav-devel


[libav-devel] [PATCH 6/6] aarch64: vp9mc: Calculate less unused data in the 4 pixel wide horizontal filter

2017-01-02 Thread Martin Storsjö
No measured speedup on an Cortex A53, but other cores might benefit.
---
 libavcodec/aarch64/vp9mc_neon.S | 15 +--
 1 file changed, 13 insertions(+), 2 deletions(-)

diff --git a/libavcodec/aarch64/vp9mc_neon.S b/libavcodec/aarch64/vp9mc_neon.S
index 99f1809..95ed26c 100644
--- a/libavcodec/aarch64/vp9mc_neon.S
+++ b/libavcodec/aarch64/vp9mc_neon.S
@@ -202,9 +202,12 @@ endfunc
 ext v23.16b, \src5\().16b, \src6\().16b, #(2*\offset)
 mla \dst2\().8h, v21.8h, v0.h[\offset]
 mla \dst4\().8h, v23.8h, v0.h[\offset]
-.else
+.elseif \size == 8
 mla \dst1\().8h, v20.8h, v0.h[\offset]
 mla \dst3\().8h, v22.8h, v0.h[\offset]
+.else
+mla \dst1\().4h, v20.4h, v0.h[\offset]
+mla \dst3\().4h, v22.4h, v0.h[\offset]
 .endif
 .endm
 // The same as above, but don't accumulate straight into the
@@ -219,16 +222,24 @@ endfunc
 ext v23.16b, \src5\().16b, \src6\().16b, #(2*\offset)
 mul v21.8h, v21.8h, v0.h[\offset]
 mul v23.8h, v23.8h, v0.h[\offset]
-.else
+.elseif \size == 8
 mul v20.8h, v20.8h, v0.h[\offset]
 mul v22.8h, v22.8h, v0.h[\offset]
+.else
+mul v20.4h, v20.4h, v0.h[\offset]
+mul v22.4h, v22.4h, v0.h[\offset]
 .endif
+.if \size == 4
+sqadd   \dst1\().4h, \dst1\().4h, v20.4h
+sqadd   \dst3\().4h, \dst3\().4h, v22.4h
+.else
 sqadd   \dst1\().8h, \dst1\().8h, v20.8h
 sqadd   \dst3\().8h, \dst3\().8h, v22.8h
 .if \size >= 16
 sqadd   \dst2\().8h, \dst2\().8h, v21.8h
 sqadd   \dst4\().8h, \dst4\().8h, v23.8h
 .endif
+.endif
 .endm
 
 
-- 
2.7.4

___
libav-devel mailing list
libav-devel@libav.org
https://lists.libav.org/mailman/listinfo/libav-devel


[libav-devel] [PATCH 5/6] arm: vp9mc: Calculate less unused data in the 4 pixel wide horizontal filter

2017-01-02 Thread Martin Storsjö
Before:Cortex A7  A8 A9 A53
vp9_put_8tap_smooth_4h_neon:   378.1   273.2  340.7   229.5
After:
vp9_put_8tap_smooth_4h_neon:   352.1   222.2  290.5   229.5
---
 libavcodec/arm/vp9mc_neon.S | 33 ++---
 1 file changed, 22 insertions(+), 11 deletions(-)

diff --git a/libavcodec/arm/vp9mc_neon.S b/libavcodec/arm/vp9mc_neon.S
index a5413a3..8d43ff1 100644
--- a/libavcodec/arm/vp9mc_neon.S
+++ b/libavcodec/arm/vp9mc_neon.S
@@ -209,7 +209,7 @@ endfunc
 @ Extract a vector from src1-src2 and src4-src5 (src1-src3 and src4-src6
 @ for size >= 16), and multiply-accumulate into dst1 and dst3 (or
 @ dst1-dst2 and dst3-dst4 for size >= 16)
-.macro extmla dst1, dst2, dst3, dst4, src1, src2, src3, src4, src5, src6, 
offset, size
+.macro extmla dst1, dst2, dst3, dst4, dst1d, dst3d, src1, src2, src3, src4, 
src5, src6, offset, size
 vext.8  q14, \src1, \src2, #(2*\offset)
 vext.8  q15, \src4, \src5, #(2*\offset)
 .if \size >= 16
@@ -219,14 +219,17 @@ endfunc
 vext.8  q6,  \src5, \src6, #(2*\offset)
 vmla_lane   \dst2,  q5,  \offset
 vmla_lane   \dst4,  q6,  \offset
-.else
+.elseif \size == 8
 vmla_lane   \dst1,  q14, \offset
 vmla_lane   \dst3,  q15, \offset
+.else
+vmla_lane   \dst1d, d28, \offset
+vmla_lane   \dst3d, d30, \offset
 .endif
 .endm
 @ The same as above, but don't accumulate straight into the
 @ destination, but use a temp register and accumulate with saturation.
-.macro extmulqadd dst1, dst2, dst3, dst4, src1, src2, src3, src4, src5, src6, 
offset, size
+.macro extmulqadd dst1, dst2, dst3, dst4, dst1d, dst3d, src1, src2, src3, 
src4, src5, src6, offset, size
 vext.8  q14, \src1, \src2, #(2*\offset)
 vext.8  q15, \src4, \src5, #(2*\offset)
 .if \size >= 16
@@ -236,16 +239,24 @@ endfunc
 vext.8  q6,  \src5, \src6, #(2*\offset)
 vmul_lane   q5,  q5,  \offset
 vmul_lane   q6,  q6,  \offset
-.else
+.elseif \size == 8
 vmul_lane   q14, q14, \offset
 vmul_lane   q15, q15, \offset
+.else
+vmul_lane   d28, d28, \offset
+vmul_lane   d30, d30, \offset
 .endif
+.if \size == 4
+vqadd.s16   \dst1d, \dst1d, d28
+vqadd.s16   \dst3d, \dst3d, d30
+.else
 vqadd.s16   \dst1,  \dst1,  q14
 vqadd.s16   \dst3,  \dst3,  q15
 .if \size >= 16
 vqadd.s16   \dst2,  \dst2,  q5
 vqadd.s16   \dst4,  \dst4,  q6
 .endif
+.endif
 .endm
 
 
@@ -309,13 +320,13 @@ function \type\()_8tap_\size\()h_\idx1\idx2
 vmul.s16q2,  q9,  d0[0]
 vmul.s16q4,  q12, d0[0]
 .endif
-extmla  q1,  q2,  q3,  q4,  q8,  q9,  q10,  q11, q12, q13, 1,  
   \size
-extmla  q1,  q2,  q3,  q4,  q8,  q9,  q10,  q11, q12, q13, 2,  
   \size
-extmla  q1,  q2,  q3,  q4,  q8,  q9,  q10,  q11, q12, q13, 
\idx1, \size
-extmla  q1,  q2,  q3,  q4,  q8,  q9,  q10,  q11, q12, q13, 5,  
   \size
-extmla  q1,  q2,  q3,  q4,  q8,  q9,  q10,  q11, q12, q13, 6,  
   \size
-extmla  q1,  q2,  q3,  q4,  q8,  q9,  q10,  q11, q12, q13, 7,  
   \size
-extmulqadd  q1,  q2,  q3,  q4,  q8,  q9,  q10,  q11, q12, q13, 
\idx2, \size
+extmla  q1,  q2,  q3,  q4,  d2,  d6,  q8,  q9,  q10, q11, q12, 
q13, 1, \size
+extmla  q1,  q2,  q3,  q4,  d2,  d6,  q8,  q9,  q10, q11, q12, 
q13, 2, \size
+extmla  q1,  q2,  q3,  q4,  d2,  d6,  q8,  q9,  q10, q11, q12, 
q13, \idx1, \size
+extmla  q1,  q2,  q3,  q4,  d2,  d6,  q8,  q9,  q10, q11, q12, 
q13, 5, \size
+extmla  q1,  q2,  q3,  q4,  d2,  d6,  q8,  q9,  q10, q11, q12, 
q13, 6, \size
+extmla  q1,  q2,  q3,  q4,  d2,  d6,  q8,  q9,  q10, q11, q12, 
q13, 7, \size
+extmulqadd  q1,  q2,  q3,  q4,  d2,  d6,  q8,  q9,  q10, q11, q12, 
q13, \idx2, \size
 
 @ Round, shift and saturate
 vqrshrun.s16d2,  q1,  #7
-- 
2.7.4

___
libav-devel mailing list
libav-devel@libav.org
https://lists.libav.org/mailman/listinfo/libav-devel


[libav-devel] [PATCH 1/6] arm: vp9mc: Fix vertical alignment of operands

2017-01-02 Thread Martin Storsjö
---
 libavcodec/arm/vp9mc_neon.S | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/libavcodec/arm/vp9mc_neon.S b/libavcodec/arm/vp9mc_neon.S
index d3d2c7f..a5413a3 100644
--- a/libavcodec/arm/vp9mc_neon.S
+++ b/libavcodec/arm/vp9mc_neon.S
@@ -79,7 +79,7 @@ function ff_vp9_avg32_neon, export=1
 vrhadd.u8   q0,  q0,  q2
 vrhadd.u8   q1,  q1,  q3
 subsr12, r12, #1
-vst1.8  {q0, q1},  [r0, :128], r1
+vst1.8  {q0,  q1},  [r0, :128], r1
 bne 1b
 bx  lr
 endfunc
@@ -408,7 +408,7 @@ function ff_vp9_\type\()_\filter\()\size\()_h_neon, export=1
 add r12, r12, 120*\offset - 8
 cmp r5,  #8
 add r12, r12, r5, lsl #3
-mov r5, #\size
+mov r5,  #\size
 .if \size >= 16
 bge \type\()_8tap_16h_34
 b   \type\()_8tap_16h_43
@@ -543,7 +543,7 @@ function \type\()_8tap_8v_\idx1\idx2
 vld1.8  {d0},  [r12, :64]
 vmovl.s8q0,  d0
 1:
-mov r12,  r4
+mov r12, r4
 
 loadl   q5,  q6,  q7
 loadl   q8,  q9,  q10, q11
-- 
2.7.4

___
libav-devel mailing list
libav-devel@libav.org
https://lists.libav.org/mailman/listinfo/libav-devel


[libav-devel] [PATCH 3/6] aarch64: vp9mc: Fix a comment to refer to a register with the right name

2017-01-02 Thread Martin Storsjö
---
 libavcodec/aarch64/vp9mc_neon.S | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/libavcodec/aarch64/vp9mc_neon.S b/libavcodec/aarch64/vp9mc_neon.S
index 720273b..c1f1876 100644
--- a/libavcodec/aarch64/vp9mc_neon.S
+++ b/libavcodec/aarch64/vp9mc_neon.S
@@ -250,7 +250,7 @@ function \type\()_8tap_\size\()h_\idx1\idx2
 .if \size >= 16
 sub x1,  x1,  x5
 .endif
-// size >= 16 loads two qwords and increments r2,
+// size >= 16 loads two qwords and increments x2,
 // for size 4/8 it's enough with one qword and no
 // postincrement
 .if \size >= 16
-- 
2.7.4

___
libav-devel mailing list
libav-devel@libav.org
https://lists.libav.org/mailman/listinfo/libav-devel


[libav-devel] [PATCH 1/2] arm: vp9itxfm: Optimize 16x16 and 32x32 idct dc by unrolling

2017-01-04 Thread Martin Storsjö
This work is sponsored by, and copyright, Google.

Before:Cortex A7  A8  A9 A53
vp9_inv_dct_dct_16x16_sub1_add_neon:   273.0   189.5   211.7   235.8
vp9_inv_dct_dct_32x32_sub1_add_neon:   752.0   459.2   862.2   553.9
After:
vp9_inv_dct_dct_16x16_sub1_add_neon:   226.5   145.0   225.1   171.8
vp9_inv_dct_dct_32x32_sub1_add_neon:   721.2   415.7   727.6   475.0
---
 libavcodec/arm/vp9itxfm_neon.S | 54 --
 1 file changed, 36 insertions(+), 18 deletions(-)

diff --git a/libavcodec/arm/vp9itxfm_neon.S b/libavcodec/arm/vp9itxfm_neon.S
index 5abe435..a81240b 100644
--- a/libavcodec/arm/vp9itxfm_neon.S
+++ b/libavcodec/arm/vp9itxfm_neon.S
@@ -518,16 +518,23 @@ function idct16x16_dc_add_neon
 
 vrshr.s16   q8,  q8,  #6
 
+mov r3,  r0
 mov r12, #16
 1:
 @ Loop to add the constant from q8 into all 16x16 outputs
-vld1.8  {q3},  [r0,:128]
-vaddw.u8q10, q8,  d6
-vaddw.u8q11, q8,  d7
-vqmovun.s16 d6,  q10
-vqmovun.s16 d7,  q11
-vst1.8  {q3},  [r0,:128], r1
-subsr12, r12, #1
+subsr12, r12, #2
+vld1.8  {q2},  [r0,:128], r1
+vaddw.u8q10, q8,  d4
+vld1.8  {q3},  [r0,:128], r1
+vaddw.u8q11, q8,  d5
+vaddw.u8q12, q8,  d6
+vaddw.u8q13, q8,  d7
+vqmovun.s16 d4,  q10
+vqmovun.s16 d5,  q11
+vqmovun.s16 d6,  q12
+vst1.8  {q2},  [r3,:128], r1
+vqmovun.s16 d7,  q13
+vst1.8  {q3},  [r3,:128], r1
 bne 1b
 
 bx  lr
@@ -889,20 +896,31 @@ function idct32x32_dc_add_neon
 
 vrshr.s16   q8,  q8,  #6
 
+mov r3,  r0
 mov r12, #32
 1:
 @ Loop to add the constant from q8 into all 32x32 outputs
-vld1.8  {q2-q3},  [r0,:128]
-vaddw.u8q10, q8,  d4
-vaddw.u8q11, q8,  d5
-vaddw.u8q12, q8,  d6
-vaddw.u8q13, q8,  d7
-vqmovun.s16 d4,  q10
-vqmovun.s16 d5,  q11
-vqmovun.s16 d6,  q12
-vqmovun.s16 d7,  q13
-vst1.8  {q2-q3},  [r0,:128], r1
-subsr12, r12, #1
+subsr12, r12, #2
+vld1.8  {q0-q1},  [r0,:128], r1
+vaddw.u8q9,  q8,  d0
+vaddw.u8q10, q8,  d1
+vld1.8  {q2-q3},  [r0,:128], r1
+vaddw.u8q11, q8,  d2
+vaddw.u8q12, q8,  d3
+vaddw.u8q13, q8,  d4
+vaddw.u8q14, q8,  d5
+vaddw.u8q15, q8,  d6
+vqmovun.s16 d0,  q9
+vaddw.u8q9,  q8,  d7
+vqmovun.s16 d1,  q10
+vqmovun.s16 d2,  q11
+vqmovun.s16 d3,  q12
+vqmovun.s16 d4,  q13
+vqmovun.s16 d5,  q14
+vst1.8  {q0-q1},  [r3,:128], r1
+vqmovun.s16 d6,  q15
+vqmovun.s16 d7,  q9
+vst1.8  {q2-q3},  [r3,:128], r1
 bne 1b
 
 bx  lr
-- 
2.7.4

___
libav-devel mailing list
libav-devel@libav.org
https://lists.libav.org/mailman/listinfo/libav-devel

[libav-devel] [PATCH 2/2] aarch64: vp9itxfm: Optimize 16x16 and 32x32 idct dc by unrolling

2017-01-04 Thread Martin Storsjö
This work is sponsored by, and copyright, Google.

Before:   Cortex A53
vp9_inv_dct_dct_16x16_sub1_add_neon:   235.3
vp9_inv_dct_dct_32x32_sub1_add_neon:   555.1
After:
vp9_inv_dct_dct_16x16_sub1_add_neon:   180.2
vp9_inv_dct_dct_32x32_sub1_add_neon:   475.3
---
 libavcodec/aarch64/vp9itxfm_neon.S | 54 +-
 1 file changed, 36 insertions(+), 18 deletions(-)

diff --git a/libavcodec/aarch64/vp9itxfm_neon.S 
b/libavcodec/aarch64/vp9itxfm_neon.S
index 7ce6df0..a4284e3 100644
--- a/libavcodec/aarch64/vp9itxfm_neon.S
+++ b/libavcodec/aarch64/vp9itxfm_neon.S
@@ -448,16 +448,23 @@ function idct16x16_dc_add_neon
 
 srshr   v2.8h, v2.8h, #6
 
+mov x3, x0
 mov x4, #16
 1:
 // Loop to add the constant from v2 into all 16x16 outputs
-ld1 {v3.16b},  [x0]
-uaddw   v4.8h,  v2.8h,  v3.8b
-uaddw2  v5.8h,  v2.8h,  v3.16b
-sqxtun  v4.8b,  v4.8h
-sqxtun2 v4.16b, v5.8h
-st1 {v4.16b},  [x0], x1
-subsx4,  x4,  #1
+subsx4,  x4,  #2
+ld1 {v3.16b},  [x0], x1
+ld1 {v4.16b},  [x0], x1
+uaddw   v16.8h, v2.8h,  v3.8b
+uaddw2  v17.8h, v2.8h,  v3.16b
+uaddw   v18.8h, v2.8h,  v4.8b
+uaddw2  v19.8h, v2.8h,  v4.16b
+sqxtun  v3.8b,  v16.8h
+sqxtun2 v3.16b, v17.8h
+sqxtun  v4.8b,  v18.8h
+sqxtun2 v4.16b, v19.8h
+st1 {v3.16b},  [x3], x1
+st1 {v4.16b},  [x3], x1
 b.ne1b
 
 ret
@@ -824,20 +831,31 @@ function idct32x32_dc_add_neon
 
 srshr   v0.8h, v2.8h, #6
 
+mov x3, x0
 mov x4, #32
 1:
 // Loop to add the constant v0 into all 32x32 outputs
-ld1 {v1.16b,v2.16b},  [x0]
-uaddw   v3.8h,  v0.8h,  v1.8b
-uaddw2  v4.8h,  v0.8h,  v1.16b
-uaddw   v5.8h,  v0.8h,  v2.8b
-uaddw2  v6.8h,  v0.8h,  v2.16b
-sqxtun  v3.8b,  v3.8h
-sqxtun2 v3.16b, v4.8h
-sqxtun  v4.8b,  v5.8h
-sqxtun2 v4.16b, v6.8h
-st1 {v3.16b,v4.16b},  [x0], x1
-subsx4,  x4,  #1
+subsx4,  x4,  #2
+ld1 {v1.16b,v2.16b},  [x0], x1
+uaddw   v16.8h, v0.8h,  v1.8b
+uaddw2  v17.8h, v0.8h,  v1.16b
+ld1 {v3.16b,v4.16b},  [x0], x1
+uaddw   v18.8h, v0.8h,  v2.8b
+uaddw2  v19.8h, v0.8h,  v2.16b
+uaddw   v20.8h, v0.8h,  v3.8b
+uaddw2  v21.8h, v0.8h,  v3.16b
+uaddw   v22.8h, v0.8h,  v4.8b
+uaddw2  v23.8h, v0.8h,  v4.16b
+sqxtun  v1.8b,  v16.8h
+sqxtun2 v1.16b, v17.8h
+sqxtun  v2.8b,  v18.8h
+sqxtun2 v2.16b, v19.8h
+sqxtun  v3.8b,  v20.8h
+sqxtun2 v3.16b, v21.8h
+st1 {v1.16b,v2.16b},  [x3], x1
+sqxtun  v4.8b,  v22.8h
+sqxtun2 v4.16b, v23.8h
+st1 {v3.16b,v4.16b},  [x3], x1
 b.ne1b
 
 ret
-- 
2.7.4

___
libav-devel mailing list
libav-devel@libav.org
https://lists.libav.org/mailman/listinfo/libav-devel

Re: [libav-devel] [PATCH 11/11] rtp: Convert to the new bitstream reader

2017-01-13 Thread Martin Storsjö

On Fri, 13 Jan 2017, Anton Khirnov wrote:


Quoting Diego Biurrun (2017-01-11 18:10:08)

From: Alexandra Hájková 

---
 libavformat/rtpdec_h261.c | 19 ++-
 libavformat/rtpdec_h263_rfc2190.c | 19 ++-
 libavformat/rtpdec_latm.c | 24 +---
 libavformat/rtpdec_mpeg4.c| 16 +---
 libavformat/rtpdec_qt.c   | 31 ---
 libavformat/rtpenc_h263_rfc2190.c | 29 +++--
 6 files changed, 73 insertions(+), 65 deletions(-)

diff --git a/libavformat/rtpdec_mpeg4.c b/libavformat/rtpdec_mpeg4.c
index 00a732b..b3cd6c9 100644
--- a/libavformat/rtpdec_mpeg4.c
+++ b/libavformat/rtpdec_mpeg4.c
@@ -27,11 +27,13 @@
  * @author Romain Degez
  */

-#include "rtpdec_formats.h"
-#include "internal.h"
 #include "libavutil/attributes.h"
 #include "libavutil/avstring.h"
-#include "libavcodec/get_bits.h"
+
+#include "libavcodec/bitstream.h"
+
+#include "rtpdec_formats.h"
+#include "internal.h"

 #define MAX_AAC_HBR_FRAME_SIZE 8191

@@ -113,7 +115,7 @@ static int parse_fmtp_config(AVCodecParameters *par, const 
char *value)
 static int rtp_parse_mp4_au(PayloadContext *data, const uint8_t *buf, int len)
 {
 int au_headers_length, au_header_size, i;
-GetBitContext getbitcontext;
+BitstreamContext bctx;

 if (len < 2)
 return AVERROR_INVALIDDATA;
@@ -134,7 +136,7 @@ static int rtp_parse_mp4_au(PayloadContext *data, const 
uint8_t *buf, int len)
 if (len < data->au_headers_length_bytes)
 return AVERROR_INVALIDDATA;

-init_get_bits(&getbitcontext, buf, data->au_headers_length_bytes * 8);
+bitstream_init(&bctx, buf, data->au_headers_length_bytes * 8);

 /* XXX: Wrong if optional additional sections are present (cts, dts 
etc...) */
 au_header_size = data->sizelength + data->indexlength;
@@ -151,8 +153,8 @@ static int rtp_parse_mp4_au(PayloadContext *data, const 
uint8_t *buf, int len)
 }

 for (i = 0; i < data->nb_au_headers; ++i) {
-data->au_headers[i].size  = get_bits_long(&getbitcontext, 
data->sizelength);
-data->au_headers[i].index = get_bits_long(&getbitcontext, 
data->indexlength);
+data->au_headers[i].size  = bitstream_read(&bctx, data->sizelength);
+data->au_headers[i].index = bitstream_read(&bctx, data->indexlength);


Not really a problem introduced by this patch, but those two fields
don't seem to be bound-checked anywhere.


The values are parsed in parse_fmtp (with an AVOption like mechanism), 
where they are limited to a maximum of 32.


// Martin
___
libav-devel mailing list
libav-devel@libav.org
https://lists.libav.org/mailman/listinfo/libav-devel

[libav-devel] [PATCH 3/6] arm: vp9lpf: Use orrs instead of orr+cmp

2017-01-15 Thread Martin Storsjö
---
 libavcodec/arm/vp9lpf_neon.S | 12 
 1 file changed, 4 insertions(+), 8 deletions(-)

diff --git a/libavcodec/arm/vp9lpf_neon.S b/libavcodec/arm/vp9lpf_neon.S
index 5e154f6..9be4cef 100644
--- a/libavcodec/arm/vp9lpf_neon.S
+++ b/libavcodec/arm/vp9lpf_neon.S
@@ -77,8 +77,7 @@
 
 vdup.u8 d3,  r3  @ H
 vmovr2,  r3,  d4
-orr r2,  r2,  r3
-cmp r2,  #0
+orrsr2,  r2,  r3
 @ If no pixels need filtering, just exit as soon as possible
 beq 9f
 
@@ -191,8 +190,7 @@
 
 .if \wd >= 8
 vmovr2,  r3,  d6
-orr r2,  r2,  r3
-cmp r2,  #0
+orrsr2,  r2,  r3
 @ If no pixels need flat8in, jump to flat8out
 @ (or to a writeout of the inner 4 pixels, for wd=8)
 beq 6f
@@ -247,14 +245,12 @@
 6:
 vorrd2,  d6,  d7
 vmovr2,  r3,  d2
-orr r2,  r2,  r3
-cmp r2,  #0
+orrsr2,  r2,  r3
 @ If no pixels needed flat8in nor flat8out, jump to a
 @ writeout of the inner 4 pixels
 beq 7f
 vmovr2,  r3,  d7
-orr r2,  r2,  r3
-cmp r2,  #0
+orrsr2,  r2,  r3
 @ If no pixels need flat8out, jump to a writeout of the inner 6 pixels
 beq 8f
 
-- 
2.7.4

___
libav-devel mailing list
libav-devel@libav.org
https://lists.libav.org/mailman/listinfo/libav-devel

[libav-devel] [PATCH 4/6] arm: vp9lpf: Interleave the start of flat8in into the calculation above

2017-01-15 Thread Martin Storsjö
This adds lots of extra .ifs, but speeds it up by a couple cycles,
by avoiding stalls.
---
 libavcodec/arm/vp9lpf_neon.S | 8 ++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/libavcodec/arm/vp9lpf_neon.S b/libavcodec/arm/vp9lpf_neon.S
index 9be4cef..e31c807 100644
--- a/libavcodec/arm/vp9lpf_neon.S
+++ b/libavcodec/arm/vp9lpf_neon.S
@@ -181,16 +181,20 @@
 
 vmovl.u8q0,  d22@ p1
 vmovl.u8q1,  d25@ q1
+.if \wd >= 8
+vmovr2,  r3,  d6
+.endif
 vaddw.s8q0,  q0,  \tmp3 @ p1 + f
 vsubw.s8q1,  q1,  \tmp3 @ q1 - f
+.if \wd >= 8
+orrsr2,  r2,  r3
+.endif
 vqmovun.s16 d0,  q0 @ out p1
 vqmovun.s16 d2,  q1 @ out q1
 vbitd22, d0,  d5@ if (!hev && fm && !flat8in)
 vbitd25, d2,  d5
 
 .if \wd >= 8
-vmovr2,  r3,  d6
-orrsr2,  r2,  r3
 @ If no pixels need flat8in, jump to flat8out
 @ (or to a writeout of the inner 4 pixels, for wd=8)
 beq 6f
-- 
2.7.4

___
libav-devel mailing list
libav-devel@libav.org
https://lists.libav.org/mailman/listinfo/libav-devel

[libav-devel] [PATCH 5/6] aarch64: vp9lpf: Interleave the start of flat8in into the calculation above

2017-01-15 Thread Martin Storsjö
---
 libavcodec/aarch64/vp9lpf_neon.S | 16 +---
 1 file changed, 13 insertions(+), 3 deletions(-)

diff --git a/libavcodec/aarch64/vp9lpf_neon.S b/libavcodec/aarch64/vp9lpf_neon.S
index 4553173..3894307 100644
--- a/libavcodec/aarch64/vp9lpf_neon.S
+++ b/libavcodec/aarch64/vp9lpf_neon.S
@@ -316,20 +316,30 @@
 
 uxtl_sz v0.8h,  v1.8h,  v22, \sz// p1
 uxtl_sz v2.8h,  v3.8h,  v25, \sz// q1
+.if \wd >= 8
+mov x5,  v6.d[0]
+.endif
 saddw_szv0.8h,  v1.8h,  v0.8h,  v1.8h,  \tmp3, \sz // p1 + f
 ssubw_szv2.8h,  v3.8h,  v2.8h,  v3.8h,  \tmp3, \sz // q1 - f
+.if \wd >= 8
+.ifc \sz, .16b
+mov x6,  v6.d[1]
+.endif
+.endif
 sqxtun_sz   v0,  v0.8h,  v1.8h, \sz // out p1
 sqxtun_sz   v2,  v2.8h,  v3.8h, \sz // out q1
+.if \wd >= 8
+.ifc \sz, .16b
+addsx5,  x5,  x6
+.endif
+.endif
 bit v22\sz, v0\sz,  v5\sz   // if (!hev && fm && 
!flat8in)
 bit v25\sz, v2\sz,  v5\sz
 
 // If no pixels need flat8in, jump to flat8out
 // (or to a writeout of the inner 4 pixels, for wd=8)
 .if \wd >= 8
-mov x5,  v6.d[0]
 .ifc \sz, .16b
-mov x6,  v6.d[1]
-addsx5,  x5,  x6
 b.eq6f
 .else
 cbz x5,  6f
-- 
2.7.4

___
libav-devel mailing list
libav-devel@libav.org
https://lists.libav.org/mailman/listinfo/libav-devel

[libav-devel] [PATCH 2/6] arm/aarch64: vp9lpf: Keep the comparison to E within 8 bit

2017-01-15 Thread Martin Storsjö
The theoretical maximum value of E is 193, so we can just
saturate the addition to 255.

Before: Cortex A7  A8  A9 A53  A53/AArch64
vp9_loop_filter_v_4_8_neon: 143.0   127.7   114.888.0 87.7
vp9_loop_filter_v_8_8_neon: 241.0   197.2   173.7   140.0136.7
vp9_loop_filter_v_16_8_neon:497.0   419.5   379.7   293.0275.7
vp9_loop_filter_v_16_16_neon:   965.2   818.7   731.4   579.0452.0
After:
vp9_loop_filter_v_4_8_neon: 136.0   125.7   112.684.0 83.0
vp9_loop_filter_v_8_8_neon: 234.0   195.5   171.5   136.0133.7
vp9_loop_filter_v_16_8_neon:490.0   417.5   377.7   289.0271.0
vp9_loop_filter_v_16_16_neon:   951.2   814.7   732.3   571.0446.7
---
 libavcodec/aarch64/vp9lpf_neon.S | 40 +---
 libavcodec/arm/vp9lpf_neon.S | 11 +--
 2 files changed, 14 insertions(+), 37 deletions(-)

diff --git a/libavcodec/aarch64/vp9lpf_neon.S b/libavcodec/aarch64/vp9lpf_neon.S
index 3b8e6eb..4553173 100644
--- a/libavcodec/aarch64/vp9lpf_neon.S
+++ b/libavcodec/aarch64/vp9lpf_neon.S
@@ -51,13 +51,6 @@
 // see the arm version instead.
 
 
-.macro uabdl_sz dst1, dst2, in1, in2, sz
-uabdl   \dst1,  \in1\().8b,  \in2\().8b
-.ifc \sz, .16b
-uabdl2  \dst2,  \in1\().16b, \in2\().16b
-.endif
-.endm
-
 .macro add_sz dst1, dst2, in1, in2, in3, in4, sz
 add \dst1,  \in1,  \in3
 .ifc \sz, .16b
@@ -86,20 +79,6 @@
 .endif
 .endm
 
-.macro cmhs_sz dst1, dst2, in1, in2, in3, in4, sz
-cmhs\dst1,  \in1,  \in3
-.ifc \sz, .16b
-cmhs\dst2,  \in2,  \in4
-.endif
-.endm
-
-.macro xtn_sz dst, in1, in2, sz
-xtn \dst\().8b,  \in1
-.ifc \sz, .16b
-xtn2\dst\().16b, \in2
-.endif
-.endm
-
 .macro usubl_sz dst1, dst2, in1, in2, sz
 usubl   \dst1,  \in1\().8b,  \in2\().8b
 .ifc \sz, .16b
@@ -179,20 +158,20 @@
 // tmpq2 == tmp3 + tmp4, etc.
 .macro loop_filter wd, sz, mix, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, tmp8
 .if \mix == 0
-dup v0.8h,  w2// E
-dup v1.8h,  w2// E
+dup v0\sz,  w2// E
 dup v2\sz,  w3// I
 dup v3\sz,  w4// H
 .else
-dup v0.8h,  w2// E
+dup v0.8b,  w2// E
 dup v2.8b,  w3// I
 dup v3.8b,  w4// H
+lsr w5, w2,  #8
 lsr w6, w3,  #8
 lsr w7, w4,  #8
-ushrv1.8h,  v0.8h, #8 // E
+dup v1.8b,  w5// E
 dup v4.8b,  w6// I
-bic v0.8h,  #255, lsl 8 // E
 dup v5.8b,  w7// H
+trn1v0.2d,  v0.2d,  v1.2d
 trn1v2.2d,  v2.2d,  v4.2d
 trn1v3.2d,  v3.2d,  v5.2d
 .endif
@@ -206,16 +185,15 @@
 umaxv4\sz,  v4\sz,  v5\sz
 umaxv5\sz,  v6\sz,  v7\sz
 umax\tmp1\sz, \tmp1\sz, \tmp2\sz
-uabdl_szv6.8h,  v7.8h,  v23, v24, \sz // abs(p0 - q0)
+uabdv6\sz,  v23\sz, v24\sz// abs(p0 - q0)
 umaxv4\sz,  v4\sz,  v5\sz
-add_sz  v6.8h,  v7.8h,  v6.8h,  v7.8h,  v6.8h,  v7.8h, \sz // 
abs(p0 - q0) * 2
+uqadd   v6\sz,  v6\sz,  v6\sz // abs(p0 - q0) * 2
 uabdv5\sz,  v22\sz, v25\sz// abs(p1 - q1)
 umaxv4\sz,  v4\sz,  \tmp1\sz  // max(abs(p3 - p2), 
..., abs(q2 - q3))
 ushrv5\sz,  v5\sz,  #1
 cmhsv4\sz,  v2\sz,  v4\sz // max(abs()) <= I
-uaddw_szv6.8h,  v7.8h,  v6.8h,  v7.8h,  v5, \sz // abs(p0 - 
q0) * 2 + abs(p1 - q1) >> 1
-cmhs_sz v6.8h,  v7.8h,  v0.8h,  v1.8h,  v6.8h,  v7.8h, \sz
-xtn_sz  v5, v6.8h,  v7.8h,  \sz
+uqadd   v6\sz,  v6\sz,  v5\sz // abs(p0 - q0) * 2 + 
abs(p1 - q1) >> 1
+cmhsv5\sz,  v0\sz,  v6\sz
 and v4\sz,  v4\sz,  v5\sz // fm
 
 // If no pixels need filtering, just exit as soon as possible
diff --git a/libavcodec/arm/vp9lpf_neon.S b/libavcodec/arm/vp9lpf_neon.S
index c57c0e9..5e154f6 100644
--- a/libavcodec/arm/vp9lpf_neon.S
+++ b/libavcodec/arm/vp9lpf_neon.S
@@ -51,7 +51,7 @@
 @ and d28-d31 as temp registers, or d8-d15.
 @ tmp1,tmp2 = tmpq1, tmp3,tmp4 = tmpq2, tmp5,tmp6 = tmpq3, tmp7,tmp8 = tmpq4
 .macro loop_filter wd, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, tmp8, tmpq1, 
tmpq2, tmpq3, tmpq4
-vdup.u16q0,  r2 @ E
+vdup.u8 d0,  r2 @ E
 vdup.u8 d2,  r3 @ I
 ldr r3,  [sp]
 
@@ -64,16 +64,15 @@
  

[libav-devel] [PATCH 1/6] arm/aarch64: vp9lpf: Calculate !hev directly

2017-01-15 Thread Martin Storsjö
Previously we first calculated hev, and then negated it.

Since we were able to schedule the negation in the middle
of another calculation, we don't see any gain in all cases.

Before: Cortex A7  A8  A9 A53  A53/AArch64
vp9_loop_filter_v_4_8_neon: 147.0   129.0   115.889.0 88.7
vp9_loop_filter_v_8_8_neon: 242.0   198.5   174.7   140.0136.7
vp9_loop_filter_v_16_8_neon:500.0   419.5   382.7   293.0275.7
vp9_loop_filter_v_16_16_neon:   971.2   825.5   731.5   579.0453.0
After:
vp9_loop_filter_v_4_8_neon: 143.0   127.7   114.888.0 87.7
vp9_loop_filter_v_8_8_neon: 241.0   197.2   173.7   140.0136.7
vp9_loop_filter_v_16_8_neon:497.0   419.5   379.7   293.0275.7
vp9_loop_filter_v_16_16_neon:   965.2   818.7   731.4   579.0452.0
---
 libavcodec/aarch64/vp9lpf_neon.S | 5 ++---
 libavcodec/arm/vp9lpf_neon.S | 5 ++---
 2 files changed, 4 insertions(+), 6 deletions(-)

diff --git a/libavcodec/aarch64/vp9lpf_neon.S b/libavcodec/aarch64/vp9lpf_neon.S
index e9c7d9e..3b8e6eb 100644
--- a/libavcodec/aarch64/vp9lpf_neon.S
+++ b/libavcodec/aarch64/vp9lpf_neon.S
@@ -292,7 +292,7 @@
 .if \mix != 0
 sxtlv1.8h,  v1.8b
 .endif
-cmhiv5\sz,  v5\sz,  v3\sz  // hev
+cmhsv5\sz,  v3\sz,  v5\sz  // !hev
 .if \wd == 8
 // If a 4/8 or 8/4 mix is used, clear the relevant half of v6
 .if \mix != 0
@@ -306,11 +306,10 @@
 .elseif \wd == 8
 bic v4\sz,  v4\sz,  v6\sz  // fm && !flat8in
 .endif
-mvn v5\sz,  v5\sz  // !hev
+and v5\sz,  v5\sz,  v4\sz  // !hev && fm && !flat8in
 .if \wd == 16
 and v7\sz,  v7\sz,  v6\sz  // flat8out && flat8in && fm
 .endif
-and v5\sz,  v5\sz,  v4\sz  // !hev && fm && !flat8in
 
 mul_sz  \tmp3\().8h,  \tmp4\().8h,  \tmp3\().8h, \tmp4\().8h,  
\tmp5\().8h,  \tmp5\().8h, \sz // 3 * (q0 - p0)
 bic \tmp1\sz,  \tmp1\sz,  v5\sz// if (!hev) 
av_clip_int8 = 0
diff --git a/libavcodec/arm/vp9lpf_neon.S b/libavcodec/arm/vp9lpf_neon.S
index fbf2901..c57c0e9 100644
--- a/libavcodec/arm/vp9lpf_neon.S
+++ b/libavcodec/arm/vp9lpf_neon.S
@@ -141,7 +141,7 @@
 .if \wd == 8
 vcle.u8 d6,  d6,  d0@ flat8in
 .endif
-vcgt.u8 d5,  d5,  d3@ hev
+vcle.u8 d5,  d5,  d3@ !hev
 .if \wd == 8
 vandd6,  d6,  d4@ flat8in && fm
 .endif
@@ -151,11 +151,10 @@
 .elseif \wd == 8
 vbicd4,  d4,  d6@ fm && !flat8in
 .endif
-vmvnd5,  d5 @ !hev
+vandd5,  d5,  d4@ !hev && fm && !flat8in
 .if \wd == 16
 vandd7,  d7,  d6@ flat8out && flat8in && fm
 .endif
-vandd5,  d5,  d4@ !hev && fm && !flat8in
 
 vmul.s16\tmpq2,  \tmpq2, \tmpq3 @ 3 * (q0 - p0)
 vbic\tmp1,   \tmp1,   d5@ if (!hev) av_clip_int8 = 0
-- 
2.7.4

___
libav-devel mailing list
libav-devel@libav.org
https://lists.libav.org/mailman/listinfo/libav-devel

[libav-devel] [PATCH 6/6] arm: vp9lpf: Implement the mix2_44 function with one single filter pass

2017-01-15 Thread Martin Storsjö
For this case, with 8 inputs but only changing 4 of them, we can fit
all 16 input pixels into a q register, and still have enough temporary
registers for doing the loop filter.

The wd=8 filters would require too many temporary registers for
processing all 16 pixels at once though.

Before:  Cortex A7  A8 A9 A53
vp9_loop_filter_mix2_v_44_16_neon:   289.7   256.2  237.5   181.2
After:
vp9_loop_filter_mix2_v_44_16_neon:   221.2   150.5  177.7   138.0
---
 libavcodec/arm/vp9dsp_init_arm.c |   7 +-
 libavcodec/arm/vp9lpf_neon.S | 191 +++
 2 files changed, 195 insertions(+), 3 deletions(-)

diff --git a/libavcodec/arm/vp9dsp_init_arm.c b/libavcodec/arm/vp9dsp_init_arm.c
index e99d931..1ede170 100644
--- a/libavcodec/arm/vp9dsp_init_arm.c
+++ b/libavcodec/arm/vp9dsp_init_arm.c
@@ -194,6 +194,8 @@ define_loop_filters(8, 8);
 define_loop_filters(16, 8);
 define_loop_filters(16, 16);
 
+define_loop_filters(44, 16);
+
 #define lf_mix_fn(dir, wd1, wd2, stridea)  
   \
 static void loop_filter_##dir##_##wd1##wd2##_16_neon(uint8_t *dst, 
   \
  ptrdiff_t stride, 
   \
@@ -207,7 +209,6 @@ static void 
loop_filter_##dir##_##wd1##wd2##_16_neon(uint8_t *dst,
 lf_mix_fn(h, wd1, wd2, stride) \
 lf_mix_fn(v, wd1, wd2, sizeof(uint8_t))
 
-lf_mix_fns(4, 4)
 lf_mix_fns(4, 8)
 lf_mix_fns(8, 4)
 lf_mix_fns(8, 8)
@@ -227,8 +228,8 @@ static av_cold void 
vp9dsp_loopfilter_init_arm(VP9DSPContext *dsp)
 dsp->loop_filter_16[0] = ff_vp9_loop_filter_h_16_16_neon;
 dsp->loop_filter_16[1] = ff_vp9_loop_filter_v_16_16_neon;
 
-dsp->loop_filter_mix2[0][0][0] = loop_filter_h_44_16_neon;
-dsp->loop_filter_mix2[0][0][1] = loop_filter_v_44_16_neon;
+dsp->loop_filter_mix2[0][0][0] = ff_vp9_loop_filter_h_44_16_neon;
+dsp->loop_filter_mix2[0][0][1] = ff_vp9_loop_filter_v_44_16_neon;
 dsp->loop_filter_mix2[0][1][0] = loop_filter_h_48_16_neon;
 dsp->loop_filter_mix2[0][1][1] = loop_filter_v_48_16_neon;
 dsp->loop_filter_mix2[1][0][0] = loop_filter_h_84_16_neon;
diff --git a/libavcodec/arm/vp9lpf_neon.S b/libavcodec/arm/vp9lpf_neon.S
index e31c807..12984a9 100644
--- a/libavcodec/arm/vp9lpf_neon.S
+++ b/libavcodec/arm/vp9lpf_neon.S
@@ -44,6 +44,109 @@
 vtrn.8  \r2,  \r3
 .endm
 
+@ The input to and output from this macro is in the registers q8-q15,
+@ and q0-q7 are used as scratch registers.
+@ p3 = q8, p0 = q11, q0 = q12, q3 = q15
+.macro loop_filter_q
+vdup.u8 d0,  r2  @ E
+lsr r2,  r2,  #8
+vdup.u8 d2,  r3  @ I
+lsr r3,  r3,  #8
+vdup.u8 d1,  r2  @ E
+vdup.u8 d3,  r3  @ I
+
+vabd.u8 q2,  q8,  q9 @ abs(p3 - p2)
+vabd.u8 q3,  q9,  q10@ abs(p2 - p1)
+vabd.u8 q4,  q10, q11@ abs(p1 - p0)
+vabd.u8 q5,  q12, q13@ abs(q0 - q1)
+vabd.u8 q6,  q13, q14@ abs(q1 - q2)
+vabd.u8 q7,  q14, q15@ abs(q2 - q3)
+vmax.u8 q2,  q2,  q3
+vmax.u8 q3,  q4,  q5
+vmax.u8 q4,  q6,  q7
+vabd.u8 q5,  q11, q12@ abs(p0 - q0)
+vmax.u8 q2,  q2,  q3
+vqadd.u8q5,  q5,  q5 @ abs(p0 - q0) * 2
+vabd.u8 q7,  q10, q13@ abs(p1 - q1)
+vmax.u8 q2,  q2,  q4 @ max(abs(p3 - p2), ..., abs(q2 - q3))
+vshr.u8 q7,  q7,  #1
+vcle.u8 q2,  q2,  q1 @ max(abs()) <= I
+vqadd.u8q5,  q5,  q7 @ abs(p0 - q0) * 2 + abs(p1 - q1) >> 1
+vcle.u8 q5,  q5,  q0
+vandq2,  q2,  q5 @ fm
+
+vshrn.u16   d10, q2,  #4
+vmovr2,  r3,  d10
+orrsr2,  r2,  r3
+@ If no pixels need filtering, just exit as soon as possible
+beq 9f
+
+@ Calculate the normal inner loop filter for 2 or 4 pixels
+ldr r3,  [sp, #64]
+vabd.u8 q3,  q10, q11@ abs(p1 - p0)
+vabd.u8 q4,  q13, q12@ abs(q1 - q0)
+
+vsubl.u8q5,  d20, d26@ p1 - q1
+vsubl.u8q6,  d21, d27@ p1 - q1
+vmax.u8 q3,  q3,  q4 @ max(abs(p1 - p0), abs(q1 - q0))
+vqmovn.s16  d10, q5  @ av_clip_int8p(p1 - q1)
+vqmovn.s16  d11, q6  @ av_clip_int8p(p1 - q1)
+vdup.u8 d8,  r3  @ H
+lsr r3,  r3,  #8
+vdup.u8 d9,  r3  @ H
+vsubl.u8q6,  d24, d22@ q0 - p0
+vsubl.u8q7,  d25, d23@ q0 - p0
+vcle.u8 q3,  q3,  q4 @ hev
+   

[libav-devel] [PATCH] rtmpproto: Make bytes_read variables 64bit.

2017-01-25 Thread Martin Storsjö
From: Carl Eugen Hoyos 

When bytes_read overflowed, last_bytes_read did not yet overflow
and no bytes-read report was created leading to a timeout.

Analyzed-by: Thomas Bernhard
---
 libavformat/rtmpproto.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/libavformat/rtmpproto.c b/libavformat/rtmpproto.c
index 5298c18..09d22eb 100644
--- a/libavformat/rtmpproto.c
+++ b/libavformat/rtmpproto.c
@@ -94,8 +94,8 @@ typedef struct RTMPContext {
 int   flv_nb_packets; ///< number of flv packets 
published
 RTMPPacketout_pkt;///< rtmp packet, created from 
flv a/v or metadata (for output)
 uint32_t  client_report_size; ///< number of bytes after which 
client should report to server
-uint32_t  bytes_read; ///< number of bytes read from 
server
-uint32_t  last_bytes_read;///< number of bytes read last 
reported to server
+uint64_t  bytes_read; ///< number of bytes read from 
server
+uint64_t  last_bytes_read;///< number of bytes read last 
reported to server
 uint32_t  last_timestamp; ///< last timestamp received in 
a packet
 int   skip_bytes; ///< number of bytes to skip 
from the input FLV stream in the next write call
 int   has_audio;  ///< presence of audio data
-- 
2.10.1 (Apple Git-78)

___
libav-devel mailing list
libav-devel@libav.org
https://lists.libav.org/mailman/listinfo/libav-devel

Re: [libav-devel] [PATCH] rtmpproto: Make bytes_read variables 64bit.

2017-01-25 Thread Martin Storsjö

On Wed, 25 Jan 2017, Luca Barbato wrote:


On 25/01/2017 09:29, Martin Storsjö wrote:

From: Carl Eugen Hoyos 

When bytes_read overflowed, last_bytes_read did not yet overflow
and no bytes-read report was created leading to a timeout.

Analyzed-by: Thomas Bernhard
---
 libavformat/rtmpproto.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)



Not against it, but theoretically wouldn't the problem present itself
unchanged once you shove enough data in? (yes would take quite a bit of
time given the current network fabrics available).


Theoretically yes, but I'm quite ok with a solution that breaks after 
sending/receiving 2^64 bytes.


// Martin
___
libav-devel mailing list
libav-devel@libav.org
https://lists.libav.org/mailman/listinfo/libav-devel

[libav-devel] [PATCH] configure: Add quotes around a variable which might be empty

2017-01-25 Thread Martin Storsjö
If we only have a target compiler but no host compiler, the $type
variable will be empty once.

(Currently we fail to do a cross build if no host compiler is available
due to using the host compiler for processing option lists though.
But despite that, this comparison in configure needs quotes.)
---
 configure | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/configure b/configure
index f186753..7570c0a 100755
--- a/configure
+++ b/configure
@@ -5033,7 +5033,7 @@ fi
 for pfx in "" host_; do
 varname=${pfx%_}cc_type
 eval "type=\$$varname"
-if [ $type = "msvc" ]; then
+if [ "$type" = "msvc" ]; then
 check_${pfx}cc 

Re: [libav-devel] [PATCH] build: Detect blocks C language extension and add it as VDA dependency

2017-01-25 Thread Martin Storsjö

On Wed, 25 Jan 2017, Diego Biurrun wrote:


Newer versions of OS X use the blocks extension in VDA-related headers.
---
configure | 7 ++-
1 file changed, 6 insertions(+), 1 deletion(-)


It might be useful for a reader to include a comment on the indented 
usecase. I.e. if using a third party compiler such as gcc, these 
frameworks have to be avoided.


// Martin
___
libav-devel mailing list
libav-devel@libav.org
https://lists.libav.org/mailman/listinfo/libav-devel

Re: [libav-devel] [PATCH] rtmp: Account bytes_read may wrap around

2017-01-25 Thread Martin Storsjö

On Wed, 25 Jan 2017, Luca Barbato wrote:


The servers seem to be happy to receive the wrapped around value as long
they receive a report, otherwise they would timeout.

Initially reported and analyzed by Thomas Bernhard.
---

Here the alternative patch discussed on IRC.

libavformat/rtmpproto.c | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/libavformat/rtmpproto.c b/libavformat/rtmpproto.c
index 5298c18..49a40dd 100644
--- a/libavformat/rtmpproto.c
+++ b/libavformat/rtmpproto.c
@@ -2416,7 +2416,7 @@ static int get_packet(URLContext *s, int for_header)
rt->last_timestamp = rpkt.timestamp;

rt->bytes_read += ret;
-if (rt->bytes_read > rt->last_bytes_read + rt->client_report_size) {
+if (rt->bytes_read - rt->last_bytes_read > rt->client_report_size) {
av_log(s, AV_LOG_DEBUG, "Sending bytes read report\n");
if ((ret = gen_bytes_read(s, rt, rpkt.timestamp + 1)) < 0)
return ret;
--
2.9.2


Seems ok to me, although I'm not able to test the issue easily.

// Martin
___
libav-devel mailing list
libav-devel@libav.org
https://lists.libav.org/mailman/listinfo/libav-devel

Re: [libav-devel] [PATCH 01/12] dashenc: fix ISO8601 UTC parsing

2017-01-27 Thread Martin Storsjö

On Fri, 27 Jan 2017, Peter Große wrote:


From: Anton Schubert 

Appends Z to timestamp to force ISO8601 datetime parsing as UTC.
Without Z, some browsers (Chrome) interpret the timestamp as
localtime and others (Firefox) interpret it as UTC.

Signed-off-by: Anton Schubert 
---
libavformat/dashenc.c | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/libavformat/dashenc.c b/libavformat/dashenc.c
index ce01860..2b27950 100644
--- a/libavformat/dashenc.c
+++ b/libavformat/dashenc.c
@@ -429,7 +429,7 @@ static void format_date_now(char *buf, int size)
struct tm *ptm, tmbuf;
ptm = gmtime_r(&t, &tmbuf);
if (ptm) {
-if (!strftime(buf, size, "%Y-%m-%dT%H:%M:%S", ptm))
+if (!strftime(buf, size, "%Y-%m-%dT%H:%M:%SZ", ptm))
buf[0] = '\0';
}
}
--
2.10.2


LGTM, pushed

// Martin
___
libav-devel mailing list
libav-devel@libav.org
https://lists.libav.org/mailman/listinfo/libav-devel

Re: [libav-devel] [PATCH 02/12] dashenc: add mandatory id to AdaptationSet and Period in manifest

2017-01-27 Thread Martin Storsjö

On Fri, 27 Jan 2017, Peter Große wrote:


From: Peter Große 

Signed-off-by: Peter Große 
---
libavformat/dashenc.c | 10 +-
1 file changed, 5 insertions(+), 5 deletions(-)


LGTM, will push

// Martin
___
libav-devel mailing list
libav-devel@libav.org
https://lists.libav.org/mailman/listinfo/libav-devel

Re: [libav-devel] [PATCH 03/12] dashenc: increase buffer time hint in the manifest

2017-01-27 Thread Martin Storsjö

On Fri, 27 Jan 2017, Peter Große wrote:


From: Anton Schubert 

to avoid rebuffering on the clientside for difficult network conditions.

Signed-off-by: Anton Schubert 
---
libavformat/dashenc.c | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/libavformat/dashenc.c b/libavformat/dashenc.c
index 865f50a..44785cd 100644
--- a/libavformat/dashenc.c
+++ b/libavformat/dashenc.c
@@ -481,7 +481,7 @@ static int write_manifest(AVFormatContext *s, int final)
}
}
avio_printf(out, "\tminBufferTime=\"");
-write_time(out, c->last_duration);
+write_time(out, c->last_duration * 2);
avio_printf(out, "\">\n");
avio_printf(out, "\t\n");
if (title) {
--
2.10.2


I guess this is ok. I tried to keep the buffer amount low to allow as 
reasonably low latency as possible in practice, but I guess this is a 
safer compromise.


// Martin
___
libav-devel mailing list
libav-devel@libav.org
https://lists.libav.org/mailman/listinfo/libav-devel

Re: [libav-devel] [PATCH 04/12] dashenc: add option to provide UTC timing source

2017-01-27 Thread Martin Storsjö

On Fri, 27 Jan 2017, Peter Große wrote:


From: Peter Große 

If set, adds a UTCTime tag in the manifest.


UTCTiming, not UTCTime, right?


See 
http://vm2.dashif.org/dash.js/docs/jsdocs/MediaPlayer.html#addUTCTimingSource 
for more information.
Usable default: "https://time.akamai.com/?iso";


Hmm, I don't see this documented in ISO/IEC 23009-1:2014 at least, and the 
documentation from dashif don't seem to define it either.


I won't mind including it, but I'd like to know what standard defines it 
before merging.


// Martin
___
libav-devel mailing list
libav-devel@libav.org
https://lists.libav.org/mailman/listinfo/libav-devel

Re: [libav-devel] [PATCH 05/12] dashenc: separate segments based on current segment duration

2017-01-27 Thread Martin Storsjö

On Fri, 27 Jan 2017, Peter Große wrote:


The current implementation creates new segments comparing

 pkt->pts - first_pts > total_duration


I guess this would be more understandable written like this:
   pkt->pts - first_pts > nb_segs * min_seg_duration
or
   total_duration > nb_segs * min_seg_duration


This works fine, but if the keyframe interval is smaller than "min_seg_duration"
segments shorter than the minimum segment duration are created.

Example: keyint=50, min_seg_duration=300
segment 1 contains keyframe 1 (duration=2s <  total_duration=3s)
   and keyframe 2 (duration=4s >= total_duration=3s)
segment 2 contains keyframe 3 (duration=6s >= total_duration=6s)
segment 3 contains keyframe 4 (duration=8s <  total_duration=9s)
   and keyframe 5 (duration=10s >= total_duration=9s)
...

Segment 2 is only 2s long, shorter than min_seg_duration = 3s.


FWIW, this was the indended behaviour originally, but I agree that the 
option name isn't quite optimal given this.



To fix this, new segments are created based on the actual written duration.
Otherwise the option name "min_seg_duration" is misleading.

Signed-off-by: Peter Große 
---
libavformat/dashenc.c | 17 -
1 file changed, 8 insertions(+), 9 deletions(-)

diff --git a/libavformat/dashenc.c b/libavformat/dashenc.c
index 198932c..93c292f 100644
--- a/libavformat/dashenc.c
+++ b/libavformat/dashenc.c
@@ -62,7 +62,7 @@ typedef struct OutputStream {
int ctx_inited;
uint8_t iobuf[32768];
AVIOContext *out;
-int packets_written;
+int duration_written;
char initfile[1024];
int64_t init_start_pos;
int init_range_length;
@@ -785,7 +785,7 @@ static int dash_flush(AVFormatContext *s, int final, int 
stream)
int64_t start_pos;
int range_length, index_length = 0;

-if (!os->packets_written)
+if (!os->duration_written)
continue;

// Flush the single stream that got a keyframe right now.
@@ -823,7 +823,7 @@ static int dash_flush(AVFormatContext *s, int final, int 
stream)

av_write_frame(os->ctx, NULL);
avio_flush(os->ctx->pb);
-os->packets_written = 0;
+os->duration_written = 0;

range_length = avio_tell(os->ctx->pb) - start_pos;
if (c->single_file) {
@@ -868,7 +868,6 @@ static int dash_write_packet(AVFormatContext *s, AVPacket 
*pkt)
DASHContext *c = s->priv_data;
AVStream *st = s->streams[pkt->stream_index];
OutputStream *os = &c->streams[pkt->stream_index];
-int64_t seg_end_duration = (os->segment_index) * (int64_t) 
c->min_seg_duration;
int ret;

ret = update_stream_extradata(s, os, st->codecpar);
@@ -897,9 +896,9 @@ static int dash_write_packet(AVFormatContext *s, AVPacket 
*pkt)
os->first_pts = pkt->pts;

if ((!c->has_video || st->codecpar->codec_type == AVMEDIA_TYPE_VIDEO) &&
-pkt->flags & AV_PKT_FLAG_KEY && os->packets_written &&
-av_compare_ts(pkt->pts - os->first_pts, st->time_base,
-  seg_end_duration, AV_TIME_BASE_Q) >= 0) {
+pkt->flags & AV_PKT_FLAG_KEY && os->duration_written &&
+av_compare_ts(os->duration_written + pkt->duration, st->time_base,
+  c->min_seg_duration, AV_TIME_BASE_Q) >= 0) {
int64_t prev_duration = c->last_duration;

c->last_duration = av_rescale_q(pkt->pts - os->start_pts,
@@ -922,7 +921,7 @@ static int dash_write_packet(AVFormatContext *s, AVPacket 
*pkt)
return ret;
}

-if (!os->packets_written) {
+if (!os->duration_written) {
// If we wrote a previous segment, adjust the start time of the segment
// to the end of the previous one (which is the same as the mp4 muxer
// does). This avoids gaps in the timeline.
@@ -935,7 +934,7 @@ static int dash_write_packet(AVFormatContext *s, AVPacket 
*pkt)
os->max_pts = pkt->pts + pkt->duration;
else
os->max_pts = FFMAX(os->max_pts, pkt->pts + pkt->duration);
-os->packets_written++;
+os->duration_written += pkt->duration;
return ff_write_chained(os->ctx, 0, pkt, s);
}


This isn't the best way of achieving what you're trying to do.

Don't accumulate pkt->duration values. In general we can't really assume 
that pkt->duration is set sensibly.


In some places we need an estimate of the duration of a packet, and in 
those cases we fill it in using the duration of the previous packet (i.e. 
the diff between this packets timestamp and the previous one) if we don't 
have a definitive value available.


In those cases where we use an estimate of the duration, we make sure to 
correct it as soon as possible when we get the real timestamps of the next 
packet. But we shouldn't accumulate the values since that would end up 
with accumulating the error, blowing it out of propertion at worst.



To get the behaviour you want, couldn't you just change the comparison 
like this:


-av_compare_ts(pkt->pts - os->first_pts, st->

Re: [libav-devel] [PATCH 06/12] dashenc: support for hinting stream bandwidth using metadata option

2017-01-27 Thread Martin Storsjö

On Fri, 27 Jan 2017, Peter Große wrote:


From: Peter Große 

Bandwidth information is required in the manifest, but not always provided by 
the demuxer.
So enable hinting the stream bandwidth via a metadata field, supports same 
values as codec bitrate setting.

Example: -metadata:s:v:0 bitrate=3500k

Signed-off-by: Peter Große 
---
libavformat/dashenc.c | 12 
1 file changed, 12 insertions(+)


This is kinda abusing the metadata mechanism, which is mostly intended to 
be used for user-visible metadata like author/title and such.


I'd like to ask for Anton's opinion on this (although I'm pretty sure 
we've discussed it before).


// Martin
___
libav-devel mailing list
libav-devel@libav.org
https://lists.libav.org/mailman/listinfo/libav-devel

Re: [libav-devel] [PATCH 07/12] dashenc: calculate stream bitrate from first segment if not available

2017-01-27 Thread Martin Storsjö

On Fri, 27 Jan 2017, Peter Große wrote:


Bandwidth information is required in the manifest, but not always
provided by the demuxer. If not available via metadata calculate
bandwith based on the size and duration of the first segment.

Signed-off-by: Peter Große 
---
libavformat/dashenc.c | 11 +++
1 file changed, 11 insertions(+)

diff --git a/libavformat/dashenc.c b/libavformat/dashenc.c
index 1cd9563..1b12d4f 100644
--- a/libavformat/dashenc.c
+++ b/libavformat/dashenc.c
@@ -846,6 +846,17 @@ static int dash_flush(AVFormatContext *s, int final, int 
stream)
if (ret < 0)
break;
}
+
+if (!os->bit_rate) {
+// calculate average bitrate of first segment
+double bitrate = (int)( (double) range_length * 8.0 * AV_TIME_BASE / 
(double)(os->max_pts - os->start_pts) );
+if (bitrate >= 0 && bitrate <= INT64_MAX) {
+os->bit_rate = bitrate;
+snprintf(os->bandwidth_str, sizeof(os->bandwidth_str),
+ " bandwidth=\"%d\"", os->bit_rate);
+}
+}
+
add_segment(os, filename, os->start_pts, os->max_pts - os->start_pts, 
start_pos, range_length, index_length);
av_log(s, AV_LOG_VERBOSE, "Representation %d media segment %d written to: 
%s\n", i, os->segment_index, full_path);
}
--
2.10.2


This probably is fine with me.

We can have $Bandwidth$ as part of the segment filename template - I guess 
this is too late to get it right for those cases? Although I don't mind if 
we don't support that case when we don't have a nominal bitrate set and 
try to use it in the template.


// Martin
___
libav-devel mailing list
libav-devel@libav.org
https://lists.libav.org/mailman/listinfo/libav-devel

Re: [libav-devel] [PATCH 08/12] dashenc: add support for assigning streams to AdaptationSets

2017-01-27 Thread Martin Storsjö

On Fri, 27 Jan 2017, Peter Große wrote:


This patch is based on the stream assignment code in webmdashenc.


This muxer isn't available in libav, so the reference doesn't really work 
in context here.




Additional changes:
 * Default to one AdaptationSet per stream

   Previously all mapped streams of a media type (video, audio) where assigned
   to a single AdaptationSet. Using the DASH live profile it is mandatory, that
   the segments of all representations are aligned, which is currently not
   enforced. This leads to problems when using video streams with different
   key frame intervals. So to play safe, default to one AdaptationSet per 
stream,
   unless overwritten by explicit assignment


Is there any way to easily get it back to the original behaviour, e.g. 
tell it to group all audio tracks together and all video tracks together, 
without having to manually specify the stream numbers?



 * Make sure all streams are assigned to exactly one AdaptationSet

Signed-off-by: Peter Große 
---
libavformat/dashenc.c | 193 ++
1 file changed, 162 insertions(+), 31 deletions(-)

diff --git a/libavformat/dashenc.c b/libavformat/dashenc.c
index 1b12d4f..c561ad1 100644
--- a/libavformat/dashenc.c
+++ b/libavformat/dashenc.c
@@ -24,6 +24,7 @@
#include 
#endif

+#include "libavutil/avutil.h"
#include "libavutil/avstring.h"
#include "libavutil/eval.h"
#include "libavutil/intreadwrite.h"
@@ -58,9 +59,15 @@ typedef struct Segment {
int n;
} Segment;

+typedef struct AdaptationSet {
+char id[10];
+enum AVMediaType media_type;
+AVDictionary *metadata;
+} AdaptationSet;
+
typedef struct OutputStream {
AVFormatContext *ctx;
-int ctx_inited;
+int ctx_inited, as_idx;
uint8_t iobuf[32768];
AVIOContext *out;
int duration_written;
@@ -79,6 +86,9 @@ typedef struct OutputStream {

typedef struct DASHContext {
const AVClass *class;  /* Class for private options. */
+char *adaptation_sets;
+AdaptationSet *as;
+int nb_as;
int window_size;
int extra_window_size;
int min_seg_duration;
@@ -87,7 +97,7 @@ typedef struct DASHContext {
int use_timeline;
int single_file;
OutputStream *streams;
-int has_video, has_audio;
+int has_video;
int64_t last_duration;
int64_t total_duration;
char availability_start_time[100];
@@ -176,6 +186,16 @@ static void dash_free(AVFormatContext *s)
{
DASHContext *c = s->priv_data;
int i, j;
+
+if (c->as) {
+for (i = 0; i < c->nb_as; i++) {
+if (&c->as[i].metadata)
+av_dict_free(&c->as[i].metadata);
+}
+av_freep(&c->as);
+c->nb_as = 0;
+}
+
if (!c->streams)
return;
for (i = 0; i < s->nb_streams; i++) {
@@ -436,12 +456,144 @@ static void format_date_now(char *buf, int size)
}
}

+static int write_adaptation_set(AVFormatContext *s, AVIOContext *out, int 
as_index)
+{
+DASHContext *c = s->priv_data;
+AdaptationSet *as = &c->as[as_index];
+int i;
+
+avio_printf(out, "\t\t\n",
+as->id, as->media_type == AVMEDIA_TYPE_VIDEO ? "video" : 
"audio");
+
+for (i = 0; i < s->nb_streams; i++) {
+OutputStream *os = &c->streams[i];
+
+if (os->as_idx - 1 != as_index)
+continue;
+
+if (as->media_type == AVMEDIA_TYPE_VIDEO) {
+avio_printf(out, "\t\t\t\n",
+i, os->codec_str, os->bandwidth_str, s->streams[i]->codecpar->width, 
s->streams[i]->codecpar->height);
+} else {
+avio_printf(out, "\t\t\t\n",
+i, os->codec_str, os->bandwidth_str, 
s->streams[i]->codecpar->sample_rate);
+avio_printf(out, "\t\t\t\t\n",
+s->streams[i]->codecpar->channels);
+}
+output_segment_list(os, out, c);
+avio_printf(out, "\t\t\t\n");
+}
+avio_printf(out, "\t\t\n");
+
+return 0;
+}
+
+static int parse_adaptation_sets(AVFormatContext *s)
+{
+DASHContext *c = s->priv_data;
+char *p = c->adaptation_sets;
+char *q;
+enum { new_set, parse_id, parsing_streams } state;
+int i;
+
+/* default: one AdaptationSet for each stream */
+if (!p) {
+void *mem = av_mallocz(sizeof(*c->as) * s->nb_streams);
+if (!mem)
+return AVERROR(ENOMEM);
+c->as = mem;
+c->nb_as = s->nb_streams;
+
+for (i = 0; i < s->nb_streams; i++) {
+AdaptationSet *as = &c->as[i];
+OutputStream *os = &c->streams[i];
+snprintf(as->id, sizeof(as->id), "%d", i);
+as->metadata = NULL;
+as->media_type = s->streams[i]->codecpar->codec_type;
+os->as_idx = i + 1;
+}
+return 0;
+}
+
+/* syntax id=0,streams=0,1,2 id=1,streams=3,4 and so on */
+state = new_set;
+while (p < c->adaptation_sets + strlen(c->adaptation_sets)) {


Isn't this equivalent to "while (*p) {"?


+if (*p == ' ') 

Re: [libav-devel] [PATCH 08/12] dashenc: add support for assigning streams to AdaptationSets

2017-01-27 Thread Martin Storsjö

On Fri, 27 Jan 2017, Peter Große wrote:


This patch is based on the stream assignment code in webmdashenc.

Additional changes:
 * Default to one AdaptationSet per stream

   Previously all mapped streams of a media type (video, audio) where assigned
   to a single AdaptationSet. Using the DASH live profile it is mandatory, that
   the segments of all representations are aligned, which is currently not
   enforced. This leads to problems when using video streams with different
   key frame intervals. So to play safe, default to one AdaptationSet per 
stream,
   unless overwritten by explicit assignment
 * Make sure all streams are assigned to exactly one AdaptationSet

Signed-off-by: Peter Große 
---
libavformat/dashenc.c | 193 ++
1 file changed, 162 insertions(+), 31 deletions(-)

diff --git a/libavformat/dashenc.c b/libavformat/dashenc.c
index 1b12d4f..c561ad1 100644
--- a/libavformat/dashenc.c
+++ b/libavformat/dashenc.c
@@ -24,6 +24,7 @@
#include 
#endif

+#include "libavutil/avutil.h"
#include "libavutil/avstring.h"
#include "libavutil/eval.h"
#include "libavutil/intreadwrite.h"
@@ -58,9 +59,15 @@ typedef struct Segment {
int n;
} Segment;

+typedef struct AdaptationSet {
+char id[10];
+enum AVMediaType media_type;
+AVDictionary *metadata;
+} AdaptationSet;


Oh, I also forgot to say; as far as I can see (on a sloppy read-through), 
metadata is unused in this patch. If that's really true, skip it here and 
only add it in the next patch where it is used. That makes this patch 
slightly less confusing.


// Martin
___
libav-devel mailing list
libav-devel@libav.org
https://lists.libav.org/mailman/listinfo/libav-devel

Re: [libav-devel] [PATCH 09/12] dashenc: copy language and role metadata from streams assigned to sets

2017-01-27 Thread Martin Storsjö

On Fri, 27 Jan 2017, Peter Große wrote:


Signed-off-by: Peter Große 
---
libavformat/dashenc.c | 24 +++-
1 file changed, 23 insertions(+), 1 deletion(-)

diff --git a/libavformat/dashenc.c b/libavformat/dashenc.c
index c561ad1..a0c7811 100644
--- a/libavformat/dashenc.c
+++ b/libavformat/dashenc.c
@@ -460,10 +460,19 @@ static int write_adaptation_set(AVFormatContext *s, 
AVIOContext *out, int as_ind
{
DASHContext *c = s->priv_data;
AdaptationSet *as = &c->as[as_index];
+AVDictionaryEntry *lang, *role;
int i;

-avio_printf(out, "\t\t\n",
+avio_printf(out, "\t\tid, as->media_type == AVMEDIA_TYPE_VIDEO ? "video" : 
"audio");
+lang = av_dict_get(as->metadata, "language", NULL, 0);
+if (lang)
+avio_printf(out, " lang=\"%s\"", lang->value);
+avio_printf(out, ">\n");
+
+role = av_dict_get(as->metadata, "role", NULL, 0);
+if (role)
+avio_printf(out, "\t\t\t\n", role->value);

for (i = 0; i < s->nb_streams; i++) {
OutputStream *os = &c->streams[i];
@@ -669,6 +678,14 @@ static int write_manifest(AVFormatContext *s, int final)
return ff_rename(temp_filename, s->filename);
}

+static int dict_copy_entry(AVDictionary **dst, const AVDictionary *src, const 
char *key)
+{
+AVDictionaryEntry *entry = av_dict_get(src, key, NULL, 0);
+if (entry)
+av_dict_set(dst, key, entry->value, AV_DICT_DONT_OVERWRITE);
+return 0;
+}
+
static int dash_write_header(AVFormatContext *s)
{
DASHContext *c = s->priv_data;
@@ -713,6 +730,7 @@ static int dash_write_header(AVFormatContext *s)

for (i = 0; i < s->nb_streams; i++) {
OutputStream *os = &c->streams[i];
+AdaptationSet *as = &c->as[os->as_idx - 1];
AVFormatContext *ctx;
AVStream *st;
AVDictionary *opts = NULL;
@@ -743,6 +761,10 @@ static int dash_write_header(AVFormatContext *s)
}
}

+// copy AdaptationSet language and role from stream metadata
+dict_copy_entry(&as->metadata, s->streams[i]->metadata, "language");
+dict_copy_entry(&as->metadata, s->streams[i]->metadata, "role");
+
ctx = avformat_alloc_context();
if (!ctx) {
ret = AVERROR(ENOMEM);
--
2.10.2


This seems ok, even though it (mis)uses metadata for this kind of info. Or 
what does Anton think of this one?


// Martin
___
libav-devel mailing list
libav-devel@libav.org
https://lists.libav.org/mailman/listinfo/libav-devel

Re: [libav-devel] [PATCH 10/12] dashenc: use avio_dynbuf instead of packet_write callback

2017-01-27 Thread Martin Storsjö

On Fri, 27 Jan 2017, Peter Große wrote:


The dash_write function drops data, if no IOContext is initialized.
This might happen when a subordinate muxer calls avio_flush().
Using a dynamic buffer fixes that.

Signed-off-by: Peter Große 
---
libavformat/dashenc.c | 87 +--
1 file changed, 50 insertions(+), 37 deletions(-)


This patch at least needs a mention of what muxer would do that.

Currently, the only subordinate muxer you can use is the mp4 one, and that 
one (with the configured options) doesn't flush except when told to.




diff --git a/libavformat/dashenc.c b/libavformat/dashenc.c
index a0c7811..86b454e 100644
--- a/libavformat/dashenc.c
+++ b/libavformat/dashenc.c
@@ -68,11 +68,10 @@ typedef struct AdaptationSet {
typedef struct OutputStream {
AVFormatContext *ctx;
int ctx_inited, as_idx;
-uint8_t iobuf[32768];
AVIOContext *out;
int duration_written;
char initfile[1024];
-int64_t init_start_pos;
+int64_t init_start_pos, pos;
int init_range_length;
int nb_segments, segments_size, segment_index;
Segment **segments;
@@ -108,14 +107,6 @@ typedef struct DASHContext {
const char *utc_timing_url;
} DASHContext;

-static int dash_write(void *opaque, uint8_t *buf, int buf_size)
-{
-OutputStream *os = opaque;
-if (os->out)
-avio_write(os->out, buf, buf_size);
-return buf_size;
-}
-
// RFC 6381
static void set_codec_str(AVFormatContext *s, AVCodecParameters *par,
  char *str, int size)
@@ -182,6 +173,29 @@ static void set_codec_str(AVFormatContext *s, 
AVCodecParameters *par,
}
}

+static int flush_dynbuf(OutputStream *os, int *range_length)
+{
+uint8_t *buffer;
+int ret;
+
+if (!os->ctx->pb) {
+return AVERROR(EINVAL);
+}
+
+// flush
+av_write_frame(os->ctx, NULL);
+avio_flush(os->ctx->pb);
+
+// write out to file
+*range_length = avio_close_dyn_buf(os->ctx->pb, &buffer);
+avio_write(os->out, buffer, *range_length);
+av_free(buffer);
+
+// re-open buffer
+ret = avio_open_dyn_buf(&os->ctx->pb);
+return ret;


The intermediate variable feels a little superfluous here


+}
+
static void dash_free(AVFormatContext *s)
{
DASHContext *c = s->priv_data;
@@ -203,7 +217,7 @@ static void dash_free(AVFormatContext *s)
if (os->ctx && os->ctx_inited)
av_write_trailer(os->ctx);
if (os->ctx && os->ctx->pb)
-av_free(os->ctx->pb);
+ffio_free_dyn_buf(&os->ctx->pb);
ff_format_io_close(s, &os->out);
if (os->ctx)
avformat_free_context(os->ctx);
@@ -689,7 +703,7 @@ static int dict_copy_entry(AVDictionary **dst, const 
AVDictionary *src, const ch
static int dash_write_header(AVFormatContext *s)
{
DASHContext *c = s->priv_data;
-int ret = 0, i;
+int ret = 0, i, range_length;
AVOutputFormat *oformat;
char *ptr;
char basename[1024];
@@ -786,11 +800,9 @@ static int dash_write_header(AVFormatContext *s)
st->time_base = s->streams[i]->time_base;
ctx->avoid_negative_ts = s->avoid_negative_ts;

-ctx->pb = avio_alloc_context(os->iobuf, sizeof(os->iobuf), 
AVIO_FLAG_WRITE, os, NULL, dash_write, NULL);
-if (!ctx->pb) {
-ret = AVERROR(ENOMEM);
+if ((ret = avio_open_dyn_buf(&ctx->pb)) < 0)
goto fail;
-}
+ctx->pb->seekable = 0;

if (c->single_file) {
if (c->single_file_name)
@@ -807,11 +819,17 @@ static int dash_write_header(AVFormatContext *s)
os->init_start_pos = 0;

av_dict_set(&opts, "movflags", "frag_custom+dash+delay_moov", 0);
-if ((ret = avformat_write_header(ctx, &opts)) < 0) {
- goto fail;
-}
+if ((ret = avformat_write_header(ctx, &opts)) < 0)
+goto fail;
+
+if ((ret = flush_dynbuf(os, &range_length)) < 0)
+goto fail;
+
+if (!c->single_file)
+ff_format_io_close(s, &os->out);
+
+os->pos = os->init_range_length = range_length;


This isn't correct. See c5e7ea13d2d4da0c5da91973a547afff6fe9e011. We 
intentionally defer writing the init segment until we have passed the 
first segment of packets into the muxer, to handle the cases where the 
init segments depend on knowledge from the actual first few packets.


(Proper edit lists for handling b-frame delay needs to know the timestamp 
of the first packet, and for AC3, the moov box takes a copy of the full 
first packet.)


For using the webm muxer, you probably might need to have this behaviour 
conditional on the kind of muxer.



os->ctx_inited = 1;
-avio_flush(ctx->pb);
av_dict_free(&opts);

av_log(s, AV_LOG_VERBOSE, "Representation %d init segment will be written 
to: %s\n", i, filename);
@@ -946,7 +964,6 @@ static int dash_flush(AVFormatContext *s, int final, int 
stream)
for (i = 0; i < s->nb_streams; i++) {
OutputStream 

Re: [libav-devel] [PATCH 11/12] dashenc: add webm support

2017-01-27 Thread Martin Storsjö

On Fri, 27 Jan 2017, Peter Große wrote:


Use webm muxer for VP8, VP9 and Opus codec, mp4 muxer otherwise.
Also copy stream metadata to output stream.

Signed-off-by: Peter Große 
---
libavformat/dashenc.c | 68 ---
1 file changed, 54 insertions(+), 14 deletions(-)

diff --git a/libavformat/dashenc.c b/libavformat/dashenc.c
index 86b454e..24665cd 100644
--- a/libavformat/dashenc.c
+++ b/libavformat/dashenc.c
@@ -113,6 +113,23 @@ static void set_codec_str(AVFormatContext *s, 
AVCodecParameters *par,
{
const AVCodecTag *tags[2] = { NULL, NULL };
uint32_t tag;
+
+// common Webm codecs are not part of RFC 6381
+switch (par->codec_id) {
+case AV_CODEC_ID_VP8:
+snprintf(str, size, "vp8");
+return;
+case AV_CODEC_ID_VP9:
+snprintf(str, size, "vp9");
+return;
+case AV_CODEC_ID_VORBIS:
+snprintf(str, size, "vorbis");
+return;
+case AV_CODEC_ID_OPUS:
+snprintf(str, size, "opus");
+return;
+}
+


Hmm, I'm pondering if it'd be worth to store these in some more compact 
form, like a table, for all codecs which just have a single simple plain 
string?



if (par->codec_type == AVMEDIA_TYPE_VIDEO)
tags[0] = ff_codec_movvideo_tags;
else if (par->codec_type == AVMEDIA_TYPE_AUDIO)
@@ -495,11 +512,11 @@ static int write_adaptation_set(AVFormatContext *s, 
AVIOContext *out, int as_ind
continue;

if (as->media_type == AVMEDIA_TYPE_VIDEO) {
-avio_printf(out, "\t\t\t\n",
-i, os->codec_str, os->bandwidth_str, s->streams[i]->codecpar->width, 
s->streams[i]->codecpar->height);
+avio_printf(out, "\t\t\t\n",
+i, os->ctx->oformat->name, os->codec_str, os->bandwidth_str, 
s->streams[i]->codecpar->width, s->streams[i]->codecpar->height);
} else {
-avio_printf(out, "\t\t\t\n",
-i, os->codec_str, os->bandwidth_str, 
s->streams[i]->codecpar->sample_rate);
+avio_printf(out, "\t\t\t\n",
+i, os->ctx->oformat->name, os->codec_str, os->bandwidth_str, 
s->streams[i]->codecpar->sample_rate);


Using oformat->name here feels a little fragile. I think I'd rather have 
that stored as a string somewhere in the context. (Not sure if it's better 
to have it "flexible" to compose it as video/ and audio/, 
or store the full "video/mp4" etc as audioMime and videoMime.)



avio_printf(out, "\t\t\t\t\n",
s->streams[i]->codecpar->channels);
}
@@ -700,11 +717,18 @@ static int dict_copy_entry(AVDictionary **dst, const 
AVDictionary *src, const ch
return 0;
}

+static int dict_set_int(AVDictionary **pm, const char *key, int64_t value, int 
flags)
+{
+char valuestr[22];
+snprintf(valuestr, sizeof(valuestr), "%"PRId64, value);
+flags &= ~AV_DICT_DONT_STRDUP_VAL;
+return av_dict_set(pm, key, valuestr, flags);
+}
+
static int dash_write_header(AVFormatContext *s)
{
DASHContext *c = s->priv_data;
int ret = 0, i, range_length;
-AVOutputFormat *oformat;
char *ptr;
char basename[1024];

@@ -727,12 +751,6 @@ static int dash_write_header(AVFormatContext *s)
if (ptr)
*ptr = '\0';

-oformat = av_guess_format("mp4", NULL, NULL);
-if (!oformat) {
-ret = AVERROR_MUXER_NOT_FOUND;
-goto fail;
-}
-
c->streams = av_mallocz(sizeof(*c->streams) * s->nb_streams);
if (!c->streams) {
ret = AVERROR(ENOMEM);
@@ -784,12 +802,25 @@ static int dash_write_header(AVFormatContext *s)
ret = AVERROR(ENOMEM);
goto fail;
}
+
+// choose muxer based on codec: webm for VP8/9 and opus, mp4 otherwise
+if (s->streams[i]->codecpar->codec_id == AV_CODEC_ID_VP8 ||
+s->streams[i]->codecpar->codec_id == AV_CODEC_ID_VP9 ||
+s->streams[i]->codecpar->codec_id == AV_CODEC_ID_OPUS) {
+ctx->oformat = av_guess_format("webm", NULL, NULL);
+} else {
+ctx->oformat = av_guess_format("mp4", NULL, NULL);
+}
+if (!ctx->oformat) {
+ret = AVERROR_MUXER_NOT_FOUND;
+goto fail;
+}
os->ctx = ctx;
-ctx->oformat = oformat;
ctx->interrupt_callback = s->interrupt_callback;
ctx->opaque = s->opaque;
ctx->io_close   = s->io_close;
ctx->io_open= s->io_open;
+av_dict_copy(&ctx->metadata, s->metadata, 0);


Does the webm muxer need some specific metadata which we don't pass 
through right now, or is it just for making sure that metadata ends up set 
on the chained muxer and included in the stream? I'd at least want a 
comment in the commit message explaining why this is necessary.




if (!(st = avformat_new_stream(ctx, NULL))) {
ret = AVERROR(ENOMEM);
@@ -798,7 +829,10 @@ static int dash_write_header(AVFormatContext *s)
avcodec_parameters_copy(s

Re: [libav-devel] [PATCH 12/12] doc: add dash muxer

2017-01-27 Thread Martin Storsjö

On Fri, 27 Jan 2017, Peter Große wrote:


Signed-off-by: Peter Große 
---
doc/muxers.texi | 53 +
1 file changed, 53 insertions(+)

diff --git a/doc/muxers.texi b/doc/muxers.texi
index 5430da7..95d8051 100644
--- a/doc/muxers.texi
+++ b/doc/muxers.texi
@@ -53,6 +53,59 @@ avconv -i INPUT -c:a pcm_u8 -c:v mpeg2video -f crc -

See also the @ref{framecrc} muxer.

+@anchor{dash}
+@section dash
+
+Dynamic Adaptive Streaming over HTTP (DASH) muxer that creates segments
+and manifest files according to the MPEG-DASH standard ISO/IEC 23009-1:2014.
+
+For more information see:
+
+@itemize @bullet
+@item
+WebM DASH Specification: 
@url{https://sites.google.com/a/webmproject.org/wiki/adaptive-streaming/webm-dash-specification}
+@item
+ISO DASH Specification: 
@url{http://standards.iso.org/ittf/PubliclyAvailableStandards/c065274_ISO_IEC_23009-1_2014.zip}
+@end itemize


I'd rather have the actual ISO spec first :P


+
+It creates a MPD manifest file and segment files for each stream.
+
+The segment filename might contain pre-defined identifiers used with 
SegmentTemplate
+as defined in section 5.3.9.4.4 of the standard. Available identifiers are 
"$RepresentationID$",
+"$Number$", "$Bandwidth$" and "$Time$".
+
+@example
+avconv -i in.nut -f dash
+@end example


It might be of value to show a full testcase also, where you use multiple 
quality streams for both audio and video.


I've used a command line like this locally for testing that:

./avconv -re -i  -map 0 -map 0 -acodec libfdk_aac -vcodec libx264 
-b:v:0 800k -b:v:1 300k -s:v:1 320x170 -profile:v:1 baseline -profile:v:0 
main -bf 1 -keyint_min 120 -g 120 -sc_threshold 0 -b_strategy 0 
-use_timeline 1 -use_template 1 -window_size 5 -f dash -ar:a:1 22050 
/path/to/out.mpd


This should afaik produce well-aligned keyframes in all substreams. Feel 
free to pick through this and see what of it makes sense to include in an 
example.


// Martin
___
libav-devel mailing list
libav-devel@libav.org
https://lists.libav.org/mailman/listinfo/libav-devel

Re: [libav-devel] [PATCH 04/12] dashenc: add option to provide UTC timing source

2017-01-27 Thread Martin Storsjö

On Fri, 27 Jan 2017, Peter Große wrote:


On Fri, 27 Jan 2017 14:58:21 +0200 (EET)
Martin Storsjö  wrote:


On Fri, 27 Jan 2017, Peter Große wrote:


If set, adds a UTCTime tag in the manifest.


UTCTiming, not UTCTime, right?


Right.


See
http://vm2.dashif.org/dash.js/docs/jsdocs/MediaPlayer.html#addUTCTimingSource
for more information. Usable default: "https://time.akamai.com/?iso";


Hmm, I don't see this documented in ISO/IEC 23009-1:2014 at least, and the
documentation from dashif don't seem to define it either.

I won't mind including it, but I'd like to know what standard defines it
before merging.


Mh true, this is not part of the standard, but introduced in their "Guidelines
for Implementations: DASH-IF Interoperability Points" [1] in Section 4.7.

Which URL shall I reference in the commit message? The guidelines link page,
the PDF, or not at all but mentioning the document's title?


I'd at least refer to the guidelines page, but linking explicitly to the 
PDF as well probably is good.



The "usable default" is just a hint. It is not mentioned in any document, it was
just what I found in player implementations (like mentioned in the message
above). I could drop it.


No, do keep it, any such extra info is useful.

But I started wondering, is there any actual use for this, when we 
explicitly list all the segments (either as individual segment filenames 
or via the template). As far as I can see, this is mostly necessary if you 
implicitly calculate the live edge based on availabilityStartTime and the 
segment length. So is there actually any gain from using it in our 
configuration?


// Martin
___
libav-devel mailing list
libav-devel@libav.org
https://lists.libav.org/mailman/listinfo/libav-devel

Re: [libav-devel] [PATCH 07/12] dashenc: calculate stream bitrate from first segment if not available

2017-01-27 Thread Martin Storsjö

On Fri, 27 Jan 2017, Peter Große wrote:


On Fri, 27 Jan 2017 15:21:20 +0200 (EET)
Martin Storsjö  wrote:


On Fri, 27 Jan 2017, Peter Große wrote:


Bandwidth information is required in the manifest, but not always
provided by the demuxer. If not available via metadata calculate
bandwith based on the size and duration of the first segment.



This probably is fine with me.

We can have $Bandwidth$ as part of the segment filename template - I guess
this is too late to get it right for those cases? Although I don't mind if
we don't support that case when we don't have a nominal bitrate set and
try to use it in the template.


Than we have to postpone opening any file handle until the first call to
dash_flush, right?
Which should be possible, but requires some code to be moved. Using a dyn_buf
(patch 10) might also help here, so writing to files happens as late a possible.


Hmm, indeed. If we postpone this patch until after that, this should be 
easier to handle right. (The alternative would be to just count the 
bitrate of the codec payload as we pass into the chained muxer; the 
bit_rate value we use otherwise also refers to only the codec payload data 
not including the container, but since we're probably doing the dyn_buf 
stuff anyway, that's even simpler.)


// Martin
___
libav-devel mailing list
libav-devel@libav.org
https://lists.libav.org/mailman/listinfo/libav-devel

Re: [libav-devel] [PATCH 08/12] dashenc: add support for assigning streams to AdaptationSets

2017-01-27 Thread Martin Storsjö

On Fri, 27 Jan 2017, Peter Große wrote:


On Fri, 27 Jan 2017 15:39:30 +0200 (EET)
Martin Storsjö  wrote:


On Fri, 27 Jan 2017, Peter Große wrote:


This patch is based on the stream assignment code in webmdashenc.


This muxer isn't available in libav, so the reference doesn't really work
in context here.


True, I thought I changed to something like "This patch is based on the stream
assignment code of a webm dash muxer by Vignesh Venkatasubramanian".

So far only a big part of the parse_adaption_set function is copied over, the
rest is basically my own code.
But it looks like, I'll to also make changes to that part (regarding your
comments below) so I might not need the reference anymore?!


It might be ok to drop the reference altogether. Or perhaps just add 
something like "originally based partially on code by V.V."? That way, you 
have something like proper attribution, given the amount of code taken 
(i.e. much inspiration but not necessarily much literal lines).


// Martin
___
libav-devel mailing list
libav-devel@libav.org
https://lists.libav.org/mailman/listinfo/libav-devel

Re: [libav-devel] [PATCH 10/12] dashenc: use avio_dynbuf instead of packet_write callback

2017-01-27 Thread Martin Storsjö

On Fri, 27 Jan 2017, Peter Große wrote:


On Fri, 27 Jan 2017 16:02:23 +0200 (EET)
Martin Storsjö  wrote:


On Fri, 27 Jan 2017, Peter Große wrote:


av_dict_set(&opts, "movflags", "frag_custom+dash+delay_moov", 0);
-if ((ret = avformat_write_header(ctx, &opts)) < 0) {
- goto fail;
-}
+if ((ret = avformat_write_header(ctx, &opts)) < 0)
+goto fail;
+
+if ((ret = flush_dynbuf(os, &range_length)) < 0)
+goto fail;
+
+if (!c->single_file)
+ff_format_io_close(s, &os->out);
+
+os->pos = os->init_range_length = range_length;


This isn't correct. See c5e7ea13d2d4da0c5da91973a547afff6fe9e011. We
intentionally defer writing the init segment until we have passed the
first segment of packets into the muxer, to handle the cases where the
init segments depend on knowledge from the actual first few packets.

(Proper edit lists for handling b-frame delay needs to know the timestamp
of the first packet, and for AC3, the moov box takes a copy of the full
first packet.)

For using the webm muxer, you probably might need to have this behaviour
conditional on the kind of muxer.


So how do you avoid spilling actual packets into the init segment?
Is it that av_write_frame only writes to the file, what was flushed before
using avio_flush?


This relies on the fact that in frag_custom mode, the muxer itself will 
never spontaneously flush any output, it only does that when explicitly 
flushed (with av_write_frame(NULL)). When you have the delay_moov flag 
added, it won't write anything at all during av_write_header, and will 
write the init segment during the first av_write_frame(NULL). So if 
changing existing code, if you add delay_moov, you need to add one extra 
flush to get the init segment. (If you flush earlier, the moov will be 
based more on guesses and less on actual data.)



This code resulted from dealing with matroskaenc having his own idea when to
flush and start new clusters.

So my first idea would to the move the "writing out the init segment" code to a
function and call it conditionally depending on the muxer.
If webm, call it in write_header. Then in dash_flush call it, if
os->init_range_length is zero.


Yeah, that sounds sensible. Or rather invert the condition - instead of 
if (webm), do if (!mp4), since it's specific to the mp4+delay_moov combo.


// Martin;
___
libav-devel mailing list
libav-devel@libav.org
https://lists.libav.org/mailman/listinfo/libav-devel

Re: [libav-devel] [PATCH 11/12] dashenc: add webm support

2017-01-27 Thread Martin Storsjö

On Fri, 27 Jan 2017, Peter Große wrote:


On Fri, 27 Jan 2017 16:09:06 +0200 (EET)
Martin Storsjö  wrote:


+// common Webm codecs are not part of RFC 6381
+switch (par->codec_id) {
+case AV_CODEC_ID_VP8:
+snprintf(str, size, "vp8");
+return;
+case AV_CODEC_ID_VP9:
+snprintf(str, size, "vp9");
+return;
+case AV_CODEC_ID_VORBIS:
+snprintf(str, size, "vorbis");
+return;
+case AV_CODEC_ID_OPUS:
+snprintf(str, size, "opus");
+return;
+}
+


Hmm, I'm pondering if it'd be worth to store these in some more compact
form, like a table, for all codecs which just have a single simple plain
string?


Do you have some example code? I'm not sure if I get how to do tables in C.

Like with a struct?

struct codec_string {
int codec_id;
const char *codec_str;
};
struct codec_string codecs[] = {
   {AV_CODEC_ID_VP8, "vp8"},
{AV_CODEC_ID_VP9, "vp9"},
...
};


Yes, something like this. If favouring compact code (which we tend to do), 
something like this would do:


static struct codec_string {
int id;
const char *str;
} codecs[] = {
{ AV_CODEC_ID_VP8, "vp8" },
...
{ 0, NULL }
};

int i;
for (i = 0; codecs[i].id; i++)
if (codecs[i].id == id)
return codecs[i].str;

All in all, the number of lines probably might end up higher than before, 
but the bulk of the repetitive part is more compact, and adding another 
entry is less annoying.



Using oformat->name here feels a little fragile. I think I'd rather have
that stored as a string somewhere in the context. (Not sure if it's better
to have it "flexible" to compose it as video/ and audio/,
or store the full "video/mp4" etc as audioMime and videoMime.)


I might opt for the short version, since I use the format name also for 
conditionals like

 if (!strcmp(os->ctx->oformat->name, "mp4"))

Therefor maybe a char *format_name in the OutputStream struct?


That might be ok. Maybe a comment at the place where it's assigned, saying 
that this will be used as part of a mime string (so future other formats 
won't accidentally create bogus mime strings).



ctx->interrupt_callback = s->interrupt_callback;
ctx->opaque = s->opaque;
ctx->io_close   = s->io_close;
ctx->io_open= s->io_open;
+av_dict_copy(&ctx->metadata, s->metadata, 0);


Does the webm muxer need some specific metadata which we don't pass
through right now, or is it just for making sure that metadata ends up set
on the chained muxer and included in the stream? I'd at least want a
comment in the commit message explaining why this is necessary.


No specific metadata is needed. I can move this to a separate commit.


if (!(st = avformat_new_stream(ctx, NULL))) {
ret = AVERROR(ENOMEM);
@@ -798,7 +829,10 @@ static int dash_write_header(AVFormatContext *s)
avcodec_parameters_copy(st->codecpar, s->streams[i]->codecpar);
st->sample_aspect_ratio = s->streams[i]->sample_aspect_ratio;
st->time_base = s->streams[i]->time_base;
+st->avg_frame_rate = s->streams[i]->avg_frame_rate;
ctx->avoid_negative_ts = s->avoid_negative_ts;
+ctx->flags = s->flags;
+ctx->max_delay = s->max_delay;


Are these new settings strictly necessary for chained-webm? If not, I'd
rather add them in a separate commit afterwards, with an explanation on
what it helps for.


To be honest, I haven't checked. I just looked at what other muxers having 
subordinate muxers do, when they initialize the context.


Ok - absolutely in another commit then, if at all.

In general it might seem useful to pass on as many fields as possible, but 
so far, I've taken the inverse approach; only forward the ones I 
absolutely know I need. I probably don't mind adding this with a commit 
message with a good motivation though.


// Martin
___
libav-devel mailing list
libav-devel@libav.org
https://lists.libav.org/mailman/listinfo/libav-devel

Re: [libav-devel] [PATCH 07/12] dashenc: calculate stream bitrate from first segment if not available

2017-01-28 Thread Martin Storsjö

On Sat, 28 Jan 2017, Peter Große wrote:


On Fri, 27 Jan 2017 22:24:35 +0200 (EET)
Martin Storsjö  wrote:


Than we have to postpone opening any file handle until the first call to
dash_flush, right?
Which should be possible, but requires some code to be moved. Using a
dyn_buf (patch 10) might also help here, so writing to files happens as
late a possible.


Hmm, indeed. If we postpone this patch until after that, this should be
easier to handle right. (The alternative would be to just count the
bitrate of the codec payload as we pass into the chained muxer; the
bit_rate value we use otherwise also refers to only the codec payload data
not including the container, but since we're probably doing the dyn_buf
stuff anyway, that's even simpler.)


Mh, after thinking about this a bit more, I'm not sure the complexity is worth
it.

I'd have to store the contents of the dyn_buf containing the init data somewhere
until the first packet is ready to be flushed so I can calculate the average
bitrate to be used in the filenames. Only then I know the init segment filename.

This might be ok for mp4, since everything happens within the first call to
dash_flush. But for webm this doesn't work that easy.

So maybe the patch is ok for now.


Ok - it's not worth complicating the code too much just for that 
pathological case.


// Martin
___
libav-devel mailing list
libav-devel@libav.org
https://lists.libav.org/mailman/listinfo/libav-devel

Re: [libav-devel] [PATCH v2 01/10] dashenc: add option to provide UTC timing source

2017-01-29 Thread Martin Storsjö

On Sun, 29 Jan 2017, Peter Große wrote:


From: Peter Große 

If set, adds a UTCTiming tag in the manifest.

This is part of the recommendations listed in the "Guidelines for
Implementations: DASH-IF Interoperability Points" [1][2]
Section 4.7 describes means for the Availability Time Synchronization.

A usable default is "https://time.akamai.com/?iso";

[1] http://dashif.org/guidelines/
[2] http://dashif.org/wp-content/uploads/2016/12/DASH-IF-IOP-v4.0-clean.pdf
   (current version as of writing)

Signed-off-by: Peter Große 
---
v2: add references to IOP


Content-wise, the patch looks fine. But I'm still missing a comment on a 
question from the previous review:


But I started wondering, is there any actual use for this, when we 
explicitly list all the segments (either as individual segment filenames 
or via the template). As far as I can see, this is mostly necessary if 
you implicitly calculate the live edge based on availabilityStartTime 
and the segment length. So is there actually any gain from using it in 
our configuration?


// Martin
___
libav-devel mailing list
libav-devel@libav.org
https://lists.libav.org/mailman/listinfo/libav-devel

Re: [libav-devel] [PATCH v2 02/10] dashenc: separate segments based on current segment duration

2017-01-29 Thread Martin Storsjö

On Sun, 29 Jan 2017, Peter Große wrote:


The current implementation creates new segments comparing

 pkt->pts - first_pts > nb_segs * min_seg_duration

This works fine, but if the keyframe interval is smaller than "min_seg_duration"
segments shorter than the minimum segment duration are created.

Example: keyint=50, min_seg_duration=300
segment 1 contains keyframe 1 (duration=2s <  total_duration=3s)
   and keyframe 2 (duration=4s >= total_duration=3s)
segment 2 contains keyframe 3 (duration=6s >= total_duration=6s)
segment 3 contains keyframe 4 (duration=8s <  total_duration=9s)
   and keyframe 5 (duration=10s >= total_duration=9s)
...

Segment 2 is only 2s long, shorter than min_seg_duration = 3s.

To fix this, new segments are created based on the actual written duration.
Otherwise the option name "min_seg_duration" is misleading.

Signed-off-by: Peter Große 
---
v2: use pts difference instead of fragile duration calculation


Seems ok to me.

// Martin
___
libav-devel mailing list
libav-devel@libav.org
https://lists.libav.org/mailman/listinfo/libav-devel

Re: [libav-devel] [PATCH v2 03/10] dashenc: calculate stream bitrate from first segment if not available

2017-01-29 Thread Martin Storsjö

On Sun, 29 Jan 2017, Peter Große wrote:


Bandwidth information is required in the manifest, but not always
provided by the demuxer. In that case calculate the bandwith based
on the size and duration of the first segment.

Signed-off-by: Peter Große 
---
v2: reword commit message
---
libavformat/dashenc.c | 10 ++
1 file changed, 10 insertions(+)

diff --git a/libavformat/dashenc.c b/libavformat/dashenc.c
index 98722cd..ae1bb0b 100644
--- a/libavformat/dashenc.c
+++ b/libavformat/dashenc.c
@@ -834,6 +834,16 @@ static int dash_flush(AVFormatContext *s, int final, int 
stream)
if (ret < 0)
break;
}
+
+if (!os->bit_rate) {
+// calculate average bitrate of first segment
+double bitrate = (int)( (double) range_length * 8.0 * AV_TIME_BASE / 
(double)(os->max_pts - os->start_pts) );


Sorry for not pointing it out in the last round, but promoting this to 
double feels a bit superfluous. Just cast to int64_t instead, that should 
be just fine for this use case. (Floats can end up rounded differently on 
different platforms, and even though it doesn't really matter here, having 
it completely determinisic when easily possible is desireable.)


// Martin
___
libav-devel mailing list
libav-devel@libav.org
https://lists.libav.org/mailman/listinfo/libav-devel

Re: [libav-devel] [PATCH v2 04/10] dashenc: add support for assigning streams to AdaptationSets

2017-01-29 Thread Martin Storsjö

On Sun, 29 Jan 2017, Peter Große wrote:


Also makes sure all streams are assigned to exactly one AdaptationSet.

This patch is originally based partially on code by Vignesh Venkatasubramanian.

Signed-off-by: Peter Große 
---
v2:
* changing default stream assignment moved to separate patch
* removed metadata field from AdaptationSet
* check strtol result in parse_adaptation_set to detect non-numeric input
* move allocation of adaptation set to separate function
* fix string parsing problems pointed out by Martin
---
libavformat/dashenc.c | 201 ++
1 file changed, 170 insertions(+), 31 deletions(-)


Very nice improvement from the previous iteration! I appreciate the fact 
that the default behaviour isn't changed here as well.



+os = &c->streams[i];
+if (as->media_type == AVMEDIA_TYPE_UNKNOWN) {
+as->media_type = s->streams[i]->codecpar->codec_type;
+} else if (as->media_type != s->streams[i]->codecpar->codec_type) {
+av_log(s, AV_LOG_ERROR, "Mixing codec types within an AdaptationSet 
is not allowed\n");
+return AVERROR(EINVAL);
+} else if (os->as_idx) {
+av_log(s, AV_LOG_ERROR, "Assigning a stream to more than one 
AdaptationSet is not allowed\n");
+return AVERROR(EINVAL);
+}
+os->as_idx = c->nb_as;
+
+if (*p == ' ') state = new_set;
+if (*p)
+p++;


Nitpick: The state = new_set; statement could be on a separate line. I can 
change that before pushing. Other than that, I think this is good to go.


// Martin
___
libav-devel mailing list
libav-devel@libav.org
https://lists.libav.org/mailman/listinfo/libav-devel

Re: [libav-devel] [PATCH v2 05/10] dashenc: allow assigning all streams of a media type to an AdaptationSet

2017-01-29 Thread Martin Storsjö

On Sun, 29 Jan 2017, Peter Große wrote:


Using the characters "v" or "a" instead of stream index numbers for assigning
streams in the adaption_set option, all streams matching that given type will
be added to the AdaptationSet.

Signed-off-by: Peter Große 
---
libavformat/dashenc.c | 61 +--
1 file changed, 45 insertions(+), 16 deletions(-)


Nice, this looks good

// Martin
___
libav-devel mailing list
libav-devel@libav.org
https://lists.libav.org/mailman/listinfo/libav-devel

  1   2   3   4   5   6   7   8   9   10   >