[FFmpeg-cvslog] avcodec/arm/sbcenc: avoid callee preserved vfp registers

2022-09-20 Thread James Cowgill
ffmpeg | branch: release/4.4 | James Cowgill  | Sun Aug 25 
09:18:00 2019 +0100| [aa28df74ab197c49a05fecc40c81e0f8ec4ad0c3] | committer: 
Martin Storsjö

avcodec/arm/sbcenc: avoid callee preserved vfp registers

When compiling FFmpeg with GCC 9, seemingly random segfaults were
observed in code that had previously called into the SBC encoder
NEON assembly routines. This was caused by these functions clobbering
some of the VFP callee-saved registers (d8-d15, aka q4-q7). GCC was
using these registers to hold local variables, but after these
functions returned, the registers would contain garbage.

Fix by reallocating the registers in the two affected functions in
the following way:
 ff_sbc_analyze_4_neon: q2-q5 => q8-q11, then q1-q4 => q8-q11
 ff_sbc_analyze_8_neon: q2-q9 => q8-q15

These replacements were chosen to keep closely related sets of
registers consecutively numbered, which hopefully makes the code
easier to follow. Since this commit only reallocates registers,
it should have no performance impact.

Signed-off-by: James Cowgill 
Signed-off-by: Martin Storsjö 
(cherry picked from commit 50a4dff69f6477b06f00eae1cac2a53ae22fe9a5)
Signed-off-by: Martin Storsjö 

> http://git.videolan.org/gitweb.cgi/ffmpeg.git/?a=commit;h=aa28df74ab197c49a05fecc40c81e0f8ec4ad0c3
---

 libavcodec/arm/sbcdsp_neon.S | 220 +--
 1 file changed, 110 insertions(+), 110 deletions(-)

diff --git a/libavcodec/arm/sbcdsp_neon.S b/libavcodec/arm/sbcdsp_neon.S
index d83d21d202..914abfb6cc 100644
--- a/libavcodec/arm/sbcdsp_neon.S
+++ b/libavcodec/arm/sbcdsp_neon.S
@@ -38,49 +38,49 @@ function ff_sbc_analyze_4_neon, export=1
 /* TODO: merge even and odd cases (or even merge all four calls to this
  * function) in order to have only aligned reads from 'in' array
  * and reduce number of load instructions */
-vld1.16 {d4, d5}, [r0, :64]!
-vld1.16 {d8, d9}, [r2, :128]!
+vld1.16 {d16, d17}, [r0, :64]!
+vld1.16 {d20, d21}, [r2, :128]!
 
-vmull.s16   q0, d4, d8
-vld1.16 {d6,  d7}, [r0, :64]!
-vmull.s16   q1, d5, d9
-vld1.16 {d10, d11}, [r2, :128]!
+vmull.s16   q0, d16, d20
+vld1.16 {d18, d19}, [r0, :64]!
+vmull.s16   q1, d17, d21
+vld1.16 {d22, d23}, [r2, :128]!
 
-vmlal.s16   q0, d6, d10
-vld1.16 {d4, d5}, [r0, :64]!
-vmlal.s16   q1, d7, d11
-vld1.16 {d8, d9}, [r2, :128]!
+vmlal.s16   q0, d18, d22
+vld1.16 {d16, d17}, [r0, :64]!
+vmlal.s16   q1, d19, d23
+vld1.16 {d20, d21}, [r2, :128]!
 
-vmlal.s16   q0, d4, d8
-vld1.16 {d6,  d7}, [r0, :64]!
-vmlal.s16   q1, d5, d9
-vld1.16 {d10, d11}, [r2, :128]!
+vmlal.s16   q0, d16, d20
+vld1.16 {d18, d19}, [r0, :64]!
+vmlal.s16   q1, d17, d21
+vld1.16 {d22, d23}, [r2, :128]!
 
-vmlal.s16   q0, d6, d10
-vld1.16 {d4, d5}, [r0, :64]!
-vmlal.s16   q1, d7, d11
-vld1.16 {d8, d9}, [r2, :128]!
+vmlal.s16   q0, d18, d22
+vld1.16 {d16, d17}, [r0, :64]!
+vmlal.s16   q1, d19, d23
+vld1.16 {d20, d21}, [r2, :128]!
 
-vmlal.s16   q0, d4, d8
-vmlal.s16   q1, d5, d9
+vmlal.s16   q0, d16, d20
+vmlal.s16   q1, d17, d21
 
 vpadd.s32   d0, d0, d1
 vpadd.s32   d1, d2, d3
 
 vrshrn.s32  d0, q0, SBC_PROTO_FIXED_SCALE
 
-vld1.16 {d2, d3, d4, d5}, [r2, :128]!
+vld1.16 {d16, d17, d18, d19}, [r2, :128]!
 
 vdup.i32    d1, d0[1]  /* TODO: can be eliminated */
 vdup.i32    d0, d0[0]  /* TODO: can be eliminated */
 
-vmull.s16   q3, d2, d0
-vmull.s16   q4, d3, d0
-vmlal.s16   q3, d4, d1
-vmlal.s16   q4, d5, d1
+vmull.s16   q10, d16, d0
+vmull.s16   q11, d17, d0
+vmlal.s16   q10, d18, d1
+vmlal.s16   q11, d19, d1
 
-vpadd.s32   d0, d6, d7 /* TODO: can be eliminated */
-vpadd.s32   d1, d8, d9 /* TODO: can be eliminated */
+vpadd.s32   d0, d20, d21 /* TODO: can be eliminated */
+vpadd.s32   d1, d22, d23 /* TODO: can be eliminated */
 
 vst1.32 {d0, d1}, [r1, :128]
 
@@ -91,57 +91,57 @@ function ff_sbc_analyze_8_neon, export=1
 /* TODO: merge even and odd cases (or even merge all four calls to this
  * function) in order to have only aligned reads from 'in' array
  * and reduce number of load instructions */
-vld1.16 {d4, d5}, [r0, :64]!
-vld1.16 {d8, d9}, [r2, :128]!
-
-vmull.s16   q6, d4, d8
- 

[FFmpeg-cvslog] avcodec/arm/sbcenc: avoid callee preserved vfp registers

2022-09-20 Thread James Cowgill
ffmpeg | branch: release/5.0 | James Cowgill  | Sun Aug 25 
09:18:00 2019 +0100| [c1b8ffbed83e79c2a12a8e3639baae6160a484c5] | committer: 
Martin Storsjö

avcodec/arm/sbcenc: avoid callee preserved vfp registers

When compiling FFmpeg with GCC 9, seemingly random segfaults were
observed in code that had previously called into the SBC encoder
NEON assembly routines. This was caused by these functions clobbering
some of the VFP callee-saved registers (d8-d15, aka q4-q7). GCC was
using these registers to hold local variables, but after these
functions returned, the registers would contain garbage.

Fix by reallocating the registers in the two affected functions in
the following way:
 ff_sbc_analyze_4_neon: q2-q5 => q8-q11, then q1-q4 => q8-q11
 ff_sbc_analyze_8_neon: q2-q9 => q8-q15

These replacements were chosen to keep closely related sets of
registers consecutively numbered, which hopefully makes the code
easier to follow. Since this commit only reallocates registers,
it should have no performance impact.

Signed-off-by: James Cowgill 
Signed-off-by: Martin Storsjö 
(cherry picked from commit 50a4dff69f6477b06f00eae1cac2a53ae22fe9a5)
Signed-off-by: Martin Storsjö 

> http://git.videolan.org/gitweb.cgi/ffmpeg.git/?a=commit;h=c1b8ffbed83e79c2a12a8e3639baae6160a484c5
---

 libavcodec/arm/sbcdsp_neon.S | 220 +--
 1 file changed, 110 insertions(+), 110 deletions(-)

diff --git a/libavcodec/arm/sbcdsp_neon.S b/libavcodec/arm/sbcdsp_neon.S
index d83d21d202..914abfb6cc 100644
--- a/libavcodec/arm/sbcdsp_neon.S
+++ b/libavcodec/arm/sbcdsp_neon.S
@@ -38,49 +38,49 @@ function ff_sbc_analyze_4_neon, export=1
 /* TODO: merge even and odd cases (or even merge all four calls to this
  * function) in order to have only aligned reads from 'in' array
  * and reduce number of load instructions */
-vld1.16 {d4, d5}, [r0, :64]!
-vld1.16 {d8, d9}, [r2, :128]!
+vld1.16 {d16, d17}, [r0, :64]!
+vld1.16 {d20, d21}, [r2, :128]!
 
-vmull.s16   q0, d4, d8
-vld1.16 {d6,  d7}, [r0, :64]!
-vmull.s16   q1, d5, d9
-vld1.16 {d10, d11}, [r2, :128]!
+vmull.s16   q0, d16, d20
+vld1.16 {d18, d19}, [r0, :64]!
+vmull.s16   q1, d17, d21
+vld1.16 {d22, d23}, [r2, :128]!
 
-vmlal.s16   q0, d6, d10
-vld1.16 {d4, d5}, [r0, :64]!
-vmlal.s16   q1, d7, d11
-vld1.16 {d8, d9}, [r2, :128]!
+vmlal.s16   q0, d18, d22
+vld1.16 {d16, d17}, [r0, :64]!
+vmlal.s16   q1, d19, d23
+vld1.16 {d20, d21}, [r2, :128]!
 
-vmlal.s16   q0, d4, d8
-vld1.16 {d6,  d7}, [r0, :64]!
-vmlal.s16   q1, d5, d9
-vld1.16 {d10, d11}, [r2, :128]!
+vmlal.s16   q0, d16, d20
+vld1.16 {d18, d19}, [r0, :64]!
+vmlal.s16   q1, d17, d21
+vld1.16 {d22, d23}, [r2, :128]!
 
-vmlal.s16   q0, d6, d10
-vld1.16 {d4, d5}, [r0, :64]!
-vmlal.s16   q1, d7, d11
-vld1.16 {d8, d9}, [r2, :128]!
+vmlal.s16   q0, d18, d22
+vld1.16 {d16, d17}, [r0, :64]!
+vmlal.s16   q1, d19, d23
+vld1.16 {d20, d21}, [r2, :128]!
 
-vmlal.s16   q0, d4, d8
-vmlal.s16   q1, d5, d9
+vmlal.s16   q0, d16, d20
+vmlal.s16   q1, d17, d21
 
 vpadd.s32   d0, d0, d1
 vpadd.s32   d1, d2, d3
 
 vrshrn.s32  d0, q0, SBC_PROTO_FIXED_SCALE
 
-vld1.16 {d2, d3, d4, d5}, [r2, :128]!
+vld1.16 {d16, d17, d18, d19}, [r2, :128]!
 
 vdup.i32    d1, d0[1]  /* TODO: can be eliminated */
 vdup.i32    d0, d0[0]  /* TODO: can be eliminated */
 
-vmull.s16   q3, d2, d0
-vmull.s16   q4, d3, d0
-vmlal.s16   q3, d4, d1
-vmlal.s16   q4, d5, d1
+vmull.s16   q10, d16, d0
+vmull.s16   q11, d17, d0
+vmlal.s16   q10, d18, d1
+vmlal.s16   q11, d19, d1
 
-vpadd.s32   d0, d6, d7 /* TODO: can be eliminated */
-vpadd.s32   d1, d8, d9 /* TODO: can be eliminated */
+vpadd.s32   d0, d20, d21 /* TODO: can be eliminated */
+vpadd.s32   d1, d22, d23 /* TODO: can be eliminated */
 
 vst1.32 {d0, d1}, [r1, :128]
 
@@ -91,57 +91,57 @@ function ff_sbc_analyze_8_neon, export=1
 /* TODO: merge even and odd cases (or even merge all four calls to this
  * function) in order to have only aligned reads from 'in' array
  * and reduce number of load instructions */
-vld1.16 {d4, d5}, [r0, :64]!
-vld1.16 {d8, d9}, [r2, :128]!
-
-vmull.s16   q6, d4, d8
- 

[FFmpeg-cvslog] avcodec/arm/sbcenc: avoid callee preserved vfp registers

2022-09-20 Thread James Cowgill
ffmpeg | branch: release/5.1 | James Cowgill  | Sun Aug 25 
09:18:00 2019 +0100| [2d04a18264e6e9b7f548954c5d1c51c3ab01b038] | committer: 
Martin Storsjö

avcodec/arm/sbcenc: avoid callee preserved vfp registers

When compiling FFmpeg with GCC 9, seemingly random segfaults were
observed in code that had previously called into the SBC encoder
NEON assembly routines. This was caused by these functions clobbering
some of the VFP callee-saved registers (d8-d15, aka q4-q7). GCC was
using these registers to hold local variables, but after these
functions returned, the registers would contain garbage.

Fix by reallocating the registers in the two affected functions in
the following way:
 ff_sbc_analyze_4_neon: q2-q5 => q8-q11, then q1-q4 => q8-q11
 ff_sbc_analyze_8_neon: q2-q9 => q8-q15

These replacements were chosen to keep closely related sets of
registers consecutively numbered, which hopefully makes the code
easier to follow. Since this commit only reallocates registers,
it should have no performance impact.

Signed-off-by: James Cowgill 
Signed-off-by: Martin Storsjö 
(cherry picked from commit 50a4dff69f6477b06f00eae1cac2a53ae22fe9a5)
Signed-off-by: Martin Storsjö 

> http://git.videolan.org/gitweb.cgi/ffmpeg.git/?a=commit;h=2d04a18264e6e9b7f548954c5d1c51c3ab01b038
---

 libavcodec/arm/sbcdsp_neon.S | 220 +--
 1 file changed, 110 insertions(+), 110 deletions(-)

diff --git a/libavcodec/arm/sbcdsp_neon.S b/libavcodec/arm/sbcdsp_neon.S
index d83d21d202..914abfb6cc 100644
--- a/libavcodec/arm/sbcdsp_neon.S
+++ b/libavcodec/arm/sbcdsp_neon.S
@@ -38,49 +38,49 @@ function ff_sbc_analyze_4_neon, export=1
 /* TODO: merge even and odd cases (or even merge all four calls to this
  * function) in order to have only aligned reads from 'in' array
  * and reduce number of load instructions */
-vld1.16 {d4, d5}, [r0, :64]!
-vld1.16 {d8, d9}, [r2, :128]!
+vld1.16 {d16, d17}, [r0, :64]!
+vld1.16 {d20, d21}, [r2, :128]!
 
-vmull.s16   q0, d4, d8
-vld1.16 {d6,  d7}, [r0, :64]!
-vmull.s16   q1, d5, d9
-vld1.16 {d10, d11}, [r2, :128]!
+vmull.s16   q0, d16, d20
+vld1.16 {d18, d19}, [r0, :64]!
+vmull.s16   q1, d17, d21
+vld1.16 {d22, d23}, [r2, :128]!
 
-vmlal.s16   q0, d6, d10
-vld1.16 {d4, d5}, [r0, :64]!
-vmlal.s16   q1, d7, d11
-vld1.16 {d8, d9}, [r2, :128]!
+vmlal.s16   q0, d18, d22
+vld1.16 {d16, d17}, [r0, :64]!
+vmlal.s16   q1, d19, d23
+vld1.16 {d20, d21}, [r2, :128]!
 
-vmlal.s16   q0, d4, d8
-vld1.16 {d6,  d7}, [r0, :64]!
-vmlal.s16   q1, d5, d9
-vld1.16 {d10, d11}, [r2, :128]!
+vmlal.s16   q0, d16, d20
+vld1.16 {d18, d19}, [r0, :64]!
+vmlal.s16   q1, d17, d21
+vld1.16 {d22, d23}, [r2, :128]!
 
-vmlal.s16   q0, d6, d10
-vld1.16 {d4, d5}, [r0, :64]!
-vmlal.s16   q1, d7, d11
-vld1.16 {d8, d9}, [r2, :128]!
+vmlal.s16   q0, d18, d22
+vld1.16 {d16, d17}, [r0, :64]!
+vmlal.s16   q1, d19, d23
+vld1.16 {d20, d21}, [r2, :128]!
 
-vmlal.s16   q0, d4, d8
-vmlal.s16   q1, d5, d9
+vmlal.s16   q0, d16, d20
+vmlal.s16   q1, d17, d21
 
 vpadd.s32   d0, d0, d1
 vpadd.s32   d1, d2, d3
 
 vrshrn.s32  d0, q0, SBC_PROTO_FIXED_SCALE
 
-vld1.16 {d2, d3, d4, d5}, [r2, :128]!
+vld1.16 {d16, d17, d18, d19}, [r2, :128]!
 
 vdup.i32    d1, d0[1]  /* TODO: can be eliminated */
 vdup.i32    d0, d0[0]  /* TODO: can be eliminated */
 
-vmull.s16   q3, d2, d0
-vmull.s16   q4, d3, d0
-vmlal.s16   q3, d4, d1
-vmlal.s16   q4, d5, d1
+vmull.s16   q10, d16, d0
+vmull.s16   q11, d17, d0
+vmlal.s16   q10, d18, d1
+vmlal.s16   q11, d19, d1
 
-vpadd.s32   d0, d6, d7 /* TODO: can be eliminated */
-vpadd.s32   d1, d8, d9 /* TODO: can be eliminated */
+vpadd.s32   d0, d20, d21 /* TODO: can be eliminated */
+vpadd.s32   d1, d22, d23 /* TODO: can be eliminated */
 
 vst1.32 {d0, d1}, [r1, :128]
 
@@ -91,57 +91,57 @@ function ff_sbc_analyze_8_neon, export=1
 /* TODO: merge even and odd cases (or even merge all four calls to this
  * function) in order to have only aligned reads from 'in' array
  * and reduce number of load instructions */
-vld1.16 {d4, d5}, [r0, :64]!
-vld1.16 {d8, d9}, [r2, :128]!
-
-vmull.s16   q6, d4, d8
- 

[FFmpeg-cvslog] avcodec/arm/sbcenc: avoid callee preserved vfp registers

2022-09-13 Thread James Cowgill
ffmpeg | branch: master | James Cowgill  | Sun Aug 25 
09:18:00 2019 +0100| [50a4dff69f6477b06f00eae1cac2a53ae22fe9a5] | committer: 
Martin Storsjö

avcodec/arm/sbcenc: avoid callee preserved vfp registers

When compiling FFmpeg with GCC 9, seemingly random segfaults were
observed in code that had previously called into the SBC encoder
NEON assembly routines. This was caused by these functions clobbering
some of the VFP callee-saved registers (d8-d15, aka q4-q7). GCC was
using these registers to hold local variables, but after these
functions returned, the registers would contain garbage.

Fix by reallocating the registers in the two affected functions in
the following way:
 ff_sbc_analyze_4_neon: q2-q5 => q8-q11, then q1-q4 => q8-q11
 ff_sbc_analyze_8_neon: q2-q9 => q8-q15

These replacements were chosen to keep closely related sets of
registers consecutively numbered, which hopefully makes the code
easier to follow. Since this commit only reallocates registers,
it should have no performance impact.

Signed-off-by: James Cowgill 
Signed-off-by: Martin Storsjö 

> http://git.videolan.org/gitweb.cgi/ffmpeg.git/?a=commit;h=50a4dff69f6477b06f00eae1cac2a53ae22fe9a5
---

 libavcodec/arm/sbcdsp_neon.S | 220 +--
 1 file changed, 110 insertions(+), 110 deletions(-)

diff --git a/libavcodec/arm/sbcdsp_neon.S b/libavcodec/arm/sbcdsp_neon.S
index d83d21d202..914abfb6cc 100644
--- a/libavcodec/arm/sbcdsp_neon.S
+++ b/libavcodec/arm/sbcdsp_neon.S
@@ -38,49 +38,49 @@ function ff_sbc_analyze_4_neon, export=1
 /* TODO: merge even and odd cases (or even merge all four calls to this
  * function) in order to have only aligned reads from 'in' array
  * and reduce number of load instructions */
-vld1.16 {d4, d5}, [r0, :64]!
-vld1.16 {d8, d9}, [r2, :128]!
+vld1.16 {d16, d17}, [r0, :64]!
+vld1.16 {d20, d21}, [r2, :128]!
 
-vmull.s16   q0, d4, d8
-vld1.16 {d6,  d7}, [r0, :64]!
-vmull.s16   q1, d5, d9
-vld1.16 {d10, d11}, [r2, :128]!
+vmull.s16   q0, d16, d20
+vld1.16 {d18, d19}, [r0, :64]!
+vmull.s16   q1, d17, d21
+vld1.16 {d22, d23}, [r2, :128]!
 
-vmlal.s16   q0, d6, d10
-vld1.16 {d4, d5}, [r0, :64]!
-vmlal.s16   q1, d7, d11
-vld1.16 {d8, d9}, [r2, :128]!
+vmlal.s16   q0, d18, d22
+vld1.16 {d16, d17}, [r0, :64]!
+vmlal.s16   q1, d19, d23
+vld1.16 {d20, d21}, [r2, :128]!
 
-vmlal.s16   q0, d4, d8
-vld1.16 {d6,  d7}, [r0, :64]!
-vmlal.s16   q1, d5, d9
-vld1.16 {d10, d11}, [r2, :128]!
+vmlal.s16   q0, d16, d20
+vld1.16 {d18, d19}, [r0, :64]!
+vmlal.s16   q1, d17, d21
+vld1.16 {d22, d23}, [r2, :128]!
 
-vmlal.s16   q0, d6, d10
-vld1.16 {d4, d5}, [r0, :64]!
-vmlal.s16   q1, d7, d11
-vld1.16 {d8, d9}, [r2, :128]!
+vmlal.s16   q0, d18, d22
+vld1.16 {d16, d17}, [r0, :64]!
+vmlal.s16   q1, d19, d23
+vld1.16 {d20, d21}, [r2, :128]!
 
-vmlal.s16   q0, d4, d8
-vmlal.s16   q1, d5, d9
+vmlal.s16   q0, d16, d20
+vmlal.s16   q1, d17, d21
 
 vpadd.s32   d0, d0, d1
 vpadd.s32   d1, d2, d3
 
 vrshrn.s32  d0, q0, SBC_PROTO_FIXED_SCALE
 
-vld1.16 {d2, d3, d4, d5}, [r2, :128]!
+vld1.16 {d16, d17, d18, d19}, [r2, :128]!
 
 vdup.i32    d1, d0[1]  /* TODO: can be eliminated */
 vdup.i32    d0, d0[0]  /* TODO: can be eliminated */
 
-vmull.s16   q3, d2, d0
-vmull.s16   q4, d3, d0
-vmlal.s16   q3, d4, d1
-vmlal.s16   q4, d5, d1
+vmull.s16   q10, d16, d0
+vmull.s16   q11, d17, d0
+vmlal.s16   q10, d18, d1
+vmlal.s16   q11, d19, d1
 
-vpadd.s32   d0, d6, d7 /* TODO: can be eliminated */
-vpadd.s32   d1, d8, d9 /* TODO: can be eliminated */
+vpadd.s32   d0, d20, d21 /* TODO: can be eliminated */
+vpadd.s32   d1, d22, d23 /* TODO: can be eliminated */
 
 vst1.32 {d0, d1}, [r1, :128]
 
@@ -91,57 +91,57 @@ function ff_sbc_analyze_8_neon, export=1
 /* TODO: merge even and odd cases (or even merge all four calls to this
  * function) in order to have only aligned reads from 'in' array
  * and reduce number of load instructions */
-vld1.16 {d4, d5}, [r0, :64]!
-vld1.16 {d8, d9}, [r2, :128]!
-
-vmull.s16   q6, d4, d8
-vld1.16 {d6,  d7}, [r0, :64]!
-vmull.s16   q7, d5, d9
-vld1.16