Re: [FFmpeg-devel] [PATCH] aarch64: Implement stack spilling in a consistent way.

2022-10-11 Thread Reimar Döffinger

Hi Martin,

> On 10 Oct 2022, at 23:29, Martin Storsjö  wrote:
> 
> On Sun, 9 Oct 2022, reimar.doeffin...@gmx.de wrote:
> 
>> From: Reimar Döffinger 
>> 
>> Currently it is done in several different ways, which
>> might cause needless dependencies or, in the case of
>> tx_float_neon.S, is incorrect.
> 
> This looks reasonable to me, assuming that it passes fate. Do you want to 
> push it yourself, or do you want me to do it?

Thanks, I pushed it.
I had only run checkasm on it (I had made a couple of mistakes first), but it 
also passes fate.

___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".


Re: [FFmpeg-devel] [PATCH] aarch64: Implement stack spilling in a consistent way.

2022-10-10 Thread Martin Storsjö

On Sun, 9 Oct 2022, reimar.doeffin...@gmx.de wrote:


From: Reimar Döffinger 

Currently it is done in several different ways, which
might cause needless dependencies or, in the case of
tx_float_neon.S, is incorrect.

Signed-off-by: Reimar Döffinger 
---
libavcodec/aarch64/fft_neon.S  |  3 +-
libavcodec/aarch64/h264idct_neon.S |  6 +-
libavcodec/aarch64/hevcdsp_sao_neon.S  |  3 +-
libavcodec/aarch64/mdct_neon.S | 18 ++
libavcodec/aarch64/me_cmp_neon.S   |  6 +-
libavcodec/aarch64/synth_filter_neon.S |  3 +-
libavcodec/aarch64/vp9itxfm_neon.S | 28 -
libavcodec/aarch64/vp9lpf_16bpp_neon.S | 32 +--
libavcodec/aarch64/vp9lpf_neon.S   | 80 +-
libavutil/aarch64/tx_float_neon.S  | 52 -
10 files changed, 109 insertions(+), 122 deletions(-)


This looks reasonable to me, assuming that it passes fate. Do you want to 
push it yourself, or do you want me to do it?


// Martin
___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".


[FFmpeg-devel] [PATCH] aarch64: Implement stack spilling in a consistent way.

2022-10-09 Thread Reimar . Doeffinger
From: Reimar Döffinger 

Currently it is done in several different ways, which
might cause needless dependencies or, in the case of
tx_float_neon.S, is incorrect.

Signed-off-by: Reimar Döffinger 
---
 libavcodec/aarch64/fft_neon.S  |  3 +-
 libavcodec/aarch64/h264idct_neon.S |  6 +-
 libavcodec/aarch64/hevcdsp_sao_neon.S  |  3 +-
 libavcodec/aarch64/mdct_neon.S | 18 ++
 libavcodec/aarch64/me_cmp_neon.S   |  6 +-
 libavcodec/aarch64/synth_filter_neon.S |  3 +-
 libavcodec/aarch64/vp9itxfm_neon.S | 28 -
 libavcodec/aarch64/vp9lpf_16bpp_neon.S | 32 +--
 libavcodec/aarch64/vp9lpf_neon.S   | 80 +-
 libavutil/aarch64/tx_float_neon.S  | 52 -
 10 files changed, 109 insertions(+), 122 deletions(-)

diff --git a/libavcodec/aarch64/fft_neon.S b/libavcodec/aarch64/fft_neon.S
index 9ff3f9c526..d7225511dd 100644
--- a/libavcodec/aarch64/fft_neon.S
+++ b/libavcodec/aarch64/fft_neon.S
@@ -342,8 +342,7 @@ endfunc
 function fft\n\()_neon, align=6
 AARCH64_VALID_JUMP_TARGET
 AARCH64_SIGN_LINK_REGISTER
-sub sp,  sp,  #16
-stp x28, x30, [sp]
+stp x28, x30, [sp, #-16]!
 add x28, x0,  #\n4*2*8
 bl  fft\n2\()_neon
 mov x0,  x28
diff --git a/libavcodec/aarch64/h264idct_neon.S b/libavcodec/aarch64/h264idct_neon.S
index 7d2879b0ce..375da31d65 100644
--- a/libavcodec/aarch64/h264idct_neon.S
+++ b/libavcodec/aarch64/h264idct_neon.S
@@ -157,8 +157,7 @@ function ff_h264_idct_add16intra_neon, export=1
 endfunc
 
 function ff_h264_idct_add8_neon, export=1
-sub sp,  sp, #0x40
-stp x19, x20, [sp]
+stp x19, x20, [sp, #-0x40]!
 mov x12, x30
 ldp x6,  x15, [x0]  // dest[0], dest[1]
 add x5,  x1,  #16*4 // block_offset
@@ -187,8 +186,7 @@ function ff_h264_idct_add8_neon, export=1
 csel x6,  x15, x6,  eq
 cmp x10, #20
 b.lt 1b
-ldp x19, x20, [sp]
-add sp,  sp,  #0x40
+ldp x19, x20, [sp], #0x40
 ret x12
 endfunc
 
diff --git a/libavcodec/aarch64/hevcdsp_sao_neon.S b/libavcodec/aarch64/hevcdsp_sao_neon.S
index d4decfde3b..30e83dda5d 100644
--- a/libavcodec/aarch64/hevcdsp_sao_neon.S
+++ b/libavcodec/aarch64/hevcdsp_sao_neon.S
@@ -33,8 +33,7 @@
 //  int16_t *sao_offset_val, int sao_left_class,
 //  int width, int height)
 function ff_hevc_sao_band_filter_8x8_8_neon, export=1
-sub sp,  sp, #64
-stp xzr, xzr, [sp]
+stp xzr, xzr, [sp, #-64]!
 stp xzr, xzr, [sp, #16]
 stp xzr, xzr, [sp, #32]
 stp xzr, xzr, [sp, #48]
diff --git a/libavcodec/aarch64/mdct_neon.S b/libavcodec/aarch64/mdct_neon.S
index 6091e72022..98b09bf1ab 100644
--- a/libavcodec/aarch64/mdct_neon.S
+++ b/libavcodec/aarch64/mdct_neon.S
@@ -23,8 +23,7 @@
 #include "libavutil/aarch64/asm.S"
 
 function ff_imdct_half_neon, export=1
-sub sp,  sp,  #32
-stp x19, x20, [sp]
+stp x19, x20, [sp, #-32]!
 AARCH64_SIGN_LINK_REGISTER
 str x30, [sp, #16]
 mov x12, #1
@@ -120,17 +119,15 @@ function ff_imdct_half_neon, export=1
 st2 {v4.2s,v5.2s},  [x0]
 st2 {v6.2s,v7.2s},  [x8]
 
-ldp x19, x20, [sp]
 ldr x30, [sp, #16]
 AARCH64_VALIDATE_LINK_REGISTER
-add sp,  sp,  #32
+ldp x19, x20, [sp], #32
 
 ret
 endfunc
 
 function ff_imdct_calc_neon, export=1
-sub sp,  sp,  #32
-stp x19, x20, [sp]
+stp x19, x20, [sp, #-32]!
 AARCH64_SIGN_LINK_REGISTER
 str x30, [sp, #16]
 ldr w3,  [x0, #28]  // mdct_bits
@@ -163,18 +160,16 @@ function ff_imdct_calc_neon, export=1
 subs x19, x19,  #16
 b.gt 1b
 
-ldp x19, x20, [sp]
 ldr x30, [sp, #16]
 AARCH64_VALIDATE_LINK_REGISTER
-add sp,  sp,  #32
+ldp x19, x20, [sp], #32
 
 ret
 endfunc
 
 
 function ff_mdct_calc_neon, export=1
-sub sp,  sp,  #32
-stp x19, x20, [sp]
+stp x19, x20, [sp, #-32]!
 AARCH64_SIGN_LINK_REGISTER
 str x30, [sp, #16]
 
@@ -323,10 +318,9 @@ function ff_mdct_calc_neon, export=1
 st2 {v4.2s,v5.2s},  [x0]
 st2 {v6.2s,v7.2s},  [x8]
 
-ldp x19, x20, [sp]
 ldr