On Mon, 14 Nov 2016, Janne Grunau wrote:
Since aarch64 has enough free general purpose registers, use them to
branch to the appropriate storage code. 1-2 cycles faster for the
functions using loop_filter 8/16, ... on a cortex-a53. Mixed results
(up to 2 cycles faster/slower) on a cortex-a57.
---
libavcodec/aarch64/vp9lpf_neon.S | 48 +++++++++++++++-------------------------
1 file changed, 18 insertions(+), 30 deletions(-)
diff --git a/libavcodec/aarch64/vp9lpf_neon.S b/libavcodec/aarch64/vp9lpf_neon.S
index 995a97d..3a82bd4 100644
--- a/libavcodec/aarch64/vp9lpf_neon.S
+++ b/libavcodec/aarch64/vp9lpf_neon.S
@@ -410,15 +410,19 @@
.endif
// If no pixels needed flat8in nor flat8out, jump to a
// writeout of the inner 4 pixels
- cbz x5, 7f
+ cbnz x5, 1f
+ br x14
+1:
mov x5, v7.d[0]
.ifc \sz, .16b
mov x6, v2.d[1]
orr x5, x5, x6
.endif
// If no pixels need flat8out, jump to a writeout of the inner 6 pixels
- cbz x5, 8f
+ cbnz x5, 1f
+ br x15
+1:
// flat8out
// This writes all outputs into v2-v17 (skipping v6 and v16).
// If this part is skipped, the output is read from v21-v26 (which is the input
@@ -549,35 +553,24 @@ endfunc
function vp9_loop_filter_8
loop_filter 8, .8b, 0, v16, v17, v18, v19, v28, v29, v30, v31
- mov x5, #0
ret
6:
- mov x5, #6
- ret
+ br x13
9:
br x10
endfunc
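To make sure I'm reading the new control flow right: the shared macro no
longer returns a selector in x5 for the caller to compare against;
instead the caller is expected to preload the partial-writeout addresses
into spare GPRs (x13 here, plus x14/x15 in the wd=16 path, with x10 for
the early exit as before), and the shared code branches to them directly
with br. A rough sketch of what I assume a vp9_loop_filter_8 caller ends
up looking like (the adr setup and the x16 scratch register below are my
guesses, not part of the quoted hunks):

        // hypothetical caller-side sketch, not taken from the patch
        mov             x16, x30        // keep our own return address; x30 is clobbered by the bl
        adr             x13, 6f         // flat8in skipped: write only the inner 4 pixels
        adr             x10, 9f         // early exit, same register as before this patch
        bl              vp9_loop_filter_8
        // full writeout of all filtered pixels falls through to here
        ...
        ret             x16
6:
        // writeout of the inner 4 pixels only
        ...
        ret             x16
9:
        ret             x16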
Looks really neat, thanks!
Couldn't you get rid of the 6: label here as well, with something like
this?
@@ -352,7 +352,13 @@
.endif
// If no pixels need flat8in, jump to flat8out
// (or to a writeout of the inner 4 pixels, for wd=8)
+.if \wd == 16
cbz x5, 6f
+.else
+ cbnz x5, 6f
+ br x13
+6:
+.endif
And similarly for the 9: label for all cases except \wd == 16 (where we
need it for the clobbered registers).
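If the 9: path gets the same treatment for wd != 16 (a br x10 straight
from inside the macro), I'd expect vp9_loop_filter_8 to shrink to nothing
but the macro invocation, roughly:

function vp9_loop_filter_8
        // sketch only: assumes both the 6: and 9: exits move into the macro
        loop_filter     8, .8b, 0, v16, v17, v18, v19, v28, v29, v30, v31
        ret
endfunc

(vp9_loop_filter_16 still needs its 9: for the register restore.)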
// Martin