For the wd=4 case, where each filter reads 8 pixels across the edge but
only changes 4 of them, we can fit all 16 input pixels along the edge
into a q register, and still have enough temporary registers for doing
the loop filter.
The wd=8 filters would require too many temporary registers for
processing all 16 pixels at once, though.
Before:                             Cortex A7      A8      A9     A53
vp9_loop_filter_mix2_v_44_16_neon:     289.7   256.2   237.5   181.2
After:
vp9_loop_filter_mix2_v_44_16_neon:     221.2   150.5   177.7   138.0
---
libavcodec/arm/vp9dsp_init_arm.c | 7 +-
libavcodec/arm/vp9lpf_neon.S | 191 +++++++++++++++++++++++++++++++++++++++
2 files changed, 195 insertions(+), 3 deletions(-)
diff --git a/libavcodec/arm/vp9dsp_init_arm.c b/libavcodec/arm/vp9dsp_init_arm.c
index e99d931..1ede170 100644
--- a/libavcodec/arm/vp9dsp_init_arm.c
+++ b/libavcodec/arm/vp9dsp_init_arm.c
@@ -194,6 +194,8 @@ define_loop_filters(8, 8);
define_loop_filters(16, 8);
define_loop_filters(16, 16);
+define_loop_filters(44, 16);
+
#define lf_mix_fn(dir, wd1, wd2, stridea)
\
static void loop_filter_##dir##_##wd1##wd2##_16_neon(uint8_t *dst,
\
ptrdiff_t stride,
\
@@ -207,7 +209,6 @@ static void
loop_filter_##dir##_##wd1##wd2##_16_neon(uint8_t *dst,
lf_mix_fn(h, wd1, wd2, stride) \
lf_mix_fn(v, wd1, wd2, sizeof(uint8_t))
-lf_mix_fns(4, 4)
lf_mix_fns(4, 8)
lf_mix_fns(8, 4)
lf_mix_fns(8, 8)
@@ -227,8 +228,8 @@ static av_cold void
vp9dsp_loopfilter_init_arm(VP9DSPContext *dsp)
dsp->loop_filter_16[0] = ff_vp9_loop_filter_h_16_16_neon;
dsp->loop_filter_16[1] = ff_vp9_loop_filter_v_16_16_neon;
- dsp->loop_filter_mix2[0][0][0] = loop_filter_h_44_16_neon;
- dsp->loop_filter_mix2[0][0][1] = loop_filter_v_44_16_neon;
+ dsp->loop_filter_mix2[0][0][0] = ff_vp9_loop_filter_h_44_16_neon;
+ dsp->loop_filter_mix2[0][0][1] = ff_vp9_loop_filter_v_44_16_neon;
dsp->loop_filter_mix2[0][1][0] = loop_filter_h_48_16_neon;
dsp->loop_filter_mix2[0][1][1] = loop_filter_v_48_16_neon;
dsp->loop_filter_mix2[1][0][0] = loop_filter_h_84_16_neon;
diff --git a/libavcodec/arm/vp9lpf_neon.S b/libavcodec/arm/vp9lpf_neon.S
index e31c807..12984a9 100644
--- a/libavcodec/arm/vp9lpf_neon.S
+++ b/libavcodec/arm/vp9lpf_neon.S
@@ -44,6 +44,109 @@
vtrn.8 \r2, \r3
.endm
+@ The input to and output from this macro is in the registers q8-q15,
+@ and q0-q7 are used as scratch registers.
+@ p3 = q8, p0 = q11, q0 = q12, q3 = q15
+.macro loop_filter_q
+ vdup.u8 d0, r2 @ E
+ lsr r2, r2, #8
+ vdup.u8 d2, r3 @ I
+ lsr r3, r3, #8
+ vdup.u8 d1, r2 @ E
+ vdup.u8 d3, r3 @ I