On 2011-12-10 15:43:55 +0000, Mans Rullgard wrote:
> Signed-off-by: Mans Rullgard <[email protected]>
> ---
> libavcodec/arm/asm.S | 6 +++
> libavcodec/arm/rv40dsp_init_neon.c | 10 ++++
> libavcodec/arm/rv40dsp_neon.S | 84
> ++++++++++++++++++++++++++++++++++++
> 3 files changed, 100 insertions(+), 0 deletions(-)
>
> diff --git a/libavcodec/arm/asm.S b/libavcodec/arm/asm.S
> index a124918..d9f9051 100644
> --- a/libavcodec/arm/asm.S
> +++ b/libavcodec/arm/asm.S
> @@ -113,6 +113,12 @@ T add \rn, \rn, \rm
> T ldr \rt, [\rn]
> .endm
>
> +.macro ldr_dpre rt, rn, rm:vararg
> +A ldr \rt, [\rn, -\rm]!
> +T add \rn, \rn, \rm
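Shouldn't this be `sub`? The ARM variant pre-decrements \rn by \rm (`[\rn, -\rm]!`), so the Thumb variant needs to subtract \rm here rather than add it.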
> +T ldr \rt, [\rn]
> +.endm
> +
> .macro ldr_post rt, rn, rm:vararg
> A ldr \rt, [\rn], \rm
> T ldr \rt, [\rn]
> diff --git a/libavcodec/arm/rv40dsp_init_neon.c
> b/libavcodec/arm/rv40dsp_init_neon.c
> index 36d75e6..59dddb6 100644
> --- a/libavcodec/arm/rv40dsp_init_neon.c
> +++ b/libavcodec/arm/rv40dsp_init_neon.c
> @@ -54,6 +54,13 @@ void ff_avg_rv40_chroma_mc4_neon(uint8_t *, uint8_t *,
> int, int, int, int);
> void ff_rv40_weight_func_16_neon(uint8_t *, uint8_t *, uint8_t *, int, int,
> int);
> void ff_rv40_weight_func_8_neon(uint8_t *, uint8_t *, uint8_t *, int, int,
> int);
>
> +int ff_rv40_h_loop_filter_strength_neon(uint8_t *src, int stride,
> + int beta, int beta2, int edge,
> + int *p1, int *q1);
> +int ff_rv40_v_loop_filter_strength_neon(uint8_t *src, int stride,
> + int beta, int beta2, int edge,
> + int *p1, int *q1);
> +
> void ff_rv40dsp_init_neon(RV34DSPContext *c, DSPContext* dsp)
> {
> c->put_pixels_tab[0][ 1] = ff_put_rv40_qpel16_mc10_neon;
> @@ -116,4 +123,7 @@ void ff_rv40dsp_init_neon(RV34DSPContext *c, DSPContext*
> dsp)
>
> c->rv40_weight_pixels_tab[0] = ff_rv40_weight_func_16_neon;
> c->rv40_weight_pixels_tab[1] = ff_rv40_weight_func_8_neon;
> +
> + c->rv40_loop_filter_strength[0] = ff_rv40_h_loop_filter_strength_neon;
> + c->rv40_loop_filter_strength[1] = ff_rv40_v_loop_filter_strength_neon;
> }
> diff --git a/libavcodec/arm/rv40dsp_neon.S b/libavcodec/arm/rv40dsp_neon.S
> index 07ba842..a06532a 100644
> --- a/libavcodec/arm/rv40dsp_neon.S
> +++ b/libavcodec/arm/rv40dsp_neon.S
> @@ -722,3 +722,87 @@ function ff_rv40_weight_func_8_neon, export=1
> bne 1b
> bx lr
> endfunc
> +
> +function ff_rv40_h_loop_filter_strength_neon, export=1
> + pkhbt r2, r3, r2, lsl #18
> +
> + ldr r3, [r0]
> + ldr_dpre r12, r0, r1
> + teq r3, r12
> + beq 1f
> +
> + sub r0, r0, r1, lsl #1
> +
> + vld1.32 {d4[]}, [r0,:32], r1 @ -3
> + vld1.32 {d0[]}, [r0,:32], r1 @ -2
> + vld1.32 {d4[1]}, [r0,:32], r1 @ -1
> + vld1.32 {d5[]}, [r0,:32], r1 @ 0
> + vld1.32 {d1[]}, [r0,:32], r1 @ 1
> + vld1.32 {d5[0]}, [r0,:32], r1 @ 2
> +
> + vpaddl.u8 q8, q0 @ -2, -2, -2, -2, 1, 1, 1, 1
> + vpaddl.u8 q9, q2 @ -1, -1, -3, -3, 0, 0, 2, 2
The comment is wrong; it should be: @ -1, -1, -3, -3, 2, 2, 0, 0
> + vdup.32 d30, r2 @ beta2, beta << 2
> + vpadd.u16 d16, d16, d17 @ -2, -2, 1, 1
> + vpadd.u16 d18, d18, d19 @ -3, -1, 2, 0
> + vabd.u16 d16, d18, d16
> + vclt.u16 d16, d16, d30
> +
> + ldrd r2, r3, [sp, #4]
> + vmovl.u16 q12, d16
> + vtrn.16 d16, d17
> + vshr.u32 q12, q12, #15
> + ldr r0, [sp]
> + vst1.32 {d24[1]}, [r2,:32]
> + vst1.32 {d25[1]}, [r3,:32]
> +
> + cmp r0, #0
> + bxeq lr
> +
> + vand d18, d16, d17
> + vtrn.32 d18, d19
> + vand d18, d18, d19
> + vmov.u16 r0, d18[0]
> + bx lr
> +1:
> + ldrd r2, r3, [sp, #4]
> + mov r0, #0
> + str r0, [r2]
> + str r0, [r3]
> + bx lr
> +endfunc
> +
> +function ff_rv40_v_loop_filter_strength_neon, export=1
> + sub r0, r0, #3
> + pkhbt r2, r3, r2, lsl #18
> +
> + vld1.8 {d0}, [r0], r1
> + vld1.8 {d1}, [r0], r1
> + vld1.8 {d2}, [r0], r1
> + vld1.8 {d3}, [r0], r1
> +
> + vaddl.u8 q0, d0, d1
> + vaddl.u8 q1, d2, d3
> + vdup.32 q15, r2
> + vadd.u16 q0, q0, q1 @ -3, -2, -1, 0, 1, 2
> + vext.16 q1, q0, q0, #1 @ -2, -1, 0, 1, 2
> + vabd.u16 q0, q1, q0
> + vclt.u16 q0, q0, q15
> +
> + ldrd r2, r3, [sp, #4]
> + vmovl.u16 q1, d0
> + vext.16 d1, d0, d1, #3
> + vshr.u32 q1, q1, #15
> + ldr r0, [sp]
> + vst1.32 {d2[1]}, [r2,:32]
> + vst1.32 {d3[1]}, [r3,:32]
> +
> + cmp r0, #0
> + bxeq lr
> +
> + vand d0, d0, d1
> + vtrn.16 d0, d1
> + vand d0, d0, d1
> + vmov.u16 r0, d0[0]
> + bx lr
> +endfunc
OK otherwise.
Janne
_______________________________________________
libav-devel mailing list
[email protected]
https://lists.libav.org/mailman/listinfo/libav-devel