On 2016-11-02 13:47:37 +0200, Martin Storsjö wrote:
...
> ---
> libavcodec/arm/Makefile | 2 +
> libavcodec/arm/vp9dsp_init_arm.c | 140 +++++++
> libavcodec/arm/vp9mc_neon.S | 764
> +++++++++++++++++++++++++++++++++++++++
> libavcodec/vp9.h | 4 +-
> libavcodec/vp9block.c | 10 +-
> libavcodec/vp9dsp.c | 2 +
> 6 files changed, 919 insertions(+), 3 deletions(-)
> create mode 100644 libavcodec/arm/vp9dsp_init_arm.c
> create mode 100644 libavcodec/arm/vp9mc_neon.S
>
> diff --git a/libavcodec/arm/Makefile b/libavcodec/arm/Makefile
> index bd4dd4e..2638230 100644
> --- a/libavcodec/arm/Makefile
> +++ b/libavcodec/arm/Makefile
> @@ -45,6 +45,7 @@ OBJS-$(CONFIG_MLP_DECODER) +=
> arm/mlpdsp_init_arm.o
> OBJS-$(CONFIG_RV40_DECODER) += arm/rv40dsp_init_arm.o
> OBJS-$(CONFIG_VORBIS_DECODER) += arm/vorbisdsp_init_arm.o
> OBJS-$(CONFIG_VP6_DECODER) += arm/vp6dsp_init_arm.o
> +OBJS-$(CONFIG_VP9_DECODER) += arm/vp9dsp_init_arm.o
>
>
> # ARMv5 optimizations
> @@ -138,3 +139,4 @@ NEON-OBJS-$(CONFIG_RV40_DECODER) +=
> arm/rv34dsp_neon.o \
> arm/rv40dsp_neon.o
> NEON-OBJS-$(CONFIG_VORBIS_DECODER) += arm/vorbisdsp_neon.o
> NEON-OBJS-$(CONFIG_VP6_DECODER) += arm/vp6dsp_neon.o
> +NEON-OBJS-$(CONFIG_VP9_DECODER) += arm/vp9mc_neon.o
> diff --git a/libavcodec/arm/vp9mc_neon.S b/libavcodec/arm/vp9mc_neon.S
> new file mode 100644
> index 0000000..0651ec7
> --- /dev/null
> +++ b/libavcodec/arm/vp9mc_neon.S
> @@ -0,0 +1,764 @@
> +
> +@ All public functions in this file have the following signature:
> +@ typedef void (*vp9_mc_func)(uint8_t *dst, ptrdiff_t dst_stride,
> +@ const uint8_t *ref, ptrdiff_t ref_stride,
> +@ int h, int mx, int my);
> +
> +function ff_vp9_copy64_neon, export=1
> + ldr r12, [sp]
> + sub r1, r1, #32
> + sub r3, r3, #32
> +1:
> + vld1.8 {q0, q1}, [r2]!
> + vst1.8 {q0, q1}, [r0, :128]!
> + vld1.8 {q2, q3}, [r2], r3
> + subs r12, r12, #1
> + vst1.8 {q2, q3}, [r0, :128], r1
> + bne 1b
> + bx lr
> +endfunc
> +
> +function ff_vp9_avg64_neon, export=1
> + ldr r12, [sp]
> + push {r4}
If you use lr instead of r4, you can return with pop {pc}.
> +function ff_vp9_copy16_neon, export=1
> + ldr r12, [sp]
> + push {r4-r5}
Same here: push {r4, lr}; pop {r4, pc}.
> +function ff_vp9_copy4_neon, export=1
> + ldr r12, [sp]
> +1:
> + vld1.32 {d0[]}, [r2], r3
> + vld1.32 {d1[]}, [r2], r3
> + vst1.32 {d0[0]}, [r0, :32], r1
> + vld1.32 {d2[]}, [r2], r3
> + vst1.32 {d1[0]}, [r0, :32], r1
> + vld1.32 {d3[]}, [r2], r3
> + subs r12, r12, #4
> + vst1.32 {d2[0]}, [r0, :32], r1
> + vst1.32 {d3[0]}, [r0, :32], r1
> + bne 1b
> + bx lr
> +endfunc
Have you tried using plain ARM registers instead? I would expect them to be
faster for size 4.
> +@ Instantiate a horizontal filter function for the given size.
> +@ This can work on 4, 8 or 16 pixels in parallel; for larger
> +@ widths it will do 16 pixels at a time and loop horizontally.
> +@ The actual width is passed in r5, the height in r4 and
> +@ the filter coefficients in r12. idx2 is the index of the largest
> +@ filter coefficient (3 or 4) and idx1 is the other one of them.
> +.macro do_8tap_h type, size, idx1, idx2
> +function \type\()_8tap_\size\()h_\idx1\idx2
> + sub r2, r2, #3
> + add r6, r0, r1
> + add r7, r2, r3
> + add r1, r1, r1
> + add r3, r3, r3
> + @ Only size >= 16 loops horizontally and needs
> + @ reduced dst stride
> +.if \size >= 16
> + sub r1, r1, r5
> +.endif
> + @ size >= 16 loads two qwords and increments r2,
> + @ for size 4/8 it's enough with one qword and no
> + @ postincrement
> +.if \size >= 16
> + sub r3, r3, r5
> + sub r3, r3, #8
> +.endif
> + @ Load the filter vector
> + vld1.16 {q0}, [r12,:128]
> +1:
> +.if \size >= 16
> + mov r12, r5
> +.endif
> + @ Load src
> +.if \size >= 16
> + vld1.8 {q8}, [r2]!
> + vld1.8 {q11}, [r7]!
> + vld1.8 {d20}, [r2]!
> + vld1.8 {d26}, [r7]!
> +.else
> + vld1.8 {q8}, [r2]
> + vld1.8 {q11}, [r7]
> +.endif
> + vmovl.u8 q9, d17
> + vmovl.u8 q8, d16
> + vmovl.u8 q12, d23
> + vmovl.u8 q11, d22
> +.if \size >= 16
> + vmovl.u8 q10, d20
> + vmovl.u8 q13, d26
> +.endif
.if \size >= 16
vld1.8 {d18, d19, d20}, [r2]!
vld1.8 {d24, d25, d26}, [r7]!
.else
vld1.8 {q9}, [r2]
vld1.8 {q12}, [r7]
.endif
vmovl.u8 q8, d18
vmovl.u8 q9, d19
vmovl.u8 q11, d24
vmovl.u8 q12, d25
This should be marginally faster.
> +@ Instantiate a vertical filter function for filtering a 4 pixels wide
> +@ slice. The first half of the registers contain one row, while the second
> +@ half of a register contains the second-next row (also stored in the first
> +@ half of the register two steps ahead). The convolution does two outputs
> +@ at a time; the output of q5-q12 into one, and q4-q13 into another one.
> +@ The first half of first output is the first output row, the first half
> +@ of the other output is the second output row. The second halves of the
> +@ registers are rows 3 and 4.
> +@ This only is designed to work for 4 or 8 output lines.
> +.macro do_8tap_4v type, idx1, idx2
> +function \type\()_8tap_4v_\idx1\idx2
> + sub r2, r2, r3, lsl #1
> + sub r2, r2, r3
> + vld1.16 {q0}, [r12, :128]
> +
> + vld1.32 {d2[]}, [r2], r3
> + vld1.32 {d3[]}, [r2], r3
> + vld1.32 {d4[]}, [r2], r3
> + vld1.32 {d5[]}, [r2], r3
> + vld1.32 {d6[]}, [r2], r3
> + vld1.32 {d7[]}, [r2], r3
> + vext.8 d2, d2, d4, #4
> + vld1.32 {d8[]}, [r2], r3
> + vext.8 d3, d3, d5, #4
> + vld1.32 {d9[]}, [r2], r3
> + vmovl.u8 q5, d2
> + vext.8 d4, d4, d6, #4
> + vld1.32 {d28[]}, [r2], r3
> + vmovl.u8 q6, d3
> + vext.8 d5, d5, d7, #4
> + vmovl.u8 q7, d4
> + vext.8 d6, d6, d8, #4
> + vld1.32 {d9[1]}, [r2], r3
It probably makes sense to continue the vld1.32 {d[]}, vext.8 pattern;
d30 and d31 should be free. It shouldn't be much slower for the height
== 4 case, and it should help for height == 8.
> + vmovl.u8 q8, d5
> + vext.8 d7, d7, d9, #4
> + vmovl.u8 q9, d6
> + vext.8 d8, d8, d28, #4
> + vld1.32 {d28[1]}, [r2]
> + vmovl.u8 q10, d7
> + vmovl.u8 q11, d8
> + sub r2, r2, r3
> + vmovl.u8 q12, d9
> + vmovl.u8 q13, d28
> +
> + convolve q1, q2, q5, q6, q7, q8, q9, q10, q11, q12,
> q13, \idx1, \idx2, q4, q3
> + do_store4 q1, d2, q2, d4, d3, d5, \type
> + subs r4, r4, #4
> + beq 9f
> +
> + vld1.32 {d2[]}, [r2], r3
> + vld1.32 {d3[]}, [r2], r3
These values are already loaded in d30/d31.
> + vld1.32 {d4[]}, [r2], r3
> + vld1.32 {d5[]}, [r2], r3
> + sub r2, r2, r3, lsl #1
> + vld1.32 {d2[1]}, [r2], r3
> + vld1.32 {d3[1]}, [r2], r3
These values are already loaded in d4/d5.
Otherwise ok
Janne
_______________________________________________
libav-devel mailing list
[email protected]
https://lists.libav.org/mailman/listinfo/libav-devel