"Ronald S. Bultje" <[email protected]> writes:
> From: "Ronald S. Bultje" <[email protected]>
>
> From 52.503s (~40fps) to 27.973sec (~80fps) decoding of 480p sintel
> trailer, i.e. a ~2x speedup overall, on a Nexus S.
[...]
> --- a/libavcodec/arm/asm.S
> +++ b/libavcodec/arm/asm.S
> @@ -97,6 +97,12 @@ T add \rn, \rn, \rm
> T ldr \rt, [\rn]
> .endm
>
> +.macro ldr_dpren rt, rn, rm:vararg
> +A ldr \rt, [\rn, -\rm]
> +T sub \rt, \rn, \rm
> +T ldr \rt, [\rt]
> +.endm
> +
> .macro ldr_post rt, rn, rm:vararg
> A ldr \rt, [\rn], \rm
> T ldr \rt, [\rn]
> @@ -133,6 +139,12 @@ T ldrh \rt, [\rn]
> T add \rn, \rn, \rm
> .endm
>
> +.macro ldrb_post rt, rn, rm
> +A ldrb \rt, [\rn], \rm
> +T ldrb \rt, [\rn]
> +T add \rn, \rn, \rm
> +.endm
Those macro names are very badly chosen.
> +@ void vp8_luma_dc_wht(DCTELEM block[4][4][16], DCTELEM dc[16])
> +function ff_vp8_luma_dc_wht_armv6, export=1
> + push {r4 - r10, lr}
Please use whitespace consistent with other files, ignoring old files
using random whitespace.
> + @ load dc[] and zero memory
> + mov r12, #0
> + ldr r2, [r1] @ dc0[0,1]
> + ldr r3, [r1, #4] @ dc0[2,3]
> + ldr r4, [r1, #8] @ dc1[0,1]
> + ldr r5, [r1, #12] @ dc1[2,3]
> + ldr r6, [r1, #16] @ dc2[0,1]
> + ldr r7, [r1, #20] @ dc2[2,3]
> + ldr r8, [r1, #24] @ dc3[0,1]
> + ldr r9, [r1, #28] @ dc3[2,3]
> + str r12,[r1]
> + str r12,[r1, #4]
> + str r12,[r1, #8]
> + str r12,[r1, #12]
> + str r12,[r1, #16]
> + str r12,[r1, #20]
> + str r12,[r1, #24]
> + str r12,[r1, #28]
> +
> + @ loop1
> + uadd16 r12, r2, r8 @ t0[0,1]
> + uadd16 r14, r3, r9 @ t0[2,3]
> + usub16 r2, r2, r8 @ t3[0,1]
> + usub16 r3, r3, r9 @ t3[2,3]
> + uadd16 r8, r4, r6 @ t1[0,1]
> + uadd16 r9, r5, r7 @ t1[2,3]
> + usub16 r4, r4, r6 @ t2[0,1]
> + usub16 r5, r5, r7 @ t2[2,3]
> +
> + uadd16 r6, r12, r8 @ dc0[0,1]
> + uadd16 r7, r14, r9 @ dc0[2,3]
> + usub16 r12, r12, r8 @ dc2[0,1]
> + usub16 r14, r14, r9 @ dc2[2,3]
> + uadd16 r8, r2, r4 @ dc1[0,1]
> + uadd16 r9, r3, r5 @ dc1[2,3]
> + usub16 r2, r2, r4 @ dc3[0,1]
> + usub16 r3, r3, r5 @ dc3[2,3]
Here you have not even tried to schedule instructions anything
resembling optimally. Please read the relevant manuals an try again.
Same throughout file.
> + mov r1, #3
> + orr r1, r1, #0x30000 @ 3 | 3 (round)
Interleaving this with something else might save a cycle depending on
how it happens to issue.
> + @ "transpose"
Why the scarequotes?
> + pkhbt r4, r6, r8, lsl #16 @ dc{0,1}[0]
> + pkhtb r6, r8, r6, asr #16 @ dc{0,1}[1]
> + pkhbt r5, r12, r2, lsl #16 @ dc{2,3}[0]
> + pkhtb r12, r2, r12, asr #16 @ dc{2,3}[1]
> + pkhbt r8, r7, r9, lsl #16 @ dc{0,1}[2]
> + uadd16 r4, r4, r1
> + uadd16 r5, r5, r1
> + pkhtb r7, r9, r7, asr #16 @ dc{0,1}[3]
> + pkhbt r2, r14, r3, lsl #16 @ dc{2,3}[2]
> + pkhtb r14, r3, r14, asr #16 @ dc{2,3}[3]
> +
> + @ loop2
Loop? I don't see any loops.
> + uadd16 r9, r4, r7 @ t0[0,1]
> + uadd16 r3, r5, r14 @ t0[2,3]
> + usub16 r4, r4, r7 @ t3[0,1]
> + usub16 r5, r5, r14 @ t3[2,3]
> + uadd16 r7, r6, r8 @ t1[0,1]
> + uadd16 r14, r12, r2 @ t1[2,3]
> + usub16 r6, r6, r8 @ t2[0,1]
> + usub16 r12, r12, r2 @ t2[2,3]
> +
> + uadd16 r8, r9, r7 @ block[0,1][0]
> + uadd16 r2, r3, r14 @ block[2,3][0]
> + usub16 r9, r9, r7 @ block[0,1][2]
> + usub16 r3, r3, r14 @ block[2,3][2]
> + uadd16 r7, r4, r6 @ block[0,1][1]
> + uadd16 r14, r5, r12 @ block[2,3][1]
> + usub16 r4, r4, r6 @ block[0,1][3]
> + usub16 r5, r5, r12 @ block[2,3][3]
> +
> + @ store
> + mov r6, r8, asr #19 @ block[1][0]
> + mov r12, r7, asr #19 @ block[1][1]
> + mov r1, r9, asr #19 @ block[1][2]
> + mov r10, r4, asr #19 @ block[1][3]
Please use standard UAL mnemonics.
> + sxth r8, r8
> + sxth r7, r7
> + sxth r9, r9
> + sxth r4, r4
> + asr r8, #3 @ block[0][0]
> + asr r7, #3 @ block[0][1]
> + asr r9, #3 @ block[0][2]
> + asr r4, #3 @ block[0][3]
Store? I don't see any stores.
[...]
> +@ void vp8_idct_add(uint8_t *dst, DCTELEM block[16], int stride)
> +function ff_vp8_idct_add_armv6, export=1
> + push {r4 - r11, lr}
> + sub sp, sp, #32
> +
> + mov r3, #0x00004E00 @ cos
> + orr r3, r3, #0x0000007B @ cospi8sqrt2minus1 = 20091
> + mov r4, #0x00008A00 @ sin
> + orr r4, r4, #0x0000008C @ sinpi8sqrt2 = 35468
Do you enjoy typing and reading zeros?
> + mov r5, #0x2 @ i=2
> +1:
> + ldr r6, [r1, #8] @ i5 | i4 = block1[1] |
> block1[0]
> + ldr r12,[r1, #24] @ i13 | i12 = block3[1] |
> block3[0]
> + ldr r14,[r1, #16] @ i9 | i8 = block2[1] |
> block2[0]
> +
> + smulwt r9, r3, r6 @ (ip[5] *
> cospi8sqrt2minus1) >> 16
> + smulwb r7, r3, r6 @ (ip[4] *
> cospi8sqrt2minus1) >> 16
> + smulwt r10, r4, r6 @ (ip[5] * sinpi8sqrt2) >> 16
> + smulwb r8, r4, r6 @ (ip[4] * sinpi8sqrt2) >> 16
> + pkhbt r7, r7, r9, lsl #16 @ 5c | 4c
> + smulwt r11, r3, r12 @ (ip[13] *
> cospi8sqrt2minus1) >> 16
> + pkhbt r8, r8, r10, lsl #16 @ 5s | 4s = t2 first
> half
> + uadd16 r6, r6, r7 @ 5c+5 | 4c+4 = t3 first
> half
> + smulwt r7, r4, r12 @ (ip[13] * sinpi8sqrt2) >>
> 16
> + smulwb r9, r3, r12 @ (ip[12] *
> cospi8sqrt2minus1) >> 16
> + smulwb r10, r4, r12 @ (ip[12] * sinpi8sqrt2) >>
> 16
A bit less space before the comments would make these lines fit in 80 columns.
> + subs r5, r5, #1 @ i--
Oh, really?
> +@ void vp8_v_loop_filter16_simple(uint8_t *dst, int stride, int flim)
> +function ff_vp8_v_loop_filter16_simple_armv6, export=1
> + push {r4 - r11, lr}
> +
> + ldr_dpren r3, r0, r1, lsl #1 @ p1
> + ldr_dpren r4, r0, r1 @ p0
> + ldr r5, [r0] @ q0
> + ldr r6, [r0, r1] @ q1
> + orr r2, r2, r2, lsl #16
> + mov r9, #4 @ count
> + mov lr, #0 @ need 0 in a couple places
> + orr r12, r2, r2, lsl #8 @ splat int -> byte
> + ldr r2, c0x80808080
[...]
> +c0x01010101: .long 0x01010101
> +c0x03030303: .long 0x03030303
> +c0x04040404: .long 0x04040404
> +c0x7F7F7F7F: .long 0x7F7F7F7F
> +c0x80808080: .long 0x80808080
Please learn about the ldr reg, =foo syntax and the mov32 macro.
> +@ void vp8_v_loop_filter16_inner(uint8_t *dst, int stride,
> +@ int fE, int fI, int hev_thresh)
> +@ and
> +@ void vp8_v_loop_filter8uv_inner(uint8_t *dstU, uint8_t *dstV, int stride,
> +@ int fE, int fI, int hev_thresh)
> +@ call:
> +@ void vp8_v_loop_filter_inner(uint8_t *dst, int stride,
> +@ int fE, int fI, int hev_thresh, int count)
> +function ff_vp8_v_loop_filter_inner_armv6, export=1
> + push {r4 - r11, lr}
> +
> + sub r0, r0, r1, lsl #2 @ move r0 pointer down by 4
> lines
That's not the direction I usually call down.
> +@ note: worst case sum of all 6-tap filter values * 255 is 0x7f80 so 16 bit
> +@ arithmatic can be used to apply filters
> +const sixtap_filters_13245600, align=4
> + .short 2, 108, -11, 36, -8, 1, 0, 0
> + .short 3, 77, -16, 77, -16, 3, 0, 0
> + .short 1, 36, -8, 108, -11, 2, 0, 0
> +endconst
> +const fourtap_filters_1324, align=4
> + .short -6, 12, 123, -1
> + .short -9, 50, 93, -6
> + .short -6, 93, 50, -9
> + .short -1, 123, 12, -6
> +endconst
A blank line between those blocks wouldn't hurt.
[...]
> + ldr r8, [lr, #4] @ stall - if only I had more
> registers...
[...]
> + ldr r8, [lr, #8] @ stall - if only I had more
> registers...
[...]
> + ldr r10,[lr] @ stall - if only I had more
> registers...
Please leave whining out of the code.
> + subs r4, r4, #1 @ counter--
Please keep comments somewhat informative or omit them entirely.
> + add sp, sp, #4 @ restore stack after
> push{r1} above
Ditto.
> + ldr r9, [r2, #3] @ load source data
Ditto.
> + subs r4, r4, #1 @ counter--
Ditto.
[...]
> +extern void put_vp8_epel4_v6_c(uint8_t *dst, int d, uint8_t *src, int s, int
> h, int mx, int my);
> +#undef printf
WTF?
> +#define VP8_EPEL_H_OR_V(SIZE, NAME, HV) \
> +static void ff_put_vp8_##NAME##SIZE##_##HV##_armv6( \
> + uint8_t *dst, int dststride, uint8_t
> *src, \
> + int srcstride, int h, int mx, int
> my) \
> +{ \
> + ff_put_vp8_## NAME ## _ ## HV ## _armv6(dst, dststride, src, srcstride, \
> + SIZE, h, mx | my); \
> +}
> +
> +VP8_EPEL_H_OR_V(4, epel, h6);
> +VP8_EPEL_H_OR_V(4, epel, h4);
> +VP8_EPEL_H_OR_V(4, epel, v6);
> +VP8_EPEL_H_OR_V(4, epel, v4);
> +VP8_EPEL_H_OR_V(4, bilin, v);
> +VP8_EPEL_H_OR_V(4, bilin, h);
> +VP8_EPEL_H_OR_V(8, epel, h6);
> +VP8_EPEL_H_OR_V(8, epel, h4);
> +VP8_EPEL_H_OR_V(8, epel, v6);
> +VP8_EPEL_H_OR_V(8, epel, v4);
> +VP8_EPEL_H_OR_V(8, bilin, v);
> +VP8_EPEL_H_OR_V(8, bilin, h);
> +VP8_EPEL_H_OR_V(16, epel, h6);
> +VP8_EPEL_H_OR_V(16, epel, v6);
> +VP8_EPEL_H_OR_V(16, bilin, v);
> +VP8_EPEL_H_OR_V(16, bilin, h);
No C wrappers please. Compilers suck.
> av_cold void ff_vp8dsp_init_arm(VP8DSPContext *dsp)
> {
> +#define set_func_ptrs(opt) \
> + dsp->vp8_luma_dc_wht = ff_vp8_luma_dc_wht_##opt; \
> + dsp->vp8_luma_dc_wht_dc = ff_vp8_luma_dc_wht_dc_armv6; \
> + \
> + dsp->vp8_idct_add = ff_vp8_idct_add_##opt; \
> + dsp->vp8_idct_dc_add = ff_vp8_idct_dc_add_##opt; \
> + dsp->vp8_idct_dc_add4y = ff_vp8_idct_dc_add4y_##opt; \
> + dsp->vp8_idct_dc_add4uv = ff_vp8_idct_dc_add4uv_##opt; \
> + \
Ugly formatting.
[...]
> + set_func_ptrs(neon);
> + } else if (HAVE_ARMV6) {
> + set_func_ptrs(armv6);
> }
> }
I did not read the code in depth. Might do if next round looks more
promising.
--
Måns Rullgård
[email protected]
_______________________________________________
libav-devel mailing list
[email protected]
https://lists.libav.org/mailman/listinfo/libav-devel