Re: [libav-devel] [PATCH] VP8: armv6 optimizations.

Måns Rullgård Thu, 25 Aug 2011 18:33:51 -0700

"Ronald S. Bultje" <[email protected]> writes:

> From: "Ronald S. Bultje" <[email protected]>
>
> From 52.503s (~40fps) to 27.973sec (~80fps) decoding of 480p sintel
> trailer, i.e. a ~2x speedup overall, on a Nexus S.


[...]

> --- a/libavcodec/arm/asm.S
> +++ b/libavcodec/arm/asm.S
> @@ -97,6 +97,12 @@ T       add             \rn, \rn, \rm
>  T       ldr             \rt, [\rn]
>  .endm
>
> +.macro  ldr_dpren       rt,  rn,  rm:vararg
> +A       ldr             \rt, [\rn, -\rm]
> +T       sub             \rt, \rn, \rm
> +T       ldr             \rt, [\rt]
> +.endm
> +
>  .macro  ldr_post        rt,  rn,  rm:vararg
>  A       ldr             \rt, [\rn], \rm
>  T       ldr             \rt, [\rn]
> @@ -133,6 +139,12 @@ T       ldrh            \rt, [\rn]
>  T       add             \rn, \rn, \rm
>  .endm
>
> +.macro  ldrb_post       rt,  rn,  rm
> +A       ldrb            \rt, [\rn], \rm
> +T       ldrb            \rt, [\rn]
> +T       add             \rn, \rn, \rm
> +.endm

Those macro names are very badly chosen.

> +@ void vp8_luma_dc_wht(DCTELEM block[4][4][16], DCTELEM dc[16])
> +function ff_vp8_luma_dc_wht_armv6, export=1
> +        push           {r4 - r10, lr}

Please use whitespace consistent with other files, ignoring old files
using random whitespace.

> +        @ load dc[] and zero memory
> +        mov             r12, #0
> +        ldr             r2, [r1]                @ dc0[0,1]
> +        ldr             r3, [r1,  #4]           @ dc0[2,3]
> +        ldr             r4, [r1,  #8]           @ dc1[0,1]
> +        ldr             r5, [r1,  #12]          @ dc1[2,3]
> +        ldr             r6, [r1,  #16]          @ dc2[0,1]
> +        ldr             r7, [r1,  #20]          @ dc2[2,3]
> +        ldr             r8, [r1,  #24]          @ dc3[0,1]
> +        ldr             r9, [r1,  #28]          @ dc3[2,3]
> +        str             r12,[r1]
> +        str             r12,[r1,  #4]
> +        str             r12,[r1,  #8]
> +        str             r12,[r1,  #12]
> +        str             r12,[r1,  #16]
> +        str             r12,[r1,  #20]
> +        str             r12,[r1,  #24]
> +        str             r12,[r1,  #28]
> +
> +        @ loop1
> +        uadd16          r12, r2,  r8            @ t0[0,1]
> +        uadd16          r14, r3,  r9            @ t0[2,3]
> +        usub16          r2,  r2,  r8            @ t3[0,1]
> +        usub16          r3,  r3,  r9            @ t3[2,3]
> +        uadd16          r8,  r4,  r6            @ t1[0,1]
> +        uadd16          r9,  r5,  r7            @ t1[2,3]
> +        usub16          r4,  r4,  r6            @ t2[0,1]
> +        usub16          r5,  r5,  r7            @ t2[2,3]
> +
> +        uadd16          r6,  r12, r8            @ dc0[0,1]
> +        uadd16          r7,  r14, r9            @ dc0[2,3]
> +        usub16          r12, r12, r8            @ dc2[0,1]
> +        usub16          r14, r14, r9            @ dc2[2,3]
> +        uadd16          r8,  r2,  r4            @ dc1[0,1]
> +        uadd16          r9,  r3,  r5            @ dc1[2,3]
> +        usub16          r2,  r2,  r4            @ dc3[0,1]
> +        usub16          r3,  r3,  r5            @ dc3[2,3]

Here you have not even tried to schedule instructions anything
resembling optimally.  Please read the relevant manuals an try again.
Same throughout file.

> +        mov             r1,  #3
> +        orr             r1,  r1,  #0x30000      @ 3 | 3 (round)

Interleaving this with something else might save a cycle depending on
how it happens to issue.

> +        @ "transpose"

Why the scarequotes?

> +        pkhbt           r4,  r6,  r8,  lsl #16  @ dc{0,1}[0]
> +        pkhtb           r6,  r8,  r6,  asr #16  @ dc{0,1}[1]
> +        pkhbt           r5,  r12, r2,  lsl #16  @ dc{2,3}[0]
> +        pkhtb           r12, r2,  r12, asr #16  @ dc{2,3}[1]
> +        pkhbt           r8,  r7,  r9,  lsl #16  @ dc{0,1}[2]
> +        uadd16          r4,  r4,  r1
> +        uadd16          r5,  r5,  r1
> +        pkhtb           r7,  r9,  r7,  asr #16  @ dc{0,1}[3]
> +        pkhbt           r2,  r14, r3,  lsl #16  @ dc{2,3}[2]
> +        pkhtb           r14, r3,  r14, asr #16  @ dc{2,3}[3]
> +
> +        @ loop2

Loop?  I don't see any loops.

> +        uadd16          r9,  r4,  r7            @ t0[0,1]
> +        uadd16          r3,  r5,  r14           @ t0[2,3]
> +        usub16          r4,  r4,  r7            @ t3[0,1]
> +        usub16          r5,  r5,  r14           @ t3[2,3]
> +        uadd16          r7,  r6,  r8            @ t1[0,1]
> +        uadd16          r14, r12, r2            @ t1[2,3]
> +        usub16          r6,  r6,  r8            @ t2[0,1]
> +        usub16          r12, r12, r2            @ t2[2,3]
> +
> +        uadd16          r8,  r9,  r7            @ block[0,1][0]
> +        uadd16          r2,  r3,  r14           @ block[2,3][0]
> +        usub16          r9,  r9,  r7            @ block[0,1][2]
> +        usub16          r3,  r3,  r14           @ block[2,3][2]
> +        uadd16          r7,  r4,  r6            @ block[0,1][1]
> +        uadd16          r14, r5,  r12           @ block[2,3][1]
> +        usub16          r4,  r4,  r6            @ block[0,1][3]
> +        usub16          r5,  r5,  r12           @ block[2,3][3]
> +
> +        @ store
> +        mov             r6,  r8,  asr #19       @ block[1][0]
> +        mov             r12, r7,  asr #19       @ block[1][1]
> +        mov             r1,  r9,  asr #19       @ block[1][2]
> +        mov             r10, r4,  asr #19       @ block[1][3]

Please use standard UAL mnemonics.

> +        sxth            r8,  r8
> +        sxth            r7,  r7
> +        sxth            r9,  r9
> +        sxth            r4,  r4
> +        asr             r8,  #3                 @ block[0][0]
> +        asr             r7,  #3                 @ block[0][1]
> +        asr             r9,  #3                 @ block[0][2]
> +        asr             r4,  #3                 @ block[0][3]

Store?  I don't see any stores.


[...]

> +@ void vp8_idct_add(uint8_t *dst, DCTELEM block[16], int stride)
> +function ff_vp8_idct_add_armv6, export=1
> +        push           {r4 - r11, lr}
> +        sub             sp,  sp,  #32
> +
> +        mov             r3,  #0x00004E00        @ cos
> +        orr             r3,  r3, #0x0000007B    @ cospi8sqrt2minus1 = 20091
> +        mov             r4,  #0x00008A00        @ sin
> +        orr             r4,  r4, #0x0000008C    @ sinpi8sqrt2 = 35468

Do you enjoy typing and reading zeros?

> +        mov             r5,  #0x2               @ i=2
> +1:
> +        ldr             r6, [r1, #8]            @  i5 | i4  = block1[1] | 
> block1[0]
> +        ldr             r12,[r1, #24]           @ i13 | i12 = block3[1] | 
> block3[0]
> +        ldr             r14,[r1, #16]           @  i9 | i8  = block2[1] | 
> block2[0]
> +
> +        smulwt          r9,  r3,  r6            @ (ip[5] * 
> cospi8sqrt2minus1) >> 16
> +        smulwb          r7,  r3,  r6            @ (ip[4] * 
> cospi8sqrt2minus1) >> 16
> +        smulwt          r10, r4,  r6            @ (ip[5] * sinpi8sqrt2) >> 16
> +        smulwb          r8,  r4,  r6            @ (ip[4] * sinpi8sqrt2) >> 16
> +        pkhbt           r7,  r7,  r9,  lsl #16  @ 5c | 4c
> +        smulwt          r11, r3,  r12           @ (ip[13] * 
> cospi8sqrt2minus1) >> 16
> +        pkhbt           r8,  r8,  r10, lsl #16  @ 5s | 4s         = t2 first 
> half
> +        uadd16          r6,  r6,  r7            @ 5c+5 | 4c+4     = t3 first 
> half
> +        smulwt          r7,  r4,  r12           @ (ip[13] * sinpi8sqrt2) >> 
> 16
> +        smulwb          r9,  r3,  r12           @ (ip[12] * 
> cospi8sqrt2minus1) >> 16
> +        smulwb          r10, r4,  r12           @ (ip[12] * sinpi8sqrt2) >> 
> 16

A bit less space before the comments would make these lines fit in 80 columns.

> +        subs            r5,  r5,  #1            @ i--

Oh, really?

> +@ void vp8_v_loop_filter16_simple(uint8_t *dst, int stride, int flim)
> +function ff_vp8_v_loop_filter16_simple_armv6, export=1
> +        push           {r4 - r11, lr}
> +
> +        ldr_dpren       r3,  r0,  r1,  lsl #1   @ p1
> +        ldr_dpren       r4,  r0,  r1            @ p0
> +        ldr             r5, [r0]                @ q0
> +        ldr             r6, [r0,  r1]           @ q1
> +        orr             r2,  r2,  r2,  lsl #16
> +        mov             r9,  #4                 @ count
> +        mov             lr,  #0                 @ need 0 in a couple places
> +        orr             r12, r2,  r2,  lsl #8   @ splat int -> byte
> +        ldr             r2,  c0x80808080

[...]

> +c0x01010101: .long 0x01010101
> +c0x03030303: .long 0x03030303
> +c0x04040404: .long 0x04040404
> +c0x7F7F7F7F: .long 0x7F7F7F7F
> +c0x80808080: .long 0x80808080

Please learn about the ldr reg, =foo syntax and the mov32 macro.

> +@ void vp8_v_loop_filter16_inner(uint8_t *dst, int stride,
> +@                                int fE, int fI, int hev_thresh)
> +@ and
> +@ void vp8_v_loop_filter8uv_inner(uint8_t *dstU, uint8_t *dstV, int stride,
> +@                                 int fE, int fI, int hev_thresh)
> +@ call:
> +@ void vp8_v_loop_filter_inner(uint8_t *dst, int stride,
> +@                              int fE, int fI, int hev_thresh, int count)
> +function ff_vp8_v_loop_filter_inner_armv6, export=1
> +        push           {r4 - r11, lr}
> +
> +        sub             r0,  r0,  r1,  lsl #2   @ move r0 pointer down by 4 
> lines

That's not the direction I usually call down.

> +@ note: worst case sum of all 6-tap filter values * 255 is 0x7f80 so 16 bit
> +@ arithmatic can be used to apply filters
> +const sixtap_filters_13245600, align=4
> +        .short     2, 108, -11,  36,  -8, 1, 0, 0
> +        .short     3,  77, -16,  77, -16, 3, 0, 0
> +        .short     1,  36,  -8, 108, -11, 2, 0, 0
> +endconst
> +const fourtap_filters_1324, align=4
> +        .short     -6,  12, 123, -1
> +        .short     -9,  50,  93, -6
> +        .short     -6,  93,  50, -9
> +        .short     -1, 123,  12, -6
> +endconst

A blank line between those blocks wouldn't hurt.

[...]

> +        ldr             r8, [lr,  #4]           @ stall - if only I had more 
> registers...

[...]

> +        ldr             r8, [lr,  #8]           @ stall - if only I had more 
> registers...

[...]

> +        ldr             r10,[lr]                @ stall - if only I had more 
> registers...

Please leave whining out of the code.

> +        subs            r4,  r4,  #1            @ counter--

Please keep comments somewhat informative or omit them entirely.

> +        add             sp,  sp,  #4            @ restore stack after 
> push{r1} above

Ditto.

> +        ldr             r9, [r2,  #3]           @ load source data

Ditto.

> +        subs            r4,  r4,  #1            @ counter--

Ditto.

[...]

> +extern void put_vp8_epel4_v6_c(uint8_t *dst, int d, uint8_t *src, int s, int 
> h, int mx, int my);
> +#undef printf

WTF?

> +#define VP8_EPEL_H_OR_V(SIZE, NAME, HV) \
> +static void ff_put_vp8_##NAME##SIZE##_##HV##_armv6( \
> +                                        uint8_t *dst, int dststride, uint8_t 
> *src, \
> +                                        int srcstride, int h, int mx, int 
> my) \
> +{ \
> +    ff_put_vp8_## NAME ## _ ## HV ## _armv6(dst, dststride, src, srcstride, \
> +                                            SIZE, h, mx | my); \
> +}
> +
> +VP8_EPEL_H_OR_V(4,  epel,  h6);
> +VP8_EPEL_H_OR_V(4,  epel,  h4);
> +VP8_EPEL_H_OR_V(4,  epel,  v6);
> +VP8_EPEL_H_OR_V(4,  epel,  v4);
> +VP8_EPEL_H_OR_V(4,  bilin, v);
> +VP8_EPEL_H_OR_V(4,  bilin, h);
> +VP8_EPEL_H_OR_V(8,  epel,  h6);
> +VP8_EPEL_H_OR_V(8,  epel,  h4);
> +VP8_EPEL_H_OR_V(8,  epel,  v6);
> +VP8_EPEL_H_OR_V(8,  epel,  v4);
> +VP8_EPEL_H_OR_V(8,  bilin, v);
> +VP8_EPEL_H_OR_V(8,  bilin, h);
> +VP8_EPEL_H_OR_V(16, epel,  h6);
> +VP8_EPEL_H_OR_V(16, epel,  v6);
> +VP8_EPEL_H_OR_V(16, bilin, v);
> +VP8_EPEL_H_OR_V(16, bilin, h);

No C wrappers please.  Compilers suck.

>  av_cold void ff_vp8dsp_init_arm(VP8DSPContext *dsp)
>  {
> +#define set_func_ptrs(opt) \
> +        dsp->vp8_luma_dc_wht    = ff_vp8_luma_dc_wht_##opt; \
> +        dsp->vp8_luma_dc_wht_dc = ff_vp8_luma_dc_wht_dc_armv6; \
> + \
> +        dsp->vp8_idct_add       = ff_vp8_idct_add_##opt; \
> +        dsp->vp8_idct_dc_add    = ff_vp8_idct_dc_add_##opt; \
> +        dsp->vp8_idct_dc_add4y  = ff_vp8_idct_dc_add4y_##opt; \
> +        dsp->vp8_idct_dc_add4uv = ff_vp8_idct_dc_add4uv_##opt; \
> + \

Ugly formatting.


[...]

> +        set_func_ptrs(neon);
> +    } else if (HAVE_ARMV6) {
> +        set_func_ptrs(armv6);
>      }
>  }

I did not read the code in depth.  Might do if next round looks more
promising.

-- 
Måns Rullgård
[email protected]
_______________________________________________
libav-devel mailing list
[email protected]
https://lists.libav.org/mailman/listinfo/libav-devel

Re: [libav-devel] [PATCH] VP8: armv6 optimizations.

Reply via email to