On Sun, 13 Nov 2016, Janne Grunau wrote:

On 2016-11-11 21:43:09 +0200, Martin Storsjö wrote:
This work is sponsored by, and copyright, Google.

These are ported from the ARM version; thanks to the larger
amount of registers available, we can do the loop filters with
16 pixels at a time. The implementation is fully templated, with
a single macro which can generate versions for both 8 and
16 pixels wide, for both 4, 8 and 16 pixels loop filters
(and the 4/8 mixed versions as well).

For the 8 pixel wide versions, it is pretty close in speed (the
v_4_8 and v_8_8 filters are the best examples of this; the h_4_8
and h_8_8 filters seem to get some gain in the load/transpose/store
part). For the 16 pixels wide ones, we get a speedup of around
1.2-1.4x compared to the 32 bit version.

Examples of runtimes vs the 32 bit version, on a Cortex A53:
                                       ARM AArch64
vp9_loop_filter_h_4_8_neon:          144.0   128.2
vp9_loop_filter_h_8_8_neon:          207.0   182.5
vp9_loop_filter_h_16_8_neon:         415.0   329.7
vp9_loop_filter_h_16_16_neon:        672.0   558.0
vp9_loop_filter_mix2_h_44_16_neon:   302.0   202.5
vp9_loop_filter_mix2_h_48_16_neon:   365.0   307.2
vp9_loop_filter_mix2_h_84_16_neon:   365.0   307.2
vp9_loop_filter_mix2_h_88_16_neon:   376.0   307.2
vp9_loop_filter_mix2_v_44_16_neon:   193.2   128.2
vp9_loop_filter_mix2_v_48_16_neon:   246.7   219.2
vp9_loop_filter_mix2_v_84_16_neon:   248.0   219.5
vp9_loop_filter_mix2_v_88_16_neon:   302.0   219.2
vp9_loop_filter_v_4_8_neon:           89.0    89.7
vp9_loop_filter_v_8_8_neon:          141.0   137.7
vp9_loop_filter_v_16_8_neon:         295.0   273.7
vp9_loop_filter_v_16_16_neon:        546.0   454.7

Apple A7 benchmarks were just too unreliable so I skipped them.

                         A57 gcc-5.3  neon
loop_filter_h_4_8_neon:        256.6  93.4
loop_filter_h_8_8_neon:        307.3 139.1
loop_filter_h_16_8_neon:       340.1 254.1
loop_filter_h_16_16_neon:      827.0 407.9
loop_filter_mix2_h_44_16_neon: 524.5 155.4
loop_filter_mix2_h_48_16_neon: 644.5 173.3
loop_filter_mix2_h_84_16_neon: 630.5 222.0
loop_filter_mix2_h_88_16_neon: 697.3 222.0
loop_filter_mix2_v_44_16_neon: 598.5 100.6
loop_filter_mix2_v_48_16_neon: 651.5 127.0
loop_filter_mix2_v_84_16_neon: 591.5 167.1
loop_filter_mix2_v_88_16_neon: 855.1 166.7
loop_filter_v_4_8_neon:        271.7  65.3
loop_filter_v_8_8_neon:        312.5 106.9
loop_filter_v_16_8_neon:       473.3 206.5
loop_filter_v_16_16_neon:      976.1 327.8

The speed-up compared to the C functions is 2.5x to 6x, and the Cortex-A57
is again 30-50% faster than the Cortex-A53.

Thanks!

+.macro loop_filter wd, sz, mix, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, tmp8
+.if \mix == 0
+        dup             v0.8h,  w2        // E
+        dup             v1.8h,  w2        // E
+        dup             v2\sz,  w3        // I
+        dup             v3\sz,  w4        // H
+.else
+        and             w5,     w2,  #0xff
+        and             w6,     w3,  #0xff
+        and             w7,     w4,  #0xff
+        dup             v0.8h,  w5        // E

        dup             v0.8h,  w2        // E
        ushr            v1.8h,  v0.8h,  #8
        bic             v0.8h,  #255, lsl 8

is one instruction less and looks faster on a cortex-a57

Turns out to be equally fast on A53, after rescheduling it.

+        dup             v2.8b,  w6        // I
+        dup             v3.8b,  w7        // H

the last two can use w3 and w4 directly without and 0xff

Indeed, thanks!

+// For wd <= 8, we use v16-v19 and v28-v31 for temp registers,
+// while we need those for inputs/outputs in wd=16 and use v8-v15
+// for temp registers there instead.
+function vp9_loop_filter_4
+        loop_filter     4,  .8b,  0,    v16, v17, v18, v19, v28, v29, v30, v31
+        mov             x5,  #0

unneeded

Thanks, yes, that's a leftover since when the 9: label had a ret instruction.

+        ret
+9:
+        br              x10
+endfunc
+
+function vp9_loop_filter_4_16b_mix_44
+        loop_filter     4,  .16b, 44,   v16, v17, v18, v19, v28, v29, v30, v31
+        mov             x5,  #0

same

Ditto, fixed - thanks.

+        ret
+9:
+        br              x10
+endfunc
+
+function vp9_loop_filter_8
+        loop_filter     8,  .8b,  0,    v16, v17, v18, v19, v28, v29, v30, v31
+        mov             x5,  #0
+        ret
+6:
+        mov             x5,  #6
+        ret

I tried to return directly by loading the labels into registers and using br
to branch to them directly, but with mixed results.

Oh, clever. I'll hold off doing that right now though.

+9:
+        br              x10
+endfunc
+
+function vp9_loop_filter_8_16b_mix
+        loop_filter     8,  .16b, 88,   v16, v17, v18, v19, v28, v29, v30, v31
+        mov             x5,  #0
+        ret
+6:
+        mov             x5,  #6
+        ret
+9:
+        br              x10
+endfunc
+
+function vp9_loop_filter_16
+        loop_filter     16, .8b,  0,    v8,  v9,  v10, v11, v12, v13, v14, v15
+        mov             x5,  #0
+        ret
+7:
+        mov             x5,  #7
+        ret
+8:
+        mov             x5,  #8
+        ret
+9:
+        ldp             d8,  d9,  [sp]
+        ldp             d10, d11, [sp, #0x10]
+        ldp             d12, d13, [sp, #0x20]
+        ldp             d14, d15, [sp, #0x30]
+        add             sp,  sp,  #0x40

if the post index variant is not slower we could use that

Hmm, in general, the loads/stores with writeback (for ld1/st1) seem to introduce a bit of data dependency, but on the A53, the runtime turns out to be identical to before. It's probably worthwhile to do then though, since it's one instruction less. I'll update it to do that everywhere.

I'll go with the same approach, "stp d8, d9, [sp, #-0x10]!" for pushing as well - that actually seems to be a 1 cycle speedup.

+function ff_vp9_loop_filter_v_16_8_neon, export=1
+        mov             x10, x30
+        sub             sp,  sp,  #0x40
+        stp             d8,  d9,  [sp]
+        stp             d10, d11, [sp, #0x10]
+        stp             d12, d13, [sp, #0x20]
+        stp             d14, d15, [sp, #0x30]
+        sub             x9,  x0,  x1, lsl #3
+        ld1             {v16.8b}, [x9], x1 // p7
+        ld1             {v24.8b}, [x0], x1 // q0
+        ld1             {v17.8b}, [x9], x1 // p6
+        ld1             {v25.8b}, [x0], x1 // q1
+        ld1             {v18.8b}, [x9], x1 // p5
+        ld1             {v26.8b}, [x0], x1 // q2
+        ld1             {v19.8b}, [x9], x1 // p4
+        ld1             {v27.8b}, [x0], x1 // q3
+        ld1             {v20.8b}, [x9], x1 // p3
+        ld1             {v28.8b}, [x0], x1 // q4
+        ld1             {v21.8b}, [x9], x1 // p2
+        ld1             {v29.8b}, [x0], x1 // q5
+        ld1             {v22.8b}, [x9], x1 // p1
+        ld1             {v30.8b}, [x0], x1 // q6
+        ld1             {v23.8b}, [x9], x1 // p0
+        ld1             {v31.8b}, [x0], x1 // q7
+        sub             x9,  x9,  x1, lsl #3
+        sub             x0,  x0,  x1, lsl #3
+        add             x9,  x9,  x1
+
+        loop_filter_16
+
+        // If we did the flat8out part, we get the output in
+        // v2-v17 (skipping v7 and v16). x9 points to x0 - 7 * stride,
+        // store v2-v9 there, and v10-v17 into x0.
+        st1             {v2.8b},  [x9], x1
+        st1             {v10.8b}, [x0], x1
+        st1             {v3.8b},  [x9], x1
+        st1             {v11.8b}, [x0], x1
+        st1             {v4.8b},  [x9], x1
+        st1             {v12.8b}, [x0], x1
+        st1             {v5.8b},  [x9], x1
+        st1             {v13.8b}, [x0], x1
+        st1             {v6.8b},  [x9], x1
+        st1             {v14.8b}, [x0], x1
+        st1             {v8.8b},  [x9], x1
+        st1             {v15.8b}, [x0], x1
+        st1             {v9.8b},  [x9], x1
+        st1             {v17.8b}, [x0], x1
+
+        ldp             d8,  d9,  [sp]
+        ldp             d10, d11, [sp, #0x10]
+        ldp             d12, d13, [sp, #0x20]
+        ldp             d14, d15, [sp, #0x30]
+        add             sp,  sp,  #0x40
+        br              x10

I don't think it makes sense to duplicate this for every variant, just
branch to tail of the function, the same applies to the other 16_8/16
functions

Fair point. I'll use the main/full version as the proper tail (which isn't at the absolute end of the function), and sacrifice one cycle for the branch for the shortcuts, instead of sacrificing one cycle for the worst case for the full processing.

I'll push soon with these modifications.

// Martin
_______________________________________________
libav-devel mailing list
[email protected]
https://lists.libav.org/mailman/listinfo/libav-devel

Reply via email to