Re: [libav-devel] [PATCH] aarch64: vp9itxfm: Avoid reloading the idct32 coefficients

Janne Grunau Fri, 10 Feb 2017 13:09:56 -0800

On 2017-02-09 13:27:04 +0200, Martin Storsjö wrote:
> The idct32x32 function actually backed up and restored d8-d15 even


"... pushed onto the stack ..." is IMO clearer, even though there are no
explicit push/pop instructions.

> though it didn't clobber them; there are plenty of registers that
> can be used to allow keeping all the idct coefficients in registers
> without having to reload different subsets of them at different
> stages in the transform.
> 
> After this, we still can skip backing up and restoring d12-d15.

Same comment applies here.

> 
> Before:
> vp9_inv_dct_dct_32x32_sub32_add_neon: 8128.3
> After:
> vp9_inv_dct_dct_32x32_sub32_add_neon: 8053.3
> ---
>  libavcodec/aarch64/vp9itxfm_neon.S | 110 
> +++++++++++++++----------------------
>  1 file changed, 43 insertions(+), 67 deletions(-)
> 
> diff --git a/libavcodec/aarch64/vp9itxfm_neon.S 
> b/libavcodec/aarch64/vp9itxfm_neon.S
> index c954d1a..64286df 100644
> --- a/libavcodec/aarch64/vp9itxfm_neon.S
> +++ b/libavcodec/aarch64/vp9itxfm_neon.S
> @@ -1106,18 +1106,14 @@ endfunc
>  .endm
>  
>  function idct32_odd
> -        ld1             {v0.8h,v1.8h}, [x11]
> -
> -        dmbutterfly     v16, v31, v0.h[0], v0.h[1], v4, v5, v6, v7 // v16 = 
> t16a, v31 = t31a
> -        dmbutterfly     v24, v23, v0.h[2], v0.h[3], v4, v5, v6, v7 // v24 = 
> t17a, v23 = t30a
> -        dmbutterfly     v20, v27, v0.h[4], v0.h[5], v4, v5, v6, v7 // v20 = 
> t18a, v27 = t29a
> -        dmbutterfly     v28, v19, v0.h[6], v0.h[7], v4, v5, v6, v7 // v28 = 
> t19a, v19 = t28a
> -        dmbutterfly     v18, v29, v1.h[0], v1.h[1], v4, v5, v6, v7 // v18 = 
> t20a, v29 = t27a
> -        dmbutterfly     v26, v21, v1.h[2], v1.h[3], v4, v5, v6, v7 // v26 = 
> t21a, v21 = t26a
> -        dmbutterfly     v22, v25, v1.h[4], v1.h[5], v4, v5, v6, v7 // v22 = 
> t22a, v25 = t25a
> -        dmbutterfly     v30, v17, v1.h[6], v1.h[7], v4, v5, v6, v7 // v30 = 
> t23a, v17 = t24a
> -
> -        ld1             {v0.8h}, [x10]
> +        dmbutterfly     v16, v31, v8.h[0], v8.h[1], v4, v5, v6, v7 // v16 = 
> t16a, v31 = t31a
> +        dmbutterfly     v24, v23, v8.h[2], v8.h[3], v4, v5, v6, v7 // v24 = 
> t17a, v23 = t30a
> +        dmbutterfly     v20, v27, v8.h[4], v8.h[5], v4, v5, v6, v7 // v20 = 
> t18a, v27 = t29a
> +        dmbutterfly     v28, v19, v8.h[6], v8.h[7], v4, v5, v6, v7 // v28 = 
> t19a, v19 = t28a
> +        dmbutterfly     v18, v29, v9.h[0], v9.h[1], v4, v5, v6, v7 // v18 = 
> t20a, v29 = t27a
> +        dmbutterfly     v26, v21, v9.h[2], v9.h[3], v4, v5, v6, v7 // v26 = 
> t21a, v21 = t26a
> +        dmbutterfly     v22, v25, v9.h[4], v9.h[5], v4, v5, v6, v7 // v22 = 
> t22a, v25 = t25a
> +        dmbutterfly     v30, v17, v9.h[6], v9.h[7], v4, v5, v6, v7 // v30 = 
> t23a, v17 = t24a
>  
>          butterfly_8h    v4,  v24, v16, v24 // v4  = t16, v24 = t17
>          butterfly_8h    v5,  v20, v28, v20 // v5  = t19, v20 = t18
> @@ -1136,18 +1132,14 @@ function idct32_odd
>  endfunc
>  
>  function idct32_odd_half
> -        ld1             {v0.8h,v1.8h}, [x11]
> -
> -        dmbutterfly_h1  v16, v31, v0.h[0], v0.h[1], v4, v5, v6, v7 // v16 = 
> t16a, v31 = t31a
> -        dmbutterfly_h2  v24, v23, v0.h[2], v0.h[3], v4, v5, v6, v7 // v24 = 
> t17a, v23 = t30a
> -        dmbutterfly_h1  v20, v27, v0.h[4], v0.h[5], v4, v5, v6, v7 // v20 = 
> t18a, v27 = t29a
> -        dmbutterfly_h2  v28, v19, v0.h[6], v0.h[7], v4, v5, v6, v7 // v28 = 
> t19a, v19 = t28a
> -        dmbutterfly_h1  v18, v29, v1.h[0], v1.h[1], v4, v5, v6, v7 // v18 = 
> t20a, v29 = t27a
> -        dmbutterfly_h2  v26, v21, v1.h[2], v1.h[3], v4, v5, v6, v7 // v26 = 
> t21a, v21 = t26a
> -        dmbutterfly_h1  v22, v25, v1.h[4], v1.h[5], v4, v5, v6, v7 // v22 = 
> t22a, v25 = t25a
> -        dmbutterfly_h2  v30, v17, v1.h[6], v1.h[7], v4, v5, v6, v7 // v30 = 
> t23a, v17 = t24a
> -
> -        ld1             {v0.8h}, [x10]
> +        dmbutterfly_h1  v16, v31, v8.h[0], v8.h[1], v4, v5, v6, v7 // v16 = 
> t16a, v31 = t31a
> +        dmbutterfly_h2  v24, v23, v8.h[2], v8.h[3], v4, v5, v6, v7 // v24 = 
> t17a, v23 = t30a
> +        dmbutterfly_h1  v20, v27, v8.h[4], v8.h[5], v4, v5, v6, v7 // v20 = 
> t18a, v27 = t29a
> +        dmbutterfly_h2  v28, v19, v8.h[6], v8.h[7], v4, v5, v6, v7 // v28 = 
> t19a, v19 = t28a
> +        dmbutterfly_h1  v18, v29, v9.h[0], v9.h[1], v4, v5, v6, v7 // v18 = 
> t20a, v29 = t27a
> +        dmbutterfly_h2  v26, v21, v9.h[2], v9.h[3], v4, v5, v6, v7 // v26 = 
> t21a, v21 = t26a
> +        dmbutterfly_h1  v22, v25, v9.h[4], v9.h[5], v4, v5, v6, v7 // v22 = 
> t22a, v25 = t25a
> +        dmbutterfly_h2  v30, v17, v9.h[6], v9.h[7], v4, v5, v6, v7 // v30 = 
> t23a, v17 = t24a
>  
>          butterfly_8h    v4,  v24, v16, v24 // v4  = t16, v24 = t17
>          butterfly_8h    v5,  v20, v28, v20 // v5  = t19, v20 = t18
> @@ -1166,18 +1158,14 @@ function idct32_odd_half
>  endfunc
>  
>  function idct32_odd_quarter
> -        ld1             {v0.8h,v1.8h}, [x11]
> -
> -        dsmull_h        v4,  v5,  v16, v0.h[0]
> -        dsmull_h        v28, v29, v19, v0.h[7]
> -        dsmull_h        v30, v31, v16, v0.h[1]
> -        dsmull_h        v22, v23, v17, v1.h[6]
> -        dsmull_h        v7,  v6,  v17, v1.h[7]
> -        dsmull_h        v26, v27, v19, v0.h[6]
> -        dsmull_h        v20, v21, v18, v1.h[0]
> -        dsmull_h        v24, v25, v18, v1.h[1]
> -
> -        ld1             {v0.8h}, [x10]
> +        dsmull_h        v4,  v5,  v16, v8.h[0]
> +        dsmull_h        v28, v29, v19, v8.h[7]
> +        dsmull_h        v30, v31, v16, v8.h[1]
> +        dsmull_h        v22, v23, v17, v9.h[6]
> +        dsmull_h        v7,  v6,  v17, v9.h[7]
> +        dsmull_h        v26, v27, v19, v8.h[6]
> +        dsmull_h        v20, v21, v18, v9.h[0]
> +        dsmull_h        v24, v25, v18, v9.h[1]
>  
>          neg             v28.4s, v28.4s
>          neg             v29.4s, v29.4s
> @@ -1223,12 +1211,8 @@ endfunc
>  // x1 = unused
>  // x2 = src
>  // x9 = double input stride
> -// x10 = idct_coeffs
> -// x11 = idct_coeffs + 32
>  function idct32_1d_8x32_pass1\suffix\()_neon
>          mov             x14, x30
> -        ld1             {v0.8h,v1.8h}, [x10]
> -
>          movi            v2.8h, #0
>  
>          // v16 = IN(0), v17 = IN(2) ... v31 = IN(30)
> @@ -1261,14 +1245,14 @@ function idct32_1d_8x32_pass1\suffix\()_neon
>  .macro store_rev a, b
>          // There's no rev128 instruction, but we reverse each 64 bit
>          // half, and then flip them using an ext with 8 bytes offset.
> -        rev64           v1.8h, \b
> +        rev64           v3.8h, \b
>          st1             {\a},  [x0], #16
> -        rev64           v0.8h, \a
> -        ext             v1.16b, v1.16b, v1.16b, #8
> +        rev64           v2.8h, \a
> +        ext             v3.16b, v3.16b, v3.16b, #8
>          st1             {\b},  [x0], #16
> -        ext             v0.16b, v0.16b, v0.16b, #8
> -        st1             {v1.8h},  [x0], #16
> -        st1             {v0.8h},  [x0], #16
> +        ext             v2.16b, v2.16b, v2.16b, #8
> +        st1             {v3.8h},  [x0], #16
> +        st1             {v2.8h},  [x0], #16
>  .endm
>          store_rev       v16.8h, v24.8h
>          store_rev       v17.8h, v25.8h
> @@ -1322,20 +1306,20 @@ function idct32_1d_8x32_pass1\suffix\()_neon
>          // subtracted from the output.
>  .macro store_rev a, b
>          ld1             {v4.8h},  [x0]
> -        rev64           v1.8h, \b
> +        rev64           v3.8h, \b
>          add             v4.8h, v4.8h, \a
> -        rev64           v0.8h, \a
> +        rev64           v2.8h, \a
>          st1             {v4.8h},  [x0], #16
> -        ext             v1.16b, v1.16b, v1.16b, #8
> +        ext             v3.16b, v3.16b, v3.16b, #8
>          ld1             {v5.8h},  [x0]
> -        ext             v0.16b, v0.16b, v0.16b, #8
> +        ext             v2.16b, v2.16b, v2.16b, #8
>          add             v5.8h, v5.8h, \b
>          st1             {v5.8h},  [x0], #16
>          ld1             {v6.8h},  [x0]
> -        sub             v6.8h, v6.8h, v1.8h
> +        sub             v6.8h, v6.8h, v3.8h
>          st1             {v6.8h},  [x0], #16
>          ld1             {v7.8h},  [x0]
> -        sub             v7.8h, v7.8h, v0.8h
> +        sub             v7.8h, v7.8h, v2.8h
>          st1             {v7.8h},  [x0], #16
>  .endm
>  
> @@ -1359,12 +1343,8 @@ endfunc
>  // x2 = src (temp buffer)
>  // x7 = negative double temp buffer stride
>  // x9 = double temp buffer stride
> -// x10 = idct_coeffs
> -// x11 = idct_coeffs + 32
>  function idct32_1d_8x32_pass2\suffix\()_neon
>          mov             x14, x30
> -        ld1             {v0.8h,v1.8h}, [x10]
> -
>          // v16 = IN(0), v17 = IN(2) ... v31 = IN(30)
>  .ifb \suffix
>  .irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
> @@ -1437,15 +1417,15 @@ function idct32_1d_8x32_pass2\suffix\()_neon
>          sub             v6.8h, v6.8h, \c
>          sub             v7.8h, v7.8h, \d
>  .endif
> -        ld1             {v0.8b}, [x0], x1
> -        ld1             {v1.8b}, [x0], x1
> +        ld1             {v10.8b}, [x0], x1
> +        ld1             {v11.8b}, [x0], x1
>          srshr           v4.8h, v4.8h, #6
>          ld1             {v2.8b}, [x0], x1
>          srshr           v5.8h, v5.8h, #6
> -        uaddw           v4.8h, v4.8h, v0.8b
> +        uaddw           v4.8h, v4.8h, v10.8b
>          ld1             {v3.8b}, [x0], x1
>          srshr           v6.8h, v6.8h, #6
> -        uaddw           v5.8h, v5.8h, v1.8b
> +        uaddw           v5.8h, v5.8h, v11.8b
>          srshr           v7.8h, v7.8h, #6
>          sub             x0,  x0,  x1, lsl #2
>          uaddw           v6.8h, v6.8h, v2.8b
> @@ -1486,13 +1466,10 @@ function ff_vp9_idct_idct_32x32_add_neon, export=1
>          b.eq            idct32x32_dc_add_neon
>  
>          movrel          x10, idct_coeffs
> -        add             x11, x10, #32
>          movrel          x12, min_eob_idct_idct_32, 2
>  
>          mov             x15, x30
>  
> -        stp             d14, d15, [sp, #-0x10]!
> -        stp             d12, d13, [sp, #-0x10]!
>          stp             d10, d11, [sp, #-0x10]!
>          stp             d8,  d9,  [sp, #-0x10]!
>  
> @@ -1506,6 +1483,9 @@ function ff_vp9_idct_idct_32x32_add_neon, export=1
>          mov             x9,  #128
>          neg             x7,  x9
>  
> +        ld1             {v0.8h,v1.8h}, [x10], #32
> +        ld1             {v8.8h,v9.8h}, [x10]
> +
>          cmp             w3,  #34
>          b.le            idct32x32_quarter_add_neon
>          cmp             w3,  #135
> @@ -1548,8 +1528,6 @@ function ff_vp9_idct_idct_32x32_add_neon, export=1
>  
>          ldp             d8,  d9,  [sp], 0x10
>          ldp             d10, d11, [sp], 0x10
> -        ldp             d12, d13, [sp], 0x10
> -        ldp             d14, d15, [sp], 0x10
>  
>          br              x15
>  endfunc
> @@ -1575,8 +1553,6 @@ function idct32x32_\size\()_add_neon
>  
>          ldp             d8,  d9,  [sp], 0x10
>          ldp             d10, d11, [sp], 0x10
> -        ldp             d12, d13, [sp], 0x10
> -        ldp             d14, d15, [sp], 0x10
>  
>          br              x15
>  endfunc

ok

Janne
_______________________________________________
libav-devel mailing list
libav-devel@libav.org
https://lists.libav.org/mailman/listinfo/libav-devel

Re: [libav-devel] [PATCH] aarch64: vp9itxfm: Avoid reloading the idct32 coefficients

Reply via email to