On 2016-12-01 11:27:02 +0200, Martin Storsjö wrote:
> This work is sponsored by, and copyright, Google.
> 
> This makes it easier to avoid filling the temp buffer with zeros for the
> skipped slices, and leads to slightly more straightforward code for these
> cases (for the 16x16 case, where the special case pass functions are
> written out instead of templated from the same macro), instead of riddling
> the common code with special case branches or macro .ifs.
> 
> The code size increases from 14740 bytes to 24472 bytes.
> 
> Before:
> vp9_inv_dct_dct_16x16_sub1_add_neon:     235.3
> vp9_inv_dct_dct_16x16_sub2_add_neon:    1051.0
> vp9_inv_dct_dct_16x16_sub4_add_neon:    1051.0
> vp9_inv_dct_dct_16x16_sub8_add_neon:    1051.0
> vp9_inv_dct_dct_16x16_sub12_add_neon:   1390.3
> vp9_inv_dct_dct_16x16_sub16_add_neon:   1390.1
> vp9_inv_dct_dct_32x32_sub1_add_neon:     556.5
> vp9_inv_dct_dct_32x32_sub2_add_neon:    5199.1
> vp9_inv_dct_dct_32x32_sub4_add_neon:    5199.9
> vp9_inv_dct_dct_32x32_sub8_add_neon:    5196.9
> vp9_inv_dct_dct_32x32_sub12_add_neon:   6171.6
> vp9_inv_dct_dct_32x32_sub16_add_neon:   6170.9
> vp9_inv_dct_dct_32x32_sub20_add_neon:   7147.1
> vp9_inv_dct_dct_32x32_sub24_add_neon:   7147.0
> vp9_inv_dct_dct_32x32_sub28_add_neon:   8118.8
> vp9_inv_dct_dct_32x32_sub32_add_neon:   8125.8
> 
> After:
> vp9_inv_dct_dct_16x16_sub1_add_neon:     235.3
> vp9_inv_dct_dct_16x16_sub2_add_neon:     639.0
> vp9_inv_dct_dct_16x16_sub4_add_neon:     639.0
> vp9_inv_dct_dct_16x16_sub8_add_neon:     845.0
> vp9_inv_dct_dct_16x16_sub12_add_neon:   1389.4
> vp9_inv_dct_dct_16x16_sub16_add_neon:   1389.3
> vp9_inv_dct_dct_32x32_sub1_add_neon:     556.5
> vp9_inv_dct_dct_32x32_sub2_add_neon:    3684.1
> vp9_inv_dct_dct_32x32_sub4_add_neon:    3682.6
> vp9_inv_dct_dct_32x32_sub8_add_neon:    3684.1
> vp9_inv_dct_dct_32x32_sub12_add_neon:   5319.0
> vp9_inv_dct_dct_32x32_sub16_add_neon:   5323.5
> vp9_inv_dct_dct_32x32_sub20_add_neon:   7149.8
> vp9_inv_dct_dct_32x32_sub24_add_neon:   7148.2
> vp9_inv_dct_dct_32x32_sub28_add_neon:   8124.5
> vp9_inv_dct_dct_32x32_sub32_add_neon:   8122.1
> 
> ---
> If we wouldn't have made the core transforms standalone functions,
> the code size would end up at around 34 KB.
> 
> The binary output is 6 KB larger than in the other alternative,
> but is more straightforward and gives better opportunities to
> special case them further.
> 
> In the arm version, there was a significant speedup compared to the
> other alternative (having cmps within the functions), skipping
> zeroing of the temp buffer. Here there's much less difference.

And the relative binary size difference is even larger. It would be a 
little strange to choose different alternatives for 32- and 64-bit but 
it sounds like alternative 1 might be better for arm64. Please run a 
full decoding benchmark for arm64 too.

> ---
>  libavcodec/aarch64/vp9itxfm_neon.S | 628 
> +++++++++++++++++++++++++++++++++----
>  1 file changed, 566 insertions(+), 62 deletions(-)
> 
> diff --git a/libavcodec/aarch64/vp9itxfm_neon.S 
> b/libavcodec/aarch64/vp9itxfm_neon.S
> index be9643e..9910170 100644
> --- a/libavcodec/aarch64/vp9itxfm_neon.S
> +++ b/libavcodec/aarch64/vp9itxfm_neon.S
> @@ -75,6 +75,16 @@ endconst
>  .endif
>  .endm
>  
> +// Same as dmbutterfly0 above, but treating the input in in2 as zero,
> +// writing the same output into both out1 and out2.
> +.macro dmbutterfly0_h out1, out2, in1, in2, tmp1, tmp2, tmp3, tmp4, tmp5, 
> tmp6
> +        smull           \tmp1\().4s,  \in1\().4h,  v0.h[0]
> +        smull2          \tmp2\().4s,  \in1\().8h,  v0.h[0]
> +        rshrn           \out1\().4h,  \tmp1\().4s, #14
> +        rshrn2          \out1\().8h,  \tmp2\().4s, #14
> +        mov             \out2\().16b, \out1\().16b
> +.endm
> +
>  // out1,out2 = in1 * coef1 - in2 * coef2
>  // out3,out4 = in1 * coef2 + in2 * coef1
>  // out are 4 x .4s registers, in are 2 x .8h registers
> @@ -104,6 +114,43 @@ endconst
>          rshrn2          \inout2\().8h, \tmp4\().4s,  #14
>  .endm
>  
> +// Same as dmbutterfly above, but treating the input in inout2 as zero
> +.macro dmbutterfly_h1 inout1, inout2, coef1, coef2, tmp1, tmp2, tmp3, tmp4
> +        smull           \tmp1\().4s, \inout1\().4h, \coef1
> +        smull2          \tmp2\().4s, \inout1\().8h, \coef1
> +        smull           \tmp3\().4s, \inout1\().4h, \coef2
> +        smull2          \tmp4\().4s, \inout1\().8h, \coef2
> +        rshrn           \inout1\().4h, \tmp1\().4s, #14
> +        rshrn2          \inout1\().8h, \tmp2\().4s, #14
> +        rshrn           \inout2\().4h, \tmp3\().4s, #14
> +        rshrn2          \inout2\().8h, \tmp4\().4s, #14
> +.endm
> +
> +// Same as dmbutterfly above, but treating the input in inout1 as zero
> +.macro dmbutterfly_h2 inout1, inout2, coef1, coef2, tmp1, tmp2, tmp3, tmp4
> +        smull           \tmp1\().4s, \inout2\().4h, \coef2
> +        smull2          \tmp2\().4s, \inout2\().8h, \coef2
> +        smull           \tmp3\().4s, \inout2\().4h, \coef1
> +        smull2          \tmp4\().4s, \inout2\().8h, \coef1
> +        neg             \tmp1\().4s, \tmp1\().4s
> +        neg             \tmp2\().4s, \tmp2\().4s
> +        rshrn           \inout2\().4h, \tmp3\().4s, #14
> +        rshrn2          \inout2\().8h, \tmp4\().4s, #14
> +        rshrn           \inout1\().4h, \tmp1\().4s, #14
> +        rshrn2          \inout1\().8h, \tmp2\().4s, #14
> +.endm
> +
> +.macro dsmull_h out1, out2, in, coef
> +        smull           \out1\().4s, \in\().4h, \coef
> +        smull2          \out2\().4s, \in\().8h, \coef
> +.endm
> +
> +.macro drshrn_h out, in1, in2, shift
> +        rshrn           \out\().4h, \in1\().4s, \shift
> +        rshrn2          \out\().8h, \in2\().4s, \shift
> +.endm
> +
> +
>  // out1 = in1 + in2
>  // out2 = in1 - in2
>  .macro butterfly_8h out1, out2, in1, in2
> @@ -463,7 +510,7 @@ function idct16x16_dc_add_neon
>          ret
>  endfunc
>  
> -function idct16
> +.macro idct16_full
>          dmbutterfly0    v16, v24, v16, v24, v2, v3, v4, v5, v6, v7 // v16 = 
> t0a,  v24 = t1a
>          dmbutterfly     v20, v28, v0.h[1], v0.h[2], v2, v3, v4, v5 // v20 = 
> t2a,  v28 = t3a
>          dmbutterfly     v18, v30, v0.h[3], v0.h[4], v2, v3, v4, v5 // v18 = 
> t4a,  v30 = t7a
> @@ -485,7 +532,10 @@ function idct16
>          dmbutterfly0    v22, v26, v22, v26, v2, v3, v18, v19, v30, v31       
>  // v22 = t6a,  v26 = t5a
>          dmbutterfly     v23, v25, v0.h[1], v0.h[2], v18, v19, v30, v31       
>  // v23 = t9a,  v25 = t14a
>          dmbutterfly     v27, v21, v0.h[1], v0.h[2], v18, v19, v30, v31, 
> neg=1 // v27 = t13a, v21 = t10a
> +        idct16_end

see comment in alternative 1

> +.endm
>  
> +.macro idct16_end
>          butterfly_8h    v18, v7,  v4,  v7                // v18 = t0a,  v7  
> = t7a
>          butterfly_8h    v19, v22, v5,  v22               // v19 = t1a,  v22 
> = t6
>          butterfly_8h    v4,  v26, v20, v26               // v4  = t2a,  v26 
> = t5
> @@ -507,6 +557,68 @@ function idct16
>          butterfly_8h    v20, v27, v6,  v27               // v20 = out[4], 
> v27 = out[11]
>          butterfly_8h    v21, v26, v26, v3                // v21 = out[5], 
> v26 = out[10]
>          ret
> +.endm
> +
> +function idct16
> +        idct16_full
> +endfunc
> +
> +function idct16_half
> +        dmbutterfly0_h  v16, v24, v16, v24, v2, v3, v4, v5, v6, v7 // v16 = 
> t0a,  v24 = t1a
> +        dmbutterfly_h1  v20, v28, v0.h[1], v0.h[2], v2, v3, v4, v5 // v20 = 
> t2a,  v28 = t3a
> +        dmbutterfly_h1  v18, v30, v0.h[3], v0.h[4], v2, v3, v4, v5 // v18 = 
> t4a,  v30 = t7a
> +        dmbutterfly_h2  v26, v22, v0.h[5], v0.h[6], v2, v3, v4, v5 // v26 = 
> t5a,  v22 = t6a
> +        dmbutterfly_h1  v17, v31, v0.h[7], v1.h[0], v2, v3, v4, v5 // v17 = 
> t8a,  v31 = t15a
> +        dmbutterfly_h2  v25, v23, v1.h[1], v1.h[2], v2, v3, v4, v5 // v25 = 
> t9a,  v23 = t14a
> +        dmbutterfly_h1  v21, v27, v1.h[3], v1.h[4], v2, v3, v4, v5 // v21 = 
> t10a, v27 = t13a
> +        dmbutterfly_h2  v29, v19, v1.h[5], v1.h[6], v2, v3, v4, v5 // v29 = 
> t11a, v19 = t12a
> +
> +        butterfly_8h    v4,  v28, v16, v28               // v4  = t0,   v28 
> = t3
> +        butterfly_8h    v5,  v20, v24, v20               // v5  = t1,   v20 
> = t2
> +        butterfly_8h    v6,  v26, v18, v26               // v6  = t4,   v26 
> = t5
> +        butterfly_8h    v7,  v22, v30, v22               // v7  = t7,   v22 
> = t6
> +        butterfly_8h    v16, v25, v17, v25               // v16 = t8,   v25 
> = t9
> +        butterfly_8h    v24, v21, v29, v21               // v24 = t11,  v21 
> = t10
> +        butterfly_8h    v17, v27, v19, v27               // v17 = t12,  v27 
> = t13
> +        butterfly_8h    v29, v23, v31, v23               // v29 = t15,  v23 
> = t14
> +
> +        dmbutterfly0    v22, v26, v22, v26, v2, v3, v18, v19, v30, v31       
>  // v22 = t6a,  v26 = t5a
> +        dmbutterfly     v23, v25, v0.h[1], v0.h[2], v18, v19, v30, v31       
>  // v23 = t9a,  v25 = t14a
> +        dmbutterfly     v27, v21, v0.h[1], v0.h[2], v18, v19, v30, v31, 
> neg=1 // v27 = t13a, v21 = t10a
> +        idct16_end
> +endfunc
> +
> +function idct16_quarter
> +        dsmull_h        v24, v25, v19, v1.h[6]
> +        dsmull_h        v4,  v5,  v17, v0.h[7]
> +        dsmull_h        v7,  v6,  v18, v0.h[4]
> +        dsmull_h        v30, v31, v18, v0.h[3]
> +        neg             v24.4s,  v24.4s
> +        neg             v25.4s,  v25.4s
> +        dsmull_h        v29, v28, v17, v1.h[0]
> +        dsmull_h        v26, v27, v19, v1.h[5]
> +        dsmull_h        v22, v23, v16, v0.h[0]
> +        drshrn_h        v24, v24, v25, #14
> +        drshrn_h        v16, v4,  v5,  #14
> +        drshrn_h        v7,  v7,  v6,  #14
> +        drshrn_h        v6,  v30, v31, #14
> +        drshrn_h        v29, v29, v28, #14
> +        drshrn_h        v17, v26, v27, #14
> +        drshrn_h        v28, v22, v23, #14
> +
> +        dmbutterfly_l   v20, v21, v22, v23, v17, v24, v0.h[1], v0.h[2]
> +        dmbutterfly_l   v18, v19, v30, v31, v29, v16, v0.h[1], v0.h[2]
> +        neg             v22.4s,  v22.4s
> +        neg             v23.4s,  v23.4s
> +        drshrn_h        v27, v20, v21, #14
> +        drshrn_h        v21, v22, v23, #14
> +        drshrn_h        v23, v18, v19, #14
> +        drshrn_h        v25, v30, v31, #14
> +        mov             v4.16b,  v28.16b
> +        mov             v5.16b,  v28.16b
> +        dmbutterfly0    v22, v26, v7,  v6,  v18, v19, v30, v31
> +        mov             v20.16b, v28.16b
> +        idct16_end
>  endfunc
>  
>  function iadst16
> @@ -598,6 +710,51 @@ endfunc
>          st1             {v2.8h},  [\src], \inc
>  .endm
>  
> +.macro load_add_store coef0, coef1, coef2, coef3, coef4, coef5, coef6, 
> coef7, tmp1, tmp2
> +        srshr           \coef0, \coef0, #6
> +        ld1             {v2.8b},  [x0], x1
> +        srshr           \coef1, \coef1, #6
> +        ld1             {v3.8b},  [x3], x1
> +        srshr           \coef2, \coef2, #6
> +        ld1             {v4.8b},  [x0], x1
> +        srshr           \coef3, \coef3, #6
> +        uaddw           \coef0, \coef0, v2.8b
> +        ld1             {v5.8b},  [x3], x1
> +        uaddw           \coef1, \coef1, v3.8b
> +        srshr           \coef4, \coef4, #6
> +        ld1             {v6.8b},  [x0], x1
> +        srshr           \coef5, \coef5, #6
> +        ld1             {v7.8b},  [x3], x1
> +        sqxtun          v2.8b,  \coef0
> +        srshr           \coef6, \coef6, #6
> +        sqxtun          v3.8b,  \coef1
> +        srshr           \coef7, \coef7, #6
> +        uaddw           \coef2, \coef2, v4.8b
> +        ld1             {\tmp1},  [x0], x1
> +        uaddw           \coef3, \coef3, v5.8b
> +        ld1             {\tmp2},  [x3], x1
> +        sqxtun          v4.8b,  \coef2
> +        sub             x0,  x0,  x1, lsl #2
> +        sub             x3,  x3,  x1, lsl #2
> +        sqxtun          v5.8b,  \coef3
> +        uaddw           \coef4, \coef4, v6.8b
> +        st1             {v2.8b},  [x0], x1
> +        uaddw           \coef5, \coef5, v7.8b
> +        st1             {v3.8b},  [x3], x1
> +        sqxtun          v6.8b,  \coef4
> +        st1             {v4.8b},  [x0], x1
> +        sqxtun          v7.8b,  \coef5
> +        st1             {v5.8b},  [x3], x1
> +        uaddw           \coef6, \coef6, \tmp1
> +        st1             {v6.8b},  [x0], x1
> +        uaddw           \coef7, \coef7, \tmp2
> +        st1             {v7.8b},  [x3], x1
> +        sqxtun          \tmp1,  \coef6
> +        sqxtun          \tmp2,  \coef7
> +        st1             {\tmp1},  [x0], x1
> +        st1             {\tmp2},  [x3], x1
> +.endm
> +
>  // Read a vertical 8x16 slice out of a 16x16 matrix, do a transform on it,
>  // transpose into a horizontal 16x8 slice and store.
>  // x0 = dst (temp buffer)
> @@ -671,53 +828,8 @@ function \txfm\()16_1d_8x16_pass2_neon
>          lsl             x1,  x1,  #1
>          bl              \txfm\()16
>  
> -.macro load_add_store coef0, coef1, coef2, coef3, coef4, coef5, coef6, 
> coef7, tmp1, tmp2
> -        srshr           \coef0, \coef0, #6
> -        ld1             {v2.8b},  [x0], x1
> -        srshr           \coef1, \coef1, #6
> -        ld1             {v3.8b},  [x3], x1
> -        srshr           \coef2, \coef2, #6
> -        ld1             {v4.8b},  [x0], x1
> -        srshr           \coef3, \coef3, #6
> -        uaddw           \coef0, \coef0, v2.8b
> -        ld1             {v5.8b},  [x3], x1
> -        uaddw           \coef1, \coef1, v3.8b
> -        srshr           \coef4, \coef4, #6
> -        ld1             {v6.8b},  [x0], x1
> -        srshr           \coef5, \coef5, #6
> -        ld1             {v7.8b},  [x3], x1
> -        sqxtun          v2.8b,  \coef0
> -        srshr           \coef6, \coef6, #6
> -        sqxtun          v3.8b,  \coef1
> -        srshr           \coef7, \coef7, #6
> -        uaddw           \coef2, \coef2, v4.8b
> -        ld1             {\tmp1},  [x0], x1
> -        uaddw           \coef3, \coef3, v5.8b
> -        ld1             {\tmp2},  [x3], x1
> -        sqxtun          v4.8b,  \coef2
> -        sub             x0,  x0,  x1, lsl #2
> -        sub             x3,  x3,  x1, lsl #2
> -        sqxtun          v5.8b,  \coef3
> -        uaddw           \coef4, \coef4, v6.8b
> -        st1             {v2.8b},  [x0], x1
> -        uaddw           \coef5, \coef5, v7.8b
> -        st1             {v3.8b},  [x3], x1
> -        sqxtun          v6.8b,  \coef4
> -        st1             {v4.8b},  [x0], x1
> -        sqxtun          v7.8b,  \coef5
> -        st1             {v5.8b},  [x3], x1
> -        uaddw           \coef6, \coef6, \tmp1
> -        st1             {v6.8b},  [x0], x1
> -        uaddw           \coef7, \coef7, \tmp2
> -        st1             {v7.8b},  [x3], x1
> -        sqxtun          \tmp1,  \coef6
> -        sqxtun          \tmp2,  \coef7
> -        st1             {\tmp1},  [x0], x1
> -        st1             {\tmp2},  [x3], x1
> -.endm
>          load_add_store  v16.8h, v17.8h, v18.8h, v19.8h, v20.8h, v21.8h, 
> v22.8h, v23.8h, v16.8b, v17.8b
>          load_add_store  v24.8h, v25.8h, v26.8h, v27.8h, v28.8h, v29.8h, 
> v30.8h, v31.8h, v16.8b, v17.8b
> -.purgem load_add_store
>  
>          br              x14
>  endfunc
> @@ -731,6 +843,10 @@ function ff_vp9_\txfm1\()_\txfm2\()_16x16_add_neon, 
> export=1
>  .ifc \txfm1\()_\txfm2,idct_idct
>          cmp             w3,  #1
>          b.eq            idct16x16_dc_add_neon
> +        cmp             w3,  #10
> +        b.le            idct16x16_quarter_add_neon
> +        cmp             w3,  #38
> +        b.le            idct16x16_half_add_neon
>  .endif
>          mov             x15, x30
>          // iadst16 requires clobbering v8-v15, but idct16 doesn't need to.
> @@ -812,6 +928,155 @@ itxfm_func16x16 iadst, idct
>  itxfm_func16x16 idct,  iadst
>  itxfm_func16x16 iadst, iadst
>  
> +function idct16_1d_8x16_pass1_quarter_neon
> +        mov             x14, x30
> +        movi            v2.8h, #0
> +.irp i, 16, 17, 18, 19
> +        load_clear      \i,  x2,  x9
> +.endr
> +
> +        bl              idct16_quarter
> +
> +        // Do two 8x8 transposes. Originally, v16-v31 contain the
> +        // 16 rows. Afterwards, v16-v23 and v24-v31 contain the two
> +        // transposed 8x8 blocks.
> +        transpose_8x8H  v16, v17, v18, v19, v20, v21, v22, v23, v2, v3
> +        transpose_8x8H  v24, v25, v26, v27, v28, v29, v30, v31, v2, v3
> +
> +        // Store the transposed 8x8 blocks horizontally.
> +        // The first 8x8 block is kept in registers for the second pass,
> +        // store the rest in the temp buffer.
> +        // Since only a 4x4 part of the input was nonzero, this means that
> +        // only 4 rows are nonzero after transposing, and the second pass
> +        // only reads the topmost 4 rows. Therefore only store the topmost
> +        // 4 rows.
> +        add             x0,  x0,  #16
> +.irp i, 24, 25, 26, 27
> +        store           \i,  x0,  x9
> +.endr
> +        br              x14
> +endfunc
> +
> +function idct16_1d_8x16_pass2_quarter_neon
> +        mov             x14, x30
> +        cbz             x3,  1f
> +.irp i, 16, 17, 18, 19
> +        load            \i,  x2,  x9
> +.endr
> +1:
> +
> +        add             x3,  x0,  x1
> +        lsl             x1,  x1,  #1
> +        bl              idct16_quarter
> +
> +        load_add_store  v16.8h, v17.8h, v18.8h, v19.8h, v20.8h, v21.8h, 
> v22.8h, v23.8h, v16.8b, v17.8b
> +        load_add_store  v24.8h, v25.8h, v26.8h, v27.8h, v28.8h, v29.8h, 
> v30.8h, v31.8h, v16.8b, v17.8b
> +
> +        br              x14
> +endfunc
> +
> +function idct16_1d_8x16_pass1_half_neon
> +        mov             x14, x30
> +        movi            v2.8h, #0
> +.irp i, 16, 17, 18, 19, 20, 21, 22, 23
> +        load_clear      \i,  x2,  x9
> +.endr
> +
> +        bl              idct16_half
> +
> +        // Do two 8x8 transposes. Originally, v16-v31 contain the
> +        // 16 rows. Afterwards, v16-v23 and v24-v31 contain the two
> +        // transposed 8x8 blocks.
> +        transpose_8x8H  v16, v17, v18, v19, v20, v21, v22, v23, v2, v3
> +        transpose_8x8H  v24, v25, v26, v27, v28, v29, v30, v31, v2, v3
> +
> +        // Store the transposed 8x8 blocks horizontally.
> +        // The first 8x8 block is kept in registers for the second pass,
> +        // store the rest in the temp buffer.
> +        add             x0,  x0,  #16
> +.irp i, 24, 25, 26, 27, 28, 29, 30, 31
> +        store           \i,  x0,  x9
> +.endr
> +        br              x14
> +endfunc
> +
> +function idct16_1d_8x16_pass2_half_neon
> +        mov             x14, x30
> +        cbz             x3,  1f
> +.irp i, 16, 17, 18, 19, 20, 21, 22, 23
> +        load            \i,  x2,  x9
> +.endr
> +1:
> +
> +        add             x3,  x0,  x1
> +        lsl             x1,  x1,  #1
> +        bl              idct16_half
> +
> +        load_add_store  v16.8h, v17.8h, v18.8h, v19.8h, v20.8h, v21.8h, 
> v22.8h, v23.8h, v16.8b, v17.8b
> +        load_add_store  v24.8h, v25.8h, v26.8h, v27.8h, v28.8h, v29.8h, 
> v30.8h, v31.8h, v16.8b, v17.8b
> +
> +        br              x14
> +endfunc
> +
> +function idct16x16_quarter_add_neon, export=1
> +        mov             x15, x30
> +
> +        sub             sp,  sp,  #512
> +
> +        mov             x4,  x0
> +        mov             x5,  x1
> +        mov             x6,  x2
> +
> +        movrel          x10, idct_coeffs
> +        ld1             {v0.8h,v1.8h}, [x10]
> +        mov             x9, #32
> +
> +.irp i, 0
> +        add             x0,  sp,  #(\i*32)
> +        add             x2,  x6,  #(\i*2)
> +        bl              idct16_1d_8x16_pass1_quarter_neon
> +.endr
> +.irp i, 0, 8
> +        add             x0,  x4,  #(\i)
> +        mov             x1,  x5
> +        add             x2,  sp,  #(\i*2)
> +        mov             x3,  #\i
> +        bl              idct16_1d_8x16_pass2_quarter_neon
> +.endr
> +
> +        add             sp,  sp,  #512
> +        br              x15
> +endfunc
> +
> +function idct16x16_half_add_neon, export=1
> +        mov             x15, x30
> +
> +        sub             sp,  sp,  #512
> +
> +        mov             x4,  x0
> +        mov             x5,  x1
> +        mov             x6,  x2
> +
> +        movrel          x10, idct_coeffs
> +        ld1             {v0.8h,v1.8h}, [x10]
> +        mov             x9, #32
> +
> +.irp i, 0
> +        add             x0,  sp,  #(\i*32)
> +        add             x2,  x6,  #(\i*2)
> +        bl              idct16_1d_8x16_pass1_half_neon
> +.endr
> +.irp i, 0, 8
> +        add             x0,  x4,  #(\i)
> +        mov             x1,  x5
> +        add             x2,  sp,  #(\i*2)
> +        mov             x3,  #\i
> +        bl              idct16_1d_8x16_pass2_half_neon
> +.endr
> +
> +        add             sp,  sp,  #512
> +        br              x15
> +endfunc

These two should be templated.

>  
>  function idct32x32_dc_add_neon
>          movrel          x4, idct_coeffs
> @@ -848,7 +1113,7 @@ function idct32x32_dc_add_neon
>          ret
>  endfunc
>  
> -function idct32_odd
> +.macro idct32_odd_full
>          ld1             {v0.8h,v1.8h}, [x11]
>  
>          dmbutterfly     v16, v31, v0.h[0], v0.h[1], v4, v5, v6, v7 // v16 = 
> t16a, v31 = t31a
> @@ -875,7 +1140,10 @@ function idct32_odd
>          dmbutterfly     v27, v20, v0.h[3], v0.h[4], v16, v17, v18, v19, 
> neg=1 // v27 = t29a, v20 = t18a
>          dmbutterfly     v21, v26, v0.h[5], v0.h[6], v16, v17, v18, v19       
>  // v21 = t21a, v26 = t26a
>          dmbutterfly     v25, v22, v0.h[5], v0.h[6], v16, v17, v18, v19, 
> neg=1 // v25 = t25a, v22 = t22a
> +        idct32_end
> +.endm
>  
> +.macro idct32_end
>          butterfly_8h    v16, v5,  v4,  v5  // v16 = t16a, v5  = t19a
>          butterfly_8h    v17, v20, v23, v20 // v17 = t17,  v20 = t18
>          butterfly_8h    v18, v6,  v7,  v6  // v18 = t23a, v6  = t20a
> @@ -904,8 +1172,91 @@ function idct32_odd
>          dmbutterfly0    v25, v22, v25, v22, v2, v3, v4, v5, v6, v7 // v25 = 
> t25,  v22 = t22
>          dmbutterfly0    v24, v23, v24, v23, v2, v3, v4, v5, v6, v7 // v24 = 
> t24a, v23 = t23a
>          ret
> +.endm
> +
> +function idct32_odd
> +        idct32_odd_full
> +endfunc
> +
> +function idct32_odd_half
> +        ld1             {v0.8h,v1.8h}, [x11]
> +
> +        dmbutterfly_h1  v16, v31, v0.h[0], v0.h[1], v4, v5, v6, v7 // v16 = 
> t16a, v31 = t31a
> +        dmbutterfly_h2  v24, v23, v0.h[2], v0.h[3], v4, v5, v6, v7 // v24 = 
> t17a, v23 = t30a
> +        dmbutterfly_h1  v20, v27, v0.h[4], v0.h[5], v4, v5, v6, v7 // v20 = 
> t18a, v27 = t29a
> +        dmbutterfly_h2  v28, v19, v0.h[6], v0.h[7], v4, v5, v6, v7 // v28 = 
> t19a, v19 = t28a
> +        dmbutterfly_h1  v18, v29, v1.h[0], v1.h[1], v4, v5, v6, v7 // v18 = 
> t20a, v29 = t27a
> +        dmbutterfly_h2  v26, v21, v1.h[2], v1.h[3], v4, v5, v6, v7 // v26 = 
> t21a, v21 = t26a
> +        dmbutterfly_h1  v22, v25, v1.h[4], v1.h[5], v4, v5, v6, v7 // v22 = 
> t22a, v25 = t25a
> +        dmbutterfly_h2  v30, v17, v1.h[6], v1.h[7], v4, v5, v6, v7 // v30 = 
> t23a, v17 = t24a
> +
> +        ld1             {v0.8h}, [x10]
> +
> +        butterfly_8h    v4,  v24, v16, v24 // v4  = t16, v24 = t17
> +        butterfly_8h    v5,  v20, v28, v20 // v5  = t19, v20 = t18
> +        butterfly_8h    v6,  v26, v18, v26 // v6  = t20, v26 = t21
> +        butterfly_8h    v7,  v22, v30, v22 // v7  = t23, v22 = t22
> +        butterfly_8h    v28, v25, v17, v25 // v28 = t24, v25 = t25
> +        butterfly_8h    v30, v21, v29, v21 // v30 = t27, v21 = t26
> +        butterfly_8h    v29, v23, v31, v23 // v29 = t31, v23 = t30
> +        butterfly_8h    v31, v27, v19, v27 // v31 = t28, v27 = t29
> +
> +        dmbutterfly     v23, v24, v0.h[3], v0.h[4], v16, v17, v18, v19       
>  // v23 = t17a, v24 = t30a
> +        dmbutterfly     v27, v20, v0.h[3], v0.h[4], v16, v17, v18, v19, 
> neg=1 // v27 = t29a, v20 = t18a
> +        dmbutterfly     v21, v26, v0.h[5], v0.h[6], v16, v17, v18, v19       
>  // v21 = t21a, v26 = t26a
> +        dmbutterfly     v25, v22, v0.h[5], v0.h[6], v16, v17, v18, v19, 
> neg=1 // v25 = t25a, v22 = t22a
> +        idct32_end
> +endfunc
> +
> +function idct32_odd_quarter
> +        ld1             {v0.8h,v1.8h}, [x11]
> +
> +        dsmull_h        v4,  v5,  v16, v0.h[0]
> +        dsmull_h        v28, v29, v19, v0.h[7]
> +        dsmull_h        v30, v31, v16, v0.h[1]
> +        dsmull_h        v22, v23, v17, v1.h[6]
> +        dsmull_h        v7,  v6,  v17, v1.h[7]
> +        dsmull_h        v26, v27, v19, v0.h[6]
> +        dsmull_h        v20, v21, v18, v1.h[0]
> +        dsmull_h        v24, v25, v18, v1.h[1]
> +
> +        ld1             {v0.8h}, [x10]
> +
> +        neg             v28.4s, v28.4s
> +        neg             v29.4s, v29.4s
> +        neg             v7.4s,  v7.4s
> +        neg             v6.4s,  v6.4s
> +
> +        drshrn_h        v4,  v4,  v5,  #14
> +        drshrn_h        v5,  v28, v29, #14
> +        drshrn_h        v29, v30, v31, #14
> +        drshrn_h        v28, v22, v23, #14
> +        drshrn_h        v7,  v7,  v6,  #14
> +        drshrn_h        v31, v26, v27, #14
> +        drshrn_h        v6,  v20, v21, #14
> +        drshrn_h        v30, v24, v25, #14
> +
> +        dmbutterfly_l   v16, v17, v18, v19, v29, v4,  v0.h[3], v0.h[4]
> +        dmbutterfly_l   v27, v26, v20, v21, v31, v5,  v0.h[3], v0.h[4]
> +        drshrn_h        v23, v16, v17, #14
> +        drshrn_h        v24, v18, v19, #14
> +        neg             v20.4s, v20.4s
> +        neg             v21.4s, v21.4s
> +        drshrn_h        v27, v27, v26, #14
> +        drshrn_h        v20, v20, v21, #14
> +        dmbutterfly_l   v16, v17, v18, v19, v30, v6,  v0.h[5], v0.h[6]
> +        drshrn_h        v21, v16, v17, #14
> +        drshrn_h        v26, v18, v19, #14
> +        dmbutterfly_l   v16, v17, v18, v19, v28, v7,  v0.h[5], v0.h[6]
> +        drshrn_h        v25, v16, v17, #14
> +        neg             v18.4s, v18.4s
> +        neg             v19.4s, v19.4s
> +        drshrn_h        v22, v18, v19, #14
> +
> +        idct32_end
>  endfunc
>  
> +.macro idct32_funcs suffix
>  // Do an 32-point IDCT of a 8x32 slice out of a 32x32 matrix.
>  // The 32-point IDCT can be decomposed into two 16-point IDCTs;
>  // a normal IDCT16 with every other input component (the even ones, with
> @@ -917,19 +1268,30 @@ endfunc
>  // x9 = double input stride
>  // x10 = idct_coeffs
>  // x11 = idct_coeffs + 32
> -function idct32_1d_8x32_pass1_neon
> +function idct32_1d_8x32_pass1\suffix\()_neon
>          mov             x14, x30
>          ld1             {v0.8h,v1.8h}, [x10]
>  
> -        movi            v4.8h, #0
> +        movi            v2.8h, #0
>  
>          // v16 = IN(0), v17 = IN(2) ... v31 = IN(30)
> +.ifb \suffix
>  .irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
> -        ld1             {v\i\().8h}, [x2]
> -        st1             {v4.8h},  [x2], x9
> +        load_clear      \i, x2, x9
> +.endr
> +.endif
> +.ifc \suffix,_quarter
> +.irp i, 16, 17, 18, 19
> +        load_clear      \i, x2, x9
> +.endr
> +.endif
> +.ifc \suffix,_half
> +.irp i, 16, 17, 18, 19, 20, 21, 22, 23
> +        load_clear      \i, x2, x9
>  .endr
> +.endif
>  
> -        bl              idct16
> +        bl              idct16\suffix
>  
>          // Do two 8x8 transposes. Originally, v16-v31 contain the
>          // 16 rows. Afterwards, v16-v23 and v24-v31 contain the
> @@ -964,17 +1326,36 @@ function idct32_1d_8x32_pass1_neon
>  
>          // Move x2 back to the start of the input, and move
>          // to the first odd row
> +.ifb \suffix
>          sub             x2,  x2,  x9, lsl #4
> +.endif
> +.ifc \suffix,_quarter
> +        sub             x2,  x2,  x9, lsl #2
> +.endif
> +.ifc \suffix,_half
> +        sub             x2,  x2,  x9, lsl #3
> +.endif
>          add             x2,  x2,  #64
>  
> -        movi            v4.8h, #0
> +        movi            v2.8h, #0
>          // v16 = IN(1), v17 = IN(3) ... v31 = IN(31)
> +.ifb \suffix
>  .irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
> -        ld1             {v\i\().8h}, [x2]
> -        st1             {v4.8h},  [x2], x9
> +        load_clear      \i, x2, x9
> +.endr
> +.endif
> +.ifc \suffix,_quarter
> +.irp i, 16, 17, 18, 19
> +        load_clear      \i, x2, x9
> +.endr
> +.endif
> +.ifc \suffix,_half
> +.irp i, 16, 17, 18, 19, 20, 21, 22, 23
> +        load_clear      \i, x2, x9
>  .endr
> +.endif
>  
> -        bl              idct32_odd
> +        bl              idct32_odd\suffix
>  
>          transpose_8x8H v31, v30, v29, v28, v27, v26, v25, v24, v2, v3
>          transpose_8x8H v23, v22, v21, v20, v19, v18, v17, v16, v2, v3
> @@ -1023,33 +1404,61 @@ endfunc
>  // x9 = double temp buffer stride
>  // x10 = idct_coeffs
>  // x11 = idct_coeffs + 32
> -function idct32_1d_8x32_pass2_neon
> +function idct32_1d_8x32_pass2\suffix\()_neon
>          mov             x14, x30
>          ld1             {v0.8h,v1.8h}, [x10]
>  
>          // v16 = IN(0), v17 = IN(2) ... v31 = IN(30)
> +.ifb \suffix
>  .irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
> -        ld1             {v\i\().8h}, [x2], x9
> +        load            \i, x2, x9
>  .endr
>          sub             x2,  x2,  x9, lsl #4
> +.endif
> +.ifc \suffix,_quarter
> +.irp i, 16, 17, 18, 19
> +        load            \i, x2, x9
> +.endr
> +        sub             x2,  x2,  x9, lsl #2
> +.endif
> +.ifc \suffix,_half
> +.irp i, 16, 17, 18, 19, 20, 21, 22, 23
> +        load            \i, x2, x9
> +.endr
> +        sub             x2,  x2,  x9, lsl #3
> +.endif
>  
> -        bl              idct16
> +        bl              idct16\suffix
>  
>  .irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
> -        st1             {v\i\().8h}, [x2], x9
> +        store           \i, x2, x9
>  .endr
>  
>          sub             x2,  x2,  x9, lsl #4
>          add             x2,  x2,  #64
>  
>          // v16 = IN(1), v17 = IN(3) ... v31 = IN(31)
> +.ifb \suffix
>  .irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
> -        ld1             {v\i\().8h}, [x2], x9
> +        load            \i, x2, x9
>  .endr
>          sub             x2,  x2,  x9, lsl #4
> +.endif
> +.ifc \suffix,_quarter
> +.irp i, 16, 17, 18, 19
> +        load            \i, x2, x9
> +.endr
> +        sub             x2,  x2,  x9, lsl #2
> +.endif
> +.ifc \suffix,_half
> +.irp i, 16, 17, 18, 19, 20, 21, 22, 23
> +        load            \i, x2, x9
> +.endr
> +        sub             x2,  x2,  x9, lsl #3
> +.endif
>          sub             x2,  x2,  #64
>  
> -        bl              idct32_odd
> +        bl              idct32_odd\suffix
>  
>  .macro load_acc_store a, b, c, d, neg=0
>  .if \neg == 0
> @@ -1105,6 +1514,11 @@ function idct32_1d_8x32_pass2_neon
>  .purgem load_acc_store
>          br              x14
>  endfunc
> +.endm
> +
> +idct32_funcs
> +idct32_funcs _quarter
> +idct32_funcs _half
>  
>  const min_eob_idct_idct_32, align=4
>          .short  0, 34, 135, 336
> @@ -1113,6 +1527,10 @@ endconst
>  function ff_vp9_idct_idct_32x32_add_neon, export=1
>          cmp             w3,  #1
>          b.eq            idct32x32_dc_add_neon

Saving d8-d15 should be done here; that saves duplicating it in the 
quarter/half variants. The same goes for loading idct_coeffs and the other 
shared setup.

> +        cmp             w3,  #34
> +        b.le            idct32x32_quarter_add_neon
> +        cmp             w3,  #135
> +        b.le            idct32x32_half_add_neon
>  
>          movrel          x10, idct_coeffs
>          add             x11, x10, #32
> @@ -1177,3 +1595,89 @@ function ff_vp9_idct_idct_32x32_add_neon, export=1
>  
>          br              x15
>  endfunc
> +
> +function idct32x32_quarter_add_neon
> +        movrel          x10, idct_coeffs
> +        add             x11, x10, #32
> +
> +        mov             x15, x30
> +
> +        stp             d14, d15, [sp, #-0x10]!
> +        stp             d12, d13, [sp, #-0x10]!
> +        stp             d10, d11, [sp, #-0x10]!
> +        stp             d8,  d9,  [sp, #-0x10]!
> +
> +        sub             sp,  sp,  #2048
> +
> +        mov             x4,  x0
> +        mov             x5,  x1
> +        mov             x6,  x2
> +
> +        // Double stride of the input, since we only read every other line
> +        mov             x9,  #128
> +        neg             x7,  x9
> +
> +.irp i, 0
> +        add             x0,  sp,  #(\i*64)
> +        add             x2,  x6,  #(\i*2)
> +        bl              idct32_1d_8x32_pass1_quarter_neon
> +.endr
> +.irp i, 0, 8, 16, 24
> +        add             x0,  x4,  #(\i)
> +        mov             x1,  x5
> +        add             x2,  sp,  #(\i*2)
> +        bl              idct32_1d_8x32_pass2_quarter_neon
> +.endr
> +
> +        add             sp,  sp,  #2048
> +
> +        ldp             d8,  d9,  [sp], 0x10
> +        ldp             d10, d11, [sp], 0x10
> +        ldp             d12, d13, [sp], 0x10
> +        ldp             d14, d15, [sp], 0x10
> +
> +        br              x15
> +endfunc
> +
> +function idct32x32_half_add_neon
> +        movrel          x10, idct_coeffs
> +        add             x11, x10, #32
> +
> +        mov             x15, x30
> +
> +        stp             d14, d15, [sp, #-0x10]!
> +        stp             d12, d13, [sp, #-0x10]!
> +        stp             d10, d11, [sp, #-0x10]!
> +        stp             d8,  d9,  [sp, #-0x10]!
> +
> +        sub             sp,  sp,  #2048
> +
> +        mov             x4,  x0
> +        mov             x5,  x1
> +        mov             x6,  x2
> +
> +        // Double stride of the input, since we only read every other line
> +        mov             x9,  #128
> +        neg             x7,  x9
> +
> +.irp i, 0, 8
> +        add             x0,  sp,  #(\i*64)
> +        add             x2,  x6,  #(\i*2)
> +        bl              idct32_1d_8x32_pass1_half_neon
> +.endr
> +.irp i, 0, 8, 16, 24
> +        add             x0,  x4,  #(\i)
> +        mov             x1,  x5
> +        add             x2,  sp,  #(\i*2)
> +        bl              idct32_1d_8x32_pass2_half_neon
> +.endr
> +
> +        add             sp,  sp,  #2048
> +
> +        ldp             d8,  d9,  [sp], 0x10
> +        ldp             d10, d11, [sp], 0x10
> +        ldp             d12, d13, [sp], 0x10
> +        ldp             d14, d15, [sp], 0x10
> +
> +        br              x15
> +endfunc

otherwise ok

Janne
_______________________________________________
libav-devel mailing list
[email protected]
https://lists.libav.org/mailman/listinfo/libav-devel

Reply via email to