idct32 when possible (alternative 1)

Martin Storsjö Sun, 05 Feb 2017 04:06:07 -0800

On Sun, 5 Feb 2017, Janne Grunau wrote:

On 2016-12-01 11:27:01 +0200, Martin Storsjö wrote:

This work is sponsored by, and copyright, Google.


This increases the code size of libavcodec/aarch64/vp9itxfm_neon.o
from 14740 to 18504 bytes.

Before:
vp9_inv_dct_dct_16x16_sub1_add_neon:     235.3
vp9_inv_dct_dct_16x16_sub2_add_neon:    1051.0
vp9_inv_dct_dct_16x16_sub4_add_neon:    1051.0
vp9_inv_dct_dct_16x16_sub8_add_neon:    1051.0
vp9_inv_dct_dct_16x16_sub12_add_neon:   1390.3
vp9_inv_dct_dct_16x16_sub16_add_neon:   1390.1
vp9_inv_dct_dct_32x32_sub1_add_neon:     556.5
vp9_inv_dct_dct_32x32_sub2_add_neon:    5199.1
vp9_inv_dct_dct_32x32_sub4_add_neon:    5199.9
vp9_inv_dct_dct_32x32_sub8_add_neon:    5196.9
vp9_inv_dct_dct_32x32_sub12_add_neon:   6171.6
vp9_inv_dct_dct_32x32_sub16_add_neon:   6170.9
vp9_inv_dct_dct_32x32_sub20_add_neon:   7147.1
vp9_inv_dct_dct_32x32_sub24_add_neon:   7147.0
vp9_inv_dct_dct_32x32_sub28_add_neon:   8118.8
vp9_inv_dct_dct_32x32_sub32_add_neon:   8125.8

After:
vp9_inv_dct_dct_16x16_sub1_add_neon:     235.3
vp9_inv_dct_dct_16x16_sub2_add_neon:     697.0
vp9_inv_dct_dct_16x16_sub4_add_neon:     697.0
vp9_inv_dct_dct_16x16_sub8_add_neon:     908.0
vp9_inv_dct_dct_16x16_sub12_add_neon:   1399.6
vp9_inv_dct_dct_16x16_sub16_add_neon:   1403.3
vp9_inv_dct_dct_32x32_sub1_add_neon:     554.1
vp9_inv_dct_dct_32x32_sub2_add_neon:    3879.7
vp9_inv_dct_dct_32x32_sub4_add_neon:    3952.2
vp9_inv_dct_dct_32x32_sub8_add_neon:    3948.4
vp9_inv_dct_dct_32x32_sub12_add_neon:   5462.1
vp9_inv_dct_dct_32x32_sub16_add_neon:   5461.7
vp9_inv_dct_dct_32x32_sub20_add_neon:   7169.2
vp9_inv_dct_dct_32x32_sub24_add_neon:   7162.4
vp9_inv_dct_dct_32x32_sub28_add_neon:   8137.4
vp9_inv_dct_dct_32x32_sub32_add_neon:   8136.7

I.e. in general a very minor overhead for the full subpartition case due
to the additional cmps, but a significant speedup for the cases when we
only need to process a small part of the actual input data.
---
If we hadn't made the core transforms standalone functions,
the code size would have ended up at around 28 KB.
---
 libavcodec/aarch64/vp9itxfm_neon.S | 367 +++++++++++++++++++++++++++++++++++--
 1 file changed, 347 insertions(+), 20 deletions(-)

diff --git a/libavcodec/aarch64/vp9itxfm_neon.S b/libavcodec/aarch64/vp9itxfm_neon.S
index be9643e..bb79348 100644
--- a/libavcodec/aarch64/vp9itxfm_neon.S
+++ b/libavcodec/aarch64/vp9itxfm_neon.S
@@ -75,6 +75,16 @@ endconst
 .endif
 .endm

+// Same as dmbutterfly0 above, but treating the input in in2 as zero,
+// writing the same output into both out1 and out2.
+.macro dmbutterfly0_h out1, out2, in1, in2, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6
+        smull           \tmp1\().4s,  \in1\().4h,  v0.h[0]
+        smull2          \tmp2\().4s,  \in1\().8h,  v0.h[0]
+        rshrn           \out1\().4h,  \tmp1\().4s, #14
+        rshrn2          \out1\().8h,  \tmp2\().4s, #14
+        mov             \out2\().16b, \out1\().16b
+.endm
+
 // out1,out2 = in1 * coef1 - in2 * coef2
 // out3,out4 = in1 * coef2 + in2 * coef1
 // out are 4 x .4s registers, in are 2 x .8h registers
@@ -104,6 +114,43 @@ endconst
         rshrn2          \inout2\().8h, \tmp4\().4s,  #14
 .endm

+// Same as dmbutterfly above, but treating the input in inout2 as zero
+.macro dmbutterfly_h1 inout1, inout2, coef1, coef2, tmp1, tmp2, tmp3, tmp4
+        smull           \tmp1\().4s, \inout1\().4h, \coef1
+        smull2          \tmp2\().4s, \inout1\().8h, \coef1
+        smull           \tmp3\().4s, \inout1\().4h, \coef2
+        smull2          \tmp4\().4s, \inout1\().8h, \coef2
+        rshrn           \inout1\().4h, \tmp1\().4s, #14
+        rshrn2          \inout1\().8h, \tmp2\().4s, #14
+        rshrn           \inout2\().4h, \tmp3\().4s, #14
+        rshrn2          \inout2\().8h, \tmp4\().4s, #14
+.endm
+
+// Same as dmbutterfly above, but treating the input in inout1 as zero
+.macro dmbutterfly_h2 inout1, inout2, coef1, coef2, tmp1, tmp2, tmp3, tmp4
+        smull           \tmp1\().4s, \inout2\().4h, \coef2
+        smull2          \tmp2\().4s, \inout2\().8h, \coef2
+        smull           \tmp3\().4s, \inout2\().4h, \coef1
+        smull2          \tmp4\().4s, \inout2\().8h, \coef1
+        neg             \tmp1\().4s, \tmp1\().4s
+        neg             \tmp2\().4s, \tmp2\().4s
+        rshrn           \inout2\().4h, \tmp3\().4s, #14
+        rshrn2          \inout2\().8h, \tmp4\().4s, #14
+        rshrn           \inout1\().4h, \tmp1\().4s, #14
+        rshrn2          \inout1\().8h, \tmp2\().4s, #14
+.endm
+
+.macro dsmull_h out1, out2, in, coef
+        smull           \out1\().4s, \in\().4h, \coef
+        smull2          \out2\().4s, \in\().8h, \coef
+.endm
+
+.macro drshrn_h out, in1, in2, shift
+        rshrn           \out\().4h, \in1\().4s, \shift
+        rshrn2          \out\().8h, \in2\().4s, \shift
+.endm
+
+
 // out1 = in1 + in2
 // out2 = in1 - in2
 .macro butterfly_8h out1, out2, in1, in2
@@ -463,7 +510,7 @@ function idct16x16_dc_add_neon
         ret
 endfunc

-function idct16
+.macro idct16_full
         dmbutterfly0    v16, v24, v16, v24, v2, v3, v4, v5, v6, v7 // v16 = 
t0a,  v24 = t1a
         dmbutterfly     v20, v28, v0.h[1], v0.h[2], v2, v3, v4, v5 // v20 = 
t2a,  v28 = t3a
         dmbutterfly     v18, v30, v0.h[3], v0.h[4], v2, v3, v4, v5 // v18 = 
t4a,  v30 = t7a
@@ -485,7 +532,10 @@ function idct16
         dmbutterfly0    v22, v26, v22, v26, v2, v3, v18, v19, v30, v31        
// v22 = t6a,  v26 = t5a
         dmbutterfly     v23, v25, v0.h[1], v0.h[2], v18, v19, v30, v31        
// v23 = t9a,  v25 = t14a
         dmbutterfly     v27, v21, v0.h[1], v0.h[2], v18, v19, v30, v31, neg=1 
// v27 = t13a, v21 = t10a
+        idct16_end

I think it would be clearer if idct16_end is used directly from the macro. It would probably also make sense to move idct16_end and avoid the idct16_full macro. The patch might be smaller and it is immediately obvious that there is no code change, but the resulting code is more complicated than it needs to be. The same applies to arm if we go with alternative 1.


Ok, so you mean like this?

function idct16
        dmbutterfly...
        ....
        idct16_end
endfunc

The same goes for alt 2 as well, these parts are identical in both.

// Martin
_______________________________________________
libav-devel mailing list
[email protected]
https://lists.libav.org/mailman/listinfo/libav-devel

Re: [libav-devel] [PATCH 5/5] aarch64: vp9itxfm: Do a simpler half/quarter idct16/idct32 when possible (alternative 1)

Reply via email to