This work is sponsored by, and copyright, Google.
This increases the code size of libavcodec/aarch64/vp9itxfm_neon.o
from 14740 to 18504 bytes.
Before:
vp9_inv_dct_dct_16x16_sub1_add_neon: 235.3
vp9_inv_dct_dct_16x16_sub2_add_neon: 1051.0
vp9_inv_dct_dct_16x16_sub4_add_neon: 1051.0
vp9_inv_dct_dct_16x16_sub8_add_neon: 1051.0
vp9_inv_dct_dct_16x16_sub12_add_neon: 1390.3
vp9_inv_dct_dct_16x16_sub16_add_neon: 1390.1
vp9_inv_dct_dct_32x32_sub1_add_neon: 556.5
vp9_inv_dct_dct_32x32_sub2_add_neon: 5199.1
vp9_inv_dct_dct_32x32_sub4_add_neon: 5199.9
vp9_inv_dct_dct_32x32_sub8_add_neon: 5196.9
vp9_inv_dct_dct_32x32_sub12_add_neon: 6171.6
vp9_inv_dct_dct_32x32_sub16_add_neon: 6170.9
vp9_inv_dct_dct_32x32_sub20_add_neon: 7147.1
vp9_inv_dct_dct_32x32_sub24_add_neon: 7147.0
vp9_inv_dct_dct_32x32_sub28_add_neon: 8118.8
vp9_inv_dct_dct_32x32_sub32_add_neon: 8125.8
After:
vp9_inv_dct_dct_16x16_sub1_add_neon: 235.3
vp9_inv_dct_dct_16x16_sub2_add_neon: 697.0
vp9_inv_dct_dct_16x16_sub4_add_neon: 697.0
vp9_inv_dct_dct_16x16_sub8_add_neon: 908.0
vp9_inv_dct_dct_16x16_sub12_add_neon: 1399.6
vp9_inv_dct_dct_16x16_sub16_add_neon: 1403.3
vp9_inv_dct_dct_32x32_sub1_add_neon: 554.1
vp9_inv_dct_dct_32x32_sub2_add_neon: 3879.7
vp9_inv_dct_dct_32x32_sub4_add_neon: 3952.2
vp9_inv_dct_dct_32x32_sub8_add_neon: 3948.4
vp9_inv_dct_dct_32x32_sub12_add_neon: 5462.1
vp9_inv_dct_dct_32x32_sub16_add_neon: 5461.7
vp9_inv_dct_dct_32x32_sub20_add_neon: 7169.2
vp9_inv_dct_dct_32x32_sub24_add_neon: 7162.4
vp9_inv_dct_dct_32x32_sub28_add_neon: 8137.4
vp9_inv_dct_dct_32x32_sub32_add_neon: 8136.7
I.e. in general a very minor overhead for the full subpartition case due
to the additional cmps, but a significant speedup for the cases when we
only need to process a small part of the actual input data.
---
If we hadn't made the core transforms standalone functions,
the code size would have ended up at around 28 KB.
---
libavcodec/aarch64/vp9itxfm_neon.S | 367 +++++++++++++++++++++++++++++++++++--
1 file changed, 347 insertions(+), 20 deletions(-)
diff --git a/libavcodec/aarch64/vp9itxfm_neon.S b/libavcodec/aarch64/vp9itxfm_neon.S
index be9643e..bb79348 100644
--- a/libavcodec/aarch64/vp9itxfm_neon.S
+++ b/libavcodec/aarch64/vp9itxfm_neon.S
@@ -75,6 +75,16 @@ endconst
.endif
.endm
+// Same as dmbutterfly0 above, but treating the input in in2 as zero, so out1 and
+// out2 both get in1 * v0.h[0] (rounding >> 14); in2 and tmp3-tmp6 are unused.
+.macro dmbutterfly0_h out1, out2, in1, in2, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6
+ smull \tmp1\().4s, \in1\().4h, v0.h[0] // tmp1 = in1[0-3] * v0.h[0], as 32 bit products
+ smull2 \tmp2\().4s, \in1\().8h, v0.h[0] // tmp2 = in1[4-7] * v0.h[0], as 32 bit products
+ rshrn \out1\().4h, \tmp1\().4s, #14 // out1[0-3] = tmp1, rounding >> 14, narrowed to 16 bit
+ rshrn2 \out1\().8h, \tmp2\().4s, #14 // out1[4-7] = tmp2, rounding >> 14, narrowed to 16 bit
+ mov \out2\().16b, \out1\().16b // out2 = out1; with in2 == 0 sum and difference coincide
+.endm
+
// out1,out2 = in1 * coef1 - in2 * coef2
// out3,out4 = in1 * coef2 + in2 * coef1
// out are 4 x .4s registers, in are 2 x .8h registers
@@ -104,6 +114,43 @@ endconst
rshrn2 \inout2\().8h, \tmp4\().4s, #14
.endm
+// Same as dmbutterfly above, but treating the input in inout2 as zero; inout2 is only written here.
+.macro dmbutterfly_h1 inout1, inout2, coef1, coef2, tmp1, tmp2, tmp3, tmp4
+ smull \tmp1\().4s, \inout1\().4h, \coef1 // tmp1 = inout1[0-3] * coef1
+ smull2 \tmp2\().4s, \inout1\().8h, \coef1 // tmp2 = inout1[4-7] * coef1
+ smull \tmp3\().4s, \inout1\().4h, \coef2 // tmp3 = inout1[0-3] * coef2
+ smull2 \tmp4\().4s, \inout1\().8h, \coef2 // tmp4 = inout1[4-7] * coef2
+ rshrn \inout1\().4h, \tmp1\().4s, #14 // all products are formed, so inout1 can be overwritten
+ rshrn2 \inout1\().8h, \tmp2\().4s, #14 // inout1 = old inout1 * coef1, rounding >> 14
+ rshrn \inout2\().4h, \tmp3\().4s, #14 // previous contents of inout2 are never read
+ rshrn2 \inout2\().8h, \tmp4\().4s, #14 // inout2 = old inout1 * coef2, rounding >> 14
+.endm
+
+// Same as dmbutterfly above, but treating the input in inout1 as zero; inout1 is only written here.
+.macro dmbutterfly_h2 inout1, inout2, coef1, coef2, tmp1, tmp2, tmp3, tmp4
+ smull \tmp1\().4s, \inout2\().4h, \coef2 // tmp1 = inout2[0-3] * coef2
+ smull2 \tmp2\().4s, \inout2\().8h, \coef2 // tmp2 = inout2[4-7] * coef2
+ smull \tmp3\().4s, \inout2\().4h, \coef1 // tmp3 = inout2[0-3] * coef1
+ smull2 \tmp4\().4s, \inout2\().8h, \coef1 // tmp4 = inout2[4-7] * coef1
+ neg \tmp1\().4s, \tmp1\().4s // tmp1 = -(inout2[0-3] * coef2)
+ neg \tmp2\().4s, \tmp2\().4s // tmp2 = -(inout2[4-7] * coef2)
+ rshrn \inout2\().4h, \tmp3\().4s, #14 // all products are formed, so inout2 can be overwritten
+ rshrn2 \inout2\().8h, \tmp4\().4s, #14 // inout2 = old inout2 * coef1, rounding >> 14
+ rshrn \inout1\().4h, \tmp1\().4s, #14 // previous contents of inout1 are never read
+ rshrn2 \inout1\().8h, \tmp2\().4s, #14 // inout1 = -(old inout2 * coef2), rounding >> 14
+.endm
+
+.macro dsmull_h out1, out2, in, coef
+ smull \out1\().4s, \in\().4h, \coef // out1 = in[0-3] * coef, signed, widened to 4 x 32 bit
+ smull2 \out2\().4s, \in\().8h, \coef // out2 = in[4-7] * coef, signed, widened to 4 x 32 bit
+.endm
+
+.macro drshrn_h out, in1, in2, shift
+ rshrn \out\().4h, \in1\().4s, \shift // out[0-3] = in1, rounding >> shift, narrowed to 16 bit
+ rshrn2 \out\().8h, \in2\().4s, \shift // out[4-7] = in2, rounding >> shift, narrowed to 16 bit
+.endm
+
+
// out1 = in1 + in2
// out2 = in1 - in2
.macro butterfly_8h out1, out2, in1, in2
@@ -463,7 +510,7 @@ function idct16x16_dc_add_neon
ret
endfunc
-function idct16
+.macro idct16_full
dmbutterfly0 v16, v24, v16, v24, v2, v3, v4, v5, v6, v7 // v16 =
t0a, v24 = t1a
dmbutterfly v20, v28, v0.h[1], v0.h[2], v2, v3, v4, v5 // v20 =
t2a, v28 = t3a
dmbutterfly v18, v30, v0.h[3], v0.h[4], v2, v3, v4, v5 // v18 =
t4a, v30 = t7a
@@ -485,7 +532,10 @@ function idct16
dmbutterfly0 v22, v26, v22, v26, v2, v3, v18, v19, v30, v31
// v22 = t6a, v26 = t5a
dmbutterfly v23, v25, v0.h[1], v0.h[2], v18, v19, v30, v31
// v23 = t9a, v25 = t14a
dmbutterfly v27, v21, v0.h[1], v0.h[2], v18, v19, v30, v31, neg=1
// v27 = t13a, v21 = t10a
+ idct16_end