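All coefficients are used pairwise, except for the first one.
Previously, the 16th element was the unused one, which meant that
some pairs straddled two registers (e.g. v0.h[7] and v1.h[0]).
Move the unused element to the second slot instead, so that none
of the later element pairs are split across registers.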
This simplifies loading only a part of the coefficients, and
reduces the differences from the 16 bpp version.
---
libavcodec/aarch64/vp9itxfm_neon.S | 124 ++---
1 file changed, 62 insertions(+), 62 deletions(-)
diff --git a/libavcodec/aarch64/vp9itxfm_neon.S b/libavcodec/aarch64/vp9itxfm_neon.S
index c954d1a..f87f6bd 100644
--- a/libavcodec/aarch64/vp9itxfm_neon.S
+++ b/libavcodec/aarch64/vp9itxfm_neon.S
@@ -22,7 +22,7 @@
#include "neon.S"
const itxfm4_coeffs, align=4
-.short 11585, 6270, 15137, 0
+.short 11585, 0, 6270, 15137
iadst4_coeffs:
.short 5283, 15212, 9929, 13377
endconst
@@ -30,8 +30,8 @@ endconst
const iadst8_coeffs, align=4
.short 16305, 1606, 14449, 7723, 10394, 12665, 4756, 15679
idct_coeffs:
-.short 11585, 6270, 15137, 3196, 16069, 13623, 9102, 1606
-.short 16305, 12665, 10394, 7723, 14449, 15679, 4756, 0
+.short 11585, 0, 6270, 15137, 3196, 16069, 13623, 9102
+.short 1606, 16305, 12665, 10394, 7723, 14449, 15679, 4756
.short 804, 16364, 12140, 11003, 7005, 14811, 15426, 5520
.short 3981, 15893, 14053, 8423, 9760, 13160, 16207, 2404
endconst
@@ -192,14 +192,14 @@ endconst
.endm
.macro idct4 c0, c1, c2, c3
-smull v22.4s,\c1\().4h, v0.h[2]
-smull v20.4s,\c1\().4h, v0.h[1]
+smull v22.4s,\c1\().4h, v0.h[3]
+smull v20.4s,\c1\().4h, v0.h[2]
add v16.4h,\c0\().4h, \c2\().4h
sub v17.4h,\c0\().4h, \c2\().4h
-smlal v22.4s,\c3\().4h, v0.h[1]
+smlal v22.4s,\c3\().4h, v0.h[2]
smull v18.4s,v16.4h,v0.h[0]
smull v19.4s,v17.4h,v0.h[0]
-smlsl v20.4s,\c3\().4h, v0.h[2]
+smlsl v20.4s,\c3\().4h, v0.h[3]
rshrn v22.4h,v22.4s,#14
rshrn v18.4h,v18.4s,#14
rshrn v19.4h,v19.4s,#14
@@ -326,9 +326,9 @@ itxfm_func4x4 iwht, iwht
.macro idct8
dmbutterfly0v16, v20, v16, v20, v2, v3, v4, v5, v6, v7 // v16 =
t0a, v20 = t1a
-dmbutterfly v18, v22, v0.h[1], v0.h[2], v2, v3, v4, v5 // v18 =
t2a, v22 = t3a
-dmbutterfly v17, v23, v0.h[3], v0.h[4], v2, v3, v4, v5 // v17 =
t4a, v23 = t7a
-dmbutterfly v21, v19, v0.h[5], v0.h[6], v2, v3, v4, v5 // v21 =
t5a, v19 = t6a
+dmbutterfly v18, v22, v0.h[2], v0.h[3], v2, v3, v4, v5 // v18 =
t2a, v22 = t3a
+dmbutterfly v17, v23, v0.h[4], v0.h[5], v2, v3, v4, v5 // v17 =
t4a, v23 = t7a
+dmbutterfly v21, v19, v0.h[6], v0.h[7], v2, v3, v4, v5 // v21 =
t5a, v19 = t6a
butterfly_8hv24, v25, v16, v22 // v24 = t0, v25 = t3
butterfly_8hv28, v29, v17, v21 // v28 = t4, v29 = t5a
@@ -361,8 +361,8 @@ itxfm_func4x4 iwht, iwht
dmbutterfly0v19, v20, v6, v7, v24, v26, v27, v28, v29, v30 //
v19 = -out[3], v20 = out[4]
neg v19.8h, v19.8h // v19 = out[3]
-dmbutterfly_l v26, v27, v28, v29, v5, v3, v0.h[1], v0.h[2] //
v26,v27 = t5a, v28,v29 = t4a
-dmbutterfly_l v2, v3, v4, v5, v31, v25, v0.h[2], v0.h[1] //
v2,v3 = t6a, v4,v5 = t7a
+dmbutterfly_l v26, v27, v28, v29, v5, v3, v0.h[2], v0.h[3] //
v26,v27 = t5a, v28,v29 = t4a
+dmbutterfly_l v2, v3, v4, v5, v31, v25, v0.h[3], v0.h[2] //
v2,v3 = t6a, v4,v5 = t7a
dbutterfly_nv17, v30, v28, v29, v2, v3, v6, v7, v24, v25 //
v17 = -out[1], v30 = t6
dbutterfly_nv22, v31, v26, v27, v4, v5, v6, v7, v24, v25 //
v22 = out[6], v31 = t7
@@ -537,13 +537,13 @@ endfunc
function idct16
dmbutterfly0v16, v24, v16, v24, v2, v3, v4, v5, v6, v7 // v16 =
t0a, v24 = t1a
-dmbutterfly v20, v28, v0.h[1], v0.h[2], v2, v3, v4, v5 // v20 =
t2a, v28 = t3a
-dmbutterfly v18, v30, v0.h[3], v0.h[4], v2, v3, v4, v5 // v18 =
t4a, v30 = t7a
-dmbutterfly v26, v22, v0.h[5], v0.h[6], v2, v3, v4, v5 // v26 =
t5a, v22 = t6a
-dmbutterfly v17, v31, v0.h[7], v1.h[0], v2, v3, v4, v5 // v17 =
t8a, v31 = t15a
-dmbutterfly v25, v23, v1.h[1], v1.h[2], v2, v3, v4, v5 // v25 =
t9a, v23 = t14a
-dmbutterfly v21, v27, v1.h[3], v1.h[4], v2, v3, v4, v5 // v21 =
t10a, v27 = t13a
-dmbutterfly v29, v19, v1.h[5], v1.h[6], v2, v3, v4, v5 // v29 =
t11a, v19 = t12a
+dmbutterfly v20, v28, v0.h[2], v0.h[3], v2, v3, v4, v5 // v20 =
t2a, v28 = t3a
+dmbutterfly v18, v30, v0.h[4], v0.h[5], v2, v3, v4, v5 // v18 =
t4a, v30 = t7a
+