Re: [libav-devel] [PATCH 2/4] aarch64: vp9itxfm: Reorder the idct coefficients for better pairing

2017-02-23 Thread Janne Grunau
On 2017-02-09 14:33:54 +0200, Martin Storsjö wrote:
> All elements are used pairwise, except for the first one.
> Previously, the 16th element was unused. Move the unused element
> to the second slot, to make the later element pairs not split
> across registers.
> 
> This simplifies loading only parts of the coefficients,
> reducing the difference to the 16 bpp version.
> ---
>  libavcodec/aarch64/vp9itxfm_neon.S | 124 ++---
>  1 file changed, 62 insertions(+), 62 deletions(-)

ok

Janne
___
libav-devel mailing list
libav-devel@libav.org
https://lists.libav.org/mailman/listinfo/libav-devel

[libav-devel] [PATCH 2/4] aarch64: vp9itxfm: Reorder the idct coefficients for better pairing

2017-02-09 Thread Martin Storsjö
All elements are used pairwise, except for the first one.
Previously, the 16th element was unused. Move the unused element
to the second slot, to make the later element pairs not split
across registers.

This simplifies loading only parts of the coefficients,
reducing the difference to the 16 bpp version.
---
 libavcodec/aarch64/vp9itxfm_neon.S | 124 ++---
 1 file changed, 62 insertions(+), 62 deletions(-)

diff --git a/libavcodec/aarch64/vp9itxfm_neon.S b/libavcodec/aarch64/vp9itxfm_neon.S
index c954d1a..f87f6bd 100644
--- a/libavcodec/aarch64/vp9itxfm_neon.S
+++ b/libavcodec/aarch64/vp9itxfm_neon.S
@@ -22,7 +22,7 @@
 #include "neon.S"
 
 const itxfm4_coeffs, align=4
-.short  11585, 6270, 15137, 0
+.short  11585, 0, 6270, 15137
 iadst4_coeffs:
 .short  5283, 15212, 9929, 13377
 endconst
@@ -30,8 +30,8 @@ endconst
 const iadst8_coeffs, align=4
 .short  16305, 1606, 14449, 7723, 10394, 12665, 4756, 15679
 idct_coeffs:
-.short  11585, 6270, 15137, 3196, 16069, 13623, 9102, 1606
-.short  16305, 12665, 10394, 7723, 14449, 15679, 4756, 0
+.short  11585, 0, 6270, 15137, 3196, 16069, 13623, 9102
+.short  1606, 16305, 12665, 10394, 7723, 14449, 15679, 4756
 .short  804, 16364, 12140, 11003, 7005, 14811, 15426, 5520
 .short  3981, 15893, 14053, 8423, 9760, 13160, 16207, 2404
 endconst
@@ -192,14 +192,14 @@ endconst
 .endm
 
 .macro idct4 c0, c1, c2, c3
-smull   v22.4s,\c1\().4h, v0.h[2]
-smull   v20.4s,\c1\().4h, v0.h[1]
+smull   v22.4s,\c1\().4h, v0.h[3]
+smull   v20.4s,\c1\().4h, v0.h[2]
 add v16.4h,\c0\().4h, \c2\().4h
 sub v17.4h,\c0\().4h, \c2\().4h
-smlal   v22.4s,\c3\().4h, v0.h[1]
+smlal   v22.4s,\c3\().4h, v0.h[2]
 smull   v18.4s,v16.4h,v0.h[0]
 smull   v19.4s,v17.4h,v0.h[0]
-smlsl   v20.4s,\c3\().4h, v0.h[2]
+smlsl   v20.4s,\c3\().4h, v0.h[3]
 rshrn   v22.4h,v22.4s,#14
 rshrn   v18.4h,v18.4s,#14
 rshrn   v19.4h,v19.4s,#14
@@ -326,9 +326,9 @@ itxfm_func4x4 iwht,  iwht
 
 .macro idct8
 dmbutterfly0 v16, v20, v16, v20, v2, v3, v4, v5, v6, v7 // v16 = t0a, v20 = t1a
-dmbutterfly v18, v22, v0.h[1], v0.h[2], v2, v3, v4, v5 // v18 = t2a, v22 = t3a
-dmbutterfly v17, v23, v0.h[3], v0.h[4], v2, v3, v4, v5 // v17 = t4a, v23 = t7a
-dmbutterfly v21, v19, v0.h[5], v0.h[6], v2, v3, v4, v5 // v21 = t5a, v19 = t6a
+dmbutterfly v18, v22, v0.h[2], v0.h[3], v2, v3, v4, v5 // v18 = t2a, v22 = t3a
+dmbutterfly v17, v23, v0.h[4], v0.h[5], v2, v3, v4, v5 // v17 = t4a, v23 = t7a
+dmbutterfly v21, v19, v0.h[6], v0.h[7], v2, v3, v4, v5 // v21 = t5a, v19 = t6a
 
 butterfly_8h v24, v25, v16, v22 // v24 = t0, v25 = t3
 butterfly_8h v28, v29, v17, v21 // v28 = t4, v29 = t5a
@@ -361,8 +361,8 @@ itxfm_func4x4 iwht,  iwht
 dmbutterfly0 v19, v20, v6, v7, v24, v26, v27, v28, v29, v30   // v19 = -out[3], v20 = out[4]
 neg v19.8h,   v19.8h  // v19 = out[3]
 
-dmbutterfly_l   v26, v27, v28, v29, v5,  v3,  v0.h[1], v0.h[2]   // v26,v27 = t5a, v28,v29 = t4a
-dmbutterfly_l   v2,  v3,  v4,  v5,  v31, v25, v0.h[2], v0.h[1]   // v2,v3   = t6a, v4,v5   = t7a
+dmbutterfly_l   v26, v27, v28, v29, v5,  v3,  v0.h[2], v0.h[3]   // v26,v27 = t5a, v28,v29 = t4a
+dmbutterfly_l   v2,  v3,  v4,  v5,  v31, v25, v0.h[3], v0.h[2]   // v2,v3   = t6a, v4,v5   = t7a
 
 dbutterfly_n v17, v30, v28, v29, v2,  v3,  v6,  v7,  v24, v25 // v17 = -out[1], v30 = t6
 dbutterfly_n v22, v31, v26, v27, v4,  v5,  v6,  v7,  v24, v25 // v22 = out[6],  v31 = t7
@@ -537,13 +537,13 @@ endfunc
 
 function idct16
 dmbutterfly0 v16, v24, v16, v24, v2, v3, v4, v5, v6, v7 // v16 = t0a,  v24 = t1a
-dmbutterfly v20, v28, v0.h[1], v0.h[2], v2, v3, v4, v5 // v20 = t2a,  v28 = t3a
-dmbutterfly v18, v30, v0.h[3], v0.h[4], v2, v3, v4, v5 // v18 = t4a,  v30 = t7a
-dmbutterfly v26, v22, v0.h[5], v0.h[6], v2, v3, v4, v5 // v26 = t5a,  v22 = t6a
-dmbutterfly v17, v31, v0.h[7], v1.h[0], v2, v3, v4, v5 // v17 = t8a,  v31 = t15a
-dmbutterfly v25, v23, v1.h[1], v1.h[2], v2, v3, v4, v5 // v25 = t9a,  v23 = t14a
-dmbutterfly v21, v27, v1.h[3], v1.h[4], v2, v3, v4, v5 // v21 = t10a, v27 = t13a
-dmbutterfly v29, v19, v1.h[5], v1.h[6], v2, v3, v4, v5 // v29 = t11a, v19 = t12a
+dmbutterfly v20, v28, v0.h[2], v0.h[3], v2, v3, v4, v5 // v20 = t2a,  v28 = t3a
+dmbutterfly v18, v30, v0.h[4], v0.h[5], v2, v3, v4, v5 // v18 = t4a,  v30 = t7a
+