[libav-devel] [PATCH 05/11] arm: vp9itxfm: Make the larger core transforms standalone functions

Martin Storsjö Wed, 23 Nov 2016 05:01:24 -0800

This work is sponsored by, and copyright, Google.

This reduces the code size of libavcodec/arm/vp9itxfm_neon.o from
15252 to 12316 bytes.


This gives a small slowdown of a couple of tens of cycles, up to a few
hundred cycles for the full case of the largest transform, but makes
it more feasible to add more optimized versions of these transforms.

Before:
vp9_inv_dct_dct_16x16_sub4_add_neon:    2079.7   1606.3   1772.1   1264.8
vp9_inv_dct_dct_16x16_sub16_add_neon:   3224.1   2476.5   2533.1   1985.7
vp9_inv_dct_dct_32x32_sub4_add_neon:   10689.2   8013.4   8592.9   6785.9
vp9_inv_dct_dct_32x32_sub32_add_neon:  18465.7  16974.6  14239.2  11999.1

After:
vp9_inv_dct_dct_16x16_sub4_add_neon:    2214.2   1617.2   1767.9   1286.5
vp9_inv_dct_dct_16x16_sub16_add_neon:   3256.8   2489.2   2554.8   2002.9
vp9_inv_dct_dct_32x32_sub4_add_neon:   10891.4   8124.4   8614.0   6828.5
vp9_inv_dct_dct_32x32_sub32_add_neon:  18626.4  17264.0  14298.7  12067.5
---
 libavcodec/arm/vp9itxfm_neon.S | 41 ++++++++++++++++++++++++-----------------
 1 file changed, 24 insertions(+), 17 deletions(-)

diff --git a/libavcodec/arm/vp9itxfm_neon.S b/libavcodec/arm/vp9itxfm_neon.S
index 769579a..d10de1e 100644
--- a/libavcodec/arm/vp9itxfm_neon.S
+++ b/libavcodec/arm/vp9itxfm_neon.S
@@ -534,7 +534,7 @@ function idct16x16_dc_add_neon
 endfunc
 .ltorg
 
-.macro idct16
+function idct16
         mbutterfly0     d16, d24, d16, d24, d4, d6,  q2,  q3 @ d16 = t0a,  d24 
= t1a
         mbutterfly      d20, d28, d0[1], d0[2], q2,  q3  @ d20 = t2a,  d28 = 
t3a
         mbutterfly      d18, d30, d0[3], d1[0], q2,  q3  @ d18 = t4a,  d30 = 
t7a
@@ -580,9 +580,10 @@ endfunc
         vmov            d4,  d21                         @ d4  = t10a
         butterfly       d20, d27, d6,  d27               @ d20 = out[4], d27 = 
out[11]
         butterfly       d21, d26, d26, d4                @ d21 = out[5], d26 = 
out[10]
-.endm
+        bx              lr
+endfunc
 
-.macro iadst16
+function iadst16
         movrel          r12, iadst16_coeffs
         vld1.16         {q0-q1}, [r12,:128]
 
@@ -653,7 +654,8 @@ endfunc
 
         vmov            d16, d2
         vmov            d30, d4
-.endm
+        bx              lr
+endfunc
 
 .macro itxfm16_1d_funcs txfm
 @ Read a vertical 4x16 slice out of a 16x16 matrix, do a transform on it,
@@ -669,6 +671,7 @@ function \txfm\()16_1d_4x16_pass1_neon
         cmp             r3,  r9
         ble             2f
 .endif
+        push            {lr}
 
         mov             r12, #32
         vmov.s16        q2, #0
@@ -677,7 +680,7 @@ function \txfm\()16_1d_4x16_pass1_neon
         vst1.16         {d4},  [r2,:64], r12
 .endr
 
-        \txfm\()16
+        bl              \txfm\()16
 
         @ Do four 4x4 transposes. Originally, d16-d31 contain the
         @ 16 rows. Afterwards, d16-d19, d20-d23, d24-d27, d28-d31
@@ -690,7 +693,7 @@ function \txfm\()16_1d_4x16_pass1_neon
 .irp i, 16, 20, 24, 28, 17, 21, 25, 29, 18, 22, 26, 30, 19, 23, 27, 31
         vst1.16         {d\i}, [r0,:64]!
 .endr
-        bx              lr
+        pop             {pc}
 1:
         @ Special case: For the last input column (r1 == 12),
         @ which would be stored as the last row in the temp buffer,
@@ -717,7 +720,7 @@ function \txfm\()16_1d_4x16_pass1_neon
         vmov            d29, d17
         vmov            d30, d18
         vmov            d31, d19
-        bx              lr
+        pop             {pc}
 
 .ifc \txfm,idct
 2:
@@ -739,6 +742,7 @@ endfunc
 @ r2 = src (temp buffer)
 @ r3 = slice offset
 function \txfm\()16_1d_4x16_pass2_neon
+        push            {lr}
         mov             r12, #32
 .irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27
         vld1.16         {d\i}, [r2,:64], r12
@@ -752,7 +756,7 @@ function \txfm\()16_1d_4x16_pass2_neon
 
         add             r3,  r0,  r1
         lsl             r1,  r1,  #1
-        \txfm\()16
+        bl              \txfm\()16
 
 .macro load_add_store coef0, coef1, coef2, coef3
         vrshr.s16       \coef0, \coef0, #6
@@ -793,7 +797,7 @@ function \txfm\()16_1d_4x16_pass2_neon
         load_add_store  q12, q13, q14, q15
 .purgem load_add_store
 
-        bx              lr
+        pop             {pc}
 endfunc
 .endm
 
@@ -906,7 +910,7 @@ function idct32x32_dc_add_neon
         bx              lr
 endfunc
 
-.macro idct32_odd
+function idct32_odd
         movrel          r12, idct_coeffs
         add             r12, r12, #32
         vld1.16         {q0-q1}, [r12,:128]
@@ -965,7 +969,8 @@ endfunc
         mbutterfly0     d26, d21, d26, d21, d4, d6, q2, q3 @ d26 = t26a, d21 = 
t21a
         mbutterfly0     d25, d22, d25, d22, d4, d6, q2, q3 @ d25 = t25,  d22 = 
t22
         mbutterfly0     d24, d23, d24, d23, d4, d6, q2, q3 @ d24 = t24a, d23 = 
t23a
-.endm
+        bx              lr
+endfunc
 
 @ Do an 32-point IDCT of a 4x32 slice out of a 32x32 matrix.
 @ We don't have register space to do a single pass IDCT of 4x32 though,
@@ -981,6 +986,7 @@ function idct32_1d_4x32_pass1_neon
         @ Check if this whole input slice is zero
         cmp             r3,  r1
         ble             1f
+        push            {lr}
 
         movrel          r12, idct_coeffs
         vld1.16         {q0-q1}, [r12,:128]
@@ -995,7 +1001,7 @@ function idct32_1d_4x32_pass1_neon
         vst1.16         {d4},  [r2,:64], r12
 .endr
 
-        idct16
+        bl              idct16
 
         @ Do four 4x4 transposes. Originally, d16-d31 contain the
         @ 16 rows. Afterwards, d16-d19, d20-d23, d24-d27, d28-d31
@@ -1031,7 +1037,7 @@ function idct32_1d_4x32_pass1_neon
         vst1.16         {d4},  [r2,:64], r12
 .endr
 
-        idct32_odd
+        bl              idct32_odd
 
         transpose16_q_4x_4x4 q15, q14, q13, q12, q11, q10, q9,  q8,  d31, d30, 
d29, d28, d27, d26, d25, d24, d23, d22, d21, d20, d19, d18, d17, d16
 
@@ -1057,7 +1063,7 @@ function idct32_1d_4x32_pass1_neon
         store_rev       29, 25, 21, 17
         store_rev       28, 24, 20, 16
 .purgem store_rev
-        bx              lr
+        pop             {pc}
 
 1:
         @ Write zeros to the temp buffer for pass 2
@@ -1077,6 +1083,7 @@ endfunc
 @ r1 = dst stride
 @ r2 = src (temp buffer)
 function idct32_1d_4x32_pass2_neon
+        push            {lr}
         movrel          r12, idct_coeffs
         vld1.16         {q0-q1}, [r12,:128]
 
@@ -1087,7 +1094,7 @@ function idct32_1d_4x32_pass2_neon
 .endr
         sub             r2,  r2,  r12, lsl #4
 
-        idct16
+        bl              idct16
 
 .irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
         vst1.16         {d\i}, [r2,:64], r12
@@ -1103,7 +1110,7 @@ function idct32_1d_4x32_pass2_neon
         sub             r2,  r2,  r12, lsl #4
         sub             r2,  r2,  #64
 
-        idct32_odd
+        bl              idct32_odd
 
         mov             r12, #128
 .macro load_acc_store a, b, c, d, neg=0
@@ -1151,7 +1158,7 @@ function idct32_1d_4x32_pass2_neon
         load_acc_store  24, 25, 26, 27, 1
         load_acc_store  28, 29, 30, 31, 1
 .purgem load_acc_store
-        bx              lr
+        pop             {pc}
 endfunc
 
 const min_eob_idct_idct_32, align=4
-- 
2.7.4

_______________________________________________
libav-devel mailing list
libav-devel@libav.org
https://lists.libav.org/mailman/listinfo/libav-devel

[libav-devel] [PATCH 05/11] arm: vp9itxfm: Make the larger core transforms standalone functions

Reply via email to