This work is sponsored by, and copyright, Google.
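Turning the idct16, iadst16 and idct32_odd macros into standalone
functions (called with bl instead of being expanded inline) reduces
the code size of libavcodec/arm/vp9itxfm_neon.o from 15252 to 12316 bytes.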
This gives a small slowdown, from a couple of tens of cycles up to a
few hundred cycles for the full case of the largest transform. In return,
it makes it more feasible to add more optimized versions of these transforms.
Before:
vp9_inv_dct_dct_16x16_sub4_add_neon: 2079.7 1606.3 1772.1 1264.8
vp9_inv_dct_dct_16x16_sub16_add_neon: 3224.1 2476.5 2533.1 1985.7
vp9_inv_dct_dct_32x32_sub4_add_neon: 10689.2 8013.4 8592.9 6785.9
vp9_inv_dct_dct_32x32_sub32_add_neon: 18465.7 16974.6 14239.2 11999.1
After:
vp9_inv_dct_dct_16x16_sub4_add_neon: 2214.2 1617.2 1767.9 1286.5
vp9_inv_dct_dct_16x16_sub16_add_neon: 3256.8 2489.2 2554.8 2002.9
vp9_inv_dct_dct_32x32_sub4_add_neon: 10891.4 8124.4 8614.0 6828.5
vp9_inv_dct_dct_32x32_sub32_add_neon: 18626.4 17264.0 14298.7 12067.5
---
libavcodec/arm/vp9itxfm_neon.S | 41 ++++++++++++++++++++++++-----------------
1 file changed, 24 insertions(+), 17 deletions(-)
diff --git a/libavcodec/arm/vp9itxfm_neon.S b/libavcodec/arm/vp9itxfm_neon.S
index 769579a..d10de1e 100644
--- a/libavcodec/arm/vp9itxfm_neon.S
+++ b/libavcodec/arm/vp9itxfm_neon.S
@@ -534,7 +534,7 @@ function idct16x16_dc_add_neon
endfunc
.ltorg
-.macro idct16
+function idct16
mbutterfly0 d16, d24, d16, d24, d4, d6, q2, q3 @ d16 = t0a, d24
= t1a
mbutterfly d20, d28, d0[1], d0[2], q2, q3 @ d20 = t2a, d28 =
t3a
mbutterfly d18, d30, d0[3], d1[0], q2, q3 @ d18 = t4a, d30 =
t7a
@@ -580,9 +580,10 @@ endfunc
vmov d4, d21 @ d4 = t10a
butterfly d20, d27, d6, d27 @ d20 = out[4], d27 =
out[11]
butterfly d21, d26, d26, d4 @ d21 = out[5], d26 =
out[10]
-.endm
+ bx lr
+endfunc
-.macro iadst16
+function iadst16
movrel r12, iadst16_coeffs
vld1.16 {q0-q1}, [r12,:128]
@@ -653,7 +654,8 @@ endfunc
vmov d16, d2
vmov d30, d4
-.endm
+ bx lr
+endfunc
.macro itxfm16_1d_funcs txfm
@ Read a vertical 4x16 slice out of a 16x16 matrix, do a transform on it,
@@ -669,6 +671,7 @@ function \txfm\()16_1d_4x16_pass1_neon
cmp r3, r9
ble 2f
.endif
+ push {lr}
mov r12, #32
vmov.s16 q2, #0
@@ -677,7 +680,7 @@ function \txfm\()16_1d_4x16_pass1_neon
vst1.16 {d4}, [r2,:64], r12
.endr
- \txfm\()16
+ bl \txfm\()16
@ Do four 4x4 transposes. Originally, d16-d31 contain the
@ 16 rows. Afterwards, d16-d19, d20-d23, d24-d27, d28-d31
@@ -690,7 +693,7 @@ function \txfm\()16_1d_4x16_pass1_neon
.irp i, 16, 20, 24, 28, 17, 21, 25, 29, 18, 22, 26, 30, 19, 23, 27, 31
vst1.16 {d\i}, [r0,:64]!
.endr
- bx lr
+ pop {pc}
1:
@ Special case: For the last input column (r1 == 12),
@ which would be stored as the last row in the temp buffer,
@@ -717,7 +720,7 @@ function \txfm\()16_1d_4x16_pass1_neon
vmov d29, d17
vmov d30, d18
vmov d31, d19
- bx lr
+ pop {pc}
.ifc \txfm,idct
2:
@@ -739,6 +742,7 @@ endfunc
@ r2 = src (temp buffer)
@ r3 = slice offset
function \txfm\()16_1d_4x16_pass2_neon
+ push {lr}
mov r12, #32
.irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27
vld1.16 {d\i}, [r2,:64], r12
@@ -752,7 +756,7 @@ function \txfm\()16_1d_4x16_pass2_neon
add r3, r0, r1
lsl r1, r1, #1
- \txfm\()16
+ bl \txfm\()16
.macro load_add_store coef0, coef1, coef2, coef3
vrshr.s16 \coef0, \coef0, #6
@@ -793,7 +797,7 @@ function \txfm\()16_1d_4x16_pass2_neon
load_add_store q12, q13, q14, q15
.purgem load_add_store
- bx lr
+ pop {pc}
endfunc
.endm
@@ -906,7 +910,7 @@ function idct32x32_dc_add_neon
bx lr
endfunc
-.macro idct32_odd
+function idct32_odd
movrel r12, idct_coeffs
add r12, r12, #32
vld1.16 {q0-q1}, [r12,:128]
@@ -965,7 +969,8 @@ endfunc
mbutterfly0 d26, d21, d26, d21, d4, d6, q2, q3 @ d26 = t26a, d21 =
t21a
mbutterfly0 d25, d22, d25, d22, d4, d6, q2, q3 @ d25 = t25, d22 =
t22
mbutterfly0 d24, d23, d24, d23, d4, d6, q2, q3 @ d24 = t24a, d23 =
t23a
-.endm
+ bx lr
+endfunc
@ Do an 32-point IDCT of a 4x32 slice out of a 32x32 matrix.
@ We don't have register space to do a single pass IDCT of 4x32 though,
@@ -981,6 +986,7 @@ function idct32_1d_4x32_pass1_neon
@ Check if this whole input slice is zero
cmp r3, r1
ble 1f
+ push {lr}
movrel r12, idct_coeffs
vld1.16 {q0-q1}, [r12,:128]
@@ -995,7 +1001,7 @@ function idct32_1d_4x32_pass1_neon
vst1.16 {d4}, [r2,:64], r12
.endr
- idct16
+ bl idct16
@ Do four 4x4 transposes. Originally, d16-d31 contain the
@ 16 rows. Afterwards, d16-d19, d20-d23, d24-d27, d28-d31
@@ -1031,7 +1037,7 @@ function idct32_1d_4x32_pass1_neon
vst1.16 {d4}, [r2,:64], r12
.endr
- idct32_odd
+ bl idct32_odd
transpose16_q_4x_4x4 q15, q14, q13, q12, q11, q10, q9, q8, d31, d30,
d29, d28, d27, d26, d25, d24, d23, d22, d21, d20, d19, d18, d17, d16
@@ -1057,7 +1063,7 @@ function idct32_1d_4x32_pass1_neon
store_rev 29, 25, 21, 17
store_rev 28, 24, 20, 16
.purgem store_rev
- bx lr
+ pop {pc}
1:
@ Write zeros to the temp buffer for pass 2
@@ -1077,6 +1083,7 @@ endfunc
@ r1 = dst stride
@ r2 = src (temp buffer)
function idct32_1d_4x32_pass2_neon
+ push {lr}
movrel r12, idct_coeffs
vld1.16 {q0-q1}, [r12,:128]
@@ -1087,7 +1094,7 @@ function idct32_1d_4x32_pass2_neon
.endr
sub r2, r2, r12, lsl #4
- idct16
+ bl idct16
.irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
vst1.16 {d\i}, [r2,:64], r12
@@ -1103,7 +1110,7 @@ function idct32_1d_4x32_pass2_neon
sub r2, r2, r12, lsl #4
sub r2, r2, #64
- idct32_odd
+ bl idct32_odd
mov r12, #128
.macro load_acc_store a, b, c, d, neg=0
@@ -1151,7 +1158,7 @@ function idct32_1d_4x32_pass2_neon
load_acc_store 24, 25, 26, 27, 1
load_acc_store 28, 29, 30, 31, 1
.purgem load_acc_store
- bx lr
+ pop {pc}
endfunc
const min_eob_idct_idct_32, align=4
--
2.7.4
_______________________________________________
libav-devel mailing list
[email protected]
https://lists.libav.org/mailman/listinfo/libav-devel