On Fri, 3 Feb 2017, Janne Grunau wrote:
On 2016-12-01 11:26:57 +0200, Martin Storsjö wrote:
This work is sponsored by, and copyright, Google.
This increases the code size of libavcodec/arm/vp9itxfm_neon.o
from 12388 to 15064 bytes.
Before: Cortex A7 A8 A9 A53
vp9_inv_dct_dct_16x16_sub1_add_neon: 273.0 189.7 211.9 235.8
vp9_inv_dct_dct_16x16_sub2_add_neon: 2056.7 1521.2 1734.8 1262.0
vp9_inv_dct_dct_16x16_sub4_add_neon: 2060.8 1608.5 1735.7 1262.0
vp9_inv_dct_dct_16x16_sub8_add_neon: 2444.9 1801.6 2007.8 1508.5
vp9_inv_dct_dct_16x16_sub12_add_neon: 2902.1 2116.7 2285.1 1751.7
vp9_inv_dct_dct_16x16_sub16_add_neon: 3211.2 2443.5 2546.1 1999.5
vp9_inv_dct_dct_32x32_sub1_add_neon: 752.0 456.7 866.0 553.9
vp9_inv_dct_dct_32x32_sub2_add_neon: 11042.7 8127.5 8582.7 6822.8
vp9_inv_dct_dct_32x32_sub4_add_neon: 10682.0 8043.8 8581.3 6810.1
vp9_inv_dct_dct_32x32_sub8_add_neon: 11908.0 9281.8 9381.9 7562.4
vp9_inv_dct_dct_32x32_sub12_add_neon: 13015.2 10791.1 10220.3 8318.9
vp9_inv_dct_dct_32x32_sub16_add_neon: 14150.3 11886.2 11032.6 9064.8
vp9_inv_dct_dct_32x32_sub20_add_neon: 15165.7 12993.8 11847.0 9816.7
vp9_inv_dct_dct_32x32_sub24_add_neon: 16280.8 15111.2 12658.6 10576.8
vp9_inv_dct_dct_32x32_sub28_add_neon: 17412.6 15549.4 13462.7 11325.6
vp9_inv_dct_dct_32x32_sub32_add_neon: 18522.4 17277.4 14286.7 12087.9
After:
vp9_inv_dct_dct_16x16_sub1_add_neon: 273.0 189.5 211.5 236.1
vp9_inv_dct_dct_16x16_sub2_add_neon: 1448.2 994.0 1191.3 836.0
vp9_inv_dct_dct_16x16_sub4_add_neon: 1437.0 991.0 1191.6 836.0
vp9_inv_dct_dct_16x16_sub8_add_neon: 2114.5 1757.9 1855.3 1335.3
vp9_inv_dct_dct_16x16_sub12_add_neon: 2862.7 2141.5 2293.3 1772.7
vp9_inv_dct_dct_16x16_sub16_add_neon: 3299.6 2419.1 2552.7 2033.0
vp9_inv_dct_dct_32x32_sub1_add_neon: 753.0 457.5 864.3 554.8
vp9_inv_dct_dct_32x32_sub2_add_neon: 7867.8 5978.6 6594.6 5109.9
vp9_inv_dct_dct_32x32_sub4_add_neon: 7871.0 5772.5 6582.2 5108.5
vp9_inv_dct_dct_32x32_sub8_add_neon: 8694.8 6925.7 7125.7 5671.4
vp9_inv_dct_dct_32x32_sub12_add_neon: 11250.3 9654.7 9557.6 7540.5
vp9_inv_dct_dct_32x32_sub16_add_neon: 12129.5 11061.1 10295.0 8220.7
vp9_inv_dct_dct_32x32_sub20_add_neon: 15218.4 13580.8 11841.3 9739.9
vp9_inv_dct_dct_32x32_sub24_add_neon: 16343.5 15097.0 12629.2 10496.6
vp9_inv_dct_dct_32x32_sub28_add_neon: 17482.2 15516.4 13476.0 11261.0
vp9_inv_dct_dct_32x32_sub32_add_neon: 18586.7 16817.5 14289.3 12019.0
---
If we wouldn't have made the core transforms standalone functions
in the previous patch, the code size would increase to around 21 KB (which
isn't too bad), but the idct32 pass1/2 functions would bloat up so much
that they would require literal pools within the functions themselves.
---
libavcodec/arm/vp9itxfm_neon.S | 351 ++++++++++++++++++++++++++++++++++++++---
1 file changed, 331 insertions(+), 20 deletions(-)
diff --git a/libavcodec/arm/vp9itxfm_neon.S b/libavcodec/arm/vp9itxfm_neon.S
index 22e63e5..bd3f678 100644
--- a/libavcodec/arm/vp9itxfm_neon.S
+++ b/libavcodec/arm/vp9itxfm_neon.S
@@ -74,6 +74,14 @@ endconst
vrshrn.s32 \out2, \tmpq4, #14
.endm
+@ Same as mbutterfly0 above, but treating the input in in2 as zero,
+@ writing the same output into both out1 and out2.
+.macro mbutterfly0_h out1, out2, in1, in2, tmpd1, tmpd2, tmpq3, tmpq4
+ vmull.s16 \tmpq3, \in1, d0[0]
+ vrshrn.s32 \out1, \tmpq3, #14
+ vmov \out2, \out1
If you haven't already tried it, doing the vrshrn twice could be faster,
since it has fewer dependencies.
Didn't think of that - it does indeed seem to help (both here and in the
aarch64 version), so applied that.
@@ -668,13 +756,40 @@ function \txfm\()16_1d_4x16_pass1_neon
mov r12, #32
vmov.s16 q2, #0
+
+.ifc \txfm,idct
+ cmp r3, #10
+ ble 3f
+ cmp r3, #38
+ ble 4f
+.endif
I'd test only for less than or equal to 38 here
+
.irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
vld1.16 {d\i}, [r2,:64]
vst1.16 {d4}, [r2,:64], r12
.endr
bl \txfm\()16
+.ifc \txfm,idct
+ b 5f
cmp r3, #10
+
+3:
+.irp i, 16, 17, 18, 19
+ vld1.16 {d\i}, [r2,:64]
+ vst1.16 {d4}, [r2,:64], r12
+.endr
+ bl idct16_quarter
+ b 5f
remove this
+
+4:
+.irp i, 16, 17, 18, 19, 20, 21, 22, 23
+ vld1.16 {d\i}, [r2,:64]
+ vst1.16 {d4}, [r2,:64], r12
.if \i == 19
blle idct16_half
ble 5f
.endif
This saves a little binary space; I'm not sure if it's worth it.
Thanks for the reviews!
Hmm, that looks pretty neat.
I folded in this change into the aarch64 version (and the rshrn instead of
mov) as well, using a b.gt instead of conditional bl, like this:
.if \i == 19
b.gt 4f
bl idct16_quarter
b 5f
4:
.endif
In principle I guess one could interleave the same in the full loop as
well, having only one loop, with special case checks for i == 19 and i ==
23. Then we'd end up with two comparisons instead of one when doing the
full case - not sure if it's preferable or not.
The main question though is whether you prefer this or alternative 2.
// Martin
_______________________________________________
libav-devel mailing list
[email protected]
https://lists.libav.org/mailman/listinfo/libav-devel