On Fri, 3 Feb 2017, Janne Grunau wrote:
On 2016-12-01 11:26:57 +0200, Martin Storsjö wrote:
This work is sponsored by, and copyright, Google.
This increases the code size of libavcodec/arm/vp9itxfm_neon.o
from 12388 to 15064 bytes.
Before: Cortex A7 A8 A9 A53
vp9_inv_dct_dct_16x16_sub1_add_neon: 273.0 189.7 211.9 235.8
vp9_inv_dct_dct_16x16_sub2_add_neon: 2056.7 1521.2 1734.8 1262.0
vp9_inv_dct_dct_16x16_sub4_add_neon: 2060.8 1608.5 1735.7 1262.0
vp9_inv_dct_dct_16x16_sub8_add_neon: 2444.9 1801.6 2007.8 1508.5
vp9_inv_dct_dct_16x16_sub12_add_neon: 2902.1 2116.7 2285.1 1751.7
vp9_inv_dct_dct_16x16_sub16_add_neon: 3211.2 2443.5 2546.1 1999.5
vp9_inv_dct_dct_32x32_sub1_add_neon: 752.0 456.7 866.0 553.9
vp9_inv_dct_dct_32x32_sub2_add_neon: 11042.7 8127.5 8582.7 6822.8
vp9_inv_dct_dct_32x32_sub4_add_neon: 10682.0 8043.8 8581.3 6810.1
vp9_inv_dct_dct_32x32_sub8_add_neon: 11908.0 9281.8 9381.9 7562.4
vp9_inv_dct_dct_32x32_sub12_add_neon: 13015.2 10791.1 10220.3 8318.9
vp9_inv_dct_dct_32x32_sub16_add_neon: 14150.3 11886.2 11032.6 9064.8
vp9_inv_dct_dct_32x32_sub20_add_neon: 15165.7 12993.8 11847.0 9816.7
vp9_inv_dct_dct_32x32_sub24_add_neon: 16280.8 15111.2 12658.6 10576.8
vp9_inv_dct_dct_32x32_sub28_add_neon: 17412.6 15549.4 13462.7 11325.6
vp9_inv_dct_dct_32x32_sub32_add_neon: 18522.4 17277.4 14286.7 12087.9
After:
vp9_inv_dct_dct_16x16_sub1_add_neon: 273.0 189.5 211.5 236.1
vp9_inv_dct_dct_16x16_sub2_add_neon: 1448.2 994.0 1191.3 836.0
vp9_inv_dct_dct_16x16_sub4_add_neon: 1437.0 991.0 1191.6 836.0
vp9_inv_dct_dct_16x16_sub8_add_neon: 2114.5 1757.9 1855.3 1335.3
vp9_inv_dct_dct_16x16_sub12_add_neon: 2862.7 2141.5 2293.3 1772.7
vp9_inv_dct_dct_16x16_sub16_add_neon: 3299.6 2419.1 2552.7 2033.0
vp9_inv_dct_dct_32x32_sub1_add_neon: 753.0 457.5 864.3 554.8
vp9_inv_dct_dct_32x32_sub2_add_neon: 7867.8 5978.6 6594.6 5109.9
vp9_inv_dct_dct_32x32_sub4_add_neon: 7871.0 5772.5 6582.2 5108.5
vp9_inv_dct_dct_32x32_sub8_add_neon: 8694.8 6925.7 7125.7 5671.4
vp9_inv_dct_dct_32x32_sub12_add_neon: 11250.3 9654.7 9557.6 7540.5
vp9_inv_dct_dct_32x32_sub16_add_neon: 12129.5 11061.1 10295.0 8220.7
vp9_inv_dct_dct_32x32_sub20_add_neon: 15218.4 13580.8 11841.3 9739.9
vp9_inv_dct_dct_32x32_sub24_add_neon: 16343.5 15097.0 12629.2 10496.6
vp9_inv_dct_dct_32x32_sub28_add_neon: 17482.2 15516.4 13476.0 11261.0
vp9_inv_dct_dct_32x32_sub32_add_neon: 18586.7 16817.5 14289.3 12019.0
---
If we wouldn't have made the core transforms standalone functions
in the previous patch, the code size would increase to around 21 KB (which
isn't too bad), but the idct32 pass1/2 functions would bloat up so much
that they would require literal pools within the functions themselves.
---
libavcodec/arm/vp9itxfm_neon.S | 351 ++++++++++++++++++++++++++++++++++++++---
1 file changed, 331 insertions(+), 20 deletions(-)
diff --git a/libavcodec/arm/vp9itxfm_neon.S b/libavcodec/arm/vp9itxfm_neon.S
index 22e63e5..bd3f678 100644
--- a/libavcodec/arm/vp9itxfm_neon.S
+++ b/libavcodec/arm/vp9itxfm_neon.S
@@ -74,6 +74,14 @@ endconst
vrshrn.s32 \out2, \tmpq4, #14
.endm
+@ Same as mbutterfly0 above, but treating the input in in2 as zero,
+@ writing the same output into both out1 and out2.
+.macro mbutterfly0_h out1, out2, in1, in2, tmpd1, tmpd2, tmpq3, tmpq4
+ vmull.s16 \tmpq3, \in1, d0[0]
+ vrshrn.s32 \out1, \tmpq3, #14
+ vmov \out2, \out1
If you haven't already tried it, doing the vrshrn twice could be faster,
since it has fewer dependencies.
Didn't think of that - it does indeed seem to help (both here and in the
aarch64 version), so applied that.
@@ -668,13 +756,40 @@ function \txfm\()16_1d_4x16_pass1_neon
mov r12, #32
vmov.s16 q2, #0
+
+.ifc \txfm,idct
+ cmp r3, #10
+ ble 3f
+ cmp r3, #38
+ ble 4f
+.endif
I'd test only for less than or equal to 38 here
+
.irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
vld1.16 {d\i}, [r2,:64]
vst1.16 {d4}, [r2,:64], r12
.endr
bl \txfm\()16
+.ifc \txfm,idct
+ b 5f
cmp r3, #10
+
+3:
+.irp i, 16, 17, 18, 19
+ vld1.16 {d\i}, [r2,:64]
+ vst1.16 {d4}, [r2,:64], r12
+.endr
+ bl idct16_quarter
+ b 5f
remove this
+
+4:
+.irp i, 16, 17, 18, 19, 20, 21, 22, 23
+ vld1.16 {d\i}, [r2,:64]
+ vst1.16 {d4}, [r2,:64], r12
.if \i == 19
blle idct16_half
ble 5f
.endif
This saves a little binary space; I'm not sure if it's worth it.
Thanks for the reviews!
Hmm, that looks pretty neat.
I folded in this change into the aarch64 version (and the rshrn instead of
mov) as well, using a b.gt instead of conditional bl, like this:
.if \i == 19
b.gt 4f
bl idct16_quarter
b 5f
4:
.endif
In principle I guess one could interleave the same in the full loop as
well, having only one loop, with special case checks for i == 19 and i ==
23. Then we'd end up with two comparisons instead of one when doing the
full case - not sure if it's preferable or not.
The main question though is whether you prefer this or alternative 2.
// Martin
_______________________________________________
libav-devel mailing list
[email protected]
https://lists.libav.org/mailman/listinfo/libav-devel