On Fri, Aug 16, 2013 at 02:15:15AM -0700, Alexander Strange wrote:
> On Thu, Aug 15, 2013 at 3:29 PM, Diego Biurrun <[email protected]> wrote:
> >
> > Older versions of clang choke if that function is forcibly inlined.
> > Furthermore, inlining the function gives no performance benefit at
> > least with gcc 4.4 and 4.6.
> 
> What does clang do?
> 
> I agree performance measurements are important for system compilers,
> but otherwise would prefer to ignore old versions.
> 
> That function should absolutely be always_inline though - its last two
> parameters are intended to be optimized out once it's inlined into
> dc_internal/nondc_internal/dc_internal_422. Not inlining it will just
> result in some very useless if tests.
> 
> But I guess the compiler could be defeating that intention by inlining
> in the wrong order - it does seem to be since the nm output isn't what
> I expect.
> 
> Could you try making these noinline:
> 
> decode_cabac_residual_dc_internal
> decode_cabac_residual_dc_internal_422
> decode_cabac_residual_nondc_internal

Here are some benchmarks and a patch with those functions marked noinline:

First on the K6-3 I tested previously:


noinline:

tmp@silver:~/tmp/build/gcc-4.7$ time ./avconv -benchmark -i 
~/Downloads/cathedral-beta2-400extra-crop-avc.mp4 -an -f null -v 0 -
bench: utime=299.907s maxrss=7152kB
real    5m12.217s
user    5m0.239s
sys     0m3.632s
tmp@silver:~/tmp/build/gcc-4.7$ time ./avconv -benchmark -i 
~/Downloads/cathedral-beta2-400extra-crop-avc.mp4 -an -f null -v 0 -
bench: utime=306.903s maxrss=7148kB
real    5m17.556s
user    5m7.115s
sys     0m3.324s
tmp@silver:~/tmp/build/gcc-4.7$ time ./avconv -benchmark -i 
~/Downloads/cathedral-beta2-400extra-crop-avc.mp4 -an -f null -v 0 -
bench: utime=300.819s maxrss=7152kB
real    5m11.440s
user    5m1.015s
sys     0m3.488s
tmp@silver:~/tmp/build/gcc-4.7$ time ./avconv -benchmark -i 
~/Downloads/cathedral-beta2-400extra-crop-avc.mp4 -an -f null -v 0 -
bench: utime=301.259s maxrss=7152kB
real    5m11.770s
user    5m1.487s
sys     0m3.364s
tmp@silver:~/tmp/build/gcc-4.7$ time ./avconv -benchmark -i 
~/Downloads/cathedral-beta2-400extra-crop-avc.mp4 -an -f null -v 0 -
bench: utime=300.583s maxrss=7148kB
real    5m11.147s
user    5m0.807s
sys     0m3.416s

tmp@silver:~/tmp/build/gcc-4.6$ for i in $(seq 1 5); do time ./avconv 
-benchmark -i ~/Downloads/cathedral-beta2-400extra-crop-avc.mp4 -an -f null -v 
0 -; done
bench: utime=300.035s maxrss=7124kB
real    5m16.712s
user    5m0.327s
sys     0m3.660s
bench: utime=299.731s maxrss=7124kB
real    5m28.680s
user    4m59.891s
sys     0m3.628s
bench: utime=301.163s maxrss=7120kB
real    5m15.194s
user    5m1.335s
sys     0m3.456s
bench: utime=298.979s maxrss=7120kB
real    5m15.782s
user    4m59.119s
sys     0m3.504s
bench: utime=299.683s maxrss=7120kB
real    5m15.201s
user    4m59.839s
sys     0m3.440s

tmp@silver:~/tmp/build/gcc-4.4$ for i in $(seq 1 5); do time ./avconv 
-benchmark -i ~/Downloads/cathedral-beta2-400extra-crop-avc.mp4 -an -f null -v 
0 -; done
bench: utime=330.105s maxrss=7276kB
real    5m45.106s
user    5m30.373s
sys     0m3.512s
bench: utime=328.757s maxrss=7280kB
real    5m46.130s
user    5m28.889s
sys     0m3.828s
bench: utime=327.268s maxrss=7280kB
real    5m54.365s
user    5m27.420s
sys     0m3.684s
bench: utime=331.225s maxrss=7280kB
real    5m47.056s
user    5m31.385s
sys     0m3.512s
bench: utime=325.700s maxrss=7280kB
real    5m43.511s
user    5m25.852s
sys     0m3.616s



vanilla:

tmp@silver:~/tmp/build/gcc-4.7$ for i in $(seq 1 5); do time ./avconv 
-benchmark -i ~/Downloads/cathedral-beta2-400extra-crop-avc.mp4 -an -f null -v 
0 -; done
bench: utime=301.783s maxrss=7152kB
real    5m10.303s
user    5m2.083s
sys     0m0.660s
bench: utime=302.543s maxrss=7152kB
real    5m9.697s
user    5m2.703s
sys     0m0.524s
bench: utime=303.107s maxrss=7152kB
real    5m9.864s
user    5m3.267s
sys     0m0.516s
bench: utime=304.279s maxrss=7148kB
real    5m11.067s
user    5m4.427s
sys     0m0.532s
bench: utime=301.775s maxrss=7152kB
real    5m8.560s
user    5m1.939s
sys     0m0.476s

tmp@silver:~/tmp/build/gcc-4.6$ for i in $(seq 1 5); do time ./avconv 
-benchmark -i ~/Downloads/cathedral-beta2-400extra-crop-avc.mp4 -an -f null -v 
0 -; done
bench: utime=305.879s maxrss=7116kB
real    5m13.957s
user    5m6.199s
sys     0m0.512s
bench: utime=305.515s maxrss=7120kB
real    5m13.007s
user    5m5.675s
sys     0m0.408s
bench: utime=304.467s maxrss=7120kB
real    5m11.319s
user    5m4.627s
sys     0m0.484s
bench: utime=304.039s maxrss=7116kB
real    5m11.216s
user    5m4.199s
sys     0m0.516s
bench: utime=305.775s maxrss=7120kB
real    5m12.841s
user    5m5.907s
sys     0m0.432s

tmp@silver:~/tmp/build/gcc-4.4$ for i in $(seq 1 5); do time ./avconv 
-benchmark -i ~/Downloads/cathedral-beta2-400extra-crop-avc.mp4 -an -f null -v 
0 -; done
bench: utime=329.289s maxrss=7256kB
real    5m38.045s
user    5m29.585s
sys     0m0.532s
bench: utime=323.552s maxrss=7256kB
real    5m31.077s
user    5m23.700s
sys     0m0.524s
bench: utime=324.616s maxrss=7256kB
real    5m31.713s
user    5m24.768s
sys     0m0.416s
bench: utime=323.600s maxrss=7248kB
real    5m30.699s
user    5m23.764s
sys     0m0.488s
bench: utime=327.632s maxrss=7252kB
real    5m34.876s
user    5m27.780s
sys     0m0.504s




Then on a Pentium M 1.6GHz:

noinline:

diego@nibbler:~/src/build/gcc-4.6$ for i in $(seq 1 5); do time ./avconv 
-benchmark -i ~/Downloads/HD-h264.ts -an -f null -v 0 -; done
bench: utime=60.392s maxrss=22220kB
real    1m0.999s
user    1m0.564s
sys     0m0.220s
bench: utime=60.356s maxrss=22220kB
real    1m0.966s
user    1m0.484s
sys     0m0.276s
bench: utime=60.220s maxrss=22216kB
real    1m0.840s
user    1m0.372s
sys     0m0.264s
bench: utime=60.272s maxrss=22220kB
real    1m0.909s
user    1m0.408s
sys     0m0.272s
bench: utime=60.312s maxrss=22220kB
real    1m0.948s
user    1m0.464s
sys     0m0.252s


diego@nibbler:~/src/build/gcc-4.5$ for i in $(seq 1 5); do time ./avconv 
-benchmark -i ~/Downloads/HD-h264.ts -an -f null -v 0 -; done
bench: utime=60.812s maxrss=22248kB
real    1m1.702s
user    1m0.984s
sys     0m0.460s
bench: utime=60.812s maxrss=22244kB
real    1m1.518s
user    1m0.960s
sys     0m0.344s
bench: utime=60.752s maxrss=22248kB
real    1m1.379s
user    1m0.892s
sys     0m0.276s
bench: utime=60.868s maxrss=22248kB
real    1m1.492s
user    1m1.040s
sys     0m0.240s
bench: utime=60.744s maxrss=22244kB
real    1m1.442s
user    1m0.900s
sys     0m0.240s



av_always_inline ---> inline:

diego@nibbler:~/src/build/gcc-4.6$ for i in $(seq 1 5); do time ./avconv 
-benchmark -i ~/Downloads/HD-h264.ts -an -f null -v 0 -; done
bench: utime=61.328s maxrss=22200kB
real    1m2.000s
user    1m1.496s
sys     0m0.292s
bench: utime=61.516s maxrss=22200kB
real    1m2.107s
user    1m1.680s
sys     0m0.208s
bench: utime=61.412s maxrss=22200kB
real    1m2.065s
user    1m1.560s
sys     0m0.280s
bench: utime=61.548s maxrss=22200kB
real    1m2.181s
user    1m1.712s
sys     0m0.256s
bench: utime=61.444s maxrss=22200kB
real    1m2.042s
user    1m1.620s
sys     0m0.200s

diego@nibbler:~/src/build/gcc-4.5$ for i in $(seq 1 5); do time ./avconv 
-benchmark -i ~/Downloads/HD-h264.ts -an -f null -v 0 -; done
bench: utime=61.200s maxrss=22260kB
real    1m1.796s
user    1m1.356s
sys     0m0.232s
bench: utime=61.012s maxrss=22260kB
real    1m1.636s
user    1m1.164s
sys     0m0.256s
bench: utime=61.316s maxrss=22260kB
real    1m1.999s
user    1m1.480s
sys     0m0.308s
bench: utime=61.164s maxrss=22260kB
real    1m1.725s
user    1m1.336s
sys     0m0.180s
bench: utime=61.840s maxrss=22256kB
real    1m2.480s
user    1m1.976s
sys     0m0.288s



vanilla:

diego@nibbler:~/src/build/gcc-4.6$ for i in $(seq 1 5); do time ./avconv 
-benchmark -i ~/Downloads/HD-h264.ts -an -f null -v 0 -; done
bench: utime=60.400s maxrss=22220kB
real    1m1.010s
user    1m0.568s
sys     0m0.232s
bench: utime=60.312s maxrss=22216kB
real    1m0.885s
user    1m0.444s
sys     0m0.236s
bench: utime=60.340s maxrss=22220kB
real    1m0.916s
user    1m0.492s
sys     0m0.208s
bench: utime=60.104s maxrss=22216kB
real    1m0.760s
user    1m0.248s
sys     0m0.312s
bench: utime=60.400s maxrss=22216kB
real    1m1.024s
user    1m0.544s
sys     0m0.252s

diego@nibbler:~/src/build/gcc-4.5$ for i in $(seq 1 5); do time ./avconv 
-benchmark -i ~/Downloads/HD-h264.ts -an -f null -v 0 -; done
bench: utime=60.756s maxrss=22252kB
real    1m1.362s
user    1m0.916s
sys     0m0.236s
bench: utime=60.796s maxrss=22248kB
real    1m1.382s
user    1m0.944s
sys     0m0.240s
bench: utime=60.632s maxrss=22248kB
real    1m1.257s
user    1m0.784s
sys     0m0.272s
bench: utime=61.396s maxrss=22252kB
real    1m2.006s
user    1m1.556s
sys     0m0.248s
bench: utime=60.640s maxrss=22252kB
real    1m1.219s
user    1m0.772s
sys     0m0.244s


Diego
From ae981f65ebda5b0dba624346b97f3031851fcca3 Mon Sep 17 00:00:00 2001
From: Diego Biurrun <[email protected]>
Date: Fri, 16 Aug 2013 00:29:23 +0200
Subject: [PATCH] h264_cabac: Mark functions calling
 decode_cabac_residual_internal as noinline

This ensures that decode_cabac_residual_internal actually does get inlined,
which it otherwise does not, even though it is marked as always_inline.
---
 libavcodec/h264_cabac.c |    6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/libavcodec/h264_cabac.c b/libavcodec/h264_cabac.c
index 79b715f..8f88064 100644
--- a/libavcodec/h264_cabac.c
+++ b/libavcodec/h264_cabac.c
@@ -1740,7 +1740,7 @@ decode_cabac_residual_internal(H264Context *h, int16_t *block,
 
 }
 
-static void decode_cabac_residual_dc_internal(H264Context *h, int16_t *block,
+static av_noinline void decode_cabac_residual_dc_internal(H264Context *h, int16_t *block,
                                               int cat, int n,
                                               const uint8_t *scantable,
                                               int max_coeff)
@@ -1748,14 +1748,14 @@ static void decode_cabac_residual_dc_internal(H264Context *h, int16_t *block,
     decode_cabac_residual_internal(h, block, cat, n, scantable, NULL, max_coeff, 1, 0);
 }
 
-static void decode_cabac_residual_dc_internal_422(H264Context *h, int16_t *block,
+static av_noinline void decode_cabac_residual_dc_internal_422(H264Context *h, int16_t *block,
                                                   int cat, int n, const uint8_t *scantable,
                                                   int max_coeff)
 {
     decode_cabac_residual_internal(h, block, cat, n, scantable, NULL, max_coeff, 1, 1);
 }
 
-static void decode_cabac_residual_nondc_internal(H264Context *h, int16_t *block,
+static av_noinline void decode_cabac_residual_nondc_internal(H264Context *h, int16_t *block,
                                                  int cat, int n,
                                                  const uint8_t *scantable,
                                                  const uint32_t *qmul,
-- 
1.7.9.5

_______________________________________________
libav-devel mailing list
[email protected]
https://lists.libav.org/mailman/listinfo/libav-devel

Reply via email to