On Fri, Aug 16, 2013 at 02:15:15AM -0700, Alexander Strange wrote: > On Thu, Aug 15, 2013 at 3:29 PM, Diego Biurrun <[email protected]> wrote: > > > > Older versions of clang choke if that function is forcibly inlined. > > Furthermore, inlining the function gives no performance benefit at > > least with gcc 4.4 and 4.6. > > What does clang do? > > I agree performance measurements are important for system compilers, > but otherwise would prefer to ignore old versions. > > That function should absolutely be always_inline though - its last two > parameters are intended to be optimized out once it's inlined into > dc_internal/nondc_internal/dc_internal_422. Not inlining it will just > result in some very useless if tests. > > But I guess the compiler could be defeating that intention by inlining > in the wrong order - it does seem to be since the nm output isn't what > I expect. > > Could you try making these noinline: > > decode_cabac_residual_dc_internal > decode_cabac_residual_dc_internal_422 > decode_cabac_residual_nondc_internal
Here are some benchmarks and a patch with those functions marked noinline: First on the K6-3 I tested previously: noinline: tmp@silver:~/tmp/build/gcc-4.7$ time ./avconv -benchmark -i ~/Downloads/cathedral-beta2-400extra-crop-avc.mp4 -an -f null -v 0 - bench: utime=299.907s maxrss=7152kB real 5m12.217s user 5m0.239s sys 0m3.632s tmp@silver:~/tmp/build/gcc-4.7$ time ./avconv -benchmark -i ~/Downloads/cathedral-beta2-400extra-crop-avc.mp4 -an -f null -v 0 - bench: utime=306.903s maxrss=7148kB real 5m17.556s user 5m7.115s sys 0m3.324s tmp@silver:~/tmp/build/gcc-4.7$ time ./avconv -benchmark -i ~/Downloads/cathedral-beta2-400extra-crop-avc.mp4 -an -f null -v 0 - bench: utime=300.819s maxrss=7152kB real 5m11.440s user 5m1.015s sys 0m3.488s tmp@silver:~/tmp/build/gcc-4.7$ time ./avconv -benchmark -i ~/Downloads/cathedral-beta2-400extra-crop-avc.mp4 -an -f null -v 0 - bench: utime=301.259s maxrss=7152kB real 5m11.770s user 5m1.487s sys 0m3.364s tmp@silver:~/tmp/build/gcc-4.7$ time ./avconv -benchmark -i ~/Downloads/cathedral-beta2-400extra-crop-avc.mp4 -an -f null -v 0 - bench: utime=300.583s maxrss=7148kB real 5m11.147s user 5m0.807s sys 0m3.416s tmp@silver:~/tmp/build/gcc-4.6$ for i in $(seq 1 5); do time ./avconv -benchmark -i ~/Downloads/cathedral-beta2-400extra-crop-avc.mp4 -an -f null -v 0 -; done bench: utime=300.035s maxrss=7124kB real 5m16.712s user 5m0.327s sys 0m3.660s bench: utime=299.731s maxrss=7124kB real 5m28.680s user 4m59.891s sys 0m3.628s bench: utime=301.163s maxrss=7120kB real 5m15.194s user 5m1.335s sys 0m3.456s bench: utime=298.979s maxrss=7120kB real 5m15.782s user 4m59.119s sys 0m3.504s bench: utime=299.683s maxrss=7120kB real 5m15.201s user 4m59.839s sys 0m3.440s tmp@silver:~/tmp/build/gcc-4.4$ for i in $(seq 1 5); do time ./avconv -benchmark -i ~/Downloads/cathedral-beta2-400extra-crop-avc.mp4 -an -f null -v 0 -; done bench: utime=330.105s maxrss=7276kB real 5m45.106s user 5m30.373s sys 0m3.512s bench: utime=328.757s maxrss=7280kB real 5m46.130s 
user 5m28.889s sys 0m3.828s bench: utime=327.268s maxrss=7280kB real 5m54.365s user 5m27.420s sys 0m3.684s bench: utime=331.225s maxrss=7280kB real 5m47.056s user 5m31.385s sys 0m3.512s bench: utime=325.700s maxrss=7280kB real 5m43.511s user 5m25.852s sys 0m3.616s vanilla: tmp@silver:~/tmp/build/gcc-4.7$ for i in $(seq 1 5); do time ./avconv -benchmark -i ~/Downloads/cathedral-beta2-400extra-crop-avc.mp4 -an -f null -v 0 -; done bench: utime=301.783s maxrss=7152kB real 5m10.303s user 5m2.083s sys 0m0.660s bench: utime=302.543s maxrss=7152kB real 5m9.697s user 5m2.703s sys 0m0.524s bench: utime=303.107s maxrss=7152kB real 5m9.864s user 5m3.267s sys 0m0.516s bench: utime=304.279s maxrss=7148kB real 5m11.067s user 5m4.427s sys 0m0.532s bench: utime=301.775s maxrss=7152kB real 5m8.560s user 5m1.939s sys 0m0.476s tmp@silver:~/tmp/build/gcc-4.6$ for i in $(seq 1 5); do time ./avconv -benchmark -i ~/Downloads/cathedral-beta2-400extra-crop-avc.mp4 -an -f null -v 0 -; done bench: utime=305.879s maxrss=7116kB real 5m13.957s user 5m6.199s sys 0m0.512s bench: utime=305.515s maxrss=7120kB real 5m13.007s user 5m5.675s sys 0m0.408s bench: utime=304.467s maxrss=7120kB real 5m11.319s user 5m4.627s sys 0m0.484s bench: utime=304.039s maxrss=7116kB real 5m11.216s user 5m4.199s sys 0m0.516s bench: utime=305.775s maxrss=7120kB real 5m12.841s user 5m5.907s sys 0m0.432s tmp@silver:~/tmp/build/gcc-4.4$ for i in $(seq 1 5); do time ./avconv -benchmark -i ~/Downloads/cathedral-beta2-400extra-crop-avc.mp4 -an -f null -v 0 -; done bench: utime=329.289s maxrss=7256kB real 5m38.045s user 5m29.585s sys 0m0.532s bench: utime=323.552s maxrss=7256kB real 5m31.077s user 5m23.700s sys 0m0.524s bench: utime=324.616s maxrss=7256kB real 5m31.713s user 5m24.768s sys 0m0.416s bench: utime=323.600s maxrss=7248kB real 5m30.699s user 5m23.764s sys 0m0.488s bench: utime=327.632s maxrss=7252kB real 5m34.876s user 5m27.780s sys 0m0.504s Then on a Pentium M 1.6GHz: noinline: diego@nibbler:~/src/build/gcc-4.6$ for 
i in $(seq 1 5); do time ./avconv -benchmark -i ~/Downloads/HD-h264.ts -an -f null -v 0 -; done bench: utime=60.392s maxrss=22220kB real 1m0.999s user 1m0.564s sys 0m0.220s bench: utime=60.356s maxrss=22220kB real 1m0.966s user 1m0.484s sys 0m0.276s bench: utime=60.220s maxrss=22216kB real 1m0.840s user 1m0.372s sys 0m0.264s bench: utime=60.272s maxrss=22220kB real 1m0.909s user 1m0.408s sys 0m0.272s bench: utime=60.312s maxrss=22220kB real 1m0.948s user 1m0.464s sys 0m0.252s diego@nibbler:~/src/build/gcc-4.5$ for i in $(seq 1 5); do time ./avconv -benchmark -i ~/Downloads/HD-h264.ts -an -f null -v 0 -; done bench: utime=60.812s maxrss=22248kB real 1m1.702s user 1m0.984s sys 0m0.460s bench: utime=60.812s maxrss=22244kB real 1m1.518s user 1m0.960s sys 0m0.344s bench: utime=60.752s maxrss=22248kB real 1m1.379s user 1m0.892s sys 0m0.276s bench: utime=60.868s maxrss=22248kB real 1m1.492s user 1m1.040s sys 0m0.240s bench: utime=60.744s maxrss=22244kB real 1m1.442s user 1m0.900s sys 0m0.240s av_always_inline ---> inline: diego@nibbler:~/src/build/gcc-4.6$ for i in $(seq 1 5); do time ./avconv -benchmark -i ~/Downloads/HD-h264.ts -an -f null -v 0 -; done bench: utime=61.328s maxrss=22200kB real 1m2.000s user 1m1.496s sys 0m0.292s bench: utime=61.516s maxrss=22200kB real 1m2.107s user 1m1.680s sys 0m0.208s bench: utime=61.412s maxrss=22200kB real 1m2.065s user 1m1.560s sys 0m0.280s bench: utime=61.548s maxrss=22200kB real 1m2.181s user 1m1.712s sys 0m0.256s bench: utime=61.444s maxrss=22200kB real 1m2.042s user 1m1.620s sys 0m0.200s diego@nibbler:~/src/build/gcc-4.5$ for i in $(seq 1 5); do time ./avconv -benchmark -i ~/Downloads/HD-h264.ts -an -f null -v 0 -; done bench: utime=61.200s maxrss=22260kB real 1m1.796s user 1m1.356s sys 0m0.232s bench: utime=61.012s maxrss=22260kB real 1m1.636s user 1m1.164s sys 0m0.256s bench: utime=61.316s maxrss=22260kB real 1m1.999s user 1m1.480s sys 0m0.308s bench: utime=61.164s maxrss=22260kB real 1m1.725s user 1m1.336s sys 0m0.180s 
bench: utime=61.840s maxrss=22256kB real 1m2.480s user 1m1.976s sys 0m0.288s vanilla: diego@nibbler:~/src/build/gcc-4.6$ for i in $(seq 1 5); do time ./avconv -benchmark -i ~/Downloads/HD-h264.ts -an -f null -v 0 -; done bench: utime=60.400s maxrss=22220kB real 1m1.010s user 1m0.568s sys 0m0.232s bench: utime=60.312s maxrss=22216kB real 1m0.885s user 1m0.444s sys 0m0.236s bench: utime=60.340s maxrss=22220kB real 1m0.916s user 1m0.492s sys 0m0.208s bench: utime=60.104s maxrss=22216kB real 1m0.760s user 1m0.248s sys 0m0.312s bench: utime=60.400s maxrss=22216kB real 1m1.024s user 1m0.544s sys 0m0.252s diego@nibbler:~/src/build/gcc-4.5$ for i in $(seq 1 5); do time ./avconv -benchmark -i ~/Downloads/HD-h264.ts -an -f null -v 0 -; done bench: utime=60.756s maxrss=22252kB real 1m1.362s user 1m0.916s sys 0m0.236s bench: utime=60.796s maxrss=22248kB real 1m1.382s user 1m0.944s sys 0m0.240s bench: utime=60.632s maxrss=22248kB real 1m1.257s user 1m0.784s sys 0m0.272s bench: utime=61.396s maxrss=22252kB real 1m2.006s user 1m1.556s sys 0m0.248s bench: utime=60.640s maxrss=22252kB real 1m1.219s user 1m0.772s sys 0m0.244s Diego
From ae981f65ebda5b0dba624346b97f3031851fcca3 Mon Sep 17 00:00:00 2001 From: Diego Biurrun <[email protected]> Date: Fri, 16 Aug 2013 00:29:23 +0200 Subject: [PATCH] h264_cabac: Mark functions calling decode_cabac_residual_internal as noinline This ensures that decode_cabac_residual_internal actually does get inlined, which it otherwise does not, even though it is marked as always_inline. --- libavcodec/h264_cabac.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/libavcodec/h264_cabac.c b/libavcodec/h264_cabac.c index 79b715f..8f88064 100644 --- a/libavcodec/h264_cabac.c +++ b/libavcodec/h264_cabac.c @@ -1740,7 +1740,7 @@ decode_cabac_residual_internal(H264Context *h, int16_t *block, } -static void decode_cabac_residual_dc_internal(H264Context *h, int16_t *block, +static av_noinline void decode_cabac_residual_dc_internal(H264Context *h, int16_t *block, int cat, int n, const uint8_t *scantable, int max_coeff) @@ -1748,14 +1748,14 @@ static void decode_cabac_residual_dc_internal(H264Context *h, int16_t *block, decode_cabac_residual_internal(h, block, cat, n, scantable, NULL, max_coeff, 1, 0); } -static void decode_cabac_residual_dc_internal_422(H264Context *h, int16_t *block, +static av_noinline void decode_cabac_residual_dc_internal_422(H264Context *h, int16_t *block, int cat, int n, const uint8_t *scantable, int max_coeff) { decode_cabac_residual_internal(h, block, cat, n, scantable, NULL, max_coeff, 1, 1); } -static void decode_cabac_residual_nondc_internal(H264Context *h, int16_t *block, +static av_noinline void decode_cabac_residual_nondc_internal(H264Context *h, int16_t *block, int cat, int n, const uint8_t *scantable, const uint32_t *qmul, -- 1.7.9.5
_______________________________________________ libav-devel mailing list [email protected] https://lists.libav.org/mailman/listinfo/libav-devel
