Does anyone have further experience with the speedups from TCMalloc? I'm testing it on our infrastructure: on the ALPHA/debug/quick tests it shaves 10% off the runtime, but on the ALPHA/fast/quick tests it actually adds 50% to the runtime. I haven't done extensive tests with other workloads or builds, but I had assumed everything would run faster.
Lisa On Tue, Jun 5, 2012 at 1:24 AM, Ali Saidi <[email protected]> wrote: > changeset 904ddeecc653 in /z/repo/gem5 > details: http://repo.gem5.org/gem5?cmd=changeset;node=904ddeecc653 > description: > sim: Remove FastAlloc > > While FastAlloc provides a small performance increase (~1.5%) over > regular malloc it isn't thread safe. > After removing FastAlloc and using tcmalloc I've seen a > performance increase of 12% over libc malloc > when running twolf for ARM. > > diffstat: > > SConstruct | 8 +- > src/arch/x86/pagetable_walker.hh | 3 +- > src/base/SConscript | 1 - > src/base/fast_alloc.cc | 72 ---------- > src/base/fast_alloc.hh | 190 > ----------------------------- > src/cpu/base_dyn_inst.hh | 3 +- > src/cpu/inorder/inorder_dyn_inst.hh | 3 +- > src/cpu/o3/lsq_unit.hh | 3 +- > src/cpu/ozone/lw_lsq.hh | 3 +- > src/cpu/testers/memtest/memtest.hh | 3 +- > src/cpu/testers/networktest/networktest.hh | 3 +- > src/dev/dma_device.hh | 2 +- > src/mem/bridge.hh | 3 +- > src/mem/cache/cache_impl.hh | 3 +- > src/mem/packet.hh | 5 +- > src/mem/request.hh | 5 +- > src/python/swig/event.i | 1 - > src/sim/eventq.hh | 3 +- > 18 files changed, 16 insertions(+), 298 deletions(-) > > diffs (truncated from 587 to 300 lines): > > diff -r 1e2acba5e77e -r 904ddeecc653 SConstruct > --- a/SConstruct Tue Jun 05 01:23:08 2012 -0400 > +++ b/SConstruct Tue Jun 05 01:23:08 2012 -0400 > @@ -833,11 +833,6 @@ > ListVariable('CPU_MODELS', 'CPU models', > sorted(n for n,m in CpuModel.dict.iteritems() if > m.default), > sorted(CpuModel.list)), > - BoolVariable('NO_FAST_ALLOC', 'Disable fast object allocator', False), > - BoolVariable('FORCE_FAST_ALLOC', > - 'Enable fast object allocator, even for gem5.debug', > False), > - BoolVariable('FAST_ALLOC_STATS', 'Enable fast object allocator > statistics', > - False), > BoolVariable('EFENCE', 'Link with Electric Fence malloc debugger', > False), > BoolVariable('SS_COMPATIBLE_FP', > @@ -852,8 +847,7 @@ > ) > > # These variables get exported to #defines in 
config/*.hh (see > src/SConscript). > -export_vars += ['USE_FENV', 'NO_FAST_ALLOC', 'FORCE_FAST_ALLOC', > - 'FAST_ALLOC_STATS', 'SS_COMPATIBLE_FP', > +export_vars += ['USE_FENV', 'SS_COMPATIBLE_FP', > 'TARGET_ISA', 'CP_ANNOTATE', 'USE_POSIX_CLOCK' ] > > ################################################### > diff -r 1e2acba5e77e -r 904ddeecc653 src/arch/x86/pagetable_walker.hh > --- a/src/arch/x86/pagetable_walker.hh Tue Jun 05 01:23:08 2012 -0400 > +++ b/src/arch/x86/pagetable_walker.hh Tue Jun 05 01:23:08 2012 -0400 > @@ -44,7 +44,6 @@ > > #include "arch/x86/pagetable.hh" > #include "arch/x86/tlb.hh" > -#include "base/fast_alloc.hh" > #include "base/types.hh" > #include "mem/mem_object.hh" > #include "mem/packet.hh" > @@ -86,7 +85,7 @@ > WalkerPort port; > > // State to track each walk of the page table > - class WalkerState : public FastAlloc > + class WalkerState > { > private: > enum State { > diff -r 1e2acba5e77e -r 904ddeecc653 src/base/SConscript > --- a/src/base/SConscript Tue Jun 05 01:23:08 2012 -0400 > +++ b/src/base/SConscript Tue Jun 05 01:23:08 2012 -0400 > @@ -40,7 +40,6 @@ > Source('circlebuf.cc') > Source('cprintf.cc') > Source('debug.cc') > -Source('fast_alloc.cc') > if env['USE_FENV']: > Source('fenv.c') > Source('hostinfo.cc') > diff -r 1e2acba5e77e -r 904ddeecc653 src/base/fast_alloc.cc > --- a/src/base/fast_alloc.cc Tue Jun 05 01:23:08 2012 -0400 > +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 > @@ -1,72 +0,0 @@ > -/* > - * Copyright (c) 2000-2005 The Regents of The University of Michigan > - * All rights reserved. 
> - * > - * Redistribution and use in source and binary forms, with or without > - * modification, are permitted provided that the following conditions are > - * met: redistributions of source code must retain the above copyright > - * notice, this list of conditions and the following disclaimer; > - * redistributions in binary form must reproduce the above copyright > - * notice, this list of conditions and the following disclaimer in the > - * documentation and/or other materials provided with the distribution; > - * neither the name of the copyright holders nor the names of its > - * contributors may be used to endorse or promote products derived from > - * this software without specific prior written permission. > - * > - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS > - * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT > - * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR > - * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT > - * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, > - * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT > - * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, > - * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY > - * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT > - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE > - * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. > - * > - * Authors: Steve Reinhardt > - */ > - > -/* > - * This code was originally written by Steve Reinhardt as part of > - * the Wisconsin Wind Tunnel simulator. Relicensed as part of M5 > - * by permission. 
> - */ > - > -#include <cassert> > - > -#include "base/fast_alloc.hh" > - > -#if USE_FAST_ALLOC > - > -void *FastAlloc::freeLists[Num_Buckets]; > - > -#if FAST_ALLOC_STATS > -unsigned FastAlloc::newCount[Num_Buckets]; > -unsigned FastAlloc::deleteCount[Num_Buckets]; > -unsigned FastAlloc::allocCount[Num_Buckets]; > -#endif > - > -void * > -FastAlloc::moreStructs(int bucket) > -{ > - assert(bucket > 0 && bucket < Num_Buckets); > - > - int sz = bucket * Alloc_Quantum; > - const int nstructs = Num_Structs_Per_New; // how many to allocate? > - char *p = ::new char[nstructs * sz]; > - > -#if FAST_ALLOC_STATS > - ++allocCount[bucket]; > -#endif > - > - freeLists[bucket] = p; > - for (int i = 0; i < (nstructs-2); ++i, p += sz) > - *(void **)p = p + sz; > - *(void **)p = 0; > - > - return (p + sz); > -} > - > -#endif // USE_FAST_ALLOC > diff -r 1e2acba5e77e -r 904ddeecc653 src/base/fast_alloc.hh > --- a/src/base/fast_alloc.hh Tue Jun 05 01:23:08 2012 -0400 > +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 > @@ -1,190 +0,0 @@ > -/* > - * Copyright (c) 2000-2001, 2003-2005 The Regents of The University of > Michigan > - * All rights reserved. > - * > - * Redistribution and use in source and binary forms, with or without > - * modification, are permitted provided that the following conditions are > - * met: redistributions of source code must retain the above copyright > - * notice, this list of conditions and the following disclaimer; > - * redistributions in binary form must reproduce the above copyright > - * notice, this list of conditions and the following disclaimer in the > - * documentation and/or other materials provided with the distribution; > - * neither the name of the copyright holders nor the names of its > - * contributors may be used to endorse or promote products derived from > - * this software without specific prior written permission. 
> - * > - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS > - * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT > - * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR > - * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT > - * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, > - * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT > - * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, > - * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY > - * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT > - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE > - * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. > - * > - * Authors: Steve Reinhardt > - */ > - > -/* > - * This code was originally written by Steve Reinhardt as part of > - * the Wisconsin Wind Tunnel simulator. Relicensed as part of M5 > - * by permission. > - */ > - > -#ifndef __BASE_FAST_ALLOC_HH__ > -#define __BASE_FAST_ALLOC_HH__ > - > -#include <cstddef> > - > -// Fast structure allocator. Designed for small objects that are > -// frequently allocated and deallocated. This code is derived from the > -// 'alloc_struct' package used in WWT and Blizzard. C++ provides a > -// much nicer framework for the same optimization. The package is > -// implemented as a class, FastAlloc. Allocation and deletion are > -// performed using FastAlloc's new and delete operators. Any object > -// that derives from the FastAlloc class will transparently use this > -// allocation package. > - > -// The static allocate() and deallocate() methods can also be called > -// directly if desired. > - > -// In order for derived classes to call delete with the correct > -// structure size even when they are deallocated via a base-type > -// pointer, they must have a virtual destructor. 
It is sufficient for > -// FastAlloc to declare a virtual destructor (as it does); it is not > -// required for derived classes to declare their own destructor. The > -// compiler will automatically generate a virtual destructor for each > -// derived class. However, it is more efficient if each derived class > -// defines an inline destructor, so that the compiler can statically > -// collapse the destructor call chain back up the inheritance > -// hierarchy. > - > -#include "config/fast_alloc_stats.hh" > -#include "config/force_fast_alloc.hh" > -#include "config/no_fast_alloc.hh" > - > -// By default, we want to enable FastAlloc in any build other than > -// m5.debug. (FastAlloc's reuse policies can mask allocation bugs, so > -// we typically want it disabled when debugging.) Set > -// FORCE_FAST_ALLOC to enable even when debugging, and set > -// NO_FAST_ALLOC to disable even in non-debug builds. > -#define USE_FAST_ALLOC \ > - (FORCE_FAST_ALLOC || (!defined(DEBUG) && !NO_FAST_ALLOC)) > - > -#if !USE_FAST_ALLOC > - > -class FastAlloc > -{ > -}; > - > -#else > - > -class FastAlloc > -{ > - public: > - static void *allocate(size_t); > - static void deallocate(void *, size_t); > - > - void *operator new(size_t); > - void operator delete(void *, size_t); > - > - virtual ~FastAlloc() {} > - > - private: > - > - // Max_Alloc_Size is the largest object that can be allocated with > - // this class. There's no fundamental limit, but this limits the > - // size of the freeLists array. Let's not make this really huge > - // like in Blizzard. > - static const size_t Max_Alloc_Size = 512; > - > - // Alloc_Quantum is the difference in size between adjacent > - // buckets in the free list array. 
> - static const int Log2_Alloc_Quantum = 3; > - static const int Alloc_Quantum = (1 << Log2_Alloc_Quantum); > - > - // Num_Buckets = bucketFor(Max_Alloc_Size) + 1 > - static const int Num_Buckets = > - ((Max_Alloc_Size + Alloc_Quantum - 1) >> Log2_Alloc_Quantum) + 1; > - > - // when we call new() for more structures, how many should we get? > - static const int Num_Structs_Per_New = 20; > - > - static int bucketFor(size_t); > - static void *moreStructs(int bucket); > - > - static void *freeLists[Num_Buckets]; > - > -#if FAST_ALLOC_STATS > - static unsigned newCount[Num_Buckets]; > - static unsigned deleteCount[Num_Buckets]; > - static unsigned allocCount[Num_Buckets]; > -#endif > -}; > - > -inline int > -FastAlloc::bucketFor(size_t sz) > -{ > - return (sz + Alloc_Quantum - 1) >> Log2_Alloc_Quantum; > -} > - > -inline void * > -FastAlloc::allocate(size_t sz) > -{ > - int b; > - void *p; > - > - if (sz > Max_Alloc_Size) > - return (void *)::new char[sz]; > - > - b = bucketFor(sz); > - p = freeLists[b]; > - > - if (p) > - freeLists[b] = *(void **)p; > - else > - p = moreStructs(b); > - > -#if FAST_ALLOC_STATS > - ++newCount[b]; > -#endif > - > - return p; > -} > - > -inline void > -FastAlloc::deallocate(void *p, size_t sz) > -{ > - int b; > - > - if (sz > Max_Alloc_Size) { > - ::delete [] (char *)p; > _______________________________________________ > gem5-dev mailing list > [email protected] > http://m5sim.org/mailman/listinfo/gem5-dev > > _______________________________________________ gem5-dev mailing list [email protected] http://m5sim.org/mailman/listinfo/gem5-dev
