Does anyone have further experience with the speedups of TCMalloc?

I'm testing it on our infrastructure. On the ALPHA/debug/quick tests it
shaves 10% off the runtime, but on the ALPHA/fast/quick tests it actually
adds 50% to the runtime.  I haven't done extensive tests with other
workloads or builds, but I had assumed everything would go faster.

Lisa

On Tue, Jun 5, 2012 at 1:24 AM, Ali Saidi <[email protected]> wrote:

> changeset 904ddeecc653 in /z/repo/gem5
> details: http://repo.gem5.org/gem5?cmd=changeset;node=904ddeecc653
> description:
>         sim: Remove FastAlloc
>
>         While FastAlloc provides a small performance increase (~1.5%) over
>         regular malloc, it isn't thread safe.
>         After removing FastAlloc and using tcmalloc, I've seen a
>         performance increase of 12% over libc malloc
>         when running twolf for ARM.
>
> diffstat:
>
>  SConstruct                                 |    8 +-
>  src/arch/x86/pagetable_walker.hh           |    3 +-
>  src/base/SConscript                        |    1 -
>  src/base/fast_alloc.cc                     |   72 ----------
>  src/base/fast_alloc.hh                     |  190
> -----------------------------
>  src/cpu/base_dyn_inst.hh                   |    3 +-
>  src/cpu/inorder/inorder_dyn_inst.hh        |    3 +-
>  src/cpu/o3/lsq_unit.hh                     |    3 +-
>  src/cpu/ozone/lw_lsq.hh                    |    3 +-
>  src/cpu/testers/memtest/memtest.hh         |    3 +-
>  src/cpu/testers/networktest/networktest.hh |    3 +-
>  src/dev/dma_device.hh                      |    2 +-
>  src/mem/bridge.hh                          |    3 +-
>  src/mem/cache/cache_impl.hh                |    3 +-
>  src/mem/packet.hh                          |    5 +-
>  src/mem/request.hh                         |    5 +-
>  src/python/swig/event.i                    |    1 -
>  src/sim/eventq.hh                          |    3 +-
>  18 files changed, 16 insertions(+), 298 deletions(-)
>
> diffs (truncated from 587 to 300 lines):
>
> diff -r 1e2acba5e77e -r 904ddeecc653 SConstruct
> --- a/SConstruct        Tue Jun 05 01:23:08 2012 -0400
> +++ b/SConstruct        Tue Jun 05 01:23:08 2012 -0400
> @@ -833,11 +833,6 @@
>      ListVariable('CPU_MODELS', 'CPU models',
>                   sorted(n for n,m in CpuModel.dict.iteritems() if
> m.default),
>                   sorted(CpuModel.list)),
> -    BoolVariable('NO_FAST_ALLOC', 'Disable fast object allocator', False),
> -    BoolVariable('FORCE_FAST_ALLOC',
> -                 'Enable fast object allocator, even for gem5.debug',
> False),
> -    BoolVariable('FAST_ALLOC_STATS', 'Enable fast object allocator
> statistics',
> -                 False),
>      BoolVariable('EFENCE', 'Link with Electric Fence malloc debugger',
>                   False),
>      BoolVariable('SS_COMPATIBLE_FP',
> @@ -852,8 +847,7 @@
>      )
>
>  # These variables get exported to #defines in config/*.hh (see
> src/SConscript).
> -export_vars += ['USE_FENV', 'NO_FAST_ALLOC', 'FORCE_FAST_ALLOC',
> -                'FAST_ALLOC_STATS', 'SS_COMPATIBLE_FP',
> +export_vars += ['USE_FENV', 'SS_COMPATIBLE_FP',
>                  'TARGET_ISA', 'CP_ANNOTATE', 'USE_POSIX_CLOCK' ]
>
>  ###################################################
> diff -r 1e2acba5e77e -r 904ddeecc653 src/arch/x86/pagetable_walker.hh
> --- a/src/arch/x86/pagetable_walker.hh  Tue Jun 05 01:23:08 2012 -0400
> +++ b/src/arch/x86/pagetable_walker.hh  Tue Jun 05 01:23:08 2012 -0400
> @@ -44,7 +44,6 @@
>
>  #include "arch/x86/pagetable.hh"
>  #include "arch/x86/tlb.hh"
> -#include "base/fast_alloc.hh"
>  #include "base/types.hh"
>  #include "mem/mem_object.hh"
>  #include "mem/packet.hh"
> @@ -86,7 +85,7 @@
>          WalkerPort port;
>
>          // State to track each walk of the page table
> -        class WalkerState : public FastAlloc
> +        class WalkerState
>          {
>            private:
>              enum State {
> diff -r 1e2acba5e77e -r 904ddeecc653 src/base/SConscript
> --- a/src/base/SConscript       Tue Jun 05 01:23:08 2012 -0400
> +++ b/src/base/SConscript       Tue Jun 05 01:23:08 2012 -0400
> @@ -40,7 +40,6 @@
>  Source('circlebuf.cc')
>  Source('cprintf.cc')
>  Source('debug.cc')
> -Source('fast_alloc.cc')
>  if env['USE_FENV']:
>      Source('fenv.c')
>  Source('hostinfo.cc')
> diff -r 1e2acba5e77e -r 904ddeecc653 src/base/fast_alloc.cc
> --- a/src/base/fast_alloc.cc    Tue Jun 05 01:23:08 2012 -0400
> +++ /dev/null   Thu Jan 01 00:00:00 1970 +0000
> @@ -1,72 +0,0 @@
> -/*
> - * Copyright (c) 2000-2005 The Regents of The University of Michigan
> - * All rights reserved.
> - *
> - * Redistribution and use in source and binary forms, with or without
> - * modification, are permitted provided that the following conditions are
> - * met: redistributions of source code must retain the above copyright
> - * notice, this list of conditions and the following disclaimer;
> - * redistributions in binary form must reproduce the above copyright
> - * notice, this list of conditions and the following disclaimer in the
> - * documentation and/or other materials provided with the distribution;
> - * neither the name of the copyright holders nor the names of its
> - * contributors may be used to endorse or promote products derived from
> - * this software without specific prior written permission.
> - *
> - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
> - * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
> - * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
> - * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
> - * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
> - * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
> - * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
> - * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
> - * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
> - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
> - * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
> - *
> - * Authors: Steve Reinhardt
> - */
> -
> -/*
> - * This code was originally written by Steve Reinhardt as part of
> - * the Wisconsin Wind Tunnel simulator.  Relicensed as part of M5
> - * by permission.
> - */
> -
> -#include <cassert>
> -
> -#include "base/fast_alloc.hh"
> -
> -#if USE_FAST_ALLOC
> -
> -void *FastAlloc::freeLists[Num_Buckets];
> -
> -#if FAST_ALLOC_STATS
> -unsigned FastAlloc::newCount[Num_Buckets];
> -unsigned FastAlloc::deleteCount[Num_Buckets];
> -unsigned FastAlloc::allocCount[Num_Buckets];
> -#endif
> -
> -void *
> -FastAlloc::moreStructs(int bucket)
> -{
> -    assert(bucket > 0 && bucket < Num_Buckets);
> -
> -    int sz = bucket * Alloc_Quantum;
> -    const int nstructs = Num_Structs_Per_New;   // how many to allocate?
> -    char *p = ::new char[nstructs * sz];
> -
> -#if FAST_ALLOC_STATS
> -    ++allocCount[bucket];
> -#endif
> -
> -    freeLists[bucket] = p;
> -    for (int i = 0; i < (nstructs-2); ++i, p += sz)
> -        *(void **)p = p + sz;
> -    *(void **)p = 0;
> -
> -    return (p + sz);
> -}
> -
> -#endif // USE_FAST_ALLOC
> diff -r 1e2acba5e77e -r 904ddeecc653 src/base/fast_alloc.hh
> --- a/src/base/fast_alloc.hh    Tue Jun 05 01:23:08 2012 -0400
> +++ /dev/null   Thu Jan 01 00:00:00 1970 +0000
> @@ -1,190 +0,0 @@
> -/*
> - * Copyright (c) 2000-2001, 2003-2005 The Regents of The University of
> Michigan
> - * All rights reserved.
> - *
> - * Redistribution and use in source and binary forms, with or without
> - * modification, are permitted provided that the following conditions are
> - * met: redistributions of source code must retain the above copyright
> - * notice, this list of conditions and the following disclaimer;
> - * redistributions in binary form must reproduce the above copyright
> - * notice, this list of conditions and the following disclaimer in the
> - * documentation and/or other materials provided with the distribution;
> - * neither the name of the copyright holders nor the names of its
> - * contributors may be used to endorse or promote products derived from
> - * this software without specific prior written permission.
> - *
> - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
> - * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
> - * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
> - * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
> - * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
> - * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
> - * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
> - * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
> - * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
> - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
> - * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
> - *
> - * Authors: Steve Reinhardt
> - */
> -
> -/*
> - * This code was originally written by Steve Reinhardt as part of
> - * the Wisconsin Wind Tunnel simulator.  Relicensed as part of M5
> - * by permission.
> - */
> -
> -#ifndef __BASE_FAST_ALLOC_HH__
> -#define __BASE_FAST_ALLOC_HH__
> -
> -#include <cstddef>
> -
> -// Fast structure allocator.  Designed for small objects that are
> -// frequently allocated and deallocated.  This code is derived from the
> -// 'alloc_struct' package used in WWT and Blizzard.  C++ provides a
> -// much nicer framework for the same optimization.  The package is
> -// implemented as a class, FastAlloc.  Allocation and deletion are
> -// performed using FastAlloc's new and delete operators.  Any object
> -// that derives from the FastAlloc class will transparently use this
> -// allocation package.
> -
> -// The static allocate() and deallocate() methods can also be called
> -// directly if desired.
> -
> -// In order for derived classes to call delete with the correct
> -// structure size even when they are deallocated via a base-type
> -// pointer, they must have a virtual destructor.  It is sufficient for
> -// FastAlloc to declare a virtual destructor (as it does); it is not
> -// required for derived classes to declare their own destructor.  The
> -// compiler will automatically generate a virtual destructor for each
> -// derived class.  However, it is more efficient if each derived class
> -// defines an inline destructor, so that the compiler can statically
> -// collapse the destructor call chain back up the inheritance
> -// hierarchy.
> -
> -#include "config/fast_alloc_stats.hh"
> -#include "config/force_fast_alloc.hh"
> -#include "config/no_fast_alloc.hh"
> -
> -// By default, we want to enable FastAlloc in any build other than
> -// m5.debug.  (FastAlloc's reuse policies can mask allocation bugs, so
> -// we typically want it disabled when debugging.)  Set
> -// FORCE_FAST_ALLOC to enable even when debugging, and set
> -// NO_FAST_ALLOC to disable even in non-debug builds.
> -#define USE_FAST_ALLOC \
> -    (FORCE_FAST_ALLOC || (!defined(DEBUG) && !NO_FAST_ALLOC))
> -
> -#if !USE_FAST_ALLOC
> -
> -class FastAlloc
> -{
> -};
> -
> -#else
> -
> -class FastAlloc
> -{
> -  public:
> -    static void *allocate(size_t);
> -    static void deallocate(void *, size_t);
> -
> -    void *operator new(size_t);
> -    void operator delete(void *, size_t);
> -
> -    virtual ~FastAlloc() {}
> -
> -  private:
> -
> -    // Max_Alloc_Size is the largest object that can be allocated with
> -    // this class.  There's no fundamental limit, but this limits the
> -    // size of the freeLists array.  Let's not make this really huge
> -    // like in Blizzard.
> -    static const size_t Max_Alloc_Size = 512;
> -
> -    // Alloc_Quantum is the difference in size between adjacent
> -    // buckets in the free list array.
> -    static const int Log2_Alloc_Quantum = 3;
> -    static const int Alloc_Quantum = (1 << Log2_Alloc_Quantum);
> -
> -    // Num_Buckets = bucketFor(Max_Alloc_Size) + 1
> -    static const int Num_Buckets =
> -        ((Max_Alloc_Size + Alloc_Quantum - 1) >> Log2_Alloc_Quantum) + 1;
> -
> -    // when we call new() for more structures, how many should we get?
> -    static const int Num_Structs_Per_New = 20;
> -
> -    static int bucketFor(size_t);
> -    static void *moreStructs(int bucket);
> -
> -    static void *freeLists[Num_Buckets];
> -
> -#if FAST_ALLOC_STATS
> -    static unsigned newCount[Num_Buckets];
> -    static unsigned deleteCount[Num_Buckets];
> -    static unsigned allocCount[Num_Buckets];
> -#endif
> -};
> -
> -inline int
> -FastAlloc::bucketFor(size_t sz)
> -{
> -    return (sz + Alloc_Quantum - 1) >> Log2_Alloc_Quantum;
> -}
> -
> -inline void *
> -FastAlloc::allocate(size_t sz)
> -{
> -    int b;
> -    void *p;
> -
> -    if (sz > Max_Alloc_Size)
> -        return (void *)::new char[sz];
> -
> -    b = bucketFor(sz);
> -    p = freeLists[b];
> -
> -    if (p)
> -        freeLists[b] = *(void **)p;
> -    else
> -        p = moreStructs(b);
> -
> -#if FAST_ALLOC_STATS
> -    ++newCount[b];
> -#endif
> -
> -    return p;
> -}
> -
> -inline void
> -FastAlloc::deallocate(void *p, size_t sz)
> -{
> -    int b;
> -
> -    if (sz > Max_Alloc_Size) {
> -        ::delete [] (char *)p;
> _______________________________________________
> gem5-dev mailing list
> [email protected]
> http://m5sim.org/mailman/listinfo/gem5-dev
>
>
_______________________________________________
gem5-dev mailing list
[email protected]
http://m5sim.org/mailman/listinfo/gem5-dev

Reply via email to