I am running 4.5.0, built locally from a gcc.gnu.org distribution, on Ubuntu
10.04. 

When switching from -m32 to -m64 a dispatch loop in my code gets noticeably
slower (20% slower in 4.3.4 and 4.4.3, 10% slower in 4.5.0).

Investigation of the generated assembly shows that register-resident structures
are being flushed to locations on the stack around a call through a function
pointer. If I change the function call to take scalar arguments rather than a
structure passed by value, then the stack writes go away and performance
improves to be about 10% faster than the 32-bit code. 

The small testcase below includes three examples. The first passes a structure
by value through a function pointer and exhibits the problem. The second passes
scalars instead of a structure and shows the workaround. The third is a trivial
example that passes an empty structure, which also exhibits a version of
the problem.

FWIW, my production code exhibits a particularly egregious version of the
problem, but I cannot seem to reproduce it in a small example: the non-inlined
function call is at the bottom of several layers of inlined functions, and a
single register-resident structure is being flushed to multiple stack locations
(one per inlined stack frame?) around each call to the function.



Output of "g++ -v -save-temps -O3 -S test.cpp":

Using built-in specs.
COLLECT_GCC=gcc
COLLECT_LTO_WRAPPER=/usr/local/libexec/gcc/x86_64-unknown-linux-gnu/4.5.0/lto-wrapper
Target: x86_64-unknown-linux-gnu
Configured with: ./configure
Thread model: posix
gcc version 4.5.0 (GCC) 
COLLECT_GCC_OPTIONS='-v' '-save-temps' '-O3' '-S' '-mtune=generic'
'-march=x86-64'
 /usr/local/libexec/gcc/x86_64-unknown-linux-gnu/4.5.0/cc1plus -E -quiet -v
-D_GNU_SOURCE test.cpp -mtune=generic -march=x86-64 -O3 -fpch-preprocess -o
test.ii
ignoring nonexistent directory
"/usr/local/lib/gcc/x86_64-unknown-linux-gnu/4.5.0/../../../../x86_64-unknown-linux-gnu/include"
#include "..." search starts here:
#include <...> search starts here:

/usr/local/lib/gcc/x86_64-unknown-linux-gnu/4.5.0/../../../../include/c++/4.5.0

/usr/local/lib/gcc/x86_64-unknown-linux-gnu/4.5.0/../../../../include/c++/4.5.0/x86_64-unknown-linux-gnu

/usr/local/lib/gcc/x86_64-unknown-linux-gnu/4.5.0/../../../../include/c++/4.5.0/backward
 /usr/local/include
 /usr/local/lib/gcc/x86_64-unknown-linux-gnu/4.5.0/include
 /usr/local/lib/gcc/x86_64-unknown-linux-gnu/4.5.0/include-fixed
 /usr/include
End of search list.
COLLECT_GCC_OPTIONS='-v' '-save-temps' '-O3' '-S' '-mtune=generic'
'-march=x86-64'
 /usr/local/libexec/gcc/x86_64-unknown-linux-gnu/4.5.0/cc1plus -fpreprocessed
test.ii -quiet -dumpbase test.cpp -mtune=generic -march=x86-64 -auxbase test
-O3 -version -o test.s
GNU C++ (GCC) version 4.5.0 (x86_64-unknown-linux-gnu)
        compiled by GNU C version 4.5.0, GMP version 4.3.2, MPFR version
2.4.2-p1, MPC version 0.8.1
GGC heuristics: --param ggc-min-expand=100 --param ggc-min-heapsize=131072
GNU C++ (GCC) version 4.5.0 (x86_64-unknown-linux-gnu)
        compiled by GNU C version 4.5.0, GMP version 4.3.2, MPFR version
2.4.2-p1, MPC version 0.8.1
GGC heuristics: --param ggc-min-expand=100 --param ggc-min-heapsize=131072
Compiler executable checksum: 469157b70a6e6ab9e09e15344033d953
COMPILER_PATH=/usr/local/libexec/gcc/x86_64-unknown-linux-gnu/4.5.0/:/usr/local/libexec/gcc/x86_64-unknown-linux-gnu/4.5.0/:/usr/local/libexec/gcc/x86_64-unknown-linux-gnu/:/usr/local/lib/gcc/x86_64-unknown-linux-gnu/4.5.0/:/usr/local/lib/gcc/x86_64-unknown-linux-gnu/
LIBRARY_PATH=/usr/local/lib/gcc/x86_64-unknown-linux-gnu/4.5.0/:/usr/local/lib/gcc/x86_64-unknown-linux-gnu/4.5.0/../../../../lib64/:/lib/../lib64/:/usr/lib/../lib64/:/usr/local/lib/gcc/x86_64-unknown-linux-gnu/4.5.0/../../../:/lib/:/usr/lib/
COLLECT_GCC_OPTIONS='-v' '-save-temps' '-O3' '-S' '-mtune=generic'
'-march=x86-64'



And the contents of test.ii afterwards:

# 1 "test.cpp"
# 1 "<built-in>"
# 1 "<command-line>"
# 1 "test.cpp"
// Two-field aggregate (a pointer plus an unsigned counter) that the testcase
// keeps in a local variable and, in test1, passes by value.
// In the test functions in this file, b is kept in the range 0..7 and a is
// advanced by one element whenever b wraps back to 0.
struct bitPointer {
    unsigned int * a;  // position in an unsigned int array
    unsigned int b;    // sub-position counter; the tests mask it with 0x7
};

// Function pointer defined outside this translation unit; the callee takes a
// bitPointer by value. Per the report, calls through it show the stack flush.
extern void (*gCausesFlushToStack)(bitPointer p);

// Case 1 (reported as exhibiting the problem): pass the structure by value
// through a function pointer on every loop iteration.
//
// a: initial value for p.a
// x: number of loop iterations (no iterations when x <= 0)
void test1(unsigned int* a, int x) {
    bitPointer p = { a, 0 };

    for (int i = 0; i < x; ++i) {
        gCausesFlushToStack(p);

        // Advance p by one step: (p.b + 1) >> 3 is 1 only when p.b == 7,
        // so p.a moves forward one element exactly when p.b wraps to 0.
        // p.a is updated first because it depends on the old p.b.
        p.a += (p.b + 1) >> 3;
        p.b = (p.b + 1) & 0x7;
    }
}

// Function pointer defined outside this translation unit; the callee takes the
// two bitPointer fields as separate scalar arguments. Per the report, calls
// through it do not show the stack flush.
extern void (*gSameValuesAsScalarsDoesntCauseFlush)(unsigned int* a, unsigned
int b);

// Case 2 (reported as the workaround): same loop as test1, but the two fields
// of p are passed as separate scalar arguments instead of the structure.
//
// a: initial value for p.a
// x: number of loop iterations (no iterations when x <= 0)
void test2(unsigned int* a, int x) {
    bitPointer p = { a, 0 };

    for (int i = 0; i < x; ++i) {
        gSameValuesAsScalarsDoesntCauseFlush(p.a, p.b);

        // Same advance step as test1: p.a moves forward one element exactly
        // when p.b wraps from 7 to 0; p.a is updated first using the old p.b.
        p.a += (p.b + 1) >> 3;
        p.b = (p.b + 1) & 0x7;
    }
}

// Structure with no members, used only as a by-value argument type.
struct emptyObject { };
// Function pointer defined outside this translation unit; the callee takes an
// emptyObject by value. Per the report, calls through it also show a version
// of the stack flush.
extern void (*gEvenEmptyStructureCanCauseFlush)(emptyObject object);

// Case 3 (reported as exhibiting a version of the problem): same loop as
// test1, but the call passes a temporary empty structure. p itself is never
// passed to the callee here; it is only updated inside the loop.
//
// a: initial value for p.a
// x: number of loop iterations (no iterations when x <= 0)
void test3(unsigned int* a, int x) {
    bitPointer p = { a, 0 };

    for (int i = 0; i < x; ++i) {
        gEvenEmptyStructureCanCauseFlush(emptyObject());

        // Same advance step as test1: p.a moves forward one element exactly
        // when p.b wraps from 7 to 0; p.a is updated first using the old p.b.
        p.a += (p.b + 1) >> 3;
        p.b = (p.b + 1) & 0x7;
    }
}


-- 
           Summary: x86_64 passing structure by value to a non-inlined
                    function causes register-resident structures to flush to
                    stack
           Product: gcc
           Version: 4.5.0
            Status: UNCONFIRMED
          Severity: normal
          Priority: P3
         Component: c++
        AssignedTo: unassigned at gcc dot gnu dot org
        ReportedBy: dickie at acm dot org
 GCC build triplet: x86_64-unknown-linux-gnu
  GCC host triplet: x86_64-unknown-linux-gnu
GCC target triplet: x86_64-unknown-linux-gnu


http://gcc.gnu.org/bugzilla/show_bug.cgi?id=45090

Reply via email to