I am running GCC 4.5.0, built locally from a gcc.gnu.org distribution, on Ubuntu 10.04.
When switching from -m32 to -m64, a dispatch loop in my code gets noticeably slower (20% slower in 4.3.4 and 4.4.3, 10% slower in 4.5.0). Investigation of the generated assembly shows that register-resident structures are being flushed to locations on the stack around a call through a function pointer. If I change the function call to take scalar arguments rather than a structure passed by value, then the stack writes go away and performance improves to be about 10% faster than the 32-bit code. The small testcase below includes three examples: the first passes a structure by value through a function pointer and exhibits the problem; the second passes scalars instead of a structure and shows the workaround; and the third is a trivial example in which an empty structure is passed, which also exhibits a version of the problem. FWIW, my production code exhibits a particularly egregious version of the problem, but I cannot seem to reproduce it in a small example: the non-inlined function call is at the bottom of several layers of inlined functions, and a single register-resident structure is being flushed to multiple stack locations (one per inlined stack frame?) around each call to the function. Output of "g++ -v -save-temps -O3 -S test.cpp": Using built-in specs. COLLECT_GCC=gcc COLLECT_LTO_WRAPPER=/usr/local/libexec/gcc/x86_64-unknown-linux-gnu/4.5.0/lto-wrapper Target: x86_64-unknown-linux-gnu Configured with: ./configure Thread model: posix gcc version 4.5.0 (GCC) COLLECT_GCC_OPTIONS='-v' '-save-temps' '-O3' '-S' '-mtune=generic' '-march=x86-64' /usr/local/libexec/gcc/x86_64-unknown-linux-gnu/4.5.0/cc1plus -E -quiet -v -D_GNU_SOURCE test.cpp -mtune=generic -march=x86-64 -O3 -fpch-preprocess -o test.ii ignoring nonexistent directory "/usr/local/lib/gcc/x86_64-unknown-linux-gnu/4.5.0/../../../../x86_64-unknown-linux-gnu/include" #include "..." 
search starts here: #include <...> search starts here: /usr/local/lib/gcc/x86_64-unknown-linux-gnu/4.5.0/../../../../include/c++/4.5.0 /usr/local/lib/gcc/x86_64-unknown-linux-gnu/4.5.0/../../../../include/c++/4.5.0/x86_64-unknown-linux-gnu /usr/local/lib/gcc/x86_64-unknown-linux-gnu/4.5.0/../../../../include/c++/4.5.0/backward /usr/local/include /usr/local/lib/gcc/x86_64-unknown-linux-gnu/4.5.0/include /usr/local/lib/gcc/x86_64-unknown-linux-gnu/4.5.0/include-fixed /usr/include End of search list. COLLECT_GCC_OPTIONS='-v' '-save-temps' '-O3' '-S' '-mtune=generic' '-march=x86-64' /usr/local/libexec/gcc/x86_64-unknown-linux-gnu/4.5.0/cc1plus -fpreprocessed test.ii -quiet -dumpbase test.cpp -mtune=generic -march=x86-64 -auxbase test -O3 -version -o test.s GNU C++ (GCC) version 4.5.0 (x86_64-unknown-linux-gnu) compiled by GNU C version 4.5.0, GMP version 4.3.2, MPFR version 2.4.2-p1, MPC version 0.8.1 GGC heuristics: --param ggc-min-expand=100 --param ggc-min-heapsize=131072 GNU C++ (GCC) version 4.5.0 (x86_64-unknown-linux-gnu) compiled by GNU C version 4.5.0, GMP version 4.3.2, MPFR version 2.4.2-p1, MPC version 0.8.1 GGC heuristics: --param ggc-min-expand=100 --param ggc-min-heapsize=131072 Compiler executable checksum: 469157b70a6e6ab9e09e15344033d953 COMPILER_PATH=/usr/local/libexec/gcc/x86_64-unknown-linux-gnu/4.5.0/:/usr/local/libexec/gcc/x86_64-unknown-linux-gnu/4.5.0/:/usr/local/libexec/gcc/x86_64-unknown-linux-gnu/:/usr/local/lib/gcc/x86_64-unknown-linux-gnu/4.5.0/:/usr/local/lib/gcc/x86_64-unknown-linux-gnu/ LIBRARY_PATH=/usr/local/lib/gcc/x86_64-unknown-linux-gnu/4.5.0/:/usr/local/lib/gcc/x86_64-unknown-linux-gnu/4.5.0/../../../../lib64/:/lib/../lib64/:/usr/lib/../lib64/:/usr/local/lib/gcc/x86_64-unknown-linux-gnu/4.5.0/../../../:/lib/:/usr/lib/ COLLECT_GCC_OPTIONS='-v' '-save-temps' '-O3' '-S' '-mtune=generic' '-march=x86-64' And the contents of test.ii afterwards: # 1 "test.cpp" # 1 "<built-in>" # 1 "<command-line>" # 1 "test.cpp" struct bitPointer { 
unsigned int * a; unsigned int b; }; extern void (*gCausesFlushToStack)(bitPointer p); void test1(unsigned int* a, int x) { bitPointer p = { a, 0 }; for (int i = 0; i < x; ++i) { gCausesFlushToStack(p); p.a += (p.b + 1) >> 3; p.b = (p.b + 1) & 0x7; } } extern void (*gSameValuesAsScalarsDoesntCauseFlush)(unsigned int* a, unsigned int b); void test2(unsigned int* a, int x) { bitPointer p = { a, 0 }; for (int i = 0; i < x; ++i) { gSameValuesAsScalarsDoesntCauseFlush(p.a, p.b); p.a += (p.b + 1) >> 3; p.b = (p.b + 1) & 0x7; } } struct emptyObject { }; extern void (*gEvenEmptyStructureCanCauseFlush)(emptyObject object); void test3(unsigned int* a, int x) { bitPointer p = { a, 0 }; for (int i = 0; i < x; ++i) { gEvenEmptyStructureCanCauseFlush(emptyObject()); p.a += (p.b + 1) >> 3; p.b = (p.b + 1) & 0x7; } } -- Summary: x86_64 passing structure by value to a non-inlined function causes register-resident structures to flush to stack Product: gcc Version: 4.5.0 Status: UNCONFIRMED Severity: normal Priority: P3 Component: c++ AssignedTo: unassigned at gcc dot gnu dot org ReportedBy: dickie at acm dot org GCC build triplet: x86_64-unknown-linux-gnu GCC host triplet: x86_64-unknown-linux-gnu GCC target triplet: x86_64-unknown-linux-gnu http://gcc.gnu.org/bugzilla/show_bug.cgi?id=45090