Hi list, gcc-4.3 appears to make quite heavy use of cmov to eliminate conditional branches on x86(-64) architecture, even for those branches that are determined to be predictable.
The problem with this is that the data dependancy introduced by the cmov can restrict execution, wheras a predicted branch will not. The Intel manual says not to use cmov for predictable branches. I have a test case which I /believe/ demonstrates that a cond jump is not a great deal slower in the case where there are no data dependancy hazards hit, and can be quite a bit faster in the case where there is a dependancy -- if the branch is predictable. It also shows that if the branch is unpredictable then cmov can be a good win (as expected). Compiled with: gcc-4.3 -O3 -falign-functions=64 -falign-loops=64 -falign-jumps=64 -falign-labels=64 -march=core2/opteron Opteron: no deps, predictable -- C code took 8.92ns per iteration no deps, predictable -- cmov code took 9.09ns per iteration no deps, predictable -- jmp code took 9.60ns per iteration[1] has deps, predictable -- C code took 20.04ns per iteration has deps, predictable -- cmov code took 18.09ns per iteration has deps, predictable -- jmp code took 14.97ns per iteration[2] no deps, unpredictable -- C code took 8.92ns per iteration no deps, unpredictable -- cmov code took 9.09ns per iteration no deps, unpredictable -- jmp code took 15.19ns per iteration[3] has deps, unpredictable -- C code took 32.07ns per iteration has deps, unpredictable -- cmov code took 33.15ns per iteration has deps, unpredictable -- jmp code took 69.04ns per iteration[4] [1] jmp is slightly slower, can it be improved? [2] nice improvement here [3] mispredict penalty, cmov is a big win [4] mispredict which includes load missing cache! Core2: no deps, predictable -- C code took 4.24ns per iteration no deps, predictable -- cmov code took 4.27ns per iteration no deps, predictable -- jmp code took 4.24ns per iteration has deps, predictable -- C code took 6.04ns per iteration has deps, predictable -- cmov code took 6.59ns per iteration has deps, predictable -- jmp code took 5.04ns per iteration no deps, unpredictable -- C code took 4.24ns per iteration no deps, unpredictable -- cmov code took 4.25ns per iteration no deps, unpredictable -- jmp code took 8.79ns per iteration has deps, unpredictable -- C code took 49.78ns per iteration has deps, unpredictable -- cmov code took 50.28ns per iteration has deps, unpredictable -- jmp code took 75.67ns per iteration Core2 follows a similar pattern, although it's not seeing any slowdown in the "no deps, predictable, jmp" case like K8 does. Any comments? (please cc me) Should gcc be using conditional jumps more often eg. in the case of __builtin_expect())? Thanks, Nick
//#define noinline __attribute__((noinline)) #define noinline #define likely(x) __builtin_expect(!!(x), 1) static noinline int test_cmov(int a, int b) { int ret; asm volatile ( "cmpl %1, %2\n\t" "cmovle %2, %1\n\t" "movl %1, %0\n\t" : "=r" (ret) : "r" (a), "r" (b)); return ret; } static noinline int test_jmp(int a, int b) { int ret; asm volatile ( "cmpl %1, %2\n\t" "jle 1f\n\t" "movl %1, %0\n\t" "2:\n\t" ".subsection 1\n\t" "1:\n\t" "movl %2, %0\n\t" "jmp 2b\n\t" ".previous\n\t" : "=r" (ret) : "r" (a), "r" (b)); return ret; } static noinline int test_c(int a, int b) { int ret; if (likely(a < b)) ret = a; else ret = b; return ret; } #define ITERS 100000000 #define SIZE (4*1024*1024) static int array1[SIZE]; static int array2[SIZE]; static volatile int result[SIZE]; #include <sys/time.h> #include <stdlib.h> #include <stdio.h> #include <assert.h> static void print_time(const char *str, struct timeval *s, struct timeval *e, unsigned long iters) { unsigned long long us, ns; double ns_iter; us = 1000000*(e->tv_sec - s->tv_sec) + (e->tv_usec - s->tv_usec); ns = us * 1000; ns_iter = (double)ns / iters; ns /= iters; printf("%s took % 6.2lfns per iteration\n", str, ns_iter); } int main(void) { struct timeval s, e; int i; assert(test_cmov(1, 2) == test_c(1, 2)); assert(test_cmov(1, 1) == test_c(1, 1)); assert(test_cmov(2, 1) == test_c(2, 1)); assert(test_jmp(1, 2) == test_c(1, 2)); assert(test_jmp(1, 1) == test_c(1, 1)); assert(test_jmp(2, 1) == test_c(2, 1)); for (i = 0; i < SIZE; i++) { array1[i] = i; array2[i] = i + 1; result[i] = 0; } gettimeofday(&s, NULL); for (i = 0; i < ITERS; i++) { result[i%SIZE] = test_c(array1[i%SIZE], array2[i%SIZE]); } gettimeofday(&e, NULL); print_time(" no deps, predictable -- C code", &s, &e, ITERS); gettimeofday(&s, NULL); for (i = 0; i < ITERS; i++) { result[i%SIZE] = test_cmov(array1[i%SIZE], array2[i%SIZE]); } gettimeofday(&e, NULL); print_time(" no deps, predictable -- cmov code", &s, &e, ITERS); gettimeofday(&s, NULL); for (i = 0; i < ITERS; i++) { result[i%SIZE] = test_jmp(array1[i%SIZE], array2[i%SIZE]); } gettimeofday(&e, NULL); print_time(" no deps, predictable -- jmp code", &s, &e, ITERS); gettimeofday(&s, NULL); for (i = 0; i < ITERS; i++) { unsigned int r; r = test_c(array1[i%SIZE], array2[i%SIZE]); result[r%SIZE]++; } gettimeofday(&e, NULL); print_time("has deps, predictable -- C code", &s, &e, ITERS); gettimeofday(&s, NULL); for (i = 0; i < ITERS; i++) { unsigned int r; r = test_cmov(array1[i%SIZE], array2[i%SIZE]); result[r%SIZE]++; } gettimeofday(&e, NULL); print_time("has deps, predictable -- cmov code", &s, &e, ITERS); gettimeofday(&s, NULL); for (i = 0; i < ITERS; i++) { unsigned int r; r = test_jmp(array1[i%SIZE], array2[i%SIZE]); result[r%SIZE]++; } gettimeofday(&e, NULL); print_time("has deps, predictable -- jmp code", &s, &e, ITERS); for (i = 0; i < SIZE; i++) { array1[i] = random(); array2[i] = random(); result[i] = 0; } gettimeofday(&s, NULL); for (i = 0; i < ITERS; i++) { result[i%SIZE] = test_c(array1[i%SIZE], array2[i%SIZE]); } gettimeofday(&e, NULL); print_time(" no deps, unpredictable -- C code", &s, &e, ITERS); gettimeofday(&s, NULL); for (i = 0; i < ITERS; i++) { result[i%SIZE] = test_cmov(array1[i%SIZE], array2[i%SIZE]); } gettimeofday(&e, NULL); print_time(" no deps, unpredictable -- cmov code", &s, &e, ITERS); gettimeofday(&s, NULL); for (i = 0; i < ITERS; i++) { result[i%SIZE] = test_jmp(array1[i%SIZE], array2[i%SIZE]); } gettimeofday(&e, NULL); print_time(" no deps, unpredictable -- jmp code", &s, &e, ITERS); gettimeofday(&s, NULL); for (i = 0; i < ITERS; i++) { unsigned int r; r = test_c(array1[i%SIZE], array2[i%SIZE]); result[r%SIZE]++; } gettimeofday(&e, NULL); print_time("has deps, unpredictable -- C code", &s, &e, ITERS); gettimeofday(&s, NULL); for (i = 0; i < ITERS; i++) { unsigned int r; r = test_cmov(array1[i%SIZE], array2[i%SIZE]); result[r%SIZE]++; } gettimeofday(&e, NULL); print_time("has deps, unpredictable -- cmov code", &s, &e, ITERS); gettimeofday(&s, NULL); for (i = 0; i < ITERS; i++) { unsigned int r; r = test_jmp(array1[i%SIZE], array2[i%SIZE]); result[r%SIZE]++; } gettimeofday(&e, NULL); print_time("has deps, unpredictable -- jmp code", &s, &e, ITERS); return 0; }