https://bugs.llvm.org/show_bug.cgi?id=50026
I reported it to the llvm people. it is two slightly different quicksort
algorithms which perform radically differently. The one which you could
assume would take more time, performs MUCH better.
I made a custom quicksort algorithm which outperforms qsort by A LOT for
sorting an array of around 300 randomly created unsigned characters, which
is what I use it for.
if you read the report a guy said there's a 10% difference for sorting 3
million characters on freebsd, but there's about 40% performance difference
on OpenBSD. maybe it's also how the OpenBSD team modified clang to prevent
rop chain stuff or something? I'm using a westmere based intell server.
it's the same for clang 11.
What's the deal?
-Luke
/*
clang sort_test2.c -O2 -o sort_test && ./sort_test
*/
/*
* quicksort should be slightly faster than quicksort1 because it is
* identical except quicksort1 does more work using global counters.
* quicsort1 runs 41% faster. Using gcc, quicksort is instead slightly faster.
*/
/*
* I'm using clang 10.0.1 on OpenBSD. these are my results:
quicksort
time = 0.004501771
quicksort1
time = 0.002655233
*/
#include <err.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>
#include <sys/param.h>
size_t m;
size_t g;
size_t i;
/*
* quicksort sorts characters on a u_char array.
* 'a' is the first character on the array.
* 'b' is last character on the array
*/
static
void quicksort(u_char a[], u_char b[])
{
u_char temp;
if (b - a <= 1)
{
if (a == b)
return;
if (*a > *b)
{
temp = *a; *a = *b; *b = temp;
}
return;
}
u_char * j = a + 1;
u_char * k = b;
u_char u = *a;
/*
* This if() avoids repeated awkward special case branching
* in the similar if() in the "while (++j < k)" loop
*/
if (*j > u)
{
do
{
if (*k < u)
{
temp = *j; *j = *k; *k = temp;
goto skip0;
}
} while (j < --k);
quicksort(j, b);
return;
skip0:
if (j == --k)
{
*a = *j; *j = u;
quicksort(j+1, b);
return;
}
}
while (++j < k)
{
if (*j > u)
{
do
{
if (*k < u)
{
temp = *j; *j = *k; *k = temp;
goto skip;
}
} while (j < --k);
quicksort(j, b);
*a = *--j; *j = u;
quicksort(a, j-1);
return;
skip:
if (j == --k)
{
*a = *j; *j = u;
quicksort(a, j-1);
quicksort(j+1, b);
return;
}
}
}
if (*j > u)
{
quicksort(j, b);
*a = *--j; *j = u;
quicksort(a, j-1);
}
else
{
*a = *j; *j = u;
quicksort(a, j-1);
if (j == b)
return;
quicksort(j+1, b);
}
}
/*
* quicksort1 is identical to quicksort() but it alters some global counters
* it should take slightly longer to run, but it runs in half the time
*/
static
void quicksort1(u_char a[], u_char b[], size_t level)
{
++m;
if (level > i)
i = level;
u_char temp;
if (b - a <= 1)
{
++g;
if (a == b)
return;
if (*a > *b)
{
temp = *a; *a = *b; *b = temp;
}
return;
}
u_char * j = a + 1;
u_char * k = b;
u_char u = *a;
/*
* This if() avoids repeated awkward special case branching
* in the similar if() in the "while (++j < k)" loop
*/
if (*j > u)
{
do
{
if (*k < u)
{
temp = *j; *j = *k; *k = temp;
goto skip0;
}
} while (j < --k);
quicksort1(j, b, level + 1);
return;
skip0:
if (j == --k)
{
*a = *j; *j = u;
quicksort1(j+1, b, level + 1);
return;
}
}
while (++j < k)
{
if (*j > u)
{
do
{
if (*k < u)
{
temp = *j; *j = *k; *k = temp;
goto skip;
}
} while (j < --k);
quicksort1(j, b, level + 1);
*a = *--j; *j = u;
quicksort1(a, j-1, level + 1);
return;
skip:
if (j == --k)
{
*a = *j; *j = u;
quicksort1(a, j-1, level + 1);
quicksort1(j+1, b, level + 1);
return;
}
}
}
if (*j > u)
{
quicksort1(j, b, level + 1);
*a = *--j; *j = u;
quicksort1(a, j-1, level + 1);
}
else
{
*a = *j; *j = u;
quicksort1(a, j-1, level + 1);
if (j == b)
return;
quicksort1(j+1, b, level + 1);
}
}
int main()
{
long double cpu_time_used;
struct timespec start, end;
u_char *buffer, *buffer2;
size_t y;
size_t n = 30000;
buffer = (u_char*)malloc(n);
if (buffer == NULL) err(1, "malloc");
buffer2 = (u_char*)malloc(n);
if (buffer2 == NULL) err(1, "malloc");
for (y = 0; y < n; ++y)
buffer[y] = rand() % 256;
// run it once to clear any wrappers
clock_gettime(CLOCK_PROCESS_CPUTIME_ID, &start);
memcpy(buffer2, buffer, n);
clock_gettime(CLOCK_PROCESS_CPUTIME_ID, &start);
quicksort(buffer2, buffer2 + n - 1);
clock_gettime(CLOCK_PROCESS_CPUTIME_ID, &end);
cpu_time_used =
(long double) (end.tv_sec - start.tv_sec) +
(long double) (end.tv_nsec - start.tv_nsec) /
(long double) 1000000000;
printf("\n\nquicksort\n");
printf("time = %.9Lf\n\n", cpu_time_used);
memcpy(buffer2, buffer, n);
i = m = g = 0;
clock_gettime(CLOCK_PROCESS_CPUTIME_ID, &start);
quicksort1(buffer2, buffer2 + n - 1, 1);
clock_gettime(CLOCK_PROCESS_CPUTIME_ID, &end);
cpu_time_used =
(long double) (end.tv_sec - start.tv_sec) +
(long double) (end.tv_nsec - start.tv_nsec) /
(long double) 1000000000;
printf("\n\nquicksort1\n");
//~ printf("time = %.9Lf, max depth: %lu, calls: %lu, g: %lu\n\n\n", cpu_time_used, i, m, g);
printf("time = %.9Lf\n", cpu_time_used);
return 0;
}