On Sun, Nov 07, 2021 at 08:13:36PM -0600, Luke Small wrote:
> https://bugs.llvm.org/show_bug.cgi?id=50026
>
> I reported it to the llvm people. it is two slightly different quicksort
> algorithms which perform radically differently. The one which you could
> assume would take more time, performs MUCH better.
>
> I made a custom quicksort algorithm which outperforms qsort by A LOT for
> sorting an array of around 300 randomly created unsigned characters, which
> is what I use it for.
>
> if you read the report a guy said there's a 10% difference for sorting 3
> million characters on freebsd, but there's about 40% performance difference
> on OpenBSD. maybe it's also how the OpenBSD team modified clang to prevent
> rop chain stuff or something? I'm using a westmere based intell server.
>
> it's the same for clang 11.
>
> What's the deal?
1. Your test is too small. I increased the test size to get less error
in measurement. I also changed the code to either run quicksort or
quicksort depending on an argument.
2. I confirmed your observation using time on amd64, arm64 however
shows a difference more in line with expectations:
[otto@mc:8]$ time ./a.out A
quicksort
time = 0.335777021
0m00.35s real 0m00.35s user 0m00.01s system
[otto@mc:9]$ time ./a.out B
quicksort1
time = 0.345703033
0m00.36s real 0m00.36s user 0m00.01s system
I suspect some non-optimal decision of the optimizer on amd64 for the
first quicksort.
A next step could be somebody looking at the code produced by the
compiler. That is not going to be me.
-Otto
>
> -Luke
>
> /*
> clang sort_test2.c -O2 -o sort_test && ./sort_test
> */
>
> /*
> * quicksort should be slightly faster than quicksort1 because it is
> * identical except quicksort1 does more work using global counters.
> * quicsort1 runs 41% faster. Using gcc, quicksort is instead slightly faster.
> */
>
>
> /*
> * I'm using clang 10.0.1 on OpenBSD. these are my results:
>
>
> quicksort
> time = 0.004501771
>
>
>
> quicksort1
> time = 0.002655233
> */
>
>
> #include <err.h>
> #include <stdio.h>
> #include <stdlib.h>
> #include <string.h>
> #include <unistd.h>
> #include <sys/param.h>
>
> size_t m;
> size_t g;
> size_t i;
>
>
> /*
> * quicksort sorts characters on a u_char array.
> * 'a' is the first character on the array.
> * 'b' is last character on the array
> */
> static
> void quicksort(u_char a[], u_char b[])
> {
> u_char temp;
>
> if (b - a <= 1)
> {
> if (a == b)
> return;
>
> if (*a > *b)
> {
> temp = *a; *a = *b; *b = temp;
> }
> return;
> }
> u_char * j = a + 1;
> u_char * k = b;
> u_char u = *a;
>
> /*
> * This if() avoids repeated awkward special case branching
> * in the similar if() in the "while (++j < k)" loop
> */
> if (*j > u)
> {
> do
> {
> if (*k < u)
> {
> temp = *j; *j = *k; *k = temp;
> goto skip0;
> }
> } while (j < --k);
>
> quicksort(j, b);
> return;
>
> skip0:
> if (j == --k)
> {
> *a = *j; *j = u;
>
> quicksort(j+1, b);
> return;
> }
> }
>
>
> while (++j < k)
> {
> if (*j > u)
> {
> do
> {
> if (*k < u)
> {
> temp = *j; *j = *k; *k = temp;
> goto skip;
> }
> } while (j < --k);
>
> quicksort(j, b);
>
> *a = *--j; *j = u;
>
> quicksort(a, j-1);
> return;
>
> skip:
> if (j == --k)
> {
> *a = *j; *j = u;
>
> quicksort(a, j-1);
> quicksort(j+1, b);
> return;
> }
> }
> }
>
> if (*j > u)
> {
> quicksort(j, b);
>
> *a = *--j; *j = u;
>
> quicksort(a, j-1);
> }
> else
> {
> *a = *j; *j = u;
>
> quicksort(a, j-1);
>
> if (j == b)
> return;
>
> quicksort(j+1, b);
> }
> }
>
> /*
> * quicksort1 is identical to quicksort() but it alters some global counters
> * it should take slightly longer to run, but it runs in half the time
> */
> static
> void quicksort1(u_char a[], u_char b[], size_t level)
> {
> ++m;
> if (level > i)
> i = level;
>
> u_char temp;
>
>
> if (b - a <= 1)
> {
> ++g;
> if (a == b)
> return;
> if (*a > *b)
> {
> temp = *a; *a = *b; *b = temp;
> }
> return;
> }
> u_char * j = a + 1;
> u_char * k = b;
> u_char u = *a;
>
> /*
> * This if() avoids repeated awkward special case branching
> * in the similar if() in the "while (++j < k)" loop
> */
> if (*j > u)
> {
> do
> {
> if (*k < u)
> {
> temp = *j; *j = *k; *k = temp;
> goto skip0;
> }
> } while (j < --k);
>
> quicksort1(j, b, level + 1);
> return;
>
> skip0:
> if (j == --k)
> {
> *a = *j; *j = u;
>
> quicksort1(j+1, b, level + 1);
> return;
> }
> }
>
>
> while (++j < k)
> {
> if (*j > u)
> {
> do
> {
> if (*k < u)
> {
> temp = *j; *j = *k; *k = temp;
> goto skip;
> }
> } while (j < --k);
>
> quicksort1(j, b, level + 1);
>
> *a = *--j; *j = u;
>
> quicksort1(a, j-1, level + 1);
> return;
>
> skip:
> if (j == --k)
> {
> *a = *j; *j = u;
>
> quicksort1(a, j-1, level + 1);
> quicksort1(j+1, b, level + 1);
> return;
> }
> }
> }
>
> if (*j > u)
> {
> quicksort1(j, b, level + 1);
>
> *a = *--j; *j = u;
>
> quicksort1(a, j-1, level + 1);
> }
> else
> {
> *a = *j; *j = u;
>
> quicksort1(a, j-1, level + 1);
>
> if (j == b)
> return;
>
> quicksort1(j+1, b, level + 1);
> }
> }
>
>
> int main()
> {
>
> long double cpu_time_used;
> struct timespec start, end;
> u_char *buffer, *buffer2;
>
> size_t y;
> size_t n = 30000;
>
> buffer = (u_char*)malloc(n);
> if (buffer == NULL) err(1, "malloc");
> buffer2 = (u_char*)malloc(n);
> if (buffer2 == NULL) err(1, "malloc");
>
>
> for (y = 0; y < n; ++y)
> buffer[y] = rand() % 256;
>
> // run it once to clear any wrappers
> clock_gettime(CLOCK_PROCESS_CPUTIME_ID, &start);
>
>
> memcpy(buffer2, buffer, n);
>
> clock_gettime(CLOCK_PROCESS_CPUTIME_ID, &start);
> quicksort(buffer2, buffer2 + n - 1);
> clock_gettime(CLOCK_PROCESS_CPUTIME_ID, &end);
>
> cpu_time_used =
> (long double) (end.tv_sec - start.tv_sec) +
> (long double) (end.tv_nsec - start.tv_nsec) /
> (long double) 1000000000;
>
> printf("\n\nquicksort\n");
> printf("time = %.9Lf\n\n", cpu_time_used);
>
>
>
> memcpy(buffer2, buffer, n);
>
> i = m = g = 0;
> clock_gettime(CLOCK_PROCESS_CPUTIME_ID, &start);
> quicksort1(buffer2, buffer2 + n - 1, 1);
> clock_gettime(CLOCK_PROCESS_CPUTIME_ID, &end);
>
> cpu_time_used =
> (long double) (end.tv_sec - start.tv_sec) +
> (long double) (end.tv_nsec - start.tv_nsec) /
> (long double) 1000000000;
>
> printf("\n\nquicksort1\n");
> //~ printf("time = %.9Lf, max depth: %lu, calls: %lu, g: %lu\n\n\n",
> cpu_time_used, i, m, g);
> printf("time = %.9Lf\n", cpu_time_used);
>
>
> return 0;
> }