On Sun, Nov 07, 2021 at 08:13:36PM -0600, Luke Small wrote: > https://bugs.llvm.org/show_bug.cgi?id=50026 > > I reported it to the llvm people. it is two slightly different quicksort > algorithms which perform radically differently. The one which you could > assume would take more time, performs MUCH better. > > I made a custom quicksort algorithm which outperforms qsort by A LOT for > sorting an array of around 300 randomly created unsigned characters, which > is what I use it for. > > if you read the report a guy said there's a 10% difference for sorting 3 > million characters on freebsd, but there's about 40% performance difference > on OpenBSD. maybe it's also how the OpenBSD team modified clang to prevent > rop chain stuff or something? I'm using a westmere based intell server. > > it's the same for clang 11. > > What's the deal?
1. Your test is too small. I increased the test size to get less error in measurement. I also changed the code to either run quicksort or quicksort depending on an argument. 2. I confirmed your observation using time on amd64, arm64 however shows a difference more in line with expectations: [otto@mc:8]$ time ./a.out A quicksort time = 0.335777021 0m00.35s real 0m00.35s user 0m00.01s system [otto@mc:9]$ time ./a.out B quicksort1 time = 0.345703033 0m00.36s real 0m00.36s user 0m00.01s system I suspect some non-optimal decision of the optimizer on amd64 for the first quicksort. A next step could be somebody looking at the code produced by the compiler. That is not going to be me. -Otto > > -Luke > > /* > clang sort_test2.c -O2 -o sort_test && ./sort_test > */ > > /* > * quicksort should be slightly faster than quicksort1 because it is > * identical except quicksort1 does more work using global counters. > * quicsort1 runs 41% faster. Using gcc, quicksort is instead slightly faster. > */ > > > /* > * I'm using clang 10.0.1 on OpenBSD. these are my results: > > > quicksort > time = 0.004501771 > > > > quicksort1 > time = 0.002655233 > */ > > > #include <err.h> > #include <stdio.h> > #include <stdlib.h> > #include <string.h> > #include <unistd.h> > #include <sys/param.h> > > size_t m; > size_t g; > size_t i; > > > /* > * quicksort sorts characters on a u_char array. > * 'a' is the first character on the array. > * 'b' is last character on the array > */ > static > void quicksort(u_char a[], u_char b[]) > { > u_char temp; > > if (b - a <= 1) > { > if (a == b) > return; > > if (*a > *b) > { > temp = *a; *a = *b; *b = temp; > } > return; > } > u_char * j = a + 1; > u_char * k = b; > u_char u = *a; > > /* > * This if() avoids repeated awkward special case branching > * in the similar if() in the "while (++j < k)" loop > */ > if (*j > u) > { > do > { > if (*k < u) > { > temp = *j; *j = *k; *k = temp; > goto skip0; > } > } while (j < --k); > > quicksort(j, b); > return; > > skip0: > if (j == --k) > { > *a = *j; *j = u; > > quicksort(j+1, b); > return; > } > } > > > while (++j < k) > { > if (*j > u) > { > do > { > if (*k < u) > { > temp = *j; *j = *k; *k = temp; > goto skip; > } > } while (j < --k); > > quicksort(j, b); > > *a = *--j; *j = u; > > quicksort(a, j-1); > return; > > skip: > if (j == --k) > { > *a = *j; *j = u; > > quicksort(a, j-1); > quicksort(j+1, b); > return; > } > } > } > > if (*j > u) > { > quicksort(j, b); > > *a = *--j; *j = u; > > quicksort(a, j-1); > } > else > { > *a = *j; *j = u; > > quicksort(a, j-1); > > if (j == b) > return; > > quicksort(j+1, b); > } > } > > /* > * quicksort1 is identical to quicksort() but it alters some global counters > * it should take slightly longer to run, but it runs in half the time > */ > static > void quicksort1(u_char a[], u_char b[], size_t level) > { > ++m; > if (level > i) > i = level; > > u_char temp; > > > if (b - a <= 1) > { > ++g; > if (a == b) > return; > if (*a > *b) > { > temp = *a; *a = *b; *b = temp; > } > return; > } > u_char * j = a + 1; > u_char * k = b; > u_char u = *a; > > /* > * This if() avoids repeated awkward special case branching > * in the similar if() in the "while (++j < k)" loop > */ > if (*j > u) > { > do > { > if (*k < u) > { > temp = *j; *j = *k; *k = temp; > goto skip0; > } > } while (j < --k); > > quicksort1(j, b, level + 1); > return; > > skip0: > if (j == --k) > { > *a = *j; *j = u; > > quicksort1(j+1, b, level + 1); > return; > } > } > > > while (++j < k) > { > if (*j > u) > { > do > { > if (*k < u) > { > temp = *j; *j = *k; *k = temp; > goto skip; > } > } while (j < --k); > > quicksort1(j, b, level + 1); > > *a = *--j; *j = u; > > quicksort1(a, j-1, level + 1); > return; > > skip: > if (j == --k) > { > *a = *j; *j = u; > > quicksort1(a, j-1, level + 1); > quicksort1(j+1, b, level + 1); > return; > } > } > } > > if (*j > u) > { > quicksort1(j, b, level + 1); > > *a = *--j; *j = u; > > quicksort1(a, j-1, level + 1); > } > else > { > *a = *j; *j = u; > > quicksort1(a, j-1, level + 1); > > if (j == b) > return; > > quicksort1(j+1, b, level + 1); > } > } > > > int main() > { > > long double cpu_time_used; > struct timespec start, end; > u_char *buffer, *buffer2; > > size_t y; > size_t n = 30000; > > buffer = (u_char*)malloc(n); > if (buffer == NULL) err(1, "malloc"); > buffer2 = (u_char*)malloc(n); > if (buffer2 == NULL) err(1, "malloc"); > > > for (y = 0; y < n; ++y) > buffer[y] = rand() % 256; > > // run it once to clear any wrappers > clock_gettime(CLOCK_PROCESS_CPUTIME_ID, &start); > > > memcpy(buffer2, buffer, n); > > clock_gettime(CLOCK_PROCESS_CPUTIME_ID, &start); > quicksort(buffer2, buffer2 + n - 1); > clock_gettime(CLOCK_PROCESS_CPUTIME_ID, &end); > > cpu_time_used = > (long double) (end.tv_sec - start.tv_sec) + > (long double) (end.tv_nsec - start.tv_nsec) / > (long double) 1000000000; > > printf("\n\nquicksort\n"); > printf("time = %.9Lf\n\n", cpu_time_used); > > > > memcpy(buffer2, buffer, n); > > i = m = g = 0; > clock_gettime(CLOCK_PROCESS_CPUTIME_ID, &start); > quicksort1(buffer2, buffer2 + n - 1, 1); > clock_gettime(CLOCK_PROCESS_CPUTIME_ID, &end); > > cpu_time_used = > (long double) (end.tv_sec - start.tv_sec) + > (long double) (end.tv_nsec - start.tv_nsec) / > (long double) 1000000000; > > printf("\n\nquicksort1\n"); > //~ printf("time = %.9Lf, max depth: %lu, calls: %lu, g: %lu\n\n\n", > cpu_time_used, i, m, g); > printf("time = %.9Lf\n", cpu_time_used); > > > return 0; > }