Am 23.01.2017 um 20:07 schrieb Junio C Hamano:
> René Scharfe <[email protected]> writes:
>
>> Implement qsort_s() as a wrapper to the GNU version of qsort_r(1) and
>> use it on Linux. Performance increases slightly:
>>
>> Test HEAD^ HEAD
>> --------------------------------------------------------------------
>> 0071.2: sort(1) 0.10(0.20+0.02) 0.10(0.21+0.01) +0.0%
>> 0071.3: string_list_sort() 0.17(0.15+0.01) 0.16(0.15+0.01) -5.9%
>>
>> Additionally the unstripped size of compat/qsort_s.o falls from 24576
>> to 16544 bytes in my build.
>>
>> IMHO these savings aren't worth the increased complexity of having to
>> support two implementations.
>
> I do worry about having to support more implementations in the
> future that have different function signature for the comparison
> callbacks, which will make things ugly, but this addition alone
> doesn't look too bad to me.
It is unfair of me to show a 5% speedup and then recommend to not
include it. ;-) That difference won't be measurable in real use cases
and the patch is not necessary. This patch is simple, but the overall
complexity (incl. #ifdefs etc.) will be lower without it.
But here's another one, with even higher performance and with an even
bigger recommendation to not include it! :) It veers off into another
direction: Parallel execution. It requires thread-safe comparison
functions, which might surprise callers. The value 1000 for the minimum
number of items before threading kicks in is just a guess, not based on
measurements. So it's quite raw -- and I'm not sure why it's still a
bit slower than sort(1).
Test HEAD^ HEAD
---------------------------------------------------------------------
0071.2: sort(1) 0.10(0.18+0.03) 0.10(0.20+0.02) +0.0%
0071.3: string_list_sort() 0.17(0.14+0.02) 0.11(0.18+0.02) -35.3%
---
compat/qsort_s.c | 76 ++++++++++++++++++++++++++++++++++++++++++--------------
1 file changed, 58 insertions(+), 18 deletions(-)
diff --git a/compat/qsort_s.c b/compat/qsort_s.c
index 52d1f0a73d..1304a089af 100644
--- a/compat/qsort_s.c
+++ b/compat/qsort_s.c
@@ -1,4 +1,5 @@
#include "../git-compat-util.h"
+#include "../thread-utils.h"
/*
* A merge sort implementation, simplified from the qsort implementation
@@ -6,29 +7,58 @@
* Added context pointer, safety checks and return value.
*/
-static void msort_with_tmp(void *b, size_t n, size_t s,
- int (*cmp)(const void *, const void *, void *),
- char *t, void *ctx)
+#define MIN_ITEMS_FOR_THREAD 1000
+
+struct work {
+ int nr_threads;
+ void *base;
+ size_t nmemb;
+ size_t size;
+ char *tmp;
+ int (*cmp)(const void *, const void *, void *);
+ void *ctx;
+};
+
+static void *msort_with_tmp(void *work)
{
+ struct work one, two, *w = work;
char *tmp;
char *b1, *b2;
size_t n1, n2;
+ size_t s, n;
- if (n <= 1)
- return;
+ if (w->nmemb <= 1)
+ return NULL;
- n1 = n / 2;
- n2 = n - n1;
- b1 = b;
- b2 = (char *)b + (n1 * s);
+ one = two = *w;
+ one.nr_threads /= 2;
+ two.nr_threads -= one.nr_threads;
+ n = one.nmemb;
+ s = one.size;
+ n1 = one.nmemb = n / 2;
+ n2 = two.nmemb = n - n1;
+ b1 = one.base;
+ b2 = two.base = b1 + n1 * s;
+ two.tmp += n1 * s;
- msort_with_tmp(b1, n1, s, cmp, t, ctx);
- msort_with_tmp(b2, n2, s, cmp, t, ctx);
+#ifndef NO_PTHREADS
+ if (one.nr_threads && n > MIN_ITEMS_FOR_THREAD) {
+ pthread_t thread;
+ int err = pthread_create(&thread, NULL, msort_with_tmp, &one);
+ msort_with_tmp(&two);
+ if (err || pthread_join(thread, NULL))
+ msort_with_tmp(&one);
+ } else
+#endif
+ {
+ msort_with_tmp(&one);
+ msort_with_tmp(&two);
+ }
- tmp = t;
+ tmp = one.tmp;
while (n1 > 0 && n2 > 0) {
- if (cmp(b1, b2, ctx) <= 0) {
+ if (one.cmp(b1, b2, one.ctx) <= 0) {
memcpy(tmp, b1, s);
tmp += s;
b1 += s;
@@ -42,7 +72,8 @@ static void msort_with_tmp(void *b, size_t n, size_t s,
}
if (n1 > 0)
memcpy(tmp, b1, n1 * s);
- memcpy(b, t, (n - n2) * s);
+ memcpy(one.base, one.tmp, (n - n2) * s);
+ return NULL;
}
int git_qsort_s(void *b, size_t n, size_t s,
@@ -50,20 +81,29 @@ int git_qsort_s(void *b, size_t n, size_t s,
{
const size_t size = st_mult(n, s);
char buf[1024];
+ struct work w;
if (!n)
return 0;
if (!b || !cmp)
return -1;
+ w.nr_threads = online_cpus();
+ w.base = b;
+ w.nmemb = n;
+ w.size = s;
+ w.cmp = cmp;
+ w.ctx = ctx;
+
if (size < sizeof(buf)) {
/* The temporary array fits on the small on-stack buffer. */
- msort_with_tmp(b, n, s, cmp, buf, ctx);
+ w.tmp = buf;
} else {
/* It's somewhat large, so malloc it. */
- char *tmp = xmalloc(size);
- msort_with_tmp(b, n, s, cmp, tmp, ctx);
- free(tmp);
+ w.tmp = xmalloc(size);
}
+ msort_with_tmp(&w);
+ if (w.tmp != buf)
+ free(w.tmp);
return 0;
}
--
2.11.0