commit 84ad8821d456e8f7f40df43b7eb7245703004ce7
Author:     Mattias Andrée <[email protected]>
AuthorDate: Sat May 7 18:15:59 2016 +0200
Commit:     Mattias Andrée <[email protected]>
CommitDate: Sat May 7 18:15:59 2016 +0200

    Optimise libzahl_memcpy for clang
    
    Signed-off-by: Mattias Andrée <[email protected]>

diff --git a/INSTALL b/INSTALL
index 37dbfe4..a907afb 100644
--- a/INSTALL
+++ b/INSTALL
@@ -33,4 +33,6 @@ libzahl contains some (very little) assembly code. In the 
event
 that the used instructions are not supported on your machine, please
 report it, and in the meanwhile add -DZAHL_NO_ASM to CPPFLAGS. You
 make also have to do this if you are compiling with a compiler that
-does not support extended inline assembly.
+does not support extended inline assembly. You may also have to add
+    #define ZAHL_NO_ASM
+to your program before including <zahl.h>
diff --git a/STATUS b/STATUS
index a9f91b6..bb06ce6 100644
--- a/STATUS
+++ b/STATUS
@@ -18,7 +18,7 @@ processes are fixed to one CPU.
 
   The following functions are probably implemented optimally:
 
-zset .................... always fastest (gcc); until ~1200 (clang [can be 
fixed with assembly])
+zset .................... always fastest
 zseti(a, +) ............. tomsfastmath is faster
 zseti(a, -) ............. tomsfastmath is faster
 zsetu ................... tomsfastmath is faster
@@ -30,8 +30,20 @@ zodd .................... always fastest (shared with gmp)
 zeven_nonzero ........... always fastest (shared with gmp)
 zodd_nonzero ............ always fastest (shared with gmp)
 zbtest .................. always fastest
-zsave ................... always fastest [clang needs zset fix]
-zload ................... always fastest [clang needs zset fix]
+zsave ................... always fastest
+zload ................... always fastest
+
+
+  The following functions are probably implemented optimally, but
+  depend on other functions or call-cases for better performance:
+
+zneg(a, b) .............. always fastest
+zabs(a, b) .............. always fastest
+ztrunc(a, b, c) ......... always fastest
+zbset(a, b, 1) .......... always fastest
+zbset(a, b, 0) .......... always fastest
+zbset(a, b, -1) ......... always fastest
+zsplit .................. alternating with gmp for fastest, but gmp is a bit 
faster on average
 
 
   The following functions are probably implemented close to
@@ -40,26 +52,14 @@ zload ................... always fastest [clang needs zset 
fix]
 zadd_unsigned ........... fastest after ~140 (depends on cc and libc) compared 
against zadd too
 ztrunc(a, a, b) ......... fastest until ~100, then 77 % (gcc) or 68 % (clang) 
of tomsfastmath
 zbset(a, a, 1) .......... always fastest
-zbset(a, a, 0) .......... always fastest (faster with clang than gcc)
+zbset(a, a, 0) .......... always fastest
 zbset(a, a, -1) ......... always fastest (only marginally faster than gmp with 
clang)
 zlsb .................... always fastest <<suspicious>>
 zlsh .................... not too fast anymore
-zand .................... fastest after ~400, tomsfastmath before (gcc+glibc 
is slow)
-zor ..................... fastest after ~1150, tomsfastmath before (gcc+glibc 
is slow)
-zxor .................... fastest after ~400, tomsfastmath before (clang), gcc 
is slow
-znot .................... always fastest (faster with musl than glibc)
-
-
-  The following functions are probably implemented optimally, but
-  depends on other functions or call-cases for better performance:
-
-zneg(a, b) .............. always fastest (gcc+musl); gcc is a bit slow [clang 
needs zset fix]
-zabs(a, b) .............. always fastest (gcc+musl); gcc is a bit slow [clang 
needs zset fix]
-ztrunc(a, b, c) ......... always fastest (gcc+musl); gcc is a bit slow [clang 
needs zset fix]
-zbset(a, b, 1) .......... always fastest (gcc+musl); gcc is a bit slow [clang 
needs zset fix]
-zbset(a, b, 0) .......... always fastest (gcc+musl); gcc is a bit slow [clang 
needs zset fix]
-zbset(a, b, -1) ......... always fastest (gcc+musl); gcc is a bit slow [clang 
needs zset fix]
-zsplit .................. alternating with gmp for fastest (clang and glibc is 
slower)
+zand .................... fastest after ~400, tomsfastmath before
+zor ..................... fastest after ~1150, tomsfastmath before
+zxor .................... alternating with gmp after ~700, tomsfastmath before 
(musl), a bit slow with glibc
+znot .................... always fastest
 
 
   The following functions require structural changes for
diff --git a/zahl/memory.h b/zahl/memory.h
index b3d6a37..797beab 100644
--- a/zahl/memory.h
+++ b/zahl/memory.h
@@ -34,16 +34,47 @@
 
 
 ZAHL_INLINE void
-libzahl_memcpy(register zahl_char_t *restrict d, register const zahl_char_t 
*restrict s, size_t n)
+libzahl_memcpy(register zahl_char_t *restrict d, register const zahl_char_t 
*restrict s, register size_t n)
 {
-       size_t i;
 #define LIBZAHL_X(I)  case I:  d[I - 1] = s[I - 1];
        LIBZAHL_SMALL_INPUT_BEGIN(n);
-       for (i = 0; i < n; i += 4) {
-               d[i + 0] = s[i + 0];
-               d[i + 1] = s[i + 1];
-               d[i + 2] = s[i + 2];
-               d[i + 3] = s[i + 3];
+       {
+#if defined(__x86_64__) && !defined(ZAHL_NO_ASM)
+               /* This crap is needed for clang. */
+               register zahl_char_t t;
+               __asm__ __volatile__ (
+# if defined(ZAHL_ISA_MISSING_INDIRECT_JUMP)
+                       "\n    testq %[e], %[e]"
+                       "\n    jz 2f"
+# endif
+                       "\n    shlq $3, %[e]"
+                       "\n    addq %[d], %[e]"
+                       "\n 1:"
+                       "\n    movq 0(%[s]), %[t]"
+                       "\n    movq %[t], 0(%[d])"
+                       "\n    movq 8(%[s]), %[t]"
+                       "\n    movq %[t], 8(%[d])"
+                       "\n    movq 16(%[s]), %[t]"
+                       "\n    movq %[t], 16(%[d])"
+                       "\n    movq 24(%[s]), %[t]"
+                       "\n    movq %[t], 24(%[d])"
+                       "\n    addq $32, %[s]"
+                       "\n    addq $32, %[d]"
+                       "\n    cmpq %[e], %[d]"
+                       "\n    jl 1b"
+# if defined(ZAHL_ISA_MISSING_INDIRECT_JUMP)
+                       "\n 2:"
+# endif
+                       : [t]"=r"(t), [d]"+r"(d), [s]"+r"(s), [e]"+r"(n));
+#else
+               size_t i;
+               for (i = 0; i < n; i += 4) {
+                       d[i + 0] = s[i + 0];
+                       d[i + 1] = s[i + 1];
+                       d[i + 2] = s[i + 2];
+                       d[i + 3] = s[i + 3];
+               }
+#endif
        }
        LIBZAHL_SMALL_INPUT_END;
 #undef LIBZAHL_X

Reply via email to