commit 9437becf6d8aa4d9a3872b2cd6b353dc4c90a1cb
Author:     Mattias Andrée <[email protected]>
AuthorDate: Thu May 5 21:11:43 2016 +0200
Commit:     Mattias Andrée <[email protected]>
CommitDate: Thu May 5 21:13:16 2016 +0200

    Optimisations
    
    Signed-off-by: Mattias Andrée <[email protected]>

diff --git a/STATUS b/STATUS
index a5f664e..a0fb7d2 100644
--- a/STATUS
+++ b/STATUS
@@ -1,3 +1,21 @@
+The following functions are probably implemented optimally:
+
+zswap ................... always fastest
+zzero ................... always fastest (shared with gmp)
+zsignum ................. always fastest (shared with gmp)
+zeven ................... always fastest
+zodd .................... always fastest
+zeven_nonzero ........... always fastest
+zodd_nonzero ............ always fastest (shared with gmp)
+zbtest .................. always fastest
+
+
+The following functions are probably implemented close to
+optimally, further optimisation should not be a priority:
+
+zadd_unsigned ........... fastest after ~70 compared against zadd too (x86-64)
+
+
 Optimisation progress for libzahl, compared to other big integer
 libraries. These comparisons are for 152-bit integers. Functions
 in parenthesis the right column are functions that needs
@@ -10,26 +28,18 @@ zset .................... fastest [always]
 zseti ................... tomsfastmath is faster [always]
 zsetu ................... tomsfastmath is faster [always]
 zneg(a, b) .............. fastest [always]
-zneg(a, a) .............. fastest [always] (shared with gmp)
+zneg(a, a) .............. fastest [always] (shared with gmp; faster with clang)
 zabs(a, b) .............. fastest [always]
 zabs(a, a) .............. tomsfastmath is faster [always]
-zadd_unsigned ........... fastest [always]
-zsub_unsigned ........... fastest [always]
-zadd .................... fastest [after ~100, tomsfastmath before] (shared 
with gmp)
+zsub_unsigned ........... fastest [always] (compared against zsub too)
+zadd .................... fastest [after ~110, tomsfastmath before] (x86-64)
 zsub .................... fastest [always]
 zand .................... 77 % of tomsfastmath [until ~900, alternating with 
gmp]
 zor ..................... 65 % of tomsfastmath [until ~1750, alternating with 
gmp (gcc) and tomsfastmath (clang)]
 zxor .................... 87 % of tomsfastmath [until ~700, alternating with 
gmp (gcc+clangs),]
 znot .................... fastest [always]
-zeven ................... fastest [always]
-zodd .................... fastest [always]
-zeven_nonzero ........... fastest [always]
-zodd_nonzero ............ fastest [always]
-zzero ................... fastest [always] (shared with gmp)
-zsignum ................. fastest [always] (shared with gmp)
 zbits ................... fastest [always]
 zlsb .................... fastest [always]
-zswap ................... fastest [always]
 zlsh .................... fastest [until ~1000, then gmp]
 zrsh .................... fastest [almost never]
 ztrunc(a, b, c) ......... fastest [always; alternating with gmp between 
1400~3000 (clang)]
@@ -46,7 +56,6 @@ zbset(a, b, 0) .......... fastest [always]
 zbset(a, a, 0) .......... fastest [always]
 zbset(a, b, -1) ......... fastest [always]
 zbset(a, a, -1) ......... fastest [always]
-zbtest .................. fastest [always]
 zgcd .................... 21 % of gmp (zcmpmag)
 zmul .................... slowest
 zsqr .................... slowest (zmul)
diff --git a/src/zadd.c b/src/zadd.c
index 8efdf19..5eb7050 100644
--- a/src/zadd.c
+++ b/src/zadd.c
@@ -4,116 +4,94 @@
 
 #if defined(__x86_64__)
 # define ASM3(code)  \
-       __asm__ __volatile__ (code : "+d"(carry) : "a"(ac + i), "b"(bc + i), 
"c"(cc + i))
+       __asm__ __volatile__ (code : [x]"+r"(carry), [a]"+r"(ac), [b]"+r"(bc), 
[c]"+r"(cc))
 
 # define ASM2(code)  \
-       __asm__ __volatile__ (code : "+d"(carry) : "a"(ac + i), "b"(bc + i))
+       __asm__ __volatile__ (code : [x]"+r"(carry), [a]"+r"(ac), [b]"+r"(bc))
 
-# define ADD2(off)                         \
-       "\n    movq "#off"(%%rbx), %%rdx"  \
-       "\n    adcq %%rdx, "#off"(%%rax)"
+# define ADD2(off)                       \
+       "\n    movq "#off"(%[b]), %[x]"  \
+       "\n    adcq %[x], "#off"(%[a])"
 
-# define ADD3(off)                         \
-       "\n    movq "#off"(%%rbx), %%rdx"  \
-       "\n    adcq "#off"(%%rcx), %%rdx"  \
-       "\n    movq %%rdx, "#off"(%%rax)"
+# define ADD3(off)                       \
+       "\n    movq "#off"(%[b]), %[x]"  \
+       "\n    adcq "#off"(%[c]), %[x]"  \
+       "\n    movq %[x], "#off"(%[a])"
 
 # define WRAP_CARRY(interior)   \
-       "\n    clc"             \
-       "\n    cmpq $0, %%rdx"  \
-       "\n    je 1f"           \
-       "\n    stc"             \
-       "\n 1:"                 \
+       "\n    addq $-1, %[x]"  \
        interior                \
-       "\n    movq $1, %%rdx"  \
+       "\n    movq $1, %[x]"   \
        "\n    jc 1f"           \
-       "\n    movq $0, %%rdx"  \
+       "\n    movq $0, %[x]"   \
        "\n 1:"
+
+# define ASM_ADD(N)                                                            
              \
+       do {                                                                    
             \
+               register zahl_char_t carry = 0;                                 
             \
+               size_t i;                                                       
             \
+               for (i = 0; (INC(4)), (i += 4) <= n;)                           
             \
+                       ASM##N(WRAP_CARRY(ADD##N(-32) ADD##N(-24) ADD##N(-16) 
ADD##N(-8)));  \
+               switch (n & 3) {                                                
             \
+               case 3:                                                         
             \
+                       ASM##N(WRAP_CARRY(ADD##N(-32) ADD##N(-24) 
ADD##N(-16)));             \
+                       break;                                                  
             \
+               case 2:                                                         
             \
+                       ASM##N(WRAP_CARRY(ADD##N(-32) ADD##N(-24)));            
             \
+                       break;                                                  
             \
+               case 1:                                                         
             \
+                       ASM##N(WRAP_CARRY(ADD##N(-32)));                        
             \
+                       break;                                                  
             \
+               default:                                                        
             \
+                       break;                                                  
             \
+               }                                                               
             \
+               i = n;                                                          
             \
+               while (carry) {                                                 
             \
+                       carry = libzahl_add_overflow(a->chars + i, a->chars[i], 
1);          \
+                       i++;                                                    
             \
+               }                                                               
             \
+               if (a->used < i)                                                
             \
+                       a->used = i;                                            
             \
+       } while (0)
 #endif
 
 
 static inline void
 zadd_impl_4(z_t a, z_t b, z_t c, size_t n)
 {
-       zahl_char_t carry = 0, *ac = a->chars, *bc = b->chars, *cc = c->chars;
-       size_t i;
-
-#if defined(__x86_64__)
-       for (i = 0; (i += 4) <= n;)
-               ASM3(WRAP_CARRY(ADD3(-32) ADD3(-24) ADD3(-16) ADD3(-8)));
-       if (i > n) {
-               i -= 4;
-               switch (n & 3) {
-               case 3:
-                       ASM3(WRAP_CARRY(ADD3(0) ADD3(8) ADD3(16)));
-                       break;
-               case 2:
-                       ASM3(WRAP_CARRY(ADD3(0) ADD3(8)));
-                       break;
-               case 1:
-                       ASM3(WRAP_CARRY(ADD3(0)));
-                       break;
-               default:
-                       break;
-               }
-       }
-       i = n;
-
-       while (carry) {
-               carry = libzahl_add_overflow(ac + i, ac[i], 1);
-               i++;
-       }
+#ifdef ASM_ADD
+       register zahl_char_t *ac = a->chars, *bc = b->chars, *cc = c->chars;
+# define INC(P)  (ac += (P), bc += (P), cc += (P))
+       ASM_ADD(3);
+# undef INC
 #else
-       zahl_char_t tcarry;
+       zahl_char_t carry = 0, tcarry;
+       zahl_char_t *ac = a->chars, *bc = b->chars, *cc = c->chars;
+       size_t i;
 
        for (i = 0; i < n; i++) {
                tcarry = libzahl_add_overflow(ac + i, bc[i], cc[i]);
                carry = tcarry | (zahl_char_t)libzahl_add_overflow(ac + i, 
ac[i], carry);
        }
+
        while (carry) {
                carry = libzahl_add_overflow(ac + i, ac[i], 1);
                i++;
        }
-#endif
 
        if (a->used < i)
                a->used = i;
+#endif
 }
 
 static inline void
 zadd_impl_3(z_t a, z_t b, size_t n)
 {
-#if defined(__x86_64__)
-       zahl_char_t carry = 0, *ac = a->chars, *bc = b->chars;
-       size_t i;
-
-       for (i = 0; (i += 4) <= n;)
-               ASM2(WRAP_CARRY(ADD2(-32) ADD2(-24) ADD2(-16) ADD2(-8)));
-       if (i > n) {
-               i -= 4;
-               switch (n & 3) {
-               case 3:
-                       ASM2(WRAP_CARRY(ADD2(0) ADD2(8) ADD2(16)));
-                       break;
-               case 2:
-                       ASM2(WRAP_CARRY(ADD2(0) ADD2(8)));
-                       break;
-               case 1:
-                       ASM2(WRAP_CARRY(ADD2(0)));
-                       break;
-               default:
-                       break;
-               }
-       }
-       i = n;
-
-       while (carry) {
-               carry = libzahl_add_overflow(ac + i, ac[i], 1);
-               i++;
-       }
-
-       if (a->used < i)
-               a->used = i;
+#ifdef ASM_ADD
+       register zahl_char_t *ac = a->chars, *bc = b->chars;
+# define INC(P)  (ac += (P), bc += (P))
+       ASM_ADD(2);
+# undef INC
 #else
        zadd_impl_4(a, a, b, n);
 #endif
diff --git a/zahl/internals.h b/zahl/internals.h
index b7cd230..e0aa248 100644
--- a/zahl/internals.h
+++ b/zahl/internals.h
@@ -2,11 +2,7 @@
 
 #ifndef ZAHL_INLINE
 # if defined(__STDC_VERSION__) && __STDC_VERSION__ >= 199901L
-#  if defined(__GNUC__) || defined(__clang__)
-#   define ZAHL_INLINE  __attribute__((__always_inline__, __gnu_inline__)) 
static inline
-#  else
-#   define ZAHL_INLINE  static inline
-#  endif
+#  define ZAHL_INLINE  static inline
 # else
 #  define ZAHL_INLINE  static
 # endif

Reply via email to