commit 93bf9e5b4bf63708c732f5bf07619d2e59c81ec4
Author:     Mattias Andrée <[email protected]>
AuthorDate: Thu May 5 02:41:50 2016 +0200
Commit:     Mattias Andrée <[email protected]>
CommitDate: Thu May 5 02:41:50 2016 +0200

    Optimise zadd on x86-64
    
    Signed-off-by: Mattias Andrée <[email protected]>

diff --git a/src/zadd.c b/src/zadd.c
index a78a918..8efdf19 100644
--- a/src/zadd.c
+++ b/src/zadd.c
@@ -2,20 +2,79 @@
 #include "internals.h"
 
 
+#if defined(__x86_64__)
+# define ASM3(code)  \
+       __asm__ __volatile__ (code : "+d"(carry) : "a"(ac + i), "b"(bc + i), "c"(cc + i))
+
+# define ASM2(code)  \
+       __asm__ __volatile__ (code : "+d"(carry) : "a"(ac + i), "b"(bc + i))
+
+# define ADD2(off)                         \
+       "\n    movq "#off"(%%rbx), %%rdx"  \
+       "\n    adcq %%rdx, "#off"(%%rax)"
+
+# define ADD3(off)                         \
+       "\n    movq "#off"(%%rbx), %%rdx"  \
+       "\n    adcq "#off"(%%rcx), %%rdx"  \
+       "\n    movq %%rdx, "#off"(%%rax)"
+
+# define WRAP_CARRY(interior)   \
+       "\n    clc"             \
+       "\n    cmpq $0, %%rdx"  \
+       "\n    je 1f"           \
+       "\n    stc"             \
+       "\n 1:"                 \
+       interior                \
+       "\n    movq $1, %%rdx"  \
+       "\n    jc 1f"           \
+       "\n    movq $0, %%rdx"  \
+       "\n 1:"
+#endif
+
+
 static inline void
 zadd_impl_4(z_t a, z_t b, z_t c, size_t n)
 {
-       zahl_char_t carry = 0, tcarry;
+       zahl_char_t carry = 0, *ac = a->chars, *bc = b->chars, *cc = c->chars;
        size_t i;
 
+#if defined(__x86_64__)
+       for (i = 0; (i += 4) <= n;)
+               ASM3(WRAP_CARRY(ADD3(-32) ADD3(-24) ADD3(-16) ADD3(-8)));
+       if (i > n) {
+               i -= 4;
+               switch (n & 3) {
+               case 3:
+                       ASM3(WRAP_CARRY(ADD3(0) ADD3(8) ADD3(16)));
+                       break;
+               case 2:
+                       ASM3(WRAP_CARRY(ADD3(0) ADD3(8)));
+                       break;
+               case 1:
+                       ASM3(WRAP_CARRY(ADD3(0)));
+                       break;
+               default:
+                       break;
+               }
+       }
+       i = n;
+
+       while (carry) {
+               carry = libzahl_add_overflow(ac + i, ac[i], 1);
+               i++;
+       }
+#else
+       zahl_char_t tcarry;
+
        for (i = 0; i < n; i++) {
-               tcarry = libzahl_add_overflow(a->chars + i, b->chars[i], c->chars[i]);
-               carry = tcarry | (zahl_char_t)libzahl_add_overflow(a->chars + i, a->chars[i], carry);
+               tcarry = libzahl_add_overflow(ac + i, bc[i], cc[i]);
+               carry = tcarry | (zahl_char_t)libzahl_add_overflow(ac + i, ac[i], carry);
        }
        while (carry) {
-               carry = libzahl_add_overflow(a->chars + i, a->chars[i], 1);
+               carry = libzahl_add_overflow(ac + i, ac[i], 1);
                i++;
        }
+#endif
 
        if (a->used < i)
                a->used = i;
@@ -24,7 +83,40 @@ zadd_impl_4(z_t a, z_t b, z_t c, size_t n)
 static inline void
 zadd_impl_3(z_t a, z_t b, size_t n)
 {
+#if defined(__x86_64__)
+       zahl_char_t carry = 0, *ac = a->chars, *bc = b->chars;
+       size_t i;
+
+       for (i = 0; (i += 4) <= n;)
+               ASM2(WRAP_CARRY(ADD2(-32) ADD2(-24) ADD2(-16) ADD2(-8)));
+       if (i > n) {
+               i -= 4;
+               switch (n & 3) {
+               case 3:
+                       ASM2(WRAP_CARRY(ADD2(0) ADD2(8) ADD2(16)));
+                       break;
+               case 2:
+                       ASM2(WRAP_CARRY(ADD2(0) ADD2(8)));
+                       break;
+               case 1:
+                       ASM2(WRAP_CARRY(ADD2(0)));
+                       break;
+               default:
+                       break;
+               }
+       }
+       i = n;
+
+       while (carry) {
+               carry = libzahl_add_overflow(ac + i, ac[i], 1);
+               i++;
+       }
+
+       if (a->used < i)
+               a->used = i;
+#else
        zadd_impl_4(a, a, b, n);
+#endif
 }
 
 static inline void

Reply via email to