commit fdc75a358e4c20116640c08f4e8ce7a09dc3cebd
Author:     Mattias Andrée <[email protected]>
AuthorDate: Sat May 7 03:02:56 2016 +0200
Commit:     Mattias Andrée <[email protected]>
CommitDate: Sat May 7 03:02:56 2016 +0200

    Optimisations
    
    Signed-off-by: Mattias Andrée <[email protected]>

diff --git a/Makefile b/Makefile
index cb3cbdf..d8bff8c 100644
--- a/Makefile
+++ b/Makefile
@@ -2,7 +2,8 @@ include config.mk
 
 HDR_SEMIPUBLIC =\
        zahl/inlines.h\
-       zahl/internals.h
+       zahl/internals.h\
+       zahl/memory.h
 
 HDR_PRIVATE =\
        src/internals.h
diff --git a/src/allocator.c b/src/allocator.c
index 4f81171..d03ff17 100644
--- a/src/allocator.c
+++ b/src/allocator.c
@@ -21,7 +21,7 @@ libzahl_realloc(z_t a, size_t need)
                zfree(a);
                a->chars = new;
        } else {
-               a->chars = realloc(a->chars, new_size * sizeof(zahl_char_t));
+               a->chars = realloc(a->chars, (new_size + ZAHL_FLUFF) * sizeof(zahl_char_t));
                if (check(!a->chars))
                        libzahl_memfailure();
        }
diff --git a/src/internals.h b/src/internals.h
index f97a789..2f42269 100644
--- a/src/internals.h
+++ b/src/internals.h
@@ -4,7 +4,6 @@
 #include <errno.h>
 #include <stdlib.h>
 #include <string.h>
-#include <unistd.h>
 
 /* clang pretends to be GCC... */
 #if defined(__GNUC__) && defined(__clang__)
@@ -105,7 +104,11 @@ extern void *libzahl_temp_allocation;
 #define zpositive2(a, b)             (zsignum(a) + zsignum(b) == 2)
 #define zzero2(a, b)                 (!(zsignum(a) | zsignum(b)))
 #define zmemcpy(d, s, n)             libzahl_memcpy(d, s, n)
+#define zmemmove(d, s, n)            libzahl_memmove(d, s, n)
+#define zmemmovef(d, s, n)           libzahl_memmovef(d, s, n)
+#define zmemmoveb(d, s, n)           libzahl_memmoveb(d, s, n)
 #define zmemset(a, v, n)             libzahl_memset(a, v, n)
+#define zmemset_precise(a, v, n)     libzahl_memset_precise(a, v, n)
 
 static inline int
 zzero1(z_t a, z_t b)
@@ -113,11 +116,10 @@ zzero1(z_t a, z_t b)
        return zzero(a) || zzero(b);
 }
 
-O2 static inline void
+static inline void
 zmemcpy_range(register zahl_char_t *restrict d, register const zahl_char_t *restrict s, size_t i, size_t n)
 {
-       for (; i < n; i++)
-               d[i] = s[i];
+       zmemcpy(d + i, s + i, n - i);
 }
 
 static void
@@ -337,7 +339,7 @@ zfree_temp(z_t a)
 
 /* } */
 
-#define ZMEM_2OP(a, b, c, n, OP)                                              \
+#define ZMEM_2OP_PRECISE(a, b, c, n, OP)                                      \
        do {                                                                  \
                zahl_char_t *a__ = (a);                                       \
                const zahl_char_t *b__ = (b);                                 \
@@ -365,66 +367,29 @@ zfree_temp(z_t a)
                }                                                             \
        } while (0)
 
-#define ZMEM_1OP(a, b, n, OP)                                     \
-       do {                                                      \
-               zahl_char_t *a__ = (a);                           \
-               const zahl_char_t *b__ = (b);                     \
-               size_t i__, n__ = (n);                            \
-               if (n__ <= 4) {                                   \
-                       if (n__ >= 1)                             \
-                               a__[0] = OP(b__[0]);              \
-                       if (n__ >= 2)                             \
-                               a__[1] = OP(b__[1]);              \
-                       if (n__ >= 3)                             \
-                               a__[2] = OP(b__[2]);              \
-                       if (n__ >= 4)                             \
-                               a__[3] = OP(b__[3]);              \
-               } else {                                          \
-                       for (i__ = 0; (i__ += 4) < n__;) {        \
-                               a__[i__ - 1] = OP(b__[i__ - 1]);  \
-                               a__[i__ - 2] = OP(b__[i__ - 2]);  \
-                               a__[i__ - 3] = OP(b__[i__ - 3]);  \
-                               a__[i__ - 4] = OP(b__[i__ - 4]);  \
-                       }                                         \
-                       if (i__ > n__)                            \
-                               for (i__ -= 4; i__ < n__; i__++)  \
-                                       a__[i__] = OP(b__[i__]);  \
-               }                                                 \
+#define ZMEM_2OP(a, b, c, n, OP)                                      \
+       do {                                                          \
+               zahl_char_t *a__ = (a);                               \
+               const zahl_char_t *b__ = (b);                         \
+               const zahl_char_t *c__ = (c);                         \
+               size_t i__, n__ = (n);                                \
+               for (i__ = 0; i__ < n__; i__ += 4) {                  \
+                       a__[i__ + 0] = b__[i__ + 0] OP c__[i__ + 0];  \
+                       a__[i__ + 1] = b__[i__ + 1] OP c__[i__ + 1];  \
+                       a__[i__ + 2] = b__[i__ + 2] OP c__[i__ + 2];  \
+                       a__[i__ + 3] = b__[i__ + 3] OP c__[i__ + 3];  \
+               }                                                     \
        } while (0)
 
-static inline void
-zmemcpyb(register zahl_char_t *restrict d, register const zahl_char_t *restrict s, size_t n_)
-{
-       ssize_t i, n = (ssize_t)n_;
-       switch (n & 3) {
-       case 3:
-               d[n - 1] = s[n - 1];
-               d[n - 2] = s[n - 2];
-               d[n - 3] = s[n - 3];
-               break;
-       case 2:
-               d[n - 1] = s[n - 1];
-               d[n - 2] = s[n - 2];
-               break;
-       case 1:
-               d[n - 1] = s[n - 1];
-               break;
-       default:
-               break;
-       }
-       for (i = n & ~3; (i -= 4) >= 0;) {
-               d[i + 3] = s[i + 3];
-               d[i + 2] = s[i + 2];
-               d[i + 1] = s[i + 1];
-               d[i + 0] = s[i + 0];
-       }
-}
-
-static inline void
-zmemmove(register zahl_char_t *d, register const zahl_char_t *s, size_t n)
-{
-       if (d < s)
-               zmemcpy(d, s, n);
-       else
-               zmemcpyb(d, s, n);
-}
+#define ZMEM_1OP(a, b, n, OP)                             \
+       do {                                              \
+               zahl_char_t *a__ = (a);                   \
+               const zahl_char_t *b__ = (b);             \
+               size_t i__, n__ = (n);                    \
+               for (i__ = 0; i__ < n__; i__ += 4) {      \
+                       a__[i__ + 0] = OP(b__[i__ + 0]);  \
+                       a__[i__ + 1] = OP(b__[i__ + 1]);  \
+                       a__[i__ + 2] = OP(b__[i__ + 2]);  \
+                       a__[i__ + 3] = OP(b__[i__ + 3]);  \
+               }                                         \
+       } while (0)
diff --git a/src/zadd.c b/src/zadd.c
index 5eb7050..56c901b 100644
--- a/src/zadd.c
+++ b/src/zadd.c
@@ -2,7 +2,7 @@
 #include "internals.h"
 
 
-#if defined(__x86_64__)
+#if defined(__x86_64__) && !defined(ZAHL_NO_ASM)
 # define ASM3(code)  \
        __asm__ __volatile__ (code : [x]"+r"(carry), [a]"+r"(ac), [b]"+r"(bc), [c]"+r"(cc))
 
diff --git a/src/zload.c b/src/zload.c
index 877c95e..f788a45 100644
--- a/src/zload.c
+++ b/src/zload.c
@@ -6,11 +6,11 @@ size_t
 zload(z_t a, const void *buffer)
 {
        const char *buf = buffer;
-       a->sign = *((const int *)buf),    buf += sizeof(int);
+       a->sign = *((const long *)buf),   buf += sizeof(long);
        a->used = *((const size_t *)buf), buf += sizeof(size_t);
        if (likely(a->sign)) {
                ENSURE_SIZE(a, a->used);
                zmemcpy(a->chars, (const zahl_char_t *)buf, a->used);
        }
-       return sizeof(int) + sizeof(size_t) + (zzero(a) ? 0 : a->used * sizeof(zahl_char_t));
+       return sizeof(long) + sizeof(size_t) + (zzero(a) ? 0 : ((a->used + 3) & ~3) * sizeof(zahl_char_t));
 }
diff --git a/src/zlsh.c b/src/zlsh.c
index 59a05bf..dce8010 100644
--- a/src/zlsh.c
+++ b/src/zlsh.c
@@ -19,11 +19,11 @@ zlsh(z_t a, z_t b, size_t bits)
 
        ENSURE_SIZE(a, b->used + chars + 1);
        if (likely(a == b)) {
-               zmemcpyb(a->chars + chars, b->chars, b->used);
+               zmemmoveb(a->chars + chars, b->chars, b->used);
        } else {
                zmemcpy(a->chars + chars, b->chars, b->used);
        }
-       zmemset(a->chars, 0, chars);
+       zmemset_precise(a->chars, 0, chars);
        a->used = b->used + chars;
 
        if (likely(bits)) { /* This if statement is very important in C. */
diff --git a/src/zor.c b/src/zor.c
index 9b45620..2790f4a 100644
--- a/src/zor.c
+++ b/src/zor.c
@@ -19,11 +19,11 @@ zor(z_t a, z_t b, z_t c)
        ENSURE_SIZE(a, m);
 
        if (a == b) {
-               ZMEM_2OP(a->chars, a->chars, c->chars, n, |);
+               ZMEM_2OP_PRECISE(a->chars, a->chars, c->chars, n, |);
                if (a->used < c->used)
                        zmemcpy_range(a->chars, c->chars, n, m);
        } else if (unlikely(a == c)) {
-               ZMEM_2OP(a->chars, a->chars, b->chars, n, |);
+               ZMEM_2OP_PRECISE(a->chars, a->chars, b->chars, n, |);
                if (a->used < b->used)
                        zmemcpy_range(a->chars, b->chars, n, m);
        } else  if (m == b->used) {
diff --git a/src/zptest.c b/src/zptest.c
index 29e50b7..406cedc 100644
--- a/src/zptest.c
+++ b/src/zptest.c
@@ -47,8 +47,7 @@ zptest(z_t witness, z_t n, int t)
                        continue;
 
                for (i = 1; i < r; i++) {
-                       zsqr(x, x);
-                       zmod(x, x, n);
+                       zmodsqr(x, x, n);
                        if (!zcmp(x, libzahl_const_1)) {
                                if (witness)
                                        zswap(witness, a);
diff --git a/src/zsetup.c b/src/zsetup.c
index 3e3062b..7486860 100644
--- a/src/zsetup.c
+++ b/src/zsetup.c
@@ -21,7 +21,7 @@ struct zahl **libzahl_temp_stack_end;
 void *libzahl_temp_allocation = 0;
 
 #define X(i, x, f, v)  1 +
-static zahl_char_t constant_chars[LIST_CONSTS 0];
+static zahl_char_t constant_chars[LIST_CONSTS ZAHL_FLUFF];
 #undef X
 
 
diff --git a/src/zsqr.c b/src/zsqr.c
index dbbfba2..06c41a4 100644
--- a/src/zsqr.c
+++ b/src/zsqr.c
@@ -20,7 +20,7 @@ zsqr_ll(z_t a, z_t b)
 
 #define z2 a
        z_t z0, z1, high, low;
-       zahl_char_t auxchars[3];
+       zahl_char_t auxchars[3 * ZAHL_FLUFF];
        size_t bits;
 
        bits = zbits(b);
@@ -38,7 +38,7 @@ zsqr_ll(z_t a, z_t b)
         * which require constant auxiliary memory. */
        if (bits < BITS_PER_CHAR) {
                low->chars = auxchars;
-               high->chars = auxchars + 1;
+               high->chars = auxchars + ZAHL_FLUFF;
                zsplit_unsigned_fast_small_auto(high, low, b, bits);
        } else {
                bits = TRUNCATE_TO_CHAR(bits);
diff --git a/src/zxor.c b/src/zxor.c
index fe6b4cd..af1124e 100644
--- a/src/zxor.c
+++ b/src/zxor.c
@@ -26,11 +26,11 @@ zxor(z_t a, z_t b, z_t c)
        ENSURE_SIZE(a, m);
 
        if (a == b) {
-               ZMEM_2OP(a->chars, a->chars, cc, n, ^);
+               ZMEM_2OP_PRECISE(a->chars, a->chars, cc, n, ^);
                if (a->used < cn)
                        zmemcpy_range(a->chars, cc, n, m);
        } else if (unlikely(a == c)) {
-               ZMEM_2OP(a->chars, b->chars, cc, n, ^);
+               ZMEM_2OP_PRECISE(a->chars, a->chars, bc, n, ^);
                if (a->used < bn)
                        zmemcpy_range(a->chars, bc, n, m);
        } else if (m == bn) {
diff --git a/zahl.h b/zahl.h
index e2a77c7..cfd85a3 100644
--- a/zahl.h
+++ b/zahl.h
@@ -11,6 +11,7 @@
 #include <setjmp.h>
 #include <stddef.h>
 #include <stdint.h>
+#include <unistd.h>
 
 #include "zahl/internals.h"
 
diff --git a/zahl/inlines.h b/zahl/inlines.h
index 9884812..c6ddc84 100644
--- a/zahl/inlines.h
+++ b/zahl/inlines.h
@@ -259,12 +259,16 @@ zsave(z_t a, void *buffer)
 {
        if (ZAHL_LIKELY(buffer)) {
                char *buf = buffer;
-               *((int *)buf)    = a->sign, buf += sizeof(int);
+               *((long *)buf)   = a->sign, buf += sizeof(long); /* Use `long` for alignment. */
                *((size_t *)buf) = a->used, buf += sizeof(size_t);
-               if (ZAHL_LIKELY(!zzero(a)))
+               if (ZAHL_LIKELY(!zzero(a))) {
+                       a->chars[a->used + 2] = 0;
+                       a->chars[a->used + 1] = 0;
+                       a->chars[a->used + 0] = 0;
                        libzahl_memcpy((zahl_char_t *)buf, a->chars, a->used);
+               }
        }
-       return sizeof(int) + sizeof(size_t) + (zzero(a) ? 0 : a->used * sizeof(zahl_char_t));
+       return sizeof(long) + sizeof(size_t) + (zzero(a) ? 0 : ((a->used + 3) & ~3) * sizeof(zahl_char_t));
 }
 
 
diff --git a/zahl/internals.h b/zahl/internals.h
index 672dfcd..30e2765 100644
--- a/zahl/internals.h
+++ b/zahl/internals.h
@@ -51,6 +51,7 @@
 #define ZAHL_BITS_PER_CHAR                64
 #define ZAHL_LB_BITS_PER_CHAR             6
 #define ZAHL_CHAR_MAX                     UINT64_MAX
+#define ZAHL_FLUFF                        4
 /* Note: These cannot be changed willy-nilly, some code depends
  * on them, be cause being flexible would just be too painful. */
 
@@ -113,62 +114,4 @@ extern struct zahl libzahl_tmp_mod[1];
 
 void libzahl_realloc(struct zahl *, size_t);
 
-ZAHL_INLINE void
-libzahl_memcpy(register zahl_char_t *d, register const zahl_char_t *s, size_t n)
-{
-       size_t i;
-       if (n <= 4) {
-               if (n >= 1)
-                       d[0] = s[0];
-               if (n >= 2)
-                       d[1] = s[1];
-               if (n >= 3)
-                       d[2] = s[2];
-               if (n >= 4)
-                       d[3] = s[3];
-       } else {
-               for (i = 0; (i += 4) <= n;) {
-                       d[i - 4] = s[i - 4];
-                       d[i - 3] = s[i - 3];
-                       d[i - 2] = s[i - 2];
-                       d[i - 1] = s[i - 1];
-               }
-               if (i > n) {
-                       i -= 4;
-                       if (i < n)
-                               d[i] = s[i], i++;
-                       if (i < n)
-                               d[i] = s[i], i++;
-                       if (i < n)
-                               d[i] = s[i], i++;
-                       if (i < n)
-                               d[i] = s[i];
-               }
-       }
-}
-
-ZAHL_INLINE void
-libzahl_memset(register zahl_char_t *a, register zahl_char_t v, size_t n)
-{
-       size_t i;
-       if (n <= 4) {
-               if (n >= 1)
-                       a[0] = v;
-               if (n >= 2)
-                       a[1] = v;
-               if (n >= 3)
-                       a[2] = v;
-               if (n >= 4)
-                       a[3] = v;
-       } else {
-               for (i = 0; (i += 4) <= n;) {
-                       a[i - 1] = v;
-                       a[i - 2] = v;
-                       a[i - 3] = v;
-                       a[i - 4] = v;
-               }
-               if (i > n)
-                       for (i -= 4; i < n; i++)
-                               a[i] = v;
-       }
-}
+#include "memory.h"
diff --git a/zahl/memory.h b/zahl/memory.h
new file mode 100644
index 0000000..2eb2b68
--- /dev/null
+++ b/zahl/memory.h
@@ -0,0 +1,133 @@
+/* See LICENSE file for copyright and license details. */
+
+#define LIBZAHL_MEM_CASES  \
+       LIBZAHL_X(20);     \
+       LIBZAHL_X(19);     \
+       LIBZAHL_X(18);     \
+       LIBZAHL_X(17);     \
+       LIBZAHL_X(16);     \
+       LIBZAHL_X(15);     \
+       LIBZAHL_X(14);     \
+       LIBZAHL_X(13);     \
+       LIBZAHL_X(12);     \
+       LIBZAHL_X(11);     \
+       LIBZAHL_X(10);     \
+       LIBZAHL_X( 9);     \
+       LIBZAHL_X( 8);     \
+       LIBZAHL_X( 7);     \
+       LIBZAHL_X( 6);     \
+       LIBZAHL_X( 5);     \
+       LIBZAHL_X( 4);     \
+       LIBZAHL_X( 3);     \
+       LIBZAHL_X( 2);     \
+       LIBZAHL_X( 1);     \
+       case 0: break;
+
+
+#if defined(LIBZAHL_ISA_MISSING_INDIRECT_JUMP)
+# define LIBZAHL_SMALL_INPUT_BEGIN(n)
+# define LIBZAHL_SMALL_INPUT_END
+#else
+# define LIBZAHL_SMALL_INPUT_BEGIN(n)  switch (n) { LIBZAHL_MEM_CASES default:
+# define LIBZAHL_SMALL_INPUT_END       break; }
+#endif
+
+
+ZAHL_INLINE void
+libzahl_memcpy(register zahl_char_t *restrict d, register const zahl_char_t *restrict s, size_t n)
+{
+       size_t i;
+#define LIBZAHL_X(I)  case I:  d[I - 1] = s[I - 1];
+       LIBZAHL_SMALL_INPUT_BEGIN(n);
+       for (i = 0; i < n; i += 4) {
+               d[i + 0] = s[i + 0];
+               d[i + 1] = s[i + 1];
+               d[i + 2] = s[i + 2];
+               d[i + 3] = s[i + 3];
+       }
+       LIBZAHL_SMALL_INPUT_END;
+#undef LIBZAHL_X
+}
+
+
+ZAHL_INLINE void
+libzahl_memset(register zahl_char_t *a, register zahl_char_t v, size_t n)
+{
+       size_t i;
+       for (i = 0; i < n; i += 4) {
+               a[i + 0] = v;
+               a[i + 1] = v;
+               a[i + 2] = v;
+               a[i + 3] = v;
+       }
+}
+
+ZAHL_INLINE void
+libzahl_memset_precise(register zahl_char_t *a, register zahl_char_t v, size_t n)
+{
+       size_t i;
+       if (n <= 4) {
+               if (n >= 1)
+                       a[0] = v;
+               if (n >= 2)
+                       a[1] = v;
+               if (n >= 3)
+                       a[2] = v;
+               if (n >= 4)
+                       a[3] = v;
+       } else {
+               for (i = 0; (i += 4) <= n;) {
+                       a[i - 1] = v;
+                       a[i - 2] = v;
+                       a[i - 3] = v;
+                       a[i - 4] = v;
+               }
+               if (i > n)
+                       for (i -= 4; i < n; i++)
+                               a[i] = v;
+       }
+}
+
+
+ZAHL_INLINE void
+libzahl_memmovef(register zahl_char_t *d, register const zahl_char_t *s, size_t n)
+{
+       if (n && n < 4) {
+               d[0] = s[0];
+               d[1] = s[1];
+               d[2] = s[2];
+       } else {
+               size_t i;
+               for (i = 0; i < n; i += 4) {
+                       d[i + 0] = s[i + 0];
+                       d[i + 1] = s[i + 1];
+                       d[i + 2] = s[i + 2];
+                       d[i + 3] = s[i + 3];
+               }
+       }
+}
+
+ZAHL_INLINE void
+libzahl_memmoveb(register zahl_char_t *d, register const zahl_char_t *s, size_t n)
+{
+       ssize_t i;
+#define LIBZAHL_X(I)  case I:  d[I - 1] = s[I - 1];
+       LIBZAHL_SMALL_INPUT_BEGIN(n);
+       for (i = ((ssize_t)n + 3) & ~3; (i -= 4) >= 0;) {
+               d[i + 3] = s[i + 3];
+               d[i + 2] = s[i + 2];
+               d[i + 1] = s[i + 1];
+               d[i + 0] = s[i + 0];
+       }
+       LIBZAHL_SMALL_INPUT_END;
+#undef LIBZAHL_X
+}
+
+ZAHL_INLINE void
+libzahl_memmove(register zahl_char_t *d, register const zahl_char_t *s, size_t n)
+{
+       if (d < s)
+               libzahl_memmovef(d, s, n);
+       else
+               libzahl_memmoveb(d, s, n);
+}

Reply via email to