commit 40b860777616071997ec035783eeea402ffb1ae2
Author:     Mattias Andrée <[email protected]>
AuthorDate: Tue May 3 14:03:33 2016 +0200
Commit:     Mattias Andrée <[email protected]>
CommitDate: Tue May 3 14:03:33 2016 +0200

    Optimise libzahl_memcpy and libzahl_memset
    
    Signed-off-by: Mattias Andrée <[email protected]>

diff --git a/STATUS b/STATUS
index 36d9717..8cae48a 100644
--- a/STATUS
+++ b/STATUS
@@ -6,7 +6,7 @@ left column. Double-parenthesis means there may be a better way
 to do it. Inside square-brackets, there are some comments on
 multi-bit comparisons.
 
-zset .................... fastest [until ~750, then gmp, also tomsfastmath after ~2750]
+zset .................... fastest [always with gcc, unless ~250 with clang]
 zseti ................... tomsfastmath is faster [always]
 zsetu ................... tomsfastmath is faster [always]
 zneg(a, b) .............. fastest [until ~300, then gmp]
diff --git a/TODO b/TODO
index 56d8dbe..0327bca 100644
--- a/TODO
+++ b/TODO
@@ -5,9 +5,10 @@ Add zsets_radix
 Add zstr_radix
 
 Test big endian
-Test always having used > 0 for zero
+Test always having .used > 0 for zero
   Test negative/non-negative instead of sign
 Test long .sign
+Test always having .chars % 4 == 0
 
 Test optimisation of zmul:
   bc = [(Hb * Hc) << (m2 << 1)]
diff --git a/zahl-internals.h b/zahl-internals.h
index e9232dd..fc6768a 100644
--- a/zahl-internals.h
+++ b/zahl-internals.h
@@ -109,18 +109,62 @@ struct zahl {
 
 void libzahl_realloc(struct zahl *, size_t);
 
-ZAHL_O2 ZAHL_INLINE void
+ZAHL_INLINE void
 libzahl_memcpy(register zahl_char_t *restrict d, register const zahl_char_t *restrict s, size_t n)
 {
        size_t i;
-       for (i = 0; i < n; i++)
-               d[i] = s[i];
+       if (n <= 4) {
+               if (n >= 1)
+                       d[0] = s[0];
+               if (n >= 2)
+                       d[1] = s[1];
+               if (n >= 3)
+                       d[2] = s[2];
+               if (n >= 4)
+                       d[3] = s[3];
+       } else {
+               for (i = 0; (i += 4) <= n;) {
+                       d[i - 1] = s[i - 1];
+                       d[i - 2] = s[i - 2];
+                       d[i - 3] = s[i - 3];
+                       d[i - 4] = s[i - 4];
+               }
+               if (i > n) {
+                       i -= 4;
+                       if (i < n)
+                               d[i] = s[i], i++;
+                       if (i < n)
+                               d[i] = s[i], i++;
+                       if (i < n)
+                               d[i] = s[i], i++;
+                       if (i < n)
+                               d[i] = s[i], i++;
+               }
+       }
 }
 
-ZAHL_O2 ZAHL_INLINE void
+ZAHL_INLINE void
 libzahl_memset(register zahl_char_t *a, register zahl_char_t v, size_t n)
 {
        size_t i;
-       for (i = 0; i < n; i++)
-               a[i] = v;
+       if (n <= 4) {
+               if (n >= 1)
+                       a[0] = v;
+               if (n >= 2)
+                       a[1] = v;
+               if (n >= 3)
+                       a[2] = v;
+               if (n >= 4)
+                       a[3] = v;
+       } else {
+               for (i = 0; (i += 4) <= n;) {
+                       a[i - 1] = v;
+                       a[i - 2] = v;
+                       a[i - 3] = v;
+                       a[i - 4] = v;
+               }
+               if (i > n)
+                       for (i -= 4; i < n; i++)
+                               a[i] = v;
+       }
 }

Reply via email to