[libmath-prime-util-perl] 11/18: Unroll inner loop of sieve for another 20% speedup

Partha P. Mukherjee Thu, 21 May 2015 11:50:51 -0700

This is an automated email from the git hooks/post-receive script.

ppm-guest pushed a commit to annotated tag v0.17
in repository libmath-prime-util-perl.


commit a6339fa0f1efadecbe6652ac0155e08ae4aa9b88
Author: Dana Jacobsen <[email protected]>
Date:   Tue Dec 18 02:27:22 2012 -0800

    Unroll inner loop of sieve for another 20% speedup
---
 Changes                | 10 ++++++++++
 lib/Math/Prime/Util.pm | 12 +++++++-----
 sieve.c                | 50 ++++++++++++++++++++++++++++++++++++++++++--------
 3 files changed, 59 insertions(+), 13 deletions(-)

diff --git a/Changes b/Changes
index 5fb3a8d..bff55b0 100644
--- a/Changes
+++ b/Changes
@@ -1,5 +1,15 @@
 Revision history for Perl extension Math::Prime::Util.
 
+0.17  18 December 2012
+
+    - Perl 5.8.1 - 5.8.7 miscalculates 12345 ** 4, which I used in a test.
+
+    - Fix (hopefully) for MSC compilation.
+
+    - Unroll sieve loop for another 20% or so speedup.  It won't have much
+      practical application now that we use Lehmer's method for counts, but
+      there are some cases that can still show speedups.
+
 0.16  11 December 2012
 
     - randbits >= 32 on some 32-bit systems was messing us up.  Restrict our
diff --git a/lib/Math/Prime/Util.pm b/lib/Math/Prime/Util.pm
index 6ed33e2..a1e889a 100644
--- a/lib/Math/Prime/Util.pm
+++ b/lib/Math/Prime/Util.pm
@@ -2812,7 +2812,7 @@ seconds using the Lehmer algorithm in version 0.12.
 
    Small portable functions suitable for plugging into XS:
 
-       5.3  My segmented SoE used in this module
+       4.1  My segmented SoE used in this module (with unrolled inner loop)
       15.6  My Sieve of Eratosthenes using a mod-30 wheel
       17.2  A slightly modified verion of Terje Mathisen's mod-30 sieve
       35.5  Basic Sieve of Eratosthenes on odd numbers
@@ -2820,25 +2820,27 @@ seconds using the Lehmer algorithm in version 0.12.
       72.8  Sieve of Atkin, 10-minute fixup of basic algorithm
       91.6  Sieve of Atkin, Wikipedia-like
 
-Perl modules, counting the primes to C<800_000_000> (800 million), in seconds:
+Perl modules, counting the primes to C<800_000_000> (800 million):
 
   Time (s)   Module                      Version  Notes
   ---------  --------------------------  -------  -----------
        0.03  Math::Prime::Util           0.12     using Lehmer's method
-       0.36  Math::Prime::Util           0.09     segmented mod-30 sieve
+       0.28  Math::Prime::Util           0.17     segmented mod-30 sieve
        0.52  Math::Prime::Util::PP       0.14     Perl (Lehmer's method)
        0.9   Math::Prime::Util           0.01     mod-30 sieve
        2.9   Math::Prime::FastSieve      0.12     decent odd-number sieve
       11.7   Math::Prime::XS             0.29     "" but needs a count API
       15.0   Bit::Vector                 7.2
       57.3   Math::Prime::Util::PP       0.14     Perl (fastest I know of)
-     169.5   Python's mpmath primepi     0.17     Python, 25+GB RAM used
      170.0   Faster Perl sieve (net)     2012-01  array of odds
-     292.2   Python's sympy primepi      0.7.1    Python
      548.1   RosettaCode sieve (net)     2012-06  simplistic Perl
   ~11000     Math::Primality             0.04     Perl + Math::GMPz
   >20000     Math::Big                   1.12     Perl, > 26GB RAM used
 
+Python can do this in 2.8s using an RWH numpy function, 14.3s using an RWH
+pure Python function.  However the standard modules are far slower.  mpmath
+v0.17 primepi takes 169.5s and 25+ GB of RAM.  sympi 0.7.1 primepi takes
+292.2s.
 
 
 C<is_prime>: my impressions for various sized inputs:
diff --git a/sieve.c b/sieve.c
index eb38b82..bebd2df 100644
--- a/sieve.c
+++ b/sieve.c
@@ -163,12 +163,20 @@ unsigned char* sieve_erat30(UV end)
     assert(d < max_buf);
     assert(prime = 
(wdinc[0]+wdinc[1]+wdinc[2]+wdinc[3]+wdinc[4]+wdinc[5]+wdinc[6]+wdinc[7]));
 #endif
-    i = 0;        /* Mark the composites */
-    do {
-      mem[d] |= wmask[i];
-      d += wdinc[i];
-      i = (i+1) & 7;
-    } while (d < max_buf);
+    /* Regular code to mark composites.
+    * i = 0;
+    * do { mem[d] |= wmask[i]; d += wdinc[i]; i = (i+1)&7; } while (d < 
max_buf);
+    * Unrolled version: */
+    while (1) {
+      mem[d] |= wmask[0];  d += wdinc[0];  if (d >= max_buf) break;
+      mem[d] |= wmask[1];  d += wdinc[1];  if (d >= max_buf) break;
+      mem[d] |= wmask[2];  d += wdinc[2];  if (d >= max_buf) break;
+      mem[d] |= wmask[3];  d += wdinc[3];  if (d >= max_buf) break;
+      mem[d] |= wmask[4];  d += wdinc[4];  if (d >= max_buf) break;
+      mem[d] |= wmask[5];  d += wdinc[5];  if (d >= max_buf) break;
+      mem[d] |= wmask[6];  d += wdinc[6];  if (d >= max_buf) break;
+      mem[d] |= wmask[7];  d += wdinc[7];  if (d >= max_buf) break;
+    }
   }
 
   return mem;
@@ -226,6 +234,7 @@ int sieve_segment(unsigned char* mem, UV startd, UV endd)
       UV minc = (2*p) - dinc*30;
       UV wdinc[8];
       unsigned char wmask[8];
+      UV offset_endd = endd - startd;
       int i;
 
       /* Find the positions of the next composites we will mark */
@@ -240,12 +249,37 @@ int sieve_segment(unsigned char* mem, UV startd, UV endd)
         wmask[i%8] = masktab30[m];
       }
       d -= p;
+      d -= startd;
+#if 0
       i = 0;        /* Mark the composites */
       do {
-        mem[d-startd] |= wmask[i];
+        mem[d] |= wmask[i];
         d += wdinc[i];
         i = (i+1) & 7;
-      } while (d <= endd);
+      } while (d <= offset_endd);
+#else
+      /* Unrolled inner loop for marking composites */
+      while ( (d+p) <= offset_endd ) {
+        mem[d] |= wmask[0];  d += wdinc[0];
+        mem[d] |= wmask[1];  d += wdinc[1];
+        mem[d] |= wmask[2];  d += wdinc[2];
+        mem[d] |= wmask[3];  d += wdinc[3];
+        mem[d] |= wmask[4];  d += wdinc[4];
+        mem[d] |= wmask[5];  d += wdinc[5];
+        mem[d] |= wmask[6];  d += wdinc[6];
+        mem[d] |= wmask[7];  d += wdinc[7];
+      }
+      while (1) {
+        mem[d] |= wmask[0];  d += wdinc[0];  if (d > offset_endd) break;
+        mem[d] |= wmask[1];  d += wdinc[1];  if (d > offset_endd) break;
+        mem[d] |= wmask[2];  d += wdinc[2];  if (d > offset_endd) break;
+        mem[d] |= wmask[3];  d += wdinc[3];  if (d > offset_endd) break;
+        mem[d] |= wmask[4];  d += wdinc[4];  if (d > offset_endd) break;
+        mem[d] |= wmask[5];  d += wdinc[5];  if (d > offset_endd) break;
+        mem[d] |= wmask[6];  d += wdinc[6];  if (d > offset_endd) break;
+        mem[d] |= wmask[7];  d += wdinc[7];  if (d > offset_endd) break;
+      }
+#endif
 #endif
     }
   }

-- 
Alioth's /usr/local/bin/git-commit-notice on 
/srv/git.debian.org/git/pkg-perl/packages/libmath-prime-util-perl.git

_______________________________________________
Pkg-perl-cvs-commits mailing list
[email protected]
http://lists.alioth.debian.org/cgi-bin/mailman/listinfo/pkg-perl-cvs-commits

[libmath-prime-util-perl] 11/18: Unroll inner loop of sieve for another 20% speedup

Reply via email to