This is an automated email from the git hooks/post-receive script. ppm-guest pushed a commit to annotated tag v0.36 in repository libmath-prime-util-perl.
commit db3e43acdfd1d1ee06fe9284c9f18d896d171e36 Author: Dana Jacobsen <d...@acm.org> Date: Sun Dec 15 02:00:04 2013 -0800 Documentation updates; popcnt updates for LMO --- lehmer.c | 49 ++++++++++++------------------------ lmo.c | 88 +++++++++++++++++++++++++++++++++++++--------------------------- 2 files changed, 67 insertions(+), 70 deletions(-) diff --git a/lehmer.c b/lehmer.c index 1425a24..89edbc7 100644 --- a/lehmer.c +++ b/lehmer.c @@ -5,7 +5,7 @@ /* Below this size, just sieve (with table speedup). */ #define SIEVE_LIMIT 60000000 -#define MAX_PHI_MEM (256*1024*1024) +#define MAX_PHI_MEM (896*1024*1024) /***************************************************************************** * @@ -31,40 +31,23 @@ * As with all Lehmer-Meissel-Legendre algorithms, memory use will be a * constraint with large values of x (see the table below). * - * If you want something better, I highly recommend the paper "Computing - * Pi(x): the combinatorial method" (2006) by Tomás Oliveira e Silva. His - * implementation is certainly much faster and lower memory than this. I have - * briefly run Christian Bau's LMO implementation which has the big advantage - * of using almost no memory. On the same machine that ran the times below, - * I got 10^16 in 36s, 10^17 in 165s, 10^18 in 769s, 10^19 in 4162s. All - * using a single core. That's 10-50x faster than this code. + * Math::Prime::Util now includes an extended LMO implementation, which will + * be quite a bit faster and much less memory than this code. It is the + * default method for large counts. Timing comparisons are in that file. * - * Using my sieve code with everything running in serial, calculating pi(10^12) - * is done under 1 second on my computer. pi(10^14) takes under 30 seconds, - * pi(10^16) in under 20 minutes. Compared with Thomas R. Nicely's pix4 - * program, this one is 5x faster and uses 10x less memory. When compiled - * with parallel primesieve it is over 10x faster. - * pix4(10^16) takes 124 minutes, this code + primesieve takes < 4 minutes. + * Times and memory use for prime_count(10^15) on a Haswell 4770K: * - * Timings with Perl + MPU with all-serial computation, no memory limit. - * The last column is the standalone time with 12-core parallel primesieve. - * - * n phi(x,a) mem/time | stage 4 mem/time | total time | pps time - * 10^19 17884MB 1979.41 | 9144MB | | 7h 26m - * 10^18 5515MB 483.46 | 2394MB | | 87m 0s - * 10^17 1698MB 109.56 | 634MB 9684.1 | 163m 36 s | 17m 45s - * 10^16 522MB 24.14 | 171MB 1066.3 | 18m 12 s | 3m 44s - * 10^15 159MB 5.58 | 47MB 142.5 | 2m 28 s | 47.66 s - * 10^14 48MB 1.28 | 13MB 22.26 | 23.61 s | 10.25 s - * 10^13 14MB 0.294 | 4MB 3.82 | 4.12 s | 2.21 s - * 10^12 4MB 0.070 | 1MB 0.707 | 0.78 s | 0.459s - * 10^11 1MB 0.015 | 0.135 | 0.158s | 0.097s - * 10^10 0.003 | 0.029 | 0.028s | 0.025s - * - * Using the standalone program with parallel primesieve speeds up stage 4 - * a lot for large values, as can be seen by the last column. It does use - * quite a bit more memory in stage 4 however (lowering SIEVE_MULT can reduce - * it by as much as 10x, at the expense of some performance). + * 4.80s 1.3MB LMO + * 27.51s* 136.0MB Lehmer Walisch primecount v0.7, 8 threads + * 42.99s* 159.4MB Lehmer standalone, 8 threads + * 50.82s* 136.2MB Meissel Walisch primecount v0.7, 8 threads + * 64.06s* 144.3MB Legendre Walisch primecount v0.7, 8 threads + * 114.40s 28.4MB Lehmer Walisch primecount v0.7, 1 thread + * 118.69s 27.1MB LMOS + * 120.39s 158.4MB Lehmer + * 112.40s 286.6MB Meissel + * 303.77s 28.0MB Legendre + * 868.96s 1668.1MB Lehmer pix4 by T.R. Nicely * * Reference: Hans Riesel, "Prime Numbers and Computer Methods for * Factorization", 2nd edition, 1994. diff --git a/lmo.c b/lmo.c index fc85ca5..0e2111b 100644 --- a/lmo.c +++ b/lmo.c @@ -27,29 +27,30 @@ * * Sieve: Segmented, single threaded, thread-safe. Small table enhanced, * fastest for n < 60M. Bad growth rate (like all sieves will have). - * Legendre:Combinatorial phi. Simple implementation. - * Meissel: Combinatorial phi. Simple implementation. - * Lehmer: Combinatorial phi. Memory use grows rapidly. - * LMOS: Combinatorial phi. Basic LMO implementation. - * LMO: Sieve phi. 10-50x faster than LMOS, better growth rate, - * Much, much better memory use than the others. + * Legendre:Simple. Recursive caching phi. + * Meissel: Simple. Non-recursive phi, lots of memory. + * Lehmer: Non-recursive phi, tries to restrict memory. + * LMOS: Simple. Non-recursive phi, less memory than Lehmer above. + * LMO: Sieve phi. Much faster and less memory than the others. * - * Performance is slightly worse than Christian Bau's implementation, but - * in theory this implementation has more parallelization opportunities. - * Timing below is single core using MPU. + * Timing below is single core Haswell 4770K using Math::Prime::Util. * - * | n | Meissel | Lehmer | LMOS | LMO | - * +-------+----------+----------+----------+-----------+ - * | 10^19 | | | | 3765.02 | - * | 10^18 | | | | 778.21 | - * | 10^17 | | | 8844.2 | 163.77 | - * | 10^16 | 1410.2 | 1043.6 | 1058.9 | 34.81 | - * | 10^15 | 137.1 | 137.3 | 142.7 | 7.905 | - * | 10^14 | 26.18 | 21.74 | 21.29 | 1.726 | - * | 10^13 | 5.155 | 3.947 | 3.353 | 0.405 | - * | 10^12 | 1.059 | 0.700 | 0.626 | 0.0936 | - * | 10^11 | 0.227 | 0.138 | 0.124 | 0.0227 | - * | 10^10 | 0.0509| 0.0309| 0.0286| 0.00589| + * | n | Legendre | Meissel | Lehmer | LMOS | LMO | + * +-------+----------+----------+----------+----------+-----------+ + * | 10^19 | | | | | 2493.4 | + * | 10^18 | | | | | 498.16 | + * | 10^17 | | 2950.9 | 7499.0 | 7125.7 | 103.03 | + * | 10^16 | 2422.7 | 575.8 | 872.5 | 884.9 | 21.94 | + * | 10^15 | 303.3 | 112.1 | 119.3 | 119.1 | 4.786 | + * | 10^14 | 40.10 | 22.06 | 19.06 | 17.58 | 1.052 | + * | 10^13 | 5.678 | 4.330 | 3.316 | 2.810 | 0.237 | + * | 10^12 | 0.901 | 0.889 | 0.617 | 0.524 | 54.9ms | + * | 10^11 | 0.182 | 0.192 | 0.122 | 0.114 | 13.80ms| + * | 10^10 | 40.2ms| 41.7ms| 26.6ms| 25.6ms| 3.64ms| + * + * Run with high memory limits: Meissel uses 1GB for 10^16, ~3GB for 10^17. + * Lehmer is limited at high n values by sieving speed. It is much faster + * using parallel primesieve, though cannot come close to LMO. */ /* Below this size, just sieve (with table speedup). */ @@ -77,26 +78,27 @@ typedef uint32_t uint32; #endif +/* UV is either uint32 or uint64 depending on Perl. We use this native size + * for the basic unit of the phi sieve. It can be easily overridden here. */ typedef UV sword_t; #define SWORD_BITS BITS_PER_WORD #define SWORD_ONES UV_MAX #define SWORD_MASKBIT(bits) (UVCONST(1) << ((bits) % SWORD_BITS)) #define SWORD_CLEAR(s,bits) s[bits/SWORD_BITS] &= ~SWORD_MASKBIT(bits) -#if defined(__GNUC__) && defined(__SSE4_2__) -static sword_t bitcount(sword_t b) { - sword_t ret; - __asm__("popcnt %1, %0" : "=r" (ret) : "r" (b)); - return ret; -} -#elif SWORD_BITS == 64 -static sword_t bitcount(sword_t b) { - b -= (b >> 1) & 0x5555555555555555; - b = (b & 0x3333333333333333) + ((b >> 2) & 0x3333333333333333); - b = (b + (b >> 4)) & 0x0f0f0f0f0f0f0f0f; - return (b * 0x0101010101010101)>>56; -} -#else /* Faster than __builtin_popcount */ +/* Compile with -march=native to get a large speedup on Nahalem and newer */ +#if SWORD_BITS == 64 + #if defined(__GNUC__) && (__GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR > 1)) + #define bitcount(b) __builtin_popcountll(b) + #else + static sword_t bitcount(sword_t b) { + b -= (b >> 1) & 0x5555555555555555; + b = (b & 0x3333333333333333) + ((b >> 2) & 0x3333333333333333); + b = (b + (b >> 4)) & 0x0f0f0f0f0f0f0f0f; + return (b * 0x0101010101010101)>>56; + } + #endif +#else static const unsigned char byte_ones[256] = {0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,1,2,2,3,2,3,3,4,2,3,3,4,3,4,4,5, 1,2,2,3,2,3,3,4,2,3,3,4,3,4,4,5,2,3,3,4,3,4,4,5,3,4,4,5,4,5,5,6, @@ -124,15 +126,27 @@ static uint32_t* make_primelist(uint32 n, uint32* number_of_primes) : (n/log(n)) * (1.0+1.0/logn+2.51/(logn*logn)); *number_of_primes = 0; New(0, plist, max_index+1, uint32_t); - if (plist == 0) - croak("Can not allocate small primes\n"); + if (plist == 0) croak("Can not allocate small primes\n"); plist[0] = 0; + /* We could do a simple SoE here. This is not time critical. */ START_DO_FOR_EACH_PRIME(2, n) { plist[++i] = p; } END_DO_FOR_EACH_PRIME; *number_of_primes = i; return plist; } +#if 0 /* primesieve 5.0 example */ +#include <primesieve.h> +static uint32_t* make_primelist(uint32 n, uint32* number_of_primes) { + uint32_t plist; + uint32_t* psprimes = generate_primes(2, n, number_of_primes, UINT_PRIMES); + New(0, plist, *number_of_primes + 1, uint32_t); + plist[0] = 0; + memcpy(plist+1, psprimes, *number_of_primes * sizeof(uint32_t)); + primesieve_free(psprimes); + return plist; +} +#endif /* Given a max prime in small prime list, return max prev prime input */ static uint32 prev_sieve_max(UV maxprime) { -- Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/pkg-perl/packages/libmath-prime-util-perl.git _______________________________________________ Pkg-perl-cvs-commits mailing list Pkg-perl-cvs-commits@lists.alioth.debian.org http://lists.alioth.debian.org/cgi-bin/mailman/listinfo/pkg-perl-cvs-commits