This is an automated email from the git hooks/post-receive script. ppm-guest pushed a commit to annotated tag v0.35 in repository libmath-prime-util-perl.
commit 69f65edb812c72e9663cd2bcd67a26774b276998 Author: Dana Jacobsen <d...@acm.org> Date: Thu Dec 5 08:48:23 2013 -0800 Extended LMO algorithm --- .gitignore | 1 + Changes | 5 + Makefile.PL | 1 + XS.xs | 3 + lehmer.c | 36 ++- lehmer.h | 3 +- lib/Math/Prime/Util.pm | 6 +- lmo.c | 628 +++++++++++++++++++++++++++++++++++++++++++++++++ lmo.h | 8 + util.c | 2 +- 10 files changed, 682 insertions(+), 11 deletions(-) diff --git a/.gitignore b/.gitignore index 55be186..79f9d4f 100644 --- a/.gitignore +++ b/.gitignore @@ -10,6 +10,7 @@ factor.o sieve.o util.o lehmer.o +lmo.o primality.o aks.o blib* diff --git a/Changes b/Changes index 64c32ff..0a5732e 100644 --- a/Changes +++ b/Changes @@ -18,10 +18,15 @@ Revision history for Perl module Math::Prime::Util [FUNCTIONALITY AND PERFORMANCE] + - Switched to extended LMO algorithm for prime_count. Much better memory + use and much faster for large values. Speeds up nth_prime also. + - Some fixes for 32-bit. - prime_count_approx, upper, and lower return exact answers in more cases. + - Fixed a problem with Lehmer prime_count introduced in 0.34. + 0.34 2013-11-19 diff --git a/Makefile.PL b/Makefile.PL index 02ce49a..3a9d727 100644 --- a/Makefile.PL +++ b/Makefile.PL @@ -27,6 +27,7 @@ WriteMakefile1( 'primality.o '. 'aks.o ' . 'lehmer.o ' . + 'lmo.o ' . 'sieve.o ' . 'util.o ' . 'XS.o', diff --git a/XS.xs b/XS.xs index de47b4e..4832367 100644 --- a/XS.xs +++ b/XS.xs @@ -17,6 +17,7 @@ #include "primality.h" #include "factor.h" #include "lehmer.h" +#include "lmo.h" #include "aks.h" #if BITS_PER_WORD == 64 && defined(_MSC_VER) @@ -202,6 +203,7 @@ _XS_nth_prime(IN UV n) _XS_meissel_pi = 4 _XS_lehmer_pi = 5 _XS_LMO_pi = 6 + _XS_LMOS_pi = 7 PREINIT: UV ret; CODE: @@ -213,6 +215,7 @@ _XS_nth_prime(IN UV n) case 4: ret = _XS_meissel_pi(n); break; case 5: ret = _XS_lehmer_pi(n); break; case 6: ret = _XS_LMO_pi(n); break; + case 7: ret = _XS_LMOS_pi(n); break; default: croak("_XS_nth_prime: Unknown function alias"); break; } RETVAL = ret; diff --git a/lehmer.c b/lehmer.c index 29ad8d7..368d194 100644 --- a/lehmer.c +++ b/lehmer.c @@ -723,8 +723,7 @@ static UV Pk_2_p(UV n, UV a, UV b, const uint32_t* primes, uint32_t lastidx) } static UV Pk_2(UV n, UV a, UV b) { - UV lastprime = b*SIEVE_MULT+1; - if (lastprime > 203280221) lastprime = 203280221; + UV lastprime = ((b*SIEVE_MULT+1) > 203280221) ? 203280221 : b*SIEVE_MULT+1; const uint32_t* primes = generate_small_primes(lastprime); UV P2 = Pk_2_p(n, a, b, primes, lastprime); Safefree(primes); @@ -842,7 +841,7 @@ UV _XS_lehmer_pi(UV n) * About the same speed as Lehmer, a bit less memory. * A better implementation can be 10-50x faster and much less memory. */ -UV _XS_LMO_pi(UV n) +UV _XS_LMOS_pi(UV n) { UV n13, a, b, sum, i, j, k, lastprime, P2, S1, S2; const uint32_t* primes = 0; /* small prime cache */ @@ -921,7 +920,34 @@ UV _XS_LMO_pi(UV n) return sum; } -UV _XS_legendre_phi(UV x, UV a) { return phi(x,a); } +static const unsigned char primes_small[] = + {0,2,3,5,7,11,13,17,19,23,29,31,37,41,43,47,53,59,61,67,71,73,79,83,89,97, + 101,103,107,109,113,127,131,137,139,149,151,157,163,167,173,179,181,191}; +#define NPRIMES_SMALL (sizeof(primes_small)/sizeof(primes_small[0])) + +UV _XS_legendre_phi(UV x, UV a) { + /* For small values, calculate directly */ + if (a < 3) return (a == 0) ? x : (a == 1) ? x-x/2 : x-x/2-x/3+x/6; + if (a <= 7) return mapes(x, a); + /* For large values, do our non-recursive phi */ + if (a > NPRIMES_SMALL) return phi(x,a); + /* Otherwise, recurse */ + { + UV i; + UV sum = mapes7(x); + for (i = 8; i <= a; i++) { + uint32_t p = primes_small[i]; + UV xp = x/p; + if (xp < p) { + while (x < primes_small[a]) + a--; + return (sum - a + i - 1); + } + sum -= _XS_legendre_phi(xp, i-1); + } + return sum; + } +} #ifdef PRIMESIEVE_STANDALONE @@ -947,7 +973,7 @@ int main(int argc, char *argv[]) if (!strcasecmp(method, "lehmer")) { pi = _XS_lehmer_pi(n); } else if (!strcasecmp(method, "meissel")) { pi = _XS_meissel_pi(n); } else if (!strcasecmp(method, "legendre")) { pi = _XS_legendre_pi(n); } - else if (!strcasecmp(method, "lmo")) { pi = _XS_LMO_pi(n); } + else if (!strcasecmp(method, "lmo")) { pi = _XS_LMOS_pi(n); } else if (!strcasecmp(method, "sieve")) { pi = _XS_prime_count(2, n); } else { printf("method must be one of: lehmer, meissel, legendre, lmo, or sieve\n"); diff --git a/lehmer.h b/lehmer.h index e2c3325..29b4f53 100644 --- a/lehmer.h +++ b/lehmer.h @@ -6,7 +6,8 @@ extern UV _XS_legendre_pi(UV n); extern UV _XS_meissel_pi(UV n); extern UV _XS_lehmer_pi(UV n); -extern UV _XS_LMO_pi(UV n); +extern UV _XS_LMOS_pi(UV n); + extern UV _XS_legendre_phi(UV x, UV a); #endif diff --git a/lib/Math/Prime/Util.pm b/lib/Math/Prime/Util.pm index faf70fd..eba5a3b 100644 --- a/lib/Math/Prime/Util.pm +++ b/lib/Math/Prime/Util.pm @@ -1791,10 +1791,8 @@ sub prime_count { #if ($est_lehmer < $est_segment) { if ( ($high / ($high-$low+1)) < 100 ) { my $count; - $count = ($high > 8_000_000_000) ? _XS_LMO_pi($high) : _XS_lehmer_pi($high); - if ($low > 2) { - $count -= ($low > 8_000_000_000) ? _XS_LMO_pi($low-1) : _XS_lehmer_pi($low-1); - } + $count = _XS_LMO_pi($high); + $count -= _XS_LMO_pi($low-1) if $low > 2; return $count; } } diff --git a/lmo.c b/lmo.c new file mode 100644 index 0000000..eb37c8d --- /dev/null +++ b/lmo.c @@ -0,0 +1,628 @@ +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <math.h> + +/***************************************************************************** + * + * Prime counts using the extended Lagarias-Miller-Odlyzko combinatorial method. + * + * Copyright (c) 2013 Dana Jacobsen (d...@acm.org) + * This is free software; you can redistribute it and/or modify it under + * the same terms as the Perl 5 programming language system itself. + * + * This file is part of the Math::Prime::Util Perl module, but it should + * not be difficult to turn it into standalone code. + * + * The main references I used were: + * - Christian Bau's paper and example implementation, 2003, Christian Bau + * This was of immense help. References to "step #" refer to this preprint. + * - "Computing Pi(x): the combinatorial method", 2006, Tomás Oliveira e Silva + * - "Computing Pi(x): The Meissel, Lehmer, Lagarias, Miller, Odlyzko Method" + * 1996, Deléglise and Rivat. + * + * Comparisons to the other prime counting implementations here: + * + * Meissel: Combinatorial phi. Simple implementation. + * Lehmer: Combinatorial phi. Memory use grows rapidly. + * LMOS: Combinatorial phi. Very basic LMO implementation. + * LMO: Sieve phi. 10-50x faster than LMOS, better growth rate, + * Much, much better memory use than the others. + * + * Performance is slightly worse than Christian Bau's implementation, but + * in theory this implementation has more parallelization opportunities. + * Timing below is single core using MPU. + * + * | n | Meissel | Lehmer | LMOS | LMO | + * +-------+----------+----------+----------+-----------+ + * | 10^19 | | | | 3707.43 | + * | 10^18 | | | | 778.70 | + * | 10^17 | | | | 172.12 | + * | 10^16 | 1410.2 | 1043.6 | 1058.9 | 37.351 | + * | 10^15 | 137.1 | 137.3 | 142.7 | 8.072 | + * | 10^14 | 26.18 | 21.74 | 21.29 | 1.906 | + * | 10^13 | 5.155 | 3.947 | 3.353 | 0.455 | + * | 10^12 | 1.059 | 0.700 | 0.626 | 0.111 | + * | 10^11 | 0.227 | 0.138 | 0.124 | 0.0269 | + * | 10^10 | 0.0509| 0.0309| 0.0286| 0.00702| + */ + +/* Below this size, just sieve (with table speedup). */ +#define SIEVE_LIMIT 60000000 +/* Adjust to get best performance */ +#define M_FACTOR(n) (UV) ((double)n * (log(n)/log(3))) +/* Size of segment used for previous primes, must be >= 21 */ +#define PREV_SIEVE_SIZE 512 +/* Phi sieve multiplier, adjust for best performance */ +#define PHI_SIEVE_MULT 13 + +#include "lmo.h" +#include "util.h" +#include "cache.h" +#include "sieve.h" + +typedef unsigned char uint8; +typedef unsigned short uint16; +typedef uint32_t uint32; + +static UV icbrt(UV n) { + UV b, root = 0; + int s = 63; + if (n >= UVCONST(18446724184312856125)) return UVCONST(2642245); + for (; s >= 0; s -= 3) { + root += root; + b = 3*root*(root+1)+1; + if ((n >> s) >= b) { + n -= b << s; + root++; + } + } + return root; +} + +static const unsigned char byte_ones[256] = + {0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,1,2,2,3,2,3,3,4,2,3,3,4,3,4,4,5, + 1,2,2,3,2,3,3,4,2,3,3,4,3,4,4,5,2,3,3,4,3,4,4,5,3,4,4,5,4,5,5,6, + 1,2,2,3,2,3,3,4,2,3,3,4,3,4,4,5,2,3,3,4,3,4,4,5,3,4,4,5,4,5,5,6, + 2,3,3,4,3,4,4,5,3,4,4,5,4,5,5,6,3,4,4,5,4,5,5,6,4,5,5,6,5,6,6,7, + 1,2,2,3,2,3,3,4,2,3,3,4,3,4,4,5,2,3,3,4,3,4,4,5,3,4,4,5,4,5,5,6, + 2,3,3,4,3,4,4,5,3,4,4,5,4,5,5,6,3,4,4,5,4,5,5,6,4,5,5,6,5,6,6,7, + 2,3,3,4,3,4,4,5,3,4,4,5,4,5,5,6,3,4,4,5,4,5,5,6,4,5,5,6,5,6,6,7, + 3,4,4,5,4,5,5,6,4,5,5,6,5,6,6,7,4,5,5,6,5,6,6,7,5,6,6,7,6,7,7,8}; + +static uint32_t bitcount(uint32_t b) { + /* The simple table method is faster than __builtin_popcount or + * using a 11-bit word table. It is slower than the Nahalem asm. */ +#if 1 + return byte_ones[b&0xFF] + byte_ones[(b>>8)&0xFF] + byte_ones[(b>>16)&0xFF] + byte_ones[b>>24]; +#else + uint32 ret; + __asm__("popcnt %1, %0" : "=r" (ret) : "r" (b)); + return ret; +#endif +} + +/* static uint32 count_one_bits(const unsigned char* m, uint32 nbytes) { + uint32 count = 0; + while (nbytes--) count += byte_ones[*m++]; + return count; +} */ + +/* Create array of small primes: 0,2,3,5,...,prev_prime(n+1) */ +static uint32_t* make_primelist(uint32 n, uint32* number_of_primes) +{ + uint32 i = 0; + uint32_t* plist; + double logn = log(n); + uint32 max_index = (n < 67) ? 18 + : (n < 355991) ? 15+(n/(logn-1.09)) + : (n/log(n)) * (1.0+1.0/logn+2.51/(logn*logn)); + *number_of_primes = 0; + New(0, plist, max_index+1, uint32_t); + if (plist == 0) + croak("Can not allocate small primes\n"); + plist[0] = 0; + START_DO_FOR_EACH_PRIME(2, n) { + plist[++i] = p; + } END_DO_FOR_EACH_PRIME; + *number_of_primes = i; + return plist; +} + +/* Given a max prime in small prime list, return max prev prime input */ +static uint32 prev_sieve_max(UV maxprime) { + UV limit = maxprime*maxprime - (maxprime*maxprime % (16*PREV_SIEVE_SIZE)) - 1; + return (limit > U32_CONST(4294967295)) ? U32_CONST(4294967295) : limit; +} + +/* Simple SoE filling a segment */ +static void _prev_sieve_fill(uint32 start, uint8* sieve, const uint32_t* primes) { + uint32 i, j, p; + memset( sieve, 0xFF, PREV_SIEVE_SIZE ); + for (i = 2, p = 3; p*p < start + (16*PREV_SIEVE_SIZE); p = primes[++i]) + for (j = (start == 0) ? p*p/2 : (p-1) - ((start+(p-1))/2) % p; + j < (8*PREV_SIEVE_SIZE); j += p) + sieve[j/8] &= ~(1U << (j%8)); +} + +/* Calculate previous prime using small segment */ +static uint32 prev_sieve_prime(uint32 n, uint8* sieve, uint32* segment_start, uint32 sieve_max, const uint32_t* primes) +{ + uint32 sieve_start, bit_offset; + if (n <= 3) return (n == 3) ? 2 : 0; + if (n > sieve_max) croak("ps overflow\n"); + + /* If n > 3 && n <= sieve_max, then there is an odd prime we can find. */ + n -= 2; + bit_offset = n % (16*PREV_SIEVE_SIZE); + sieve_start = n - bit_offset; + bit_offset >>= 1; + + while (1) { + if (sieve_start != *segment_start) { /* Fill sieve if necessary */ + _prev_sieve_fill(sieve_start, sieve, primes); + *segment_start = sieve_start; + } + do { /* Look for a set bit in sieve */ + if (sieve[bit_offset / 8] & (1u << (bit_offset % 8))) + return sieve_start + 2*bit_offset + 1; + } while (bit_offset-- > 0); + sieve_start -= (16 * PREV_SIEVE_SIZE); + bit_offset = ((16 * PREV_SIEVE_SIZE) - 1) / 2; + } +} + +/* Create factor table. + * In lehmer.c we create mu and lpf arrays, which I suspect is faster. + * Here we use Christian Bau's method, which is more memory efficient. + * TODO: Neither the speed nor space really matter, but we should measure them. + * + * In a uint16 we have stored: + * 0 moebius(n) = 0 + * even moebius(n) = 1 + * odd moebius(n) = -1 (last bit indicates even/odd number of factors) + * v smallest odd prime factor of n is v&1 + * 65535 large prime + */ +static uint16* ft_create(uint32 max) +{ + uint16* factor_table; + uint32 i; + uint32 tableLimit = max + 338 + 1; /* At least one more prime */ + uint32 tableSize = tableLimit/2; + uint32 max_prime = (tableLimit - 1) / 3 + 1; + + New(0, factor_table, tableSize, uint16); + if (factor_table == 0) return 0; + + /* Set all values to 65535 (a large prime), set 0 to 65534. */ + factor_table[0] = 65534; + for (i = 1; i < tableSize; ++i) + factor_table[i] = 65535; + + /* Process each odd. */ + for (i = 1; i < tableSize; ++i) { + uint32 factor, max_factor; + uint32 p = i*2+1; + if (factor_table[i] != 65535) /* Already marked. */ + continue; + if (p < 65535) /* p is a small prime, so set the number. */ + factor_table[i] = p; + if (p >= max_prime) /* No multiples will be in the table */ + continue; + + max_factor = (tableLimit - 1) / p + 1; + /* Look for odd multiples of the prime p. */ + for (factor = 3; factor < max_factor; factor += 2) { + uint32 index = (p*factor)/2; + if (factor_table[index] == 65535) /* p is smallest factor */ + factor_table[index] = p; + else if (factor_table[index] > 0) /* Change number of factors */ + factor_table[index] ^= 0x01; + } + + /* Change all odd multiples of p*p to 0 to indicate non-square-free. */ + for (factor = p; factor < max_factor; factor += 2*p) + factor_table[ (p*factor) / 2] = 0; + } + return factor_table; +} + + +static UV mapes(UV x, uint32 a) +{ + UV val; + if (a == 0) return x; + if (a == 1) return x-x/2; + val = x-x/2-x/3+x/6; + if (a >= 3) val += 0-x/5+x/10+x/15-x/30; + if (a >= 4) val += 0-x/7+x/14+x/21-x/42+x/35-x/70-x/105+x/210; + if (a >= 5) val += 0-x/11+x/22+x/33-x/66+x/55-x/110-x/165+x/330+x/77-x/154-x/231+x/462-x/385+x/770+x/1155-x/2310; + if (a >= 6) val += 0-x/13+x/26+x/39-x/78+x/65-x/130-x/195+x/390+x/91-x/182-x/273+x/546-x/455+x/910+x/1365-x/2730+x/143-x/286-x/429+x/858-x/715+x/1430+x/2145-x/4290-x/1001+x/2002+x/3003-x/6006+x/5005-x/10010-x/15015+x/30030; + if (a >= 7) val += 0-x/17+x/34+x/51-x/102+x/85-x/170-x/255+x/510+x/119-x/238-x/357+x/714-x/595+x/1190+x/1785-x/3570+x/187-x/374-x/561+x/1122-x/935+x/1870+x/2805-x/5610-x/1309+x/2618+x/3927-x/7854+x/6545-x/13090-x/19635+x/39270+x/221-x/442-x/663+x/1326-x/1105+x/2210+x/3315-x/6630-x/1547+x/3094+x/4641-x/9282+x/7735-x/15470-x/23205+x/46410-x/2431+x/4862+x/7293-x/14586+x/12155-x/24310-x/36465+x/72930+x/17017-x/34034-x/51051+x/102102-x/85085+x/170170+x/255255-x/510510; + return (UV) val; +} + +typedef struct { + uint32_t *sieve; /* segment bit mask */ + uint8 *word_count; /* bit count in each 64-bit word */ + uint32 *word_count_sum; /* cumulative sum of word_count */ + UV *totals; /* total bit count for all phis at index */ + uint32 *prime_index; /* index of prime where phi(n/p/p(k+1))=1 */ + uint32 *first_bit_index; /* offset relative to start for this prime */ + uint8 *multiplier; /* mod-30 wheel of each prime */ + UV start; /* x value of first bit of segment */ + uint32 size; /* segment size in bits */ + uint32 first_prime; /* index of first prime in segment */ + uint32 last_prime; /* index of last prime in segment */ + uint32 last_prime_to_remove; /* index of last prime p, p^2 in segment */ +} sieve_t; + +/* Size of phi sieve in 32-bit words. Multiple of 2*2*3*5*7*11 bytes. */ +#define PHI_SIEVE_WORDS (1155 * PHI_SIEVE_MULT) + +/* Bit counting using cumulative sums. A bit slower than using a running sum, + * but a little simpler and can be run in parallel. */ +static void make_sieve_counts(const uint32_t* sieve, uint32 sieve_size, uint8* sieve_word_count) { + uint32 lwords = (sieve_size + 127) / 128; + while (lwords--) { + *sieve_word_count++ = bitcount(sieve[0]) + bitcount(sieve[1]); + sieve += 2; + } +} +static uint32 make_sieve_sums(uint32 sieve_size, const uint8* sieve_word_count, uint32* sieve_word_count_sum) { + uint32 i, bc, lwords = (sieve_size + 127) / 128; + sieve_word_count_sum[0] = 0; + for (i = 0, bc = 0; i+7 < lwords; i += 8) { + const uint8* cntptr = sieve_word_count + i; + uint32* sumptr = sieve_word_count_sum + i; + sumptr[1] = bc += cntptr[0]; + sumptr[2] = bc += cntptr[1]; + sumptr[3] = bc += cntptr[2]; + sumptr[4] = bc += cntptr[3]; + sumptr[5] = bc += cntptr[4]; + sumptr[6] = bc += cntptr[5]; + sumptr[7] = bc += cntptr[6]; + sumptr[8] = bc += cntptr[7]; + } + for (; i < lwords; i++) + sieve_word_count_sum[i+1] = sieve_word_count_sum[i] + sieve_word_count[i]; + return sieve_word_count_sum[lwords]; +} + +static UV _sieve_phi(UV segment_x, const uint32_t* sieve, const uint32* sieve_word_count_sum) { + uint32 bits = (segment_x + 1) / 2; + uint32 words = bits / 32; + uint32 sieve_sum = sieve_word_count_sum[words/2]; + if (words & 1) sieve_sum += bitcount( sieve[words-1] ); + sieve_sum += bitcount( sieve[words] & ~(0xfffffffful << (bits % 32)) ); + return sieve_sum; +} + +/* Erasing primes from the sieve is done using Christian Bau's + * case statement walker. It's not pretty, but it is short, fast, + * clever, and does the job. */ + +#define sieve_zero(sieve, si, wordcount) \ + { uint32 index = si/32; \ + uint32 mask = 1UL << (si % 32); \ + if (sieve[index] & mask) { \ + sieve[index] &= ~mask; \ + wordcount[index/2]--; \ + } } + +#define sieve_case_zero(casenum, skip, si, p, size, mult, sieve, wordcount) \ + case casenum: sieve_zero(sieve, si, wordcount); \ + si += skip * p; \ + mult = (casenum+1) % 8; \ + if (si >= size) break; + +static UV remove_primes(uint32 index, uint32 last_index, sieve_t* s, const uint32_t* primes) +{ + uint32 size = (s->size + 1) / 2; + uint32_t *sieve = s->sieve; + uint8 *word_count = s->word_count; + UV total = s->totals[last_index]; + + for ( ;index <= last_index; index++) { + if (index >= s->first_prime && index <= s->last_prime) { + uint32 b = (primes[index] - (uint32) s->start - 1) / 2; + sieve_zero(sieve, b, word_count); + } + if (index <= s->last_prime_to_remove) { + uint32 b = s->first_bit_index[index]; + if (b < size) { + uint32 p = primes[index]; + uint32 mult = s->multiplier[index]; + switch (mult) { + reloop: ; + sieve_case_zero(0, 3, b, p, size, mult, sieve, word_count); + sieve_case_zero(1, 2, b, p, size, mult, sieve, word_count); + sieve_case_zero(2, 1, b, p, size, mult, sieve, word_count); + sieve_case_zero(3, 2, b, p, size, mult, sieve, word_count); + sieve_case_zero(4, 1, b, p, size, mult, sieve, word_count); + sieve_case_zero(5, 2, b, p, size, mult, sieve, word_count); + sieve_case_zero(6, 3, b, p, size, mult, sieve, word_count); + sieve_case_zero(7, 1, b, p, size, mult, sieve, word_count); + goto reloop; + } + s->multiplier[index] = mult; + } + s->first_bit_index[index] = b - size; + } + } + s->totals[last_index] += make_sieve_sums(s->size, s->word_count, s->word_count_sum); + return total; +} + +static void word_tile (uint32_t* source, uint32 from, uint32 to) { + while (from < to) { + uint32 words = (2*from > to) ? to-from : from; + memcpy(source+from, source, 4*words); + from += words; + } +} + +static UV init_segment(sieve_t* s, UV segment_start, uint32 size, uint32 start_prime_index, uint32 sieve_last, const uint32_t* primes) +{ + uint32 i, words; + uint32_t* sieve = s->sieve; + uint8* word_count = s->word_count; + + s->start = segment_start; + s->size = size; + + if (segment_start == 0) { + s->last_prime = 0; + s->last_prime_to_remove = 0; + } + s->first_prime = s->last_prime + 1; + while (s->last_prime < sieve_last) { + uint32 p = primes[s->last_prime + 1]; + if (p >= segment_start + size) + break; + s->last_prime++; + } + while (s->last_prime_to_remove < sieve_last) { + UV p = primes[s->last_prime_to_remove + 1]; + UV p2 = p*p; + if (p2 >= segment_start + size) + break; + s->last_prime_to_remove++; + s->first_bit_index[s->last_prime_to_remove] = (p2 - segment_start - 1) / 2; + s->multiplier[s->last_prime_to_remove] = (p % 30) * 8 / 30; + } + + memset(sieve, 0xFF, 3*4); /* Set first 3 words to all 1 bits */ + if (start_prime_index >= 3) /* Remove multiples of 3. */ + for (i = 3/2; i < 3 * 32; i += 3) + sieve[i / 32] &= ~(1ul << (i % 32)); + + word_tile(sieve, 3, 15); /* Copy to first 15 = 3*5 words */ + if (start_prime_index >= 3) /* Remove multiples of 5. */ + for (i = 5/2; i < 15 * 32; i += 5) + sieve[i / 32] &= ~(1ul << (i % 32)); + + word_tile(sieve, 15, 105); /* Copy to first 105 = 3*5*7 words */ + if (start_prime_index >= 4) /* Remove multiples of 7. */ + for (i = 7/2; i < 105 * 32; i += 7) + sieve[i / 32] &= ~(1ul << (i % 32)); + + word_tile(sieve, 105, 1155); /* Copy to first 1155 = 3*5*7*11 words */ + if (start_prime_index >= 5) /* Remove multiples of 11. */ + for (i = 11/2; i < 1155 * 32; i += 11) + sieve[i / 32] &= ~(1ul << (i % 32)); + + size = (size+1) / 2; /* size to odds */ + words = (size + 31) / 32; /* sieve size in 32-bit words */ + word_tile(sieve, 1155, words); /* Copy first 1155 words to rest */ + /* Zero all unused bits and words */ + if (size % 32) + sieve[words-1] &= ~ (0xffffffffUL << (size % 32)); + memset(sieve + words, 0x00, 4*(PHI_SIEVE_WORDS+2-words)); + + /* Now remove the rest of the primes and create counts+sums. */ + make_sieve_counts(sieve, s->size, word_count); + for (i = 6; i <= start_prime_index; i++) { + if (i >= s->first_prime && i <= s->last_prime) { + uint32 b = (primes[i] - segment_start - 1) / 2; + sieve_zero(sieve, b, word_count); + } + } + return remove_primes(6, start_prime_index, s, primes); +} + +/* However we want to handle reduced prime counts */ +#define simple_pi(n) _XS_LMO_pi(n) +/* Macros to hide all the variables being passed */ +#define prev_sieve_prime(n) \ + prev_sieve_prime(n, &prev_sieve[0], &ps_start, ps_max, primes) +#define sieve_phi(x) \ + phi_total + _sieve_phi((x) - ss.start, ss.sieve, ss.word_count_sum) + + +UV _XS_LMO_pi(UV n) +{ + UV N2, N3, K2, K3, M, sum1, sum2, phi_total, phi_value; + UV sieve_start, sieve_end, least_divisor, step7_max, last_phi_sieve; + uint32 j, k, piM, KM, end, prime, prime_index; + uint32 ps_start, ps_max, smallest_divisor, nprimes; + uint8 prev_sieve[PREV_SIEVE_SIZE]; + uint32_t *primes; + uint16 *factor_table; + sieve_t ss; + + const uint32 c = 7; /* We can use out Mapes function for c <= 7 */ + + /* For "small" n, use our table+segment sieve. */ + if (n < SIEVE_LIMIT || n < 10000) return _XS_prime_count(2, n); + /* n should now be reasonably sized (not tiny). */ + + N2 = isqrt(n); /* floor(N^1/2) */ + N3 = icbrt(n); /* floor(N^1/3) */ + K2 = simple_pi(N2); /* Pi(N2) */ + K3 = simple_pi(N3); /* Pi(N3) */ + + /* M is N^1/3 times a tunable performance factor. */ + M = M_FACTOR(N3); + if (M >= N2) M = N2 - 1; /* M must be smaller than N^1/2 */ + if (M < N3) M = N3; /* M must be at least N^1/3 */ + + /* Create the array of small primes, and least-prime-factor/moebius table */ + primes = make_primelist( M + 500, &nprimes ); + factor_table = ft_create( M ); + if (primes == 0 || factor_table == 0) + croak("Allocation failure in LMO Pi\n"); + + /* Create other arrays */ + New(0, ss.sieve, PHI_SIEVE_WORDS + 2, uint32_t); + New(0, ss.word_count, PHI_SIEVE_WORDS/2 + 2, uint8); + New(0, ss.word_count_sum, PHI_SIEVE_WORDS/2 + 2, uint32); + New(0, ss.totals, K3+2, UV); + New(0, ss.prime_index, K3+2, uint32); + New(0, ss.first_bit_index, K3+2, uint32); + New(0, ss.multiplier, K3+2, uint8); + + if (ss.sieve == 0 || ss.word_count == 0 || ss.word_count_sum == 0 || + ss.totals == 0 || ss.prime_index == 0 || ss.first_bit_index == 0 || + ss.multiplier == 0) + croak("Allocation failure in LMO Pi\n"); + + /* Variables for fast prev_prime using small segment sieves (up to M^2) */ + ps_max = prev_sieve_max( primes[nprimes] ); + ps_start = U32_CONST(0xFFFFFFFF); + + /* Look for the smallest divisor: the smallest number > M which is + * square-free and not divisible by any prime covered by our Mapes + * small-phi case. The largest value we will look up in the phi + * sieve is n/smallest_divisor. */ + for (j = (M+1)/2; factor_table[j] <= primes[c]; j++) /* */; + smallest_divisor = 2*j+1; + /* largest_divisor = (N2 > (UV)M * (UV)M) ? N2 : (UV)M * (UV)M; */ + + M = smallest_divisor - 1; /* Increase M if possible */ + piM = simple_pi(M); + if (piM < c) croak("N too small for LMO\n"); + last_phi_sieve = n / smallest_divisor + 1; + + /* KM = smallest k, c <= k <= piM, s.t. primes[k+1] * primes[k+2] > M. */ + for (KM = c; primes[KM+1] * primes[KM+2] <= M && KM < piM; KM++) /* */; + if (K3 < KM) K3 = KM; /* Ensure K3 >= KM */ + + /* Start calculating Pi(n). Steps 4-10 from Bau. */ + sum1 = (K2 - 1) + (UV) (piM - K3 - 1) * (UV) (piM - K3) / 2; + sum2 = 0; + end = (M+1)/2; + + /* Start at index K2, which is the prime preceeding N^1/2 */ + prime = prev_sieve_prime(N2+1); + prime_index = K2 - 1; + step7_max = K3; + + /* Step 4: For 1 <= x <= M where x is square-free and has no + * factor <= primes[c], sum phi(n / x, c). */ + for (j = 0; j < end; j++) { + uint32 lpf = factor_table[j]; + if (lpf > primes[c]) { + phi_value = mapes(n / (2*j+1), c); /* x = 2j+1 */ + if (lpf & 0x01) sum2 += phi_value; else sum1 += phi_value; + } + } + + /* Step 5: For 1+M/primes[c+1] <= x <= M, x square-free and + * has no factor <= primes[c+1], sum phi(n / (x*primes[c+1]), c). */ + if (c < piM) { + UV pc_1 = primes[c+1]; + for (j = (1+M/pc_1)/2; j < end; j++) { + uint32 lpf = factor_table[j]; + if (lpf > pc_1) { + phi_value = mapes(n / (pc_1 * (2*j+1)), c); /* x = 2j+1 */ + if (lpf & 0x01) sum1 += phi_value; else sum2 += phi_value; + } + } + } + + for (k = 0; k <= K3; k++) ss.totals[k] = 0; + for (k = 0; k < KM; k++) ss.prime_index[k] = end; + + /* Instead of dividing by all primes up to pi(M), once a divisor is large + * enough then phi(n / (p*primes[k+1]), k) = 1. */ + { + uint32 last_prime = piM; + for (k = KM; k < K3; k++) { + UV pk = primes[k+1]; + while (last_prime > k+1 && pk * pk * primes[last_prime] > n) + last_prime--; + ss.prime_index[k] = last_prime; + sum1 += piM - last_prime; + } + } + + for (sieve_start = 0; sieve_start < last_phi_sieve; sieve_start = sieve_end) { + /* This phi segment goes from sieve_start to sieve_end. */ + sieve_end = ((sieve_start + 64*PHI_SIEVE_WORDS) < last_phi_sieve) + ? sieve_start + 64*PHI_SIEVE_WORDS : last_phi_sieve; + /* Only divisors s.t. sieve_start <= N / divisor < sieve_end considered. */ + least_divisor = n / sieve_end; + /* Initialize the sieve segment and all associated variables. */ + phi_total = init_segment(&ss, sieve_start, sieve_end - sieve_start, c, K3, primes); + + /* Step 6: For c < k < KM: For 1+M/primes[k+1] <= x <= M, x square-free + * and has no factor <= primes[k+1], sum phi(n / (x*primes[k+1]), k). */ + for (k = c+1; k < KM; k++) { + UV pk = primes[k+1]; + uint32 start = (least_divisor >= pk * U32_CONST(0xFFFFFFFE)) + ? U32_CONST(0xFFFFFFFF) + : (least_divisor / pk + 1)/2; + phi_total = remove_primes(k, k, &ss, primes); + for (j = ss.prime_index[k] - 1; j >= start; j--) { + uint32 lpf = factor_table[j]; + if (lpf > pk) { + phi_value = sieve_phi(n / (pk * (2*j+1))); + if (lpf & 0x01) sum1 += phi_value; else sum2 += phi_value; + } + } + if (start < ss.prime_index[k]) + ss.prime_index[k] = start; + } + /* Step 7: For KM <= K < Pi_M: For primes[k+2] <= x <= M, + * sum phi(n / (x*primes[k+1]), k). */ + for (; k < step7_max; k++) { + phi_total = remove_primes(k, k, &ss, primes); + if (ss.prime_index[k] >= k+2) { + UV pk = primes[k+1]; + for (j = ss.prime_index[k]; j >= k+2; j--) { + UV divisor = pk * primes[j]; + if (divisor <= least_divisor) break; + sum1 += sieve_phi(n / divisor); + } + ss.prime_index[k] = j; + } + } + /* Restrict work for the above loop when we know it will be empty. */ + while (step7_max > KM && ss.prime_index[step7_max-1] < (step7_max-1)+2) + step7_max--; + + /* Step 8: For KM <= K < K3, sum -phi(n / primes[k+1], k) */ + phi_total = remove_primes(k, K3, &ss, primes); + /* Step 9: For K3 <= k < K2, sum -phi(n / primes[k+1], k) + (k-K3). */ + while (prime > least_divisor && prime_index >= piM) { + sum1 += prime_index - K3; + sum2 += sieve_phi(n / prime); + prime_index--; + prime = prev_sieve_prime(prime); + } + } + + Safefree(ss.sieve); + Safefree(ss.word_count); + Safefree(ss.word_count_sum); + Safefree(ss.totals); + Safefree(ss.prime_index); + Safefree(ss.first_bit_index); + Safefree(ss.multiplier); + Safefree(factor_table); + Safefree(primes); + + return sum1 - sum2; +} diff --git a/lmo.h b/lmo.h new file mode 100644 index 0000000..95aa2ff --- /dev/null +++ b/lmo.h @@ -0,0 +1,8 @@ +#ifndef MPU_LMO_H +#define MPU_LMO_H + +#include "ptypes.h" + +extern UV _XS_LMO_pi(UV n); + +#endif diff --git a/util.c b/util.c index 0b17ca4..7a62714 100644 --- a/util.c +++ b/util.c @@ -60,7 +60,7 @@ #include "sieve.h" #include "primality.h" #include "cache.h" -#include "lehmer.h" +#include "lmo.h" static int _verbose = 0; void _XS_set_verbose(int v) { _verbose = v; } -- Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/pkg-perl/packages/libmath-prime-util-perl.git _______________________________________________ Pkg-perl-cvs-commits mailing list Pkg-perl-cvs-commits@lists.alioth.debian.org http://lists.alioth.debian.org/cgi-bin/mailman/listinfo/pkg-perl-cvs-commits