This is an automated email from the git hooks/post-receive script. ppm-guest pushed a commit to annotated tag v0.23 in repository libmath-prime-util-perl.
commit 705c9247e386af8a9ec55a2ee00245182178f89e Author: Dana Jacobsen <d...@acm.org> Date: Sun Mar 3 05:45:08 2013 -0800 Start to add LMO prime_count --- Changes | 5 ++ TODO | 6 +++ XS.xs | 5 +- aks.c | 6 +-- factor.c | 24 +++++----- lehmer.c | 159 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++----- lehmer.h | 1 + sieve.c | 5 +- util.c | 36 ++++++++------- util.h | 12 +++++ 10 files changed, 212 insertions(+), 47 deletions(-) diff --git a/Changes b/Changes index 994544b..31bd220 100644 --- a/Changes +++ b/Changes @@ -18,6 +18,11 @@ Revision history for Perl extension Math::Prime::Util. - Add the first and second Chebyshev functions (theta and psi). + - put isqrt(n) in util.h, use it everywhere. + put icbrt(n) in lehmer.h, use it there. + + - Start on Lagarias-Miller-Odlyzko prime count. + - Performance: - Divisor sum with no sub is ~10x faster. - Speed up PP version of exp_mangoldt, create XS version. diff --git a/TODO b/TODO index cb3763b..e523058 100644 --- a/TODO +++ b/TODO @@ -47,3 +47,9 @@ } END_FOREACH_PRIME that (1) handles any low / high, and (2) segments. Take segment_primes from XS.xs to do this. + +- Try in lehmer.c's phi function: Use two linear arrays to replace the + heap+array. If we add val in one and sval in the other, they can remain + sorted, then reading the next combined value is easy. + +- Implement S2 calculation for LMO prime count. diff --git a/XS.xs b/XS.xs index b46185e..8892164 100644 --- a/XS.xs +++ b/XS.xs @@ -62,6 +62,9 @@ UV _XS_lehmer_pi(IN UV n) UV +_XS_LMO_pi(IN UV n) + +UV _XS_nth_prime(IN UV n) int @@ -158,7 +161,7 @@ segment_primes(IN UV low, IN UV high); { /* Avoid recalculations of this */ UV endp = (high_d >= (UV_MAX/30)) ? UV_MAX-2 : 30*high_d+29; - prime_precalc( sqrt(endp) + 0.1 + 1 ); + prime_precalc(isqrt(endp) + 1 ); } while ( low_d <= high_d ) { diff --git a/aks.c b/aks.c index 0e3b123..3b7f18c 100644 --- a/aks.c +++ b/aks.c @@ -55,7 +55,7 @@ static int is_perfect_power(UV n) { if ( n < (UV) pow(10, DBL_DIG) ) { #endif /* Simple floating point method. Fast, but need enough mantissa. */ - b = sqrt(n)+0.5; if (b*b == n) return 1; /* perfect square */ + b = isqrt(n); if (b*b == n) return 1; /* perfect square */ for (b = 3; b < last; b = _XS_next_prime(b)) { UV root = pow(n, 1.0 / (double)b) + 0.5; if ( ((UV)(pow(root, b)+0.5)) == n) return 1; @@ -163,7 +163,7 @@ static UV* poly_mod_pow(UV* pn, UV power, UV r, UV mod) { UV* res; UV* temp; - int use_sqr = (mod > sqrt(UV_MAX/r)) ? 0 : 1; + int use_sqr = (mod > isqrt(UV_MAX/r)) ? 0 : 1; Newz(0, res, r, UV); New(0, temp, r, UV); @@ -223,7 +223,7 @@ int _XS_is_aks_prime(UV n) if (is_perfect_power(n)) return 0; - sqrtn = sqrt(n); + sqrtn = isqrt(n); log2n = log(n) / log(2); /* C99 has a log2() function */ limit = (UV) floor(log2n * log2n); diff --git a/factor.c b/factor.c index 8bc1357..c547159 100644 --- a/factor.c +++ b/factor.c @@ -90,7 +90,7 @@ int factor(UV n, UV *factors) /* Factor via trial division. Nothing should make it here. */ UV f = tlim; UV m = tlim % 30; - UV limit = (UV) (sqrt(n)+0.1); + UV limit = isqrt(n); if (verbose) printf("doing trial on %"UVuf"\n", n); while (f <= limit) { if ( (n%f) == 0 ) { @@ -98,7 +98,7 @@ int factor(UV n, UV *factors) n /= f; fac_stack[nfac++] = f; } while ( (n%f) == 0 ); - limit = (UV) (sqrt(n)+0.1); + limit = isqrt(n); } f += wheeladvance30[m]; m = nextwheel30[m]; @@ -178,7 +178,7 @@ int trial_factor(UV n, UV *factors, UV maxtrial) } /* Trial division to this number at most. Reduced as we find factors. */ - limit = (UV) (sqrt(n)+0.1); + limit = isqrt(n); if (limit > maxtrial) limit = maxtrial; @@ -192,7 +192,7 @@ int trial_factor(UV n, UV *factors, UV maxtrial) factors[nfactors++] = f; n /= f; } while ( (n%f) == 0 ); - newlimit = (UV) (sqrt(n)+0.1); + newlimit = isqrt(n); if (newlimit < limit) limit = newlimit; } f = primes_small[++sp]; @@ -208,7 +208,7 @@ int trial_factor(UV n, UV *factors, UV maxtrial) factors[nfactors++] = f; n /= f; } while ( (n%f) == 0 ); - newlimit = (UV) (sqrt(n)+0.1); + newlimit = isqrt(n); if (newlimit < limit) limit = newlimit; } f += wheeladvance30[m]; @@ -280,7 +280,7 @@ static int is_perfect_square(UV n, UV* sqrtn) m = n % 63; if ((m*0x3d491df7) & (m*0xc824a9f9) & 0x10f14008) return 0; #endif - m = sqrt(n); + m = isqrt(n); if (n != (m*m)) return 0; @@ -459,7 +459,7 @@ int fermat_factor(UV n, UV *factors, UV rounds) MPUassert( (n >= 3) && ((n%2) != 0) , "bad n in fermat_factor"); - sqn = (UV) (sqrt(n)+0.1); + sqn = isqrt(n); x = 2 * sqn + 1; y = 1; r = (sqn*sqn) - n; @@ -630,7 +630,7 @@ int pminus1_factor(UV n, UV *factors, UV B1, UV B2) UV savea = 2; UV saveq = 2; UV j = 1; - UV sqrtB1 = sqrt(B1); + UV sqrtB1 = isqrt(B1); MPUassert( (n >= 3) && ((n%2) != 0) , "bad n in pminus1_factor"); for (q = 2; q <= sqrtB1; q = _XS_next_prime(q)) { @@ -761,7 +761,7 @@ int squfof_factor(UV n, UV *factors, UV rounds) /* TODO: What value of n leads to overflow? */ qlast = 1; - s = sqrt(n); + s = isqrt(n); p = s; temp = n - (s*s); /* temp = n - floor(sqrt(n))^2 */ @@ -798,7 +798,7 @@ int squfof_factor(UV n, UV *factors, UV rounds) q = t; p = pnext; /* check for square; even iter */ if (jter & 1) continue; /* jter is odd:omit square test */ - r = (int)sqrt((double)q); /* r = floor(sqrt(q)) */ + r = isqrt(q); /* r = floor(sqrt(q)) */ if (q != r*r) continue; if (qpoint == 0) break; qqueue[qpoint] = 0; @@ -1005,8 +1005,8 @@ int racing_squfof_factor(UV n, UV *factors, UV rounds) } mult_save[i].valid = 1; - mult_save[i].b0 = sqrt( (double)nn64 ); - mult_save[i].imax = sqrt( (double)mult_save[i].b0 ) / 3; + mult_save[i].b0 = isqrt(nn64); + mult_save[i].imax = sqrt(mult_save[i].b0) / 3; if (mult_save[i].imax < 20) mult_save[i].imax = 20; if (mult_save[i].imax > rounds) mult_save[i].imax = rounds; diff --git a/lehmer.c b/lehmer.c index aef6b37..48d57d1 100644 --- a/lehmer.c +++ b/lehmer.c @@ -54,6 +54,7 @@ */ static int const verbose = 0; +/* #define STAGE_TIMING 1 */ #ifdef STAGE_TIMING #include <sys/time.h> @@ -95,6 +96,16 @@ typedef signed long IV; #define croak(fmt,...) { printf(fmt,##__VA_ARGS__); exit(1); } #define prime_precalc(n) /* */ +static UV isqrt(UV n) +{ + if (sizeof(UV) == 8 && n >= 18446744065119617025UL) return 4294967295UL; + if (sizeof(UV) == 4 && n >= 4294836225UL) return 65535UL; + UV root = (UV) sqrt((double)n); + while (root*root > n) root--; + while ((root+1)*(root+1) <= n) root++; + return root; +} + /* There has _got_ to be a better way to get an array of small primes using * primesieve. This is ridiculous. */ static UV* sieve_array = 0; @@ -161,6 +172,36 @@ static UV* generate_small_primes(UV n) #endif +static UV icbrt(UV n) +{ +#if 0 + /* The integer cube root code is about 30% faster for me */ + if (n >= UVCONST(18446724184312856125)) return UVCONST(2642245); + UV root = (UV) pow(n, 1.0/3.0); + if (root*root*root > n) { + root--; + while (root*root*root > n) root--; + } else { + while ((root+1)*(root+1)*(root+1) <= n) root++; + } + return root; +#else + int s; + UV y = 0; + for (s = (sizeof(UV)*8)-1; s >= 0; s -= 3) { + UV b; + y += y; + b = 3*y*(y+1)+1; + if ((n >> s) >= b) { + n -= b << s; + y++; + } + } + return y; +#endif +} + + /* Given an array of primes[1..lastprime], return Pi(n) where n <= lastprime. * This is actually quite fast, and definitely faster than sieving. By using * this we can avoid caching prime counts and also skip most calls to the @@ -455,7 +496,7 @@ UV _XS_legendre_pi(UV n) if (n < SIEVE_LIMIT) return _XS_prime_count(2, n); - a = _XS_legendre_pi( (UV) (sqrt(n)+0.5) ); + a = _XS_legendre_pi(isqrt(n)); return phi(n, a) + a - 1; } @@ -472,9 +513,9 @@ UV _XS_meissel_pi(UV n) if (verbose > 0) printf("meissel %lu stage 1: calculate a,b,c \n", n); TIMING_START; - a = _XS_meissel_pi(pow(n, 1.0/3.0)+0.5); /* a = floor(n^1/3) */ - b = _XS_meissel_pi(sqrt(n)+0.5); /* b = floor(n^1/2) */ - c = a; /* c = a */ + a = _XS_meissel_pi(icbrt(n)); /* a = floor(n^1/3) */ + b = _XS_meissel_pi(isqrt(n)); /* b = floor(n^1/2) */ + c = a; /* c = a */ TIMING_END_PRINT("stage 1") if (verbose > 0) printf("meissel %lu stage 2: phi(x,a) (a=%lu b=%lu c=%lu)\n", n, a, b, c); @@ -491,7 +532,7 @@ UV _XS_meissel_pi(UV n) lastpc = primes[lastprime]; TIMING_END_PRINT("small primes") - prime_precalc( sqrt( n / primes[a+1] ) ); + prime_precalc(isqrt(n / primes[a+1])); if (verbose > 0) printf("meissel %lu stage 4: loop %lu to %lu, pc to %lu\n", n, a+1, b, n/primes[a+1]); TIMING_START; @@ -532,10 +573,10 @@ UV _XS_lehmer_pi(UV n) if (verbose > 0) printf("lehmer %lu stage 1: calculate a,b,c \n", n); TIMING_START; - z = (UV) sqrt((double)n+0.5); - a = _XS_lehmer_pi(sqrt((double)z)+0.5); /* a = floor(n^1/4) */ - b = _XS_lehmer_pi(z); /* b = floor(n^1/2) */ - c = _XS_lehmer_pi(pow((double)n, 1.0/3.0)+0.5); /* c = floor(n^1/3) */ + z = isqrt(n); + a = _XS_lehmer_pi(isqrt(z)); /* a = floor(n^1/4) */ + b = _XS_lehmer_pi(z); /* b = floor(n^1/2) */ + c = _XS_lehmer_pi(icbrt(n)); /* c = floor(n^1/3) */ TIMING_END_PRINT("stage 1") @@ -560,7 +601,7 @@ UV _XS_lehmer_pi(UV n) /* Ensure we have the base sieve for big prime_count ( n/primes[i] ). */ /* This is about 75k for n=10^13, 421k for n=10^15, 2.4M for n=10^17 */ - prime_precalc( sqrt( n / primes[a+1] ) ); + prime_precalc(isqrt(n / primes[a+1])); if (verbose > 0) printf("lehmer %lu stage 4: loop %lu to %lu, pc to %lu\n", n, a+1, b, n/primes[a+1]); TIMING_START; @@ -574,7 +615,7 @@ UV _XS_lehmer_pi(UV n) lastw = w; sum = sum - lastwpc; if (i <= c) { - UV bi = bs_prime_count( (UV) (sqrt(w) + 0.5), primes, lastprime ); + UV bi = bs_prime_count( isqrt(w), primes, lastprime ); for (j = i; j <= bi; j++) { sum = sum - bs_prime_count(w / primes[j], primes, lastprime) + j - 1; } @@ -585,6 +626,99 @@ UV _XS_lehmer_pi(UV n) return sum; } +UV _XS_LMO_pi(UV n) +{ + UV a, b, sum, i, lastprime, lastpc, lastw, lastwpc; + UV n13, n12, n23; + IV S1; + UV S2, P2; + const UV* primes = 0; /* small prime cache */ + char* mu = 0; /* moebius to n^1/3 */ + UV* lpf = 0; /* least prime factor to n^1/3 */ + DECLARE_TIMING_VARIABLES; + if (n < SIEVE_LIMIT) + return _XS_prime_count(2, n); + + if (verbose > 0) printf("LMO %lu stage 1: calculate pi(n^1/3) \n", n); + TIMING_START; + n13 = icbrt(n); + n12 = isqrt(n); + n23 = (UV) (pow(n, 2.0/3.0)+0.01); + a = _XS_lehmer_pi(n13); + b = _XS_lehmer_pi(n12); + TIMING_END_PRINT("stage 1") + + lastprime = b*16; + if (verbose > 0) printf("LMO %lu stage 2: %lu small primes\n", n, lastprime); + TIMING_START; + primes = generate_small_primes(lastprime); + if (primes == 0) croak("Error generating primes.\n"); + lastpc = primes[lastprime]; + TIMING_END_PRINT("small primes") + + if (verbose > 0) printf("LMO %lu stage 3: calculate mu/lpf to %lu\n", n, a); + TIMING_START; + /* We could call MPU's: + * mu = _moebius_range(0, n13+1) + * but (1) it's a bit slower (something to be addressed), and (2) we will + * do the least prime factor calculation at the same time. + */ + New(0, mu, n13+1, char); + memset(mu, 1, sizeof(char) * (n13+1)); + New(0, lpf, n13+1, UV); + memset(lpf, 0, sizeof(UV) * (n13+1)); + mu[0] = 0; + for (i = 1; i <= a; i++) { + UV primei = primes[i]; + UV j; + for (j = primei; j <= n13; j += primei) { + mu[j] = -mu[j]; + if (lpf[j] == 0) lpf[j] = primei; + } + UV isquared = primei * primei; + for (j = isquared; j <= n13; j += isquared) + mu[j] = 0; + } + //for (i = 0; i <= n13; i++) { printf("mu %lu %ld\n", i, (IV)mu[i]); } + TIMING_END_PRINT("mu") + + if (verbose > 0) printf("LMO %lu stage 4: calculate S1 (%lu)\n", n, n13); + TIMING_START; + S1 = 0; + for (i = 1; i <= n13; i++) + if (mu[i] != 0) + S1 += mu[i] * (IV) (n/i); + TIMING_END_PRINT("S1") + if (verbose > 0) printf("LMO %lu stage 4: S1 = %ld\n", n, S1); + + S2 = 0; + /* TODO... */ + + Safefree(mu); + Safefree(lpf); + + prime_precalc(isqrt(n / primes[a+1])); + if (verbose > 0) printf("LMO %lu stage 5: P2 loop %lu to %lu, pc to %lu\n", n, a+1, b, n/primes[a+1]); + TIMING_START; + P2 = 0; + /* Reverse the i loop so w increases. Count w in segments. */ + lastw = 0; + lastwpc = 0; + for (i = b; i > a; i--) { + UV w = n / primes[i]; + lastwpc = (w <= lastpc) ? bs_prime_count(w, primes, lastprime) + : lastwpc + _XS_prime_count(lastw+1, w); + lastw = w; + P2 += lastwpc; + } + P2 -= ((b+a-2) * (b-a+1) / 2) - a + 1; + TIMING_END_PRINT("P2") + if (verbose > 0) printf("LMO %lu stage 5: P2 = %lu\n", n, P2); + Safefree(primes); + sum = P2 + S1 + S2; + return sum; +} + #ifdef PRIMESIEVE_STANDALONE int main(int argc, char *argv[]) @@ -607,9 +741,10 @@ int main(int argc, char *argv[]) if (!strcasecmp(method, "lehmer")) { pi = _XS_lehmer_pi(n); } else if (!strcasecmp(method, "meissel")) { pi = _XS_meissel_pi(n); } else if (!strcasecmp(method, "legendre")) { pi = _XS_legendre_pi(n); } + else if (!strcasecmp(method, "lmo")) { pi = _XS_LMO_pi(n); } else if (!strcasecmp(method, "sieve")) { pi = _XS_prime_count(2, n); } else { - printf("method must be one of: lehmer, meissel, legendre, or sieve\n"); + printf("method must be one of: lehmer, meissel, legendre, lmo, or sieve\n"); return(2); } gettimeofday(&t1, 0); diff --git a/lehmer.h b/lehmer.h index e071deb..94b46a4 100644 --- a/lehmer.h +++ b/lehmer.h @@ -6,5 +6,6 @@ extern UV _XS_legendre_pi(UV n); extern UV _XS_meissel_pi(UV n); extern UV _XS_lehmer_pi(UV n); +extern UV _XS_LMO_pi(UV n); #endif diff --git a/sieve.c b/sieve.c index bebd2df..16d22fc 100644 --- a/sieve.c +++ b/sieve.c @@ -6,6 +6,7 @@ #include "sieve.h" #include "ptypes.h" #include "cache.h" +#include "util.h" /* 1001 bytes of presieved mod-30 bytes. If the area to be sieved is @@ -136,7 +137,7 @@ unsigned char* sieve_erat30(UV end) /* Fill buffer with marked 7, 11, and 13 */ sieve_prefill(mem, 0, max_buf-1); - limit = sqrt((double) end); /* prime*prime can overflow */ + limit = isqrt(end); /* prime*prime can overflow */ for (prime = 17; prime <= limit; prime = next_prime_in_sieve(mem,prime)) { UV d = (prime*prime)/30; UV m = (prime*prime) - d*30; @@ -198,7 +199,7 @@ int sieve_segment(unsigned char* mem, UV startd, UV endd) /* Fill buffer with marked 7, 11, and 13 */ sieve_prefill(mem, startd, endd); - limit = sqrt( (double) endp ) + 1; + limit = isqrt(endp) + 1; /* printf("segment sieve from %"UVuf" to %"UVuf" (aux sieve to %"UVuf")\n", startp, endp, limit); */ pcsize = get_prime_cache(limit, &sieve); if (pcsize < limit) { diff --git a/util.c b/util.c index 159d1b7..b041fb6 100644 --- a/util.c +++ b/util.c @@ -89,7 +89,7 @@ static UV count_zero_bits(const unsigned char* m, UV nbytes) static int _is_trial_prime7(UV n) { UV limit, i; - limit = sqrt(n); + limit = isqrt(n); i = 7; while (1) { /* trial division, skipping multiples of 2/3/5 */ if (i > limit) break; if ((n % i) == 0) return 0; i += 4; @@ -112,7 +112,7 @@ static int _is_prime7(UV n) if (n > MPU_PROB_PRIME_BEST) return _XS_is_prob_prime(n); /* We know this works for all 64-bit n */ - limit = sqrt(n); + limit = isqrt(n); i = 7; while (1) { /* trial division, skipping multiples of 2/3/5 */ if (i > limit) break; if ((n % i) == 0) return 0; i += 4; @@ -438,7 +438,7 @@ UV _XS_prime_count(UV low, UV high) /* Expand sieve to sqrt(n) */ UV endp = (high_d >= (UV_MAX/30)) ? UV_MAX-2 : 30*high_d+29; release_prime_cache(cache_sieve); - segment_size = get_prime_cache( sqrt(endp) + 1 , &cache_sieve) / 30; + segment_size = get_prime_cache( isqrt(endp) + 1 , &cache_sieve) / 30; } if ( (segment_size > 0) && (low_d <= segment_size) ) { @@ -615,7 +615,7 @@ UV _XS_nth_prime(UV n) } /* Make sure the segment siever won't have to keep resieving. */ - prime_precalc(sqrt(upper_limit)); + prime_precalc(isqrt(upper_limit)); } if (count == target) @@ -659,7 +659,7 @@ IV* _moebius_range(UV lo, UV hi) /* This implementation follows that of Deléglise & Rivat (1996), which is * a segmented version of Lioen & van de Lune (1994). */ - sqrtn = (UV) (sqrt(hi) + 0.5); + sqrtn = isqrt(hi); New(0, mu, hi-lo+1, IV); if (mu == 0) @@ -725,7 +725,7 @@ IV _XS_mertens(UV n) { IV sum; if (n <= 1) return n; - u = (UV) sqrt(n); + u = isqrt(n); mu = _moebius_range(0, u); New(0, M, u+1, IV); M[0] = 0; @@ -981,7 +981,6 @@ static const long double riemann_zeta_table[] = { long double ld_riemann_zeta(long double x) { long double const tol = 1e-17; int i; - KAHAN_INIT(sum); if (x < 0) croak("Invalid input to RiemannZeta: x must be >= 0"); if (x == 1) return INFINITY; @@ -1014,7 +1013,7 @@ long double ld_riemann_zeta(long double x) { 1.000000000000000000000L }; long double sumn = C8p[0]+x*(C8p[1]+x*(C8p[2]+x*(C8p[3]+x*(C8p[4]+x*(C8p[5]+x*(C8p[6]+x*(C8p[7]+x*C8p[8]))))))); long double sumd = C8q[0]+x*(C8q[1]+x*(C8q[2]+x*(C8q[3]+x*(C8q[4]+x*(C8q[5]+x*(C8q[6]+x*(C8q[7]+x*C8q[8]))))))); - sum = (sumn - (x-1)*sumd) / ((x-1)*sumd); + long double sum = (sumn - (x-1)*sumd) / ((x-1)*sumd); return sum; } @@ -1026,16 +1025,19 @@ long double ld_riemann_zeta(long double x) { } #if 0 - /* Simple defining series, works well. */ - for (i = 5; i <= 1000000; i++) { - long double term = powl(i, -x); - KAHAN_SUM(sum, term); - if (term < tol*sum) break; + { + KAHAN_INIT(sum); + /* Simple defining series, works well. */ + for (i = 5; i <= 1000000; i++) { + long double term = powl(i, -x); + KAHAN_SUM(sum, term); + if (term < tol*sum) break; + } + KAHAN_SUM(sum, powl(4, -x) ); + KAHAN_SUM(sum, powl(3, -x) ); + KAHAN_SUM(sum, powl(2, -x) ); + return sum; } - KAHAN_SUM(sum, powl(4, -x) ); - KAHAN_SUM(sum, powl(3, -x) ); - KAHAN_SUM(sum, powl(2, -x) ); - return sum; #endif /* The 2n!/B_2k series used by the Cephes library. */ diff --git a/util.h b/util.h index 3492469..9d9821e 100644 --- a/util.h +++ b/util.h @@ -32,4 +32,16 @@ extern double _XS_RiemannR(double x); #define MPU_PROB_PRIME_BEST UVCONST(100000000) #endif +static UV isqrt(UV n) { + #if BIT_PER_WORD == 32 + if (n >= UVCONST(4294836225)) return UVCONST(65535); + #else + if (n >= UVCONST(18446744065119617025)) return UVCONST(4294967295); + #endif + UV root = (UV) sqrt((double)n); + while (root*root > n) root--; + while ((root+1)*(root+1) <= n) root++; + return root; +} + #endif -- Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/pkg-perl/packages/libmath-prime-util-perl.git _______________________________________________ Pkg-perl-cvs-commits mailing list Pkg-perl-cvs-commits@lists.alioth.debian.org http://lists.alioth.debian.org/cgi-bin/mailman/listinfo/pkg-perl-cvs-commits