This is an automated email from the git hooks/post-receive script. ppm-guest pushed a commit to annotated tag v0.13 in repository libmath-prime-util-perl.
commit 3b884098fe37e7793d923f0975ebd9918df4961a Author: Dana Jacobsen <[email protected]> Date: Fri Nov 16 03:49:37 2012 -0800 Comments and a small speedup for Lehmer --- lehmer.c | 210 +++++++++++++++++++++++++++++++++++++++++++++++++++++++-------- 1 file changed, 184 insertions(+), 26 deletions(-) diff --git a/lehmer.c b/lehmer.c index 5c82ba1..4977c0f 100644 --- a/lehmer.c +++ b/lehmer.c @@ -8,8 +8,71 @@ #include "cache.h" #include "sieve.h" +/***************************************************************************** + * + * Lehmer prime counting utility. Calculates pi(x), count of primes <= x. + * + * Copyright (c) 2012 Dana Jacobsen ([email protected]). + * This is free software; you can redistribute it and/or modify it under + * the same terms as the Perl 5 programming language system itself. + * + * This file is part of the Math::Prime::Util Perl module. It relies on two + * features: (1) a sieve to generate very small primes which could be as + * simple as a three-line SoE, and (2) _XS_prime_count(low, high) is expected + * to return the prime count within the segment low to high inclusive. These + * are used for the relatively small segments in the final stage, but making + * it fast is important for the final performance. The primesieve package is + * one source of excellent routines for either task. + * + * The phi(x,a) calculation is unique, to the best of my knowledge. It keeps + * a heap of all x values + signed counts for the given 'a' value, and walks + * 'a' down until it is small enough to calculate directly (either with Mapes + * or using a calculated table using the primorial/totient method). This + * is relatively fast and low memory compared to many other solutions. As with + * all Lehmer-Meissel-Legendre algorithms, memory use will be a constraint + * with large values of x. + * + * Calculating pi(10^11) is done in under 1 second on my computer. pi(10^14) + * takes about 1 minute, pi(10^16) in under an hour. Compared to Thomas + * R. Nicely's pix4 program, this one is about 3-4x faster and uses 3x less + * memory. However those comparisons were done without the big pre-computed + * prime count tables pix4 can use to speed up the final stage. + * + * n phi(x,a) mem/time | stage 4 mem/time | total time + * 10^17 5988MB 1244.11 | 2688MB + * 10^16 1877MB 254.83 | 945MB 1478.9 | 28m 51.4s + * 10^15 754MB 50.50 | 392MB 224.2 | 4m 33.8s + * 10^14 260MB 9.879 | 217MB 38.06 | 47.83s + * 10^13 1.852 | 162MB 6.987 | 8.864s + * 10^12 0.363 | 1.358 | 1.768s + * 10^11 0.064 | 0.276 | 0.381s + * 10^10 xxxxxMB 0.011 | xxxxxMB 0.056 | 0.105s + * + * Reference: "Prime numbers and computer methods for factorization", Riesel. + */ + static int const verbose = 0; +#define DO_TIMING 0 +#ifdef DO_TIMING + #include <sys/time.h> + #define DECLARE_TIMING_VARIABLES struct timeval t0, t1; + #define TIMING_START gettimeofday(&t0, 0); + #define TIMING_END_PRINT(text) \ + { unsigned long long t; \ + gettimeofday(&t1, 0); \ + t = (t1.tv_sec-t0.tv_sec) * 1000000 + (t1.tv_usec - t0.tv_usec); \ + printf("%s: %10.5f\n", text, ((double)t) / 1000000); } +#else + #define DECLARE_TIMING_VARIABLES + #define TIMING_START + #define TIMING_END_PRINT(text) +#endif +/* Use Mapes' method to calculate phi(x,a) for small a. This is really + * convenient and a little Perl script will spit this code out for whatever + * limit we select. It gets unwieldy however, and using the primorial/totient + * method looks faster for the final output. + */ static IV mapes(IV x, UV a) { IV val; @@ -24,6 +87,12 @@ static IV mapes(IV x, UV a) return val; } +/******************************************************************************/ +/* Modified heap for manipulating our UV value / IV count pairs */ +/******************************************************************************/ + +/* TODO: This should be cleaned up */ + /* heap of values and counts, stored by value */ typedef struct { UV v; @@ -34,15 +103,30 @@ typedef struct { vc_t* array; UV array_size; UV N; + UV small_limit; + IV* small_array; + UV Nsmall; + int ptr_small; } heap_t; static void heap_insert(heap_t* h, vc_t v) { UV k; + UV val = v.v; + if (v.c == 0) + return; + if (val < h->small_limit) { + if (h->small_array[val] == 0) ++(h->Nsmall); + h->small_array[val] += v.c; + if (h->small_array[val] == 0) --(h->Nsmall); + else if (h->ptr_small < (int)val) h->ptr_small = (int) val; +// printf("small array insert %lu count %ld, coalesce to %ld, Nsmall = %lu, ptr %d\n", v.v, v.c, h->small_array[val], h->Nsmall, h->ptr_small); + return; + } UV n = ++(h->N); vc_t* a = h->array; if (n >= h->array_size) { - UV new_size = (h->array_size == 0) ? 20000 : 2 * h->array_size; + UV new_size = (h->array_size == 0) ? 20000 : 1.5 * h->array_size; if (verbose>2) printf("REALLOCing %p, new size %lu\n", h->array, new_size); h->array = realloc( h->array, new_size * sizeof(vc_t) ); if (h->array == 0) @@ -84,7 +168,24 @@ static vc_t heap_remove(heap_t* h) } static vc_t heap_remove_coalesce(heap_t* h) { - vc_t v = heap_remove(h); + vc_t v; + if (h->N == 0) { + int i; + if (h->Nsmall == 0) croak("remove from empty heap\n"); + for (i = h->ptr_small; i >= 0; i--) { + if (h->small_array[i] != 0) { + v.v = (UV) i; + v.c = h->small_array[i]; + h->small_array[i] = 0; + h->ptr_small = i-1; + --(h->Nsmall); + //printf("small array returning %lu count %ld\n", v.v, v.c); + return v; + } + } + croak("walked off small array\n"); + } + v = heap_remove(h); /* get rest of entries with same value */ while (h->N > 0 && h->array[1].v == v.v) { v.c += h->array[1].c; @@ -92,12 +193,20 @@ static vc_t heap_remove_coalesce(heap_t* h) } return v; } -static heap_t heap_create(void) +static heap_t heap_create(UV small_size) { heap_t h; h.array = 0; h.array_size = 0; h.N = 0; + h.small_limit = small_size; + h.small_array = 0; + if (small_size > 0) { + if (verbose>1)printf("creating small array of size %lu\n", small_size); + h.small_array = (IV*) calloc( small_size, sizeof(IV) ); + } + h.Nsmall = 0; + h.ptr_small = -1; return h; } static void heap_destroy(heap_t* h) @@ -109,34 +218,54 @@ static void heap_destroy(heap_t* h) h->array = 0; h->array_size = 0; h->N = 0; + if (h->small_array != 0) { + free(h->small_array); + } + h->small_array = 0; + h->Nsmall = 0; + h->ptr_small = -1; } static void heap_empty(heap_t* h) { h->N = 0; + h->Nsmall = 0; + h->ptr_small = -1; + if (h->small_limit > 0) + memset(h->small_array, 0, h->small_limit * sizeof(IV)); } +#define heap_not_empty(h) ((h).N > 0 || (h).Nsmall > 0) + +/******************************************************************************/ + -static UV phi(UV x, UV a) + +/* + * The main phi(x,a) algorithm. In this implementation, it takes about 25% + * of the total time for the Lehmer algorithm, but it is by far the most memory + * consuming section. + */ + +static UV phi(UV x, UV a, UV* primes) { UV i; - IV primea; IV sum = 0; - if (x < _XS_nth_prime(a+1)) return (x > 0) ? 1 : 0; - if (a == 1) return ((x+1)/2); - if (a <= 7) return mapes(x, a); + UV primea = primes ? primes[a+1] : _XS_nth_prime(a+1); + if (x < primea) return (x > 0) ? 1 : 0; + if (a == 1) return ((x+1)/2); + if (a <= 7) return mapes(x, a); - primea = _XS_nth_prime(a); + primea = primes ? primes[a] : _XS_prev_prime(primea); - heap_t h1 = heap_create(); - heap_t h2 = heap_create(); + heap_t h1 = heap_create(20 * x/primea/primea/primea); + heap_t h2 = heap_create(20 * x/primea/primea/primea); vc_t v; v.v = x; v.c = 1; heap_insert(&h1, v); - while (a > 5) { + while (a > 6) { heap_empty(&h2); - /* Walk heap */ - while (h1.N > 0) { + while ( heap_not_empty(h1) ) { UV sval; v = heap_remove_coalesce(&h1); if (v.c == 0) @@ -153,10 +282,10 @@ static UV phi(UV x, UV a) } { heap_t t = h1; h1 = h2; h2 = t; } a--; - primea = _XS_prev_prime(primea); + primea = primes ? primes[a] : _XS_prev_prime(primea); } heap_destroy(&h2); - while (h1.N > 0) { + while ( heap_not_empty(h1) ) { v = heap_remove_coalesce(&h1); if (v.c != 0) sum += v.c * mapes(v.v, a); /* This could be faster */ @@ -165,6 +294,7 @@ static UV phi(UV x, UV a) return sum; } + /* Legendre's method. Interesting and a good test for phi(x,a), but Lehmer's * method is much faster (Legendre: a = pi(n^.5), Lehmer: a = pi(n^.25)) */ UV _XS_legendre_pi(UV n) @@ -174,29 +304,32 @@ UV _XS_legendre_pi(UV n) UV a = _XS_legendre_pi(sqrt(n)); prime_precalc(a); - return phi(n, a) + a - 1; + return phi(n, a, 0) + a - 1; } +/* Lehmer's method. See Riesel for basic program. */ + UV _XS_lehmer_pi(UV n) { + DECLARE_TIMING_VARIABLES; UV z, a, b, c, sum, i, j, lastw, lastwpc; UV* primea = 0; /* small prime cache, first b=pi(z)=pi(sqrt(n)) primes */ UV* pca = 0; /* small prime count cache, first z=sqrt(n) numbers */ if (n < 1000000) return _XS_prime_count(2, n); - if (verbose > 0) printf("lehmer stage 1 - %lu\n", n); + if (verbose > 0) printf("lehmer %lu stage 1: calculate a,b,c \n", n); + TIMING_START; z = sqrt((double)n+0.5); a = _XS_lehmer_pi(sqrt((double)z)+0.5); b = _XS_lehmer_pi(z); c = _XS_lehmer_pi(pow((double)n, 1.0/3.0)+0.5); - if (verbose > 0) printf("lehmer stage 2 - %lu (z=%lu a=%lu b=%lu c=%lu)\n", n, z, a, b, c); - sum = phi(n, a) + ( (b+a-2) * (b-a+1) / 2); + TIMING_END_PRINT("stage 1") - if (verbose > 0) printf("lehmer stage 3 - n=%lu a=%lu\n", n, a); - /* Calculate the first b primes and first z primecounts */ - { - UV count = 0; + + if (verbose > 0) printf("lehmer %lu stage 2: %lu small primes, %lu counts\n", n, b, z); + TIMING_START; + { /* Calculate the first b primes */ const unsigned char* sieve; if (get_prime_cache(z, &sieve) < z) { release_prime_cache(sieve); @@ -218,7 +351,19 @@ UV _XS_lehmer_pi(UV n) if (i < b) croak("Did not generate enough small primes. Bug in Lehmer code.\n"); if (verbose > 1) printf("generated first %lu small primes, from 2 to %lu\n", i, z); + } + TIMING_END_PRINT("small primes") + + + if (verbose > 0) printf("lehmer %lu stage 3: phi(x,a) (z=%lu a=%lu b=%lu c=%lu)\n", n, z, a, b, c); + TIMING_START; + sum = phi(n, a, primea) + ( (b+a-2) * (b-a+1) / 2); + TIMING_END_PRINT("phi(x,a)") + + TIMING_START; + { /* Generate primecounts up to z used by stage 4. */ + UV count = 0; pca = (UV*) malloc( (z+4) * sizeof(UV) ); if (pca == 0) croak("Can not allocate small prime counts\n"); @@ -229,12 +374,24 @@ UV _XS_lehmer_pi(UV n) } if (verbose > 1) printf("generated first %lu prime counts, from 2 to %lu\n", count, z); } + TIMING_END_PRINT("prime counts") +printf("precalculated counts to %lu\n", z); - /* Ensure we have the base sieve for prime_count ( n/primea[i] ) */ + /* Ensure we have the base sieve for prime_count ( n/primea[i] ). */ + /* This is about 75k for n=10^13, 421k for n=10^15, 2.4M for n=10^17 */ prime_precalc( sqrt( n / primea[a+1] ) ); + /* TODO: + * (1) generate 2b primes instead of b + * (2) generate counts for all 2b primes + * (3) store counts on odd numbers only. + * (4) in loop below, check if we have the result in pca and use it. + * This will cut in half the prime_count calls below, at a little memory + * cost (almost entirely mitigated by change #3) + */ - if (verbose > 0) printf("lehmer stage 4 - n=%lu a=%lu\n", n, a); + if (verbose > 0) printf("lehmer %lu stage 4: loop %lu to %lu, pc to %lu\n", n, a+1, b, n/primea[a+1]); + TIMING_START; /* Reverse the i loop so w increases. Count w in segments. */ lastw = 0; lastwpc = 0; @@ -250,6 +407,7 @@ UV _XS_lehmer_pi(UV n) } } } + TIMING_END_PRINT("stage 4") if (pca != 0) free(pca); if (primea != 0) -- Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/pkg-perl/packages/libmath-prime-util-perl.git _______________________________________________ Pkg-perl-cvs-commits mailing list [email protected] http://lists.alioth.debian.org/cgi-bin/mailman/listinfo/pkg-perl-cvs-commits
