This is an automated email from the git hooks/post-receive script. ppm-guest pushed a commit to annotated tag v0.13 in repository libmath-prime-util-perl.
commit a7bc5c62a23097c8eaf8f663d958e89aa29a65ad Author: Dana Jacobsen <[email protected]> Date: Fri Nov 16 23:04:13 2012 -0800 More Lehmer improvements (faster, lower memory) --- lehmer.c | 292 +++++++++++++++++++++++++++++++++------------------------------ 1 file changed, 154 insertions(+), 138 deletions(-) diff --git a/lehmer.c b/lehmer.c index 4977c0f..8d091d0 100644 --- a/lehmer.c +++ b/lehmer.c @@ -33,22 +33,21 @@ * with large values of x. * * Calculating pi(10^11) is done in under 1 second on my computer. pi(10^14) - * takes about 1 minute, pi(10^16) in under an hour. Compared to Thomas - * R. Nicely's pix4 program, this one is about 3-4x faster and uses 3x less - * memory. However those comparisons were done without the big pre-computed - * prime count tables pix4 can use to speed up the final stage. + * takes under 1 minute, pi(10^16) in a half hour. Compared with Thomas + * R. Nicely's pix4 program, this one is 3-4x faster and uses 2-3x less memory. * * n phi(x,a) mem/time | stage 4 mem/time | total time - * 10^17 5988MB 1244.11 | 2688MB - * 10^16 1877MB 254.83 | 945MB 1478.9 | 28m 51.4s - * 10^15 754MB 50.50 | 392MB 224.2 | 4m 33.8s - * 10^14 260MB 9.879 | 217MB 38.06 | 47.83s - * 10^13 1.852 | 162MB 6.987 | 8.864s - * 10^12 0.363 | 1.358 | 1.768s - * 10^11 0.064 | 0.276 | 0.381s - * 10^10 xxxxxMB 0.011 | xxxxxMB 0.056 | 0.105s + * 10^17 5094MB 870.07 | 2688MB 11328.6 | 203m 20.0s + * 10^16 1483MB 168.19 | 945MB 1489.2 | 27m 35.5s + * 10^15 448MB 31.26 | 392MB 226.1 | 4m 17.1s + * 10^14 5.519 | 217MB 38.36 | 43.76s + * 10^13 0.950 | 162MB 7.025 | 7.993s + * 10^12 0.174 | 1.363 | 1.574s + * 10^11 0.034 | 0.278 | 0.346s + * 10^10 0.007 | 0.058 | 0.103s * - * Reference: "Prime numbers and computer methods for factorization", Riesel. + * Reference: Hans Riesel, "Prime Numbers and Computer Methods for + * Factorization", 2nd edition, 1994. */ static int const verbose = 0; @@ -70,8 +69,7 @@ static int const verbose = 0; /* Use Mapes' method to calculate phi(x,a) for small a. This is really * convenient and a little Perl script will spit this code out for whatever - * limit we select. It gets unwieldy however, and using the primorial/totient - * method looks faster for the final output. + * limit we select. It gets unwieldy with large a values. */ static IV mapes(IV x, UV a) { @@ -87,197 +85,214 @@ static IV mapes(IV x, UV a) return val; } +static IV mapes7(IV x) { /* A tiny bit faster setup for a=7 */ + IV val = x-x/2-x/3-x/5+x/6-x/7+x/10-x/11-x/13+x/14+x/15-x/17+x/21+x/22+x/26 + -x/30+x/33+x/34+x/35+x/39-x/42+x/51+x/55+x/65-x/66-x/70+x/77-x/78 + +x/85+x/91-x/102-x/105-x/110+x/119-x/130+x/143-x/154-x/165-x/170 + -x/182+x/187-x/195+x/210+x/221-x/231-x/238-x/255-x/273-x/286+x/330 + -x/357-x/374-x/385+x/390-x/429-x/442-x/455+x/462+x/510+x/546-x/561 + -x/595-x/663+x/714; + if (x >= 715) { + val += -x/715+x/770+x/858+x/910-x/935-x/1001-x/1105+x/1122+x/1155+x/1190 + -x/1309+x/1326+x/1365+x/1430-x/1547+x/1785+x/1870+x/2002+x/2145 + +x/2210-x/2310-x/2431+x/2618-x/2730+x/2805+x/3003+x/3094+x/3315 + -x/3570+x/3927-x/4290+x/4641+x/4862+x/5005-x/5610-x/6006+x/6545 + -x/6630+x/7293+x/7735-x/7854; + if (x >= 9282) + val += -x/9282-x/10010+x/12155-x/13090-x/14586-x/15015-x/15470+x/17017 + -x/19635-x/23205-x/24310+x/30030-x/34034-x/36465+x/39270+x/46410 + -x/51051+x/72930-x/85085+x/102102+x/170170+x/255255-x/510510; + } + return val; +} + /******************************************************************************/ /* Modified heap for manipulating our UV value / IV count pairs */ /******************************************************************************/ -/* TODO: This should be cleaned up */ +/* + * This is a heap augmented with a small array. We store values and signed + * counts, where all counts for the same value are summed. An easy way to + * do this in Perl/Python is a hash. In plain C, I don't believe this is the + * best solution. A heap can be implemented both easily and very efficiently + * (using a linear array), and as we pull items off the heap we can combine + * all similar values. + * + * Below some threshold value ('small_limit') the items become dense. That is, + * not only are the values small but we have many items in that range. Hence + * the small array augmentation. All values below the threshold are just put + * directly into an array. This not only handles them a little faster but + * helps reduce the heap size a bit, as we don't put any repeated values in + * the heap for the small items. Since they're dense in this range, we can + * do a linear scan to find the next non-zero count. + * + * An ideal data structure for our purpose would coalesce values on insertion, + * and would allow operating in place (so we could retrieve all our items and + * add new items as we go, without them appearing on this scan). The former + * is possible using an ordered list or a balanced tree. I don't know how we + * would achieve the latter. The point being that we're pulling items off of + * h1 and adding it (plus possibly a new item) to h2, so ideally we'd manage + * to use that space freed up by h1. + */ -/* heap of values and counts, stored by value */ typedef struct { UV v; IV c; } vc_t; typedef struct { - vc_t* array; - UV array_size; - UV N; - UV small_limit; - IV* small_array; - UV Nsmall; - int ptr_small; + UV small_limit; /* small count array: size */ + UV small_N; /* small count array: number of non-zero elements */ + int small_ptr; /* small count array: index of largest non-zero value */ + UV array_size; /* heap: allocated size in elements */ + UV N; /* heap: number of elements */ + IV* small_array; /* small count array data */ + vc_t* array; /* heap data */ } heap_t; -static void heap_insert(heap_t* h, vc_t v) +static void heap_insert(heap_t* h, UV val, IV count) { - UV k; - UV val = v.v; - if (v.c == 0) - return; if (val < h->small_limit) { - if (h->small_array[val] == 0) ++(h->Nsmall); - h->small_array[val] += v.c; - if (h->small_array[val] == 0) --(h->Nsmall); - else if (h->ptr_small < (int)val) h->ptr_small = (int) val; -// printf("small array insert %lu count %ld, coalesce to %ld, Nsmall = %lu, ptr %d\n", v.v, v.c, h->small_array[val], h->Nsmall, h->ptr_small); + IV* saptr = h->small_array + val; + if (*saptr == 0) + ++(h->small_N); + *saptr += count; + if (*saptr == 0) + --(h->small_N); + else if (h->small_ptr < (int)val) + h->small_ptr = (int) val; return; } UV n = ++(h->N); vc_t* a = h->array; if (n >= h->array_size) { - UV new_size = (h->array_size == 0) ? 20000 : 1.5 * h->array_size; - if (verbose>2) printf("REALLOCing %p, new size %lu\n", h->array, new_size); - h->array = realloc( h->array, new_size * sizeof(vc_t) ); + UV new_size = (h->array_size == 0) + ? (h->small_limit <= (20000/2)) ? 20000 : 2*h->small_limit + : 1.5 * h->array_size; + if (verbose>2) printf("REALLOCing heap %p, new size %lu\n", h->array, new_size); + h->array = Renew( h->array, new_size, vc_t ); if (h->array == 0) croak("could not allocate heap\n"); a = h->array; h->array_size = new_size-1; a[0].v = UV_MAX; a[0].c = 0; } - a[n] = v; - k = n; - while (a[k/2].v <= v.v) { /* upheap */ - a[k] = a[k/2]; - k /= 2; - } - a[k] = v; -} - -static vc_t heap_remove(heap_t* h) -{ - UV k; - UV n = h->N; - vc_t* a = h->array; - vc_t v, top; - if (n == 0) - croak("removing from empty heap\n"); - top = a[1]; - v = a[1] = a[n]; - n = --(h->N); /* downheap */ - k = 1; - while (k <= n/2) { - UV j = k+k; - if (j < n && a[j].v < a[j+1].v) j++; - if (v.v >= a[j].v) break; - a[k] = a[j]; - k = j; + while (a[n/2].v <= val) { /* upheap */ + a[n] = a[n/2]; + n /= 2; } - a[k] = v; - return top; + a[n].v = val; + a[n].c = count; } -static vc_t heap_remove_coalesce(heap_t* h) +static void heap_remove(heap_t* h, UV* val, IV* count) { - vc_t v; if (h->N == 0) { - int i; - if (h->Nsmall == 0) croak("remove from empty heap\n"); - for (i = h->ptr_small; i >= 0; i--) { - if (h->small_array[i] != 0) { - v.v = (UV) i; - v.c = h->small_array[i]; - h->small_array[i] = 0; - h->ptr_small = i-1; - --(h->Nsmall); - //printf("small array returning %lu count %ld\n", v.v, v.c); - return v; + if (h->small_N == 0) croak("remove from empty heap\n"); + /* Search small_array for a non-zero count from small_ptr down */ + IV* saptr = h->small_array + h->small_ptr; + while (!*saptr) + saptr--; + if (saptr < h->small_array) croak("walked off small array\n"); + *val = (saptr - h->small_array); + *count = *saptr; + *saptr = 0; + h->small_ptr = *val-1; + --(h->small_N); + return; + } else { + vc_t* a = h->array; + UV ival, k, n = h->N; + *val = a[1].v; + *count = 0; + do { + *count += a[1].c; + /* remove top element */ + ival = a[n--].v; + k = 1; + while (k <= n/2) { + UV j = k+k; + if (j < n && a[j].v < a[j+1].v) j++; + if (ival >= a[j].v) break; + a[k] = a[j]; + k = j; } - } - croak("walked off small array\n"); - } - v = heap_remove(h); - /* get rest of entries with same value */ - while (h->N > 0 && h->array[1].v == v.v) { - v.c += h->array[1].c; - (void) heap_remove(h); + a[k] = a[n+1]; + } while (n > 0 && a[1].v == *val); + h->N = n; } - return v; } static heap_t heap_create(UV small_size) { heap_t h; h.array = 0; h.array_size = 0; - h.N = 0; h.small_limit = small_size; h.small_array = 0; if (small_size > 0) { if (verbose>1)printf("creating small array of size %lu\n", small_size); - h.small_array = (IV*) calloc( small_size, sizeof(IV) ); + Newz(0, h.small_array, small_size, IV); } - h.Nsmall = 0; - h.ptr_small = -1; + h.N = 0; + h.small_N = 0; + h.small_ptr = -1; return h; } static void heap_destroy(heap_t* h) { if (h->array != 0) { - if (verbose > 2) printf("FREE %p\n", h->array); - free(h->array); + if (verbose > 2) printf("FREE heap %p\n", h->array); + Safefree(h->array); } h->array = 0; h->array_size = 0; h->N = 0; - if (h->small_array != 0) { - free(h->small_array); - } + if (h->small_array != 0) + Safefree(h->small_array); h->small_array = 0; - h->Nsmall = 0; - h->ptr_small = -1; -} -static void heap_empty(heap_t* h) -{ - h->N = 0; - h->Nsmall = 0; - h->ptr_small = -1; - if (h->small_limit > 0) - memset(h->small_array, 0, h->small_limit * sizeof(IV)); + h->small_N = 0; + h->small_ptr = -1; } - -#define heap_not_empty(h) ((h).N > 0 || (h).Nsmall > 0) +#define heap_not_empty(h) ((h).N > 0 || (h).small_N > 0) /******************************************************************************/ /* - * The main phi(x,a) algorithm. In this implementation, it takes about 25% - * of the total time for the Lehmer algorithm, but it is by far the most memory - * consuming section. + * The main phi(x,a) algorithm. In this implementation, it takes about 15% + * of the total time for the Lehmer algorithm, but it is by far the most + * memory consuming part. */ static UV phi(UV x, UV a, UV* primes) { - UV i; - IV sum = 0; - UV primea = primes ? primes[a+1] : _XS_nth_prime(a+1); + UV i, val; + IV count, sum = 0; + UV primea = primes ? primes[a+1] : _XS_nth_prime(a+1); if (x < primea) return (x > 0) ? 1 : 0; if (a == 1) return ((x+1)/2); if (a <= 7) return mapes(x, a); primea = primes ? primes[a] : _XS_prev_prime(primea); - heap_t h1 = heap_create(20 * x/primea/primea/primea); - heap_t h2 = heap_create(20 * x/primea/primea/primea); - vc_t v; + heap_t h1 = heap_create(a * 1000); + heap_t h2 = heap_create(a * 1000); + + heap_insert(&h1, x, 1); - v.v = x; v.c = 1; - heap_insert(&h1, v); - while (a > 6) { - heap_empty(&h2); + while (a > 7) { + if (heap_not_empty(h2)) croak("h2 heap isn't empty."); while ( heap_not_empty(h1) ) { UV sval; - v = heap_remove_coalesce(&h1); - if (v.c == 0) + heap_remove(&h1, &val, &count); + if (count == 0) continue; - heap_insert(&h2, v); - sval = v.v / primea; + heap_insert(&h2, val, count); + sval = val / primea; if (sval >= primea) { - v.v = sval; - v.c = -v.c; - heap_insert(&h2, v); + heap_insert(&h2, sval, -count); } else { - sum -= v.c; + sum -= count; } } { heap_t t = h1; h1 = h2; h2 = t; } @@ -285,13 +300,14 @@ static UV phi(UV x, UV a, UV* primes) primea = primes ? primes[a] : _XS_prev_prime(primea); } heap_destroy(&h2); + if (a != 7) croak("final loop is set for a=7, a = %lu\n", a); while ( heap_not_empty(h1) ) { - v = heap_remove_coalesce(&h1); - if (v.c != 0) - sum += v.c * mapes(v.v, a); /* This could be faster */ + heap_remove(&h1, &val, &count); + if (count != 0) + sum += count * mapes7(val); } heap_destroy(&h1); - return sum; + return (UV) sum; } @@ -335,7 +351,7 @@ UV _XS_lehmer_pi(UV n) release_prime_cache(sieve); croak("Could not generate sieve for %"UVuf, z); } - primea = (UV*) malloc( (b+4) * sizeof(UV) ); + New(0, primea, b+4, UV); if (primea == 0) croak("Can not allocate small primes\n"); primea[0] = 0; @@ -364,7 +380,7 @@ UV _XS_lehmer_pi(UV n) TIMING_START; { /* Generate primecounts up to z used by stage 4. */ UV count = 0; - pca = (UV*) malloc( (z+4) * sizeof(UV) ); + New(0, pca, z+4, UV); if (pca == 0) croak("Can not allocate small prime counts\n"); for (i = 0; i <= z && count+1 <= b; i++) { @@ -409,8 +425,8 @@ printf("precalculated counts to %lu\n", z); } TIMING_END_PRINT("stage 4") if (pca != 0) - free(pca); + Safefree(pca); if (primea != 0) - free(primea); + Safefree(primea); return sum; } -- Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/pkg-perl/packages/libmath-prime-util-perl.git _______________________________________________ Pkg-perl-cvs-commits mailing list [email protected] http://lists.alioth.debian.org/cgi-bin/mailman/listinfo/pkg-perl-cvs-commits
