This is an automated email from the git hooks/post-receive script. ppm-guest pushed a commit to annotated tag v0.23 in repository libmath-prime-util-perl.
commit 52b8d85051ee6cfa53fa208906d54bf7b1e3391a Author: Dana Jacobsen <d...@acm.org> Date: Mon Mar 4 05:38:31 2013 -0800 New data structure for phi(x,a): faster prime_count --- Changes | 1 + TODO | 4 +- lehmer.c | 313 +++++++++++++++++++++++++++------------------------------------ 3 files changed, 134 insertions(+), 184 deletions(-) diff --git a/Changes b/Changes index 31bd220..a0b69db 100644 --- a/Changes +++ b/Changes @@ -29,6 +29,7 @@ Revision history for Perl extension Math::Prime::Util. - Zeta much faster as mentioned above. - faster nth_prime as mentioned above. - AKS about 10% faster. + - New data structure for phi(x,a) means even faster prime_count. 0.22 26 February 2013 diff --git a/TODO b/TODO index e523058..65c9f1a 100644 --- a/TODO +++ b/TODO @@ -48,8 +48,6 @@ that (1) handles any low / high, and (2) segments. Take segment_primes from XS.xs to do this. -- Try in lehmer.c's phi function: Use two linear arrays to replace the - heap+array. If we add val in one and sval in the other, they can remain - sorted, then reading the next combined value is easy. +- Do in-place merge in new phi(x,a). - Implement S2 calculation for LMO prime count. diff --git a/lehmer.c b/lehmer.c index 48d57d1..65a7bd2 100644 --- a/lehmer.c +++ b/lehmer.c @@ -9,7 +9,7 @@ * * Lehmer prime counting utility. Calculates pi(x), count of primes <= x. * - * Copyright (c) 2012 Dana Jacobsen (d...@acm.org). + * Copyright (c) 2012-2013 Dana Jacobsen (d...@acm.org). * This is free software; you can redistribute it and/or modify it under * the same terms as the Perl 5 programming language system itself. * @@ -22,33 +22,34 @@ * * g++ -O3 -DPRIMESIEVE_STANDALONE -DPRIMESIEVE_PARALLEL lehmer.c -o prime_count -lprimesieve -lgomp * - * The phi(x,a) calculation is unique, to the best of my knowledge. It keeps - * a heap of all x values + signed counts for the given 'a' value, and walks + * The phi(x,a) calculation is unique, to the best of my knowledge. It uses + * two lists of all x values + signed counts for the given 'a' value, and walks * 'a' down until it is small enough to calculate directly (either with Mapes * or using a calculated table using the primorial/totient method). This * is relatively fast and low memory compared to many other solutions. As with * all Lehmer-Meissel-Legendre algorithms, memory use will be a constraint * with large values of x. * - * Calculating pi(10^11) is done in under 1 second on my computer. pi(10^14) - * takes under 1 minute, pi(10^16) in a half hour. Compared with Thomas - * R. Nicely's pix4 program, this one is 3-5x faster and uses 2-3x less memory. - * When compiled with parallel primesieve it is another 2x or more faster: - * pix4(10^16) takes 124 minutes, this takes 6 minutes. + * Using my sieve code with everything running in serial, calculating pi(10^12) + * is done undef 1 second on my computer. pi(10^14) takes under 30 seconds, + * pi(10^16) in under 20 minutes. Compared with Thomas R. Nicely's pix4 + * program, his one is 4-6x faster and uses 2-3x less memory. When compiled + * with parallel primesieve it is another 2x or more faster: + * pix4(10^16) takes 124 minutes, this code + primesieve takes 4 minutes. * - * n phi(x,a) mem/time | stage 4 mem/time | total time - * 10^17 4953MB 871.14 | 2988MB 9911.9 | 179m 37.5s - * 10^16 1436MB 168.02 | 901MB 1195.7 | 22m 45.6s - * 10^15 432MB 31.34 | 394MB 165.6 | 3m 17.5s - * 10^14 203MB 5.509 | 223MB 25.96 | 31.69s - * 10^13 0.949 | 165MB 4.284 | 5.336s - * 10^12 0.174 | 0.755 | 0.990s - * 10^11 0.034 | 0.138 | 0.213s - * 10^10 0.007 | 0.025 | 0.064s - * - * These timings are using Perl + MPU. The standalone version using primesieve + * Timings with Perl + MPU. Using the standalone program with primesieve * speeds up stage 4 a lot for large values. * + * n phi(x,a) mem/time | stage 4 mem/time | total time + * 10^17 3000MB 275.29 | 2300MB 9911.9 | 179m 37.5s + * 10^16 986MB 38.73 | 815MB 1139.8 | 19m 43.0s + * 10^15 170MB 7.22 | 296MB 147.9 | 2m 36.2s + * 10^14 39MB 1.69 | 101MB 23.03 | 24.740s + * 10^13 0.398 | 32MB 3.840 | 5.336s + * 10^12 0.093 | 0.666 | 0.802s + * 10^11 0.017 | 0.120 | 0.143s + * 10^10 0.004 | 0.023 | 0.028s + * * Reference: Hans Riesel, "Prime Numbers and Computer Methods for * Factorization", 2nd edition, 1994. */ @@ -267,173 +268,116 @@ static UV mapes7(UV x) { /* A tiny bit faster setup for a=7 */ } /******************************************************************************/ -/* Modified heap for manipulating our UV value / IV count pairs */ +/* In-order lists for manipulating our UV value / IV count pairs */ /******************************************************************************/ -/* - * This is a heap augmented with a small array. We store values and signed - * counts, where all counts for the same value are summed. An easy way to - * do this in Perl/Python is a hash. In plain C, I don't believe this is the - * best solution. A heap can be implemented both easily and very efficiently - * (using a linear array), and as we pull items off the heap we can combine - * all similar values. - * - * Below some threshold value ('small_limit') the items become dense. That is, - * not only are the values small but we have many items in that range. Hence - * the small array augmentation. All values below the threshold are just put - * directly into an array. This not only handles them a little faster but - * helps reduce the heap size a bit, as we don't put any repeated values in - * the heap for the small items. Since they're dense in this range, we can - * do a linear scan to find the next non-zero count. - * - * An ideal data structure for our purpose would coalesce values on insertion, - * and would allow operating in place (so we could retrieve all our items and - * add new items as we go, without them appearing on this scan). The former - * is possible using an ordered list or a balanced tree. I don't know how we - * would achieve the latter. The point being that we're pulling items off of - * h1 and adding it (plus possibly a new item) to h2, so ideally we'd manage - * to use that space freed up by h1. - */ - typedef struct { UV v; IV c; } vc_t; typedef struct { - UV small_limit; /* small count array: size */ - UV small_N; /* small count array: number of non-zero elements */ - int small_ptr; /* small count array: index of largest non-zero value */ - UV array_size; /* heap: allocated size in elements */ - UV N; /* heap: number of elements */ - IV* small_array; /* small count array data */ - vc_t* array; /* heap data */ -} heap_t; - -static void heap_insert(heap_t* h, UV val, IV count) -{ - UV n; vc_t* a; - if (val < h->small_limit) { - IV* saptr = h->small_array + val; - if (*saptr == 0) - ++(h->small_N); - *saptr += count; - if (*saptr == 0) - --(h->small_N); - else if (h->small_ptr < (int)val) - h->small_ptr = (int) val; - return; - } - n = ++(h->N); - a = h->array; - if (n >= h->array_size) { - UV new_size; - if (h->array_size == 0) { - new_size = (h->small_limit <= (20000/2)) ? 20000 : 2*h->small_limit; - if (verbose>2) printf("ALLOCing heap, size %lu\n", new_size); - New(0, h->array, new_size, vc_t); - } else { - new_size = (UV) (1.5 * h->array_size); - if (verbose>2) printf("REALLOCing heap %p, new size %lu\n", h->array, new_size); - Renew( h->array, new_size, vc_t ); - } - if (h->array == 0) croak("could not allocate heap\n"); - a = h->array; - h->array_size = new_size-1; - a[0].v = UV_MAX; a[0].c = 0; - } - while (a[n/2].v <= val) { /* upheap */ - a[n] = a[n/2]; - n /= 2; - } - a[n].v = val; - a[n].c = count; + UV size; + UV n; +} vcarray_t; + +static vcarray_t vcarray_create(void) +{ + vcarray_t l; + l.a = 0; + l.size = 0; + l.n = 0; + return l; } -static void heap_remove(heap_t* h, UV* val, IV* count) +static void vcarray_destroy(vcarray_t* l) { - if (h->N == 0) { - /* Search small_array for a non-zero count from small_ptr down */ - IV* saptr = h->small_array + h->small_ptr; - if (h->small_N == 0) croak("remove from empty heap\n"); - while (!*saptr) - saptr--; - if (saptr < h->small_array) croak("walked off small array\n"); - *val = (saptr - h->small_array); - *count = *saptr; - *saptr = 0; - h->small_ptr = *val-1; - --(h->small_N); - return; - } else { - vc_t* a = h->array; - UV ival, k, n = h->N; - *val = a[1].v; - *count = 0; - do { - *count += a[1].c; - /* remove top element */ - ival = a[n--].v; - k = 1; - while (k <= n/2) { - UV j = k+k; - if (j < n && a[j].v < a[j+1].v) j++; - if (ival >= a[j].v) break; - a[k] = a[j]; - k = j; - } - a[k] = a[n+1]; - } while (n > 0 && a[1].v == *val); - h->N = n; + if (l->a != 0) { + if (verbose > 2) printf("FREE list %p\n", l->a); + Safefree(l->a); } + l->size = 0; + l->n = 0; } -static heap_t heap_create(UV small_size) +static void vcarray_insert(vcarray_t* l, UV val, IV count) { - heap_t h; - h.array = 0; - h.array_size = 0; - h.small_limit = small_size; - h.small_array = 0; - if (small_size > 0) { - if (verbose>1)printf("creating small array of size %lu\n", small_size); - Newz(0, h.small_array, small_size, IV); + UV n = l->n; + if (n > 0 && l->a[n-1].v <= val) { + if (l->a[n-1].v < val) + croak("Previous value was %lu, inserting %lu out of order\n", l->a[n-1].v, val); + l->a[n-1].c += count; + return; } - h.N = 0; - h.small_N = 0; - h.small_ptr = -1; - return h; + if (n >= l->size) { + UV new_size; + if (l->size == 0) { + new_size = 20000; + if (verbose>2) printf("ALLOCing list, size %lu\n", new_size); + New(0, l->a, new_size, vc_t); + } else { + new_size = (UV) (1.5 * l->size); + if (verbose>2) printf("REALLOCing list %p, new size %lu\n",l->a,new_size); + Renew( l->a, new_size, vc_t ); + } + if (l->a == 0) croak("could not allocate list\n"); + l->size = new_size; + } + /* printf(" inserting %lu %ld\n", val, count); */ + l->a[n].v = val; + l->a[n].c = count; + l->n++; } -static void heap_destroy(heap_t* h) + +static vcarray_t vcarray_merge(vcarray_t* a, vcarray_t* b) { - if (h->array != 0) { - if (verbose > 2) printf("FREE heap %p\n", h->array); - Safefree(h->array); + UV ai, bi, an, bn, k, mn; + vcarray_t m = vcarray_create(); + + /* printf("going to merge %lu and %lu elements\n", a->n, b->n); */ + an = a->n; bn = b->n; mn = an+bn; + New(0, m.a, mn, vc_t); + m.size = mn; + /* merge */ + ai = 0; bi = 0; + vc_t* aa = a->a; + vc_t* ba = b->a; + for (k = 0; k < mn; k++) { + if (ai >= an) { + if (bi >= bn) croak("ran out of data during merge"); + while (k < mn) + m.a[k++] = ba[bi++]; + break; + } else if (bi >= bn) { + while (k < mn) + m.a[k++] = aa[ai++]; + break; + } else if (aa[ai].v > ba[bi].v) { + m.a[k] = aa[ai++]; + } else if (ba[bi].v > aa[ai].v) { + m.a[k] = ba[bi++]; + } else { + m.a[k] = aa[ai]; + m.a[k].c += ba[bi].c; + ai++; bi++; mn--; + } } - h->array = 0; - h->array_size = 0; - h->N = 0; - if (h->small_array != 0) - Safefree(h->small_array); - h->small_array = 0; - h->small_N = 0; - h->small_ptr = -1; + m.n = k; + /* printf("done with merge, have %lu elements\n", m.n); */ + return m; } -#define heap_not_empty(h) ((h).N > 0 || (h).small_N > 0) - -/******************************************************************************/ - /* - * The main phi(x,a) algorithm. In this implementation, it takes about 15% - * of the total time for the Lehmer algorithm, but it is by far the most - * memory consuming part. + * The main phi(x,a) algorithm. In this implementation, it takes under 10% + * of the total time for the Lehmer algorithm, but it is the memory + * constraining part. + * + * TODO: Expand a1, then merge a2 directly into it. Should save memory. */ static UV phi(UV x, UV a) { - heap_t h1, h2; - UV val, smallv; + UV i, val, sval; UV sum = 0; IV count; const UV* primes; @@ -446,48 +390,54 @@ static UV phi(UV x, UV a) croak("Could not generate primes for phi(%lu,%lu)\n", x, a); if (x < primes[a+1]) { Safefree(primes); return (x > 0) ? 1 : 0; } - /* This is a hack, trying to guess at a value where the array gets dense */ - smallv = a * 1000; - if (smallv > x / primes[a] / 5) - smallv = x / primes[a] / 5; - - h1 = heap_create(smallv); - h2 = heap_create(smallv); - heap_insert(&h1, x, 1); + vcarray_t a1 = vcarray_create(); + vcarray_t a2 = vcarray_create(); + vcarray_insert(&a1, x, 1); while (a > 7) { UV primea = primes[a]; - if (heap_not_empty(h2)) croak("h2 heap isn't empty."); - while ( heap_not_empty(h1) ) { - UV sval; - heap_remove(&h1, &val, &count); + for (i = 0; i < a1.n; i++) { + val = a1.a[i].v; + count = a1.a[i].c; if (count == 0) continue; - heap_insert(&h2, val, count); sval = val / primea; if (sval >= primea) { - heap_insert(&h2, sval, -count); + vcarray_insert(&a2, sval, -count); } else { sum -= count; } } - { heap_t t = h1; h1 = h2; h2 = t; } - /* printf("a = %lu heap %lu/%lu + %lu\n", a, h1.small_N, h1.small_limit, h1.N); */ + { /* Merge a1 and a2 into a3 */ + vcarray_t m = vcarray_merge(&a1, &a2); + /* printf("a1 size %lu a2 size %lu m size %lu\n",a1.n,a2.n,m.n); */ + /* Empty a2 */ + a2.n = 0; + /* erase a1 and replace with m */ + vcarray_destroy(&a1); + a1.a = m.a; + a1.n = m.n; + a1.size = m.size; + } a--; } - heap_destroy(&h2); + vcarray_destroy(&a2); if (a != 7) croak("final loop is set for a=7, a = %lu\n", a); - while ( heap_not_empty(h1) ) { - heap_remove(&h1, &val, &count); + for (i = 0; i < a1.n; i++) { + val = a1.a[i].v; + count = a1.a[i].c; if (count != 0) sum += count * mapes7(val); } - heap_destroy(&h1); + vcarray_destroy(&a1); Safefree(primes); return (UV) sum; } + + + /* Legendre's method. Interesting and a good test for phi(x,a), but Lehmer's * method is much faster (Legendre: a = pi(n^.5), Lehmer: a = pi(n^.25)) */ UV _XS_legendre_pi(UV n) @@ -626,6 +576,7 @@ UV _XS_lehmer_pi(UV n) return sum; } + UV _XS_LMO_pi(UV n) { UV a, b, sum, i, lastprime, lastpc, lastw, lastwpc; @@ -679,7 +630,7 @@ UV _XS_LMO_pi(UV n) for (j = isquared; j <= n13; j += isquared) mu[j] = 0; } - //for (i = 0; i <= n13; i++) { printf("mu %lu %ld\n", i, (IV)mu[i]); } + /* for (i = 0; i <= n13; i++) { printf("mu %lu %ld\n", i, (IV)mu[i]); } */ TIMING_END_PRINT("mu") if (verbose > 0) printf("LMO %lu stage 4: calculate S1 (%lu)\n", n, n13); -- Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/pkg-perl/packages/libmath-prime-util-perl.git _______________________________________________ Pkg-perl-cvs-commits mailing list Pkg-perl-cvs-commits@lists.alioth.debian.org http://lists.alioth.debian.org/cgi-bin/mailman/listinfo/pkg-perl-cvs-commits