This is an automated email from the git hooks/post-receive script. ppm-guest pushed a commit to annotated tag v0.23 in repository libmath-prime-util-perl.
commit 0901cc7d0f2c9aca31ebd7f6c34c30ef017b524d Author: Dana Jacobsen <d...@acm.org> Date: Mon Mar 4 18:18:37 2013 -0800 In-place phi(x,a) merge --- Changes | 7 +++- TODO | 4 +- lehmer.c | 125 ++++++++++++++++++++++++++++++++++++++------------------------- sieve.c | 53 ++++++++++++++------------- 4 files changed, 112 insertions(+), 77 deletions(-) diff --git a/Changes b/Changes index a0b69db..c067d11 100644 --- a/Changes +++ b/Changes @@ -23,13 +23,18 @@ Revision history for Perl extension Math::Prime::Util. - Start on Lagarias-Miller-Odlyzko prime count. + - A new data structure for the phi(x,a) function used by all the fast + prime count routines. Quite a bit faster and most importantly, uses + half the memory of the old structure. + - Performance: - Divisor sum with no sub is ~10x faster. - Speed up PP version of exp_mangoldt, create XS version. - Zeta much faster as mentioned above. - faster nth_prime as mentioned above. - AKS about 10% faster. - - New data structure for phi(x,a) means even faster prime_count. + - Unroll a little more in sieve inner loop. A couple percent faster. + - Faster prime_count and nth_prime due to new phi(x,a) (about 1.25x). 0.22 26 February 2013 diff --git a/TODO b/TODO index 65c9f1a..9a6844d 100644 --- a/TODO +++ b/TODO @@ -48,6 +48,6 @@ that (1) handles any low / high, and (2) segments. Take segment_primes from XS.xs to do this. -- Do in-place merge in new phi(x,a). - - Implement S2 calculation for LMO prime count. + +- Balance memory used in phi with memory for binary search pc in lehmer. diff --git a/lehmer.c b/lehmer.c index 65a7bd2..cad15ea 100644 --- a/lehmer.c +++ b/lehmer.c @@ -31,14 +31,14 @@ * with large values of x. * * Using my sieve code with everything running in serial, calculating pi(10^12) - * is done undef 1 second on my computer. pi(10^14) takes under 30 seconds, + * is done under 1 second on my computer. pi(10^14) takes under 30 seconds, * pi(10^16) in under 20 minutes. Compared with Thomas R. Nicely's pix4 - * program, his one is 4-6x faster and uses 2-3x less memory. When compiled + * program, his one is 4-6x faster and uses 2-4x less memory. When compiled * with parallel primesieve it is another 2x or more faster: * pix4(10^16) takes 124 minutes, this code + primesieve takes 4 minutes. * - * Timings with Perl + MPU. Using the standalone program with primesieve - * speeds up stage 4 a lot for large values. + * Timings with Perl + MPU with all-serial computation. Using the standalone + * program with parallel primesieve speeds up stage 4 a lot for large values. * * n phi(x,a) mem/time | stage 4 mem/time | total time * 10^17 3000MB 275.29 | 2300MB 9911.9 | 179m 37.5s @@ -299,7 +299,14 @@ static void vcarray_destroy(vcarray_t* l) l->size = 0; l->n = 0; } -static void vcarray_insert(vcarray_t* l, UV val, IV count) +/* Insert a value/count pair. We do this indirection because about 80% of + * the calls result in a merge with the previous entry. */ +#define vcarray_insert(arr, val, count) \ + if (arr.n > 0 && arr.a[arr.n-1].v == val) \ + arr.a[arr.n-1].c += count; \ + else \ + vcarray_insert_func(&arr, val, count); +static void vcarray_insert_func(vcarray_t* l, UV val, IV count) { UV n = l->n; if (n > 0 && l->a[n-1].v <= val) { @@ -328,51 +335,80 @@ static void vcarray_insert(vcarray_t* l, UV val, IV count) l->n++; } -static vcarray_t vcarray_merge(vcarray_t* a, vcarray_t* b) +/* Merge the two sorted lists A and B into A. Each list has no duplicates, + * but they may have duplications between the two. We're quite interested + * in saving memory, so first remove all the duplicates, then do an in-place + * merge. */ +static void vcarray_merge(vcarray_t* a, vcarray_t* b) { - UV ai, bi, an, bn, k, mn; - vcarray_t m = vcarray_create(); - - /* printf("going to merge %lu and %lu elements\n", a->n, b->n); */ - an = a->n; bn = b->n; mn = an+bn; - New(0, m.a, mn, vc_t); - m.size = mn; - /* merge */ - ai = 0; bi = 0; + long ai, bi, bj, k, kn; + UV an = a->n; + UV bn = b->n; vc_t* aa = a->a; vc_t* ba = b->a; - for (k = 0; k < mn; k++) { + + /* Merge anything in B that appears in A. */ + for (ai = 0, bi = 0, bj = 0; bi < bn; bi++) { + /* Skip forward in A until empty or aa[ai].v <= ba[bi].v */ + UV bval = ba[bi].v; + while (ai < an && aa[ai].v > bval) + ai++; + /* if A empty then copy the remaining elements */ if (ai >= an) { - if (bi >= bn) croak("ran out of data during merge"); - while (k < mn) - m.a[k++] = ba[bi++]; + if (bi == bj) + bj = bn; + else + while (bi < bn) + ba[bj++] = ba[bi++]; break; - } else if (bi >= bn) { - while (k < mn) - m.a[k++] = aa[ai++]; + } + if (aa[ai].v == bval) + aa[ai].c += ba[bi].c; + else + ba[bj++] = ba[bi]; + } + if (verbose>2) printf(" removed %lu duplicates from b\n", bn - bj); + bn = bj; + + if (bn == 0) { /* In case they were all duplicates */ + b->n = 0; + return; + } + + /* kn = the final merged size. All duplicates are gone, so this is exact. */ + kn = an+bn; + if (a->size < kn) { /* Make A big enough to hold kn elements */ + UV new_size = (UV) (1.2 * kn); + if (verbose>2) printf("REALLOCing list %p, new size %lu\n", a->a, new_size); + Renew( a->a, new_size, vc_t ); + aa = a->a; /* this could have been changed by the realloc */ + a->size = new_size; + } + + /* merge A and B. Very simple using reverse merge. */ + ai = an-1; + bi = bn-1; + for (k = kn-1; k >= 0; k--) { + if (ai < 0) { /* A is exhausted, just filling in B */ + if (bi < 0) croak("ran out of data during merge"); + aa[k] = ba[bi--]; + } else if (bi < 0) { /* We've caught up with A */ break; - } else if (aa[ai].v > ba[bi].v) { - m.a[k] = aa[ai++]; - } else if (ba[bi].v > aa[ai].v) { - m.a[k] = ba[bi++]; + } else if (aa[ai].v < ba[bi].v) { + aa[k] = aa[ai--]; } else { - m.a[k] = aa[ai]; - m.a[k].c += ba[bi].c; - ai++; bi++; mn--; + if (aa[ai].v == ba[bi].v) croak("deduplication error"); + aa[k] = ba[bi--]; } } - m.n = k; - /* printf("done with merge, have %lu elements\n", m.n); */ - return m; + a->n = kn; /* A now has this many items */ + b->n = 0; /* B is marked empty */ } /* * The main phi(x,a) algorithm. In this implementation, it takes under 10% - * of the total time for the Lehmer algorithm, but it is the memory - * constraining part. - * - * TODO: Expand a1, then merge a2 directly into it. Should save memory. + * of the total time for the Lehmer algorithm, but is a big memory consumer. */ static UV phi(UV x, UV a) @@ -392,7 +428,7 @@ static UV phi(UV x, UV a) vcarray_t a1 = vcarray_create(); vcarray_t a2 = vcarray_create(); - vcarray_insert(&a1, x, 1); + vcarray_insert(a1, x, 1); while (a > 7) { UV primea = primes[a]; @@ -403,22 +439,13 @@ static UV phi(UV x, UV a) continue; sval = val / primea; if (sval >= primea) { - vcarray_insert(&a2, sval, -count); + vcarray_insert(a2, sval, -count); } else { sum -= count; } } - { /* Merge a1 and a2 into a3 */ - vcarray_t m = vcarray_merge(&a1, &a2); - /* printf("a1 size %lu a2 size %lu m size %lu\n",a1.n,a2.n,m.n); */ - /* Empty a2 */ - a2.n = 0; - /* erase a1 and replace with m */ - vcarray_destroy(&a1); - a1.a = m.a; - a1.n = m.n; - a1.size = m.size; - } + /* Merge a1 and a2 into a1. a2 will be emptied. */ + vcarray_merge(&a1, &a2); a--; } vcarray_destroy(&a2); diff --git a/sieve.c b/sieve.c index 16d22fc..618fd21 100644 --- a/sieve.c +++ b/sieve.c @@ -94,6 +94,30 @@ static const unsigned char presieve13[PRESIEVE_SIZE] = 0x18,0x89,0x08,0x25,0x44,0x22,0x30,0x14,0xc3,0x88,0x86,0x40,0x1a, 0x28,0x30,0x85,0x09,0x54,0x60,0x43,0x24,0x92,0x81,0x08,0x04,0x70}; +#define FIND_COMPOSITE_POS(i,j) \ + { \ + UV dlast = d; \ + do { \ + d += dinc; \ + m += minc; \ + if (m >= 30) { d++; m -= 30; } \ + } while ( masktab30[m] == 0 ); \ + wdinc[i] = d - dlast; \ + wmask[j] = masktab30[m]; \ + } +#define FIND_COMPOSITE_POSITIONS(p) \ + do { \ + FIND_COMPOSITE_POS(0,1) \ + FIND_COMPOSITE_POS(1,2) \ + FIND_COMPOSITE_POS(2,3) \ + FIND_COMPOSITE_POS(3,4) \ + FIND_COMPOSITE_POS(4,5) \ + FIND_COMPOSITE_POS(5,6) \ + FIND_COMPOSITE_POS(6,7) \ + FIND_COMPOSITE_POS(7,0) \ + d -= p; \ + } while (0) + static void sieve_prefill(unsigned char* mem, UV startd, UV endd) { UV nbytes = endd - startd + 1; @@ -145,20 +169,9 @@ unsigned char* sieve_erat30(UV end) UV minc = (2*prime) - dinc*30; UV wdinc[8]; unsigned char wmask[8]; - int i; /* Find the positions of the next composites we will mark */ - for (i = 1; i <= 8; i++) { - UV dlast = d; - do { - d += dinc; - m += minc; - if (m >= 30) { d++; m -= 30; } - } while ( masktab30[m] == 0 ); - wdinc[i-1] = d - dlast; - wmask[i%8] = masktab30[m]; - } - d -= prime; + FIND_COMPOSITE_POSITIONS(prime); #if 0 assert(d == ((prime*prime)/30)); assert(d < max_buf); @@ -199,7 +212,8 @@ int sieve_segment(unsigned char* mem, UV startd, UV endd) /* Fill buffer with marked 7, 11, and 13 */ sieve_prefill(mem, startd, endd); - limit = isqrt(endp) + 1; + limit = isqrt(endp); + if (limit*limit < endp) limit++; /* ceil(sqrt(endp)) */ /* printf("segment sieve from %"UVuf" to %"UVuf" (aux sieve to %"UVuf")\n", startp, endp, limit); */ pcsize = get_prime_cache(limit, &sieve); if (pcsize < limit) { @@ -236,20 +250,9 @@ int sieve_segment(unsigned char* mem, UV startd, UV endd) UV wdinc[8]; unsigned char wmask[8]; UV offset_endd = endd - startd; - int i; /* Find the positions of the next composites we will mark */ - for (i = 1; i <= 8; i++) { - UV dlast = d; - do { - d += dinc; - m += minc; - if (m >= 30) { d++; m -= 30; } - } while ( masktab30[m] == 0 ); - wdinc[i-1] = d - dlast; - wmask[i%8] = masktab30[m]; - } - d -= p; + FIND_COMPOSITE_POSITIONS(p); d -= startd; #if 0 i = 0; /* Mark the composites */ -- Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/pkg-perl/packages/libmath-prime-util-perl.git _______________________________________________ Pkg-perl-cvs-commits mailing list Pkg-perl-cvs-commits@lists.alioth.debian.org http://lists.alioth.debian.org/cgi-bin/mailman/listinfo/pkg-perl-cvs-commits