On Mon, 19 Apr 2004 12:00:10 -0400, Tom Lane <[EMAIL PROTECTED]> wrote: >A possible compromise is to limit the number of pages sampled to >something a bit larger than n, perhaps 2n or 3n. I don't have a feeling >for the shape of the different-pages probability function; would this >make a significant difference, or would it just waste cycles?

## Advertising

I would have replied earlier, if I had a good answer. What I have so far contains at least one, probably two flaws. Knowing not much more than the four basic arithmetic operations I was not able to improve my model. So I post what I have: As usual we assume a constant number c of tuples per page. If we have a table of size B pages and want to collect a sample of n tuples, the number of possible samples is (again in OOo syntax) left( binom{cB}{n} right) If we select an arbitrary page, the number of possible samples that do NOT contain any tuple from this page is left( binom {c (B-1)} {n} right) Let's forget about our actual implementations of sampling methods and pretend we have a perfect random sampling method. So the probability Pnot(c, B, n) that a certain page is not represented in a random sample is left( binom {c (B-1)} {n} right) over left( binom{cB}{n} right) which can be transformed into the more computing-friendly form prod from{i=0} to{n-1} {{cB-c - i} over {cB - i}} Clearly the probability that a certain page *is* represented in a sample is Pyes(c, B, n) = 1 - Pnot(c, B, n) The next step assumes that these probabilities are independent for different pages, which in reality they are not. We simply estimate the number of pages represented in a random sample as numPag(c, B, n) = B * Pyes(c, B, n) Here are some results for n = 3000: B \ c-> 10 | 100 | 200 -------+-------+-------+------- 100 | --- | 100 | 100 1000 | 972 | 953 | 951 2000 | 1606 | 1559 | 1556 3000 | 1954 | 1902 | 1899 6000 | 2408 | 2366 | 2363 9000 | 2588 | 2555 | 2553 20000 | 2805 | 2788 | 2787 30000 | 2869 | 2856 | 2856 100000 | 2960 | 2956 | 2956 This doesn't look to depend heavily on the number of tuples per page, which sort of justifies the assumption that c is constant. In the next step I tried to estimate the number of pages that contain exactly 1, 2, ... tuples of the sample. 
My naive procedure works as follows (I'm not sure whether it is even valid as a rough approximation; constructive criticism is very welcome): For c=100, B=3000, n=3000 we expect 1902 pages to contain at least 1 tuple of the sample. There are 1098 more tuples than pages; these tuples lie somewhere in those 1902 pages from the first step. numPag(99, 1902, 1098) = 836 pages contain at least a second tuple. So the number of pages containing exactly 1 tuple is 1902 - 836 = 1066. Repeating these steps we get 611 pages with 2 tuples, 192 with 3, 30 with 4, and 3 pages with 5 tuples. Here are some more numbers for c = 100 and n = 3000: B | pages with 1, 2, ... tuples -------+-------------------------------------------------------- 100 | 1 to 24 tuples: 0, then 1, 2, 4, 10, 18, 26, 24, 11, 4 1000 | 108, 201, 268, 229, 113, 29, 5 2000 | 616, 555, 292, 83, 12, 1 3000 | 1066, 611, 192, 30, 3 6000 | 1809, 484, 68, 5 9000 | 2146, 374, 32, 2 20000 | 2584, 196, 8 30000 | 2716, 138, 3 100000 | 2912, 44 A small C program to experimentally confirm or refute these calculations is attached. Its results are fairly compatible with the above numbers, IMHO. Servus Manfred

/*
** samsim.c - sampling simulator
**
** Draws a random sample of n tuples from a table of B pages holding
** c tuples each (Knuth's Algorithm S) and reports, per hit count i,
** how many pages contributed exactly i tuples to the sample.
*/
#include <stdio.h>
#include <stdlib.h>
#include <sys/time.h>
#include <unistd.h>

typedef int bool;

#define MAX_RANDOM_VALUE (0x7FFFFFFF)

/* Seed the PRNG from the time of day. */
static void
initrandom(void)
{
	struct timeval tv;

	gettimeofday(&tv, NULL);
	srandom(tv.tv_sec ^ tv.tv_usec);
}/*initrandom*/

/* Select a random value R uniformly distributed in 0 < R < 1 */
static double
random_fract(void)
{
	long		z;

	/* random() can produce endpoint values, try again if so */
	do
	{
		z = random();
	} while (z <= 0 || z >= MAX_RANDOM_VALUE);
	return (double) z / (double) MAX_RANDOM_VALUE;
}

/*
** data structure for (modified) Algorithm S from Knuth 3.4.2
*/
typedef struct
{
	long		N;			/* number of tuples, known in advance */
	int			n;			/* sample size */
	long		t;			/* current tuple number */
	int			m;			/* tuples selected so far */
} SamplerData;
typedef SamplerData *Sampler;

static void Sampler_Init(Sampler bs, long N, int samplesize);
static bool Sampler_HasMore(Sampler bs);
static long Sampler_Next(Sampler bs);

/*
** Sampler_Init -- prepare for random sampling
*/
static void
Sampler_Init(Sampler bs, long N, int samplesize)
{
	bs->N = N;				/* table size */
	bs->n = samplesize;
	bs->t = 0;				/* tuples scanned so far */
	bs->m = 0;				/* tuples selected so far */
}

/* True while the sample is incomplete and unscanned tuples remain. */
static bool
Sampler_HasMore(Sampler bs)
{
	return (bs->t < bs->N) && (bs->m < bs->n);
}

/*
** Sampler_Next -- return the (0-based) number of the next sampled tuple.
**
** Skips over non-selected tuples; each remaining tuple is taken with
** probability k/K (tuples still needed over tuples remaining), which
** yields a uniform random sample.
*/
static long
Sampler_Next(Sampler bs)
{
	long		K = bs->N - bs->t;	/* remaining tuples */
	int			k = bs->n - bs->m;	/* tuples still to sample */
	double		p;			/* probability to skip the next tuple */
	double		V;			/* random */

	/* Assert(Sampler_HasMore(bs)); */
	if (k >= K)
	{
		/* need all the rest */
		bs->t += 1;
		bs->m += 1;
		return bs->t - 1;
	}

	p = 1.0 - (double) k / (double) K;
	V = random_fract();
	while (V < p)
	{
		bs->t += 1;
		K -= 1;
		/*
		** Assert(K > 0)
		** because we started with K > k > 0,
		** and when K == k, the loop terminates
		*/
		p *= 1.0 - (double) k / (double) K;
	}

	/* select */
	bs->t += 1;
	bs->m += 1;
	return bs->t - 1;
}

/* Print command-line usage to stderr. */
static void
usage(void)
{
	/* bugfix: the usage string was broken in two by mail line-wrapping */
	fprintf(stderr, "usage: samsim c B n\n"
			"where c = tuples/page\n"
			"      B = # of pages\n"
			"      n = sample size\n");
}/*usage*/

/* Print stat[1] .. stat[cnt], tab separated, on one line. */
static void
dumpstats(long *stat, long cnt)
{
	long		i;

	for (i = 1; i <= cnt; ++i)
	{
		printf("%ld%c", stat[i], (i < cnt) ? '\t' : '\n');
	}/*for*/
}/*dumpstats*/

/*
** samsim -- run one simulation.
**
** Samples n of the c * B tuples; because Sampler_Next returns tuple
** numbers in increasing order, tuples of the same page arrive as one
** consecutive run, so a simple run-length count per page suffices.
** Returns 0 on success, nonzero on error.
*/
static int
samsim(long c, long B, long n)
{
	SamplerData s;
	long		oldblock = -1;	/* page of the previous sampled tuple */
	long		blockhits = 0;	/* sampled tuples seen on that page */
	long		maxhits = -1;	/* largest per-page hit count recorded */
	long	   *stat = calloc(c + 1, sizeof(long));	/* stat[i]: pages with i hits */

	if (stat == NULL)
	{
		fprintf(stderr, "cannot allocate %ld numbers\n", c);
		return 2;
	}/*if*/

	Sampler_Init(&s, c * B, n);
	if (!Sampler_HasMore(&s))
	{
		fprintf(stderr, "empty sample\n");
		free(stat);				/* bugfix: don't leak stat on this error path */
		return 3;
	}/*if*/

	while (Sampler_HasMore(&s))
	{
		long		t = Sampler_Next(&s);
		long		blocknr = t / c;

		if (blocknr == oldblock)
		{
			++blockhits;
		}
		else
		{
			if (oldblock >= 0)
			{
				stat[blockhits] += 1;
				if (blockhits > maxhits)
					maxhits = blockhits;
			}/*if*/
			oldblock = blocknr;
			blockhits = 1;
		}/*else*/
	}/*while*/

	/*
	** Account for the last page.  Bugfix: maxhits must be updated here
	** too, otherwise dumpstats() omits the final page's hit count when
	** it is the new maximum -- and prints nothing at all if the whole
	** sample fell into a single page (maxhits would still be -1).
	*/
	stat[blockhits] += 1;
	if (blockhits > maxhits)
		maxhits = blockhits;

	dumpstats(stat, maxhits);
	free(stat);
	return 0;					/* success */
}/*samsim*/

int
main(int argc, char *argv[])
{
	long		c, B, n;

	if (argc != 4)
	{
		usage();
		exit(1);
	}/*if*/
	c = atol(argv[1]);
	B = atol(argv[2]);
	n = atol(argv[3]);
	if ((c <= 0) || (B <= 0) || (n <= 0))
	{
		usage();
		exit(1);
	}/*if*/
	initrandom();
	return samsim(c, B, n);
}/*main*/

---------------------------(end of broadcast)--------------------------- TIP 5: Have you checked our extensive FAQ? http://www.postgresql.org/docs/faqs/FAQ.html