This patch implements the new tuple sampling method as discussed on -hackers and -performance a few weeks ago.

## Advertising

"Large DB" -hackers 2004-04-02 "query slows down with more accurate stats" -perform 2004-04-13 "Tuple sampling" -hackers 2004-04-19 Servus Manfred

diff -rcN ../base/src/backend/commands/analyze.c src/backend/commands/analyze.c *** ../base/src/backend/commands/analyze.c Sun May 9 12:05:51 2004 --- src/backend/commands/analyze.c Sun May 9 19:33:27 2004 *************** *** 38,43 **** --- 38,66 ---- #include "utils/syscache.h" #include "utils/tuplesort.h" + /* + * With two-stage sampling we process at most 300000 blocks, each of + * which contains less than 1200 tuples (at 32K block size, smallest + * possible tuple size). So the number of tuples processed can be + * stored in a 32 bit int. + * Change this datatype to double if the number of rows in the table + * might exceed INT_MAX. Then the algorithm should work as long as the + * row count does not become so large that it is not represented accurately + * in a double (on IEEE-math machines this would be around 2^52 rows). + */ + typedef int TupleCount; + + /* + ** data structure for Algorithm S from Knuth 3.4.2 + */ + typedef struct + { + BlockNumber N; /* number of blocks, known in advance */ + int n; /* sample size */ + BlockNumber t; /* current block number */ + int m; /* blocks selected so far */ + } BlockSamplerData; + typedef BlockSamplerData *BlockSampler; /* Per-index data for ANALYZE */ typedef struct AnlIndexData *************** *** 57,62 **** --- 80,89 ---- static MemoryContext anl_context = NULL; + static void BlockSampler_Init(BlockSampler bs, BlockNumber nblocks, int samplesize); + static bool BlockSampler_HasMore(BlockSampler bs); + static BlockNumber BlockSampler_Next(BlockSampler bs); + static void compute_index_stats(Relation onerel, double totalrows, AnlIndexData *indexdata, int nindexes, HeapTuple *rows, int numrows, *************** *** 66,72 **** int targrows, double *totalrows); static double random_fract(void); static double init_selection_state(int n); ! static double select_next_random_record(double t, int n, double *stateptr); static int compare_rows(const void *a, const void *b); static void update_attstats(Oid relid, int natts, VacAttrStats **vacattrstats); static Datum std_fetch_func(VacAttrStatsP stats, int rownum, bool *isNull); --- 93,99 ---- int targrows, double *totalrows); static double random_fract(void); static double init_selection_state(int n); ! static TupleCount get_next_S(TupleCount t, int n, double *stateptr); static int compare_rows(const void *a, const void *b); static void update_attstats(Oid relid, int natts, VacAttrStats **vacattrstats); static Datum std_fetch_func(VacAttrStatsP stats, int rownum, bool *isNull); *************** *** 638,649 **** } /* * acquire_sample_rows -- acquire a random sample of rows from the table * ! * Up to targrows rows are collected (if there are fewer than that many ! * rows in the table, all rows are collected). When the table is larger ! * than targrows, a truly random sample is collected: every row has an ! * equal chance of ending up in the final sample. * * We also estimate the total number of rows in the table, and return that * into *totalrows. --- 665,754 ---- } /* + ** BlockSampler_Init -- prepare for random sampling of blocknumbers + ** + ** BlockSampler is used for stage one of our new two-stage tuple + ** sampling mechanism as discussed on -hackers 2004-04-02 (subject + ** "Large DB"). It selects a random sample of samplesize blocks out of + ** the nblocks blocks in the table. If the table has less than + ** samplesize blocks, all blocks are selected. + ** + */ + static void + BlockSampler_Init(BlockSampler bs, BlockNumber nblocks, int samplesize) + { + bs->N = nblocks; /* table size */ + /* + ** If we decide to reduce samplesize for tables that have less or + ** not much more than samplesize blocks, here is the place to do + ** it. + */ + bs->n = samplesize; + bs->t = 0; /* blocks scanned so far */ + bs->m = 0; /* blocks selected so far */ + } + + static bool + BlockSampler_HasMore(BlockSampler bs) + { + return (bs->t < bs->N) && (bs->m < bs->n); + } + + static BlockNumber + BlockSampler_Next(BlockSampler bs) + { + BlockNumber K = bs->N - bs->t; /* remaining blocks */ + int k = bs->n - bs->m; /* blocks still to sample */ + double p; /* probability to skip the next block */ + double V; /* random */ + + Assert(BlockSampler_HasMore(bs)); + + if (k >= K) + { + /* need all the rest */ + bs->t += 1; + bs->m += 1; + return bs->t - 1; + } + + p = 1.0 - (double) k / (double) K; + V = random_fract(); + while (V < p) + { + bs->t += 1; + K -= 1; + /* + ** Assert(K > 0) + ** because we startet with K > k > 0, + ** and when K == k, the loop terminates + */ + p *= 1.0 - (double) k / (double) K; + } + + /* select */ + bs->t += 1; + bs->m += 1; + return bs->t - 1; + } + + /* * acquire_sample_rows -- acquire a random sample of rows from the table * ! * As of May 2004 we use a new two-stage method: Stage one selects up ! * to targrows random blocks (or all blocks, if there aren't so many). ! * Stage two scans these blocks and uses the Vitter algorithm to create ! * a random sample of targrows rows (or less, if there are less in the ! * sample of blocks). The two stages are executed simultaneously: Each ! * block is processed as soon as stage one returns its number and while ! * the rows are read stage two controls which ones are to be inserted ! * into the sample. ! * ! * Although every row has an equal chance of ending up in the final ! * sample, this sampling method is not perfect: not every possible ! * sample has an equal chance of being selected. For large relations ! * the number of different blocks represented by the sample tends to be ! * too small. We can live with that for now. Improvements are welcome. * * We also estimate the total number of rows in the table, and return that * into *totalrows. *************** *** 656,754 **** double *totalrows) { int numrows = 0; ! HeapScanDesc scan; BlockNumber totalblocks; ! HeapTuple tuple; ! ItemPointer lasttuple; ! BlockNumber lastblock, ! estblock; ! OffsetNumber lastoffset; ! int numest; ! double tuplesperpage; ! double t; double rstate; Assert(targrows > 1); ! /* ! * Do a simple linear scan until we reach the target number of rows. ! */ ! scan = heap_beginscan(onerel, SnapshotNow, 0, NULL); ! totalblocks = scan->rs_nblocks; /* grab current relation size */ ! while ((tuple = heap_getnext(scan, ForwardScanDirection)) != NULL) ! { ! rows[numrows++] = heap_copytuple(tuple); ! if (numrows >= targrows) ! break; ! vacuum_delay_point(); ! } ! heap_endscan(scan); /* ! * If we ran out of tuples then we're done, no matter how few we ! * collected. No sort is needed, since they're already in order. ! */ ! if (!HeapTupleIsValid(tuple)) ! { ! *totalrows = (double) numrows; ! ! ereport(elevel, ! (errmsg("\"%s\": %u pages, %d rows sampled, %.0f estimated total rows", ! RelationGetRelationName(onerel), ! totalblocks, numrows, *totalrows))); ! ! return numrows; ! } ! /* ! * Otherwise, start replacing tuples in the sample until we reach the ! * end of the relation. This algorithm is from Jeff Vitter's paper ! * (see full citation below). It works by repeatedly computing the ! * number of the next tuple we want to fetch, which will replace a ! * randomly chosen element of the reservoir (current set of tuples). ! * At all times the reservoir is a true random sample of the tuples ! * we've passed over so far, so when we fall off the end of the ! * relation we're done. ! * ! * A slight difficulty is that since we don't want to fetch tuples or ! * even pages that we skip over, it's not possible to fetch *exactly* ! * the N'th tuple at each step --- we don't know how many valid tuples ! * are on the skipped pages. We handle this by assuming that the ! * average number of valid tuples/page on the pages already scanned ! * over holds good for the rest of the relation as well; this lets us ! * estimate which page the next tuple should be on and its position in ! * the page. Then we fetch the first valid tuple at or after that ! * position, being careful not to use the same tuple twice. This ! * approach should still give a good random sample, although it's not ! * perfect. ! */ ! lasttuple = &(rows[numrows - 1]->t_self); ! lastblock = ItemPointerGetBlockNumber(lasttuple); ! lastoffset = ItemPointerGetOffsetNumber(lasttuple); ! ! /* ! * If possible, estimate tuples/page using only completely-scanned ! * pages. ! */ ! for (numest = numrows; numest > 0; numest--) ! { ! if (ItemPointerGetBlockNumber(&(rows[numest - 1]->t_self)) != lastblock) ! break; ! } ! if (numest == 0) ! { ! numest = numrows; /* don't have a full page? */ ! estblock = lastblock + 1; ! } ! else ! estblock = lastblock; ! tuplesperpage = (double) numest / (double) estblock; ! ! t = (double) numrows; /* t is the # of records processed so far */ rstate = init_selection_state(targrows); ! for (;;) { - double targpos; BlockNumber targblock; Buffer targbuffer; Page targpage; --- 761,788 ---- double *totalrows) { int numrows = 0; ! TupleCount liverows = 0; ! TupleCount deadrows = 0; ! TupleCount rowstoskip = -1; BlockNumber totalblocks; ! BlockSamplerData bs; double rstate; Assert(targrows > 1); ! totalblocks = RelationGetNumberOfBlocks(onerel); /* ! ** Prepare for sampling block numbers ! */ ! BlockSampler_Init(&bs, totalblocks, targrows); /* ! ** Prepare for sampling rows ! */ rstate = init_selection_state(targrows); ! ! while (BlockSampler_HasMore(&bs)) { BlockNumber targblock; Buffer targbuffer; Page targpage; *************** *** 757,783 **** vacuum_delay_point(); ! t = select_next_random_record(t, targrows, &rstate); ! /* Try to read the t'th record in the table */ ! targpos = t / tuplesperpage; ! targblock = (BlockNumber) targpos; ! targoffset = ((int) ((targpos - targblock) * tuplesperpage)) + ! FirstOffsetNumber; ! /* Make sure we are past the last selected record */ ! if (targblock <= lastblock) ! { ! targblock = lastblock; ! if (targoffset <= lastoffset) ! targoffset = lastoffset + 1; ! } ! /* Loop to find first valid record at or after given position */ ! pageloop:; ! ! /* ! * Have we fallen off the end of the relation? ! */ ! if (targblock >= totalblocks) ! break; /* * We must maintain a pin on the target page's buffer to ensure --- 791,797 ---- vacuum_delay_point(); ! targblock = BlockSampler_Next(&bs); /* * We must maintain a pin on the target page's buffer to ensure *************** *** 795,848 **** maxoffset = PageGetMaxOffsetNumber(targpage); LockBuffer(targbuffer, BUFFER_LOCK_UNLOCK); ! for (;;) { HeapTupleData targtuple; Buffer tupbuffer; - if (targoffset > maxoffset) - { - /* Fell off end of this page, try next */ - ReleaseBuffer(targbuffer); - targblock++; - targoffset = FirstOffsetNumber; - goto pageloop; - } ItemPointerSet(&targtuple.t_self, targblock, targoffset); if (heap_fetch(onerel, SnapshotNow, &targtuple, &tupbuffer, false, NULL)) { /* ! * Found a suitable tuple, so save it, replacing one old ! * tuple at random */ ! int k = (int) (targrows * random_fract()); - Assert(k >= 0 && k < targrows); - heap_freetuple(rows[k]); - rows[k] = heap_copytuple(&targtuple); /* this releases the second pin acquired by heap_fetch: */ ReleaseBuffer(tupbuffer); - /* this releases the initial pin: */ - ReleaseBuffer(targbuffer); - lastblock = targblock; - lastoffset = targoffset; - break; } ! /* this tuple is dead, so advance to next one on same page */ ! targoffset++; } } /* ! * Now we need to sort the collected tuples by position (itempointer). */ ! qsort((void *) rows, numrows, sizeof(HeapTuple), compare_rows); /* * Estimate total number of valid rows in relation. */ ! *totalrows = floor((double) totalblocks * tuplesperpage + 0.5); /* * Emit some interesting relation info --- 809,895 ---- maxoffset = PageGetMaxOffsetNumber(targpage); LockBuffer(targbuffer, BUFFER_LOCK_UNLOCK); ! for (targoffset = FirstOffsetNumber; targoffset <= maxoffset; ++targoffset) { HeapTupleData targtuple; Buffer tupbuffer; ItemPointerSet(&targtuple.t_self, targblock, targoffset); if (heap_fetch(onerel, SnapshotNow, &targtuple, &tupbuffer, false, NULL)) { + liverows += 1; /* ! * The first targrows rows are simply copied into the ! * reservoir. ! * Then we start replacing tuples in the sample until ! * we reach the end of the relation. This algorithm is ! * from Jeff Vitter's paper (see full citation below). ! * It works by repeatedly computing the number of the ! * next tuple we want to fetch, which will replace a ! * randomly chosen element of the reservoir (current ! * set of tuples). At all times the reservoir is a true ! * random sample of the tuples we've passed over so far, ! * so when we fall off the end of the relation we're done. */ ! if (numrows < targrows) ! rows[numrows++] = heap_copytuple(&targtuple); ! else ! { ! if (rowstoskip < 0) ! rowstoskip = get_next_S(liverows, targrows, &rstate); ! if (rowstoskip == 0) ! { ! /* ! * Found a suitable tuple, so save it, ! * replacing one old tuple at random ! */ ! int k = (int) (targrows * random_fract()); ! ! Assert(k >= 0 && k < targrows); ! heap_freetuple(rows[k]); ! rows[k] = heap_copytuple(&targtuple); ! } ! ! rowstoskip -= 1; ! } /* this releases the second pin acquired by heap_fetch: */ ReleaseBuffer(tupbuffer); } ! else ! { ! /* This information is currently not used. */ ! deadrows += 1; ! } } + + /* this releases the initial pin: */ + ReleaseBuffer(targbuffer); } + ereport(DEBUG2, + (errmsg("%d pages sampled, %.0f live rows and %.0f dead rows scanned", + bs.m, (double) liverows, (double) deadrows))); + /* ! * If we ran out of tuples then we're done. No sort is needed, ! * since they're already in order. ! * ! * Otherwise we need to sort the collected tuples by position ! * (itempointer). I don't care for corner cases where the tuples ! * are already sorted. */ ! if (numrows == targrows) ! qsort((void *) rows, numrows, sizeof(HeapTuple), compare_rows); /* * Estimate total number of valid rows in relation. */ ! if (bs.m > 0) ! *totalrows = floor((double) liverows * totalblocks / bs.m + 0.5); ! else ! *totalrows = 0.0; /* * Emit some interesting relation info *************** *** 872,894 **** /* * These two routines embody Algorithm Z from "Random sampling with a * reservoir" by Jeffrey S. Vitter, in ACM Trans. Math. Softw. 11, 1 ! * (Mar. 1985), Pages 37-57. While Vitter describes his algorithm in terms ! * of the count S of records to skip before processing another record, ! * it is convenient to work primarily with t, the index (counting from 1) ! * of the last record processed and next record to process. The only extra ! * state needed between calls is W, a random state variable. ! * ! * Note: the original algorithm defines t, S, numer, and denom as integers. ! * Here we express them as doubles to avoid overflow if the number of rows ! * in the table exceeds INT_MAX. The algorithm should work as long as the ! * row count does not become so large that it is not represented accurately ! * in a double (on IEEE-math machines this would be around 2^52 rows). * * init_selection_state computes the initial W value. * ! * Given that we've already processed t records (t >= n), ! * select_next_random_record determines the number of the next record to ! * process. */ static double init_selection_state(int n) --- 919,935 ---- /* * These two routines embody Algorithm Z from "Random sampling with a * reservoir" by Jeffrey S. Vitter, in ACM Trans. Math. Softw. 11, 1 ! * (Mar. 1985), Pages 37-57. Vitter describes his algorithm in terms ! * of the count S of records to skip before processing another record. ! * It is computed primarily based on t, the index (counting from 1) ! * of the last record processed. The only extra state needed between ! * calls is W, a random state variable. * * init_selection_state computes the initial W value. * ! * Given that we've already processed t records (t >= n), get_next_S ! * determines the number of records to skip before the next record is ! * processed. */ static double init_selection_state(int n) *************** *** 897,932 **** return exp(-log(random_fract()) / n); } ! static double ! select_next_random_record(double t, int n, double *stateptr) { /* The magic constant here is T from Vitter's paper */ ! if (t <= (22.0 * n)) { /* Process records using Algorithm X until t is large enough */ double V, quot; V = random_fract(); /* Generate V */ t += 1; ! quot = (t - (double) n) / t; /* Find min S satisfying (4.1) */ while (quot > V) { t += 1; ! quot *= (t - (double) n) / t; } } else { /* Now apply Algorithm Z */ double W = *stateptr; ! double term = t - (double) n + 1; ! double S; for (;;) { ! double numer, numer_lim, denom; double U, --- 938,976 ---- return exp(-log(random_fract()) / n); } ! static TupleCount ! get_next_S(TupleCount t, int n, double *stateptr) { /* The magic constant here is T from Vitter's paper */ ! if (t <= (22 * n)) { /* Process records using Algorithm X until t is large enough */ + TupleCount S = 0; double V, quot; V = random_fract(); /* Generate V */ t += 1; ! quot = 1.0 - (double) n / t; /* Find min S satisfying (4.1) */ while (quot > V) { + S += 1; t += 1; ! quot *= 1.0 - (double) n / t; } + return S; } else { /* Now apply Algorithm Z */ double W = *stateptr; ! TupleCount term = t - n + 1; ! TupleCount S; for (;;) { ! TupleCount numer, numer_lim, denom; double U, *************** *** 941,947 **** X = t * (W - 1.0); S = floor(X); /* S is tentatively set to floor(X) */ /* Test if U <= h(S)/cg(X) in the manner of (6.3) */ ! tmp = (t + 1) / term; lhs = exp(log(((U * tmp * tmp) * (term + S)) / (t + X)) / n); rhs = (((t + X) / (term + S)) * term) / t; if (lhs <= rhs) --- 985,991 ---- X = t * (W - 1.0); S = floor(X); /* S is tentatively set to floor(X) */ /* Test if U <= h(S)/cg(X) in the manner of (6.3) */ ! tmp = (double) (t + 1) / term; lhs = exp(log(((U * tmp * tmp) * (term + S)) / (t + X)) / n); rhs = (((t + X) / (term + S)) * term) / t; if (lhs <= rhs) *************** *** 951,979 **** } /* Test if U <= f(S)/cg(X) */ y = (((U * (t + 1)) / term) * (t + S + 1)) / (t + X); ! if ((double) n < S) { denom = t; numer_lim = term + S; } else { ! denom = t - (double) n + S; numer_lim = t + 1; } for (numer = t + S; numer >= numer_lim; numer -= 1) { ! y *= numer / denom; denom -= 1; } W = exp(-log(random_fract()) / n); /* Generate W in advance */ if (exp(log(y) / n) <= (t + X) / t) break; } - t += S + 1; *stateptr = W; } - return t; } /* --- 995,1022 ---- } /* Test if U <= f(S)/cg(X) */ y = (((U * (t + 1)) / term) * (t + S + 1)) / (t + X); ! if (n < S) { denom = t; numer_lim = term + S; } else { ! denom = t - n + S; numer_lim = t + 1; } for (numer = t + S; numer >= numer_lim; numer -= 1) { ! y *= (double) numer / denom; denom -= 1; } W = exp(-log(random_fract()) / n); /* Generate W in advance */ if (exp(log(y) / n) <= (t + X) / t) break; } *stateptr = W; + return S; } } /*

---------------------------(end of broadcast)--------------------------- TIP 6: Have you searched our list archives? http://archives.postgresql.org