This patch implements the new tuple sampling method as discussed on
-hackers and -performance a few weeks ago.

"Large DB" -hackers 2004-04-02
"query slows down with more accurate stats" -perform 2004-04-13
"Tuple sampling" -hackers 2004-04-19

Servus
 Manfred
diff -rcN ../base/src/backend/commands/analyze.c src/backend/commands/analyze.c
*** ../base/src/backend/commands/analyze.c      Sun May  9 12:05:51 2004
--- src/backend/commands/analyze.c      Sun May  9 19:33:27 2004
***************
*** 38,43 ****
--- 38,66 ----
  #include "utils/syscache.h"
  #include "utils/tuplesort.h"
  
+ /*
+  * With two-stage sampling we process at most 300000 blocks, each of
+  * which contains less than 1200 tuples (at 32K block size, smallest
+  * possible tuple size).  So the number of tuples processed can be
+  * stored in a 32 bit int.
+  * Change this datatype to double if the number of rows in the table
+  * might exceed INT_MAX.  Then the algorithm should work as long as the
+  * row count does not become so large that it is not represented accurately
+  * in a double (on IEEE-math machines this would be around 2^52 rows).
+  */
+ typedef int TupleCount;
+ 
+ /*
+ ** data structure for Algorithm S from Knuth 3.4.2
+ */
+ typedef struct
+ {
+       BlockNumber     N;                              /* number of blocks, known in 
advance */
+       int                     n;                              /* sample size */
+       BlockNumber     t;                              /* current block number */
+       int                     m;                              /* blocks selected so 
far */
+ } BlockSamplerData;
+ typedef BlockSamplerData *BlockSampler;
  
  /* Per-index data for ANALYZE */
  typedef struct AnlIndexData
***************
*** 57,62 ****
--- 80,89 ----
  static MemoryContext anl_context = NULL;
  
  
+ static void BlockSampler_Init(BlockSampler bs, BlockNumber nblocks, int samplesize);
+ static bool BlockSampler_HasMore(BlockSampler bs);
+ static BlockNumber BlockSampler_Next(BlockSampler bs);
+ 
  static void compute_index_stats(Relation onerel, double totalrows,
                                                                AnlIndexData 
*indexdata, int nindexes,
                                                                HeapTuple *rows, int 
numrows,
***************
*** 66,72 ****
                                        int targrows, double *totalrows);
  static double random_fract(void);
  static double init_selection_state(int n);
! static double select_next_random_record(double t, int n, double *stateptr);
  static int    compare_rows(const void *a, const void *b);
  static void update_attstats(Oid relid, int natts, VacAttrStats **vacattrstats);
  static Datum std_fetch_func(VacAttrStatsP stats, int rownum, bool *isNull);
--- 93,99 ----
                                        int targrows, double *totalrows);
  static double random_fract(void);
  static double init_selection_state(int n);
! static TupleCount get_next_S(TupleCount t, int n, double *stateptr);
  static int    compare_rows(const void *a, const void *b);
  static void update_attstats(Oid relid, int natts, VacAttrStats **vacattrstats);
  static Datum std_fetch_func(VacAttrStatsP stats, int rownum, bool *isNull);
***************
*** 638,649 ****
  }
  
  /*
   * acquire_sample_rows -- acquire a random sample of rows from the table
   *
!  * Up to targrows rows are collected (if there are fewer than that many
!  * rows in the table, all rows are collected).        When the table is larger
!  * than targrows, a truly random sample is collected: every row has an
!  * equal chance of ending up in the final sample.
   *
   * We also estimate the total number of rows in the table, and return that
   * into *totalrows.
--- 665,754 ----
  }
  
  /*
+ ** BlockSampler_Init -- prepare for random sampling of blocknumbers
+ **
+ ** BlockSampler is used for stage one of our new two-stage tuple
+ ** sampling mechanism as discussed on -hackers 2004-04-02 (subject
+ ** "Large DB").  It selects a random sample of samplesize blocks out of
+ ** the nblocks blocks in the table.  If the table has less than
+ ** samplesize blocks, all blocks are selected.
+ **
+ */
+ static void
+ BlockSampler_Init(BlockSampler bs, BlockNumber nblocks, int samplesize)
+ {
+       bs->N = nblocks;                        /* table size */
+       /*
+       ** If we decide to reduce samplesize for tables that have less or
+       ** not much more than samplesize blocks, here is the place to do
+       ** it.
+       */
+       bs->n = samplesize;
+       bs->t = 0;                                      /* blocks scanned so far */
+       bs->m = 0;                                      /* blocks selected so far */
+ }
+ 
+ static bool
+ BlockSampler_HasMore(BlockSampler bs)
+ {
+       return (bs->t < bs->N) && (bs->m < bs->n);
+ }
+ 
+ static BlockNumber
+ BlockSampler_Next(BlockSampler bs)
+ {
+       BlockNumber     K = bs->N - bs->t;              /* remaining blocks */
+       int                     k = bs->n - bs->m;              /* blocks still to 
sample */
+       double          p;                              /* probability to skip the 
next block */
+       double          V;                              /* random */
+ 
+       Assert(BlockSampler_HasMore(bs));
+ 
+       if (k >= K)
+       {
+               /* need all the rest */
+               bs->t += 1;
+               bs->m += 1;
+               return bs->t - 1;
+       }
+       
+       p = 1.0 - (double) k / (double) K;
+       V = random_fract();
+       while (V < p)
+       {
+               bs->t += 1;
+               K -= 1;
+               /*
+               ** Assert(K > 0)
+               ** because we startet with K > k > 0,
+               ** and when K == k, the loop terminates
+               */
+               p *= 1.0 - (double) k / (double) K;
+       }
+ 
+       /* select */
+       bs->t += 1;
+       bs->m += 1;
+       return bs->t - 1;
+ }
+ 
+ /*
   * acquire_sample_rows -- acquire a random sample of rows from the table
   *
!  * As of May 2004 we use a new two-stage method:  Stage one selects up
!  * to targrows random blocks (or all blocks, if there aren't so many).
!  * Stage two scans these blocks and uses the Vitter algorithm to create
!  * a random sample of targrows rows (or less, if there are less in the
!  * sample of blocks).  The two stages are executed simultaneously:  Each
!  * block is processed as soon as stage one returns its number and while
!  * the rows are read stage two controls which ones are to be inserted
!  * into the sample.
!  *
!  * Although every row has an equal chance of ending up in the final
!  * sample, this sampling method is not perfect: not every possible
!  * sample has an equal chance of being selected.  For large relations
!  * the number of different blocks represented by the sample tends to be
!  * too small.  We can live with that for now.  Improvements are welcome.
   *
   * We also estimate the total number of rows in the table, and return that
   * into *totalrows.
***************
*** 656,754 ****
                                        double *totalrows)
  {
        int                     numrows = 0;
!       HeapScanDesc scan;
        BlockNumber     totalblocks;
!       HeapTuple       tuple;
!       ItemPointer lasttuple;
!       BlockNumber lastblock,
!                               estblock;
!       OffsetNumber lastoffset;
!       int                     numest;
!       double          tuplesperpage;
!       double          t;
        double          rstate;
  
        Assert(targrows > 1);
  
!       /*
!        * Do a simple linear scan until we reach the target number of rows.
!        */
!       scan = heap_beginscan(onerel, SnapshotNow, 0, NULL);
!       totalblocks = scan->rs_nblocks; /* grab current relation size */
!       while ((tuple = heap_getnext(scan, ForwardScanDirection)) != NULL)
!       {
!               rows[numrows++] = heap_copytuple(tuple);
!               if (numrows >= targrows)
!                       break;
!               vacuum_delay_point();
!       }
!       heap_endscan(scan);
  
        /*
!        * If we ran out of tuples then we're done, no matter how few we
!        * collected.  No sort is needed, since they're already in order.
!        */
!       if (!HeapTupleIsValid(tuple))
!       {
!               *totalrows = (double) numrows;
! 
!               ereport(elevel,
!                               (errmsg("\"%s\": %u pages, %d rows sampled, %.0f 
estimated total rows",
!                                               RelationGetRelationName(onerel),
!                                               totalblocks, numrows, *totalrows)));
! 
!               return numrows;
!       }
! 
        /*
!        * Otherwise, start replacing tuples in the sample until we reach the
!        * end of the relation.  This algorithm is from Jeff Vitter's paper
!        * (see full citation below).  It works by repeatedly computing the
!        * number of the next tuple we want to fetch, which will replace a
!        * randomly chosen element of the reservoir (current set of tuples).
!        * At all times the reservoir is a true random sample of the tuples
!        * we've passed over so far, so when we fall off the end of the
!        * relation we're done.
!        *
!        * A slight difficulty is that since we don't want to fetch tuples or
!        * even pages that we skip over, it's not possible to fetch *exactly*
!        * the N'th tuple at each step --- we don't know how many valid tuples
!        * are on the skipped pages.  We handle this by assuming that the
!        * average number of valid tuples/page on the pages already scanned
!        * over holds good for the rest of the relation as well; this lets us
!        * estimate which page the next tuple should be on and its position in
!        * the page.  Then we fetch the first valid tuple at or after that
!        * position, being careful not to use the same tuple twice.  This
!        * approach should still give a good random sample, although it's not
!        * perfect.
!        */
!       lasttuple = &(rows[numrows - 1]->t_self);
!       lastblock = ItemPointerGetBlockNumber(lasttuple);
!       lastoffset = ItemPointerGetOffsetNumber(lasttuple);
! 
!       /*
!        * If possible, estimate tuples/page using only completely-scanned
!        * pages.
!        */
!       for (numest = numrows; numest > 0; numest--)
!       {
!               if (ItemPointerGetBlockNumber(&(rows[numest - 1]->t_self)) != 
lastblock)
!                       break;
!       }
!       if (numest == 0)
!       {
!               numest = numrows;               /* don't have a full page? */
!               estblock = lastblock + 1;
!       }
!       else
!               estblock = lastblock;
!       tuplesperpage = (double) numest / (double) estblock;
! 
!       t = (double) numrows;           /* t is the # of records processed so far */
        rstate = init_selection_state(targrows);
!       for (;;)
        {
-               double          targpos;
                BlockNumber targblock;
                Buffer          targbuffer;
                Page            targpage;
--- 761,788 ----
                                        double *totalrows)
  {
        int                     numrows = 0;
!       TupleCount      liverows = 0;
!       TupleCount      deadrows = 0;
!       TupleCount      rowstoskip = -1;
        BlockNumber     totalblocks;
!       BlockSamplerData bs;
        double          rstate;
  
        Assert(targrows > 1);
  
!       totalblocks = RelationGetNumberOfBlocks(onerel);
  
        /*
!       ** Prepare for sampling block numbers
!       */
!       BlockSampler_Init(&bs, totalblocks, targrows);
        /*
!       ** Prepare for sampling rows
!       */
        rstate = init_selection_state(targrows);
! 
!       while (BlockSampler_HasMore(&bs))
        {
                BlockNumber targblock;
                Buffer          targbuffer;
                Page            targpage;
***************
*** 757,783 ****
  
                vacuum_delay_point();
  
!               t = select_next_random_record(t, targrows, &rstate);
!               /* Try to read the t'th record in the table */
!               targpos = t / tuplesperpage;
!               targblock = (BlockNumber) targpos;
!               targoffset = ((int) ((targpos - targblock) * tuplesperpage)) +
!                       FirstOffsetNumber;
!               /* Make sure we are past the last selected record */
!               if (targblock <= lastblock)
!               {
!                       targblock = lastblock;
!                       if (targoffset <= lastoffset)
!                               targoffset = lastoffset + 1;
!               }
!               /* Loop to find first valid record at or after given position */
! pageloop:;
! 
!               /*
!                * Have we fallen off the end of the relation?
!                */
!               if (targblock >= totalblocks)
!                       break;
  
                /*
                 * We must maintain a pin on the target page's buffer to ensure
--- 791,797 ----
  
                vacuum_delay_point();
  
!               targblock = BlockSampler_Next(&bs);
  
                /*
                 * We must maintain a pin on the target page's buffer to ensure
***************
*** 795,848 ****
                maxoffset = PageGetMaxOffsetNumber(targpage);
                LockBuffer(targbuffer, BUFFER_LOCK_UNLOCK);
  
!               for (;;)
                {
                        HeapTupleData targtuple;
                        Buffer          tupbuffer;
  
-                       if (targoffset > maxoffset)
-                       {
-                               /* Fell off end of this page, try next */
-                               ReleaseBuffer(targbuffer);
-                               targblock++;
-                               targoffset = FirstOffsetNumber;
-                               goto pageloop;
-                       }
                        ItemPointerSet(&targtuple.t_self, targblock, targoffset);
                        if (heap_fetch(onerel, SnapshotNow, &targtuple, &tupbuffer,
                                                   false, NULL))
                        {
                                /*
!                                * Found a suitable tuple, so save it, replacing one 
old
!                                * tuple at random
                                 */
!                               int                     k = (int) (targrows * 
random_fract());
  
-                               Assert(k >= 0 && k < targrows);
-                               heap_freetuple(rows[k]);
-                               rows[k] = heap_copytuple(&targtuple);
                                /* this releases the second pin acquired by 
heap_fetch: */
                                ReleaseBuffer(tupbuffer);
-                               /* this releases the initial pin: */
-                               ReleaseBuffer(targbuffer);
-                               lastblock = targblock;
-                               lastoffset = targoffset;
-                               break;
                        }
!                       /* this tuple is dead, so advance to next one on same page */
!                       targoffset++;
                }
        }
  
        /*
!        * Now we need to sort the collected tuples by position (itempointer).
         */
!       qsort((void *) rows, numrows, sizeof(HeapTuple), compare_rows);
  
        /*
         * Estimate total number of valid rows in relation.
         */
!       *totalrows = floor((double) totalblocks * tuplesperpage + 0.5);
  
        /*
         * Emit some interesting relation info 
--- 809,895 ----
                maxoffset = PageGetMaxOffsetNumber(targpage);
                LockBuffer(targbuffer, BUFFER_LOCK_UNLOCK);
  
!               for (targoffset = FirstOffsetNumber; targoffset <= maxoffset; 
++targoffset)
                {
                        HeapTupleData targtuple;
                        Buffer          tupbuffer;
  
                        ItemPointerSet(&targtuple.t_self, targblock, targoffset);
                        if (heap_fetch(onerel, SnapshotNow, &targtuple, &tupbuffer,
                                                   false, NULL))
                        {
+                               liverows += 1;
                                /*
!                                * The first targrows rows are simply copied into the
!                                * reservoir.
!                                * Then we start replacing tuples in the sample until
!                                * we reach the end of the relation.  This algorithm is
!                                * from Jeff Vitter's paper (see full citation below).
!                                * It works by repeatedly computing the number of the
!                                * next tuple we want to fetch, which will replace a
!                                * randomly chosen element of the reservoir (current
!                                * set of tuples).  At all times the reservoir is a 
true
!                                * random sample of the tuples we've passed over so 
far,
!                                * so when we fall off the end of the relation we're 
done.
                                 */
!                               if (numrows < targrows)
!                                       rows[numrows++] = heap_copytuple(&targtuple);
!                               else
!                               {
!                                       if (rowstoskip < 0)
!                                               rowstoskip = get_next_S(liverows, 
targrows, &rstate);
!                                       if (rowstoskip == 0)
!                                       {
!                                               /*
!                                                * Found a suitable tuple, so save it,
!                                                * replacing one old tuple at random
!                                                */
!                                               int     k = (int) (targrows * 
random_fract());
! 
!                                               Assert(k >= 0 && k < targrows);
!                                               heap_freetuple(rows[k]);
!                                               rows[k] = heap_copytuple(&targtuple);
!                                       }
! 
!                                       rowstoskip -= 1;
!                               }
  
                                /* this releases the second pin acquired by 
heap_fetch: */
                                ReleaseBuffer(tupbuffer);
                        }
!                       else
!                       {
!                               /* This information is currently not used. */
!                               deadrows += 1;
!                       }
                }
+ 
+               /* this releases the initial pin: */
+               ReleaseBuffer(targbuffer);
        }
  
+       ereport(DEBUG2,
+                       (errmsg("%d pages sampled, %.0f live rows and %.0f dead rows 
scanned",
+                                       bs.m, (double) liverows, (double) deadrows)));
+ 
        /*
!        * If we ran out of tuples then we're done.  No sort is needed,
!        * since they're already in order.
!        *
!        * Otherwise we need to sort the collected tuples by position
!        * (itempointer).  I don't care for corner cases where the tuples
!        * are already sorted.
         */
!       if (numrows == targrows)
!               qsort((void *) rows, numrows, sizeof(HeapTuple), compare_rows);
  
        /*
         * Estimate total number of valid rows in relation.
         */
!       if (bs.m > 0)
!               *totalrows = floor((double) liverows * totalblocks / bs.m + 0.5);
!       else
!               *totalrows = 0.0;
  
        /*
         * Emit some interesting relation info 
***************
*** 872,894 ****
  /*
   * These two routines embody Algorithm Z from "Random sampling with a
   * reservoir" by Jeffrey S. Vitter, in ACM Trans. Math. Softw. 11, 1
!  * (Mar. 1985), Pages 37-57.  While Vitter describes his algorithm in terms
!  * of the count S of records to skip before processing another record,
!  * it is convenient to work primarily with t, the index (counting from 1)
!  * of the last record processed and next record to process.  The only extra
!  * state needed between calls is W, a random state variable.
!  *
!  * Note: the original algorithm defines t, S, numer, and denom as integers.
!  * Here we express them as doubles to avoid overflow if the number of rows
!  * in the table exceeds INT_MAX.  The algorithm should work as long as the
!  * row count does not become so large that it is not represented accurately
!  * in a double (on IEEE-math machines this would be around 2^52 rows).
   *
   * init_selection_state computes the initial W value.
   *
!  * Given that we've already processed t records (t >= n),
!  * select_next_random_record determines the number of the next record to
!  * process.
   */
  static double
  init_selection_state(int n)
--- 919,935 ----
  /*
   * These two routines embody Algorithm Z from "Random sampling with a
   * reservoir" by Jeffrey S. Vitter, in ACM Trans. Math. Softw. 11, 1
!  * (Mar. 1985), Pages 37-57.  Vitter describes his algorithm in terms
!  * of the count S of records to skip before processing another record.
!  * It is computed primarily based on t, the index (counting from 1)
!  * of the last record processed.  The only extra state needed between
!  * calls is W, a random state variable.
   *
   * init_selection_state computes the initial W value.
   *
!  * Given that we've already processed t records (t >= n), get_next_S
!  * determines the number of records to skip before the next record is
!  * processed.
   */
  static double
  init_selection_state(int n)
***************
*** 897,932 ****
        return exp(-log(random_fract()) / n);
  }
  
! static double
! select_next_random_record(double t, int n, double *stateptr)
  {
        /* The magic constant here is T from Vitter's paper */
!       if (t <= (22.0 * n))
        {
                /* Process records using Algorithm X until t is large enough */
                double          V,
                                        quot;
  
                V = random_fract();             /* Generate V */
                t += 1;
!               quot = (t - (double) n) / t;
                /* Find min S satisfying (4.1) */
                while (quot > V)
                {
                        t += 1;
!                       quot *= (t - (double) n) / t;
                }
        }
        else
        {
                /* Now apply Algorithm Z */
                double          W = *stateptr;
!               double          term = t - (double) n + 1;
!               double          S;
  
                for (;;)
                {
!                       double          numer,
                                                numer_lim,
                                                denom;
                        double          U,
--- 938,976 ----
        return exp(-log(random_fract()) / n);
  }
  
! static TupleCount
! get_next_S(TupleCount t, int n, double *stateptr)
  {
        /* The magic constant here is T from Vitter's paper */
!       if (t <= (22 * n))
        {
                /* Process records using Algorithm X until t is large enough */
+               TupleCount S = 0;
                double          V,
                                        quot;
  
                V = random_fract();             /* Generate V */
                t += 1;
!               quot = 1.0 - (double) n / t;
                /* Find min S satisfying (4.1) */
                while (quot > V)
                {
+                       S += 1;
                        t += 1;
!                       quot *= 1.0 - (double) n / t;
                }
+               return S;
        }
        else
        {
                /* Now apply Algorithm Z */
                double          W = *stateptr;
!               TupleCount      term = t - n + 1;
!               TupleCount      S;
  
                for (;;)
                {
!                       TupleCount      numer,
                                                numer_lim,
                                                denom;
                        double          U,
***************
*** 941,947 ****
                        X = t * (W - 1.0);
                        S = floor(X);           /* S is tentatively set to floor(X) */
                        /* Test if U <= h(S)/cg(X) in the manner of (6.3) */
!                       tmp = (t + 1) / term;
                        lhs = exp(log(((U * tmp * tmp) * (term + S)) / (t + X)) / n);
                        rhs = (((t + X) / (term + S)) * term) / t;
                        if (lhs <= rhs)
--- 985,991 ----
                        X = t * (W - 1.0);
                        S = floor(X);           /* S is tentatively set to floor(X) */
                        /* Test if U <= h(S)/cg(X) in the manner of (6.3) */
!                       tmp = (double) (t + 1) / term;
                        lhs = exp(log(((U * tmp * tmp) * (term + S)) / (t + X)) / n);
                        rhs = (((t + X) / (term + S)) * term) / t;
                        if (lhs <= rhs)
***************
*** 951,979 ****
                        }
                        /* Test if U <= f(S)/cg(X) */
                        y = (((U * (t + 1)) / term) * (t + S + 1)) / (t + X);
!                       if ((double) n < S)
                        {
                                denom = t;
                                numer_lim = term + S;
                        }
                        else
                        {
!                               denom = t - (double) n + S;
                                numer_lim = t + 1;
                        }
                        for (numer = t + S; numer >= numer_lim; numer -= 1)
                        {
!                               y *= numer / denom;
                                denom -= 1;
                        }
                        W = exp(-log(random_fract()) / n);      /* Generate W in 
advance */
                        if (exp(log(y) / n) <= (t + X) / t)
                                break;
                }
-               t += S + 1;
                *stateptr = W;
        }
-       return t;
  }
  
  /*
--- 995,1022 ----
                        }
                        /* Test if U <= f(S)/cg(X) */
                        y = (((U * (t + 1)) / term) * (t + S + 1)) / (t + X);
!                       if (n < S)
                        {
                                denom = t;
                                numer_lim = term + S;
                        }
                        else
                        {
!                               denom = t - n + S;
                                numer_lim = t + 1;
                        }
                        for (numer = t + S; numer >= numer_lim; numer -= 1)
                        {
!                               y *= (double) numer / denom;
                                denom -= 1;
                        }
                        W = exp(-log(random_fract()) / n);      /* Generate W in 
advance */
                        if (exp(log(y) / n) <= (t + X) / t)
                                break;
                }
                *stateptr = W;
+               return S;
        }
  }
  
  /*
---------------------------(end of broadcast)---------------------------
TIP 6: Have you searched our list archives?

               http://archives.postgresql.org

Reply via email to