Changeset: ac64b6abe064 for MonetDB
URL: http://dev.monetdb.org/hg/MonetDB?cmd=changeset;node=ac64b6abe064
Modified Files:
        gdk/gdk_select.c
Branch: default
Log Message:

revised result estimation for hash- & scan-select:

hash- & scan-select do not require the (possibly expansive)
perfectly uniform BATsample() to estimate result sizes.

Rather, we can estimate exact result sizes in some special cases,
perform a simple quick "layman's" "pseudo sample" for some
hash-select cases, and simply allocate 1M tuples by default
in all other cases, balacing between (over-)allocation and
(delaying/avoiding) BATextend(), sampling as we go.


diffs (147 lines):

diff --git a/gdk/gdk_select.c b/gdk/gdk_select.c
--- a/gdk/gdk_select.c
+++ b/gdk/gdk_select.c
@@ -340,10 +340,10 @@ BAT *
 BATsubselect(BAT *b, BAT *s, const void *tl, const void *th,
              int li, int hi, int anti)
 {
-       int hval, lval, equi, t, lnil;
+       int hval, lval, equi, t, lnil, hash;
        const void *nil;
        BAT *bn;
-       BUN estimate;
+       BUN estimate = BUN_NONE, maximum = BUN_NONE;
 
        BATcheck(b, "BATsubselect");
        BATcheck(tl, "BATsubselect: tl value required");
@@ -601,41 +601,102 @@ BATsubselect(BAT *b, BAT *s, const void 
                return bn;
        }
 
-       if (b->tkey) {
-               estimate = 1;
-       } else if (s) {
-               estimate = BATcount(s);
-       } else if (BATcount(b) <= 100000) {
-               estimate = BATguess(b);
-       } else {
-               BAT *tmp1 = BATmirror(BATmark(BATmirror(b), oid_nil));
-               estimate = 0;
-               if (tmp1) {
-                       BAT *tmp2 = BATsample(tmp1, 128);
-                       if (tmp2) {
-                               BAT *tmp3;
-                               BATseqbase(tmp2, 0);
-                               tmp3 = BATsubselect(tmp2, NULL, tl, th, li, hi, 
anti);
-                               if (tmp3) {
-                                       estimate = (BUN) ((lng) BATcount(tmp3) 
* BATcount(b) / 100);
-                                       BBPreclaim(tmp3);
-                               }
-                               BBPreclaim(tmp2);
-                       }
-                       BBPreclaim(tmp1);
+       /* upper limit for result size */
+       maximum = BATcount(b);
+       if (s) {
+               /* refine upper limit of result size by candidate list */
+               oid ol = b->hseqbase;
+               oid oh = ol + BATcount(b);
+               assert(s->tsorted);
+               assert(s->tkey);
+               if (BATtdense(s)) {
+                       maximum = MIN( maximum , 
+                                      MIN( oh , s->tseqbase + BATcount(s)) 
+                                      - MAX( ol , s->tseqbase ) );
+               } else {
+                       maximum = MIN( maximum ,
+                                      SORTfndfirst(s, &oh) 
+                                      - SORTfndfirst(s, &ol) ) ;
                }
        }
+       if (b->tkey) {
+               /* exact result size in special cases */
+               if (equi) {
+                       estimate = 1;
+               } else
+               if (!anti && lval && hval) {
+                       if (ATOMstorage(b->ttype) == TYPE_bte) {
+                               estimate = (BUN) (*(bte*) th - *(bte*) tl);
+                       } else
+                       if (ATOMstorage(b->ttype) == TYPE_sht) {
+                               estimate = (BUN) (*(sht*) th - *(sht*) tl);
+                       } else
+                       if (ATOMstorage(b->ttype) == TYPE_int) {
+                               estimate = (BUN) (*(int*) th - *(int*) tl);
+                       } else
+                       if (ATOMstorage(b->ttype) == TYPE_lng) {
+                               estimate = (BUN) (*(lng*) th - *(lng*) tl);
+                       }
+                       if (estimate != BUN_NONE) {
+                               estimate += li + hi - 1;
+                       }
+               }
+       }
+       /* refine upper limit by exact size (if known) */
+       maximum = MIN(maximum, estimate);
+       hash = b->batPersistence == PERSISTENT &&
+              (size_t) ATOMsize(b->ttype) > sizeof(BUN) / 4 &&
+              BATcount(b) * (ATOMsize(b->ttype) + 2 * sizeof(BUN)) < 
GDK_mem_maxsize / 2;
+       if (estimate == BUN_NONE && equi && !b->T->hash && hash) {
+               /* no exact result size, but we need estimate to choose
+                * between hash- & scan-select */
+               if (BATcount(b) <= 10000) {
+                       /* "small" input: don't bother about more accurate
+                        * estimate */
+                       estimate = maximum;
+               } else {
+                       /* layman's quick "pseudo-sample" of 1000 tuples,
+                        * i.e., 333 from begin, middle & end of BAT */
+                       BUN smpl_cnt = 0, slct_cnt = 0, pos, skip, delta;
+                       BAT *smpl, *slct;
+
+                       delta = 1000 / 3 / 2;
+                       skip = (BATcount(b) - (2 * delta)) / 2;
+                       for (pos = delta; pos < BATcount(b); pos += skip) {
+                               smpl = BATslice(b, pos - delta, pos + delta);
+                               if (smpl) {
+                                       slct = BATsubselect(smpl, NULL, tl,
+                                                           th, li, hi, anti);
+                                       if (slct) {
+                                               smpl_cnt += BATcount(smpl);
+                                               slct_cnt += BATcount(slct);
+                                               BBPreclaim(slct);
+                                       }
+                                       BBPreclaim(smpl);
+                               }
+                       }
+                       if (smpl_cnt > 0 && slct_cnt > 0) {
+                               /* linear extrapolation plus 10% margin */
+                               estimate = (BUN) ((dbl) slct_cnt / (dbl) 
smpl_cnt
+                                                 * (dbl) BATcount(b) * 1.1);
+                       }
+               }
+               hash = hash && estimate < BATcount(b) / 100;
+       }
+       if (estimate == BUN_NONE) {
+               /* no better estimate possible/required:
+                * (pre-)allocate 1M tuples, i.e., avoid/delay extend
+                * without too much overallocation */
+               estimate = 1000000;
+       }
+       /* limit estimation by upper limit */
+       estimate = MIN(estimate, maximum);
 
        bn = BATnew(TYPE_void, TYPE_oid, estimate);
        if (bn == NULL)
                return NULL;
 
-       if (equi &&
-           (b->T->hash ||
-            (b->batPersistence == PERSISTENT &&
-             (size_t) ATOMsize(b->ttype) > sizeof(BUN) / 4 &&
-             estimate < BATcount(b) / 100 &&
-             BATcount(b) * (ATOMsize(b->ttype) + 2 * sizeof(BUN)) < 
GDK_mem_maxsize / 2))) {
+       if (equi && (b->T->hash || hash)) {
                ALGODEBUG fprintf(stderr, "#BATsubselect(b=%s#" BUNFMT
                                  ",s=%s,anti=%d): hash select\n",
                                  BATgetId(b), BATcount(b),
_______________________________________________
checkin-list mailing list
[email protected]
http://mail.monetdb.org/mailman/listinfo/checkin-list

Reply via email to