Changeset: 3124405e5d11 for MonetDB
URL: https://dev.monetdb.org/hg/MonetDB/rev/3124405e5d11
Modified Files:
        clients/Tests/exports.stable.out
        gdk/gdk_cand.c
        gdk/gdk_cand.h
        gdk/gdk_join.c
Branch: default
Log Message:

In mergejoin, avoid searching the candidate list for the nth set bit in a mask.


diffs (truncated from 302 to 300 lines):

diff --git a/clients/Tests/exports.stable.out b/clients/Tests/exports.stable.out
--- a/clients/Tests/exports.stable.out
+++ b/clients/Tests/exports.stable.out
@@ -449,7 +449,7 @@ ssize_t bteToStr(str *dst, size_t *len, 
 const bte bte_nil;
 oid canditer_idx(struct canditer *ci, BUN p);
 BUN canditer_init(struct canditer *ci, BAT *b, BAT *s);
-oid canditer_last(struct canditer *ci);
+oid canditer_last(const struct canditer *ci);
 oid canditer_peek(struct canditer *ci);
 oid canditer_peekprev(struct canditer *ci);
 oid canditer_prev(struct canditer *ci);
diff --git a/gdk/gdk_cand.c b/gdk/gdk_cand.c
--- a/gdk/gdk_cand.c
+++ b/gdk/gdk_cand.c
@@ -806,6 +806,45 @@ canditer_peekprev(struct canditer *ci)
        return o;
 }
 
+/* if o is a candidate, return it, else return the next candidate from
+ * the cand_mask iterator ci after (if next is set) or before (if next
+ * is unset) o; if there are no more candidates, return oid_nil */
+oid
+canditer_mask_next(struct canditer *ci, oid o, bool next)
+{
+       if (o < ci->mskoff)
+               return next ? ci->mskoff + ci->firstbit : oid_nil;
+       o -= ci->mskoff;
+       BUN p = o / 32;
+       o %= 32;
+       if (p >= ci->nvals || (p == ci->nvals - 1 && o >= ci->lastbit))
+               return next ? oid_nil : canditer_last(ci);
+       if (next) {
+               while ((ci->mask[p] & (1U << o)) == 0) {
+                       if (++o == 32) {
+                               o = 0;
+                               if (++p == ci->nvals)
+                                       return oid_nil;
+                       }
+               }
+               if (p == ci->nvals - 1 && o >= ci->lastbit)
+                       return oid_nil;
+       } else {
+               while ((ci->mask[p] & (1U << o)) == 0) {
+                       if (o == 0) {
+                               o = 31;
+                               if (p == 0)
+                                       return oid_nil;
+                       } else {
+                               o--;
+                       }
+               }
+               if (p == 0 && o < ci->firstbit)
+                       return oid_nil;
+       }
+       return ci->mskoff + 32 * p + o;
+}
+
 /* return the last candidate */
 oid
 canditer_last(struct canditer *ci)
diff --git a/gdk/gdk_cand.h b/gdk/gdk_cand.h
--- a/gdk/gdk_cand.h
+++ b/gdk/gdk_cand.h
@@ -197,6 +197,7 @@ canditer_contains(struct canditer *ci, o
        }
        return canditer_search(ci, o, false) != BUN_NONE;
 }
+gdk_export oid canditer_mask_next(struct canditer *ci, oid o, bool next);
 gdk_export BAT *canditer_slice(struct canditer *ci, BUN lo, BUN hi);
 gdk_export BAT *canditer_sliceval(struct canditer *ci, oid lo, oid hi);
 gdk_export BAT *canditer_slice2(struct canditer *ci, BUN lo1, BUN hi1, BUN 
lo2, BUN hi2);
diff --git a/gdk/gdk_join.c b/gdk/gdk_join.c
--- a/gdk/gdk_join.c
+++ b/gdk/gdk_join.c
@@ -1684,9 +1684,10 @@ mergejoin(BAT **r1p, BAT **r2p, BAT *l, 
        int lordering, rordering;
        oid lv;
        BUN i, j;               /* counters */
-       bool lskipped = false;  /* whether we skipped values in l */
        oid lval = oid_nil, rval = oid_nil; /* temporary space to point v to */
        struct canditer llci, rrci;
+       struct canditer *mlci, xlci;
+       struct canditer *mrci, xrci;
 
        if (lci->tpe == cand_dense && lci->ncand == BATcount(l) &&
            rci->tpe == cand_dense && rci->ncand == BATcount(r) &&
@@ -1718,7 +1719,7 @@ mergejoin(BAT **r1p, BAT **r2p, BAT *l, 
                canditer_init(&llci, NULL, l);
                lvals = NULL;
        } else {
-               lvals = Tloc(l, 0);
+               lvals = Tloc(l, 0);                           /* non NULL */
                llci = (struct canditer) {.tpe = cand_dense}; /* not used */
        }
        rrci = (struct canditer) {.tpe = cand_dense};
@@ -1772,6 +1773,23 @@ mergejoin(BAT **r1p, BAT **r2p, BAT *l, 
        BAT *r1 = *r1p;
        BAT *r2 = r2p ? *r2p : NULL;
 
+       if (lci->tpe == cand_mask) {
+               mlci = lci;
+               canditer_init(&xlci, l, NULL);
+               lci = &xlci;
+       } else {
+               mlci = NULL;
+               xlci = (struct canditer) {.tpe = cand_dense}; /* not used */
+       }
+       if (rci->tpe == cand_mask) {
+               mrci = rci;
+               canditer_init(&xrci, r, NULL);
+               rci = &xrci;
+       } else {
+               mrci = NULL;
+               xrci = (struct canditer) {.tpe = cand_dense}; /* not used */
+       }
+
        if (l->tsorted || l->trevsorted) {
                equal_order = (l->tsorted && r->tsorted) ||
                        (l->trevsorted && r->trevsorted &&
@@ -1868,12 +1886,18 @@ mergejoin(BAT **r1p, BAT **r2p, BAT *l, 
                        if (equal_order) {
                                if (rci->next == rci->ncand)
                                        v = NULL; /* no more values */
-                               else
+                               else if (mrci) {
+                                       oid rv = canditer_mask_next(mrci, 
canditer_peek(rci), true);
+                                       v = rv == oid_nil ? NULL : VALUE(r, rv 
- r->hseqbase);
+                               } else
                                        v = VALUE(r, canditer_peek(rci) - 
r->hseqbase);
                        } else {
                                if (rci->next == 0)
                                        v = NULL; /* no more values */
-                               else
+                               else if (mrci) {
+                                       oid rv = canditer_mask_next(mrci, 
canditer_peekprev(rci), false);
+                                       v = rv == oid_nil ? NULL : VALUE(r, rv 
- r->hseqbase);
+                               } else
                                        v = VALUE(r, canditer_peekprev(rci) - 
r->hseqbase);
                        }
                        /* here, v points to next value in r, or if
@@ -1898,6 +1922,13 @@ mergejoin(BAT **r1p, BAT **r2p, BAT *l, 
                                                        nlx -= lci->next;
                                                }
                                        }
+                                       if (mlci) {
+                                               lv = canditer_mask_next(mlci, 
lci->seq + lci->next + nlx, true);
+                                               if (lv == oid_nil)
+                                                       nlx = lci->ncand - 
lci->next;
+                                               else
+                                                       nlx = lv - lci->seq - 
lci->next;
+                                       }
                                        if (lci->next + nlx == lci->ncand)
                                                v = NULL;
                                }
@@ -1906,13 +1937,12 @@ mergejoin(BAT **r1p, BAT **r2p, BAT *l, 
                                if (only_misses) {
                                        if (maybeextend(r1, r2, nlx, lci->next, 
lci->ncand, maxsize) != GDK_SUCCEED)
                                                goto bailout;
-                                       lskipped |= nlx > 1 && lci->tpe != 
cand_dense;
                                        while (nlx > 0) {
-                                               APPEND(r1, canditer_next(lci));
+                                               lv = canditer_next(lci);
+                                               if (mlci == NULL || 
canditer_contains(mlci, lv))
+                                                       APPEND(r1, lv);
                                                nlx--;
                                        }
-                                       if (lskipped)
-                                               r1->tseqbase = oid_nil;
                                        if (r1->trevsorted && BATcount(r1) > 1)
                                                r1->trevsorted = false;
                                } else if (nil_on_miss) {
@@ -1926,18 +1956,17 @@ mergejoin(BAT **r1p, BAT **r2p, BAT *l, 
                                        }
                                        if (maybeextend(r1, r2, nlx, lci->next, 
lci->ncand, maxsize) != GDK_SUCCEED)
                                                goto bailout;
-                                       lskipped |= nlx > 1 && lci->tpe != 
cand_dense;
                                        while (nlx > 0) {
-                                               APPEND(r1, canditer_next(lci));
-                                               APPEND(r2, oid_nil);
+                                               lv = canditer_next(lci);
+                                               if (mlci == NULL || 
canditer_contains(mlci, lv)) {
+                                                       APPEND(r1, lv);
+                                                       APPEND(r2, oid_nil);
+                                               }
                                                nlx--;
                                        }
-                                       if (lskipped)
-                                               r1->tseqbase = oid_nil;
                                        if (r1->trevsorted && BATcount(r1) > 1)
                                                r1->trevsorted = false;
                                } else {
-                                       lskipped = BATcount(r1) > 0;
                                        canditer_setidx(lci, lci->next + nlx);
                                }
                        }
@@ -1960,7 +1989,14 @@ mergejoin(BAT **r1p, BAT **r2p, BAT *l, 
                 * l is actually sorted (lscan > 0). */
                nl = 1;         /* we'll match (at least) one in l */
                nr = 0;         /* maybe we won't match anything in r */
-               v = VALUE(l, canditer_peek(lci) - l->hseqbase);
+               lv = canditer_peek(lci);
+               if (mlci) {
+                       lv = canditer_mask_next(mlci, lv, true);
+                       if (lv == oid_nil)
+                               break;
+                       canditer_setidx(lci, canditer_search(lci, lv, true));
+               }
+               v = VALUE(l, lv - l->hseqbase);
                if (l->tkey) {
                        /* if l is key, there is a single value */
                } else if (lscan > 0 &&
@@ -2166,7 +2202,6 @@ mergejoin(BAT **r1p, BAT **r2p, BAT *l, 
                                         * in r */
                                        break;
                                }
-                               lskipped = BATcount(r1) > 0;
                                canditer_setidx(lci, lci->next + nl);
                                continue;
                        }
@@ -2186,7 +2221,6 @@ mergejoin(BAT **r1p, BAT **r2p, BAT *l, 
                        goto bailout;
                } else if (only_misses) {
                        /* we had a match, so we're not interested */
-                       lskipped = BATcount(r1) > 0;
                        canditer_setidx(lci, lci->next + nl);
                        continue;
                } else {
@@ -2197,11 +2231,6 @@ mergejoin(BAT **r1p, BAT **r2p, BAT *l, 
                                nr = 1;
                        }
                }
-               if (canditer_idx(lci, lci->next + nl - 1) - canditer_idx(lci, 
lci->next) != nl - 1) {
-                       /* not all values in the range are
-                        * candidates */
-                       lskipped = true;
-               }
                /* make space: nl values in l match nr values in r, so
                 * we need to add nl * nr values in the results */
                if (maybeextend(r1, r2, nl * nr, lci->next, lci->ncand, 
maxsize) != GDK_SUCCEED)
@@ -2227,7 +2256,6 @@ mergejoin(BAT **r1p, BAT **r2p, BAT *l, 
                         * in l will be repeated multiple times: hence
                         * r1 is not key and not dense */
                        r1->tkey = false;
-                       r1->tseqbase = oid_nil;
                        if (r2) {
                                /* multiple different values will be
                                 * inserted in r2 (in order), so not
@@ -2293,20 +2321,19 @@ mergejoin(BAT **r1p, BAT **r2p, BAT *l, 
                                        r2->tseqbase = oid_nil;
                                }
                        }
-                       /* if there is a left candidate list, it may
-                        * be that the next value added isn't
-                        * consecutive with the last one */
-                       if (lskipped ||
-                           ((oid *) r1->theap->base)[r1->batCount - 1] + 1 != 
canditer_peek(lci))
-                               r1->tseqbase = oid_nil;
                }
 
                /* insert values: first the left output */
+               BUN nladded = 0;
                for (i = 0; i < nl; i++) {
                        lv = canditer_next(lci);
-                       for (j = 0; j < nr; j++)
-                               APPEND(r1, lv);
+                       if (mlci == NULL || canditer_contains(mlci, lv)) {
+                               nladded++;
+                               for (j = 0; j < nr; j++)
+                                       APPEND(r1, lv);
+                       }
                }
+               nl = nladded;
                /* then the right output, various different ways of
                 * doing it */
                if (r2 == NULL) {
@@ -2345,20 +2372,15 @@ mergejoin(BAT **r1p, BAT **r2p, BAT *l, 
        /* also set other bits of heap to correct value to indicate size */
        BATsetcount(r1, BATcount(r1));
        r1->tseqbase = oid_nil;
+       if (r1->tkey)
+               r1 = virtualize(r1);
        if (r2) {
                BATsetcount(r2, BATcount(r2));
                assert(BATcount(r1) == BATcount(r2));
                r2->tseqbase = oid_nil;
-       }
-       if (BATcount(r1) > 0) {
-               if (BATtdense(r1))
-                       r1->tseqbase = ((oid *) r1->theap->base)[0];
-               if (r2 && BATtdense(r2))
-                       r2->tseqbase = ((oid *) r2->theap->base)[0];
-       } else {
-               r1->tseqbase = 0;
-               if (r2) {
-                       r2->tseqbase = 0;
+               if (BATcount(r2) <= 1) {
+                       r2->tkey = true;
+                       r2 = virtualize(r2);
                }
_______________________________________________
checkin-list mailing list
[email protected]
https://www.monetdb.org/mailman/listinfo/checkin-list

Reply via email to