Changeset: 7177c7edef55 for MonetDB
URL: https://dev.monetdb.org/hg/MonetDB?cmd=changeset;node=7177c7edef55
Modified Files:
        gdk/gdk_join.c
Branch: candidate-exceptions
Log Message:

Join on candidate lists.


diffs (truncated from 647 to 300 lines):

diff --git a/gdk/gdk_join.c b/gdk/gdk_join.c
--- a/gdk/gdk_join.c
+++ b/gdk/gdk_join.c
@@ -207,7 +207,8 @@ joininitresults(BAT **r1p, BAT **r2p, BU
 
 #define VALUE(s, x)    (s##vars ?                                      \
                         s##vars + VarHeapVal(s##vals, (x), s##width) : \
-                        (const char *) s##vals + ((x) * s##width))
+                        s##vals ? (const char *) s##vals + ((x) * s##width) : \
+                        (s##val = BUNtoid(s, (x)), (const char *) &s##val))
 #define FVALUE(s, x)   ((const char *) s##vals + ((x) * s##width))
 
 #define APPEND(b, o)           (((oid *) b->theap.base)[b->batCount++] = (o))
@@ -471,7 +472,13 @@ mergejoin_void(BAT **r1p, BAT **r2p, BAT
                *r1p = r1;
                if (r2p == NULL)
                        goto doreturn2;
-               if (BATtdense(r1) && BATtdense(l)) {
+               if (BATcount(r1) == 0) {
+                       r2 = BATdense(0, 0, 0);
+                       if (r2 == NULL) {
+                               BBPreclaim(r1);
+                               return GDK_FAIL;
+                       }
+               } else if (BATtdense(r1) && BATtdense(l)) {
                        r2 = BATdense(0, l->tseqbase + r1->tseqbase - 
l->hseqbase + r->hseqbase - r->tseqbase, BATcount(r1));
                        if (r2 == NULL) {
                                BBPreclaim(r1);
@@ -487,7 +494,11 @@ mergejoin_void(BAT **r1p, BAT **r2p, BAT
                        const oid *o1p = (const oid *) Tloc(r1, 0);
                        oid *o2p = (oid *) Tloc(r2, 0);
                        hi = BATcount(r1);
-                       if (BATtdense(r1)) {
+                       if (l->ttype == TYPE_void && l->tvheap != NULL) {
+                               /* this is actually generic code */
+                               for (o = 0; o < hi; o++)
+                                       o2p[o] = BUNtoid(l, BUNtoid(r1, o) - 
l->hseqbase) - r->tseqbase + r->hseqbase;
+                       } else if (BATtdense(r1)) {
                                lo = r1->tseqbase - l->hseqbase;
                                if (r->tseqbase == r->hseqbase) {
                                        memcpy(o2p, lp + lo, hi * SIZEOF_OID);
@@ -671,17 +682,33 @@ mergejoin_void(BAT **r1p, BAT **r2p, BAT
        o2p = (oid *) Tloc(r2, 0);
        r2->tnil = false;
        r2->tnonil = true;
-       for (i = 0; i < lci->ncand; i++) {
-               oid c = canditer_next(lci);
+       if (l->ttype == TYPE_void && l->tvheap != NULL) {
+               for (i = 0; i < lci->ncand; i++) {
+                       oid c = canditer_next(lci);
 
-               o = lvals[c - l->hseqbase];
-               *o1p++ = c;
-               if (o >= lo && o < hi) {
-                       *o2p++ = o - r->tseqbase + r->hseqbase;
-               } else {
-                       *o2p++ = oid_nil;
-                       r2->tnil = true;
-                       r2->tnonil = false;
+                       o = BUNtoid(l, c - l->hseqbase);
+                       *o1p++ = c;
+                       if (o >= lo && o < hi) {
+                               *o2p++ = o - r->tseqbase + r->hseqbase;
+                       } else {
+                               *o2p++ = oid_nil;
+                               r2->tnil = true;
+                               r2->tnonil = false;
+                       }
+               }
+       } else {
+               for (i = 0; i < lci->ncand; i++) {
+                       oid c = canditer_next(lci);
+
+                       o = lvals[c - l->hseqbase];
+                       *o1p++ = c;
+                       if (o >= lo && o < hi) {
+                               *o2p++ = o - r->tseqbase + r->hseqbase;
+                       } else {
+                               *o2p++ = oid_nil;
+                               r2->tnil = true;
+                               r2->tnonil = false;
+                       }
                }
        }
        r1->tsorted = true;
@@ -1266,6 +1293,263 @@ mergejoin_lng(BAT **r1p, BAT **r2p, BAT 
        return GDK_FAIL;
 }
 
+/* Implementation of mergejoin (see below) for the special case that
+ * the values are of type oid, and the right-hand side is a candidate
+ * list with exception, and some more conditions are met. */
+static gdk_return
+mergejoin_cand(BAT **r1p, BAT **r2p, BAT *l, BAT *r,
+              bool nil_matches, BUN estimate, lng t0, bool swapped)
+{
+       BAT *r1, *r2;
+       BUN lstart, lend, lcnt;
+       struct canditer lci, rci;
+       BUN lscan;              /* opportunistic scan window */
+       BUN maxsize;
+       const oid *lvals;
+       oid v;
+       BUN nl, nr;
+       oid lv;
+       BUN i;
+
+       ALGODEBUG fprintf(stderr, "#%s(l=" ALGOBATFMT ","
+                         "r=" ALGOBATFMT ",nil_matches=%d)%s\n",
+                         __func__, ALGOBATPAR(l), ALGOBATPAR(r), nil_matches,
+                         swapped ? " swapped" : "");
+
+       assert(ATOMtype(l->ttype) == ATOMtype(r->ttype));
+
+       lstart = 0;
+       lend = BATcount(l);
+       lcnt = lend - lstart;
+       if (l->ttype == TYPE_void) {
+               assert(!is_oid_nil(l->tseqbase));
+               lcnt = canditer_init(&lci, NULL, l);
+               lvals = NULL;
+       } else {
+               lci = (struct canditer) {.tpe = cand_dense}; /* not used */
+               lvals = (const oid *) Tloc(l, 0);
+               assert(lvals != NULL);
+       }
+
+       assert(r->ttype == TYPE_void && r->tvheap != NULL);
+       canditer_init(&rci, NULL, r);
+
+       /* basic properties will be adjusted if necessary later on,
+        * they were initially set by joininitresults() */
+
+       if (lend == 0 || rci.ncand == 0) {
+               /* there are no matches */
+               return nomatch(r1p, r2p, l, r,
+                              &(struct canditer) {.tpe = cand_dense, .ncand = 
lcnt,},
+                              false, false, __func__, t0);
+       }
+
+       if ((maxsize = joininitresults(r1p, r2p, BATcount(l), BATcount(r),
+                                      l->tkey, r->tkey, false, false,
+                                      false, estimate)) == BUN_NONE)
+               return GDK_FAIL;
+       r1 = *r1p;
+       r2 = r2p ? *r2p : NULL;
+
+       /* determine opportunistic scan window for l and r */
+       for (nl = lend - lstart, lscan = 4; nl > 0; lscan++)
+               nl >>= 1;
+
+       if (!nil_matches) {
+               /* skip over nils at the start of the columns */
+               if (lscan < lend - lstart && lvals && is_oid_nil(lvals[lstart + 
lscan])) {
+                       lstart = binsearch_oid(NULL, 0, lvals, lstart + lscan,
+                                              lend - 1, oid_nil, 1, 1);
+               } else if (lvals) {
+                       while (is_oid_nil(lvals[lstart]))
+                               lstart++;
+               } /* else l is candidate list: no nils */
+       }
+       /* from here on we don't have to worry about nil values */
+
+       while (lstart < lend && rci.next < rci.ncand) {
+               v = canditer_peek(&rci);
+
+               if (lvals) {
+                       if (lscan < lend - lstart &&
+                           lvals[lstart + lscan] < v) {
+                               lstart = binsearch_oid(NULL, 0, lvals,
+                                                      lstart + lscan,
+                                                      lend - 1, v, 1, 0);
+                       } else {
+                               /* scan l for v */
+                               while (lstart < lend && lvals[lstart] < v)
+                                       lstart++;
+                       }
+               } else {
+                       lstart = canditer_search(&lci, v, true);
+                       canditer_setidx(&lci, lstart);
+               }
+               if (lstart >= lend) {
+                       /* nothing found */
+                       break;
+               }
+
+               /* Here we determine the next value in l that we are
+                * going to try to match in r.  We will also count the
+                * number of occurrences in l of that value.
+                * Afterwards, v points to the value and nl is the
+                * number of times it occurs.  Also, lstart will
+                * point to the next value to be considered (ready for
+                * the next iteration).
+                * If there are many equal values in l (more than
+                * lscan), we will use binary search to find the end
+                * of the sequence.  Obviously, we can do this only if
+                * l is actually sorted (lscan > 0). */
+               nl = 1;         /* we'll match (at least) one in l */
+               nr = 0;         /* maybe we won't match anything in r */
+               v = lvals ? lvals[lstart] : canditer_next(&lci);
+               if (l->tkey || lvals == NULL) {
+                       /* if l is key, there is a single value */
+                       lstart++;
+               } else if (lscan < lend - lstart &&
+                          v == lvals[lstart + lscan]) {
+                       /* lots of equal values: use binary search to
+                        * find end */
+                       nl = binsearch_oid(NULL, 0, lvals, lstart + lscan,
+                                          lend - 1, v, 1, 1);
+                       nl -= lstart;
+                       lstart += nl;
+               } else {
+                       /* just scan */
+                       while (++lstart < lend && v == lvals[lstart])
+                               nl++;
+               }
+               /* lstart points one beyond the value we're
+                * going to match: ready for the next iteration. */
+
+               /* First we find the first value in r that is at
+                * least as large as v, then we find the first
+                * value in r that is larger than v.  The difference
+                * is the number of values equal to v and is stored in
+                * nr.
+                * We will use binary search on r to find both ends of
+                * the sequence of values that are equal to v in case
+                * the position is "too far" (more than rscan
+                * away). */
+
+               /* first find the location of the first value in r
+                * that is >= v, then find the location of the first
+                * value in r that is > v; the difference is the
+                * number of values equal to v */
+               nr = canditer_search(&rci, v, true);
+               canditer_setidx(&rci, nr);
+               if (nr == rci.ncand) {
+                       /* nothing found */
+                       break;
+               }
+
+               /* now find the end of the sequence of equal values v */
+
+               /* if r is key, there is zero or one match, otherwise
+                * look ahead a little (rscan) in r to see whether
+                * we're better off doing a binary search */
+               if (canditer_peek(&rci) == v) {
+                       nr = 1;
+                       canditer_next(&rci);
+               } else {
+                       /* rci points to first value > v or end of
+                        * r, and nr is the number of values in r that
+                        * are equal to v */
+                       /* no entries in r found */
+                       continue;
+               }
+               /* make space: nl values in l match nr values in r, so
+                * we need to add nl * nr values in the results */
+               MAYBEEXTEND_NO_CAND(nl * nr);
+
+               /* maintain properties */
+               if (nl > 1) {
+                       /* value occurs multiple times in l, so entry
+                        * in r will be repeated multiple times: hence
+                        * r2 is not key and not dense */
+                       r2->tkey = false;
+                       r2->tseqbase = oid_nil;
+                       /* multiple different values will be inserted
+                        * in r1 (always in order), so not reverse
+                        * ordered anymore */
+                       r1->trevsorted = false;
+               }
+               if (nr > 1) {
+                       /* value occurs multiple times in r, so entry
+                        * in l will be repeated multiple times: hence
+                        * r1 is not key and not dense */
+                       r1->tkey = false;
+                       r1->tseqbase = oid_nil;
+                       /* multiple different values will be inserted
+                        * in r2 (in order), so not reverse ordered
+                        * anymore */
+                       r2->trevsorted = false;
+                       if (nl > 1) {
+                               /* multiple values in l match multiple
+                                * values in r, so an ordered sequence
+                                * will be inserted multiple times in
+                                * r2, so r2 is not ordered anymore */
+                               r2->tsorted = false;
+                       }
+               }
+               if (BATcount(r1) > 0) {
+                       /* a new, higher value will be inserted into
+                        * r1, so r1 is not reverse ordered anymore */
+                       r1->trevsorted = false;
+                       /* a new higher value will be added to r2 */
+                       r2->trevsorted = false;
+                       if (BATtdense(r1) &&
+                           ((oid *) r1->theap.base)[r1->batCount - 1] + 1 != 
l->hseqbase + lstart - nl) {
+                               r1->tseqbase = oid_nil;
+                       }
+               }
_______________________________________________
checkin-list mailing list
[email protected]
https://www.monetdb.org/mailman/listinfo/checkin-list

Reply via email to