Changeset: 7177c7edef55 for MonetDB
URL: https://dev.monetdb.org/hg/MonetDB?cmd=changeset;node=7177c7edef55
Modified Files:
gdk/gdk_join.c
Branch: candidate-exceptions
Log Message:
Join on candidate lists.
diffs (truncated from 647 to 300 lines):
diff --git a/gdk/gdk_join.c b/gdk/gdk_join.c
--- a/gdk/gdk_join.c
+++ b/gdk/gdk_join.c
@@ -207,7 +207,8 @@ joininitresults(BAT **r1p, BAT **r2p, BU
#define VALUE(s, x) (s##vars ? \
s##vars + VarHeapVal(s##vals, (x), s##width) : \
- (const char *) s##vals + ((x) * s##width))
+ s##vals ? (const char *) s##vals + ((x) * s##width) : \
+ (s##val = BUNtoid(s, (x)), (const char *) &s##val))
#define FVALUE(s, x) ((const char *) s##vals + ((x) * s##width))
#define APPEND(b, o) (((oid *) b->theap.base)[b->batCount++] = (o))
@@ -471,7 +472,13 @@ mergejoin_void(BAT **r1p, BAT **r2p, BAT
*r1p = r1;
if (r2p == NULL)
goto doreturn2;
- if (BATtdense(r1) && BATtdense(l)) {
+ if (BATcount(r1) == 0) {
+ r2 = BATdense(0, 0, 0);
+ if (r2 == NULL) {
+ BBPreclaim(r1);
+ return GDK_FAIL;
+ }
+ } else if (BATtdense(r1) && BATtdense(l)) {
r2 = BATdense(0, l->tseqbase + r1->tseqbase -
l->hseqbase + r->hseqbase - r->tseqbase, BATcount(r1));
if (r2 == NULL) {
BBPreclaim(r1);
@@ -487,7 +494,11 @@ mergejoin_void(BAT **r1p, BAT **r2p, BAT
const oid *o1p = (const oid *) Tloc(r1, 0);
oid *o2p = (oid *) Tloc(r2, 0);
hi = BATcount(r1);
- if (BATtdense(r1)) {
+ if (l->ttype == TYPE_void && l->tvheap != NULL) {
+ /* this is actually generic code */
+ for (o = 0; o < hi; o++)
+ o2p[o] = BUNtoid(l, BUNtoid(r1, o) -
l->hseqbase) - r->tseqbase + r->hseqbase;
+ } else if (BATtdense(r1)) {
lo = r1->tseqbase - l->hseqbase;
if (r->tseqbase == r->hseqbase) {
memcpy(o2p, lp + lo, hi * SIZEOF_OID);
@@ -671,17 +682,33 @@ mergejoin_void(BAT **r1p, BAT **r2p, BAT
o2p = (oid *) Tloc(r2, 0);
r2->tnil = false;
r2->tnonil = true;
- for (i = 0; i < lci->ncand; i++) {
- oid c = canditer_next(lci);
+ if (l->ttype == TYPE_void && l->tvheap != NULL) {
+ for (i = 0; i < lci->ncand; i++) {
+ oid c = canditer_next(lci);
- o = lvals[c - l->hseqbase];
- *o1p++ = c;
- if (o >= lo && o < hi) {
- *o2p++ = o - r->tseqbase + r->hseqbase;
- } else {
- *o2p++ = oid_nil;
- r2->tnil = true;
- r2->tnonil = false;
+ o = BUNtoid(l, c - l->hseqbase);
+ *o1p++ = c;
+ if (o >= lo && o < hi) {
+ *o2p++ = o - r->tseqbase + r->hseqbase;
+ } else {
+ *o2p++ = oid_nil;
+ r2->tnil = true;
+ r2->tnonil = false;
+ }
+ }
+ } else {
+ for (i = 0; i < lci->ncand; i++) {
+ oid c = canditer_next(lci);
+
+ o = lvals[c - l->hseqbase];
+ *o1p++ = c;
+ if (o >= lo && o < hi) {
+ *o2p++ = o - r->tseqbase + r->hseqbase;
+ } else {
+ *o2p++ = oid_nil;
+ r2->tnil = true;
+ r2->tnonil = false;
+ }
}
}
r1->tsorted = true;
@@ -1266,6 +1293,263 @@ mergejoin_lng(BAT **r1p, BAT **r2p, BAT
return GDK_FAIL;
}
+/* Implementation of mergejoin (see below) for the special case that
+ * the values are of type oid, and the right-hand side is a candidate
+ * list with exception, and some more conditions are met. */
+static gdk_return
+mergejoin_cand(BAT **r1p, BAT **r2p, BAT *l, BAT *r,
+ bool nil_matches, BUN estimate, lng t0, bool swapped)
+{
+ BAT *r1, *r2;
+ BUN lstart, lend, lcnt;
+ struct canditer lci, rci;
+ BUN lscan; /* opportunistic scan window */
+ BUN maxsize;
+ const oid *lvals;
+ oid v;
+ BUN nl, nr;
+ oid lv;
+ BUN i;
+
+ ALGODEBUG fprintf(stderr, "#%s(l=" ALGOBATFMT ","
+ "r=" ALGOBATFMT ",nil_matches=%d)%s\n",
+ __func__, ALGOBATPAR(l), ALGOBATPAR(r), nil_matches,
+ swapped ? " swapped" : "");
+
+ assert(ATOMtype(l->ttype) == ATOMtype(r->ttype));
+
+ lstart = 0;
+ lend = BATcount(l);
+ lcnt = lend - lstart;
+ if (l->ttype == TYPE_void) {
+ assert(!is_oid_nil(l->tseqbase));
+ lcnt = canditer_init(&lci, NULL, l);
+ lvals = NULL;
+ } else {
+ lci = (struct canditer) {.tpe = cand_dense}; /* not used */
+ lvals = (const oid *) Tloc(l, 0);
+ assert(lvals != NULL);
+ }
+
+ assert(r->ttype == TYPE_void && r->tvheap != NULL);
+ canditer_init(&rci, NULL, r);
+
+ /* basic properties will be adjusted if necessary later on,
+ * they were initially set by joininitresults() */
+
+ if (lend == 0 || rci.ncand == 0) {
+ /* there are no matches */
+ return nomatch(r1p, r2p, l, r,
+ &(struct canditer) {.tpe = cand_dense, .ncand =
lcnt,},
+ false, false, __func__, t0);
+ }
+
+ if ((maxsize = joininitresults(r1p, r2p, BATcount(l), BATcount(r),
+ l->tkey, r->tkey, false, false,
+ false, estimate)) == BUN_NONE)
+ return GDK_FAIL;
+ r1 = *r1p;
+ r2 = r2p ? *r2p : NULL;
+
+ /* determine opportunistic scan window for l and r */
+ for (nl = lend - lstart, lscan = 4; nl > 0; lscan++)
+ nl >>= 1;
+
+ if (!nil_matches) {
+ /* skip over nils at the start of the columns */
+ if (lscan < lend - lstart && lvals && is_oid_nil(lvals[lstart +
lscan])) {
+ lstart = binsearch_oid(NULL, 0, lvals, lstart + lscan,
+ lend - 1, oid_nil, 1, 1);
+ } else if (lvals) {
+ while (is_oid_nil(lvals[lstart]))
+ lstart++;
+ } /* else l is candidate list: no nils */
+ }
+ /* from here on we don't have to worry about nil values */
+
+ while (lstart < lend && rci.next < rci.ncand) {
+ v = canditer_peek(&rci);
+
+ if (lvals) {
+ if (lscan < lend - lstart &&
+ lvals[lstart + lscan] < v) {
+ lstart = binsearch_oid(NULL, 0, lvals,
+ lstart + lscan,
+ lend - 1, v, 1, 0);
+ } else {
+ /* scan l for v */
+ while (lstart < lend && lvals[lstart] < v)
+ lstart++;
+ }
+ } else {
+ lstart = canditer_search(&lci, v, true);
+ canditer_setidx(&lci, lstart);
+ }
+ if (lstart >= lend) {
+ /* nothing found */
+ break;
+ }
+
+ /* Here we determine the next value in l that we are
+ * going to try to match in r. We will also count the
+ * number of occurrences in l of that value.
+ * Afterwards, v points to the value and nl is the
+ * number of times it occurs. Also, lstart will
+ * point to the next value to be considered (ready for
+ * the next iteration).
+ * If there are many equal values in l (more than
+ * lscan), we will use binary search to find the end
+ * of the sequence. Obviously, we can do this only if
+ * l is actually sorted (lscan > 0). */
+ nl = 1; /* we'll match (at least) one in l */
+ nr = 0; /* maybe we won't match anything in r */
+ v = lvals ? lvals[lstart] : canditer_next(&lci);
+ if (l->tkey || lvals == NULL) {
+ /* if l is key, there is a single value */
+ lstart++;
+ } else if (lscan < lend - lstart &&
+ v == lvals[lstart + lscan]) {
+ /* lots of equal values: use binary search to
+ * find end */
+ nl = binsearch_oid(NULL, 0, lvals, lstart + lscan,
+ lend - 1, v, 1, 1);
+ nl -= lstart;
+ lstart += nl;
+ } else {
+ /* just scan */
+ while (++lstart < lend && v == lvals[lstart])
+ nl++;
+ }
+ /* lstart points one beyond the value we're
+ * going to match: ready for the next iteration. */
+
+ /* First we find the first value in r that is at
+ * least as large as v, then we find the first
+ * value in r that is larger than v. The difference
+ * is the number of values equal to v and is stored in
+ * nr.
+ * We will use binary search on r to find both ends of
+ * the sequence of values that are equal to v in case
+ * the position is "too far" (more than rscan
+ * away). */
+
+ /* first find the location of the first value in r
+ * that is >= v, then find the location of the first
+ * value in r that is > v; the difference is the
+ * number of values equal to v */
+ nr = canditer_search(&rci, v, true);
+ canditer_setidx(&rci, nr);
+ if (nr == rci.ncand) {
+ /* nothing found */
+ break;
+ }
+
+ /* now find the end of the sequence of equal values v */
+
+ /* if r is key, there is zero or one match, otherwise
+ * look ahead a little (rscan) in r to see whether
+ * we're better off doing a binary search */
+ if (canditer_peek(&rci) == v) {
+ nr = 1;
+ canditer_next(&rci);
+ } else {
+ /* rci points to first value > v or end of
+ * r, and nr is the number of values in r that
+ * are equal to v */
+ /* no entries in r found */
+ continue;
+ }
+ /* make space: nl values in l match nr values in r, so
+ * we need to add nl * nr values in the results */
+ MAYBEEXTEND_NO_CAND(nl * nr);
+
+ /* maintain properties */
+ if (nl > 1) {
+ /* value occurs multiple times in l, so entry
+ * in r will be repeated multiple times: hence
+ * r2 is not key and not dense */
+ r2->tkey = false;
+ r2->tseqbase = oid_nil;
+ /* multiple different values will be inserted
+ * in r1 (always in order), so not reverse
+ * ordered anymore */
+ r1->trevsorted = false;
+ }
+ if (nr > 1) {
+ /* value occurs multiple times in r, so entry
+ * in l will be repeated multiple times: hence
+ * r1 is not key and not dense */
+ r1->tkey = false;
+ r1->tseqbase = oid_nil;
+ /* multiple different values will be inserted
+ * in r2 (in order), so not reverse ordered
+ * anymore */
+ r2->trevsorted = false;
+ if (nl > 1) {
+ /* multiple values in l match multiple
+ * values in r, so an ordered sequence
+ * will be inserted multiple times in
+ * r2, so r2 is not ordered anymore */
+ r2->tsorted = false;
+ }
+ }
+ if (BATcount(r1) > 0) {
+ /* a new, higher value will be inserted into
+ * r1, so r1 is not reverse ordered anymore */
+ r1->trevsorted = false;
+ /* a new higher value will be added to r2 */
+ r2->trevsorted = false;
+ if (BATtdense(r1) &&
+ ((oid *) r1->theap.base)[r1->batCount - 1] + 1 !=
l->hseqbase + lstart - nl) {
+ r1->tseqbase = oid_nil;
+ }
+ }
_______________________________________________
checkin-list mailing list
[email protected]
https://www.monetdb.org/mailman/listinfo/checkin-list