Changeset: 3124405e5d11 for MonetDB
URL: https://dev.monetdb.org/hg/MonetDB/rev/3124405e5d11
Modified Files:
clients/Tests/exports.stable.out
gdk/gdk_cand.c
gdk/gdk_cand.h
gdk/gdk_join.c
Branch: default
Log Message:
In mergejoin, avoid searching the candidate list for the nth set bit in a mask.
diffs (truncated from 302 to 300 lines):
diff --git a/clients/Tests/exports.stable.out b/clients/Tests/exports.stable.out
--- a/clients/Tests/exports.stable.out
+++ b/clients/Tests/exports.stable.out
@@ -449,7 +449,7 @@ ssize_t bteToStr(str *dst, size_t *len,
const bte bte_nil;
oid canditer_idx(struct canditer *ci, BUN p);
BUN canditer_init(struct canditer *ci, BAT *b, BAT *s);
-oid canditer_last(struct canditer *ci);
+oid canditer_last(const struct canditer *ci);
oid canditer_peek(struct canditer *ci);
oid canditer_peekprev(struct canditer *ci);
oid canditer_prev(struct canditer *ci);
diff --git a/gdk/gdk_cand.c b/gdk/gdk_cand.c
--- a/gdk/gdk_cand.c
+++ b/gdk/gdk_cand.c
@@ -806,6 +806,45 @@ canditer_peekprev(struct canditer *ci)
return o;
}
+/* if o is a candidate, return it, else return the next candidate from
+ * the cand_mask iterator ci after (if next is set) or before (if next
+ * is unset) o; if there are no more candidates, return oid_nil */
+oid
+canditer_mask_next(struct canditer *ci, oid o, bool next)
+{
+ if (o < ci->mskoff)
+ return next ? ci->mskoff + ci->firstbit : oid_nil;
+ o -= ci->mskoff;
+ BUN p = o / 32;
+ o %= 32;
+ if (p >= ci->nvals || (p == ci->nvals - 1 && o >= ci->lastbit))
+ return next ? oid_nil : canditer_last(ci);
+ if (next) {
+ while ((ci->mask[p] & (1U << o)) == 0) {
+ if (++o == 32) {
+ o = 0;
+ if (++p == ci->nvals)
+ return oid_nil;
+ }
+ }
+ if (p == ci->nvals - 1 && o >= ci->lastbit)
+ return oid_nil;
+ } else {
+ while ((ci->mask[p] & (1U << o)) == 0) {
+ if (o == 0) {
+ o = 31;
+ if (p == 0)
+ return oid_nil;
+ } else {
+ o--;
+ }
+ }
+ if (p == 0 && o < ci->firstbit)
+ return oid_nil;
+ }
+ return ci->mskoff + 32 * p + o;
+}
+
/* return the last candidate */
oid
canditer_last(struct canditer *ci)
diff --git a/gdk/gdk_cand.h b/gdk/gdk_cand.h
--- a/gdk/gdk_cand.h
+++ b/gdk/gdk_cand.h
@@ -197,6 +197,7 @@ canditer_contains(struct canditer *ci, o
}
return canditer_search(ci, o, false) != BUN_NONE;
}
+gdk_export oid canditer_mask_next(struct canditer *ci, oid o, bool next);
gdk_export BAT *canditer_slice(struct canditer *ci, BUN lo, BUN hi);
gdk_export BAT *canditer_sliceval(struct canditer *ci, oid lo, oid hi);
gdk_export BAT *canditer_slice2(struct canditer *ci, BUN lo1, BUN hi1, BUN
lo2, BUN hi2);
diff --git a/gdk/gdk_join.c b/gdk/gdk_join.c
--- a/gdk/gdk_join.c
+++ b/gdk/gdk_join.c
@@ -1684,9 +1684,10 @@ mergejoin(BAT **r1p, BAT **r2p, BAT *l,
int lordering, rordering;
oid lv;
BUN i, j; /* counters */
- bool lskipped = false; /* whether we skipped values in l */
oid lval = oid_nil, rval = oid_nil; /* temporary space to point v to */
struct canditer llci, rrci;
+ struct canditer *mlci, xlci;
+ struct canditer *mrci, xrci;
if (lci->tpe == cand_dense && lci->ncand == BATcount(l) &&
rci->tpe == cand_dense && rci->ncand == BATcount(r) &&
@@ -1718,7 +1719,7 @@ mergejoin(BAT **r1p, BAT **r2p, BAT *l,
canditer_init(&llci, NULL, l);
lvals = NULL;
} else {
- lvals = Tloc(l, 0);
+ lvals = Tloc(l, 0); /* non NULL */
llci = (struct canditer) {.tpe = cand_dense}; /* not used */
}
rrci = (struct canditer) {.tpe = cand_dense};
@@ -1772,6 +1773,23 @@ mergejoin(BAT **r1p, BAT **r2p, BAT *l,
BAT *r1 = *r1p;
BAT *r2 = r2p ? *r2p : NULL;
+ if (lci->tpe == cand_mask) {
+ mlci = lci;
+ canditer_init(&xlci, l, NULL);
+ lci = &xlci;
+ } else {
+ mlci = NULL;
+ xlci = (struct canditer) {.tpe = cand_dense}; /* not used */
+ }
+ if (rci->tpe == cand_mask) {
+ mrci = rci;
+ canditer_init(&xrci, r, NULL);
+ rci = &xrci;
+ } else {
+ mrci = NULL;
+ xrci = (struct canditer) {.tpe = cand_dense}; /* not used */
+ }
+
if (l->tsorted || l->trevsorted) {
equal_order = (l->tsorted && r->tsorted) ||
(l->trevsorted && r->trevsorted &&
@@ -1868,12 +1886,18 @@ mergejoin(BAT **r1p, BAT **r2p, BAT *l,
if (equal_order) {
if (rci->next == rci->ncand)
v = NULL; /* no more values */
- else
+ else if (mrci) {
+ oid rv = canditer_mask_next(mrci,
canditer_peek(rci), true);
+ v = rv == oid_nil ? NULL : VALUE(r, rv
- r->hseqbase);
+ } else
v = VALUE(r, canditer_peek(rci) -
r->hseqbase);
} else {
if (rci->next == 0)
v = NULL; /* no more values */
- else
+ else if (mrci) {
+ oid rv = canditer_mask_next(mrci,
canditer_peekprev(rci), false);
+ v = rv == oid_nil ? NULL : VALUE(r, rv
- r->hseqbase);
+ } else
v = VALUE(r, canditer_peekprev(rci) -
r->hseqbase);
}
/* here, v points to next value in r, or if
@@ -1898,6 +1922,13 @@ mergejoin(BAT **r1p, BAT **r2p, BAT *l,
nlx -= lci->next;
}
}
+ if (mlci) {
+ lv = canditer_mask_next(mlci,
lci->seq + lci->next + nlx, true);
+ if (lv == oid_nil)
+ nlx = lci->ncand -
lci->next;
+ else
+ nlx = lv - lci->seq -
lci->next;
+ }
if (lci->next + nlx == lci->ncand)
v = NULL;
}
@@ -1906,13 +1937,12 @@ mergejoin(BAT **r1p, BAT **r2p, BAT *l,
if (only_misses) {
if (maybeextend(r1, r2, nlx, lci->next,
lci->ncand, maxsize) != GDK_SUCCEED)
goto bailout;
- lskipped |= nlx > 1 && lci->tpe !=
cand_dense;
while (nlx > 0) {
- APPEND(r1, canditer_next(lci));
+ lv = canditer_next(lci);
+ if (mlci == NULL ||
canditer_contains(mlci, lv))
+ APPEND(r1, lv);
nlx--;
}
- if (lskipped)
- r1->tseqbase = oid_nil;
if (r1->trevsorted && BATcount(r1) > 1)
r1->trevsorted = false;
} else if (nil_on_miss) {
@@ -1926,18 +1956,17 @@ mergejoin(BAT **r1p, BAT **r2p, BAT *l,
}
if (maybeextend(r1, r2, nlx, lci->next,
lci->ncand, maxsize) != GDK_SUCCEED)
goto bailout;
- lskipped |= nlx > 1 && lci->tpe !=
cand_dense;
while (nlx > 0) {
- APPEND(r1, canditer_next(lci));
- APPEND(r2, oid_nil);
+ lv = canditer_next(lci);
+ if (mlci == NULL ||
canditer_contains(mlci, lv)) {
+ APPEND(r1, lv);
+ APPEND(r2, oid_nil);
+ }
nlx--;
}
- if (lskipped)
- r1->tseqbase = oid_nil;
if (r1->trevsorted && BATcount(r1) > 1)
r1->trevsorted = false;
} else {
- lskipped = BATcount(r1) > 0;
canditer_setidx(lci, lci->next + nlx);
}
}
@@ -1960,7 +1989,14 @@ mergejoin(BAT **r1p, BAT **r2p, BAT *l,
* l is actually sorted (lscan > 0). */
nl = 1; /* we'll match (at least) one in l */
nr = 0; /* maybe we won't match anything in r */
- v = VALUE(l, canditer_peek(lci) - l->hseqbase);
+ lv = canditer_peek(lci);
+ if (mlci) {
+ lv = canditer_mask_next(mlci, lv, true);
+ if (lv == oid_nil)
+ break;
+ canditer_setidx(lci, canditer_search(lci, lv, true));
+ }
+ v = VALUE(l, lv - l->hseqbase);
if (l->tkey) {
/* if l is key, there is a single value */
} else if (lscan > 0 &&
@@ -2166,7 +2202,6 @@ mergejoin(BAT **r1p, BAT **r2p, BAT *l,
* in r */
break;
}
- lskipped = BATcount(r1) > 0;
canditer_setidx(lci, lci->next + nl);
continue;
}
@@ -2186,7 +2221,6 @@ mergejoin(BAT **r1p, BAT **r2p, BAT *l,
goto bailout;
} else if (only_misses) {
/* we had a match, so we're not interested */
- lskipped = BATcount(r1) > 0;
canditer_setidx(lci, lci->next + nl);
continue;
} else {
@@ -2197,11 +2231,6 @@ mergejoin(BAT **r1p, BAT **r2p, BAT *l,
nr = 1;
}
}
- if (canditer_idx(lci, lci->next + nl - 1) - canditer_idx(lci,
lci->next) != nl - 1) {
- /* not all values in the range are
- * candidates */
- lskipped = true;
- }
/* make space: nl values in l match nr values in r, so
* we need to add nl * nr values in the results */
if (maybeextend(r1, r2, nl * nr, lci->next, lci->ncand,
maxsize) != GDK_SUCCEED)
@@ -2227,7 +2256,6 @@ mergejoin(BAT **r1p, BAT **r2p, BAT *l,
* in l will be repeated multiple times: hence
* r1 is not key and not dense */
r1->tkey = false;
- r1->tseqbase = oid_nil;
if (r2) {
/* multiple different values will be
* inserted in r2 (in order), so not
@@ -2293,20 +2321,19 @@ mergejoin(BAT **r1p, BAT **r2p, BAT *l,
r2->tseqbase = oid_nil;
}
}
- /* if there is a left candidate list, it may
- * be that the next value added isn't
- * consecutive with the last one */
- if (lskipped ||
- ((oid *) r1->theap->base)[r1->batCount - 1] + 1 !=
canditer_peek(lci))
- r1->tseqbase = oid_nil;
}
/* insert values: first the left output */
+ BUN nladded = 0;
for (i = 0; i < nl; i++) {
lv = canditer_next(lci);
- for (j = 0; j < nr; j++)
- APPEND(r1, lv);
+ if (mlci == NULL || canditer_contains(mlci, lv)) {
+ nladded++;
+ for (j = 0; j < nr; j++)
+ APPEND(r1, lv);
+ }
}
+ nl = nladded;
/* then the right output, various different ways of
* doing it */
if (r2 == NULL) {
@@ -2345,20 +2372,15 @@ mergejoin(BAT **r1p, BAT **r2p, BAT *l,
/* also set other bits of heap to correct value to indicate size */
BATsetcount(r1, BATcount(r1));
r1->tseqbase = oid_nil;
+ if (r1->tkey)
+ r1 = virtualize(r1);
if (r2) {
BATsetcount(r2, BATcount(r2));
assert(BATcount(r1) == BATcount(r2));
r2->tseqbase = oid_nil;
- }
- if (BATcount(r1) > 0) {
- if (BATtdense(r1))
- r1->tseqbase = ((oid *) r1->theap->base)[0];
- if (r2 && BATtdense(r2))
- r2->tseqbase = ((oid *) r2->theap->base)[0];
- } else {
- r1->tseqbase = 0;
- if (r2) {
- r2->tseqbase = 0;
+ if (BATcount(r2) <= 1) {
+ r2->tkey = true;
+ r2 = virtualize(r2);
}
_______________________________________________
checkin-list mailing list
[email protected]
https://www.monetdb.org/mailman/listinfo/checkin-list