Changeset: 901541512f01 for MonetDB
URL: http://dev.monetdb.org/hg/MonetDB?cmd=changeset;node=901541512f01
Modified Files:
gdk/gdk_join.c
Branch: default
Log Message:
All sorts of fixes to the new (and still unused) join code.
diffs (truncated from 734 to 300 lines):
diff --git a/gdk/gdk_join.c b/gdk/gdk_join.c
--- a/gdk/gdk_join.c
+++ b/gdk/gdk_join.c
@@ -84,11 +84,13 @@ joininitresults(BAT **r1p, BAT **r2p, BU
r1->tkey = 1;
r1->tsorted = 1;
r1->trevsorted = 1;
+ r1->tdense = 1;
r2->T->nil = 0;
r2->T->nonil = 1;
r2->tkey = 1;
r2->tsorted = 1;
r2->trevsorted = 1;
+ r2->tdense = 1;
*r1p = r1;
*r2p = r2;
return GDK_SUCCEED;
@@ -102,38 +104,64 @@ joininitresults(BAT **r1p, BAT **r2p, BU
* (lo inclusive, hi not inclusive) in rvals/rvars.
* If last is set, return the index of the first value > v; if last is
* not set, return the index of the first value >= v.
- * If reverse is -1, the values are sorted in reverse order and hence
+ * If ordering is -1, the values are sorted in reverse order and hence
* all comparisons are reversed.
*/
static BUN
binsearch(const oid *rcand, oid offset,
const char *rvals, const char *rvars,
int rwidth, BUN lo, BUN hi, const char *v,
- int (*cmp)(const void *, const void *), int reverse, int last)
+ int (*cmp)(const void *, const void *), int ordering, int last)
{
BUN mid;
int c;
- assert(reverse == 1 || reverse == -1);
+ assert(ordering == 1 || ordering == -1);
assert(lo < hi);
--hi; /* now hi is inclusive */
- if ((c = reverse * cmp(VALUE(r, rcand ? rcand[lo] - offset : lo), v)) >
0 ||
+ if ((c = ordering * cmp(VALUE(r, rcand ? rcand[lo] - offset : lo), v))
> 0 ||
(!last && c == 0))
return lo;
- if ((c = reverse * cmp(VALUE(r, rcand ? rcand[hi] - offset : hi), v)) <
0 ||
+ if ((c = ordering * cmp(VALUE(r, rcand ? rcand[hi] - offset : hi), v))
< 0 ||
(last && c == 0))
return hi + 1;
+/* the two versions here are equivalent, the first is disabled because
+ * it does more work in the inner loop */
+#if 0
/* loop invariant:
* last ? value@lo <= v < value@hi : value@lo < v <= value@hi */
while (hi - lo > 1) {
mid = (hi + lo) / 2;
- if ((c = reverse * cmp(VALUE(r, rcand ? rcand[mid] - offset :
mid), v)) > 0 ||
+ if ((c = ordering * cmp(VALUE(r, rcand ? rcand[mid] - offset :
mid), v)) > 0 ||
(!last && c == 0))
hi = mid;
else
lo = mid;
}
+#else
+ if (last) {
+ /* loop invariant:
+ * value@lo <= v < value@hi */
+ while (hi - lo > 1) {
+ mid = (hi + lo) / 2;
+ if (ordering * cmp(VALUE(r, rcand ? rcand[mid] - offset
: mid), v) > 0)
+ hi = mid;
+ else
+ lo = mid;
+ }
+ } else {
+ /* loop invariant:
+ * value@lo < v <= value@hi */
+ while (hi - lo > 1) {
+ mid = (hi + lo) / 2;
+ if (ordering * cmp(VALUE(r, rcand ? rcand[mid] - offset
: mid), v) >= 0)
+ hi = mid;
+ else
+ lo = mid;
+ }
+ }
+#endif
return hi;
}
@@ -160,10 +188,18 @@ mergejoin(BAT *r1, BAT *r2, BAT *l, BAT
const char *v, *prev = NULL;
BUN nl, nr;
int insert_nil;
+ /* equal_order is set if we can scan both BATs in the same
+ * order, so when both are sorted or both are reverse sorted
+ * -- important to know in order to skip over values; if l is
+ * not sorted, this must be set to 1 and we will always do a
+ * binary search on all of r */
int equal_order;
- int lreverse, rreverse;
+ /* [lr]ordering is either 1 or -1 depending on the order of
+ * l/r: it determines the comparison function used */
+ int lordering, rordering;
oid lv;
BUN i;
+ int lskipped = 0; /* whether we skipped values in l */
ALGODEBUG fprintf(stderr, "#mergejoin(l=%s#" BUNFMT "[%s]%s%s,"
"r=%s#" BUNFMT "[%s]%s%s,sl=%s#" BUNFMT "%s%s,"
@@ -219,30 +255,26 @@ mergejoin(BAT *r1, BAT *r2, BAT *l, BAT
if (l->tsorted || l->trevsorted) {
/* determine opportunistic scan window for l */
- for (nl = lcand ? (BUN) (lcandend - lcand) : lend - lstart,
lscan = 4;
+ for (nl = lcand ? (BUN) (lcandend - lcand) : lend - lstart,
+ lscan = 4;
nl > 0;
lscan++)
nl >>= 1;
-
- /* equal_order is set if we can scan both BATs in the
- * same order, so when both are sorted or both are
- * reverse sorted */
- equal_order = l->tsorted == r->tsorted || l->trevsorted ==
r->trevsorted;
- /* [lr]reverse is either 1 or -1 depending on the
- * order of l/r: it determines the comparison function
- * used */
- lreverse = l->tsorted ? 1 : -1;
+ equal_order = l->tsorted == r->tsorted ||
+ l->trevsorted == r->trevsorted;
+ lordering = l->tsorted && (r->tsorted|| !equal_order) ? 1 : -1;
+ rordering = equal_order ? lordering : -lordering;
} else {
/* if l not sorted, we will always use binary search
* on r */
lscan = 0;
equal_order = 1;
- lreverse = 1;
+ lordering = 1;
+ rordering = r->tsorted ? 1 : -1;
/* if l not sorted, we only know for sure that r2 is
* key if l is, and that r1 is key if r is */
r2->tkey = l->tkey != 0;
r1->tkey = r->tkey != 0;
-
}
/* determine opportunistic scan window for r; if l is not
* sorted this is only used to find range of equal values */
@@ -250,7 +282,6 @@ mergejoin(BAT *r1, BAT *r2, BAT *l, BAT
nl > 0;
rscan++)
nl >>= 1;
- rreverse = r->tsorted ? 1 : -1;
while (lcand ? lcand < lcandend : lstart < lend) {
if (!nil_on_miss && lscan > 0) {
@@ -258,16 +289,36 @@ mergejoin(BAT *r1, BAT *r2, BAT *l, BAT
* lscan from current position in l), use
* binary search on l to skip over
* non-matching values */
+ if (equal_order) {
+ /* next value to match is first in r */
+ v = VALUE(r, rcand ? rcand[0] - r->hseqbase :
rstart);
+ } else {
+ /* next value to match is last in r */
+ v = VALUE(r, rcand ? rcandend[-1] - r->hseqbase
: rend - 1);
+ }
if (lcand) {
if (lscan < (BUN) (lcandend - lcand) &&
- lreverse * cmp(VALUE(l, lcand[lscan] -
l->hseqbase),
- (v = VALUE(r, rcand ?
rcand[0] - r->hseqbase : rstart))) < 0)
- lcand += binsearch(lcand, l->hseqbase,
lvals, lvars, lwidth, lscan, lcandend - lcand, v, cmp, lreverse, 0);
+ lordering * cmp(VALUE(l, lcand[lscan] -
l->hseqbase),
+ v) < 0) {
+ lcand += binsearch(lcand, l->hseqbase,
+ lvals, lvars,
+ lwidth, lscan,
+ lcandend - lcand, v,
+ cmp, lordering, 0);
+ lskipped = BATcount(r1) > 0;
+ }
} else {
if (lscan < lend - lstart &&
- lreverse * cmp(VALUE(l, lstart + lscan),
- (v = VALUE(r, rcand ?
rcand[0] - r->hseqbase : rstart))) < 0)
- lstart = binsearch(NULL, 0, lvals,
lvars, lwidth, lstart + lscan, lend, v, cmp, lreverse, 0);
+ lordering * cmp(VALUE(l, lstart + lscan),
+ v) < 0) {
+ lstart = binsearch(NULL, 0,
+ lvals, lvars,
+ lwidth,
+ lstart + lscan,
+ lend, v,
+ cmp, lordering, 0);
+ lskipped = BATcount(r1) > 0;
+ }
}
} else if (lscan == 0) {
/* always search r completely */
@@ -303,15 +354,15 @@ mergejoin(BAT *r1, BAT *r2, BAT *l, BAT
* a binary search */
if (lscan == 0 ||
(rscan < (BUN) (rcandend - rcand) &&
- rreverse * cmp(v, VALUE(r, rcand[rscan] -
r->hseqbase)) > 0)) {
+ rordering * cmp(v, VALUE(r, rcand[rscan] -
r->hseqbase)) > 0)) {
/* value too far away in r or
* l not sorted: use binary
* search */
- rcand += binsearch(rcand, r->hseqbase,
rvals, rvars, rwidth, lscan == 0 ? 0 : rscan, rcandend - rcand, v, cmp,
rreverse, 0);
+ rcand += binsearch(rcand, r->hseqbase,
rvals, rvars, rwidth, lscan == 0 ? 0 : rscan, rcandend - rcand, v, cmp,
rordering, 0);
} else {
/* scan r for v */
while (rcand < rcandend &&
- rreverse * cmp(v, VALUE(r,
rcand[0] - r->hseqbase)) > 0)
+ rordering * cmp(v, VALUE(r,
rcand[0] - r->hseqbase)) > 0)
rcand++;
}
} else {
@@ -322,15 +373,15 @@ mergejoin(BAT *r1, BAT *r2, BAT *l, BAT
* a binary search */
if (lscan == 0 ||
(rscan < rend - rstart &&
- rreverse * cmp(v, VALUE(r, rstart +
rscan)) > 0)) {
+ rordering * cmp(v, VALUE(r, rstart +
rscan)) > 0)) {
/* value too far away in r or
* l not sorted: use binary
* search */
- rstart = binsearch(NULL, 0, rvals,
rvars, rwidth, rstart + (lscan == 0 ? 0 : rscan), rend, v, cmp, rreverse, 0);
+ rstart = binsearch(NULL, 0, rvals,
rvars, rwidth, rstart + (lscan == 0 ? 0 : rscan), rend, v, cmp, rordering, 0);
} else {
/* scan r for v */
while (rstart < rend &&
- rreverse * cmp(v, VALUE(r,
rstart)) > 0)
+ rordering * cmp(v, VALUE(r,
rstart)) > 0)
rstart++;
}
}
@@ -344,15 +395,15 @@ mergejoin(BAT *r1, BAT *r2, BAT *l, BAT
* sorted (lscan == 0) we'll always do
* a binary search */
if (rscan < (BUN) (rcandend - rcand) &&
- rreverse * cmp(v, VALUE(r, rcandend[-rscan
- 1] - r->hseqbase)) < 0) {
+ rordering * cmp(v, VALUE(r, rcandend[-rscan
- 1] - r->hseqbase)) < 0) {
/* value too far away in r or
* l not sorted: use binary
* search */
- rcandend = rcand + binsearch(rcand,
r->hseqbase, rvals, rvars, rwidth, 0, (BUN) (rcandend - rcand) - (lscan == 0 ?
0 : rscan), v, cmp, rreverse, 1);
+ rcandend = rcand + binsearch(rcand,
r->hseqbase, rvals, rvars, rwidth, 0, (BUN) (rcandend - rcand) - (lscan == 0 ?
0 : rscan), v, cmp, rordering, 1);
} else {
/* scan r for v */
while (rcand < rcandend &&
- rreverse * cmp(v, VALUE(r,
rcandend[-1] - r->hseqbase)) < 0)
+ rordering * cmp(v, VALUE(r,
rcandend[-1] - r->hseqbase)) < 0)
rcandend--;
}
} else {
@@ -362,15 +413,15 @@ mergejoin(BAT *r1, BAT *r2, BAT *l, BAT
* sorted (lscan == 0) we'll always do
* a binary search */
if (rscan < rend - rstart &&
- rreverse * cmp(v, VALUE(r, rend - rscan -
1)) < 0) {
+ rordering * cmp(v, VALUE(r, rend - rscan -
1)) < 0) {
/* value too far away in r or
* l not sorted: use binary
* search */
- rend = binsearch(NULL, 0, rvals, rvars,
rwidth, rstart, rend - (lscan == 0 ? 0 : rscan), v, cmp, rreverse, 1);
+ rend = binsearch(NULL, 0, rvals, rvars,
rwidth, rstart, rend - (lscan == 0 ? 0 : rscan), v, cmp, rordering, 1);
} else {
/* scan r for v */
while (rstart < rend &&
- rreverse * cmp(v, VALUE(r, rend
- 1)) < 0)
+ rordering * cmp(v, VALUE(r, rend
- 1)) < 0)
rend--;
}
}
@@ -388,7 +439,7 @@ mergejoin(BAT *r1, BAT *r2, BAT *l, BAT
cmp(v, VALUE(r, rcand[rscan] -
r->hseqbase)) == 0) {
/* range too large: use binary
* search */
- nr = binsearch(rcand, r->hseqbase,
rvals, rvars, rwidth, rscan, rcandend - rcand, v, cmp, rreverse, 1);
+ nr = binsearch(rcand, r->hseqbase,
rvals, rvars, rwidth, rscan, rcandend - rcand, v, cmp, rordering, 1);
rcand += nr;
} else {
/* scan r for end of range */
@@ -406,7 +457,7 @@ mergejoin(BAT *r1, BAT *r2, BAT *l, BAT
cmp(v, VALUE(r, rstart + rscan)) == 0) {
/* range too large: use binary
* search */
- nr = binsearch(NULL, 0, rvals, rvars,
rwidth, rstart + rscan, rend, v, cmp, rreverse, 1);
+ nr = binsearch(NULL, 0, rvals, rvars,
rwidth, rstart + rscan, rend, v, cmp, rordering, 1);
nr -= rstart;
rstart += nr;
} else {
@@ -425,7 +476,7 @@ mergejoin(BAT *r1, BAT *r2, BAT *l, BAT
* a binary search */
if (rscan < (BUN) (rcandend - rcand) &&
cmp(v, VALUE(r, rcandend[-rscan - 1] -
r->hseqbase)) == 0) {
- nr = binsearch(rcand, r->hseqbase,
rvals, rvars, rwidth, 0, (BUN) (rcandend - rcand) - rscan, v, cmp, rreverse, 0);
+ nr = binsearch(rcand, r->hseqbase,
rvals, rvars, rwidth, 0, (BUN) (rcandend - rcand) - rscan, v, cmp, rordering,
0);
nr = (BUN) (rcandend - rcand) - nr;
rcandend -= nr;
_______________________________________________
checkin-list mailing list
[email protected]
http://mail.monetdb.org/mailman/listinfo/checkin-list