Changeset: 820e0fb15609 for MonetDB
URL: http://dev.monetdb.org/hg/MonetDB?cmd=changeset;node=820e0fb15609
Modified Files:
clients/Tests/exports.stable.out
gdk/gdk.h
gdk/gdk_batop.c
gdk/gdk_join.c
gdk/gdk_relop.mx
gdk/gdk_select.c
monetdb5/extras/jaql/Tests/join00.stable.out
monetdb5/extras/jaql/Tests/join01.stable.out
monetdb5/extras/jaql/Tests/join02.stable.out
monetdb5/extras/jaql/jaqltests/Tests/join.stable.out
monetdb5/mal/Tests/tst034.stable.out
monetdb5/modules/kernel/aggr.c
monetdb5/modules/kernel/algebra.mx
monetdb5/modules/mal/mat.c
sql/test/leaks/Tests/check0.stable.out
sql/test/leaks/Tests/check1.stable.out
sql/test/leaks/Tests/check2.stable.out
sql/test/leaks/Tests/check3.stable.out
sql/test/leaks/Tests/check4.stable.out
sql/test/leaks/Tests/check5.stable.out
sql/test/leaks/Tests/drop3.stable.out
sql/test/leaks/Tests/select1.stable.out
sql/test/leaks/Tests/select2.stable.out
sql/test/leaks/Tests/temp1.stable.out
sql/test/leaks/Tests/temp2.stable.out
sql/test/leaks/Tests/temp3.stable.out
Branch: default
Log Message:
Merge with default
diffs (truncated from 4377 to 300 lines):
diff --git a/clients/Tests/exports.stable.out b/clients/Tests/exports.stable.out
--- a/clients/Tests/exports.stable.out
+++ b/clients/Tests/exports.stable.out
@@ -115,8 +115,6 @@ void BATderiveHeadProps(BAT *b, int expe
void BATderiveProps(BAT *b, int expensive);
BAT *BATextend(BAT *b, BUN newcap);
BAT *BATfakeCommit(BAT *b);
-BAT *BATfetch(BAT *b, BAT *s);
-BAT *BATfetchjoin(BAT *b, BAT *s, BUN estimate);
int BATgetaccess(BAT *b);
PROPrec *BATgetprop(BAT *b, int idx);
gdk_return BATgroup(BAT **groups, BAT **extents, BAT **histo, BAT *b, BAT *g,
BAT *e, BAT *h);
@@ -135,7 +133,6 @@ BAT *BATgroupvariance_population(BAT *b,
BAT *BATgroupvariance_sample(BAT *b, BAT *g, BAT *e, BAT *s, int tp, int
skip_nils, int abort_on_error);
BUN BATgrows(BAT *b);
BAT *BAThash(BAT *b, BUN masksize);
-BAT *BAThashjoin(BAT *l, BAT *r, BUN estimate);
BAT *BAThistogram(BAT *b);
BAT *BATimprints(BAT *b);
BAT *BATins(BAT *b, BAT *c, bit force);
@@ -155,7 +152,6 @@ BAT *BATmaterialize(BAT *b);
BAT *BATmaterializeh(BAT *b);
size_t BATmemsize(BAT *b, int dirty);
BAT *BATmergecand(BAT *a, BAT *b);
-BAT *BATmergejoin(BAT *l, BAT *r, BUN estimate);
int BATmmap(BAT *b, int hb, int tb, int hh, int th, int force);
BAT *BATmode(BAT *b, int onoff);
int BATmultiprintf(stream *f, int argc, BAT *argv[], int printoid, int order,
int printorderby);
@@ -196,6 +192,7 @@ BAT *BATsort_rev(BAT *b);
BAT *BATssort(BAT *b);
BAT *BATssort_rev(BAT *b);
gdk_return BATsubjoin(BAT **r1p, BAT **r2p, BAT *l, BAT *r, BAT *sl, BAT *sr,
BUN estimate);
+gdk_return BATsubleftfetchjoin(BAT **r1p, BAT **r2p, BAT *l, BAT *r, BAT *sl,
BAT *sr, BUN estimate);
gdk_return BATsubleftjoin(BAT **r1p, BAT **r2p, BAT *l, BAT *r, BAT *sl, BAT
*sr, BUN estimate);
gdk_return BATsubouterjoin(BAT **r1p, BAT **r2p, BAT *l, BAT *r, BAT *sl, BAT
*sr, BUN estimate);
BAT *BATsubselect(BAT *b, BAT *s, const void *tl, const void *th, int li, int
hi, int anti);
@@ -747,15 +744,11 @@ str ALGcrossproduct2(int *l, int *r, int
str ALGexist(bit *ret, int *bid, ptr val);
str ALGexistBUN(bit *ret, int *bid, ptr val, ptr tval);
str ALGfetch(ptr ret, int *bid, lng *pos);
-str ALGfetchbat(int *ret, int *bid, int *sid);
str ALGfetchint(int *ret, int *bid, int *pos);
-str ALGfetchjoin(int *result, int *lid, int *rid);
-str ALGfetchjoinestimate(int *result, int *lid, int *rid, lng *estimate);
str ALGfetchoid(int *ret, int *bid, oid *pos);
str ALGfind(ptr ret, int *bid, ptr val);
str ALGfragment(int *result, int *bid, ptr hlow, ptr hhigh, ptr tlow, ptr
thigh);
str ALGgroupby(int *res, int *gids, int *cnts);
-str ALGhashjoin(int *result, int *lid, int *rid);
str ALGhistogram(int *result, int *bid);
str ALGhistogram_rev(int *result, int *bid);
str ALGhmarkp(int *result, int *bid, int *nr_parts, int *part_nr);
@@ -791,7 +784,6 @@ str ALGmax_sht(sht *res, int *bid);
str ALGmax_wrd(wrd *res, int *bid);
str ALGmaxany(ptr result, int *bid);
str ALGmerge(int *result, int *bid);
-str ALGmergejoin(int *result, int *lid, int *rid);
str ALGmin_bte(bte *res, int *bid);
str ALGmin_dbl(dbl *res, int *bid);
str ALGmin_flt(flt *res, int *bid);
diff --git a/gdk/gdk.h b/gdk/gdk.h
--- a/gdk/gdk.h
+++ b/gdk/gdk.h
@@ -2151,7 +2151,6 @@ gdk_export oid OIDnew(oid inc);
* operations.
*/
gdk_export BAT *BAThash(BAT *b, BUN masksize);
-gdk_export BAT *BAThashjoin(BAT *l, BAT *r, BUN estimate);
/* low level functions */
@@ -3147,7 +3146,6 @@ gdk_export BAT *BATconstant(int tt, cons
gdk_export BAT *BATconst(BAT *l, int tt, const void *val);
gdk_export BAT *BATthetajoin(BAT *l, BAT *r, int mode, BUN estimate);
gdk_export BAT *BATsemijoin(BAT *l, BAT *r);
-gdk_export BAT *BATmergejoin(BAT *l, BAT *r, BUN estimate);
gdk_export BAT *BATjoin(BAT *l, BAT *r, BUN estimate);
gdk_export BAT *BATantijoin(BAT *l, BAT *r);
gdk_export BAT *BATleftjoin(BAT *l, BAT *r, BUN estimate);
@@ -3159,11 +3157,10 @@ gdk_export gdk_return BATsubouterjoin(BA
gdk_export gdk_return BATsubthetajoin(BAT **r1p, BAT **r2p, BAT *l, BAT *r,
BAT *sl, BAT *sr, const char *op, BUN estimate);
gdk_export gdk_return BATsubsemijoin(BAT **r1p, BAT **r2p, BAT *l, BAT *r, BAT
*sl, BAT *sr, BUN estimate);
gdk_export gdk_return BATsubjoin(BAT **r1p, BAT **r2p, BAT *l, BAT *r, BAT
*sl, BAT *sr, BUN estimate);
+gdk_export gdk_return BATsubleftfetchjoin(BAT **r1p, BAT **r2p, BAT *l, BAT
*r, BAT *sl, BAT *sr, BUN estimate);
gdk_export BAT *BATproject(BAT *l, BAT *r);
gdk_export BAT *BATslice(BAT *b, BUN low, BUN high);
-gdk_export BAT *BATfetch(BAT *b, BAT *s);
-gdk_export BAT *BATfetchjoin(BAT *b, BAT *s, BUN estimate);
gdk_export BAT *BATleftfetchjoin(BAT *b, BAT *s, BUN estimate);
gdk_export BAT *BATsunique(BAT *b);
diff --git a/gdk/gdk_batop.c b/gdk/gdk_batop.c
--- a/gdk/gdk_batop.c
+++ b/gdk/gdk_batop.c
@@ -1319,7 +1319,7 @@ BATsubsort(BAT **sorted, BAT **order, BA
return GDK_SUCCEED;
}
if (o) {
- bn = BATleftfetchjoin(o, b, BATcount(b));
+ bn = BATproject(o, b);
if (bn == NULL)
goto error;
if (bn->ttype == TYPE_void || isVIEW(bn)) {
diff --git a/gdk/gdk_join.c b/gdk/gdk_join.c
--- a/gdk/gdk_join.c
+++ b/gdk/gdk_join.c
@@ -180,7 +180,7 @@ binsearch(const oid *rcand, oid offset,
*/
static gdk_return
mergejoin(BAT *r1, BAT *r2, BAT *l, BAT *r, BAT *sl, BAT *sr,
- int nil_matches, int nil_on_miss, int semi)
+ int nil_matches, int nil_on_miss, int semi, int must_match)
{
BUN lstart, lend, lcnt;
const oid *lcand, *lcandend;
@@ -213,7 +213,7 @@ mergejoin(BAT *r1, BAT *r2, BAT *l, BAT
ALGODEBUG fprintf(stderr, "#mergejoin(l=%s#" BUNFMT "[%s]%s%s,"
"r=%s#" BUNFMT "[%s]%s%s,sl=%s#" BUNFMT "%s%s,"
"sr=%s#" BUNFMT "%s%s,nil_matches=%d,"
- "nil_on_miss=%d,semi=%d)\n",
+ "nil_on_miss=%d,semi=%d,must_match=%d)\n",
BATgetId(l), BATcount(l), ATOMname(l->ttype),
l->tsorted ? "-sorted" : "",
l->trevsorted ? "-revsorted" : "",
@@ -226,7 +226,7 @@ mergejoin(BAT *r1, BAT *r2, BAT *l, BAT
sr ? BATgetId(sr) : "NULL", sr ? BATcount(sr) : 0,
sr && sr->tsorted ? "-sorted" : "",
sr && sr->trevsorted ? "-revsorted" : "",
- nil_matches, nil_on_miss, semi);
+ nil_matches, nil_on_miss, semi, must_match);
assert(BAThdense(l));
assert(BAThdense(r));
@@ -234,6 +234,7 @@ mergejoin(BAT *r1, BAT *r2, BAT *l, BAT
assert(r->tsorted || r->trevsorted);
assert(sl == NULL || sl->tsorted);
assert(sr == NULL || sr->tsorted);
+ assert(!nil_on_miss || !must_match); /* can't have both */
CANDINIT(l, sl, lstart, lend, lcnt, lcand, lcandend);
CANDINIT(r, sr, rstart, rend, rcnt, rcand, rcandend);
@@ -255,6 +256,18 @@ mergejoin(BAT *r1, BAT *r2, BAT *l, BAT
if (lstart == lend || (!nil_on_miss && rstart == rend)) {
/* nothing to do: there are no matches */
+ if (must_match && lstart < lend) {
+ GDKerror("mergejoin(%s,%s) does not hit always => can't
use fetchjoin.\n", BATgetId(l), BATgetId(r));
+ goto bailout;
+ }
+ ALGODEBUG fprintf(stderr,
"#mergejoin(l=%s,r=%s)=(%s#"BUNFMT"%s%s,%s#"BUNFMT"%s%s\n",
+ BATgetId(l), BATgetId(r),
+ BATgetId(r1), BATcount(r1),
+ r1->tsorted ? "-sorted" : "",
+ r1->trevsorted ? "-revsorted" : "",
+ BATgetId(r2), BATcount(r2),
+ r2->tsorted ? "-sorted" : "",
+ r2->trevsorted ? "-revsorted" : "");
return GDK_SUCCEED;
}
if (!nil_on_miss) {
@@ -266,6 +279,18 @@ mergejoin(BAT *r1, BAT *r2, BAT *l, BAT
/* nothing to do: all values in either l or r
* (or both) are nil, and we don't have to
* match nil values */
+ if (must_match) {
+ GDKerror("mergejoin(%s,%s) does not hit always
=> can't use fetchjoin.\n", BATgetId(l), BATgetId(r));
+ goto bailout;
+ }
+ ALGODEBUG fprintf(stderr,
"#mergejoin(l=%s,r=%s)=(%s#"BUNFMT"%s%s,%s#"BUNFMT"%s%s\n",
+ BATgetId(l), BATgetId(r),
+ BATgetId(r1), BATcount(r1),
+ r1->tsorted ? "-sorted" : "",
+ r1->trevsorted ? "-revsorted" : "",
+ BATgetId(r2), BATcount(r2),
+ r2->tsorted ? "-sorted" : "",
+ r2->trevsorted ? "-revsorted" : "");
return GDK_SUCCEED;
}
if ((l->ttype == TYPE_void && l->tseqbase == oid_nil &&
@@ -277,6 +302,18 @@ mergejoin(BAT *r1, BAT *r2, BAT *l, BAT
/* nothing to do: all values in l are nil, and
* we can guarantee there are no nil values in
* r, or vice versa */
+ if (must_match) {
+ GDKerror("mergejoin(%s,%s) does not hit always
=> can't use fetchjoin.\n", BATgetId(l), BATgetId(r));
+ goto bailout;
+ }
+ ALGODEBUG fprintf(stderr,
"#mergejoin(l=%s,r=%s)=(%s#"BUNFMT"%s%s,%s#"BUNFMT"%s%s\n",
+ BATgetId(l), BATgetId(r),
+ BATgetId(r1), BATcount(r1),
+ r1->tsorted ? "-sorted" : "",
+ r1->trevsorted ? "-revsorted" : "",
+ BATgetId(r2), BATcount(r2),
+ r2->tsorted ? "-sorted" : "",
+ r2->trevsorted ? "-revsorted" : "");
return GDK_SUCCEED;
}
}
@@ -288,9 +325,10 @@ mergejoin(BAT *r1, BAT *r2, BAT *l, BAT
nl > 0;
lscan++)
nl >>= 1;
- equal_order = l->tsorted == r->tsorted ||
- l->trevsorted == r->trevsorted;
- lordering = l->tsorted && (r->tsorted|| !equal_order) ? 1 : -1;
+ equal_order = (l->tsorted && r->tsorted) ||
+ (l->trevsorted && r->trevsorted &&
+ l->ttype != TYPE_void && r->ttype != TYPE_void);
+ lordering = l->tsorted && (r->tsorted || !equal_order) ? 1 : -1;
rordering = equal_order ? lordering : -lordering;
} else {
/* if l not sorted, we will always use binary search
@@ -344,16 +382,23 @@ mergejoin(BAT *r1, BAT *r2, BAT *l, BAT
rcandorig = rcand;
rstartorig = rstart;
while (lcand ? lcand < lcandend : lstart < lend) {
- if (!nil_on_miss && lscan > 0) {
- /* if next value in r too far away (more than
- * lscan from current position in l), use
- * binary search on l to skip over
- * non-matching values, but only if l is
- * sorted (lscan > 0) and we don't have to
- * insert nils (outer join)
- *
- * next value to match in r is first if
- * equal_order is set, last otherwise */
+ if (!nil_on_miss && !must_match && lscan > 0) {
+ /* If l is sorted (lscan > 0), we look at the
+ * next value in r to see whether we can jump
+ * over a large section of l using binary
+ * search. We do this by looking ahead in l
+ * (lscan far, to be precise) and seeing if
+ * the value there is still too "small"
+ * (definition depends on sort order of l).
+ * If it is, we use binary search on l,
+ * otherwise we scan l for the next position
+ * with a value greater than or equal to the
+ * value in r. Obviously, we can only do this
+ * if we're not inserting NILs for missing
+ * values in r (nil_on_miss set, i.e., outer
+ * join).
+ * The next value to in r is the first if
+ * equal_order is set, the last otherwise. */
if (rcand) {
v = VALUE(r, (equal_order ? rcand[0] :
rcandend[-1]) - r->hseqbase);
} else if (rvals) {
@@ -374,6 +419,7 @@ mergejoin(BAT *r1, BAT *r2, BAT *l, BAT
rval = rend - 1 + r->tseqbase;
v = (const char *) &rval;
}
+ /* here, v points to next value in r */
if (lcand) {
if (lscan < (BUN) (lcandend - lcand) &&
lordering * cmp(VALUE(l, lcand[lscan] -
l->hseqbase),
@@ -426,15 +472,33 @@ mergejoin(BAT *r1, BAT *r2, BAT *l, BAT
rcand = rcandorig;
rstart = rstartorig;
}
- /* v is the value we're going to work with in this
- * iteration; count number of equal values in left */
+ /* Here we determine the next value in l that we are
+ * going to try to match in r. We will also count the
+ * number of occurrences in l of that value.
+ * Afterwards, v points to the value and nl is the
+ * number of times it occurs. Also, lstart/lcand will
+ * point to the next value to be considered (ready for
+ * the next iteration).
+ * If there are many equal values in l (more than
+ * lscan), we will use binary search to find the end
+ * of the sequence. Obviously, we can do this only if
+ * l is actually sorted (lscan > 0). */
nl = 1; /* we'll match (at least) one in l */
nr = 0; /* maybe we won't match anything in r */
if (lcand) {
v = VALUE(l, lcand[0] - l->hseqbase);
- while (++lcand < lcandend &&
- cmp(v, VALUE(l, lcand[0] - l->hseqbase)) == 0)
- nl++;
+ if (lscan > 0 &&
+ lscan < (BUN) (lcandend - lcand) &&
+ cmp(v, VALUE(l, lcand[lscan] - l->hseqbase)) == 0) {
+ /* lots of equal values: use binary
+ * search to find end */
+ nl = binsearch(lcand, l->hseqbase, lvals,
lvars, lwidth, lscan, lcandend - lcand, v, cmp, lordering, 1);
+ lcand += nl;
+ } else {
+ while (++lcand < lcandend &&
+ cmp(v, VALUE(l, lcand[0] - l->hseqbase))
== 0)
+ nl++;
+ }
} else if (lvals) {
v = VALUE(l, lstart);
if (loff == wrd_nil) {
@@ -445,9 +509,23 @@ mergejoin(BAT *r1, BAT *r2, BAT *l, BAT
v = (const char *) &lval;
} else {
/* compare values without offset */
- while (++lstart < lend &&
- cmp(v, VALUE(l, lstart)) == 0)
_______________________________________________
checkin-list mailing list
[email protected]
http://mail.monetdb.org/mailman/listinfo/checkin-list