Changeset: 60ed1e770b46 for MonetDB
URL: http://dev.monetdb.org/hg/MonetDB?cmd=changeset;node=60ed1e770b46
Modified Files:
gdk/gdk.h
gdk/gdk_join.c
monetdb5/modules/kernel/algebra.mx
Branch: default
Log Message:
Initial version of BATsubthetajoin.
Again, the MAL interface is likely to change.
diffs (truncated from 305 to 300 lines):
diff --git a/gdk/gdk.h b/gdk/gdk.h
--- a/gdk/gdk.h
+++ b/gdk/gdk.h
@@ -3167,6 +3167,7 @@ gdk_export BAT *BATcross(BAT *l, BAT *r)
gdk_export gdk_return BATsubleftjoin(BAT **r1p, BAT **r2p, BAT *l, BAT *r, BAT
*sl, BAT *sr, BUN estimate);
gdk_export gdk_return BATsubouterjoin(BAT **r1p, BAT **r2p, BAT *l, BAT *r,
BAT *sl, BAT *sr, BUN estimate);
+gdk_export gdk_return BATsubthetajoin(BAT **r1p, BAT **r2p, BAT *l, BAT *r,
BAT *sl, BAT *sr, const char *op, BUN estimate);
gdk_export BAT *BATslice(BAT *b, BUN low, BUN high);
gdk_export BAT *BATfetch(BAT *b, BAT *s);
diff --git a/gdk/gdk_join.c b/gdk/gdk_join.c
--- a/gdk/gdk_join.c
+++ b/gdk/gdk_join.c
@@ -730,6 +730,169 @@ hashjoin(BAT *r1, BAT *r2, BAT *l, BAT *
return GDK_FAIL;
}
+#define MASK_EQ 1
+#define MASK_LT 2
+#define MASK_GT 4
+#define MASK_LE (MASK_EQ | MASK_LT)
+#define MASK_GE (MASK_EQ | MASK_GT)
+#define MASK_NE (MASK_LT | MASK_GT)
+
+static gdk_return
+thetajoin(BAT *r1, BAT *r2, BAT *l, BAT *r, BAT *sl, BAT *sr, const char *op)
+{
+ BUN lstart, lend, lcnt;
+ const oid *lcand = NULL, *lcandend = NULL;
+ BUN rstart, rend, rcnt;
+ const oid *rcand = NULL, *rcandend = NULL;
+ const char *lvals, *rvals;
+ const char *lvars, *rvars;
+ int lwidth, rwidth;
+ const void *nil = ATOMnilptr(l->ttype);
+ int (*cmp)(const void *, const void *) = BATatoms[l->ttype].atomCmp;
+ const char *vl, *vr;
+ const oid *p;
+ oid lastr = 0; /* last value inserted into r2 */
+ BUN n, nr;
+ BUN newcap;
+ int opcode = 0;
+ oid lo, ro;
+ int c;
+
+ assert(BAThdense(l));
+ assert(BAThdense(r));
+ assert(l->ttype == r->ttype);
+ assert(sl == NULL || sl->tsorted);
+ assert(sr == NULL || sr->tsorted);
+
+ /* encode operator as a bit mask into opcode */
+ if (op[0] == '=' && ((op[1] == '=' && op[2] == 0) || op[2] == 0)) {
+ /* "=" or "==" */
+ opcode |= MASK_EQ;
+ } else if (op[0] == '!' && op[1] == '=' && op[2] == 0) {
+ /* "!=" (equivalent to "<>") */
+ opcode |= MASK_NE;
+ } else if (op[0] == '<') {
+ if (op[1] == 0) {
+ /* "<" */
+ opcode |= MASK_LT;
+ } else if (op[1] == '=' && op[2] == 0) {
+ /* "<=" */
+ opcode |= MASK_LE;
+ } else if (op[1] == '>' && op[2] == 0) {
+ /* "<>" (equivalent to "!=") */
+ opcode |= MASK_NE;
+ }
+ } else if (op[0] == '>') {
+ if (op[1] == 0) {
+ /* ">" */
+ opcode |= MASK_GT;
+ } else if (op[1] == '=' && op[2] == 0) {
+ /* ">=" */
+ opcode |= MASK_GE;
+ }
+ }
+ if (opcode == 0) {
+ GDKerror("BATthetasubjoin: unknown operator.\n");
+ return GDK_FAIL;
+ }
+
+ CANDINIT(l, sl, lstart, lend, lcnt, lcand, lcandend);
+ CANDINIT(r, sr, rstart, rend, rcnt, rcand, rcandend);
+
+ lvals = (const char *) Tloc(l, BUNfirst(l));
+ rvals = (const char *) Tloc(r, BUNfirst(r));
+ if (l->tvarsized && l->ttype) {
+ assert(r->tvarsized && r->ttype);
+ lvars = l->T->vheap->base;
+ rvars = r->T->vheap->base;
+ } else {
+ assert(!r->tvarsized || !r->ttype);
+ lvars = rvars = NULL;
+ }
+ lwidth = l->T->width;
+ rwidth = r->T->width;
+
+ r1->tkey = 1;
+ r1->tsorted = 1;
+ r1->trevsorted = 1;
+ r2->tkey = 1;
+ r2->tsorted = 1;
+ r2->trevsorted = 1;
+
+ /* nested loop implementation for theta join */
+ for (;;) {
+ if (lcand) {
+ if (lcand == lcandend)
+ break;
+ lo = *lcand++;
+ vl = VALUE(l, lo - l->hseqbase);
+ } else {
+ if (lstart == lend)
+ break;
+ vl = VALUE(l, lstart);
+ lo = lstart++ + l->hseqbase;
+ }
+ if (cmp(vl, nil) == 0)
+ continue;
+ nr = 0;
+ p = rcand;
+ n = rstart;
+ for (;;) {
+ if (rcand) {
+ if (p == rcandend)
+ break;
+ ro = *p++;
+ vr = VALUE(r, ro - r->hseqbase);
+ } else {
+ if (n == rend)
+ break;
+ vr = VALUE(r, n);
+ ro = n++ + r->hseqbase;
+ }
+ if (cmp(vr, nil) == 0)
+ continue;
+ c = cmp(vl, vr);
+ if (!((c < 0 && opcode & MASK_LT) ||
+ (c > 0 && opcode & MASK_GT) ||
+ (c == 0 && opcode & MASK_EQ)))
+ continue;
+ if (BUNlast(r1) == BATcapacity(r1)) {
+ newcap = BATgrows(r1);
+ r1 = BATextend(r1, newcap);
+ r2 = BATextend(r2, newcap);
+ if (r1 == NULL || r2 == NULL)
+ goto bailout;
+ assert(BATcapacity(r1) == BATcapacity(r2));
+ }
+ if (nr == 0 && BATcount(r2) > 0) {
+ r1->trevsorted = 0;
+ if (lastr > ro) {
+ r2->tsorted = 0;
+ r2->tkey = 0;
+ } else if (lastr < ro)
+ r2->trevsorted = 0;
+ }
+ APPEND(r1, lo);
+ APPEND(r2, ro);
+ lastr = ro;
+ nr++;
+ }
+ if (nr > 1) {
+ r1->tkey = 0;
+ r2->trevsorted = 0;
+ }
+ }
+ assert(BATcount(r1) == BATcount(r2));
+ return GDK_SUCCEED;
+
+ bailout:
+ if (r1)
+ BBPreclaim(r1);
+ if (r2)
+ BBPreclaim(r2);
+ return GDK_FAIL;
+}
+
/* Perform an equi-join over l and r. Returns two new, aligned,
* dense-headed bats with in the tail the oids (head column values) of
* matching tuples. The result is in the same order as l (i.e. r1 is
@@ -743,7 +906,7 @@ BATsubleftjoin(BAT **r1p, BAT **r2p, BAT
*r2p = NULL;
if (joinparamcheck(l, r, sl, sr, "BATsubleftjoin") == GDK_FAIL)
return GDK_FAIL;
- if (joininitresults(&r1, &r2, estimate != BUN_NONE ? estimate : sl ?
BATcount(sl) : BATcount(l), "BATsubhashjoin") == GDK_FAIL)
+ if (joininitresults(&r1, &r2, estimate != BUN_NONE ? estimate : sl ?
BATcount(sl) : BATcount(l), "BATsubleftjoin") == GDK_FAIL)
return GDK_FAIL;
*r1p = r1;
*r2p = r2;
@@ -764,9 +927,9 @@ BATsubouterjoin(BAT **r1p, BAT **r2p, BA
*r1p = NULL;
*r2p = NULL;
- if (joinparamcheck(l, r, sl, sr, "BATsubleftjoin") == GDK_FAIL)
+ if (joinparamcheck(l, r, sl, sr, "BATsubouterjoin") == GDK_FAIL)
return GDK_FAIL;
- if (joininitresults(&r1, &r2, estimate != BUN_NONE ? estimate : sl ?
BATcount(sl) : BATcount(l), "BATsubhashjoin") == GDK_FAIL)
+ if (joininitresults(&r1, &r2, estimate != BUN_NONE ? estimate : sl ?
BATcount(sl) : BATcount(l), "BATsubouterjoin") == GDK_FAIL)
return GDK_FAIL;
*r1p = r1;
*r2p = r2;
@@ -774,3 +937,19 @@ BATsubouterjoin(BAT **r1p, BAT **r2p, BA
return mergejoin(r1, r2, l, r, sl, sr, 0, 1, 0);
return hashjoin(r1, r2, l, r, sl, sr, 0, 1, 0);
}
+
+gdk_return
+BATsubthetajoin(BAT **r1p, BAT **r2p, BAT *l, BAT *r, BAT *sl, BAT *sr, const
char *op, BUN estimate)
+{
+ BAT *r1, *r2;
+
+ *r1p = NULL;
+ *r2p = NULL;
+ if (joinparamcheck(l, r, sl, sr, "BATsubthetajoin") == GDK_FAIL)
+ return GDK_FAIL;
+ if (joininitresults(&r1, &r2, estimate != BUN_NONE ? estimate : sl ?
BATcount(sl) : BATcount(l), "BATsubthetajoin") == GDK_FAIL)
+ return GDK_FAIL;
+ *r1p = r1;
+ *r2p = r2;
+ return thetajoin(r1, r2, l, r, sl, sr, op);
+}
diff --git a/monetdb5/modules/kernel/algebra.mx
b/monetdb5/modules/kernel/algebra.mx
--- a/monetdb5/modules/kernel/algebra.mx
+++ b/monetdb5/modules/kernel/algebra.mx
@@ -719,6 +719,13 @@ command subouterjoin(l:bat[:oid,:any_1],
address ALGsubouterjoin4
comment "Left outer join with candidate lists";
+command subthetajoin(l:bat[:oid,:any_1],r:bat[:oid,:any_1],op:str)
(:bat[:oid,:oid],:bat[:oid,:oid])
+address ALGsubthetajoin
+comment "Theta join";
+command
subthetajoin(l:bat[:oid,:any_1],r:bat[:oid,:any_1],sl:bat[:oid,:oid],sr:bat[:oid,:oid],op:str)
(:bat[:oid,:oid],:bat[:oid,:oid])
+address ALGsubthetajoin4
+comment "Theta join with candidate lists";
+
# @+ Projection operations
command project(b:bat[:any_1,:any_2]) :bat[:any_1,:void]
address ALGprojectNIL
@@ -1059,6 +1066,8 @@ algebra_export str ALGsubleftjoin(bat *r
algebra_export str ALGsubleftjoin4(bat *r1, bat *r2, bat *l, bat *r, bat *sl,
bat *sr);
algebra_export str ALGsubouterjoin(bat *r1, bat *r2, bat *l, bat *r);
algebra_export str ALGsubouterjoin4(bat *r1, bat *r2, bat *l, bat *r, bat *sl,
bat *sr);
+algebra_export str ALGsubthetajoin(bat *r1, bat *r2, bat *l, bat *r, str *op);
+algebra_export str ALGsubthetajoin4(bat *r1, bat *r2, bat *l, bat *r, bat *sl,
bat *sr, str *op);
@= ALGunaryExport
algebra_export str ALG@1(int *result, int *bid);
@@ -2455,6 +2464,53 @@ ALGsubouterjoin(bat *r1, bat *r2, bat *l
return ALGsubouterjoin4(r1, r2, l, r, NULL, NULL);
}
+str
+ALGsubthetajoin4(bat *r1, bat *r2, bat *lid, bat *rid, bat *slid, bat *srid,
str *op)
+{
+ BAT *left = NULL, *right = NULL, *candleft = NULL, *candright = NULL;
+ BAT *result1, *result2;
+
+ if ((left = BATdescriptor(*lid)) == NULL)
+ goto fail;
+ if ((right = BATdescriptor(*rid)) == NULL)
+ goto fail;
+ if (slid && (candleft = BATdescriptor(*slid)) == NULL)
+ goto fail;
+ if (srid && (candright = BATdescriptor(*srid)) == NULL)
+ goto fail;
+
+ if (BATsubthetajoin(&result1, &result2, left, right, candleft,
candright, *op, BUN_NONE) == GDK_FAIL)
+ goto fail;
+ *r1 = result1->batCacheid;
+ *r2 = result2->batCacheid;
+ BBPkeepref(*r1);
+ BBPkeepref(*r2);
+ BBPreleaseref(left->batCacheid);
+ BBPreleaseref(right->batCacheid);
+ if (candleft)
+ BBPreleaseref(candleft->batCacheid);
+ if (candright)
+ BBPreleaseref(candright->batCacheid);
+ return MAL_SUCCEED;
+
+ fail:
+ if (left)
+ BBPreclaim(left);
+ if (right)
+ BBPreclaim(right);
+ if (candleft)
+ BBPreclaim(candleft);
+ if (candright)
+ BBPreclaim(candright);
+ throw(MAL, "algebra.subthetajoin", RUNTIME_OBJECT_MISSING);
+}
+
+str
+ALGsubthetajoin(bat *r1, bat *r2, bat *l, bat *r, str *op)
+{
+ return ALGsubthetajoin4(r1, r2, l, r, NULL, NULL, op);
_______________________________________________
checkin-list mailing list
[email protected]
http://mail.monetdb.org/mailman/listinfo/checkin-list