Changeset: 0090a6ec68b4 for MonetDB
URL: http://dev.monetdb.org/hg/MonetDB?cmd=changeset;node=0090a6ec68b4
Modified Files:
        gdk/gdk.h
        gdk/gdk_join.c
        monetdb5/modules/kernel/algebra.mx
Branch: default
Log Message:

Bunch of minor changes to sub-join implementation.
Added comments; added interfaces for subleftjoin and subouterjoin;
removed interfaces for submergejoin and subhashjoin; added estimate
parameter.


diffs (truncated from 379 to 300 lines):

diff --git a/gdk/gdk.h b/gdk/gdk.h
--- a/gdk/gdk.h
+++ b/gdk/gdk.h
@@ -3165,8 +3165,8 @@ gdk_export BAT *BATleftjoin(BAT *l, BAT 
 gdk_export BAT *BATouterjoin(BAT *l, BAT *r, BUN estimate);
 gdk_export BAT *BATcross(BAT *l, BAT *r);
 
-gdk_export gdk_return BATsubmergejoin(BAT **r1, BAT **r2, BAT *l, BAT *r, BAT 
*sl, BAT *sr);
-gdk_export gdk_return BATsubhashjoin(BAT **r1, BAT **r2, BAT *l, BAT *r, BAT 
*sl, BAT *sr);
+gdk_export gdk_return BATsubleftjoin(BAT **r1p, BAT **r2p, BAT *l, BAT *r, BAT 
*sl, BAT *sr, BUN estimate);
+gdk_export gdk_return BATsubouterjoin(BAT **r1p, BAT **r2p, BAT *l, BAT *r, 
BAT *sl, BAT *sr, BUN estimate);
 
 gdk_export BAT *BATslice(BAT *b, BUN low, BUN high);
 gdk_export BAT *BATfetch(BAT *b, BAT *s);
diff --git a/gdk/gdk_join.c b/gdk/gdk_join.c
--- a/gdk/gdk_join.c
+++ b/gdk/gdk_join.c
@@ -1,8 +1,28 @@
+/*
+ * The contents of this file are subject to the MonetDB Public License
+ * Version 1.1 (the "License"); you may not use this file except in
+ * compliance with the License. You may obtain a copy of the License at
+ * http://www.monetdb.org/Legal/MonetDBLicense
+ *
+ * Software distributed under the License is distributed on an "AS IS"
+ * basis, WITHOUT WARRANTY OF ANY KIND, either express or implied. See the
+ * License for the specific language governing rights and limitations
+ * under the License.
+ *
+ * The Original Code is the MonetDB Database System.
+ *
+ * The Initial Developer of the Original Code is CWI.
+ * Portions created by CWI are Copyright (C) 1997-July 2008 CWI.
+ * Copyright August 2008-2013 MonetDB B.V.
+ * All Rights Reserved.
+ */
+
 #include "monetdb_config.h"
 #include "gdk.h"
 #include "gdk_private.h"
 #include "gdk_calc_private.h"
 
+/* Perform a bunch of sanity checks on the inputs to a join. */
 static gdk_return
 joinparamcheck(BAT *l, BAT *r, BAT *sl, BAT *sr, const char *func)
 {
@@ -36,10 +56,12 @@ joinparamcheck(BAT *l, BAT *r, BAT *sl, 
        return GDK_SUCCEED;
 }
 
+/* Create the result bats for a join. */
 static gdk_return
 joininitresults(BAT **r1p, BAT **r2p, BUN size, const char *func)
 {
        BAT *r1, *r2;
+
        r1 = BATnew(TYPE_void, TYPE_oid, size);
        r2 = BATnew(TYPE_void, TYPE_oid, size);
        if (r1 == NULL || r2 == NULL) {
@@ -67,7 +89,7 @@ joininitresults(BAT **r1p, BAT **r2p, BU
 #define VALUE(side, x)         (side##vars ? side##vars + 
VarHeapVal(side##vals, (x), side##width) : side##vals + ((x) * side##width))
 
 /* Do a binary search for the first/last occurrence of v between lo and hi
- * (lo inclusive, hi not inclusive) in values.
+ * (lo inclusive, hi not inclusive) in rvals/rvars.
  * If last is set, return the index of the first value > v; if last is
  * not set, return the index of the first value >= v.
  * If reverse is -1, the values are sorted in reverse order and hence
@@ -92,6 +114,8 @@ binsearch(const oid *rcand, oid offset,
        if ((c = reverse * cmp(VALUE(r, rcand ? rcand[hi] - offset : hi), v)) < 
0 ||
            (last && c == 0))
                return hi + 1;
+       /* loop invariant:
+        * last ? value@lo <= v < value@hi : value@lo < v <= value@hi */
        while (hi - lo > 1) {
                mid = (hi + lo) / 2;
                if ((c = reverse * cmp(VALUE(r, rcand ? rcand[mid] - offset : 
mid), v)) > 0 ||
@@ -105,17 +129,18 @@ binsearch(const oid *rcand, oid offset,
 
 #define APPEND(b, o)           (((oid *) b->T->heap.base)[b->batFirst + 
b->batCount++] = (o))
 
-/* Perform a "merge" join on l and r (must both be sorted) with
- * optional candidate lists.  The return BATs have already been
- * created by the caller.
+/* Perform a "merge" join on l and r (if both are sorted) with
+ * optional candidate lists, or join using binary search on r if l is
+ * not sorted.  The return BATs have already been created by the
+ * caller.
  */
 static gdk_return
 mergejoin(BAT *r1, BAT *r2, BAT *l, BAT *r, BAT *sl, BAT *sr, int nil_matches, 
int nil_on_miss, int semi)
 {
        BUN lstart, lend, lcnt;
        const oid *lcand = NULL, *lcandend = NULL;
-       BUN rstart, rend, rcnt;
-       const oid *rcand = NULL, *rcandend = NULL;
+       BUN rstart, rend, rcnt, rstartorig;
+       const oid *rcand = NULL, *rcandend = NULL, *rcandorig;
        BUN lscan, rscan;
        const char *lvals, *rvals;
        const char *lvars, *rvars;
@@ -133,13 +158,14 @@ mergejoin(BAT *r1, BAT *r2, BAT *l, BAT 
        assert(BAThdense(l));
        assert(BAThdense(r));
        assert(l->ttype == r->ttype);
-       assert(l->tsorted || l->trevsorted);
        assert(r->tsorted || r->trevsorted);
        assert(sl == NULL || sl->tsorted);
        assert(sr == NULL || sr->tsorted);
 
        CANDINIT(l, sl, lstart, lend, lcnt, lcand, lcandend);
        CANDINIT(r, sr, rstart, rend, rcnt, rcand, rcandend);
+       rcandorig = rcand;
+       rstartorig = rstart;
        lvals = (const char *) Tloc(l, BUNfirst(l));
        rvals = (const char *) Tloc(r, BUNfirst(r));
        if (l->tvarsized && l->ttype) {
@@ -177,18 +203,25 @@ mergejoin(BAT *r1, BAT *r2, BAT *l, BAT 
                return GDK_SUCCEED;
        }
 
-       /* determine opportunistic scan window for l/r */
-       for (nl = lcand ? (BUN) (lcandend - lcand) : lend - lstart, lscan = 4;
-            nl > 0;
-            lscan++)
-               nl >>= 1;
-       for (nl = rcand ? (BUN) (rcandend - rcand) : rend - rstart, rscan = 4;
-            nl > 0;
-            rscan++)
-               nl >>= 1;
+       if (l->tsorted || l->trevsorted) {
+               /* determine opportunistic scan window for l/r */
+               for (nl = lcand ? (BUN) (lcandend - lcand) : lend - lstart, 
lscan = 4;
+                    nl > 0;
+                    lscan++)
+                       nl >>= 1;
+               for (nl = rcand ? (BUN) (rcandend - rcand) : rend - rstart, 
rscan = 4;
+                    nl > 0;
+                    rscan++)
+                       nl >>= 1;
+       } else {
+               /* if l not sorted, we will always use binary search
+                * on r */
+               lscan = rscan = 0;
+               equal_order = 1;
+       }
 
        while (lcand ? lcand < lcandend : lstart < lend) {
-               if (!nil_on_miss) {
+               if (!nil_on_miss && lscan > 0) {
                        /* if next value in r too far away (more than
                         * lscan from current position in l), use
                         * binary search on l to skip over
@@ -204,6 +237,11 @@ mergejoin(BAT *r1, BAT *r2, BAT *l, BAT 
                                                   (v = VALUE(r, rcand ? *rcand 
- r->hseqbase : rstart))) < 0)
                                        lstart = binsearch(NULL, 0, lvals, 
lvars, lwidth, lstart + lscan, lend, v, cmp, lreverse, 0);
                        }
+               } else if (lscan == 0) {
+                       /* always search r completely */
+                       assert(rscan == 0);
+                       rcand = rcandorig;
+                       rstart = rstartorig;
                }
                /* v is the value we're going to work with in this
                 * iteration */
@@ -230,8 +268,9 @@ mergejoin(BAT *r1, BAT *r2, BAT *l, BAT 
                                /* look ahead a little (rscan) in r to
                                 * see whether we're better off doing
                                 * a binary search */
-                               if (rscan < (BUN) (rcandend - rcand) &&
-                                   rreverse * cmp(v, VALUE(r, *rcand - 
r->hseqbase + rscan)) > 0) {
+                               if (rscan == 0 ||
+                                   (rscan < (BUN) (rcandend - rcand) &&
+                                    rreverse * cmp(v, VALUE(r, *rcand - 
r->hseqbase + rscan)) > 0)) {
                                        /* value too far away in r:
                                         * use binary search */
                                        rcand += binsearch(rcand, r->hseqbase, 
rvals, rvars, rwidth, rscan, rcandend - rcand, v, cmp, rreverse, 0);
@@ -245,8 +284,9 @@ mergejoin(BAT *r1, BAT *r2, BAT *l, BAT 
                                /* look ahead a little (rscan) in r to
                                 * see whether we're better off doing
                                 * a binary search */
-                               if (rscan < rend - rstart &&
-                                   rreverse * cmp(v, VALUE(r, rstart + rscan)) 
> 0) {
+                               if (rscan == 0 ||
+                                   (rscan < rend - rstart &&
+                                    rreverse * cmp(v, VALUE(r, rstart + 
rscan)) > 0)) {
                                        /* value too far away in r:
                                         * use binary search */
                                        rstart = binsearch(NULL, 0, rvals, 
rvars, rwidth, rstart + rscan, rend, v, cmp, rreverse, 0);
@@ -451,22 +491,6 @@ mergejoin(BAT *r1, BAT *r2, BAT *l, BAT 
        return GDK_SUCCEED;
 }
 
-gdk_return
-BATsubmergejoin(BAT **r1p, BAT **r2p, BAT *l, BAT *r, BAT *sl, BAT *sr)
-{
-       BAT *r1, *r2;
-
-       *r1p = NULL;
-       *r2p = NULL;
-       if (joinparamcheck(l, r, sl, sr, "BATsubleftjoin") == GDK_FAIL)
-               return GDK_FAIL;
-       if (joininitresults(&r1, &r2, sl ? BATcount(sl) : BATcount(l), 
"BATsubmergejoin") == GDK_FAIL)
-               return GDK_FAIL;
-       *r1p = r1;
-       *r2p = r2;
-       return mergejoin(r1, r2, l, r, sl, sr, 0, 0, 0);
-}
-
 /* binary search in a candidate list, return 1 if found, 0 if not */
 static int
 binsearchcand(const oid *cand, BUN lo, BUN hi, oid v)
@@ -706,8 +730,12 @@ hashjoin(BAT *r1, BAT *r2, BAT *l, BAT *
        return GDK_FAIL;
 }
 
+/* Perform an equi-join over l and r.  Returns two new, aligned,
+ * dense-headed bats with in the tail the oids (head column values) of
+ * matching tuples.  The result is in the same order as l (i.e. r1 is
+ * sorted). */
 gdk_return
-BATsubhashjoin(BAT **r1p, BAT **r2p, BAT *l, BAT *r, BAT *sl, BAT *sr)
+BATsubleftjoin(BAT **r1p, BAT **r2p, BAT *l, BAT *r, BAT *sl, BAT *sr, BUN 
estimate)
 {
        BAT *r1, *r2;
 
@@ -715,9 +743,34 @@ BATsubhashjoin(BAT **r1p, BAT **r2p, BAT
        *r2p = NULL;
        if (joinparamcheck(l, r, sl, sr, "BATsubleftjoin") == GDK_FAIL)
                return GDK_FAIL;
-       if (joininitresults(&r1, &r2, sl ? BATcount(sl) : BATcount(l), 
"BATsubhashjoin") == GDK_FAIL)
+       if (joininitresults(&r1, &r2, estimate != BUN_NONE ? estimate : sl ? 
BATcount(sl) : BATcount(l), "BATsubhashjoin") == GDK_FAIL)
                return GDK_FAIL;
        *r1p = r1;
        *r2p = r2;
+       if (r->tsorted || r->trevsorted)
+               return mergejoin(r1, r2, l, r, sl, sr, 0, 0, 0);
        return hashjoin(r1, r2, l, r, sl, sr, 0, 0, 0);
 }
+
+/* Performs a left outer join over l and r.  Returns two new, aligned,
+ * dense-headed bats with in the tail the oids (head column values) of
+ * matching tuples, or the oid in the first output bat and nil in the
+ * second output bat if the value in l does not occur in r.  The
+ * result is in the same order as l (i.e. r1 is sorted). */
+gdk_return
+BATsubouterjoin(BAT **r1p, BAT **r2p, BAT *l, BAT *r, BAT *sl, BAT *sr, BUN 
estimate)
+{
+       BAT *r1, *r2;
+
+       *r1p = NULL;
+       *r2p = NULL;
+       if (joinparamcheck(l, r, sl, sr, "BATsubleftjoin") == GDK_FAIL)
+               return GDK_FAIL;
+       if (joininitresults(&r1, &r2, estimate != BUN_NONE ? estimate : sl ? 
BATcount(sl) : BATcount(l), "BATsubhashjoin") == GDK_FAIL)
+               return GDK_FAIL;
+       *r1p = r1;
+       *r2p = r2;
+       if (r->tsorted || r->trevsorted)
+               return mergejoin(r1, r2, l, r, sl, sr, 0, 1, 0);
+       return hashjoin(r1, r2, l, r, sl, sr, 0, 1, 0);
+}
diff --git a/monetdb5/modules/kernel/algebra.mx 
b/monetdb5/modules/kernel/algebra.mx
--- a/monetdb5/modules/kernel/algebra.mx
+++ b/monetdb5/modules/kernel/algebra.mx
@@ -705,19 +705,19 @@ comment "This is a join() for which the 
 command join(l:bat[:any_1,:any_2], rl:bat[:any_3,:any_2], 
rh:bat[:any_3,:any_2], li:bit, hi:bit) :bat[:any_1,:any_3] 
 address ALGrangejoin;
 
-command submergejoin(l:bat[:oid,:any_1],r:bat[:oid,:any_1]) 
(:bat[:oid,:oid],:bat[:oid,:oid])
-address ALGsubmergejoin
-comment "Merge join";
-command 
submergejoin(l:bat[:oid,:any_1],r:bat[:oid,:any_1],sl:bat[:oid,:oid],sr:bat[:oid,:oid])
 (:bat[:oid,:oid],:bat[:oid,:oid])
-address ALGsubmergejoin4
-comment "Merge join with candidate lists";
-
-command subhashjoin(l:bat[:oid,:any_1],r:bat[:oid,:any_1]) 
(:bat[:oid,:oid],:bat[:oid,:oid])
-address ALGsubhashjoin
-comment "Hash join";
-command 
subhashjoin(l:bat[:oid,:any_1],r:bat[:oid,:any_1],sl:bat[:oid,:oid],sr:bat[:oid,:oid])
 (:bat[:oid,:oid],:bat[:oid,:oid])
-address ALGsubhashjoin4
-comment "Hash join with candidate lists";
+command subleftjoin(l:bat[:oid,:any_1],r:bat[:oid,:any_1]) 
(:bat[:oid,:oid],:bat[:oid,:oid])
+address ALGsubleftjoin
+comment "Left join";
+command 
subleftjoin(l:bat[:oid,:any_1],r:bat[:oid,:any_1],sl:bat[:oid,:oid],sr:bat[:oid,:oid])
 (:bat[:oid,:oid],:bat[:oid,:oid])
+address ALGsubleftjoin4
+comment "Left join with candidate lists";
+
+command subouterjoin(l:bat[:oid,:any_1],r:bat[:oid,:any_1]) 
(:bat[:oid,:oid],:bat[:oid,:oid])
+address ALGsubouterjoin
+comment "Left outer join";
+command 
subouterjoin(l:bat[:oid,:any_1],r:bat[:oid,:any_1],sl:bat[:oid,:oid],sr:bat[:oid,:oid])
 (:bat[:oid,:oid],:bat[:oid,:oid])
+address ALGsubouterjoin4
+comment "Left outer join with candidate lists";
 
 # @+ Projection operations
 command project(b:bat[:any_1,:any_2]) :bat[:any_1,:void]
@@ -1055,10 +1055,10 @@ algebra_export str ALGthetajoin(int *res
 algebra_export str ALGbandjoin_default(int *result, int *lid, int *rid, ptr 
*minus, ptr *plus);
_______________________________________________
checkin-list mailing list
[email protected]
http://mail.monetdb.org/mailman/listinfo/checkin-list

Reply via email to