Changeset: 901541512f01 for MonetDB
URL: http://dev.monetdb.org/hg/MonetDB?cmd=changeset;node=901541512f01
Modified Files:
        gdk/gdk_join.c
Branch: default
Log Message:

All sorts of fixes to the new (and still unused) join code.


diffs (truncated from 734 to 300 lines):

diff --git a/gdk/gdk_join.c b/gdk/gdk_join.c
--- a/gdk/gdk_join.c
+++ b/gdk/gdk_join.c
@@ -84,11 +84,13 @@ joininitresults(BAT **r1p, BAT **r2p, BU
        r1->tkey = 1;
        r1->tsorted = 1;
        r1->trevsorted = 1;
+       r1->tdense = 1;
        r2->T->nil = 0;
        r2->T->nonil = 1;
        r2->tkey = 1;
        r2->tsorted = 1;
        r2->trevsorted = 1;
+       r2->tdense = 1;
        *r1p = r1;
        *r2p = r2;
        return GDK_SUCCEED;
@@ -102,38 +104,64 @@ joininitresults(BAT **r1p, BAT **r2p, BU
  * (lo inclusive, hi not inclusive) in rvals/rvars.
  * If last is set, return the index of the first value > v; if last is
  * not set, return the index of the first value >= v.
- * If reverse is -1, the values are sorted in reverse order and hence
+ * If ordering is -1, the values are sorted in reverse order and hence
  * all comparisons are reversed.
  */
 static BUN
 binsearch(const oid *rcand, oid offset,
          const char *rvals, const char *rvars,
          int rwidth, BUN lo, BUN hi, const char *v,
-         int (*cmp)(const void *, const void *), int reverse, int last)
+         int (*cmp)(const void *, const void *), int ordering, int last)
 {
        BUN mid;
        int c;
 
-       assert(reverse == 1 || reverse == -1);
+       assert(ordering == 1 || ordering == -1);
        assert(lo < hi);
 
        --hi;                   /* now hi is inclusive */
-       if ((c = reverse * cmp(VALUE(r, rcand ? rcand[lo] - offset : lo), v)) > 
0 ||
+       if ((c = ordering * cmp(VALUE(r, rcand ? rcand[lo] - offset : lo), v)) 
> 0 ||
            (!last && c == 0))
                return lo;
-       if ((c = reverse * cmp(VALUE(r, rcand ? rcand[hi] - offset : hi), v)) < 
0 ||
+       if ((c = ordering * cmp(VALUE(r, rcand ? rcand[hi] - offset : hi), v)) 
< 0 ||
            (last && c == 0))
                return hi + 1;
+/* the two versions here are equivalent, the first is disabled because
+ * it does more work in the inner loop */
+#if 0
        /* loop invariant:
         * last ? value@lo <= v < value@hi : value@lo < v <= value@hi */
        while (hi - lo > 1) {
                mid = (hi + lo) / 2;
-               if ((c = reverse * cmp(VALUE(r, rcand ? rcand[mid] - offset : 
mid), v)) > 0 ||
+               if ((c = ordering * cmp(VALUE(r, rcand ? rcand[mid] - offset : 
mid), v)) > 0 ||
                    (!last && c == 0))
                        hi = mid;
                else
                        lo = mid;
        }
+#else
+       if (last) {
+               /* loop invariant:
+                * value@lo <= v < value@hi */
+               while (hi - lo > 1) {
+                       mid = (hi + lo) / 2;
+                       if (ordering * cmp(VALUE(r, rcand ? rcand[mid] - offset 
: mid), v) > 0)
+                               hi = mid;
+                       else
+                               lo = mid;
+               }
+       } else {
+               /* loop invariant:
+                * value@lo < v <= value@hi */
+               while (hi - lo > 1) {
+                       mid = (hi + lo) / 2;
+                       if (ordering * cmp(VALUE(r, rcand ? rcand[mid] - offset 
: mid), v) >= 0)
+                               hi = mid;
+                       else
+                               lo = mid;
+               }
+       }
+#endif
        return hi;
 }
 
@@ -160,10 +188,18 @@ mergejoin(BAT *r1, BAT *r2, BAT *l, BAT 
        const char *v, *prev = NULL;
        BUN nl, nr;
        int insert_nil;
+       /* equal_order is set if we can scan both BATs in the same
+        * order, so when both are sorted or both are reverse sorted
+        * -- important to know in order to skip over values; if l is
+        * not sorted, this must be set to 1 and we will always do a
+        * binary search on all of r */
        int equal_order;
-       int lreverse, rreverse;
+       /* [lr]ordering is either 1 or -1 depending on the order of
+        * l/r: it determines the comparison function used */
+       int lordering, rordering;
        oid lv;
        BUN i;
+       int lskipped = 0;       /* whether we skipped values in l */
 
        ALGODEBUG fprintf(stderr, "#mergejoin(l=%s#" BUNFMT "[%s]%s%s,"
                          "r=%s#" BUNFMT "[%s]%s%s,sl=%s#" BUNFMT "%s%s,"
@@ -219,30 +255,26 @@ mergejoin(BAT *r1, BAT *r2, BAT *l, BAT 
 
        if (l->tsorted || l->trevsorted) {
                /* determine opportunistic scan window for l */
-               for (nl = lcand ? (BUN) (lcandend - lcand) : lend - lstart, 
lscan = 4;
+               for (nl = lcand ? (BUN) (lcandend - lcand) : lend - lstart,
+                            lscan = 4;
                     nl > 0;
                     lscan++)
                        nl >>= 1;
-
-               /* equal_order is set if we can scan both BATs in the
-                * same order, so when both are sorted or both are
-                * reverse sorted */
-               equal_order = l->tsorted == r->tsorted || l->trevsorted == 
r->trevsorted;
-               /* [lr]reverse is either 1 or -1 depending on the
-                * order of l/r: it determines the comparison function
-                * used */
-               lreverse = l->tsorted ? 1 : -1;
+               equal_order = l->tsorted == r->tsorted ||
+                       l->trevsorted == r->trevsorted;
+               lordering = l->tsorted && (r->tsorted|| !equal_order) ? 1 : -1;
+               rordering = equal_order ? lordering : -lordering;
        } else {
                /* if l not sorted, we will always use binary search
                 * on r */
                lscan = 0;
                equal_order = 1;
-               lreverse = 1;
+               lordering = 1;
+               rordering = r->tsorted ? 1 : -1;
                /* if l not sorted, we only know for sure that r2 is
                 * key if l is, and that r1 is key if r is */
                r2->tkey = l->tkey != 0;
                r1->tkey = r->tkey != 0;
-
        }
        /* determine opportunistic scan window for r; if l is not
         * sorted this is only used to find range of equal values */
@@ -250,7 +282,6 @@ mergejoin(BAT *r1, BAT *r2, BAT *l, BAT 
             nl > 0;
             rscan++)
                nl >>= 1;
-       rreverse = r->tsorted ? 1 : -1;
 
        while (lcand ? lcand < lcandend : lstart < lend) {
                if (!nil_on_miss && lscan > 0) {
@@ -258,16 +289,36 @@ mergejoin(BAT *r1, BAT *r2, BAT *l, BAT 
                         * lscan from current position in l), use
                         * binary search on l to skip over
                         * non-matching values */
+                       if (equal_order) {
+                               /* next value to match is first in r */
+                               v = VALUE(r, rcand ? rcand[0] - r->hseqbase : 
rstart);
+                       } else {
+                               /* next value to match is last in r */
+                               v = VALUE(r, rcand ? rcandend[-1] - r->hseqbase 
: rend - 1);
+                       }
                        if (lcand) {
                                if (lscan < (BUN) (lcandend - lcand) &&
-                                   lreverse * cmp(VALUE(l, lcand[lscan] - 
l->hseqbase),
-                                                  (v = VALUE(r, rcand ? 
rcand[0] - r->hseqbase : rstart))) < 0)
-                                       lcand += binsearch(lcand, l->hseqbase, 
lvals, lvars, lwidth, lscan, lcandend - lcand, v, cmp, lreverse, 0);
+                                   lordering * cmp(VALUE(l, lcand[lscan] - 
l->hseqbase),
+                                                   v) < 0) {
+                                       lcand += binsearch(lcand, l->hseqbase,
+                                                          lvals, lvars,
+                                                          lwidth, lscan,
+                                                          lcandend - lcand, v,
+                                                          cmp, lordering, 0);
+                                       lskipped = BATcount(r1) > 0;
+                               }
                        } else {
                                if (lscan < lend - lstart &&
-                                   lreverse * cmp(VALUE(l, lstart + lscan),
-                                                  (v = VALUE(r, rcand ? 
rcand[0] - r->hseqbase : rstart))) < 0)
-                                       lstart = binsearch(NULL, 0, lvals, 
lvars, lwidth, lstart + lscan, lend, v, cmp, lreverse, 0);
+                                   lordering * cmp(VALUE(l, lstart + lscan),
+                                                   v) < 0) {
+                                       lstart = binsearch(NULL, 0,
+                                                          lvals, lvars,
+                                                          lwidth,
+                                                          lstart + lscan,
+                                                          lend, v,
+                                                          cmp, lordering, 0);
+                                       lskipped = BATcount(r1) > 0;
+                               }
                        }
                } else if (lscan == 0) {
                        /* always search r completely */
@@ -303,15 +354,15 @@ mergejoin(BAT *r1, BAT *r2, BAT *l, BAT 
                                 * a binary search */
                                if (lscan == 0 ||
                                    (rscan < (BUN) (rcandend - rcand) &&
-                                    rreverse * cmp(v, VALUE(r, rcand[rscan] - 
r->hseqbase)) > 0)) {
+                                    rordering * cmp(v, VALUE(r, rcand[rscan] - 
r->hseqbase)) > 0)) {
                                        /* value too far away in r or
                                         * l not sorted: use binary
                                         * search */
-                                       rcand += binsearch(rcand, r->hseqbase, 
rvals, rvars, rwidth, lscan == 0 ? 0 : rscan, rcandend - rcand, v, cmp, 
rreverse, 0);
+                                       rcand += binsearch(rcand, r->hseqbase, 
rvals, rvars, rwidth, lscan == 0 ? 0 : rscan, rcandend - rcand, v, cmp, 
rordering, 0);
                                } else {
                                        /* scan r for v */
                                        while (rcand < rcandend &&
-                                              rreverse * cmp(v, VALUE(r, 
rcand[0] - r->hseqbase)) > 0)
+                                              rordering * cmp(v, VALUE(r, 
rcand[0] - r->hseqbase)) > 0)
                                                rcand++;
                                }
                        } else {
@@ -322,15 +373,15 @@ mergejoin(BAT *r1, BAT *r2, BAT *l, BAT 
                                 * a binary search */
                                if (lscan == 0 ||
                                    (rscan < rend - rstart &&
-                                    rreverse * cmp(v, VALUE(r, rstart + 
rscan)) > 0)) {
+                                    rordering * cmp(v, VALUE(r, rstart + 
rscan)) > 0)) {
                                        /* value too far away in r or
                                         * l not sorted: use binary
                                         * search */
-                                       rstart = binsearch(NULL, 0, rvals, 
rvars, rwidth, rstart + (lscan == 0 ? 0 : rscan), rend, v, cmp, rreverse, 0);
+                                       rstart = binsearch(NULL, 0, rvals, 
rvars, rwidth, rstart + (lscan == 0 ? 0 : rscan), rend, v, cmp, rordering, 0);
                                } else {
                                        /* scan r for v */
                                        while (rstart < rend &&
-                                              rreverse * cmp(v, VALUE(r, 
rstart)) > 0)
+                                              rordering * cmp(v, VALUE(r, 
rstart)) > 0)
                                                rstart++;
                                }
                        }
@@ -344,15 +395,15 @@ mergejoin(BAT *r1, BAT *r2, BAT *l, BAT 
                                 * sorted (lscan == 0) we'll always do
                                 * a binary search */
                                if (rscan < (BUN) (rcandend - rcand) &&
-                                   rreverse * cmp(v, VALUE(r, rcandend[-rscan 
- 1] - r->hseqbase)) < 0) {
+                                   rordering * cmp(v, VALUE(r, rcandend[-rscan 
- 1] - r->hseqbase)) < 0) {
                                        /* value too far away in r or
                                         * l not sorted: use binary
                                         * search */
-                                       rcandend = rcand + binsearch(rcand, 
r->hseqbase, rvals, rvars, rwidth, 0, (BUN) (rcandend - rcand) - (lscan == 0 ? 
0 : rscan), v, cmp, rreverse, 1);
+                                       rcandend = rcand + binsearch(rcand, 
r->hseqbase, rvals, rvars, rwidth, 0, (BUN) (rcandend - rcand) - (lscan == 0 ? 
0 : rscan), v, cmp, rordering, 1);
                                } else {
                                        /* scan r for v */
                                        while (rcand < rcandend &&
-                                              rreverse * cmp(v, VALUE(r, 
rcandend[-1] - r->hseqbase)) < 0)
+                                              rordering * cmp(v, VALUE(r, 
rcandend[-1] - r->hseqbase)) < 0)
                                                rcandend--;
                                }
                        } else {
@@ -362,15 +413,15 @@ mergejoin(BAT *r1, BAT *r2, BAT *l, BAT 
                                 * sorted (lscan == 0) we'll always do
                                 * a binary search */
                                if (rscan < rend - rstart &&
-                                   rreverse * cmp(v, VALUE(r, rend - rscan - 
1)) < 0) {
+                                   rordering * cmp(v, VALUE(r, rend - rscan - 
1)) < 0) {
                                        /* value too far away in r or
                                         * l not sorted: use binary
                                         * search */
-                                       rend = binsearch(NULL, 0, rvals, rvars, 
rwidth, rstart, rend - (lscan == 0 ? 0 : rscan), v, cmp, rreverse, 1);
+                                       rend = binsearch(NULL, 0, rvals, rvars, 
rwidth, rstart, rend - (lscan == 0 ? 0 : rscan), v, cmp, rordering, 1);
                                } else {
                                        /* scan r for v */
                                        while (rstart < rend &&
-                                              rreverse * cmp(v, VALUE(r, rend 
- 1)) < 0)
+                                              rordering * cmp(v, VALUE(r, rend 
- 1)) < 0)
                                                rend--;
                                }
                        }
@@ -388,7 +439,7 @@ mergejoin(BAT *r1, BAT *r2, BAT *l, BAT 
                                    cmp(v, VALUE(r, rcand[rscan] - 
r->hseqbase)) == 0) {
                                        /* range too large: use binary
                                         * search */
-                                       nr = binsearch(rcand, r->hseqbase, 
rvals, rvars, rwidth, rscan, rcandend - rcand, v, cmp, rreverse, 1);
+                                       nr = binsearch(rcand, r->hseqbase, 
rvals, rvars, rwidth, rscan, rcandend - rcand, v, cmp, rordering, 1);
                                        rcand += nr;
                                } else {
                                        /* scan r for end of range */
@@ -406,7 +457,7 @@ mergejoin(BAT *r1, BAT *r2, BAT *l, BAT 
                                    cmp(v, VALUE(r, rstart + rscan)) == 0) {
                                        /* range too large: use binary
                                         * search */
-                                       nr = binsearch(NULL, 0, rvals, rvars, 
rwidth, rstart + rscan, rend, v, cmp, rreverse, 1);
+                                       nr = binsearch(NULL, 0, rvals, rvars, 
rwidth, rstart + rscan, rend, v, cmp, rordering, 1);
                                        nr -= rstart;
                                        rstart += nr;
                                } else {
@@ -425,7 +476,7 @@ mergejoin(BAT *r1, BAT *r2, BAT *l, BAT 
                                 * a binary search */
                                if (rscan < (BUN) (rcandend - rcand) &&
                                    cmp(v, VALUE(r, rcandend[-rscan - 1] - 
r->hseqbase)) == 0) {
-                                       nr = binsearch(rcand, r->hseqbase, 
rvals, rvars, rwidth, 0, (BUN) (rcandend - rcand) - rscan, v, cmp, rreverse, 0);
+                                       nr = binsearch(rcand, r->hseqbase, 
rvals, rvars, rwidth, 0, (BUN) (rcandend - rcand) - rscan, v, cmp, rordering, 
0);
                                        nr = (BUN) (rcandend - rcand) - nr;
                                        rcandend -= nr;
_______________________________________________
checkin-list mailing list
[email protected]
http://mail.monetdb.org/mailman/listinfo/checkin-list

Reply via email to