Changeset: 1e7118ddd7e3 for MonetDB
URL: http://dev.monetdb.org/hg/MonetDB?cmd=changeset;node=1e7118ddd7e3
Modified Files:
gdk/gdk_join.c
Branch: default
Log Message:
Prefer hashjoin over one-sided mergejoin in certain conditions.
When a sorted and an unsorted column are joined, use hashjoin except
when the unsorted column is very small (to be determined whether this
is a good idea, and what the limit should be if it is), or both
columns are very large (hash thrashing is likely worse than lots of
binary searches).
diffs (201 lines):
diff --git a/gdk/gdk_join.c b/gdk/gdk_join.c
--- a/gdk/gdk_join.c
+++ b/gdk/gdk_join.c
@@ -1733,6 +1733,50 @@ thetajoin(BAT *r1, BAT *r2, BAT *l, BAT
return GDK_FAIL;
}
+/* Make the implementation choices for various left joins. */
+static gdk_return
+subleftjoin(BAT **r1p, BAT **r2p, BAT *l, BAT *r, BAT *sl, BAT *sr, int
nil_matches, BUN estimate, int nil_on_miss, int semi, int must_match, const
char *name)
+{
+ BAT *r1, *r2;
+ BUN lcount, rcount;
+
+ *r1p = NULL;
+ *r2p = NULL;
+ if (joinparamcheck(l, r, sl, sr, name) == GDK_FAIL)
+ return GDK_FAIL;
+
+ lcount = BATcount(l);
+ if (sl)
+ lcount = MIN(lcount, BATcount(sl));
+ rcount = BATcount(r);
+ if (sr)
+ rcount = MIN(rcount, BATcount(sr));
+ if (lcount == 0 || rcount == 0) {
+ r1 = BATnew(TYPE_void, TYPE_void, 0);
+ BATseqbase(r1, 0);
+ BATseqbase(BATmirror(r1), 0);
+ r2 = BATnew(TYPE_void, TYPE_void, 0);
+ BATseqbase(r2, 0);
+ BATseqbase(BATmirror(r2), 0);
+ *r1p = r1;
+ *r2p = r2;
+ return GDK_SUCCEED;
+ }
+
+ if (joininitresults(&r1, &r2, estimate != BUN_NONE ? estimate : sl ?
BATcount(sl) : BATcount(l), name) == GDK_FAIL)
+ return GDK_FAIL;
+ *r1p = r1;
+ *r2p = r2;
+ if ((r->tsorted || r->trevsorted) &&
+ (r->ttype == TYPE_void ||
+ lcount < 1024 ||
+ BATcount(r) * (Tsize(r) + (r->T->vheap ? r->T->vheap->size : 0) +
2 * sizeof(BUN)) > GDK_mem_maxsize / (GDKnr_threads ? GDKnr_threads : 1)))
+ return mergejoin(r1, r2, l, r, sl, sr, nil_matches,
+ nil_on_miss, semi, must_match);
+ return hashjoin(r1, r2, l, r, sl, sr, nil_matches,
+ nil_on_miss, semi, must_match);
+}
+
/* Perform an equi-join over l and r. Returns two new, aligned,
* dense-headed bats with in the tail the oids (head column values) of
* matching tuples. The result is in the same order as l (i.e. r1 is
@@ -1740,19 +1784,8 @@ thetajoin(BAT *r1, BAT *r2, BAT *l, BAT
gdk_return
BATsubleftjoin(BAT **r1p, BAT **r2p, BAT *l, BAT *r, BAT *sl, BAT *sr, int
nil_matches, BUN estimate)
{
- BAT *r1, *r2;
-
- *r1p = NULL;
- *r2p = NULL;
- if (joinparamcheck(l, r, sl, sr, "BATsubleftjoin") == GDK_FAIL)
- return GDK_FAIL;
- if (joininitresults(&r1, &r2, estimate != BUN_NONE ? estimate : sl ?
BATcount(sl) : BATcount(l), "BATsubleftjoin") == GDK_FAIL)
- return GDK_FAIL;
- *r1p = r1;
- *r2p = r2;
- if (r->tsorted || r->trevsorted)
- return mergejoin(r1, r2, l, r, sl, sr, nil_matches, 0, 0, 0);
- return hashjoin(r1, r2, l, r, sl, sr, nil_matches, 0, 0, 0);
+ return subleftjoin(r1p, r2p, l, r, sl, sr, nil_matches, estimate,
+ 0, 0, 0, "BATsubleftjoin");
}
/* Perform an equi-join over l and r. Returns two new, aligned,
@@ -1762,19 +1795,8 @@ BATsubleftjoin(BAT **r1p, BAT **r2p, BAT
gdk_return
BATsubleftfetchjoin(BAT **r1p, BAT **r2p, BAT *l, BAT *r, BAT *sl, BAT *sr,
int nil_matches, BUN estimate)
{
- BAT *r1, *r2;
-
- *r1p = NULL;
- *r2p = NULL;
- if (joinparamcheck(l, r, sl, sr, "BATsubleftjoin") == GDK_FAIL)
- return GDK_FAIL;
- if (joininitresults(&r1, &r2, estimate != BUN_NONE ? estimate : sl ?
BATcount(sl) : BATcount(l), "BATsubleftjoin") == GDK_FAIL)
- return GDK_FAIL;
- *r1p = r1;
- *r2p = r2;
- if (r->tsorted || r->trevsorted)
- return mergejoin(r1, r2, l, r, sl, sr, nil_matches, 0, 0, 1);
- return hashjoin(r1, r2, l, r, sl, sr, nil_matches, 0, 0, 1);
+ return subleftjoin(r1p, r2p, l, r, sl, sr, nil_matches, estimate,
+ 0, 0, 1, "BATsubleftfetchjoin");
}
/* Performs a left outer join over l and r. Returns two new, aligned,
@@ -1785,19 +1807,8 @@ BATsubleftfetchjoin(BAT **r1p, BAT **r2p
gdk_return
BATsubouterjoin(BAT **r1p, BAT **r2p, BAT *l, BAT *r, BAT *sl, BAT *sr, int
nil_matches, BUN estimate)
{
- BAT *r1, *r2;
-
- *r1p = NULL;
- *r2p = NULL;
- if (joinparamcheck(l, r, sl, sr, "BATsubouterjoin") == GDK_FAIL)
- return GDK_FAIL;
- if (joininitresults(&r1, &r2, estimate != BUN_NONE ? estimate : sl ?
BATcount(sl) : BATcount(l), "BATsubouterjoin") == GDK_FAIL)
- return GDK_FAIL;
- *r1p = r1;
- *r2p = r2;
- if (r->tsorted || r->trevsorted)
- return mergejoin(r1, r2, l, r, sl, sr, nil_matches, 1, 0, 0);
- return hashjoin(r1, r2, l, r, sl, sr, nil_matches, 1, 0, 0);
+ return subleftjoin(r1p, r2p, l, r, sl, sr, nil_matches, estimate,
+ 1, 0, 0, "BATsubouterjoin");
}
/* Perform a semi-join over l and r. Returns two new, aligned,
@@ -1807,19 +1818,8 @@ BATsubouterjoin(BAT **r1p, BAT **r2p, BA
gdk_return
BATsubsemijoin(BAT **r1p, BAT **r2p, BAT *l, BAT *r, BAT *sl, BAT *sr, int
nil_matches, BUN estimate)
{
- BAT *r1, *r2;
-
- *r1p = NULL;
- *r2p = NULL;
- if (joinparamcheck(l, r, sl, sr, "BATsubsemijoin") == GDK_FAIL)
- return GDK_FAIL;
- if (joininitresults(&r1, &r2, estimate != BUN_NONE ? estimate : sl ?
BATcount(sl) : BATcount(l), "BATsubsemijoin") == GDK_FAIL)
- return GDK_FAIL;
- *r1p = r1;
- *r2p = r2;
- if (r->tsorted || r->trevsorted)
- return mergejoin(r1, r2, l, r, sl, sr, nil_matches, 0, 1, 0);
- return hashjoin(r1, r2, l, r, sl, sr, nil_matches, 0, 1, 0);
+ return subleftjoin(r1p, r2p, l, r, sl, sr, nil_matches, estimate,
+ 0, 1, 0, "BATsubsemijoin");
}
gdk_return
@@ -1883,7 +1883,9 @@ BATsubjoin(BAT **r1p, BAT **r2p, BAT *l,
{
BAT *r1, *r2;
BUN lcount, rcount;
+ BUN lsize, rsize;
int swap;
+ size_t mem_size;
*r1p = NULL;
*r2p = NULL;
@@ -1911,24 +1913,40 @@ BATsubjoin(BAT **r1p, BAT **r2p, BAT *l,
*r1p = r1;
*r2p = r2;
swap = 0;
+
+ /* some statistics to help us decide */
+ lsize = BATcount(l) * (Tsize(l) + (l->T->vheap ? l->T->vheap->size : 0)
+ 2 * sizeof(BUN));
+ rsize = BATcount(r) * (Tsize(r) + (r->T->vheap ? r->T->vheap->size : 0)
+ 2 * sizeof(BUN));
+ mem_size = GDK_mem_maxsize / (GDKnr_threads ? GDKnr_threads : 1);
+
if ((l->tsorted || l->trevsorted) && (r->tsorted || r->trevsorted)) {
- /* both sorted, don't swap */
- return mergejoin(r1, r2, l, r, sl, sr, nil_matches, 0, 0, 0);
+ /* both sorted, smallest on left */
+ if (BATcount(l) <= BATcount(r))
+ return mergejoin(r1, r2, l, r, sl, sr, nil_matches, 0,
0, 0);
+ else
+ return mergejoin(r2, r1, r, l, sr, sl, nil_matches, 0,
0, 0);
} else if (l->T->hash && r->T->hash) {
/* both have hash, smallest on right */
- if (lcount < rcount)
- swap = 1;
+ swap = lcount < rcount;
} else if (l->T->hash) {
/* only left has hash, swap */
swap = 1;
} else if (r->T->hash) {
/* only right has hash, don't swap */
swap = 0;
- } else if (l->tsorted || l->trevsorted) {
- /* only left is sorted, swap */
+ } else if ((l->tsorted || l->trevsorted) &&
+ (l->ttype == TYPE_void || rcount < 1024 || MIN(lsize, rsize)
> mem_size)) {
+ /* only left is sorted, swap; but only if right is
+ * "large" and the smaller of the two isn't too large
+ * (i.e. prefer hash over binary search, but only if
+ * the hash table doesn't cause thrashing) */
return mergejoin(r2, r1, r, l, sr, sl, nil_matches, 0, 0, 0);
- } else if (r->tsorted || r->trevsorted) {
- /* only right is sorted, don't swap */
+ } else if ((r->tsorted || r->trevsorted) &&
+ (r->ttype == TYPE_void || lcount < 1024 || MIN(lsize, rsize)
> mem_size)) {
+ /* only right is sorted, don't swap; but only if left
+ * is "large" and the smaller of the two isn't too
+ * large (i.e. prefer hash over binary search, but
+ * only if the hash table doesn't cause thrashing) */
return mergejoin(r1, r2, l, r, sl, sr, nil_matches, 0, 0, 0);
} else if (BATcount(l) < BATcount(r)) {
/* no hashes, not sorted, create hash on smallest BAT */
_______________________________________________
checkin-list mailing list
[email protected]
https://www.monetdb.org/mailman/listinfo/checkin-list