Changeset: 1dad1e7691a0 for MonetDB
URL: http://dev.monetdb.org/hg/MonetDB?cmd=changeset;node=1dad1e7691a0
Modified Files:
gdk/gdk_join.c
gdk/gdk_private.h
gdk/gdk_select.c
Branch: Jul2015
Log Message:
Calculate maximum join result size and use that to cap growth of BATs.
This should fix bug 3791.
diffs (truncated from 359 to 300 lines):
diff --git a/gdk/gdk_join.c b/gdk/gdk_join.c
--- a/gdk/gdk_join.c
+++ b/gdk/gdk_join.c
@@ -108,10 +108,41 @@ joinparamcheck(BAT *l, BAT *r1, BAT *r2,
}
/* Create the result bats for a join. */
-static gdk_return
-joininitresults(BAT **r1p, BAT **r2p, BUN size, const char *func)
+static BUN
+joininitresults(BAT **r1p, BAT **r2p, BUN lcnt, BUN rcnt, int lkey, int rkey,
int semi, int nil_on_miss, BUN estimate, const char *func)
{
BAT *r1, *r2;
+ BUN maxsize, size;
+
+ lkey |= lcnt <= 1;
+ rkey |= rcnt <= 1;
+
+ if (rkey | semi) {
+ /* each entry left matches at most one on right, in
+ * case nil_on_miss is also set, each entry matches
+ * exactly one */
+ maxsize = lcnt;
+ } else if (lkey) {
+ /* each entry on right is matched at most once */
+ if (nil_on_miss) {
+ /* one entry left could match all right, and
+ * all other entries left match nil */
+ maxsize = lcnt + rcnt - 1;
+ } else {
+ maxsize = rcnt;
+ }
+ } else {
+ /* in the worst case we have a full cross product */
+ if (lcnt == 0 || rcnt == 0)
+ maxsize = nil_on_miss ? lcnt : 0;
+ else if (BUN_MAX / lcnt >= rcnt)
+ maxsize = BUN_MAX;
+ else
+ maxsize = lcnt * rcnt;
+ }
+ size = estimate == BUN_NONE ? lcnt : estimate;
+ if (size > maxsize)
+ size = maxsize;
r1 = BATnew(TYPE_void, TYPE_oid, size, TRANSIENT);
r2 = BATnew(TYPE_void, TYPE_oid, size, TRANSIENT);
@@ -120,7 +151,7 @@ joininitresults(BAT **r1p, BAT **r2p, BU
BBPreclaim(r2);
*r1p = *r2p = NULL;
GDKerror("%s: cannot create output BATs.\n", func);
- return GDK_FAIL;
+ return BUN_NONE;
}
BATseqbase(r1, 0);
BATseqbase(r2, 0);
@@ -138,7 +169,7 @@ joininitresults(BAT **r1p, BAT **r2p, BU
r2->tdense = 1;
*r1p = r1;
*r2p = r2;
- return GDK_SUCCEED;
+ return maxsize;
}
#define VALUE(s, x) (s##vars ? \
@@ -809,7 +840,8 @@ mergejoin_void(BAT *r1, BAT *r2, BAT *l,
*/
static gdk_return
mergejoin(BAT *r1, BAT *r2, BAT *l, BAT *r, BAT *sl, BAT *sr,
- int nil_matches, int nil_on_miss, int semi, int must_match)
+ int nil_matches, int nil_on_miss, int semi, int must_match,
+ BUN maxsize)
{
BUN lstart, lend, lcnt;
const oid *lcand, *lcandend;
@@ -1537,6 +1569,8 @@ mergejoin(BAT *r1, BAT *r2, BAT *l, BAT
BUN newcap = (BUN) ((double) total / (total - (lcand ?
(BUN) (lcandend - lcand) : (lend - lstart))) * (BATcount(r1) + nl * nr) * 1.1);
if (newcap < nl * nr + BATcount(r1))
newcap = nl * nr + BATcount(r1) + 1024;
+ if (newcap > maxsize)
+ newcap = maxsize;
/* make sure heap.free is set properly before
* extending */
BATsetcount(r1, BATcount(r1));
@@ -1724,6 +1758,8 @@ binsearchcand(const oid *cand, BUN lo, B
do { \
if (BUNlast(r1) == BATcapacity(r1)) { \
newcap = BATgrows(r1); \
+ if (newcap > maxsize) \
+ newcap = maxsize; \
BATsetcount(r1, BATcount(r1)); \
BATsetcount(r2, BATcount(r2)); \
if (BATextend(r1, newcap) != GDK_SUCCEED || \
@@ -1752,7 +1788,7 @@ binsearchcand(const oid *cand, BUN lo, B
simple_EQ(v, BUNtloc(bi, hb), TYPE))
static gdk_return
-hashjoin(BAT *r1, BAT *r2, BAT *l, BAT *r, BAT *sl, BAT *sr, int nil_matches,
int nil_on_miss, int semi, int must_match)
+hashjoin(BAT *r1, BAT *r2, BAT *l, BAT *r, BAT *sl, BAT *sr, int nil_matches,
int nil_on_miss, int semi, int must_match, BUN maxsize)
{
BUN lstart, lend, lcnt;
const oid *lcand = NULL, *lcandend = NULL;
@@ -1916,6 +1952,8 @@ hashjoin(BAT *r1, BAT *r2, BAT *l, BAT *
r2->tkey = 0;
if (BUNlast(r1) == BATcapacity(r1)) {
newcap = BATgrows(r1);
+ if (newcap > maxsize)
+ newcap = maxsize;
BATsetcount(r1, BATcount(r1));
BATsetcount(r2, BATcount(r2));
if (BATextend(r1, newcap) !=
GDK_SUCCEED ||
@@ -2111,7 +2149,7 @@ hashjoin(BAT *r1, BAT *r2, BAT *l, BAT *
#define MASK_NE (MASK_LT | MASK_GT)
static gdk_return
-thetajoin(BAT *r1, BAT *r2, BAT *l, BAT *r, BAT *sl, BAT *sr, int opcode)
+thetajoin(BAT *r1, BAT *r2, BAT *l, BAT *r, BAT *sl, BAT *sr, int opcode, BUN
maxsize)
{
BUN lstart, lend, lcnt;
const oid *lcand = NULL, *lcandend = NULL;
@@ -2270,6 +2308,8 @@ thetajoin(BAT *r1, BAT *r2, BAT *l, BAT
continue;
if (BUNlast(r1) == BATcapacity(r1)) {
newcap = BATgrows(r1);
+ if (newcap > maxsize)
+ newcap = maxsize;
BATsetcount(r1, BATcount(r1));
BATsetcount(r2, BATcount(r2));
if (BATextend(r1, newcap) != GDK_SUCCEED ||
@@ -2335,7 +2375,7 @@ thetajoin(BAT *r1, BAT *r2, BAT *l, BAT
static gdk_return
bandjoin(BAT *r1, BAT *r2, BAT *l, BAT *r, BAT *sl, BAT *sr,
- const void *c1, const void *c2, int li, int hi)
+ const void *c1, const void *c2, int li, int hi, BUN maxsize)
{
BUN lstart, lend, lcnt;
const oid *lcand = NULL, *lcandend = NULL;
@@ -2671,6 +2711,8 @@ bandjoin(BAT *r1, BAT *r2, BAT *l, BAT *
}
if (BUNlast(r1) == BATcapacity(r1)) {
newcap = BATgrows(r1);
+ if (newcap > maxsize)
+ newcap = maxsize;
BATsetcount(r1, BATcount(r1));
BATsetcount(r2, BATcount(r2));
if (BATextend(r1, newcap) != GDK_SUCCEED ||
@@ -2739,7 +2781,7 @@ static gdk_return
subleftjoin(BAT **r1p, BAT **r2p, BAT *l, BAT *r, BAT *sl, BAT *sr, int
nil_matches, BUN estimate, int nil_on_miss, int semi, int must_match, const
char *name)
{
BAT *r1, *r2;
- BUN lcount, rcount;
+ BUN lcount, rcount, maxsize;
*r1p = NULL;
*r2p = NULL;
@@ -2769,7 +2811,7 @@ subleftjoin(BAT **r1p, BAT **r2p, BAT *l
return GDK_SUCCEED;
}
- if (joininitresults(&r1, &r2, estimate != BUN_NONE ? estimate : sl ?
BATcount(sl) : BATcount(l), name) != GDK_SUCCEED)
+ if ((maxsize = joininitresults(&r1, &r2, lcount, rcount, l->tkey,
r->tkey, semi, nil_on_miss, estimate, name)) == BUN_NONE)
return GDK_FAIL;
*r1p = r1;
*r2p = r2;
@@ -2782,9 +2824,9 @@ subleftjoin(BAT **r1p, BAT **r2p, BAT *l
lcount < 1024 ||
BATcount(r) * (Tsize(r) + (r->T->vheap ? r->T->vheap->size : 0) +
2 * sizeof(BUN)) > GDK_mem_maxsize / (GDKnr_threads ? GDKnr_threads : 1)))
return mergejoin(r1, r2, l, r, sl, sr, nil_matches,
- nil_on_miss, semi, must_match);
+ nil_on_miss, semi, must_match, maxsize);
return hashjoin(r1, r2, l, r, sl, sr, nil_matches,
- nil_on_miss, semi, must_match);
+ nil_on_miss, semi, must_match, maxsize);
}
/* Perform an equi-join over l and r. Returns two new, aligned,
@@ -2836,6 +2878,7 @@ gdk_return
BATsubthetajoin(BAT **r1p, BAT **r2p, BAT *l, BAT *r, BAT *sl, BAT *sr, int
op, int nil_matches, BUN estimate)
{
BAT *r1, *r2;
+ BUN maxsize;
int opcode = 0;
/* encode operator as a bit mask into opcode */
@@ -2866,14 +2909,12 @@ BATsubthetajoin(BAT **r1p, BAT **r2p, BA
*r2p = NULL;
if (joinparamcheck(l, r, NULL, sl, sr, "BATsubthetajoin") !=
GDK_SUCCEED)
return GDK_FAIL;
- if (joininitresults(&r1, &r2,
- estimate != BUN_NONE ? estimate : sl ? BATcount(sl)
: BATcount(l),
- "BATsubthetajoin") != GDK_SUCCEED)
+ if ((maxsize = joininitresults(&r1, &r2, sl ? BATcount(sl) :
BATcount(l), sr ? BATcount(sr) : BATcount(r), 0, 0, 0, 0, estimate,
"BATsubthetajoin")) == BUN_NONE)
return GDK_FAIL;
*r1p = r1;
*r2p = r2;
- return thetajoin(r1, r2, l, r, sl, sr, opcode);
+ return thetajoin(r1, r2, l, r, sl, sr, opcode, maxsize);
}
gdk_return
@@ -2882,6 +2923,7 @@ BATsubjoin(BAT **r1p, BAT **r2p, BAT *l,
BAT *r1, *r2;
BUN lcount, rcount, lpcount, rpcount;
BUN lsize, rsize;
+ BUN maxsize;
int lhash, rhash;
bat lparent, rparent;
int swap;
@@ -2913,7 +2955,7 @@ BATsubjoin(BAT **r1p, BAT **r2p, BAT *l,
*r2p = r2;
return GDK_SUCCEED;
}
- if (joininitresults(&r1, &r2, estimate != BUN_NONE ? estimate : sl ?
BATcount(sl) : BATcount(l), "BATsubjoin") != GDK_SUCCEED)
+ if ((maxsize = joininitresults(&r1, &r2, lcount, rcount, l->tkey,
r->tkey, 0, 0, estimate, "BATsubjoin")) == BUN_NONE)
return GDK_FAIL;
*r1p = r1;
*r2p = r2;
@@ -2949,9 +2991,9 @@ BATsubjoin(BAT **r1p, BAT **r2p, BAT *l,
} else if ((l->tsorted || l->trevsorted) && (r->tsorted ||
r->trevsorted)) {
/* both sorted, smallest on left */
if (BATcount(l) <= BATcount(r))
- return mergejoin(r1, r2, l, r, sl, sr, nil_matches, 0,
0, 0);
+ return mergejoin(r1, r2, l, r, sl, sr, nil_matches, 0,
0, 0, maxsize);
else
- return mergejoin(r2, r1, r, l, sr, sl, nil_matches, 0,
0, 0);
+ return mergejoin(r2, r1, r, l, sr, sl, nil_matches, 0,
0, 0, maxsize);
} else if (lhash && rhash) {
/* both have hash, smallest on right */
swap = lcount < rcount;
@@ -2967,14 +3009,14 @@ BATsubjoin(BAT **r1p, BAT **r2p, BAT *l,
* "large" and the smaller of the two isn't too large
* (i.e. prefer hash over binary search, but only if
* the hash table doesn't cause thrashing) */
- return mergejoin(r2, r1, r, l, sr, sl, nil_matches, 0, 0, 0);
+ return mergejoin(r2, r1, r, l, sr, sl, nil_matches, 0, 0, 0,
maxsize);
} else if ((r->tsorted || r->trevsorted) &&
(r->ttype == TYPE_void || lcount < 1024 || MIN(lsize, rsize)
> mem_size)) {
/* only right is sorted, don't swap; but only if left
* is "large" and the smaller of the two isn't too
* large (i.e. prefer hash over binary search, but
* only if the hash table doesn't cause thrashing) */
- return mergejoin(r1, r2, l, r, sl, sr, nil_matches, 0, 0, 0);
+ return mergejoin(r1, r2, l, r, sl, sr, nil_matches, 0, 0, 0,
maxsize);
} else if ((l->batPersistence == PERSISTENT ||
(lparent != 0 &&
BBPquickdesc(abs(lparent), 0)->batPersistence ==
PERSISTENT)) &&
@@ -2999,9 +3041,9 @@ BATsubjoin(BAT **r1p, BAT **r2p, BAT *l,
swap = 1;
}
if (swap) {
- return hashjoin(r2, r1, r, l, sr, sl, nil_matches, 0, 0, 0);
+ return hashjoin(r2, r1, r, l, sr, sl, nil_matches, 0, 0, 0,
maxsize);
} else {
- return hashjoin(r1, r2, l, r, sl, sr, nil_matches, 0, 0, 0);
+ return hashjoin(r1, r2, l, r, sl, sr, nil_matches, 0, 0, 0,
maxsize);
}
}
@@ -3010,19 +3052,18 @@ BATsubbandjoin(BAT **r1p, BAT **r2p, BAT
const void *c1, const void *c2, int li, int hi, BUN estimate)
{
BAT *r1, *r2;
+ BUN maxsize;
*r1p = NULL;
*r2p = NULL;
if (joinparamcheck(l, r, NULL, sl, sr, "BATsubbandjoin") != GDK_SUCCEED)
return GDK_FAIL;
- if (joininitresults(&r1, &r2,
- estimate != BUN_NONE ? estimate : sl ? BATcount(sl)
: BATcount(l),
- "BATsubbandjoin") != GDK_SUCCEED)
+ if ((maxsize = joininitresults(&r1, &r2, sl ? BATcount(sl) :
BATcount(l), sr ? BATcount(sr) : BATcount(r), 0, 0, 0, 0, estimate,
"BATsubbandjoin")) == BUN_NONE)
return GDK_FAIL;
*r1p = r1;
*r2p = r2;
- return bandjoin(r1, r2, l, r, sl, sr, c1, c2, li, hi);
+ return bandjoin(r1, r2, l, r, sl, sr, c1, c2, li, hi, maxsize);
}
gdk_return
@@ -3030,21 +3071,20 @@ BATsubrangejoin(BAT **r1p, BAT **r2p, BA
BAT *sl, BAT *sr, int li, int hi, BUN estimate)
{
BAT *r1, *r2;
+ BUN maxsize;
*r1p = NULL;
*r2p = NULL;
if (joinparamcheck(l, rl, rh, sl, sr, "BATsubrangejoin") != GDK_SUCCEED)
return GDK_FAIL;
- if (joininitresults(&r1, &r2,
- estimate != BUN_NONE ? estimate : sl ? BATcount(sl)
: BATcount(l),
- "BATsubrangejoin") != GDK_SUCCEED)
+ if ((maxsize = joininitresults(&r1, &r2, sl ? BATcount(sl) :
BATcount(l), sr ? BATcount(sr) : BATcount(rl), 0, 0, 0, 0, estimate,
"BATsubrangejoin")) == BUN_NONE)
return GDK_FAIL;
*r1p = r1;
*r2p = r2;
_______________________________________________
checkin-list mailing list
[email protected]
https://www.monetdb.org/mailman/listinfo/checkin-list