Changeset: cbeba89afadb for MonetDB
URL: https://dev.monetdb.org/hg/MonetDB?cmd=changeset;node=cbeba89afadb
Modified Files:
gdk/gdk_join.c
Branch: default
Log Message:
Optimizations for join result size.
Use the smallest of the left and right counts to initialize the join
result size, but with a minimum of 1024. When extending, grow a bit
faster. Always check whether we need to extend.
diffs (truncated from 346 to 300 lines):
diff --git a/gdk/gdk_join.c b/gdk/gdk_join.c
--- a/gdk/gdk_join.c
+++ b/gdk/gdk_join.c
@@ -149,7 +149,9 @@ joininitresults(BAT **r1p, BAT **r2p, BU
/* a BAT cannot grow larger than BUN_MAX */
maxsize = BUN_MAX;
}
- size = estimate == BUN_NONE ? lcnt : estimate;
+ size = estimate == BUN_NONE ? lcnt < rcnt ? lcnt : rcnt : estimate;
+ if (size < 1024)
+ size = 1024;
if (size > maxsize)
size = maxsize;
if ((rkey | semi | only_misses) & nil_on_miss) {
@@ -210,6 +212,37 @@ joininitresults(BAT **r1p, BAT **r2p, BU
#define APPEND(b, o) (((oid *) b->theap.base)[b->batCount++] = (o))
+#define MAYBEEXTEND_PROGRESS(CNT, PROGRESS) \
+ do { \
+ BUN N = (CNT); \
+ if (BATcount(r1) + N > BATcapacity(r1)) { \
+ /* make some extra space by extrapolating how */ \
+ /* much more we need (fraction of l we've seen */ \
+ /* so far is used as the fraction of the */ \
+ /* expected result size we've produced so */ \
+ /* far) */ \
+ BUN newcap = (BUN) ((double) lcnt / (lcnt - (PROGRESS))
* (BATcount(r1) + N) * 1.5); \
+ if (newcap < N + BATcount(r1)) \
+ newcap = N + BATcount(r1) + 1024; \
+ if (newcap > maxsize) \
+ newcap = maxsize; \
+ /* make sure heap.free is set properly before \
+ * extending */ \
+ BATsetcount(r1, BATcount(r1)); \
+ if (BATextend(r1, newcap) != GDK_SUCCEED) \
+ goto bailout; \
+ if (r2) { \
+ BATsetcount(r2, BATcount(r2)); \
+ if (BATextend(r2, newcap) != GDK_SUCCEED) \
+ goto bailout; \
+ assert(BATcapacity(r1) == BATcapacity(r2)); \
+ } \
+ } \
+ } while (0)
+
+#define MAYBEEXTEND(CNT) MAYBEEXTEND_PROGRESS(CNT, lcand ? (BUN)
(lcandend - lcand) : (lend - lstart))
+#define MAYBEEXTEND_NO_CAND(CNT) MAYBEEXTEND_PROGRESS(CNT, lend - lstart)
+
/* Return BATs through r1p and r2p for the case that there is no
* match between l and r, taking all flags into consideration.
*
@@ -824,7 +857,7 @@ mergejoin_int(BAT **r1p, BAT **r2p, BAT
bool nil_matches, BUN estimate, lng t0, bool swapped)
{
BAT *r1, *r2;
- BUN lstart, lend;
+ BUN lstart, lend, lcnt;
BUN rstart, rend;
BUN lscan, rscan; /* opportunistic scan window */
BUN maxsize;
@@ -844,6 +877,7 @@ mergejoin_int(BAT **r1p, BAT **r2p, BAT
lstart = rstart = 0;
lend = BATcount(l);
+ lcnt = lend - lstart;
rend = BATcount(r);
lvals = (const int *) Tloc(l, 0);
rvals = (const int *) Tloc(r, 0);
@@ -1004,27 +1038,7 @@ mergejoin_int(BAT **r1p, BAT **r2p, BAT
}
/* make space: nl values in l match nr values in r, so
* we need to add nl * nr values in the results */
- if (BATcount(r1) + nl * nr > BATcapacity(r1)) {
- /* make some extra space by extrapolating how
- * much more we need (fraction of l we've seen
- * so far is used as the fraction of the
- * expected result size we've produced so
- * far) */
- BUN newcap = (BUN) ((double) BATcount(l) / (BATcount(l)
- (lend - lstart)) * (BATcount(r1) + nl * nr) * 1.1);
- if (newcap < nl * nr + BATcount(r1))
- newcap = nl * nr + BATcount(r1) + 1024;
- if (newcap > maxsize)
- newcap = maxsize;
- /* make sure heap.free is set properly before
- * extending */
- BATsetcount(r1, BATcount(r1));
- if (BATextend(r1, newcap) != GDK_SUCCEED)
- goto bailout;
- BATsetcount(r2, BATcount(r2));
- if (BATextend(r2, newcap) != GDK_SUCCEED)
- goto bailout;
- assert(BATcapacity(r1) == BATcapacity(r2));
- }
+ MAYBEEXTEND_NO_CAND(nl * nr);
/* maintain properties */
if (nl > 1) {
@@ -1120,7 +1134,7 @@ mergejoin_lng(BAT **r1p, BAT **r2p, BAT
bool nil_matches, BUN estimate, lng t0, bool swapped)
{
BAT *r1, *r2;
- BUN lstart, lend;
+ BUN lstart, lend, lcnt;
BUN rstart, rend;
BUN lscan, rscan; /* opportunistic scan window */
BUN maxsize;
@@ -1140,6 +1154,7 @@ mergejoin_lng(BAT **r1p, BAT **r2p, BAT
lstart = rstart = 0;
lend = BATcount(l);
+ lcnt = lend - lstart;
rend = BATcount(r);
lvals = (const lng *) Tloc(l, 0);
rvals = (const lng *) Tloc(r, 0);
@@ -1300,27 +1315,7 @@ mergejoin_lng(BAT **r1p, BAT **r2p, BAT
}
/* make space: nl values in l match nr values in r, so
* we need to add nl * nr values in the results */
- if (BATcount(r1) + nl * nr > BATcapacity(r1)) {
- /* make some extra space by extrapolating how
- * much more we need (fraction of l we've seen
- * so far is used as the fraction of the
- * expected result size we've produced so
- * far) */
- BUN newcap = (BUN) ((double) BATcount(l) / (BATcount(l)
- (lend - lstart)) * (BATcount(r1) + nl * nr) * 1.1);
- if (newcap < nl * nr + BATcount(r1))
- newcap = nl * nr + BATcount(r1) + 1024;
- if (newcap > maxsize)
- newcap = maxsize;
- /* make sure heap.free is set properly before
- * extending */
- BATsetcount(r1, BATcount(r1));
- if (BATextend(r1, newcap) != GDK_SUCCEED)
- goto bailout;
- BATsetcount(r2, BATcount(r2));
- if (BATextend(r2, newcap) != GDK_SUCCEED)
- goto bailout;
- assert(BATcapacity(r1) == BATcapacity(r2));
- }
+ MAYBEEXTEND_NO_CAND(nl * nr);
/* maintain properties */
if (nl > 1) {
@@ -1729,6 +1724,7 @@ mergejoin(BAT **r1p, BAT **r2p, BAT *l,
}
if (nlx > 0) {
if (only_misses) {
+ MAYBEEXTEND(nlx);
if (lcand) {
lskipped |= nlx > 1;
while (nlx > 0) {
@@ -1754,6 +1750,7 @@ mergejoin(BAT **r1p, BAT **r2p, BAT *l,
r2->trevsorted = false;
r2->tkey = false;
}
+ MAYBEEXTEND(nlx);
if (lcand) {
lskipped |= nlx > 1;
while (nlx > 0) {
@@ -2184,29 +2181,7 @@ mergejoin(BAT **r1p, BAT **r2p, BAT *l,
}
/* make space: nl values in l match nr values in r, so
* we need to add nl * nr values in the results */
- if (BATcount(r1) + nl * nr > BATcapacity(r1)) {
- /* make some extra space by extrapolating how
- * much more we need (fraction of l we've seen
- * so far is used as the fraction of the
- * expected result size we've produced so
- * far) */
- BUN newcap = (BUN) ((double) lcnt / (lcnt - (lcand ?
(BUN) (lcandend - lcand) : (lend - lstart))) * (BATcount(r1) + nl * nr) * 1.1);
- if (newcap < nl * nr + BATcount(r1))
- newcap = nl * nr + BATcount(r1) + 1024;
- if (newcap > maxsize)
- newcap = maxsize;
- /* make sure heap.free is set properly before
- * extending */
- BATsetcount(r1, BATcount(r1));
- if (BATextend(r1, newcap) != GDK_SUCCEED)
- goto bailout;
- if (r2) {
- BATsetcount(r2, BATcount(r2));
- if (BATextend(r2, newcap) != GDK_SUCCEED)
- goto bailout;
- assert(BATcapacity(r1) == BATcapacity(r2));
- }
- }
+ MAYBEEXTEND(nl * nr);
/* maintain properties */
if (nl > 1) {
@@ -2422,20 +2397,7 @@ binsearchcand(const oid *cand, BUN lo, B
#define HASHLOOPBODY() \
do { \
- if (BUNlast(r1) == BATcapacity(r1)) { \
- newcap = BATgrows(r1); \
- if (newcap > maxsize) \
- newcap = maxsize; \
- BATsetcount(r1, BATcount(r1)); \
- if (BATextend(r1, newcap) != GDK_SUCCEED) \
- goto bailout; \
- if (r2) { \
- BATsetcount(r2, BATcount(r2)); \
- if (BATextend(r2, newcap) != GDK_SUCCEED) \
- goto bailout; \
- assert(BATcapacity(r1) == BATcapacity(r2)); \
- } \
- } \
+ MAYBEEXTEND(1); \
APPEND(r1, lo); \
if (r2) \
APPEND(r2, ro); \
@@ -2508,7 +2470,7 @@ hashjoin(BAT **r1p, BAT **r2p, BAT *l, B
BUN rb;
BUN rl, rh;
oid rseq;
- BUN nr, newcap;
+ BUN nr;
const char *lvals;
const char *lvars;
int lwidth;
@@ -2658,14 +2620,7 @@ hashjoin(BAT **r1p, BAT **r2p, BAT *l, B
if (nr == 0) {
if (only_misses) {
nr = 1;
- if (BUNlast(r1) == BATcapacity(r1)) {
- newcap = BATgrows(r1);
- if (newcap > maxsize)
- newcap = maxsize;
- BATsetcount(r1, BATcount(r1));
- if (BATextend(r1, newcap) !=
GDK_SUCCEED)
- goto bailout;
- }
+ MAYBEEXTEND(1);
APPEND(r1, lo);
if (lskipped)
r1->tseqbase = oid_nil;
@@ -2674,17 +2629,7 @@ hashjoin(BAT **r1p, BAT **r2p, BAT *l, B
r2->tnil = true;
r2->tnonil = false;
r2->tkey = false;
- if (BUNlast(r1) == BATcapacity(r1)) {
- newcap = BATgrows(r1);
- if (newcap > maxsize)
- newcap = maxsize;
- BATsetcount(r1, BATcount(r1));
- BATsetcount(r2, BATcount(r2));
- if (BATextend(r1, newcap) !=
GDK_SUCCEED ||
- BATextend(r2, newcap) !=
GDK_SUCCEED)
- goto bailout;
- assert(BATcapacity(r1) ==
BATcapacity(r2));
- }
+ MAYBEEXTEND(1);
APPEND(r1, lo);
APPEND(r2, oid_nil);
} else {
@@ -2840,14 +2785,7 @@ hashjoin(BAT **r1p, BAT **r2p, BAT *l, B
if (nr == 0) {
if (only_misses) {
nr = 1;
- if (BUNlast(r1) == BATcapacity(r1)) {
- newcap = BATgrows(r1);
- if (newcap > maxsize)
- newcap = maxsize;
- BATsetcount(r1, BATcount(r1));
- if (BATextend(r1, newcap) !=
GDK_SUCCEED)
- goto bailout;
- }
+ MAYBEEXTEND(1);
APPEND(r1, lo);
if (lskipped)
r1->tseqbase = oid_nil;
@@ -2856,17 +2794,7 @@ hashjoin(BAT **r1p, BAT **r2p, BAT *l, B
r2->tnil = true;
r2->tnonil = false;
r2->tkey = false;
- if (BUNlast(r1) == BATcapacity(r1)) {
- newcap = BATgrows(r1);
- if (newcap > maxsize)
- newcap = maxsize;
- BATsetcount(r1, BATcount(r1));
- BATsetcount(r2, BATcount(r2));
- if (BATextend(r1, newcap) !=
GDK_SUCCEED ||
- BATextend(r2, newcap) !=
GDK_SUCCEED)
- goto bailout;
- assert(BATcapacity(r1) ==
BATcapacity(r2));
- }
+ MAYBEEXTEND(1);
APPEND(r1, lo);
APPEND(r2, oid_nil);
} else {
@@ -2965,7 +2893,6 @@ thetajoin(BAT **r1p, BAT **r2p, BAT *l,
const oid *p;
oid lastr = 0; /* last value inserted into r2 */
BUN n, nr;
- BUN newcap;
oid lo, ro;
int c;
bool lskipped = false; /* whether we skipped values in l */
_______________________________________________
checkin-list mailing list
[email protected]
https://www.monetdb.org/mailman/listinfo/checkin-list