Changeset: 56466eee11e3 for MonetDB
URL: https://dev.monetdb.org/hg/MonetDB/rev/56466eee11e3
Modified Files:
gdk/gdk_aggr.c
gdk/gdk_imprints.c
sql/server/rel_optimizer.c
Branch: default
Log Message:
Merge with Oct2020 branch.
diffs (197 lines):
diff --git a/gdk/gdk_aggr.c b/gdk/gdk_aggr.c
--- a/gdk/gdk_aggr.c
+++ b/gdk/gdk_aggr.c
@@ -3777,7 +3777,8 @@ static BAT *
doBATgroupquantile(BAT *b, BAT *g, BAT *e, BAT *s, int tp, double quantile,
bool skip_nils, bool abort_on_error, bool average)
{
- bool freeb = false, freeg = false;
+ BAT *origb = b;
+ BAT *origg = g;
oid min, max;
BUN ngrp;
BUN nils = 0;
@@ -3844,12 +3845,10 @@ doBATgroupquantile(BAT *b, BAT *g, BAT *
b = BATproject(s, b);
if (b == NULL)
return NULL;
- freeb = true;
if (g) {
g = BATproject(s, g);
if (g == NULL)
goto bunins_failed;
- freeg = true;
}
}
@@ -3869,27 +3868,25 @@ doBATgroupquantile(BAT *b, BAT *g, BAT *
else
bn = COLcopy(b, tp, false, TRANSIENT);
BAThseqbase(bn, g->tseqbase); /* deals with NULL */
- if (freeb)
+ if (b != origb)
BBPunfix(b->batCacheid);
- if (freeg)
+ if (g != origg)
BBPunfix(g->batCacheid);
return bn;
}
if (BATsort(&t1, &t2, NULL, g, NULL, NULL, false, false, false)
!= GDK_SUCCEED)
goto bunins_failed;
- if (freeg)
+ if (g != origg)
BBPunfix(g->batCacheid);
g = t1;
- freeg = true;
if (BATsort(&t1, NULL, NULL, b, t2, g, false, false, false) !=
GDK_SUCCEED) {
BBPunfix(t2->batCacheid);
goto bunins_failed;
}
- if (freeb)
+ if (b != origb)
BBPunfix(b->batCacheid);
b = t1;
- freeb = true;
BBPunfix(t2->batCacheid);
if (average)
@@ -4064,7 +4061,7 @@ doBATgroupquantile(BAT *b, BAT *g, BAT *
goto bunins_failed;
}
- if (freeb)
+ if (b != origb)
BBPunfix(b->batCacheid);
bn->tkey = BATcount(bn) <= 1;
@@ -4076,15 +4073,15 @@ doBATgroupquantile(BAT *b, BAT *g, BAT *
"e=" ALGOOPTBATFMT ",s=" ALGOOPTBATFMT
",quantile=%g,average=%s -> " ALGOOPTBATFMT
"; start " OIDFMT ", count " BUNFMT " (" LLFMT " usec)\n",
- ALGOBATPAR(b), ALGOOPTBATPAR(g), ALGOOPTBATPAR(e),
+ ALGOBATPAR(origb), ALGOOPTBATPAR(origg), ALGOOPTBATPAR(e),
ALGOOPTBATPAR(s), quantile, average ? "true" : "false",
ALGOOPTBATPAR(bn), ci.seq, ncand, GDKusec() - t0);
return bn;
bunins_failed:
- if (freeb)
+ if (b && b != origb)
BBPunfix(b->batCacheid);
- if (freeg)
+ if (g && g != origg)
BBPunfix(g->batCacheid);
if (bn)
BBPunfix(bn->batCacheid);
diff --git a/gdk/gdk_imprints.c b/gdk/gdk_imprints.c
--- a/gdk/gdk_imprints.c
+++ b/gdk/gdk_imprints.c
@@ -18,6 +18,107 @@
#include "gdk_private.h"
#include "gdk_imprints.h"
+/*
+ * The imprints heap consists of five parts:
+ * - header
+ * - bins
+ * - stats
+ * - imps
+ * - dict
+ *
+ * The header consists of four size_t values `size_t hdata[4]':
+ * - hdata[0] = (1 << 16) | (IMPRINTS_VERSION << 8) | (uint8_t) imprints->bits
+ * - hdata[1] = imprints->impcnt
+ * - hdata[2] = imprints->dictcnt
+ * - hdata[3] = BATcount(b)
+ * The first word of the header includes a version number in case the
+ * format changes, and a bit to indicate that the data was synced to disk.
+ * This bit is the last thing written, so if it is set when reading back
+ * the imprints heap, the data is complete. The fourth word is the size of
+ * the BAT for which the imprints were created. If this size differs from
+ * the size of the BAT at the time of reading the imprints back from disk,
+ * the imprints are not usable.
+ *
+ * The bins area starts immediately after the header. It consists of 64
+ * values in the domain of the BAT `TYPE bins[64]'.
+ *
+ * The stats area starts immediately after the bins area. It consists of
+ * three times an array of 64 64 (32) bit integers `BUN stats[3][64]'. The
+ * three arrays represent respectively min, max, and cnt for each of the 64
+ * bins, so stats can be seen as `BUN min_bins[64]; BUN max_bins[64]; BUN
+ * cnt_bins[64];'. The min and max values are positions of the smallest
+ * and largest non-nil value in the corresponding bin.
+ *
+ * The imps area starts immediately after the stats area. It consists of
+ * one mask per "page" of the input BAT indicating in which bins the values
+ * in that page fall. The size of the mask is given by imprints->bits.
+ * The list of masks may be run-length compressed, see the dict area. A
+ * “page” is 64 bytes worth of values, so the number of values depends on
+ * the type of the value.
+ *
+ * The dict area starts immediately after the imps area. It consists of
+ * one cchdc_t value per "page" of the input. The precise use is described
+ * below.
+ *
+ * There are up to 64 bins into which values are sorted. The number of
+ * bins depends on the number of unique values in the input BAT (actually
+ * on the number of unique values in a random sample of 2048 values of the
+ * input BAT) and is 8, 16, 32, or 64. The number of bits in the mask is
+ * stored in imprints->bits. The boundaries of the bins are dynamically
+ * determined when the imprints are created and stored in the bins array.
+ * In fact, bins[n] contains the lower boundary of the n-th bin (0 <= n <
+ * N, N the number of bins). The value of bins[0] is not actually used:
+ * all values smaller than bins[1] are sorted into this bin, including NIL.
+ * The boundaries are simply computed by stepping with large steps through
+ * the sorted sample and taking 63 (or 31, 15, 7) equally spaced values
+ * from there.
+ *
+ * Once the appropriate bin n is determined for a particular value v
+ * (bins[n] <= v < bins[n+1]), a bitmask can be constructed for the value
+ * as ((uintN_t)1 << n) where N is the number of bits that are used for the
+ * bitmasks and n is the number of the bin (0 <= n < N).
+ *
+ * The input BAT is divided into "pages" where a page is 64 bytes. This
+ * means the number of rows in a page depends on the size of the values: 64
+ * for byte-sized values down to 4 for hugeint (128 bit) values. For each
+ * page, a bitmask is created which is the imprint for that page. The
+ * bitmask has a bit set for each bin into which a value inside the page
+ * falls. These bitmasks (imprints) are stored in the imprints->imps
+ * array, but with a twist, see below.
+ *
+ * The imprints->dict array is an array of cchdc_t values. A cchdc_t value
+ * consists of a bit .repeat and a 24-bit value .cnt. The sum of the .cnt
+ * values is equal to the total number of pages in the input BAT. If the
+ * .repeat value is 0, there are .cnt consecutive imprint bitmasks in the
+ * imprints->imps array, each for one page. If the .repeat value is 1,
+ * there is a single imprint bitmask in the imprints->imps array which is
+ * valid for the next .cnt pages. In this way a run-length encoding
+ * compression scheme is implemented for imprints.
+ *
+ * Imprints are used for range selects, i.e. finding all rows in a BAT
+ * whose value is inside some given range, or alternatively, all rows in a
+ * BAT whose value is outside some given range (anti select).
+ *
+ * A range necessarily covers one or more consecutive bins. A bit mask is
+ * created for all bins that fall fully inside the range being selected (in
+ * gdk_select.c called "innermask"), and a bit mask is created for all bins
+ * that fall fully or partially inside the range (called "mask" in
+ * gdk_select.c). Note that for an "anti" select, i.e. a select which
+ * matches everything except a given range, the bits in the bit masks are
+ * not consecutive.
+ *
+ * We then go through the imps table. All pages where the only set bits
+ * are also set in "innermask" can be blindly added to the result: all
+ * values fall inside the range. All pages where none of the set bits are
+ * also set in "mask" can be blindly skipped: no value falls inside the
+ * range. For the remaining pages, we scan the page and check each
+ * individual value to see whether it is selected.
+ *
+ * Extra speed up is achieved by the run-length encoding of the imps table.
+ * If a mask is in the category of fully inside the range or fully outside,
+ * the complete set of pages can be added/skipped in one go.
+ */
+
#define IMPRINTS_VERSION 2
#define IMPRINTS_HEADER_SIZE 4 /* nr of size_t fields in header */
_______________________________________________
checkin-list mailing list
[email protected]
https://www.monetdb.org/mailman/listinfo/checkin-list