Changeset: c010c4131406 for MonetDB
URL: http://dev.monetdb.org/hg/MonetDB?cmd=changeset;node=c010c4131406
Modified Files:
clients/Tests/MAL-signatures.stable.out
clients/Tests/MAL-signatures.stable.out.int128
clients/Tests/exports.stable.out
gdk/gdk.h
gdk/gdk_batop.c
gdk/gdk_group.c
gdk/gdk_private.h
gdk/gdk_unique.c
monetdb5/modules/kernel/group.c
monetdb5/modules/kernel/group.h
monetdb5/modules/kernel/group.mal
monetdb5/modules/mal/groupby.c
Branch: default
Log Message:
Implemented candidate lists for BATgroup.
diffs (truncated from 1060 to 300 lines):
diff --git a/clients/Tests/MAL-signatures.stable.out
b/clients/Tests/MAL-signatures.stable.out
--- a/clients/Tests/MAL-signatures.stable.out
+++ b/clients/Tests/MAL-signatures.stable.out
@@ -7694,10 +7694,10 @@ Ready.
[ "group", "multicolumn", "pattern group.multicolumn(b:bat[:any]...)
(ref:bat[:oid],grp:bat[:oid],hist:bat[:any]) ", "GROUPmulticolumngroup;",
"Derivation of a group index over multiple columns." ]
[ "group", "subgroup", "command group.subgroup(b:bat[:any_1])
(groups:bat[:oid],extents:bat[:oid],histo:bat[:lng]) ", "GRPsubgroup1;",
"" ]
[ "group", "subgroup", "command
group.subgroup(b:bat[:any_1],g:bat[:oid])
(groups:bat[:oid],extents:bat[:oid],histo:bat[:lng]) ", "GRPsubgroup2;",
"" ]
-[ "group", "subgroup", "command
group.subgroup(b:bat[:any_1],g:bat[:oid],e:bat[:oid],h:bat[:lng])
(groups:bat[:oid],extents:bat[:oid],histo:bat[:lng]) ", "GRPsubgroup4;",
"" ]
+[ "group", "subgroup", "command
group.subgroup(b:bat[:any_1],g:bat[:oid],e:bat[:oid],h:bat[:lng])
(groups:bat[:oid],extents:bat[:oid],histo:bat[:lng]) ", "GRPsubgroup3;",
"" ]
[ "group", "subgroupdone", "command group.subgroupdone(b:bat[:any_1])
(groups:bat[:oid],extents:bat[:oid],histo:bat[:lng]) ", "GRPsubgroup1;",
"" ]
[ "group", "subgroupdone", "command
group.subgroupdone(b:bat[:any_1],g:bat[:oid])
(groups:bat[:oid],extents:bat[:oid],histo:bat[:lng]) ", "GRPsubgroup2;",
"" ]
-[ "group", "subgroupdone", "command
group.subgroupdone(b:bat[:any_1],g:bat[:oid],e:bat[:oid],h:bat[:lng])
(groups:bat[:oid],extents:bat[:oid],histo:bat[:lng]) ", "GRPsubgroup4;",
"" ]
+[ "group", "subgroupdone", "command
group.subgroupdone(b:bat[:any_1],g:bat[:oid],e:bat[:oid],h:bat[:lng])
(groups:bat[:oid],extents:bat[:oid],histo:bat[:lng]) ", "GRPsubgroup3;",
"" ]
[ "gsl", "chi2prob", "command gsl.chi2prob(d:dbl,i:dbl):dbl ",
"GSLchisqProb;", "Chi Squared probability" ]
[ "identifier", "#fromstr", "command identifier.#fromstr():void ",
"IDfromString;", "Convert a string to an identifier without any check"
]
[ "identifier", "#tostr", "command identifier.#tostr():void ",
"IDtoString;", "Convert identifier to string equivalent" ]
diff --git a/clients/Tests/MAL-signatures.stable.out.int128
b/clients/Tests/MAL-signatures.stable.out.int128
--- a/clients/Tests/MAL-signatures.stable.out.int128
+++ b/clients/Tests/MAL-signatures.stable.out.int128
@@ -10050,10 +10050,10 @@ Ready.
[ "group", "multicolumn", "pattern group.multicolumn(b:bat[:any]...)
(ref:bat[:oid],grp:bat[:oid],hist:bat[:any]) ", "GROUPmulticolumngroup;",
"Derivation of a group index over multiple columns." ]
[ "group", "subgroup", "command group.subgroup(b:bat[:any_1])
(groups:bat[:oid],extents:bat[:oid],histo:bat[:lng]) ", "GRPsubgroup1;",
"" ]
[ "group", "subgroup", "command
group.subgroup(b:bat[:any_1],g:bat[:oid])
(groups:bat[:oid],extents:bat[:oid],histo:bat[:lng]) ", "GRPsubgroup2;",
"" ]
-[ "group", "subgroup", "command
group.subgroup(b:bat[:any_1],g:bat[:oid],e:bat[:oid],h:bat[:lng])
(groups:bat[:oid],extents:bat[:oid],histo:bat[:lng]) ", "GRPsubgroup4;",
"" ]
+[ "group", "subgroup", "command
group.subgroup(b:bat[:any_1],g:bat[:oid],e:bat[:oid],h:bat[:lng])
(groups:bat[:oid],extents:bat[:oid],histo:bat[:lng]) ", "GRPsubgroup3;",
"" ]
[ "group", "subgroupdone", "command group.subgroupdone(b:bat[:any_1])
(groups:bat[:oid],extents:bat[:oid],histo:bat[:lng]) ", "GRPsubgroup1;",
"" ]
[ "group", "subgroupdone", "command
group.subgroupdone(b:bat[:any_1],g:bat[:oid])
(groups:bat[:oid],extents:bat[:oid],histo:bat[:lng]) ", "GRPsubgroup2;",
"" ]
-[ "group", "subgroupdone", "command
group.subgroupdone(b:bat[:any_1],g:bat[:oid],e:bat[:oid],h:bat[:lng])
(groups:bat[:oid],extents:bat[:oid],histo:bat[:lng]) ", "GRPsubgroup4;",
"" ]
+[ "group", "subgroupdone", "command
group.subgroupdone(b:bat[:any_1],g:bat[:oid],e:bat[:oid],h:bat[:lng])
(groups:bat[:oid],extents:bat[:oid],histo:bat[:lng]) ", "GRPsubgroup3;",
"" ]
[ "gsl", "chi2prob", "command gsl.chi2prob(d:dbl,i:dbl):dbl ",
"GSLchisqProb;", "Chi Squared probability" ]
[ "identifier", "#fromstr", "command identifier.#fromstr():void ",
"IDfromString;", "Convert a string to an identifier without any check"
]
[ "identifier", "#tostr", "command identifier.#tostr():void ",
"IDtoString;", "Convert identifier to string equivalent" ]
diff --git a/clients/Tests/exports.stable.out b/clients/Tests/exports.stable.out
--- a/clients/Tests/exports.stable.out
+++ b/clients/Tests/exports.stable.out
@@ -122,7 +122,7 @@ void BATfakeCommit(BAT *b);
gdk_return BATfirstn(BAT **topn, BAT **gids, BAT *b, BAT *cands, BAT *grps,
BUN n, int asc, int distinct);
int BATgetaccess(BAT *b);
PROPrec *BATgetprop(BAT *b, int idx);
-gdk_return BATgroup(BAT **groups, BAT **extents, BAT **histo, BAT *b, BAT *g,
BAT *e, BAT *h);
+gdk_return BATgroup(BAT **groups, BAT **extents, BAT **histo, BAT *b, BAT *s,
BAT *g, BAT *e, BAT *h);
const char *BATgroupaggrinit(BAT *b, BAT *g, BAT *e, BAT *s, oid *minp, oid
*maxp, BUN *ngrpp, BUN *startp, BUN *endp, const oid **candp, const oid
**candendp);
gdk_return BATgroupavg(BAT **bnp, BAT **cntsp, BAT *b, BAT *g, BAT *e, BAT *s,
int tp, int skip_nils, int abort_on_error);
BAT *BATgroupcount(BAT *b, BAT *g, BAT *e, BAT *s, int tp, int skip_nils, int
abort_on_error);
@@ -1144,7 +1144,8 @@ str FCTshutdown(Client cntxt, MalBlkPtr
str GROUPmulticolumngroup(Client cntxt, MalBlkPtr mb, MalStkPtr stk, InstrPtr
pci);
str GRPsubgroup1(bat *ngid, bat *next, bat *nhis, const bat *bid);
str GRPsubgroup2(bat *ngid, bat *next, bat *nhis, const bat *bid, const bat
*gid);
-str GRPsubgroup4(bat *ngid, bat *next, bat *nhis, const bat *bid, const bat
*gid, const bat *eid, const bat *hid);
+str GRPsubgroup3(bat *ngid, bat *next, bat *nhis, const bat *bid, const bat
*gid, const bat *eid, const bat *hid);
+str GRPsubgroup4(bat *ngid, bat *next, bat *nhis, const bat *bid, const bat
*sid, const bat *gid, const bat *eid, const bat *hid);
str IDentifier(identifier *retval, str *in);
int IDfromString(str src, int *len, identifier *retval);
str IDprelude(void *ret);
diff --git a/gdk/gdk.h b/gdk/gdk.h
--- a/gdk/gdk.h
+++ b/gdk/gdk.h
@@ -1461,7 +1461,7 @@ gdk_export int BATgetaccess(BAT *b);
gdk_export gdk_return BATclear(BAT *b, int force);
gdk_export BAT *COLcopy(BAT *b, int tt, int writeable, int role);
-gdk_export gdk_return BATgroup(BAT **groups, BAT **extents, BAT **histo, BAT
*b, BAT *g, BAT *e, BAT *h);
+gdk_export gdk_return BATgroup(BAT **groups, BAT **extents, BAT **histo, BAT
*b, BAT *s, BAT *g, BAT *e, BAT *h);
/*
* @- BAT Input/Output
diff --git a/gdk/gdk_batop.c b/gdk/gdk_batop.c
--- a/gdk/gdk_batop.c
+++ b/gdk/gdk_batop.c
@@ -1455,7 +1455,7 @@ BATsort(BAT **sorted, BAT **order, BAT *
bn->tnosorted = 0;
bn->tnorevsorted = 0;
if (groups) {
- if (BATgroup_internal(groups, NULL, NULL, bn, g, NULL, NULL, 1)
!= GDK_SUCCEED)
+ if (BATgroup_internal(groups, NULL, NULL, bn, NULL, g, NULL,
NULL, 1) != GDK_SUCCEED)
goto error;
if ((*groups)->tkey &&
(g == NULL || (g->tsorted && g->trevsorted))) {
diff --git a/gdk/gdk_group.c b/gdk/gdk_group.c
--- a/gdk/gdk_group.c
+++ b/gdk/gdk_group.c
@@ -9,25 +9,32 @@
#include "monetdb_config.h"
#include "gdk.h"
#include "gdk_private.h"
+#include "gdk_cand.h"
/* how much to extend the extent and histo bats when we run out of space */
#define GROUPBATINCR 8192
/* BATgroup returns three bats that indicate the grouping of the input
- * bat. All input and output bats (must) have dense head columns.
+ * bat.
+ *
* Grouping means that all equal values are in the same group, and
* differing values are in different groups. If specified, the input
- * bat g gives a pre-existing grouping. This bat must be aligned with
- * b.
+ * bat g gives a pre-existing grouping which is then subdivided. If a
+ * candidate list s is specified, groups (both the pre-existing
+ * grouping in g and the output grouping) are aligned with the
+ * candidate list, else they are aligned with b.
*
* The outputs are as follows.
- * The groups bat has a dense head which is aligned with the input bat
- * b, and the tail has group id's (type oid).
- * The extents and histo bats have the group id in the head (a dense
- * sequence starting at 0). The tail of extents is the head oid from
- * b of a representative of the group. The tail of histo is of type
- * lng and contains the number of elements from b that are member of
- * the group.
+ *
+ * The groups bat is aligned with the candidate list s, or the input
+ * bat b if there is no candidate list, and the tail has group id's
+ * (type oid).
+ *
+ * The extents and histo bats are indexed by group id. The tail of
+ * extents is the head oid from b of a representative of the group.
+ * The tail of histo is of type lng and contains the number of
+ * elements from b that are member of the group. The extents BAT can
+ * be used as a candidate list (sorted and unique).
*
* The extents and histo bats are optionally created. The groups bat
* is always created. In other words, the groups argument may not be
@@ -41,9 +48,9 @@
* If all values in b are known to be equal (both sorted and reverse
* sorted), we produce a single group or copy the input group.
*
- * If the input bats b and g are sorted, or if the subsorted flag is
- * set (only used by BATsort), we only need to compare consecutive
- * values.
+ * If the input bats b and g are sorted (either direction) or g is not
+ * specified and b is sorted, or if the subsorted flag is set (only
+ * used by BATsort), we only need to compare consecutive values.
*
* If the input bat b is sorted, but g is not, we can compare
* consecutive values in b and need to scan sections of g for equal
@@ -81,38 +88,41 @@
} \
} \
if (extents) \
- exts[ngrp] = hseqb + (oid) (p - r); \
+ exts[ngrp] = hseqb + p; \
if (histo) \
cnts[ngrp] = 1; \
- ngrps[p - r] = ngrp; \
- ngrp++; \
+ ngrps[r] = ngrp++; \
} while (0)
#define GRP_compare_consecutive_values(INIT_0,INIT_1,COMP,KEEP)
\
do { \
INIT_0; \
- for (r = 0, p = r + 1, q = r + BATcount(b); \
- p < q; \
- p++) { \
+ for (r = 0; r < cnt; r++) { \
+ if (cand) { \
+ p = *cand++ - b->hseqbase; \
+ } else { \
+ p = start++; \
+ } \
+ assert(p < end); \
INIT_1; \
- if ((grps && *grps != prev) || COMP) { \
+ if (ngrp == 0 || (grps && grps[r] != prev) || COMP) { \
GRPnotfound(); \
} else { \
- ngrps[p - r] = ngrp - 1; \
+ ngrps[r] = ngrp - 1; \
if (histo) \
cnts[ngrp - 1]++; \
} \
KEEP; \
if (grps) \
- prev = *grps++; \
+ prev = grps[r]; \
} \
} while(0)
#define GRP_compare_consecutive_values_tpe(TYPE) \
GRP_compare_consecutive_values( \
/* INIT_0 */ const TYPE *w = (TYPE *) Tloc(b, 0); \
- TYPE pw = w[0] , \
+ TYPE pw = 0 , \
/* INIT_1 */ , \
/* COMP */ w[p] != pw , \
/* KEEP */ pw = w[p] \
@@ -120,7 +130,7 @@
#define GRP_compare_consecutive_values_any() \
GRP_compare_consecutive_values( \
- /* INIT_0 */ pv = BUNtail(bi, 0) , \
+ /* INIT_0 */ pv = NULL , \
/* INIT_1 */ v = BUNtail(bi, p) , \
/* COMP */ cmp(v, pv) != 0 , \
/* KEEP */ pv = v \
@@ -131,40 +141,46 @@
do { \
INIT_0; \
pgrp[grps[0]] = 0; \
- for (j = r = 0, p = r + 1, q = r + BATcount(b); \
- p < q; \
- p++) { \
+ j = 0; \
+ for (r = 0; r < cnt; r++) { \
+ if (cand) { \
+ p = *cand++ - b->hseqbase; \
+ } else { \
+ p = start++; \
+ } \
+ assert(p < end); \
INIT_1; \
- if (COMP) { \
- /* range [j, p) is all same value */ \
- /* i is position where we saw p's old \
- * group last */ \
- i = pgrp[grps[p - r]]; \
+ if (ngrp != 0 && COMP) { \
+ /* range [j, r) is all same value */ \
+ /* i is position where we saw r's */ \
+ /* old group last */ \
+ i = pgrp[grps[r]]; \
/* p is new position where we saw this \
* group */ \
- pgrp[grps[p - r]] = p; \
- if (j <= i && i < p) { \
- /* i is position of equal \
- * value in same old group as \
- * p, so p gets same new group \
- * as i */ \
- oid grp = ngrps[i - r]; \
- ngrps[p - r] = grp; \
+ pgrp[grps[r]] = r; \
+ if (j <= i && i < r) { \
+ /* i is position of equal */ \
+ /* value in same old group */ \
+ /* as r, so r gets same new */ \
+ /* group as i */ \
+ oid grp = ngrps[i]; \
+ ngrps[r] = grp; \
if (histo) \
cnts[grp]++; \
if (gn->tsorted && \
grp != ngrp - 1) \
gn->tsorted = 0; \
- /* we found the value/group \
- * combination, go to next \
- * value */ \
+ /* we found the value/group */ \
+ /* combination, go to next */ \
+ /* value */ \
continue; \
} \
} else { \
/* value differs from previous value */ \
- j = p; \
+ /* (or is the first) */ \
+ j = r; \
KEEP; \
- pgrp[grps[p - r]] = p; \
+ pgrp[grps[r]] = r; \
} \
/* start a new group */ \
GRPnotfound(); \
@@ -174,7 +190,7 @@
#define GRP_subscan_old_groups_tpe(TYPE) \
GRP_subscan_old_groups( \
/* INIT_0 */ const TYPE *w = (TYPE *) Tloc(b, 0); \
- TYPE pw = w[0] , \
+ TYPE pw = 0 , \
/* INIT_1 */ , \
/* COMP */ w[p] == pw , \
/* KEEP */ pw = w[p] \
@@ -182,7 +198,7 @@
#define GRP_subscan_old_groups_any() \
GRP_subscan_old_groups( \
- /* INIT_0 */ pv = BUNtail(bi, 0) , \
+ /* INIT_0 */ pv = NULL , \
/* INIT_1 */ v = BUNtail(bi, p) , \
/* COMP */ cmp(v, pv) == 0 , \
/* KEEP */ pv = v \
@@ -207,9 +223,13 @@
#define GRP_use_existing_hash_table(INIT_0,INIT_1,COMP)
\
do { \
INIT_0; \
- for (r = lo, p = r, q = hi; \
- p < q; \
- p++) { \
+ for (r = 0; r < cnt; r++) { \
+ if (cand) { \
+ p = cand[r] - hseqb + lo; \
+ } else { \
+ p = start + r; \
+ } \
+ assert(p < end); \
INIT_1; \
/* this loop is similar, but not equal, to */ \
_______________________________________________
checkin-list mailing list
[email protected]
https://www.monetdb.org/mailman/listinfo/checkin-list