Changeset: c010c4131406 for MonetDB
URL: http://dev.monetdb.org/hg/MonetDB?cmd=changeset;node=c010c4131406
Modified Files:
        clients/Tests/MAL-signatures.stable.out
        clients/Tests/MAL-signatures.stable.out.int128
        clients/Tests/exports.stable.out
        gdk/gdk.h
        gdk/gdk_batop.c
        gdk/gdk_group.c
        gdk/gdk_private.h
        gdk/gdk_unique.c
        monetdb5/modules/kernel/group.c
        monetdb5/modules/kernel/group.h
        monetdb5/modules/kernel/group.mal
        monetdb5/modules/mal/groupby.c
Branch: default
Log Message:

Implemented candidate lists for BATgroup.


diffs (truncated from 1060 to 300 lines):

diff --git a/clients/Tests/MAL-signatures.stable.out 
b/clients/Tests/MAL-signatures.stable.out
--- a/clients/Tests/MAL-signatures.stable.out
+++ b/clients/Tests/MAL-signatures.stable.out
@@ -7694,10 +7694,10 @@ Ready.
 [ "group",     "multicolumn",  "pattern group.multicolumn(b:bat[:any]...) 
(ref:bat[:oid],grp:bat[:oid],hist:bat[:any]) ",      "GROUPmulticolumngroup;",  
     "Derivation of a group index over multiple columns."    ]
 [ "group",     "subgroup",     "command group.subgroup(b:bat[:any_1]) 
(groups:bat[:oid],extents:bat[:oid],histo:bat[:lng]) ",  "GRPsubgroup1;",       
 ""      ]
 [ "group",     "subgroup",     "command 
group.subgroup(b:bat[:any_1],g:bat[:oid]) 
(groups:bat[:oid],extents:bat[:oid],histo:bat[:lng]) ",      "GRPsubgroup2;",   
     ""      ]
-[ "group",     "subgroup",     "command 
group.subgroup(b:bat[:any_1],g:bat[:oid],e:bat[:oid],h:bat[:lng]) 
(groups:bat[:oid],extents:bat[:oid],histo:bat[:lng]) ",      "GRPsubgroup4;",   
     ""      ]
+[ "group",     "subgroup",     "command 
group.subgroup(b:bat[:any_1],g:bat[:oid],e:bat[:oid],h:bat[:lng]) 
(groups:bat[:oid],extents:bat[:oid],histo:bat[:lng]) ",      "GRPsubgroup3;",   
     ""      ]
 [ "group",     "subgroupdone", "command group.subgroupdone(b:bat[:any_1]) 
(groups:bat[:oid],extents:bat[:oid],histo:bat[:lng]) ",      "GRPsubgroup1;",   
     ""      ]
 [ "group",     "subgroupdone", "command 
group.subgroupdone(b:bat[:any_1],g:bat[:oid]) 
(groups:bat[:oid],extents:bat[:oid],histo:bat[:lng]) ",  "GRPsubgroup2;",       
 ""      ]
-[ "group",     "subgroupdone", "command 
group.subgroupdone(b:bat[:any_1],g:bat[:oid],e:bat[:oid],h:bat[:lng]) 
(groups:bat[:oid],extents:bat[:oid],histo:bat[:lng]) ",  "GRPsubgroup4;",       
 ""      ]
+[ "group",     "subgroupdone", "command 
group.subgroupdone(b:bat[:any_1],g:bat[:oid],e:bat[:oid],h:bat[:lng]) 
(groups:bat[:oid],extents:bat[:oid],histo:bat[:lng]) ",  "GRPsubgroup3;",       
 ""      ]
 [ "gsl",       "chi2prob",     "command gsl.chi2prob(d:dbl,i:dbl):dbl ",       
"GSLchisqProb;",        "Chi Squared probability"       ]
 [ "identifier",        "#fromstr",     "command identifier.#fromstr():void ",  
"IDfromString;",        "Convert a string to an identifier without any check"   
]
 [ "identifier",        "#tostr",       "command identifier.#tostr():void ",    
"IDtoString;",  "Convert identifier to string equivalent"       ]
diff --git a/clients/Tests/MAL-signatures.stable.out.int128 
b/clients/Tests/MAL-signatures.stable.out.int128
--- a/clients/Tests/MAL-signatures.stable.out.int128
+++ b/clients/Tests/MAL-signatures.stable.out.int128
@@ -10050,10 +10050,10 @@ Ready.
 [ "group",     "multicolumn",  "pattern group.multicolumn(b:bat[:any]...) 
(ref:bat[:oid],grp:bat[:oid],hist:bat[:any]) ",      "GROUPmulticolumngroup;",  
     "Derivation of a group index over multiple columns."    ]
 [ "group",     "subgroup",     "command group.subgroup(b:bat[:any_1]) 
(groups:bat[:oid],extents:bat[:oid],histo:bat[:lng]) ",  "GRPsubgroup1;",       
 ""      ]
 [ "group",     "subgroup",     "command 
group.subgroup(b:bat[:any_1],g:bat[:oid]) 
(groups:bat[:oid],extents:bat[:oid],histo:bat[:lng]) ",      "GRPsubgroup2;",   
     ""      ]
-[ "group",     "subgroup",     "command 
group.subgroup(b:bat[:any_1],g:bat[:oid],e:bat[:oid],h:bat[:lng]) 
(groups:bat[:oid],extents:bat[:oid],histo:bat[:lng]) ",      "GRPsubgroup4;",   
     ""      ]
+[ "group",     "subgroup",     "command 
group.subgroup(b:bat[:any_1],g:bat[:oid],e:bat[:oid],h:bat[:lng]) 
(groups:bat[:oid],extents:bat[:oid],histo:bat[:lng]) ",      "GRPsubgroup3;",   
     ""      ]
 [ "group",     "subgroupdone", "command group.subgroupdone(b:bat[:any_1]) 
(groups:bat[:oid],extents:bat[:oid],histo:bat[:lng]) ",      "GRPsubgroup1;",   
     ""      ]
 [ "group",     "subgroupdone", "command 
group.subgroupdone(b:bat[:any_1],g:bat[:oid]) 
(groups:bat[:oid],extents:bat[:oid],histo:bat[:lng]) ",  "GRPsubgroup2;",       
 ""      ]
-[ "group",     "subgroupdone", "command 
group.subgroupdone(b:bat[:any_1],g:bat[:oid],e:bat[:oid],h:bat[:lng]) 
(groups:bat[:oid],extents:bat[:oid],histo:bat[:lng]) ",  "GRPsubgroup4;",       
 ""      ]
+[ "group",     "subgroupdone", "command 
group.subgroupdone(b:bat[:any_1],g:bat[:oid],e:bat[:oid],h:bat[:lng]) 
(groups:bat[:oid],extents:bat[:oid],histo:bat[:lng]) ",  "GRPsubgroup3;",       
 ""      ]
 [ "gsl",       "chi2prob",     "command gsl.chi2prob(d:dbl,i:dbl):dbl ",       
"GSLchisqProb;",        "Chi Squared probability"       ]
 [ "identifier",        "#fromstr",     "command identifier.#fromstr():void ",  
"IDfromString;",        "Convert a string to an identifier without any check"   
]
 [ "identifier",        "#tostr",       "command identifier.#tostr():void ",    
"IDtoString;",  "Convert identifier to string equivalent"       ]
diff --git a/clients/Tests/exports.stable.out b/clients/Tests/exports.stable.out
--- a/clients/Tests/exports.stable.out
+++ b/clients/Tests/exports.stable.out
@@ -122,7 +122,7 @@ void BATfakeCommit(BAT *b);
 gdk_return BATfirstn(BAT **topn, BAT **gids, BAT *b, BAT *cands, BAT *grps, 
BUN n, int asc, int distinct);
 int BATgetaccess(BAT *b);
 PROPrec *BATgetprop(BAT *b, int idx);
-gdk_return BATgroup(BAT **groups, BAT **extents, BAT **histo, BAT *b, BAT *g, 
BAT *e, BAT *h);
+gdk_return BATgroup(BAT **groups, BAT **extents, BAT **histo, BAT *b, BAT *s, 
BAT *g, BAT *e, BAT *h);
 const char *BATgroupaggrinit(BAT *b, BAT *g, BAT *e, BAT *s, oid *minp, oid 
*maxp, BUN *ngrpp, BUN *startp, BUN *endp, const oid **candp, const oid 
**candendp);
 gdk_return BATgroupavg(BAT **bnp, BAT **cntsp, BAT *b, BAT *g, BAT *e, BAT *s, 
int tp, int skip_nils, int abort_on_error);
 BAT *BATgroupcount(BAT *b, BAT *g, BAT *e, BAT *s, int tp, int skip_nils, int 
abort_on_error);
@@ -1144,7 +1144,8 @@ str FCTshutdown(Client cntxt, MalBlkPtr 
 str GROUPmulticolumngroup(Client cntxt, MalBlkPtr mb, MalStkPtr stk, InstrPtr 
pci);
 str GRPsubgroup1(bat *ngid, bat *next, bat *nhis, const bat *bid);
 str GRPsubgroup2(bat *ngid, bat *next, bat *nhis, const bat *bid, const bat 
*gid);
-str GRPsubgroup4(bat *ngid, bat *next, bat *nhis, const bat *bid, const bat 
*gid, const bat *eid, const bat *hid);
+str GRPsubgroup3(bat *ngid, bat *next, bat *nhis, const bat *bid, const bat 
*gid, const bat *eid, const bat *hid);
+str GRPsubgroup4(bat *ngid, bat *next, bat *nhis, const bat *bid, const bat 
*sid, const bat *gid, const bat *eid, const bat *hid);
 str IDentifier(identifier *retval, str *in);
 int IDfromString(str src, int *len, identifier *retval);
 str IDprelude(void *ret);
diff --git a/gdk/gdk.h b/gdk/gdk.h
--- a/gdk/gdk.h
+++ b/gdk/gdk.h
@@ -1461,7 +1461,7 @@ gdk_export int BATgetaccess(BAT *b);
 gdk_export gdk_return BATclear(BAT *b, int force);
 gdk_export BAT *COLcopy(BAT *b, int tt, int writeable, int role);
 
-gdk_export gdk_return BATgroup(BAT **groups, BAT **extents, BAT **histo, BAT 
*b, BAT *g, BAT *e, BAT *h);
+gdk_export gdk_return BATgroup(BAT **groups, BAT **extents, BAT **histo, BAT 
*b, BAT *s, BAT *g, BAT *e, BAT *h);
 
 /*
  * @- BAT Input/Output
diff --git a/gdk/gdk_batop.c b/gdk/gdk_batop.c
--- a/gdk/gdk_batop.c
+++ b/gdk/gdk_batop.c
@@ -1455,7 +1455,7 @@ BATsort(BAT **sorted, BAT **order, BAT *
        bn->tnosorted = 0;
        bn->tnorevsorted = 0;
        if (groups) {
-               if (BATgroup_internal(groups, NULL, NULL, bn, g, NULL, NULL, 1) 
!= GDK_SUCCEED)
+               if (BATgroup_internal(groups, NULL, NULL, bn, NULL, g, NULL, 
NULL, 1) != GDK_SUCCEED)
                        goto error;
                if ((*groups)->tkey &&
                    (g == NULL || (g->tsorted && g->trevsorted))) {
diff --git a/gdk/gdk_group.c b/gdk/gdk_group.c
--- a/gdk/gdk_group.c
+++ b/gdk/gdk_group.c
@@ -9,25 +9,32 @@
 #include "monetdb_config.h"
 #include "gdk.h"
 #include "gdk_private.h"
+#include "gdk_cand.h"
 
 /* how much to extend the extent and histo bats when we run out of space */
 #define GROUPBATINCR   8192
 
 /* BATgroup returns three bats that indicate the grouping of the input
- * bat.  All input and output bats (must) have dense head columns.
+ * bat.
+ *
  * Grouping means that all equal values are in the same group, and
  * differing values are in different groups.  If specified, the input
- * bat g gives a pre-existing grouping.  This bat must be aligned with
- * b.
+ * bat g gives a pre-existing grouping which is then subdivided.  If a
+ * candidate list s is specified, groups (both the pre-existing
+ * grouping in g and the output grouping) are aligned with the
+ * candidate list, else they are aligned with b.
  *
  * The outputs are as follows.
- * The groups bat has a dense head which is aligned with the input bat
- * b, and the tail has group id's (type oid).
- * The extents and histo bats have the group id in the head (a dense
- * sequence starting at 0).  The tail of extents is the head oid from
- * b of a representative of the group.  The tail of histo is of type
- * lng and contains the number of elements from b that are member of
- * the group.
+ *
+ * The groups bat is aligned with the candidate list s, or the input
+ * bat b if there is no candidate list, and the tail has group id's
+ * (type oid).
+ *
+ * The extents and histo bats are indexed by group id.  The tail of
+ * extents is the head oid from b of a representative of the group.
+ * The tail of histo is of type lng and contains the number of
+ * elements from b that are member of the group.  The extents BAT can
+ * be used as a candidate list (sorted and unique).
  *
  * The extents and histo bats are optionally created.  The groups bat
  * is always created.  In other words, the groups argument may not be
@@ -41,9 +48,9 @@
  * If all values in b are known to be equal (both sorted and reverse
  * sorted), we produce a single group or copy the input group.
  *
- * If the input bats b and g are sorted, or if the subsorted flag is
- * set (only used by BATsort), we only need to compare consecutive
- * values.
+ * If the input bats b and g are sorted (either direction) or g is not
+ * specified and b is sorted, or if the subsorted flag is set (only
+ * used by BATsort), we only need to compare consecutive values.
  *
  * If the input bat b is sorted, but g is not, we can compare
  * consecutive values in b and need to scan sections of g for equal
@@ -81,38 +88,41 @@
                        }                                               \
                }                                                       \
                if (extents)                                            \
-                       exts[ngrp] = hseqb + (oid) (p - r);             \
+                       exts[ngrp] = hseqb + p;                         \
                if (histo)                                              \
                        cnts[ngrp] = 1;                                 \
-               ngrps[p - r] = ngrp;                                    \
-               ngrp++;                                                 \
+               ngrps[r] = ngrp++;                                      \
        } while (0)
 
 
 #define GRP_compare_consecutive_values(INIT_0,INIT_1,COMP,KEEP)                
\
        do {                                                            \
                INIT_0;                                                 \
-               for (r = 0, p = r + 1, q = r + BATcount(b);             \
-                    p < q;                                             \
-                    p++) {                                             \
+               for (r = 0; r < cnt; r++) {                             \
+                       if (cand) {                                     \
+                               p = *cand++ - b->hseqbase;              \
+                       } else {                                        \
+                               p = start++;                            \
+                       }                                               \
+                       assert(p < end);                                \
                        INIT_1;                                         \
-                       if ((grps && *grps != prev) || COMP) {          \
+                       if (ngrp == 0 || (grps && grps[r] != prev) || COMP) { \
                                GRPnotfound();                          \
                        } else {                                        \
-                               ngrps[p - r] = ngrp - 1;                \
+                               ngrps[r] = ngrp - 1;                    \
                                if (histo)                              \
                                        cnts[ngrp - 1]++;               \
                        }                                               \
                        KEEP;                                           \
                        if (grps)                                       \
-                               prev = *grps++;                         \
+                               prev = grps[r];                         \
                }                                                       \
        } while(0)
 
 #define GRP_compare_consecutive_values_tpe(TYPE)               \
        GRP_compare_consecutive_values(                         \
        /* INIT_0 */    const TYPE *w = (TYPE *) Tloc(b, 0);    \
-                       TYPE pw = w[0]                  ,       \
+                       TYPE pw = 0                     ,       \
        /* INIT_1 */                                    ,       \
        /* COMP   */    w[p] != pw                      ,       \
        /* KEEP   */    pw = w[p]                               \
@@ -120,7 +130,7 @@
 
 #define GRP_compare_consecutive_values_any()                   \
        GRP_compare_consecutive_values(                         \
-       /* INIT_0 */    pv = BUNtail(bi, 0)             ,       \
+       /* INIT_0 */    pv = NULL                       ,       \
        /* INIT_1 */    v = BUNtail(bi, p)              ,       \
        /* COMP   */    cmp(v, pv) != 0                 ,       \
        /* KEEP   */    pv = v                                  \
@@ -131,40 +141,46 @@
        do {                                                            \
                INIT_0;                                                 \
                pgrp[grps[0]] = 0;                                      \
-               for (j = r = 0, p = r + 1, q = r + BATcount(b);         \
-                    p < q;                                             \
-                    p++) {                                             \
+               j = 0;                                                  \
+               for (r = 0; r < cnt; r++) {                             \
+                       if (cand) {                                     \
+                               p = *cand++ - b->hseqbase;              \
+                       } else {                                        \
+                               p = start++;                            \
+                       }                                               \
+                       assert(p < end);                                \
                        INIT_1;                                         \
-                       if (COMP) {                                     \
-                               /* range [j, p) is all same value */    \
-                               /* i is position where we saw p's old   \
-                                * group last */                        \
-                               i = pgrp[grps[p - r]];                  \
+                       if (ngrp != 0 && COMP) {                        \
+                               /* range [j, r) is all same value */    \
+                               /* i is position where we saw r's */    \
+                               /* old group last */                    \
+                               i = pgrp[grps[r]];                      \
                                /* p is new position where we saw this  \
                                 * group */                             \
-                               pgrp[grps[p - r]] = p;                  \
-                               if (j <= i && i < p)    {               \
-                                       /* i is position of equal       \
-                                        * value in same old group as   \
-                                        * p, so p gets same new group  \
-                                        * as i */                      \
-                                       oid grp = ngrps[i - r];         \
-                                       ngrps[p - r] = grp;             \
+                               pgrp[grps[r]] = r;                      \
+                               if (j <= i && i < r)    {               \
+                                       /* i is position of equal */    \
+                                       /* value in same old group */   \
+                                       /* as r, so r gets same new */  \
+                                       /* group as i */                \
+                                       oid grp = ngrps[i];             \
+                                       ngrps[r] = grp;                 \
                                        if (histo)                      \
                                                cnts[grp]++;            \
                                        if (gn->tsorted &&              \
                                            grp != ngrp - 1)            \
                                                gn->tsorted = 0;        \
-                                       /* we found the value/group     \
-                                        * combination, go to next      \
-                                        * value */                     \
+                                       /* we found the value/group */  \
+                                       /* combination, go to next */   \
+                                       /* value */                     \
                                        continue;                       \
                                }                                       \
                        } else {                                        \
                                /* value differs from previous value */ \
-                               j = p;                                  \
+                               /* (or is the first) */                 \
+                               j = r;                                  \
                                KEEP;                                   \
-                               pgrp[grps[p - r]] = p;                  \
+                               pgrp[grps[r]] = r;                      \
                        }                                               \
                        /* start a new group */                         \
                        GRPnotfound();                                  \
@@ -174,7 +190,7 @@
 #define GRP_subscan_old_groups_tpe(TYPE)                       \
        GRP_subscan_old_groups(                                 \
        /* INIT_0 */    const TYPE *w = (TYPE *) Tloc(b, 0);    \
-                       TYPE pw = w[0]                  ,       \
+                       TYPE pw = 0                     ,       \
        /* INIT_1 */                                    ,       \
        /* COMP   */    w[p] == pw                      ,       \
        /* KEEP   */    pw = w[p]                               \
@@ -182,7 +198,7 @@
 
 #define GRP_subscan_old_groups_any()                           \
        GRP_subscan_old_groups(                                 \
-       /* INIT_0 */    pv = BUNtail(bi, 0)             ,       \
+       /* INIT_0 */    pv = NULL                       ,       \
        /* INIT_1 */    v = BUNtail(bi, p)              ,       \
        /* COMP   */    cmp(v, pv) == 0                 ,       \
        /* KEEP   */    pv = v                                  \
@@ -207,9 +223,13 @@
 #define GRP_use_existing_hash_table(INIT_0,INIT_1,COMP)                        
\
        do {                                                            \
                INIT_0;                                                 \
-               for (r = lo, p = r, q = hi;                             \
-                    p < q;                                             \
-                    p++) {                                             \
+               for (r = 0; r < cnt; r++) {                             \
+                       if (cand) {                                     \
+                               p = cand[r] - hseqb + lo;               \
+                       } else {                                        \
+                               p = start + r;                          \
+                       }                                               \
+                       assert(p < end);                                \
                        INIT_1;                                         \
                        /* this loop is similar, but not equal, to */   \
_______________________________________________
checkin-list mailing list
[email protected]
https://www.monetdb.org/mailman/listinfo/checkin-list

Reply via email to