Changeset: faa738a1c89c for MonetDB
URL: http://dev.monetdb.org/hg/MonetDB?cmd=changeset;node=faa738a1c89c
Removed Files:
        monetdb5/modules/mal/Tests/groupby01.mal
Modified Files:
        clients/Tests/MAL-signatures.stable.out
        monetdb5/modules/mal/Tests/All
        monetdb5/modules/mal/Tests/groupby00.mal
        monetdb5/modules/mal/groupby.c
        monetdb5/modules/mal/groupby.h
        monetdb5/modules/mal/groupby.mal
        monetdb5/modules/mal/mal_io.c
Branch: default
Log Message:

Simplify the interface.
After discussions, we decided to keep the API extension minimal.
The key contribution of the multi-group is to determine the
order of evaluation. The aggregation is a separate issue.


diffs (truncated from 529 to 300 lines):

diff --git a/clients/Tests/MAL-signatures.stable.out b/clients/Tests/MAL-signatures.stable.out
--- a/clients/Tests/MAL-signatures.stable.out
+++ b/clients/Tests/MAL-signatures.stable.out
@@ -36844,6 +36844,10 @@ pattern calc.iszero(v:bte):bit
 address CMDvarISZERO;
 comment Unary check for zero of V
 
+command calc.json(j:json):json 
+address JSONstr2json;
+comment Convert JSON to JSON. Dealing with escape characters
+
 command calc.json(j:str):json 
 address JSONstr2json;
 comment Convert string to its JSON. Dealing with escape characters
@@ -39239,26 +39243,6 @@ command geom.point(x:dbl,y:dbl):wkb
 address wkbcreatepoint;
 comment Construct a point from two geometries
 
-pattern group.avg(pivot:bat[:oid,:any_1],b:bat[:oid,:any_2]...) 
(o:bat[:oid,:oid],a:bat[:oid,:any_2]) 
-address GROUPavg;
-comment Derive the grouping of all tuples in the BATs and determine their 
average value.
-
-pattern group.count(b:bat[:oid,:any]...) (o:bat[:oid,:oid],cnt:bat[:oid,:wrd]) 
-address GROUPcount;
-comment Derive the grouping of all tuples in the BATs and derive their count 
them.
-
-pattern group.id(b:bat[:oid,:any]...):bat[:oid,:oid] 
-address GROUPid;
-comment Derive the grouping of all tuples in the BATs.
-
-pattern group.min(target:bat[:oid,:any_1],b:bat[:oid,:any]...) 
(o:bat[:oid,:oid],mi:bat[:oid,:any_1]) 
-address GROUPmin;
-comment Derive the grouping of all tuples in the BATs and their determine 
minimum value.
-
-pattern group.max(target:bat[:oid,:any_1],b:bat[:oid,:any]...) 
(o:bat[:oid,:oid],X_4:bat[:oid,:any_1]) 
-address GROUPmax;
-comment Derive the grouping of all tuples in the BATs and determine their 
maximum value.
-
 command 
group.subgroupdone(b:bat[:oid,:any_1],g:bat[:oid,:oid],e:bat[:oid,:oid],h:bat[:oid,:wrd])
 (groups:bat[:oid,:oid],extents:bat[:oid,:oid],histo:bat[:oid,:wrd]) 
 address GRPsubgroup4;
 command group.subgroupdone(b:bat[:oid,:any_1],g:bat[:oid,:oid]) 
(groups:bat[:oid,:oid],extents:bat[:oid,:oid],histo:bat[:oid,:wrd]) 
@@ -39271,9 +39255,9 @@ command group.subgroup(b:bat[:oid,:any_1
 address GRPsubgroup2;
 command group.subgroup(b:bat[:oid,:any_1]) 
(groups:bat[:oid,:oid],extents:bat[:oid,:oid],histo:bat[:oid,:wrd]) 
 address GRPsubgroup1;
-pattern group.subgroup(b:bat[:oid,:any]...) 
(ref:bat[:oid,:oid],grp:bat[:oid,:any],ext:bat[:oid,:any]) 
+pattern group.subgroup(b:bat[:oid,:any]...) 
(ref:bat[:oid,:oid],grp:bat[:oid,:oid],hist:bat[:oid,:any]) 
 address GROUPmulticolumn;
-comment Old-fashioned derivation of a group index over multiple columns.
+comment Derivation of a group index over multiple columns.
 
 command identifier.#fromstr():void 
 address IDfromString;
diff --git a/monetdb5/modules/mal/Tests/All b/monetdb5/modules/mal/Tests/All
--- a/monetdb5/modules/mal/Tests/All
+++ b/monetdb5/modules/mal/Tests/All
@@ -19,7 +19,6 @@ mserver00
 qgram
 
 groupby00
-groupby01
 
 # statistics00
 #statistics01
diff --git a/monetdb5/modules/mal/Tests/groupby00.mal b/monetdb5/modules/mal/Tests/groupby00.mal
--- a/monetdb5/modules/mal/Tests/groupby00.mal
+++ b/monetdb5/modules/mal/Tests/groupby00.mal
@@ -1,4 +1,5 @@
 #testing the group_by minimalistics
+# this is the backward compatibility sequence
 
 b1:= bat.new(:oid,:int);
 b2:= bat.new(:oid,:int);
diff --git a/monetdb5/modules/mal/Tests/groupby01.mal b/monetdb5/modules/mal/Tests/groupby01.mal
deleted file mode 100644
--- a/monetdb5/modules/mal/Tests/groupby01.mal
+++ /dev/null
@@ -1,77 +0,0 @@
-#testing the group_by minimalistics
-
-b1:= bat.new(:oid,:int);
-b2:= bat.new(:oid,:int);
-b3:= bat.new(:oid,:int);
-
-bat.append(b1,1);
-bat.append(b1,1);
-bat.append(b1,1);
-bat.append(b1,1);
-bat.append(b1,2);
-bat.append(b1,2);
-bat.append(b1,2);
-bat.append(b1,2);
-
-bat.append(b2,3);
-bat.append(b2,3);
-bat.append(b2,4);
-bat.append(b2,4);
-bat.append(b2,3);
-bat.append(b2,3);
-bat.append(b2,4);
-bat.append(b2,4);
-
-bat.append(b3,5);
-bat.append(b3,6);
-bat.append(b3,5);
-bat.append(b3,6);
-bat.append(b3,5);
-bat.append(b3,6);
-bat.append(b3,5);
-bat.append(b3,6);
-
-io.print(b1,b2,b3);
-
-#single column groups
-(g1,grp,ext):= group.subgroup(b1);
-z1:= group.id(b1,g1);
-io.print(z1);
-
-(g2,grp,ext):= group.subgroup(b2);
-z2:= group.id(b2,g2);
-io.print(z2);
-
-(g3,grp,ext):= group.subgroup(b3);
-z3:= group.id(b3,g3);
-io.print(z3);
-
-(g1_1,grp,ext):= group.subgroup(b1,b1);
-z1_1:= group.id(b1,b1);
-io.print(z1_1);
-
-(g1_2,grp,ext):= group.subgroup(b1,b2);
-z1_2:= group.id(b1,b2);
-io.print(z1_2);
-
-(g1_3,grp,ext):= group.subgroup(b1,b3);
-z1_3:= group.id(b1,b3);
-io.print(z1_3);
-
-(g1_1_1,grp,ext):= group.subgroup(b1,b1,b1);
-z1_1_1:= group.id(b1,b1,b1);
-io.print(z1_1_1);
-
-(g1_2_2,grp,ext):= group.subgroup(b1,b2,b2);
-z1_2_2:= group.id(b1,b2,b2);
-io.print(z1_2_2);
-
-(g1_3_3,grp,ext):= group.subgroup(b1,b3,b3);
-z1_3_3:= group.id(b1,b3,b3);
-io.print(z1_3_3);
-
-(g1_2_3,grp,ext):= group.subgroup(b1,b2,b3);
-z1_2_3:= group.id(b1,b2,b3);
-io.print(z1_2_3);
-
-
diff --git a/monetdb5/modules/mal/groupby.c b/monetdb5/modules/mal/groupby.c
--- a/monetdb5/modules/mal/groupby.c
+++ b/monetdb5/modules/mal/groupby.c
@@ -20,16 +20,15 @@
 /*
  * (c) Martin Kersten
  * Multicolumn group-by support
- * The group-by support module is meant to replace and speedup the kernel 
grouping routines.
- * The latter was originally designed in a memory constraint setting and an 
exercise in
- * performing column-wise grouping incrementally. The effect is that these 
routines are
- * now a major performance hindrances.
+ * The group-by support module is meant to simplify code analysis and
+ * speedup the kernel on multi-attribute grouping routines.
  *
  * The target is to support SQL-like multicolumngroup_by operations, which are 
lists of
  * attributes and a group aggregate function.
  * Each group can be represented with an oid into the n-ary table.
  * Consider the query "select count(*), max(A) from R group by A, B,C." whose 
code
  * snippet in MAL would become something like:
+ *
  * @verbatim
  * _1:bat[:oid,:int]  := sql.bind("sys","r","a",0);
  * _2:bat[:oid,:str]  := sql.bind("sys","r","b",0);
@@ -37,11 +36,10 @@
  * ...
  * _9 := algebra.select(_1,0,100);
  * ..
- * (grp_4:bat[:oid,:wrd], gid:bat[:oid,:oid]) := groupby.count(_9,  _1,_2 _3);
- * (grp_5:bat[:oid,:lng], gid:bat[:oid,:oid]) := groupby.max(_9,_2, _1,_2,_3);
+ * (grp_4:bat[:oid,:wrd], gid:bat[:oid,:oid]) := groupby.count(_9,_2);
+ * (grp_5:bat[:oid,:lng], gid:bat[:oid,:oid]) := groupby.max(_9,_2,_3);
  * @end verbatim
  *
- * All instructions have a candidate oid list.
  * The id() function merely becomes the old-fashioned oid-based group 
identification list.
  * This way related values can be obtained from the attribute columns. It can 
be the input
  * for the count() function, which saves some re-computation.
@@ -65,6 +63,7 @@
  */
 typedef struct{
        bat *bid;       /* input bats */
+       BAT *candidate; /* list */
        BAT **cols;
        BUN *unique; /* number of different values */
        int last;
@@ -74,7 +73,7 @@ typedef struct{
 static AGGRtask*
 GROUPcollect( Client cntxt, MalBlkPtr mb, MalStkPtr stk, InstrPtr pci){
        AGGRtask *a;
-       int i,j,k;
+       int i;
        BAT *b, *bs, *bh = NULL;
        BUN sample;
 
@@ -98,7 +97,6 @@ GROUPcollect( Client cntxt, MalBlkPtr mb
                                BBPreleaseref(a->cols[a->last]->batCacheid);
                        return NULL;
                }
-               a->size = BATcount(b);
                sample = BATcount(b) < 1000 ? BATcount(b): 1000;
                bs = BATsample( b, sample);
                if (bs) {
@@ -106,12 +104,29 @@ GROUPcollect( Client cntxt, MalBlkPtr mb
                        a->unique[a->last] = BATcount(bh);
                        if ( bh ) BBPreleaseref(bh->batCacheid);
                }
+               if ( b->tsorted)
+                       a->unique[a->last] = 1000; /* sorting helps grouping */
+               a->size = BATcount(b);
                if ( bs ) BBPreleaseref(bs->batCacheid);
        }
 
+#ifdef _DEBUG_GROUPBY_
+       for(i=0; i<a->last; i++)
+               mnstr_printf(cntxt->fdout,"#group %d unique "BUNFMT "\n", i, 
a->unique[i]);
+#endif
+       return a;
+}
+
+static void 
+GROUPcollectSort(AGGRtask *a, int start, int finish)
+{
+       int i,j,k;
+       BAT *b;
+       BUN sample;
+
        /* sort the columns by decreasing unique */
-       for (i = 1; i< a->last; i++)
-       for( j = i+1; j<a->last; j++)
+       for (i = start; i< finish; i++)
+       for( j = i+1; j<finish; j++)
        if ( a->unique[i] < a->unique[j]){
                k =a->bid[i];
                a->bid[i] = a->bid[j];
@@ -125,11 +140,6 @@ GROUPcollect( Client cntxt, MalBlkPtr mb
                a->unique[i] = a->unique[j];
                a->unique[j] = sample;
        }
-#ifdef _DEBUG_GROUPBY_
-       for(i=0; i<a->last; i++)
-               mnstr_printf(cntxt->fdout,"#group %d unique "BUNFMT "\n", i, 
a->unique[i]);
-#endif
-       return a;
 }
 
 static void
@@ -142,159 +152,6 @@ GROUPdelete(AGGRtask *a){
        GDKfree(a);
 }
 
-// Collect the unique group identifiers for all
-str
-GROUPid(Client cntxt, MalBlkPtr mb, MalStkPtr stk, InstrPtr pci)
-{
-       int *ret = (int*) getArgReference(stk,pci,0);
-       AGGRtask *a;
-       BAT *bn;
-
-       a = GROUPcollect(cntxt,mb,stk,pci);
-       bn =  BATnew(TYPE_void,TYPE_oid, a->size);
-       if ( bn == NULL) {
-               GROUPdelete(a);
-               throw(MAL,"groupby.id",MAL_MALLOC_FAIL);
-       }
-       BATseqbase(bn,0);
-
-       GROUPdelete(a);
-       BBPkeepref(*ret= bn->batCacheid);
-       return MAL_SUCCEED;
-}
-
-str
-GROUPcountTable(Client cntxt, MalBlkPtr mb, MalStkPtr stk, InstrPtr pci)
-{
-       (void) cntxt;
-       (void) mb;
-       (void) stk;
-       (void) pci;
-       return MAL_SUCCEED;
-}
-
-str
-GROUPmaxTable(Client cntxt, MalBlkPtr mb, MalStkPtr stk, InstrPtr pci)
-{
-       (void) cntxt;
_______________________________________________
checkin-list mailing list
[email protected]
https://www.monetdb.org/mailman/listinfo/checkin-list

Reply via email to