Changeset: faa738a1c89c for MonetDB
URL: http://dev.monetdb.org/hg/MonetDB?cmd=changeset;node=faa738a1c89c
Removed Files:
monetdb5/modules/mal/Tests/groupby01.mal
Modified Files:
clients/Tests/MAL-signatures.stable.out
monetdb5/modules/mal/Tests/All
monetdb5/modules/mal/Tests/groupby00.mal
monetdb5/modules/mal/groupby.c
monetdb5/modules/mal/groupby.h
monetdb5/modules/mal/groupby.mal
monetdb5/modules/mal/mal_io.c
Branch: default
Log Message:
Simplify the interface
After discussions, we decided to keep the API extension minimal.
The key contribution of the multi-group is to determine the
order of evaluation. The aggregation is a separate issue.
diffs (truncated from 529 to 300 lines):
diff --git a/clients/Tests/MAL-signatures.stable.out
b/clients/Tests/MAL-signatures.stable.out
--- a/clients/Tests/MAL-signatures.stable.out
+++ b/clients/Tests/MAL-signatures.stable.out
@@ -36844,6 +36844,10 @@ pattern calc.iszero(v:bte):bit
address CMDvarISZERO;
comment Unary check for zero of V
+command calc.json(j:json):json
+address JSONstr2json;
+comment Convert JSON to JSON. Dealing with escape characters
+
command calc.json(j:str):json
address JSONstr2json;
comment Convert string to its JSON. Dealing with escape characters
@@ -39239,26 +39243,6 @@ command geom.point(x:dbl,y:dbl):wkb
address wkbcreatepoint;
comment Construct a point from two geometries
-pattern group.avg(pivot:bat[:oid,:any_1],b:bat[:oid,:any_2]...)
(o:bat[:oid,:oid],a:bat[:oid,:any_2])
-address GROUPavg;
-comment Derive the grouping of all tuples in the BATs and determine their
average value.
-
-pattern group.count(b:bat[:oid,:any]...) (o:bat[:oid,:oid],cnt:bat[:oid,:wrd])
-address GROUPcount;
-comment Derive the grouping of all tuples in the BATs and derive their count
them.
-
-pattern group.id(b:bat[:oid,:any]...):bat[:oid,:oid]
-address GROUPid;
-comment Derive the grouping of all tuples in the BATs.
-
-pattern group.min(target:bat[:oid,:any_1],b:bat[:oid,:any]...)
(o:bat[:oid,:oid],mi:bat[:oid,:any_1])
-address GROUPmin;
-comment Derive the grouping of all tuples in the BATs and their determine
minimum value.
-
-pattern group.max(target:bat[:oid,:any_1],b:bat[:oid,:any]...)
(o:bat[:oid,:oid],X_4:bat[:oid,:any_1])
-address GROUPmax;
-comment Derive the grouping of all tuples in the BATs and determine their
maximum value.
-
command
group.subgroupdone(b:bat[:oid,:any_1],g:bat[:oid,:oid],e:bat[:oid,:oid],h:bat[:oid,:wrd])
(groups:bat[:oid,:oid],extents:bat[:oid,:oid],histo:bat[:oid,:wrd])
address GRPsubgroup4;
command group.subgroupdone(b:bat[:oid,:any_1],g:bat[:oid,:oid])
(groups:bat[:oid,:oid],extents:bat[:oid,:oid],histo:bat[:oid,:wrd])
@@ -39271,9 +39255,9 @@ command group.subgroup(b:bat[:oid,:any_1
address GRPsubgroup2;
command group.subgroup(b:bat[:oid,:any_1])
(groups:bat[:oid,:oid],extents:bat[:oid,:oid],histo:bat[:oid,:wrd])
address GRPsubgroup1;
-pattern group.subgroup(b:bat[:oid,:any]...)
(ref:bat[:oid,:oid],grp:bat[:oid,:any],ext:bat[:oid,:any])
+pattern group.subgroup(b:bat[:oid,:any]...)
(ref:bat[:oid,:oid],grp:bat[:oid,:oid],hist:bat[:oid,:any])
address GROUPmulticolumn;
-comment Old-fashioned derivation of a group index over multiple columns.
+comment Derivation of a group index over multiple columns.
command identifier.#fromstr():void
address IDfromString;
diff --git a/monetdb5/modules/mal/Tests/All b/monetdb5/modules/mal/Tests/All
--- a/monetdb5/modules/mal/Tests/All
+++ b/monetdb5/modules/mal/Tests/All
@@ -19,7 +19,6 @@ mserver00
qgram
groupby00
-groupby01
# statistics00
#statistics01
diff --git a/monetdb5/modules/mal/Tests/groupby00.mal
b/monetdb5/modules/mal/Tests/groupby00.mal
--- a/monetdb5/modules/mal/Tests/groupby00.mal
+++ b/monetdb5/modules/mal/Tests/groupby00.mal
@@ -1,4 +1,5 @@
#testing the group_by minimalistics
+# this is the backward compatibility sequence
b1:= bat.new(:oid,:int);
b2:= bat.new(:oid,:int);
diff --git a/monetdb5/modules/mal/Tests/groupby01.mal
b/monetdb5/modules/mal/Tests/groupby01.mal
deleted file mode 100644
--- a/monetdb5/modules/mal/Tests/groupby01.mal
+++ /dev/null
@@ -1,77 +0,0 @@
-#testing the group_by minimalistics
-
-b1:= bat.new(:oid,:int);
-b2:= bat.new(:oid,:int);
-b3:= bat.new(:oid,:int);
-
-bat.append(b1,1);
-bat.append(b1,1);
-bat.append(b1,1);
-bat.append(b1,1);
-bat.append(b1,2);
-bat.append(b1,2);
-bat.append(b1,2);
-bat.append(b1,2);
-
-bat.append(b2,3);
-bat.append(b2,3);
-bat.append(b2,4);
-bat.append(b2,4);
-bat.append(b2,3);
-bat.append(b2,3);
-bat.append(b2,4);
-bat.append(b2,4);
-
-bat.append(b3,5);
-bat.append(b3,6);
-bat.append(b3,5);
-bat.append(b3,6);
-bat.append(b3,5);
-bat.append(b3,6);
-bat.append(b3,5);
-bat.append(b3,6);
-
-io.print(b1,b2,b3);
-
-#single column groups
-(g1,grp,ext):= group.subgroup(b1);
-z1:= group.id(b1,g1);
-io.print(z1);
-
-(g2,grp,ext):= group.subgroup(b2);
-z2:= group.id(b2,g2);
-io.print(z2);
-
-(g3,grp,ext):= group.subgroup(b3);
-z3:= group.id(b3,g3);
-io.print(z3);
-
-(g1_1,grp,ext):= group.subgroup(b1,b1);
-z1_1:= group.id(b1,b1);
-io.print(z1_1);
-
-(g1_2,grp,ext):= group.subgroup(b1,b2);
-z1_2:= group.id(b1,b2);
-io.print(z1_2);
-
-(g1_3,grp,ext):= group.subgroup(b1,b3);
-z1_3:= group.id(b1,b3);
-io.print(z1_3);
-
-(g1_1_1,grp,ext):= group.subgroup(b1,b1,b1);
-z1_1_1:= group.id(b1,b1,b1);
-io.print(z1_1_1);
-
-(g1_2_2,grp,ext):= group.subgroup(b1,b2,b2);
-z1_2_2:= group.id(b1,b2,b2);
-io.print(z1_2_2);
-
-(g1_3_3,grp,ext):= group.subgroup(b1,b3,b3);
-z1_3_3:= group.id(b1,b3,b3);
-io.print(z1_3_3);
-
-(g1_2_3,grp,ext):= group.subgroup(b1,b2,b3);
-z1_2_3:= group.id(b1,b2,b3);
-io.print(z1_2_3);
-
-
diff --git a/monetdb5/modules/mal/groupby.c b/monetdb5/modules/mal/groupby.c
--- a/monetdb5/modules/mal/groupby.c
+++ b/monetdb5/modules/mal/groupby.c
@@ -20,16 +20,15 @@
/*
* (c) Martin Kersten
* Multicolumn group-by support
- * The group-by support module is meant to replace and speedup the kernel
grouping routines.
- * The latter was originally designed in a memory constraint setting and an
exercise in
- * performing column-wise grouping incrementally. The effect is that these
routines are
- * now a major performance hindrances.
+ * The group-by support module is meant to simplify code analysis and
+ * speedup the kernel on multi-attribute grouping routines.
*
* The target is to support SQL-like multicolumngroup_by operations, which are
lists of
* attributes and a group aggregate function.
* Each group can be represented with an oid into the n-ary table.
* Consider the query "select count(*), max(A) from R group by A, B,C." whose
code
* snippet in MAL would become something like:
+ *
* @verbatim
* _1:bat[:oid,:int] := sql.bind("sys","r","a",0);
* _2:bat[:oid,:str] := sql.bind("sys","r","b",0);
@@ -37,11 +36,10 @@
* ...
* _9 := algebra.select(_1,0,100);
* ..
- * (grp_4:bat[:oid,:wrd], gid:bat[:oid,:oid]) := groupby.count(_9, _1,_2 _3);
- * (grp_5:bat[:oid,:lng], gid:bat[:oid,:oid]) := groupby.max(_9,_2, _1,_2,_3);
+ * (grp_4:bat[:oid,:wrd], gid:bat[:oid,:oid]) := groupby.count(_9,_2);
+ * (grp_5:bat[:oid,:lng], gid:bat[:oid,:oid]) := groupby.max(_9,_2,_3);
* @end verbatim
*
- * All instructions have a candidate oid list.
* The id() function merely becomes the old-fashioned oid-based group
identification list.
* This way related values can be obtained from the attribute columns. It can
be the input
* for the count() function, which saves some re-computation.
@@ -65,6 +63,7 @@
*/
typedef struct{
bat *bid; /* input bats */
+ BAT *candidate; /* list */
BAT **cols;
BUN *unique; /* number of different values */
int last;
@@ -74,7 +73,7 @@ typedef struct{
static AGGRtask*
GROUPcollect( Client cntxt, MalBlkPtr mb, MalStkPtr stk, InstrPtr pci){
AGGRtask *a;
- int i,j,k;
+ int i;
BAT *b, *bs, *bh = NULL;
BUN sample;
@@ -98,7 +97,6 @@ GROUPcollect( Client cntxt, MalBlkPtr mb
BBPreleaseref(a->cols[a->last]->batCacheid);
return NULL;
}
- a->size = BATcount(b);
sample = BATcount(b) < 1000 ? BATcount(b): 1000;
bs = BATsample( b, sample);
if (bs) {
@@ -106,12 +104,29 @@ GROUPcollect( Client cntxt, MalBlkPtr mb
a->unique[a->last] = BATcount(bh);
if ( bh ) BBPreleaseref(bh->batCacheid);
}
+ if ( b->tsorted)
+ a->unique[a->last] = 1000; /* sorting helps grouping */
+ a->size = BATcount(b);
if ( bs ) BBPreleaseref(bs->batCacheid);
}
+#ifdef _DEBUG_GROUPBY_
+ for(i=0; i<a->last; i++)
+ mnstr_printf(cntxt->fdout,"#group %d unique "BUNFMT "\n", i,
a->unique[i]);
+#endif
+ return a;
+}
+
+static void
+GROUPcollectSort(AGGRtask *a, int start, int finish)
+{
+ int i,j,k;
+ BAT *b;
+ BUN sample;
+
/* sort the columns by decreasing unique */
- for (i = 1; i< a->last; i++)
- for( j = i+1; j<a->last; j++)
+ for (i = start; i< finish; i++)
+ for( j = i+1; j<finish; j++)
if ( a->unique[i] < a->unique[j]){
k =a->bid[i];
a->bid[i] = a->bid[j];
@@ -125,11 +140,6 @@ GROUPcollect( Client cntxt, MalBlkPtr mb
a->unique[i] = a->unique[j];
a->unique[j] = sample;
}
-#ifdef _DEBUG_GROUPBY_
- for(i=0; i<a->last; i++)
- mnstr_printf(cntxt->fdout,"#group %d unique "BUNFMT "\n", i,
a->unique[i]);
-#endif
- return a;
}
static void
@@ -142,159 +152,6 @@ GROUPdelete(AGGRtask *a){
GDKfree(a);
}
-// Collect the unique group identifiers for all
-str
-GROUPid(Client cntxt, MalBlkPtr mb, MalStkPtr stk, InstrPtr pci)
-{
- int *ret = (int*) getArgReference(stk,pci,0);
- AGGRtask *a;
- BAT *bn;
-
- a = GROUPcollect(cntxt,mb,stk,pci);
- bn = BATnew(TYPE_void,TYPE_oid, a->size);
- if ( bn == NULL) {
- GROUPdelete(a);
- throw(MAL,"groupby.id",MAL_MALLOC_FAIL);
- }
- BATseqbase(bn,0);
-
- GROUPdelete(a);
- BBPkeepref(*ret= bn->batCacheid);
- return MAL_SUCCEED;
-}
-
-str
-GROUPcountTable(Client cntxt, MalBlkPtr mb, MalStkPtr stk, InstrPtr pci)
-{
- (void) cntxt;
- (void) mb;
- (void) stk;
- (void) pci;
- return MAL_SUCCEED;
-}
-
-str
-GROUPmaxTable(Client cntxt, MalBlkPtr mb, MalStkPtr stk, InstrPtr pci)
-{
- (void) cntxt;
_______________________________________________
checkin-list mailing list
[email protected]
https://www.monetdb.org/mailman/listinfo/checkin-list