MonetDB: stratified_sampling - Add missing sampling files.

2017-06-28 Thread Abe Wits
Changeset: bdcd02833b9b for MonetDB
URL: http://dev.monetdb.org/hg/MonetDB?cmd=changeset;node=bdcd02833b9b
Added Files:
sql/backends/monet5/sample/80_sample.sql
sql/backends/monet5/sample/Makefile.ag
sql/backends/monet5/sample/Tests/All
sql/backends/monet5/sample/Tests/weightedsample.sql
sql/backends/monet5/sample/sample.c
sql/backends/monet5/sample/sample.h
Modified Files:
sql/server/sql_parser.y
Branch: stratified_sampling
Log Message:

Add missing sampling files.


diffs (183 lines):

diff --git a/sql/backends/monet5/sample/80_sample.sql 
b/sql/backends/monet5/sample/80_sample.sql
new file mode 100644
--- /dev/null
+++ b/sql/backends/monet5/sample/80_sample.sql
@@ -0,0 +1,13 @@
+-- This Source Code Form is subject to the terms of the Mozilla Public
+-- License, v. 2.0.  If a copy of the MPL was not distributed with this
+-- file, You can obtain one at http://mozilla.org/MPL/2.0/.
+--
+-- Copyright 1997 - July 2008 CWI, August 2008 - 2016 MonetDB B.V.
+
+-- add function signatures to SQL catalog
+
+
+-- Reverse a string
+create function weighted_sample(src double, cnt bigint)
+returns boolean external name libsample.weighted_sample;--TODO possibly 
nolibsample
+
diff --git a/sql/backends/monet5/sample/Makefile.ag 
b/sql/backends/monet5/sample/Makefile.ag
new file mode 100644
--- /dev/null
+++ b/sql/backends/monet5/sample/Makefile.ag
@@ -0,0 +1,36 @@
+# This Source Code Form is subject to the terms of the Mozilla Public
+# License, v. 2.0.  If a copy of the MPL was not distributed with this
+# file, You can obtain one at http://mozilla.org/MPL/2.0/.
+#
+# Copyright 1997 - July 2008 CWI, August 2008 - 2016 MonetDB B.V.
+
+INCLUDES = .. \
+../../../include \
+   ../../../common \
+   ../../../storage \
+   ../../../server \
+   ../../../../monetdb5/modules/atoms \
+   ../../../../monetdb5/modules/kernel \
+   ../../../../monetdb5/mal \
+   ../../../../monetdb5/modules/mal \
+   ../../../../monetdb5/optimizer \
+   ../../../../common/options \
+   ../../../../common/stream \
+   ../../../../gdk
+
+lib__sample = {
+   MODULE
+   DIR = libdir/monetdb5
+   SOURCES = sample.c sample.h sample_impl.h
+   LIBS = ../../../../monetdb5/tools/libmonetdb5 \
+  ../../../../gdk/libbat
+}
+
+headers_sql = {
+   HEADERS = sql
+   DIR = libdir/monetdb5/createdb
+   SOURCES = 80_sample.sql
+}
+
+EXTRA_DIST_DIR = Tests
+
diff --git a/sql/backends/monet5/sample/Tests/All 
b/sql/backends/monet5/sample/Tests/All
new file mode 100644
--- /dev/null
+++ b/sql/backends/monet5/sample/Tests/All
@@ -0,0 +1,1 @@
+weightedsample
diff --git a/sql/backends/monet5/sample/Tests/weightedsample.sql 
b/sql/backends/monet5/sample/Tests/weightedsample.sql
new file mode 100644
--- /dev/null
+++ b/sql/backends/monet5/sample/Tests/weightedsample.sql
@@ -0,0 +1,9 @@
+set optimizer = 'sequential_pipe';
+-- ADD FLAG TO DISALLOW PARALLELIZATION (MITOSIS) FOR weighted_sample
+CREATE TABLE wsample (i INTEGER, weights DOUBLE);
+INSERT INTO wsample VALUES (1, 1), (2, 1), (3, 1), (4, 1), (5, 1);
+
+
+explain SELECT i FROM wsample WHERE weighted_sample(weights, 2);
+SELECT i FROM wsample WHERE weighted_sample(weights, 2);
+
diff --git a/sql/backends/monet5/sample/sample.c 
b/sql/backends/monet5/sample/sample.c
new file mode 100644
--- /dev/null
+++ b/sql/backends/monet5/sample/sample.c
@@ -0,0 +1,47 @@
+/*
+ * This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0.  If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/.
+ *
+ * Copyright 1997 - July 2008 CWI, August 2008 - 2016 MonetDB B.V.
+ */
+
+/* monetdb_config.h must be the first include in each .c file */
+
+#include "monetdb_config.h"
+#include "sample.h"
+
+#ifdef notdefined //!!!TODO
+
+/* MAL wrapper */
+char *
+UDFBATweightedsample(bat *ret, const bat *arg, const lng *cnt)
+{//bat = identifier, BAT is actual bat, BATdescriptor turns ID into BAT
+   BAT *res = NULL, *src = NULL;
+   char *msg = NULL;
+
+   /* assert calling sanity */
+   assert(ret != NULL && arg != NULL);
+
+   /* bat-id -> BAT-descriptor */
+   if ((src = BATdescriptor(*arg)) == NULL)
+   throw(MAL, "batudf.reverse", RUNTIME_OBJECT_MISSING);
+   printf("Count: %lld\n", *cnt);
+
+   //TODO Type checking
+   /* do the work */
+   //msg = UDFBATreverse_ ( , src );//TODO
+   throw(MAL, "batudf.reverse", "LOLFAIL");//TODO
+   res = _BATsample(arg, *cnt, BAT *cdf)
+   
+   /* release input BAT-descriptor */
+   //BBPunfix(src->batCacheid);
+
+   //if (msg == MAL_SUCCEED) {
+   /* register result BAT in buffer pool */
+   //  BBPkeepref((*ret = res->batCacheid));
+   //}
+   return msg;
+}
+
+#endif
diff --git a/sql/backends/monet5/sample/sample.h 
b/sql/backends/monet5/sample/sample.h
new 

MonetDB: stratified_sampling - cast non-double weights for weigh...

2017-05-15 Thread Abe Wits
Changeset: 3468b2d208f3 for MonetDB
URL: http://dev.monetdb.org/hg/MonetDB?cmd=changeset;node=3468b2d208f3
Modified Files:
gdk/gdk_sample.c
monetdb5/modules/mal/sample.mal
Branch: stratified_sampling
Log Message:

cast non-double weights for weighted sampling


diffs (179 lines):

diff --git a/gdk/gdk_sample.c b/gdk/gdk_sample.c
--- a/gdk/gdk_sample.c
+++ b/gdk/gdk_sample.c
@@ -143,12 +143,6 @@ OIDTreeToBATAntiset(struct oidtreenode *
((oid *) bat->theap.base)[bat->batCount++] = noid;
 }
 
-/* inorder traversal, gives us a bit BAT */
-/*BAT *bat OIDTreeToBITBAT(struct oidtreenode)
-{
-   //TODO create this function
-}*/
-
 
 /* BATsample takes uniform samples of void headed BATs */
 BAT *
@@ -248,6 +242,8 @@ BATsample(BAT *b, BUN n)
 BAT *
 BATweightedsample(BAT *b, BUN n, BAT *w)
 {//TODO test correctness extensively
+   BAT* weights;
+   bit weights_are_cast;
BAT* sample;
oid* oids;/* points to the oids in sample */
dbl* w_ptr;//TODO types of w
@@ -264,30 +260,45 @@ BATweightedsample(BAT *b, BUN n, BAT *w)
 
ERRORcheck(w->ttype == TYPE_str || w->ttype == TYPE_void,
"BATsample: type of weights not 
castable to doubles\n", NULL);
-   ERRORcheck(w->ttype != TYPE_dbl,
-   "BATsample: type of weights must be 
doubles\n", NULL);//TODO types of w (want to remove this)
+
+   if(w->ttype != TYPE_dbl) {
+   weights = BATconvert(w, NULL, TYPE_dbl, 0);
+   ERRORcheck(weights == NULL, "BATsample: could not cast weights 
to doubles\n", NULL);
+   weights_are_cast = 1;
+   } else {
+   weights = w;
+   weights_are_cast = 0;
+   }
+   //ERRORcheck(w->ttype != TYPE_dbl,
+   //  "BATsample: type of weights must be 
doubles\n", NULL);//TODO types of w (want to remove this)
//TODO: handle NULL values in w_ptr
 
cnt = BATcount(b);
 
sample = COLnew(0, TYPE_oid, n, TRANSIENT);
 
-   if(sample == NULL)
+   if(sample == NULL) {
+   if(weights_are_cast)//if weights where converted, delete 
converted BAT
+   BBPunfix(weights->batCacheid);
return NULL;
-   if(n == 0)
+   }
+   if(n == 0) {
+   if(weights_are_cast)
+   BBPunfix(weights->batCacheid);
return sample;
+   }
 
 
keys = (dbl*) GDKmalloc(sizeof(dbl)*n);
if(keys == NULL) {
+   if(weights_are_cast)
+   BBPunfix(weights->batCacheid);
BBPunfix(sample->batCacheid);
return NULL;
}
 
-
-
oids = (oid *) Tloc(sample, 0);
-   w_ptr = (dbl*) Tloc(w, 0);
+   w_ptr = (dbl*) Tloc(weights, 0);
 
if(!mt_rng) {
mt_rng = mtwist_new();
@@ -303,13 +314,25 @@ BATweightedsample(BAT *b, BUN n, BAT *w)
for(j=0; i < n && j < cnt; j++) {
if(w_ptr[j] == 0.0)
continue;
+   if(w_ptr[j] < 0.0) {
+   BBPunfix(sample->batCacheid);
+   GDKfree(keys);
+   if(weights_are_cast)
+   BBPunfix(weights->batCacheid);
+   GDKerror("BATsample: w contains negative weights\n");
+   return NULL;
+   }
oids[i] = (oid)(j+minoid);
keys[i] = pow(mtwist_drand(mt_rng),1.0/w_ptr[j]);//TODO cast 
1.0 to dbl?
i++;
}
-   if(i < n) {/* not enough non-zero weights: cannot take sample */
+
+   if(i < n) {
BBPunfix(sample->batCacheid);
GDKfree(keys);
+   if(weights_are_cast)
+   BBPunfix(weights->batCacheid);
+   GDKerror("BATsample: sample size bigger than number of non-zero 
weights\n");
return NULL;
}
 
@@ -339,6 +362,8 @@ BATweightedsample(BAT *b, BUN n, BAT *w)
}
 
GDKfree(keys);
+   if(weights_are_cast)
+   BBPunfix(weights->batCacheid);
 
sample->trevsorted = sample->batCount <= 1;
sample->tsorted = sample->batCount <= 1;
@@ -352,27 +377,3 @@ BATweightedsample(BAT *b, BUN n, BAT *w)
 
 
 
-/* BATweightedbitbat creates a bit BAT of length cnt containing n 1s and cnt-n 
0s */
-/* Note that the type of w should be castable to doubles */
-/*BAT *
-BATweightedbitbat(BUN cnt, BUN n, BAT *w)
-{
-   BAT* res;
-   res = COLnew(0, TYPE_dbl, cnt, TRANSIENT);
-   BATsetcount(res, cnt);
-   
-   //Need to adjust _BATsample so it will return a bit BAT with bools 
denoting if element is selected
-   //Now it will rather return a subset
-   //TODO rewrite _BATsample to support this, add call to _BATsample
-   //Why did we choose for this UDF notation?
-   //+ easier 

MonetDB: stratified_sampling - add dbl type to sql parser

2017-05-15 Thread Abe Wits
Changeset: 16ba98a3221e for MonetDB
URL: http://dev.monetdb.org/hg/MonetDB?cmd=changeset;node=16ba98a3221e
Modified Files:
gdk/gdk_sample.c
sql/server/rel_select.c
sql/server/sql_parser.y
sql/server/sql_scan.c
sql/server/sql_symbol.c
sql/server/sql_symbol.h
Branch: stratified_sampling
Log Message:

add dbl type to sql parser


diffs (truncated from 346 to 300 lines):

diff --git a/gdk/gdk_sample.c b/gdk/gdk_sample.c
--- a/gdk/gdk_sample.c
+++ b/gdk/gdk_sample.c
@@ -267,18 +267,25 @@ BATweightedsample(BAT *b, BUN n, BAT *w)
ERRORcheck(w->ttype != TYPE_dbl,
"BATsample: type of weights must be 
doubles\n", NULL);//TODO types of w (want to remove this)
//TODO: handle NULL values in w_ptr
+
cnt = BATcount(b);
 
+   sample = COLnew(0, TYPE_oid, n, TRANSIENT);
+
+   if(sample == NULL)
+   return NULL;
+   if(n == 0)
+   return sample;
+
+
keys = (dbl*) GDKmalloc(sizeof(dbl)*n);
-   if(keys == NULL)
-   return NULL;
-
-   sample = COLnew(0, TYPE_oid, n, TRANSIENT);
-   if(sample == NULL) {
-   free(keys);
+   if(keys == NULL) {
+   BBPunfix(sample->batCacheid);
return NULL;
}
 
+
+
oids = (oid *) Tloc(sample, 0);
w_ptr = (dbl*) Tloc(w, 0);
 
@@ -301,7 +308,7 @@ BATweightedsample(BAT *b, BUN n, BAT *w)
i++;
}
if(i < n) {/* not enough non-zero weights: cannot take sample */
-   BBPunfix(sample->batCacheid);//TODO why not unfix?
+   BBPunfix(sample->batCacheid);
GDKfree(keys);
return NULL;
}
diff --git a/sql/server/rel_select.c b/sql/server/rel_select.c
--- a/sql/server/rel_select.c
+++ b/sql/server/rel_select.c
@@ -4704,25 +4704,27 @@ rel_select_exp(mvc *sql, sql_rel *rel, S
if (sn->sample) {
list *exps = new_exp_list(sql->sa);
if (sn->sample->token == SQL_WEIGHTED_SAMPLE) {
-   // weighted sampling
-   // parse the sample size and weight vector and pass it 
on to rel_sample
dlist *l = sn->sample->data.lval;
-
-   lng sample_size = l->h->data.l_val;
-   sql_exp* sample_size_exp = exp_atom_lng(sql->sa, 
sample_size);
+   sql_exp* sample_size_exp = NULL;
 
exp_kind iek = {type_value, card_column, FALSE};
symbol* weights = l->h->next->data.sym;
sql_exp* weights_exp = rel_value_exp(sql, , 
weights, 0, iek);
+
+   if(l->h->type == type_lng) {
+   lng sample_size = l->h->data.l_val;
+   sample_size_exp = exp_atom_lng(sql->sa, 
sample_size);
+   } else if(l->h->type == type_dbl) {
+   dbl sampling_fraction = l->h->data.fval;
+   sample_size_exp = exp_atom_dbl(sql->sa, 
sampling_fraction);
+   }
+
if (!sample_size_exp || !weights_exp)
return NULL;
append(exps, sample_size_exp);
append(exps, weights_exp);
-
-   weighted_sample = 1;
} else {
-   // uniform sampling
-   // parse the sample size and pass it on to rel_sample
+   /* uniform sampling */
sql_exp *o = rel_value_exp( sql, , sn->sample, 0, 
ek);
if (!o)
return NULL;
diff --git a/sql/server/sql_parser.y b/sql/server/sql_parser.y
--- a/sql/server/sql_parser.y
+++ b/sql/server/sql_parser.y
@@ -41,6 +41,7 @@
 #define append_symbol(l,d)   dlist_append_symbol( SA, l, d)
 #define append_string(l,d)   dlist_append_string( SA, l, d)
 #define append_type(l,d) dlist_append_type( SA, l, d)
+#define append_dbl(l,d)  dlist_append_dbl( SA, l, d)
 
 #define _atom_string(t, v)   atom_string(SA, t, v)
 
@@ -483,6 +484,10 @@ int yydebug=1;
poslng
nonzerolng
 
+%type 
+   dblval
+   probdbl
+
 %type 
opt_brackets
 
@@ -512,7 +517,7 @@ int yydebug=1;
 
 /* sql prefixes to avoid name clashes on various architectures */
 %token 
-   IDENT aTYPE ALIAS AGGR AGGR2 RANK sqlINT OIDNUM HEXADECIMAL INTNUM 
APPROXNUM 
+   IDENT aTYPE ALIAS AGGR AGGR2 RANK sqlINT OIDNUM HEXADECIMAL APPROXNUM 
sqlDBL
USING 
GLOBAL CAST CONVERT
CHARACTER VARYING LARGE OBJECT VARCHAR CLOB sqlTEXT BINARY sqlBLOB
@@ -3328,17 +,24 @@ opt_sample:
  sql_subtype *t = sql_bind_localtype("lng");
  $$ = _newAtomNode( atom_int(SA, t, $2));
}
- |  SAMPLE INTNUM  {
+ | 

MonetDB: stratified_sampling - explicitly cast oid to bun

2017-05-10 Thread Abe Wits
Changeset: 85bd91c69325 for MonetDB
URL: http://dev.monetdb.org/hg/MonetDB?cmd=changeset;node=85bd91c69325
Modified Files:
gdk/gdk_sample.c
Branch: stratified_sampling
Log Message:

explicitly cast oid to bun


diffs (21 lines):

diff --git a/gdk/gdk_sample.c b/gdk/gdk_sample.c
--- a/gdk/gdk_sample.c
+++ b/gdk/gdk_sample.c
@@ -159,7 +159,7 @@ BATsample(BAT *b, BUN n)
BUN rescnt;
struct oidtreenode *tree = NULL;
 
-   unsigned int range;
+   BUN range;
 
BATcheck(b, "BATsample", NULL);
ERRORcheck(n > BUN_MAX, "BATsample: sample size larger than BUN_MAX\n", 
NULL);
@@ -210,7 +210,7 @@ BATsample(BAT *b, BUN n)
mtwist_seed(mt_rng, rand());
}

-   range = (int) (maxoid - minoid);
+   range = (BUN) (maxoid - minoid);

/* while we do not have enough sample OIDs yet */
for (rescnt = 0; rescnt < n; rescnt++) {
___
checkin-list mailing list
checkin-list@monetdb.org
https://www.monetdb.org/mailman/listinfo/checkin-list


MonetDB: stratified_sampling - explicitly cast oid to int

2017-05-10 Thread Abe Wits
Changeset: 14f6fd85b29f for MonetDB
URL: http://dev.monetdb.org/hg/MonetDB?cmd=changeset;node=14f6fd85b29f
Modified Files:
gdk/gdk_sample.c
Branch: stratified_sampling
Log Message:

explicitly cast oid to int


diffs (12 lines):

diff --git a/gdk/gdk_sample.c b/gdk/gdk_sample.c
--- a/gdk/gdk_sample.c
+++ b/gdk/gdk_sample.c
@@ -210,7 +210,7 @@ BATsample(BAT *b, BUN n)
mtwist_seed(mt_rng, rand());
}

-   range = maxoid - minoid;
+   range = (int) (maxoid - minoid);

/* while we do not have enough sample OIDs yet */
for (rescnt = 0; rescnt < n; rescnt++) {
___
checkin-list mailing list
checkin-list@monetdb.org
https://www.monetdb.org/mailman/listinfo/checkin-list


MonetDB: stratified_sampling - fix memory leak, make mtwist global

2017-05-09 Thread Abe Wits
Changeset: 10700761ef2a for MonetDB
URL: http://dev.monetdb.org/hg/MonetDB?cmd=changeset;node=10700761ef2a
Modified Files:
common/utils/mtwist.c
gdk/gdk_sample.c
monetdb5/modules/kernel/microbenchmark.c
monetdb5/modules/mal/sample.c
Branch: stratified_sampling
Log Message:

fix memory leak, make mtwist global


diffs (233 lines):

diff --git a/common/utils/mtwist.c b/common/utils/mtwist.c
--- a/common/utils/mtwist.c
+++ b/common/utils/mtwist.c
@@ -35,8 +35,10 @@
  * @a Dave Beckett, Abe Wits
  * @* Mersenne Twister (MT19937) algorithm
  * 
- * This random number generator has very good statistical properties,
- * and outperforms most stl implementations of rand() in terms of speed
+ * This random number generator outperforms most stl implementations
+ * of rand() in terms of speed. Dieharder tests confirm that the
+ * numbers generated are statistically much closer to true random
+ * numbers than those generated by a typical LCG (including the gcc rand()).
  *
  * http://www.math.sci.hiroshima-u.ac.jp/~m-mat/MT/emt.html
  * http://en.wikipedia.org/wiki/Mersenne_twister
diff --git a/gdk/gdk_sample.c b/gdk/gdk_sample.c
--- a/gdk/gdk_sample.c
+++ b/gdk/gdk_sample.c
@@ -86,6 +86,8 @@ struct oidtreenode {
struct oidtreenode *right;
 };
 
+mtwist *mt_rng = NULL;
+
 static int
 OIDTreeMaybeInsert(struct oidtreenode *tree, oid o, BUN allocated)
 {
@@ -156,7 +158,7 @@ BATsample(BAT *b, BUN n)
BUN cnt, slen;
BUN rescnt;
struct oidtreenode *tree = NULL;
-   mtwist *mt_rng;
+
unsigned int range;
 
BATcheck(b, "BATsample", NULL);
@@ -201,10 +203,12 @@ BATsample(BAT *b, BUN n)
return NULL;
}

-   /* create and seed Mersenne Twister */
-   mt_rng = mtwist_new();
+   if(!mt_rng) {
+   /* create and seed Mersenne Twister */
+   mt_rng = mtwist_new();
 
-   mtwist_seed(mt_rng, rand());
+   mtwist_seed(mt_rng, rand());
+   }

range = maxoid - minoid;

@@ -249,7 +253,6 @@ BATweightedsample(BAT *b, BUN n, BAT *w)
dbl* w_ptr;//TODO types of w
dbl* keys;/* keys as defined in Alg-A-exp */
BUN cnt, i, j;
-   mtwist *mt_rng;
BUN pos, childpos;
oid item;
dbl r, xw, r2, key, tw;
@@ -266,7 +269,7 @@ BATweightedsample(BAT *b, BUN n, BAT *w)
//TODO: handle NULL values in w_ptr
cnt = BATcount(b);
 
-   keys = (dbl*) malloc(sizeof(dbl)*n);
+   keys = (dbl*) GDKmalloc(sizeof(dbl)*n);
if(keys == NULL)
return NULL;
 
@@ -279,8 +282,10 @@ BATweightedsample(BAT *b, BUN n, BAT *w)
oids = (oid *) Tloc(sample, 0);
w_ptr = (dbl*) Tloc(w, 0);
 
-   mt_rng = mtwist_new();
-   mtwist_seed(mt_rng, rand());
+   if(!mt_rng) {
+   mt_rng = mtwist_new();
+   mtwist_seed(mt_rng, rand());
+   }
 
BATsetcount(sample, n);
/* obtain sample */
@@ -295,19 +300,25 @@ BATweightedsample(BAT *b, BUN n, BAT *w)
keys[i] = pow(mtwist_drand(mt_rng),1.0/w_ptr[j]);//TODO cast 
1.0 to dbl?
i++;
}
-   if(i < n)/* not enough non-zero weights: cannot take sample */
+   if(i < n) {/* not enough non-zero weights: cannot take sample */
+   BBPunfix(sample->batCacheid);//TODO why not unfix?
+   GDKfree(keys);
return NULL;
+   }
 
heapify(compKeysGT, SWAP3);
 
while(true) {
r = mtwist_drand(mt_rng);
xw = log(r)/log(keys[0]);
-   for(;j w_ptr[j]; j++)
+   for(;j= w_ptr[j]; j++)
xw -= w_ptr[j];
if(j >= cnt) break;
 
-   /* At this point, w_ptr[c]+w_ptr[c+1]+...+w_ptr[i-1] < xw < 
w_ptr[c]+w_ptr[c+1]+...+w_ptr[i] */
+   /* At this point:
+*  w_ptr[c]+w_ptr[c+1]+...+w_ptr[i-1]
+*   <  xw (the initial value, log(r)/log(keys[0]))
+*   <= w_ptr[c]+w_ptr[c+1]+...+w_ptr[i] */
tw = pow(keys[0], w_ptr[j]);
r2 = mtwist_drand(mt_rng)*(1-tw)+tw;
key = pow(r2, 1/w_ptr[j]);
@@ -320,7 +331,7 @@ BATweightedsample(BAT *b, BUN n, BAT *w)
j++;/* Increment j so j=c (c is defined in Alg-A-exp) */
}
 
-   free(keys);
+   GDKfree(keys);
 
sample->trevsorted = sample->batCount <= 1;
sample->tsorted = sample->batCount <= 1;
diff --git a/monetdb5/modules/kernel/microbenchmark.c 
b/monetdb5/modules/kernel/microbenchmark.c
--- a/monetdb5/modules/kernel/microbenchmark.c
+++ b/monetdb5/modules/kernel/microbenchmark.c
@@ -22,6 +22,8 @@
 #include "microbenchmark.h"
 #include "mtwist.h&

MonetDB: stratified_sampling - remove debug output, improve comm...

2017-05-09 Thread Abe Wits
Changeset: aaaebd745aa0 for MonetDB
URL: http://dev.monetdb.org/hg/MonetDB?cmd=changeset;node=aaaebd745aa0
Modified Files:
gdk/gdk_sample.c
Branch: stratified_sampling
Log Message:

remove debug output, improve comments


diffs (118 lines):

diff --git a/gdk/gdk_sample.c b/gdk/gdk_sample.c
--- a/gdk/gdk_sample.c
+++ b/gdk/gdk_sample.c
@@ -64,7 +64,7 @@
} while (0)
 
 
-//CUSTOM SWAP AND COMPARE FUNCTION
+/* CUSTOM SWAP AND COMPARE FUNCTION */
 #define SWAP3(p1, p2)  \
do {\
item = oids[p1];\
@@ -243,11 +243,11 @@ BATsample(BAT *b, BUN n)
 /* based on Alg-A-exp from 'Weighted random sampling with a reservoir' by 
Efraimidis and Spirakis (2006) */
 BAT *
 BATweightedsample(BAT *b, BUN n, BAT *w)
-{//TODO: does not create correct samples yet (so it seems! or is the output 
broken?)
+{//TODO test correctness extensively
BAT* sample;
-   oid* oids;//points to the oids in sample
+   oid* oids;/* points to the oids in sample */
dbl* w_ptr;//TODO types of w
-   dbl* keys;//keys as defined in Alg-A-exp
+   dbl* keys;/* keys as defined in Alg-A-exp */
BUN cnt, i, j;
mtwist *mt_rng;
BUN pos, childpos;
@@ -263,7 +263,7 @@ BATweightedsample(BAT *b, BUN n, BAT *w)
"BATsample: type of weights not 
castable to doubles\n", NULL);
ERRORcheck(w->ttype != TYPE_dbl,
"BATsample: type of weights must be 
doubles\n", NULL);//TODO types of w (want to remove this)
-
+   //TODO: handle NULL values in w_ptr
cnt = BATcount(b);
 
keys = (dbl*) malloc(sizeof(dbl)*n);
@@ -276,67 +276,49 @@ BATweightedsample(BAT *b, BUN n, BAT *w)
return NULL;
}
 
-
-   //oids = (oid *)sample->theap.base;
oids = (oid *) Tloc(sample, 0);
-   w_ptr = (dbl*) Tloc(w, 0);//TODO is this the right way to get w_ptr?
+   w_ptr = (dbl*) Tloc(w, 0);
 
mt_rng = mtwist_new();
mtwist_seed(mt_rng, rand());
 
BATsetcount(sample, n);
/* obtain sample */
-   //TODO: reservoir sampling with exponential jumps
-   //Initialize prioqueue
-   i=0;//i indices the initial sample (filled with elements with non-zero 
weight)
-   //j indices the oids and weights
+
+   /* Initialize prioqueue */
+   i=0;/* i indices the initial sample (filled with elements with non-zero 
weight) */
+   /* j indices the oids and weights */
for(j=0; i < n && j < cnt; j++) {
if(w_ptr[j] == 0.0)
continue;
oids[i] = (oid)(j+minoid);
-   keys[i] = pow(mtwist_drand(mt_rng),1.0/w_ptr[j]);//TODO cast 
1.0 to dbl? NULL values in w_ptr
+   keys[i] = pow(mtwist_drand(mt_rng),1.0/w_ptr[j]);//TODO cast 
1.0 to dbl?
i++;
}
-   if(i < n)//not enough non-zero weights
+   if(i < n)/* not enough non-zero weights: cannot take sample */
return NULL;
-   heapify(compKeysGT, SWAP3);//NOTE: writes to 'i'
 
-   fprintf(stderr,"\n\n ID | KEY   (INITIAL HEAP)\n");
-   fprintf(stderr,"+\n");
-   for(i=0; i= cnt) break;
-   //At this point, w_ptr[c]+w_ptr[c+1]+...+w_ptr[i-1] < xw < 
w_ptr[c]+w_ptr[c+1]+...+w_ptr[i]
+
+   /* At this point, w_ptr[c]+w_ptr[c+1]+...+w_ptr[i-1] < xw < 
w_ptr[c]+w_ptr[c+1]+...+w_ptr[i] */
tw = pow(keys[0], w_ptr[j]);
r2 = mtwist_drand(mt_rng)*(1-tw)+tw;
key = pow(r2, 1/w_ptr[j]);
 
-   //Replace element with lowest key in prioqueue
+   /* Replace element with lowest key in prioqueue */
oids[0] = (oid)(j+minoid);
keys[0] = key;
-   siftup(compKeysGT, 0, SWAP3);//NOTE: writes to 'key'
-
-   fprintf(stderr,"\n\n ID | KEY   (<%d,%.2f> INSERTED)\n", 
(int)j, key);
-   fprintf(stderr,"+\n");
-   for(i=0; i

MonetDB: stratified_sampling - write reservoir sampling

2017-05-09 Thread Abe Wits
Changeset: bb51b8eece7c for MonetDB
URL: http://dev.monetdb.org/hg/MonetDB?cmd=changeset;node=bb51b8eece7c
Modified Files:
gdk/gdk_sample.c
Branch: stratified_sampling
Log Message:

write reservoir sampling


diffs (140 lines):

diff --git a/gdk/gdk_sample.c b/gdk/gdk_sample.c
--- a/gdk/gdk_sample.c
+++ b/gdk/gdk_sample.c
@@ -32,6 +32,49 @@
 #undef BATsample
 
 
+//TODO share these with gkd_firstn.c
+#define siftup(OPER, START, SWAP)  \
+   do {\
+   pos = (START);  \
+   childpos = (pos << 1) + 1;  \
+   while (childpos < n) {  \
+   /* find most extreme child */   \
+   if (childpos + 1 < n && \
+   !(OPER(childpos + 1, childpos)))\
+   childpos++; \
+   /* compare parent with most extreme child */\
+   if (!OPER(pos, childpos)) { \
+   /* already correctly ordered */ \
+   break;  \
+   }   \
+   /* exchange parent with child and sift child */ \
+   /* further */   \
+   SWAP(pos, childpos);\
+   pos = childpos; \
+   childpos = (pos << 1) + 1;  \
+   }   \
+   } while (0)
+
+#define heapify(OPER, SWAP)\
+   do {\
+   for (i = n / 2; i > 0; i--) \
+   siftup(OPER, i - 1, SWAP);  \
+   } while (0)
+
+#define SWAP3(p1, p2)  \
+   do {\
+   item = oids[p1];\
+   oids[p1] = oids[p2];\
+   oids[p2] = item;\
+   key = keys[p1]; \
+   keys[p1] = keys[p2];\
+   keys[p2] = key; \
+   } while (0)
+
+#define compKeysGT(p1, p2) 
\
+   (keys[p1] > keys[p2] || 
\
+   keys[p1] == keys[p2] && oids[p1] > oids[p2])
+
 /* this is a straightforward implementation of a binary tree */
 struct oidtreenode {
oid o;
@@ -194,14 +237,23 @@ BATsample(BAT *b, BUN n)
 
 /* BATweightedsample takes weighted samples of void headed BATs */
 /* Note that the type of w should be castable to doubles */
+/* based on Alg-A-exp from 'Weighted random sampling with a reservoir' by 
Efraimidis and Spirakis (2006) */
 BAT *
 BATweightedsample(BAT *b, BUN n, BAT *w)
 {
BAT* sample;
+   oid* oids;//points to the oids in sample
dbl* w_ptr;//TODO types of w
-   dbl* cdf_ptr;
-   BUN cnt, i;
+   dbl* keys;//keys as defined in Alg-A-exp
+   BUN cnt, i, j;
bit antiset;
+   mtwist *mt_rng;
+   BUN pos, childpos;
+   oid item;
+   dbl r, xw, r2, key, tw;
+
+   oid minoid = b->hseqbase;
+   oid maxoid = b->hseqbase + cnt;
 
BATcheck(b, "BATsample", NULL);
BATcheck(w, "BATsample", NULL);
@@ -213,10 +265,57 @@ BATweightedsample(BAT *b, BUN n, BAT *w)
 
cnt = BATcount(b);
 
-   antiset = n > cnt / 2;
 
-   /* obtain sample */
-   sample = NULL;//TODO: reservoir sampling with exponential jumps
+   keys = (double*) malloc(sizeof(double)*n);
+   if(keys == NULL)
+   return NULL;
+
+   sample = COLnew(0, TYPE_oid, n, TRANSIENT);
+   if(sample == NULL) {
+   free(keys);
+   return NULL;
+   }
+
+   oids = (oid *)sample->theap.base;
+
+   mt_rng = mtwist_new();
+   mtwist_seed(mt_rng, rand());
+
+   BATsetcount(sample, n);
+   /* obtain sample */
+   //TODO: reservoir sampling with exponential jumps
+   for(j=0; j 0 && j < cnt) {
+   xw -= w[j];
+   j++;
+   }
+   if(j >= cnt) break;

MonetDB: stratified_sampling - simplify OIDTreeMaybeInsert

2017-05-09 Thread Abe Wits
Changeset: 69cd1aa8a456 for MonetDB
URL: http://dev.monetdb.org/hg/MonetDB?cmd=changeset;node=69cd1aa8a456
Modified Files:
gdk/gdk_sample.c
Branch: stratified_sampling
Log Message:

simplify OIDTreeMaybeInsert


diffs (35 lines):

diff --git a/gdk/gdk_sample.c b/gdk/gdk_sample.c
--- a/gdk/gdk_sample.c
+++ b/gdk/gdk_sample.c
@@ -42,23 +42,23 @@ struct oidtreenode {
 static int
 OIDTreeMaybeInsert(struct oidtreenode *tree, oid o, BUN allocated)
 {
-   struct oidtreenode **nodep;
+   struct oidtreenode *nodep;
 
if (allocated == 0) {
tree->left = tree->right = NULL;
tree->o = o;
return 1;
}
-   nodep = 
-   while (*nodep) {
-   if (o == (*nodep)->o)
+   nodep = tree;
+   while (nodep) {
+   if (o == nodep->o)
return 0;
-   if (o < (*nodep)->o)
-   nodep = &(*nodep)->left;
+   if (o < nodep->o)
+   nodep = nodep->left;
else
-   nodep = &(*nodep)->right;
+   nodep = nodep->right;
}
-   *nodep = [allocated];
+   nodep = [allocated];
tree[allocated].left = tree[allocated].right = NULL;
tree[allocated].o = o;
return 1;
___
checkin-list mailing list
checkin-list@monetdb.org
https://www.monetdb.org/mailman/listinfo/checkin-list


MonetDB: stratified_sampling - remove CDF-based weighted sampling

2017-05-09 Thread Abe Wits
Changeset: 9c93f4253ff6 for MonetDB
URL: http://dev.monetdb.org/hg/MonetDB?cmd=changeset;node=9c93f4253ff6
Modified Files:
gdk/gdk_sample.c
Branch: stratified_sampling
Log Message:

remove CDF-based weighted sampling


diffs (157 lines):

diff --git a/gdk/gdk_sample.c b/gdk/gdk_sample.c
--- a/gdk/gdk_sample.c
+++ b/gdk/gdk_sample.c
@@ -100,13 +100,10 @@ OIDTreeToBATAntiset(struct oidtreenode *
//TODO create this function
 }*/
 
-/* 
- * _BATsample is the internal (weighted) sampling function without replacement
- * If cdf=NULL, an uniform sample is taken
- * Otherwise it is assumed the cdf increases monotonically
- */
-static BAT *
-_BATsample(BAT *b, BUN n, BAT *cdf)
+
+/* BATsample takes uniform samples of void headed BATs */
+BAT *
+BATsample(BAT *b, BUN n)
 {
BAT *bn;
BUN cnt, slen;
@@ -115,8 +112,6 @@ static BAT *
mtwist *mt_rng;
unsigned int range;
dbl random;
-   dbl cdf_max;
-   dbl* cdf_ptr;
 
BATcheck(b, "BATsample", NULL);
ERRORcheck(n > BUN_MAX, "BATsample: sample size larger than BUN_MAX\n", 
NULL);
@@ -167,48 +162,18 @@ static BAT *

range = maxoid - minoid;

-   /* sample OIDs (method depends on w) */
-   if(cdf == NULL) {
-   /* no weights, hence do uniform sampling */
-
-   /* while we do not have enough sample OIDs yet */
-   for (rescnt = 0; rescnt < n; rescnt++) {
-   oid candoid;
-   do {
-   /* generate a new random OID in 
[minoid, maxoid[
-* that is including minoid, excluding 
maxoid*/
-   candoid = (oid) ( minoid + 
(mtwist_u32rand(mt_rng)%range) );
-   /* if that candidate OID was already
-* generated, try again */
-   } while (!OIDTreeMaybeInsert(tree, candoid, 
rescnt));
-   }
-
-   } else {
-   /* do weighted sampling */
-   
-   cdf_ptr = (dbl*) Tloc(cdf, 0);
-   if (!antiset)
-   cdf_max = cdf_ptr[cnt-1];
-   else
-   cdf_max = cdf_ptr[0];
-  //TODO how to type/cast cdf_max?
-
-   /* generate candoids, using CDF */
-   for (rescnt = 0; rescnt < n; rescnt++) {
-   oid candoid;
-
-   do {
-   random = mtwist_drand(mt_rng)*cdf_max;
-   /* generate a new random OID with 
minoid <= OID < maxoid */
-   /* note that cdf has already been 
adjusted for antiset case */
-   candoid = (oid) ( minoid + (oid) 
SORTfndfirst(cdf, ) );
-   /* if that candidate OID was already
-* generated, try again */
-   } while (!OIDTreeMaybeInsert(tree, candoid, 
rescnt));
-   }
+   /* while we do not have enough sample OIDs yet */
+   for (rescnt = 0; rescnt < n; rescnt++) {
+   oid candoid;
+   do {
+   /* generate a new random OID in [minoid, maxoid[
+* that is including minoid, excluding maxoid*/
+   candoid = (oid) ( minoid + 
(mtwist_u32rand(mt_rng)%range) );
+   /* if that candidate OID was already
+* generated, try again */
+   } while (!OIDTreeMaybeInsert(tree, candoid, rescnt));
}
 
-
if (!antiset) {
OIDTreeToBAT(tree, bn);
} else {
@@ -227,20 +192,11 @@ static BAT *
return bn;
 }
 
-
-/* BATsample takes uniform samples of void headed BATs */
-BAT *
-BATsample(BAT *b, BUN n)
-{
-   return _BATsample(b, n, NULL);
-}
-
 /* BATweightedsample takes weighted samples of void headed BATs */
 /* Note that the type of w should be castable to doubles */
 BAT *
 BATweightedsample(BAT *b, BUN n, BAT *w)
 {
-   BAT* cdf;
BAT* sample;
dbl* w_ptr;//TODO types of w
dbl* cdf_ptr;
@@ -259,44 +215,14 @@ BATweightedsample(BAT *b, BUN n, BAT *w)
 
antiset = n > cnt / 2;
 
-   cdf = COLnew(0, TYPE_dbl, cnt, TRANSIENT);
-   BATsetcount(cdf, cnt);
-   
-   /* calculate cumilative distribution function */
-   w_ptr = (dbl*) Tloc(w, 0);//TODO support different types w
-   cdf_ptr = (dbl*) Tloc(cdf, 0);
-
-  

MonetDB: stratified_sampling - remove wrd type from mal files

2017-04-20 Thread Abe Wits
Changeset: 976a921f2d82 for MonetDB
URL: http://dev.monetdb.org/hg/MonetDB?cmd=changeset;node=976a921f2d82
Modified Files:
monetdb5/modules/mal/sample.c
monetdb5/modules/mal/sample.mal
Branch: stratified_sampling
Log Message:

remove wrd type from mal files


diffs (42 lines):

diff --git a/monetdb5/modules/mal/sample.c b/monetdb5/modules/mal/sample.c
--- a/monetdb5/modules/mal/sample.c
+++ b/monetdb5/modules/mal/sample.c
@@ -17,7 +17,7 @@
  * query, b) the base tables, and c) the entire database schema. Sampling
  * can be performed during query execution, as well as during data loading in
  * the case of predefined sampling indexes. In addition to the sampling
- * methods, a number of query plan optimisations for sampling are introduced on
+ * methods, a number of query plan optimizations for sampling are introduced on
  * the SQL and MAL level.
  *
  * Besides the sampling methods, SciBORQ also aims at multi-layered bounded
diff --git a/monetdb5/modules/mal/sample.mal b/monetdb5/modules/mal/sample.mal
--- a/monetdb5/modules/mal/sample.mal
+++ b/monetdb5/modules/mal/sample.mal
@@ -20,7 +20,7 @@ command subuniform(b:bat[:any],p:dbl):ba
 address SAMPLEuniform_dbl
 comment "Returns the oids of a uniform sample (without replacement) of size = 
(p x count(b)), where 0 <= p <= 1.0";
 
-command subweighted(b:bat[:any],s:wrd,w:bat[:dbl]):bat[:oid]
+command subweighted(b:bat[:any],s:lng,w:bat[:dbl]):bat[:oid]
 address SAMPLEweighted
 comment "Returns the oids of a weighted sample (without replacement) of size 
s";
 
@@ -29,7 +29,7 @@ address SAMPLEweighted_dbl
 comment "Returns the oids of a weighted sample (without replacement) of size = 
(p x count(b)), where 0 <= p <= 1.0";
 
 
-command subuniform_bitbat(b:bat[:any],s:wrd):bat[:bit]
+command subuniform_bitbat(b:bat[:any],s:lng):bat[:bit]
 address SAMPLEuniform_bitbat
 comment "Returns the oids of a uniform sample (without replacement) of size s";
 
@@ -37,7 +37,7 @@ command subuniform_bitbat(b:bat[:any],p:
 address SAMPLEuniform_bitbat_dbl
 comment "Returns the oids of a uniform sample (without replacement) of size = 
(p x count(b)), where 0 <= p <= 1.0";
 
-command subweighted_bitbat(b:bat[:any],s:wrd,w:bat[:dbl]):bat[:bit]
+command subweighted_bitbat(b:bat[:any],s:lng,w:bat[:dbl]):bat[:bit]
 address SAMPLEweighted_bitbat
 comment "Returns the oids of a weighted sample (without replacement) of size 
s";
 
___
checkin-list mailing list
checkin-list@monetdb.org
https://www.monetdb.org/mailman/listinfo/checkin-list


MonetDB: stratified_sampling - remove wrd, BUNfirst and BATnew

2017-04-20 Thread Abe Wits
Changeset: dc251b317e2f for MonetDB
URL: http://dev.monetdb.org/hg/MonetDB?cmd=changeset;node=dc251b317e2f
Modified Files:
gdk/gdk_sample.c
monetdb5/modules/kernel/microbenchmark.c
monetdb5/modules/mal/sample.c
monetdb5/modules/mal/sample.h
Branch: stratified_sampling
Log Message:

remove wrd, BUNfirst and BATnew


diffs (109 lines):

diff --git a/gdk/gdk_sample.c b/gdk/gdk_sample.c
--- a/gdk/gdk_sample.c
+++ b/gdk/gdk_sample.c
@@ -186,7 +186,7 @@ static BAT *
} else {
/* do weighted sampling */

-   cdf_ptr = (dbl*) Tloc(cdf, BUNfirst(cdf));
+   cdf_ptr = (dbl*) Tloc(cdf, 0);
if (!antiset)
cdf_max = cdf_ptr[cnt-1];
else
@@ -199,8 +199,7 @@ static BAT *
 
do {
random = mtwist_drand(mt_rng)*cdf_max;
-   /* generate a new random OID in 
[minoid, maxoid[
-* that is including minoid, excluding 
maxoid*/
+   /* generate a new random OID with 
minoid <= OID < maxoid */
/* note that cdf has already been 
adjusted for antiset case */
candoid = (oid) ( minoid + (oid) 
SORTfndfirst(cdf, ) );
/* if that candidate OID was already
@@ -260,14 +259,16 @@ BATweightedsample(BAT *b, BUN n, BAT *w)
 
antiset = n > cnt / 2;
 
-   cdf = BATnew(TYPE_void, TYPE_dbl, cnt, TRANSIENT);
+   cdf = COLnew(0, TYPE_dbl, cnt, TRANSIENT);
BATsetcount(cdf, cnt);

/* calculate cumilative distribution function */
-   w_ptr = (dbl*) Tloc(w, BUNfirst(w));//TODO support different types w
-   cdf_ptr = (dbl*) Tloc(cdf, BUNfirst(cdf));
+   w_ptr = (dbl*) Tloc(w, 0);//TODO support different types w
+   cdf_ptr = (dbl*) Tloc(cdf, 0);
 
cdf_ptr[0] = (dbl)w_ptr[0];
+
+   /* remove NULL values */
for (i = 1; i < cnt; i++) {
if((dbl)w_ptr[i] == dbl_nil) {//TODO fix NULL-test if w can 
have different types
cdf_ptr[i] = cdf_ptr[i-1];
@@ -302,7 +303,7 @@ BATweightedsample(BAT *b, BUN n, BAT *w)
 BATweightedbitbat(BUN cnt, BUN n, BAT *w)
 {
BAT* res;
-   res = BATnew(TYPE_void, TYPE_dbl, cnt, TRANSIENT);
+   res = COLnew(0, TYPE_dbl, cnt, TRANSIENT);
BATsetcount(res, cnt);

//Need to adjust _BATsample so it will return a bit BAT with bools 
denoting if element is selected
diff --git a/monetdb5/modules/kernel/microbenchmark.c 
b/monetdb5/modules/kernel/microbenchmark.c
--- a/monetdb5/modules/kernel/microbenchmark.c
+++ b/monetdb5/modules/kernel/microbenchmark.c
@@ -374,7 +374,6 @@ MBMmix(bat *bn, bat *batid)
throw(MAL, "microbenchmark.mix", RUNTIME_OBJECT_MISSING);
 
n = BATcount(b);
-   firstbun = BUNfirst(b);
 
mt_rng = mtwist_new();
 
diff --git a/monetdb5/modules/mal/sample.c b/monetdb5/modules/mal/sample.c
--- a/monetdb5/modules/mal/sample.c
+++ b/monetdb5/modules/mal/sample.c
@@ -106,7 +106,7 @@ SAMPLEuniform_dbl(bat *r, bat *b, dbl *p
 }
 
 str
-SAMPLEweighted(bat *r, bat *b, wrd *s, bat *w) {
+SAMPLEweighted(bat *r, bat *b, lng *s, bat *w) {
BAT *br, *bb, *bw;
 
if ((bw = BATdescriptor(*w)) == NULL ) {
@@ -128,7 +128,7 @@ str
 SAMPLEweighted_dbl(bat *r, bat *b, dbl *p, bat *w) {
BAT *bb;
double pr = *p;
-   wrd s;
+   lng s;
 
if ( pr < 0.0 || pr > 1.0 ) {
throw(MAL, "sample.subweighted", ILLEGAL_ARGUMENT
@@ -140,7 +140,7 @@ SAMPLEweighted_dbl(bat *r, bat *b, dbl *
if ((bb = BATdescriptor(*b)) == NULL) {
throw(MAL, "sample.subweighted", INTERNAL_BAT_ACCESS);
}
-   s = (wrd) (pr*(double)BATcount(bb));
+   s = (lng) (pr*(double)BATcount(bb));
BBPunfix(bb->batCacheid);
return SAMPLEweighted(r, b, , w);
 }
diff --git a/monetdb5/modules/mal/sample.h b/monetdb5/modules/mal/sample.h
--- a/monetdb5/modules/mal/sample.h
+++ b/monetdb5/modules/mal/sample.h
@@ -23,10 +23,10 @@ SAMPLEuniform(bat *r, bat *b, lng *s);
 mal_export str
 SAMPLEuniform_dbl(bat *r, bat *b, dbl *p);
 
-sample_export str
-SAMPLEweighted(bat *r, bat *b, wrd *s, bat *w);
+mal_export str
+SAMPLEweighted(bat *r, bat *b, lng *s, bat *w);
 
-sample_export str
+mal_export str
 SAMPLEweighted_dbl(bat *r, bat *b, dbl *p, bat *w);
 
 #endif
___
checkin-list mailing list
checkin-list@monetdb.org
https://www.monetdb.org/mailman/listinfo/checkin-list


MonetDB: stratified_sampling - implement weighted sampling UDF

2017-04-20 Thread Abe Wits
Changeset: 2ad57d092fcd for MonetDB
URL: http://dev.monetdb.org/hg/MonetDB?cmd=changeset;node=2ad57d092fcd
Modified Files:
gdk/gdk.h
gdk/gdk_sample.c
monetdb5/modules/mal/sample.c
monetdb5/modules/mal/sample.h
monetdb5/modules/mal/sample.mal
sql/backends/monet5/Makefile.ag
Branch: stratified_sampling
Log Message:

implement weighted sampling UDF


diffs (truncated from 386 to 300 lines):

diff --git a/gdk/gdk.h b/gdk/gdk.h
--- a/gdk/gdk.h
+++ b/gdk/gdk.h
@@ -3026,12 +3026,17 @@ gdk_export gdk_return BATfirstn(BAT **to
  * @tab BATsample (BAT *b, n)
  * @end multitable
  *
- * The routine BATsample returns a random sample on n BUNs of a BAT.
+ * The routine BATsample returns a random sample containing n BUNs of a BAT.
  *
  */
 gdk_export BAT *BATsample(BAT *b, BUN n);
 
 /*
+ * The routine BATweightedsample returns a weighted random sample containing n 
BUNs of a BAT.
+ */
+gdk_export BAT *BATweightedsample(BAT *b, BUN n, BAT *w);
+
+/*
  *
  */
 #define MAXPARAMS  32
diff --git a/gdk/gdk_sample.c b/gdk/gdk_sample.c
--- a/gdk/gdk_sample.c
+++ b/gdk/gdk_sample.c
@@ -31,6 +31,7 @@
 
 #undef BATsample
 
+
 /* this is a straightforward implementation of a binary tree */
 struct oidtreenode {
oid o;
@@ -81,21 +82,31 @@ OIDTreeToBATAntiset(struct oidtreenode *
oid noid;
 
if (node->left != NULL)
-   OIDTreeToBATAntiset(node->left, bat, start, node->o);
+   OIDTreeToBATAntiset(node->left, bat, start, node->o);
else
for (noid = start; noid < node->o; noid++)
((oid *) bat->T->heap.base)[bat->batFirst + 
bat->batCount++] = noid;
 
if (node->right != NULL)
-   OIDTreeToBATAntiset(node->right, bat, node->o + 1, stop);
+   OIDTreeToBATAntiset(node->right, bat, node->o + 1, 
stop);
else
-   for (noid = node->o+1; noid < stop; noid++)
-   ((oid *) 
bat->T->heap.base)[bat->batFirst + bat->batCount++] = noid;
+   for (noid = node->o + 1; noid < stop; noid++)
+   ((oid *) bat->T->heap.base)[bat->batFirst + 
bat->batCount++] = noid;
 }
 
-/* BATsample implements sampling for void headed BATs */
-BAT *
-BATsample(BAT *b, BUN n)
+/* inorder traversal, gives us a bit BAT */
+/*BAT *bat OIDTreeToBITBAT(struct oidtreenode)
+{
+   //TODO create this function
+}*/
+
+/* 
+ * _BATsample is the internal (weighted) sampling function without replacement
+ * If cdf=NULL, an uniform sample is taken
+ * Otherwise it is assumed the cdf increases monotonically
+ */
+static BAT *
+_BATsample(BAT *b, BUN n, BAT *cdf)
 {
BAT *bn;
BUN cnt, slen;
@@ -103,6 +114,9 @@ BATsample(BAT *b, BUN n)
struct oidtreenode *tree = NULL;
mtwist *mt_rng;
unsigned int range;
+   dbl random;
+   dbl cdf_max;
+   dbl* cdf_ptr;
 
BATcheck(b, "BATsample", NULL);
assert(BAThdense(b));
@@ -148,7 +162,6 @@ BATsample(BAT *b, BUN n)
GDKfree(tree);
return NULL;
}
-   /* while we do not have enough sample OIDs yet */

/* create and seed Mersenne Twister */
mt_rng = mtwist_new();
@@ -157,16 +170,49 @@ BATsample(BAT *b, BUN n)

range = maxoid - minoid;

-   for (rescnt = 0; rescnt < n; rescnt++) {
-   oid candoid;
-   do {
-   /* generate a new random OID in [minoid, maxoid[
-* that is including minoid, excluding maxoid*/
-   candoid = (oid) ( minoid + 
(mtwist_u32rand(mt_rng)%range) );
-   /* if that candidate OID was already
-* generated, try again */
-   } while (!OIDTreeMaybeInsert(tree, candoid, rescnt));
+   /* sample OIDs (method depends on w) */
+   if(cdf == NULL) {
+   /* no weights, hence do uniform sampling */
+
+   /* while we do not have enough sample OIDs yet */
+   for (rescnt = 0; rescnt < n; rescnt++) {
+   oid candoid;
+   do {
+   /* generate a new random OID in 
[minoid, maxoid[
+* that is including minoid, excluding 
maxoid*/
+   candoid = (oid) ( minoid + 
(mtwist_u32rand(mt_rng)%range) );
+   /* if that candidate OID was already
+* generated, try again */
+   } while (!OIDTreeMaybeInsert(tree, candoid, 
rescnt));
+   }
+
+   

MonetDB: stratified_sampling - Merge default into stratified_sam...

2017-04-20 Thread Abe Wits
Changeset: 242f87d7ee59 for MonetDB
URL: http://dev.monetdb.org/hg/MonetDB?cmd=changeset;node=242f87d7ee59
Added Files:
MacOSX/MonetDB_logo.png
clients/R/Tests/dplyr.timeout
gdk/gdk_orderidx.c
java/tests/Test_CisValid.java
java/tests/Test_FetchSize.java
monetdb5/modules/mal/Tests/orderidx00.malC
monetdb5/modules/mal/Tests/orderidx00.stable.err
monetdb5/modules/mal/Tests/orderidx00.stable.out
monetdb5/modules/mal/Tests/orderidx01.malC
monetdb5/modules/mal/Tests/orderidx01.stable.err
monetdb5/modules/mal/Tests/orderidx01.stable.out
monetdb5/modules/mal/Tests/orderidx02.malC
monetdb5/modules/mal/Tests/orderidx02.stable.err
monetdb5/modules/mal/Tests/orderidx02.stable.out
monetdb5/modules/mal/Tests/orderidx04.malC
monetdb5/modules/mal/Tests/orderidx04.stable.err
monetdb5/modules/mal/Tests/orderidx04.stable.out
monetdb5/modules/mal/orderidx.c
monetdb5/modules/mal/orderidx.h
monetdb5/modules/mal/orderidx.mal
sql/backends/monet5/sql_orderidx.c
sql/backends/monet5/sql_orderidx.h
sql/benchmarks/orderindex/experiment.sh
sql/benchmarks/tpch/Tests/lowcardinality.sql
sql/benchmarks/tpch/Tests/lowcardinality.stable.err
sql/benchmarks/tpch/Tests/lowcardinality.stable.out
sql/include/sql_query.h
sql/jdbc/tests/Tests/Test_CisValid.SQL.bat
sql/jdbc/tests/Tests/Test_CisValid.SQL.sh
sql/jdbc/tests/Tests/Test_CisValid.stable.err
sql/jdbc/tests/Tests/Test_CisValid.stable.out
sql/jdbc/tests/Tests/Test_FetchSize.SQL.bat
sql/jdbc/tests/Tests/Test_FetchSize.SQL.sh
sql/jdbc/tests/Tests/Test_FetchSize.stable.err
sql/jdbc/tests/Tests/Test_FetchSize.stable.out
sql/scripts/18_index.sql
sql/test/BugTracker-2014/Tests/querylog.Bug-3607.stable.err.single
sql/test/BugTracker-2014/Tests/querylog.Bug-3607.stable.out.single

sql/test/BugTracker-2016/Tests/RELEASE_SAVEPOINT_after_ALTER_TABLE_crash.Bug-4010.sql

sql/test/BugTracker-2016/Tests/RELEASE_SAVEPOINT_after_ALTER_TABLE_crash.Bug-4010.stable.err

sql/test/BugTracker-2016/Tests/RELEASE_SAVEPOINT_after_ALTER_TABLE_crash.Bug-4010.stable.out

sql/test/BugTracker-2016/Tests/RELEASE_SAVEPOINT_after_UPDATE_crash.Bug-4010.sql

sql/test/BugTracker-2016/Tests/RELEASE_SAVEPOINT_after_UPDATE_crash.Bug-4010.stable.err

sql/test/BugTracker-2016/Tests/RELEASE_SAVEPOINT_after_UPDATE_crash.Bug-4010.stable.out

sql/test/BugTracker-2016/Tests/column_alias_in_where_clause.Bug-3947.stable.out.int128
sql/test/BugTracker-2016/Tests/data3987.csv

sql/test/BugTracker-2016/Tests/decimal_vs_integer.Bug-3941.stable.out.32bit
sql/test/BugTracker-2016/Tests/epoch.Bug-3979.sql
sql/test/BugTracker-2016/Tests/epoch.Bug-3979.stable.err
sql/test/BugTracker-2016/Tests/epoch.Bug-3979.stable.out
sql/test/BugTracker-2016/Tests/fk-smaller-pk.Bug-3983.sql
sql/test/BugTracker-2016/Tests/fk-smaller-pk.Bug-3983.stable.err
sql/test/BugTracker-2016/Tests/fk-smaller-pk.Bug-3983.stable.out
sql/test/BugTracker-2016/Tests/invalidcolumns.Bug-3968.stable.err
sql/test/BugTracker-2016/Tests/invalidcolumns.Bug-3968.stable.out
sql/test/BugTracker-2016/Tests/isaUUID_function.Bug-3997.sql
sql/test/BugTracker-2016/Tests/isaUUID_function.Bug-3997.stable.err
sql/test/BugTracker-2016/Tests/isaUUID_function.Bug-3997.stable.out

sql/test/BugTracker-2016/Tests/join-with-references-2sides-crashes.Bug-3980.sql

sql/test/BugTracker-2016/Tests/join-with-references-2sides-crashes.Bug-3980.stable.err

sql/test/BugTracker-2016/Tests/join-with-references-2sides-crashes.Bug-3980.stable.out
sql/test/BugTracker-2016/Tests/leftjoin.Bug-3981.sql
sql/test/BugTracker-2016/Tests/leftjoin.Bug-3981.stable.err
sql/test/BugTracker-2016/Tests/leftjoin.Bug-3981.stable.out
sql/test/BugTracker-2016/Tests/malformed-copy-int.Bug-3987.sql.in
sql/test/BugTracker-2016/Tests/malformed-copy-int.Bug-3987.stable.err
sql/test/BugTracker-2016/Tests/malformed-copy-int.Bug-3987.stable.out

sql/test/BugTracker-2016/Tests/memory-consumption-query-PLAN-25joins.Bug-3972.sql

sql/test/BugTracker-2016/Tests/memory-consumption-query-PLAN-25joins.Bug-3972.stable.err

sql/test/BugTracker-2016/Tests/memory-consumption-query-PLAN-25joins.Bug-3972.stable.out
sql/test/BugTracker-2016/Tests/rename_exps.Bug-3974.sql
sql/test/BugTracker-2016/Tests/rename_exps.Bug-3974.stable.err
sql/test/BugTracker-2016/Tests/rename_exps.Bug-3974.stable.out
sql/test/BugTracker-2016/Tests/rename_exps.Bug-3974.stable.out.32bit
sql/test/BugTracker-2016/Tests/storagemodel.sql

MonetDB: stratified_sampling - Fix formatting

2016-05-13 Thread Abe Wits
Changeset: 95f9ff39f21a for MonetDB
URL: http://dev.monetdb.org/hg/MonetDB?cmd=changeset;node=95f9ff39f21a
Modified Files:
common/utils/mtwist.c
common/utils/mtwist.h
gdk/gdk_sample.c
Branch: stratified_sampling
Log Message:

Fix formatting


diffs (truncated from 317 to 300 lines):

diff --git a/common/utils/mtwist.c b/common/utils/mtwist.c
--- a/common/utils/mtwist.c
+++ b/common/utils/mtwist.c
@@ -49,9 +49,9 @@
 #define MTWIST_MATRIX_A 0x9908B0DFUL
 
 #define MTWIST_MIXBITS(u, v) (((u)_UPPER_MASK) | 
((v)_LOWER_MASK))
-#define MTWIST_TWIST(u, v)   \
-((MTWIST_MIXBITS(u, v) >> 1) ^ \
- ((v)&1UL ? MTWIST_MATRIX_A : 0UL))
+#define MTWIST_TWIST(u, v)\
+   ((MTWIST_MIXBITS(u, v) >> 1) ^ \
+((v)&1UL ? MTWIST_MATRIX_A : 0UL))
 
 /**
  * mtwist_new:
@@ -61,16 +61,16 @@
  * Return value: new MT object or NULL on failure
  */
 mtwist* mtwist_new(void) {
-mtwist* mt;
+   mtwist* mt;
 
-mt = (mtwist*)calloc(1, sizeof(*mt));
-if (!mt) return NULL;
+   mt = (mtwist*)calloc(1, sizeof(*mt));
+   if (!mt) return NULL;
 
-mt->remaining = 0;
-mt->next = NULL;
-mt->seeded = 0;
+   mt->remaining = 0;
+   mt->next = NULL;
+   mt->seeded = 0;
 
-return mt;
+   return mt;
 }
 
 /**
@@ -80,7 +80,7 @@ mtwist* mtwist_new(void) {
  * Destroy a Mersenne Twister object
  */
 void mtwist_free(mtwist* mt) {
-if (mt) free(mt);
+   if (mt) free(mt);
 }
 
 /**
@@ -91,38 +91,38 @@ void mtwist_free(mtwist* mt) {
  * Initialise a Mersenne Twister with an unsigned 32 bit int seed
  */
 void mtwist_seed(mtwist* mt, unsigned int seed) {
-int i;
+   int i;
 
-if (!mt) return;
+   if (!mt) return;
 
-mt->state[0] = (unsigned int)(seed & MTWIST_FULL_MASK);
-for (i = 1; i < MTWIST_N; i++) {
-mt->state[i] =
-(1812433253UL * (mt->state[i - 1] ^ (mt->state[i - 1] >> 30)) +
- i);
-mt->state[i] &= MTWIST_FULL_MASK;
-}
+   mt->state[0] = (unsigned int)(seed & MTWIST_FULL_MASK);
+   for (i = 1; i < MTWIST_N; i++) {
+   mt->state[i] =
+   (1812433253UL * (mt->state[i - 1] ^ (mt->state[i - 1] 
>> 30)) +
+i);
+   mt->state[i] &= MTWIST_FULL_MASK;
+   }
 
-mt->remaining = 0;
-mt->next = NULL;
+   mt->remaining = 0;
+   mt->next = NULL;
 
-mt->seeded = 1;
+   mt->seeded = 1;
 }
 
 static void mtwist_update_state(mtwist* mt) {
-int count;
-unsigned int* p = mt->state;
+   int count;
+   unsigned int* p = mt->state;
 
-for (count = (MTWIST_N - MTWIST_M + 1); --count; p++)
-*p = p[MTWIST_M] ^ MTWIST_TWIST(p[0], p[1]);
+   for (count = (MTWIST_N - MTWIST_M + 1); --count; p++)
+   *p = p[MTWIST_M] ^ MTWIST_TWIST(p[0], p[1]);
 
-for (count = MTWIST_M; --count; p++)
-*p = p[MTWIST_M - MTWIST_N] ^ MTWIST_TWIST(p[0], p[1]);
+   for (count = MTWIST_M; --count; p++)
+   *p = p[MTWIST_M - MTWIST_N] ^ MTWIST_TWIST(p[0], p[1]);
 
-*p = p[MTWIST_M - MTWIST_N] ^ MTWIST_TWIST(p[0], mt->state[0]);
+   *p = p[MTWIST_M - MTWIST_N] ^ MTWIST_TWIST(p[0], mt->state[0]);
 
-mt->remaining = MTWIST_N;
-mt->next = mt->state;
+   mt->remaining = MTWIST_N;
+   mt->next = mt->state;
 }
 
 /**
@@ -134,26 +134,26 @@ static void mtwist_update_state(mtwist* 
  * Return value: unsigned int with 32 valid bits
  */
 inline unsigned int mtwist_u32rand(mtwist* mt) {
-unsigned int r;
+   unsigned int r;
 
-if (!mt) return 0UL;
+   if (!mt) return 0UL;
 
-if (!mt->seeded) mtwist_seed(mt, 0);
+   if (!mt->seeded) mtwist_seed(mt, 0);
 
-if (!mt->remaining) mtwist_update_state(mt);
+   if (!mt->remaining) mtwist_update_state(mt);
 
-r = *mt->next++;
-mt->remaining--;
+   r = *mt->next++;
+   mt->remaining--;
 
-/* Tempering */
-r ^= (r >> 11);
-r ^= (r << 7) & 0x9D2C5680UL;
-r ^= (r << 15) & 0xEFC6UL;
-r ^= (r >> 18);
+   /* Tempering */
+   r ^= (r >> 11);
+   r ^= (r << 7) & 0x9D2C5680UL;
+   r ^= (r << 15) & 0xEFC6UL;
+   r ^= (r >> 18);
 
-r &= MTWIST_FULL_MASK;
+   r &= MTWIST_FULL_MASK;
 
-return r;
+   return r;
 }
 
 /**
@@ -165,16 +165,16 @@ inline unsigned int mtwist_u32rand(mtwis
  * Return value: random double in the range 0.0 inclusive to 1.0 exclusive;
  *[0.0, 1.0) */
 inline double mtwist_drand(mtwist* mt) {
-unsigned int r;
-double d;
+   unsigned int r;
+   double d;
 
-if (!mt) return 0.0;
+   if (!mt) return 0.0;
 
-r = mtwist_u32rand(mt);
+   r = mtwist_u32rand(mt);
 
-d = r / 4294967296.0; /* 2^32 */
+   d = r / 4294967296.0; /* 2^32 */
 
-return d;
+   return d;
 }
 
 
@@ -189,29 +189,29 @@ inline double mtwist_drand(mtwist* mt) {
  * [a,b] 
  */
 inline int mtwist_uniform_int(mtwist* mt, int a, int b) {
-unsigned int range, scale, max_x, 

MonetDB: stratified_sampling - Add utils to microbenchmark Makefile

2016-05-13 Thread Abe Wits
Changeset: d7201a422ad3 for MonetDB
URL: http://dev.monetdb.org/hg/MonetDB?cmd=changeset;node=d7201a422ad3
Modified Files:
monetdb5/modules/kernel/Makefile.ag
monetdb5/modules/kernel/microbenchmark.c
Branch: stratified_sampling
Log Message:

Add utils to microbenchmark Makefile


diffs (22 lines):

diff --git a/monetdb5/modules/kernel/Makefile.ag 
b/monetdb5/modules/kernel/Makefile.ag
--- a/monetdb5/modules/kernel/Makefile.ag
+++ b/monetdb5/modules/kernel/Makefile.ag
@@ -8,6 +8,7 @@ INCLUDES = ../../mal \
   ../atoms \
   ../../../common/options \
   ../../../common/stream \
+  ../../../common/utils \
   ../../../gdk
 
 MTSAFE
diff --git a/monetdb5/modules/kernel/microbenchmark.c 
b/monetdb5/modules/kernel/microbenchmark.c
--- a/monetdb5/modules/kernel/microbenchmark.c
+++ b/monetdb5/modules/kernel/microbenchmark.c
@@ -20,7 +20,6 @@
 #include 
 #include 
 #include "microbenchmark.h"
-
 #include "mtwist.h"
 
 static gdk_return
___
checkin-list mailing list
checkin-list@monetdb.org
https://www.monetdb.org/mailman/listinfo/checkin-list


MonetDB: stratified_sampling - Remove obsolete code

2016-05-13 Thread Abe Wits
Changeset: 7753663c9dc4 for MonetDB
URL: http://dev.monetdb.org/hg/MonetDB?cmd=changeset;node=7753663c9dc4
Modified Files:
gdk/gdk_sample.c
Branch: stratified_sampling
Log Message:

Remove obsolete code


diffs (34 lines):

diff --git a/gdk/gdk_sample.c b/gdk/gdk_sample.c
--- a/gdk/gdk_sample.c
+++ b/gdk/gdk_sample.c
@@ -31,21 +31,6 @@
 
 #undef BATsample
 
-#ifdef STATIC_CODE_ANALYSIS
-#define DRAND (0.5)
-#else
-/* the range of rand() is [0..RAND_MAX], i.e. inclusive;
- * cast first, add later: on Linux RAND_MAX == INT_MAX, so adding 1
- * will overflow, but INT_MAX does fit in a double */
-#if RAND_MAX < 46340   /* 46340*46340 = 2147395600 < INT_MAX */
-/* random range is too small, double it */
-#define DRAND ((double)(rand() * (RAND_MAX + 1) + rand()) / ((double) 
((RAND_MAX + 1) * (RAND_MAX + 1
-#else
-#define DRAND ((double)rand() / ((double)RAND_MAX + 1))
-#endif
-#endif
-
-
 /* this is a straightforward implementation of a binary tree */
 struct oidtreenode {
oid o;
@@ -175,8 +160,6 @@ BATsample(BAT *b, BUN n)
for (rescnt = 0; rescnt < n; rescnt++) {
oid candoid;
do {
-   //candoid = (oid) (minoid + DRAND * (maxoid - 
minoid));
-
/* generate a new random OID in [minoid, maxoid[
  * that is including minoid, excluding maxoid*/
 candoid = (oid) ( minoid + (mtwist_u32rand(mt_rng)%range) );
___
checkin-list mailing list
checkin-list@monetdb.org
https://www.monetdb.org/mailman/listinfo/checkin-list


MonetDB: stratified_sampling - Use Mersenne Twister in microbenc...

2016-05-13 Thread Abe Wits
Changeset: b0514875a835 for MonetDB
URL: http://dev.monetdb.org/hg/MonetDB?cmd=changeset;node=b0514875a835
Modified Files:
monetdb5/modules/kernel/microbenchmark.c
Branch: stratified_sampling
Log Message:

Use Mersenne Twister in microbenchmark (and fix formatting)


diffs (180 lines):

diff --git a/monetdb5/modules/kernel/microbenchmark.c 
b/monetdb5/modules/kernel/microbenchmark.c
--- a/monetdb5/modules/kernel/microbenchmark.c
+++ b/monetdb5/modules/kernel/microbenchmark.c
@@ -21,9 +21,7 @@
 #include 
 #include "microbenchmark.h"
 
-#ifdef STATIC_CODE_ANALYSIS
-#define rand() 0
-#endif
+#include "mtwist.h"
 
 static gdk_return
 BATrandom(BAT **bn, oid *base, wrd *size, int *domain, int seed)
@@ -32,6 +30,7 @@ BATrandom(BAT **bn, oid *base, wrd *size
BUN i;
BAT *b = NULL;
int *restrict val;
+   mtwist *mt_rng;
 
if (*size > (wrd)BUN_MAX) {
GDKerror("BATrandom: size must not exceed BUN_MAX");
@@ -62,21 +61,19 @@ BATrandom(BAT **bn, oid *base, wrd *size
val = (int *) Tloc(b, BUNfirst(b));
 
/* create BUNs with random distribution */
+   mt_rng = mtwist_new();
+   mtwist_seed(mt_rng, seed);
+
if (seed != int_nil)
-   srand(seed);
+   mtwist_seed(mt_rng, seed);
+
if (*domain == int_nil) {
-   for (i = 0; i < n; i++) {
-   val[i] = rand();
+   for (i = 0; i < n; i++) {
+   val[i] = mtwist_u32rand(mt_rng) % INT_MAX;
}
-#if RAND_MAX < 46340   /* 46340*46340 = 2147395600 < INT_MAX */
-   } else if (*domain > RAND_MAX + 1) {
-   for (i = 0; i < n; i++) {
-   val[i] = (rand() * (RAND_MAX + 1) + rand()) % *domain;
-   }
-#endif
} else {
-   for (i = 0; i < n; i++) {
-   val[i] = rand() % *domain;
+   for (i = 0; i < n; i++) {
+   val[i] = mtwist_u32rand(mt_rng) % *domain;
}
}
 
@@ -102,6 +99,8 @@ BATuniform(BAT **bn, oid *base, wrd *siz
BAT *b = NULL;
int *restrict val;
int v;
+   mtwist *mt_rng;
+
 
if (*size > (wrd)BUN_MAX) {
GDKerror("BATuniform: size must not exceed BUN_MAX");
@@ -132,15 +131,18 @@ BATuniform(BAT **bn, oid *base, wrd *siz
val = (int *) Tloc(b, BUNfirst(b));
 
/* create BUNs with uniform distribution */
-for (v = 0, i = 0; i < n; i++) {
+   for (v = 0, i = 0; i < n; i++) {
val[i] = v;
if (++v >= *domain)
v = 0;
}
 
+   mt_rng = mtwist_new();
+
/* mix BUNs randomly */
for (r = 0, i = 0; i < n; i++) {
-   const BUN j = i + ((r += rand()) % (n - i));
+   
+   const BUN j = i + ((r += mtwist_u32rand(mt_rng)) % (n - i));
const int tmp = val[i];
 
val[i] = val[j];
@@ -170,6 +172,8 @@ BATskewed(BAT **bn, oid *base, wrd *size
int *restrict val;
const BUN skewedSize = ((*skew) * n) / 100;
const int skewedDomain = ((100 - (*skew)) * (*domain)) / 100;
+   mtwist *mt_rng;
+
 
if (*size > (wrd)BUN_MAX) {
GDKerror("BATskewed: size must not exceed BUN_MAX = " BUNFMT, 
BUN_MAX);
@@ -204,15 +208,17 @@ BATskewed(BAT **bn, oid *base, wrd *size
}
val = (int *) Tloc(b, BUNfirst(b));
 
+   mt_rng = mtwist_new();
+
/* create BUNs with skewed distribution */
for (i = 0; i < skewedSize; i++)
-   val[i] = rand() % skewedDomain;
+   val[i] = mtwist_u32rand(mt_rng) % skewedDomain;
for( ; i < n; i++)
-   val[i] = (rand() % (*domain - skewedDomain)) + skewedDomain;
+   val[i] = (mtwist_u32rand(mt_rng) % (*domain - skewedDomain)) + 
skewedDomain;
 
/* mix BUNs randomly */
for (r = 0, i = 0; i < n; i++) {
-   const BUN j = i + ((r += rand()) % (n - i));
+   const BUN j = i + ((r += mtwist_u32rand(mt_rng)) % (n - i));
const int tmp = val[i];
 
val[i] = val[j];
@@ -279,7 +285,7 @@ BATnormal(BAT **bn, oid *base, wrd *size
b = BATnew(TYPE_void, TYPE_int, n, TRANSIENT);
if (b == NULL) {
return GDK_FAIL;
-}
+   }
if (n == 0) {
b->tsorted = 1;
b->trevsorted = 0;
@@ -297,8 +303,8 @@ BATnormal(BAT **bn, oid *base, wrd *size
 
abs = (unsigned int *) GDKmalloc(d * sizeof(unsigned int));
if (abs == NULL) {
-   BBPreclaim(b);
-   return GDK_FAIL;
+   BBPreclaim(b);
+   return GDK_FAIL;
}
rel = (flt *) abs;
 
@@ -327,16 +333,16 @@ BATnormal(BAT **bn, oid *base, wrd *size
 
/* create BUNs with normal distribution */
for (j = 0, i = 0; i < n && j < d; i++) {
-   

MonetDB: stratified_sampling - Update Makefiles and fix minor bugs

2016-05-13 Thread Abe Wits
Changeset: 1a1be68cd958 for MonetDB
URL: http://dev.monetdb.org/hg/MonetDB?cmd=changeset;node=1a1be68cd958
Modified Files:
common/utils/Makefile.ag
common/utils/mtwist.c
gdk/Makefile.ag
gdk/gdk_sample.c
Branch: stratified_sampling
Log Message:

Update Makefiles and fix minor bugs


diffs (61 lines):

diff --git a/common/utils/Makefile.ag b/common/utils/Makefile.ag
--- a/common/utils/Makefile.ag
+++ b/common/utils/Makefile.ag
@@ -30,4 +30,9 @@ lib_msabaoth = {
SOURCES = msabaoth.h msabaoth.c
 }
 
+lib_mtwist = {
+   NOINST
+   SOURCES = mtwist.h mtwist.c
+}
+
 EXTRA_DIST = s_nextafterf.c math_private.h strptime.c
diff --git a/common/utils/mtwist.c b/common/utils/mtwist.c
--- a/common/utils/mtwist.c
+++ b/common/utils/mtwist.c
@@ -189,16 +189,17 @@ inline double mtwist_drand(mtwist* mt) {
  * [a,b] 
  */
 inline int mtwist_uniform_int(mtwist* mt, int a, int b) {
+unsigned int range, scale, max_x, x;
 if(b < a) {//invalid range!
 return 0;
 }
-unsigned int range = b-a+1;
-unsigned int scale = 4294967295UL/range;
+range = b-a+1;
+scale = 4294967295UL/range;
 //4294967295UL=2^32-1=RAND_MAX for this Mersenne Twister
-unsigned int max_x = range*scale;
+max_x = range*scale;
+
 //x will be uniform in [0, max_x[
 //Since past%range=0, x%range will be uniform in [0,range[
-unsigned int x; 
 do {
 x = mtwist_u32rand(mt);
 } while(x >= max_x);
diff --git a/gdk/Makefile.ag b/gdk/Makefile.ag
--- a/gdk/Makefile.ag
+++ b/gdk/Makefile.ag
@@ -35,6 +35,7 @@ lib_gdk = {
LIBS = ../common/options/libmoptions \
../common/stream/libstream \
../common/utils/libmutils \
+   ../common/utils/libmtwist \
$(MATH_LIBS) $(SOCKET_LIBS) $(zlib_LIBS) $(BZ_LIBS) \
$(MALLOC_LIBS) $(PTHREAD_LIBS) $(DL_LIBS) $(PSAPILIB) 
$(KVM_LIBS)
 }
diff --git a/gdk/gdk_sample.c b/gdk/gdk_sample.c
--- a/gdk/gdk_sample.c
+++ b/gdk/gdk_sample.c
@@ -27,7 +27,7 @@
 #include "gdk.h"
 #include "gdk_private.h"
 
-#include 
+#include "mtwist.h"
 
 #undef BATsample
 
___
checkin-list mailing list
checkin-list@monetdb.org
https://www.monetdb.org/mailman/listinfo/checkin-list


MonetDB: stratified_sampling - Merge default into stratified_sam...

2016-05-13 Thread Abe Wits
Changeset: d2f4b3857070 for MonetDB
URL: http://dev.monetdb.org/hg/MonetDB?cmd=changeset;node=d2f4b3857070
Added Files:
sql/test/BugTracker-2016/Tests/consolidated_table.Bug-3954.sql
sql/test/BugTracker-2016/Tests/invalidcolumns.Bug-3968.sql
sql/test/emptydb-upgrade-chain-hge/Tests/check.SQL.py.src
sql/test/emptydb-upgrade-chain-hge/Tests/check.stable.err.int128
sql/test/emptydb-upgrade-chain-hge/Tests/dump.stable.err.int128
sql/test/emptydb-upgrade-chain-hge/Tests/package.stable.err.int128
sql/test/emptydb-upgrade-chain-hge/Tests/package.stable.out.int128
sql/test/emptydb-upgrade-chain-hge/Tests/unpackage.stable.err.int128
sql/test/emptydb-upgrade-chain-hge/Tests/unpackage.stable.out.int128
sql/test/emptydb-upgrade-chain-hge/Tests/upgrade.stable.err.int128
sql/test/emptydb-upgrade-chain/Tests/check.SQL.py.src
sql/test/emptydb-upgrade-hge/Tests/check.SQL.py.src
sql/test/emptydb-upgrade-hge/Tests/check.stable.err.int128
sql/test/emptydb-upgrade-hge/Tests/dump.stable.err.int128
sql/test/emptydb-upgrade-hge/Tests/unpackage.stable.err.int128
sql/test/emptydb-upgrade-hge/Tests/unpackage.stable.out.int128
sql/test/emptydb-upgrade-hge/Tests/upgrade.stable.err.int128
sql/test/emptydb-upgrade/Tests/check.SQL.py.src
sql/test/emptydb/Tests/check.SQL.py.src
sql/test/emptydb/Tests/package-hge.stable.err.int128
sql/test/emptydb/Tests/package-hge.stable.out.int128
Removed Files:
sql/test/emptydb-upgrade-chain-hge/Tests/check.SQL.py
sql/test/emptydb-upgrade-chain-hge/Tests/check.stable.err
sql/test/emptydb-upgrade-chain-hge/Tests/dump.stable.err
sql/test/emptydb-upgrade-chain-hge/Tests/package.stable.err
sql/test/emptydb-upgrade-chain-hge/Tests/package.stable.out
sql/test/emptydb-upgrade-chain-hge/Tests/unpackage.stable.err
sql/test/emptydb-upgrade-chain-hge/Tests/unpackage.stable.out
sql/test/emptydb-upgrade-chain-hge/Tests/upgrade.stable.err
sql/test/emptydb-upgrade-chain/Tests/check.SQL.py
sql/test/emptydb-upgrade-hge/Tests/check.SQL.py
sql/test/emptydb-upgrade-hge/Tests/check.stable.err
sql/test/emptydb-upgrade-hge/Tests/dump.stable.err
sql/test/emptydb-upgrade-hge/Tests/unpackage.stable.err
sql/test/emptydb-upgrade-hge/Tests/unpackage.stable.out
sql/test/emptydb-upgrade-hge/Tests/upgrade.stable.err
sql/test/emptydb-upgrade/Tests/check.SQL.py
sql/test/emptydb/Tests/check.SQL.py
sql/test/emptydb/Tests/package-hge.stable.err
sql/test/emptydb/Tests/package-hge.stable.out
Modified Files:
.hgtags
MonetDB.spec
NT/installer32/MonetDB-ODBC-Installer.vdproj
NT/installer32/MonetDB5-Geom-Module.vdproj
NT/installer32/MonetDB5-SQL-Installer.vdproj
NT/installer64/MonetDB-ODBC-Installer.vdproj
NT/installer64/MonetDB5-Geom-Module.vdproj
NT/installer64/MonetDB5-SQL-Installer.vdproj
NT/monetdb_config.h.in
NT/rules.msc
clients/R/MonetDB.R/DESCRIPTION
clients/Tests/All
clients/Tests/MAL-signatures.stable.out
clients/Tests/MAL-signatures.stable.out.int128
clients/Tests/SQL-dump.SQL.py
clients/Tests/SQL-dump.stable.out
clients/Tests/SQL-dump.stable.out.int128
clients/Tests/exports.stable.out
clients/mapiclient/dump.c
clients/mapilib/mapi.rc
clients/odbc/driver/driver.rc
clients/odbc/winsetup/setup.rc
clients/python2/setup.py
clients/python3/setup.py
configure.ag
debian/changelog
gdk/ChangeLog.Jun2016
gdk/gdk.h
gdk/gdk_aggr.c
gdk/gdk_atoms.c
gdk/gdk_bat.c
gdk/gdk_batop.c
gdk/gdk_bbp.c
gdk/gdk_calc.c
gdk/gdk_calc_compare.h
gdk/gdk_group.c
gdk/gdk_heap.c
gdk/gdk_logger.c
gdk/gdk_logger.h
gdk/gdk_project.c
gdk/gdk_system.c
gdk/gdk_tm.c
gdk/libbat.rc
geom/monetdb5/geom.c
geom/monetdb5/geom.h
geom/monetdb5/geom_upgrade.c
java/ChangeLog.Jun2016
java/build.properties
java/pom.xml
java/release.txt
java/src/main/java/nl/cwi/monetdb/jdbc/MonetDatabaseMetaData.java
libversions
monetdb5/modules/atoms/batxml.c
monetdb5/modules/atoms/blob.c
monetdb5/modules/atoms/json.c
monetdb5/modules/kernel/algebra.c
monetdb5/modules/kernel/batmmath.c
monetdb5/modules/kernel/batmmath.h
monetdb5/modules/kernel/batmmath.mal
monetdb5/modules/mal/batcalc.c
monetdb5/modules/mal/bbp.c
monetdb5/modules/mal/bbp.h
monetdb5/modules/mal/bbp.mal
monetdb5/optimizer/opt_deadcode.c
monetdb5/tools/libmonetdb5.rc
sql/backends/monet5/sql.c

MonetDB: stratified_sampling - Change sample to use Marsenne Twi...

2016-05-13 Thread Abe Wits
Changeset: 25fc42cc1c58 for MonetDB
URL: http://dev.monetdb.org/hg/MonetDB?cmd=changeset;node=25fc42cc1c58
Modified Files:
gdk/gdk_sample.c
Branch: stratified_sampling
Log Message:

Change sample to use Marsenne Twister


diffs (37 lines):

diff --git a/gdk/gdk_sample.c b/gdk/gdk_sample.c
--- a/gdk/gdk_sample.c
+++ b/gdk/gdk_sample.c
@@ -116,6 +116,8 @@ BATsample(BAT *b, BUN n)
BUN cnt, slen;
BUN rescnt;
struct oidtreenode *tree = NULL;
+mtwist *mt_rng;
+unsigned int range;
 
BATcheck(b, "BATsample", NULL);
assert(BAThdense(b));
@@ -164,16 +166,20 @@ BATsample(BAT *b, BUN n)
/* while we do not have enough sample OIDs yet */
 
 /* create and seed Mersenne Twister */
-mtwist *mt_rng = NULL;
 mt_rng = mtwist_new();
+
 mtwist_seed(mt_rng, rand());
 
+range = maxoid - minoid;
+
for (rescnt = 0; rescnt < n; rescnt++) {
oid candoid;
do {
-   /* generate a new random OID */
-   candoid = (oid) (minoid + DRAND * (maxoid - 
minoid));
-candoid = (oid) (minoid + ())
+   //candoid = (oid) (minoid + DRAND * (maxoid - 
minoid));
+
+   /* generate a new random OID in [minoid, maxoid[
+ * that is including minoid, excluding maxoid*/
+candoid = (oid) ( minoid + (mtwist_u32rand(mt_rng)%range) );
/* if that candidate OID was already
 * generated, try again */
} while (!OIDTreeMaybeInsert(tree, candoid, rescnt));
___
checkin-list mailing list
checkin-list@monetdb.org
https://www.monetdb.org/mailman/listinfo/checkin-list


MonetDB: stratified_sampling - Fix typos

2016-05-13 Thread Abe Wits
Changeset: ff8072a9696f for MonetDB
URL: http://dev.monetdb.org/hg/MonetDB?cmd=changeset;node=ff8072a9696f
Modified Files:
common/utils/mtwist.c
common/utils/mtwist.h
gdk/gdk_sample.c
Branch: stratified_sampling
Log Message:

Fix typos


diffs (48 lines):

diff --git a/common/utils/mtwist.c b/common/utils/mtwist.c
--- a/common/utils/mtwist.c
+++ b/common/utils/mtwist.c
@@ -189,7 +189,6 @@ inline double mtwist_drand(mtwist* mt) {
  * [a,b] 
  */
 inline int mtwist_uniform_int(mtwist* mt, int a, int b) {
-return mtwist_u32rand(mt)%b;
 if(b < a) {//invalid range!
 return 0;
 }
diff --git a/common/utils/mtwist.h b/common/utils/mtwist.h
--- a/common/utils/mtwist.h
+++ b/common/utils/mtwist.h
@@ -60,8 +60,8 @@ void mtwist_free(mtwist* mt);
 
 /* methods */
 void mtwist_seed(mtwist* mt, unsigned int seed);
-inline unsigned int mtwist_u32rand(mtwist* mt);
-inline double mtwist_drand(mtwist* mt);
-inline int mtwist_uniform_int(mtwist* mt, int a, int b);
+unsigned int mtwist_u32rand(mtwist* mt);
+double mtwist_drand(mtwist* mt);
+int mtwist_uniform_int(mtwist* mt, int a, int b);
 
 #endif
diff --git a/gdk/gdk_sample.c b/gdk/gdk_sample.c
--- a/gdk/gdk_sample.c
+++ b/gdk/gdk_sample.c
@@ -162,11 +162,18 @@ BATsample(BAT *b, BUN n)
return NULL;
}
/* while we do not have enough sample OIDs yet */
+
+/* create and seed Mersenne Twister */
+mtwist *mt_rng = NULL;
+mt_rng = mtwist_new();
+mtwist_seed(mt_rng, rand());
+
for (rescnt = 0; rescnt < n; rescnt++) {
oid candoid;
do {
/* generate a new random OID */
candoid = (oid) (minoid + DRAND * (maxoid - 
minoid));
+candoid = (oid) (minoid + ())
/* if that candidate OID was already
 * generated, try again */
} while (!OIDTreeMaybeInsert(tree, candoid, rescnt));
___
checkin-list mailing list
checkin-list@monetdb.org
https://www.monetdb.org/mailman/listinfo/checkin-list


MonetDB: stratified_sampling - Add Mersenne Prime Twister (under...

2016-05-13 Thread Abe Wits
Changeset: 5bc10ab3140c for MonetDB
URL: http://dev.monetdb.org/hg/MonetDB?cmd=changeset;node=5bc10ab3140c
Added Files:
common/utils/mtwist.c
common/utils/mtwist.h
Branch: stratified_sampling
Log Message:

Add Mersenne Prime Twister (under Unlicense)


diffs (252 lines):

diff --git a/common/utils/mtwist.c b/common/utils/mtwist.c
new file mode 100644
--- /dev/null
+++ b/common/utils/mtwist.c
@@ -0,0 +1,175 @@
+/* 
+ * mtwist.c - Mersenne Twister functions
+ *
+ * This is free and unencumbered software released into the public domain.
+ *
+ * Anyone is free to copy, modify, publish, use, compile, sell, or
+ * distribute this software, either in source code form or as a compiled
+ * binary, for any purpose, commercial or non-commercial, and by any
+ * means.
+ *
+ * In jurisdictions that recognize copyright laws, the author or authors
+ * of this software dedicate any and all copyright interest in the
+ * software to the public domain. We make this dedication for the benefit
+ * of the public at large and to the detriment of our heirs and
+ * successors. We intend this dedication to be an overt act of
+ * relinquishment in perpetuity of all present and future rights to this
+ * software under copyright law.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+ * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ *
+ * For more information, please refer to 
+ *
+ */
+
+#include "monetdb_config.h"
+#include 
+#include 
+
+/*
+ * Mersenne Twister (MT19937) algorithm
+ * http://www.math.sci.hiroshima-u.ac.jp/~m-mat/MT/emt.html
+ *
+ * http://en.wikipedia.org/wiki/Mersenne_twister
+ *
+ */
+
+#define MTWIST_UPPER_MASK 0x8000UL
+#define MTWIST_LOWER_MASK 0x7FFFUL
+#define MTWIST_FULL_MASK 0xUL
+#define MTWIST_MATRIX_A 0x9908B0DFUL
+
+#define MTWIST_MIXBITS(u, v) (((u)_UPPER_MASK) | 
((v)_LOWER_MASK))
+#define MTWIST_TWIST(u, v)   \
+((MTWIST_MIXBITS(u, v) >> 1) ^ \
+ ((v)&1UL ? MTWIST_MATRIX_A : 0UL))
+
+/**
+ * mtwist_new:
+ *
+ * Construct a Mersenne Twister object
+ *
+ * Return value: new MT object or NULL on failure
+ */
+mtwist* mtwist_new(void) {
+mtwist* mt;
+
+mt = (mtwist*)calloc(1, sizeof(*mt));
+if (!mt) return NULL;
+
+mt->remaining = 0;
+mt->next = NULL;
+mt->seeded = 0;
+
+return mt;
+}
+
+/**
+ * mtwist_free:
+ * @mt: mt object
+ *
+ * Destroy a Mersenne Twister object
+ */
+void mtwist_free(mtwist* mt) {
+if (mt) free(mt);
+}
+
+/**
+ * mtwist_seed:
+ * @mt: mt object
+ * @seed: seed (lower 32 bits used)
+ *
+ * Initialise a Mersenne Twister with an unsigned 32 bit int seed
+ */
+void mtwist_seed(mtwist* mt, unsigned int seed) {
+int i;
+
+if (!mt) return;
+
+mt->state[0] = (unsigned int)(seed & MTWIST_FULL_MASK);
+for (i = 1; i < MTWIST_N; i++) {
+mt->state[i] =
+(1812433253UL * (mt->state[i - 1] ^ (mt->state[i - 1] >> 30)) +
+ i);
+mt->state[i] &= MTWIST_FULL_MASK;
+}
+
+mt->remaining = 0;
+mt->next = NULL;
+
+mt->seeded = 1;
+}
+
+static void mtwist_update_state(mtwist* mt) {
+int count;
+unsigned int* p = mt->state;
+
+for (count = (MTWIST_N - MTWIST_M + 1); --count; p++)
+*p = p[MTWIST_M] ^ MTWIST_TWIST(p[0], p[1]);
+
+for (count = MTWIST_M; --count; p++)
+*p = p[MTWIST_M - MTWIST_N] ^ MTWIST_TWIST(p[0], p[1]);
+
+*p = p[MTWIST_M - MTWIST_N] ^ MTWIST_TWIST(p[0], mt->state[0]);
+
+mt->remaining = MTWIST_N;
+mt->next = mt->state;
+}
+
+/**
+ * mtwist_u32rand:
+ * @mt: mt object
+ *
+ * Get a random unsigned 32 bit integer from the random number generator
+ *
+ * Return value: unsigned int with 32 valid bits
+ */
+unsigned int mtwist_u32rand(mtwist* mt) {
+unsigned int r;
+
+if (!mt) return 0UL;
+
+if (!mt->seeded) mtwist_seed(mt, 0);
+
+if (!mt->remaining) mtwist_update_state(mt);
+
+r = *mt->next++;
+mt->remaining--;
+
+/* Tempering */
+r ^= (r >> 11);
+r ^= (r << 7) & 0x9D2C5680UL;
+r ^= (r << 15) & 0xEFC6UL;
+r ^= (r >> 18);
+
+r &= MTWIST_FULL_MASK;
+
+return (unsigned int)r;
+}
+
+/**
+ * mtwist_drand:
+ * @mt: mt object
+ *
+ * Get a random double from the random number generator
+ *
+ * Return value: random double in the range 0.0 inclusive to 1.0 exclusive;
+ *[0.0, 1.0) */
+double mtwist_drand(mtwist* mt) {
+unsigned int r;
+double d;
+
+if (!mt) return 0.0;
+
+r = mtwist_u32rand(mt);
+
+d = r / 4294967296.0; /* 2^32 */
+
+return d;
+}
diff --git a/common/utils/mtwist.h b/common/utils/mtwist.h
new file mode 100644
--- /dev/null
+++ 

MonetDB: stratified_sampling - Add uniform sampling, update desc...

2016-05-13 Thread Abe Wits
Changeset: ae9e402f89b9 for MonetDB
URL: http://dev.monetdb.org/hg/MonetDB?cmd=changeset;node=ae9e402f89b9
Modified Files:
common/utils/mtwist.c
common/utils/mtwist.h
gdk/gdk_sample.c
Branch: stratified_sampling
Log Message:

Add uniform sampling, update desciptions, fix includes


diffs (129 lines):

diff --git a/common/utils/mtwist.c b/common/utils/mtwist.c
--- a/common/utils/mtwist.c
+++ b/common/utils/mtwist.c
@@ -29,13 +29,16 @@
  */
 
 #include "monetdb_config.h"
-#include 
 #include 
 
 /*
- * Mersenne Twister (MT19937) algorithm
+ * @a Dave Beckett, Abe Wits
+ * @* Mersenne Twister (MT19937) algorithm
+ * 
+ * This random number generator has very good statistical properties,
+ * and outperforms most stl implementations of rand() in terms of speed
+ *
  * http://www.math.sci.hiroshima-u.ac.jp/~m-mat/MT/emt.html
- *
  * http://en.wikipedia.org/wiki/Mersenne_twister
  *
  */
@@ -130,7 +133,7 @@ static void mtwist_update_state(mtwist* 
  *
  * Return value: unsigned int with 32 valid bits
  */
-unsigned int mtwist_u32rand(mtwist* mt) {
+inline unsigned int mtwist_u32rand(mtwist* mt) {
 unsigned int r;
 
 if (!mt) return 0UL;
@@ -150,7 +153,7 @@ unsigned int mtwist_u32rand(mtwist* mt) 
 
 r &= MTWIST_FULL_MASK;
 
-return (unsigned int)r;
+return r;
 }
 
 /**
@@ -161,7 +164,7 @@ unsigned int mtwist_u32rand(mtwist* mt) 
  *
  * Return value: random double in the range 0.0 inclusive to 1.0 exclusive;
  *[0.0, 1.0) */
-double mtwist_drand(mtwist* mt) {
+inline double mtwist_drand(mtwist* mt) {
 unsigned int r;
 double d;
 
@@ -173,3 +176,42 @@ double mtwist_drand(mtwist* mt) {
 
 return d;
 }
+
+
+/**
+ * mtwist_uniform_int:
+ * @a, b; two integers such that a<=b
+ *
+ * Get an int in an interval uniform randomly from the
+ * random number generator.
+ *
+ * Return value: random interval in range a inclusive to b inclusive;
+ * [a,b] 
+ */
+inline int mtwist_uniform_int(mtwist* mt, int a, int b) {
+return mtwist_u32rand(mt)%b;
+if(b < a) {//invalid range!
+return 0;
+}
+unsigned int range = b-a+1;
+unsigned int scale = 4294967295UL/range;
+//4294967295UL=2^32-1=RAND_MAX for this Mersenne Twister
+unsigned int max_x = range*scale;
+//x will be uniform in [0, max_x[
+//Since past%range=0, x%range will be uniform in [0,range[
+unsigned int x; 
+do {
+x = mtwist_u32rand(mt);
+} while(x >= max_x);
+
+return a+(x/scale);
+//x is uniform in [0,max_x[ = [0,range*scale[
+//hence x/scale is uniform in [0,range[=[0,b-a+1[
+//thus a+(x/scale) is uniform in [a,b]
+
+//alternative: return a+(x%range); 
+//x is uniform in [0,max_x[ = [0,range*scale[
+//hence (x%range) is uniform in [0,range[=[0,b-a+1[
+//thus a+(x%range) is uniform in [a,b]
+}
+
diff --git a/common/utils/mtwist.h b/common/utils/mtwist.h
--- a/common/utils/mtwist.h
+++ b/common/utils/mtwist.h
@@ -60,8 +60,8 @@ void mtwist_free(mtwist* mt);
 
 /* methods */
 void mtwist_seed(mtwist* mt, unsigned int seed);
-unsigned int mtwist_u32rand(mtwist* mt);
-double mtwist_drand(mtwist* mt);
-
+inline unsigned int mtwist_u32rand(mtwist* mt);
+inline double mtwist_drand(mtwist* mt);
+inline int mtwist_uniform_int(mtwist* mt, int a, int b);
 
 #endif
diff --git a/gdk/gdk_sample.c b/gdk/gdk_sample.c
--- a/gdk/gdk_sample.c
+++ b/gdk/gdk_sample.c
@@ -7,7 +7,7 @@
  */
 
 /*
- * @a Lefteris Sidirourgos, Hannes Muehleisen
+ * @a Lefteris Sidirourgos, Hannes Muehleisen, Abe Wits
  * @* Low level sample facilities
  *
  * This sampling implementation generates a sorted set of OIDs by
@@ -27,6 +27,8 @@
 #include "gdk.h"
 #include "gdk_private.h"
 
+#include 
+
 #undef BATsample
 
 #ifdef STATIC_CODE_ANALYSIS
___
checkin-list mailing list
checkin-list@monetdb.org
https://www.monetdb.org/mailman/listinfo/checkin-list


MonetDB: stratified_sampling - create stratified_sampling branch

2016-03-23 Thread Abe Wits
Changeset: e1cab0bf2541 for MonetDB
URL: http://dev.monetdb.org/hg/MonetDB?cmd=changeset;node=e1cab0bf2541
Branch: stratified_sampling
Log Message:

create stratified_sampling branch

___
checkin-list mailing list
checkin-list@monetdb.org
https://www.monetdb.org/mailman/listinfo/checkin-list