Changeset: 9c93f4253ff6 for MonetDB
URL: http://dev.monetdb.org/hg/MonetDB?cmd=changeset;node=9c93f4253ff6
Modified Files:
        gdk/gdk_sample.c
Branch: stratified_sampling
Log Message:

remove CDF-based weighted sampling


diffs (157 lines):

diff --git a/gdk/gdk_sample.c b/gdk/gdk_sample.c
--- a/gdk/gdk_sample.c
+++ b/gdk/gdk_sample.c
@@ -100,13 +100,10 @@ OIDTreeToBATAntiset(struct oidtreenode *
        //TODO create this function
 }*/
 
-/* 
- * _BATsample is the internal (weighted) sampling function without replacement
- * If cdf=NULL, an uniform sample is taken
- * Otherwise it is assumed the cdf increases monotonically
- */
-static BAT *
-_BATsample(BAT *b, BUN n, BAT *cdf)
+
+/* BATsample takes uniform samples of void headed BATs */
+BAT *
+BATsample(BAT *b, BUN n)
 {
        BAT *bn;
        BUN cnt, slen;
@@ -115,8 +112,6 @@ static BAT *
        mtwist *mt_rng;
        unsigned int range;
        dbl random;
-       dbl cdf_max;
-       dbl* cdf_ptr;
 
        BATcheck(b, "BATsample", NULL);
        ERRORcheck(n > BUN_MAX, "BATsample: sample size larger than BUN_MAX\n", 
NULL);
@@ -167,48 +162,18 @@ static BAT *
                
                range = maxoid - minoid;
                
-               /* sample OIDs (method depends on w) */
-               if(cdf == NULL) {
-                       /* no weights, hence do uniform sampling */
-
-                       /* while we do not have enough sample OIDs yet */
-                       for (rescnt = 0; rescnt < n; rescnt++) {
-                               oid candoid;
-                               do {
-                                       /* generate a new random OID in 
[minoid, maxoid[
-                                        * that is including minoid, excluding 
maxoid*/
-                                       candoid = (oid) ( minoid + 
(mtwist_u32rand(mt_rng)%range) );
-                                       /* if that candidate OID was already
-                                        * generated, try again */
-                               } while (!OIDTreeMaybeInsert(tree, candoid, 
rescnt));
-                       }
-
-               } else {
-                       /* do weighted sampling */
-                       
-                       cdf_ptr = (dbl*) Tloc(cdf, 0);
-                       if (!antiset)
-                               cdf_max = cdf_ptr[cnt-1];
-                       else
-                               cdf_max = cdf_ptr[0];
-                          //TODO how to type/cast cdf_max?
-
-                       /* generate candoids, using CDF */
-                       for (rescnt = 0; rescnt < n; rescnt++) {
-                               oid candoid;
-
-                               do {
-                                       random = mtwist_drand(mt_rng)*cdf_max;
-                                       /* generate a new random OID with 
minoid <= OID < maxoid */
-                                       /* note that cdf has already been 
adjusted for antiset case */
-                                       candoid = (oid) ( minoid + (oid) 
SORTfndfirst(cdf, &random) );
-                                       /* if that candidate OID was already
-                                        * generated, try again */
-                               } while (!OIDTreeMaybeInsert(tree, candoid, 
rescnt));
-                       }
+               /* while we do not have enough sample OIDs yet */
+               for (rescnt = 0; rescnt < n; rescnt++) {
+                       oid candoid;
+                       do {
+                               /* generate a new random OID in [minoid, maxoid[
+                                * that is including minoid, excluding maxoid*/
+                               candoid = (oid) ( minoid + 
(mtwist_u32rand(mt_rng)%range) );
+                               /* if that candidate OID was already
+                                * generated, try again */
+                       } while (!OIDTreeMaybeInsert(tree, candoid, rescnt));
                }
 
-
                if (!antiset) {
                        OIDTreeToBAT(tree, bn);
                } else {
@@ -227,20 +192,11 @@ static BAT *
        return bn;
 }
 
-
-/* BATsample takes uniform samples of void headed BATs */
-BAT *
-BATsample(BAT *b, BUN n)
-{
-       return _BATsample(b, n, NULL);
-}
-
 /* BATweightedsample takes weighted samples of void headed BATs */
 /* Note that the type of w should be castable to doubles */
 BAT *
 BATweightedsample(BAT *b, BUN n, BAT *w)
 {
-       BAT* cdf;
        BAT* sample;
        dbl* w_ptr;//TODO types of w
        dbl* cdf_ptr;
@@ -259,44 +215,14 @@ BATweightedsample(BAT *b, BUN n, BAT *w)
 
        antiset = n > cnt / 2;
 
-       cdf = COLnew(0, TYPE_dbl, cnt, TRANSIENT);
-       BATsetcount(cdf, cnt);
-       
-       /* calculate cumilative distribution function */
-       w_ptr = (dbl*) Tloc(w, 0);//TODO support different types w
-       cdf_ptr = (dbl*) Tloc(cdf, 0);
-
-       cdf_ptr[0] = (dbl)w_ptr[0];
-
-       /* remove NULL values */
-       for (i = 1; i < cnt; i++) {
-               if((dbl)w_ptr[i] == dbl_nil) {//TODO fix NULL-test if w can 
have different types
-                       cdf_ptr[i] = cdf_ptr[i-1];
-               } else {
-                       cdf_ptr[i] = ((dbl)w_ptr[i]) + cdf_ptr[i-1];
-               }
-       }
-       if (!antiset) {
-               cdf->tsorted = 1;
-               cdf->trevsorted = cnt <= 1;
-       } else {
-               /* in antiset notation, we have to flip probabilities */
-               for (i = 0; i < cnt; i++) {
-                        cdf_ptr[i] = cdf_ptr[cnt-1] - cdf_ptr[i];
-               }
-               cdf->tsorted = cnt <= 1;
-               cdf->trevsorted = 1;
-       }
-       
        /* obtain sample */
-       sample = _BATsample(b, n, cdf);
-       
-       BATdelete(cdf);
+       sample = NULL;//TODO: reservoir sampling with exponential jumps
 
        return sample;
 }
 
 
+
 /* BATweightedbitbat creates a bit BAT of length cnt containing n 1s and cnt-n 
0s */
 /* Note that the type of w should be castable to doubles */
 /*BAT *
_______________________________________________
checkin-list mailing list
checkin-list@monetdb.org
https://www.monetdb.org/mailman/listinfo/checkin-list

Reply via email to