Changeset: 3468b2d208f3 for MonetDB
URL: http://dev.monetdb.org/hg/MonetDB?cmd=changeset;node=3468b2d208f3
Modified Files:
        gdk/gdk_sample.c
        monetdb5/modules/mal/sample.mal
Branch: stratified_sampling
Log Message:

cast non-double weights for weighted sampling


diffs (179 lines):

diff --git a/gdk/gdk_sample.c b/gdk/gdk_sample.c
--- a/gdk/gdk_sample.c
+++ b/gdk/gdk_sample.c
@@ -143,12 +143,6 @@ OIDTreeToBATAntiset(struct oidtreenode *
                        ((oid *) bat->theap.base)[bat->batCount++] = noid;
 }
 
-/* inorder traversal, gives us a bit BAT */
-/*BAT *bat OIDTreeToBITBAT(struct oidtreenode)
-{
-       //TODO create this function
-}*/
-
 
 /* BATsample takes uniform samples of void headed BATs */
 BAT *
@@ -248,6 +242,8 @@ BATsample(BAT *b, BUN n)
 BAT *
 BATweightedsample(BAT *b, BUN n, BAT *w)
 {//TODO test correctness extensively
+       BAT* weights;
+       bit weights_are_cast;
        BAT* sample;
        oid* oids;/* points to the oids in sample */
        dbl* w_ptr;//TODO types of w
@@ -264,30 +260,45 @@ BATweightedsample(BAT *b, BUN n, BAT *w)
 
        ERRORcheck(w->ttype == TYPE_str || w->ttype == TYPE_void,
                                        "BATsample: type of weights not castable to doubles\n", NULL);
-       ERRORcheck(w->ttype != TYPE_dbl,
-                                       "BATsample: type of weights must be doubles\n", NULL);//TODO types of w (want to remove this)
+
+       if(w->ttype != TYPE_dbl) {
+               weights = BATconvert(w, NULL, TYPE_dbl, 0);
+               ERRORcheck(weights == NULL, "BATsample: could not cast weights to doubles\n", NULL);
+               weights_are_cast = 1;
+       } else {
+               weights = w;
+               weights_are_cast = 0;
+       }
+       //ERRORcheck(w->ttype != TYPE_dbl,
+       //                              "BATsample: type of weights must be doubles\n", NULL);//TODO types of w (want to remove this)
        //TODO: handle NULL values in w_ptr
 
        cnt = BATcount(b);
 
        sample = COLnew(0, TYPE_oid, n, TRANSIENT);
 
-       if(sample == NULL)
+       if(sample == NULL) {
+               if(weights_are_cast)//if weights where converted, delete converted BAT
+                       BBPunfix(weights->batCacheid);
                return NULL;
-       if(n == 0)
+       }
+       if(n == 0) {
+               if(weights_are_cast)
+                       BBPunfix(weights->batCacheid);
                return sample;
+       }
 
 
        keys = (dbl*) GDKmalloc(sizeof(dbl)*n);
        if(keys == NULL) {
+               if(weights_are_cast)
+                       BBPunfix(weights->batCacheid);
                BBPunfix(sample->batCacheid);
                return NULL;
        }
 
-
-
        oids = (oid *) Tloc(sample, 0);
-       w_ptr = (dbl*) Tloc(w, 0);
+       w_ptr = (dbl*) Tloc(weights, 0);
 
        if(!mt_rng) {
                mt_rng = mtwist_new();
@@ -303,13 +314,25 @@ BATweightedsample(BAT *b, BUN n, BAT *w)
        for(j=0; i < n && j < cnt; j++) {
                if(w_ptr[j] == 0.0)
                        continue;
+               if(w_ptr[j] < 0.0) {
+                       BBPunfix(sample->batCacheid);
+                       GDKfree(keys);
+                       if(weights_are_cast)
+                               BBPunfix(weights->batCacheid);
+                       GDKerror("BATsample: w contains negative weights\n");
+                       return NULL;
+               }
                oids[i] = (oid)(j+minoid);
                keys[i] = pow(mtwist_drand(mt_rng),1.0/w_ptr[j]);//TODO cast 1.0 to dbl?
                i++;
        }
-       if(i < n) {/* not enough non-zero weights: cannot take sample */
+
+       if(i < n) {
                BBPunfix(sample->batCacheid);
                GDKfree(keys);
+               if(weights_are_cast)
+                       BBPunfix(weights->batCacheid);
+               GDKerror("BATsample: sample size bigger than number of non-zero weights\n");
                return NULL;
        }
 
@@ -339,6 +362,8 @@ BATweightedsample(BAT *b, BUN n, BAT *w)
        }
 
        GDKfree(keys);
+       if(weights_are_cast)
+               BBPunfix(weights->batCacheid);
 
        sample->trevsorted = sample->batCount <= 1;
        sample->tsorted = sample->batCount <= 1;
@@ -352,27 +377,3 @@ BATweightedsample(BAT *b, BUN n, BAT *w)
 
 
 
-/* BATweightedbitbat creates a bit BAT of length cnt containing n 1s and cnt-n 0s */
-/* Note that the type of w should be castable to doubles */
-/*BAT *
-BATweightedbitbat(BUN cnt, BUN n, BAT *w)
-{
-       BAT* res;
-       res = COLnew(0, TYPE_dbl, cnt, TRANSIENT);
-       BATsetcount(res, cnt);
-       
-       //Need to adjust _BATsample so it will return a bit BAT with bools denoting if element is selected
-       //Now it will rather return a subset
-       //TODO rewrite _BATsample to support this, add call to _BATsample
-       //Why did we choose for this UDF notation?
-       //+ easier to implement (no parsing addition)
-       //- slow
-       //- actually yields uglier code
-       //Why implement something like that? Hence we should choose for the other notation?
-       
-       
-       return res;
-}
-*/
-
-
diff --git a/monetdb5/modules/mal/sample.mal b/monetdb5/modules/mal/sample.mal
--- a/monetdb5/modules/mal/sample.mal
+++ b/monetdb5/modules/mal/sample.mal
@@ -20,31 +20,14 @@ command subuniform(b:bat[:any],p:dbl):ba
 address SAMPLEuniform_dbl
 comment "Returns the oids of a uniform sample (without replacement) of size = (p x count(b)), where 0 <= p <= 1.0";
 
-command subweighted(b:bat[:any],s:lng,w:bat[:dbl]):bat[:oid]
+command subweighted(b:bat[:any],s:lng,w:bat[:any]):bat[:oid]
 address SAMPLEweighted
 comment "Returns the oids of a weighted sample (without replacement) of size s";
 
-command subweighted(b:bat[:any],p:dbl,w:bat[:dbl]):bat[:oid]
+command subweighted(b:bat[:any],p:dbl,w:bat[:any]):bat[:oid]
 address SAMPLEweighted_dbl
 comment "Returns the oids of a weighted sample (without replacement) of size = (p x count(b)), where 0 <= p <= 1.0";
 
 
-command subuniform_bitbat(b:bat[:any],s:lng):bat[:bit]
-address SAMPLEuniform_bitbat
-comment "Returns the oids of a uniform sample (without replacement) of size s";
-
-command subuniform_bitbat(b:bat[:any],p:dbl):bat[:bit]
-address SAMPLEuniform_bitbat_dbl
-comment "Returns the oids of a uniform sample (without replacement) of size = (p x count(b)), where 0 <= p <= 1.0";
-
-command subweighted_bitbat(b:bat[:any],s:lng,w:bat[:dbl]):bat[:bit]
-address SAMPLEweighted_bitbat
-comment "Returns the oids of a weighted sample (without replacement) of size s";
-
-command subweighted_bitbat(b:bat[:any],p:dbl,w:bat[:dbl]):bat[:bit]
-address SAMPLEweighted_bitbat_dbl
-comment "Returns the oids of a weighted sample (without replacement) of size = (p x count(b)), where 0 <= p <= 1.0";
-
-
 
 
_______________________________________________
checkin-list mailing list
checkin-list@monetdb.org
https://www.monetdb.org/mailman/listinfo/checkin-list

Reply via email to