Changeset: 3468b2d208f3 for MonetDB
URL: http://dev.monetdb.org/hg/MonetDB?cmd=changeset;node=3468b2d208f3
Modified Files:
gdk/gdk_sample.c
monetdb5/modules/mal/sample.mal
Branch: stratified_sampling
Log Message:
cast non-double weights for weighted sampling
diffs (179 lines):
diff --git a/gdk/gdk_sample.c b/gdk/gdk_sample.c
--- a/gdk/gdk_sample.c
+++ b/gdk/gdk_sample.c
@@ -143,12 +143,6 @@ OIDTreeToBATAntiset(struct oidtreenode *
((oid *) bat->theap.base)[bat->batCount++] = noid;
}
-/* inorder traversal, gives us a bit BAT */
-/*BAT *bat OIDTreeToBITBAT(struct oidtreenode)
-{
- //TODO create this function
-}*/
-
/* BATsample takes uniform samples of void headed BATs */
BAT *
@@ -248,6 +242,8 @@ BATsample(BAT *b, BUN n)
BAT *
BATweightedsample(BAT *b, BUN n, BAT *w)
{//TODO test correctness extensively
+ BAT* weights;
+ bit weights_are_cast;
BAT* sample;
oid* oids;/* points to the oids in sample */
dbl* w_ptr;//TODO types of w
@@ -264,30 +260,45 @@ BATweightedsample(BAT *b, BUN n, BAT *w)
ERRORcheck(w->ttype == TYPE_str || w->ttype == TYPE_void,
"BATsample: type of weights not
castable to doubles\n", NULL);
- ERRORcheck(w->ttype != TYPE_dbl,
- "BATsample: type of weights must be
doubles\n", NULL);//TODO types of w (want to remove this)
+
+ if(w->ttype != TYPE_dbl) {
+ weights = BATconvert(w, NULL, TYPE_dbl, 0);
+ ERRORcheck(weights == NULL, "BATsample: could not cast weights
to doubles\n", NULL);
+ weights_are_cast = 1;
+ } else {
+ weights = w;
+ weights_are_cast = 0;
+ }
+ //ERRORcheck(w->ttype != TYPE_dbl,
+ // "BATsample: type of weights must be doubles\n", NULL);//TODO types of w (want to remove this)
//TODO: handle NULL values in w_ptr
cnt = BATcount(b);
sample = COLnew(0, TYPE_oid, n, TRANSIENT);
- if(sample == NULL)
+ if(sample == NULL) {
+ if(weights_are_cast)//if weights were converted, delete converted BAT
+ BBPunfix(weights->batCacheid);
return NULL;
- if(n == 0)
+ }
+ if(n == 0) {
+ if(weights_are_cast)
+ BBPunfix(weights->batCacheid);
return sample;
+ }
keys = (dbl*) GDKmalloc(sizeof(dbl)*n);
if(keys == NULL) {
+ if(weights_are_cast)
+ BBPunfix(weights->batCacheid);
BBPunfix(sample->batCacheid);
return NULL;
}
-
-
oids = (oid *) Tloc(sample, 0);
- w_ptr = (dbl*) Tloc(w, 0);
+ w_ptr = (dbl*) Tloc(weights, 0);
if(!mt_rng) {
mt_rng = mtwist_new();
@@ -303,13 +314,25 @@ BATweightedsample(BAT *b, BUN n, BAT *w)
for(j=0; i < n && j < cnt; j++) {
if(w_ptr[j] == 0.0)
continue;
+ if(w_ptr[j] < 0.0) {
+ BBPunfix(sample->batCacheid);
+ GDKfree(keys);
+ if(weights_are_cast)
+ BBPunfix(weights->batCacheid);
+ GDKerror("BATsample: w contains negative weights\n");
+ return NULL;
+ }
oids[i] = (oid)(j+minoid);
keys[i] = pow(mtwist_drand(mt_rng),1.0/w_ptr[j]);//TODO cast 1.0 to dbl?
i++;
}
- if(i < n) {/* not enough non-zero weights: cannot take sample */
+
+ if(i < n) {
BBPunfix(sample->batCacheid);
GDKfree(keys);
+ if(weights_are_cast)
+ BBPunfix(weights->batCacheid);
+ GDKerror("BATsample: sample size bigger than number of non-zero
weights\n");
return NULL;
}
@@ -339,6 +362,8 @@ BATweightedsample(BAT *b, BUN n, BAT *w)
}
GDKfree(keys);
+ if(weights_are_cast)
+ BBPunfix(weights->batCacheid);
sample->trevsorted = sample->batCount <= 1;
sample->tsorted = sample->batCount <= 1;
@@ -352,27 +377,3 @@ BATweightedsample(BAT *b, BUN n, BAT *w)
-/* BATweightedbitbat creates a bit BAT of length cnt containing n 1s and cnt-n 0s */
-/* Note that the type of w should be castable to doubles */
-/*BAT *
-BATweightedbitbat(BUN cnt, BUN n, BAT *w)
-{
- BAT* res;
- res = COLnew(0, TYPE_dbl, cnt, TRANSIENT);
- BATsetcount(res, cnt);
-
- //Need to adjust _BATsample so it will return a bit BAT with bools denoting if element is selected
- //Now it will rather return a subset
- //TODO rewrite _BATsample to support this, add call to _BATsample
- //Why did we choose for this UDF notation?
- //+ easier to implement (no parsing addition)
- //- slow
- //- actually yields uglier code
- //Why implement something like that? Hence we should choose for the other notation?
-
-
- return res;
-}
-*/
-
-
diff --git a/monetdb5/modules/mal/sample.mal b/monetdb5/modules/mal/sample.mal
--- a/monetdb5/modules/mal/sample.mal
+++ b/monetdb5/modules/mal/sample.mal
@@ -20,31 +20,14 @@ command subuniform(b:bat[:any],p:dbl):ba
address SAMPLEuniform_dbl
comment "Returns the oids of a uniform sample (without replacement) of size =
(p x count(b)), where 0 <= p <= 1.0";
-command subweighted(b:bat[:any],s:lng,w:bat[:dbl]):bat[:oid]
+command subweighted(b:bat[:any],s:lng,w:bat[:any]):bat[:oid]
address SAMPLEweighted
comment "Returns the oids of a weighted sample (without replacement) of size
s";
-command subweighted(b:bat[:any],p:dbl,w:bat[:dbl]):bat[:oid]
+command subweighted(b:bat[:any],p:dbl,w:bat[:any]):bat[:oid]
address SAMPLEweighted_dbl
comment "Returns the oids of a weighted sample (without replacement) of size =
(p x count(b)), where 0 <= p <= 1.0";
-command subuniform_bitbat(b:bat[:any],s:lng):bat[:bit]
-address SAMPLEuniform_bitbat
-comment "Returns the oids of a uniform sample (without replacement) of size s";
-
-command subuniform_bitbat(b:bat[:any],p:dbl):bat[:bit]
-address SAMPLEuniform_bitbat_dbl
-comment "Returns the oids of a uniform sample (without replacement) of size =
(p x count(b)), where 0 <= p <= 1.0";
-
-command subweighted_bitbat(b:bat[:any],s:lng,w:bat[:dbl]):bat[:bit]
-address SAMPLEweighted_bitbat
-comment "Returns the oids of a weighted sample (without replacement) of size
s";
-
-command subweighted_bitbat(b:bat[:any],p:dbl,w:bat[:dbl]):bat[:bit]
-address SAMPLEweighted_bitbat_dbl
-comment "Returns the oids of a weighted sample (without replacement) of size =
(p x count(b)), where 0 <= p <= 1.0";
-
-
_______________________________________________
checkin-list mailing list
[email protected]
https://www.monetdb.org/mailman/listinfo/checkin-list