Changeset: bb51b8eece7c for MonetDB
URL: http://dev.monetdb.org/hg/MonetDB?cmd=changeset;node=bb51b8eece7c
Modified Files:
gdk/gdk_sample.c
Branch: stratified_sampling
Log Message:
write reservoir sampling
diffs (140 lines):
diff --git a/gdk/gdk_sample.c b/gdk/gdk_sample.c
--- a/gdk/gdk_sample.c
+++ b/gdk/gdk_sample.c
@@ -32,6 +32,49 @@
#undef BATsample
+//TODO share these with gkd_firstn.c
+#define siftup(OPER, START, SWAP) \
+ do { \
+ pos = (START); \
+ childpos = (pos << 1) + 1; \
+ while (childpos < n) { \
+ /* find most extreme child */ \
+ if (childpos + 1 < n && \
+ !(OPER(childpos + 1, childpos))) \
+ childpos++; \
+ /* compare parent with most extreme child */ \
+ if (!OPER(pos, childpos)) { \
+ /* already correctly ordered */ \
+ break; \
+ } \
+ /* exchange parent with child and sift child */ \
+ /* further */ \
+ SWAP(pos, childpos); \
+ pos = childpos; \
+ childpos = (pos << 1) + 1; \
+ } \
+ } while (0)
+
+#define heapify(OPER, SWAP) \
+ do { \
+ for (i = n / 2; i > 0; i--) \
+ siftup(OPER, i - 1, SWAP); \
+ } while (0)
+
+#define SWAP3(p1, p2) \
+ do { \
+ item = oids[p1]; \
+ oids[p1] = oids[p2]; \
+ oids[p2] = item; \
+ key = keys[p1]; \
+ keys[p1] = keys[p2]; \
+ keys[p2] = key; \
+ } while (0)
+
+#define compKeysGT(p1, p2)
\
+ (keys[p1] > keys[p2] ||
\
+ keys[p1] == keys[p2] && oids[p1] > oids[p2])
+
/* this is a straightforward implementation of a binary tree */
struct oidtreenode {
oid o;
@@ -194,14 +237,23 @@ BATsample(BAT *b, BUN n)
/* BATweightedsample takes weighted samples of void headed BATs */
/* Note that the type of w should be castable to doubles */
+/* based on Alg-A-exp from 'Weighted random sampling with a reservoir' by
Efraimidis and Spirakis (2006) */
BAT *
BATweightedsample(BAT *b, BUN n, BAT *w)
{
BAT* sample;
+ oid* oids;//points to the oids in sample
dbl* w_ptr;//TODO types of w
- dbl* cdf_ptr;
- BUN cnt, i;
+ dbl* keys;//keys as defined in Alg-A-exp
+ BUN cnt, i, j;
bit antiset;
+ mtwist *mt_rng;
+ BUN pos, childpos;
+ oid item;
+ dbl r, xw, r2, key, tw;
+
+ oid minoid = b->hseqbase;
+ oid maxoid = b->hseqbase + cnt;
BATcheck(b, "BATsample", NULL);
BATcheck(w, "BATsample", NULL);
@@ -213,10 +265,57 @@ BATweightedsample(BAT *b, BUN n, BAT *w)
cnt = BATcount(b);
- antiset = n > cnt / 2;
- /* obtain sample */
- sample = NULL;//TODO: reservoir sampling with exponential jumps
+ keys = (double*) malloc(sizeof(double)*n);
+ if(keys == NULL)
+ return NULL;
+
+ sample = COLnew(0, TYPE_oid, n, TRANSIENT);
+ if(sample == NULL) {
+ free(keys);
+ return NULL;
+ }
+
+ oids = (oid *)sample->theap.base;
+
+ mt_rng = mtwist_new();
+ mtwist_seed(mt_rng, rand());
+
+ BATsetcount(sample, n);
+ /* obtain sample */
+ //TODO: reservoir sampling with exponential jumps
+ for(j=0; j<n; j++) {
+ oids[j] = j+minoid;
+ keys[j] = pow(mtwist_drand(mt_rng),1.0/w[j]);
+ }
+ heapify(compKeysGT, SWAP3);//NOTE: writes to 'i'
+
+ j=n;
+ while(true) {
+ r = mtwist_drand(mt_rng);
+ xw = log(r)/log(keys[oids[0]-minoid]);
+ while(xw > 0 && j < cnt) {
+ xw -= w[j];
+ j++;
+ }
+ if(j >= cnt) break;
+ tw = pow(keys[oids[0]-minoid], w[j]);
+ r2 = mtwist_drand(mt_rng)*(1-tw)+tw;
+ key = pow(r2, 1/w[j]);
+
+ oids[0] = j+minoid;
+ keys[0] = key;
+ siftup(compKeysGT, 0, SWAP3);//NOTE: writes to 'key'
+ }
+
+ free(keys);
+
+ sample->trevsorted = sample->batCount <= 1;
+ sample->tsorted = sample->batCount <= 1;
+ sample->tkey = 1;
+ sample->tdense = sample->batCount <= 1;
+ if (sample->batCount == 1)
+ sample->tseqbase = *(oid *) Tloc(sample, 0);
return sample;
}
_______________________________________________
checkin-list mailing list
[email protected]
https://www.monetdb.org/mailman/listinfo/checkin-list