Changeset: 6e2b2913b7d0 for MonetDB
URL: http://dev.monetdb.org/hg/MonetDB?cmd=changeset;node=6e2b2913b7d0
Added Files:
        gdk/gdk_sample.c
Branch: default
Log Message:

add sample.c file


diffs (115 lines):

diff --git a/gdk/gdk_sample.c b/gdk/gdk_sample.c
new file mode 100644
--- /dev/null
+++ b/gdk/gdk_sample.c
@@ -0,0 +1,110 @@
+/*
+ * The contents of this file are subject to the MonetDB Public License
+ * Version 1.1 (the "License"); you may not use this file except in
+ * compliance with the License. You may obtain a copy of the License at
+ * http://www.monetdb.org/Legal/MonetDBLicense
+ *
+ * Software distributed under the License is distributed on an "AS IS"
+ * basis, WITHOUT WARRANTY OF ANY KIND, either express or implied. See the
+ * License for the specific language governing rights and limitations
+ * under the License.
+ *
+ * The Original Code is the MonetDB Database System.
+ *
+ * The Initial Developer of the Original Code is CWI.
+ * Portions created by CWI are Copyright (C) 1997-July 2008 CWI.
+ * Copyright August 2008-2012 MonetDB B.V.
+ * All Rights Reserved.
+ */
+
+/*
+ * @f gdk_sample
+ * @a Lefteris Sidirourgos
+ * @* Low level sample facilities
+ *
+ */
+
+#include "monetdb_config.h"
+#include "gdk.h"
+#include "gdk_private.h"
+
+#define DRAND ((double)rand()/(double)RAND_MAX)
+
+/*
+ * @+ Uniform Sampling.
+ *
+ * The implementation of the uniform sampling is based on the algorithm A as
+ * described in the paper "Faster Methods for Random Sampling" by Jeffrey Scott
+ * Vitter. Algorithm A is not the fastest one, but it only makes s calls in
+ * function random() and it is simpler than the other more complex and CPU
+ * intensive algorithms in the literature.
+ *
+ * Algorithm A instead of performing one random experiment for each row to
+ * decide if it should be included in the sample or not, it skips S rows
+ * and includes the S+1 row. The algorithm scans the input relation
+ * sequentially and maintains the unique and sort properties. The sample is
+ * without replacement.
+ */
+
+BAT *
+BATsample1(BAT *b, BUN n)
+{
+       BAT *bn;
+       BUN cnt;
+
+       BATcheck(b, "BATsample");
+       ERRORcheck(n > BUN_MAX, "BATsample: sample size larger than BUN_MAX\n");
+       ALGODEBUG THRprintf(GDKout, "#BATsample: sample " BUNFMT " 
elements.\n", n);
+
+       cnt = BATcount(b);
+       if (cnt <= n) {
+               bn = BATcopy(b, b->htype, b->ttype, FALSE);
+       } else {
+               BUN top = cnt - n;
+               BUN smp = n;
+               BATiter iter = bat_iterator(b);
+               BUN p = BUNfirst(b)-1;
+               bn = BATnew(
+                       b->htype==TYPE_void && 
b->hseqbase!=oid_nil?TYPE_oid:b->htype,
+                       b->ttype==TYPE_void && 
b->tseqbase!=oid_nil?TYPE_oid:b->ttype,
+                       n);
+               if (bn == NULL)
+                       return NULL;
+               if (n == 0)
+                       return bn;
+               while (smp-->1) { /* loop until all but 1 values are sampled */
+                       double v = DRAND;
+                       double quot = (double)top/(double)cnt;
+                       BUN jump = 0;
+                       while (quot > v) { /* determine how many positions to 
jump */
+                               jump++;
+                               top--;
+                               cnt--;
+                               quot *= (double)top/(double)cnt;
+                       }
+                       p += (jump+1);
+                       cnt--;
+                       bunfastins(bn, BUNhead(iter, p), BUNtail(iter,p));
+               }
+               /* 1 left */
+               p += (BUN) rand() % cnt;
+               bunfastins(bn, BUNhead(iter, p+1), BUNtail(iter,p+1));
+
+               /* property management */
+               bn->tsorted = BATtordered(b);
+               bn->hsorted = BAThordered(b);
+               bn->hdense = FALSE;
+               bn->tdense = FALSE;
+               BATkey(bn, BAThkey(b));
+               BATkey(BATmirror(bn), BATtkey(b));
+               bn->H->nonil = b->H->nonil;
+               bn->T->nonil = b->T->nonil;
+               BATsetcount(bn, n);
+       }
+
+       return bn;
+
+bunins_failed:
+       BBPreclaim(bn);
+       return NULL;
+}
_______________________________________________
Checkin-list mailing list
[email protected]
http://mail.monetdb.org/mailman/listinfo/checkin-list

Reply via email to