Changeset: f33ea08015bb for MonetDB
URL: http://dev.monetdb.org/hg/MonetDB?cmd=changeset;node=f33ea08015bb
Modified Files:
monetdb5/modules/mal/Makefile.ag
monetdb5/modules/mal/mal_init.mal
monetdb5/modules/mal/sample.c
monetdb5/modules/mal/sample.c.bak
monetdb5/modules/mal/sample.h
monetdb5/modules/mal/sample.mal
Branch: default
Log Message:
The sample module for the monetdb5 server.
diffs (truncated from 440 to 300 lines):
diff --git a/monetdb5/modules/mal/Makefile.ag b/monetdb5/modules/mal/Makefile.ag
--- a/monetdb5/modules/mal/Makefile.ag
+++ b/monetdb5/modules/mal/Makefile.ag
@@ -64,7 +64,8 @@ lib_mal = {
transaction.c \
txtsim.c txtsim.h \
urlbox.mx \
- zorder.mx
+ zorder.mx \
+ sample.c sample.h
}
headers_mal = {
@@ -79,9 +80,9 @@ headers_mal = {
mal_mapi.mx sabaoth.mal remote.mal \
txtsim.mal recycle.mal \
cluster.mx trader.mal pma.mx \
- tokenizer.mx zorder.mx
+ tokenizer.mx zorder.mx sample.mal
}
-EXTRA_DIST = prelude.mx algebraExtensions.mal attach.mal batExtensions.mal
chopper.mal constraints.mal groupby.mal histogram.mal mal_init.mal manual.mal
mkey.mal pcre.mal profiler.mal recycle.mal remote.mal sabaoth.mal trader.mal
transaction.mal txtsim.mal tablet.mal tablet.h
+EXTRA_DIST = prelude.mx algebraExtensions.mal attach.mal batExtensions.mal
chopper.mal constraints.mal groupby.mal histogram.mal mal_init.mal manual.mal
mkey.mal pcre.mal profiler.mal recycle.mal remote.mal sabaoth.mal trader.mal
transaction.mal txtsim.mal tablet.mal tablet.h sample.mal
EXTRA_DIST_DIR = Tests
diff --git a/monetdb5/modules/mal/mal_init.mal
b/monetdb5/modules/mal/mal_init.mal
--- a/monetdb5/modules/mal/mal_init.mal
+++ b/monetdb5/modules/mal/mal_init.mal
@@ -104,6 +104,7 @@ include mal_mapi;
include profiler;
include statistics; # experimental
+include sample;
include optimizer;
include opt_support;
diff --git a/monetdb5/modules/mal/sample.c b/monetdb5/modules/mal/sample.c
new file mode 100644
--- /dev/null
+++ b/monetdb5/modules/mal/sample.c
@@ -0,0 +1,155 @@
+/*
+ * The contents of this file are subject to the MonetDB Public License
+ * Version 1.1 (the "License"); you may not use this file except in
+ * compliance with the License. You may obtain a copy of the License at
+ * http://www.monetdb.org/Legal/MonetDBLicense
+ *
+ * Software distributed under the License is distributed on an "AS IS"
+ * basis, WITHOUT WARRANTY OF ANY KIND, either express or implied. See the
+ * License for the specific language governing rights and limitations
+ * under the License.
+ *
+ * The Original Code is the MonetDB Database System.
+ *
+ * The Initial Developer of the Original Code is CWI.
+ * Portions created by CWI are Copyright (C) 1997-July 2008 CWI.
+ * Copyright August 2008-2011 MonetDB B.V.
+ * All Rights Reserved.
+ */
+
+/*
+ * @a Lefteris Sidirourgos
+ * @d 30/08/2011
+ * @+ The sampling facilities
+ *
+ * In the context of the SciBORQ project, we introduce a number of sampling
+ * techniques in the MonetDB software stack. Our goal is to provide methods
+ * for performing sampling (uniform and weighted) over a) the result of a
+ * query, b) the base tables, and c) the entire database schema. Sampling
+ * can be performed during query execution, as well as during data loading in
+ * the case of predefined sampling indexes. In addition to the sampling
+ * methods, a number of query plan optimisations for sampling are introduced on
+ * the SQL and MAL level.
+ *
+ * Besides the sampling methods, SciBORQ also aims at multi-layered bounded
+ * query execution. That is steering query execution over many layers of
+ * samples with different size in order to achieve either strict error bounds
+ * or limited execution time. For more details see the SciBORQ module.
+ *
+ * In the following, details are presented on the implementation and the usage
+ * of each sampling method.
+ */
+
+#include "monetdb_config.h"
+#include <gdk.h>
+#include <mal_exception.h>
+#include "sample.h"
+
+/*
+ * @- Uniform Sampling.
+ *
+ * A new SQL operator has been added to support sampling the result of a query.
+ * The syntax for sampling is:
+ * SELECT ... FROM ... WHERE ... SAMPLE s
+ *
+ * where s if is an integer greater than 1, it defines the number of rows to be
+ * in the sample. If s is a double between [0.0,1.0] the it refers to the
+ * percentage of the result to be sampled. That is if s=0.3 then the sample
+ * will be 30% the size of the query result.
+ *
+ * SAMPLE is been treated as LIMIT, ORDER BY, etc., that means that it can only
+ * be in the outer most SELECT clause, i.e., SAMPLE cannot appear in a
+ * subquery. However, if this is needed, then one may define a function, for
+ * example
+ *
+ * CREATE FUNCTION mysample ()
+ * RETURNS TABLE(col a,...)
+ * BEGIN
+ * RETURN
+ * SELECT a,...
+ * FROM name_table
+ * SAMPLE 100;
+ * end;
+ *
+ * and then use function mysample() for example to populate a new table with
+ * the sample. E.g.,
+ *
+ * INSERT INTO sample_table (SELECT * FROM mysample());
+ *
+ * The implementation of the uniform sampling is based on the algorithm A as
+ * described in the paper "Faster Methods for Random Sampling" by Jeffrey Scott
+ * Vitter. Algorithm A is not the fastest one, but it only makes s calls in
+ * function random() and it is simpler than the other more complex and CPU
+ * intensive algorithms in the literature.
+ *
+ * Algorithm A instead of performing one random experiment for each row to
+ * decide if it should be included in the sample or not, it skips S rows
+ * and includes the S+1 row. The algorithm scans the input relation
+ * sequentially and maintains the unique and sort properties. The sample is
+ * without replacement.
+ */
+
+sample_export str
+SAMPLEuniform(bat *r, bat *b, ptr s) {
+ BAT *br, *bb;
+ BUN p, sz = *(wrd *)s, top, N, n, jump;
+ BATiter iter;
+ double v, quot;
+
+ if ((bb = BATdescriptor(*b)) == NULL) {
+ throw(MAL, "sample.uniform", INTERNAL_BAT_ACCESS);
+ }
+
+ N = BATcount(bb);
+ if (sz > N) { /* if the sample is bigger than the input relation */
+ BBPkeepref(*r = bb->batCacheid);
+ return MAL_SUCCEED;
+ }
+
+ if ((br = BATnew(TYPE_oid, bb->ttype, sz)) == NULL) {
+ BBPunfix(bb->batCacheid);
+ throw(MAL, "sample.uniform", MAL_MALLOC_FAIL);
+ }
+
+ n = sz;
+ top = N-n;
+ p = BUNfirst(bb)-1;
+ iter = bat_iterator(bb);
+ while (n-->1) { /* loop until only 1 free spot is left for the sample */
+ v = DRAND;
+ jump = 0;
+ quot = (double)top/(double)N;
+ while (quot > v) { /* determine how many positions to jump */
+ jump++;
+ top--;
+ N--;
+ quot *= (double)top/(double)N;
+ }
+ p += (jump+1);
+ N--;
+ bunfastins(br, BUNhead(iter, p), BUNtail(iter,p));
+ }
+ /* 1 row left to be added in the sample */
+ p += (BUN) rand() % N;
+ bunfastins(br, BUNhead(iter, p+1), BUNtail(iter,p+1));
+
+ br->tsorted = bb->tsorted;
+ br->hsorted = bb->hsorted;
+ br->tkey = bb->tkey;
+ br->hkey = bb->hkey;
+ br->hdense = FALSE;
+ BATseqbase(br, bb->hseqbase);
+ BATsetcount(br, sz);
+
+ BATprint(br);
+
+ BBPunfix(bb->batCacheid);
+ BBPkeepref(*r = br->batCacheid);
+ return MAL_SUCCEED;
+
+ bunins_failed:
+ BBPunfix(bb->batCacheid);
+ BBPunfix(br->batCacheid);
+ throw(MAL, "sample.uniform", OPERATION_FAILED "bunfastins");
+}
+
diff --git a/monetdb5/modules/mal/sample.c.bak
b/monetdb5/modules/mal/sample.c.bak
new file mode 100644
--- /dev/null
+++ b/monetdb5/modules/mal/sample.c.bak
@@ -0,0 +1,155 @@
+/*
+ * The contents of this file are subject to the MonetDB Public License
+ * Version 1.1 (the "License"); you may not use this file except in
+ * compliance with the License. You may obtain a copy of the License at
+ * http://www.monetdb.org/Legal/MonetDBLicense
+ *
+ * Software distributed under the License is distributed on an "AS IS"
+ * basis, WITHOUT WARRANTY OF ANY KIND, either express or implied. See the
+ * License for the specific language governing rights and limitations
+ * under the License.
+ *
+ * The Original Code is the MonetDB Database System.
+ *
+ * The Initial Developer of the Original Code is CWI.
+ * Portions created by CWI are Copyright (C) 1997-July 2008 CWI.
+ * Copyright August 2008-2011 MonetDB B.V.
+ * All Rights Reserved.
+ */
+
+/*
+ * @a Lefteris Sidirourgos
+ * @d 30/08/2011
+ * @+ The sampling facilities
+ *
+ * In the context of the SciBORQ project, we introduce a number of sampling
+ * techniques in the MonetDB software stack. Our goal is to provide methods
+ * for performing sampling (uniform and weighted) over a) the result of a
+ * query, b) the base tables, and c) the entire database schema. Sampling
+ * can be performed during query execution, as well as during data loading in
+ * the case of predefined sampling indeces. In addition to the sampling
+ * methods, a number of query plan optimizations for sampling are introduced on
+ * the SQL and MAL level.
+ *
+ * Besides the sampling methods, SciBORQ also aims at multi-layered bounded
+ * query execution. That is steering query execution over many layers of
+ * samples with different size in order to achieve either strict error bounds
+ * or limited execution time. For more details see the SciBORQ module.
+ *
+ * In the following, details are presented on the implementation and the usage
+ * of each sampling method.
+ */
+
+#include "monetdb_config.h"
+#include <gdk.h>
+#include <mal_exception.h>
+#include "sample.h"
+
+/*
+ * @- Uniform Sampling.
+ *
+ * A new SQL operator has been added to support sampling the result of a query.
+ * The syntax for sampling is:
+ * SELECT ... FROM ... WHERE ... SAMPLE s
+ *
+ * where s if is an integer greater than 1, it defines the number of rows to be
+ * in the sample. If s is a double between [0.0,1.0] the it refers to the
+ * percentage of the result to be sampled. That is if s=0.3 then the sample
+ * will be 30% the size of the query result.
+ *
+ * SAMPLE is been treated as LIMIT, ORDER BY, etc., that means that it can only
+ * be in the outer most SELECT clause, i.e., SAMPLE cannot appear in a
+ * subquery. However, if this is needed, then one may define a function, for
+ * example
+ *
+ * CREATE FUNCTION mysample ()
+ * RETURNS TABLE(col a,...)
+ * BEGIN
+ * RETURN
+ * SELECT a,...
+ * FROM name_table
+ * SAMPLE 100;
+ * end;
+ *
+ * and then use function mysample() for example to populate a new table with
+ * the sample. E.g.,
+ *
+ * INSERT INTO sample_table (SELECT * FROM mysample());
+ *
+ * The implementation of the uniform sampling is based on the algorithm A as
+ * described in the paper "Faster Methods for Random Sampling" by Jeffrey Scott
+ * Vitter. Algorithm A is not the fastest one, but it only makes s calls in
+ * function random() and it is simpler than the other more complex and CPU
+ * intensive algorithms in the literature.
+ *
+ * Algorithm A instead of performing one random experiment for each row to
+ * decide if it should be included in the sample or not, it skips S rows
+ * and includes the S+1 row. The algorithm scans the input relation
+ * sequencially and maintains the unique and sort properties. The sample is
+ * without replacement.
+ */
+
+sample_export str
+SAMPLEuniform(bat *r, bat *b, ptr s) {
+ BAT *br, *bb;
+ BUN p, sz = *(wrd *)s, top, N, n, jump;
+ BATiter iter;
+ double v, quot;
+
+ if ((bb = BATdescriptor(*b)) == NULL) {
_______________________________________________
Checkin-list mailing list
[email protected]
http://mail.monetdb.org/mailman/listinfo/checkin-list