Changeset: db953c1a74be for MonetDB
URL: http://dev.monetdb.org/hg/MonetDB?cmd=changeset;node=db953c1a74be
Modified Files:
gdk/Makefile.ag
gdk/gdk.mx
monetdb5/modules/mal/sample.c
monetdb5/modules/mal/sample.h
Branch: default
Log Message:
Moving the sample facilities to gdk.
BATsample already exists in gdk, but it is a slower, pseudo-uniform algorithm.
The new gdk_sample.c file now contains a BATsample1 function. It will replace
the previous BATsample function in time (once enough testing is done). The
monetdb5 signatures and MAL calls remain the same.
diffs (173 lines):
diff --git a/gdk/Makefile.ag b/gdk/Makefile.ag
--- a/gdk/Makefile.ag
+++ b/gdk/Makefile.ag
@@ -40,7 +40,7 @@ lib_gdk = {
gdk_qsort.mx gdk_ssort.mx gdk_storage.c gdk_bat.mx \
gdk_delta.c gdk_relop.mx gdk_system.c gdk_value.mx \
gdk_rangejoin.mx \
- gdk_posix.c gdk_logger.c \
+ gdk_posix.c gdk_logger.c gdk_sample.c \
gdk_private.h gdk_delta.h gdk_logger.h gdk_posix.h \
gdk_system.h gdk_tm.h gdk_storage.h \
gdk_calc.c \
diff --git a/gdk/gdk.mx b/gdk/gdk.mx
--- a/gdk/gdk.mx
+++ b/gdk/gdk.mx
@@ -3261,6 +3261,7 @@ gdk_export BAT *BATkunion(BAT *b, BAT *c
gdk_export BAT *BATsdiff(BAT *b, BAT *c);
gdk_export BAT *BATkdiff(BAT *b, BAT *c);
+<<<<<<< /ufs/lsidir/develop/monet/current/MonetDB/gdk/gdk.mx
gdk_export BAT *BATcalcnegate(BAT *b, int accum);
gdk_export BAT *BATcalcabsolute(BAT *b, int accum);
gdk_export BAT *BATcalcincr(BAT *b, int accum, int abort_on_error);
@@ -3320,7 +3321,19 @@ gdk_export BAT *BATcalccstne(const ValRe
gdk_export BAT *BATcalccmp(BAT *b1, BAT *b2);
gdk_export BAT *BATcalccmpcst(BAT *b, const ValRecord *v);
gdk_export BAT *BATcalccstcmp(const ValRecord *v, BAT *b);
+=======
+/*
+ * @- BAT sample operators
+ *
+ * @multitable @columnfractions 0.08 0.7
+ * @item BAT *
+ * @tab BATsample (BAT *b, n)
+ * @end multitable
+ */
+gdk_export BAT *BATsample1(BAT *b, BUN n);
+>>>>>>> /tmp/gdk.mx~other.R3THkP
+<<<<<<< /ufs/lsidir/develop/monet/current/MonetDB/gdk/gdk.mx
gdk_export int VARcalcnot(ValPtr ret, const ValRecord *v);
gdk_export int VARcalcnegate(ValPtr ret, const ValRecord *v);
gdk_export int VARcalcabsolute(ValPtr ret, const ValRecord *v);
@@ -3352,11 +3365,16 @@ gdk_export BAT *BATcalcbetweencstcst(BAT
gdk_export BAT *BATcalcbetweenbatcst(BAT *b, BAT *lo, const ValRecord *hi);
gdk_export BAT *BATcalcbetweencstbat(BAT *b, const ValRecord *lo, BAT *hi);
gdk_export int VARcalcbetween(ValPtr ret, const ValRecord *v, const ValRecord
*lo, const ValRecord *hi);
+=======
+>>>>>>> /tmp/gdk.mx~other.R3THkP
+<<<<<<< /ufs/lsidir/develop/monet/current/MonetDB/gdk/gdk.mx
gdk_export BAT *BATconvert(BAT *b, int tp, int abort_on_error);
gdk_export int VARconvert(ValPtr ret, const ValRecord *v, int abort_on_error);
gdk_export int BATcalcavg(BAT *b, dbl *avg, BUN *vals);
+=======
+>>>>>>> /tmp/gdk.mx~other.R3THkP
/* generic n-ary multijoin beast, with defines to interpret retval */
#define MULTIJOIN_SORTED(r) ((char*) &r)[0]
#define MULTIJOIN_KEY(r) ((char*) &r)[1]
diff --git a/monetdb5/modules/mal/sample.c b/monetdb5/modules/mal/sample.c
--- a/monetdb5/modules/mal/sample.c
+++ b/monetdb5/modules/mal/sample.c
@@ -76,79 +76,23 @@
*
* INSERT INTO sample_table (SELECT * FROM mysample());
*
- * The implementation of the uniform sampling is based on the algorithm A as
- * described in the paper "Faster Methods for Random Sampling" by Jeffrey Scott
- * Vitter. Algorithm A is not the fastest one, but it only makes s calls in
- * function random() and it is simpler than the other more complex and CPU
- * intensive algorithms in the literature.
- *
- * Algorithm A instead of performing one random experiment for each row to
- * decide if it should be included in the sample or not, it skips S rows
- * and includes the S+1 row. The algorithm scans the input relation
- * sequentially and maintains the unique and sort properties. The sample is
- * without replacement.
*/
sample_export str
SAMPLEuniform(bat *r, bat *b, ptr s) {
BAT *br, *bb;
- BUN p, sz = *(wrd *)s, top, N, n, jump;
- BATiter iter;
- double v, quot;
if ((bb = BATdescriptor(*b)) == NULL) {
throw(MAL, "sample.uniform", INTERNAL_BAT_ACCESS);
}
-
- N = BATcount(bb);
- if (sz > N) { /* if the sample is bigger than the input relation */
- BBPkeepref(*r = bb->batCacheid);
- return MAL_SUCCEED;
- }
-
- if ((br = BATnew(TYPE_oid, bb->ttype, sz)) == NULL) {
- BBPunfix(bb->batCacheid);
- throw(MAL, "sample.uniform", MAL_MALLOC_FAIL);
- }
-
- n = sz;
- top = N-n;
- p = BUNfirst(bb)-1;
- iter = bat_iterator(bb);
- while (n-->1) { /* loop until only 1 free spot is left for the sample */
- v = DRAND;
- jump = 0;
- quot = (double)top/(double)N;
- while (quot > v) { /* determine how many positions to jump */
- jump++;
- top--;
- N--;
- quot *= (double)top/(double)N;
- }
- p += (jump+1);
- N--;
- bunfastins(br, BUNhead(iter, p), BUNtail(iter,p));
- }
- /* 1 row left to be added in the sample */
- p += (BUN) rand() % N;
- bunfastins(br, BUNhead(iter, p+1), BUNtail(iter,p+1));
-
- br->tsorted = bb->tsorted;
- br->hsorted = bb->hsorted;
- br->tkey = bb->tkey;
- br->hkey = bb->hkey;
- br->hdense = FALSE;
- BATseqbase(br, bb->hseqbase);
- BATsetcount(br, sz);
+ br = BATsample1(bb,*(BUN *)s);
+ if (br == NULL)
+ throw(MAL, "sample.uniform", OPERATION_FAILED);
BBPunfix(bb->batCacheid);
BBPkeepref(*r = br->batCacheid);
return MAL_SUCCEED;
- bunins_failed:
- BBPunfix(bb->batCacheid);
- BBPunfix(br->batCacheid);
- throw(MAL, "sample.uniform", OPERATION_FAILED "bunfastins");
}
sample_export str
@@ -158,12 +102,12 @@ SAMPLEuniform_dbl(bat *r, bat *b, ptr p)
wrd s;
if ( pr < 0.0 || pr > 1.0 ) {
- throw(MAL, "sample.uniform", ILLEGAL_ARGUMENT " p should be
between 0 and 1.0" );
+ throw(MAL, "sample.uniform", ILLEGAL_ARGUMENT
+ " p should be between 0 and 1.0" );
} else if (pr == 0) {/* special case */
s = 0;
return SAMPLEuniform(r, b, (ptr)&s);
}
-
if ((bb = BATdescriptor(*b)) == NULL) {
throw(MAL, "sample.uniform", INTERNAL_BAT_ACCESS);
}
diff --git a/monetdb5/modules/mal/sample.h b/monetdb5/modules/mal/sample.h
--- a/monetdb5/modules/mal/sample.h
+++ b/monetdb5/modules/mal/sample.h
@@ -38,8 +38,6 @@
#define sample_export extern
#endif
-#define DRAND ((double)rand()/(double)RAND_MAX)
-
sample_export str
SAMPLEuniform(bat *r, bat *b, ptr s);
_______________________________________________
Checkin-list mailing list
[email protected]
http://mail.monetdb.org/mailman/listinfo/checkin-list