Changeset: db953c1a74be for MonetDB
URL: http://dev.monetdb.org/hg/MonetDB?cmd=changeset;node=db953c1a74be
Modified Files:
        gdk/Makefile.ag
        gdk/gdk.mx
        monetdb5/modules/mal/sample.c
        monetdb5/modules/mal/sample.h
Branch: default
Log Message:

Moving the sample facilities to gdk.

BATsample already exists in gdk, but it uses a slower, pseudo-uniform algorithm.
The new gdk_sample.c file now contains a BATsample1 method. It will replace
the previous BATsample method in time (once enough testing is done). The
monetdb5 signatures and MAL calls remain the same.


diffs (173 lines):

diff --git a/gdk/Makefile.ag b/gdk/Makefile.ag
--- a/gdk/Makefile.ag
+++ b/gdk/Makefile.ag
@@ -40,7 +40,7 @@ lib_gdk = {
                gdk_qsort.mx gdk_ssort.mx gdk_storage.c gdk_bat.mx \
                gdk_delta.c gdk_relop.mx gdk_system.c gdk_value.mx \
                gdk_rangejoin.mx \
-               gdk_posix.c gdk_logger.c \
+               gdk_posix.c gdk_logger.c gdk_sample.c \
                gdk_private.h gdk_delta.h gdk_logger.h gdk_posix.h \
                gdk_system.h gdk_tm.h gdk_storage.h \
                gdk_calc.c \
diff --git a/gdk/gdk.mx b/gdk/gdk.mx
--- a/gdk/gdk.mx
+++ b/gdk/gdk.mx
@@ -3261,6 +3261,7 @@ gdk_export BAT *BATkunion(BAT *b, BAT *c
 gdk_export BAT *BATsdiff(BAT *b, BAT *c);
 gdk_export BAT *BATkdiff(BAT *b, BAT *c);
 
+<<<<<<< /ufs/lsidir/develop/monet/current/MonetDB/gdk/gdk.mx
 gdk_export BAT *BATcalcnegate(BAT *b, int accum);
 gdk_export BAT *BATcalcabsolute(BAT *b, int accum);
 gdk_export BAT *BATcalcincr(BAT *b, int accum, int abort_on_error);
@@ -3320,7 +3321,19 @@ gdk_export BAT *BATcalccstne(const ValRe
 gdk_export BAT *BATcalccmp(BAT *b1, BAT *b2);
 gdk_export BAT *BATcalccmpcst(BAT *b, const ValRecord *v);
 gdk_export BAT *BATcalccstcmp(const ValRecord *v, BAT *b);
+=======
+/*
+ * @- BAT sample operators
+ *
+ * @multitable @columnfractions 0.08 0.7
+ * @item BAT *
+ * @tab BATsample (BAT *b, n)
+ * @end multitable
+ */
+gdk_export BAT *BATsample1(BAT *b, BUN n);
+>>>>>>> /tmp/gdk.mx~other.R3THkP
 
+<<<<<<< /ufs/lsidir/develop/monet/current/MonetDB/gdk/gdk.mx
 gdk_export int VARcalcnot(ValPtr ret, const ValRecord *v);
 gdk_export int VARcalcnegate(ValPtr ret, const ValRecord *v);
 gdk_export int VARcalcabsolute(ValPtr ret, const ValRecord *v);
@@ -3352,11 +3365,16 @@ gdk_export BAT *BATcalcbetweencstcst(BAT
 gdk_export BAT *BATcalcbetweenbatcst(BAT *b, BAT *lo, const ValRecord *hi);
 gdk_export BAT *BATcalcbetweencstbat(BAT *b, const ValRecord *lo, BAT *hi);
 gdk_export int VARcalcbetween(ValPtr ret, const ValRecord *v, const ValRecord 
*lo, const ValRecord *hi);
+=======
+>>>>>>> /tmp/gdk.mx~other.R3THkP
 
+<<<<<<< /ufs/lsidir/develop/monet/current/MonetDB/gdk/gdk.mx
 gdk_export BAT *BATconvert(BAT *b, int tp, int abort_on_error);
 gdk_export int VARconvert(ValPtr ret, const ValRecord *v, int abort_on_error);
 gdk_export int BATcalcavg(BAT *b, dbl *avg, BUN *vals);
 
+=======
+>>>>>>> /tmp/gdk.mx~other.R3THkP
 /* generic n-ary multijoin beast, with defines to interpret retval */
 #define MULTIJOIN_SORTED(r)    ((char*) &r)[0]
 #define MULTIJOIN_KEY(r)       ((char*) &r)[1]
diff --git a/monetdb5/modules/mal/sample.c b/monetdb5/modules/mal/sample.c
--- a/monetdb5/modules/mal/sample.c
+++ b/monetdb5/modules/mal/sample.c
@@ -76,79 +76,23 @@
  *
  * INSERT INTO sample_table (SELECT * FROM mysample());
  *
- * The implementation of the uniform sampling is based on the algorithm A as
- * described in the paper "Faster Methods for Random Sampling" by Jeffrey Scott
- * Vitter. Algorithm A is not the fastest one, but it only makes s calls in
- * function random() and it is simpler than the other more complex and CPU
- * intensive algorithms in the literature.
- *
- * Algorithm A instead of performing one random experiment for each row to
- * decide if it should be included in the sample or not, it skips S rows
- * and includes the S+1 row. The algorithm scans the input relation
- * sequentially and maintains the unique and sort properties. The sample is
- * without replacement.
  */
 
 sample_export str
 SAMPLEuniform(bat *r, bat *b, ptr s) {
        BAT *br, *bb;
-       BUN p, sz = *(wrd *)s, top, N, n, jump;
-       BATiter iter;
-       double v, quot;
 
        if ((bb = BATdescriptor(*b)) == NULL) {
                throw(MAL, "sample.uniform", INTERNAL_BAT_ACCESS);
        }
-
-       N = BATcount(bb);
-       if  (sz > N) { /* if the sample is bigger than the input relation */
-               BBPkeepref(*r = bb->batCacheid);
-               return MAL_SUCCEED;
-       }
-
-       if ((br = BATnew(TYPE_oid, bb->ttype, sz)) == NULL) {
-               BBPunfix(bb->batCacheid);
-               throw(MAL, "sample.uniform", MAL_MALLOC_FAIL);
-       }
-
-       n = sz;
-       top = N-n;
-       p = BUNfirst(bb)-1;
-       iter = bat_iterator(bb);
-       while (n-->1) { /* loop until only 1 free spot is left for the sample */
-               v = DRAND;
-               jump = 0;
-               quot = (double)top/(double)N;
-               while (quot > v) { /* determine how many positions to jump */
-                       jump++;
-                       top--;
-                       N--;
-                       quot *= (double)top/(double)N;
-               }
-               p += (jump+1);
-               N--;
-               bunfastins(br, BUNhead(iter, p), BUNtail(iter,p));
-       }
-       /* 1 row left to be added in the sample */
-       p += (BUN) rand() % N;
-       bunfastins(br, BUNhead(iter, p+1), BUNtail(iter,p+1));
-
-       br->tsorted = bb->tsorted;
-       br->hsorted = bb->hsorted;
-       br->tkey = bb->tkey;
-       br->hkey = bb->hkey;
-       br->hdense = FALSE;
-       BATseqbase(br, bb->hseqbase);
-       BATsetcount(br, sz);
+       br = BATsample1(bb,*(BUN *)s);
+       if (br == NULL)
+               throw(MAL, "sample.uniform", OPERATION_FAILED);
 
        BBPunfix(bb->batCacheid);
        BBPkeepref(*r = br->batCacheid);
        return MAL_SUCCEED;
 
-       bunins_failed:
-       BBPunfix(bb->batCacheid);
-       BBPunfix(br->batCacheid);
-       throw(MAL, "sample.uniform", OPERATION_FAILED "bunfastins");
 }
 
 sample_export str
@@ -158,12 +102,12 @@ SAMPLEuniform_dbl(bat *r, bat *b, ptr p)
        wrd s;
 
        if ( pr < 0.0 || pr > 1.0 ) {
-               throw(MAL, "sample.uniform", ILLEGAL_ARGUMENT " p should be 
between 0 and 1.0" );
+               throw(MAL, "sample.uniform", ILLEGAL_ARGUMENT
+                               " p should be between 0 and 1.0" );
        } else if (pr == 0) {/* special case */
                s = 0;
                return SAMPLEuniform(r, b, (ptr)&s);
        }
-
        if ((bb = BATdescriptor(*b)) == NULL) {
                throw(MAL, "sample.uniform", INTERNAL_BAT_ACCESS);
        }
diff --git a/monetdb5/modules/mal/sample.h b/monetdb5/modules/mal/sample.h
--- a/monetdb5/modules/mal/sample.h
+++ b/monetdb5/modules/mal/sample.h
@@ -38,8 +38,6 @@
 #define sample_export extern
 #endif
 
-#define DRAND ((double)rand()/(double)RAND_MAX)
-
 sample_export str
 SAMPLEuniform(bat *r, bat *b, ptr s);
 
_______________________________________________
Checkin-list mailing list
[email protected]
http://mail.monetdb.org/mailman/listinfo/checkin-list

Reply via email to