Changeset: 3425ea0d73a2 for MonetDB URL: http://dev.monetdb.org/hg/MonetDB?cmd=changeset;node=3425ea0d73a2 Modified Files: gdk/ChangeLog.Oct2014 gdk/gdk_sample.c Branch: Oct2014 Log Message:
merger diffs (231 lines): diff --git a/gdk/ChangeLog.Oct2014 b/gdk/ChangeLog.Oct2014 --- a/gdk/ChangeLog.Oct2014 +++ b/gdk/ChangeLog.Oct2014 @@ -1,3 +1,10 @@ # ChangeLog file for MonetDB # This file is updated with Maddlog +* Wed Nov 5 2014 Sjoerd Mullender <[email protected]> +- Fixed some problems with BATsample. It was possible for BATsample to + return a value that was just beyond the end of the sampled BAT. Also, + on some systems the range of the rand() function is rather limited + (0..32767) and trying to get a sample larger than this range would + result in an infinite loop. + diff --git a/gdk/gdk_sample.c b/gdk/gdk_sample.c --- a/gdk/gdk_sample.c +++ b/gdk/gdk_sample.c @@ -21,17 +21,17 @@ * @a Lefteris Sidirourgos, Hannes Muehleisen * @* Low level sample facilities * - * This sampling implementation generates a sorted set of OIDs by calling the - * random number generator, and uses a binary tree to eliminate duplicates. - * The elements of the tree are then used to create a sorted sample BAT. - * This implementation has a logarithmic complexity that only depends on the - * sample size. + * This sampling implementation generates a sorted set of OIDs by + * calling the random number generator, and uses a binary tree to + * eliminate duplicates. The elements of the tree are then used to + * create a sorted sample BAT. This implementation has a logarithmic + * complexity that only depends on the sample size. * - * There is a pathological case when the sample size is almost the size of the BAT. - * Then, many collisions occur and performance degrades. To catch this, we - * switch to antiset semantics when the sample size is larger than half the BAT - * size. Then, we generate the values that should be omitted from the sample. - * + * There is a pathological case when the sample size is almost the + * size of the BAT. Then, many collisions occur and performance + * degrades. To catch this, we switch to antiset semantics when the + * sample size is larger than half the BAT size. Then, we generate the + * values that should be omitted from the sample. */ #include "monetdb_config.h" @@ -46,31 +46,27 @@ /* the range of rand() is [0..RAND_MAX], i.e. inclusive; * cast first, add later: on Linux RAND_MAX == INT_MAX, so adding 1 * will overflow, but INT_MAX does fit in a double */ +#if RAND_MAX < 46340 /* 46340*46340 = 2147395600 < INT_MAX */ +/* random range is too small, double it */ +#define DRAND ((double)(rand() * (RAND_MAX + 1) + rand()) / ((double) ((RAND_MAX + 1) * (RAND_MAX + 1)))) +#else #define DRAND ((double)rand() / ((double)RAND_MAX + 1)) #endif +#endif /* this is a straightforward implementation of a binary tree */ struct oidtreenode { BUN oid; - struct oidtreenode* left; - struct oidtreenode* right; + struct oidtreenode *left; + struct oidtreenode *right; }; -static int OIDTreeLookup(struct oidtreenode* node, BUN target) { - while (node) { - if (target == node->oid) - return (TRUE); - else if (target < node->oid) - node = node->left; - else - node = node->right; - } - return FALSE; -} +static struct oidtreenode * +OIDTreeNew(BUN oid) +{ + struct oidtreenode *node = GDKmalloc(sizeof(struct oidtreenode)); -static struct oidtreenode* OIDTreeNew(BUN oid) { - struct oidtreenode *node = GDKmalloc(sizeof(struct oidtreenode)); if (node == NULL) { GDKerror("#BATsample: memory allocation error"); return NULL ; @@ -78,22 +74,29 @@ static struct oidtreenode* OIDTreeNew(BU node->oid = oid; node->left = NULL; node->right = NULL; - return (node); + return node; } -static struct oidtreenode* OIDTreeInsert(struct oidtreenode** nodep, BUN oid) { +static int +OIDTreeMaybeInsert(struct oidtreenode **nodep, BUN oid) +{ while (*nodep) { - if (oid <= (*nodep)->oid) + if (oid == (*nodep)->oid) + return 0; + if (oid < (*nodep)->oid) nodep = &(*nodep)->left; else nodep = &(*nodep)->right; } - *nodep = OIDTreeNew(oid); - return *nodep; + if ((*nodep = OIDTreeNew(oid)) == NULL) + return -1; + return 1; } /* inorder traversal, gives us a sorted BAT */ -static void OIDTreeToBAT(struct oidtreenode* node, BAT *bat) { +static void +OIDTreeToBAT(struct oidtreenode *node, BAT *bat) +{ if (node->left != NULL) OIDTreeToBAT(node->left, bat); ((oid *) bat->T->heap.base)[bat->batFirst + bat->batCount++] = node->oid; @@ -102,14 +105,17 @@ static void OIDTreeToBAT(struct oidtreen } /* Antiset traversal, give us all values but the ones in the tree */ -static void OIDTreeToBATAntiset(struct oidtreenode* node, BAT *bat, BUN start, BUN stop) { +static void +OIDTreeToBATAntiset(struct oidtreenode *node, BAT *bat, BUN start, BUN stop) +{ BUN noid; + if (node->left != NULL) OIDTreeToBATAntiset(node->left, bat, start, node->oid); - else + else for (noid = start+1; noid < node->oid; noid++) - ((oid *) bat->T->heap.base)[bat->batFirst + bat->batCount++] = noid; - + ((oid *) bat->T->heap.base)[bat->batFirst + bat->batCount++] = noid; + if (node->right != NULL) OIDTreeToBATAntiset(node->right, bat, node->oid, stop); else @@ -117,7 +123,9 @@ static void OIDTreeToBATAntiset(struct o ((oid *) bat->T->heap.base)[bat->batFirst + bat->batCount++] = noid; } -static void OIDTreeDestroy(struct oidtreenode* node) { +static void +OIDTreeDestroy(struct oidtreenode *node) +{ if (node == NULL) { return; } @@ -133,11 +141,12 @@ static void OIDTreeDestroy(struct oidtre /* BATsample implements sampling for void headed BATs */ BAT * -BATsample(BAT *b, BUN n) { +BATsample(BAT *b, BUN n) +{ BAT *bn; BUN cnt, slen; BUN rescnt = 0; - struct oidtreenode* tree = NULL; + struct oidtreenode *tree = NULL; BATcheck(b, "BATsample"); assert(BAThdense(b)); @@ -169,31 +178,32 @@ BATsample(BAT *b, BUN n) { } else { BUN minoid = b->hseqbase; BUN maxoid = b->hseqbase + cnt; - /* if someone samples more than half of our tree, we do the antiset */ - bit antiset = n > cnt/2; + /* if someone samples more than half of our tree, we + * do the antiset */ + bit antiset = n > cnt / 2; slen = n; - if (antiset) + if (antiset) n = cnt - n; - + bn = BATnew(TYPE_void, TYPE_oid, slen, TRANSIENT); - if (bn == NULL ) { + if (bn == NULL) { GDKerror("#BATsample: memory allocation error"); return NULL; } /* while we do not have enough sample OIDs yet */ while (rescnt < n) { BUN candoid; - struct oidtreenode* ttree; + int rc; do { /* generate a new random OID */ - /* coverity[dont_call] */ candoid = (BUN) (minoid + DRAND * (maxoid - minoid)); - /* if that candidate OID was already generated, try again */ - } while (OIDTreeLookup(tree, candoid)); - ttree = OIDTreeInsert(&tree, candoid); - if (ttree == NULL) { + /* if that candidate OID was already + * generated, try again */ + } while ((rc = OIDTreeMaybeInsert(&tree, candoid)) == 0); + if (rc < 0) { GDKerror("#BATsample: memory allocation error"); - /* if malloc fails, we still need to clean up the tree */ + /* if malloc fails, we still need to + * clean up the tree */ OIDTreeDestroy(tree); return NULL; } @@ -202,7 +212,7 @@ BATsample(BAT *b, BUN n) { if (!antiset) { OIDTreeToBAT(tree, bn); } else { - OIDTreeToBATAntiset(tree, bn, minoid-1, maxoid+1); + OIDTreeToBATAntiset(tree, bn, minoid - 1, maxoid + 1); } OIDTreeDestroy(tree); @@ -221,4 +231,3 @@ BATsample(BAT *b, BUN n) { } return bn; } - _______________________________________________ checkin-list mailing list [email protected] https://www.monetdb.org/mailman/listinfo/checkin-list
