Changeset: 248da36f41c7 for MonetDB
URL: http://dev.monetdb.org/hg/MonetDB?cmd=changeset;node=248da36f41c7
Modified Files:
gdk/gdk_firstn.c
monetdb5/modules/mal/Tests/pqueue.stable.out
monetdb5/modules/mal/Tests/pqueue2.stable.out
monetdb5/tests/gdkTests/Tests/firstn.stable.out
Branch: Oct2014
Log Message:
Use a binary heap for the implementation of the simplest form of BATfirstn.
This code was inspired by a suggestion from Richard Hughes (in the
form of code that I didn't end up using).
diffs (283 lines):
diff --git a/gdk/gdk_firstn.c b/gdk/gdk_firstn.c
--- a/gdk/gdk_firstn.c
+++ b/gdk/gdk_firstn.c
@@ -55,47 +55,85 @@
* the major key.
*/
-#define shuffle_unique_body(COMPARE) \
+/* We use a binary heap for the implementation of the simplest form of
+ * first-N. During processing, the oids list forms a heap with the
+ * root at position 0 and the children of a node at position n at
+ * positions 2*n+1 and 2*n+2. The parent node is always
+ * smaller/larger (depending on the value of asc) than its children
+ * (recursively). The heapify macro creates the heap from the input
+ * in-place. We start off with a heap containing the first N elements
+ * of the input, and then go over the rest of the input, replacing the
+ * root of the heap with a new value if appropriate (if the new value
+ * is among the first-N seen so far). The siftup macro then restores
+ * the heap property. */
+#define siftup(OPER, START, GET) \
do { \
- for (i = cand ? *cand++ - b->hseqbase : start; \
- i < end; \
- cand < candend ? (i = *cand++ - b->hseqbase) : i++) { \
- for (j = 0; j < n; j++) { \
- if (j == top) { \
- assert(top < n); \
- oids[top++] = i + b->hseqbase; \
- break; \
- } \
- assert(oids[j] >= b->hseqbase); \
- assert(oids[j] - b->hseqbase < i); \
- if (COMPARE) { \
- if (top < n) \
- top++; \
- for (k = top - 1; k > j; k--) \
- oids[k] = oids[k - 1]; \
- oids[j] = i + b->hseqbase; \
- break; \
- } \
+ pos = (START); \
+ childpos = (pos << 1) + 1; \
+ while (childpos < n) { \
+ /* find most extreme child */ \
+ if (childpos + 1 < n && \
+ !(OPER(GET(oids[childpos + 1]), \
+ GET(oids[childpos])))) \
+ childpos++; \
+ /* compare parent with most extreme child */ \
+ if (OPER(GET(oids[pos]), GET(oids[childpos]))) { \
+ /* exchange parent with child and */ \
+ /* sift child further */ \
+ item = oids[pos]; \
+ oids[pos] = oids[childpos]; \
+ oids[childpos] = item; \
+ pos = childpos; \
+ childpos = (pos << 1) + 1; \
+ continue; \
+ } \
+ break; \
+ } \
+ } while (0)
+
+#define heapify(OPER, GET) \
+ do { \
+ for (i = n / 2; i > 0; i--) \
+ siftup(OPER, i - 1, GET); \
+ } while (0)
+
+#define GETVAL(o) vals[(o) - b->hseqbase]
+#define GETVALany(o) BUNtail(bi, (o) - b->hseqbase + BUNfirst(b))
+#define LTany(a, b) (cmp(a, b) < 0)
+#define GTany(a, b) (cmp(a, b) > 0)
+
+#define shuffle_unique(TYPE, OPER, GET)
\
+ do { \
+ const TYPE *vals = (const TYPE *) Tloc(b, BUNfirst(b)); \
+ heapify(OPER, GET); \
+ while (cand ? cand < candend : start < end) { \
+ i = cand ? *cand++ : start++ + b->hseqbase; \
+ if (OPER(GET(i), GET(oids[0]))) { \
+ oids[0] = i; \
+ siftup(OPER, 0, GET); \
} \
} \
} while (0)
-#define shuffle_unique(TYPE, OPER) \
- do { \
- const TYPE *v = (const TYPE *) Tloc(b, BUNfirst(b)); \
- shuffle_unique_body(OPER(v[i], v[oids[j] - b->hseqbase])); \
- } while (0)
-
+/* This version of BATfirstn returns a list of N oids (where N is the
+ * smallest among BATcount(b), BATcount(s), and n). The oids returned
+ * refer to the N smallest/largest (depending on asc) tail values of b
+ * (taking the optional candidate list s into account). If there are
+ * multiple equal values to take us past N, we return a subset of those.
+ */
static BAT *
BATfirstn_unique(BAT *b, BAT *s, BUN n, int asc)
{
BAT *bn;
BATiter bi = bat_iterator(b);
oid *oids;
- BUN top, i, j, k, cnt, start, end;
+ BUN i, cnt, start, end;
const oid *cand, *candend;
int tpe = b->ttype;
int (*cmp)(const void *, const void *);
+ /* variables used in heapify/siftup macros */
+ oid item;
+ BUN pos, childpos;
CANDINIT(b, s, start, end, cnt, cand, candend);
@@ -156,7 +194,6 @@ BATfirstn_unique(BAT *b, BAT *s, BUN n,
BATsetcount(bn, n);
BATseqbase(bn, 0);
oids = (oid *) Tloc(bn, BUNfirst(bn));
- top = 0;
cmp = BATatoms[b->ttype].atomCmp;
/* if base type has same comparison function as type itself, we
* can use the base type */
@@ -165,52 +202,73 @@ BATfirstn_unique(BAT *b, BAT *s, BUN n,
/* note, this takes care of types oid and wrd */
tpe = ATOMstorage(tpe);
}
+ if (cand) {
+ for (i = 0; i < n; i++)
+ oids[i] = *cand++;
+ } else {
+ for (i = 0; i < n; i++)
+ oids[i] = start++ + b->hseqbase;
+ }
if (asc) {
switch (tpe) {
case TYPE_bte:
- shuffle_unique(bte, LT);
+ shuffle_unique(bte, LT, GETVAL);
break;
case TYPE_sht:
- shuffle_unique(sht, LT);
+ shuffle_unique(sht, LT, GETVAL);
break;
case TYPE_int:
- shuffle_unique(int, LT);
+ shuffle_unique(int, LT, GETVAL);
break;
case TYPE_lng:
- shuffle_unique(lng, LT);
+ shuffle_unique(lng, LT, GETVAL);
break;
case TYPE_flt:
- shuffle_unique(flt, LT);
+ shuffle_unique(flt, LT, GETVAL);
break;
case TYPE_dbl:
- shuffle_unique(dbl, LT);
+ shuffle_unique(dbl, LT, GETVAL);
break;
default:
- shuffle_unique_body(cmp(BUNtail(bi, i + BUNfirst(b)),
BUNtail(bi, oids[j] - b->hseqbase + BUNfirst(b))) < 0);
+ heapify(LTany, GETVALany);
+ while (cand ? cand < candend : start < end) {
+ i = cand ? *cand++ : start++ + b->hseqbase;
+ if (LTany(GETVALany(i), GETVALany(oids[0]))) {
+ oids[0] = i;
+ siftup(LTany, 0, GETVALany);
+ }
+ }
break;
}
} else {
switch (tpe) {
case TYPE_bte:
- shuffle_unique(bte, GT);
+ shuffle_unique(bte, GT, GETVAL);
break;
case TYPE_sht:
- shuffle_unique(sht, GT);
+ shuffle_unique(sht, GT, GETVAL);
break;
case TYPE_int:
- shuffle_unique(int, GT);
+ shuffle_unique(int, GT, GETVAL);
break;
case TYPE_lng:
- shuffle_unique(lng, GT);
+ shuffle_unique(lng, GT, GETVAL);
break;
case TYPE_flt:
- shuffle_unique(flt, GT);
+ shuffle_unique(flt, GT, GETVAL);
break;
case TYPE_dbl:
- shuffle_unique(dbl, GT);
+ shuffle_unique(dbl, GT, GETVAL);
break;
default:
- shuffle_unique_body(cmp(BUNtail(bi, i + BUNfirst(b)),
BUNtail(bi, oids[j] - b->hseqbase + BUNfirst(b))) > 0);
+ heapify(GTany, GETVALany);
+ while (cand ? cand < candend : start < end) {
+ i = cand ? *cand++ : start++ + b->hseqbase;
+ if (GTany(GETVALany(i), GETVALany(oids[0]))) {
+ oids[0] = i;
+ siftup(GTany, 0, GETVALany);
+ }
+ }
break;
}
}
diff --git a/monetdb5/modules/mal/Tests/pqueue.stable.out
b/monetdb5/modules/mal/Tests/pqueue.stable.out
--- a/monetdb5/modules/mal/Tests/pqueue.stable.out
+++ b/monetdb5/modules/mal/Tests/pqueue.stable.out
@@ -103,7 +103,7 @@ end main;
# h t # name
# void oid # type
#--------------------------#
-[ 0@0, 2@0 ]
+[ 0@0, 3@0 ]
[ 1@0, 4@0 ]
[ 2@0, 5@0 ]
[ 3@0, 6@0 ]
diff --git a/monetdb5/modules/mal/Tests/pqueue2.stable.out
b/monetdb5/modules/mal/Tests/pqueue2.stable.out
--- a/monetdb5/modules/mal/Tests/pqueue2.stable.out
+++ b/monetdb5/modules/mal/Tests/pqueue2.stable.out
@@ -166,7 +166,7 @@ end main;
# h t # name
# void oid # type
#--------------------------#
-[ 0@0, 2@0 ]
+[ 0@0, 3@0 ]
[ 1@0, 4@0 ]
[ 2@0, 5@0 ]
[ 3@0, 6@0 ]
diff --git a/monetdb5/tests/gdkTests/Tests/firstn.stable.out
b/monetdb5/tests/gdkTests/Tests/firstn.stable.out
--- a/monetdb5/tests/gdkTests/Tests/firstn.stable.out
+++ b/monetdb5/tests/gdkTests/Tests/firstn.stable.out
@@ -488,8 +488,8 @@ end main;
# void oid # type
#--------------------------#
[ 0@0, 0@0 ]
-[ 1@0, 1@0 ]
-[ 2@0, 2@0 ]
+[ 1@0, 2@0 ]
+[ 2@0, 3@0 ]
[ 3@0, 4@0 ]
[ 4@0, 5@0 ]
[ 5@0, 7@0 ]
@@ -499,8 +499,8 @@ end main;
# void str # type
#--------------------------#
[ 0@0, "2" ]
-[ 1@0, "3" ]
-[ 2@0, "1" ]
+[ 1@0, "1" ]
+[ 2@0, "3" ]
[ 3@0, "1" ]
[ 4@0, "2" ]
[ 5@0, "2" ]
@@ -685,8 +685,8 @@ end main;
#--------------------------#
[ 0@0, 0@0 ]
[ 1@0, 1@0 ]
-[ 2@0, 2@0 ]
-[ 3@0, 3@0 ]
+[ 2@0, 3@0 ]
+[ 3@0, 4@0 ]
[ 4@0, 5@0 ]
[ 5@0, 6@0 ]
[ 6@0, 7@0 ]
@@ -696,8 +696,8 @@ end main;
#--------------------------#
[ 0@0, "2" ]
[ 1@0, "3" ]
-[ 2@0, "1" ]
-[ 3@0, "3" ]
+[ 2@0, "3" ]
+[ 3@0, "1" ]
[ 4@0, "2" ]
[ 5@0, "3" ]
[ 6@0, "2" ]
_______________________________________________
checkin-list mailing list
[email protected]
https://www.monetdb.org/mailman/listinfo/checkin-list