Changeset: 248da36f41c7 for MonetDB
URL: http://dev.monetdb.org/hg/MonetDB?cmd=changeset;node=248da36f41c7
Modified Files:
        gdk/gdk_firstn.c
        monetdb5/modules/mal/Tests/pqueue.stable.out
        monetdb5/modules/mal/Tests/pqueue2.stable.out
        monetdb5/tests/gdkTests/Tests/firstn.stable.out
Branch: Oct2014
Log Message:

Use a binary heap for the implementation of the simplest form of BATfirstn.
This code was inspired by a suggestion from Richard Hughes (in the
form of code that I didn't end up using).


diffs (283 lines):

diff --git a/gdk/gdk_firstn.c b/gdk/gdk_firstn.c
--- a/gdk/gdk_firstn.c
+++ b/gdk/gdk_firstn.c
@@ -55,47 +55,85 @@
  * the major key.
  */
 
-#define shuffle_unique_body(COMPARE)                                   \
+/* We use a binary heap for the implementation of the simplest form of
+ * first-N.  During processing, the oids list forms a heap with the
+ * root at position 0 and the children of a node at position n at
+ * positions 2*n+1 and 2*n+2.  The parent node is always
+ * smaller/larger (depending on the value of asc) than its children
+ * (recursively).  The heapify macro creates the heap from the input
+ * in-place.  We start off with a heap containing the first N elements
+ * of the input, and then go over the rest of the input, replacing the
+ * root of the heap with a new value if appropriate (if the new value
+ * is among the first-N seen so far).  The siftup macro then restores
+ * the heap property. */
+#define siftup(OPER, START, GET)                                       \
        do {                                                            \
-               for (i = cand ? *cand++ - b->hseqbase : start;          \
-                    i < end;                                           \
-                    cand < candend ? (i = *cand++ - b->hseqbase) : i++) { \
-                       for (j = 0; j < n; j++) {                       \
-                               if (j == top) {                         \
-                                       assert(top < n);                \
-                                       oids[top++] = i + b->hseqbase;  \
-                                       break;                          \
-                               }                                       \
-                               assert(oids[j] >= b->hseqbase);         \
-                               assert(oids[j] - b->hseqbase < i);      \
-                               if (COMPARE) {                          \
-                                       if (top < n)                    \
-                                               top++;                  \
-                                       for (k = top - 1; k > j; k--)   \
-                                               oids[k] = oids[k - 1];  \
-                                       oids[j] = i + b->hseqbase;      \
-                                       break;                          \
-                               }                                       \
+           pos = (START);                                              \
+               childpos = (pos << 1) + 1;                              \
+               while (childpos < n) {                                  \
+                       /* find most extreme child */                   \
+                       if (childpos + 1 < n &&                         \
+                           !(OPER(GET(oids[childpos + 1]),             \
+                                  GET(oids[childpos]))))               \
+                               childpos++;                             \
+                       /* compare parent with most extreme child */    \
+                       if (OPER(GET(oids[pos]), GET(oids[childpos]))) { \
+                               /* exchange parent with child and */    \
+                               /* sift child further */                \
+                               item = oids[pos];                       \
+                               oids[pos] = oids[childpos];             \
+                               oids[childpos] = item;                  \
+                               pos = childpos;                         \
+                               childpos = (pos << 1) + 1;              \
+                               continue;                               \
+                       }                                               \
+                       break;                                          \
+               }                                                       \
+       } while (0)
+
+#define heapify(OPER, GET)                             \
+       do {                                            \
+               for (i = n / 2; i > 0; i--)             \
+                       siftup(OPER, i - 1, GET);       \
+       } while (0)
+
+#define GETVAL(o)      vals[(o) - b->hseqbase]
+#define GETVALany(o)   BUNtail(bi, (o) - b->hseqbase + BUNfirst(b))
+#define LTany(a, b)    (cmp(a, b) < 0)
+#define GTany(a, b)    (cmp(a, b) > 0)
+
+#define shuffle_unique(TYPE, OPER, GET)                                        
\
+       do {                                                            \
+               const TYPE *vals = (const TYPE *) Tloc(b, BUNfirst(b)); \
+               heapify(OPER, GET);                                     \
+               while (cand ? cand < candend : start < end) {           \
+                       i = cand ? *cand++ : start++ + b->hseqbase;     \
+                       if (OPER(GET(i), GET(oids[0]))) {               \
+                               oids[0] = i;                            \
+                               siftup(OPER, 0, GET);                   \
                        }                                               \
                }                                                       \
        } while (0)
 
-#define shuffle_unique(TYPE, OPER)                                     \
-       do {                                                            \
-               const TYPE *v = (const TYPE *) Tloc(b, BUNfirst(b));    \
-               shuffle_unique_body(OPER(v[i], v[oids[j] - b->hseqbase])); \
-       } while (0)
-
+/* This version of BATfirstn returns a list of N oids (where N is the
+ * smallest among BATcount(b), BATcount(s), and n).  The oids returned
+ * refer to the N smallest/largest (depending on asc) tail values of b
+ * (taking the optional candidate list s into account).  If there are
+ * multiple equal values to take us past N, we return a subset of those.
+ */
 static BAT *
 BATfirstn_unique(BAT *b, BAT *s, BUN n, int asc)
 {
        BAT *bn;
        BATiter bi = bat_iterator(b);
        oid *oids;
-       BUN top, i, j, k, cnt, start, end;
+       BUN i, cnt, start, end;
        const oid *cand, *candend;
        int tpe = b->ttype;
        int (*cmp)(const void *, const void *);
+       /* variables used in heapify/siftup macros */
+       oid item;
+       BUN pos, childpos;
 
        CANDINIT(b, s, start, end, cnt, cand, candend);
 
@@ -156,7 +194,6 @@ BATfirstn_unique(BAT *b, BAT *s, BUN n, 
        BATsetcount(bn, n);
        BATseqbase(bn, 0);
        oids = (oid *) Tloc(bn, BUNfirst(bn));
-       top = 0;
        cmp = BATatoms[b->ttype].atomCmp;
        /* if base type has same comparison function as type itself, we
         * can use the base type */
@@ -165,52 +202,73 @@ BATfirstn_unique(BAT *b, BAT *s, BUN n, 
                /* note, this takes care of types oid and wrd */
                tpe = ATOMstorage(tpe);
        }
+       if (cand) {
+               for (i = 0; i < n; i++)
+                       oids[i] = *cand++;
+       } else {
+               for (i = 0; i < n; i++)
+                       oids[i] = start++ + b->hseqbase;
+       }
        if (asc) {
                switch (tpe) {
                case TYPE_bte:
-                       shuffle_unique(bte, LT);
+                       shuffle_unique(bte, LT, GETVAL);
                        break;
                case TYPE_sht:
-                       shuffle_unique(sht, LT);
+                       shuffle_unique(sht, LT, GETVAL);
                        break;
                case TYPE_int:
-                       shuffle_unique(int, LT);
+                       shuffle_unique(int, LT, GETVAL);
                        break;
                case TYPE_lng:
-                       shuffle_unique(lng, LT);
+                       shuffle_unique(lng, LT, GETVAL);
                        break;
                case TYPE_flt:
-                       shuffle_unique(flt, LT);
+                       shuffle_unique(flt, LT, GETVAL);
                        break;
                case TYPE_dbl:
-                       shuffle_unique(dbl, LT);
+                       shuffle_unique(dbl, LT, GETVAL);
                        break;
                default:
-                       shuffle_unique_body(cmp(BUNtail(bi, i + BUNfirst(b)), 
BUNtail(bi, oids[j] - b->hseqbase + BUNfirst(b))) < 0);
+                       heapify(LTany, GETVALany);
+                       while (cand ? cand < candend : start < end) {
+                               i = cand ? *cand++ : start++ + b->hseqbase;
+                               if (LTany(GETVALany(i), GETVALany(oids[0]))) {
+                                       oids[0] = i;
+                                       siftup(LTany, 0, GETVALany);
+                               }
+                       }
                        break;
                }
        } else {
                switch (tpe) {
                case TYPE_bte:
-                       shuffle_unique(bte, GT);
+                       shuffle_unique(bte, GT, GETVAL);
                        break;
                case TYPE_sht:
-                       shuffle_unique(sht, GT);
+                       shuffle_unique(sht, GT, GETVAL);
                        break;
                case TYPE_int:
-                       shuffle_unique(int, GT);
+                       shuffle_unique(int, GT, GETVAL);
                        break;
                case TYPE_lng:
-                       shuffle_unique(lng, GT);
+                       shuffle_unique(lng, GT, GETVAL);
                        break;
                case TYPE_flt:
-                       shuffle_unique(flt, GT);
+                       shuffle_unique(flt, GT, GETVAL);
                        break;
                case TYPE_dbl:
-                       shuffle_unique(dbl, GT);
+                       shuffle_unique(dbl, GT, GETVAL);
                        break;
                default:
-                       shuffle_unique_body(cmp(BUNtail(bi, i + BUNfirst(b)), 
BUNtail(bi, oids[j] - b->hseqbase + BUNfirst(b))) > 0);
+                       heapify(GTany, GETVALany);
+                       while (cand ? cand < candend : start < end) {
+                               i = cand ? *cand++ : start++ + b->hseqbase;
+                               if (GTany(GETVALany(i), GETVALany(oids[0]))) {
+                                       oids[0] = i;
+                                       siftup(GTany, 0, GETVALany);
+                               }
+                       }
                        break;
                }
        }
diff --git a/monetdb5/modules/mal/Tests/pqueue.stable.out 
b/monetdb5/modules/mal/Tests/pqueue.stable.out
--- a/monetdb5/modules/mal/Tests/pqueue.stable.out
+++ b/monetdb5/modules/mal/Tests/pqueue.stable.out
@@ -103,7 +103,7 @@ end main;
 # h    t  # name
 # void oid  # type
 #--------------------------#
-[ 0@0, 2@0  ]
+[ 0@0, 3@0  ]
 [ 1@0, 4@0  ]
 [ 2@0, 5@0  ]
 [ 3@0, 6@0  ]
diff --git a/monetdb5/modules/mal/Tests/pqueue2.stable.out 
b/monetdb5/modules/mal/Tests/pqueue2.stable.out
--- a/monetdb5/modules/mal/Tests/pqueue2.stable.out
+++ b/monetdb5/modules/mal/Tests/pqueue2.stable.out
@@ -166,7 +166,7 @@ end main;
 # h    t  # name
 # void oid  # type
 #--------------------------#
-[ 0@0, 2@0  ]
+[ 0@0, 3@0  ]
 [ 1@0, 4@0  ]
 [ 2@0, 5@0  ]
 [ 3@0, 6@0  ]
diff --git a/monetdb5/tests/gdkTests/Tests/firstn.stable.out 
b/monetdb5/tests/gdkTests/Tests/firstn.stable.out
--- a/monetdb5/tests/gdkTests/Tests/firstn.stable.out
+++ b/monetdb5/tests/gdkTests/Tests/firstn.stable.out
@@ -488,8 +488,8 @@ end main;
 # void oid  # type
 #--------------------------#
 [ 0@0, 0@0  ]
-[ 1@0, 1@0  ]
-[ 2@0, 2@0  ]
+[ 1@0, 2@0  ]
+[ 2@0, 3@0  ]
 [ 3@0, 4@0  ]
 [ 4@0, 5@0  ]
 [ 5@0, 7@0  ]
@@ -499,8 +499,8 @@ end main;
 # void str  # type
 #--------------------------#
 [ 0@0, "2"  ]
-[ 1@0, "3"  ]
-[ 2@0, "1"  ]
+[ 1@0, "1"  ]
+[ 2@0, "3"  ]
 [ 3@0, "1"  ]
 [ 4@0, "2"  ]
 [ 5@0, "2"  ]
@@ -685,8 +685,8 @@ end main;
 #--------------------------#
 [ 0@0, 0@0  ]
 [ 1@0, 1@0  ]
-[ 2@0, 2@0  ]
-[ 3@0, 3@0  ]
+[ 2@0, 3@0  ]
+[ 3@0, 4@0  ]
 [ 4@0, 5@0  ]
 [ 5@0, 6@0  ]
 [ 6@0, 7@0  ]
@@ -696,8 +696,8 @@ end main;
 #--------------------------#
 [ 0@0, "2"  ]
 [ 1@0, "3"  ]
-[ 2@0, "1"  ]
-[ 3@0, "3"  ]
+[ 2@0, "3"  ]
+[ 3@0, "1"  ]
 [ 4@0, "2"  ]
 [ 5@0, "3"  ]
 [ 6@0, "2"  ]
_______________________________________________
checkin-list mailing list
[email protected]
https://www.monetdb.org/mailman/listinfo/checkin-list

Reply via email to