Changeset: 9eff3359cb46 for MonetDB
URL: https://dev.monetdb.org/hg/MonetDB/rev/9eff3359cb46
Modified Files:
        gdk/gdk_join.c
        gdk/gdk_private.h
        gdk/gdk_string.c
Branch: Mar2025
Log Message:

For dup elim string bats, we can guess the number of unique values by counting 
strings.


diffs (111 lines):

diff --git a/gdk/gdk_join.c b/gdk/gdk_join.c
--- a/gdk/gdk_join.c
+++ b/gdk/gdk_join.c
@@ -3386,6 +3386,13 @@ count_unique(BAT *b, BAT *s, BUN *cnt1, 
                bat_iterator_end(&bi);
                return GDK_SUCCEED;
        }
+       if (bi.key) {
+               /* trivial: all values are distinct */
+               *cnt1 = half;
+               *cnt2 = ci.ncand;
+               bat_iterator_end(&bi);
+               return GDK_SUCCEED;
+       }
 
        assert(bi.type != TYPE_void);
 
@@ -3536,17 +3543,33 @@ count_unique(BAT *b, BAT *s, BUN *cnt1, 
 static double
 guess_uniques(BAT *b, struct canditer *ci)
 {
-       BUN cnt1, cnt2;
+       BUN cnt1 = 0, cnt2;
        BAT *s1;
 
        MT_lock_set(&b->theaplock);
        bool key = b->tkey;
        double unique_est = b->tunique_est;
        BUN batcount = BATcount(b);
+       if (b->ttype == TYPE_str && GDK_ELIMDOUBLES(b->tvheap)) {
+               cnt1 = countStrings(b->tvheap);
+               if (ci->s == NULL ||
+                   (ci->tpe == cand_dense && ci->ncand == batcount)) {
+                       if (b->tunique_est == 0)
+                               b->tunique_est = cnt1;
+               }
+       }
        MT_lock_unset(&b->theaplock);
-       if (key)
+       if (key || ci->ncand <= 1)
                return (double) ci->ncand;
 
+       if (cnt1 > 0) {
+               if (cnt1 >= batcount) {
+                       /* we may be able to set some properties */
+                       (void) BATordered(b);
+               }
+               return (double) (cnt1 < ci->ncand ? cnt1 : ci->ncand);
+       }
+
        if (ci->s == NULL ||
            (ci->tpe == cand_dense && ci->ncand == batcount)) {
                if (unique_est != 0) {
@@ -3589,6 +3612,10 @@ guess_uniques(BAT *b, struct canditer *c
 BUN
 BATguess_uniques(BAT *b, struct canditer *ci)
 {
+       if (b->batCount == 0 || (ci && ci->ncand == 0))
+               return 0;
+       if (b->batCount == 1 || (ci && ci->ncand == 1))
+               return 1;
        struct canditer lci;
        if (ci == NULL) {
                canditer_init(&lci, b, NULL);
diff --git a/gdk/gdk_private.h b/gdk/gdk_private.h
--- a/gdk/gdk_private.h
+++ b/gdk/gdk_private.h
@@ -122,6 +122,8 @@ BUN binsearch_flt(const oid *restrict in
        __attribute__((__visibility__("hidden")));
 BUN binsearch_dbl(const oid *restrict indir, oid offset, const dbl *restrict 
vals, BUN lo, BUN hi, dbl v, int ordering, int last)
        __attribute__((__visibility__("hidden")));
+BUN countStrings(const Heap *h)
+       __attribute__((__visibility__("hidden")));
 Heap *createOIDXheap(BAT *b, bool stable)
        __attribute__((__visibility__("hidden")));
 void doHASHdestroy(BAT *b, Hash *hs)
diff --git a/gdk/gdk_string.c b/gdk/gdk_string.c
--- a/gdk/gdk_string.c
+++ b/gdk/gdk_string.c
@@ -148,6 +148,31 @@ strCleanHash(Heap *h, bool rebuild)
        h->cleanhash = false;
 }
 
+
+BUN
+countStrings(const Heap *h)
+{
+       BUN n = 0;
+       size_t pos = GDK_STRHASHSIZE;
+       size_t pad;
+       const char *s;
+
+       while (pos < h->free) {
+               pad = GDK_VARALIGN - (pos & (GDK_VARALIGN - 1));
+               if (pos + pad < GDK_ELIMLIMIT) {
+                       if (pad < sizeof(stridx_t))
+                               pad += GDK_VARALIGN;
+               } else if (pos >= GDK_ELIMLIMIT)
+                       pad = 0;
+               pos += pad;
+               s = h->base + pos;
+               n++;
+               pos += strlen(s) + 1;
+       }
+       return n;
+}
+
+
 /*
  * The strPut routine. The routine strLocate can be used to identify
  * the location of a string in the heap if it exists. Otherwise it
_______________________________________________
checkin-list mailing list -- [email protected]
To unsubscribe send an email to [email protected]

Reply via email to