Changeset: 9eff3359cb46 for MonetDB
URL: https://dev.monetdb.org/hg/MonetDB/rev/9eff3359cb46
Modified Files:
gdk/gdk_join.c
gdk/gdk_private.h
gdk/gdk_string.c
Branch: Mar2025
Log Message:
For dup elim string bats, we can guess the number of unique values by counting
strings.
diffs (111 lines):
diff --git a/gdk/gdk_join.c b/gdk/gdk_join.c
--- a/gdk/gdk_join.c
+++ b/gdk/gdk_join.c
@@ -3386,6 +3386,13 @@ count_unique(BAT *b, BAT *s, BUN *cnt1,
bat_iterator_end(&bi);
return GDK_SUCCEED;
}
+ if (bi.key) {
+ /* trivial: all values are distinct */
+ *cnt1 = half;
+ *cnt2 = ci.ncand;
+ bat_iterator_end(&bi);
+ return GDK_SUCCEED;
+ }
assert(bi.type != TYPE_void);
@@ -3536,17 +3543,33 @@ count_unique(BAT *b, BAT *s, BUN *cnt1,
static double
guess_uniques(BAT *b, struct canditer *ci)
{
- BUN cnt1, cnt2;
+ BUN cnt1 = 0, cnt2;
BAT *s1;
MT_lock_set(&b->theaplock);
bool key = b->tkey;
double unique_est = b->tunique_est;
BUN batcount = BATcount(b);
+ if (b->ttype == TYPE_str && GDK_ELIMDOUBLES(b->tvheap)) {
+ cnt1 = countStrings(b->tvheap);
+ if (ci->s == NULL ||
+ (ci->tpe == cand_dense && ci->ncand == batcount)) {
+ if (b->tunique_est == 0)
+ b->tunique_est = cnt1;
+ }
+ }
MT_lock_unset(&b->theaplock);
- if (key)
+ if (key || ci->ncand <= 1)
return (double) ci->ncand;
+ if (cnt1 > 0) {
+ if (cnt1 >= batcount) {
+ /* we may be able to set some properties */
+ (void) BATordered(b);
+ }
+ return (double) (cnt1 < ci->ncand ? cnt1 : ci->ncand);
+ }
+
if (ci->s == NULL ||
(ci->tpe == cand_dense && ci->ncand == batcount)) {
if (unique_est != 0) {
@@ -3589,6 +3612,10 @@ guess_uniques(BAT *b, struct canditer *c
BUN
BATguess_uniques(BAT *b, struct canditer *ci)
{
+ if (b->batCount == 0 || (ci && ci->ncand == 0))
+ return 0;
+ if (b->batCount == 1 || (ci && ci->ncand == 1))
+ return 1;
struct canditer lci;
if (ci == NULL) {
canditer_init(&lci, b, NULL);
diff --git a/gdk/gdk_private.h b/gdk/gdk_private.h
--- a/gdk/gdk_private.h
+++ b/gdk/gdk_private.h
@@ -122,6 +122,8 @@ BUN binsearch_flt(const oid *restrict in
__attribute__((__visibility__("hidden")));
BUN binsearch_dbl(const oid *restrict indir, oid offset, const dbl *restrict
vals, BUN lo, BUN hi, dbl v, int ordering, int last)
__attribute__((__visibility__("hidden")));
+BUN countStrings(const Heap *h)
+ __attribute__((__visibility__("hidden")));
Heap *createOIDXheap(BAT *b, bool stable)
__attribute__((__visibility__("hidden")));
void doHASHdestroy(BAT *b, Hash *hs)
diff --git a/gdk/gdk_string.c b/gdk/gdk_string.c
--- a/gdk/gdk_string.c
+++ b/gdk/gdk_string.c
@@ -148,6 +148,31 @@ strCleanHash(Heap *h, bool rebuild)
h->cleanhash = false;
}
+
+BUN
+countStrings(const Heap *h)
+{
+ BUN n = 0;
+ size_t pos = GDK_STRHASHSIZE;
+ size_t pad;
+ const char *s;
+
+ while (pos < h->free) {
+ pad = GDK_VARALIGN - (pos & (GDK_VARALIGN - 1));
+ if (pos + pad < GDK_ELIMLIMIT) {
+ if (pad < sizeof(stridx_t))
+ pad += GDK_VARALIGN;
+ } else if (pos >= GDK_ELIMLIMIT)
+ pad = 0;
+ pos += pad;
+ s = h->base + pos;
+ n++;
+ pos += strlen(s) + 1;
+ }
+ return n;
+}
+
+
/*
* The strPut routine. The routine strLocate can be used to identify
* the location of a string in the heap if it exists. Otherwise it
_______________________________________________
checkin-list mailing list -- [email protected]
To unsubscribe send an email to [email protected]