Changeset: 0343b3108ad3 for MonetDB
URL: http://dev.monetdb.org/hg/MonetDB?cmd=changeset;node=0343b3108ad3
Modified Files:
        gdk/gdk.h
        gdk/gdk_hash.c
        gdk/gdk_hash.h
        gdk/gdk_join.c
Branch: imprints-join
Log Message:

change hash function using imprints info for imps_hashjoin()


diffs (truncated from 336 to 300 lines):

diff --git a/gdk/gdk.h b/gdk/gdk.h
--- a/gdk/gdk.h
+++ b/gdk/gdk.h
@@ -1928,6 +1928,12 @@ gdk_export ptr ATOMdup(int id, const voi
 gdk_export gdk_return BAThash(BAT *b, BUN masksize);
 
 /*
+ * Compared to BAThash, this function just modifies the hash function by
+ * injecting the imprints information of the target column
+ */
+gdk_export gdk_return BAThash_imps(BAT *b, BUN masksize);
+
+/*
  * @- Column Imprints Functions
  *
  * @multitable @columnfractions 0.08 0.7
diff --git a/gdk/gdk_hash.c b/gdk/gdk_hash.c
--- a/gdk/gdk_hash.c
+++ b/gdk/gdk_hash.c
@@ -134,6 +134,14 @@ HASHnew(Heap *hp, int tpe, BUN size, BUN
        return h;
 }
 
+#define GETBIN_IMPS(Z,X,B)                     \
+do {                                   \
+       int _i;                         \
+       Z = 0;                          \
+       for (_i = 1; _i < B; _i++)      \
+               Z += ((X) >= bins[_i]); \
+} while (0)
+
 #define starthash(TYPE)                                                        
\
        do {                                                            \
                TYPE *v = (TYPE *) BUNtloc(bi, 0);                      \
@@ -157,6 +165,24 @@ HASHnew(Heap *hp, int tpe, BUN size, BUN
                }                                               \
        } while (0)
 
+
+#define finishhash_imps(TYPE)                                  \
+       do {                                                    \
+               TYPE *v = (TYPE *) BUNtloc(bi, 0);              \
+               int bin;                                                        
\
+               Imprints *imprints = b->timprints;              \
+               const TYPE *restrict bins = (TYPE *) imprints->bins;            
        \
+               const int B = imprints->bits;                   \
+               for (; p < q; p++) {                                    \
+                       BUN c;                                                  
        \
+                       GETBIN_IMPS(bin, *(v + p), B);  \
+                       c = (BUN) hash_imps_##TYPE(h, v + p, bin);      \
+                                                               \
+                       HASHputlink(h, p, HASHget(h, c));       \
+                       HASHput(h, c, p);                       \
+               }                                               \
+       } while (0)
+
 /* collect HASH statistics for analysis */
 static void
 HASHcollisions(BAT *b, Hash *h)
@@ -531,6 +557,219 @@ BAThash(BAT *b, BUN masksize)
        return GDK_SUCCEED;
 }
 
+
+
+gdk_return
+BAThash_imps(BAT *b, BUN masksize)
+{
+       lng t0 = 0, t1 = 0;
+
+       assert(b->batCacheid > 0);
+       if (BATcheckhash(b)) {
+               return GDK_SUCCEED;
+       }
+       MT_lock_set(&GDKhashLock(b->batCacheid));
+       if (b->thash == NULL) {
+               unsigned int tpe = ATOMbasetype(b->ttype);
+               BUN cnt = BATcount(b);
+               BUN mask, maxmask = 0;
+               BUN p = 0, q = BUNlast(b), r;
+               Hash *h = NULL;
+               Heap *hp;
+               const char *nme = BBP_physical(b->batCacheid);
+               const char *ext = b->batCacheid > 0 ? "thash" : "hhash";
+               BATiter bi = bat_iterator(b);
+
+               ALGODEBUG fprintf(stderr, "#BAThash: create hash(%s#" BUNFMT 
");\n", BATgetId(b), BATcount(b));
+               if ((hp = GDKzalloc(sizeof(*hp))) == NULL ||
+                   (hp->farmid = BBPselectfarm(b->batRole, b->ttype, 
hashheap)) < 0 ||
+                   (hp->filename = GDKmalloc(strlen(nme) + 12)) == NULL) {
+                       MT_lock_unset(&GDKhashLock(b->batCacheid));
+                       GDKfree(hp);
+                       return GDK_FAIL;
+               }
+               hp->dirty = TRUE;
+               sprintf(hp->filename, "%s.%s", nme, ext);
+
+               /* cnt = 0, hopefully there is a proper capacity from
+                * which we can derive enough information */
+               if (!cnt)
+                       cnt = BATcapacity(b);
+
+               if (b->ttype == TYPE_void) {
+                       if (b->tseqbase == oid_nil) {
+                               MT_lock_unset(&GDKhashLock(b->batCacheid));
+                               ALGODEBUG fprintf(stderr, "#BAThash: cannot 
create hash-table on void-NIL column.\n");
+                               GDKfree(hp->filename);
+                               GDKfree(hp);
+                               GDKerror("BAThash: no hash on void/nil 
column\n");
+                               return GDK_FAIL;
+                       }
+                       ALGODEBUG fprintf(stderr, "#BAThash: creating 
hash-table on void column..\n");
+
+                       tpe = TYPE_void;
+               }
+               /* determine hash mask size p = first; then no dynamic
+                * scheme */
+               if (masksize > 0) {
+                       mask = HASHmask(masksize);
+               } else if (ATOMsize(tpe) == 1) {
+                       mask = (1 << 8);
+               } else if (ATOMsize(tpe) == 2) {
+                       mask = (1 << 16);
+               } else if (b->tkey) {
+                       mask = HASHmask(cnt);
+               } else {
+                       /* dynamic hash: we start with
+                        * HASHmask(cnt)/64; if there are too many
+                        * collisions we try HASHmask(cnt)/16, then
+                        * HASHmask(cnt)/4, and finally
+                        * HASHmask(cnt).  */
+                       maxmask = HASHmask(cnt);
+                       mask = maxmask >> 6;
+                       p += (cnt >> 2);        /* try out on first 25% of b */
+                       if (p > q)
+                               p = q;
+               }
+
+               ALGODEBUG t0 = GDKusec();
+
+               do {
+                       BUN nslots = mask >> 3; /* 1/8 full is too full */
+
+                       r = 0;
+                       if (h) {
+                               char *fnme;
+                               bte farmid;
+
+                               ALGODEBUG fprintf(stderr, "#BAThash: retry hash 
construction\n");
+                               fnme = GDKstrdup(hp->filename);
+                               farmid = hp->farmid;
+                               HEAPfree(hp, 1);
+                               memset(hp, 0, sizeof(*hp));
+                               hp->filename = fnme;
+                               hp->farmid = farmid;
+                               GDKfree(h);
+                               h = NULL;
+                       }
+                       /* create the hash structures */
+                       if ((h = HASHnew(hp, ATOMtype(b->ttype), 
BATcapacity(b), mask, BATcount(b))) == NULL) {
+
+                               MT_lock_unset(&GDKhashLock(b->batCacheid));
+                               GDKfree(hp->filename);
+                               GDKfree(hp);
+                               return GDK_FAIL;
+                       }
+
+                       switch (tpe) {
+                       case TYPE_bte:
+                               starthash(bte);
+                               break;
+                       case TYPE_sht:
+                               starthash(sht);
+                               break;
+                       case TYPE_flt:
+                               starthash(flt);
+                               break;
+                       case TYPE_int:
+                               starthash(int);
+                               break;
+                       case TYPE_dbl:
+                               starthash(dbl);
+                               break;
+                       case TYPE_lng:
+                               starthash(lng);
+                               break;
+#ifdef HAVE_HGE
+                       case TYPE_hge:
+                               starthash(hge);
+                               break;
+#endif
+                       default:
+                               for (; r < p; r++) {
+                                       ptr v = BUNtail(bi, r);
+                                       BUN c = (BUN) heap_hash_any(b->tvheap, 
h, v);
+
+                                       if (HASHget(h, c) == HASHnil(h) &&
+                                           nslots-- == 0)
+                                               break;  /* mask too full */
+                                       HASHputlink(h, r, HASHget(h, c));
+                                       HASHput(h, c, r);
+                               }
+                               break;
+                       }
+               } while (r < p && mask < maxmask && (mask <<= 2));
+
+               /* finish the hashtable with the current mask */
+               p = r;
+               switch (tpe) {
+               case TYPE_bte:
+                       finishhash(bte);
+                       break;
+               case TYPE_sht:
+                       finishhash(sht);
+                       break;
+               case TYPE_int:
+                       finishhash_imps(int);
+                       break;
+               case TYPE_flt:
+                       finishhash(flt);
+                       break;
+               case TYPE_dbl:
+                       finishhash(dbl);
+                       break;
+               case TYPE_lng:
+                       finishhash(lng);
+                       break;
+#ifdef HAVE_HGE
+               case TYPE_hge:
+                       finishhash(hge);
+                       break;
+#endif
+               default:
+                       for (; p < q; p++) {
+                               ptr v = BUNtail(bi, p);
+                               BUN c = (BUN) heap_hash_any(b->tvheap, h, v);
+
+                               HASHputlink(h, p, HASHget(h, c));
+                               HASHput(h, c, p);
+                       }
+                       break;
+               }
+#ifndef NDEBUG
+               /* clear unused part of Link array */
+               memset((char *) h->Link + q * h->width, 0, (h->lim - q) * 
h->width);
+#endif
+               hp->parentid = b->batCacheid;
+#ifdef PERSISTENTHASH
+               if (BBP_status(b->batCacheid) & BBPEXISTING) {
+                       MT_Id tid;
+                       struct hashsync *hs = GDKmalloc(sizeof(*hs));
+                       if (hs != NULL) {
+                               BBPfix(b->batCacheid);
+                               hs->id = b->batCacheid;
+                               hs->hp = hp;
+                               if (MT_create_thread(&tid, BAThashsync, hs,
+                                                    MT_THR_DETACHED) < 0) {
+                                       /* couldn't start thread: clean up */
+                                       BBPunfix(b->batCacheid);
+                                       GDKfree(hs);
+                               }
+                       }
+               } else
+                       ALGODEBUG fprintf(stderr, "#BAThash: NOT persisting 
hash %d\n", b->batCacheid);
+#endif
+               b->thash = h;
+               ALGODEBUG {
+                       t1 = GDKusec();
+                       fprintf(stderr, "#BAThash: hash construction " LLFMT " 
usec\n", t1 - t0);
+                       HASHcollisions(b, b->thash);
+               }
+       }
+       MT_lock_unset(&GDKhashLock(b->batCacheid));
+       return GDK_SUCCEED;
+}
+
 /*
  * The entry on which a value hashes can be calculated with the
  * routine HASHprobe.
diff --git a/gdk/gdk_hash.h b/gdk/gdk_hash.h
--- a/gdk/gdk_hash.h
+++ b/gdk/gdk_hash.h
@@ -157,6 +157,9 @@ gdk_export BUN HASHlist(Hash *h, BUN i);
 #define hash_oid(H,V)  hash_lng(H,V)
 #endif
 
+#define hash_imps_int(H,V,B)   (((BUN) mix_int(*(const unsigned int *) (V)) & 
((H)->mask >> 6)) | ((B) << 6))
+#define hash_imps_lng(H,V,B)   (((BUN) mix_lng(*(const ulng *) (V)) & 
((H)->mask >> 6)) | ((B) << 6))
+
 #define hash_flt(H,V)  hash_int(H,V)
 #define hash_dbl(H,V)  hash_lng(H,V)
 
diff --git a/gdk/gdk_join.c b/gdk/gdk_join.c
--- a/gdk/gdk_join.c
+++ b/gdk/gdk_join.c
@@ -2580,9 +2580,21 @@ binsearchcand(const oid *cand, BUN lo, B
                if (hb >= (lo) && hb < (hi) &&                  \
                    simple_EQ(v, BUNtloc(bi, hb), TYPE))
 
+#define GETBIN_(Z,X,B)                 \
_______________________________________________
checkin-list mailing list
[email protected]
https://www.monetdb.org/mailman/listinfo/checkin-list

Reply via email to