Changeset: f5275d838579 for MonetDB
URL: http://dev.monetdb.org/hg/MonetDB?cmd=changeset;node=f5275d838579
Added Files:
        gdk/gdk_bloom.c
        gdk/gdk_bloom.h
Modified Files:
        gdk/Makefile.ag
        gdk/gdk.h
        gdk/gdk_join.c
        gdk/gdk_private.h
Branch: leftmart
Log Message:

Adding bloom filters support


diffs (truncated from 472 to 300 lines):

diff --git a/gdk/Makefile.ag b/gdk/Makefile.ag
--- a/gdk/Makefile.ag
+++ b/gdk/Makefile.ag
@@ -28,6 +28,7 @@ lib_gdk = {
                gdk_calc.c gdk_calc.h gdk_calc_compare.h gdk_calc_private.h \
                gdk_aggr.c gdk_group.c \
                gdk_imprints.c gdk_imprints.h \
+               gdk_bloom.c gdk_bloom.h \
                gdk_join.c gdk_project.c \
                gdk_unique.c \
                gdk_firstn.c \
diff --git a/gdk/gdk.h b/gdk/gdk.h
--- a/gdk/gdk.h
+++ b/gdk/gdk.h
@@ -675,6 +675,7 @@ typedef struct {
 } Hash;
 
 typedef struct Imprints Imprints;
+typedef struct Bloomfilter Bloomfilter;
 
 
 /*
@@ -798,6 +799,7 @@ gdk_export int VALisnil(const ValRecord 
  *           Hash   *hhash;           // linear chained hash table on head
  *           Imprints *himprints;     // column imprints index on head
  *           orderidx horderidx;      // order oid index on head
+ *           bloomfilter hbloom;      // bloomfilter on head
  *           // Tail properties
  *           int    ttype;            // Tail type number
  *           str    tident;           // name for tail column
@@ -812,6 +814,7 @@ gdk_export int VALisnil(const ValRecord 
  *           Hash   *thash;           // linear chained hash table on tail
  *           Imprints *timprints;     // column imprints index on tail
  *           orderidx torderidx;      // order oid index on tail
+ *           bloomfilter tbloom;      // bloom filter on tail
  *  } BAT;
  * @end verbatim
  *
@@ -888,6 +891,7 @@ typedef struct {
        Hash *hash;             /* hash table */
        Imprints *imprints;     /* column imprints index */
        Heap *orderidx;         /* order oid index */
+       Bloomfilter *bloom;     /* a bloomfilter */
 
        PROPrec *props;         /* list of dynamic properties stored in the bat 
descriptor */
 } COLrec;
@@ -960,6 +964,9 @@ typedef int (*GDKfcn) ();
 #define talign         T->align
 #define horderidx      H->orderidx
 #define torderidx      T->orderidx
+#define hbloom         H->bloom
+#define tbloom         T->bloom
+
 
 
 /*
@@ -2018,9 +2025,14 @@ gdk_export gdk_return BAThash(BAT *b, BU
 gdk_export gdk_return BATimprints(BAT *b);
 gdk_export lng IMPSimprintsize(BAT *b);
 
+/* The ordered index structure */
+
 gdk_export gdk_return BATorderidx(BAT *b, int stable);
 gdk_export gdk_return GDKmergeidx(BAT *b, BAT**a, int n_ar);
 
+/* The bloom filters */
+gdk_export gdk_return BATbloom(BAT *b);
+
 /*
  * @- Multilevel Storage Modes
  *
diff --git a/gdk/gdk_bloom.c b/gdk/gdk_bloom.c
new file mode 100644
--- /dev/null
+++ b/gdk/gdk_bloom.c
@@ -0,0 +1,303 @@
+/*
+ * This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0.  If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/.
+ *
+ * Copyright 1997 - July 2008 CWI, August 2008 - 2016 MonetDB B.V.
+ */
+
+/*
+ * Implementation of bloom filters.
+ * L.Sidirourgos
+ */
+
+#include "monetdb_config.h"
+#include "gdk.h"
+#include "gdk_private.h"
+#include "gdk_bloom.h"
+
+#define BLOOM_VERSION  1
+#define BLOOM_HEADER_SIZE      4 /* nr of size_t fields in header */
+
+int
+BATcheckbloom(BAT *b)
+{
+       int ret;
+
+       MT_lock_set(&GDKhashLock(abs(b->batCacheid)));
+
+       /* if b does not have a bloom filter, but the parent has, then use it */
+       if ((b->T->bloom == NULL) && VIEWtparent(b)) {
+               BAT *pb = BATmirror(BATdescriptor(VIEWtparent(b)));
+               b->T->bloom = pb->T->bloom;
+               BBPunfix(pb->batCacheid);
+       }
+       ret = b->T->bloom != NULL;
+       MT_lock_unset(&GDKhashLock(abs(b->batCacheid)));
+
+       ALGODEBUG if (ret) fprintf(stderr, "#BATcheckbloom: already has a bloom 
%d\n", b->batCacheid);
+       return ret;
+}
+
+static BUN
+BLOOMsize(BUN cnt) {
+       BUN m = cnt;
+
+       /* smaller power of 2 that is greater or equal to cnt */
+       --m;
+       m |= m >> 1;
+       m |= m >> 2;
+       m |= m >> 4;
+       m |= m >> 8;
+       m |= m >> 16;
+#if SIZEOF_BUN == 8
+       m |= m >> 32;
+#endif
+       m++;
+
+       /* double it */
+       m <<= 1;
+
+       /* if m is almost 2*cnt, double again */
+       if (m / cnt == 2)
+               m <<= 1;
+
+       return m;
+}
+
+#define BLOOM_BUILD(TYPE)                                                      
\
+do {                                                                           
\
+       const TYPE *restrict col = (TYPE *) Tloc(b, b->batFirst);               
\
+       BUN p;                                                                  
\
+       BUN key,hv,x,y,z;                                                       
\
+       for (p=0; p<cnt; p++) {                                                 
\
+               key = (BUN) col[p];                                             
\
+               hash_init(key, x,y,z);                                          
\
+               next_hash(hv, x,y,z);                                           
\
+               filter[quotient8(hv)] |= (1 << remainder8(hv));                 
\
+               next_hash(hv, x,y,z);                                           
\
+               filter[quotient8(hv)] |= (1 << remainder8(hv));                 
\
+               if (bloom->kfunc == 3) {                                        
\
+                       next_hash(hv, x,y,z);                                   
\
+                       filter[quotient8(hv)] |= (1 << remainder8(hv));         
\
+               }                                                               
\
+       }                                                                       
\
+} while (0)
+
+gdk_return
+BATbloom(BAT *b)
+{
+       lng t0 = 0, t1 = 0;
+
+       assert(BAThdense(b));   /* assert void head */
+
+       /* we only create bloom filters for types that look like types we know 
*/
+       switch (ATOMbasetype(b->T->type)) {
+       case TYPE_bte:
+       case TYPE_sht:
+       case TYPE_int:
+       case TYPE_lng:
+#ifdef HAVE_HGE
+       case TYPE_hge:
+#endif
+       case TYPE_flt:
+       case TYPE_dbl:
+               break;
+       default:                /* type not supported */
+               /* doesn't look enough like base type: do nothing */
+               GDKerror("BATbloom: unsupported type\n");
+               return GDK_FAIL;
+       }
+
+       BATcheck(b, "BATbloom", GDK_FAIL);
+
+       if (BATcheckbloom(b))
+               return GDK_SUCCEED;
+       assert(b->T->bloom == NULL);
+
+       MT_lock_set(&GDKhashLock(abs(b->batCacheid)));
+       t0 = GDKusec();
+
+       if (b->T->bloom == NULL) {
+               BUN cnt;
+               size_t bytes;
+               Bloomfilter *bloom;
+               char *filter;
+
+               bloom = (Bloomfilter *) GDKzalloc(sizeof(Bloomfilter));
+               if (bloom == NULL) {
+                       GDKerror("#BATbloom: memory allocation error");
+                       MT_lock_unset(&GDKhashLock(abs(b->batCacheid)));
+                       return GDK_FAIL;
+               }
+               bloom->filter = (Heap *) GDKzalloc(sizeof(Heap));
+               if (bloom->filter == NULL) {
+                       GDKerror("#BATbloom: memory allocation error");
+                       GDKfree(bloom);
+                       MT_lock_unset(&GDKhashLock(abs(b->batCacheid)));
+                       return GDK_FAIL;
+               }
+
+               cnt = BATcount(b);
+               /* TODO: check also if max-min < mbits and use identiry hash */
+               if ( (ATOMbasetype(b->T->type) == TYPE_bte && (bloom->mbits = 
(1 << 8))) ||
+                    (ATOMbasetype(b->T->type) == TYPE_sht && (bloom->mbits = 
(1 << 16))) ) {
+                       bloom->kfunc = 1;
+                       bloom->mask = bloom->mbits-1;
+                       bytes = quotient8(bloom->mbits);
+               } else {
+                       bloom->mbits = BLOOMsize(cnt);
+                       /* 2 functions if the ratio is close to 3, 3 otherwise 
*/
+                       bloom->kfunc = bloom->mbits/cnt == 3 ? 2 : 3;
+                       bloom->mask = bloom->mbits-1;
+                       bytes = quotient8(bloom->mbits) + 1;
+               }
+
+               if (HEAPalloc(bloom->filter, bytes, 1) != GDK_SUCCEED) {
+                       GDKerror("#BATbloom: memory allocation error");
+                       GDKfree(bloom->filter);
+                       GDKfree(bloom);
+                       MT_lock_unset(&GDKhashLock(abs(b->batCacheid)));
+                       return GDK_FAIL;
+               }
+
+               ALGODEBUG fprintf(stderr, "#BATbloom(b=%s#" BUNFMT ") %s: "
+                               "create bloom filter: mbits = " BUNFMT ", ratio 
= " BUNFMT
+                               "kfunc = %d, bytes = " BUNFMT "\n", BATgetId(b),
+                               BATcount(b), b->T->heap.filename,
+                               bloom->mbits,  bloom->mbits/BATcount(b),
+                               bloom->kfunc, quotient8(bloom->mbits));
+
+               filter = (unsigned char *) bloom->filter->base;
+
+               switch (ATOMbasetype(b->T->type)) {
+               case TYPE_bte:
+               {
+                       const unsigned char *restrict col = (unsigned char *) 
Tloc(b, b->batFirst);
+                       BUN p;
+                       assert(bloom->kfunc == 1);
+                       for (p=0; p<cnt; p++)
+                               filter[quotient8(col[p])] |= (1 << 
remainder8(col[p]));
+               }
+               break;
+               case TYPE_sht:
+               {
+                       const unsigned short *restrict col = (unsigned short *) 
Tloc(b, b->batFirst);
+                       BUN p;
+                       assert(bloom->kfunc == 1);
+                       for (p=0; p<cnt; p++)
+                               filter[quotient8(col[p])] |= (1 << 
remainder8(col[p]));
+               }
+               break;
+               case TYPE_int: BLOOM_BUILD(int); break;
+               case TYPE_lng: BLOOM_BUILD(lng); break;
+#ifdef HAVE_HGE
+               case TYPE_hge: BLOOM_BUILD(hge); break;
+#endif
+               case TYPE_flt: BLOOM_BUILD(flt); break;
+               case TYPE_dbl: BLOOM_BUILD(dbl); break;
+               default:
+                       /* should never reach here */
+                       assert(0);
+                       HEAPfree(bloom->filter,1);
+                       MT_lock_unset(&GDKhashLock(abs(b->batCacheid)));
+                       return GDK_FAIL;
+               }
+
+               b->T->bloom = bloom;
+       }
+
+       t1 = GDKusec();
+
+       ALGODEBUG fprintf(stderr, "#BATbloom: bloom construction " LLFMT " 
usec\n", t1 - t0);
+       //print_stats(b, t1-t0);
+
+       MT_lock_unset(&GDKhashLock(abs(b->batCacheid)));
+
+       assert(b->batCapacity >= BATcount(b));
+       return GDK_SUCCEED;
+}
+
+
+
+#if 0
+
_______________________________________________
checkin-list mailing list
[email protected]
https://www.monetdb.org/mailman/listinfo/checkin-list

Reply via email to