Changeset: f5275d838579 for MonetDB
URL: http://dev.monetdb.org/hg/MonetDB?cmd=changeset;node=f5275d838579
Added Files:
gdk/gdk_bloom.c
gdk/gdk_bloom.h
Modified Files:
gdk/Makefile.ag
gdk/gdk.h
gdk/gdk_join.c
gdk/gdk_private.h
Branch: leftmart
Log Message:
Adding bloom filters support
diffs (truncated from 472 to 300 lines):
diff --git a/gdk/Makefile.ag b/gdk/Makefile.ag
--- a/gdk/Makefile.ag
+++ b/gdk/Makefile.ag
@@ -28,6 +28,7 @@ lib_gdk = {
gdk_calc.c gdk_calc.h gdk_calc_compare.h gdk_calc_private.h \
gdk_aggr.c gdk_group.c \
gdk_imprints.c gdk_imprints.h \
+ gdk_bloom.c gdk_bloom.h \
gdk_join.c gdk_project.c \
gdk_unique.c \
gdk_firstn.c \
diff --git a/gdk/gdk.h b/gdk/gdk.h
--- a/gdk/gdk.h
+++ b/gdk/gdk.h
@@ -675,6 +675,7 @@ typedef struct {
} Hash;
typedef struct Imprints Imprints;
+typedef struct Bloomfilter Bloomfilter;
/*
@@ -798,6 +799,7 @@ gdk_export int VALisnil(const ValRecord
* Hash *hhash; // linear chained hash table on head
* Imprints *himprints; // column imprints index on head
* orderidx horderidx; // order oid index on head
+ * bloomfilter hbloom; // bloomfilter on head
* // Tail properties
* int ttype; // Tail type number
* str tident; // name for tail column
@@ -812,6 +814,7 @@ gdk_export int VALisnil(const ValRecord
* Hash *thash; // linear chained hash table on tail
* Imprints *timprints; // column imprints index on tail
* orderidx torderidx; // order oid index on tail
+ * bloomfilter tbloom; // bloom filter on tail
* } BAT;
* @end verbatim
*
@@ -888,6 +891,7 @@ typedef struct {
Hash *hash; /* hash table */
Imprints *imprints; /* column imprints index */
Heap *orderidx; /* order oid index */
+ Bloomfilter *bloom; /* a bloomfilter */
PROPrec *props; /* list of dynamic properties stored in the bat
descriptor */
} COLrec;
@@ -960,6 +964,9 @@ typedef int (*GDKfcn) ();
#define talign T->align
#define horderidx H->orderidx
#define torderidx T->orderidx
+#define hbloom H->bloom
+#define tbloom T->bloom
+
/*
@@ -2018,9 +2025,14 @@ gdk_export gdk_return BAThash(BAT *b, BU
gdk_export gdk_return BATimprints(BAT *b);
gdk_export lng IMPSimprintsize(BAT *b);
+/* The ordered index structure */
+
gdk_export gdk_return BATorderidx(BAT *b, int stable);
gdk_export gdk_return GDKmergeidx(BAT *b, BAT**a, int n_ar);
+/* The bloom filters */
+gdk_export gdk_return BATbloom(BAT *b);
+
/*
* @- Multilevel Storage Modes
*
diff --git a/gdk/gdk_bloom.c b/gdk/gdk_bloom.c
new file mode 100644
--- /dev/null
+++ b/gdk/gdk_bloom.c
@@ -0,0 +1,303 @@
+/*
+ * This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/.
+ *
+ * Copyright 1997 - July 2008 CWI, August 2008 - 2016 MonetDB B.V.
+ */
+
+/*
+ * Implementation of bloom filters.
+ * L.Sidirourgos
+ */
+
+#include "monetdb_config.h"
+#include "gdk.h"
+#include "gdk_private.h"
+#include "gdk_bloom.h"
+
+#define BLOOM_VERSION 1
+#define BLOOM_HEADER_SIZE 4 /* nr of size_t fields in header */
+
+int
+BATcheckbloom(BAT *b)
+{
+ int ret;
+
+ MT_lock_set(&GDKhashLock(abs(b->batCacheid)));
+
+ /* if b does not have a bloom filter, but the parent has, then use it */
+ if ((b->T->bloom == NULL) && VIEWtparent(b)) {
+ BAT *pb = BATmirror(BATdescriptor(VIEWtparent(b)));
+ b->T->bloom = pb->T->bloom;
+ BBPunfix(pb->batCacheid);
+ }
+ ret = b->T->bloom != NULL;
+ MT_lock_unset(&GDKhashLock(abs(b->batCacheid)));
+
+ ALGODEBUG if (ret) fprintf(stderr, "#BATcheckbloom: already has a bloom
%d\n", b->batCacheid);
+ return ret;
+}
+
+static BUN
+BLOOMsize(BUN cnt) {
+ BUN m = cnt;
+
+ /* smaller power of 2 that is greater or equal to cnt */
+ --m;
+ m |= m >> 1;
+ m |= m >> 2;
+ m |= m >> 4;
+ m |= m >> 8;
+ m |= m >> 16;
+#if SIZEOF_BUN == 8
+ m |= m >> 32;
+#endif
+ m++;
+
+ /* double it */
+ m <<= 1;
+
+ /* if m is almost 2*cnt, double again */
+ if (m / cnt == 2)
+ m <<= 1;
+
+ return m;
+}
+
+#define BLOOM_BUILD(TYPE)
\
+do {
\
+ const TYPE *restrict col = (TYPE *) Tloc(b, b->batFirst);
\
+ BUN p;
\
+ BUN key,hv,x,y,z;
\
+ for (p=0; p<cnt; p++) {
\
+ key = (BUN) col[p];
\
+ hash_init(key, x,y,z);
\
+ next_hash(hv, x,y,z);
\
+ filter[quotient8(hv)] |= (1 << remainder8(hv));
\
+ next_hash(hv, x,y,z);
\
+ filter[quotient8(hv)] |= (1 << remainder8(hv));
\
+ if (bloom->kfunc == 3) {
\
+ next_hash(hv, x,y,z);
\
+ filter[quotient8(hv)] |= (1 << remainder8(hv));
\
+ }
\
+ }
\
+} while (0)
+
+gdk_return
+BATbloom(BAT *b)
+{
+ lng t0 = 0, t1 = 0;
+
+ assert(BAThdense(b)); /* assert void head */
+
+ /* we only create bloom filters for types that look like types we know
*/
+ switch (ATOMbasetype(b->T->type)) {
+ case TYPE_bte:
+ case TYPE_sht:
+ case TYPE_int:
+ case TYPE_lng:
+#ifdef HAVE_HGE
+ case TYPE_hge:
+#endif
+ case TYPE_flt:
+ case TYPE_dbl:
+ break;
+ default: /* type not supported */
+ /* doesn't look enough like base type: do nothing */
+ GDKerror("BATbloom: unsupported type\n");
+ return GDK_FAIL;
+ }
+
+ BATcheck(b, "BATbloom", GDK_FAIL);
+
+ if (BATcheckbloom(b))
+ return GDK_SUCCEED;
+ assert(b->T->bloom == NULL);
+
+ MT_lock_set(&GDKhashLock(abs(b->batCacheid)));
+ t0 = GDKusec();
+
+ if (b->T->bloom == NULL) {
+ BUN cnt;
+ size_t bytes;
+ Bloomfilter *bloom;
+ char *filter;
+
+ bloom = (Bloomfilter *) GDKzalloc(sizeof(Bloomfilter));
+ if (bloom == NULL) {
+ GDKerror("#BATbloom: memory allocation error");
+ MT_lock_unset(&GDKhashLock(abs(b->batCacheid)));
+ return GDK_FAIL;
+ }
+ bloom->filter = (Heap *) GDKzalloc(sizeof(Heap));
+ if (bloom->filter == NULL) {
+ GDKerror("#BATbloom: memory allocation error");
+ GDKfree(bloom);
+ MT_lock_unset(&GDKhashLock(abs(b->batCacheid)));
+ return GDK_FAIL;
+ }
+
+ cnt = BATcount(b);
+ /* TODO: check also if max-min < mbits and use identiry hash */
+ if ( (ATOMbasetype(b->T->type) == TYPE_bte && (bloom->mbits =
(1 << 8))) ||
+ (ATOMbasetype(b->T->type) == TYPE_sht && (bloom->mbits =
(1 << 16))) ) {
+ bloom->kfunc = 1;
+ bloom->mask = bloom->mbits-1;
+ bytes = quotient8(bloom->mbits);
+ } else {
+ bloom->mbits = BLOOMsize(cnt);
+ /* 2 functions if the ratio is close to 3, 3 otherwise
*/
+ bloom->kfunc = bloom->mbits/cnt == 3 ? 2 : 3;
+ bloom->mask = bloom->mbits-1;
+ bytes = quotient8(bloom->mbits) + 1;
+ }
+
+ if (HEAPalloc(bloom->filter, bytes, 1) != GDK_SUCCEED) {
+ GDKerror("#BATbloom: memory allocation error");
+ GDKfree(bloom->filter);
+ GDKfree(bloom);
+ MT_lock_unset(&GDKhashLock(abs(b->batCacheid)));
+ return GDK_FAIL;
+ }
+
+ ALGODEBUG fprintf(stderr, "#BATbloom(b=%s#" BUNFMT ") %s: "
+ "create bloom filter: mbits = " BUNFMT ", ratio
= " BUNFMT
+ "kfunc = %d, bytes = " BUNFMT "\n", BATgetId(b),
+ BATcount(b), b->T->heap.filename,
+ bloom->mbits, bloom->mbits/BATcount(b),
+ bloom->kfunc, quotient8(bloom->mbits));
+
+ filter = (unsigned char *) bloom->filter->base;
+
+ switch (ATOMbasetype(b->T->type)) {
+ case TYPE_bte:
+ {
+ const unsigned char *restrict col = (unsigned char *)
Tloc(b, b->batFirst);
+ BUN p;
+ assert(bloom->kfunc == 1);
+ for (p=0; p<cnt; p++)
+ filter[quotient8(col[p])] |= (1 <<
remainder8(col[p]));
+ }
+ break;
+ case TYPE_sht:
+ {
+ const unsigned short *restrict col = (unsigned short *)
Tloc(b, b->batFirst);
+ BUN p;
+ assert(bloom->kfunc == 1);
+ for (p=0; p<cnt; p++)
+ filter[quotient8(col[p])] |= (1 <<
remainder8(col[p]));
+ }
+ break;
+ case TYPE_int: BLOOM_BUILD(int); break;
+ case TYPE_lng: BLOOM_BUILD(lng); break;
+#ifdef HAVE_HGE
+ case TYPE_hge: BLOOM_BUILD(hge); break;
+#endif
+ case TYPE_flt: BLOOM_BUILD(flt); break;
+ case TYPE_dbl: BLOOM_BUILD(dbl); break;
+ default:
+ /* should never reach here */
+ assert(0);
+ HEAPfree(bloom->filter,1);
+ MT_lock_unset(&GDKhashLock(abs(b->batCacheid)));
+ return GDK_FAIL;
+ }
+
+ b->T->bloom = bloom;
+ }
+
+ t1 = GDKusec();
+
+ ALGODEBUG fprintf(stderr, "#BATbloom: bloom construction " LLFMT "
usec\n", t1 - t0);
+ //print_stats(b, t1-t0);
+
+ MT_lock_unset(&GDKhashLock(abs(b->batCacheid)));
+
+ assert(b->batCapacity >= BATcount(b));
+ return GDK_SUCCEED;
+}
+
+
+
+#if 0
+
_______________________________________________
checkin-list mailing list
[email protected]
https://www.monetdb.org/mailman/listinfo/checkin-list