Changeset: b5b470692dba for MonetDB
URL: http://dev.monetdb.org/hg/MonetDB?cmd=changeset;node=b5b470692dba
Added Files:
gdk/gdk_hash.c
gdk/gdk_hash.h
Removed Files:
gdk/gdk_search.h
Modified Files:
gdk/Makefile.ag
gdk/gdk.h
gdk/gdk_search.c
Branch: embedded
Log Message:
merge with default
diffs (truncated from 1357 to 300 lines):
diff --git a/gdk/Makefile.ag b/gdk/Makefile.ag
--- a/gdk/Makefile.ag
+++ b/gdk/Makefile.ag
@@ -14,7 +14,7 @@ lib_gdk = {
SOURCES = \
gdk.h gdk_cand.h gdk_atomic.h gdk_batop.c \
gdk_select.c \
- gdk_search.c gdk_search.h gdk_tm.c \
+ gdk_search.c gdk_hash.c gdk_hash.h gdk_tm.c \
gdk_align.c gdk_bbp.c gdk_bbp.h \
gdk_heap.c gdk_utils.c gdk_utils.h \
gdk_atoms.c gdk_atoms.h \
@@ -50,7 +50,7 @@ headers_h = {
gdk_calc.h \
gdk_delta.h \
gdk_posix.h \
- gdk_search.h \
+ gdk_hash.h \
gdk_system.h \
gdk_utils.h
}
diff --git a/gdk/gdk.h b/gdk/gdk.h
--- a/gdk/gdk.h
+++ b/gdk/gdk.h
@@ -1257,6 +1257,12 @@ gdk_export gdk_return BATdel(BAT *b, BAT
gdk_export gdk_return BUNinplace(BAT *b, BUN p, const void *right, bit force);
gdk_export gdk_return BATreplace(BAT *b, BAT *p, BAT *n, bit force);
+/* Functions to perform a binary search on a sorted BAT.
+ * See gdk_search.c for details. */
+gdk_export BUN SORTfnd(BAT *b, const void *v);
+gdk_export BUN SORTfndfirst(BAT *b, const void *v);
+gdk_export BUN SORTfndlast(BAT *b, const void *v);
+
gdk_export BUN BUNfnd(BAT *b, const void *right);
#define BUNfndVOID(b, v) \
@@ -2394,7 +2400,7 @@ gdk_export void GDKfatal(_In_z_ _Printf_
* @
*/
#include "gdk_delta.h"
-#include "gdk_search.h"
+#include "gdk_hash.h"
#include "gdk_atoms.h"
#include "gdk_bbp.h"
#include "gdk_utils.h"
diff --git a/gdk/gdk_hash.c b/gdk/gdk_hash.c
new file mode 100644
--- /dev/null
+++ b/gdk/gdk_hash.c
@@ -0,0 +1,639 @@
+/*
+ * This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/.
+ *
+ * Copyright 1997 - July 2008 CWI, August 2008 - 2016 MonetDB B.V.
+ */
+
+/*
+ * - Hash Table Creation
+ * The hash indexing scheme for BATs reserves a block of memory to
+ * maintain the hash table and a collision list. A one-to-one mapping
+ * exists between the BAT and the collision list using the BUN
+ * index. NOTE: we alloc the link list as a parallel array to the BUN
+ * array; hence the hash link array has the same size as
+ * BATcapacity(b) (not BATcount(b)). This allows us in the BUN insert
+ * and delete to assume that there is hash space iff there is BUN
+ * space. If there is no BUN space, the BATextend now destroys the
+ * hash table.
+ *
+ * The hash mask size is a power of two, so we can do bitwise AND on
+ * the hash (integer) number to quickly find the head of the bucket
+ * chain. Clearly, the hash mask size is a crucial parameter. If we
+ * know that the column is unique (hkey), we use direct hashing (mask
+ * size ~= BATcount). Otherwise we dynamically determine the mask size
+ * by starting out with mask size = BATcount/64 (just 1.5% of memory
+ * storage overhead). Then we start building the hash table on the
+ * first 25% of the BAT. As we aim for max-collisions list length of
+ * 4, the list on 25% should not exceed length 1. So, if a small
+ * number of collisssions occurs (mask/2) then we abandon the attempt
+ * and restart with a mask that is 4 times larger. This converges
+ * after three cycles to direct hashing.
+ */
+
+#include "monetdb_config.h"
+#include "gdk.h"
+#include "gdk_private.h"
+
+static int
+HASHwidth(BUN hashsize)
+{
+ if (hashsize <= (BUN) BUN2_NONE)
+ return BUN2;
+#if SIZEOF_BUN <= 4
+ return BUN4;
+#else
+ if (hashsize <= (BUN) BUN4_NONE)
+ return BUN4;
+ return BUN8;
+#endif
+}
+
+BUN
+HASHmask(BUN cnt)
+{
+ BUN m = cnt;
+
+ /* find largest power of 2 smaller than or equal to cnt */
+ m |= m >> 1;
+ m |= m >> 2;
+ m |= m >> 4;
+ m |= m >> 8;
+ m |= m >> 16;
+#if SIZEOF_BUN == 8
+ m |= m >> 32;
+#endif
+ m -= m >> 1;
+
+ /* if cnt is more than 1/3 into the gap between m and 2*m,
+ double m */
+ if (m + m - cnt < 2 * (cnt - m))
+ m += m;
+ if (m < BATTINY)
+ m = BATTINY;
+ return m;
+}
+
+static void
+HASHclear(Hash *h)
+{
+ /* since BUN2_NONE, BUN4_NONE, BUN8_NONE
+ * are all equal to -1 (~0), i.e., have all bits set,
+ * we can use a simple memset() to clear the Hash,
+ * rather than iteratively assigning individual
+ * BUNi_NONE values in a for-loop
+ */
+ memset(h->Hash, 0xFF, (h->mask + 1) * h->width);
+}
+
+#define HASH_VERSION 1
+#define HASH_HEADER_SIZE 5 /* nr of size_t fields in header */
+
+Hash *
+HASHnew(Heap *hp, int tpe, BUN size, BUN mask, BUN count)
+{
+ Hash *h = NULL;
+ int width = HASHwidth(size);
+
+ if (HEAPalloc(hp, mask + size + HASH_HEADER_SIZE * SIZEOF_SIZE_T /
width, width) != GDK_SUCCEED)
+ return NULL;
+ hp->free = (mask + size) * width + HASH_HEADER_SIZE * SIZEOF_SIZE_T;
+ h = GDKmalloc(sizeof(Hash));
+ if (h == NULL)
+ return NULL;
+ h->lim = size;
+ h->mask = mask - 1;
+ h->width = width;
+ switch (width) {
+ case BUN2:
+ h->nil = (BUN) BUN2_NONE;
+ break;
+ case BUN4:
+ h->nil = (BUN) BUN4_NONE;
+ break;
+#ifdef BUN8
+ case BUN8:
+ h->nil = (BUN) BUN8_NONE;
+ break;
+#endif
+ default:
+ assert(0);
+ }
+ h->Link = hp->base + HASH_HEADER_SIZE * SIZEOF_SIZE_T;
+ h->Hash = (void *) ((char *) h->Link + h->lim * width);
+ h->type = tpe;
+ h->heap = hp;
+ HASHclear(h); /* zero the mask */
+ ((size_t *) hp->base)[0] = HASH_VERSION;
+ ((size_t *) hp->base)[1] = size;
+ ((size_t *) hp->base)[2] = mask;
+ ((size_t *) hp->base)[3] = width;
+ ((size_t *) hp->base)[4] = count;
+ ALGODEBUG fprintf(stderr, "#HASHnew: create hash(size " BUNFMT ", mask
" BUNFMT ",width %d, nil " BUNFMT ", total " BUNFMT " bytes);\n", size, mask,
width, h->nil, (size + mask) * width);
+ return h;
+}
+
+#define starthash(TYPE)
\
+ do { \
+ TYPE *v = (TYPE *) BUNtloc(bi, 0); \
+ for (; r < p; r++) { \
+ BUN c = (BUN) hash_##TYPE(h, v+r); \
+ \
+ if (HASHget(h, c) == HASHnil(h) && nslots-- == 0) \
+ break; /* mask too full */ \
+ HASHputlink(h, r, HASHget(h, c)); \
+ HASHput(h, c, r); \
+ } \
+ } while (0)
+#define finishhash(TYPE) \
+ do { \
+ TYPE *v = (TYPE *) BUNtloc(bi, 0); \
+ for (; p < q; p++) { \
+ BUN c = (BUN) hash_##TYPE(h, v + p); \
+ \
+ HASHputlink(h, p, HASHget(h, c)); \
+ HASHput(h, c, p); \
+ } \
+ } while (0)
+
+/* collect HASH statistics for analysis */
+static void
+HASHcollisions(BAT *b, Hash *h)
+{
+ lng cnt, entries = 0, max = 0;
+ double total = 0;
+ BUN p, i, j, nil;
+
+ if (b == 0 || h == 0)
+ return;
+ nil = HASHnil(h);
+ for (i = 0, j = h->mask; i <= j; i++)
+ if ((p = HASHget(h, i)) != nil) {
+ entries++;
+ cnt = 0;
+ for (; p != nil; p = HASHgetlink(h, p))
+ cnt++;
+ if (cnt > max)
+ max = cnt;
+ total += cnt;
+ }
+ fprintf(stderr, "#BAThash: statistics (" BUNFMT ", entries " LLFMT ",
mask " BUNFMT ", max " LLFMT ", avg %2.6f);\n", BATcount(b), entries, h->mask,
max, entries == 0 ? 0 : total / entries);
+}
+
+/* Return TRUE if we have a hash on the tail, even if we need to read
+ * one from disk.
+ *
+ * Note that the b->T->hash pointer can be NULL, meaning there is no
+ * hash; (Hash *) 1, meaning there is no hash loaded, but it may exist
+ * on disk; or a valid pointer to a loaded hash. These values are
+ * maintained here, in the HASHdestroy/HASHremove and HASHfree
+ * functions, and in BBPdiskscan during initialization. */
+int
+BATcheckhash(BAT *b)
+{
+ int ret;
+ lng t;
+
+ t = GDKusec();
+ MT_lock_set(&GDKhashLock(abs(b->batCacheid)));
+ t = GDKusec() - t;
+ if (b->T->hash == (Hash *) 1) {
+ Hash *h;
+ Heap *hp;
+ const char *nme = BBP_physical(b->batCacheid);
+ const char *ext = b->batCacheid > 0 ? "thash" : "hhash";
+ int fd;
+
+ b->T->hash = NULL;
+ if ((hp = GDKzalloc(sizeof(*hp))) != NULL &&
+ (hp->farmid = BBPselectfarm(b->batRole, b->ttype,
hashheap)) >= 0 &&
+ (hp->filename = GDKmalloc(strlen(nme) + 12)) != NULL) {
+ sprintf(hp->filename, "%s.%s", nme, ext);
+
+ /* check whether a persisted hash can be found */
+ if ((fd = GDKfdlocate(hp->farmid, nme, "rb+", ext)) >=
0) {
+ size_t hdata[HASH_HEADER_SIZE];
+ struct stat st;
+
+ if ((h = GDKmalloc(sizeof(*h))) != NULL &&
+ read(fd, hdata, sizeof(hdata)) ==
sizeof(hdata) &&
+ hdata[0] == (
+#ifdef PERSISTENTHASH
+ ((size_t) 1 << 24) |
+#endif
+ HASH_VERSION) &&
+ hdata[4] == (size_t) BATcount(b) &&
+ fstat(fd, &st) == 0 &&
+ st.st_size >= (off_t) (hp->size = hp->free
= (hdata[1] + hdata[2]) * hdata[3] + HASH_HEADER_SIZE * SIZEOF_SIZE_T) &&
+ HEAPload(hp, nme, ext, 0) == GDK_SUCCEED) {
+ h->lim = (BUN) hdata[1];
+ h->type = ATOMtype(b->ttype);
+ h->mask = (BUN) (hdata[2] - 1);
+ h->heap = hp;
+ h->width = (int) hdata[3];
+ switch (h->width) {
+ case BUN2:
+ h->nil = (BUN) BUN2_NONE;
+ break;
+ case BUN4:
+ h->nil = (BUN) BUN4_NONE;
+ break;
+#ifdef BUN8
+ case BUN8:
+ h->nil = (BUN) BUN8_NONE;
+ break;
+#endif
+ default:
+ assert(0);
+ }
_______________________________________________
checkin-list mailing list
[email protected]
https://www.monetdb.org/mailman/listinfo/checkin-list