Changeset: 6f7d34b56bed for MonetDB
URL: https://dev.monetdb.org/hg/MonetDB?cmd=changeset;node=6f7d34b56bed
Modified Files:
gdk/gdk_hash.c
gdk/gdk_hash.h
Branch: linear-hashing
Log Message:
Use two masks instead of level + split.
diffs (229 lines):
diff --git a/gdk/gdk_hash.c b/gdk/gdk_hash.c
--- a/gdk/gdk_hash.c
+++ b/gdk/gdk_hash.c
@@ -49,13 +49,9 @@ HASHwidth(BUN hashsize)
#endif
}
-BUN
-HASHmask(BUN cnt)
+static inline BUN
+hashmask(BUN m)
{
- BUN m = cnt;
-
-#if 0
- /* find largest power of 2 smaller than or equal to cnt */
m |= m >> 1;
m |= m >> 2;
m |= m >> 4;
@@ -64,6 +60,17 @@ HASHmask(BUN cnt)
#if SIZEOF_BUN == 8
m |= m >> 32;
#endif
+ return m;
+}
+
+BUN
+HASHmask(BUN cnt)
+{
+ BUN m = cnt;
+
+#if 0
+ /* find largest power of 2 smaller than or equal to cnt */
+ m = hashmask(m);
m -= m >> 1;
/* if cnt is more than 1/3 into the gap between m and 2*m,
@@ -91,35 +98,7 @@ HASHclear(Hash *h)
}
#define HASH_VERSION 3
-#define HASH_HEADER_SIZE 8 /* nr of size_t fields in header */
-
-/* calculate (uint8_t) floor(log2(mask)) */
-static inline uint8_t
-HASHmasklevel(BUN mask)
-{
- assert(mask != 0);
-
-#if SIZEOF_BUN == SIZEOF_INT && defined(__GNUC__)
- return (uint8_t) (31 - __builtin_clz((unsigned int) mask));
-#elif SIZEOF_BUN == SIZEOF_INT && defined(_MSC_VER)
- return (uint8_t) (31 - __lzcnt((unsigned int) mask));
-#elif SIZEOF_BUN == SIZEOF_LONG && defined(__GNUC__)
- return (uint8_t) (63 - __builtin_clzl((unsigned long) mask));
-#elif SIZEOF_BUN == SIZEOF_LONG && defined(_MSC_VER)
- return (uint8_t) (63 - __lzcnt64((unsigned __int64) mask));
-#else
- uint8_t n = 0;
- uint8_t c = SIZEOF_BUN * 4;
- do {
- BUN y = mask >> c;
- if (y) {
- n += c;
- mask = y;
- }
- } while ((c >>= 1) != 0);
- return n;
-#endif
-}
+#define HASH_HEADER_SIZE 7 /* nr of size_t fields in header */
static void
doHASHdestroy(BAT *b, Hash *hs)
@@ -160,8 +139,15 @@ HASHnew(Hash *h, int tpe, BUN size, BUN
if (HEAPalloc(&h->heapbckt, mask + HASH_HEADER_SIZE * SIZEOF_SIZE_T /
h->width, h->width) != GDK_SUCCEED)
return GDK_FAIL;
h->heapbckt.free = mask * h->width + HASH_HEADER_SIZE * SIZEOF_SIZE_T;
- h->level = HASHmasklevel(mask);
- h->split = mask - ((BUN) 1 << h->level);
+ h->nbucket = mask;
+ if (mask & (mask - 1)) {
+ h->mask2 = hashmask(mask);
+ h->mask1 = h->mask2 >> 1;
+ } else {
+ /* mask is a power of two */
+ h->mask1 = mask - 1;
+ h->mask2 = h->mask1 << 1 | 1;
+ }
switch (h->width) {
case BUN2:
h->nil = (BUN) BUN2_NONE;
@@ -182,12 +168,11 @@ HASHnew(Hash *h, int tpe, BUN size, BUN
HASHclear(h); /* zero the mask */
((size_t *) h->heapbckt.base)[0] = (size_t) HASH_VERSION;
((size_t *) h->heapbckt.base)[1] = (size_t) size;
- ((size_t *) h->heapbckt.base)[2] = (size_t) h->level;
- ((size_t *) h->heapbckt.base)[3] = (size_t) h->split;
- ((size_t *) h->heapbckt.base)[4] = (size_t) h->width;
- ((size_t *) h->heapbckt.base)[5] = (size_t) count;
- ((size_t *) h->heapbckt.base)[6] = (size_t) h->nunique;
- ((size_t *) h->heapbckt.base)[7] = (size_t) h->nheads;
+ ((size_t *) h->heapbckt.base)[2] = (size_t) h->nbucket;
+ ((size_t *) h->heapbckt.base)[3] = (size_t) h->width;
+ ((size_t *) h->heapbckt.base)[4] = (size_t) count;
+ ((size_t *) h->heapbckt.base)[5] = (size_t) h->nunique;
+ ((size_t *) h->heapbckt.base)[6] = (size_t) h->nheads;
ACCELDEBUG fprintf(stderr, "#%s: HASHnew: create hash(size " BUNFMT ",
mask " BUNFMT ", width %d, total " BUNFMT " bytes);\n", MT_thread_getname(),
size, mask, h->width, (size + mask) * h->width);
return GDK_SUCCEED;
}
@@ -332,10 +317,10 @@ HASHgrowbucket(BAT *b)
}
}
while (h->nunique >= (nbucket = NHASHBUCKETS(h)) * 7 / 8) {
- BUN old = h->split;
- BUN new = ((BUN) 1 << h->level) + h->split;
+ BUN new = h->nbucket;
+ BUN old = new & h->mask1;
BATiter bi = bat_iterator(b);
- BUN msk = (BUN) 1 << h->level;
+ BUN msk = h->mask1 + 1; /* == h->mask2 - h->mask1 */
assert(h->heapbckt.free == nbucket * h->width +
HASH_HEADER_SIZE * SIZEOF_SIZE_T);
if (h->heapbckt.free + h->width > h->heapbckt.size) {
@@ -347,10 +332,11 @@ HASHgrowbucket(BAT *b)
h->Bckt = h->heapbckt.base + HASH_HEADER_SIZE *
SIZEOF_SIZE_T;
}
assert(h->heapbckt.free + h->width <= h->heapbckt.size);
- if (++h->split == ((BUN) 1 << h->level)) {
- h->level++;
- h->split = 0;
+ if (h->nbucket == h->mask2) {
+ h->mask1 = h->mask2;
+ h->mask2 = h->mask2 << 1 | 1;
}
+ h->nbucket++;
h->heapbckt.free += h->width;
BUN lold, lnew, hb;
lold = lnew = HASHnil(h);
@@ -450,9 +436,9 @@ BATcheckhash(BAT *b)
((size_t) 1 << 24) |
#endif
HASH_VERSION) &&
- hdata[5] == (size_t) BATcount(b) &&
+ hdata[4] == (size_t) BATcount(b) &&
fstat(fd, &st) == 0 &&
- st.st_size >= (off_t)
(h->heapbckt.size = h->heapbckt.free = (((BUN) 1 << (h->level = (uint8_t)
hdata[2])) + (h->split = (BUN) hdata[3])) * (BUN) (h->width = (uint8_t)
hdata[4]) + HASH_HEADER_SIZE * SIZEOF_SIZE_T) &&
+ st.st_size >= (off_t)
(h->heapbckt.size = h->heapbckt.free = (h->nbucket = (BUN) hdata[2]) * (BUN)
(h->width = (uint8_t) hdata[3]) + HASH_HEADER_SIZE * SIZEOF_SIZE_T) &&
close(fd) == 0 &&
(fd =
GDKfdlocate(h->heaplink.farmid, nme, "rb+", "thashl")) >= 0 &&
fstat(fd, &st) == 0 &&
@@ -460,8 +446,15 @@ BATcheckhash(BAT *b)
st.st_size >= (off_t)
(h->heaplink.size = h->heaplink.free = hdata[1] * h->width) &&
HEAPload(&h->heaplink, nme,
"thashl", false) == GDK_SUCCEED &&
HEAPload(&h->heapbckt, nme,
"thashb", false) == GDK_SUCCEED) {
- h->nunique = hdata[6];
- h->nheads = hdata[7];
+ if (h->nbucket & (h->nbucket -
1)) {
+ h->mask2 =
hashmask(h->nbucket);
+ h->mask1 = h->mask2 >>
1;
+ } else {
+ h->mask1 = h->nbucket -
1;
+ h->mask2 = h->mask1 <<
1 | 1;
+ }
+ h->nunique = hdata[5];
+ h->nheads = hdata[6];
h->type = ATOMtype(b->ttype);
switch (h->width) {
case BUN2:
@@ -538,12 +531,11 @@ BAThashsave(BAT *b, bool dosync)
* mean time */
((size_t *) hp->base)[0] = (size_t) HASH_VERSION;
((size_t *) hp->base)[1] = (size_t) (h->heaplink.free /
h->width);
- ((size_t *) hp->base)[2] = (size_t) h->level;
- ((size_t *) hp->base)[3] = (size_t) h->split;
- ((size_t *) hp->base)[4] = (size_t) h->width;
- ((size_t *) hp->base)[5] = (size_t) BATcount(b);
- ((size_t *) hp->base)[6] = (size_t) h->nunique;
- ((size_t *) hp->base)[7] = (size_t) h->nheads;
+ ((size_t *) hp->base)[2] = (size_t) h->nbucket;
+ ((size_t *) hp->base)[3] = (size_t) h->width;
+ ((size_t *) hp->base)[4] = (size_t) BATcount(b);
+ ((size_t *) hp->base)[5] = (size_t) h->nunique;
+ ((size_t *) hp->base)[6] = (size_t) h->nheads;
if (!b->theap.dirty &&
HEAPsave(&h->heaplink, h->heaplink.filename, NULL, dosync)
== GDK_SUCCEED &&
HEAPsave(hp, hp->filename, NULL, dosync) == GDK_SUCCEED) {
diff --git a/gdk/gdk_hash.h b/gdk/gdk_hash.h
--- a/gdk/gdk_hash.h
+++ b/gdk/gdk_hash.h
@@ -23,8 +23,9 @@
typedef struct Hash {
int type; /* type of index entity */
uint8_t width; /* width of hash entries */
- uint8_t level; /* mask is (1<<.level)-1 or (2<<.level)-1 */
- BUN split; /* ... depending on hash value and .split */
+ BUN mask1; /* .mask1 < .nbucket <= .mask2 */
+ BUN mask2;
+ BUN nbucket; /* ... depending on hash value and .nbucket */
BUN nil; /* nil representation */
BUN nunique; /* number of unique values */
BUN nheads; /* number of chain heads */
@@ -33,13 +34,11 @@ typedef struct Hash {
Heap heaplink; /* heap where the hash links are stored */
Heap heapbckt; /* heap where the hash buckets are stored */
} Hash;
-#define NHASHBUCKETS(h) (((BUN) 1 << (h)->level) + (h)->split)
+#define NHASHBUCKETS(h) ((h)->nbucket)
static inline BUN
HASHbucket(const Hash *h, BUN v)
{
- return (v & (((BUN) 1 << h->level) - 1)) < h->split
- ? v & (((BUN) 2 << h->level) - 1)
- : v & (((BUN) 1 << h->level) - 1);
+ return (v &= h->mask2) < h->nbucket ? v : v & h->mask1;
}
gdk_export gdk_return BAThash(BAT *b);
@@ -176,8 +175,8 @@ HASHgetlink(Hash *h, BUN i)
#define hash_loc(H,V) hash_any(H,V)
#define hash_var(H,V) hash_any(H,V)
#define hash_any(H,V) HASHbucket(H, ATOMhash((H)->type, (V)))
-#define hash_bte(H,V) (assert((H)->level >= 8), (BUN) mix_bte(*(const
unsigned char*) (V)))
-#define hash_sht(H,V) (assert((H)->level >= 16), (BUN) mix_sht(*(const
unsigned short*) (V)))
+#define hash_bte(H,V) (assert((H)->nbucket >= 256), (BUN) mix_bte(*(const
unsigned char*) (V)))
+#define hash_sht(H,V) (assert((H)->nbucket >= 65536), (BUN) mix_sht(*(const
unsigned short*) (V)))
#define hash_int(H,V) HASHbucket(H, (BUN) mix_int(*(const unsigned int *)
(V)))
/* XXX return size_t-sized value for 8-byte oid? */
#define hash_lng(H,V) HASHbucket(H, (BUN) mix_lng(*(const ulng *) (V)))
_______________________________________________
checkin-list mailing list
[email protected]
https://www.monetdb.org/mailman/listinfo/checkin-list