Changeset: 30d504324760 for MonetDB URL: https://dev.monetdb.org/hg/MonetDB?cmd=changeset;node=30d504324760 Added Files: gdk/gdk_string.c Modified Files: gdk/Makefile.ag gdk/gdk_atoms.c gdk/gdk_private.h Branch: default Log Message:
Moved string atom implementation to a separate file. diffs (truncated from 1632 to 300 lines): diff --git a/gdk/Makefile.ag b/gdk/Makefile.ag --- a/gdk/Makefile.ag +++ b/gdk/Makefile.ag @@ -21,7 +21,7 @@ lib_gdk = { gdk_orderidx.c \ gdk_align.c gdk_bbp.c gdk_bbp.h \ gdk_heap.c gdk_utils.c gdk_utils.h \ - gdk_atoms.c gdk_atoms.h \ + gdk_atoms.c gdk_atoms.h gdk_string.c \ gdk_qsort.c gdk_qsort_impl.h \ gdk_storage.c gdk_bat.c \ gdk_delta.c gdk_cross.c gdk_system.c gdk_value.c \ diff --git a/gdk/gdk_atoms.c b/gdk/gdk_atoms.c --- a/gdk/gdk_atoms.c +++ b/gdk/gdk_atoms.c @@ -258,7 +258,6 @@ const lng lng_nil = GDK_lng_min-1; const hge hge_nil = GDK_hge_min-1; #endif const oid oid_nil = (oid) 1 << (sizeof(oid) * 8 - 1); -const char str_nil[2] = { '\200', 0 }; const ptr ptr_nil = NULL; ptr @@ -407,13 +406,11 @@ TYPE##ToStr(char **dst, size_t *len, con return snprintf(*dst, *len, FMT, FMTCAST *src); \ } -#define num08(x) ((x) >= '0' && (x) <= '7') #define num10(x) GDKisdigit(x) -#define num16(x) isxdigit((unsigned char) (x)) #define base10(x) ((x) - '0') -#define base08(x) ((x) - '0') + +#define num16(x) isxdigit((unsigned char) (x)) #define base16(x) (((x) >= 'a' && (x) <= 'f') ? ((x) - 'a' + 10) : ((x) >= 'A' && (x) <= 'F') ? ((x) - 'A' + 10) : (x) - '0') -#define mult08(x) ((x) << 3) #define mult16(x) ((x) << 4) static void * @@ -1056,746 +1053,6 @@ fltToStr(char **dst, size_t *len, const atom_io(flt, Int, int) -/* String Atom Implementation - * - * Strings are stored in two parts. The first part is the normal tail - * heap which contains a list of offsets. The second part is the - * theap which contains the actual strings. The offsets in the tail - * heap (a.k.a. offset heap) point into the theap (a.k.a. string - * heap). Strings are NULL-terminated and are stored without any - * escape sequec=nces. Strings are encoded using the UTF-8 encoding - * of Unicode. This means that individual "characters" (really, - * Unicode code points) can be between one and four bytes long. - * - * Because in many typical situations there are lots of duplicated - * string values that are being stored in a table, but also in many - * (other) typical situations there are very few duplicated string - * values stored, a scheme has been introduced to cater to both - * situations. - * - * When the string heap is "small" (defined as less than 64KiB), the - * string heap is fully duplicate eliminated. When the string heap - * grows beyond this size, the heap is not kept free of duplicate - * strings, but there is then a heuristic that tries to limit the - * number of duplicates. - * - * This is done by having a fixed sized hash table at the start of the - * string heap, and allocating space for collision lists in the first - * 64KiB of the string heap. After the first 64KiB no extra space is - * allocated for lists, so hash collisions cannot be resolved. - */ - -int -strNil(const char *s) -{ - return GDK_STRNIL(s); -} - -size_t -strLen(const char *s) -{ - return GDK_STRLEN(s); -} - -static int -strCmp(const char *l, const char *r) -{ - return GDK_STRCMP(l, r); -} - -int -strCmpNoNil(const unsigned char *l, const unsigned char *r) -{ - while (*l == *r) { - if (*l == 0) - return 0; - l++; - r++; - } - return (*l < *r) ? -1 : 1; -} - -static void -strHeap(Heap *d, size_t cap) -{ - size_t size; - - cap = MAX(cap, BATTINY); - size = GDK_STRHASHTABLE * sizeof(stridx_t) + MIN(GDK_ELIMLIMIT, cap * GDK_VARALIGN); - if (HEAPalloc(d, size, 1) == GDK_SUCCEED) { - d->free = GDK_STRHASHTABLE * sizeof(stridx_t); - d->dirty = 1; - memset(d->base, 0, d->free); - d->hashash = 0; -#ifndef NDEBUG - /* fill should solve initialization problems within valgrind */ - memset(d->base + d->free, 0, d->size - d->free); -#endif - } -} - - -BUN -strHash(const char *s) -{ - BUN res; - - GDK_STRHASH(s, res); - return res; -} - -void -strCleanHash(Heap *h, bool rebuild) -{ - stridx_t newhash[GDK_STRHASHTABLE]; - size_t pad, pos; - const size_t extralen = h->hashash ? EXTRALEN : 0; - BUN off, strhash; - const char *s; - - (void) rebuild; - if (!h->cleanhash) - return; - /* rebuild hash table for double elimination - * - * If appending strings to the BAT was aborted, if the heap - * was memory mapped, the hash in the string heap may well be - * incorrect. Therefore we don't trust it when we read in a - * string heap and we rebuild the complete table (it is small, - * so this won't take any time at all). - * Note that we will only do this the first time the heap is - * loaded, and only for heaps that existed when the server was - * started. */ - memset(newhash, 0, sizeof(newhash)); - pos = GDK_STRHASHSIZE; - while (pos < h->free && pos < GDK_ELIMLIMIT) { - pad = GDK_VARALIGN - (pos & (GDK_VARALIGN - 1)); - if (pad < sizeof(stridx_t)) - pad += GDK_VARALIGN; - pos += pad + extralen; - s = h->base + pos; - if (h->hashash) - strhash = ((const BUN *) s)[-1]; - else - GDK_STRHASH(s, strhash); - off = strhash & GDK_STRHASHMASK; - newhash[off] = (stridx_t) (pos - extralen - sizeof(stridx_t)); - pos += GDK_STRLEN(s); - } - /* only set dirty flag if the hash table actually changed */ - if (memcmp(newhash, h->base, sizeof(newhash)) != 0) { - memcpy(h->base, newhash, sizeof(newhash)); - if (h->storage == STORE_MMAP) { - if (!(GDKdebug & NOSYNCMASK)) - (void) MT_msync(h->base, GDK_STRHASHSIZE); - } else - h->dirty = 1; - } -#ifndef NDEBUG - if (GDK_ELIMDOUBLES(h)) { - pos = GDK_STRHASHSIZE; - while (pos < h->free) { - pad = GDK_VARALIGN - (pos & (GDK_VARALIGN - 1)); - if (pad < sizeof(stridx_t)) - pad += GDK_VARALIGN; - pos += pad + extralen; - s = h->base + pos; - assert(strLocate(h, s) != 0); - pos += GDK_STRLEN(s); - } - } -#endif - h->cleanhash = 0; -} - -/* - * The strPut routine. The routine strLocate can be used to identify - * the location of a string in the heap if it exists. Otherwise it - * returns zero. - */ -var_t -strLocate(Heap *h, const char *v) -{ - stridx_t *ref, *next; - const size_t extralen = h->hashash ? EXTRALEN : 0; - - /* search hash-table, if double-elimination is still in place */ - BUN off; - GDK_STRHASH(v, off); - off &= GDK_STRHASHMASK; - - /* should only use strLocate iff fully double eliminated */ - assert(GDK_ELIMBASE(h->free) == 0); - - /* search the linked list */ - for (ref = ((stridx_t *) h->base) + off; *ref; ref = next) { - next = (stridx_t *) (h->base + *ref); - if (GDK_STRCMP(v, (str) (next + 1) + extralen) == 0) - return (var_t) ((sizeof(stridx_t) + *ref + extralen)); /* found */ - } - return 0; -} - -static var_t -strPut(Heap *h, var_t *dst, const char *v) -{ - size_t elimbase = GDK_ELIMBASE(h->free); - size_t pad; - size_t pos, len = GDK_STRLEN(v); - const size_t extralen = h->hashash ? EXTRALEN : 0; - stridx_t *bucket; - BUN off, strhash; - - GDK_STRHASH(v, off); - strhash = off; - off &= GDK_STRHASHMASK; - bucket = ((stridx_t *) h->base) + off; - - if (*bucket) { - /* the hash list is not empty */ - if (*bucket < GDK_ELIMLIMIT) { - /* small string heap (<64KiB) -- fully double - * eliminated: search the linked list */ - const stridx_t *ref = bucket; - - do { - pos = *ref + sizeof(stridx_t) + extralen; - if (GDK_STRCMP(v, h->base + pos) == 0) { - /* found */ - return *dst = (var_t) pos; - } - ref = (stridx_t *) (h->base + *ref); - } while (*ref); - } else { - /* large string heap (>=64KiB) -- there is no - * linked list, so only look at single - * entry */ - pos = *bucket + extralen; - if (GDK_STRCMP(v, h->base + pos) == 0) { - /* already in heap: reuse */ - return *dst = (var_t) pos; - } - } - } - /* the string was not found in the heap, we need to enter it */ - - pad = GDK_VARALIGN - (h->free & (GDK_VARALIGN - 1)); - if (elimbase == 0) { /* i.e. h->free < GDK_ELIMLIMIT */ - if (pad < sizeof(stridx_t)) { - /* make room for hash link */ - pad += GDK_VARALIGN; - } - } else if (extralen == 0) { /* i.e., h->hashash == FALSE */ - /* no VARSHIFT and no string hash value stored => no - * padding/alignment needed */ - pad = 0; - } else { - /* pad to align on VARALIGN for VARSHIFT and/or string - * hash value */ - pad &= (GDK_VARALIGN - 1); - } - - /* check heap for space (limited to a certain maximum after - * which nils are inserted) */ - if (h->free + pad + len + extralen >= h->size) { - size_t newsize = MAX(h->size, 4096); - - /* double the heap size until we have enough space */ - do { - if (newsize < 4 * 1024 * 1024) - newsize <<= 1; - else - newsize += 4 * 1024 * 1024; - } while (newsize <= h->free + pad + len + extralen); - - assert(newsize); - - if (h->free + pad + len + extralen >= (size_t) VAR_MAX) { - GDKerror("strPut: string heaps gets larger than %zuGiB.\n", (size_t) VAR_MAX >> 30); - return 0; _______________________________________________ checkin-list mailing list checkin-list@monetdb.org https://www.monetdb.org/mailman/listinfo/checkin-list